
Integrating LlamaIndex in Python

Bounty: 10 园豆 [Open question]
import os
import asyncio
from flask import Flask, request, render_template, jsonify, Response
from llama_index.core import StorageContext, VectorStoreIndex, Settings, Document, SimpleDirectoryReader
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.vector_stores.elasticsearch import ElasticsearchStore
from llama_index.readers.file import (
    DocxReader, HWPReader, PDFReader, EpubReader, FlatReader, 
    HTMLTagReader, ImageReader, IPYNBReader, MarkdownReader, 
    MboxReader, PptxReader, PandasCSVReader, PyMuPDFReader, 
    XMLReader, PagedCSVReader, CSVReader
)

app = Flask(__name__)

UPLOAD_FOLDER = "uploads"
app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER

# Ensure the upload directory exists
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Mapping of file extensions to parsers
PARSERS = {
    ".pdf": PDFReader(),
    ".docx": DocxReader(),
    ".hwp": HWPReader(),
    ".epub": EpubReader(),
    ".txt": FlatReader(),
    ".html": HTMLTagReader(),
    ".jpg": ImageReader(), ".jpeg": ImageReader(), ".png": ImageReader(),
    ".ipynb": IPYNBReader(),
    ".md": MarkdownReader(),
    ".mbox": MboxReader(),
    ".csv": PandasCSVReader(),
    ".xml": XMLReader()
}

base_url = "http://localhost:11434"
Settings.embed_model = OllamaEmbedding(model_name="bge-m3", base_url=base_url)
Settings.llm = Ollama(model="qwen2.5:7b", base_url=base_url)

# Elasticsearch configuration, used synchronously
dense_vector_store = ElasticsearchStore(
    es_url="http://localhost:9200",
    index_name="spring-ai-index-test"
)

@app.route('/add')
def upload_form():
    return render_template('upload.html')

@app.route('/')
def chat_html():
    return render_template('chat.html')

@app.route('/chat')
def chat():
    q = request.args.get('q')
    if not q:
        return jsonify({"error": "No query provided"}), 400
    return search(q)

@app.route('/upload', methods=['POST'])
def upload_file():
    try:
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400
        
        file = request.files['file']
        
        # Get the file extension
        ext = os.path.splitext(file.filename)[-1].lower()
        if ext not in PARSERS:
            return jsonify({"error": "Unsupported file type"}), 400    

        # Save the uploaded file
        file_path = os.path.join(app.config["UPLOAD_FOLDER"], file.filename)
        file.save(file_path)

        # Parse the file
        parser = PARSERS[ext]
        file_extractor = {ext: parser}
        documents = SimpleDirectoryReader(
            "./uploads", file_extractor=file_extractor
        ).load_data()

        if not documents:
            return jsonify({"error": "No content extracted from file"}), 400
        
        # Ensure documents is a list
        if not isinstance(documents, list):
            documents = [documents]

        # Ensure each item is a Document; wrap plain strings manually
        documents = [doc if isinstance(doc, Document) else Document(text=doc) for doc in documents]

        # Store the documents in the index
        storage_context = StorageContext.from_defaults(vector_store=dense_vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context
        )
        # Delete the locally uploaded file
        os.remove(file_path)
        
        return jsonify({
            "message": "File processed and stored successfully",
            "filename": file.filename
        }), 200
        
    except UnicodeDecodeError:
        return jsonify({"error": "File must be text encoded in UTF-8"}), 400
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Query documents (synchronous)
def search(query: str):
    try:
        storage_context = StorageContext.from_defaults(vector_store=dense_vector_store)
        index = VectorStoreIndex.from_vector_store(
            vector_store=dense_vector_store,
            storage_context=storage_context
        )
        query_engine = index.as_query_engine()
        response = query_engine.query(query)
        return str(response)
    except Exception as e:
        return f"Error: {str(e)}"

if __name__ == '__main__':  
    app.run(host='0.0.0.0', port=5000, debug=True)
Follow-up:

Could someone familiar with this take a look? The second request always fails with: Timeout context manager should be used inside a task
There is a related question on GitHub: https://github.com/elastic/elasticsearch-py/issues/2614
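
A minimal workaround sketch (an assumption based on similar reports, not a confirmed fix): this error usually appears when the async Elasticsearch client that ElasticsearchStore creates internally gets bound to the event loop of the first request and is then reused from a different loop on the next request. Building the store freshly inside the request that uses it keeps the client and the loop together. The helper name make_vector_store below is illustrative; everything else reuses the imports already present in the file.

def make_vector_store():
    # Create a fresh ElasticsearchStore (and therefore a fresh async ES client)
    # per call, so the client is bound to the event loop that runs the query.
    return ElasticsearchStore(
        es_url="http://localhost:9200",
        index_name="spring-ai-index-test"
    )

def search(query: str):
    try:
        vector_store = make_vector_store()
        index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
        query_engine = index.as_query_engine()
        response = query_engine.query(query)
        return str(response)
    except Exception as e:
        return f"Error: {str(e)}"

Recreating the store per request adds a little connection overhead, but it has avoided this particular error in comparable Flask + LlamaIndex setups; the linked elasticsearch-py issue discusses the same event-loop reuse problem.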

景伟·郭 | Novice, Level 1 | 园豆: 151
Asked on: 2025-03-28 16:55