from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
from transformers import pipeline
from langchain.chains.question_answering import load_qa_chain
import os

# Embedding model used to vectorize document chunks for the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")
# Extractive QA pipeline; called directly (not wrapped in HuggingFacePipeline)
# because the raw pipeline returns the span/score dict we want.
qa_pipeline = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")

multi_directory_path = r'tmp/'

# The original hard-coded location, kept as the default argument so existing
# zero-argument callers see identical behavior.
_DEFAULT_DOCS_DIR = r"C:\Users\savni\PycharmProjects\DocsSearchEngine\tmp"


def docs_vector_index(directory_path=_DEFAULT_DOCS_DIR):
    """Load all files under *directory_path*, chunk them, and build a FAISS index.

    Args:
        directory_path: Directory scanned recursively for documents. Defaults
            to the project's original hard-coded path so existing callers are
            unaffected; pass a path to index a different folder.

    Returns:
        A FAISS vector store built from the document chunks, or the empty
        string '' when no chunks were produced — the falsy sentinel is
        preserved from the original contract so truthiness checks in callers
        keep working.
    """
    from langchain.document_loaders import DirectoryLoader

    # glob="**/*" loads every file recursively; narrow the pattern if only
    # specific file types should be indexed.
    loader = DirectoryLoader(directory_path, glob="**/*")
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        separators=[" ", ",", "\n", "."],
    )
    docs_chunks = text_splitter.split_documents(docs)
    print(f"docs_chunks length: {len(docs_chunks)}")

    if docs_chunks:
        return FAISS.from_documents(docs_chunks, embeddings)
    # Empty-string sentinel preserved for backward compatibility with callers
    # that test the return value for truthiness.
    return ''


def run_custom_qa(question, retrieved_docs):
    """Answer *question* extractively from the text of *retrieved_docs*.

    Args:
        question: Natural-language question string.
        retrieved_docs: Documents (each exposing ``.page_content``) whose
            concatenated text becomes the QA context.

    Returns:
        The raw pipeline output dict (keys include 'answer', 'score',
        'start', 'end').
    """
    context = " ".join(doc.page_content for doc in retrieved_docs)
    return qa_pipeline(question=question, context=context)
# Example: answer = doc_qa("東京大学はいつ設立されましたか?", docs_vector_index())
def doc_qa(query, db):
    """Answer *query* using documents retrieved from the vector store *db*.

    Retrieves the documents most relevant to *query*, then runs the
    extractive QA pipeline over them via run_custom_qa.

    Args:
        query: Natural-language question string.
        db: A vector store exposing ``.as_retriever()`` (e.g. FAISS).

    Returns:
        The raw QA pipeline output dict produced by run_custom_qa.
    """
    print("*************************custom qa doc_qa", query)
    docs_for_query = db.as_retriever().get_relevant_documents(query)
    answer = run_custom_qa(query, docs_for_query)
    print('response', answer)
    return answer