from typing import Optional, List
import os

from docx import Document
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

# Some of the imports below are only used by the commented-out experiments
# kept for reference further down.
from langchain.document_loaders import (
    TextLoader,             # for text files
    UnstructuredPDFLoader,  # load PDFs
    UnstructuredURLLoader,  # load URLs into the document loader
    PyMuPDFLoader,
    DirectoryLoader,
)
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings  # for using HuggingFace models
from langchain.indexes import VectorstoreIndexCreator   # vectorize db index with chromadb
from langchain.chains import RetrievalQA, LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import FAISS
from langchain.base_language import BaseLanguageModel
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader

multi_directory_path = r'tmp/'

# Multilingual LaBSE sentence embeddings.
#model = SentenceTransformer("sentence-transformers/LaBSE")
embeddings = HuggingFaceEmbeddings(model_name='setu4993/LaBSE')

after_rag_template = """Answer the question based only on the following context:
{context}
Question: {question}
"""

# Earlier experiments with generative models (kept for reference):
#pipe = pipeline("text2text-generation", model="google/flan-t5-large", max_new_tokens=100)
#pipe = pipeline("text2text-generation", model="google/mt5-large", max_new_tokens=200)
#tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b", use_fast=False)
#model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
#tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b")
#model = AutoModelForCausalLM.from_pretrained("rinna/bilingual-gpt-neox-4b")
#pipe = pipeline("text2text-generation", model="rinna/bilingual-gpt-neox-4b", max_new_tokens=200)
#pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)

# Current choice: an extractive, multilingual question-answering model.
pipe = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")
llm = HuggingFacePipeline(pipeline=pipe)


def run_custom_qa(question, retrieved_docs):
    """Concatenate the retrieved chunks and run extractive QA over them."""
    context = " ".join([doc.page_content for doc in retrieved_docs])
    output = pipe(question=question, context=context)
    return output["answer"]


def docs_vector_index():
    """Load all files under the target directory, split them into chunks,
    and build a FAISS index over the chunk embeddings."""
    directory_path = r"C:\Users\savni\PycharmProjects\DocsSearchEngine\tmp"
    loader = DirectoryLoader(
        directory_path,
        glob="**/*",  # this pattern loads all files; modify as needed
    )
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=100,
        separators=[" ", ",", "\n", "."],
    )
    docs_chunks = text_splitter.split_documents(docs)
    print(f"docs_chunks length: {len(docs_chunks)}")
    if len(docs_chunks) > 0:
        return FAISS.from_documents(docs_chunks, embeddings)
    return None  # nothing loadable was found


#chain = load_qa_chain(llm, chain_type="stuff")
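# Optional: the FAISS index can be persisted so it does not have to be rebuilt
# on every run. A minimal sketch, assuming a writable "faiss_index" folder
# (the folder name is illustrative; recent LangChain releases also require
# allow_dangerous_deserialization=True when loading a pickled index):
#
# db = docs_vector_index()
# if db:
#     db.save_local("faiss_index")
#     db = FAISS.load_local("faiss_index", embeddings)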
from langchain.prompts import PromptTemplate

template = """You are an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
Below is some information.
{context}
Based on the above information only, answer the below question.
{question}
Be concise."""
prompt = PromptTemplate.from_template(template)
print(prompt.input_variables)
#query_llm = LLMChain(llm=llm, prompt=prompt)

# Earlier LLMChain-based variant (kept for reference):
# def doc_qa1(query, db):
#     similar_doc = db.similarity_search(query, k=2)
#     context = ''.join(c.page_content for c in similar_doc)
#     #response = query_llm.run({"context": context, "question": query})
#     response = query_llm.run(context=context, question=query)
#     print('response', response)
#     return response


def doc_qa(query, db):
    """Retrieve the chunks most relevant to the query and answer extractively."""
    print("doc_qa query:", query)
    retriever = db.as_retriever()
    relevant_docs = retriever.get_relevant_documents(query)
    response = run_custom_qa(query, relevant_docs)
    print('response', response)
    return response
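# Example usage (a minimal sketch; the question string is illustrative and
# assumes the directory above contains at least one loadable document):
if __name__ == "__main__":
    db = docs_vector_index()
    if db:
        print(doc_qa("What topics do these documents cover?", db))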