singhdevendra58 committed on
Commit
d23fb76
verified
1 Parent(s): 318d005

Upload 2 files

Browse files
Files changed (2) hide show
  1. src/doc_qa.py +124 -0
  2. src/doc_qa_1.py +62 -0
src/doc_qa.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List
2
+
3
+ from langchain.document_loaders import TextLoader #for textfiles
4
+ from langchain.text_splitter import CharacterTextSplitter #text splitter
5
+ from langchain.embeddings import HuggingFaceEmbeddings #for using HugginFace models
6
+
7
+ from langchain.document_loaders import UnstructuredPDFLoader #load pdf
8
+ from langchain.indexes import VectorstoreIndexCreator #vectorize db index with chromadb
9
+ from langchain.chains import RetrievalQA
10
+ from langchain.document_loaders import UnstructuredURLLoader #load urls into docoument-loader
11
+ from langchain.chains.question_answering import load_qa_chain
12
+ from langchain import HuggingFaceHub
13
+ import os
14
+ from langchain.document_loaders import TextLoader, PyMuPDFLoader
15
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
16
+ from langchain.llms import HuggingFacePipeline
17
+ from langchain.vectorstores import FAISS
18
+ from langchain.embeddings import HuggingFaceEmbeddings
19
+ from langchain import PromptTemplate
20
+ from langchain.chains import LLMChain
21
+ from langchain.base_language import BaseLanguageModel
22
+ from docx import Document
23
+ from langchain.document_loaders import DirectoryLoader
24
+ multi_directory_path=r'tmp/'
25
+
26
+ from transformers import pipeline
27
+
28
+ embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/LaBSE')
29
+
30
+
31
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader, Docx2txtLoader
32
+
33
+ after_rag_template = """Answer the question based only on the following context:
34
+ {context}
35
+ Question: {question}
36
+ """
37
+ #pipe = pipeline("text2text-generation", model="google/flan-t5-large" ,max_new_tokens=100)
38
+ #pipe = pipeline("text2text-generation", model="google/mt5-large" ,max_new_tokens=200)
39
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
40
+ #tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b", use_fast=False)
41
+ #model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
42
+ # Load model directly
43
+ from transformers import AutoTokenizer, AutoModelForCausalLM
44
+
45
+ #tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b")
46
+ #model = AutoModelForCausalLM.from_pretrained("rinna/bilingual-gpt-neox-4b")
47
+ #pipe = pipeline("text2text-generation", model="rinna/bilingual-gpt-neox-4b" ,max_new_tokens=200)
48
+ #pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
49
+
50
+ pipe = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")
51
+
52
+ llm = HuggingFacePipeline(pipeline=pipe)
53
+
54
+ def run_custom_qa(question, retrieved_docs):
55
+ context = " ".join([doc.page_content for doc in retrieved_docs])
56
+ output = pipe(question=question, context=context)
57
+ return output["answer"]
58
+
59
+ def docs_vector_index():
60
+ from langchain.document_loaders import DirectoryLoader
61
+ # Define a directory path
62
+ directory_path = r"C:\Users\savni\PycharmProjects\DocsSearchEngine\tmp"
63
+
64
+ # Create the DirectoryLoader, specifying loaders for each file type
65
+ loader = DirectoryLoader(
66
+ directory_path,
67
+ glob="**/*", # This pattern loads all files; modify as needed
68
+
69
+ )
70
+ docs = loader.load()
71
+
72
+ text_splitter = RecursiveCharacterTextSplitter(
73
+ chunk_size=1024, chunk_overlap=100, separators=[" ", ",", "\n", "."]
74
+ )
75
+ print(docs)
76
+ docs_chunks = text_splitter.split_documents(docs)
77
+
78
+ print(f"docs_chunks length: {len(docs_chunks)}")
79
+ print('********************docs_chunks',docs_chunks)
80
+ if len(docs_chunks)>0:
81
+ db = FAISS.from_documents(docs_chunks, embeddings)
82
+ return db
83
+ else:
84
+ return ''
85
+
86
+
87
+ #chain = load_qa_chain(llm, chain_type="stuff")
88
+
89
+ from langchain.prompts import PromptTemplate
90
+
91
+ template = """You are an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. Below is some information.
92
+ {context}
93
+
94
+ Based on the above information only, answer the below question.
95
+
96
+ {question} Be concise."""
97
+
98
+ prompt = PromptTemplate.from_template(template)
99
+ print(prompt.input_variables)
100
+
101
+
102
+ #query_llm = LLMChain(llm=llm, prompt=prompt)
103
+
104
+ # def doc_qa1(query, db):
105
+ # similar_doc = db.similarity_search(query, k=2)
106
+ # doc_c=[]
107
+ # for c in similar_doc:
108
+ # doc_c.append(c.page_content)
109
+ # context=''.join(doc_c)
110
+ # #response = query_llm.run({"context": context, "question": query})
111
+ # response = query_llm.run(context=context, question=query)
112
+ # print('response',response)
113
+ # return response
114
+
115
+ def doc_qa(query, db):
116
+ print("*************************custom qa doc_qa",query)
117
+ retriever = db.as_retriever()
118
+ relevant_docs = retriever.get_relevant_documents(query)
119
+ response=run_custom_qa(query, relevant_docs)
120
+ print('response', response)
121
+ return response
122
+
123
+
124
+
src/doc_qa_1.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.vectorstores import FAISS
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.docstore.document import Document
6
+ from transformers import pipeline
7
+ from langchain.chains.question_answering import load_qa_chain
8
+ import os
9
+
10
+ # Step 1: Load QA pipeline (don't wrap in HuggingFacePipeline)
11
+ embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")
12
+ qa_pipeline = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")
13
+ multi_directory_path=r'tmp/'
14
+
15
+ def docs_vector_index():
16
+ from langchain.document_loaders import DirectoryLoader
17
+ # Define a directory path
18
+ directory_path = r"C:\Users\savni\PycharmProjects\DocsSearchEngine\tmp"
19
+
20
+ # Create the DirectoryLoader, specifying loaders for each file type
21
+ loader = DirectoryLoader(
22
+ directory_path,
23
+ glob="**/*", # This pattern loads all files; modify as needed
24
+
25
+ )
26
+ docs = loader.load()
27
+
28
+ text_splitter = RecursiveCharacterTextSplitter(
29
+ chunk_size=1024, chunk_overlap=100, separators=[" ", ",", "\n", "."]
30
+ )
31
+ print(docs)
32
+ docs_chunks = text_splitter.split_documents(docs)
33
+
34
+ print(f"docs_chunks length: {len(docs_chunks)}")
35
+ print('********************docs_chunks',docs_chunks)
36
+ if len(docs_chunks)>0:
37
+ db = FAISS.from_documents(docs_chunks, embeddings)
38
+ return db
39
+ else:
40
+ return ''
41
+
42
+
43
+
44
+ def run_custom_qa(question, retrieved_docs):
45
+ context = " ".join([doc.page_content for doc in retrieved_docs])
46
+ output = qa_pipeline(question=question, context=context)
47
+ return output #output["answer"]
48
+
49
+ # # Step 6: Ask question
50
+ # question = "東京大学はいつ設立されましたか？"
51
+ # relevant_docs = retriever.get_relevant_documents(question)
52
+ # answer = run_custom_qa(question, relevant_docs)
53
+ #
54
+ # print("Answer:", answer)
55
+
56
+ def doc_qa(query, db):
57
+ print("*************************custom qa doc_qa",query)
58
+ retriever = db.as_retriever()
59
+ relevant_docs = retriever.get_relevant_documents(query)
60
+ response=run_custom_qa(query, relevant_docs)
61
+ print('response', response)
62
+ return response