singhdevendra58 committed
Commit 1e17e40 · verified · 1 Parent(s): 400fe06

Update src/doc_qa.py

Files changed (1): src/doc_qa.py (+127 −124)
src/doc_qa.py CHANGED
@@ -1,124 +1,127 @@
- from typing import Optional, List
-
- from langchain.document_loaders import TextLoader # for text files
- from langchain.text_splitter import CharacterTextSplitter # text splitter
- from langchain.embeddings import HuggingFaceEmbeddings # for using HuggingFace models
-
- from langchain.document_loaders import UnstructuredPDFLoader # load pdf
- from langchain.indexes import VectorstoreIndexCreator # vectorize db index with chromadb
- from langchain.chains import RetrievalQA
- from langchain.document_loaders import UnstructuredURLLoader # load urls into document loader
- from langchain.chains.question_answering import load_qa_chain
- from langchain import HuggingFaceHub
- import os
- from langchain.document_loaders import TextLoader, PyMuPDFLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.llms import HuggingFacePipeline
- from langchain.vectorstores import FAISS
- from langchain.embeddings import HuggingFaceEmbeddings
- from langchain import PromptTemplate
- from langchain.chains import LLMChain
- from langchain.base_language import BaseLanguageModel
- from docx import Document
- from langchain.document_loaders import DirectoryLoader
- multi_directory_path=r'tmp/'
-
- from transformers import pipeline
-
- embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/LaBSE')
-
-
- from langchain_community.document_loaders import TextLoader, PyPDFLoader, Docx2txtLoader
-
- after_rag_template = """Answer the question based only on the following context:
- {context}
- Question: {question}
- """
- #pipe = pipeline("text2text-generation", model="google/flan-t5-large" ,max_new_tokens=100)
- #pipe = pipeline("text2text-generation", model="google/mt5-large" ,max_new_tokens=200)
- from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
- #tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b", use_fast=False)
- #model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
- # Load model directly
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- #tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b")
- #model = AutoModelForCausalLM.from_pretrained("rinna/bilingual-gpt-neox-4b")
- #pipe = pipeline("text2text-generation", model="rinna/bilingual-gpt-neox-4b" ,max_new_tokens=200)
- #pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
-
- pipe = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")
-
- llm = HuggingFacePipeline(pipeline=pipe)
-
- def run_custom_qa(question, retrieved_docs):
-     context = " ".join([doc.page_content for doc in retrieved_docs])
-     output = pipe(question=question, context=context)
-     return output["answer"]
-
- def docs_vector_index():
-     from langchain.document_loaders import DirectoryLoader
-     # Define a directory path
-     directory_path = r"C:\Users\savni\PycharmProjects\DocsSearchEngine\tmp"
-
-     # Create the DirectoryLoader, specifying loaders for each file type
-     loader = DirectoryLoader(
-         directory_path,
-         glob="**/*", # This pattern loads all files; modify as needed
-
-     )
-     docs = loader.load()
-
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=1024, chunk_overlap=100, separators=[" ", ",", "\n", "."]
-     )
-     print(docs)
-     docs_chunks = text_splitter.split_documents(docs)
-
-     print(f"docs_chunks length: {len(docs_chunks)}")
-     print('********************docs_chunks',docs_chunks)
-     if len(docs_chunks)>0:
-         db = FAISS.from_documents(docs_chunks, embeddings)
-         return db
-     else:
-         return ''
-
-
- #chain = load_qa_chain(llm, chain_type="stuff")
-
- from langchain.prompts import PromptTemplate
-
- template = """You are an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. Below is some information.
- {context}
-
- Based on the above information only, answer the below question.
-
- {question} Be concise."""
-
- prompt = PromptTemplate.from_template(template)
- print(prompt.input_variables)
-
-
- #query_llm = LLMChain(llm=llm, prompt=prompt)
-
- # def doc_qa1(query, db):
- #     similar_doc = db.similarity_search(query, k=2)
- #     doc_c=[]
- #     for c in similar_doc:
- #         doc_c.append(c.page_content)
- #     context=''.join(doc_c)
- #     #response = query_llm.run({"context": context, "question": query})
- #     response = query_llm.run(context=context, question=query)
- #     print('response',response)
- #     return response
-
- def doc_qa(query, db):
-     print("*************************custom qa doc_qa",query)
-     retriever = db.as_retriever()
-     relevant_docs = retriever.get_relevant_documents(query)
-     response=run_custom_qa(query, relevant_docs)
-     print('response', response)
-     return response
-
-
-
+ from typing import Optional, List
+
+ from langchain.document_loaders import TextLoader # for text files
+ from langchain.text_splitter import CharacterTextSplitter # text splitter
+ from langchain.embeddings import HuggingFaceEmbeddings # for using HuggingFace models
+
+ from langchain.document_loaders import UnstructuredPDFLoader # load pdf
+ from langchain.indexes import VectorstoreIndexCreator # vectorize db index with chromadb
+ from langchain.chains import RetrievalQA
+ from langchain.document_loaders import UnstructuredURLLoader # load urls into document loader
+ from langchain.chains.question_answering import load_qa_chain
+ from langchain import HuggingFaceHub
+ import os
+ from langchain.document_loaders import TextLoader, PyMuPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.llms import HuggingFacePipeline
+ from langchain.vectorstores import FAISS
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain.base_language import BaseLanguageModel
+ from docx import Document
+ from langchain.document_loaders import DirectoryLoader
+ multi_directory_path=r'tmp/'
+
+ from transformers import pipeline
+
+ from sentence_transformers import SentenceTransformer
+ model = SentenceTransformer("sentence-transformers/LaBSE")
+
+ embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/LaBSE')
+
+
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader, Docx2txtLoader
+
+ after_rag_template = """Answer the question based only on the following context:
+ {context}
+ Question: {question}
+ """
+ #pipe = pipeline("text2text-generation", model="google/flan-t5-large" ,max_new_tokens=100)
+ #pipe = pipeline("text2text-generation", model="google/mt5-large" ,max_new_tokens=200)
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+ #tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b", use_fast=False)
+ #model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
+ # Load model directly
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ #tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b")
+ #model = AutoModelForCausalLM.from_pretrained("rinna/bilingual-gpt-neox-4b")
+ #pipe = pipeline("text2text-generation", model="rinna/bilingual-gpt-neox-4b" ,max_new_tokens=200)
+ #pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
+
+ pipe = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")
+
+ llm = HuggingFacePipeline(pipeline=pipe)
+
+ def run_custom_qa(question, retrieved_docs):
+     context = " ".join([doc.page_content for doc in retrieved_docs])
+     output = pipe(question=question, context=context)
+     return output["answer"]
+
+ def docs_vector_index():
+     from langchain.document_loaders import DirectoryLoader
+     # Define a directory path
+     directory_path = r"C:\Users\savni\PycharmProjects\DocsSearchEngine\tmp"
+
+     # Create the DirectoryLoader, specifying loaders for each file type
+     loader = DirectoryLoader(
+         directory_path,
+         glob="**/*", # This pattern loads all files; modify as needed
+
+     )
+     docs = loader.load()
+
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1024, chunk_overlap=100, separators=[" ", ",", "\n", "."]
+     )
+     print(docs)
+     docs_chunks = text_splitter.split_documents(docs)
+
+     print(f"docs_chunks length: {len(docs_chunks)}")
+     print('********************docs_chunks',docs_chunks)
+     if len(docs_chunks)>0:
+         db = FAISS.from_documents(docs_chunks, embeddings)
+         return db
+     else:
+         return ''
+
+
+ #chain = load_qa_chain(llm, chain_type="stuff")
+
+ from langchain.prompts import PromptTemplate
+
+ template = """You are an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. Below is some information.
+ {context}
+
+ Based on the above information only, answer the below question.
+
+ {question} Be concise."""
+
+ prompt = PromptTemplate.from_template(template)
+ print(prompt.input_variables)
+
+
+ #query_llm = LLMChain(llm=llm, prompt=prompt)
+
+ # def doc_qa1(query, db):
+ #     similar_doc = db.similarity_search(query, k=2)
+ #     doc_c=[]
+ #     for c in similar_doc:
+ #         doc_c.append(c.page_content)
+ #     context=''.join(doc_c)
+ #     #response = query_llm.run({"context": context, "question": query})
+ #     response = query_llm.run(context=context, question=query)
+ #     print('response',response)
+ #     return response
+
+ def doc_qa(query, db):
+     print("*************************custom qa doc_qa",query)
+     retriever = db.as_retriever()
+     relevant_docs = retriever.get_relevant_documents(query)
+     response=run_custom_qa(query, relevant_docs)
+     print('response', response)
+     return response
+
+
+
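
For reference, the question-answering pipeline that run_custom_qa wraps is extractive: deepset/xlm-roberta-base-squad2 selects an answer span out of the supplied context rather than generating free text. A minimal sketch of the call's return shape; the question and context strings below are illustrative, not taken from the commit:

from transformers import pipeline

# Extractive QA: the model picks a span out of `context` as the answer.
pipe = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")

result = pipe(
    question="Which library builds the vector index?",  # illustrative input
    context="The documents are chunked and indexed with FAISS before retrieval.",
)

# For a single question/context pair the pipeline returns a dict shaped like
# {"score": float, "start": int, "end": int, "answer": str},
# which is why run_custom_qa returns output["answer"].
print(result["answer"])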
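
A hedged end-to-end sketch of how the file's two entry points fit together: docs_vector_index() loads and chunks everything under the hard-coded directory_path and embeds it into a FAISS index, and doc_qa() retrieves relevant chunks and runs the extractive pipeline over them. The import path below assumes src/ is importable as a package, and the query string is made up:

# Minimal usage sketch, assuming src/doc_qa.py is importable as src.doc_qa
# and the indexed directory contains at least one loadable document.
from src.doc_qa import docs_vector_index, doc_qa

db = docs_vector_index()  # load, chunk, and embed the documents
if db:                    # docs_vector_index returns '' when no chunks were produced
    answer = doc_qa("What is this document about?", db)
    print(answer)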