singhdevendra58 committed on
Commit
0cfdd6a
·
verified ·
1 Parent(s): 0126b6f

Upload 3 files

Browse files
Files changed (3) hide show
  1. chatbot.py +131 -0
  2. doc_qa.py +124 -0
  3. doc_qa_1.py +62 -0
chatbot.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
3
+ from langchain.text_splitter import CharacterTextSplitter
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.vectorstores import FAISS
6
+ from langchain.chains import ConversationalRetrievalChain
7
+ from langchain.llms import OpenAI
8
+ import os
9
+ import tempfile
10
+ from doc_qa import embeddings,llm
11
+ from doc_qa_1 import embeddings,doc_qa
12
+
13
def start_message(doc_name):
    """Render the upload confirmation followed by example questions.

    Args:
        doc_name: Text interpolated into the "uploaded" heading.
    """
    st.success("✅ ドキュメントのアップロードが完了しました!")
    # Each entry is rendered as its own markdown element, in order.
    guidance_lines = (
        f"### 📄 アップロードされました: `{doc_name}`",
        "これで文書に関する質問ができます。 💬",
        "例えば、次のような質問ができます。:",
        "- この文書は何について書かれていますか?",
        "- 重要なポイントを要約してください。",
        "- 著者は誰ですか?",
        "はじめるには、下に質問を入力してください。!",
    )
    for line in guidance_lines:
        st.markdown(line)
22
+
23
+ # Function to load individual file
24
def load_file(file, suffix):
    """Load one uploaded file into a list of documents.

    The upload is copied to a temporary file because the loaders are
    constructed from a filesystem path. The temporary file is removed once
    loading has finished so repeated uploads do not accumulate on disk.

    Args:
        file: Binary file-like object exposing ``read()``.
        suffix: File extension including the leading dot (for example
            ``".pdf"``). Matched case-insensitively.

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``[]`` when the
        extension is not one of ``.pdf``, ``.docx`` or ``.txt``.
    """
    suffix = suffix.lower()
    # Reject unsupported types before touching the disk.
    if suffix not in (".pdf", ".docx", ".txt"):
        return []

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(file.read())
        temp_file_path = temp_file.name

    try:
        if suffix == ".pdf":
            loader = PyPDFLoader(temp_file_path)
        elif suffix == ".docx":
            loader = Docx2txtLoader(temp_file_path)
        else:
            loader = TextLoader(temp_file_path)
        return loader.load()
    finally:
        # delete=False keeps the file after the ``with`` block, so it has to
        # be cleaned up explicitly, including when the loader raises.
        os.remove(temp_file_path)
39
# Page configuration: title, icon URL and layout.
st.set_page_config(
    page_title="QA Assistant",
    page_icon="https://yourdomain.com/logo.png",
    layout="centered"
)
# Title
st.title("📄 ドキュメント質問応答支援ツール")

# Step 1: Upload document
# When the session has no file_uploaded key yet, create it and show the
# welcome text.
# NOTE(review): file_uploaded is never set to True in the visible script.
if "file_uploaded" not in st.session_state:
    st.session_state.file_uploaded = False
    st.markdown("""
👋 こちらへようこそ!私は文書の内容を理解するためのインテリジェントアシスタントです。

あなたは以下のことができます:

PDF、DOCX、TXTファイルをアップロード

文書の内容について質問

要約、重要ポイント、または具体的な詳細の取得

🛠️ 質問の例:
この文書は何について書かれていますか?

主要なポイントを要約してください。

著者は誰ですか?

重要な日付や締め切りは何ですか?

結論や推奨事項は何ですか?

📂 まず、1つ以上の文書をアップロードしてください。
💬 その後、下に質問を入力しましょう!
""")
# Chat transcript as a list of {"role", "content"} dicts; rendered further
# down the script.
if "messages" not in st.session_state:
    st.session_state.messages = []
77
+
78
+
79
# Set to 1 once the uploaded documents have been indexed; gates the chat input.
flag = 0
# Upload multiple files
with st.sidebar:
    uploaded_files = st.file_uploader("PDF、DOCX、またはTXTファイルをアップロードしてください。", type=["pdf", "docx", "txt"], accept_multiple_files=True)
# Load and process documents
file_names=[]
if uploaded_files:
    all_docs = []
    for file in uploaded_files:
        # The extension (with its dot) selects the loader inside load_file().
        suffix = os.path.splitext(file.name)[1]
        docs = load_file(file, suffix)
        all_docs.extend(docs)
        file_names.append(file.name)

    # Split and embed documents
    # NOTE(review): nothing here is cached, so the index is rebuilt every
    # time this part of the script runs with uploaded files.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    split_docs = text_splitter.split_documents(all_docs)
    #embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(split_docs, embeddings)

    # Setup ConversationalRetrievalChain
    # NOTE(review): qa_chain is not referenced again in the visible script;
    # answers come from doc_qa() instead.
    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        return_source_documents=False
    )
    start_message('\n'.join(file_names))
    flag = 1

# Initialize session state
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

# Replay the stored transcript, one chat bubble per message.
for msg in st.session_state.messages:
    st.chat_message(msg["role"]).write(msg["content"])
115
# The chat box is only offered once the uploaded documents are indexed.
if flag==1:
    if user_query := st.chat_input():
        st.session_state.messages.append({"role": "user", "content": user_query})
        with st.chat_message("user"):
            st.markdown(f"**Q:** {user_query}")
        result=doc_qa(user_query,vectorstore)
        # Extract the answer once. Writing result["answer"] inside a
        # double-quoted f-string is a SyntaxError before Python 3.12.
        answer = result["answer"]
        st.session_state.messages.append({"role": "assistant", "content": answer})
        with st.chat_message("assistant"):
            st.markdown(f"**A:** {answer}")
        # Question/answer pairs are also kept separately from the transcript.
        st.session_state.chat_history.append((user_query, answer))
125
+
126
+ # # Display conversation history
127
+ # if st.session_state.chat_history:
128
+ # st.markdown("### 🗨️ Chat History")
129
+ # for i, (q, a) in enumerate(st.session_state.chat_history, 1):
130
+ # st.markdown(f"**Q{i}:** {q}")
131
+ # st.markdown(f"**A{i}:** {a}")
doc_qa.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, List
2
+
3
+ from langchain.document_loaders import TextLoader #for textfiles
4
+ from langchain.text_splitter import CharacterTextSplitter #text splitter
5
+ from langchain.embeddings import HuggingFaceEmbeddings #for using HuggingFace models
6
+
7
+ from langchain.document_loaders import UnstructuredPDFLoader #load pdf
8
+ from langchain.indexes import VectorstoreIndexCreator #vectorize db index with chromadb
9
+ from langchain.chains import RetrievalQA
10
+ from langchain.document_loaders import UnstructuredURLLoader #load urls into document-loader
11
+ from langchain.chains.question_answering import load_qa_chain
12
+ from langchain import HuggingFaceHub
13
+ import os
14
+ from langchain.document_loaders import TextLoader, PyMuPDFLoader
15
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
16
+ from langchain.llms import HuggingFacePipeline
17
+ from langchain.vectorstores import FAISS
18
+ from langchain.embeddings import HuggingFaceEmbeddings
19
+ from langchain import PromptTemplate
20
+ from langchain.chains import LLMChain
21
+ from langchain.base_language import BaseLanguageModel
22
+ from docx import Document
23
+ from langchain.document_loaders import DirectoryLoader
24
+ multi_directory_path=r'tmp/'
25
+
26
+ from transformers import pipeline
27
+
28
+ embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/LaBSE')
29
+
30
+
31
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader, Docx2txtLoader
32
+
33
# Prompt text with {context} and {question} placeholders.
# NOTE(review): not referenced anywhere else in the visible source.
after_rag_template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
37
+ #pipe = pipeline("text2text-generation", model="google/flan-t5-large" ,max_new_tokens=100)
38
+ #pipe = pipeline("text2text-generation", model="google/mt5-large" ,max_new_tokens=200)
39
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
40
+ #tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b", use_fast=False)
41
+ #model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
42
+ # Load model directly
43
+ from transformers import AutoTokenizer, AutoModelForCausalLM
44
+
45
+ #tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b")
46
+ #model = AutoModelForCausalLM.from_pretrained("rinna/bilingual-gpt-neox-4b")
47
+ #pipe = pipeline("text2text-generation", model="rinna/bilingual-gpt-neox-4b" ,max_new_tokens=200)
48
+ #pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
49
+
50
# Hugging Face question-answering pipeline; run_custom_qa() calls it directly.
pipe = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")

# NOTE(review): this wraps a question-answering pipeline as a LangChain LLM.
# Confirm HuggingFacePipeline supports that task before using llm in a chain.
llm = HuggingFacePipeline(pipeline=pipe)
53
+
54
def run_custom_qa(question, retrieved_docs):
    """Answer ``question`` using the text of the retrieved documents.

    The ``page_content`` of every document is joined with single spaces and
    passed as the context to the module-level ``pipe``.

    Args:
        question: Question text.
        retrieved_docs: Iterable of objects exposing ``page_content``.

    Returns:
        The ``"answer"`` entry of the pipeline output.
    """
    passages = (item.page_content for item in retrieved_docs)
    qa_result = pipe(question=question, context=" ".join(passages))
    return qa_result["answer"]
58
+
59
def docs_vector_index(directory_path=None):
    """Build a FAISS index from the files found under a directory.

    Args:
        directory_path: Directory handed to ``DirectoryLoader``. When omitted,
            the module-level ``multi_directory_path`` is used, rather than an
            absolute path that only exists on one developer's machine.

    Returns:
        The FAISS store built from the document chunks, or ``''`` when no
        chunks were produced. The empty string is kept so existing callers
        that test for it keep working.
    """
    from langchain.document_loaders import DirectoryLoader

    if directory_path is None:
        directory_path = multi_directory_path

    # Create the DirectoryLoader, specifying loaders for each file type
    loader = DirectoryLoader(
        directory_path,
        glob="**/*",  # This pattern loads all files; modify as needed
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, chunk_overlap=100, separators=[" ", ",", "\n", "."]
    )
    print(docs)
    docs_chunks = text_splitter.split_documents(docs)

    print(f"docs_chunks length: {len(docs_chunks)}")
    print('********************docs_chunks',docs_chunks)
    if not docs_chunks:
        return ''
    return FAISS.from_documents(docs_chunks, embeddings)
85
+
86
+
87
+ #chain = load_qa_chain(llm, chain_type="stuff")
88
+
89
+ from langchain.prompts import PromptTemplate
90
+
91
# Prompt text with {context} and {question} placeholders.
template = """You are an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. Below is some information.
{context}

Based on the above information only, answer the below question.

{question} Be concise."""

prompt = PromptTemplate.from_template(template)
# Runs at import time: prints the template's input variable names.
print(prompt.input_variables)
100
+
101
+
102
+ #query_llm = LLMChain(llm=llm, prompt=prompt)
103
+
104
+ # def doc_qa1(query, db):
105
+ # similar_doc = db.similarity_search(query, k=2)
106
+ # doc_c=[]
107
+ # for c in similar_doc:
108
+ # doc_c.append(c.page_content)
109
+ # context=''.join(doc_c)
110
+ # #response = query_llm.run({"context": context, "question": query})
111
+ # response = query_llm.run(context=context, question=query)
112
+ # print('response',response)
113
+ # return response
114
+
115
def doc_qa(query, db):
    """Retrieve documents for ``query`` from ``db`` and answer from them.

    Args:
        query: Question text.
        db: Object whose ``as_retriever()`` returns a retriever exposing
            ``get_relevant_documents``.

    Returns:
        Whatever ``run_custom_qa`` returns for the retrieved documents.
    """
    print("*************************custom qa doc_qa",query)
    matches = db.as_retriever().get_relevant_documents(query)
    answer = run_custom_qa(query, matches)
    print('response', answer)
    return answer
122
+
123
+
124
+
doc_qa_1.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.vectorstores import FAISS
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.docstore.document import Document
6
+ from transformers import pipeline
7
+ from langchain.chains.question_answering import load_qa_chain
8
+ import os
9
+
10
+ # Step 1: Load QA pipeline (don't wrap in HuggingFacePipeline)
11
# Embedding model passed to FAISS.from_documents() in docs_vector_index().
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")
# Question-answering pipeline called directly by run_custom_qa().
qa_pipeline = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2")
# Relative 'tmp/' directory path.
multi_directory_path=r'tmp/'
14
+
15
def docs_vector_index(directory_path=None):
    """Build a FAISS index from the files found under a directory.

    Args:
        directory_path: Directory handed to ``DirectoryLoader``. When omitted,
            the module-level ``multi_directory_path`` is used, rather than an
            absolute path that only exists on one developer's machine.

    Returns:
        The FAISS store built from the document chunks, or ``''`` when no
        chunks were produced. The empty string is kept so existing callers
        that test for it keep working.
    """
    from langchain.document_loaders import DirectoryLoader

    if directory_path is None:
        directory_path = multi_directory_path

    # Create the DirectoryLoader, specifying loaders for each file type
    loader = DirectoryLoader(
        directory_path,
        glob="**/*",  # This pattern loads all files; modify as needed
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, chunk_overlap=100, separators=[" ", ",", "\n", "."]
    )
    print(docs)
    docs_chunks = text_splitter.split_documents(docs)

    print(f"docs_chunks length: {len(docs_chunks)}")
    print('********************docs_chunks',docs_chunks)
    if not docs_chunks:
        return ''
    return FAISS.from_documents(docs_chunks, embeddings)
41
+
42
+
43
+
44
def run_custom_qa(question, retrieved_docs):
    """Run the question-answering pipeline over the retrieved documents.

    The ``page_content`` of every document is joined with single spaces and
    passed as the context to the module-level ``qa_pipeline``.

    Args:
        question: Question text.
        retrieved_docs: Iterable of objects exposing ``page_content``.

    Returns:
        The pipeline output itself, not just its ``"answer"`` entry.
    """
    passages = []
    for item in retrieved_docs:
        passages.append(item.page_content)
    merged_context = " ".join(passages)
    return qa_pipeline(question=question, context=merged_context)
48
+
49
+ # # Step 6: Ask question
50
+ # question = "東京大学はいつ設立されましたか?"
51
+ # relevant_docs = retriever.get_relevant_documents(question)
52
+ # answer = run_custom_qa(question, relevant_docs)
53
+ #
54
+ # print("Answer:", answer)
55
+
56
def doc_qa(query, db):
    """Look up documents relevant to ``query`` in ``db`` and run QA on them.

    Args:
        query: Question text.
        db: Object whose ``as_retriever()`` returns a retriever exposing
            ``get_relevant_documents``.

    Returns:
        Whatever ``run_custom_qa`` returns for the retrieved documents.
    """
    print("*************************custom qa doc_qa",query)
    hits = db.as_retriever().get_relevant_documents(query)
    qa_output = run_custom_qa(query, hits)
    print('response', qa_output)
    return qa_output