Spaces:
Running
Running
Upload 3 files
Browse files- chatbot.py +131 -0
- doc_qa.py +124 -0
- doc_qa_1.py +62 -0
chatbot.py
ADDED
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
|
3 |
+
from langchain.text_splitter import CharacterTextSplitter
|
4 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
5 |
+
from langchain.vectorstores import FAISS
|
6 |
+
from langchain.chains import ConversationalRetrievalChain
|
7 |
+
from langchain.llms import OpenAI
|
8 |
+
import os
|
9 |
+
import tempfile
|
10 |
+
from doc_qa import embeddings,llm
|
11 |
+
from doc_qa_1 import embeddings,doc_qa
|
12 |
+
|
13 |
+
def start_message(doc_name):
    """Show the upload confirmation followed by example questions.

    Args:
        doc_name: Name(s) of the uploaded document(s); rendered as inline
            code in the "uploaded" heading.
    """
    st.success("✅ ドキュメントのアップロードが完了しました!")

    guidance = (
        f"### 📄 アップロードされました: `{doc_name}`",
        "これで文書に関する質問ができます。 💬",
        "例えば、次のような質問ができます。:",
        "- この文書は何について書かれていますか?",
        "- 重要なポイントを要約してください。",
        "- 著者は誰ですか?",
        "はじめるには、下に質問を入力してください。!",
    )
    # One st.markdown call per entry, emitted in the listed order.
    for line in guidance:
        st.markdown(line)
|
22 |
+
|
23 |
+
# Function to load individual file
|
24 |
+
def load_file(file, suffix):
    """Load one uploaded file into a list of LangChain documents.

    The upload is copied to a temporary file on disk because the loaders
    used here are constructed from a filesystem path. The temporary file is
    removed again once loading has finished (or failed).

    Args:
        file: Readable binary file-like object (anything with ``read()``).
        suffix: File extension including the leading dot, e.g. ``".pdf"``.
            Matching is case-insensitive.

    Returns:
        The documents produced by the matching loader, or an empty list
        when the extension is not one of ``.pdf``, ``.docx`` or ``.txt``.
    """
    suffix = suffix.lower()
    if suffix not in (".pdf", ".docx", ".txt"):
        # Unsupported type: bail out before writing anything to disk.
        return []

    # delete=False keeps the file on disk after the handle is closed, so it
    # can be handed to the loader by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(file.read())
        temp_file_path = temp_file.name

    try:
        if suffix == ".pdf":
            loader = PyPDFLoader(temp_file_path)
        elif suffix == ".docx":
            loader = Docx2txtLoader(temp_file_path)
        else:
            loader = TextLoader(temp_file_path)
        return loader.load()
    finally:
        # Nothing else cleans this file up, so remove it explicitly.
        os.remove(temp_file_path)
|
39 |
+
# Browser-tab title, icon and overall page layout.
# NOTE(review): the icon URL looks like a placeholder domain — confirm.
st.set_page_config(
    layout="centered",
    page_icon="https://yourdomain.com/logo.png",
    page_title="QA Assistant",
)

# Title
st.title("📄 ドキュメント質問応答支援ツール")
|
46 |
+
|
47 |
+
# Step 1: Upload document
|
48 |
+
# NOTE(review): the welcome text is rendered only when "file_uploaded" is not
# yet in the session state — confirm this nesting is the intended one.
if "file_uploaded" not in st.session_state:
    st.session_state["file_uploaded"] = False
    welcome_text = """
👋 こちらへようこそ!私は文書の内容を理解するためのインテリジェントアシスタントです。

あなたは以下のことができます:

PDF、DOCX、TXTファイルをアップロード

文書の内容について質問

要約、重要ポイント、または具体的な詳細の取得

🛠️ 質問の例:
この文書は何について書かれていますか?

主要なポイントを要約してください。

著者は誰ですか?

重要な日付や締め切りは何ですか?

結論や推奨事項は何ですか?

📂 まず、1つ以上の文書をアップロードしてください。
💬 その後、下に質問を入力しましょう!
"""
    st.markdown(welcome_text)

# List of {"role", "content"} dicts that the script replays as chat bubbles.
if "messages" not in st.session_state:
    st.session_state["messages"] = []
|
77 |
+
|
78 |
+
|
79 |
+
# flag stays 0 until a usable vector index exists; the chat input further
# down is only shown when it is 1.
flag = 0
# Upload multiple files
with st.sidebar:
    uploaded_files = st.file_uploader(
        "PDF、DOCX、またはTXTファイルをアップロードしてください。",
        type=["pdf", "docx", "txt"],
        accept_multiple_files=True,
    )

# Load and process documents
file_names = []
if uploaded_files:
    file_names = [file.name for file in uploaded_files]

    # Streamlit re-executes this script on every interaction, so the index is
    # kept in session state and rebuilt only when the set of uploads changes.
    upload_key = tuple((file.name, file.size) for file in uploaded_files)
    if st.session_state.get("vectorstore_key") != upload_key:
        all_docs = []
        for file in uploaded_files:
            suffix = os.path.splitext(file.name)[1]
            all_docs.extend(load_file(file, suffix))

        # Split and embed documents
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
        split_docs = text_splitter.split_documents(all_docs)

        # An index cannot be built from zero chunks (e.g. empty or image-only
        # files), so record None instead of calling FAISS in that case.
        st.session_state["vectorstore"] = (
            FAISS.from_documents(split_docs, embeddings) if split_docs else None
        )
        st.session_state["vectorstore_key"] = upload_key

    vectorstore = st.session_state["vectorstore"]
    if vectorstore is None:
        st.warning("アップロードされたファイルからテキストを抽出できませんでした。")
    else:
        # Setup ConversationalRetrievalChain
        # NOTE(review): qa_chain is built here but the chat handler in this
        # file answers through doc_qa(); confirm whether it is still needed.
        qa_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vectorstore.as_retriever(),
            return_source_documents=False,
        )
        start_message('\n'.join(file_names))
        flag = 1
|
107 |
+
|
108 |
+
# Initialize session state
|
109 |
+
# (question, answer) tuples collected by the chat handler.
if "chat_history" not in st.session_state:
    st.session_state["chat_history"] = []

# Replay every stored message as a chat bubble, in stored order.
for past in st.session_state["messages"]:
    bubble = st.chat_message(past["role"])
    bubble.write(past["content"])
|
114 |
+
|
115 |
+
# Only offer the chat box once documents have been indexed (flag == 1).
if flag == 1:
    if user_query := st.chat_input():
        st.session_state.messages.append({"role": "user", "content": user_query})
        with st.chat_message("user"):
            st.markdown(f"**Q:** {user_query}")

        # doc_qa returns a mapping; its "answer" entry is the text to show.
        result = doc_qa(user_query, vectorstore)
        answer = result["answer"]

        st.session_state.messages.append({"role": "assistant", "content": answer})
        with st.chat_message("assistant"):
            # The answer is pulled into a local first: reusing double quotes
            # inside a double-quoted f-string is a SyntaxError before
            # Python 3.12.
            st.markdown(f"**A:** {answer}")
        st.session_state.chat_history.append((user_query, answer))
|
125 |
+
|
126 |
+
# # Display conversation history
|
127 |
+
# if st.session_state.chat_history:
|
128 |
+
# st.markdown("### 🗨️ Chat History")
|
129 |
+
# for i, (q, a) in enumerate(st.session_state.chat_history, 1):
|
130 |
+
# st.markdown(f"**Q{i}:** {q}")
|
131 |
+
# st.markdown(f"**A{i}:** {a}")
|
doc_qa.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional, List
|
2 |
+
|
3 |
+
from langchain.document_loaders import TextLoader #for textfiles
|
4 |
+
from langchain.text_splitter import CharacterTextSplitter #text splitter
|
5 |
+
from langchain.embeddings import HuggingFaceEmbeddings #for using HugginFace models
|
6 |
+
|
7 |
+
from langchain.document_loaders import UnstructuredPDFLoader #load pdf
|
8 |
+
from langchain.indexes import VectorstoreIndexCreator #vectorize db index with chromadb
|
9 |
+
from langchain.chains import RetrievalQA
|
10 |
+
from langchain.document_loaders import UnstructuredURLLoader #load urls into docoument-loader
|
11 |
+
from langchain.chains.question_answering import load_qa_chain
|
12 |
+
from langchain import HuggingFaceHub
|
13 |
+
import os
|
14 |
+
from langchain.document_loaders import TextLoader, PyMuPDFLoader
|
15 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
16 |
+
from langchain.llms import HuggingFacePipeline
|
17 |
+
from langchain.vectorstores import FAISS
|
18 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
19 |
+
from langchain import PromptTemplate
|
20 |
+
from langchain.chains import LLMChain
|
21 |
+
from langchain.base_language import BaseLanguageModel
|
22 |
+
from docx import Document
|
23 |
+
from langchain.document_loaders import DirectoryLoader
|
24 |
+
# Relative directory path constant; nothing else in this module references it.
multi_directory_path=r'tmp/'
|
25 |
+
|
26 |
+
from transformers import pipeline
|
27 |
+
|
28 |
+
# Embedding model (LaBSE) that docs_vector_index() passes to FAISS.from_documents.
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/LaBSE')
|
29 |
+
|
30 |
+
|
31 |
+
from langchain_community.document_loaders import TextLoader, PyPDFLoader, Docx2txtLoader
|
32 |
+
|
33 |
+
# Prompt text with {context} and {question} placeholders.
after_rag_template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)
|
37 |
+
#pipe = pipeline("text2text-generation", model="google/flan-t5-large" ,max_new_tokens=100)
|
38 |
+
#pipe = pipeline("text2text-generation", model="google/mt5-large" ,max_new_tokens=200)
|
39 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
40 |
+
#tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b", use_fast=False)
|
41 |
+
#model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")
|
42 |
+
# Load model directly
|
43 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
44 |
+
|
45 |
+
#tokenizer = AutoTokenizer.from_pretrained("rinna/bilingual-gpt-neox-4b")
|
46 |
+
#model = AutoModelForCausalLM.from_pretrained("rinna/bilingual-gpt-neox-4b")
|
47 |
+
#pipe = pipeline("text2text-generation", model="rinna/bilingual-gpt-neox-4b" ,max_new_tokens=200)
|
48 |
+
#pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200)
|
49 |
+
|
50 |
+
# Transformers question-answering pipeline; run_custom_qa() calls it directly.
pipe = pipeline(task="question-answering", model="deepset/xlm-roberta-base-squad2")

# LangChain wrapper around the same pipeline object.
# NOTE(review): confirm that HuggingFacePipeline supports the
# "question-answering" task before invoking llm through a chain.
llm = HuggingFacePipeline(pipeline=pipe)
|
53 |
+
|
54 |
+
def run_custom_qa(question, retrieved_docs):
    """Answer ``question`` from the text of the retrieved documents.

    Args:
        question: The user's question.
        retrieved_docs: Iterable of objects exposing ``page_content``.

    Returns:
        The answer string from the QA pipeline, or ``""`` when the retrieved
        documents contain no text.
    """
    context = " ".join(doc.page_content for doc in retrieved_docs)
    if not context.strip():
        # The QA pipeline rejects an empty context, so there is nothing to
        # extract an answer from; report "no answer" instead of raising.
        return ""
    output = pipe(question=question, context=context)
    return output["answer"]
|
58 |
+
|
59 |
+
def docs_vector_index(directory_path=r"C:\Users\savni\PycharmProjects\DocsSearchEngine\tmp"):
    """Build a FAISS index from every file under a directory.

    Args:
        directory_path: Directory to load documents from. The default is the
            location that used to be hard-coded in this function, so existing
            zero-argument calls behave as before.

    Returns:
        A FAISS vector store built with the module-level ``embeddings``, or
        the empty string ``''`` when loading and splitting yields no chunks.
    """
    from langchain.document_loaders import DirectoryLoader

    loader = DirectoryLoader(
        directory_path,
        glob="**/*",  # This pattern loads all files; modify as needed
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, chunk_overlap=100, separators=[" ", ",", "\n", "."]
    )
    print(docs)
    docs_chunks = text_splitter.split_documents(docs)

    print(f"docs_chunks length: {len(docs_chunks)}")
    print('********************docs_chunks',docs_chunks)
    if not docs_chunks:
        # Kept as '' (not None) because callers may compare against it.
        return ''
    return FAISS.from_documents(docs_chunks, embeddings)
|
85 |
+
|
86 |
+
|
87 |
+
#chain = load_qa_chain(llm, chain_type="stuff")
|
88 |
+
|
89 |
+
from langchain.prompts import PromptTemplate
|
90 |
+
|
91 |
+
# Prompt wording with {context} and {question} placeholders, assembled line by line.
template = "\n".join(
    (
        "You are an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. Below is some information.",
        "{context}",
        "",
        "Based on the above information only, answer the below question.",
        "",
        "{question} Be concise.",
    )
)

prompt = PromptTemplate.from_template(template)
# Printed at import time: the placeholder names found in the template.
print(prompt.input_variables)
|
100 |
+
|
101 |
+
|
102 |
+
#query_llm = LLMChain(llm=llm, prompt=prompt)
|
103 |
+
|
104 |
+
# def doc_qa1(query, db):
|
105 |
+
# similar_doc = db.similarity_search(query, k=2)
|
106 |
+
# doc_c=[]
|
107 |
+
# for c in similar_doc:
|
108 |
+
# doc_c.append(c.page_content)
|
109 |
+
# context=''.join(doc_c)
|
110 |
+
# #response = query_llm.run({"context": context, "question": query})
|
111 |
+
# response = query_llm.run(context=context, question=query)
|
112 |
+
# print('response',response)
|
113 |
+
# return response
|
114 |
+
|
115 |
+
def doc_qa(query, db):
    """Retrieve documents relevant to ``query`` from ``db`` and answer from them.

    Args:
        query: The user's question.
        db: Vector store exposing ``as_retriever()``.

    Returns:
        Whatever ``run_custom_qa`` returns for the query and retrieved docs.
    """
    print("*************************custom qa doc_qa",query)
    hits = db.as_retriever().get_relevant_documents(query)
    answer = run_custom_qa(query, hits)
    print('response', answer)
    return answer
|
122 |
+
|
123 |
+
|
124 |
+
|
doc_qa_1.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain.vectorstores import FAISS
|
2 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
3 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
4 |
+
from langchain.text_splitter import CharacterTextSplitter
|
5 |
+
from langchain.docstore.document import Document
|
6 |
+
from transformers import pipeline
|
7 |
+
from langchain.chains.question_answering import load_qa_chain
|
8 |
+
import os
|
9 |
+
|
10 |
+
# Step 1: Load QA pipeline (don't wrap in HuggingFacePipeline)
|
11 |
+
# Embedding model that docs_vector_index() passes to FAISS.from_documents.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")

# Raw transformers QA pipeline; run_custom_qa() calls it directly.
qa_pipeline = pipeline(task="question-answering", model="deepset/xlm-roberta-base-squad2")

# Relative directory path constant; nothing else in this module references it.
multi_directory_path = r'tmp/'
|
14 |
+
|
15 |
+
def docs_vector_index(directory_path=r"C:\Users\savni\PycharmProjects\DocsSearchEngine\tmp"):
    """Build a FAISS index from every file under a directory.

    Args:
        directory_path: Directory to load documents from. The default is the
            location that used to be hard-coded in this function, so existing
            zero-argument calls behave as before.

    Returns:
        A FAISS vector store built with the module-level ``embeddings``, or
        the empty string ``''`` when loading and splitting yields no chunks.
    """
    from langchain.document_loaders import DirectoryLoader

    loader = DirectoryLoader(
        directory_path,
        glob="**/*",  # This pattern loads all files; modify as needed
    )
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024, chunk_overlap=100, separators=[" ", ",", "\n", "."]
    )
    print(docs)
    docs_chunks = text_splitter.split_documents(docs)

    print(f"docs_chunks length: {len(docs_chunks)}")
    print('********************docs_chunks',docs_chunks)
    if not docs_chunks:
        # Kept as '' (not None) because callers may compare against it.
        return ''
    return FAISS.from_documents(docs_chunks, embeddings)
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
def run_custom_qa(question, retrieved_docs):
    """Run the QA pipeline over the text of the retrieved documents.

    Args:
        question: The user's question.
        retrieved_docs: Iterable of objects exposing ``page_content``.

    Returns:
        The pipeline's result mapping (callers read its ``"answer"`` entry).
        When the retrieved documents contain no text, a mapping of the same
        shape with an empty answer is returned instead of calling the
        pipeline.
    """
    context = " ".join(doc.page_content for doc in retrieved_docs)
    if not context.strip():
        # The QA pipeline rejects an empty context; hand back an empty
        # result so callers indexing ["answer"] keep working.
        return {"score": 0.0, "start": 0, "end": 0, "answer": ""}
    return qa_pipeline(question=question, context=context)
|
48 |
+
|
49 |
+
# # Step 6: Ask question
|
50 |
+
# question = "東京大学はいつ設立されましたか?"
|
51 |
+
# relevant_docs = retriever.get_relevant_documents(question)
|
52 |
+
# answer = run_custom_qa(question, relevant_docs)
|
53 |
+
#
|
54 |
+
# print("Answer:", answer)
|
55 |
+
|
56 |
+
def doc_qa(query, db):
    """Retrieve documents relevant to ``query`` from ``db`` and answer from them.

    Args:
        query: The user's question.
        db: Vector store exposing ``as_retriever()``.

    Returns:
        Whatever ``run_custom_qa`` returns for the query and retrieved docs.
    """
    print("*************************custom qa doc_qa",query)
    hits = db.as_retriever().get_relevant_documents(query)
    answer = run_custom_qa(query, hits)
    print('response', answer)
    return answer
|