init
- .env +1 -0
- .gitignore +1 -0
- Dockerfile +20 -0
- app.py +32 -0
- backend.py +40 -0
- data/sample.docx +0 -0
- requirements.txt +12 -0
- utils.py +82 -0
.env
ADDED
@@ -0,0 +1 @@
+OPENAI_API_KEY = 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
.gitignore
ADDED
@@ -0,0 +1 @@
+*.env
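Note: `.env` itself is added in this same commit, so the `*.env` rule only keeps future copies out of version control; the key committed above remains in the repository history and would need to be revoked.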
Dockerfile
ADDED
@@ -0,0 +1,20 @@
+FROM python:3.9
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY . .
+
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+
+WORKDIR $HOME/app
+
+COPY --chown=user . $HOME/app
+
+CMD ["uvicorn", "backend:app", "--host", "0.0.0.0", "--port", "7860"]
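For local testing, the image can be built and run with the standard Docker CLI, e.g. `docker build -t doc-qa .` followed by `docker run -p 7860:7860 doc-qa` (the image name `doc-qa` is arbitrary); since `.env` is copied into the image, `load_dotenv()` finds the key there. On Hugging Face Spaces, the build and the `uvicorn` launch on port 7860 happen automatically. Note that the container starts only the FastAPI backend; the Streamlit frontend in `app.py` must be launched separately with `streamlit run app.py`.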
app.py
ADDED
@@ -0,0 +1,32 @@
+import streamlit as st
+import requests
+
+backend_url = "http://127.0.0.1:7860"  # the backend binds 0.0.0.0; clients connect via localhost
+st.header('fiXit Assignment', divider='orange')
+st.header('Document-based QA Chatbot')
+
+if 'uploadFlag' not in st.session_state:
+    upload = st.file_uploader("Upload a DOCX file", type="docx")
+    if upload:
+        files = {'file': upload.getvalue()}
+        response = requests.post(f"{backend_url}/uploaddoc", files=files)
+        if response.status_code == 200:
+            st.success("Document uploaded successfully")
+            st.session_state.uploadFlag = True
+
+if 'uploadFlag' in st.session_state:
+    if 'chat_history' not in st.session_state:
+        st.session_state.chat_history = []
+
+    question = st.text_input("Enter your question/query: ")
+    if question:
+        response = requests.post(f"{backend_url}/question", json={"question": question})
+        if response.status_code == 200:
+            answer = response.json().get("answer")
+            st.session_state.chat_history.append(("user", question))
+            st.session_state.chat_history.append(("assistant", answer))
+        else:
+            st.error("Failed to get an answer from the backend.")
+    for role, message in st.session_state.chat_history:
+        with st.chat_message(role):
+            st.markdown(message)
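One caveat in the question block above: `st.text_input` keeps its value across reruns, so any later widget interaction re-posts the same question and appends duplicate entries to the chat history. A minimal sketch of an alternative using `st.chat_input`, which returns the submitted text only on the run where the user submits it and `None` otherwise:

# Sketch: st.chat_input yields the message once per submission, so the
# POST and the history appends run only when the user actually submits.
question = st.chat_input("Enter your question/query:")
if question:
    response = requests.post(f"{backend_url}/question", json={"question": question})
    if response.status_code == 200:
        st.session_state.chat_history.append(("user", question))
        st.session_state.chat_history.append(("assistant", response.json().get("answer")))
    else:
        st.error("Failed to get an answer from the backend.")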
backend.py
ADDED
@@ -0,0 +1,40 @@
+from fastapi import FastAPI, UploadFile, File
+from pydantic import BaseModel
+from utils import *
+from io import BytesIO
+
+document_data = {}
+
+app = FastAPI()
+
+class Q(BaseModel):
+    question: str
+
+
+@app.post("/uploaddoc")
+async def upload_document(file: UploadFile = File(...)):
+    content = await file.read()
+    buffer = BytesIO(content)
+
+    text, tables = textprocessing(buffer)
+
+    text_embds = embed_query(text)
+    table_texts = ["-".join(["|".join(row) for row in table]) for table in tables]
+    table_embds = embed_query(table_texts)
+
+    vectordb = setvecdb(text_embds, table_embds, text, table_texts)
+    document_data["vectordb"] = vectordb
+    return {"message": "Document uploaded and processed successfully"}
+
+
+@app.post("/question")
+async def processquestion(question: Q):
+    query_text = question.question
+    vectordb = document_data.get("vectordb")
+    if vectordb is None:
+        return {"error": "No document uploaded"}
+
+    search_results = retriever(vectordb, query_text)
+    context = "\n".join(search_results['content'])
+    answer = generate_answer(query_text, context)
+    return {"answer": answer}
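A quick way to exercise both routes without the Streamlit frontend is FastAPI's TestClient. A minimal sketch, assuming `data/sample.docx` from this commit, a valid OPENAI_API_KEY in the environment, and the `httpx` package that TestClient depends on:

# Smoke test for the two endpoints; runs the app in-process.
from fastapi.testclient import TestClient
from backend import app

client = TestClient(app)

with open("data/sample.docx", "rb") as f:
    r = client.post("/uploaddoc", files={"file": ("sample.docx", f)})
print(r.json())  # expect the "uploaded and processed" message

r = client.post("/question", json={"question": "Summarise the document."})
print(r.json().get("answer"))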
data/sample.docx
ADDED
Binary file (8.2 kB)
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+openai
+scikit-learn
+streamlit
+pandas
+pydantic
+python-docx
+numpy
+fastapi
+uvicorn
+python-multipart
+python-dotenv
+requests
utils.py
ADDED
@@ -0,0 +1,82 @@
+import docx
+from openai import OpenAI
+import pandas as pd
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from dotenv import load_dotenv
+import os
+
+
+load_dotenv()
+client = OpenAI(
+    api_key=os.getenv("OPENAI_API_KEY")
+)
+
+def textprocessing(file_path):
+    doc = docx.Document(file_path)
+    text = []
+    tables = []
+
+    for paragraph in doc.paragraphs:
+        if paragraph.text.strip():
+            text.append(paragraph.text)
+
+    for table in doc.tables:
+        table_data = []
+        for row in table.rows:
+            row_data = []
+            for cell in row.cells:
+                row_data.append(cell.text)
+            table_data.append(row_data)
+        tables.append(table_data)
+
+    return text, tables
+
+def embed_query(docs):
+    embds = []
+    for text in docs:
+        response = client.embeddings.create(input=text, model="text-embedding-ada-002")
+        embds.append(response.data[0].embedding)
+    return embds
+
+def setvecdb(text_embds, table_embds, text, tables):
+    data = []
+
+    for idx, embedding in enumerate(text_embds):
+        data.append({"type": "text", "content": text[idx], "embedding": embedding})
+
+    for embedding, table_text in zip(table_embds, tables):
+        data.append({"type": "table", "content": table_text, "embedding": embedding})
+
+    vectordb = pd.DataFrame(data)
+    return vectordb
+
+def retriever(vectordb, query_text):
+    qembed = client.embeddings.create(input=query_text, model="text-embedding-ada-002")
+    query_embedding = np.array(qembed.data[0].embedding).reshape(1, -1)
+
+    embeddings = np.stack(vectordb['embedding'].values)
+    similarities = cosine_similarity(query_embedding, embeddings).flatten()
+
+    vectordb['similarity'] = similarities
+    results = vectordb.nlargest(4, 'similarity')
+
+    return results
+
+prompt = """You are a smart chatbot that answers simple and complex questions from a user based on the contents of a document the user uploaded. The document can contain tabular data as well as textual data, with table rows and columns separated by bars (|) and hyphens (-).
+Do not rely only on direct lookups; use multi-step reasoning where a question requires it. The context and the question are given below. If you do not know the answer, simply respond that you cannot answer it based on the data; do not hallucinate. Make sure to answer questions about tabular data properly as well.
+Explain your purpose if asked, and make sure you are helpful."""
+
+def generate_answer(question, ctxt):
+    response = client.chat.completions.create(
+        model="gpt-3.5-turbo-0125",
+        messages=[
+            {
+                "role": "system",
+                "content": prompt
+            },
+            {"role": "user", "content": f"Context:\n{ctxt}"},
+            {"role": "user", "content": f"Question: {question}"}
+        ]
+    )
+    return response.choices[0].message.content
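Taken together, utils.py implements a small retrieval pipeline: extract paragraphs and tables from the DOCX, embed each chunk, rank chunks by cosine similarity against the query embedding, and hand the top four to the chat model. An end-to-end sketch mirroring what backend.py does across its two endpoints (assumes `data/sample.docx` and a valid key in `.env`; the query string is illustrative):

# End-to-end sketch of the utils.py pipeline.
text, tables = textprocessing("data/sample.docx")
table_texts = ["-".join("|".join(row) for row in table) for table in tables]

vectordb = setvecdb(embed_query(text), embed_query(table_texts), text, table_texts)

query = "What does the document describe?"
hits = retriever(vectordb, query)      # top-4 rows by cosine similarity
context = "\n".join(hits["content"])   # concatenate retrieved chunks
print(generate_answer(query, context))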