# File size: 5,986 bytes (revision 06ab73b)
import functools
import io
import json
import os
import tempfile

import PyPDF2
import requests
import streamlit as st
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Load environment variables (via python-dotenv), then read the Groq API
# token from the process environment; it is None when the variable is unset.
load_dotenv()
GROQ_API_TOKEN = os.environ.get("GROQ_API_TOKEN")
# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        file: An uploaded-file object exposing ``getvalue()`` that returns
            the raw PDF bytes.

    Returns:
        The text of all pages joined together, or ``""`` when the PDF cannot
        be parsed (the error is reported with ``st.error``).
    """
    pdf_bytes = file.getvalue()
    try:
        # Parse straight from memory: no temporary file to create, reopen,
        # or leak if something fails half-way.
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        # ``extract_text()`` may yield nothing for image-only pages; treat
        # that as an empty page instead of failing the whole document.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:  # UI boundary: report any parser failure to the user
        st.error(f"Error processing PDF: {str(e)}")
        return ""
# Function to extract text from TXT
def extract_text_from_txt(file):
    """Return the contents of an uploaded text file as a ``str``.

    The bytes are decoded as UTF-8. A leading byte-order mark is dropped, and
    byte sequences that are not valid UTF-8 are replaced with U+FFFD rather
    than raising ``UnicodeDecodeError``, so a file saved in another encoding
    degrades gracefully instead of crashing the app.

    Args:
        file: An uploaded-file object exposing ``getvalue()`` that returns
            the raw file bytes.

    Returns:
        The decoded text (``""`` for an empty file).
    """
    return file.getvalue().decode("utf-8-sig", errors="replace")
# Function to query GROQ API
def query_groq(prompt, context, temperature, max_tokens, *, timeout=60):
    """Ask the Groq chat-completions endpoint to answer a question from context.

    Args:
        prompt: The user's question.
        context: Text the model is instructed to base its answer on.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Maximum number of tokens forwarded to the API.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The assistant's reply text, or ``None`` when the API token is missing,
        the request fails, or the response does not have the expected shape.
        In each failure case an error is shown with ``st.error``.
    """
    if not GROQ_API_TOKEN:
        # Fail fast instead of sending "Bearer None" to the API.
        st.error("Error querying GROQ API: GROQ_API_TOKEN is not set.")
        return None

    headers = {
        "Authorization": f"Bearer {GROQ_API_TOKEN}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "mixtral-8x7b-32768",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant. Answer questions based only on the provided context."},
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {prompt}"},
        ],
        "temperature": temperature,
        "max_tokens": max_tokens,
    }

    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"]
    except requests.exceptions.RequestException as e:
        st.error(f"Error querying GROQ API: {str(e)}")
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # The body was not JSON, or lacked choices[0].message.content.
        st.error(f"Error querying GROQ API: unexpected response format ({e!r})")
    return None
# Function to create vector store
@functools.lru_cache(maxsize=2)
def _load_embeddings(model_name):
    """Build the embedding model for ``model_name`` once and reuse it."""
    return HuggingFaceEmbeddings(model_name=model_name)


def create_vector_store(
    text,
    *,
    chunk_size=1000,
    chunk_overlap=200,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
):
    """Split ``text`` into overlapping chunks and index them in FAISS.

    Args:
        text: The full document text to index.
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the
            text splitter.
        model_name: Name of the embedding model to use.

    Returns:
        A FAISS vector store built from the chunks.

    Raises:
        ValueError: If splitting ``text`` produces no chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = splitter.split_text(text)
    if not chunks:
        # Fail with a clear message rather than handing FAISS nothing to
        # index (presumably an opaque failure inside the library).
        raise ValueError("Cannot build a vector store from empty text.")
    # The embedding model is cached so repeated calls do not rebuild it.
    return FAISS.from_texts(chunks, _load_embeddings(model_name))
# Streamlit UI: page-level setup.

# Stylesheet for a fixed-height chat box, injected below as raw HTML.
CHAT_CONTAINER_CSS = """
<style>
.chat-container {
height: 600px;
display: flex;
flex-direction: column;
border: 1px solid #ccc;
border-radius: 5px;
}
.chat-messages {
flex: 1;
overflow-y: auto;
padding: 10px;
}
.chat-input {
border-top: 1px solid #ccc;
padding: 10px;
}
</style>
"""

# Page config stays the first Streamlit call of the script.
st.set_page_config(layout="wide")
st.markdown(CHAT_CONTAINER_CSS, unsafe_allow_html=True)
st.title("Enhanced Document Query System")
# Create two columns for the split-screen layout
left_column, right_column = st.columns(2)

# Left column: document upload, model parameters, and index building
with left_column:
    st.header("Document Upload")
    uploaded_file = st.file_uploader("Choose a file", type=["pdf", "txt"])
    doc_type = st.selectbox("Select document type", ["PDF", "TXT"])

    # Model parameters (read by the chat column when querying the model)
    st.subheader("Model Parameters")
    temperature = st.slider("Temperature", 0.0, 1.0, 0.5, 0.1)
    max_tokens = st.slider("Max Tokens", 100, 2000, 1000, 100)

    if uploaded_file is not None:
        # Trust the file's extension over the selectbox so that a stale
        # selection cannot send a .txt file to the PDF parser (or vice versa).
        extension = os.path.splitext(uploaded_file.name)[1].lower()
        effective_type = {".pdf": "PDF", ".txt": "TXT"}.get(extension, doc_type)
        if effective_type != doc_type:
            st.info(f"Treating '{uploaded_file.name}' as {effective_type} based on its file extension.")

        # Streamlit reruns this script on every interaction; only extract and
        # embed the document again when a different file has been uploaded.
        file_key = (uploaded_file.name, uploaded_file.size, effective_type)
        already_indexed = (
            "vector_store" in st.session_state
            and st.session_state.get("indexed_file_key") == file_key
        )

        if already_indexed:
            st.success("File uploaded and processed successfully!")
        else:
            # Extract text based on document type
            if effective_type == "PDF":
                doc_text = extract_text_from_pdf(uploaded_file)
            else:
                doc_text = extract_text_from_txt(uploaded_file)

            if doc_text and doc_text.strip():
                # Create vector store
                st.session_state.vector_store = create_vector_store(doc_text)
                st.session_state.indexed_file_key = file_key
                st.success("File uploaded and processed successfully!")
            else:
                # Drop any index left over from a previous document so the
                # chat cannot answer from the wrong file.
                for stale_key in ("vector_store", "indexed_file_key"):
                    if stale_key in st.session_state:
                        del st.session_state[stale_key]
                st.error("Failed to extract text from the document. Please try again.")

    # Clear chat history button
    if st.button("Clear Chat History"):
        st.session_state.messages = []
        st.rerun()
# Right column: chat interface
with right_column:
    st.header("Chat Interface")

    # Chat history lives in session state so it survives script reruns.
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Container that holds every chat message, old and new.
    chat_container = st.container()
    with chat_container:
        # NOTE(review): the stylesheet injected by this script defines no
        # ".scrollable-chat" rule, and these two markdown calls are separate
        # elements, so they probably do not wrap the messages — confirm.
        st.markdown('<div class="scrollable-chat">', unsafe_allow_html=True)
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                st.markdown(message["content"])
        st.markdown('</div>', unsafe_allow_html=True)

    # User query input
    user_query = st.chat_input("Enter your question about the document:")

    if user_query and "vector_store" in st.session_state:
        # Add user message to chat history and show it immediately.
        st.session_state.messages.append({"role": "user", "content": user_query})
        with chat_container, st.chat_message("user"):
            st.markdown(user_query)

        # Retrieve the chunks most similar to the question as context.
        relevant_docs = st.session_state.vector_store.similarity_search(user_query, k=3)
        context = "\n".join(doc.page_content for doc in relevant_docs)

        # Query GROQ API
        response = query_groq(user_query, context, temperature, max_tokens)
        if response:
            # Add assistant message to chat history; render it in the same
            # container as the rest of the conversation.
            st.session_state.messages.append({"role": "assistant", "content": response})
            with chat_container, st.chat_message("assistant"):
                st.markdown(response)
    elif user_query:
        st.warning("Please upload and process a document first.")