# Streamlit RAG demo: PDF question-answering over AWS Bedrock.
# (Removed non-code scraper residue that preceded the source.)
# --- Imports: stdlib, then third-party, grouped per PEP 8 ---
import os
import tempfile

import boto3
import streamlit as st
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import BedrockChat
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import BedrockEmbeddings
from langchain_community.vectorstores import Chroma

# Load AWS credentials from a local .env file when present (no-op otherwise).
load_dotenv()

# Bedrock runtime client. The region is overridable via the standard
# AWS_REGION environment variable; the fallback preserves the original
# hard-coded default so existing deployments are unaffected.
bedrock_runtime = boto3.client(
    "bedrock-runtime",
    region_name=os.getenv("AWS_REGION", "us-east-1"),
)

# Streamlit requires set_page_config() before any other page element.
st.set_page_config(page_title="PDF chatbot", layout="wide")
st.title("RAG Demo - PDF Q&A")
st.markdown("""
1. **Upload Your Documents**: You can upload multiple PDF files for processing.
2. **Ask a Question**: Then ask any question based on the documents' content.
""")

# On-disk location of the persistent Chroma vector store.
CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
def main():
    """Render the Q&A page.

    Sidebar: upload one or more PDFs and index them into the persistent
    Chroma store. Main pane: answer free-text questions with a
    retrieval-augmented Claude chain over that store.
    """
    st.header("Ask a question")

    # Titan embeddings are used both at indexing time and at query time,
    # so the same instance backs the vector store for both paths.
    embeddings = BedrockEmbeddings(
        client=bedrock_runtime,
        model_id="amazon.titan-embed-text-v1"
    )
    vectorstore = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=embeddings
    )

    # Sidebar: upload & process PDFs.
    with st.sidebar:
        st.title("Menu:")
        uploaded_files = st.file_uploader(
            "Upload PDF files and click Submit",
            accept_multiple_files=True,
            key="pdf_uploader"
        )
        if st.button("Submit & Process", key="process_button") and uploaded_files:
            with st.spinner("Processing..."):
                for uploaded_file in uploaded_files:
                    try:
                        _ingest_pdf(uploaded_file, vectorstore)
                    except Exception as e:
                        # Best-effort batch: one bad PDF must not abort the rest.
                        st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                        continue
            st.success("Vector store updated with uploaded documents.")

    # Main QA interface.
    user_question = st.text_input("Ask a Question from the PDF Files", key="user_question")
    if user_question:
        retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 5})
        llm = BedrockChat(
            client=bedrock_runtime,
            model_id="anthropic.claude-v2",  # or v2:1
            model_kwargs={"temperature": 0.0}
        )
        chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
        with st.spinner("Generating answer..."):
            answer = chain.invoke({"query": user_question})
        st.write("**Reply:**", answer["result"])


def _ingest_pdf(uploaded_file, vectorstore):
    """Split one uploaded PDF into overlapping chunks and add them to the store.

    Raises whatever PyPDFLoader / the vector store raise; the caller decides
    how to surface the error.
    """
    # PyPDFLoader needs a real filesystem path, so spool the upload to a
    # temporary file first.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_path = tmp_file.name
    try:
        pages = PyPDFLoader(tmp_path).load()
    finally:
        # Always remove the temp file — the original only unlinked on the
        # success path, leaking the file whenever load() raised.
        os.unlink(tmp_path)

    # enumerate() replaces the original pages.index(page), which was O(n^2)
    # and returns the wrong index when two pages have identical content.
    for page_number, page in enumerate(pages, start=1):
        page.metadata["page_number"] = page_number

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", " ", ""]
    )
    vectorstore.add_documents(splitter.split_documents(pages))
    vectorstore.persist()


if __name__ == "__main__":
    main()