# PrajwalW's picture
# Update app.py
# 649b115 verified
import streamlit as st
import boto3
import json
from qdrant_client import QdrantClient
from qdrant_client.http import models
import PyPDF2
import io
import uuid
# Simple function to connect to AWS Bedrock
def connect_to_bedrock():
    """Create a boto3 client for the ``bedrock-runtime`` service.

    The client is pinned to the ``us-east-1`` region; no credentials are
    passed explicitly here.
    """
    return boto3.client('bedrock-runtime', region_name='us-east-1')
# Simple function to connect to QDrant Cloud
def connect_to_qdrant(api_key, url):
    """Build a ``QdrantClient`` for the cluster at *url*, authenticated with *api_key*."""
    return QdrantClient(api_key=api_key, url=url)
# Extract text from PDF file
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in *pdf_file*, concatenated in page order.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts; the app passes the
            object returned by Streamlit's file uploader.

    Returns:
        A single string in which each page's text is followed by a newline.
        A page that yields no text contributes only the newline.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Join once instead of growing a string with += per page. The ``or ''``
    # keeps a page without extractable text from raising TypeError in case
    # extract_text() hands back None rather than an empty string.
    return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
# Split text into smaller chunks (simple way)
def split_text_into_chunks(text, chunk_size=1000):
    """Group the whitespace-separated words of *text* into chunks.

    Words are accumulated until their combined length (counting one extra
    character per word for the separating space) reaches *chunk_size*; the
    accumulated words are then emitted as one space-joined chunk. A chunk is
    closed only after the word that crosses the threshold has been added, so
    chunks may be longer than *chunk_size*. Any leftover words form a final
    chunk.

    Returns:
        A list of chunk strings (empty when *text* contains no words).
    """
    def _iter_chunks():
        buffer, used = [], 0
        for token in text.split():
            buffer.append(token)
            used += len(token) + 1  # the +1 accounts for the joining space
            if used >= chunk_size:
                yield " ".join(buffer)
                buffer, used = [], 0
        if buffer:  # trailing words that never reached the threshold
            yield " ".join(buffer)

    return list(_iter_chunks())
# Get embeddings (vector numbers) from AI
def get_embeddings(bedrock_client, text):
    """Embed *text* through the Bedrock runtime and return the vector.

    Sends ``{"inputText": text}`` as a JSON body to the
    ``amazon.titan-embed-text-v1`` model and returns the value stored under
    the ``embedding`` key of the JSON response.
    """
    request_payload = json.dumps({"inputText": text})
    raw_response = bedrock_client.invoke_model(
        body=request_payload,
        modelId="amazon.titan-embed-text-v1",
    )
    # The response body is a stream; read it fully before decoding the JSON.
    decoded = json.loads(raw_response['body'].read())
    return decoded['embedding']
# Store PDF chunks in QDrant vector database
def store_pdf_in_qdrant(qdrant_client, bedrock_client, pdf_chunks, collection_name):
    """Embed each PDF chunk and upsert the resulting points into Qdrant.

    Args:
        qdrant_client: Connected ``QdrantClient``.
        bedrock_client: Bedrock runtime client, forwarded to ``get_embeddings``.
        pdf_chunks: Iterable of text chunks to index.
        collection_name: Target collection; created when it does not exist.

    Returns:
        The number of points written, which is 0 when *pdf_chunks* is empty.

    Raises:
        Any exception raised by the Qdrant or Bedrock clients. Failures while
        checking for or creating the collection are no longer silenced.
    """
    # Check for the collection explicitly rather than wrapping
    # create_collection in a bare ``except: pass``, which also hid
    # authentication and connectivity errors.
    if not qdrant_client.collection_exists(collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            # Vector size must match the length of the vectors returned by
            # get_embeddings (hard-coded to 1536 for the Titan model used).
            vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
        )

    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=get_embeddings(bedrock_client, chunk),
            payload={"text": chunk, "chunk_id": index},
        )
        for index, chunk in enumerate(pdf_chunks)
    ]

    # A PDF without extractable text yields no chunks; skip the upsert instead
    # of sending an empty update request.
    if not points:
        return 0

    qdrant_client.upsert(collection_name=collection_name, points=points)
    return len(points)
# Search for relevant text in QDrant
def search_in_qdrant(qdrant_client, bedrock_client, question, collection_name, top_k=3):
    """Return the stored text of the *top_k* points closest to *question*.

    The question is embedded with ``get_embeddings`` and used as the query
    vector for a search in *collection_name*. The ``"text"`` payload field of
    each hit is returned, in the order the search produced them.
    """
    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=get_embeddings(bedrock_client, question),
        limit=top_k,
    )
    return [hit.payload["text"] for hit in hits]
# Ask AI to answer question based on PDF content
def ask_ai_with_context(bedrock_client, question, relevant_texts):
    """Ask the Claude 3 Haiku model on Bedrock to answer *question*.

    The *relevant_texts* are joined with blank lines and embedded in a prompt
    instructing the model to answer only from that content. The request is
    capped at 500 output tokens. Returns the ``text`` of the first content
    block in the model's JSON response.
    """
    context = "\n\n".join(relevant_texts)
    # Leading and trailing empty entries reproduce the newline that opens and
    # closes the prompt.
    prompt = "\n".join([
        "",
        "Based on the following information from a PDF document, please answer the question.",
        "PDF Content:",
        context,
        f"Question: {question}",
        "Please provide a clear and helpful answer based only on the information provided above.",
        "If the answer is not in the provided content, please say so.",
        "",
    ])

    request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 500,
        "messages": [{"role": "user", "content": prompt}],
    }
    reply = bedrock_client.invoke_model(
        modelId="anthropic.claude-3-haiku-20240307-v1:0",
        body=json.dumps(request),
    )
    parsed = json.loads(reply['body'].read())
    first_block = parsed['content'][0]
    return first_block['text']
# Main app
def _render_sidebar():
    """Draw the sidebar settings and return ``(qdrant_url, qdrant_api_key, collection_name)``."""
    with st.sidebar:
        st.subheader("🔧 Setup")
        st.write("You need these to use the app:")
        # QDrant settings
        st.write("**QDrant Cloud Settings:**")
        qdrant_url = st.text_input("QDrant URL", placeholder="https://your-cluster.qdrant.io")
        qdrant_api_key = st.text_input("QDrant API Key", type="password")
        st.write("**Collection Name:**")
        collection_name = st.text_input("Collection Name", value="pdf_documents")
        st.markdown("---")
        st.markdown("""
**How to get QDrant settings:**
1. Go to qdrant.io
2. Create free account
3. Create a cluster
4. Copy URL and API key
""")
    return qdrant_url, qdrant_api_key, collection_name


def _render_upload_tab(qdrant_url, qdrant_api_key, collection_name):
    """Draw the upload tab and index the uploaded PDF when the button is pressed."""
    st.subheader("Upload Your PDF")
    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
    if not uploaded_file:
        return
    if not (qdrant_url and qdrant_api_key):
        st.warning("⚠️ Please enter QDrant settings in the sidebar first!")
        return
    if not st.button("🚀 Process PDF"):
        return

    # UI boundary: report any failure to the user instead of crashing the app.
    try:
        with st.spinner("Processing your PDF..."):
            # Connect to services
            bedrock_client = connect_to_bedrock()
            qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)
            # Extract text from PDF
            st.write("📖 Extracting text from PDF...")
            pdf_text = extract_text_from_pdf(uploaded_file)
            # Split into chunks
            st.write("✂️ Breaking text into smaller pieces...")
            chunks = split_text_into_chunks(pdf_text)
            # Store in QDrant
            st.write("💾 Storing in vector database...")
            num_chunks = store_pdf_in_qdrant(qdrant_client, bedrock_client, chunks, collection_name)
            st.success(f"✅ PDF processed successfully! Stored {num_chunks} text chunks.")
            st.balloons()
    except Exception as e:
        st.error(f"❌ Error processing PDF: {e}")


def _render_chat_tab(qdrant_url, qdrant_api_key, collection_name):
    """Draw the chat tab and answer the typed question when the button is pressed."""
    st.subheader("Ask Questions About Your PDF")
    if not (qdrant_url and qdrant_api_key):
        st.warning("⚠️ Please enter QDrant settings in the sidebar first!")
        return

    question = st.text_input("💭 What would you like to know about your PDF?")
    # Short-circuit keeps the button hidden until a question has been typed.
    if not question or not st.button("🔍 Get Answer"):
        return

    # UI boundary: report any failure to the user instead of crashing the app.
    try:
        with st.spinner("Searching for answer..."):
            # Connect to services
            bedrock_client = connect_to_bedrock()
            qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)
            # Search for relevant content
            st.write("🔍 Searching relevant content...")
            relevant_texts = search_in_qdrant(qdrant_client, bedrock_client, question, collection_name)
            # Get AI answer
            st.write("🤖 Generating answer...")
            answer = ask_ai_with_context(bedrock_client, question, relevant_texts)
            # Show answer
            st.subheader("📝 Answer:")
            st.write(answer)
            # Show sources (optional)
            with st.expander("📚 Source content used"):
                for i, text in enumerate(relevant_texts, 1):
                    st.write(f"**Source {i}:**")
                    # Preview only the first 200 characters of long sources.
                    preview = (text[:200] + "...") if len(text) > 200 else text
                    st.write(preview)
                    st.write("---")
    except Exception as e:
        st.error(f"❌ Error: {e}")


def main():
    """Render the Streamlit app: title, settings sidebar, upload tab and chat tab.

    Both tabs need the Qdrant URL and API key from the sidebar; without them
    the tabs show a warning instead of their action button.
    """
    st.title("📄 RAG_2 PDF Chatbot")
    st.write("Upload a PDF and ask questions about it!")

    qdrant_url, qdrant_api_key, collection_name = _render_sidebar()

    upload_tab, chat_tab = st.tabs(["📤 Upload PDF", "💬 Chat with PDF"])
    with upload_tab:
        _render_upload_tab(qdrant_url, qdrant_api_key, collection_name)
    with chat_tab:
        _render_chat_tab(qdrant_url, qdrant_api_key, collection_name)
# Quick setup guide
def show_setup_guide():
    """Render a collapsible expander containing the four-step setup instructions."""
    with st.expander("📖 Quick Setup Guide"):
        st.markdown("""
**Step 1: Install Required Libraries**
```bash
pip install streamlit boto3 qdrant-client PyPDF2
```
**Step 2: Set up AWS**
- Create AWS account
- Run `aws configure` and enter your keys
**Step 3: Set up QDrant Cloud**
- Go to qdrant.io
- Create free account
- Create a cluster
- Copy URL and API key to sidebar
**Step 4: Run the App**
```bash
streamlit run pdf_chatbot.py
```
""")
# Run the app
if __name__ == "__main__":
    # The setup guide expander is drawn first, so it appears above the app.
    show_setup_guide()
    main()