"""Streamlit PDF chatbot using AWS Bedrock for embeddings/answers and Qdrant for retrieval."""

import io
import json
import uuid

import boto3
import PyPDF2
import streamlit as st
from qdrant_client import QdrantClient
from qdrant_client.http import models

# Bedrock model used to turn text into vectors, and the vector size the
# Qdrant collection is created with for those vectors.
EMBEDDING_MODEL_ID = "amazon.titan-embed-text-v1"
EMBEDDING_DIMENSION = 1536

# Bedrock model used to generate the final answer.
CHAT_MODEL_ID = "anthropic.claude-3-haiku-20240307-v1:0"

# Markdown shown in the sidebar. Kept flush-left so Markdown does not treat
# the lines as an indented code block.
_QDRANT_HELP_MD = """
**How to get QDrant settings:**
1. Go to qdrant.io
2. Create free account
3. Create a cluster
4. Copy URL and API key
"""

# Markdown shown in the "Quick Setup Guide" expander.
_SETUP_GUIDE_MD = """
**Step 1: Install Required Libraries**
```bash
pip install streamlit boto3 qdrant-client PyPDF2
```

**Step 2: Set up AWS**
- Create AWS account
- Run `aws configure` and enter your keys

**Step 3: Set up QDrant Cloud**
- Go to qdrant.io
- Create free account
- Create a cluster
- Copy URL and API key to sidebar

**Step 4: Run the App**
```bash
streamlit run pdf_chatbot.py
```
"""


def connect_to_bedrock():
    """Return a boto3 ``bedrock-runtime`` client for the ``us-east-1`` region."""
    return boto3.client('bedrock-runtime', region_name='us-east-1')


def connect_to_qdrant(api_key, url):
    """Return a ``QdrantClient`` for the given cluster URL and API key."""
    return QdrantClient(url=url, api_key=api_key)


def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The text of all pages, each page followed by a newline.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # ``or ""`` is a defensive guard so a page that yields no text cannot
    # break the concatenation.
    return "".join(
        (page.extract_text() or "") + "\n" for page in pdf_reader.pages
    )


def split_text_into_chunks(text, chunk_size=1000):
    """Split text into word-aligned chunks of roughly ``chunk_size`` characters.

    Words are accumulated until the running length (each word plus one
    separating space) reaches ``chunk_size``; the chunk is then closed, so a
    chunk may slightly exceed ``chunk_size`` but never splits a word.

    Args:
        text: Text to split. Any whitespace is treated as a word separator.
        chunk_size: Approximate chunk length in characters.

    Returns:
        A list of chunk strings; empty if ``text`` contains no words.
    """
    chunks = []
    current_chunk = []
    current_size = 0

    for word in text.split():
        current_chunk.append(word)
        current_size += len(word) + 1  # +1 for the joining space

        if current_size >= chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_size = 0

    if current_chunk:  # Keep the trailing partial chunk
        chunks.append(" ".join(current_chunk))

    return chunks


def get_embeddings(bedrock_client, text):
    """Return the embedding vector for ``text`` from the Bedrock embedding model.

    Args:
        bedrock_client: Client exposing ``invoke_model`` (see
            ``connect_to_bedrock``).
        text: Text to embed.

    Returns:
        The value of the ``embedding`` field of the model's JSON response.
    """
    body = json.dumps({"inputText": text})
    response = bedrock_client.invoke_model(
        modelId=EMBEDDING_MODEL_ID,
        body=body,
    )
    result = json.loads(response['body'].read())
    return result['embedding']


def _ensure_collection(qdrant_client, collection_name):
    """Create the collection, tolerating only the "already exists" case."""
    try:
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=EMBEDDING_DIMENSION,
                distance=models.Distance.COSINE,
            ),
        )
    except Exception as create_error:
        # Creation fails when the collection already exists, which is fine.
        # Confirm that is really the reason; otherwise (bad credentials,
        # network failure, ...) surface the original error instead of
        # silently continuing.
        try:
            qdrant_client.get_collection(collection_name)
        except Exception:
            raise create_error


def store_pdf_in_qdrant(qdrant_client, bedrock_client, pdf_chunks, collection_name):
    """Embed each chunk and upsert it into a Qdrant collection.

    The collection is created (cosine distance, ``EMBEDDING_DIMENSION``
    dimensions) if it does not exist yet. Each stored point gets a random
    UUID and a payload with the chunk text and its index.

    Args:
        qdrant_client: Qdrant client (see ``connect_to_qdrant``).
        bedrock_client: Bedrock client used to compute embeddings.
        pdf_chunks: Iterable of text chunks to store.
        collection_name: Name of the target collection.

    Returns:
        The number of points stored (0 if there were no chunks).
    """
    _ensure_collection(qdrant_client, collection_name)

    points = [
        models.PointStruct(
            id=str(uuid.uuid4()),
            vector=get_embeddings(bedrock_client, chunk),
            payload={"text": chunk, "chunk_id": i},
        )
        for i, chunk in enumerate(pdf_chunks)
    ]

    # Nothing to upload: avoid sending an empty upsert request.
    if not points:
        return 0

    qdrant_client.upsert(
        collection_name=collection_name,
        points=points,
    )

    return len(points)


def search_in_qdrant(qdrant_client, bedrock_client, question, collection_name, top_k=3):
    """Return the stored chunk texts most similar to ``question``.

    Args:
        qdrant_client: Qdrant client (see ``connect_to_qdrant``).
        bedrock_client: Bedrock client used to embed the question.
        question: The user's question.
        collection_name: Collection to search.
        top_k: Maximum number of results to return.

    Returns:
        A list with the ``"text"`` payload of each search hit.
    """
    question_embedding = get_embeddings(bedrock_client, question)

    results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=question_embedding,
        limit=top_k,
    )

    return [result.payload["text"] for result in results]


def ask_ai_with_context(bedrock_client, question, relevant_texts):
    """Ask the Bedrock chat model to answer ``question`` from the given texts.

    Args:
        bedrock_client: Client exposing ``invoke_model``.
        question: The user's question.
        relevant_texts: Chunk texts to present to the model as context.

    Returns:
        The text of the first content item in the model's response.
    """
    context = "\n\n".join(relevant_texts)

    prompt = (
        "\n"
        "Based on the following information from a PDF document, "
        "please answer the question.\n"
        "\n"
        "PDF Content:\n"
        f"{context}\n"
        "\n"
        f"Question: {question}\n"
        "\n"
        "Please provide a clear and helpful answer based only on the "
        "information provided above.\n"
        "If the answer is not in the provided content, please say so.\n"
    )

    body = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 500,
        "messages": [{"role": "user", "content": prompt}],
    })

    response = bedrock_client.invoke_model(
        modelId=CHAT_MODEL_ID,
        body=body,
    )

    result = json.loads(response['body'].read())
    return result['content'][0]['text']


def _render_sidebar():
    """Draw the settings sidebar and return (qdrant_url, qdrant_api_key, collection_name)."""
    with st.sidebar:
        st.subheader("🔧 Setup")
        st.write("You need these to use the app:")

        st.write("**QDrant Cloud Settings:**")
        qdrant_url = st.text_input("QDrant URL", placeholder="https://your-cluster.qdrant.io")
        qdrant_api_key = st.text_input("QDrant API Key", type="password")

        st.write("**Collection Name:**")
        collection_name = st.text_input("Collection Name", value="pdf_documents")

        st.markdown("---")
        st.markdown(_QDRANT_HELP_MD)

    return qdrant_url, qdrant_api_key, collection_name


def _render_upload_tab(qdrant_url, qdrant_api_key, collection_name):
    """Draw the upload tab: extract, chunk, embed and store the uploaded PDF."""
    st.subheader("Upload Your PDF")

    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

    if not uploaded_file:
        return

    if not (qdrant_url and qdrant_api_key):
        st.warning("⚠️ Please enter QDrant settings in the sidebar first!")
        return

    if not st.button("🚀 Process PDF"):
        return

    try:
        with st.spinner("Processing your PDF..."):
            bedrock_client = connect_to_bedrock()
            qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

            st.write("📖 Extracting text from PDF...")
            pdf_text = extract_text_from_pdf(uploaded_file)

            st.write("✂️ Breaking text into smaller pieces...")
            chunks = split_text_into_chunks(pdf_text)

            if not chunks:
                # e.g. a scanned/image-only PDF: there is nothing to embed.
                st.warning("⚠️ No extractable text was found in this PDF.")
                return

            st.write("💾 Storing in vector database...")
            num_chunks = store_pdf_in_qdrant(
                qdrant_client, bedrock_client, chunks, collection_name
            )

            st.success(f"✅ PDF processed successfully! Stored {num_chunks} text chunks.")
            st.balloons()

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        st.error(f"❌ Error processing PDF: {str(e)}")


def _render_chat_tab(qdrant_url, qdrant_api_key, collection_name):
    """Draw the chat tab: retrieve relevant chunks and show the model's answer."""
    st.subheader("Ask Questions About Your PDF")

    if not (qdrant_url and qdrant_api_key):
        st.warning("⚠️ Please enter QDrant settings in the sidebar first!")
        return

    question = st.text_input("💭 What would you like to know about your PDF?")

    if not question:
        return

    if not st.button("🔍 Get Answer"):
        return

    try:
        with st.spinner("Searching for answer..."):
            bedrock_client = connect_to_bedrock()
            qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

            st.write("🔍 Searching relevant content...")
            relevant_texts = search_in_qdrant(
                qdrant_client, bedrock_client, question, collection_name
            )

            st.write("🤖 Generating answer...")
            answer = ask_ai_with_context(bedrock_client, question, relevant_texts)

            st.subheader("📝 Answer:")
            st.write(answer)

            with st.expander("📚 Source content used"):
                for i, text in enumerate(relevant_texts, 1):
                    st.write(f"**Source {i}:**")
                    # Show only a short preview of long sources.
                    preview = (text[:200] + "...") if len(text) > 200 else text
                    st.write(preview)
                    st.write("---")

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        st.error(f"❌ Error: {str(e)}")


def main():
    """Render the app: settings sidebar, PDF upload tab and chat tab."""
    st.title("📄 RAG_2 PDF Chatbot")
    st.write("Upload a PDF and ask questions about it!")

    qdrant_url, qdrant_api_key, collection_name = _render_sidebar()

    tab1, tab2 = st.tabs(["📤 Upload PDF", "💬 Chat with PDF"])

    with tab1:
        _render_upload_tab(qdrant_url, qdrant_api_key, collection_name)

    with tab2:
        _render_chat_tab(qdrant_url, qdrant_api_key, collection_name)


def show_setup_guide():
    """Render the collapsible quick setup guide."""
    with st.expander("📖 Quick Setup Guide"):
        st.markdown(_SETUP_GUIDE_MD)


# Run the app
if __name__ == "__main__":
    show_setup_guide()
    main()