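# pdf_chatbot.py
#
# A small Streamlit RAG app: upload a PDF, split it into chunks, embed each
# chunk with Amazon Bedrock (Titan Embeddings), store the vectors in Qdrant
# Cloud, then answer questions by retrieving the most relevant chunks and
# passing them to Claude 3 Haiku as context.
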
import streamlit as st
import boto3
import json
from qdrant_client import QdrantClient
from qdrant_client.http import models
import PyPDF2
import io
import uuid

# Simple function to connect to AWS Bedrock
def connect_to_bedrock():
    client = boto3.client('bedrock-runtime', region_name='us-east-1')
    return client

# Simple function to connect to QDrant Cloud
def connect_to_qdrant(api_key, url):
    client = QdrantClient(url=url, api_key=api_key)
    return client

# Extract text from PDF file
def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += (page.extract_text() or "") + "\n"
    return text

# Split text into smaller chunks (simple way)
def split_text_into_chunks(text, chunk_size=1000):
    words = text.split()
    chunks = []
    current_chunk = []
    current_size = 0
    for word in words:
        current_chunk.append(word)
        current_size += len(word) + 1  # +1 for space
        if current_size >= chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_size = 0
    if current_chunk:  # Add last chunk if not empty
        chunks.append(" ".join(current_chunk))
    return chunks
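
# Illustrative note (not called anywhere): with the default chunk_size of
# 1000 characters, a 10,000-character PDF yields roughly 10 chunks, and
# splitting on whitespace keeps each chunk on word boundaries.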

# Get embeddings (vector numbers) from AI
def get_embeddings(bedrock_client, text):
    body = json.dumps({
        "inputText": text
    })
    response = bedrock_client.invoke_model(
        modelId="amazon.titan-embed-text-v1",
        body=body
    )
    result = json.loads(response['body'].read())
    return result['embedding']
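
# Note: amazon.titan-embed-text-v1 returns a 1536-dimensional vector, which is
# why the Qdrant collection below is created with size=1536. If you swap the
# embedding model, the collection size must change to match.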

# Store PDF chunks in QDrant vector database
def store_pdf_in_qdrant(qdrant_client, bedrock_client, pdf_chunks, collection_name):
    # Create collection if it doesn't exist
    try:
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
        )
    except Exception:
        pass  # Collection might already exist
    # Store each chunk
    points = []
    for i, chunk in enumerate(pdf_chunks):
        # Get vector representation of text
        embedding = get_embeddings(bedrock_client, chunk)
        # Create a point for QDrant
        point = models.PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={"text": chunk, "chunk_id": i}
        )
        points.append(point)
    # Upload to QDrant
    qdrant_client.upsert(
        collection_name=collection_name,
        points=points
    )
    return len(points)

# Search for relevant text in QDrant
def search_in_qdrant(qdrant_client, bedrock_client, question, collection_name, top_k=3):
    # Get vector for question
    question_embedding = get_embeddings(bedrock_client, question)
    # Search in QDrant
    results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=question_embedding,
        limit=top_k
    )
    # Extract relevant text
    relevant_texts = []
    for result in results:
        relevant_texts.append(result.payload["text"])
    return relevant_texts
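
# Note: QdrantClient.search() works here, though newer qdrant-client releases
# steer toward query_points(); either way the payload of each returned point
# carries the original chunk text stored above.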

# Ask AI to answer question based on PDF content
def ask_ai_with_context(bedrock_client, question, relevant_texts):
    context = "\n\n".join(relevant_texts)
    prompt = f"""
Based on the following information from a PDF document, please answer the question.

PDF Content:
{context}

Question: {question}

Please provide a clear and helpful answer based only on the information provided above.
If the answer is not in the provided content, please say so.
"""
    body = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 500,
        "messages": [{"role": "user", "content": prompt}]
    })
    response = bedrock_client.invoke_model(
        modelId="anthropic.claude-3-haiku-20240307-v1:0",
        body=body
    )
    result = json.loads(response['body'].read())
    return result['content'][0]['text']
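
# Minimal sketch of how these helpers compose outside Streamlit, assuming AWS
# credentials are configured; "doc.pdf" and the Qdrant URL/key are placeholders:
#   bedrock = connect_to_bedrock()
#   qdrant = connect_to_qdrant(api_key="...", url="https://your-cluster.qdrant.io")
#   chunks = split_text_into_chunks(extract_text_from_pdf(open("doc.pdf", "rb")))
#   store_pdf_in_qdrant(qdrant, bedrock, chunks, "pdf_documents")
#   texts = search_in_qdrant(qdrant, bedrock, "What is this PDF about?", "pdf_documents")
#   print(ask_ai_with_context(bedrock, "What is this PDF about?", texts))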

# Main app
def main():
    st.title("RAG_2 PDF Chatbot")
    st.write("Upload a PDF and ask questions about it!")

    # Sidebar for settings
    with st.sidebar:
        st.subheader("Setup")
        st.write("You need these to use the app:")

        # QDrant settings
        st.write("**QDrant Cloud Settings:**")
        qdrant_url = st.text_input("QDrant URL", placeholder="https://your-cluster.qdrant.io")
        qdrant_api_key = st.text_input("QDrant API Key", type="password")

        st.write("**Collection Name:**")
        collection_name = st.text_input("Collection Name", value="pdf_documents")

        st.markdown("---")
        st.markdown("""
**How to get QDrant settings:**
1. Go to qdrant.io
2. Create a free account
3. Create a cluster
4. Copy the URL and API key
""")

    # Main content
    tab1, tab2 = st.tabs(["Upload PDF", "Chat with PDF"])

    with tab1:
        st.subheader("Upload Your PDF")
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

        if uploaded_file and qdrant_url and qdrant_api_key:
            if st.button("Process PDF"):
                try:
                    with st.spinner("Processing your PDF..."):
                        # Connect to services
                        bedrock_client = connect_to_bedrock()
                        qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

                        # Extract text from PDF
                        st.write("Extracting text from PDF...")
                        pdf_text = extract_text_from_pdf(uploaded_file)

                        # Split into chunks
                        st.write("Breaking text into smaller pieces...")
                        chunks = split_text_into_chunks(pdf_text)

                        # Store in QDrant
                        st.write("Storing in vector database...")
                        num_chunks = store_pdf_in_qdrant(qdrant_client, bedrock_client, chunks, collection_name)

                        st.success(f"PDF processed successfully! Stored {num_chunks} text chunks.")
                        st.balloons()
                except Exception as e:
                    st.error(f"Error processing PDF: {str(e)}")
        elif uploaded_file:
            st.warning("Please enter QDrant settings in the sidebar first!")

    with tab2:
        st.subheader("Ask Questions About Your PDF")

        if qdrant_url and qdrant_api_key:
            question = st.text_input("What would you like to know about your PDF?")

            if question:
                if st.button("Get Answer"):
                    try:
                        with st.spinner("Searching for answer..."):
                            # Connect to services
                            bedrock_client = connect_to_bedrock()
                            qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

                            # Search for relevant content
                            st.write("Searching relevant content...")
                            relevant_texts = search_in_qdrant(qdrant_client, bedrock_client, question, collection_name)

                            # Get AI answer
                            st.write("Generating answer...")
                            answer = ask_ai_with_context(bedrock_client, question, relevant_texts)

                            # Show answer
                            st.subheader("Answer:")
                            st.write(answer)

                            # Show sources (optional)
                            with st.expander("Source content used"):
                                for i, text in enumerate(relevant_texts, 1):
                                    st.write(f"**Source {i}:**")
                                    st.write(text[:200] + "..." if len(text) > 200 else text)
                                    st.write("---")
                    except Exception as e:
                        st.error(f"Error: {str(e)}")
        else:
            st.warning("Please enter QDrant settings in the sidebar first!")

# Quick setup guide
def show_setup_guide():
    with st.expander("Quick Setup Guide"):
        st.markdown("""
**Step 1: Install Required Libraries**
```bash
pip install streamlit boto3 qdrant-client PyPDF2
```

**Step 2: Set up AWS**
- Create an AWS account
- Run `aws configure` and enter your keys

**Step 3: Set up QDrant Cloud**
- Go to qdrant.io
- Create a free account
- Create a cluster
- Copy the URL and API key into the sidebar

**Step 4: Run the App**
```bash
streamlit run pdf_chatbot.py
```
""")

# Run the app
if __name__ == "__main__":
    show_setup_guide()
    main()