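# pdf_chatbot.py
#
# A small Streamlit RAG app: upload a PDF, split it into chunks, embed each
# chunk with Amazon Bedrock (Titan Embeddings), store the vectors in Qdrant
# Cloud, then answer questions by retrieving the most relevant chunks and
# passing them to Claude 3 Haiku as context.
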
import streamlit as st
import boto3
import json
from qdrant_client import QdrantClient
from qdrant_client.http import models
import PyPDF2
import io
import uuid

# Simple function to connect to AWS Bedrock
def connect_to_bedrock():
    client = boto3.client('bedrock-runtime', region_name='us-east-1')
    return client

# Simple function to connect to QDrant Cloud
def connect_to_qdrant(api_key, url):
    client = QdrantClient(url=url, api_key=api_key)
    return client

# Extract text from PDF file
def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for pages with no extractable text
        text += (page.extract_text() or "") + "\n"
    return text

# Split text into smaller chunks (simple way)
def split_text_into_chunks(text, chunk_size=1000):
    words = text.split()
    chunks = []
    current_chunk = []
    current_size = 0
    for word in words:
        current_chunk.append(word)
        current_size += len(word) + 1  # +1 for space
        if current_size >= chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_size = 0
    if current_chunk:  # Add last chunk if not empty
        chunks.append(" ".join(current_chunk))
    return chunks
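
# Illustrative note (not called anywhere): with the default chunk_size of
# 1000 characters, a 10,000-character PDF yields roughly 10 chunks, and
# splitting on whitespace keeps each chunk on word boundaries.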

# Get embeddings (vector numbers) from AI
def get_embeddings(bedrock_client, text):
    body = json.dumps({
        "inputText": text
    })
    response = bedrock_client.invoke_model(
        modelId="amazon.titan-embed-text-v1",
        body=body
    )
    result = json.loads(response['body'].read())
    return result['embedding']
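
# Note: amazon.titan-embed-text-v1 returns a 1536-dimensional vector, which is
# why the Qdrant collection below is created with size=1536. If you swap the
# embedding model, the collection size must change to match.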

# Store PDF chunks in QDrant vector database
def store_pdf_in_qdrant(qdrant_client, bedrock_client, pdf_chunks, collection_name):
    # Create collection if it doesn't exist
    try:
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
        )
    except Exception:
        pass  # Collection might already exist
    # Store each chunk
    points = []
    for i, chunk in enumerate(pdf_chunks):
        # Get vector representation of text
        embedding = get_embeddings(bedrock_client, chunk)
        # Create a point for QDrant
        point = models.PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={"text": chunk, "chunk_id": i}
        )
        points.append(point)
    # Upload to QDrant
    qdrant_client.upsert(
        collection_name=collection_name,
        points=points
    )
    return len(points)

# Search for relevant text in QDrant
def search_in_qdrant(qdrant_client, bedrock_client, question, collection_name, top_k=3):
    # Get vector for question
    question_embedding = get_embeddings(bedrock_client, question)
    # Search in QDrant
    results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=question_embedding,
        limit=top_k
    )
    # Extract relevant text
    relevant_texts = []
    for result in results:
        relevant_texts.append(result.payload["text"])
    return relevant_texts
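
# Note: QdrantClient.search() works here, though newer qdrant-client releases
# steer toward query_points(); either way the payload of each returned point
# carries the original chunk text stored above.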

# Ask AI to answer question based on PDF content
def ask_ai_with_context(bedrock_client, question, relevant_texts):
    context = "\n\n".join(relevant_texts)
    prompt = f"""
Based on the following information from a PDF document, please answer the question.

PDF Content:
{context}

Question: {question}

Please provide a clear and helpful answer based only on the information provided above.
If the answer is not in the provided content, please say so.
"""
    body = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 500,
        "messages": [{"role": "user", "content": prompt}]
    })
    response = bedrock_client.invoke_model(
        modelId="anthropic.claude-3-haiku-20240307-v1:0",
        body=body
    )
    result = json.loads(response['body'].read())
    return result['content'][0]['text']
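
# Minimal sketch of how these helpers compose outside Streamlit, assuming AWS
# credentials are configured; "doc.pdf" and the Qdrant URL/key are placeholders:
#   bedrock = connect_to_bedrock()
#   qdrant = connect_to_qdrant(api_key="...", url="https://your-cluster.qdrant.io")
#   chunks = split_text_into_chunks(extract_text_from_pdf(open("doc.pdf", "rb")))
#   store_pdf_in_qdrant(qdrant, bedrock, chunks, "pdf_documents")
#   texts = search_in_qdrant(qdrant, bedrock, "What is this PDF about?", "pdf_documents")
#   print(ask_ai_with_context(bedrock, "What is this PDF about?", texts))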

# Main app
def main():
    st.title("RAG_2 PDF Chatbot")
    st.write("Upload a PDF and ask questions about it!")

    # Sidebar for settings
    with st.sidebar:
        st.subheader("Setup")
        st.write("You need these to use the app:")

        # QDrant settings
        st.write("**QDrant Cloud Settings:**")
        qdrant_url = st.text_input("QDrant URL", placeholder="https://your-cluster.qdrant.io")
        qdrant_api_key = st.text_input("QDrant API Key", type="password")

        st.write("**Collection Name:**")
        collection_name = st.text_input("Collection Name", value="pdf_documents")

        st.markdown("---")
        st.markdown("""
**How to get QDrant settings:**
1. Go to qdrant.io
2. Create a free account
3. Create a cluster
4. Copy the URL and API key
""")

    # Main content
    tab1, tab2 = st.tabs(["Upload PDF", "Chat with PDF"])

    with tab1:
        st.subheader("Upload Your PDF")
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

        if uploaded_file and qdrant_url and qdrant_api_key:
            if st.button("Process PDF"):
                try:
                    with st.spinner("Processing your PDF..."):
                        # Connect to services
                        bedrock_client = connect_to_bedrock()
                        qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

                        # Extract text from PDF
                        st.write("Extracting text from PDF...")
                        pdf_text = extract_text_from_pdf(uploaded_file)

                        # Split into chunks
                        st.write("Breaking text into smaller pieces...")
                        chunks = split_text_into_chunks(pdf_text)

                        # Store in QDrant
                        st.write("Storing in vector database...")
                        num_chunks = store_pdf_in_qdrant(qdrant_client, bedrock_client, chunks, collection_name)

                        st.success(f"PDF processed successfully! Stored {num_chunks} text chunks.")
                        st.balloons()
                except Exception as e:
                    st.error(f"Error processing PDF: {str(e)}")
        elif uploaded_file:
            st.warning("Please enter QDrant settings in the sidebar first!")

    with tab2:
        st.subheader("Ask Questions About Your PDF")

        if qdrant_url and qdrant_api_key:
            question = st.text_input("What would you like to know about your PDF?")

            if question:
                if st.button("Get Answer"):
                    try:
                        with st.spinner("Searching for answer..."):
                            # Connect to services
                            bedrock_client = connect_to_bedrock()
                            qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

                            # Search for relevant content
                            st.write("Searching relevant content...")
                            relevant_texts = search_in_qdrant(qdrant_client, bedrock_client, question, collection_name)

                            # Get AI answer
                            st.write("Generating answer...")
                            answer = ask_ai_with_context(bedrock_client, question, relevant_texts)

                            # Show answer
                            st.subheader("Answer:")
                            st.write(answer)

                            # Show sources (optional)
                            with st.expander("Source content used"):
                                for i, text in enumerate(relevant_texts, 1):
                                    st.write(f"**Source {i}:**")
                                    st.write(text[:200] + "..." if len(text) > 200 else text)
                                    st.write("---")
                    except Exception as e:
                        st.error(f"Error: {str(e)}")
        else:
            st.warning("Please enter QDrant settings in the sidebar first!")

# Quick setup guide
def show_setup_guide():
    with st.expander("Quick Setup Guide"):
        st.markdown("""
**Step 1: Install Required Libraries**
```bash
pip install streamlit boto3 qdrant-client PyPDF2
```

**Step 2: Set up AWS**
- Create an AWS account
- Run `aws configure` and enter your keys

**Step 3: Set up QDrant Cloud**
- Go to qdrant.io
- Create a free account
- Create a cluster
- Copy the URL and API key into the sidebar

**Step 4: Run the App**
```bash
streamlit run pdf_chatbot.py
```
""")

# Run the app
if __name__ == "__main__":
    show_setup_guide()
    main()