# Spaces: PrajwalW / rag_assignment_2 (status: Sleeping)
# File size: 9,105 Bytes

import streamlit as st

import boto3

import json

from qdrant_client import QdrantClient

from qdrant_client.http import models

import PyPDF2

import io

import uuid

# Simple function to connect to AWS Bedrock

def connect_to_bedrock():
    """Return a boto3 ``bedrock-runtime`` client bound to region ``us-east-1``."""
    return boto3.client('bedrock-runtime', region_name='us-east-1')

# Simple function to connect to QDrant Cloud

def connect_to_qdrant(api_key, url):
    """Return a ``QdrantClient`` built from the given endpoint URL and API key.

    Args:
        api_key: API key forwarded to ``QdrantClient``.
        url: Endpoint URL forwarded to ``QdrantClient``.
    """
    return QdrantClient(url=url, api_key=api_key)

# Extract text from PDF file

def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file``, each page followed by a newline.

    Args:
        pdf_file: Object handed directly to ``PyPDF2.PdfReader``.

    Returns:
        One string with the pages' extracted text in page order.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Each page contributes its text plus a trailing newline separator.
    return "".join(page.extract_text() + "\n" for page in reader.pages)

# Split text into smaller chunks (simple way)

def split_text_into_chunks(text, chunk_size=1000):
    """Group the whitespace-separated words of ``text`` into chunks.

    Words are accumulated until their combined length (counting one extra
    character per word for the joining space) reaches ``chunk_size``; the
    accumulated words are then emitted as one space-joined chunk. A chunk can
    therefore exceed ``chunk_size`` by up to the length of its last word.

    Args:
        text: Input text; it is split on any whitespace.
        chunk_size: Length threshold that closes the current chunk.

    Returns:
        List of chunk strings in original word order (empty for blank input).
    """
    pieces = []
    pending_words = []
    pending_length = 0

    for token in text.split():
        pending_words.append(token)
        pending_length += len(token) + 1  # the +1 accounts for the joining space

        if pending_length < chunk_size:
            continue

        # Threshold reached: emit the chunk and start a fresh one.
        pieces.append(" ".join(pending_words))
        pending_words, pending_length = [], 0

    # Whatever is left over forms the final, shorter chunk.
    if pending_words:
        pieces.append(" ".join(pending_words))

    return pieces

# Get embeddings (vector numbers) from AI

def get_embeddings(bedrock_client, text):
    """Request an embedding for ``text`` from the Titan text-embedding model.

    Args:
        bedrock_client: Object exposing ``invoke_model(modelId=..., body=...)``
            whose response holds a readable ``'body'`` stream of JSON.
        text: Text sent as the ``inputText`` field of the request.

    Returns:
        The value stored under ``'embedding'`` in the decoded response.
    """
    payload = json.dumps({"inputText": text})

    reply = bedrock_client.invoke_model(
        modelId="amazon.titan-embed-text-v1",
        body=payload,
    )

    # The response body is a stream; read it once and decode the JSON document.
    decoded = json.loads(reply['body'].read())
    return decoded['embedding']

# Store PDF chunks in QDrant vector database

def store_pdf_in_qdrant(qdrant_client, bedrock_client, pdf_chunks, collection_name):
    """Embed every chunk and upsert the resulting points into a QDrant collection.

    The collection is created on demand. If creation fails, the function checks
    whether the collection is nevertheless present (the usual "already exists"
    case); only then is the failure ignored. Any other creation failure is
    re-raised instead of being silently swallowed.

    Args:
        qdrant_client: Client exposing ``create_collection``, ``get_collection``
            and ``upsert``.
        bedrock_client: Client passed through to ``get_embeddings``.
        pdf_chunks: Text chunks to embed and store.
        collection_name: Name of the target collection.

    Returns:
        Number of points uploaded; 0 when there is nothing to store.

    Raises:
        Exception: The original ``create_collection`` error when the collection
            could not be created and does not already exist.
    """
    # Nothing to embed or upload: skip every network call.
    if not pdf_chunks:
        return 0

    try:
        qdrant_client.create_collection(
            collection_name=collection_name,
            # The vector size must equal the length of the vectors returned by
            # get_embeddings for the points to be accepted.
            vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
        )
    except Exception as create_error:
        # Creation is best-effort because the collection may already exist.
        # Confirm that it does; otherwise surface the real failure.
        try:
            qdrant_client.get_collection(collection_name)
        except Exception:
            raise create_error from None

    points = []
    for index, chunk in enumerate(pdf_chunks):
        # Vector representation of the chunk text.
        embedding = get_embeddings(bedrock_client, chunk)
        points.append(
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload={"text": chunk, "chunk_id": index},
            )
        )

    # Covers iterables that were truthy but yielded no chunks.
    if not points:
        return 0

    qdrant_client.upsert(
        collection_name=collection_name,
        points=points,
    )

    return len(points)

# Search for relevant text in QDrant

def search_in_qdrant(qdrant_client, bedrock_client, question, collection_name, top_k=3):
    """Return the stored text of the ``top_k`` points closest to ``question``.

    Args:
        qdrant_client: Client exposing ``search``.
        bedrock_client: Client passed through to ``get_embeddings``.
        question: Query text to embed.
        collection_name: Collection to search.
        top_k: Maximum number of hits requested.

    Returns:
        List of the ``"text"`` payload values of the hits, in result order.
    """
    query_vector = get_embeddings(bedrock_client, question)

    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k,
    )

    return [hit.payload["text"] for hit in hits]

# Ask AI to answer question based on PDF content

def ask_ai_with_context(bedrock_client, question, relevant_texts):
    """Ask the Claude 3 Haiku model to answer ``question`` from the given passages.

    Args:
        bedrock_client: Object exposing ``invoke_model(modelId=..., body=...)``
            whose response holds a readable ``'body'`` stream of JSON.
        question: The user's question, embedded verbatim in the prompt.
        relevant_texts: Passages joined with blank lines to form the context.

    Returns:
        The ``'text'`` of the first item in the response's ``'content'`` list.
    """
    # Passages are separated by a blank line inside the prompt.
    context = "\n\n".join(relevant_texts)

    prompt = f"""

    Based on the following information from a PDF document, please answer the question.

    PDF Content:

    {context}

    Question: {question}

    Please provide a clear and helpful answer based only on the information provided above.

    If the answer is not in the provided content, please say so.

    """

    request = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 500,
        "messages": [{"role": "user", "content": prompt}],
    }

    reply = bedrock_client.invoke_model(
        modelId="anthropic.claude-3-haiku-20240307-v1:0",
        body=json.dumps(request),
    )

    decoded = json.loads(reply['body'].read())
    return decoded['content'][0]['text']

# Main app

def main():
    """Render the PDF chatbot page: sidebar settings, an upload tab and a chat tab.

    The sidebar collects the QDrant URL, API key and collection name. The
    upload tab extracts, chunks and stores an uploaded PDF; the chat tab
    retrieves matching chunks for a question and shows the model's answer
    together with previews of the chunks used.
    """
    st.title("📄 RAG_2 PDF Chatbot")
    st.write("Upload a PDF and ask questions about it!")

    # Sidebar: connection settings shared by both tabs.
    with st.sidebar:
        st.subheader("🔧 Setup")
        st.write("You need these to use the app:")

        st.write("**QDrant Cloud Settings:**")
        qdrant_url = st.text_input("QDrant URL", placeholder="https://your-cluster.qdrant.io")
        qdrant_api_key = st.text_input("QDrant API Key", type="password")

        st.write("**Collection Name:**")
        collection_name = st.text_input("Collection Name", value="pdf_documents")

        st.markdown("---")
        st.markdown("""

        **How to get QDrant settings:**

        1. Go to qdrant.io

        2. Create free account

        3. Create a cluster

        4. Copy URL and API key

        """)

    # Both tabs need a URL and an API key before they can talk to QDrant.
    has_qdrant_settings = bool(qdrant_url and qdrant_api_key)

    upload_tab, chat_tab = st.tabs(["📤 Upload PDF", "💬 Chat with PDF"])

    with upload_tab:
        st.subheader("Upload Your PDF")
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

        if uploaded_file:
            if not has_qdrant_settings:
                st.warning("⚠️ Please enter QDrant settings in the sidebar first!")
            elif st.button("🚀 Process PDF"):
                try:
                    with st.spinner("Processing your PDF..."):
                        bedrock_client = connect_to_bedrock()
                        qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

                        st.write("📖 Extracting text from PDF...")
                        pdf_text = extract_text_from_pdf(uploaded_file)

                        st.write("✂️ Breaking text into smaller pieces...")
                        chunks = split_text_into_chunks(pdf_text)

                        st.write("💾 Storing in vector database...")
                        stored_count = store_pdf_in_qdrant(
                            qdrant_client, bedrock_client, chunks, collection_name
                        )

                        st.success(f"✅ PDF processed successfully! Stored {stored_count} text chunks.")
                        st.balloons()
                except Exception as exc:
                    st.error(f"❌ Error processing PDF: {str(exc)}")

    with chat_tab:
        st.subheader("Ask Questions About Your PDF")

        if not has_qdrant_settings:
            st.warning("⚠️ Please enter QDrant settings in the sidebar first!")
        else:
            question = st.text_input("💭 What would you like to know about your PDF?")

            # The button is only rendered once a question has been typed.
            if question and st.button("🔍 Get Answer"):
                try:
                    with st.spinner("Searching for answer..."):
                        bedrock_client = connect_to_bedrock()
                        qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

                        st.write("🔍 Searching relevant content...")
                        relevant_texts = search_in_qdrant(
                            qdrant_client, bedrock_client, question, collection_name
                        )

                        st.write("🤖 Generating answer...")
                        answer = ask_ai_with_context(bedrock_client, question, relevant_texts)

                        st.subheader("📝 Answer:")
                        st.write(answer)

                        # Show a short preview of every chunk given to the model.
                        with st.expander("📚 Source content used"):
                            for source_number, source_text in enumerate(relevant_texts, 1):
                                st.write(f"**Source {source_number}:**")
                                if len(source_text) > 200:
                                    preview = source_text[:200] + "..."
                                else:
                                    preview = source_text
                                st.write(preview)
                                st.write("---")
                except Exception as exc:
                    st.error(f"❌ Error: {str(exc)}")

# Quick setup guide

def show_setup_guide():
    """Render a "Quick Setup Guide" expander containing the setup steps as markdown."""
    guide_markdown = """

        **Step 1: Install Required Libraries**

        ```bash

        pip install streamlit boto3 qdrant-client PyPDF2

        ```

        **Step 2: Set up AWS**

        - Create AWS account

        - Run `aws configure` and enter your keys

        **Step 3: Set up QDrant Cloud**

        - Go to qdrant.io

        - Create free account

        - Create a cluster

        - Copy URL and API key to sidebar

        **Step 4: Run the App**

        ```bash

        streamlit run pdf_chatbot.py

        ```

        """

    with st.expander("📖 Quick Setup Guide"):
        st.markdown(guide_markdown)

# Run the app

# Script entry point: only runs when this file is executed directly.
if __name__ == "__main__":

    # The setup guide is rendered before the main app UI.
    show_setup_guide()

    main()