Spaces:

PrajwalW
/

rag_assignment_2

Sleeping

App Files Files Community

PrajwalW committed on May 27

Commit

13e2a13

verified ·

1 Parent(s): a6ae532

Update app.py

Browse files

Files changed (1) hide show

app.py +435 -61

app.py CHANGED Viewed

@@ -1,64 +1,438 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

+import streamlit as st
+import boto3
+import json
+from qdrant_client import QdrantClient
+from qdrant_client.http import models
+import PyPDF2
+import io
+import uuid
+# Simple function to connect to AWS Bedrock
+def connect_to_bedrock(region_name='us-east-1'):
+    """Create a boto3 client for the Bedrock runtime service.
+
+    Args:
+        region_name: AWS region to connect to. Defaults to 'us-east-1',
+            the region this function previously hard-coded.
+
+    Returns:
+        The client object returned by ``boto3.client('bedrock-runtime', ...)``.
+    """
+    return boto3.client('bedrock-runtime', region_name=region_name)
+# Simple function to connect to QDrant Cloud
+def connect_to_qdrant(api_key, url):
+    """Build a QdrantClient for the given cluster URL and API key.
+
+    Note the argument order: ``api_key`` comes first, ``url`` second.
+    """
+    return QdrantClient(api_key=api_key, url=url)
+# Extract text from PDF file
+def extract_text_from_pdf(pdf_file):
+    """Return the text of every page in a PDF, one newline after each page.
+
+    Args:
+        pdf_file: Anything ``PyPDF2.PdfReader`` accepts (here, the uploaded
+            file object from the Streamlit file uploader).
+
+    Returns:
+        A single string: each page's extracted text followed by "\n".
+        A page with no extractable text contributes only the newline.
+    """
+    reader = PyPDF2.PdfReader(pdf_file)
+    # `or ""` protects the concatenation in case extract_text() yields None
+    # for a page (e.g. image-only pages); join avoids repeated string `+=`.
+    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
+# Split text into smaller chunks (simple way)
+def split_text_into_chunks(text, chunk_size=1000):
+    """Group whitespace-separated words into chunks of roughly ``chunk_size`` characters.
+
+    Words are accumulated until their combined length (each word counted
+    with one extra character for the joining space) reaches ``chunk_size``;
+    the group is then emitted as one space-joined string. A chunk can
+    therefore exceed ``chunk_size`` by up to one word. Leftover words form
+    a final, shorter chunk.
+
+    Returns:
+        A list of chunk strings; empty when ``text`` has no words.
+    """
+    def _pieces():
+        bucket, used = [], 0
+        for token in text.split():
+            bucket.append(token)
+            used += len(token) + 1  # +1 accounts for the joining space
+            if used >= chunk_size:
+                yield " ".join(bucket)
+                bucket, used = [], 0
+        if bucket:  # trailing words that never reached chunk_size
+            yield " ".join(bucket)
+    return list(_pieces())
+# Get embeddings (vector numbers) from AI
+def get_embeddings(bedrock_client, text):
+    """Ask the Bedrock embedding model for the vector representation of ``text``.
+
+    Sends ``{"inputText": text}`` to the "amazon.titan-embed-text-v1" model
+    and returns the ``embedding`` field of the decoded JSON response.
+    """
+    response = bedrock_client.invoke_model(
+        modelId="amazon.titan-embed-text-v1",
+        body=json.dumps({"inputText": text}),
+    )
+    # The response body is a stream; read it once and decode the JSON payload.
+    decoded = json.loads(response['body'].read())
+    return decoded['embedding']
+# Store PDF chunks in QDrant vector database
+def store_pdf_in_qdrant(qdrant_client, bedrock_client, pdf_chunks, collection_name):
+    """Embed each text chunk and upsert it into a Qdrant collection.
+
+    Args:
+        qdrant_client: Client used to create the collection and upsert points.
+        bedrock_client: Client passed to ``get_embeddings`` for each chunk.
+        pdf_chunks: Sequence of chunk strings to store.
+        collection_name: Target collection; created if it does not exist.
+
+    Returns:
+        The number of points stored (0 when ``pdf_chunks`` is empty).
+    """
+    # Nothing to embed: skip all service calls rather than send an empty upsert.
+    if not pdf_chunks:
+        return 0
+    # Create the collection only when it is missing. An explicit check lets
+    # real failures (bad credentials, network errors) propagate to the caller
+    # instead of being swallowed by a bare `except`.
+    existing_names = {c.name for c in qdrant_client.get_collections().collections}
+    if collection_name not in existing_names:
+        qdrant_client.create_collection(
+            collection_name=collection_name,
+            # size must equal the length of the vectors get_embeddings returns
+            vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE),
+        )
+    # One point per chunk: random UUID id, embedding vector, and the chunk
+    # text plus its position in the payload.
+    points = [
+        models.PointStruct(
+            id=str(uuid.uuid4()),
+            vector=get_embeddings(bedrock_client, chunk),
+            payload={"text": chunk, "chunk_id": i},
+        )
+        for i, chunk in enumerate(pdf_chunks)
+    ]
+    qdrant_client.upsert(
+        collection_name=collection_name,
+        points=points,
+    )
+    return len(points)
+# Search for relevant text in QDrant
+def search_in_qdrant(qdrant_client, bedrock_client, question, collection_name, top_k=3):
+    """Return the stored chunk texts most similar to ``question``.
+
+    The question is embedded with ``get_embeddings`` and used as the query
+    vector for a search limited to ``top_k`` results; the ``"text"`` payload
+    field of each result is returned, in the order the search yields them.
+    """
+    hits = qdrant_client.search(
+        collection_name=collection_name,
+        query_vector=get_embeddings(bedrock_client, question),
+        limit=top_k,
+    )
+    return [hit.payload["text"] for hit in hits]
+# Ask AI to answer question based on PDF content
+def ask_ai_with_context(bedrock_client, question, relevant_texts,
+                        max_tokens=500,
+                        model_id="anthropic.claude-3-haiku-20240307-v1:0"):
+    """Ask the chat model to answer ``question`` using only the retrieved text.
+
+    Args:
+        bedrock_client: Client whose ``invoke_model`` is called.
+        question: The user's question, inserted into the prompt verbatim.
+        relevant_texts: Chunk strings; joined with blank lines to form the
+            "PDF Content" section of the prompt.
+        max_tokens: Value sent as ``max_tokens`` in the request body
+            (defaults to the previously hard-coded 500).
+        model_id: Bedrock model identifier to invoke (defaults to the
+            previously hard-coded model).
+
+    Returns:
+        The ``text`` of the first content item in the model's JSON response.
+    """
+    context = "\n\n".join(relevant_texts)
+    prompt = f"""
+    Based on the following information from a PDF document, please answer the question.
+    PDF Content:
+    {context}
+    Question: {question}
+    Please provide a clear and helpful answer based only on the information provided above.
+    If the answer is not in the provided content, please say so.
+    """
+    body = json.dumps({
+        "anthropic_version": "bedrock-2023-05-31",
+        "max_tokens": max_tokens,
+        "messages": [{"role": "user", "content": prompt}]
+    })
+    response = bedrock_client.invoke_model(
+        modelId=model_id,
+        body=body
+    )
+    # The response body is a stream; read it once and decode the JSON payload.
+    result = json.loads(response['body'].read())
+    return result['content'][0]['text']
+# Main app
+def main():
+    """Render the Streamlit page: title, settings sidebar, upload tab and chat tab."""
+    st.title("📄 Simple PDF Chatbot")
+    st.write("Upload a PDF and ask questions about it!")
+    qdrant_url, qdrant_api_key, collection_name = _render_sidebar()
+    tab1, tab2 = st.tabs(["📤 Upload PDF", "💬 Chat with PDF"])
+    with tab1:
+        _render_upload_tab(qdrant_url, qdrant_api_key, collection_name)
+    with tab2:
+        _render_chat_tab(qdrant_url, qdrant_api_key, collection_name)
+def _render_sidebar():
+    """Draw the settings sidebar and return (qdrant_url, qdrant_api_key, collection_name)."""
+    with st.sidebar:
+        st.subheader("🔧 Setup")
+        st.write("You need these to use the app:")
+        st.write("**QDrant Cloud Settings:**")
+        qdrant_url = st.text_input("QDrant URL", placeholder="https://your-cluster.qdrant.io")
+        qdrant_api_key = st.text_input("QDrant API Key", type="password")
+        st.write("**Collection Name:**")
+        collection_name = st.text_input("Collection Name", value="pdf_documents")
+        st.markdown("---")
+        st.markdown("""
+        **How to get QDrant settings:**
+        1. Go to qdrant.io
+        2. Create free account
+        3. Create a cluster
+        4. Copy URL and API key
+        """)
+    return qdrant_url, qdrant_api_key, collection_name
+def _render_upload_tab(qdrant_url, qdrant_api_key, collection_name):
+    """Draw the upload tab: extract, chunk, embed and store the chosen PDF."""
+    st.subheader("Upload Your PDF")
+    uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
+    if not uploaded_file:
+        return
+    if not (qdrant_url and qdrant_api_key):
+        st.warning("⚠️ Please enter QDrant settings in the sidebar first!")
+        return
+    if not st.button("🚀 Process PDF"):
+        return
+    try:
+        with st.spinner("Processing your PDF..."):
+            bedrock_client = connect_to_bedrock()
+            qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)
+            st.write("📖 Extracting text from PDF...")
+            pdf_text = extract_text_from_pdf(uploaded_file)
+            st.write("✂️ Breaking text into smaller pieces...")
+            chunks = split_text_into_chunks(pdf_text)
+            # A PDF with no text layer (e.g. scanned images) yields no chunks;
+            # stop here instead of attempting to store nothing.
+            if not chunks:
+                st.warning("⚠️ No extractable text was found in this PDF.")
+                return
+            st.write("💾 Storing in vector database...")
+            num_chunks = store_pdf_in_qdrant(qdrant_client, bedrock_client, chunks, collection_name)
+            st.success(f"✅ PDF processed successfully! Stored {num_chunks} text chunks.")
+            st.balloons()
+    except Exception as e:
+        # Top-level UI boundary: show the failure to the user rather than crash the page.
+        st.error(f"❌ Error processing PDF: {str(e)}")
+def _render_chat_tab(qdrant_url, qdrant_api_key, collection_name):
+    """Draw the chat tab: retrieve relevant chunks and show the model's answer."""
+    st.subheader("Ask Questions About Your PDF")
+    if not (qdrant_url and qdrant_api_key):
+        st.warning("⚠️ Please enter QDrant settings in the sidebar first!")
+        return
+    question = st.text_input("💭 What would you like to know about your PDF?")
+    if not question:
+        return
+    if not st.button("🔍 Get Answer"):
+        return
+    try:
+        with st.spinner("Searching for answer..."):
+            bedrock_client = connect_to_bedrock()
+            qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)
+            st.write("🔍 Searching relevant content...")
+            relevant_texts = search_in_qdrant(qdrant_client, bedrock_client, question, collection_name)
+            # With no retrieved text there is nothing to ground an answer on,
+            # so skip the model call and tell the user.
+            if not relevant_texts:
+                st.warning("⚠️ No matching content was found. Please upload and process a PDF first.")
+                return
+            st.write("🤖 Generating answer...")
+            answer = ask_ai_with_context(bedrock_client, question, relevant_texts)
+            st.subheader("📝 Answer:")
+            st.write(answer)
+            with st.expander("📚 Source content used"):
+                for i, text in enumerate(relevant_texts, 1):
+                    st.write(f"**Source {i}:**")
+                    # Preview only: long sources are cut to 200 characters.
+                    st.write(text[:200] + "..." if len(text) > 200 else text)
+                    st.write("---")
+    except Exception as e:
+        # Top-level UI boundary: show the failure to the user rather than crash the page.
+        st.error(f"❌ Error: {str(e)}")
+# Quick setup guide
+def show_setup_guide():
+    """Render a collapsible "Quick Setup Guide" with install and run instructions."""
+    with st.expander("📖 Quick Setup Guide"):
+        # The run command names app.py, the file this code lives in.
+        st.markdown("""
+        **Step 1: Install Required Libraries**
+        ```bash
+        pip install streamlit boto3 qdrant-client PyPDF2
+        ```
+        **Step 2: Set up AWS**
+        - Create AWS account
+        - Run `aws configure` and enter your keys
+        **Step 3: Set up QDrant Cloud**
+        - Go to qdrant.io
+        - Create free account
+        - Create a cluster
+        - Copy URL and API key to sidebar
+        **Step 4: Run the App**
+        ```bash
+        streamlit run app.py
+        ```
+        """)
+# Run the app
 if __name__ == "__main__":
+    show_setup_guide()  # called first, so the setup expander is drawn before main()'s UI
+    main()