Spaces:

DarshanaD
/

rag_re_ranking

Sleeping

App Files Files Community

DarshanaD committed on May 27

Commit

933dbc2

1 Parent(s): 3faa6ea

Initial commit

Browse files

Files changed (3) hide show

README.md +2 -2
app.py +364 -52
requirements.txt +10 -1

README.md CHANGED Viewed

@@ -3,8 +3,8 @@ title: Rag Re Ranking
 emoji: 💬
 colorFrom: yellow
 colorTo: purple
-sdk: gradio
-sdk_version: 5.0.1
 app_file: app.py
 pinned: false
 license: apache-2.0

 emoji: 💬
 colorFrom: yellow
 colorTo: purple
+sdk: streamlit
+sdk_version: 1.35.0
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py CHANGED Viewed

@@ -1,64 +1,376 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

+import streamlit as st
+import boto3
+import json
+import chromadb
+from datasets import load_dataset
+import uuid
+import time
+# Simple function to connect to AWS Bedrock
+def connect_to_bedrock():
+    client = boto3.client('bedrock-runtime', region_name='us-east-1')
+    return client
+# Simple function to load Wikipedia documents
+def load_wikipedia_docs(num_docs=100):
+    st.write(f"📚 Loading {num_docs} Wikipedia documents...")
+    # Load Wikipedia dataset from Hugging Face
+    dataset = load_dataset("Cohere/wikipedia-22-12-simple-embeddings", split="train")
+    # Take only the first num_docs documents
+    documents = []
+    for i in range(min(num_docs, len(dataset))):
+        doc = dataset[i]
+        documents.append({
+            'text': doc['text'],
+            'title': doc.get('title', f'Document {i+1}'),
+            'id': str(i)
+        })
+    return documents
+# Simple function to split text into chunks
+def split_into_chunks(documents, chunk_size=500):
+    st.write("✂️ Splitting documents into 500-character chunks...")
+    chunks = []
+    chunk_id = 0
+    for doc in documents:
+        text = doc['text']
+        title = doc['title']
+        # Split text into chunks of 500 characters
+        for i in range(0, len(text), chunk_size):
+            chunk_text = text[i:i + chunk_size]
+            if len(chunk_text.strip()) > 50:  # Only keep meaningful chunks
+                chunks.append({
+                    'id': str(chunk_id),
+                    'text': chunk_text,
+                    'title': title,
+                    'doc_id': doc['id']
+                })
+                chunk_id += 1
+    return chunks
+# Get embeddings from Bedrock Titan model
+def get_embeddings(bedrock_client, text):
+    body = json.dumps({
+        "inputText": text
+    })
+    response = bedrock_client.invoke_model(
+        modelId="amazon.titan-embed-text-v1",
+        body=body
+    )
+    result = json.loads(response['body'].read())
+    return result['embedding']
+# Store chunks in ChromaDB
+def store_in_chromadb(bedrock_client, chunks):
+    st.write("💾 Storing chunks in ChromaDB with embeddings...")
+    # Create ChromaDB client
+    chroma_client = chromadb.Client()
+    # Create or get collection
+    try:
+        collection = chroma_client.get_collection("wikipedia_chunks")
+        chroma_client.delete_collection("wikipedia_chunks")
+    except:
+        pass
+    collection = chroma_client.create_collection("wikipedia_chunks")
+    # Prepare data for ChromaDB
+    ids = []
+    texts = []
+    metadatas = []
+    embeddings = []
+    progress_bar = st.progress(0)
+    for i, chunk in enumerate(chunks):
+        # Get embedding for each chunk
+        embedding = get_embeddings(bedrock_client, chunk['text'])
+        ids.append(chunk['id'])
+        texts.append(chunk['text'])
+        metadatas.append({
+            'title': chunk['title'],
+            'doc_id': chunk['doc_id']
+        })
+        embeddings.append(embedding)
+        # Update progress
+        progress_bar.progress((i + 1) / len(chunks))
+        # Add to ChromaDB in batches of 100
+        if len(ids) == 100 or i == len(chunks) - 1:
+            collection.add(
+                ids=ids,
+                documents=texts,
+                metadatas=metadatas,
+                embeddings=embeddings
+            )
+            ids, texts, metadatas, embeddings = [], [], [], []
+    return collection
+# Simple retrieval without re-ranking
+def simple_retrieval(collection, bedrock_client, query, top_k=10):
+    # Get query embedding
+    query_embedding = get_embeddings(bedrock_client, query)
+    # Search in ChromaDB
+    results = collection.query(
+        query_embeddings=[query_embedding],
+        n_results=top_k
+    )
+    # Format results
+    retrieved_docs = []
+    for i in range(len(results['documents'][0])):
+        retrieved_docs.append({
+            'text': results['documents'][0][i],
+            'title': results['metadatas'][0][i]['title'],
+            'distance': results['distances'][0][i]
+        })
+    return retrieved_docs
+# Re-ranking using Claude 3 Haiku
+def rerank_with_claude(bedrock_client, query, documents, top_k=5):
+    # Create prompt for re-ranking
+    docs_text = ""
+    for i, doc in enumerate(documents):
+        docs_text += f"[{i+1}] {doc['text'][:200]}...\n\n"
+    prompt = f"""
+    Given the query: "{query}"
+    Please rank the following documents by relevance to the query.
+    Return only the numbers (1, 2, 3, etc.) of the most relevant documents in order, separated by commas.
+    Return exactly {top_k} numbers.
+    Documents:
+    {docs_text}
+    Most relevant document numbers (in order):
+    """
+    body = json.dumps({
+        "anthropic_version": "bedrock-2023-05-31",
+        "max_tokens": 100,
+        "messages": [{"role": "user", "content": prompt}]
+    })
+    response = bedrock_client.invoke_model(
+        modelId="anthropic.claude-3-haiku-20240307-v1:0",
+        body=body
+    )
+    result = json.loads(response['body'].read())
+    ranking_text = result['content'][0]['text'].strip()
+    try:
+        # Parse the ranking
+        rankings = [int(x.strip()) - 1 for x in ranking_text.split(',')]  # Convert to 0-based index
+        # Reorder documents based on ranking
+        reranked_docs = []
+        for rank in rankings[:top_k]:
+            if 0 <= rank < len(documents):
+                reranked_docs.append(documents[rank])
+        return reranked_docs
+    except:
+        # If parsing fails, return original order
+        return documents[:top_k]
+# Generate answer using retrieved documents
+def generate_answer(bedrock_client, query, documents):
+    # Combine documents into context
+    context = "\n\n".join([f"Source: {doc['title']}\n{doc['text']}" for doc in documents])
+    prompt = f"""
+    Based on the following information, please answer the question.
+    Question: {query}
+    Information:
+    {context}
+    Please provide a clear and comprehensive answer based on the information above.
+    """
+    body = json.dumps({
+        "anthropic_version": "bedrock-2023-05-31",
+        "max_tokens": 500,
+        "messages": [{"role": "user", "content": prompt}]
+    })
+    response = bedrock_client.invoke_model(
+        modelId="anthropic.claude-3-haiku-20240307-v1:0",
+        body=body
+    )
+    result = json.loads(response['body'].read())
+    return result['content'][0]['text']
+# Main app
+def main():
+    st.title("🔍 Wikipedia Retrieval with Re-ranking")
+    st.write("Compare search results with and without re-ranking!")
+    # Initialize session state
+    if 'collection' not in st.session_state:
+        st.session_state.collection = None
+    if 'setup_done' not in st.session_state:
+        st.session_state.setup_done = False
+    # Setup section
+    if not st.session_state.setup_done:
+        st.subheader("🛠️ Setup")
+        if st.button("🚀 Load Wikipedia Data and Setup ChromaDB"):
+            try:
+                with st.spinner("Setting up... This may take a few minutes..."):
+                    # Connect to Bedrock
+                    bedrock_client = connect_to_bedrock()
+                    # Load Wikipedia documents
+                    documents = load_wikipedia_docs(100)
+                    st.success(f"✅ Loaded {len(documents)} documents")
+                    # Split into chunks
+                    chunks = split_into_chunks(documents, 500)
+                    st.success(f"✅ Created {len(chunks)} chunks")
+                    # Store in ChromaDB
+                    collection = store_in_chromadb(bedrock_client, chunks)
+                    st.session_state.collection = collection
+                    st.session_state.setup_done = True
+                    st.success("🎉 Setup complete! You can now test queries below.")
+                    st.balloons()
+            except Exception as e:
+                st.error(f"❌ Setup failed: {str(e)}")
+    else:
+        st.success("✅ Setup completed! ChromaDB is ready with Wikipedia data.")
+        # Query testing section
+        st.subheader("🔍 Test Queries")
+        # Predefined queries
+        sample_queries = [
+            "What are the main causes of climate change?",
+            "How does quantum computing work?",
+            "What were the social impacts of the industrial revolution?"
+        ]
+        # Query selection
+        query_option = st.radio("Choose a query:",
+                               ["Custom Query"] + sample_queries)
+        if query_option == "Custom Query":
+            query = st.text_input("Enter your custom query:")
+        else:
+            query = query_option
+            st.write(f"Selected query: **{query}**")
+        if query:
+            if st.button("🔍 Compare Retrieval Methods"):
+                try:
+                    bedrock_client = connect_to_bedrock()
+                    st.write("---")
+                    # Method 1: Simple Retrieval
+                    st.subheader("📋 Method 1: Simple Retrieval (Baseline)")
+                    with st.spinner("Performing simple retrieval..."):
+                        simple_results = simple_retrieval(st.session_state.collection, bedrock_client, query, 10)
+                        simple_top5 = simple_results[:5]
+                        st.write("**Top 5 Results:**")
+                        for i, doc in enumerate(simple_top5, 1):
+                            with st.expander(f"{i}. {doc['title']} (Distance: {doc['distance']:.3f})"):
+                                st.write(doc['text'][:300] + "...")
+                        # Generate answer with simple retrieval
+                        simple_answer = generate_answer(bedrock_client, query, simple_top5)
+                        st.write("**Answer using Simple Retrieval:**")
+                        st.info(simple_answer)
+                    st.write("---")
+                    # Method 2: Retrieval with Re-ranking
+                    st.subheader("🎯 Method 2: Retrieval with Re-ranking")
+                    with st.spinner("Performing retrieval with re-ranking..."):
+                        # First get more results
+                        initial_results = simple_retrieval(st.session_state.collection, bedrock_client, query, 10)
+                        # Then re-rank them
+                        reranked_results = rerank_with_claude(bedrock_client, query, initial_results, 5)
+                        st.write("**Top 5 Re-ranked Results:**")
+                        for i, doc in enumerate(reranked_results, 1):
+                            with st.expander(f"{i}. {doc['title']} (Re-ranked)"):
+                                st.write(doc['text'][:300] + "...")
+                        # Generate answer with re-ranked results
+                        reranked_answer = generate_answer(bedrock_client, query, reranked_results)
+                        st.write("**Answer using Re-ranked Retrieval:**")
+                        st.success(reranked_answer)
+                    st.write("---")
+                    st.subheader("📊 Comparison Summary")
+                    st.write("**Simple Retrieval:** Uses only vector similarity to find relevant documents.")
+                    st.write("**Re-ranked Retrieval:** Uses Claude 3.5 to intelligently reorder results for better relevance.")
+                except Exception as e:
+                    st.error(f"❌ Error during retrieval: {str(e)}")
+        # Reset button
+        if st.button("🔄 Reset Setup"):
+            st.session_state.collection = None
+            st.session_state.setup_done = False
+            st.rerun()
+# Installation guide
+def show_installation_guide():
+    with st.expander("📖 Installation Guide"):
+        st.markdown("""
+        **Step 1: Install Required Libraries**
+        ```bash
+        pip install streamlit boto3 chromadb datasets
+        ```
+        **Step 2: Set up AWS**
+        ```bash
+        aws configure
+        ```
+        Enter your AWS access keys when prompted.
+        **Step 3: Run the App**
+        ```bash
+        streamlit run reranking_app.py
+        ```
+        **What this app does:**
+        1. Loads 100 Wikipedia documents
+        2. Splits them into 500-character chunks
+        3. Creates embeddings using Bedrock Titan
+        4. Stores in local ChromaDB
+        5. Compares simple vs re-ranked retrieval
+        """)
+# Run the app
 if __name__ == "__main__":
+    # Render the collapsible installation guide first, then the main page.
+    show_installation_guide()
+    main()

requirements.txt CHANGED Viewed

	@@ -1 +1,10 @@
1	- huggingface_hub==0.25.2

+huggingface_hub==0.25.2
+qdrant_client
+streamlit
+boto3
+PyPDF2
+chromadb
+datasets
+streamlit
+boto3