"""Streamlit demo of RAG over a small Bollywood movie dataset.

Movies are embedded (via AWS Bedrock, or a local mock when Bedrock is not
available), stored in an in-memory ChromaDB collection, and queried both with
and without metadata filters so the two retrieval paths can be compared.
"""

import json
import logging
import random
import re
import time
from datetime import datetime

import boto3
import chromadb
import pandas as pd
import streamlit as st

logger = logging.getLogger(__name__)

# Length of the mock embedding vectors.
# NOTE(review): presumed to match the output size of amazon.titan-embed-text-v1
# so that mock and real vectors can share one collection - confirm.
EMBEDDING_DIM = 1536

# Sample Bollywood movies data (simplified for demo)
SAMPLE_MOVIES = [
    {"title": "Sholay", "year": 1975, "genre": "Action", "director": "Ramesh Sippy", "plot": "Two criminals are hired by a retired police officer to capture a bandit terrorizing a village."},
    {"title": "Dilwale Dulhania Le Jayenge", "year": 1995, "genre": "Romance", "director": "Aditya Chopra", "plot": "A young man and woman fall in love during a trip to Europe, but face family opposition."},
    {"title": "Lagaan", "year": 2001, "genre": "Drama", "director": "Ashutosh Gowariker", "plot": "Villagers accept a challenge from British officers to play cricket to avoid paying tax."},
    {"title": "3 Idiots", "year": 2009, "genre": "Comedy", "director": "Rajkumar Hirani", "plot": "Two friends search for their missing college friend and recall their engineering days."},
    {"title": "Dangal", "year": 2016, "genre": "Sports", "director": "Nitesh Tiwari", "plot": "A former wrestler trains his daughters to become world-class wrestlers."},
    {"title": "Anand", "year": 1971, "genre": "Drama", "director": "Hrishikesh Mukherjee", "plot": "A terminally ill man spreads joy and teaches the meaning of life to a doctor."},
    {"title": "Golmaal", "year": 1979, "genre": "Comedy", "director": "Hrishikesh Mukherjee", "plot": "A man creates chaos by lying about his identity to get a job."},
    {"title": "Chupke Chupke", "year": 1975, "genre": "Comedy", "director": "Hrishikesh Mukherjee", "plot": "A newlywed plays pranks on his wife's family by pretending to be someone else."},
    {"title": "Don", "year": 1978, "genre": "Action", "director": "Chandra Barot", "plot": "A police officer impersonates a crime boss to infiltrate his gang."},
    {"title": "Andaz Apna Apna", "year": 1994, "genre": "Comedy", "director": "Rajkumar Santoshi", "plot": "Two friends compete to marry a wealthy heiress but get caught up in a kidnapping plot."},
    {"title": "Mughal-E-Azam", "year": 1960, "genre": "Romance", "director": "K. Asif", "plot": "A Mughal prince falls in love with a court dancer, defying his father the emperor."},
    {"title": "Deewaar", "year": 1975, "genre": "Action", "director": "Yash Chopra", "plot": "Two brothers choose different paths in life - one becomes a police officer, the other a criminal."},
    {"title": "Queen", "year": 2013, "genre": "Comedy", "director": "Vikas Bahl", "plot": "A woman goes on her honeymoon alone after her wedding is called off."},
    {"title": "Zindagi Na Milegi Dobara", "year": 2011, "genre": "Adventure", "director": "Zoya Akhtar", "plot": "Three friends go on a bachelor trip to Spain and face their fears."},
    {"title": "Taare Zameen Par", "year": 2007, "genre": "Drama", "director": "Aamir Khan", "plot": "An art teacher helps a dyslexic child overcome his learning difficulties."},
    {"title": "Rang De Basanti", "year": 2006, "genre": "Drama", "director": "Rakeysh Omprakash Mehra", "plot": "College students making a documentary about freedom fighters become revolutionaries themselves."},
    {"title": "Gol Maal", "year": 1979, "genre": "Comedy", "director": "Hrishikesh Mukherjee", "plot": "A young man lies about having a mustache to keep his job with a strict boss."},
    {"title": "Namak Haraam", "year": 1973, "genre": "Drama", "director": "Hrishikesh Mukherjee", "plot": "A friendship is tested when one friend betrays the other for money and power."},
    {"title": "Kuch Kuch Hota Hai", "year": 1998, "genre": "Romance", "director": "Karan Johar", "plot": "A man's daughter tries to reunite him with his college sweetheart."},
    {"title": "My Name is Khan", "year": 2010, "genre": "Drama", "director": "Karan Johar", "plot": "A man with Asperger's syndrome embarks on a journey to meet the President of the United States."},
]

# Genre metadata value -> pattern that signals it in a lower-cased query.
# Order matters: the first matching genre wins. Variants such as "romantic"
# and "comedies" are included because plain substring matching on the genre
# name missed them.
_GENRE_PATTERNS = (
    ('action', re.compile(r'\baction')),
    ('comedy', re.compile(r'comed(?:y|ies)')),
    ('drama', re.compile(r'drama')),
    ('romance', re.compile(r'roman(?:ce|tic)')),
    ('sports', re.compile(r'\bsports?\b')),
    ('adventure', re.compile(r'adventur')),
)

_DECADES = ('1960s', '1970s', '1980s', '1990s', '2000s', '2010s')

_YEAR_PATTERN = re.compile(r'\b(19\d{2}|20\d{2})\b')

# Lower-cased director names taken from the dataset (first-seen order,
# de-duplicated) so every director stored in the metadata can be detected.
_KNOWN_DIRECTORS = tuple(
    dict.fromkeys(movie['director'].lower() for movie in SAMPLE_MOVIES)
)


# Simple function to connect to AWS Bedrock
def connect_to_bedrock():
    """Create a Bedrock runtime client for us-east-1.

    Returns:
        The boto3 client, or None (after showing a Streamlit error) when the
        client cannot be created. Callers treat None as "use mock responses".
    """
    try:
        return boto3.client('bedrock-runtime', region_name='us-east-1')
    except Exception:
        logger.warning("Could not create Bedrock client", exc_info=True)
        st.error("⚠️ AWS Bedrock not configured. Using mock responses for demo.")
        return None


def _mock_embedding(text):
    """Return a pseudo-random embedding that is stable for a given text.

    Seeding the generator with the text makes repeated calls for the same
    input produce the same vector, so demo results do not change between runs.
    """
    rng = random.Random(text)
    return [rng.random() for _ in range(EMBEDDING_DIM)]


# Get embeddings from Bedrock
def get_embeddings(bedrock_client, text):
    """Embed ``text`` with Bedrock, falling back to a mock embedding.

    Args:
        bedrock_client: Bedrock runtime client, or None to use the mock.
        text: The text to embed.

    Returns:
        A list of floats. When the client is None or the API call fails, the
        list is a deterministic mock vector of length ``EMBEDDING_DIM``.
    """
    if not bedrock_client:
        # Return dummy embedding for demo
        return _mock_embedding(text)

    try:
        response = bedrock_client.invoke_model(
            modelId="amazon.titan-embed-text-v1",
            body=json.dumps({"inputText": text}),
        )
        result = json.loads(response['body'].read())
        return result['embedding']
    except Exception:
        # Best-effort demo behaviour: keep the app usable if the API fails,
        # but leave a trace instead of swallowing the error silently.
        logger.warning("Bedrock embedding call failed; using mock embedding",
                       exc_info=True)
        return _mock_embedding(text)


# Create movie documents and store in ChromaDB
def setup_movie_database(bedrock_client):
    """Embed every sample movie and load it into a fresh Chroma collection.

    Args:
        bedrock_client: Bedrock runtime client, or None to use mock embeddings.

    Returns:
        The populated ``bollywood_movies`` collection.
    """
    st.write("🎬 Setting up Bollywood movies database...")

    # Create ChromaDB client
    chroma_client = chromadb.Client()

    # Create or recreate collection
    try:
        chroma_client.delete_collection("bollywood_movies")
    except Exception:
        # Nothing to delete on the first run; the collection is created below.
        logger.debug("No existing collection to delete", exc_info=True)

    collection = chroma_client.create_collection("bollywood_movies")

    # Prepare data for ChromaDB
    ids = []
    documents = []
    metadatas = []
    embeddings = []

    total = len(SAMPLE_MOVIES)
    progress_bar = st.progress(0)

    for i, movie in enumerate(SAMPLE_MOVIES):
        # Create document text
        doc_text = (
            f"Title: {movie['title']}\n"
            f"Year: {movie['year']}\n"
            f"Genre: {movie['genre']}\n"
            f"Director: {movie['director']}\n"
            f"Plot: {movie['plot']}"
        )

        ids.append(str(i))
        documents.append(doc_text)
        # Genre and director are stored lower-cased so that filters detected
        # from a lower-cased query compare equal to the stored values.
        metadatas.append({
            'title': movie['title'],
            'year': movie['year'],
            'genre': movie['genre'].lower(),
            'director': movie['director'].lower(),
            'decade': f"{(movie['year'] // 10) * 10}s",
        })
        embeddings.append(get_embeddings(bedrock_client, doc_text))

        progress_bar.progress((i + 1) / total)

    # Add to ChromaDB
    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings,
    )

    st.success(f"✅ Added {total} movies to database!")
    return collection


# Simple query filter detection
def detect_filters(query):
    """Derive metadata filters from a free-text query.

    Args:
        query: The user's question.

    Returns:
        A dict with any of the keys ``genre``, ``decade`` and ``director``;
        empty when nothing is recognised. An explicit four-digit year in the
        query takes precedence over a named decade.
    """
    query_lower = query.lower()
    filters = {}

    # Genre detection
    for genre, pattern in _GENRE_PATTERNS:
        if pattern.search(query_lower):
            filters['genre'] = genre
            break

    # Decade detection
    for decade in _DECADES:
        if decade in query_lower:
            filters['decade'] = decade
            break

    # Year detection ("1970s" does not match: no word boundary before the "s")
    year_match = _YEAR_PATTERN.search(query)
    if year_match:
        year = int(year_match.group(1))
        filters['decade'] = f"{(year // 10) * 10}s"

    # Director detection (simple substring match against dataset directors)
    for director in _KNOWN_DIRECTORS:
        if director in query_lower:
            filters['director'] = director
            break

    return filters


def _format_results(results):
    """Flatten the first query's Chroma results into a list of movie dicts."""
    return [
        {'document': document, 'metadata': metadata, 'distance': distance}
        for document, metadata, distance in zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0],
        )
    ]


def _build_where_clause(filters):
    """Translate detected filters into a Chroma ``where`` clause.

    Chroma expects a single top-level condition, so several field filters
    are combined under ``$and``. Returns None when there is nothing to filter.
    """
    conditions = [{key: value} for key, value in filters.items()]
    if not conditions:
        return None
    if len(conditions) == 1:
        return conditions[0]
    return {"$and": conditions}


# Retrieve without metadata filter
def retrieve_without_filter(collection, bedrock_client, query, top_k=5):
    """Run a plain similarity search.

    Args:
        collection: The Chroma collection to search.
        bedrock_client: Bedrock runtime client, or None for mock embeddings.
        query: The user's question.
        top_k: Maximum number of results to request.

    Returns:
        ``(movies, elapsed_seconds)``. The elapsed time covers both embedding
        the query and the collection search.
    """
    start_time = time.time()

    # Get query embedding
    query_embedding = get_embeddings(bedrock_client, query)

    # Search without filters
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
    )

    end_time = time.time()

    return _format_results(results), end_time - start_time


# Retrieve with metadata filter
def retrieve_with_filter(collection, bedrock_client, query, filters, top_k=5):
    """Run a similarity search restricted by metadata filters.

    Args:
        collection: The Chroma collection to search.
        bedrock_client: Bedrock runtime client, or None for mock embeddings.
        query: The user's question.
        filters: Mapping of metadata field to required value, as produced by
            ``detect_filters``.
        top_k: Maximum number of results to request.

    Returns:
        ``(movies, elapsed_seconds)``. If the filtered query raises, the
        search is retried without filters, so results may then be unfiltered.
    """
    start_time = time.time()

    # Get query embedding
    query_embedding = get_embeddings(bedrock_client, query)

    # Create where clause for filtering
    where_clause = _build_where_clause(filters)

    # Search with filters
    try:
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
            where=where_clause,
        )
    except Exception:
        # If filtering fails, fall back to no filter
        logger.warning("Filtered query failed for %r; retrying unfiltered",
                       where_clause, exc_info=True)
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
        )

    end_time = time.time()

    return _format_results(results), end_time - start_time


# Generate answer using Bedrock
def generate_answer(bedrock_client, query, movies):
    """Ask the Bedrock chat model to answer ``query`` from retrieved movies.

    Args:
        bedrock_client: Bedrock runtime client, or None to skip the API call.
        query: The user's question.
        movies: Retrieved movie dicts; each ``document`` is used as context.

    Returns:
        The model's answer text, or a canned message when no client is
        available or the API call fails.
    """
    if not bedrock_client:
        return "🎬 Based on the retrieved movies, here are some recommendations that match your query!"

    # Create context from movies
    context = "\n\n".join(movie['document'] for movie in movies)

    prompt = (
        "\nBased on the following Bollywood movies information, "
        "please answer the user's question.\n\n"
        f"Question: {query}\n\n"
        "Movies Information:\n"
        f"{context}\n\n"
        "Please provide a helpful and informative answer about the movies.\n"
    )

    try:
        body = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 400,
            "messages": [{"role": "user", "content": prompt}],
        })

        response = bedrock_client.invoke_model(
            modelId="anthropic.claude-3-haiku-20240307-v1:0",
            body=body,
        )

        result = json.loads(response['body'].read())
        return result['content'][0]['text']
    except Exception:
        logger.warning("Bedrock answer generation failed", exc_info=True)
        return "🎬 Based on the retrieved movies, here are some great recommendations that match your query!"
# Main app
def _render_movie_results(movies):
    """Show each retrieved movie as a numbered, collapsible expander."""
    for rank, movie in enumerate(movies, 1):
        meta = movie['metadata']
        with st.expander(f"{rank}. {meta['title']} ({meta['year']})"):
            st.write(f"**Genre:** {meta['genre'].title()}")
            st.write(f"**Director:** {meta['director'].title()}")
            st.write(f"**Distance:** {movie['distance']:.4f}")


def _render_answer(bedrock_client, query, movies):
    """Generate and display the final answer for the given movies."""
    st.subheader("🤖 AI Generated Answer")
    answer = generate_answer(bedrock_client, query, movies)
    st.success(answer)


def _run_search(query):
    """Run both retrieval methods for ``query`` and render the comparison."""
    bedrock_client = st.session_state.bedrock_client
    collection = st.session_state.collection

    # Detect filters
    filters = detect_filters(query)

    st.write("---")

    # Method 1: Without metadata filter
    st.subheader("📊 Method 1: Without Metadata Filter")
    movies_no_filter, time_no_filter = retrieve_without_filter(collection, bedrock_client, query)

    st.write(f"⏱️ **Time taken: {time_no_filter:.4f} seconds**")
    st.write("**Retrieved Movies:**")
    _render_movie_results(movies_no_filter)

    # Method 2: With metadata filter
    st.subheader("🎯 Method 2: With Metadata Filter")

    if not filters:
        st.write("**No specific filters detected** - using general retrieval")
        st.write(f"⏱️ **Time taken: {time_no_filter:.4f} seconds**")

        # Generate answer with no filter results
        _render_answer(bedrock_client, query, movies_no_filter)
        return

    st.write(f"**Detected Filters:** {filters}")
    movies_with_filter, time_with_filter = retrieve_with_filter(collection, bedrock_client, query, filters)

    st.write(f"⏱️ **Time taken: {time_with_filter:.4f} seconds**")
    st.write("**Filtered Retrieved Movies:**")
    _render_movie_results(movies_with_filter)

    # Performance comparison
    st.subheader("⚡ Performance Comparison")
    col1, col2, col3 = st.columns(3)

    with col1:
        st.metric("Without Filter", f"{time_no_filter:.4f}s")
    with col2:
        st.metric("With Filter", f"{time_with_filter:.4f}s")
    with col3:
        # Percentage of time saved by filtering; guarded against a zero baseline.
        speedup = ((time_no_filter - time_with_filter) / time_no_filter) * 100 if time_no_filter > 0 else 0
        st.metric("Speedup", f"{speedup:.1f}%")

    # Generate final answer
    _render_answer(bedrock_client, query, movies_with_filter)


def main():
    """Render the app: database setup first, then the query/compare UI."""
    st.title("🎬 Bollywood Movies RAG with Metadata Filtering")
    st.write("Ask questions about Bollywood movies and see how metadata filtering speeds up retrieval!")

    # Initialize session state (including the client, so later reads of
    # st.session_state.bedrock_client can never hit a missing key)
    if 'collection' not in st.session_state:
        st.session_state.collection = None
    if 'setup_done' not in st.session_state:
        st.session_state.setup_done = False
    if 'bedrock_client' not in st.session_state:
        st.session_state.bedrock_client = None

    # Setup section
    if not st.session_state.setup_done:
        st.subheader("🛠️ Setup Movie Database")

        if st.button("🚀 Load Bollywood Movies Data"):
            try:
                bedrock_client = connect_to_bedrock()
                collection = setup_movie_database(bedrock_client)

                st.session_state.collection = collection
                st.session_state.bedrock_client = bedrock_client
                st.session_state.setup_done = True

                st.balloons()
            except Exception as e:
                st.error(f"❌ Setup failed: {e}")
        return

    st.success("✅ Movie database is ready!")

    # Sample queries
    st.subheader("🔍 Try These Sample Queries")

    sample_queries = [
        "What are some good action movies?",
        "Tell me a few comedy movies from the 1970s",
        "What is the movie Sholay about?",
        "Tell me a few movies directed by Hrishikesh Mukherjee",
        "What are some romantic movies from the 1990s?",
    ]

    query_option = st.radio("Choose a query:", ["Custom Query"] + sample_queries)

    if query_option == "Custom Query":
        query = st.text_input("Enter your question about Bollywood movies:")
    else:
        query = query_option
        st.write(f"Selected: **{query}**")

    if query and st.button("🔍 Search Movies"):
        try:
            _run_search(query)
        except Exception as e:
            st.error(f"❌ Search failed: {e}")

    # Show movie database
    if st.checkbox("📋 Show All Movies in Database"):
        st.subheader("Movie Database")
        df = pd.DataFrame(SAMPLE_MOVIES)
        st.dataframe(df)

    # Reset button
    if st.button("🔄 Reset Database"):
        st.session_state.collection = None
        # Drop the cached client too so a reset starts from a clean state.
        st.session_state.bedrock_client = None
        st.session_state.setup_done = False
        st.rerun()


# Installation and deployment guide
def show_guides():
    """Render the installation and deployment guides in two columns."""
    col1, col2 = st.columns(2)

    with col1:
        with st.expander("📖 Installation Guide"):
            st.markdown("""
**Step 1: Install Libraries**
```bash
pip install streamlit boto3 chromadb pandas
```

**Step 2: Setup AWS**
```bash
aws configure
```

**Step 3: Run Locally**
```bash
streamlit run bollywood_rag.py
```
""")

    with col2:
        with st.expander("🚀 Deploy to Hugging Face"):
            st.markdown("""
**Step 1: Create files**
- `app.py` (this code)
- `requirements.txt`
- `README.md`

**Step 2: requirements.txt**
```
streamlit
boto3
chromadb
pandas
```

**Step 3: Deploy**
1. Push to GitHub
2. Connect to Hugging Face Spaces
3. Select Streamlit SDK
4. Add AWS secrets in settings
""")


# Run the app
if __name__ == "__main__":
    show_guides()
    main()