# Spaces: Sleeping (page-status header captured along with the source; not part of the program)
import streamlit as st | |
import boto3 | |
import json | |
import chromadb | |
import pandas as pd | |
import time | |
import re | |
from datetime import datetime | |
# Sample Bollywood movies data (simplified for demo).
# Every record carries the same five keys: title, year, genre, director, plot.
# NOTE(review): "Golmaal" and "Gol Maal" below share the same year and director;
# they may be the same film listed twice -- confirm before relying on counts.
SAMPLE_MOVIES = [
    {
        "title": "Sholay",
        "year": 1975,
        "genre": "Action",
        "director": "Ramesh Sippy",
        "plot": "Two criminals are hired by a retired police officer to capture a bandit terrorizing a village.",
    },
    {
        "title": "Dilwale Dulhania Le Jayenge",
        "year": 1995,
        "genre": "Romance",
        "director": "Aditya Chopra",
        "plot": "A young man and woman fall in love during a trip to Europe, but face family opposition.",
    },
    {
        "title": "Lagaan",
        "year": 2001,
        "genre": "Drama",
        "director": "Ashutosh Gowariker",
        "plot": "Villagers accept a challenge from British officers to play cricket to avoid paying tax.",
    },
    {
        "title": "3 Idiots",
        "year": 2009,
        "genre": "Comedy",
        "director": "Rajkumar Hirani",
        "plot": "Two friends search for their missing college friend and recall their engineering days.",
    },
    {
        "title": "Dangal",
        "year": 2016,
        "genre": "Sports",
        "director": "Nitesh Tiwari",
        "plot": "A former wrestler trains his daughters to become world-class wrestlers.",
    },
    {
        "title": "Anand",
        "year": 1971,
        "genre": "Drama",
        "director": "Hrishikesh Mukherjee",
        "plot": "A terminally ill man spreads joy and teaches the meaning of life to a doctor.",
    },
    {
        "title": "Golmaal",
        "year": 1979,
        "genre": "Comedy",
        "director": "Hrishikesh Mukherjee",
        "plot": "A man creates chaos by lying about his identity to get a job.",
    },
    {
        "title": "Chupke Chupke",
        "year": 1975,
        "genre": "Comedy",
        "director": "Hrishikesh Mukherjee",
        "plot": "A newlywed plays pranks on his wife's family by pretending to be someone else.",
    },
    {
        "title": "Don",
        "year": 1978,
        "genre": "Action",
        "director": "Chandra Barot",
        "plot": "A police officer impersonates a crime boss to infiltrate his gang.",
    },
    {
        "title": "Andaz Apna Apna",
        "year": 1994,
        "genre": "Comedy",
        "director": "Rajkumar Santoshi",
        "plot": "Two friends compete to marry a wealthy heiress but get caught up in a kidnapping plot.",
    },
    {
        "title": "Mughal-E-Azam",
        "year": 1960,
        "genre": "Romance",
        "director": "K. Asif",
        "plot": "A Mughal prince falls in love with a court dancer, defying his father the emperor.",
    },
    {
        "title": "Deewaar",
        "year": 1975,
        "genre": "Action",
        "director": "Yash Chopra",
        "plot": "Two brothers choose different paths in life - one becomes a police officer, the other a criminal.",
    },
    {
        "title": "Queen",
        "year": 2013,
        "genre": "Comedy",
        "director": "Vikas Bahl",
        "plot": "A woman goes on her honeymoon alone after her wedding is called off.",
    },
    {
        "title": "Zindagi Na Milegi Dobara",
        "year": 2011,
        "genre": "Adventure",
        "director": "Zoya Akhtar",
        "plot": "Three friends go on a bachelor trip to Spain and face their fears.",
    },
    {
        "title": "Taare Zameen Par",
        "year": 2007,
        "genre": "Drama",
        "director": "Aamir Khan",
        "plot": "An art teacher helps a dyslexic child overcome his learning difficulties.",
    },
    {
        "title": "Rang De Basanti",
        "year": 2006,
        "genre": "Drama",
        "director": "Rakeysh Omprakash Mehra",
        "plot": "College students making a documentary about freedom fighters become revolutionaries themselves.",
    },
    {
        "title": "Gol Maal",
        "year": 1979,
        "genre": "Comedy",
        "director": "Hrishikesh Mukherjee",
        "plot": "A young man lies about having a mustache to keep his job with a strict boss.",
    },
    {
        "title": "Namak Haraam",
        "year": 1973,
        "genre": "Drama",
        "director": "Hrishikesh Mukherjee",
        "plot": "A friendship is tested when one friend betrays the other for money and power.",
    },
    {
        "title": "Kuch Kuch Hota Hai",
        "year": 1998,
        "genre": "Romance",
        "director": "Karan Johar",
        "plot": "A man's daughter tries to reunite him with his college sweetheart.",
    },
    {
        "title": "My Name is Khan",
        "year": 2010,
        "genre": "Drama",
        "director": "Karan Johar",
        "plot": "A man with Asperger's syndrome embarks on a journey to meet the President of the United States.",
    },
]
# Simple function to connect to AWS Bedrock
def connect_to_bedrock():
    """Create a Bedrock runtime client, or return ``None`` if that fails.

    Returns:
        The object returned by ``boto3.client('bedrock-runtime', ...)`` for
        region ``us-east-1``, or ``None`` when constructing it raised. The
        rest of this file treats a falsy client as "use mock responses".
    """
    try:
        return boto3.client('bedrock-runtime', region_name='us-east-1')
    except Exception:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        st.error("β οΈ AWS Bedrock not configured. Using mock responses for demo.")
        return None
# Get embeddings from Bedrock
def get_embeddings(bedrock_client, text, dummy_dim=1536):
    """Return an embedding vector for ``text``.

    Args:
        bedrock_client: Object exposing ``invoke_model``; a falsy value
            selects the demo fallback.
        text: Text to embed.
        dummy_dim: Length of the random fallback vector. Defaults to 1536,
            the size the original code hard-coded.

    Returns:
        The ``'embedding'`` value from the model response, or a list of
        ``dummy_dim`` random floats when no client is given or the call fails.
    """
    import random  # local import, as in the original: not imported at file level

    def _dummy_embedding():
        # Random placeholder so the demo keeps working without Bedrock.
        return [random.random() for _ in range(dummy_dim)]

    if not bedrock_client:
        return _dummy_embedding()

    try:
        response = bedrock_client.invoke_model(
            modelId="amazon.titan-embed-text-v1",
            body=json.dumps({"inputText": text}),
        )
        return json.loads(response['body'].read())['embedding']
    except Exception:
        # Best-effort by design: any API or parsing failure degrades to the
        # dummy vector. ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit propagating.
        return _dummy_embedding()
# Create movie documents and store in ChromaDB
def setup_movie_database(bedrock_client):
    """Build the ``bollywood_movies`` collection from ``SAMPLE_MOVIES``.

    For every movie a text document is assembled, embedded through
    ``get_embeddings`` and stored together with lower-cased genre/director
    and a derived ``decade`` string (e.g. ``"1970s"``) as metadata.
    Progress and the final status are written to the Streamlit page.

    Args:
        bedrock_client: Passed through to ``get_embeddings``.

    Returns:
        The newly created ChromaDB collection.
    """
    st.write("π¬ Setting up Bollywood movies database...")

    chroma_client = chromadb.Client()

    # Start from a clean collection on every setup run.
    try:
        chroma_client.delete_collection("bollywood_movies")
    except Exception:
        # Best-effort delete: presumably this raises when the collection does
        # not exist yet (first run). ``Exception`` rather than a bare
        # ``except`` so KeyboardInterrupt / SystemExit still propagate.
        pass
    collection = chroma_client.create_collection("bollywood_movies")

    ids = []
    documents = []
    metadatas = []
    embeddings = []

    total = len(SAMPLE_MOVIES)
    progress_bar = st.progress(0)
    for i, movie in enumerate(SAMPLE_MOVIES):
        doc_text = (
            f"Title: {movie['title']}\n"
            f"Year: {movie['year']}\n"
            f"Genre: {movie['genre']}\n"
            f"Director: {movie['director']}\n"
            f"Plot: {movie['plot']}"
        )

        ids.append(str(i))
        documents.append(doc_text)
        metadatas.append({
            'title': movie['title'],
            'year': movie['year'],
            # Lower-cased so they compare equal to the lower-cased values
            # produced by detect_filters.
            'genre': movie['genre'].lower(),
            'director': movie['director'].lower(),
            'decade': f"{(movie['year'] // 10) * 10}s",
        })
        embeddings.append(get_embeddings(bedrock_client, doc_text))

        progress_bar.progress((i + 1) / total)

    # Single batched insert of all records.
    collection.add(
        ids=ids,
        documents=documents,
        metadatas=metadatas,
        embeddings=embeddings,
    )
    st.success(f"β Added {total} movies to database!")
    return collection
# Simple query filter detection
def detect_filters(query):
    """Derive metadata filters (genre, decade, director) from a free-text query.

    Args:
        query: The user's question.

    Returns:
        A dict with any of the keys ``'genre'``, ``'decade'`` and
        ``'director'``; empty when nothing was recognised. Values are
        lower-case strings; decades look like ``"1970s"``.
    """
    query_lower = query.lower()
    filters = {}

    # Genre detection: first canonical genre name found as a substring wins.
    genres = ['action', 'comedy', 'drama', 'romance', 'sports', 'adventure']
    genre = next((g for g in genres if g in query_lower), None)
    if genre is None:
        # Fallback for common word forms that do not contain the canonical
        # name (e.g. "romantic" does not contain "romance"). Matched on word
        # boundaries so "transport" does not trigger "sport". Only consulted
        # when no canonical name matched, so earlier results are unchanged.
        genre_aliases = {
            'romantic': 'romance',
            'comedies': 'comedy',
            'comedic': 'comedy',
            'sport': 'sports',
            'adventurous': 'adventure',
        }
        genre = next(
            (
                canonical
                for alias, canonical in genre_aliases.items()
                if re.search(rf'\b{re.escape(alias)}\b', query_lower)
            ),
            None,
        )
    if genre is not None:
        filters['genre'] = genre

    # Decade detection: explicit "1970s"-style mentions.
    decades = ['1960s', '1970s', '1980s', '1990s', '2000s', '2010s']
    decade = next((d for d in decades if d in query_lower), None)
    if decade is not None:
        filters['decade'] = decade

    # Year detection: a standalone four-digit year maps to its decade and,
    # as in the original code, overrides an explicit decade mention.
    years = re.findall(r'\b(19\d{2}|20\d{2})\b', query)
    if years:
        year = int(years[0])
        filters['decade'] = f"{(year // 10) * 10}s"

    # Director detection (simple substring match). The first four names are
    # the original list, kept in order; the rest are the remaining directors
    # that appear in SAMPLE_MOVIES.
    directors = [
        'hrishikesh mukherjee', 'rajkumar hirani', 'aamir khan', 'yash chopra',
        'ramesh sippy', 'aditya chopra', 'ashutosh gowariker', 'nitesh tiwari',
        'chandra barot', 'rajkumar santoshi', 'k. asif', 'vikas bahl',
        'zoya akhtar', 'rakeysh omprakash mehra', 'karan johar',
    ]
    director = next((d for d in directors if d in query_lower), None)
    if director is not None:
        filters['director'] = director

    return filters
# Retrieve without metadata filter
def retrieve_without_filter(collection, bedrock_client, query, top_k=5):
    """Run an unfiltered similarity search and time it.

    Args:
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``.
        bedrock_client: Passed through to ``get_embeddings``.
        query: The user's question.
        top_k: Maximum number of results requested.

    Returns:
        ``(movies, elapsed_seconds)`` where ``movies`` is a list of dicts
        with keys ``'document'``, ``'metadata'`` and ``'distance'``. The
        elapsed time covers embedding the query plus the search itself.
    """
    # perf_counter is the clock intended for measuring short intervals;
    # time.time() is wall-clock and can be coarse or jump.
    started = time.perf_counter()

    query_embedding = get_embeddings(bedrock_client, query)
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k,
    )

    elapsed = time.perf_counter() - started

    # One query embedding was sent, so index 0 holds its result lists.
    movies = [
        {'document': document, 'metadata': metadata, 'distance': distance}
        for document, metadata, distance in zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0],
        )
    ]
    return movies, elapsed
# Retrieve with metadata filter
def retrieve_with_filter(collection, bedrock_client, query, filters, top_k=5):
    """Run a metadata-filtered similarity search and time it.

    Args:
        collection: Object exposing
            ``query(query_embeddings=..., n_results=..., where=...)``.
        bedrock_client: Passed through to ``get_embeddings``.
        query: The user's question.
        filters: Mapping of metadata field -> required value (as produced by
            ``detect_filters``). May be empty.
        top_k: Maximum number of results requested.

    Returns:
        ``(movies, elapsed_seconds)`` where ``movies`` is a list of dicts
        with keys ``'document'``, ``'metadata'`` and ``'distance'``. If the
        filtered query fails, the unfiltered results are returned instead.
    """
    # perf_counter is the clock intended for measuring short intervals.
    started = time.perf_counter()

    query_embedding = get_embeddings(bedrock_client, query)

    # Chroma accepts a single-field dict directly, but several conditions
    # must be combined under an explicit "$and". Passing a multi-key dict
    # made the filtered query fail and silently fall back to no filter.
    conditions = [{field: value} for field, value in filters.items()]
    if not conditions:
        where_clause = None
    elif len(conditions) == 1:
        where_clause = conditions[0]
    else:
        where_clause = {"$and": conditions}

    results = None
    if where_clause is not None:
        try:
            results = collection.query(
                query_embeddings=[query_embedding],
                n_results=top_k,
                where=where_clause,
            )
        except Exception:
            # Deliberate best-effort: if filtering fails, fall back to the
            # unfiltered search below. ``Exception`` (not a bare ``except``)
            # keeps KeyboardInterrupt / SystemExit propagating.
            results = None
    if results is None:
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
        )

    elapsed = time.perf_counter() - started

    # One query embedding was sent, so index 0 holds its result lists.
    movies = [
        {'document': document, 'metadata': metadata, 'distance': distance}
        for document, metadata, distance in zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0],
        )
    ]
    return movies, elapsed
# Generate answer using Bedrock
def generate_answer(bedrock_client, query, movies):
    """Produce a natural-language answer from the retrieved movies.

    Args:
        bedrock_client: Object exposing ``invoke_model``; a falsy value
            selects the canned demo answer.
        query: The user's question.
        movies: Retrieved results; each item's ``'document'`` text is used
            as context for the model.

    Returns:
        The model's text reply, or a fixed placeholder sentence when no
        client is available or the call fails.
    """
    if not bedrock_client:
        return "π¬ Based on the retrieved movies, here are some recommendations that match your query!"

    # Create context from movies
    context = "\n\n".join(movie['document'] for movie in movies)
    prompt = f"""
Based on the following Bollywood movies information, please answer the user's question.
Question: {query}
Movies Information:
{context}
Please provide a helpful and informative answer about the movies.
"""
    try:
        body = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 400,
            "messages": [{"role": "user", "content": prompt}],
        })
        response = bedrock_client.invoke_model(
            modelId="anthropic.claude-3-haiku-20240307-v1:0",
            body=body,
        )
        result = json.loads(response['body'].read())
        return result['content'][0]['text']
    except Exception:
        # Best-effort by design: any API or parsing failure yields the
        # placeholder. ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt / SystemExit propagating.
        return "π¬ Based on the retrieved movies, here are some great recommendations that match your query!"
# Main app
def _render_movie_list(movies):
    """Render each retrieved movie as an expander with genre, director, distance."""
    for rank, movie in enumerate(movies, 1):
        metadata = movie['metadata']
        with st.expander(f"{rank}. {metadata['title']} ({metadata['year']})"):
            st.write(f"**Genre:** {metadata['genre'].title()}")
            st.write(f"**Director:** {metadata['director'].title()}")
            st.write(f"**Distance:** {movie['distance']:.4f}")


def _render_answer(bedrock_client, query, movies):
    """Render the generated-answer section for the given retrieved movies."""
    st.subheader("π€ AI Generated Answer")
    st.success(generate_answer(bedrock_client, query, movies))


def main():
    """Render the Streamlit page: database setup, then query and comparison UI.

    Before setup, only the "load data" button is shown. Once the collection
    is stored in ``st.session_state``, the page offers sample/custom queries,
    runs retrieval without and (when filters are detected) with metadata
    filtering, shows timings, and renders a generated answer.
    """
    st.title("π¬ Bollywood Movies RAG with Metadata Filtering")
    st.write("Ask questions about Bollywood movies and see how metadata filtering speeds up retrieval!")

    # Initialize session state
    if 'collection' not in st.session_state:
        st.session_state.collection = None
    if 'setup_done' not in st.session_state:
        st.session_state.setup_done = False

    # Setup section
    if not st.session_state.setup_done:
        st.subheader("π οΈ Setup Movie Database")
        if st.button("π Load Bollywood Movies Data"):
            try:
                bedrock_client = connect_to_bedrock()
                collection = setup_movie_database(bedrock_client)
                st.session_state.collection = collection
                st.session_state.bedrock_client = bedrock_client
                st.session_state.setup_done = True
                st.balloons()
            except Exception as e:
                st.error(f"β Setup failed: {str(e)}")
    else:
        st.success("β Movie database is ready!")

        # Sample queries
        st.subheader("π Try These Sample Queries")
        sample_queries = [
            "What are some good action movies?",
            "Tell me a few comedy movies from the 1970s",
            "What is the movie Sholay about?",
            "Tell me a few movies directed by Hrishikesh Mukherjee",
            "What are some romantic movies from the 1990s?",
        ]
        query_option = st.radio("Choose a query:", ["Custom Query"] + sample_queries)
        if query_option == "Custom Query":
            query = st.text_input("Enter your question about Bollywood movies:")
        else:
            query = query_option
            st.write(f"Selected: **{query}**")

        if query:
            if st.button("π Search Movies"):
                try:
                    bedrock_client = st.session_state.bedrock_client
                    collection = st.session_state.collection

                    # Detect filters
                    filters = detect_filters(query)
                    st.write("---")

                    # Method 1: Without metadata filter
                    st.subheader("π Method 1: Without Metadata Filter")
                    movies_no_filter, time_no_filter = retrieve_without_filter(collection, bedrock_client, query)
                    st.write(f"β±οΈ **Time taken: {time_no_filter:.4f} seconds**")
                    st.write("**Retrieved Movies:**")
                    _render_movie_list(movies_no_filter)

                    # Method 2: With metadata filter
                    st.subheader("π― Method 2: With Metadata Filter")
                    if filters:
                        st.write(f"**Detected Filters:** {filters}")
                        movies_with_filter, time_with_filter = retrieve_with_filter(collection, bedrock_client, query, filters)
                        st.write(f"β±οΈ **Time taken: {time_with_filter:.4f} seconds**")
                        st.write("**Filtered Retrieved Movies:**")
                        _render_movie_list(movies_with_filter)

                        # Performance comparison
                        st.subheader("β‘ Performance Comparison")
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric("Without Filter", f"{time_no_filter:.4f}s")
                        with col2:
                            st.metric("With Filter", f"{time_with_filter:.4f}s")
                        with col3:
                            # Relative time saved by filtering, in percent;
                            # guarded against a zero baseline.
                            speedup = ((time_no_filter - time_with_filter) / time_no_filter) * 100 if time_no_filter > 0 else 0
                            st.metric("Speedup", f"{speedup:.1f}%")

                        # Generate final answer
                        _render_answer(bedrock_client, query, movies_with_filter)
                    else:
                        st.write("**No specific filters detected** - using general retrieval")
                        st.write(f"β±οΈ **Time taken: {time_no_filter:.4f} seconds**")

                        # Generate answer with no filter results
                        _render_answer(bedrock_client, query, movies_no_filter)
                except Exception as e:
                    st.error(f"β Search failed: {str(e)}")

        # Show movie database
        if st.checkbox("π Show All Movies in Database"):
            st.subheader("Movie Database")
            df = pd.DataFrame(SAMPLE_MOVIES)
            st.dataframe(df)

        # Reset button: drop the collection and return to the setup screen.
        if st.button("π Reset Database"):
            st.session_state.collection = None
            st.session_state.setup_done = False
            st.rerun()
# Installation and deployment guide
def show_guides():
    """Render two side-by-side collapsible guides: installation and deployment."""
    installation_md = """
**Step 1: Install Libraries**
```bash
pip install streamlit boto3 chromadb pandas
```
**Step 2: Setup AWS**
```bash
aws configure
```
**Step 3: Run Locally**
```bash
streamlit run bollywood_rag.py
```
"""
    deployment_md = """
**Step 1: Create files**
- `app.py` (this code)
- `requirements.txt`
- `README.md`
**Step 2: requirements.txt**
```
streamlit
boto3
chromadb
pandas
```
**Step 3: Deploy**
1. Push to GitHub
2. Connect to Hugging Face Spaces
3. Select Streamlit SDK
4. Add AWS secrets in settings
"""
    left_column, right_column = st.columns(2)

    # Left: local installation steps.
    with left_column, st.expander("π Installation Guide"):
        st.markdown(installation_md)

    # Right: deployment steps.
    with right_column, st.expander("π Deploy to Hugging Face"):
        st.markdown(deployment_md)
# Run the app
# Entry point: when this file is executed as a script, render the collapsible
# guides first and then the main application page below them.
if __name__ == "__main__":
    show_guides()
    main()