PrajwalW committed on
Commit
d060bf3
·
verified ·
1 Parent(s): 79d9086

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +435 -52
app.py CHANGED
@@ -1,64 +1,447 @@
1
- import gradio as gr
2
- from huggingface_hub import InferenceClient
 
 
 
 
 
 
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
 
 
 
 
 
 
 
9
 
10
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion for ``message`` and yield the growing reply.

    Args:
        message: The new user message.
        history: Earlier ``(user, assistant)`` turns; empty entries are skipped.
        system_message: Text sent as the leading ``system`` message.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The response text accumulated so far, once per streamed chunk that
        carries a choice.
    """
    messages = [{"role": "system", "content": system_message}]

    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # The loop variable is named ``chunk`` so it no longer shadows the
    # ``message`` parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        # Skip chunks without choices instead of raising IndexError.
        if not chunk.choices:
            continue
        token = chunk.choices[0].delta.content
        # A chunk may carry no text (content is None); treat it as empty
        # rather than failing on ``str += None``.
        response += token or ""
        yield response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
61
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
 
63
  if __name__ == "__main__":
64
- demo.launch()
 
 
1
+ import streamlit as st
2
+ import boto3
3
+ import json
4
+ import chromadb
5
+ import pandas as pd
6
+ import time
7
+ import re
8
+ from datetime import datetime
9
 
10
+ # Sample Bollywood movies data (simplified for demo)
11
# Demo catalogue. Each row is (title, year, genre, director, plot) and is
# expanded into a dict with exactly those keys, in that order.
# NOTE(review): "Golmaal" and "Gol Maal" (both 1979, same director) look like
# the same film listed twice with different plot text — confirm.
SAMPLE_MOVIES = [
    dict(zip(("title", "year", "genre", "director", "plot"), row))
    for row in (
        ("Sholay", 1975, "Action", "Ramesh Sippy",
         "Two criminals are hired by a retired police officer to capture a bandit terrorizing a village."),
        ("Dilwale Dulhania Le Jayenge", 1995, "Romance", "Aditya Chopra",
         "A young man and woman fall in love during a trip to Europe, but face family opposition."),
        ("Lagaan", 2001, "Drama", "Ashutosh Gowariker",
         "Villagers accept a challenge from British officers to play cricket to avoid paying tax."),
        ("3 Idiots", 2009, "Comedy", "Rajkumar Hirani",
         "Two friends search for their missing college friend and recall their engineering days."),
        ("Dangal", 2016, "Sports", "Nitesh Tiwari",
         "A former wrestler trains his daughters to become world-class wrestlers."),
        ("Anand", 1971, "Drama", "Hrishikesh Mukherjee",
         "A terminally ill man spreads joy and teaches the meaning of life to a doctor."),
        ("Golmaal", 1979, "Comedy", "Hrishikesh Mukherjee",
         "A man creates chaos by lying about his identity to get a job."),
        ("Chupke Chupke", 1975, "Comedy", "Hrishikesh Mukherjee",
         "A newlywed plays pranks on his wife's family by pretending to be someone else."),
        ("Don", 1978, "Action", "Chandra Barot",
         "A police officer impersonates a crime boss to infiltrate his gang."),
        ("Andaz Apna Apna", 1994, "Comedy", "Rajkumar Santoshi",
         "Two friends compete to marry a wealthy heiress but get caught up in a kidnapping plot."),
        ("Mughal-E-Azam", 1960, "Romance", "K. Asif",
         "A Mughal prince falls in love with a court dancer, defying his father the emperor."),
        ("Deewaar", 1975, "Action", "Yash Chopra",
         "Two brothers choose different paths in life - one becomes a police officer, the other a criminal."),
        ("Queen", 2013, "Comedy", "Vikas Bahl",
         "A woman goes on her honeymoon alone after her wedding is called off."),
        ("Zindagi Na Milegi Dobara", 2011, "Adventure", "Zoya Akhtar",
         "Three friends go on a bachelor trip to Spain and face their fears."),
        ("Taare Zameen Par", 2007, "Drama", "Aamir Khan",
         "An art teacher helps a dyslexic child overcome his learning difficulties."),
        ("Rang De Basanti", 2006, "Drama", "Rakeysh Omprakash Mehra",
         "College students making a documentary about freedom fighters become revolutionaries themselves."),
        ("Gol Maal", 1979, "Comedy", "Hrishikesh Mukherjee",
         "A young man lies about having a mustache to keep his job with a strict boss."),
        ("Namak Haraam", 1973, "Drama", "Hrishikesh Mukherjee",
         "A friendship is tested when one friend betrays the other for money and power."),
        ("Kuch Kuch Hota Hai", 1998, "Romance", "Karan Johar",
         "A man's daughter tries to reunite him with his college sweetheart."),
        ("My Name is Khan", 2010, "Drama", "Karan Johar",
         "A man with Asperger's syndrome embarks on a journey to meet the President of the United States."),
    )
]
53
 
54
+ # Simple function to connect to AWS Bedrock
55
def connect_to_bedrock():
    """Create a boto3 ``bedrock-runtime`` client for region ``us-east-1``.

    Returns:
        The client, or ``None`` if creating it raised. In that case an error
        banner is shown in the Streamlit UI.
    """
    try:
        return boto3.client('bedrock-runtime', region_name='us-east-1')
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are not swallowed.
        st.error("⚠️ AWS Bedrock not configured. Using mock responses for demo.")
        return None
62
 
63
+ # Get embeddings from Bedrock
64
def get_embeddings(bedrock_client, text):
    """Return an embedding vector for ``text``.

    With a client, the ``amazon.titan-embed-text-v1`` model is invoked and the
    ``embedding`` field of its JSON response is returned. Without a client, or
    if the call fails, a 1536-float placeholder vector is returned instead.

    The placeholder is seeded from ``text``, so the same input always maps to
    the same vector. This keeps demo-mode indexing and querying reproducible.

    Args:
        bedrock_client: Object exposing ``invoke_model(modelId=..., body=...)``,
            or a falsy value to force the placeholder.
        text: The text to embed.

    Returns:
        A list of floats.
    """
    import random  # local import: the module does not import it at file level

    def _placeholder_embedding():
        # A private generator keeps the global random state untouched.
        rng = random.Random(str(text))
        return [rng.random() for _ in range(1536)]

    if not bedrock_client:
        return _placeholder_embedding()

    try:
        response = bedrock_client.invoke_model(
            modelId="amazon.titan-embed-text-v1",
            body=json.dumps({"inputText": text}),
        )
        return json.loads(response['body'].read())['embedding']
    except Exception:
        # Best-effort demo behaviour: any API or parsing failure degrades to
        # the placeholder rather than aborting the request.
        return _placeholder_embedding()
82
 
83
+ # Create movie documents and store in ChromaDB
84
def setup_movie_database(bedrock_client):
    """Build the ``bollywood_movies`` ChromaDB collection from ``SAMPLE_MOVIES``.

    Each movie becomes one document (title, year, genre, director and plot as
    text) with an embedding from ``get_embeddings``. Its metadata holds the
    title, year, lower-cased genre and director, and a derived ``decade``
    string such as ``"1970s"``. Progress is reported in the Streamlit UI.

    Args:
        bedrock_client: Passed through to ``get_embeddings``.

    Returns:
        The populated ChromaDB collection.
    """
    st.write("🎬 Setting up Bollywood movies database...")

    chroma_client = chromadb.Client()

    # Start from a clean collection. The delete raises when no collection
    # exists yet, which is expected on first run and safe to ignore.
    try:
        chroma_client.delete_collection("bollywood_movies")
    except Exception:
        pass

    collection = chroma_client.create_collection("bollywood_movies")

    ids = []
    documents = []
    metadatas = []
    embeddings = []

    progress_bar = st.progress(0)
    total = len(SAMPLE_MOVIES)

    for i, movie in enumerate(SAMPLE_MOVIES):
        doc_text = (
            f"Title: {movie['title']}\n"
            f"Year: {movie['year']}\n"
            f"Genre: {movie['genre']}\n"
            f"Director: {movie['director']}\n"
            f"Plot: {movie['plot']}"
        )

        ids.append(str(i))
        documents.append(doc_text)
        metadatas.append({
            'title': movie['title'],
            'year': movie['year'],
            # Lower-cased so that filter values can be matched exactly.
            'genre': movie['genre'].lower(),
            'director': movie['director'].lower(),
            'decade': f"{(movie['year'] // 10) * 10}s",
        })
        embeddings.append(get_embeddings(bedrock_client, doc_text))

        progress_bar.progress((i + 1) / total)

    # Skip the insert when there is nothing to add.
    if ids:
        collection.add(
            ids=ids,
            documents=documents,
            metadatas=metadatas,
            embeddings=embeddings,
        )

    st.success(f"✅ Added {total} movies to database!")
    return collection
137
 
138
+ # Simple query filter detection
139
def detect_filters(query):
    """Derive metadata filters from a free-text movie question.

    Detection is keyword based and case-insensitive:

    * ``genre``: first genre, in a fixed priority order, with a keyword in the
      query. "romantic" maps to ``romance`` and "comedies" to ``comedy``.
    * ``decade``: a literal such as "1990s". A standalone four-digit year
      (19xx/20xx) overrides it with that year's decade.
    * ``director``: first known director name found in the query.

    Args:
        query: The user's question; ``None`` or empty yields no filters.

    Returns:
        A dict with any of the keys ``genre``, ``decade``, ``director``.
    """
    query = query or ""
    query_lower = query.lower()
    filters = {}

    # Priority order is action, comedy, drama, romance, sports, adventure.
    # Each genre lists the surface forms that should trigger it.
    genre_keywords = (
        ('action', ('action',)),
        ('comedy', ('comedy', 'comedies')),
        ('drama', ('drama',)),
        ('romance', ('romance', 'romantic')),
        ('sports', ('sports',)),
        ('adventure', ('adventure',)),
    )
    for genre, keywords in genre_keywords:
        if any(keyword in query_lower for keyword in keywords):
            filters['genre'] = genre
            break

    decades = ['1960s', '1970s', '1980s', '1990s', '2000s', '2010s']
    for decade in decades:
        if decade in query_lower:
            filters['decade'] = decade
            break

    # \b keeps "1970s" from matching as a year: there is no word boundary
    # between the final digit and the "s".
    year_match = re.search(r'\b(19\d{2}|20\d{2})\b', query)
    if year_match:
        year = int(year_match.group(1))
        filters['decade'] = f"{(year // 10) * 10}s"

    directors = ['hrishikesh mukherjee', 'rajkumar hirani', 'aamir khan', 'yash chopra']
    for director in directors:
        if director in query_lower:
            filters['director'] = director
            break

    return filters
171
 
172
+ # Retrieve without metadata filter
173
def retrieve_without_filter(collection, bedrock_client, query, top_k=5):
    """Run a plain vector search for ``query`` with no metadata filter.

    Args:
        collection: Collection object queried via ``collection.query``.
        bedrock_client: Passed through to ``get_embeddings``.
        query: The user's question.
        top_k: Number of results requested.

    Returns:
        ``(movies, seconds)``. ``movies`` is a list of dicts with the keys
        ``document``, ``metadata`` and ``distance``. ``seconds`` is the
        wall-clock time spent embedding the query and searching.
    """
    started_at = time.time()

    vector = get_embeddings(bedrock_client, query)
    hits = collection.query(query_embeddings=[vector], n_results=top_k)

    finished_at = time.time()

    # Results come back as one list per query; only one query was sent.
    movies = [
        {
            'document': doc,
            'metadata': hits['metadatas'][0][idx],
            'distance': hits['distances'][0][idx],
        }
        for idx, doc in enumerate(hits['documents'][0])
    ]

    return movies, finished_at - started_at
197
 
198
+ # Retrieve with metadata filter
199
def retrieve_with_filter(collection, bedrock_client, query, filters, top_k=5):
    """Run a vector search for ``query`` restricted by metadata ``filters``.

    Args:
        collection: Collection object queried via ``collection.query``.
        bedrock_client: Passed through to ``get_embeddings``.
        query: The user's question.
        filters: Mapping of metadata field to required value. All entries
            must match; an empty mapping means no restriction.
        top_k: Number of results requested.

    Returns:
        ``(movies, seconds)``. ``movies`` is a list of dicts with the keys
        ``document``, ``metadata`` and ``distance``. ``seconds`` is the
        wall-clock time spent embedding the query and searching.
    """
    start_time = time.time()

    query_embedding = get_embeddings(bedrock_client, query)

    # A ``where`` dict may hold only one top-level condition, so several
    # field filters are combined under ``$and``. A plain multi-key dict is
    # rejected, which previously triggered the unfiltered fallback below.
    conditions = [{key: value} for key, value in filters.items()]
    if not conditions:
        where_clause = None
    elif len(conditions) == 1:
        where_clause = conditions[0]
    else:
        where_clause = {"$and": conditions}

    query_kwargs = {"query_embeddings": [query_embedding], "n_results": top_k}
    if where_clause is not None:
        query_kwargs["where"] = where_clause

    try:
        results = collection.query(**query_kwargs)
    except Exception:
        # Best-effort: if the filtered search fails, fall back to no filter.
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
        )

    end_time = time.time()

    # Results come back as one list per query; only one query was sent.
    movies = [
        {'document': document, 'metadata': metadata, 'distance': distance}
        for document, metadata, distance in zip(
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0],
        )
    ]

    return movies, end_time - start_time
236
 
237
+ # Generate answer using Bedrock
238
def generate_answer(bedrock_client, query, movies):
    """Produce a natural-language answer to ``query`` from retrieved movies.

    With a client, the movie documents are placed in a prompt and sent to
    ``anthropic.claude-3-haiku-20240307-v1:0``; the first content block's text
    is returned. Without a client, or if the call fails, a canned message is
    returned instead.

    Args:
        bedrock_client: Object exposing ``invoke_model(modelId=..., body=...)``,
            or a falsy value to skip the model call.
        query: The user's question.
        movies: Retrieved results; each item's ``document`` text is used as
            context.

    Returns:
        The answer text.
    """
    if not bedrock_client:
        return "🎬 Based on the retrieved movies, here are some recommendations that match your query!"

    context = "\n\n".join(movie['document'] for movie in movies)

    prompt = (
        "\n"
        "Based on the following Bollywood movies information, please answer the user's question.\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Movies Information:\n"
        f"{context}\n"
        "\n"
        "Please provide a helpful and informative answer about the movies.\n"
    )

    try:
        body = json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 400,
            "messages": [{"role": "user", "content": prompt}],
        })
        response = bedrock_client.invoke_model(
            modelId="anthropic.claude-3-haiku-20240307-v1:0",
            body=body,
        )
        result = json.loads(response['body'].read())
        return result['content'][0]['text']
    except Exception:
        # Best-effort demo behaviour: degrade to a canned answer on any API
        # or parsing failure. ``Exception`` rather than a bare ``except`` so
        # that Ctrl-C still propagates.
        return "🎬 Based on the retrieved movies, here are some great recommendations that match your query!"
272
 
273
+ # Main app
274
def _show_retrieved_movies(movies):
    """Render each retrieved movie as an expander with genre, director and distance."""
    for rank, movie in enumerate(movies, 1):
        meta = movie['metadata']
        with st.expander(f"{rank}. {meta['title']} ({meta['year']})"):
            st.write(f"**Genre:** {meta['genre'].title()}")
            st.write(f"**Director:** {meta['director'].title()}")
            st.write(f"**Distance:** {movie['distance']:.4f}")


def _run_search_comparison(query):
    """Search with and without metadata filters, show both, then the generated answer."""
    bedrock_client = st.session_state.bedrock_client
    collection = st.session_state.collection

    filters = detect_filters(query)

    st.write("---")

    # Method 1: Without metadata filter
    st.subheader("📊 Method 1: Without Metadata Filter")
    movies_no_filter, time_no_filter = retrieve_without_filter(collection, bedrock_client, query)

    st.write(f"⏱️ **Time taken: {time_no_filter:.4f} seconds**")
    st.write("**Retrieved Movies:**")
    _show_retrieved_movies(movies_no_filter)

    # Method 2: With metadata filter
    st.subheader("🎯 Method 2: With Metadata Filter")

    if not filters:
        st.write("**No specific filters detected** - using general retrieval")
        st.write(f"⏱️ **Time taken: {time_no_filter:.4f} seconds**")

        st.subheader("🤖 AI Generated Answer")
        st.success(generate_answer(bedrock_client, query, movies_no_filter))
        return

    st.write(f"**Detected Filters:** {filters}")
    movies_with_filter, time_with_filter = retrieve_with_filter(collection, bedrock_client, query, filters)

    st.write(f"⏱️ **Time taken: {time_with_filter:.4f} seconds**")
    st.write("**Filtered Retrieved Movies:**")
    _show_retrieved_movies(movies_with_filter)

    st.subheader("⚡ Performance Comparison")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Without Filter", f"{time_no_filter:.4f}s")
    with col2:
        st.metric("With Filter", f"{time_with_filter:.4f}s")
    with col3:
        # Relative time saved by filtering; guarded against division by zero.
        speedup = ((time_no_filter - time_with_filter) / time_no_filter) * 100 if time_no_filter > 0 else 0
        st.metric("Speedup", f"{speedup:.1f}%")

    # The answer is generated from the filtered results when filters exist.
    st.subheader("🤖 AI Generated Answer")
    st.success(generate_answer(bedrock_client, query, movies_with_filter))


def main():
    """Render the Streamlit app: one-time database setup, then the query UI.

    Session state keys used: ``collection``, ``bedrock_client`` and
    ``setup_done``. Until setup has succeeded only the setup button is shown.
    Afterwards the page offers sample and custom queries, the comparison of
    filtered and unfiltered retrieval, a table of all movies, and a reset
    button.
    """
    st.title("🎬 Bollywood Movies RAG with Metadata Filtering")
    st.write("Ask questions about Bollywood movies and see how metadata filtering speeds up retrieval!")

    # Initialize session state. ``bedrock_client`` gets a default too, so
    # reading it can never raise.
    for key, default in (('collection', None), ('bedrock_client', None), ('setup_done', False)):
        if key not in st.session_state:
            st.session_state[key] = default

    # Setup section
    if not st.session_state.setup_done:
        st.subheader("🛠️ Setup Movie Database")

        if st.button("🚀 Load Bollywood Movies Data"):
            try:
                bedrock_client = connect_to_bedrock()
                collection = setup_movie_database(bedrock_client)
            except Exception as e:
                st.error(f"❌ Setup failed: {e}")
            else:
                st.session_state.collection = collection
                st.session_state.bedrock_client = bedrock_client
                st.session_state.setup_done = True
                st.balloons()
                # Rerun so the query UI appears right away. Without this the
                # setup screen stays until the user triggers another rerun.
                st.rerun()
        return

    st.success("✅ Movie database is ready!")

    # Sample queries
    st.subheader("🔍 Try These Sample Queries")
    sample_queries = [
        "What are some good action movies?",
        "Tell me a few comedy movies from the 1970s",
        "What is the movie Sholay about?",
        "Tell me a few movies directed by Hrishikesh Mukherjee",
        "What are some romantic movies from the 1990s?",
    ]

    query_option = st.radio("Choose a query:", ["Custom Query"] + sample_queries)

    if query_option == "Custom Query":
        query = st.text_input("Enter your question about Bollywood movies:")
    else:
        query = query_option
        st.write(f"Selected: **{query}**")

    # The search button is only rendered once there is a query to run.
    if query and st.button("🔍 Search Movies"):
        try:
            _run_search_comparison(query)
        except Exception as e:
            st.error(f"❌ Search failed: {e}")

    # Show movie database
    if st.checkbox("📋 Show All Movies in Database"):
        st.subheader("Movie Database")
        st.dataframe(pd.DataFrame(SAMPLE_MOVIES))

    # Reset button
    if st.button("🔄 Reset Database"):
        st.session_state.collection = None
        st.session_state.setup_done = False
        st.rerun()
397
 
398
+ # Installation and deployment guide
399
def show_guides():
    """Render two side-by-side expanders: local installation and Hugging Face deployment."""
    # Built line by line so the Markdown does not depend on how the source is
    # indented.
    installation_md = "\n".join([
        "**Step 1: Install Libraries**",
        "```bash",
        "pip install streamlit boto3 chromadb pandas",
        "```",
        "",
        "**Step 2: Setup AWS**",
        "```bash",
        "aws configure",
        "```",
        "",
        "**Step 3: Run Locally**",
        "```bash",
        # The script is app.py, as the deployment guide below also states.
        "streamlit run app.py",
        "```",
    ])

    deployment_md = "\n".join([
        "**Step 1: Create files**",
        "- `app.py` (this code)",
        "- `requirements.txt`",
        "- `README.md`",
        "",
        "**Step 2: requirements.txt**",
        "```",
        "streamlit",
        "boto3",
        "chromadb",
        "pandas",
        "```",
        "",
        "**Step 3: Deploy**",
        "1. Push to GitHub",
        "2. Connect to Hugging Face Spaces",
        "3. Select Streamlit SDK",
        "4. Add AWS secrets in settings",
    ])

    col1, col2 = st.columns(2)

    with col1:
        with st.expander("📖 Installation Guide"):
            st.markdown(installation_md)

    with col2:
        with st.expander("🚀 Deploy to Hugging Face"):
            st.markdown(deployment_md)
443
 
444
+ # Run the app
445
  if __name__ == "__main__":
446
+ show_guides()
447
+ main()