PrajwalW commited on
Commit
13e2a13
Β·
verified Β·
1 Parent(s): a6ae532

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +435 -61
app.py CHANGED
@@ -1,64 +1,438 @@
1
- import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  if __name__ == "__main__":
64
- demo.launch()
 
 
 
 
 
1
+ import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ import boto3
4
+
5
+ import json
6
+
7
+ from qdrant_client import QdrantClient
8
+
9
+ from qdrant_client.http import models
10
+
11
+ import PyPDF2
12
+
13
+ import io
14
+
15
+ import uuid
16
+
17
# Simple function to connect to AWS Bedrock
def connect_to_bedrock():
    """Return a boto3 Bedrock runtime client pinned to us-east-1."""
    return boto3.client('bedrock-runtime', region_name='us-east-1')
24
+
25
# Simple function to connect to QDrant Cloud
def connect_to_qdrant(api_key, url):
    """Return a QdrantClient for the given cluster *url* and *api_key*."""
    return QdrantClient(url=url, api_key=api_key)
32
+
33
# Extract text from PDF file
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page in *pdf_file*.

    Args:
        pdf_file: A binary file-like object (e.g. a Streamlit upload)
            readable by PyPDF2.

    Returns:
        str: All page texts concatenated, each followed by a newline.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    pages = []
    for page in pdf_reader.pages:
        # extract_text() can yield None/"" for image-only or unparseable
        # pages; guard so the concatenation never raises TypeError.
        pages.append((page.extract_text() or "") + "\n")
    # join() instead of repeated += avoids quadratic string building.
    return "".join(pages)
46
+
47
# Split text into smaller chunks (simple way)
def split_text_into_chunks(text, chunk_size=1000):
    """Greedily pack whitespace-separated words into chunks.

    Each chunk is closed as soon as its running size (word lengths plus one
    joining space per word) reaches *chunk_size* characters.

    Args:
        text: The raw document text.
        chunk_size: Approximate target size of each chunk, in characters.

    Returns:
        list[str]: The space-joined chunks, in document order.
    """
    chunks = []
    buffer = []
    size = 0
    for word in text.split():
        buffer.append(word)
        size += len(word) + 1  # +1 accounts for the joining space
        if size >= chunk_size:
            chunks.append(" ".join(buffer))
            buffer, size = [], 0
    # Flush whatever is left over after the loop.
    if buffer:
        chunks.append(" ".join(buffer))
    return chunks
78
+
79
# Get embeddings (vector numbers) from AI
def get_embeddings(bedrock_client, text):
    """Return the Titan embedding vector for *text* via AWS Bedrock.

    Args:
        bedrock_client: A Bedrock runtime client exposing ``invoke_model``.
        text: The string to embed.

    Returns:
        list[float]: The embedding returned by amazon.titan-embed-text-v1.
    """
    payload = json.dumps({"inputText": text})
    response = bedrock_client.invoke_model(
        modelId="amazon.titan-embed-text-v1",
        body=payload,
    )
    return json.loads(response['body'].read())['embedding']
100
+
101
# Store PDF chunks in QDrant vector database
def store_pdf_in_qdrant(qdrant_client, bedrock_client, pdf_chunks, collection_name):
    """Embed each text chunk and upsert it into *collection_name*.

    Args:
        qdrant_client: Connected QdrantClient.
        bedrock_client: Bedrock runtime client used for embeddings.
        pdf_chunks: List of text chunks to store.
        collection_name: Target QDrant collection.

    Returns:
        int: Number of points uploaded.
    """
    # Create collection if it doesn't exist.  Narrowed from a bare ``except:``
    # so KeyboardInterrupt/SystemExit still propagate; "already exists"
    # errors are the expected case being swallowed here.
    try:
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
        )
    except Exception:
        pass  # Collection might already exist

    # Build one point per chunk: embedding vector + original text payload.
    points = []
    for i, chunk in enumerate(pdf_chunks):
        embedding = get_embeddings(bedrock_client, chunk)
        points.append(models.PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={"text": chunk, "chunk_id": i}
        ))

    # Skip the network round-trip entirely when there is nothing to store.
    if points:
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points
        )
    return len(points)
156
+
157
# Search for relevant text in QDrant
def search_in_qdrant(qdrant_client, bedrock_client, question, collection_name, top_k=3):
    """Return the text payloads of the *top_k* chunks most similar to *question*.

    Args:
        qdrant_client: Connected QdrantClient.
        bedrock_client: Bedrock runtime client used to embed the question.
        question: Natural-language query.
        collection_name: QDrant collection to search.
        top_k: Maximum number of hits to return.

    Returns:
        list[str]: The ``"text"`` payload of each hit, best match first.
    """
    query_vector = get_embeddings(bedrock_client, question)
    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=top_k
    )
    return [hit.payload["text"] for hit in hits]
186
+
187
# Ask AI to answer question based on PDF content
def ask_ai_with_context(bedrock_client, question, relevant_texts):
    """Ask Claude 3 Haiku (via Bedrock) to answer *question* from the given chunks.

    Args:
        bedrock_client: Bedrock runtime client exposing ``invoke_model``.
        question: The user's question.
        relevant_texts: Text chunks retrieved from the vector store.

    Returns:
        str: The model's answer text.
    """
    context = "\n\n".join(relevant_texts)
    prompt = f"""
    Based on the following information from a PDF document, please answer the question.

    PDF Content:
    {context}

    Question: {question}

    Please provide a clear and helpful answer based only on the information provided above.
    If the answer is not in the provided content, please say so.
    """
    request = json.dumps({
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 500,
        "messages": [{"role": "user", "content": prompt}]
    })
    response = bedrock_client.invoke_model(
        modelId="anthropic.claude-3-haiku-20240307-v1:0",
        body=request,
    )
    result = json.loads(response['body'].read())
    return result['content'][0]['text']
230
+
231
# Main app
def main():
    """Render the Streamlit UI: sidebar connection settings plus two tabs
    (PDF upload/indexing and question answering)."""
    st.title("📄 Simple PDF Chatbot")
    st.write("Upload a PDF and ask questions about it!")

    # Sidebar for settings — the QDrant credentials gathered here gate both tabs.
    with st.sidebar:
        st.subheader("🔧 Setup")
        st.write("You need these to use the app:")

        # QDrant settings
        st.write("**QDrant Cloud Settings:**")
        qdrant_url = st.text_input("QDrant URL", placeholder="https://your-cluster.qdrant.io")
        qdrant_api_key = st.text_input("QDrant API Key", type="password")

        st.write("**Collection Name:**")
        collection_name = st.text_input("Collection Name", value="pdf_documents")

        st.markdown("---")
        st.markdown("""
        **How to get QDrant settings:**
        1. Go to qdrant.io
        2. Create free account
        3. Create a cluster
        4. Copy URL and API key
        """)

    # Main content
    tab1, tab2 = st.tabs(["📤 Upload PDF", "💬 Chat with PDF"])

    with tab1:
        st.subheader("Upload Your PDF")
        uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

        # Only allow processing once both the file and the credentials exist.
        if uploaded_file and qdrant_url and qdrant_api_key:
            if st.button("🚀 Process PDF"):
                try:
                    with st.spinner("Processing your PDF..."):
                        # Connect to services (fresh clients per click; no caching).
                        bedrock_client = connect_to_bedrock()
                        qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

                        # Extract text from PDF
                        st.write("📖 Extracting text from PDF...")
                        pdf_text = extract_text_from_pdf(uploaded_file)

                        # Split into chunks
                        st.write("✂️ Breaking text into smaller pieces...")
                        chunks = split_text_into_chunks(pdf_text)

                        # Store in QDrant
                        st.write("💾 Storing in vector database...")
                        num_chunks = store_pdf_in_qdrant(qdrant_client, bedrock_client, chunks, collection_name)

                        st.success(f"✅ PDF processed successfully! Stored {num_chunks} text chunks.")
                        st.balloons()
                except Exception as e:
                    st.error(f"❌ Error processing PDF: {str(e)}")
        elif uploaded_file:
            # File chosen but credentials missing — prompt for sidebar setup.
            st.warning("⚠️ Please enter QDrant settings in the sidebar first!")

    with tab2:
        st.subheader("Ask Questions About Your PDF")

        if qdrant_url and qdrant_api_key:
            question = st.text_input("💭 What would you like to know about your PDF?")

            if question:
                if st.button("🔍 Get Answer"):
                    try:
                        with st.spinner("Searching for answer..."):
                            # Connect to services
                            bedrock_client = connect_to_bedrock()
                            qdrant_client = connect_to_qdrant(qdrant_api_key, qdrant_url)

                            # Search for relevant content
                            st.write("🔍 Searching relevant content...")
                            relevant_texts = search_in_qdrant(qdrant_client, bedrock_client, question, collection_name)

                            # Get AI answer
                            st.write("🤖 Generating answer...")
                            answer = ask_ai_with_context(bedrock_client, question, relevant_texts)

                            # Show answer
                            st.subheader("📝 Answer:")
                            st.write(answer)

                            # Show sources (optional) — truncated to 200 chars each.
                            with st.expander("📚 Source content used"):
                                for i, text in enumerate(relevant_texts, 1):
                                    st.write(f"**Source {i}:**")
                                    st.write(text[:200] + "..." if len(text) > 200 else text)
                                    st.write("---")
                    except Exception as e:
                        st.error(f"❌ Error: {str(e)}")
        else:
            st.warning("⚠️ Please enter QDrant settings in the sidebar first!")
388
+
389
# Quick setup guide
def show_setup_guide():
    """Render a collapsible expander with install and configuration steps."""
    with st.expander("📖 Quick Setup Guide"):
        st.markdown("""
        **Step 1: Install Required Libraries**
        ```bash
        pip install streamlit boto3 qdrant-client PyPDF2
        ```
        **Step 2: Set up AWS**
        - Create AWS account
        - Run `aws configure` and enter your keys
        **Step 3: Set up QDrant Cloud**
        - Go to qdrant.io
        - Create free account
        - Create a cluster
        - Copy URL and API key to sidebar
        **Step 4: Run the App**
        ```bash
        streamlit run pdf_chatbot.py
        ```
        """)
430
+
431
# Run the app: the setup-guide expander renders above the main UI.
if __name__ == "__main__":
    show_setup_guide()
    main()
438
+