SlouchyBuffalo committed
Commit 4b398a2 · verified · 1 parent: dd2bb11

Create app.py

Files changed (1):
  1. app.py +423 -0
app.py ADDED
@@ -0,0 +1,423 @@
+ import gradio as gr
+ import spaces
+ import os
+ import logging
+ import datetime
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from huggingface_hub import InferenceClient, get_token
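+
+ # Overview: PDFs uploaded through the Gradio UI are stored on the persistent
+ # /data volume, split into chunks, embedded with a MiniLM sentence-transformer,
+ # indexed in a Chroma vector store, and used as retrieval context for a
+ # Cerebras-hosted Llama model queried through the Hugging Face InferenceClient.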
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Set HF_HOME for caching Hugging Face assets in persistent storage
+ os.environ["HF_HOME"] = "/data/.huggingface"
+ os.makedirs(os.environ["HF_HOME"], exist_ok=True)
+
+ # Define persistent storage directories
+ DATA_DIR = "/data"  # Root persistent storage directory
+ DOCS_DIR = os.path.join(DATA_DIR, "documents")  # Subdirectory for uploaded PDFs
+ CHROMA_DIR = os.path.join(DATA_DIR, "chroma_db")  # Subdirectory for Chroma vector store
+
+ # Create directories if they don't exist
+ os.makedirs(DOCS_DIR, exist_ok=True)
+ os.makedirs(CHROMA_DIR, exist_ok=True)
+
+ # Initialize Cerebras InferenceClient
+ try:
+     token = get_token()
+     if not token:
+         logger.error("HF_TOKEN is not set in Space secrets")
+         client = None
+     else:
+         client = InferenceClient(
+             model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
+             provider="cerebras",
+             token=token
+         )
+         logger.info("InferenceClient initialized successfully")
+ except Exception as e:
+     logger.error(f"Failed to initialize InferenceClient: {str(e)}")
+     client = None
+
+ # Global variables for vector store
+ vectorstore = None
+ retriever = None
+
+ def list_uploaded_documents():
+     """List all uploaded documents in the persistent storage"""
+     try:
+         if not os.path.exists(DOCS_DIR):
+             return []
+
+         files = os.listdir(DOCS_DIR)
+         pdf_files = [f for f in files if f.lower().endswith('.pdf')]
+         file_info = []
+
+         for file in pdf_files:
+             file_path = os.path.join(DOCS_DIR, file)
+             file_size = os.path.getsize(file_path)
+             file_time = os.path.getmtime(file_path)
+             file_info.append({
+                 "name": file,
+                 "size": f"{file_size // 1024} KB",
+                 "date": datetime.datetime.fromtimestamp(file_time).strftime('%Y-%m-%d %H:%M:%S')
+             })
+
+         return file_info
+     except Exception as e:
+         logger.error(f"Error listing documents: {str(e)}")
+         return []
+
+ def get_document_filenames():
+     """Get list of document filenames for dropdown"""
+     try:
+         if not os.path.exists(DOCS_DIR):
+             return []
+
+         files = os.listdir(DOCS_DIR)
+         pdf_files = [f for f in files if f.lower().endswith('.pdf')]
+         return pdf_files
+     except Exception as e:
+         logger.error(f"Error getting document filenames: {str(e)}")
+         return []
+
+ def delete_document(filename):
+     """Delete a document from persistent storage and update the vector store"""
+     try:
+         if not filename:
+             return "No file selected for deletion"
+
+         file_path = os.path.join(DOCS_DIR, filename)
+         if not os.path.exists(file_path):
+             return f"File {filename} does not exist"
+
+         # Delete the file
+         os.remove(file_path)
+         logger.info(f"Deleted file {filename}")
+
+         # Refresh the vector store
+         refresh_status = refresh_vector_store()
+
+         return f"File {filename} deleted successfully! {refresh_status}"
+     except Exception as e:
+         logger.error(f"Error deleting document: {str(e)}")
+         return f"Error deleting document: {str(e)}"
+
+ def preview_document(filename, max_pages=3):
+     """Generate a preview of the document's content"""
+     try:
+         if not filename:
+             return "No file selected for preview"
+
+         file_path = os.path.join(DOCS_DIR, filename)
+         if not os.path.exists(file_path):
+             return f"File {filename} does not exist"
+
+         loader = PyPDFLoader(file_path)
+         documents = loader.load()
+
+         # Limit preview to first few pages
+         preview_docs = documents[:max_pages]
+         preview_text = f"Preview of {filename} (first {len(preview_docs)} pages):\n\n"
+
+         for i, doc in enumerate(preview_docs):
+             preview_text += f"--- Page {i+1} ---\n{doc.page_content[:500]}...\n\n"
+
+         return preview_text
+     except Exception as e:
+         logger.error(f"Error previewing document: {str(e)}")
+         return f"Error previewing document: {str(e)}"
+
+ @spaces.GPU(duration=180)  # Use GPU for vector store recreation
+ def refresh_vector_store():
+     """Rebuild the vector store from all available documents"""
+     global vectorstore, retriever
+     try:
+         if not os.path.exists(DOCS_DIR):
+             logger.warning("Documents directory does not exist")
+             return "No documents directory found"
+
+         files = [f for f in os.listdir(DOCS_DIR) if f.lower().endswith('.pdf')]
+         if not files:
+             logger.warning("No PDF documents found")
+
+             # Clear the vector store
+             if os.path.exists(CHROMA_DIR):
+                 import shutil
+                 shutil.rmtree(CHROMA_DIR)
+                 os.makedirs(CHROMA_DIR, exist_ok=True)
+
+             vectorstore = None
+             retriever = None
+             return "No PDF documents found. Vector store cleared."
+
+         # Load and process all documents
+         all_texts = []
+         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+
+         for file in files:
+             file_path = os.path.join(DOCS_DIR, file)
+             try:
+                 loader = PyPDFLoader(file_path)
+                 documents = loader.load()
+                 texts = text_splitter.split_documents(documents)
+
+                 # Add source file metadata to each chunk
+                 for text in texts:
+                     text.metadata["source"] = file
+
+                 all_texts.extend(texts)
+                 logger.info(f"Processed {file}, added {len(texts)} chunks")
+             except Exception as e:
+                 logger.error(f"Error processing {file}: {str(e)}")
+
+         # Create new vector store
+         if all_texts:
+             # Remove existing vector store
+             if os.path.exists(CHROMA_DIR):
+                 import shutil
+                 shutil.rmtree(CHROMA_DIR)
+                 os.makedirs(CHROMA_DIR, exist_ok=True)
+
+             vectorstore = Chroma.from_documents(
+                 all_texts, embeddings, persist_directory=CHROMA_DIR
+             )
+             vectorstore.persist()
+             retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
+             logger.info(f"Vector store recreated with {len(all_texts)} chunks from {len(files)} files")
+             return f"Vector store updated with {len(files)} documents!"
+         else:
+             logger.warning("No text chunks extracted from documents")
+             return "No content could be extracted from the PDF files"
+     except Exception as e:
+         logger.error(f"Error refreshing vector store: {str(e)}")
+         return f"Error refreshing vector store: {str(e)}"
+
+ @spaces.GPU(duration=180)  # Use ZeroGPU (H200) for embedding generation, 180s timeout
+ def initialize_rag(file):
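+     """Save an uploaded PDF into persistent storage and rebuild the vector store."""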
+     global vectorstore, retriever
+     try:
+         # Validate the file before touching its attributes
+         if not file or not file.name:
+             logger.error("No file provided or invalid file name")
+             return "Error: No file provided or invalid file name"
+
+         # Debug file object properties
+         logger.info(f"File object: {type(file)}, Attributes: {dir(file)}")
+         logger.info(f"File name: {file.name}")
+
+         # Verify temporary file exists and is accessible
+         if not os.path.exists(file.name):
+             logger.error(f"Temporary file {file.name} does not exist")
+             return f"Error: Temporary file {file.name} does not exist"
+
+         # Check temporary file size
+         file_size = os.path.getsize(file.name)
+         logger.info(f"Temporary file size: {file_size} bytes")
+         if file_size == 0:
+             logger.error("Uploaded file is empty")
+             return "Error: Uploaded file is empty"
+
+         # Save uploaded file to persistent storage
+         file_name = os.path.basename(file.name)
+         file_path = os.path.join(DOCS_DIR, file_name)
+
+         # Check if file exists and its size
+         should_save = True
+         if os.path.exists(file_path):
+             existing_size = os.path.getsize(file_path)
+             logger.info(f"Existing file {file_name} size: {existing_size} bytes")
+             if existing_size == 0:
+                 logger.warning(f"Existing file {file_name} is empty, will overwrite")
+             else:
+                 logger.info(f"File {file_name} already exists and is not empty, skipping save")
+                 should_save = False
+
+         if should_save:
+             try:
+                 with open(file.name, "rb") as src_file:
+                     file_content = src_file.read()
+                 logger.info(f"Read {len(file_content)} bytes from temporary file")
+                 if not file_content:
+                     logger.error("File content is empty after reading")
+                     return "Error: File content is empty after reading"
+                 with open(file_path, "wb") as dst_file:
+                     dst_file.write(file_content)
+                     dst_file.flush()  # Ensure write completes
+                 # Verify written file
+                 written_size = os.path.getsize(file_path)
+                 logger.info(f"Saved {file_name} to {file_path}, size: {written_size} bytes")
+                 if written_size == 0:
+                     logger.error(f"Failed to write {file_name}, file is empty")
+                     return f"Error: Failed to write {file_name}, file is empty"
+             except PermissionError as e:
+                 logger.error(f"Permission error writing to {file_path}: {str(e)}")
+                 return f"Error: Permission denied writing to {file_path}"
+             except Exception as e:
+                 logger.error(f"Error writing file to {file_path}: {str(e)}")
+                 return f"Error writing file: {str(e)}"
+
+         # After saving the file, refresh the vector store
+         refresh_status = refresh_vector_store()
+         logger.info(f"Vector store refresh status: {refresh_status}")
+
+         return f"Document '{file_name}' processed and saved! {refresh_status}"
+     except Exception as e:
+         logger.error(f"Error processing document: {str(e)}")
+         return f"Error processing document: {str(e)}"
+
+ def query_documents(query, history, system_prompt, max_tokens, temperature):
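+     """Answer a query using context retrieved from the vector store and the Cerebras-hosted model."""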
+     global retriever, client
+     try:
+         if client is None:
+             logger.error("InferenceClient not initialized")
+             return history, "Error: InferenceClient not initialized. Check HF_TOKEN."
+         if retriever is None:
+             logger.error("No documents loaded")
+             return history, "Error: No documents loaded. Please upload a document first."
+
+         # Ensure history is a list of [user, assistant] lists
+         logger.info(f"History before processing: {history}")
+         if not isinstance(history, list):
+             logger.warning("History is not a list, resetting")
+             history = []
+         history = [[str(item[0]), str(item[1])] for item in history if isinstance(item, (list, tuple)) and len(item) == 2]
+
+         # Retrieve relevant documents
+         docs = retriever.get_relevant_documents(query)
+
+         # Format context with source information
+         context_parts = []
+         for doc in docs:
+             source = doc.metadata.get('source', 'unknown')
+             page = doc.metadata.get('page', 'unknown')
+             context_parts.append(f"[Source: {source}, Page: {page}]\n{doc.page_content}")
+
+         context = "\n\n".join(context_parts)
+
+         # Call Cerebras inference
+         logger.info("Calling Cerebras inference")
+         response = client.chat_completion(
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": f"Context: {context}\n\nQuery: {query}"}
+             ],
+             max_tokens=int(max_tokens),
+             temperature=float(temperature),
+             stream=False
+         )
+         answer = response.choices[0].message.content
+         logger.info("Inference successful")
+
+         # Update chat history with list format
+         history.append([query, answer])
+         logger.info(f"History after append: {history}")
+         return history, ""  # Clear the query input
+     except Exception as e:
+         logger.error(f"Error querying documents: {str(e)}")
+         return history, f"Error querying documents: {str(e)}"
+
+ # Load existing vector store on startup
+ try:
+     if os.path.exists(CHROMA_DIR):
+         logger.info("Loading existing vector store")
+         embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+         vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
+         retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
+         logger.info(f"Loaded vector store from {CHROMA_DIR}")
+ except Exception as e:
+     logger.error(f"Error loading vector store: {str(e)}")
+
+ # Create the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# RAG Chatbot with Document Management")
+
+     # File management tab
+     with gr.Tab("Document Management"):
+         # File upload
+         with gr.Row():
+             file_input = gr.File(label="Upload Document (PDF)", file_types=[".pdf"])
+             file_output = gr.Textbox(label="Upload Status")
+
+         # Document listing and management
+         with gr.Row():
+             refresh_btn = gr.Button("Refresh Document List")
+             rebuild_vs_btn = gr.Button("Rebuild Vector Store")
+
+         doc_list = gr.Dataframe(
+             headers=["name", "size", "date"],
+             label="Uploaded Documents"
+         )
+
+         # Initialize dropdown with existing files
+         initial_files = get_document_filenames()
+
+         with gr.Row():
+             selected_doc = gr.Dropdown(
+                 label="Select Document",
+                 choices=initial_files,
+                 allow_custom_value=True  # This helps avoid errors when dropdown is updated
+             )
+             preview_btn = gr.Button("Preview Document")
+             delete_btn = gr.Button("Delete Selected Document", variant="stop")
+
+         doc_preview = gr.Textbox(label="Document Preview", lines=10)
+         delete_output = gr.Textbox(label="Operation Status")
+
+     # Chat interface tab
+     with gr.Tab("Chat"):
+         chatbot = gr.Chatbot(label="Conversation")
+
+         # Query and parameters
+         with gr.Row():
+             query_input = gr.Textbox(label="Query", placeholder="Ask about the document...")
+             system_prompt = gr.Textbox(
+                 label="System Prompt",
+                 value="You are a helpful assistant answering questions based on the provided document context. Only use the context provided to answer the question. If you don't know the answer, say so."
+             )
+
+         with gr.Row():
+             max_tokens = gr.Slider(label="Max Tokens", minimum=50, maximum=2000, value=500, step=50)
+             temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7, step=0.1)
+
+         # Buttons
+         with gr.Row():
+             submit_btn = gr.Button("Send")
+             clear_btn = gr.Button("Clear Chat")
+
+     # Event handlers for file management
+     def update_doc_list():
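+         """Refresh the document table and the filename dropdown."""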
+         docs = list_uploaded_documents()
+         filenames = get_document_filenames()
+         return docs, gr.Dropdown(choices=filenames)
+
+     file_input.upload(initialize_rag, file_input, file_output).then(
+         update_doc_list, None, [doc_list, selected_doc]
+     )
+
+     refresh_btn.click(update_doc_list, None, [doc_list, selected_doc])
+     rebuild_vs_btn.click(refresh_vector_store, None, delete_output)
+     preview_btn.click(preview_document, selected_doc, doc_preview)
+     delete_btn.click(delete_document, selected_doc, delete_output).then(
+         update_doc_list, None, [doc_list, selected_doc]
+     )
+
+     # Event handlers for chat
+     submit_btn.click(
+         query_documents,
+         inputs=[query_input, chatbot, system_prompt, max_tokens, temperature],
+         outputs=[chatbot, query_input]
+     )
+
+     clear_btn.click(lambda: [], None, chatbot)
+
+     # Initialize document list on startup
+     demo.load(update_doc_list, None, [doc_list, selected_doc])
+
+ if __name__ == "__main__":
+     demo.launch()