SlouchyBuffalo commited on
Commit
ffbac0e
·
verified ·
1 Parent(s): f137b52

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +402 -0
app.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ import os
4
+ import logging
5
+ import datetime
6
+ from langchain_community.document_loaders import PyPDFLoader
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain_community.embeddings import HuggingFaceEmbeddings
9
+ from langchain_community.vectorstores import Chroma
10
+ from huggingface_hub import InferenceClient, get_token
11
+
12
# --- Logging -----------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- Persistent storage layout ----------------------------------------------
# Cache Hugging Face assets under the Space's persistent /data volume so
# model/embedding downloads survive restarts.
os.environ["HF_HOME"] = "/data/.huggingface"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)

DATA_DIR = "/data"                                # root persistent storage directory
DOCS_DIR = os.path.join(DATA_DIR, "documents")    # uploaded PDFs
CHROMA_DIR = os.path.join(DATA_DIR, "chroma_db")  # Chroma vector store

# Create the storage subdirectories up front.
os.makedirs(DOCS_DIR, exist_ok=True)
os.makedirs(CHROMA_DIR, exist_ok=True)
28
+
29
# --- Cerebras-backed inference client ----------------------------------------
# Requires HF_TOKEN in the Space secrets. On any failure `client` stays None,
# so the chat handler can surface a clear error instead of crashing at import.
client = None
try:
    token = get_token()
    if token:
        client = InferenceClient(
            model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
            provider="cerebras",
            token=token,
        )
        logger.info("InferenceClient initialized successfully")
    else:
        logger.error("HF_TOKEN is not set in Space secrets")
except Exception as e:
    logger.error(f"Failed to initialize InferenceClient: {str(e)}")
    client = None

# Module-level RAG state, (re)populated by refresh_vector_store().
vectorstore = None
retriever = None
49
+
50
def list_uploaded_documents():
    """Return metadata (name, size, modification date) for each stored PDF.

    Returns:
        A list of dicts with keys "name", "size" (KB string), and "date"
        ('%Y-%m-%d %H:%M:%S'); empty list when the directory is missing or
        on any error.
    """
    try:
        if not os.path.exists(DOCS_DIR):
            return []

        info = []
        for name in os.listdir(DOCS_DIR):
            if not name.lower().endswith('.pdf'):
                continue
            path = os.path.join(DOCS_DIR, name)
            modified = datetime.datetime.fromtimestamp(os.path.getmtime(path))
            info.append({
                "name": name,
                "size": f"{os.path.getsize(path) // 1024} KB",
                "date": modified.strftime('%Y-%m-%d %H:%M:%S'),
            })
        return info
    except Exception as e:
        logger.error(f"Error listing documents: {str(e)}")
        return []
74
+
75
def delete_document(filename):
    """Delete a PDF from persistent storage and rebuild the vector store.

    Args:
        filename: Bare file name inside DOCS_DIR (as shown in the UI dropdown).

    Returns:
        A human-readable status string (success or error description).
    """
    try:
        if not filename:
            return "No file selected for deletion"

        file_path = os.path.join(DOCS_DIR, filename)
        if not os.path.exists(file_path):
            # Bug fix: message previously read "File (unknown)" — the actual
            # file name was never interpolated into the f-string.
            return f"File '{filename}' does not exist"

        os.remove(file_path)
        logger.info(f"Deleted file '{filename}'")

        # Keep the retriever consistent with what is actually on disk.
        refresh_status = refresh_vector_store()

        return f"File '{filename}' deleted successfully! {refresh_status}"
    except Exception as e:
        logger.error(f"Error deleting document: {str(e)}")
        return f"Error deleting document: {str(e)}"
96
+
97
def preview_document(filename, max_pages=3):
    """Return a text preview of the first `max_pages` pages of a stored PDF.

    Args:
        filename: Bare file name inside DOCS_DIR.
        max_pages: Maximum number of pages to include in the preview.

    Returns:
        The preview text, or an error/status string.
    """
    try:
        if not filename:
            return "No file selected for preview"

        file_path = os.path.join(DOCS_DIR, filename)
        if not os.path.exists(file_path):
            # Bug fix: message previously read "File (unknown)" — the actual
            # file name was never interpolated into the f-string.
            return f"File '{filename}' does not exist"

        loader = PyPDFLoader(file_path)
        documents = loader.load()

        # Limit the preview to the first few pages, 500 chars per page.
        preview_docs = documents[:max_pages]
        preview_text = f"Preview of '{filename}' (first {len(preview_docs)} pages):\n\n"
        for i, doc in enumerate(preview_docs):
            preview_text += f"--- Page {i+1} ---\n{doc.page_content[:500]}...\n\n"

        return preview_text
    except Exception as e:
        logger.error(f"Error previewing document: {str(e)}")
        return f"Error previewing document: {str(e)}"
121
+
122
def _clear_chroma_dir():
    """Remove and recreate the on-disk Chroma directory (start from a clean index)."""
    import shutil
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(CHROMA_DIR, exist_ok=True)


@spaces.GPU(duration=180)  # Use GPU for vector store recreation
def refresh_vector_store():
    """Rebuild the Chroma vector store from every PDF in DOCS_DIR.

    Updates the module-level `vectorstore` and `retriever`. Files that fail
    to parse are skipped (logged) rather than aborting the whole rebuild.

    Returns:
        A human-readable status string.
    """
    global vectorstore, retriever
    try:
        if not os.path.exists(DOCS_DIR):
            logger.warning("Documents directory does not exist")
            return "No documents directory found"

        files = [f for f in os.listdir(DOCS_DIR) if f.lower().endswith('.pdf')]
        if not files:
            # Nothing on disk: clear the persisted index and in-memory state.
            # (Previously this clear/recreate logic was duplicated inline twice.)
            logger.warning("No PDF documents found")
            _clear_chroma_dir()
            vectorstore = None
            retriever = None
            return "No PDF documents found. Vector store cleared."

        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

        all_texts = []
        for file in files:
            file_path = os.path.join(DOCS_DIR, file)
            try:
                loader = PyPDFLoader(file_path)
                documents = loader.load()
                texts = text_splitter.split_documents(documents)

                # Tag each chunk with its originating file so answers can cite it.
                for text in texts:
                    text.metadata["source"] = file

                all_texts.extend(texts)
                logger.info(f"Processed {file}, added {len(texts)} chunks")
            except Exception as e:
                logger.error(f"Error processing {file}: {str(e)}")

        if all_texts:
            _clear_chroma_dir()  # drop the stale index before persisting the new one
            vectorstore = Chroma.from_documents(
                all_texts, embeddings, persist_directory=CHROMA_DIR
            )
            vectorstore.persist()
            retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
            logger.info(f"Vector store recreated with {len(all_texts)} chunks from {len(files)} files")
            return f"Vector store updated with {len(files)} documents!"
        else:
            logger.warning("No text chunks extracted from documents")
            return "No content could be extracted from the PDF files"
    except Exception as e:
        logger.error(f"Error refreshing vector store: {str(e)}")
        return f"Error refreshing vector store: {str(e)}"
187
+
188
def _should_save_upload(file_name, file_path):
    """Decide whether to (over)write: save unless a non-empty copy already exists."""
    if not os.path.exists(file_path):
        return True
    existing_size = os.path.getsize(file_path)
    logger.info(f"Existing file {file_name} size: {existing_size} bytes")
    if existing_size == 0:
        logger.warning(f"Existing file {file_name} is empty, will overwrite")
        return True
    logger.info(f"File {file_name} already exists and is not empty, skipping save")
    return False


def _copy_upload(src_path, file_name, file_path):
    """Copy the temp upload into persistent storage.

    Returns:
        None on success, otherwise an error string for the UI.
    """
    try:
        with open(src_path, "rb") as src_file:
            file_content = src_file.read()
        logger.info(f"Read {len(file_content)} bytes from temporary file")
        if not file_content:
            logger.error("File content is empty after reading")
            return "Error: File content is empty after reading"
        with open(file_path, "wb") as dst_file:
            dst_file.write(file_content)
            dst_file.flush()  # ensure the bytes reach persistent storage
        # Verify the written copy is non-empty before declaring success.
        written_size = os.path.getsize(file_path)
        logger.info(f"Saved {file_name} to {file_path}, size: {written_size} bytes")
        if written_size == 0:
            logger.error(f"Failed to write {file_name}, file is empty")
            return f"Error: Failed to write {file_name}, file is empty"
        return None
    except PermissionError as e:
        logger.error(f"Permission error writing to {file_path}: {str(e)}")
        return f"Error: Permission denied writing to {file_path}"
    except Exception as e:
        logger.error(f"Error writing file to {file_path}: {str(e)}")
        return f"Error writing file: {str(e)}"


@spaces.GPU(duration=180)  # Use ZeroGPU (H200) for embedding generation, 180s timeout
def initialize_rag(file):
    """Persist an uploaded PDF and rebuild the vector store.

    Args:
        file: Gradio file object whose `.name` is the path to the temp upload.

    Returns:
        Status string for the upload textbox.
    """
    global vectorstore, retriever
    try:
        # Bug fix: validate *before* dereferencing file.name — the original
        # logged file.name first, so a None upload raised AttributeError and
        # the friendly "no file provided" message was never returned.
        if not file or not file.name:
            logger.error("No file provided or invalid file name")
            return "Error: No file provided or invalid file name"

        # Debug info about the upload object.
        logger.info(f"File object: {type(file)}, Attributes: {dir(file)}")
        logger.info(f"File name: {file.name}")

        # Verify the temporary file exists and is non-empty.
        if not os.path.exists(file.name):
            logger.error(f"Temporary file {file.name} does not exist")
            return f"Error: Temporary file {file.name} does not exist"

        file_size = os.path.getsize(file.name)
        logger.info(f"Temporary file size: {file_size} bytes")
        if file_size == 0:
            logger.error("Uploaded file is empty")
            return "Error: Uploaded file is empty"

        # Save the upload into persistent storage (skipping identical non-empty copies).
        file_name = os.path.basename(file.name)
        file_path = os.path.join(DOCS_DIR, file_name)

        if _should_save_upload(file_name, file_path):
            error = _copy_upload(file.name, file_name, file_path)
            if error:
                return error

        # Rebuild the index so the new document is immediately searchable.
        refresh_status = refresh_vector_store()
        logger.info(f"Vector store refresh status: {refresh_status}")

        return f"Document '{file_name}' processed and saved! {refresh_status}"
    except Exception as e:
        logger.error(f"Error processing document: {str(e)}")
        return f"Error processing document: {str(e)}"
260
+
261
def query_documents(query, history, system_prompt, max_tokens, temperature):
    """Answer `query` against the indexed documents via Cerebras chat completion.

    Args:
        query: The user's question.
        history: Chat history as [user, assistant] pairs (sanitized here).
        system_prompt: System message for the model.
        max_tokens / temperature: Generation parameters from the UI sliders.

    Returns:
        (updated_history, textbox_value) — the second value clears the query
        box on success or carries an error message otherwise.
    """
    global retriever, client
    try:
        if client is None:
            logger.error("InferenceClient not initialized")
            return history, "Error: InferenceClient not initialized. Check HF_TOKEN."
        if retriever is None:
            logger.error("No documents loaded")
            return history, "Error: No documents loaded. Please upload a document first."

        # Normalize history into a list of [user, assistant] string pairs.
        logger.info(f"History before processing: {history}")
        if not isinstance(history, list):
            logger.warning("History is not a list, resetting")
            history = []
        history = [
            [str(pair[0]), str(pair[1])]
            for pair in history
            if isinstance(pair, (list, tuple)) and len(pair) == 2
        ]

        # Build a context string citing source file and page for each chunk.
        docs = retriever.get_relevant_documents(query)
        context = "\n\n".join(
            f"[Source: {d.metadata.get('source', 'unknown')}, "
            f"Page: {d.metadata.get('page', 'unknown')}]\n{d.page_content}"
            for d in docs
        )

        logger.info("Calling Cerebras inference")
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Context: {context}\n\nQuery: {query}"},
            ],
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            stream=False,
        )
        answer = response.choices[0].message.content
        logger.info("Inference successful")

        history.append([query, answer])
        logger.info(f"History after append: {history}")
        return history, ""  # empty string clears the query input
    except Exception as e:
        logger.error(f"Error querying documents: {str(e)}")
        return history, f"Error querying documents: {str(e)}"
311
+
312
# Load existing vector store on startup so previously uploaded documents
# remain searchable across Space restarts without re-uploading.
try:
    if os.path.exists(CHROMA_DIR):
        logger.info("Loading existing vector store")
        # NOTE: the embedding model here must match the one used when the
        # store was built (see refresh_vector_store) or retrieval degrades.
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
        logger.info(f"Loaded vector store from {CHROMA_DIR}")
except Exception as e:
    # Best-effort: a failed load just leaves vectorstore/retriever as None.
    logger.error(f"Error loading vector store: {str(e)}")
322
+
323
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# RAG Chatbot with Document Management")

    # File management tab
    with gr.Tab("Document Management"):
        with gr.Row():
            file_input = gr.File(label="Upload Document (PDF)", file_types=[".pdf"])
            file_output = gr.Textbox(label="Upload Status")

        with gr.Row():
            refresh_btn = gr.Button("Refresh Document List")
            rebuild_vs_btn = gr.Button("Rebuild Vector Store")

        doc_list = gr.Dataframe(
            headers=["name", "size", "date"],
            label="Uploaded Documents"
        )

        with gr.Row():
            selected_doc = gr.Dropdown(label="Select Document")
            preview_btn = gr.Button("Preview Document")
            delete_btn = gr.Button("Delete Selected Document", variant="stop")

        doc_preview = gr.Textbox(label="Document Preview", lines=10)
        delete_output = gr.Textbox(label="Operation Status")

    # Chat interface tab
    with gr.Tab("Chat"):
        chatbot = gr.Chatbot(label="Conversation")

        with gr.Row():
            query_input = gr.Textbox(label="Query", placeholder="Ask about the document...")
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful assistant answering questions based on the provided document context. Only use the context provided to answer the question. If you don't know the answer, say so."
            )

        with gr.Row():
            max_tokens = gr.Slider(label="Max Tokens", minimum=50, maximum=2000, value=500, step=50)
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7, step=0.1)

        with gr.Row():
            submit_btn = gr.Button("Send")
            clear_btn = gr.Button("Clear Chat")

    def update_doc_list():
        """Refresh the document table and the deletion dropdown.

        Bug fixes: gr.Dataframe expects row lists (the original returned a
        list of dicts), and a single-select gr.Dropdown's choices must be
        updated via gr.update(choices=...) — returning a plain name list
        only sets its *value*. (gradio version not visible here — TODO
        confirm gr.update vs gr.Dropdown(choices=...) for this Space.)
        """
        docs = list_uploaded_documents()
        rows = [[d["name"], d["size"], d["date"]] for d in docs]
        names = [d["name"] for d in docs]
        return rows, gr.update(choices=names, value=None)

    # Event handlers for file management
    file_input.upload(initialize_rag, file_input, file_output).then(
        update_doc_list, None, [doc_list, selected_doc]
    )

    refresh_btn.click(update_doc_list, None, [doc_list, selected_doc])
    rebuild_vs_btn.click(refresh_vector_store, None, delete_output)
    preview_btn.click(preview_document, selected_doc, doc_preview)
    delete_btn.click(delete_document, selected_doc, delete_output).then(
        update_doc_list, None, [doc_list, selected_doc]
    )

    # Event handlers for chat
    submit_btn.click(
        query_documents,
        inputs=[query_input, chatbot, system_prompt, max_tokens, temperature],
        outputs=[chatbot, query_input]
    )

    clear_btn.click(lambda: [], None, chatbot)

    # Populate the document list when the app first loads.
    demo.load(update_doc_list, None, [doc_list, selected_doc])

if __name__ == "__main__":
    demo.launch()