SlouchyBuffalo committed on
Commit
dc6bbf5
·
verified ·
1 Parent(s): 4b398a2

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -423
app.py DELETED
@@ -1,423 +0,0 @@
1
import datetime
import logging
import os

# HF_HOME has to be exported before anything imports huggingface_hub (gradio,
# spaces and the langchain integrations all do): the hub library resolves its
# cache locations from the environment when it is first imported, so setting
# the variable afterwards leaves the cache outside persistent storage.
os.environ["HF_HOME"] = "/data/.huggingface"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)

import gradio as gr  # noqa: E402
import spaces  # noqa: E402
from langchain_community.document_loaders import PyPDFLoader  # noqa: E402
from langchain.text_splitter import RecursiveCharacterTextSplitter  # noqa: E402
from langchain_community.embeddings import HuggingFaceEmbeddings  # noqa: E402
from langchain_community.vectorstores import Chroma  # noqa: E402
from huggingface_hub import InferenceClient, get_token  # noqa: E402

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define persistent storage directories
DATA_DIR = "/data"  # Root persistent storage directory
DOCS_DIR = os.path.join(DATA_DIR, "documents")  # Subdirectory for uploaded PDFs
CHROMA_DIR = os.path.join(DATA_DIR, "chroma_db")  # Subdirectory for Chroma vector store

# Create directories if they don't exist
os.makedirs(DOCS_DIR, exist_ok=True)
os.makedirs(CHROMA_DIR, exist_ok=True)
28
-
29
# Initialize Cerebras InferenceClient.
# `client` stays None whenever no token is available or construction fails;
# query_documents checks for that before calling the model.
client = None
try:
    token = get_token()
    if token:
        client = InferenceClient(
            model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
            provider="cerebras",
            token=token,
        )
        logger.info("InferenceClient initialized successfully")
    else:
        logger.error("HF_TOKEN is not set in Space secrets")
except Exception as e:
    logger.error(f"Failed to initialize InferenceClient: {str(e)}")
    client = None

# Global variables for vector store (populated at startup or on refresh)
vectorstore = retriever = None
49
-
50
def list_uploaded_documents():
    """Describe every PDF currently stored in the documents directory.

    Returns:
        A list of dicts with the keys "name" (file name), "size" (whole
        kilobytes, formatted as text) and "date" (modification time as
        ``YYYY-MM-DD HH:MM:SS``). An empty list is returned when the
        directory is missing or when anything goes wrong; failures are
        logged rather than raised.
    """
    try:
        if not os.path.exists(DOCS_DIR):
            return []

        def describe(pdf_name):
            # Build the metadata record for a single stored PDF.
            full_path = os.path.join(DOCS_DIR, pdf_name)
            size_bytes = os.path.getsize(full_path)
            modified = datetime.datetime.fromtimestamp(os.path.getmtime(full_path))
            return {
                "name": pdf_name,
                "size": f"{size_bytes // 1024} KB",
                "date": modified.strftime('%Y-%m-%d %H:%M:%S'),
            }

        return [
            describe(entry)
            for entry in os.listdir(DOCS_DIR)
            if entry.lower().endswith('.pdf')
        ]
    except Exception as e:
        logger.error(f"Error listing documents: {str(e)}")
        return []
74
-
75
def get_document_filenames():
    """Return the names of all stored PDFs, for use as dropdown choices.

    Returns:
        File names in DOCS_DIR whose extension is ``.pdf`` (compared
        case-insensitively). An empty list is returned when the directory is
        missing or listing it fails; failures are logged rather than raised.
    """
    try:
        if os.path.exists(DOCS_DIR):
            return [
                entry
                for entry in os.listdir(DOCS_DIR)
                if entry.lower().endswith('.pdf')
            ]
        return []
    except Exception as e:
        logger.error(f"Error getting document filenames: {str(e)}")
        return []
87
-
88
def delete_document(filename):
    """Delete a stored document and rebuild the vector store.

    Args:
        filename: Bare file name of a document inside DOCS_DIR. Names that
            contain any directory component are rejected.

    Returns:
        A status message describing the outcome. Errors are reported as
        text rather than raised.
    """
    try:
        if not filename:
            return "No file selected for deletion"

        # The value comes from a dropdown that accepts custom text, so it is
        # untrusted: without this check "../x" or an absolute path would
        # escape DOCS_DIR and delete an arbitrary file.
        if os.path.basename(filename) != filename or filename in (".", ".."):
            return f"Invalid file name: {filename}"

        file_path = os.path.join(DOCS_DIR, filename)
        if not os.path.isfile(file_path):
            return f"File {filename} does not exist"

        # Delete the file
        os.remove(file_path)
        logger.info(f"Deleted file {filename}")

        # Refresh the vector store so the removed document is no longer indexed
        refresh_status = refresh_vector_store()

        return f"File {filename} deleted successfully! {refresh_status}"
    except Exception as e:
        logger.error(f"Error deleting document: {str(e)}")
        return f"Error deleting document: {str(e)}"
109
-
110
def preview_document(filename, max_pages=3):
    """Return a plain-text preview of the first pages of a stored PDF.

    Args:
        filename: Bare file name of a document inside DOCS_DIR. Names that
            contain any directory component are rejected.
        max_pages: Maximum number of leading pages to include.

    Returns:
        The preview text (at most 500 characters per page), or a status
        message when nothing can be previewed. Errors are reported as text
        rather than raised.
    """
    try:
        if not filename:
            return "No file selected for preview"

        # The value comes from a dropdown that accepts custom text, so it is
        # untrusted: without this check a crafted path could make the app
        # open files outside DOCS_DIR.
        if os.path.basename(filename) != filename or filename in (".", ".."):
            return f"Invalid file name: {filename}"

        file_path = os.path.join(DOCS_DIR, filename)
        if not os.path.exists(file_path):
            return f"File {filename} does not exist"

        documents = PyPDFLoader(file_path).load()

        # Limit preview to first few pages
        preview_docs = documents[:max_pages]
        sections = [f"Preview of {filename} (first {len(preview_docs)} pages):\n\n"]
        for page_number, doc in enumerate(preview_docs, start=1):
            sections.append(
                f"--- Page {page_number} ---\n{doc.page_content[:500]}...\n\n"
            )

        return "".join(sections)
    except Exception as e:
        logger.error(f"Error previewing document: {str(e)}")
        return f"Error previewing document: {str(e)}"
134
-
135
def _reset_chroma_dir():
    """Delete the on-disk Chroma store and leave an empty directory behind."""
    import shutil  # local import: this module has no top-level shutil import

    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(CHROMA_DIR, exist_ok=True)


@spaces.GPU(duration=180)  # Use GPU for vector store recreation
def refresh_vector_store():
    """Rebuild the vector store from all available documents.

    Every PDF in DOCS_DIR is loaded, split into overlapping chunks and
    embedded into a fresh Chroma store persisted under CHROMA_DIR. A PDF that
    fails to load is logged and skipped. When there is nothing to index the
    store is cleared, so queries can never be answered from documents that
    no longer exist.

    Returns:
        A status message describing the outcome. Errors are reported as
        text rather than raised.
    """
    # NOTE(review): if the GPU decorator runs this function in a separate
    # worker process, these global assignments would not reach the process
    # serving queries -- confirm against the `spaces` package behaviour.
    global vectorstore, retriever
    try:
        if not os.path.exists(DOCS_DIR):
            logger.warning("Documents directory does not exist")
            return "No documents directory found"

        # Sorted so the indexing order does not depend on directory order.
        files = sorted(f for f in os.listdir(DOCS_DIR) if f.lower().endswith('.pdf'))
        if not files:
            logger.warning("No PDF documents found")
            _reset_chroma_dir()
            vectorstore = None
            retriever = None
            return "No PDF documents found. Vector store cleared."

        # Load and process all documents
        all_texts = []
        indexed_files = 0
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

        for file in files:
            file_path = os.path.join(DOCS_DIR, file)
            try:
                documents = PyPDFLoader(file_path).load()
                texts = text_splitter.split_documents(documents)

                # Record the bare file name as the source of each chunk
                for text in texts:
                    text.metadata["source"] = file

                all_texts.extend(texts)
                indexed_files += 1
                logger.info(f"Processed {file}, added {len(texts)} chunks")
            except Exception as e:
                # One unreadable PDF must not prevent indexing the others.
                logger.error(f"Error processing {file}: {str(e)}")

        if not all_texts:
            logger.warning("No text chunks extracted from documents")
            # Drop the previous index too; otherwise it would keep serving
            # chunks from documents that may have been deleted.
            _reset_chroma_dir()
            vectorstore = None
            retriever = None
            return "No content could be extracted from the PDF files"

        # Create new vector store in place of the existing one
        _reset_chroma_dir()
        vectorstore = Chroma.from_documents(
            all_texts, embeddings, persist_directory=CHROMA_DIR
        )
        vectorstore.persist()
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
        logger.info(f"Vector store recreated with {len(all_texts)} chunks from {indexed_files} files")
        return f"Vector store updated with {indexed_files} documents!"
    except Exception as e:
        logger.error(f"Error refreshing vector store: {str(e)}")
        return f"Error refreshing vector store: {str(e)}"
200
-
201
@spaces.GPU(duration=180)  # Use ZeroGPU (H200) for embedding generation, 180s timeout
def initialize_rag(file):
    """Store an uploaded PDF in persistent storage and rebuild the vector store.

    Args:
        file: The upload handed over by the Gradio File component: either a
            path string or an object whose ``name`` attribute is the path of
            the temporary file.

    Returns:
        A status message describing the outcome. Errors are reported as
        text rather than raised.
    """
    try:
        # Validate before touching any attribute, so a missing upload yields
        # the intended message instead of an AttributeError.
        src_path = file if isinstance(file, str) else getattr(file, "name", None)
        if not src_path:
            logger.error("No file provided or invalid file name")
            return "Error: No file provided or invalid file name"
        logger.info(f"File object: {type(file)}, path: {src_path}")

        # Verify temporary file exists and is accessible
        if not os.path.exists(src_path):
            logger.error(f"Temporary file {src_path} does not exist")
            return f"Error: Temporary file {src_path} does not exist"

        # Check temporary file size
        file_size = os.path.getsize(src_path)
        logger.info(f"Temporary file size: {file_size} bytes")
        if file_size == 0:
            logger.error("Uploaded file is empty")
            return "Error: Uploaded file is empty"

        # Only the base name is used, so the stored file always lands
        # directly inside DOCS_DIR.
        file_name = os.path.basename(src_path)
        if not file_name.lower().endswith(".pdf"):
            # Non-PDF files would be stored but never listed or indexed.
            logger.error(f"Rejected non-PDF upload {file_name}")
            return "Error: Only PDF files are supported"
        file_path = os.path.join(DOCS_DIR, file_name)

        # Skip the copy when a non-empty file of that name is already stored
        should_save = True
        if os.path.exists(file_path):
            existing_size = os.path.getsize(file_path)
            logger.info(f"Existing file {file_name} size: {existing_size} bytes")
            if existing_size == 0:
                logger.warning(f"Existing file {file_name} is empty, will overwrite")
            else:
                logger.info(f"File {file_name} already exists and is not empty, skipping save")
                should_save = False

        if should_save:
            import shutil  # local import: this module has no top-level shutil import

            try:
                # copyfile streams the data instead of loading the whole
                # upload into memory.
                shutil.copyfile(src_path, file_path)
                # Verify written file
                written_size = os.path.getsize(file_path)
                logger.info(f"Saved {file_name} to {file_path}, size: {written_size} bytes")
                if written_size == 0:
                    logger.error(f"Failed to write {file_name}, file is empty")
                    return f"Error: Failed to write {file_name}, file is empty"
            except PermissionError as e:
                logger.error(f"Permission error writing to {file_path}: {str(e)}")
                return f"Error: Permission denied writing to {file_path}"
            except Exception as e:
                logger.error(f"Error writing file to {file_path}: {str(e)}")
                return f"Error writing file: {str(e)}"

        # After saving the file, refresh the vector store
        refresh_status = refresh_vector_store()
        logger.info(f"Vector store refresh status: {refresh_status}")

        return f"Document '{file_name}' processed and saved! {refresh_status}"
    except Exception as e:
        logger.error(f"Error processing document: {str(e)}")
        return f"Error processing document: {str(e)}"
273
-
274
def query_documents(query, history, system_prompt, max_tokens, temperature):
    """Answer a question from the indexed documents and extend the chat history.

    Args:
        query: The user's question.
        history: Chat history as a list of ``[user, assistant]`` pairs;
            anything else is discarded.
        system_prompt: System message sent to the model.
        max_tokens: Upper bound on generated tokens (converted with ``int``).
        temperature: Sampling temperature (converted with ``float``).

    Returns:
        A ``(history, query_box_text)`` tuple. On success the history holds
        the new exchange and the text is empty, which clears the query box;
        on failure the history is returned as received so far and the text
        carries the error message.
    """
    try:
        if client is None:
            logger.error("InferenceClient not initialized")
            return history, "Error: InferenceClient not initialized. Check HF_TOKEN."
        if retriever is None:
            logger.error("No documents loaded")
            return history, "Error: No documents loaded. Please upload a document first."

        # Ensure history is a list of [user, assistant] lists
        logger.info(f"History before processing: {history}")
        if not isinstance(history, list):
            logger.warning("History is not a list, resetting")
            history = []

        def as_text(value):
            # Keep an absent message absent instead of rendering it as "None".
            return None if value is None else str(value)

        history = [
            [as_text(item[0]), as_text(item[1])]
            for item in history
            if isinstance(item, (list, tuple)) and len(item) == 2
        ]

        # A blank question has nothing to retrieve or answer.
        if not query or not str(query).strip():
            return history, ""

        # Retrieve relevant documents (invoke supersedes the deprecated
        # get_relevant_documents)
        docs = retriever.invoke(query)

        # Format context with source information
        context_parts = []
        for doc in docs:
            source = doc.metadata.get('source', 'unknown')
            page = doc.metadata.get('page', 'unknown')
            context_parts.append(f"[Source: {source}, Page: {page}]\n{doc.page_content}")

        context = "\n\n".join(context_parts)

        # Call Cerebras inference
        logger.info("Calling Cerebras inference")
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Context: {context}\n\nQuery: {query}"}
            ],
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            stream=False
        )
        answer = response.choices[0].message.content
        logger.info("Inference successful")

        # Update chat history with list format
        history.append([query, answer])
        logger.info(f"History after append: {history}")
        return history, ""  # Clear the query input
    except Exception as e:
        logger.error(f"Error querying documents: {str(e)}")
        return history, f"Error querying documents: {str(e)}"
324
-
325
# Load existing vector store on startup
try:
    # CHROMA_DIR is created unconditionally earlier in this module, so its
    # existence alone says nothing. Only load when something has been
    # persisted there; otherwise `retriever` stays None and query_documents
    # can report that no documents are loaded.
    if os.path.isdir(CHROMA_DIR) and os.listdir(CHROMA_DIR):
        logger.info("Loading existing vector store")
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
        logger.info(f"Loaded vector store from {CHROMA_DIR}")
except Exception as e:
    logger.error(f"Error loading vector store: {str(e)}")
335
-
336
# Create the Gradio interface: one tab for managing stored PDFs and one tab
# for chatting against the indexed documents.
with gr.Blocks() as demo:
    gr.Markdown("# RAG Chatbot with Document Management")

    # File management tab
    with gr.Tab("Document Management"):
        # File upload
        with gr.Row():
            file_input = gr.File(label="Upload Document (PDF)", file_types=[".pdf"])
            file_output = gr.Textbox(label="Upload Status")

        # Document listing and management
        with gr.Row():
            refresh_btn = gr.Button("Refresh Document List")
            rebuild_vs_btn = gr.Button("Rebuild Vector Store")

        doc_list = gr.Dataframe(
            headers=["name", "size", "date"],
            label="Uploaded Documents"
        )

        # Initialize dropdown with existing files
        initial_files = get_document_filenames()

        with gr.Row():
            selected_doc = gr.Dropdown(
                label="Select Document",
                choices=initial_files,
                allow_custom_value=True  # This helps avoid errors when dropdown is updated
            )
            preview_btn = gr.Button("Preview Document")
            delete_btn = gr.Button("Delete Selected Document", variant="stop")

        doc_preview = gr.Textbox(label="Document Preview", lines=10)
        delete_output = gr.Textbox(label="Operation Status")

    # Chat interface tab
    with gr.Tab("Chat"):
        chatbot = gr.Chatbot(label="Conversation")

        # Query and parameters
        with gr.Row():
            query_input = gr.Textbox(label="Query", placeholder="Ask about the document...")
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful assistant answering questions based on the provided document context. Only use the context provided to answer the question. If you don't know the answer, say so."
            )

        with gr.Row():
            max_tokens = gr.Slider(label="Max Tokens", minimum=50, maximum=2000, value=500, step=50)
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7, step=0.1)

        # Buttons
        with gr.Row():
            submit_btn = gr.Button("Send")
            clear_btn = gr.Button("Clear Chat")

    # Event handlers for file management
    def update_doc_list():
        """Return the table rows and a refreshed dropdown for the stored PDFs."""
        # The Dataframe component takes rows (list of lists), not the list of
        # dicts produced by list_uploaded_documents, so flatten each record
        # in header order.
        rows = [
            [doc["name"], doc["size"], doc["date"]]
            for doc in list_uploaded_documents()
        ]
        filenames = get_document_filenames()
        return rows, gr.Dropdown(choices=filenames)

    file_input.upload(initialize_rag, file_input, file_output).then(
        update_doc_list, None, [doc_list, selected_doc]
    )

    refresh_btn.click(update_doc_list, None, [doc_list, selected_doc])
    rebuild_vs_btn.click(refresh_vector_store, None, delete_output)
    preview_btn.click(preview_document, selected_doc, doc_preview)
    delete_btn.click(delete_document, selected_doc, delete_output).then(
        update_doc_list, None, [doc_list, selected_doc]
    )

    # Event handlers for chat
    submit_btn.click(
        query_documents,
        inputs=[query_input, chatbot, system_prompt, max_tokens, temperature],
        outputs=[chatbot, query_input]
    )

    clear_btn.click(lambda: [], None, chatbot)

    # Initialize document list on startup
    demo.load(update_doc_list, None, [doc_list, selected_doc])

if __name__ == "__main__":
    demo.launch()