SlouchyBuffalo committed
Commit dd2bb11 · verified · 1 Parent(s): ffbac0e

Delete app.py

Files changed (1)
app.py +0 -402
app.py DELETED
@@ -1,402 +0,0 @@
import gradio as gr
import spaces
import os
import shutil  # used when clearing the on-disk Chroma store during rebuilds
import logging
import datetime
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from huggingface_hub import InferenceClient, get_token

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Set HF_HOME for caching Hugging Face assets in persistent storage
os.environ["HF_HOME"] = "/data/.huggingface"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)

# Define persistent storage directories
DATA_DIR = "/data"  # Root persistent storage directory
DOCS_DIR = os.path.join(DATA_DIR, "documents")  # Subdirectory for uploaded PDFs
CHROMA_DIR = os.path.join(DATA_DIR, "chroma_db")  # Subdirectory for the Chroma vector store

# Create directories if they don't exist
os.makedirs(DOCS_DIR, exist_ok=True)
os.makedirs(CHROMA_DIR, exist_ok=True)

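# On Hugging Face Spaces, /data is the mount point of the (opt-in) persistent
# storage volume; anything written elsewhere is lost when the Space restarts,
# which is why both the PDFs and the Chroma store live under it.
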
# Initialize the Cerebras-backed InferenceClient
try:
    token = get_token()
    if not token:
        logger.error("HF_TOKEN is not set in Space secrets")
        client = None
    else:
        client = InferenceClient(
            model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
            provider="cerebras",
            token=token
        )
        logger.info("InferenceClient initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize InferenceClient: {str(e)}")
    client = None

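# Note: get_token() resolves the token from the HF_TOKEN environment variable
# (or a cached login), so the Space secret must be named HF_TOKEN for the
# check above to pass.
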
# Global variables for the vector store
vectorstore = None
retriever = None

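# Both globals are populated by refresh_vector_store() after uploads and
# deletes, and by the startup loader further down if a persisted store exists.
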
def list_uploaded_documents():
    """List all uploaded documents in the persistent storage"""
    try:
        if not os.path.exists(DOCS_DIR):
            return []

        files = os.listdir(DOCS_DIR)
        pdf_files = [f for f in files if f.lower().endswith('.pdf')]
        file_info = []

        for file in pdf_files:
            file_path = os.path.join(DOCS_DIR, file)
            file_size = os.path.getsize(file_path)
            file_time = os.path.getmtime(file_path)
            file_info.append({
                "name": file,
                "size": f"{file_size // 1024} KB",
                "date": datetime.datetime.fromtimestamp(file_time).strftime('%Y-%m-%d %H:%M:%S')
            })

        return file_info
    except Exception as e:
        logger.error(f"Error listing documents: {str(e)}")
        return []

def delete_document(filename):
    """Delete a document from persistent storage and update the vector store"""
    try:
        if not filename:
            return "No file selected for deletion"

        file_path = os.path.join(DOCS_DIR, filename)
        if not os.path.exists(file_path):
            return f"File {filename} does not exist"

        # Delete the file
        os.remove(file_path)
        logger.info(f"Deleted file {filename}")

        # Refresh the vector store
        refresh_status = refresh_vector_store()

        return f"File {filename} deleted successfully! {refresh_status}"
    except Exception as e:
        logger.error(f"Error deleting document: {str(e)}")
        return f"Error deleting document: {str(e)}"

def preview_document(filename, max_pages=3):
    """Generate a preview of the document's content"""
    try:
        if not filename:
            return "No file selected for preview"

        file_path = os.path.join(DOCS_DIR, filename)
        if not os.path.exists(file_path):
            return f"File {filename} does not exist"

        loader = PyPDFLoader(file_path)
        documents = loader.load()

        # Limit the preview to the first few pages
        preview_docs = documents[:max_pages]
        preview_text = f"Preview of {filename} (first {len(preview_docs)} pages):\n\n"

        for i, doc in enumerate(preview_docs):
            preview_text += f"--- Page {i+1} ---\n{doc.page_content[:500]}...\n\n"

        return preview_text
    except Exception as e:
        logger.error(f"Error previewing document: {str(e)}")
        return f"Error previewing document: {str(e)}"

@spaces.GPU(duration=180)  # Use GPU for vector store recreation
def refresh_vector_store():
    """Rebuild the vector store from all available documents"""
    global vectorstore, retriever
    try:
        if not os.path.exists(DOCS_DIR):
            logger.warning("Documents directory does not exist")
            return "No documents directory found"

        files = [f for f in os.listdir(DOCS_DIR) if f.lower().endswith('.pdf')]
        if not files:
            logger.warning("No PDF documents found")

            # Clear the vector store
            if os.path.exists(CHROMA_DIR):
                shutil.rmtree(CHROMA_DIR)
                os.makedirs(CHROMA_DIR, exist_ok=True)

            vectorstore = None
            retriever = None
            return "No PDF documents found. Vector store cleared."

        # Load and process all documents
        all_texts = []
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

        for file in files:
            file_path = os.path.join(DOCS_DIR, file)
            try:
                loader = PyPDFLoader(file_path)
                documents = loader.load()
                texts = text_splitter.split_documents(documents)

                # Add source file metadata to each chunk
                for text in texts:
                    text.metadata["source"] = file

                all_texts.extend(texts)
                logger.info(f"Processed {file}, added {len(texts)} chunks")
            except Exception as e:
                logger.error(f"Error processing {file}: {str(e)}")

        # Create a new vector store
        if all_texts:
            # Remove the existing vector store
            if os.path.exists(CHROMA_DIR):
                shutil.rmtree(CHROMA_DIR)
                os.makedirs(CHROMA_DIR, exist_ok=True)

            vectorstore = Chroma.from_documents(
                all_texts, embeddings, persist_directory=CHROMA_DIR
            )
            vectorstore.persist()
            retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
            logger.info(f"Vector store recreated with {len(all_texts)} chunks from {len(files)} files")
            return f"Vector store updated with {len(files)} documents!"
        else:
            logger.warning("No text chunks extracted from documents")
            return "No content could be extracted from the PDF files"
    except Exception as e:
        logger.error(f"Error refreshing vector store: {str(e)}")
        return f"Error refreshing vector store: {str(e)}"

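# Design note: every change triggers a full rebuild (rmtree + from_documents)
# rather than an incremental update of the Chroma collection. That keeps the
# on-disk store consistent with DOCS_DIR and stays cheap for small corpora.
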
@spaces.GPU(duration=180)  # Use ZeroGPU (H200) for embedding generation, 180s timeout
def initialize_rag(file):
    global vectorstore, retriever
    try:
        # Validate the file before touching its attributes
        if not file or not getattr(file, "name", None):
            logger.error("No file provided or invalid file name")
            return "Error: No file provided or invalid file name"

        # Debug file object properties
        logger.info(f"File object: {type(file)}, Attributes: {dir(file)}")
        logger.info(f"File name: {file.name}")

        # Verify the temporary file exists and is accessible
        if not os.path.exists(file.name):
            logger.error(f"Temporary file {file.name} does not exist")
            return f"Error: Temporary file {file.name} does not exist"

        # Check the temporary file size
        file_size = os.path.getsize(file.name)
        logger.info(f"Temporary file size: {file_size} bytes")
        if file_size == 0:
            logger.error("Uploaded file is empty")
            return "Error: Uploaded file is empty"

        # Save the uploaded file to persistent storage
        file_name = os.path.basename(file.name)
        file_path = os.path.join(DOCS_DIR, file_name)

        # Check whether the file already exists and how large it is
        should_save = True
        if os.path.exists(file_path):
            existing_size = os.path.getsize(file_path)
            logger.info(f"Existing file {file_name} size: {existing_size} bytes")
            if existing_size == 0:
                logger.warning(f"Existing file {file_name} is empty, will overwrite")
            else:
                logger.info(f"File {file_name} already exists and is not empty, skipping save")
                should_save = False

        if should_save:
            try:
                with open(file.name, "rb") as src_file:
                    file_content = src_file.read()
                logger.info(f"Read {len(file_content)} bytes from temporary file")
                if not file_content:
                    logger.error("File content is empty after reading")
                    return "Error: File content is empty after reading"
                with open(file_path, "wb") as dst_file:
                    dst_file.write(file_content)
                    dst_file.flush()  # Ensure the write completes
                # Verify the written file
                written_size = os.path.getsize(file_path)
                logger.info(f"Saved {file_name} to {file_path}, size: {written_size} bytes")
                if written_size == 0:
                    logger.error(f"Failed to write {file_name}, file is empty")
                    return f"Error: Failed to write {file_name}, file is empty"
            except PermissionError as e:
                logger.error(f"Permission error writing to {file_path}: {str(e)}")
                return f"Error: Permission denied writing to {file_path}"
            except Exception as e:
                logger.error(f"Error writing file to {file_path}: {str(e)}")
                return f"Error writing file: {str(e)}"

        # After saving the file, refresh the vector store
        refresh_status = refresh_vector_store()
        logger.info(f"Vector store refresh status: {refresh_status}")

        return f"Document '{file_name}' processed and saved! {refresh_status}"
    except Exception as e:
        logger.error(f"Error processing document: {str(e)}")
        return f"Error processing document: {str(e)}"

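# Gradio's gr.File hands the handler a tempfile-like object whose .name is the
# path of the uploaded copy on local disk, which is why the function reads from
# file.name and copies the bytes into DOCS_DIR itself.
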
def query_documents(query, history, system_prompt, max_tokens, temperature):
    global retriever, client
    try:
        if client is None:
            logger.error("InferenceClient not initialized")
            return history, "Error: InferenceClient not initialized. Check HF_TOKEN."
        if retriever is None:
            logger.error("No documents loaded")
            return history, "Error: No documents loaded. Please upload a document first."

        # Ensure history is a list of [user, assistant] lists
        logger.info(f"History before processing: {history}")
        if not isinstance(history, list):
            logger.warning("History is not a list, resetting")
            history = []
        history = [[str(item[0]), str(item[1])] for item in history if isinstance(item, (list, tuple)) and len(item) == 2]

        # Retrieve relevant documents
        docs = retriever.get_relevant_documents(query)

        # Format the context with source information
        context_parts = []
        for doc in docs:
            source = doc.metadata.get('source', 'unknown')
            page = doc.metadata.get('page', 'unknown')
            context_parts.append(f"[Source: {source}, Page: {page}]\n{doc.page_content}")

        context = "\n\n".join(context_parts)

        # Call Cerebras inference
        logger.info("Calling Cerebras inference")
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Context: {context}\n\nQuery: {query}"}
            ],
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            stream=False
        )
        answer = response.choices[0].message.content
        logger.info("Inference successful")

        # Update the chat history in list-of-pairs format
        history.append([query, answer])
        logger.info(f"History after append: {history}")
        return history, ""  # Clear the query input
    except Exception as e:
        logger.error(f"Error querying documents: {str(e)}")
        return history, f"Error querying documents: {str(e)}"

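# The [user, assistant] pair format above matches the default (tuple-style)
# gr.Chatbot created below; if the Chatbot were created with type="messages",
# history would instead need to be a list of {"role", "content"} dicts.
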
# Load an existing vector store on startup, if one was persisted
try:
    if os.path.exists(CHROMA_DIR):
        logger.info("Loading existing vector store")
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
        logger.info(f"Loaded vector store from {CHROMA_DIR}")
except Exception as e:
    logger.error(f"Error loading vector store: {str(e)}")

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# RAG Chatbot with Document Management")

    # File management tab
    with gr.Tab("Document Management"):
        # File upload
        with gr.Row():
            file_input = gr.File(label="Upload Document (PDF)", file_types=[".pdf"])
            file_output = gr.Textbox(label="Upload Status")

        # Document listing and management
        with gr.Row():
            refresh_btn = gr.Button("Refresh Document List")
            rebuild_vs_btn = gr.Button("Rebuild Vector Store")

        doc_list = gr.Dataframe(
            headers=["name", "size", "date"],
            label="Uploaded Documents"
        )

        with gr.Row():
            selected_doc = gr.Dropdown(label="Select Document")
            preview_btn = gr.Button("Preview Document")
            delete_btn = gr.Button("Delete Selected Document", variant="stop")

        doc_preview = gr.Textbox(label="Document Preview", lines=10)
        delete_output = gr.Textbox(label="Operation Status")

    # Chat interface tab
    with gr.Tab("Chat"):
        chatbot = gr.Chatbot(label="Conversation")

        # Query and parameters
        with gr.Row():
            query_input = gr.Textbox(label="Query", placeholder="Ask about the document...")
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful assistant answering questions based on the provided document context. Only use the context provided to answer the question. If you don't know the answer, say so."
            )

        with gr.Row():
            max_tokens = gr.Slider(label="Max Tokens", minimum=50, maximum=2000, value=500, step=50)
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7, step=0.1)

        # Buttons
        with gr.Row():
            submit_btn = gr.Button("Send")
            clear_btn = gr.Button("Clear Chat")

    # Event handlers for file management
    def update_doc_list():
        docs = list_uploaded_documents()
        # The Dataframe expects rows, and the Dropdown needs its choices
        # updated (returning a plain list would only set its value)
        rows = [[d["name"], d["size"], d["date"]] for d in docs]
        return rows, gr.update(choices=[d["name"] for d in docs])

    file_input.upload(initialize_rag, file_input, file_output).then(
        update_doc_list, None, [doc_list, selected_doc]
    )

    refresh_btn.click(update_doc_list, None, [doc_list, selected_doc])
    rebuild_vs_btn.click(refresh_vector_store, None, delete_output)
    preview_btn.click(preview_document, selected_doc, doc_preview)
    delete_btn.click(delete_document, selected_doc, delete_output).then(
        update_doc_list, None, [doc_list, selected_doc]
    )

    # Event handlers for chat
    submit_btn.click(
        query_documents,
        inputs=[query_input, chatbot, system_prompt, max_tokens, temperature],
        outputs=[chatbot, query_input]
    )

    clear_btn.click(lambda: [], None, chatbot)

    # Initialize the document list on startup
    demo.load(update_doc_list, None, [doc_list, selected_doc])

if __name__ == "__main__":
    demo.launch()