SlouchyBuffalo committed on
Commit
f137b52
·
verified ·
1 Parent(s): 80c5225

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -410
app.py DELETED
@@ -1,410 +0,0 @@
1
import gradio as gr
import spaces
import os
import logging
import datetime
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from huggingface_hub import InferenceClient, get_token

# Module-level logger for the whole app.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Cache Hugging Face assets inside the Space's persistent storage.
os.environ["HF_HOME"] = "/data/.huggingface"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)

# Persistent storage layout.
DATA_DIR = "/data"                                # root persistent storage directory
DOCS_DIR = os.path.join(DATA_DIR, "documents")    # uploaded PDFs
CHROMA_DIR = os.path.join(DATA_DIR, "chroma_db")  # Chroma vector store

# Create directories if they don't exist.
os.makedirs(DOCS_DIR, exist_ok=True)
os.makedirs(CHROMA_DIR, exist_ok=True)

# Initialize the Cerebras-backed InferenceClient.  On any failure the client
# is left as None so request handlers can report a clear error message
# instead of crashing at import time.
try:
    token = get_token()
    if not token:
        logger.error("HF_TOKEN is not set in Space secrets")
        client = None
    else:
        client = InferenceClient(
            model="meta-llama/Llama-4-Scout-17B-16E-Instruct",
            provider="cerebras",
            token=token,
        )
        logger.info("InferenceClient initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize InferenceClient: {str(e)}")
    client = None

# Global RAG state, rebuilt whenever the document set changes.
vectorstore = None
retriever = None
49
-
50
# NOTE(review): the original used @spaces.CPU, which the `spaces` (ZeroGPU)
# package does not provide — only spaces.GPU exists.  Plain CPU code needs no
# decorator, so it is removed here.
def list_uploaded_documents():
    """Return metadata for every PDF in persistent storage.

    Returns:
        list[dict]: one dict per PDF with keys ``name``, ``size`` (KB string)
        and ``date`` (mtime as ``YYYY-MM-DD HH:MM:SS``); empty list on error
        or when the documents directory does not exist.
    """
    try:
        if not os.path.exists(DOCS_DIR):
            return []

        file_info = []
        for file in os.listdir(DOCS_DIR):
            if not file.lower().endswith('.pdf'):
                continue
            file_path = os.path.join(DOCS_DIR, file)
            file_info.append({
                "name": file,
                "size": f"{os.path.getsize(file_path) // 1024} KB",
                "date": datetime.datetime.fromtimestamp(
                    os.path.getmtime(file_path)
                ).strftime('%Y-%m-%d %H:%M:%S'),
            })
        return file_info
    except Exception as e:
        logger.error(f"Error listing documents: {str(e)}")
        return []
75
-
76
# NOTE(review): @spaces.CPU removed — the `spaces` package only provides
# spaces.GPU; plain CPU file operations need no decorator.
def delete_document(filename):
    """Delete *filename* from persistent storage and rebuild the vector store.

    Args:
        filename: base name of a PDF inside DOCS_DIR.

    Returns:
        str: human-readable status message for the UI.
    """
    global vectorstore, retriever
    try:
        if not filename:
            return "No file selected for deletion"

        file_path = os.path.join(DOCS_DIR, filename)
        if not os.path.exists(file_path):
            # The scraped source showed "(unknown)" here; reconstructed as
            # the {filename} placeholder, matching the intact f-strings.
            return f"File {filename} does not exist"

        os.remove(file_path)
        logger.info(f"Deleted file {filename}")

        # Rebuild the vector store so chunks from the deleted file disappear.
        refresh_status = refresh_vector_store()

        return f"File {filename} deleted successfully! {refresh_status}"
    except Exception as e:
        logger.error(f"Error deleting document: {str(e)}")
        return f"Error deleting document: {str(e)}"
100
-
101
# NOTE(review): @spaces.CPU removed — the `spaces` package only provides
# spaces.GPU; plain CPU file operations need no decorator.
def preview_document(filename, max_pages=3):
    """Return a text preview of the first pages of a stored PDF.

    Args:
        filename: base name of a PDF inside DOCS_DIR.
        max_pages: maximum number of pages to include in the preview.

    Returns:
        str: preview text (500 chars per page) or an error message.
    """
    try:
        if not filename:
            return "No file selected for preview"

        file_path = os.path.join(DOCS_DIR, filename)
        if not os.path.exists(file_path):
            return f"File {filename} does not exist"

        documents = PyPDFLoader(file_path).load()

        # Limit the preview to the first few pages.
        preview_docs = documents[:max_pages]
        preview_text = f"Preview of {filename} (first {len(preview_docs)} pages):\n\n"
        for i, doc in enumerate(preview_docs):
            preview_text += f"--- Page {i+1} ---\n{doc.page_content[:500]}...\n\n"

        return preview_text
    except Exception as e:
        logger.error(f"Error previewing document: {str(e)}")
        return f"Error previewing document: {str(e)}"
126
-
127
@spaces.GPU(duration=180)  # embedding generation benefits from GPU
def refresh_vector_store():
    """Rebuild the Chroma vector store from every PDF currently in DOCS_DIR.

    Side effects: rewrites CHROMA_DIR on disk and rebinds the module-level
    ``vectorstore`` and ``retriever`` globals (cleared to None when no PDFs
    remain).

    Returns:
        str: human-readable status message for the UI.
    """
    global vectorstore, retriever
    try:
        if not os.path.exists(DOCS_DIR):
            logger.warning("Documents directory does not exist")
            return "No documents directory found"

        files = [f for f in os.listdir(DOCS_DIR) if f.lower().endswith('.pdf')]
        if not files:
            logger.warning("No PDF documents found")
            # Clear any stale vector store, both on disk and in memory.
            if os.path.exists(CHROMA_DIR):
                import shutil
                shutil.rmtree(CHROMA_DIR)
                os.makedirs(CHROMA_DIR, exist_ok=True)
            vectorstore = None
            retriever = None
            return "No PDF documents found. Vector store cleared."

        # Load and chunk all documents.
        all_texts = []
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

        for file in files:
            file_path = os.path.join(DOCS_DIR, file)
            try:
                documents = PyPDFLoader(file_path).load()
                texts = text_splitter.split_documents(documents)

                # Tag every chunk with its source so answers can cite it.
                for i, text in enumerate(texts):
                    text.metadata["source"] = file
                    text.metadata["chunk_id"] = i
                    text.metadata["page"] = text.metadata.get("page", 0)

                all_texts.extend(texts)
                logger.info(f"Processed {file}, added {len(texts)} chunks")
            except Exception as e:
                # Best-effort: a single bad PDF must not abort the rebuild.
                logger.error(f"Error processing {file}: {str(e)}")

        if all_texts:
            # Replace the on-disk store wholesale before re-creating it.
            if os.path.exists(CHROMA_DIR):
                import shutil
                shutil.rmtree(CHROMA_DIR)
                os.makedirs(CHROMA_DIR, exist_ok=True)

            vectorstore = Chroma.from_documents(
                all_texts, embeddings, persist_directory=CHROMA_DIR
            )
            vectorstore.persist()
            retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
            logger.info(f"Vector store recreated with {len(all_texts)} chunks from {len(files)} files")
            return f"Vector store updated with {len(files)} documents and {len(all_texts)} chunks!"

        logger.warning("No text chunks extracted from documents")
        return "No content could be extracted from the PDF files"
    except Exception as e:
        logger.error(f"Error refreshing vector store: {str(e)}")
        return f"Error refreshing vector store: {str(e)}"
194
-
195
@spaces.GPU(duration=180)  # embedding generation benefits from GPU
def initialize_rag(file):
    """Persist an uploaded PDF into DOCS_DIR and rebuild the vector store.

    Args:
        file: Gradio file object; ``file.name`` is the temp-file path.

    Returns:
        str: human-readable status message for the UI (success or error).
    """
    global vectorstore, retriever
    try:
        # Debug the gradio file object we received.
        logger.info(f"File object: {type(file)}, Attributes: {dir(file)}")
        logger.info(f"File name: {file.name}")

        # Validate the upload.
        if not file or not file.name:
            logger.error("No file provided or invalid file name")
            return "Error: No file provided or invalid file name"

        # Verify the temporary file exists and is non-empty.
        if not os.path.exists(file.name):
            logger.error(f"Temporary file {file.name} does not exist")
            return f"Error: Temporary file {file.name} does not exist"

        file_size = os.path.getsize(file.name)
        logger.info(f"Temporary file size: {file_size} bytes")
        if file_size == 0:
            logger.error("Uploaded file is empty")
            return "Error: Uploaded file is empty"

        # Destination inside persistent storage.
        file_name = os.path.basename(file.name)
        file_path = os.path.join(DOCS_DIR, file_name)

        # Skip the copy when a non-empty file of the same name already exists;
        # overwrite only an empty (previously failed) copy.
        should_save = True
        if os.path.exists(file_path):
            existing_size = os.path.getsize(file_path)
            logger.info(f"Existing file {file_name} size: {existing_size} bytes")
            if existing_size == 0:
                logger.warning(f"Existing file {file_name} is empty, will overwrite")
            else:
                logger.info(f"File {file_name} already exists and is not empty, skipping save")
                should_save = False

        if should_save:
            try:
                with open(file.name, "rb") as src_file:
                    file_content = src_file.read()
                logger.info(f"Read {len(file_content)} bytes from temporary file")
                if not file_content:
                    logger.error("File content is empty after reading")
                    return "Error: File content is empty after reading"
                with open(file_path, "wb") as dst_file:
                    dst_file.write(file_content)
                    dst_file.flush()  # ensure the write completes
                # Verify the written file is non-empty before proceeding.
                written_size = os.path.getsize(file_path)
                logger.info(f"Saved {file_name} to {file_path}, size: {written_size} bytes")
                if written_size == 0:
                    logger.error(f"Failed to write {file_name}, file is empty")
                    return f"Error: Failed to write {file_name}, file is empty"
            except PermissionError as e:
                logger.error(f"Permission error writing to {file_path}: {str(e)}")
                return f"Error: Permission denied writing to {file_path}"
            except Exception as e:
                logger.error(f"Error writing file to {file_path}: {str(e)}")
                return f"Error writing file: {str(e)}"

        # After saving the file successfully, refresh the vector store.
        refresh_status = refresh_vector_store()
        logger.info(f"Vector store refresh status: {refresh_status}")

        return f"Document '{file_name}' processed and saved! {refresh_status}"
    except Exception as e:
        logger.error(f"Error processing document: {str(e)}")
        return f"Error processing document: {str(e)}"
267
-
268
def query_documents(query, history, system_prompt, max_tokens, temperature):
    """Answer *query* from retrieved document context via Cerebras inference.

    Args:
        query: user question.
        history: chat history as a list of [user, assistant] pairs.
        system_prompt: system message for the chat completion.
        max_tokens, temperature: generation parameters from the UI sliders.

    Returns:
        tuple: (updated history, new value for the query textbox — empty on
        success, an error message on failure).
    """
    global retriever, client
    try:
        if client is None:
            logger.error("InferenceClient not initialized")
            return history, "Error: InferenceClient not initialized. Check HF_TOKEN."
        if retriever is None:
            logger.error("No documents loaded")
            return history, "Error: No documents loaded. Please upload a document first."

        # Normalize history into a list of [user, assistant] string pairs,
        # dropping anything malformed.
        logger.info(f"History before processing: {history}")
        if not isinstance(history, list):
            logger.warning("History is not a list, resetting")
            history = []
        history = [
            [str(item[0]), str(item[1])]
            for item in history
            if isinstance(item, (list, tuple)) and len(item) == 2
        ]

        # Retrieve relevant chunks for the query.
        docs = retriever.get_relevant_documents(query)

        # Prefix each chunk with its provenance so the model can cite sources.
        context = "\n\n".join(
            f"[Source: {doc.metadata.get('source', 'unknown')}, "
            f"Page: {doc.metadata.get('page', 'unknown')}]\n{doc.page_content}"
            for doc in docs
        )

        logger.info("Calling Cerebras inference")
        response = client.chat_completion(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Context: {context}\n\nQuery: {query}"},
            ],
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            stream=False,
        )
        answer = response.choices[0].message.content
        logger.info("Inference successful")

        # Append the new turn and clear the query input.
        history.append([query, answer])
        logger.info(f"History after append: {history}")
        return history, ""
    except Exception as e:
        logger.error(f"Error querying documents: {str(e)}")
        return history, f"Error querying documents: {str(e)}"
318
-
319
# Load an existing vector store from persistent storage on startup, so the
# app survives Space restarts without requiring documents to be re-uploaded.
try:
    if os.path.exists(CHROMA_DIR):
        logger.info("Loading existing vector store")
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
        logger.info(f"Loaded vector store from {CHROMA_DIR}")
except Exception as e:
    # Non-fatal: the app can still run and rebuild the store on next upload.
    logger.error(f"Error loading vector store: {str(e)}")
329
-
330
# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("# RAG Chatbot with Document Management")

    # --- Document management tab ---
    with gr.Tab("Document Management"):
        with gr.Row():
            file_input = gr.File(label="Upload Document (PDF)", file_types=[".pdf"])
            file_output = gr.Textbox(label="Upload Status")

        with gr.Row():
            refresh_btn = gr.Button("Refresh Document List")
            rebuild_vs_btn = gr.Button("Rebuild Vector Store")

        doc_list = gr.Dataframe(
            headers=["name", "size", "date"],
            label="Uploaded Documents",
            interactive=False,
        )

        with gr.Row():
            selected_doc = gr.Dropdown(label="Select Document")
            preview_btn = gr.Button("Preview Document")
            delete_btn = gr.Button("Delete Selected Document", variant="stop")

        doc_preview = gr.Textbox(label="Document Preview", lines=10)
        delete_output = gr.Textbox(label="Operation Status")

    # --- Chat tab ---
    with gr.Tab("Chat"):
        chatbot = gr.Chatbot(label="Conversation")

        with gr.Row():
            query_input = gr.Textbox(label="Query", placeholder="Ask about the document...")
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful assistant answering questions based on the provided document context. Only use the context provided to answer the question. If you don't know the answer, say so."
            )

        with gr.Row():
            max_tokens = gr.Slider(label="Max Tokens", minimum=50, maximum=2000, value=500, step=50)
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.7, step=0.1)

        with gr.Row():
            submit_btn = gr.Button("Send")
            clear_btn = gr.Button("Clear Chat")

    # Shared helper: refresh both the document table and the dropdown choices.
    # NOTE(review): returning a plain list to a Dropdown sets its *value*, not
    # its choices, on newer Gradio versions — confirm against the pinned
    # gradio release; gr.update(choices=...) may be needed.
    def update_doc_list():
        docs = list_uploaded_documents()
        return docs, [d["name"] for d in docs]

    # --- Event handlers: file management ---
    file_input.upload(initialize_rag, file_input, file_output).then(
        update_doc_list, None, [doc_list, selected_doc]
    )

    refresh_btn.click(update_doc_list, None, [doc_list, selected_doc])
    rebuild_vs_btn.click(refresh_vector_store, None, delete_output)
    preview_btn.click(preview_document, selected_doc, doc_preview)
    delete_btn.click(delete_document, selected_doc, delete_output).then(
        update_doc_list, None, [doc_list, selected_doc]
    )

    # --- Event handlers: chat ---
    submit_btn.click(
        query_documents,
        inputs=[query_input, chatbot, system_prompt, max_tokens, temperature],
        outputs=[chatbot, query_input],
    )

    clear_btn.click(lambda: [], None, chatbot)

    # Populate the document list when the app first loads.
    demo.load(update_doc_list, None, [doc_list, selected_doc])
408
-
409
# Script entry point: launch the Gradio app when run directly.
if __name__ == "__main__":
    demo.launch()