AnseMin committed on
Commit
f46dfbd
·
1 Parent(s): f901c17

Add data clearing service and vector store management

Browse files

- Introduced `DataClearingService` for managing the clearing of vector store and chat history data.
- Implemented `clear_all_documents` method in `VectorStoreManager` to remove all documents from the vector store.
- Enhanced UI with a button to clear all data, integrating the new service for user interaction.
- Updated logging to provide detailed feedback during data clearing operations.

src/rag/vector_store.py CHANGED
@@ -225,6 +225,37 @@ class VectorStoreManager:
225
  except Exception as e:
226
  logger.error(f"Error searching with metadata filter: {e}")
227
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  # Global vector store manager instance
230
  vector_store_manager = VectorStoreManager()
 
225
  except Exception as e:
226
  logger.error(f"Error searching with metadata filter: {e}")
227
  return []
228
+
229
def clear_all_documents(self) -> bool:
    """
    Remove every document from the vector store collection.

    Looks up the IDs currently held by the underlying collection, deletes
    them in a single call and then drops the cached vector store instance
    so that the next access starts from a clean state.

    Returns:
        True when the collection was emptied (or was already empty),
        False when any step raised an exception.
    """
    try:
        # The collection is reached through the store's private attribute.
        collection = self.get_vector_store()._collection
        stored = collection.get()
        doc_ids = stored.get('ids') if stored else None

        # Nothing stored: report success without touching the collection.
        if not doc_ids:
            logger.info("No documents found in vector store to clear")
            return True

        collection.delete(ids=doc_ids)

        # Forget the cached store instance to ensure a clean state.
        self._vector_store = None

        logger.info(f"Successfully cleared {len(doc_ids)} documents from vector store")
        return True

    except Exception as e:
        logger.error(f"Error clearing all documents: {e}")
        return False
259
 
260
  # Global vector store manager instance
261
  vector_store_manager = VectorStoreManager()
src/services/data_clearing_service.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data clearing service for both local and Hugging Face Space environments.
3
+ Provides functionality to clear vector store and chat history data.
4
+ """
5
+
6
+ import os
7
+ import shutil
8
+ from pathlib import Path
9
+ from typing import Dict, Any, Tuple, List
10
+ from src.core.config import config
11
+ from src.core.logging_config import get_logger
12
+ from src.rag.vector_store import vector_store_manager
13
+
14
+ logger = get_logger(__name__)
15
+
16
+
17
class DataClearingService:
    """Service for clearing all RAG-related data across different environments.

    Operates on the module-level ``vector_store_manager`` and on the chat
    history directory configured under ``config.rag``.  The ``clear_*``
    methods catch their own exceptions and report the outcome in the
    returned tuple instead of raising.
    """

    def __init__(self):
        """Initialize the data clearing service."""
        # A non-empty SPACE_ID environment variable is treated as "running in
        # a Hugging Face Space"; within this class it only feeds the reported
        # environment label.
        self.is_hf_space = bool(os.getenv("SPACE_ID"))
        logger.info(f"DataClearingService initialized (HF Space: {self.is_hf_space})")

    def _environment(self) -> str:
        """Return the environment label used in stats and status dicts."""
        return "hf_space" if self.is_hf_space else "local"

    @staticmethod
    def _remove_entry(item: Path) -> None:
        """Delete one directory entry: recurse into real directories, unlink anything else."""
        if item.is_dir() and not item.is_symlink():
            shutil.rmtree(item)
            logger.debug(f"Removed directory: {item}")
        else:
            # Regular files, plus symlinks (dangling ones, and ones pointing
            # at directories, which shutil.rmtree refuses to process) and any
            # other non-directory entry.
            item.unlink()
            logger.debug(f"Removed file: {item}")

    def get_data_paths(self) -> Tuple[str, str]:
        """
        Get the correct data paths for current environment.

        Both values are read from ``config.rag`` on every call.

        Returns:
            Tuple of (vector_store_path, chat_history_path)
        """
        vector_store_path = config.rag.vector_store_path
        chat_history_path = config.rag.chat_history_path

        logger.info(f"Data paths - Vector store: {vector_store_path}, Chat history: {chat_history_path}")
        return vector_store_path, chat_history_path

    def clear_vector_store(self) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Clear all documents from the vector store.

        The reported count is the ``document_count`` read from the collection
        info *before* clearing; it is not re-checked afterwards.

        Returns:
            Tuple of (success, message, stats)
        """
        try:
            collection_info = vector_store_manager.get_collection_info()
            initial_count = collection_info.get("document_count", 0)

            if initial_count == 0:
                return True, "Vector store is already empty", {"cleared_documents": 0}

            if not vector_store_manager.clear_all_documents():
                return False, "Failed to clear vector store", {"error": "clear_all_documents returned False"}

            logger.info(f"Cleared {initial_count} documents from vector store")

            return True, f"Successfully cleared {initial_count} documents from vector store", {
                "cleared_documents": initial_count,
                "collection_name": collection_info.get("collection_name", "unknown"),
            }

        except Exception as e:
            error_msg = f"Error clearing vector store: {str(e)}"
            logger.error(error_msg)
            return False, error_msg, {"error": str(e)}

    def clear_chat_history(self) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Clear all chat history files.

        Every top-level entry of the chat history directory is removed; the
        directory itself is kept.  The first removal error aborts the
        operation and is reported as a failure.

        Returns:
            Tuple of (success, message, stats)
        """
        try:
            _, chat_history_path = self.get_data_paths()
            chat_dir = Path(chat_history_path)

            if not chat_dir.exists():
                return True, "Chat history directory doesn't exist", {"cleared_files": 0}

            # Count files (recursively) before deletion so the total can be reported.
            file_count = sum(1 for entry in chat_dir.rglob("*") if entry.is_file())

            if file_count == 0:
                return True, "Chat history is already empty", {"cleared_files": 0}

            # Snapshot the listing first: entries are deleted while looping.
            for item in list(chat_dir.iterdir()):
                self._remove_entry(item)

            logger.info(f"Cleared {file_count} files from chat history")

            return True, f"Successfully cleared {file_count} files from chat history", {
                "cleared_files": file_count,
                "chat_history_path": str(chat_dir),
            }

        except Exception as e:
            error_msg = f"Error clearing chat history: {str(e)}"
            logger.error(error_msg)
            return False, error_msg, {"error": str(e)}

    def clear_directory_contents(self, directory_path: str) -> Tuple[bool, str, int]:
        """
        Clear all contents of a specific directory.

        Removal is best-effort: an entry that cannot be removed is logged and
        skipped so the remaining entries are still processed.  Such failures
        are reflected in the result (``success`` is False and the message
        states how many entries were left behind).

        Args:
            directory_path: Path to directory to clear

        Returns:
            Tuple of (success, message, items_cleared)
        """
        try:
            dir_path = Path(directory_path)

            if not dir_path.exists():
                return True, f"Directory doesn't exist: {directory_path}", 0

            items_cleared = 0
            items_failed = 0
            # Snapshot the listing first: entries are deleted while looping.
            for item in list(dir_path.iterdir()):
                try:
                    self._remove_entry(item)
                    items_cleared += 1
                except Exception as e:
                    items_failed += 1
                    logger.warning(f"Failed to remove {item}: {e}")

            message = f"Cleared {items_cleared} items from {directory_path}"
            if items_failed:
                message += f" ({items_failed} items could not be removed)"

            return items_failed == 0, message, items_cleared

        except Exception as e:
            error_msg = f"Error clearing directory {directory_path}: {str(e)}"
            logger.error(error_msg)
            return False, error_msg, 0

    def clear_all_data(self) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Clear all RAG-related data (vector store + chat history).

        Both steps always run; a failure in the vector store step does not
        prevent the chat history step.

        Returns:
            Tuple of (success, message, combined_stats)
        """
        logger.info("Starting complete data clearing operation")

        combined_stats = {
            "vector_store": {},
            "chat_history": {},
            "total_cleared_documents": 0,
            "total_cleared_files": 0,
            "environment": self._environment(),
            "errors": [],
        }

        # Clear vector store
        vs_success, vs_message, vs_stats = self.clear_vector_store()
        combined_stats["vector_store"] = {
            "success": vs_success,
            "message": vs_message,
            **vs_stats,
        }
        if vs_success:
            combined_stats["total_cleared_documents"] = vs_stats.get("cleared_documents", 0)
        else:
            combined_stats["errors"].append(f"Vector store: {vs_message}")

        # Clear chat history
        ch_success, ch_message, ch_stats = self.clear_chat_history()
        combined_stats["chat_history"] = {
            "success": ch_success,
            "message": ch_message,
            **ch_stats,
        }
        if ch_success:
            combined_stats["total_cleared_files"] = ch_stats.get("cleared_files", 0)
        else:
            combined_stats["errors"].append(f"Chat history: {ch_message}")

        # Overall success requires both steps to have succeeded.
        overall_success = vs_success and ch_success

        if not overall_success:
            joined_errors = "; ".join(combined_stats["errors"])
            overall_message = f"Data clearing completed with errors: {joined_errors}"
        else:
            doc_total = combined_stats["total_cleared_documents"]
            file_total = combined_stats["total_cleared_files"]
            if doc_total + file_total == 0:
                overall_message = "All data was already clear"
            else:
                overall_message = f"Successfully cleared all data: {doc_total} documents, {file_total} files"

        logger.info(f"Data clearing operation completed: {overall_message}")

        return overall_success, overall_message, combined_stats

    def get_data_status(self) -> Dict[str, Any]:
        """
        Get current status of data directories and vector store.

        Returns:
            Dictionary with data status information.  If anything raises, a
            reduced dictionary holding only ``error`` and ``environment`` is
            returned instead.
        """
        try:
            vector_store_path, chat_history_path = self.get_data_paths()

            # Vector store status
            collection_info = vector_store_manager.get_collection_info()
            vs_document_count = collection_info.get("document_count", 0)

            # Chat history status: files are counted recursively.
            chat_dir = Path(chat_history_path)
            ch_exists = chat_dir.exists()
            ch_file_count = 0
            if ch_exists:
                ch_file_count = sum(1 for entry in chat_dir.rglob("*") if entry.is_file())

            return {
                "environment": self._environment(),
                "vector_store": {
                    "path": vector_store_path,
                    "exists": Path(vector_store_path).exists(),
                    "document_count": vs_document_count,
                    "collection_name": collection_info.get("collection_name", "unknown"),
                },
                "chat_history": {
                    "path": chat_history_path,
                    "exists": ch_exists,
                    "file_count": ch_file_count,
                },
                "total_data_items": vs_document_count + ch_file_count,
                "has_data": vs_document_count > 0 or ch_file_count > 0,
            }

        except Exception as e:
            logger.error(f"Error getting data status: {e}")
            return {
                "error": str(e),
                "environment": self._environment(),
            }
260
+
261
+
262
+ # Global data clearing service instance
263
+ data_clearing_service = DataClearingService()
src/ui/ui.py CHANGED
@@ -15,6 +15,7 @@ from src.core.exceptions import (
15
  )
16
  from src.core.logging_config import get_logger
17
  from src.rag import rag_chat_service, document_ingestion_service
 
18
 
19
  # Use centralized logging
20
  logger = get_logger(__name__)
@@ -252,6 +253,48 @@ def start_new_chat_session():
252
  logger.error(error_msg)
253
  return [], f"❌ {error_msg}"
254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  def get_chat_status():
256
  """Get current chat system status."""
257
  try:
@@ -261,6 +304,9 @@ def get_chat_status():
261
  # Check usage stats
262
  usage_stats = rag_chat_service.get_usage_stats()
263
 
 
 
 
264
  # Modern status card design with better styling
265
  status_html = f"""
266
  <div class="status-card">
@@ -273,20 +319,20 @@ def get_chat_status():
273
 
274
  <div class="status-grid">
275
  <div class="status-item">
276
- <div class="status-label">Documents Processed</div>
277
- <div class="status-value">{ingestion_status.get('processed_documents', 0)}</div>
278
  </div>
279
  <div class="status-item">
280
- <div class="status-label">Vector Store</div>
281
- <div class="status-value">{ingestion_status.get('total_documents_in_store', 0)} docs</div>
282
  </div>
283
  <div class="status-item">
284
  <div class="status-label">Session Usage</div>
285
  <div class="status-value">{usage_stats.get('session_messages', 0)}/{usage_stats.get('session_limit', 50)}</div>
286
  </div>
287
  <div class="status-item">
288
- <div class="status-label">Hourly Usage</div>
289
- <div class="status-value">{usage_stats.get('hourly_messages', 0)}/{usage_stats.get('hourly_limit', 100)}</div>
290
  </div>
291
  </div>
292
 
@@ -556,6 +602,16 @@ def create_ui():
556
  transform: translateY(-1px);
557
  }
558
 
 
 
 
 
 
 
 
 
 
 
559
  /* Chat interface styling */
560
  .chat-main-container {
561
  background: #ffffff;
@@ -819,6 +875,7 @@ def create_ui():
819
  with gr.Row(elem_classes=["control-buttons"]):
820
  refresh_status_btn = gr.Button("πŸ”„ Refresh Status", elem_classes=["control-btn", "btn-refresh"])
821
  new_session_btn = gr.Button("πŸ†• New Session", elem_classes=["control-btn", "btn-new-session"])
 
822
 
823
  # Main chat interface
824
  with gr.Column(elem_classes=["chat-main-container"]):
@@ -885,6 +942,13 @@ def create_ui():
885
  inputs=[],
886
  outputs=[status_display]
887
  )
 
 
 
 
 
 
 
888
 
889
  return demo
890
 
 
15
  )
16
  from src.core.logging_config import get_logger
17
  from src.rag import rag_chat_service, document_ingestion_service
18
+ from src.services.data_clearing_service import data_clearing_service
19
 
20
  # Use centralized logging
21
  logger = get_logger(__name__)
 
253
  logger.error(error_msg)
254
  return [], f"❌ {error_msg}"
255
 
256
def handle_clear_all_data():
    """Handle clearing all RAG data (vector store + chat history).

    Returns:
        Tuple of (chatbot value, session info HTML, status HTML), matching
        the three outputs wired to the "Clear All Data" button.  On success
        the chatbot is reset to an empty conversation and a new chat session
        is started; on failure the chatbot is left as it is.
    """
    try:
        # Clear all data using the data clearing service
        success, message, stats = data_clearing_service.clear_all_data()

        if not success:
            logger.error(f"Data clearing failed: {message}")

            # Still get updated status even on error
            updated_status = get_chat_status()

            # gr.update() leaves the chatbot untouched; returning None here
            # would blank the visible conversation although nothing was reset.
            return gr.update(), f'<div class="session-info">❌ {message}</div>', updated_status

        # Reset chat session after clearing data
        session_id = rag_chat_service.start_new_session()

        # Get updated status
        updated_status = get_chat_status()

        # A check mark when something was actually removed, an info icon
        # when everything was already empty.
        cleared_anything = (
            stats.get("total_cleared_documents", 0) > 0
            or stats.get("total_cleared_files", 0) > 0
        )
        icon = "✅" if cleared_anything else "ℹ️"
        combined_msg = (
            f'{icon} {message}<br/>'
            f'<div class="session-info">🆕 Started new session: {session_id}</div>'
        )

        logger.info(f"Data cleared successfully: {message}")

        return [], combined_msg, updated_status

    except Exception as e:
        error_msg = f"Error clearing data: {str(e)}"
        logger.error(error_msg)

        # Get current status
        current_status = get_chat_status()

        return gr.update(), f'<div class="session-info">❌ {error_msg}</div>', current_status
297
+
298
  def get_chat_status():
299
  """Get current chat system status."""
300
  try:
 
304
  # Check usage stats
305
  usage_stats = rag_chat_service.get_usage_stats()
306
 
307
+ # Get data status for additional context
308
+ data_status = data_clearing_service.get_data_status()
309
+
310
  # Modern status card design with better styling
311
  status_html = f"""
312
  <div class="status-card">
 
319
 
320
  <div class="status-grid">
321
  <div class="status-item">
322
+ <div class="status-label">Vector Store Docs</div>
323
+ <div class="status-value">{data_status.get('vector_store', {}).get('document_count', 0)}</div>
324
  </div>
325
  <div class="status-item">
326
+ <div class="status-label">Chat History Files</div>
327
+ <div class="status-value">{data_status.get('chat_history', {}).get('file_count', 0)}</div>
328
  </div>
329
  <div class="status-item">
330
  <div class="status-label">Session Usage</div>
331
  <div class="status-value">{usage_stats.get('session_messages', 0)}/{usage_stats.get('session_limit', 50)}</div>
332
  </div>
333
  <div class="status-item">
334
+ <div class="status-label">Environment</div>
335
+ <div class="status-value">{'HF Space' if data_status.get('environment') == 'hf_space' else 'Local'}</div>
336
  </div>
337
  </div>
338
 
 
602
  transform: translateY(-1px);
603
  }
604
 
605
+ .btn-clear-data {
606
+ background: #dc3545;
607
+ color: white;
608
+ }
609
+
610
+ .btn-clear-data:hover {
611
+ background: #c82333;
612
+ transform: translateY(-1px);
613
+ }
614
+
615
  /* Chat interface styling */
616
  .chat-main-container {
617
  background: #ffffff;
 
875
  with gr.Row(elem_classes=["control-buttons"]):
876
  refresh_status_btn = gr.Button("πŸ”„ Refresh Status", elem_classes=["control-btn", "btn-refresh"])
877
  new_session_btn = gr.Button("πŸ†• New Session", elem_classes=["control-btn", "btn-new-session"])
878
+ clear_data_btn = gr.Button("πŸ—‘οΈ Clear All Data", elem_classes=["control-btn", "btn-clear-data"], variant="stop")
879
 
880
  # Main chat interface
881
  with gr.Column(elem_classes=["chat-main-container"]):
 
942
  inputs=[],
943
  outputs=[status_display]
944
  )
945
+
946
+ # Clear all data handler
947
+ clear_data_btn.click(
948
+ fn=handle_clear_all_data,
949
+ inputs=[],
950
+ outputs=[chatbot, session_info, status_display]
951
+ )
952
 
953
  return demo
954