Spaces:
Sleeping
Sleeping
Merge pull request #4 from ansemin/development
Browse filesRefactor document ingestion and output file handling
- src/parsers/gemini_flash_parser.py +1 -1
- src/rag/ingestion.py +59 -28
- src/services/document_service.py +44 -15
- src/ui/ui.py +11 -1
src/parsers/gemini_flash_parser.py
CHANGED
@@ -85,7 +85,7 @@ class GeminiFlashParser(DocumentParser):
|
|
85 |
|
86 |
# Generate the response
|
87 |
response = client.models.generate_content(
|
88 |
-
model=
|
89 |
contents=[
|
90 |
prompt,
|
91 |
genai.types.Part.from_bytes(
|
|
|
85 |
|
86 |
# Generate the response
|
87 |
response = client.models.generate_content(
|
88 |
+
model=config.model.gemini_model,
|
89 |
contents=[
|
90 |
prompt,
|
91 |
genai.types.Part.from_bytes(
|
src/rag/ingestion.py
CHANGED
@@ -18,12 +18,11 @@ class DocumentIngestionService:
|
|
18 |
|
19 |
def __init__(self):
|
20 |
"""Initialize the document ingestion service."""
|
21 |
-
self.processed_documents = set() # Track processed document hashes
|
22 |
logger.info("Document ingestion service initialized")
|
23 |
|
24 |
-
def
|
25 |
-
"""Create a hash for
|
26 |
-
return hashlib.sha256(content.encode('utf-8')).hexdigest()
|
27 |
|
28 |
def prepare_document_metadata(self,
|
29 |
source_path: Optional[str] = None,
|
@@ -44,7 +43,6 @@ class DocumentIngestionService:
|
|
44 |
"source": source_path or "user_upload",
|
45 |
"doc_type": doc_type,
|
46 |
"processed_at": datetime.now().isoformat(),
|
47 |
-
"source_id": self.create_document_hash(source_path or ""),
|
48 |
"ingestion_version": "1.0"
|
49 |
}
|
50 |
|
@@ -53,10 +51,35 @@ class DocumentIngestionService:
|
|
53 |
|
54 |
return metadata
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
def ingest_markdown_content(self,
|
57 |
markdown_content: str,
|
58 |
source_path: Optional[str] = None,
|
59 |
-
metadata: Optional[Dict[str, Any]] = None
|
|
|
60 |
"""
|
61 |
Ingest markdown content into the RAG system.
|
62 |
|
@@ -64,6 +87,7 @@ class DocumentIngestionService:
|
|
64 |
markdown_content: The markdown content to ingest
|
65 |
source_path: Optional source path/filename
|
66 |
metadata: Optional additional metadata
|
|
|
67 |
|
68 |
Returns:
|
69 |
Tuple of (success, message, ingestion_stats)
|
@@ -72,24 +96,34 @@ class DocumentIngestionService:
|
|
72 |
if not markdown_content or not markdown_content.strip():
|
73 |
return False, "No content provided for ingestion", {}
|
74 |
|
75 |
-
# Create
|
76 |
-
|
|
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
|
82 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
doc_metadata = self.prepare_document_metadata(
|
84 |
source_path=source_path,
|
85 |
doc_type="markdown",
|
86 |
additional_metadata=metadata
|
87 |
)
|
88 |
-
doc_metadata["
|
89 |
doc_metadata["content_length"] = len(markdown_content)
|
|
|
90 |
|
91 |
# Chunk the document using markdown-aware chunking
|
92 |
-
logger.info(f"Chunking document: {
|
93 |
chunks = document_chunker.chunk_document(markdown_content, doc_metadata)
|
94 |
|
95 |
if not chunks:
|
@@ -102,23 +136,22 @@ class DocumentIngestionService:
|
|
102 |
if not doc_ids:
|
103 |
return False, "Failed to add documents to vector store", {}
|
104 |
|
105 |
-
# Mark document as processed
|
106 |
-
self.processed_documents.add(content_hash)
|
107 |
-
|
108 |
# Prepare ingestion statistics
|
109 |
ingestion_stats = {
|
110 |
"status": "success",
|
111 |
-
"
|
112 |
"total_chunks": len(chunks),
|
113 |
"document_ids": doc_ids,
|
114 |
"content_length": len(markdown_content),
|
115 |
"has_tables": any(chunk.metadata.get("has_table", False) for chunk in chunks),
|
116 |
"has_code": any(chunk.metadata.get("has_code", False) for chunk in chunks),
|
117 |
-
"processed_at": datetime.now().isoformat()
|
|
|
118 |
}
|
119 |
|
120 |
-
|
121 |
-
|
|
|
122 |
|
123 |
return True, success_msg, ingestion_stats
|
124 |
|
@@ -147,6 +180,7 @@ class DocumentIngestionService:
|
|
147 |
# Extract metadata from conversion result
|
148 |
original_filename = conversion_result.get("original_filename", "unknown")
|
149 |
conversion_method = conversion_result.get("conversion_method", "unknown")
|
|
|
150 |
|
151 |
additional_metadata = {
|
152 |
"original_filename": original_filename,
|
@@ -155,11 +189,12 @@ class DocumentIngestionService:
|
|
155 |
"conversion_time": conversion_result.get("conversion_time", 0)
|
156 |
}
|
157 |
|
158 |
-
# Ingest the markdown content
|
159 |
return self.ingest_markdown_content(
|
160 |
markdown_content=markdown_content,
|
161 |
source_path=original_filename,
|
162 |
-
metadata=additional_metadata
|
|
|
163 |
)
|
164 |
|
165 |
except Exception as e:
|
@@ -175,7 +210,7 @@ class DocumentIngestionService:
|
|
175 |
Dictionary with system status information
|
176 |
"""
|
177 |
status = {
|
178 |
-
"processed_documents":
|
179 |
"embedding_model_available": False,
|
180 |
"vector_store_available": False,
|
181 |
"system_ready": False
|
@@ -202,10 +237,6 @@ class DocumentIngestionService:
|
|
202 |
|
203 |
return status
|
204 |
|
205 |
-
def clear_processed_documents(self) -> None:
|
206 |
-
"""Clear the set of processed documents."""
|
207 |
-
self.processed_documents.clear()
|
208 |
-
logger.info("Cleared processed documents cache")
|
209 |
|
210 |
def test_ingestion_pipeline(self) -> Dict[str, Any]:
|
211 |
"""
|
|
|
18 |
|
19 |
def __init__(self):
|
20 |
"""Initialize the document ingestion service."""
|
|
|
21 |
logger.info("Document ingestion service initialized")
|
22 |
|
23 |
+
def create_file_hash(self, content: str) -> str:
|
24 |
+
"""Create a full SHA-256 hash for file content to avoid duplicates."""
|
25 |
+
return hashlib.sha256(content.encode('utf-8')).hexdigest()
|
26 |
|
27 |
def prepare_document_metadata(self,
|
28 |
source_path: Optional[str] = None,
|
|
|
43 |
"source": source_path or "user_upload",
|
44 |
"doc_type": doc_type,
|
45 |
"processed_at": datetime.now().isoformat(),
|
|
|
46 |
"ingestion_version": "1.0"
|
47 |
}
|
48 |
|
|
|
51 |
|
52 |
return metadata
|
53 |
|
54 |
+
def check_duplicate_in_vector_store(self, file_hash: str) -> bool:
|
55 |
+
"""Check if document with given file hash already exists in vector store."""
|
56 |
+
try:
|
57 |
+
existing_docs = vector_store_manager.get_vector_store()._collection.get(
|
58 |
+
where={"file_hash": file_hash},
|
59 |
+
limit=1
|
60 |
+
)
|
61 |
+
return len(existing_docs.get('ids', [])) > 0
|
62 |
+
except Exception as e:
|
63 |
+
logger.error(f"Error checking for duplicates: {e}")
|
64 |
+
return False
|
65 |
+
|
66 |
+
def delete_existing_document(self, file_hash: str) -> bool:
|
67 |
+
"""Delete existing document with given file hash from vector store."""
|
68 |
+
try:
|
69 |
+
vector_store_manager.get_vector_store()._collection.delete(
|
70 |
+
where={"file_hash": file_hash}
|
71 |
+
)
|
72 |
+
logger.info(f"Deleted existing document with hash: {file_hash}")
|
73 |
+
return True
|
74 |
+
except Exception as e:
|
75 |
+
logger.error(f"Error deleting existing document: {e}")
|
76 |
+
return False
|
77 |
+
|
78 |
def ingest_markdown_content(self,
|
79 |
markdown_content: str,
|
80 |
source_path: Optional[str] = None,
|
81 |
+
metadata: Optional[Dict[str, Any]] = None,
|
82 |
+
original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
|
83 |
"""
|
84 |
Ingest markdown content into the RAG system.
|
85 |
|
|
|
87 |
markdown_content: The markdown content to ingest
|
88 |
source_path: Optional source path/filename
|
89 |
metadata: Optional additional metadata
|
90 |
+
original_file_content: Original file content for hash calculation
|
91 |
|
92 |
Returns:
|
93 |
Tuple of (success, message, ingestion_stats)
|
|
|
96 |
if not markdown_content or not markdown_content.strip():
|
97 |
return False, "No content provided for ingestion", {}
|
98 |
|
99 |
+
# Create file hash using original content if available, otherwise use markdown content
|
100 |
+
file_content_for_hash = original_file_content or markdown_content
|
101 |
+
file_hash = self.create_file_hash(file_content_for_hash)
|
102 |
|
103 |
+
# Check for duplicates in vector store
|
104 |
+
is_duplicate = self.check_duplicate_in_vector_store(file_hash)
|
105 |
+
replacement_mode = False
|
106 |
|
107 |
+
if is_duplicate:
|
108 |
+
logger.info(f"Document with hash {file_hash} already exists, replacing...")
|
109 |
+
# Delete existing document
|
110 |
+
if self.delete_existing_document(file_hash):
|
111 |
+
replacement_mode = True
|
112 |
+
else:
|
113 |
+
return False, "Failed to replace existing document", {"status": "error"}
|
114 |
+
|
115 |
+
# Prepare document metadata with file hash
|
116 |
doc_metadata = self.prepare_document_metadata(
|
117 |
source_path=source_path,
|
118 |
doc_type="markdown",
|
119 |
additional_metadata=metadata
|
120 |
)
|
121 |
+
doc_metadata["file_hash"] = file_hash
|
122 |
doc_metadata["content_length"] = len(markdown_content)
|
123 |
+
doc_metadata["upload_timestamp"] = datetime.now().isoformat()
|
124 |
|
125 |
# Chunk the document using markdown-aware chunking
|
126 |
+
logger.info(f"Chunking document: {file_hash}")
|
127 |
chunks = document_chunker.chunk_document(markdown_content, doc_metadata)
|
128 |
|
129 |
if not chunks:
|
|
|
136 |
if not doc_ids:
|
137 |
return False, "Failed to add documents to vector store", {}
|
138 |
|
|
|
|
|
|
|
139 |
# Prepare ingestion statistics
|
140 |
ingestion_stats = {
|
141 |
"status": "success",
|
142 |
+
"file_hash": file_hash,
|
143 |
"total_chunks": len(chunks),
|
144 |
"document_ids": doc_ids,
|
145 |
"content_length": len(markdown_content),
|
146 |
"has_tables": any(chunk.metadata.get("has_table", False) for chunk in chunks),
|
147 |
"has_code": any(chunk.metadata.get("has_code", False) for chunk in chunks),
|
148 |
+
"processed_at": datetime.now().isoformat(),
|
149 |
+
"replacement_mode": replacement_mode
|
150 |
}
|
151 |
|
152 |
+
action = "Updated existing" if replacement_mode else "Successfully ingested"
|
153 |
+
success_msg = f"{action} document with {len(chunks)} chunks"
|
154 |
+
logger.info(f"{success_msg}: {file_hash}")
|
155 |
|
156 |
return True, success_msg, ingestion_stats
|
157 |
|
|
|
180 |
# Extract metadata from conversion result
|
181 |
original_filename = conversion_result.get("original_filename", "unknown")
|
182 |
conversion_method = conversion_result.get("conversion_method", "unknown")
|
183 |
+
original_file_content = conversion_result.get("original_file_content")
|
184 |
|
185 |
additional_metadata = {
|
186 |
"original_filename": original_filename,
|
|
|
189 |
"conversion_time": conversion_result.get("conversion_time", 0)
|
190 |
}
|
191 |
|
192 |
+
# Ingest the markdown content with original file content for proper hashing
|
193 |
return self.ingest_markdown_content(
|
194 |
markdown_content=markdown_content,
|
195 |
source_path=original_filename,
|
196 |
+
metadata=additional_metadata,
|
197 |
+
original_file_content=original_file_content
|
198 |
)
|
199 |
|
200 |
except Exception as e:
|
|
|
210 |
Dictionary with system status information
|
211 |
"""
|
212 |
status = {
|
213 |
+
"processed_documents": 0, # Will be updated from vector store
|
214 |
"embedding_model_available": False,
|
215 |
"vector_store_available": False,
|
216 |
"system_ready": False
|
|
|
237 |
|
238 |
return status
|
239 |
|
|
|
|
|
|
|
|
|
240 |
|
241 |
def test_ingestion_pipeline(self) -> Dict[str, Any]:
|
242 |
"""
|
src/services/document_service.py
CHANGED
@@ -118,8 +118,8 @@ class DocumentService:
|
|
118 |
|
119 |
return content
|
120 |
|
121 |
-
def _create_output_file(self, content: str, output_format: str) -> str:
|
122 |
-
"""Create output file with proper extension."""
|
123 |
# Determine file extension
|
124 |
format_extensions = {
|
125 |
"markdown": ".md",
|
@@ -132,18 +132,47 @@ class DocumentService:
|
|
132 |
if self._check_cancellation():
|
133 |
raise ConversionError("Conversion cancelled before output file creation")
|
134 |
|
135 |
-
# Create
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
return tmp_path
|
149 |
|
@@ -218,7 +247,7 @@ class DocumentService:
|
|
218 |
raise ConversionError("Conversion cancelled")
|
219 |
|
220 |
# Create output file
|
221 |
-
output_path = self._create_output_file(content, output_format)
|
222 |
|
223 |
return content, output_path
|
224 |
|
|
|
118 |
|
119 |
return content
|
120 |
|
121 |
+
def _create_output_file(self, content: str, output_format: str, original_file_path: Optional[str] = None) -> str:
|
122 |
+
"""Create output file with proper extension and preserved filename."""
|
123 |
# Determine file extension
|
124 |
format_extensions = {
|
125 |
"markdown": ".md",
|
|
|
132 |
if self._check_cancellation():
|
133 |
raise ConversionError("Conversion cancelled before output file creation")
|
134 |
|
135 |
+
# Create output filename based on original filename if provided
|
136 |
+
if original_file_path:
|
137 |
+
original_name = Path(original_file_path).stem # Get filename without extension
|
138 |
+
# Clean the filename to be filesystem-safe while preserving spaces and common characters
|
139 |
+
clean_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_', '.', '(', ')')).strip()
|
140 |
+
# Replace multiple spaces with single spaces
|
141 |
+
clean_name = ' '.join(clean_name.split())
|
142 |
+
if not clean_name: # Fallback if cleaning removes everything
|
143 |
+
clean_name = "converted_document"
|
144 |
+
|
145 |
+
# Create output file in temp directory with proper name
|
146 |
+
temp_dir = tempfile.gettempdir()
|
147 |
+
output_filename = f"{clean_name}{ext}"
|
148 |
+
tmp_path = os.path.join(temp_dir, output_filename)
|
149 |
+
|
150 |
+
# Handle filename conflicts by adding a number suffix
|
151 |
+
counter = 1
|
152 |
+
base_path = tmp_path
|
153 |
+
while os.path.exists(tmp_path):
|
154 |
+
name_part = f"{clean_name}_{counter}"
|
155 |
+
tmp_path = os.path.join(temp_dir, f"{name_part}{ext}")
|
156 |
+
counter += 1
|
157 |
+
else:
|
158 |
+
# Fallback to random temporary file
|
159 |
+
with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False, encoding="utf-8") as tmp:
|
160 |
+
tmp_path = tmp.name
|
161 |
+
|
162 |
+
# Write content to file
|
163 |
+
try:
|
164 |
+
with open(tmp_path, "w", encoding="utf-8") as f:
|
165 |
+
# Write in chunks with cancellation checks
|
166 |
+
chunk_size = 10000 # characters
|
167 |
+
for i in range(0, len(content), chunk_size):
|
168 |
+
if self._check_cancellation():
|
169 |
+
self._safe_delete_file(tmp_path)
|
170 |
+
raise ConversionError("Conversion cancelled during output file writing")
|
171 |
+
|
172 |
+
f.write(content[i:i+chunk_size])
|
173 |
+
except Exception as e:
|
174 |
+
self._safe_delete_file(tmp_path)
|
175 |
+
raise ConversionError(f"Failed to write output file: {str(e)}")
|
176 |
|
177 |
return tmp_path
|
178 |
|
|
|
247 |
raise ConversionError("Conversion cancelled")
|
248 |
|
249 |
# Create output file
|
250 |
+
output_path = self._create_output_file(content, output_format, file_path)
|
251 |
|
252 |
return content, output_path
|
253 |
|
src/ui/ui.py
CHANGED
@@ -170,12 +170,22 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
|
|
170 |
|
171 |
# Auto-ingest the converted document for RAG
|
172 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
conversion_result = {
|
174 |
"markdown_content": content,
|
175 |
"original_filename": Path(file_path).name if file_path else "unknown",
|
176 |
"conversion_method": parser_name,
|
177 |
"file_size": Path(file_path).stat().st_size if file_path and Path(file_path).exists() else 0,
|
178 |
-
"conversion_time": 0 # Could be tracked if needed
|
|
|
179 |
}
|
180 |
|
181 |
success, ingestion_msg, stats = document_ingestion_service.ingest_from_conversion_result(conversion_result)
|
|
|
170 |
|
171 |
# Auto-ingest the converted document for RAG
|
172 |
try:
|
173 |
+
# Read original file content for proper deduplication hashing
|
174 |
+
original_file_content = None
|
175 |
+
if file_path and Path(file_path).exists():
|
176 |
+
try:
|
177 |
+
with open(file_path, 'rb') as f:
|
178 |
+
original_file_content = f.read().decode('utf-8', errors='ignore')
|
179 |
+
except Exception as e:
|
180 |
+
logger.warning(f"Could not read original file content: {e}")
|
181 |
+
|
182 |
conversion_result = {
|
183 |
"markdown_content": content,
|
184 |
"original_filename": Path(file_path).name if file_path else "unknown",
|
185 |
"conversion_method": parser_name,
|
186 |
"file_size": Path(file_path).stat().st_size if file_path and Path(file_path).exists() else 0,
|
187 |
+
"conversion_time": 0, # Could be tracked if needed
|
188 |
+
"original_file_content": original_file_content
|
189 |
}
|
190 |
|
191 |
success, ingestion_msg, stats = document_ingestion_service.ingest_from_conversion_result(conversion_result)
|