Spaces:

Ansemin101
/

Markit_v2

Sleeping

App Files Community

AnseMin committed on Jun 24

Commit

f901c17

unverified ·

2 Parent(s): 63f3b68 3f1b4af

Merge pull request #4 from ansemin/development

Browse files

Refactor document ingestion and output file handling

Files changed (4) hide show

src/parsers/gemini_flash_parser.py +1 -1
src/rag/ingestion.py +59 -28
src/services/document_service.py +44 -15
src/ui/ui.py +11 -1

src/parsers/gemini_flash_parser.py CHANGED Viewed

@@ -85,7 +85,7 @@ class GeminiFlashParser(DocumentParser):
             # Generate the response
             response = client.models.generate_content(
-                model="gemini-2.0-flash",
                 contents=[
                     prompt,
                     genai.types.Part.from_bytes(

             # Generate the response
             response = client.models.generate_content(
+                model=config.model.gemini_model,
                 contents=[
                     prompt,
                     genai.types.Part.from_bytes(

src/rag/ingestion.py CHANGED Viewed

@@ -18,12 +18,11 @@ class DocumentIngestionService:
     def __init__(self):
         """Initialize the document ingestion service."""
-        self.processed_documents = set()  # Track processed document hashes
         logger.info("Document ingestion service initialized")
-    def create_document_hash(self, content: str) -> str:
-        """Create a hash for document content to avoid duplicates."""
-        return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]
     def prepare_document_metadata(self,
                                 source_path: Optional[str] = None,
@@ -44,7 +43,6 @@ class DocumentIngestionService:
             "source": source_path or "user_upload",
             "doc_type": doc_type,
             "processed_at": datetime.now().isoformat(),
-            "source_id": self.create_document_hash(source_path or ""),
             "ingestion_version": "1.0"
         }
@@ -53,10 +51,35 @@ class DocumentIngestionService:
         return metadata
     def ingest_markdown_content(self,
                               markdown_content: str,
                               source_path: Optional[str] = None,
-                              metadata: Optional[Dict[str, Any]] = None) -> Tuple[bool, str, Dict[str, Any]]:
         """
         Ingest markdown content into the RAG system.
@@ -64,6 +87,7 @@ class DocumentIngestionService:
             markdown_content: The markdown content to ingest
             source_path: Optional source path/filename
             metadata: Optional additional metadata
         Returns:
             Tuple of (success, message, ingestion_stats)
@@ -72,24 +96,34 @@ class DocumentIngestionService:
             if not markdown_content or not markdown_content.strip():
                 return False, "No content provided for ingestion", {}
-            # Create document hash to check for duplicates
-            content_hash = self.create_document_hash(markdown_content)
-            if content_hash in self.processed_documents:
-                logger.info(f"Document already processed: {content_hash}")
-                return True, "Document already exists in the system", {"status": "duplicate"}
-            # Prepare document metadata
             doc_metadata = self.prepare_document_metadata(
                 source_path=source_path,
                 doc_type="markdown",
                 additional_metadata=metadata
             )
-            doc_metadata["content_hash"] = content_hash
             doc_metadata["content_length"] = len(markdown_content)
             # Chunk the document using markdown-aware chunking
-            logger.info(f"Chunking document: {content_hash}")
             chunks = document_chunker.chunk_document(markdown_content, doc_metadata)
             if not chunks:
@@ -102,23 +136,22 @@ class DocumentIngestionService:
             if not doc_ids:
                 return False, "Failed to add documents to vector store", {}
-            # Mark document as processed
-            self.processed_documents.add(content_hash)
             # Prepare ingestion statistics
             ingestion_stats = {
                 "status": "success",
-                "content_hash": content_hash,
                 "total_chunks": len(chunks),
                 "document_ids": doc_ids,
                 "content_length": len(markdown_content),
                 "has_tables": any(chunk.metadata.get("has_table", False) for chunk in chunks),
                 "has_code": any(chunk.metadata.get("has_code", False) for chunk in chunks),
-                "processed_at": datetime.now().isoformat()
             }
-            success_msg = f"Successfully ingested document with {len(chunks)} chunks"
-            logger.info(f"{success_msg}: {content_hash}")
             return True, success_msg, ingestion_stats
@@ -147,6 +180,7 @@ class DocumentIngestionService:
             # Extract metadata from conversion result
             original_filename = conversion_result.get("original_filename", "unknown")
             conversion_method = conversion_result.get("conversion_method", "unknown")
             additional_metadata = {
                 "original_filename": original_filename,
@@ -155,11 +189,12 @@ class DocumentIngestionService:
                 "conversion_time": conversion_result.get("conversion_time", 0)
             }
-            # Ingest the markdown content
             return self.ingest_markdown_content(
                 markdown_content=markdown_content,
                 source_path=original_filename,
-                metadata=additional_metadata
             )
         except Exception as e:
@@ -175,7 +210,7 @@ class DocumentIngestionService:
             Dictionary with system status information
         """
         status = {
-            "processed_documents": len(self.processed_documents),
             "embedding_model_available": False,
             "vector_store_available": False,
             "system_ready": False
@@ -202,10 +237,6 @@ class DocumentIngestionService:
         return status
-    def clear_processed_documents(self) -> None:
-        """Clear the set of processed documents."""
-        self.processed_documents.clear()
-        logger.info("Cleared processed documents cache")
     def test_ingestion_pipeline(self) -> Dict[str, Any]:
         """

     def __init__(self):
         """Initialize the document ingestion service."""
         logger.info("Document ingestion service initialized")
+    def create_file_hash(self, content: str) -> str:
+        """Create a full SHA-256 hash for file content to avoid duplicates.
+
+        The text is encoded as UTF-8 and the complete hexadecimal digest is
+        returned without truncation.
+
+        Args:
+            content: Text whose hash is wanted.
+
+        Returns:
+            The SHA-256 hex digest of the UTF-8 encoded ``content``.
+        """
+        return hashlib.sha256(content.encode('utf-8')).hexdigest()
     def prepare_document_metadata(self,
                                 source_path: Optional[str] = None,
             "source": source_path or "user_upload",
             "doc_type": doc_type,
             "processed_at": datetime.now().isoformat(),
             "ingestion_version": "1.0"
         }
         return metadata
+    def check_duplicate_in_vector_store(self, file_hash: str) -> bool:
+        """Check if document with given file hash already exists in vector store.
+
+        Looks up at most one stored entry whose ``file_hash`` metadata equals
+        the given value.
+
+        Args:
+            file_hash: Hash value to look for in the stored metadata.
+
+        Returns:
+            True if the lookup returned at least one id. False if it returned
+            none, or if the lookup raised: the error is logged and the
+            document is then treated as not being a duplicate.
+        """
+        try:
+            # NOTE(review): relies on the private ``_collection`` attribute of the
+            # vector store; presumably a Chroma collection -- confirm.
+            existing_docs = vector_store_manager.get_vector_store()._collection.get(
+                where={"file_hash": file_hash},
+                limit=1
+            )
+            return len(existing_docs.get('ids', [])) > 0
+        except Exception as e:
+            logger.error(f"Error checking for duplicates: {e}")
+            return False
+    def delete_existing_document(self, file_hash: str) -> bool:
+        """Delete existing document with given file hash from vector store.
+
+        Issues a delete filtered on ``file_hash`` metadata equal to the given
+        value.
+
+        Args:
+            file_hash: Hash value identifying the stored entries to remove.
+
+        Returns:
+            True if the delete call completed without raising; False if it
+            raised (the error is logged rather than propagated).
+        """
+        try:
+            # NOTE(review): relies on the private ``_collection`` attribute of the
+            # vector store; presumably a Chroma collection -- confirm.
+            vector_store_manager.get_vector_store()._collection.delete(
+                where={"file_hash": file_hash}
+            )
+            logger.info(f"Deleted existing document with hash: {file_hash}")
+            return True
+        except Exception as e:
+            logger.error(f"Error deleting existing document: {e}")
+            return False
     def ingest_markdown_content(self,
                               markdown_content: str,
                               source_path: Optional[str] = None,
+                              metadata: Optional[Dict[str, Any]] = None,
+                              original_file_content: Optional[str] = None) -> Tuple[bool, str, Dict[str, Any]]:
         """
         Ingest markdown content into the RAG system.
             markdown_content: The markdown content to ingest
             source_path: Optional source path/filename
             metadata: Optional additional metadata
+            original_file_content: Original file content for hash calculation
         Returns:
             Tuple of (success, message, ingestion_stats)
             if not markdown_content or not markdown_content.strip():
                 return False, "No content provided for ingestion", {}
+            # Create file hash using original content if available, otherwise use markdown content
+            file_content_for_hash = original_file_content or markdown_content
+            file_hash = self.create_file_hash(file_content_for_hash)
+            # Check for duplicates in vector store
+            is_duplicate = self.check_duplicate_in_vector_store(file_hash)
+            replacement_mode = False
+            if is_duplicate:
+                logger.info(f"Document with hash {file_hash} already exists, replacing...")
+                # Delete existing document
+                if self.delete_existing_document(file_hash):
+                    replacement_mode = True
+                else:
+                    return False, "Failed to replace existing document", {"status": "error"}
+            # Prepare document metadata with file hash
             doc_metadata = self.prepare_document_metadata(
                 source_path=source_path,
                 doc_type="markdown",
                 additional_metadata=metadata
             )
+            doc_metadata["file_hash"] = file_hash
             doc_metadata["content_length"] = len(markdown_content)
+            doc_metadata["upload_timestamp"] = datetime.now().isoformat()
             # Chunk the document using markdown-aware chunking
+            logger.info(f"Chunking document: {file_hash}")
             chunks = document_chunker.chunk_document(markdown_content, doc_metadata)
             if not chunks:
             if not doc_ids:
                 return False, "Failed to add documents to vector store", {}
             # Prepare ingestion statistics
             ingestion_stats = {
                 "status": "success",
+                "file_hash": file_hash,
                 "total_chunks": len(chunks),
                 "document_ids": doc_ids,
                 "content_length": len(markdown_content),
                 "has_tables": any(chunk.metadata.get("has_table", False) for chunk in chunks),
                 "has_code": any(chunk.metadata.get("has_code", False) for chunk in chunks),
+                "processed_at": datetime.now().isoformat(),
+                "replacement_mode": replacement_mode
             }
+            action = "Updated existing" if replacement_mode else "Successfully ingested"
+            success_msg = f"{action} document with {len(chunks)} chunks"
+            logger.info(f"{success_msg}: {file_hash}")
             return True, success_msg, ingestion_stats
             # Extract metadata from conversion result
             original_filename = conversion_result.get("original_filename", "unknown")
             conversion_method = conversion_result.get("conversion_method", "unknown")
+            original_file_content = conversion_result.get("original_file_content")
             additional_metadata = {
                 "original_filename": original_filename,
                 "conversion_time": conversion_result.get("conversion_time", 0)
             }
+            # Ingest the markdown content with original file content for proper hashing
             return self.ingest_markdown_content(
                 markdown_content=markdown_content,
                 source_path=original_filename,
+                metadata=additional_metadata,
+                original_file_content=original_file_content
             )
         except Exception as e:
             Dictionary with system status information
         """
         status = {
+            "processed_documents": 0,  # Will be updated from vector store
             "embedding_model_available": False,
             "vector_store_available": False,
             "system_ready": False
         return status
     def test_ingestion_pipeline(self) -> Dict[str, Any]:
         """

src/services/document_service.py CHANGED Viewed

@@ -118,8 +118,8 @@ class DocumentService:
         return content
-    def _create_output_file(self, content: str, output_format: str) -> str:
-        """Create output file with proper extension."""
         # Determine file extension
         format_extensions = {
             "markdown": ".md",
@@ -132,18 +132,47 @@ class DocumentService:
         if self._check_cancellation():
             raise ConversionError("Conversion cancelled before output file creation")
-        # Create temporary output file
-        with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False, encoding="utf-8") as tmp:
-            tmp_path = tmp.name
-            # Write in chunks with cancellation checks
-            chunk_size = 10000  # characters
-            for i in range(0, len(content), chunk_size):
-                if self._check_cancellation():
-                    self._safe_delete_file(tmp_path)
-                    raise ConversionError("Conversion cancelled during output file writing")
-                tmp.write(content[i:i+chunk_size])
         return tmp_path
@@ -218,7 +247,7 @@ class DocumentService:
                 raise ConversionError("Conversion cancelled")
             # Create output file
-            output_path = self._create_output_file(content, output_format)
             return content, output_path

         return content
+    def _create_output_file(self, content: str, output_format: str, original_file_path: Optional[str] = None) -> str:
+        """Create output file with proper extension and preserved filename."""
         # Determine file extension
         format_extensions = {
             "markdown": ".md",
         if self._check_cancellation():
             raise ConversionError("Conversion cancelled before output file creation")
+        # Create output filename based on original filename if provided
+        if original_file_path:
+            original_name = Path(original_file_path).stem  # Get filename without extension
+            # Clean the filename to be filesystem-safe while preserving spaces and common characters
+            clean_name = "".join(c for c in original_name if c.isalnum() or c in (' ', '-', '_', '.', '(', ')')).strip()
+            # Replace multiple spaces with single spaces
+            clean_name = ' '.join(clean_name.split())
+            if not clean_name:  # Fallback if cleaning removes everything
+                clean_name = "converted_document"
+            # Create output file in temp directory with proper name
+            temp_dir = tempfile.gettempdir()
+            output_filename = f"{clean_name}{ext}"
+            tmp_path = os.path.join(temp_dir, output_filename)
+            # Handle filename conflicts by adding a number suffix
+            counter = 1
+            base_path = tmp_path
+            while os.path.exists(tmp_path):
+                name_part = f"{clean_name}_{counter}"
+                tmp_path = os.path.join(temp_dir, f"{name_part}{ext}")
+                counter += 1
+        else:
+            # Fallback to random temporary file
+            with tempfile.NamedTemporaryFile(mode="w", suffix=ext, delete=False, encoding="utf-8") as tmp:
+                tmp_path = tmp.name
+        # Write content to file
+        try:
+            with open(tmp_path, "w", encoding="utf-8") as f:
+                # Write in chunks with cancellation checks
+                chunk_size = 10000  # characters
+                for i in range(0, len(content), chunk_size):
+                    if self._check_cancellation():
+                        self._safe_delete_file(tmp_path)
+                        raise ConversionError("Conversion cancelled during output file writing")
+                    f.write(content[i:i+chunk_size])
+        except Exception as e:
+            self._safe_delete_file(tmp_path)
+            raise ConversionError(f"Failed to write output file: {str(e)}")
         return tmp_path
                 raise ConversionError("Conversion cancelled")
             # Create output file
+            output_path = self._create_output_file(content, output_format, file_path)
             return content, output_path

src/ui/ui.py CHANGED Viewed

@@ -170,12 +170,22 @@ def handle_convert(file_path, parser_name, ocr_method_name, output_format, is_ca
     # Auto-ingest the converted document for RAG
     try:
         conversion_result = {
             "markdown_content": content,
             "original_filename": Path(file_path).name if file_path else "unknown",
             "conversion_method": parser_name,
             "file_size": Path(file_path).stat().st_size if file_path and Path(file_path).exists() else 0,
-            "conversion_time": 0  # Could be tracked if needed
         }
         success, ingestion_msg, stats = document_ingestion_service.ingest_from_conversion_result(conversion_result)

     # Auto-ingest the converted document for RAG
     try:
+        # Read original file content for proper deduplication hashing
+        original_file_content = None
+        if file_path and Path(file_path).exists():
+            try:
+                with open(file_path, 'rb') as f:
+                    original_file_content = f.read().decode('utf-8', errors='ignore')
+            except Exception as e:
+                logger.warning(f"Could not read original file content: {e}")
         conversion_result = {
             "markdown_content": content,
             "original_filename": Path(file_path).name if file_path else "unknown",
             "conversion_method": parser_name,
             "file_size": Path(file_path).stat().st_size if file_path and Path(file_path).exists() else 0,
+            "conversion_time": 0,  # Could be tracked if needed
+            "original_file_content": original_file_content
         }
         success, ingestion_msg, stats = document_ingestion_service.ingest_from_conversion_result(conversion_result)