AnseMin committed · commit fa54d05 · 1 parent: c4c3253

Commit message: "Error: Too many output"

Files changed (1): src/parsers/got_ocr_parser.py (+65, −49)
src/parsers/got_ocr_parser.py CHANGED
@@ -21,7 +21,8 @@ class GotOcrParser(DocumentParser):
     """
 
     _model = None
-    _tokenizer = None
+    _processor = None
+    _stop_str = "<|im_end|>"
 
     @classmethod
     def get_name(cls) -> str:
@@ -67,19 +68,13 @@ class GotOcrParser(DocumentParser):
     @classmethod
     def _load_model(cls):
         """Load the GOT-OCR model and tokenizer if not already loaded."""
-        if cls._model is None or cls._tokenizer is None:
+        if cls._model is None or cls._processor is None:
             try:
                 # Import dependencies inside the method to avoid global import errors
                 import torch
-                from transformers import AutoModel, AutoTokenizer
+                from transformers import AutoModelForImageTextToText, AutoProcessor
 
-                logger.info("Loading GOT-OCR model and tokenizer...")
-
-                # Load tokenizer
-                cls._tokenizer = AutoTokenizer.from_pretrained(
-                    'stepfun-ai/GOT-OCR2_0',
-                    trust_remote_code=True
-                )
+                logger.info("Loading GOT-OCR model and processor...")
 
                 # Determine device
                 device_map = 'cuda' if torch.cuda.is_available() else 'auto'
@@ -88,15 +83,17 @@ class GotOcrParser(DocumentParser):
                 else:
                     logger.warning("Using CPU for model inference (not recommended)")
 
+                # Load the processor (includes tokenizer)
+                cls._processor = AutoProcessor.from_pretrained(
+                    'stepfun-ai/GOT-OCR2_0-hf'
+                )
+
                 # Load model with explicit float16 for T4 compatibility
-                cls._model = AutoModel.from_pretrained(
-                    'stepfun-ai/GOT-OCR2_0',
-                    trust_remote_code=True,
+                cls._model = AutoModelForImageTextToText.from_pretrained(
+                    'stepfun-ai/GOT-OCR2_0-hf',
                     low_cpu_mem_usage=True,
                     device_map=device_map,
-                    use_safetensors=True,
-                    torch_dtype=torch.float16,  # Force float16 for T4 compatibility
-                    pad_token_id=cls._tokenizer.eos_token_id
+                    torch_dtype=torch.float16  # Force float16 for T4 compatibility
                 )
 
                 # Explicitly convert model to half precision (float16)
@@ -107,7 +104,6 @@ class GotOcrParser(DocumentParser):
                     cls._model = cls._model.cuda()
 
                 # Patch torch.autocast to force float16 instead of bfloat16
-                # This fixes the issue in the model's chat method (line 581)
                 original_autocast = torch.autocast
                 def patched_autocast(*args, **kwargs):
                     # Force dtype to float16 when CUDA is involved
@@ -123,7 +119,7 @@ class GotOcrParser(DocumentParser):
                 return True
             except Exception as e:
                 cls._model = None
-                cls._tokenizer = None
+                cls._processor = None
                 logger.error(f"Failed to load GOT-OCR model: {str(e)}")
                 return False
         return True
@@ -138,9 +134,9 @@ class GotOcrParser(DocumentParser):
             del cls._model
             cls._model = None
 
-        if cls._tokenizer is not None:
-            del cls._tokenizer
-            cls._tokenizer = None
+        if cls._processor is not None:
+            del cls._processor
+            cls._processor = None
 
         # Clear CUDA cache if available
         if torch.cuda.is_available():
@@ -175,6 +171,7 @@ class GotOcrParser(DocumentParser):
 
         # Import torch here to ensure it's available
        import torch
+        from transformers.image_utils import load_image
 
         # Validate file path and extension
         file_path = Path(file_path)
@@ -187,31 +184,43 @@ class GotOcrParser(DocumentParser):
                 f"Received file with extension: {file_path.suffix}"
             )
 
-        # Determine OCR type based on method
-        ocr_type = "format" if ocr_method == "format" else "ocr"
-        logger.info(f"Using OCR method: {ocr_type}")
+        # Determine format flag based on OCR method
+        format_flag = ocr_method == "format"
+        logger.info(f"Using OCR method: {'format' if format_flag else 'plain'}")
 
         # Process the image
         try:
             logger.info(f"Processing image with GOT-OCR: {file_path}")
 
+            # Load image with transformers utils
+            image = load_image(str(file_path))
+
             # First attempt: Normal processing with autocast
             try:
                 with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
-                    # Use format=True parameter when ocr_type is "format"
-                    if ocr_type == "format":
-                        result = self._model.chat(
-                            self._tokenizer,
-                            str(file_path),
-                            ocr_type='format'
-                        )
+                    # Process image with format flag if needed
+                    if format_flag:
+                        inputs = self._processor(image, return_tensors="pt", format=True).to("cuda")
                     else:
-                        result = self._model.chat(
-                            self._tokenizer,
-                            str(file_path),
-                            ocr_type='ocr'
-                        )
-                    return result
+                        inputs = self._processor(image, return_tensors="pt").to("cuda")
+
+                    # Generate text
+                    generate_ids = self._model.generate(
+                        **inputs,
+                        do_sample=False,
+                        tokenizer=self._processor.tokenizer,
+                        stop_strings=self._stop_str,
+                        max_new_tokens=4096,
+                    )
+
+                    # Decode the generated text
+                    result = self._processor.decode(
+                        generate_ids[0, inputs["input_ids"].shape[1]:],
+                        skip_special_tokens=True,
+                    )
+
+                    return result
+
            except RuntimeError as e:
                 # Check if it's a bfloat16 error
                 if "bfloat16" in str(e) or "BFloat16" in str(e):
@@ -227,19 +236,26 @@ class GotOcrParser(DocumentParser):
                     torch.set_default_dtype(torch.float16)
 
                     with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
-                        # Use format=True parameter when ocr_type is "format"
-                        if ocr_type == "format":
-                            result = self._model.chat(
-                                self._tokenizer,
-                                str(file_path),
-                                ocr_type='format'
-                            )
+                        # Process image with format flag if needed
+                        if format_flag:
+                            inputs = self._processor(image, return_tensors="pt", format=True).to("cuda")
                         else:
-                            result = self._model.chat(
-                                self._tokenizer,
-                                str(file_path),
-                                ocr_type='ocr'
-                            )
+                            inputs = self._processor(image, return_tensors="pt").to("cuda")
+
+                        # Generate text
+                        generate_ids = self._model.generate(
+                            **inputs,
+                            do_sample=False,
+                            tokenizer=self._processor.tokenizer,
+                            stop_strings=self._stop_str,
+                            max_new_tokens=4096,
+                        )
+
+                        # Decode the generated text
+                        result = self._processor.decode(
+                            generate_ids[0, inputs["input_ids"].shape[1]:],
+                            skip_special_tokens=True,
+                        )
 
                     # Restore default dtype
                     torch.set_default_dtype(original_dtype)
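
For reference, the HF-native inference path this commit adopts can be exercised outside the parser. The following is a minimal sketch, assuming transformers >= 4.48 (which ships AutoModelForImageTextToText and the GOT-OCR2 processor) and a CUDA device; the checkpoint id, generation arguments, and stop string are taken from the diff above, while 'sample.png' is a hypothetical input file:

# Minimal standalone sketch of the new GOT-OCR2 inference path.
# Assumptions: transformers >= 4.48, a CUDA-capable GPU, and a
# hypothetical input file 'sample.png'.
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from transformers.image_utils import load_image

MODEL_ID = 'stepfun-ai/GOT-OCR2_0-hf'

# The processor bundles the image preprocessor and the tokenizer
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    low_cpu_mem_usage=True,
    device_map='cuda',
    torch_dtype=torch.float16,  # float16 rather than bfloat16 for T4-class GPUs
)

image = load_image('sample.png')

# format=True requests formatted output; drop it for plain OCR,
# matching the parser's format_flag branch above.
inputs = processor(image, return_tensors="pt", format=True).to('cuda')

generate_ids = model.generate(
    **inputs,
    do_sample=False,
    tokenizer=processor.tokenizer,  # required for stop_strings to apply
    stop_strings="<|im_end|>",      # the parser's _stop_str
    max_new_tokens=4096,
)

# Slice off the echoed prompt so only newly generated text is decoded
result = processor.decode(
    generate_ids[0, inputs["input_ids"].shape[1]:],
    skip_special_tokens=True,
)
print(result)

Decoding from inputs["input_ids"].shape[1] onward strips the prompt tokens that generate() echoes back, which is why the parser returns only the OCR text rather than the full sequence.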