AnseMin committed on
Commit
5bb2b30
·
1 Parent(s): c9c21c7

Changes to logging for better debugging

Browse files
Files changed (2) hide show
  1. setup.sh +44 -1
  2. src/parsers/got_ocr_parser.py +73 -80
setup.sh CHANGED
@@ -5,6 +5,9 @@ set -e
5
 
6
  echo "Starting setup process..."
7
 
 
 
 
8
  # Check if running with sudo/root permissions for system dependencies
9
  if [ "$EUID" -eq 0 ]; then
10
  # Install system dependencies
@@ -12,7 +15,8 @@ if [ "$EUID" -eq 0 ]; then
12
  apt-get update && apt-get install -y \
13
  wget \
14
  pkg-config \
15
- git
 
16
  echo "System dependencies installed successfully"
17
  else
18
  echo "Not running as root. Skipping system dependencies installation."
@@ -41,6 +45,42 @@ echo "Installing Hugging Face CLI..."
41
  pip install -q -U "huggingface_hub[cli]"
42
  echo "Hugging Face CLI installed successfully"
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  # Install the project in development mode only if setup.py or pyproject.toml exists
45
  if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
46
  echo "Installing project in development mode..."
@@ -62,4 +102,7 @@ if [ ! -f .env ]; then
62
  fi
63
  fi
64
 
 
 
 
65
  echo "Setup process completed successfully!"
 
5
 
6
  echo "Starting setup process..."
7
 
8
+ # Enable more verbose logging
9
+ set -x
10
+
11
  # Check if running with sudo/root permissions for system dependencies
12
  if [ "$EUID" -eq 0 ]; then
13
  # Install system dependencies
 
15
  apt-get update && apt-get install -y \
16
  wget \
17
  pkg-config \
18
+ git \
19
+ tree # Add tree for directory structure visualization
20
  echo "System dependencies installed successfully"
21
  else
22
  echo "Not running as root. Skipping system dependencies installation."
 
45
  pip install -q -U "huggingface_hub[cli]"
46
  echo "Hugging Face CLI installed successfully"
47
 
48
+ # Add debug section for GOT-OCR repo
49
+ echo "===== GOT-OCR Repository Debugging ====="
50
+
51
+ # Clone the repository for inspection (if it doesn't exist)
52
+ TEMP_DIR="/tmp"
53
+ REPO_DIR="${TEMP_DIR}/GOT-OCR2.0"
54
+
55
+ if [ ! -d "$REPO_DIR" ]; then
56
+ echo "Cloning GOT-OCR2.0 repository for debugging..."
57
+ git clone https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git "$REPO_DIR"
58
+ else
59
+ echo "GOT-OCR2.0 repository already exists at $REPO_DIR"
60
+ fi
61
+
62
+ # Check the repository structure
63
+ echo "GOT-OCR2.0 repository structure:"
64
+ if command -v tree &> /dev/null; then
65
+ tree -L 3 "$REPO_DIR"
66
+ else
67
+ find "$REPO_DIR" -type d -maxdepth 3 | sort
68
+ fi
69
+
70
+ # Check if the demo script exists
71
+ DEMO_SCRIPT="${REPO_DIR}/GOT/demo/run_ocr_2.0.py"
72
+ if [ -f "$DEMO_SCRIPT" ]; then
73
+ echo "Demo script found at: $DEMO_SCRIPT"
74
+ else
75
+ echo "ERROR: Demo script not found at: $DEMO_SCRIPT"
76
+
77
+ # Search for the script in the repository
78
+ echo "Searching for run_ocr_2.0.py in the repository..."
79
+ find "$REPO_DIR" -name "run_ocr_2.0.py" -type f
80
+ fi
81
+
82
+ echo "===== End of GOT-OCR Debugging ====="
83
+
84
  # Install the project in development mode only if setup.py or pyproject.toml exists
85
  if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
86
  echo "Installing project in development mode..."
 
102
  fi
103
  fi
104
 
105
+ # Return to normal logging
106
+ set +x
107
+
108
  echo "Setup process completed successfully!"
src/parsers/got_ocr_parser.py CHANGED
@@ -15,6 +15,8 @@ import latex2markdown
15
 
16
  # Configure logging
17
  logger = logging.getLogger(__name__)
 
 
18
 
19
  class GotOcrParser(DocumentParser):
20
  """Parser implementation using GOT-OCR 2.0 for document text extraction using GitHub repository.
@@ -26,7 +28,6 @@ class GotOcrParser(DocumentParser):
26
  # Path to the GOT-OCR repository
27
  _repo_path = None
28
  _weights_path = None
29
- _demo_script_path = None
30
 
31
  @classmethod
32
  def get_name(cls) -> str:
@@ -79,69 +80,68 @@ class GotOcrParser(DocumentParser):
79
  logger.error(f"Missing dependency: {e}")
80
  return False
81
 
82
- @classmethod
83
- def _find_demo_script(cls, base_dir):
84
- """Find the run_ocr_2.0.py script by searching the repository.
85
-
86
- Args:
87
- base_dir: The base directory to start searching from
88
-
89
- Returns:
90
- Path to the script if found, None otherwise
91
- """
92
- logger.info(f"Searching for run_ocr_2.0.py in {base_dir}")
93
- script_paths = []
94
-
95
- # Walk through all directories and find all instances of run_ocr_2.0.py
96
- for root, dirs, files in os.walk(base_dir):
97
- if "run_ocr_2.0.py" in files:
98
- script_path = os.path.join(root, "run_ocr_2.0.py")
99
- script_paths.append(script_path)
100
- logger.info(f"Found run_ocr_2.0.py at: {script_path}")
101
-
102
- if not script_paths:
103
- logger.error("Could not find run_ocr_2.0.py in the repository")
104
- return None
105
-
106
- # If there are multiple instances, try to find the one in demo folder
107
- for path in script_paths:
108
- if os.path.join("demo", "run_ocr_2.0.py") in path:
109
- logger.info(f"Selected demo script at: {path}")
110
- return path
111
-
112
- # If no clear demo folder, just use the first one found
113
- logger.info(f"Selected demo script at: {script_paths[0]}")
114
- return script_paths[0]
115
-
116
  @classmethod
117
  def _setup_repository(cls) -> bool:
118
  """Set up the GOT-OCR2.0 repository if it's not already set up."""
119
- if cls._repo_path is not None and os.path.exists(cls._repo_path) and cls._demo_script_path is not None:
 
120
  return True
121
 
122
  try:
123
  # Create a temporary directory for the repository
124
  repo_dir = os.path.join(tempfile.gettempdir(), "GOT-OCR2.0")
 
125
 
126
  # Check if the repository already exists
127
  if not os.path.exists(repo_dir):
128
- logger.info("Cloning GOT-OCR2.0 repository...")
129
  subprocess.run(
130
  ["git", "clone", "https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git", repo_dir],
131
  check=True
132
  )
133
  else:
134
- logger.info("GOT-OCR2.0 repository already exists, skipping clone")
135
 
136
  cls._repo_path = repo_dir
137
 
138
- # Find the demo script
139
- cls._demo_script_path = cls._find_demo_script(repo_dir)
140
- if cls._demo_script_path is None:
141
- logger.error("Could not find the run_ocr_2.0.py script in the cloned repository")
142
- return False
143
-
144
- logger.info(f"Using demo script: {cls._demo_script_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
  # Set up the weights directory
147
  weights_dir = os.path.join(repo_dir, "GOT_weights")
@@ -149,6 +149,7 @@ class GotOcrParser(DocumentParser):
149
  os.makedirs(weights_dir, exist_ok=True)
150
 
151
  cls._weights_path = weights_dir
 
152
 
153
  # Check if weights exist, if not download them
154
  weight_files = [f for f in os.listdir(weights_dir) if f.endswith(".bin") or f.endswith(".safetensors")]
@@ -221,17 +222,35 @@ class GotOcrParser(DocumentParser):
221
  try:
222
  logger.info(f"Processing image with GOT-OCR: {file_path}")
223
 
224
- # Check if demo script exists
225
- if not self._demo_script_path or not os.path.exists(self._demo_script_path):
226
- logger.warning("Demo script path not found, trying to locate it again")
227
- self._demo_script_path = self._find_demo_script(self._repo_path)
228
- if not self._demo_script_path:
229
- raise RuntimeError("Could not find the run_ocr_2.0.py script in the repository")
230
-
231
  # Create the command for running the GOT-OCR script
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  cmd = [
233
  sys.executable,
234
- self._demo_script_path,
235
  "--model-name", self._weights_path,
236
  "--image-file", str(file_path),
237
  "--type", ocr_type
@@ -263,18 +282,7 @@ class GotOcrParser(DocumentParser):
263
  # If render was requested, find and return the path to the HTML file
264
  if render:
265
  # The rendered results are in /results/demo.html according to the README
266
- results_dir = os.path.join(os.path.dirname(self._demo_script_path), "..", "..", "results")
267
- if not os.path.exists(results_dir):
268
- # Try to find results directory
269
- for root, dirs, files in os.walk(self._repo_path):
270
- if "demo.html" in files:
271
- html_result_path = os.path.join(root, "demo.html")
272
- logger.info(f"Found rendered HTML at: {html_result_path}")
273
- with open(html_result_path, 'r') as f:
274
- html_content = f.read()
275
- return html_content
276
-
277
- html_result_path = os.path.join(results_dir, "demo.html")
278
  if os.path.exists(html_result_path):
279
  with open(html_result_path, 'r') as f:
280
  html_content = f.read()
@@ -294,21 +302,6 @@ class GotOcrParser(DocumentParser):
294
  except subprocess.CalledProcessError as e:
295
  logger.error(f"Error running GOT-OCR command: {str(e)}")
296
  logger.error(f"Stderr: {e.stderr}")
297
-
298
- # Print repository structure for debugging
299
- logger.error("Repository structure for debugging:")
300
- try:
301
- subprocess.run(
302
- ["find", self._repo_path, "-type", "f", "-name", "*.py"],
303
- check=True,
304
- capture_output=True,
305
- text=True
306
- )
307
- structure_output = subprocess.getoutput(f"find {self._repo_path} -type f -name '*.py'")
308
- logger.error(f"Python files in repository:\n{structure_output}")
309
- except Exception as debug_e:
310
- logger.error(f"Error getting repository structure: {debug_e}")
311
-
312
  raise RuntimeError(f"Error processing document with GOT-OCR: {str(e)}")
313
 
314
  except Exception as e:
 
15
 
16
  # Configure logging
17
  logger = logging.getLogger(__name__)
18
+ # Set logger level to DEBUG for more verbose output
19
+ logger.setLevel(logging.DEBUG)
20
 
21
  class GotOcrParser(DocumentParser):
22
  """Parser implementation using GOT-OCR 2.0 for document text extraction using GitHub repository.
 
28
  # Path to the GOT-OCR repository
29
  _repo_path = None
30
  _weights_path = None
 
31
 
32
  @classmethod
33
  def get_name(cls) -> str:
 
80
  logger.error(f"Missing dependency: {e}")
81
  return False
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  @classmethod
84
  def _setup_repository(cls) -> bool:
85
  """Set up the GOT-OCR2.0 repository if it's not already set up."""
86
+ if cls._repo_path is not None and os.path.exists(cls._repo_path):
87
+ logger.debug(f"Repository already set up at: {cls._repo_path}")
88
  return True
89
 
90
  try:
91
  # Create a temporary directory for the repository
92
  repo_dir = os.path.join(tempfile.gettempdir(), "GOT-OCR2.0")
93
+ logger.debug(f"Repository directory: {repo_dir}")
94
 
95
  # Check if the repository already exists
96
  if not os.path.exists(repo_dir):
97
+ logger.info(f"Cloning GOT-OCR2.0 repository to {repo_dir}...")
98
  subprocess.run(
99
  ["git", "clone", "https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git", repo_dir],
100
  check=True
101
  )
102
  else:
103
+ logger.info(f"GOT-OCR2.0 repository already exists at {repo_dir}, skipping clone")
104
 
105
  cls._repo_path = repo_dir
106
 
107
+ # Debug: List repository contents
108
+ logger.debug("Repository contents:")
109
+ try:
110
+ result = subprocess.run(
111
+ ["find", repo_dir, "-type", "d", "-maxdepth", "3"],
112
+ check=True,
113
+ capture_output=True,
114
+ text=True
115
+ )
116
+ for line in result.stdout.splitlines():
117
+ logger.debug(f" {line}")
118
+ except Exception as e:
119
+ logger.warning(f"Could not list repository contents: {e}")
120
+
121
+ # Check if the demo script exists
122
+ demo_script = os.path.join(repo_dir, "GOT", "demo", "run_ocr_2.0.py")
123
+ if os.path.exists(demo_script):
124
+ logger.info(f"Found demo script at: {demo_script}")
125
+ else:
126
+ logger.warning(f"Demo script not found at expected path: {demo_script}")
127
+ # Try to find it
128
+ logger.info("Searching for run_ocr_2.0.py in the repository...")
129
+ try:
130
+ find_result = subprocess.run(
131
+ ["find", repo_dir, "-name", "run_ocr_2.0.py", "-type", "f"],
132
+ check=True,
133
+ capture_output=True,
134
+ text=True
135
+ )
136
+ if find_result.stdout.strip():
137
+ found_paths = find_result.stdout.strip().splitlines()
138
+ logger.info(f"Found script at alternative locations: {found_paths}")
139
+ # Use the first found path as fallback
140
+ if found_paths:
141
+ alternative_path = found_paths[0]
142
+ logger.info(f"Using alternative path: {alternative_path}")
143
+ except Exception as e:
144
+ logger.warning(f"Could not search for script: {e}")
145
 
146
  # Set up the weights directory
147
  weights_dir = os.path.join(repo_dir, "GOT_weights")
 
149
  os.makedirs(weights_dir, exist_ok=True)
150
 
151
  cls._weights_path = weights_dir
152
+ logger.debug(f"Weights directory: {weights_dir}")
153
 
154
  # Check if weights exist, if not download them
155
  weight_files = [f for f in os.listdir(weights_dir) if f.endswith(".bin") or f.endswith(".safetensors")]
 
222
  try:
223
  logger.info(f"Processing image with GOT-OCR: {file_path}")
224
 
 
 
 
 
 
 
 
225
  # Create the command for running the GOT-OCR script
226
+ script_path = os.path.join(self._repo_path, "GOT", "demo", "run_ocr_2.0.py")
227
+
228
+ # Check if the script exists at the expected path
229
+ if not os.path.exists(script_path):
230
+ logger.error(f"Script not found at: {script_path}")
231
+
232
+ # Try to find the script within the repository
233
+ logger.info("Searching for run_ocr_2.0.py in the repository...")
234
+ try:
235
+ find_result = subprocess.run(
236
+ ["find", self._repo_path, "-name", "run_ocr_2.0.py", "-type", "f"],
237
+ check=True,
238
+ capture_output=True,
239
+ text=True
240
+ )
241
+ found_paths = find_result.stdout.strip().splitlines()
242
+ if found_paths:
243
+ script_path = found_paths[0]
244
+ logger.info(f"Found script at alternative location: {script_path}")
245
+ else:
246
+ raise FileNotFoundError(f"Could not find run_ocr_2.0.py in repository: {self._repo_path}")
247
+ except Exception as search_e:
248
+ logger.error(f"Error searching for script: {str(search_e)}")
249
+ raise FileNotFoundError(f"Script not found and search failed: {str(search_e)}")
250
+
251
  cmd = [
252
  sys.executable,
253
+ script_path,
254
  "--model-name", self._weights_path,
255
  "--image-file", str(file_path),
256
  "--type", ocr_type
 
282
  # If render was requested, find and return the path to the HTML file
283
  if render:
284
  # The rendered results are in /results/demo.html according to the README
285
+ html_result_path = os.path.join(self._repo_path, "results", "demo.html")
 
 
 
 
 
 
 
 
 
 
 
286
  if os.path.exists(html_result_path):
287
  with open(html_result_path, 'r') as f:
288
  html_content = f.read()
 
302
  except subprocess.CalledProcessError as e:
303
  logger.error(f"Error running GOT-OCR command: {str(e)}")
304
  logger.error(f"Stderr: {e.stderr}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
  raise RuntimeError(f"Error processing document with GOT-OCR: {str(e)}")
306
 
307
  except Exception as e: