Spaces:
Sleeping
Sleeping
Changes to logging for better debugging
Browse files
- setup.sh +44 -1
- src/parsers/got_ocr_parser.py +73 -80
setup.sh
CHANGED
@@ -5,6 +5,9 @@ set -e
|
|
5 |
|
6 |
echo "Starting setup process..."
|
7 |
|
|
|
|
|
|
|
8 |
# Check if running with sudo/root permissions for system dependencies
|
9 |
if [ "$EUID" -eq 0 ]; then
|
10 |
# Install system dependencies
|
@@ -12,7 +15,8 @@ if [ "$EUID" -eq 0 ]; then
|
|
12 |
apt-get update && apt-get install -y \
|
13 |
wget \
|
14 |
pkg-config \
|
15 |
-
git
|
|
|
16 |
echo "System dependencies installed successfully"
|
17 |
else
|
18 |
echo "Not running as root. Skipping system dependencies installation."
|
@@ -41,6 +45,42 @@ echo "Installing Hugging Face CLI..."
|
|
41 |
pip install -q -U "huggingface_hub[cli]"
|
42 |
echo "Hugging Face CLI installed successfully"
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
# Install the project in development mode only if setup.py or pyproject.toml exists
|
45 |
if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
|
46 |
echo "Installing project in development mode..."
|
@@ -62,4 +102,7 @@ if [ ! -f .env ]; then
|
|
62 |
fi
|
63 |
fi
|
64 |
|
|
|
|
|
|
|
65 |
echo "Setup process completed successfully!"
|
|
|
5 |
|
6 |
echo "Starting setup process..."
|
7 |
|
8 |
+
# Enable more verbose logging
|
9 |
+
set -x
|
10 |
+
|
11 |
# Check if running with sudo/root permissions for system dependencies
|
12 |
if [ "$EUID" -eq 0 ]; then
|
13 |
# Install system dependencies
|
|
|
15 |
apt-get update && apt-get install -y \
|
16 |
wget \
|
17 |
pkg-config \
|
18 |
+
git \
|
19 |
+
tree # Add tree for directory structure visualization
|
20 |
echo "System dependencies installed successfully"
|
21 |
else
|
22 |
echo "Not running as root. Skipping system dependencies installation."
|
|
|
45 |
pip install -q -U "huggingface_hub[cli]"
|
46 |
echo "Hugging Face CLI installed successfully"
|
47 |
|
48 |
+
# Add debug section for GOT-OCR repo
|
49 |
+
echo "===== GOT-OCR Repository Debugging ====="
|
50 |
+
|
51 |
+
# Clone the repository for inspection (if it doesn't exist)
|
52 |
+
TEMP_DIR="/tmp"
|
53 |
+
REPO_DIR="${TEMP_DIR}/GOT-OCR2.0"
|
54 |
+
|
55 |
+
if [ ! -d "$REPO_DIR" ]; then
|
56 |
+
echo "Cloning GOT-OCR2.0 repository for debugging..."
|
57 |
+
git clone https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git "$REPO_DIR"
|
58 |
+
else
|
59 |
+
echo "GOT-OCR2.0 repository already exists at $REPO_DIR"
|
60 |
+
fi
|
61 |
+
|
62 |
+
# Check the repository structure
|
63 |
+
echo "GOT-OCR2.0 repository structure:"
|
64 |
+
if command -v tree &> /dev/null; then
|
65 |
+
tree -L 3 "$REPO_DIR"
|
66 |
+
else
|
67 |
+
find "$REPO_DIR" -type d -maxdepth 3 | sort
|
68 |
+
fi
|
69 |
+
|
70 |
+
# Check if the demo script exists
|
71 |
+
DEMO_SCRIPT="${REPO_DIR}/GOT/demo/run_ocr_2.0.py"
|
72 |
+
if [ -f "$DEMO_SCRIPT" ]; then
|
73 |
+
echo "Demo script found at: $DEMO_SCRIPT"
|
74 |
+
else
|
75 |
+
echo "ERROR: Demo script not found at: $DEMO_SCRIPT"
|
76 |
+
|
77 |
+
# Search for the script in the repository
|
78 |
+
echo "Searching for run_ocr_2.0.py in the repository..."
|
79 |
+
find "$REPO_DIR" -name "run_ocr_2.0.py" -type f
|
80 |
+
fi
|
81 |
+
|
82 |
+
echo "===== End of GOT-OCR Debugging ====="
|
83 |
+
|
84 |
# Install the project in development mode only if setup.py or pyproject.toml exists
|
85 |
if [ -f "setup.py" ] || [ -f "pyproject.toml" ]; then
|
86 |
echo "Installing project in development mode..."
|
|
|
102 |
fi
|
103 |
fi
|
104 |
|
105 |
+
# Return to normal logging
|
106 |
+
set +x
|
107 |
+
|
108 |
echo "Setup process completed successfully!"
|
src/parsers/got_ocr_parser.py
CHANGED
@@ -15,6 +15,8 @@ import latex2markdown
|
|
15 |
|
16 |
# Configure logging
|
17 |
logger = logging.getLogger(__name__)
|
|
|
|
|
18 |
|
19 |
class GotOcrParser(DocumentParser):
|
20 |
"""Parser implementation using GOT-OCR 2.0 for document text extraction using GitHub repository.
|
@@ -26,7 +28,6 @@ class GotOcrParser(DocumentParser):
|
|
26 |
# Path to the GOT-OCR repository
|
27 |
_repo_path = None
|
28 |
_weights_path = None
|
29 |
-
_demo_script_path = None
|
30 |
|
31 |
@classmethod
|
32 |
def get_name(cls) -> str:
|
@@ -79,69 +80,68 @@ class GotOcrParser(DocumentParser):
|
|
79 |
logger.error(f"Missing dependency: {e}")
|
80 |
return False
|
81 |
|
82 |
-
@classmethod
|
83 |
-
def _find_demo_script(cls, base_dir):
|
84 |
-
"""Find the run_ocr_2.0.py script by searching the repository.
|
85 |
-
|
86 |
-
Args:
|
87 |
-
base_dir: The base directory to start searching from
|
88 |
-
|
89 |
-
Returns:
|
90 |
-
Path to the script if found, None otherwise
|
91 |
-
"""
|
92 |
-
logger.info(f"Searching for run_ocr_2.0.py in {base_dir}")
|
93 |
-
script_paths = []
|
94 |
-
|
95 |
-
# Walk through all directories and find all instances of run_ocr_2.0.py
|
96 |
-
for root, dirs, files in os.walk(base_dir):
|
97 |
-
if "run_ocr_2.0.py" in files:
|
98 |
-
script_path = os.path.join(root, "run_ocr_2.0.py")
|
99 |
-
script_paths.append(script_path)
|
100 |
-
logger.info(f"Found run_ocr_2.0.py at: {script_path}")
|
101 |
-
|
102 |
-
if not script_paths:
|
103 |
-
logger.error("Could not find run_ocr_2.0.py in the repository")
|
104 |
-
return None
|
105 |
-
|
106 |
-
# If there are multiple instances, try to find the one in demo folder
|
107 |
-
for path in script_paths:
|
108 |
-
if os.path.join("demo", "run_ocr_2.0.py") in path:
|
109 |
-
logger.info(f"Selected demo script at: {path}")
|
110 |
-
return path
|
111 |
-
|
112 |
-
# If no clear demo folder, just use the first one found
|
113 |
-
logger.info(f"Selected demo script at: {script_paths[0]}")
|
114 |
-
return script_paths[0]
|
115 |
-
|
116 |
@classmethod
|
117 |
def _setup_repository(cls) -> bool:
|
118 |
"""Set up the GOT-OCR2.0 repository if it's not already set up."""
|
119 |
-
if cls._repo_path is not None and os.path.exists(cls._repo_path)
|
|
|
120 |
return True
|
121 |
|
122 |
try:
|
123 |
# Create a temporary directory for the repository
|
124 |
repo_dir = os.path.join(tempfile.gettempdir(), "GOT-OCR2.0")
|
|
|
125 |
|
126 |
# Check if the repository already exists
|
127 |
if not os.path.exists(repo_dir):
|
128 |
-
logger.info("Cloning GOT-OCR2.0 repository...")
|
129 |
subprocess.run(
|
130 |
["git", "clone", "https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git", repo_dir],
|
131 |
check=True
|
132 |
)
|
133 |
else:
|
134 |
-
logger.info("GOT-OCR2.0 repository already exists, skipping clone")
|
135 |
|
136 |
cls._repo_path = repo_dir
|
137 |
|
138 |
-
#
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
# Set up the weights directory
|
147 |
weights_dir = os.path.join(repo_dir, "GOT_weights")
|
@@ -149,6 +149,7 @@ class GotOcrParser(DocumentParser):
|
|
149 |
os.makedirs(weights_dir, exist_ok=True)
|
150 |
|
151 |
cls._weights_path = weights_dir
|
|
|
152 |
|
153 |
# Check if weights exist, if not download them
|
154 |
weight_files = [f for f in os.listdir(weights_dir) if f.endswith(".bin") or f.endswith(".safetensors")]
|
@@ -221,17 +222,35 @@ class GotOcrParser(DocumentParser):
|
|
221 |
try:
|
222 |
logger.info(f"Processing image with GOT-OCR: {file_path}")
|
223 |
|
224 |
-
# Check if demo script exists
|
225 |
-
if not self._demo_script_path or not os.path.exists(self._demo_script_path):
|
226 |
-
logger.warning("Demo script path not found, trying to locate it again")
|
227 |
-
self._demo_script_path = self._find_demo_script(self._repo_path)
|
228 |
-
if not self._demo_script_path:
|
229 |
-
raise RuntimeError("Could not find the run_ocr_2.0.py script in the repository")
|
230 |
-
|
231 |
# Create the command for running the GOT-OCR script
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
cmd = [
|
233 |
sys.executable,
|
234 |
-
|
235 |
"--model-name", self._weights_path,
|
236 |
"--image-file", str(file_path),
|
237 |
"--type", ocr_type
|
@@ -263,18 +282,7 @@ class GotOcrParser(DocumentParser):
|
|
263 |
# If render was requested, find and return the path to the HTML file
|
264 |
if render:
|
265 |
# The rendered results are in /results/demo.html according to the README
|
266 |
-
|
267 |
-
if not os.path.exists(results_dir):
|
268 |
-
# Try to find results directory
|
269 |
-
for root, dirs, files in os.walk(self._repo_path):
|
270 |
-
if "demo.html" in files:
|
271 |
-
html_result_path = os.path.join(root, "demo.html")
|
272 |
-
logger.info(f"Found rendered HTML at: {html_result_path}")
|
273 |
-
with open(html_result_path, 'r') as f:
|
274 |
-
html_content = f.read()
|
275 |
-
return html_content
|
276 |
-
|
277 |
-
html_result_path = os.path.join(results_dir, "demo.html")
|
278 |
if os.path.exists(html_result_path):
|
279 |
with open(html_result_path, 'r') as f:
|
280 |
html_content = f.read()
|
@@ -294,21 +302,6 @@ class GotOcrParser(DocumentParser):
|
|
294 |
except subprocess.CalledProcessError as e:
|
295 |
logger.error(f"Error running GOT-OCR command: {str(e)}")
|
296 |
logger.error(f"Stderr: {e.stderr}")
|
297 |
-
|
298 |
-
# Print repository structure for debugging
|
299 |
-
logger.error("Repository structure for debugging:")
|
300 |
-
try:
|
301 |
-
subprocess.run(
|
302 |
-
["find", self._repo_path, "-type", "f", "-name", "*.py"],
|
303 |
-
check=True,
|
304 |
-
capture_output=True,
|
305 |
-
text=True
|
306 |
-
)
|
307 |
-
structure_output = subprocess.getoutput(f"find {self._repo_path} -type f -name '*.py'")
|
308 |
-
logger.error(f"Python files in repository:\n{structure_output}")
|
309 |
-
except Exception as debug_e:
|
310 |
-
logger.error(f"Error getting repository structure: {debug_e}")
|
311 |
-
|
312 |
raise RuntimeError(f"Error processing document with GOT-OCR: {str(e)}")
|
313 |
|
314 |
except Exception as e:
|
|
|
15 |
|
16 |
# Configure logging
|
17 |
logger = logging.getLogger(__name__)
|
18 |
+
# Set logger level to DEBUG for more verbose output
|
19 |
+
logger.setLevel(logging.DEBUG)
|
20 |
|
21 |
class GotOcrParser(DocumentParser):
|
22 |
"""Parser implementation using GOT-OCR 2.0 for document text extraction using GitHub repository.
|
|
|
28 |
# Path to the GOT-OCR repository
|
29 |
_repo_path = None
|
30 |
_weights_path = None
|
|
|
31 |
|
32 |
@classmethod
|
33 |
def get_name(cls) -> str:
|
|
|
80 |
logger.error(f"Missing dependency: {e}")
|
81 |
return False
|
82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
@classmethod
|
84 |
def _setup_repository(cls) -> bool:
|
85 |
"""Set up the GOT-OCR2.0 repository if it's not already set up."""
|
86 |
+
if cls._repo_path is not None and os.path.exists(cls._repo_path):
|
87 |
+
logger.debug(f"Repository already set up at: {cls._repo_path}")
|
88 |
return True
|
89 |
|
90 |
try:
|
91 |
# Create a temporary directory for the repository
|
92 |
repo_dir = os.path.join(tempfile.gettempdir(), "GOT-OCR2.0")
|
93 |
+
logger.debug(f"Repository directory: {repo_dir}")
|
94 |
|
95 |
# Check if the repository already exists
|
96 |
if not os.path.exists(repo_dir):
|
97 |
+
logger.info(f"Cloning GOT-OCR2.0 repository to {repo_dir}...")
|
98 |
subprocess.run(
|
99 |
["git", "clone", "https://github.com/Ucas-HaoranWei/GOT-OCR2.0.git", repo_dir],
|
100 |
check=True
|
101 |
)
|
102 |
else:
|
103 |
+
logger.info(f"GOT-OCR2.0 repository already exists at {repo_dir}, skipping clone")
|
104 |
|
105 |
cls._repo_path = repo_dir
|
106 |
|
107 |
+
# Debug: List repository contents
|
108 |
+
logger.debug("Repository contents:")
|
109 |
+
try:
|
110 |
+
result = subprocess.run(
|
111 |
+
["find", repo_dir, "-type", "d", "-maxdepth", "3"],
|
112 |
+
check=True,
|
113 |
+
capture_output=True,
|
114 |
+
text=True
|
115 |
+
)
|
116 |
+
for line in result.stdout.splitlines():
|
117 |
+
logger.debug(f" {line}")
|
118 |
+
except Exception as e:
|
119 |
+
logger.warning(f"Could not list repository contents: {e}")
|
120 |
+
|
121 |
+
# Check if the demo script exists
|
122 |
+
demo_script = os.path.join(repo_dir, "GOT", "demo", "run_ocr_2.0.py")
|
123 |
+
if os.path.exists(demo_script):
|
124 |
+
logger.info(f"Found demo script at: {demo_script}")
|
125 |
+
else:
|
126 |
+
logger.warning(f"Demo script not found at expected path: {demo_script}")
|
127 |
+
# Try to find it
|
128 |
+
logger.info("Searching for run_ocr_2.0.py in the repository...")
|
129 |
+
try:
|
130 |
+
find_result = subprocess.run(
|
131 |
+
["find", repo_dir, "-name", "run_ocr_2.0.py", "-type", "f"],
|
132 |
+
check=True,
|
133 |
+
capture_output=True,
|
134 |
+
text=True
|
135 |
+
)
|
136 |
+
if find_result.stdout.strip():
|
137 |
+
found_paths = find_result.stdout.strip().splitlines()
|
138 |
+
logger.info(f"Found script at alternative locations: {found_paths}")
|
139 |
+
# Use the first found path as fallback
|
140 |
+
if found_paths:
|
141 |
+
alternative_path = found_paths[0]
|
142 |
+
logger.info(f"Using alternative path: {alternative_path}")
|
143 |
+
except Exception as e:
|
144 |
+
logger.warning(f"Could not search for script: {e}")
|
145 |
|
146 |
# Set up the weights directory
|
147 |
weights_dir = os.path.join(repo_dir, "GOT_weights")
|
|
|
149 |
os.makedirs(weights_dir, exist_ok=True)
|
150 |
|
151 |
cls._weights_path = weights_dir
|
152 |
+
logger.debug(f"Weights directory: {weights_dir}")
|
153 |
|
154 |
# Check if weights exist, if not download them
|
155 |
weight_files = [f for f in os.listdir(weights_dir) if f.endswith(".bin") or f.endswith(".safetensors")]
|
|
|
222 |
try:
|
223 |
logger.info(f"Processing image with GOT-OCR: {file_path}")
|
224 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
# Create the command for running the GOT-OCR script
|
226 |
+
script_path = os.path.join(self._repo_path, "GOT", "demo", "run_ocr_2.0.py")
|
227 |
+
|
228 |
+
# Check if the script exists at the expected path
|
229 |
+
if not os.path.exists(script_path):
|
230 |
+
logger.error(f"Script not found at: {script_path}")
|
231 |
+
|
232 |
+
# Try to find the script within the repository
|
233 |
+
logger.info("Searching for run_ocr_2.0.py in the repository...")
|
234 |
+
try:
|
235 |
+
find_result = subprocess.run(
|
236 |
+
["find", self._repo_path, "-name", "run_ocr_2.0.py", "-type", "f"],
|
237 |
+
check=True,
|
238 |
+
capture_output=True,
|
239 |
+
text=True
|
240 |
+
)
|
241 |
+
found_paths = find_result.stdout.strip().splitlines()
|
242 |
+
if found_paths:
|
243 |
+
script_path = found_paths[0]
|
244 |
+
logger.info(f"Found script at alternative location: {script_path}")
|
245 |
+
else:
|
246 |
+
raise FileNotFoundError(f"Could not find run_ocr_2.0.py in repository: {self._repo_path}")
|
247 |
+
except Exception as search_e:
|
248 |
+
logger.error(f"Error searching for script: {str(search_e)}")
|
249 |
+
raise FileNotFoundError(f"Script not found and search failed: {str(search_e)}")
|
250 |
+
|
251 |
cmd = [
|
252 |
sys.executable,
|
253 |
+
script_path,
|
254 |
"--model-name", self._weights_path,
|
255 |
"--image-file", str(file_path),
|
256 |
"--type", ocr_type
|
|
|
282 |
# If render was requested, find and return the path to the HTML file
|
283 |
if render:
|
284 |
# The rendered results are in /results/demo.html according to the README
|
285 |
+
html_result_path = os.path.join(self._repo_path, "results", "demo.html")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
if os.path.exists(html_result_path):
|
287 |
with open(html_result_path, 'r') as f:
|
288 |
html_content = f.read()
|
|
|
302 |
except subprocess.CalledProcessError as e:
|
303 |
logger.error(f"Error running GOT-OCR command: {str(e)}")
|
304 |
logger.error(f"Stderr: {e.stderr}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
raise RuntimeError(f"Error processing document with GOT-OCR: {str(e)}")
|
306 |
|
307 |
except Exception as e:
|