Spaces:
Sleeping
Sleeping
Major cleanup of unused libraries in requirements
Browse files
app.py
CHANGED
@@ -26,81 +26,6 @@ try:
|
|
26 |
except ImportError:
|
27 |
print("python-dotenv not installed, skipping .env file loading")
|
28 |
|
29 |
-
# Function to setup Tesseract
|
30 |
-
def setup_tesseract():
|
31 |
-
"""Setup Tesseract OCR environment."""
|
32 |
-
print("Setting up Tesseract OCR environment...")
|
33 |
-
|
34 |
-
# Create tessdata directory if it doesn't exist
|
35 |
-
tessdata_dir = os.path.join(current_dir, "tessdata")
|
36 |
-
os.makedirs(tessdata_dir, exist_ok=True)
|
37 |
-
|
38 |
-
# Set TESSDATA_PREFIX environment variable if not already set
|
39 |
-
if not os.environ.get('TESSDATA_PREFIX'):
|
40 |
-
# Check multiple possible locations
|
41 |
-
possible_tessdata_dirs = [
|
42 |
-
tessdata_dir, # Our local tessdata directory
|
43 |
-
"/usr/share/tesseract-ocr/4.00/tessdata", # Common location in Hugging Face
|
44 |
-
"/usr/share/tesseract-ocr/tessdata", # Another common location
|
45 |
-
"/usr/local/share/tessdata", # Standard installation location
|
46 |
-
]
|
47 |
-
|
48 |
-
# Use the first directory that exists
|
49 |
-
for dir_path in possible_tessdata_dirs:
|
50 |
-
if os.path.exists(dir_path):
|
51 |
-
os.environ['TESSDATA_PREFIX'] = dir_path
|
52 |
-
print(f"Set TESSDATA_PREFIX to {dir_path}")
|
53 |
-
break
|
54 |
-
else:
|
55 |
-
# If none exist, use our local directory
|
56 |
-
os.environ['TESSDATA_PREFIX'] = tessdata_dir
|
57 |
-
print(f"No existing tessdata directory found, set TESSDATA_PREFIX to {tessdata_dir}")
|
58 |
-
|
59 |
-
# Download eng.traineddata if it doesn't exist in our local tessdata
|
60 |
-
eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
|
61 |
-
if not os.path.exists(eng_traineddata):
|
62 |
-
try:
|
63 |
-
print("Downloading eng.traineddata...")
|
64 |
-
url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
|
65 |
-
urllib.request.urlretrieve(url, eng_traineddata)
|
66 |
-
print("Downloaded eng.traineddata")
|
67 |
-
except Exception as e:
|
68 |
-
print(f"Error downloading eng.traineddata: {e}")
|
69 |
-
|
70 |
-
# Configure pytesseract
|
71 |
-
try:
|
72 |
-
import pytesseract
|
73 |
-
# Check if tesseract is in PATH
|
74 |
-
tesseract_cmd = shutil.which("tesseract")
|
75 |
-
if tesseract_cmd:
|
76 |
-
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
|
77 |
-
print(f"Set pytesseract.tesseract_cmd to {tesseract_cmd}")
|
78 |
-
else:
|
79 |
-
# Try common locations
|
80 |
-
common_locations = [
|
81 |
-
"/usr/bin/tesseract",
|
82 |
-
"/usr/local/bin/tesseract",
|
83 |
-
"/app/tesseract/tesseract"
|
84 |
-
]
|
85 |
-
for location in common_locations:
|
86 |
-
if os.path.isfile(location) and os.access(location, os.X_OK):
|
87 |
-
pytesseract.pytesseract.tesseract_cmd = location
|
88 |
-
print(f"Set pytesseract.tesseract_cmd to {location}")
|
89 |
-
break
|
90 |
-
else:
|
91 |
-
print("Warning: Could not find tesseract executable")
|
92 |
-
except ImportError:
|
93 |
-
print("pytesseract not installed")
|
94 |
-
|
95 |
-
# Try to import tesserocr to verify it's working
|
96 |
-
try:
|
97 |
-
import tesserocr
|
98 |
-
print(f"tesserocr imported successfully, version: {tesserocr.tesseract_version()}")
|
99 |
-
except ImportError:
|
100 |
-
print("tesserocr not installed or not working")
|
101 |
-
except Exception as e:
|
102 |
-
print(f"Error importing tesserocr: {e}")
|
103 |
-
|
104 |
# Load Gemini API key from environment variable
|
105 |
gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
106 |
|
@@ -131,8 +56,5 @@ except ModuleNotFoundError:
|
|
131 |
# Try import again
|
132 |
from src.main import main
|
133 |
|
134 |
-
# Call setup function at import time
|
135 |
-
setup_tesseract()
|
136 |
-
|
137 |
if __name__ == "__main__":
|
138 |
main()
|
|
|
26 |
except ImportError:
|
27 |
print("python-dotenv not installed, skipping .env file loading")
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
# Load Gemini API key from environment variable
|
30 |
gemini_api_key = os.getenv("GOOGLE_API_KEY")
|
31 |
|
|
|
56 |
# Try import again
|
57 |
from src.main import main
|
58 |
|
|
|
|
|
|
|
59 |
if __name__ == "__main__":
|
60 |
main()
|
build.sh
CHANGED
@@ -5,73 +5,11 @@ set -e
|
|
5 |
|
6 |
echo "Starting build process..."
|
7 |
|
8 |
-
# Install system dependencies
|
9 |
-
echo "Installing
|
10 |
apt-get update && apt-get install -y \
|
11 |
-
|
12 |
-
|
13 |
-
libtesseract-dev \
|
14 |
-
libleptonica-dev \
|
15 |
-
pkg-config \
|
16 |
-
wget
|
17 |
-
|
18 |
-
# Create tessdata directory
|
19 |
-
TESSDATA_DIR="/usr/share/tesseract-ocr/4.00/tessdata"
|
20 |
-
mkdir -p "$TESSDATA_DIR"
|
21 |
-
|
22 |
-
# Download traineddata files directly from the official repository
|
23 |
-
echo "Downloading Tesseract traineddata files..."
|
24 |
-
wget -O "$TESSDATA_DIR/eng.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
|
25 |
-
wget -O "$TESSDATA_DIR/osd.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata"
|
26 |
-
|
27 |
-
# Set and verify TESSDATA_PREFIX
|
28 |
-
export TESSDATA_PREFIX="$TESSDATA_DIR"
|
29 |
-
echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
|
30 |
-
|
31 |
-
# Verify tesseract installation and data files
|
32 |
-
echo "Verifying Tesseract installation..."
|
33 |
-
if ! command -v tesseract &> /dev/null; then
|
34 |
-
echo "Tesseract installation failed!"
|
35 |
-
exit 1
|
36 |
-
fi
|
37 |
-
echo "Tesseract version: $(tesseract --version)"
|
38 |
-
|
39 |
-
# Verify traineddata files
|
40 |
-
echo "Verifying traineddata files..."
|
41 |
-
if [ ! -f "$TESSDATA_DIR/eng.traineddata" ]; then
|
42 |
-
echo "eng.traineddata is missing!"
|
43 |
-
exit 1
|
44 |
-
fi
|
45 |
-
if [ ! -f "$TESSDATA_DIR/osd.traineddata" ]; then
|
46 |
-
echo "osd.traineddata is missing!"
|
47 |
-
exit 1
|
48 |
-
fi
|
49 |
-
|
50 |
-
echo "Traineddata files in $TESSDATA_DIR:"
|
51 |
-
ls -l "$TESSDATA_DIR"
|
52 |
-
|
53 |
-
# Test Tesseract functionality
|
54 |
-
echo "Testing Tesseract functionality..."
|
55 |
-
echo "Hello World" > test.png
|
56 |
-
if ! tesseract test.png stdout; then
|
57 |
-
echo "Tesseract test failed!"
|
58 |
-
exit 1
|
59 |
-
fi
|
60 |
-
rm test.png
|
61 |
-
|
62 |
-
# Clean and install tesserocr from source
|
63 |
-
echo "Installing tesserocr from source..."
|
64 |
-
pip uninstall -y tesserocr || true
|
65 |
-
CPPFLAGS="-I/usr/include/tesseract" LDFLAGS="-L/usr/lib/x86_64-linux-gnu/" pip install --no-binary :all: tesserocr
|
66 |
-
|
67 |
-
# Verify tesserocr installation
|
68 |
-
echo "Verifying tesserocr installation..."
|
69 |
-
python3 -c "
|
70 |
-
import tesserocr
|
71 |
-
print(f'tesserocr version: {tesserocr.__version__}')
|
72 |
-
print(f'Available languages: {tesserocr.get_languages()}')
|
73 |
-
print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[1]}')
|
74 |
-
"
|
75 |
|
76 |
# Install Google Gemini API client
|
77 |
echo "Installing Google Gemini API client..."
|
|
|
5 |
|
6 |
echo "Starting build process..."
|
7 |
|
8 |
+
# Install system dependencies
|
9 |
+
echo "Installing system dependencies..."
|
10 |
apt-get update && apt-get install -y \
|
11 |
+
wget \
|
12 |
+
pkg-config
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# Install Google Gemini API client
|
15 |
echo "Installing Google Gemini API client..."
|
requirements.txt
CHANGED
@@ -1,35 +1,24 @@
|
|
1 |
# Core dependencies
|
2 |
gradio==5.14.0
|
3 |
-
grpcio-status==1.70.0
|
4 |
markdown==3.7
|
5 |
-
multiprocess==0.70.16
|
6 |
-
pipdeptree==2.25.0
|
7 |
Pillow>=9.0.0,<11.0.0
|
8 |
numpy>=1.21.0
|
9 |
|
10 |
-
#
|
11 |
-
pdf2image>=1.16.0
|
12 |
-
|
13 |
-
# OCR dependencies (for GOT-OCR)
|
14 |
-
pytesseract==0.3.13
|
15 |
-
tesseract==0.1.3
|
16 |
-
tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
|
17 |
opencv-python-headless>=4.5.0 # Headless version for server environments
|
18 |
|
19 |
# Utility dependencies
|
20 |
-
dill==0.3.8 # Downgraded to be compatible with datasets
|
21 |
python-dotenv>=1.0.0
|
22 |
pydantic==2.7.1
|
23 |
|
24 |
# Gemini API client
|
25 |
google-genai>=0.1.0
|
26 |
|
27 |
-
# GOT-OCR dependencies
|
28 |
-
torch
|
29 |
-
torchvision
|
30 |
transformers==4.37.2 # Pin to a specific version that works with safetensors 0.4.3
|
31 |
-
tiktoken
|
32 |
-
verovio
|
33 |
-
accelerate
|
34 |
-
safetensors==0.4.3 # Updated to meet minimum version required by accelerate
|
35 |
-
packaging>=21.0 # For version comparison
|
|
|
1 |
# Core dependencies
|
2 |
gradio==5.14.0
|
|
|
3 |
markdown==3.7
|
|
|
|
|
4 |
Pillow>=9.0.0,<11.0.0
|
5 |
numpy>=1.21.0
|
6 |
|
7 |
+
# Image processing
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
opencv-python-headless>=4.5.0 # Headless version for server environments
|
9 |
|
10 |
# Utility dependencies
|
|
|
11 |
python-dotenv>=1.0.0
|
12 |
pydantic==2.7.1
|
13 |
|
14 |
# Gemini API client
|
15 |
google-genai>=0.1.0
|
16 |
|
17 |
+
# GOT-OCR dependencies (as specified in documentation)
|
18 |
+
torch==2.0.1
|
19 |
+
torchvision==0.15.2
|
20 |
transformers==4.37.2 # Pin to a specific version that works with safetensors 0.4.3
|
21 |
+
tiktoken==0.6.0
|
22 |
+
verovio==4.3.1
|
23 |
+
accelerate==0.28.0
|
24 |
+
safetensors==0.4.3 # Updated to meet minimum version required by accelerate
|
|
setup.sh
CHANGED
@@ -3,18 +3,11 @@
|
|
3 |
# Exit on error
|
4 |
set -e
|
5 |
|
6 |
-
echo "Setting up
|
7 |
-
|
8 |
-
# Install required packages if not already installed
|
9 |
-
if ! command -v tesseract &> /dev/null; then
|
10 |
-
echo "Tesseract not found, attempting to install..."
|
11 |
-
apt-get update -y || echo "Failed to update apt, continuing anyway"
|
12 |
-
apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev || echo "Failed to install tesseract via apt, continuing anyway"
|
13 |
-
fi
|
14 |
|
15 |
# Install Python dependencies
|
16 |
echo "Installing Python dependencies..."
|
17 |
-
pip install -q -U
|
18 |
pip install -q -U google-genai
|
19 |
echo "Python dependencies installed successfully"
|
20 |
|
@@ -23,54 +16,4 @@ echo "Installing GOT-OCR dependencies..."
|
|
23 |
pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.3
|
24 |
echo "GOT-OCR dependencies installed successfully"
|
25 |
|
26 |
-
|
27 |
-
echo "Installing tesserocr..."
|
28 |
-
pip install -q -U tesserocr || echo "Failed to install tesserocr with pip, trying with specific compiler flags..."
|
29 |
-
|
30 |
-
# If tesserocr installation failed, try with specific compiler flags
|
31 |
-
if ! python -c "import tesserocr" &> /dev/null; then
|
32 |
-
echo "Trying alternative tesserocr installation..."
|
33 |
-
CPPFLAGS="-I/usr/local/include -I/usr/include" LDFLAGS="-L/usr/local/lib -L/usr/lib" pip install -q -U tesserocr || echo "Failed to install tesserocr with compiler flags, continuing anyway"
|
34 |
-
fi
|
35 |
-
|
36 |
-
# Create tessdata directory if it doesn't exist
|
37 |
-
mkdir -p tessdata
|
38 |
-
|
39 |
-
# Set TESSDATA_PREFIX environment variable
|
40 |
-
export TESSDATA_PREFIX="$(pwd)/tessdata"
|
41 |
-
echo "TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
|
42 |
-
|
43 |
-
# Download eng.traineddata if it doesn't exist
|
44 |
-
if [ ! -f "tessdata/eng.traineddata" ]; then
|
45 |
-
echo "Downloading eng.traineddata..."
|
46 |
-
wget -O tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata || \
|
47 |
-
curl -o tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
|
48 |
-
echo "Downloaded eng.traineddata"
|
49 |
-
else
|
50 |
-
echo "eng.traineddata already exists"
|
51 |
-
fi
|
52 |
-
|
53 |
-
# Try to copy to system locations (may fail in restricted environments)
|
54 |
-
for tessdata_dir in "/usr/share/tesseract-ocr/4.00/tessdata" "/usr/share/tesseract-ocr/tessdata" "/usr/local/share/tessdata"; do
|
55 |
-
if [ -d "$tessdata_dir" ]; then
|
56 |
-
echo "Copying eng.traineddata to $tessdata_dir..."
|
57 |
-
cp -f tessdata/eng.traineddata "$tessdata_dir/" 2>/dev/null || echo "Failed to copy to $tessdata_dir, continuing anyway"
|
58 |
-
fi
|
59 |
-
done
|
60 |
-
|
61 |
-
# Verify Tesseract installation
|
62 |
-
echo "Verifying Tesseract installation..."
|
63 |
-
tesseract --version || echo "Tesseract not found in PATH, but may still be available to Python"
|
64 |
-
|
65 |
-
# Test tesserocr if installed
|
66 |
-
echo "Testing tesserocr..."
|
67 |
-
python -c "import tesserocr; print(f'tesserocr version: {tesserocr.tesseract_version()}')" || echo "tesserocr not working, but may still be able to use pytesseract"
|
68 |
-
|
69 |
-
# Test pytesseract
|
70 |
-
echo "Testing pytesseract..."
|
71 |
-
python -c "import pytesseract; print(f'pytesseract path: {pytesseract.tesseract_cmd}')" || echo "pytesseract not working"
|
72 |
-
|
73 |
-
echo "Setup completed"
|
74 |
-
|
75 |
-
# Add TESSDATA_PREFIX to .env file for persistence
|
76 |
-
echo "TESSDATA_PREFIX=$(pwd)/tessdata" >> .env
|
|
|
3 |
# Exit on error
|
4 |
set -e
|
5 |
|
6 |
+
echo "Setting up environment..."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
# Install Python dependencies
|
9 |
echo "Installing Python dependencies..."
|
10 |
+
pip install -q -U pillow opencv-python-headless
|
11 |
pip install -q -U google-genai
|
12 |
echo "Python dependencies installed successfully"
|
13 |
|
|
|
16 |
pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.3
|
17 |
echo "GOT-OCR dependencies installed successfully"
|
18 |
|
19 |
+
echo "Setup completed"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|