AnseMin committed on
Commit
f7427de
·
1 Parent(s): b3a5734

Major cleanup of unused libraries in requirements

Browse files
Files changed (4) hide show
  1. app.py +0 -78
  2. build.sh +4 -66
  3. requirements.txt +8 -19
  4. setup.sh +3 -60
app.py CHANGED
@@ -26,81 +26,6 @@ try:
26
  except ImportError:
27
  print("python-dotenv not installed, skipping .env file loading")
28
 
29
- # Function to setup Tesseract
30
- def setup_tesseract():
31
- """Setup Tesseract OCR environment."""
32
- print("Setting up Tesseract OCR environment...")
33
-
34
- # Create tessdata directory if it doesn't exist
35
- tessdata_dir = os.path.join(current_dir, "tessdata")
36
- os.makedirs(tessdata_dir, exist_ok=True)
37
-
38
- # Set TESSDATA_PREFIX environment variable if not already set
39
- if not os.environ.get('TESSDATA_PREFIX'):
40
- # Check multiple possible locations
41
- possible_tessdata_dirs = [
42
- tessdata_dir, # Our local tessdata directory
43
- "/usr/share/tesseract-ocr/4.00/tessdata", # Common location in Hugging Face
44
- "/usr/share/tesseract-ocr/tessdata", # Another common location
45
- "/usr/local/share/tessdata", # Standard installation location
46
- ]
47
-
48
- # Use the first directory that exists
49
- for dir_path in possible_tessdata_dirs:
50
- if os.path.exists(dir_path):
51
- os.environ['TESSDATA_PREFIX'] = dir_path
52
- print(f"Set TESSDATA_PREFIX to {dir_path}")
53
- break
54
- else:
55
- # If none exist, use our local directory
56
- os.environ['TESSDATA_PREFIX'] = tessdata_dir
57
- print(f"No existing tessdata directory found, set TESSDATA_PREFIX to {tessdata_dir}")
58
-
59
- # Download eng.traineddata if it doesn't exist in our local tessdata
60
- eng_traineddata = os.path.join(tessdata_dir, "eng.traineddata")
61
- if not os.path.exists(eng_traineddata):
62
- try:
63
- print("Downloading eng.traineddata...")
64
- url = "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
65
- urllib.request.urlretrieve(url, eng_traineddata)
66
- print("Downloaded eng.traineddata")
67
- except Exception as e:
68
- print(f"Error downloading eng.traineddata: {e}")
69
-
70
- # Configure pytesseract
71
- try:
72
- import pytesseract
73
- # Check if tesseract is in PATH
74
- tesseract_cmd = shutil.which("tesseract")
75
- if tesseract_cmd:
76
- pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
77
- print(f"Set pytesseract.tesseract_cmd to {tesseract_cmd}")
78
- else:
79
- # Try common locations
80
- common_locations = [
81
- "/usr/bin/tesseract",
82
- "/usr/local/bin/tesseract",
83
- "/app/tesseract/tesseract"
84
- ]
85
- for location in common_locations:
86
- if os.path.isfile(location) and os.access(location, os.X_OK):
87
- pytesseract.pytesseract.tesseract_cmd = location
88
- print(f"Set pytesseract.tesseract_cmd to {location}")
89
- break
90
- else:
91
- print("Warning: Could not find tesseract executable")
92
- except ImportError:
93
- print("pytesseract not installed")
94
-
95
- # Try to import tesserocr to verify it's working
96
- try:
97
- import tesserocr
98
- print(f"tesserocr imported successfully, version: {tesserocr.tesseract_version()}")
99
- except ImportError:
100
- print("tesserocr not installed or not working")
101
- except Exception as e:
102
- print(f"Error importing tesserocr: {e}")
103
-
104
  # Load Gemini API key from environment variable
105
  gemini_api_key = os.getenv("GOOGLE_API_KEY")
106
 
@@ -131,8 +56,5 @@ except ModuleNotFoundError:
131
  # Try import again
132
  from src.main import main
133
 
134
- # Call setup function at import time
135
- setup_tesseract()
136
-
137
  if __name__ == "__main__":
138
  main()
 
26
  except ImportError:
27
  print("python-dotenv not installed, skipping .env file loading")
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # Load Gemini API key from environment variable
30
  gemini_api_key = os.getenv("GOOGLE_API_KEY")
31
 
 
56
  # Try import again
57
  from src.main import main
58
 
 
 
 
59
  if __name__ == "__main__":
60
  main()
build.sh CHANGED
@@ -5,73 +5,11 @@ set -e
5
 
6
  echo "Starting build process..."
7
 
8
- # Install system dependencies for tesseract
9
- echo "Installing Tesseract and dependencies..."
10
  apt-get update && apt-get install -y \
11
- tesseract-ocr \
12
- tesseract-ocr-eng \
13
- libtesseract-dev \
14
- libleptonica-dev \
15
- pkg-config \
16
- wget
17
-
18
- # Create tessdata directory
19
- TESSDATA_DIR="/usr/share/tesseract-ocr/4.00/tessdata"
20
- mkdir -p "$TESSDATA_DIR"
21
-
22
- # Download traineddata files directly from the official repository
23
- echo "Downloading Tesseract traineddata files..."
24
- wget -O "$TESSDATA_DIR/eng.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata"
25
- wget -O "$TESSDATA_DIR/osd.traineddata" "https://github.com/tesseract-ocr/tessdata/raw/main/osd.traineddata"
26
-
27
- # Set and verify TESSDATA_PREFIX
28
- export TESSDATA_PREFIX="$TESSDATA_DIR"
29
- echo "TESSDATA_PREFIX=${TESSDATA_PREFIX}" >> /etc/environment
30
-
31
- # Verify tesseract installation and data files
32
- echo "Verifying Tesseract installation..."
33
- if ! command -v tesseract &> /dev/null; then
34
- echo "Tesseract installation failed!"
35
- exit 1
36
- fi
37
- echo "Tesseract version: $(tesseract --version)"
38
-
39
- # Verify traineddata files
40
- echo "Verifying traineddata files..."
41
- if [ ! -f "$TESSDATA_DIR/eng.traineddata" ]; then
42
- echo "eng.traineddata is missing!"
43
- exit 1
44
- fi
45
- if [ ! -f "$TESSDATA_DIR/osd.traineddata" ]; then
46
- echo "osd.traineddata is missing!"
47
- exit 1
48
- fi
49
-
50
- echo "Traineddata files in $TESSDATA_DIR:"
51
- ls -l "$TESSDATA_DIR"
52
-
53
- # Test Tesseract functionality
54
- echo "Testing Tesseract functionality..."
55
- echo "Hello World" > test.png
56
- if ! tesseract test.png stdout; then
57
- echo "Tesseract test failed!"
58
- exit 1
59
- fi
60
- rm test.png
61
-
62
- # Clean and install tesserocr from source
63
- echo "Installing tesserocr from source..."
64
- pip uninstall -y tesserocr || true
65
- CPPFLAGS="-I/usr/include/tesseract" LDFLAGS="-L/usr/lib/x86_64-linux-gnu/" pip install --no-binary :all: tesserocr
66
-
67
- # Verify tesserocr installation
68
- echo "Verifying tesserocr installation..."
69
- python3 -c "
70
- import tesserocr
71
- print(f'tesserocr version: {tesserocr.__version__}')
72
- print(f'Available languages: {tesserocr.get_languages()}')
73
- print(f'TESSDATA_PREFIX: {tesserocr.get_languages()[1]}')
74
- "
75
 
76
  # Install Google Gemini API client
77
  echo "Installing Google Gemini API client..."
 
5
 
6
  echo "Starting build process..."
7
 
8
+ # Install system dependencies
9
+ echo "Installing system dependencies..."
10
  apt-get update && apt-get install -y \
11
+ wget \
12
+ pkg-config
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Install Google Gemini API client
15
  echo "Installing Google Gemini API client..."
requirements.txt CHANGED
@@ -1,35 +1,24 @@
1
  # Core dependencies
2
  gradio==5.14.0
3
- grpcio-status==1.70.0
4
  markdown==3.7
5
- multiprocess==0.70.16
6
- pipdeptree==2.25.0
7
  Pillow>=9.0.0,<11.0.0
8
  numpy>=1.21.0
9
 
10
- # PDF processing
11
- pdf2image>=1.16.0
12
-
13
- # OCR dependencies (for GOT-OCR)
14
- pytesseract==0.3.13
15
- tesseract==0.1.3
16
- tesserocr>=2.5.0; platform_system != "Windows" # Only install on non-Windows systems
17
  opencv-python-headless>=4.5.0 # Headless version for server environments
18
 
19
  # Utility dependencies
20
- dill==0.3.8 # Downgraded to be compatible with datasets
21
  python-dotenv>=1.0.0
22
  pydantic==2.7.1
23
 
24
  # Gemini API client
25
  google-genai>=0.1.0
26
 
27
- # GOT-OCR dependencies
28
- torch>=2.0.1
29
- torchvision>=0.15.2
30
  transformers==4.37.2 # Pin to a specific version that works with safetensors 0.4.3
31
- tiktoken>=0.6.0
32
- verovio>=4.3.1
33
- accelerate>=0.28.0
34
- safetensors==0.4.3 # Updated to meet minimum version required by accelerate
35
- packaging>=21.0 # For version comparison
 
1
  # Core dependencies
2
  gradio==5.14.0
 
3
  markdown==3.7
 
 
4
  Pillow>=9.0.0,<11.0.0
5
  numpy>=1.21.0
6
 
7
+ # Image processing
 
 
 
 
 
 
8
  opencv-python-headless>=4.5.0 # Headless version for server environments
9
 
10
  # Utility dependencies
 
11
  python-dotenv>=1.0.0
12
  pydantic==2.7.1
13
 
14
  # Gemini API client
15
  google-genai>=0.1.0
16
 
17
+ # GOT-OCR dependencies (as specified in documentation)
18
+ torch==2.0.1
19
+ torchvision==0.15.2
20
  transformers==4.37.2 # Pin to a specific version that works with safetensors 0.4.3
21
+ tiktoken==0.6.0
22
+ verovio==4.3.1
23
+ accelerate==0.28.0
24
+ safetensors==0.4.3 # Updated to meet minimum version required by accelerate
 
setup.sh CHANGED
@@ -3,18 +3,11 @@
3
  # Exit on error
4
  set -e
5
 
6
- echo "Setting up Tesseract OCR environment..."
7
-
8
- # Install required packages if not already installed
9
- if ! command -v tesseract &> /dev/null; then
10
- echo "Tesseract not found, attempting to install..."
11
- apt-get update -y || echo "Failed to update apt, continuing anyway"
12
- apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev || echo "Failed to install tesseract via apt, continuing anyway"
13
- fi
14
 
15
  # Install Python dependencies
16
  echo "Installing Python dependencies..."
17
- pip install -q -U pytesseract pillow opencv-python-headless pdf2image
18
  pip install -q -U google-genai
19
  echo "Python dependencies installed successfully"
20
 
@@ -23,54 +16,4 @@ echo "Installing GOT-OCR dependencies..."
23
  pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.3
24
  echo "GOT-OCR dependencies installed successfully"
25
 
26
- # Install tesserocr with pip
27
- echo "Installing tesserocr..."
28
- pip install -q -U tesserocr || echo "Failed to install tesserocr with pip, trying with specific compiler flags..."
29
-
30
- # If tesserocr installation failed, try with specific compiler flags
31
- if ! python -c "import tesserocr" &> /dev/null; then
32
- echo "Trying alternative tesserocr installation..."
33
- CPPFLAGS="-I/usr/local/include -I/usr/include" LDFLAGS="-L/usr/local/lib -L/usr/lib" pip install -q -U tesserocr || echo "Failed to install tesserocr with compiler flags, continuing anyway"
34
- fi
35
-
36
- # Create tessdata directory if it doesn't exist
37
- mkdir -p tessdata
38
-
39
- # Set TESSDATA_PREFIX environment variable
40
- export TESSDATA_PREFIX="$(pwd)/tessdata"
41
- echo "TESSDATA_PREFIX set to: $TESSDATA_PREFIX"
42
-
43
- # Download eng.traineddata if it doesn't exist
44
- if [ ! -f "tessdata/eng.traineddata" ]; then
45
- echo "Downloading eng.traineddata..."
46
- wget -O tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata || \
47
- curl -o tessdata/eng.traineddata https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
48
- echo "Downloaded eng.traineddata"
49
- else
50
- echo "eng.traineddata already exists"
51
- fi
52
-
53
- # Try to copy to system locations (may fail in restricted environments)
54
- for tessdata_dir in "/usr/share/tesseract-ocr/4.00/tessdata" "/usr/share/tesseract-ocr/tessdata" "/usr/local/share/tessdata"; do
55
- if [ -d "$tessdata_dir" ]; then
56
- echo "Copying eng.traineddata to $tessdata_dir..."
57
- cp -f tessdata/eng.traineddata "$tessdata_dir/" 2>/dev/null || echo "Failed to copy to $tessdata_dir, continuing anyway"
58
- fi
59
- done
60
-
61
- # Verify Tesseract installation
62
- echo "Verifying Tesseract installation..."
63
- tesseract --version || echo "Tesseract not found in PATH, but may still be available to Python"
64
-
65
- # Test tesserocr if installed
66
- echo "Testing tesserocr..."
67
- python -c "import tesserocr; print(f'tesserocr version: {tesserocr.tesseract_version()}')" || echo "tesserocr not working, but may still be able to use pytesseract"
68
-
69
- # Test pytesseract
70
- echo "Testing pytesseract..."
71
- python -c "import pytesseract; print(f'pytesseract path: {pytesseract.tesseract_cmd}')" || echo "pytesseract not working"
72
-
73
- echo "Setup completed"
74
-
75
- # Add TESSDATA_PREFIX to .env file for persistence
76
- echo "TESSDATA_PREFIX=$(pwd)/tessdata" >> .env
 
3
  # Exit on error
4
  set -e
5
 
6
+ echo "Setting up environment..."
 
 
 
 
 
 
 
7
 
8
  # Install Python dependencies
9
  echo "Installing Python dependencies..."
10
+ pip install -q -U pillow opencv-python-headless
11
  pip install -q -U google-genai
12
  echo "Python dependencies installed successfully"
13
 
 
16
  pip install -q -U torch==2.0.1 torchvision==0.15.2 transformers==4.37.2 tiktoken==0.6.0 verovio==4.3.1 accelerate==0.28.0 safetensors==0.4.3
17
  echo "GOT-OCR dependencies installed successfully"
18
 
19
+ echo "Setup completed"