GoConqurer committed on
Commit
7576661
·
verified ·
1 Parent(s): 523502a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +321 -0
  2. requirements.txt +19 -0
app.py ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TextLens - AI-Powered OCR Application
3
+
4
+ Main entry point for the application.
5
+ """
6
+
7
+ import gradio as gr
8
+ import torch
9
+ import time
10
+ import logging
11
+ from threading import Thread
12
+ from PIL import Image
13
+ from transformers import (
14
+ AutoProcessor,
15
+ AutoModelForCausalLM,
16
+ TextIteratorStreamer,
17
+ Qwen2VLForConditionalGeneration,
18
+ )
19
+ from transformers import Qwen2_5_VLForConditionalGeneration
20
+
21
# Logging: INFO-level root configuration plus a module-scoped logger that the
# rest of this file uses for progress and error reporting.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hugging Face Hub repository IDs of the two OCR checkpoints loaded at startup.
QV_MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
ROLMOCR_MODEL_ID = "reducto/RolmOCR"
28
+
29
def progress_bar_html(label: str, primary_color: str = "#4B0082", secondary_color: str = "#9370DB") -> str:
    """Build the HTML for a slim, endlessly animated progress bar with a caption.

    Args:
        label: Caption rendered to the left of the bar. It is inserted into
            the markup verbatim (no HTML escaping is applied).
        primary_color: CSS colour of the moving bar.
        secondary_color: CSS colour of the track behind the bar.

    Returns:
        An HTML fragment containing the caption, the bar, and a ``<style>``
        element that defines the ``loading`` keyframes driving the animation.
    """
    caption = f'<span style="margin-right: 10px; font-size: 14px;">{label}</span>\n'
    track_open = (
        f'<div style="width: 110px; height: 5px; background-color: {secondary_color}; '
        'border-radius: 2px; overflow: hidden;">\n'
    )
    moving_bar = (
        f'<div style="width: 100%; height: 100%; background-color: {primary_color}; '
        'animation: loading 1.5s linear infinite;"></div>\n'
    )
    # The keyframes slide the bar from fully off the left edge to fully off the right.
    keyframes = (
        "<style>\n"
        "@keyframes loading {\n"
        "0% { transform: translateX(-100%); }\n"
        "100% { transform: translateX(100%); }\n"
        "}\n"
        "</style>\n"
    )
    return (
        "\n"
        '<div style="display: flex; align-items: center;">\n'
        + caption
        + track_open
        + moving_bar
        + "</div>\n"
        + "</div>\n"
        + keyframes
    )
45
+
46
# Load models at startup.
#
# Both OCR models are loaded eagerly when the module is imported. On success
# MODELS_LOADED is True and the processors/models are available as module
# globals; on any failure the error is logged (with traceback) and
# MODELS_LOADED is False so the UI can report the problem instead of crashing.
logger.info("πŸš€ Loading OCR models...")
logger.info("This may take a few minutes on first run...")

# Resolve the target device once so both models and their dtypes agree.
_device = "cuda" if torch.cuda.is_available() else "cpu"
_on_gpu = _device == "cuda"

try:
    # NOTE(review): trust_remote_code=True lets Python code shipped in the model
    # repository run locally. Both model classes used here are built into
    # transformers, so the flag may be unnecessary — confirm before removing.

    # Load Qwen2VL OCR model (primary fast model)
    logger.info(f"Loading Qwen2VL OCR model: {QV_MODEL_ID}")
    qwen_processor = AutoProcessor.from_pretrained(QV_MODEL_ID, trust_remote_code=True)
    qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
        QV_MODEL_ID,
        trust_remote_code=True,
        # Half precision only on GPU: float16 kernels on CPU are slow or
        # missing in some PyTorch builds, so fall back to float32 there
        # (the same device-dependent choice the RolmOCR load below makes).
        torch_dtype=torch.float16 if _on_gpu else torch.float32,
    ).to(_device).eval()
    logger.info("βœ… Qwen2VL OCR model loaded successfully!")

    # Load RolmOCR model (specialized document model)
    logger.info(f"Loading RolmOCR model: {ROLMOCR_MODEL_ID}")
    rolmocr_processor = AutoProcessor.from_pretrained(ROLMOCR_MODEL_ID, trust_remote_code=True)
    rolmocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        ROLMOCR_MODEL_ID,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16 if _on_gpu else torch.float32,
    ).to(_device).eval()
    logger.info("βœ… RolmOCR model loaded successfully!")

    MODELS_LOADED = True
    logger.info("πŸŽ‰ All models loaded and ready!")

except Exception as e:
    # Top-level boundary: keep the app importable so the UI can show the
    # failure, but record the full traceback for debugging.
    logger.exception(f"❌ Failed to load models: {str(e)}")
    MODELS_LOADED = False
77
+
78
def _strip_special_tokens(text):
    """Remove chat end-markers that may leak into decoded text and trim whitespace."""
    return text.replace("<|im_end|>", "").replace("<|endoftext|>", "").strip()


def extract_text_from_image(image, text_query, use_rolmocr=False):
    """Extract text from an image with the selected OCR model, streaming the result.

    This is a generator intended to be used as a Gradio event handler.

    Args:
        image: PIL image to read. ``None`` or a non-PIL value produces an
            error message instead of running the model.
        text_query: Instruction sent to the model alongside the image. A
            blank or ``None`` value falls back to a default extraction prompt.
        use_rolmocr: When true, use the RolmOCR model and processor;
            otherwise use the Qwen2VL OCR pair.

    Yields:
        str: First an HTML progress-bar snippet, then the cumulative cleaned
        text after each streamed chunk, and finally the complete text. If
        nothing was produced a warning is yielded; on any failure a single
        error message is yielded.
    """

    if not MODELS_LOADED:
        yield "❌ Error: OCR models failed to load. Please check your setup and try again."
        return

    if image is None:
        yield "❌ No image provided. Please upload an image to extract text."
        return

    try:
        # Ensure image is in RGB format
        if not isinstance(image, Image.Image):
            yield "❌ Invalid image format. Please upload a valid image file."
            return

        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Prepare text query (tolerate None as well as blank input)
        if not (text_query or "").strip():
            text_query = "Extract all text from this image"

        # Select model and processor
        if use_rolmocr:
            processor = rolmocr_processor
            model = rolmocr_model
            model_name = "RolmOCR"
            logger.info("Using RolmOCR for specialized document processing")
        else:
            processor = qwen_processor
            model = qwen_model
            model_name = "Qwen2VL OCR"
            logger.info("Using Qwen2VL OCR for fast text extraction")

        # Build messages for the model
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text_query},
                    {"type": "image", "image": image},
                ],
            }
        ]

        # Apply chat template and prepare inputs
        prompt_full = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # Move the inputs to wherever the chosen model actually lives rather
        # than re-deriving the device, so the two can never disagree.
        inputs = processor(
            text=[prompt_full],
            images=[image],
            return_tensors="pt",
            padding=True,
        ).to(model.device)

        # Set up streaming
        streamer = TextIteratorStreamer(
            processor,
            skip_prompt=True,
            skip_special_tokens=True,
        )

        # Greedy decoding: temperature only applies when sampling, so it is
        # not passed together with do_sample=False (transformers warns about
        # the ignored flag otherwise).
        generation_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=1024,
            do_sample=False,
        )

        generation_errors = []

        def _run_generation():
            # Runs in the worker thread. If generate() fails it never signals
            # end-of-stream, which would leave the consumer loop below blocked
            # forever; record the error and close the stream explicitly.
            try:
                model.generate(**generation_kwargs)
            except Exception as exc:
                generation_errors.append(exc)
                streamer.end()

        # Start generation in separate thread
        thread = Thread(target=_run_generation)
        thread.start()

        # Yield progress bar first
        # NOTE(review): the click/upload handlers in this file send this
        # generator's output to a gr.Textbox, which presumably displays the
        # markup as literal text — confirm whether an HTML/Markdown status
        # component was intended.
        yield progress_bar_html(f"πŸ” Processing with {model_name}")

        # Stream the response
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            # Clean up any special tokens that might leak through
            clean_buffer = _strip_special_tokens(buffer)
            if clean_buffer:
                time.sleep(0.01)  # Small delay for smooth streaming
                yield clean_buffer

        # Ensure thread completes
        thread.join()

        # Surface a worker-thread failure through the normal error path below.
        if generation_errors:
            raise generation_errors[0]

        # Final clean response
        final_response = _strip_special_tokens(buffer)
        if not final_response:
            yield "⚠️ No text was detected in the image. Please try with a clearer image or different model."
        else:
            logger.info(f"βœ… Successfully extracted text: {len(final_response)} characters")
            yield final_response

    except Exception as e:
        # Handler boundary: report the failure to the user and keep the
        # traceback in the log.
        error_msg = f"❌ Error processing image: {str(e)}"
        logger.exception(f"OCR processing failed: {str(e)}")
        yield error_msg
186
+
187
def get_model_status():
    """Return a Markdown summary of the OCR models' load state.

    Returns:
        str: A "Ready" report naming both models and the compute device when
        ``MODELS_LOADED`` is true, otherwise a "Failed to Load" notice with
        troubleshooting hints.
    """
    # Failure case first, so the success report reads straight through.
    if not MODELS_LOADED:
        return (
            "\n"
            "**πŸ€– Model Status: ❌ Failed to Load**\n"
            "\n"
            "Please check your internet connection and GPU setup.\n"
            "Models need to be downloaded on first run.\n"
        )

    if torch.cuda.is_available():
        device = "🟒 GPU (CUDA)"
    else:
        device = "🟑 CPU"

    return (
        "\n"
        "**πŸ€– Model Status: βœ… Ready**\n"
        "\n"
        "**Primary Model:** Qwen2VL-OCR-2B (Fast general OCR)\n"
        "**Secondary Model:** RolmOCR (Specialized documents)\n"
        f"**Device:** {device}\n"
        "**Memory:** Optimized for streaming inference\n"
        "\n"
        "✨ Both models loaded and ready for OCR processing!\n"
    )
208
+
209
+ # Create Gradio Interface
210
def create_interface():
    """Assemble and return the TextLens Gradio Blocks application.

    The layout is a header, a model-status panel with a refresh button, and a
    two-column work area: image / instructions / model choice on the left and
    the streamed OCR output on the right. Both the extract button and a fresh
    image upload run ``extract_text_from_image``.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """
    page_css = (
        "\n"
        ".container { max-width: 1200px; margin: auto; }\n"
        ".header { text-align: center; padding: 20px; }\n"
        ".model-status { background: #f0f0f0; padding: 15px; border-radius: 8px; margin: 10px 0; }\n"
    )
    header_html = (
        "\n"
        '<div class="header">\n'
        "<h1>πŸ” TextLens - AI-Powered OCR</h1>\n"
        '<p style="font-size: 16px; color: #666;">\n'
        "Fast and accurate text extraction using modern AI models\n"
        "</p>\n"
        "</div>\n"
    )
    results_placeholder = (
        "Extracted text will appear here...\n\n"
        "β€’ Upload an image to get started\n"
        "β€’ Choose between fast OCR or specialized document processing\n"
        "β€’ Results will stream in real-time"
    )

    with gr.Blocks(title="TextLens - Fast AI OCR", theme=gr.themes.Soft(), css=page_css) as interface:

        # Header
        gr.HTML(header_html)

        # Model status panel
        with gr.Row(), gr.Column():
            status_display = gr.Markdown(value=get_model_status(), elem_classes=["model-status"])
            refresh_btn = gr.Button("πŸ”„ Refresh Status", size="sm")

        # Main work area
        with gr.Row():
            # Left column: inputs
            with gr.Column(scale=1):
                gr.Markdown("### πŸ“ Upload Image")
                image_input = gr.Image(
                    label="Upload image for OCR",
                    type="pil",
                    sources=["upload", "clipboard"],
                )
                text_query = gr.Textbox(
                    label="πŸ“ OCR Instructions (optional)",
                    placeholder="Extract all text from this image",
                    value="Extract all text from this image",
                    lines=2,
                )
                use_rolmocr = gr.Checkbox(
                    label="🎯 Use RolmOCR (specialized for documents)",
                    value=False,
                    info="Check for complex documents/tables, uncheck for general text",
                )
                extract_btn = gr.Button("πŸš€ Extract Text", variant="primary", size="lg")

            # Right column: results
            with gr.Column(scale=1):
                gr.Markdown("### πŸ“„ Extracted Text")
                text_output = gr.Textbox(
                    label="OCR Results",
                    lines=15,
                    max_lines=25,
                    placeholder=results_placeholder,
                    show_copy_button=True,
                )

        # Event handlers: the button click and the auto-extract on upload share
        # one configuration. Gradio's built-in progress indicator is hidden
        # because the handler yields its own progress markup.
        for register in (extract_btn.click, image_input.upload):
            register(
                fn=extract_text_from_image,
                inputs=[image_input, text_query, use_rolmocr],
                outputs=text_output,
                show_progress="hidden",
            )

        refresh_btn.click(fn=get_model_status, outputs=status_display)

    return interface
303
+
304
if __name__ == "__main__":
    # Script entry point: build the UI and serve it; any startup failure is
    # logged and then re-raised so the process exits with the original error.
    logger.info("πŸš€ Starting TextLens OCR application...")

    # Launch configuration: listen on all interfaces at port 7860, no public
    # share link, errors shown in the UI, debug mode off.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "debug": False,
    }

    try:
        interface = create_interface()
        interface.launch(**launch_options)
    except Exception as e:
        logger.error(f"Failed to start application: {str(e)}")
        raise
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core ML dependencies for Qwen2VL and RolmOCR
2
+ torch>=2.0.0
3
+ transformers>=4.49.0  # app.py imports Qwen2_5_VLForConditionalGeneration, which older releases do not provide
4
+ accelerate>=0.20.0
5
+ sentencepiece>=0.1.97
6
+ protobuf>=3.20.0
7
+
8
+ # UI framework
9
+ gradio>=4.44.0
10
+
11
+ # Image processing
12
+ pillow>=9.0.0
13
+
14
+ # Essential utilities
15
+ numpy>=1.21.0
16
+ requests>=2.25.0
17
+
18
+ # Optional: HuggingFace Spaces optimization (if deploying to HF Spaces)
19
+ spaces>=0.19.0