Souvik3333 committed on
Commit
6698955
·
verified ·
1 Parent(s): 3c1b4ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -32
app.py CHANGED
@@ -1,8 +1,9 @@
1
  import gradio as gr
2
  from PIL import Image
3
- from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
4
  import torch
5
  import spaces
 
6
 
7
  model_path = "nanonets/Nanonets-OCR-s"
8
 
@@ -33,6 +34,65 @@ def process_tags(content: str) -> str:
33
 
34
  return content
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  @spaces.GPU()
37
  def ocr_image_gradio(image, max_tokens=4096):
38
  """Process image through Nanonets OCR model for Gradio interface"""
@@ -88,6 +148,9 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
88
  💻 GitHub Repository
89
  </a>
90
  </div>
 
 
 
91
  </div>
92
  """)
93
 
@@ -108,9 +171,16 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
108
  )
109
  extract_btn = gr.Button("Extract Text", variant="primary", size="lg")
110
 
 
 
 
 
 
 
 
111
  with gr.Column(scale=2):
112
  output_text = gr.Markdown(
113
- label="Formatted model prediction",
114
  latex_delimiters=[
115
  {"left": "$$", "right": "$$", "display": True},
116
  {"left": "$", "right": "$", "display": False},
@@ -124,16 +194,16 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
124
  show_copy_button=True,
125
  )
126
 
127
- # Event handlers
128
  extract_btn.click(
129
- fn=ocr_image_gradio,
130
  inputs=[image_input, max_tokens_slider],
131
  outputs=output_text,
132
  show_progress=True
133
  )
134
 
135
  image_input.change(
136
- fn=ocr_image_gradio,
137
  inputs=[image_input, max_tokens_slider],
138
  outputs=output_text,
139
  show_progress=True
@@ -142,32 +212,42 @@ with gr.Blocks(title="Nanonets OCR Demo") as demo:
142
  # Add model information section
143
  with gr.Accordion("About Nanonets-OCR-s", open=False):
144
  gr.Markdown("""
145
- ## Nanonets-OCR-s
146
-
147
- Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction.
148
- It transforms documents into structured markdown with intelligent content recognition and semantic tagging, making it ideal
149
- for downstream processing by Large Language Models (LLMs).
150
-
151
- ### Key Features
152
-
153
- - **LaTeX Equation Recognition**: Automatically converts mathematical equations and formulas into properly formatted LaTeX syntax.
154
- It distinguishes between inline ($...$) and display ($$...$$) equations.
155
-
156
- - **Intelligent Image Description**: Describes images within documents using structured `<img>` tags, making them digestible
157
- for LLM processing. It can describe various image types, including logos, charts, graphs and so on, detailing their content,
158
- style, and context.
159
-
160
- - **Signature Detection & Isolation**: Identifies and isolates signatures from other text, outputting them within a `<signature>` tag.
161
- This is crucial for processing legal and business documents.
162
-
163
- - **Watermark Extraction**: Detects and extracts watermark text from documents, placing it within a `<watermark>` tag.
164
-
165
- - **Smart Checkbox Handling**: Converts form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒)
166
- for consistent and reliable processing.
167
-
168
- - **Complex Table Extraction**: Accurately extracts complex tables from documents and converts them into both markdown
169
- and HTML table formats.
170
- """)
171
 
172
  if __name__ == "__main__":
173
- demo.queue().launch()
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  from PIL import Image
3
+ from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
4
  import torch
5
  import spaces
6
+ import threading
7
 
8
  model_path = "nanonets/Nanonets-OCR-s"
9
 
 
34
 
35
  return content
36
 
37
+ @spaces.GPU()
38
+ def ocr_image_gradio_stream(image, max_tokens=4096):
39
+ """Process image through Nanonets OCR model with streaming output"""
40
+ if image is None:
41
+ yield "Please upload an image."
42
+ return
43
+
44
+ try:
45
+ prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
46
+
47
+ # Convert PIL image if needed
48
+ if not isinstance(image, Image.Image):
49
+ image = Image.fromarray(image)
50
+
51
+ messages = [
52
+ {"role": "system", "content": "You are a helpful assistant."},
53
+ {"role": "user", "content": [
54
+ {"type": "image", "image": image},
55
+ {"type": "text", "text": prompt},
56
+ ]},
57
+ ]
58
+
59
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
60
+ inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
61
+ inputs = inputs.to(model.device)
62
+
63
+ # Set up streaming
64
+ streamer = TextIteratorStreamer(
65
+ tokenizer=tokenizer,
66
+ skip_prompt=True,
67
+ skip_special_tokens=True,
68
+ clean_up_tokenization_spaces=True
69
+ )
70
+
71
+ generation_kwargs = {
72
+ **inputs,
73
+ "max_new_tokens": max_tokens,
74
+ "do_sample": False,
75
+ "streamer": streamer,
76
+ }
77
+
78
+ # Start generation in a separate thread
79
+ generation_thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
80
+ generation_thread.start()
81
+
82
+ # Stream the output
83
+ partial_output = ""
84
+ for new_token in streamer:
85
+ partial_output += new_token
86
+ processed_output = process_tags(partial_output)
87
+ yield processed_output
88
+
89
+ # Ensure thread completes
90
+ generation_thread.join()
91
+
92
+ except Exception as e:
93
+ yield f"Error processing image: {str(e)}"
94
+
95
+ # Non-streaming version as fallback
96
  @spaces.GPU()
97
  def ocr_image_gradio(image, max_tokens=4096):
98
  """Process image through Nanonets OCR model for Gradio interface"""
 
148
  💻 GitHub Repository
149
  </a>
150
  </div>
151
+ <p style="font-size: 0.9em; color: #10b981; font-weight: 500;">
152
+ ✨ Now with streaming output and support for 4 concurrent uploads!
153
+ </p>
154
  </div>
155
  """)
156
 
 
171
  )
172
  extract_btn = gr.Button("Extract Text", variant="primary", size="lg")
173
 
174
+ gr.Markdown("""
175
+ **💡 Tips:**
176
+ - Upload supports concurrent processing of up to 4 images
177
+ - Results stream in real-time as they're generated
178
+ - Automatic processing starts when you upload an image
179
+ """)
180
+
181
  with gr.Column(scale=2):
182
  output_text = gr.Markdown(
183
+ label="Streaming model prediction",
184
  latex_delimiters=[
185
  {"left": "$$", "right": "$$", "display": True},
186
  {"left": "$", "right": "$", "display": False},
 
194
  show_copy_button=True,
195
  )
196
 
197
+ # Event handlers with streaming
198
  extract_btn.click(
199
+ fn=ocr_image_gradio_stream,
200
  inputs=[image_input, max_tokens_slider],
201
  outputs=output_text,
202
  show_progress=True
203
  )
204
 
205
  image_input.change(
206
+ fn=ocr_image_gradio_stream,
207
  inputs=[image_input, max_tokens_slider],
208
  outputs=output_text,
209
  show_progress=True
 
212
  # Add model information section
213
  with gr.Accordion("About Nanonets-OCR-s", open=False):
214
  gr.Markdown("""
215
+ ## Nanonets-OCR-s
216
+
217
+ Nanonets-OCR-s is a powerful, state-of-the-art image-to-markdown OCR model that goes far beyond traditional text extraction.
218
+ It transforms documents into structured markdown with intelligent content recognition and semantic tagging, making it ideal
219
+ for downstream processing by Large Language Models (LLMs).
220
+
221
+ ### Key Features
222
+
223
+ - **LaTeX Equation Recognition**: Automatically converts mathematical equations and formulas into properly formatted LaTeX syntax.
224
+ It distinguishes between inline `($...$)` and display `($$...$$)` equations.
225
+
226
+ - **Intelligent Image Description**: Describes images within documents using structured `<img>` tags, making them digestible
227
+ for LLM processing. It can describe various image types, including logos, charts, graphs and so on, detailing their content,
228
+ style, and context.
229
+
230
+ - **Signature Detection & Isolation**: Identifies and isolates signatures from other text, outputting them within a `<signature>` tag.
231
+ This is crucial for processing legal and business documents.
232
+
233
+ - **Watermark Extraction**: Detects and extracts watermark text from documents, placing it within a `<watermark>` tag.
234
+
235
+ - **Smart Checkbox Handling**: Converts form checkboxes and radio buttons into standardized Unicode symbols (☐, ☑, ☒)
236
+ for consistent and reliable processing.
237
+
238
+ - **Complex Table Extraction**: Accurately extracts complex tables from documents and converts them into both markdown
239
+ and HTML table formats.
240
+ """)
241
 
242
  if __name__ == "__main__":
243
+ # Configure for concurrent processing with streaming support
244
+ demo.queue(
245
+ max_size=20, # Maximum queue size
246
+ concurrency_count=4, # Allow 4 concurrent requests
247
+ status_update_rate=0.1, # Update status every 100ms for better streaming experience
248
+ ).launch(
249
+ server_name="0.0.0.0",
250
+ server_port=7860,
251
+ show_error=True,
252
+ share=False
253
+ )