prithivMLmods committed (verified)
Commit 8dd2305 · 1 Parent(s): effcf60

Update app.py

Files changed (1):
  1. app.py +63 -75
app.py CHANGED
@@ -131,6 +131,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for image input using the selected model."""
+    # Model selection
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
@@ -151,14 +152,17 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
 
+    # Prepare images as a list (single image for image inference)
     images = [image]
 
+    # SmolDocling-256M specific preprocessing
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
             images = [add_random_padding(img) for img in images]
         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
             text = normalize_values(text, target_max=500)
 
+    # Unified message structure for all models
     messages = [
         {
             "role": "user",
@@ -170,6 +174,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
 
+    # Generation with streaming
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -183,11 +188,13 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
+    # Stream output
     buffer = ""
     for new_text in streamer:
         buffer += new_text.replace("<|im_end|>", "")
         yield buffer, buffer
 
+    # SmolDocling-256M specific postprocessing
     if model_name == "SmolDocling-256M-preview":
         cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
@@ -209,6 +216,7 @@ def generate_video(model_name: str, text: str, video_path: str,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for video input using the selected model."""
+    # Model selection
     if model_name == "Nanonets-OCR-s":
         processor = processor_m
         model = model_m
@@ -229,15 +237,18 @@ def generate_video(model_name: str, text: str, video_path: str,
         yield "Please upload a video.", "Please upload a video."
         return
 
+    # Extract frames from video
     frames = downsample_video(video_path)
     images = [frame for frame, _ in frames]
 
+    # SmolDocling-256M specific preprocessing
     if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
             images = [add_random_padding(img) for img in images]
         if "OCR at text at" in text or "Identify element" in text or "formula" in text:
             text = normalize_values(text, target_max=500)
 
+    # Unified message structure for all models
     messages = [
         {
             "role": "user",
@@ -249,6 +260,7 @@ def generate_video(model_name: str, text: str, video_path: str,
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
 
+    # Generation with streaming
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -262,11 +274,13 @@ def generate_video(model_name: str, text: str, video_path: str,
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
+    # Stream output
     buffer = ""
     for new_text in streamer:
         buffer += new_text.replace("<|im_end|>", "")
         yield buffer, buffer
 
+    # SmolDocling-256M specific postprocessing
     if model_name == "SmolDocling-256M-preview":
         cleaned_output = buffer.replace("<end_of_utterance>", "").strip()
         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
@@ -297,92 +311,63 @@ video_examples = [
     ["Explain the video in detail.", "videos/2.mp4"]
 ]
 
-# Updated CSS with the new Submit button theme
+# Updated CSS with new button theme
 css = """
-.submit-btn {
-  --stone-50: #fafaf9;
-  --stone-800: #292524;
-  --yellow-400: #facc15;
-
-  font-size: 1rem;
+.button {
   cursor: pointer;
-  position: relative;
-  font-family: "Rubik", sans-serif;
+  padding: 1em 2em;
   font-weight: bold;
-  line-height: 1;
-  padding: 0.75rem 1.25rem;
-  transform: translate(-4px, -4px);
-  outline: 2px solid transparent;
-  outline-offset: 5px;
-  border-radius: 9999px;
-  background-color: var(--yellow-400);
-  color: var(--stone-800);
-  transition:
-    transform 150ms ease,
-    box-shadow 150ms ease;
-  text-align: center;
-  box-shadow:
-    0.5px 0.5px 0 0 var(--stone-800),
-    1px 1px 0 0 var(--stone-800),
-    1.5px 1.5px 0 0 var(--stone-800),
-    2px 2px 0 0 var(--stone-800),
-    2.5px 2.5px 0 0 var(--stone-800),
-    3px 3px 0 0 var(--stone-800),
-    0 0 0 2px var(--stone-50),
-    0.5px 0.5px 0 2px var(--stone-50),
-    1px 1px 0 2px var(--stone-50),
-    1.5px 1.5px 0 2px var(--stone-50),
-    2px 2px 0 2px var(--stone-50),
-    2.5px 2.5px 0 2px var(--stone-50),
-    3px 3px 0 2px var(--stone-50),
-    3.5px 3.5px 0 2px var(--stone-50),
-    4px 4px 0 2px var(--stone-50);
-}
-
-.submit-btn:hover {
-  transform: translate(0, 0);
-  box-shadow: 0 0 0 2px var(--stone-50);
+  font-size: 20px;
+  color: #fff;
+  position: relative;
+  overflow: hidden;
+  background: rgba(60, 73, 203, 0.35);
+  box-shadow: 0 0px 32px 0 rgba(31, 38, 135, 0.37);
+  backdrop-filter: blur(14.5px);
+  border: 1px solid rgba(255, 255, 255, 0.18);
+  -webkit-backdrop-filter: blur(14.5px);
 }
 
-.submit-btn:active {
-  transform: translate(0, 2px);
+.button:hover {
+  box-shadow: 0px 0 32px 0 rgba(31, 38, 135, 0.37),
+    0px 0 32px 0 rgba(31, 38, 135, 0.37), 0 0 42px 0px rgba(31, 38, 135, 0.37),
+    0 0 52px 0 rgba(31, 38, 135, 0.37);
+  border: 1px solid rgba(255, 255, 255, 0.58);
 }
 
-.submit-btn:focus-visible {
-  outline-color: var(--yellow-400);
-  outline-style: dashed;
+.button,
+.button::before {
+  display: grid;
+  place-items: center;
+  border-radius: 10px;
+  box-shadow: 0 0px 32px 0 rgba(31, 38, 135, 0.37);
 }
 
-.submit-btn::before {
+.button::before {
   content: "";
   position: absolute;
-  inset: 0;
-  border-radius: 9999px;
-  opacity: 0.5;
-  background-image: radial-gradient(
-      rgb(255 255 255 / 80%) 20%,
-      transparent 20%
-    ),
-    radial-gradient(rgb(255 255 255 / 100%) 20%, transparent 20%);
-  background-position:
-    0 0,
-    4px 4px;
-  background-size: 8px 8px;
-  mix-blend-mode: hard-light;
-  animation: dots 0.5s infinite linear;
+  background: rgba(26, 18, 241, 0.25);
+  width: 90%;
+  height: 80%;
+  backdrop-filter: blur(18.5px);
+  -webkit-backdrop-filter: blur(18.5px);
+  border: 1px solid rgba(255, 255, 255, 0.18);
+  transition: 0.4s;
+}
+
+.button:hover::before {
+  background: rgba(51, 57, 236, 0.4);
+  box-shadow: 1px 1px 2px 0 rgba(31, 38, 135, 0.37),
+    2px 2px 2px 0 rgba(31, 38, 135, 0.37), 0 0px 32px 0 rgba(31, 38, 135, 0.37),
+    0 0px 32px 1px rgba(31, 38, 135, 0.37), 0 0px 32px 0 rgba(31, 38, 135, 0.37);
+  backdrop-filter: blur(5.5px);
+  -webkit-backdrop-filter: blur(5.5px);
+  border-radius: 10px;
+  border: 1px solid rgba(255, 255, 255, 0.18);
 }
 
-@keyframes dots {
-  0% {
-    background-position:
-      0 0,
-      4px 4px;
-  }
-  100% {
-    background-position:
-      8px 0,
-      12px 4px;
-  }
+.button:active::before {
+  transform: scale(0.67);
 }
 
 .canvas-output {
@@ -401,7 +386,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.TabItem("Image Inference"):
             image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
             image_upload = gr.Image(type="pil", label="Image")
-            image_submit = gr.Button("Submit", elem_classes="submit-btn")
+            image_submit = gr.Button("Submit", elem_classes="button")
             gr.Examples(
                 examples=image_examples,
                 inputs=[image_query, image_upload]
@@ -409,7 +394,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
         with gr.TabItem("Video Inference"):
            video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
            video_upload = gr.Video(label="Video")
-            video_submit = gr.Button("Submit", elem_classes="submit-btn")
+            video_submit = gr.Button("Submit", elem_classes="button")
            gr.Examples(
                examples=video_examples,
                inputs=[video_query, video_upload]
@@ -422,9 +407,11 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
            repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
 
        with gr.Column():
+            # Result Canvas with raw and formatted outputs
            with gr.Column(elem_classes="canvas-output"):
                gr.Markdown("## Output")
                raw_output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+
                with gr.Accordion("(Result.md)", open=False):
                    formatted_output = gr.Markdown(label="(Result.md)")
 
@@ -441,6 +428,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
        gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
        gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
+    # Connect submit buttons to generation functions with both outputs
    image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
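
For context, a minimal self-contained sketch (not part of the commit) of the pattern the diff relies on: the custom CSS string is passed to gr.Blocks(css=...), and elem_classes="button" attaches the new class to a specific component, so renaming the class from submit-btn to button is the only code-side change needed. The echo handler and component labels here are placeholders, not the app's real streaming logic.

import gradio as gr

# Stand-in for the glassmorphism rules added in this commit
css = """
.button { font-weight: bold; cursor: pointer; }
"""

def echo(text):
    # Placeholder handler; the real app streams model output instead
    return text

with gr.Blocks(css=css) as demo:
    query = gr.Textbox(label="Query Input")
    output = gr.Textbox(label="Output")
    # elem_classes ties this component to the .button rules defined above
    submit = gr.Button("Submit", elem_classes="button")
    submit.click(fn=echo, inputs=query, outputs=output)

if __name__ == "__main__":
    demo.launch()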