Adun committed
Commit 2e85285 · verified · 1 Parent(s): 34d8f3a

Upload app.py

Files changed (1)
  1. app.py +362 -143
app.py CHANGED
@@ -1,143 +1,362 @@
1
- import base64
2
- from io import BytesIO
3
- import json
4
- import os
5
- from openai import OpenAI
6
- from dotenv import load_dotenv
7
- from typhoon_ocr import prepare_ocr_messages
8
- import gradio as gr
9
- from PIL import Image
10
-
11
- load_dotenv()
12
-
13
- openai = OpenAI(base_url=os.getenv("TYPHOON_BASE_URL"), api_key=os.getenv("TYPHOON_API_KEY"))
14
-
15
- theme = gr.themes.Soft(
16
- primary_hue=gr.themes.Color(
17
- c50="#f7f7fd",
18
- c100="#dfdef8",
19
- c200="#c4c1f2",
20
- c300="#a29eea",
21
- c400="#8f8ae6",
22
- c500="#756fe0",
23
- c600="#635cc1",
24
- c700="#4f4a9b",
25
- c800="#433f83",
26
- c900="#302d5e",
27
- c950="#302d5e",
28
- ),
29
- secondary_hue="rose",
30
- neutral_hue="stone",
31
- )
32
-
33
- def process_pdf(pdf_or_image_file, task_type, page_number):
34
- if pdf_or_image_file is None:
35
- return None, "No file uploaded"
36
-
37
- orig_filename = pdf_or_image_file.name
38
-
39
- try:
40
- # Use the new simplified function to prepare OCR messages with page number
41
- messages = prepare_ocr_messages(
42
- pdf_or_image_path=orig_filename,
43
- task_type=task_type,
44
- target_image_dim=1800,
45
- target_text_length=8000,
46
- page_num=page_number if page_number else 1
47
- )
48
-
49
- # Extract the image from the message content for display
50
- image_url = messages[0]["content"][1]["image_url"]["url"]
51
- image_base64 = image_url.replace("data:image/png;base64,", "")
52
- image_pil = Image.open(BytesIO(base64.b64decode(image_base64)))
53
-
54
- # Send messages to OpenAI compatible API
55
- response = openai.chat.completions.create(
56
- model=os.getenv("TYPHOON_OCR_MODEL"),
57
- messages=messages,
58
- max_tokens=16384,
59
- extra_body={
60
- "repetition_penalty": 1.2,
61
- "temperature": 0.1,
62
- "top_p": 0.6,
63
- },
64
- )
65
- text_output = response.choices[0].message.content
66
-
67
- # Try to parse the output as JSON containing 'natural_text'
68
- try:
69
- json_data = json.loads(text_output)
70
- markdown_out = json_data.get('natural_text', "").replace("<figure>", "").replace("</figure>", "")
71
- except Exception as e:
72
- markdown_out = f"⚠️ Could not extract `natural_text` from output.\nError: {str(e)}"
73
-
74
- return image_pil, markdown_out
75
-
76
- except Exception as e:
77
- return None, f"Error processing file: {str(e)}"
78
-
79
-
80
- # Build the Gradio UI.
81
- with gr.Blocks(theme=theme) as demo:
82
- title = gr.HTML("""
83
- <h1>Typhoon OCR</h1>
84
- <ul>
85
- <li>🤗 <b>Model weights</b>: <a href="https://huggingface.co/scb10x/typhoon-ocr-7b" target="_blank">https://huggingface.co/scb10x/typhoon-ocr-7b</a></li>
86
- </ul>
87
- <br />
88
- <details>
89
- <summary><strong>Disclaimer</strong></summary>
90
- The responses generated by this Artificial Intelligence (AI) system are autonomously constructed and do not necessarily reflect the views or positions of the developing organizations, their affiliates, or any of their employees. These AI-generated responses do not represent those of the organizations. The organizations do not endorse, support, sanction, encourage, verify, or agree with the comments, opinions, or statements generated by this AI. The information produced by this AI is not intended to malign any religion, ethnic group, club, organization, company, individual, anyone, or anything. It is not the intent of the organizations to malign any group or individual. The AI operates based on its programming and training data and its responses should not be interpreted as the explicit intent or opinion of the organizations.
91
- </details>
92
- <br />
93
- <details>
94
- <summary><strong>Terms of use</strong></summary>
95
- By using this service, users are required to agree to the following terms: The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. Vision language models are prone to hallucinations to a greater extent compared to text-only LLMs.
96
- </details>
97
- <br />
98
- <details>
99
- <summary><strong>License</strong></summary>
100
- This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses. The content of this project itself is licensed under the Apache license 2.0.
101
- </details>
102
- """)
103
- with gr.Row():
104
- with gr.Column(scale=1):
105
- # Update file_types to accept PDF as well as common image formats.
106
- pdf_input = gr.File(label="📄 Upload Image file or PDF file", file_types=[".pdf", ".png", ".jpg", ".jpeg"])
107
-
108
- with gr.Group(elem_classes=["task-background"]):
109
- task_dropdown = gr.Radio(["default", "structure"], label="🎯 Select Task", value="default")
110
- gr.HTML("""
111
- <p><b>default</b>: This mode works for most cases and is recommended for files without a clear template such as infographics.</p>
112
- <p><b>structure</b>: This mode offers improved performance for complex layout documents such as those containing images, tables and forms.</p>
113
- <p>We recommend trying both and seeing which one works better for your use case.</p>
114
- """, elem_classes=["task-dropdown-info"])
115
- demo.css = """
116
- .task-background {
117
- background: var(--block-background-fill) !important;
118
-
119
- }
120
- .task-background > * {
121
- background: var(--block-background-fill) !important;
122
- }
123
- .task-dropdown-info {
124
- padding: 0 16px;
125
- font-size: 12px;
126
- }
127
- """
128
- page_number = gr.Number(label="📄 Page Number (for PDFs only)", value=1, minimum=1, step=1)
129
- run_button = gr.Button("🚀 Run")
130
- image_output = gr.Image(label="📸 Preview Image", type="pil")
131
- with gr.Column(scale=2):
132
- markdown_output = gr.Markdown(label='Markdown Result', show_label=True)
133
-
134
-
135
- # Connect the UI inputs to the processing function.
136
- run_button.click(
137
- fn=process_pdf,
138
- inputs=[pdf_input, task_dropdown, page_number],
139
- outputs=[image_output, markdown_output]
140
- )
141
-
142
- # Launch the Gradio demo (without a public share link)
143
- demo.launch(share=False)
1
+ import os
2
+ import random
3
+ import uuid
4
+ import json
5
+ import time
6
+ import asyncio
7
+ from threading import Thread
8
+
9
+ import gradio as gr
10
+ import spaces
11
+ import torch
12
+ import numpy as np
13
+ from PIL import Image, ImageOps
14
+ import cv2
15
+
16
+ from transformers import (
17
+ Qwen2VLForConditionalGeneration,
18
+ Qwen2_5_VLForConditionalGeneration,
19
+ AutoModelForVision2Seq,
20
+ AutoProcessor,
21
+ TextIteratorStreamer,
22
+ )
23
+ from transformers.image_utils import load_image
24
+
25
+ from docling_core.types.doc import DoclingDocument, DocTagsDocument
26
+
27
+ import re
28
+ import ast
29
+ import html
30
+
31
+ # Constants for text generation
32
+ MAX_MAX_NEW_TOKENS = 2048
33
+ DEFAULT_MAX_NEW_TOKENS = 1024
34
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
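+ # Note: MAX_INPUT_TOKEN_LENGTH is read from the environment but is not referenced elsewhere in this file.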
35
+
36
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
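+ # The models below are loaded onto this device in float16; without a CUDA GPU this falls back to CPU and will be very slow.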
37
+
38
+ # Load Adun/typhoon_ocr-7B-v1.4
39
+ MODEL_ID_M = "Adun/typhoon_ocr-7B-v1.4"
40
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
41
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
42
+ MODEL_ID_M,
43
+ trust_remote_code=True,
44
+ torch_dtype=torch.float16
45
+ ).to(device).eval()
46
+
47
+
48
+ # Load typhoon-ocr-7b
49
+ MODEL_ID_L = "scb10x/typhoon-ocr-7b"
50
+ processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
51
+ model_l = Qwen2_5_VLForConditionalGeneration.from_pretrained(
52
+ MODEL_ID_L,
53
+ trust_remote_code=True,
54
+ torch_dtype=torch.float16
55
+ ).to(device).eval()
56
+
57
+
58
+ # Preprocessing helpers (used only by the commented-out SmolDocling-256M paths below)
59
+ def add_random_padding(image, min_percent=0.1, max_percent=0.10):
60
+ """Add random padding to an image based on its size."""
61
+ image = image.convert("RGB")
62
+ width, height = image.size
63
+ pad_w_percent = random.uniform(min_percent, max_percent)
64
+ pad_h_percent = random.uniform(min_percent, max_percent)
65
+ pad_w = int(width * pad_w_percent)
66
+ pad_h = int(height * pad_h_percent)
67
+ corner_pixel = image.getpixel((0, 0)) # Top-left corner
68
+ padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
69
+ return padded_image
70
+
71
+ def normalize_values(text, target_max=500):
72
+ """Normalize numerical values in text to a target maximum."""
73
+ def normalize_list(values):
74
+ max_value = max(values) if values else 1
75
+ return [round((v / max_value) * target_max) for v in values]
76
+
77
+ def process_match(match):
78
+ num_list = ast.literal_eval(match.group(0))
79
+ normalized = normalize_list(num_list)
80
+ return "".join([f"<loc_{num}>" for num in normalized])
81
+
82
+ pattern = r"\[([\d\.\s,]+)\]"
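+ # The pattern above matches bracketed numeric lists such as "[12, 34.5, 6]"; each match is rewritten as <loc_N> tags.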
83
+ normalized_text = re.sub(pattern, process_match, text)
84
+ return normalized_text
85
+
86
+ def downsample_video(video_path):
87
+ """Downsample a video to evenly spaced frames, returning PIL images with timestamps."""
88
+ vidcap = cv2.VideoCapture(video_path)
89
+ total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
90
+ fps = vidcap.get(cv2.CAP_PROP_FPS)
91
+ frames = []
92
+ frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
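+ # Ten evenly spaced frame indices are sampled across the whole clip.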
93
+ for i in frame_indices:
94
+ vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
95
+ success, image = vidcap.read()
96
+ if success:
97
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
98
+ pil_image = Image.fromarray(image)
99
+ timestamp = round(i / fps, 2)
100
+ frames.append((pil_image, timestamp))
101
+ vidcap.release()
102
+ return frames
103
+
104
+ @spaces.GPU
105
+ def generate_image(model_name: str, text: str, image: Image.Image,
106
+ max_new_tokens: int = 1024,
107
+ temperature: float = 0.6,
108
+ top_p: float = 0.9,
109
+ top_k: int = 50,
110
+ repetition_penalty: float = 1.2):
111
+ """Generate responses for image input using the selected model."""
112
+ # Model selection
113
+ if model_name == "Adun/typhoon_ocr-7B-v1.4":
114
+ processor = processor_m
115
+ model = model_m
116
+ # elif model_name == "MonkeyOCR-Recognition":
117
+ # processor = processor_g
118
+ # model = model_g
119
+ # elif model_name == "SmolDocling-256M-preview":
120
+ # processor = processor_x
121
+ # model = model_x
122
+ elif model_name == "Typhoon-OCR-7B":
123
+ processor = processor_l
124
+ model = model_l
125
+ else:
126
+ yield "Invalid model selected."
127
+ return
128
+
129
+ if image is None:
130
+ yield "Please upload an image."
131
+ return
132
+
133
+ # Prepare images as a list (single image for image inference)
134
+ images = [image]
135
+
136
+ # # SmolDocling-256M specific preprocessing
137
+ # if model_name == "SmolDocling-256M-preview":
138
+ # if "OTSL" in text or "code" in text:
139
+ # images = [add_random_padding(img) for img in images]
140
+ # if "OCR at text at" in text or "Identify element" in text or "formula" in text:
141
+ # text = normalize_values(text, target_max=500)
142
+
143
+ # Unified message structure for all models
144
+ messages = [
145
+ {
146
+ "role": "user",
147
+ "content": [{"type": "image"} for _ in images] + [
148
+ {"type": "text", "text": text}
149
+ ]
150
+ }
151
+ ]
152
+ prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
153
+ inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
154
+
155
+ # Generation with streaming
156
+ streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
157
+ generation_kwargs = {
158
+ **inputs,
159
+ "streamer": streamer,
160
+ "max_new_tokens": max_new_tokens,
161
+ "temperature": temperature,
162
+ "top_p": top_p,
163
+ "top_k": top_k,
164
+ "repetition_penalty": repetition_penalty,
165
+ }
166
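+ # Run generation in a background thread; the TextIteratorStreamer yields decoded text incrementally to the loop below.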
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
167
+ thread.start()
168
+
169
+ # Stream output and collect full response
170
+ buffer = ""
171
+ full_output = ""
172
+ for new_text in streamer:
173
+ full_output += new_text
174
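+ # Strip the chat end-of-turn token so it does not appear in the streamed output.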
+ buffer += new_text.replace("<|im_end|>", "")
175
+ yield buffer
176
+
177
+ # SmolDocling-256M specific postprocessing
178
+ # if model_name == "SmolDocling-256M-preview":
179
+ # cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
180
+ # if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
181
+ # if "<chart>" in cleaned_output:
182
+ # cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
183
+ # cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
184
+ # doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
185
+ # doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
186
+ # markdown_output = doc.export_to_markdown()
187
+ # yield f"**MD Output:**\n\n{markdown_output}"
188
+ # else:
189
+ # yield cleaned_output
190
+
191
+ @spaces.GPU
192
+ def generate_video(model_name: str, text: str, video_path: str,
193
+ max_new_tokens: int = 1024,
194
+ temperature: float = 0.6,
195
+ top_p: float = 0.9,
196
+ top_k: int = 50,
197
+ repetition_penalty: float = 1.2):
198
+ """Generate responses for video input using the selected model."""
199
+ # Model selection
200
+ if model_name == "Adun/typhoon_ocr-7B-v1.4":
201
+ processor = processor_m
202
+ model = model_m
203
+ # elif model_name == "MonkeyOCR-Recognition":
204
+ # processor = processor_g
205
+ # model = model_g
206
+ # elif model_name == "SmolDocling-256M-preview":
207
+ # processor = processor_x
208
+ # model = model_x
209
+ elif model_name == "Typhoon-OCR-7B":
210
+ processor = processor_l
211
+ model = model_l
212
+ else:
213
+ yield "Invalid model selected."
214
+ return
215
+
216
+ if video_path is None:
217
+ yield "Please upload a video."
218
+ return
219
+
220
+ # Extract frames from video
221
+ frames = downsample_video(video_path)
222
+ images = [frame for frame, _ in frames]
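+ # Frame timestamps are discarded; each sampled frame is passed to the model as a separate image in a single user turn.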
223
+
224
+ # # SmolDocling-256M specific preprocessing
225
+ # if model_name == "SmolDocling-256M-preview":
226
+ # if "OTSL" in text or "code" in text:
227
+ # images = [add_random_padding(img) for img in images]
228
+ # if "OCR at text at" in text or "Identify element" in text or "formula" in text:
229
+ # text = normalize_values(text, target_max=500)
230
+
231
+ # Unified message structure for all models
232
+ messages = [
233
+ {
234
+ "role": "user",
235
+ "content": [{"type": "image"} for _ in images] + [
236
+ {"type": "text", "text": text}
237
+ ]
238
+ }
239
+ ]
240
+ prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
241
+ inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
242
+
243
+ # Generation with streaming
244
+ streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
245
+ generation_kwargs = {
246
+ **inputs,
247
+ "streamer": streamer,
248
+ "max_new_tokens": max_new_tokens,
249
+ "temperature": temperature,
250
+ "top_p": top_p,
251
+ "top_k": top_k,
252
+ "repetition_penalty": repetition_penalty,
253
+ }
254
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
255
+ thread.start()
256
+
257
+ # Stream output and collect full response
258
+ buffer = ""
259
+ full_output = ""
260
+ for new_text in streamer:
261
+ full_output += new_text
262
+ buffer += new_text.replace("<|im_end|>", "")
263
+ yield buffer
264
+
265
+ # # SmolDocling-256M specific postprocessing
266
+ # if model_name == "SmolDocling-256M-preview":
267
+ # cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
268
+ # if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
269
+ # if "<chart>" in cleaned_output:
270
+ # cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
271
+ # cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
272
+ # doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
273
+ # doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
274
+ # markdown_output = doc.export_to_markdown()
275
+ # yield f"**MD Output:**\n\n{markdown_output}"
276
+ # else:
277
+ # yield cleaned_output
278
+
279
+ # Define examples for image and video inference
280
+ image_examples = [
281
+ ["OCR the image", "images/2.jpg"],
282
+ ["Convert this page to docling", "images/1.png"],
283
+ ["Convert this page to docling", "images/3.png"],
284
+ ["Convert chart to OTSL.", "images/4.png"],
285
+ ["Convert code to text", "images/5.jpg"],
286
+ ["Convert this table to OTSL.", "images/6.jpg"],
287
+ ["Convert formula to late.", "images/7.jpg"],
288
+ ]
289
+
290
+ video_examples = [
291
+ ["Explain the video in detail.", "videos/1.mp4"],
292
+ ["Explain the video in detail.", "videos/2.mp4"]
293
+ ]
294
+
295
+ css = """
296
+ .submit-btn {
297
+ background-color: #2980b9 !important;
298
+ color: white !important;
299
+ }
300
+ .submit-btn:hover {
301
+ background-color: #3498db !important;
302
+ }
303
+ """
304
+
305
+ # Create the Gradio Interface
306
+ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
307
+ gr.Markdown("# **[Multimodal OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
308
+ with gr.Row():
309
+ with gr.Column():
310
+ with gr.Tabs():
311
+ with gr.TabItem("Image Inference"):
312
+ image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
313
+ image_upload = gr.Image(type="pil", label="Image")
314
+ image_submit = gr.Button("Submit", elem_classes="submit-btn")
315
+ gr.Examples(
316
+ examples=image_examples,
317
+ inputs=[image_query, image_upload]
318
+ )
319
+ with gr.TabItem("Video Inference"):
320
+ video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
321
+ video_upload = gr.Video(label="Video")
322
+ video_submit = gr.Button("Submit", elem_classes="submit-btn")
323
+ gr.Examples(
324
+ examples=video_examples,
325
+ inputs=[video_query, video_upload]
326
+ )
327
+ with gr.Accordion("Advanced options", open=False):
328
+ max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
329
+ temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
330
+ top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
331
+ top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
332
+ repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
333
+ with gr.Column():
334
+ output = gr.Textbox(label="Output", interactive=False, lines=3, scale=2)
335
+ model_choice = gr.Radio(
336
+ choices=["Adun/typhoon_ocr-7B-v1.4", "Typhoon-OCR-7B"],
337
+ label="Select Model",
338
+ value="Nanonets-OCR-s"
339
+ )
340
+
341
+ #gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
342
+
343
+ # gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
344
+ # gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
345
+ # gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
346
+ gr.Markdown("> [Typhoon-OCR-7B-1.4 finetuned by Aun](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
347
+ gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
348
+ gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
349
+
350
+ image_submit.click(
351
+ fn=generate_image,
352
+ inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
353
+ outputs=output
354
+ )
355
+ video_submit.click(
356
+ fn=generate_video,
357
+ inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
358
+ outputs=output
359
+ )
360
+
361
+ if __name__ == "__main__":
362
+ demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)