prithivMLmods committed on
Commit 6e2c6fa · verified · 1 Parent(s): b2daeaa

Update app.py

Files changed (1):
  1. app.py +234 -318
app.py CHANGED
@@ -1,387 +1,303 @@
- import spaces
- import json
- import math
  import os
- import traceback
- from io import BytesIO
- from typing import Any, Dict, List, Optional, Tuple, Union
- import re
  import time
  from threading import Thread
- from io import BytesIO
- import uuid
- import tempfile

  import gradio as gr
- import requests
  import torch
- from PIL import Image
- import fitz
  import numpy as np
  import cv2

-
  from transformers import (
      Qwen2_5_VLForConditionalGeneration,
      AutoProcessor,
      TextIteratorStreamer,
-     AutoTokenizer,
  )

- from reportlab.lib.pagesizes import A4
- from reportlab.lib.styles import getSampleStyleSheet
- from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
- from reportlab.lib.units import inch
-
- # --- Constants and Model Setup ---
- MAX_INPUT_TOKEN_LENGTH = 4096
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
- print("torch.__version__ =", torch.__version__)
- print("torch.version.cuda =", torch.version.cuda)
- print("cuda available:", torch.cuda.is_available())
- print("cuda device count:", torch.cuda.device_count())
- if torch.cuda.is_available():
-     print("current device:", torch.cuda.current_device())
-     print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
-
- print("Using device:", device)

- # --- Model Loading ---
  MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_M, trust_remote_code=True, torch_dtype=torch.float16
  ).to(device).eval()

  MODEL_ID_X = "Qwen/Qwen2.5-VL-3B-Instruct"
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
  model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_X, trust_remote_code=True, torch_dtype=torch.float16
  ).to(device).eval()

  MODEL_ID_Q = "prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it"
  processor_q = AutoProcessor.from_pretrained(MODEL_ID_Q, trust_remote_code=True)
  model_q = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_Q, trust_remote_code=True, torch_dtype=torch.float16
  ).to(device).eval()

- MODEL_ID_D = "prithivMLmods/DeepCaption-VLA-7B"
- processor_d = AutoProcessor.from_pretrained(MODEL_ID_D, trust_remote_code=True)
- model_d = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_D, trust_remote_code=True, torch_dtype=torch.float16
  ).to(device).eval()

-
- # --- Video and PDF Utility Functions ---
  def downsample_video(video_path):
      """
-     Downsamples the video to 10 evenly spaced frames.
-     Each frame is returned as a PIL image.
      """
-     try:
-         vidcap = cv2.VideoCapture(video_path)
-         total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-         frames = []
-         # Ensure we don't try to sample more frames than exist
-         num_frames_to_sample = min(10, total_frames)
-         if num_frames_to_sample == 0:
-             vidcap.release()
-             return []
-
-         frame_indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)

-         for i in frame_indices:
-             vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-             success, image = vidcap.read()
-             if success:
-                 image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-                 pil_image = Image.fromarray(image)
-                 frames.append(pil_image)
-         vidcap.release()
-         return frames
-     except Exception as e:
-         print(f"Error processing video: {e}")
-         return []
-
- def generate_and_preview_pdf(media_input: Union[str, Image.Image], text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str, state_media_type: str, state_frames: list):
      """
-     Generates a PDF from an image or video frames, saves it, and creates image previews.
-     Returns the path to the PDF and a list of paths to the preview images.
      """
-     if (media_input is None and not state_frames) or not text_content or not text_content.strip():
-         raise gr.Error("Cannot generate PDF. Media input or text content is missing.")
-
-     images_to_process = []
-     if state_media_type == "video":
-         images_to_process = [Image.fromarray(frame) for frame in state_frames]  # Assuming state_frames are numpy arrays
-     elif isinstance(media_input, Image.Image):
-         images_to_process = [media_input]
-
-     if not images_to_process:
-         raise gr.Error("No images found to generate PDF.")
-
-     # --- 1. Generate the PDF ---
-     temp_dir = tempfile.gettempdir()
-     pdf_filename = os.path.join(temp_dir, f"output_{uuid.uuid4()}.pdf")
-     doc = SimpleDocTemplate(
-         pdf_filename,
-         pagesize=A4,
-         rightMargin=inch, leftMargin=inch,
-         topMargin=inch, bottomMargin=inch
-     )
-     styles = getSampleStyleSheet()
-     style_normal = styles["Normal"]
-     style_normal.fontSize = int(font_size)
-     style_normal.leading = int(font_size) * line_spacing
-     style_normal.alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]
-
-     story = []
-     page_width, _ = A4
-     available_width = page_width - 2 * inch
-     image_widths = {
-         "Small": available_width * 0.3,
-         "Medium": available_width * 0.6,
-         "Large": available_width * 0.9,
-     }
-     img_width = image_widths[image_size]
-
-     for image in images_to_process:
-         img_buffer = BytesIO()
-         image.save(img_buffer, format='PNG')
-         img_buffer.seek(0)
-         img = RLImage(img_buffer, width=img_width, height=image.height * (img_width / image.width))
-         story.append(img)
-         story.append(Spacer(1, 6))  # Add a smaller spacer between frames
-
-     story.append(Spacer(1, 12))
-
-     cleaned_text = re.sub(r'#+\s*', '', text_content).replace("*", "")
-     text_paragraphs = cleaned_text.split('\n')
-
-     for para in text_paragraphs:
-         if para.strip():
-             story.append(Paragraph(para, style_normal))
-
-     doc.build(story)
-
-     # --- 2. Render PDF pages as images for preview ---
-     preview_images = []
-     try:
-         pdf_doc = fitz.open(pdf_filename)
-         for page_num in range(len(pdf_doc)):
-             page = pdf_doc.load_page(page_num)
-             pix = page.get_pixmap(dpi=150)
-             preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
-             pix.save(preview_img_path)
-             preview_images.append(preview_img_path)
-         pdf_doc.close()
-     except Exception as e:
-         print(f"Error generating PDF preview: {e}")

-     return pdf_filename, preview_images

- # --- Core Application Logic ---
  @spaces.GPU
- def process_document_stream(
-     model_name: str,
-     media_input: Union[str, Image.Image],
-     prompt_input: str,
-     max_new_tokens: int,
-     temperature: float,
-     top_p: float,
-     top_k: int,
-     repetition_penalty: float
- ):
      """
-     Main generator function that handles model inference for images or videos.
-     Also returns the type of media and extracted frames for state management.
      """
-     if media_input is None:
-         yield "Please upload an image or video.", "", "none", []
-         return
-     if not prompt_input or not prompt_input.strip():
-         yield "Please enter a prompt.", "", "none", []
-         return
-
-     # --- Model Selection ---
-     if model_name == "Qwen2.5-VL-7B-Instruct": processor, model = processor_m, model_m
-     elif model_name == "Qwen2.5-VL-3B-Instruct": processor, model = processor_x, model_x
-     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it": processor, model = processor_q, model_q
-     elif model_name == "DeepCaption-VLA-7B": processor, model = processor_d, model_d
      else:
-         yield "Invalid model selected.", "", "none", []
          return

-     media_type = "none"
-     saved_frames = []
-
-     # --- Input Processing (Image vs. Video) ---
-     if isinstance(media_input, str):  # It's a video file path
-         media_type = "video"
-         frames = downsample_video(media_input)
-         if not frames:
-             yield "Could not process video file.", "", "none", []
-             return
-         # Convert PIL images to numpy arrays for state to avoid serialization issues
-         saved_frames = [np.array(f) for f in frames]
-         messages = [{"role": "user", "content": [{"type": "text", "text": prompt_input}]}]
-         for frame in frames:
-             messages[0]["content"].append({"type": "image", "image": frame})
-         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         inputs = processor(text=[prompt_full], images=frames, return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
-
-     elif isinstance(media_input, Image.Image):  # It's an image
-         media_type = "image"
-         messages = [{"role": "user", "content": [{"type": "image", "image": media_input}, {"type": "text", "text": prompt_input}]}]
-         prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         inputs = processor(text=[prompt_full], images=[media_input], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
-
-     else:
-         yield "Invalid input type.", "", "none", []
          return

-
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-
      generation_kwargs = {
          **inputs,
          "streamer": streamer,
          "max_new_tokens": max_new_tokens,
          "temperature": temperature,
          "top_p": top_p,
          "top_k": top_k,
          "repetition_penalty": repetition_penalty,
-         "do_sample": True if temperature > 0 else False
      }
-
      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
-
      buffer = ""
      for new_text in streamer:
          buffer += new_text
-         buffer = buffer.replace("<|im_end|>", "")
          time.sleep(0.01)
-         yield buffer, buffer, media_type, saved_frames
-
-     yield buffer, buffer, media_type, saved_frames
-
- # --- Gradio UI Definition ---
- def create_gradio_interface():
-     """Builds and returns the Gradio web interface."""
-     css = """
-     .main-container { max-width: 1400px; margin: 0 auto; }
-     .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
-     .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
-     #gallery { min-height: 400px; }
-     """
-     with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
-         # Hidden state variables to store media type and frames
-         state_media_type = gr.State("none")
-         state_frames = gr.State([])
-
-         gr.HTML("""
-         <div class="title" style="text-align: center">
-             <h1>Qwen2.5-VL Outpost👀</h1>
-             <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
-                 Advanced Vision-Language Models for Image and Video Understanding
-             </p>
-         </div>
-         """)
-
-         with gr.Row():
-             # Left Column (Inputs)
-             with gr.Column(scale=1):
-                 model_choice = gr.Dropdown(
-                     choices=[
-                         "Qwen2.5-VL-7B-Instruct",
-                         "Qwen2.5-VL-3B-Instruct",
-                         "Qwen2.5-VL-7B-Abliterated-Caption-it",
-                         "DeepCaption-VLA-7B"
-                     ],
-                     label="Select Model",
-                     value="Qwen2.5-VL-7B-Instruct"
-                 )
-
-                 prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your prompt")
-                 media_input = gr.File(label="Upload Image or Video", type="filepath")
-
-                 with gr.Accordion("Advanced Settings", open=False):
-                     max_new_tokens = gr.Slider(minimum=512, maximum=4096, value=2048, step=256, label="Max New Tokens")
-                     temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.6)
-                     top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-                     top_k = gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50)
-                     repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
-                     gr.Markdown("### PDF Export Settings")
-                     font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
-                     line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
-                     alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Justified", label="Text Alignment")
-                     image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")
-
-                 process_btn = gr.Button("🚀 Process Media", variant="primary", elem_classes=["process-button"], size="lg")
-                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-
-             # Right Column (Outputs)
-             with gr.Column(scale=2):
-                 with gr.Tabs() as tabs:
-                     with gr.Tab("📝 Extracted Content"):
-                         raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
-                         with gr.Row():
-                             examples = gr.Examples(
-                                 examples=["images/A.jpg", "images/2.jpg", "images/1.jpg", "videos/1.mp4", "videos/2.mp4"],
-                                 inputs=media_input, label="Examples"
-                             )
-                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/Qwen2.5-VL/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")
-
-                     with gr.Tab("📰 README.md"):
-                         with gr.Accordion("(Result.md)", open=True):
-                             markdown_output = gr.Markdown()
-
-                     with gr.Tab("📋 PDF Preview"):
-                         generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
-                         pdf_output_file = gr.File(label="Download Generated PDF", interactive=False)
-                         pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")
-
-         # --- Helper function to handle media input ---
-         def get_media_input(filepath):
-             if filepath is None:
-                 return None
-             # Simple check for common image/video extensions
-             if filepath.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp')):
-                 return Image.open(filepath)
-             elif filepath.lower().endswith(('.mp4', '.mov', '.avi', '.mkv')):
-                 return filepath  # Return path for video
-             return None  # Unsupported file type
-
-         # --- Event Handlers ---
-         def clear_all_outputs():
-             return None, "", "Raw output will appear here.", "", None, None, "none", []
-
-         process_btn.click(
-             fn=lambda *args: process_document_stream(*args),
-             inputs=[model_choice, media_input, prompt_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-             outputs=[raw_output_stream, markdown_output, state_media_type, state_frames]
-         )
-
-         generate_pdf_btn.click(
-             fn=generate_and_preview_pdf,
-             inputs=[media_input, raw_output_stream, font_size, line_spacing, alignment, image_size, state_media_type, state_frames],
-             outputs=[pdf_output_file, pdf_preview_gallery]
-         )
-
-         clear_btn.click(
-             clear_all_outputs,
-             outputs=[media_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery, state_media_type, state_frames]
-         )
-     return demo

  if __name__ == "__main__":
-     demo = create_gradio_interface()
-     demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)

  import os
+ import random
+ import uuid
+ import json
  import time
+ import asyncio
  from threading import Thread

  import gradio as gr
+ import spaces
  import torch
  import numpy as np
+ from PIL import Image
  import cv2

  from transformers import (
      Qwen2_5_VLForConditionalGeneration,
+     AutoModel,
+     AutoTokenizer,
      AutoProcessor,
      TextIteratorStreamer,
  )
+ from transformers.image_utils import load_image

+ # Constants for text generation
+ MAX_MAX_NEW_TOKENS = 2048
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+ # Load Qwen2.5-VL-7B-Instruct
  MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
  processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
  model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
  ).to(device).eval()

+ # Load Qwen2.5-VL-3B-Instruct
  MODEL_ID_X = "Qwen/Qwen2.5-VL-3B-Instruct"
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
  model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_X,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
  ).to(device).eval()

+ # Load Qwen2.5-VL-7B-Abliterated-Caption-it
  MODEL_ID_Q = "prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it"
  processor_q = AutoProcessor.from_pretrained(MODEL_ID_Q, trust_remote_code=True)
  model_q = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_Q,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
  ).to(device).eval()

+ # Load allenai/olmOCR-7B-0825
+ MODEL_ID_F = "allenai/olmOCR-7B-0825"
+ processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
+ model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_F,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
  ).to(device).eval()

  def downsample_video(video_path):
      """
+     Downsamples the video to evenly spaced frames.
+     Each frame is returned as a PIL image along with its timestamp.
      """
+     vidcap = cv2.VideoCapture(video_path)
+     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = vidcap.get(cv2.CAP_PROP_FPS)
+     frames = []
+     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+     for i in frame_indices:
+         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         success, image = vidcap.read()
+         if success:
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             pil_image = Image.fromarray(image)
+             timestamp = round(i / fps, 2)
+             frames.append((pil_image, timestamp))
+     vidcap.release()
+     return frames

+ @spaces.GPU
+ def generate_image(model_name: str, text: str, image: Image.Image,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
      """
+     Generates responses using the selected model for image input.
+     Yields raw text and Markdown-formatted text.
      """
+     if model_name == "Qwen2.5-VL-7B-Instruct":
+         processor = processor_m
+         model = model_m
+     elif model_name == "Qwen2.5-VL-3B-Instruct":
+         processor = processor_x
+         model = model_x
+     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
+         processor = processor_q
+         model = model_q
+     elif model_name == "olmOCR-7B-0825":
+         processor = processor_f
+         model = model_f
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
+         return

+     if image is None:
+         yield "Please upload an image.", "Please upload an image."
+         return

+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image},
+             {"type": "text", "text": text},
+         ]
+     }]
+     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(
+         text=[prompt_full],
+         images=[image],
+         return_tensors="pt",
+         padding=True,
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH
+     ).to(device)
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer, buffer

  @spaces.GPU
+ def generate_video(model_name: str, text: str, video_path: str,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
      """
+     Generates responses using the selected model for video input.
+     Yields raw text and Markdown-formatted text.
      """
+     if model_name == "Qwen2.5-VL-7B-Instruct":
+         processor = processor_m
+         model = model_m
+     elif model_name == "Qwen2.5-VL-3B-Instruct":
+         processor = processor_x
+         model = model_x
+     elif model_name == "Qwen2.5-VL-7B-Abliterated-Caption-it":
+         processor = processor_q
+         model = model_q
+     elif model_name == "olmOCR-7B-0825":
+         processor = processor_f
+         model = model_f
      else:
+         yield "Invalid model selected.", "Invalid model selected."
          return

+     if video_path is None:
+         yield "Please upload a video.", "Please upload a video."
          return

+     frames = downsample_video(video_path)
+     messages = [
+         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+         {"role": "user", "content": [{"type": "text", "text": text}]}
+     ]
+     for frame in frames:
+         image, timestamp = frame
+         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+         messages[1]["content"].append({"type": "image", "image": image})
+     inputs = processor.apply_chat_template(
+         messages,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_dict=True,
+         return_tensors="pt",
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH
+     ).to(device)
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
      generation_kwargs = {
          **inputs,
          "streamer": streamer,
          "max_new_tokens": max_new_tokens,
+         "do_sample": True,
          "temperature": temperature,
          "top_p": top_p,
          "top_k": top_k,
          "repetition_penalty": repetition_penalty,
      }
      thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
      buffer = ""
      for new_text in streamer:
          buffer += new_text
          time.sleep(0.01)
+         yield buffer, buffer
+
+ # Define examples for image and video inference
+ image_examples = [
+     ["Provide a detailed caption for the image.", "images/A.jpg"],
+     ["Explain the pie-chart in detail.", "images/2.jpg"],
+     ["Jsonify Data.", "images/1.jpg"],
+ ]
+
+ video_examples = [
+     ["Explain the ad in detail", "videos/1.mp4"],
+     ["Identify the main actions in the video", "videos/2.mp4"],
+     ["Identify the main scenes in the video", "videos/3.mp4"]
+ ]
+
+ css = """
+ .submit-btn {
+     background-color: #2980b9 !important;
+     color: white !important;
+ }
+ .submit-btn:hover {
+     background-color: #3498db !important;
+ }
+ .canvas-output {
+     border: 2px solid #4682B4;
+     border-radius: 10px;
+     padding: 20px;
+ }
+ """
+
+ # Create the Gradio Interface
+ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+     gr.Markdown("# **[Qwen2.5-VL](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     with gr.Row():
+         with gr.Column():
+             with gr.Tabs():
+                 with gr.TabItem("Image Inference"):
+                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                     image_upload = gr.Image(type="pil", label="Image")
+                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=image_examples,
+                         inputs=[image_query, image_upload]
+                     )
+                 with gr.TabItem("Video Inference"):
+                     video_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query here...")
+                     video_upload = gr.Video(label="Video")
+                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=video_examples,
+                         inputs=[video_query, video_upload]
+                     )
+             with gr.Accordion("Advanced options", open=False):
+                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+
+         with gr.Column():
+             with gr.Column(elem_classes="canvas-output"):
+                 gr.Markdown("## Output")
+                 output = gr.Textbox(label="Raw Output", interactive=False, lines=2, scale=2)
+
+                 with gr.Accordion("(Result.md)", open=False):
+                     markdown_output = gr.Markdown()
+
+             model_choice = gr.Radio(
+                 choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct", "Qwen2.5-VL-7B-Abliterated-Caption-it", "olmOCR-7B-0825"],
+                 label="Select Model",
+                 value="Qwen2.5-VL-7B-Instruct"
+             )
+             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Qwen2.5-VL/discussions)")
+             gr.Markdown("> [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): The Qwen2.5-VL-7B-Instruct model is a multimodal AI model developed by Alibaba Cloud that excels at understanding both text and images. It's a Vision-Language Model (VLM) designed to handle various visual understanding tasks, including image understanding, video analysis, and even multilingual support.")
+             gr.Markdown("> [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction.")
+             gr.Markdown("> [Qwen2.5-VL-7B-Abliterated-Caption-it](https://huggingface.co/prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it): Qwen2.5-VL-7B-Abliterated-Caption-it is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Abliterated Captioning / Uncensored Captioning. This model excels at generating detailed, context-rich, and high-fidelity captions across diverse image categories and variational aspect ratios, offering robust visual understanding without filtering or censorship.")
+             gr.Markdown("> [olmOCR-7B-0825](https://huggingface.co/allenai/olmOCR-7B-0825): olmOCR-7B-0825 is a 7B-parameter open large model designed for OCR tasks, with robust text extraction especially in complex document layouts. It is a multimodal model emphasizing strong document reading and extraction capabilities, combined with vision-language understanding, to support detailed document parsing tasks.")
+             gr.Markdown("> ⚠️ Note: the models in this Space are not guaranteed to perform well on video inference use cases.")
+
+     image_submit.click(
+         fn=generate_image,
+         inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+         outputs=[output, markdown_output]
+     )
+     video_submit.click(
+         fn=generate_video,
+         inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+         outputs=[output, markdown_output]
+     )

  if __name__ == "__main__":
+     demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
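
The new file exposes image inference as a plain generator (generate_image), so it can also be driven outside the Gradio UI. The sketch below is illustrative only and is not part of the commit: it assumes the Space's dependencies (torch, transformers, gradio, spaces, opencv-python, Pillow) are installed, a CUDA device is available, and that importing app.py loads all four checkpoints; sample.jpg is a hypothetical local file.

# smoke_test.py -- minimal sketch for driving the streaming generator directly
from PIL import Image

import app  # importing app.py builds the UI and loads the four models onto the device

image = Image.open("sample.jpg")  # hypothetical test image

# generate_image yields the growing output buffer on each step (once for the raw
# pane and once for the Markdown pane), so keep only the final yield as the result.
final_text = ""
for raw_text, markdown_text in app.generate_image(
    model_name="Qwen2.5-VL-7B-Instruct",
    text="Provide a detailed caption for the image.",
    image=image,
    max_new_tokens=512,
):
    final_text = raw_text

print(final_text)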