Spaces:
Running
Running
add verbose mode switch
Browse files
app.py
CHANGED
@@ -45,6 +45,7 @@ model_cache = {
|
|
45 |
'size': None,
|
46 |
'model_file': None,
|
47 |
'clip_file': None,
|
|
|
48 |
'llm': None
|
49 |
}
|
50 |
|
@@ -86,19 +87,22 @@ class SmolVLM2ChatHandler(Llava15ChatHandler):
|
|
86 |
"{% if add_generation_prompt %}Assistant:{% endif %}"
|
87 |
)
|
88 |
|
89 |
-
# Load and cache LLM (only on dropdown change)
|
90 |
-
|
91 |
-
|
92 |
-
if (model_cache['size'], model_cache['model_file'], model_cache['clip_file']) != (size, model_file, clip_file):
|
93 |
mf, cf = ensure_weights(size, model_file, clip_file)
|
94 |
-
handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=
|
95 |
-
llm = Llama(
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
# Build weight filename lists
|
101 |
-
|
102 |
def get_weight_files(size):
|
103 |
cfg = MODELS[size]
|
104 |
model_files = [f"{cfg['model_prefix']}.{v}.gguf" for v in cfg['model_variants']]
|
@@ -106,14 +110,13 @@ def get_weight_files(size):
|
|
106 |
return model_files, clip_files
|
107 |
|
108 |
# Caption using cached llm with real-time debug logs
|
109 |
-
|
110 |
-
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt,
|
111 |
-
reset_clip: bool):
|
112 |
debug_msgs = []
|
113 |
timestamp = time.strftime('%H:%M:%S')
|
|
|
|
|
114 |
debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
|
115 |
|
116 |
-
# show which weight files we’re using this run
|
117 |
timestamp = time.strftime('%H:%M:%S')
|
118 |
debug_msgs.append(f"[{timestamp}] Using model weights: {model_file}")
|
119 |
debug_msgs.append(f"[{timestamp}] Using CLIP weights: {clip_file}")
|
@@ -145,9 +148,8 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
|
|
145 |
|
146 |
timestamp = time.strftime('%H:%M:%S')
|
147 |
debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
|
148 |
-
# re-init handler for image
|
149 |
if reset_clip:
|
150 |
-
model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=
|
151 |
timestamp = time.strftime('%H:%M:%S')
|
152 |
debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
|
153 |
|
@@ -176,10 +178,10 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
|
|
176 |
return content, "\n".join(debug_msgs)
|
177 |
|
178 |
# Gradio UI
|
179 |
-
|
180 |
def main():
|
181 |
logging.basicConfig(level=logging.INFO)
|
182 |
default = '2.2B'
|
|
|
183 |
mf, cf = get_weight_files(default)
|
184 |
|
185 |
with gr.Blocks() as demo:
|
@@ -188,38 +190,46 @@ def main():
|
|
188 |
size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
|
189 |
model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
|
190 |
clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
|
|
|
191 |
|
192 |
-
|
193 |
-
def on_size_change(sz):
|
194 |
mlist, clist = get_weight_files(sz)
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
gr.update(choices=clist, value=clist[0])
|
199 |
-
)
|
200 |
-
# preload with first weights
|
201 |
-
update_llm(sz, mlist[0], clist[0])
|
202 |
-
return update_ui
|
203 |
size_dd.change(
|
204 |
fn=on_size_change,
|
205 |
-
inputs=[size_dd],
|
206 |
outputs=[model_dd, clip_dd]
|
207 |
)
|
208 |
-
model_dd.change(
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
-
interval
|
213 |
-
sys_p
|
214 |
-
usr_p
|
215 |
reset_clip = gr.Checkbox(value=True, label="Reset CLIP handler each frame")
|
216 |
-
cam
|
217 |
-
cap
|
218 |
-
log_box
|
219 |
|
220 |
cam.stream(
|
221 |
fn=caption_frame,
|
222 |
-
inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip],
|
223 |
outputs=[cap, log_box],
|
224 |
time_limit=600,
|
225 |
)
|
|
|
45 |
'size': None,
|
46 |
'model_file': None,
|
47 |
'clip_file': None,
|
48 |
+
'verbose': None,
|
49 |
'llm': None
|
50 |
}
|
51 |
|
|
|
87 |
"{% if add_generation_prompt %}Assistant:{% endif %}"
|
88 |
)
|
89 |
|
90 |
+
# Load and cache LLM (re-created only when the dropdown selection or the
# verbose flag changes, so repeated frames reuse the cached model).
def update_llm(size, model_file, clip_file, verbose_mode):
    """Ensure ``model_cache`` holds a Llama instance for the given settings.

    Parameters:
        size: model-size key into ``MODELS`` (e.g. ``'2.2B'``).
        model_file: decoder GGUF weight filename selected in the UI.
        clip_file: CLIP GGUF weight filename selected in the UI.
        verbose_mode: bool forwarded to both the chat handler and ``Llama``.

    Returns ``None``; the loaded model and its settings are stored in the
    module-level ``model_cache``.
    """
    requested = (size, model_file, clip_file, verbose_mode)
    cached = (model_cache['size'], model_cache['model_file'],
              model_cache['clip_file'], model_cache['verbose'])
    if cached != requested:
        # Resolve/download the weight files; returns local paths.
        # NOTE(review): the cache stores the resolved paths (mf, cf) while the
        # comparison above uses the raw UI filenames — if ensure_weights
        # returns different strings, the cache never hits. Confirm upstream.
        mf, cf = ensure_weights(size, model_file, clip_file)
        handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=verbose_mode)
        llm = Llama(
            model_path=mf,
            chat_handler=handler,
            n_ctx=8192,
            verbose=verbose_mode,
            # os.cpu_count() may return None; fall back before clamping to >= 2
            # (max(2, None) would raise TypeError).
            n_threads=max(2, os.cpu_count() or 1),
        )
        model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf,
                            'verbose': verbose_mode, 'llm': llm})
    return None
|
104 |
|
105 |
# Build weight filename lists
|
|
|
106 |
def get_weight_files(size):
|
107 |
cfg = MODELS[size]
|
108 |
model_files = [f"{cfg['model_prefix']}.{v}.gguf" for v in cfg['model_variants']]
|
|
|
110 |
return model_files, clip_files
|
111 |
|
112 |
# Caption using cached llm with real-time debug logs
|
113 |
+
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt, reset_clip, verbose_mode):
|
|
|
|
|
114 |
debug_msgs = []
|
115 |
timestamp = time.strftime('%H:%M:%S')
|
116 |
+
debug_msgs.append(f"[{timestamp}] Verbose mode: {verbose_mode}")
|
117 |
+
timestamp = time.strftime('%H:%M:%S')
|
118 |
debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
|
119 |
|
|
|
120 |
timestamp = time.strftime('%H:%M:%S')
|
121 |
debug_msgs.append(f"[{timestamp}] Using model weights: {model_file}")
|
122 |
debug_msgs.append(f"[{timestamp}] Using CLIP weights: {clip_file}")
|
|
|
148 |
|
149 |
timestamp = time.strftime('%H:%M:%S')
|
150 |
debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
|
|
|
151 |
if reset_clip:
|
152 |
+
model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=verbose_mode)
|
153 |
timestamp = time.strftime('%H:%M:%S')
|
154 |
debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
|
155 |
|
|
|
178 |
return content, "\n".join(debug_msgs)
|
179 |
|
180 |
# Gradio UI
|
|
|
181 |
def main():
|
182 |
logging.basicConfig(level=logging.INFO)
|
183 |
default = '2.2B'
|
184 |
+
default_verbose = False
|
185 |
mf, cf = get_weight_files(default)
|
186 |
|
187 |
with gr.Blocks() as demo:
|
|
|
190 |
size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
|
191 |
model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
|
192 |
clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
|
193 |
+
verbose_cb= gr.Checkbox(value=default_verbose, label='Verbose Mode')
|
194 |
|
195 |
+
def on_size_change(sz, verbose):
|
|
|
196 |
mlist, clist = get_weight_files(sz)
|
197 |
+
update_llm(sz, mlist[0], clist[0], verbose)
|
198 |
+
return gr.update(choices=mlist, value=mlist[0]), gr.update(choices=clist, value=clist[0])
|
199 |
+
|
|
|
|
|
|
|
|
|
|
|
200 |
size_dd.change(
|
201 |
fn=on_size_change,
|
202 |
+
inputs=[size_dd, verbose_cb],
|
203 |
outputs=[model_dd, clip_dd]
|
204 |
)
|
205 |
+
model_dd.change(
|
206 |
+
fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
|
207 |
+
inputs=[size_dd, model_dd, clip_dd, verbose_cb],
|
208 |
+
outputs=[]
|
209 |
+
)
|
210 |
+
clip_dd.change(
|
211 |
+
fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
|
212 |
+
inputs=[size_dd, model_dd, clip_dd, verbose_cb],
|
213 |
+
outputs=[]
|
214 |
+
)
|
215 |
+
verbose_cb.change(
|
216 |
+
fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
|
217 |
+
inputs=[size_dd, model_dd, clip_dd, verbose_cb],
|
218 |
+
outputs=[]
|
219 |
+
)
|
220 |
+
update_llm(default, mf[0], cf[0], default_verbose)
|
221 |
|
222 |
+
interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
|
223 |
+
sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
|
224 |
+
usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
|
225 |
reset_clip = gr.Checkbox(value=True, label="Reset CLIP handler each frame")
|
226 |
+
cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
|
227 |
+
cap = gr.Textbox(interactive=False, label='Caption')
|
228 |
+
log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')
|
229 |
|
230 |
cam.stream(
|
231 |
fn=caption_frame,
|
232 |
+
inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip, verbose_cb],
|
233 |
outputs=[cap, log_box],
|
234 |
time_limit=600,
|
235 |
)
|