Spaces:
Running
Running
add verbose mode switch
Browse files
app.py
CHANGED
@@ -45,6 +45,7 @@ model_cache = {
|
|
45 |
'size': None,
|
46 |
'model_file': None,
|
47 |
'clip_file': None,
|
|
|
48 |
'llm': None
|
49 |
}
|
50 |
|
@@ -86,19 +87,22 @@ class SmolVLM2ChatHandler(Llava15ChatHandler):
|
|
86 |
"{% if add_generation_prompt %}Assistant:{% endif %}"
|
87 |
)
|
88 |
|
89 |
-
# Load and cache LLM (only on dropdown change)
|
90 |
-
|
91 |
-
|
92 |
-
if (model_cache['size'], model_cache['model_file'], model_cache['clip_file']) != (size, model_file, clip_file):
|
93 |
mf, cf = ensure_weights(size, model_file, clip_file)
|
94 |
-
handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=
|
95 |
-
llm = Llama(
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
|
|
|
|
99 |
|
100 |
# Build weight filename lists
|
101 |
-
|
102 |
def get_weight_files(size):
|
103 |
cfg = MODELS[size]
|
104 |
model_files = [f"{cfg['model_prefix']}.{v}.gguf" for v in cfg['model_variants']]
|
@@ -106,14 +110,13 @@ def get_weight_files(size):
|
|
106 |
return model_files, clip_files
|
107 |
|
108 |
# Caption using cached llm with real-time debug logs
|
109 |
-
|
110 |
-
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt,
|
111 |
-
reset_clip: bool):
|
112 |
debug_msgs = []
|
113 |
timestamp = time.strftime('%H:%M:%S')
|
|
|
|
|
114 |
debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
|
115 |
|
116 |
-
# show which weight files we’re using this run
|
117 |
timestamp = time.strftime('%H:%M:%S')
|
118 |
debug_msgs.append(f"[{timestamp}] Using model weights: {model_file}")
|
119 |
debug_msgs.append(f"[{timestamp}] Using CLIP weights: {clip_file}")
|
@@ -145,9 +148,8 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
|
|
145 |
|
146 |
timestamp = time.strftime('%H:%M:%S')
|
147 |
debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
|
148 |
-
# re-init handler for image
|
149 |
if reset_clip:
|
150 |
-
model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=
|
151 |
timestamp = time.strftime('%H:%M:%S')
|
152 |
debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
|
153 |
|
@@ -176,10 +178,10 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
|
|
176 |
return content, "\n".join(debug_msgs)
|
177 |
|
178 |
# Gradio UI
|
179 |
-
|
180 |
def main():
|
181 |
logging.basicConfig(level=logging.INFO)
|
182 |
default = '2.2B'
|
|
|
183 |
mf, cf = get_weight_files(default)
|
184 |
|
185 |
with gr.Blocks() as demo:
|
@@ -188,38 +190,46 @@ def main():
|
|
188 |
size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
|
189 |
model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
|
190 |
clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
|
|
|
191 |
|
192 |
-
|
193 |
-
def on_size_change(sz):
|
194 |
mlist, clist = get_weight_files(sz)
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
gr.update(choices=clist, value=clist[0])
|
199 |
-
)
|
200 |
-
# preload with first weights
|
201 |
-
update_llm(sz, mlist[0], clist[0])
|
202 |
-
return update_ui
|
203 |
size_dd.change(
|
204 |
fn=on_size_change,
|
205 |
-
inputs=[size_dd],
|
206 |
outputs=[model_dd, clip_dd]
|
207 |
)
|
208 |
-
model_dd.change(
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
|
212 |
-
interval
|
213 |
-
sys_p
|
214 |
-
usr_p
|
215 |
reset_clip = gr.Checkbox(value=True, label="Reset CLIP handler each frame")
|
216 |
-
cam
|
217 |
-
cap
|
218 |
-
log_box
|
219 |
|
220 |
cam.stream(
|
221 |
fn=caption_frame,
|
222 |
-
inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip],
|
223 |
outputs=[cap, log_box],
|
224 |
time_limit=600,
|
225 |
)
|
|
|
45 |
'size': None,
|
46 |
'model_file': None,
|
47 |
'clip_file': None,
|
48 |
+
'verbose': None,
|
49 |
'llm': None
|
50 |
}
|
51 |
|
|
|
87 |
"{% if add_generation_prompt %}Assistant:{% endif %}"
|
88 |
)
|
89 |
|
90 |
+
# Load and cache LLM (re-created only when the dropdown selection or the
# verbose flag changes, so repeated frames reuse the cached model).
def update_llm(size, model_file, clip_file, verbose_mode):
    """Ensure ``model_cache`` holds a Llama instance for the given settings.

    Parameters:
        size: model-size key into ``MODELS`` (e.g. ``'2.2B'``).
        model_file: decoder GGUF weight filename selected in the UI.
        clip_file: CLIP GGUF weight filename selected in the UI.
        verbose_mode: bool forwarded to both the chat handler and ``Llama``.

    Returns ``None``; the loaded model and its settings are stored in the
    module-level ``model_cache``.
    """
    requested = (size, model_file, clip_file, verbose_mode)
    cached = (model_cache['size'], model_cache['model_file'],
              model_cache['clip_file'], model_cache['verbose'])
    if cached != requested:
        # Resolve/download the weight files; returns local paths.
        # NOTE(review): the cache stores the resolved paths (mf, cf) while the
        # comparison above uses the raw UI filenames — if ensure_weights
        # returns different strings, the cache never hits. Confirm upstream.
        mf, cf = ensure_weights(size, model_file, clip_file)
        handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=verbose_mode)
        llm = Llama(
            model_path=mf,
            chat_handler=handler,
            n_ctx=8192,
            verbose=verbose_mode,
            # os.cpu_count() may return None; fall back before clamping to >= 2
            # (max(2, None) would raise TypeError).
            n_threads=max(2, os.cpu_count() or 1),
        )
        model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf,
                            'verbose': verbose_mode, 'llm': llm})
    return None
|
104 |
|
105 |
# Build weight filename lists
|
|
|
106 |
def get_weight_files(size):
|
107 |
cfg = MODELS[size]
|
108 |
model_files = [f"{cfg['model_prefix']}.{v}.gguf" for v in cfg['model_variants']]
|
|
|
110 |
return model_files, clip_files
|
111 |
|
112 |
# Caption using cached llm with real-time debug logs
|
113 |
+
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt, reset_clip, verbose_mode):
|
|
|
|
|
114 |
debug_msgs = []
|
115 |
timestamp = time.strftime('%H:%M:%S')
|
116 |
+
debug_msgs.append(f"[{timestamp}] Verbose mode: {verbose_mode}")
|
117 |
+
timestamp = time.strftime('%H:%M:%S')
|
118 |
debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
|
119 |
|
|
|
120 |
timestamp = time.strftime('%H:%M:%S')
|
121 |
debug_msgs.append(f"[{timestamp}] Using model weights: {model_file}")
|
122 |
debug_msgs.append(f"[{timestamp}] Using CLIP weights: {clip_file}")
|
|
|
148 |
|
149 |
timestamp = time.strftime('%H:%M:%S')
|
150 |
debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
|
|
|
151 |
if reset_clip:
|
152 |
+
model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=verbose_mode)
|
153 |
timestamp = time.strftime('%H:%M:%S')
|
154 |
debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
|
155 |
|
|
|
178 |
return content, "\n".join(debug_msgs)
|
179 |
|
180 |
# Gradio UI
|
|
|
181 |
def main():
|
182 |
logging.basicConfig(level=logging.INFO)
|
183 |
default = '2.2B'
|
184 |
+
default_verbose = False
|
185 |
mf, cf = get_weight_files(default)
|
186 |
|
187 |
with gr.Blocks() as demo:
|
|
|
190 |
size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
|
191 |
model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
|
192 |
clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
|
193 |
+
verbose_cb= gr.Checkbox(value=default_verbose, label='Verbose Mode')
|
194 |
|
195 |
+
def on_size_change(sz, verbose):
|
|
|
196 |
mlist, clist = get_weight_files(sz)
|
197 |
+
update_llm(sz, mlist[0], clist[0], verbose)
|
198 |
+
return gr.update(choices=mlist, value=mlist[0]), gr.update(choices=clist, value=clist[0])
|
199 |
+
|
|
|
|
|
|
|
|
|
|
|
200 |
size_dd.change(
|
201 |
fn=on_size_change,
|
202 |
+
inputs=[size_dd, verbose_cb],
|
203 |
outputs=[model_dd, clip_dd]
|
204 |
)
|
205 |
+
model_dd.change(
|
206 |
+
fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
|
207 |
+
inputs=[size_dd, model_dd, clip_dd, verbose_cb],
|
208 |
+
outputs=[]
|
209 |
+
)
|
210 |
+
clip_dd.change(
|
211 |
+
fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
|
212 |
+
inputs=[size_dd, model_dd, clip_dd, verbose_cb],
|
213 |
+
outputs=[]
|
214 |
+
)
|
215 |
+
verbose_cb.change(
|
216 |
+
fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
|
217 |
+
inputs=[size_dd, model_dd, clip_dd, verbose_cb],
|
218 |
+
outputs=[]
|
219 |
+
)
|
220 |
+
update_llm(default, mf[0], cf[0], default_verbose)
|
221 |
|
222 |
+
interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
|
223 |
+
sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
|
224 |
+
usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
|
225 |
reset_clip = gr.Checkbox(value=True, label="Reset CLIP handler each frame")
|
226 |
+
cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
|
227 |
+
cap = gr.Textbox(interactive=False, label='Caption')
|
228 |
+
log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')
|
229 |
|
230 |
cam.stream(
|
231 |
fn=caption_frame,
|
232 |
+
inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip, verbose_cb],
|
233 |
outputs=[cap, log_box],
|
234 |
time_limit=600,
|
235 |
)
|