Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Sleeping

App Files Files Community

Luigi committed on Jun 17

Commit

5462ff3

1 Parent(s): a459bee

Add UI component to allow the user to enable or disable reset_clip per frame

Browse files

Files changed (1) hide show

app.py +10 -6

app.py CHANGED Viewed

@@ -107,7 +107,8 @@ def get_weight_files(size):
 # Caption using cached llm with real-time debug logs
-def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
     debug_msgs = []
     timestamp = time.strftime('%H:%M:%S')
     debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
@@ -145,10 +146,12 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
     timestamp = time.strftime('%H:%M:%S')
     debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
     # re-init handler for image
-    model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=False)
-    timestamp = time.strftime('%H:%M:%S')
-    debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
     debug_msgs.append(f"[{timestamp}] CPU count = {os.cpu_count()}")
     t_start = time.time()
@@ -209,15 +212,16 @@ def main():
         interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
         sys_p    = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
         usr_p    = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
         cam      = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
         cap      = gr.Textbox(interactive=False, label='Caption')
         log_box  = gr.Textbox(lines=8, interactive=False, label='Debug Log')
         cam.stream(
             fn=caption_frame,
-            inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
             outputs=[cap, log_box],
-            time_limit=600
         )
     demo.launch()

 # Caption using cached llm with real-time debug logs
+def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt,
+                  reset_clip: bool):
     debug_msgs = []
     timestamp = time.strftime('%H:%M:%S')
     debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
     timestamp = time.strftime('%H:%M:%S')
     debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
     # re-init handler for image
+    if reset_clip:
+        model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=False)
+        timestamp = time.strftime('%H:%M:%S')
+        debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
+    timestamp = time.strftime('%H:%M:%S')
     debug_msgs.append(f"[{timestamp}] CPU count = {os.cpu_count()}")
     t_start = time.time()
         interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
         sys_p    = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
         usr_p    = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
+        reset_clip = gr.Checkbox(value=True, label="Reset CLIP handler each frame")
         cam      = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
         cap      = gr.Textbox(interactive=False, label='Caption')
         log_box  = gr.Textbox(lines=8, interactive=False, label='Debug Log')
         cam.stream(
             fn=caption_frame,
+            inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip],
             outputs=[cap, log_box],
+            time_limit=600,
         )
     demo.launch()