Spaces:
Running
Running
Add UI component to allow the user to enable or disable reset_clip per frame
Browse files
app.py
CHANGED
@@ -107,7 +107,8 @@ def get_weight_files(size):
|
|
107 |
|
108 |
# Caption using cached llm with real-time debug logs
|
109 |
|
110 |
-
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt
|
|
|
111 |
debug_msgs = []
|
112 |
timestamp = time.strftime('%H:%M:%S')
|
113 |
debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
|
@@ -145,10 +146,12 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
|
|
145 |
timestamp = time.strftime('%H:%M:%S')
|
146 |
debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
|
147 |
# re-init handler for image
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
151 |
|
|
|
152 |
debug_msgs.append(f"[{timestamp}] CPU count = {os.cpu_count()}")
|
153 |
|
154 |
t_start = time.time()
|
@@ -209,15 +212,16 @@ def main():
|
|
209 |
interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
|
210 |
sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
|
211 |
usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
|
|
|
212 |
cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
|
213 |
cap = gr.Textbox(interactive=False, label='Caption')
|
214 |
log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')
|
215 |
|
216 |
cam.stream(
|
217 |
fn=caption_frame,
|
218 |
-
inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
|
219 |
outputs=[cap, log_box],
|
220 |
-
time_limit=600
|
221 |
)
|
222 |
|
223 |
demo.launch()
|
|
|
107 |
|
108 |
# Caption using cached llm with real-time debug logs
|
109 |
|
110 |
+
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt,
|
111 |
+
reset_clip: bool):
|
112 |
debug_msgs = []
|
113 |
timestamp = time.strftime('%H:%M:%S')
|
114 |
debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
|
|
|
146 |
timestamp = time.strftime('%H:%M:%S')
|
147 |
debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
|
148 |
# re-init handler for image
|
149 |
+
if reset_clip:
|
150 |
+
model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=False)
|
151 |
+
timestamp = time.strftime('%H:%M:%S')
|
152 |
+
debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
|
153 |
|
154 |
+
timestamp = time.strftime('%H:%M:%S')
|
155 |
debug_msgs.append(f"[{timestamp}] CPU count = {os.cpu_count()}")
|
156 |
|
157 |
t_start = time.time()
|
|
|
212 |
interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
|
213 |
sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
|
214 |
usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
|
215 |
+
reset_clip = gr.Checkbox(value=True, label="Reset CLIP handler each frame")
|
216 |
cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
|
217 |
cap = gr.Textbox(interactive=False, label='Caption')
|
218 |
log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')
|
219 |
|
220 |
cam.stream(
|
221 |
fn=caption_frame,
|
222 |
+
inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip],
|
223 |
outputs=[cap, log_box],
|
224 |
+
time_limit=600,
|
225 |
)
|
226 |
|
227 |
demo.launch()
|