switch to gradio implementation as streamlit + webrtc requires turn server
- README.md +24 -27
- app.py +56 -125
- requirements.txt +4 -4
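For context on the commit message: streamlit-webrtc only ships a STUN entry by default, and on Spaces (where clients sit behind NAT) the connection generally also needs TURN relay credentials. A minimal sketch of what keeping the Streamlit path would have required; the TURN URL and credentials below are placeholders, not real infrastructure:

```python
# Hypothetical sketch only: what keeping streamlit-webrtc would have required.
# A STUN server alone (as in the removed code) fails behind symmetric NAT;
# a TURN relay with credentials is needed. All values below are placeholders.
from streamlit_webrtc import webrtc_streamer, RTCConfiguration

RTC_CONFIG = RTCConfiguration({
    "iceServers": [
        {"urls": ["stun:stun.l.google.com:19302"]},   # STUN: address discovery only
        {
            "urls": ["turn:turn.example.com:3478"],   # placeholder TURN relay
            "username": "demo-user",                  # placeholder credential
            "credential": "demo-secret",
        },
    ]
})

ctx = webrtc_streamer(key="captioning", rtc_configuration=RTC_CONFIG)
```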
README.md
CHANGED

````diff
@@ -1,27 +1,28 @@
 ---
+title: SmolVLM2 Real-Time Captioning Demo
 emoji: 💻
 colorFrom: gray
 colorTo: gray
-sdk: streamlit
+sdk: gradio
+app_file: app.py
 pinned: false
 license: mit
+short_description: Real-time webcam captioning with SmolVLM2 on llama.cpp
+sdk_version: 5.0
+---
 
+# SmolVLM2 Real-Time Captioning Demo
 
-This Hugging Face Spaces app uses **Streamlit** + **WebRTC** to capture your webcam feed every *N* milliseconds and run it through the SmolVLM2 model on your CPU, displaying live captions below each frame.
+This Hugging Face Spaces app uses **Gradio v5 Blocks** to capture your webcam feed every *N* milliseconds and run it through the SmolVLM2 model on your CPU, displaying live captions below each frame.
 
 ## Features
 
+* **CPU-only inference** via `llama-cpp-python` wrapping `llama.cpp`.
+* **Gradio live streaming** for low-latency, browser-native video input.
+* **Adjustable interval slider** (100 ms to 10 s) for frame capture frequency.
 * **Automatic GGUF model download** from Hugging Face Hub when missing.
-* **Debug logging** in the terminal for tracing inference.
+* **Debug logging** in the terminal for tracing each inference step.
 
 ## Setup
@@ -38,38 +39,34 @@
    pip install -r requirements.txt
    ```
 
+3. **(Optional) Pre-download model files**
+   These will be automatically downloaded if absent:
 
+   * `SmolVLM2-500M-Video-Instruct.Q8_0.gguf`
    * `mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf`
 
+   To skip downloads, place both GGUF files in the repo root.
 
 ## Usage
 
 1. **Launch the app**:
 
    ```bash
-   streamlit run app.py
+   python app.py
    ```
 
-2. **Open your browser** at the URL shown (e.g. `http://localhost:8501`).
+2. **Open your browser** at the URL shown in the terminal (e.g. `http://127.0.0.1:7860`).
 
-3. **Allow webcam access** when prompted
+3. **Allow webcam access** when prompted.
 
-4. **Adjust the capture interval** using the slider.
+4. **Adjust the capture interval** using the slider in the UI.
 
-6. **View live captions** in the panel below the video.
+5. **Live captions** will appear below each video frame.
 
 ## File Structure
 
+* `app.py` – Main Gradio v5 Blocks application.
 * `requirements.txt` – Python dependencies.
+* `.gguf` model files (auto-downloaded or user-provided).
 
 ## License
-
-Licensed under the MIT License.
````
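The README's optional pre-download step can be scripted with `huggingface_hub`. A minimal sketch; the `repo_id` here is an assumption for illustration, and the authoritative value lives in `app.py`'s `ensure_models()`:

```python
# Sketch of the optional pre-download step; REPO_ID is assumed, not confirmed
# by this commit. ensure_models() in app.py holds the real download source.
from huggingface_hub import hf_hub_download

REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"  # assumption
for filename in (
    "SmolVLM2-500M-Video-Instruct.Q8_0.gguf",
    "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf",
):
    # local_dir="." places the files in the repo root, where app.py looks first
    hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir=".")
```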
app.py
CHANGED

````diff
@@ -1,17 +1,11 @@
-import streamlit as st
-st.set_page_config(layout="wide")
-
-import av
+import gradio as gr
 import cv2
-import time
 import tempfile
 import os
 from pathlib import Path
 from huggingface_hub import hf_hub_download
-from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration
 from llama_cpp import Llama
-from llama_cpp.llama_chat_format import Llava15ChatHandler, LlamaChatCompletionHandlerRegistry
+from llama_cpp.llama_chat_format import Llava15ChatHandler
 from termcolor import cprint
 
 # ─────────────────────────────────────────
@@ -39,11 +33,6 @@ class SmolVLM2ChatHandler(Llava15ChatHandler):
     "{% if add_generation_prompt %}Assistant:{% endif %}"
 )
 
-# Overwrite any previous registration
-LlamaChatCompletionHandlerRegistry().register_chat_completion_handler(
-    "smolvlm2", SmolVLM2ChatHandler, overwrite=True
-)
-
 # ─────────────────────────────────────────
 # 2) Model & CLIP files – download if missing
 MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
@@ -61,7 +50,6 @@ def ensure_models():
 
 ensure_models()
 
-@st.cache_resource
 def load_llm():
     handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
     return Llama(
@@ -74,122 +62,65 @@ def load_llm():
 llm = load_llm()
 
 # ─────────────────────────────────────────
+# 4) Captioning helper (stateless prompt)
 def caption_frame(frame):
+    # make a writable copy
+    frame = frame.copy()
+    # save frame to temporary file for URI
+    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
         cv2.imwrite(f.name, frame)
         uri = Path(f.name).absolute().as_uri()
 
+        # build a single prompt string
+        messages = [
+            {
+                "role": "system",
+                "content": (
+                    "Focus only on describing the key dramatic action or notable event occurring "
+                    "in this image. Skip general context or scene-setting details unless they are "
+                    "crucial to understanding the main action."
+                ),
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": uri},
+                    {"type": "text", "text": "What is happening in this image?"},
+                ],
+            },
+        ]
+
+        # stateless completion call
+        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
+        llm.reset()                # reset n_tokens back to 0
+        llm._ctx.kv_cache_clear()  # clear any cached key/values
+        resp = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=256,
+            temperature=0.1,
+            stop=["<end_of_utterance>"],
+        )
 
-    return out
+    # extract caption
+    caption = (resp.get("choices", [])[0]["message"].get("content", "") or "").strip()
+    return caption
 
-# ─────────────────────────────────────────
-# 4) Streamlit UI + WebRTC configuration
-st.title("🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
-
-interval_ms = st.slider(
-    "Caption every N ms", min_value=100, max_value=10000, value=3000, step=100
-)
-
-RTC_CONFIG = RTCConfiguration({
-    "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
-})
-
-import concurrent.futures
-
-class CaptionProcessor(VideoProcessorBase):
-    def __init__(self):
-        self.interval = 1.0
-        self.last_time = time.time()
-        self.caption = ""
-        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
-        self.future = None
-
-    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
-        img = frame.to_ndarray(format="bgr24")
-        now = time.time()
-
-        # 1) Schedule a new inference if interval has passed and previous is done
-        if now - self.last_time >= self.interval:
-            self.last_time = now
-            # only submit if there isn't already a running task
-            if self.future is None or self.future.done():
-                # copy the frame so that downstream modifying code can't clash
-                img_copy = img.copy()
-                self.future = self.executor.submit(caption_frame, img_copy)
-
-        # 2) If the background task finished, grab its result
-        if self.future and self.future.done():
-            try:
-                self.caption = self.future.result()
-            except Exception as e:
-                self.caption = f"[error: {e}]"
-            self.future = None
-
-        # 3) Draw the **last** caption onto every frame immediately
-        cv2.putText(
-            img,
-            self.caption or "_…thinking…_",
-            org=(10, img.shape[0] - 20),
-            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
-            fontScale=0.6,
-            color=(255, 255, 255),
-            thickness=2,
-            lineType=cv2.LINE_AA,
-        )
-
-        vp = ctx.video_processor
-        if vp is not None:
-            txt = vp.caption or "_…thinking…_"
-        else:
-            txt = "_…loading…_"
-        placeholder.markdown(f"**Caption:** {txt}")
-        time.sleep(0.1)
-else:
-    st.info("▶️ Click **Start** above to begin streaming")
 
+# ─────────────────────────────────────────
+# 5) Gradio UI (v5 streaming)
+demo = gr.Blocks()
+with demo:
+    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
+    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
+    caption_box = gr.Textbox(interactive=False, label="Caption")
+
+    # stream frames and captions
+    input_img.stream(
+        fn=caption_frame,
+        inputs=[input_img],
+        outputs=[caption_box],
+        stream_every=3,
+        time_limit=600,
+    )
 
+if __name__ == "__main__":
+    demo.launch()
````
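Because the new `caption_frame` takes a plain NumPy frame and returns a string, it can be smoke-tested without launching the Gradio UI. A minimal sketch, assuming a local `test.jpg` (a hypothetical file) and that the module-level `ensure_models()` and `load_llm()` run on import:

```python
# Standalone smoke test for caption_frame; test.jpg is a hypothetical input.
import cv2
from app import caption_frame  # importing app runs ensure_models()/load_llm()

frame = cv2.imread("test.jpg")         # ndarray in BGR order, as cv2 produces
if frame is None:
    raise SystemExit("test.jpg not found")
print(caption_frame(frame))            # one-shot caption for this single frame
```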
requirements.txt
CHANGED

```diff
@@ -1,6 +1,6 @@
-streamlit
-streamlit-webrtc
+gradio>=5.0
+opencv-python
+pillow
 llama-cpp-python
 huggingface-hub
-termcolor
-opencv-python
+termcolor
```