File size: 6,904 Bytes
221e4b6
ca97f63
221e4b6
 
 
dd0d47d
221e4b6
 
 
 
ca97f63
221e4b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca97f63
dd0d47d
 
221e4b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd0d47d
221e4b6
 
 
 
 
 
 
 
 
 
636baf9
221e4b6
 
 
 
 
 
 
 
 
 
 
2529cb3
221e4b6
 
 
 
 
 
292fb3c
 
221e4b6
 
 
 
 
292fb3c
 
221e4b6
 
 
 
292fb3c
 
221e4b6
 
292fb3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221e4b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b7ed26
221e4b6
7b7ed26
 
 
 
 
221e4b6
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# app.py
import streamlit as st
st.set_page_config(layout="wide")

import av
import cv2
import time
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration
from llama_cpp import Llama
from llama_cpp.llama_chat_format import LlamaChatCompletionHandlerRegistry, Llava15ChatHandler
from termcolor import cprint

# —————————————————————————————————————————
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    """Llava15ChatHandler subclass that swaps in a SmolVLM2-style prompt template.

    Only ``CHAT_FORMAT`` is overridden. The Jinja template renders the prompt as:

    * a leading ``<|im_start|>``;
    * for every message: the capitalized role, a colon, the message content,
      then ``<end_of_utterance>`` and a newline;
    * a trailing ``Assistant:`` when ``add_generation_prompt`` is true.

    Message content may be either a plain string (emitted verbatim after
    ``": "``) or a list of parts, where ``text`` parts emit their text and
    ``image_url`` parts emit the URL (given directly as a string or as a
    mapping with a ``url`` key) followed by a newline. A user message whose
    first part is an image gets ``":"`` with no trailing space.
    """

    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        # Plain-string content (e.g. a system prompt) must be handled before
        # the part loop: iterating a string yields single characters, none of
        # which has a 'type', so the text would otherwise vanish from the prompt.
        "{% if message['content'] is string %}"
        ": {{ message['content'] }}"
        "{% else %}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )

# Register the handler under the name "smolvlm2", replacing any entry that is
# already registered under that name.
LlamaChatCompletionHandlerRegistry().register_chat_completion_handler(
    "smolvlm2", SmolVLM2ChatHandler, overwrite=True
)

# —————————————————————————————————————————
# 2) Model & CLIP files — download if missing
# File names of the language-model weights and of the CLIP (mmproj) weights.
# Each name is used twice below: as the file requested from its Hub repo and
# as the local path (relative to the working directory) handed to the loader.
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE  = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
# Hugging Face Hub repositories that MODEL_FILE and CLIP_FILE are fetched from.
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO  = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"

def ensure_models():
    """Make the model and CLIP files available at their local paths.

    For each (repo, file name) pair, when nothing usable exists at the file
    name in the working directory, the file is fetched with
    ``hf_hub_download`` and a symlink with that name is created pointing at
    the downloaded path. The model file is handled first, then the CLIP file.

    Raises:
        Whatever ``hf_hub_download`` or ``os.symlink`` raise (e.g. ``OSError``
        when the link cannot be created).
    """
    for repo_id, filename in ((MODEL_REPO, MODEL_FILE), (CLIP_REPO, CLIP_FILE)):
        if os.path.exists(filename):
            continue
        # os.path.exists() follows symlinks, so a dangling link (its target was
        # removed) looks "missing" here, yet os.symlink() would then fail with
        # FileExistsError. Remove the stale link before re-creating it.
        if os.path.islink(filename):
            os.remove(filename)
        path = hf_hub_download(repo_id=repo_id, filename=filename)
        os.symlink(path, filename)


# Runs on every execution of the script, before the model is loaded.
ensure_models()

@st.cache_resource
def load_llm():
    """Construct the ``Llama`` instance used for captioning.

    The model is read from ``MODEL_FILE`` and paired with a
    ``SmolVLM2ChatHandler`` built on ``CLIP_FILE``; the context size is 8192
    tokens and verbose output is disabled for both objects. The function is
    decorated with ``st.cache_resource`` (presumably so script reruns share
    one loaded model; see the Streamlit docs for the exact caching contract).

    Returns:
        The configured ``Llama`` object.
    """
    llama_kwargs = dict(
        model_path=MODEL_FILE,
        chat_handler=SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False),
        n_ctx=8192,
        verbose=False,
    )
    return Llama(**llama_kwargs)


# Module-level model handle used by caption_frame().
llm = load_llm()

# —————————————————————————————————————————
# 3) Helper to run a single frame through the model (with debug)
def caption_frame(frame):
    """Ask the module-level ``llm`` what is happening in one video frame.

    The frame is written to a temporary JPEG, referenced by ``file://`` URI in
    a chat-completion request, and the temporary file is removed again once
    the request has finished (successfully or not).

    Args:
        frame: Image accepted by ``cv2.imwrite``; the caller in this file
            passes a BGR ndarray.

    Returns:
        The stripped text content of the first choice's message, or ``""``
        when the response carries no content.

    Raises:
        RuntimeError: if ``cv2.imwrite`` reports that the frame was not written.
        Exception: anything raised by ``llm.create_chat_completion``.
    """
    # Reserve a unique path and close the handle before writing, so cv2 can
    # open the file by name without competing with our own open handle.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
        tmp_path = f.name

    try:
        if not cv2.imwrite(tmp_path, frame):
            raise RuntimeError(f"cv2.imwrite could not write frame to {tmp_path}")
        uri = Path(tmp_path).absolute().as_uri()

        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": uri}},
                    {"type": "text",      "text": "What is happening in this image?"},
                ],
            },
        ]

        print("DEBUG ▶ caption_frame: invoking LLM")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=128,
            temperature=0.1,
            repeat_penalty=1.1,       # discourage exact token repeats
            stop=["<end_of_utterance>"],
        )
    finally:
        # The file was created with delete=False, so nothing else cleans it
        # up; without this every caption would leave a JPEG in the temp dir.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # best-effort cleanup; never mask the real result or error

    out = (resp["choices"][0].get("message", {}).get("content") or "").strip()
    print(f"DEBUG ▶ LLM returned: {out!r}")
    return out

# —————————————————————————————————————————
# 4) Streamlit UI + WebRTC configuration
# Page heading (the leading emoji was mis-encoded and has been restored).
st.title("🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")

# Captioning period chosen by the user, in milliseconds:
# 100 to 10000 in steps of 100, starting at 3000.
interval_ms = st.slider(
    "Caption every N ms", min_value=100, max_value=10000, value=3000, step=100
)

# WebRTC configuration passed to webrtc_streamer below: one STUN server.
RTC_CONFIG = RTCConfiguration({
    "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
})

import concurrent.futures

class CaptionProcessor(VideoProcessorBase):
    """Video processor that overlays the latest model caption on each frame.

    Captioning is delegated to a single-worker thread pool so that ``recv``
    never waits for the model: it only harvests a finished result, possibly
    schedules the next inference, and draws whatever caption is current.
    """

    # Overlay text used until the first caption arrives. Kept ASCII-only
    # because cv2.putText's Hershey fonts draw unsupported symbols as "?".
    _PLACEHOLDER = "...thinking..."

    def __init__(self):
        # Seconds between inference attempts; the UI code overwrites this
        # with the slider value.
        self.interval = 1.0
        # Monotonic timestamp of the last interval tick (immune to wall-clock
        # adjustments, unlike time.time()).
        self.last_time = time.monotonic()
        # Most recent caption text, or "" before the first result.
        self.caption = ""
        # One worker: at most one inference runs at a time.
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        # Future of the in-flight inference, or None when idle.
        self.future = None

    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
        """Return ``frame`` with the current caption drawn near its bottom edge.

        Args:
            frame: Incoming video frame; converted to a BGR ndarray for drawing.

        Returns:
            A new ``av.VideoFrame`` built from the annotated BGR image.
        """
        img = frame.to_ndarray(format="bgr24")
        now = time.monotonic()

        # 1) Harvest a finished inference first. Doing this before scheduling
        #    guarantees a completed result is never overwritten (and lost) by
        #    the submission of the next task.
        if self.future is not None and self.future.done():
            try:
                self.caption = self.future.result()
            except Exception as e:
                # Surface the failure in the overlay instead of killing the stream.
                self.caption = f"[error: {e}]"
            self.future = None

        # 2) On an interval tick, start a new inference unless one is running.
        #    The tick is consumed either way, so a busy worker skips this round.
        if now - self.last_time >= self.interval:
            self.last_time = now
            if self.future is None:
                # Copy: the overlay below mutates `img` while the worker reads.
                self.future = self.executor.submit(caption_frame, img.copy())

        # 3) Draw the last known caption onto every frame immediately.
        cv2.putText(
            img,
            self.caption or self._PLACEHOLDER,
            org=(10, img.shape[0] - 20),
            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
            fontScale=0.6,
            color=(255, 255, 255),
            thickness=2,
            lineType=cv2.LINE_AA,
        )

        return av.VideoFrame.from_ndarray(img, format="bgr24")

# Create the WebRTC streaming component: video only, using RTC_CONFIG, with
# CaptionProcessor as the factory for the per-stream video processor.
ctx = webrtc_streamer(
    key="smolvlm2-captioner",
    video_processor_factory=CaptionProcessor,
    rtc_configuration=RTC_CONFIG,
    media_stream_constraints={"video": True, "audio": False},
)

# Update the processor interval: the slider value is in milliseconds, the
# processor expects seconds. Skipped while no processor instance is available.
if ctx.video_processor:
    ctx.video_processor.interval = interval_ms / 1000.0

# Single page slot that is rewritten in place with the latest caption text.
placeholder = st.empty()
if not ctx.state.playing:
    # Stream not started yet: show a hint instead of a caption.
    st.info("▢️ Click **Start** above to begin streaming")
else:
    placeholder.markdown("**Caption:** _Waiting for first inference…_")
    # Poll the processor about ten times per second while the stream plays.
    while ctx.state.playing:
        vp = ctx.video_processor
        # No processor yet -> "loading"; processor without a caption -> "thinking".
        txt = "_…loading…_" if vp is None else (vp.caption or "_…thinking…_")
        placeholder.markdown(f"**Caption:** {txt}")
        time.sleep(0.1)