File size: 5,841 Bytes
221e4b6
ca97f63
221e4b6
 
 
dd0d47d
221e4b6
 
 
 
ca97f63
221e4b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca97f63
dd0d47d
 
221e4b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd0d47d
221e4b6
 
 
 
 
 
 
 
 
 
636baf9
221e4b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b7ed26
221e4b6
7b7ed26
 
 
 
 
221e4b6
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# app.py
import streamlit as st
st.set_page_config(layout="wide")

import av
import cv2
import time
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration
from llama_cpp import Llama
from llama_cpp.llama_chat_format import LlamaChatCompletionHandlerRegistry, Llava15ChatHandler
from termcolor import cprint

# —————————————————————————————————————————
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    """``Llava15ChatHandler`` subclass carrying a SmolVLM2-style prompt template.

    Only ``CHAT_FORMAT`` is defined here; everything else comes from the base
    class.
    """

    # Template text assembled from adjacent string literals. Layout, as
    # written below:
    #   <|im_start|>                                   (once, at the start)
    #   <Role>:<parts><end_of_utterance>\n             (once per message)
    #   Assistant:                                     (if add_generation_prompt)
    # NOTE(review): the inner loop walks message['content'] part by part and
    # looks up content['type']; caption_frame() in this file sends the system
    # message content as a plain string — confirm it is normalized before
    # rendering, otherwise the system text may not reach the prompt.
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        # Role label with its first letter upper-cased by the filter.
        "{{ message['role'] | capitalize }}"
        # A user message whose first part is an image gets ":" with no
        # trailing space; every other message gets ": ".
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        # The image reference may be a bare string or a mapping with a 'url'
        # key; either way the URL is emitted followed by a newline.
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        # Every message is closed with the end-of-utterance marker.
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )

# Store the handler under the "smolvlm2" name; overwrite=True is passed so an
# entry already registered under that name does not block this one.
# NOTE(review): the class itself, not an instance, is what gets registered —
# confirm that is what lookups by name are expected to receive.
_chat_handler_registry = LlamaChatCompletionHandlerRegistry()
_chat_handler_registry.register_chat_completion_handler(
    "smolvlm2",
    SmolVLM2ChatHandler,
    overwrite=True,
)

# —————————————————————————————————————————
# 2) Model & CLIP files — download if missing
# Language-model weights: Hub repository, and the file name used both for the
# download and for the local path that load_llm() opens.
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"

# Projector file passed to the chat handler as clip_model_path, same scheme.
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"

def _link_hub_file(repo_id, filename):
    """Download ``filename`` from ``repo_id`` and symlink it into the working directory.

    Does nothing when ``filename`` already resolves to an existing file.
    """
    if os.path.exists(filename):
        return

    cached_path = hf_hub_download(repo_id=repo_id, filename=filename)
    try:
        os.symlink(cached_path, filename)
    except FileExistsError:
        if os.path.exists(filename):
            # Another session created a working link between the check above
            # and the symlink call; nothing left to do.
            return
        # The name is occupied by a dangling symlink: exists() follows links
        # and reports False for those, yet symlink() refuses to overwrite
        # them. Replace the stale link with one to the fresh download.
        os.unlink(filename)
        os.symlink(cached_path, filename)


def ensure_models():
    """Make the model and projector files available under their bare names.

    For each of ``MODEL_FILE`` and ``CLIP_FILE`` that is not already present
    in the working directory, the file is fetched with ``hf_hub_download``
    and a symlink with that name is pointed at the downloaded path.

    A dangling symlink left under either name is replaced, and a link that
    appears concurrently is accepted, instead of failing with
    ``FileExistsError``.
    """
    for repo_id, filename in ((MODEL_REPO, MODEL_FILE), (CLIP_REPO, CLIP_FILE)):
        _link_hub_file(repo_id, filename)

# Make the weight files available before load_llm() opens them below.
ensure_models()

@st.cache_resource
def load_llm():
    """Build the ``Llama`` instance used for captioning.

    The model is opened from ``MODEL_FILE`` with ``n_ctx=8192`` and a
    ``SmolVLM2ChatHandler`` whose clip model path is ``CLIP_FILE``; verbose
    output is switched off on both objects.

    Returns:
        The constructed ``Llama`` object.
    """
    llama_options = {
        "model_path": MODEL_FILE,
        "chat_handler": SmolVLM2ChatHandler(
            clip_model_path=CLIP_FILE,
            verbose=False,
        ),
        "n_ctx": 8192,
        "verbose": False,
    }
    return Llama(**llama_options)

# Module-level model handle; caption_frame() sends its chat requests through it.
llm = load_llm()

# —————————————————————————————————————————
# 3) Helper to run a single frame through the model (with debug)
def caption_frame(frame):
    """Ask the model for a short description of a single video frame.

    The frame is written to a temporary ``.jpg`` file, referenced from the
    chat request through that file's URI, and the file is removed again once
    the request has finished or failed.

    Args:
        frame: Image array accepted by ``cv2.imwrite``; the caller in this
            file passes the ``bgr24`` ndarray of a video frame.

    Returns:
        The stripped message content of the first choice, or ``""`` when the
        response carries no content.
    """
    # Only reserve a unique path here. The handle is closed before OpenCV
    # writes to it, so the write never targets a file that is still open.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        image_path = tmp.name

    try:
        cv2.imwrite(image_path, frame)
        uri = Path(image_path).absolute().as_uri()

        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": uri}},
                    {"type": "text",      "text": "What is happening in this image?"},
                ],
            },
        ]

        print("DEBUG β–Ά caption_frame: invoking LLM")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=128,
            temperature=0.1,
            repeat_penalty=1.1,       # discourage exact token repeats
            stop=["<end_of_utterance>"],
        )
    finally:
        # delete=False leaves cleanup to this function; without it every
        # caption would leave one JPEG behind in the temp directory.
        try:
            os.unlink(image_path)
        except OSError:
            # Best-effort cleanup: never mask the result or the real error.
            pass

    out = (resp["choices"][0].get("message", {}).get("content") or "").strip()
    print(f"DEBUG β–Ά LLM returned: {out!r}")
    return out

# —————————————————————————————————————————
# 4) Streamlit UI + WebRTC configuration
# Page title. The leading emoji is the movie-camera glyph (U+1F3A5); it was
# previously stored as mis-decoded bytes and rendered as garbage.
st.title("🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")

# Captioning period in milliseconds: 100 to 10000 in steps of 100, default 1000.
interval_ms = st.slider(
    "Caption every N ms", min_value=100, max_value=10000, value=1000, step=100
)

# WebRTC configuration with a single STUN entry in its ICE server list.
RTC_CONFIG = RTCConfiguration({
    "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
})

class CaptionProcessor(VideoProcessorBase):
    """Video processor that captions incoming frames at a fixed cadence.

    Frames are returned with their pixel data unchanged. Whenever at least
    ``interval`` seconds have passed since the previous caption finished, the
    current frame is sent to ``caption_frame`` and the result is stored in
    ``caption``.
    """

    def __init__(self):
        super().__init__()
        # Minimum number of seconds between two captions; the script
        # overwrites this with the slider value.
        self.interval = 1.0
        # time.monotonic() reading taken when the last caption finished
        # (or at construction, before any caption was made).
        self.last_time = time.monotonic()
        # Most recent caption text; empty until the first inference returns.
        self.caption = ""

    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
        """Pass ``frame`` through, refreshing the caption when it is due.

        Args:
            frame: Incoming video frame.

        Returns:
            A new ``av.VideoFrame`` built from the same ``bgr24`` pixel data.
        """
        img = frame.to_ndarray(format="bgr24")
        # A monotonic clock keeps the interval immune to wall-clock jumps.
        if time.monotonic() - self.last_time >= self.interval:
            print("DEBUG β–Ά CaptionProcessor.recv: time reached, generating caption")
            try:
                self.caption = caption_frame(img)
            finally:
                # caption_frame() blocks for the whole inference. Restarting
                # the timer only once it returns means an inference slower
                # than the interval is not followed by another one on the
                # very next frame; the same applies after a failure.
                self.last_time = time.monotonic()
        return av.VideoFrame.from_ndarray(img, format="bgr24")

# Start the WebRTC component: video only, with CaptionProcessor as the
# factory for the video processor.
ctx = webrtc_streamer(
    key="smolvlm2-captioner",
    media_stream_constraints={"video": True, "audio": False},
    rtc_configuration=RTC_CONFIG,
    video_processor_factory=CaptionProcessor,
)

# Hand the slider value (milliseconds) to the live processor, as seconds.
active_processor = ctx.video_processor
if active_processor:
    active_processor.interval = interval_ms / 1000.0

# Caption area: while the stream is playing, re-render the newest caption and
# then sleep 0.1 s, over and over; otherwise show a hint on how to start.
# NOTE(review): the placeholder is rewritten on every pass even when the text
# is unchanged — confirm nothing relies on that before throttling it.
placeholder = st.empty()
if not ctx.state.playing:
    st.info("▢️ Click **Start** above to begin streaming")
else:
    placeholder.markdown("**Caption:** _Waiting for first inference…_")
    while ctx.state.playing:
        processor = ctx.video_processor
        # No processor yet -> "loading"; processor without a caption -> "thinking".
        shown = "_…loading…_" if processor is None else (processor.caption or "_…thinking…_")
        placeholder.markdown(f"**Caption:** {shown}")
        time.sleep(0.1)