Spaces:
Running
Running
File size: 6,904 Bytes
221e4b6 ca97f63 221e4b6 dd0d47d 221e4b6 ca97f63 221e4b6 ca97f63 dd0d47d 221e4b6 dd0d47d 221e4b6 636baf9 221e4b6 2529cb3 221e4b6 292fb3c 221e4b6 292fb3c 221e4b6 292fb3c 221e4b6 292fb3c 221e4b6 7b7ed26 221e4b6 7b7ed26 221e4b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 |
# app.py
import streamlit as st
st.set_page_config(layout="wide")
import av
import cv2
import time
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration
from llama_cpp import Llama
from llama_cpp.llama_chat_format import LlamaChatCompletionHandlerRegistry, Llava15ChatHandler
from termcolor import cprint
# ─────────────────────────────────────────
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    """Llava-style chat handler that uses a SmolVLM2 prompt template.

    Only ``CHAT_FORMAT`` is overridden; everything else comes from
    ``Llava15ChatHandler``. The template is Jinja and renders each message as
    ``<Role>: <content><end_of_utterance>``. A message's content may be either
    a plain string or a list of ``text`` / ``image_url`` parts.
    """

    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        # Plain-string content (e.g. a system prompt) must be emitted as-is.
        # Looping over it like a list of parts would walk it character by
        # character and output nothing.
        "{% if message['content'] is string %}"
        ": {{ message['content'] }}"
        "{% else %}"
        # A user turn that starts with an image gets no space after the colon.
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        # The image reference may be a bare URL string or a {"url": ...} mapping.
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
# Register the handler under the "smolvlm2" name; overwrite=True replaces any
# handler already registered under that name.
_handler_registry = LlamaChatCompletionHandlerRegistry()
_handler_registry.register_chat_completion_handler(
    "smolvlm2",
    SmolVLM2ChatHandler,
    overwrite=True,
)
# ─────────────────────────────────────────
# 2) Model & CLIP files — download if missing
# Language-model weights: Hub repository and the GGUF file taken from it.
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"

# Projector weights passed to the chat handler as ``clip_model_path``:
# Hub repository and the GGUF file taken from it.
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
def ensure_models():
    """Make sure the model and projector files exist in the working directory.

    Every file that is missing locally is fetched with ``hf_hub_download`` and
    exposed under its bare file name through a symlink to the path that call
    returns.

    Raises:
        OSError: If a symlink cannot be created.
    """
    for repo_id, filename in ((MODEL_REPO, MODEL_FILE), (CLIP_REPO, CLIP_FILE)):
        if os.path.exists(filename):
            continue
        downloaded_path = hf_hub_download(repo_id=repo_id, filename=filename)
        # os.path.exists() is False for a dangling symlink, but os.symlink()
        # would still fail with FileExistsError; drop the stale link first.
        if os.path.islink(filename):
            os.remove(filename)
        os.symlink(downloaded_path, filename)


# Fetch anything missing before the model is loaded.
ensure_models()
@st.cache_resource
def load_llm():
    """Create the ``Llama`` instance used for captioning.

    The model is loaded from ``MODEL_FILE`` and paired with a
    ``SmolVLM2ChatHandler`` built from ``CLIP_FILE``. It is decorated with
    ``st.cache_resource`` so Streamlit can reuse the loaded model instead of
    rebuilding it each time the script runs.

    Returns:
        The configured ``Llama`` object.
    """
    vision_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    llama_options = dict(
        model_path=MODEL_FILE,
        chat_handler=vision_handler,
        n_ctx=8192,
        verbose=False,
    )
    return Llama(**llama_options)


# Module-level model handle used by caption_frame().
llm = load_llm()
# ─────────────────────────────────────────
# 3) Helper to run a single frame through the model (with debug)
def caption_frame(frame):
    """Ask the model to describe a single video frame.

    The frame is written to a temporary JPEG so it can be passed to the chat
    handler as a ``file://`` URI. The file is deleted again once the model
    call has returned or failed, so repeated calls do not pile up temp files.

    Args:
        frame: Image array accepted by ``cv2.imwrite``. The caller passes a
            BGR array obtained from ``av.VideoFrame.to_ndarray``.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``""`` when
        the response carries no message content.

    Raises:
        RuntimeError: If the frame could not be written to the temporary file.
    """
    # Reserve a unique path and close our handle immediately so OpenCV is the
    # only writer of the file.
    fd, tmp_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        if not cv2.imwrite(tmp_path, frame):
            raise RuntimeError(f"could not write frame to {tmp_path}")
        uri = Path(tmp_path).absolute().as_uri()

        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": uri}},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]

        print("DEBUG ▶ caption_frame: invoking LLM")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=128,
            temperature=0.1,
            repeat_penalty=1.1,  # discourage exact token repeats
            stop=["<end_of_utterance>"],
        )
    finally:
        # The image is only needed during the call above.
        try:
            os.unlink(tmp_path)
        except FileNotFoundError:
            pass  # already gone; nothing left to clean up

    # "content" may be absent or None; treat both as an empty caption.
    out = (resp["choices"][0].get("message", {}).get("content") or "").strip()
    print(f"DEBUG ▶ LLM returned: {out!r}")
    return out
# ─────────────────────────────────────────
# 4) Streamlit UI + WebRTC configuration
# Page title and the control for how often a frame is sent to the model.
st.title("🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
interval_ms = st.slider(
    "Caption every N ms",
    min_value=100,
    max_value=10000,
    value=3000,
    step=100,
)

# ICE configuration for the WebRTC connection: a single Google STUN server.
RTC_CONFIG = RTCConfiguration(
    {"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]}
)
import concurrent.futures
class CaptionProcessor(VideoProcessorBase):
    """Overlay the most recent model caption on every incoming video frame.

    ``recv`` never waits for the model. Frames are handed to ``caption_frame``
    on a one-worker thread pool with at most one request outstanding, and the
    latest completed caption is drawn on each frame in the meantime.
    """

    # Overlay text shown until the first caption arrives. It is ASCII-only
    # because cv2.putText draws characters its Hershey fonts lack as "?".
    PLACEHOLDER = "...thinking..."

    def __init__(self):
        # Minimum seconds between caption requests; the UI script overwrites
        # this with the slider value.
        self.interval = 1.0
        # Time of the last scheduling attempt.
        self.last_time = time.time()
        # Latest caption (or "[error: ...]" text); also read by the UI loop.
        self.caption = ""
        # A single worker, so inferences never overlap.
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        # The outstanding inference, or None when nothing is pending.
        self.future = None

    def _overlay_text(self) -> str:
        """Return the caption as one line, or the placeholder if it is empty."""
        # cv2.putText has no line-break handling, so collapse all whitespace.
        return " ".join(self.caption.split()) or self.PLACEHOLDER

    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
        """Annotate one frame with the current caption.

        Args:
            frame: The incoming video frame.

        Returns:
            A new ``bgr24`` frame with the caption drawn near the bottom-left
            corner.
        """
        img = frame.to_ndarray(format="bgr24")
        now = time.time()

        # 1) Collect a finished result first. Doing this before scheduling
        #    means a completed caption is never overwritten by a new request.
        if self.future is not None and self.future.done():
            try:
                self.caption = self.future.result()
            except Exception as e:
                # Show the failure on screen instead of breaking the stream.
                self.caption = f"[error: {e}]"
            self.future = None

        # 2) Schedule a new inference once the interval has passed, unless
        #    one is still pending.
        if now - self.last_time >= self.interval:
            self.last_time = now
            if self.future is None:
                # Copy the frame so the drawing below cannot alter the image
                # the worker is reading.
                self.future = self.executor.submit(caption_frame, img.copy())

        # 3) Draw the last caption onto every frame immediately.
        cv2.putText(
            img,
            self._overlay_text(),
            org=(10, img.shape[0] - 20),
            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
            fontScale=0.6,
            color=(255, 255, 255),
            thickness=2,
            lineType=cv2.LINE_AA,
        )
        return av.VideoFrame.from_ndarray(img, format="bgr24")
# Start the WebRTC component; each stream gets its own CaptionProcessor.
ctx = webrtc_streamer(
    key="smolvlm2-captioner",
    video_processor_factory=CaptionProcessor,
    rtc_configuration=RTC_CONFIG,
    media_stream_constraints={"video": True, "audio": False},
)

# Apply the slider value if the processor already exists on this run.
if ctx.video_processor:
    ctx.video_processor.interval = interval_ms / 1000.0

# Placeholder for showing captions
placeholder = st.empty()
if ctx.state.playing:
    placeholder.markdown("**Caption:** _Waiting for first inference…_")
    shown = None  # last text rendered, so identical updates are skipped
    while ctx.state.playing:
        vp = ctx.video_processor
        if vp is not None:
            # The processor can still be None at the check above, so keep the
            # interval in sync here; otherwise it would stay at its default.
            vp.interval = interval_ms / 1000.0
            txt = vp.caption or "_…thinking…_"
        else:
            txt = "_…loading…_"
        if txt != shown:
            placeholder.markdown(f"**Caption:** {txt}")
            shown = txt
        time.sleep(0.1)
else:
    st.info("▶️ Click **Start** above to begin streaming")
|