# app.py
import streamlit as st
st.set_page_config(layout="wide")
import av
import cv2
import time
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration
from llama_cpp import Llama
from llama_cpp.llama_chat_format import LlamaChatCompletionHandlerRegistry, Llava15ChatHandler
# ─────────────────────────────────────────
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )

# Overwrite any previous registration
LlamaChatCompletionHandlerRegistry().register_chat_completion_handler(
    "smolvlm2", SmolVLM2ChatHandler, overwrite=True
)
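# Note: the app passes a handler *instance* directly to Llama() below, so the
# registry entry is not strictly required; overwrite=True mainly keeps the
# registration from failing when Streamlit re-executes this script on reruns.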
# ─────────────────────────────────────────
# 2) Model & CLIP files: download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
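# The language-model weights and the CLIP/mmproj projector come from two
# separate community GGUF repos; both files are required for image input.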
def ensure_models():
    if not os.path.exists(MODEL_FILE):
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
    if not os.path.exists(CLIP_FILE):
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)

ensure_models()
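# hf_hub_download stores files in the local Hugging Face cache; the symlinks
# above expose them under the short filenames used here without copying the
# large GGUF files.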
@st.cache_resource
def load_llm():
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    return Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )

llm = load_llm()
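# st.cache_resource keeps a single Llama instance alive across Streamlit
# reruns, so the weights are loaded from disk only once per server process.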
# ─────────────────────────────────────────
# 3) Helper to run a single frame through the model (with debug)
def caption_frame(frame):
    # Write the frame to a temporary JPEG so the chat handler can load it by URI
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
        cv2.imwrite(f.name, frame)
        uri = Path(f.name).absolute().as_uri()
    messages = [
        {
            "role": "system",
            "content": (
                "Focus only on describing the key dramatic action or notable event occurring "
                "in this image. Skip general context or scene-setting details unless they are "
                "crucial to understanding the main action."
            ),
        },
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": uri}},
                {"type": "text", "text": "What is happening in this image?"},
            ],
        },
    ]
    print("DEBUG ▶ caption_frame: invoking LLM")
    try:
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=128,
            temperature=0.1,
            repeat_penalty=1.1,  # discourage exact token repeats
            stop=["<end_of_utterance>"],
        )
    finally:
        os.unlink(f.name)  # remove the temp image even if inference fails
    out = (resp["choices"][0].get("message", {}).get("content") or "").strip()
    print(f"DEBUG ▶ LLM returned: {out!r}")
    return out
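# Hypothetical smoke test (run from a plain Python shell, not under Streamlit):
#   import cv2
#   print(caption_frame(cv2.imread("sample.jpg")))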
# ─────────────────────────────────────────
# 4) Streamlit UI + WebRTC configuration
st.title("🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
interval_ms = st.slider(
    "Caption every N ms", min_value=100, max_value=10000, value=1000, step=100
)
RTC_CONFIG = RTCConfiguration({
    "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
})
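# A public STUN server is enough for most NAT setups; deployments behind
# stricter NATs may also need a TURN server entry in "iceServers".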
class CaptionProcessor(VideoProcessorBase):
    def __init__(self):
        self.interval = 1.0
        self.last_time = time.time()
        self.caption = ""

    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
        img = frame.to_ndarray(format="bgr24")
        now = time.time()
        if now - self.last_time >= self.interval:
            self.last_time = now
            print("DEBUG ▶ CaptionProcessor.recv: time reached, generating caption")
            self.caption = caption_frame(img)
        return av.VideoFrame.from_ndarray(img, format="bgr24")
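# recv() runs on streamlit-webrtc's worker thread, so the blocking
# caption_frame() call stalls the video for the duration of inference;
# offloading it to a background thread would keep playback smooth, at the
# cost of extra synchronization (kept simple here on purpose).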
ctx = webrtc_streamer(
    key="smolvlm2-captioner",
    video_processor_factory=CaptionProcessor,
    rtc_configuration=RTC_CONFIG,
    media_stream_constraints={"video": True, "audio": False},
)
# Update the processor interval
if ctx.video_processor:
    ctx.video_processor.interval = interval_ms / 1000.0
# Placeholder for showing captions
placeholder = st.empty()
if ctx.state.playing:
    placeholder.markdown("**Caption:** _Waiting for first inference…_")
    while ctx.state.playing:
        vp = ctx.video_processor
        if vp is not None:
            txt = vp.caption or "_…thinking…_"
        else:
            txt = "_…loading…_"
        placeholder.markdown(f"**Caption:** {txt}")
        time.sleep(0.1)
else:
    st.info("▶️ Click **Start** above to begin streaming")
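# The polling loop above keeps this script run alive while the stream plays,
# refreshing the caption text roughly ten times per second.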