# (Hugging Face Spaces page residue removed: "Spaces: / Running / Running")
# app.py | |
import torch; torch.classes.__path__ = [] # Neutralizes the path inspection | |
import os | |
import sys | |
import time | |
import socket | |
import subprocess | |
import atexit | |
import base64 | |
import shutil | |
import cv2 | |
import streamlit as st | |
import requests | |
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase | |
from huggingface_hub import hf_hub_download | |
# ββ Configuration ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
PORT = 8000 | |
BASE_URL = f"http://localhost:{PORT}/v1" | |
MODEL_ALIAS = "gpt-4-vision-preview" | |
REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF" | |
MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf" | |
PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf" | |
# ββ Helpers to download & launch server βββββββββββββββββββββββββββββββββββββββββ | |
def download_if_missing(repo_id: str, filename: str) -> None:
    """Fetch *filename* from the Hugging Face Hub unless it already exists.

    The file is resolved into the local HF cache via ``hf_hub_download`` and
    then copied next to the app under its plain *filename*, so the server can
    be pointed at a stable relative path. No-op when the file is present.
    """
    if not os.path.exists(filename):
        cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
        shutil.copy(cached, filename)
def ensure_models() -> None:
    """Ensure both the SmolVLM2 weights and the CLIP projector are on disk."""
    download_if_missing(REPO_ID, MODEL_FILE)
    download_if_missing(REPO_ID, PROJ_FILE)
def start_server():
    """Launch llama-cpp-python's OpenAI-compatible server and wait for readiness.

    Streams the child's combined stdout/stderr until uvicorn's
    "Application startup complete." banner appears, then returns the Popen
    handle. If the process exits before the banner is seen, the last lines of
    its output are included in the raised RuntimeError for diagnostics.
    """
    from collections import deque  # local import: only needed on the startup path

    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr so the banner scan sees everything
        text=True,                 # decode to str so line iteration works
        bufsize=1,                 # line-buffered
    )
    atexit.register(proc.terminate)  # don't leave an orphaned server behind

    tail = deque(maxlen=20)  # recent output, reported if startup fails
    for line in proc.stdout:
        tail.append(line)
        if "Application startup complete." in line:
            return proc
    raise RuntimeError(
        f"Server failed to start on port {PORT}:\n{''.join(tail)}"
    )
# ── Boot llama-cpp-python server ───────────────────────────────────────────────
ensure_models()
_server_proc = start_server()
# ── Streamlit UI ───────────────────────────────────────────────────────────────
st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
st.title("📸 Live Camera Captioning with SmolVLM")
st.markdown(
    """
    Use the **slider** below to choose how often (in milliseconds) to
    send a frame to SmolVLM for captioning. The latest caption will
    be overlaid on your video feed.
    """
)
# Caption refresh period in milliseconds (100 ms .. 5 s, default 3 s).
interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)
# ── Video processor ────────────────────────────────────────────────────────────
class CaptionProcessor(VideoProcessorBase):
    """Per-frame callback for streamlit-webrtc.

    At most once every *interval_ms* milliseconds the current frame is
    JPEG-encoded, base64-embedded into an OpenAI-style chat request, and sent
    to the local SmolVLM server. The latest caption (or error text) is drawn
    onto every outgoing frame.
    """

    def __init__(self, interval_ms: int):
        self.interval = interval_ms / 1000.0  # caption period, seconds
        self.last_time = 0.0                  # epoch seconds of last request
        self.caption = "Waiting for caption..."
        self.font = cv2.FONT_HERSHEY_SIMPLEX

    def _request_caption(self, img) -> None:
        """Send one BGR frame to the server and update ``self.caption``.

        Best-effort: JPEG-encoding failures keep the previous caption, and
        network/HTTP/JSON errors are surfaced in the caption text instead of
        crashing the video stream.
        """
        success, buf = cv2.imencode(".jpg", img)
        if not success:
            return  # keep the previous caption if JPEG encoding fails
        b64 = base64.b64encode(buf).decode("utf-8")
        payload = {
            "model": MODEL_ALIAS,
            "messages": [
                {
                    "role": "system",
                    "content": (
                        "You are a precise image-captioning assistant. "
                        "Identify the main subject, their clothing, posture, and environment."
                    ),
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                        {"type": "text", "text": "Caption this image in one detailed sentence."},
                    ],
                },
            ],
            "temperature": 0.1,
            "max_tokens": 100,
        }
        try:
            r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
            r.raise_for_status()
            self.caption = r.json()["choices"][0]["message"]["content"].strip()
        except Exception as e:  # show the failure on-screen, don't kill the stream
            self.caption = f"[Error] {e}"

    def recv(self, frame):
        """Receive one frame, refresh the caption if due, and overlay it."""
        img = frame.to_ndarray(format="bgr24")
        now = time.time()
        if now - self.last_time >= self.interval:
            self.last_time = now
            self._request_caption(img)
        # Overlay the latest caption near the bottom-left corner.
        y = img.shape[0] - 20
        cv2.putText(img, self.caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
        return frame.from_ndarray(img, format="bgr24")
webrtc_streamer(
    key=f"caption_{interval_ms}",  # new key per interval so the processor is rebuilt
    video_processor_factory=lambda: CaptionProcessor(interval_ms),
    media_stream_constraints={"video": True, "audio": False},
)