# app.py
import torch; torch.classes.__path__ = []  # Work around Streamlit's module watcher crashing on torch.classes path inspection
import os
import sys
import time
import socket
import subprocess
import atexit
import base64
import shutil
import av
import cv2
import streamlit as st
import requests
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase
from huggingface_hub import hf_hub_download
# ── Configuration ──────────────────────────────────────────────────────────────
PORT = 8000
BASE_URL = f"http://localhost:{PORT}/v1"
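# The alias is arbitrary; the local server answers to whatever --model_alias sets.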
MODEL_ALIAS = "gpt-4-vision-preview"
REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
# ── Helpers to download & launch server ─────────────────────────────────────────
def download_if_missing(repo_id: str, filename: str):
    """Fetch a file from the Hugging Face Hub unless it already exists locally."""
    if not os.path.exists(filename):
        cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
        shutil.copy(cached, filename)

def ensure_models():
    download_if_missing(REPO_ID, MODEL_FILE)
    download_if_missing(REPO_ID, PROJ_FILE)
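# llama-cpp-python ships an OpenAI-compatible HTTP server (`python -m llama_cpp.server`);
# we launch it as a child process and poll its log output for readiness.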
def start_server():
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,  # so line buffering works
        bufsize=1,
    )
    atexit.register(proc.terminate)
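    # Block until the server reports readiness (stderr is merged into stdout above).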
    for line in proc.stdout:
        if "Application startup complete." in line:
            return proc
    raise RuntimeError(f"Server failed to start on port {PORT}")
# ── Boot llama-cpp-python server ────────────────────────────────────────────────
# Streamlit reruns this script on every widget interaction; probe the port so we
# launch the server only once per process instead of re-binding it on each rerun.
with socket.socket() as _probe:
    if _probe.connect_ex(("localhost", PORT)) != 0:
        ensure_models()
        _server_proc = start_server()
# ── Streamlit UI ───────────────────────────────────────────────────────────────
st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
st.title("πŸ“Έ Live Camera Captioning with SmolVLM")
st.markdown(
    """
    Use the **slider** below to choose how often (in milliseconds) to
    send a frame to SmolVLM for captioning. The latest caption will
    be overlaid on your video feed.
    """
)
interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)
# ── Video processor ────────────────────────────────────────────────────────────
class CaptionProcessor(VideoProcessorBase):
    def __init__(self, interval_ms: int):
        self.interval = interval_ms / 1000.0
        self.last_time = 0.0
        self.caption = "Waiting for caption..."
        self.font = cv2.FONT_HERSHEY_SIMPLEX
    def recv(self, frame):
        img = frame.to_ndarray(format="bgr24")
        now = time.time()
        if now - self.last_time >= self.interval:
            self.last_time = now
            # JPEG + base64 encode
            success, buf = cv2.imencode(".jpg", img)
            if success:
                b64 = base64.b64encode(buf).decode("utf-8")
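                # OpenAI-style chat payload: the frame rides along as a
                # base64 data URL inside an image_url content part.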
                payload = {
                    "model": MODEL_ALIAS,
                    "messages": [
                        {
                            "role": "system",
                            "content": (
                                "You are a precise image-captioning assistant. "
                                "Identify the main subject, their clothing, posture, and environment."
                            ),
                        },
                        {
                            "role": "user",
                            "content": [
                                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                                {"type": "text", "text": "Caption this image in one detailed sentence."},
                            ],
                        },
                    ],
                    "temperature": 0.1,
                    "max_tokens": 100,
                }
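                # Note: this request blocks the frame callback, so keep the
                # timeout short to bound any video stall.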
                try:
                    r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
                    r.raise_for_status()
                    self.caption = r.json()["choices"][0]["message"]["content"].strip()
                except Exception as e:
                    self.caption = f"[Error] {e}"
        # overlay caption
        y = img.shape[0] - 20
        cv2.putText(img, self.caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
        return av.VideoFrame.from_ndarray(img, format="bgr24")
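# Keying the streamer on the interval recreates the processor whenever the
# slider changes, so a new cadence takes effect immediately.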
webrtc_streamer(
    key=f"caption_{interval_ms}",
    video_processor_factory=lambda: CaptionProcessor(interval_ms),
    media_stream_constraints={"video": True, "audio": False},
)