# NOTE: removed non-code artifacts (web-page header, file-size line, commit
# hash, and a line-number gutter) accidentally pasted in from the hosting page.
# app.py
import torch; torch.classes.__path__ = [] # Neutralizes the path inspection

# Standard library
import atexit
import base64
import os
import shutil
import socket
import subprocess
import sys
import threading
import time

# Third-party
import cv2
import requests
import streamlit as st
from huggingface_hub import hf_hub_download
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase
# ── Configuration ──────────────────────────────────────────────────────────────
PORT = 8000  # local port the llama-cpp-python server listens on
BASE_URL = f"http://localhost:{PORT}/v1"  # OpenAI-compatible REST root used by recv()
MODEL_ALIAS = "gpt-4-vision-preview"  # alias registered with the server and sent in payloads
REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"  # Hugging Face Hub repo to pull from
MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"  # quantized language-model weights
PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"  # multimodal projector (CLIP) file
# ── Helpers to download & launch server ─────────────────────────────────────────
def download_if_missing(repo_id: str, filename: str):
    """Ensure *filename* exists in the working directory.

    If the file is already present this is a no-op; otherwise it is fetched
    from the Hugging Face Hub cache and copied next to the script.
    """
    if os.path.exists(filename):
        return
    cached_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
    shutil.copy(cached_path, filename)
def ensure_models():
    """Fetch the SmolVLM weights and multimodal projector if either is absent."""
    for required in (MODEL_FILE, PROJ_FILE):
        download_if_missing(REPO_ID, required)
def start_server():
    """Launch llama-cpp-python's OpenAI-compatible server as a child process.

    Blocks while scanning the child's combined stdout/stderr until the
    startup-complete line (emitted by uvicorn) appears, then returns the
    process handle. The child is registered with atexit so it is terminated
    when this process exits.

    Returns:
        subprocess.Popen: handle to the running server process.

    Raises:
        RuntimeError: if the child exits before finishing startup (e.g. the
            port is already bound). The dead child is reaped before raising.
    """
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr so startup errors are visible
        text=True,  # so line buffering works
        bufsize=1,
    )
    atexit.register(proc.terminate)
    for line in proc.stdout:
        if "Application startup complete." in line:
            return proc
    # stdout hit EOF without the ready line: the child died during startup.
    # Terminate defensively and wait() so we don't leave a zombie behind.
    proc.terminate()
    exit_code = proc.wait()
    raise RuntimeError(
        f"Server failed to start on port {PORT} (exit code {exit_code})"
    )
# ── Boot llama-cpp-python server ────────────────────────────────────────────────
@st.cache_resource(show_spinner=False)
def _boot_server():
    """Download the models and start the llama.cpp server exactly once.

    Streamlit re-executes this whole script on every user interaction;
    without st.cache_resource each rerun would spawn a second server that
    fails to bind the already-used port and crash the app. The cached
    resource (the Popen handle) survives reruns. show_spinner=False keeps
    the cache from rendering an element before st.set_page_config below.
    """
    ensure_models()
    return start_server()

_server_proc = _boot_server()
# ── Streamlit UI ────────────────────────────────────────────────────────────────
# Page chrome: config must be the first Streamlit command, then title,
# instructions, and the sidebar slider controlling the captioning rate.
st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
st.title("📸 Live Camera Captioning with SmolVLM")  # emoji repaired (was mojibake "πΈ")
st.markdown(
    """
    Use the **slider** below to choose how often (in milliseconds) to
    send a frame to SmolVLM for captioning. The latest caption will
    be overlaid on your video feed.
    """
)
# Milliseconds between caption requests: min 100, max 5000, default 3000.
interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)
# ── Video processor ─────────────────────────────────────────────────────────────
class CaptionProcessor(VideoProcessorBase):
    """WebRTC video processor that overlays a model-generated caption.

    At most one frame per `interval_ms` is JPEG-encoded and POSTed to the
    local llama.cpp server. The request runs on a daemon thread so `recv`
    never blocks the video pipeline (the original inline `requests.post`
    could stall every frame for up to 10 s per caption). A pending flag
    prevents overlapping requests; a lock guards cross-thread state.
    """

    def __init__(self, interval_ms: int):
        self.interval = interval_ms / 1000.0  # seconds between caption requests
        self.last_time = 0.0  # timestamp of the last request we kicked off
        self.caption = "Waiting for caption..."
        self.font = cv2.FONT_HERSHEY_SIMPLEX
        self._lock = threading.Lock()  # guards caption / _pending / last_time
        self._pending = False  # True while a caption request is in flight

    def _request_caption(self, b64: str) -> None:
        """Worker-thread body: POST one base64 JPEG frame, store the caption."""
        payload = {
            "model": MODEL_ALIAS,
            "messages": [
                {
                    "role": "system",
                    "content": (
                        # hyphen repaired (was mojibake "imageβcaptioning")
                        "You are a precise image-captioning assistant. "
                        "Identify the main subject, their clothing, posture, and environment."
                    ),
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                        {"type": "text", "text": "Caption this image in one detailed sentence."},
                    ],
                },
            ],
            "temperature": 0.1,
            "max_tokens": 100,
        }
        try:
            r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
            r.raise_for_status()
            caption = r.json()["choices"][0]["message"]["content"].strip()
        except Exception as e:  # best-effort: surface the error text in the overlay
            caption = f"[Error] {e}"
        with self._lock:
            self.caption = caption
            self._pending = False

    def recv(self, frame):
        """Per-frame callback: maybe kick off a caption request, draw the overlay."""
        img = frame.to_ndarray(format="bgr24")
        now = time.time()
        with self._lock:
            due = (now - self.last_time >= self.interval) and not self._pending
            if due:
                self.last_time = now
                self._pending = True
        if due:
            success, buf = cv2.imencode(".jpg", img)
            if success:
                b64 = base64.b64encode(buf).decode("utf-8")
                threading.Thread(
                    target=self._request_caption, args=(b64,), daemon=True
                ).start()
            else:
                with self._lock:  # encoding failed — let the next frame retry
                    self._pending = False
        # Overlay the most recent caption near the bottom-left corner.
        with self._lock:
            caption = self.caption
        y = img.shape[0] - 20
        cv2.putText(img, caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
        return frame.from_ndarray(img, format="bgr24")
# Start the WebRTC camera stream (video only, no audio). Keying the component
# on the slider value recreates the processor — and the WebRTC session —
# whenever the interval changes, so the new rate takes effect immediately.
webrtc_streamer(
key=f"caption_{interval_ms}",
video_processor_factory=lambda: CaptionProcessor(interval_ms),
media_stream_constraints={"video": True, "audio": False},
)