# app.py
import torch; torch.classes.__path__ = []  # Work around Streamlit's file watcher crashing on torch.classes path inspection
import os
import sys
import time
import socket
import subprocess
import threading
import atexit
import base64
import shutil

import av
import cv2
import streamlit as st
import requests
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase
from huggingface_hub import hf_hub_download

# ── Configuration ──────────────────────────────────────────────────────────────
PORT        = 8000
BASE_URL    = f"http://localhost:{PORT}/v1"
MODEL_ALIAS = "gpt-4-vision-preview"
REPO_ID     = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
MODEL_FILE  = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
PROJ_FILE   = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"

# ── Helpers to download & launch server ─────────────────────────────────────────
def download_if_missing(repo_id: str, filename: str):
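    """Fetch `filename` from the Hugging Face cache and copy it into the
    working directory so the server can be given a plain relative path."""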
    if not os.path.exists(filename):
        cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
        shutil.copy(cached, filename)

def ensure_models():
    download_if_missing(REPO_ID, MODEL_FILE)
    download_if_missing(REPO_ID, PROJ_FILE)

def start_server():
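    """Launch llama_cpp.server as a subprocess and wait until it reports
    that startup is complete."""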
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,    # so line buffering works
        bufsize=1,
    )
    atexit.register(proc.terminate)

    # Wait for the readiness log line; if the process dies first, stdout
    # reaches EOF and we fall through to the error below.
    for line in proc.stdout:
        if "Application startup complete." in line:
            # Keep draining stdout in a daemon thread so the OS pipe buffer
            # never fills up and stalls the server process.
            threading.Thread(target=proc.stdout.read, daemon=True).start()
            return proc

    raise RuntimeError(f"Server failed to start on port {PORT}")

# ── Boot llama-cpp-python server ────────────────────────────────────────────────
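def port_in_use(port: int) -> bool:
    # Minimal readiness probe; assumes the caption server is the only thing
    # that would be listening on this localhost port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("localhost", port)) == 0
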
# Streamlit re-runs this script on every interaction; only launch the server
# if no instance is already listening on the port.
ensure_models()
if not port_in_use(PORT):
    _server_proc = start_server()

# ── Streamlit UI ───────────────────────────────────────────────────────────────
st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
st.title("πŸ“Έ Live Camera Captioning with SmolVLM")
st.markdown(
    """
Use the **slider** in the sidebar to choose how often (in milliseconds)
to send a frame to SmolVLM for captioning. The latest caption will
be overlaid on your video feed.
"""
)
interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)

# ── Video processor ────────────────────────────────────────────────────────────
class CaptionProcessor(VideoProcessorBase):
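    """Overlay the most recent caption on every frame.

    At most once per interval, the current frame is JPEG-encoded and sent
    to the local OpenAI-compatible endpoint for a fresh caption.
    """
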
    def __init__(self, interval_ms: int):
        self.interval   = interval_ms / 1000.0
        self.last_time  = 0.0
        self.caption    = "Waiting for caption..."
        self.font       = cv2.FONT_HERSHEY_SIMPLEX

    def recv(self, frame):
        img = frame.to_ndarray(format="bgr24")
        now = time.time()
        if now - self.last_time >= self.interval:
            self.last_time = now

            # JPEG + base64 encode
            success, buf = cv2.imencode(".jpg", img)
            if success:
                b64 = base64.b64encode(buf).decode("utf-8")
                payload = {
                    "model": MODEL_ALIAS,
                    "messages": [
                        {
                            "role": "system",
                            "content": (
                                "You are a precise image‐captioning assistant. "
                                "Identify the main subject, their clothing, posture, and environment."
                            ),
                        },
                        {
                            "role": "user",
                            "content": [
                                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                                {"type": "text", "text": "Caption this image in one detailed sentence."},
                            ],
                        },
                    ],
                    "temperature": 0.1,
                    "max_tokens": 100,
                }

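                # NOTE: this POST blocks the per-frame callback, so video may
                # stall for up to the request timeout while a caption is
                # generated; a background thread would keep playback smooth.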
                try:
                    r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
                    r.raise_for_status()
                    self.caption = r.json()["choices"][0]["message"]["content"].strip()
                except Exception as e:
                    self.caption = f"[Error] {e}"

        # overlay caption
        y = img.shape[0] - 20
        cv2.putText(img, self.caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
        return av.VideoFrame.from_ndarray(img, format="bgr24")

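# Embedding the interval in the key forces a fresh streamer (and a new
# CaptionProcessor) whenever the slider value changes.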
webrtc_streamer(
    key=f"caption_{interval_ms}",
    video_processor_factory=lambda: CaptionProcessor(interval_ms),
    media_stream_constraints={"video": True, "audio": False},
)
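
# To run locally (package names are the usual PyPI ones; adjust as needed):
#   pip install streamlit streamlit-webrtc "llama-cpp-python[server]" \
#       opencv-python huggingface-hub requests torch av
#   streamlit run app.py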