# app.py
import torch; torch.classes.__path__ = []  # Work around Streamlit's file watcher crashing on torch.classes path inspection
import os
import sys
import time
import socket
import subprocess
import threading
import atexit
import base64
import shutil

import av
import cv2
import streamlit as st
import requests
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase
from huggingface_hub import hf_hub_download

# ── Configuration ──────────────────────────────────────────────────────────────
PORT        = 8000
BASE_URL    = f"http://localhost:{PORT}/v1"
MODEL_ALIAS = "gpt-4-vision-preview"
REPO_ID     = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
MODEL_FILE  = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
PROJ_FILE   = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"

# ── Helpers to download & launch server ─────────────────────────────────────────
def download_if_missing(repo_id: str, filename: str):
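    """Fetch `filename` from the Hugging Face cache and copy it into the
    working directory so the server can be given a plain relative path."""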
    if not os.path.exists(filename):
        cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
        shutil.copy(cached, filename)

def ensure_models():
    download_if_missing(REPO_ID, MODEL_FILE)
    download_if_missing(REPO_ID, PROJ_FILE)

def start_server():
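    """Launch llama_cpp.server as a subprocess and wait until it reports
    that startup is complete."""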
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,    # so line buffering works
        bufsize=1,
    )
    atexit.register(proc.terminate)

    # Wait for the readiness log line; if the process dies first, stdout
    # reaches EOF and we fall through to the error below.
    for line in proc.stdout:
        if "Application startup complete." in line:
            # Keep draining stdout in a daemon thread so the OS pipe buffer
            # never fills up and stalls the server process.
            threading.Thread(target=proc.stdout.read, daemon=True).start()
            return proc

    raise RuntimeError(f"Server failed to start on port {PORT}")

# ── Boot llama-cpp-python server ────────────────────────────────────────────────
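def port_in_use(port: int) -> bool:
    # Minimal readiness probe; assumes the caption server is the only thing
    # that would be listening on this localhost port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("localhost", port)) == 0
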
# Streamlit re-runs this script on every interaction; only launch the server
# if no instance is already listening on the port.
ensure_models()
if not port_in_use(PORT):
    _server_proc = start_server()

# ── Streamlit UI ───────────────────────────────────────────────────────────────
st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
st.title("πŸ“Έ Live Camera Captioning with SmolVLM")
st.markdown(
    """
Use the **slider** in the sidebar to choose how often (in milliseconds)
to send a frame to SmolVLM for captioning. The latest caption will
be overlaid on your video feed.
"""
)
interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)

# ── Video processor ────────────────────────────────────────────────────────────
class CaptionProcessor(VideoProcessorBase):
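    """Overlay the most recent caption on every frame.

    At most once per interval, the current frame is JPEG-encoded and sent
    to the local OpenAI-compatible endpoint for a fresh caption.
    """
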
    def __init__(self, interval_ms: int):
        self.interval   = interval_ms / 1000.0
        self.last_time  = 0.0
        self.caption    = "Waiting for caption..."
        self.font       = cv2.FONT_HERSHEY_SIMPLEX

    def recv(self, frame):
        img = frame.to_ndarray(format="bgr24")
        now = time.time()
        if now - self.last_time >= self.interval:
            self.last_time = now

            # JPEG + base64 encode
            success, buf = cv2.imencode(".jpg", img)
            if success:
                b64 = base64.b64encode(buf).decode("utf-8")
                payload = {
                    "model": MODEL_ALIAS,
                    "messages": [
                        {
                            "role": "system",
                            "content": (
                                "You are a precise image‐captioning assistant. "
                                "Identify the main subject, their clothing, posture, and environment."
                            ),
                        },
                        {
                            "role": "user",
                            "content": [
                                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                                {"type": "text", "text": "Caption this image in one detailed sentence."},
                            ],
                        },
                    ],
                    "temperature": 0.1,
                    "max_tokens": 100,
                }

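                # NOTE: this POST blocks the per-frame callback, so video may
                # stall for up to the request timeout while a caption is
                # generated; a background thread would keep playback smooth.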
                try:
                    r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
                    r.raise_for_status()
                    self.caption = r.json()["choices"][0]["message"]["content"].strip()
                except Exception as e:
                    self.caption = f"[Error] {e}"

        # overlay caption
        y = img.shape[0] - 20
        cv2.putText(img, self.caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
        return av.VideoFrame.from_ndarray(img, format="bgr24")

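# Embedding the interval in the key forces a fresh streamer (and a new
# CaptionProcessor) whenever the slider value changes.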
webrtc_streamer(
    key=f"caption_{interval_ms}",
    video_processor_factory=lambda: CaptionProcessor(interval_ms),
    media_stream_constraints={"video": True, "audio": False},
)
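
# To run locally (package names are the usual PyPI ones; adjust as needed):
#   pip install streamlit streamlit-webrtc "llama-cpp-python[server]" \
#       opencv-python huggingface-hub requests torch av
#   streamlit run app.py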