File size: 3,934 Bytes
ca97f63
 
 
 
 
dd0d47d
ca97f63
dd0d47d
ca97f63
 
dd0d47d
 
 
ca97f63
 
 
dd0d47d
 
 
ca97f63
 
 
dd0d47d
ca97f63
dd0d47d
 
 
 
 
ca97f63
 
dd0d47d
 
 
 
 
ca97f63
dd0d47d
ca97f63
 
 
 
 
 
dd0d47d
ca97f63
dd0d47d
ca97f63
dd0d47d
 
 
 
 
 
 
 
 
ca97f63
dd0d47d
ca97f63
dd0d47d
ca97f63
dd0d47d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import os
import sys
import time
import socket
import atexit
import subprocess
import shutil
from pathlib import Path

import streamlit as st
import cv2
from PIL import Image
import base64
import requests
from huggingface_hub import hf_hub_download

# --- Configuration (reuse from main.py) ---
# Port and base URL of the local llama.cpp OpenAI-compatible server.
PORT = 8000
BASE_URL = f"http://localhost:{PORT}/v1"
# Alias the local model is served under, so OpenAI-style clients can address it.
MODEL_ALIAS = "gpt-4-vision-preview"
# Hugging Face repo plus filenames for the SmolVLM2 GGUF weights and vision projector.
REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
PROJ_FILE  = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"

# Download model files if missing

def download_if_missing(repo_id: str, filename: str) -> None:
    """Ensure *filename* exists in the working directory.

    If the file is already present this is a no-op; otherwise it is fetched
    from the Hugging Face Hub cache and copied next to the script.
    """
    if os.path.isfile(filename):
        return
    cached_path = hf_hub_download(repo_id=repo_id, filename=filename)
    shutil.copy(cached_path, filename)

# Ensure models on startup

def ensure_models() -> None:
    """Download both GGUF files (weights + vision projector) if missing.

    Fixes the previous lambda-assigned-to-a-name idiom (PEP 8 E731) that
    abused a list literal purely for its side effects.
    """
    download_if_missing(REPO_ID, MODEL_FILE)
    download_if_missing(REPO_ID, PROJ_FILE)

ensure_models()

# Start local server for captioning

def start_server() -> subprocess.Popen:
    """Launch the llama.cpp OpenAI-compatible server as a child process.

    Registers termination via ``atexit``, then polls the TCP port until the
    server accepts connections (up to ~10 s). Returns the live process.

    Raises:
        RuntimeError: if the port never becomes reachable.
    """
    args = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS
    ]
    server = subprocess.Popen(args)
    atexit.register(server.terminate)
    # Poll readiness: 40 attempts x 0.25 s sleep (plus a 1 s connect timeout each).
    attempts_left = 40
    while attempts_left > 0:
        attempts_left -= 1
        try:
            conn = socket.create_connection(("localhost", PORT), timeout=1)
        except OSError:
            time.sleep(0.25)
            continue
        conn.close()
        return server
    server.terminate()
    raise RuntimeError(f"Server failed to start on port {PORT}.")

server_proc = start_server()

# Send image to caption API

def caption_image_file(path: str) -> str:
    """Send the JPEG at *path* to the local chat-completions API and return the caption.

    Args:
        path: Filesystem path to a JPEG image.

    Returns:
        The model's caption text (first choice's message content).

    Raises:
        requests.HTTPError: if the server responds with an error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    # Path.read_bytes() closes the file promptly; the original open(...).read()
    # leaked the file handle.
    b64 = base64.b64encode(Path(path).read_bytes()).decode()
    uri = f"data:image/jpeg;base64,{b64}"
    payload = {
        "model": MODEL_ALIAS,
        "messages": [
            {"role": "system", "content": (
                "You are a precise image-captioning assistant. "
                "Identify the main subject, their clothing, posture, and environment."
            )},
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": uri}},
                {"type": "text",      "text": "Caption this image in one detailed sentence."}
            ]}
        ],
        "temperature": 0.1,
        "max_tokens": 100
    }
    # A finite timeout keeps the UI from hanging forever if the server dies;
    # vision inference on CPU can be slow, hence the generous budget.
    resp = requests.post(BASE_URL + "/chat/completions", json=payload, timeout=120)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]

# Helper to handle PIL image

def run_caption(pil_img: Image.Image) -> str:
    """Save *pil_img* to a temporary JPEG and return the model's caption for it.

    The original wrote a fixed ``frame.jpg`` into the CWD, which clobbers any
    existing file, races across concurrent sessions, and is never cleaned up.
    A named temp file fixes all three; ``delete=False`` + manual removal keeps
    it compatible with Windows (where an open temp file cannot be reopened).
    """
    import tempfile  # local import: only needed here, keeps file-level deps unchanged
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
        tmp_path = tmp.name
    try:
        pil_img.save(tmp_path)
        return caption_image_file(tmp_path)
    finally:
        os.remove(tmp_path)

# --- Streamlit UI ---
st.set_page_config(page_title="Real-Time Camera Captioning", layout="wide")
st.title("🎥 Real-Time Camera Captioning")

# Sidebar controls: caption cadence plus start/stop buttons.
interval = st.sidebar.slider("Interval between captions (seconds)", min_value=1, max_value=10, value=3)
start = st.sidebar.button("Start")
stop = st.sidebar.button("Stop")

# Persist the run flag across Streamlit reruns.
if 'running' not in st.session_state:
    st.session_state.running = False

if start:
    st.session_state.running = True
if stop:
    st.session_state.running = False

# Placeholders so the frame and caption update in place instead of appending.
frame_placeholder = st.empty()
caption_placeholder = st.empty()

# OpenCV camera (device 0). Fail fast if it cannot be opened — the original
# looped on cap.read() errors without ever checking isOpened().
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    st.error("Unable to open camera device 0.")
    st.session_state.running = False

try:
    while st.session_state.running:
        ret, frame = cap.read()
        if not ret:
            st.error("Unable to read from camera.")
            break
        # OpenCV delivers BGR; PIL/Streamlit expect RGB.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img = Image.fromarray(rgb)

        # Show frame
        frame_placeholder.image(img, caption="Live Feed", use_container_width=True)

        # Generate and show caption
        with st.spinner("Generating caption..."):
            caption = run_caption(img)
        caption_placeholder.markdown(f"**Caption:** {caption}")

        time.sleep(interval)
finally:
    # Always release the camera, even if captioning or rendering raises;
    # the original skipped cap.release() on any exception in the loop.
    cap.release()