# (Hugging Face Spaces page residue removed: "Spaces: / Running / Running")
# app.py | |
import torch; torch.classes.__path__ = [] # Neutralizes the path inspection | |
import os | |
import sys | |
import time | |
import socket | |
import subprocess | |
import atexit | |
import base64 | |
import shutil | |
import cv2 | |
import streamlit as st | |
import requests | |
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase | |
from huggingface_hub import hf_hub_download | |
# ββ Configuration ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
PORT = 8000 | |
BASE_URL = f"http://localhost:{PORT}/v1" | |
MODEL_ALIAS = "gpt-4-vision-preview" | |
REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF" | |
MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf" | |
PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf" | |
# ββ Helpers to download & launch server βββββββββββββββββββββββββββββββββββββββββ | |
def download_if_missing(repo_id: str, filename: str) -> None:
    """Fetch *filename* from the Hugging Face Hub unless it already exists.

    The file is resolved into the local HF cache via ``hf_hub_download`` and
    then copied next to the app under its plain *filename*, so the server can
    be pointed at a stable relative path. No-op when the file is present.
    """
    if not os.path.exists(filename):
        cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
        shutil.copy(cached, filename)
def ensure_models() -> None:
    """Ensure both the SmolVLM2 weights and the CLIP projector are on disk."""
    download_if_missing(REPO_ID, MODEL_FILE)
    download_if_missing(REPO_ID, PROJ_FILE)
def start_server():
    """Launch llama-cpp-python's OpenAI-compatible server and wait for readiness.

    Streams the child's combined stdout/stderr until uvicorn's
    "Application startup complete." banner appears, then returns the Popen
    handle. If the process exits before the banner is seen, the last lines of
    its output are included in the raised RuntimeError for diagnostics.
    """
    from collections import deque  # local import: only needed on the startup path

    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr so the banner scan sees everything
        text=True,                 # decode to str so line iteration works
        bufsize=1,                 # line-buffered
    )
    atexit.register(proc.terminate)  # don't leave an orphaned server behind

    tail = deque(maxlen=20)  # recent output, reported if startup fails
    for line in proc.stdout:
        tail.append(line)
        if "Application startup complete." in line:
            return proc
    raise RuntimeError(
        f"Server failed to start on port {PORT}:\n{''.join(tail)}"
    )
# ── Boot llama-cpp-python server ───────────────────────────────────────────────
ensure_models()
_server_proc = start_server()
# ── Streamlit UI ───────────────────────────────────────────────────────────────
st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
st.title("📸 Live Camera Captioning with SmolVLM")
st.markdown(
    """
    Use the **slider** below to choose how often (in milliseconds) to
    send a frame to SmolVLM for captioning. The latest caption will
    be overlaid on your video feed.
    """
)
# Caption refresh period in milliseconds (100 ms .. 5 s, default 3 s).
interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)
# ── Video processor ────────────────────────────────────────────────────────────
class CaptionProcessor(VideoProcessorBase):
    """Per-frame callback for streamlit-webrtc.

    At most once every *interval_ms* milliseconds the current frame is
    JPEG-encoded, base64-embedded into an OpenAI-style chat request, and sent
    to the local SmolVLM server. The latest caption (or error text) is drawn
    onto every outgoing frame.
    """

    def __init__(self, interval_ms: int):
        self.interval = interval_ms / 1000.0  # caption period, seconds
        self.last_time = 0.0                  # epoch seconds of last request
        self.caption = "Waiting for caption..."
        self.font = cv2.FONT_HERSHEY_SIMPLEX

    def _request_caption(self, img) -> None:
        """Send one BGR frame to the server and update ``self.caption``.

        Best-effort: JPEG-encoding failures keep the previous caption, and
        network/HTTP/JSON errors are surfaced in the caption text instead of
        crashing the video stream.
        """
        success, buf = cv2.imencode(".jpg", img)
        if not success:
            return  # keep the previous caption if JPEG encoding fails
        b64 = base64.b64encode(buf).decode("utf-8")
        payload = {
            "model": MODEL_ALIAS,
            "messages": [
                {
                    "role": "system",
                    "content": (
                        "You are a precise image-captioning assistant. "
                        "Identify the main subject, their clothing, posture, and environment."
                    ),
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                        {"type": "text", "text": "Caption this image in one detailed sentence."},
                    ],
                },
            ],
            "temperature": 0.1,
            "max_tokens": 100,
        }
        try:
            r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
            r.raise_for_status()
            self.caption = r.json()["choices"][0]["message"]["content"].strip()
        except Exception as e:  # show the failure on-screen, don't kill the stream
            self.caption = f"[Error] {e}"

    def recv(self, frame):
        """Receive one frame, refresh the caption if due, and overlay it."""
        img = frame.to_ndarray(format="bgr24")
        now = time.time()
        if now - self.last_time >= self.interval:
            self.last_time = now
            self._request_caption(img)
        # Overlay the latest caption near the bottom-left corner.
        y = img.shape[0] - 20
        cv2.putText(img, self.caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
        return frame.from_ndarray(img, format="bgr24")
webrtc_streamer(
    key=f"caption_{interval_ms}",  # new key per interval so the processor is rebuilt
    video_processor_factory=lambda: CaptionProcessor(interval_ms),
    media_stream_constraints={"video": True, "audio": False},
)