import atexit
import base64
import os
import shutil
import socket
import subprocess
import sys
import time
from pathlib import Path

import cv2
import requests
import streamlit as st
from huggingface_hub import hf_hub_download
from PIL import Image
# --- Configuration (reused from main.py) ---
PORT = 8000
BASE_URL = f"http://localhost:{PORT}/v1"
MODEL_ALIAS = "gpt-4-vision-preview"
REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
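# Note: MODEL_ALIAS is only the client-facing name. llama_cpp.server exposes an
# OpenAI-compatible API, so the alias can be anything as long as the "model"
# field in each request matches the alias the server was started with.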
# Download a model file from the Hugging Face Hub if it is not already present
def download_if_missing(repo_id: str, filename: str) -> None:
    if not os.path.isfile(filename):
        cached = hf_hub_download(repo_id=repo_id, filename=filename)
        shutil.copy(cached, filename)
# Ensure both model files are present on startup
def ensure_models() -> None:
    download_if_missing(REPO_ID, MODEL_FILE)
    download_if_missing(REPO_ID, PROJ_FILE)

ensure_models()
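# hf_hub_download caches downloads (by default under ~/.cache/huggingface), so
# repeat launches only pay for the local copy, not a fresh download.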
# Start the local llama.cpp server that will handle captioning requests
def start_server() -> subprocess.Popen:
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(cmd)
    atexit.register(proc.terminate)
    # Poll until the port accepts connections (40 tries x 0.25 s ~= 10 s)
    for _ in range(40):
        try:
            with socket.create_connection(("localhost", PORT), timeout=1):
                return proc
        except OSError:
            time.sleep(0.25)
    proc.terminate()
    raise RuntimeError(f"Server failed to start on port {PORT}.")

server_proc = start_server()
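# Caveat: the poll above only confirms the TCP port accepts connections; the
# model may still be loading into memory, so the first caption can be slow.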
# Send one image to the captioning API (OpenAI-compatible chat endpoint)
def caption_image_file(path: str) -> str:
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    uri = f"data:image/jpeg;base64,{b64}"
    payload = {
        "model": MODEL_ALIAS,
        "messages": [
            {"role": "system", "content": (
                "You are a precise image-captioning assistant. "
                "Identify the main subject, their clothing, posture, and environment."
            )},
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": uri}},
                {"type": "text", "text": "Caption this image in one detailed sentence."},
            ]},
        ],
        "temperature": 0.1,
        "max_tokens": 100,
    }
    resp = requests.post(BASE_URL + "/chat/completions", json=payload, timeout=60)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]
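# Example call (the output shown is hypothetical and varies with the frame):
#   caption_image_file("frame.jpg")
#   -> "A person in a dark jacket sits at a cluttered desk facing a laptop."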
# Helper: save a PIL image to a temporary JPEG on disk, then caption it
def run_caption(pil_img: Image.Image) -> str:
    tmp = Path("frame.jpg")
    pil_img.save(tmp)
    return caption_image_file(str(tmp))
# --- Streamlit UI ---
st.set_page_config(page_title="Real-Time Camera Captioning", layout="wide")
st.title("🎥 Real-Time Camera Captioning")

interval = st.sidebar.slider("Interval between captions (seconds)", min_value=1, max_value=10, value=3)
start = st.sidebar.button("Start")
stop = st.sidebar.button("Stop")

if "running" not in st.session_state:
    st.session_state.running = False
if start:
    st.session_state.running = True
if stop:
    st.session_state.running = False
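# Streamlit reruns this script from the top on every widget interaction, and
# button values reset to False on each rerun; st.session_state carries the
# running flag across reruns, which is why Start/Stop behave like a toggle.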
# Placeholders for the live frame and its caption
frame_placeholder = st.empty()
caption_placeholder = st.empty()

# Open the default camera via OpenCV
cap = cv2.VideoCapture(0)

while st.session_state.running:
    ret, frame = cap.read()
    if not ret:
        st.error("Unable to read from camera.")
        break
    # OpenCV delivers BGR; PIL expects RGB
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(rgb)
    # Show the current frame
    frame_placeholder.image(img, caption="Live Feed", use_container_width=True)
    # Generate and display a caption for this frame
    with st.spinner("Generating caption..."):
        caption = run_caption(img)
    caption_placeholder.markdown(f"**Caption:** {caption}")
    time.sleep(interval)

cap.release()
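# To try this locally (assuming the file is saved as app.py; the package names
# below are the usual PyPI ones, with llama-cpp-python's "server" extra):
#   pip install streamlit opencv-python pillow requests huggingface_hub "llama-cpp-python[server]"
#   streamlit run app.py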