import atexit
import base64
import os
import shutil
import socket
import subprocess
import sys
import time
from pathlib import Path

import cv2
import requests
import streamlit as st
from huggingface_hub import hf_hub_download
from PIL import Image
# --- Configuration (reused from main.py) ---
PORT = 8000
BASE_URL = f"http://localhost:{PORT}/v1"
MODEL_ALIAS = "gpt-4-vision-preview"
REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
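# Note: MODEL_ALIAS is only the client-facing name. llama_cpp.server exposes an
# OpenAI-compatible API, so the alias can be anything as long as the "model"
# field in each request matches the alias the server was started with.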
# Download a model file from the Hugging Face Hub if it is not already present
def download_if_missing(repo_id: str, filename: str) -> None:
    if not os.path.isfile(filename):
        cached = hf_hub_download(repo_id=repo_id, filename=filename)
        shutil.copy(cached, filename)
# Ensure both model files are present on startup
def ensure_models() -> None:
    download_if_missing(REPO_ID, MODEL_FILE)
    download_if_missing(REPO_ID, PROJ_FILE)

ensure_models()
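# hf_hub_download caches downloads (by default under ~/.cache/huggingface), so
# repeat launches only pay for the local copy, not a fresh download.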
# Start the local llama.cpp server that will handle captioning requests
def start_server() -> subprocess.Popen:
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(cmd)
    atexit.register(proc.terminate)
    # Poll until the port accepts connections (40 tries x 0.25 s ~= 10 s)
    for _ in range(40):
        try:
            with socket.create_connection(("localhost", PORT), timeout=1):
                return proc
        except OSError:
            time.sleep(0.25)
    proc.terminate()
    raise RuntimeError(f"Server failed to start on port {PORT}.")

server_proc = start_server()
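# Caveat: the poll above only confirms the TCP port accepts connections; the
# model may still be loading into memory, so the first caption can be slow.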
# Send one image to the captioning API (OpenAI-compatible chat endpoint)
def caption_image_file(path: str) -> str:
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    uri = f"data:image/jpeg;base64,{b64}"
    payload = {
        "model": MODEL_ALIAS,
        "messages": [
            {"role": "system", "content": (
                "You are a precise image-captioning assistant. "
                "Identify the main subject, their clothing, posture, and environment."
            )},
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": uri}},
                {"type": "text", "text": "Caption this image in one detailed sentence."},
            ]},
        ],
        "temperature": 0.1,
        "max_tokens": 100,
    }
    resp = requests.post(BASE_URL + "/chat/completions", json=payload, timeout=60)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]
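# Example call (the output shown is hypothetical and varies with the frame):
#   caption_image_file("frame.jpg")
#   -> "A person in a dark jacket sits at a cluttered desk facing a laptop."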
# Helper: save a PIL image to a temporary JPEG on disk, then caption it
def run_caption(pil_img: Image.Image) -> str:
    tmp = Path("frame.jpg")
    pil_img.save(tmp)
    return caption_image_file(str(tmp))
# --- Streamlit UI ---
st.set_page_config(page_title="Real-Time Camera Captioning", layout="wide")
st.title("🎥 Real-Time Camera Captioning")

interval = st.sidebar.slider("Interval between captions (seconds)", min_value=1, max_value=10, value=3)
start = st.sidebar.button("Start")
stop = st.sidebar.button("Stop")

if "running" not in st.session_state:
    st.session_state.running = False
if start:
    st.session_state.running = True
if stop:
    st.session_state.running = False
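# Streamlit reruns this script from the top on every widget interaction, and
# button values reset to False on each rerun; st.session_state carries the
# running flag across reruns, which is why Start/Stop behave like a toggle.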
# Placeholders for the live frame and its caption
frame_placeholder = st.empty()
caption_placeholder = st.empty()

# Open the default camera via OpenCV
cap = cv2.VideoCapture(0)

while st.session_state.running:
    ret, frame = cap.read()
    if not ret:
        st.error("Unable to read from camera.")
        break
    # OpenCV delivers BGR; PIL expects RGB
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(rgb)
    # Show the current frame
    frame_placeholder.image(img, caption="Live Feed", use_container_width=True)
    # Generate and display a caption for this frame
    with st.spinner("Generating caption..."):
        caption = run_caption(img)
    caption_placeholder.markdown(f"**Caption:** {caption}")
    time.sleep(interval)

cap.release()
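# To try this locally (assuming the file is saved as app.py; the package names
# below are the usual PyPI ones, with llama-cpp-python's "server" extra):
#   pip install streamlit opencv-python pillow requests huggingface_hub "llama-cpp-python[server]"
#   streamlit run app.py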