import os
import sys
import time
import socket
import atexit
import subprocess
import shutil
from pathlib import Path

import streamlit as st
import cv2
from PIL import Image
import base64
import requests
from huggingface_hub import hf_hub_download

# --- Configuration (reuse from main.py) ---
PORT = 8000
BASE_URL = f"http://localhost:{PORT}/v1"
MODEL_ALIAS = "gpt-4-vision-preview"
REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"


# Download a model file from the Hugging Face Hub if it is missing locally
def download_if_missing(repo_id: str, filename: str) -> None:
    if not os.path.isfile(filename):
        cached = hf_hub_download(repo_id=repo_id, filename=filename)
        shutil.copy(cached, filename)


# Ensure both the model and the multimodal projector are present on startup
def ensure_models() -> None:
    download_if_missing(REPO_ID, MODEL_FILE)
    download_if_missing(REPO_ID, PROJ_FILE)


ensure_models()


# Start the local llama_cpp.server process used for captioning
def start_server() -> subprocess.Popen:
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(cmd)
    atexit.register(proc.terminate)
    # Wait until the server accepts connections (up to ~10 seconds)
    for _ in range(40):
        try:
            with socket.create_connection(("localhost", PORT), timeout=1):
                return proc
        except OSError:
            time.sleep(0.25)
    proc.terminate()
    raise RuntimeError(f"Server failed to start on port {PORT}.")


server_proc = start_server()


# Send an image file to the local OpenAI-compatible chat completions endpoint
def caption_image_file(path: str) -> str:
    with open(path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode()
    uri = f"data:image/jpeg;base64,{b64}"
    payload = {
        "model": MODEL_ALIAS,
        "messages": [
            {"role": "system", "content": (
                "You are a precise image-captioning assistant. "
                "Identify the main subject, their clothing, posture, and environment."
            )},
            {"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": uri}},
                {"type": "text", "text": "Caption this image in one detailed sentence."}
            ]}
        ],
        "temperature": 0.1,
        "max_tokens": 100,
    }
    resp = requests.post(BASE_URL + "/chat/completions", json=payload)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]


# Helper: caption a PIL image by saving it to a temporary JPEG on disk
def run_caption(pil_img: Image.Image) -> str:
    tmp = Path("frame.jpg")
    pil_img.save(tmp)
    return caption_image_file(str(tmp))


# --- Streamlit UI ---
st.set_page_config(page_title="Real-Time Camera Captioning", layout="wide")
st.title("🎥 Real-Time Camera Captioning")

interval = st.sidebar.slider("Interval between captions (seconds)", min_value=1, max_value=10, value=3)
start = st.sidebar.button("Start")
stop = st.sidebar.button("Stop")

if "running" not in st.session_state:
    st.session_state.running = False
if start:
    st.session_state.running = True
if stop:
    st.session_state.running = False

# Placeholders for the video frame and the generated caption
frame_placeholder = st.empty()
caption_placeholder = st.empty()

# OpenCV camera (default device 0)
cap = cv2.VideoCapture(0)

while st.session_state.running:
    ret, frame = cap.read()
    if not ret:
        st.error("Unable to read from camera.")
        break
    # Convert BGR (OpenCV) to RGB (PIL)
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    img = Image.fromarray(rgb)
    # Show the current frame
    frame_placeholder.image(img, caption="Live Feed", use_container_width=True)
    # Generate and show a caption for the current frame
    with st.spinner("Generating caption..."):
        caption = run_caption(img)
    caption_placeholder.markdown(f"**Caption:** {caption}")
    time.sleep(interval)

cap.release()
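
# Usage sketch (assumptions not stated in the original: the script is saved as
# app.py, and llama-cpp-python with its server extra, streamlit, opencv-python,
# pillow, requests, and huggingface_hub are installed):
#
#   pip install "llama-cpp-python[server]" streamlit opencv-python pillow requests huggingface_hub
#   streamlit run app.py
#
# Streamlit re-executes the whole script on each interaction, so the model
# download check and server startup above run again on every rerun; the port
# check keeps this harmless because an already-running server is reused.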