Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Running

App Files Files Community

Luigi committed 15 days ago

Commit

dd0d47d

1 Parent(s): ca97f63

update

Browse files

Files changed (2) hide show

app.py +105 -103
requirements.txt +4 -10

app.py CHANGED Viewed

@@ -1,132 +1,134 @@
-# app.py
-import torch; torch.classes.__path__ = []  # Neutralizes the path inspection
 import os
 import sys
 import time
 import socket
-import subprocess
 import atexit
-import base64
 import shutil
-import cv2
 import streamlit as st
 import requests
-from streamlit_webrtc import webrtc_streamer, VideoProcessorBase
 from huggingface_hub import hf_hub_download
-# ── Configuration ──────────────────────────────────────────────────────────────
-PORT        = 8000
-BASE_URL    = f"http://localhost:{PORT}/v1"
 MODEL_ALIAS = "gpt-4-vision-preview"
 REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
 MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
-PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
-# ── Helpers to download & launch server ─────────────────────────────────────────
-def download_if_missing(repo_id: str, filename: str):
-    if not os.path.exists(filename):
-        cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
         shutil.copy(cached, filename)
-def ensure_models():
-    download_if_missing(REPO_ID, MODEL_FILE)
-    download_if_missing(REPO_ID, PROJ_FILE)
-def start_server():
     cmd = [
         sys.executable, "-m", "llama_cpp.server",
         "--model", MODEL_FILE,
         "--clip_model_path", PROJ_FILE,
         "--chat_format", "llava-1-5",
         "--port", str(PORT),
-        "--model_alias", MODEL_ALIAS,
     ]
-    proc = subprocess.Popen(
-        cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,    # so line buffering works
-        bufsize=1,
-    )
     atexit.register(proc.terminate)
-    for line in proc.stdout:
-        if "Application startup complete." in line:
-            return proc
-    raise RuntimeError(f"Server failed to start on port {PORT}")
-# ── Boot llama-cpp-python server ────────────────────────────────────────────────
-ensure_models()
-_server_proc = start_server()
-# ── Streamlit UI ───────────────────────────────────────────────────────────────
-st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
-st.title("📸 Live Camera Captioning with SmolVLM")
-st.markdown(
-    """
-Use the **slider** below to choose how often (in milliseconds) to
-send a frame to SmolVLM for captioning. The latest caption will
-be overlaid on your video feed.
-"""
-)
-interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)
-# ── Video processor ────────────────────────────────────────────────────────────
-class CaptionProcessor(VideoProcessorBase):
-    def __init__(self, interval_ms: int):
-        self.interval   = interval_ms / 1000.0
-        self.last_time  = 0.0
-        self.caption    = "Waiting for caption..."
-        self.font       = cv2.FONT_HERSHEY_SIMPLEX
-    def recv(self, frame):
-        img = frame.to_ndarray(format="bgr24")
-        now = time.time()
-        if now - self.last_time >= self.interval:
-            self.last_time = now
-            # JPEG + base64 encode
-            success, buf = cv2.imencode(".jpg", img)
-            if success:
-                b64 = base64.b64encode(buf).decode("utf-8")
-                payload = {
-                    "model": MODEL_ALIAS,
-                    "messages": [
-                        {
-                            "role": "system",
-                            "content": (
-                                "You are a precise image‐captioning assistant. "
-                                "Identify the main subject, their clothing, posture, and environment."
-                            ),
-                        },
-                        {
-                            "role": "user",
-                            "content": [
-                                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
-                                {"type": "text", "text": "Caption this image in one detailed sentence."},
-                            ],
-                        },
-                    ],
-                    "temperature": 0.1,
-                    "max_tokens": 100,
-                }
-                try:
-                    r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
-                    r.raise_for_status()
-                    self.caption = r.json()["choices"][0]["message"]["content"].strip()
-                except Exception as e:
-                    self.caption = f"[Error] {e}"
-        # overlay caption
-        y = img.shape[0] - 20
-        cv2.putText(img, self.caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
-        return frame.from_ndarray(img, format="bgr24")
-webrtc_streamer(
-    key=f"caption_{interval_ms}",
-    video_processor_factory=lambda: CaptionProcessor(interval_ms),
-    media_stream_constraints={"video": True, "audio": False},
-)

 import os
 import sys
 import time
 import socket
 import atexit
+import subprocess
 import shutil
+from pathlib import Path
 import streamlit as st
+import cv2
+from PIL import Image
+import base64
 import requests
 from huggingface_hub import hf_hub_download
+# --- Configuration (reused from main.py): local server address and model files ---
+PORT = 8000  # port handed to llama_cpp.server via --port and polled during startup
+BASE_URL = f"http://localhost:{PORT}/v1"  # API root; "/chat/completions" is appended when captioning
 MODEL_ALIAS = "gpt-4-vision-preview"
 REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
 MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
+PROJ_FILE  = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"  # passed to the server as --clip_model_path
+# Download model files if missing
+def download_if_missing(repo_id: str, filename: str) -> None:
+    """Fetch *filename* from the Hub repo *repo_id* unless it already exists.
+
+    When no regular file exists at the path *filename*, the file is
+    resolved through ``hf_hub_download`` and the returned path is copied
+    to *filename*. When the file is already present nothing happens.
+    """
+    if os.path.isfile(filename):
+        return
+    hub_copy = hf_hub_download(repo_id=repo_id, filename=filename)
+    shutil.copy(hub_copy, filename)
+# Ensure models on startup
+def ensure_models() -> None:
+    """Make sure both GGUF files (model and projector) are present locally.
+
+    Calls ``download_if_missing`` for ``MODEL_FILE`` and then ``PROJ_FILE``,
+    both taken from ``REPO_ID``. Returns nothing; it is called purely for
+    its side effect.
+    """
+    for filename in (MODEL_FILE, PROJ_FILE):
+        download_if_missing(REPO_ID, filename)
+ensure_models()
+# Start local server for captioning
+# Streamlit re-executes this script on every interaction; caching the result
+# keeps a single server process instead of spawning a new one per rerun.
+# show_spinner=False so nothing is rendered before st.set_page_config runs.
+@st.cache_resource(show_spinner=False)
+def start_server() -> subprocess.Popen:
+    """Launch ``llama_cpp.server`` as a subprocess and wait for its port.
+
+    The child is started with the current interpreter and registered with
+    ``atexit`` so it is terminated when this process exits. The port is
+    then probed up to 40 times (1 s connect timeout, 0.25 s pause after a
+    failed attempt).
+
+    Returns:
+        The ``subprocess.Popen`` handle of the server process.
+
+    Raises:
+        RuntimeError: if the child exits before the port accepts a
+            connection, or if the port never becomes reachable.
+    """
     cmd = [
         sys.executable, "-m", "llama_cpp.server",
         "--model", MODEL_FILE,
         "--clip_model_path", PROJ_FILE,
         "--chat_format", "llava-1-5",
         "--port", str(PORT),
+        "--model_alias", MODEL_ALIAS
     ]
+    proc = subprocess.Popen(cmd)
     atexit.register(proc.terminate)
+    # wait until responsive
+    for _ in range(40):
+        try:
+            with socket.create_connection(("localhost", PORT), timeout=1):
+                return proc
+        except OSError:
+            # Fail fast: a child that has already exited will never open the port.
+            if proc.poll() is not None:
+                raise RuntimeError(
+                    f"Server failed to start on port {PORT} "
+                    f"(process exited with code {proc.returncode})."
+                ) from None
+            time.sleep(0.25)
+    proc.terminate()
+    raise RuntimeError(f"Server failed to start on port {PORT}.")
+server_proc = start_server()
+# Send image to caption API
+def caption_image_file(path: str) -> str:
+    """Caption the image stored at *path* via the local chat-completions API.
+
+    The file is read, base64-encoded, and embedded as a ``data:image/jpeg``
+    URI in a chat request posted to ``BASE_URL + "/chat/completions"``.
+
+    Args:
+        path: Path of the image file to caption.
+
+    Returns:
+        The ``content`` string of the first choice in the JSON response.
+
+    Raises:
+        OSError: if the file cannot be read.
+        requests.HTTPError: if the server answers with an error status.
+        requests.Timeout: if the server does not answer within the timeout.
+    """
+    # Context manager closes the file handle deterministically.
+    with open(path, "rb") as image_file:
+        b64 = base64.b64encode(image_file.read()).decode()
+    uri = f"data:image/jpeg;base64,{b64}"
+    payload = {
+        "model": MODEL_ALIAS,
+        "messages": [
+            {"role": "system", "content": (
+                "You are a precise image-captioning assistant. "
+                "Identify the main subject, their clothing, posture, and environment."
+            )},
+            {"role": "user", "content": [
+                {"type": "image_url", "image_url": {"url": uri}},
+                {"type": "text",      "text": "Caption this image in one detailed sentence."}
+            ]}
+        ],
+        "temperature": 0.1,
+        "max_tokens": 100
+    }
+    # Without a timeout requests waits indefinitely, so a stalled server
+    # would block the app forever; the bound is generous to allow slow inference.
+    resp = requests.post(BASE_URL + "/chat/completions", json=payload, timeout=60)
+    resp.raise_for_status()
+    return resp.json()["choices"][0]["message"]["content"]
+# Helper to handle PIL image
+def run_caption(pil_img: Image.Image) -> str:
+    """Write *pil_img* to ``frame.jpg`` and return the caption for that file.
+
+    The image is saved to the relative path ``frame.jpg`` and the path is
+    handed to ``caption_image_file``, whose result is returned unchanged.
+    """
+    frame_path = Path("frame.jpg")
+    pil_img.save(frame_path)
+    caption = caption_image_file(str(frame_path))
+    return caption
+# --- Streamlit UI: page setup, sidebar controls, and the persistent "running" flag ---
+st.set_page_config(page_title="Real-Time Camera Captioning", layout="wide")
+st.title("🎥 Real-Time Camera Captioning")
+interval = st.sidebar.slider("Interval between captions (seconds)", min_value=1, max_value=10, value=3)  # seconds slept after each caption
+start = st.sidebar.button("Start")
+stop = st.sidebar.button("Stop")
+if 'running' not in st.session_state:  # create the flag only when it is not stored yet
+    st.session_state.running = False
+if start:
+    st.session_state.running = True
+if stop:  # checked after Start, so Stop wins if both are set
+    st.session_state.running = False
+# Placeholders for video and caption; the capture loop overwrites them on each iteration
+frame_placeholder = st.empty()
+caption_placeholder = st.empty()
+# OpenCV camera: opened only while captioning is active, and always released.
+if st.session_state.running:
+    cap = cv2.VideoCapture(0)
+    try:
+        while st.session_state.running:
+            ret, frame = cap.read()
+            if not ret:
+                st.error("Unable to read from camera.")
+                break
+            # Convert BGR to RGB
+            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            img = Image.fromarray(rgb)
+            # Show frame
+            frame_placeholder.image(img, caption="Live Feed", use_container_width=True)
+            # Generate and show caption
+            with st.spinner("Generating caption..."):
+                caption = run_caption(img)
+            caption_placeholder.markdown(f"**Caption:** {caption}")
+            time.sleep(interval)
+    finally:
+        # Runs on normal exit, when run_caption raises, and when the script
+        # run is interrupted mid-loop, so the camera device is never left open.
+        cap.release()

requirements.txt CHANGED Viewed

@@ -1,10 +1,4 @@
-# requirements.txt
-streamlit
-streamlit-webrtc
-yolov5
-opencv-python-headless
-numpy
-llama-cpp-python[server]>=0.1.102
-huggingface-hub>=0.13.3
-openai>=0.27.0

+gradio>=3.0
+requests
+huggingface_hub
+llama-cpp-python