Luigi committed on
Commit
ca97f63
·
1 Parent(s): 62ed9f9

initial commit

Files changed (3)
  1. README.md +44 -1
  2. app.py +132 -0
  3. requirements.txt +10 -0
README.md CHANGED
@@ -11,4 +11,47 @@ license: mit
  short_description: SmolVLM2 on llama.cpp
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # SmolVLM2 Live Inference Demo
+
+ This Hugging Face Spaces demo runs a SmolVLM2 Instruct GGUF model (2.2B, 500M, or 256M) on CPU, using `llama-cpp-python` (which builds `llama.cpp` under the hood) for inference and Streamlit with `streamlit-webrtc` for the UI. It captures a frame from your webcam every N milliseconds, runs inference on it, and displays the model's response in real time.
+
+ ## Setup
+
+ 1. **Clone this repository**
+
+ ```bash
+ git clone <your-space-repo-url>
+ cd <your-space-repo-name>
+ ```
+
+ 2. **Install dependencies**
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. **Add your GGUF models**
+
+ Create a `models/` directory in the root of the repo and upload your `.gguf` files (a scripted alternative is sketched after this list):
+
+ ```bash
+ mkdir models
+ # then upload:
+ # - smolvlm2-2.2B-instruct.gguf
+ # - smolvlm2-500M-instruct.gguf
+ # - smolvlm2-256M-instruct.gguf
+ ```
+
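+ As an alternative to uploading by hand, the snippet below is a minimal sketch using `huggingface-cli` (assumptions: `huggingface-hub` is installed, and the repo ID and filename shown are the defaults from `app.py`; substitute your own models as needed):
+
+ ```bash
+ huggingface-cli download ggml-org/SmolVLM2-500M-Video-Instruct-GGUF \
+   SmolVLM2-500M-Video-Instruct-Q8_0.gguf --local-dir models
+ ```
+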
+ ## Usage
+
+ - **Select Model**: Choose one of the `.gguf` files you uploaded.
+ - **System Prompt**: Customize the system-level instructions for the model.
+ - **User Prompt**: Provide the user query or instruction.
+ - **Interval (ms)**: Set how often (in milliseconds) to capture a frame and run inference.
+ - **Live Camera Feed**: The demo starts your webcam and captures frames at the specified interval.
+ - **Model Output**: The model's response appears below the camera feed.
+
+ ## Notes
+
+ - This demo runs entirely on CPU; inference speed depends on the model size and your machine's CPU performance.
+ - Make sure your browser has permission to access your webcam.
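+
+ To try the demo outside Spaces, `streamlit run app.py` starts the UI, and `app.py` launches a `llama-cpp-python` server that exposes an OpenAI-compatible API. A quick sanity check against that server (a sketch; port `8000` and the `gpt-4-vision-preview` alias are the defaults hard-coded in `app.py`):
+
+ ```bash
+ curl http://localhost:8000/v1/chat/completions \
+   -H "Content-Type: application/json" \
+   -d '{"model": "gpt-4-vision-preview", "messages": [{"role": "user", "content": "Hello"}]}'
+ ```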
app.py ADDED
@@ -0,0 +1,132 @@
+ # app.py
+ import torch; torch.classes.__path__ = []  # Workaround: stop Streamlit's file watcher from inspecting torch.classes
+ import os
+ import sys
+ import time
+ import subprocess
+ import atexit
+ import base64
+ import shutil
+
+ import cv2
+ import streamlit as st
+ import requests
+ from streamlit_webrtc import webrtc_streamer, VideoProcessorBase
+ from huggingface_hub import hf_hub_download
+
+ # ── Configuration ──────────────────────────────────────────────────────────────
+ PORT = 8000
+ BASE_URL = f"http://localhost:{PORT}/v1"
+ MODEL_ALIAS = "gpt-4-vision-preview"  # alias the server registers; requests must use the same "model" name
+ REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
+ MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
+ PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
+
+ # ── Helpers to download & launch server ─────────────────────────────────────────
+ def download_if_missing(repo_id: str, filename: str):
+     # Fetch the file into the Hugging Face Hub cache, then copy it next to app.py.
+     if not os.path.exists(filename):
+         cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
+         shutil.copy(cached, filename)
+
+ def ensure_models():
+     download_if_missing(REPO_ID, MODEL_FILE)
+     download_if_missing(REPO_ID, PROJ_FILE)
+
+ def start_server():
+     cmd = [
+         sys.executable, "-m", "llama_cpp.server",
+         "--model", MODEL_FILE,
+         "--clip_model_path", PROJ_FILE,
+         "--chat_format", "llava-1-5",
+         "--port", str(PORT),
+         "--model_alias", MODEL_ALIAS,
+     ]
+     proc = subprocess.Popen(
+         cmd,
+         stdout=subprocess.PIPE,
+         stderr=subprocess.STDOUT,
+         text=True,  # so line buffering works
+         bufsize=1,
+     )
+     atexit.register(proc.terminate)
+
+     # Block until the server logs that startup finished.
+     for line in proc.stdout:
+         if "Application startup complete." in line:
+             return proc
+
+     raise RuntimeError(f"Server failed to start on port {PORT}")
+
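+ # Note: matching a fixed uvicorn log line is brittle. A sturdier readiness check
+ # (an alternative sketch, not what this app does) would be to poll
+ # f"{BASE_URL}/models" with requests until the server responds.
+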
+ # ── Boot llama-cpp-python server ────────────────────────────────────────────────
+ ensure_models()
+ _server_proc = start_server()
+
+ # ── Streamlit UI ───────────────────────────────────────────────────────────────
+ st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
+ st.title("📸 Live Camera Captioning with SmolVLM")
+ st.markdown(
+     """
+     Use the **slider** below to choose how often (in milliseconds) to
+     send a frame to SmolVLM for captioning. The latest caption will
+     be overlaid on your video feed.
+     """
+ )
+ interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)
+
+ # ── Video processor ────────────────────────────────────────────────────────────
+ class CaptionProcessor(VideoProcessorBase):
+     def __init__(self, interval_ms: int):
+         self.interval = interval_ms / 1000.0
+         self.last_time = 0.0
+         self.caption = "Waiting for caption..."
+         self.font = cv2.FONT_HERSHEY_SIMPLEX
+
+     def recv(self, frame):
+         img = frame.to_ndarray(format="bgr24")
+         now = time.time()
+         if now - self.last_time >= self.interval:
+             self.last_time = now
+
+             # JPEG + base64 encode
+             success, buf = cv2.imencode(".jpg", img)
+             if success:
+                 b64 = base64.b64encode(buf).decode("utf-8")
+                 payload = {
+                     "model": MODEL_ALIAS,
+                     "messages": [
+                         {
+                             "role": "system",
+                             "content": (
+                                 "You are a precise image-captioning assistant. "
+                                 "Identify the main subject, their clothing, posture, and environment."
+                             ),
+                         },
+                         {
+                             "role": "user",
+                             "content": [
+                                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
+                                 {"type": "text", "text": "Caption this image in one detailed sentence."},
+                             ],
+                         },
+                     ],
+                     "temperature": 0.1,
+                     "max_tokens": 100,
+                 }
+
+                 try:
+                     r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
+                     r.raise_for_status()
+                     self.caption = r.json()["choices"][0]["message"]["content"].strip()
+                 except Exception as e:
+                     self.caption = f"[Error] {e}"
+
+         # overlay caption
+         y = img.shape[0] - 20
+         cv2.putText(img, self.caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
+         return frame.from_ndarray(img, format="bgr24")
+
+ webrtc_streamer(
+     key=f"caption_{interval_ms}",
+     video_processor_factory=lambda: CaptionProcessor(interval_ms),
+     media_stream_constraints={"video": True, "audio": False},
+ )
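+
+ # Design note: interval_ms is baked into the component key, so moving the slider
+ # tears down and recreates the WebRTC component (and its CaptionProcessor) with
+ # the new interval; the previous caption state is discarded.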
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ # requirements.txt
+
+ streamlit
+ streamlit-webrtc
+ yolov5
+ opencv-python-headless
+ numpy
+ llama-cpp-python[server]>=0.1.102
+ huggingface-hub>=0.13.3
+ openai>=0.27.0