Luigi committed on
Commit
ca97f63
·
1 Parent(s): 62ed9f9

initial commit

Files changed (3)
  1. README.md +44 -1
  2. app.py +132 -0
  3. requirements.txt +10 -0
README.md CHANGED
@@ -11,4 +11,47 @@ license: mit
  short_description: SmolVLM2 on llama.cpp
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # SmolVLM2 Live Inference Demo
+
+ This Hugging Face Spaces demo runs a SmolVLM2 Instruct GGUF model (2.2B, 500M, or 256M) on CPU, using `llama-cpp-python` (which builds `llama.cpp` under the hood) for inference and Streamlit with `streamlit-webrtc` for the UI. It captures a frame from your webcam every N milliseconds, runs inference on it, and displays the model's response in real time.
+
+ ## Setup
+
+ 1. **Clone this repository**
+
+ ```bash
+ git clone <your-space-repo-url>
+ cd <your-space-repo-name>
+ ```
+
+ 2. **Install dependencies**
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. **Add your GGUF models**
+
+ Create a `models/` directory in the root of the repo and upload your `.gguf` files (a scripted alternative is sketched after this list):
+
+ ```bash
+ mkdir models
+ # then upload:
+ # - smolvlm2-2.2B-instruct.gguf
+ # - smolvlm2-500M-instruct.gguf
+ # - smolvlm2-256M-instruct.gguf
+ ```
+
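+ As an alternative to uploading by hand, the snippet below is a minimal sketch using `huggingface-cli` (assumptions: `huggingface-hub` is installed, and the repo ID and filename shown are the defaults from `app.py`; substitute your own models as needed):
+
+ ```bash
+ huggingface-cli download ggml-org/SmolVLM2-500M-Video-Instruct-GGUF \
+   SmolVLM2-500M-Video-Instruct-Q8_0.gguf --local-dir models
+ ```
+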
+ ## Usage
+
+ - **Select Model**: Choose one of the `.gguf` files you uploaded.
+ - **System Prompt**: Customize the system-level instructions for the model.
+ - **User Prompt**: Provide the user query or instruction.
+ - **Interval (ms)**: Set how often (in milliseconds) to capture a frame and run inference.
+ - **Live Camera Feed**: The demo starts your webcam and captures frames at the specified interval.
+ - **Model Output**: The model's response appears below the camera feed.
+
+ ## Notes
+
+ - This demo runs entirely on CPU; inference speed depends on the model size and your machine's CPU performance.
+ - Make sure your browser has permission to access your webcam.
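+
+ To try the demo outside Spaces, `streamlit run app.py` starts the UI, and `app.py` launches a `llama-cpp-python` server that exposes an OpenAI-compatible API. A quick sanity check against that server (a sketch; port `8000` and the `gpt-4-vision-preview` alias are the defaults hard-coded in `app.py`):
+
+ ```bash
+ curl http://localhost:8000/v1/chat/completions \
+   -H "Content-Type: application/json" \
+   -d '{"model": "gpt-4-vision-preview", "messages": [{"role": "user", "content": "Hello"}]}'
+ ```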
app.py ADDED
@@ -0,0 +1,132 @@
+ # app.py
+ import torch; torch.classes.__path__ = []  # Workaround: stop Streamlit's file watcher from inspecting torch.classes
+ import os
+ import sys
+ import time
+ import subprocess
+ import atexit
+ import base64
+ import shutil
+
+ import cv2
+ import streamlit as st
+ import requests
+ from streamlit_webrtc import webrtc_streamer, VideoProcessorBase
+ from huggingface_hub import hf_hub_download
+
+ # ── Configuration ──────────────────────────────────────────────────────────────
+ PORT = 8000
+ BASE_URL = f"http://localhost:{PORT}/v1"
+ MODEL_ALIAS = "gpt-4-vision-preview"  # alias the server registers; requests must use the same "model" name
+ REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
+ MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
+ PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
+
+ # ── Helpers to download & launch server ─────────────────────────────────────────
+ def download_if_missing(repo_id: str, filename: str):
+     # Fetch the file into the Hugging Face Hub cache, then copy it next to app.py.
+     if not os.path.exists(filename):
+         cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
+         shutil.copy(cached, filename)
+
+ def ensure_models():
+     download_if_missing(REPO_ID, MODEL_FILE)
+     download_if_missing(REPO_ID, PROJ_FILE)
+
+ def start_server():
+     cmd = [
+         sys.executable, "-m", "llama_cpp.server",
+         "--model", MODEL_FILE,
+         "--clip_model_path", PROJ_FILE,
+         "--chat_format", "llava-1-5",
+         "--port", str(PORT),
+         "--model_alias", MODEL_ALIAS,
+     ]
+     proc = subprocess.Popen(
+         cmd,
+         stdout=subprocess.PIPE,
+         stderr=subprocess.STDOUT,
+         text=True,  # so line buffering works
+         bufsize=1,
+     )
+     atexit.register(proc.terminate)
+
+     # Block until the server logs that startup finished.
+     for line in proc.stdout:
+         if "Application startup complete." in line:
+             return proc
+
+     raise RuntimeError(f"Server failed to start on port {PORT}")
+
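+ # Note: matching a fixed uvicorn log line is brittle. A sturdier readiness check
+ # (an alternative sketch, not what this app does) would be to poll
+ # f"{BASE_URL}/models" with requests until the server responds.
+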
+ # ── Boot llama-cpp-python server ────────────────────────────────────────────────
+ ensure_models()
+ _server_proc = start_server()
+
+ # ── Streamlit UI ───────────────────────────────────────────────────────────────
+ st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
+ st.title("📸 Live Camera Captioning with SmolVLM")
+ st.markdown(
+     """
+     Use the **slider** below to choose how often (in milliseconds) to
+     send a frame to SmolVLM for captioning. The latest caption will
+     be overlaid on your video feed.
+     """
+ )
+ interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)
+
+ # ── Video processor ────────────────────────────────────────────────────────────
+ class CaptionProcessor(VideoProcessorBase):
+     def __init__(self, interval_ms: int):
+         self.interval = interval_ms / 1000.0
+         self.last_time = 0.0
+         self.caption = "Waiting for caption..."
+         self.font = cv2.FONT_HERSHEY_SIMPLEX
+
+     def recv(self, frame):
+         img = frame.to_ndarray(format="bgr24")
+         now = time.time()
+         if now - self.last_time >= self.interval:
+             self.last_time = now
+
+             # JPEG + base64 encode
+             success, buf = cv2.imencode(".jpg", img)
+             if success:
+                 b64 = base64.b64encode(buf).decode("utf-8")
+                 payload = {
+                     "model": MODEL_ALIAS,
+                     "messages": [
+                         {
+                             "role": "system",
+                             "content": (
+                                 "You are a precise image-captioning assistant. "
+                                 "Identify the main subject, their clothing, posture, and environment."
+                             ),
+                         },
+                         {
+                             "role": "user",
+                             "content": [
+                                 {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
+                                 {"type": "text", "text": "Caption this image in one detailed sentence."},
+                             ],
+                         },
+                     ],
+                     "temperature": 0.1,
+                     "max_tokens": 100,
+                 }
+
+                 try:
+                     r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
+                     r.raise_for_status()
+                     self.caption = r.json()["choices"][0]["message"]["content"].strip()
+                 except Exception as e:
+                     self.caption = f"[Error] {e}"
+
+         # overlay caption
+         y = img.shape[0] - 20
+         cv2.putText(img, self.caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
+         return frame.from_ndarray(img, format="bgr24")
+
+ webrtc_streamer(
+     key=f"caption_{interval_ms}",
+     video_processor_factory=lambda: CaptionProcessor(interval_ms),
+     media_stream_constraints={"video": True, "audio": False},
+ )
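+
+ # Design note: interval_ms is baked into the component key, so moving the slider
+ # tears down and recreates the WebRTC component (and its CaptionProcessor) with
+ # the new interval; the previous caption state is discarded.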
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ # requirements.txt
+
+ streamlit
+ streamlit-webrtc
+ yolov5
+ opencv-python-headless
+ numpy
+ llama-cpp-python[server]>=0.1.102
+ huggingface-hub>=0.13.3
+ openai>=0.27.0