Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Running

App Files Files Community

Luigi committed 20 days ago

Commit

221e4b6

1 Parent(s): 60e423d

update

Browse files

Files changed (2) hide show

app.py +153 -133
requirements.txt +5 -9

app.py CHANGED Viewed

@@ -1,140 +1,160 @@
-import os
-import sys
-import time
-import socket
-import atexit
-import subprocess
-import shutil
-from pathlib import Path
 import streamlit as st
 import cv2
-from PIL import Image
-import base64
-import requests
 from huggingface_hub import hf_hub_download
-# --- Configuration (reuse from main.py) ---
-PORT = 8000
-BASE_URL = f"http://localhost:{PORT}/v1"
-MODEL_ALIAS = "gpt-4-vision-preview"
-REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
-MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
-PROJ_FILE  = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
-# Download model files if missing
-def download_if_missing(repo_id: str, filename: str) -> None:
-    if not os.path.isfile(filename):
-        cached = hf_hub_download(repo_id=repo_id, filename=filename)
-        shutil.copy(cached, filename)
-# Ensure models on startup
-ensure_models = lambda: [download_if_missing(REPO_ID, MODEL_FILE), download_if_missing(REPO_ID, PROJ_FILE)]
 ensure_models()
-# Start local server for captioning
-def start_server() -> subprocess.Popen:
-    cmd = [
-        sys.executable, "-m", "llama_cpp.server",
-        "--model", MODEL_FILE,
-        "--clip_model_path", PROJ_FILE,
-        "--chat_format", "llava-1-5",
-        "--port", str(PORT),
-        "--host", "127.0.0.1",
-        "--model_alias", MODEL_ALIAS
-    ]
-    print("⏳ Launching llama server:", " ".join(cmd), file=sys.stderr)
-    proc = subprocess.Popen(cmd)
-    atexit.register(proc.terminate)
-    # wait until responsive
-    for _ in range(40):
-        try:
-            with socket.create_connection(("localhost", PORT), timeout=1):
-                return proc
-        except OSError:
-            time.sleep(0.25)
-    proc.terminate()
-    out, err = proc.communicate(timeout=1)
-    print("🐛 llama server stdout:\n", out.decode(), file=sys.stderr)
-    print("🐛 llama server stderr:\n", err.decode(), file=sys.stderr)
-    raise RuntimeError(f"Server failed to start on port {PORT}.")
-server_proc = start_server()
-# Send image to caption API
-def caption_image_file(path: str) -> str:
-    b64 = base64.b64encode(open(path, "rb").read()).decode()
-    uri = f"data:image/jpeg;base64,{b64}"
-    payload = {
-        "model": MODEL_ALIAS,
-        "messages": [
-            {"role": "system", "content": (
-                "You are a precise image-captioning assistant. "
-                "Identify the main subject, their clothing, posture, and environment."
-            )},
-            {"role": "user", "content": [
                 {"type": "image_url", "image_url": {"url": uri}},
-                {"type": "text",      "text": "Caption this image in one detailed sentence."}
-            ]}
-        ],
-        "temperature": 0.1,
-        "max_tokens": 100
-    }
-    resp = requests.post(BASE_URL + "/chat/completions", json=payload)
-    resp.raise_for_status()
-    return resp.json()["choices"][0]["message"]["content"]
-# Helper to handle PIL image
-def run_caption(pil_img: Image.Image) -> str:
-    tmp = Path("frame.jpg")
-    pil_img.save(tmp)
-    return caption_image_file(str(tmp))
-# --- Streamlit UI ---
-st.set_page_config(page_title="Real-Time Camera Captioning", layout="wide")
-st.title("🎥 Real-Time Camera Captioning")
-interval = st.sidebar.slider("Interval between captions (seconds)", min_value=1, max_value=10, value=3)
-start = st.sidebar.button("Start")
-stop = st.sidebar.button("Stop")
-if 'running' not in st.session_state:
-    st.session_state.running = False
-if start:
-    st.session_state.running = True
-if stop:
-    st.session_state.running = False
-# Placeholders for video and caption
-frame_placeholder = st.empty()
-caption_placeholder = st.empty()
-# OpenCV camera
-cap = cv2.VideoCapture(0)
-while st.session_state.running:
-    ret, frame = cap.read()
-    if not ret:
-        st.error("Unable to read from camera.")
-        break
-    # Convert BGR to RGB
-    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    img = Image.fromarray(rgb)
-    # Show frame
-    frame_placeholder.image(img, caption="Live Feed", use_container_width=True)
-    # Generate and show caption
-    with st.spinner("Generating caption..."):
-        caption = run_caption(img)
-    caption_placeholder.markdown(f"**Caption:** {caption}")
-    time.sleep(interval)
-cap.release()

+# app.py
 import streamlit as st
+st.set_page_config(layout="wide")
+import av
 import cv2
+import time
+import tempfile
+import os
+from pathlib import Path
 from huggingface_hub import hf_hub_download
+from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration
+from llama_cpp import Llama
+from llama_cpp.llama_chat_format import LlamaChatCompletionHandlerRegistry, Llava15ChatHandler
+from termcolor import cprint
+# —————————————————————————————————————————
+# 1) Inline definition & registration of SmolVLM2ChatHandler
+class SmolVLM2ChatHandler(Llava15ChatHandler):
+    """Chat handler that renders messages with a SmolVLM2-style prompt.
+
+    Only ``CHAT_FORMAT`` is overridden; everything else is inherited from
+    ``Llava15ChatHandler``.
+    """
+
+    # Jinja template. Every turn is rendered as
+    #   "<Role>:[ ]<content><end_of_utterance>\n"
+    # A message's content may be either a plain string (the system prompt in
+    # this file is one) or a list of typed parts ("text" / "image_url").
+    # Plain strings must be emitted directly: iterating a string yields single
+    # characters, none of which has a 'type', so the text would otherwise be
+    # dropped from the prompt without any error.
+    CHAT_FORMAT = (
+        "<|im_start|>"
+        "{% for message in messages %}"
+        "{{ message['role'] | capitalize }}"
+        "{% if message['content'] is string %}"
+        ": {{ message['content'] }}"
+        "{% else %}"
+        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
+        "{% else %}: "
+        "{% endif %}"
+        "{% for content in message['content'] %}"
+        "{% if content['type']=='text' %}{{ content['text'] }}"
+        "{% elif content['type']=='image_url' %}"
+        "{% if content['image_url'] is string %}"
+        "{{ content['image_url'] }}\n"
+        "{% elif content['image_url'] is mapping %}"
+        "{{ content['image_url']['url'] }}\n"
+        "{% endif %}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% endif %}"
+        "<end_of_utterance>\n"
+        "{% endfor %}"
+        "{% if add_generation_prompt %}Assistant:{% endif %}"
+    )
+# Overwrite any previous registration
+LlamaChatCompletionHandlerRegistry().register_chat_completion_handler(
+    "smolvlm2", SmolVLM2ChatHandler, overwrite=True
+)
+# —————————————————————————————————————————
+# 2) Model & CLIP files — download if missing
+MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
+CLIP_FILE  = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
+MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
+CLIP_REPO  = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
+def ensure_models():
+    """Make the model and CLIP projector files available in the working dir.
+
+    For each file that is not already present, download it through the
+    Hugging Face hub cache and symlink the cached copy under its plain
+    file name.
+    """
+    for repo_id, filename in ((MODEL_REPO, MODEL_FILE), (CLIP_REPO, CLIP_FILE)):
+        if os.path.exists(filename):
+            continue
+        cached = hf_hub_download(repo_id=repo_id, filename=filename)
+        # os.path.exists() follows symlinks, so a dangling link (target gone
+        # from the cache) reports False while still occupying the name;
+        # os.symlink() would then raise FileExistsError. Drop the stale link.
+        if os.path.islink(filename):
+            os.remove(filename)
+        os.symlink(cached, filename)
 ensure_models()
+@st.cache_resource
+def load_llm():
+    """Build the Llama instance used for captioning.
+
+    Wrapped in ``st.cache_resource`` so the returned object is cached by
+    Streamlit instead of being rebuilt on every call.
+    """
+    # The chat handler is given the CLIP projector file; the Llama instance
+    # is given the language-model file.
+    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
+    return Llama(
+        model_path=MODEL_FILE,
+        chat_handler=handler,
+        n_ctx=8192,
+        verbose=False,
+    )
+llm = load_llm()
+# —————————————————————————————————————————
+# 3) Helper to run a single frame through the model (with debug)
+def caption_frame(frame):
+    """Caption a single video frame with the module-level ``llm``.
+
+    The frame is written to a temporary JPEG, passed to the model as a
+    ``file://`` URI, and the temporary file is removed afterwards.
+
+    Args:
+        frame: image array accepted by ``cv2.imwrite`` (the caller in this
+            file passes a "bgr24" ndarray).
+
+    Returns:
+        The stripped caption text, or "" if the frame could not be encoded
+        or the model returned no content.
+    """
+    # Reserve a unique path, then close the handle before OpenCV writes to
+    # it; delete=False means cleanup is our job (see the finally below).
+    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
+        tmp_path = f.name
+    try:
+        # cv2.imwrite reports failure through its return value.
+        if not cv2.imwrite(tmp_path, frame):
+            print("DEBUG ▶ caption_frame: could not write frame to disk")
+            return ""
+        uri = Path(tmp_path).absolute().as_uri()
+        messages = [
+            {
+                "role": "system",
+                "content": (
+                    "Focus only on describing the key dramatic action or notable event occurring "
+                    "in this image. Skip general context or scene-setting details unless they are "
+                    "crucial to understanding the main action."
+                ),
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": uri}},
+                    {"type": "text",      "text": "What is happening in this image?"},
+                ],
+            },
+        ]
+        print("DEBUG ▶ caption_frame: invoking LLM")
+        resp = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=128,
+            temperature=0.1,
+            stop=["<end_of_utterance>"],
+        )
+        out = (resp["choices"][0].get("message", {}).get("content") or "").strip()
+        print(f"DEBUG ▶ LLM returned: {out!r}")
+        return out
+    finally:
+        # Without this, every captioned frame leaves a JPEG in the temp dir.
+        try:
+            os.remove(tmp_path)
+        except FileNotFoundError:
+            pass
+# —————————————————————————————————————————
+# 4) Streamlit UI + WebRTC configuration
+st.title("🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
+interval_ms = st.slider(
+    "Caption every N ms", min_value=100, max_value=10000, value=1000, step=100
+)
+RTC_CONFIG = RTCConfiguration({
+    "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
+})
+class CaptionProcessor(VideoProcessorBase):
+    """Video processor that periodically captions the incoming frames.
+
+    Frames are passed through unchanged; the most recent caption is kept in
+    ``self.caption`` for the UI code to read.
+    """
+    def __init__(self):
+        # Minimum number of seconds between two captions; the UI code
+        # overwrites this with the slider value.
+        self.interval = 1.0
+        # Wall-clock time at which the last caption was started.
+        self.last_time = time.time()
+        # Latest caption text; empty until the first inference finishes.
+        self.caption = ""
+    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
+        """Caption the frame if ``interval`` has elapsed, then return it."""
+        img = frame.to_ndarray(format="bgr24")
+        now = time.time()
+        if now - self.last_time >= self.interval:
+            self.last_time = now
+            print("DEBUG ▶ CaptionProcessor.recv: time reached, generating caption")
+            # NOTE(review): caption_frame runs inline here, so this call to
+            # recv does not return until inference has finished.
+            self.caption = caption_frame(img)
+        return av.VideoFrame.from_ndarray(img, format="bgr24")
+# Start the WebRTC component; CaptionProcessor instances are created by it.
+ctx = webrtc_streamer(
+    key="smolvlm2-captioner",
+    video_processor_factory=CaptionProcessor,
+    rtc_configuration=RTC_CONFIG,
+    media_stream_constraints={"video": True, "audio": False},
+)
+# Update the processor interval
+if ctx.video_processor:
+    ctx.video_processor.interval = interval_ms / 1000.0
+# Placeholder for showing captions
+placeholder = st.empty()
+if ctx.state.playing:
+    placeholder.markdown("**Caption:** _Waiting for inference…_")
+    # Poll the processor and mirror its latest caption into the page.
+    while ctx.state.playing:
+        # Re-read the processor every iteration and guard against it being
+        # absent (the check above already treats it as optional), so a
+        # start/stop transition cannot raise AttributeError here.
+        processor = ctx.video_processor
+        if processor is None:
+            txt = "_…thinking…_"
+        else:
+            # Apply the slider value here as well: the processor may not
+            # have existed yet when the check above ran.
+            processor.interval = interval_ms / 1000.0
+            txt = processor.caption or "_…thinking…_"
+        placeholder.markdown(f"**Caption:** {txt}")
+        time.sleep(0.1)
+else:
+    st.info("▶️ Click **Start** above to begin streaming")

requirements.txt CHANGED Viewed

@@ -1,10 +1,6 @@
---extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
---prefer-binary
---only-binary=llama-cpp-pythongradio>=3.0
-requests
-huggingface_hub
 opencv-python
-fastapi
-uvicorn[standard]
-llama-cpp-python[server]==0.3.9
-Pillow

+streamlit
+streamlit-webrtc
+llama-cpp-python
+huggingface-hub
+termcolor
 opencv-python