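"""Real-time webcam captioning with SmolVLM2 (GGUF) on CPU.

Streams webcam frames from a Gradio UI, captions each frame with a quantized
SmolVLM2-500M-Video-Instruct model via llama-cpp-python, and shows the result
in a textbox.
"""
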
import gradio as gr
import cv2
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 1) Inline definition of SmolVLM2ChatHandler with a custom chat template
class SmolVLM2ChatHandler(Llava15ChatHandler):
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
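
# For the messages built in caption_frame() below, this template renders roughly:
#   <|im_start|>System: <system instructions><end_of_utterance>
#   User:file:///tmp/<frame>.jpg
#   What is happening in this image?<end_of_utterance>
#   Assistant: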

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 2) Model & CLIP files β€” download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE  = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO  = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"

def ensure_models():
    if not os.path.exists(MODEL_FILE):
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
    if not os.path.exists(CLIP_FILE):
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
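
# hf_hub_download() returns the path of the cached file inside the local
# Hugging Face cache; the symlinks above expose the models under the short
# filenames used by Llama(model_path=...) and the chat handler.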

ensure_models()

def load_llm():
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    return Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )

llm = load_llm()
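# A single Llama instance is created at startup; SmolVLM2ChatHandler supplies
# the CLIP projector (mmproj file) used to embed image inputs.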

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 3) Captioning helper (stateless prompt)
def caption_frame(frame):
    # Gradio streams frames as RGB numpy arrays; convert to BGR for cv2.imwrite
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    # save the frame to a temporary file so it can be referenced by a file:// URI
    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
        cv2.imwrite(f.name, frame)
        uri = Path(f.name).absolute().as_uri()

        # build a single prompt string
        messages = [
            {
                # use content "parts" so the template above (which iterates the
                # content list) includes the system text in the rendered prompt
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Focus only on describing the key dramatic action or notable event occurring "
                            "in this image. Skip general context or scene-setting details unless they are "
                            "crucial to understanding the main action."
                        ),
                    },
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text",      "text": "What is happening in this image?"},
                ],
            },
        ]

        # stateless completion: fresh chat handler, reset context, clear KV cache
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                    # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()      # clear cached key/values (private llama-cpp-python API)
        resp = llm.create_chat_completion(
            messages = messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )

    # extract caption
    caption = (resp["choices"][0]["message"].get("content") or "").strip()
    return caption
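
# Quick standalone check (hypothetical usage; assumes a local image 'example.jpg'):
#   import cv2
#   frame = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB)
#   print(caption_frame(frame))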

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 4) Gradio UI (v5 streaming)
with gr.Blocks() as demo:
    gr.Markdown("## πŸŽ₯ Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # stream frames and captions
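    # stream_every=3 sends a frame to caption_frame roughly every 3 seconds;
    # time_limit ends each streaming session after 600 seconds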
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600
    )

if __name__ == "__main__":
    demo.launch()