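# Real-time webcam captioning with SmolVLM2 (GGUF) on CPU.
# llama-cpp-python runs the quantized model; Gradio streams webcam frames into
# caption_frame(), which returns a short description of each frame.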
import logging
import gradio as gr
import cv2
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 1) Inline definition of SmolVLM2ChatHandler (SmolVLM2-style chat template)
class SmolVLM2ChatHandler(Llava15ChatHandler):
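    # Jinja2 chat template approximating the SmolVLM2 prompt format: each turn is
    # rendered as "Role: <text / image URL>" and terminated with
    # <end_of_utterance>; "Assistant:" is appended when a generation prompt is
    # requested.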
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 2) Model & CLIP files β€” download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE  = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO  = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"

def ensure_models():
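    # hf_hub_download stores files in the local Hugging Face cache; a symlink in
    # the working directory exposes them under the expected filenames without
    # copying the large GGUF files.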
    logging.debug("Ensuring model files are present...")
    if not os.path.exists(MODEL_FILE):
        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
    else:
        logging.debug(f"Model file {MODEL_FILE} already exists.")

    if not os.path.exists(CLIP_FILE):
        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
    else:
        logging.debug(f"CLIP file {CLIP_FILE} already exists.")

ensure_models()
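
# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 3) Load the model once at startup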


def load_llm():
    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
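    # n_ctx=8192 leaves headroom for the image tokens injected by the projector
    # plus the generated caption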
    llm = Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )
    logging.info("Llama model loaded successfully.")
    return llm

llm = load_llm()

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    logging.debug("caption_frame called.")
    # make a writable copy and resize to a fixed 384x384 input
    frame = frame.copy()
    frame = cv2.resize(frame, (384, 384))
    # Gradio delivers RGB frames; cv2.imwrite expects BGR, so convert before saving
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")

    # save frame to temporary file for URI
    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
        success = cv2.imwrite(f.name, frame)
        if not success:
            logging.error(f"Failed to write frame to {f.name}")
        else:
            logging.debug(f"Frame written to temp file: {f.name}")

        uri = Path(f.name).absolute().as_uri()
        logging.debug(f"Frame URI: {uri}")

        # build a single prompt string
        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text",      "text": "What is happening in this image?"},
                ],
            },
        ]
        logging.debug(f"Constructed messages: {messages}")

        # stateless completion call
        logging.debug("Resetting LLM and clearing cache.")
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                           # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()            # clear any cached key/values
        logging.debug("Sending chat completion request...")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )
        logging.debug(f"LLM raw response: {resp}")

    # extract caption
    caption = (resp.get("choices", [])[0]["message"].get("content", "") or "").strip()
    logging.debug(f"Extracted caption: {caption}")
    return caption

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 5) Gradio UI (v5 streaming)
with gr.Blocks() as demo:
    gr.Markdown("## πŸŽ₯ Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # stream frames and captions
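    # stream_every=3 sends a frame roughly every 3 seconds; time_limit=600 ends
    # the stream after 10 minutes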
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600
    )

if __name__ == "__main__":
    logging.debug("Launching Gradio demo...")
    demo.launch()