# Source: Hugging Face Spaces - Luigi / SmolVLM2-on-llama.cpp (status: Running)
# File size: 6,594 bytes

import time
import logging
import gradio as gr
import cv2
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# ----------------------------------------
# Model configurations: per-size prefixes and repos
# Registry of supported checkpoints, keyed by the size label shown in the UI.
# Each entry records:
#   model_repo / clip_repo       - Hub repositories the files are downloaded from
#   model_prefix / clip_prefix   - filename stems the variant suffix is appended to
#   model_variants / clip_variants - variant suffixes offered for that file
MODELS = {
    "256M": dict(
        model_repo="mradermacher/SmolVLM2-256M-Video-Instruct-GGUF",
        clip_repo="ggml-org/SmolVLM2-256M-Video-Instruct-GGUF",
        model_prefix="SmolVLM2-256M-Video-Instruct",
        clip_prefix="mmproj-SmolVLM2-256M-Video-Instruct",
        model_variants=["Q8_0", "f16"],
        clip_variants=["Q8_0", "f16"],
    ),
    "500M": dict(
        model_repo="mradermacher/SmolVLM2-500M-Video-Instruct-GGUF",
        clip_repo="ggml-org/SmolVLM2-500M-Video-Instruct-GGUF",
        model_prefix="SmolVLM2-500M-Video-Instruct",
        clip_prefix="mmproj-SmolVLM2-500M-Video-Instruct",
        model_variants=["Q8_0", "f16"],
        clip_variants=["Q8_0", "f16"],
    ),
    "2.2B": dict(
        model_repo="mradermacher/SmolVLM2-2.2B-Instruct-GGUF",
        clip_repo="ggml-org/SmolVLM2-2.2B-Instruct-GGUF",
        model_prefix="SmolVLM2-2.2B-Instruct",
        clip_prefix="mmproj-SmolVLM2-2.2B-Instruct",
        model_variants=["Q4_K_M", "Q8_0", "f16"],
        clip_variants=["Q8_0", "f16"],
    ),
}

# ----------------------------------------
# Cache for loaded model instance
# Single-slot cache describing the currently loaded model: the size label, the
# decoder and CLIP filenames it was loaded from, and the Llama instance itself.
# Every slot starts as None, meaning "nothing loaded yet".
model_cache = dict.fromkeys(('size', 'model_file', 'clip_file', 'llm'))

# Helper to download & symlink weights

def _link_weight_file(repo_id, filename, label):
    """Make `filename` available in the working directory as a symlink.

    Downloads `filename` from `repo_id` via `hf_hub_download` and symlinks the
    returned path to `filename`, unless a usable file is already there.
    `label` only affects the log message.
    """
    # A dangling symlink (target deleted, e.g. the download cache was cleared)
    # makes os.path.exists() return False, and os.symlink() would then fail
    # with FileExistsError. Drop the stale link so it can be recreated.
    if os.path.islink(filename) and not os.path.exists(filename):
        os.remove(filename)
    if os.path.exists(filename):
        return
    logging.info(f"Downloading {label} file {filename} from {repo_id}...")
    path = hf_hub_download(repo_id=repo_id, filename=filename)
    os.symlink(path, filename)


def ensure_weights(size, model_file, clip_file):
    """Ensure the decoder and CLIP weight files exist locally.

    Args:
        size: Key into `MODELS` selecting which repositories to download from.
        model_file: Decoder weight filename (also the local link name).
        clip_file: CLIP/projector weight filename (also the local link name).

    Returns:
        The `(model_file, clip_file)` pair, unchanged, once both are present.

    Raises:
        KeyError: If `size` is not a key of `MODELS`.
    """
    cfg = MODELS[size]
    _link_weight_file(cfg['model_repo'], model_file, "model")
    _link_weight_file(cfg['clip_repo'], clip_file, "CLIP")
    return model_file, clip_file

# Custom chat handler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    """Chat handler that overrides `CHAT_FORMAT` with a SmolVLM2-style prompt.

    The template renders each message as ``<Role>: <content><end_of_utterance>``
    and, when a generation prompt is requested, ends with ``Assistant:``.

    Message content may be either a plain string (as used for the system
    message) or a list of ``text`` / ``image_url`` parts. The list-only form
    of this template iterated a string character by character, matched no
    part type and therefore emitted nothing, silently dropping the system
    prompt. String content is now rendered verbatim.
    """

    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        # A user turn that starts with an image gets ':' with no space;
        # the string check keeps plain-text content away from the [0] lookup.
        "{% if message['role']=='user' and message['content'] is not string"
        " and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% if message['content'] is string %}"
        "{{ message['content'] }}"
        "{% else %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        # image_url may be given as a bare string or as a {'url': ...} mapping.
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )

# Load and cache LLM (only on dropdown change)

def update_llm(size, model_file, clip_file):
    """Load the requested model into `model_cache` unless it is already there.

    The cache is keyed on the (size, model_file, clip_file) triple; a matching
    request is a no-op. Otherwise the weights are ensured on disk, a chat
    handler and `Llama` instance are created, and the cache is overwritten.

    Always returns None so it can be wired to UI events with no outputs.
    """
    requested = (size, model_file, clip_file)
    loaded = (
        model_cache['size'],
        model_cache['model_file'],
        model_cache['clip_file'],
    )
    if loaded == requested:
        return None  # no UI output

    weights_path, projector_path = ensure_weights(size, model_file, clip_file)
    chat_handler = SmolVLM2ChatHandler(clip_model_path=projector_path, verbose=False)
    engine = Llama(
        model_path=weights_path,
        chat_handler=chat_handler,
        n_ctx=1024,
        verbose=False,
    )
    model_cache.update({
        'size': size,
        'model_file': weights_path,
        'clip_file': projector_path,
        'llm': engine,
    })
    return None  # no UI output

# Build weight filename lists

def get_weight_files(size):
    """Return the candidate weight filenames for a model size.

    Filenames are built from the prefixes and variant lists in `MODELS[size]`.
    Decoder files join prefix and variant with '.', CLIP files with '-'.

    Returns:
        A `(model_files, clip_files)` tuple of lists of filenames.
    """
    spec = MODELS[size]

    decoder_names = []
    for quant in spec['model_variants']:
        decoder_names.append("{}.{}.gguf".format(spec['model_prefix'], quant))

    projector_names = []
    for quant in spec['clip_variants']:
        projector_names.append("{}-{}.gguf".format(spec['clip_prefix'], quant))

    return decoder_names, projector_names

# Caption using cached llm

def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
    """Caption one webcam frame with the model held in `model_cache`.

    Args:
        frame: Image array from the webcam component, or None.
            Assumed to be in RGB channel order (Gradio's numpy image format).
        size: Selected model size (unused here; the cached model is used).
        model_file: Selected decoder filename (unused here).
        clip_file: Selected CLIP filename; only a fallback when the cache
            holds no CLIP path.
        interval_ms: Delay in milliseconds applied before captioning, which
            throttles how often captions are produced.
        sys_prompt: System prompt text.
        usr_prompt: User prompt text sent alongside the image.

    Returns:
        The stripped caption text, or '' when there is no frame, no loaded
        model, or no content in the response.
    """
    # Use pre-loaded model
    llm = model_cache['llm']
    # Nothing to caption without a frame, and nothing to caption with until
    # a model has been loaded; return an empty caption instead of crashing.
    if frame is None or llm is None:
        return ''

    time.sleep(interval_ms / 1000)
    img = cv2.resize(frame, (384, 384))
    # cv2.imwrite expects BGR channel order while the incoming frame is RGB;
    # without the conversion red and blue are swapped in the saved JPEG.
    if img.ndim == 3 and img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    elif img.ndim == 3 and img.shape[2] == 4:
        img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)

    # The temp file must stay alive until the completion call has read it,
    # so the request is made inside the `with` block.
    with tempfile.NamedTemporaryFile(suffix='.jpg') as tmp:
        cv2.imwrite(tmp.name, img)
        uri = Path(tmp.name).absolute().as_uri()
        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user",   "content": [
                {"type": "image_url", "image_url": uri},
                {"type": "text",      "text": usr_prompt}
            ]}
        ]
        # re-init handler
        # NOTE(review): presumably resets per-image state kept by the handler
        # - confirm. Use the CLIP file the cached model was loaded with so the
        # projector always matches the decoder, even if the UI selection has
        # moved on.
        handler_clip = model_cache['clip_file'] or clip_file
        llm.chat_handler.__init__(clip_model_path=handler_clip, verbose=False)
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=128,
            temperature=0.1,
            stop=["<end_of_utterance>"]
        )

    # Tolerate an empty `choices` list and a None `content`.
    choices = resp.get('choices') or [{}]
    content = choices[0].get('message', {}).get('content') or ''
    return content.strip()

# Gradio UI

def main():
    """Build the Gradio webcam-captioning UI, preload the default model, launch.

    The UI offers dropdowns for model size, decoder weights and CLIP weights,
    a throttle slider, system/user prompt boxes, a streaming webcam input and
    a read-only caption box fed by `caption_frame`.
    """
    logging.basicConfig(level=logging.INFO)
    default = '2.2B'
    mf, cf = get_weight_files(default)

    def on_size_change(size):
        """Repopulate both weight dropdowns for `size` and load its first variants.

        Each size has its own filenames and repositories, so the previous
        size's selections cannot be reused for the new size.
        """
        model_files, clip_files = get_weight_files(size)
        update_llm(size, model_files[0], clip_files[0])
        return (
            gr.Dropdown(choices=model_files, value=model_files[0]),
            gr.Dropdown(choices=clip_files, value=clip_files[0]),
        )

    def on_weights_change(size, model_file, clip_file):
        """Load the selected weights, ignoring combinations that do not belong to `size`."""
        model_files, clip_files = get_weight_files(size)
        if model_file in model_files and clip_file in clip_files:
            update_llm(size, model_file, clip_file)

    with gr.Blocks() as demo:
        gr.Markdown("## 🎥 Real-Time Camera Captioning")
        with gr.Row():
            size_dd   = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
            model_dd  = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
            clip_dd   = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')

        # A size change swaps the available files and loads the new model;
        # a decoder/CLIP change just (re)loads with the current selection.
        size_dd.change(fn=on_size_change, inputs=[size_dd], outputs=[model_dd, clip_dd])
        model_dd.change(fn=on_weights_change, inputs=[size_dd, model_dd, clip_dd], outputs=[])
        clip_dd.change(fn=on_weights_change, inputs=[size_dd, model_dd, clip_dd], outputs=[])

        # Initial load
        update_llm(default, mf[0], cf[0])

        interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
        sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
        usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
        cam   = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
        cap   = gr.Textbox(interactive=False, label='Caption')

        cam.stream(
            fn=caption_frame,
            inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
            outputs=[cap], time_limit=600
        )

    demo.launch()

# Build and launch the demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()