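"""Real-time webcam captioning with SmolVLM2 (GGUF) on CPU.

Streams webcam frames from a Gradio UI, captions each frame with a quantized
SmolVLM2-500M-Video-Instruct model via llama-cpp-python, and shows the result
in a textbox.
"""
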
import gradio as gr
import cv2
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 1) Inline definition of SmolVLM2ChatHandler with a custom chat template
class SmolVLM2ChatHandler(Llava15ChatHandler):
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
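
# For the messages built in caption_frame() below, this template renders roughly:
#   <|im_start|>System: <system instructions><end_of_utterance>
#   User:file:///tmp/<frame>.jpg
#   What is happening in this image?<end_of_utterance>
#   Assistant: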

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 2) Model & CLIP files β€” download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE  = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO  = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"

def ensure_models():
    if not os.path.exists(MODEL_FILE):
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
    if not os.path.exists(CLIP_FILE):
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
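
# hf_hub_download() returns the path of the cached file inside the local
# Hugging Face cache; the symlinks above expose the models under the short
# filenames used by Llama(model_path=...) and the chat handler.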

ensure_models()

def load_llm():
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    return Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )

llm = load_llm()
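# A single Llama instance is created at startup; SmolVLM2ChatHandler supplies
# the CLIP projector (mmproj file) used to embed image inputs.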

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 3) Captioning helper (stateless prompt)
def caption_frame(frame):
    # Gradio streams frames as RGB numpy arrays; convert to BGR for cv2.imwrite
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    # save the frame to a temporary file so it can be referenced by a file:// URI
    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
        cv2.imwrite(f.name, frame)
        uri = Path(f.name).absolute().as_uri()

        # build a single prompt string
        messages = [
            {
                # use content "parts" so the template above (which iterates the
                # content list) includes the system text in the rendered prompt
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Focus only on describing the key dramatic action or notable event occurring "
                            "in this image. Skip general context or scene-setting details unless they are "
                            "crucial to understanding the main action."
                        ),
                    },
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text",      "text": "What is happening in this image?"},
                ],
            },
        ]

        # stateless completion: fresh chat handler, reset context, clear KV cache
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                    # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()      # clear cached key/values (private llama-cpp-python API)
        resp = llm.create_chat_completion(
            messages = messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )

    # extract caption
    caption = (resp["choices"][0]["message"].get("content") or "").strip()
    return caption
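
# Quick standalone check (hypothetical usage; assumes a local image 'example.jpg'):
#   import cv2
#   frame = cv2.cvtColor(cv2.imread("example.jpg"), cv2.COLOR_BGR2RGB)
#   print(caption_frame(frame))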

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 4) Gradio UI (v5 streaming)
with gr.Blocks() as demo:
    gr.Markdown("## πŸŽ₯ Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # stream frames and captions
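    # stream_every=3 sends a frame to caption_frame roughly every 3 seconds;
    # time_limit ends each streaming session after 600 seconds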
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600
    )

if __name__ == "__main__":
    demo.launch()