# --- Hugging Face Spaces page residue (not Python source; preserved as comments) ---
# Spaces: Building / Building
# File size: 4,624 Bytes
# Commit hashes: 970f416 dd0d47d 221e4b6 ca97f63 292fb3c
# (line-number gutter 1..127 from the page scrape omitted)
import gradio as gr
import cv2
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler
from termcolor import cprint
# βββββββββββββββββββββββββββββββββββββββββ
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    """Chat handler adapting llama.cpp's LLaVA-1.5 plumbing to SmolVLM2.

    Only the prompt template differs from the parent: SmolVLM2 expects
    ``Role: content`` turns terminated by ``<end_of_utterance>``, with
    image references emitted as bare URIs on their own line.
    """

    # Jinja2 template consumed by llama-cpp-python's chat formatter.
    # Per message: capitalized role, then each content part in order.
    # - text parts are emitted verbatim;
    # - image_url parts accept either a plain string URI or the OpenAI-style
    #   {"url": ...} mapping (both forms are produced elsewhere in this file);
    # - a user turn that LEADS with an image gets "Role:" without the trailing
    #   space, so the URI follows the colon directly.
    # "Assistant:" is appended when add_generation_prompt is set.
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
# βββββββββββββββββββββββββββββββββββββββββ
# 2) Model & CLIP files β download if missing
# Local filenames (symlinked into the CWD by ensure_models) and the
# Hugging Face repos they are downloaded from. The language model and the
# CLIP/mmproj vision projector come from two different GGUF repos.
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
def ensure_models():
    """Download the GGUF model and CLIP projector if not already present.

    Files are fetched into the Hugging Face cache and symlinked into the
    working directory so llama.cpp can open them by bare filename.
    """
    for repo_id, filename in ((MODEL_REPO, MODEL_FILE), (CLIP_REPO, CLIP_FILE)):
        # lexists() also detects a *broken* symlink, which os.path.exists()
        # reports as missing — that case used to make os.symlink() raise
        # FileExistsError on the next run.
        if not os.path.lexists(filename):
            path = hf_hub_download(repo_id=repo_id, filename=filename)
            os.symlink(path, filename)


ensure_models()
def load_llm():
    """Construct the llama.cpp model wired to the SmolVLM2 vision handler."""
    chat_handler = SmolVLM2ChatHandler(
        clip_model_path=CLIP_FILE,
        verbose=False,
    )
    model = Llama(
        model_path=MODEL_FILE,
        chat_handler=chat_handler,
        n_ctx=8192,  # large context to leave room for image tokens
        verbose=False,
    )
    return model


# Single module-level model instance shared by all caption requests.
llm = load_llm()
# βββββββββββββββββββββββββββββββββββββββββ
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    """Generate a single-shot caption for one BGR video frame.

    The frame is written to a temporary JPEG whose file:// URI is passed to
    the model; all llama.cpp state is reset first so each frame is captioned
    independently of previous ones.

    Returns the caption string ("" if the model returned no choices).
    """
    # Gradio may hand us a read-only view; make a writable copy for OpenCV.
    frame = frame.copy()

    # delete=False keeps the JPEG on disk after the handle is closed:
    # llama.cpp opens the image by URI during create_chat_completion, and on
    # Windows a still-open NamedTemporaryFile cannot be reopened by another
    # writer/reader at all. (The old `with` form could delete the file before
    # the model ever read it.)
    tmp = tempfile.NamedTemporaryFile(suffix=".jpg", delete=False)
    try:
        tmp.close()
        cv2.imwrite(tmp.name, frame)
        uri = Path(tmp.name).absolute().as_uri()

        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]

        # Stateless call: fresh handler, token counter reset, KV cache cleared,
        # so nothing leaks between frames.
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()
        llm._ctx.kv_cache_clear()
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )
    finally:
        # Always remove the temp image, even if inference raised.
        os.unlink(tmp.name)

    # Guard against an empty choices list instead of raising IndexError.
    choices = resp.get("choices", [])
    if not choices:
        return ""
    return (choices[0]["message"].get("content", "") or "").strip()
# βββββββββββββββββββββββββββββββββββββββββ
# 5) Gradio UI (v5 streaming)
# Gradio v5 UI: webcam stream in, rolling caption out.
with gr.Blocks() as demo:
    gr.Markdown("## π₯ Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # Re-caption a fresh frame every 3 seconds, for at most 10 minutes.
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600,
    )

if __name__ == "__main__":
    demo.launch()