import logging
import os
import tempfile
from pathlib import Path

import cv2
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format="[%(asctime)s] %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# ─────────────────────────────────────────
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    # Jinja2 chat template: each message renders as
    # "<Role>: <image URIs and text><end_of_utterance>", and the
    # generation prompt ends with "Assistant:".
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role'] == 'user' and message['content'][0]['type'] == 'image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type'] == 'text' %}{{ content['text'] }}"
        "{% elif content['type'] == 'image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
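
# For reference, the template above renders a system + user exchange
# roughly as follows (illustrative file URI, not emitted by this code):
#
#   <|im_start|>System: <instructions><end_of_utterance>
#   User:file:///tmp/frame.jpg
#   What is happening in this image?<end_of_utterance>
#   Assistant: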
# ─────────────────────────────────────────
# 2) Model & CLIP files: download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
def ensure_models():
    """Download the GGUF model and CLIP projector if absent, then symlink
    them into the working directory."""
    logging.debug("Ensuring model files are present...")
    if not os.path.exists(MODEL_FILE):
        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
    else:
        logging.debug(f"Model file {MODEL_FILE} already exists.")
    if not os.path.exists(CLIP_FILE):
        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
    else:
        logging.debug(f"CLIP file {CLIP_FILE} already exists.")


ensure_models()
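
# Note: hf_hub_download caches files under the Hugging Face cache
# (~/.cache/huggingface by default); the symlinks above just expose them in
# the working directory. A sketch of redirecting the cache, if needed
# (cache_dir is a standard hf_hub_download parameter):
#
#   path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE,
#                          cache_dir="./models")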
# ─────────────────────────────────────────
# 3) Load the model once at startup
def load_llm():
    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    llm = Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )
    logging.info("Llama model loaded successfully.")
    return llm


llm = load_llm()
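
# A minimal smoke test (illustrative; "test.jpg" is a hypothetical local
# file) to confirm the model and projector load correctly:
#
#   resp = llm.create_chat_completion(messages=[{
#       "role": "user",
#       "content": [
#           {"type": "image_url", "image_url": Path("test.jpg").absolute().as_uri()},
#           {"type": "text", "text": "Describe this image."},
#       ],
#   }], max_tokens=64)
#   print(resp["choices"][0]["message"]["content"])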
# ─────────────────────────────────────────
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    logging.debug("caption_frame called.")
    # make a writable copy and resize to the model's input resolution
    frame = frame.copy()
    frame = cv2.resize(frame, (384, 384))
    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")

    # save the frame to a temporary file so it can be passed by file:// URI;
    # Gradio delivers RGB frames, while cv2.imwrite expects BGR
    with tempfile.NamedTemporaryFile(suffix=".jpg") as f:
        success = cv2.imwrite(f.name, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        if not success:
            logging.error(f"Failed to write frame to {f.name}")
        else:
            logging.debug(f"Frame written to temp file: {f.name}")
        uri = Path(f.name).absolute().as_uri()
        logging.debug(f"Frame URI: {uri}")

        # build a single prompt
        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]
        logging.debug(f"Constructed messages: {messages}")

        # stateless completion call: a fresh chat handler plus a reset keeps
        # each caption independent of previous frames
        logging.debug("Resetting LLM and clearing cache.")
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()  # clear any cached key/values (private API)
        logging.debug("Sending chat completion request...")
        # the completion must run inside the `with` block, while the temp
        # file still exists, because the handler loads the image by URI
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )
        logging.debug(f"LLM raw response: {resp}")

    # extract caption
    caption = (resp["choices"][0]["message"].get("content") or "").strip()
    logging.debug(f"Extracted caption: {caption}")
    return caption
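
# Stand-alone usage sketch (illustrative; "snapshot.jpg" is a hypothetical
# local image). cv2.imread returns BGR, so convert to RGB to match what the
# Gradio webcam feed provides:
#
#   bgr = cv2.imread("snapshot.jpg")
#   rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
#   print(caption_frame(rgb))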
# ─────────────────────────────────────────
# 5) Gradio UI (v5 streaming)
demo = gr.Blocks()
with demo:
    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # stream frames and captions
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600,
    )
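
# stream_every=3 asks the client to send a frame every 3 seconds, which
# should leave enough headroom for CPU inference on each frame; time_limit
# caps a streaming session at 600 seconds.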

if __name__ == "__main__":
    logging.debug("Launching Gradio demo...")
    demo.launch()