# resize frame to 384x384 resolution (commit c1d8038, 6.14 kB)
import logging
import gradio as gr
import cv2
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler
from termcolor import cprint
# Root-logger setup: DEBUG verbosity, timestamped and levelled lines.
logging.basicConfig(
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG,
)
# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    """Chat handler that renders SmolVLM2's prompt format for llama.cpp.

    Subclasses the Llava 1.5 handler only to swap in SmolVLM2's chat
    template (CHAT_FORMAT below); image-embedding plumbing is inherited.
    """

    # Jinja2 chat template. Each message renders as
    # "<|im_start|>Role: <parts><end_of_utterance>\n". Text parts are
    # emitted verbatim; image_url parts are emitted as a bare URI followed
    # by a newline, accepting either a plain string or a {"url": ...}
    # mapping. A user message whose FIRST part is an image gets "Role:"
    # with no trailing space (the URI line follows immediately). When
    # add_generation_prompt is set, a trailing "Assistant:" cue is added.
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 2) Model & CLIP files β€” download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"


def _ensure_file(repo_id, filename, label):
    """Download *filename* from HF repo *repo_id* if it is not present locally.

    The download lands in the Hugging Face hub cache; a symlink named
    *filename* is created in the working directory so llama.cpp can open
    the file by its plain name.

    Args:
        repo_id: Hugging Face repository to download from.
        filename: GGUF file name (also used as the local symlink name).
        label: Human-readable tag ("Model" / "CLIP") used in log messages.
    """
    # NOTE(review): os.path.exists follows symlinks, so a dangling symlink
    # left by an evicted hub cache makes os.symlink below raise
    # FileExistsError β€” consider os.path.lexists + re-link if that matters.
    if os.path.exists(filename):
        logging.debug(f"{label} file {filename} already exists.")
        return
    logging.info(f"Downloading {label} file {filename} from {repo_id}...")
    path = hf_hub_download(repo_id=repo_id, filename=filename)
    os.symlink(path, filename)
    logging.info(f"Created symlink: {path} -> {filename}")


def ensure_models():
    """Ensure both GGUF artifacts (weights and CLIP projector) exist locally."""
    logging.debug("Ensuring model files are present...")
    _ensure_file(MODEL_REPO, MODEL_FILE, "Model")
    _ensure_file(CLIP_REPO, CLIP_FILE, "CLIP")


ensure_models()
# 3) Load the GGUF model with the SmolVLM2 chat handler attached
def load_llm():
    """Create the Llama instance used for captioning.

    Returns:
        A Llama object with an 8k-token context and the SmolVLM2 chat
        handler wired in for image-aware prompting.
    """
    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
    chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    model = Llama(
        model_path=MODEL_FILE,
        chat_handler=chat_handler,
        n_ctx=8192,
        verbose=False,
    )
    logging.info("Llama model loaded successfully.")
    return model


# Module-level singleton shared by every caption_frame call.
llm = load_llm()
# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    """Describe the dominant action in a single webcam frame.

    Args:
        frame: Image as a numpy array; Gradio webcam frames arrive as RGB.

    Returns:
        The model's caption string, or "" if the frame could not be
        written to disk for the model to read.
    """
    logging.debug("caption_frame called.")
    # Downscale to the resolution the vision projector expects.
    # cv2.resize always returns a fresh array, so no defensive copy needed.
    frame = cv2.resize(frame, (384, 384))
    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")
    # Gradio supplies RGB but cv2.imwrite interprets arrays as BGR; convert
    # so the saved JPEG does not have its red and blue channels swapped.
    bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    # Save the frame so it can be handed to the model as a file:// URI.
    # NOTE(review): writing to an open NamedTemporaryFile by name works on
    # POSIX but fails on Windows β€” confirm the deployment target.
    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
        if not cv2.imwrite(f.name, bgr):
            # Bail out rather than captioning an empty/garbage file.
            logging.error(f"Failed to write frame to {f.name}")
            return ""
        logging.debug(f"Frame written to temp file: {f.name}")
        uri = Path(f.name).absolute().as_uri()
        logging.debug(f"Frame URI: {uri}")

        # Build a stateless prompt for this single frame.
        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]
        logging.debug(f"Constructed messages: {messages}")

        # Reset all per-call state so each frame is captioned independently.
        # NOTE(review): rebuilding the chat handler reloads the CLIP weights
        # on every frame β€” expensive; verify whether reusing one handler
        # plus reset() alone would suffice.
        logging.debug("Resetting LLM and clearing cache.")
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()  # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()  # private llama_cpp API: clear cached key/values
        logging.debug("Sending chat completion request...")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )
        logging.debug(f"LLM raw response: {resp}")
        # Extract the caption text, tolerating a missing/None content field.
        caption = (resp.get("choices", [])[0]["message"].get("content", "") or "").strip()
        logging.debug(f"Extracted caption: {caption}")
        return caption
# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# 5) Gradio UI (v5 streaming)
with gr.Blocks() as demo:
    gr.Markdown("## πŸŽ₯ Real-Time Camera Captioning with SmolVLM2 (CPU)")
    webcam_feed = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_out = gr.Textbox(interactive=False, label="Caption")
    # Push one frame to caption_frame every 3 s, for at most 10 minutes.
    webcam_feed.stream(
        fn=caption_frame,
        inputs=[webcam_feed],
        outputs=[caption_out],
        stream_every=3,
        time_limit=600,
    )

if __name__ == "__main__":
    logging.debug("Launching Gradio demo...")
    demo.launch()