import logging
import os
import tempfile
from pathlib import Path

import cv2
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# —————————————————————————————————————————
# 1) Inline definition of SmolVLM2ChatHandler (passed directly to Llama below,
#    so no separate registration is needed)
class SmolVLM2ChatHandler(Llava15ChatHandler):
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )


# —————————————————————————————————————————
# 2) Model & CLIP files — download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"


def ensure_models():
    logging.debug("Ensuring model files are present...")
    if not os.path.exists(MODEL_FILE):
        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
    else:
        logging.debug(f"Model file {MODEL_FILE} already exists.")

    if not os.path.exists(CLIP_FILE):
        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
    else:
        logging.debug(f"CLIP file {CLIP_FILE} already exists.")


ensure_models()


# —————————————————————————————————————————
# 3) Load the model once at startup
def load_llm():
    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    llm = Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )
    logging.info("Llama model loaded successfully.")
    return llm


llm = load_llm()
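# Optional sanity check that the model and chat template respond before wiring
# up the UI (a minimal sketch, kept commented out; the prompt text is
# illustrative only and not part of the app):
#
#   resp = llm.create_chat_completion(
#       messages=[{"role": "user", "content": [{"type": "text", "text": "Say hello."}]}],
#       max_tokens=16,
#   )
#   print(resp["choices"][0]["message"]["content"])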
# —————————————————————————————————————————
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    logging.debug("caption_frame called.")
    # make a writable copy
    frame = frame.copy()
    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")

    # Gradio streams RGB frames, but cv2.imwrite expects BGR, so convert first
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    # save frame to a temporary file so it can be passed as a file:// URI;
    # everything that reads the file must stay inside this block, because the
    # file is deleted when the context manager exits
    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
        success = cv2.imwrite(f.name, frame)
        if not success:
            logging.error(f"Failed to write frame to {f.name}")
            return ""
        logging.debug(f"Frame written to temp file: {f.name}")

        uri = Path(f.name).absolute().as_uri()
        logging.debug(f"Frame URI: {uri}")

        # build a single prompt string
        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]
        logging.debug(f"Constructed messages: {messages}")

        # stateless completion call: recreate the chat handler and clear the
        # KV cache so state from the previous frame cannot leak into this one
        # (note: llm._ctx.kv_cache_clear() is a private llama-cpp-python API)
        logging.debug("Resetting LLM and clearing cache.")
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()  # clear any cached key/values

        logging.debug("Sending chat completion request...")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],  # SmolVLM end-of-turn token; an empty stop string would halt generation immediately
        )
        logging.debug(f"LLM raw response: {resp}")

    # extract caption
    caption = (resp.get("choices", [])[0]["message"].get("content", "") or "").strip()
    logging.debug(f"Extracted caption: {caption}")
    return caption


# —————————————————————————————————————————
# 5) Gradio UI (v5 streaming)
demo = gr.Blocks()
with demo:
    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # stream frames and captions
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600,
    )

if __name__ == "__main__":
    logging.debug("Launching Gradio demo...")
    demo.launch()
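# Quick smoke test of caption_frame without the webcam UI (a minimal sketch,
# kept commented out; "test.jpg" is a hypothetical local image path, so
# substitute your own):
#
#   frame_bgr = cv2.imread("test.jpg")
#   frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)  # match Gradio's RGB frames
#   print(caption_frame(frame_rgb))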