import logging
import gradio as gr
import cv2
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler
# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
# ------------------------------------------------------------
# 1) Inline definition of SmolVLM2ChatHandler (Llava15ChatHandler subclass)
class SmolVLM2ChatHandler(Llava15ChatHandler):
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
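# For reference: given a single user turn holding an image URI and a text question,
# the template above renders roughly as follows (note that "<|im_start|>" is emitted
# once before the message loop, not once per message):
#
#   <|im_start|>User:file:///tmp/frame.jpg
#   What is happening in this image?<end_of_utterance>
#   Assistant: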
# ------------------------------------------------------------
# 2) Model & CLIP files - download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
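# Note: the quantized language-model weights and the vision projector (mmproj)
# live in two different GGUF repos, so each is fetched from its own source.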
def ensure_models():
    logging.debug("Ensuring model files are present...")
    if not os.path.exists(MODEL_FILE):
        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
    else:
        logging.debug(f"Model file {MODEL_FILE} already exists.")
    if not os.path.exists(CLIP_FILE):
        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
    else:
        logging.debug(f"CLIP file {CLIP_FILE} already exists.")
ensure_models()
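# Note: hf_hub_download() downloads into the local Hugging Face cache and returns
# the cached path; the symlinks above just expose the files under stable names in
# the working directory so Llama() can be pointed at plain filenames.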
# ------------------------------------------------------------
# 3) Load the model with the custom chat handler
def load_llm():
    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    llm = Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )
    logging.info("Llama model loaded successfully.")
    return llm
llm = load_llm()
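# The large n_ctx leaves room for the block of image-patch tokens the vision
# projector inserts for each frame, on top of the prompt and the generated caption.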
# ------------------------------------------------------------
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    logging.debug("caption_frame called.")
    # make a writable copy
    frame = frame.copy()
    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")

    # save the frame to a temporary file so it can be referenced as a file:// URI;
    # everything below stays inside the with-block so the file outlives the request
    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
        success = cv2.imwrite(f.name, frame)
        if not success:
            logging.error(f"Failed to write frame to {f.name}")
        else:
            logging.debug(f"Frame written to temp file: {f.name}")
        uri = Path(f.name).absolute().as_uri()
        logging.debug(f"Frame URI: {uri}")

        # build the chat messages (image first, then the question)
        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]
        logging.debug(f"Constructed messages: {messages}")

        # stateless completion call: fresh chat handler, token count reset, KV cache cleared
        logging.debug("Resetting LLM and clearing cache.")
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()  # clear any cached key/values
        logging.debug("Sending chat completion request...")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )
        logging.debug(f"LLM raw response: {resp}")

    # extract caption
    caption = (resp.get("choices", [])[0]["message"].get("content", "") or "").strip()
    logging.debug(f"Extracted caption: {caption}")
    return caption
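# Quick local sanity check (hypothetical image path; assumes the file exists):
#   frame = cv2.imread("sample.jpg")
#   if frame is not None:
#       print(caption_frame(frame))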
# ------------------------------------------------------------
# 5) Gradio UI (v5 streaming)
demo = gr.Blocks()
with demo:
    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # stream webcam frames into caption_frame: one frame every 3 s, for at most 600 s
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600,
    )
if __name__ == "__main__":
    logging.debug("Launching Gradio demo...")
    demo.launch()