import logging
import os
import tempfile
from pathlib import Path

import cv2
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format="[%(asctime)s] %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
# ─────────────────────────────────────────
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    # Jinja2 chat template: each message renders as
    # "<Role>: <image URIs and text><end_of_utterance>", and the
    # generation prompt ends with "Assistant:".
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role'] == 'user' and message['content'][0]['type'] == 'image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type'] == 'text' %}{{ content['text'] }}"
        "{% elif content['type'] == 'image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
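
# For reference, the template above renders a system + user exchange
# roughly as follows (illustrative file URI, not emitted by this code):
#
#   <|im_start|>System: <instructions><end_of_utterance>
#   User:file:///tmp/frame.jpg
#   What is happening in this image?<end_of_utterance>
#   Assistant: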
# ─────────────────────────────────────────
# 2) Model & CLIP files: download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
def ensure_models():
    """Download the GGUF model and CLIP projector if absent, then symlink
    them into the working directory."""
    logging.debug("Ensuring model files are present...")
    if not os.path.exists(MODEL_FILE):
        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
    else:
        logging.debug(f"Model file {MODEL_FILE} already exists.")
    if not os.path.exists(CLIP_FILE):
        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
    else:
        logging.debug(f"CLIP file {CLIP_FILE} already exists.")


ensure_models()
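
# Note: hf_hub_download caches files under the Hugging Face cache
# (~/.cache/huggingface by default); the symlinks above just expose them in
# the working directory. A sketch of redirecting the cache, if needed
# (cache_dir is a standard hf_hub_download parameter):
#
#   path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE,
#                          cache_dir="./models")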
# ─────────────────────────────────────────
# 3) Load the model once at startup
def load_llm():
    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    llm = Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )
    logging.info("Llama model loaded successfully.")
    return llm


llm = load_llm()
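
# A minimal smoke test (illustrative; "test.jpg" is a hypothetical local
# file) to confirm the model and projector load correctly:
#
#   resp = llm.create_chat_completion(messages=[{
#       "role": "user",
#       "content": [
#           {"type": "image_url", "image_url": Path("test.jpg").absolute().as_uri()},
#           {"type": "text", "text": "Describe this image."},
#       ],
#   }], max_tokens=64)
#   print(resp["choices"][0]["message"]["content"])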
# ─────────────────────────────────────────
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    logging.debug("caption_frame called.")
    # make a writable copy and resize to the model's input resolution
    frame = frame.copy()
    frame = cv2.resize(frame, (384, 384))
    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")

    # save the frame to a temporary file so it can be passed by file:// URI;
    # Gradio delivers RGB frames, while cv2.imwrite expects BGR
    with tempfile.NamedTemporaryFile(suffix=".jpg") as f:
        success = cv2.imwrite(f.name, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        if not success:
            logging.error(f"Failed to write frame to {f.name}")
        else:
            logging.debug(f"Frame written to temp file: {f.name}")
        uri = Path(f.name).absolute().as_uri()
        logging.debug(f"Frame URI: {uri}")

        # build a single prompt
        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]
        logging.debug(f"Constructed messages: {messages}")

        # stateless completion call: a fresh chat handler plus a reset keeps
        # each caption independent of previous frames
        logging.debug("Resetting LLM and clearing cache.")
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()                # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()  # clear any cached key/values (private API)
        logging.debug("Sending chat completion request...")
        # the completion must run inside the `with` block, while the temp
        # file still exists, because the handler loads the image by URI
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )
        logging.debug(f"LLM raw response: {resp}")

    # extract caption
    caption = (resp["choices"][0]["message"].get("content") or "").strip()
    logging.debug(f"Extracted caption: {caption}")
    return caption
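
# Stand-alone usage sketch (illustrative; "snapshot.jpg" is a hypothetical
# local image). cv2.imread returns BGR, so convert to RGB to match what the
# Gradio webcam feed provides:
#
#   bgr = cv2.imread("snapshot.jpg")
#   rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
#   print(caption_frame(rgb))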
# ─────────────────────────────────────────
# 5) Gradio UI (v5 streaming)
demo = gr.Blocks()
with demo:
    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")

    # stream frames and captions
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600,
    )
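
# stream_every=3 asks the client to send a frame every 3 seconds, which
# should leave enough headroom for CPU inference on each frame; time_limit
# caps a streaming session at 600 seconds.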

if __name__ == "__main__":
    logging.debug("Launching Gradio demo...")
    demo.launch()