Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Running

App Files Files Community

SmolVLM2-on-llama.cpp / app.py

Luigi

default to smallest model with q8 precision, enable verbose mode, disable reset clip

65efb90 3 months ago

raw

history blame

10.2 kB

	import time
	import logging
	import gradio as gr
	import cv2
	import os
	from pathlib import Path
	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	from llama_cpp.llama_chat_format import Llava15ChatHandler
	import base64
	import gc
	import io
	from contextlib import redirect_stdout, redirect_stderr
	import sys, llama_cpp

	# ----------------------------------------
	# Model registry, keyed by the size label shown in the UI.  Each entry holds
	# the Hugging Face repo of the decoder weights, the repo of the CLIP/mmproj
	# weights, the filename prefixes used to build GGUF filenames, and the
	# quantisation variants on offer.  All three sizes follow the same naming
	# scheme, so the entries are expanded from (size, stem, decoder variants).
	MODELS = {
	    size: {
	        "model_repo": f"mradermacher/{stem}-GGUF",
	        "clip_repo": f"ggml-org/{stem}-GGUF",
	        "model_prefix": stem,
	        "clip_prefix": f"mmproj-{stem}",
	        "model_variants": list(decoder_variants),
	        "clip_variants": ["Q8_0", "f16"],
	    }
	    for size, stem, decoder_variants in (
	        ("256M", "SmolVLM2-256M-Video-Instruct", ("Q8_0", "Q2_K", "f16")),
	        ("500M", "SmolVLM2-500M-Video-Instruct", ("Q8_0", "Q2_K", "f16")),
	        ("2.2B", "SmolVLM2-2.2B-Instruct", ("Q8_0", "Q2_K", "Q4_K_M", "f16")),
	    )
	}

	# ----------------------------------------
	# Single-slot cache for the currently loaded model.  The first four keys
	# record the settings the cached instance was built with and 'llm' holds the
	# instance itself.  Every slot starts as None, i.e. nothing is loaded yet.
	model_cache = dict.fromkeys(('size', 'model_file', 'clip_file', 'verbose', 'llm'))

	# Helper to download & symlink weights

	def _link_weight(repo_id, filename, label):
	    """Make ``filename`` resolvable from the working directory.

	    Nothing happens when ``filename`` already resolves to something.
	    Otherwise the file is fetched with ``hf_hub_download`` and a symlink
	    named ``filename`` is created pointing at the returned path.

	    Args:
	        repo_id: Hugging Face repository to download from.
	        filename: Name of the file in the repo; also the local link name.
	        label: Word used in the log message ("model" or "CLIP").
	    """
	    if os.path.exists(filename):
	        return
	    logging.info("Downloading %s file %s from %s...", label, filename, repo_id)
	    cached_path = hf_hub_download(repo_id=repo_id, filename=filename)
	    # os.path.exists() follows symlinks, so it is also False for a link whose
	    # target has disappeared.  Such a leftover must be removed first, or
	    # os.symlink() fails with FileExistsError.
	    if os.path.lexists(filename):
	        os.remove(filename)
	    os.symlink(cached_path, filename)


	def ensure_weights(size, model_file, clip_file):
	    """Ensure the decoder and CLIP weight files for ``size`` exist locally.

	    Args:
	        size: Key into ``MODELS`` selecting the repos to download from.
	        model_file: Decoder GGUF filename (repo filename and local link name).
	        clip_file: CLIP/mmproj GGUF filename (repo filename and local link name).

	    Returns:
	        The ``(model_file, clip_file)`` pair, unchanged.

	    Raises:
	        KeyError: If ``size`` is not a key of ``MODELS``.
	    """
	    cfg = MODELS[size]
	    _link_weight(cfg['model_repo'], model_file, "model")
	    _link_weight(cfg['clip_repo'], clip_file, "CLIP")
	    return model_file, clip_file

	# Custom chat handler
	class SmolVLM2ChatHandler(Llava15ChatHandler):
	    """Chat handler that swaps in a SmolVLM2-style prompt template.

	    Only ``CHAT_FORMAT`` is overridden; everything else comes from
	    ``Llava15ChatHandler``.  The template is written in Jinja2 syntax and
	    produces, for every message, ``<Role>:`` followed by the message body
	    and ``<end_of_utterance>``; an ``Assistant:`` cue is appended when
	    ``add_generation_prompt`` is set.

	    Message content may be either a list of typed parts (``text`` /
	    ``image_url``) or a plain string.  ``caption_frame`` sends the system
	    prompt as a plain string, so that form has to be rendered explicitly:
	    the per-part loop only emits items that carry a ``type`` field, which
	    the characters of a string do not.
	    """

	    # NOTE: the pipes must be bare.  "\|" is not a Python escape, so the
	    # backslash would stay in the string and corrupt both the <|im_start|>
	    # marker and the "| capitalize" filter expression.
	    CHAT_FORMAT = (
	        "<|im_start|>"
	        "{% for message in messages %}"
	        "{{ message['role'] | capitalize }}"
	        # A user turn that starts with an image gets ":" without the space.
	        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
	        "{% else %}: "
	        "{% endif %}"
	        # Plain-string content (e.g. the system prompt) is emitted verbatim.
	        "{% if message['content'] is string %}"
	        "{{ message['content'] }}"
	        "{% else %}"
	        "{% for content in message['content'] %}"
	        "{% if content['type']=='text' %}{{ content['text'] }}"
	        "{% elif content['type']=='image_url' %}"
	        # image_url may be given as a bare string or as {'url': ...}.
	        "{% if content['image_url'] is string %}"
	        "{{ content['image_url'] }}\n"
	        "{% elif content['image_url'] is mapping %}"
	        "{{ content['image_url']['url'] }}\n"
	        "{% endif %}"
	        "{% endif %}"
	        "{% endfor %}"
	        "{% endif %}"
	        "<end_of_utterance>\n"
	        "{% endfor %}"
	        "{% if add_generation_prompt %}Assistant:{% endif %}"
	    )

	# Load and cache LLM (only on dropdown or verbose change)
	def update_llm(size, model_file, clip_file, verbose_mode):
	    """Load the requested model into ``model_cache`` unless it is already there.

	    The cache is keyed on ``(size, model_file, clip_file, verbose_mode)``.
	    When the request matches the cached settings nothing is done; otherwise
	    the weights are made available via ``ensure_weights``, a fresh ``Llama``
	    instance with a ``SmolVLM2ChatHandler`` is built, and the cache is
	    overwritten with the new settings and instance.

	    Args:
	        size: Key into ``MODELS``.
	        model_file: Decoder GGUF filename.
	        clip_file: CLIP/mmproj GGUF filename.
	        verbose_mode: Passed as ``verbose`` to both the handler and ``Llama``.

	    Returns:
	        None, so the function can be wired to UI events that have no outputs.
	    """
	    requested = (size, model_file, clip_file, verbose_mode)
	    loaded = (
	        model_cache['size'],
	        model_cache['model_file'],
	        model_cache['clip_file'],
	        model_cache['verbose'],
	    )
	    if loaded == requested:
	        return None

	    mf, cf = ensure_weights(size, model_file, clip_file)
	    handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=verbose_mode)
	    llm = Llama(
	        model_path=mf,
	        chat_handler=handler,
	        n_ctx=8192,
	        verbose=verbose_mode,
	        # os.cpu_count() returns None when the count cannot be determined;
	        # max(2, None) would raise TypeError, so fall back to 2 threads.
	        n_threads=max(2, os.cpu_count() or 2),
	    )
	    model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'verbose': verbose_mode, 'llm': llm})
	    return None

	# Build weight filename lists
	def get_weight_files(size):
	    """Return the selectable GGUF filenames for the model ``size``.

	    Decoder files are named ``<model_prefix>.<variant>.gguf`` and CLIP files
	    ``<clip_prefix>-<variant>.gguf``, one per variant listed in ``MODELS``.

	    Args:
	        size: Key into ``MODELS``.

	    Returns:
	        A ``(model_files, clip_files)`` tuple of filename lists, in the same
	        order as the configured variants.
	    """
	    entry = MODELS[size]
	    decoder_names = [
	        f"{entry['model_prefix']}.{variant}.gguf"
	        for variant in entry['model_variants']
	    ]
	    clip_names = [
	        f"{entry['clip_prefix']}-{variant}.gguf"
	        for variant in entry['clip_variants']
	    ]
	    return decoder_names, clip_names

	# Caption using cached llm with real-time debug logs
	def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt, reset_clip, verbose_mode):
	    """Caption one frame with the cached model and collect a debug log.

	    The frame is resized to 384x384, JPEG-encoded at quality 50, embedded as
	    a base64 data URI and sent with the system and user prompts to the
	    ``Llama`` instance stored in ``model_cache['llm']``.  The function sleeps
	    for ``interval_ms`` before encoding, which throttles the caption rate.

	    Args:
	        frame: Image to caption; must offer ``.shape`` and ``.copy()``
	            (presumably a NumPy array coming from the webcam component --
	            confirm against the caller).  ``None`` is tolerated.
	        size: Selected model size (not used for inference here).
	        model_file: Decoder weights filename, reported in the log only.
	        clip_file: CLIP weights filename; reported in the log and used when
	            the chat handler is rebuilt.
	        interval_ms: Delay in milliseconds applied before encoding.
	        sys_prompt: Content of the system message.
	        usr_prompt: Text part of the user message.
	        reset_clip: When true, replace the cached model's chat handler with
	            a new ``SmolVLM2ChatHandler`` before inference.
	        verbose_mode: Reported in the log and passed to a rebuilt handler.

	    Returns:
	        ``(caption, debug_log)``: the stripped caption text and the log
	        lines joined with newlines.  The caption is ``""`` when the frame is
	        missing, JPEG encoding fails, or no model has been loaded.
	    """
	    debug_msgs = []

	    def log(message):
	        # Every entry is prefixed with the wall-clock time it was recorded at.
	        debug_msgs.append(f"[{time.strftime('%H:%M:%S')}] {message}")

	    # Without an image there is nothing to caption; bail out instead of
	    # failing on frame.shape below.
	    if frame is None:
	        log("No frame received; skipping")
	        return "", "\n".join(debug_msgs)

	    log(f"Verbose mode: {verbose_mode}")
	    log(f"llama_cpp version: {llama_cpp.__version__}")
	    log(f"Python version: {sys.version.split()[0]}")
	    log(f"Received frame shape: {frame.shape}")
	    log(f"Using model weights: {model_file}")
	    log(f"Using CLIP weights: {clip_file}")

	    llm = model_cache['llm']
	    if llm is None:
	        log("No model loaded; skipping frame")
	        return "", "\n".join(debug_msgs)

	    t_resize = time.time()
	    img = cv2.resize(frame.copy(), (384, 384))
	    elapsed = (time.time() - t_resize) * 1000
	    log(f"Resized to 384x384 in {elapsed:.1f} ms")

	    log(f"Sleeping for {interval_ms} ms")
	    time.sleep(interval_ms / 1000)

	    t_enc = time.time()
	    # cv2.imencode takes encoder options as a flat [flag, value, ...] list;
	    # it has no `quality` keyword argument.
	    success, jpeg = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), 50])
	    elapsed = (time.time() - t_enc) * 1000
	    if not success:
	        log(f"JPEG encode failed after {elapsed:.1f} ms; skipping frame")
	        return "", "\n".join(debug_msgs)
	    log(f"JPEG encode: success={success}, bytes={len(jpeg)} in {elapsed:.1f} ms")

	    uri = 'data:image/jpeg;base64,' + base64.b64encode(jpeg.tobytes()).decode()
	    messages = [
	        {"role": "system", "content": sys_prompt},
	        {"role": "user", "content": [
	            {"type": "image_url", "image_url": uri},
	            {"type": "text", "text": usr_prompt},
	        ]},
	    ]

	    log(f"Sending prompt of length {len(usr_prompt)} to LLM")
	    if reset_clip:
	        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=verbose_mode)
	        log("Reinitialized chat handler")

	    log(f"CPU count = {os.cpu_count()}")

	    t_start = time.time()
	    # Capture whatever is written to Python's stdout/stderr during the call
	    # so that it can be shown in the debug log.
	    buf = io.StringIO()
	    with redirect_stdout(buf), redirect_stderr(buf):
	        resp = llm.create_chat_completion(
	            messages=messages,
	            max_tokens=128,
	            temperature=0.1,
	            stop=["<end_of_utterance>"],
	        )
	    for line in buf.getvalue().splitlines():
	        log(line)

	    elapsed = (time.time() - t_start) * 1000
	    log(f"LLM response in {elapsed:.1f} ms")

	    # Tolerate a missing/empty 'choices' list and a None 'content' rather
	    # than raising IndexError/AttributeError on an unusual response.
	    choices = resp.get('choices') or [{}]
	    message = choices[0].get('message') or {}
	    content = (message.get('content') or '').strip()
	    log(f"Caption length: {len(content)} chars")

	    gc.collect()
	    log("Garbage collected")

	    return content, "\n".join(debug_msgs)

	# Gradio UI
	def main():
	    """Build the Gradio interface, preload the default model and launch it.

	    The page has a row of model controls (size, decoder weights, CLIP
	    weights, verbose flag), followed by the capture settings, the webcam
	    feed and two read-only boxes for the caption and the debug log.  Any
	    change to a model control goes through ``update_llm``; streamed webcam
	    frames go through ``caption_frame``.
	    """
	    logging.basicConfig(level=logging.INFO)
	    start_size = '256M'
	    start_verbose = True
	    decoder_files, clip_files = get_weight_files(start_size)

	    with gr.Blocks() as demo:
	        gr.Markdown("## 🎥 Real-Time Camera Captioning with Debug Logs")
	        with gr.Row():
	            size_dd = gr.Dropdown(list(MODELS.keys()), value=start_size, label='Model Size')
	            model_dd = gr.Dropdown(decoder_files, value=decoder_files[0], label='Decoder Weights')
	            clip_dd = gr.Dropdown(clip_files, value=clip_files[0], label='CLIP Weights')
	            verbose_cb = gr.Checkbox(value=start_verbose, label='Verbose Mode')

	        def on_size_change(sz, verbose):
	            # A new size resets both weight dropdowns to their first entry
	            # and loads that combination.
	            decoders, clips = get_weight_files(sz)
	            first_decoder, first_clip = decoders[0], clips[0]
	            update_llm(sz, first_decoder, first_clip, verbose)
	            return (
	                gr.update(choices=decoders, value=first_decoder),
	                gr.update(choices=clips, value=first_clip),
	            )

	        size_dd.change(
	            fn=on_size_change,
	            inputs=[size_dd, verbose_cb],
	            outputs=[model_dd, clip_dd],
	        )

	        # The remaining model controls share one behaviour: reload with the
	        # current selection of all four controls.
	        for control in (model_dd, clip_dd, verbose_cb):
	            control.change(
	                fn=lambda sz, mf, cf, verbose: update_llm(sz, mf, cf, verbose),
	                inputs=[size_dd, model_dd, clip_dd, verbose_cb],
	                outputs=[],
	            )

	        # Load the default model while the UI is being built.
	        update_llm(start_size, decoder_files[0], clip_files[0], start_verbose)

	        interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
	        sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
	        usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
	        reset_clip = gr.Checkbox(value=False, label="Reset CLIP handler each frame")
	        cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
	        cap = gr.Textbox(interactive=False, label='Caption')
	        log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')

	        cam.stream(
	            fn=caption_frame,
	            inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip, verbose_cb],
	            outputs=[cap, log_box],
	            time_limit=600,
	        )

	    demo.launch()

	# Start the app only when this file is run as a script, not when imported.
	if __name__ == "__main__":
	    main()