Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Running

App Files Files Community

SmolVLM2-on-llama.cpp / app.py

Luigi

add debug to show which weight files we’re using this run

a459bee 14 days ago

raw

history blame

8.9 kB

	import time
	import logging
	import gradio as gr
	import cv2
	import os
	from pathlib import Path
	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	from llama_cpp.llama_chat_format import Llava15ChatHandler
	import base64
	import gc

	# ----------------------------------------
	# Model configurations: per-size prefixes and repos
	# Keyed by the size label offered in the UI. For every size:
	#   *_repo     - Hugging Face repo the file is downloaded from
	#   *_prefix   - filename stem the variant tag is appended to
	#   *_variants - quantization/precision tags available for that file
	MODELS = {
	    "256M": {
	        "model_repo": "mradermacher/SmolVLM2-256M-Video-Instruct-GGUF",
	        "clip_repo": "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF",
	        "model_prefix": "SmolVLM2-256M-Video-Instruct",
	        "clip_prefix": "mmproj-SmolVLM2-256M-Video-Instruct",
	        "model_variants": ["Q2_K", "Q8_0", "f16"],
	        "clip_variants": ["Q8_0", "f16"],
	    },
	    "500M": {
	        "model_repo": "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF",
	        "clip_repo": "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF",
	        "model_prefix": "SmolVLM2-500M-Video-Instruct",
	        "clip_prefix": "mmproj-SmolVLM2-500M-Video-Instruct",
	        "model_variants": ["Q2_K", "Q8_0", "f16"],
	        "clip_variants": ["Q8_0", "f16"],
	    },
	    "2.2B": {
	        "model_repo": "mradermacher/SmolVLM2-2.2B-Instruct-GGUF",
	        "clip_repo": "ggml-org/SmolVLM2-2.2B-Instruct-GGUF",
	        "model_prefix": "SmolVLM2-2.2B-Instruct",
	        "clip_prefix": "mmproj-SmolVLM2-2.2B-Instruct",
	        "model_variants": ["Q2_K", "Q4_K_M", "Q8_0", "f16"],
	        "clip_variants": ["Q8_0", "f16"],
	    },
	}

	# ----------------------------------------
	# Cache for loaded model instance
	# Single-slot cache: every field starts out as None, meaning nothing is
	# loaded yet.
	model_cache = dict.fromkeys(('size', 'model_file', 'clip_file', 'llm'))

	# Helper to download & symlink weights

	def ensure_weights(size, model_file, clip_file):
	    """Make sure both weight files for *size* are reachable locally.

	    Each file that is not already present is downloaded from the repo
	    configured in ``MODELS[size]`` and symlinked at the given filename.

	    Args:
	        size: Key into ``MODELS`` selecting the repos to download from.
	        model_file: Filename of the decoder weights.
	        clip_file: Filename of the CLIP / projector weights.

	    Returns:
	        The ``(model_file, clip_file)`` pair, unchanged.

	    Raises:
	        KeyError: If *size* is not a key of ``MODELS``.
	    """
	    cfg = MODELS[size]
	    _link_weight(cfg['model_repo'], model_file, "model")
	    _link_weight(cfg['clip_repo'], clip_file, "CLIP")
	    return model_file, clip_file


	def _link_weight(repo_id, filename, label):
	    """Download *filename* from *repo_id* and symlink it unless it already resolves."""
	    if os.path.exists(filename):
	        return
	    if os.path.islink(filename):
	        # exists() follows symlinks, so a link that reaches this point is
	        # dangling (its target is gone). Remove it, otherwise os.symlink()
	        # below would fail with FileExistsError.
	        os.remove(filename)
	    logging.info(f"Downloading {label} file {filename} from {repo_id}...")
	    path = hf_hub_download(repo_id=repo_id, filename=filename)
	    os.symlink(path, filename)

	# Custom chat handler
	class SmolVLM2ChatHandler(Llava15ChatHandler):
	    """Llava15ChatHandler variant carrying a SmolVLM2-style prompt template.

	    Only ``CHAT_FORMAT`` is overridden. The template writes each message as
	    ``<Role>:`` followed by its content and ``<end_of_utterance>``, and ends
	    with ``Assistant:`` when a generation prompt is requested.
	    """

	    # Jinja template. Notes on the non-obvious parts:
	    # - The pipes are plain "|" characters: a backslash-escaped pipe would put
	    #   literal backslashes in the prompt and break the `capitalize` filter.
	    # - A user message that starts with an image gets "Role:" with no
	    #   trailing space; every other message gets "Role: ".
	    # - Content may be a plain string (e.g. a system prompt) or a list of
	    #   typed parts. Looping over a string would visit single characters,
	    #   none of which has a 'type', so strings are emitted directly.
	    CHAT_FORMAT = (
	        "<|im_start|>"
	        "{% for message in messages %}"
	        "{{ message['role'] | capitalize }}"
	        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
	        "{% else %}: "
	        "{% endif %}"
	        "{% if message['content'] is string %}{{ message['content'] }}"
	        "{% else %}"
	        "{% for content in message['content'] %}"
	        "{% if content['type']=='text' %}{{ content['text'] }}"
	        "{% elif content['type']=='image_url' %}"
	        "{% if content['image_url'] is string %}"
	        "{{ content['image_url'] }}\n"
	        "{% elif content['image_url'] is mapping %}"
	        "{{ content['image_url']['url'] }}\n"
	        "{% endif %}"
	        "{% endif %}"
	        "{% endfor %}"
	        "{% endif %}"
	        "<end_of_utterance>\n"
	        "{% endfor %}"
	        "{% if add_generation_prompt %}Assistant:{% endif %}"
	    )

	# Load and cache LLM (only on dropdown change)

	def update_llm(size, model_file, clip_file):
	    """Load the requested model into ``model_cache`` unless it is already there.

	    When the cached (size, model_file, clip_file) triple differs from the
	    request, the weights are fetched via ``ensure_weights``, a new chat
	    handler and ``Llama`` instance are created, and the cache is overwritten.

	    Args:
	        size: Key into ``MODELS``.
	        model_file: Filename of the decoder weights.
	        clip_file: Filename of the CLIP / projector weights.

	    Returns:
	        Always ``None``; the function produces no UI output.
	    """
	    requested = (size, model_file, clip_file)
	    loaded = (model_cache['size'], model_cache['model_file'], model_cache['clip_file'])
	    if loaded != requested:
	        mf, cf = ensure_weights(size, model_file, clip_file)
	        handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=False)
	        # os.cpu_count() returns None when the count cannot be determined;
	        # fall back to the minimum of 2 instead of letting max() raise.
	        n_threads = max(2, os.cpu_count() or 2)
	        llm = Llama(model_path=mf, chat_handler=handler, n_ctx=1024,
	                    verbose=False, n_threads=n_threads)
	        model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'llm': llm})
	    return None  # no UI output

	# Build weight filename lists

	def get_weight_files(size):
	    """Return the decoder and CLIP weight filenames configured for *size*.

	    Decoder names are ``<model_prefix>.<variant>.gguf`` and CLIP names are
	    ``<clip_prefix>-<variant>.gguf``, in the order the variants are listed
	    in ``MODELS[size]``.

	    Returns:
	        A ``(model_files, clip_files)`` tuple of two lists of strings.
	    """
	    entry = MODELS[size]

	    decoder_names = []
	    for variant in entry['model_variants']:
	        decoder_names.append(f"{entry['model_prefix']}.{variant}.gguf")

	    projector_names = []
	    for variant in entry['clip_variants']:
	        projector_names.append(f"{entry['clip_prefix']}-{variant}.gguf")

	    return decoder_names, projector_names

	# Caption using cached llm with real-time debug logs

	def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
	    """Caption one frame with the cached model and return ``(caption, debug_log)``.

	    The frame is resized to 384x384, JPEG-encoded into a base64 data URI and
	    sent with the two prompts to ``model_cache['llm']``. Every step appends a
	    timestamped line to the debug log. The call blocks for *interval_ms*
	    milliseconds before encoding.

	    Args:
	        frame: Image to caption; must expose ``.shape`` and ``.copy()``
	            (presumably a NumPy array from the webcam stream - confirm).
	            ``None`` is tolerated and yields an empty caption.
	        size: Selected model size; not used by this function.
	        model_file: Decoder weight filename, reported in the debug log.
	        clip_file: CLIP weight filename used to rebuild the chat handler.
	        interval_ms: Delay in milliseconds applied before encoding.
	        sys_prompt: Text of the system message.
	        usr_prompt: Text of the user message that accompanies the image.

	    Returns:
	        A ``(caption, debug_log)`` tuple of strings. The caption is empty
	        when no frame was supplied or JPEG encoding failed.
	    """
	    debug_msgs = []

	    def log(message):
	        # Stamp each entry at the moment it is recorded.
	        debug_msgs.append(f"[{time.strftime('%H:%M:%S')}] {message}")

	    # Without a frame there is nothing to caption; report it instead of
	    # failing on frame.shape.
	    if frame is None:
	        log("No frame received; skipping caption")
	        return "", "\n".join(debug_msgs)

	    log(f"Received frame shape: {frame.shape}")

	    # show which weight files we're using this run
	    log(f"Using model weights: {model_file}")
	    log(f"Using CLIP weights: {clip_file}")

	    t_resize = time.time()
	    img = cv2.resize(frame.copy(), (384, 384))
	    elapsed = (time.time() - t_resize) * 1000
	    log(f"Resized to 384x384 in {elapsed:.1f} ms")

	    log(f"Sleeping for {interval_ms} ms")
	    time.sleep(interval_ms / 1000)

	    t_enc = time.time()
	    success, jpeg = cv2.imencode('.jpg', img)
	    elapsed = (time.time() - t_enc) * 1000
	    if not success:
	        # Do not send an invalid image to the model.
	        log(f"JPEG encode failed after {elapsed:.1f} ms; skipping caption")
	        return "", "\n".join(debug_msgs)
	    log(f"JPEG encode: success={success}, bytes={len(jpeg)} in {elapsed:.1f} ms")

	    uri = 'data:image/jpeg;base64,' + base64.b64encode(jpeg.tobytes()).decode()
	    messages = [
	        {"role": "system", "content": sys_prompt},
	        {"role": "user", "content": [
	            {"type": "image_url", "image_url": uri},
	            {"type": "text", "text": usr_prompt},
	        ]},
	    ]

	    log(f"Sending prompt of length {len(usr_prompt)} to LLM")
	    # re-init handler for image
	    model_cache['llm'].chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_file, verbose=False)
	    log("Reinitialized chat handler")
	    log(f"CPU count = {os.cpu_count()}")

	    t_start = time.time()
	    resp = model_cache['llm'].create_chat_completion(
	        messages=messages,
	        max_tokens=128,
	        temperature=0.1,
	        stop=["<end_of_utterance>"],
	    )
	    elapsed = (time.time() - t_start) * 1000
	    log(f"LLM response in {elapsed:.1f} ms")

	    # Tolerate a missing/empty 'choices' list and a None content value.
	    choices = resp.get('choices') or [{}]
	    content = (choices[0].get('message', {}).get('content') or '').strip()
	    log(f"Caption length: {len(content)} chars")

	    gc.collect()
	    log("Garbage collected")

	    return content, "\n".join(debug_msgs)

	# Gradio UI

	def main():
	    """Build the Gradio UI, preload the default model and launch the app.

	    The page holds three dropdowns (model size, decoder weights, CLIP
	    weights), an interval slider, system/user prompt boxes, a streaming
	    webcam image, and read-only caption and debug-log boxes. Dropdown
	    changes reload the model through ``update_llm``; webcam frames are
	    routed to ``caption_frame``.
	    """
	    logging.basicConfig(level=logging.INFO)
	    default_size = '2.2B'
	    decoder_choices, clip_choices = get_weight_files(default_size)

	    with gr.Blocks() as demo:
	        gr.Markdown("## 🎥 Real-Time Camera Captioning with Debug Logs")
	        with gr.Row():
	            size_dd = gr.Dropdown(list(MODELS), value=default_size, label='Model Size')
	            model_dd = gr.Dropdown(decoder_choices, value=decoder_choices[0], label='Decoder Weights')
	            clip_dd = gr.Dropdown(clip_choices, value=clip_choices[0], label='CLIP Weights')

	        def on_size_change(sz):
	            # A new size means new filename lists: refresh both weight
	            # dropdowns and preload the model with the first entry of each.
	            decoders, clips = get_weight_files(sz)
	            decoder_update = gr.update(choices=decoders, value=decoders[0])
	            clip_update = gr.update(choices=clips, value=clips[0])
	            update_llm(sz, decoders[0], clips[0])
	            return decoder_update, clip_update

	        size_dd.change(fn=on_size_change, inputs=[size_dd], outputs=[model_dd, clip_dd])
	        # Picking different decoder or CLIP weights reloads the model.
	        model_dd.change(
	            lambda sz, decoder, clip: update_llm(sz, decoder, clip),
	            inputs=[size_dd, model_dd, clip_dd],
	            outputs=[],
	        )
	        clip_dd.change(
	            lambda sz, decoder, clip: update_llm(sz, decoder, clip),
	            inputs=[size_dd, model_dd, clip_dd],
	            outputs=[],
	        )
	        # Load the default model before the first frame arrives.
	        update_llm(default_size, decoder_choices[0], clip_choices[0])

	        interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
	        sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
	        usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
	        cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
	        cap = gr.Textbox(interactive=False, label='Caption')
	        log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')

	        cam.stream(
	            fn=caption_frame,
	            inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
	            outputs=[cap, log_box],
	            time_limit=600,
	        )

	    demo.launch()

	# Start the app only when this file is run as a script, not on import.
	if __name__ == "__main__":
	    main()