Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Running

App Files Files Community

SmolVLM2-on-llama.cpp / app.py

Luigi

increase interval default to 3s

c9c43a8 30 days ago

raw

history blame

6.59 kB

	import time
	import logging
	import gradio as gr
	import cv2
	import tempfile
	import os
	from pathlib import Path
	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	from llama_cpp.llama_chat_format import Llava15ChatHandler

	# ----------------------------------------
	# Model configurations: per-size prefixes and repos
	def _model_entry(name, model_variants):
	    """Build the configuration entry for one SmolVLM2 checkpoint.

	    Both repo ids and both filename prefixes are derived from the
	    checkpoint name; only the list of decoder quantisations differs
	    between sizes.
	    """
	    return {
	        "model_repo": f"mradermacher/{name}-GGUF",
	        "clip_repo": f"ggml-org/{name}-GGUF",
	        "model_prefix": name,
	        "clip_prefix": f"mmproj-{name}",
	        "model_variants": list(model_variants),
	        "clip_variants": ["Q8_0", "f16"],
	    }


	# Keyed by the size label shown in the UI; insertion order is the
	# order in which the sizes are offered.
	MODELS = {
	    "256M": _model_entry("SmolVLM2-256M-Video-Instruct", ("Q8_0", "f16")),
	    "500M": _model_entry("SmolVLM2-500M-Video-Instruct", ("Q8_0", "f16")),
	    "2.2B": _model_entry("SmolVLM2-2.2B-Instruct", ("Q4_K_M", "Q8_0", "f16")),
	}

	# ----------------------------------------
	# Cache for loaded model instance
	# Describes the single model currently held in memory. Every slot starts
	# out as None, meaning nothing has been loaded yet.
	model_cache = dict.fromkeys(('size', 'model_file', 'clip_file', 'llm'))

	# Helper to download & symlink weights

	def _link_weight(repo_id, filename, label):
	    """Expose one weight file in the working directory as a symlink.

	    Does nothing when `filename` already resolves to an existing file.
	    Otherwise the file is fetched with `hf_hub_download` and a symlink
	    named `filename` is pointed at the returned path.
	    """
	    if os.path.exists(filename):
	        return
	    logging.info(f"Downloading {label} file {filename} from {repo_id}...")
	    cached_path = hf_hub_download(repo_id=repo_id, filename=filename)
	    if os.path.islink(filename):
	        # os.path.exists() is False for a dangling symlink, yet the link
	        # itself still occupies the name; without removing it first
	        # os.symlink() raises FileExistsError.
	        os.remove(filename)
	    os.symlink(cached_path, filename)


	def ensure_weights(size, model_file, clip_file):
	    """Make sure the decoder and CLIP weight files exist locally.

	    Args:
	        size: Key into ``MODELS`` selecting which repos to download from.
	        model_file: Filename of the decoder weights; also the name of the
	            symlink created in the working directory.
	        clip_file: Filename of the CLIP/projector weights; likewise also
	            the local symlink name.

	    Returns:
	        The ``(model_file, clip_file)`` pair, unchanged.

	    Raises:
	        KeyError: If `size` is not a key of ``MODELS``.
	    """
	    cfg = MODELS[size]
	    _link_weight(cfg['model_repo'], model_file, "model")
	    _link_weight(cfg['clip_repo'], clip_file, "CLIP")
	    return model_file, clip_file

	# Custom chat handler
	class SmolVLM2ChatHandler(Llava15ChatHandler):
	    """Chat handler that renders conversations in the SmolVLM2 prompt layout.

	    Only the prompt template is overridden; everything else is inherited
	    from ``Llava15ChatHandler``.
	    """

	    # Jinja template for the prompt:
	    #   * the prompt opens with "<|im_start|>";
	    #   * each turn is "<Role>:" followed by its content and terminated by
	    #     "<end_of_utterance>\n";
	    #   * a user turn whose first content item is an image gets no space
	    #     after the colon, every other turn gets ": ";
	    #   * image entries emit their URL (given either as a plain string or
	    #     as a mapping with a 'url' key) followed by a newline.
	    # The pipes must be plain "|": a backslash-escaped pipe is not valid
	    # Jinja filter syntax and would also corrupt the "<|im_start|>" token.
	    CHAT_FORMAT = (
	        "<|im_start|>"
	        "{% for message in messages %}"
	        "{{ message['role'] | capitalize }}"
	        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
	        "{% else %}: "
	        "{% endif %}"
	        "{% for content in message['content'] %}"
	        "{% if content['type']=='text' %}{{ content['text'] }}"
	        "{% elif content['type']=='image_url' %}"
	        "{% if content['image_url'] is string %}"
	        "{{ content['image_url'] }}\n"
	        "{% elif content['image_url'] is mapping %}"
	        "{{ content['image_url']['url'] }}\n"
	        "{% endif %}"
	        "{% endif %}"
	        "{% endfor %}"
	        "<end_of_utterance>\n"
	        "{% endfor %}"
	        "{% if add_generation_prompt %}Assistant:{% endif %}"
	    )

	# Load and cache LLM (only on dropdown change)

	def update_llm(size, model_file, clip_file):
	    """Load the requested model into ``model_cache`` unless already loaded.

	    The cache is keyed on the ``(size, model_file, clip_file)`` triple. On
	    a mismatch the weights are ensured on disk, a fresh chat handler and
	    ``Llama`` instance are built, and the cache is overwritten.

	    Always returns None because the Gradio callbacks wired to this
	    function have no output components.
	    """
	    requested = (size, model_file, clip_file)
	    loaded = (
	        model_cache['size'],
	        model_cache['model_file'],
	        model_cache['clip_file'],
	    )
	    if loaded == requested:
	        return None  # no UI output

	    decoder_path, projector_path = ensure_weights(size, model_file, clip_file)
	    chat_handler = SmolVLM2ChatHandler(clip_model_path=projector_path, verbose=False)
	    fresh_llm = Llama(
	        model_path=decoder_path,
	        chat_handler=chat_handler,
	        n_ctx=1024,
	        verbose=False,
	    )
	    model_cache.update({
	        'size': size,
	        'model_file': decoder_path,
	        'clip_file': projector_path,
	        'llm': fresh_llm,
	    })
	    return None  # no UI output

	# Build weight filename lists

	def get_weight_files(size):
	    """Return the weight filenames offered for a model size.

	    Decoder files are named ``<model_prefix>.<variant>.gguf`` and CLIP
	    files ``<clip_prefix>-<variant>.gguf``, one per configured variant and
	    in the configured order.

	    Returns:
	        A ``(model_files, clip_files)`` pair of lists.
	    """
	    cfg = MODELS[size]

	    model_files = []
	    for variant in cfg['model_variants']:
	        model_files.append(f"{cfg['model_prefix']}.{variant}.gguf")

	    clip_files = []
	    for variant in cfg['clip_variants']:
	        clip_files.append(f"{cfg['clip_prefix']}-{variant}.gguf")

	    return model_files, clip_files

	# Caption using cached llm

	def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
	    """Caption a single webcam frame with the model held in ``model_cache``.

	    Args:
	        frame: Image array delivered by the webcam stream, or None when no
	            frame is available. Assumed to be RGB as produced by
	            ``gr.Image`` in its default mode (confirm if the image mode
	            is ever changed).
	        size: Selected model size (unused here; the cached model is used).
	        model_file: Selected decoder file (unused here, same reason).
	        clip_file: Selected CLIP file; only used as a fallback when the
	            cache has no CLIP path recorded.
	        interval_ms: Delay in milliseconds applied before each caption.
	        sys_prompt: Content of the system message.
	        usr_prompt: Text part of the user message.

	    Returns:
	        The stripped caption text. An empty string when `frame` is None,
	        and a short notice when no model has been loaded yet.
	    """
	    # The stream may fire before the webcam has produced an image.
	    if frame is None:
	        return ""

	    # Use pre-loaded model
	    llm = model_cache['llm']
	    if llm is None:
	        return "Model is not loaded yet."

	    # Blocking here is what spaces successive captions apart.
	    time.sleep(interval_ms / 1000)

	    img = cv2.resize(frame, (384, 384))
	    if img.ndim == 3 and img.shape[2] == 3:
	        # cv2.imwrite expects BGR channel order; without this swap the
	        # saved JPEG has red and blue exchanged.
	        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

	    # The temporary file has to stay alive until the completion call has
	    # run, because the image is passed to the model as a file URI.
	    with tempfile.NamedTemporaryFile(suffix='.jpg') as tmp:
	        cv2.imwrite(tmp.name, img)
	        uri = Path(tmp.name).absolute().as_uri()
	        messages = [
	            {"role": "system", "content": sys_prompt},
	            {"role": "user", "content": [
	                {"type": "image_url", "image_url": uri},
	                {"type": "text", "text": usr_prompt},
	            ]},
	        ]
	        # re-init handler, using the projector that belongs to the cached
	        # model: the dropdown value can already name a file that is still
	        # being downloaded.
	        # NOTE(review): the per-frame re-init looks like a workaround for
	        # handler state carried between calls - confirm it is still needed.
	        handler_clip = model_cache['clip_file'] or clip_file
	        llm.chat_handler.__init__(clip_model_path=handler_clip, verbose=False)
	        resp = llm.create_chat_completion(
	            messages=messages,
	            max_tokens=128,
	            temperature=0.1,
	            stop=["<end_of_utterance>"],
	        )

	    # Tolerate a missing or empty 'choices' list and a None content.
	    choices = resp.get('choices') or [{}]
	    content = choices[0].get('message', {}).get('content') or ''
	    return content.strip()

	# Gradio UI

	def main():
	    """Build the Gradio UI, preload the default model and launch the app."""
	    logging.basicConfig(level=logging.INFO)
	    default = '2.2B'
	    mf, cf = get_weight_files(default)

	    def on_size_change(size):
	        """Switch to another model size.

	        The weight dropdowns must be refreshed together with the size:
	        otherwise they keep listing the previous size's filenames, which
	        do not exist in the newly selected repos. The first variant of
	        each list is loaded immediately, so the change events triggered by
	        the refreshed dropdowns find the cache already up to date.
	        """
	        model_files, clip_files = get_weight_files(size)
	        update_llm(size, model_files[0], clip_files[0])
	        return (
	            gr.Dropdown(choices=model_files, value=model_files[0]),
	            gr.Dropdown(choices=clip_files, value=clip_files[0]),
	        )

	    with gr.Blocks() as demo:
	        gr.Markdown("## 🎥 Real-Time Camera Captioning")
	        with gr.Row():
	            size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
	            model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
	            clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')

	        # A size change repopulates the weight dropdowns and loads the model;
	        # a weight change only reloads the model.
	        size_dd.change(fn=on_size_change, inputs=[size_dd], outputs=[model_dd, clip_dd])
	        model_dd.change(fn=update_llm, inputs=[size_dd, model_dd, clip_dd], outputs=[])
	        clip_dd.change(fn=update_llm, inputs=[size_dd, model_dd, clip_dd], outputs=[])

	        # Initial load
	        update_llm(default, mf[0], cf[0])

	        interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
	        sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
	        usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
	        cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
	        cap = gr.Textbox(interactive=False, label='Caption')

	        cam.stream(
	            fn=caption_frame,
	            inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
	            outputs=[cap], time_limit=600
	        )

	    demo.launch()


	if __name__ == '__main__':
	    main()