import logging
import gradio as gr
import cv2
import tempfile
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# ─────────────────────────────────────────
# 1) Inline definition & registration of SmolVLM2ChatHandler
class SmolVLM2ChatHandler(Llava15ChatHandler):
    # Jinja2 chat template matching SmolVLM2's expected prompt layout
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "<end_of_utterance>\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )
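
# For reference, the template above renders a system + user(image) exchange
# roughly like this (an illustrative sketch traced from the Jinja logic, not
# official model documentation):
#   <|im_start|>System: <system text><end_of_utterance>
#   User:file:///tmp/frame.jpg
#   What is happening in this image?<end_of_utterance>
#   Assistant: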

# ─────────────────────────────────────────
# 2) Model & CLIP files - download if missing
MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"

def ensure_models():
    """Fetch the GGUF model and CLIP projector from the Hub if not present locally."""
    logging.debug("Ensuring model files are present...")
    if not os.path.exists(MODEL_FILE):
        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        os.symlink(path, MODEL_FILE)
        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
    else:
        logging.debug(f"Model file {MODEL_FILE} already exists.")
    if not os.path.exists(CLIP_FILE):
        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
        os.symlink(path, CLIP_FILE)
        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
    else:
        logging.debug(f"CLIP file {CLIP_FILE} already exists.")

ensure_models()

# ─────────────────────────────────────────
# 3) Load the model once at startup
def load_llm():
    """Instantiate the Llama model with the custom SmolVLM2 chat handler."""
    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
    llm = Llama(
        model_path=MODEL_FILE,
        chat_handler=handler,
        n_ctx=8192,
        verbose=False,
    )
    logging.info("Llama model loaded successfully.")
    return llm

llm = load_llm()

# ─────────────────────────────────────────
# 4) Captioning helper (stateless prompt)
def caption_frame(frame):
    """Caption a single webcam frame via a fresh, stateless chat completion."""
    logging.debug("caption_frame called.")
    # make a writable copy and resize to the model's input resolution
    frame = frame.copy()
    frame = cv2.resize(frame, (384, 384))
    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")
    # Gradio streams RGB arrays, but cv2.imwrite expects BGR channel order
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    # save the frame to a temporary file so it can be referenced by URI; the
    # completion call stays inside the `with` block so the file still exists
    # when the chat handler loads the image
    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
        success = cv2.imwrite(f.name, frame)
        if not success:
            logging.error(f"Failed to write frame to {f.name}")
        else:
            logging.debug(f"Frame written to temp file: {f.name}")
        uri = Path(f.name).absolute().as_uri()
        logging.debug(f"Frame URI: {uri}")
        # build the chat messages
        messages = [
            {
                "role": "system",
                "content": (
                    "Focus only on describing the key dramatic action or notable event occurring "
                    "in this image. Skip general context or scene-setting details unless they are "
                    "crucial to understanding the main action."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": uri},
                    {"type": "text", "text": "What is happening in this image?"},
                ],
            },
        ]
        logging.debug(f"Constructed messages: {messages}")
        # stateless completion call: fresh handler, reset context, cleared KV cache
        logging.debug("Resetting LLM and clearing cache.")
        llm.chat_handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
        llm.reset()  # reset n_tokens back to 0
        llm._ctx.kv_cache_clear()  # clear any cached key/values
        logging.debug("Sending chat completion request...")
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=256,
            temperature=0.1,
            stop=["<end_of_utterance>"],
        )
    logging.debug(f"LLM raw response: {resp}")
    # extract the caption text, tolerating an empty response
    caption = (resp.get("choices", [])[0]["message"].get("content", "") or "").strip()
    logging.debug(f"Extracted caption: {caption}")
    return caption
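
# Optional offline check (a hypothetical example, not part of the app): caption
# a single image from disk. Assumes a local "sample.jpg" exists; cv2.imread
# returns BGR while caption_frame expects the RGB arrays Gradio streams, hence
# the conversion below.
# if os.path.exists("sample.jpg"):
#     test_frame = cv2.cvtColor(cv2.imread("sample.jpg"), cv2.COLOR_BGR2RGB)
#     print(caption_frame(test_frame))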

# ─────────────────────────────────────────
# 5) Gradio UI (v5 streaming)
demo = gr.Blocks()
with demo:
    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
    caption_box = gr.Textbox(interactive=False, label="Caption")
    # stream webcam frames to the captioner every 3 seconds, for up to 10 minutes
    input_img.stream(
        fn=caption_frame,
        inputs=[input_img],
        outputs=[caption_box],
        stream_every=3,
        time_limit=600,
    )

if __name__ == "__main__":
    logging.debug("Launching Gradio demo...")
    demo.launch()