Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Running

App Files Files Community

Luigi committed on Jun 16

Commit

5c50991

1 Parent(s): 76a0b57

1. add more models,

Browse files

2. user can define system and user prompt
3. user can decide update interval

Files changed (1) hide show

app.py +127 -124

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import logging
 import gradio as gr
 import cv2
@@ -7,17 +8,60 @@ from pathlib import Path
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 from llama_cpp.llama_chat_format import Llava15ChatHandler
-from termcolor import cprint
-# Configure logging
-logging.basicConfig(
-    level=logging.DEBUG,
-    format='[%(asctime)s] %(levelname)s: %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S'
-)
-# —————————————————————————————————————————
-# 1) Inline definition & registration of SmolVLM2ChatHandler
 class SmolVLM2ChatHandler(Llava15ChatHandler):
     CHAT_FORMAT = (
         "<|im_start|>"
@@ -41,127 +85,86 @@ class SmolVLM2ChatHandler(Llava15ChatHandler):
         "{% if add_generation_prompt %}Assistant:{% endif %}"
     )
-# —————————————————————————————————————————
-# 2) Model & CLIP files — download if missing
-MODEL_FILE = "SmolVLM2-2.2B-Instruct.IQ4_XS.gguf"
-CLIP_FILE  = "mmproj-SmolVLM2-2.2B-Instruct-Q8_0.gguf"
-MODEL_REPO = "mradermacher/SmolVLM2-2.2B-Instruct-GGUF"
-CLIP_REPO  = "ggml-org/SmolVLM2-2.2B-Instruct-GGUF"
-def ensure_models():
-    logging.debug("Ensuring model files are present...")
-    if not os.path.exists(MODEL_FILE):
-        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
-        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
-        os.symlink(path, MODEL_FILE)
-        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
-    else:
-        logging.debug(f"Model file {MODEL_FILE} already exists.")
-    if not os.path.exists(CLIP_FILE):
-        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
-        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
-        os.symlink(path, CLIP_FILE)
-        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
-    else:
-        logging.debug(f"CLIP file {CLIP_FILE} already exists.")
-ensure_models()
-def load_llm():
-    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
-    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
-    llm = Llama(
-        model_path=MODEL_FILE,
-        chat_handler=handler,
-        n_ctx=1024,
-        verbose=False,
-    )
-    logging.info("Llama model loaded successfully.")
-    return llm
-llm = load_llm()
-# —————————————————————————————————————————
-# 4) Captioning helper (stateless prompt)
-def caption_frame(frame):
-    logging.debug("caption_frame called.")
-    # make a writable copy
-    frame = frame.copy()
-    frame = cv2.resize(frame, (384, 384))
-    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")
-    # save frame to temporary file for URI
-    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
-        success = cv2.imwrite(f.name, frame)
-        if not success:
-            logging.error(f"Failed to write frame to {f.name}")
-        else:
-            logging.debug(f"Frame written to temp file: {f.name}")
-        uri = Path(f.name).absolute().as_uri()
-        logging.debug(f"Frame URI: {uri}")
-        # build a single prompt string
         messages = [
-            {
-                "role": "system",
-                "content": (
-                    "Focus only on describing the key dramatic action or notable event occurring "
-                    "in this image. Skip general context or scene-setting details unless they are "
-                    "crucial to understanding the main action."
-                ),
-            },
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": uri},
-                    {"type": "text",      "text": "What is happening in this image?"},
-                ],
-            },
         ]
-        logging.debug(f"Constructed messages: {messages}")
-        # stateless completion call
-        logging.debug("Resetting LLM and clearing cache.")
-        llm.chat_handler.__init__(clip_model_path=CLIP_FILE, verbose=False)
-        logging.debug("Sending chat completion request...")
         resp = llm.create_chat_completion(
             messages=messages,
             max_tokens=128,
             temperature=0.1,
-            stop=["<end_of_utterance>"],
         )
-        logging.debug(f"LLM raw response: {resp}")
-    # extract caption
-    caption = (resp.get("choices", [])[0]["message"].get("content", "") or "").strip()
-    logging.debug(f"Extracted caption: {caption}")
-    return caption
-# —————————————————————————————————————————
-# 5) Gradio UI (v5 streaming)
-demo = gr.Blocks()
-with demo:
-    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
-    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
-    caption_box = gr.Textbox(interactive=False, label="Caption")
-    # stream frames and captions
-    input_img.stream(
-        fn=caption_frame,
-        inputs=[input_img],
-        outputs=[caption_box],
-        stream_every=3,
-        time_limit=600
-    )
-if __name__ == "__main__":
-    logging.debug("Launching Gradio demo...")
     demo.launch()
-# todos:
-# 1. add a list of models to choose from: SmolVLM2 256M, 500M, 2.2B at various precisions
-# 2. customizable interval
-# 3. customizable system and user prompts

+import time
 import logging
 import gradio as gr
 import cv2
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 from llama_cpp.llama_chat_format import Llava15ChatHandler
# ----------------------------------------
# Model configurations: per-size prefixes and repos
def _model_config(stem, model_variants):
    """Build the registry entry for the checkpoint family named ``stem``.

    Decoder weights are looked up in ``mradermacher/<stem>-GGUF`` and the
    CLIP/mmproj weights in ``ggml-org/<stem>-GGUF``. Every call returns
    fresh list objects so entries never share mutable state.
    """
    return {
        "model_repo": f"mradermacher/{stem}-GGUF",
        "clip_repo": f"ggml-org/{stem}-GGUF",
        "model_prefix": stem,
        "clip_prefix": f"mmproj-{stem}",
        "model_variants": list(model_variants),
        "clip_variants": ["Q8_0", "f16"],
    }


# Keyed by the label shown in the "Model Size" dropdown.
MODELS = {
    "256M": _model_config("SmolVLM2-256M-Video-Instruct", ("Q8_0", "f16")),
    "500M": _model_config("SmolVLM2-500M-Video-Instruct", ("Q8_0", "f16")),
    "2.2B": _model_config("SmolVLM2-2.2B-Instruct", ("Q4_K_M", "Q8_0", "f16")),
}

# ----------------------------------------
# Cache for loaded model instance: every slot starts out as None and is
# filled in together once a model has been loaded.
model_cache = dict.fromkeys(("size", "model_file", "clip_file", "llm"))
# Helper to download & symlink weights
def ensure_weights(size, model_file, clip_file):
    """Make sure both weight files are reachable from the working directory.

    Any file that does not exist locally is fetched with ``hf_hub_download``
    from the repo configured for ``size`` in ``MODELS`` and then exposed
    under its bare filename through a symlink to the downloaded path.

    Args:
        size: Key into ``MODELS`` (for example ``"2.2B"``).
        model_file: Decoder GGUF filename.
        clip_file: CLIP/mmproj GGUF filename.

    Returns:
        The ``(model_file, clip_file)`` pair, unchanged.

    Raises:
        KeyError: If ``size`` is not a key of ``MODELS``.
    """
    cfg = MODELS[size]
    targets = (
        ("model", model_file, cfg['model_repo']),
        ("CLIP", clip_file, cfg['clip_repo']),
    )
    for label, filename, repo_id in targets:
        if os.path.exists(filename):
            continue
        logging.info(f"Downloading {label} file {filename} from {repo_id}...")
        path = hf_hub_download(repo_id=repo_id, filename=filename)
        # os.path.exists() follows symlinks, so a link whose target has
        # disappeared looks "missing" above. Remove the stale link first,
        # otherwise os.symlink() raises FileExistsError.
        if os.path.islink(filename):
            os.remove(filename)
        os.symlink(path, filename)
    return model_file, clip_file
+# Custom chat handler
 class SmolVLM2ChatHandler(Llava15ChatHandler):
     CHAT_FORMAT = (
         "<|im_start|>"
         "{% if add_generation_prompt %}Assistant:{% endif %}"
     )
# Load and cache LLM (only on dropdown change)
def update_llm(size, model_file, clip_file):
    """Swap the cached Llama instance when the requested weights differ.

    Nothing is loaded if ``model_cache`` already describes the same
    ``(size, model_file, clip_file)`` triple. Otherwise the weights are
    ensured on disk, a new handler and ``Llama`` are built, and all four
    cache slots are replaced together.

    Returns:
        Always ``None``; the function is wired to events with no outputs.
    """
    requested = (size, model_file, clip_file)
    loaded = tuple(model_cache[key] for key in ('size', 'model_file', 'clip_file'))
    if loaded == requested:
        return None  # no UI output

    decoder_path, clip_path = ensure_weights(*requested)
    chat_handler = SmolVLM2ChatHandler(clip_model_path=clip_path, verbose=False)
    # The cache is only touched once Llama() has returned, so a failed load
    # leaves the previously cached model in place.
    new_llm = Llama(
        model_path=decoder_path,
        chat_handler=chat_handler,
        n_ctx=1024,
        verbose=False,
    )
    model_cache.update(
        size=size,
        model_file=decoder_path,
        clip_file=clip_path,
        llm=new_llm,
    )
    return None  # no UI output
# Build weight filename lists
def get_weight_files(size):
    """Return ``(model_files, clip_files)`` for the given model size.

    Decoder names have the form ``<model_prefix>.<variant>.gguf`` and CLIP
    names the form ``<clip_prefix>-<variant>.gguf``. Each list follows the
    variant order stored in ``MODELS[size]``.
    """
    entry = MODELS[size]
    decoder_stem = entry['model_prefix']
    clip_stem = entry['clip_prefix']

    model_files = []
    for variant in entry['model_variants']:
        model_files.append("{}.{}.gguf".format(decoder_stem, variant))

    clip_files = []
    for variant in entry['clip_variants']:
        clip_files.append("{}-{}.gguf".format(clip_stem, variant))

    return model_files, clip_files
# Caption using cached llm
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
    """Caption one streamed frame with the model held in ``model_cache``.

    Args:
        frame: Image array delivered by the webcam stream.
            NOTE(review): Gradio may pass ``None`` before the camera is
            running; that case is guarded below. Confirm against the Gradio
            version in use.
        size: Selected model size (not used for inference).
        model_file: Selected decoder file (not used for inference).
        clip_file: Selected CLIP file (not used for inference; the cached
            model's own projector is used instead).
        interval_ms: Delay in milliseconds applied before each caption.
        sys_prompt: System prompt text.
        usr_prompt: User prompt text sent alongside the image.

    Returns:
        The stripped caption text, or ``""`` when there is no frame, no
        loaded model, the frame could not be written to disk, or the model
        returned no content.
    """
    # Use pre-loaded model
    llm = model_cache['llm']
    # Sleep first so that skipped frames are throttled as well.
    time.sleep(interval_ms / 1000)
    if frame is None or llm is None:
        return ""

    img = cv2.resize(frame.copy(), (384, 384))
    with tempfile.NamedTemporaryFile(suffix='.jpg') as tmp:
        if not cv2.imwrite(tmp.name, img):
            logging.error(f"Failed to write frame to {tmp.name}")
            return ""
        uri = Path(tmp.name).absolute().as_uri()
        messages = [
            {"role": "system", "content": sys_prompt},
            {"role": "user",   "content": [
                {"type": "image_url", "image_url": uri},
                {"type": "text",      "text": usr_prompt}
            ]}
        ]
        # Re-init the handler with the projector that belongs to the cached
        # decoder. The dropdown value may already point at another file
        # whose load is still pending or has failed.
        llm.chat_handler.__init__(clip_model_path=model_cache['clip_file'], verbose=False)
        resp = llm.create_chat_completion(
            messages=messages,
            max_tokens=128,
            temperature=0.1,
            stop=["<end_of_utterance>"]
        )

    # An empty "choices" list and a None "content" both collapse to "".
    choices = resp.get('choices') or [{}]
    content = choices[0].get('message', {}).get('content')
    return (content or '').strip()
# Gradio UI
def main():
    """Build the captioning UI, preload the default model and launch it.

    Changing the model size repopulates the decoder and CLIP dropdowns with
    the files of that size before the model is loaded, so the filenames
    handed to ``update_llm`` always match the selected size.
    """
    logging.basicConfig(level=logging.INFO)
    default = '2.2B'
    mf, cf = get_weight_files(default)

    def refresh_weight_choices(size):
        # Every size has its own filenames; offer those and select the
        # first variant of each list.
        model_files, clip_files = get_weight_files(size)
        return (
            gr.update(choices=model_files, value=model_files[0]),
            gr.update(choices=clip_files, value=clip_files[0]),
        )

    with gr.Blocks() as demo:
        gr.Markdown("## 🎥 Real-Time Camera Captioning")
        with gr.Row():
            size_dd   = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
            model_dd  = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
            clip_dd   = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
        selection = [size_dd, model_dd, clip_dd]
        # Size change: refresh the file dropdowns first, then preload the llm.
        size_dd.change(
            fn=refresh_weight_choices, inputs=[size_dd], outputs=[model_dd, clip_dd]
        ).then(fn=update_llm, inputs=selection, outputs=[])
        # `.input` fires only on user edits, so the programmatic refresh
        # above does not queue additional loads.
        model_dd.input(fn=update_llm, inputs=selection, outputs=[])
        clip_dd.input(fn=update_llm, inputs=selection, outputs=[])
        # Initial load
        update_llm(default, mf[0], cf[0])
        interval = gr.Slider(100, 20000, step=100, value=1000, label='Interval (ms)')
        sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
        usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
        cam   = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
        cap   = gr.Textbox(interactive=False, label='Caption')
        cam.stream(
            fn=caption_frame,
            inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
            outputs=[cap], time_limit=600
        )
    demo.launch()


if __name__ == '__main__':
    main()