Luigi committed on
Commit 5c50991 · 1 Parent(s): 76a0b57

1. add more models
2. user can define system and user prompt
3. user can decide update interval

Files changed (1)
app.py +127 -124
app.py CHANGED
@@ -1,3 +1,4 @@
+import time
 import logging
 import gradio as gr
 import cv2
@@ -7,17 +8,60 @@ from pathlib import Path
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 from llama_cpp.llama_chat_format import Llava15ChatHandler
-from termcolor import cprint
 
-# Configure logging
-logging.basicConfig(
-    level=logging.DEBUG,
-    format='[%(asctime)s] %(levelname)s: %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S'
-)
-
-# ————————————————————————————————————————
-# 1) Inline definition & registration of SmolVLM2ChatHandler
+# ----------------------------------------
+# Model configurations: per-size prefixes and repos
+MODELS = {
+    "256M": {
+        "model_repo": "mradermacher/SmolVLM2-256M-Video-Instruct-GGUF",
+        "clip_repo": "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF",
+        "model_prefix": "SmolVLM2-256M-Video-Instruct",
+        "clip_prefix": "mmproj-SmolVLM2-256M-Video-Instruct",
+        "model_variants": ["Q8_0", "f16"],
+        "clip_variants": ["Q8_0", "f16"],
+    },
+    "500M": {
+        "model_repo": "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF",
+        "clip_repo": "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF",
+        "model_prefix": "SmolVLM2-500M-Video-Instruct",
+        "clip_prefix": "mmproj-SmolVLM2-500M-Video-Instruct",
+        "model_variants": ["Q8_0", "f16"],
+        "clip_variants": ["Q8_0", "f16"],
+    },
+    "2.2B": {
+        "model_repo": "mradermacher/SmolVLM2-2.2B-Instruct-GGUF",
+        "clip_repo": "ggml-org/SmolVLM2-2.2B-Instruct-GGUF",
+        "model_prefix": "SmolVLM2-2.2B-Instruct",
+        "clip_prefix": "mmproj-SmolVLM2-2.2B-Instruct",
+        "model_variants": ["Q4_K_M", "Q8_0", "f16"],
+        "clip_variants": ["Q8_0", "f16"],
+    },
+}
+
+# ----------------------------------------
+# Cache for loaded model instance
+model_cache = {
+    'size': None,
+    'model_file': None,
+    'clip_file': None,
+    'llm': None
+}
+
+# Helper to download & symlink weights
+
+def ensure_weights(size, model_file, clip_file):
+    cfg = MODELS[size]
+    if not os.path.exists(model_file):
+        logging.info(f"Downloading model file {model_file} from {cfg['model_repo']}...")
+        path = hf_hub_download(repo_id=cfg['model_repo'], filename=model_file)
+        os.symlink(path, model_file)
+    if not os.path.exists(clip_file):
+        logging.info(f"Downloading CLIP file {clip_file} from {cfg['clip_repo']}...")
+        path = hf_hub_download(repo_id=cfg['clip_repo'], filename=clip_file)
+        os.symlink(path, clip_file)
+    return model_file, clip_file
+
+# Custom chat handler
 class SmolVLM2ChatHandler(Llava15ChatHandler):
     CHAT_FORMAT = (
         "<|im_start|>"
@@ -41,127 +85,86 @@ class SmolVLM2ChatHandler(Llava15ChatHandler):
         "{% if add_generation_prompt %}Assistant:{% endif %}"
     )
 
-# ————————————————————————————————————————
-# 2) Model & CLIP files — download if missing
-MODEL_FILE = "SmolVLM2-2.2B-Instruct.IQ4_XS.gguf"
-CLIP_FILE = "mmproj-SmolVLM2-2.2B-Instruct-Q8_0.gguf"
-MODEL_REPO = "mradermacher/SmolVLM2-2.2B-Instruct-GGUF"
-CLIP_REPO = "ggml-org/SmolVLM2-2.2B-Instruct-GGUF"
-
-def ensure_models():
-    logging.debug("Ensuring model files are present...")
-    if not os.path.exists(MODEL_FILE):
-        logging.info(f"Downloading model file {MODEL_FILE} from {MODEL_REPO}...")
-        path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
-        os.symlink(path, MODEL_FILE)
-        logging.info(f"Created symlink: {path} -> {MODEL_FILE}")
-    else:
-        logging.debug(f"Model file {MODEL_FILE} already exists.")
-
-    if not os.path.exists(CLIP_FILE):
-        logging.info(f"Downloading CLIP file {CLIP_FILE} from {CLIP_REPO}...")
-        path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
-        os.symlink(path, CLIP_FILE)
-        logging.info(f"Created symlink: {path} -> {CLIP_FILE}")
-    else:
-        logging.debug(f"CLIP file {CLIP_FILE} already exists.")
-
-ensure_models()
-
-
-def load_llm():
-    logging.debug("Loading Llama model with SmolVLM2ChatHandler...")
-    handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
-    llm = Llama(
-        model_path=MODEL_FILE,
-        chat_handler=handler,
-        n_ctx=1024,
-        verbose=False,
-    )
-    logging.info("Llama model loaded successfully.")
-    return llm
-
-llm = load_llm()
-
-# ————————————————————————————————————————
-# 4) Captioning helper (stateless prompt)
-def caption_frame(frame):
-    logging.debug("caption_frame called.")
-    # make a writable copy
-    frame = frame.copy()
-    frame = cv2.resize(frame, (384, 384))
-    logging.debug(f"Frame shape: {frame.shape}, dtype: {frame.dtype}")
-
-    # save frame to temporary file for URI
-    with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
-        success = cv2.imwrite(f.name, frame)
-        if not success:
-            logging.error(f"Failed to write frame to {f.name}")
-        else:
-            logging.debug(f"Frame written to temp file: {f.name}")
-
-        uri = Path(f.name).absolute().as_uri()
-        logging.debug(f"Frame URI: {uri}")
-
-        # build a single prompt string
+# Load and cache LLM (only on dropdown change)
+
+def update_llm(size, model_file, clip_file):
+    if (model_cache['size'], model_cache['model_file'], model_cache['clip_file']) != (size, model_file, clip_file):
+        mf, cf = ensure_weights(size, model_file, clip_file)
+        handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=False)
+        llm = Llama(model_path=mf, chat_handler=handler, n_ctx=1024, verbose=False)
+        model_cache.update({'size': size, 'model_file': mf, 'clip_file': cf, 'llm': llm})
+    return None  # no UI output
+
+# Build weight filename lists
+
+def get_weight_files(size):
+    cfg = MODELS[size]
+    model_files = [f"{cfg['model_prefix']}.{v}.gguf" for v in cfg['model_variants']]
+    clip_files = [f"{cfg['clip_prefix']}-{v}.gguf" for v in cfg['clip_variants']]
+    return model_files, clip_files
+
+# Caption using cached llm
+
+def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt):
+    # Use pre-loaded model
+    llm = model_cache['llm']
+    time.sleep(interval_ms / 1000)
+    img = cv2.resize(frame.copy(), (384, 384))
+    with tempfile.NamedTemporaryFile(suffix='.jpg') as tmp:
+        cv2.imwrite(tmp.name, img)
+        uri = Path(tmp.name).absolute().as_uri()
     messages = [
-        {
-            "role": "system",
-            "content": (
-                "Focus only on describing the key dramatic action or notable event occurring "
-                "in this image. Skip general context or scene-setting details unless they are "
-                "crucial to understanding the main action."
-            ),
-        },
-        {
-            "role": "user",
-            "content": [
-                {"type": "image_url", "image_url": uri},
-                {"type": "text", "text": "What is happening in this image?"},
-            ],
-        },
+        {"role": "system", "content": sys_prompt},
+        {"role": "user", "content": [
+            {"type": "image_url", "image_url": uri},
+            {"type": "text", "text": usr_prompt}
+        ]}
     ]
-    logging.debug(f"Constructed messages: {messages}")
-
-    # stateless completion call
-    logging.debug("Resetting LLM and clearing cache.")
-    llm.chat_handler.__init__(clip_model_path=CLIP_FILE, verbose=False)
-    logging.debug("Sending chat completion request...")
+    # re-init handler
+    llm.chat_handler.__init__(clip_model_path=clip_file, verbose=False)
     resp = llm.create_chat_completion(
         messages=messages,
         max_tokens=128,
         temperature=0.1,
-        stop=["<end_of_utterance>"],
+        stop=["<end_of_utterance>"]
     )
-    logging.debug(f"LLM raw response: {resp}")
-
-    # extract caption
-    caption = (resp.get("choices", [])[0]["message"].get("content", "") or "").strip()
-    logging.debug(f"Extracted caption: {caption}")
-    return caption
-
-# ————————————————————————————————————————
-# 5) Gradio UI (v5 streaming)
-demo = gr.Blocks()
-with demo:
-    gr.Markdown("## 🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
-    input_img = gr.Image(sources=["webcam"], streaming=True, label="Webcam Feed")
-    caption_box = gr.Textbox(interactive=False, label="Caption")
-
-    # stream frames and captions
-    input_img.stream(
-        fn=caption_frame,
-        inputs=[input_img],
-        outputs=[caption_box],
-        stream_every=3,
-        time_limit=600
-    )
+    return resp.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
+
+# Gradio UI
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+    default = '2.2B'
+    mf, cf = get_weight_files(default)
+
+    with gr.Blocks() as demo:
+        gr.Markdown("## 🎥 Real-Time Camera Captioning")
+        with gr.Row():
+            size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
+            model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
+            clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
+
+        # On any selection change, preload the llm
+        size_dd.change(fn=lambda s, m, c: update_llm(s, m, c), inputs=[size_dd, model_dd, clip_dd], outputs=[])
+        model_dd.change(fn=lambda s, m, c: update_llm(s, m, c), inputs=[size_dd, model_dd, clip_dd], outputs=[])
+        clip_dd.change(fn=lambda s, m, c: update_llm(s, m, c), inputs=[size_dd, model_dd, clip_dd], outputs=[])
+
+        # Initial load
+        update_llm(default, mf[0], cf[0])
+
+        interval = gr.Slider(100, 20000, step=100, value=1000, label='Interval (ms)')
+        sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
+        usr_p = gr.Textbox(lines=1, value="What is happening in this image?", label='User Prompt')
+        cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
+        cap = gr.Textbox(interactive=False, label='Caption')
+
+        cam.stream(
+            fn=caption_frame,
+            inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p],
+            outputs=[cap], time_limit=600
+        )
 
-if __name__ == "__main__":
-    logging.debug("Launching Gradio demo...")
     demo.launch()
 
-# todos:
-# 1. add list of models: smolvlm2 256m, 500m, 2.2b with various precision in choice
-# 2. customizable interval
-# 3. customizable system and user prompts
+if __name__ == '__main__':
+    main()
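
To smoke-test the new knobs without the webcam UI, a minimal sketch using the helpers from this commit (assumes the file is importable as `app` and that a local test.jpg exists; both are assumptions for illustration, not part of the commit):

import cv2
from app import get_weight_files, update_llm, caption_frame

# Pick the smallest size for a quick test; filenames come from the MODELS table.
model_files, clip_files = get_weight_files("256M")
update_llm("256M", model_files[0], clip_files[0])  # downloads weights and caches the Llama instance

frame = cv2.imread("test.jpg")  # any BGR image stands in for a webcam frame
print(caption_frame(
    frame, "256M", model_files[0], clip_files[0],
    interval_ms=0,  # skip the pacing sleep
    sys_prompt="Focus on key dramatic action.",
    usr_prompt="What is happening in this image?",
))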