Luigi committed
Commit 221e4b6 · Parent: 60e423d
Files changed (2):
  1. app.py +153 -133
  2. requirements.txt +5 -9
app.py CHANGED
@@ -1,140 +1,160 @@
- import os
- import sys
- import time
- import socket
- import atexit
- import subprocess
- import shutil
- from pathlib import Path
-
  import streamlit as st
  import cv2
- from PIL import Image
- import base64
- import requests
  from huggingface_hub import hf_hub_download
-
- # --- Configuration (reuse from main.py) ---
- PORT = 8000
- BASE_URL = f"http://localhost:{PORT}/v1"
- MODEL_ALIAS = "gpt-4-vision-preview"
- REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
- MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
- PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
-
- # Download model files if missing
-
- def download_if_missing(repo_id: str, filename: str) -> None:
-     if not os.path.isfile(filename):
-         cached = hf_hub_download(repo_id=repo_id, filename=filename)
-         shutil.copy(cached, filename)
-
- # Ensure models on startup
- ensure_models = lambda: [download_if_missing(REPO_ID, MODEL_FILE), download_if_missing(REPO_ID, PROJ_FILE)]
  ensure_models()
-
- # Start local server for captioning
-
- def start_server() -> subprocess.Popen:
-     cmd = [
-         sys.executable, "-m", "llama_cpp.server",
-         "--model", MODEL_FILE,
-         "--clip_model_path", PROJ_FILE,
-         "--chat_format", "llava-1-5",
-         "--port", str(PORT),
-         "--host", "127.0.0.1",
-         "--model_alias", MODEL_ALIAS
-     ]
-
-     print("⏳ Launching llama server:", " ".join(cmd), file=sys.stderr)
-     proc = subprocess.Popen(cmd)
-     atexit.register(proc.terminate)
-     # wait until responsive
-     for _ in range(40):
-         try:
-             with socket.create_connection(("localhost", PORT), timeout=1):
-                 return proc
-         except OSError:
-             time.sleep(0.25)
-     proc.terminate()
-     out, err = proc.communicate(timeout=1)
-     print("🐛 llama server stdout:\n", out.decode(), file=sys.stderr)
-     print("🐛 llama server stderr:\n", err.decode(), file=sys.stderr)
-     raise RuntimeError(f"Server failed to start on port {PORT}.")
-
- server_proc = start_server()
-
- # Send image to caption API
-
- def caption_image_file(path: str) -> str:
-     b64 = base64.b64encode(open(path, "rb").read()).decode()
-     uri = f"data:image/jpeg;base64,{b64}"
-     payload = {
-         "model": MODEL_ALIAS,
-         "messages": [
-             {"role": "system", "content": (
-                 "You are a precise image-captioning assistant. "
-                 "Identify the main subject, their clothing, posture, and environment."
-             )},
-             {"role": "user", "content": [
                  {"type": "image_url", "image_url": {"url": uri}},
-                 {"type": "text", "text": "Caption this image in one detailed sentence."}
-             ]}
-         ],
-         "temperature": 0.1,
-         "max_tokens": 100
-     }
-     resp = requests.post(BASE_URL + "/chat/completions", json=payload)
-     resp.raise_for_status()
-     return resp.json()["choices"][0]["message"]["content"]
-
- # Helper to handle PIL image
-
- def run_caption(pil_img: Image.Image) -> str:
-     tmp = Path("frame.jpg")
-     pil_img.save(tmp)
-     return caption_image_file(str(tmp))
-
- # --- Streamlit UI ---
- st.set_page_config(page_title="Real-Time Camera Captioning", layout="wide")
- st.title("🎥 Real-Time Camera Captioning")
-
- interval = st.sidebar.slider("Interval between captions (seconds)", min_value=1, max_value=10, value=3)
- start = st.sidebar.button("Start")
- stop = st.sidebar.button("Stop")
-
- if 'running' not in st.session_state:
-     st.session_state.running = False
-
- if start:
-     st.session_state.running = True
- if stop:
-     st.session_state.running = False
-
- # Placeholders for video and caption
- frame_placeholder = st.empty()
- caption_placeholder = st.empty()
-
- # OpenCV camera
- cap = cv2.VideoCapture(0)
-
- while st.session_state.running:
-     ret, frame = cap.read()
-     if not ret:
-         st.error("Unable to read from camera.")
-         break
-     # Convert BGR to RGB
-     rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-     img = Image.fromarray(rgb)
-
-     # Show frame
-     frame_placeholder.image(img, caption="Live Feed", use_container_width=True)
-
-     # Generate and show caption
-     with st.spinner("Generating caption..."):
-         caption = run_caption(img)
-     caption_placeholder.markdown(f"**Caption:** {caption}")
-
-     time.sleep(interval)
-
- cap.release()
+ # app.py
  import streamlit as st
+ st.set_page_config(layout="wide")
+
+ import av
  import cv2
+ import time
+ import tempfile
+ import os
+ from pathlib import Path
  from huggingface_hub import hf_hub_download
+ from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, RTCConfiguration
+ from llama_cpp import Llama
+ from llama_cpp.llama_chat_format import LlamaChatCompletionHandlerRegistry, Llava15ChatHandler
+ from termcolor import cprint
+
+ # —————————————————————————————————————————
+ # 1) Inline definition & registration of SmolVLM2ChatHandler
+ class SmolVLM2ChatHandler(Llava15ChatHandler):
+     CHAT_FORMAT = (
+         "<|im_start|>"
+         "{% for message in messages %}"
+         "{{ message['role'] | capitalize }}"
+         "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
+         "{% else %}: "
+         "{% endif %}"
+         "{% for content in message['content'] %}"
+         "{% if content['type']=='text' %}{{ content['text'] }}"
+         "{% elif content['type']=='image_url' %}"
+         "{% if content['image_url'] is string %}"
+         "{{ content['image_url'] }}\n"
+         "{% elif content['image_url'] is mapping %}"
+         "{{ content['image_url']['url'] }}\n"
+         "{% endif %}"
+         "{% endif %}"
+         "{% endfor %}"
+         "<end_of_utterance>\n"
+         "{% endfor %}"
+         "{% if add_generation_prompt %}Assistant:{% endif %}"
+     )
+
+ # Overwrite any previous registration
+ LlamaChatCompletionHandlerRegistry().register_chat_completion_handler(
+     "smolvlm2", SmolVLM2ChatHandler, overwrite=True
+ )
+
+ # —————————————————————————————————————————
+ # 2) Model & CLIP files — download if missing
+ MODEL_FILE = "SmolVLM2-500M-Video-Instruct.Q8_0.gguf"
+ CLIP_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
+ MODEL_REPO = "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF"
+ CLIP_REPO = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
+
+ def ensure_models():
+     if not os.path.exists(MODEL_FILE):
+         path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
+         os.symlink(path, MODEL_FILE)
+     if not os.path.exists(CLIP_FILE):
+         path = hf_hub_download(repo_id=CLIP_REPO, filename=CLIP_FILE)
+         os.symlink(path, CLIP_FILE)
  ensure_models()

+ @st.cache_resource
+ def load_llm():
+     handler = SmolVLM2ChatHandler(clip_model_path=CLIP_FILE, verbose=False)
+     return Llama(
+         model_path=MODEL_FILE,
+         chat_handler=handler,
+         n_ctx=8192,
+         verbose=False,
+     )
+
+ llm = load_llm()
+
+ # —————————————————————————————————————————
+ # 3) Helper to run a single frame through the model (with debug)
+ def caption_frame(frame):
+     with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
+         cv2.imwrite(f.name, frame)
+         uri = Path(f.name).absolute().as_uri()
+
+     messages = [
+         {
+             "role": "system",
+             "content": (
+                 "Focus only on describing the key dramatic action or notable event occurring "
+                 "in this image. Skip general context or scene-setting details unless they are "
+                 "crucial to understanding the main action."
+             ),
+         },
+         {
+             "role": "user",
+             "content": [
                  {"type": "image_url", "image_url": {"url": uri}},
+                 {"type": "text", "text": "What is happening in this image?"},
+             ],
+         },
+     ]
+
+     print("DEBUG ▶ caption_frame: invoking LLM")
+     resp = llm.create_chat_completion(
+         messages=messages,
+         max_tokens=128,
+         temperature=0.1,
+         stop=["<end_of_utterance>"],
+     )
+     out = (resp["choices"][0].get("message", {}).get("content") or "").strip()
+     print(f"DEBUG ▶ LLM returned: {out!r}")
+     return out
+
+ # —————————————————————————————————————————
+ # 4) Streamlit UI + WebRTC configuration
+ st.title("🎥 Real-Time Camera Captioning with SmolVLM2 (CPU)")
+
+ interval_ms = st.slider(
+     "Caption every N ms", min_value=100, max_value=10000, value=1000, step=100
+ )
+
+ RTC_CONFIG = RTCConfiguration({
+     "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
+ })
+
+ class CaptionProcessor(VideoProcessorBase):
+     def __init__(self):
+         self.interval = 1.0
+         self.last_time = time.time()
+         self.caption = ""
+
+     def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
+         img = frame.to_ndarray(format="bgr24")
+         now = time.time()
+         if now - self.last_time >= self.interval:
+             self.last_time = now
+             print("DEBUG ▶ CaptionProcessor.recv: time reached, generating caption")
+             self.caption = caption_frame(img)
+         return av.VideoFrame.from_ndarray(img, format="bgr24")
+
+ ctx = webrtc_streamer(
+     key="smolvlm2-captioner",
+     video_processor_factory=CaptionProcessor,
+     rtc_configuration=RTC_CONFIG,
+     media_stream_constraints={"video": True, "audio": False},
+ )
+
+ # Update the processor interval
+ if ctx.video_processor:
+     ctx.video_processor.interval = interval_ms / 1000.0
+
+ # Placeholder for showing captions
+ placeholder = st.empty()
+ if ctx.state.playing:
+     placeholder.markdown("**Caption:** _Waiting for inference…_")
+     while ctx.state.playing:
+         txt = ctx.video_processor.caption or "_…thinking…_"
+         placeholder.markdown(f"**Caption:** {txt}")
+         time.sleep(0.1)
+ else:
+     st.info("▶️ Click **Start** above to begin streaming")
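A design note on the committed CaptionProcessor: recv() calls caption_frame() inline, so every CPU inference stalls the WebRTC video pipeline for its full duration. Below is a minimal sketch of a non-blocking variant; it assumes the imports and the caption_frame() helper from app.py above, and the AsyncCaptionProcessor name and its lock-based handoff are illustrative only, not part of this commit.

import threading

class AsyncCaptionProcessor(VideoProcessorBase):
    """Hypothetical drop-in for CaptionProcessor: inference runs off-thread."""
    def __init__(self):
        self.interval = 1.0
        self.last_time = time.time()
        self.caption = ""
        self._busy = threading.Lock()  # prevents overlapping LLM calls

    def _worker(self, img):
        try:
            self.caption = caption_frame(img)
        finally:
            self._busy.release()

    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
        img = frame.to_ndarray(format="bgr24")
        # Start a new caption job only if the interval elapsed and none is running
        if time.time() - self.last_time >= self.interval and self._busy.acquire(blocking=False):
            self.last_time = time.time()
            # Copy the buffer: the frame's memory may be recycled while the worker runs
            threading.Thread(target=self._worker, args=(img.copy(),), daemon=True).start()
        return av.VideoFrame.from_ndarray(img, format="bgr24")

Because the lock admits only one worker at a time, this also keeps the single llama_cpp.Llama instance from being invoked concurrently, which it does not support.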
requirements.txt CHANGED
@@ -1,10 +1,6 @@
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
- --prefer-binary
- --only-binary=llama-cpp-python
- gradio>=3.0
- requests
- huggingface_hub
  opencv-python
- fastapi
- uvicorn[standard]
- llama-cpp-python[server]==0.3.9
- Pillow
+ streamlit
+ streamlit-webrtc
+ llama-cpp-python
+ huggingface-hub
+ termcolor
  opencv-python
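With this trimmed dependency list, the Space should run locally with `pip install -r requirements.txt` followed by `streamlit run app.py` (assuming a recent Python and a working webcam); the GGUF model and mmproj files are fetched on first launch by ensure_models().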