Luigi committed on
Commit
dd0d47d
·
1 Parent(s): ca97f63
Files changed (2) hide show
  1. app.py +105 -103
  2. requirements.txt +4 -10
app.py CHANGED
@@ -1,132 +1,134 @@
1
- # app.py
2
- import torch; torch.classes.__path__ = [] # Neutralizes the path inspection
3
  import os
4
  import sys
5
  import time
6
  import socket
7
- import subprocess
8
  import atexit
9
- import base64
10
  import shutil
 
11
 
12
- import cv2
13
  import streamlit as st
 
 
 
14
  import requests
15
- from streamlit_webrtc import webrtc_streamer, VideoProcessorBase
16
  from huggingface_hub import hf_hub_download
17
 
18
- # ── Configuration ──────────────────────────────────────────────────────────────
19
- PORT = 8000
20
- BASE_URL = f"http://localhost:{PORT}/v1"
21
  MODEL_ALIAS = "gpt-4-vision-preview"
22
  REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
23
  MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
24
- PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
25
 
26
- # ── Helpers to download & launch server ─────────────────────────────────────────
27
- def download_if_missing(repo_id: str, filename: str):
28
- if not os.path.exists(filename):
29
- cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="model")
 
30
  shutil.copy(cached, filename)
31
 
32
- def ensure_models():
33
- download_if_missing(REPO_ID, MODEL_FILE)
34
- download_if_missing(REPO_ID, PROJ_FILE)
 
 
35
 
36
- def start_server():
37
  cmd = [
38
  sys.executable, "-m", "llama_cpp.server",
39
  "--model", MODEL_FILE,
40
  "--clip_model_path", PROJ_FILE,
41
  "--chat_format", "llava-1-5",
42
  "--port", str(PORT),
43
- "--model_alias", MODEL_ALIAS,
44
  ]
45
- proc = subprocess.Popen(
46
- cmd,
47
- stdout=subprocess.PIPE,
48
- stderr=subprocess.STDOUT,
49
- text=True, # so line buffering works
50
- bufsize=1,
51
- )
52
  atexit.register(proc.terminate)
 
 
 
 
 
 
 
 
 
53
 
54
- for line in proc.stdout:
55
- if "Application startup complete." in line:
56
- return proc
57
 
58
- raise RuntimeError(f"Server failed to start on port {PORT}")
59
 
60
- # ── Boot llama-cpp-python server ────────────────────────────────────────────────
61
- ensure_models()
62
- _server_proc = start_server()
63
-
64
- # ── Streamlit UI ───────────────────────────────────────────────────────────────
65
- st.set_page_config(page_title="SmolVLM Live Caption Demo", layout="wide")
66
- st.title("πŸ“Έ Live Camera Captioning with SmolVLM")
67
- st.markdown(
68
- """
69
- Use the **slider** below to choose how often (in milliseconds) to
70
- send a frame to SmolVLM for captioning. The latest caption will
71
- be overlaid on your video feed.
72
- """
73
- )
74
- interval_ms = st.sidebar.slider("Caption every N ms", 100, 5000, 3000)
75
-
76
- # ── Video processor ────────────────────────────────────────────────────────────
77
- class CaptionProcessor(VideoProcessorBase):
78
- def __init__(self, interval_ms: int):
79
- self.interval = interval_ms / 1000.0
80
- self.last_time = 0.0
81
- self.caption = "Waiting for caption..."
82
- self.font = cv2.FONT_HERSHEY_SIMPLEX
83
-
84
- def recv(self, frame):
85
- img = frame.to_ndarray(format="bgr24")
86
- now = time.time()
87
- if now - self.last_time >= self.interval:
88
- self.last_time = now
89
-
90
- # JPEG + base64 encode
91
- success, buf = cv2.imencode(".jpg", img)
92
- if success:
93
- b64 = base64.b64encode(buf).decode("utf-8")
94
- payload = {
95
- "model": MODEL_ALIAS,
96
- "messages": [
97
- {
98
- "role": "system",
99
- "content": (
100
- "You are a precise image‐captioning assistant. "
101
- "Identify the main subject, their clothing, posture, and environment."
102
- ),
103
- },
104
- {
105
- "role": "user",
106
- "content": [
107
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
108
- {"type": "text", "text": "Caption this image in one detailed sentence."},
109
- ],
110
- },
111
- ],
112
- "temperature": 0.1,
113
- "max_tokens": 100,
114
- }
115
-
116
- try:
117
- r = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=10)
118
- r.raise_for_status()
119
- self.caption = r.json()["choices"][0]["message"]["content"].strip()
120
- except Exception as e:
121
- self.caption = f"[Error] {e}"
122
-
123
- # overlay caption
124
- y = img.shape[0] - 20
125
- cv2.putText(img, self.caption, (10, y), self.font, 0.7, (0, 255, 0), 2)
126
- return frame.from_ndarray(img, format="bgr24")
127
-
128
- webrtc_streamer(
129
- key=f"caption_{interval_ms}",
130
- video_processor_factory=lambda: CaptionProcessor(interval_ms),
131
- media_stream_constraints={"video": True, "audio": False},
132
- )
 
 
 
1
  import os
2
  import sys
3
  import time
4
  import socket
 
5
  import atexit
6
+ import subprocess
7
  import shutil
8
+ from pathlib import Path
9
 
 
10
  import streamlit as st
11
+ import cv2
12
+ from PIL import Image
13
+ import base64
14
  import requests
 
15
  from huggingface_hub import hf_hub_download
16
 
17
+ # --- Configuration (reuse from main.py) ---
18
+ PORT = 8000
19
+ BASE_URL = f"http://localhost:{PORT}/v1"
20
  MODEL_ALIAS = "gpt-4-vision-preview"
21
  REPO_ID = "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF"
22
  MODEL_FILE = "SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
23
+ PROJ_FILE = "mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf"
24
 
25
+ # Download model files if missing
26
+
27
def download_if_missing(repo_id: str, filename: str) -> None:
    """Make sure *filename* exists in the working directory.

    If the file is already present nothing happens. Otherwise it is fetched
    through the Hugging Face Hub cache and copied next to the app.

    Args:
        repo_id: Hub repository that hosts the file.
        filename: Name of the file inside the repository; also used as the
            local destination path.
    """
    if os.path.isfile(filename):
        return

    cached = hf_hub_download(repo_id=repo_id, filename=filename)

    # Copy under a temporary name and rename afterwards: an interrupted copy
    # must not leave a truncated file that the isfile() check above would
    # accept as a complete model on the next start.
    partial = f"{filename}.part"
    shutil.copy(cached, partial)
    os.replace(partial, filename)
31
 
32
+ # Ensure models on startup
33
def ensure_models() -> None:
    """Download the language model and its projector file if they are missing."""
    for filename in (MODEL_FILE, PROJ_FILE):
        download_if_missing(REPO_ID, filename)


ensure_models()
35
+
36
+ # Start local server for captioning
37
 
38
@st.cache_resource
def start_server() -> subprocess.Popen:
    """Launch the llama-cpp-python server and wait until its port accepts connections.

    Streamlit re-executes this script on every widget interaction, so the
    function is wrapped in ``st.cache_resource``: the server is started once
    per Streamlit process and the same handle is returned on later reruns
    instead of spawning another process on the same port.

    Returns:
        The ``Popen`` handle of the server process.

    Raises:
        RuntimeError: If the process exits during startup, or if the port is
            still not accepting connections after all attempts.
    """
    cmd = [
        sys.executable, "-m", "llama_cpp.server",
        "--model", MODEL_FILE,
        "--clip_model_path", PROJ_FILE,
        "--chat_format", "llava-1-5",
        "--port", str(PORT),
        "--model_alias", MODEL_ALIAS,
    ]
    proc = subprocess.Popen(cmd)
    atexit.register(proc.terminate)

    # Poll the port until the server is responsive.
    for _ in range(40):
        # Stop waiting as soon as the child has died; otherwise a crash at
        # startup would only be reported after the full polling budget.
        exit_code = proc.poll()
        if exit_code is not None:
            raise RuntimeError(
                f"Server failed to start on port {PORT}. (exit code {exit_code})"
            )
        try:
            with socket.create_connection(("localhost", PORT), timeout=1):
                return proc
        except OSError:
            time.sleep(0.25)

    proc.terminate()
    raise RuntimeError(f"Server failed to start on port {PORT}.")


server_proc = start_server()
 
 
60
 
61
+ # Send image to caption API
62
 
63
def caption_image_file(path: str, timeout: float = 60.0) -> str:
    """Send the image stored at *path* to the local caption API.

    The file is base64-encoded into a ``data:image/jpeg`` URI and posted to
    the ``/chat/completions`` endpoint under ``BASE_URL``.

    Args:
        path: Path of the image file to caption.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.

    Returns:
        The text content of the first choice in the response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(path, "rb") as image_file:
        b64 = base64.b64encode(image_file.read()).decode()
    uri = f"data:image/jpeg;base64,{b64}"

    payload = {
        "model": MODEL_ALIAS,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a precise image-captioning assistant. "
                    "Identify the main subject, their clothing, posture, and environment."
                ),
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": uri}},
                    {"type": "text", "text": "Caption this image in one detailed sentence."},
                ],
            },
        ],
        "temperature": 0.1,
        "max_tokens": 100,
    }

    resp = requests.post(BASE_URL + "/chat/completions", json=payload, timeout=timeout)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]
84
+
85
+ # Helper to handle PIL image
86
+
87
def run_caption(pil_img: Image.Image) -> str:
    """Caption a PIL image by saving it to disk and calling ``caption_image_file``.

    Args:
        pil_img: Image to caption; it is saved under a ``.jpg`` name.

    Returns:
        The caption string returned by ``caption_image_file``.
    """
    import tempfile

    # A private temporary directory per call: a fixed "frame.jpg" in the
    # working directory would be shared by every concurrent session and
    # would be left behind after the call.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp = Path(tmp_dir) / "frame.jpg"
        pil_img.save(tmp)
        return caption_image_file(str(tmp))
91
+
92
+ # --- Streamlit UI ---
93
st.set_page_config(page_title="Real-Time Camera Captioning", layout="wide")
st.title("🎥 Real-Time Camera Captioning")

interval = st.sidebar.slider(
    "Interval between captions (seconds)", min_value=1, max_value=10, value=3
)
start = st.sidebar.button("Start")
stop = st.sidebar.button("Stop")

if "running" not in st.session_state:
    st.session_state.running = False

if start:
    st.session_state.running = True
if stop:
    st.session_state.running = False

# Placeholders for video and caption
frame_placeholder = st.empty()
caption_placeholder = st.empty()

# Open the camera only while capturing, so an idle rerun does not grab the
# device. Release it in `finally`: the loop can be left through an exception
# or through a rerun requested by a widget click, and in either case a
# trailing release() after the loop would not be reached.
if st.session_state.running:
    cap = cv2.VideoCapture(0)
    try:
        while st.session_state.running:
            ret, frame = cap.read()
            if not ret:
                st.error("Unable to read from camera.")
                break

            # OpenCV delivers BGR; PIL and Streamlit expect RGB.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = Image.fromarray(rgb)

            # Show frame
            frame_placeholder.image(img, caption="Live Feed", use_container_width=True)

            # Generate and show caption. A failed request is reported in
            # place of the caption so one error does not end the session.
            with st.spinner("Generating caption..."):
                try:
                    caption = run_caption(img)
                except requests.RequestException as exc:
                    caption_placeholder.error(f"Caption request failed: {exc}")
                else:
                    caption_placeholder.markdown(f"**Caption:** {caption}")

            time.sleep(interval)
    finally:
        cap.release()
 
requirements.txt CHANGED
@@ -1,10 +1,4 @@
1
- # requirements.txt
2
-
3
- streamlit
4
- streamlit-webrtc
5
- yolov5
6
- opencv-python-headless
7
- numpy
8
- llama-cpp-python[server]>=0.1.102
9
- huggingface-hub>=0.13.3
10
- openai>=0.27.0
 
1
gradio>=3.0
requests
huggingface_hub
# app.py runs `python -m llama_cpp.server`, which needs the server extra.
llama-cpp-python[server]
# Imported directly by app.py (streamlit, cv2, PIL).
streamlit
opencv-python-headless
pillow