Luigi committed on
Commit 957ece1 · 1 Parent(s): 08f659b

update dockerfile and app.py

Files changed (2)
  1. Dockerfile +16 -16
  2. app.py +6 -24
Dockerfile CHANGED
@@ -5,7 +5,7 @@ FROM ubuntu:22.04
 ENV DEBIAN_FRONTEND=noninteractive
 ENV TZ=Etc/UTC
 
-# Configure Hugging Face and XDG cache to use a writable /tmp directory
+# Configure cache paths to a writable /tmp location
 ENV XDG_CACHE_HOME=/tmp/.cache
 ENV HF_HOME=/tmp/.cache/huggingface
 
@@ -17,7 +17,7 @@ RUN echo "tzdata tzdata/Areas select Etc" > /tmp/tzdata.seed && \
     echo "tzdata tzdata/Zones/Etc select UTC" >> /tmp/tzdata.seed && \
     debconf-set-selections /tmp/tzdata.seed
 
-# 1. Install OS-level dependencies (including pkg-config and git)
+# STEP 1: Install OS-level dependencies
 RUN echo "### STEP 1: Installing OS-level dependencies" && \
     apt-get update && \
     apt-get install -y --no-install-recommends \
@@ -32,28 +32,28 @@ RUN echo "### STEP 1: Installing OS-level dependencies" && \
     python3-opencv && \
     rm -rf /var/lib/apt/lists/*
 
-# 2. Prepare application directory
+# STEP 2: Prepare application directory and copy source code
 WORKDIR /app
 COPY requirements.txt ./
 COPY app.py ./
-# (Copy any other source files or directories needed)
+# COPY any other source files or directories needed by your app
 
-# 3. Install Python dependencies (excluding llama-cpp-python)
-RUN echo "### STEP 2: Installing Python dependencies" && \
-    pip3 install --upgrade pip && \
-    pip3 install --no-cache-dir -r requirements.txt
+# STEP 3: Install Python dependencies (ensure huggingface_hub is listed)
+RUN echo "### STEP 3: Installing Python dependencies" && \
+    python3 -m pip install --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
 
-# 4. Ensure cache directories are writable by runtime user
-RUN echo "### STEP 3: Creating cache directories" && \
+# STEP 4: Ensure cache directories are writable
+RUN echo "### STEP 4: Creating and permissioning cache directories" && \
     mkdir -p "$XDG_CACHE_HOME" "$HF_HOME" && \
-    chmod -R a+rwX "$XDG_CACHE_HOME"
+    chmod -R a+rwX "$XDG_CACHE_HOME" "$HF_HOME"
 
-# 5. Build and install llama-cpp-python from source with OpenBLAS
-RUN echo "### STEP 4: Building llama-cpp-python with OpenBLAS" && \
+# STEP 5: Build and install llama-cpp-python from source with OpenBLAS
+RUN echo "### STEP 5: Building llama-cpp-python with OpenBLAS" && \
     export CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" && \
-    pip3 install --no-cache-dir --force-reinstall --no-binary llama-cpp-python llama-cpp-python
+    pip install --no-cache-dir --force-reinstall --no-binary llama-cpp-python llama-cpp-python==0.2.0
 
-# 6. Finalize and launch the app
-RUN echo "### STEP 5: Finalizing Docker image"
+# STEP 6: Finalize and launch the application
+RUN echo "### STEP 6: Finalizing Docker image"
 EXPOSE 7860
 CMD ["python3", "app.py"]
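Note: a minimal sketch of how the runtime consumes this cache layout, assuming huggingface_hub is installed via requirements.txt; the fetch_weights name and its arguments are illustrative, mirroring the app's ensure_weights helper rather than reproducing it.

from huggingface_hub import hf_hub_download

def fetch_weights(model_repo, model_file, clip_repo, clip_file):
    # With HF_HOME=/tmp/.cache/huggingface (set in the Dockerfile above), both
    # downloads land in a writable cache, so the app can use the returned paths
    # directly instead of symlinking or copying files into the working directory.
    model_path = hf_hub_download(repo_id=model_repo, filename=model_file)
    clip_path = hf_hub_download(repo_id=clip_repo, filename=clip_file)
    return model_path, clip_path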
app.py CHANGED
@@ -12,7 +12,6 @@ import gc
 import io
 from contextlib import redirect_stdout, redirect_stderr
 import sys, llama_cpp
-import shutil
 
 # ----------------------------------------
 # Model configurations: per-size prefixes and repos
@@ -53,27 +52,12 @@ model_cache = {
     'llm': None
 }
 
-# Helper to download & symlink weights
+# Helper to download weights and return their cache paths
 def ensure_weights(cfg, model_file, clip_file):
-    # download into HF cache (now in /tmp/.cache)
-    path = hf_hub_download(repo_id=cfg['model_repo'], filename=model_file)
-
-    # try to link into your working dir, else copy
-    try:
-        os.symlink(path, model_file)
-    except (PermissionError, OSError):
-        print(f"⚠️ symlink failed, copying {path} → {model_file}")
-        shutil.copy2(path, model_file)
-
-    # repeat for clip_file…
-    clip_path = hf_hub_download(repo_id=cfg['clip_repo'], filename=clip_file)
-    try:
-        os.symlink(clip_path, clip_file)
-    except (PermissionError, OSError):
-        print(f"⚠️ symlink failed, copying {clip_path} → {clip_file}")
-        shutil.copy2(clip_path, clip_file)
-
-    return model_file, clip_file
+    # Download model and clip into HF cache (writable, e.g. /tmp/.cache)
+    model_path = hf_hub_download(repo_id=cfg['model_repo'], filename=model_file)
+    clip_path = hf_hub_download(repo_id=cfg['clip_repo'], filename=clip_file)
+    return model_path, clip_path
 
 # Custom chat handler
 class SmolVLM2ChatHandler(Llava15ChatHandler):
@@ -102,7 +86,7 @@ class SmolVLM2ChatHandler(Llava15ChatHandler):
 # Load and cache LLM (only on dropdown or verbose change)
 def update_llm(size, model_file, clip_file, verbose_mode):
     if (model_cache['size'], model_cache['model_file'], model_cache['clip_file'], model_cache['verbose']) != (size, model_file, clip_file, verbose_mode):
-        mf, cf = ensure_weights(size, model_file, clip_file)
+        mf, cf = ensure_weights(MODELS[size], model_file, clip_file)
         handler = SmolVLM2ChatHandler(clip_model_path=cf, verbose=verbose_mode)
         llm = Llama(
             model_path=mf,
@@ -173,7 +157,6 @@ def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, u
     debug_msgs.append(f"[{timestamp}] CPU count = {os.cpu_count()}")
 
     t_start = time.time()
-    # right before you call the Llama API:
     buf = io.StringIO()
     with redirect_stdout(buf), redirect_stderr(buf):
         resp = model_cache['llm'].create_chat_completion(
@@ -182,7 +165,6 @@
             temperature=0.1,
             stop=["<end_of_utterance>"]
         )
-    # grab every line the Llama client printed
    for line in buf.getvalue().splitlines():
        timestamp = time.strftime('%H:%M:%S')
        debug_msgs.append(f"[{timestamp}] {line}")
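Note: a minimal sketch of the capture pattern kept around the llama-cpp-python call, assuming only that the wrapped callable writes its verbose output through Python's sys.stdout/sys.stderr; the run_and_capture helper is illustrative and not part of the app.

import io
import time
from contextlib import redirect_stdout, redirect_stderr

def run_and_capture(call, *args, **kwargs):
    # Redirect stdout/stderr into a buffer while the call runs, then re-emit
    # every captured line as a timestamped debug message, as caption_frame does.
    buf = io.StringIO()
    with redirect_stdout(buf), redirect_stderr(buf):
        result = call(*args, **kwargs)
    debug_msgs = [
        f"[{time.strftime('%H:%M:%S')}] {line}"
        for line in buf.getvalue().splitlines()
    ]
    return result, debug_msgs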