(feat:vllm serve) Dockerfile

Dockerfile  CHANGED  +22 -24
@@ -1,23 +1,7 @@
-FROM
+FROM docker.io/vllm/vllm-openai:v0.10.0
 
-RUN apt-get update && \
-    apt-get install -y \
-    git \
-    curl \
-    wget \
-    unzip \
-    gcc
-
-RUN useradd -m -u 1000 user
-USER user
-ENV PATH="/home/user/.local/bin:$PATH"
-
-WORKDIR /app
-
-COPY --chown=user ./requirements.txt requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113
-
-COPY --chown=user . /app
+ENV MODEL_NAME="meta-llama/Llama-3.2-3B-Instruct"
+ENV MODEL_REV="0cb88a4f764b7a12671c53f0838cd831a0843b95"
 
 # Download at build time,
 # to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
@@ -35,8 +19,22 @@ COPY --chown=user . /app
 
 EXPOSE 7860
 
-
-
-
-
-
+CMD [
+    "vllm", "serve",
+    "--model", "$MODEL_NAME",
+    "--task", "generate",
+    "--revision", "$MODEL_REV",
+    "--code-revision", "$MODEL_REV",
+    "--tokenizer-revision", "$MODEL_REV",
+    "--seed", "42",
+    "--host", "0.0.0.0",
+    "--port", "7860",
+    "--max-num-batched-tokens", "32768",
+    "--max-model-len", "32768",
+    "--dtype", "float16",
+    "--enforce-eager",
+    "--gpu-memory-utilization", "0.9",
+    "--enable-prefix-caching",
+    "--disable-log-requests",
+    "--trust-remote-code"
+]
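
Note on the new CMD: Docker's exec form (the JSON-array form used above) does not invoke a shell, so "$MODEL_NAME" and "$MODEL_REV" are passed to vllm literally rather than expanded from the ENV values set earlier in the file. A shell-form CMD is one way to get the expansion; a minimal sketch under that assumption, with the model passed positionally as in vllm serve's documented usage:

    # Sketch, not the committed change: shell form runs via /bin/sh -c,
    # so $MODEL_NAME and $MODEL_REV are expanded before vllm parses its args.
    CMD vllm serve "$MODEL_NAME" \
        --task generate \
        --revision "$MODEL_REV" \
        --code-revision "$MODEL_REV" \
        --tokenizer-revision "$MODEL_REV" \
        --seed 42 \
        --host 0.0.0.0 \
        --port 7860 \
        --max-num-batched-tokens 32768 \
        --max-model-len 32768 \
        --dtype float16 \
        --enforce-eager \
        --gpu-memory-utilization 0.9 \
        --enable-prefix-caching \
        --disable-log-requests \
        --trust-remote-code

If the vllm/vllm-openai base image defines an ENTRYPOINT, any CMD is appended to it as arguments rather than run on its own, so that interaction is also worth checking (docker inspect shows both) before relying on either form.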
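Once the Space is running, the container serves vLLM's OpenAI-compatible API on the exposed port 7860. A quick smoke test, assuming the server is reachable at localhost:7860 (substitute the Space's URL as appropriate):

    # List served models; the id should match MODEL_NAME.
    curl http://localhost:7860/v1/models

    # Minimal chat completion against the OpenAI-compatible endpoint.
    curl http://localhost:7860/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{
            "model": "meta-llama/Llama-3.2-3B-Instruct",
            "messages": [{"role": "user", "content": "Hello!"}],
            "max_tokens": 64
          }'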