Update Dockerfile
Dockerfile (+19 -17)
CHANGED
@@ -1,26 +1,28 @@
-FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
 
-ENV VLLM_LOGGING_LEVEL=DEBUG
-ENV HF_HOME=/tmp/.cache/huggingface
-ENV OMP_NUM_THREADS=1
+FROM vllm/vllm-openai:v0.10.0
+# FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
 
-# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
-ENV VLLM_USE_TRITON_FLASH_ATTN=0
+# ENV VLLM_LOGGING_LEVEL=DEBUG
+# ENV HF_HOME=/tmp/.cache/huggingface
+# ENV OMP_NUM_THREADS=1
 
-# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
-# `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
-ENV VLLM_ATTENTION_BACKEND=FLASH_ATTN
+# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
+# ENV VLLM_USE_TRITON_FLASH_ATTN=0
 
-RUN apt-get update && apt-get install -y python3 python3-pip git
-RUN pip3 install --upgrade pip
+# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
+# # `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
+# ENV VLLM_ATTENTION_BACKEND=FLASH_ATTN
 
-# Install vLLM
-RUN pip3 install vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
+# RUN apt-get update && apt-get install -y python3 python3-pip git
+# RUN pip3 install --upgrade pip
 
-RUN useradd -m appuser
-USER appuser
+# # Install vLLM
+# RUN pip3 install vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
 
-RUN mkdir -p /tmp/.cache/huggingface
+# RUN useradd -m appuser
+# USER appuser
+
+# RUN mkdir -p /tmp/.cache/huggingface
 
 # Download at build time,
 # to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
@@ -39,7 +41,7 @@ RUN mkdir -p /tmp/.cache/huggingface
 EXPOSE 7860
 
 # Export for runtime environment
-CMD
+CMD vllm serve \
 --model "meta-llama/Llama-3.2-3B-Instruct" \
 --task generate \
 --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
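Since the Space now runs on the stock vllm/vllm-openai:v0.10.0 image instead of pip-installing vLLM onto a CUDA base, the same container can be exercised locally. A minimal sketch, assuming a CUDA-capable host with the NVIDIA container toolkit; the image tag llama32-vllm is hypothetical, and HF_TOKEN must hold a token with access to the gated meta-llama repo:

# Hypothetical tag; run from the directory containing this Dockerfile.
docker build -t llama32-vllm .

# --gpus all requires the NVIDIA container toolkit.
# HF_TOKEN lets huggingface_hub authenticate the gated Llama download.
docker run --gpus all -p 7860:7860 -e HF_TOKEN="hf_..." llama32-vllm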
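Once running, the container serves vLLM's OpenAI-compatible API. A hedged usage example, assuming the truncated tail of the CMD passes --port 7860 so the server actually listens on the port named by EXPOSE (vLLM's default is 8000, while Hugging Face Spaces expect 7860):

# Query the OpenAI-compatible chat endpoint; the model name matches --model above.
curl http://localhost:7860/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.2-3B-Instruct",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 32
  }'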