Update Dockerfile
Dockerfile CHANGED (+28 -46)
@@ -1,38 +1,37 @@
 
-FROM vllm/vllm-openai:v0.10.0
-# FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
+FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
 
-
-
-
+ENV VLLM_LOGGING_LEVEL=DEBUG
+ENV HF_HOME=/tmp/.cache/huggingface
+ENV OMP_NUM_THREADS=1
 
-#
-
+# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
+ENV VLLM_USE_TRITON_FLASH_ATTN=0
 
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-#
-
+# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
+# `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
+# https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
+# Backend for attention computation
+# Available options:
+# - "TORCH_SDPA": use torch.nn.MultiheadAttention
+# - "FLASH_ATTN": use FlashAttention
+# - "XFORMERS": use XFormers
+# - "ROCM_FLASH": use ROCmFlashAttention
+# - "FLASHINFER": use flashinfer
+# - "FLASHMLA": use FlashMLA
+# Choose XFORMERS that most stable for T4
+ENV VLLM_ATTENTION_BACKEND=XFORMERS
 
-
-
+RUN apt-get update && apt-get install -y python3 python3-pip git
+RUN pip3 install --upgrade pip
 
-#
-
+# Install vLLM
+RUN pip3 install vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
 
-
-
+RUN useradd -m appuser
+USER appuser
 
-
+RUN mkdir -p /tmp/.cache/huggingface
 
 # Download at build time,
 # to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
@@ -51,25 +50,7 @@ FROM vllm/vllm-openai:v0.10.0
 EXPOSE 7860
 
 # Export for runtime environment
-
-# --model "meta-llama/Llama-3.2-3B-Instruct" \
-# --task generate \
-# --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
-# --code-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
-# --tokenizer-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
-# --seed 42 \
-# --host 0.0.0.0 \
-# --port 7860 \
-# --max-num-batched-tokens 32768 \
-# --max-model-len 32768 \
-# --dtype float16 \
-# --enforce-eager \
-# --gpu-memory-utilization 0.9 \
-# --enable-prefix-caching \
-# --disable-log-requests \
-# --trust-remote-code
-
-CMD python -m vllm.entrypoints.openai.api_server \
+CMD vllm serve \
 --model "meta-llama/Llama-3.2-3B-Instruct" \
 --task generate \
 --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
@@ -86,3 +67,4 @@ CMD python -m vllm.entrypoints.openai.api_server \
 --enable-prefix-caching \
 --disable-log-requests \
 --trust-remote-code
+
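The pre-download step referenced by the "# Download at build time" comment (lines 38-49) is unchanged in this commit and therefore collapsed out of the diff. A minimal sketch of what such a step can look like, assuming it uses huggingface_hub's snapshot_download (pulled in as a vLLM dependency) and that the gated meta-llama repository is reachable at build time; these exact lines are not shown above, so treat this as an illustration, not the file's actual content:

# Hypothetical pre-download (not part of this commit): populate the
# HF_HOME=/tmp/.cache/huggingface cache so a Space restart only waits
# for the docker pull, not for the model download from HF.
# A gated repo such as meta-llama/Llama-3.2-3B-Instruct also needs a
# valid HF token to be available during the build.
RUN python3 -c "from huggingface_hub import snapshot_download; \
    snapshot_download(repo_id='meta-llama/Llama-3.2-3B-Instruct', \
                      revision='0cb88a4f764b7a12671c53f0838cd831a0843b95')"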
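Once the container is up, the server started by the new CMD speaks vLLM's OpenAI-compatible REST API. A quick smoke test against the exposed port, assuming the unchanged flags hidden in this diff still bind the server to 0.0.0.0:7860 as the previous revision did (the request below is illustrative only):

curl http://localhost:7860/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "messages": [{"role": "user", "content": "Say hello in one short sentence."}],
        "max_tokens": 64
      }'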