FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
ENV VLLM_LOGGING_LEVEL=DEBUG
ENV HF_HOME=/tmp/.cache/huggingface
ENV OMP_NUM_THREADS=1
# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
ENV VLLM_USE_TRITON_FLASH_ATTN=0
# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
# `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
# https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
# Backend for attention computation
# Available options:
# - "TORCH_SDPA": use torch.nn.MultiheadAttention
# - "FLASH_ATTN": use FlashAttention
# - "XFORMERS": use XFormers
# - "ROCM_FLASH": use ROCmFlashAttention
# - "FLASHINFER": use flashinfer
# - "FLASHMLA": use FlashMLA
# Choose XFORMERS, which is the most stable backend on a T4
ENV VLLM_ATTENTION_BACKEND=XFORMERS
# Set environment variables to avoid interactive prompts
ENV DEBIAN_FRONTEND=noninteractive
# Update the package list and install necessary dependencies
RUN apt-get update && \
apt-get install -y software-properties-common
# Add the 'deadsnakes' PPA to get Python 3.12
RUN add-apt-repository ppa:deadsnakes/ppa
# Update the package list again to include the new repository
RUN apt-get update
# Install Python 3.12, pip, and other necessary development tools
RUN apt-get install -y tzdata git python3.12 python3.12-venv python3.12-dev
# Create a symbolic link to make 'python' and 'pip' commands work
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 && \
update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
RUN python --version && python -m pip --version
# RUN apt-get update && apt-get install -y python3 python3-pip git
# RUN pip install --upgrade pip
# Install vLLM
RUN pip install vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
RUN useradd -m appuser
USER appuser
RUN mkdir -p /tmp/.cache/huggingface
# Download the model at build time,
# so that on a restart we only wait for the docker pull, not for the download from HF.
# In Docker Spaces, secrets management works differently for security reasons.
# Once you create a secret in the Settings tab,
# you can expose it by adding a `RUN --mount=type=secret,...` line to your Dockerfile,
# as in the commented-out sketch below.
# For example, if SECRET_EXAMPLE is the name of the secret you created in the Settings tab,
# you can read it at build time by mounting it to a file and reading it with $(cat /run/secrets/SECRET_EXAMPLE).
# https://huggingface.co/docs/hub/en/spaces-sdks-docker#buildtime
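#
# A commented-out sketch of that pattern (SECRET_EXAMPLE is only a placeholder name, not a secret this Space defines):
# RUN --mount=type=secret,id=SECRET_EXAMPLE,mode=0444,required=true \
#     test -s /run/secrets/SECRET_EXAMPLE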
#
# AFTER TRIAL AND ERROR WE GOT 16GB (16431849854 bytes) OF LAYERS :(
#
# RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true HF_TOKEN=$(cat /run/secrets/HF_TOKEN) python /app/download_model.py
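#
# download_model.py is not copied into the image here; a rough equivalent (only a sketch, assuming
# the huggingface_hub package pulled in by vLLM) would snapshot_download the same pinned revision:
# RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
#     python -c "from huggingface_hub import snapshot_download; \
#                snapshot_download('meta-llama/Llama-3.2-3B-Instruct', \
#                                  revision='0cb88a4f764b7a12671c53f0838cd831a0843b95', \
#                                  token=open('/run/secrets/HF_TOKEN').read().strip())"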
EXPOSE 7860
# Start the vLLM OpenAI-compatible server at runtime
# `vllm serve` takes the model as a positional argument
CMD vllm serve "meta-llama/Llama-3.2-3B-Instruct" \
--task generate \
--revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
--code-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
--tokenizer-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
--seed 42 \
--host 0.0.0.0 \
--port 7860 \
--max-num-batched-tokens 32768 \
--max-model-len 32768 \
--dtype float16 \
--enforce-eager \
--gpu-memory-utilization 0.9 \
--enable-prefix-caching \
--disable-log-requests \
--trust-remote-code
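#
# Once the Space is running, the OpenAI-compatible API can be exercised with something like the
# following (<space-host> is a placeholder for the Space's public hostname):
#   curl https://<space-host>/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Llama-3.2-3B-Instruct",
#          "messages": [{"role": "user", "content": "Hello"}],
#          "max_tokens": 32}'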