FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04

ENV VLLM_LOGGING_LEVEL=DEBUG
ENV HF_HOME=/tmp/.cache/huggingface
ENV OMP_NUM_THREADS=1

# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
ENV VLLM_USE_TRITON_FLASH_ATTN=0

# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
# `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
# https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
# Backend for attention computation
# Available options:
# - "TORCH_SDPA": use torch.nn.MultiheadAttention
# - "FLASH_ATTN": use FlashAttention
# - "XFORMERS": use XFormers
# - "ROCM_FLASH": use ROCmFlashAttention
# - "FLASHINFER": use flashinfer
# - "FLASHMLA": use FlashMLA
# Choose XFORMERS, the most stable option on a T4
ENV VLLM_ATTENTION_BACKEND=XFORMERS
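# If another backend needs to be tried without rebuilding, the ENV above can be
# overridden at run time, e.g. (sketch, assuming a plain `docker run` deployment):
#   docker run -e VLLM_ATTENTION_BACKEND=FLASH_ATTN <image>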

# Set DEBIAN_FRONTEND to avoid interactive prompts during apt installs
ENV DEBIAN_FRONTEND=noninteractive

# Update the package list and install necessary dependencies
RUN apt-get update && \
    apt-get install -y software-properties-common

# Add the 'deadsnakes' PPA to get Python 3.12
RUN add-apt-repository ppa:deadsnakes/ppa

# Update the package list again to include the new repository
RUN apt-get update

# Install Python 3.12, pip, and other necessary development tools
RUN apt-get install -y tzdata git python3.12 python3.12-venv python3.12-dev

# Register 'python' and 'pip' with update-alternatives so the bare commands work
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 && \
    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
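# Note: this assumes /usr/bin/pip3 exists, and it may still belong to the system Python
# (3.10 on Ubuntu 22.04). A sketch of an alternative that ties pip to Python 3.12 itself:
#   RUN python3.12 -m ensurepip --upgrade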

RUN python --version && python -m pip --version
    
# RUN apt-get update && apt-get install -y python3 python3-pip git
# RUN pip install --upgrade pip

# Install vLLM
RUN pip install vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
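# A quick sanity check (sketch) that the interpreter pip installed into can import vLLM,
# and which CUDA build of torch was pulled in alongside it:
#   RUN python -c "import vllm, torch; print(vllm.__version__, torch.version.cuda)"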

RUN useradd -m appuser
USER appuser

RUN mkdir -p /tmp/.cache/huggingface

# Download the model at build time,
# so that a restart only waits for the docker pull, not for the download from HF.
# In Docker Spaces, the secrets management is different for security reasons.
# Once you create a secret in the Settings tab,
# you can expose the secret by adding the following line in your Dockerfile:
#
# For example, if SECRET_EXAMPLE is the name of the secret you created in the Settings tab,
# you can read it at build time by mounting it to a file, then reading it with $(cat /run/secrets/SECRET_EXAMPLE).
# https://huggingface.co/docs/hub/en/spaces-sdks-docker#buildtime
#
# AFTER TRIAL AND ERROR WE GOT 16GB (16431849854 bytes) OF LAYERS :(
#
# RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true HF_TOKEN=$(cat /run/secrets/HF_TOKEN) python /app/download_model.py
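# A minimal sketch of what a /app/download_model.py helper could look like (hypothetical,
# not included in this image) -- it only pre-populates HF_HOME so startup skips the download:
#   from huggingface_hub import snapshot_download
#   snapshot_download("meta-llama/Llama-3.2-3B-Instruct",
#                     revision="0cb88a4f764b7a12671c53f0838cd831a0843b95")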

EXPOSE 7860

# Start the vLLM OpenAI-compatible server at runtime
# (`vllm serve` expects the model as a positional argument, not via `--model`).
CMD vllm serve "meta-llama/Llama-3.2-3B-Instruct" \
  --task generate \
  --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
  --code-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
  --tokenizer-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
  --seed 42 \
  --host 0.0.0.0 \
  --port 7860 \
  --max-num-batched-tokens 32768 \
  --max-model-len 32768 \
  --dtype float16 \
  --enforce-eager \
  --gpu-memory-utilization 0.9 \
  --enable-prefix-caching \
  --disable-log-requests \
  --trust-remote-code
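
# Once the container is up, the server exposes vLLM's OpenAI-compatible API on port 7860.
# Example request (sketch, run from outside the container):
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hello"}]}'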