yusufs committed
Commit 69dcd65 · verified · 1 parent: 3f315a3

Update Dockerfile

Files changed (1)
  1. Dockerfile +28 -46
Dockerfile CHANGED
@@ -1,38 +1,37 @@
 
-FROM vllm/vllm-openai:v0.10.0
-# FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
+FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
 
-# ENV VLLM_LOGGING_LEVEL=DEBUG
-# ENV HF_HOME=/tmp/.cache/huggingface
-# ENV OMP_NUM_THREADS=1
+ENV VLLM_LOGGING_LEVEL=DEBUG
+ENV HF_HOME=/tmp/.cache/huggingface
+ENV OMP_NUM_THREADS=1
 
-# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
-# ENV VLLM_USE_TRITON_FLASH_ATTN=0
+# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
+ENV VLLM_USE_TRITON_FLASH_ATTN=0
 
-# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
-# # `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
-# # https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
-# # Backend for attention computation
-# # Available options:
-# # - "TORCH_SDPA": use torch.nn.MultiheadAttention
-# # - "FLASH_ATTN": use FlashAttention
-# # - "XFORMERS": use XFormers
-# # - "ROCM_FLASH": use ROCmFlashAttention
-# # - "FLASHINFER": use flashinfer
-# # - "FLASHMLA": use FlashMLA
-# # Choose XFORMERS that most stable for T4
-# ENV VLLM_ATTENTION_BACKEND=XFORMERS
+# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
+# `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
+# https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
+# Backend for attention computation
+# Available options:
+# - "TORCH_SDPA": use torch.nn.MultiheadAttention
+# - "FLASH_ATTN": use FlashAttention
+# - "XFORMERS": use XFormers
+# - "ROCM_FLASH": use ROCmFlashAttention
+# - "FLASHINFER": use flashinfer
+# - "FLASHMLA": use FlashMLA
+# Choose XFORMERS that most stable for T4
+ENV VLLM_ATTENTION_BACKEND=XFORMERS
 
-# RUN apt-get update && apt-get install -y python3 python3-pip git
-# RUN pip3 install --upgrade pip
+RUN apt-get update && apt-get install -y python3 python3-pip git
+RUN pip3 install --upgrade pip
 
-# # Install vLLM
-# RUN pip3 install vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
+# Install vLLM
+RUN pip3 install vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
 
-# RUN useradd -m appuser
-# USER appuser
+RUN useradd -m appuser
+USER appuser
 
-# RUN mkdir -p /tmp/.cache/huggingface
+RUN mkdir -p /tmp/.cache/huggingface
 
 # Download at build time,
 # to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
@@ -51,25 +50,7 @@ FROM vllm/vllm-openai:v0.10.0
 EXPOSE 7860
 
 # Export for runtime environment
-# CMD vllm serve \
-# --model "meta-llama/Llama-3.2-3B-Instruct" \
-# --task generate \
-# --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
-# --code-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
-# --tokenizer-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
-# --seed 42 \
-# --host 0.0.0.0 \
-# --port 7860 \
-# --max-num-batched-tokens 32768 \
-# --max-model-len 32768 \
-# --dtype float16 \
-# --enforce-eager \
-# --gpu-memory-utilization 0.9 \
-# --enable-prefix-caching \
-# --disable-log-requests \
-# --trust-remote-code
-
-CMD python -m vllm.entrypoints.openai.api_server \
+CMD vllm serve \
   --model "meta-llama/Llama-3.2-3B-Instruct" \
   --task generate \
   --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
@@ -86,3 +67,4 @@ CMD python -m vllm.entrypoints.openai.api_server \
   --enable-prefix-caching \
   --disable-log-requests \
   --trust-remote-code
+
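
Both the old python -m vllm.entrypoints.openai.api_server entrypoint and the new vllm serve entrypoint expose the same OpenAI-compatible HTTP API, so clients are not affected by this change. Below is a minimal smoke-test sketch against the running container, assuming the flags left unchanged outside these hunks still bind the server to port 7860 (matching EXPOSE 7860) and that no API key is enforced:

    # Minimal smoke test for the OpenAI-compatible endpoint started by `vllm serve`.
    # Assumptions (not shown in the hunks above): the server listens on
    # 0.0.0.0:7860 and accepts unauthenticated requests.
    import requests

    BASE_URL = "http://localhost:7860/v1"  # EXPOSE 7860 in the Dockerfile

    payload = {
        "model": "meta-llama/Llama-3.2-3B-Instruct",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64,
        "temperature": 0.0,
    }

    resp = requests.post(f"{BASE_URL}/chat/completions", json=payload, timeout=120)
    resp.raise_for_status()
    print(resp.json()["choices"][0]["message"]["content"])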
 
 
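The Dockerfile comments mention downloading the model at build time so a container restart only waits on the docker pull, not on Hugging Face. The RUN step that performs that download sits outside the hunks shown here; the sketch below only illustrates how such a prefetch could look with huggingface_hub, pinned to the same revision that is passed to vllm serve. The repo id and revision come from the diff; everything else is an assumption.

    # Hypothetical sketch of a build-time prefetch, not the actual RUN step
    # from this Dockerfile (that step is outside the diff context above).
    from huggingface_hub import snapshot_download

    snapshot_download(
        repo_id="meta-llama/Llama-3.2-3B-Instruct",
        revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
        # With HF_HOME=/tmp/.cache/huggingface set in the image, the snapshot
        # lands in the cache vLLM reads at startup. A gated model like this one
        # also needs a Hugging Face token available at build time.
    )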