yusufs committed · verified
Commit 258633e · Parent: f6ddd47

Update Dockerfile

Files changed (1):
  Dockerfile (+19 −17)
Dockerfile CHANGED

@@ -1,26 +1,28 @@
-FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
-
-ENV VLLM_LOGGING_LEVEL=DEBUG
-ENV HF_HOME=/tmp/.cache/huggingface
-ENV OMP_NUM_THREADS=1
-
-# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
-ENV VLLM_USE_TRITON_FLASH_ATTN=0
-
-# https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
-# `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
-ENV VLLM_ATTENTION_BACKEND=FLASH_ATTN
-
-RUN apt-get update && apt-get install -y python3 python3-pip git
-RUN pip3 install --upgrade pip
-
-# Install vLLM
-RUN pip3 install vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
-
-RUN useradd -m appuser
-USER appuser
-
-RUN mkdir -p /tmp/.cache/huggingface
+FROM vllm/vllm-openai:v0.10.0
+# FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
+
+# ENV VLLM_LOGGING_LEVEL=DEBUG
+# ENV HF_HOME=/tmp/.cache/huggingface
+# ENV OMP_NUM_THREADS=1
+
+# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
+# ENV VLLM_USE_TRITON_FLASH_ATTN=0
+
+# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
+# # `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
+# ENV VLLM_ATTENTION_BACKEND=FLASH_ATTN
+
+# RUN apt-get update && apt-get install -y python3 python3-pip git
+# RUN pip3 install --upgrade pip
+
+# # Install vLLM
+# RUN pip3 install vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu113
+
+# RUN useradd -m appuser
+# USER appuser
+
+# RUN mkdir -p /tmp/.cache/huggingface
+
 
 # Download at build time,
 # to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
@@ -39,7 +41,7 @@ RUN mkdir -p /tmp/.cache/huggingface
 EXPOSE 7860
 
 # Export for runtime environment
-CMD python3 -m vllm.entrypoints.openai.api_server \
+CMD vllm serve \
   --model "meta-llama/Llama-3.2-3B-Instruct" \
   --task generate \
   --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \