yusufs committed
Commit b983fc2 · verified · 1 Parent(s): 258633e

Update Dockerfile

Files changed (1)
  1. Dockerfile +12 -1
Dockerfile CHANGED
@@ -2,6 +2,7 @@
 FROM vllm/vllm-openai:v0.10.0
 # FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04
 
+ENV OMP_NUM_THREADS=2
 # ENV VLLM_LOGGING_LEVEL=DEBUG
 # ENV HF_HOME=/tmp/.cache/huggingface
 # ENV OMP_NUM_THREADS=1
@@ -11,7 +12,17 @@ FROM vllm/vllm-openai:v0.10.0
 
 # # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
 # # `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
-# ENV VLLM_ATTENTION_BACKEND=FLASH_ATTN
+# # https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
+# # Backend for attention computation
+# # Available options:
+# # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+# # - "FLASH_ATTN": use FlashAttention
+# # - "XFORMERS": use XFormers
+# # - "ROCM_FLASH": use ROCmFlashAttention
+# # - "FLASHINFER": use flashinfer
+# # - "FLASHMLA": use FlashMLA
+# # Choose XFORMERS that most stable for T4
+# ENV VLLM_ATTENTION_BACKEND=XFORMERS
 
 # RUN apt-get update && apt-get install -y python3 python3-pip git
 # RUN pip3 install --upgrade pip
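
For orientation, below is a minimal sketch of what the Dockerfile looks like after this commit, assuming the lines outside the two hunks above are unchanged. Only the active instructions are shown; the VLLM_ATTENTION_BACKEND line is still commented out in the committed file and appears here purely to illustrate the XFORMERS choice discussed in the comments.

    # Base image pinned by the project.
    FROM vllm/vllm-openai:v0.10.0

    # Added in this commit: cap OpenMP at two threads per process.
    ENV OMP_NUM_THREADS=2

    # Still commented out in the committed file; XFORMERS is the backend
    # the comments recommend as the most stable choice on a T4 GPU.
    # ENV VLLM_ATTENTION_BACKEND=XFORMERS

Since the vllm/vllm-openai base image already ships the OpenAI-compatible vLLM server, building this file with `docker build -t <tag> .` and running the result with GPU access is typically all that is needed; the ENV values are baked into the image and picked up by vLLM at startup.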