yusufs committed
Commit 8f6f4c2 · verified · 1 Parent(s): 1dd5d05

Update Dockerfile

Files changed (1): Dockerfile +234 -210
Dockerfile CHANGED
@@ -1,215 +1,9 @@
 
- # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
- FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04
+ FROM vllm/vllm-openai:v0.10.0
 
- ENV VLLM_LOGGING_LEVEL=DEBUG
- ENV HF_HOME=/tmp/.cache/huggingface
- ENV OMP_NUM_THREADS=1
-
- # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
- ENV VLLM_USE_TRITON_FLASH_ATTN=0
-
- # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
- # `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
- # https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
- # Backend for attention computation
- # Available options:
- # - "TORCH_SDPA": use torch.nn.MultiheadAttention
- # - "FLASH_ATTN": use FlashAttention
- # - "XFORMERS": use XFormers
- # - "ROCM_FLASH": use ROCmFlashAttention
- # - "FLASHINFER": use flashinfer
- # - "FLASHMLA": use FlashMLA
- # Choose XFORMERS, the most stable backend for T4
- ENV VLLM_ATTENTION_BACKEND=XFORMERS
-
- # Set environment variables for the xformers build
- ENV TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
- ENV MAX_JOBS=16
-
- # Set environment variables to avoid interactive prompts
- ENV DEBIAN_FRONTEND=noninteractive
-
- # Update the package list and install necessary dependencies
- RUN apt-get update && \
- apt-get install -y software-properties-common
-
- # Add the 'deadsnakes' PPA to get Python 3.12
- RUN add-apt-repository ppa:deadsnakes/ppa
-
- # Update the package list again to include the new repository
- RUN apt-get update
-
- # Install Python 3.12, pip, and other necessary development tools
- RUN apt-get install -y tzdata git curl python3.12 python3.12-venv python3.12-dev python3.12-full python3-pip python3-setuptools
-
- # RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
- # RUN python3.12 get-pip.py
-
- RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 \
- && update-alternatives --set python3 /usr/bin/python3.12 \
- && ln -sf /usr/bin/python3.12-config /usr/bin/python3-config
-
-
- RUN python3 --version && python3 -m pip --version
-
- # RUN apt-get update && apt-get install -y python3 python3-pip git
- # RUN pip install --upgrade pip
-
- # Install uv for faster installs
- RUN python3 -m pip install uv --break-system-packages
-
- # Install vLLM
- RUN uv pip install --system --index-strategy unsafe-best-match vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu128
-
-
-
- # RUN uv pip install --system vllm==0.10.0 torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
-
-
- # Downgrade triton because the following error occurred when using triton==3.3.1
- # https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
- # https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
- # /usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py:36:0: error: Failures have been detected while processing an MLIR pass pipeline
- # /usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py:36:0: note: Pipeline failed while executing [`ConvertTritonGPUToLLVM` on 'builtin.module' operation]: reproducer generated at `std::errs, please share the reproducer above with Triton project.`
- # INFO: 10.16.9.222:28100 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
- # ERROR 08-06 19:13:13 [engine.py:165] RuntimeError('PassManager::run failed')
- # ERROR 08-06 19:13:13 [engine.py:165] Traceback (most recent call last):
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 163, in start
- # ERROR 08-06 19:13:13 [engine.py:165] self.run_engine_loop()
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 226, in run_engine_loop
- # ERROR 08-06 19:13:13 [engine.py:165] request_outputs = self.engine_step()
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 252, in engine_step
- # ERROR 08-06 19:13:13 [engine.py:165] raise e
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 235, in engine_step
- # ERROR 08-06 19:13:13 [engine.py:165] return self.engine.step()
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 1334, in step
- # ERROR 08-06 19:13:13 [engine.py:165] outputs = self.model_executor.execute_model(
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 146, in execute_model
- # ERROR 08-06 19:13:13 [engine.py:165] output = self.collective_rpc("execute_model",
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 58, in collective_rpc
- # ERROR 08-06 19:13:13 [engine.py:165] answer = run_method(self.driver_worker, method, args, kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 2985, in run_method
- # ERROR 08-06 19:13:13 [engine.py:165] return func(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 417, in execute_model
- # ERROR 08-06 19:13:13 [engine.py:165] output = self.model_runner.execute_model(
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
- # ERROR 08-06 19:13:13 [engine.py:165] return func(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1703, in execute_model
- # ERROR 08-06 19:13:13 [engine.py:165] hidden_or_intermediate_states = model_executable(
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
- # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
- # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 584, in forward
- # ERROR 08-06 19:13:13 [engine.py:165] model_output = self.model(input_ids, positions, intermediate_tensors,
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 206, in __call__
- # ERROR 08-06 19:13:13 [engine.py:165] return self.forward(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 392, in forward
- # ERROR 08-06 19:13:13 [engine.py:165] hidden_states, residual = layer(positions, hidden_states, residual)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
- # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
- # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 305, in forward
- # ERROR 08-06 19:13:13 [engine.py:165] hidden_states = self.self_attn(positions=positions,
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
- # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
- # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 203, in forward
- # ERROR 08-06 19:13:13 [engine.py:165] attn_output = self.attn(q, k, v)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
- # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
- # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/layer.py", line 288, in forward
- # ERROR 08-06 19:13:13 [engine.py:165] return torch.ops.vllm.unified_attention(
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1158, in __call__
- # ERROR 08-06 19:13:13 [engine.py:165] return self._op(*args, **(kwargs or {}))
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/layer.py", line 448, in unified_attention
- # ERROR 08-06 19:13:13 [engine.py:165] output = self.impl.forward(self, query, key, value, kv_cache,
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/backends/xformers.py", line 584, in forward
- # ERROR 08-06 19:13:13 [engine.py:165] out = PagedAttention.forward_prefix(
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/ops/paged_attn.py", line 214, in forward_prefix
- # ERROR 08-06 19:13:13 [engine.py:165] context_attention_fwd(
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
- # ERROR 08-06 19:13:13 [engine.py:165] return func(*args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py", line 850, in context_attention_fwd
- # ERROR 08-06 19:13:13 [engine.py:165] _fwd_kernel[grid](
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 347, in <lambda>
- # ERROR 08-06 19:13:13 [engine.py:165] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 569, in run
- # ERROR 08-06 19:13:13 [engine.py:165] kernel = self.compile(src, target=target, options=options.__dict__)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/compiler/compiler.py", line 284, in compile
- # ERROR 08-06 19:13:13 [engine.py:165] next_module = compile_ir(module, metadata)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 450, in <lambda>
- # ERROR 08-06 19:13:13 [engine.py:165] stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, capability)
- # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 341, in make_llir
- # ERROR 08-06 19:13:13 [engine.py:165] pm.run(mod)
- # ERROR 08-06 19:13:13 [engine.py:165] RuntimeError: PassManager::run failed
- # DEBUG 08-06 19:13:13 [engine.py:169] MQLLMEngine is shut down.
- # INFO: Shutting down
- # INFO: Waiting for application shutdown.
- # INFO: Application shutdown complete.
- # INFO: Finished server process [27]
- # RUN uv pip install --system --index-strategy unsafe-best-match triton==3.2 --extra-index-url https://download.pytorch.org/whl/cu128
-
- # # Then, install xformers with the --no-build-isolation flag
- # RUN uv pip install --system \
- # --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31"
-
- RUN useradd -m appuser
- USER appuser
-
- RUN mkdir -p /tmp/.cache/huggingface
-
- # Download at build time,
- # to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
- # In Docker Spaces, the secrets management is different for security reasons.
- # Once you create a secret in the Settings tab,
- # you can expose the secret by adding the following line in your Dockerfile:
- #
- # For example, if SECRET_EXAMPLE is the name of the secret you created in the Settings tab,
- # you can read it at build time by mounting it to a file, then reading it with $(cat /run/secrets/SECRET_EXAMPLE).
- # https://huggingface.co/docs/hub/en/spaces-sdks-docker#buildtime
- #
- # AFTER TRIAL AND ERROR WE GOT 16GB (16431849854 bytes) OF LAYERS :(
- #
- # RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true HF_TOKEN=$(cat /run/secrets/HF_TOKEN) python /app/download_model.py
-
- EXPOSE 7860
+ # e.g. install the `audio` optional dependencies
+ # NOTE: Make sure the version of vLLM matches the base image!
+ RUN uv pip install --system vllm[audio]==0.10.0
 
  # Export for runtime environment
  CMD vllm serve "meta-llama/Llama-3.2-3B-Instruct" \
@@ -229,3 +23,233 @@ CMD vllm serve "meta-llama/Llama-3.2-3B-Instruct" \
  --disable-log-requests \
  --trust-remote-code
 
+ # # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
+ # FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04
+
+ # ENV VLLM_LOGGING_LEVEL=DEBUG
+ # ENV HF_HOME=/tmp/.cache/huggingface
+ # ENV OMP_NUM_THREADS=1
+
+ # # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
+ # ENV VLLM_USE_TRITON_FLASH_ATTN=0
+
+ # # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
+ # # `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
+ # # https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
+ # # Backend for attention computation
+ # # Available options:
+ # # - "TORCH_SDPA": use torch.nn.MultiheadAttention
+ # # - "FLASH_ATTN": use FlashAttention
+ # # - "XFORMERS": use XFormers
+ # # - "ROCM_FLASH": use ROCmFlashAttention
+ # # - "FLASHINFER": use flashinfer
+ # # - "FLASHMLA": use FlashMLA
+ # # Choose XFORMERS, the most stable backend for T4
+ # ENV VLLM_ATTENTION_BACKEND=XFORMERS
+
+ # # Set environment variables for the xformers build
+ # ENV TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
+ # ENV MAX_JOBS=16
+
+ # # Set environment variables to avoid interactive prompts
+ # ENV DEBIAN_FRONTEND=noninteractive
+
+ # # Update the package list and install necessary dependencies
+ # RUN apt-get update && \
+ # apt-get install -y software-properties-common
+
+ # # Add the 'deadsnakes' PPA to get Python 3.12
+ # RUN add-apt-repository ppa:deadsnakes/ppa
+
+ # # Update the package list again to include the new repository
+ # RUN apt-get update
+
+ # # Install Python 3.12, pip, and other necessary development tools
+ # RUN apt-get install -y tzdata git curl python3.12 python3.12-venv python3.12-dev python3.12-full python3-pip python3-setuptools
+
+ # # RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
+ # # RUN python3.12 get-pip.py
+
+ # RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 \
+ # && update-alternatives --set python3 /usr/bin/python3.12 \
+ # && ln -sf /usr/bin/python3.12-config /usr/bin/python3-config
+
+
+ # RUN python3 --version && python3 -m pip --version
+
+ # # RUN apt-get update && apt-get install -y python3 python3-pip git
+ # # RUN pip install --upgrade pip
+
+ # # Install uv for faster installs
+ # RUN python3 -m pip install uv --break-system-packages
+
+ # # Install vLLM
+ # RUN uv pip install --system --index-strategy unsafe-best-match vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu128
+
+
+
+ # # RUN uv pip install --system vllm==0.10.0 torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
+
+
+ # # Downgrade triton because the following error occurred when using triton==3.3.1
+ # # https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
+ # # https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
+ # # /usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py:36:0: error: Failures have been detected while processing an MLIR pass pipeline
+ # # /usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py:36:0: note: Pipeline failed while executing [`ConvertTritonGPUToLLVM` on 'builtin.module' operation]: reproducer generated at `std::errs, please share the reproducer above with Triton project.`
+ # # INFO: 10.16.9.222:28100 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
+ # # ERROR 08-06 19:13:13 [engine.py:165] RuntimeError('PassManager::run failed')
+ # # ERROR 08-06 19:13:13 [engine.py:165] Traceback (most recent call last):
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 163, in start
+ # # ERROR 08-06 19:13:13 [engine.py:165] self.run_engine_loop()
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 226, in run_engine_loop
+ # # ERROR 08-06 19:13:13 [engine.py:165] request_outputs = self.engine_step()
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 252, in engine_step
+ # # ERROR 08-06 19:13:13 [engine.py:165] raise e
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 235, in engine_step
+ # # ERROR 08-06 19:13:13 [engine.py:165] return self.engine.step()
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 1334, in step
+ # # ERROR 08-06 19:13:13 [engine.py:165] outputs = self.model_executor.execute_model(
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 146, in execute_model
+ # # ERROR 08-06 19:13:13 [engine.py:165] output = self.collective_rpc("execute_model",
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 58, in collective_rpc
+ # # ERROR 08-06 19:13:13 [engine.py:165] answer = run_method(self.driver_worker, method, args, kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 2985, in run_method
+ # # ERROR 08-06 19:13:13 [engine.py:165] return func(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 417, in execute_model
+ # # ERROR 08-06 19:13:13 [engine.py:165] output = self.model_runner.execute_model(
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+ # # ERROR 08-06 19:13:13 [engine.py:165] return func(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1703, in execute_model
+ # # ERROR 08-06 19:13:13 [engine.py:165] hidden_or_intermediate_states = model_executable(
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ # # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ # # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 584, in forward
+ # # ERROR 08-06 19:13:13 [engine.py:165] model_output = self.model(input_ids, positions, intermediate_tensors,
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 206, in __call__
+ # # ERROR 08-06 19:13:13 [engine.py:165] return self.forward(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 392, in forward
+ # # ERROR 08-06 19:13:13 [engine.py:165] hidden_states, residual = layer(positions, hidden_states, residual)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ # # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ # # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 305, in forward
+ # # ERROR 08-06 19:13:13 [engine.py:165] hidden_states = self.self_attn(positions=positions,
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ # # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ # # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 203, in forward
+ # # ERROR 08-06 19:13:13 [engine.py:165] attn_output = self.attn(q, k, v)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+ # # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+ # # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/layer.py", line 288, in forward
+ # # ERROR 08-06 19:13:13 [engine.py:165] return torch.ops.vllm.unified_attention(
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1158, in __call__
+ # # ERROR 08-06 19:13:13 [engine.py:165] return self._op(*args, **(kwargs or {}))
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/layer.py", line 448, in unified_attention
+ # # ERROR 08-06 19:13:13 [engine.py:165] output = self.impl.forward(self, query, key, value, kv_cache,
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/backends/xformers.py", line 584, in forward
+ # # ERROR 08-06 19:13:13 [engine.py:165] out = PagedAttention.forward_prefix(
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/ops/paged_attn.py", line 214, in forward_prefix
+ # # ERROR 08-06 19:13:13 [engine.py:165] context_attention_fwd(
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
+ # # ERROR 08-06 19:13:13 [engine.py:165] return func(*args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py", line 850, in context_attention_fwd
+ # # ERROR 08-06 19:13:13 [engine.py:165] _fwd_kernel[grid](
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 347, in <lambda>
+ # # ERROR 08-06 19:13:13 [engine.py:165] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 569, in run
+ # # ERROR 08-06 19:13:13 [engine.py:165] kernel = self.compile(src, target=target, options=options.__dict__)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/compiler/compiler.py", line 284, in compile
+ # # ERROR 08-06 19:13:13 [engine.py:165] next_module = compile_ir(module, metadata)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 450, in <lambda>
+ # # ERROR 08-06 19:13:13 [engine.py:165] stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, capability)
+ # # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ # # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 341, in make_llir
+ # # ERROR 08-06 19:13:13 [engine.py:165] pm.run(mod)
+ # # ERROR 08-06 19:13:13 [engine.py:165] RuntimeError: PassManager::run failed
+ # # DEBUG 08-06 19:13:13 [engine.py:169] MQLLMEngine is shut down.
+ # # INFO: Shutting down
+ # # INFO: Waiting for application shutdown.
+ # # INFO: Application shutdown complete.
+ # # INFO: Finished server process [27]
+ # # RUN uv pip install --system --index-strategy unsafe-best-match triton==3.2 --extra-index-url https://download.pytorch.org/whl/cu128
+
+ # # # Then, install xformers with the --no-build-isolation flag
+ # # RUN uv pip install --system \
+ # # --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31"
+
+ # RUN useradd -m appuser
+ # USER appuser
+
+ # RUN mkdir -p /tmp/.cache/huggingface
+
+ # # Download at build time,
+ # # to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
+ # # In Docker Spaces, the secrets management is different for security reasons.
+ # # Once you create a secret in the Settings tab,
+ # # you can expose the secret by adding the following line in your Dockerfile:
+ # #
+ # # For example, if SECRET_EXAMPLE is the name of the secret you created in the Settings tab,
+ # # you can read it at build time by mounting it to a file, then reading it with $(cat /run/secrets/SECRET_EXAMPLE).
+ # # https://huggingface.co/docs/hub/en/spaces-sdks-docker#buildtime
+ # #
+ # # AFTER TRIAL AND ERROR WE GOT 16GB (16431849854 bytes) OF LAYERS :(
+ # #
+ # # RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true HF_TOKEN=$(cat /run/secrets/HF_TOKEN) python /app/download_model.py
+
+ # EXPOSE 7860
+
+ # # Export for runtime environment
+ # CMD vllm serve "meta-llama/Llama-3.2-3B-Instruct" \
+ # --task generate \
+ # --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
+ # --code-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
+ # --tokenizer-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
+ # --seed 42 \
+ # --host 0.0.0.0 \
+ # --port 7860 \
+ # --max-num-batched-tokens 32768 \
+ # --max-model-len 32768 \
+ # --dtype float16 \
+ # --enforce-eager \
+ # --gpu-memory-utilization 0.9 \
+ # --enable-prefix-caching \
+ # --disable-log-requests \
+ # --trust-remote-code
+
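
A minimal usage sketch for the image this Dockerfile builds (not part of the commit). It assumes the Dockerfile sits in the current build context and that the CMD keeps the old --host 0.0.0.0 --port 7860 flags shown in the commented block above; the vllm-space tag and the request body are illustrative.

# Build; the heavy CUDA/vLLM layers come from the vllm/vllm-openai:v0.10.0 base image.
docker build -t vllm-space .

# Run with GPU access. HF_TOKEN is only needed for gated models such as meta-llama/Llama-3.2-3B-Instruct.
docker run --gpus all -p 7860:7860 -e HF_TOKEN=<your-hf-token> vllm-space

# Smoke-test the OpenAI-compatible API once the model has finished loading.
curl http://localhost:7860/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hello"}]}'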