# https://github.com/vllm-project/vllm/blob/v0.10.0/docker/Dockerfile#L9
FROM vllm/vllm-openai:v0.10.0

ENV VLLM_LOGGING_LEVEL=DEBUG
ENV HF_HOME=/tmp/.cache/huggingface
ENV OMP_NUM_THREADS=1
ENV VLLM_USE_TRITON_FLASH_ATTN=0
ENV VLLM_ATTENTION_BACKEND=XFORMERS

# e.g. install the `audio` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install --system --index-strategy unsafe-best-match vllm[audio]==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu128
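
# Optional sanity check (a sketch, not part of the original build): fail the build
# early if the vLLM version installed above drifts from the version the base image
# was built for. The expected version string is an assumption tied to the 0.10.0 pin.
# RUN python3 -c "import vllm; assert vllm.__version__ == '0.10.0', vllm.__version__"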

# Downgrade triton because the following error occurred when using triton==3.3.1
# https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
# https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
RUN uv pip install --system --index-strategy unsafe-best-match triton==3.2 --extra-index-url https://download.pytorch.org/whl/cu128
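
# Similarly hedged (an assumption, left commented out): confirm the downgraded
# triton is the one the final image actually resolves.
# RUN python3 -c "import triton; print(triton.__version__)"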

# Create a user and group with the specified IDs
RUN groupadd -r myuser --gid 1000 && useradd -r -g myuser --uid 1000 myuser

# Create the home directory and give ownership to 'myuser'
RUN mkdir -p /home/myuser && chown myuser:myuser /home/myuser

# Now, switch to the newly created user
USER myuser

RUN mkdir -p /tmp/.cache/huggingface

ENTRYPOINT ["/bin/bash", "-c", "vllm serve meta-llama/Llama-3.2-3B-Instruct --task generate --revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --code-revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --tokenizer-revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --seed 42 --host 0.0.0.0 --port 7860 --max-num-batched-tokens 32768 --max-model-len 32768 --dtype float16 --enforce-eager --gpu-memory-utilization 0.9 --enable-prefix-caching --disable-log-requests --trust-remote-code"]
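
# Example usage (a sketch; the image tag, GPU flags, and HF_TOKEN handling are
# assumptions, not part of this file):
#   docker build -t vllm-llama32-3b .
#   docker run --gpus all -p 7860:7860 -e HF_TOKEN=<token> vllm-llama32-3b
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Llama-3.2-3B-Instruct", "messages": [{"role": "user", "content": "Hello"}]}'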

# # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
# FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04

# ENV VLLM_LOGGING_LEVEL=DEBUG
# ENV HF_HOME=/tmp/.cache/huggingface
# ENV OMP_NUM_THREADS=1

# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
# ENV VLLM_USE_TRITON_FLASH_ATTN=0

# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
# # `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
# # https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
# # Backend for attention computation
# # Available options:
# # - "TORCH_SDPA": use torch.nn.MultiheadAttention
# # - "FLASH_ATTN": use FlashAttention
# # - "XFORMERS": use XFormers
# # - "ROCM_FLASH": use ROCmFlashAttention
# # - "FLASHINFER": use flashinfer
# # - "FLASHMLA": use FlashMLA
# # Choose XFORMERS, which is the most stable option on T4
# ENV VLLM_ATTENTION_BACKEND=XFORMERS

# # Set environment variables for the xformers build
# ENV TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
# ENV MAX_JOBS=16

# # Set environment variables to avoid interactive prompts
# ENV DEBIAN_FRONTEND=noninteractive

# # Update the package list and install necessary dependencies
# RUN apt-get update && \
#     apt-get install -y software-properties-common

# # Add the 'deadsnakes' PPA to get Python 3.12
# RUN add-apt-repository ppa:deadsnakes/ppa

# # Update the package list again to include the new repository
# RUN apt-get update

# # Install Python 3.12, pip, and other necessary development tools
# RUN apt-get install -y tzdata git curl python3.12 python3.12-venv python3.12-dev python3.12-full python3-pip python3-setuptools

# # RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
# # RUN python3.12 get-pip.py

# RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 \
#     && update-alternatives --set python3 /usr/bin/python3.12 \
#     && ln -sf /usr/bin/python3.12-config /usr/bin/python3-config


# RUN python3 --version && python3 -m pip --version
    
# # RUN apt-get update && apt-get install -y python3 python3-pip git
# # RUN pip install --upgrade pip

# # Install uv for faster install
# RUN python3 -m pip install uv --break-system-packages

# # Install vLLM
# RUN uv pip install --system --index-strategy unsafe-best-match vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu128



# # RUN uv pip install --system vllm==0.10.0 torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113


# # Downgrade triton because the following error occurred when using triton==3.3.1
# # https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
# # https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
# # /usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py:36:0: error: Failures have been detected while processing an MLIR pass pipeline
# # /usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py:36:0: note: Pipeline failed while executing [`ConvertTritonGPUToLLVM` on 'builtin.module' operation]: reproducer generated at `std::errs, please share the reproducer above with Triton project.`
# # INFO:     10.16.9.222:28100 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
# # ERROR 08-06 19:13:13 [engine.py:165] RuntimeError('PassManager::run failed')
# # ERROR 08-06 19:13:13 [engine.py:165] Traceback (most recent call last):
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 163, in start
# # ERROR 08-06 19:13:13 [engine.py:165]     self.run_engine_loop()
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 226, in run_engine_loop
# # ERROR 08-06 19:13:13 [engine.py:165]     request_outputs = self.engine_step()
# # ERROR 08-06 19:13:13 [engine.py:165]                       ^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 252, in engine_step
# # ERROR 08-06 19:13:13 [engine.py:165]     raise e
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 235, in engine_step
# # ERROR 08-06 19:13:13 [engine.py:165]     return self.engine.step()
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 1334, in step
# # ERROR 08-06 19:13:13 [engine.py:165]     outputs = self.model_executor.execute_model(
# # ERROR 08-06 19:13:13 [engine.py:165]               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 146, in execute_model
# # ERROR 08-06 19:13:13 [engine.py:165]     output = self.collective_rpc("execute_model",
# # ERROR 08-06 19:13:13 [engine.py:165]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 58, in collective_rpc
# # ERROR 08-06 19:13:13 [engine.py:165]     answer = run_method(self.driver_worker, method, args, kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 2985, in run_method
# # ERROR 08-06 19:13:13 [engine.py:165]     return func(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 417, in execute_model
# # ERROR 08-06 19:13:13 [engine.py:165]     output = self.model_runner.execute_model(
# # ERROR 08-06 19:13:13 [engine.py:165]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
# # ERROR 08-06 19:13:13 [engine.py:165]     return func(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1703, in execute_model
# # ERROR 08-06 19:13:13 [engine.py:165]     hidden_or_intermediate_states = model_executable(
# # ERROR 08-06 19:13:13 [engine.py:165]                                     ^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
# # ERROR 08-06 19:13:13 [engine.py:165]     return self._call_impl(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
# # ERROR 08-06 19:13:13 [engine.py:165]     return forward_call(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 584, in forward
# # ERROR 08-06 19:13:13 [engine.py:165]     model_output = self.model(input_ids, positions, intermediate_tensors,
# # ERROR 08-06 19:13:13 [engine.py:165]                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 206, in __call__
# # ERROR 08-06 19:13:13 [engine.py:165]     return self.forward(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 392, in forward
# # ERROR 08-06 19:13:13 [engine.py:165]     hidden_states, residual = layer(positions, hidden_states, residual)
# # ERROR 08-06 19:13:13 [engine.py:165]                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
# # ERROR 08-06 19:13:13 [engine.py:165]     return self._call_impl(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
# # ERROR 08-06 19:13:13 [engine.py:165]     return forward_call(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 305, in forward
# # ERROR 08-06 19:13:13 [engine.py:165]     hidden_states = self.self_attn(positions=positions,
# # ERROR 08-06 19:13:13 [engine.py:165]                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
# # ERROR 08-06 19:13:13 [engine.py:165]     return self._call_impl(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
# # ERROR 08-06 19:13:13 [engine.py:165]     return forward_call(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 203, in forward
# # ERROR 08-06 19:13:13 [engine.py:165]     attn_output = self.attn(q, k, v)
# # ERROR 08-06 19:13:13 [engine.py:165]                   ^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
# # ERROR 08-06 19:13:13 [engine.py:165]     return self._call_impl(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
# # ERROR 08-06 19:13:13 [engine.py:165]     return forward_call(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/attention/layer.py", line 288, in forward
# # ERROR 08-06 19:13:13 [engine.py:165]     return torch.ops.vllm.unified_attention(
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1158, in __call__
# # ERROR 08-06 19:13:13 [engine.py:165]     return self._op(*args, **(kwargs or {}))
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/attention/layer.py", line 448, in unified_attention
# # ERROR 08-06 19:13:13 [engine.py:165]     output = self.impl.forward(self, query, key, value, kv_cache,
# # ERROR 08-06 19:13:13 [engine.py:165]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/attention/backends/xformers.py", line 584, in forward
# # ERROR 08-06 19:13:13 [engine.py:165]     out = PagedAttention.forward_prefix(
# # ERROR 08-06 19:13:13 [engine.py:165]           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/attention/ops/paged_attn.py", line 214, in forward_prefix
# # ERROR 08-06 19:13:13 [engine.py:165]     context_attention_fwd(
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
# # ERROR 08-06 19:13:13 [engine.py:165]     return func(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]            ^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py", line 850, in context_attention_fwd
# # ERROR 08-06 19:13:13 [engine.py:165]     _fwd_kernel[grid](
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 347, in <lambda>
# # ERROR 08-06 19:13:13 [engine.py:165]     return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165]                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 569, in run
# # ERROR 08-06 19:13:13 [engine.py:165]     kernel = self.compile(src, target=target, options=options.__dict__)
# # ERROR 08-06 19:13:13 [engine.py:165]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/triton/compiler/compiler.py", line 284, in compile
# # ERROR 08-06 19:13:13 [engine.py:165]     next_module = compile_ir(module, metadata)
# # ERROR 08-06 19:13:13 [engine.py:165]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 450, in <lambda>
# # ERROR 08-06 19:13:13 [engine.py:165]     stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, capability)
# # ERROR 08-06 19:13:13 [engine.py:165]                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165]   File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 341, in make_llir
# # ERROR 08-06 19:13:13 [engine.py:165]     pm.run(mod)
# # ERROR 08-06 19:13:13 [engine.py:165] RuntimeError: PassManager::run failed
# # DEBUG 08-06 19:13:13 [engine.py:169] MQLLMEngine is shut down.
# # INFO:     Shutting down
# # INFO:     Waiting for application shutdown.
# # INFO:     Application shutdown complete.
# # INFO:     Finished server process [27]
# # RUN uv pip install --system --index-strategy unsafe-best-match triton==3.2 --extra-index-url https://download.pytorch.org/whl/cu128

# # # Then, install xformers with the --no-build-isolation flag
# # RUN uv pip install --system \
# #     --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31"

# RUN useradd -m appuser
# USER appuser

# RUN mkdir -p /tmp/.cache/huggingface

# # Download at build time,
# # so that on restart we only wait for the docker pull, not the download from HF.
# # In Docker Spaces, the secrets management is different for security reasons.
# # Once you create a secret in the Settings tab,
# # you can expose the secret by adding the following line in your Dockerfile:
# #
# # For example, if SECRET_EXAMPLE is the name of the secret you created in the Settings tab,
# # you can read it at build time by mounting it to a file, then reading it with $(cat /run/secrets/SECRET_EXAMPLE).
# # https://huggingface.co/docs/hub/en/spaces-sdks-docker#buildtime
# #
# # AFTER TRIAL AND ERROR WE GOT 16GB (16431849854 bytes) OF LAYERS :(
# #
# # RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true HF_TOKEN=$(cat /run/secrets/HF_TOKEN) python /app/download_model.py

# EXPOSE 7860

# # Serve the model at container runtime
# CMD vllm serve "meta-llama/Llama-3.2-3B-Instruct" \
#   --task generate \
#   --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
#   --code-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
#   --tokenizer-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
#   --seed 42 \
#   --host 0.0.0.0 \
#   --port 7860 \
#   --max-num-batched-tokens 32768 \
#   --max-model-len 32768 \
#   --dtype float16 \
#   --enforce-eager \
#   --gpu-memory-utilization 0.9 \
#   --enable-prefix-caching \
#   --disable-log-requests \
#   --trust-remote-code