# https://github.com/vllm-project/vllm/blob/v0.10.0/docker/Dockerfile#L9
FROM vllm/vllm-openai:v0.10.0
ENV VLLM_LOGGING_LEVEL=DEBUG
ENV HF_HOME=/tmp/.cache/huggingface
ENV OMP_NUM_THREADS=1
ENV VLLM_USE_TRITON_FLASH_ATTN=0
ENV VLLM_ATTENTION_BACKEND=XFORMERS
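# XFORMERS is used here because it is the most stable attention backend for T4 GPUs
# (see the backend notes in the commented-out variant below).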
# e.g. install the `audio` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install --system --index-strategy unsafe-best-match vllm[audio]==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu128
# Downgrade triton because the following error occurred when using triton==3.3.1
# https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
# https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
RUN uv pip install --system --index-strategy unsafe-best-match triton==3.2 --extra-index-url https://download.pytorch.org/whl/cu128
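# Optional sanity check (left commented out so the build is unchanged):
# verify the pinned triton and vllm versions at build time.
# RUN python3 -c "import triton, vllm; print(triton.__version__, vllm.__version__)"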
# Create a user and group with the specified ID
RUN groupadd -r myuser --gid 1000 && useradd -r -g myuser --uid 1000 myuser
# Create the home directory and give ownership to 'myuser'
RUN mkdir -p /home/myuser && chown myuser:myuser /home/myuser
# Now, switch to the newly created user
USER myuser
RUN mkdir -p /tmp/.cache/huggingface
ENTRYPOINT ["/bin/bash", "-c", "vllm serve meta-llama/Llama-3.2-3B-Instruct --task generate --revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --code-revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --tokenizer-revision 0cb88a4f764b7a12671c53f0838cd831a0843b95 --seed 42 --host 0.0.0.0 --port 7860 --max-num-batched-tokens 32768 --max-model-len 32768 --dtype float16 --enforce-eager --gpu-memory-utilization 0.9 --enable-prefix-caching --disable-log-requests --trust-remote-code"]
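# Once the container is running, vLLM exposes an OpenAI-compatible API on port 7860.
# Example request (a sketch; assumes the server is reachable at http://localhost:7860,
# replace the host with your Space URL when deployed):
#
#   curl http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#           "model": "meta-llama/Llama-3.2-3B-Instruct",
#           "messages": [{"role": "user", "content": "Hello!"}]
#         }'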
# # FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04
# FROM nvidia/cuda:12.9.1-cudnn-runtime-ubuntu24.04
# ENV VLLM_LOGGING_LEVEL=DEBUG
# ENV HF_HOME=/tmp/.cache/huggingface
# ENV OMP_NUM_THREADS=1
# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/installation/gpu/rocm.inc.md?plain=1#L124
# ENV VLLM_USE_TRITON_FLASH_ATTN=0
# # https://github.com/vllm-project/vllm/blob/v0.10.0/docs/getting_started/quickstart.md?plain=1#L213
# # `FLASH_ATTN` or `FLASHINFER` or `XFORMERS`.
# # https://github.com/vllm-project/vllm/blob/main/vllm/envs.py#L426-L435
# # Backend for attention computation
# # Available options:
# # - "TORCH_SDPA": use torch.nn.MultiheadAttention
# # - "FLASH_ATTN": use FlashAttention
# # - "XFORMERS": use XFormers
# # - "ROCM_FLASH": use ROCmFlashAttention
# # - "FLASHINFER": use flashinfer
# # - "FLASHMLA": use FlashMLA
# # Choose XFORMERS, which is the most stable backend for the T4
# ENV VLLM_ATTENTION_BACKEND=XFORMERS
# # Set environment variables for the xformers build
# ENV TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
# ENV MAX_JOBS=16
# # Set environment variables to avoid interactive prompts
# ENV DEBIAN_FRONTEND=noninteractive
# # Update the package list and install necessary dependencies
# RUN apt-get update && \
# apt-get install -y software-properties-common
# # Add the 'deadsnakes' PPA to get Python 3.12
# RUN add-apt-repository ppa:deadsnakes/ppa
# # Update the package list again to include the new repository
# RUN apt-get update
# # Install Python 3.12, pip, and other necessary development tools
# RUN apt-get install -y tzdata git curl python3.12 python3.12-venv python3.12-dev python3.12-full python3-pip python3-setuptools
# # RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
# # RUN python3.12 get-pip.py
# RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 \
# && update-alternatives --set python3 /usr/bin/python3.12 \
# && ln -sf /usr/bin/python3.12-config /usr/bin/python3-config
# RUN python3 --version && python3 -m pip --version
# # RUN apt-get update && apt-get install -y python3 python3-pip git
# # RUN pip install --upgrade pip
# # Install uv for faster install
# RUN python3 -m pip install uv --break-system-packages
# # Install vLLM
# RUN uv pip install --system --index-strategy unsafe-best-match vllm==0.10.0 --extra-index-url https://download.pytorch.org/whl/cu128
# # RUN uv pip install --system vllm==0.10.0 torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
# # Downgrade triton because the following error occurred when using triton==3.3.1
# # https://github.com/vllm-project/vllm/issues/20259#issuecomment-3157159183
# # https://github.com/vllm-project/vllm/issues/19203#issuecomment-2989796604
# # /usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py:36:0: error: Failures have been detected while processing an MLIR pass pipeline
# # /usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py:36:0: note: Pipeline failed while executing [`ConvertTritonGPUToLLVM` on 'builtin.module' operation]: reproducer generated at `std::errs, please share the reproducer above with Triton project.`
# # INFO: 10.16.9.222:28100 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
# # ERROR 08-06 19:13:13 [engine.py:165] RuntimeError('PassManager::run failed')
# # ERROR 08-06 19:13:13 [engine.py:165] Traceback (most recent call last):
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 163, in start
# # ERROR 08-06 19:13:13 [engine.py:165] self.run_engine_loop()
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 226, in run_engine_loop
# # ERROR 08-06 19:13:13 [engine.py:165] request_outputs = self.engine_step()
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 252, in engine_step
# # ERROR 08-06 19:13:13 [engine.py:165] raise e
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 235, in engine_step
# # ERROR 08-06 19:13:13 [engine.py:165] return self.engine.step()
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 1334, in step
# # ERROR 08-06 19:13:13 [engine.py:165] outputs = self.model_executor.execute_model(
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 146, in execute_model
# # ERROR 08-06 19:13:13 [engine.py:165] output = self.collective_rpc("execute_model",
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 58, in collective_rpc
# # ERROR 08-06 19:13:13 [engine.py:165] answer = run_method(self.driver_worker, method, args, kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 2985, in run_method
# # ERROR 08-06 19:13:13 [engine.py:165] return func(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 417, in execute_model
# # ERROR 08-06 19:13:13 [engine.py:165] output = self.model_runner.execute_model(
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
# # ERROR 08-06 19:13:13 [engine.py:165] return func(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1703, in execute_model
# # ERROR 08-06 19:13:13 [engine.py:165] hidden_or_intermediate_states = model_executable(
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
# # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
# # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 584, in forward
# # ERROR 08-06 19:13:13 [engine.py:165] model_output = self.model(input_ids, positions, intermediate_tensors,
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 206, in __call__
# # ERROR 08-06 19:13:13 [engine.py:165] return self.forward(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 392, in forward
# # ERROR 08-06 19:13:13 [engine.py:165] hidden_states, residual = layer(positions, hidden_states, residual)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
# # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
# # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 305, in forward
# # ERROR 08-06 19:13:13 [engine.py:165] hidden_states = self.self_attn(positions=positions,
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
# # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
# # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py", line 203, in forward
# # ERROR 08-06 19:13:13 [engine.py:165] attn_output = self.attn(q, k, v)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
# # ERROR 08-06 19:13:13 [engine.py:165] return self._call_impl(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
# # ERROR 08-06 19:13:13 [engine.py:165] return forward_call(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/layer.py", line 288, in forward
# # ERROR 08-06 19:13:13 [engine.py:165] return torch.ops.vllm.unified_attention(
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1158, in __call__
# # ERROR 08-06 19:13:13 [engine.py:165] return self._op(*args, **(kwargs or {}))
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/layer.py", line 448, in unified_attention
# # ERROR 08-06 19:13:13 [engine.py:165] output = self.impl.forward(self, query, key, value, kv_cache,
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/backends/xformers.py", line 584, in forward
# # ERROR 08-06 19:13:13 [engine.py:165] out = PagedAttention.forward_prefix(
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/ops/paged_attn.py", line 214, in forward_prefix
# # ERROR 08-06 19:13:13 [engine.py:165] context_attention_fwd(
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
# # ERROR 08-06 19:13:13 [engine.py:165] return func(*args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/vllm/attention/ops/prefix_prefill.py", line 850, in context_attention_fwd
# # ERROR 08-06 19:13:13 [engine.py:165] _fwd_kernel[grid](
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 347, in <lambda>
# # ERROR 08-06 19:13:13 [engine.py:165] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 569, in run
# # ERROR 08-06 19:13:13 [engine.py:165] kernel = self.compile(src, target=target, options=options.__dict__)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/compiler/compiler.py", line 284, in compile
# # ERROR 08-06 19:13:13 [engine.py:165] next_module = compile_ir(module, metadata)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 450, in <lambda>
# # ERROR 08-06 19:13:13 [engine.py:165] stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, capability)
# # ERROR 08-06 19:13:13 [engine.py:165] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# # ERROR 08-06 19:13:13 [engine.py:165] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/compiler.py", line 341, in make_llir
# # ERROR 08-06 19:13:13 [engine.py:165] pm.run(mod)
# # ERROR 08-06 19:13:13 [engine.py:165] RuntimeError: PassManager::run failed
# # DEBUG 08-06 19:13:13 [engine.py:169] MQLLMEngine is shut down.
# # INFO: Shutting down
# # INFO: Waiting for application shutdown.
# # INFO: Application shutdown complete.
# # INFO: Finished server process [27]
# # RUN uv pip install --system --index-strategy unsafe-best-match triton==3.2 --extra-index-url https://download.pytorch.org/whl/cu128
# # # Then, install xformers with the --no-build-isolation flag
# # RUN uv pip install --system \
# # --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.31"
# RUN useradd -m appuser
# USER appuser
# RUN mkdir -p /tmp/.cache/huggingface
# # Download the model at build time,
# # so that on a Space restart we only wait for the docker pull, not for the download from HF.
# # In Docker Spaces, the secrets management is different for security reasons.
# # Once you create a secret in the Settings tab,
# # you can expose the secret by adding the following line in your Dockerfile:
# #
# # For example, if SECRET_EXAMPLE is the name of the secret you created in the Settings tab,
# # you can read it at build time by mounting it to a file, then reading it with $(cat /run/secrets/SECRET_EXAMPLE).
# # https://huggingface.co/docs/hub/en/spaces-sdks-docker#buildtime
# #
# # AFTER TRIAL AND ERROR WE GOT 16GB (16431849854 bytes) OF LAYERS :(
# #
# # RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true HF_TOKEN=$(cat /run/secrets/HF_TOKEN) python /app/download_model.py
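# # A minimal sketch of what such a download script could look like
# # (hypothetical; the actual contents of /app/download_model.py are not shown here):
# #
# #   import os
# #   from huggingface_hub import snapshot_download
# #
# #   snapshot_download(
# #       repo_id="meta-llama/Llama-3.2-3B-Instruct",
# #       revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
# #       token=os.environ.get("HF_TOKEN"),
# #   )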
# EXPOSE 7860
# # Start the vLLM server at runtime
# CMD vllm serve "meta-llama/Llama-3.2-3B-Instruct" \
# --task generate \
# --revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
# --code-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
# --tokenizer-revision "0cb88a4f764b7a12671c53f0838cd831a0843b95" \
# --seed 42 \
# --host 0.0.0.0 \
# --port 7860 \
# --max-num-batched-tokens 32768 \
# --max-model-len 32768 \
# --dtype float16 \
# --enforce-eager \
# --gpu-memory-utilization 0.9 \
# --enable-prefix-caching \
# --disable-log-requests \
# --trust-remote-code