(feat:vllm serve) Dockerfile

Dockerfile  CHANGED  +22 -24
@@ -1,23 +1,7 @@
-FROM
+FROM docker.io/vllm/vllm-openai:v0.10.0
 
-RUN apt-get update && \
-    apt-get install -y \
-    git \
-    curl \
-    wget \
-    unzip \
-    gcc
-
-RUN useradd -m -u 1000 user
-USER user
-ENV PATH="/home/user/.local/bin:$PATH"
-
-WORKDIR /app
-
-COPY --chown=user ./requirements.txt requirements.txt
-RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113
-
-COPY --chown=user . /app
+ENV MODEL_NAME="meta-llama/Llama-3.2-3B-Instruct"
+ENV MODEL_REV="0cb88a4f764b7a12671c53f0838cd831a0843b95"
 
 # Download at build time,
 # to ensure during restart we won't have to wait for the download from HF (only wait for docker pull).
@@ -35,8 +19,22 @@ COPY --chown=user . /app
 
 EXPOSE 7860
 
-
-
-
-
-
+CMD [
+    "vllm", "serve",
+    "--model", "$MODEL_NAME",
+    "--task", "generate",
+    "--revision", "$MODEL_REV",
+    "--code-revision", "$MODEL_REV",
+    "--tokenizer-revision", "$MODEL_REV",
+    "--seed", "42",
+    "--host", "0.0.0.0",
+    "--port", "7860",
+    "--max-num-batched-tokens", "32768",
+    "--max-model-len", "32768",
+    "--dtype", "float16",
+    "--enforce-eager",
+    "--gpu-memory-utilization", "0.9",
+    "--enable-prefix-caching",
+    "--disable-log-requests",
+    "--trust-remote-code"
+]
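
Note on the new CMD: Docker's exec form (the JSON-array form used above) does not invoke a shell, so "$MODEL_NAME" and "$MODEL_REV" are passed to vllm literally rather than expanded from the ENV values set earlier in the file. A shell-form CMD is one way to get the expansion; a minimal sketch under that assumption, with the model passed positionally as in vllm serve's documented usage:

    # Sketch, not the committed change: shell form runs via /bin/sh -c,
    # so $MODEL_NAME and $MODEL_REV are expanded before vllm parses its args.
    CMD vllm serve "$MODEL_NAME" \
        --task generate \
        --revision "$MODEL_REV" \
        --code-revision "$MODEL_REV" \
        --tokenizer-revision "$MODEL_REV" \
        --seed 42 \
        --host 0.0.0.0 \
        --port 7860 \
        --max-num-batched-tokens 32768 \
        --max-model-len 32768 \
        --dtype float16 \
        --enforce-eager \
        --gpu-memory-utilization 0.9 \
        --enable-prefix-caching \
        --disable-log-requests \
        --trust-remote-code

If the vllm/vllm-openai base image defines an ENTRYPOINT, any CMD is appended to it as arguments rather than run on its own, so that interaction is also worth checking (docker inspect shows both) before relying on either form.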
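Once the Space is running, the container serves vLLM's OpenAI-compatible API on the exposed port 7860. A quick smoke test, assuming the server is reachable at localhost:7860 (substitute the Space's URL as appropriate):

    # List served models; the id should match MODEL_NAME.
    curl http://localhost:7860/v1/models

    # Minimal chat completion against the OpenAI-compatible endpoint.
    curl http://localhost:7860/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{
            "model": "meta-llama/Llama-3.2-3B-Instruct",
            "messages": [{"role": "user", "content": "Hello!"}],
            "max_tokens": 64
          }'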