import modal
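
# This script stands up an OpenAI-compatible vLLM server on Modal: it builds
# an image with pinned vLLM dependencies, mounts persistent caches for model
# weights and compiled kernels, and exposes `vllm serve` behind an
# authenticated web endpoint.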
vllm_image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "vllm==0.7.2",
        "transformers==4.51.0",
        "huggingface_hub[hf_transfer]",
        "flashinfer-python==0.2.0.post2",
        # FlashInfer publishes CUDA/torch-specific wheels on its own index.
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    # hf_transfer speeds up weight downloads from the Hugging Face Hub.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
)

# Opt in to vLLM's V1 engine.
vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})

# Persist model weights and vLLM's compilation cache across container
# restarts on Modal Volumes.
hf_cache_vol = modal.Volume.from_name("mcp-datascientist-model-weights-vol")
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

app = modal.App("example-vllm-openai-compatible")

N_GPU = 1
# A hardcoded key keeps this example self-contained; for anything beyond a
# hackathon demo, inject it via a modal.Secret instead of committing it here.
API_KEY = "super-secret-key-mcp-hackathon"

MINUTES = 60  # seconds per minute
VLLM_PORT = 8000

MODEL_NAME = "Qwen/Qwen3-14B"


@app.function(
    image=vllm_image,
    gpu=f"A100-40GB:{N_GPU}",
    scaledown_window=15 * MINUTES,  # scale to zero after 15 idle minutes
    timeout=10 * MINUTES,
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(max_inputs=10)  # serve up to 10 requests per replica at once
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        API_KEY,
    ]

    # Launch vLLM as a background process; @modal.web_server polls the port
    # until the server comes up. Passing the argument list directly (rather
    # than a joined string with shell=True) avoids shell-quoting pitfalls.
    subprocess.Popen(cmd)
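

# Smoke test: `modal run` this file to send one chat completion through the
# deployed endpoint. A minimal sketch, assuming a recent Modal client that
# exposes Function.get_web_url(); only the Python standard library is used
# on the client side.
@app.local_entrypoint()
def test():
    import json
    import urllib.request

    url = serve.get_web_url()  # public URL Modal assigns to the web server
    payload = json.dumps(
        {
            "model": MODEL_NAME,
            "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        }
    ).encode()
    req = urllib.request.Request(
        f"{url}/v1/chat/completions",
        data=payload,
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        body = json.load(resp)
    print(body["choices"][0]["message"]["content"])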