import modal

vllm_image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "vllm==0.7.2",
        "transformers==4.51.0",
        "huggingface_hub[hf_transfer]",
        "flashinfer-python==0.2.0.post2",
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)

vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})

hf_cache_vol = modal.Volume.from_name("mcp-datascientist-model-weights-vol")
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)

app = modal.App("example-vllm-openai-compatible")

N_GPU = 1  # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
API_KEY = "super-secret-key-mcp-hackathon"  # api key, for auth; for production use, replace with a modal.Secret

MINUTES = 60  # seconds
VLLM_PORT = 8000

MODEL_NAME = "Qwen/Qwen3-14B"


@app.function(
    image=vllm_image,
    gpu=f"A100-40GB:{N_GPU}",
    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(
    max_inputs=10
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
def serve():
    import subprocess

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        API_KEY,
    ]

    # launch vLLM in the background; @modal.web_server waits for the port to open
    subprocess.Popen(" ".join(cmd), shell=True)
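

# --- Optional smoke test -----------------------------------------------------
# A minimal sketch of a local entrypoint for checking the deployment. It assumes
# a recent Modal client where web-endpoint functions expose `get_web_url()`, and
# it only lists the served models via the OpenAI-compatible /v1/models route
# using the standard library; swap in your own client (e.g. the openai SDK) as
# needed.


@app.local_entrypoint()
def test():
    import json
    import urllib.request

    url = serve.get_web_url()  # assumption: available on recent Modal clients
    request = urllib.request.Request(
        f"{url}/v1/models",
        headers={"Authorization": f"Bearer {API_KEY}"},
    )
    with urllib.request.urlopen(request) as response:
        print(json.dumps(json.load(response), indent=2))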