import os

import modal

app_name: str = "example-vllm-openai-compatible"
app = modal.App(name=app_name)

print("setting up container image ...")
vllm_image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install(
        "vllm==0.7.2",
        "huggingface_hub[hf_transfer]==0.26.2",
        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)
vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})
print("done setting up container image.")

MODELS_DIR = "/llamas"
MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"

# model weights are downloaded lazily on first server start and cached in these Volumes
print("setting up storage volumes for model weights...")
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
print("done setting up storage volumes.")

print("defining the vLLM server function...")
N_GPU = 1  # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
MINUTES = 60  # seconds
VLLM_PORT = 8000


@app.function(
    image=vllm_image,
    secrets=[modal.Secret.from_name("api_key")],
    gpu=f"H100:{N_GPU}",
    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(
    max_inputs=100
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=50 * MINUTES)
def serve():
    import subprocess

    # the `api_key` Modal Secret must define the MODAL_API_KEY environment variable
    API_KEY = os.environ["MODAL_API_KEY"]

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        API_KEY,
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "llama3_json",
    ]
    # launch the vLLM OpenAI-compatible server in the background;
    # the engine is built when this process starts up
    subprocess.Popen(" ".join(cmd), shell=True)


print("done defining the vLLM server function.")
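
# A minimal client sketch (not part of the original script), assuming the app has
# been deployed with `modal deploy` and that the endpoint URL and API key below
# are placeholders for your own deployment; it also assumes the `openai` package
# is installed locally. The server speaks the OpenAI API, so the standard client
# can be pointed at it. The guard keeps this from running under `modal deploy`.
if __name__ == "__main__":
    from openai import OpenAI

    client = OpenAI(
        # hypothetical URL; substitute the web endpoint URL of your deployment
        base_url="https://YOUR-WORKSPACE--example-vllm-openai-compatible-serve.modal.run/v1",
        # must match the key stored in the `api_key` Modal Secret as MODAL_API_KEY
        api_key="YOUR_API_KEY",
    )
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
    )
    print(response.choices[0].message.content)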