# PodcastGenerator / modal_setup.py
import modal
import os
app_name: str = "example-vllm-openai-compatible"
app = modal.App(name=app_name)
print(f"setting up container image ...")
vllm_image = (
modal.Image.debian_slim(python_version="3.12")
.pip_install(
"vllm==0.7.2",
"huggingface_hub[hf_transfer]==0.26.2",
"flashinfer-python==0.2.0.post2", # pinning, very unstable
extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
)
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"}) # faster model transfers
)
vllm_image = vllm_image.env({"VLLM_USE_V1": "1"})
print(f" done setting up container image.")
MODELS_DIR = "/llamas"  # currently unused in this script
MODEL_NAME = "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
MODEL_REVISION = "a7c09948d9a632c2c840722f519672cd94af885d"
print(f" downloading model weights...")
hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
print(f" done downloading model weights.")
print(f"building engine...")
N_GPU = 1 # tip: for best results, first upgrade to more powerful GPUs, and only then increase GPU count
MINUTES = 60 # seconds
VLLM_PORT = 8000
@app.function(
    image=vllm_image,
    secrets=[modal.Secret.from_name("api_key")],
    gpu=f"H100:{N_GPU}",
    scaledown_window=15 * MINUTES,  # how long should we stay up with no requests?
    timeout=10 * MINUTES,  # how long should we wait for container start?
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
    },
)
@modal.concurrent(
    max_inputs=100
)  # how many requests can one replica handle? tune carefully!
@modal.web_server(port=VLLM_PORT, startup_timeout=50 * MINUTES)
def serve():
    import subprocess

    # Expects the "api_key" secret to define MODAL_API_KEY in the container env.
    API_KEY = os.environ["MODAL_API_KEY"]

    cmd = [
        "vllm",
        "serve",
        "--uvicorn-log-level=info",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        API_KEY,
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "llama3_json",
    ]

    # Launch vLLM's OpenAI-compatible server; the web_server decorator waits
    # until something is listening on VLLM_PORT.
    subprocess.Popen(" ".join(cmd), shell=True)


print("done configuring vLLM server.")