Spaces:

trl-lib
/

recommend-vllm-memory

Running

App Files Files Community

recommend-vllm-memory / app.py

qgallouedec HF Staff

Create app.py

ee803a6 verified 14 days ago

raw

history blame contribute delete

5.31 kB

	import math
	import gradio as gr
	from transformers import AutoConfig, AutoModelForCausalLM
	from accelerate import init_empty_weights

	def recommend_gpu_mem_util(
	    model_config_url,
	    batch_size,
	    max_prompt_length,
	    max_completion_length,
	    tp_size,
	    gpu_memory=79,
	    precision_in_bytes=2,
	    kv_multiplier=2,
	):
	    """Recommend a ``vllm_gpu_memory_utilization`` value for a model and workload.

	    The estimate is ``model weights + KV cache + 20% buffer`` divided by the
	    GPU memory, rounded up to a multiple of 0.05 and then padded by a further
	    0.05.

	    Args:
	        model_config_url: Value handed to ``AutoConfig.from_pretrained``
	            (typically a HuggingFace ``config.json`` URL).
	        batch_size: Number of sequences whose KV cache is held at once.
	        max_prompt_length: Maximum prompt length in tokens.
	        max_completion_length: Maximum completion length in tokens.
	        tp_size: Tensor-parallel size; the model weight size is divided by it.
	        gpu_memory: Memory of one GPU, in GiB (multiplied by ``1024**3``).
	        precision_in_bytes: Bytes per parameter and per cached element.
	        kv_multiplier: Multiplier for the per-layer cache (2 for K and V).

	    Returns:
	        A ``(summary, details)`` tuple. On success ``summary`` is the string
	        ``"vllm_gpu_memory_utilization = <value>"`` and ``details`` is a dict
	        with the intermediate figures. On any failure ``summary`` is an error
	        message and ``details`` is ``{"Error": <same message>}``; the function
	        never raises, so the message can be shown directly in the UI.
	    """
	    # Load the model config; any failure is reported back as text.
	    try:
	        config = AutoConfig.from_pretrained(model_config_url)
	    except Exception as e:
	        msg = f"Failed to load model config from URL: {e}"
	        return msg, {"Error": msg}

	    # Extract the architecture fields used by the KV-cache formula.
	    try:
	        num_hidden_layers = config.num_hidden_layers
	        hidden_size = config.hidden_size
	        num_attention_heads = config.num_attention_heads
	        # Fall back to the attention head count when the config does not
	        # declare a separate number of key/value heads.
	        num_key_value_heads = getattr(config, "num_key_value_heads", num_attention_heads)
	    except Exception as e:
	        msg = f"Required field missing in model config: {e}"
	        return msg, {"Error": msg}

	    # Count parameters by building the architecture from the config alone,
	    # inside init_empty_weights() (no checkpoint is loaded here).
	    try:
	        with init_empty_weights():
	            model = AutoModelForCausalLM.from_config(config)
	        num_params = sum(p.numel() for p in model.parameters())
	        model_params = num_params / 1e9  # in billions
	        est_msg = f"Estimated model_params from config: {model_params:.2f}B"
	    except Exception as e:
	        msg = f"Failed to estimate model parameters: {e}"
	        return msg, {"Error": msg}

	    # Calculate all memory and utilization values (everything below is in bytes
	    # until it is converted for display).
	    try:
	        seq_len = max_prompt_length + max_completion_length

	        # Model bytes per GPU. Billions of parameters are scaled by 1024**3
	        # rather than 1e9, which errs slightly on the generous side.
	        model_size = float(model_params) * 1024**3 * precision_in_bytes / tp_size

	        # KV_cache_per_token = kv_multiplier (K and V) * num_hidden_layers
	        #   * (num_key_value_heads * hidden_size / num_attention_heads)
	        #   * precision_in_bytes
	        kv_cache_per_token = (
	            kv_multiplier
	            * num_hidden_layers
	            * (num_key_value_heads * hidden_size / num_attention_heads)
	            * precision_in_bytes
	        )
	        # KV_cache_total = KV_cache_per_token * Batch_size * Seq_len
	        kv_cache_total = kv_cache_per_token * batch_size * seq_len
	        # Buffer = (Model + KV_cache) * 0.2  -- generous 20% buffer
	        buffer_size = 0.2 * (model_size + kv_cache_total)
	        # Total = Model + KV_cache + Buffer
	        total_required = model_size + kv_cache_total + buffer_size
	        # GPU utilization = Total_reqd / Total_gpu
	        gpu_memory_bytes = float(gpu_memory) * 1024**3
	        gpu_utilization_ratio = total_required / gpu_memory_bytes
	        # Round up to a multiple of 0.05, then add another 0.05 of headroom;
	        # this generous estimate works much better than the raw prediction.
	        # NOTE: the result is not clamped, so it can exceed 1.0 when the
	        # estimated requirement is larger than the GPU memory.
	        rounded_utilization = math.ceil(gpu_utilization_ratio * 20) / 20 + 0.05

	        main_result = f"vllm_gpu_memory_utilization = {rounded_utilization:.2f}"
	        ans = {
	            "KV_cache_per_token_MB": kv_cache_per_token / 1024**2,
	            "KV_cache_total_GB": kv_cache_total / 1024**3,
	            "Model_size_GB": model_size / 1024**3,
	            "Buffer_GB": buffer_size / 1024**3,
	            "Total_required_GB": total_required / 1024**3,
	            "GPU_mem_util": gpu_utilization_ratio,
	            "GPU_mem_util_recommended": rounded_utilization,
	            "model_params": est_msg,
	            "num_hidden_layers": num_hidden_layers,
	            "hidden_size": hidden_size,
	            "num_attention_heads": num_attention_heads,
	            "num_key_value_heads": num_key_value_heads,
	        }

	        return main_result, ans
	    except Exception as e:
	        # Covers e.g. division by zero (tp_size / gpu_memory / head count of 0)
	        # and non-numeric inputs coming from the form.
	        msg = f"Error during calculation: {e}"
	        return msg, {"Error": msg}

	# Help text shown under the title of the Gradio page.
	_DESCRIPTION = """
	Paste your HuggingFace model config URL (ending in config.json), and enter experiment details.
	Model parameters are automatically extracted and estimated from the config.

	Note: This is a general recommendation and may not be optimal for your specific environment.
	Always verify your actual training GPU requirements. For example, if you're using DeepSpeed, consider utilizing their memory estimation tool:
	https://deepspeed.readthedocs.io/en/latest/memory.html

	If you encounter "not enough memory" errors, try increasing the GPU memory utilization setting.
	If you experience out-of-memory (OOM) errors, lower the utilization value and/or reduce your batch size.
	"""

	# (label, default) for each numeric field, listed in the same order as the
	# positional parameters of recommend_gpu_mem_util that follow the config URL.
	_NUMBER_FIELDS = (
	    ("per_device_train_batch_size", 4),
	    ("max_prompt_length", 512),
	    ("max_completion_length", 512),
	    ("vllm_tensor_parallel_size (tp_size)", 1),
	    ("GPU Memory (GB)", 79),
	    ("Precision in Bytes (e.g., 2)", 2),
	    ("KV Multiplier", 2),
	)

	# Input widgets: the config URL textbox first, then one number box per field.
	_input_components = [
	    gr.Textbox(
	        label="Model Config URL (HuggingFace)",
	        value="https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/resolve/main/config.json",
	    ),
	]
	_input_components.extend(
	    gr.Number(label=field_label, value=field_default)
	    for field_label, field_default in _NUMBER_FIELDS
	)

	# Output widgets: the headline recommendation and the detailed breakdown,
	# matching the (summary, details) tuple returned by the callback.
	_output_components = [
	    gr.Textbox(label="Recommended vLLM GPU Memory Utilization"),
	    gr.JSON(label="Calculation Details"),
	]

	iface = gr.Interface(
	    fn=recommend_gpu_mem_util,
	    inputs=_input_components,
	    outputs=_output_components,
	    title="vLLM GRPO GPU Memory Utilization Estimator",
	    description=_DESCRIPTION,
	    allow_flagging="never",
	)

	# Start the web UI only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()