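"""Gradio app that recommends a vLLM `gpu_memory_utilization` value for GRPO training.

Given a Hugging Face model config URL and basic experiment settings, the app
estimates the memory needed for the model weights, the KV cache, and a safety
buffer, and converts the total into a fraction of the available GPU memory.
"""
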
import math
import gradio as gr
from transformers import AutoConfig, AutoModelForCausalLM
from accelerate import init_empty_weights

def recommend_gpu_mem_util(
    model_config_url,
    batch_size,
    max_prompt_length,
    max_completion_length,
    tp_size,
    gpu_memory=79,
    precision_in_bytes=2,
    kv_multiplier=2
):
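    """Recommend a vLLM `gpu_memory_utilization` value for a GRPO training run.

    The estimate sums (1) the model weights sharded across `tp_size` GPUs,
    (2) the KV cache for `batch_size` sequences of
    `max_prompt_length + max_completion_length` tokens, and (3) a 20% buffer,
    then divides the total by the available GPU memory and rounds up generously.

    Returns a (summary string, details dict) pair; on failure both carry an
    error message.
    """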
    # Load model config from HF URL
    try:
        config = AutoConfig.from_pretrained(model_config_url)
    except Exception as e:
        msg = f"Failed to load model config from URL: {e}"
        return msg, {"Error": msg}

    # Extract model config params
    try:
        num_hidden_layers = getattr(config, "num_hidden_layers")
        hidden_size = getattr(config, "hidden_size")
        num_attention_heads = getattr(config, "num_attention_heads")
        num_key_value_heads = getattr(config, "num_key_value_heads", num_attention_heads)
    except Exception as e:
        msg = f"Required field missing in model config: {e}"
        return msg, {"Error": msg}

    # Estimate the parameter count by instantiating the model on the meta device (no weights are loaded)
    try:
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(config)
        num_params = sum(p.numel() for p in model.parameters())
        model_params = num_params / 1e9
        est_msg = f"Estimated model_params from config: {model_params:.2f}B"
    except Exception as e:
        msg = f"Failed to estimate model parameters: {e}"
        return msg, {"Error": msg}

    # Calculate all memory and utilization values
    try:
        seq_len = max_prompt_length + max_completion_length

        # Weight memory per GPU: bytes per parameter, sharded across tensor-parallel ranks
        model_size = num_params * precision_in_bytes / tp_size
        
        # KV_cache_per_token = kv_multiplier (K and V) * num_hidden_layers
        #   * (num_key_value_heads * hidden_size / num_attention_heads) * precision_in_bytes
        kv_cache_per_token = (
            kv_multiplier
            * num_hidden_layers
            * (num_key_value_heads * hidden_size / num_attention_heads)
            * precision_in_bytes
        )
        # KV_cache_total = KV_cache_per_token * Batch_size * Seq_len (max_prompt_length + max_completion_length)
        kv_cache_total = kv_cache_per_token * batch_size * seq_len
        # Buffer = (Model + KV_cache) * 0.2  # generous 20% buffer
        buffer_size = 0.2 * (model_size + kv_cache_total)
        # Total = Model + KV_cache + Buffer
        total_required = model_size + kv_cache_total + buffer_size
        # GPU utilization = Total_reqd / Total_gpu
        gpu_memory_bytes = float(gpu_memory) * 1024**3
        gpu_utilization_ratio = total_required / gpu_memory_bytes
        # Round up to the nearest 0.05 and add an extra 0.05 of headroom; this deliberately
        # generous value works better in practice than the exact estimate. A result near or
        # above 1.0 means the workload is unlikely to fit on the GPU as configured.
        rounded_utilization = math.ceil(gpu_utilization_ratio * 20) / 20 + 0.05

        main_result = f"vllm_gpu_memory_utilization = {rounded_utilization:.2f}"
        ans = {
            "KV_cache_per_token_MB": kv_cache_per_token / 1024**2,
            "KV_cache_total_GB": kv_cache_total / 1024**3,
            "Model_size_GB": model_size / 1024**3,
            "Buffer_GB": buffer_size / 1024**3,
            "Total_required_GB": total_required / 1024**3,
            "GPU_mem_util": gpu_utilization_ratio,
            "GPU_mem_util_recommended": rounded_utilization,
            "model_params": est_msg,
            "num_hidden_layers": num_hidden_layers,
            "hidden_size": hidden_size,
            "num_attention_heads": num_attention_heads,
            "num_key_value_heads": num_key_value_heads,
        }

        return main_result, ans
    except Exception as e:
        msg = f"Error during calculation: {e}"
        return msg, {"Error": msg}

iface = gr.Interface(
    fn=recommend_gpu_mem_util,
    inputs=[
        gr.Textbox(label="Model Config URL (HuggingFace)", value="https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/resolve/main/config.json"),
        gr.Number(label="per_device_train_batch_size", value=4),
        gr.Number(label="max_prompt_length", value=512),
        gr.Number(label="max_completion_length", value=512),
        gr.Number(label="vllm_tensor_parallel_size (tp_size)", value=1),
        gr.Number(label="GPU Memory (GB)", value=79),
        gr.Number(label="Precision in Bytes (e.g., 2)", value=2),
        gr.Number(label="KV Multiplier", value=2),
    ],
    outputs=[
        gr.Textbox(label="Recommended vLLM GPU Memory Utilization"),
        gr.JSON(label="Calculation Details"),
    ],
    title="vLLM GRPO GPU Memory Utilization Estimator",
    description = """
    Paste your HuggingFace model config URL (ending in config.json), and enter experiment details. 
    Model parameters are automatically extracted and estimated from the config.

    Note: This is a general recommendation and may not be optimal for your specific environment.
    Always verify your actual training GPU requirements. For example, if you're using DeepSpeed, consider utilizing their memory estimation tool:
    https://deepspeed.readthedocs.io/en/latest/memory.html

    If you encounter "not enough memory" errors, try increasing the GPU memory utilization setting.
    If you experience out-of-memory (OOM) errors, lower the utilization value and/or reduce your batch size.
    """,
    allow_flagging="never"
)

if __name__ == "__main__":
    iface.launch()
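
# Example (sketch): the estimator can also be called directly, without the UI.
# The config URL below is the app's default; the remaining values are
# illustrative and should be replaced with your own experiment settings.
#
#   result, details = recommend_gpu_mem_util(
#       "https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/resolve/main/config.json",
#       batch_size=4,
#       max_prompt_length=512,
#       max_completion_length=512,
#       tp_size=1,
#   )
#   print(result)   # "vllm_gpu_memory_utilization = <recommended value>"
#   print(details)  # per-component memory breakdown (model, KV cache, buffer)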