|
|
|
|
|
|
|
|
|
|
|
import torch.nn as nn |
|
|
|
from fairscale.optim import GradScaler |
|
|
|
|
|
class Offload_Transformer:
    """Benchmark configuration namespace for the offloaded transformer model."""

    @staticmethod
    def get_model_config():
        """Return hyperparameters used to construct the transformer model."""
        return {
            "vocab_size": 10000,
            "ninp": 2048,
            "nhid": 2048,
            "nhead": 32,
            "dropout": 0,
            "initrange": 0.1,
            # NOTE: a fresh GradScaler is created on every call.
            "scaler": GradScaler(),
            "clip_value": 0.05,
            "num_decoder_layers": 10,
            "seq_len": 32,
        }

    @staticmethod
    def get_benchmark_config(checkpoint_activation=True):
        """Return training-loop settings for the benchmark.

        Args:
            checkpoint_activation: whether activation checkpointing is enabled.
        """
        return {
            "epochs": 1,
            "lr": 0.001,
            "batch_size": 8,
            "criterion": nn.CrossEntropyLoss(),
            "checkpoint_activation": checkpoint_activation,
            "num_microbatches": 1,
            "slices": 3,
        }

    @staticmethod
    def get_golden_real_stats():
        """Return golden (expected) throughput and peak-memory stats for real data."""
        return {
            "avg_wps": 192.105,
            "std_dev_wps": 39.56,
            "peak_mem_usage": 1180848128,
        }
|
|
|
|
|
class Offload_Sequential:
    """Benchmark configuration namespace for the offloaded sequential (MLP) model."""

    @staticmethod
    def get_model_config():
        """Return hyperparameters used to construct the sequential model."""
        return {
            "inputs": 100,
            "outputs": 5,
            "hidden": 1000,
            "layers": 100,
            "clip_value": 0.05,
        }

    @staticmethod
    def get_benchmark_config():
        """Return training-loop settings for the benchmark."""
        return {
            "epochs": 1,
            "lr": 0.001,
            "batch_size": 8,
            "criterion": nn.CrossEntropyLoss(),
            "slices": 3,
            "checkpoint_activation": True,
            "num_microbatches": 1,
        }
|
|
|
|
|
class FSDP:
    """Benchmark configuration namespace for the FSDP transformer model."""

    @staticmethod
    def get_model_config():
        """Return hyperparameters used to construct the transformer model."""
        return {
            "vocab_size": 10000,
            "ninp": 2048,
            "nhid": 2048,
            "nhead": 32,
            "dropout": 0,
            "initrange": 0.1,
            # NOTE: a fresh GradScaler is created on every call.
            "scaler": GradScaler(),
            "clip_value": 0.05,
            "num_decoder_layers": 10,
            "seq_len": 32,
        }

    @staticmethod
    def get_benchmark_config():
        """Return training-loop settings for the benchmark."""
        return {
            "epochs": 1,
            "lr": 0.001,
            "batch_size": 8,
            "criterion": nn.CrossEntropyLoss(),
        }

    @staticmethod
    def get_golden_real_stats():
        """Real-data golden stats are not available for FSDP.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Synthetic data benchmarks are not supported.")

    @staticmethod
    def get_golden_synthetic_stats():
        """Return golden throughput and per-rank peak-memory stats for synthetic data."""
        return {
            "avg_wps": 486.303,
            "std_dev_wps": 71.307,
            "peak_mem_usage": [5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30, 5.5055 * 2**30],
        }
|
|
|
|
|
class Pipe:
    """Benchmark configuration namespace for the pipeline-parallel transformer model."""

    @staticmethod
    def get_model_config():
        """Return hyperparameters used to construct the transformer model."""
        return {
            "vocab_size": 10000,
            "ninp": 2048,
            "nhid": 2048,
            "nhead": 32,
            "dropout": 0,
            "initrange": 0.1,
            # NOTE: a fresh GradScaler is created on every call.
            "scaler": GradScaler(),
            "clip_value": 0.05,
            "num_decoder_layers": 10,
            "seq_len": 32,
        }

    @staticmethod
    def get_benchmark_config():
        """Return training-loop settings for the benchmark."""
        return {
            "epochs": 1,
            "lr": 0.001,
            "batch_size": 8,
            "criterion": nn.CrossEntropyLoss(),
        }

    @staticmethod
    def get_golden_real_stats():
        """Return golden throughput and per-stage peak-memory stats for real data."""
        return {
            "avg_wps": 703.778,
            "std_dev_wps": 5.732,
            "peak_mem_usage": [2320996352, 1396742144, 1396742144, 2340010496],
        }

    @staticmethod
    def get_golden_synthetic_stats():
        """Synthetic-data golden stats are not available for Pipe.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Synthetic data benchmarks are not supported.")
|
|
|
|
|
class MOE:
    """Benchmark configuration namespace for the mixture-of-experts transformer model."""

    @staticmethod
    def get_model_config():
        """Return hyperparameters used to construct the MoE transformer model."""
        return {
            "vocab_size": 10000,
            "ninp": 1024,
            "nhid": 4096,
            "nhead": 32,
            "dropout": 0,
            "initrange": 0.1,
            # NOTE: a fresh GradScaler is created on every call.
            "scaler": GradScaler(),
            "clip_value": 0.05,
            "num_decoder_layers": 20,
            "seq_len": 33,
            "is_moe": True,
            "num_local_experts": 2,
        }

    @staticmethod
    def get_benchmark_config():
        """Return training-loop settings for the benchmark."""
        return {
            "epochs": 1,
            "lr": 0.001,
            "batch_size": 32,
            "criterion": nn.CrossEntropyLoss(),
        }
|
|