''' Elasticity Utility in DeepSpeed can be used to create highly elastic jobs compatible
with a large number of GPUs. For elastic jobs, DeepSpeed will provide a batch size that
can support a large number of GPUs based on the user-specified parameters.
'''

# Human-readable example of the "elasticity" config section.
# NOTE: the values shown are illustrative and do not all equal the *_DEFAULT
# constants below ("min_time" is 20 here vs. MIN_TIME_DEFAULT = 0, "version" is
# 0.1 here vs. VERSION_DEFAULT = 0.2). The example also does not list the
# NUM_GPUS_PER_NODE / MODEL_PARALLEL_SIZE keys defined further down.
FORMAT = '''
Elasticity should be enabled as:
"elasticity": {
  "enabled": true,
  "max_train_batch_size": 2000,
  "micro_batch_sizes": [2,4,6],
  "min_gpus": 1,
  "max_gpus" : 10000,
  "min_time": 20,
  "prefer_larger_batch": true,
  "ignore_non_elastic_batch_info": false,
  "version": 0.1
}
'''

# Name of the config section that holds every key below (see FORMAT).
ELASTICITY = 'elasticity'

# Most recent elasticity version; also used as VERSION_DEFAULT.
LATEST_ELASTICITY_VERSION = 0.2

# Key that switches elasticity on, and its default (off).
ENABLED = 'enabled'
ENABLED_DEFAULT = False

# Key and default for the upper bound on the train batch size.
MAX_ACCEPTABLE_BATCH_SIZE = 'max_train_batch_size'
MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT = 2000

# Key and default for the candidate micro batch sizes.
# NOTE: the default is a module-level (shared, mutable) list; copy it before
# modifying so other users of the default are not affected.
MICRO_BATCHES = 'micro_batch_sizes'
MICRO_BATCHES_DEFAULT = [2, 4, 6]

# Keys and defaults for the lower/upper bound on the number of GPUs.
MIN_GPUS = 'min_gpus'
MIN_GPUS_DEFAULT = 1
MAX_GPUS = 'max_gpus'
MAX_GPUS_DEFAULT = 10000

# Key and default for the number of GPUs per node.
NUM_GPUS_PER_NODE = 'num_gpus_per_node'
NUM_GPUS_PER_NODE_DEFAULT = 1

# Key and default for the model parallel size.
MODEL_PARALLEL_SIZE = "model_parallel_size"
MODEL_PARALLEL_SIZE_DEFAULT = 1

# Key and default for the minimum time setting.
# NOTE(review): the unit and the meaning of 0 are not visible in this file --
# confirm against the code that consumes MIN_TIME.
MIN_TIME = "min_time"
MIN_TIME_DEFAULT = 0

# Key and default for preferring a larger batch size.
# NOTE(review): presumably biases the batch-size search toward the maximum
# train batch size -- confirm against the consumer.
PREFER_LARGER_BATCH = 'prefer_larger_batch'
PREFER_LARGER_BATCH_DEFAULT = True

# Key and default for ignoring batch info given outside the elastic config.
# NOTE(review): presumably, when False, batch settings outside the
# "elasticity" section are rejected rather than silently ignored -- confirm.
IGNORE_NON_ELASTIC_BATCH_INFO = 'ignore_non_elastic_batch_info'
IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT = False

# Key selecting the elasticity version; defaults to the latest version.
VERSION = "version"
VERSION_DEFAULT = LATEST_ELASTICITY_VERSION

# NOTE(review): presumably the minimum DeepSpeed release that supports
# elasticity -- confirm where this is compared.
MINIMUM_DEEPSPEED_VERSION = "0.3.8"

# NOTE(review): presumably the name of an environment variable carrying the
# elastic config -- confirm against the code that reads it.
DEEPSPEED_ELASTICITY_CONFIG = "DEEPSPEED_ELASTICITY_CONFIG"