# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from deepspeed.runtime.config_utils import DeepSpeedConfigModel
from .fp16.loss_scaler import (
    INITIAL_LOSS_SCALE,
    SCALE_WINDOW,
    DELAYED_SHIFT,
    CONSECUTIVE_HYSTERESIS,
    MIN_LOSS_SCALE,
)

#########################################
# BFLOAT16 support
#########################################
# BFLOAT16 feature. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
BFLOAT16_FORMAT = '''
BFLOAT16 parameters should be of the format:
"bf16": {
  "enabled": true,
  "immediate_grad_update": false,
  "check_grad_overflow": false
}
'''
BFLOAT16 = "bf16"
BFLOAT16_OLD = "bfloat16"  # keeping for backwards compatibility


def get_bfloat16_config(param_dict):
    bf16_config_dict = param_dict.get(BFLOAT16, None)
    if bf16_config_dict is None:
        bf16_config_dict = param_dict.get(BFLOAT16_OLD, {})
    return DeepSpeedBF16Config(**bf16_config_dict)


class DeepSpeedBF16Config(DeepSpeedConfigModel):
    """
    For bfloat16 configuration
    """

    enabled: bool = False
    """
    Enable bfloat16 mixed-precision training/inference
    """

    immediate_grad_update: bool = False
    """
    Apply gradient updates immediately rather than delayed.
    """

    check_grad_overflow: bool = False
    """
    Check for gradient overflows and underflows
    """
#########################################
# FP16 support
#########################################
# FP16 feature. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
FP16_FORMAT = '''
FP16 parameters should be of the format:
"fp16": {
  "enabled": true,
  "auto_cast": false,
  "loss_scale": 0,
  "initial_scale_power": 16,
  "loss_scale_window": 1000,
  "hysteresis": 2,
  "consecutive_hysteresis": false,
  "min_loss_scale": 1
}
'''
FP16 = "fp16"


def get_float16_config(param_dict):
    fp16_config_dict = param_dict.get(FP16, {})
    return DeepSpeedFP16Config(**fp16_config_dict)


class DeepSpeedFP16Config(DeepSpeedConfigModel):
    """
    For float16 configuration
    """

    enabled: bool = False
    """
    Enable fp16 mixed-precision training/inference
    """

    auto_cast: bool = False
    """
    Automatically cast inputs to fp16
    """

    loss_scale: float = 0
    """
    Loss scaling value. Default value of 0 means dynamic loss scaling instead of static loss scale.
    """

    initial_scale_power: int = 16
    """
    For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
    """

    loss_scale_window: int = 1000
    """
    Iteration intervals for raising/lowering dynamic loss scale value.
    """

    hysteresis: int = 2
    """
    Delay shift in dynamic loss scaling.
    """

    consecutive_hysteresis: bool = False
    """
    Refill hysteresis if iteration does not overflow/underflow.
    """

    min_loss_scale: int = 1
    """
    Minimum dynamic loss scale value.
    """

    fp16_master_weights_and_grads: bool = False
    """
    Maintain master weights in optimizer state as fp16 instead of fp32 (valid with DeepSpeedCPUAdam only).
    """

    def initial_dynamic_scale(self):
        return 2**self.initial_scale_power

    def dynamic_loss_scale_args(self):
        return {
            INITIAL_LOSS_SCALE: 2**self.initial_scale_power,
            SCALE_WINDOW: self.loss_scale_window,
            DELAYED_SHIFT: self.hysteresis,
            CONSECUTIVE_HYSTERESIS: self.consecutive_hysteresis,
            MIN_LOSS_SCALE: self.min_loss_scale,
        }
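

# Minimal illustrative sketch (not part of DeepSpeed's API): it shows how an fp16 config
# is built from a hypothetical ds_config dict and what the dynamic loss-scaling values
# look like with the defaults above.
if __name__ == "__main__":
    sample_ds_config = {"fp16": {"enabled": True}}  # hypothetical user config for this sketch
    fp16_cfg = get_float16_config(sample_ds_config)

    # With the default initial_scale_power of 16, the initial dynamic loss scale is 2**16 == 65536.
    print("initial dynamic scale:", fp16_cfg.initial_dynamic_scale())

    # Keyword arguments consumed by the dynamic loss scaler; the keys are the constants
    # imported from .fp16.loss_scaler at the top of this module.
    print("dynamic loss scale args:", fp16_cfg.dynamic_loss_scale_args())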