# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from deepspeed.runtime.config_utils import DeepSpeedConfigModel
from .fp16.loss_scaler import (
    INITIAL_LOSS_SCALE,
    SCALE_WINDOW,
    DELAYED_SHIFT,
    CONSECUTIVE_HYSTERESIS,
    MIN_LOSS_SCALE,
)

#########################################
# BFLOAT16 support
#########################################
# BFLOAT16 feature. By default, this feature is not enabled.
# Users can configure in ds_config.json as below example:
BFLOAT16_FORMAT = '''
BFLOAT16 parameters should be of the format:
"bf16": {
  "enabled": true,
  "immediate_grad_update": false,
  "check_grad_overflow": false
}
'''

BFLOAT16 = "bf16"
BFLOAT16_OLD = "bfloat16"  # keeping for backwards compatibility


def get_bfloat16_config(param_dict):
    bf16_config_dict = param_dict.get(BFLOAT16, None)
    if bf16_config_dict is None:
        bf16_config_dict = param_dict.get(BFLOAT16_OLD, {})
    return DeepSpeedBF16Config(**bf16_config_dict)


class DeepSpeedBF16Config(DeepSpeedConfigModel):
    """
    For bfloat16 configuration
    """

    enabled: bool = False
    """
    Enable bfloat16 mixed-precision training/inference
    """

    immediate_grad_update: bool = False
    """
    Apply gradient updates immediately rather than delayed.
    """

    check_grad_overflow: bool = False
    """
    Check for gradient overflows and underflows
    """


#########################################
# FP16 support
#########################################
# FP16 feature. By default, this feature is not enabled.
# Users can configure in ds_config.json as below example:
FP16_FORMAT = '''
FP16 parameters should be of the format:
"fp16": {
  "enabled": true,
  "auto_cast": false,
  "loss_scale": 0,
  "initial_scale_power": 16,
  "loss_scale_window": 1000,
  "hysteresis": 2,
  "consecutive_hysteresis": false,
  "min_loss_scale": 1
}
'''

FP16 = "fp16"


def get_float16_config(param_dict):
    fp16_config_dict = param_dict.get(FP16, {})
    return DeepSpeedFP16Config(**fp16_config_dict)


class DeepSpeedFP16Config(DeepSpeedConfigModel):
    """
    For float16 configuration
    """

    enabled: bool = False
    """
    Enable fp16 mixed-precision training/inference
    """

    auto_cast: bool = False
    """
    Automatically cast inputs to fp16
    """

    loss_scale: float = 0
    """
    Loss scaling value. Default value of 0 means dynamic loss scaling
    instead of static loss scale.
    """

    initial_scale_power: int = 16
    """
    For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
    """

    loss_scale_window: int = 1000
    """
    Iteration intervals for raising/lowering dynamic loss scale value.
    """

    hysteresis: int = 2
    """
    Delay shift in dynamic loss scaling.
    """

    consecutive_hysteresis: bool = False
    """
    Refill hysteresis if iteration does not overflow/underflow.
    """

    min_loss_scale: int = 1
    """
    Minimum dynamic loss scale value.
    """

    fp16_master_weights_and_grads: bool = False
    """
    Maintain master weights in optimizer state as fp16 instead of fp32
    (valid with DeepSpeedCPUAdam only).
    """

    def initial_dynamic_scale(self):
        return 2**self.initial_scale_power

    def dynamic_loss_scale_args(self):
        return {
            INITIAL_LOSS_SCALE: 2**self.initial_scale_power,
            SCALE_WINDOW: self.loss_scale_window,
            DELAYED_SHIFT: self.hysteresis,
            CONSECUTIVE_HYSTERESIS: self.consecutive_hysteresis,
            MIN_LOSS_SCALE: self.min_loss_scale,
        }
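

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the library API).
# The dict below is a hypothetical ds_config-style fragment; the values noted
# in the comments follow directly from the defaults defined above. Because of
# the relative import at the top of this file, this block only executes when
# the module is run as part of the package (e.g. via ``python -m``), not as a
# standalone script.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_param_dict = {
        "bf16": {
            "enabled": False
        },
        "fp16": {
            "enabled": True,
            "initial_scale_power": 12,
            "loss_scale_window": 500
        },
    }

    bf16_config = get_bfloat16_config(example_param_dict)
    fp16_config = get_float16_config(example_param_dict)

    # loss_scale defaults to 0, so dynamic loss scaling is in effect and the
    # initial scale is 2**initial_scale_power == 4096 for this example.
    print(bf16_config.enabled)  # False
    print(fp16_config.initial_dynamic_scale())  # 4096
    print(fp16_config.dynamic_loss_scale_args())  # dict keyed by loss-scaler constants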