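"""
Layerwise gradient scaling utilities. LayerInfo records per-layer scaling state,
GradientHelper provides the backward hooks that rescale gradients, and
LayerwiseGradientScaler drives the scale/unscale/step workflow described in its
docstring below.
"""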
import logging
from typing import List, Tuple

import torch
import torch.nn as nn


class LayerInfo:
    """
    A class that records the per-layer state used by LayerwiseGradientScaler.
    """

    def __init__(self, name: str, layer: nn.Module, scale: float = 1.0, scale_layer: bool = False) -> None:
        """
        layer_name: name of the layer e.g. fc1, conv1, relu1
        layer: type of the layer e.g. Linear, Conv2d, ReLU
        scaling_factor: user configurable scaling factor for the layer, defaults to 1.0
        found_inf_or_nan: a boolean indicating if any parameter of layer's gradient contains inf/nan
        growth_tracker: tracks number of step since last time scale was increased
        scale_layer: a boolean indicating if the layer should be scaled or not
        """
        self.layer_name = name
        self.layer = layer
        self.scaling_factor = scale
        self.found_inf_or_nan = False
        self.growth_tracker = 0
        self.scale_layer = scale_layer


class GradientHelper:
    """
    A helper class to create instances of backward hooks. The hooks are registered in the
    scale method of LayerwiseGradientScaler.
    """

    def __init__(self, name: str, inputs_multiplier: float, outputs_multiplier: float):
        self.layer_name = name
        self.inputs_multiplier = inputs_multiplier
        self.outputs_multiplier = outputs_multiplier

    def scale_gradients(self, m: nn.Module, inputs: Tuple, outputs: Tuple) -> Tuple[torch.Tensor, ...]:
        """
        Backward hook that is attached to the layers to scale the gradients.
        """
        scaled_up_grads = []
        for idx in range(len(inputs)):
            if inputs[idx] is not None:
                if self.inputs_multiplier != 1.0 or self.outputs_multiplier != 1.0:
                    logging.debug(
                        "layer = %s \t inputs_multiplier = %s \t outputs_multiplier = %s"
                        % (self.layer_name, self.inputs_multiplier, self.outputs_multiplier)
                    )
                scaled_up_grads.append(inputs[idx].mul(self.inputs_multiplier * self.outputs_multiplier))
            else:
                logging.debug("grad input at index %s is None" % idx)
                scaled_up_grads.append(inputs[idx])
        return tuple(scaled_up_grads)  # type: ignore


class LayerwiseGradientScaler:
    """
    LayerwiseGradientScaler enables using distinct scaling factors for each layer
    of the network.

    Example:

        # Create a convolutional network
        class ConvNet(nn.Module):
            def __init__(self):
                ...

            def forward(self, x):
                ...

        # Create an instance of the model
        model = ConvNet()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

        # Specify the layers to scale and their scaling factors
        layer_scale_dict = {"conv1": 2**10, "conv2": 2**8, "fc1": 2**10, "fc2": 2**9}
        scaler = LayerwiseGradientScaler(model, layer_scale_dict)

        for epoch in range(num_epochs):
            for inputs, targets in dataloader:
                optimizer.zero_grad()

                # register the backward hooks that scale the gradients
                scaler.scale()

                # enable mixed precision training
                with autocast():
                    predictions = model(inputs)
                    loss = loss_function(predictions, targets)

                loss.backward()

                # unscale the gradients and remove the hooks
                scaler.unscale()

                # a step is taken only if no inf/nan is found in the gradients;
                # the scaling factor of each layer is then updated
                scaler.step(optimizer)

    Args:
        model                   : instance of a Model class, such as ConvNet above
        layer_scale_dict (dict) : dictionary with key = layer_name and value = scaling_factor
        growth_factor (float)   : multiplier applied to a layer's scaling factor after growth_interval
                                  consecutive steps without inf/nan
        backoff_factor (float)  : multiplier applied to a layer's scaling factor when an inf/nan is found
        growth_interval (int)   : number of steps after which the scale is multiplied by growth_factor
        min_scale (float)       : smallest allowed scaling factor
        max_scale (float)       : largest allowed scaling factor
    """

    def __init__(  # type: ignore
        self,
        model,
        layer_scale_dict: dict,
        growth_factor: float = 2.0,
        backoff_factor: float = 0.5,
        growth_interval: int = 10000,
        min_scale: float = torch.finfo(torch.float32).tiny,  # type: ignore
        max_scale: float = torch.finfo(torch.float32).max,  # type: ignore
    ) -> None:
        self._model = model
        self._layer_scale_dict: dict = layer_scale_dict
        self._growth_factor: float = growth_factor
        self._backoff_factor: float = backoff_factor
        self._growth_interval: int = growth_interval
        self._apply_layerwise_scaling: bool = len(layer_scale_dict) > 0
        self._min_scale = min_scale
        self._max_scale = max_scale
        self._handles: List = []
        self.layer_info: List = []

        if self._apply_layerwise_scaling:
            assert self._growth_factor > 1.0, "The growth factor must be > 1.0."
            assert self._backoff_factor < 1.0, "The backoff factor must be < 1.0."
            self.layer_info = self._build_layer_info()

    def _build_layer_info(self) -> List:
        """
        Helper function to create a list of LayerInfo instances.
        """
        layer_info_list = []

        for name, layer in self._model.named_modules():
            if name != "":
                if name not in self._layer_scale_dict:
                    logging.debug("name = %s, layer = %s, scaling_factor = %s" % (name, layer, 1.0))
                    layer_info_list.append(LayerInfo(name, layer, 1.0))
                else:
                    logging.debug(
                        "name = %s, layer = %s, scaling_factor = %s" % (name, layer, self._layer_scale_dict[name])
                    )
                    layer_info_list.append(LayerInfo(name, layer, self._layer_scale_dict[name], True))
        return layer_info_list

    def scale(self) -> None:
        """
        For each layer calculates the scaling factor for preceding layer's grad inputs
        and current layer's grad outputs. These values are used to register a full backward
        hook. The handle returned from registering the backward hook is appended to a list
        of handles. New hooks are created and registered at every step and a new list of
        handles is created. The handles are flushed out in the unscale function.
        """
        if not self._apply_layerwise_scaling:
            return

        for idx in range(len(self.layer_info)):
            elt = self.layer_info[idx]
            layer_name, layer = elt.layer_name, elt.layer

            inputs_multiplier = 1.0
            if idx > 0:
                inputs_multiplier = self.layer_info[idx - 1].scaling_factor

            outputs_multiplier = 1.0 / elt.scaling_factor
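            # The resulting hook multiplies this layer's grad inputs (which flow back to
            # the preceding layer) by the preceding layer's scaling factor and divides
            # them by this layer's own factor.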
            helper = GradientHelper(layer_name, inputs_multiplier, outputs_multiplier)
            layer_handle = layer.register_full_backward_hook(helper.scale_gradients)
            self._handles.append(layer_handle)
            logging.debug("name = %s \t scale = %s" % (layer_name, elt.scaling_factor))

    def _get_layers_with_finite_values(self) -> List[LayerInfo]:
        layers_with_finite_values: List = []
        for item in self.layer_info:
            if not item.found_inf_or_nan:
                layers_with_finite_values.append(item)
        return layers_with_finite_values

    def unscale(self) -> None:
        """
        For each layer, check if any of the layer's parameters contain an inf/nan.
        If there are no inf/nan in the gradient, then gradient of that layer is
        unscaled by the reciprocal of the scaling factor for that layer.
        Finally, all handles recorded while registering the hooks are deleted.
        """
        if not self._apply_layerwise_scaling:
            return

        layers_with_finite_values = self._get_layers_with_finite_values()
        for item in layers_with_finite_values:
            for param_name, param in item.layer.named_parameters():
                if hasattr(param, "grad") and param.grad is not None:
                    logging.debug("%s scaling down %s by %s" % (item.layer_name, param_name, 1.0 / item.scaling_factor))
                    param.grad.mul_(1.0 / item.scaling_factor)

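        # Remove the hooks registered in scale; a fresh set is registered on the next call.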
        while len(self._handles) > 0:
            elt = self._handles.pop()
            elt.remove()

    def _check_for_inf_or_nan(self) -> None:
        """
        For each layer, check if any of the parameters with a gradient attribute
        contain an inf/nan. If any of the parameters' gradient contain an inf/nan,
        then that layer's found_inf_or_nan attribute is set to True and all
        remaining parameters for that layer are skipped.
        """
        for elt in self.layer_info:
            elt.found_inf_or_nan = False
            for _, param in elt.layer.named_parameters():
                if hasattr(param, "grad") and param.grad is not None:
                    if torch.isinf(param.grad).any().item() or torch.isnan(param.grad).any().item():  # type: ignore
                        elt.found_inf_or_nan = True
                        break  # skip all remaining named parameters

    def step(self, optimizer) -> None:  # type: ignore
        """
        If there are no inf/nan in the gradients' of all layers, then optimizer
        takes a step, otherwise not. Update the scaling factor for each layer.
        """
        # using layerwise gradient scaling
        if self._apply_layerwise_scaling:
            self._check_for_inf_or_nan()
            inf_nan_found = any(elt.found_inf_or_nan for elt in self.layer_info)

            if not inf_nan_found:
                optimizer.step()
            self._update_scale()
        # not using layerwise gradient scaling
        else:
            optimizer.step()

    def _update_scale(self) -> None:
        """
        For each layer, if an inf/nan is found, then multiply the scaling factor
        of that layer by the backoff factor and set the growth tracker of that
        layer to 0. Else, increment the growth tracker of the layer. If growth
        tracker equals the growth interval, then multiply the scaling factor of
        the layer by the growth factor and reset the layer's growth tracker to 0.
        Finally, clip the scaling factor to the range
        [self.min_scaling_factor, self.max_scaling_factor]. The min/max scaling
        factor values are user configurable.
        """
        if not self._apply_layerwise_scaling:
            return

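        # Only layers explicitly listed in layer_scale_dict (scale_layer=True) have their
        # scaling factor adjusted; every other layer keeps a factor of 1.0.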
        for layer in self.layer_info:
            if layer.found_inf_or_nan:
                if layer.scale_layer:
                    layer.scaling_factor = max(
                        self._min_scale,
                        min(self._backoff_factor * layer.scaling_factor, self._max_scale),
                    )
                    layer.growth_tracker = 0
            else:
                layer.growth_tracker += 1
                if layer.scale_layer and layer.growth_tracker == self._growth_interval:
                    layer.scaling_factor = max(
                        self._min_scale,
                        min(self._growth_factor * layer.scaling_factor, self._max_scale),
                    )
                    layer.growth_tracker = 0

    def get_layer_info(self) -> List[LayerInfo]:
        """
        Returns a list of LayerInfo instances of the model.
        """
        return self.layer_info

    def get_backward_hooks(self) -> List:
        """
        Returns a list of tuples. Each tuple contains the layer name and the
        hook attached to it.
        """
        layer_name_and_hooks = []
        for name, layer in self._model.named_modules():
            if name != "":
                layer_name_and_hooks.append((name, layer._get_backward_hooks()))
        return layer_name_and_hooks
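

if __name__ == "__main__":
    # A minimal, self-contained sketch of the workflow from the LayerwiseGradientScaler
    # docstring above. The tiny model, layer names, scaling factors, learning rate and
    # random data below are illustrative placeholders, not part of the library.
    class TinyNet(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.fc1 = nn.Linear(8, 4)
            self.relu1 = nn.ReLU()
            self.fc2 = nn.Linear(4, 2)

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self.fc2(self.relu1(self.fc1(x)))

    model = TinyNet()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scaler = LayerwiseGradientScaler(model, {"fc1": 2**10, "fc2": 2**8})

    inputs, targets = torch.randn(16, 8), torch.randn(16, 2)
    optimizer.zero_grad()
    scaler.scale()  # register the per-layer backward hooks for this step
    loss = nn.functional.mse_loss(model(inputs), targets)
    loss.backward()
    scaler.unscale()  # unscale each layer's gradients and remove the hooks
    scaler.step(optimizer)  # step only if every layer's gradients are finite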