File size: 7,822 Bytes
e0be88b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import unittest

from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitNetQuantConfig,
    OPTForCausalLM,
)
from transformers.testing_utils import (
    backend_empty_cache,
    require_accelerate,
    require_torch_gpu,
    slow,
    torch_device,
)
from transformers.utils import is_accelerate_available, is_torch_available


if is_torch_available():
    import torch

if is_accelerate_available():
    from accelerate import init_empty_weights


@require_torch_gpu
class BitNetQuantConfigTest(unittest.TestCase):
    """Sanity checks for the dictionary form of `BitNetQuantConfig`."""

    def test_to_dict(self):
        """
        Every entry returned by `to_dict()` must equal the attribute of the same name on the config object.
        """
        bitnet_config = BitNetQuantConfig()
        serialized = bitnet_config.to_dict()

        # Walk key/value pairs directly instead of re-indexing the dict for each key.
        for attr_name, attr_value in serialized.items():
            self.assertEqual(getattr(bitnet_config, attr_name), attr_value)


@slow
@require_torch_gpu
@require_accelerate
class BitNetTest(unittest.TestCase):
    """
    Tests for the BitNet integration: layer replacement, generation with a pre-quantized
    checkpoint, weight packing/unpacking, activation quantization and loaded weight dtypes.
    """

    model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"

    # called only once for all tests in this class
    @classmethod
    def setUpClass(cls):
        """
        Load the tokenizer and the quantized model shared by every test in this class.
        """
        cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name)
        cls.quantized_model = AutoModelForCausalLM.from_pretrained(cls.model_name, device_map=torch_device)

    @classmethod
    def tearDownClass(cls):
        """
        Drop the class-level references to the model and tokenizer so their memory can be
        reclaimed before other test classes (which load their own copy of the model) run.
        """
        del cls.quantized_model
        del cls.tokenizer
        gc.collect()
        backend_empty_cache(torch_device)

    def tearDown(self):
        """
        Release per-test garbage and empty the accelerator cache after each test.
        """
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()

    def test_replace_with_bitlinear(self):
        """
        Check that `replace_with_bitnet_linear` turns all but one `torch.nn.Linear` of an OPT model into `BitLinear`.
        """
        from transformers.integrations import BitLinear, replace_with_bitnet_linear

        model_id = "facebook/opt-350m"
        config = AutoConfig.from_pretrained(model_id)

        # Build the model under `init_empty_weights` so no real weights are allocated.
        with init_empty_weights():
            model = OPTForCausalLM(config)

        nb_linears = sum(isinstance(module, torch.nn.Linear) for module in model.modules())

        model = replace_with_bitnet_linear(model)
        nb_bitnet_linear = sum(isinstance(module, BitLinear) for module in model.modules())

        # Exactly one linear layer is expected to stay unconverted
        # (presumably the LM head -- TODO confirm against `replace_with_bitnet_linear` defaults).
        self.assertEqual(nb_linears - 1, nb_bitnet_linear)

    def test_quantized_model(self):
        """
        Simple test that checks if the quantized model is working properly
        """
        input_text = "What are we having for dinner?"
        expected_output = "What are we having for dinner? What are we going to do for fun this weekend?"
        input_ids = self.tokenizer(input_text, return_tensors="pt").to(torch_device)

        # Greedy decoding keeps the generated continuation deterministic.
        output = self.quantized_model.generate(**input_ids, max_new_tokens=11, do_sample=False)
        self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), expected_output)

    def test_packing_unpacking(self):
        """
        Simple test of the packing and unpacking logic: unpacking then re-packing must be the identity
        """

        from transformers.integrations import pack_weights, unpack_weights

        # `high` is exclusive in `torch.randint`, so 256 is needed to cover every uint8 value (0..255).
        u = torch.randint(0, 256, (256, 256), dtype=torch.uint8)
        unpacked_u = unpack_weights(u, dtype=torch.bfloat16)
        repacked_u = pack_weights(unpacked_u)

        # One tensor-level comparison (shape and values) instead of 65,536 per-element assertions.
        self.assertTrue(torch.equal(repacked_u, u))

    def test_activation_quant(self):
        """
        Test the behaviour of the activation quantization function
        """

        from transformers.integrations import BitLinear

        layer = BitLinear(in_features=4, out_features=2, bias=False, dtype=torch.float32)
        layer.to(torch_device)

        input_tensor = torch.tensor([1.0, -1.0, -1.0, 1.0], dtype=torch.float32).to(torch_device)

        # Quantize the input tensor
        quantized_tensor, scale = layer.activation_quant(input_tensor)

        # Verify the output quantized tensor: dividing by the scale must give back the input exactly
        self.assertTrue(torch.equal(quantized_tensor / scale, input_tensor))

        # Verify the scale tensor
        self.assertEqual(scale, 127)

    def test_weights_dtype(self):
        """
        Test the weights dtype after loading: every attention/MLP projection of the first layer must be uint8
        """

        first_layer = self.quantized_model.model.layers[0]
        projections = {
            "self_attn.q_proj": first_layer.self_attn.q_proj,
            "self_attn.k_proj": first_layer.self_attn.k_proj,
            "self_attn.v_proj": first_layer.self_attn.v_proj,
            "self_attn.o_proj": first_layer.self_attn.o_proj,
            "mlp.up_proj": first_layer.mlp.up_proj,
            "mlp.gate_proj": first_layer.mlp.gate_proj,
            "mlp.down_proj": first_layer.mlp.down_proj,
        }

        # `subTest` reports which projection has the wrong dtype instead of stopping at the first failure.
        for name, projection in projections.items():
            with self.subTest(projection=name):
                self.assertEqual(projection.weight.dtype, torch.uint8)

    def test_replace_with_bitlinear_shape(self):
        """
        Test that the BitNet layer weight shapes are correct, and the weight_scale is correctly initialized to 1
        """

        from transformers.integrations import replace_with_bitnet_linear

        out_features = 1024
        in_features = 512

        class SimpleLinearModule(torch.nn.Module):
            """
            Simple class to test BitLinear
            """

            def __init__(
                self,
                in_features: int = in_features,
                out_features: int = out_features,
                bias: bool = False,
            ):
                super().__init__()
                self.linear = torch.nn.Linear(in_features=in_features, out_features=out_features, bias=bias)

            def forward(self, x):
                return self.linear(x)

        model = SimpleLinearModule()
        replace_with_bitnet_linear(model)

        # The replaced layer is expected to store a weight with a quarter of the output rows
        # (presumably four 2-bit values packed per uint8 -- TODO confirm against `pack_weights`).
        self.assertEqual(list(model.linear.weight.shape), [out_features // 4, in_features])
        self.assertEqual(model.linear.weight_scale, 1)


@slow
@require_torch_gpu
@require_accelerate
class BitNetSerializationTest(unittest.TestCase):
    """Round-trip test: a saved and re-loaded BitNet model must produce the same logits."""

    def test_model_serialization(self):
        """
        Save the quantized model, load it back, and check that the logits on a fixed input are unchanged.
        """
        import tempfile

        model_name = "HF1BitLLM/Llama3-8B-1.58-100B-tokens"
        quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch_device)
        input_tensor = torch.zeros((1, 8), dtype=torch.int32, device=torch_device)

        with torch.no_grad():
            logits_ref = quantized_model.forward(input_tensor).logits

        # Use a temporary directory so the checkpoint is removed after the test instead of
        # being left behind as `./quant_model` in the current working directory.
        with tempfile.TemporaryDirectory() as saved_model_dir:
            # Save
            quantized_model.save_pretrained(saved_model_dir)

            # Remove old model before loading the new one to keep peak memory down
            del quantized_model
            gc.collect()
            backend_empty_cache(torch_device)

            # Load from the same directory that was just written and check if the logits match
            model_loaded = AutoModelForCausalLM.from_pretrained(saved_model_dir, device_map=torch_device)

            with torch.no_grad():
                logits_loaded = model_loaded.forward(input_tensor).logits

        self.assertEqual((logits_loaded - logits_ref).abs().mean().item(), 0)