File size: 6,425 Bytes
9c6594c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib.metadata
from typing import TYPE_CHECKING
from packaging import version
from .base import HfQuantizer
if TYPE_CHECKING:
from ..modeling_utils import PreTrainedModel
from ..utils import is_accelerate_available, is_auto_awq_available, is_torch_available, logging
from ..utils.quantization_config import AWQLinearVersion
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
class AwqQuantizer(HfQuantizer):
"""
4-bit quantization for Activation-aware Weight Quantization(AWQ) (https://arxiv.org/abs/2306.00978)
"""
# AWQ requires data callibration - we support only inference
requires_calibration = True
required_packages = ["awq", "accelerate"]
def __init__(self, quantization_config, **kwargs):
super().__init__(quantization_config, **kwargs)
def validate_environment(self, device_map, **kwargs):
if not is_auto_awq_available():
raise ImportError("Loading an AWQ quantized model requires auto-awq library (`pip install autoawq`)")
if not is_accelerate_available():
raise ImportError("Loading an AWQ quantized model requires accelerate (`pip install accelerate`)")
if self.quantization_config.version == AWQLinearVersion.IPEX:
if version.parse(importlib.metadata.version("autoawq")) < version.parse("0.2.6"):
raise RuntimeError(
"To use IPEX backend, you need autoawq>0.6.2. Please install the latest version or from source."
)
if (
device_map is not None
and isinstance(device_map, dict)
and (torch.device("cpu") not in device_map.values() or len(device_map.values()) > 1)
):
raise ValueError(
"You are attempting to load an IPEX version AWQ model with a device_map that contains more than CPU."
" This is not supported. Please make sure only cpu in the device_map."
)
else:
if not torch.cuda.is_available():
raise RuntimeError(
"GPU is required to run AWQ quantized model. You can use IPEX version AWQ if you have an Intel CPU"
)
if device_map is None:
logger.warning_once(
"You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set "
"your model on a GPU device in order to run your model."
)
elif device_map is not None:
if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()):
raise ValueError(
"You are attempting to load an AWQ model with a device_map that contains a CPU or disk device."
" This is not supported. Please remove the CPU or disk device from the device_map."
)
def update_torch_dtype(self, torch_dtype):
if torch_dtype is None:
torch_dtype = torch.float16
elif torch_dtype != torch.float16:
logger.warning("We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.")
return torch_dtype
def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
from ..integrations import get_keys_to_not_convert, replace_quantization_scales, replace_with_awq_linear
self.modules_to_not_convert = get_keys_to_not_convert(model)
if self.quantization_config.modules_to_not_convert is not None:
self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert)
model, has_been_replaced = replace_with_awq_linear(
model, quantization_config=self.quantization_config, modules_to_not_convert=self.modules_to_not_convert
)
model = replace_quantization_scales(model, model.config.model_type)
if not has_been_replaced:
logger.warning(
"You are loading an AWQ model but no linear modules were found in your model."
" Please double check your model architecture, or submit an issue on github if you think this is a bug."
)
def _process_model_after_weight_loading(self, model):
if self.quantization_config.do_fuse:
from ..integrations import fuse_awq_modules
model = fuse_awq_modules(model, self.quantization_config)
model._awq_is_fused = True # TODO: consider storing this flag in model.config instead
if self.quantization_config.version == AWQLinearVersion.EXLLAMA:
from ..integrations import post_init_awq_exllama_modules
model = post_init_awq_exllama_modules(model, self.quantization_config.exllama_config)
if self.quantization_config.version == AWQLinearVersion.IPEX:
from ..integrations import post_init_awq_ipex_modules
model = post_init_awq_ipex_modules(model)
def is_serializable(self, safe_serialization=None):
# AWQ through auto-awq has been always serializable, except if the model is fused.
if self.quantization_config.do_fuse:
logger.warning("You cannot save an AWQ model that uses fused modules!")
return False
if self.quantization_config.version == AWQLinearVersion.EXLLAMA:
logger.warning("You cannot save an AWQ model that uses Exllama backend!")
return False
return True
@property
def is_trainable(self):
# AWQ supports PEFT fine-tuning from version 0.2.0
MIN_AWQ_VERSION_FOR_PEFT = "0.2.0"
return version.parse(importlib.metadata.version("autoawq")) >= version.parse(MIN_AWQ_VERSION_FOR_PEFT)
|