import copy
import torch
import torch.nn as nn
import torch.nn.functional as F

from typing import Optional

from deepspeed.accelerator import get_accelerator
from deepspeed.ops.fp_quantizer import Quantizer, FP_Quantize
from .config import QuantizationConfig


class QuantizedParameter(nn.Parameter):
    """
    Quantized parameter class that implements weight quantization. Weights
    are stored in quantized form on GPUs and can be dequantized on-the-fly when
    needed by the model. The weights are quantized during any `.to(device)` call
    that places them on an accelerator.

    Arguments:
        data (Tensor): parameter tensor.
        requires_grad (bool, optional): whether the parameter requires a gradient.
            Defaults to False; True is not supported and raises a ValueError. The
            argument exists only for interface compatibility with torch.nn.Parameter.
        quantization_config (QuantizationConfig, optional): configuration describing
            the quantization format (q_bits, mantissa_bits, q_dtype). Defaults to a
            fresh QuantizationConfig.
        quantizer (Quantizer, optional): defaults to FP_Quantize, but can be any
            quantizer that implements deepspeed.ops.fp_quantizer.Quantizer. The
            argument is needed because the quantizer is stashed in the Parameter
            itself; some models clone the Parameter by passing its __dict__ as
            attributes. For an example, see
            tests/unit/linear/test_quant_param.py::TestQuantParam::test_hf_clone
    """

    def __new__(
        cls,
        data: Optional[torch.Tensor] = None,
        requires_grad: bool = False,
        quantization_config: Optional[QuantizationConfig] = None,
        quantizer: Optional[Quantizer] = None,
    ):
        if requires_grad:
            raise ValueError("requires_grad=True is not supported with QuantizedParameter")
        if data is None:
            data = torch.empty(0)
        self = torch.Tensor._make_subclass(cls, data, requires_grad)
        self.quantization_config = QuantizationConfig() if quantization_config is None else quantization_config
        if quantizer is not None:
            self.quantizer = quantizer
        else:
            # build the default FP quantizer from the quantization config
            self.quantizer = FP_Quantize(quantization_config=self.quantization_config)
        self._ensure_quantized(self)
        return self

    def _ensure_quantized(self, tensor: torch.Tensor):
        # If the tensor lives on the accelerator and is not yet in the quantized
        # dtype, quantize it in place on the tensor's current stream.
        if get_accelerator().on_accelerator(tensor) and tensor.dtype != self.quantization_config.q_dtype:
            with get_accelerator().stream(get_accelerator().current_stream(tensor.device)):
                tensor.data = self.quantizer.quantize(tensor.data,
                                                      q_bits=self.quantization_config.q_bits,
                                                      q_mantisa_bits=self.quantization_config.mantissa_bits)
            assert tensor.dtype == self.quantization_config.q_dtype

    def dequantized(self) -> torch.Tensor:
        """
        Return a tensor containing the dequantized weights of this parameter.
        """
        if get_accelerator().on_accelerator(self.data) and self.data.dtype == self.quantization_config.q_dtype:
            with get_accelerator().stream(get_accelerator().current_stream(self.data.device)):
                return self.quantizer.dequantize(self.data,
                                                 q_bits=self.quantization_config.q_bits,
                                                 q_mantisa_bits=self.quantization_config.mantissa_bits)
        return self.data

    def offload(self, revert=False):
        if getattr(self, 'ds_offload', False):
            if revert:
                self.data = self.to(get_accelerator().current_device_name())
            else:
                self.data = self.to('cpu')
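
    # Offload flow (a sketch inferred from the method above): an external owner
    # sets `qp.ds_offload = True`, calls `qp.offload()` to park the weights on
    # CPU, and later `qp.offload(revert=True)` to restore them to the accelerator.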

    def __getstate__(self):
        state = self.__dict__.copy()  # copy so pickling does not mutate the live instance dict
        state["data"] = self.data
        state["quantization_config"] = self.quantization_config
        state["requires_grad"] = self.requires_grad
        return state

    def __setstate__(self, state):
        self.quantizer = state["quantizer"]
        self.quantization_config = state["quantization_config"]
        self.data = state["data"]
        self.requires_grad = state["requires_grad"]

    def __deepcopy__(self, memo):
        new_instance = type(self).__new__(type(self))
        state = self.__getstate__()
        new_instance.__setstate__(state)
        new_instance.quantizer = copy.deepcopy(state["quantizer"])
        new_instance.quantization_config = copy.deepcopy(state["quantization_config"])
        new_instance.data = copy.deepcopy(state["data"])
        return new_instance

    def __copy__(self):
        new_instance = type(self).__new__(type(self))
        state = self.__getstate__()
        new_instance.__setstate__(state)
        return new_instance
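
    # Copy semantics (sketch): `copy.copy(qp)` shares the quantizer, config, and
    # data with the original, while `copy.deepcopy(qp)` duplicates all three so
    # the clone can be moved or re-quantized independently.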

    def cuda(self, device=None, non_blocking=False):
        device = "cuda" if device is None else device
        self.quantizer.to(device, non_blocking=non_blocking)
        return self.to(device, non_blocking=non_blocking)

    def to(self, *args, **kwargs):
        """
        Move the parameter to the given device. If the destination is an
        accelerator device, ensure the weights are quantized afterwards.
        """
        tensor = super().to(*args, **kwargs)
        self.quantizer.to(*args, **kwargs)
        self._ensure_quantized(tensor)
        return tensor


class QuantizedLinear(nn.Linear):
    """
    Linear layer that implements weight quantization. Parameters
    are stored via `QuantizedParameter` and are dequantized on-the-fly during any
    forward pass.
    """

    def __init__(self,
                 input_dim: int,
                 output_dim: int,
                 bias: bool = False,
                 quantization_config: Optional[QuantizationConfig] = None,
                 dtype=torch.bfloat16):
        super().__init__(input_dim, output_dim, bias=bias, dtype=dtype)
        assert dtype == torch.bfloat16, "currently only supports bfloat16 dtype"
        self.weight = QuantizedParameter(self.weight.data, quantization_config=quantization_config)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return F.linear(input, self.weight.dequantized(), self.bias)
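

# A minimal usage sketch (not part of the original module; assumes a CUDA-like
# accelerator and a DeepSpeed build where the FP quantizer extension loads):
if __name__ == "__main__":
    device = get_accelerator().current_device_name()
    layer = QuantizedLinear(128, 256).to(device)  # weights quantize during .to()
    x = torch.randn(4, 128, dtype=torch.bfloat16, device=device)
    y = layer(x)  # weights are dequantized on the fly inside forward()
    print(y.shape, layer.weight.dtype)  # weight stays in the quantized dtype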