from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelConfig:
    """
    Configuration class for the models.

    Using [`~transformers.HfArgumentParser`] we can turn this class into
    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
    command line.
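
    Example (a minimal sketch of command-line parsing; the script name and flags below are illustrative, and it is
    assumed that `ModelConfig` is importable in the script):

    ```python
    from transformers import HfArgumentParser

    # Launched as, e.g.: python train.py --model_name_or_path <checkpoint> --use_peft --lora_r 8
    parser = HfArgumentParser(ModelConfig)
    (model_config,) = parser.parse_args_into_dataclasses()
    print(model_config.use_peft, model_config.lora_r)
    ```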

    Parameters:
        model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
            Model checkpoint for weights initialization.
        model_revision (`str`, *optional*, defaults to `"main"`):
            Specific model version to use. It can be a branch name, a tag name, or a commit id.
        torch_dtype (`Literal["auto", "bfloat16", "float16", "float32"]` or `None`, *optional*, defaults to `None`):
            Override the default `torch.dtype` and load the model under this dtype. Possible values are

                - `"bfloat16"`: `torch.bfloat16`
                - `"float16"`: `torch.float16`
                - `"float32"`: `torch.float32`
                - `"auto"`: Automatically derive the dtype from the model's weights.

        trust_remote_code (`bool`, *optional*, defaults to `False`):
            Whether to allow for custom models defined on the Hub in their own modeling files. This option should only
            be set to `True` for repositories you trust and in which you have read the code, as it will execute code
            present on the Hub on your local machine.
        attn_implementation (`str` or `None`, *optional*, defaults to `None`):
            Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in which case
            you must install this manually by running `pip install flash-attn --no-build-isolation`.
        use_peft (`bool`, *optional*, defaults to `False`):
            Whether to use PEFT for training.
        lora_r (`int`, *optional*, defaults to `16`):
            LoRA R value.
        lora_alpha (`int`, *optional*, defaults to `32`):
            LoRA alpha.
        lora_dropout (`float`, *optional*, defaults to `0.05`):
            LoRA dropout.
        lora_target_modules (`Union[str, list[str]]` or `None`, *optional*, defaults to `None`):
            LoRA target modules.
        lora_modules_to_save (`list[str]` or `None`, *optional*, defaults to `None`):
            Model layers to unfreeze & train.
        lora_task_type (`str`, *optional*, defaults to `"CAUSAL_LM"`):
            Task type to pass for LoRA (use `"SEQ_CLS"` for reward modeling).
        use_rslora (`bool`, *optional*, defaults to `False`):
            Whether to use Rank-Stabilized LoRA, which sets the adapter scaling factor to `lora_alpha/√r`, instead of
            the original default value of `lora_alpha/r`.
        use_dora (`bool`, *optional*, defaults to `False`):
            Enable [Weight-Decomposed Low-Rank Adaptation (DoRA)](https://huggingface.co/papers/2402.09353). This
            technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is
            handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can
            improve the performance of LoRA, especially at low ranks. Right now, DoRA only supports linear and Conv2D
            layers. DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for
            inference.
        load_in_8bit (`bool`, *optional*, defaults to `False`):
            Whether to use 8 bit precision for the base model. Works only with LoRA.
        load_in_4bit (`bool`, *optional*, defaults to `False`):
            Whether to use 4 bit precision for the base model. Works only with LoRA.
        bnb_4bit_quant_type (`str`, *optional*, defaults to `"nf4"`):
            Quantization type (`"fp4"` or `"nf4"`).
        use_bnb_nested_quant (`bool`, *optional*, defaults to `False`):
            Whether to use nested quantization.
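
    The sketch below shows one way these options might be wired into `peft.LoraConfig` and
    `transformers.BitsAndBytesConfig`; the exact mapping is up to the training script, and the objects built here are
    illustrative rather than prescriptive:

    ```python
    from peft import LoraConfig
    from transformers import BitsAndBytesConfig

    config = ModelConfig(use_peft=True, load_in_4bit=True, lora_task_type="CAUSAL_LM")

    # Build a LoRA adapter config only when PEFT is requested.
    peft_config = None
    if config.use_peft:
        peft_config = LoraConfig(
            r=config.lora_r,
            lora_alpha=config.lora_alpha,
            lora_dropout=config.lora_dropout,
            target_modules=config.lora_target_modules,
            modules_to_save=config.lora_modules_to_save,
            task_type=config.lora_task_type,
            use_rslora=config.use_rslora,
            use_dora=config.use_dora,
        )

    # Build a bitsandbytes quantization config only when 4-bit or 8-bit loading is requested.
    quantization_config = None
    if config.load_in_4bit or config.load_in_8bit:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=config.load_in_4bit,
            load_in_8bit=config.load_in_8bit,
            bnb_4bit_quant_type=config.bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=config.use_bnb_nested_quant,
        )
    ```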
|
""" |
|
|
|
model_name_or_path: Optional[str] = field( |
|
default=None, |
|
metadata={"help": "Model checkpoint for weights initialization."}, |
|
) |
|
model_revision: str = field( |
|
default="main", |
|
metadata={"help": "Specific model version to use. It can be a branch name, a tag name, or a commit id."}, |
|
) |
|
torch_dtype: Optional[str] = field( |
|
default=None, |
|
metadata={ |
|
"help": "Override the default `torch.dtype` and load the model under this dtype.", |
|
"choices": ["auto", "bfloat16", "float16", "float32"], |
|
}, |
|
) |
|
trust_remote_code: bool = field( |
|
default=False, |
|
metadata={ |
|
"help": "Whether to allow for custom models defined on the Hub in their own modeling files. This option " |
|
"should only be set to `True` for repositories you trust and in which you have read the code, as it will " |
|
"execute code present on the Hub on your local machine." |
|
}, |
|
) |
|
attn_implementation: Optional[str] = field( |
|
default=None, |
|
metadata={ |
|
"help": "Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in " |
|
"which case you must install this manually by running `pip install flash-attn --no-build-isolation`." |
|
}, |
|
) |
|
use_peft: bool = field( |
|
default=False, |
|
metadata={"help": "Whether to use PEFT for training."}, |
|
) |
|
lora_r: int = field( |
|
default=16, |
|
metadata={"help": "LoRA R value."}, |
|
) |
|
lora_alpha: int = field( |
|
default=32, |
|
metadata={"help": "LoRA alpha."}, |
|
) |
|
lora_dropout: float = field( |
|
default=0.05, |
|
metadata={"help": "LoRA dropout."}, |
|
) |
|
lora_target_modules: Optional[list[str]] = field( |
|
default=None, |
|
metadata={"help": "LoRA target modules."}, |
|
) |
|
lora_modules_to_save: Optional[list[str]] = field( |
|
default=None, |
|
metadata={"help": "Model layers to unfreeze & train."}, |
|
) |
|
lora_task_type: str = field( |
|
default="CAUSAL_LM", |
|
metadata={"help": "Task type to pass for LoRA (use 'SEQ_CLS' for reward modeling)."}, |
|
) |
|
use_rslora: bool = field( |
|
default=False, |
|
metadata={ |
|
"help": "Whether to use Rank-Stabilized LoRA, which sets the adapter scaling factor to `lora_alpha/βr`, " |
|
"instead of the original default value of `lora_alpha/r`." |
|
        },
    )
    use_dora: bool = field(
        default=False,
        metadata={
            "help": "Enable Weight-Decomposed Low-Rank Adaptation (DoRA). This technique decomposes the updates of "
            "the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the "
            "magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, "
            "especially at low ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a "
            "bigger overhead than pure LoRA, so it is recommended to merge weights for inference."
        },
    )
    load_in_8bit: bool = field(
        default=False,
        metadata={"help": "Whether to use 8 bit precision for the base model. Works only with LoRA."},
    )
    load_in_4bit: bool = field(
        default=False,
        metadata={"help": "Whether to use 4 bit precision for the base model. Works only with LoRA."},
    )
    bnb_4bit_quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization type.", "choices": ["fp4", "nf4"]},
    )
    use_bnb_nested_quant: bool = field(
        default=False,
        metadata={"help": "Whether to use nested quantization."},
    )
|
    def __post_init__(self):
        # 8-bit and 4-bit quantization are mutually exclusive.
        if self.load_in_8bit and self.load_in_4bit:
            raise ValueError("You can't use 8 bit and 4 bit precision at the same time")

        # If exactly one target module was provided (e.g. as a one-element list from the command line),
        # unwrap it so downstream code receives the single value directly.
        if hasattr(self.lora_target_modules, "__len__") and len(self.lora_target_modules) == 1:
            self.lora_target_modules = self.lora_target_modules[0]