qwen4bit / transformers_config.json
{
  "model_config": {
    "model_name_or_path": "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
    "use_cache": false,
    "rope_scaling": {
      "type": "dynamic",
      "factor": 2.0
    }
  },
  "training_config": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-5,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "optim": "adamw_torch",
    "max_grad_norm": 0.3,
    "max_seq_length": 2048,
    "logging_steps": 10,
    "save_steps": 200,
    "save_total_limit": 3,
    "evaluation_strategy": "steps",
    "eval_steps": 200,
    "load_best_model_at_end": true,
    "output_dir": "fine_tuned_model",
    "disable_tqdm": false,
    "report_to": ["tensorboard"],
    "logging_first_step": true
  },
  "hardware_config": {
    "fp16": true,
    "bf16": false,
    "gradient_checkpointing": true,
    "device_map": "auto",
    "use_flash_attention": true
  },
  "quantization_config": {
    "load_in_4bit": true,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true
  },
  "lora_config": {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ]
  },
  "dataset_config": {
    "sort_by_field": "prompt_number",
    "sort_direction": "ascending",
    "max_tokens": 2048,
    "text_field": "conversations",
    "shuffle_seed": 42,
    "training_phase_only": true,
    "pre_tokenized": true,
    "input_ids_field": "input_ids",
    "skip_tokenization": true
  }
}
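
For context, here is a minimal sketch of how a training script might consume the model_config, hardware_config, and quantization_config sections. The file itself only carries hyperparameters, so the mapping below is an assumption: quantization_config matches BitsAndBytesConfig field-for-field, and use_flash_attention is translated to the attn_implementation kwarg of recent transformers releases.

```python
import json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

with open("transformers_config.json") as f:
    cfg = json.load(f)

model_cfg = cfg["model_config"]
hw_cfg = cfg["hardware_config"]
q_cfg = cfg["quantization_config"]

# quantization_config maps field-for-field onto BitsAndBytesConfig; the dtype
# string ("float16") is resolved to the torch dtype it names.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=q_cfg["load_in_4bit"],
    bnb_4bit_compute_dtype=getattr(torch, q_cfg["bnb_4bit_compute_dtype"]),
    bnb_4bit_quant_type=q_cfg["bnb_4bit_quant_type"],
    bnb_4bit_use_double_quant=q_cfg["bnb_4bit_use_double_quant"],
)

model = AutoModelForCausalLM.from_pretrained(
    model_cfg["model_name_or_path"],
    quantization_config=bnb_config,
    device_map=hw_cfg["device_map"],
    use_cache=model_cfg["use_cache"],        # False is required with gradient checkpointing
    rope_scaling=model_cfg["rope_scaling"],  # forwarded to the model's config
    # "use_flash_attention" is not a from_pretrained kwarg; recent transformers
    # spell it attn_implementation="flash_attention_2" (assumed mapping).
    attn_implementation="flash_attention_2" if hw_cfg["use_flash_attention"] else "eager",
)
tokenizer = AutoTokenizer.from_pretrained(model_cfg["model_name_or_path"])
```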
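
The lora_config block lines up one-for-one with the constructor arguments of peft.LoraConfig. Continuing the sketch above: the prepare_model_for_kbit_training step and the CAUSAL_LM task type are common companions to 4-bit LoRA fine-tuning, assumed here rather than stated by the file.

```python
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

lora_cfg = cfg["lora_config"]

# Standard prep for training on a k-bit quantized base model; also wires up
# gradient checkpointing per hardware_config.
model = prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=hw_cfg["gradient_checkpointing"]
)

peft_config = LoraConfig(
    r=lora_cfg["r"],
    lora_alpha=lora_cfg["lora_alpha"],
    lora_dropout=lora_cfg["lora_dropout"],
    bias=lora_cfg["bias"],
    target_modules=lora_cfg["target_modules"],  # all attention and MLP projections
    task_type="CAUSAL_LM",  # assumption: not stated in the file
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```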
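
The dataset_config keys describe a data contract rather than library arguments: rows arrive pre-tokenized in an input_ids column, sorted by a prompt_number field, and no tokenizer is invoked. A sketch under those assumptions; the "train.jsonl" source file and the labels-equal-inputs convention are hypothetical stand-ins, since the config names neither.

```python
from datasets import load_dataset

ds_cfg = cfg["dataset_config"]

# "train.jsonl" is a hypothetical source; the config does not name one.
dataset = load_dataset("json", data_files="train.jsonl", split="train")

# Honour the ordering contract, then shuffle reproducibly.
dataset = dataset.sort(
    ds_cfg["sort_by_field"],
    reverse=(ds_cfg["sort_direction"] == "descending"),
)
dataset = dataset.shuffle(seed=ds_cfg["shuffle_seed"])

# pre_tokenized / skip_tokenization: the input_ids_field column is used as-is.
# For causal-LM fine-tuning the labels are a copy of the inputs (an assumption;
# the file does not specify a labels column).
dataset = dataset.map(lambda ex: {"labels": ex[ds_cfg["input_ids_field"]]})
dataset = dataset.with_format(columns=[ds_cfg["input_ids_field"], "labels"])
```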
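
Finally, almost every training_config key is a transformers.TrainingArguments field and can be splatted in directly; max_seq_length is the odd one out (a tokenization/SFT-level setting, e.g. in trl's SFT tooling, not a TrainingArguments field), so the sketch pops it first. Note that recent transformers releases rename evaluation_strategy to eval_strategy, so the splat assumes a version that still accepts the old name; the 5% eval split is likewise an assumption.

```python
from transformers import Trainer, TrainingArguments

train_cfg = dict(cfg["training_config"])
max_seq_length = train_cfg.pop("max_seq_length")  # not a TrainingArguments field

args = TrainingArguments(
    fp16=hw_cfg["fp16"],
    bf16=hw_cfg["bf16"],
    gradient_checkpointing=hw_cfg["gradient_checkpointing"],
    **train_cfg,  # output_dir, epochs, lr schedule, eval/save cadence, ...
)

# Hold out a small eval split so evaluation_strategy="steps" has data to score.
splits = dataset.train_test_split(test_size=0.05, seed=ds_cfg["shuffle_seed"])

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
)
trainer.train()
```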