{
  "model_config": {
    "model_name_or_path": "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
    "use_cache": false,
    "rope_scaling": {
      "type": "dynamic",
      "factor": 2.0
    }
  },
  "training_config": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-5,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "optim": "adamw_torch",
    "max_grad_norm": 0.3,
    "max_seq_length": 2048,
    "logging_steps": 10,
    "save_steps": 200,
    "save_total_limit": 3,
    "evaluation_strategy": "steps",
    "eval_steps": 200,
    "load_best_model_at_end": true,
    "output_dir": "fine_tuned_model",
    "disable_tqdm": false,
    "report_to": ["tensorboard"],
    "logging_first_step": true,
    "dataloader_num_workers": 4
  },
  "hardware_config": {
    "fp16": true,
    "bf16": false,
    "gradient_checkpointing": true,
    "device_map": "auto",
    "attn_implementation": "eager",
    "use_flash_attention": false,
    "memory_optimization": {
      "expandable_segments": true
    }
  },
  "quantization_config": {
    "load_in_4bit": true,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true
  },
  "lora_config": {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ]
  },
  "dataset_config": {
    "sort_by_field": "prompt_number",
    "max_tokens": 2048,
    "text_field": "conversations",
    "training_phase_only": true,
    "pre_tokenized": true,
    "input_ids_field": "input_ids",
    "skip_tokenization": true
  },
  "deepspeed_config": {
    "zero_optimization": {
      "stage": 2,
      "offload_optimizer": {
        "device": "cpu",
        "pin_memory": true
      },
      "contiguous_gradients": true,
      "overlap_comm": true,
      "reduce_scatter": true,
      "reduce_bucket_size": 5e8,
      "allgather_bucket_size": 5e8,
      "allgather_partitions": true,
      "allgather_no_copy": true
    },
    "gradient_accumulation_steps": 4,
    "gradient_clipping": 0.3,
    "fp16": {
      "enabled": true,
      "loss_scale": 0,
      "loss_scale_window": 1000,
      "initial_scale_power": 16,
      "hysteresis": 2,
      "min_loss_scale": 1
    },
    "optimizer": {
      "type": "AdamW",
      "params": {
        "lr": 2e-5,
        "betas": [0.9, 0.999],
        "eps": 1e-8,
        "weight_decay": 0.01
      }
    },
    "activation_checkpointing": {
      "partition_activations": true,
      "cpu_checkpointing": true,
      "contiguous_memory_optimization": true,
      "number_checkpoints": null,
      "synchronize_checkpoint_boundary": false,
      "profile": false
    },
    "steps_per_print": 10,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false,
    "communication_data_type": "fp16",
    "comms_logger": {
      "enabled": false
    },
    "amp": {
      "enabled": false
    },
    "aio": {
      "block_size": 1048576,
      "queue_depth": 8,
      "thread_count": 1,
      "single_submit": false,
      "overlap_events": true
    }
  }
}