r1training/transformers_config.json
{
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"dataset_name": "George-API/cognitive-data",
"output_dir": "./results",
"seed": 42,
"# Tokenization settings": "These settings ensure we preserve existing tokenization",
"trust_remote_code": true,
"use_fast_tokenizer": true,
"skip_tokenization": true,
"max_seq_length": 2048,
"chat_template": "chatml",
"# Quantization settings": "4-bit quantization for memory efficiency",
"load_in_4bit": true,
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_compute_dtype": "float16",
"bnb_4bit_use_double_quant": true,
"# PEFT settings": "LoRA configuration for efficient fine-tuning",
"use_peft": true,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.05,
"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
"# Training parameters": "Optimized for cognitive science fine-tuning",
"num_train_epochs": 5,
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 8,
"learning_rate": 3e-5,
"weight_decay": 0.01,
"warmup_ratio": 0.1,
"lr_scheduler_type": "linear",
"logging_steps": 10,
"save_strategy": "steps",
"save_steps": 100,
"save_total_limit": 3,
"fp16": true,
"bf16": false,
"max_grad_norm": 0.5,
"# Hugging Face Hub settings": "For saving and sharing the model",
"push_to_hub": true,
"hub_model_id": "DeepSeek-Cognitive-Science",
"hub_private_repo": true
}
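
A minimal sketch of how the fields above could be mapped onto transformers, peft, and bitsandbytes objects in a training script. This loader is not part of the repo; the mapping, the file path "transformers_config.json", and the task_type value are assumptions, and repo-specific fields such as dataset_name, max_seq_length, chat_template, and skip_tokenization are left to the project's own data-preparation code.

# Illustrative only: one plausible way to consume transformers_config.json.
import json

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model

with open("transformers_config.json") as f:
    cfg = json.load(f)

# 4-bit quantization ("# Quantization settings" block).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=cfg["load_in_4bit"],
    bnb_4bit_quant_type=cfg["bnb_4bit_quant_type"],
    bnb_4bit_compute_dtype=getattr(torch, cfg["bnb_4bit_compute_dtype"]),
    bnb_4bit_use_double_quant=cfg["bnb_4bit_use_double_quant"],
)

# Tokenizer and quantized base model ("# Tokenization settings" block).
tokenizer = AutoTokenizer.from_pretrained(
    cfg["model_name"],
    trust_remote_code=cfg["trust_remote_code"],
    use_fast=cfg["use_fast_tokenizer"],
)
model = AutoModelForCausalLM.from_pretrained(
    cfg["model_name"],
    quantization_config=bnb_config,
    trust_remote_code=cfg["trust_remote_code"],
)

# LoRA adapter ("# PEFT settings" block); task_type is assumed here.
lora_config = LoraConfig(
    r=cfg["lora_r"],
    lora_alpha=cfg["lora_alpha"],
    lora_dropout=cfg["lora_dropout"],
    target_modules=cfg["target_modules"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Training hyperparameters and Hub settings
# ("# Training parameters" and "# Hugging Face Hub settings" blocks).
training_args = TrainingArguments(
    output_dir=cfg["output_dir"],
    seed=cfg["seed"],
    num_train_epochs=cfg["num_train_epochs"],
    per_device_train_batch_size=cfg["per_device_train_batch_size"],
    gradient_accumulation_steps=cfg["gradient_accumulation_steps"],
    learning_rate=cfg["learning_rate"],
    weight_decay=cfg["weight_decay"],
    warmup_ratio=cfg["warmup_ratio"],
    lr_scheduler_type=cfg["lr_scheduler_type"],
    logging_steps=cfg["logging_steps"],
    save_strategy=cfg["save_strategy"],
    save_steps=cfg["save_steps"],
    save_total_limit=cfg["save_total_limit"],
    fp16=cfg["fp16"],
    bf16=cfg["bf16"],
    max_grad_norm=cfg["max_grad_norm"],
    push_to_hub=cfg["push_to_hub"],
    hub_model_id=cfg["hub_model_id"],
    hub_private_repo=cfg["hub_private_repo"],
)

With an effective batch size of 4 x 8 = 32 (per-device batch size times gradient accumulation), the resulting model and training_args would typically be handed to a Trainer or SFTTrainer along with the tokenized dataset named in dataset_name.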