r1training/transformers_config.json
{
"model_name": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"dataset_name": "George-API/cognitive-data",
"output_dir": "./results",
"seed": 42,
"# Tokenization settings": "These settings ensure we preserve existing tokenization",
"trust_remote_code": true,
"use_fast_tokenizer": true,
"skip_tokenization": true,
"max_seq_length": 2048,
"chat_template": "chatml",
"# Quantization settings": "4-bit quantization for memory efficiency",
"load_in_4bit": true,
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_compute_dtype": "float16",
"bnb_4bit_use_double_quant": true,
"# PEFT settings": "LoRA configuration for efficient fine-tuning",
"use_peft": true,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.05,
"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
"# Training parameters": "Optimized for cognitive science fine-tuning",
"num_train_epochs": 5,
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 8,
"learning_rate": 3e-5,
"weight_decay": 0.01,
"warmup_ratio": 0.1,
"lr_scheduler_type": "linear",
"logging_steps": 10,
"save_strategy": "steps",
"save_steps": 100,
"save_total_limit": 3,
"fp16": true,
"bf16": false,
"max_grad_norm": 0.5,
"# Hugging Face Hub settings": "For saving and sharing the model",
"push_to_hub": true,
"hub_model_id": "DeepSeek-Cognitive-Science",
"hub_private_repo": true
}
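
A minimal sketch of how the fields above could be mapped onto transformers, peft, and bitsandbytes objects in a training script. This loader is not part of the repo; the mapping, the file path "transformers_config.json", and the task_type value are assumptions, and repo-specific fields such as dataset_name, max_seq_length, chat_template, and skip_tokenization are left to the project's own data-preparation code.

# Illustrative only: one plausible way to consume transformers_config.json.
import json

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model

with open("transformers_config.json") as f:
    cfg = json.load(f)

# 4-bit quantization ("# Quantization settings" block).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=cfg["load_in_4bit"],
    bnb_4bit_quant_type=cfg["bnb_4bit_quant_type"],
    bnb_4bit_compute_dtype=getattr(torch, cfg["bnb_4bit_compute_dtype"]),
    bnb_4bit_use_double_quant=cfg["bnb_4bit_use_double_quant"],
)

# Tokenizer and quantized base model ("# Tokenization settings" block).
tokenizer = AutoTokenizer.from_pretrained(
    cfg["model_name"],
    trust_remote_code=cfg["trust_remote_code"],
    use_fast=cfg["use_fast_tokenizer"],
)
model = AutoModelForCausalLM.from_pretrained(
    cfg["model_name"],
    quantization_config=bnb_config,
    trust_remote_code=cfg["trust_remote_code"],
)

# LoRA adapter ("# PEFT settings" block); task_type is assumed here.
lora_config = LoraConfig(
    r=cfg["lora_r"],
    lora_alpha=cfg["lora_alpha"],
    lora_dropout=cfg["lora_dropout"],
    target_modules=cfg["target_modules"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Training hyperparameters and Hub settings
# ("# Training parameters" and "# Hugging Face Hub settings" blocks).
training_args = TrainingArguments(
    output_dir=cfg["output_dir"],
    seed=cfg["seed"],
    num_train_epochs=cfg["num_train_epochs"],
    per_device_train_batch_size=cfg["per_device_train_batch_size"],
    gradient_accumulation_steps=cfg["gradient_accumulation_steps"],
    learning_rate=cfg["learning_rate"],
    weight_decay=cfg["weight_decay"],
    warmup_ratio=cfg["warmup_ratio"],
    lr_scheduler_type=cfg["lr_scheduler_type"],
    logging_steps=cfg["logging_steps"],
    save_strategy=cfg["save_strategy"],
    save_steps=cfg["save_steps"],
    save_total_limit=cfg["save_total_limit"],
    fp16=cfg["fp16"],
    bf16=cfg["bf16"],
    max_grad_norm=cfg["max_grad_norm"],
    push_to_hub=cfg["push_to_hub"],
    hub_model_id=cfg["hub_model_id"],
    hub_private_repo=cfg["hub_private_repo"],
)

With an effective batch size of 4 x 8 = 32 (per-device batch size times gradient accumulation), the resulting model and training_args would typically be handed to a Trainer or SFTTrainer along with the tokenized dataset named in dataset_name.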