qwen4bit / transformers_config.json
{
  "model_config": {
    "model_name_or_path": "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
    "use_cache": false,
    "rope_scaling": {
      "type": "dynamic",
      "factor": 2.0
    }
  },
  "training_config": {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-5,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.01,
    "optim": "adamw_torch",
    "max_grad_norm": 0.3,
    "max_seq_length": 2048,
    "logging_steps": 10,
    "save_steps": 200,
    "save_total_limit": 3,
    "evaluation_strategy": "steps",
    "eval_steps": 200,
    "load_best_model_at_end": true,
    "output_dir": "fine_tuned_model",
    "disable_tqdm": false,
    "report_to": ["tensorboard"],
    "logging_first_step": true
  },
  "hardware_config": {
    "fp16": true,
    "bf16": false,
    "gradient_checkpointing": true,
    "device_map": "auto",
    "use_flash_attention": true
  },
  "quantization_config": {
    "load_in_4bit": true,
    "bnb_4bit_compute_dtype": "float16",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true
  },
  "lora_config": {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ]
  },
  "dataset_config": {
    "sort_by_field": "prompt_number",
    "sort_direction": "ascending",
    "max_tokens": 2048,
    "text_field": "conversations",
    "shuffle_seed": 42,
    "training_phase_only": true,
    "pre_tokenized": true,
    "input_ids_field": "input_ids",
    "skip_tokenization": true
  }
}
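
For context, here is a minimal sketch of how a training script might consume the model_config, hardware_config, and quantization_config sections. The file itself only carries hyperparameters, so the mapping below is an assumption: quantization_config matches BitsAndBytesConfig field-for-field, and use_flash_attention is translated to the attn_implementation kwarg of recent transformers releases.

```python
import json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

with open("transformers_config.json") as f:
    cfg = json.load(f)

model_cfg = cfg["model_config"]
hw_cfg = cfg["hardware_config"]
q_cfg = cfg["quantization_config"]

# quantization_config maps field-for-field onto BitsAndBytesConfig; the dtype
# string ("float16") is resolved to the torch dtype it names.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=q_cfg["load_in_4bit"],
    bnb_4bit_compute_dtype=getattr(torch, q_cfg["bnb_4bit_compute_dtype"]),
    bnb_4bit_quant_type=q_cfg["bnb_4bit_quant_type"],
    bnb_4bit_use_double_quant=q_cfg["bnb_4bit_use_double_quant"],
)

model = AutoModelForCausalLM.from_pretrained(
    model_cfg["model_name_or_path"],
    quantization_config=bnb_config,
    device_map=hw_cfg["device_map"],
    use_cache=model_cfg["use_cache"],        # False is required with gradient checkpointing
    rope_scaling=model_cfg["rope_scaling"],  # forwarded to the model's config
    # "use_flash_attention" is not a from_pretrained kwarg; recent transformers
    # spell it attn_implementation="flash_attention_2" (assumed mapping).
    attn_implementation="flash_attention_2" if hw_cfg["use_flash_attention"] else "eager",
)
tokenizer = AutoTokenizer.from_pretrained(model_cfg["model_name_or_path"])
```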
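
The lora_config block lines up one-for-one with the constructor arguments of peft.LoraConfig. Continuing the sketch above: the prepare_model_for_kbit_training step and the CAUSAL_LM task type are common companions to 4-bit LoRA fine-tuning, assumed here rather than stated by the file.

```python
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

lora_cfg = cfg["lora_config"]

# Standard prep for training on a k-bit quantized base model; also wires up
# gradient checkpointing per hardware_config.
model = prepare_model_for_kbit_training(
    model, use_gradient_checkpointing=hw_cfg["gradient_checkpointing"]
)

peft_config = LoraConfig(
    r=lora_cfg["r"],
    lora_alpha=lora_cfg["lora_alpha"],
    lora_dropout=lora_cfg["lora_dropout"],
    bias=lora_cfg["bias"],
    target_modules=lora_cfg["target_modules"],  # all attention and MLP projections
    task_type="CAUSAL_LM",  # assumption: not stated in the file
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
```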
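
The dataset_config keys describe a data contract rather than library arguments: rows arrive pre-tokenized in an input_ids column, sorted by a prompt_number field, and no tokenizer is invoked. A sketch under those assumptions; the "train.jsonl" source file and the labels-equal-inputs convention are hypothetical stand-ins, since the config names neither.

```python
from datasets import load_dataset

ds_cfg = cfg["dataset_config"]

# "train.jsonl" is a hypothetical source; the config does not name one.
dataset = load_dataset("json", data_files="train.jsonl", split="train")

# Honour the ordering contract, then shuffle reproducibly.
dataset = dataset.sort(
    ds_cfg["sort_by_field"],
    reverse=(ds_cfg["sort_direction"] == "descending"),
)
dataset = dataset.shuffle(seed=ds_cfg["shuffle_seed"])

# pre_tokenized / skip_tokenization: the input_ids_field column is used as-is.
# For causal-LM fine-tuning the labels are a copy of the inputs (an assumption;
# the file does not specify a labels column).
dataset = dataset.map(lambda ex: {"labels": ex[ds_cfg["input_ids_field"]]})
dataset = dataset.with_format(columns=[ds_cfg["input_ids_field"], "labels"])
```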
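
Finally, almost every training_config key is a transformers.TrainingArguments field and can be splatted in directly; max_seq_length is the odd one out (a tokenization/SFT-level setting, e.g. in trl's SFT tooling, not a TrainingArguments field), so the sketch pops it first. Note that recent transformers releases rename evaluation_strategy to eval_strategy, so the splat assumes a version that still accepts the old name; the 5% eval split is likewise an assumption.

```python
from transformers import Trainer, TrainingArguments

train_cfg = dict(cfg["training_config"])
max_seq_length = train_cfg.pop("max_seq_length")  # not a TrainingArguments field

args = TrainingArguments(
    fp16=hw_cfg["fp16"],
    bf16=hw_cfg["bf16"],
    gradient_checkpointing=hw_cfg["gradient_checkpointing"],
    **train_cfg,  # output_dir, epochs, lr schedule, eval/save cadence, ...
)

# Hold out a small eval split so evaluation_strategy="steps" has data to score.
splits = dataset.train_test_split(test_size=0.05, seed=ds_cfg["shuffle_seed"])

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
)
trainer.train()
```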