{ "architectures": [ "GPT" ], "batch_size": 4, "context_len": 1024, "d_model": 1024, "device": "cuda", "dtype": "float32", "intermidiate_size": 4096, "load_checkpoint": true, "lr": 0.0006, "model_type": "gpt_custom", "n_epoch": 5, "n_heads": 8, "n_layers": 32, "transformers_version": "4.56.1", "vocab_size": 50304, "weight_decay": 0.1 }