{ "architectures": [ "GPT" ], "batch_size": 4, "context_len": 1024, "d_model": 1024, "device": "cuda", "dtype": "float32", "intermidiate_size": 4096, "load_checkpoint": true, "lr": 0.0006, "model_type": "gpt_custom", "n_epoch": 5, "n_heads": 8, "n_layers": 32, "transformers_version": "4.56.1", "vocab_size": 50304, "weight_decay": 0.1 }