{ "ngpus": 4, "type": "aligned", "gradient_accumulation_steps": 2, "tokenizer": { "tokens": 50257, "model": "gpt2" }, "training": { "batch_size": 128, "accum": 2, "n_iters": 1250000, "snapshot_freq": 10000, "log_freq": 500, "eval_freq": 10000, "snapshot_freq_for_preemption": 3000, "snapshot_sampling": true, "ema": 0.9999, "warmup_iter": -1, "loss_type": "hybrid", "epsilon": 0.0, "lambda": 0.0 }, "data": { "train": "openwebtext-train", "valid": "wikitext103", "cache_dir": "/home/toolkit/research-diffcodegen/data", "debug": false }, "graph": { "type": "absorb", "gamma": 1.0, "file": "/home/toolkit/research-diffcodegen/data", "report_all": false, "expanded_sigma": true }, "noise": { "type": "loglinear", "sigma_min": 0.0001, "sigma_max": 2.0, "ar_diffusion": false, "expanded_sigma": true }, "sampling": { "predictor": "analytic", "steps_per_level": 1, "noise_removal": true, "strategy": "direct", "strategy_param": 0.9 }, "annealing": { "type": "none", "efficient": false, "width": 1024, "tau": 1024, "eval_tau": 1024, "steps_per_level": 1, "sampling_method": "sdlm", "diffusion_loss_weight": 1.0, "ce_loss_weight": 1.0, "sampling_eps": 0.0001, "attention": { "context_type": "block_causal", "block_type": "full" }, "match_inference": false }, "eval": { "batch_size": 16, "perplexity": true, "perplexity_batch_size": 8 }, "optim": { "weight_decay": 0.1, "optimizer": "AdamW", "lr": 0.0002, "beta1": 0.9, "beta2": 0.95, "eps": 1e-08, "warmup": 10000, "grad_clip": 1.0, "scheduler": "cosine" }, "experiment": { "name": "MDLM", "wandb_project": "Hybrid-SDLM-ALIGNED" }, "model": { "name": "HDLM", "type": "ddit", "hidden_size": 768, "cond_dim": 128, "length": 1024, "n_blocks": 12, "n_heads": 12, "dropout": 0.1, "scale_by_sigma": false, "transformer_sigma_conditioning": false, "hybrid_sigma_embedding": false, "post_process_logits": false, "use_timestep_embedding": false }, "model_type": "epsilon_hybrid" }