File size: 2,376 Bytes

289564a

{
  "ngpus": 4,
  "gradient_accumulation_steps": 8,
  "pretrain_autoregressive_path": "/home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth",
  "tokenizer": {
    "tokens": 50257,
    "model": "gpt2"
  },
  "training": {
    "batch_size": 512,
    "accum": 8,
    "n_iters": 1000000,
    "snapshot_freq": 100,
    "log_freq": 10,
    "eval_freq": 100,
    "snapshot_freq_for_preemption": 3000,
    "weight": "standard",
    "snapshot_sampling": true,
    "ema": 0.9999,
    "warmup_iter": -1
  },
  "data": {
    "train": "openwebtext-train",
    "valid": "wikitext103",
    "cache_dir": "/home/toolkit/research-diffcodegen/data",
    "debug": false
  },
  "graph": {
    "type": "QGamma",
    "gamma": 0.01,
    "file": "/home/toolkit/research-diffcodegen/data",
    "report_all": false,
    "expanded_sigma": true
  },
  "noise": {
    "type": "loglinear",
    "sigma_min": 0.0001,
    "sigma_max": 2.0,
    "ar_diffusion": false,
    "expanded_sigma": true
  },
  "sampling": {
    "predictor": "analytic",
    "steps_per_level": 1,
    "noise_removal": true,
    "strategy": "direct",
    "strategy_param": 0.9
  },
  "annealing": {
    "type": "block",
    "efficient": false,
    "width": 1024,
    "tau": 2048,
    "eval_tau": 512,
    "steps_per_level": 1,
    "sampling_method": "SAR",
    "diffusion_loss_weight": 1.0,
    "ce_loss_weight": 4.0,
    "sampling_eps": 0.0001,
    "attention": {
      "context_type": "block_causal",
      "block_type": "full"
    },
    "match_inference": true
  },
  "eval": {
    "batch_size": 32,
    "perplexity": true,
    "perplexity_batch_size": 16
  },
  "optim": {
    "weight_decay": 0.0,
    "optimizer": "AdamW",
    "lr": 0.0003,
    "beta1": 0.9,
    "beta2": 0.999,
    "eps": 1e-08,
    "warmup": 10000,
    "grad_clip": 1.0,
    "scheduler": "lambda"
  },
  "experiment": {
    "name": "QGamma0.01-v2",
    "wandb_project": "debug-QGamma"
  },
  "model": {
    "name": "gamma_hdlm",
    "type": "ddit",
    "hidden_size": 768,
    "cond_dim": 128,
    "length": 1024,
    "n_blocks": 12,
    "n_heads": 12,
    "scale_by_sigma": false,
    "dropout": 0.1,
    "transformer_sigma_conditioning": true,
    "hybrid_sigma_embedding": true,
    "post_process_logits": true,
    "use_timestep_embedding": true
  },
  "model_type": "gamma_hybrid"
}