hdlm-group
/

hdlm-base-gamma-0.01

+{
+  "ngpus": 4,
+  "gradient_accumulation_steps": 8,
+  "pretrain_autoregressive_path": "/home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth",
+  "tokenizer": {
+    "tokens": 50257,
+    "model": "gpt2"
+  },
+  "training": {
+    "batch_size": 512,
+    "accum": 8,
+    "n_iters": 1000000,
+    "snapshot_freq": 100,
+    "log_freq": 10,
+    "eval_freq": 100,
+    "snapshot_freq_for_preemption": 3000,
+    "weight": "standard",
+    "snapshot_sampling": true,
+    "ema": 0.9999,
+    "warmup_iter": -1
+  },
+  "data": {
+    "train": "openwebtext-train",
+    "valid": "wikitext103",
+    "cache_dir": "/home/toolkit/research-diffcodegen/data",
+    "debug": false
+  },
+  "graph": {
+    "type": "QGamma",
+    "gamma": 0.01,
+    "file": "/home/toolkit/research-diffcodegen/data",
+    "report_all": false,
+    "expanded_sigma": true
+  },
+  "noise": {
+    "type": "loglinear",
+    "sigma_min": 0.0001,
+    "sigma_max": 2.0,
+    "ar_diffusion": false,
+    "expanded_sigma": true
+  },
+  "sampling": {
+    "predictor": "analytic",
+    "steps_per_level": 1,
+    "noise_removal": true,
+    "strategy": "direct",
+    "strategy_param": 0.9
+  },
+  "annealing": {
+    "type": "block",
+    "efficient": false,
+    "width": 1024,
+    "tau": 2048,
+    "eval_tau": 512,
+    "steps_per_level": 1,
+    "sampling_method": "SAR",
+    "diffusion_loss_weight": 1.0,
+    "ce_loss_weight": 4.0,
+    "sampling_eps": 0.0001,
+    "attention": {
+      "context_type": "block_causal",
+      "block_type": "full"
+    },
+    "match_inference": true
+  },
+  "eval": {
+    "batch_size": 32,
+    "perplexity": true,
+    "perplexity_batch_size": 16
+  },
+  "optim": {
+    "weight_decay": 0.0,
+    "optimizer": "AdamW",
+    "lr": 0.0003,
+    "beta1": 0.9,
+    "beta2": 0.999,
+    "eps": 1e-08,
+    "warmup": 10000,
+    "grad_clip": 1.0,
+    "scheduler": "lambda"
+  },
+  "experiment": {
+    "name": "QGamma0.01-v2",
+    "wandb_project": "debug-QGamma"
+  },
+  "model": {
+    "name": "gamma_hdlm",
+    "type": "ddit",
+    "hidden_size": 768,
+    "cond_dim": 128,
+    "length": 1024,
+    "n_blocks": 12,
+    "n_heads": 12,
+    "scale_by_sigma": false,
+    "dropout": 0.1,
+    "transformer_sigma_conditioning": true,
+    "hybrid_sigma_embedding": true,
+    "post_process_logits": true,
+    "use_timestep_embedding": true
+  },
+  "model_type": "gamma_hybrid"
+}