{ "ngpus": 4, "gradient_accumulation_steps": 8, "pretrain_autoregressive_path": "/home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth", "tokenizer": { "tokens": 50257, "model": "gpt2" }, "training": { "batch_size": 512, "accum": 8, "n_iters": 1000000, "snapshot_freq": 500, "log_freq": 100, "eval_freq": 500, "snapshot_freq_for_preemption": 3000, "weight": "standard", "snapshot_sampling": true, "ema": 0.9999, "warmup_iter": -1 }, "data": { "train": "openwebtext-train", "valid": "wikitext103", "cache_dir": "/home/toolkit/research-diffcodegen/data", "debug": false }, "graph": { "type": "QGamma", "gamma": 0.05, "file": "/home/toolkit/research-diffcodegen/data", "report_all": false, "expanded_sigma": true }, "noise": { "type": "loglinear", "sigma_min": 0.0001, "sigma_max": 2.0, "ar_diffusion": false, "expanded_sigma": true }, "sampling": { "predictor": "analytic", "steps_per_level": 1, "noise_removal": true, "strategy": "direct", "strategy_param": 0.9 }, "annealing": { "type": "block", "efficient": false, "width": 1024, "tau": 2048, "eval_tau": 256, "steps_per_level": 1, "sampling_method": "SAR", "diffusion_loss_weight": 1.0, "ce_loss_weight": 4.0, "sampling_eps": 0.0001, "attention": { "context_type": "block_causal", "block_type": "full" }, "match_inference": true }, "eval": { "batch_size": 32, "perplexity": true, "perplexity_batch_size": 16 }, "optim": { "weight_decay": 0.0, "optimizer": "AdamW", "lr": 0.0003, "beta1": 0.9, "beta2": 0.999, "eps": 1e-08, "warmup": 10000, "grad_clip": 1.0, "scheduler": "lambda" }, "experiment": { "name": "QGamma0.05-v2", "wandb_project": "debug-QGamma" }, "model": { "name": "gamma_hdlm", "type": "ddit", "hidden_size": 768, "cond_dim": 128, "length": 1024, "n_blocks": 12, "n_heads": 12, "scale_by_sigma": false, "dropout": 0.1, "transformer_sigma_conditioning": true, "hybrid_sigma_embedding": true, "post_process_logits": true, "use_timestep_embedding": true }, "model_type": "gamma_hybrid" }