ngpus: 4
gradient_accumulation_steps: 8
pretrain_autoregressive_path: /home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth
tokenizer:
  tokens: 50257
  model: gpt2
training:
  batch_size: 512
  accum: ${gradient_accumulation_steps}
  n_iters: 1000000
  snapshot_freq: 100
  log_freq: 10
  eval_freq: 100
  snapshot_freq_for_preemption: 3000
  weight: standard
  snapshot_sampling: true
  ema: 0.9999
  warmup_iter: -1
data:
  train: openwebtext-train
  valid: wikitext103
  cache_dir: /home/toolkit/research-diffcodegen/data
  debug: false
graph:
  type: QGamma
  gamma: 0.01
  file: /home/toolkit/research-diffcodegen/data
  report_all: false
  expanded_sigma: true
noise:
  type: loglinear
  sigma_min: 0.0001
  sigma_max: 2.0
  ar_diffusion: false
  expanded_sigma: ${graph.expanded_sigma}
sampling:
  predictor: analytic
  steps_per_level: 1
  noise_removal: true
  strategy: direct
  strategy_param: 0.9
annealing:
  type: block
  efficient: false
  width: 1024
  tau: 2048
  eval_tau: 512
  steps_per_level: ${sampling.steps_per_level}
  sampling_method: SAR
  diffusion_loss_weight: 1.0
  ce_loss_weight: 4.0
  sampling_eps: 0.0001
attention:
  context_type: block_causal
  block_type: full
  match_inference: true
eval:
  batch_size: 32
  perplexity: true
  perplexity_batch_size: 16
optim:
  weight_decay: 0.0
  optimizer: AdamW
  lr: 0.0003
  beta1: 0.9
  beta2: 0.999
  eps: 1.0e-08
  warmup: 10000
  grad_clip: 1.0
  scheduler: lambda
experiment:
  name: QGamma0.01-v2
  wandb_project: debug-QGamma
model:
  name: gamma_hdlm
  type: ddit
  hidden_size: 768
  cond_dim: 128
  length: 1024
  n_blocks: 12
  n_heads: 12
  scale_by_sigma: false
  dropout: 0.1
  transformer_sigma_conditioning: true
  hybrid_sigma_embedding: true
  post_process_logits: true
  use_timestep_embedding: true
model_type: gamma_hybrid
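
Below is a minimal sketch, not the repository's actual loading code, of how a config in this format can be consumed. The ${...} references (training.accum, noise.expanded_sigma) follow OmegaConf-style interpolation, so OmegaConf is assumed here; the file name config.yaml, the stand-in linear model, and the linear-warmup schedule are illustrative assumptions based on the optim block (optimizer: AdamW, warmup: 10000, scheduler: lambda).

# Assumed: OmegaConf for loading/interpolation; the real training script may differ.
import torch
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")   # hypothetical path to the config above
OmegaConf.resolve(cfg)                # resolves ${gradient_accumulation_steps}, ${graph.expanded_sigma}, ...

# Stand-in module; the real model is a DDiT-style transformer built from cfg.model.
model = torch.nn.Linear(cfg.model.hidden_size, cfg.tokenizer.tokens)

# Optimizer mirrors the `optim` block.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=cfg.optim.lr,
    betas=(cfg.optim.beta1, cfg.optim.beta2),
    eps=cfg.optim.eps,
    weight_decay=cfg.optim.weight_decay,
)

# `scheduler: lambda` with `warmup: 10000` is read here as a linear-warmup LambdaLR (assumption).
warmup = cfg.optim.warmup
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda step: min(1.0, (step + 1) / warmup),
)

With training.batch_size: 512, ngpus: 4, and gradient_accumulation_steps: 8, each GPU would process 512 / (4 * 8) = 16 sequences per forward pass, assuming batch_size denotes the global batch.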