```yaml
ngpus: 4
gradient_accumulation_steps: 8
pretrain_autoregressive_path: /home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth
tokenizer:
  tokens: 50257
  model: gpt2
training:
  batch_size: 512
  accum: ${gradient_accumulation_steps}
  n_iters: 1000000
  snapshot_freq: 100
  log_freq: 10
  eval_freq: 100
  snapshot_freq_for_preemption: 3000
  weight: standard
  snapshot_sampling: true
  ema: 0.9999
  warmup_iter: -1
data:
  train: openwebtext-train
  valid: wikitext103
  cache_dir: /home/toolkit/research-diffcodegen/data
  debug: false
graph:
  type: QGamma
  gamma: 0.01
  file: /home/toolkit/research-diffcodegen/data
  report_all: false
  expanded_sigma: true
noise:
  type: loglinear
  sigma_min: 0.0001
  sigma_max: 2.0
  ar_diffusion: false
  expanded_sigma: ${graph.expanded_sigma}
sampling:
  predictor: analytic
  steps_per_level: 1
  noise_removal: true
  strategy: direct
  strategy_param: 0.9
annealing:
  type: block
  efficient: false
  width: 1024
  tau: 2048
  eval_tau: 512
  steps_per_level: ${sampling.steps_per_level}
  sampling_method: SAR
  diffusion_loss_weight: 1.0
  ce_loss_weight: 4.0
  sampling_eps: 0.0001
  attention:
    context_type: block_causal
    block_type: full
    match_inference: true
eval:
  batch_size: 32
  perplexity: true
  perplexity_batch_size: 16
optim:
  weight_decay: 0.0
  optimizer: AdamW
  lr: 0.0003
  beta1: 0.9
  beta2: 0.999
  eps: 1.0e-08
  warmup: 10000
  grad_clip: 1.0
  scheduler: lambda
experiment:
  name: QGamma0.01-v2
  wandb_project: debug-QGamma
model:
  name: gamma_hdlm
  type: ddit
  hidden_size: 768
  cond_dim: 128
  length: 1024
  n_blocks: 12
  n_heads: 12
  scale_by_sigma: false
  dropout: 0.1
  transformer_sigma_conditioning: true
  hybrid_sigma_embedding: true
  post_process_logits: true
  use_timestep_embedding: true
  model_type: gamma_hybrid
```
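The `${...}` entries are OmegaConf-style interpolations that resolve against the root of the config, so `training.accum`, `noise.expanded_sigma`, and `annealing.steps_per_level` track their source keys automatically. Below is a minimal sketch of how these references resolve, assuming the config is saved as `config.yaml` and loaded with OmegaConf (the training script's actual loader, the file name, and the convention that `training.batch_size` is the *global* batch are assumptions, not taken from the repo):

```python
from omegaconf import OmegaConf

# Load the config above; "config.yaml" is an assumed file name.
cfg = OmegaConf.load("config.yaml")

# ${...} interpolations resolve on access, against the config root:
assert cfg.training.accum == cfg.gradient_accumulation_steps          # 8
assert cfg.noise.expanded_sigma == cfg.graph.expanded_sigma           # true
assert cfg.annealing.steps_per_level == cfg.sampling.steps_per_level  # 1

# Assuming training.batch_size is the global batch size (a common
# convention, not confirmed by this file), the per-GPU micro-batch is:
micro_batch = cfg.training.batch_size // (cfg.ngpus * cfg.training.accum)
print(micro_batch)  # 512 // (4 * 8) = 16
```

Under that reading, each optimizer step aggregates 512 sequences of length 1024 across 4 GPUs with 8 gradient-accumulation micro-steps of 16 sequences each.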