{
  "ngpus": 4,
  "gradient_accumulation_steps": 8,
  "pretrain_autoregressive_path": "/home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth",
  "tokenizer": {
    "tokens": 50257,
    "model": "gpt2"
  },
  "training": {
    "batch_size": 512,
    "accum": 8,
    "n_iters": 1000000,
    "snapshot_freq": 500,
    "log_freq": 100,
    "eval_freq": 500,
    "snapshot_freq_for_preemption": 3000,
    "weight": "standard",
    "snapshot_sampling": true,
    "ema": 0.9999,
    "warmup_iter": -1
  },
  "data": {
    "train": "openwebtext-train",
    "valid": "wikitext103",
    "cache_dir": "/home/toolkit/research-diffcodegen/data",
    "debug": false
  },
  "graph": {
    "type": "QGamma",
    "gamma": 0.05,
    "file": "/home/toolkit/research-diffcodegen/data",
    "report_all": false,
    "expanded_sigma": true
  },
  "noise": {
    "type": "loglinear",
    "sigma_min": 0.0001,
    "sigma_max": 2.0,
    "ar_diffusion": false,
    "expanded_sigma": true
  },
  "sampling": {
    "predictor": "analytic",
    "steps_per_level": 1,
    "noise_removal": true,
    "strategy": "direct",
    "strategy_param": 0.9
  },
  "annealing": {
    "type": "block",
    "efficient": false,
    "width": 1024,
    "tau": 2048,
    "eval_tau": 256,
    "steps_per_level": 1,
    "sampling_method": "SAR",
    "diffusion_loss_weight": 1.0,
    "ce_loss_weight": 4.0,
    "sampling_eps": 0.0001,
    "attention": {
      "context_type": "block_causal",
      "block_type": "full"
    },
    "match_inference": true
  },
  "eval": {
    "batch_size": 32,
    "perplexity": true,
    "perplexity_batch_size": 16
  },
  "optim": {
    "weight_decay": 0.0,
    "optimizer": "AdamW",
    "lr": 0.0003,
    "beta1": 0.9,
    "beta2": 0.999,
    "eps": 1e-08,
    "warmup": 10000,
    "grad_clip": 1.0,
    "scheduler": "lambda"
  },
  "experiment": {
    "name": "QGamma0.05-v2",
    "wandb_project": "debug-QGamma"
  },
  "model": {
    "name": "gamma_hdlm",
    "type": "ddit",
    "hidden_size": 768,
    "cond_dim": 128,
    "length": 1024,
    "n_blocks": 12,
    "n_heads": 12,
    "scale_by_sigma": false,
    "dropout": 0.1,
    "transformer_sigma_conditioning": true,
    "hybrid_sigma_embedding": true,
    "post_process_logits": true,
    "use_timestep_embedding": true
  },
  "model_type": "gamma_hybrid"
}
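For reference, a minimal sketch of reading this config and relating the batch-size fields, assuming the file is saved as `config.json` (hypothetical filename) and that the trainer splits the global `training.batch_size` across `training.accum` gradient-accumulation steps and `ngpus` devices; that split is an assumption about how the training code consumes these fields, not something stated by the config itself.

```python
import json

# Assumed filename; adjust to wherever this config actually lives in the repo.
with open("config.json") as f:
    cfg = json.load(f)

# Global batch of 512 with 8 accumulation steps on 4 GPUs implies a per-GPU
# micro-batch of 512 // (8 * 4) = 16, under the split assumption noted above.
micro_batch = cfg["training"]["batch_size"] // (cfg["training"]["accum"] * cfg["ngpus"])
print(f"per-GPU micro-batch: {micro_batch}")  # 16

# A few of the model settings defined above, read back for inspection.
m = cfg["model"]
print(f"model: {m['name']} ({m['n_blocks']} blocks, {m['n_heads']} heads, hidden {m['hidden_size']})")
```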