ngpus: 4
gradient_accumulation_steps: 8
pretrain_autoregressive_path: /home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth
tokenizer:
  tokens: 50257
  model: gpt2
training:
  batch_size: 512
  accum: ${gradient_accumulation_steps}
  n_iters: 1000000
  snapshot_freq: 100
  log_freq: 10
  eval_freq: 100
  snapshot_freq_for_preemption: 3000
  weight: standard
  snapshot_sampling: true
  ema: 0.9999
  warmup_iter: -1
data:
  train: openwebtext-train
  valid: wikitext103
  cache_dir: /home/toolkit/research-diffcodegen/data
  debug: false
graph:
  type: QGamma
  gamma: 0.01
  file: /home/toolkit/research-diffcodegen/data
  report_all: false
  expanded_sigma: true
noise:
  type: loglinear
  sigma_min: 0.0001
  sigma_max: 2.0
  ar_diffusion: false
  expanded_sigma: ${graph.expanded_sigma}
sampling:
  predictor: analytic
  steps_per_level: 1
  noise_removal: true
  strategy: direct
  strategy_param: 0.9
annealing:
  type: block
  efficient: false
  width: 1024
  tau: 2048
  eval_tau: 512
  steps_per_level: ${sampling.steps_per_level}
  sampling_method: SAR
  diffusion_loss_weight: 1.0
  ce_loss_weight: 4.0
  sampling_eps: 0.0001
attention:
  context_type: block_causal
  block_type: full
  match_inference: true
eval:
  batch_size: 32
  perplexity: true
  perplexity_batch_size: 16
optim:
  weight_decay: 0.0
  optimizer: AdamW
  lr: 0.0003
  beta1: 0.9
  beta2: 0.999
  eps: 1.0e-08
  warmup: 10000
  grad_clip: 1.0
  scheduler: lambda
experiment:
  name: QGamma0.01-v2
  wandb_project: debug-QGamma
model:
  name: gamma_hdlm
  type: ddit
  hidden_size: 768
  cond_dim: 128
  length: 1024
  n_blocks: 12
  n_heads: 12
  scale_by_sigma: false
  dropout: 0.1
  transformer_sigma_conditioning: true
  hybrid_sigma_embedding: true
  post_process_logits: true
  use_timestep_embedding: true
model_type: gamma_hybrid
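
Below is a minimal sketch, not the repository's actual loading code, of how a config in this format can be consumed. The ${...} references (training.accum, noise.expanded_sigma) follow OmegaConf-style interpolation, so OmegaConf is assumed here; the file name config.yaml, the stand-in linear model, and the linear-warmup schedule are illustrative assumptions based on the optim block (optimizer: AdamW, warmup: 10000, scheduler: lambda).

# Assumed: OmegaConf for loading/interpolation; the real training script may differ.
import torch
from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")   # hypothetical path to the config above
OmegaConf.resolve(cfg)                # resolves ${gradient_accumulation_steps}, ${graph.expanded_sigma}, ...

# Stand-in module; the real model is a DDiT-style transformer built from cfg.model.
model = torch.nn.Linear(cfg.model.hidden_size, cfg.tokenizer.tokens)

# Optimizer mirrors the `optim` block.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=cfg.optim.lr,
    betas=(cfg.optim.beta1, cfg.optim.beta2),
    eps=cfg.optim.eps,
    weight_decay=cfg.optim.weight_decay,
)

# `scheduler: lambda` with `warmup: 10000` is read here as a linear-warmup LambdaLR (assumption).
warmup = cfg.optim.warmup
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda step: min(1.0, (step + 1) / warmup),
)

With training.batch_size: 512, ngpus: 4, and gradient_accumulation_steps: 8, each GPU would process 512 / (4 * 8) = 16 sequences per forward pass, assuming batch_size denotes the global batch.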