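# Training configuration for a gamma-hybrid discrete-diffusion language model
# (QGamma transition graph) trained on OpenWebText.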
ngpus: 4
gradient_accumulation_steps: 8
pretrain_autoregressive_path: /home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth
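# Tokenizer: standard GPT-2 BPE; 50257 is the GPT-2 vocabulary size.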
tokenizer:
  tokens: 50257
  model: gpt2
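# Training loop. If batch_size is the global batch (a common convention in
# discrete-diffusion configs; assumption here), each of the 4 GPUs processes
# 512 / (4 * 8) = 16 sequences per micro-step with gradient accumulation of 8.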
training:
  batch_size: 512
  accum: ${gradient_accumulation_steps}
  n_iters: 1000000
  snapshot_freq: 100
  log_freq: 10
  eval_freq: 100
  snapshot_freq_for_preemption: 3000
  weight: standard
  snapshot_sampling: true
  ema: 0.9999
  warmup_iter: -1
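# Data: OpenWebText for training, WikiText-103 for validation.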
data:
  train: openwebtext-train
  valid: wikitext103
  cache_dir: /home/toolkit/research-diffcodegen/data
  debug: false
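# Forward-corruption graph of the discrete diffusion process. QGamma with a small
# gamma presumably blends absorbing and uniform transitions (assumption; the exact
# semantics are defined in the repo).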
graph:
  type: QGamma
  gamma: 0.01
  file: /home/toolkit/research-diffcodegen/data
  report_all: false
  expanded_sigma: true
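# Noise schedule: log-linear sigma schedule between sigma_min and sigma_max,
# following SEDD-style discrete diffusion (assumption based on the field names).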
noise:
  type: loglinear
  sigma_min: 0.0001
  sigma_max: 2.0
  ar_diffusion: false
  expanded_sigma: ${graph.expanded_sigma}
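# Sampler: 'analytic' denotes the analytic reverse-process predictor used in
# SEDD-style samplers (assumption); strategy and strategy_param are repo-specific.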
sampling:
  predictor: analytic
  steps_per_level: 1
  noise_removal: true
  strategy: direct
  strategy_param: 0.9
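# Block-wise annealing / semi-autoregressive (SAR) generation over 1024-token blocks,
# with separate weights on the diffusion and cross-entropy losses (interpretation is
# an assumption; these fields are repo-specific).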
annealing:
  type: block
  efficient: false
  width: 1024
  tau: 2048
  eval_tau: 512
  steps_per_level: ${sampling.steps_per_level}
  sampling_method: SAR
  diffusion_loss_weight: 1.0
  ce_loss_weight: 4.0
  sampling_eps: 0.0001
  attention:
    context_type: block_causal
    block_type: full
  match_inference: true
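# Evaluation: validation perplexity computed with a separate, smaller batch size.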
eval:
  batch_size: 32
  perplexity: true
  perplexity_batch_size: 16
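# Optimizer: AdamW at lr 3e-4 with 10,000 warmup steps, gradient clipping at 1.0,
# and a LambdaLR-style schedule (assumption for 'scheduler: lambda').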
optim:
  weight_decay: 0.0
  optimizer: AdamW
  lr: 0.0003
  beta1: 0.9
  beta2: 0.999
  eps: 1.0e-08
  warmup: 10000
  grad_clip: 1.0
  scheduler: lambda
experiment:
  name: QGamma0.01-v2
  wandb_project: debug-QGamma
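# Backbone: DDiT transformer at roughly GPT-2-small scale
# (12 blocks, 12 heads, hidden size 768, 1024-token context).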
model:
  name: gamma_hdlm
  type: ddit
  hidden_size: 768
  cond_dim: 128
  length: 1024
  n_blocks: 12
  n_heads: 12
  scale_by_sigma: false
  dropout: 0.1
  transformer_sigma_conditioning: true
  hybrid_sigma_embedding: true
  post_process_logits: true
  use_timestep_embedding: true
model_type: gamma_hybrid
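# This file uses OmegaConf-style interpolation (e.g. ${gradient_accumulation_steps}),
# so it can be inspected outside the training script roughly as follows (illustrative;
# the filename is hypothetical):
#   from omegaconf import OmegaConf
#   cfg = OmegaConf.load("config.yaml")
#   OmegaConf.resolve(cfg)          # expands ${...} references in place
#   print(cfg.training.accum, cfg.annealing.steps_per_level)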