# hdlm-base-gamma-0.01 / config.yaml
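# Training configuration for the gamma-hybrid HDLM base model (gamma = 0.01).
# Shared values are referenced via OmegaConf-style ${...} interpolation.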
ngpus: 4
gradient_accumulation_steps: 8
pretrain_autoregressive_path: /home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth
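# Tokenizer: GPT-2 BPE with a 50257-token vocabulary.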
tokenizer:
  tokens: 50257
  model: gpt2
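# Training loop: batch size and gradient accumulation, iteration budget,
# logging/eval/checkpoint frequencies, and EMA decay for the model weights.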
training:
  batch_size: 512
  accum: ${gradient_accumulation_steps}
  n_iters: 1000000
  snapshot_freq: 100
  log_freq: 10
  eval_freq: 100
  snapshot_freq_for_preemption: 3000
  weight: standard
  snapshot_sampling: true
  ema: 0.9999
  warmup_iter: -1
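# Datasets: train on OpenWebText, validate on WikiText-103.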
data:
  train: openwebtext-train
  valid: wikitext103
  cache_dir: /home/toolkit/research-diffcodegen/data
  debug: false
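# Transition graph of the discrete noising process: QGamma with gamma = 0.01,
# the value the checkpoint name refers to.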
graph:
  type: QGamma
  gamma: 0.01
  file: /home/toolkit/research-diffcodegen/data
  report_all: false
  expanded_sigma: true
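# Log-linear noise schedule over sigma in [1e-4, 2.0].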
noise:
  type: loglinear
  sigma_min: 0.0001
  sigma_max: 2.0
ar_diffusion: false
expanded_sigma: ${graph.expanded_sigma}
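# Sampling: analytic predictor, one step per annealing level, final noise removal,
# and the "direct" strategy with strategy_param 0.9.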
sampling:
  predictor: analytic
  steps_per_level: 1
  noise_removal: true
  strategy: direct
  strategy_param: 0.9
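# Block annealing over blocks of width 1024 (tau = 2048 for training, 512 at eval),
# SAR (semi-autoregressive) sampling, and a cross-entropy loss weighted 4x the diffusion loss.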
annealing:
  type: block
  efficient: false
  width: 1024
  tau: 2048
  eval_tau: 512
  steps_per_level: ${sampling.steps_per_level}
  sampling_method: SAR
  diffusion_loss_weight: 1.0
  ce_loss_weight: 4.0
  sampling_eps: 0.0001
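# Attention masking: block-causal context with full attention inside each block;
# match_inference keeps the training-time mask aligned with the inference-time one.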
attention:
  context_type: block_causal
  block_type: full
  match_inference: true
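# Evaluation settings, including perplexity computation.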
eval:
  batch_size: 32
  perplexity: true
  perplexity_batch_size: 16
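# AdamW optimizer: lr 3e-4, 10000 warm-up steps, gradient clipping at 1.0, lambda LR scheduler.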
optim:
  weight_decay: 0.0
  optimizer: AdamW
  lr: 0.0003
  beta1: 0.9
  beta2: 0.999
  eps: 1.0e-08
  warmup: 10000
  grad_clip: 1.0
  scheduler: lambda
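# Experiment name and Weights & Biases project used for logging.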
experiment:
  name: QGamma0.01-v2
  wandb_project: debug-QGamma
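# Backbone: DDiT transformer with 12 blocks, 12 heads, hidden size 768, and
# context length 1024, conditioned on sigma/timestep embeddings.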
model:
  name: gamma_hdlm
  type: ddit
  hidden_size: 768
  cond_dim: 128
  length: 1024
  n_blocks: 12
  n_heads: 12
  scale_by_sigma: false
  dropout: 0.1
  transformer_sigma_conditioning: true
  hybrid_sigma_embedding: true
  post_process_logits: true
  use_timestep_embedding: true
model_type: gamma_hybrid