```yaml
ngpus: 4
gradient_accumulation_steps: 8
pretrain_autoregressive_path: /home/toolkit/research-diffcodegen/exp_local/openwebtext/mdlm-autoregressive/org-DiTAR-absorb-v2/checkpoints-meta/checkpoint.pth
tokenizer:
  tokens: 50257
  model: gpt2
training:
  batch_size: 512
  accum: ${gradient_accumulation_steps}
  n_iters: 1000000
  snapshot_freq: 100
  log_freq: 10
  eval_freq: 100
  snapshot_freq_for_preemption: 3000
  weight: standard
  snapshot_sampling: true
  ema: 0.9999
  warmup_iter: -1
data:
  train: openwebtext-train
  valid: wikitext103
  cache_dir: /home/toolkit/research-diffcodegen/data
  debug: false
graph:
  type: QGamma
  gamma: 0.01
  file: /home/toolkit/research-diffcodegen/data
  report_all: false
  expanded_sigma: true
noise:
  type: loglinear
  sigma_min: 0.0001
  sigma_max: 2.0
  ar_diffusion: false
  expanded_sigma: ${graph.expanded_sigma}
sampling:
  predictor: analytic
  steps_per_level: 1
  noise_removal: true
  strategy: direct
  strategy_param: 0.9
annealing:
  type: block
  efficient: false
  width: 1024
  tau: 2048
  eval_tau: 512
  steps_per_level: ${sampling.steps_per_level}
  sampling_method: SAR
  diffusion_loss_weight: 1.0
  ce_loss_weight: 4.0
  sampling_eps: 0.0001
  attention:
    context_type: block_causal
    block_type: full
    match_inference: true
eval:
  batch_size: 32
  perplexity: true
  perplexity_batch_size: 16
optim:
  weight_decay: 0.0
  optimizer: AdamW
  lr: 0.0003
  beta1: 0.9
  beta2: 0.999
  eps: 1.0e-08
  warmup: 10000
  grad_clip: 1.0
  scheduler: lambda
experiment:
  name: QGamma0.01-v2
  wandb_project: debug-QGamma
model:
  name: gamma_hdlm
  type: ddit
  hidden_size: 768
  cond_dim: 128
  length: 1024
  n_blocks: 12
  n_heads: 12
  scale_by_sigma: false
  dropout: 0.1
  transformer_sigma_conditioning: true
  hybrid_sigma_embedding: true
  post_process_logits: true
  use_timestep_embedding: true
  model_type: gamma_hybrid
```
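The `${...}` entries are OmegaConf-style interpolations that resolve against the root of the config, so `training.accum`, `noise.expanded_sigma`, and `annealing.steps_per_level` track their source keys automatically. Below is a minimal sketch of how these references resolve, assuming the config is saved as `config.yaml` and loaded with OmegaConf (the training script's actual loader, the file name, and the convention that `training.batch_size` is the *global* batch are assumptions, not taken from the repo):

```python
from omegaconf import OmegaConf

# Load the config above; "config.yaml" is an assumed file name.
cfg = OmegaConf.load("config.yaml")

# ${...} interpolations resolve on access, against the config root:
assert cfg.training.accum == cfg.gradient_accumulation_steps          # 8
assert cfg.noise.expanded_sigma == cfg.graph.expanded_sigma           # true
assert cfg.annealing.steps_per_level == cfg.sampling.steps_per_level  # 1

# Assuming training.batch_size is the global batch size (a common
# convention, not confirmed by this file), the per-GPU micro-batch is:
micro_batch = cfg.training.batch_size // (cfg.ngpus * cfg.training.accum)
print(micro_batch)  # 512 // (4 * 8) = 16
```

Under that reading, each optimizer step aggregates 512 sequences of length 1024 across 4 GPUs with 8 gradient-accumulation micro-steps of 16 sequences each.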