{
  "ngpus": 8,
  "tokens": 50257,
  "training": {
    "batch_size": 512,
    "accum": 2,
    "n_iters": 1300001,
    "snapshot_freq": 50000,
    "log_freq": 50,
    "eval_freq": 100,
    "snapshot_freq_for_preemption": 10000,
    "weight": "standard",
    "snapshot_sampling": true,
    "ema": 0.9999
  },
  "data": {
    "train": "openwebtext",
    "valid": "wikitext103",
    "cache_dir": "data"
  },
  "graph": {
    "type": "absorb"
  },
  "noise": {
    "type": "loglinear",
    "sigma_min": 0.0001,
    "sigma_max": 20
  },
  "sampling": {
    "predictor": "euler",
    "steps": 128,
    "noise_removal": true
  },
  "eval": {
    "batch_size": 512,
    "perplexity": true,
    "perplexity_batch_size": 32
  },
  "optim": {
    "weight_decay": 0,
    "optimizer": "AdamW",
    "lr": 0.0003,
    "beta1": 0.9,
    "beta2": 0.999,
    "eps": 1e-08,
    "warmup": 2500,
    "grad_clip": 1.0
  },
  "model": {
    "name": "medium",
    "type": "ddit",
    "hidden_size": 1024,
    "cond_dim": 128,
    "length": 1024,
    "n_blocks": 24,
    "n_heads": 16,
    "scale_by_sigma": true,
    "dropout": 0.1
  },
  "work_dir": "absorb_medium"
}