---
# nanoT5-style T5 pretraining configuration.
# Reconstructed into block style from a whitespace-collapsed source; nesting
# follows the standard nanoT5 config layout — NOTE(review): confirm section
# boundaries (esp. which keys live under `model`) against the consuming code.

# Run mode and hardware
mode: pt                 # pretraining
device: gpu
precision: bf16
eval_only: false
predict_only: false
seed: 93789

tokenizer:
  name: BEE-spoke-data/hf_slimpajama-6B-28672-BPE-forT5

working_dir: null

model:
  liger: true            # use Liger kernels
  klass: local_t5
  name: pszemraj/tFINE-850m-24x24-1024ctx
  # Architecture overrides applied on top of the named checkpoint's config
  overwrite:
    dropout_rate: 0.0
    num_decoder_layers: 16
    num_key_value_heads: 4
    num_layers: 16
    use_gqa: true        # grouped-query attention
  add_config:
    is_bf16: false
  checkpoint_path: ''    # empty string: no checkpoint to resume from
  random_init: true      # train from scratch rather than pretrained weights
  compile: true          # torch.compile

data:
  multi_task: true
  NTP: 0.3               # next-token-prediction task mixing ratio
  input_length: 512
  max_seq_len: 512
  mlm_probability: 0.15
  mean_noise_span_length: 3.0
  num_workers: 0

optim:
  name: adamwscale
  base_lr: 0.001
  batch_size: 128
  total_steps: 65536
  epochs: -1             # -1: step-based training, ignore epoch count
  warmup_steps: 5000
  lr_scheduler: cosine
  weight_decay: 0.01
  grad_clip: 1.0
  grad_acc: 16           # gradient accumulation steps
  final_cosine: 2.0e-05  # final LR of the cosine schedule

eval:
  every_steps: 500
  steps: 0               # 0: evaluate on the full eval set

checkpoint:
  every_steps: 1500

logging:
  every_steps: 25
  grad_l2: true
  weights_l2: true
  use_wandb: true
  wandb_config:
    project: nanoT5
    entity: amazingvince
    tags:
      - gqa
      - large
      - e32-d16
      - '512 ctx'        # quoted: contains a space and starts with digits
    mode: online