diff --git "a/log/log-train-2022-12-07-04-36-17-2" "b/log/log-train-2022-12-07-04-36-17-2" new file mode 100644--- /dev/null +++ "b/log/log-train-2022-12-07-04-36-17-2" @@ -0,0 +1,19322 @@ +2022-12-07 04:36:17,367 INFO [train.py:941] (2/4) Training started +2022-12-07 04:36:17,367 INFO [train.py:951] (2/4) Device: cuda:2 +2022-12-07 04:36:17,415 INFO [lexicon.py:168] (2/4) Loading pre-compiled data/lang_char/Linv.pt +2022-12-07 04:36:17,423 INFO [train.py:962] (2/4) {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 100, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'warm_step': 2000, 'env_info': {'k2-version': '1.23', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'b2ce63f3940018e7b433c43fd802fc50ab006a76', 'k2-git-date': 'Wed Nov 23 08:43:43 2022', 'lhotse-version': '1.9.0.dev+git.97bf4b0.dirty', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'ali_meeting', 'icefall-git-sha1': 'f13cf61-dirty', 'icefall-git-date': 'Tue Dec 6 03:34:27 2022', 'icefall-path': '/exp/draj/mini_scale_2022/icefall', 'k2-path': '/exp/draj/mini_scale_2022/k2/k2/python/k2/__init__.py', 'lhotse-path': '/exp/draj/mini_scale_2022/lhotse/lhotse/__init__.py', 'hostname': 'r8n04', 'IP address': '10.1.8.4'}, 'world_size': 4, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 20, 'start_epoch': 1, 'start_batch': 0, 'exp_dir': PosixPath('pruned_transducer_stateless7/exp/v1'), 'lang_dir': 'data/lang_char', 'base_lr': 0.05, 'lr_batches': 5000, 'lr_epochs': 3.5, 'context_size': 2, 'prune_range': 5, 'lm_scale': 0.25, 'am_scale': 0.0, 'simple_loss_scale': 0.5, 'seed': 42, 'print_diagnostics': False, 'inf_check': False, 'save_every_n': 5000, 'keep_last_k': 10, 'average_period': 200, 'use_fp16': True, 'num_encoder_layers': '2,4,3,2,4', 'feedforward_dims': '1024,1024,2048,2048,1024', 'nhead': '8,8,8,8,8', 'encoder_dims': '384,384,384,384,384', 'attention_dims': '192,192,192,192,192', 'encoder_unmasked_dims': '256,256,256,256,256', 'zipformer_downsampling_factors': '1,2,4,8,2', 'cnn_module_kernels': '31,31,31,31,31', 'decoder_dim': 512, 'joiner_dim': 512, 'manifest_dir': PosixPath('data/manifests'), 'enable_musan': True, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'max_duration': 300, 'max_cuts': 100, 'num_buckets': 50, 'on_the_fly_feats': False, 'shuffle': True, 'num_workers': 8, 'enable_spec_aug': True, 'spec_aug_time_warp_factor': 80, 'blank_id': 0, 'vocab_size': 3290} +2022-12-07 04:36:17,423 INFO [train.py:964] (2/4) About to create model +2022-12-07 04:36:17,821 INFO [zipformer.py:179] (2/4) At encoder stack 4, which has downsampling_factor=2, we will combine the outputs of layers 1 and 3, with downsampling_factors=2 and 8. +2022-12-07 04:36:17,864 INFO [train.py:968] (2/4) Number of model parameters: 75734561 +2022-12-07 04:36:22,640 INFO [train.py:983] (2/4) Using DDP +2022-12-07 04:36:23,011 INFO [asr_datamodule.py:357] (2/4) About to get AMI train cuts +2022-12-07 04:36:23,015 INFO [asr_datamodule.py:204] (2/4) About to get Musan cuts +2022-12-07 04:36:23,015 INFO [asr_datamodule.py:208] (2/4) Enable MUSAN +2022-12-07 04:36:24,405 INFO [asr_datamodule.py:232] (2/4) Enable SpecAugment +2022-12-07 04:36:24,405 INFO [asr_datamodule.py:233] (2/4) Time warp factor: 80 +2022-12-07 04:36:24,405 INFO [asr_datamodule.py:246] (2/4) About to create train dataset +2022-12-07 04:36:24,405 INFO [asr_datamodule.py:259] (2/4) Using DynamicBucketingSampler. +2022-12-07 04:36:24,760 INFO [asr_datamodule.py:268] (2/4) About to create train dataloader +2022-12-07 04:36:24,761 INFO [asr_datamodule.py:381] (2/4) About to get AliMeeting IHM eval cuts +2022-12-07 04:36:24,762 INFO [asr_datamodule.py:300] (2/4) About to create dev dataset +2022-12-07 04:36:24,950 INFO [asr_datamodule.py:315] (2/4) About to create dev dataloader +2022-12-07 04:36:55,258 INFO [train.py:873] (2/4) Epoch 1, batch 0, loss[loss=5.226, simple_loss=4.74, pruned_loss=4.842, over 14324.00 frames. ], tot_loss[loss=5.226, simple_loss=4.74, pruned_loss=4.842, over 14324.00 frames. ], batch size: 55, lr: 2.50e-02, grad_scale: 2.0 +2022-12-07 04:36:55,259 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 04:37:02,328 INFO [train.py:905] (2/4) Epoch 1, validation: loss=4.832, simple_loss=4.375, pruned_loss=4.552, over 857387.00 frames. +2022-12-07 04:37:02,329 INFO [train.py:906] (2/4) Maximum memory allocated so far is 9571MB +2022-12-07 04:37:05,257 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 04:37:05,405 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=85.76 vs. limit=5.0 +2022-12-07 04:37:09,982 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=8.86 vs. limit=2.0 +2022-12-07 04:37:18,881 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:37:32,955 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=212.13 vs. limit=5.0 +2022-12-07 04:37:40,072 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4957, 3.4958, 3.4920, 3.4859, 2.9720, 3.4272, 3.4527, 3.4578], + device='cuda:2'), covar=tensor([0.0023, 0.0025, 0.0020, 0.0028, 0.0087, 0.0021, 0.0015, 0.0021], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009], + device='cuda:2'), out_proj_covar=tensor([9.1956e-06, 9.2484e-06, 9.0974e-06, 9.3481e-06, 9.2369e-06, 9.2645e-06, + 9.1997e-06, 9.3760e-06], device='cuda:2') +2022-12-07 04:37:43,990 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=22.75 vs. limit=2.0 +2022-12-07 04:37:44,434 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=6.48 vs. limit=2.0 +2022-12-07 04:37:48,807 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=144.90 vs. limit=5.0 +2022-12-07 04:38:01,926 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 04:38:03,516 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=8.95 vs. limit=2.0 +2022-12-07 04:38:13,696 INFO [train.py:873] (2/4) Epoch 1, batch 100, loss[loss=0.5093, simple_loss=0.448, pruned_loss=0.4989, over 4976.00 frames. ], tot_loss[loss=0.9814, simple_loss=0.8846, pruned_loss=0.8923, over 875846.52 frames. ], batch size: 100, lr: 3.00e-02, grad_scale: 0.125 +2022-12-07 04:38:16,997 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 3.426e+01 1.096e+02 2.039e+02 4.080e+02 7.158e+03, threshold=4.078e+02, percent-clipped=0.0 +2022-12-07 04:38:28,328 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=6.78 vs. limit=2.0 +2022-12-07 04:38:34,153 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=9.01 vs. limit=2.0 +2022-12-07 04:38:42,187 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=144.0, num_to_drop=2, layers_to_drop={0, 1} +2022-12-07 04:38:50,157 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1092, 3.0557, 3.3232, 3.2748, 3.1027, 3.2821, 3.3144, 3.3007], + device='cuda:2'), covar=tensor([0.0082, 0.0056, 0.0026, 0.0045, 0.0046, 0.0039, 0.0023, 0.0043], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009], + device='cuda:2'), out_proj_covar=tensor([8.8579e-06, 8.8805e-06, 8.7757e-06, 8.9963e-06, 8.8975e-06, 8.9016e-06, + 8.8526e-06, 9.0117e-06], device='cuda:2') +2022-12-07 04:39:03,237 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9038, 3.8956, 3.9015, 3.8973, 3.9069, 3.8962, 3.8923, 3.8818], + device='cuda:2'), covar=tensor([0.0024, 0.0035, 0.0023, 0.0025, 0.0015, 0.0032, 0.0043, 0.0025], + device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009], + device='cuda:2'), out_proj_covar=tensor([8.9646e-06, 8.9072e-06, 9.1700e-06, 8.9149e-06, 9.1003e-06, 9.0624e-06, + 9.0270e-06, 9.0327e-06], device='cuda:2') +2022-12-07 04:39:21,525 INFO [train.py:873] (2/4) Epoch 1, batch 200, loss[loss=0.3322, simple_loss=0.2957, pruned_loss=0.2695, over 2704.00 frames. ], tot_loss[loss=0.6946, simple_loss=0.6132, pruned_loss=0.6691, over 1322185.16 frames. ], batch size: 100, lr: 3.50e-02, grad_scale: 0.25 +2022-12-07 04:39:23,965 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.82 vs. limit=2.0 +2022-12-07 04:39:24,226 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([6.4620, 6.4623, 6.4613, 6.4614, 6.4615, 6.4625, 6.4607, 6.4625], + device='cuda:2'), covar=tensor([6.1800e-05, 9.5812e-05, 3.5045e-05, 8.9038e-05, 5.3650e-05, 6.4502e-05, + 7.2427e-05, 9.0781e-05], device='cuda:2'), in_proj_covar=tensor([0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009, 0.0009], + device='cuda:2'), out_proj_covar=tensor([9.2883e-06, 9.3747e-06, 9.2683e-06, 8.9862e-06, 9.3520e-06, 9.1465e-06, + 9.2779e-06, 9.1566e-06], device='cuda:2') +2022-12-07 04:39:24,801 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.748e+01 6.194e+01 9.239e+01 1.604e+02 3.274e+02, threshold=1.848e+02, percent-clipped=0.0 +2022-12-07 04:40:15,157 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.46 vs. limit=2.0 +2022-12-07 04:40:17,812 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.86 vs. limit=2.0 +2022-12-07 04:40:22,373 WARNING [optim.py:389] (2/4) Scaling gradients by 0.06466581672430038, model_norm_threshold=184.7753143310547 +2022-12-07 04:40:22,532 INFO [optim.py:451] (2/4) Parameter Dominanting tot_sumsq module.encoder.encoder_embed.conv.6.weight with proportion 0.54, where dominant_sumsq=(grad_sumsq*orig_rms_sq)=4.438e+06, grad_sumsq = 3.527e+09, orig_rms_sq=1.258e-03 +2022-12-07 04:40:27,234 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=22.10 vs. limit=5.0 +2022-12-07 04:40:27,612 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=296.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 04:40:30,349 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=300.0, num_to_drop=2, layers_to_drop={2, 3} +2022-12-07 04:40:31,029 INFO [train.py:873] (2/4) Epoch 1, batch 300, loss[loss=0.3871, simple_loss=0.3575, pruned_loss=0.24, over 1305.00 frames. ], tot_loss[loss=0.5851, simple_loss=0.5096, pruned_loss=0.5582, over 1558847.19 frames. ], batch size: 100, lr: 4.00e-02, grad_scale: 0.5 +2022-12-07 04:40:34,362 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.885e+01 5.232e+01 7.085e+01 9.806e+01 2.857e+03, threshold=1.417e+02, percent-clipped=2.0 +2022-12-07 04:41:12,174 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=357.0, num_to_drop=2, layers_to_drop={0, 3} +2022-12-07 04:41:29,385 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=24.72 vs. limit=5.0 +2022-12-07 04:41:33,212 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=387.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:41:43,791 INFO [train.py:873] (2/4) Epoch 1, batch 400, loss[loss=0.4463, simple_loss=0.3676, pruned_loss=0.4121, over 13987.00 frames. ], tot_loss[loss=0.5313, simple_loss=0.4566, pruned_loss=0.4956, over 1696388.90 frames. ], batch size: 20, lr: 4.50e-02, grad_scale: 1.0 +2022-12-07 04:41:47,368 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.807e+01 5.021e+01 6.661e+01 9.478e+01 2.713e+02, threshold=1.332e+02, percent-clipped=6.0 +2022-12-07 04:42:10,284 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=6.67 vs. limit=2.0 +2022-12-07 04:42:10,516 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=439.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 04:42:17,076 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=448.0, num_to_drop=2, layers_to_drop={1, 3} +2022-12-07 04:42:43,290 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=9.70 vs. limit=5.0 +2022-12-07 04:42:55,425 INFO [train.py:873] (2/4) Epoch 1, batch 500, loss[loss=0.484, simple_loss=0.3936, pruned_loss=0.4292, over 14491.00 frames. ], tot_loss[loss=0.5012, simple_loss=0.4254, pruned_loss=0.4539, over 1770224.19 frames. ], batch size: 51, lr: 4.99e-02, grad_scale: 1.0 +2022-12-07 04:42:59,044 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.328e+01 4.979e+01 6.873e+01 9.259e+01 1.851e+02, threshold=1.375e+02, percent-clipped=7.0 +2022-12-07 04:43:01,157 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-12-07 04:43:38,462 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=562.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:43:42,844 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=568.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:43:57,724 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=590.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:44:00,845 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.13 vs. limit=5.0 +2022-12-07 04:44:02,998 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.08 vs. limit=2.0 +2022-12-07 04:44:05,271 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=600.0, num_to_drop=2, layers_to_drop={1, 2} +2022-12-07 04:44:05,744 INFO [train.py:873] (2/4) Epoch 1, batch 600, loss[loss=0.3448, simple_loss=0.2936, pruned_loss=0.2573, over 2653.00 frames. ], tot_loss[loss=0.4827, simple_loss=0.4042, pruned_loss=0.4237, over 1882028.26 frames. ], batch size: 100, lr: 4.98e-02, grad_scale: 1.0 +2022-12-07 04:44:09,407 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 4.092e+01 6.985e+01 9.438e+01 1.366e+02 4.481e+02, threshold=1.888e+02, percent-clipped=23.0 +2022-12-07 04:44:21,355 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=623.0, num_to_drop=2, layers_to_drop={1, 2} +2022-12-07 04:44:25,213 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=629.0, num_to_drop=2, layers_to_drop={0, 1} +2022-12-07 04:44:39,010 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=648.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 04:44:41,216 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=651.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 04:44:41,837 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=652.0, num_to_drop=2, layers_to_drop={0, 3} +2022-12-07 04:45:15,579 INFO [train.py:873] (2/4) Epoch 1, batch 700, loss[loss=0.4811, simple_loss=0.3926, pruned_loss=0.3787, over 14302.00 frames. ], tot_loss[loss=0.4719, simple_loss=0.3904, pruned_loss=0.4016, over 1954861.37 frames. ], batch size: 31, lr: 4.98e-02, grad_scale: 1.0 +2022-12-07 04:45:19,022 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 4.942e+01 9.279e+01 1.429e+02 2.357e+02 5.770e+02, threshold=2.857e+02, percent-clipped=36.0 +2022-12-07 04:45:42,037 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=739.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 04:45:45,170 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=743.0, num_to_drop=2, layers_to_drop={1, 3} +2022-12-07 04:45:46,915 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.46 vs. limit=2.0 +2022-12-07 04:45:57,228 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7602, 2.9356, 2.8977, 2.9022, 2.7066, 3.0487, 2.7057, 2.9457], + device='cuda:2'), covar=tensor([0.2766, 0.3553, 0.3015, 0.3477, 0.4672, 0.2762, 0.4289, 0.3742], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0025, 0.0024, 0.0025, 0.0026, 0.0024, 0.0027, 0.0026], + device='cuda:2'), out_proj_covar=tensor([2.1123e-05, 2.1799e-05, 2.1529e-05, 2.3162e-05, 2.2321e-05, 2.1281e-05, + 2.4533e-05, 2.1945e-05], device='cuda:2') +2022-12-07 04:46:15,744 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=787.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:46:25,998 INFO [train.py:873] (2/4) Epoch 1, batch 800, loss[loss=0.4308, simple_loss=0.3536, pruned_loss=0.3214, over 14263.00 frames. ], tot_loss[loss=0.4578, simple_loss=0.3766, pruned_loss=0.3758, over 1967318.58 frames. ], batch size: 63, lr: 4.97e-02, grad_scale: 2.0 +2022-12-07 04:46:29,175 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.65 vs. limit=2.0 +2022-12-07 04:46:29,401 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 4.458e+01 1.291e+02 1.818e+02 2.513e+02 6.152e+02, threshold=3.636e+02, percent-clipped=18.0 +2022-12-07 04:46:48,856 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.38 vs. limit=5.0 +2022-12-07 04:46:58,655 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=847.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 04:47:04,973 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.51 vs. limit=2.0 +2022-12-07 04:47:12,919 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.82 vs. limit=5.0 +2022-12-07 04:47:26,623 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.32 vs. limit=5.0 +2022-12-07 04:47:32,429 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.09 vs. limit=5.0 +2022-12-07 04:47:32,788 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3321, 4.5288, 4.5465, 4.4306, 4.2653, 4.8242, 4.6047, 4.2042], + device='cuda:2'), covar=tensor([0.2963, 0.1680, 0.1990, 0.1972, 0.2289, 0.1133, 0.1190, 0.2744], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0028, 0.0029, 0.0028, 0.0031, 0.0026, 0.0028, 0.0032], + device='cuda:2'), out_proj_covar=tensor([2.9554e-05, 2.5201e-05, 2.6010e-05, 2.4731e-05, 2.8923e-05, 2.4210e-05, + 2.4725e-05, 3.1639e-05], device='cuda:2') +2022-12-07 04:47:36,963 INFO [train.py:873] (2/4) Epoch 1, batch 900, loss[loss=0.4366, simple_loss=0.3588, pruned_loss=0.3137, over 14182.00 frames. ], tot_loss[loss=0.444, simple_loss=0.3647, pruned_loss=0.3496, over 1996976.97 frames. ], batch size: 89, lr: 4.96e-02, grad_scale: 2.0 +2022-12-07 04:47:40,262 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.004e+01 1.837e+02 2.554e+02 3.837e+02 9.711e+02, threshold=5.109e+02, percent-clipped=29.0 +2022-12-07 04:47:41,850 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=908.0, num_to_drop=2, layers_to_drop={0, 1} +2022-12-07 04:47:43,636 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.08 vs. limit=2.0 +2022-12-07 04:47:48,598 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=918.0, num_to_drop=2, layers_to_drop={1, 3} +2022-12-07 04:47:52,716 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=924.0, num_to_drop=2, layers_to_drop={0, 3} +2022-12-07 04:47:55,125 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.16 vs. limit=5.0 +2022-12-07 04:48:08,946 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=946.0, num_to_drop=2, layers_to_drop={0, 3} +2022-12-07 04:48:13,024 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=952.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 04:48:42,817 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1472, 2.1674, 2.2205, 2.1039, 2.2203, 2.2139, 2.1415, 2.2295], + device='cuda:2'), covar=tensor([0.2150, 0.1592, 0.1223, 0.1575, 0.1630, 0.1688, 0.1366, 0.1595], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0018, 0.0016, 0.0017, 0.0018, 0.0017, 0.0018, 0.0016], + device='cuda:2'), out_proj_covar=tensor([1.7406e-05, 1.8752e-05, 1.5819e-05, 1.6478e-05, 1.7286e-05, 1.6973e-05, + 1.8117e-05, 1.5585e-05], device='cuda:2') +2022-12-07 04:48:46,724 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1000.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 04:48:47,390 INFO [train.py:873] (2/4) Epoch 1, batch 1000, loss[loss=0.4066, simple_loss=0.331, pruned_loss=0.2878, over 14286.00 frames. ], tot_loss[loss=0.4305, simple_loss=0.3536, pruned_loss=0.3263, over 1967159.46 frames. ], batch size: 39, lr: 4.95e-02, grad_scale: 2.0 +2022-12-07 04:48:50,981 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.184e+02 2.449e+02 3.112e+02 4.355e+02 1.197e+03, threshold=6.224e+02, percent-clipped=18.0 +2022-12-07 04:49:07,475 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.11 vs. limit=2.0 +2022-12-07 04:49:17,163 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1043.0, num_to_drop=2, layers_to_drop={1, 2} +2022-12-07 04:49:49,174 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7107, 1.9220, 1.8440, 1.9063, 1.8120, 1.8902, 1.7822, 1.7651], + device='cuda:2'), covar=tensor([0.1451, 0.1034, 0.1114, 0.1109, 0.1190, 0.0952, 0.1328, 0.1287], + device='cuda:2'), in_proj_covar=tensor([0.0033, 0.0028, 0.0030, 0.0030, 0.0031, 0.0028, 0.0030, 0.0033], + device='cuda:2'), out_proj_covar=tensor([3.0328e-05, 2.4496e-05, 2.7116e-05, 2.5273e-05, 2.6729e-05, 2.4900e-05, + 2.6783e-05, 3.0242e-05], device='cuda:2') +2022-12-07 04:49:51,012 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1091.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 04:49:58,570 INFO [train.py:873] (2/4) Epoch 1, batch 1100, loss[loss=0.3815, simple_loss=0.3225, pruned_loss=0.248, over 14455.00 frames. ], tot_loss[loss=0.4163, simple_loss=0.3428, pruned_loss=0.304, over 1939174.76 frames. ], batch size: 41, lr: 4.94e-02, grad_scale: 2.0 +2022-12-07 04:50:02,057 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.129e+02 2.767e+02 3.669e+02 5.185e+02 1.491e+03, threshold=7.337e+02, percent-clipped=11.0 +2022-12-07 04:50:35,554 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1153.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:51:04,713 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.17 vs. limit=2.0 +2022-12-07 04:51:09,751 INFO [train.py:873] (2/4) Epoch 1, batch 1200, loss[loss=0.3882, simple_loss=0.3261, pruned_loss=0.2498, over 14399.00 frames. ], tot_loss[loss=0.4038, simple_loss=0.3335, pruned_loss=0.2842, over 1961857.99 frames. ], batch size: 41, lr: 4.93e-02, grad_scale: 4.0 +2022-12-07 04:51:11,143 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1203.0, num_to_drop=2, layers_to_drop={2, 3} +2022-12-07 04:51:12,882 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.090e+02 2.727e+02 3.697e+02 4.642e+02 1.114e+03, threshold=7.394e+02, percent-clipped=5.0 +2022-12-07 04:51:15,451 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.14 vs. limit=2.0 +2022-12-07 04:51:18,462 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1214.0, num_to_drop=2, layers_to_drop={2, 3} +2022-12-07 04:51:21,200 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6548, 3.1815, 3.2677, 3.0556, 3.1048, 3.3975, 3.3221, 1.8160], + device='cuda:2'), covar=tensor([0.1312, 0.2341, 0.1689, 0.2116, 0.1695, 0.2035, 0.2270, 0.6692], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0021, 0.0019, 0.0017, 0.0018, 0.0019, 0.0021, 0.0026], + device='cuda:2'), out_proj_covar=tensor([1.3258e-05, 1.7594e-05, 1.3824e-05, 1.2077e-05, 1.3820e-05, 1.5490e-05, + 1.6071e-05, 2.2311e-05], device='cuda:2') +2022-12-07 04:51:21,848 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1218.0, num_to_drop=2, layers_to_drop={1, 2} +2022-12-07 04:51:25,910 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1224.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 04:51:41,199 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1246.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:51:43,234 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0791, 2.1965, 2.1161, 2.1622, 2.1786, 2.0836, 2.0585, 2.2025], + device='cuda:2'), covar=tensor([0.0760, 0.0678, 0.0583, 0.0473, 0.0645, 0.0686, 0.0826, 0.0677], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0023, 0.0025, 0.0022, 0.0023, 0.0023, 0.0023, 0.0022], + device='cuda:2'), out_proj_covar=tensor([2.3659e-05, 2.3766e-05, 2.3077e-05, 2.2209e-05, 2.2358e-05, 2.3121e-05, + 2.3114e-05, 2.1109e-05], device='cuda:2') +2022-12-07 04:51:55,334 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1266.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 04:51:59,309 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1272.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 04:52:15,058 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1294.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 04:52:20,507 INFO [train.py:873] (2/4) Epoch 1, batch 1300, loss[loss=0.3682, simple_loss=0.2967, pruned_loss=0.2451, over 4993.00 frames. ], tot_loss[loss=0.3934, simple_loss=0.326, pruned_loss=0.2682, over 1924090.87 frames. ], batch size: 100, lr: 4.92e-02, grad_scale: 4.0 +2022-12-07 04:52:23,970 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.301e+02 2.849e+02 3.755e+02 4.812e+02 1.034e+03, threshold=7.510e+02, percent-clipped=8.0 +2022-12-07 04:52:25,532 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7121, 2.9001, 2.9967, 2.9976, 2.8505, 2.9443, 3.0321, 2.4896], + device='cuda:2'), covar=tensor([0.1579, 0.1164, 0.0942, 0.1006, 0.1044, 0.1365, 0.0819, 0.1480], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0037, 0.0034, 0.0034, 0.0037, 0.0031, 0.0033, 0.0041], + device='cuda:2'), out_proj_covar=tensor([3.5951e-05, 3.1154e-05, 2.9705e-05, 2.9264e-05, 3.3735e-05, 2.9084e-05, + 2.7492e-05, 3.9140e-05], device='cuda:2') +2022-12-07 04:53:33,570 INFO [train.py:873] (2/4) Epoch 1, batch 1400, loss[loss=0.3841, simple_loss=0.3199, pruned_loss=0.2413, over 14260.00 frames. ], tot_loss[loss=0.3848, simple_loss=0.3195, pruned_loss=0.2554, over 1895209.43 frames. ], batch size: 28, lr: 4.91e-02, grad_scale: 4.0 +2022-12-07 04:53:36,992 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.548e+02 3.437e+02 4.813e+02 6.365e+02 1.224e+03, threshold=9.626e+02, percent-clipped=13.0 +2022-12-07 04:53:40,350 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.16 vs. limit=2.0 +2022-12-07 04:53:43,732 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.35 vs. limit=5.0 +2022-12-07 04:53:57,788 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1434.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 04:54:41,768 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1495.0, num_to_drop=2, layers_to_drop={0, 3} +2022-12-07 04:54:46,086 INFO [train.py:873] (2/4) Epoch 1, batch 1500, loss[loss=0.3795, simple_loss=0.3187, pruned_loss=0.2327, over 14094.00 frames. ], tot_loss[loss=0.3764, simple_loss=0.3139, pruned_loss=0.2429, over 1926788.95 frames. ], batch size: 29, lr: 4.89e-02, grad_scale: 4.0 +2022-12-07 04:54:47,655 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1503.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 04:54:49,749 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.091e+01 2.709e+02 3.903e+02 5.151e+02 1.116e+03, threshold=7.805e+02, percent-clipped=3.0 +2022-12-07 04:54:52,107 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1509.0, num_to_drop=2, layers_to_drop={1, 2} +2022-12-07 04:55:22,616 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1551.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 04:55:33,706 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1566.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:55:54,526 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.47 vs. limit=5.0 +2022-12-07 04:55:59,965 INFO [train.py:873] (2/4) Epoch 1, batch 1600, loss[loss=0.3381, simple_loss=0.2743, pruned_loss=0.2116, over 5957.00 frames. ], tot_loss[loss=0.3705, simple_loss=0.3093, pruned_loss=0.2343, over 1913683.00 frames. ], batch size: 100, lr: 4.88e-02, grad_scale: 8.0 +2022-12-07 04:56:03,034 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2758, 1.3477, 1.3701, 1.1538, 1.2652, 1.4272, 1.2205, 1.0739], + device='cuda:2'), covar=tensor([0.0899, 0.0594, 0.0657, 0.0794, 0.0734, 0.0589, 0.0929, 0.1497], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0023, 0.0021, 0.0022, 0.0022, 0.0020, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.9757e-05, 1.6148e-05, 1.8890e-05, 1.8123e-05, 1.8741e-05, 1.9135e-05, + 1.6677e-05, 2.0378e-05], device='cuda:2') +2022-12-07 04:56:03,632 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.027e+02 3.341e+02 4.121e+02 5.397e+02 1.445e+03, threshold=8.242e+02, percent-clipped=8.0 +2022-12-07 04:56:07,849 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1611.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:56:12,601 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1130, 3.2564, 3.6219, 3.1870, 3.3719, 3.5748, 2.8806, 3.5429], + device='cuda:2'), covar=tensor([0.0888, 0.0718, 0.0583, 0.0909, 0.0501, 0.0404, 0.1206, 0.0623], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0034, 0.0031, 0.0034, 0.0031, 0.0030, 0.0036, 0.0032], + device='cuda:2'), out_proj_covar=tensor([2.9623e-05, 2.7600e-05, 2.8575e-05, 2.7858e-05, 2.5347e-05, 2.5072e-05, + 2.9036e-05, 2.7245e-05], device='cuda:2') +2022-12-07 04:56:19,568 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1627.0, num_to_drop=2, layers_to_drop={0, 3} +2022-12-07 04:56:52,749 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1672.0, num_to_drop=2, layers_to_drop={0, 3} +2022-12-07 04:56:55,366 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.12 vs. limit=2.0 +2022-12-07 04:57:01,564 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8322, 3.3454, 3.2131, 3.4098, 3.0843, 3.7229, 3.0969, 1.6261], + device='cuda:2'), covar=tensor([0.0870, 0.1698, 0.1368, 0.1464, 0.1157, 0.0948, 0.1179, 0.8932], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0027, 0.0027, 0.0023, 0.0023, 0.0025, 0.0027, 0.0052], + device='cuda:2'), out_proj_covar=tensor([1.4630e-05, 1.9583e-05, 1.8644e-05, 1.4777e-05, 1.5758e-05, 1.7159e-05, + 1.8010e-05, 4.6527e-05], device='cuda:2') +2022-12-07 04:57:13,821 INFO [train.py:873] (2/4) Epoch 1, batch 1700, loss[loss=0.2674, simple_loss=0.2161, pruned_loss=0.1654, over 2607.00 frames. ], tot_loss[loss=0.3653, simple_loss=0.3058, pruned_loss=0.2266, over 1883440.48 frames. ], batch size: 100, lr: 4.86e-02, grad_scale: 8.0 +2022-12-07 04:57:17,699 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.181e+02 3.504e+02 4.735e+02 5.720e+02 1.294e+03, threshold=9.470e+02, percent-clipped=11.0 +2022-12-07 04:57:34,858 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.20 vs. limit=2.0 +2022-12-07 04:57:39,301 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1281, 3.0600, 3.2018, 3.3302, 3.2425, 3.1281, 3.3276, 2.9839], + device='cuda:2'), covar=tensor([0.0501, 0.0553, 0.0634, 0.0338, 0.0316, 0.0488, 0.0308, 0.0554], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0033, 0.0033, 0.0029, 0.0032, 0.0027, 0.0028, 0.0036], + device='cuda:2'), out_proj_covar=tensor([2.7431e-05, 2.7512e-05, 2.8657e-05, 2.3324e-05, 2.8638e-05, 2.5006e-05, + 2.3201e-05, 3.3919e-05], device='cuda:2') +2022-12-07 04:58:21,539 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1790.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 04:58:29,844 INFO [train.py:873] (2/4) Epoch 1, batch 1800, loss[loss=0.3533, simple_loss=0.2958, pruned_loss=0.2095, over 5995.00 frames. ], tot_loss[loss=0.3592, simple_loss=0.3015, pruned_loss=0.219, over 1857038.34 frames. ], batch size: 100, lr: 4.85e-02, grad_scale: 8.0 +2022-12-07 04:58:33,677 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.780e+01 3.438e+02 4.789e+02 6.464e+02 1.407e+03, threshold=9.578e+02, percent-clipped=4.0 +2022-12-07 04:58:36,015 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=1809.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 04:58:57,937 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1838.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:59:12,096 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=1857.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 04:59:12,184 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1857.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 04:59:21,892 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=1870.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 04:59:43,723 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1899.0, num_to_drop=2, layers_to_drop={2, 3} +2022-12-07 04:59:44,699 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-12-07 04:59:45,047 INFO [train.py:873] (2/4) Epoch 1, batch 1900, loss[loss=0.3258, simple_loss=0.2901, pruned_loss=0.1817, over 14566.00 frames. ], tot_loss[loss=0.3558, simple_loss=0.2989, pruned_loss=0.2137, over 1896176.68 frames. ], batch size: 23, lr: 4.83e-02, grad_scale: 8.0 +2022-12-07 04:59:47,622 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.01 vs. limit=2.0 +2022-12-07 04:59:48,577 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.566e+01 3.394e+02 4.356e+02 5.869e+02 1.320e+03, threshold=8.711e+02, percent-clipped=5.0 +2022-12-07 04:59:55,647 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4920, 3.6986, 3.7714, 3.4876, 3.4700, 3.6962, 3.5908, 3.5044], + device='cuda:2'), covar=tensor([0.0300, 0.0319, 0.0246, 0.0390, 0.0353, 0.0232, 0.0348, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0035, 0.0035, 0.0033, 0.0037, 0.0030, 0.0034, 0.0038], + device='cuda:2'), out_proj_covar=tensor([3.0364e-05, 2.9868e-05, 3.0671e-05, 2.8450e-05, 3.4670e-05, 2.6594e-05, + 2.8211e-05, 3.6991e-05], device='cuda:2') +2022-12-07 04:59:58,563 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1918.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 05:00:01,469 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1922.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 05:00:08,214 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=1931.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 05:00:29,209 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.65 vs. limit=5.0 +2022-12-07 05:00:35,453 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=1967.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:01:01,971 INFO [train.py:873] (2/4) Epoch 1, batch 2000, loss[loss=0.3103, simple_loss=0.2753, pruned_loss=0.1726, over 14359.00 frames. ], tot_loss[loss=0.3517, simple_loss=0.2966, pruned_loss=0.2081, over 1970638.54 frames. ], batch size: 31, lr: 4.82e-02, grad_scale: 8.0 +2022-12-07 05:01:05,829 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.042e+02 3.554e+02 4.725e+02 6.724e+02 1.490e+03, threshold=9.451e+02, percent-clipped=5.0 +2022-12-07 05:01:18,632 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.26 vs. limit=2.0 +2022-12-07 05:01:28,720 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3097, 2.7627, 3.8127, 2.9767, 4.2342, 3.9326, 3.3934, 1.8432], + device='cuda:2'), covar=tensor([0.0419, 0.3730, 0.0542, 0.1261, 0.0592, 0.0701, 0.0730, 0.5490], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0032, 0.0017, 0.0023, 0.0019, 0.0019, 0.0017, 0.0036], + device='cuda:2'), out_proj_covar=tensor([1.3516e-05, 2.9665e-05, 1.0998e-05, 1.7141e-05, 1.2366e-05, 1.3424e-05, + 1.3004e-05, 3.2402e-05], device='cuda:2') +2022-12-07 05:01:33,716 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.55 vs. limit=2.0 +2022-12-07 05:01:48,386 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0 +2022-12-07 05:02:14,851 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2090.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 05:02:19,258 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.06 vs. limit=2.0 +2022-12-07 05:02:23,810 INFO [train.py:873] (2/4) Epoch 1, batch 2100, loss[loss=0.3828, simple_loss=0.3145, pruned_loss=0.2255, over 9501.00 frames. ], tot_loss[loss=0.3419, simple_loss=0.2907, pruned_loss=0.1994, over 1944558.35 frames. ], batch size: 100, lr: 4.80e-02, grad_scale: 16.0 +2022-12-07 05:02:27,743 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 4.809e+01 3.029e+02 3.794e+02 4.821e+02 1.470e+03, threshold=7.588e+02, percent-clipped=1.0 +2022-12-07 05:02:53,152 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2138.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:03:01,667 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1734, 2.7604, 3.3675, 3.2954, 3.2508, 3.1888, 2.3537, 3.4051], + device='cuda:2'), covar=tensor([0.0257, 0.0667, 0.0265, 0.0201, 0.0236, 0.0227, 0.0888, 0.0261], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0024, 0.0022, 0.0019, 0.0025, 0.0020, 0.0020, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.8570e-05, 2.0741e-05, 1.8313e-05, 1.5661e-05, 2.1237e-05, 1.7302e-05, + 1.8720e-05, 2.0622e-05], device='cuda:2') +2022-12-07 05:03:38,920 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=2194.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:03:45,040 INFO [train.py:873] (2/4) Epoch 1, batch 2200, loss[loss=0.3146, simple_loss=0.279, pruned_loss=0.1751, over 14391.00 frames. ], tot_loss[loss=0.3381, simple_loss=0.2885, pruned_loss=0.1956, over 1960760.31 frames. ], batch size: 41, lr: 4.78e-02, grad_scale: 16.0 +2022-12-07 05:03:49,158 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.495e+02 3.028e+02 4.377e+02 7.241e+02 2.014e+03, threshold=8.755e+02, percent-clipped=23.0 +2022-12-07 05:03:54,603 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=2213.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:04:01,875 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2222.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 05:04:05,004 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=2226.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:04:16,384 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.81 vs. limit=2.0 +2022-12-07 05:04:18,789 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.11 vs. limit=2.0 +2022-12-07 05:04:38,696 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2267.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 05:04:41,018 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2270.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:05:06,467 INFO [train.py:873] (2/4) Epoch 1, batch 2300, loss[loss=0.2521, simple_loss=0.2028, pruned_loss=0.1507, over 1221.00 frames. ], tot_loss[loss=0.3334, simple_loss=0.2859, pruned_loss=0.1915, over 1921417.81 frames. ], batch size: 100, lr: 4.77e-02, grad_scale: 16.0 +2022-12-07 05:05:10,739 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.291e+01 3.360e+02 4.447e+02 6.275e+02 1.931e+03, threshold=8.894e+02, percent-clipped=10.0 +2022-12-07 05:05:18,529 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2315.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:05:19,934 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=15.14 vs. limit=5.0 +2022-12-07 05:05:38,633 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=2340.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 05:06:00,997 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.08 vs. limit=5.0 +2022-12-07 05:06:28,674 INFO [train.py:873] (2/4) Epoch 1, batch 2400, loss[loss=0.3066, simple_loss=0.2778, pruned_loss=0.1677, over 14064.00 frames. ], tot_loss[loss=0.3309, simple_loss=0.285, pruned_loss=0.189, over 1877839.38 frames. ], batch size: 29, lr: 4.75e-02, grad_scale: 16.0 +2022-12-07 05:06:28,879 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=2401.0, num_to_drop=2, layers_to_drop={1, 2} +2022-12-07 05:06:32,586 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 3.236e+02 4.365e+02 6.045e+02 1.327e+03, threshold=8.731e+02, percent-clipped=8.0 +2022-12-07 05:06:49,407 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.13 vs. limit=5.0 +2022-12-07 05:07:14,387 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.65 vs. limit=5.0 +2022-12-07 05:07:20,298 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0 +2022-12-07 05:07:35,240 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9966, 1.9297, 1.9968, 2.0982, 1.9753, 2.0090, 1.8053, 1.8161], + device='cuda:2'), covar=tensor([0.0187, 0.0197, 0.0229, 0.0120, 0.0206, 0.0097, 0.0336, 0.0277], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0036, 0.0035, 0.0032, 0.0036, 0.0026, 0.0037, 0.0040], + device='cuda:2'), out_proj_covar=tensor([3.3176e-05, 3.6135e-05, 3.6932e-05, 3.1100e-05, 3.6001e-05, 2.6532e-05, + 3.6382e-05, 4.1151e-05], device='cuda:2') +2022-12-07 05:07:37,706 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2274, 3.4230, 4.3726, 4.2712, 4.3431, 4.0725, 3.1326, 4.3234], + device='cuda:2'), covar=tensor([0.0181, 0.0637, 0.0160, 0.0150, 0.0129, 0.0263, 0.0433, 0.0197], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0031, 0.0024, 0.0024, 0.0028, 0.0024, 0.0023, 0.0024], + device='cuda:2'), out_proj_covar=tensor([2.2362e-05, 3.0018e-05, 2.2576e-05, 2.0341e-05, 2.5493e-05, 2.1700e-05, + 2.1941e-05, 2.2427e-05], device='cuda:2') +2022-12-07 05:07:44,851 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2494.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 05:07:46,966 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-12-07 05:07:50,099 INFO [train.py:873] (2/4) Epoch 1, batch 2500, loss[loss=0.2853, simple_loss=0.2414, pruned_loss=0.1646, over 5020.00 frames. ], tot_loss[loss=0.3247, simple_loss=0.2815, pruned_loss=0.1844, over 1902104.50 frames. ], batch size: 100, lr: 4.73e-02, grad_scale: 16.0 +2022-12-07 05:07:54,361 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.013e+02 3.632e+02 4.712e+02 6.167e+02 1.692e+03, threshold=9.425e+02, percent-clipped=8.0 +2022-12-07 05:08:00,474 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2513.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:08:09,941 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.52 vs. limit=2.0 +2022-12-07 05:08:10,412 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7698, 1.6030, 2.0786, 1.7830, 1.7614, 1.8651, 1.2075, 1.8261], + device='cuda:2'), covar=tensor([0.0225, 0.0335, 0.0126, 0.0257, 0.0250, 0.0225, 0.0472, 0.0212], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0032, 0.0025, 0.0024, 0.0029, 0.0025, 0.0024, 0.0024], + device='cuda:2'), out_proj_covar=tensor([2.2823e-05, 3.0763e-05, 2.3186e-05, 2.1185e-05, 2.5993e-05, 2.2615e-05, + 2.3061e-05, 2.2525e-05], device='cuda:2') +2022-12-07 05:08:11,288 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2526.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:08:23,748 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2542.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:08:39,739 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2561.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:08:50,002 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=2574.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:09:13,572 INFO [train.py:873] (2/4) Epoch 1, batch 2600, loss[loss=0.3021, simple_loss=0.2573, pruned_loss=0.1734, over 6030.00 frames. ], tot_loss[loss=0.3211, simple_loss=0.2797, pruned_loss=0.1815, over 1882948.03 frames. ], batch size: 100, lr: 4.71e-02, grad_scale: 16.0 +2022-12-07 05:09:15,410 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.22 vs. limit=2.0 +2022-12-07 05:09:17,458 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.018e+02 2.998e+02 4.312e+02 6.612e+02 1.607e+03, threshold=8.624e+02, percent-clipped=4.0 +2022-12-07 05:10:23,630 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=2685.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 05:10:23,859 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-12-07 05:10:25,527 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.02 vs. limit=2.0 +2022-12-07 05:10:30,463 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-07 05:10:32,540 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=2696.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 05:10:36,428 INFO [train.py:873] (2/4) Epoch 1, batch 2700, loss[loss=0.3271, simple_loss=0.2697, pruned_loss=0.1922, over 5035.00 frames. ], tot_loss[loss=0.3188, simple_loss=0.2786, pruned_loss=0.1797, over 1845462.45 frames. ], batch size: 100, lr: 4.69e-02, grad_scale: 16.0 +2022-12-07 05:10:40,371 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.321e+02 3.041e+02 4.444e+02 6.160e+02 1.367e+03, threshold=8.889e+02, percent-clipped=4.0 +2022-12-07 05:11:00,000 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.28 vs. limit=2.0 +2022-12-07 05:11:12,446 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.19 vs. limit=2.0 +2022-12-07 05:11:13,580 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9261, 1.0110, 1.3812, 1.2079, 1.2933, 1.4207, 1.1075, 1.2722], + device='cuda:2'), covar=tensor([0.0404, 0.0449, 0.0088, 0.0210, 0.0169, 0.0116, 0.0515, 0.0097], + device='cuda:2'), in_proj_covar=tensor([0.0032, 0.0029, 0.0021, 0.0023, 0.0022, 0.0020, 0.0025, 0.0025], + device='cuda:2'), out_proj_covar=tensor([3.1572e-05, 2.6607e-05, 1.7574e-05, 1.8214e-05, 1.8960e-05, 1.7248e-05, + 2.5095e-05, 2.1941e-05], device='cuda:2') +2022-12-07 05:11:14,292 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=2746.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 05:12:00,034 INFO [train.py:873] (2/4) Epoch 1, batch 2800, loss[loss=0.3154, simple_loss=0.2804, pruned_loss=0.1752, over 5979.00 frames. ], tot_loss[loss=0.3177, simple_loss=0.2785, pruned_loss=0.1786, over 1911385.99 frames. ], batch size: 100, lr: 4.67e-02, grad_scale: 8.0 +2022-12-07 05:12:05,006 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.030e+02 3.640e+02 4.591e+02 6.544e+02 1.997e+03, threshold=9.181e+02, percent-clipped=12.0 +2022-12-07 05:12:18,760 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=2824.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 05:12:25,375 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1009, 2.8788, 3.2044, 2.8261, 2.8836, 2.7263, 2.4552, 2.9068], + device='cuda:2'), covar=tensor([0.0254, 0.0444, 0.0290, 0.0311, 0.0283, 0.0636, 0.0848, 0.0265], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0043, 0.0039, 0.0035, 0.0043, 0.0031, 0.0048, 0.0046], + device='cuda:2'), out_proj_covar=tensor([4.2387e-05, 4.6511e-05, 4.5525e-05, 3.5945e-05, 4.7689e-05, 3.4444e-05, + 5.2472e-05, 5.2777e-05], device='cuda:2') +2022-12-07 05:12:59,609 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0954, 1.1422, 1.1877, 1.4309, 1.2447, 1.3440, 1.1230, 1.2793], + device='cuda:2'), covar=tensor([0.0261, 0.0100, 0.0171, 0.0105, 0.0136, 0.0110, 0.0198, 0.0164], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0014, 0.0011, 0.0012, 0.0012, 0.0011, 0.0012], + device='cuda:2'), out_proj_covar=tensor([9.7036e-06, 9.4675e-06, 1.2899e-05, 9.8917e-06, 1.0668e-05, 1.0246e-05, + 9.1369e-06, 1.0445e-05], device='cuda:2') +2022-12-07 05:13:09,525 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=2885.0, num_to_drop=2, layers_to_drop={1, 2} +2022-12-07 05:13:16,716 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8505, 4.6086, 4.3777, 4.5829, 4.8103, 3.7621, 5.0197, 4.6180], + device='cuda:2'), covar=tensor([0.0664, 0.0592, 0.0708, 0.0756, 0.0501, 0.0902, 0.0553, 0.1173], + device='cuda:2'), in_proj_covar=tensor([0.0061, 0.0051, 0.0067, 0.0059, 0.0065, 0.0049, 0.0050, 0.0065], + device='cuda:2'), out_proj_covar=tensor([7.1668e-05, 5.8651e-05, 7.4394e-05, 6.7568e-05, 7.1629e-05, 5.3599e-05, + 6.7243e-05, 7.6312e-05], device='cuda:2') +2022-12-07 05:13:22,398 INFO [train.py:873] (2/4) Epoch 1, batch 2900, loss[loss=0.2103, simple_loss=0.1824, pruned_loss=0.1191, over 2593.00 frames. ], tot_loss[loss=0.3135, simple_loss=0.276, pruned_loss=0.1756, over 1889476.66 frames. ], batch size: 100, lr: 4.65e-02, grad_scale: 8.0 +2022-12-07 05:13:27,204 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.262e+02 3.059e+02 4.261e+02 6.986e+02 2.302e+03, threshold=8.522e+02, percent-clipped=12.0 +2022-12-07 05:13:52,871 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=2937.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 05:13:55,902 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.77 vs. limit=5.0 +2022-12-07 05:14:43,375 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=2996.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 05:14:45,351 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=2998.0, num_to_drop=2, layers_to_drop={1, 3} +2022-12-07 05:14:48,186 INFO [train.py:873] (2/4) Epoch 1, batch 3000, loss[loss=0.2719, simple_loss=0.2394, pruned_loss=0.1522, over 5944.00 frames. ], tot_loss[loss=0.3104, simple_loss=0.2743, pruned_loss=0.1733, over 1951744.61 frames. ], batch size: 100, lr: 4.63e-02, grad_scale: 8.0 +2022-12-07 05:14:48,186 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 05:14:56,535 INFO [train.py:905] (2/4) Epoch 1, validation: loss=0.2054, simple_loss=0.2366, pruned_loss=0.08706, over 857387.00 frames. +2022-12-07 05:14:56,535 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17147MB +2022-12-07 05:15:01,308 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.332e+02 3.340e+02 4.389e+02 5.804e+02 1.068e+03, threshold=8.777e+02, percent-clipped=8.0 +2022-12-07 05:15:11,911 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3019.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 05:15:12,887 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1058, 2.9389, 3.2391, 2.7856, 2.9852, 2.6963, 2.2910, 2.9919], + device='cuda:2'), covar=tensor([0.0277, 0.0371, 0.0262, 0.0423, 0.0276, 0.0530, 0.1123, 0.0263], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0043, 0.0040, 0.0036, 0.0045, 0.0030, 0.0054, 0.0047], + device='cuda:2'), out_proj_covar=tensor([4.5761e-05, 5.1038e-05, 4.7763e-05, 3.8980e-05, 5.2154e-05, 3.4549e-05, + 6.1052e-05, 5.5861e-05], device='cuda:2') +2022-12-07 05:15:30,367 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3041.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:15:32,923 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3044.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 05:15:43,669 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-12-07 05:16:04,050 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3080.0, num_to_drop=2, layers_to_drop={0, 2} +2022-12-07 05:16:21,381 INFO [train.py:873] (2/4) Epoch 1, batch 3100, loss[loss=0.3009, simple_loss=0.2417, pruned_loss=0.1801, over 2643.00 frames. ], tot_loss[loss=0.3054, simple_loss=0.271, pruned_loss=0.1699, over 1923502.41 frames. ], batch size: 100, lr: 4.61e-02, grad_scale: 8.0 +2022-12-07 05:16:26,604 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.583e+02 3.492e+02 4.643e+02 6.451e+02 1.266e+03, threshold=9.287e+02, percent-clipped=10.0 +2022-12-07 05:17:27,815 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3180.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:17:45,612 INFO [train.py:873] (2/4) Epoch 1, batch 3200, loss[loss=0.2686, simple_loss=0.25, pruned_loss=0.1436, over 14428.00 frames. ], tot_loss[loss=0.3064, simple_loss=0.2719, pruned_loss=0.1705, over 1963796.50 frames. ], batch size: 73, lr: 4.59e-02, grad_scale: 8.0 +2022-12-07 05:17:46,283 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.13 vs. limit=2.0 +2022-12-07 05:17:50,388 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.484e+02 3.868e+02 5.118e+02 7.409e+02 2.192e+03, threshold=1.024e+03, percent-clipped=11.0 +2022-12-07 05:18:01,488 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.02 vs. limit=2.0 +2022-12-07 05:18:32,020 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7833, 2.0007, 3.2310, 2.6296, 3.1968, 3.1768, 2.0335, 1.9010], + device='cuda:2'), covar=tensor([0.0311, 0.0551, 0.0392, 0.0215, 0.0227, 0.0231, 0.1270, 0.1062], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0032, 0.0033, 0.0033, 0.0032, 0.0031, 0.0048, 0.0038], + device='cuda:2'), out_proj_covar=tensor([2.7367e-05, 2.9165e-05, 2.8479e-05, 2.4964e-05, 2.6928e-05, 2.3753e-05, + 4.8202e-05, 3.5871e-05], device='cuda:2') +2022-12-07 05:18:57,701 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3288.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 05:19:01,694 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3293.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:19:02,689 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0505, 2.5922, 3.6117, 3.0720, 3.6707, 3.5114, 2.3013, 2.4996], + device='cuda:2'), covar=tensor([0.0487, 0.0585, 0.0701, 0.0339, 0.0312, 0.0421, 0.1716, 0.1017], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0033, 0.0035, 0.0034, 0.0034, 0.0031, 0.0050, 0.0038], + device='cuda:2'), out_proj_covar=tensor([2.8427e-05, 3.0807e-05, 3.0203e-05, 2.6145e-05, 2.8453e-05, 2.4267e-05, + 5.0662e-05, 3.5889e-05], device='cuda:2') +2022-12-07 05:19:08,885 INFO [train.py:873] (2/4) Epoch 1, batch 3300, loss[loss=0.2603, simple_loss=0.2432, pruned_loss=0.1387, over 13977.00 frames. ], tot_loss[loss=0.3027, simple_loss=0.2701, pruned_loss=0.1676, over 2006929.31 frames. ], batch size: 19, lr: 4.57e-02, grad_scale: 8.0 +2022-12-07 05:19:14,103 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.592e+02 2.921e+02 3.689e+02 4.978e+02 9.002e+02, threshold=7.379e+02, percent-clipped=0.0 +2022-12-07 05:19:30,941 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.26 vs. limit=5.0 +2022-12-07 05:19:42,803 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3341.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:19:49,902 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3349.0, num_to_drop=2, layers_to_drop={1, 3} +2022-12-07 05:20:11,287 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3375.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:20:24,037 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3389.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:20:25,403 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=13.84 vs. limit=5.0 +2022-12-07 05:20:34,759 INFO [train.py:873] (2/4) Epoch 1, batch 3400, loss[loss=0.347, simple_loss=0.3001, pruned_loss=0.197, over 14247.00 frames. ], tot_loss[loss=0.3028, simple_loss=0.27, pruned_loss=0.1678, over 1991838.95 frames. ], batch size: 80, lr: 4.55e-02, grad_scale: 8.0 +2022-12-07 05:20:39,711 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.829e+02 3.330e+02 4.669e+02 8.149e+02 2.778e+03, threshold=9.337e+02, percent-clipped=27.0 +2022-12-07 05:21:25,006 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8854, 1.7080, 3.7524, 3.3672, 3.4188, 3.8630, 2.8737, 3.9686], + device='cuda:2'), covar=tensor([0.0167, 0.1767, 0.0268, 0.0455, 0.0246, 0.0174, 0.0431, 0.0228], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0059, 0.0033, 0.0041, 0.0041, 0.0036, 0.0035, 0.0034], + device='cuda:2'), out_proj_covar=tensor([3.6344e-05, 6.5982e-05, 3.8932e-05, 4.5003e-05, 4.2988e-05, 3.8475e-05, + 4.1182e-05, 3.7082e-05], device='cuda:2') +2022-12-07 05:21:34,572 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-12-07 05:21:43,555 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3480.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:22:01,457 INFO [train.py:873] (2/4) Epoch 1, batch 3500, loss[loss=0.2117, simple_loss=0.1777, pruned_loss=0.1229, over 1241.00 frames. ], tot_loss[loss=0.2987, simple_loss=0.2677, pruned_loss=0.1649, over 1972015.78 frames. ], batch size: 100, lr: 4.53e-02, grad_scale: 8.0 +2022-12-07 05:22:06,643 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.311e+02 3.200e+02 4.145e+02 5.923e+02 9.666e+02, threshold=8.289e+02, percent-clipped=1.0 +2022-12-07 05:22:24,411 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3528.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:22:40,202 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3546.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 05:23:19,929 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.76 vs. limit=5.0 +2022-12-07 05:23:21,045 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3593.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:23:28,042 INFO [train.py:873] (2/4) Epoch 1, batch 3600, loss[loss=0.2903, simple_loss=0.259, pruned_loss=0.1608, over 14266.00 frames. ], tot_loss[loss=0.2971, simple_loss=0.2665, pruned_loss=0.1639, over 1964051.87 frames. ], batch size: 25, lr: 4.50e-02, grad_scale: 8.0 +2022-12-07 05:23:33,317 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.536e+02 3.769e+02 4.638e+02 5.846e+02 1.150e+03, threshold=9.276e+02, percent-clipped=6.0 +2022-12-07 05:23:33,580 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3607.0, num_to_drop=2, layers_to_drop={1, 3} +2022-12-07 05:23:41,017 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.93 vs. limit=5.0 +2022-12-07 05:23:50,012 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.19 vs. limit=2.0 +2022-12-07 05:24:02,791 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3641.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:24:05,419 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3644.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 05:24:10,124 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.87 vs. limit=5.0 +2022-12-07 05:24:23,795 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.65 vs. limit=5.0 +2022-12-07 05:24:33,156 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3675.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:24:34,081 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3676.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 05:24:55,837 INFO [train.py:873] (2/4) Epoch 1, batch 3700, loss[loss=0.3472, simple_loss=0.3009, pruned_loss=0.1968, over 14354.00 frames. ], tot_loss[loss=0.2977, simple_loss=0.2671, pruned_loss=0.1642, over 1959212.21 frames. ], batch size: 73, lr: 4.48e-02, grad_scale: 8.0 +2022-12-07 05:25:00,988 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.789e+02 3.561e+02 4.722e+02 6.251e+02 1.502e+03, threshold=9.443e+02, percent-clipped=7.0 +2022-12-07 05:25:07,946 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4851, 1.5400, 1.8591, 1.9770, 1.9134, 1.8151, 1.2925, 2.0352], + device='cuda:2'), covar=tensor([0.0468, 0.0450, 0.0156, 0.0131, 0.0146, 0.0156, 0.0533, 0.0096], + device='cuda:2'), in_proj_covar=tensor([0.0055, 0.0046, 0.0028, 0.0032, 0.0033, 0.0030, 0.0031, 0.0029], + device='cuda:2'), out_proj_covar=tensor([6.4700e-05, 5.2196e-05, 2.9909e-05, 3.8196e-05, 3.3578e-05, 3.1840e-05, + 3.5288e-05, 3.0281e-05], device='cuda:2') +2022-12-07 05:25:14,824 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3723.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:25:26,851 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3737.0, num_to_drop=2, layers_to_drop={2, 3} +2022-12-07 05:26:22,262 INFO [train.py:873] (2/4) Epoch 1, batch 3800, loss[loss=0.2299, simple_loss=0.201, pruned_loss=0.1294, over 2639.00 frames. ], tot_loss[loss=0.295, simple_loss=0.2654, pruned_loss=0.1623, over 2004201.58 frames. ], batch size: 100, lr: 4.46e-02, grad_scale: 8.0 +2022-12-07 05:26:26,609 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7941, 2.6757, 2.5225, 2.6225, 2.6264, 2.2568, 2.8418, 2.7123], + device='cuda:2'), covar=tensor([0.0650, 0.0701, 0.0935, 0.0823, 0.0660, 0.0731, 0.0778, 0.0901], + device='cuda:2'), in_proj_covar=tensor([0.0065, 0.0054, 0.0078, 0.0068, 0.0074, 0.0052, 0.0066, 0.0073], + device='cuda:2'), out_proj_covar=tensor([8.0821e-05, 6.9001e-05, 9.5530e-05, 8.5923e-05, 8.8300e-05, 6.3388e-05, + 9.2419e-05, 9.4221e-05], device='cuda:2') +2022-12-07 05:26:27,324 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.307e+02 3.207e+02 4.617e+02 6.095e+02 1.543e+03, threshold=9.233e+02, percent-clipped=5.0 +2022-12-07 05:27:22,150 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0965, 2.0962, 3.0956, 2.7246, 2.8879, 2.8839, 1.6954, 2.3061], + device='cuda:2'), covar=tensor([0.0465, 0.0414, 0.0420, 0.0206, 0.0308, 0.0182, 0.1368, 0.0498], + device='cuda:2'), in_proj_covar=tensor([0.0041, 0.0034, 0.0039, 0.0037, 0.0038, 0.0034, 0.0056, 0.0039], + device='cuda:2'), out_proj_covar=tensor([3.4808e-05, 3.1776e-05, 3.6243e-05, 2.8610e-05, 3.1292e-05, 2.7404e-05, + 5.8906e-05, 3.8524e-05], device='cuda:2') +2022-12-07 05:27:50,296 INFO [train.py:873] (2/4) Epoch 1, batch 3900, loss[loss=0.2629, simple_loss=0.25, pruned_loss=0.1379, over 14050.00 frames. ], tot_loss[loss=0.2943, simple_loss=0.2644, pruned_loss=0.1621, over 1955037.77 frames. ], batch size: 22, lr: 4.44e-02, grad_scale: 8.0 +2022-12-07 05:27:51,217 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=3902.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 05:27:55,147 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.731e+02 3.646e+02 5.236e+02 6.626e+02 1.873e+03, threshold=1.047e+03, percent-clipped=8.0 +2022-12-07 05:28:08,300 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5545, 1.4017, 1.8034, 1.9924, 1.9362, 1.9390, 1.6667, 2.0038], + device='cuda:2'), covar=tensor([0.0338, 0.0507, 0.0178, 0.0115, 0.0112, 0.0123, 0.0344, 0.0111], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0050, 0.0029, 0.0035, 0.0034, 0.0033, 0.0032, 0.0031], + device='cuda:2'), out_proj_covar=tensor([6.9056e-05, 5.9153e-05, 3.2279e-05, 4.2874e-05, 3.5903e-05, 3.7931e-05, + 3.7540e-05, 3.4240e-05], device='cuda:2') +2022-12-07 05:28:11,849 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=3926.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:28:27,595 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=3944.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 05:28:31,958 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2877, 0.9902, 1.1707, 0.9375, 1.2667, 0.5786, 1.1489, 1.0794], + device='cuda:2'), covar=tensor([0.0108, 0.0452, 0.0124, 0.0487, 0.0137, 0.0398, 0.0134, 0.0102], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0076, 0.0039, 0.0055, 0.0033, 0.0040, 0.0034, 0.0039], + device='cuda:2'), out_proj_covar=tensor([2.9996e-05, 7.5678e-05, 3.7411e-05, 5.5359e-05, 3.3394e-05, 4.2067e-05, + 3.7185e-05, 3.6969e-05], device='cuda:2') +2022-12-07 05:28:36,953 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6330, 1.3975, 2.6239, 2.1535, 2.4091, 2.2084, 1.7571, 2.5679], + device='cuda:2'), covar=tensor([0.0180, 0.1576, 0.0240, 0.0658, 0.0189, 0.0332, 0.0706, 0.0321], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0064, 0.0037, 0.0049, 0.0046, 0.0039, 0.0036, 0.0038], + device='cuda:2'), out_proj_covar=tensor([4.5570e-05, 7.5906e-05, 4.6086e-05, 6.0431e-05, 5.3699e-05, 4.5682e-05, + 4.6739e-05, 4.5424e-05], device='cuda:2') +2022-12-07 05:28:40,757 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-12-07 05:29:05,216 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=3987.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:29:09,492 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=3992.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 05:29:17,377 INFO [train.py:873] (2/4) Epoch 1, batch 4000, loss[loss=0.2366, simple_loss=0.2015, pruned_loss=0.1359, over 2663.00 frames. ], tot_loss[loss=0.292, simple_loss=0.2634, pruned_loss=0.1603, over 1993418.11 frames. ], batch size: 100, lr: 4.42e-02, grad_scale: 8.0 +2022-12-07 05:29:22,734 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.698e+02 3.697e+02 5.328e+02 7.328e+02 1.359e+03, threshold=1.066e+03, percent-clipped=6.0 +2022-12-07 05:29:23,717 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3112, 3.1597, 4.4720, 2.9568, 4.2123, 3.4424, 4.4097, 4.2478], + device='cuda:2'), covar=tensor([0.0182, 0.2774, 0.0158, 0.4991, 0.0120, 0.2170, 0.0223, 0.0185], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0203, 0.0113, 0.0225, 0.0083, 0.0225, 0.0118, 0.0093], + device='cuda:2'), out_proj_covar=tensor([7.9125e-05, 1.6005e-04, 7.9930e-05, 1.8157e-04, 5.8523e-05, 1.7299e-04, + 8.2895e-05, 6.4054e-05], device='cuda:2') +2022-12-07 05:29:31,521 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-12-07 05:29:45,731 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=4032.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 05:30:06,703 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.50 vs. limit=5.0 +2022-12-07 05:30:16,977 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.97 vs. limit=5.0 +2022-12-07 05:30:31,492 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.15 vs. limit=2.0 +2022-12-07 05:30:47,314 INFO [train.py:873] (2/4) Epoch 1, batch 4100, loss[loss=0.2781, simple_loss=0.226, pruned_loss=0.1651, over 2611.00 frames. ], tot_loss[loss=0.2904, simple_loss=0.262, pruned_loss=0.1594, over 1960827.18 frames. ], batch size: 100, lr: 4.40e-02, grad_scale: 8.0 +2022-12-07 05:30:52,473 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.129e+02 3.295e+02 4.605e+02 5.758e+02 1.082e+03, threshold=9.210e+02, percent-clipped=1.0 +2022-12-07 05:30:55,237 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1368, 2.9770, 2.8722, 3.1588, 3.1680, 3.1465, 2.9723, 2.9750], + device='cuda:2'), covar=tensor([0.0232, 0.0431, 0.0377, 0.0341, 0.0259, 0.0320, 0.0439, 0.0336], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0062, 0.0049, 0.0045, 0.0045, 0.0051, 0.0062, 0.0057], + device='cuda:2'), out_proj_covar=tensor([5.4008e-05, 8.0114e-05, 6.1837e-05, 6.0730e-05, 5.8175e-05, 6.9281e-05, + 8.7579e-05, 7.5402e-05], device='cuda:2') +2022-12-07 05:31:22,942 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=4141.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:32:16,951 INFO [train.py:873] (2/4) Epoch 1, batch 4200, loss[loss=0.2706, simple_loss=0.2498, pruned_loss=0.1457, over 14298.00 frames. ], tot_loss[loss=0.2884, simple_loss=0.2611, pruned_loss=0.1579, over 1920191.20 frames. ], batch size: 63, lr: 4.38e-02, grad_scale: 8.0 +2022-12-07 05:32:17,745 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4202.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 05:32:17,787 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=4202.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:32:21,867 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.012e+02 3.430e+02 4.467e+02 5.944e+02 1.498e+03, threshold=8.934e+02, percent-clipped=5.0 +2022-12-07 05:32:43,033 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6642, 2.0698, 0.3636, 1.8234, 1.4691, 1.8761, 1.9949, 1.4600], + device='cuda:2'), covar=tensor([0.0364, 0.0140, 0.0762, 0.0152, 0.0422, 0.0174, 0.0139, 0.0250], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0015, 0.0017, 0.0014, 0.0015, 0.0015, 0.0013, 0.0014], + device='cuda:2'), out_proj_covar=tensor([1.2539e-05, 1.3145e-05, 1.8630e-05, 1.3629e-05, 1.4845e-05, 1.4427e-05, + 1.2110e-05, 1.5248e-05], device='cuda:2') +2022-12-07 05:32:51,174 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=15.25 vs. limit=5.0 +2022-12-07 05:33:00,173 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4250.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 05:33:18,700 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.73 vs. limit=5.0 +2022-12-07 05:33:28,857 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=4282.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:33:34,045 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=4288.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:33:45,949 INFO [train.py:873] (2/4) Epoch 1, batch 4300, loss[loss=0.2559, simple_loss=0.246, pruned_loss=0.1329, over 14243.00 frames. ], tot_loss[loss=0.2874, simple_loss=0.2604, pruned_loss=0.1572, over 1943024.34 frames. ], batch size: 94, lr: 4.35e-02, grad_scale: 8.0 +2022-12-07 05:33:51,476 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.040e+01 3.141e+02 3.918e+02 5.765e+02 1.012e+03, threshold=7.835e+02, percent-clipped=6.0 +2022-12-07 05:34:13,560 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4332.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 05:34:16,433 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.30 vs. limit=5.0 +2022-12-07 05:34:29,123 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=4349.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:34:56,519 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4380.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 05:35:15,296 INFO [train.py:873] (2/4) Epoch 1, batch 4400, loss[loss=0.2747, simple_loss=0.2582, pruned_loss=0.1456, over 14255.00 frames. ], tot_loss[loss=0.2856, simple_loss=0.2594, pruned_loss=0.1559, over 1988573.59 frames. ], batch size: 25, lr: 4.33e-02, grad_scale: 8.0 +2022-12-07 05:35:17,585 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.02 vs. limit=2.0 +2022-12-07 05:35:20,267 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.063e+02 3.285e+02 4.190e+02 5.766e+02 1.366e+03, threshold=8.380e+02, percent-clipped=8.0 +2022-12-07 05:35:33,956 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-12-07 05:36:08,718 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4620, 0.7060, 1.2066, 1.0669, 1.2422, 1.4649, 1.2801, 1.0406], + device='cuda:2'), covar=tensor([0.0404, 0.1035, 0.0486, 0.0718, 0.0265, 0.0386, 0.0622, 0.0350], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0029, 0.0032, 0.0027, 0.0026, 0.0031, 0.0026, 0.0029], + device='cuda:2'), out_proj_covar=tensor([2.7825e-05, 3.1904e-05, 3.4065e-05, 2.9654e-05, 2.5495e-05, 3.2463e-05, + 2.7167e-05, 2.9097e-05], device='cuda:2') +2022-12-07 05:36:11,708 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.04 vs. limit=5.0 +2022-12-07 05:36:40,780 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=4497.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:36:44,321 INFO [train.py:873] (2/4) Epoch 1, batch 4500, loss[loss=0.226, simple_loss=0.1942, pruned_loss=0.129, over 2600.00 frames. ], tot_loss[loss=0.2836, simple_loss=0.2584, pruned_loss=0.1544, over 2008728.97 frames. ], batch size: 100, lr: 4.31e-02, grad_scale: 8.0 +2022-12-07 05:36:49,368 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.519e+02 3.529e+02 4.865e+02 6.809e+02 1.533e+03, threshold=9.730e+02, percent-clipped=12.0 +2022-12-07 05:37:45,951 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-12-07 05:37:54,676 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4582.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:38:11,655 INFO [train.py:873] (2/4) Epoch 1, batch 4600, loss[loss=0.2522, simple_loss=0.2536, pruned_loss=0.1254, over 14459.00 frames. ], tot_loss[loss=0.2836, simple_loss=0.2585, pruned_loss=0.1544, over 1967669.98 frames. ], batch size: 24, lr: 4.29e-02, grad_scale: 8.0 +2022-12-07 05:38:16,961 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.290e+02 3.319e+02 4.792e+02 5.771e+02 3.822e+03, threshold=9.585e+02, percent-clipped=7.0 +2022-12-07 05:38:28,494 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.14 vs. limit=2.0 +2022-12-07 05:38:37,731 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4630.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:38:44,875 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5870, 3.8222, 4.2838, 4.0725, 4.4627, 4.3198, 4.4647, 4.4353], + device='cuda:2'), covar=tensor([0.0200, 0.0395, 0.0274, 0.0286, 0.0200, 0.0284, 0.0334, 0.0489], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0070, 0.0086, 0.0059, 0.0081, 0.0076, 0.0077, 0.0068], + device='cuda:2'), out_proj_covar=tensor([8.6763e-05, 8.2879e-05, 9.6936e-05, 6.7897e-05, 9.5993e-05, 8.5691e-05, + 9.9859e-05, 8.3776e-05], device='cuda:2') +2022-12-07 05:38:49,751 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=4644.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:39:27,885 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2285, 3.9264, 3.0732, 3.9190, 3.5348, 3.8299, 3.6186, 2.5523], + device='cuda:2'), covar=tensor([0.0234, 0.0416, 0.2110, 0.0178, 0.0137, 0.0135, 0.0348, 0.2316], + device='cuda:2'), in_proj_covar=tensor([0.0089, 0.0093, 0.0168, 0.0075, 0.0068, 0.0080, 0.0098, 0.0183], + device='cuda:2'), out_proj_covar=tensor([4.8172e-05, 5.4583e-05, 1.0454e-04, 4.0598e-05, 3.8426e-05, 4.4691e-05, + 5.6619e-05, 1.1403e-04], device='cuda:2') +2022-12-07 05:39:41,707 INFO [train.py:873] (2/4) Epoch 1, batch 4700, loss[loss=0.3117, simple_loss=0.286, pruned_loss=0.1687, over 14242.00 frames. ], tot_loss[loss=0.2813, simple_loss=0.2572, pruned_loss=0.1527, over 2011526.20 frames. ], batch size: 35, lr: 4.27e-02, grad_scale: 8.0 +2022-12-07 05:39:46,821 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.177e+02 3.103e+02 4.134e+02 5.921e+02 2.901e+03, threshold=8.268e+02, percent-clipped=6.0 +2022-12-07 05:40:09,726 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8626, 3.4122, 2.5417, 3.4204, 2.9917, 3.2645, 3.2208, 2.2623], + device='cuda:2'), covar=tensor([0.0221, 0.0461, 0.2758, 0.0220, 0.0189, 0.0249, 0.0479, 0.3039], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0096, 0.0175, 0.0078, 0.0070, 0.0081, 0.0101, 0.0189], + device='cuda:2'), out_proj_covar=tensor([4.9682e-05, 5.6334e-05, 1.0858e-04, 4.2581e-05, 3.9758e-05, 4.5883e-05, + 5.8448e-05, 1.1825e-04], device='cuda:2') +2022-12-07 05:41:06,501 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4797.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:41:10,381 INFO [train.py:873] (2/4) Epoch 1, batch 4800, loss[loss=0.3085, simple_loss=0.2755, pruned_loss=0.1708, over 14362.00 frames. ], tot_loss[loss=0.2805, simple_loss=0.2567, pruned_loss=0.1522, over 1995724.27 frames. ], batch size: 66, lr: 4.25e-02, grad_scale: 16.0 +2022-12-07 05:41:10,839 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 05:41:15,575 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.267e+02 3.055e+02 4.514e+02 6.112e+02 1.452e+03, threshold=9.028e+02, percent-clipped=8.0 +2022-12-07 05:41:49,185 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4845.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:42:10,761 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2639, 3.9183, 3.7203, 4.1360, 4.0257, 3.1489, 4.4525, 4.3343], + device='cuda:2'), covar=tensor([0.0889, 0.0665, 0.1035, 0.0853, 0.0877, 0.0818, 0.0701, 0.0805], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0063, 0.0089, 0.0079, 0.0083, 0.0056, 0.0078, 0.0079], + device='cuda:2'), out_proj_covar=tensor([1.0392e-04, 8.8573e-05, 1.1794e-04, 1.0833e-04, 1.0835e-04, 7.4692e-05, + 1.1421e-04, 1.1036e-04], device='cuda:2') +2022-12-07 05:42:12,852 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9234, 1.3776, 2.7421, 2.2502, 2.7652, 2.7053, 2.0946, 3.0771], + device='cuda:2'), covar=tensor([0.0217, 0.2074, 0.0367, 0.0965, 0.0266, 0.0297, 0.0730, 0.0233], + device='cuda:2'), in_proj_covar=tensor([0.0047, 0.0083, 0.0043, 0.0070, 0.0058, 0.0048, 0.0044, 0.0046], + device='cuda:2'), out_proj_covar=tensor([6.3504e-05, 1.0854e-04, 6.4820e-05, 9.6025e-05, 8.0116e-05, 6.6190e-05, + 6.4270e-05, 6.3949e-05], device='cuda:2') +2022-12-07 05:42:20,883 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.87 vs. limit=2.0 +2022-12-07 05:42:39,195 INFO [train.py:873] (2/4) Epoch 1, batch 4900, loss[loss=0.2637, simple_loss=0.257, pruned_loss=0.1352, over 14289.00 frames. ], tot_loss[loss=0.2798, simple_loss=0.2561, pruned_loss=0.1518, over 1968878.52 frames. ], batch size: 25, lr: 4.23e-02, grad_scale: 16.0 +2022-12-07 05:42:44,141 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.236e+02 3.107e+02 4.475e+02 6.071e+02 1.419e+03, threshold=8.951e+02, percent-clipped=8.0 +2022-12-07 05:43:04,100 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3233, 1.1850, 1.3331, 1.2758, 1.3810, 1.4080, 1.0843, 1.5066], + device='cuda:2'), covar=tensor([0.2430, 0.1162, 0.0635, 0.0932, 0.0418, 0.0626, 0.0710, 0.0412], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0054, 0.0034, 0.0034, 0.0041, 0.0033, 0.0044, 0.0042], + device='cuda:2'), out_proj_covar=tensor([1.1008e-04, 7.0326e-05, 4.9328e-05, 4.9951e-05, 4.9213e-05, 4.7354e-05, + 5.8459e-05, 5.2351e-05], device='cuda:2') +2022-12-07 05:43:14,978 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5551, 2.7139, 2.5696, 2.5712, 2.3364, 2.1394, 2.2520, 1.8974], + device='cuda:2'), covar=tensor([0.0447, 0.0387, 0.0213, 0.0502, 0.0436, 0.1273, 0.0205, 0.1356], + device='cuda:2'), in_proj_covar=tensor([0.0043, 0.0044, 0.0031, 0.0040, 0.0046, 0.0070, 0.0032, 0.0064], + device='cuda:2'), out_proj_covar=tensor([3.3584e-05, 3.3861e-05, 2.3855e-05, 3.2115e-05, 3.5070e-05, 5.8139e-05, + 2.2987e-05, 5.2213e-05], device='cuda:2') +2022-12-07 05:43:16,680 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=4944.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:43:56,245 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-12-07 05:43:58,613 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=4992.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:44:10,010 INFO [train.py:873] (2/4) Epoch 1, batch 5000, loss[loss=0.286, simple_loss=0.2657, pruned_loss=0.1531, over 14256.00 frames. ], tot_loss[loss=0.2807, simple_loss=0.2568, pruned_loss=0.1523, over 1978385.50 frames. ], batch size: 80, lr: 4.20e-02, grad_scale: 16.0 +2022-12-07 05:44:15,206 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.545e+02 3.215e+02 4.595e+02 5.686e+02 1.097e+03, threshold=9.191e+02, percent-clipped=3.0 +2022-12-07 05:44:54,963 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.78 vs. limit=2.0 +2022-12-07 05:44:58,992 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.49 vs. limit=2.0 +2022-12-07 05:45:06,611 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6976, 1.3799, 2.4008, 2.0020, 2.2959, 2.0834, 1.0174, 1.6035], + device='cuda:2'), covar=tensor([0.0529, 0.0856, 0.0379, 0.0229, 0.0226, 0.0254, 0.1501, 0.0518], + device='cuda:2'), in_proj_covar=tensor([0.0044, 0.0043, 0.0045, 0.0042, 0.0043, 0.0042, 0.0069, 0.0044], + device='cuda:2'), out_proj_covar=tensor([4.5521e-05, 4.3319e-05, 4.7841e-05, 3.7725e-05, 3.9443e-05, 3.8401e-05, + 7.5337e-05, 4.7829e-05], device='cuda:2') +2022-12-07 05:45:11,254 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9319, 1.9951, 1.8365, 1.9107, 1.8289, 1.8152, 2.0117, 1.9765], + device='cuda:2'), covar=tensor([0.1267, 0.0992, 0.1507, 0.1190, 0.1431, 0.0893, 0.1082, 0.1197], + device='cuda:2'), in_proj_covar=tensor([0.0079, 0.0063, 0.0087, 0.0079, 0.0084, 0.0056, 0.0079, 0.0080], + device='cuda:2'), out_proj_covar=tensor([1.0609e-04, 8.9217e-05, 1.1764e-04, 1.0789e-04, 1.1275e-04, 7.7007e-05, + 1.1685e-04, 1.1067e-04], device='cuda:2') +2022-12-07 05:45:27,610 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4411, 1.1358, 1.2754, 1.3409, 1.2663, 2.2990, 1.4217, 1.0721], + device='cuda:2'), covar=tensor([0.0334, 0.0610, 0.0494, 0.0316, 0.0299, 0.0145, 0.0368, 0.0429], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0024, 0.0020, 0.0021, 0.0021, 0.0020, 0.0020], + device='cuda:2'), out_proj_covar=tensor([1.9854e-05, 2.0237e-05, 2.8950e-05, 2.0232e-05, 2.3628e-05, 2.1211e-05, + 2.3077e-05, 2.1333e-05], device='cuda:2') +2022-12-07 05:45:39,007 INFO [train.py:873] (2/4) Epoch 1, batch 5100, loss[loss=0.2992, simple_loss=0.2694, pruned_loss=0.1645, over 14278.00 frames. ], tot_loss[loss=0.278, simple_loss=0.2552, pruned_loss=0.1504, over 2026970.72 frames. ], batch size: 31, lr: 4.18e-02, grad_scale: 16.0 +2022-12-07 05:45:39,647 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-07 05:45:44,000 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.500e+02 3.085e+02 4.030e+02 5.035e+02 8.863e+02, threshold=8.060e+02, percent-clipped=0.0 +2022-12-07 05:46:00,290 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-12-07 05:46:43,437 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7425, 2.4474, 4.6849, 3.3300, 3.8465, 2.8631, 4.5873, 4.2991], + device='cuda:2'), covar=tensor([0.0186, 0.4156, 0.0157, 0.5552, 0.0133, 0.2453, 0.0218, 0.0106], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0228, 0.0138, 0.0321, 0.0099, 0.0257, 0.0152, 0.0112], + device='cuda:2'), out_proj_covar=tensor([1.0563e-04, 1.9092e-04, 1.0297e-04, 2.5927e-04, 7.6350e-05, 2.0320e-04, + 1.1433e-04, 8.1979e-05], device='cuda:2') +2022-12-07 05:46:49,426 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.63 vs. limit=2.0 +2022-12-07 05:47:06,724 INFO [train.py:873] (2/4) Epoch 1, batch 5200, loss[loss=0.2395, simple_loss=0.2276, pruned_loss=0.1257, over 6958.00 frames. ], tot_loss[loss=0.2786, simple_loss=0.2555, pruned_loss=0.1509, over 1990998.27 frames. ], batch size: 100, lr: 4.16e-02, grad_scale: 16.0 +2022-12-07 05:47:11,951 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.189e+02 3.667e+02 4.771e+02 6.312e+02 1.162e+03, threshold=9.542e+02, percent-clipped=12.0 +2022-12-07 05:47:36,525 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3998, 1.6175, 2.7053, 2.2365, 2.6067, 2.5277, 1.9978, 2.7073], + device='cuda:2'), covar=tensor([0.0297, 0.1669, 0.0300, 0.1005, 0.0255, 0.0368, 0.0644, 0.0320], + device='cuda:2'), in_proj_covar=tensor([0.0048, 0.0083, 0.0044, 0.0078, 0.0061, 0.0051, 0.0046, 0.0050], + device='cuda:2'), out_proj_covar=tensor([6.8040e-05, 1.1255e-04, 6.9876e-05, 1.0874e-04, 9.0096e-05, 7.4103e-05, + 6.9734e-05, 7.4726e-05], device='cuda:2') +2022-12-07 05:47:46,277 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.91 vs. limit=5.0 +2022-12-07 05:48:33,138 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-12-07 05:48:35,988 INFO [train.py:873] (2/4) Epoch 1, batch 5300, loss[loss=0.2537, simple_loss=0.2233, pruned_loss=0.142, over 4957.00 frames. ], tot_loss[loss=0.2756, simple_loss=0.2531, pruned_loss=0.149, over 1942442.30 frames. ], batch size: 100, lr: 4.14e-02, grad_scale: 16.0 +2022-12-07 05:48:40,962 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.435e+02 3.215e+02 4.180e+02 5.101e+02 1.112e+03, threshold=8.360e+02, percent-clipped=0.0 +2022-12-07 05:48:52,045 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.16 vs. limit=2.0 +2022-12-07 05:48:58,909 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.42 vs. limit=5.0 +2022-12-07 05:49:17,619 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7907, 2.4388, 2.8192, 2.5114, 2.6297, 2.4658, 2.4018, 2.3277], + device='cuda:2'), covar=tensor([0.0252, 0.1345, 0.0131, 0.0359, 0.0154, 0.0224, 0.0385, 0.1697], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0148, 0.0061, 0.0080, 0.0067, 0.0074, 0.0067, 0.0166], + device='cuda:2'), out_proj_covar=tensor([5.6358e-05, 1.1196e-04, 4.1021e-05, 5.9872e-05, 4.6710e-05, 5.1415e-05, + 5.1748e-05, 1.2195e-04], device='cuda:2') +2022-12-07 05:50:04,682 INFO [train.py:873] (2/4) Epoch 1, batch 5400, loss[loss=0.3006, simple_loss=0.272, pruned_loss=0.1646, over 14386.00 frames. ], tot_loss[loss=0.2749, simple_loss=0.253, pruned_loss=0.1484, over 1922740.92 frames. ], batch size: 73, lr: 4.12e-02, grad_scale: 16.0 +2022-12-07 05:50:09,834 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.361e+02 3.355e+02 4.207e+02 5.481e+02 1.330e+03, threshold=8.415e+02, percent-clipped=4.0 +2022-12-07 05:50:16,351 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9951, 0.5624, 0.8208, 1.1467, 1.1018, 0.5210, 0.6522, 1.1896], + device='cuda:2'), covar=tensor([0.0884, 0.1208, 0.0667, 0.0851, 0.0574, 0.0681, 0.0956, 0.0379], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0029, 0.0033, 0.0025, 0.0027, 0.0032, 0.0024, 0.0026], + device='cuda:2'), out_proj_covar=tensor([3.2089e-05, 3.6889e-05, 3.9972e-05, 3.2770e-05, 2.9597e-05, 3.7541e-05, + 3.0089e-05, 3.0803e-05], device='cuda:2') +2022-12-07 05:50:55,040 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6781, 4.4042, 4.4248, 4.5404, 4.6456, 4.6648, 4.6652, 4.7244], + device='cuda:2'), covar=tensor([0.0274, 0.0326, 0.0308, 0.0192, 0.0187, 0.0228, 0.0326, 0.0393], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0077, 0.0098, 0.0066, 0.0084, 0.0083, 0.0088, 0.0077], + device='cuda:2'), out_proj_covar=tensor([1.0151e-04, 9.5739e-05, 1.1858e-04, 7.7675e-05, 1.0122e-04, 9.7734e-05, + 1.2384e-04, 9.9080e-05], device='cuda:2') +2022-12-07 05:50:57,310 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.02 vs. limit=2.0 +2022-12-07 05:51:01,697 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.32 vs. limit=5.0 +2022-12-07 05:51:16,921 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.88 vs. limit=2.0 +2022-12-07 05:51:22,593 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3163, 2.1529, 2.3674, 2.2228, 2.1425, 2.1761, 1.4712, 2.2145], + device='cuda:2'), covar=tensor([0.0459, 0.0504, 0.0475, 0.0317, 0.0477, 0.0383, 0.1910, 0.0355], + device='cuda:2'), in_proj_covar=tensor([0.0064, 0.0066, 0.0069, 0.0056, 0.0078, 0.0051, 0.0096, 0.0074], + device='cuda:2'), out_proj_covar=tensor([1.0026e-04, 1.2134e-04, 1.1288e-04, 9.5259e-05, 1.3239e-04, 8.4715e-05, + 1.4300e-04, 1.1745e-04], device='cuda:2') +2022-12-07 05:51:33,375 INFO [train.py:873] (2/4) Epoch 1, batch 5500, loss[loss=0.2662, simple_loss=0.2355, pruned_loss=0.1484, over 5973.00 frames. ], tot_loss[loss=0.2719, simple_loss=0.251, pruned_loss=0.1464, over 1915331.68 frames. ], batch size: 100, lr: 4.10e-02, grad_scale: 16.0 +2022-12-07 05:51:38,435 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 2.070e+02 3.601e+02 4.745e+02 6.042e+02 1.360e+03, threshold=9.490e+02, percent-clipped=8.0 +2022-12-07 05:51:57,876 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 05:52:26,005 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8560, 3.5059, 3.5779, 3.8258, 3.6626, 2.9728, 4.0251, 3.8704], + device='cuda:2'), covar=tensor([0.0682, 0.0885, 0.0831, 0.0731, 0.1021, 0.0631, 0.0658, 0.1098], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0064, 0.0086, 0.0081, 0.0084, 0.0053, 0.0079, 0.0083], + device='cuda:2'), out_proj_covar=tensor([1.0728e-04, 9.3463e-05, 1.2184e-04, 1.1318e-04, 1.1637e-04, 7.3423e-05, + 1.1946e-04, 1.1769e-04], device='cuda:2') +2022-12-07 05:52:59,125 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.15 vs. limit=5.0 +2022-12-07 05:53:00,152 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.05 vs. limit=5.0 +2022-12-07 05:53:01,583 INFO [train.py:873] (2/4) Epoch 1, batch 5600, loss[loss=0.2243, simple_loss=0.2356, pruned_loss=0.1065, over 14214.00 frames. ], tot_loss[loss=0.2743, simple_loss=0.2525, pruned_loss=0.1481, over 1925691.00 frames. ], batch size: 32, lr: 4.08e-02, grad_scale: 16.0 +2022-12-07 05:53:06,470 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.507e+02 3.374e+02 4.886e+02 6.824e+02 1.449e+03, threshold=9.773e+02, percent-clipped=6.0 +2022-12-07 05:53:24,039 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.11 vs. limit=2.0 +2022-12-07 05:53:24,514 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6814, 1.5461, 3.0104, 2.8083, 2.6503, 2.8481, 2.0287, 3.0397], + device='cuda:2'), covar=tensor([0.1258, 0.1204, 0.0143, 0.0207, 0.0140, 0.0161, 0.0326, 0.0105], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0083, 0.0042, 0.0051, 0.0043, 0.0045, 0.0042, 0.0041], + device='cuda:2'), out_proj_covar=tensor([1.1715e-04, 1.1357e-04, 5.8682e-05, 8.0724e-05, 5.7098e-05, 6.4478e-05, + 6.3182e-05, 5.6189e-05], device='cuda:2') +2022-12-07 05:53:44,243 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=5650.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 05:53:50,555 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.94 vs. limit=5.0 +2022-12-07 05:53:56,763 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6996, 1.6924, 3.4220, 3.2691, 3.0978, 3.1373, 2.4167, 3.4461], + device='cuda:2'), covar=tensor([0.1426, 0.1373, 0.0136, 0.0116, 0.0142, 0.0163, 0.0344, 0.0102], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0084, 0.0043, 0.0051, 0.0044, 0.0045, 0.0042, 0.0041], + device='cuda:2'), out_proj_covar=tensor([1.1921e-04, 1.1567e-04, 6.0365e-05, 7.9912e-05, 5.8434e-05, 6.3657e-05, + 6.4253e-05, 5.6533e-05], device='cuda:2') +2022-12-07 05:54:29,450 INFO [train.py:873] (2/4) Epoch 1, batch 5700, loss[loss=0.2598, simple_loss=0.2088, pruned_loss=0.1554, over 1244.00 frames. ], tot_loss[loss=0.2744, simple_loss=0.2527, pruned_loss=0.1481, over 1929204.34 frames. ], batch size: 100, lr: 4.06e-02, grad_scale: 16.0 +2022-12-07 05:54:34,444 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.548e+02 3.498e+02 4.626e+02 6.502e+02 1.133e+03, threshold=9.251e+02, percent-clipped=2.0 +2022-12-07 05:54:38,253 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5711.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 05:54:44,763 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.71 vs. limit=2.0 +2022-12-07 05:54:50,284 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3450, 2.9768, 3.6683, 2.6696, 3.4757, 3.2111, 1.6478, 3.4194], + device='cuda:2'), covar=tensor([0.0348, 0.0601, 0.0435, 0.0699, 0.0389, 0.0610, 0.2899, 0.0299], + device='cuda:2'), in_proj_covar=tensor([0.0065, 0.0063, 0.0068, 0.0056, 0.0079, 0.0051, 0.0099, 0.0075], + device='cuda:2'), out_proj_covar=tensor([1.0727e-04, 1.1689e-04, 1.1303e-04, 9.8957e-05, 1.3985e-04, 8.6991e-05, + 1.4927e-04, 1.2179e-04], device='cuda:2') +2022-12-07 05:55:16,851 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=5755.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:55:29,231 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8148, 1.7288, 3.6918, 2.8145, 3.5597, 2.1443, 3.7875, 3.5343], + device='cuda:2'), covar=tensor([0.0259, 0.4681, 0.0271, 0.5002, 0.0144, 0.3181, 0.0340, 0.0212], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0242, 0.0135, 0.0332, 0.0103, 0.0270, 0.0158, 0.0111], + device='cuda:2'), out_proj_covar=tensor([1.1163e-04, 2.0488e-04, 1.0631e-04, 2.7278e-04, 8.1182e-05, 2.1635e-04, + 1.2330e-04, 8.4506e-05], device='cuda:2') +2022-12-07 05:55:58,101 INFO [train.py:873] (2/4) Epoch 1, batch 5800, loss[loss=0.2829, simple_loss=0.2543, pruned_loss=0.1558, over 9503.00 frames. ], tot_loss[loss=0.2744, simple_loss=0.2527, pruned_loss=0.1481, over 1997305.99 frames. ], batch size: 100, lr: 4.04e-02, grad_scale: 16.0 +2022-12-07 05:56:02,978 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-12-07 05:56:03,252 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.919e+02 3.658e+02 4.622e+02 6.453e+02 1.135e+03, threshold=9.244e+02, percent-clipped=6.0 +2022-12-07 05:56:11,928 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5816.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:56:26,413 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9264, 3.6453, 3.5967, 3.9192, 4.0147, 3.9196, 3.8706, 3.5323], + device='cuda:2'), covar=tensor([0.0297, 0.0499, 0.0300, 0.0359, 0.0289, 0.0406, 0.0427, 0.0401], + device='cuda:2'), in_proj_covar=tensor([0.0048, 0.0075, 0.0060, 0.0056, 0.0056, 0.0065, 0.0079, 0.0074], + device='cuda:2'), out_proj_covar=tensor([7.8252e-05, 1.0813e-04, 8.5763e-05, 8.8332e-05, 8.6017e-05, 1.0262e-04, + 1.3223e-04, 1.1421e-04], device='cuda:2') +2022-12-07 05:56:28,326 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1168, 0.9072, 1.0879, 1.4520, 1.8623, 1.8254, 1.0213, 1.0552], + device='cuda:2'), covar=tensor([0.0336, 0.0551, 0.0342, 0.0112, 0.0106, 0.0139, 0.0186, 0.0366], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0020, 0.0017, 0.0017, 0.0021, 0.0018, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.1212e-05, 2.0996e-05, 2.5949e-05, 1.7830e-05, 1.9693e-05, 2.3105e-05, + 2.3416e-05, 2.1811e-05], device='cuda:2') +2022-12-07 05:56:48,632 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=5858.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:57:26,650 INFO [train.py:873] (2/4) Epoch 1, batch 5900, loss[loss=0.287, simple_loss=0.2647, pruned_loss=0.1546, over 14241.00 frames. ], tot_loss[loss=0.2715, simple_loss=0.2512, pruned_loss=0.1459, over 1965108.68 frames. ], batch size: 69, lr: 4.02e-02, grad_scale: 16.0 +2022-12-07 05:57:31,260 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8697, 0.7515, 0.8989, 0.9903, 1.2743, 1.0816, 0.8248, 0.8103], + device='cuda:2'), covar=tensor([0.0196, 0.0177, 0.0219, 0.0099, 0.0095, 0.0138, 0.0141, 0.0222], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0018, 0.0020, 0.0018, 0.0017, 0.0021, 0.0018, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.1280e-05, 2.0763e-05, 2.5507e-05, 1.8403e-05, 1.9951e-05, 2.3415e-05, + 2.3806e-05, 2.2394e-05], device='cuda:2') +2022-12-07 05:57:31,835 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.800e+02 3.450e+02 4.399e+02 5.936e+02 1.198e+03, threshold=8.798e+02, percent-clipped=6.0 +2022-12-07 05:57:34,292 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.06 vs. limit=2.0 +2022-12-07 05:57:42,918 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=5919.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:57:43,401 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.16 vs. limit=2.0 +2022-12-07 05:57:46,616 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.30 vs. limit=2.0 +2022-12-07 05:57:53,110 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7682, 0.8571, 1.0892, 1.0412, 0.8779, 1.1607, 0.8684, 1.0817], + device='cuda:2'), covar=tensor([0.2711, 0.0979, 0.0588, 0.0542, 0.0591, 0.0371, 0.0801, 0.0415], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0055, 0.0029, 0.0034, 0.0043, 0.0035, 0.0043, 0.0041], + device='cuda:2'), out_proj_covar=tensor([1.3298e-04, 8.1740e-05, 5.2652e-05, 5.3492e-05, 6.0320e-05, 5.7016e-05, + 6.7152e-05, 5.8463e-05], device='cuda:2') +2022-12-07 05:58:05,001 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.5080, 5.4823, 5.0170, 5.7354, 5.4005, 4.5525, 5.9771, 5.8897], + device='cuda:2'), covar=tensor([0.0601, 0.0424, 0.0776, 0.0558, 0.0563, 0.0439, 0.0536, 0.0720], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0066, 0.0093, 0.0089, 0.0092, 0.0060, 0.0086, 0.0089], + device='cuda:2'), out_proj_covar=tensor([1.2122e-04, 1.0066e-04, 1.3327e-04, 1.2579e-04, 1.2894e-04, 8.4963e-05, + 1.3142e-04, 1.2833e-04], device='cuda:2') +2022-12-07 05:58:44,772 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=5990.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:58:54,697 INFO [train.py:873] (2/4) Epoch 1, batch 6000, loss[loss=0.2903, simple_loss=0.2632, pruned_loss=0.1587, over 13545.00 frames. ], tot_loss[loss=0.2712, simple_loss=0.2507, pruned_loss=0.1458, over 1955182.05 frames. ], batch size: 100, lr: 4.00e-02, grad_scale: 16.0 +2022-12-07 05:58:54,697 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 05:59:02,828 INFO [train.py:905] (2/4) Epoch 1, validation: loss=0.159, simple_loss=0.1938, pruned_loss=0.06211, over 857387.00 frames. +2022-12-07 05:59:02,829 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 05:59:07,236 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6006.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 05:59:07,936 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.676e+02 3.896e+02 5.286e+02 6.584e+02 1.638e+03, threshold=1.057e+03, percent-clipped=9.0 +2022-12-07 05:59:12,663 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 05:59:23,452 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-07 05:59:24,249 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-12-07 05:59:39,717 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6043.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:59:46,759 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6051.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 05:59:52,894 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4852, 1.0811, 1.1888, 1.0953, 1.3288, 1.0094, 1.3681, 1.2298], + device='cuda:2'), covar=tensor([0.0122, 0.0834, 0.0186, 0.0474, 0.0303, 0.0462, 0.0383, 0.0161], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0095, 0.0046, 0.0065, 0.0041, 0.0045, 0.0043, 0.0043], + device='cuda:2'), out_proj_covar=tensor([4.8015e-05, 1.2289e-04, 5.8922e-05, 8.6133e-05, 6.0528e-05, 6.2609e-05, + 6.6466e-05, 5.7498e-05], device='cuda:2') +2022-12-07 06:00:19,261 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.5105, 0.4098, 0.4580, 0.6357, 0.6141, 0.6695, 0.5764, 0.5049], + device='cuda:2'), covar=tensor([0.0105, 0.0109, 0.0067, 0.0081, 0.0075, 0.0102, 0.0070, 0.0142], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0017, 0.0020, 0.0018, 0.0016, 0.0019, 0.0018, 0.0019], + device='cuda:2'), out_proj_covar=tensor([2.0637e-05, 1.9612e-05, 2.5639e-05, 1.9005e-05, 1.9405e-05, 2.1754e-05, + 2.3717e-05, 2.2367e-05], device='cuda:2') +2022-12-07 06:00:30,200 INFO [train.py:873] (2/4) Epoch 1, batch 6100, loss[loss=0.1816, simple_loss=0.1573, pruned_loss=0.1029, over 1344.00 frames. ], tot_loss[loss=0.2713, simple_loss=0.2514, pruned_loss=0.1456, over 2020059.72 frames. ], batch size: 100, lr: 3.98e-02, grad_scale: 16.0 +2022-12-07 06:00:30,892 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-12-07 06:00:33,329 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6104.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:00:35,715 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.366e+02 3.351e+02 5.105e+02 6.275e+02 1.538e+03, threshold=1.021e+03, percent-clipped=3.0 +2022-12-07 06:00:39,119 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6111.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:00:39,147 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3927, 3.9368, 4.4302, 3.7047, 4.2426, 4.0565, 1.8937, 4.1792], + device='cuda:2'), covar=tensor([0.0177, 0.0340, 0.0313, 0.0431, 0.0229, 0.0150, 0.2655, 0.0218], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0063, 0.0068, 0.0054, 0.0083, 0.0053, 0.0100, 0.0075], + device='cuda:2'), out_proj_covar=tensor([1.0991e-04, 1.2027e-04, 1.1647e-04, 9.8498e-05, 1.5181e-04, 9.4740e-05, + 1.5487e-04, 1.2743e-04], device='cuda:2') +2022-12-07 06:00:46,838 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5972, 1.3625, 2.1483, 1.9030, 1.9311, 1.9091, 1.0061, 1.3383], + device='cuda:2'), covar=tensor([0.0287, 0.0394, 0.0172, 0.0151, 0.0149, 0.0178, 0.0902, 0.0560], + device='cuda:2'), in_proj_covar=tensor([0.0045, 0.0046, 0.0045, 0.0043, 0.0045, 0.0040, 0.0075, 0.0050], + device='cuda:2'), out_proj_covar=tensor([5.2540e-05, 5.1161e-05, 5.1424e-05, 4.0628e-05, 4.4723e-05, 4.0166e-05, + 8.8088e-05, 5.7513e-05], device='cuda:2') +2022-12-07 06:00:58,025 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=15.47 vs. limit=5.0 +2022-12-07 06:01:00,382 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6135.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:01:05,271 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.16 vs. limit=2.0 +2022-12-07 06:01:31,111 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6170.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:01:33,844 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-12-07 06:01:53,730 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6196.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:01:58,121 INFO [train.py:873] (2/4) Epoch 1, batch 6200, loss[loss=0.2937, simple_loss=0.2643, pruned_loss=0.1616, over 14277.00 frames. ], tot_loss[loss=0.2708, simple_loss=0.2508, pruned_loss=0.1454, over 1937630.84 frames. ], batch size: 76, lr: 3.96e-02, grad_scale: 16.0 +2022-12-07 06:02:00,046 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6203.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:02:03,439 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.672e+01 3.177e+02 4.421e+02 6.121e+02 1.475e+03, threshold=8.841e+02, percent-clipped=2.0 +2022-12-07 06:02:07,148 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6211.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:02:09,665 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6214.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:02:24,759 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6231.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:02:34,821 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.77 vs. limit=2.0 +2022-12-07 06:02:54,133 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6264.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:02:55,649 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6266.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:03:00,783 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6272.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:03:09,317 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-12-07 06:03:11,588 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7714, 1.1511, 2.3114, 2.0791, 2.1807, 2.2617, 1.4573, 2.4266], + device='cuda:2'), covar=tensor([0.0681, 0.1154, 0.0136, 0.0345, 0.0164, 0.0149, 0.0412, 0.0100], + device='cuda:2'), in_proj_covar=tensor([0.0090, 0.0092, 0.0045, 0.0057, 0.0048, 0.0049, 0.0044, 0.0042], + device='cuda:2'), out_proj_covar=tensor([1.3411e-04, 1.3211e-04, 6.6300e-05, 9.5149e-05, 6.9564e-05, 7.2907e-05, + 7.1489e-05, 6.0612e-05], device='cuda:2') +2022-12-07 06:03:26,922 INFO [train.py:873] (2/4) Epoch 1, batch 6300, loss[loss=0.3308, simple_loss=0.2813, pruned_loss=0.1901, over 8638.00 frames. ], tot_loss[loss=0.2703, simple_loss=0.2502, pruned_loss=0.1452, over 1936004.12 frames. ], batch size: 100, lr: 3.94e-02, grad_scale: 16.0 +2022-12-07 06:03:31,235 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6306.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 06:03:31,914 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.538e+02 3.956e+02 5.182e+02 6.845e+02 1.642e+03, threshold=1.036e+03, percent-clipped=11.0 +2022-12-07 06:03:33,341 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.46 vs. limit=2.0 +2022-12-07 06:03:49,310 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6327.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:04:05,569 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6346.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:04:12,442 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6354.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 06:04:27,094 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-12-07 06:04:51,719 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6399.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:04:53,610 INFO [train.py:873] (2/4) Epoch 1, batch 6400, loss[loss=0.2827, simple_loss=0.2449, pruned_loss=0.1603, over 5997.00 frames. ], tot_loss[loss=0.2693, simple_loss=0.2503, pruned_loss=0.1442, over 1977445.02 frames. ], batch size: 100, lr: 3.92e-02, grad_scale: 8.0 +2022-12-07 06:05:00,167 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.419e+02 3.375e+02 4.095e+02 5.340e+02 1.039e+03, threshold=8.189e+02, percent-clipped=1.0 +2022-12-07 06:05:03,071 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6411.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:05:26,624 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1375, 2.9155, 3.2580, 2.6150, 3.0643, 2.7583, 1.4976, 3.0393], + device='cuda:2'), covar=tensor([0.0248, 0.0386, 0.0348, 0.0417, 0.0315, 0.0726, 0.2865, 0.0275], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0065, 0.0069, 0.0056, 0.0086, 0.0058, 0.0102, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:05:38,250 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.6373, 0.4611, 0.5700, 0.7048, 0.5925, 0.6971, 0.7508, 0.6684], + device='cuda:2'), covar=tensor([0.0031, 0.0295, 0.0058, 0.0061, 0.0060, 0.0068, 0.0043, 0.0157], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0019, 0.0020, 0.0019, 0.0019, 0.0020, 0.0018, 0.0020], + device='cuda:2'), out_proj_covar=tensor([2.1326e-05, 2.3579e-05, 2.7227e-05, 2.1275e-05, 2.3376e-05, 2.2612e-05, + 2.5931e-05, 2.4285e-05], device='cuda:2') +2022-12-07 06:05:44,985 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6459.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:05:47,756 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6462.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:05:52,032 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6467.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:05:57,786 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4094, 1.0502, 1.4542, 1.3698, 1.0543, 1.1754, 1.1807, 1.4690], + device='cuda:2'), covar=tensor([0.2795, 0.1257, 0.0672, 0.0728, 0.0665, 0.0775, 0.0935, 0.0531], + device='cuda:2'), in_proj_covar=tensor([0.0094, 0.0051, 0.0032, 0.0036, 0.0042, 0.0034, 0.0046, 0.0044], + device='cuda:2'), out_proj_covar=tensor([1.4180e-04, 8.2627e-05, 5.9875e-05, 6.2910e-05, 6.5286e-05, 6.0544e-05, + 7.6279e-05, 6.6494e-05], device='cuda:2') +2022-12-07 06:06:12,269 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1475, 2.7477, 3.2467, 2.8134, 2.9660, 2.7950, 1.5383, 2.9633], + device='cuda:2'), covar=tensor([0.0274, 0.0488, 0.0328, 0.0392, 0.0361, 0.0641, 0.2782, 0.0325], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0064, 0.0068, 0.0055, 0.0085, 0.0060, 0.0103, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:06:13,025 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6491.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:06:21,241 INFO [train.py:873] (2/4) Epoch 1, batch 6500, loss[loss=0.2672, simple_loss=0.2576, pruned_loss=0.1384, over 14237.00 frames. ], tot_loss[loss=0.2695, simple_loss=0.2501, pruned_loss=0.1444, over 1975097.24 frames. ], batch size: 39, lr: 3.90e-02, grad_scale: 8.0 +2022-12-07 06:06:27,499 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.203e+01 3.294e+02 4.343e+02 5.711e+02 1.098e+03, threshold=8.686e+02, percent-clipped=4.0 +2022-12-07 06:06:32,696 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6514.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:06:40,879 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6523.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 06:06:43,760 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6526.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:06:45,629 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6528.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:06:55,311 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.22 vs. limit=2.0 +2022-12-07 06:07:03,131 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0841, 3.1360, 3.8137, 2.9513, 4.4067, 3.9271, 3.2509, 3.1830], + device='cuda:2'), covar=tensor([0.0163, 0.1866, 0.0145, 0.0815, 0.0086, 0.0140, 0.1434, 0.1689], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0181, 0.0074, 0.0112, 0.0078, 0.0094, 0.0080, 0.0212], + device='cuda:2'), out_proj_covar=tensor([7.6081e-05, 1.4145e-04, 5.2198e-05, 8.8545e-05, 5.6200e-05, 7.0871e-05, + 7.1791e-05, 1.6131e-04], device='cuda:2') +2022-12-07 06:07:12,824 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6559.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:07:12,988 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8736, 2.9102, 3.6308, 2.6495, 4.4004, 3.7360, 3.5666, 3.3174], + device='cuda:2'), covar=tensor([0.0175, 0.1873, 0.0098, 0.0987, 0.0092, 0.0175, 0.0586, 0.1175], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0178, 0.0073, 0.0111, 0.0076, 0.0093, 0.0078, 0.0209], + device='cuda:2'), out_proj_covar=tensor([7.4894e-05, 1.3911e-04, 5.1719e-05, 8.7318e-05, 5.5408e-05, 7.0073e-05, + 7.0167e-05, 1.5899e-04], device='cuda:2') +2022-12-07 06:07:15,380 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6562.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:07:20,384 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6567.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:07:50,328 INFO [train.py:873] (2/4) Epoch 1, batch 6600, loss[loss=0.2019, simple_loss=0.1727, pruned_loss=0.1156, over 1231.00 frames. ], tot_loss[loss=0.2684, simple_loss=0.2493, pruned_loss=0.1438, over 1953580.81 frames. ], batch size: 100, lr: 3.89e-02, grad_scale: 8.0 +2022-12-07 06:07:56,665 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.510e+02 3.480e+02 4.603e+02 5.913e+02 1.121e+03, threshold=9.206e+02, percent-clipped=7.0 +2022-12-07 06:08:08,519 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6622.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:08:14,763 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6629.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:08:30,306 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6646.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:08:32,093 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6957, 1.2754, 1.8911, 1.5245, 1.9079, 1.4411, 1.5706, 1.8295], + device='cuda:2'), covar=tensor([0.0603, 0.1070, 0.0272, 0.1178, 0.0251, 0.1043, 0.0723, 0.0251], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0259, 0.0135, 0.0355, 0.0107, 0.0284, 0.0170, 0.0123], + device='cuda:2'), out_proj_covar=tensor([1.2575e-04, 2.2530e-04, 1.1231e-04, 2.9663e-04, 9.1567e-05, 2.3427e-04, + 1.3505e-04, 9.9452e-05], device='cuda:2') +2022-12-07 06:09:09,339 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6690.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:09:12,778 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6694.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:09:17,230 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6699.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:09:18,766 INFO [train.py:873] (2/4) Epoch 1, batch 6700, loss[loss=0.2486, simple_loss=0.2417, pruned_loss=0.1277, over 14166.00 frames. ], tot_loss[loss=0.2657, simple_loss=0.2477, pruned_loss=0.1418, over 1951563.83 frames. ], batch size: 35, lr: 3.87e-02, grad_scale: 8.0 +2022-12-07 06:09:24,613 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.051e+02 3.291e+02 4.382e+02 6.181e+02 1.245e+03, threshold=8.764e+02, percent-clipped=7.0 +2022-12-07 06:09:58,731 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6747.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:10:32,093 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0018, 2.0112, 1.8403, 2.0640, 1.7537, 1.8858, 2.0440, 2.1485], + device='cuda:2'), covar=tensor([0.0950, 0.0667, 0.1165, 0.0819, 0.1082, 0.0768, 0.1004, 0.0850], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0063, 0.0088, 0.0081, 0.0085, 0.0058, 0.0079, 0.0082], + device='cuda:2'), out_proj_covar=tensor([1.1808e-04, 9.7972e-05, 1.2898e-04, 1.1950e-04, 1.2375e-04, 8.3990e-05, + 1.2187e-04, 1.2351e-04], device='cuda:2') +2022-12-07 06:10:34,219 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.08 vs. limit=5.0 +2022-12-07 06:10:37,370 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6791.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:10:46,418 INFO [train.py:873] (2/4) Epoch 1, batch 6800, loss[loss=0.2813, simple_loss=0.259, pruned_loss=0.1518, over 14593.00 frames. ], tot_loss[loss=0.2657, simple_loss=0.2475, pruned_loss=0.1419, over 1943696.33 frames. ], batch size: 23, lr: 3.85e-02, grad_scale: 8.0 +2022-12-07 06:10:52,797 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.483e+01 3.173e+02 4.494e+02 5.972e+02 9.591e+02, threshold=8.989e+02, percent-clipped=5.0 +2022-12-07 06:11:01,545 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6818.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 06:11:05,724 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6823.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:11:08,617 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6826.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:11:19,528 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6839.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:11:29,962 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.13 vs. limit=2.0 +2022-12-07 06:11:37,266 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6859.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:11:44,214 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6867.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:11:49,841 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6874.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:11:53,631 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6878.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 06:12:13,453 INFO [train.py:873] (2/4) Epoch 1, batch 6900, loss[loss=0.2312, simple_loss=0.1923, pruned_loss=0.1351, over 1218.00 frames. ], tot_loss[loss=0.2661, simple_loss=0.2477, pruned_loss=0.1423, over 1981019.54 frames. ], batch size: 100, lr: 3.83e-02, grad_scale: 8.0 +2022-12-07 06:12:15,879 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.79 vs. limit=2.0 +2022-12-07 06:12:18,842 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6907.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:12:19,586 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.182e+02 3.106e+02 4.451e+02 6.203e+02 1.044e+03, threshold=8.902e+02, percent-clipped=8.0 +2022-12-07 06:12:20,709 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=6909.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:12:24,075 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2920, 1.8213, 4.1299, 3.8578, 3.9155, 4.0830, 3.0942, 4.2879], + device='cuda:2'), covar=tensor([0.1467, 0.1732, 0.0142, 0.0131, 0.0110, 0.0127, 0.0280, 0.0091], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0101, 0.0047, 0.0062, 0.0050, 0.0053, 0.0047, 0.0046], + device='cuda:2'), out_proj_covar=tensor([1.5027e-04, 1.5021e-04, 7.4332e-05, 1.0779e-04, 7.6482e-05, 8.5191e-05, + 8.1628e-05, 6.9202e-05], device='cuda:2') +2022-12-07 06:12:24,409 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0 +2022-12-07 06:12:25,630 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6915.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:12:31,977 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=6922.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:12:46,798 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6939.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 06:13:13,995 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=6970.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:13:14,093 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=6970.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:13:16,553 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7995, 1.5582, 2.8573, 1.7010, 2.7860, 2.7472, 2.0312, 2.6858], + device='cuda:2'), covar=tensor([0.0167, 0.2165, 0.0276, 0.1707, 0.0243, 0.0247, 0.0743, 0.0307], + device='cuda:2'), in_proj_covar=tensor([0.0056, 0.0100, 0.0055, 0.0100, 0.0073, 0.0057, 0.0058, 0.0058], + device='cuda:2'), out_proj_covar=tensor([9.5549e-05, 1.5991e-04, 1.0557e-04, 1.5983e-04, 1.2964e-04, 1.0077e-04, + 1.0574e-04, 1.0246e-04], device='cuda:2') +2022-12-07 06:13:27,145 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=6985.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:13:41,156 INFO [train.py:873] (2/4) Epoch 1, batch 7000, loss[loss=0.2671, simple_loss=0.2253, pruned_loss=0.1544, over 3876.00 frames. ], tot_loss[loss=0.2656, simple_loss=0.2474, pruned_loss=0.1419, over 1996117.47 frames. ], batch size: 100, lr: 3.81e-02, grad_scale: 8.0 +2022-12-07 06:13:47,947 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.205e+02 2.871e+02 4.067e+02 4.755e+02 1.195e+03, threshold=8.134e+02, percent-clipped=1.0 +2022-12-07 06:13:51,215 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.16 vs. limit=2.0 +2022-12-07 06:14:21,116 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.18 vs. limit=2.0 +2022-12-07 06:14:23,439 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7048.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 06:14:31,740 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3073, 3.8240, 4.3781, 3.6853, 4.2856, 4.3859, 2.0590, 4.2994], + device='cuda:2'), covar=tensor([0.0236, 0.0400, 0.0436, 0.0434, 0.0310, 0.0153, 0.3165, 0.0253], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0064, 0.0070, 0.0059, 0.0089, 0.0060, 0.0108, 0.0082], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:15:09,089 INFO [train.py:873] (2/4) Epoch 1, batch 7100, loss[loss=0.2759, simple_loss=0.2532, pruned_loss=0.1493, over 14469.00 frames. ], tot_loss[loss=0.2632, simple_loss=0.2458, pruned_loss=0.1403, over 2001688.92 frames. ], batch size: 51, lr: 3.79e-02, grad_scale: 8.0 +2022-12-07 06:15:11,811 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8351, 1.8667, 2.1997, 2.3541, 1.9655, 1.5801, 2.1160, 2.0117], + device='cuda:2'), covar=tensor([0.0185, 0.0272, 0.0136, 0.0179, 0.0204, 0.0595, 0.0075, 0.0409], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0076, 0.0052, 0.0073, 0.0076, 0.0119, 0.0048, 0.0117], + device='cuda:2'), out_proj_covar=tensor([6.4880e-05, 7.7807e-05, 5.1775e-05, 7.6093e-05, 7.5859e-05, 1.1816e-04, + 4.4442e-05, 1.1524e-04], device='cuda:2') +2022-12-07 06:15:15,338 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.451e+02 3.113e+02 4.115e+02 5.668e+02 1.201e+03, threshold=8.229e+02, percent-clipped=4.0 +2022-12-07 06:15:16,436 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7109.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 06:15:23,836 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7118.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 06:15:28,134 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7123.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:15:45,339 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.26 vs. limit=5.0 +2022-12-07 06:16:03,247 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7163.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:16:05,593 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7166.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:16:09,762 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7171.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:16:35,874 INFO [train.py:873] (2/4) Epoch 1, batch 7200, loss[loss=0.2461, simple_loss=0.2314, pruned_loss=0.1304, over 4984.00 frames. ], tot_loss[loss=0.2618, simple_loss=0.245, pruned_loss=0.1393, over 1996455.42 frames. ], batch size: 100, lr: 3.78e-02, grad_scale: 8.0 +2022-12-07 06:16:42,299 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.199e+02 3.143e+02 4.183e+02 5.672e+02 1.523e+03, threshold=8.365e+02, percent-clipped=9.0 +2022-12-07 06:16:56,279 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7224.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 06:16:57,864 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2200, 3.7185, 3.7746, 4.2314, 3.9541, 3.2166, 4.3315, 4.2153], + device='cuda:2'), covar=tensor([0.0766, 0.0680, 0.0758, 0.0650, 0.0781, 0.0596, 0.0709, 0.0898], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0064, 0.0084, 0.0075, 0.0082, 0.0054, 0.0075, 0.0079], + device='cuda:2'), out_proj_covar=tensor([1.1368e-04, 1.0189e-04, 1.2539e-04, 1.1438e-04, 1.2152e-04, 8.0118e-05, + 1.1729e-04, 1.2095e-04], device='cuda:2') +2022-12-07 06:17:04,780 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7234.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 06:17:32,281 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7265.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:17:41,190 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7275.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:17:50,912 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7285.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:18:04,545 INFO [train.py:873] (2/4) Epoch 1, batch 7300, loss[loss=0.2827, simple_loss=0.2573, pruned_loss=0.1541, over 14292.00 frames. ], tot_loss[loss=0.2628, simple_loss=0.2458, pruned_loss=0.1399, over 2021394.88 frames. ], batch size: 76, lr: 3.76e-02, grad_scale: 8.0 +2022-12-07 06:18:10,448 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.504e+02 3.489e+02 4.612e+02 5.728e+02 1.039e+03, threshold=9.225e+02, percent-clipped=2.0 +2022-12-07 06:18:32,905 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7333.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:18:35,735 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7336.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:18:40,152 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8051, 3.5907, 3.9012, 3.3596, 3.4185, 2.4074, 3.0926, 2.8268], + device='cuda:2'), covar=tensor([0.0110, 0.0161, 0.0140, 0.0253, 0.0194, 0.0796, 0.0086, 0.0632], + device='cuda:2'), in_proj_covar=tensor([0.0067, 0.0078, 0.0055, 0.0074, 0.0078, 0.0119, 0.0049, 0.0121], + device='cuda:2'), out_proj_covar=tensor([6.5347e-05, 7.8971e-05, 5.7895e-05, 7.8807e-05, 8.1373e-05, 1.2010e-04, + 4.5680e-05, 1.2097e-04], device='cuda:2') +2022-12-07 06:19:25,273 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6293, 2.5102, 2.3752, 2.7652, 2.3988, 2.2731, 2.7272, 2.7837], + device='cuda:2'), covar=tensor([0.0927, 0.0855, 0.1173, 0.0813, 0.1006, 0.0841, 0.0971, 0.0837], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0065, 0.0088, 0.0082, 0.0086, 0.0056, 0.0080, 0.0084], + device='cuda:2'), out_proj_covar=tensor([1.2160e-04, 1.0215e-04, 1.3207e-04, 1.2439e-04, 1.2819e-04, 8.4333e-05, + 1.2573e-04, 1.2973e-04], device='cuda:2') +2022-12-07 06:19:33,641 INFO [train.py:873] (2/4) Epoch 1, batch 7400, loss[loss=0.2574, simple_loss=0.2448, pruned_loss=0.1349, over 14209.00 frames. ], tot_loss[loss=0.2627, simple_loss=0.2456, pruned_loss=0.1399, over 2006913.02 frames. ], batch size: 84, lr: 3.74e-02, grad_scale: 8.0 +2022-12-07 06:19:36,770 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7404.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 06:19:39,365 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7407.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:19:39,943 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.745e+02 3.232e+02 4.380e+02 5.803e+02 1.577e+03, threshold=8.760e+02, percent-clipped=3.0 +2022-12-07 06:19:49,131 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-12-07 06:19:58,748 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1698, 0.6719, 0.8190, 0.8402, 1.2029, 0.5767, 1.2381, 1.1211], + device='cuda:2'), covar=tensor([0.0902, 0.1230, 0.0740, 0.1104, 0.1456, 0.0513, 0.0538, 0.0320], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0025, 0.0026, 0.0022, 0.0022, 0.0027, 0.0020, 0.0022], + device='cuda:2'), out_proj_covar=tensor([3.3080e-05, 4.0174e-05, 3.8856e-05, 3.6327e-05, 3.2518e-05, 4.3127e-05, + 3.1264e-05, 3.2331e-05], device='cuda:2') +2022-12-07 06:20:32,153 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7468.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:21:00,784 INFO [train.py:873] (2/4) Epoch 1, batch 7500, loss[loss=0.2612, simple_loss=0.2437, pruned_loss=0.1394, over 14020.00 frames. ], tot_loss[loss=0.2598, simple_loss=0.2441, pruned_loss=0.1378, over 2095902.86 frames. ], batch size: 26, lr: 3.72e-02, grad_scale: 8.0 +2022-12-07 06:21:06,494 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.457e+02 3.575e+02 4.286e+02 5.078e+02 8.997e+02, threshold=8.573e+02, percent-clipped=1.0 +2022-12-07 06:21:16,624 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7519.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 06:21:23,625 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0176, 3.7756, 3.6813, 3.7858, 3.9552, 3.8678, 4.0076, 3.9798], + device='cuda:2'), covar=tensor([0.0418, 0.0570, 0.0671, 0.0445, 0.0315, 0.0357, 0.0549, 0.0637], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0094, 0.0118, 0.0082, 0.0097, 0.0099, 0.0112, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:21:29,249 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7534.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 06:21:32,888 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-12-07 06:21:34,452 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.49 vs. limit=5.0 +2022-12-07 06:22:29,584 INFO [train.py:873] (2/4) Epoch 2, batch 0, loss[loss=0.3432, simple_loss=0.315, pruned_loss=0.1857, over 13501.00 frames. ], tot_loss[loss=0.3432, simple_loss=0.315, pruned_loss=0.1857, over 13501.00 frames. ], batch size: 17, lr: 3.64e-02, grad_scale: 8.0 +2022-12-07 06:22:29,584 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 06:22:36,877 INFO [train.py:905] (2/4) Epoch 2, validation: loss=0.201, simple_loss=0.225, pruned_loss=0.08852, over 857387.00 frames. +2022-12-07 06:22:36,878 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 06:22:38,727 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7565.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:22:54,116 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7582.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 06:23:16,974 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 3.396e+01 2.991e+02 4.594e+02 6.425e+02 1.765e+03, threshold=9.187e+02, percent-clipped=13.0 +2022-12-07 06:23:17,619 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-12-07 06:23:21,534 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7613.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:23:31,713 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7624.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:23:37,400 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7631.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:24:05,652 INFO [train.py:873] (2/4) Epoch 2, batch 100, loss[loss=0.2581, simple_loss=0.2303, pruned_loss=0.1429, over 3904.00 frames. ], tot_loss[loss=0.2671, simple_loss=0.2491, pruned_loss=0.1426, over 863216.46 frames. ], batch size: 100, lr: 3.62e-02, grad_scale: 8.0 +2022-12-07 06:24:24,607 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7685.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:24:31,677 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2188, 3.0239, 2.9448, 2.7803, 3.1510, 3.0718, 3.1322, 3.1042], + device='cuda:2'), covar=tensor([0.0472, 0.0580, 0.0646, 0.0773, 0.0381, 0.0436, 0.0796, 0.0844], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0097, 0.0120, 0.0082, 0.0099, 0.0103, 0.0116, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:24:41,656 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7704.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 06:24:44,979 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.630e+02 3.684e+02 4.699e+02 5.885e+02 1.470e+03, threshold=9.397e+02, percent-clipped=3.0 +2022-12-07 06:25:16,196 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7743.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:25:23,876 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7752.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 06:25:25,837 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.61 vs. limit=5.0 +2022-12-07 06:25:32,820 INFO [train.py:873] (2/4) Epoch 2, batch 200, loss[loss=0.1676, simple_loss=0.1474, pruned_loss=0.09388, over 2704.00 frames. ], tot_loss[loss=0.2584, simple_loss=0.2426, pruned_loss=0.1372, over 1256095.53 frames. ], batch size: 100, lr: 3.61e-02, grad_scale: 8.0 +2022-12-07 06:25:33,250 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7763.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:25:53,850 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.84 vs. limit=5.0 +2022-12-07 06:26:07,037 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=7802.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:26:08,735 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7804.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:26:12,301 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.334e+02 3.123e+02 4.049e+02 5.757e+02 1.056e+03, threshold=8.099e+02, percent-clipped=3.0 +2022-12-07 06:26:18,133 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.63 vs. limit=5.0 +2022-12-07 06:26:22,337 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7819.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 06:26:38,174 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4601, 1.7232, 3.4428, 2.4493, 3.1833, 1.9366, 3.3181, 3.1779], + device='cuda:2'), covar=tensor([0.0458, 0.5871, 0.0341, 0.7514, 0.0182, 0.3894, 0.0664, 0.0210], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0264, 0.0138, 0.0359, 0.0112, 0.0280, 0.0184, 0.0122], + device='cuda:2'), out_proj_covar=tensor([1.3993e-04, 2.4127e-04, 1.1977e-04, 3.0868e-04, 9.7619e-05, 2.4029e-04, + 1.5687e-04, 1.0451e-04], device='cuda:2') +2022-12-07 06:26:59,713 INFO [train.py:873] (2/4) Epoch 2, batch 300, loss[loss=0.2645, simple_loss=0.2158, pruned_loss=0.1566, over 1193.00 frames. ], tot_loss[loss=0.2575, simple_loss=0.2418, pruned_loss=0.1366, over 1506913.95 frames. ], batch size: 100, lr: 3.59e-02, grad_scale: 8.0 +2022-12-07 06:26:59,913 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=7863.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:27:03,184 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7867.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:27:38,764 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.219e+02 3.418e+02 4.416e+02 5.390e+02 1.169e+03, threshold=8.833e+02, percent-clipped=6.0 +2022-12-07 06:27:58,937 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=7931.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:28:26,417 INFO [train.py:873] (2/4) Epoch 2, batch 400, loss[loss=0.2323, simple_loss=0.213, pruned_loss=0.1258, over 3876.00 frames. ], tot_loss[loss=0.2575, simple_loss=0.2426, pruned_loss=0.1362, over 1772063.40 frames. ], batch size: 100, lr: 3.58e-02, grad_scale: 8.0 +2022-12-07 06:28:40,952 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=7979.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:28:41,834 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=7980.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:29:05,269 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.294e+01 3.140e+02 4.082e+02 5.332e+02 1.723e+03, threshold=8.164e+02, percent-clipped=4.0 +2022-12-07 06:29:07,096 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8956, 1.8047, 2.8100, 1.7766, 2.7942, 2.6502, 2.0923, 3.0106], + device='cuda:2'), covar=tensor([0.0189, 0.1888, 0.0355, 0.1772, 0.0247, 0.0344, 0.0739, 0.0214], + device='cuda:2'), in_proj_covar=tensor([0.0068, 0.0108, 0.0063, 0.0114, 0.0084, 0.0069, 0.0065, 0.0065], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 06:29:28,052 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.87 vs. limit=2.0 +2022-12-07 06:29:53,231 INFO [train.py:873] (2/4) Epoch 2, batch 500, loss[loss=0.2311, simple_loss=0.2033, pruned_loss=0.1295, over 3895.00 frames. ], tot_loss[loss=0.2593, simple_loss=0.2436, pruned_loss=0.1375, over 1854701.80 frames. ], batch size: 100, lr: 3.56e-02, grad_scale: 8.0 +2022-12-07 06:29:53,386 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8063.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:29:59,788 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=15.91 vs. limit=5.0 +2022-12-07 06:30:25,008 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=8099.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:30:32,478 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.590e+01 3.381e+02 4.581e+02 6.129e+02 1.327e+03, threshold=9.162e+02, percent-clipped=13.0 +2022-12-07 06:30:35,073 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8111.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:30:42,620 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-07 06:30:44,330 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-12-07 06:31:15,325 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=8158.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:31:19,467 INFO [train.py:873] (2/4) Epoch 2, batch 600, loss[loss=0.2373, simple_loss=0.2256, pruned_loss=0.1245, over 11179.00 frames. ], tot_loss[loss=0.2586, simple_loss=0.2428, pruned_loss=0.1372, over 1867798.44 frames. ], batch size: 100, lr: 3.54e-02, grad_scale: 8.0 +2022-12-07 06:31:27,499 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8373, 3.8801, 2.8540, 4.1015, 3.5373, 4.0592, 3.8513, 2.6635], + device='cuda:2'), covar=tensor([0.0149, 0.0243, 0.2016, 0.0140, 0.0213, 0.0190, 0.0299, 0.2623], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0148, 0.0243, 0.0115, 0.0105, 0.0114, 0.0146, 0.0272], + device='cuda:2'), out_proj_covar=tensor([8.3281e-05, 1.0120e-04, 1.5886e-04, 7.4491e-05, 7.3905e-05, 7.7013e-05, + 1.0166e-04, 1.7384e-04], device='cuda:2') +2022-12-07 06:31:32,334 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.6599, 5.0091, 5.0554, 5.5205, 5.4542, 4.9480, 5.5399, 4.9515], + device='cuda:2'), covar=tensor([0.0212, 0.0573, 0.0239, 0.0288, 0.0353, 0.0300, 0.0405, 0.0328], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0100, 0.0071, 0.0067, 0.0072, 0.0079, 0.0097, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:31:59,046 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.447e+02 3.066e+02 3.782e+02 5.192e+02 1.492e+03, threshold=7.564e+02, percent-clipped=6.0 +2022-12-07 06:32:47,259 INFO [train.py:873] (2/4) Epoch 2, batch 700, loss[loss=0.2396, simple_loss=0.2217, pruned_loss=0.1287, over 5946.00 frames. ], tot_loss[loss=0.2569, simple_loss=0.2418, pruned_loss=0.136, over 1893167.42 frames. ], batch size: 100, lr: 3.53e-02, grad_scale: 8.0 +2022-12-07 06:32:49,058 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=8265.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:33:01,863 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8280.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:33:26,031 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.287e+02 3.101e+02 3.944e+02 5.331e+02 1.080e+03, threshold=7.888e+02, percent-clipped=5.0 +2022-12-07 06:33:41,610 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=8326.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:33:43,091 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8328.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:33:46,974 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9264, 2.0068, 4.5662, 3.8508, 4.0801, 4.4262, 3.3884, 4.5225], + device='cuda:2'), covar=tensor([0.1872, 0.1772, 0.0132, 0.0146, 0.0115, 0.0111, 0.0235, 0.0161], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0111, 0.0053, 0.0071, 0.0060, 0.0061, 0.0053, 0.0052], + device='cuda:2'), out_proj_covar=tensor([1.8268e-04, 1.8121e-04, 9.2835e-05, 1.3754e-04, 1.0303e-04, 1.0755e-04, + 1.0182e-04, 9.0226e-05], device='cuda:2') +2022-12-07 06:33:58,677 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6722, 4.1801, 4.1669, 4.6798, 4.5085, 4.3528, 4.6395, 4.0559], + device='cuda:2'), covar=tensor([0.0331, 0.0943, 0.0340, 0.0394, 0.0540, 0.0490, 0.0534, 0.0512], + device='cuda:2'), in_proj_covar=tensor([0.0062, 0.0107, 0.0074, 0.0070, 0.0074, 0.0082, 0.0102, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:34:13,679 INFO [train.py:873] (2/4) Epoch 2, batch 800, loss[loss=0.2398, simple_loss=0.2429, pruned_loss=0.1184, over 14098.00 frames. ], tot_loss[loss=0.2552, simple_loss=0.2408, pruned_loss=0.1348, over 1952408.15 frames. ], batch size: 29, lr: 3.51e-02, grad_scale: 16.0 +2022-12-07 06:34:22,402 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8813, 4.3803, 5.0559, 3.8145, 4.8031, 5.0661, 2.6565, 4.7833], + device='cuda:2'), covar=tensor([0.0176, 0.0220, 0.0349, 0.0334, 0.0262, 0.0080, 0.2387, 0.0163], + device='cuda:2'), in_proj_covar=tensor([0.0079, 0.0074, 0.0079, 0.0063, 0.0102, 0.0067, 0.0120, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 06:34:45,189 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8399.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:34:53,071 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.100e+02 2.975e+02 4.226e+02 5.564e+02 1.451e+03, threshold=8.453e+02, percent-clipped=7.0 +2022-12-07 06:35:26,089 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8447.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:35:36,580 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8458.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:35:40,663 INFO [train.py:873] (2/4) Epoch 2, batch 900, loss[loss=0.2849, simple_loss=0.2533, pruned_loss=0.1582, over 7794.00 frames. ], tot_loss[loss=0.2547, simple_loss=0.2405, pruned_loss=0.1344, over 1969542.02 frames. ], batch size: 100, lr: 3.50e-02, grad_scale: 16.0 +2022-12-07 06:36:03,776 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7447, 1.6675, 1.5480, 2.0426, 1.9917, 1.9320, 1.6043, 1.5696], + device='cuda:2'), covar=tensor([0.0256, 0.0359, 0.0409, 0.0108, 0.0140, 0.0135, 0.0342, 0.0432], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0154, 0.0251, 0.0116, 0.0109, 0.0116, 0.0146, 0.0280], + device='cuda:2'), out_proj_covar=tensor([8.8960e-05, 1.0489e-04, 1.6455e-04, 7.6834e-05, 7.9011e-05, 8.1464e-05, + 1.0166e-04, 1.7902e-04], device='cuda:2') +2022-12-07 06:36:06,537 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9199, 4.4287, 4.4659, 4.8872, 4.8471, 4.5637, 4.8494, 4.4652], + device='cuda:2'), covar=tensor([0.0222, 0.0962, 0.0317, 0.0371, 0.0406, 0.0362, 0.0520, 0.0400], + device='cuda:2'), in_proj_covar=tensor([0.0061, 0.0109, 0.0075, 0.0069, 0.0078, 0.0081, 0.0103, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:36:17,477 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8506.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:36:19,007 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.248e+02 3.358e+02 4.277e+02 5.886e+02 1.045e+03, threshold=8.554e+02, percent-clipped=3.0 +2022-12-07 06:36:26,690 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6305, 4.2886, 4.1389, 4.4398, 4.3476, 4.4630, 4.5202, 4.6435], + device='cuda:2'), covar=tensor([0.0458, 0.0425, 0.0532, 0.0306, 0.0319, 0.0268, 0.0718, 0.0467], + device='cuda:2'), in_proj_covar=tensor([0.0108, 0.0101, 0.0120, 0.0096, 0.0110, 0.0112, 0.0127, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:36:29,691 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.46 vs. limit=5.0 +2022-12-07 06:37:06,427 INFO [train.py:873] (2/4) Epoch 2, batch 1000, loss[loss=0.2528, simple_loss=0.2384, pruned_loss=0.1337, over 14286.00 frames. ], tot_loss[loss=0.2545, simple_loss=0.2404, pruned_loss=0.1343, over 1993622.67 frames. ], batch size: 39, lr: 3.48e-02, grad_scale: 16.0 +2022-12-07 06:37:45,735 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7119, 1.4199, 1.2160, 0.9715, 1.4146, 0.9120, 1.4383, 1.4428], + device='cuda:2'), covar=tensor([0.0225, 0.1614, 0.0510, 0.1232, 0.0660, 0.0862, 0.0490, 0.0285], + device='cuda:2'), in_proj_covar=tensor([0.0043, 0.0118, 0.0055, 0.0083, 0.0054, 0.0052, 0.0051, 0.0048], + device='cuda:2'), out_proj_covar=tensor([7.3013e-05, 1.8495e-04, 9.3805e-05, 1.3649e-04, 9.8843e-05, 9.5351e-05, + 9.6946e-05, 8.5258e-05], device='cuda:2') +2022-12-07 06:37:46,391 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.423e+02 3.331e+02 4.272e+02 5.638e+02 1.026e+03, threshold=8.545e+02, percent-clipped=1.0 +2022-12-07 06:37:57,595 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=8621.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:38:34,123 INFO [train.py:873] (2/4) Epoch 2, batch 1100, loss[loss=0.2581, simple_loss=0.2369, pruned_loss=0.1397, over 6006.00 frames. ], tot_loss[loss=0.2535, simple_loss=0.2394, pruned_loss=0.1338, over 2033988.63 frames. ], batch size: 100, lr: 3.47e-02, grad_scale: 8.0 +2022-12-07 06:39:13,894 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.477e+02 3.265e+02 4.282e+02 5.954e+02 1.002e+03, threshold=8.564e+02, percent-clipped=5.0 +2022-12-07 06:39:26,076 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-12-07 06:39:34,688 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-12-07 06:39:58,162 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=8760.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:40:00,478 INFO [train.py:873] (2/4) Epoch 2, batch 1200, loss[loss=0.239, simple_loss=0.2002, pruned_loss=0.1389, over 2627.00 frames. ], tot_loss[loss=0.252, simple_loss=0.2386, pruned_loss=0.1327, over 2005308.48 frames. ], batch size: 100, lr: 3.45e-02, grad_scale: 8.0 +2022-12-07 06:40:39,935 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.46 vs. limit=5.0 +2022-12-07 06:40:40,234 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.965e+02 3.180e+02 4.401e+02 5.751e+02 1.287e+03, threshold=8.803e+02, percent-clipped=4.0 +2022-12-07 06:40:51,294 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=8821.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:41:27,155 INFO [train.py:873] (2/4) Epoch 2, batch 1300, loss[loss=0.2182, simple_loss=0.1853, pruned_loss=0.1255, over 1323.00 frames. ], tot_loss[loss=0.252, simple_loss=0.2385, pruned_loss=0.1327, over 1970278.81 frames. ], batch size: 100, lr: 3.44e-02, grad_scale: 8.0 +2022-12-07 06:41:36,359 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([6.1336, 5.6736, 5.4128, 6.0847, 5.9456, 5.1643, 6.0509, 5.4182], + device='cuda:2'), covar=tensor([0.0210, 0.0603, 0.0241, 0.0258, 0.0386, 0.0262, 0.0317, 0.0348], + device='cuda:2'), in_proj_covar=tensor([0.0062, 0.0111, 0.0076, 0.0073, 0.0082, 0.0084, 0.0111, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 06:41:36,385 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.5086, 5.0058, 4.9059, 5.2713, 4.9751, 5.2414, 5.4062, 5.4154], + device='cuda:2'), covar=tensor([0.0264, 0.0445, 0.0505, 0.0264, 0.0286, 0.0213, 0.0396, 0.0451], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0100, 0.0125, 0.0098, 0.0110, 0.0113, 0.0123, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:42:02,228 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.15 vs. limit=2.0 +2022-12-07 06:42:06,756 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.360e+02 2.973e+02 3.684e+02 4.915e+02 1.042e+03, threshold=7.368e+02, percent-clipped=2.0 +2022-12-07 06:42:17,241 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=8921.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:42:22,290 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=8927.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:42:28,521 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-12-07 06:42:36,148 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7595, 1.7564, 2.4930, 2.3502, 1.9356, 1.5307, 2.2492, 2.0004], + device='cuda:2'), covar=tensor([0.0107, 0.0160, 0.0061, 0.0098, 0.0116, 0.0322, 0.0063, 0.0188], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0096, 0.0068, 0.0093, 0.0095, 0.0146, 0.0059, 0.0143], + device='cuda:2'), out_proj_covar=tensor([9.0162e-05, 1.1131e-04, 8.0153e-05, 1.1045e-04, 1.1028e-04, 1.6956e-04, + 6.5158e-05, 1.5767e-04], device='cuda:2') +2022-12-07 06:42:53,349 INFO [train.py:873] (2/4) Epoch 2, batch 1400, loss[loss=0.2373, simple_loss=0.2182, pruned_loss=0.1282, over 5015.00 frames. ], tot_loss[loss=0.2519, simple_loss=0.2385, pruned_loss=0.1327, over 1937628.46 frames. ], batch size: 100, lr: 3.42e-02, grad_scale: 8.0 +2022-12-07 06:42:58,580 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=8969.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:43:13,859 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=8986.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:43:15,586 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=8988.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:43:19,766 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5931, 3.3940, 3.3514, 3.2237, 3.4980, 3.4520, 3.5616, 3.5184], + device='cuda:2'), covar=tensor([0.0491, 0.0486, 0.0587, 0.0789, 0.0343, 0.0353, 0.0587, 0.0615], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0104, 0.0125, 0.0102, 0.0112, 0.0113, 0.0125, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:43:33,021 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.228e+02 3.417e+02 4.711e+02 5.630e+02 1.118e+03, threshold=9.423e+02, percent-clipped=7.0 +2022-12-07 06:43:36,707 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9013.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:44:06,625 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9047.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:44:20,923 INFO [train.py:873] (2/4) Epoch 2, batch 1500, loss[loss=0.2326, simple_loss=0.2008, pruned_loss=0.1322, over 2650.00 frames. ], tot_loss[loss=0.2521, simple_loss=0.2387, pruned_loss=0.1327, over 1965891.47 frames. ], batch size: 100, lr: 3.41e-02, grad_scale: 8.0 +2022-12-07 06:44:30,407 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9074.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:44:38,233 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9720, 1.8421, 2.5048, 2.4473, 1.7295, 1.8035, 2.1308, 2.1983], + device='cuda:2'), covar=tensor([0.0144, 0.0235, 0.0088, 0.0123, 0.0192, 0.0459, 0.0093, 0.0269], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0099, 0.0074, 0.0098, 0.0096, 0.0150, 0.0061, 0.0145], + device='cuda:2'), out_proj_covar=tensor([9.1614e-05, 1.1649e-04, 8.7838e-05, 1.1700e-04, 1.1250e-04, 1.7524e-04, + 6.6636e-05, 1.6084e-04], device='cuda:2') +2022-12-07 06:44:51,232 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8105, 1.3538, 2.2715, 1.7909, 2.2479, 1.4677, 1.7954, 2.2370], + device='cuda:2'), covar=tensor([0.0561, 0.2473, 0.0226, 0.1957, 0.0178, 0.1617, 0.0741, 0.0220], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0289, 0.0150, 0.0394, 0.0115, 0.0301, 0.0197, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0003, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:44:59,854 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.468e+02 3.213e+02 4.122e+02 5.427e+02 9.769e+02, threshold=8.244e+02, percent-clipped=1.0 +2022-12-07 06:45:05,739 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9116.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:45:46,184 INFO [train.py:873] (2/4) Epoch 2, batch 1600, loss[loss=0.307, simple_loss=0.2732, pruned_loss=0.1704, over 9513.00 frames. ], tot_loss[loss=0.2518, simple_loss=0.2386, pruned_loss=0.1325, over 2003576.48 frames. ], batch size: 100, lr: 3.39e-02, grad_scale: 8.0 +2022-12-07 06:46:17,628 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2745, 2.0184, 4.1406, 3.1678, 4.0297, 2.1139, 3.9990, 4.0435], + device='cuda:2'), covar=tensor([0.0129, 0.4585, 0.0158, 0.6316, 0.0092, 0.3242, 0.0306, 0.0118], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0282, 0.0149, 0.0387, 0.0116, 0.0302, 0.0201, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0003, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:46:26,065 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.627e+02 3.267e+02 4.651e+02 6.782e+02 1.677e+03, threshold=9.302e+02, percent-clipped=10.0 +2022-12-07 06:46:35,142 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5499, 1.2504, 1.0390, 1.1511, 1.2549, 1.2720, 1.2474, 1.0762], + device='cuda:2'), covar=tensor([0.3056, 0.0987, 0.1346, 0.0923, 0.0698, 0.0605, 0.0888, 0.0695], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0051, 0.0041, 0.0040, 0.0052, 0.0041, 0.0050, 0.0054], + device='cuda:2'), out_proj_covar=tensor([2.2171e-04, 1.1090e-04, 9.6372e-05, 8.8584e-05, 1.0348e-04, 8.9497e-05, + 1.0981e-04, 1.0624e-04], device='cuda:2') +2022-12-07 06:46:43,983 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3090, 2.0587, 3.4580, 2.5488, 2.3305, 2.1028, 0.9539, 2.4775], + device='cuda:2'), covar=tensor([0.1226, 0.0556, 0.0387, 0.0451, 0.0422, 0.0858, 0.2024, 0.0762], + device='cuda:2'), in_proj_covar=tensor([0.0051, 0.0055, 0.0050, 0.0050, 0.0054, 0.0049, 0.0096, 0.0058], + device='cuda:2'), out_proj_covar=tensor([7.2387e-05, 7.4163e-05, 6.9248e-05, 6.4981e-05, 6.8552e-05, 6.5087e-05, + 1.3017e-04, 7.9511e-05], device='cuda:2') +2022-12-07 06:46:54,051 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 06:47:13,308 INFO [train.py:873] (2/4) Epoch 2, batch 1700, loss[loss=0.2503, simple_loss=0.2452, pruned_loss=0.1277, over 14414.00 frames. ], tot_loss[loss=0.2511, simple_loss=0.2384, pruned_loss=0.1319, over 2014958.35 frames. ], batch size: 41, lr: 3.38e-02, grad_scale: 8.0 +2022-12-07 06:47:17,822 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6803, 1.9612, 3.7214, 2.9388, 3.4842, 1.8286, 3.0679, 3.4132], + device='cuda:2'), covar=tensor([0.0185, 0.3080, 0.0157, 0.4216, 0.0083, 0.2820, 0.0486, 0.0115], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0275, 0.0147, 0.0383, 0.0114, 0.0297, 0.0201, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0003, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 06:47:18,926 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 06:47:30,402 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9283.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:47:53,364 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.693e+02 2.995e+02 4.223e+02 5.322e+02 8.026e+02, threshold=8.446e+02, percent-clipped=0.0 +2022-12-07 06:48:01,981 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7005, 0.8490, 0.4018, 0.6850, 0.9964, 0.4165, 0.8676, 0.9220], + device='cuda:2'), covar=tensor([0.0221, 0.0132, 0.0078, 0.0300, 0.0142, 0.0099, 0.0122, 0.0104], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0023, 0.0024, 0.0022, 0.0021, 0.0023, 0.0019, 0.0020], + device='cuda:2'), out_proj_covar=tensor([4.1574e-05, 4.6685e-05, 4.3043e-05, 4.2881e-05, 3.6512e-05, 4.2627e-05, + 3.7354e-05, 3.5943e-05], device='cuda:2') +2022-12-07 06:48:21,582 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9342.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:48:39,297 INFO [train.py:873] (2/4) Epoch 2, batch 1800, loss[loss=0.2408, simple_loss=0.2235, pruned_loss=0.129, over 4991.00 frames. ], tot_loss[loss=0.2488, simple_loss=0.2368, pruned_loss=0.1304, over 1932624.56 frames. ], batch size: 100, lr: 3.37e-02, grad_scale: 8.0 +2022-12-07 06:48:44,773 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9369.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:48:59,815 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.06 vs. limit=2.0 +2022-12-07 06:49:19,722 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.290e+02 2.949e+02 3.801e+02 5.110e+02 1.024e+03, threshold=7.602e+02, percent-clipped=3.0 +2022-12-07 06:49:24,135 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9414.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:49:25,774 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=9416.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:49:34,661 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.38 vs. limit=2.0 +2022-12-07 06:50:07,246 INFO [train.py:873] (2/4) Epoch 2, batch 1900, loss[loss=0.2245, simple_loss=0.2216, pruned_loss=0.1137, over 14257.00 frames. ], tot_loss[loss=0.2475, simple_loss=0.2359, pruned_loss=0.1295, over 1951703.30 frames. ], batch size: 57, lr: 3.35e-02, grad_scale: 8.0 +2022-12-07 06:50:08,454 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=9464.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:50:17,871 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9475.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:50:46,648 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.598e+02 2.987e+02 3.780e+02 5.318e+02 1.239e+03, threshold=7.560e+02, percent-clipped=8.0 +2022-12-07 06:50:52,762 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4436, 2.3308, 2.3934, 2.3461, 2.4068, 2.1962, 1.3222, 2.2635], + device='cuda:2'), covar=tensor([0.0284, 0.0365, 0.0517, 0.0301, 0.0350, 0.0512, 0.2173, 0.0378], + device='cuda:2'), in_proj_covar=tensor([0.0082, 0.0078, 0.0078, 0.0062, 0.0104, 0.0073, 0.0119, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 06:51:33,030 INFO [train.py:873] (2/4) Epoch 2, batch 2000, loss[loss=0.2034, simple_loss=0.1811, pruned_loss=0.1128, over 1291.00 frames. ], tot_loss[loss=0.2498, simple_loss=0.2377, pruned_loss=0.131, over 1934786.76 frames. ], batch size: 100, lr: 3.34e-02, grad_scale: 8.0 +2022-12-07 06:51:50,755 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=9583.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:51:51,641 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9584.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:52:03,127 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-12-07 06:52:06,942 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.00 vs. limit=2.0 +2022-12-07 06:52:12,819 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.740e+02 3.149e+02 4.158e+02 5.178e+02 1.006e+03, threshold=8.316e+02, percent-clipped=10.0 +2022-12-07 06:52:32,382 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=9631.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:52:41,063 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.07 vs. limit=5.0 +2022-12-07 06:52:41,419 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=9642.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:52:44,021 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9645.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:52:54,859 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9658.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:52:59,452 INFO [train.py:873] (2/4) Epoch 2, batch 2100, loss[loss=0.2211, simple_loss=0.2229, pruned_loss=0.1097, over 14391.00 frames. ], tot_loss[loss=0.2475, simple_loss=0.2357, pruned_loss=0.1297, over 1877164.28 frames. ], batch size: 41, lr: 3.32e-02, grad_scale: 8.0 +2022-12-07 06:53:01,983 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.02 vs. limit=2.0 +2022-12-07 06:53:05,117 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=9669.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:53:14,256 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0872, 4.9189, 4.7914, 5.3705, 5.0245, 3.9891, 5.4260, 5.3239], + device='cuda:2'), covar=tensor([0.0776, 0.0625, 0.0704, 0.0563, 0.0716, 0.0485, 0.0592, 0.0790], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0066, 0.0092, 0.0085, 0.0090, 0.0059, 0.0080, 0.0086], + device='cuda:2'), out_proj_covar=tensor([1.3538e-04, 1.1154e-04, 1.4955e-04, 1.3742e-04, 1.4765e-04, 9.6381e-05, + 1.3443e-04, 1.4011e-04], device='cuda:2') +2022-12-07 06:53:22,847 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=9690.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:53:39,633 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.662e+02 3.245e+02 4.428e+02 6.131e+02 1.111e+03, threshold=8.856e+02, percent-clipped=8.0 +2022-12-07 06:53:46,126 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=9717.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:53:48,038 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9719.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:54:00,617 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7400, 1.5280, 3.5440, 1.7496, 3.6686, 3.5701, 2.6145, 3.9020], + device='cuda:2'), covar=tensor([0.0155, 0.2629, 0.0333, 0.2242, 0.0230, 0.0276, 0.0668, 0.0153], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0119, 0.0074, 0.0127, 0.0089, 0.0078, 0.0069, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 06:54:09,277 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=9743.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:54:13,575 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8481, 4.5297, 4.4943, 5.1511, 4.8018, 3.6600, 5.0328, 5.0442], + device='cuda:2'), covar=tensor([0.0767, 0.0569, 0.0645, 0.0610, 0.0576, 0.0570, 0.0733, 0.0789], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0071, 0.0096, 0.0090, 0.0094, 0.0061, 0.0086, 0.0088], + device='cuda:2'), out_proj_covar=tensor([1.4159e-04, 1.2026e-04, 1.5604e-04, 1.4624e-04, 1.5474e-04, 9.9175e-05, + 1.4288e-04, 1.4466e-04], device='cuda:2') +2022-12-07 06:54:15,262 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2111, 1.1359, 1.0678, 1.0459, 1.1875, 1.1598, 1.0379, 1.0448], + device='cuda:2'), covar=tensor([0.2340, 0.0927, 0.1328, 0.0558, 0.0517, 0.0386, 0.0820, 0.0620], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0050, 0.0039, 0.0039, 0.0049, 0.0039, 0.0048, 0.0052], + device='cuda:2'), out_proj_covar=tensor([2.1505e-04, 1.1374e-04, 9.9123e-05, 8.9414e-05, 1.0029e-04, 8.9404e-05, + 1.1216e-04, 1.0705e-04], device='cuda:2') +2022-12-07 06:54:26,278 INFO [train.py:873] (2/4) Epoch 2, batch 2200, loss[loss=0.2546, simple_loss=0.2365, pruned_loss=0.1364, over 11133.00 frames. ], tot_loss[loss=0.2488, simple_loss=0.2366, pruned_loss=0.1305, over 1926036.00 frames. ], batch size: 100, lr: 3.31e-02, grad_scale: 8.0 +2022-12-07 06:54:32,296 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9770.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:54:44,456 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2345, 1.1210, 0.8502, 0.8366, 0.9151, 1.0276, 0.6068, 0.9337], + device='cuda:2'), covar=tensor([0.0308, 0.0316, 0.0664, 0.0479, 0.0269, 0.0232, 0.0235, 0.0341], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0017, 0.0018, 0.0018, 0.0018, 0.0018, 0.0016, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.3787e-05, 2.7845e-05, 3.2296e-05, 2.6796e-05, 3.1119e-05, 2.7926e-05, + 3.2262e-05, 2.9275e-05], device='cuda:2') +2022-12-07 06:55:01,546 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=9804.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:55:05,584 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.644e+01 3.042e+02 4.179e+02 5.699e+02 1.039e+03, threshold=8.358e+02, percent-clipped=2.0 +2022-12-07 06:55:37,856 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-12-07 06:55:53,425 INFO [train.py:873] (2/4) Epoch 2, batch 2300, loss[loss=0.2433, simple_loss=0.2353, pruned_loss=0.1256, over 14198.00 frames. ], tot_loss[loss=0.2476, simple_loss=0.2357, pruned_loss=0.1298, over 1897964.55 frames. ], batch size: 89, lr: 3.30e-02, grad_scale: 8.0 +2022-12-07 06:56:33,304 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.330e+02 3.202e+02 3.964e+02 5.237e+02 8.345e+02, threshold=7.929e+02, percent-clipped=0.0 +2022-12-07 06:56:35,277 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-12-07 06:57:00,530 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=9940.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:57:04,077 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8628, 1.5320, 3.6741, 3.5197, 3.6845, 3.6863, 2.8175, 3.7962], + device='cuda:2'), covar=tensor([0.1348, 0.1505, 0.0104, 0.0159, 0.0100, 0.0093, 0.0280, 0.0071], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0112, 0.0057, 0.0077, 0.0064, 0.0066, 0.0055, 0.0054], + device='cuda:2'), out_proj_covar=tensor([1.9497e-04, 1.9968e-04, 1.0897e-04, 1.6022e-04, 1.2140e-04, 1.2446e-04, + 1.1336e-04, 9.9887e-05], device='cuda:2') +2022-12-07 06:57:20,361 INFO [train.py:873] (2/4) Epoch 2, batch 2400, loss[loss=0.2223, simple_loss=0.2275, pruned_loss=0.1085, over 14064.00 frames. ], tot_loss[loss=0.2485, simple_loss=0.2363, pruned_loss=0.1303, over 1920294.77 frames. ], batch size: 29, lr: 3.28e-02, grad_scale: 8.0 +2022-12-07 06:57:20,429 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8406, 2.4817, 2.6020, 2.8381, 2.8597, 2.8350, 2.8484, 2.4952], + device='cuda:2'), covar=tensor([0.0328, 0.1164, 0.0397, 0.0412, 0.0635, 0.0547, 0.0580, 0.0572], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0127, 0.0081, 0.0079, 0.0090, 0.0088, 0.0120, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 06:57:29,090 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-12-07 06:58:03,556 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.570e+02 3.087e+02 4.289e+02 5.805e+02 1.264e+03, threshold=8.579e+02, percent-clipped=4.0 +2022-12-07 06:58:07,504 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-12-07 06:58:07,913 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10014.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:58:30,063 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10039.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:58:50,275 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10062.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 06:58:50,994 INFO [train.py:873] (2/4) Epoch 2, batch 2500, loss[loss=0.2429, simple_loss=0.2001, pruned_loss=0.1428, over 1239.00 frames. ], tot_loss[loss=0.2457, simple_loss=0.2349, pruned_loss=0.1283, over 1943366.02 frames. ], batch size: 100, lr: 3.27e-02, grad_scale: 8.0 +2022-12-07 06:58:57,296 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10070.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:59:21,288 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10099.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:59:22,536 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10100.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 06:59:28,733 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8967, 1.6710, 2.2496, 2.1882, 2.3380, 1.8546, 2.0820, 1.9246], + device='cuda:2'), covar=tensor([0.0407, 0.1545, 0.0088, 0.0527, 0.0154, 0.0554, 0.0307, 0.1581], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0266, 0.0102, 0.0187, 0.0122, 0.0153, 0.0136, 0.0306], + device='cuda:2'), out_proj_covar=tensor([1.2393e-04, 2.2473e-04, 8.4782e-05, 1.5672e-04, 1.0924e-04, 1.3171e-04, + 1.3714e-04, 2.5517e-04], device='cuda:2') +2022-12-07 06:59:30,092 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.651e+02 3.039e+02 4.037e+02 4.970e+02 1.749e+03, threshold=8.074e+02, percent-clipped=6.0 +2022-12-07 06:59:35,576 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0370, 2.3307, 3.6840, 3.2513, 3.6712, 2.5971, 3.4203, 2.2239], + device='cuda:2'), covar=tensor([0.0135, 0.0296, 0.0163, 0.0232, 0.0069, 0.0587, 0.0047, 0.0650], + device='cuda:2'), in_proj_covar=tensor([0.0090, 0.0109, 0.0086, 0.0107, 0.0099, 0.0160, 0.0065, 0.0153], + device='cuda:2'), out_proj_covar=tensor([1.0622e-04, 1.3348e-04, 1.1202e-04, 1.3362e-04, 1.2149e-04, 1.9560e-04, + 7.6560e-05, 1.7842e-04], device='cuda:2') +2022-12-07 06:59:37,890 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10118.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 06:59:42,163 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10123.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 06:59:49,820 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7632, 1.4505, 1.9089, 1.6476, 1.8636, 1.4095, 1.6923, 1.9312], + device='cuda:2'), covar=tensor([0.0387, 0.0872, 0.0151, 0.0828, 0.0134, 0.0663, 0.0655, 0.0120], + device='cuda:2'), in_proj_covar=tensor([0.0185, 0.0287, 0.0148, 0.0400, 0.0121, 0.0307, 0.0209, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0004, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 07:00:04,471 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.07 vs. limit=2.0 +2022-12-07 07:00:16,955 INFO [train.py:873] (2/4) Epoch 2, batch 2600, loss[loss=0.2484, simple_loss=0.2427, pruned_loss=0.127, over 14239.00 frames. ], tot_loss[loss=0.2452, simple_loss=0.2345, pruned_loss=0.128, over 1922805.85 frames. ], batch size: 94, lr: 3.26e-02, grad_scale: 8.0 +2022-12-07 07:00:31,383 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-07 07:00:47,937 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1210, 0.8343, 1.2299, 0.6165, 0.7899, 0.6077, 0.9439, 1.0633], + device='cuda:2'), covar=tensor([0.0112, 0.0370, 0.0219, 0.0271, 0.0307, 0.0526, 0.0191, 0.0154], + device='cuda:2'), in_proj_covar=tensor([0.0046, 0.0116, 0.0056, 0.0084, 0.0050, 0.0051, 0.0048, 0.0047], + device='cuda:2'), out_proj_covar=tensor([8.4410e-05, 1.9604e-04, 1.0466e-04, 1.5279e-04, 1.0298e-04, 9.9261e-05, + 9.9999e-05, 9.0547e-05], device='cuda:2') +2022-12-07 07:00:49,142 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.61 vs. limit=2.0 +2022-12-07 07:00:52,991 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0784, 1.0709, 0.8396, 0.6803, 0.5240, 0.9376, 0.3695, 0.7529], + device='cuda:2'), covar=tensor([0.0313, 0.0128, 0.0197, 0.0297, 0.0457, 0.0141, 0.0159, 0.0155], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0015, 0.0015, 0.0016, 0.0015, 0.0015, 0.0014, 0.0015], + device='cuda:2'), out_proj_covar=tensor([2.3565e-05, 2.5581e-05, 2.7823e-05, 2.4922e-05, 2.7663e-05, 2.4113e-05, + 2.9166e-05, 2.5078e-05], device='cuda:2') +2022-12-07 07:00:56,780 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.626e+02 3.080e+02 4.222e+02 5.388e+02 1.025e+03, threshold=8.444e+02, percent-clipped=5.0 +2022-12-07 07:01:24,228 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10240.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:01:35,195 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5818, 3.2188, 2.3768, 3.6808, 3.3935, 3.5464, 3.2194, 2.4677], + device='cuda:2'), covar=tensor([0.0204, 0.0441, 0.2758, 0.0255, 0.0191, 0.0590, 0.0567, 0.2797], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0170, 0.0282, 0.0129, 0.0124, 0.0125, 0.0172, 0.0304], + device='cuda:2'), out_proj_covar=tensor([1.0025e-04, 1.2010e-04, 1.8827e-04, 8.7256e-05, 9.3193e-05, 9.0910e-05, + 1.2306e-04, 2.0180e-04], device='cuda:2') +2022-12-07 07:01:44,396 INFO [train.py:873] (2/4) Epoch 2, batch 2700, loss[loss=0.2844, simple_loss=0.2569, pruned_loss=0.1559, over 13971.00 frames. ], tot_loss[loss=0.2455, simple_loss=0.2347, pruned_loss=0.1282, over 1970693.96 frames. ], batch size: 22, lr: 3.24e-02, grad_scale: 8.0 +2022-12-07 07:02:06,314 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10288.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:02:20,942 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4177, 3.9031, 3.9525, 4.2880, 4.2276, 4.0458, 4.3382, 3.8128], + device='cuda:2'), covar=tensor([0.0291, 0.0947, 0.0308, 0.0409, 0.0602, 0.0479, 0.0517, 0.0473], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0132, 0.0081, 0.0081, 0.0092, 0.0090, 0.0121, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:02:24,379 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.544e+02 3.215e+02 3.934e+02 5.144e+02 9.049e+02, threshold=7.869e+02, percent-clipped=4.0 +2022-12-07 07:02:29,017 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10314.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:02:51,906 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 07:03:05,474 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1113, 4.5465, 4.6840, 5.1344, 4.8416, 4.0324, 5.2251, 5.0940], + device='cuda:2'), covar=tensor([0.0423, 0.0400, 0.0530, 0.0453, 0.0447, 0.0487, 0.0432, 0.0617], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0068, 0.0094, 0.0086, 0.0092, 0.0063, 0.0085, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:03:09,554 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10362.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:03:10,327 INFO [train.py:873] (2/4) Epoch 2, batch 2800, loss[loss=0.2632, simple_loss=0.2482, pruned_loss=0.1391, over 14241.00 frames. ], tot_loss[loss=0.2449, simple_loss=0.2347, pruned_loss=0.1275, over 1985097.66 frames. ], batch size: 69, lr: 3.23e-02, grad_scale: 8.0 +2022-12-07 07:03:33,983 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-12-07 07:03:38,826 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10395.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 07:03:42,174 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10399.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:03:50,604 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.391e+02 3.431e+02 4.642e+02 5.590e+02 1.030e+03, threshold=9.284e+02, percent-clipped=5.0 +2022-12-07 07:03:50,772 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10409.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:03:56,399 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.79 vs. limit=5.0 +2022-12-07 07:03:58,204 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10418.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 07:04:23,696 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10447.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:04:29,078 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9680, 3.7272, 2.9660, 4.9680, 4.3622, 4.8219, 4.7919, 2.8116], + device='cuda:2'), covar=tensor([0.0067, 0.0326, 0.2178, 0.0156, 0.0103, 0.0196, 0.0124, 0.2330], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0179, 0.0287, 0.0128, 0.0126, 0.0128, 0.0171, 0.0309], + device='cuda:2'), out_proj_covar=tensor([1.0127e-04, 1.2498e-04, 1.9332e-04, 8.6507e-05, 9.4154e-05, 9.3397e-05, + 1.2385e-04, 2.0547e-04], device='cuda:2') +2022-12-07 07:04:29,849 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10454.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:04:37,270 INFO [train.py:873] (2/4) Epoch 2, batch 2900, loss[loss=0.2291, simple_loss=0.2325, pruned_loss=0.1128, over 14575.00 frames. ], tot_loss[loss=0.2429, simple_loss=0.2335, pruned_loss=0.1261, over 1937575.66 frames. ], batch size: 22, lr: 3.22e-02, grad_scale: 8.0 +2022-12-07 07:04:43,835 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10470.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:04:45,325 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.03 vs. limit=2.0 +2022-12-07 07:04:54,150 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6995, 1.1059, 0.9150, 1.1937, 0.7732, 2.2884, 1.2967, 1.0881], + device='cuda:2'), covar=tensor([0.0329, 0.0970, 0.0620, 0.0522, 0.0818, 0.0270, 0.0225, 0.0384], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0016, 0.0016, 0.0017, 0.0016, 0.0016, 0.0016, 0.0017], + device='cuda:2'), out_proj_covar=tensor([2.6181e-05, 2.8555e-05, 3.0378e-05, 2.7876e-05, 2.9280e-05, 2.5588e-05, + 3.2969e-05, 2.9517e-05], device='cuda:2') +2022-12-07 07:05:01,742 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.25 vs. limit=2.0 +2022-12-07 07:05:18,328 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.484e+02 2.981e+02 3.992e+02 4.851e+02 1.210e+03, threshold=7.983e+02, percent-clipped=1.0 +2022-12-07 07:05:23,917 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10515.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 07:05:40,348 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.6695, 0.5907, 0.6627, 0.7149, 0.6822, 0.4908, 0.6905, 0.5952], + device='cuda:2'), covar=tensor([0.0046, 0.0059, 0.0094, 0.0099, 0.0063, 0.0060, 0.0102, 0.0081], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0022, 0.0024, 0.0021, 0.0020, 0.0024, 0.0018, 0.0019], + device='cuda:2'), out_proj_covar=tensor([4.4684e-05, 4.9245e-05, 4.6159e-05, 4.4794e-05, 3.8320e-05, 4.8478e-05, + 3.9262e-05, 3.6779e-05], device='cuda:2') +2022-12-07 07:06:05,152 INFO [train.py:873] (2/4) Epoch 2, batch 3000, loss[loss=0.2487, simple_loss=0.2444, pruned_loss=0.1265, over 14401.00 frames. ], tot_loss[loss=0.2438, simple_loss=0.2339, pruned_loss=0.1268, over 1928239.31 frames. ], batch size: 73, lr: 3.21e-02, grad_scale: 8.0 +2022-12-07 07:06:05,152 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 07:06:09,574 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2224, 0.9720, 0.6793, 0.9210, 0.5767, 1.9681, 1.3172, 0.8183], + device='cuda:2'), covar=tensor([0.0385, 0.1134, 0.0935, 0.0859, 0.1850, 0.0387, 0.0385, 0.0854], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0016, 0.0016, 0.0017, 0.0016, 0.0017, 0.0016, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.6709e-05, 2.7876e-05, 3.0399e-05, 2.7491e-05, 3.0070e-05, 2.6067e-05, + 3.3781e-05, 3.0868e-05], device='cuda:2') +2022-12-07 07:06:10,929 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4383, 1.9306, 3.7994, 2.8770, 3.5439, 1.6821, 3.2182, 3.3667], + device='cuda:2'), covar=tensor([0.0148, 0.4802, 0.0312, 0.5885, 0.0171, 0.3911, 0.0637, 0.0126], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0297, 0.0152, 0.0404, 0.0124, 0.0308, 0.0218, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0004, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 07:06:13,212 INFO [train.py:905] (2/4) Epoch 2, validation: loss=0.1433, simple_loss=0.1828, pruned_loss=0.05186, over 857387.00 frames. +2022-12-07 07:06:13,213 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 07:06:33,263 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8439, 1.7725, 2.9353, 2.2644, 2.7746, 1.7751, 2.4902, 2.6278], + device='cuda:2'), covar=tensor([0.0258, 0.3063, 0.0219, 0.3976, 0.0163, 0.2358, 0.0628, 0.0208], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0295, 0.0153, 0.0403, 0.0124, 0.0306, 0.0217, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0004, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 07:06:54,388 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.643e+02 3.162e+02 4.298e+02 5.900e+02 1.176e+03, threshold=8.596e+02, percent-clipped=5.0 +2022-12-07 07:07:10,728 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0611, 1.1050, 0.7342, 0.7908, 0.7438, 1.7177, 1.1464, 0.7303], + device='cuda:2'), covar=tensor([0.0311, 0.0547, 0.0565, 0.0320, 0.0820, 0.0251, 0.0184, 0.0856], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0015, 0.0016, 0.0017, 0.0016, 0.0017, 0.0015, 0.0018], + device='cuda:2'), out_proj_covar=tensor([2.6394e-05, 2.6971e-05, 3.0621e-05, 2.6243e-05, 2.9606e-05, 2.6400e-05, + 3.2360e-05, 3.0272e-05], device='cuda:2') +2022-12-07 07:07:13,161 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2178, 4.3983, 4.5675, 5.1341, 4.9487, 4.2884, 5.0757, 4.4087], + device='cuda:2'), covar=tensor([0.0200, 0.0997, 0.0258, 0.0323, 0.0522, 0.0408, 0.0487, 0.0487], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0135, 0.0084, 0.0082, 0.0091, 0.0091, 0.0122, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:07:16,738 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6940, 4.1855, 4.2230, 4.4370, 4.4528, 4.3846, 4.5970, 4.6218], + device='cuda:2'), covar=tensor([0.0445, 0.0558, 0.0657, 0.0552, 0.0289, 0.0335, 0.0690, 0.0617], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0112, 0.0142, 0.0128, 0.0115, 0.0124, 0.0144, 0.0115], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:07:41,331 INFO [train.py:873] (2/4) Epoch 2, batch 3100, loss[loss=0.2583, simple_loss=0.2425, pruned_loss=0.137, over 13997.00 frames. ], tot_loss[loss=0.2435, simple_loss=0.2337, pruned_loss=0.1267, over 1974224.68 frames. ], batch size: 19, lr: 3.19e-02, grad_scale: 8.0 +2022-12-07 07:08:09,623 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10695.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:08:22,248 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.335e+02 3.394e+02 4.427e+02 5.401e+02 1.921e+03, threshold=8.855e+02, percent-clipped=7.0 +2022-12-07 07:08:26,386 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-12-07 07:08:29,185 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=10718.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 07:08:51,475 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10743.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:08:52,438 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10744.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:08:59,831 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-12-07 07:09:08,156 INFO [train.py:873] (2/4) Epoch 2, batch 3200, loss[loss=0.2334, simple_loss=0.2372, pruned_loss=0.1148, over 14279.00 frames. ], tot_loss[loss=0.2443, simple_loss=0.2345, pruned_loss=0.1271, over 1998797.98 frames. ], batch size: 66, lr: 3.18e-02, grad_scale: 8.0 +2022-12-07 07:09:10,158 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10765.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:09:10,981 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=10766.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 07:09:11,097 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6808, 1.7465, 2.9187, 2.1568, 2.7103, 1.7908, 2.2751, 2.6516], + device='cuda:2'), covar=tensor([0.0402, 0.3693, 0.0280, 0.4776, 0.0188, 0.2615, 0.0830, 0.0219], + device='cuda:2'), in_proj_covar=tensor([0.0192, 0.0305, 0.0158, 0.0412, 0.0131, 0.0317, 0.0225, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0004, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 07:09:18,366 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10774.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:09:41,467 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5448, 3.9427, 4.7431, 3.9081, 4.5017, 4.6977, 2.0147, 4.4002], + device='cuda:2'), covar=tensor([0.0198, 0.0397, 0.0398, 0.0363, 0.0254, 0.0160, 0.2990, 0.0233], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0093, 0.0094, 0.0075, 0.0118, 0.0083, 0.0135, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 07:09:45,225 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10805.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:09:49,655 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.759e+01 3.135e+02 4.113e+02 5.521e+02 8.692e+02, threshold=8.227e+02, percent-clipped=0.0 +2022-12-07 07:09:49,793 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=10810.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 07:10:09,218 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-07 07:10:11,383 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10835.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:10:26,818 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6532, 2.2003, 2.8430, 2.5636, 2.8643, 2.7568, 2.8359, 2.2905], + device='cuda:2'), covar=tensor([0.0158, 0.1463, 0.0091, 0.0522, 0.0146, 0.0296, 0.0271, 0.1374], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0270, 0.0106, 0.0194, 0.0131, 0.0159, 0.0145, 0.0304], + device='cuda:2'), out_proj_covar=tensor([1.2673e-04, 2.3456e-04, 8.8928e-05, 1.6281e-04, 1.2007e-04, 1.4104e-04, + 1.4745e-04, 2.5789e-04], device='cuda:2') +2022-12-07 07:10:35,550 INFO [train.py:873] (2/4) Epoch 2, batch 3300, loss[loss=0.2653, simple_loss=0.2468, pruned_loss=0.1419, over 14588.00 frames. ], tot_loss[loss=0.2436, simple_loss=0.2341, pruned_loss=0.1265, over 2010207.20 frames. ], batch size: 21, lr: 3.17e-02, grad_scale: 8.0 +2022-12-07 07:10:36,259 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.73 vs. limit=5.0 +2022-12-07 07:10:54,191 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=10885.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 07:11:15,637 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.702e+02 2.971e+02 4.122e+02 5.420e+02 1.001e+03, threshold=8.244e+02, percent-clipped=4.0 +2022-12-07 07:11:18,458 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7577, 3.2949, 3.6607, 3.5702, 3.6048, 3.7208, 1.5956, 3.6119], + device='cuda:2'), covar=tensor([0.0284, 0.0501, 0.0713, 0.0339, 0.0432, 0.0312, 0.3722, 0.0310], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0090, 0.0089, 0.0072, 0.0114, 0.0078, 0.0128, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 07:11:21,417 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 07:11:46,476 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=10946.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 07:12:00,254 INFO [train.py:873] (2/4) Epoch 2, batch 3400, loss[loss=0.2335, simple_loss=0.229, pruned_loss=0.1189, over 14557.00 frames. ], tot_loss[loss=0.2427, simple_loss=0.2336, pruned_loss=0.1259, over 2012385.71 frames. ], batch size: 34, lr: 3.16e-02, grad_scale: 8.0 +2022-12-07 07:12:42,275 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.436e+02 2.877e+02 3.778e+02 5.417e+02 1.188e+03, threshold=7.556e+02, percent-clipped=7.0 +2022-12-07 07:12:59,233 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0453, 2.0084, 2.0259, 2.0137, 2.0374, 1.9533, 1.0928, 1.8688], + device='cuda:2'), covar=tensor([0.0276, 0.0251, 0.0379, 0.0192, 0.0259, 0.0432, 0.1486, 0.0347], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0091, 0.0091, 0.0074, 0.0118, 0.0080, 0.0133, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 07:13:28,831 INFO [train.py:873] (2/4) Epoch 2, batch 3500, loss[loss=0.234, simple_loss=0.238, pruned_loss=0.1151, over 14604.00 frames. ], tot_loss[loss=0.2414, simple_loss=0.2325, pruned_loss=0.1251, over 2052438.78 frames. ], batch size: 22, lr: 3.15e-02, grad_scale: 8.0 +2022-12-07 07:13:30,400 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11065.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:13:37,723 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.05 vs. limit=2.0 +2022-12-07 07:14:00,508 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11100.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:14:08,561 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.123e+02 3.159e+02 4.112e+02 5.380e+02 8.623e+02, threshold=8.224e+02, percent-clipped=2.0 +2022-12-07 07:14:08,720 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11110.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 07:14:11,081 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11113.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:14:26,033 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11130.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:14:50,086 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11158.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:14:54,227 INFO [train.py:873] (2/4) Epoch 2, batch 3600, loss[loss=0.2368, simple_loss=0.2198, pruned_loss=0.1269, over 5988.00 frames. ], tot_loss[loss=0.2399, simple_loss=0.2315, pruned_loss=0.1241, over 2088369.68 frames. ], batch size: 100, lr: 3.13e-02, grad_scale: 8.0 +2022-12-07 07:14:56,335 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2899, 4.1053, 3.9163, 4.0634, 4.0914, 4.1004, 4.4480, 4.3535], + device='cuda:2'), covar=tensor([0.0730, 0.0593, 0.0916, 0.0905, 0.0451, 0.0495, 0.0661, 0.0707], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0123, 0.0154, 0.0148, 0.0127, 0.0141, 0.0154, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:15:04,132 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11174.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:15:35,172 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.191e+02 2.817e+02 3.568e+02 4.695e+02 1.140e+03, threshold=7.135e+02, percent-clipped=4.0 +2022-12-07 07:15:48,509 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0402, 1.1304, 0.9764, 0.8133, 0.6443, 0.9448, 0.3463, 0.8411], + device='cuda:2'), covar=tensor([0.0177, 0.0195, 0.0353, 0.0348, 0.0301, 0.0214, 0.0205, 0.0319], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0014, 0.0016, 0.0013, 0.0016, 0.0015, 0.0016], + device='cuda:2'), out_proj_covar=tensor([2.5099e-05, 2.5343e-05, 2.8265e-05, 2.6542e-05, 2.6014e-05, 2.5146e-05, + 3.2064e-05, 2.9047e-05], device='cuda:2') +2022-12-07 07:15:56,911 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11235.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:16:02,093 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11241.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 07:16:06,681 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0252, 1.3888, 1.6644, 1.3653, 1.1164, 1.5225, 1.5913, 1.2621], + device='cuda:2'), covar=tensor([0.2207, 0.0967, 0.0875, 0.0948, 0.0787, 0.0412, 0.0752, 0.0776], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0052, 0.0041, 0.0043, 0.0056, 0.0042, 0.0053, 0.0061], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:16:09,663 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.05 vs. limit=2.0 +2022-12-07 07:16:21,364 INFO [train.py:873] (2/4) Epoch 2, batch 3700, loss[loss=0.2433, simple_loss=0.2374, pruned_loss=0.1246, over 14213.00 frames. ], tot_loss[loss=0.2414, simple_loss=0.2323, pruned_loss=0.1252, over 2009805.37 frames. ], batch size: 99, lr: 3.12e-02, grad_scale: 8.0 +2022-12-07 07:16:44,553 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7450, 0.8391, 1.0443, 0.9845, 0.7688, 1.0610, 0.7040, 0.5303], + device='cuda:2'), covar=tensor([0.1554, 0.0450, 0.0331, 0.0299, 0.0700, 0.0204, 0.0961, 0.0770], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0054, 0.0044, 0.0045, 0.0057, 0.0043, 0.0054, 0.0062], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:16:49,164 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9611, 1.6380, 3.2361, 2.3354, 2.9127, 1.7135, 2.2771, 2.8732], + device='cuda:2'), covar=tensor([0.0276, 0.3725, 0.0199, 0.4427, 0.0146, 0.2938, 0.0956, 0.0205], + device='cuda:2'), in_proj_covar=tensor([0.0187, 0.0300, 0.0150, 0.0409, 0.0129, 0.0308, 0.0234, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0004, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 07:17:02,054 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 3.008e+02 3.803e+02 5.016e+02 1.141e+03, threshold=7.605e+02, percent-clipped=8.0 +2022-12-07 07:17:23,669 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5032, 1.1779, 1.3081, 1.3835, 0.9153, 1.4041, 1.0917, 1.1615], + device='cuda:2'), covar=tensor([0.2083, 0.0964, 0.0928, 0.0598, 0.0993, 0.0348, 0.0992, 0.0892], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0056, 0.0044, 0.0046, 0.0059, 0.0044, 0.0056, 0.0063], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:17:24,396 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6802, 4.5495, 4.3982, 4.8084, 4.7095, 3.8953, 4.9406, 4.8934], + device='cuda:2'), covar=tensor([0.1022, 0.0508, 0.0926, 0.0971, 0.0643, 0.0649, 0.0722, 0.0881], + device='cuda:2'), in_proj_covar=tensor([0.0093, 0.0073, 0.0097, 0.0088, 0.0097, 0.0065, 0.0088, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 07:17:28,872 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5475, 2.4608, 2.3730, 2.2493, 2.4614, 2.3517, 2.4788, 2.4819], + device='cuda:2'), covar=tensor([0.0387, 0.0743, 0.1110, 0.1338, 0.0396, 0.0514, 0.0800, 0.0620], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0123, 0.0153, 0.0151, 0.0124, 0.0138, 0.0153, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:17:47,318 INFO [train.py:873] (2/4) Epoch 2, batch 3800, loss[loss=0.2223, simple_loss=0.222, pruned_loss=0.1114, over 14222.00 frames. ], tot_loss[loss=0.2407, simple_loss=0.2322, pruned_loss=0.1246, over 2003142.54 frames. ], batch size: 94, lr: 3.11e-02, grad_scale: 8.0 +2022-12-07 07:18:12,497 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.45 vs. limit=5.0 +2022-12-07 07:18:20,016 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11400.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:18:28,500 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.452e+02 3.176e+02 4.502e+02 5.921e+02 1.352e+03, threshold=9.005e+02, percent-clipped=11.0 +2022-12-07 07:18:38,076 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-12-07 07:18:46,160 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11430.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:18:55,260 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-07 07:19:02,214 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11448.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:19:14,586 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3883, 5.1205, 4.8051, 4.9680, 4.8324, 5.2051, 5.2782, 5.3401], + device='cuda:2'), covar=tensor([0.0330, 0.0407, 0.0554, 0.0569, 0.0312, 0.0235, 0.0367, 0.0523], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0126, 0.0156, 0.0157, 0.0127, 0.0140, 0.0153, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:19:15,325 INFO [train.py:873] (2/4) Epoch 2, batch 3900, loss[loss=0.2255, simple_loss=0.2269, pruned_loss=0.112, over 13580.00 frames. ], tot_loss[loss=0.2395, simple_loss=0.2315, pruned_loss=0.1238, over 2029082.12 frames. ], batch size: 17, lr: 3.10e-02, grad_scale: 8.0 +2022-12-07 07:19:27,737 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11478.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:19:40,863 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7411, 2.6497, 2.4103, 2.4242, 2.5969, 2.5660, 2.6792, 2.6894], + device='cuda:2'), covar=tensor([0.0542, 0.0539, 0.0890, 0.1001, 0.0541, 0.0546, 0.0739, 0.0794], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0125, 0.0155, 0.0159, 0.0127, 0.0141, 0.0152, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:19:55,407 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.433e+02 2.890e+02 4.018e+02 5.176e+02 1.364e+03, threshold=8.037e+02, percent-clipped=4.0 +2022-12-07 07:20:01,046 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-12-07 07:20:12,868 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11530.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:20:14,651 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11532.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:20:22,705 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11541.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 07:20:40,966 INFO [train.py:873] (2/4) Epoch 2, batch 4000, loss[loss=0.2309, simple_loss=0.2121, pruned_loss=0.1248, over 4981.00 frames. ], tot_loss[loss=0.2367, simple_loss=0.2294, pruned_loss=0.122, over 1985046.52 frames. ], batch size: 100, lr: 3.09e-02, grad_scale: 8.0 +2022-12-07 07:21:03,667 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11589.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 07:21:07,149 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11593.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:21:22,286 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.261e+02 3.113e+02 4.062e+02 5.460e+02 1.002e+03, threshold=8.123e+02, percent-clipped=4.0 +2022-12-07 07:21:31,126 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0236, 2.0166, 1.9528, 2.1215, 1.7238, 1.6732, 2.0543, 2.1149], + device='cuda:2'), covar=tensor([0.0906, 0.0745, 0.0977, 0.0786, 0.1235, 0.0942, 0.0810, 0.0739], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0072, 0.0094, 0.0085, 0.0098, 0.0064, 0.0087, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:21:34,982 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8181, 0.9992, 1.0299, 1.0039, 0.7418, 1.1019, 0.6752, 0.6740], + device='cuda:2'), covar=tensor([0.0814, 0.0310, 0.0167, 0.0257, 0.0497, 0.0143, 0.0940, 0.0689], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0054, 0.0041, 0.0044, 0.0055, 0.0042, 0.0054, 0.0058], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:21:38,024 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 07:21:44,104 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11635.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:22:08,542 INFO [train.py:873] (2/4) Epoch 2, batch 4100, loss[loss=0.2176, simple_loss=0.2211, pruned_loss=0.107, over 14534.00 frames. ], tot_loss[loss=0.2375, simple_loss=0.2303, pruned_loss=0.1224, over 2016490.94 frames. ], batch size: 34, lr: 3.08e-02, grad_scale: 8.0 +2022-12-07 07:22:36,728 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5072, 1.2954, 1.4213, 0.9555, 1.1749, 1.1611, 1.3198, 1.3979], + device='cuda:2'), covar=tensor([0.0379, 0.2622, 0.0566, 0.1401, 0.0710, 0.0787, 0.0643, 0.0506], + device='cuda:2'), in_proj_covar=tensor([0.0050, 0.0131, 0.0066, 0.0091, 0.0058, 0.0058, 0.0052, 0.0055], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:22:37,375 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11696.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:22:39,929 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11699.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:22:43,192 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-12-07 07:22:49,364 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.791e+01 3.003e+02 4.031e+02 5.512e+02 8.434e+02, threshold=8.062e+02, percent-clipped=2.0 +2022-12-07 07:23:09,312 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5425, 1.2598, 1.9676, 1.4969, 1.9428, 1.3948, 1.5377, 1.9235], + device='cuda:2'), covar=tensor([0.0675, 0.1361, 0.0183, 0.0807, 0.0161, 0.1209, 0.0986, 0.0198], + device='cuda:2'), in_proj_covar=tensor([0.0200, 0.0302, 0.0152, 0.0416, 0.0134, 0.0318, 0.0237, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0004, 0.0001, 0.0003, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 07:23:13,098 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2010, 0.6600, 1.2379, 0.9387, 1.1005, 0.8365, 1.7155, 0.8718], + device='cuda:2'), covar=tensor([0.0504, 0.1046, 0.0234, 0.0645, 0.0879, 0.0221, 0.0362, 0.0509], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0020, 0.0021, 0.0020, 0.0019, 0.0022, 0.0016, 0.0019], + device='cuda:2'), out_proj_covar=tensor([4.3182e-05, 4.6381e-05, 4.3842e-05, 4.5692e-05, 4.0879e-05, 4.6369e-05, + 3.6322e-05, 4.1271e-05], device='cuda:2') +2022-12-07 07:23:32,943 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11760.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:23:33,323 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.62 vs. limit=2.0 +2022-12-07 07:23:35,179 INFO [train.py:873] (2/4) Epoch 2, batch 4200, loss[loss=0.2094, simple_loss=0.2059, pruned_loss=0.1064, over 3891.00 frames. ], tot_loss[loss=0.2381, simple_loss=0.2306, pruned_loss=0.1228, over 1988448.39 frames. ], batch size: 100, lr: 3.07e-02, grad_scale: 8.0 +2022-12-07 07:23:42,664 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6458, 3.2772, 3.5740, 3.2569, 3.4569, 3.3264, 1.4570, 3.3245], + device='cuda:2'), covar=tensor([0.0198, 0.0376, 0.0475, 0.0372, 0.0339, 0.0403, 0.3241, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0093, 0.0094, 0.0072, 0.0121, 0.0081, 0.0132, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 07:24:16,691 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.779e+02 3.300e+02 4.114e+02 5.198e+02 1.405e+03, threshold=8.228e+02, percent-clipped=5.0 +2022-12-07 07:24:21,350 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5130, 1.4790, 1.9111, 1.2348, 1.2372, 1.4319, 0.9922, 1.4312], + device='cuda:2'), covar=tensor([0.0474, 0.0778, 0.0269, 0.0712, 0.0498, 0.0468, 0.1380, 0.0498], + device='cuda:2'), in_proj_covar=tensor([0.0051, 0.0057, 0.0055, 0.0059, 0.0058, 0.0050, 0.0099, 0.0063], + device='cuda:2'), out_proj_covar=tensor([8.6189e-05, 9.1314e-05, 8.6736e-05, 9.1123e-05, 8.6185e-05, 8.1773e-05, + 1.4730e-04, 1.0129e-04], device='cuda:2') +2022-12-07 07:24:22,313 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11816.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:24:34,070 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=11830.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:24:40,811 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6971, 1.5971, 2.1389, 1.7719, 2.1160, 1.7572, 1.9395, 1.7993], + device='cuda:2'), covar=tensor([0.0088, 0.0321, 0.0025, 0.0065, 0.0032, 0.0066, 0.0054, 0.0100], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0281, 0.0116, 0.0205, 0.0140, 0.0167, 0.0160, 0.0315], + device='cuda:2'), out_proj_covar=tensor([1.4082e-04, 2.4986e-04, 9.8149e-05, 1.7530e-04, 1.3145e-04, 1.5254e-04, + 1.6246e-04, 2.7039e-04], device='cuda:2') +2022-12-07 07:24:47,752 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11846.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:25:02,803 INFO [train.py:873] (2/4) Epoch 2, batch 4300, loss[loss=0.2727, simple_loss=0.246, pruned_loss=0.1497, over 11178.00 frames. ], tot_loss[loss=0.2371, simple_loss=0.2305, pruned_loss=0.1219, over 2037151.57 frames. ], batch size: 100, lr: 3.06e-02, grad_scale: 8.0 +2022-12-07 07:25:04,956 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7664, 1.5799, 1.5613, 0.9855, 1.4385, 1.4601, 1.7144, 1.4909], + device='cuda:2'), covar=tensor([0.0249, 0.2067, 0.0514, 0.1206, 0.0729, 0.0434, 0.0406, 0.0413], + device='cuda:2'), in_proj_covar=tensor([0.0052, 0.0135, 0.0067, 0.0093, 0.0058, 0.0057, 0.0052, 0.0057], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:25:15,148 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11877.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:25:15,841 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=11878.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:25:22,103 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.21 vs. limit=5.0 +2022-12-07 07:25:24,630 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11888.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:25:41,738 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11907.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:25:44,081 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.554e+02 3.023e+02 3.927e+02 4.940e+02 9.062e+02, threshold=7.854e+02, percent-clipped=2.0 +2022-12-07 07:25:45,030 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0495, 3.7259, 3.7069, 4.1094, 3.7826, 2.9646, 4.1289, 4.0602], + device='cuda:2'), covar=tensor([0.0592, 0.0673, 0.0609, 0.0560, 0.0581, 0.0683, 0.0580, 0.0634], + device='cuda:2'), in_proj_covar=tensor([0.0094, 0.0073, 0.0094, 0.0089, 0.0099, 0.0066, 0.0086, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:25:52,657 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11920.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:26:17,486 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11948.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:26:21,740 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6157, 1.7310, 4.2087, 3.7466, 3.8324, 3.7505, 3.4963, 4.3970], + device='cuda:2'), covar=tensor([0.2174, 0.2228, 0.0175, 0.0212, 0.0203, 0.0294, 0.0301, 0.0114], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0125, 0.0066, 0.0088, 0.0075, 0.0076, 0.0062, 0.0064], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:26:30,061 INFO [train.py:873] (2/4) Epoch 2, batch 4400, loss[loss=0.2227, simple_loss=0.2297, pruned_loss=0.1078, over 14061.00 frames. ], tot_loss[loss=0.2367, simple_loss=0.2301, pruned_loss=0.1216, over 2052167.22 frames. ], batch size: 29, lr: 3.04e-02, grad_scale: 8.0 +2022-12-07 07:26:34,636 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6256, 1.9748, 2.6095, 2.3876, 2.7336, 2.4687, 2.6361, 2.2206], + device='cuda:2'), covar=tensor([0.0120, 0.1061, 0.0055, 0.0406, 0.0119, 0.0160, 0.0280, 0.0858], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0282, 0.0118, 0.0213, 0.0144, 0.0172, 0.0162, 0.0326], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0003, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 07:26:44,766 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6256, 2.2121, 2.9184, 2.9453, 2.4554, 2.2283, 2.6858, 2.1990], + device='cuda:2'), covar=tensor([0.0077, 0.0161, 0.0161, 0.0136, 0.0074, 0.0273, 0.0036, 0.0266], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0129, 0.0113, 0.0120, 0.0109, 0.0167, 0.0075, 0.0162], + device='cuda:2'), out_proj_covar=tensor([1.2705e-04, 1.6978e-04, 1.5614e-04, 1.6047e-04, 1.4353e-04, 2.2634e-04, + 9.6859e-05, 2.0065e-04], device='cuda:2') +2022-12-07 07:26:46,383 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=11981.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:26:48,664 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=11984.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:26:50,242 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6514, 1.3324, 1.5569, 1.4191, 1.0224, 1.4450, 1.1145, 1.0428], + device='cuda:2'), covar=tensor([0.2917, 0.1193, 0.0850, 0.0678, 0.0947, 0.0488, 0.1300, 0.0980], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0055, 0.0045, 0.0044, 0.0057, 0.0044, 0.0055, 0.0060], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:26:54,419 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=11991.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:27:06,327 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.61 vs. limit=2.0 +2022-12-07 07:27:10,811 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12009.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:27:11,350 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.252e+02 2.977e+02 4.090e+02 5.341e+02 9.805e+02, threshold=8.180e+02, percent-clipped=2.0 +2022-12-07 07:27:41,933 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12045.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:27:50,958 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12055.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:27:57,855 INFO [train.py:873] (2/4) Epoch 2, batch 4500, loss[loss=0.2589, simple_loss=0.2488, pruned_loss=0.1345, over 14527.00 frames. ], tot_loss[loss=0.2357, simple_loss=0.2295, pruned_loss=0.121, over 2054972.16 frames. ], batch size: 34, lr: 3.03e-02, grad_scale: 8.0 +2022-12-07 07:27:58,758 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1395, 0.8847, 0.7860, 0.7035, 1.4615, 0.6903, 1.0605, 0.7117], + device='cuda:2'), covar=tensor([0.0382, 0.0511, 0.0264, 0.0826, 0.0269, 0.0430, 0.1103, 0.0716], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0020, 0.0022, 0.0020, 0.0020, 0.0023, 0.0018, 0.0020], + device='cuda:2'), out_proj_covar=tensor([4.4293e-05, 4.5724e-05, 4.3788e-05, 4.7285e-05, 4.1411e-05, 4.9795e-05, + 4.0563e-05, 4.3441e-05], device='cuda:2') +2022-12-07 07:28:02,086 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8722, 1.9041, 2.8029, 1.7516, 2.0413, 2.0199, 1.2041, 1.8737], + device='cuda:2'), covar=tensor([0.1064, 0.0809, 0.0582, 0.1018, 0.0505, 0.0878, 0.1948, 0.1175], + device='cuda:2'), in_proj_covar=tensor([0.0057, 0.0058, 0.0054, 0.0063, 0.0059, 0.0052, 0.0107, 0.0066], + device='cuda:2'), out_proj_covar=tensor([9.5981e-05, 9.4684e-05, 8.7572e-05, 9.7407e-05, 8.8692e-05, 8.6477e-05, + 1.6083e-04, 1.0807e-04], device='cuda:2') +2022-12-07 07:28:08,955 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=12076.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:28:38,200 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.235e+02 3.534e+02 4.689e+02 6.101e+02 2.207e+03, threshold=9.377e+02, percent-clipped=9.0 +2022-12-07 07:29:01,456 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12137.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:29:23,549 INFO [train.py:873] (2/4) Epoch 2, batch 4600, loss[loss=0.2342, simple_loss=0.2331, pruned_loss=0.1176, over 14028.00 frames. ], tot_loss[loss=0.2369, simple_loss=0.2299, pruned_loss=0.122, over 2008515.32 frames. ], batch size: 26, lr: 3.02e-02, grad_scale: 8.0 +2022-12-07 07:29:31,592 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12172.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:29:45,271 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12188.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:29:57,584 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12202.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:30:04,545 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.122e+02 2.710e+02 3.814e+02 4.807e+02 1.336e+03, threshold=7.627e+02, percent-clipped=4.0 +2022-12-07 07:30:27,144 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12236.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:30:50,578 INFO [train.py:873] (2/4) Epoch 2, batch 4700, loss[loss=0.2264, simple_loss=0.2199, pruned_loss=0.1164, over 9501.00 frames. ], tot_loss[loss=0.2351, simple_loss=0.229, pruned_loss=0.1206, over 2070952.25 frames. ], batch size: 100, lr: 3.01e-02, grad_scale: 8.0 +2022-12-07 07:31:01,590 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12276.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:31:02,403 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1936, 2.8319, 2.8667, 3.2149, 3.1051, 3.2483, 3.2687, 2.7535], + device='cuda:2'), covar=tensor([0.0339, 0.0979, 0.0388, 0.0378, 0.0545, 0.0339, 0.0581, 0.0585], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0141, 0.0089, 0.0084, 0.0094, 0.0088, 0.0132, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:31:14,641 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12291.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:31:25,462 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12304.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:31:30,462 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.134e+02 2.849e+02 3.785e+02 5.259e+02 1.272e+03, threshold=7.569e+02, percent-clipped=8.0 +2022-12-07 07:31:55,991 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12339.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:31:56,785 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12340.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:32:03,617 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6487, 3.3030, 3.7274, 3.1895, 3.5946, 3.4452, 1.1705, 3.5056], + device='cuda:2'), covar=tensor([0.0185, 0.0380, 0.0357, 0.0388, 0.0290, 0.0357, 0.3294, 0.0223], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0097, 0.0096, 0.0072, 0.0123, 0.0082, 0.0134, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 07:32:04,505 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5280, 1.2498, 1.4039, 1.3115, 1.0792, 1.3553, 0.9871, 1.0137], + device='cuda:2'), covar=tensor([0.1865, 0.1278, 0.1831, 0.0771, 0.0983, 0.0479, 0.0959, 0.1280], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0057, 0.0046, 0.0047, 0.0057, 0.0044, 0.0056, 0.0065], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 07:32:09,912 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12355.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:32:14,491 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-07 07:32:16,668 INFO [train.py:873] (2/4) Epoch 2, batch 4800, loss[loss=0.202, simple_loss=0.2121, pruned_loss=0.09596, over 14643.00 frames. ], tot_loss[loss=0.2352, simple_loss=0.2283, pruned_loss=0.1211, over 1941076.41 frames. ], batch size: 23, lr: 3.00e-02, grad_scale: 8.0 +2022-12-07 07:32:22,999 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=12.64 vs. limit=5.0 +2022-12-07 07:32:50,359 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2564, 3.8146, 3.6925, 4.2239, 4.1669, 3.9274, 4.2821, 3.7369], + device='cuda:2'), covar=tensor([0.0411, 0.0924, 0.0370, 0.0411, 0.0589, 0.0648, 0.0568, 0.0481], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0144, 0.0090, 0.0084, 0.0094, 0.0091, 0.0132, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:32:51,097 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12403.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:32:57,419 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.788e+02 3.011e+02 4.103e+02 5.225e+02 9.734e+02, threshold=8.206e+02, percent-clipped=3.0 +2022-12-07 07:33:15,552 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12432.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:33:20,932 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7621, 2.6962, 3.7834, 3.1074, 3.5833, 3.5562, 3.6166, 3.0881], + device='cuda:2'), covar=tensor([0.0074, 0.1096, 0.0036, 0.0399, 0.0145, 0.0173, 0.0369, 0.0835], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0301, 0.0122, 0.0227, 0.0161, 0.0186, 0.0176, 0.0333], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 07:33:27,429 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=12446.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:33:42,230 INFO [train.py:873] (2/4) Epoch 2, batch 4900, loss[loss=0.1988, simple_loss=0.1773, pruned_loss=0.1102, over 1230.00 frames. ], tot_loss[loss=0.2368, simple_loss=0.2291, pruned_loss=0.1222, over 1938592.92 frames. ], batch size: 100, lr: 2.99e-02, grad_scale: 8.0 +2022-12-07 07:33:49,638 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12472.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:34:15,548 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12502.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:34:19,790 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12507.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:34:21,912 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.645e+02 3.534e+02 4.435e+02 5.863e+02 9.969e+02, threshold=8.870e+02, percent-clipped=5.0 +2022-12-07 07:34:25,796 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 07:34:30,741 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12520.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:34:56,805 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12550.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:35:08,167 INFO [train.py:873] (2/4) Epoch 2, batch 5000, loss[loss=0.2475, simple_loss=0.239, pruned_loss=0.128, over 14344.00 frames. ], tot_loss[loss=0.2356, simple_loss=0.2287, pruned_loss=0.1213, over 1891834.83 frames. ], batch size: 66, lr: 2.98e-02, grad_scale: 8.0 +2022-12-07 07:35:17,151 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-07 07:35:19,769 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12576.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:35:31,859 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5326, 1.5987, 1.6387, 1.0122, 1.2707, 1.4693, 1.5915, 1.5340], + device='cuda:2'), covar=tensor([0.0191, 0.1656, 0.0711, 0.1197, 0.0724, 0.0510, 0.0478, 0.0470], + device='cuda:2'), in_proj_covar=tensor([0.0052, 0.0142, 0.0075, 0.0097, 0.0066, 0.0062, 0.0056, 0.0062], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0003, 0.0002, 0.0002, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:35:44,197 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12604.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:35:49,096 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.602e+02 3.073e+02 4.044e+02 5.211e+02 8.552e+02, threshold=8.087e+02, percent-clipped=0.0 +2022-12-07 07:36:01,966 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12624.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:36:15,423 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12640.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:36:25,607 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12652.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:36:35,311 INFO [train.py:873] (2/4) Epoch 2, batch 5100, loss[loss=0.2299, simple_loss=0.2271, pruned_loss=0.1164, over 14249.00 frames. ], tot_loss[loss=0.2345, simple_loss=0.2275, pruned_loss=0.1208, over 1941109.58 frames. ], batch size: 63, lr: 2.97e-02, grad_scale: 8.0 +2022-12-07 07:36:56,665 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12688.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:37:16,599 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.397e+02 3.113e+02 4.447e+02 5.356e+02 1.001e+03, threshold=8.894e+02, percent-clipped=4.0 +2022-12-07 07:37:24,686 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4589, 3.2342, 3.3931, 3.1414, 3.3189, 3.2924, 1.3535, 3.2488], + device='cuda:2'), covar=tensor([0.0291, 0.0478, 0.0596, 0.0573, 0.0527, 0.0509, 0.4094, 0.0361], + device='cuda:2'), in_proj_covar=tensor([0.0097, 0.0098, 0.0096, 0.0075, 0.0129, 0.0086, 0.0138, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 07:37:34,920 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=12732.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:37:54,043 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0798, 2.5040, 3.5321, 3.4029, 3.2237, 2.2186, 3.2884, 2.4606], + device='cuda:2'), covar=tensor([0.0086, 0.0156, 0.0131, 0.0113, 0.0078, 0.0349, 0.0036, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0130, 0.0125, 0.0124, 0.0113, 0.0170, 0.0079, 0.0162], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 07:37:54,851 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3162, 1.6331, 1.6499, 1.8440, 1.3899, 1.9648, 1.7897, 1.2984], + device='cuda:2'), covar=tensor([0.3381, 0.1421, 0.2645, 0.1742, 0.1021, 0.0799, 0.0755, 0.1519], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0061, 0.0049, 0.0048, 0.0061, 0.0045, 0.0055, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 07:38:01,732 INFO [train.py:873] (2/4) Epoch 2, batch 5200, loss[loss=0.2892, simple_loss=0.2527, pruned_loss=0.1628, over 8597.00 frames. ], tot_loss[loss=0.236, simple_loss=0.229, pruned_loss=0.1215, over 2003840.97 frames. ], batch size: 100, lr: 2.96e-02, grad_scale: 8.0 +2022-12-07 07:38:01,918 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=12763.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:38:16,884 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=12780.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:38:35,754 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=12802.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:38:43,163 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.389e+02 3.055e+02 3.810e+02 5.193e+02 9.032e+02, threshold=7.619e+02, percent-clipped=1.0 +2022-12-07 07:38:54,870 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=12824.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 07:38:55,063 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.11 vs. limit=5.0 +2022-12-07 07:39:00,527 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2399, 3.6488, 4.3477, 3.4880, 4.0895, 4.2537, 1.6442, 4.0260], + device='cuda:2'), covar=tensor([0.0181, 0.0324, 0.0317, 0.0397, 0.0328, 0.0170, 0.3076, 0.0230], + device='cuda:2'), in_proj_covar=tensor([0.0097, 0.0099, 0.0096, 0.0077, 0.0131, 0.0087, 0.0138, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 07:39:28,088 INFO [train.py:873] (2/4) Epoch 2, batch 5300, loss[loss=0.2259, simple_loss=0.1882, pruned_loss=0.1318, over 1258.00 frames. ], tot_loss[loss=0.2351, simple_loss=0.2282, pruned_loss=0.121, over 1951571.84 frames. ], batch size: 100, lr: 2.95e-02, grad_scale: 8.0 +2022-12-07 07:39:48,631 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7790, 1.5328, 3.4601, 1.5819, 3.7949, 3.7340, 2.7501, 3.8609], + device='cuda:2'), covar=tensor([0.0218, 0.2813, 0.0375, 0.2498, 0.0289, 0.0287, 0.0682, 0.0272], + device='cuda:2'), in_proj_covar=tensor([0.0097, 0.0132, 0.0086, 0.0142, 0.0105, 0.0090, 0.0081, 0.0084], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:40:09,055 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.807e+01 2.959e+02 3.875e+02 4.793e+02 1.192e+03, threshold=7.749e+02, percent-clipped=2.0 +2022-12-07 07:40:17,740 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6390, 1.2447, 2.0237, 1.7496, 2.0254, 1.8928, 1.3904, 2.0060], + device='cuda:2'), covar=tensor([0.0365, 0.0809, 0.0109, 0.0276, 0.0111, 0.0130, 0.0361, 0.0116], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0129, 0.0068, 0.0091, 0.0077, 0.0080, 0.0066, 0.0066], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:40:18,614 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6705, 1.3992, 3.0814, 2.8027, 3.0503, 2.9295, 2.3712, 3.1094], + device='cuda:2'), covar=tensor([0.1390, 0.1499, 0.0108, 0.0233, 0.0137, 0.0145, 0.0298, 0.0128], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0128, 0.0068, 0.0091, 0.0077, 0.0080, 0.0066, 0.0066], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 07:40:35,815 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9139, 1.5597, 1.9397, 1.8052, 1.8709, 1.9651, 1.9655, 1.6360], + device='cuda:2'), covar=tensor([0.1109, 0.2421, 0.1276, 0.1440, 0.1340, 0.1066, 0.1628, 0.1802], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0156, 0.0099, 0.0093, 0.0101, 0.0096, 0.0139, 0.0115], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:40:53,785 INFO [train.py:873] (2/4) Epoch 2, batch 5400, loss[loss=0.2293, simple_loss=0.2094, pruned_loss=0.1246, over 5975.00 frames. ], tot_loss[loss=0.2331, simple_loss=0.2271, pruned_loss=0.1196, over 1938558.26 frames. ], batch size: 100, lr: 2.94e-02, grad_scale: 8.0 +2022-12-07 07:41:35,174 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.219e+02 3.363e+02 4.433e+02 5.944e+02 1.364e+03, threshold=8.866e+02, percent-clipped=4.0 +2022-12-07 07:42:21,076 INFO [train.py:873] (2/4) Epoch 2, batch 5500, loss[loss=0.2498, simple_loss=0.2233, pruned_loss=0.1382, over 5973.00 frames. ], tot_loss[loss=0.2307, simple_loss=0.2259, pruned_loss=0.1177, over 1983929.89 frames. ], batch size: 100, lr: 2.93e-02, grad_scale: 8.0 +2022-12-07 07:42:21,311 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7649, 2.2487, 3.1554, 3.0797, 2.7351, 2.0807, 2.8541, 2.3162], + device='cuda:2'), covar=tensor([0.0067, 0.0127, 0.0093, 0.0077, 0.0070, 0.0273, 0.0037, 0.0223], + device='cuda:2'), in_proj_covar=tensor([0.0108, 0.0136, 0.0132, 0.0127, 0.0118, 0.0178, 0.0083, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 07:42:54,052 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.53 vs. limit=2.0 +2022-12-07 07:42:54,587 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=13102.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:43:01,929 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.454e+02 3.268e+02 4.131e+02 4.754e+02 9.036e+02, threshold=8.263e+02, percent-clipped=2.0 +2022-12-07 07:43:08,790 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=13119.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 07:43:21,025 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1106, 1.7763, 1.9842, 2.1059, 2.0772, 2.0769, 2.1281, 1.8973], + device='cuda:2'), covar=tensor([0.0493, 0.1289, 0.0566, 0.0552, 0.0705, 0.0581, 0.0740, 0.0779], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0151, 0.0097, 0.0089, 0.0093, 0.0091, 0.0135, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:43:22,674 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1734, 2.0613, 4.3411, 3.1469, 4.2686, 2.3134, 3.2310, 3.6117], + device='cuda:2'), covar=tensor([0.0172, 0.3634, 0.0122, 0.4978, 0.0091, 0.2295, 0.0603, 0.0188], + device='cuda:2'), in_proj_covar=tensor([0.0197, 0.0309, 0.0157, 0.0408, 0.0139, 0.0311, 0.0255, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0004, 0.0001, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:43:31,138 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2134, 3.8373, 4.3104, 3.6276, 4.0374, 4.2236, 1.4271, 4.1116], + device='cuda:2'), covar=tensor([0.0149, 0.0255, 0.0275, 0.0282, 0.0258, 0.0170, 0.2998, 0.0183], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0102, 0.0097, 0.0078, 0.0132, 0.0090, 0.0142, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 07:43:35,174 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=13150.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:43:36,172 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3219, 2.2678, 1.7991, 2.5056, 2.4085, 2.6594, 2.2847, 1.9560], + device='cuda:2'), covar=tensor([0.0160, 0.0233, 0.1135, 0.0093, 0.0163, 0.0148, 0.0297, 0.1227], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0217, 0.0326, 0.0147, 0.0156, 0.0155, 0.0203, 0.0341], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:43:46,161 INFO [train.py:873] (2/4) Epoch 2, batch 5600, loss[loss=0.2043, simple_loss=0.2129, pruned_loss=0.09781, over 14365.00 frames. ], tot_loss[loss=0.2329, simple_loss=0.2275, pruned_loss=0.1192, over 2019413.27 frames. ], batch size: 55, lr: 2.92e-02, grad_scale: 8.0 +2022-12-07 07:44:28,274 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.372e+02 3.284e+02 4.567e+02 6.071e+02 1.505e+03, threshold=9.133e+02, percent-clipped=13.0 +2022-12-07 07:45:14,446 INFO [train.py:873] (2/4) Epoch 2, batch 5700, loss[loss=0.2412, simple_loss=0.2385, pruned_loss=0.122, over 14114.00 frames. ], tot_loss[loss=0.2341, simple_loss=0.2275, pruned_loss=0.1204, over 1957646.73 frames. ], batch size: 29, lr: 2.91e-02, grad_scale: 8.0 +2022-12-07 07:45:29,927 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-12-07 07:45:30,793 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-12-07 07:45:55,408 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.305e+02 2.840e+02 3.959e+02 4.964e+02 1.183e+03, threshold=7.919e+02, percent-clipped=6.0 +2022-12-07 07:45:55,557 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4091, 4.8142, 4.7517, 5.3345, 4.9819, 4.6843, 5.3890, 4.5641], + device='cuda:2'), covar=tensor([0.0279, 0.0885, 0.0304, 0.0416, 0.0702, 0.0293, 0.0440, 0.0459], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0151, 0.0099, 0.0089, 0.0097, 0.0091, 0.0140, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:45:56,056 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.00 vs. limit=2.0 +2022-12-07 07:46:40,289 INFO [train.py:873] (2/4) Epoch 2, batch 5800, loss[loss=0.2386, simple_loss=0.2232, pruned_loss=0.127, over 11962.00 frames. ], tot_loss[loss=0.2319, simple_loss=0.2261, pruned_loss=0.1188, over 1921697.78 frames. ], batch size: 100, lr: 2.90e-02, grad_scale: 8.0 +2022-12-07 07:47:21,674 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.414e+02 3.070e+02 4.104e+02 5.303e+02 9.108e+02, threshold=8.208e+02, percent-clipped=3.0 +2022-12-07 07:47:29,381 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=13419.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:47:30,271 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6796, 2.1533, 1.8396, 2.1546, 2.3584, 2.3733, 2.7923, 2.0410], + device='cuda:2'), covar=tensor([0.0613, 0.4418, 0.0939, 0.1469, 0.0912, 0.1437, 0.0854, 0.0612], + device='cuda:2'), in_proj_covar=tensor([0.0060, 0.0155, 0.0078, 0.0103, 0.0071, 0.0067, 0.0059, 0.0069], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0003, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 07:47:57,159 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-12-07 07:47:58,234 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.54 vs. limit=5.0 +2022-12-07 07:48:07,861 INFO [train.py:873] (2/4) Epoch 2, batch 5900, loss[loss=0.2242, simple_loss=0.2278, pruned_loss=0.1103, over 14265.00 frames. ], tot_loss[loss=0.2322, simple_loss=0.2265, pruned_loss=0.119, over 2006687.07 frames. ], batch size: 76, lr: 2.89e-02, grad_scale: 8.0 +2022-12-07 07:48:11,326 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=13467.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:48:33,423 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-12-07 07:48:39,625 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.38 vs. limit=2.0 +2022-12-07 07:48:49,280 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.385e+02 3.527e+02 4.364e+02 5.819e+02 1.292e+03, threshold=8.728e+02, percent-clipped=7.0 +2022-12-07 07:48:52,267 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.17 vs. limit=2.0 +2022-12-07 07:49:02,504 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8780, 1.3815, 2.0843, 1.2921, 1.9855, 2.0475, 1.3302, 1.9537], + device='cuda:2'), covar=tensor([0.0133, 0.0909, 0.0127, 0.1156, 0.0165, 0.0228, 0.0611, 0.0162], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0137, 0.0090, 0.0148, 0.0113, 0.0098, 0.0086, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:49:33,862 INFO [train.py:873] (2/4) Epoch 2, batch 6000, loss[loss=0.2171, simple_loss=0.2235, pruned_loss=0.1053, over 14234.00 frames. ], tot_loss[loss=0.2321, simple_loss=0.2261, pruned_loss=0.119, over 1950410.32 frames. ], batch size: 94, lr: 2.88e-02, grad_scale: 8.0 +2022-12-07 07:49:33,862 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 07:49:42,175 INFO [train.py:905] (2/4) Epoch 2, validation: loss=0.139, simple_loss=0.1792, pruned_loss=0.04941, over 857387.00 frames. +2022-12-07 07:49:42,176 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 07:50:21,910 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1556, 0.9110, 0.5326, 1.1137, 1.0639, 0.3122, 0.8377, 1.0023], + device='cuda:2'), covar=tensor([0.0146, 0.0236, 0.0097, 0.0159, 0.0188, 0.0150, 0.0238, 0.0142], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0019, 0.0020, 0.0017, 0.0018, 0.0022, 0.0016, 0.0018], + device='cuda:2'), out_proj_covar=tensor([4.5903e-05, 4.7441e-05, 4.3644e-05, 4.3794e-05, 4.3538e-05, 4.8738e-05, + 4.2052e-05, 4.2751e-05], device='cuda:2') +2022-12-07 07:50:24,252 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.694e+02 3.403e+02 4.162e+02 5.697e+02 1.072e+03, threshold=8.324e+02, percent-clipped=3.0 +2022-12-07 07:50:27,184 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2483, 2.1002, 3.6836, 3.2928, 2.9538, 2.0614, 3.6741, 2.4090], + device='cuda:2'), covar=tensor([0.0104, 0.0267, 0.0154, 0.0160, 0.0143, 0.0463, 0.0035, 0.0397], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0135, 0.0139, 0.0131, 0.0117, 0.0179, 0.0081, 0.0167], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 07:50:39,315 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.02 vs. limit=2.0 +2022-12-07 07:51:07,797 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2132, 1.2533, 0.7892, 0.9355, 1.1607, 0.9752, 1.3653, 0.7590], + device='cuda:2'), covar=tensor([0.0318, 0.0809, 0.0738, 0.0621, 0.0358, 0.0247, 0.0175, 0.0683], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0014, 0.0015, 0.0014, 0.0017, 0.0014, 0.0016], + device='cuda:2'), out_proj_covar=tensor([2.6569e-05, 2.7947e-05, 3.3010e-05, 3.0874e-05, 3.0682e-05, 3.3692e-05, + 3.6036e-05, 3.4083e-05], device='cuda:2') +2022-12-07 07:51:09,281 INFO [train.py:873] (2/4) Epoch 2, batch 6100, loss[loss=0.2575, simple_loss=0.2404, pruned_loss=0.1373, over 10356.00 frames. ], tot_loss[loss=0.2323, simple_loss=0.2265, pruned_loss=0.1191, over 1977553.69 frames. ], batch size: 100, lr: 2.87e-02, grad_scale: 8.0 +2022-12-07 07:51:50,708 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.181e+02 3.186e+02 4.052e+02 4.953e+02 1.165e+03, threshold=8.105e+02, percent-clipped=5.0 +2022-12-07 07:52:13,609 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=13737.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:52:33,219 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6181, 1.9997, 1.7871, 1.8320, 1.5526, 1.8952, 1.9794, 1.2423], + device='cuda:2'), covar=tensor([0.4242, 0.1504, 0.3629, 0.2108, 0.1068, 0.0753, 0.1123, 0.2316], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0062, 0.0049, 0.0053, 0.0062, 0.0047, 0.0064, 0.0078], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:52:34,941 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3049, 1.8628, 4.2419, 2.9906, 4.2511, 1.9778, 3.4882, 3.9427], + device='cuda:2'), covar=tensor([0.0154, 0.3591, 0.0105, 0.4611, 0.0066, 0.2525, 0.0433, 0.0100], + device='cuda:2'), in_proj_covar=tensor([0.0209, 0.0315, 0.0162, 0.0410, 0.0147, 0.0326, 0.0267, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0004, 0.0001, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 07:52:35,526 INFO [train.py:873] (2/4) Epoch 2, batch 6200, loss[loss=0.2575, simple_loss=0.245, pruned_loss=0.135, over 14413.00 frames. ], tot_loss[loss=0.2326, simple_loss=0.2266, pruned_loss=0.1193, over 1992679.66 frames. ], batch size: 41, lr: 2.86e-02, grad_scale: 8.0 +2022-12-07 07:53:05,534 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=13798.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 07:53:17,272 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.95 vs. limit=2.0 +2022-12-07 07:53:17,359 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.626e+02 2.778e+02 3.775e+02 5.188e+02 1.088e+03, threshold=7.550e+02, percent-clipped=5.0 +2022-12-07 07:53:19,931 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7622, 1.6519, 4.4243, 4.1348, 4.1533, 4.5233, 4.1730, 4.6559], + device='cuda:2'), covar=tensor([0.1620, 0.1563, 0.0108, 0.0107, 0.0121, 0.0105, 0.0138, 0.0083], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0132, 0.0070, 0.0094, 0.0080, 0.0085, 0.0066, 0.0068], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 07:53:34,605 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.38 vs. limit=2.0 +2022-12-07 07:54:02,990 INFO [train.py:873] (2/4) Epoch 2, batch 6300, loss[loss=0.2867, simple_loss=0.2544, pruned_loss=0.1595, over 8638.00 frames. ], tot_loss[loss=0.2306, simple_loss=0.2255, pruned_loss=0.1179, over 2005057.69 frames. ], batch size: 100, lr: 2.86e-02, grad_scale: 8.0 +2022-12-07 07:54:35,048 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.01 vs. limit=5.0 +2022-12-07 07:54:35,554 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7983, 4.8169, 4.4150, 5.0022, 4.6765, 3.7425, 5.0032, 5.0252], + device='cuda:2'), covar=tensor([0.0681, 0.0381, 0.0634, 0.0500, 0.0538, 0.0561, 0.0525, 0.0555], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0073, 0.0096, 0.0092, 0.0100, 0.0065, 0.0091, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:54:43,893 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.964e+01 3.256e+02 4.500e+02 5.561e+02 1.186e+03, threshold=8.999e+02, percent-clipped=4.0 +2022-12-07 07:54:47,573 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7154, 2.7158, 2.4595, 2.8196, 2.4540, 2.2825, 2.7893, 2.8559], + device='cuda:2'), covar=tensor([0.0938, 0.0622, 0.0901, 0.0865, 0.1026, 0.0841, 0.0771, 0.0791], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0074, 0.0098, 0.0095, 0.0102, 0.0067, 0.0093, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:55:13,529 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.67 vs. limit=2.0 +2022-12-07 07:55:29,728 INFO [train.py:873] (2/4) Epoch 2, batch 6400, loss[loss=0.2353, simple_loss=0.2315, pruned_loss=0.1195, over 14178.00 frames. ], tot_loss[loss=0.23, simple_loss=0.2252, pruned_loss=0.1174, over 1975838.77 frames. ], batch size: 84, lr: 2.85e-02, grad_scale: 8.0 +2022-12-07 07:55:44,685 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8523, 2.0735, 3.2986, 3.1247, 2.9068, 2.2929, 3.1310, 2.2170], + device='cuda:2'), covar=tensor([0.0056, 0.0166, 0.0102, 0.0080, 0.0058, 0.0244, 0.0036, 0.0242], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0140, 0.0142, 0.0134, 0.0121, 0.0182, 0.0085, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 07:55:53,904 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9421, 1.7261, 2.3537, 2.4333, 1.9248, 1.6930, 2.2413, 1.9387], + device='cuda:2'), covar=tensor([0.0039, 0.0055, 0.0035, 0.0036, 0.0043, 0.0140, 0.0034, 0.0057], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0140, 0.0142, 0.0134, 0.0121, 0.0183, 0.0085, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 07:56:11,802 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.544e+02 3.193e+02 4.162e+02 5.343e+02 1.659e+03, threshold=8.325e+02, percent-clipped=5.0 +2022-12-07 07:56:57,034 INFO [train.py:873] (2/4) Epoch 2, batch 6500, loss[loss=0.2425, simple_loss=0.2275, pruned_loss=0.1287, over 14252.00 frames. ], tot_loss[loss=0.2304, simple_loss=0.225, pruned_loss=0.1179, over 1918517.90 frames. ], batch size: 63, lr: 2.84e-02, grad_scale: 8.0 +2022-12-07 07:57:04,668 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14072.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:57:09,193 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7922, 0.6272, 0.4245, 1.0469, 0.5714, 1.2857, 1.1643, 1.1148], + device='cuda:2'), covar=tensor([0.0981, 0.3053, 0.1636, 0.1983, 0.1964, 0.0211, 0.0274, 0.3084], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0014, 0.0014, 0.0013, 0.0015, 0.0013, 0.0015], + device='cuda:2'), out_proj_covar=tensor([2.6581e-05, 2.8218e-05, 3.2915e-05, 3.0073e-05, 3.0197e-05, 3.2795e-05, + 3.5054e-05, 3.3730e-05], device='cuda:2') +2022-12-07 07:57:23,634 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14093.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 07:57:26,716 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-12-07 07:57:31,984 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14103.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:57:36,082 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3635, 1.4472, 1.4340, 1.1616, 1.3161, 1.4948, 1.1657, 1.1492], + device='cuda:2'), covar=tensor([0.2414, 0.1456, 0.0414, 0.0431, 0.0923, 0.0548, 0.1841, 0.1274], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0061, 0.0048, 0.0054, 0.0062, 0.0047, 0.0063, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:57:38,491 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.039e+02 3.036e+02 3.911e+02 5.054e+02 1.167e+03, threshold=7.822e+02, percent-clipped=3.0 +2022-12-07 07:57:42,450 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4872, 1.8108, 4.0704, 1.8084, 4.0233, 4.1765, 3.6068, 4.7132], + device='cuda:2'), covar=tensor([0.0133, 0.2417, 0.0263, 0.2139, 0.0229, 0.0216, 0.0389, 0.0113], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0140, 0.0092, 0.0148, 0.0112, 0.0098, 0.0088, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:57:46,737 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14120.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 07:57:58,294 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14133.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:58:23,889 INFO [train.py:873] (2/4) Epoch 2, batch 6600, loss[loss=0.1998, simple_loss=0.1996, pruned_loss=0.1, over 3852.00 frames. ], tot_loss[loss=0.2282, simple_loss=0.224, pruned_loss=0.1162, over 1928928.97 frames. ], batch size: 100, lr: 2.83e-02, grad_scale: 8.0 +2022-12-07 07:58:24,889 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14164.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 07:58:30,347 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8252, 1.2995, 2.0528, 1.2094, 1.9789, 2.0547, 1.5576, 1.9748], + device='cuda:2'), covar=tensor([0.0207, 0.1242, 0.0184, 0.1345, 0.0222, 0.0262, 0.0564, 0.0296], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0136, 0.0090, 0.0146, 0.0109, 0.0096, 0.0086, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 07:58:39,681 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14181.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 07:58:44,895 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-12-07 07:58:47,702 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0952, 1.8459, 1.9776, 2.0632, 2.0287, 2.0372, 2.1492, 1.7468], + device='cuda:2'), covar=tensor([0.0439, 0.0942, 0.0423, 0.0553, 0.0655, 0.0457, 0.0555, 0.0686], + device='cuda:2'), in_proj_covar=tensor([0.0094, 0.0161, 0.0101, 0.0099, 0.0100, 0.0096, 0.0149, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 07:59:06,278 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.319e+02 2.900e+02 3.866e+02 4.857e+02 9.306e+02, threshold=7.732e+02, percent-clipped=4.0 +2022-12-07 07:59:22,746 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.39 vs. limit=5.0 +2022-12-07 07:59:37,429 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7206, 3.3634, 3.7086, 3.3339, 3.5270, 3.5311, 1.4093, 3.5195], + device='cuda:2'), covar=tensor([0.0176, 0.0332, 0.0367, 0.0305, 0.0278, 0.0281, 0.3004, 0.0218], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0105, 0.0099, 0.0081, 0.0134, 0.0091, 0.0140, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 07:59:39,055 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3275, 3.9892, 4.1491, 3.8615, 4.1502, 4.2782, 1.6208, 4.0176], + device='cuda:2'), covar=tensor([0.0193, 0.0364, 0.0555, 0.0341, 0.0340, 0.0188, 0.3281, 0.0266], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0105, 0.0099, 0.0081, 0.0134, 0.0091, 0.0140, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 07:59:51,416 INFO [train.py:873] (2/4) Epoch 2, batch 6700, loss[loss=0.203, simple_loss=0.2062, pruned_loss=0.09989, over 13881.00 frames. ], tot_loss[loss=0.2267, simple_loss=0.2227, pruned_loss=0.1153, over 1860099.51 frames. ], batch size: 23, lr: 2.82e-02, grad_scale: 8.0 +2022-12-07 08:00:13,935 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9884, 2.0134, 1.9118, 1.9984, 2.0199, 1.9321, 1.2051, 1.8818], + device='cuda:2'), covar=tensor([0.0288, 0.0321, 0.0424, 0.0212, 0.0387, 0.0445, 0.1397, 0.0383], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0104, 0.0100, 0.0082, 0.0134, 0.0091, 0.0141, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 08:00:15,957 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0280, 1.5893, 3.9783, 3.6759, 3.7288, 3.8476, 3.0696, 4.0886], + device='cuda:2'), covar=tensor([0.1207, 0.1485, 0.0074, 0.0139, 0.0120, 0.0102, 0.0223, 0.0068], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0134, 0.0071, 0.0095, 0.0079, 0.0086, 0.0066, 0.0068], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 08:00:23,065 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-12-07 08:00:32,373 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.313e+02 3.050e+02 4.021e+02 5.480e+02 1.335e+03, threshold=8.042e+02, percent-clipped=5.0 +2022-12-07 08:01:01,607 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8726, 1.3559, 3.5151, 1.7700, 3.7146, 3.4571, 2.7493, 4.0248], + device='cuda:2'), covar=tensor([0.0133, 0.2603, 0.0292, 0.2006, 0.0284, 0.0335, 0.0587, 0.0121], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0139, 0.0093, 0.0148, 0.0111, 0.0097, 0.0087, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:01:18,008 INFO [train.py:873] (2/4) Epoch 2, batch 6800, loss[loss=0.2126, simple_loss=0.2126, pruned_loss=0.1064, over 14001.00 frames. ], tot_loss[loss=0.228, simple_loss=0.2236, pruned_loss=0.1162, over 1909635.50 frames. ], batch size: 19, lr: 2.81e-02, grad_scale: 8.0 +2022-12-07 08:01:43,662 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=14393.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:01:59,490 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.256e+02 3.284e+02 4.234e+02 5.689e+02 1.698e+03, threshold=8.469e+02, percent-clipped=9.0 +2022-12-07 08:02:01,514 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14413.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:02:14,257 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14428.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:02:25,190 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=14441.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:02:29,734 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0691, 2.0203, 1.7954, 1.7507, 1.9849, 1.9318, 2.0130, 2.0585], + device='cuda:2'), covar=tensor([0.0563, 0.0812, 0.1557, 0.1904, 0.0620, 0.0703, 0.1041, 0.0722], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0162, 0.0206, 0.0246, 0.0174, 0.0192, 0.0206, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 08:02:30,275 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.29 vs. limit=5.0 +2022-12-07 08:02:41,218 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14459.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:02:44,541 INFO [train.py:873] (2/4) Epoch 2, batch 6900, loss[loss=0.2148, simple_loss=0.1785, pruned_loss=0.1256, over 1233.00 frames. ], tot_loss[loss=0.2302, simple_loss=0.2248, pruned_loss=0.1178, over 1936801.14 frames. ], batch size: 100, lr: 2.80e-02, grad_scale: 8.0 +2022-12-07 08:02:46,624 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.61 vs. limit=2.0 +2022-12-07 08:02:46,740 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.59 vs. limit=2.0 +2022-12-07 08:02:53,953 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14474.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:02:55,536 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14476.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 08:03:05,406 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14487.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:03:25,507 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.414e+02 3.007e+02 4.167e+02 5.148e+02 1.275e+03, threshold=8.334e+02, percent-clipped=3.0 +2022-12-07 08:03:57,811 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14548.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:04:11,092 INFO [train.py:873] (2/4) Epoch 2, batch 7000, loss[loss=0.2322, simple_loss=0.213, pruned_loss=0.1257, over 7798.00 frames. ], tot_loss[loss=0.2297, simple_loss=0.2244, pruned_loss=0.1174, over 1887575.19 frames. ], batch size: 100, lr: 2.79e-02, grad_scale: 8.0 +2022-12-07 08:04:25,834 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.56 vs. limit=5.0 +2022-12-07 08:04:38,122 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7081, 1.3441, 3.7422, 3.5855, 3.6509, 3.7079, 3.2970, 3.9001], + device='cuda:2'), covar=tensor([0.1486, 0.1695, 0.0130, 0.0143, 0.0133, 0.0121, 0.0154, 0.0099], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0133, 0.0072, 0.0096, 0.0080, 0.0087, 0.0065, 0.0070], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:04:53,422 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.467e+02 3.149e+02 3.940e+02 5.259e+02 1.451e+03, threshold=7.881e+02, percent-clipped=6.0 +2022-12-07 08:05:37,945 INFO [train.py:873] (2/4) Epoch 2, batch 7100, loss[loss=0.2313, simple_loss=0.2328, pruned_loss=0.1149, over 14386.00 frames. ], tot_loss[loss=0.2287, simple_loss=0.2241, pruned_loss=0.1166, over 1878240.41 frames. ], batch size: 55, lr: 2.79e-02, grad_scale: 16.0 +2022-12-07 08:06:15,250 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-12-07 08:06:19,126 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.591e+01 3.151e+02 3.961e+02 5.034e+02 1.065e+03, threshold=7.923e+02, percent-clipped=3.0 +2022-12-07 08:06:34,651 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=14728.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:06:44,669 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=14740.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:07:00,754 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=14759.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:07:04,460 INFO [train.py:873] (2/4) Epoch 2, batch 7200, loss[loss=0.2007, simple_loss=0.1713, pruned_loss=0.1151, over 1178.00 frames. ], tot_loss[loss=0.2295, simple_loss=0.2246, pruned_loss=0.1172, over 1883318.18 frames. ], batch size: 100, lr: 2.78e-02, grad_scale: 16.0 +2022-12-07 08:07:09,582 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14769.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:07:15,466 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=14776.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:07:15,519 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=14776.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 08:07:37,086 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=14801.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:07:42,112 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=14807.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:07:45,740 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.424e+02 2.988e+02 3.907e+02 5.109e+02 1.521e+03, threshold=7.813e+02, percent-clipped=5.0 +2022-12-07 08:07:56,614 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=14824.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 08:08:11,930 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1864, 0.8583, 1.1752, 0.9774, 1.1888, 0.4090, 1.1005, 0.9762], + device='cuda:2'), covar=tensor([0.0398, 0.0255, 0.0182, 0.0390, 0.0229, 0.0183, 0.0171, 0.0251], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0017, 0.0019, 0.0016, 0.0017, 0.0021, 0.0014, 0.0015], + device='cuda:2'), out_proj_covar=tensor([4.3871e-05, 4.4724e-05, 4.4133e-05, 4.2759e-05, 4.1329e-05, 4.8569e-05, + 3.9272e-05, 3.8735e-05], device='cuda:2') +2022-12-07 08:08:12,944 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=14843.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:08:20,907 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9426, 1.8788, 1.7597, 2.0496, 1.6884, 1.8208, 1.9636, 2.0604], + device='cuda:2'), covar=tensor([0.0832, 0.0954, 0.1102, 0.0646, 0.1180, 0.0728, 0.0899, 0.0645], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0077, 0.0095, 0.0091, 0.0103, 0.0067, 0.0093, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:08:29,763 INFO [train.py:873] (2/4) Epoch 2, batch 7300, loss[loss=0.2615, simple_loss=0.2391, pruned_loss=0.1419, over 14008.00 frames. ], tot_loss[loss=0.2295, simple_loss=0.2242, pruned_loss=0.1174, over 1898058.06 frames. ], batch size: 26, lr: 2.77e-02, grad_scale: 16.0 +2022-12-07 08:08:30,233 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.40 vs. limit=2.0 +2022-12-07 08:08:32,617 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.38 vs. limit=2.0 +2022-12-07 08:08:57,906 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-12-07 08:09:06,795 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5821, 2.0111, 4.2178, 2.0512, 4.0756, 4.0702, 3.8418, 4.7933], + device='cuda:2'), covar=tensor([0.0127, 0.2129, 0.0225, 0.1939, 0.0219, 0.0242, 0.0246, 0.0087], + device='cuda:2'), in_proj_covar=tensor([0.0104, 0.0135, 0.0093, 0.0145, 0.0112, 0.0098, 0.0088, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:09:10,699 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.284e+02 2.806e+02 3.909e+02 5.179e+02 1.856e+03, threshold=7.818e+02, percent-clipped=6.0 +2022-12-07 08:09:39,280 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7045, 3.3211, 3.9079, 3.3802, 3.4513, 3.6427, 1.4795, 3.4988], + device='cuda:2'), covar=tensor([0.0178, 0.0331, 0.0266, 0.0366, 0.0315, 0.0268, 0.2748, 0.0222], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0104, 0.0099, 0.0081, 0.0137, 0.0092, 0.0146, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 08:09:55,278 INFO [train.py:873] (2/4) Epoch 2, batch 7400, loss[loss=0.1883, simple_loss=0.1729, pruned_loss=0.1019, over 2639.00 frames. ], tot_loss[loss=0.2273, simple_loss=0.223, pruned_loss=0.1158, over 1931679.93 frames. ], batch size: 100, lr: 2.76e-02, grad_scale: 16.0 +2022-12-07 08:10:39,741 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.247e+01 3.005e+02 3.840e+02 4.787e+02 1.067e+03, threshold=7.681e+02, percent-clipped=4.0 +2022-12-07 08:10:54,048 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-12-07 08:10:56,074 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0404, 2.8119, 2.3975, 1.6768, 1.8196, 2.5323, 2.9219, 1.8209], + device='cuda:2'), covar=tensor([0.0400, 0.2848, 0.0945, 0.2030, 0.0991, 0.0346, 0.1222, 0.1144], + device='cuda:2'), in_proj_covar=tensor([0.0064, 0.0171, 0.0083, 0.0111, 0.0068, 0.0070, 0.0064, 0.0085], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:10:58,549 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2212, 1.6486, 2.4601, 2.1218, 2.5338, 2.0107, 2.1870, 2.0950], + device='cuda:2'), covar=tensor([0.0070, 0.0421, 0.0035, 0.0117, 0.0052, 0.0078, 0.0064, 0.0255], + device='cuda:2'), in_proj_covar=tensor([0.0197, 0.0322, 0.0144, 0.0257, 0.0198, 0.0213, 0.0217, 0.0360], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:11:22,884 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0224, 0.9256, 0.7114, 1.1178, 0.9338, 0.3321, 1.0970, 0.8558], + device='cuda:2'), covar=tensor([0.0169, 0.0114, 0.0175, 0.0118, 0.0117, 0.0148, 0.0175, 0.0094], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0017, 0.0019, 0.0016, 0.0017, 0.0021, 0.0014, 0.0014], + device='cuda:2'), out_proj_covar=tensor([4.4829e-05, 4.4078e-05, 4.5287e-05, 4.4488e-05, 4.1290e-05, 5.0170e-05, + 3.9843e-05, 3.6616e-05], device='cuda:2') +2022-12-07 08:11:24,370 INFO [train.py:873] (2/4) Epoch 2, batch 7500, loss[loss=0.2226, simple_loss=0.2051, pruned_loss=0.1201, over 3921.00 frames. ], tot_loss[loss=0.228, simple_loss=0.2236, pruned_loss=0.1162, over 1961973.02 frames. ], batch size: 100, lr: 2.75e-02, grad_scale: 8.0 +2022-12-07 08:11:29,633 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=15069.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:11:30,114 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-12-07 08:11:32,197 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8337, 1.9183, 1.7521, 1.1961, 1.2905, 1.2954, 2.0816, 1.5754], + device='cuda:2'), covar=tensor([0.0573, 0.2584, 0.0922, 0.2234, 0.1088, 0.0708, 0.0808, 0.1362], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0179, 0.0087, 0.0117, 0.0073, 0.0073, 0.0066, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:11:52,715 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=15096.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:12:03,099 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.166e+01 2.953e+02 4.080e+02 5.440e+02 1.285e+03, threshold=8.159e+02, percent-clipped=6.0 +2022-12-07 08:12:05,497 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=15117.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:12:48,570 INFO [train.py:873] (2/4) Epoch 3, batch 0, loss[loss=0.2817, simple_loss=0.2565, pruned_loss=0.1535, over 6907.00 frames. ], tot_loss[loss=0.2817, simple_loss=0.2565, pruned_loss=0.1535, over 6907.00 frames. ], batch size: 100, lr: 2.61e-02, grad_scale: 8.0 +2022-12-07 08:12:48,570 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 08:12:55,591 INFO [train.py:905] (2/4) Epoch 3, validation: loss=0.159, simple_loss=0.1997, pruned_loss=0.05911, over 857387.00 frames. +2022-12-07 08:12:55,592 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 08:12:59,154 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1545, 4.7315, 4.6835, 5.1986, 5.0547, 4.3661, 5.2134, 4.4281], + device='cuda:2'), covar=tensor([0.0310, 0.0688, 0.0235, 0.0259, 0.0397, 0.0341, 0.0351, 0.0375], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0156, 0.0101, 0.0093, 0.0097, 0.0093, 0.0142, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:13:12,543 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=15143.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:13:33,459 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0910, 1.6197, 2.3130, 1.9766, 2.4825, 1.9413, 2.0813, 2.0470], + device='cuda:2'), covar=tensor([0.0096, 0.0340, 0.0040, 0.0152, 0.0050, 0.0112, 0.0066, 0.0204], + device='cuda:2'), in_proj_covar=tensor([0.0202, 0.0327, 0.0149, 0.0258, 0.0197, 0.0216, 0.0222, 0.0360], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:13:53,453 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=15191.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:13:57,007 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8267, 3.4220, 3.3619, 3.8378, 3.7572, 3.6385, 3.8426, 3.3300], + device='cuda:2'), covar=tensor([0.0368, 0.1024, 0.0356, 0.0395, 0.0540, 0.0579, 0.0500, 0.0535], + device='cuda:2'), in_proj_covar=tensor([0.0090, 0.0157, 0.0101, 0.0094, 0.0097, 0.0092, 0.0139, 0.0115], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:13:57,086 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=15195.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:13:59,541 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9550, 3.7638, 3.7134, 4.0804, 3.7746, 3.2410, 4.0341, 4.0284], + device='cuda:2'), covar=tensor([0.0708, 0.0588, 0.0646, 0.0607, 0.0800, 0.0476, 0.0620, 0.0684], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0076, 0.0098, 0.0091, 0.0102, 0.0065, 0.0092, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:14:11,334 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 4.526e+01 3.048e+02 3.683e+02 5.299e+02 1.167e+03, threshold=7.365e+02, percent-clipped=4.0 +2022-12-07 08:14:23,258 INFO [train.py:873] (2/4) Epoch 3, batch 100, loss[loss=0.2182, simple_loss=0.2174, pruned_loss=0.1096, over 14145.00 frames. ], tot_loss[loss=0.2242, simple_loss=0.2219, pruned_loss=0.1132, over 892792.47 frames. ], batch size: 99, lr: 2.60e-02, grad_scale: 8.0 +2022-12-07 08:14:28,270 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9792, 2.7877, 3.0196, 2.9352, 2.8166, 2.8314, 1.3228, 2.7037], + device='cuda:2'), covar=tensor([0.0208, 0.0330, 0.0311, 0.0226, 0.0297, 0.0440, 0.2168, 0.0280], + device='cuda:2'), in_proj_covar=tensor([0.0100, 0.0102, 0.0098, 0.0081, 0.0135, 0.0092, 0.0141, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 08:14:49,266 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=15256.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 08:15:12,761 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.44 vs. limit=2.0 +2022-12-07 08:15:38,765 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.471e+02 3.082e+02 4.015e+02 5.606e+02 9.270e+02, threshold=8.031e+02, percent-clipped=6.0 +2022-12-07 08:15:41,479 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4764, 4.1749, 4.0859, 4.7368, 4.3131, 3.7776, 4.7409, 4.6102], + device='cuda:2'), covar=tensor([0.0761, 0.0640, 0.0713, 0.0600, 0.0669, 0.0570, 0.0522, 0.0722], + device='cuda:2'), in_proj_covar=tensor([0.0097, 0.0077, 0.0099, 0.0092, 0.0104, 0.0066, 0.0092, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:15:49,798 INFO [train.py:873] (2/4) Epoch 3, batch 200, loss[loss=0.2157, simple_loss=0.2241, pruned_loss=0.1037, over 14629.00 frames. ], tot_loss[loss=0.2233, simple_loss=0.2208, pruned_loss=0.113, over 1276937.02 frames. ], batch size: 22, lr: 2.59e-02, grad_scale: 8.0 +2022-12-07 08:16:09,559 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3444, 1.7313, 1.5033, 1.6486, 1.2717, 1.5785, 1.6363, 0.9961], + device='cuda:2'), covar=tensor([0.3444, 0.1352, 0.3156, 0.1582, 0.1510, 0.1022, 0.1245, 0.2542], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0061, 0.0053, 0.0058, 0.0064, 0.0050, 0.0064, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:16:50,936 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=15396.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:17:04,619 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.006e+02 3.006e+02 3.776e+02 5.007e+02 1.251e+03, threshold=7.551e+02, percent-clipped=5.0 +2022-12-07 08:17:14,225 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-12-07 08:17:16,243 INFO [train.py:873] (2/4) Epoch 3, batch 300, loss[loss=0.2456, simple_loss=0.2428, pruned_loss=0.1241, over 14230.00 frames. ], tot_loss[loss=0.2203, simple_loss=0.2187, pruned_loss=0.1109, over 1487478.38 frames. ], batch size: 76, lr: 2.59e-02, grad_scale: 8.0 +2022-12-07 08:17:32,203 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=15444.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:17:39,436 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.70 vs. limit=5.0 +2022-12-07 08:17:45,022 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.48 vs. limit=5.0 +2022-12-07 08:17:48,141 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1497, 2.0595, 2.6200, 1.5053, 1.7343, 2.1556, 1.0507, 2.2440], + device='cuda:2'), covar=tensor([0.1106, 0.0724, 0.0449, 0.1487, 0.1186, 0.0844, 0.3451, 0.0744], + device='cuda:2'), in_proj_covar=tensor([0.0063, 0.0062, 0.0058, 0.0070, 0.0074, 0.0059, 0.0121, 0.0069], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 08:18:26,068 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.9369, 5.5315, 5.6052, 6.1385, 5.6995, 4.7681, 6.1748, 6.0569], + device='cuda:2'), covar=tensor([0.0618, 0.0432, 0.0424, 0.0372, 0.0580, 0.0276, 0.0488, 0.0407], + device='cuda:2'), in_proj_covar=tensor([0.0097, 0.0076, 0.0098, 0.0093, 0.0105, 0.0065, 0.0094, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:18:31,036 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.142e+02 2.693e+02 3.512e+02 4.941e+02 9.484e+02, threshold=7.024e+02, percent-clipped=2.0 +2022-12-07 08:18:37,465 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-07 08:18:41,946 INFO [train.py:873] (2/4) Epoch 3, batch 400, loss[loss=0.2386, simple_loss=0.2283, pruned_loss=0.1245, over 14312.00 frames. ], tot_loss[loss=0.2209, simple_loss=0.2187, pruned_loss=0.1115, over 1683760.53 frames. ], batch size: 31, lr: 2.58e-02, grad_scale: 8.0 +2022-12-07 08:19:01,708 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8188, 2.3939, 4.7451, 3.2344, 4.5289, 2.2515, 4.0454, 4.6350], + device='cuda:2'), covar=tensor([0.0122, 0.3778, 0.0123, 0.5981, 0.0112, 0.2911, 0.0541, 0.0110], + device='cuda:2'), in_proj_covar=tensor([0.0218, 0.0328, 0.0177, 0.0432, 0.0157, 0.0336, 0.0289, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0002, 0.0004, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 08:19:03,987 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=15551.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 08:19:13,340 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3568, 0.9973, 1.2864, 0.9590, 0.9544, 1.0272, 1.0609, 1.2088], + device='cuda:2'), covar=tensor([0.0145, 0.0499, 0.0288, 0.0373, 0.0589, 0.0395, 0.0347, 0.0287], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0192, 0.0091, 0.0122, 0.0079, 0.0076, 0.0068, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:19:49,096 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-12-07 08:19:56,021 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.354e+02 3.036e+02 3.993e+02 5.394e+02 1.334e+03, threshold=7.985e+02, percent-clipped=7.0 +2022-12-07 08:20:07,945 INFO [train.py:873] (2/4) Epoch 3, batch 500, loss[loss=0.2073, simple_loss=0.1913, pruned_loss=0.1117, over 3895.00 frames. ], tot_loss[loss=0.2219, simple_loss=0.2198, pruned_loss=0.112, over 1855415.84 frames. ], batch size: 100, lr: 2.57e-02, grad_scale: 8.0 +2022-12-07 08:20:18,304 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7684, 2.6877, 1.8920, 2.9347, 2.6482, 2.8291, 2.4710, 2.0675], + device='cuda:2'), covar=tensor([0.0151, 0.0432, 0.1566, 0.0097, 0.0234, 0.0190, 0.0435, 0.1944], + device='cuda:2'), in_proj_covar=tensor([0.0190, 0.0264, 0.0351, 0.0157, 0.0183, 0.0178, 0.0239, 0.0354], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0003, 0.0001, 0.0002, 0.0001, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:21:01,086 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.96 vs. limit=5.0 +2022-12-07 08:21:22,226 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.517e+02 2.935e+02 3.905e+02 4.916e+02 1.140e+03, threshold=7.811e+02, percent-clipped=3.0 +2022-12-07 08:21:32,702 INFO [train.py:873] (2/4) Epoch 3, batch 600, loss[loss=0.2584, simple_loss=0.2451, pruned_loss=0.1359, over 14301.00 frames. ], tot_loss[loss=0.2224, simple_loss=0.2202, pruned_loss=0.1123, over 1909975.40 frames. ], batch size: 35, lr: 2.56e-02, grad_scale: 8.0 +2022-12-07 08:21:35,404 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9739, 1.6824, 4.4471, 4.1109, 4.1083, 4.4906, 4.2032, 4.6361], + device='cuda:2'), covar=tensor([0.1205, 0.1385, 0.0073, 0.0090, 0.0102, 0.0078, 0.0079, 0.0064], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0134, 0.0073, 0.0102, 0.0083, 0.0089, 0.0067, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:22:10,347 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5501, 1.8736, 3.4496, 2.4891, 3.4557, 1.7205, 2.6698, 3.4713], + device='cuda:2'), covar=tensor([0.0283, 0.3728, 0.0178, 0.5363, 0.0172, 0.2897, 0.0792, 0.0142], + device='cuda:2'), in_proj_covar=tensor([0.0216, 0.0328, 0.0175, 0.0431, 0.0157, 0.0336, 0.0285, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0002, 0.0004, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 08:22:47,078 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.484e+02 3.132e+02 3.850e+02 5.057e+02 1.369e+03, threshold=7.700e+02, percent-clipped=5.0 +2022-12-07 08:22:58,911 INFO [train.py:873] (2/4) Epoch 3, batch 700, loss[loss=0.2503, simple_loss=0.2387, pruned_loss=0.131, over 9467.00 frames. ], tot_loss[loss=0.2217, simple_loss=0.22, pruned_loss=0.1117, over 1978348.14 frames. ], batch size: 100, lr: 2.56e-02, grad_scale: 8.0 +2022-12-07 08:23:20,475 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=15851.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:23:48,061 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=15883.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:23:52,421 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.90 vs. limit=5.0 +2022-12-07 08:23:54,227 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.74 vs. limit=2.0 +2022-12-07 08:24:01,536 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=15899.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:24:12,921 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.224e+02 3.132e+02 4.108e+02 5.472e+02 1.235e+03, threshold=8.216e+02, percent-clipped=5.0 +2022-12-07 08:24:16,385 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3228, 3.8905, 4.4331, 3.6315, 4.0898, 4.3707, 1.5937, 4.0494], + device='cuda:2'), covar=tensor([0.0163, 0.0320, 0.0321, 0.0485, 0.0316, 0.0159, 0.3222, 0.0245], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0106, 0.0100, 0.0084, 0.0138, 0.0094, 0.0144, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 08:24:23,971 INFO [train.py:873] (2/4) Epoch 3, batch 800, loss[loss=0.248, simple_loss=0.2291, pruned_loss=0.1335, over 7799.00 frames. ], tot_loss[loss=0.2223, simple_loss=0.2202, pruned_loss=0.1122, over 1981086.27 frames. ], batch size: 100, lr: 2.55e-02, grad_scale: 8.0 +2022-12-07 08:24:24,188 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7362, 3.1614, 2.0838, 4.1326, 3.6337, 3.8758, 3.0614, 2.4309], + device='cuda:2'), covar=tensor([0.0212, 0.0752, 0.3712, 0.0185, 0.0278, 0.0397, 0.0884, 0.3267], + device='cuda:2'), in_proj_covar=tensor([0.0189, 0.0264, 0.0351, 0.0156, 0.0184, 0.0172, 0.0237, 0.0347], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0003, 0.0001, 0.0002, 0.0001, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:24:40,907 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=15944.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:25:30,356 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16001.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:25:39,381 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.514e+02 2.828e+02 4.017e+02 4.886e+02 9.739e+02, threshold=8.033e+02, percent-clipped=2.0 +2022-12-07 08:25:51,111 INFO [train.py:873] (2/4) Epoch 3, batch 900, loss[loss=0.2239, simple_loss=0.2162, pruned_loss=0.1158, over 4919.00 frames. ], tot_loss[loss=0.2212, simple_loss=0.2194, pruned_loss=0.1115, over 1970699.13 frames. ], batch size: 100, lr: 2.54e-02, grad_scale: 8.0 +2022-12-07 08:26:22,650 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16062.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:26:28,218 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0608, 0.7523, 0.8624, 1.0305, 0.9969, 0.2902, 1.1764, 0.9871], + device='cuda:2'), covar=tensor([0.0337, 0.0398, 0.0219, 0.0338, 0.0114, 0.0198, 0.0169, 0.0144], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0018, 0.0018, 0.0016, 0.0017, 0.0021, 0.0015, 0.0015], + device='cuda:2'), out_proj_covar=tensor([4.3357e-05, 4.5874e-05, 4.6114e-05, 4.6228e-05, 4.2622e-05, 5.2822e-05, + 4.4127e-05, 3.7958e-05], device='cuda:2') +2022-12-07 08:26:42,864 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16085.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:27:06,256 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.269e+02 2.979e+02 3.878e+02 4.942e+02 1.281e+03, threshold=7.755e+02, percent-clipped=6.0 +2022-12-07 08:27:16,835 INFO [train.py:873] (2/4) Epoch 3, batch 1000, loss[loss=0.2167, simple_loss=0.2089, pruned_loss=0.1123, over 5958.00 frames. ], tot_loss[loss=0.2201, simple_loss=0.2188, pruned_loss=0.1107, over 2001724.05 frames. ], batch size: 100, lr: 2.54e-02, grad_scale: 8.0 +2022-12-07 08:27:35,303 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16146.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:27:38,009 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16149.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:27:57,334 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4629, 1.4555, 1.4039, 1.5078, 1.2401, 1.1914, 1.3278, 1.0142], + device='cuda:2'), covar=tensor([0.4068, 0.2006, 0.0999, 0.1177, 0.1291, 0.0727, 0.1986, 0.2779], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0069, 0.0053, 0.0062, 0.0071, 0.0053, 0.0072, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:28:08,764 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3872, 1.3919, 2.4808, 1.3466, 2.4751, 2.4363, 1.7693, 2.4802], + device='cuda:2'), covar=tensor([0.0160, 0.1298, 0.0155, 0.1287, 0.0182, 0.0213, 0.0636, 0.0166], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0140, 0.0097, 0.0151, 0.0115, 0.0102, 0.0093, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:28:29,943 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16210.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:28:31,352 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.271e+02 3.033e+02 4.056e+02 5.147e+02 1.062e+03, threshold=8.113e+02, percent-clipped=5.0 +2022-12-07 08:28:42,880 INFO [train.py:873] (2/4) Epoch 3, batch 1100, loss[loss=0.2604, simple_loss=0.2397, pruned_loss=0.1406, over 9464.00 frames. ], tot_loss[loss=0.2195, simple_loss=0.2183, pruned_loss=0.1103, over 1990311.15 frames. ], batch size: 100, lr: 2.53e-02, grad_scale: 8.0 +2022-12-07 08:28:54,218 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3318, 0.9936, 1.2416, 0.8678, 0.9264, 1.0662, 1.0356, 1.1722], + device='cuda:2'), covar=tensor([0.0305, 0.0597, 0.0545, 0.0853, 0.0557, 0.0482, 0.0272, 0.0427], + device='cuda:2'), in_proj_covar=tensor([0.0070, 0.0193, 0.0094, 0.0121, 0.0077, 0.0077, 0.0068, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:28:55,101 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16239.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:28:56,714 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7028, 1.2373, 2.3428, 2.2184, 2.2660, 2.2001, 1.5609, 2.3857], + device='cuda:2'), covar=tensor([0.0834, 0.1136, 0.0156, 0.0269, 0.0260, 0.0153, 0.0390, 0.0170], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0138, 0.0075, 0.0103, 0.0087, 0.0091, 0.0070, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:29:30,142 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3071, 4.6985, 4.6875, 5.2013, 4.8719, 4.4781, 5.1953, 4.3022], + device='cuda:2'), covar=tensor([0.0212, 0.0917, 0.0209, 0.0292, 0.0664, 0.0313, 0.0381, 0.0441], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0166, 0.0102, 0.0099, 0.0101, 0.0092, 0.0152, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 08:29:58,310 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 2.546e+02 3.661e+02 5.045e+02 9.524e+02, threshold=7.322e+02, percent-clipped=3.0 +2022-12-07 08:30:09,244 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-12-07 08:30:09,551 INFO [train.py:873] (2/4) Epoch 3, batch 1200, loss[loss=0.2088, simple_loss=0.2128, pruned_loss=0.1024, over 14497.00 frames. ], tot_loss[loss=0.2179, simple_loss=0.2176, pruned_loss=0.1091, over 1982562.72 frames. ], batch size: 49, lr: 2.52e-02, grad_scale: 8.0 +2022-12-07 08:30:37,102 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16357.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:30:48,994 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16371.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:31:14,946 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5628, 1.1838, 2.0053, 1.8200, 1.9368, 1.9021, 1.5679, 2.0282], + device='cuda:2'), covar=tensor([0.0366, 0.0748, 0.0102, 0.0214, 0.0154, 0.0091, 0.0248, 0.0088], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0139, 0.0074, 0.0104, 0.0087, 0.0092, 0.0071, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:31:23,701 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.462e+02 3.060e+02 3.691e+02 4.545e+02 9.377e+02, threshold=7.381e+02, percent-clipped=3.0 +2022-12-07 08:31:35,382 INFO [train.py:873] (2/4) Epoch 3, batch 1300, loss[loss=0.2286, simple_loss=0.2304, pruned_loss=0.1134, over 14306.00 frames. ], tot_loss[loss=0.2171, simple_loss=0.2172, pruned_loss=0.1085, over 1979452.17 frames. ], batch size: 39, lr: 2.51e-02, grad_scale: 8.0 +2022-12-07 08:31:41,392 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16432.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:31:49,047 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16441.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:31:53,886 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.44 vs. limit=2.0 +2022-12-07 08:31:57,546 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7589, 1.5376, 2.8898, 1.3733, 2.9296, 2.9341, 2.1220, 3.0622], + device='cuda:2'), covar=tensor([0.0196, 0.1995, 0.0279, 0.1931, 0.0241, 0.0235, 0.0715, 0.0129], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0141, 0.0099, 0.0152, 0.0119, 0.0102, 0.0095, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:32:43,983 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16505.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:32:49,561 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-07 08:32:49,716 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.482e+02 2.808e+02 3.737e+02 4.916e+02 1.118e+03, threshold=7.474e+02, percent-clipped=8.0 +2022-12-07 08:32:54,007 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-12-07 08:33:01,054 INFO [train.py:873] (2/4) Epoch 3, batch 1400, loss[loss=0.2322, simple_loss=0.2256, pruned_loss=0.1194, over 14266.00 frames. ], tot_loss[loss=0.2167, simple_loss=0.2169, pruned_loss=0.1082, over 1938069.47 frames. ], batch size: 66, lr: 2.51e-02, grad_scale: 8.0 +2022-12-07 08:33:12,969 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=16539.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:33:16,503 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7692, 1.5343, 2.0554, 1.7245, 2.1047, 1.6667, 1.8253, 1.7724], + device='cuda:2'), covar=tensor([0.0074, 0.0266, 0.0031, 0.0069, 0.0046, 0.0113, 0.0046, 0.0118], + device='cuda:2'), in_proj_covar=tensor([0.0201, 0.0329, 0.0163, 0.0263, 0.0210, 0.0217, 0.0222, 0.0348], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:33:17,630 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6876, 0.5891, 0.7582, 1.5328, 0.8412, 0.4376, 1.7094, 1.2251], + device='cuda:2'), covar=tensor([0.0331, 0.0330, 0.0141, 0.0499, 0.0765, 0.0164, 0.0580, 0.0329], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0018, 0.0018, 0.0014, 0.0017, 0.0021, 0.0016, 0.0015], + device='cuda:2'), out_proj_covar=tensor([4.4208e-05, 4.6498e-05, 4.5491e-05, 4.2703e-05, 4.3279e-05, 5.2941e-05, + 4.5696e-05, 3.8841e-05], device='cuda:2') +2022-12-07 08:33:29,785 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-07 08:33:54,096 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=16587.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:34:00,576 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-12-07 08:34:15,419 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.141e+01 3.095e+02 4.057e+02 5.380e+02 9.685e+02, threshold=8.113e+02, percent-clipped=8.0 +2022-12-07 08:34:27,198 INFO [train.py:873] (2/4) Epoch 3, batch 1500, loss[loss=0.2017, simple_loss=0.2085, pruned_loss=0.09743, over 14531.00 frames. ], tot_loss[loss=0.216, simple_loss=0.216, pruned_loss=0.108, over 1908024.46 frames. ], batch size: 43, lr: 2.50e-02, grad_scale: 8.0 +2022-12-07 08:34:35,934 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8457, 1.3769, 3.1053, 2.7778, 3.0653, 2.9259, 2.2036, 3.1528], + device='cuda:2'), covar=tensor([0.1042, 0.1226, 0.0086, 0.0206, 0.0144, 0.0113, 0.0363, 0.0105], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0140, 0.0075, 0.0105, 0.0088, 0.0093, 0.0070, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:34:54,272 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=16657.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:35:35,815 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=16705.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:35:41,765 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.257e+02 2.784e+02 3.654e+02 4.523e+02 7.286e+02, threshold=7.308e+02, percent-clipped=0.0 +2022-12-07 08:35:52,814 INFO [train.py:873] (2/4) Epoch 3, batch 1600, loss[loss=0.2215, simple_loss=0.2162, pruned_loss=0.1134, over 13880.00 frames. ], tot_loss[loss=0.2168, simple_loss=0.2164, pruned_loss=0.1087, over 1918276.33 frames. ], batch size: 20, lr: 2.49e-02, grad_scale: 8.0 +2022-12-07 08:35:54,544 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=16727.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:36:06,509 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=16741.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:36:11,399 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0862, 2.1590, 3.4381, 3.3888, 2.8910, 2.1311, 3.1415, 2.1063], + device='cuda:2'), covar=tensor([0.0066, 0.0174, 0.0137, 0.0093, 0.0093, 0.0281, 0.0027, 0.0295], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0150, 0.0176, 0.0154, 0.0131, 0.0194, 0.0093, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 08:36:26,358 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7572, 1.7190, 2.1517, 1.5234, 1.7220, 1.9467, 1.0696, 1.9344], + device='cuda:2'), covar=tensor([0.0580, 0.0854, 0.0267, 0.1074, 0.0982, 0.0396, 0.2390, 0.0460], + device='cuda:2'), in_proj_covar=tensor([0.0061, 0.0066, 0.0061, 0.0070, 0.0075, 0.0062, 0.0119, 0.0069], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 08:36:26,388 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9553, 0.7772, 0.2956, 0.9564, 0.8698, 0.2693, 1.0868, 0.9418], + device='cuda:2'), covar=tensor([0.0216, 0.0192, 0.0080, 0.0182, 0.0178, 0.0117, 0.0232, 0.0096], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0019, 0.0019, 0.0016, 0.0017, 0.0022, 0.0017, 0.0014], + device='cuda:2'), out_proj_covar=tensor([4.7600e-05, 5.1228e-05, 4.7887e-05, 4.7875e-05, 4.5407e-05, 5.4701e-05, + 4.8804e-05, 3.8640e-05], device='cuda:2') +2022-12-07 08:36:48,049 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=16789.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:36:58,005 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5285, 2.9910, 4.4837, 4.2549, 4.6397, 2.9255, 4.5164, 3.4523], + device='cuda:2'), covar=tensor([0.0029, 0.0116, 0.0251, 0.0092, 0.0025, 0.0218, 0.0017, 0.0165], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0148, 0.0174, 0.0152, 0.0129, 0.0192, 0.0092, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 08:37:01,906 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=16805.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:37:07,697 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.245e+02 3.316e+02 4.284e+02 5.393e+02 9.437e+02, threshold=8.568e+02, percent-clipped=8.0 +2022-12-07 08:37:19,029 INFO [train.py:873] (2/4) Epoch 3, batch 1700, loss[loss=0.2015, simple_loss=0.1782, pruned_loss=0.1124, over 2621.00 frames. ], tot_loss[loss=0.2175, simple_loss=0.2166, pruned_loss=0.1092, over 1874566.11 frames. ], batch size: 100, lr: 2.49e-02, grad_scale: 8.0 +2022-12-07 08:37:28,026 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-12-07 08:37:29,372 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7551, 1.7044, 2.9146, 2.1638, 2.8068, 1.5596, 2.0802, 2.7189], + device='cuda:2'), covar=tensor([0.0322, 0.3497, 0.0246, 0.4465, 0.0232, 0.2695, 0.1064, 0.0176], + device='cuda:2'), in_proj_covar=tensor([0.0225, 0.0324, 0.0177, 0.0422, 0.0165, 0.0336, 0.0291, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0002, 0.0004, 0.0002, 0.0004, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 08:37:42,002 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1528, 0.9256, 0.9106, 1.2619, 1.1138, 0.1827, 1.6362, 1.4685], + device='cuda:2'), covar=tensor([0.0634, 0.0230, 0.0112, 0.1353, 0.0524, 0.0165, 0.0322, 0.0180], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0019, 0.0020, 0.0017, 0.0017, 0.0022, 0.0017, 0.0015], + device='cuda:2'), out_proj_covar=tensor([4.9539e-05, 5.0717e-05, 4.8965e-05, 4.9942e-05, 4.5969e-05, 5.4747e-05, + 4.8983e-05, 3.9737e-05], device='cuda:2') +2022-12-07 08:37:42,665 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=16853.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:37:55,915 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16868.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 08:38:10,285 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2350, 3.2296, 4.3217, 2.6832, 3.0329, 2.9355, 1.0956, 2.8568], + device='cuda:2'), covar=tensor([0.1308, 0.1332, 0.0483, 0.1605, 0.0764, 0.1538, 0.3637, 0.1338], + device='cuda:2'), in_proj_covar=tensor([0.0062, 0.0068, 0.0061, 0.0070, 0.0075, 0.0063, 0.0121, 0.0069], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 08:38:28,227 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=16906.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:38:33,244 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.516e+02 2.887e+02 3.816e+02 4.798e+02 1.128e+03, threshold=7.633e+02, percent-clipped=3.0 +2022-12-07 08:38:37,046 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1930, 1.1287, 0.7885, 0.8880, 0.6246, 0.9558, 1.0348, 0.7855], + device='cuda:2'), covar=tensor([0.0510, 0.1667, 0.1078, 0.0614, 0.0929, 0.0246, 0.0274, 0.0633], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0012, 0.0012, 0.0011, 0.0013, 0.0015, 0.0014, 0.0015], + device='cuda:2'), out_proj_covar=tensor([3.3880e-05, 3.0846e-05, 3.4773e-05, 3.0065e-05, 3.2693e-05, 3.4981e-05, + 3.9693e-05, 4.0586e-05], device='cuda:2') +2022-12-07 08:38:44,615 INFO [train.py:873] (2/4) Epoch 3, batch 1800, loss[loss=0.2564, simple_loss=0.2324, pruned_loss=0.1403, over 6929.00 frames. ], tot_loss[loss=0.2202, simple_loss=0.2182, pruned_loss=0.1111, over 1907956.99 frames. ], batch size: 100, lr: 2.48e-02, grad_scale: 8.0 +2022-12-07 08:38:48,312 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16929.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 08:39:13,934 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3568, 1.3017, 3.2440, 1.5145, 3.2560, 3.2328, 2.2171, 3.5682], + device='cuda:2'), covar=tensor([0.0171, 0.2278, 0.0293, 0.1866, 0.0353, 0.0291, 0.0738, 0.0128], + device='cuda:2'), in_proj_covar=tensor([0.0111, 0.0141, 0.0098, 0.0149, 0.0115, 0.0105, 0.0094, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:39:21,082 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=16967.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 08:39:57,000 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-12-07 08:39:59,562 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.130e+02 3.016e+02 3.995e+02 5.163e+02 9.262e+02, threshold=7.990e+02, percent-clipped=4.0 +2022-12-07 08:40:00,720 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0316, 2.6051, 4.2403, 4.1314, 4.2745, 2.6890, 4.1499, 2.9556], + device='cuda:2'), covar=tensor([0.0056, 0.0157, 0.0161, 0.0078, 0.0032, 0.0274, 0.0019, 0.0227], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0152, 0.0178, 0.0156, 0.0134, 0.0195, 0.0091, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 08:40:11,093 INFO [train.py:873] (2/4) Epoch 3, batch 1900, loss[loss=0.2264, simple_loss=0.2106, pruned_loss=0.1211, over 3856.00 frames. ], tot_loss[loss=0.2203, simple_loss=0.2185, pruned_loss=0.111, over 1927313.06 frames. ], batch size: 100, lr: 2.47e-02, grad_scale: 8.0 +2022-12-07 08:40:12,968 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=17027.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:40:53,623 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=17075.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:41:22,672 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4663, 2.4787, 1.8836, 2.5893, 2.5236, 2.6760, 2.3031, 2.0509], + device='cuda:2'), covar=tensor([0.0173, 0.0318, 0.1464, 0.0124, 0.0254, 0.0202, 0.0543, 0.1563], + device='cuda:2'), in_proj_covar=tensor([0.0202, 0.0273, 0.0348, 0.0163, 0.0196, 0.0193, 0.0249, 0.0354], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0001, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:41:25,312 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.512e+02 2.833e+02 3.865e+02 5.052e+02 1.376e+03, threshold=7.730e+02, percent-clipped=3.0 +2022-12-07 08:41:36,266 INFO [train.py:873] (2/4) Epoch 3, batch 2000, loss[loss=0.233, simple_loss=0.2301, pruned_loss=0.1179, over 14280.00 frames. ], tot_loss[loss=0.2205, simple_loss=0.219, pruned_loss=0.111, over 1975957.81 frames. ], batch size: 80, lr: 2.47e-02, grad_scale: 16.0 +2022-12-07 08:41:37,178 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7276, 1.1785, 2.1999, 2.0968, 2.3542, 2.2224, 1.6585, 2.2943], + device='cuda:2'), covar=tensor([0.0831, 0.1560, 0.0220, 0.0419, 0.0242, 0.0229, 0.0614, 0.0241], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0144, 0.0076, 0.0104, 0.0089, 0.0094, 0.0072, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:42:30,533 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17189.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:42:32,559 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0821, 2.5694, 4.2758, 4.0856, 4.2552, 2.7621, 4.1868, 3.3778], + device='cuda:2'), covar=tensor([0.0044, 0.0137, 0.0126, 0.0066, 0.0035, 0.0246, 0.0016, 0.0159], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0156, 0.0183, 0.0160, 0.0137, 0.0199, 0.0093, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 08:42:46,398 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-07 08:42:48,658 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-12-07 08:42:49,868 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.135e+02 3.144e+02 4.016e+02 5.229e+02 1.105e+03, threshold=8.031e+02, percent-clipped=5.0 +2022-12-07 08:43:00,464 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17224.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 08:43:01,531 INFO [train.py:873] (2/4) Epoch 3, batch 2100, loss[loss=0.1862, simple_loss=0.1772, pruned_loss=0.0976, over 1209.00 frames. ], tot_loss[loss=0.2187, simple_loss=0.2178, pruned_loss=0.1098, over 1979618.70 frames. ], batch size: 100, lr: 2.46e-02, grad_scale: 16.0 +2022-12-07 08:43:08,951 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.48 vs. limit=5.0 +2022-12-07 08:43:22,662 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17250.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:43:23,753 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.77 vs. limit=2.0 +2022-12-07 08:43:32,883 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17262.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 08:43:32,950 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4819, 2.0984, 2.0198, 1.8967, 1.1450, 1.9224, 1.8401, 1.3082], + device='cuda:2'), covar=tensor([0.6343, 0.2144, 0.2601, 0.1705, 0.1649, 0.0904, 0.2214, 0.3231], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0069, 0.0053, 0.0062, 0.0069, 0.0056, 0.0079, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:43:44,339 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0481, 2.0148, 2.9349, 1.8613, 2.2539, 2.1788, 0.8473, 2.0169], + device='cuda:2'), covar=tensor([0.1334, 0.1187, 0.0700, 0.1511, 0.0955, 0.0997, 0.4283, 0.1385], + device='cuda:2'), in_proj_covar=tensor([0.0066, 0.0070, 0.0060, 0.0071, 0.0076, 0.0063, 0.0124, 0.0070], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0001], + device='cuda:2') +2022-12-07 08:44:16,383 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.689e+02 2.883e+02 3.756e+02 4.821e+02 7.319e+02, threshold=7.512e+02, percent-clipped=0.0 +2022-12-07 08:44:24,557 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.22 vs. limit=2.0 +2022-12-07 08:44:27,587 INFO [train.py:873] (2/4) Epoch 3, batch 2200, loss[loss=0.1787, simple_loss=0.1731, pruned_loss=0.09215, over 2626.00 frames. ], tot_loss[loss=0.2194, simple_loss=0.2181, pruned_loss=0.1104, over 1991672.08 frames. ], batch size: 100, lr: 2.45e-02, grad_scale: 16.0 +2022-12-07 08:44:59,083 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17362.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:45:17,001 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17383.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 08:45:38,849 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([6.1327, 5.4013, 5.5452, 6.0806, 5.7510, 5.1204, 6.0476, 5.0747], + device='cuda:2'), covar=tensor([0.0191, 0.0838, 0.0195, 0.0272, 0.0576, 0.0229, 0.0353, 0.0386], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0168, 0.0109, 0.0102, 0.0102, 0.0096, 0.0158, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 08:45:42,085 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.435e+02 3.126e+02 3.760e+02 4.600e+02 9.821e+02, threshold=7.519e+02, percent-clipped=3.0 +2022-12-07 08:45:51,620 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17423.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:45:53,185 INFO [train.py:873] (2/4) Epoch 3, batch 2300, loss[loss=0.2005, simple_loss=0.2059, pruned_loss=0.09753, over 14217.00 frames. ], tot_loss[loss=0.2171, simple_loss=0.2161, pruned_loss=0.109, over 1929894.29 frames. ], batch size: 35, lr: 2.45e-02, grad_scale: 16.0 +2022-12-07 08:46:09,808 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17444.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 08:46:35,381 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8786, 1.4906, 1.8986, 1.2828, 1.5385, 1.6162, 1.7056, 1.5464], + device='cuda:2'), covar=tensor([0.0414, 0.1411, 0.0706, 0.1800, 0.0720, 0.0697, 0.0398, 0.1055], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0205, 0.0094, 0.0127, 0.0079, 0.0080, 0.0071, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:47:07,374 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 2.910e+02 3.962e+02 5.251e+02 8.178e+02, threshold=7.923e+02, percent-clipped=2.0 +2022-12-07 08:47:12,778 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4973, 2.3785, 2.5312, 2.3108, 2.4454, 2.1363, 1.1964, 2.2260], + device='cuda:2'), covar=tensor([0.0225, 0.0323, 0.0289, 0.0253, 0.0244, 0.0578, 0.1736, 0.0298], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0114, 0.0108, 0.0093, 0.0150, 0.0100, 0.0147, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0004, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 08:47:18,008 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=17524.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 08:47:18,682 INFO [train.py:873] (2/4) Epoch 3, batch 2400, loss[loss=0.1981, simple_loss=0.2073, pruned_loss=0.09446, over 13971.00 frames. ], tot_loss[loss=0.2162, simple_loss=0.2157, pruned_loss=0.1084, over 1925043.19 frames. ], batch size: 19, lr: 2.44e-02, grad_scale: 16.0 +2022-12-07 08:47:35,590 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17545.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:47:38,187 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5913, 1.7036, 3.5630, 1.5627, 3.6022, 3.6086, 2.7994, 3.9239], + device='cuda:2'), covar=tensor([0.0151, 0.2151, 0.0325, 0.2016, 0.0312, 0.0264, 0.0534, 0.0101], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0144, 0.0105, 0.0154, 0.0119, 0.0108, 0.0096, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:47:50,015 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=17562.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 08:47:53,452 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9197, 0.5952, 0.8053, 0.8421, 0.9119, 0.2991, 0.8390, 0.7278], + device='cuda:2'), covar=tensor([0.0096, 0.0240, 0.0085, 0.0138, 0.0128, 0.0141, 0.0203, 0.0082], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0019, 0.0020, 0.0017, 0.0018, 0.0022, 0.0017, 0.0015], + device='cuda:2'), out_proj_covar=tensor([4.7365e-05, 5.1617e-05, 5.0977e-05, 5.1484e-05, 4.8322e-05, 5.5701e-05, + 4.8224e-05, 3.9810e-05], device='cuda:2') +2022-12-07 08:47:58,497 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=17572.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 08:48:22,224 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7116, 1.8687, 3.7692, 2.6353, 3.6449, 1.7185, 2.7970, 3.4029], + device='cuda:2'), covar=tensor([0.0307, 0.5081, 0.0286, 0.6630, 0.0180, 0.4220, 0.1080, 0.0210], + device='cuda:2'), in_proj_covar=tensor([0.0235, 0.0329, 0.0180, 0.0424, 0.0167, 0.0336, 0.0301, 0.0177], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0004, 0.0002, 0.0004, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 08:48:31,674 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=17610.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:48:34,155 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.459e+02 2.796e+02 3.862e+02 5.213e+02 1.653e+03, threshold=7.725e+02, percent-clipped=7.0 +2022-12-07 08:48:44,388 INFO [train.py:873] (2/4) Epoch 3, batch 2500, loss[loss=0.1755, simple_loss=0.1642, pruned_loss=0.09335, over 2598.00 frames. ], tot_loss[loss=0.2158, simple_loss=0.2157, pruned_loss=0.108, over 1924547.20 frames. ], batch size: 100, lr: 2.43e-02, grad_scale: 8.0 +2022-12-07 08:48:55,959 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9976, 0.7007, 0.6619, 0.8619, 0.7481, 0.9866, 0.8034, 0.7804], + device='cuda:2'), covar=tensor([0.0249, 0.0349, 0.0233, 0.0321, 0.0307, 0.0195, 0.0254, 0.0419], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0012, 0.0011, 0.0012, 0.0012, 0.0014, 0.0013, 0.0015], + device='cuda:2'), out_proj_covar=tensor([3.3460e-05, 3.1498e-05, 3.4509e-05, 3.1827e-05, 3.2242e-05, 3.4201e-05, + 4.0753e-05, 4.1300e-05], device='cuda:2') +2022-12-07 08:49:49,066 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17700.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:49:59,743 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.003e+02 2.968e+02 3.839e+02 5.013e+02 8.291e+02, threshold=7.678e+02, percent-clipped=4.0 +2022-12-07 08:50:04,188 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17718.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:50:05,752 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-12-07 08:50:10,238 INFO [train.py:873] (2/4) Epoch 3, batch 2600, loss[loss=0.2409, simple_loss=0.2329, pruned_loss=0.1245, over 14315.00 frames. ], tot_loss[loss=0.2188, simple_loss=0.2178, pruned_loss=0.1099, over 1994834.34 frames. ], batch size: 46, lr: 2.43e-02, grad_scale: 8.0 +2022-12-07 08:50:14,831 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17730.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:50:22,577 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=17739.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 08:50:41,004 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17761.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:50:56,781 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9692, 1.8369, 1.5867, 1.9827, 1.9173, 2.0562, 1.7994, 1.6725], + device='cuda:2'), covar=tensor([0.0085, 0.0301, 0.0506, 0.0098, 0.0146, 0.0051, 0.0347, 0.0317], + device='cuda:2'), in_proj_covar=tensor([0.0204, 0.0276, 0.0347, 0.0166, 0.0201, 0.0190, 0.0253, 0.0350], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0001, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:51:07,037 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=17791.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:51:25,830 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.679e+02 2.813e+02 3.769e+02 4.969e+02 1.379e+03, threshold=7.537e+02, percent-clipped=10.0 +2022-12-07 08:51:36,296 INFO [train.py:873] (2/4) Epoch 3, batch 2700, loss[loss=0.2103, simple_loss=0.2189, pruned_loss=0.1009, over 14295.00 frames. ], tot_loss[loss=0.2182, simple_loss=0.2172, pruned_loss=0.1096, over 1934051.76 frames. ], batch size: 25, lr: 2.42e-02, grad_scale: 8.0 +2022-12-07 08:51:53,246 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=17845.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:52:00,145 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6135, 1.5972, 2.6910, 1.5993, 2.6300, 2.6511, 2.0802, 2.7163], + device='cuda:2'), covar=tensor([0.0155, 0.1702, 0.0217, 0.1462, 0.0230, 0.0303, 0.0565, 0.0177], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0150, 0.0105, 0.0158, 0.0123, 0.0111, 0.0097, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:52:34,444 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=17893.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:52:51,605 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.345e+02 3.286e+02 4.021e+02 4.924e+02 9.078e+02, threshold=8.042e+02, percent-clipped=2.0 +2022-12-07 08:53:01,816 INFO [train.py:873] (2/4) Epoch 3, batch 2800, loss[loss=0.2314, simple_loss=0.2291, pruned_loss=0.1168, over 12739.00 frames. ], tot_loss[loss=0.2166, simple_loss=0.2167, pruned_loss=0.1083, over 1943089.43 frames. ], batch size: 100, lr: 2.41e-02, grad_scale: 8.0 +2022-12-07 08:53:31,811 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=17960.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:53:42,511 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9970, 1.9947, 1.8490, 2.1000, 1.7188, 1.8999, 1.9836, 2.0824], + device='cuda:2'), covar=tensor([0.0979, 0.0771, 0.1022, 0.0771, 0.1323, 0.0677, 0.1048, 0.0763], + device='cuda:2'), in_proj_covar=tensor([0.0095, 0.0077, 0.0098, 0.0092, 0.0104, 0.0066, 0.0096, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:54:16,685 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 2.972e+02 3.622e+02 4.500e+02 9.762e+02, threshold=7.243e+02, percent-clipped=2.0 +2022-12-07 08:54:21,502 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18018.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:54:24,238 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18021.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:54:27,362 INFO [train.py:873] (2/4) Epoch 3, batch 2900, loss[loss=0.2079, simple_loss=0.2116, pruned_loss=0.1021, over 13861.00 frames. ], tot_loss[loss=0.2168, simple_loss=0.2164, pruned_loss=0.1086, over 1928355.36 frames. ], batch size: 20, lr: 2.41e-02, grad_scale: 8.0 +2022-12-07 08:54:39,652 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18039.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 08:54:53,276 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18056.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:55:01,988 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18066.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:55:08,304 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4570, 1.3999, 1.5228, 1.4561, 1.2945, 1.4182, 1.2795, 0.9814], + device='cuda:2'), covar=tensor([0.3467, 0.1536, 0.1190, 0.0802, 0.1222, 0.1015, 0.1949, 0.2705], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0066, 0.0051, 0.0059, 0.0067, 0.0057, 0.0079, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:55:19,288 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18086.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:55:20,067 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18087.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 08:55:22,174 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-12-07 08:55:35,055 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9477, 3.4524, 2.4104, 3.9654, 3.7718, 3.9190, 3.3764, 2.5085], + device='cuda:2'), covar=tensor([0.0199, 0.0660, 0.3346, 0.0172, 0.0225, 0.0390, 0.0606, 0.3291], + device='cuda:2'), in_proj_covar=tensor([0.0207, 0.0275, 0.0346, 0.0164, 0.0200, 0.0191, 0.0247, 0.0349], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0001, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:55:42,573 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.345e+02 3.066e+02 4.043e+02 5.337e+02 1.703e+03, threshold=8.087e+02, percent-clipped=7.0 +2022-12-07 08:55:52,803 INFO [train.py:873] (2/4) Epoch 3, batch 3000, loss[loss=0.2046, simple_loss=0.1926, pruned_loss=0.1083, over 3869.00 frames. ], tot_loss[loss=0.2176, simple_loss=0.217, pruned_loss=0.1091, over 1937529.01 frames. ], batch size: 100, lr: 2.40e-02, grad_scale: 8.0 +2022-12-07 08:55:52,804 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 08:56:03,704 INFO [train.py:905] (2/4) Epoch 3, validation: loss=0.1337, simple_loss=0.176, pruned_loss=0.04573, over 857387.00 frames. +2022-12-07 08:56:03,705 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 08:56:19,750 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=4.32 vs. limit=2.0 +2022-12-07 08:56:45,815 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18174.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:56:46,874 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18175.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:57:19,775 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.296e+02 3.113e+02 4.054e+02 5.125e+02 9.563e+02, threshold=8.108e+02, percent-clipped=4.0 +2022-12-07 08:57:25,608 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.55 vs. limit=5.0 +2022-12-07 08:57:30,105 INFO [train.py:873] (2/4) Epoch 3, batch 3100, loss[loss=0.248, simple_loss=0.2411, pruned_loss=0.1274, over 14591.00 frames. ], tot_loss[loss=0.2166, simple_loss=0.2167, pruned_loss=0.1083, over 1958128.76 frames. ], batch size: 24, lr: 2.40e-02, grad_scale: 8.0 +2022-12-07 08:57:38,714 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18235.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:57:39,582 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18236.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:58:28,645 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-12-07 08:58:34,746 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6022, 1.9018, 4.3185, 2.2116, 4.2116, 4.3957, 3.8921, 4.8854], + device='cuda:2'), covar=tensor([0.0116, 0.2395, 0.0203, 0.1912, 0.0231, 0.0190, 0.0260, 0.0104], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0149, 0.0105, 0.0159, 0.0122, 0.0111, 0.0098, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:58:37,182 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7234, 3.5268, 3.5017, 3.9036, 3.5368, 3.0181, 3.8291, 3.7803], + device='cuda:2'), covar=tensor([0.0701, 0.0570, 0.0649, 0.0467, 0.0758, 0.0584, 0.0570, 0.0567], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0078, 0.0100, 0.0094, 0.0106, 0.0069, 0.0101, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:58:40,402 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8407, 2.5957, 2.1812, 2.3773, 1.7552, 1.7109, 2.5207, 1.1671], + device='cuda:2'), covar=tensor([0.3617, 0.1234, 0.2956, 0.2033, 0.1155, 0.3208, 0.1077, 0.3172], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0067, 0.0051, 0.0060, 0.0067, 0.0057, 0.0074, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 08:58:45,517 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.240e+02 2.872e+02 3.739e+02 4.989e+02 1.887e+03, threshold=7.477e+02, percent-clipped=4.0 +2022-12-07 08:58:48,248 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18316.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:58:55,690 INFO [train.py:873] (2/4) Epoch 3, batch 3200, loss[loss=0.2072, simple_loss=0.2098, pruned_loss=0.1023, over 14428.00 frames. ], tot_loss[loss=0.217, simple_loss=0.2172, pruned_loss=0.1083, over 1976380.14 frames. ], batch size: 73, lr: 2.39e-02, grad_scale: 8.0 +2022-12-07 08:59:14,351 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.6992, 5.4609, 5.3145, 5.9058, 5.5714, 5.0189, 5.9234, 5.8392], + device='cuda:2'), covar=tensor([0.0478, 0.0263, 0.0423, 0.0311, 0.0367, 0.0234, 0.0338, 0.0387], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0078, 0.0099, 0.0095, 0.0106, 0.0070, 0.0103, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 08:59:15,734 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.15 vs. limit=2.0 +2022-12-07 08:59:21,757 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-07 08:59:22,775 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18356.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:59:48,402 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18386.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 08:59:51,196 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.26 vs. limit=2.0 +2022-12-07 09:00:03,918 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18404.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:00:11,699 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.284e+02 3.073e+02 3.846e+02 5.259e+02 1.192e+03, threshold=7.691e+02, percent-clipped=7.0 +2022-12-07 09:00:11,802 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8262, 2.5116, 2.6864, 2.8441, 2.8600, 2.8263, 2.9733, 2.5045], + device='cuda:2'), covar=tensor([0.0465, 0.1124, 0.0424, 0.0483, 0.0576, 0.0388, 0.0542, 0.0577], + device='cuda:2'), in_proj_covar=tensor([0.0106, 0.0178, 0.0117, 0.0110, 0.0104, 0.0100, 0.0164, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 09:00:22,199 INFO [train.py:873] (2/4) Epoch 3, batch 3300, loss[loss=0.1497, simple_loss=0.1538, pruned_loss=0.07283, over 2578.00 frames. ], tot_loss[loss=0.2154, simple_loss=0.2159, pruned_loss=0.1074, over 1973760.94 frames. ], batch size: 100, lr: 2.38e-02, grad_scale: 8.0 +2022-12-07 09:00:29,918 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18434.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:00:35,981 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.6794, 0.5291, 0.2352, 0.6331, 0.6164, 0.0495, 0.7787, 0.6885], + device='cuda:2'), covar=tensor([0.0034, 0.0042, 0.0015, 0.0114, 0.0019, 0.0020, 0.0081, 0.0043], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0021, 0.0021, 0.0017, 0.0018, 0.0023, 0.0016, 0.0017], + device='cuda:2'), out_proj_covar=tensor([4.6399e-05, 5.6234e-05, 5.5199e-05, 5.3383e-05, 5.0410e-05, 5.8412e-05, + 4.8781e-05, 4.5393e-05], device='cuda:2') +2022-12-07 09:00:51,783 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8858, 1.7543, 1.7226, 1.7619, 1.4703, 1.6674, 1.5866, 0.8233], + device='cuda:2'), covar=tensor([0.3560, 0.1324, 0.1964, 0.0855, 0.1352, 0.1138, 0.1277, 0.3443], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0070, 0.0052, 0.0061, 0.0067, 0.0058, 0.0076, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:01:18,404 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18490.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:01:33,638 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18508.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 09:01:37,436 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.740e+02 3.014e+02 3.926e+02 4.952e+02 1.093e+03, threshold=7.852e+02, percent-clipped=5.0 +2022-12-07 09:01:46,453 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3125, 1.5412, 1.5639, 1.6918, 1.4638, 1.6702, 1.4046, 0.9345], + device='cuda:2'), covar=tensor([0.2277, 0.1853, 0.0364, 0.0481, 0.0735, 0.0465, 0.0784, 0.2021], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0068, 0.0050, 0.0058, 0.0066, 0.0057, 0.0074, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:01:48,040 INFO [train.py:873] (2/4) Epoch 3, batch 3400, loss[loss=0.2824, simple_loss=0.2571, pruned_loss=0.1538, over 14247.00 frames. ], tot_loss[loss=0.2139, simple_loss=0.2155, pruned_loss=0.1061, over 2064026.79 frames. ], batch size: 80, lr: 2.38e-02, grad_scale: 8.0 +2022-12-07 09:01:52,315 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18530.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:01:53,153 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18531.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:02:07,855 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.52 vs. limit=5.0 +2022-12-07 09:02:10,187 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18551.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:02:10,624 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.90 vs. limit=5.0 +2022-12-07 09:02:25,544 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18569.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 09:03:03,540 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.675e+02 3.062e+02 4.109e+02 5.827e+02 9.337e+02, threshold=8.217e+02, percent-clipped=6.0 +2022-12-07 09:03:03,779 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5181, 1.8776, 2.8134, 2.6942, 2.5719, 1.8931, 2.8506, 2.1318], + device='cuda:2'), covar=tensor([0.0056, 0.0147, 0.0101, 0.0072, 0.0052, 0.0232, 0.0033, 0.0172], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0154, 0.0185, 0.0160, 0.0139, 0.0200, 0.0104, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 09:03:05,346 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7279, 1.9468, 2.4772, 1.2609, 1.8180, 1.8652, 1.0828, 2.0931], + device='cuda:2'), covar=tensor([0.1023, 0.1257, 0.0542, 0.2325, 0.1353, 0.0929, 0.3975, 0.0738], + device='cuda:2'), in_proj_covar=tensor([0.0059, 0.0067, 0.0061, 0.0072, 0.0084, 0.0062, 0.0125, 0.0068], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 09:03:06,449 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18616.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:03:13,884 INFO [train.py:873] (2/4) Epoch 3, batch 3500, loss[loss=0.2248, simple_loss=0.1836, pruned_loss=0.133, over 1293.00 frames. ], tot_loss[loss=0.2141, simple_loss=0.2151, pruned_loss=0.1065, over 2023621.61 frames. ], batch size: 100, lr: 2.37e-02, grad_scale: 8.0 +2022-12-07 09:03:19,268 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.23 vs. limit=2.0 +2022-12-07 09:03:46,480 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18664.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:03:54,774 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18674.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:04:28,294 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.467e+02 2.925e+02 3.819e+02 4.873e+02 8.102e+02, threshold=7.639e+02, percent-clipped=0.0 +2022-12-07 09:04:38,566 INFO [train.py:873] (2/4) Epoch 3, batch 3600, loss[loss=0.2337, simple_loss=0.2243, pruned_loss=0.1215, over 14217.00 frames. ], tot_loss[loss=0.2148, simple_loss=0.215, pruned_loss=0.1073, over 1960343.70 frames. ], batch size: 94, lr: 2.37e-02, grad_scale: 8.0 +2022-12-07 09:04:47,616 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18735.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:04:54,245 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.98 vs. limit=2.0 +2022-12-07 09:05:54,915 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.731e+02 3.054e+02 3.647e+02 5.491e+02 1.580e+03, threshold=7.295e+02, percent-clipped=4.0 +2022-12-07 09:06:05,676 INFO [train.py:873] (2/4) Epoch 3, batch 3700, loss[loss=0.2181, simple_loss=0.206, pruned_loss=0.1151, over 6030.00 frames. ], tot_loss[loss=0.214, simple_loss=0.215, pruned_loss=0.1065, over 2025638.14 frames. ], batch size: 100, lr: 2.36e-02, grad_scale: 8.0 +2022-12-07 09:06:09,864 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18830.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:06:10,737 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=18831.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:06:23,444 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18846.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:06:39,730 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=18864.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 09:06:51,531 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18878.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:06:52,333 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=18879.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:07:21,106 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.590e+02 2.825e+02 3.727e+02 4.870e+02 9.642e+02, threshold=7.453e+02, percent-clipped=3.0 +2022-12-07 09:07:31,081 INFO [train.py:873] (2/4) Epoch 3, batch 3800, loss[loss=0.1875, simple_loss=0.1566, pruned_loss=0.1092, over 1260.00 frames. ], tot_loss[loss=0.2124, simple_loss=0.2142, pruned_loss=0.1053, over 2015988.85 frames. ], batch size: 100, lr: 2.35e-02, grad_scale: 8.0 +2022-12-07 09:07:35,841 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=18930.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:07:39,589 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-12-07 09:08:23,548 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3983, 2.2924, 3.3723, 2.5261, 3.3070, 3.1964, 3.0678, 2.7701], + device='cuda:2'), covar=tensor([0.0099, 0.1149, 0.0110, 0.0676, 0.0195, 0.0193, 0.0516, 0.0773], + device='cuda:2'), in_proj_covar=tensor([0.0216, 0.0338, 0.0189, 0.0286, 0.0234, 0.0230, 0.0250, 0.0360], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:08:28,739 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=18991.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:08:47,290 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.617e+02 3.029e+02 3.748e+02 4.757e+02 1.336e+03, threshold=7.495e+02, percent-clipped=2.0 +2022-12-07 09:08:58,126 INFO [train.py:873] (2/4) Epoch 3, batch 3900, loss[loss=0.2123, simple_loss=0.207, pruned_loss=0.1088, over 5949.00 frames. ], tot_loss[loss=0.2122, simple_loss=0.2139, pruned_loss=0.1052, over 2058057.75 frames. ], batch size: 100, lr: 2.35e-02, grad_scale: 8.0 +2022-12-07 09:09:02,371 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19030.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:09:04,145 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8709, 2.6198, 1.9509, 2.2505, 1.6172, 2.4682, 2.3165, 0.9242], + device='cuda:2'), covar=tensor([0.4358, 0.0991, 0.4799, 0.2117, 0.1523, 0.1826, 0.1596, 0.3886], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0066, 0.0053, 0.0060, 0.0070, 0.0057, 0.0075, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:10:13,853 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 2.994e+02 3.505e+02 4.615e+02 1.018e+03, threshold=7.009e+02, percent-clipped=3.0 +2022-12-07 09:10:23,710 INFO [train.py:873] (2/4) Epoch 3, batch 4000, loss[loss=0.2132, simple_loss=0.1987, pruned_loss=0.1139, over 4981.00 frames. ], tot_loss[loss=0.2109, simple_loss=0.2128, pruned_loss=0.1045, over 2033290.77 frames. ], batch size: 100, lr: 2.34e-02, grad_scale: 8.0 +2022-12-07 09:10:27,248 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=19129.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 09:10:42,466 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19146.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:10:46,923 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.53 vs. limit=2.0 +2022-12-07 09:10:57,610 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19164.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 09:11:20,478 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=19190.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 09:11:23,546 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19194.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:11:29,887 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.48 vs. limit=2.0 +2022-12-07 09:11:39,102 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19212.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 09:11:39,768 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.420e+02 2.771e+02 3.795e+02 5.006e+02 9.210e+02, threshold=7.589e+02, percent-clipped=4.0 +2022-12-07 09:11:50,340 INFO [train.py:873] (2/4) Epoch 3, batch 4100, loss[loss=0.2223, simple_loss=0.2245, pruned_loss=0.11, over 14160.00 frames. ], tot_loss[loss=0.2109, simple_loss=0.2127, pruned_loss=0.1045, over 2027724.13 frames. ], batch size: 99, lr: 2.34e-02, grad_scale: 8.0 +2022-12-07 09:11:59,753 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1192, 1.7182, 1.0249, 1.1992, 0.8735, 1.1238, 1.1961, 1.0943], + device='cuda:2'), covar=tensor([0.0749, 0.1145, 0.2151, 0.1286, 0.1680, 0.0470, 0.0305, 0.1727], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0011, 0.0011, 0.0012, 0.0013, 0.0012, 0.0016], + device='cuda:2'), out_proj_covar=tensor([3.4222e-05, 3.3148e-05, 3.5796e-05, 3.1909e-05, 3.4940e-05, 3.4899e-05, + 4.2359e-05, 4.5847e-05], device='cuda:2') +2022-12-07 09:12:27,994 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.00 vs. limit=2.0 +2022-12-07 09:12:38,016 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0525, 2.0001, 1.7588, 1.7418, 1.9525, 1.8794, 1.9793, 1.9503], + device='cuda:2'), covar=tensor([0.0965, 0.1024, 0.2289, 0.2895, 0.0913, 0.0937, 0.1840, 0.1022], + device='cuda:2'), in_proj_covar=tensor([0.0210, 0.0179, 0.0249, 0.0313, 0.0200, 0.0228, 0.0243, 0.0194], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:12:42,469 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19286.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:12:44,132 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3175, 1.3473, 1.4911, 1.5101, 1.2581, 1.3794, 1.3548, 0.7551], + device='cuda:2'), covar=tensor([0.2621, 0.1241, 0.0773, 0.0879, 0.1048, 0.0351, 0.1365, 0.2565], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0062, 0.0051, 0.0058, 0.0065, 0.0055, 0.0072, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:13:00,720 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.09 vs. limit=2.0 +2022-12-07 09:13:05,131 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.016e+02 2.865e+02 3.600e+02 4.882e+02 1.320e+03, threshold=7.199e+02, percent-clipped=6.0 +2022-12-07 09:13:15,261 INFO [train.py:873] (2/4) Epoch 3, batch 4200, loss[loss=0.1776, simple_loss=0.1893, pruned_loss=0.08297, over 3891.00 frames. ], tot_loss[loss=0.2108, simple_loss=0.2126, pruned_loss=0.1045, over 1963095.00 frames. ], batch size: 100, lr: 2.33e-02, grad_scale: 8.0 +2022-12-07 09:13:18,469 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.84 vs. limit=2.0 +2022-12-07 09:13:19,733 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19330.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:13:38,357 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1346, 2.9285, 1.8816, 2.2667, 1.9724, 2.8051, 2.6504, 1.1735], + device='cuda:2'), covar=tensor([0.3389, 0.0905, 0.3321, 0.2054, 0.1150, 0.0897, 0.1036, 0.3383], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0065, 0.0052, 0.0057, 0.0067, 0.0056, 0.0074, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:14:01,560 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19378.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:14:01,660 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8151, 1.5191, 2.0362, 1.6181, 2.0198, 1.5216, 1.6694, 1.6856], + device='cuda:2'), covar=tensor([0.0671, 0.1066, 0.0114, 0.0752, 0.0155, 0.0820, 0.0888, 0.0313], + device='cuda:2'), in_proj_covar=tensor([0.0241, 0.0316, 0.0181, 0.0419, 0.0170, 0.0326, 0.0292, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0005, 0.0002, 0.0004, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 09:14:15,289 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=19394.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:14:19,463 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6851, 2.7811, 2.2993, 1.5152, 2.5742, 2.2898, 2.6076, 1.8913], + device='cuda:2'), covar=tensor([0.0486, 0.2714, 0.1264, 0.3032, 0.1038, 0.0468, 0.1188, 0.1664], + device='cuda:2'), in_proj_covar=tensor([0.0077, 0.0214, 0.0100, 0.0131, 0.0087, 0.0086, 0.0072, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0005, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:14:22,115 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0729, 2.0598, 2.0059, 2.1004, 2.0082, 1.8409, 1.1435, 1.9203], + device='cuda:2'), covar=tensor([0.0213, 0.0273, 0.0385, 0.0182, 0.0294, 0.0537, 0.1512, 0.0292], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0116, 0.0111, 0.0092, 0.0150, 0.0103, 0.0148, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:14:26,445 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1086, 0.8455, 0.9812, 0.9892, 0.8632, 0.6013, 1.2961, 1.4074], + device='cuda:2'), covar=tensor([0.0469, 0.0324, 0.0159, 0.1094, 0.0530, 0.0167, 0.0276, 0.0161], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0020, 0.0020, 0.0017, 0.0018, 0.0022, 0.0016, 0.0017], + device='cuda:2'), out_proj_covar=tensor([4.9524e-05, 5.6046e-05, 5.2015e-05, 5.2148e-05, 5.1220e-05, 5.9373e-05, + 4.9578e-05, 4.7922e-05], device='cuda:2') +2022-12-07 09:14:31,170 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.339e+02 3.180e+02 3.832e+02 4.985e+02 8.220e+02, threshold=7.663e+02, percent-clipped=2.0 +2022-12-07 09:14:42,547 INFO [train.py:873] (2/4) Epoch 3, batch 4300, loss[loss=0.2048, simple_loss=0.2139, pruned_loss=0.0978, over 14081.00 frames. ], tot_loss[loss=0.2123, simple_loss=0.2135, pruned_loss=0.1055, over 1902990.21 frames. ], batch size: 29, lr: 2.33e-02, grad_scale: 8.0 +2022-12-07 09:15:08,474 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=19455.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:15:26,207 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.91 vs. limit=2.0 +2022-12-07 09:15:34,009 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19485.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 09:15:36,136 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.26 vs. limit=5.0 +2022-12-07 09:15:57,829 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.656e+02 2.698e+02 3.428e+02 4.598e+02 7.579e+02, threshold=6.855e+02, percent-clipped=0.0 +2022-12-07 09:16:08,044 INFO [train.py:873] (2/4) Epoch 3, batch 4400, loss[loss=0.1979, simple_loss=0.21, pruned_loss=0.09294, over 14255.00 frames. ], tot_loss[loss=0.2101, simple_loss=0.2119, pruned_loss=0.1042, over 1900264.28 frames. ], batch size: 25, lr: 2.32e-02, grad_scale: 8.0 +2022-12-07 09:16:10,675 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8306, 0.7507, 0.5439, 0.9978, 0.8658, 0.3397, 0.8861, 0.9766], + device='cuda:2'), covar=tensor([0.0085, 0.0087, 0.0038, 0.0111, 0.0030, 0.0066, 0.0141, 0.0087], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0020, 0.0020, 0.0016, 0.0019, 0.0023, 0.0017, 0.0018], + device='cuda:2'), out_proj_covar=tensor([5.0192e-05, 5.5937e-05, 5.2612e-05, 5.1800e-05, 5.1507e-05, 6.0632e-05, + 5.1518e-05, 5.0843e-05], device='cuda:2') +2022-12-07 09:16:23,908 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7515, 2.6130, 2.6144, 2.8358, 2.4323, 2.3794, 2.8074, 2.7867], + device='cuda:2'), covar=tensor([0.0814, 0.0729, 0.0719, 0.0607, 0.1079, 0.0713, 0.0738, 0.0690], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0078, 0.0099, 0.0095, 0.0106, 0.0071, 0.0103, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 09:16:29,653 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9919, 1.4540, 3.7818, 3.5560, 3.6112, 3.6674, 3.0704, 3.8702], + device='cuda:2'), covar=tensor([0.1087, 0.1307, 0.0074, 0.0122, 0.0125, 0.0097, 0.0250, 0.0066], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0144, 0.0081, 0.0114, 0.0094, 0.0101, 0.0076, 0.0081], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 09:16:49,684 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-12-07 09:17:00,369 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19586.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:17:23,968 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.141e+02 2.902e+02 3.784e+02 4.919e+02 1.720e+03, threshold=7.568e+02, percent-clipped=8.0 +2022-12-07 09:17:28,273 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=19618.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:17:34,596 INFO [train.py:873] (2/4) Epoch 3, batch 4500, loss[loss=0.1796, simple_loss=0.199, pruned_loss=0.08014, over 13995.00 frames. ], tot_loss[loss=0.2084, simple_loss=0.2107, pruned_loss=0.103, over 1889950.39 frames. ], batch size: 26, lr: 2.31e-02, grad_scale: 8.0 +2022-12-07 09:17:42,294 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19634.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:18:20,859 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=19679.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:18:25,232 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.13 vs. limit=2.0 +2022-12-07 09:18:47,770 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-07 09:18:50,393 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.725e+01 3.027e+02 3.828e+02 4.881e+02 1.178e+03, threshold=7.656e+02, percent-clipped=6.0 +2022-12-07 09:18:59,480 INFO [train.py:873] (2/4) Epoch 3, batch 4600, loss[loss=0.2297, simple_loss=0.2255, pruned_loss=0.117, over 14028.00 frames. ], tot_loss[loss=0.2103, simple_loss=0.212, pruned_loss=0.1043, over 1916920.85 frames. ], batch size: 26, lr: 2.31e-02, grad_scale: 8.0 +2022-12-07 09:19:21,754 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19750.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:19:40,123 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2477, 2.4525, 3.1651, 2.0187, 2.2543, 2.5633, 1.2077, 2.6347], + device='cuda:2'), covar=tensor([0.1112, 0.0820, 0.0527, 0.1462, 0.1420, 0.0794, 0.4021, 0.0823], + device='cuda:2'), in_proj_covar=tensor([0.0065, 0.0066, 0.0064, 0.0072, 0.0087, 0.0063, 0.0129, 0.0067], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 09:19:52,032 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=19785.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 09:20:16,183 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.596e+02 3.281e+02 3.991e+02 5.326e+02 1.100e+03, threshold=7.982e+02, percent-clipped=6.0 +2022-12-07 09:20:26,812 INFO [train.py:873] (2/4) Epoch 3, batch 4700, loss[loss=0.2255, simple_loss=0.222, pruned_loss=0.1145, over 14169.00 frames. ], tot_loss[loss=0.2114, simple_loss=0.213, pruned_loss=0.1049, over 1961662.81 frames. ], batch size: 29, lr: 2.30e-02, grad_scale: 8.0 +2022-12-07 09:20:33,469 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=19833.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 09:20:42,557 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0596, 1.0291, 1.0028, 1.1355, 0.8751, 0.6835, 0.3612, 0.8161], + device='cuda:2'), covar=tensor([0.0102, 0.0168, 0.0095, 0.0117, 0.0353, 0.0259, 0.0111, 0.0486], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0012, 0.0012, 0.0011, 0.0013, 0.0014, 0.0013, 0.0016], + device='cuda:2'), out_proj_covar=tensor([3.7191e-05, 3.4878e-05, 3.8953e-05, 3.4397e-05, 3.7133e-05, 3.6136e-05, + 4.5382e-05, 4.7734e-05], device='cuda:2') +2022-12-07 09:20:47,753 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7696, 1.9722, 3.0568, 3.0428, 2.8711, 2.1718, 3.0151, 2.2669], + device='cuda:2'), covar=tensor([0.0058, 0.0141, 0.0125, 0.0087, 0.0051, 0.0237, 0.0034, 0.0186], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0157, 0.0197, 0.0167, 0.0142, 0.0201, 0.0107, 0.0192], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 09:21:04,679 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4674, 2.4263, 2.0175, 1.4275, 2.3861, 2.1715, 2.5204, 1.8726], + device='cuda:2'), covar=tensor([0.0477, 0.2575, 0.1236, 0.2666, 0.0897, 0.0465, 0.0875, 0.1712], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0213, 0.0097, 0.0128, 0.0084, 0.0085, 0.0074, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0005, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:21:43,694 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.462e+02 3.110e+02 3.871e+02 4.666e+02 7.831e+02, threshold=7.741e+02, percent-clipped=0.0 +2022-12-07 09:21:52,843 INFO [train.py:873] (2/4) Epoch 3, batch 4800, loss[loss=0.2094, simple_loss=0.2057, pruned_loss=0.1065, over 5998.00 frames. ], tot_loss[loss=0.2114, simple_loss=0.2124, pruned_loss=0.1052, over 1921486.02 frames. ], batch size: 100, lr: 2.30e-02, grad_scale: 8.0 +2022-12-07 09:21:57,562 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-12-07 09:22:22,005 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-07 09:22:35,317 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=19974.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:23:13,337 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.353e+02 2.781e+02 3.610e+02 5.418e+02 1.055e+03, threshold=7.220e+02, percent-clipped=5.0 +2022-12-07 09:23:19,212 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20020.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:23:23,445 INFO [train.py:873] (2/4) Epoch 3, batch 4900, loss[loss=0.2389, simple_loss=0.2289, pruned_loss=0.1244, over 6926.00 frames. ], tot_loss[loss=0.2116, simple_loss=0.2127, pruned_loss=0.1052, over 1911970.01 frames. ], batch size: 100, lr: 2.29e-02, grad_scale: 8.0 +2022-12-07 09:23:29,350 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0 +2022-12-07 09:23:44,333 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20050.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:24:10,870 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20081.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:24:12,385 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8594, 1.2400, 2.4815, 2.3348, 2.4572, 2.2773, 1.6423, 2.4382], + device='cuda:2'), covar=tensor([0.0582, 0.0878, 0.0108, 0.0217, 0.0164, 0.0104, 0.0317, 0.0128], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0145, 0.0085, 0.0114, 0.0095, 0.0104, 0.0078, 0.0082], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 09:24:16,269 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.68 vs. limit=2.0 +2022-12-07 09:24:17,502 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20089.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:24:25,506 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20098.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:24:37,974 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20112.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:24:39,471 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.119e+02 3.078e+02 3.886e+02 5.087e+02 8.123e+02, threshold=7.771e+02, percent-clipped=4.0 +2022-12-07 09:24:48,394 INFO [train.py:873] (2/4) Epoch 3, batch 5000, loss[loss=0.2217, simple_loss=0.2241, pruned_loss=0.1096, over 14344.00 frames. ], tot_loss[loss=0.2096, simple_loss=0.2119, pruned_loss=0.1036, over 1917253.73 frames. ], batch size: 66, lr: 2.29e-02, grad_scale: 8.0 +2022-12-07 09:25:10,544 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20150.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:25:29,494 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20173.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:25:56,520 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20204.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:26:04,802 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.185e+02 2.928e+02 3.762e+02 4.834e+02 9.442e+02, threshold=7.523e+02, percent-clipped=4.0 +2022-12-07 09:26:14,944 INFO [train.py:873] (2/4) Epoch 3, batch 5100, loss[loss=0.1636, simple_loss=0.1582, pruned_loss=0.08447, over 3826.00 frames. ], tot_loss[loss=0.2094, simple_loss=0.2118, pruned_loss=0.1035, over 1939345.10 frames. ], batch size: 100, lr: 2.28e-02, grad_scale: 8.0 +2022-12-07 09:26:49,043 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20265.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:26:56,637 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20274.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:27:30,744 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.615e+02 2.933e+02 3.750e+02 4.548e+02 8.499e+02, threshold=7.500e+02, percent-clipped=1.0 +2022-12-07 09:27:37,606 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20322.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:27:39,974 INFO [train.py:873] (2/4) Epoch 3, batch 5200, loss[loss=0.1924, simple_loss=0.1859, pruned_loss=0.09943, over 3848.00 frames. ], tot_loss[loss=0.2099, simple_loss=0.2122, pruned_loss=0.1038, over 1978657.19 frames. ], batch size: 100, lr: 2.28e-02, grad_scale: 8.0 +2022-12-07 09:28:23,151 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20376.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:28:33,324 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7891, 1.5423, 2.1509, 1.8104, 2.1861, 1.7621, 1.8350, 2.0324], + device='cuda:2'), covar=tensor([0.0076, 0.0254, 0.0037, 0.0083, 0.0037, 0.0088, 0.0045, 0.0095], + device='cuda:2'), in_proj_covar=tensor([0.0223, 0.0345, 0.0208, 0.0293, 0.0254, 0.0232, 0.0264, 0.0367], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 09:28:34,137 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20388.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:28:56,776 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.552e+02 2.824e+02 3.945e+02 4.935e+02 1.895e+03, threshold=7.890e+02, percent-clipped=6.0 +2022-12-07 09:28:59,497 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20418.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:29:06,139 INFO [train.py:873] (2/4) Epoch 3, batch 5300, loss[loss=0.1727, simple_loss=0.1811, pruned_loss=0.08216, over 14038.00 frames. ], tot_loss[loss=0.2086, simple_loss=0.2113, pruned_loss=0.103, over 1960910.22 frames. ], batch size: 22, lr: 2.27e-02, grad_scale: 4.0 +2022-12-07 09:29:23,389 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20445.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:29:26,908 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20449.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:29:43,753 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20468.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:29:52,984 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20479.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:30:23,816 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.141e+02 2.763e+02 3.648e+02 4.928e+02 1.065e+03, threshold=7.297e+02, percent-clipped=2.0 +2022-12-07 09:30:32,155 INFO [train.py:873] (2/4) Epoch 3, batch 5400, loss[loss=0.2165, simple_loss=0.2217, pruned_loss=0.1057, over 14228.00 frames. ], tot_loss[loss=0.2089, simple_loss=0.2113, pruned_loss=0.1033, over 1919025.67 frames. ], batch size: 80, lr: 2.27e-02, grad_scale: 4.0 +2022-12-07 09:31:01,044 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.81 vs. limit=2.0 +2022-12-07 09:31:02,911 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20560.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:31:05,573 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6300, 4.4662, 4.9065, 3.8406, 4.5301, 5.0079, 1.8218, 4.4988], + device='cuda:2'), covar=tensor([0.0149, 0.0209, 0.0275, 0.0494, 0.0250, 0.0095, 0.2835, 0.0180], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0116, 0.0108, 0.0093, 0.0150, 0.0101, 0.0146, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0004, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:31:27,532 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=20588.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:31:50,019 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.464e+02 2.948e+02 3.775e+02 4.848e+02 6.830e+02, threshold=7.550e+02, percent-clipped=0.0 +2022-12-07 09:31:59,005 INFO [train.py:873] (2/4) Epoch 3, batch 5500, loss[loss=0.1937, simple_loss=0.207, pruned_loss=0.09022, over 13846.00 frames. ], tot_loss[loss=0.206, simple_loss=0.2099, pruned_loss=0.1011, over 1946354.72 frames. ], batch size: 23, lr: 2.26e-02, grad_scale: 4.0 +2022-12-07 09:32:19,613 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=20649.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:32:42,424 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20676.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:33:15,378 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.510e+02 2.742e+02 3.691e+02 4.963e+02 1.196e+03, threshold=7.382e+02, percent-clipped=6.0 +2022-12-07 09:33:22,784 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20724.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:33:23,569 INFO [train.py:873] (2/4) Epoch 3, batch 5600, loss[loss=0.2116, simple_loss=0.2136, pruned_loss=0.1048, over 14335.00 frames. ], tot_loss[loss=0.2071, simple_loss=0.2106, pruned_loss=0.1018, over 1956186.59 frames. ], batch size: 73, lr: 2.26e-02, grad_scale: 8.0 +2022-12-07 09:33:40,076 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20744.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:33:41,233 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20745.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:34:00,984 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20768.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:34:06,487 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20774.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:34:24,029 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20793.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:34:24,682 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.85 vs. limit=2.0 +2022-12-07 09:34:43,760 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.806e+02 3.037e+02 4.375e+02 5.410e+02 1.130e+03, threshold=8.749e+02, percent-clipped=7.0 +2022-12-07 09:34:44,729 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20816.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:34:53,314 INFO [train.py:873] (2/4) Epoch 3, batch 5700, loss[loss=0.2073, simple_loss=0.2201, pruned_loss=0.09729, over 14114.00 frames. ], tot_loss[loss=0.2073, simple_loss=0.2107, pruned_loss=0.102, over 1919315.83 frames. ], batch size: 29, lr: 2.25e-02, grad_scale: 8.0 +2022-12-07 09:35:24,816 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=20860.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:35:28,562 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8859, 3.9095, 2.9739, 4.9156, 4.4219, 4.8472, 4.2550, 3.3314], + device='cuda:2'), covar=tensor([0.0138, 0.0569, 0.2618, 0.0108, 0.0256, 0.0524, 0.0452, 0.2250], + device='cuda:2'), in_proj_covar=tensor([0.0207, 0.0279, 0.0335, 0.0169, 0.0219, 0.0205, 0.0252, 0.0330], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:35:30,679 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8339, 1.1785, 1.2283, 1.1804, 0.8465, 1.0764, 0.9927, 0.7415], + device='cuda:2'), covar=tensor([0.1817, 0.0735, 0.0396, 0.0395, 0.1317, 0.0320, 0.0835, 0.0986], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0062, 0.0051, 0.0055, 0.0066, 0.0057, 0.0073, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:36:06,592 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=20908.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:36:12,609 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.426e+01 2.770e+02 3.515e+02 4.337e+02 1.206e+03, threshold=7.030e+02, percent-clipped=1.0 +2022-12-07 09:36:21,396 INFO [train.py:873] (2/4) Epoch 3, batch 5800, loss[loss=0.2031, simple_loss=0.2122, pruned_loss=0.09702, over 14232.00 frames. ], tot_loss[loss=0.2074, simple_loss=0.2107, pruned_loss=0.1021, over 1941108.90 frames. ], batch size: 89, lr: 2.25e-02, grad_scale: 8.0 +2022-12-07 09:36:37,650 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=20944.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:37:39,552 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.364e+02 2.882e+02 3.467e+02 4.356e+02 1.149e+03, threshold=6.934e+02, percent-clipped=1.0 +2022-12-07 09:37:47,928 INFO [train.py:873] (2/4) Epoch 3, batch 5900, loss[loss=0.2361, simple_loss=0.2353, pruned_loss=0.1184, over 14412.00 frames. ], tot_loss[loss=0.2083, simple_loss=0.2109, pruned_loss=0.1028, over 1917211.14 frames. ], batch size: 53, lr: 2.24e-02, grad_scale: 8.0 +2022-12-07 09:38:04,785 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21044.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:38:11,936 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21052.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:38:15,276 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0204, 2.0366, 1.7160, 1.7083, 1.9831, 1.8980, 1.9865, 1.9684], + device='cuda:2'), covar=tensor([0.0897, 0.0650, 0.1800, 0.2558, 0.0911, 0.0915, 0.1317, 0.1057], + device='cuda:2'), in_proj_covar=tensor([0.0223, 0.0194, 0.0271, 0.0354, 0.0224, 0.0254, 0.0261, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:38:32,101 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21074.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:38:47,685 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21092.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:39:05,590 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.72 vs. limit=2.0 +2022-12-07 09:39:07,933 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21113.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:39:09,616 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.016e+02 3.189e+02 4.370e+02 6.428e+02 1.112e+03, threshold=8.740e+02, percent-clipped=21.0 +2022-12-07 09:39:12,495 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21118.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:39:16,020 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21122.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:39:18,491 INFO [train.py:873] (2/4) Epoch 3, batch 6000, loss[loss=0.1959, simple_loss=0.1899, pruned_loss=0.101, over 4991.00 frames. ], tot_loss[loss=0.2084, simple_loss=0.2111, pruned_loss=0.1029, over 1953670.15 frames. ], batch size: 100, lr: 2.24e-02, grad_scale: 8.0 +2022-12-07 09:39:18,491 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 09:39:27,745 INFO [train.py:905] (2/4) Epoch 3, validation: loss=0.1295, simple_loss=0.172, pruned_loss=0.04354, over 857387.00 frames. +2022-12-07 09:39:27,746 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 09:40:08,900 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4039, 0.9168, 1.3481, 0.8704, 1.1596, 1.3306, 1.1229, 1.2160], + device='cuda:2'), covar=tensor([0.0251, 0.0847, 0.0431, 0.0333, 0.0876, 0.0459, 0.0317, 0.0716], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0225, 0.0104, 0.0134, 0.0092, 0.0089, 0.0077, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0005, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:40:16,130 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21179.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:40:18,335 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.36 vs. limit=5.0 +2022-12-07 09:40:24,391 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3059, 1.3774, 2.4720, 1.4014, 2.4144, 2.4459, 1.9533, 2.5293], + device='cuda:2'), covar=tensor([0.0160, 0.1299, 0.0185, 0.1147, 0.0212, 0.0249, 0.0626, 0.0125], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0147, 0.0111, 0.0158, 0.0130, 0.0116, 0.0100, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 09:40:43,103 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.70 vs. limit=5.0 +2022-12-07 09:40:46,033 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-12-07 09:40:47,317 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.278e+02 2.723e+02 3.536e+02 4.487e+02 9.124e+02, threshold=7.073e+02, percent-clipped=3.0 +2022-12-07 09:40:55,463 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3184, 3.1350, 2.6886, 1.7748, 2.9003, 2.8284, 3.5319, 2.4118], + device='cuda:2'), covar=tensor([0.0374, 0.2796, 0.1072, 0.2805, 0.0784, 0.0524, 0.0368, 0.1353], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0222, 0.0102, 0.0132, 0.0091, 0.0086, 0.0075, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0005, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:40:56,102 INFO [train.py:873] (2/4) Epoch 3, batch 6100, loss[loss=0.1852, simple_loss=0.1978, pruned_loss=0.08631, over 14261.00 frames. ], tot_loss[loss=0.2082, simple_loss=0.2111, pruned_loss=0.1027, over 1950855.60 frames. ], batch size: 35, lr: 2.23e-02, grad_scale: 8.0 +2022-12-07 09:41:03,040 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4276, 4.7874, 4.1760, 3.8771, 4.5407, 4.6600, 5.0168, 4.5940], + device='cuda:2'), covar=tensor([0.1845, 0.0546, 0.2133, 0.4215, 0.0885, 0.0971, 0.0894, 0.1803], + device='cuda:2'), in_proj_covar=tensor([0.0228, 0.0197, 0.0273, 0.0360, 0.0225, 0.0257, 0.0266, 0.0215], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 09:41:05,455 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.6600, 0.5907, 0.6347, 0.7302, 0.6033, 0.3047, 0.8840, 0.2551], + device='cuda:2'), covar=tensor([0.0213, 0.0103, 0.0171, 0.0152, 0.0295, 0.0274, 0.0133, 0.0437], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0012, 0.0011, 0.0011, 0.0014, 0.0013, 0.0015], + device='cuda:2'), out_proj_covar=tensor([3.6669e-05, 3.5967e-05, 4.0512e-05, 3.5733e-05, 3.6164e-05, 4.1335e-05, + 4.8184e-05, 5.0256e-05], device='cuda:2') +2022-12-07 09:41:12,181 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21244.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:41:16,375 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1258, 1.9526, 2.0881, 2.1882, 2.1930, 2.1219, 2.2809, 1.8855], + device='cuda:2'), covar=tensor([0.0550, 0.1127, 0.0447, 0.0617, 0.0647, 0.0482, 0.0624, 0.0647], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0188, 0.0125, 0.0115, 0.0116, 0.0102, 0.0174, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 09:41:54,615 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21292.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:41:55,995 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.20 vs. limit=2.0 +2022-12-07 09:42:11,634 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21312.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:42:14,077 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.862e+02 3.459e+02 4.303e+02 5.209e+02 1.109e+03, threshold=8.606e+02, percent-clipped=4.0 +2022-12-07 09:42:22,734 INFO [train.py:873] (2/4) Epoch 3, batch 6200, loss[loss=0.221, simple_loss=0.2245, pruned_loss=0.1088, over 14413.00 frames. ], tot_loss[loss=0.2084, simple_loss=0.2114, pruned_loss=0.1028, over 1969209.30 frames. ], batch size: 41, lr: 2.23e-02, grad_scale: 8.0 +2022-12-07 09:42:25,343 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21328.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:42:46,453 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0841, 2.9985, 3.1120, 2.7991, 2.9852, 2.8520, 1.3326, 2.8874], + device='cuda:2'), covar=tensor([0.0204, 0.0262, 0.0344, 0.0357, 0.0264, 0.0634, 0.2542, 0.0252], + device='cuda:2'), in_proj_covar=tensor([0.0112, 0.0120, 0.0112, 0.0096, 0.0148, 0.0103, 0.0148, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:43:04,559 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21373.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:43:16,256 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.17 vs. limit=2.0 +2022-12-07 09:43:18,590 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21389.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:43:20,587 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0 +2022-12-07 09:43:25,175 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7142, 4.0439, 3.3966, 4.9387, 4.3256, 4.6491, 3.8349, 3.3135], + device='cuda:2'), covar=tensor([0.0208, 0.0577, 0.2695, 0.0146, 0.0273, 0.0460, 0.0740, 0.2491], + device='cuda:2'), in_proj_covar=tensor([0.0215, 0.0283, 0.0329, 0.0173, 0.0221, 0.0209, 0.0260, 0.0331], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 09:43:34,673 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21408.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:43:39,738 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0538, 2.2338, 2.9866, 2.3902, 3.1691, 2.9669, 2.8692, 2.4825], + device='cuda:2'), covar=tensor([0.0110, 0.1037, 0.0158, 0.0674, 0.0209, 0.0211, 0.0378, 0.0790], + device='cuda:2'), in_proj_covar=tensor([0.0224, 0.0332, 0.0218, 0.0285, 0.0249, 0.0233, 0.0255, 0.0347], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:43:41,036 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.343e+02 2.777e+02 3.357e+02 4.403e+02 6.655e+02, threshold=6.715e+02, percent-clipped=0.0 +2022-12-07 09:43:49,957 INFO [train.py:873] (2/4) Epoch 3, batch 6300, loss[loss=0.178, simple_loss=0.1626, pruned_loss=0.09674, over 1237.00 frames. ], tot_loss[loss=0.2069, simple_loss=0.2105, pruned_loss=0.1017, over 1969625.18 frames. ], batch size: 100, lr: 2.22e-02, grad_scale: 8.0 +2022-12-07 09:44:14,304 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-12-07 09:44:25,113 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3336, 0.7556, 0.9871, 1.2858, 0.9505, 0.6606, 1.4752, 1.0541], + device='cuda:2'), covar=tensor([0.0270, 0.0220, 0.0142, 0.0316, 0.0431, 0.0177, 0.0235, 0.0468], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0019, 0.0020, 0.0016, 0.0017, 0.0023, 0.0019, 0.0018], + device='cuda:2'), out_proj_covar=tensor([5.1538e-05, 5.7713e-05, 5.3900e-05, 5.4723e-05, 5.2506e-05, 6.4246e-05, + 5.9715e-05, 5.2809e-05], device='cuda:2') +2022-12-07 09:44:28,720 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.11 vs. limit=2.0 +2022-12-07 09:44:32,739 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21474.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:44:45,041 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.95 vs. limit=5.0 +2022-12-07 09:45:07,700 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.209e+02 2.996e+02 3.929e+02 5.404e+02 9.451e+02, threshold=7.857e+02, percent-clipped=12.0 +2022-12-07 09:45:16,311 INFO [train.py:873] (2/4) Epoch 3, batch 6400, loss[loss=0.1933, simple_loss=0.2148, pruned_loss=0.08588, over 13923.00 frames. ], tot_loss[loss=0.2062, simple_loss=0.21, pruned_loss=0.1012, over 1954914.94 frames. ], batch size: 26, lr: 2.22e-02, grad_scale: 8.0 +2022-12-07 09:45:44,150 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2592, 4.7430, 4.6874, 5.2319, 4.9242, 4.4409, 5.2499, 4.5092], + device='cuda:2'), covar=tensor([0.0295, 0.0864, 0.0274, 0.0398, 0.0704, 0.0456, 0.0424, 0.0464], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0189, 0.0123, 0.0114, 0.0121, 0.0106, 0.0175, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 09:46:16,427 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21595.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:46:32,403 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21613.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:46:33,697 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.19 vs. limit=2.0 +2022-12-07 09:46:34,034 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.437e+02 2.688e+02 3.501e+02 4.661e+02 7.945e+02, threshold=7.001e+02, percent-clipped=1.0 +2022-12-07 09:46:35,469 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.04 vs. limit=2.0 +2022-12-07 09:46:43,050 INFO [train.py:873] (2/4) Epoch 3, batch 6500, loss[loss=0.1931, simple_loss=0.2041, pruned_loss=0.0911, over 14190.00 frames. ], tot_loss[loss=0.207, simple_loss=0.2105, pruned_loss=0.1018, over 1920800.62 frames. ], batch size: 37, lr: 2.21e-02, grad_scale: 8.0 +2022-12-07 09:47:09,421 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21656.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:47:19,621 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21668.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:47:24,802 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21674.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:47:33,101 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21684.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:47:42,153 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9122, 1.5770, 4.1375, 2.0187, 3.9991, 4.5833, 4.3425, 5.1202], + device='cuda:2'), covar=tensor([0.0105, 0.2658, 0.0323, 0.1961, 0.0324, 0.0215, 0.0181, 0.0096], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0149, 0.0112, 0.0160, 0.0131, 0.0119, 0.0100, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 09:47:54,434 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21708.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:48:00,153 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.571e+02 3.136e+02 4.193e+02 5.448e+02 1.293e+03, threshold=8.387e+02, percent-clipped=8.0 +2022-12-07 09:48:08,229 INFO [train.py:873] (2/4) Epoch 3, batch 6600, loss[loss=0.1948, simple_loss=0.2077, pruned_loss=0.09091, over 14232.00 frames. ], tot_loss[loss=0.2064, simple_loss=0.2101, pruned_loss=0.1013, over 1927752.85 frames. ], batch size: 37, lr: 2.21e-02, grad_scale: 8.0 +2022-12-07 09:48:34,766 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21756.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:48:40,754 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21763.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:48:51,187 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21774.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:49:27,683 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.343e+02 2.849e+02 3.705e+02 5.166e+02 8.548e+02, threshold=7.409e+02, percent-clipped=1.0 +2022-12-07 09:49:34,274 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=21822.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:49:36,097 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21824.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:49:36,764 INFO [train.py:873] (2/4) Epoch 3, batch 6700, loss[loss=0.1755, simple_loss=0.1916, pruned_loss=0.07968, over 14234.00 frames. ], tot_loss[loss=0.2069, simple_loss=0.2102, pruned_loss=0.1018, over 1902518.49 frames. ], batch size: 69, lr: 2.20e-02, grad_scale: 8.0 +2022-12-07 09:49:43,865 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-12-07 09:49:58,005 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21850.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:50:06,899 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8208, 3.4483, 3.6072, 3.9298, 3.5715, 2.8513, 3.9353, 3.8747], + device='cuda:2'), covar=tensor([0.0667, 0.0661, 0.0513, 0.0541, 0.0745, 0.0535, 0.0579, 0.0659], + device='cuda:2'), in_proj_covar=tensor([0.0101, 0.0081, 0.0097, 0.0098, 0.0107, 0.0074, 0.0103, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 09:50:48,538 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.05 vs. limit=2.0 +2022-12-07 09:50:51,726 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=21911.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:50:54,926 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.098e+02 2.679e+02 3.700e+02 5.034e+02 8.925e+02, threshold=7.401e+02, percent-clipped=4.0 +2022-12-07 09:51:03,449 INFO [train.py:873] (2/4) Epoch 3, batch 6800, loss[loss=0.2254, simple_loss=0.2237, pruned_loss=0.1135, over 14284.00 frames. ], tot_loss[loss=0.2058, simple_loss=0.2095, pruned_loss=0.101, over 1948635.34 frames. ], batch size: 66, lr: 2.20e-02, grad_scale: 8.0 +2022-12-07 09:51:26,040 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21951.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:51:40,255 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21968.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:51:41,077 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=21969.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:51:42,344 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=21970.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:51:54,849 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=21984.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:52:11,958 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-12-07 09:52:18,671 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1164, 2.8976, 2.4617, 2.4132, 1.8057, 2.5403, 2.6369, 1.2553], + device='cuda:2'), covar=tensor([0.3290, 0.0841, 0.2163, 0.1751, 0.1087, 0.0836, 0.1469, 0.3098], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0064, 0.0050, 0.0057, 0.0070, 0.0058, 0.0079, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:52:21,335 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.545e+02 2.962e+02 3.808e+02 5.028e+02 7.344e+02, threshold=7.616e+02, percent-clipped=0.0 +2022-12-07 09:52:22,234 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22016.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:52:22,245 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0529, 2.8218, 2.7674, 3.1241, 2.6701, 2.4127, 3.0830, 3.0813], + device='cuda:2'), covar=tensor([0.0846, 0.0736, 0.0845, 0.0680, 0.0984, 0.0914, 0.0916, 0.0735], + device='cuda:2'), in_proj_covar=tensor([0.0102, 0.0082, 0.0100, 0.0099, 0.0111, 0.0076, 0.0106, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 09:52:29,726 INFO [train.py:873] (2/4) Epoch 3, batch 6900, loss[loss=0.1921, simple_loss=0.2042, pruned_loss=0.09, over 14283.00 frames. ], tot_loss[loss=0.2063, simple_loss=0.2101, pruned_loss=0.1012, over 1951511.30 frames. ], batch size: 69, lr: 2.19e-02, grad_scale: 8.0 +2022-12-07 09:52:35,062 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22031.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:52:35,739 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22032.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:52:52,958 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5727, 2.9315, 4.1991, 2.6500, 2.4069, 3.1119, 1.2965, 3.0389], + device='cuda:2'), covar=tensor([0.2290, 0.0764, 0.0239, 0.1629, 0.1576, 0.0974, 0.5020, 0.1084], + device='cuda:2'), in_proj_covar=tensor([0.0061, 0.0066, 0.0067, 0.0073, 0.0086, 0.0059, 0.0134, 0.0067], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 09:53:23,154 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.96 vs. limit=5.0 +2022-12-07 09:53:36,640 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7718, 4.5511, 4.0934, 4.2715, 4.1934, 4.5987, 4.7427, 4.7647], + device='cuda:2'), covar=tensor([0.0610, 0.0460, 0.1566, 0.1999, 0.0715, 0.0453, 0.0912, 0.0588], + device='cuda:2'), in_proj_covar=tensor([0.0229, 0.0197, 0.0280, 0.0363, 0.0226, 0.0258, 0.0267, 0.0216], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:53:44,989 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=9.20 vs. limit=5.0 +2022-12-07 09:53:46,969 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.783e+02 2.990e+02 4.011e+02 5.296e+02 1.371e+03, threshold=8.022e+02, percent-clipped=6.0 +2022-12-07 09:53:50,265 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22119.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:53:55,140 INFO [train.py:873] (2/4) Epoch 3, batch 7000, loss[loss=0.1204, simple_loss=0.1503, pruned_loss=0.04523, over 13584.00 frames. ], tot_loss[loss=0.206, simple_loss=0.2099, pruned_loss=0.101, over 2023104.70 frames. ], batch size: 17, lr: 2.19e-02, grad_scale: 8.0 +2022-12-07 09:53:59,773 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22130.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:54:39,490 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22176.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:54:52,539 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22191.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:55:05,116 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22206.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:55:12,926 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.440e+02 2.681e+02 3.418e+02 4.673e+02 9.486e+02, threshold=6.836e+02, percent-clipped=4.0 +2022-12-07 09:55:13,357 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22215.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:55:21,728 INFO [train.py:873] (2/4) Epoch 3, batch 7100, loss[loss=0.172, simple_loss=0.1928, pruned_loss=0.07563, over 14508.00 frames. ], tot_loss[loss=0.2048, simple_loss=0.2093, pruned_loss=0.1002, over 1983896.83 frames. ], batch size: 34, lr: 2.18e-02, grad_scale: 8.0 +2022-12-07 09:55:32,445 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22237.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:55:32,465 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22237.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:55:41,850 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.77 vs. limit=2.0 +2022-12-07 09:55:44,106 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22251.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:55:53,172 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-12-07 09:55:59,950 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22269.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:56:06,058 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22276.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:56:24,906 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22298.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:56:25,591 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22299.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:56:38,994 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.756e+01 2.795e+02 3.480e+02 4.240e+02 8.610e+02, threshold=6.960e+02, percent-clipped=4.0 +2022-12-07 09:56:40,766 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22317.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:56:46,872 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9169, 1.5370, 1.8852, 1.1501, 1.5589, 1.7359, 1.8317, 1.6217], + device='cuda:2'), covar=tensor([0.0478, 0.1322, 0.0657, 0.1984, 0.0994, 0.0733, 0.0510, 0.1577], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0222, 0.0103, 0.0127, 0.0092, 0.0088, 0.0082, 0.0115], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0005, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:56:47,594 INFO [train.py:873] (2/4) Epoch 3, batch 7200, loss[loss=0.1946, simple_loss=0.1953, pruned_loss=0.09695, over 4936.00 frames. ], tot_loss[loss=0.207, simple_loss=0.2106, pruned_loss=0.1017, over 2001927.18 frames. ], batch size: 100, lr: 2.18e-02, grad_scale: 8.0 +2022-12-07 09:56:48,419 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22326.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:57:20,875 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6228, 4.3261, 4.0126, 4.1318, 4.2644, 4.2969, 4.5980, 4.5167], + device='cuda:2'), covar=tensor([0.0548, 0.0572, 0.1579, 0.2074, 0.0692, 0.0579, 0.0777, 0.0763], + device='cuda:2'), in_proj_covar=tensor([0.0232, 0.0198, 0.0277, 0.0366, 0.0221, 0.0261, 0.0269, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 09:57:21,237 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.71 vs. limit=5.0 +2022-12-07 09:57:39,354 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22385.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:57:48,256 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.61 vs. limit=5.0 +2022-12-07 09:58:05,271 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.487e+02 2.951e+02 3.875e+02 4.878e+02 1.117e+03, threshold=7.750e+02, percent-clipped=7.0 +2022-12-07 09:58:09,128 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22419.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:58:13,856 INFO [train.py:873] (2/4) Epoch 3, batch 7300, loss[loss=0.2187, simple_loss=0.2247, pruned_loss=0.1063, over 14257.00 frames. ], tot_loss[loss=0.2052, simple_loss=0.2092, pruned_loss=0.1006, over 2030505.20 frames. ], batch size: 25, lr: 2.17e-02, grad_scale: 16.0 +2022-12-07 09:58:31,560 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22446.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:58:49,751 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22467.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:59:06,170 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22486.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:59:18,633 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9955, 3.8185, 4.1570, 3.6238, 3.9400, 4.0660, 1.4104, 3.8072], + device='cuda:2'), covar=tensor([0.0155, 0.0272, 0.0318, 0.0388, 0.0223, 0.0204, 0.3096, 0.0233], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0123, 0.0113, 0.0098, 0.0154, 0.0106, 0.0152, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 09:59:23,403 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22506.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 09:59:31,053 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.364e+02 2.878e+02 3.644e+02 4.636e+02 1.010e+03, threshold=7.288e+02, percent-clipped=2.0 +2022-12-07 09:59:39,443 INFO [train.py:873] (2/4) Epoch 3, batch 7400, loss[loss=0.234, simple_loss=0.2201, pruned_loss=0.1239, over 6933.00 frames. ], tot_loss[loss=0.2054, simple_loss=0.2092, pruned_loss=0.1008, over 1962210.91 frames. ], batch size: 100, lr: 2.17e-02, grad_scale: 16.0 +2022-12-07 09:59:45,698 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22532.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:00:05,238 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22554.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:00:19,770 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22571.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:00:38,844 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22593.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:00:58,627 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.482e+02 2.799e+02 3.655e+02 4.825e+02 8.691e+02, threshold=7.309e+02, percent-clipped=2.0 +2022-12-07 10:01:06,422 INFO [train.py:873] (2/4) Epoch 3, batch 7500, loss[loss=0.222, simple_loss=0.225, pruned_loss=0.1095, over 14082.00 frames. ], tot_loss[loss=0.2048, simple_loss=0.209, pruned_loss=0.1003, over 1975434.73 frames. ], batch size: 29, lr: 2.16e-02, grad_scale: 8.0 +2022-12-07 10:01:07,396 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22626.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:01:11,865 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8426, 3.5132, 2.8701, 4.9667, 3.9715, 4.9156, 3.9607, 2.9752], + device='cuda:2'), covar=tensor([0.0266, 0.0962, 0.4419, 0.0257, 0.0709, 0.0437, 0.0985, 0.3736], + device='cuda:2'), in_proj_covar=tensor([0.0221, 0.0284, 0.0323, 0.0179, 0.0231, 0.0217, 0.0259, 0.0327], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 10:01:18,030 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22639.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:01:26,373 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7948, 3.7440, 3.9244, 3.4617, 3.7268, 3.7576, 1.3118, 3.5994], + device='cuda:2'), covar=tensor([0.0157, 0.0201, 0.0335, 0.0355, 0.0244, 0.0291, 0.2778, 0.0199], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0117, 0.0111, 0.0096, 0.0151, 0.0102, 0.0146, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:01:42,991 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.42 vs. limit=2.0 +2022-12-07 10:01:44,222 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22674.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:02:33,696 INFO [train.py:873] (2/4) Epoch 4, batch 0, loss[loss=0.2456, simple_loss=0.2536, pruned_loss=0.1188, over 14228.00 frames. ], tot_loss[loss=0.2456, simple_loss=0.2536, pruned_loss=0.1188, over 14228.00 frames. ], batch size: 37, lr: 2.02e-02, grad_scale: 8.0 +2022-12-07 10:02:33,696 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 10:02:40,697 INFO [train.py:905] (2/4) Epoch 4, validation: loss=0.1426, simple_loss=0.185, pruned_loss=0.0501, over 857387.00 frames. +2022-12-07 10:02:40,698 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 10:02:52,911 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22700.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 10:03:06,590 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.563e+01 2.095e+02 3.755e+02 5.061e+02 1.365e+03, threshold=7.510e+02, percent-clipped=11.0 +2022-12-07 10:03:09,798 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-12-07 10:03:18,497 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9126, 0.8717, 0.9951, 0.9411, 1.0402, 0.3241, 1.0079, 1.0827], + device='cuda:2'), covar=tensor([0.0277, 0.0538, 0.0162, 0.0230, 0.0192, 0.0182, 0.0359, 0.0310], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0018, 0.0019, 0.0015, 0.0016, 0.0022, 0.0017, 0.0016], + device='cuda:2'), out_proj_covar=tensor([5.2464e-05, 5.5313e-05, 5.3260e-05, 5.2293e-05, 4.9585e-05, 6.4558e-05, + 5.8440e-05, 5.1067e-05], device='cuda:2') +2022-12-07 10:03:28,831 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22741.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:04:08,129 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22786.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:04:08,914 INFO [train.py:873] (2/4) Epoch 4, batch 100, loss[loss=0.2206, simple_loss=0.2161, pruned_loss=0.1126, over 4981.00 frames. ], tot_loss[loss=0.2056, simple_loss=0.2105, pruned_loss=0.1004, over 877532.21 frames. ], batch size: 100, lr: 2.02e-02, grad_scale: 8.0 +2022-12-07 10:04:14,993 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22794.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:04:16,913 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.64 vs. limit=5.0 +2022-12-07 10:04:33,905 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.273e+02 2.903e+02 3.464e+02 4.472e+02 7.664e+02, threshold=6.927e+02, percent-clipped=2.0 +2022-12-07 10:04:41,863 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=22825.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:04:48,055 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22832.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:04:49,740 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22834.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:05:00,370 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9792, 3.8892, 4.0012, 3.3469, 3.8847, 4.0547, 1.4998, 3.7150], + device='cuda:2'), covar=tensor([0.0165, 0.0241, 0.0449, 0.0451, 0.0338, 0.0203, 0.3121, 0.0268], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0118, 0.0114, 0.0097, 0.0153, 0.0103, 0.0146, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:05:08,059 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22855.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:05:22,139 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22871.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:05:29,639 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22880.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:05:35,019 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=22886.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:05:35,727 INFO [train.py:873] (2/4) Epoch 4, batch 200, loss[loss=0.2081, simple_loss=0.2195, pruned_loss=0.09837, over 14520.00 frames. ], tot_loss[loss=0.2057, simple_loss=0.2099, pruned_loss=0.1008, over 1316236.63 frames. ], batch size: 34, lr: 2.01e-02, grad_scale: 8.0 +2022-12-07 10:05:40,978 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=22893.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:05:47,970 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0669, 1.9816, 1.7137, 1.7473, 2.0171, 1.8388, 2.0159, 2.0253], + device='cuda:2'), covar=tensor([0.0780, 0.1029, 0.2255, 0.2584, 0.0939, 0.0910, 0.1438, 0.0858], + device='cuda:2'), in_proj_covar=tensor([0.0233, 0.0198, 0.0285, 0.0369, 0.0226, 0.0262, 0.0271, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 10:06:00,925 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.239e+02 2.534e+02 3.313e+02 4.252e+02 7.669e+02, threshold=6.626e+02, percent-clipped=3.0 +2022-12-07 10:06:03,423 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22919.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:06:14,325 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0358, 1.5043, 1.5236, 1.5710, 1.4104, 1.5528, 1.2974, 0.7581], + device='cuda:2'), covar=tensor([0.1806, 0.1458, 0.0807, 0.0291, 0.0703, 0.0403, 0.1047, 0.1615], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0062, 0.0053, 0.0053, 0.0069, 0.0056, 0.0078, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:06:21,994 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=22941.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:06:36,539 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-12-07 10:07:01,609 INFO [train.py:873] (2/4) Epoch 4, batch 300, loss[loss=0.1421, simple_loss=0.1303, pruned_loss=0.07693, over 1244.00 frames. ], tot_loss[loss=0.2041, simple_loss=0.2089, pruned_loss=0.09964, over 1588962.50 frames. ], batch size: 100, lr: 2.01e-02, grad_scale: 8.0 +2022-12-07 10:07:05,775 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-07 10:07:08,467 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=22995.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 10:07:18,361 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8178, 1.6996, 1.8468, 1.8204, 1.4939, 1.8101, 1.7723, 0.8393], + device='cuda:2'), covar=tensor([0.3591, 0.1655, 0.1018, 0.0860, 0.1261, 0.0724, 0.1666, 0.3879], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0062, 0.0053, 0.0053, 0.0070, 0.0056, 0.0079, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:07:26,197 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.872e+01 2.724e+02 3.752e+02 4.773e+02 9.337e+02, threshold=7.504e+02, percent-clipped=9.0 +2022-12-07 10:07:32,124 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.18 vs. limit=2.0 +2022-12-07 10:07:32,566 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3020, 3.9464, 3.8053, 4.3334, 4.1293, 3.8662, 4.3385, 3.7079], + device='cuda:2'), covar=tensor([0.0385, 0.0897, 0.0323, 0.0403, 0.0691, 0.0603, 0.0530, 0.0480], + device='cuda:2'), in_proj_covar=tensor([0.0117, 0.0189, 0.0127, 0.0119, 0.0123, 0.0101, 0.0180, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 10:07:48,201 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23041.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:08:14,976 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8853, 1.2853, 2.4556, 2.3069, 2.5107, 2.4011, 1.9821, 2.5602], + device='cuda:2'), covar=tensor([0.0493, 0.0783, 0.0097, 0.0161, 0.0143, 0.0103, 0.0248, 0.0097], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0145, 0.0087, 0.0118, 0.0100, 0.0104, 0.0077, 0.0083], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 10:08:27,272 INFO [train.py:873] (2/4) Epoch 4, batch 400, loss[loss=0.2148, simple_loss=0.2115, pruned_loss=0.109, over 8604.00 frames. ], tot_loss[loss=0.2026, simple_loss=0.2077, pruned_loss=0.09878, over 1676834.76 frames. ], batch size: 100, lr: 2.00e-02, grad_scale: 8.0 +2022-12-07 10:08:29,398 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23089.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:08:46,296 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=23108.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:08:52,657 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.082e+02 2.809e+02 3.789e+02 4.581e+02 1.009e+03, threshold=7.577e+02, percent-clipped=4.0 +2022-12-07 10:09:22,135 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23150.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:09:38,656 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=23169.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:09:48,778 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23181.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:09:54,470 INFO [train.py:873] (2/4) Epoch 4, batch 500, loss[loss=0.2063, simple_loss=0.1864, pruned_loss=0.1131, over 3895.00 frames. ], tot_loss[loss=0.202, simple_loss=0.2071, pruned_loss=0.09841, over 1785605.09 frames. ], batch size: 100, lr: 2.00e-02, grad_scale: 8.0 +2022-12-07 10:10:19,852 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.033e+01 2.776e+02 3.736e+02 4.463e+02 9.726e+02, threshold=7.473e+02, percent-clipped=4.0 +2022-12-07 10:10:20,801 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=23217.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:10:31,953 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=23229.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:11:13,559 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=23278.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:11:16,051 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7698, 0.6763, 0.7769, 0.6824, 0.5175, 0.2924, 0.5774, 0.4949], + device='cuda:2'), covar=tensor([0.0134, 0.0047, 0.0087, 0.0068, 0.0209, 0.0285, 0.0132, 0.0328], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0010, 0.0012, 0.0011, 0.0012, 0.0014, 0.0013, 0.0017], + device='cuda:2'), out_proj_covar=tensor([3.9848e-05, 3.7540e-05, 4.4798e-05, 3.8779e-05, 4.1367e-05, 4.4883e-05, + 5.6720e-05, 5.6993e-05], device='cuda:2') +2022-12-07 10:11:18,163 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-07 10:11:20,902 INFO [train.py:873] (2/4) Epoch 4, batch 600, loss[loss=0.1886, simple_loss=0.2093, pruned_loss=0.08393, over 14568.00 frames. ], tot_loss[loss=0.2007, simple_loss=0.2063, pruned_loss=0.09757, over 1850891.76 frames. ], batch size: 22, lr: 2.00e-02, grad_scale: 8.0 +2022-12-07 10:11:23,610 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=23290.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:11:27,795 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23295.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:11:45,413 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.072e+02 2.720e+02 3.786e+02 4.640e+02 1.060e+03, threshold=7.571e+02, percent-clipped=5.0 +2022-12-07 10:12:08,388 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23343.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:12:10,597 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-12-07 10:12:43,487 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-12-07 10:12:46,509 INFO [train.py:873] (2/4) Epoch 4, batch 700, loss[loss=0.2475, simple_loss=0.2036, pruned_loss=0.1457, over 1224.00 frames. ], tot_loss[loss=0.2004, simple_loss=0.2061, pruned_loss=0.09737, over 1911344.18 frames. ], batch size: 100, lr: 1.99e-02, grad_scale: 8.0 +2022-12-07 10:13:11,266 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.426e+02 2.855e+02 3.544e+02 4.274e+02 7.481e+02, threshold=7.089e+02, percent-clipped=0.0 +2022-12-07 10:13:21,014 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6716, 3.4868, 3.8100, 3.2919, 3.6794, 3.5940, 1.3785, 3.5782], + device='cuda:2'), covar=tensor([0.0205, 0.0272, 0.0347, 0.0422, 0.0255, 0.0336, 0.2785, 0.0210], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0121, 0.0114, 0.0096, 0.0152, 0.0105, 0.0145, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:13:40,585 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23450.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:13:46,614 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-12-07 10:13:52,685 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23464.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:14:07,434 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23481.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:14:12,390 INFO [train.py:873] (2/4) Epoch 4, batch 800, loss[loss=0.1735, simple_loss=0.1973, pruned_loss=0.07485, over 14281.00 frames. ], tot_loss[loss=0.2002, simple_loss=0.2054, pruned_loss=0.09754, over 1958863.81 frames. ], batch size: 31, lr: 1.99e-02, grad_scale: 8.0 +2022-12-07 10:14:13,341 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=23488.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:14:22,043 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23498.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:14:37,485 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.307e+02 3.074e+02 3.598e+02 4.736e+02 1.088e+03, threshold=7.196e+02, percent-clipped=3.0 +2022-12-07 10:14:48,567 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23529.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:14:49,036 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.37 vs. limit=5.0 +2022-12-07 10:14:58,828 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5550, 2.7988, 3.4025, 2.5312, 2.2632, 2.4345, 1.1553, 2.7171], + device='cuda:2'), covar=tensor([0.1842, 0.1067, 0.0683, 0.1473, 0.1800, 0.1907, 0.6650, 0.1464], + device='cuda:2'), in_proj_covar=tensor([0.0068, 0.0074, 0.0070, 0.0083, 0.0093, 0.0069, 0.0146, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 10:15:02,822 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-12-07 10:15:05,985 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=23549.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 10:15:26,569 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23573.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:15:36,755 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23585.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:15:38,303 INFO [train.py:873] (2/4) Epoch 4, batch 900, loss[loss=0.2055, simple_loss=0.2038, pruned_loss=0.1036, over 14369.00 frames. ], tot_loss[loss=0.2005, simple_loss=0.206, pruned_loss=0.09754, over 2074696.31 frames. ], batch size: 73, lr: 1.98e-02, grad_scale: 8.0 +2022-12-07 10:16:00,323 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1583, 4.8979, 4.4366, 4.6731, 4.6433, 4.9887, 5.1256, 5.1689], + device='cuda:2'), covar=tensor([0.0726, 0.0401, 0.1609, 0.2315, 0.0633, 0.0514, 0.0715, 0.0534], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0199, 0.0302, 0.0389, 0.0228, 0.0269, 0.0274, 0.0227], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0005, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 10:16:03,091 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.303e+02 2.960e+02 3.909e+02 5.159e+02 1.247e+03, threshold=7.817e+02, percent-clipped=4.0 +2022-12-07 10:16:38,009 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-12-07 10:17:03,224 INFO [train.py:873] (2/4) Epoch 4, batch 1000, loss[loss=0.2756, simple_loss=0.2151, pruned_loss=0.1681, over 1217.00 frames. ], tot_loss[loss=0.2, simple_loss=0.2057, pruned_loss=0.09714, over 2040931.02 frames. ], batch size: 100, lr: 1.98e-02, grad_scale: 8.0 +2022-12-07 10:17:28,304 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.149e+02 2.746e+02 3.575e+02 4.376e+02 8.279e+02, threshold=7.149e+02, percent-clipped=2.0 +2022-12-07 10:18:05,368 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9002, 3.4217, 2.5561, 4.0975, 3.8387, 3.7899, 3.3639, 2.4950], + device='cuda:2'), covar=tensor([0.0274, 0.0826, 0.3280, 0.0177, 0.0305, 0.0801, 0.0637, 0.3468], + device='cuda:2'), in_proj_covar=tensor([0.0222, 0.0285, 0.0325, 0.0184, 0.0230, 0.0219, 0.0257, 0.0324], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 10:18:09,496 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23764.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:18:25,208 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0646, 3.7674, 3.7846, 4.1204, 3.9613, 3.5997, 4.1397, 3.6069], + device='cuda:2'), covar=tensor([0.0426, 0.0765, 0.0339, 0.0374, 0.0539, 0.0852, 0.0437, 0.0462], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0187, 0.0124, 0.0116, 0.0122, 0.0103, 0.0182, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 10:18:28,811 INFO [train.py:873] (2/4) Epoch 4, batch 1100, loss[loss=0.1776, simple_loss=0.2029, pruned_loss=0.07611, over 13904.00 frames. ], tot_loss[loss=0.2011, simple_loss=0.206, pruned_loss=0.09807, over 2011434.57 frames. ], batch size: 23, lr: 1.98e-02, grad_scale: 8.0 +2022-12-07 10:18:50,534 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23812.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:18:53,786 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.712e+02 2.927e+02 4.016e+02 5.307e+02 1.051e+03, threshold=8.031e+02, percent-clipped=8.0 +2022-12-07 10:19:18,375 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=23844.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 10:19:18,761 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-07 10:19:32,882 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5711, 4.3678, 4.2079, 4.7219, 4.2287, 3.7044, 4.7328, 4.5809], + device='cuda:2'), covar=tensor([0.0675, 0.0536, 0.0732, 0.0511, 0.0617, 0.0484, 0.0491, 0.0635], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0088, 0.0110, 0.0105, 0.0116, 0.0080, 0.0116, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 10:19:39,116 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8839, 1.0379, 0.9586, 0.6666, 0.5467, 0.6711, 0.7661, 0.4765], + device='cuda:2'), covar=tensor([0.0280, 0.0239, 0.0226, 0.0155, 0.0677, 0.0296, 0.0247, 0.0750], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0012, 0.0010, 0.0011, 0.0014, 0.0013, 0.0016], + device='cuda:2'), out_proj_covar=tensor([4.1182e-05, 3.9025e-05, 4.5823e-05, 3.7119e-05, 4.0837e-05, 4.8379e-05, + 5.6052e-05, 5.7167e-05], device='cuda:2') +2022-12-07 10:19:43,535 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23873.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:19:53,954 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=23885.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:19:55,485 INFO [train.py:873] (2/4) Epoch 4, batch 1200, loss[loss=0.1882, simple_loss=0.1557, pruned_loss=0.1104, over 1237.00 frames. ], tot_loss[loss=0.2019, simple_loss=0.2065, pruned_loss=0.0986, over 2009597.59 frames. ], batch size: 100, lr: 1.97e-02, grad_scale: 8.0 +2022-12-07 10:20:21,446 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.594e+02 2.900e+02 3.609e+02 4.734e+02 8.045e+02, threshold=7.219e+02, percent-clipped=1.0 +2022-12-07 10:20:24,888 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23921.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:20:34,957 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=23933.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:20:53,566 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.36 vs. limit=2.0 +2022-12-07 10:21:20,672 INFO [train.py:873] (2/4) Epoch 4, batch 1300, loss[loss=0.2317, simple_loss=0.1958, pruned_loss=0.1338, over 1290.00 frames. ], tot_loss[loss=0.2005, simple_loss=0.2062, pruned_loss=0.09745, over 2005490.12 frames. ], batch size: 100, lr: 1.97e-02, grad_scale: 4.0 +2022-12-07 10:21:46,582 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.354e+02 2.737e+02 3.421e+02 4.208e+02 7.387e+02, threshold=6.842e+02, percent-clipped=1.0 +2022-12-07 10:21:57,418 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2719, 3.6293, 3.0643, 4.5531, 4.0912, 4.3500, 3.4548, 2.9089], + device='cuda:2'), covar=tensor([0.0447, 0.1000, 0.3275, 0.0190, 0.0534, 0.1193, 0.1036, 0.4304], + device='cuda:2'), in_proj_covar=tensor([0.0228, 0.0298, 0.0333, 0.0189, 0.0240, 0.0224, 0.0266, 0.0332], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:22:04,277 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24037.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:22:13,388 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8909, 1.2319, 1.1867, 1.1535, 0.9486, 1.2164, 1.1984, 0.7382], + device='cuda:2'), covar=tensor([0.2686, 0.0700, 0.0525, 0.0339, 0.1028, 0.0342, 0.1440, 0.1165], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0059, 0.0053, 0.0051, 0.0068, 0.0055, 0.0078, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:22:46,182 INFO [train.py:873] (2/4) Epoch 4, batch 1400, loss[loss=0.2526, simple_loss=0.2033, pruned_loss=0.151, over 1258.00 frames. ], tot_loss[loss=0.1996, simple_loss=0.2055, pruned_loss=0.09683, over 1961178.61 frames. ], batch size: 100, lr: 1.96e-02, grad_scale: 4.0 +2022-12-07 10:22:48,314 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.19 vs. limit=2.0 +2022-12-07 10:22:48,762 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1365, 1.3358, 3.2021, 1.3863, 2.9365, 3.0983, 2.2728, 3.2702], + device='cuda:2'), covar=tensor([0.0206, 0.2448, 0.0252, 0.2071, 0.0861, 0.0330, 0.0702, 0.0199], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0152, 0.0121, 0.0164, 0.0134, 0.0130, 0.0111, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:22:55,455 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24098.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:23:02,425 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5277, 2.4766, 3.4448, 2.7798, 3.3673, 3.3373, 3.2040, 2.6965], + device='cuda:2'), covar=tensor([0.0121, 0.1363, 0.0308, 0.0874, 0.0275, 0.0222, 0.0596, 0.0995], + device='cuda:2'), in_proj_covar=tensor([0.0235, 0.0348, 0.0260, 0.0306, 0.0290, 0.0245, 0.0279, 0.0362], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 10:23:12,193 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.457e+02 2.769e+02 3.319e+02 4.343e+02 7.993e+02, threshold=6.638e+02, percent-clipped=1.0 +2022-12-07 10:23:35,027 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=24144.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 10:23:55,206 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2350, 2.2208, 2.1005, 1.3803, 1.9439, 1.9787, 2.3854, 1.9456], + device='cuda:2'), covar=tensor([0.0317, 0.1716, 0.0870, 0.2466, 0.0690, 0.0373, 0.0312, 0.1198], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0228, 0.0104, 0.0130, 0.0098, 0.0089, 0.0078, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0006, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:24:11,899 INFO [train.py:873] (2/4) Epoch 4, batch 1500, loss[loss=0.1876, simple_loss=0.1689, pruned_loss=0.1032, over 2650.00 frames. ], tot_loss[loss=0.1984, simple_loss=0.2046, pruned_loss=0.09609, over 1981917.75 frames. ], batch size: 100, lr: 1.96e-02, grad_scale: 4.0 +2022-12-07 10:24:16,769 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=24192.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:24:28,489 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9342, 2.7478, 2.4070, 2.4940, 2.8179, 2.8098, 2.9029, 2.8595], + device='cuda:2'), covar=tensor([0.0855, 0.0965, 0.2008, 0.3249, 0.0851, 0.0852, 0.1195, 0.0977], + device='cuda:2'), in_proj_covar=tensor([0.0240, 0.0210, 0.0305, 0.0390, 0.0238, 0.0277, 0.0281, 0.0228], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 10:24:38,553 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.592e+02 2.634e+02 3.340e+02 4.470e+02 8.988e+02, threshold=6.680e+02, percent-clipped=4.0 +2022-12-07 10:25:02,521 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0546, 2.3779, 4.0862, 4.2627, 4.2865, 2.3498, 4.1594, 3.1218], + device='cuda:2'), covar=tensor([0.0074, 0.0239, 0.0223, 0.0105, 0.0041, 0.0388, 0.0036, 0.0291], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0170, 0.0232, 0.0191, 0.0157, 0.0215, 0.0127, 0.0208], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 10:25:36,184 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4560, 3.1690, 2.4808, 3.5737, 3.3348, 3.4847, 3.0586, 2.4584], + device='cuda:2'), covar=tensor([0.0295, 0.0993, 0.3254, 0.0220, 0.0422, 0.0732, 0.0908, 0.4031], + device='cuda:2'), in_proj_covar=tensor([0.0223, 0.0297, 0.0333, 0.0187, 0.0241, 0.0227, 0.0265, 0.0325], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:25:39,351 INFO [train.py:873] (2/4) Epoch 4, batch 1600, loss[loss=0.1658, simple_loss=0.1625, pruned_loss=0.08452, over 3865.00 frames. ], tot_loss[loss=0.198, simple_loss=0.2044, pruned_loss=0.09586, over 2002660.73 frames. ], batch size: 100, lr: 1.96e-02, grad_scale: 8.0 +2022-12-07 10:25:41,255 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2854, 4.1360, 4.4528, 3.8252, 4.2734, 4.5402, 1.6237, 4.1560], + device='cuda:2'), covar=tensor([0.0160, 0.0242, 0.0353, 0.0325, 0.0224, 0.0136, 0.2755, 0.0190], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0124, 0.0114, 0.0096, 0.0153, 0.0107, 0.0144, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:26:05,382 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.425e+02 2.687e+02 3.404e+02 4.412e+02 1.604e+03, threshold=6.807e+02, percent-clipped=2.0 +2022-12-07 10:26:49,055 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4671, 2.2828, 2.0179, 1.9867, 1.6951, 2.4245, 2.0906, 0.8053], + device='cuda:2'), covar=tensor([0.2629, 0.1930, 0.2871, 0.1647, 0.1531, 0.0824, 0.1408, 0.4171], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0059, 0.0053, 0.0052, 0.0069, 0.0054, 0.0077, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:26:52,394 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.9440, 5.3105, 5.3910, 5.9769, 5.6836, 4.7689, 5.9290, 5.1675], + device='cuda:2'), covar=tensor([0.0300, 0.1099, 0.0290, 0.0310, 0.0530, 0.0294, 0.0393, 0.0372], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0189, 0.0126, 0.0117, 0.0124, 0.0103, 0.0180, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 10:27:05,337 INFO [train.py:873] (2/4) Epoch 4, batch 1700, loss[loss=0.175, simple_loss=0.1923, pruned_loss=0.07884, over 14131.00 frames. ], tot_loss[loss=0.1975, simple_loss=0.2042, pruned_loss=0.0954, over 2019910.63 frames. ], batch size: 99, lr: 1.95e-02, grad_scale: 8.0 +2022-12-07 10:27:10,486 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=24393.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:27:31,011 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.561e+02 2.793e+02 3.554e+02 4.451e+02 1.467e+03, threshold=7.108e+02, percent-clipped=5.0 +2022-12-07 10:27:32,897 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1325, 1.8249, 2.3804, 1.6332, 1.7490, 2.2951, 1.2451, 2.1278], + device='cuda:2'), covar=tensor([0.0673, 0.1398, 0.0619, 0.2634, 0.1965, 0.0537, 0.4470, 0.0789], + device='cuda:2'), in_proj_covar=tensor([0.0065, 0.0074, 0.0067, 0.0081, 0.0094, 0.0063, 0.0139, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 10:27:35,351 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-12-07 10:28:31,821 INFO [train.py:873] (2/4) Epoch 4, batch 1800, loss[loss=0.1424, simple_loss=0.1575, pruned_loss=0.0637, over 10776.00 frames. ], tot_loss[loss=0.1985, simple_loss=0.2048, pruned_loss=0.09616, over 1972543.05 frames. ], batch size: 13, lr: 1.95e-02, grad_scale: 8.0 +2022-12-07 10:28:36,748 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.68 vs. limit=5.0 +2022-12-07 10:28:41,921 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-12-07 10:28:57,764 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.360e+02 2.645e+02 3.516e+02 4.741e+02 1.240e+03, threshold=7.031e+02, percent-clipped=7.0 +2022-12-07 10:29:23,753 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24547.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:29:43,025 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24569.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:29:58,459 INFO [train.py:873] (2/4) Epoch 4, batch 1900, loss[loss=0.2444, simple_loss=0.2005, pruned_loss=0.1441, over 1340.00 frames. ], tot_loss[loss=0.1975, simple_loss=0.204, pruned_loss=0.09554, over 1978901.47 frames. ], batch size: 100, lr: 1.94e-02, grad_scale: 8.0 +2022-12-07 10:30:17,293 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7911, 2.7388, 2.7371, 2.7129, 2.7472, 2.6664, 1.2764, 2.6000], + device='cuda:2'), covar=tensor([0.0240, 0.0326, 0.0436, 0.0318, 0.0386, 0.0438, 0.2355, 0.0342], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0125, 0.0113, 0.0098, 0.0155, 0.0109, 0.0146, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:30:17,342 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24608.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:30:24,686 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.791e+02 2.734e+02 3.498e+02 4.101e+02 8.612e+02, threshold=6.995e+02, percent-clipped=1.0 +2022-12-07 10:30:36,110 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24630.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:30:57,827 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.97 vs. limit=5.0 +2022-12-07 10:31:07,140 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.31 vs. limit=5.0 +2022-12-07 10:31:12,165 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3894, 2.3206, 2.2259, 1.3653, 1.9375, 2.1403, 2.3405, 1.8705], + device='cuda:2'), covar=tensor([0.0404, 0.2272, 0.1205, 0.3313, 0.0862, 0.0511, 0.0616, 0.1769], + device='cuda:2'), in_proj_covar=tensor([0.0089, 0.0231, 0.0110, 0.0136, 0.0101, 0.0094, 0.0082, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0006, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 10:31:19,552 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.73 vs. limit=5.0 +2022-12-07 10:31:24,624 INFO [train.py:873] (2/4) Epoch 4, batch 2000, loss[loss=0.1918, simple_loss=0.2075, pruned_loss=0.08806, over 13916.00 frames. ], tot_loss[loss=0.1979, simple_loss=0.2041, pruned_loss=0.09586, over 1968723.98 frames. ], batch size: 23, lr: 1.94e-02, grad_scale: 8.0 +2022-12-07 10:31:29,952 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=24693.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:31:50,652 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.066e+02 2.930e+02 3.788e+02 4.800e+02 1.678e+03, threshold=7.576e+02, percent-clipped=5.0 +2022-12-07 10:32:11,156 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=24741.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:32:32,581 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24765.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:32:51,149 INFO [train.py:873] (2/4) Epoch 4, batch 2100, loss[loss=0.1843, simple_loss=0.2001, pruned_loss=0.08423, over 14528.00 frames. ], tot_loss[loss=0.1965, simple_loss=0.203, pruned_loss=0.09501, over 1983887.96 frames. ], batch size: 49, lr: 1.94e-02, grad_scale: 8.0 +2022-12-07 10:33:17,441 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.596e+02 2.608e+02 3.300e+02 4.069e+02 1.028e+03, threshold=6.599e+02, percent-clipped=1.0 +2022-12-07 10:33:25,463 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24826.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:33:41,953 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=9.74 vs. limit=5.0 +2022-12-07 10:33:42,265 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3925, 2.3098, 3.3942, 2.3888, 2.2832, 2.5755, 1.3051, 2.9251], + device='cuda:2'), covar=tensor([0.1645, 0.1581, 0.0847, 0.1962, 0.1940, 0.1993, 0.6166, 0.1285], + device='cuda:2'), in_proj_covar=tensor([0.0065, 0.0072, 0.0065, 0.0079, 0.0090, 0.0063, 0.0137, 0.0070], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 10:33:44,852 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24848.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:33:44,898 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1832, 2.1138, 3.0832, 2.3097, 3.0397, 2.8898, 2.8400, 2.4308], + device='cuda:2'), covar=tensor([0.0203, 0.1947, 0.0286, 0.1085, 0.0303, 0.0430, 0.0683, 0.1761], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0360, 0.0266, 0.0308, 0.0292, 0.0254, 0.0288, 0.0373], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 10:34:13,189 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.21 vs. limit=2.0 +2022-12-07 10:34:18,570 INFO [train.py:873] (2/4) Epoch 4, batch 2200, loss[loss=0.1798, simple_loss=0.1885, pruned_loss=0.0855, over 13919.00 frames. ], tot_loss[loss=0.1984, simple_loss=0.2039, pruned_loss=0.09647, over 1930586.35 frames. ], batch size: 23, lr: 1.93e-02, grad_scale: 8.0 +2022-12-07 10:34:31,883 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=24903.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:34:37,061 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=24909.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:34:43,926 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.370e+02 2.748e+02 3.566e+02 4.781e+02 9.343e+02, threshold=7.131e+02, percent-clipped=9.0 +2022-12-07 10:34:50,878 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=24925.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:34:59,110 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.76 vs. limit=2.0 +2022-12-07 10:35:03,705 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.6037, 4.9708, 5.0082, 5.6533, 5.3220, 4.4477, 5.5397, 4.6735], + device='cuda:2'), covar=tensor([0.0306, 0.1269, 0.0246, 0.0315, 0.0801, 0.0357, 0.0498, 0.0475], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0190, 0.0128, 0.0119, 0.0129, 0.0105, 0.0187, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 10:35:09,766 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2770, 4.7653, 4.7061, 5.3128, 5.0122, 4.3854, 5.1837, 4.4438], + device='cuda:2'), covar=tensor([0.0250, 0.0995, 0.0262, 0.0291, 0.0673, 0.0362, 0.0533, 0.0444], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0188, 0.0127, 0.0118, 0.0128, 0.0104, 0.0186, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 10:35:11,111 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-07 10:35:44,685 INFO [train.py:873] (2/4) Epoch 4, batch 2300, loss[loss=0.225, simple_loss=0.1915, pruned_loss=0.1293, over 1247.00 frames. ], tot_loss[loss=0.1957, simple_loss=0.2028, pruned_loss=0.09426, over 1996666.79 frames. ], batch size: 100, lr: 1.93e-02, grad_scale: 8.0 +2022-12-07 10:35:51,908 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=24995.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:36:00,033 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4994, 1.7748, 2.5771, 2.6959, 2.3613, 1.8243, 2.7636, 2.1419], + device='cuda:2'), covar=tensor([0.0074, 0.0186, 0.0143, 0.0067, 0.0091, 0.0294, 0.0053, 0.0169], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0172, 0.0232, 0.0191, 0.0159, 0.0216, 0.0129, 0.0207], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 10:36:14,180 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.386e+02 2.734e+02 3.348e+02 4.422e+02 1.416e+03, threshold=6.696e+02, percent-clipped=4.0 +2022-12-07 10:36:48,380 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=25056.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 10:37:15,454 INFO [train.py:873] (2/4) Epoch 4, batch 2400, loss[loss=0.1732, simple_loss=0.1929, pruned_loss=0.07675, over 13971.00 frames. ], tot_loss[loss=0.1974, simple_loss=0.2042, pruned_loss=0.09536, over 2034612.29 frames. ], batch size: 19, lr: 1.93e-02, grad_scale: 8.0 +2022-12-07 10:37:41,017 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.389e+02 2.617e+02 3.623e+02 4.580e+02 1.018e+03, threshold=7.246e+02, percent-clipped=1.0 +2022-12-07 10:37:44,898 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=25121.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:38:11,398 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5673, 3.2952, 3.7023, 3.0964, 3.4855, 3.4080, 1.3772, 3.3072], + device='cuda:2'), covar=tensor([0.0188, 0.0323, 0.0320, 0.0495, 0.0306, 0.0354, 0.2874, 0.0270], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0124, 0.0114, 0.0098, 0.0156, 0.0109, 0.0146, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:38:41,251 INFO [train.py:873] (2/4) Epoch 4, batch 2500, loss[loss=0.1967, simple_loss=0.1993, pruned_loss=0.09708, over 14256.00 frames. ], tot_loss[loss=0.1961, simple_loss=0.2032, pruned_loss=0.09451, over 2022756.88 frames. ], batch size: 80, lr: 1.92e-02, grad_scale: 8.0 +2022-12-07 10:38:55,814 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25203.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:38:56,512 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=25204.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:39:07,387 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.756e+02 2.934e+02 3.822e+02 4.724e+02 8.736e+02, threshold=7.644e+02, percent-clipped=3.0 +2022-12-07 10:39:14,160 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25225.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:39:21,606 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0723, 1.6870, 4.5571, 4.2135, 4.2286, 4.6316, 4.3324, 4.6289], + device='cuda:2'), covar=tensor([0.1108, 0.1286, 0.0066, 0.0152, 0.0123, 0.0077, 0.0082, 0.0081], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0153, 0.0090, 0.0125, 0.0107, 0.0111, 0.0080, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 10:39:37,057 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25251.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:39:47,961 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6406, 1.5799, 2.8402, 1.4096, 2.9202, 2.7971, 2.0280, 3.0111], + device='cuda:2'), covar=tensor([0.0203, 0.1727, 0.0222, 0.1540, 0.0202, 0.0274, 0.0630, 0.0148], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0149, 0.0119, 0.0158, 0.0133, 0.0128, 0.0110, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:39:56,075 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25273.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:40:08,587 INFO [train.py:873] (2/4) Epoch 4, batch 2600, loss[loss=0.22, simple_loss=0.2158, pruned_loss=0.112, over 14315.00 frames. ], tot_loss[loss=0.196, simple_loss=0.2032, pruned_loss=0.09443, over 2044866.39 frames. ], batch size: 46, lr: 1.92e-02, grad_scale: 8.0 +2022-12-07 10:40:14,755 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8121, 1.4161, 0.9934, 1.5961, 1.1158, 0.7109, 1.1496, 1.1004], + device='cuda:2'), covar=tensor([0.0990, 0.1240, 0.1299, 0.0552, 0.0979, 0.0400, 0.0645, 0.1254], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0011, 0.0012, 0.0010, 0.0012, 0.0015, 0.0013, 0.0018], + device='cuda:2'), out_proj_covar=tensor([4.5466e-05, 4.3091e-05, 4.8804e-05, 3.9940e-05, 4.5974e-05, 5.3975e-05, + 5.8211e-05, 6.6522e-05], device='cuda:2') +2022-12-07 10:40:15,184 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-12-07 10:40:34,423 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.970e+01 2.618e+02 3.461e+02 4.480e+02 1.117e+03, threshold=6.922e+02, percent-clipped=3.0 +2022-12-07 10:40:51,733 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.30 vs. limit=5.0 +2022-12-07 10:41:03,004 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=25351.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 10:41:33,623 INFO [train.py:873] (2/4) Epoch 4, batch 2700, loss[loss=0.2127, simple_loss=0.2143, pruned_loss=0.1055, over 10318.00 frames. ], tot_loss[loss=0.1952, simple_loss=0.2025, pruned_loss=0.09392, over 2070622.52 frames. ], batch size: 100, lr: 1.92e-02, grad_scale: 8.0 +2022-12-07 10:41:38,898 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0 +2022-12-07 10:41:59,634 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.191e+02 2.867e+02 3.536e+02 4.473e+02 9.787e+02, threshold=7.071e+02, percent-clipped=5.0 +2022-12-07 10:42:03,235 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25421.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:42:09,177 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=25428.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 10:42:10,002 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8666, 1.2339, 2.5179, 2.3795, 2.5233, 2.4189, 1.8351, 2.4685], + device='cuda:2'), covar=tensor([0.0456, 0.0863, 0.0072, 0.0156, 0.0131, 0.0082, 0.0255, 0.0098], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0153, 0.0090, 0.0124, 0.0105, 0.0111, 0.0081, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 10:42:10,396 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.03 vs. limit=5.0 +2022-12-07 10:42:16,402 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7634, 1.2927, 3.5874, 3.4146, 3.4868, 3.4700, 2.8547, 3.5483], + device='cuda:2'), covar=tensor([0.1234, 0.1464, 0.0101, 0.0177, 0.0156, 0.0140, 0.0217, 0.0127], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0152, 0.0089, 0.0124, 0.0105, 0.0111, 0.0081, 0.0087], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 10:42:20,053 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=3.36 vs. limit=2.0 +2022-12-07 10:42:26,682 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-07 10:42:44,229 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25469.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:43:00,508 INFO [train.py:873] (2/4) Epoch 4, batch 2800, loss[loss=0.189, simple_loss=0.2055, pruned_loss=0.08628, over 14555.00 frames. ], tot_loss[loss=0.195, simple_loss=0.2023, pruned_loss=0.09382, over 1972721.24 frames. ], batch size: 22, lr: 1.91e-02, grad_scale: 8.0 +2022-12-07 10:43:02,418 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=25489.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 10:43:15,008 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25504.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:43:25,891 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 2.496e+02 3.123e+02 4.474e+02 9.387e+02, threshold=6.246e+02, percent-clipped=3.0 +2022-12-07 10:43:55,781 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25552.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:44:01,224 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.24 vs. limit=5.0 +2022-12-07 10:44:26,291 INFO [train.py:873] (2/4) Epoch 4, batch 2900, loss[loss=0.1849, simple_loss=0.1816, pruned_loss=0.09416, over 3869.00 frames. ], tot_loss[loss=0.195, simple_loss=0.2023, pruned_loss=0.09389, over 1957551.11 frames. ], batch size: 100, lr: 1.91e-02, grad_scale: 8.0 +2022-12-07 10:44:52,742 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.030e+02 2.734e+02 3.612e+02 4.564e+02 7.787e+02, threshold=7.225e+02, percent-clipped=1.0 +2022-12-07 10:45:19,263 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-12-07 10:45:22,067 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=25651.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 10:45:33,536 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3413, 3.4608, 2.6242, 2.2630, 2.9730, 3.1909, 3.1908, 2.4773], + device='cuda:2'), covar=tensor([0.0551, 0.2505, 0.1440, 0.3195, 0.0964, 0.0572, 0.1578, 0.1969], + device='cuda:2'), in_proj_covar=tensor([0.0091, 0.0227, 0.0109, 0.0134, 0.0101, 0.0094, 0.0082, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0006, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 10:45:53,685 INFO [train.py:873] (2/4) Epoch 4, batch 3000, loss[loss=0.2326, simple_loss=0.2243, pruned_loss=0.1205, over 14219.00 frames. ], tot_loss[loss=0.1976, simple_loss=0.2035, pruned_loss=0.09587, over 1916704.36 frames. ], batch size: 89, lr: 1.90e-02, grad_scale: 8.0 +2022-12-07 10:45:53,685 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 10:46:02,552 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7245, 2.6678, 2.4737, 1.9168, 2.4037, 2.8379, 2.9982, 2.4556], + device='cuda:2'), covar=tensor([0.0612, 0.1834, 0.1449, 0.3030, 0.0838, 0.0543, 0.0405, 0.1510], + device='cuda:2'), in_proj_covar=tensor([0.0090, 0.0227, 0.0110, 0.0133, 0.0102, 0.0093, 0.0081, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0006, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 10:46:03,274 INFO [train.py:905] (2/4) Epoch 4, validation: loss=0.1268, simple_loss=0.1698, pruned_loss=0.0419, over 857387.00 frames. +2022-12-07 10:46:03,275 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 10:46:14,147 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=25699.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:46:29,651 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.391e+02 2.570e+02 3.689e+02 4.619e+02 9.776e+02, threshold=7.378e+02, percent-clipped=3.0 +2022-12-07 10:47:23,842 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2490, 5.1334, 4.5296, 4.7830, 4.7141, 5.1638, 5.2436, 5.1754], + device='cuda:2'), covar=tensor([0.0798, 0.0368, 0.1596, 0.2071, 0.0727, 0.0479, 0.0813, 0.0820], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0213, 0.0313, 0.0393, 0.0241, 0.0278, 0.0293, 0.0235], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 10:47:28,433 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=25784.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 10:47:30,950 INFO [train.py:873] (2/4) Epoch 4, batch 3100, loss[loss=0.1649, simple_loss=0.1506, pruned_loss=0.08956, over 2629.00 frames. ], tot_loss[loss=0.1959, simple_loss=0.2025, pruned_loss=0.09462, over 1901777.78 frames. ], batch size: 100, lr: 1.90e-02, grad_scale: 8.0 +2022-12-07 10:47:57,004 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.131e+02 2.578e+02 3.545e+02 4.492e+02 7.913e+02, threshold=7.089e+02, percent-clipped=1.0 +2022-12-07 10:48:22,526 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-07 10:48:33,423 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=25858.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 10:48:37,076 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.02 vs. limit=2.0 +2022-12-07 10:48:57,813 INFO [train.py:873] (2/4) Epoch 4, batch 3200, loss[loss=0.1985, simple_loss=0.202, pruned_loss=0.09747, over 13913.00 frames. ], tot_loss[loss=0.1956, simple_loss=0.2023, pruned_loss=0.09447, over 1901457.64 frames. ], batch size: 26, lr: 1.90e-02, grad_scale: 8.0 +2022-12-07 10:49:24,095 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.295e+02 2.801e+02 3.510e+02 4.555e+02 1.491e+03, threshold=7.021e+02, percent-clipped=7.0 +2022-12-07 10:49:26,004 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=25919.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 10:49:37,844 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6426, 4.5371, 4.6715, 4.3367, 4.4574, 5.0273, 2.0253, 4.2700], + device='cuda:2'), covar=tensor([0.0187, 0.0244, 0.0443, 0.0312, 0.0297, 0.0106, 0.3003, 0.0287], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0126, 0.0114, 0.0098, 0.0153, 0.0107, 0.0147, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:49:43,105 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0876, 1.4177, 3.2484, 1.3627, 3.0163, 3.2460, 2.1458, 3.3121], + device='cuda:2'), covar=tensor([0.0169, 0.2290, 0.0204, 0.1942, 0.0647, 0.0251, 0.0722, 0.0143], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0153, 0.0123, 0.0162, 0.0136, 0.0130, 0.0112, 0.0109], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:50:25,175 INFO [train.py:873] (2/4) Epoch 4, batch 3300, loss[loss=0.1949, simple_loss=0.1948, pruned_loss=0.09746, over 4957.00 frames. ], tot_loss[loss=0.1943, simple_loss=0.2018, pruned_loss=0.09344, over 1954438.00 frames. ], batch size: 100, lr: 1.89e-02, grad_scale: 8.0 +2022-12-07 10:50:41,666 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9968, 1.6386, 4.6285, 1.9758, 4.3943, 4.5865, 4.4560, 5.2596], + device='cuda:2'), covar=tensor([0.0131, 0.2557, 0.0244, 0.2130, 0.0228, 0.0189, 0.0157, 0.0098], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0152, 0.0122, 0.0160, 0.0135, 0.0130, 0.0110, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:50:51,957 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.542e+02 2.499e+02 3.235e+02 4.436e+02 1.031e+03, threshold=6.470e+02, percent-clipped=3.0 +2022-12-07 10:51:24,529 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7069, 3.5359, 3.2427, 3.2856, 3.6250, 3.5724, 3.6900, 3.6741], + device='cuda:2'), covar=tensor([0.0986, 0.0671, 0.1670, 0.2540, 0.0670, 0.0674, 0.0982, 0.0896], + device='cuda:2'), in_proj_covar=tensor([0.0247, 0.0212, 0.0317, 0.0391, 0.0239, 0.0280, 0.0292, 0.0231], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 10:51:48,813 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26084.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 10:51:49,203 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-12-07 10:51:51,018 INFO [train.py:873] (2/4) Epoch 4, batch 3400, loss[loss=0.1553, simple_loss=0.1412, pruned_loss=0.08475, over 2649.00 frames. ], tot_loss[loss=0.1942, simple_loss=0.2017, pruned_loss=0.09329, over 1963033.39 frames. ], batch size: 100, lr: 1.89e-02, grad_scale: 8.0 +2022-12-07 10:52:04,494 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26102.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:52:18,305 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.312e+02 2.979e+02 3.585e+02 4.923e+02 1.213e+03, threshold=7.170e+02, percent-clipped=7.0 +2022-12-07 10:52:30,046 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26132.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 10:52:31,205 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9678, 2.8155, 2.5529, 2.5628, 2.8897, 2.7953, 2.9320, 2.8125], + device='cuda:2'), covar=tensor([0.0854, 0.0775, 0.1685, 0.2839, 0.0793, 0.0931, 0.1081, 0.1138], + device='cuda:2'), in_proj_covar=tensor([0.0247, 0.0211, 0.0316, 0.0390, 0.0239, 0.0282, 0.0294, 0.0233], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 10:52:50,631 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5376, 1.9500, 3.7091, 2.5052, 3.6238, 1.7428, 2.6984, 3.4006], + device='cuda:2'), covar=tensor([0.0396, 0.5039, 0.0368, 0.9463, 0.0236, 0.4386, 0.1387, 0.0267], + device='cuda:2'), in_proj_covar=tensor([0.0228, 0.0285, 0.0177, 0.0389, 0.0178, 0.0298, 0.0267, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:52:57,388 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26163.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:53:01,594 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26168.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:53:04,675 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4170, 0.9648, 1.3216, 0.6944, 1.0204, 1.0161, 1.2142, 1.2061], + device='cuda:2'), covar=tensor([0.0349, 0.1022, 0.0674, 0.0726, 0.0899, 0.0854, 0.0287, 0.0971], + device='cuda:2'), in_proj_covar=tensor([0.0093, 0.0222, 0.0108, 0.0131, 0.0098, 0.0090, 0.0081, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0006, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 10:53:18,123 INFO [train.py:873] (2/4) Epoch 4, batch 3500, loss[loss=0.1875, simple_loss=0.2007, pruned_loss=0.08713, over 14649.00 frames. ], tot_loss[loss=0.1928, simple_loss=0.2006, pruned_loss=0.09254, over 1949475.80 frames. ], batch size: 23, lr: 1.89e-02, grad_scale: 8.0 +2022-12-07 10:53:41,625 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26214.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 10:53:44,756 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.551e+02 2.647e+02 3.463e+02 4.115e+02 9.483e+02, threshold=6.926e+02, percent-clipped=4.0 +2022-12-07 10:53:54,557 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26229.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:54:12,084 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26250.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:54:27,627 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1361, 3.4702, 2.4905, 4.1517, 3.8266, 4.0193, 3.4001, 3.0014], + device='cuda:2'), covar=tensor([0.0344, 0.1027, 0.4254, 0.0369, 0.0487, 0.0703, 0.1017, 0.3414], + device='cuda:2'), in_proj_covar=tensor([0.0218, 0.0295, 0.0320, 0.0187, 0.0236, 0.0227, 0.0260, 0.0310], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 10:54:44,198 INFO [train.py:873] (2/4) Epoch 4, batch 3600, loss[loss=0.2122, simple_loss=0.2151, pruned_loss=0.1046, over 11977.00 frames. ], tot_loss[loss=0.1931, simple_loss=0.201, pruned_loss=0.09263, over 1924405.75 frames. ], batch size: 100, lr: 1.88e-02, grad_scale: 8.0 +2022-12-07 10:55:05,297 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26311.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:55:10,969 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.347e+02 2.617e+02 3.267e+02 4.043e+02 9.670e+02, threshold=6.534e+02, percent-clipped=3.0 +2022-12-07 10:56:10,704 INFO [train.py:873] (2/4) Epoch 4, batch 3700, loss[loss=0.1908, simple_loss=0.2033, pruned_loss=0.08914, over 14669.00 frames. ], tot_loss[loss=0.1938, simple_loss=0.2019, pruned_loss=0.09288, over 1978809.32 frames. ], batch size: 23, lr: 1.88e-02, grad_scale: 8.0 +2022-12-07 10:56:12,973 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-12-07 10:56:34,067 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26414.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:56:37,277 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.464e+02 2.639e+02 3.495e+02 4.493e+02 9.568e+02, threshold=6.990e+02, percent-clipped=8.0 +2022-12-07 10:57:01,198 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-12-07 10:57:11,873 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26458.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:57:26,652 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26475.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:57:36,082 INFO [train.py:873] (2/4) Epoch 4, batch 3800, loss[loss=0.1839, simple_loss=0.2036, pruned_loss=0.08213, over 14566.00 frames. ], tot_loss[loss=0.1942, simple_loss=0.2021, pruned_loss=0.0931, over 2027470.21 frames. ], batch size: 34, lr: 1.88e-02, grad_scale: 8.0 +2022-12-07 10:58:00,280 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26514.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 10:58:03,348 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.297e+02 2.752e+02 3.266e+02 4.284e+02 8.876e+02, threshold=6.533e+02, percent-clipped=2.0 +2022-12-07 10:58:08,569 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26524.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:58:41,458 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26562.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 10:59:03,303 INFO [train.py:873] (2/4) Epoch 4, batch 3900, loss[loss=0.1897, simple_loss=0.2069, pruned_loss=0.08624, over 14601.00 frames. ], tot_loss[loss=0.1935, simple_loss=0.2016, pruned_loss=0.09272, over 2009636.68 frames. ], batch size: 22, lr: 1.87e-02, grad_scale: 8.0 +2022-12-07 10:59:19,336 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26606.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 10:59:29,337 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.374e+02 2.794e+02 3.682e+02 4.681e+02 9.431e+02, threshold=7.364e+02, percent-clipped=3.0 +2022-12-07 10:59:39,131 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-12-07 10:59:48,921 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9972, 2.1393, 3.0828, 2.2807, 3.0295, 2.8876, 2.8487, 2.3827], + device='cuda:2'), covar=tensor([0.0215, 0.1631, 0.0333, 0.1068, 0.0387, 0.0367, 0.0533, 0.1401], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0353, 0.0288, 0.0316, 0.0304, 0.0261, 0.0304, 0.0370], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:00:26,114 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-12-07 11:00:28,849 INFO [train.py:873] (2/4) Epoch 4, batch 4000, loss[loss=0.2135, simple_loss=0.2172, pruned_loss=0.1049, over 14151.00 frames. ], tot_loss[loss=0.1935, simple_loss=0.2013, pruned_loss=0.09282, over 1988772.05 frames. ], batch size: 84, lr: 1.87e-02, grad_scale: 8.0 +2022-12-07 11:00:33,212 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0299, 0.5758, 0.8001, 1.0771, 1.0680, 0.5384, 1.0500, 0.9245], + device='cuda:2'), covar=tensor([0.0417, 0.0678, 0.0402, 0.0367, 0.0428, 0.0261, 0.0363, 0.0352], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0019, 0.0021, 0.0016, 0.0017, 0.0023, 0.0018, 0.0018], + device='cuda:2'), out_proj_covar=tensor([5.9150e-05, 6.4231e-05, 6.3257e-05, 5.7983e-05, 5.6165e-05, 7.4070e-05, + 6.6873e-05, 6.1689e-05], device='cuda:2') +2022-12-07 11:00:55,534 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.732e+02 2.728e+02 3.637e+02 4.905e+02 8.607e+02, threshold=7.274e+02, percent-clipped=2.0 +2022-12-07 11:00:55,676 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8767, 4.5141, 4.3053, 4.7927, 4.6920, 4.1359, 4.8914, 4.2289], + device='cuda:2'), covar=tensor([0.0302, 0.0865, 0.0293, 0.0412, 0.0580, 0.0534, 0.0440, 0.0382], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0203, 0.0131, 0.0125, 0.0132, 0.0111, 0.0196, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 11:01:29,194 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26758.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:01:39,482 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=26770.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:01:53,836 INFO [train.py:873] (2/4) Epoch 4, batch 4100, loss[loss=0.212, simple_loss=0.2155, pruned_loss=0.1043, over 14160.00 frames. ], tot_loss[loss=0.1927, simple_loss=0.2011, pruned_loss=0.09211, over 2025897.31 frames. ], batch size: 99, lr: 1.87e-02, grad_scale: 8.0 +2022-12-07 11:02:00,664 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7982, 1.3067, 3.6892, 3.4284, 3.6383, 3.6355, 2.9007, 3.6620], + device='cuda:2'), covar=tensor([0.1364, 0.1682, 0.0125, 0.0194, 0.0162, 0.0156, 0.0240, 0.0147], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0154, 0.0093, 0.0127, 0.0107, 0.0112, 0.0081, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:02:10,423 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26806.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:02:20,838 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.226e+02 2.498e+02 3.224e+02 4.280e+02 1.572e+03, threshold=6.447e+02, percent-clipped=4.0 +2022-12-07 11:02:26,132 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26824.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:02:30,041 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4402, 1.0297, 1.3193, 0.8113, 0.9874, 1.2093, 1.1780, 1.0815], + device='cuda:2'), covar=tensor([0.0264, 0.0631, 0.0461, 0.0544, 0.0677, 0.0510, 0.0234, 0.0722], + device='cuda:2'), in_proj_covar=tensor([0.0093, 0.0225, 0.0107, 0.0128, 0.0098, 0.0095, 0.0078, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0006, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:03:07,343 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26872.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:03:19,882 INFO [train.py:873] (2/4) Epoch 4, batch 4200, loss[loss=0.2178, simple_loss=0.2199, pruned_loss=0.1078, over 14291.00 frames. ], tot_loss[loss=0.1934, simple_loss=0.2017, pruned_loss=0.09255, over 2016997.92 frames. ], batch size: 66, lr: 1.86e-02, grad_scale: 8.0 +2022-12-07 11:03:22,834 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-12-07 11:03:34,669 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26904.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 11:03:36,728 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=26906.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:03:41,175 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-12-07 11:03:46,349 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.525e+02 2.657e+02 3.822e+02 4.895e+02 8.860e+02, threshold=7.643e+02, percent-clipped=9.0 +2022-12-07 11:04:13,585 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=26950.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:04:16,902 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=26954.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:04:26,322 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=26965.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 11:04:45,366 INFO [train.py:873] (2/4) Epoch 4, batch 4300, loss[loss=0.1803, simple_loss=0.1982, pruned_loss=0.08122, over 14442.00 frames. ], tot_loss[loss=0.1933, simple_loss=0.2014, pruned_loss=0.09257, over 1974489.69 frames. ], batch size: 51, lr: 1.86e-02, grad_scale: 8.0 +2022-12-07 11:05:05,645 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=27011.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:05:11,531 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.439e+02 2.870e+02 3.578e+02 4.388e+02 9.558e+02, threshold=7.157e+02, percent-clipped=3.0 +2022-12-07 11:05:57,152 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=27070.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:06:11,314 INFO [train.py:873] (2/4) Epoch 4, batch 4400, loss[loss=0.1891, simple_loss=0.1591, pruned_loss=0.1095, over 1223.00 frames. ], tot_loss[loss=0.1945, simple_loss=0.202, pruned_loss=0.09349, over 1950831.35 frames. ], batch size: 100, lr: 1.86e-02, grad_scale: 8.0 +2022-12-07 11:06:38,400 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.237e+02 2.559e+02 3.461e+02 4.606e+02 7.213e+02, threshold=6.922e+02, percent-clipped=1.0 +2022-12-07 11:06:38,491 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=27118.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:06:44,011 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.03 vs. limit=5.0 +2022-12-07 11:07:04,556 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=27148.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:07:16,429 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-07 11:07:30,904 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 11:07:37,302 INFO [train.py:873] (2/4) Epoch 4, batch 4500, loss[loss=0.1609, simple_loss=0.1491, pruned_loss=0.08637, over 1303.00 frames. ], tot_loss[loss=0.1938, simple_loss=0.2018, pruned_loss=0.0929, over 1968929.14 frames. ], batch size: 100, lr: 1.85e-02, grad_scale: 8.0 +2022-12-07 11:07:57,179 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=27209.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:08:04,791 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.722e+01 2.647e+02 3.341e+02 4.493e+02 1.287e+03, threshold=6.683e+02, percent-clipped=3.0 +2022-12-07 11:08:40,927 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=27260.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 11:09:04,081 INFO [train.py:873] (2/4) Epoch 4, batch 4600, loss[loss=0.1852, simple_loss=0.2038, pruned_loss=0.08331, over 14278.00 frames. ], tot_loss[loss=0.1936, simple_loss=0.202, pruned_loss=0.09263, over 1968770.47 frames. ], batch size: 76, lr: 1.85e-02, grad_scale: 8.0 +2022-12-07 11:09:20,188 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=27306.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:09:30,612 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.463e+02 3.048e+02 4.012e+02 4.961e+02 8.817e+02, threshold=8.024e+02, percent-clipped=5.0 +2022-12-07 11:09:50,092 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7380, 3.1221, 4.3250, 2.9903, 4.1518, 4.3164, 3.9238, 3.6555], + device='cuda:2'), covar=tensor([0.0137, 0.1627, 0.0322, 0.1164, 0.0445, 0.0273, 0.1106, 0.1280], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0338, 0.0291, 0.0307, 0.0306, 0.0255, 0.0294, 0.0361], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:09:55,361 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8164, 1.2695, 1.2947, 1.2273, 1.1762, 1.0004, 1.0248, 0.9053], + device='cuda:2'), covar=tensor([0.2504, 0.0519, 0.0326, 0.0490, 0.0857, 0.0178, 0.1912, 0.1692], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0057, 0.0048, 0.0050, 0.0067, 0.0052, 0.0079, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:10:14,866 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7470, 0.5910, 0.3959, 0.8100, 0.6841, 0.2367, 0.5552, 0.6520], + device='cuda:2'), covar=tensor([0.0093, 0.0056, 0.0067, 0.0121, 0.0080, 0.0063, 0.0519, 0.0140], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0017, 0.0020, 0.0016, 0.0018, 0.0023, 0.0017, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.9349e-05, 6.0064e-05, 6.2882e-05, 5.9189e-05, 5.8731e-05, 7.5131e-05, + 6.4837e-05, 5.9020e-05], device='cuda:2') +2022-12-07 11:10:22,776 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5924, 3.3301, 2.3540, 3.7861, 3.5851, 3.6636, 2.8757, 2.4781], + device='cuda:2'), covar=tensor([0.0682, 0.1038, 0.4341, 0.0427, 0.0711, 0.0930, 0.1330, 0.4261], + device='cuda:2'), in_proj_covar=tensor([0.0224, 0.0293, 0.0322, 0.0187, 0.0243, 0.0234, 0.0258, 0.0314], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 11:10:29,692 INFO [train.py:873] (2/4) Epoch 4, batch 4700, loss[loss=0.216, simple_loss=0.2188, pruned_loss=0.1066, over 14207.00 frames. ], tot_loss[loss=0.1943, simple_loss=0.2022, pruned_loss=0.09317, over 1981311.60 frames. ], batch size: 89, lr: 1.85e-02, grad_scale: 8.0 +2022-12-07 11:10:30,563 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0994, 3.7326, 3.7618, 4.1617, 3.8533, 3.3679, 4.1975, 4.1139], + device='cuda:2'), covar=tensor([0.0572, 0.0604, 0.0588, 0.0479, 0.0653, 0.0614, 0.0539, 0.0549], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0091, 0.0106, 0.0108, 0.0117, 0.0085, 0.0117, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:10:56,345 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.523e+02 2.544e+02 3.410e+02 4.417e+02 8.558e+02, threshold=6.819e+02, percent-clipped=1.0 +2022-12-07 11:11:18,085 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1839, 4.8446, 4.4281, 4.6374, 4.5445, 4.8622, 4.9941, 5.1372], + device='cuda:2'), covar=tensor([0.0672, 0.0539, 0.1992, 0.2560, 0.0719, 0.0610, 0.0944, 0.0672], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0219, 0.0316, 0.0404, 0.0238, 0.0290, 0.0297, 0.0235], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 11:11:29,385 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-12-07 11:11:56,555 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.95 vs. limit=2.0 +2022-12-07 11:11:56,835 INFO [train.py:873] (2/4) Epoch 4, batch 4800, loss[loss=0.2054, simple_loss=0.2107, pruned_loss=0.1001, over 14245.00 frames. ], tot_loss[loss=0.1924, simple_loss=0.2007, pruned_loss=0.09212, over 1913740.88 frames. ], batch size: 69, lr: 1.84e-02, grad_scale: 8.0 +2022-12-07 11:11:57,458 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-12-07 11:12:11,677 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=27504.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:12:24,599 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.288e+02 3.057e+02 3.716e+02 4.446e+02 1.010e+03, threshold=7.432e+02, percent-clipped=2.0 +2022-12-07 11:13:00,433 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=27560.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 11:13:00,681 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-12-07 11:13:06,501 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.08 vs. limit=5.0 +2022-12-07 11:13:23,629 INFO [train.py:873] (2/4) Epoch 4, batch 4900, loss[loss=0.1885, simple_loss=0.2056, pruned_loss=0.08565, over 14412.00 frames. ], tot_loss[loss=0.1916, simple_loss=0.2004, pruned_loss=0.09139, over 1971666.33 frames. ], batch size: 41, lr: 1.84e-02, grad_scale: 8.0 +2022-12-07 11:13:24,503 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1152, 4.7750, 4.6672, 5.2251, 4.8636, 4.1759, 5.2106, 5.0770], + device='cuda:2'), covar=tensor([0.0601, 0.0342, 0.0467, 0.0442, 0.0460, 0.0466, 0.0509, 0.0614], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0087, 0.0102, 0.0103, 0.0110, 0.0081, 0.0111, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:13:40,764 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=27606.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:13:42,438 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=27608.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 11:13:51,358 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.582e+02 2.749e+02 3.689e+02 4.632e+02 8.661e+02, threshold=7.378e+02, percent-clipped=1.0 +2022-12-07 11:14:21,727 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=27654.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:14:33,340 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-12-07 11:14:49,329 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.87 vs. limit=2.0 +2022-12-07 11:14:50,476 INFO [train.py:873] (2/4) Epoch 4, batch 5000, loss[loss=0.2079, simple_loss=0.2024, pruned_loss=0.1067, over 7742.00 frames. ], tot_loss[loss=0.19, simple_loss=0.1994, pruned_loss=0.09033, over 1965999.73 frames. ], batch size: 100, lr: 1.84e-02, grad_scale: 8.0 +2022-12-07 11:15:14,379 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=27714.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 11:15:18,160 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.210e+02 2.479e+02 3.081e+02 3.772e+02 7.331e+02, threshold=6.161e+02, percent-clipped=0.0 +2022-12-07 11:15:51,068 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1509, 0.6358, 0.4297, 0.8584, 0.9755, 0.5314, 0.7329, 0.8278], + device='cuda:2'), covar=tensor([0.0271, 0.0303, 0.0154, 0.0329, 0.0270, 0.0253, 0.0396, 0.0283], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0017, 0.0019, 0.0015, 0.0017, 0.0022, 0.0017, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.7267e-05, 5.9236e-05, 6.0039e-05, 5.6582e-05, 5.7615e-05, 7.4551e-05, + 6.4228e-05, 5.8924e-05], device='cuda:2') +2022-12-07 11:15:52,693 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0015, 1.9084, 1.8546, 2.1082, 1.7367, 1.9299, 2.0510, 2.0656], + device='cuda:2'), covar=tensor([0.0783, 0.0920, 0.0966, 0.0663, 0.1084, 0.0639, 0.0856, 0.0779], + device='cuda:2'), in_proj_covar=tensor([0.0105, 0.0087, 0.0103, 0.0102, 0.0110, 0.0080, 0.0112, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:16:06,285 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=27775.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 11:16:16,333 INFO [train.py:873] (2/4) Epoch 4, batch 5100, loss[loss=0.1953, simple_loss=0.2074, pruned_loss=0.0916, over 14273.00 frames. ], tot_loss[loss=0.1904, simple_loss=0.1997, pruned_loss=0.09059, over 1943887.40 frames. ], batch size: 57, lr: 1.83e-02, grad_scale: 8.0 +2022-12-07 11:16:31,606 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=27804.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:16:43,937 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.421e+02 2.535e+02 3.343e+02 4.225e+02 1.167e+03, threshold=6.685e+02, percent-clipped=6.0 +2022-12-07 11:17:12,651 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=27852.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:17:43,049 INFO [train.py:873] (2/4) Epoch 4, batch 5200, loss[loss=0.1702, simple_loss=0.1875, pruned_loss=0.07642, over 14411.00 frames. ], tot_loss[loss=0.1923, simple_loss=0.2008, pruned_loss=0.09186, over 1949382.71 frames. ], batch size: 53, lr: 1.83e-02, grad_scale: 8.0 +2022-12-07 11:17:53,299 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.21 vs. limit=2.0 +2022-12-07 11:18:10,408 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.340e+02 2.745e+02 3.569e+02 4.514e+02 6.730e+02, threshold=7.138e+02, percent-clipped=2.0 +2022-12-07 11:18:31,646 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1576, 1.9711, 2.7208, 1.9418, 2.0411, 2.1489, 1.1575, 2.1375], + device='cuda:2'), covar=tensor([0.0975, 0.1568, 0.0801, 0.2130, 0.1865, 0.0957, 0.5289, 0.1286], + device='cuda:2'), in_proj_covar=tensor([0.0068, 0.0073, 0.0068, 0.0080, 0.0094, 0.0064, 0.0136, 0.0071], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 11:19:08,660 INFO [train.py:873] (2/4) Epoch 4, batch 5300, loss[loss=0.2077, simple_loss=0.2138, pruned_loss=0.1008, over 14261.00 frames. ], tot_loss[loss=0.1916, simple_loss=0.2006, pruned_loss=0.09132, over 1946979.44 frames. ], batch size: 35, lr: 1.83e-02, grad_scale: 8.0 +2022-12-07 11:19:35,271 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-12-07 11:19:36,271 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.465e+02 2.605e+02 3.441e+02 4.511e+02 8.751e+02, threshold=6.883e+02, percent-clipped=1.0 +2022-12-07 11:20:20,522 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28070.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 11:20:35,860 INFO [train.py:873] (2/4) Epoch 4, batch 5400, loss[loss=0.2108, simple_loss=0.2161, pruned_loss=0.1028, over 14307.00 frames. ], tot_loss[loss=0.1918, simple_loss=0.2011, pruned_loss=0.09123, over 2031268.63 frames. ], batch size: 66, lr: 1.82e-02, grad_scale: 8.0 +2022-12-07 11:20:39,781 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.14 vs. limit=5.0 +2022-12-07 11:21:00,150 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-07 11:21:03,785 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.374e+02 2.371e+02 3.121e+02 4.059e+02 8.001e+02, threshold=6.242e+02, percent-clipped=5.0 +2022-12-07 11:22:02,365 INFO [train.py:873] (2/4) Epoch 4, batch 5500, loss[loss=0.1546, simple_loss=0.1806, pruned_loss=0.06432, over 14179.00 frames. ], tot_loss[loss=0.1903, simple_loss=0.2, pruned_loss=0.09026, over 2060214.27 frames. ], batch size: 35, lr: 1.82e-02, grad_scale: 8.0 +2022-12-07 11:22:24,505 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1974, 1.4099, 2.5232, 1.3774, 2.3560, 2.3675, 1.8240, 2.4772], + device='cuda:2'), covar=tensor([0.0256, 0.1728, 0.0207, 0.1520, 0.0285, 0.0375, 0.0752, 0.0214], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0155, 0.0126, 0.0165, 0.0141, 0.0133, 0.0113, 0.0114], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 11:22:28,703 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7537, 2.5614, 2.5360, 2.8828, 2.3267, 2.4295, 2.7722, 2.7677], + device='cuda:2'), covar=tensor([0.0831, 0.0937, 0.0948, 0.0656, 0.1197, 0.0816, 0.0864, 0.1026], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0089, 0.0108, 0.0106, 0.0115, 0.0085, 0.0118, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:22:30,324 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.463e+02 2.632e+02 3.347e+02 4.419e+02 1.188e+03, threshold=6.694e+02, percent-clipped=11.0 +2022-12-07 11:22:39,483 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28229.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:22:41,771 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.95 vs. limit=2.0 +2022-12-07 11:22:58,089 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 11:23:01,980 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28255.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 11:23:29,473 INFO [train.py:873] (2/4) Epoch 4, batch 5600, loss[loss=0.1687, simple_loss=0.1903, pruned_loss=0.07361, over 14307.00 frames. ], tot_loss[loss=0.19, simple_loss=0.1997, pruned_loss=0.09018, over 2020624.15 frames. ], batch size: 39, lr: 1.82e-02, grad_scale: 8.0 +2022-12-07 11:23:29,650 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28287.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:23:32,017 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28290.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:23:54,674 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28316.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 11:23:57,192 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.230e+02 2.669e+02 3.555e+02 4.575e+02 8.799e+02, threshold=7.110e+02, percent-clipped=5.0 +2022-12-07 11:24:01,321 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9916, 4.5731, 4.5574, 4.9485, 4.7994, 4.3987, 5.0477, 4.2844], + device='cuda:2'), covar=tensor([0.0221, 0.0683, 0.0224, 0.0370, 0.0508, 0.0383, 0.0346, 0.0364], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0195, 0.0129, 0.0123, 0.0131, 0.0105, 0.0187, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 11:24:21,892 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9335, 1.1853, 1.0240, 0.9145, 0.8930, 0.6662, 1.1481, 0.8797], + device='cuda:2'), covar=tensor([0.0797, 0.1550, 0.1472, 0.1996, 0.1481, 0.0543, 0.0307, 0.1298], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0011, 0.0010, 0.0012, 0.0015, 0.0012, 0.0017], + device='cuda:2'), out_proj_covar=tensor([4.9425e-05, 5.0302e-05, 5.0082e-05, 4.6838e-05, 5.1491e-05, 6.3585e-05, + 5.8729e-05, 6.9775e-05], device='cuda:2') +2022-12-07 11:24:21,897 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28348.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:24:41,027 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=28370.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 11:24:55,323 INFO [train.py:873] (2/4) Epoch 4, batch 5700, loss[loss=0.1922, simple_loss=0.2079, pruned_loss=0.08826, over 14263.00 frames. ], tot_loss[loss=0.1914, simple_loss=0.2005, pruned_loss=0.09116, over 1991625.93 frames. ], batch size: 63, lr: 1.81e-02, grad_scale: 8.0 +2022-12-07 11:25:22,829 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=28418.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 11:25:23,562 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.172e+02 2.835e+02 3.398e+02 4.455e+02 8.619e+02, threshold=6.796e+02, percent-clipped=2.0 +2022-12-07 11:25:46,219 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.67 vs. limit=5.0 +2022-12-07 11:26:16,609 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0829, 1.9150, 2.0983, 1.9793, 1.5614, 1.8742, 2.0020, 1.0109], + device='cuda:2'), covar=tensor([0.3435, 0.1154, 0.1400, 0.1121, 0.1518, 0.0798, 0.1881, 0.4276], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0055, 0.0051, 0.0048, 0.0069, 0.0052, 0.0078, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:26:22,895 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28486.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:26:23,532 INFO [train.py:873] (2/4) Epoch 4, batch 5800, loss[loss=0.2154, simple_loss=0.2094, pruned_loss=0.1107, over 9503.00 frames. ], tot_loss[loss=0.1917, simple_loss=0.2003, pruned_loss=0.09151, over 2014914.10 frames. ], batch size: 100, lr: 1.81e-02, grad_scale: 8.0 +2022-12-07 11:26:27,916 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28492.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:26:52,116 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.461e+02 2.705e+02 3.370e+02 4.199e+02 7.683e+02, threshold=6.740e+02, percent-clipped=3.0 +2022-12-07 11:27:16,514 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28547.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:27:21,716 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28553.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:27:49,620 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28585.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:27:51,188 INFO [train.py:873] (2/4) Epoch 4, batch 5900, loss[loss=0.1873, simple_loss=0.1879, pruned_loss=0.0934, over 6915.00 frames. ], tot_loss[loss=0.1911, simple_loss=0.1999, pruned_loss=0.09113, over 2000870.96 frames. ], batch size: 100, lr: 1.81e-02, grad_scale: 8.0 +2022-12-07 11:27:52,553 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.09 vs. limit=2.0 +2022-12-07 11:28:13,018 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28611.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 11:28:13,102 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9709, 0.8864, 0.7133, 0.6396, 0.6884, 0.7021, 0.8318, 0.5834], + device='cuda:2'), covar=tensor([0.0859, 0.0612, 0.1056, 0.0537, 0.1065, 0.0635, 0.0521, 0.1368], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0012, 0.0012, 0.0011, 0.0012, 0.0015, 0.0012, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.1700e-05, 5.1529e-05, 5.4365e-05, 4.9570e-05, 5.2630e-05, 6.5457e-05, + 6.1180e-05, 7.1150e-05], device='cuda:2') +2022-12-07 11:28:19,509 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.061e+02 2.929e+02 3.624e+02 4.382e+02 1.301e+03, threshold=7.248e+02, percent-clipped=4.0 +2022-12-07 11:28:39,038 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5341, 1.1812, 2.0282, 1.8345, 2.0314, 1.9293, 1.6761, 2.0481], + device='cuda:2'), covar=tensor([0.0419, 0.0718, 0.0099, 0.0238, 0.0183, 0.0144, 0.0209, 0.0129], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0151, 0.0096, 0.0127, 0.0109, 0.0115, 0.0080, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:28:40,764 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28643.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:29:19,378 INFO [train.py:873] (2/4) Epoch 4, batch 6000, loss[loss=0.1646, simple_loss=0.1887, pruned_loss=0.07025, over 14080.00 frames. ], tot_loss[loss=0.1902, simple_loss=0.1991, pruned_loss=0.09068, over 1957291.25 frames. ], batch size: 22, lr: 1.81e-02, grad_scale: 8.0 +2022-12-07 11:29:19,378 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 11:29:30,469 INFO [train.py:905] (2/4) Epoch 4, validation: loss=0.1258, simple_loss=0.1688, pruned_loss=0.04138, over 857387.00 frames. +2022-12-07 11:29:30,470 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 11:29:46,861 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 11:29:55,276 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7450, 2.0562, 2.6732, 2.2103, 2.8135, 2.5896, 2.5510, 2.3272], + device='cuda:2'), covar=tensor([0.0202, 0.1423, 0.0255, 0.0967, 0.0273, 0.0296, 0.0405, 0.1198], + device='cuda:2'), in_proj_covar=tensor([0.0259, 0.0349, 0.0319, 0.0315, 0.0320, 0.0270, 0.0311, 0.0365], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:29:58,467 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-12-07 11:29:58,582 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.235e+02 2.688e+02 3.368e+02 4.685e+02 1.249e+03, threshold=6.736e+02, percent-clipped=4.0 +2022-12-07 11:30:40,168 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7372, 1.4850, 1.8240, 1.0995, 1.3678, 1.7165, 1.6421, 1.5270], + device='cuda:2'), covar=tensor([0.0374, 0.0671, 0.0466, 0.0662, 0.0842, 0.0350, 0.0249, 0.0907], + device='cuda:2'), in_proj_covar=tensor([0.0092, 0.0213, 0.0107, 0.0125, 0.0096, 0.0093, 0.0079, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0006, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:30:58,674 INFO [train.py:873] (2/4) Epoch 4, batch 6100, loss[loss=0.1896, simple_loss=0.1819, pruned_loss=0.09866, over 3867.00 frames. ], tot_loss[loss=0.1911, simple_loss=0.1997, pruned_loss=0.09121, over 1959377.75 frames. ], batch size: 100, lr: 1.80e-02, grad_scale: 8.0 +2022-12-07 11:31:24,973 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3495, 1.4923, 2.5439, 1.9352, 2.3494, 1.5652, 1.9527, 2.1301], + device='cuda:2'), covar=tensor([0.1097, 0.4543, 0.0340, 0.4864, 0.0317, 0.3538, 0.1880, 0.0585], + device='cuda:2'), in_proj_covar=tensor([0.0227, 0.0274, 0.0168, 0.0369, 0.0168, 0.0280, 0.0258, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 11:31:27,268 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.60 vs. limit=5.0 +2022-12-07 11:31:27,398 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.104e+02 2.717e+02 3.239e+02 4.102e+02 7.823e+02, threshold=6.479e+02, percent-clipped=2.0 +2022-12-07 11:31:36,418 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.5821, 0.6512, 0.5788, 0.3141, 0.6936, 0.5237, 0.4607, 0.5260], + device='cuda:2'), covar=tensor([0.0094, 0.0091, 0.0084, 0.0062, 0.0223, 0.0290, 0.0145, 0.0429], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0012, 0.0013, 0.0012, 0.0013, 0.0016, 0.0012, 0.0018], + device='cuda:2'), out_proj_covar=tensor([5.3374e-05, 5.4036e-05, 5.6897e-05, 5.1316e-05, 5.5187e-05, 6.7083e-05, + 6.2129e-05, 7.3583e-05], device='cuda:2') +2022-12-07 11:31:45,069 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2053, 2.4724, 4.2059, 4.2955, 4.4772, 2.5406, 4.5003, 3.4936], + device='cuda:2'), covar=tensor([0.0089, 0.0273, 0.0303, 0.0121, 0.0057, 0.0420, 0.0039, 0.0260], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0176, 0.0251, 0.0204, 0.0164, 0.0226, 0.0145, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 11:31:47,360 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28842.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:31:52,826 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=28848.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:32:04,570 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=28861.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:32:15,543 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.85 vs. limit=2.0 +2022-12-07 11:32:25,341 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=28885.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:32:27,172 INFO [train.py:873] (2/4) Epoch 4, batch 6200, loss[loss=0.1482, simple_loss=0.1422, pruned_loss=0.07708, over 1228.00 frames. ], tot_loss[loss=0.1934, simple_loss=0.2008, pruned_loss=0.09299, over 1904796.80 frames. ], batch size: 100, lr: 1.80e-02, grad_scale: 8.0 +2022-12-07 11:32:48,580 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=28911.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 11:32:55,226 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.253e+02 2.759e+02 3.767e+02 5.172e+02 1.797e+03, threshold=7.535e+02, percent-clipped=14.0 +2022-12-07 11:32:57,899 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=28922.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:33:07,484 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=28933.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:33:16,601 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=28943.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:33:30,089 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=28959.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 11:33:55,196 INFO [train.py:873] (2/4) Epoch 4, batch 6300, loss[loss=0.1914, simple_loss=0.2045, pruned_loss=0.08917, over 13524.00 frames. ], tot_loss[loss=0.1909, simple_loss=0.1997, pruned_loss=0.09106, over 1986853.33 frames. ], batch size: 100, lr: 1.80e-02, grad_scale: 8.0 +2022-12-07 11:33:58,978 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=28991.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:34:08,989 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29002.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:34:23,887 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.333e+02 2.322e+02 3.000e+02 3.617e+02 7.085e+02, threshold=6.000e+02, percent-clipped=0.0 +2022-12-07 11:34:51,187 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.66 vs. limit=2.0 +2022-12-07 11:35:03,259 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29063.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:35:09,676 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6861, 3.0158, 4.2199, 3.0308, 4.3304, 4.3490, 3.9004, 3.5828], + device='cuda:2'), covar=tensor([0.0163, 0.2444, 0.0461, 0.1540, 0.0518, 0.0391, 0.1751, 0.2079], + device='cuda:2'), in_proj_covar=tensor([0.0255, 0.0348, 0.0330, 0.0314, 0.0322, 0.0270, 0.0315, 0.0367], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:35:23,301 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4745, 4.0665, 4.1131, 4.6272, 4.0850, 3.4475, 4.5406, 4.4986], + device='cuda:2'), covar=tensor([0.0737, 0.0633, 0.0614, 0.0515, 0.0701, 0.0620, 0.0684, 0.0710], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0088, 0.0108, 0.0106, 0.0115, 0.0083, 0.0120, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:35:24,089 INFO [train.py:873] (2/4) Epoch 4, batch 6400, loss[loss=0.1643, simple_loss=0.1902, pruned_loss=0.06923, over 14254.00 frames. ], tot_loss[loss=0.1887, simple_loss=0.198, pruned_loss=0.08971, over 1972531.63 frames. ], batch size: 80, lr: 1.79e-02, grad_scale: 8.0 +2022-12-07 11:35:52,467 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.074e+02 2.620e+02 3.469e+02 4.373e+02 1.002e+03, threshold=6.937e+02, percent-clipped=9.0 +2022-12-07 11:36:12,682 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29142.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:36:17,910 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29148.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:36:50,044 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.34 vs. limit=5.0 +2022-12-07 11:36:52,307 INFO [train.py:873] (2/4) Epoch 4, batch 6500, loss[loss=0.1843, simple_loss=0.1952, pruned_loss=0.08668, over 14323.00 frames. ], tot_loss[loss=0.1905, simple_loss=0.1993, pruned_loss=0.09081, over 1993057.26 frames. ], batch size: 60, lr: 1.79e-02, grad_scale: 8.0 +2022-12-07 11:36:54,888 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29190.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:37:00,023 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29196.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:37:18,808 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=29217.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:37:20,463 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.481e+02 2.712e+02 3.572e+02 4.670e+02 1.139e+03, threshold=7.145e+02, percent-clipped=7.0 +2022-12-07 11:37:29,922 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.80 vs. limit=5.0 +2022-12-07 11:37:35,595 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29236.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:38:07,639 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.6917, 1.2110, 1.2638, 1.1673, 1.0749, 1.1758, 1.0379, 0.7967], + device='cuda:2'), covar=tensor([0.2914, 0.0578, 0.0214, 0.0487, 0.0948, 0.0183, 0.1181, 0.1092], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0058, 0.0051, 0.0052, 0.0070, 0.0054, 0.0080, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:38:20,130 INFO [train.py:873] (2/4) Epoch 4, batch 6600, loss[loss=0.1874, simple_loss=0.2002, pruned_loss=0.08728, over 14232.00 frames. ], tot_loss[loss=0.1891, simple_loss=0.1986, pruned_loss=0.08981, over 2066041.44 frames. ], batch size: 37, lr: 1.79e-02, grad_scale: 8.0 +2022-12-07 11:38:29,396 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29297.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:38:48,062 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.105e+02 2.617e+02 3.366e+02 4.262e+02 9.069e+02, threshold=6.731e+02, percent-clipped=3.0 +2022-12-07 11:39:03,452 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4526, 2.2183, 2.3745, 1.2226, 2.0604, 2.1321, 2.5641, 2.0102], + device='cuda:2'), covar=tensor([0.0823, 0.2173, 0.1220, 0.3527, 0.1223, 0.0576, 0.0574, 0.2041], + device='cuda:2'), in_proj_covar=tensor([0.0096, 0.0221, 0.0112, 0.0130, 0.0097, 0.0097, 0.0084, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0006, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:39:18,504 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.74 vs. limit=5.0 +2022-12-07 11:39:22,474 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=29358.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:39:47,260 INFO [train.py:873] (2/4) Epoch 4, batch 6700, loss[loss=0.1654, simple_loss=0.1875, pruned_loss=0.07159, over 14166.00 frames. ], tot_loss[loss=0.1886, simple_loss=0.1983, pruned_loss=0.08947, over 1954725.79 frames. ], batch size: 35, lr: 1.78e-02, grad_scale: 8.0 +2022-12-07 11:39:55,963 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.20 vs. limit=2.0 +2022-12-07 11:40:14,795 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.599e+02 2.696e+02 3.507e+02 4.703e+02 1.037e+03, threshold=7.015e+02, percent-clipped=5.0 +2022-12-07 11:40:32,232 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.51 vs. limit=2.0 +2022-12-07 11:41:13,497 INFO [train.py:873] (2/4) Epoch 4, batch 6800, loss[loss=0.2439, simple_loss=0.1925, pruned_loss=0.1477, over 1189.00 frames. ], tot_loss[loss=0.1893, simple_loss=0.1985, pruned_loss=0.09001, over 1914359.49 frames. ], batch size: 100, lr: 1.78e-02, grad_scale: 8.0 +2022-12-07 11:41:28,314 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.22 vs. limit=5.0 +2022-12-07 11:41:39,860 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29517.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:41:42,139 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.212e+02 2.836e+02 3.618e+02 4.474e+02 6.792e+02, threshold=7.235e+02, percent-clipped=0.0 +2022-12-07 11:41:47,785 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.26 vs. limit=2.0 +2022-12-07 11:42:03,094 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.46 vs. limit=2.0 +2022-12-07 11:42:04,313 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4136, 2.2820, 2.5018, 2.4455, 2.4695, 2.2205, 1.2430, 2.2518], + device='cuda:2'), covar=tensor([0.0282, 0.0328, 0.0354, 0.0219, 0.0257, 0.0744, 0.1857, 0.0277], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0127, 0.0117, 0.0102, 0.0158, 0.0114, 0.0147, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 11:42:09,648 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29551.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:42:20,661 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0 +2022-12-07 11:42:21,755 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29565.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:42:41,256 INFO [train.py:873] (2/4) Epoch 4, batch 6900, loss[loss=0.1725, simple_loss=0.1901, pruned_loss=0.07744, over 14003.00 frames. ], tot_loss[loss=0.1919, simple_loss=0.2004, pruned_loss=0.09171, over 1957149.03 frames. ], batch size: 22, lr: 1.78e-02, grad_scale: 8.0 +2022-12-07 11:42:45,746 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=29592.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:43:03,026 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29612.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:43:09,418 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.448e+02 2.737e+02 3.624e+02 4.962e+02 1.279e+03, threshold=7.248e+02, percent-clipped=6.0 +2022-12-07 11:43:10,926 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.63 vs. limit=5.0 +2022-12-07 11:43:18,591 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.79 vs. limit=2.0 +2022-12-07 11:43:26,029 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-12-07 11:43:42,713 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29658.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:43:45,833 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29662.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:43:57,018 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5630, 3.3201, 3.2880, 3.6401, 3.1367, 2.8579, 3.6142, 3.5546], + device='cuda:2'), covar=tensor([0.0688, 0.0772, 0.0710, 0.0654, 0.0871, 0.0712, 0.0658, 0.0734], + device='cuda:2'), in_proj_covar=tensor([0.0103, 0.0086, 0.0104, 0.0104, 0.0111, 0.0081, 0.0113, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:44:07,055 INFO [train.py:873] (2/4) Epoch 4, batch 7000, loss[loss=0.1855, simple_loss=0.2025, pruned_loss=0.08426, over 14207.00 frames. ], tot_loss[loss=0.1916, simple_loss=0.2002, pruned_loss=0.09148, over 1969742.06 frames. ], batch size: 25, lr: 1.78e-02, grad_scale: 8.0 +2022-12-07 11:44:15,181 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7701, 2.4960, 3.8029, 3.9332, 3.9726, 2.2578, 4.0294, 3.0327], + device='cuda:2'), covar=tensor([0.0098, 0.0241, 0.0241, 0.0137, 0.0067, 0.0435, 0.0048, 0.0314], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0179, 0.0259, 0.0209, 0.0166, 0.0227, 0.0149, 0.0221], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 11:44:16,480 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-07 11:44:23,793 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29706.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:44:35,728 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.520e+02 2.834e+02 3.387e+02 4.469e+02 7.718e+02, threshold=6.775e+02, percent-clipped=1.0 +2022-12-07 11:44:38,265 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29723.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 11:45:23,312 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.19 vs. limit=5.0 +2022-12-07 11:45:33,335 INFO [train.py:873] (2/4) Epoch 4, batch 7100, loss[loss=0.176, simple_loss=0.1959, pruned_loss=0.07803, over 14080.00 frames. ], tot_loss[loss=0.1901, simple_loss=0.199, pruned_loss=0.09066, over 1970721.92 frames. ], batch size: 26, lr: 1.77e-02, grad_scale: 8.0 +2022-12-07 11:45:46,845 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=29802.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:45:57,015 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 11:46:02,683 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.326e+02 2.583e+02 3.436e+02 4.712e+02 1.166e+03, threshold=6.872e+02, percent-clipped=4.0 +2022-12-07 11:46:32,481 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1978, 2.2355, 4.7820, 4.2121, 4.4237, 4.6702, 4.4386, 4.7390], + device='cuda:2'), covar=tensor([0.0978, 0.0959, 0.0064, 0.0113, 0.0094, 0.0086, 0.0086, 0.0090], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0153, 0.0095, 0.0132, 0.0111, 0.0117, 0.0086, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:46:38,925 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0861, 2.3238, 4.1458, 2.7542, 4.0083, 1.8466, 3.0354, 3.7654], + device='cuda:2'), covar=tensor([0.0293, 0.5514, 0.0364, 1.1030, 0.0260, 0.4755, 0.1435, 0.0228], + device='cuda:2'), in_proj_covar=tensor([0.0226, 0.0280, 0.0168, 0.0373, 0.0176, 0.0282, 0.0259, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 11:46:40,735 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=29863.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:47:01,597 INFO [train.py:873] (2/4) Epoch 4, batch 7200, loss[loss=0.1831, simple_loss=0.2044, pruned_loss=0.08093, over 14596.00 frames. ], tot_loss[loss=0.1908, simple_loss=0.1997, pruned_loss=0.09094, over 2079182.21 frames. ], batch size: 43, lr: 1.77e-02, grad_scale: 8.0 +2022-12-07 11:47:06,132 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=29892.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:47:19,521 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=29907.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:47:30,663 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.357e+02 2.626e+02 3.344e+02 4.272e+02 1.259e+03, threshold=6.687e+02, percent-clipped=4.0 +2022-12-07 11:47:47,973 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=29940.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:48:28,907 INFO [train.py:873] (2/4) Epoch 4, batch 7300, loss[loss=0.1756, simple_loss=0.1999, pruned_loss=0.07569, over 14257.00 frames. ], tot_loss[loss=0.1891, simple_loss=0.1981, pruned_loss=0.09003, over 2004590.65 frames. ], batch size: 25, lr: 1.77e-02, grad_scale: 8.0 +2022-12-07 11:48:48,792 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-12-07 11:48:59,524 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30018.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 11:49:01,143 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.428e+02 2.508e+02 3.092e+02 3.972e+02 1.080e+03, threshold=6.184e+02, percent-clipped=3.0 +2022-12-07 11:49:59,920 INFO [train.py:873] (2/4) Epoch 4, batch 7400, loss[loss=0.1955, simple_loss=0.1732, pruned_loss=0.1089, over 1245.00 frames. ], tot_loss[loss=0.1878, simple_loss=0.1975, pruned_loss=0.08903, over 2011652.21 frames. ], batch size: 100, lr: 1.76e-02, grad_scale: 8.0 +2022-12-07 11:50:05,442 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8004, 1.6239, 1.8860, 1.8463, 2.1290, 1.7312, 1.7402, 1.8996], + device='cuda:2'), covar=tensor([0.0179, 0.0483, 0.0084, 0.0179, 0.0090, 0.0280, 0.0087, 0.0197], + device='cuda:2'), in_proj_covar=tensor([0.0264, 0.0345, 0.0333, 0.0311, 0.0320, 0.0267, 0.0315, 0.0366], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:50:29,607 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.686e+02 2.725e+02 3.785e+02 4.899e+02 8.566e+02, threshold=7.571e+02, percent-clipped=11.0 +2022-12-07 11:50:33,921 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30125.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:50:53,486 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9874, 1.3687, 3.6077, 3.5422, 3.5369, 3.6272, 3.0518, 3.6972], + device='cuda:2'), covar=tensor([0.0948, 0.1249, 0.0076, 0.0118, 0.0121, 0.0080, 0.0177, 0.0073], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0151, 0.0093, 0.0128, 0.0110, 0.0114, 0.0084, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:51:02,850 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30158.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:51:27,357 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30186.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:51:28,007 INFO [train.py:873] (2/4) Epoch 4, batch 7500, loss[loss=0.1679, simple_loss=0.1864, pruned_loss=0.07475, over 14286.00 frames. ], tot_loss[loss=0.1892, simple_loss=0.1986, pruned_loss=0.08993, over 2047145.24 frames. ], batch size: 31, lr: 1.76e-02, grad_scale: 8.0 +2022-12-07 11:51:35,978 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8898, 1.3317, 2.4094, 2.3563, 2.5185, 2.4685, 1.8520, 2.5327], + device='cuda:2'), covar=tensor([0.0466, 0.0764, 0.0069, 0.0144, 0.0139, 0.0078, 0.0237, 0.0085], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0150, 0.0095, 0.0128, 0.0110, 0.0114, 0.0083, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:51:45,986 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30207.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:51:47,606 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30209.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:51:56,540 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.420e+02 2.609e+02 3.309e+02 4.020e+02 7.663e+02, threshold=6.618e+02, percent-clipped=1.0 +2022-12-07 11:52:55,166 INFO [train.py:873] (2/4) Epoch 5, batch 0, loss[loss=0.2685, simple_loss=0.2488, pruned_loss=0.1441, over 7761.00 frames. ], tot_loss[loss=0.2685, simple_loss=0.2488, pruned_loss=0.1441, over 7761.00 frames. ], batch size: 100, lr: 1.64e-02, grad_scale: 8.0 +2022-12-07 11:52:55,166 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 11:53:02,316 INFO [train.py:905] (2/4) Epoch 5, validation: loss=0.1366, simple_loss=0.1807, pruned_loss=0.04626, over 857387.00 frames. +2022-12-07 11:53:02,317 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 11:53:07,520 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30255.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:53:21,383 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30270.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:53:22,120 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3993, 4.0582, 4.0487, 4.4917, 4.1193, 3.6151, 4.4942, 4.3358], + device='cuda:2'), covar=tensor([0.0751, 0.0643, 0.0567, 0.0565, 0.0665, 0.0537, 0.0572, 0.0756], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0088, 0.0104, 0.0106, 0.0114, 0.0083, 0.0116, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:53:28,443 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0830, 3.6860, 3.8155, 4.1329, 3.7846, 3.3737, 4.1376, 4.0398], + device='cuda:2'), covar=tensor([0.0701, 0.0654, 0.0547, 0.0562, 0.0668, 0.0573, 0.0590, 0.0772], + device='cuda:2'), in_proj_covar=tensor([0.0108, 0.0089, 0.0105, 0.0108, 0.0115, 0.0084, 0.0117, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 11:53:37,451 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.80 vs. limit=2.0 +2022-12-07 11:53:57,229 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 11:54:03,558 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30318.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 11:54:05,440 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.465e+01 2.449e+02 3.602e+02 5.226e+02 1.817e+03, threshold=7.204e+02, percent-clipped=11.0 +2022-12-07 11:54:29,328 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-12-07 11:54:31,491 INFO [train.py:873] (2/4) Epoch 5, batch 100, loss[loss=0.1367, simple_loss=0.1685, pruned_loss=0.05243, over 14095.00 frames. ], tot_loss[loss=0.1859, simple_loss=0.197, pruned_loss=0.08742, over 885610.49 frames. ], batch size: 22, lr: 1.64e-02, grad_scale: 8.0 +2022-12-07 11:54:40,752 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30360.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:54:45,795 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30366.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:55:32,796 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.580e+02 2.778e+02 3.417e+02 4.208e+02 9.792e+02, threshold=6.833e+02, percent-clipped=3.0 +2022-12-07 11:55:33,886 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30421.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:55:58,007 INFO [train.py:873] (2/4) Epoch 5, batch 200, loss[loss=0.2201, simple_loss=0.2163, pruned_loss=0.112, over 8626.00 frames. ], tot_loss[loss=0.185, simple_loss=0.1965, pruned_loss=0.08677, over 1307411.90 frames. ], batch size: 100, lr: 1.63e-02, grad_scale: 8.0 +2022-12-07 11:56:06,144 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30458.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:56:26,441 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30481.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:56:36,687 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2652, 1.3785, 0.9019, 1.3273, 1.0564, 0.9826, 1.4114, 1.2167], + device='cuda:2'), covar=tensor([0.0990, 0.1305, 0.1457, 0.1282, 0.1235, 0.0406, 0.0287, 0.1680], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0011, 0.0010, 0.0012, 0.0015, 0.0011, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.2086e-05, 5.3803e-05, 5.3492e-05, 4.8225e-05, 5.3674e-05, 6.6439e-05, + 5.8827e-05, 7.2142e-05], device='cuda:2') +2022-12-07 11:56:44,949 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.45 vs. limit=5.0 +2022-12-07 11:56:48,247 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30506.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:57:00,757 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.239e+02 2.564e+02 3.377e+02 4.486e+02 9.729e+02, threshold=6.754e+02, percent-clipped=3.0 +2022-12-07 11:57:25,911 INFO [train.py:873] (2/4) Epoch 5, batch 300, loss[loss=0.2189, simple_loss=0.1925, pruned_loss=0.1226, over 1193.00 frames. ], tot_loss[loss=0.1847, simple_loss=0.1958, pruned_loss=0.08676, over 1609099.75 frames. ], batch size: 100, lr: 1.63e-02, grad_scale: 8.0 +2022-12-07 11:57:39,931 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30565.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:58:12,477 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2210, 1.2814, 3.3240, 1.3003, 3.0730, 3.2227, 2.2318, 3.5300], + device='cuda:2'), covar=tensor([0.0211, 0.2712, 0.0282, 0.2211, 0.0675, 0.0311, 0.0805, 0.0150], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0159, 0.0129, 0.0169, 0.0148, 0.0139, 0.0119, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 11:58:27,948 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.393e+02 2.626e+02 3.332e+02 4.486e+02 9.530e+02, threshold=6.664e+02, percent-clipped=3.0 +2022-12-07 11:58:47,018 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0952, 2.1573, 2.0562, 1.8809, 1.6549, 2.1619, 2.0749, 0.8720], + device='cuda:2'), covar=tensor([0.3657, 0.0836, 0.1080, 0.0878, 0.1354, 0.0657, 0.1297, 0.3817], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0058, 0.0050, 0.0048, 0.0068, 0.0052, 0.0078, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:58:53,713 INFO [train.py:873] (2/4) Epoch 5, batch 400, loss[loss=0.1669, simple_loss=0.1981, pruned_loss=0.0679, over 14250.00 frames. ], tot_loss[loss=0.1823, simple_loss=0.1947, pruned_loss=0.08501, over 1834371.09 frames. ], batch size: 37, lr: 1.63e-02, grad_scale: 8.0 +2022-12-07 11:58:59,029 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30655.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:59:00,004 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8683, 3.0573, 4.5530, 3.4297, 4.5745, 4.3930, 3.9891, 3.7299], + device='cuda:2'), covar=tensor([0.0190, 0.2946, 0.0394, 0.1329, 0.0535, 0.0524, 0.1686, 0.1937], + device='cuda:2'), in_proj_covar=tensor([0.0270, 0.0351, 0.0336, 0.0321, 0.0335, 0.0277, 0.0323, 0.0375], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 11:59:09,362 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6765, 4.5014, 4.9382, 4.2478, 4.6517, 5.0807, 1.9789, 4.4512], + device='cuda:2'), covar=tensor([0.0135, 0.0232, 0.0267, 0.0228, 0.0265, 0.0087, 0.2621, 0.0221], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0131, 0.0122, 0.0107, 0.0161, 0.0117, 0.0150, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 11:59:52,596 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=30716.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 11:59:52,666 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30716.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 11:59:56,047 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.257e+02 2.557e+02 3.010e+02 3.908e+02 7.694e+02, threshold=6.020e+02, percent-clipped=1.0 +2022-12-07 12:00:14,473 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=14.44 vs. limit=5.0 +2022-12-07 12:00:15,669 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8178, 1.3322, 2.9965, 1.3906, 3.1037, 2.9484, 2.1220, 3.1702], + device='cuda:2'), covar=tensor([0.0179, 0.2239, 0.0241, 0.1701, 0.0230, 0.0311, 0.0823, 0.0143], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0158, 0.0131, 0.0167, 0.0149, 0.0138, 0.0118, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:00:17,737 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9622, 2.6375, 4.0090, 4.0728, 4.2703, 2.4584, 4.3861, 3.1908], + device='cuda:2'), covar=tensor([0.0112, 0.0299, 0.0312, 0.0141, 0.0080, 0.0471, 0.0063, 0.0341], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0182, 0.0261, 0.0208, 0.0166, 0.0229, 0.0151, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 12:00:21,154 INFO [train.py:873] (2/4) Epoch 5, batch 500, loss[loss=0.1867, simple_loss=0.2055, pruned_loss=0.08391, over 14255.00 frames. ], tot_loss[loss=0.184, simple_loss=0.1954, pruned_loss=0.08633, over 1861927.38 frames. ], batch size: 46, lr: 1.63e-02, grad_scale: 8.0 +2022-12-07 12:00:38,793 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2943, 2.8609, 4.0859, 3.2520, 4.0552, 4.0912, 3.7005, 3.3461], + device='cuda:2'), covar=tensor([0.0181, 0.2022, 0.0483, 0.1175, 0.0473, 0.0321, 0.1151, 0.1786], + device='cuda:2'), in_proj_covar=tensor([0.0261, 0.0340, 0.0332, 0.0313, 0.0326, 0.0272, 0.0313, 0.0364], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 12:00:48,352 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8441, 0.6889, 0.7300, 0.9389, 0.9430, 0.6062, 0.9182, 0.8904], + device='cuda:2'), covar=tensor([0.0452, 0.0823, 0.0279, 0.0329, 0.0353, 0.0312, 0.0411, 0.0363], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0020, 0.0020, 0.0019, 0.0019, 0.0026, 0.0019, 0.0019], + device='cuda:2'), out_proj_covar=tensor([6.9809e-05, 7.2283e-05, 6.8115e-05, 6.9998e-05, 7.0567e-05, 9.1620e-05, + 7.5289e-05, 6.9353e-05], device='cuda:2') +2022-12-07 12:00:49,031 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30781.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:01:23,928 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.147e+02 2.567e+02 3.338e+02 3.998e+02 8.568e+02, threshold=6.676e+02, percent-clipped=3.0 +2022-12-07 12:01:31,368 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30829.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:01:48,857 INFO [train.py:873] (2/4) Epoch 5, batch 600, loss[loss=0.1935, simple_loss=0.1926, pruned_loss=0.09719, over 6905.00 frames. ], tot_loss[loss=0.1843, simple_loss=0.1952, pruned_loss=0.08672, over 1863967.80 frames. ], batch size: 100, lr: 1.62e-02, grad_scale: 4.0 +2022-12-07 12:01:49,070 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9478, 0.6934, 0.8399, 0.9947, 0.9209, 0.7025, 1.0926, 1.0878], + device='cuda:2'), covar=tensor([0.0519, 0.0977, 0.0628, 0.0364, 0.0608, 0.0413, 0.0404, 0.0472], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0020, 0.0020, 0.0019, 0.0020, 0.0026, 0.0019, 0.0019], + device='cuda:2'), out_proj_covar=tensor([7.0985e-05, 7.4017e-05, 6.9189e-05, 7.0914e-05, 7.1790e-05, 9.2747e-05, + 7.5242e-05, 6.9758e-05], device='cuda:2') +2022-12-07 12:02:02,899 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=30865.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:02:08,126 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1241, 1.9026, 2.0692, 2.1195, 2.0748, 2.0314, 2.1921, 1.8744], + device='cuda:2'), covar=tensor([0.0637, 0.1111, 0.0541, 0.0587, 0.0902, 0.0493, 0.0807, 0.0617], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0194, 0.0135, 0.0125, 0.0133, 0.0103, 0.0195, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 12:02:36,293 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30903.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:02:45,013 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=30913.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:02:51,900 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.252e+02 2.564e+02 3.109e+02 3.998e+02 8.746e+02, threshold=6.217e+02, percent-clipped=4.0 +2022-12-07 12:02:55,473 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30925.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:03:03,273 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=30934.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:03:15,071 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 12:03:16,249 INFO [train.py:873] (2/4) Epoch 5, batch 700, loss[loss=0.1751, simple_loss=0.1783, pruned_loss=0.0859, over 4943.00 frames. ], tot_loss[loss=0.1839, simple_loss=0.1949, pruned_loss=0.08643, over 1865466.60 frames. ], batch size: 100, lr: 1.62e-02, grad_scale: 4.0 +2022-12-07 12:03:29,809 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30964.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:03:40,198 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.09 vs. limit=5.0 +2022-12-07 12:03:42,928 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-12-07 12:03:49,010 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30986.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:03:56,552 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=30995.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:04:10,469 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31011.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 12:04:10,551 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31011.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 12:04:14,733 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31016.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:04:18,900 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.713e+02 2.586e+02 3.335e+02 4.382e+02 1.879e+03, threshold=6.670e+02, percent-clipped=6.0 +2022-12-07 12:04:43,827 INFO [train.py:873] (2/4) Epoch 5, batch 800, loss[loss=0.1899, simple_loss=0.2014, pruned_loss=0.08923, over 13954.00 frames. ], tot_loss[loss=0.1846, simple_loss=0.1956, pruned_loss=0.0868, over 1895762.97 frames. ], batch size: 23, lr: 1.62e-02, grad_scale: 8.0 +2022-12-07 12:04:57,135 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31064.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:05:00,570 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.21 vs. limit=2.0 +2022-12-07 12:05:03,588 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9702, 1.2743, 1.3029, 1.2209, 0.9745, 1.2889, 0.9702, 0.8859], + device='cuda:2'), covar=tensor([0.1747, 0.0452, 0.0304, 0.0437, 0.1042, 0.0436, 0.1747, 0.1272], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0058, 0.0050, 0.0049, 0.0071, 0.0053, 0.0078, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 12:05:04,603 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31072.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 12:05:10,116 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0637, 2.9431, 2.1942, 3.3239, 2.9097, 3.1329, 2.6837, 2.2333], + device='cuda:2'), covar=tensor([0.0414, 0.0886, 0.2969, 0.0237, 0.0447, 0.0557, 0.1021, 0.3303], + device='cuda:2'), in_proj_covar=tensor([0.0225, 0.0298, 0.0313, 0.0183, 0.0244, 0.0237, 0.0263, 0.0302], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:05:30,174 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-07 12:05:37,067 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9696, 1.3047, 3.0529, 1.4246, 3.2183, 3.0454, 2.2000, 3.2297], + device='cuda:2'), covar=tensor([0.0150, 0.2277, 0.0254, 0.1705, 0.0181, 0.0250, 0.0640, 0.0125], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0155, 0.0127, 0.0160, 0.0144, 0.0133, 0.0113, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:05:47,140 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.226e+02 2.770e+02 3.066e+02 3.847e+02 6.961e+02, threshold=6.133e+02, percent-clipped=2.0 +2022-12-07 12:06:11,658 INFO [train.py:873] (2/4) Epoch 5, batch 900, loss[loss=0.1817, simple_loss=0.1935, pruned_loss=0.08496, over 14302.00 frames. ], tot_loss[loss=0.1837, simple_loss=0.1949, pruned_loss=0.08621, over 1905097.02 frames. ], batch size: 60, lr: 1.62e-02, grad_scale: 8.0 +2022-12-07 12:06:15,266 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0168, 3.6584, 3.5114, 4.0832, 3.9223, 3.5347, 4.0362, 3.2871], + device='cuda:2'), covar=tensor([0.0430, 0.0948, 0.0349, 0.0341, 0.0635, 0.1109, 0.0613, 0.0566], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0202, 0.0137, 0.0129, 0.0135, 0.0107, 0.0204, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 12:06:58,825 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6168, 1.9101, 2.6497, 2.8165, 2.6188, 1.9234, 2.7394, 2.0983], + device='cuda:2'), covar=tensor([0.0104, 0.0255, 0.0162, 0.0104, 0.0094, 0.0387, 0.0080, 0.0277], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0183, 0.0262, 0.0205, 0.0165, 0.0229, 0.0153, 0.0221], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 12:07:12,566 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3508, 2.2807, 4.3412, 3.0917, 4.2712, 2.1625, 3.2535, 3.9623], + device='cuda:2'), covar=tensor([0.0269, 0.4598, 0.0209, 0.7547, 0.0270, 0.3409, 0.1172, 0.0200], + device='cuda:2'), in_proj_covar=tensor([0.0225, 0.0272, 0.0166, 0.0367, 0.0176, 0.0273, 0.0252, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:07:14,865 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.620e+02 2.339e+02 3.126e+02 3.777e+02 6.609e+02, threshold=6.252e+02, percent-clipped=2.0 +2022-12-07 12:07:28,831 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9424, 2.2955, 3.9822, 4.2015, 4.0173, 2.3903, 4.1323, 3.0659], + device='cuda:2'), covar=tensor([0.0111, 0.0308, 0.0306, 0.0120, 0.0086, 0.0513, 0.0063, 0.0367], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0184, 0.0264, 0.0208, 0.0167, 0.0231, 0.0155, 0.0223], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 12:07:39,739 INFO [train.py:873] (2/4) Epoch 5, batch 1000, loss[loss=0.2013, simple_loss=0.2127, pruned_loss=0.09489, over 14067.00 frames. ], tot_loss[loss=0.1841, simple_loss=0.1953, pruned_loss=0.08646, over 1961100.48 frames. ], batch size: 22, lr: 1.61e-02, grad_scale: 8.0 +2022-12-07 12:07:48,346 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31259.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:08:00,775 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1353, 1.8298, 2.5851, 1.8413, 1.6998, 2.3879, 1.0717, 2.0334], + device='cuda:2'), covar=tensor([0.1299, 0.2490, 0.0876, 0.2676, 0.3750, 0.1128, 0.7688, 0.1467], + device='cuda:2'), in_proj_covar=tensor([0.0068, 0.0074, 0.0069, 0.0084, 0.0098, 0.0064, 0.0136, 0.0069], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 12:08:07,828 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31281.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:08:15,261 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31290.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:08:19,090 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.30 vs. limit=2.0 +2022-12-07 12:08:33,855 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31311.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:08:42,294 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.584e+02 2.699e+02 3.594e+02 4.260e+02 1.045e+03, threshold=7.189e+02, percent-clipped=6.0 +2022-12-07 12:08:58,081 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.86 vs. limit=2.0 +2022-12-07 12:09:00,679 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-12-07 12:09:06,865 INFO [train.py:873] (2/4) Epoch 5, batch 1100, loss[loss=0.1911, simple_loss=0.1997, pruned_loss=0.09126, over 14171.00 frames. ], tot_loss[loss=0.1848, simple_loss=0.1956, pruned_loss=0.08703, over 1912023.57 frames. ], batch size: 99, lr: 1.61e-02, grad_scale: 8.0 +2022-12-07 12:09:15,219 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31359.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:09:16,458 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9024, 4.7147, 4.2559, 4.4659, 4.4830, 4.6597, 4.8545, 4.8027], + device='cuda:2'), covar=tensor([0.0625, 0.0389, 0.1487, 0.2471, 0.0574, 0.0526, 0.0678, 0.0815], + device='cuda:2'), in_proj_covar=tensor([0.0267, 0.0220, 0.0333, 0.0433, 0.0252, 0.0297, 0.0308, 0.0256], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:09:22,303 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31367.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 12:09:59,551 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31409.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:10:00,410 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0632, 0.9009, 0.9749, 0.9552, 0.7198, 0.4830, 0.7641, 0.6193], + device='cuda:2'), covar=tensor([0.0283, 0.0277, 0.0404, 0.0226, 0.0420, 0.0481, 0.0259, 0.0620], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0011, 0.0011, 0.0010, 0.0012, 0.0015, 0.0012, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.4990e-05, 5.5548e-05, 5.4808e-05, 4.9794e-05, 5.7767e-05, 7.1315e-05, + 6.4380e-05, 7.5467e-05], device='cuda:2') +2022-12-07 12:10:02,802 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4210, 5.1343, 4.6836, 4.8818, 4.7923, 5.3029, 5.4043, 5.4794], + device='cuda:2'), covar=tensor([0.0653, 0.0343, 0.1662, 0.2234, 0.0632, 0.0477, 0.0610, 0.0591], + device='cuda:2'), in_proj_covar=tensor([0.0266, 0.0221, 0.0332, 0.0430, 0.0254, 0.0298, 0.0308, 0.0255], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:10:09,480 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.418e+02 2.558e+02 3.355e+02 4.456e+02 1.089e+03, threshold=6.709e+02, percent-clipped=2.0 +2022-12-07 12:10:18,178 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2009, 2.4996, 4.2405, 4.4871, 4.5568, 2.7252, 4.5011, 3.5259], + device='cuda:2'), covar=tensor([0.0108, 0.0312, 0.0353, 0.0110, 0.0078, 0.0491, 0.0069, 0.0313], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0189, 0.0269, 0.0214, 0.0170, 0.0235, 0.0157, 0.0229], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 12:10:27,541 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-12-07 12:10:34,138 INFO [train.py:873] (2/4) Epoch 5, batch 1200, loss[loss=0.2125, simple_loss=0.2181, pruned_loss=0.1034, over 14252.00 frames. ], tot_loss[loss=0.1837, simple_loss=0.1952, pruned_loss=0.08607, over 1970438.30 frames. ], batch size: 66, lr: 1.61e-02, grad_scale: 8.0 +2022-12-07 12:10:52,737 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31470.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:11:10,767 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31491.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 12:11:25,799 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0041, 1.9340, 4.4074, 4.1858, 4.0601, 4.3802, 3.8966, 4.4464], + device='cuda:2'), covar=tensor([0.1058, 0.1093, 0.0064, 0.0097, 0.0103, 0.0080, 0.0097, 0.0080], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0150, 0.0096, 0.0130, 0.0108, 0.0113, 0.0084, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:11:26,752 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3958, 2.8175, 4.2943, 2.8365, 4.1559, 4.0408, 3.8107, 3.6631], + device='cuda:2'), covar=tensor([0.0254, 0.2521, 0.0387, 0.1695, 0.0669, 0.0470, 0.1780, 0.1999], + device='cuda:2'), in_proj_covar=tensor([0.0264, 0.0336, 0.0333, 0.0311, 0.0326, 0.0268, 0.0317, 0.0356], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 12:11:37,430 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.031e+02 2.496e+02 3.332e+02 4.341e+02 8.542e+02, threshold=6.664e+02, percent-clipped=7.0 +2022-12-07 12:12:00,090 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.83 vs. limit=5.0 +2022-12-07 12:12:02,327 INFO [train.py:873] (2/4) Epoch 5, batch 1300, loss[loss=0.1883, simple_loss=0.2046, pruned_loss=0.08596, over 14476.00 frames. ], tot_loss[loss=0.1832, simple_loss=0.1948, pruned_loss=0.08585, over 2005660.89 frames. ], batch size: 51, lr: 1.61e-02, grad_scale: 8.0 +2022-12-07 12:12:05,319 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31552.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 12:12:11,360 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31559.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:12:30,395 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31581.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:12:38,838 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31590.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:12:54,064 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31607.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:13:05,507 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.035e+02 2.616e+02 3.313e+02 3.855e+02 7.867e+02, threshold=6.626e+02, percent-clipped=4.0 +2022-12-07 12:13:13,009 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31629.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:13:20,929 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31638.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:13:30,389 INFO [train.py:873] (2/4) Epoch 5, batch 1400, loss[loss=0.1679, simple_loss=0.1494, pruned_loss=0.09324, over 1282.00 frames. ], tot_loss[loss=0.1841, simple_loss=0.195, pruned_loss=0.08655, over 1922587.77 frames. ], batch size: 100, lr: 1.60e-02, grad_scale: 8.0 +2022-12-07 12:13:45,871 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=31667.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 12:14:18,254 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31704.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:14:28,096 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=31715.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 12:14:33,108 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.505e+02 2.718e+02 3.448e+02 4.143e+02 7.580e+02, threshold=6.896e+02, percent-clipped=4.0 +2022-12-07 12:14:57,261 INFO [train.py:873] (2/4) Epoch 5, batch 1500, loss[loss=0.1698, simple_loss=0.1905, pruned_loss=0.07454, over 14511.00 frames. ], tot_loss[loss=0.183, simple_loss=0.194, pruned_loss=0.08598, over 1896839.83 frames. ], batch size: 49, lr: 1.60e-02, grad_scale: 8.0 +2022-12-07 12:14:57,678 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-07 12:15:11,636 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31765.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:15:11,752 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31765.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:15:18,070 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=3.39 vs. limit=2.0 +2022-12-07 12:15:31,594 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31788.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:16:00,993 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.197e+02 2.523e+02 3.162e+02 3.978e+02 7.805e+02, threshold=6.325e+02, percent-clipped=2.0 +2022-12-07 12:16:02,926 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31823.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:16:03,662 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5900, 1.1832, 1.9757, 1.8869, 2.0276, 2.0244, 1.3716, 2.0357], + device='cuda:2'), covar=tensor([0.0381, 0.0633, 0.0098, 0.0189, 0.0155, 0.0080, 0.0247, 0.0110], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0148, 0.0095, 0.0128, 0.0108, 0.0112, 0.0085, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:16:24,803 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=31847.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 12:16:26,444 INFO [train.py:873] (2/4) Epoch 5, batch 1600, loss[loss=0.1976, simple_loss=0.2072, pruned_loss=0.09402, over 14276.00 frames. ], tot_loss[loss=0.1825, simple_loss=0.1933, pruned_loss=0.08582, over 1898179.00 frames. ], batch size: 76, lr: 1.60e-02, grad_scale: 8.0 +2022-12-07 12:16:26,598 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31849.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:16:33,621 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8734, 1.7775, 3.0575, 2.1408, 2.9272, 1.6865, 2.2708, 2.6907], + device='cuda:2'), covar=tensor([0.0593, 0.4205, 0.0347, 0.5318, 0.0359, 0.3796, 0.1267, 0.0372], + device='cuda:2'), in_proj_covar=tensor([0.0222, 0.0270, 0.0167, 0.0360, 0.0178, 0.0277, 0.0255, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:16:52,765 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31878.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:16:58,392 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31884.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:17:25,433 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31915.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:17:25,854 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.74 vs. limit=2.0 +2022-12-07 12:17:30,353 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.134e+02 2.483e+02 3.139e+02 4.013e+02 6.840e+02, threshold=6.277e+02, percent-clipped=1.0 +2022-12-07 12:17:35,600 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-12-07 12:17:46,186 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31939.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:17:54,781 INFO [train.py:873] (2/4) Epoch 5, batch 1700, loss[loss=0.1973, simple_loss=0.1741, pruned_loss=0.1102, over 1248.00 frames. ], tot_loss[loss=0.182, simple_loss=0.1933, pruned_loss=0.08536, over 1907324.92 frames. ], batch size: 100, lr: 1.60e-02, grad_scale: 8.0 +2022-12-07 12:17:56,121 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=31950.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:18:04,732 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2271, 5.0604, 4.6984, 4.7481, 4.7323, 5.0990, 5.2763, 5.1845], + device='cuda:2'), covar=tensor([0.0676, 0.0391, 0.1567, 0.2663, 0.0612, 0.0631, 0.0658, 0.0802], + device='cuda:2'), in_proj_covar=tensor([0.0272, 0.0229, 0.0347, 0.0436, 0.0259, 0.0310, 0.0320, 0.0264], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:18:15,353 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0988, 0.5863, 1.0923, 1.2606, 1.1389, 0.7182, 0.9825, 1.6020], + device='cuda:2'), covar=tensor([0.0643, 0.0759, 0.0315, 0.0813, 0.1145, 0.0309, 0.1515, 0.0463], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0018, 0.0018, 0.0017, 0.0018, 0.0024, 0.0020, 0.0019], + device='cuda:2'), out_proj_covar=tensor([6.5703e-05, 6.8433e-05, 6.3840e-05, 6.5987e-05, 6.7204e-05, 8.7306e-05, + 7.5333e-05, 6.8922e-05], device='cuda:2') +2022-12-07 12:18:18,822 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=31976.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:18:50,977 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=32011.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:18:53,957 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0 +2022-12-07 12:18:59,553 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.301e+02 2.546e+02 3.116e+02 4.047e+02 1.395e+03, threshold=6.231e+02, percent-clipped=6.0 +2022-12-07 12:19:19,197 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4142, 1.0154, 1.3243, 0.8261, 0.9773, 1.3136, 1.2369, 1.2248], + device='cuda:2'), covar=tensor([0.0233, 0.0329, 0.0472, 0.0296, 0.0795, 0.0502, 0.0213, 0.0658], + device='cuda:2'), in_proj_covar=tensor([0.0098, 0.0208, 0.0114, 0.0127, 0.0098, 0.0100, 0.0083, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 12:19:25,247 INFO [train.py:873] (2/4) Epoch 5, batch 1800, loss[loss=0.2081, simple_loss=0.2151, pruned_loss=0.1006, over 14267.00 frames. ], tot_loss[loss=0.1824, simple_loss=0.1943, pruned_loss=0.08522, over 1931976.74 frames. ], batch size: 76, lr: 1.59e-02, grad_scale: 8.0 +2022-12-07 12:19:34,866 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32060.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:19:39,322 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32065.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:19:59,753 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9084, 2.8889, 3.8920, 2.5748, 2.3844, 2.6874, 1.5486, 3.1985], + device='cuda:2'), covar=tensor([0.1274, 0.0865, 0.0555, 0.2220, 0.2117, 0.1461, 0.5200, 0.0977], + device='cuda:2'), in_proj_covar=tensor([0.0068, 0.0074, 0.0070, 0.0082, 0.0097, 0.0064, 0.0133, 0.0068], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 12:20:00,608 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3863, 1.1774, 3.5274, 1.3546, 3.4807, 3.4940, 2.6293, 3.7011], + device='cuda:2'), covar=tensor([0.0377, 0.4404, 0.0512, 0.3390, 0.0754, 0.0499, 0.0907, 0.0355], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0158, 0.0132, 0.0167, 0.0150, 0.0138, 0.0119, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:20:10,268 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1207, 1.8492, 2.2160, 2.3193, 1.9866, 1.8007, 2.2485, 2.1061], + device='cuda:2'), covar=tensor([0.0060, 0.0122, 0.0063, 0.0041, 0.0079, 0.0211, 0.0086, 0.0075], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0189, 0.0269, 0.0211, 0.0173, 0.0234, 0.0155, 0.0227], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 12:20:22,895 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32113.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:20:30,237 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.208e+02 2.694e+02 3.195e+02 4.070e+02 6.631e+02, threshold=6.390e+02, percent-clipped=2.0 +2022-12-07 12:20:40,070 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-12-07 12:20:51,090 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32144.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:20:53,870 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32147.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 12:20:55,469 INFO [train.py:873] (2/4) Epoch 5, batch 1900, loss[loss=0.1948, simple_loss=0.1864, pruned_loss=0.1016, over 3856.00 frames. ], tot_loss[loss=0.1828, simple_loss=0.1943, pruned_loss=0.08566, over 1932119.21 frames. ], batch size: 100, lr: 1.59e-02, grad_scale: 8.0 +2022-12-07 12:21:23,381 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32179.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:21:38,187 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32195.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 12:22:01,271 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.008e+02 2.535e+02 3.167e+02 4.268e+02 9.066e+02, threshold=6.334e+02, percent-clipped=2.0 +2022-12-07 12:22:13,242 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32234.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:22:26,466 INFO [train.py:873] (2/4) Epoch 5, batch 2000, loss[loss=0.1575, simple_loss=0.1682, pruned_loss=0.07334, over 5991.00 frames. ], tot_loss[loss=0.1826, simple_loss=0.194, pruned_loss=0.08558, over 1940351.27 frames. ], batch size: 100, lr: 1.59e-02, grad_scale: 8.0 +2022-12-07 12:22:45,446 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32271.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:23:16,416 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=32306.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:23:30,255 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.326e+02 2.510e+02 3.096e+02 3.855e+02 9.073e+02, threshold=6.191e+02, percent-clipped=1.0 +2022-12-07 12:23:35,571 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-12-07 12:23:55,265 INFO [train.py:873] (2/4) Epoch 5, batch 2100, loss[loss=0.1834, simple_loss=0.1599, pruned_loss=0.1035, over 2678.00 frames. ], tot_loss[loss=0.1827, simple_loss=0.194, pruned_loss=0.08574, over 1962336.40 frames. ], batch size: 100, lr: 1.59e-02, grad_scale: 8.0 +2022-12-07 12:24:05,671 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32360.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:24:07,510 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3897, 2.6107, 4.1818, 4.4944, 4.4884, 2.7294, 4.4889, 3.5289], + device='cuda:2'), covar=tensor([0.0111, 0.0372, 0.0263, 0.0131, 0.0086, 0.0553, 0.0053, 0.0391], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0188, 0.0268, 0.0212, 0.0172, 0.0232, 0.0155, 0.0226], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 12:24:11,427 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 12:24:44,905 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.20 vs. limit=2.0 +2022-12-07 12:24:50,018 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32408.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:24:51,090 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7646, 1.2723, 3.8887, 3.7112, 3.8631, 3.9089, 3.3976, 3.9335], + device='cuda:2'), covar=tensor([0.1211, 0.1436, 0.0080, 0.0164, 0.0119, 0.0095, 0.0186, 0.0097], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0155, 0.0097, 0.0134, 0.0113, 0.0116, 0.0088, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:25:01,675 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.555e+02 2.695e+02 3.113e+02 4.251e+02 1.070e+03, threshold=6.225e+02, percent-clipped=3.0 +2022-12-07 12:25:23,331 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32444.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:25:27,727 INFO [train.py:873] (2/4) Epoch 5, batch 2200, loss[loss=0.2165, simple_loss=0.2206, pruned_loss=0.1062, over 10395.00 frames. ], tot_loss[loss=0.1836, simple_loss=0.1946, pruned_loss=0.08626, over 2002830.83 frames. ], batch size: 100, lr: 1.58e-02, grad_scale: 8.0 +2022-12-07 12:25:54,669 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32479.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:26:05,776 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32492.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:26:05,916 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6206, 2.4269, 2.4451, 1.3067, 2.3011, 2.4459, 2.7898, 2.0470], + device='cuda:2'), covar=tensor([0.0647, 0.1458, 0.1223, 0.3026, 0.0911, 0.0475, 0.0585, 0.2034], + device='cuda:2'), in_proj_covar=tensor([0.0099, 0.0208, 0.0116, 0.0130, 0.0099, 0.0101, 0.0085, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 12:26:13,061 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.32 vs. limit=2.0 +2022-12-07 12:26:31,379 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.306e+02 2.575e+02 3.323e+02 4.480e+02 8.616e+02, threshold=6.645e+02, percent-clipped=4.0 +2022-12-07 12:26:36,973 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32527.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:26:43,129 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32534.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:26:56,111 INFO [train.py:873] (2/4) Epoch 5, batch 2300, loss[loss=0.2267, simple_loss=0.1892, pruned_loss=0.1321, over 1247.00 frames. ], tot_loss[loss=0.1826, simple_loss=0.1939, pruned_loss=0.08567, over 1967311.01 frames. ], batch size: 100, lr: 1.58e-02, grad_scale: 8.0 +2022-12-07 12:27:15,933 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32571.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:27:19,699 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4162, 1.9963, 3.4391, 2.5452, 3.3841, 1.9859, 2.6954, 3.1008], + device='cuda:2'), covar=tensor([0.0554, 0.5033, 0.0286, 0.7537, 0.0382, 0.4233, 0.1488, 0.0360], + device='cuda:2'), in_proj_covar=tensor([0.0225, 0.0265, 0.0167, 0.0356, 0.0174, 0.0273, 0.0250, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:27:25,532 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32582.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:27:48,289 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=32606.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:27:59,284 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32619.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:28:00,944 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.405e+02 2.763e+02 3.478e+02 4.434e+02 7.343e+02, threshold=6.957e+02, percent-clipped=3.0 +2022-12-07 12:28:26,682 INFO [train.py:873] (2/4) Epoch 5, batch 2400, loss[loss=0.172, simple_loss=0.1933, pruned_loss=0.07541, over 14291.00 frames. ], tot_loss[loss=0.1839, simple_loss=0.195, pruned_loss=0.08637, over 2000621.87 frames. ], batch size: 60, lr: 1.58e-02, grad_scale: 8.0 +2022-12-07 12:28:28,520 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4273, 2.5111, 3.1468, 1.9119, 2.1455, 2.6112, 1.5018, 2.3932], + device='cuda:2'), covar=tensor([0.1850, 0.1192, 0.0704, 0.3030, 0.2364, 0.1149, 0.5326, 0.1088], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0072, 0.0070, 0.0081, 0.0098, 0.0064, 0.0135, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 12:28:30,926 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=32654.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:28:34,411 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.22 vs. limit=5.0 +2022-12-07 12:28:58,820 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1123, 3.7664, 3.5851, 3.6819, 3.9535, 3.9625, 4.1174, 4.0819], + device='cuda:2'), covar=tensor([0.0786, 0.0879, 0.2104, 0.2943, 0.0763, 0.0769, 0.1011, 0.0813], + device='cuda:2'), in_proj_covar=tensor([0.0271, 0.0234, 0.0350, 0.0442, 0.0264, 0.0308, 0.0327, 0.0267], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:29:21,054 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-12-07 12:29:31,403 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.127e+02 2.520e+02 3.446e+02 4.436e+02 8.213e+02, threshold=6.892e+02, percent-clipped=3.0 +2022-12-07 12:29:56,007 INFO [train.py:873] (2/4) Epoch 5, batch 2500, loss[loss=0.171, simple_loss=0.1721, pruned_loss=0.0849, over 3850.00 frames. ], tot_loss[loss=0.183, simple_loss=0.1943, pruned_loss=0.08583, over 1952352.69 frames. ], batch size: 100, lr: 1.58e-02, grad_scale: 8.0 +2022-12-07 12:29:57,554 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-07 12:30:13,489 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6472, 3.6547, 3.9099, 3.3135, 3.6597, 3.7293, 1.3372, 3.5318], + device='cuda:2'), covar=tensor([0.0224, 0.0289, 0.0383, 0.0498, 0.0313, 0.0347, 0.3383, 0.0270], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0134, 0.0126, 0.0108, 0.0166, 0.0115, 0.0150, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:30:54,231 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0213, 1.8284, 4.3243, 4.0516, 4.1367, 4.2793, 3.6561, 4.3401], + device='cuda:2'), covar=tensor([0.1143, 0.1128, 0.0066, 0.0097, 0.0102, 0.0079, 0.0111, 0.0071], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0154, 0.0096, 0.0135, 0.0112, 0.0115, 0.0087, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:31:01,910 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.502e+02 2.569e+02 3.218e+02 4.442e+02 9.627e+02, threshold=6.437e+02, percent-clipped=1.0 +2022-12-07 12:31:03,071 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6340, 4.0298, 3.0692, 4.6629, 4.5260, 4.6639, 3.8060, 3.2726], + device='cuda:2'), covar=tensor([0.0362, 0.1024, 0.4430, 0.0422, 0.0390, 0.0730, 0.0955, 0.3541], + device='cuda:2'), in_proj_covar=tensor([0.0221, 0.0296, 0.0310, 0.0183, 0.0237, 0.0239, 0.0255, 0.0284], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:31:27,893 INFO [train.py:873] (2/4) Epoch 5, batch 2600, loss[loss=0.197, simple_loss=0.2045, pruned_loss=0.09469, over 14239.00 frames. ], tot_loss[loss=0.1828, simple_loss=0.1943, pruned_loss=0.08563, over 2038648.16 frames. ], batch size: 35, lr: 1.57e-02, grad_scale: 16.0 +2022-12-07 12:31:36,431 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1382, 2.0465, 1.7382, 1.7877, 2.0526, 1.9296, 2.0366, 2.0228], + device='cuda:2'), covar=tensor([0.0771, 0.0962, 0.2181, 0.2530, 0.0812, 0.1040, 0.1373, 0.1053], + device='cuda:2'), in_proj_covar=tensor([0.0274, 0.0231, 0.0348, 0.0437, 0.0258, 0.0305, 0.0324, 0.0265], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:32:29,892 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9086, 1.7835, 3.1740, 2.3182, 3.1176, 1.6491, 2.4914, 2.8160], + device='cuda:2'), covar=tensor([0.0527, 0.5647, 0.0355, 0.8881, 0.0413, 0.4995, 0.1504, 0.0362], + device='cuda:2'), in_proj_covar=tensor([0.0222, 0.0265, 0.0166, 0.0357, 0.0174, 0.0272, 0.0246, 0.0168], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:32:33,210 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.371e+02 2.535e+02 3.293e+02 4.332e+02 8.259e+02, threshold=6.586e+02, percent-clipped=6.0 +2022-12-07 12:32:57,303 INFO [train.py:873] (2/4) Epoch 5, batch 2700, loss[loss=0.2015, simple_loss=0.2073, pruned_loss=0.09781, over 12710.00 frames. ], tot_loss[loss=0.183, simple_loss=0.1948, pruned_loss=0.08565, over 2024075.54 frames. ], batch size: 100, lr: 1.57e-02, grad_scale: 8.0 +2022-12-07 12:33:13,542 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=32966.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:34:03,193 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.510e+02 2.537e+02 3.172e+02 3.779e+02 6.676e+02, threshold=6.344e+02, percent-clipped=1.0 +2022-12-07 12:34:07,330 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33027.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:34:09,932 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8526, 3.7825, 4.2088, 3.7298, 3.9466, 4.1891, 1.5188, 3.8328], + device='cuda:2'), covar=tensor([0.0187, 0.0287, 0.0326, 0.0349, 0.0232, 0.0217, 0.2746, 0.0230], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0136, 0.0129, 0.0108, 0.0167, 0.0116, 0.0151, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:34:27,431 INFO [train.py:873] (2/4) Epoch 5, batch 2800, loss[loss=0.1915, simple_loss=0.201, pruned_loss=0.09103, over 12739.00 frames. ], tot_loss[loss=0.1809, simple_loss=0.1933, pruned_loss=0.0842, over 2000892.84 frames. ], batch size: 100, lr: 1.57e-02, grad_scale: 8.0 +2022-12-07 12:34:37,381 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8689, 1.5133, 3.8781, 3.7149, 3.8012, 3.8918, 3.4159, 3.9526], + device='cuda:2'), covar=tensor([0.1123, 0.1276, 0.0072, 0.0122, 0.0104, 0.0074, 0.0114, 0.0078], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0155, 0.0096, 0.0136, 0.0111, 0.0115, 0.0084, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:34:45,066 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33069.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:35:07,517 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4643, 0.9836, 1.1747, 1.2264, 1.2414, 1.0636, 1.5300, 1.0874], + device='cuda:2'), covar=tensor([0.1060, 0.1858, 0.1333, 0.0600, 0.0916, 0.0553, 0.0361, 0.1362], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0012, 0.0010, 0.0012, 0.0015, 0.0011, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.6873e-05, 5.8025e-05, 5.9681e-05, 5.1613e-05, 5.9863e-05, 7.7136e-05, + 6.2971e-05, 7.9327e-05], device='cuda:2') +2022-12-07 12:35:08,392 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1917, 1.7833, 5.0625, 4.4646, 4.5754, 5.0813, 4.9492, 5.0793], + device='cuda:2'), covar=tensor([0.1032, 0.1197, 0.0051, 0.0097, 0.0085, 0.0056, 0.0031, 0.0073], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0155, 0.0097, 0.0137, 0.0113, 0.0116, 0.0085, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:35:16,131 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-12-07 12:35:33,622 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.258e+02 2.428e+02 3.052e+02 4.047e+02 7.917e+02, threshold=6.105e+02, percent-clipped=1.0 +2022-12-07 12:35:33,747 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1565, 4.8899, 4.7296, 5.2589, 4.8062, 4.2835, 5.2453, 5.0650], + device='cuda:2'), covar=tensor([0.0714, 0.0486, 0.0488, 0.0402, 0.0564, 0.0413, 0.0515, 0.0667], + device='cuda:2'), in_proj_covar=tensor([0.0111, 0.0093, 0.0110, 0.0108, 0.0115, 0.0086, 0.0124, 0.0107], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:35:40,321 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33130.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:35:56,041 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9555, 2.5461, 4.0277, 4.1291, 4.1803, 2.7040, 4.0710, 3.3258], + device='cuda:2'), covar=tensor([0.0095, 0.0280, 0.0230, 0.0125, 0.0072, 0.0450, 0.0068, 0.0296], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0190, 0.0272, 0.0217, 0.0173, 0.0233, 0.0158, 0.0225], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 12:35:56,745 INFO [train.py:873] (2/4) Epoch 5, batch 2900, loss[loss=0.1908, simple_loss=0.2014, pruned_loss=0.09013, over 14649.00 frames. ], tot_loss[loss=0.1811, simple_loss=0.1928, pruned_loss=0.0847, over 1911167.99 frames. ], batch size: 33, lr: 1.57e-02, grad_scale: 4.0 +2022-12-07 12:36:02,013 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33155.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:36:56,600 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33216.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:37:02,203 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.058e+02 2.752e+02 3.414e+02 4.319e+02 7.862e+02, threshold=6.827e+02, percent-clipped=8.0 +2022-12-07 12:37:04,209 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8232, 1.6017, 2.0556, 1.7237, 2.0360, 1.3959, 1.7842, 1.6578], + device='cuda:2'), covar=tensor([0.0797, 0.1784, 0.0132, 0.1045, 0.0242, 0.0879, 0.0575, 0.0278], + device='cuda:2'), in_proj_covar=tensor([0.0220, 0.0268, 0.0167, 0.0352, 0.0172, 0.0271, 0.0244, 0.0173], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:37:07,683 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5372, 1.7114, 4.3167, 1.5887, 4.1053, 4.3425, 3.9326, 4.8609], + device='cuda:2'), covar=tensor([0.0140, 0.2437, 0.0249, 0.2264, 0.0253, 0.0224, 0.0282, 0.0090], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0155, 0.0131, 0.0167, 0.0145, 0.0140, 0.0116, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:37:08,589 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9174, 1.4349, 1.9397, 2.1151, 1.3630, 1.6885, 2.0531, 1.8580], + device='cuda:2'), covar=tensor([0.0035, 0.0088, 0.0034, 0.0025, 0.0069, 0.0087, 0.0029, 0.0034], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0192, 0.0276, 0.0220, 0.0174, 0.0235, 0.0160, 0.0226], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 12:37:09,338 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2004, 2.8797, 3.0191, 3.1807, 3.1363, 3.1888, 3.2689, 2.7250], + device='cuda:2'), covar=tensor([0.0435, 0.1175, 0.0434, 0.0504, 0.0635, 0.0408, 0.0611, 0.0579], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0208, 0.0141, 0.0134, 0.0136, 0.0112, 0.0207, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 12:37:26,085 INFO [train.py:873] (2/4) Epoch 5, batch 3000, loss[loss=0.219, simple_loss=0.2076, pruned_loss=0.1152, over 9500.00 frames. ], tot_loss[loss=0.1826, simple_loss=0.1938, pruned_loss=0.08569, over 1938517.81 frames. ], batch size: 100, lr: 1.56e-02, grad_scale: 4.0 +2022-12-07 12:37:26,085 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 12:37:38,398 INFO [train.py:905] (2/4) Epoch 5, validation: loss=0.1238, simple_loss=0.167, pruned_loss=0.04033, over 857387.00 frames. +2022-12-07 12:37:38,399 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 12:38:24,913 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.59 vs. limit=5.0 +2022-12-07 12:38:44,282 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=33322.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:38:44,934 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.389e+02 2.483e+02 3.391e+02 4.332e+02 8.491e+02, threshold=6.783e+02, percent-clipped=1.0 +2022-12-07 12:39:08,696 INFO [train.py:873] (2/4) Epoch 5, batch 3100, loss[loss=0.1471, simple_loss=0.1754, pruned_loss=0.05942, over 14660.00 frames. ], tot_loss[loss=0.1819, simple_loss=0.1935, pruned_loss=0.0852, over 1965952.90 frames. ], batch size: 22, lr: 1.56e-02, grad_scale: 4.0 +2022-12-07 12:40:15,024 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.322e+02 2.236e+02 3.030e+02 3.906e+02 6.925e+02, threshold=6.060e+02, percent-clipped=1.0 +2022-12-07 12:40:17,240 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=33425.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:40:38,100 INFO [train.py:873] (2/4) Epoch 5, batch 3200, loss[loss=0.2105, simple_loss=0.2163, pruned_loss=0.1023, over 14405.00 frames. ], tot_loss[loss=0.1811, simple_loss=0.193, pruned_loss=0.08464, over 1947842.04 frames. ], batch size: 53, lr: 1.56e-02, grad_scale: 8.0 +2022-12-07 12:40:59,506 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33472.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:41:27,003 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.6005, 4.9634, 5.0014, 5.4526, 5.1090, 4.7010, 5.4602, 4.3958], + device='cuda:2'), covar=tensor([0.0218, 0.1021, 0.0246, 0.0365, 0.0651, 0.0328, 0.0441, 0.0471], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0212, 0.0143, 0.0136, 0.0138, 0.0112, 0.0211, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 12:41:30,947 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8384, 1.0030, 1.0476, 0.7323, 0.8252, 0.6043, 0.5644, 0.7362], + device='cuda:2'), covar=tensor([0.0221, 0.0376, 0.0272, 0.0162, 0.0411, 0.0566, 0.0400, 0.0546], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0011, 0.0010, 0.0012, 0.0016, 0.0011, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.6808e-05, 5.7424e-05, 5.8579e-05, 5.2862e-05, 6.0946e-05, 7.9403e-05, + 6.2332e-05, 8.0626e-05], device='cuda:2') +2022-12-07 12:41:34,532 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=33511.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:41:42,399 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-12-07 12:41:44,536 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.346e+02 2.667e+02 3.351e+02 4.253e+02 1.055e+03, threshold=6.703e+02, percent-clipped=3.0 +2022-12-07 12:41:45,607 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7878, 2.0080, 2.7460, 2.7481, 2.7747, 2.0806, 2.7901, 2.2791], + device='cuda:2'), covar=tensor([0.0095, 0.0259, 0.0165, 0.0120, 0.0097, 0.0365, 0.0078, 0.0285], + device='cuda:2'), in_proj_covar=tensor([0.0183, 0.0193, 0.0278, 0.0223, 0.0179, 0.0240, 0.0163, 0.0231], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 12:41:53,263 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33533.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:42:08,400 INFO [train.py:873] (2/4) Epoch 5, batch 3300, loss[loss=0.1884, simple_loss=0.1902, pruned_loss=0.09329, over 13977.00 frames. ], tot_loss[loss=0.1812, simple_loss=0.1932, pruned_loss=0.08461, over 1942656.71 frames. ], batch size: 19, lr: 1.56e-02, grad_scale: 8.0 +2022-12-07 12:42:09,132 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-12-07 12:42:21,693 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=11.15 vs. limit=5.0 +2022-12-07 12:42:27,026 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-12-07 12:42:48,551 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6603, 1.2827, 1.1275, 1.3935, 1.3602, 1.1497, 1.4558, 0.8585], + device='cuda:2'), covar=tensor([0.0344, 0.1047, 0.0808, 0.0495, 0.0680, 0.0322, 0.0269, 0.1185], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0011, 0.0010, 0.0012, 0.0016, 0.0011, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.6511e-05, 5.8020e-05, 5.8101e-05, 5.2603e-05, 6.0455e-05, 8.0417e-05, + 6.2184e-05, 7.9904e-05], device='cuda:2') +2022-12-07 12:43:13,045 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=33622.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:43:13,762 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.043e+02 2.484e+02 3.236e+02 4.553e+02 1.226e+03, threshold=6.473e+02, percent-clipped=4.0 +2022-12-07 12:43:37,543 INFO [train.py:873] (2/4) Epoch 5, batch 3400, loss[loss=0.1785, simple_loss=0.1821, pruned_loss=0.08742, over 6937.00 frames. ], tot_loss[loss=0.1803, simple_loss=0.1923, pruned_loss=0.08416, over 1927460.47 frames. ], batch size: 100, lr: 1.56e-02, grad_scale: 8.0 +2022-12-07 12:43:57,497 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=33670.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:44:03,515 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33677.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:44:43,756 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.328e+02 2.478e+02 3.354e+02 4.337e+02 1.168e+03, threshold=6.709e+02, percent-clipped=6.0 +2022-12-07 12:44:45,637 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=33725.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:44:56,910 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=33738.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 12:45:07,677 INFO [train.py:873] (2/4) Epoch 5, batch 3500, loss[loss=0.1886, simple_loss=0.1641, pruned_loss=0.1065, over 2674.00 frames. ], tot_loss[loss=0.1791, simple_loss=0.1916, pruned_loss=0.08333, over 1863189.69 frames. ], batch size: 100, lr: 1.55e-02, grad_scale: 8.0 +2022-12-07 12:45:12,188 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8678, 2.5636, 2.5939, 2.8442, 2.8150, 2.8544, 2.9198, 2.4235], + device='cuda:2'), covar=tensor([0.0547, 0.1209, 0.0520, 0.0543, 0.0746, 0.0419, 0.0657, 0.0692], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0216, 0.0142, 0.0138, 0.0140, 0.0113, 0.0210, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 12:45:28,361 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=33773.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:46:02,467 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=33811.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:46:13,097 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.895e+01 2.306e+02 2.839e+02 3.644e+02 6.341e+02, threshold=5.678e+02, percent-clipped=0.0 +2022-12-07 12:46:18,217 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=33828.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:46:30,340 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.29 vs. limit=2.0 +2022-12-07 12:46:33,446 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8480, 0.9608, 0.9753, 1.3669, 1.2226, 0.7922, 1.4580, 0.8345], + device='cuda:2'), covar=tensor([0.1035, 0.1305, 0.0928, 0.0704, 0.1121, 0.0398, 0.0230, 0.1190], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0011, 0.0010, 0.0012, 0.0015, 0.0011, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.7096e-05, 5.7866e-05, 5.7712e-05, 5.3120e-05, 5.8672e-05, 7.8711e-05, + 6.2148e-05, 7.9204e-05], device='cuda:2') +2022-12-07 12:46:36,550 INFO [train.py:873] (2/4) Epoch 5, batch 3600, loss[loss=0.1758, simple_loss=0.192, pruned_loss=0.07977, over 14458.00 frames. ], tot_loss[loss=0.1784, simple_loss=0.1915, pruned_loss=0.08271, over 1916841.12 frames. ], batch size: 51, lr: 1.55e-02, grad_scale: 8.0 +2022-12-07 12:46:45,892 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=33859.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:47:03,083 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.80 vs. limit=2.0 +2022-12-07 12:47:14,189 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1270, 2.5957, 3.7766, 3.1083, 3.7218, 3.7914, 3.5395, 3.1737], + device='cuda:2'), covar=tensor([0.0233, 0.2165, 0.0550, 0.1165, 0.0532, 0.0422, 0.1199, 0.1348], + device='cuda:2'), in_proj_covar=tensor([0.0264, 0.0336, 0.0342, 0.0314, 0.0339, 0.0272, 0.0320, 0.0351], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 12:47:44,859 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.078e+02 2.977e+02 3.494e+02 4.554e+02 8.959e+02, threshold=6.988e+02, percent-clipped=9.0 +2022-12-07 12:47:59,699 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=33939.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:48:09,172 INFO [train.py:873] (2/4) Epoch 5, batch 3700, loss[loss=0.2035, simple_loss=0.2083, pruned_loss=0.09936, over 14280.00 frames. ], tot_loss[loss=0.1799, simple_loss=0.1923, pruned_loss=0.08375, over 1955863.42 frames. ], batch size: 66, lr: 1.55e-02, grad_scale: 8.0 +2022-12-07 12:48:54,428 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34000.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:49:03,856 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9958, 1.9276, 1.8104, 2.0438, 1.6877, 1.8257, 2.0116, 2.0678], + device='cuda:2'), covar=tensor([0.0967, 0.0913, 0.1163, 0.0891, 0.1291, 0.0866, 0.1010, 0.0836], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0095, 0.0111, 0.0110, 0.0116, 0.0087, 0.0125, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:49:15,044 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.339e+02 2.468e+02 3.019e+02 3.918e+02 8.852e+02, threshold=6.038e+02, percent-clipped=4.0 +2022-12-07 12:49:15,451 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-12-07 12:49:22,713 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-12-07 12:49:24,241 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34033.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 12:49:34,029 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34044.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:49:38,093 INFO [train.py:873] (2/4) Epoch 5, batch 3800, loss[loss=0.1668, simple_loss=0.1872, pruned_loss=0.0732, over 14222.00 frames. ], tot_loss[loss=0.1811, simple_loss=0.193, pruned_loss=0.08463, over 1926346.34 frames. ], batch size: 94, lr: 1.55e-02, grad_scale: 8.0 +2022-12-07 12:50:11,733 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4340, 4.0484, 3.9844, 4.3283, 4.1419, 3.9152, 4.4023, 3.8335], + device='cuda:2'), covar=tensor([0.0293, 0.0832, 0.0286, 0.0429, 0.0673, 0.0641, 0.0453, 0.0440], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0215, 0.0141, 0.0136, 0.0142, 0.0112, 0.0211, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 12:50:29,156 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34105.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:50:33,674 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-12-07 12:50:45,937 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.381e+02 2.542e+02 3.059e+02 3.773e+02 1.009e+03, threshold=6.118e+02, percent-clipped=4.0 +2022-12-07 12:50:50,731 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34128.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:51:05,952 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34145.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:51:09,663 INFO [train.py:873] (2/4) Epoch 5, batch 3900, loss[loss=0.1716, simple_loss=0.1513, pruned_loss=0.09594, over 1275.00 frames. ], tot_loss[loss=0.1787, simple_loss=0.1914, pruned_loss=0.08302, over 1947062.63 frames. ], batch size: 100, lr: 1.54e-02, grad_scale: 8.0 +2022-12-07 12:51:13,502 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1298, 1.9867, 1.7921, 1.7329, 2.0122, 1.9338, 2.0734, 1.9992], + device='cuda:2'), covar=tensor([0.0773, 0.1122, 0.2173, 0.2937, 0.0966, 0.1009, 0.1544, 0.1154], + device='cuda:2'), in_proj_covar=tensor([0.0275, 0.0230, 0.0346, 0.0427, 0.0253, 0.0303, 0.0320, 0.0265], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:51:23,085 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7842, 1.4776, 2.8316, 2.6525, 2.8163, 2.8441, 2.0981, 2.8680], + device='cuda:2'), covar=tensor([0.0967, 0.1042, 0.0095, 0.0239, 0.0182, 0.0094, 0.0346, 0.0110], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0157, 0.0100, 0.0139, 0.0116, 0.0120, 0.0091, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:51:33,807 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34176.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:52:02,410 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34206.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:52:15,641 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-12-07 12:52:17,891 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.643e+02 2.452e+02 3.217e+02 4.044e+02 1.041e+03, threshold=6.433e+02, percent-clipped=5.0 +2022-12-07 12:52:26,771 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.05 vs. limit=5.0 +2022-12-07 12:52:41,188 INFO [train.py:873] (2/4) Epoch 5, batch 4000, loss[loss=0.1864, simple_loss=0.1964, pruned_loss=0.08819, over 11989.00 frames. ], tot_loss[loss=0.1782, simple_loss=0.1913, pruned_loss=0.08256, over 1963622.07 frames. ], batch size: 100, lr: 1.54e-02, grad_scale: 8.0 +2022-12-07 12:53:22,776 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34295.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:53:48,677 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.550e+02 2.578e+02 3.207e+02 3.995e+02 8.945e+02, threshold=6.415e+02, percent-clipped=4.0 +2022-12-07 12:53:57,396 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34333.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:54:12,176 INFO [train.py:873] (2/4) Epoch 5, batch 4100, loss[loss=0.1851, simple_loss=0.1892, pruned_loss=0.09048, over 14394.00 frames. ], tot_loss[loss=0.178, simple_loss=0.1912, pruned_loss=0.08235, over 1996553.55 frames. ], batch size: 53, lr: 1.54e-02, grad_scale: 8.0 +2022-12-07 12:54:17,251 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-12-07 12:54:40,237 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34381.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:54:45,774 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8277, 0.4657, 0.6777, 0.6620, 0.7721, 0.1465, 0.6488, 0.7403], + device='cuda:2'), covar=tensor([0.0117, 0.0393, 0.0167, 0.0121, 0.0112, 0.0104, 0.0576, 0.0224], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0019, 0.0019, 0.0020, 0.0026, 0.0020, 0.0019], + device='cuda:2'), out_proj_covar=tensor([7.2950e-05, 7.6771e-05, 6.9110e-05, 7.3399e-05, 7.7727e-05, 9.6668e-05, + 8.1295e-05, 7.4842e-05], device='cuda:2') +2022-12-07 12:54:49,218 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-07 12:54:57,962 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34400.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:55:10,343 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6503, 2.1444, 3.8049, 2.4562, 3.5816, 1.8893, 2.7097, 3.4644], + device='cuda:2'), covar=tensor([0.0405, 0.4125, 0.0319, 0.8238, 0.0238, 0.3688, 0.1378, 0.0246], + device='cuda:2'), in_proj_covar=tensor([0.0220, 0.0261, 0.0163, 0.0346, 0.0172, 0.0261, 0.0248, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:55:18,131 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.386e+02 2.334e+02 3.145e+02 3.965e+02 8.915e+02, threshold=6.291e+02, percent-clipped=4.0 +2022-12-07 12:55:39,245 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8538, 0.5938, 0.6596, 0.6194, 0.7743, 0.0620, 0.5619, 0.7533], + device='cuda:2'), covar=tensor([0.0088, 0.0256, 0.0134, 0.0075, 0.0077, 0.0057, 0.0497, 0.0141], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0018, 0.0019, 0.0020, 0.0026, 0.0020, 0.0019], + device='cuda:2'), out_proj_covar=tensor([7.2473e-05, 7.6657e-05, 6.9047e-05, 7.3593e-05, 7.7623e-05, 9.7228e-05, + 8.1428e-05, 7.4936e-05], device='cuda:2') +2022-12-07 12:55:41,759 INFO [train.py:873] (2/4) Epoch 5, batch 4200, loss[loss=0.1721, simple_loss=0.1825, pruned_loss=0.08087, over 9503.00 frames. ], tot_loss[loss=0.1781, simple_loss=0.1914, pruned_loss=0.08242, over 1962449.29 frames. ], batch size: 100, lr: 1.54e-02, grad_scale: 8.0 +2022-12-07 12:55:42,749 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1087, 1.9897, 1.7191, 1.7147, 2.0613, 1.9875, 2.0753, 2.0173], + device='cuda:2'), covar=tensor([0.0812, 0.0900, 0.2131, 0.2891, 0.0897, 0.0953, 0.1327, 0.1017], + device='cuda:2'), in_proj_covar=tensor([0.0274, 0.0229, 0.0346, 0.0428, 0.0251, 0.0302, 0.0319, 0.0264], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 12:55:53,187 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6265, 2.5896, 3.2495, 2.2071, 2.1566, 2.7948, 1.3260, 2.7735], + device='cuda:2'), covar=tensor([0.1590, 0.1435, 0.0970, 0.2831, 0.2896, 0.1212, 0.6890, 0.1166], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0078, 0.0074, 0.0083, 0.0105, 0.0065, 0.0139, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 12:56:00,910 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8094, 2.4408, 3.6048, 2.7490, 3.6752, 3.5254, 3.4654, 2.9167], + device='cuda:2'), covar=tensor([0.0255, 0.2555, 0.0607, 0.1582, 0.0562, 0.0493, 0.0855, 0.1823], + device='cuda:2'), in_proj_covar=tensor([0.0262, 0.0329, 0.0340, 0.0304, 0.0331, 0.0270, 0.0313, 0.0348], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 12:56:04,719 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5395, 2.3561, 1.7497, 2.5236, 2.3453, 2.4145, 2.1887, 1.9554], + device='cuda:2'), covar=tensor([0.0337, 0.0712, 0.2117, 0.0213, 0.0927, 0.0348, 0.0919, 0.1640], + device='cuda:2'), in_proj_covar=tensor([0.0223, 0.0299, 0.0297, 0.0183, 0.0241, 0.0243, 0.0253, 0.0293], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:56:26,892 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9670, 2.4252, 3.5830, 2.7147, 3.6008, 3.6604, 3.4778, 2.8946], + device='cuda:2'), covar=tensor([0.0285, 0.2802, 0.0703, 0.1942, 0.0760, 0.0541, 0.1303, 0.2318], + device='cuda:2'), in_proj_covar=tensor([0.0264, 0.0333, 0.0344, 0.0309, 0.0334, 0.0272, 0.0319, 0.0351], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 12:56:28,622 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34501.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:56:48,745 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.089e+02 2.566e+02 3.165e+02 3.922e+02 1.587e+03, threshold=6.330e+02, percent-clipped=5.0 +2022-12-07 12:57:11,681 INFO [train.py:873] (2/4) Epoch 5, batch 4300, loss[loss=0.1736, simple_loss=0.1512, pruned_loss=0.09798, over 1295.00 frames. ], tot_loss[loss=0.1781, simple_loss=0.1918, pruned_loss=0.08224, over 2012906.14 frames. ], batch size: 100, lr: 1.54e-02, grad_scale: 8.0 +2022-12-07 12:57:29,936 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8516, 1.3185, 3.7655, 3.6511, 3.6657, 3.8261, 3.3141, 3.8443], + device='cuda:2'), covar=tensor([0.1314, 0.1597, 0.0161, 0.0175, 0.0198, 0.0158, 0.0181, 0.0150], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0154, 0.0099, 0.0138, 0.0116, 0.0120, 0.0091, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 12:57:39,609 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2588, 4.0389, 4.4191, 3.7485, 4.2262, 4.3896, 1.5693, 3.9546], + device='cuda:2'), covar=tensor([0.0145, 0.0279, 0.0294, 0.0477, 0.0233, 0.0191, 0.2862, 0.0215], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0135, 0.0121, 0.0106, 0.0162, 0.0114, 0.0147, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 12:57:51,388 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34593.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:57:53,696 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34595.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:58:17,761 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34622.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:58:18,609 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.121e+02 2.611e+02 3.299e+02 4.173e+02 8.760e+02, threshold=6.598e+02, percent-clipped=4.0 +2022-12-07 12:58:21,415 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34626.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:58:36,745 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34643.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:58:41,842 INFO [train.py:873] (2/4) Epoch 5, batch 4400, loss[loss=0.2213, simple_loss=0.2198, pruned_loss=0.1115, over 8575.00 frames. ], tot_loss[loss=0.1783, simple_loss=0.1917, pruned_loss=0.08241, over 1996168.26 frames. ], batch size: 100, lr: 1.53e-02, grad_scale: 8.0 +2022-12-07 12:58:46,349 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34654.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:59:05,612 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.99 vs. limit=2.0 +2022-12-07 12:59:12,452 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34683.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 12:59:16,010 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34687.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:59:27,195 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34700.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 12:59:36,227 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=34710.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 12:59:46,459 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7310, 1.7835, 1.9927, 1.4572, 1.3881, 1.7828, 1.0753, 1.6139], + device='cuda:2'), covar=tensor([0.0691, 0.1433, 0.0494, 0.1325, 0.1851, 0.0688, 0.3071, 0.0653], + device='cuda:2'), in_proj_covar=tensor([0.0070, 0.0080, 0.0077, 0.0084, 0.0108, 0.0065, 0.0141, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 12:59:48,179 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.105e+02 2.387e+02 3.073e+02 3.868e+02 1.059e+03, threshold=6.147e+02, percent-clipped=1.0 +2022-12-07 13:00:10,393 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34748.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:00:11,139 INFO [train.py:873] (2/4) Epoch 5, batch 4500, loss[loss=0.2118, simple_loss=0.2038, pruned_loss=0.1099, over 7771.00 frames. ], tot_loss[loss=0.1776, simple_loss=0.1912, pruned_loss=0.08201, over 1972073.63 frames. ], batch size: 100, lr: 1.53e-02, grad_scale: 8.0 +2022-12-07 13:00:15,114 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9605, 1.6063, 3.8997, 3.7356, 3.7602, 3.9513, 3.3189, 3.9938], + device='cuda:2'), covar=tensor([0.1171, 0.1216, 0.0084, 0.0148, 0.0134, 0.0096, 0.0218, 0.0084], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0156, 0.0101, 0.0142, 0.0118, 0.0121, 0.0094, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 13:00:31,219 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=34771.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 13:00:36,555 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.81 vs. limit=2.0 +2022-12-07 13:00:57,952 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=34801.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:01:16,933 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.242e+02 2.392e+02 3.325e+02 4.104e+02 6.523e+02, threshold=6.650e+02, percent-clipped=2.0 +2022-12-07 13:01:40,482 INFO [train.py:873] (2/4) Epoch 5, batch 4600, loss[loss=0.1613, simple_loss=0.1715, pruned_loss=0.07557, over 4986.00 frames. ], tot_loss[loss=0.1801, simple_loss=0.1928, pruned_loss=0.08365, over 1964737.47 frames. ], batch size: 100, lr: 1.53e-02, grad_scale: 8.0 +2022-12-07 13:01:40,573 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=34849.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:02:46,770 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.280e+02 2.771e+02 3.403e+02 4.151e+02 6.915e+02, threshold=6.807e+02, percent-clipped=1.0 +2022-12-07 13:02:50,776 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-12-07 13:03:10,244 INFO [train.py:873] (2/4) Epoch 5, batch 4700, loss[loss=0.1653, simple_loss=0.192, pruned_loss=0.06934, over 13994.00 frames. ], tot_loss[loss=0.1791, simple_loss=0.1923, pruned_loss=0.08293, over 2010380.36 frames. ], batch size: 22, lr: 1.53e-02, grad_scale: 8.0 +2022-12-07 13:03:10,336 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34949.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:03:35,677 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4384, 5.2507, 4.8249, 5.3805, 5.1449, 5.1446, 5.4736, 5.3857], + device='cuda:2'), covar=tensor([0.0752, 0.0462, 0.0574, 0.0728, 0.0537, 0.0327, 0.0556, 0.0739], + device='cuda:2'), in_proj_covar=tensor([0.0112, 0.0099, 0.0113, 0.0112, 0.0118, 0.0090, 0.0128, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 13:03:35,712 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34978.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:03:39,285 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=34982.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:04:11,215 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9274, 3.1420, 4.0190, 2.7117, 2.5877, 3.1176, 1.4508, 2.9232], + device='cuda:2'), covar=tensor([0.1557, 0.0807, 0.0411, 0.1818, 0.1792, 0.0986, 0.4917, 0.1352], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0078, 0.0075, 0.0082, 0.0104, 0.0063, 0.0134, 0.0071], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 13:04:21,150 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.362e+02 2.441e+02 3.432e+02 4.388e+02 1.189e+03, threshold=6.863e+02, percent-clipped=8.0 +2022-12-07 13:04:45,222 INFO [train.py:873] (2/4) Epoch 5, batch 4800, loss[loss=0.2054, simple_loss=0.199, pruned_loss=0.1058, over 5998.00 frames. ], tot_loss[loss=0.1795, simple_loss=0.1919, pruned_loss=0.08353, over 2004267.29 frames. ], batch size: 100, lr: 1.52e-02, grad_scale: 8.0 +2022-12-07 13:04:48,445 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.28 vs. limit=2.0 +2022-12-07 13:05:01,163 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35066.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 13:05:14,811 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4065, 1.8232, 3.5460, 2.2571, 3.3765, 1.6486, 2.6328, 3.2820], + device='cuda:2'), covar=tensor([0.0848, 0.5353, 0.0725, 0.9029, 0.0414, 0.5223, 0.1412, 0.0444], + device='cuda:2'), in_proj_covar=tensor([0.0218, 0.0259, 0.0167, 0.0353, 0.0174, 0.0265, 0.0239, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 13:05:18,511 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.33 vs. limit=2.0 +2022-12-07 13:05:21,748 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0158, 1.5570, 1.3014, 1.6372, 1.2366, 1.4753, 1.4124, 0.8967], + device='cuda:2'), covar=tensor([0.1076, 0.1215, 0.0787, 0.0761, 0.2527, 0.0380, 0.1079, 0.1819], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0011, 0.0010, 0.0012, 0.0015, 0.0012, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.8884e-05, 6.0099e-05, 5.9436e-05, 5.4777e-05, 6.3623e-05, 8.2066e-05, + 6.7570e-05, 8.2659e-05], device='cuda:2') +2022-12-07 13:05:24,502 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35092.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:05:53,124 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.345e+02 2.350e+02 3.192e+02 4.148e+02 1.074e+03, threshold=6.385e+02, percent-clipped=3.0 +2022-12-07 13:06:16,620 INFO [train.py:873] (2/4) Epoch 5, batch 4900, loss[loss=0.1648, simple_loss=0.1972, pruned_loss=0.06617, over 14289.00 frames. ], tot_loss[loss=0.1793, simple_loss=0.1921, pruned_loss=0.08322, over 1991490.94 frames. ], batch size: 25, lr: 1.52e-02, grad_scale: 16.0 +2022-12-07 13:06:20,570 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3129, 3.2111, 3.2282, 3.0049, 2.4190, 3.6175, 3.1075, 1.3144], + device='cuda:2'), covar=tensor([0.3816, 0.1287, 0.1405, 0.1150, 0.1087, 0.0374, 0.1683, 0.4164], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0060, 0.0052, 0.0050, 0.0070, 0.0054, 0.0081, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:06:20,610 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35153.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:06:24,154 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0805, 2.0131, 1.7426, 1.7316, 2.0705, 1.9427, 2.0499, 2.0434], + device='cuda:2'), covar=tensor([0.0954, 0.0693, 0.1924, 0.2594, 0.0813, 0.0934, 0.1162, 0.0987], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0222, 0.0349, 0.0432, 0.0253, 0.0310, 0.0325, 0.0270], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 13:06:26,860 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9445, 1.3586, 1.1377, 1.5658, 1.5072, 1.0748, 0.8924, 0.6279], + device='cuda:2'), covar=tensor([0.0692, 0.1422, 0.1008, 0.0743, 0.0651, 0.0359, 0.0298, 0.1329], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0011, 0.0010, 0.0012, 0.0015, 0.0011, 0.0017], + device='cuda:2'), out_proj_covar=tensor([5.7710e-05, 5.9255e-05, 5.8280e-05, 5.4197e-05, 6.1981e-05, 8.1073e-05, + 6.5600e-05, 8.0979e-05], device='cuda:2') +2022-12-07 13:06:31,879 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.88 vs. limit=5.0 +2022-12-07 13:07:24,093 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.440e+01 2.537e+02 3.134e+02 4.134e+02 8.105e+02, threshold=6.269e+02, percent-clipped=2.0 +2022-12-07 13:07:47,536 INFO [train.py:873] (2/4) Epoch 5, batch 5000, loss[loss=0.1753, simple_loss=0.1943, pruned_loss=0.07814, over 14452.00 frames. ], tot_loss[loss=0.1802, simple_loss=0.1926, pruned_loss=0.08387, over 2010850.84 frames. ], batch size: 49, lr: 1.52e-02, grad_scale: 16.0 +2022-12-07 13:07:47,659 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35249.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:08:13,625 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35278.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 13:08:17,378 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35282.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:08:21,757 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2426, 2.3113, 2.3660, 2.3343, 1.9720, 2.4965, 2.1547, 1.1321], + device='cuda:2'), covar=tensor([0.2174, 0.0960, 0.0806, 0.0645, 0.0909, 0.0634, 0.1501, 0.3662], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0060, 0.0051, 0.0050, 0.0069, 0.0054, 0.0081, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:08:30,436 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35297.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:08:35,766 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0675, 4.5250, 4.5335, 5.0045, 4.6751, 4.2799, 4.9696, 4.1573], + device='cuda:2'), covar=tensor([0.0355, 0.1023, 0.0316, 0.0434, 0.0834, 0.0478, 0.0580, 0.0607], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0219, 0.0142, 0.0139, 0.0143, 0.0111, 0.0211, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 13:08:53,565 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.157e+02 2.768e+02 3.459e+02 4.682e+02 1.378e+03, threshold=6.919e+02, percent-clipped=12.0 +2022-12-07 13:08:56,552 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35326.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:09:00,117 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35330.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:09:13,387 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35345.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:09:17,078 INFO [train.py:873] (2/4) Epoch 5, batch 5100, loss[loss=0.1609, simple_loss=0.1835, pruned_loss=0.06917, over 14259.00 frames. ], tot_loss[loss=0.1785, simple_loss=0.1915, pruned_loss=0.08274, over 2066803.52 frames. ], batch size: 76, lr: 1.52e-02, grad_scale: 16.0 +2022-12-07 13:09:32,865 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35366.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:09:39,947 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35374.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:09:47,111 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.7027, 4.9506, 5.1773, 5.5662, 5.3460, 4.6050, 5.5423, 4.6494], + device='cuda:2'), covar=tensor([0.0264, 0.1019, 0.0254, 0.0426, 0.0562, 0.0354, 0.0448, 0.0451], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0213, 0.0140, 0.0136, 0.0141, 0.0110, 0.0205, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 13:10:04,664 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.77 vs. limit=2.0 +2022-12-07 13:10:08,801 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35406.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:10:15,183 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35414.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:10:23,148 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.147e+02 2.613e+02 3.282e+02 4.367e+02 7.735e+02, threshold=6.564e+02, percent-clipped=1.0 +2022-12-07 13:10:34,103 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35435.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:10:44,971 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35448.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:10:45,710 INFO [train.py:873] (2/4) Epoch 5, batch 5200, loss[loss=0.1746, simple_loss=0.193, pruned_loss=0.07815, over 14616.00 frames. ], tot_loss[loss=0.1785, simple_loss=0.1916, pruned_loss=0.08269, over 2053243.66 frames. ], batch size: 33, lr: 1.52e-02, grad_scale: 8.0 +2022-12-07 13:11:00,225 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4125, 3.1914, 3.3506, 2.8021, 2.5784, 3.2682, 3.0940, 1.5438], + device='cuda:2'), covar=tensor([0.3820, 0.1158, 0.1070, 0.0980, 0.0941, 0.1193, 0.1546, 0.3830], + device='cuda:2'), in_proj_covar=tensor([0.0157, 0.0061, 0.0052, 0.0051, 0.0071, 0.0056, 0.0083, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:11:51,440 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.407e+02 2.356e+02 3.494e+02 4.184e+02 1.202e+03, threshold=6.988e+02, percent-clipped=4.0 +2022-12-07 13:11:57,327 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35530.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:12:13,730 INFO [train.py:873] (2/4) Epoch 5, batch 5300, loss[loss=0.1547, simple_loss=0.1696, pruned_loss=0.06983, over 13912.00 frames. ], tot_loss[loss=0.178, simple_loss=0.1912, pruned_loss=0.08239, over 2083867.41 frames. ], batch size: 20, lr: 1.51e-02, grad_scale: 8.0 +2022-12-07 13:12:51,649 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35591.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:12:55,870 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.40 vs. limit=5.0 +2022-12-07 13:13:07,604 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-07 13:13:20,926 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.526e+02 2.787e+02 3.586e+02 4.482e+02 8.195e+02, threshold=7.172e+02, percent-clipped=2.0 +2022-12-07 13:13:43,968 INFO [train.py:873] (2/4) Epoch 5, batch 5400, loss[loss=0.1906, simple_loss=0.2103, pruned_loss=0.08545, over 14192.00 frames. ], tot_loss[loss=0.1776, simple_loss=0.1916, pruned_loss=0.08181, over 2116898.30 frames. ], batch size: 35, lr: 1.51e-02, grad_scale: 8.0 +2022-12-07 13:13:47,701 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5298, 4.3400, 4.0581, 4.0705, 4.2254, 4.3302, 4.4991, 4.5254], + device='cuda:2'), covar=tensor([0.0775, 0.0488, 0.1723, 0.2619, 0.0774, 0.0679, 0.1051, 0.0838], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0223, 0.0349, 0.0432, 0.0257, 0.0308, 0.0328, 0.0268], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 13:14:02,937 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3689, 0.9403, 1.1616, 1.0718, 1.1322, 0.6593, 1.1881, 1.6039], + device='cuda:2'), covar=tensor([0.0922, 0.0663, 0.0356, 0.1322, 0.0966, 0.0401, 0.0934, 0.0472], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0018, 0.0018, 0.0018, 0.0019, 0.0024, 0.0019, 0.0019], + device='cuda:2'), out_proj_covar=tensor([7.1322e-05, 7.4005e-05, 6.9151e-05, 7.3623e-05, 7.5888e-05, 9.0533e-05, + 7.9270e-05, 7.5649e-05], device='cuda:2') +2022-12-07 13:14:30,641 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35701.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 13:14:51,484 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.281e+02 2.811e+02 3.690e+02 4.403e+02 1.133e+03, threshold=7.380e+02, percent-clipped=4.0 +2022-12-07 13:14:55,978 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9735, 1.4380, 3.4413, 3.2955, 3.3625, 3.4855, 2.8349, 3.4963], + device='cuda:2'), covar=tensor([0.1020, 0.1209, 0.0092, 0.0156, 0.0144, 0.0092, 0.0238, 0.0092], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0153, 0.0101, 0.0140, 0.0115, 0.0117, 0.0092, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 13:14:57,170 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35730.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:15:13,013 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=35748.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:15:13,722 INFO [train.py:873] (2/4) Epoch 5, batch 5500, loss[loss=0.1994, simple_loss=0.2086, pruned_loss=0.09508, over 14269.00 frames. ], tot_loss[loss=0.1769, simple_loss=0.1911, pruned_loss=0.0814, over 2084979.37 frames. ], batch size: 94, lr: 1.51e-02, grad_scale: 8.0 +2022-12-07 13:15:15,623 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0949, 1.6094, 1.6352, 1.5734, 1.5419, 1.7168, 1.1747, 1.0461], + device='cuda:2'), covar=tensor([0.2832, 0.0968, 0.0522, 0.0468, 0.0921, 0.0475, 0.1945, 0.2260], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0059, 0.0051, 0.0050, 0.0071, 0.0056, 0.0081, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:15:33,611 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35771.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:15:55,933 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=35796.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:16:09,293 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=35811.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:16:20,154 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.171e+02 2.649e+02 3.424e+02 4.152e+02 8.919e+02, threshold=6.848e+02, percent-clipped=1.0 +2022-12-07 13:16:28,194 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35832.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:16:36,286 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0096, 2.0775, 2.1511, 2.1712, 1.8197, 2.1782, 1.8474, 0.9110], + device='cuda:2'), covar=tensor([0.2577, 0.1002, 0.0687, 0.0503, 0.1102, 0.0604, 0.1435, 0.3916], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0059, 0.0052, 0.0050, 0.0071, 0.0057, 0.0081, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:16:39,882 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8374, 3.2593, 2.4172, 3.8745, 3.6039, 3.7232, 3.0289, 2.4868], + device='cuda:2'), covar=tensor([0.0465, 0.1305, 0.3726, 0.0341, 0.0699, 0.0947, 0.1235, 0.4036], + device='cuda:2'), in_proj_covar=tensor([0.0219, 0.0294, 0.0287, 0.0185, 0.0246, 0.0245, 0.0252, 0.0283], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 13:16:42,806 INFO [train.py:873] (2/4) Epoch 5, batch 5600, loss[loss=0.1594, simple_loss=0.1841, pruned_loss=0.06731, over 14307.00 frames. ], tot_loss[loss=0.1776, simple_loss=0.191, pruned_loss=0.08211, over 2070435.74 frames. ], batch size: 39, lr: 1.51e-02, grad_scale: 8.0 +2022-12-07 13:17:03,270 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=35872.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:17:07,863 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1141, 1.5337, 3.9533, 1.7002, 3.9548, 4.1598, 3.2493, 4.3303], + device='cuda:2'), covar=tensor([0.0146, 0.2460, 0.0244, 0.1898, 0.0283, 0.0216, 0.0406, 0.0116], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0158, 0.0135, 0.0167, 0.0151, 0.0147, 0.0120, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 13:17:15,336 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=35886.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:17:35,993 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9044, 2.0152, 2.0543, 2.0803, 1.7516, 2.1110, 1.8590, 0.9373], + device='cuda:2'), covar=tensor([0.2214, 0.0850, 0.0590, 0.0681, 0.1090, 0.0551, 0.1564, 0.3926], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0059, 0.0052, 0.0050, 0.0071, 0.0057, 0.0081, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:17:48,147 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.365e+02 2.385e+02 3.015e+02 3.800e+02 5.921e+02, threshold=6.029e+02, percent-clipped=0.0 +2022-12-07 13:17:57,961 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8330, 2.1238, 2.7483, 2.3745, 2.6104, 2.6164, 2.5947, 2.3522], + device='cuda:2'), covar=tensor([0.0367, 0.2250, 0.0518, 0.1279, 0.0318, 0.0670, 0.0768, 0.1582], + device='cuda:2'), in_proj_covar=tensor([0.0273, 0.0340, 0.0355, 0.0313, 0.0344, 0.0283, 0.0330, 0.0354], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:18:10,021 INFO [train.py:873] (2/4) Epoch 5, batch 5700, loss[loss=0.1866, simple_loss=0.1969, pruned_loss=0.0882, over 12745.00 frames. ], tot_loss[loss=0.1779, simple_loss=0.191, pruned_loss=0.0824, over 2051181.64 frames. ], batch size: 100, lr: 1.51e-02, grad_scale: 8.0 +2022-12-07 13:18:55,435 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36001.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 13:19:13,835 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-07 13:19:15,102 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.306e+02 2.574e+02 3.156e+02 4.047e+02 9.769e+02, threshold=6.311e+02, percent-clipped=9.0 +2022-12-07 13:19:20,456 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36030.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:19:37,072 INFO [train.py:873] (2/4) Epoch 5, batch 5800, loss[loss=0.2108, simple_loss=0.209, pruned_loss=0.1063, over 9503.00 frames. ], tot_loss[loss=0.1799, simple_loss=0.1924, pruned_loss=0.08368, over 2020990.80 frames. ], batch size: 100, lr: 1.50e-02, grad_scale: 8.0 +2022-12-07 13:19:37,127 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36049.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:19:45,356 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.13 vs. limit=2.0 +2022-12-07 13:19:47,044 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.14 vs. limit=2.0 +2022-12-07 13:19:51,375 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=9.03 vs. limit=5.0 +2022-12-07 13:20:02,756 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36078.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:20:42,662 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.345e+02 2.552e+02 3.407e+02 4.382e+02 1.034e+03, threshold=6.813e+02, percent-clipped=3.0 +2022-12-07 13:20:45,473 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36127.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:21:04,917 INFO [train.py:873] (2/4) Epoch 5, batch 5900, loss[loss=0.1813, simple_loss=0.1914, pruned_loss=0.08557, over 9494.00 frames. ], tot_loss[loss=0.1787, simple_loss=0.1914, pruned_loss=0.08303, over 1988204.89 frames. ], batch size: 100, lr: 1.50e-02, grad_scale: 8.0 +2022-12-07 13:21:18,883 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7210, 2.5514, 2.4915, 2.8542, 2.3478, 2.3263, 2.7695, 2.7979], + device='cuda:2'), covar=tensor([0.0813, 0.0934, 0.0831, 0.0640, 0.1086, 0.0758, 0.0921, 0.0691], + device='cuda:2'), in_proj_covar=tensor([0.0111, 0.0096, 0.0107, 0.0111, 0.0113, 0.0086, 0.0123, 0.0105], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 13:21:20,681 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36167.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:21:22,336 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6068, 1.1154, 2.0084, 2.0077, 2.0502, 2.1233, 1.4033, 2.0610], + device='cuda:2'), covar=tensor([0.0366, 0.0727, 0.0099, 0.0166, 0.0137, 0.0084, 0.0301, 0.0104], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0159, 0.0102, 0.0143, 0.0118, 0.0121, 0.0094, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 13:21:30,244 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8884, 0.8247, 0.6884, 0.9829, 0.9981, 0.4753, 0.7256, 0.8760], + device='cuda:2'), covar=tensor([0.0691, 0.0780, 0.0428, 0.0426, 0.0679, 0.0486, 0.1092, 0.0779], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0020, 0.0019, 0.0021, 0.0026, 0.0021, 0.0020], + device='cuda:2'), out_proj_covar=tensor([7.8629e-05, 7.9956e-05, 7.5222e-05, 7.7667e-05, 8.3431e-05, 9.8233e-05, + 8.7225e-05, 7.8456e-05], device='cuda:2') +2022-12-07 13:21:32,819 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36181.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:21:37,056 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36186.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:22:10,496 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.379e+02 2.756e+02 3.283e+02 4.209e+02 7.399e+02, threshold=6.567e+02, percent-clipped=1.0 +2022-12-07 13:22:20,119 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36234.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:22:27,264 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36242.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:22:33,229 INFO [train.py:873] (2/4) Epoch 5, batch 6000, loss[loss=0.1933, simple_loss=0.2056, pruned_loss=0.09049, over 14514.00 frames. ], tot_loss[loss=0.1771, simple_loss=0.1906, pruned_loss=0.08182, over 2002919.11 frames. ], batch size: 51, lr: 1.50e-02, grad_scale: 8.0 +2022-12-07 13:22:33,229 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 13:22:44,005 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3468, 1.0060, 0.7942, 1.1783, 0.9350, 0.7046, 0.9445, 1.3400], + device='cuda:2'), covar=tensor([0.1287, 0.1888, 0.1239, 0.0576, 0.1601, 0.0505, 0.0591, 0.0701], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0019, 0.0020, 0.0019, 0.0020, 0.0025, 0.0021, 0.0019], + device='cuda:2'), out_proj_covar=tensor([7.7899e-05, 7.9446e-05, 7.4921e-05, 7.7692e-05, 8.2345e-05, 9.7254e-05, + 8.5721e-05, 7.8056e-05], device='cuda:2') +2022-12-07 13:22:45,291 INFO [train.py:905] (2/4) Epoch 5, validation: loss=0.1225, simple_loss=0.1661, pruned_loss=0.03949, over 857387.00 frames. +2022-12-07 13:22:45,292 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 13:22:58,430 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-12-07 13:23:01,012 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-07 13:23:32,900 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=15.62 vs. limit=5.0 +2022-12-07 13:23:51,459 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.588e+02 2.659e+02 3.309e+02 3.881e+02 6.355e+02, threshold=6.617e+02, percent-clipped=0.0 +2022-12-07 13:24:00,177 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3620, 3.1926, 2.8431, 2.9725, 3.2855, 3.1887, 3.3459, 3.2852], + device='cuda:2'), covar=tensor([0.0811, 0.0631, 0.2110, 0.2892, 0.0795, 0.0913, 0.1070, 0.0939], + device='cuda:2'), in_proj_covar=tensor([0.0293, 0.0227, 0.0357, 0.0443, 0.0263, 0.0320, 0.0327, 0.0273], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 13:24:13,083 INFO [train.py:873] (2/4) Epoch 5, batch 6100, loss[loss=0.16, simple_loss=0.181, pruned_loss=0.0695, over 14027.00 frames. ], tot_loss[loss=0.1764, simple_loss=0.19, pruned_loss=0.08136, over 1974852.13 frames. ], batch size: 29, lr: 1.50e-02, grad_scale: 8.0 +2022-12-07 13:24:23,913 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2165, 3.4214, 4.1570, 2.9247, 2.2965, 3.2430, 1.6695, 3.1735], + device='cuda:2'), covar=tensor([0.1755, 0.0621, 0.0472, 0.1586, 0.2310, 0.1258, 0.4970, 0.1317], + device='cuda:2'), in_proj_covar=tensor([0.0069, 0.0077, 0.0073, 0.0083, 0.0106, 0.0066, 0.0134, 0.0071], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 13:25:19,184 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.263e+02 2.518e+02 3.418e+02 4.191e+02 1.416e+03, threshold=6.837e+02, percent-clipped=13.0 +2022-12-07 13:25:21,972 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36427.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:25:40,681 INFO [train.py:873] (2/4) Epoch 5, batch 6200, loss[loss=0.1814, simple_loss=0.2017, pruned_loss=0.08053, over 14023.00 frames. ], tot_loss[loss=0.177, simple_loss=0.1904, pruned_loss=0.08178, over 1924718.73 frames. ], batch size: 22, lr: 1.50e-02, grad_scale: 8.0 +2022-12-07 13:25:56,852 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36467.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:26:03,706 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36475.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:26:07,468 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4315, 3.1038, 4.2407, 2.9986, 4.1712, 4.1506, 3.9571, 3.5660], + device='cuda:2'), covar=tensor([0.0352, 0.2505, 0.0850, 0.1958, 0.0737, 0.0578, 0.1893, 0.1883], + device='cuda:2'), in_proj_covar=tensor([0.0269, 0.0334, 0.0355, 0.0307, 0.0341, 0.0280, 0.0331, 0.0352], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:26:14,781 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.50 vs. limit=2.0 +2022-12-07 13:26:38,819 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36515.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:26:46,427 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.239e+02 2.596e+02 3.351e+02 4.378e+02 1.450e+03, threshold=6.701e+02, percent-clipped=3.0 +2022-12-07 13:26:57,884 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36537.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:27:08,590 INFO [train.py:873] (2/4) Epoch 5, batch 6300, loss[loss=0.204, simple_loss=0.2061, pruned_loss=0.101, over 9426.00 frames. ], tot_loss[loss=0.1765, simple_loss=0.1902, pruned_loss=0.08145, over 1941159.19 frames. ], batch size: 100, lr: 1.49e-02, grad_scale: 8.0 +2022-12-07 13:27:37,010 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2667, 2.6912, 4.2309, 3.0524, 4.1632, 3.9215, 3.8889, 3.4862], + device='cuda:2'), covar=tensor([0.0245, 0.2977, 0.0610, 0.1614, 0.0608, 0.0665, 0.1457, 0.1970], + device='cuda:2'), in_proj_covar=tensor([0.0267, 0.0334, 0.0353, 0.0307, 0.0339, 0.0277, 0.0323, 0.0348], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:27:57,476 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36605.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:28:00,260 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36608.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:28:03,116 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1464, 4.8188, 4.4739, 4.6161, 4.6329, 4.9664, 5.1531, 5.0416], + device='cuda:2'), covar=tensor([0.0842, 0.0484, 0.1978, 0.2825, 0.0718, 0.0639, 0.0727, 0.0941], + device='cuda:2'), in_proj_covar=tensor([0.0292, 0.0224, 0.0358, 0.0456, 0.0262, 0.0322, 0.0329, 0.0274], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 13:28:04,018 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36612.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:28:14,423 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.237e+02 2.701e+02 3.163e+02 3.734e+02 7.022e+02, threshold=6.325e+02, percent-clipped=2.0 +2022-12-07 13:28:35,795 INFO [train.py:873] (2/4) Epoch 5, batch 6400, loss[loss=0.1715, simple_loss=0.1742, pruned_loss=0.0844, over 4936.00 frames. ], tot_loss[loss=0.1763, simple_loss=0.1903, pruned_loss=0.08122, over 1955690.84 frames. ], batch size: 100, lr: 1.49e-02, grad_scale: 8.0 +2022-12-07 13:28:43,769 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-12-07 13:28:51,083 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36666.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:28:53,753 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36669.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:28:57,066 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36673.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:29:20,716 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36700.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:29:41,187 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.651e+02 2.894e+02 3.789e+02 4.825e+02 7.842e+02, threshold=7.578e+02, percent-clipped=4.0 +2022-12-07 13:30:02,671 INFO [train.py:873] (2/4) Epoch 5, batch 6500, loss[loss=0.1564, simple_loss=0.1699, pruned_loss=0.07148, over 6955.00 frames. ], tot_loss[loss=0.1768, simple_loss=0.1902, pruned_loss=0.0817, over 1891271.97 frames. ], batch size: 100, lr: 1.49e-02, grad_scale: 8.0 +2022-12-07 13:30:13,301 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36761.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:30:50,702 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1595, 1.0947, 1.1369, 1.1099, 0.8829, 0.6801, 0.7409, 0.7048], + device='cuda:2'), covar=tensor([0.0286, 0.0344, 0.0426, 0.0164, 0.0493, 0.0329, 0.0270, 0.0601], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0012, 0.0011, 0.0010, 0.0012, 0.0015, 0.0012, 0.0017], + device='cuda:2'), out_proj_covar=tensor([6.1405e-05, 6.3889e-05, 6.0923e-05, 5.6259e-05, 6.5575e-05, 8.6462e-05, + 7.1398e-05, 8.4454e-05], device='cuda:2') +2022-12-07 13:30:51,518 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36805.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 13:31:07,675 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.168e+02 2.577e+02 3.368e+02 4.224e+02 7.324e+02, threshold=6.736e+02, percent-clipped=0.0 +2022-12-07 13:31:18,682 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=36837.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:31:28,830 INFO [train.py:873] (2/4) Epoch 5, batch 6600, loss[loss=0.1773, simple_loss=0.1953, pruned_loss=0.07963, over 14375.00 frames. ], tot_loss[loss=0.1768, simple_loss=0.1899, pruned_loss=0.08184, over 1940684.25 frames. ], batch size: 73, lr: 1.49e-02, grad_scale: 8.0 +2022-12-07 13:31:43,430 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1261, 2.9071, 3.9434, 3.0220, 3.8844, 3.8296, 3.6263, 3.3693], + device='cuda:2'), covar=tensor([0.0284, 0.2161, 0.0747, 0.1440, 0.0501, 0.0494, 0.1667, 0.1686], + device='cuda:2'), in_proj_covar=tensor([0.0275, 0.0338, 0.0362, 0.0317, 0.0344, 0.0286, 0.0339, 0.0361], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:31:44,154 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36866.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 13:31:58,151 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3714, 1.1680, 1.0857, 1.5145, 0.8833, 0.9353, 1.3668, 1.0271], + device='cuda:2'), covar=tensor([0.1329, 0.3478, 0.1659, 0.1273, 0.2423, 0.0494, 0.0531, 0.1452], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0011, 0.0010, 0.0012, 0.0015, 0.0012, 0.0016], + device='cuda:2'), out_proj_covar=tensor([6.0754e-05, 6.3081e-05, 6.1199e-05, 5.6535e-05, 6.4816e-05, 8.5788e-05, + 7.0889e-05, 8.2822e-05], device='cuda:2') +2022-12-07 13:32:00,559 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=36885.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:32:23,326 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36911.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:32:34,637 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.557e+01 2.430e+02 3.121e+02 4.365e+02 9.299e+02, threshold=6.241e+02, percent-clipped=3.0 +2022-12-07 13:32:56,984 INFO [train.py:873] (2/4) Epoch 5, batch 6700, loss[loss=0.1565, simple_loss=0.1826, pruned_loss=0.06519, over 14166.00 frames. ], tot_loss[loss=0.177, simple_loss=0.1904, pruned_loss=0.08177, over 1975892.26 frames. ], batch size: 35, lr: 1.49e-02, grad_scale: 8.0 +2022-12-07 13:33:04,795 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=36958.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:33:07,472 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36961.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:33:10,028 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36964.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:33:13,476 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=36968.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:33:17,180 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=36972.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:33:27,463 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.03 vs. limit=5.0 +2022-12-07 13:33:45,463 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=37004.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:33:59,082 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=37019.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:34:03,149 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.645e+02 2.484e+02 3.120e+02 4.102e+02 1.074e+03, threshold=6.239e+02, percent-clipped=7.0 +2022-12-07 13:34:07,793 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.38 vs. limit=5.0 +2022-12-07 13:34:24,643 INFO [train.py:873] (2/4) Epoch 5, batch 6800, loss[loss=0.2128, simple_loss=0.1705, pruned_loss=0.1275, over 1200.00 frames. ], tot_loss[loss=0.1773, simple_loss=0.1908, pruned_loss=0.0819, over 2025607.71 frames. ], batch size: 100, lr: 1.48e-02, grad_scale: 8.0 +2022-12-07 13:34:31,275 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37056.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 13:34:38,916 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=37065.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:35:24,519 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.89 vs. limit=5.0 +2022-12-07 13:35:30,147 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.366e+02 2.403e+02 2.984e+02 3.959e+02 1.091e+03, threshold=5.968e+02, percent-clipped=8.0 +2022-12-07 13:35:37,943 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-07 13:35:52,586 INFO [train.py:873] (2/4) Epoch 5, batch 6900, loss[loss=0.2, simple_loss=0.2068, pruned_loss=0.09659, over 14281.00 frames. ], tot_loss[loss=0.1767, simple_loss=0.1905, pruned_loss=0.08142, over 2051041.82 frames. ], batch size: 60, lr: 1.48e-02, grad_scale: 8.0 +2022-12-07 13:35:54,970 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.04 vs. limit=2.0 +2022-12-07 13:36:01,491 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.20 vs. limit=2.0 +2022-12-07 13:36:02,906 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37161.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 13:36:11,323 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-07 13:36:29,650 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.12 vs. limit=5.0 +2022-12-07 13:36:58,808 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.066e+02 2.434e+02 3.159e+02 4.022e+02 7.807e+02, threshold=6.317e+02, percent-clipped=6.0 +2022-12-07 13:37:16,310 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.21 vs. limit=2.0 +2022-12-07 13:37:20,128 INFO [train.py:873] (2/4) Epoch 5, batch 7000, loss[loss=0.176, simple_loss=0.1882, pruned_loss=0.08191, over 14256.00 frames. ], tot_loss[loss=0.1771, simple_loss=0.1907, pruned_loss=0.08178, over 2011793.66 frames. ], batch size: 60, lr: 1.48e-02, grad_scale: 4.0 +2022-12-07 13:37:24,054 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4587, 1.5366, 2.7646, 1.4863, 2.7328, 2.7785, 2.0127, 2.8564], + device='cuda:2'), covar=tensor([0.0245, 0.2101, 0.0228, 0.1697, 0.0258, 0.0319, 0.0707, 0.0191], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0162, 0.0139, 0.0171, 0.0156, 0.0151, 0.0124, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 13:37:31,258 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37261.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:37:33,843 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37264.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:37:36,238 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37267.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:37:37,162 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37268.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:37:37,201 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=37268.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:38:12,641 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37309.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:38:14,980 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37312.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:38:16,022 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5569, 2.4045, 4.5783, 3.0202, 4.2331, 1.9675, 3.4543, 4.1111], + device='cuda:2'), covar=tensor([0.0416, 0.5213, 0.0311, 0.9299, 0.0333, 0.4248, 0.1143, 0.0230], + device='cuda:2'), in_proj_covar=tensor([0.0221, 0.0252, 0.0166, 0.0343, 0.0175, 0.0260, 0.0238, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 13:38:16,750 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37314.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:38:18,490 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37316.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:38:26,222 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.366e+02 2.611e+02 3.349e+02 4.594e+02 9.259e+02, threshold=6.698e+02, percent-clipped=6.0 +2022-12-07 13:38:29,752 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=37329.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:38:46,859 INFO [train.py:873] (2/4) Epoch 5, batch 7100, loss[loss=0.1978, simple_loss=0.1988, pruned_loss=0.09837, over 7774.00 frames. ], tot_loss[loss=0.1767, simple_loss=0.1902, pruned_loss=0.08159, over 1939421.28 frames. ], batch size: 100, lr: 1.48e-02, grad_scale: 4.0 +2022-12-07 13:38:52,930 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37356.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 13:38:56,305 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37360.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:38:59,720 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=37364.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:39:34,774 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37404.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:39:53,450 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.411e+02 2.303e+02 3.351e+02 4.171e+02 6.616e+02, threshold=6.701e+02, percent-clipped=0.0 +2022-12-07 13:39:53,711 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=37425.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:40:06,320 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8083, 0.8029, 0.7727, 0.9372, 0.9209, 0.3117, 0.8236, 0.8390], + device='cuda:2'), covar=tensor([0.0247, 0.0371, 0.0157, 0.0230, 0.0107, 0.0101, 0.0710, 0.0276], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0020, 0.0018, 0.0019, 0.0021, 0.0025, 0.0020, 0.0019], + device='cuda:2'), out_proj_covar=tensor([7.6050e-05, 8.1368e-05, 7.2945e-05, 7.7620e-05, 8.3666e-05, 9.7273e-05, + 8.6593e-05, 7.9971e-05], device='cuda:2') +2022-12-07 13:40:10,711 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9312, 2.7142, 2.6129, 1.5884, 2.3903, 2.7183, 2.9139, 2.3891], + device='cuda:2'), covar=tensor([0.0625, 0.1840, 0.1125, 0.2547, 0.0981, 0.0526, 0.0695, 0.1739], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0202, 0.0120, 0.0126, 0.0105, 0.0112, 0.0089, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 13:40:14,167 INFO [train.py:873] (2/4) Epoch 5, batch 7200, loss[loss=0.2404, simple_loss=0.1976, pruned_loss=0.1416, over 1163.00 frames. ], tot_loss[loss=0.1754, simple_loss=0.1894, pruned_loss=0.08076, over 1965348.65 frames. ], batch size: 100, lr: 1.48e-02, grad_scale: 8.0 +2022-12-07 13:40:18,427 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-12-07 13:40:25,465 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37461.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 13:40:49,126 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.7412, 5.6272, 5.2777, 5.8871, 5.5349, 5.2615, 5.8836, 5.7659], + device='cuda:2'), covar=tensor([0.0566, 0.0358, 0.0387, 0.0347, 0.0441, 0.0237, 0.0378, 0.0441], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0096, 0.0109, 0.0112, 0.0114, 0.0086, 0.0125, 0.0106], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 13:41:06,636 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.73 vs. limit=2.0 +2022-12-07 13:41:08,040 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37509.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 13:41:21,482 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.551e+02 2.612e+02 3.336e+02 4.094e+02 6.871e+02, threshold=6.672e+02, percent-clipped=1.0 +2022-12-07 13:41:31,034 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.17 vs. limit=2.0 +2022-12-07 13:41:35,518 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.57 vs. limit=5.0 +2022-12-07 13:41:42,812 INFO [train.py:873] (2/4) Epoch 5, batch 7300, loss[loss=0.1717, simple_loss=0.1886, pruned_loss=0.07736, over 14296.00 frames. ], tot_loss[loss=0.1765, simple_loss=0.1897, pruned_loss=0.08165, over 1903211.74 frames. ], batch size: 69, lr: 1.47e-02, grad_scale: 8.0 +2022-12-07 13:41:58,569 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37567.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:42:17,817 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.78 vs. limit=5.0 +2022-12-07 13:42:41,433 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37614.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:42:42,183 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37615.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:42:50,477 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37624.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:42:51,224 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.575e+02 2.435e+02 3.251e+02 4.059e+02 7.905e+02, threshold=6.501e+02, percent-clipped=2.0 +2022-12-07 13:43:12,151 INFO [train.py:873] (2/4) Epoch 5, batch 7400, loss[loss=0.1726, simple_loss=0.1885, pruned_loss=0.0783, over 14527.00 frames. ], tot_loss[loss=0.1753, simple_loss=0.1892, pruned_loss=0.08067, over 1928903.26 frames. ], batch size: 49, lr: 1.47e-02, grad_scale: 8.0 +2022-12-07 13:43:22,080 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37660.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:43:24,013 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37662.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:43:30,257 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9022, 0.8466, 1.2494, 1.0564, 1.2862, 0.7453, 1.0645, 0.8458], + device='cuda:2'), covar=tensor([0.0700, 0.0563, 0.0495, 0.0377, 0.0642, 0.0324, 0.0313, 0.0658], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0012, 0.0010, 0.0010, 0.0012, 0.0015, 0.0012, 0.0016], + device='cuda:2'), out_proj_covar=tensor([6.2247e-05, 6.4855e-05, 6.0296e-05, 5.5935e-05, 6.5943e-05, 8.5392e-05, + 7.0173e-05, 8.3805e-05], device='cuda:2') +2022-12-07 13:43:52,431 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6467, 2.7644, 2.5578, 2.4494, 2.0491, 2.9689, 2.3941, 1.1366], + device='cuda:2'), covar=tensor([0.3091, 0.0779, 0.0902, 0.1141, 0.1075, 0.0509, 0.1780, 0.4014], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0059, 0.0051, 0.0051, 0.0072, 0.0055, 0.0079, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:44:05,021 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37708.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:44:15,056 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=37720.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:44:19,314 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.338e+02 2.420e+02 3.401e+02 4.272e+02 7.052e+02, threshold=6.801e+02, percent-clipped=3.0 +2022-12-07 13:44:41,303 INFO [train.py:873] (2/4) Epoch 5, batch 7500, loss[loss=0.1596, simple_loss=0.1869, pruned_loss=0.06619, over 14248.00 frames. ], tot_loss[loss=0.1753, simple_loss=0.1893, pruned_loss=0.08067, over 1967206.44 frames. ], batch size: 57, lr: 1.47e-02, grad_scale: 8.0 +2022-12-07 13:44:47,615 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4239, 5.2513, 4.8578, 5.0197, 5.0968, 5.3333, 5.5156, 5.4423], + device='cuda:2'), covar=tensor([0.0882, 0.0360, 0.1840, 0.2063, 0.0595, 0.0523, 0.0566, 0.0754], + device='cuda:2'), in_proj_covar=tensor([0.0294, 0.0229, 0.0365, 0.0458, 0.0273, 0.0328, 0.0334, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 13:45:08,553 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1508, 3.5385, 2.8052, 4.2098, 4.0318, 4.0672, 3.4032, 2.9600], + device='cuda:2'), covar=tensor([0.0743, 0.1245, 0.3408, 0.0350, 0.0502, 0.1797, 0.1125, 0.3504], + device='cuda:2'), in_proj_covar=tensor([0.0220, 0.0298, 0.0292, 0.0185, 0.0251, 0.0253, 0.0252, 0.0287], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 13:45:09,360 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9792, 2.6933, 2.7190, 1.7545, 2.5093, 2.7201, 2.9071, 2.3389], + device='cuda:2'), covar=tensor([0.0809, 0.2945, 0.1511, 0.2703, 0.1181, 0.0716, 0.0851, 0.2147], + device='cuda:2'), in_proj_covar=tensor([0.0107, 0.0202, 0.0119, 0.0124, 0.0104, 0.0110, 0.0087, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 13:45:17,236 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.67 vs. limit=5.0 +2022-12-07 13:46:12,627 INFO [train.py:873] (2/4) Epoch 6, batch 0, loss[loss=0.2007, simple_loss=0.2115, pruned_loss=0.09501, over 13973.00 frames. ], tot_loss[loss=0.2007, simple_loss=0.2115, pruned_loss=0.09501, over 13973.00 frames. ], batch size: 19, lr: 1.37e-02, grad_scale: 8.0 +2022-12-07 13:46:12,627 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 13:46:18,289 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2177, 1.5667, 3.4299, 1.6108, 3.3621, 3.3331, 2.2717, 3.5516], + device='cuda:2'), covar=tensor([0.0181, 0.2607, 0.0155, 0.2040, 0.0196, 0.0250, 0.0316, 0.0123], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0154, 0.0132, 0.0164, 0.0151, 0.0146, 0.0119, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 13:46:20,010 INFO [train.py:905] (2/4) Epoch 6, validation: loss=0.1313, simple_loss=0.1749, pruned_loss=0.04388, over 857387.00 frames. +2022-12-07 13:46:20,014 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 13:46:32,651 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.323e+01 1.772e+02 2.799e+02 3.695e+02 8.641e+02, threshold=5.598e+02, percent-clipped=1.0 +2022-12-07 13:46:45,302 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-12-07 13:47:50,545 INFO [train.py:873] (2/4) Epoch 6, batch 100, loss[loss=0.1544, simple_loss=0.1736, pruned_loss=0.06762, over 14095.00 frames. ], tot_loss[loss=0.1729, simple_loss=0.1879, pruned_loss=0.07893, over 838551.50 frames. ], batch size: 22, lr: 1.37e-02, grad_scale: 8.0 +2022-12-07 13:48:01,908 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=37924.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:48:02,649 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.346e+02 2.495e+02 3.181e+02 4.062e+02 9.836e+02, threshold=6.363e+02, percent-clipped=3.0 +2022-12-07 13:48:20,991 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-12-07 13:48:44,529 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=37972.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:48:50,824 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=37979.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:49:09,511 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-12-07 13:49:19,531 INFO [train.py:873] (2/4) Epoch 6, batch 200, loss[loss=0.1961, simple_loss=0.1875, pruned_loss=0.1023, over 2674.00 frames. ], tot_loss[loss=0.171, simple_loss=0.1863, pruned_loss=0.07789, over 1227036.52 frames. ], batch size: 100, lr: 1.37e-02, grad_scale: 8.0 +2022-12-07 13:49:27,521 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=38020.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:49:31,830 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.520e+02 2.443e+02 3.000e+02 3.883e+02 6.442e+02, threshold=6.000e+02, percent-clipped=2.0 +2022-12-07 13:49:45,625 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=38040.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:50:09,555 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=38068.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:50:11,414 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.76 vs. limit=5.0 +2022-12-07 13:50:47,794 INFO [train.py:873] (2/4) Epoch 6, batch 300, loss[loss=0.1907, simple_loss=0.2011, pruned_loss=0.09016, over 14553.00 frames. ], tot_loss[loss=0.1719, simple_loss=0.187, pruned_loss=0.07843, over 1538402.25 frames. ], batch size: 43, lr: 1.37e-02, grad_scale: 8.0 +2022-12-07 13:50:59,800 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.208e+02 2.428e+02 3.008e+02 3.765e+02 6.349e+02, threshold=6.017e+02, percent-clipped=1.0 +2022-12-07 13:52:16,131 INFO [train.py:873] (2/4) Epoch 6, batch 400, loss[loss=0.1584, simple_loss=0.174, pruned_loss=0.07135, over 14634.00 frames. ], tot_loss[loss=0.1716, simple_loss=0.1871, pruned_loss=0.0781, over 1740032.39 frames. ], batch size: 33, lr: 1.36e-02, grad_scale: 8.0 +2022-12-07 13:52:16,212 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3115, 3.9455, 3.9443, 4.3054, 3.9650, 3.5689, 4.3810, 4.2666], + device='cuda:2'), covar=tensor([0.0663, 0.0723, 0.0661, 0.0551, 0.0795, 0.0597, 0.0622, 0.0628], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0098, 0.0111, 0.0113, 0.0115, 0.0086, 0.0126, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 13:52:28,766 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.176e+02 2.719e+02 3.194e+02 3.868e+02 8.474e+02, threshold=6.387e+02, percent-clipped=5.0 +2022-12-07 13:52:49,858 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.84 vs. limit=2.0 +2022-12-07 13:53:45,481 INFO [train.py:873] (2/4) Epoch 6, batch 500, loss[loss=0.2076, simple_loss=0.1821, pruned_loss=0.1166, over 1273.00 frames. ], tot_loss[loss=0.1745, simple_loss=0.1887, pruned_loss=0.08011, over 1813547.80 frames. ], batch size: 100, lr: 1.36e-02, grad_scale: 8.0 +2022-12-07 13:53:57,615 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.672e+02 2.649e+02 3.367e+02 4.608e+02 8.885e+02, threshold=6.735e+02, percent-clipped=8.0 +2022-12-07 13:54:06,532 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=38335.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:54:28,798 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=38360.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:55:12,159 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.47 vs. limit=5.0 +2022-12-07 13:55:12,572 INFO [train.py:873] (2/4) Epoch 6, batch 600, loss[loss=0.1518, simple_loss=0.1799, pruned_loss=0.06187, over 14385.00 frames. ], tot_loss[loss=0.175, simple_loss=0.1891, pruned_loss=0.08043, over 1903813.34 frames. ], batch size: 55, lr: 1.36e-02, grad_scale: 8.0 +2022-12-07 13:55:17,920 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7268, 3.2869, 4.1523, 2.5557, 2.7378, 3.3446, 1.6729, 3.2896], + device='cuda:2'), covar=tensor([0.2065, 0.0644, 0.0403, 0.2724, 0.2205, 0.1016, 0.5574, 0.0977], + device='cuda:2'), in_proj_covar=tensor([0.0068, 0.0077, 0.0074, 0.0082, 0.0105, 0.0068, 0.0133, 0.0070], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 13:55:18,306 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-07 13:55:21,354 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=38421.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:55:24,769 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.074e+02 2.417e+02 3.018e+02 3.905e+02 9.436e+02, threshold=6.036e+02, percent-clipped=5.0 +2022-12-07 13:55:49,243 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9542, 1.5714, 1.9684, 1.8311, 2.0600, 1.8240, 1.6586, 1.9439], + device='cuda:2'), covar=tensor([0.0184, 0.0709, 0.0105, 0.0247, 0.0112, 0.0247, 0.0158, 0.0204], + device='cuda:2'), in_proj_covar=tensor([0.0278, 0.0331, 0.0364, 0.0313, 0.0349, 0.0286, 0.0336, 0.0345], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 13:56:41,310 INFO [train.py:873] (2/4) Epoch 6, batch 700, loss[loss=0.1666, simple_loss=0.1836, pruned_loss=0.07482, over 14233.00 frames. ], tot_loss[loss=0.1724, simple_loss=0.1871, pruned_loss=0.07884, over 1919309.51 frames. ], batch size: 69, lr: 1.36e-02, grad_scale: 8.0 +2022-12-07 13:56:53,909 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.144e+02 2.377e+02 3.088e+02 3.950e+02 6.999e+02, threshold=6.177e+02, percent-clipped=2.0 +2022-12-07 13:57:39,436 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6091, 3.6042, 3.8118, 3.4490, 3.5864, 3.6166, 1.4072, 3.4725], + device='cuda:2'), covar=tensor([0.0213, 0.0280, 0.0314, 0.0395, 0.0302, 0.0414, 0.3109, 0.0216], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0138, 0.0126, 0.0117, 0.0169, 0.0118, 0.0154, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 13:57:40,369 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4175, 3.0656, 2.7349, 1.8387, 2.7478, 3.0832, 3.3321, 2.5093], + device='cuda:2'), covar=tensor([0.0524, 0.1872, 0.1025, 0.2141, 0.0846, 0.0542, 0.0604, 0.1316], + device='cuda:2'), in_proj_covar=tensor([0.0109, 0.0203, 0.0117, 0.0126, 0.0107, 0.0113, 0.0088, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 13:57:55,952 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4944, 2.4120, 4.4734, 3.0692, 4.3272, 2.0579, 3.2303, 4.2342], + device='cuda:2'), covar=tensor([0.0398, 0.4752, 0.0460, 0.9777, 0.0402, 0.4404, 0.1336, 0.0330], + device='cuda:2'), in_proj_covar=tensor([0.0220, 0.0250, 0.0171, 0.0350, 0.0180, 0.0260, 0.0240, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 13:58:03,140 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1360, 1.4485, 3.9556, 1.8584, 4.0302, 4.0400, 3.2815, 4.5205], + device='cuda:2'), covar=tensor([0.0171, 0.2763, 0.0303, 0.2105, 0.0267, 0.0304, 0.0452, 0.0105], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0159, 0.0139, 0.0171, 0.0155, 0.0151, 0.0123, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 13:58:10,603 INFO [train.py:873] (2/4) Epoch 6, batch 800, loss[loss=0.178, simple_loss=0.1931, pruned_loss=0.08143, over 14422.00 frames. ], tot_loss[loss=0.1723, simple_loss=0.187, pruned_loss=0.07875, over 1933943.37 frames. ], batch size: 51, lr: 1.36e-02, grad_scale: 8.0 +2022-12-07 13:58:23,021 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.240e+02 2.424e+02 2.905e+02 3.924e+02 6.932e+02, threshold=5.809e+02, percent-clipped=2.0 +2022-12-07 13:58:32,273 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=38635.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:58:42,710 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3183, 1.8548, 1.9953, 1.0373, 1.7929, 2.1921, 2.1879, 1.7662], + device='cuda:2'), covar=tensor([0.0671, 0.1501, 0.1103, 0.2744, 0.1109, 0.0674, 0.0562, 0.1607], + device='cuda:2'), in_proj_covar=tensor([0.0110, 0.0207, 0.0121, 0.0128, 0.0108, 0.0114, 0.0089, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 13:58:55,727 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9755, 0.9226, 0.9388, 1.1978, 1.1728, 0.5760, 0.9671, 0.8862], + device='cuda:2'), covar=tensor([0.1800, 0.1076, 0.0549, 0.0944, 0.1054, 0.0436, 0.1396, 0.2835], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0019, 0.0018, 0.0018, 0.0019, 0.0025, 0.0020, 0.0019], + device='cuda:2'), out_proj_covar=tensor([7.8260e-05, 8.0339e-05, 7.3111e-05, 7.7477e-05, 8.1723e-05, 9.8882e-05, + 8.7625e-05, 7.8816e-05], device='cuda:2') +2022-12-07 13:59:15,078 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=38683.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:59:25,028 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.84 vs. limit=5.0 +2022-12-07 13:59:39,509 INFO [train.py:873] (2/4) Epoch 6, batch 900, loss[loss=0.1717, simple_loss=0.1942, pruned_loss=0.07458, over 14295.00 frames. ], tot_loss[loss=0.1714, simple_loss=0.1867, pruned_loss=0.07807, over 1976770.42 frames. ], batch size: 28, lr: 1.36e-02, grad_scale: 8.0 +2022-12-07 13:59:44,619 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=38716.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 13:59:52,235 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.369e+02 2.457e+02 3.272e+02 3.954e+02 1.451e+03, threshold=6.544e+02, percent-clipped=7.0 +2022-12-07 14:00:05,869 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 14:00:23,548 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.07 vs. limit=2.0 +2022-12-07 14:00:29,101 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4538, 2.5847, 2.4826, 2.4392, 1.9707, 2.5857, 2.2999, 1.0670], + device='cuda:2'), covar=tensor([0.2179, 0.1000, 0.0851, 0.0725, 0.0969, 0.0691, 0.1713, 0.3694], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0061, 0.0050, 0.0053, 0.0072, 0.0056, 0.0082, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:01:08,900 INFO [train.py:873] (2/4) Epoch 6, batch 1000, loss[loss=0.1742, simple_loss=0.1907, pruned_loss=0.07886, over 11171.00 frames. ], tot_loss[loss=0.1724, simple_loss=0.1871, pruned_loss=0.07885, over 1953442.98 frames. ], batch size: 100, lr: 1.35e-02, grad_scale: 8.0 +2022-12-07 14:01:19,444 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=38823.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:01:21,140 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.119e+02 2.592e+02 3.216e+02 4.248e+02 8.098e+02, threshold=6.432e+02, percent-clipped=2.0 +2022-12-07 14:02:13,880 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=38884.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:02:14,903 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.99 vs. limit=2.0 +2022-12-07 14:02:37,253 INFO [train.py:873] (2/4) Epoch 6, batch 1100, loss[loss=0.1388, simple_loss=0.1652, pruned_loss=0.05617, over 14050.00 frames. ], tot_loss[loss=0.1713, simple_loss=0.1863, pruned_loss=0.07813, over 1970152.91 frames. ], batch size: 19, lr: 1.35e-02, grad_scale: 8.0 +2022-12-07 14:02:50,185 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.088e+02 2.398e+02 3.098e+02 3.888e+02 7.053e+02, threshold=6.197e+02, percent-clipped=1.0 +2022-12-07 14:03:35,874 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1840, 3.6412, 2.6389, 4.4265, 4.0893, 4.2235, 3.4363, 2.8915], + device='cuda:2'), covar=tensor([0.0495, 0.1090, 0.4045, 0.0224, 0.0594, 0.0686, 0.0974, 0.3869], + device='cuda:2'), in_proj_covar=tensor([0.0223, 0.0295, 0.0287, 0.0185, 0.0250, 0.0247, 0.0250, 0.0283], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:04:07,243 INFO [train.py:873] (2/4) Epoch 6, batch 1200, loss[loss=0.1861, simple_loss=0.1977, pruned_loss=0.08723, over 14252.00 frames. ], tot_loss[loss=0.1721, simple_loss=0.1873, pruned_loss=0.07843, over 2003606.60 frames. ], batch size: 66, lr: 1.35e-02, grad_scale: 8.0 +2022-12-07 14:04:11,665 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=39016.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:04:16,777 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8692, 1.5683, 3.8620, 3.6269, 3.7093, 3.7043, 3.0415, 3.8964], + device='cuda:2'), covar=tensor([0.1150, 0.1228, 0.0079, 0.0142, 0.0147, 0.0109, 0.0236, 0.0089], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0156, 0.0107, 0.0145, 0.0122, 0.0125, 0.0096, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:04:19,444 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.136e+01 2.396e+02 3.010e+02 3.930e+02 7.519e+02, threshold=6.020e+02, percent-clipped=3.0 +2022-12-07 14:04:46,490 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4029, 1.8116, 1.6110, 1.8010, 1.6130, 1.7546, 1.4577, 1.0630], + device='cuda:2'), covar=tensor([0.1806, 0.0655, 0.0608, 0.0385, 0.0943, 0.0398, 0.1672, 0.2483], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0062, 0.0052, 0.0052, 0.0076, 0.0057, 0.0085, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:04:54,531 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=39064.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:05:05,638 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.17 vs. limit=2.0 +2022-12-07 14:05:35,514 INFO [train.py:873] (2/4) Epoch 6, batch 1300, loss[loss=0.195, simple_loss=0.1906, pruned_loss=0.09968, over 3892.00 frames. ], tot_loss[loss=0.1713, simple_loss=0.1866, pruned_loss=0.07802, over 1970778.86 frames. ], batch size: 100, lr: 1.35e-02, grad_scale: 8.0 +2022-12-07 14:05:46,163 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8890, 1.4691, 3.5624, 3.3210, 3.4451, 3.5368, 2.8998, 3.5996], + device='cuda:2'), covar=tensor([0.1076, 0.1292, 0.0085, 0.0172, 0.0167, 0.0102, 0.0198, 0.0103], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0154, 0.0105, 0.0144, 0.0120, 0.0123, 0.0096, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:05:47,006 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8561, 1.5927, 3.7659, 3.4815, 3.6348, 3.7728, 3.1540, 3.8036], + device='cuda:2'), covar=tensor([0.1134, 0.1311, 0.0081, 0.0159, 0.0148, 0.0090, 0.0183, 0.0099], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0154, 0.0105, 0.0144, 0.0120, 0.0123, 0.0096, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:05:48,566 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.179e+02 2.329e+02 2.773e+02 3.737e+02 7.179e+02, threshold=5.547e+02, percent-clipped=2.0 +2022-12-07 14:06:09,151 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.07 vs. limit=2.0 +2022-12-07 14:06:14,140 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3249, 3.1651, 2.8514, 2.9689, 3.2844, 3.2039, 3.3192, 3.2913], + device='cuda:2'), covar=tensor([0.0896, 0.0664, 0.2188, 0.2674, 0.0856, 0.0894, 0.1101, 0.0830], + device='cuda:2'), in_proj_covar=tensor([0.0304, 0.0225, 0.0368, 0.0460, 0.0272, 0.0334, 0.0336, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 14:06:36,469 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39179.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:06:41,827 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39185.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:06:53,490 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6426, 4.6104, 5.0621, 4.1191, 4.7495, 5.1340, 1.9306, 4.4213], + device='cuda:2'), covar=tensor([0.0195, 0.0222, 0.0232, 0.0356, 0.0225, 0.0095, 0.2774, 0.0230], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0141, 0.0125, 0.0119, 0.0170, 0.0121, 0.0153, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:07:05,237 INFO [train.py:873] (2/4) Epoch 6, batch 1400, loss[loss=0.1663, simple_loss=0.1836, pruned_loss=0.07448, over 14227.00 frames. ], tot_loss[loss=0.1722, simple_loss=0.1871, pruned_loss=0.07869, over 1970705.03 frames. ], batch size: 94, lr: 1.35e-02, grad_scale: 8.0 +2022-12-07 14:07:17,497 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.489e+02 2.370e+02 2.964e+02 3.794e+02 7.798e+02, threshold=5.929e+02, percent-clipped=9.0 +2022-12-07 14:07:33,677 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.51 vs. limit=2.0 +2022-12-07 14:07:36,765 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39246.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:08:03,321 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6138, 1.8827, 2.5139, 2.2495, 2.5603, 2.3366, 2.2704, 2.2451], + device='cuda:2'), covar=tensor([0.0426, 0.2463, 0.0603, 0.1574, 0.0420, 0.0555, 0.0893, 0.1223], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0342, 0.0375, 0.0315, 0.0361, 0.0289, 0.0346, 0.0350], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 14:08:03,540 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.27 vs. limit=2.0 +2022-12-07 14:08:10,647 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39284.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:08:33,850 INFO [train.py:873] (2/4) Epoch 6, batch 1500, loss[loss=0.1872, simple_loss=0.1714, pruned_loss=0.1016, over 2650.00 frames. ], tot_loss[loss=0.1715, simple_loss=0.1863, pruned_loss=0.07835, over 1954165.78 frames. ], batch size: 100, lr: 1.34e-02, grad_scale: 16.0 +2022-12-07 14:08:46,905 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.110e+02 2.464e+02 2.970e+02 3.813e+02 8.133e+02, threshold=5.940e+02, percent-clipped=4.0 +2022-12-07 14:09:04,266 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39345.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:09:07,347 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.03 vs. limit=2.0 +2022-12-07 14:09:33,386 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=16.53 vs. limit=5.0 +2022-12-07 14:10:03,120 INFO [train.py:873] (2/4) Epoch 6, batch 1600, loss[loss=0.1858, simple_loss=0.1714, pruned_loss=0.1001, over 1160.00 frames. ], tot_loss[loss=0.1711, simple_loss=0.1862, pruned_loss=0.07798, over 2000188.48 frames. ], batch size: 100, lr: 1.34e-02, grad_scale: 8.0 +2022-12-07 14:10:11,876 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1638, 1.3219, 1.5009, 1.2216, 1.2900, 0.9450, 0.9211, 0.9334], + device='cuda:2'), covar=tensor([0.0650, 0.0342, 0.0497, 0.0325, 0.0634, 0.0303, 0.0262, 0.0577], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0010, 0.0010, 0.0012, 0.0015, 0.0011, 0.0016], + device='cuda:2'), out_proj_covar=tensor([6.2971e-05, 6.6683e-05, 6.0362e-05, 6.0180e-05, 6.6821e-05, 8.7930e-05, + 7.2805e-05, 8.4946e-05], device='cuda:2') +2022-12-07 14:10:16,105 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.549e+01 2.367e+02 3.070e+02 3.991e+02 2.269e+03, threshold=6.141e+02, percent-clipped=9.0 +2022-12-07 14:10:18,927 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8878, 3.7282, 4.0738, 3.5291, 3.8144, 3.9854, 1.3113, 3.6493], + device='cuda:2'), covar=tensor([0.0193, 0.0332, 0.0359, 0.0502, 0.0334, 0.0348, 0.3379, 0.0262], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0140, 0.0125, 0.0118, 0.0169, 0.0120, 0.0151, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:10:49,282 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39463.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:10:53,773 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9804, 0.9020, 0.9004, 1.0756, 0.6238, 0.5010, 0.9807, 1.0643], + device='cuda:2'), covar=tensor([0.1327, 0.0794, 0.1345, 0.1209, 0.2106, 0.0749, 0.1699, 0.1248], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0020, 0.0018, 0.0019, 0.0020, 0.0026, 0.0020, 0.0019], + device='cuda:2'), out_proj_covar=tensor([7.8828e-05, 8.2576e-05, 7.3412e-05, 7.9857e-05, 8.3278e-05, 1.0138e-04, + 8.7404e-05, 7.9198e-05], device='cuda:2') +2022-12-07 14:10:56,482 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0831, 1.1250, 1.2640, 1.0035, 0.9203, 0.8075, 0.9337, 0.8253], + device='cuda:2'), covar=tensor([0.0400, 0.0741, 0.0420, 0.0338, 0.0459, 0.0420, 0.0249, 0.0753], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0010, 0.0010, 0.0012, 0.0015, 0.0012, 0.0017], + device='cuda:2'), out_proj_covar=tensor([6.4476e-05, 6.8647e-05, 6.1992e-05, 6.2398e-05, 6.8669e-05, 8.9238e-05, + 7.5252e-05, 8.7091e-05], device='cuda:2') +2022-12-07 14:11:03,667 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=39479.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:11:04,743 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.75 vs. limit=2.0 +2022-12-07 14:11:27,384 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7456, 2.7304, 2.5839, 2.8575, 2.3828, 2.5213, 2.8080, 2.8103], + device='cuda:2'), covar=tensor([0.0697, 0.0753, 0.0728, 0.0546, 0.1028, 0.0693, 0.0740, 0.0623], + device='cuda:2'), in_proj_covar=tensor([0.0111, 0.0097, 0.0112, 0.0114, 0.0115, 0.0087, 0.0128, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:11:31,582 INFO [train.py:873] (2/4) Epoch 6, batch 1700, loss[loss=0.2229, simple_loss=0.2135, pruned_loss=0.1162, over 8648.00 frames. ], tot_loss[loss=0.1718, simple_loss=0.1864, pruned_loss=0.0786, over 1942391.30 frames. ], batch size: 100, lr: 1.34e-02, grad_scale: 8.0 +2022-12-07 14:11:43,964 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39524.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:11:45,579 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.568e+02 2.465e+02 3.198e+02 4.096e+02 7.326e+02, threshold=6.396e+02, percent-clipped=2.0 +2022-12-07 14:11:46,575 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=39527.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:11:58,765 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39541.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:12:07,758 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.7647, 5.1724, 5.2869, 5.7378, 5.3218, 4.6520, 5.6970, 4.6866], + device='cuda:2'), covar=tensor([0.0265, 0.1313, 0.0274, 0.0403, 0.0838, 0.0351, 0.0417, 0.0412], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0220, 0.0146, 0.0140, 0.0146, 0.0116, 0.0213, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 14:12:44,060 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-12-07 14:13:01,691 INFO [train.py:873] (2/4) Epoch 6, batch 1800, loss[loss=0.1688, simple_loss=0.1887, pruned_loss=0.07449, over 14380.00 frames. ], tot_loss[loss=0.17, simple_loss=0.1857, pruned_loss=0.07713, over 2003336.00 frames. ], batch size: 73, lr: 1.34e-02, grad_scale: 4.0 +2022-12-07 14:13:09,299 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.41 vs. limit=5.0 +2022-12-07 14:13:15,825 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.478e+02 2.436e+02 3.001e+02 3.832e+02 8.283e+02, threshold=6.003e+02, percent-clipped=3.0 +2022-12-07 14:13:19,206 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.92 vs. limit=5.0 +2022-12-07 14:13:26,794 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39639.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 14:13:27,566 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39640.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:13:35,678 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-12-07 14:13:42,991 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9156, 0.7464, 0.6608, 0.8727, 0.8273, 0.5231, 0.9384, 0.7396], + device='cuda:2'), covar=tensor([0.0915, 0.1054, 0.0424, 0.0807, 0.0989, 0.0698, 0.0807, 0.1343], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0018, 0.0019, 0.0020, 0.0026, 0.0020, 0.0018], + device='cuda:2'), out_proj_covar=tensor([7.9829e-05, 8.3056e-05, 7.3835e-05, 7.9904e-05, 8.3152e-05, 1.0240e-04, + 8.7172e-05, 7.8580e-05], device='cuda:2') +2022-12-07 14:14:20,068 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39700.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 14:14:29,790 INFO [train.py:873] (2/4) Epoch 6, batch 1900, loss[loss=0.2188, simple_loss=0.1943, pruned_loss=0.1216, over 2619.00 frames. ], tot_loss[loss=0.1701, simple_loss=0.1855, pruned_loss=0.07737, over 1946159.09 frames. ], batch size: 100, lr: 1.34e-02, grad_scale: 4.0 +2022-12-07 14:14:44,080 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.151e+02 2.287e+02 3.025e+02 3.865e+02 4.213e+03, threshold=6.050e+02, percent-clipped=8.0 +2022-12-07 14:14:45,587 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 14:15:19,624 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39767.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:15:58,846 INFO [train.py:873] (2/4) Epoch 6, batch 2000, loss[loss=0.1687, simple_loss=0.1756, pruned_loss=0.0809, over 4907.00 frames. ], tot_loss[loss=0.1715, simple_loss=0.1864, pruned_loss=0.0783, over 1923060.05 frames. ], batch size: 100, lr: 1.34e-02, grad_scale: 8.0 +2022-12-07 14:16:01,667 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9221, 2.8917, 2.5552, 2.8578, 2.3397, 3.1298, 2.6723, 1.0588], + device='cuda:2'), covar=tensor([0.2498, 0.1037, 0.1591, 0.0840, 0.1085, 0.0446, 0.1476, 0.4069], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0063, 0.0053, 0.0055, 0.0077, 0.0058, 0.0088, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:16:02,516 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=39815.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:16:05,912 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39819.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:16:12,942 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.379e+02 2.241e+02 2.938e+02 3.748e+02 1.021e+03, threshold=5.875e+02, percent-clipped=4.0 +2022-12-07 14:16:14,100 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39828.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:16:25,422 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=39841.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:16:56,531 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=39876.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:16:56,587 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4604, 2.2778, 3.5326, 3.6786, 3.6506, 2.5178, 3.5550, 2.8730], + device='cuda:2'), covar=tensor([0.0158, 0.0394, 0.0384, 0.0180, 0.0112, 0.0497, 0.0120, 0.0401], + device='cuda:2'), in_proj_covar=tensor([0.0202, 0.0202, 0.0298, 0.0237, 0.0188, 0.0246, 0.0187, 0.0237], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 14:17:08,352 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=39889.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:17:27,453 INFO [train.py:873] (2/4) Epoch 6, batch 2100, loss[loss=0.1364, simple_loss=0.1654, pruned_loss=0.05372, over 14541.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.1856, pruned_loss=0.07794, over 1863049.79 frames. ], batch size: 34, lr: 1.33e-02, grad_scale: 8.0 +2022-12-07 14:17:42,096 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.442e+01 2.310e+02 2.903e+02 3.569e+02 8.046e+02, threshold=5.805e+02, percent-clipped=4.0 +2022-12-07 14:17:53,581 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=39940.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:17:53,664 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7902, 2.4363, 3.6562, 2.7074, 3.6003, 3.4645, 3.2752, 3.0811], + device='cuda:2'), covar=tensor([0.0426, 0.2947, 0.0629, 0.1919, 0.0589, 0.0765, 0.1442, 0.2043], + device='cuda:2'), in_proj_covar=tensor([0.0280, 0.0332, 0.0370, 0.0308, 0.0351, 0.0287, 0.0337, 0.0346], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 14:18:02,214 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0321, 3.5297, 2.8033, 4.2643, 4.1279, 4.0819, 3.3647, 2.9342], + device='cuda:2'), covar=tensor([0.0586, 0.1277, 0.4223, 0.0329, 0.0697, 0.1071, 0.1192, 0.3883], + device='cuda:2'), in_proj_covar=tensor([0.0224, 0.0297, 0.0295, 0.0190, 0.0254, 0.0256, 0.0255, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:18:25,510 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6123, 1.8917, 2.4628, 2.2421, 2.5718, 2.4488, 2.2446, 2.1709], + device='cuda:2'), covar=tensor([0.0361, 0.2287, 0.0513, 0.1461, 0.0385, 0.0630, 0.0618, 0.1222], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0334, 0.0373, 0.0311, 0.0354, 0.0289, 0.0341, 0.0348], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 14:18:35,848 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=39988.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:18:41,907 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=39995.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 14:18:56,140 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0466, 2.1067, 1.9475, 2.1851, 1.7362, 1.9388, 2.0776, 2.1114], + device='cuda:2'), covar=tensor([0.0926, 0.0922, 0.0984, 0.0823, 0.1345, 0.0831, 0.1008, 0.0954], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0099, 0.0115, 0.0115, 0.0118, 0.0090, 0.0128, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:19:00,410 INFO [train.py:873] (2/4) Epoch 6, batch 2200, loss[loss=0.1382, simple_loss=0.1678, pruned_loss=0.05433, over 13927.00 frames. ], tot_loss[loss=0.1713, simple_loss=0.1863, pruned_loss=0.07815, over 1935952.40 frames. ], batch size: 19, lr: 1.33e-02, grad_scale: 8.0 +2022-12-07 14:19:12,784 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-07 14:19:13,861 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.66 vs. limit=2.0 +2022-12-07 14:19:14,064 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.175e+01 2.567e+02 3.218e+02 4.341e+02 1.607e+03, threshold=6.436e+02, percent-clipped=13.0 +2022-12-07 14:19:39,145 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5783, 2.4462, 2.1629, 2.2536, 2.5335, 2.4444, 2.5491, 2.5065], + device='cuda:2'), covar=tensor([0.0975, 0.0866, 0.2347, 0.2761, 0.0989, 0.1066, 0.1321, 0.1118], + device='cuda:2'), in_proj_covar=tensor([0.0304, 0.0231, 0.0360, 0.0460, 0.0273, 0.0336, 0.0339, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 14:19:57,806 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8415, 4.4511, 4.2725, 4.8050, 4.5177, 4.1385, 4.7821, 4.0192], + device='cuda:2'), covar=tensor([0.0281, 0.0815, 0.0308, 0.0358, 0.0708, 0.0556, 0.0467, 0.0427], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0221, 0.0147, 0.0139, 0.0150, 0.0120, 0.0219, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 14:20:27,390 INFO [train.py:873] (2/4) Epoch 6, batch 2300, loss[loss=0.1871, simple_loss=0.1872, pruned_loss=0.09351, over 5961.00 frames. ], tot_loss[loss=0.1716, simple_loss=0.1864, pruned_loss=0.0784, over 1958084.75 frames. ], batch size: 100, lr: 1.33e-02, grad_scale: 8.0 +2022-12-07 14:20:35,047 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40119.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:20:38,458 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40123.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:20:41,820 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.659e+01 2.460e+02 3.103e+02 4.082e+02 7.462e+02, threshold=6.206e+02, percent-clipped=2.0 +2022-12-07 14:21:18,095 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40167.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:21:18,140 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2761, 4.6845, 4.6430, 5.2565, 4.9549, 4.3593, 5.1218, 4.3012], + device='cuda:2'), covar=tensor([0.0298, 0.1043, 0.0305, 0.0396, 0.0652, 0.0390, 0.0534, 0.0453], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0225, 0.0150, 0.0142, 0.0152, 0.0120, 0.0223, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 14:21:21,629 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40171.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:21:42,943 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40195.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:21:43,859 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0163, 2.6292, 3.5928, 2.8456, 3.6503, 3.6825, 3.5510, 2.9660], + device='cuda:2'), covar=tensor([0.0479, 0.3002, 0.0851, 0.2182, 0.0829, 0.0632, 0.1600, 0.1835], + device='cuda:2'), in_proj_covar=tensor([0.0283, 0.0333, 0.0371, 0.0310, 0.0351, 0.0287, 0.0335, 0.0343], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 14:21:51,677 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.18 vs. limit=2.0 +2022-12-07 14:21:57,513 INFO [train.py:873] (2/4) Epoch 6, batch 2400, loss[loss=0.1648, simple_loss=0.1453, pruned_loss=0.09214, over 1297.00 frames. ], tot_loss[loss=0.1713, simple_loss=0.1865, pruned_loss=0.07811, over 1945094.64 frames. ], batch size: 100, lr: 1.33e-02, grad_scale: 8.0 +2022-12-07 14:22:03,735 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40218.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:22:11,491 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.489e+02 2.289e+02 3.018e+02 4.008e+02 1.305e+03, threshold=6.036e+02, percent-clipped=3.0 +2022-12-07 14:22:37,299 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40256.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:22:51,136 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40272.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:22:57,530 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40279.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:22:58,364 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7155, 1.3143, 2.9550, 1.4834, 2.8761, 2.8305, 2.1193, 2.9778], + device='cuda:2'), covar=tensor([0.0241, 0.2457, 0.0251, 0.1820, 0.0299, 0.0412, 0.0840, 0.0182], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0161, 0.0142, 0.0173, 0.0157, 0.0153, 0.0127, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:23:11,493 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40295.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 14:23:25,879 INFO [train.py:873] (2/4) Epoch 6, batch 2500, loss[loss=0.1709, simple_loss=0.1884, pruned_loss=0.07677, over 14420.00 frames. ], tot_loss[loss=0.17, simple_loss=0.1855, pruned_loss=0.07722, over 1941115.76 frames. ], batch size: 73, lr: 1.33e-02, grad_scale: 8.0 +2022-12-07 14:23:39,898 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.231e+02 2.352e+02 2.913e+02 3.857e+02 7.423e+02, threshold=5.826e+02, percent-clipped=3.0 +2022-12-07 14:23:43,090 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40330.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:23:45,702 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40333.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:23:53,910 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40343.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 14:24:12,365 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.15 vs. limit=5.0 +2022-12-07 14:24:34,714 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-12-07 14:24:36,785 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40391.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:24:54,952 INFO [train.py:873] (2/4) Epoch 6, batch 2600, loss[loss=0.2007, simple_loss=0.2091, pruned_loss=0.09614, over 14496.00 frames. ], tot_loss[loss=0.1703, simple_loss=0.1858, pruned_loss=0.07742, over 1934544.89 frames. ], batch size: 34, lr: 1.33e-02, grad_scale: 8.0 +2022-12-07 14:24:58,859 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-07 14:25:05,731 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40423.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:25:08,969 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.181e+02 2.477e+02 3.292e+02 4.100e+02 6.300e+02, threshold=6.583e+02, percent-clipped=3.0 +2022-12-07 14:25:12,880 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4305, 3.1072, 2.7602, 1.7503, 2.6942, 3.2323, 3.1708, 2.5472], + device='cuda:2'), covar=tensor([0.0612, 0.2238, 0.1348, 0.2822, 0.1103, 0.0450, 0.0911, 0.1798], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0199, 0.0118, 0.0126, 0.0108, 0.0110, 0.0088, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:25:36,683 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9919, 1.5900, 3.7720, 1.4826, 3.8370, 4.0293, 3.1014, 4.3498], + device='cuda:2'), covar=tensor([0.0196, 0.2898, 0.0364, 0.2592, 0.0341, 0.0329, 0.0618, 0.0143], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0156, 0.0140, 0.0167, 0.0156, 0.0150, 0.0124, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:25:47,480 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40471.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:25:47,547 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40471.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:26:06,955 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40492.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:26:09,483 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1352, 0.9603, 1.1010, 1.2400, 0.9191, 0.6626, 1.3885, 1.0432], + device='cuda:2'), covar=tensor([0.1267, 0.1579, 0.0527, 0.1075, 0.1306, 0.0733, 0.0630, 0.1565], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0018, 0.0019, 0.0020, 0.0027, 0.0020, 0.0018], + device='cuda:2'), out_proj_covar=tensor([8.0465e-05, 8.6119e-05, 7.4886e-05, 8.1277e-05, 8.4199e-05, 1.0487e-04, + 8.8339e-05, 8.0175e-05], device='cuda:2') +2022-12-07 14:26:23,201 INFO [train.py:873] (2/4) Epoch 6, batch 2700, loss[loss=0.141, simple_loss=0.1658, pruned_loss=0.05809, over 14177.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.1855, pruned_loss=0.07633, over 2029736.54 frames. ], batch size: 29, lr: 1.33e-02, grad_scale: 8.0 +2022-12-07 14:26:30,850 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40519.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:26:34,176 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.48 vs. limit=5.0 +2022-12-07 14:26:37,692 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.473e+02 2.565e+02 3.016e+02 3.792e+02 1.283e+03, threshold=6.032e+02, percent-clipped=5.0 +2022-12-07 14:26:52,946 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-12-07 14:26:58,877 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40551.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:27:00,556 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40553.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:27:19,404 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40574.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:27:29,564 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8311, 1.3342, 3.2249, 2.9612, 3.1643, 3.1810, 2.6099, 3.2766], + device='cuda:2'), covar=tensor([0.1142, 0.1402, 0.0106, 0.0228, 0.0189, 0.0113, 0.0237, 0.0112], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0153, 0.0106, 0.0145, 0.0120, 0.0123, 0.0095, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:27:52,084 INFO [train.py:873] (2/4) Epoch 6, batch 2800, loss[loss=0.1763, simple_loss=0.197, pruned_loss=0.07783, over 14065.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.1853, pruned_loss=0.07681, over 1911587.27 frames. ], batch size: 29, lr: 1.32e-02, grad_scale: 8.0 +2022-12-07 14:28:05,951 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.364e+02 2.405e+02 3.321e+02 4.166e+02 7.949e+02, threshold=6.642e+02, percent-clipped=7.0 +2022-12-07 14:28:06,941 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40628.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:28:27,687 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40651.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:28:58,732 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40686.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:29:20,280 INFO [train.py:873] (2/4) Epoch 6, batch 2900, loss[loss=0.1529, simple_loss=0.1727, pruned_loss=0.06652, over 6947.00 frames. ], tot_loss[loss=0.1689, simple_loss=0.1848, pruned_loss=0.07647, over 1903741.89 frames. ], batch size: 100, lr: 1.32e-02, grad_scale: 8.0 +2022-12-07 14:29:21,622 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40712.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:29:34,595 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.325e+02 2.437e+02 2.821e+02 3.804e+02 6.190e+02, threshold=5.643e+02, percent-clipped=0.0 +2022-12-07 14:30:48,711 INFO [train.py:873] (2/4) Epoch 6, batch 3000, loss[loss=0.1514, simple_loss=0.1795, pruned_loss=0.06167, over 14483.00 frames. ], tot_loss[loss=0.1712, simple_loss=0.1861, pruned_loss=0.07813, over 1915751.40 frames. ], batch size: 49, lr: 1.32e-02, grad_scale: 8.0 +2022-12-07 14:30:48,711 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 14:31:00,806 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1313, 2.6207, 3.9144, 3.0788, 3.8298, 3.5281, 3.5741, 3.1182], + device='cuda:2'), covar=tensor([0.0441, 0.3517, 0.0810, 0.1899, 0.0683, 0.0925, 0.1886, 0.2770], + device='cuda:2'), in_proj_covar=tensor([0.0286, 0.0334, 0.0378, 0.0311, 0.0359, 0.0293, 0.0346, 0.0349], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 14:31:04,487 INFO [train.py:905] (2/4) Epoch 6, validation: loss=0.1224, simple_loss=0.1659, pruned_loss=0.03945, over 857387.00 frames. +2022-12-07 14:31:04,488 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 14:31:19,219 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.206e+02 2.431e+02 3.272e+02 4.116e+02 8.676e+02, threshold=6.543e+02, percent-clipped=8.0 +2022-12-07 14:31:37,454 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=40848.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:31:40,153 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40851.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:31:53,032 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40865.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:32:00,741 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40874.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:32:22,698 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40899.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:32:23,728 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40900.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:32:33,301 INFO [train.py:873] (2/4) Epoch 6, batch 3100, loss[loss=0.1483, simple_loss=0.1598, pruned_loss=0.06841, over 5988.00 frames. ], tot_loss[loss=0.1714, simple_loss=0.1862, pruned_loss=0.07832, over 1897451.10 frames. ], batch size: 100, lr: 1.32e-02, grad_scale: 8.0 +2022-12-07 14:32:43,004 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40922.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:32:46,557 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40926.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:32:47,206 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.239e+02 2.510e+02 3.195e+02 3.907e+02 1.074e+03, threshold=6.390e+02, percent-clipped=2.0 +2022-12-07 14:32:48,232 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40928.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:33:17,797 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=40961.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 14:33:30,785 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=40976.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:33:40,124 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=40986.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:33:49,441 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=40997.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:33:53,884 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.82 vs. limit=5.0 +2022-12-07 14:33:58,664 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41007.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:34:01,821 INFO [train.py:873] (2/4) Epoch 6, batch 3200, loss[loss=0.1848, simple_loss=0.1989, pruned_loss=0.08531, over 14215.00 frames. ], tot_loss[loss=0.1701, simple_loss=0.186, pruned_loss=0.07706, over 1963168.87 frames. ], batch size: 35, lr: 1.32e-02, grad_scale: 8.0 +2022-12-07 14:34:16,163 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.950e+01 2.679e+02 3.334e+02 4.339e+02 1.581e+03, threshold=6.667e+02, percent-clipped=5.0 +2022-12-07 14:34:22,464 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41034.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:34:32,082 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41045.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:34:35,548 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8982, 1.7337, 4.4362, 2.1791, 4.1579, 4.6516, 4.3735, 5.1353], + device='cuda:2'), covar=tensor([0.0149, 0.2891, 0.0327, 0.2068, 0.0308, 0.0298, 0.0199, 0.0128], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0156, 0.0139, 0.0168, 0.0156, 0.0153, 0.0123, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:34:37,431 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4169, 2.0598, 3.3118, 3.4438, 3.3588, 2.1356, 3.4109, 2.5860], + device='cuda:2'), covar=tensor([0.0186, 0.0439, 0.0331, 0.0193, 0.0153, 0.0669, 0.0120, 0.0505], + device='cuda:2'), in_proj_covar=tensor([0.0210, 0.0208, 0.0313, 0.0244, 0.0197, 0.0257, 0.0199, 0.0248], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 14:34:40,019 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0475, 3.1285, 3.2995, 3.0690, 3.1991, 2.7959, 1.2855, 3.0074], + device='cuda:2'), covar=tensor([0.0310, 0.0347, 0.0451, 0.0407, 0.0349, 0.0733, 0.3071, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0139, 0.0123, 0.0118, 0.0169, 0.0117, 0.0149, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:34:43,563 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41058.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:35:09,488 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2798, 1.7359, 2.2288, 2.0089, 2.4316, 2.1857, 2.0810, 2.0868], + device='cuda:2'), covar=tensor([0.0285, 0.1453, 0.0241, 0.0617, 0.0275, 0.0454, 0.0307, 0.0561], + device='cuda:2'), in_proj_covar=tensor([0.0279, 0.0332, 0.0369, 0.0306, 0.0346, 0.0286, 0.0342, 0.0339], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 14:35:19,246 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5724, 3.3568, 3.2284, 3.6925, 3.1891, 2.9126, 3.5924, 3.5821], + device='cuda:2'), covar=tensor([0.0791, 0.0730, 0.0861, 0.0608, 0.0908, 0.0737, 0.0729, 0.0709], + device='cuda:2'), in_proj_covar=tensor([0.0112, 0.0099, 0.0116, 0.0116, 0.0117, 0.0091, 0.0130, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:35:22,849 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41102.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:35:26,830 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41106.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:35:26,858 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8063, 3.3648, 2.6602, 3.8709, 3.7982, 3.7595, 3.0965, 2.5902], + device='cuda:2'), covar=tensor([0.0544, 0.1102, 0.3470, 0.0551, 0.0552, 0.1111, 0.1179, 0.4128], + device='cuda:2'), in_proj_covar=tensor([0.0227, 0.0300, 0.0290, 0.0192, 0.0255, 0.0257, 0.0259, 0.0283], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:35:30,961 INFO [train.py:873] (2/4) Epoch 6, batch 3300, loss[loss=0.198, simple_loss=0.2048, pruned_loss=0.09563, over 14127.00 frames. ], tot_loss[loss=0.1706, simple_loss=0.186, pruned_loss=0.07766, over 1955934.42 frames. ], batch size: 99, lr: 1.32e-02, grad_scale: 8.0 +2022-12-07 14:35:45,225 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.442e+02 2.460e+02 3.115e+02 3.768e+02 7.967e+02, threshold=6.230e+02, percent-clipped=3.0 +2022-12-07 14:35:48,753 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1394, 1.0492, 0.8553, 1.0136, 1.2446, 1.1401, 1.2630, 1.1324], + device='cuda:2'), covar=tensor([0.1012, 0.3244, 0.0972, 0.1185, 0.0850, 0.0483, 0.0514, 0.0790], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0010, 0.0010, 0.0012, 0.0015, 0.0012, 0.0016], + device='cuda:2'), out_proj_covar=tensor([6.6641e-05, 7.0318e-05, 6.4679e-05, 6.4525e-05, 6.9223e-05, 9.2350e-05, + 7.8413e-05, 8.9376e-05], device='cuda:2') +2022-12-07 14:36:04,026 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41148.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:36:06,463 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1860, 0.9221, 1.1423, 1.2429, 1.1102, 0.7393, 1.3180, 1.1042], + device='cuda:2'), covar=tensor([0.0934, 0.1148, 0.0455, 0.0833, 0.1169, 0.0620, 0.0567, 0.1010], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0018, 0.0018, 0.0019, 0.0026, 0.0020, 0.0018], + device='cuda:2'), out_proj_covar=tensor([8.0468e-05, 8.4660e-05, 7.5179e-05, 7.9827e-05, 8.2438e-05, 1.0409e-04, + 8.7163e-05, 7.9609e-05], device='cuda:2') +2022-12-07 14:36:17,035 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41163.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:36:42,656 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7306, 4.4847, 4.2532, 4.3781, 4.4078, 4.6550, 4.7652, 4.7245], + device='cuda:2'), covar=tensor([0.0948, 0.0564, 0.1917, 0.2709, 0.0810, 0.0628, 0.0761, 0.0744], + device='cuda:2'), in_proj_covar=tensor([0.0307, 0.0228, 0.0369, 0.0457, 0.0271, 0.0342, 0.0346, 0.0285], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 14:36:46,080 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41196.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:36:59,351 INFO [train.py:873] (2/4) Epoch 6, batch 3400, loss[loss=0.1343, simple_loss=0.1624, pruned_loss=0.05314, over 14144.00 frames. ], tot_loss[loss=0.1705, simple_loss=0.1861, pruned_loss=0.07744, over 2026470.54 frames. ], batch size: 25, lr: 1.31e-02, grad_scale: 8.0 +2022-12-07 14:37:08,227 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41221.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:37:13,996 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.559e+02 2.427e+02 3.173e+02 3.892e+02 5.965e+02, threshold=6.346e+02, percent-clipped=0.0 +2022-12-07 14:37:15,082 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1451, 2.1356, 4.9973, 4.4654, 4.5559, 5.0688, 4.9394, 5.1072], + device='cuda:2'), covar=tensor([0.1079, 0.1095, 0.0058, 0.0101, 0.0107, 0.0066, 0.0056, 0.0079], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0157, 0.0107, 0.0149, 0.0121, 0.0126, 0.0099, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 14:37:39,695 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41256.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 14:37:41,444 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7726, 1.3702, 1.8022, 1.2494, 1.4480, 1.7903, 1.5077, 1.5285], + device='cuda:2'), covar=tensor([0.0660, 0.1184, 0.0798, 0.1026, 0.1331, 0.0925, 0.0508, 0.1873], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0197, 0.0120, 0.0126, 0.0106, 0.0111, 0.0089, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:38:25,187 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41307.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:38:25,467 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-12-07 14:38:28,734 INFO [train.py:873] (2/4) Epoch 6, batch 3500, loss[loss=0.1529, simple_loss=0.1793, pruned_loss=0.06326, over 14221.00 frames. ], tot_loss[loss=0.1688, simple_loss=0.1852, pruned_loss=0.07625, over 2033538.78 frames. ], batch size: 60, lr: 1.31e-02, grad_scale: 8.0 +2022-12-07 14:38:43,211 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.111e+02 2.629e+02 3.345e+02 4.291e+02 7.270e+02, threshold=6.690e+02, percent-clipped=2.0 +2022-12-07 14:38:44,235 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.8024, 5.2619, 5.2925, 5.8509, 5.2245, 5.0882, 5.8790, 5.7127], + device='cuda:2'), covar=tensor([0.0521, 0.0537, 0.0552, 0.0452, 0.0675, 0.0322, 0.0488, 0.0479], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0102, 0.0117, 0.0117, 0.0118, 0.0093, 0.0133, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:38:50,570 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6371, 1.9390, 1.8517, 1.8787, 1.7549, 2.0714, 1.5741, 1.0906], + device='cuda:2'), covar=tensor([0.2083, 0.0759, 0.1099, 0.0481, 0.0922, 0.0518, 0.2318, 0.3857], + device='cuda:2'), in_proj_covar=tensor([0.0157, 0.0063, 0.0050, 0.0054, 0.0077, 0.0058, 0.0084, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:38:51,910 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.13 vs. limit=5.0 +2022-12-07 14:39:06,523 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41353.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:39:08,307 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41355.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:39:24,493 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-07 14:39:26,129 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.81 vs. limit=2.0 +2022-12-07 14:39:40,337 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-07 14:39:45,120 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8440, 1.4763, 1.8103, 1.2262, 1.3647, 1.7960, 1.5348, 1.4879], + device='cuda:2'), covar=tensor([0.0499, 0.0953, 0.0631, 0.1002, 0.1219, 0.0752, 0.0540, 0.1771], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0200, 0.0121, 0.0127, 0.0107, 0.0110, 0.0091, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:39:48,938 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41401.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:39:57,988 INFO [train.py:873] (2/4) Epoch 6, batch 3600, loss[loss=0.1691, simple_loss=0.1796, pruned_loss=0.07933, over 6885.00 frames. ], tot_loss[loss=0.1669, simple_loss=0.1834, pruned_loss=0.07518, over 1927806.66 frames. ], batch size: 100, lr: 1.31e-02, grad_scale: 8.0 +2022-12-07 14:40:03,183 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9295, 4.6926, 5.1761, 4.2295, 4.9288, 5.3433, 1.5556, 4.6801], + device='cuda:2'), covar=tensor([0.0148, 0.0215, 0.0347, 0.0329, 0.0245, 0.0081, 0.3166, 0.0174], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0142, 0.0126, 0.0120, 0.0175, 0.0121, 0.0153, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:40:11,738 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.067e+02 2.550e+02 3.176e+02 4.214e+02 9.343e+02, threshold=6.353e+02, percent-clipped=2.0 +2022-12-07 14:40:38,998 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=41458.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:40:43,623 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.6333, 5.2937, 5.1593, 5.6823, 5.1983, 4.8737, 5.6846, 5.6961], + device='cuda:2'), covar=tensor([0.0645, 0.0576, 0.0617, 0.0561, 0.0556, 0.0428, 0.0627, 0.0540], + device='cuda:2'), in_proj_covar=tensor([0.0112, 0.0101, 0.0117, 0.0118, 0.0118, 0.0092, 0.0132, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:41:25,999 INFO [train.py:873] (2/4) Epoch 6, batch 3700, loss[loss=0.1841, simple_loss=0.1955, pruned_loss=0.08631, over 14145.00 frames. ], tot_loss[loss=0.168, simple_loss=0.1845, pruned_loss=0.07577, over 2001299.80 frames. ], batch size: 84, lr: 1.31e-02, grad_scale: 8.0 +2022-12-07 14:41:34,690 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41521.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:41:39,215 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8304, 1.4369, 1.8083, 1.2791, 1.4876, 1.8404, 1.5595, 1.5565], + device='cuda:2'), covar=tensor([0.0381, 0.1066, 0.0538, 0.0926, 0.0949, 0.0483, 0.0426, 0.1301], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0203, 0.0122, 0.0128, 0.0109, 0.0111, 0.0090, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:41:39,913 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.379e+02 2.533e+02 3.248e+02 4.162e+02 7.233e+02, threshold=6.497e+02, percent-clipped=2.0 +2022-12-07 14:42:06,456 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41556.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 14:42:18,100 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41569.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:42:49,100 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41604.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:42:49,225 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2352, 1.6987, 2.1724, 2.0314, 2.3259, 2.0635, 1.8911, 2.0783], + device='cuda:2'), covar=tensor([0.0229, 0.1288, 0.0304, 0.0568, 0.0193, 0.0392, 0.0143, 0.0431], + device='cuda:2'), in_proj_covar=tensor([0.0289, 0.0334, 0.0377, 0.0313, 0.0358, 0.0291, 0.0347, 0.0348], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 14:42:52,518 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0820, 2.0050, 1.7178, 1.7387, 2.0348, 2.0148, 2.0281, 2.0063], + device='cuda:2'), covar=tensor([0.1101, 0.0809, 0.2604, 0.2877, 0.1116, 0.1058, 0.1556, 0.1059], + device='cuda:2'), in_proj_covar=tensor([0.0311, 0.0226, 0.0373, 0.0466, 0.0271, 0.0344, 0.0347, 0.0288], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 14:42:55,067 INFO [train.py:873] (2/4) Epoch 6, batch 3800, loss[loss=0.15, simple_loss=0.1754, pruned_loss=0.06229, over 14097.00 frames. ], tot_loss[loss=0.1681, simple_loss=0.1847, pruned_loss=0.07576, over 2007842.54 frames. ], batch size: 29, lr: 1.31e-02, grad_scale: 16.0 +2022-12-07 14:43:09,832 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.285e+02 2.547e+02 3.241e+02 4.303e+02 1.041e+03, threshold=6.482e+02, percent-clipped=5.0 +2022-12-07 14:43:33,350 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41653.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:43:34,336 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41654.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:43:48,094 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.75 vs. limit=2.0 +2022-12-07 14:43:59,543 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.45 vs. limit=5.0 +2022-12-07 14:44:03,753 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8605, 1.5558, 3.8323, 3.6289, 3.7203, 3.9069, 3.2611, 3.9192], + device='cuda:2'), covar=tensor([0.1245, 0.1294, 0.0082, 0.0148, 0.0132, 0.0085, 0.0166, 0.0089], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0156, 0.0107, 0.0151, 0.0121, 0.0126, 0.0098, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 14:44:04,324 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-07 14:44:15,992 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41701.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:44:16,092 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41701.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:44:24,886 INFO [train.py:873] (2/4) Epoch 6, batch 3900, loss[loss=0.2026, simple_loss=0.2027, pruned_loss=0.1013, over 11146.00 frames. ], tot_loss[loss=0.1679, simple_loss=0.1845, pruned_loss=0.07561, over 2020832.99 frames. ], batch size: 100, lr: 1.31e-02, grad_scale: 16.0 +2022-12-07 14:44:28,606 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41715.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 14:44:38,880 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.336e+02 2.642e+02 3.297e+02 4.041e+02 8.886e+02, threshold=6.594e+02, percent-clipped=2.0 +2022-12-07 14:44:58,838 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41749.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:45:01,049 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.07 vs. limit=2.0 +2022-12-07 14:45:06,659 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=41758.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:45:17,547 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1101, 2.6926, 2.5718, 1.5070, 2.4399, 2.6373, 3.0610, 2.3110], + device='cuda:2'), covar=tensor([0.0730, 0.1966, 0.1261, 0.3001, 0.1540, 0.0643, 0.0688, 0.1871], + device='cuda:2'), in_proj_covar=tensor([0.0111, 0.0202, 0.0119, 0.0126, 0.0108, 0.0110, 0.0089, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:45:49,541 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=41806.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:45:54,441 INFO [train.py:873] (2/4) Epoch 6, batch 4000, loss[loss=0.1396, simple_loss=0.1698, pruned_loss=0.0547, over 13929.00 frames. ], tot_loss[loss=0.1677, simple_loss=0.1845, pruned_loss=0.07549, over 1981463.17 frames. ], batch size: 26, lr: 1.30e-02, grad_scale: 16.0 +2022-12-07 14:46:08,693 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.360e+02 2.207e+02 2.921e+02 3.758e+02 8.130e+02, threshold=5.842e+02, percent-clipped=2.0 +2022-12-07 14:47:18,566 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41906.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:47:22,766 INFO [train.py:873] (2/4) Epoch 6, batch 4100, loss[loss=0.1864, simple_loss=0.1797, pruned_loss=0.09659, over 5023.00 frames. ], tot_loss[loss=0.1692, simple_loss=0.185, pruned_loss=0.07666, over 1923488.13 frames. ], batch size: 100, lr: 1.30e-02, grad_scale: 8.0 +2022-12-07 14:47:37,464 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.378e+02 2.509e+02 3.055e+02 4.404e+02 7.346e+02, threshold=6.111e+02, percent-clipped=4.0 +2022-12-07 14:47:55,751 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=41948.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 14:48:10,233 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5531, 2.5905, 2.5760, 2.5500, 2.0777, 2.7720, 2.6956, 1.1487], + device='cuda:2'), covar=tensor([0.3675, 0.1180, 0.1071, 0.1191, 0.1347, 0.0764, 0.1091, 0.3987], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0062, 0.0050, 0.0054, 0.0077, 0.0057, 0.0083, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:48:11,846 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=41967.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:48:39,603 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9302, 0.7798, 0.9136, 1.0288, 0.7184, 0.6700, 0.5664, 0.5191], + device='cuda:2'), covar=tensor([0.0156, 0.0142, 0.0268, 0.0152, 0.0208, 0.0447, 0.0263, 0.0584], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0010, 0.0010, 0.0011, 0.0015, 0.0012, 0.0016], + device='cuda:2'), out_proj_covar=tensor([6.5621e-05, 6.8493e-05, 6.2537e-05, 6.2713e-05, 6.6939e-05, 9.1806e-05, + 7.6134e-05, 8.7944e-05], device='cuda:2') +2022-12-07 14:48:49,694 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42009.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 14:48:50,428 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42010.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 14:48:51,185 INFO [train.py:873] (2/4) Epoch 6, batch 4200, loss[loss=0.1547, simple_loss=0.1671, pruned_loss=0.07118, over 6905.00 frames. ], tot_loss[loss=0.1677, simple_loss=0.1837, pruned_loss=0.07589, over 1848772.17 frames. ], batch size: 100, lr: 1.30e-02, grad_scale: 8.0 +2022-12-07 14:48:59,178 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-12-07 14:49:04,384 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7642, 1.9376, 2.6281, 2.2186, 2.5713, 2.5467, 2.3883, 2.2112], + device='cuda:2'), covar=tensor([0.0387, 0.2797, 0.0824, 0.1749, 0.0460, 0.0671, 0.0817, 0.1930], + device='cuda:2'), in_proj_covar=tensor([0.0291, 0.0333, 0.0375, 0.0315, 0.0354, 0.0292, 0.0346, 0.0345], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 14:49:07,157 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.456e+02 2.446e+02 2.869e+02 3.541e+02 9.255e+02, threshold=5.737e+02, percent-clipped=3.0 +2022-12-07 14:49:09,994 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1750, 2.1581, 3.1770, 3.2455, 3.1896, 2.1715, 3.2208, 2.4533], + device='cuda:2'), covar=tensor([0.0169, 0.0393, 0.0385, 0.0198, 0.0154, 0.0643, 0.0117, 0.0477], + device='cuda:2'), in_proj_covar=tensor([0.0211, 0.0209, 0.0310, 0.0247, 0.0196, 0.0256, 0.0197, 0.0243], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 14:49:23,290 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-07 14:49:55,046 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9223, 2.5813, 3.5532, 2.8391, 3.6441, 3.5584, 3.4493, 2.9832], + device='cuda:2'), covar=tensor([0.0554, 0.2845, 0.1070, 0.1954, 0.0672, 0.0698, 0.1590, 0.1999], + device='cuda:2'), in_proj_covar=tensor([0.0287, 0.0329, 0.0367, 0.0311, 0.0349, 0.0290, 0.0344, 0.0342], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 14:49:59,354 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3681, 3.2828, 3.5750, 3.2914, 3.3945, 3.1239, 1.3368, 3.1584], + device='cuda:2'), covar=tensor([0.0234, 0.0335, 0.0337, 0.0381, 0.0307, 0.0503, 0.2986, 0.0284], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0140, 0.0124, 0.0117, 0.0171, 0.0120, 0.0151, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:50:21,093 INFO [train.py:873] (2/4) Epoch 6, batch 4300, loss[loss=0.1682, simple_loss=0.1939, pruned_loss=0.0713, over 14395.00 frames. ], tot_loss[loss=0.1676, simple_loss=0.1841, pruned_loss=0.07558, over 1972688.89 frames. ], batch size: 41, lr: 1.30e-02, grad_scale: 8.0 +2022-12-07 14:50:35,780 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.072e+01 2.450e+02 2.996e+02 3.613e+02 8.545e+02, threshold=5.992e+02, percent-clipped=0.0 +2022-12-07 14:51:17,833 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4659, 2.1697, 2.8876, 1.8307, 2.0494, 2.4850, 1.2915, 2.4221], + device='cuda:2'), covar=tensor([0.1144, 0.1387, 0.0821, 0.2635, 0.2754, 0.1365, 0.5460, 0.1013], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0077, 0.0077, 0.0081, 0.0107, 0.0068, 0.0132, 0.0073], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 14:51:38,317 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0842, 2.0040, 1.6695, 1.7180, 2.0399, 2.0007, 2.0529, 1.9971], + device='cuda:2'), covar=tensor([0.1081, 0.1001, 0.2784, 0.3262, 0.1116, 0.1154, 0.1631, 0.1176], + device='cuda:2'), in_proj_covar=tensor([0.0311, 0.0225, 0.0378, 0.0466, 0.0269, 0.0343, 0.0342, 0.0290], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 14:51:41,343 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8047, 2.9421, 2.7332, 2.9397, 2.2513, 3.1381, 2.6605, 1.1271], + device='cuda:2'), covar=tensor([0.2702, 0.0801, 0.1097, 0.0807, 0.1194, 0.0442, 0.1568, 0.3885], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0062, 0.0051, 0.0053, 0.0077, 0.0056, 0.0082, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 14:51:51,141 INFO [train.py:873] (2/4) Epoch 6, batch 4400, loss[loss=0.1748, simple_loss=0.1781, pruned_loss=0.08575, over 4952.00 frames. ], tot_loss[loss=0.1673, simple_loss=0.1841, pruned_loss=0.07521, over 2033557.37 frames. ], batch size: 100, lr: 1.30e-02, grad_scale: 8.0 +2022-12-07 14:52:06,374 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.325e+02 2.637e+02 3.076e+02 3.933e+02 7.884e+02, threshold=6.152e+02, percent-clipped=2.0 +2022-12-07 14:52:09,087 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0407, 2.0776, 2.3162, 1.5271, 1.6737, 2.1010, 1.1202, 2.0233], + device='cuda:2'), covar=tensor([0.0678, 0.1071, 0.0515, 0.1938, 0.2425, 0.0590, 0.4287, 0.0645], + device='cuda:2'), in_proj_covar=tensor([0.0072, 0.0078, 0.0079, 0.0083, 0.0109, 0.0069, 0.0134, 0.0075], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 14:52:17,530 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-07 14:52:36,370 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42262.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:53:14,112 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42304.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 14:53:19,690 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=42310.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 14:53:20,404 INFO [train.py:873] (2/4) Epoch 6, batch 4500, loss[loss=0.1951, simple_loss=0.196, pruned_loss=0.09711, over 7771.00 frames. ], tot_loss[loss=0.1661, simple_loss=0.1833, pruned_loss=0.07447, over 2040655.19 frames. ], batch size: 100, lr: 1.30e-02, grad_scale: 8.0 +2022-12-07 14:53:35,323 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.162e+02 2.450e+02 2.856e+02 3.817e+02 6.882e+02, threshold=5.713e+02, percent-clipped=1.0 +2022-12-07 14:54:02,627 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=42358.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:54:22,178 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42379.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:54:23,737 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5120, 3.1553, 3.1918, 3.4231, 3.3770, 3.4012, 3.4990, 2.9837], + device='cuda:2'), covar=tensor([0.0403, 0.1043, 0.0403, 0.0481, 0.0663, 0.0327, 0.0565, 0.0441], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0228, 0.0151, 0.0145, 0.0150, 0.0121, 0.0229, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 14:54:27,564 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2575, 1.3655, 1.2479, 1.3593, 1.1776, 0.8632, 0.8591, 0.8476], + device='cuda:2'), covar=tensor([0.0529, 0.0701, 0.0492, 0.0321, 0.0303, 0.0273, 0.0274, 0.0580], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0011, 0.0010, 0.0009, 0.0011, 0.0015, 0.0011, 0.0015], + device='cuda:2'), out_proj_covar=tensor([6.5202e-05, 6.8140e-05, 6.2361e-05, 6.1424e-05, 6.7553e-05, 8.9356e-05, + 7.3965e-05, 8.5442e-05], device='cuda:2') +2022-12-07 14:54:50,838 INFO [train.py:873] (2/4) Epoch 6, batch 4600, loss[loss=0.1408, simple_loss=0.1747, pruned_loss=0.05344, over 14459.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.183, pruned_loss=0.07413, over 2025130.60 frames. ], batch size: 24, lr: 1.30e-02, grad_scale: 8.0 +2022-12-07 14:55:06,687 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.318e+02 2.342e+02 3.312e+02 4.722e+02 1.011e+03, threshold=6.623e+02, percent-clipped=9.0 +2022-12-07 14:55:17,226 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42440.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:55:41,185 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6407, 1.3179, 3.0147, 1.3310, 3.0692, 2.8488, 2.0985, 3.0055], + device='cuda:2'), covar=tensor([0.0417, 0.3836, 0.0445, 0.2943, 0.0430, 0.0575, 0.1015, 0.0381], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0155, 0.0139, 0.0166, 0.0151, 0.0150, 0.0121, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:56:10,011 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 14:56:15,918 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42506.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:56:20,487 INFO [train.py:873] (2/4) Epoch 6, batch 4700, loss[loss=0.168, simple_loss=0.1785, pruned_loss=0.07873, over 6964.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.1819, pruned_loss=0.07336, over 2015199.34 frames. ], batch size: 100, lr: 1.29e-02, grad_scale: 8.0 +2022-12-07 14:56:30,844 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-12-07 14:56:33,204 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5519, 2.2985, 4.6318, 3.0868, 4.3914, 1.9873, 3.3524, 4.4231], + device='cuda:2'), covar=tensor([0.0404, 0.4946, 0.0272, 0.8687, 0.0373, 0.3955, 0.1282, 0.0255], + device='cuda:2'), in_proj_covar=tensor([0.0221, 0.0245, 0.0173, 0.0339, 0.0186, 0.0251, 0.0239, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 14:56:35,464 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.003e+02 2.212e+02 2.979e+02 3.531e+02 6.343e+02, threshold=5.959e+02, percent-clipped=0.0 +2022-12-07 14:57:06,001 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=42562.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:57:10,665 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42567.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:57:22,459 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9487, 2.5045, 4.1494, 4.3744, 4.3751, 2.6226, 4.3251, 3.4194], + device='cuda:2'), covar=tensor([0.0165, 0.0389, 0.0312, 0.0162, 0.0100, 0.0629, 0.0127, 0.0372], + device='cuda:2'), in_proj_covar=tensor([0.0209, 0.0205, 0.0311, 0.0244, 0.0195, 0.0252, 0.0198, 0.0240], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 14:57:41,931 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1559, 1.9332, 2.0650, 2.1247, 2.0766, 2.0101, 2.2010, 1.8366], + device='cuda:2'), covar=tensor([0.0708, 0.1487, 0.0733, 0.0723, 0.0973, 0.0816, 0.0902, 0.0732], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0231, 0.0152, 0.0146, 0.0153, 0.0124, 0.0232, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 14:57:43,816 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=42604.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 14:57:48,833 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=42610.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:57:49,693 INFO [train.py:873] (2/4) Epoch 6, batch 4800, loss[loss=0.1439, simple_loss=0.1722, pruned_loss=0.0578, over 14643.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.1816, pruned_loss=0.07353, over 1940548.51 frames. ], batch size: 23, lr: 1.29e-02, grad_scale: 8.0 +2022-12-07 14:58:05,119 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.276e+02 2.352e+02 2.966e+02 3.514e+02 5.979e+02, threshold=5.933e+02, percent-clipped=1.0 +2022-12-07 14:58:15,338 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-12-07 14:58:23,596 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42649.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:58:25,952 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=42652.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 14:59:18,060 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42710.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 14:59:18,816 INFO [train.py:873] (2/4) Epoch 6, batch 4900, loss[loss=0.1367, simple_loss=0.1677, pruned_loss=0.05289, over 14615.00 frames. ], tot_loss[loss=0.1663, simple_loss=0.1834, pruned_loss=0.07456, over 1989637.92 frames. ], batch size: 22, lr: 1.29e-02, grad_scale: 8.0 +2022-12-07 14:59:23,855 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.75 vs. limit=5.0 +2022-12-07 14:59:24,242 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3413, 3.7353, 2.9804, 4.7876, 4.2335, 4.4319, 3.8149, 3.3154], + device='cuda:2'), covar=tensor([0.0725, 0.1470, 0.4533, 0.0384, 0.1000, 0.1539, 0.1041, 0.3348], + device='cuda:2'), in_proj_covar=tensor([0.0235, 0.0303, 0.0296, 0.0199, 0.0261, 0.0267, 0.0255, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 14:59:33,472 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.457e+02 2.456e+02 3.213e+02 4.119e+02 7.897e+02, threshold=6.426e+02, percent-clipped=4.0 +2022-12-07 14:59:40,193 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42735.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:00:47,920 INFO [train.py:873] (2/4) Epoch 6, batch 5000, loss[loss=0.1827, simple_loss=0.1984, pruned_loss=0.08349, over 14209.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.1828, pruned_loss=0.07348, over 2071118.03 frames. ], batch size: 94, lr: 1.29e-02, grad_scale: 8.0 +2022-12-07 15:01:03,357 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.594e+01 2.343e+02 3.154e+02 4.045e+02 6.991e+02, threshold=6.308e+02, percent-clipped=1.0 +2022-12-07 15:01:30,709 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42859.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:01:33,687 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=42862.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:01:44,460 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6168, 4.4264, 4.7697, 4.0133, 4.4849, 4.8292, 1.6931, 4.2155], + device='cuda:2'), covar=tensor([0.0190, 0.0281, 0.0355, 0.0488, 0.0329, 0.0143, 0.3254, 0.0258], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0144, 0.0124, 0.0119, 0.0174, 0.0119, 0.0152, 0.0163], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:01:48,820 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=42879.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:02:16,483 INFO [train.py:873] (2/4) Epoch 6, batch 5100, loss[loss=0.1554, simple_loss=0.1793, pruned_loss=0.06574, over 13964.00 frames. ], tot_loss[loss=0.1657, simple_loss=0.183, pruned_loss=0.07418, over 2009131.62 frames. ], batch size: 26, lr: 1.29e-02, grad_scale: 8.0 +2022-12-07 15:02:24,171 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42920.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:02:30,811 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.301e+02 2.401e+02 2.989e+02 4.009e+02 7.834e+02, threshold=5.978e+02, percent-clipped=3.0 +2022-12-07 15:02:32,088 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.38 vs. limit=5.0 +2022-12-07 15:02:42,216 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=42940.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:03:13,680 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9334, 2.0620, 2.2106, 2.1884, 1.8043, 2.2399, 1.9591, 1.0942], + device='cuda:2'), covar=tensor([0.1499, 0.0898, 0.0792, 0.0456, 0.0959, 0.0596, 0.1298, 0.3308], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0063, 0.0050, 0.0054, 0.0078, 0.0058, 0.0083, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 15:03:40,340 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=43005.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:03:45,518 INFO [train.py:873] (2/4) Epoch 6, batch 5200, loss[loss=0.1662, simple_loss=0.1898, pruned_loss=0.07132, over 14523.00 frames. ], tot_loss[loss=0.1679, simple_loss=0.1842, pruned_loss=0.07586, over 1949492.05 frames. ], batch size: 34, lr: 1.29e-02, grad_scale: 8.0 +2022-12-07 15:04:01,018 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.446e+02 2.408e+02 3.132e+02 3.708e+02 6.690e+02, threshold=6.264e+02, percent-clipped=2.0 +2022-12-07 15:04:07,517 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43035.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:04:24,375 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-07 15:04:50,228 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43083.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:05:09,564 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7473, 2.9864, 4.1092, 2.9080, 2.5169, 2.9342, 1.8615, 3.4207], + device='cuda:2'), covar=tensor([0.1491, 0.1082, 0.0470, 0.1756, 0.2253, 0.1183, 0.4638, 0.0789], + device='cuda:2'), in_proj_covar=tensor([0.0072, 0.0079, 0.0076, 0.0083, 0.0108, 0.0068, 0.0131, 0.0072], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:05:15,649 INFO [train.py:873] (2/4) Epoch 6, batch 5300, loss[loss=0.1316, simple_loss=0.1645, pruned_loss=0.04934, over 14250.00 frames. ], tot_loss[loss=0.1676, simple_loss=0.1839, pruned_loss=0.07566, over 1906594.30 frames. ], batch size: 39, lr: 1.29e-02, grad_scale: 8.0 +2022-12-07 15:05:20,762 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9846, 1.4043, 3.9157, 1.4416, 3.9453, 4.0756, 3.1072, 4.4641], + device='cuda:2'), covar=tensor([0.0196, 0.2752, 0.0297, 0.2287, 0.0275, 0.0247, 0.0444, 0.0117], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0158, 0.0140, 0.0169, 0.0152, 0.0153, 0.0121, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:05:29,870 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.587e+02 2.504e+02 3.046e+02 3.756e+02 7.408e+02, threshold=6.093e+02, percent-clipped=3.0 +2022-12-07 15:06:00,661 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43162.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:06:43,862 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43210.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:06:44,600 INFO [train.py:873] (2/4) Epoch 6, batch 5400, loss[loss=0.1726, simple_loss=0.1633, pruned_loss=0.09094, over 3787.00 frames. ], tot_loss[loss=0.1677, simple_loss=0.184, pruned_loss=0.07567, over 1862339.81 frames. ], batch size: 100, lr: 1.28e-02, grad_scale: 8.0 +2022-12-07 15:06:44,843 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4921, 1.8953, 1.3758, 1.7175, 1.9024, 1.4356, 1.7820, 1.2608], + device='cuda:2'), covar=tensor([0.2671, 0.2056, 0.1091, 0.0641, 0.0853, 0.0694, 0.0378, 0.1017], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0010, 0.0010, 0.0011, 0.0015, 0.0012, 0.0016], + device='cuda:2'), out_proj_covar=tensor([6.5927e-05, 7.0652e-05, 6.3984e-05, 6.4193e-05, 6.8908e-05, 9.2792e-05, + 7.7556e-05, 8.9207e-05], device='cuda:2') +2022-12-07 15:06:47,972 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=43215.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:06:56,623 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5344, 4.3608, 4.2181, 4.6165, 4.2396, 3.5916, 4.5882, 4.4577], + device='cuda:2'), covar=tensor([0.0661, 0.0596, 0.0619, 0.0478, 0.0669, 0.0592, 0.0612, 0.0627], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0102, 0.0118, 0.0120, 0.0121, 0.0094, 0.0133, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 15:06:59,607 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.808e+01 2.467e+02 2.944e+02 3.777e+02 9.316e+02, threshold=5.888e+02, percent-clipped=3.0 +2022-12-07 15:07:06,172 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=43235.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:07:12,094 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-12-07 15:07:14,048 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7282, 4.7628, 4.0421, 4.0647, 4.4096, 4.7000, 5.0116, 4.8140], + device='cuda:2'), covar=tensor([0.1446, 0.0511, 0.2506, 0.3944, 0.0995, 0.1177, 0.0990, 0.1346], + device='cuda:2'), in_proj_covar=tensor([0.0312, 0.0227, 0.0382, 0.0471, 0.0269, 0.0349, 0.0349, 0.0292], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:07:23,387 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8735, 4.6044, 4.5164, 4.9458, 4.4867, 4.2437, 4.9389, 4.7527], + device='cuda:2'), covar=tensor([0.0770, 0.0569, 0.0568, 0.0559, 0.0730, 0.0422, 0.0572, 0.0728], + device='cuda:2'), in_proj_covar=tensor([0.0116, 0.0103, 0.0118, 0.0121, 0.0122, 0.0094, 0.0134, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 15:07:59,651 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8963, 2.5741, 3.5144, 2.3993, 2.2905, 2.6620, 1.5167, 2.9311], + device='cuda:2'), covar=tensor([0.1325, 0.1344, 0.0593, 0.2945, 0.2662, 0.1274, 0.4993, 0.1228], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0081, 0.0079, 0.0086, 0.0112, 0.0070, 0.0136, 0.0075], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:08:08,951 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43305.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:08:14,522 INFO [train.py:873] (2/4) Epoch 6, batch 5500, loss[loss=0.2163, simple_loss=0.1856, pruned_loss=0.1235, over 1234.00 frames. ], tot_loss[loss=0.1659, simple_loss=0.183, pruned_loss=0.07442, over 1939180.33 frames. ], batch size: 100, lr: 1.28e-02, grad_scale: 8.0 +2022-12-07 15:08:27,132 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.43 vs. limit=2.0 +2022-12-07 15:08:28,979 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.312e+02 2.280e+02 2.901e+02 3.837e+02 7.523e+02, threshold=5.802e+02, percent-clipped=2.0 +2022-12-07 15:08:38,894 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-07 15:08:45,811 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0637, 4.8201, 4.6620, 5.1414, 4.7324, 4.2810, 5.1016, 5.0065], + device='cuda:2'), covar=tensor([0.0541, 0.0517, 0.0499, 0.0388, 0.0516, 0.0425, 0.0494, 0.0494], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0102, 0.0116, 0.0119, 0.0121, 0.0093, 0.0132, 0.0114], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 15:08:46,269 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.36 vs. limit=5.0 +2022-12-07 15:08:52,282 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43353.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:09:37,212 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-07 15:09:42,228 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8550, 3.5391, 2.9140, 2.2272, 3.1060, 3.3176, 3.5798, 2.8508], + device='cuda:2'), covar=tensor([0.0481, 0.2266, 0.1266, 0.2620, 0.0949, 0.0591, 0.1058, 0.1906], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0198, 0.0120, 0.0129, 0.0111, 0.0113, 0.0092, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 15:09:43,895 INFO [train.py:873] (2/4) Epoch 6, batch 5600, loss[loss=0.1605, simple_loss=0.1449, pruned_loss=0.08807, over 2658.00 frames. ], tot_loss[loss=0.1655, simple_loss=0.1827, pruned_loss=0.07419, over 1938719.43 frames. ], batch size: 100, lr: 1.28e-02, grad_scale: 8.0 +2022-12-07 15:09:59,516 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.410e+02 2.262e+02 2.794e+02 3.488e+02 5.213e+02, threshold=5.588e+02, percent-clipped=0.0 +2022-12-07 15:10:01,801 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5112, 2.2669, 3.2837, 2.5805, 3.3376, 3.2125, 3.0276, 2.6569], + device='cuda:2'), covar=tensor([0.0485, 0.3371, 0.0913, 0.2298, 0.0812, 0.0892, 0.1288, 0.2410], + device='cuda:2'), in_proj_covar=tensor([0.0288, 0.0331, 0.0372, 0.0308, 0.0355, 0.0299, 0.0354, 0.0339], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 15:10:09,181 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.85 vs. limit=2.0 +2022-12-07 15:10:14,518 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.17 vs. limit=5.0 +2022-12-07 15:10:19,446 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-07 15:11:07,323 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5281, 2.6020, 2.6519, 2.6223, 1.9644, 2.7771, 2.6119, 1.1629], + device='cuda:2'), covar=tensor([0.2637, 0.0896, 0.1330, 0.0693, 0.1265, 0.0548, 0.1074, 0.3636], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0062, 0.0050, 0.0054, 0.0079, 0.0057, 0.0082, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 15:11:13,951 INFO [train.py:873] (2/4) Epoch 6, batch 5700, loss[loss=0.2106, simple_loss=0.2017, pruned_loss=0.1098, over 7783.00 frames. ], tot_loss[loss=0.1668, simple_loss=0.1834, pruned_loss=0.07508, over 1987618.13 frames. ], batch size: 100, lr: 1.28e-02, grad_scale: 8.0 +2022-12-07 15:11:16,593 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=43514.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:11:17,466 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43515.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:11:28,312 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.154e+02 2.452e+02 3.294e+02 4.312e+02 1.114e+03, threshold=6.587e+02, percent-clipped=10.0 +2022-12-07 15:11:34,574 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=43535.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:11:59,859 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43563.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:12:10,445 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=43575.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:12:16,687 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8099, 1.4872, 1.8067, 2.0609, 1.3545, 1.6985, 1.9658, 1.9111], + device='cuda:2'), covar=tensor([0.0047, 0.0079, 0.0038, 0.0022, 0.0077, 0.0080, 0.0042, 0.0035], + device='cuda:2'), in_proj_covar=tensor([0.0212, 0.0206, 0.0310, 0.0246, 0.0197, 0.0251, 0.0201, 0.0239], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 15:12:17,485 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=43583.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:12:42,161 INFO [train.py:873] (2/4) Epoch 6, batch 5800, loss[loss=0.1749, simple_loss=0.1956, pruned_loss=0.0771, over 14285.00 frames. ], tot_loss[loss=0.1669, simple_loss=0.1834, pruned_loss=0.07523, over 1938684.06 frames. ], batch size: 31, lr: 1.28e-02, grad_scale: 8.0 +2022-12-07 15:12:55,364 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.73 vs. limit=2.0 +2022-12-07 15:12:57,493 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.271e+02 2.484e+02 3.216e+02 3.939e+02 1.250e+03, threshold=6.432e+02, percent-clipped=4.0 +2022-12-07 15:14:08,817 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2650, 3.3829, 3.0178, 4.5060, 4.0800, 4.2329, 3.4063, 2.9134], + device='cuda:2'), covar=tensor([0.0656, 0.1555, 0.4404, 0.0360, 0.0856, 0.1358, 0.1458, 0.4343], + device='cuda:2'), in_proj_covar=tensor([0.0231, 0.0303, 0.0287, 0.0201, 0.0263, 0.0261, 0.0250, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:14:12,025 INFO [train.py:873] (2/4) Epoch 6, batch 5900, loss[loss=0.2045, simple_loss=0.1688, pruned_loss=0.1201, over 1314.00 frames. ], tot_loss[loss=0.1661, simple_loss=0.1828, pruned_loss=0.07468, over 1926810.92 frames. ], batch size: 100, lr: 1.28e-02, grad_scale: 8.0 +2022-12-07 15:14:18,478 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3221, 1.3748, 1.5120, 0.9990, 0.9924, 1.3712, 0.8441, 1.2532], + device='cuda:2'), covar=tensor([0.1214, 0.2028, 0.0581, 0.2402, 0.3028, 0.0730, 0.2386, 0.0969], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0081, 0.0079, 0.0086, 0.0111, 0.0071, 0.0136, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:14:27,080 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.291e+02 2.477e+02 2.996e+02 3.769e+02 7.369e+02, threshold=5.992e+02, percent-clipped=1.0 +2022-12-07 15:14:53,037 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.25 vs. limit=5.0 +2022-12-07 15:15:20,296 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=43787.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:15:42,288 INFO [train.py:873] (2/4) Epoch 6, batch 6000, loss[loss=0.1357, simple_loss=0.1367, pruned_loss=0.06737, over 2650.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.1831, pruned_loss=0.07404, over 2000952.63 frames. ], batch size: 100, lr: 1.28e-02, grad_scale: 8.0 +2022-12-07 15:15:42,288 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 15:15:51,411 INFO [train.py:905] (2/4) Epoch 6, validation: loss=0.1218, simple_loss=0.1652, pruned_loss=0.03921, over 857387.00 frames. +2022-12-07 15:15:51,412 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 15:16:06,715 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.214e+02 2.267e+02 3.074e+02 3.851e+02 8.509e+02, threshold=6.148e+02, percent-clipped=4.0 +2022-12-07 15:16:24,592 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=43848.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:16:44,656 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=43870.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:17:19,408 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=43909.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:17:19,630 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.24 vs. limit=2.0 +2022-12-07 15:17:20,998 INFO [train.py:873] (2/4) Epoch 6, batch 6100, loss[loss=0.2413, simple_loss=0.2246, pruned_loss=0.129, over 9533.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.1831, pruned_loss=0.0741, over 1979390.65 frames. ], batch size: 100, lr: 1.27e-02, grad_scale: 16.0 +2022-12-07 15:17:27,492 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8468, 3.2108, 2.5690, 4.0234, 3.8805, 3.7861, 3.0524, 2.6466], + device='cuda:2'), covar=tensor([0.0605, 0.1452, 0.3816, 0.0347, 0.0704, 0.1286, 0.1362, 0.3557], + device='cuda:2'), in_proj_covar=tensor([0.0235, 0.0304, 0.0292, 0.0205, 0.0267, 0.0266, 0.0253, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:17:35,709 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.075e+02 2.736e+02 3.463e+02 4.201e+02 1.121e+03, threshold=6.926e+02, percent-clipped=3.0 +2022-12-07 15:18:03,926 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=43959.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:18:14,026 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=43970.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:18:50,019 INFO [train.py:873] (2/4) Epoch 6, batch 6200, loss[loss=0.1519, simple_loss=0.144, pruned_loss=0.0799, over 2570.00 frames. ], tot_loss[loss=0.1663, simple_loss=0.1835, pruned_loss=0.07457, over 2019015.73 frames. ], batch size: 100, lr: 1.27e-02, grad_scale: 16.0 +2022-12-07 15:18:57,998 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5253, 1.4249, 3.4592, 1.4182, 3.3482, 3.5070, 2.6300, 3.8617], + device='cuda:2'), covar=tensor([0.0228, 0.2806, 0.0431, 0.2329, 0.0622, 0.0465, 0.0684, 0.0166], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0156, 0.0142, 0.0167, 0.0155, 0.0155, 0.0124, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:18:58,887 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=44020.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:19:06,617 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.319e+02 2.517e+02 3.185e+02 3.672e+02 8.437e+02, threshold=6.370e+02, percent-clipped=2.0 +2022-12-07 15:19:41,957 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8375, 3.6437, 3.5452, 3.8848, 3.6606, 3.5028, 3.9143, 3.2935], + device='cuda:2'), covar=tensor([0.0595, 0.0921, 0.0388, 0.0429, 0.0709, 0.1220, 0.0562, 0.0522], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0230, 0.0151, 0.0144, 0.0151, 0.0118, 0.0229, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 15:19:42,007 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7784, 1.3843, 3.2372, 3.0406, 3.1638, 3.1372, 2.3950, 3.2590], + device='cuda:2'), covar=tensor([0.1163, 0.1343, 0.0102, 0.0192, 0.0193, 0.0099, 0.0229, 0.0138], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0157, 0.0107, 0.0150, 0.0123, 0.0124, 0.0098, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0002, 0.0004, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 15:20:19,414 INFO [train.py:873] (2/4) Epoch 6, batch 6300, loss[loss=0.1584, simple_loss=0.1825, pruned_loss=0.06717, over 14269.00 frames. ], tot_loss[loss=0.1645, simple_loss=0.1825, pruned_loss=0.07326, over 2006578.05 frames. ], batch size: 76, lr: 1.27e-02, grad_scale: 8.0 +2022-12-07 15:20:35,103 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.219e+02 2.319e+02 2.963e+02 3.987e+02 7.339e+02, threshold=5.926e+02, percent-clipped=1.0 +2022-12-07 15:20:48,209 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44143.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:20:56,674 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-07 15:21:08,362 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5161, 2.1609, 2.2755, 1.3347, 2.2026, 2.2737, 2.5372, 2.0261], + device='cuda:2'), covar=tensor([0.0838, 0.1653, 0.1341, 0.2757, 0.1099, 0.0818, 0.0619, 0.2063], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0200, 0.0123, 0.0128, 0.0112, 0.0111, 0.0093, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 15:21:11,790 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=44170.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:21:42,082 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6697, 3.0859, 2.4110, 3.8572, 3.6629, 3.5823, 2.8530, 2.3347], + device='cuda:2'), covar=tensor([0.0746, 0.1619, 0.4467, 0.0455, 0.0730, 0.1433, 0.1706, 0.4997], + device='cuda:2'), in_proj_covar=tensor([0.0234, 0.0303, 0.0292, 0.0208, 0.0264, 0.0269, 0.0253, 0.0282], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:21:48,489 INFO [train.py:873] (2/4) Epoch 6, batch 6400, loss[loss=0.1739, simple_loss=0.1894, pruned_loss=0.07917, over 14178.00 frames. ], tot_loss[loss=0.166, simple_loss=0.1832, pruned_loss=0.07441, over 1945585.13 frames. ], batch size: 84, lr: 1.27e-02, grad_scale: 8.0 +2022-12-07 15:21:55,370 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=44218.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:22:04,826 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.321e+02 2.640e+02 3.353e+02 4.203e+02 1.187e+03, threshold=6.705e+02, percent-clipped=10.0 +2022-12-07 15:22:24,914 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.06 vs. limit=2.0 +2022-12-07 15:22:37,056 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44265.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:22:46,642 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1885, 2.0699, 3.3896, 3.4071, 3.4045, 2.2131, 3.3362, 2.5614], + device='cuda:2'), covar=tensor([0.0169, 0.0388, 0.0360, 0.0185, 0.0130, 0.0596, 0.0108, 0.0420], + device='cuda:2'), in_proj_covar=tensor([0.0214, 0.0207, 0.0312, 0.0247, 0.0197, 0.0250, 0.0203, 0.0240], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 15:23:10,403 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0208, 1.6483, 3.9568, 3.7140, 3.8706, 3.9857, 3.4925, 4.0356], + device='cuda:2'), covar=tensor([0.1056, 0.1195, 0.0078, 0.0141, 0.0116, 0.0077, 0.0140, 0.0095], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0157, 0.0108, 0.0151, 0.0124, 0.0125, 0.0099, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 15:23:17,595 INFO [train.py:873] (2/4) Epoch 6, batch 6500, loss[loss=0.1331, simple_loss=0.1669, pruned_loss=0.04959, over 14449.00 frames. ], tot_loss[loss=0.166, simple_loss=0.183, pruned_loss=0.07454, over 1897501.82 frames. ], batch size: 53, lr: 1.27e-02, grad_scale: 8.0 +2022-12-07 15:23:21,308 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44315.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:23:33,244 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.134e+02 2.290e+02 3.052e+02 3.998e+02 7.186e+02, threshold=6.105e+02, percent-clipped=1.0 +2022-12-07 15:23:44,924 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.94 vs. limit=5.0 +2022-12-07 15:23:57,188 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.45 vs. limit=5.0 +2022-12-07 15:24:13,090 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.43 vs. limit=2.0 +2022-12-07 15:24:33,837 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-07 15:24:38,735 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2456, 1.2486, 1.4471, 0.9983, 0.8523, 1.3142, 0.7696, 1.2078], + device='cuda:2'), covar=tensor([0.1470, 0.2731, 0.0974, 0.2401, 0.3878, 0.0840, 0.3235, 0.1255], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0080, 0.0078, 0.0082, 0.0108, 0.0068, 0.0128, 0.0074], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:24:46,950 INFO [train.py:873] (2/4) Epoch 6, batch 6600, loss[loss=0.1764, simple_loss=0.1866, pruned_loss=0.08309, over 10319.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.1827, pruned_loss=0.07427, over 1917431.65 frames. ], batch size: 100, lr: 1.27e-02, grad_scale: 8.0 +2022-12-07 15:24:48,065 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2701, 1.2118, 1.3974, 1.0202, 0.8186, 0.8963, 0.9587, 0.6732], + device='cuda:2'), covar=tensor([0.0385, 0.0854, 0.0340, 0.0294, 0.0455, 0.0347, 0.0328, 0.0657], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0010, 0.0010, 0.0012, 0.0016, 0.0013, 0.0017], + device='cuda:2'), out_proj_covar=tensor([6.9578e-05, 7.5424e-05, 6.6493e-05, 6.7561e-05, 7.1405e-05, 9.8780e-05, + 8.5680e-05, 9.5484e-05], device='cuda:2') +2022-12-07 15:25:03,287 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.314e+02 2.475e+02 3.026e+02 3.737e+02 6.916e+02, threshold=6.052e+02, percent-clipped=2.0 +2022-12-07 15:25:15,255 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=44443.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:25:50,724 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5265, 1.8585, 1.8887, 1.9572, 1.6170, 1.9973, 1.5642, 1.0034], + device='cuda:2'), covar=tensor([0.1883, 0.1170, 0.0630, 0.0653, 0.1255, 0.0600, 0.1694, 0.4035], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0061, 0.0049, 0.0053, 0.0078, 0.0056, 0.0082, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 15:25:58,417 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=44491.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:26:16,141 INFO [train.py:873] (2/4) Epoch 6, batch 6700, loss[loss=0.1483, simple_loss=0.1829, pruned_loss=0.05683, over 14323.00 frames. ], tot_loss[loss=0.1673, simple_loss=0.1839, pruned_loss=0.07538, over 1872455.72 frames. ], batch size: 31, lr: 1.27e-02, grad_scale: 8.0 +2022-12-07 15:26:30,051 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=44527.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:26:31,879 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.570e+02 2.579e+02 3.217e+02 4.461e+02 7.506e+02, threshold=6.434e+02, percent-clipped=5.0 +2022-12-07 15:26:51,068 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=44550.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:27:03,999 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=44565.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:27:25,097 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=44588.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:27:45,455 INFO [train.py:873] (2/4) Epoch 6, batch 6800, loss[loss=0.1856, simple_loss=0.1938, pruned_loss=0.08869, over 9453.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.1824, pruned_loss=0.07339, over 2000635.93 frames. ], batch size: 100, lr: 1.26e-02, grad_scale: 8.0 +2022-12-07 15:27:45,671 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=44611.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:27:47,299 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=44613.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:27:49,183 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=44615.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:28:01,814 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.238e+02 2.228e+02 2.960e+02 3.530e+02 6.520e+02, threshold=5.920e+02, percent-clipped=1.0 +2022-12-07 15:28:26,951 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 15:28:32,297 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=44663.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:28:40,135 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3232, 2.6343, 1.9998, 2.6094, 1.6378, 2.7086, 2.4421, 0.9537], + device='cuda:2'), covar=tensor([0.3420, 0.1162, 0.1287, 0.0968, 0.1662, 0.0571, 0.1720, 0.4460], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0062, 0.0050, 0.0054, 0.0080, 0.0057, 0.0084, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 15:29:03,493 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9041, 1.4247, 3.4812, 3.4364, 3.4836, 3.6145, 2.9758, 3.6456], + device='cuda:2'), covar=tensor([0.1208, 0.1323, 0.0098, 0.0170, 0.0160, 0.0092, 0.0190, 0.0109], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0155, 0.0108, 0.0150, 0.0123, 0.0124, 0.0100, 0.0103], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 15:29:08,291 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7719, 1.9752, 3.8716, 2.4939, 3.7599, 1.7404, 3.0141, 3.6559], + device='cuda:2'), covar=tensor([0.0648, 0.5059, 0.0407, 0.8486, 0.0503, 0.4573, 0.1305, 0.0389], + device='cuda:2'), in_proj_covar=tensor([0.0217, 0.0243, 0.0167, 0.0334, 0.0188, 0.0249, 0.0236, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:29:15,083 INFO [train.py:873] (2/4) Epoch 6, batch 6900, loss[loss=0.1793, simple_loss=0.1596, pruned_loss=0.09947, over 1218.00 frames. ], tot_loss[loss=0.1655, simple_loss=0.1826, pruned_loss=0.0742, over 1941616.16 frames. ], batch size: 100, lr: 1.26e-02, grad_scale: 8.0 +2022-12-07 15:29:31,170 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.297e+02 2.434e+02 2.991e+02 3.865e+02 6.867e+02, threshold=5.982e+02, percent-clipped=5.0 +2022-12-07 15:29:32,253 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8297, 1.9982, 2.5758, 2.2843, 2.6978, 2.6461, 2.5791, 2.3291], + device='cuda:2'), covar=tensor([0.0468, 0.2795, 0.0562, 0.1610, 0.0512, 0.0915, 0.0873, 0.1811], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0331, 0.0373, 0.0310, 0.0356, 0.0304, 0.0346, 0.0343], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 15:29:55,253 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1667, 3.8750, 3.6365, 3.7777, 3.9893, 4.0599, 4.1946, 4.1305], + device='cuda:2'), covar=tensor([0.0763, 0.0687, 0.2061, 0.2693, 0.0824, 0.0827, 0.0906, 0.0795], + device='cuda:2'), in_proj_covar=tensor([0.0311, 0.0232, 0.0387, 0.0486, 0.0280, 0.0360, 0.0349, 0.0296], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:29:58,346 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-07 15:30:06,315 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8998, 2.1331, 2.7706, 2.3251, 2.7640, 2.7432, 2.7004, 2.4187], + device='cuda:2'), covar=tensor([0.0422, 0.2392, 0.0727, 0.1656, 0.0558, 0.0720, 0.0871, 0.1851], + device='cuda:2'), in_proj_covar=tensor([0.0291, 0.0326, 0.0369, 0.0304, 0.0350, 0.0299, 0.0339, 0.0335], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 15:30:12,839 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-07 15:30:43,884 INFO [train.py:873] (2/4) Epoch 6, batch 7000, loss[loss=0.1518, simple_loss=0.1733, pruned_loss=0.06519, over 14256.00 frames. ], tot_loss[loss=0.1648, simple_loss=0.1821, pruned_loss=0.07379, over 1956598.28 frames. ], batch size: 60, lr: 1.26e-02, grad_scale: 8.0 +2022-12-07 15:31:00,497 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.508e+02 2.539e+02 3.130e+02 3.727e+02 8.173e+02, threshold=6.260e+02, percent-clipped=3.0 +2022-12-07 15:31:27,789 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3682, 1.7960, 3.4615, 2.4355, 3.3938, 1.6809, 2.6933, 3.2835], + device='cuda:2'), covar=tensor([0.0653, 0.5420, 0.0377, 0.7613, 0.0530, 0.4568, 0.1486, 0.0426], + device='cuda:2'), in_proj_covar=tensor([0.0215, 0.0240, 0.0168, 0.0333, 0.0186, 0.0248, 0.0238, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:31:47,427 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44883.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:31:52,111 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.78 vs. limit=2.0 +2022-12-07 15:31:58,220 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9427, 2.1586, 2.0313, 2.2394, 1.7305, 2.2614, 2.0183, 1.0039], + device='cuda:2'), covar=tensor([0.1782, 0.0974, 0.0895, 0.0548, 0.1221, 0.0655, 0.1435, 0.3443], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0062, 0.0051, 0.0054, 0.0079, 0.0057, 0.0083, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 15:32:08,009 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=44906.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:32:12,183 INFO [train.py:873] (2/4) Epoch 6, batch 7100, loss[loss=0.178, simple_loss=0.1901, pruned_loss=0.08291, over 14217.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.1822, pruned_loss=0.07348, over 1963009.43 frames. ], batch size: 35, lr: 1.26e-02, grad_scale: 8.0 +2022-12-07 15:32:27,516 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.394e+02 2.599e+02 3.099e+02 3.894e+02 6.870e+02, threshold=6.198e+02, percent-clipped=1.0 +2022-12-07 15:33:44,151 INFO [train.py:873] (2/4) Epoch 6, batch 7200, loss[loss=0.1429, simple_loss=0.1708, pruned_loss=0.05749, over 14298.00 frames. ], tot_loss[loss=0.1655, simple_loss=0.1829, pruned_loss=0.07406, over 1993066.13 frames. ], batch size: 37, lr: 1.26e-02, grad_scale: 8.0 +2022-12-07 15:33:46,196 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0306, 4.5901, 4.5202, 4.9853, 4.6872, 4.2825, 4.9283, 4.2119], + device='cuda:2'), covar=tensor([0.0296, 0.0890, 0.0279, 0.0364, 0.0695, 0.0437, 0.0511, 0.0476], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0229, 0.0148, 0.0143, 0.0152, 0.0119, 0.0225, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 15:34:00,688 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.267e+02 2.403e+02 2.886e+02 3.979e+02 7.470e+02, threshold=5.772e+02, percent-clipped=2.0 +2022-12-07 15:34:24,905 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45056.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:34:33,617 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0276, 4.0420, 4.3040, 3.4566, 4.0191, 4.3080, 1.4601, 3.8741], + device='cuda:2'), covar=tensor([0.0241, 0.0261, 0.0326, 0.0624, 0.0315, 0.0191, 0.3287, 0.0245], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0145, 0.0125, 0.0118, 0.0171, 0.0115, 0.0151, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:34:40,866 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7485, 1.2508, 2.8395, 2.6480, 2.8678, 2.9013, 2.1969, 2.8753], + device='cuda:2'), covar=tensor([0.0938, 0.1189, 0.0127, 0.0252, 0.0203, 0.0114, 0.0310, 0.0146], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0156, 0.0109, 0.0152, 0.0124, 0.0125, 0.0099, 0.0104], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 15:34:46,855 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45081.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:34:56,500 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0202, 4.5409, 4.4699, 5.0127, 4.6969, 4.2934, 4.9589, 4.2426], + device='cuda:2'), covar=tensor([0.0436, 0.1198, 0.0348, 0.0404, 0.0799, 0.0485, 0.0568, 0.0548], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0230, 0.0149, 0.0144, 0.0153, 0.0120, 0.0225, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 15:35:13,049 INFO [train.py:873] (2/4) Epoch 6, batch 7300, loss[loss=0.1231, simple_loss=0.1559, pruned_loss=0.04519, over 13889.00 frames. ], tot_loss[loss=0.1631, simple_loss=0.1809, pruned_loss=0.07268, over 1965147.48 frames. ], batch size: 20, lr: 1.26e-02, grad_scale: 8.0 +2022-12-07 15:35:18,554 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45117.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 15:35:22,383 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-12-07 15:35:29,221 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.594e+02 2.544e+02 3.092e+02 3.955e+02 6.821e+02, threshold=6.183e+02, percent-clipped=3.0 +2022-12-07 15:35:32,008 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7341, 2.6539, 3.5444, 2.3239, 2.4007, 2.6920, 1.7751, 2.8689], + device='cuda:2'), covar=tensor([0.1611, 0.1278, 0.0882, 0.2693, 0.2260, 0.1060, 0.4988, 0.1497], + device='cuda:2'), in_proj_covar=tensor([0.0072, 0.0082, 0.0079, 0.0086, 0.0108, 0.0068, 0.0134, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:35:41,376 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45142.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:36:17,624 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=45183.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:36:29,804 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8769, 1.2969, 1.2660, 1.2841, 1.1582, 1.4004, 1.2196, 0.9491], + device='cuda:2'), covar=tensor([0.1710, 0.0736, 0.0201, 0.0304, 0.1222, 0.0343, 0.1340, 0.0908], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0062, 0.0050, 0.0054, 0.0079, 0.0057, 0.0082, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 15:36:37,788 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=45206.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:36:40,532 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3327, 3.3316, 2.9190, 2.9208, 3.3828, 3.3356, 3.4316, 3.3336], + device='cuda:2'), covar=tensor([0.1370, 0.0792, 0.2563, 0.3565, 0.1081, 0.1121, 0.1634, 0.1351], + device='cuda:2'), in_proj_covar=tensor([0.0318, 0.0233, 0.0387, 0.0488, 0.0284, 0.0361, 0.0358, 0.0303], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 15:36:41,983 INFO [train.py:873] (2/4) Epoch 6, batch 7400, loss[loss=0.1378, simple_loss=0.1703, pruned_loss=0.05264, over 14224.00 frames. ], tot_loss[loss=0.1648, simple_loss=0.1817, pruned_loss=0.07399, over 1962073.14 frames. ], batch size: 35, lr: 1.26e-02, grad_scale: 8.0 +2022-12-07 15:36:54,650 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8520, 1.4817, 2.0920, 1.6555, 2.0208, 1.3821, 1.6845, 1.7783], + device='cuda:2'), covar=tensor([0.1234, 0.2279, 0.0175, 0.1300, 0.0453, 0.1117, 0.0843, 0.0361], + device='cuda:2'), in_proj_covar=tensor([0.0217, 0.0243, 0.0170, 0.0328, 0.0185, 0.0246, 0.0235, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:36:58,363 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.666e+02 2.594e+02 3.335e+02 4.109e+02 6.658e+02, threshold=6.669e+02, percent-clipped=6.0 +2022-12-07 15:37:00,150 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=45231.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:37:07,687 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2763, 1.0622, 1.3509, 1.3210, 1.0211, 0.7060, 1.3466, 1.5448], + device='cuda:2'), covar=tensor([0.1261, 0.1175, 0.0476, 0.1743, 0.2715, 0.0707, 0.1212, 0.0860], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0019, 0.0020, 0.0020, 0.0028, 0.0020, 0.0020], + device='cuda:2'), out_proj_covar=tensor([8.8237e-05, 9.0032e-05, 8.5848e-05, 8.9028e-05, 9.0285e-05, 1.1600e-04, + 9.3633e-05, 8.9412e-05], device='cuda:2') +2022-12-07 15:37:19,966 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1309, 4.6201, 4.5509, 5.1346, 4.8320, 4.2492, 5.0654, 4.2337], + device='cuda:2'), covar=tensor([0.0354, 0.1008, 0.0293, 0.0390, 0.0688, 0.0476, 0.0491, 0.0512], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0230, 0.0151, 0.0146, 0.0151, 0.0123, 0.0227, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 15:37:20,775 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=45254.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:37:43,392 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1535, 2.1675, 2.6849, 1.8687, 1.8699, 2.3772, 1.2615, 2.3824], + device='cuda:2'), covar=tensor([0.0982, 0.1239, 0.0610, 0.1546, 0.2272, 0.0810, 0.4633, 0.0878], + device='cuda:2'), in_proj_covar=tensor([0.0071, 0.0083, 0.0078, 0.0086, 0.0109, 0.0069, 0.0135, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:38:12,292 INFO [train.py:873] (2/4) Epoch 6, batch 7500, loss[loss=0.1671, simple_loss=0.1814, pruned_loss=0.07638, over 14154.00 frames. ], tot_loss[loss=0.1653, simple_loss=0.1817, pruned_loss=0.07447, over 1909598.05 frames. ], batch size: 35, lr: 1.25e-02, grad_scale: 8.0 +2022-12-07 15:38:27,608 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.117e+02 2.437e+02 3.096e+02 3.781e+02 7.631e+02, threshold=6.193e+02, percent-clipped=3.0 +2022-12-07 15:38:44,174 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45348.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 15:38:53,157 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.37 vs. limit=2.0 +2022-12-07 15:39:51,270 INFO [train.py:873] (2/4) Epoch 7, batch 0, loss[loss=0.1852, simple_loss=0.2109, pruned_loss=0.07976, over 14287.00 frames. ], tot_loss[loss=0.1852, simple_loss=0.2109, pruned_loss=0.07976, over 14287.00 frames. ], batch size: 44, lr: 1.17e-02, grad_scale: 8.0 +2022-12-07 15:39:51,270 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 15:39:59,857 INFO [train.py:905] (2/4) Epoch 7, validation: loss=0.1305, simple_loss=0.175, pruned_loss=0.04304, over 857387.00 frames. +2022-12-07 15:39:59,859 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 15:40:21,845 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.36 vs. limit=5.0 +2022-12-07 15:40:33,572 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45409.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 15:40:36,210 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=45412.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 15:40:36,224 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4877, 4.5504, 4.4779, 4.1624, 4.4458, 4.9298, 1.6382, 4.2360], + device='cuda:2'), covar=tensor([0.0302, 0.0353, 0.0741, 0.0550, 0.0365, 0.0181, 0.4186, 0.0317], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0149, 0.0127, 0.0120, 0.0174, 0.0120, 0.0154, 0.0164], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:40:51,472 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.719e+01 2.285e+02 3.118e+02 4.102e+02 1.207e+03, threshold=6.237e+02, percent-clipped=4.0 +2022-12-07 15:40:59,123 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=45437.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:41:31,134 INFO [train.py:873] (2/4) Epoch 7, batch 100, loss[loss=0.1402, simple_loss=0.1697, pruned_loss=0.05538, over 14245.00 frames. ], tot_loss[loss=0.1645, simple_loss=0.1819, pruned_loss=0.07355, over 814394.89 frames. ], batch size: 31, lr: 1.17e-02, grad_scale: 8.0 +2022-12-07 15:42:21,345 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.420e+02 2.542e+02 3.020e+02 3.979e+02 7.369e+02, threshold=6.041e+02, percent-clipped=5.0 +2022-12-07 15:42:38,121 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.56 vs. limit=5.0 +2022-12-07 15:42:40,091 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-12-07 15:43:00,540 INFO [train.py:873] (2/4) Epoch 7, batch 200, loss[loss=0.194, simple_loss=0.1732, pruned_loss=0.1074, over 1188.00 frames. ], tot_loss[loss=0.1632, simple_loss=0.1807, pruned_loss=0.0728, over 1266758.19 frames. ], batch size: 100, lr: 1.17e-02, grad_scale: 8.0 +2022-12-07 15:43:27,515 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2787, 2.1946, 4.2088, 2.6599, 4.0772, 1.8914, 3.0623, 4.0093], + device='cuda:2'), covar=tensor([0.0424, 0.5500, 0.0340, 1.0368, 0.0442, 0.4690, 0.1493, 0.0405], + device='cuda:2'), in_proj_covar=tensor([0.0216, 0.0239, 0.0171, 0.0325, 0.0187, 0.0245, 0.0236, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:43:29,369 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8659, 2.4850, 4.8533, 3.1400, 4.6693, 2.1395, 3.5677, 4.4271], + device='cuda:2'), covar=tensor([0.0348, 0.4785, 0.0206, 0.8682, 0.0349, 0.4128, 0.1238, 0.0250], + device='cuda:2'), in_proj_covar=tensor([0.0216, 0.0240, 0.0171, 0.0326, 0.0187, 0.0246, 0.0236, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:43:42,201 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3617, 2.3165, 4.3310, 2.9254, 4.1499, 2.1692, 3.1614, 4.0481], + device='cuda:2'), covar=tensor([0.0387, 0.4778, 0.0282, 0.9252, 0.0405, 0.3723, 0.1392, 0.0263], + device='cuda:2'), in_proj_covar=tensor([0.0217, 0.0239, 0.0171, 0.0328, 0.0187, 0.0247, 0.0236, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 15:43:50,948 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.430e+02 2.408e+02 2.969e+02 3.747e+02 6.560e+02, threshold=5.938e+02, percent-clipped=2.0 +2022-12-07 15:44:30,684 INFO [train.py:873] (2/4) Epoch 7, batch 300, loss[loss=0.1434, simple_loss=0.1403, pruned_loss=0.07326, over 2618.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.1811, pruned_loss=0.07403, over 1475395.29 frames. ], batch size: 100, lr: 1.17e-02, grad_scale: 8.0 +2022-12-07 15:44:56,568 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45702.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:44:58,081 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=45704.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 15:45:05,276 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=45712.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 15:45:20,483 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.117e+02 2.543e+02 3.312e+02 3.999e+02 6.909e+02, threshold=6.625e+02, percent-clipped=2.0 +2022-12-07 15:45:27,792 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=45737.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:45:36,779 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1215, 4.1646, 4.4026, 3.7263, 4.1495, 4.4527, 1.5186, 3.9281], + device='cuda:2'), covar=tensor([0.0268, 0.0281, 0.0332, 0.0426, 0.0281, 0.0200, 0.3294, 0.0237], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0149, 0.0130, 0.0121, 0.0176, 0.0122, 0.0153, 0.0164], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:45:40,027 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-12-07 15:45:48,358 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=45760.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:45:50,999 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45763.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:45:51,540 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.77 vs. limit=2.0 +2022-12-07 15:45:56,498 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9092, 1.6470, 1.8576, 1.7116, 2.0181, 1.8393, 1.6742, 1.8032], + device='cuda:2'), covar=tensor([0.0264, 0.0967, 0.0095, 0.0239, 0.0163, 0.0396, 0.0130, 0.0211], + device='cuda:2'), in_proj_covar=tensor([0.0301, 0.0326, 0.0370, 0.0310, 0.0357, 0.0301, 0.0348, 0.0337], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 15:45:59,856 INFO [train.py:873] (2/4) Epoch 7, batch 400, loss[loss=0.1599, simple_loss=0.1765, pruned_loss=0.07166, over 14634.00 frames. ], tot_loss[loss=0.1635, simple_loss=0.1812, pruned_loss=0.07286, over 1687167.45 frames. ], batch size: 33, lr: 1.17e-02, grad_scale: 8.0 +2022-12-07 15:46:01,072 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45774.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:46:02,736 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7338, 4.6590, 5.0363, 3.9643, 4.7534, 5.2140, 1.7116, 4.5206], + device='cuda:2'), covar=tensor([0.0203, 0.0232, 0.0345, 0.0503, 0.0243, 0.0084, 0.3205, 0.0225], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0149, 0.0129, 0.0121, 0.0176, 0.0122, 0.0152, 0.0164], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:46:10,775 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=45785.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:46:50,562 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.164e+02 2.431e+02 3.067e+02 3.597e+02 6.934e+02, threshold=6.134e+02, percent-clipped=1.0 +2022-12-07 15:46:56,441 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45835.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:47:12,823 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.37 vs. limit=2.0 +2022-12-07 15:47:17,522 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=45859.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:47:21,469 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 15:47:30,189 INFO [train.py:873] (2/4) Epoch 7, batch 500, loss[loss=0.1731, simple_loss=0.1676, pruned_loss=0.08928, over 3895.00 frames. ], tot_loss[loss=0.1622, simple_loss=0.1801, pruned_loss=0.07216, over 1763905.12 frames. ], batch size: 100, lr: 1.17e-02, grad_scale: 4.0 +2022-12-07 15:48:12,918 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=45920.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:48:21,755 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.265e+02 2.455e+02 3.322e+02 4.008e+02 8.135e+02, threshold=6.645e+02, percent-clipped=7.0 +2022-12-07 15:48:59,908 INFO [train.py:873] (2/4) Epoch 7, batch 600, loss[loss=0.1437, simple_loss=0.1684, pruned_loss=0.05946, over 14264.00 frames. ], tot_loss[loss=0.162, simple_loss=0.1801, pruned_loss=0.07198, over 1878676.85 frames. ], batch size: 28, lr: 1.17e-02, grad_scale: 4.0 +2022-12-07 15:49:27,343 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46004.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 15:49:49,710 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.803e+01 2.347e+02 3.029e+02 4.189e+02 7.825e+02, threshold=6.059e+02, percent-clipped=1.0 +2022-12-07 15:50:04,318 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.65 vs. limit=5.0 +2022-12-07 15:50:09,868 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46052.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 15:50:15,074 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46058.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:50:28,494 INFO [train.py:873] (2/4) Epoch 7, batch 700, loss[loss=0.1749, simple_loss=0.1807, pruned_loss=0.08456, over 5946.00 frames. ], tot_loss[loss=0.1605, simple_loss=0.1789, pruned_loss=0.07111, over 1882442.98 frames. ], batch size: 100, lr: 1.17e-02, grad_scale: 4.0 +2022-12-07 15:51:18,727 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.605e+01 2.609e+02 3.339e+02 4.151e+02 6.051e+02, threshold=6.678e+02, percent-clipped=0.0 +2022-12-07 15:51:18,858 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46130.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:51:49,129 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.79 vs. limit=5.0 +2022-12-07 15:51:50,369 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7517, 0.8331, 0.7650, 0.8952, 0.9572, 0.2967, 0.7200, 0.9296], + device='cuda:2'), covar=tensor([0.0436, 0.0450, 0.0281, 0.0352, 0.0140, 0.0168, 0.0893, 0.0353], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0021, 0.0020, 0.0021, 0.0028, 0.0020, 0.0022], + device='cuda:2'), out_proj_covar=tensor([9.2259e-05, 9.0872e-05, 9.0969e-05, 9.2658e-05, 9.3473e-05, 1.1771e-04, + 9.6871e-05, 9.6457e-05], device='cuda:2') +2022-12-07 15:51:56,447 INFO [train.py:873] (2/4) Epoch 7, batch 800, loss[loss=0.1993, simple_loss=0.1959, pruned_loss=0.1014, over 7799.00 frames. ], tot_loss[loss=0.1611, simple_loss=0.179, pruned_loss=0.07158, over 1931712.06 frames. ], batch size: 100, lr: 1.16e-02, grad_scale: 8.0 +2022-12-07 15:52:22,246 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5506, 2.3631, 3.3990, 3.6375, 3.5168, 2.2690, 3.4767, 2.8277], + device='cuda:2'), covar=tensor([0.0156, 0.0383, 0.0339, 0.0201, 0.0142, 0.0645, 0.0166, 0.0425], + device='cuda:2'), in_proj_covar=tensor([0.0224, 0.0215, 0.0328, 0.0262, 0.0208, 0.0261, 0.0217, 0.0250], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 15:52:34,719 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46215.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:52:34,845 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4002, 3.0881, 2.3430, 3.4435, 3.2491, 3.2752, 2.9430, 2.3699], + device='cuda:2'), covar=tensor([0.0612, 0.1346, 0.3608, 0.0415, 0.0783, 0.1239, 0.1312, 0.3666], + device='cuda:2'), in_proj_covar=tensor([0.0240, 0.0302, 0.0284, 0.0205, 0.0267, 0.0268, 0.0258, 0.0276], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:52:47,697 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.967e+01 2.522e+02 3.122e+02 4.007e+02 6.928e+02, threshold=6.244e+02, percent-clipped=1.0 +2022-12-07 15:53:26,506 INFO [train.py:873] (2/4) Epoch 7, batch 900, loss[loss=0.1676, simple_loss=0.1838, pruned_loss=0.0757, over 14253.00 frames. ], tot_loss[loss=0.1612, simple_loss=0.1796, pruned_loss=0.07142, over 1965570.89 frames. ], batch size: 37, lr: 1.16e-02, grad_scale: 8.0 +2022-12-07 15:53:29,280 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=46276.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:53:41,203 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7631, 1.2766, 1.2492, 1.2971, 1.1400, 1.3683, 1.0748, 0.8345], + device='cuda:2'), covar=tensor([0.2327, 0.0846, 0.0253, 0.0381, 0.1268, 0.0429, 0.1793, 0.1231], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0062, 0.0051, 0.0055, 0.0079, 0.0057, 0.0085, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 15:54:16,875 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.049e+02 2.538e+02 3.268e+02 4.038e+02 9.582e+02, threshold=6.536e+02, percent-clipped=3.0 +2022-12-07 15:54:23,248 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=46337.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:54:42,204 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46358.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:54:55,204 INFO [train.py:873] (2/4) Epoch 7, batch 1000, loss[loss=0.1686, simple_loss=0.1825, pruned_loss=0.0773, over 13554.00 frames. ], tot_loss[loss=0.1627, simple_loss=0.1804, pruned_loss=0.07249, over 1927704.00 frames. ], batch size: 100, lr: 1.16e-02, grad_scale: 8.0 +2022-12-07 15:55:24,798 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46406.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:55:46,198 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.306e+02 2.372e+02 2.921e+02 3.865e+02 1.419e+03, threshold=5.842e+02, percent-clipped=9.0 +2022-12-07 15:55:46,349 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46430.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:56:24,647 INFO [train.py:873] (2/4) Epoch 7, batch 1100, loss[loss=0.2065, simple_loss=0.1716, pruned_loss=0.1207, over 1184.00 frames. ], tot_loss[loss=0.1613, simple_loss=0.1798, pruned_loss=0.0714, over 2028956.57 frames. ], batch size: 100, lr: 1.16e-02, grad_scale: 8.0 +2022-12-07 15:56:29,023 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46478.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:56:48,463 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8463, 4.7771, 4.1228, 4.3533, 4.5672, 4.9397, 5.0219, 4.9247], + device='cuda:2'), covar=tensor([0.1246, 0.0529, 0.2477, 0.3386, 0.0917, 0.0950, 0.1082, 0.1124], + device='cuda:2'), in_proj_covar=tensor([0.0320, 0.0233, 0.0392, 0.0493, 0.0283, 0.0359, 0.0345, 0.0299], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 15:57:01,971 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46515.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:57:15,199 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.295e+02 2.418e+02 3.079e+02 3.635e+02 7.997e+02, threshold=6.158e+02, percent-clipped=1.0 +2022-12-07 15:57:45,020 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46563.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:57:53,826 INFO [train.py:873] (2/4) Epoch 7, batch 1200, loss[loss=0.1837, simple_loss=0.1803, pruned_loss=0.09354, over 3862.00 frames. ], tot_loss[loss=0.1599, simple_loss=0.1788, pruned_loss=0.07048, over 1952350.25 frames. ], batch size: 100, lr: 1.16e-02, grad_scale: 8.0 +2022-12-07 15:57:56,291 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.07 vs. limit=2.0 +2022-12-07 15:58:01,657 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.8093, 5.2080, 5.2023, 5.8134, 5.3309, 4.6621, 5.7191, 4.7176], + device='cuda:2'), covar=tensor([0.0341, 0.0877, 0.0251, 0.0332, 0.0648, 0.0338, 0.0407, 0.0402], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0233, 0.0151, 0.0144, 0.0153, 0.0124, 0.0227, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 15:58:13,327 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=46595.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:58:20,083 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=46602.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:58:36,220 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.31 vs. limit=5.0 +2022-12-07 15:58:44,673 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.081e+01 2.469e+02 2.984e+02 3.753e+02 8.629e+02, threshold=5.968e+02, percent-clipped=3.0 +2022-12-07 15:58:46,486 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46632.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:59:07,600 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9594, 3.5120, 2.6798, 4.0986, 3.9017, 3.9693, 3.4744, 2.8661], + device='cuda:2'), covar=tensor([0.0501, 0.1177, 0.3911, 0.0391, 0.0578, 0.0970, 0.0997, 0.3285], + device='cuda:2'), in_proj_covar=tensor([0.0231, 0.0296, 0.0279, 0.0203, 0.0259, 0.0261, 0.0248, 0.0268], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 15:59:08,441 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=46656.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 15:59:14,304 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=46663.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 15:59:23,324 INFO [train.py:873] (2/4) Epoch 7, batch 1300, loss[loss=0.2103, simple_loss=0.1953, pruned_loss=0.1127, over 4993.00 frames. ], tot_loss[loss=0.1609, simple_loss=0.1795, pruned_loss=0.07118, over 2016523.80 frames. ], batch size: 100, lr: 1.16e-02, grad_scale: 8.0 +2022-12-07 15:59:23,548 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0203, 1.0163, 1.2903, 1.1631, 0.9863, 0.6180, 1.1474, 1.3893], + device='cuda:2'), covar=tensor([0.1043, 0.0726, 0.1172, 0.2121, 0.1496, 0.0638, 0.1029, 0.1003], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0020, 0.0021, 0.0021, 0.0029, 0.0021, 0.0022], + device='cuda:2'), out_proj_covar=tensor([9.3052e-05, 9.2482e-05, 9.1168e-05, 9.5637e-05, 9.5458e-05, 1.1996e-04, + 9.8783e-05, 9.6043e-05], device='cuda:2') +2022-12-07 16:00:14,511 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.436e+02 2.325e+02 3.068e+02 3.769e+02 6.043e+02, threshold=6.136e+02, percent-clipped=2.0 +2022-12-07 16:00:45,150 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9786, 0.8316, 0.9708, 1.1323, 1.0641, 0.5760, 1.0044, 1.1157], + device='cuda:2'), covar=tensor([0.0665, 0.1079, 0.0347, 0.0485, 0.1127, 0.0638, 0.0569, 0.0462], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0020, 0.0020, 0.0020, 0.0028, 0.0020, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.1088e-05, 9.1667e-05, 8.8915e-05, 9.1983e-05, 9.1888e-05, 1.1715e-04, + 9.6141e-05, 9.2379e-05], device='cuda:2') +2022-12-07 16:00:53,008 INFO [train.py:873] (2/4) Epoch 7, batch 1400, loss[loss=0.1816, simple_loss=0.1928, pruned_loss=0.08515, over 10313.00 frames. ], tot_loss[loss=0.1605, simple_loss=0.1793, pruned_loss=0.07086, over 2017454.02 frames. ], batch size: 100, lr: 1.16e-02, grad_scale: 8.0 +2022-12-07 16:01:05,024 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0167, 2.6631, 2.7340, 1.7502, 2.5155, 2.7260, 3.0823, 2.3302], + device='cuda:2'), covar=tensor([0.0704, 0.1741, 0.1347, 0.2889, 0.1156, 0.0906, 0.0609, 0.2131], + device='cuda:2'), in_proj_covar=tensor([0.0113, 0.0194, 0.0123, 0.0126, 0.0111, 0.0115, 0.0095, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0006, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 16:01:27,161 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=46810.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:01:43,423 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 16:01:45,246 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.279e+02 2.517e+02 3.054e+02 3.788e+02 7.041e+02, threshold=6.108e+02, percent-clipped=3.0 +2022-12-07 16:02:15,247 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-12-07 16:02:21,120 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1943, 1.3755, 1.4015, 1.1779, 1.1771, 0.9854, 1.0295, 0.8041], + device='cuda:2'), covar=tensor([0.0234, 0.0686, 0.0444, 0.0291, 0.0484, 0.0335, 0.0271, 0.0595], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0011, 0.0011, 0.0011, 0.0017, 0.0014, 0.0018], + device='cuda:2'), out_proj_covar=tensor([7.3804e-05, 7.9563e-05, 7.2676e-05, 7.2605e-05, 7.3345e-05, 1.0737e-04, + 9.2497e-05, 1.0274e-04], device='cuda:2') +2022-12-07 16:02:21,960 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=46871.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:02:23,386 INFO [train.py:873] (2/4) Epoch 7, batch 1500, loss[loss=0.1447, simple_loss=0.1303, pruned_loss=0.07952, over 1342.00 frames. ], tot_loss[loss=0.1597, simple_loss=0.1786, pruned_loss=0.07041, over 1959559.34 frames. ], batch size: 100, lr: 1.16e-02, grad_scale: 8.0 +2022-12-07 16:02:24,658 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9070, 2.5821, 4.6876, 3.0119, 4.4888, 2.1029, 3.7032, 4.2667], + device='cuda:2'), covar=tensor([0.0343, 0.4620, 0.0442, 0.9512, 0.0391, 0.4323, 0.1176, 0.0355], + device='cuda:2'), in_proj_covar=tensor([0.0222, 0.0239, 0.0168, 0.0328, 0.0191, 0.0248, 0.0231, 0.0184], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:03:11,318 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.07 vs. limit=5.0 +2022-12-07 16:03:13,535 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.064e+01 2.462e+02 3.053e+02 4.274e+02 1.237e+03, threshold=6.105e+02, percent-clipped=6.0 +2022-12-07 16:03:15,395 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=46932.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:03:32,180 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46951.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 16:03:33,421 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.81 vs. limit=2.0 +2022-12-07 16:03:38,710 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=46958.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:03:48,567 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.11 vs. limit=5.0 +2022-12-07 16:03:51,574 INFO [train.py:873] (2/4) Epoch 7, batch 1600, loss[loss=0.1598, simple_loss=0.1813, pruned_loss=0.06913, over 14163.00 frames. ], tot_loss[loss=0.1599, simple_loss=0.1789, pruned_loss=0.07042, over 1995459.99 frames. ], batch size: 84, lr: 1.15e-02, grad_scale: 8.0 +2022-12-07 16:03:57,848 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=46980.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:04:07,673 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8914, 2.4328, 3.5997, 2.7694, 3.6947, 3.3844, 3.4381, 2.9548], + device='cuda:2'), covar=tensor([0.0585, 0.3131, 0.0872, 0.2103, 0.0832, 0.0972, 0.1581, 0.2265], + device='cuda:2'), in_proj_covar=tensor([0.0307, 0.0334, 0.0384, 0.0312, 0.0365, 0.0306, 0.0360, 0.0338], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:04:42,979 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.262e+02 2.534e+02 3.085e+02 3.879e+02 6.356e+02, threshold=6.171e+02, percent-clipped=1.0 +2022-12-07 16:05:21,095 INFO [train.py:873] (2/4) Epoch 7, batch 1700, loss[loss=0.1475, simple_loss=0.1512, pruned_loss=0.07187, over 3842.00 frames. ], tot_loss[loss=0.1591, simple_loss=0.1786, pruned_loss=0.06984, over 2028971.26 frames. ], batch size: 100, lr: 1.15e-02, grad_scale: 8.0 +2022-12-07 16:05:48,096 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7985, 3.2379, 2.7219, 2.9589, 2.2075, 3.1988, 2.7905, 1.1935], + device='cuda:2'), covar=tensor([0.2571, 0.0893, 0.0824, 0.0649, 0.0910, 0.0400, 0.1232, 0.3146], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0061, 0.0050, 0.0054, 0.0077, 0.0058, 0.0084, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 16:06:11,977 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.285e+02 2.426e+02 3.035e+02 3.683e+02 7.018e+02, threshold=6.070e+02, percent-clipped=2.0 +2022-12-07 16:06:28,066 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8225, 2.7153, 2.4457, 2.5203, 2.7986, 2.7620, 2.8560, 2.8071], + device='cuda:2'), covar=tensor([0.1004, 0.0726, 0.2085, 0.2853, 0.0937, 0.0966, 0.1115, 0.0934], + device='cuda:2'), in_proj_covar=tensor([0.0327, 0.0238, 0.0399, 0.0500, 0.0289, 0.0365, 0.0343, 0.0306], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:06:28,107 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3434, 1.8216, 1.4544, 1.7890, 1.5802, 1.8186, 1.4964, 1.1301], + device='cuda:2'), covar=tensor([0.2058, 0.0686, 0.0512, 0.0385, 0.1131, 0.0597, 0.2093, 0.2853], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0061, 0.0051, 0.0053, 0.0078, 0.0058, 0.0084, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 16:06:44,216 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=47166.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:06:50,174 INFO [train.py:873] (2/4) Epoch 7, batch 1800, loss[loss=0.1705, simple_loss=0.1926, pruned_loss=0.07418, over 14267.00 frames. ], tot_loss[loss=0.1595, simple_loss=0.1791, pruned_loss=0.06995, over 1988746.21 frames. ], batch size: 76, lr: 1.15e-02, grad_scale: 8.0 +2022-12-07 16:07:02,448 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.50 vs. limit=2.0 +2022-12-07 16:07:40,890 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.187e+02 2.439e+02 3.276e+02 4.235e+02 9.028e+02, threshold=6.551e+02, percent-clipped=6.0 +2022-12-07 16:07:50,934 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47241.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:07:59,632 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=47251.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:08:04,863 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6492, 3.4226, 3.2129, 3.3242, 3.5895, 3.5685, 3.7091, 3.6117], + device='cuda:2'), covar=tensor([0.1033, 0.0632, 0.2186, 0.3033, 0.0783, 0.0786, 0.0863, 0.0997], + device='cuda:2'), in_proj_covar=tensor([0.0319, 0.0230, 0.0390, 0.0492, 0.0285, 0.0361, 0.0338, 0.0302], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:08:05,725 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=47258.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:08:19,478 INFO [train.py:873] (2/4) Epoch 7, batch 1900, loss[loss=0.1567, simple_loss=0.1771, pruned_loss=0.06819, over 14283.00 frames. ], tot_loss[loss=0.1606, simple_loss=0.1794, pruned_loss=0.07095, over 1994769.99 frames. ], batch size: 46, lr: 1.15e-02, grad_scale: 8.0 +2022-12-07 16:08:42,239 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=47299.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:08:44,901 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47302.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:08:48,299 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=47306.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:09:01,793 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-07 16:09:10,048 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.096e+02 2.352e+02 2.909e+02 3.754e+02 8.188e+02, threshold=5.818e+02, percent-clipped=1.0 +2022-12-07 16:09:12,792 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7659, 0.7165, 0.7333, 0.8211, 0.9178, 0.3903, 0.7149, 0.8548], + device='cuda:2'), covar=tensor([0.0430, 0.0730, 0.0416, 0.0396, 0.0239, 0.0195, 0.0892, 0.0479], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0020, 0.0021, 0.0020, 0.0028, 0.0021, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.0386e-05, 9.2149e-05, 8.9228e-05, 9.4332e-05, 9.1740e-05, 1.1819e-04, + 9.7269e-05, 9.4304e-05], device='cuda:2') +2022-12-07 16:09:47,746 INFO [train.py:873] (2/4) Epoch 7, batch 2000, loss[loss=0.1791, simple_loss=0.1779, pruned_loss=0.0901, over 3866.00 frames. ], tot_loss[loss=0.1616, simple_loss=0.1801, pruned_loss=0.07157, over 1923139.32 frames. ], batch size: 100, lr: 1.15e-02, grad_scale: 8.0 +2022-12-07 16:09:53,046 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47379.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:10:20,346 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0601, 3.7127, 3.5750, 3.6209, 3.8810, 3.9314, 4.0272, 3.9940], + device='cuda:2'), covar=tensor([0.0897, 0.0670, 0.1811, 0.2980, 0.0705, 0.0826, 0.0965, 0.0975], + device='cuda:2'), in_proj_covar=tensor([0.0325, 0.0233, 0.0394, 0.0498, 0.0290, 0.0365, 0.0350, 0.0309], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:10:28,101 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5865, 2.4414, 2.2154, 2.2835, 2.5195, 2.4760, 2.5952, 2.5379], + device='cuda:2'), covar=tensor([0.1109, 0.0876, 0.2215, 0.2778, 0.0984, 0.0930, 0.1122, 0.0966], + device='cuda:2'), in_proj_covar=tensor([0.0326, 0.0234, 0.0395, 0.0500, 0.0290, 0.0366, 0.0351, 0.0309], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:10:40,007 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.317e+02 2.312e+02 2.798e+02 3.545e+02 9.094e+02, threshold=5.596e+02, percent-clipped=2.0 +2022-12-07 16:10:48,701 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47440.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:11:11,221 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=47466.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:11:17,999 INFO [train.py:873] (2/4) Epoch 7, batch 2100, loss[loss=0.17, simple_loss=0.1899, pruned_loss=0.07502, over 14320.00 frames. ], tot_loss[loss=0.1606, simple_loss=0.1795, pruned_loss=0.07081, over 1932965.59 frames. ], batch size: 60, lr: 1.15e-02, grad_scale: 8.0 +2022-12-07 16:11:19,845 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5064, 3.5772, 3.5948, 3.3658, 3.5194, 3.6100, 1.3927, 3.3526], + device='cuda:2'), covar=tensor([0.0311, 0.0355, 0.0621, 0.0563, 0.0461, 0.0369, 0.3572, 0.0355], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0149, 0.0128, 0.0121, 0.0174, 0.0123, 0.0154, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 16:11:25,151 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4824, 1.0199, 1.4467, 1.0432, 1.1754, 1.4161, 1.2267, 1.1954], + device='cuda:2'), covar=tensor([0.0278, 0.0748, 0.0419, 0.0350, 0.0613, 0.0458, 0.0277, 0.0725], + device='cuda:2'), in_proj_covar=tensor([0.0115, 0.0198, 0.0126, 0.0129, 0.0113, 0.0118, 0.0098, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0007, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 16:11:53,885 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=47514.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:12:09,090 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.294e+02 2.308e+02 2.873e+02 3.601e+02 7.245e+02, threshold=5.745e+02, percent-clipped=7.0 +2022-12-07 16:12:46,255 INFO [train.py:873] (2/4) Epoch 7, batch 2200, loss[loss=0.1448, simple_loss=0.1518, pruned_loss=0.06894, over 2626.00 frames. ], tot_loss[loss=0.1602, simple_loss=0.1791, pruned_loss=0.07068, over 1922657.07 frames. ], batch size: 100, lr: 1.15e-02, grad_scale: 8.0 +2022-12-07 16:13:08,067 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=47597.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:13:27,458 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9848, 1.9162, 1.5861, 2.0414, 1.8059, 2.0363, 1.8475, 1.8611], + device='cuda:2'), covar=tensor([0.0554, 0.0679, 0.1526, 0.0289, 0.0586, 0.0384, 0.0890, 0.0585], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0311, 0.0289, 0.0208, 0.0275, 0.0275, 0.0255, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 16:13:37,949 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.355e+02 2.347e+02 2.892e+02 3.477e+02 7.491e+02, threshold=5.783e+02, percent-clipped=4.0 +2022-12-07 16:14:07,408 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7762, 1.5138, 2.0985, 1.6920, 1.9462, 1.4905, 1.6730, 1.8188], + device='cuda:2'), covar=tensor([0.1247, 0.1620, 0.0156, 0.0997, 0.0400, 0.0885, 0.0685, 0.0305], + device='cuda:2'), in_proj_covar=tensor([0.0222, 0.0238, 0.0165, 0.0320, 0.0190, 0.0246, 0.0224, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:14:15,734 INFO [train.py:873] (2/4) Epoch 7, batch 2300, loss[loss=0.1586, simple_loss=0.183, pruned_loss=0.06706, over 14287.00 frames. ], tot_loss[loss=0.1595, simple_loss=0.1785, pruned_loss=0.07026, over 1954256.57 frames. ], batch size: 25, lr: 1.15e-02, grad_scale: 8.0 +2022-12-07 16:14:21,278 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.21 vs. limit=2.0 +2022-12-07 16:15:07,552 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.453e+02 2.381e+02 2.948e+02 3.680e+02 8.732e+02, threshold=5.896e+02, percent-clipped=3.0 +2022-12-07 16:15:10,923 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=47735.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:15:11,968 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5937, 2.4241, 3.3753, 2.6010, 3.5225, 3.3645, 3.1971, 2.8661], + device='cuda:2'), covar=tensor([0.0773, 0.3155, 0.0867, 0.2252, 0.0797, 0.0775, 0.1681, 0.2025], + device='cuda:2'), in_proj_covar=tensor([0.0304, 0.0327, 0.0380, 0.0307, 0.0360, 0.0303, 0.0352, 0.0330], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:15:13,665 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47738.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:15:45,314 INFO [train.py:873] (2/4) Epoch 7, batch 2400, loss[loss=0.1452, simple_loss=0.1757, pruned_loss=0.05737, over 14291.00 frames. ], tot_loss[loss=0.16, simple_loss=0.1789, pruned_loss=0.07056, over 1939019.03 frames. ], batch size: 60, lr: 1.15e-02, grad_scale: 8.0 +2022-12-07 16:15:58,196 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47788.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:16:08,501 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47799.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:16:17,288 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=47809.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 16:16:36,797 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.330e+02 2.462e+02 3.088e+02 3.941e+02 7.678e+02, threshold=6.176e+02, percent-clipped=7.0 +2022-12-07 16:16:53,190 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47849.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:16:59,041 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8075, 2.7891, 2.9205, 2.8775, 2.8020, 2.7056, 1.2910, 2.5748], + device='cuda:2'), covar=tensor([0.0289, 0.0313, 0.0378, 0.0296, 0.0307, 0.0665, 0.2648, 0.0273], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0148, 0.0127, 0.0120, 0.0175, 0.0124, 0.0152, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 16:17:02,136 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-12-07 16:17:11,963 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=47870.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 16:17:14,360 INFO [train.py:873] (2/4) Epoch 7, batch 2500, loss[loss=0.1722, simple_loss=0.1864, pruned_loss=0.07899, over 14266.00 frames. ], tot_loss[loss=0.1589, simple_loss=0.1781, pruned_loss=0.06991, over 1917237.31 frames. ], batch size: 25, lr: 1.14e-02, grad_scale: 8.0 +2022-12-07 16:17:35,581 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=47897.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:17:46,331 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1167, 2.3823, 2.2463, 2.3075, 2.0085, 2.5387, 2.1851, 1.1916], + device='cuda:2'), covar=tensor([0.1947, 0.0924, 0.0893, 0.0870, 0.1140, 0.0564, 0.1280, 0.2954], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0062, 0.0050, 0.0053, 0.0078, 0.0059, 0.0086, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 16:18:06,392 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.397e+02 2.559e+02 3.205e+02 4.003e+02 7.455e+02, threshold=6.409e+02, percent-clipped=4.0 +2022-12-07 16:18:18,486 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=47945.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:18:43,889 INFO [train.py:873] (2/4) Epoch 7, batch 2600, loss[loss=0.1748, simple_loss=0.1942, pruned_loss=0.0777, over 13927.00 frames. ], tot_loss[loss=0.1593, simple_loss=0.1786, pruned_loss=0.06996, over 2008440.78 frames. ], batch size: 23, lr: 1.14e-02, grad_scale: 8.0 +2022-12-07 16:18:51,178 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3843, 1.8141, 1.7760, 1.8380, 1.6560, 1.9459, 1.5393, 1.2217], + device='cuda:2'), covar=tensor([0.1608, 0.0800, 0.0383, 0.0586, 0.1070, 0.0534, 0.1775, 0.2396], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0062, 0.0051, 0.0054, 0.0079, 0.0060, 0.0087, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 16:19:14,162 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1725, 2.5685, 3.8695, 2.6808, 4.0045, 3.6288, 3.5700, 3.2037], + device='cuda:2'), covar=tensor([0.0672, 0.3849, 0.1113, 0.2746, 0.0915, 0.1112, 0.2757, 0.3296], + device='cuda:2'), in_proj_covar=tensor([0.0312, 0.0334, 0.0388, 0.0314, 0.0369, 0.0308, 0.0362, 0.0342], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:19:36,189 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.203e+02 2.385e+02 3.008e+02 3.884e+02 8.359e+02, threshold=6.016e+02, percent-clipped=2.0 +2022-12-07 16:19:39,592 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.66 vs. limit=2.0 +2022-12-07 16:19:40,101 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48035.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:20:10,307 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4562, 2.1736, 2.3584, 1.5352, 2.0067, 2.2634, 2.5093, 2.0707], + device='cuda:2'), covar=tensor([0.0662, 0.1591, 0.1247, 0.2718, 0.1167, 0.0744, 0.0643, 0.1884], + device='cuda:2'), in_proj_covar=tensor([0.0114, 0.0196, 0.0126, 0.0133, 0.0114, 0.0117, 0.0098, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 16:20:13,244 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48072.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:20:13,918 INFO [train.py:873] (2/4) Epoch 7, batch 2700, loss[loss=0.161, simple_loss=0.1823, pruned_loss=0.0699, over 14270.00 frames. ], tot_loss[loss=0.1597, simple_loss=0.1787, pruned_loss=0.0703, over 1980922.47 frames. ], batch size: 76, lr: 1.14e-02, grad_scale: 8.0 +2022-12-07 16:20:23,360 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48083.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:20:33,255 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48094.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:21:02,970 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-12-07 16:21:06,888 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.439e+01 2.366e+02 3.097e+02 3.751e+02 6.035e+02, threshold=6.194e+02, percent-clipped=1.0 +2022-12-07 16:21:08,888 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48133.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:21:11,319 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8940, 4.6382, 4.2014, 4.4296, 4.5072, 4.7291, 4.8566, 4.8756], + device='cuda:2'), covar=tensor([0.0711, 0.0427, 0.2212, 0.2612, 0.0690, 0.0617, 0.0784, 0.0721], + device='cuda:2'), in_proj_covar=tensor([0.0328, 0.0234, 0.0398, 0.0491, 0.0284, 0.0356, 0.0352, 0.0306], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:21:17,966 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48144.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:21:37,445 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48165.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 16:21:44,253 INFO [train.py:873] (2/4) Epoch 7, batch 2800, loss[loss=0.1582, simple_loss=0.1793, pruned_loss=0.06849, over 14420.00 frames. ], tot_loss[loss=0.1603, simple_loss=0.1792, pruned_loss=0.07068, over 1963796.06 frames. ], batch size: 73, lr: 1.14e-02, grad_scale: 8.0 +2022-12-07 16:22:19,995 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48212.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 16:22:36,451 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.192e+02 2.244e+02 3.117e+02 3.930e+02 6.165e+02, threshold=6.234e+02, percent-clipped=0.0 +2022-12-07 16:22:49,044 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8856, 2.4841, 3.3548, 2.2282, 2.0829, 2.6850, 1.5936, 2.7003], + device='cuda:2'), covar=tensor([0.1100, 0.1502, 0.0798, 0.3746, 0.3405, 0.1504, 0.5127, 0.1185], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0086, 0.0080, 0.0088, 0.0112, 0.0071, 0.0130, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:23:06,175 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5029, 1.8438, 1.8927, 1.9155, 1.7246, 2.0031, 1.5709, 1.0729], + device='cuda:2'), covar=tensor([0.2275, 0.1330, 0.0752, 0.0359, 0.1164, 0.0685, 0.1848, 0.2946], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0064, 0.0053, 0.0054, 0.0081, 0.0061, 0.0088, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 16:23:14,252 INFO [train.py:873] (2/4) Epoch 7, batch 2900, loss[loss=0.1419, simple_loss=0.1724, pruned_loss=0.0557, over 14575.00 frames. ], tot_loss[loss=0.1595, simple_loss=0.1786, pruned_loss=0.07025, over 1899921.40 frames. ], batch size: 22, lr: 1.14e-02, grad_scale: 8.0 +2022-12-07 16:23:14,769 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48273.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 16:23:55,491 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.54 vs. limit=5.0 +2022-12-07 16:24:01,606 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48324.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:24:07,736 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.587e+02 2.377e+02 3.206e+02 3.983e+02 6.235e+02, threshold=6.412e+02, percent-clipped=1.0 +2022-12-07 16:24:46,194 INFO [train.py:873] (2/4) Epoch 7, batch 3000, loss[loss=0.1549, simple_loss=0.1734, pruned_loss=0.06813, over 13554.00 frames. ], tot_loss[loss=0.1603, simple_loss=0.1788, pruned_loss=0.07088, over 1855902.52 frames. ], batch size: 100, lr: 1.14e-02, grad_scale: 8.0 +2022-12-07 16:24:46,195 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 16:25:06,816 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0344, 3.9300, 3.5137, 3.6267, 3.9236, 4.0008, 4.1230, 4.1351], + device='cuda:2'), covar=tensor([0.0662, 0.0378, 0.1761, 0.3258, 0.0540, 0.0554, 0.0726, 0.0556], + device='cuda:2'), in_proj_covar=tensor([0.0324, 0.0230, 0.0391, 0.0486, 0.0282, 0.0353, 0.0348, 0.0304], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:25:07,552 INFO [train.py:905] (2/4) Epoch 7, validation: loss=0.1228, simple_loss=0.1657, pruned_loss=0.03995, over 857387.00 frames. +2022-12-07 16:25:07,553 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 16:25:19,600 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48385.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:25:27,561 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48394.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:25:30,219 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48397.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:25:39,405 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4602, 3.6093, 3.3581, 3.2875, 2.5657, 3.4906, 3.4318, 1.6046], + device='cuda:2'), covar=tensor([0.2673, 0.1133, 0.2588, 0.1416, 0.1181, 0.0869, 0.1535, 0.3746], + device='cuda:2'), in_proj_covar=tensor([0.0159, 0.0064, 0.0053, 0.0055, 0.0081, 0.0061, 0.0088, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 16:25:54,425 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48423.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:25:58,986 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48428.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:26:01,390 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.229e+02 2.374e+02 2.963e+02 3.800e+02 1.337e+03, threshold=5.925e+02, percent-clipped=7.0 +2022-12-07 16:26:10,808 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48442.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:26:12,821 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48444.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:26:13,238 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.02 vs. limit=2.0 +2022-12-07 16:26:26,034 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48458.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:26:32,548 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48465.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 16:26:39,377 INFO [train.py:873] (2/4) Epoch 7, batch 3100, loss[loss=0.1764, simple_loss=0.1911, pruned_loss=0.08083, over 14661.00 frames. ], tot_loss[loss=0.1605, simple_loss=0.1792, pruned_loss=0.0709, over 1938529.60 frames. ], batch size: 33, lr: 1.14e-02, grad_scale: 8.0 +2022-12-07 16:26:49,161 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48484.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:26:56,201 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48492.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:27:15,587 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48513.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 16:27:31,120 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.348e+02 2.509e+02 3.187e+02 4.348e+02 7.597e+02, threshold=6.375e+02, percent-clipped=5.0 +2022-12-07 16:28:04,866 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48568.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 16:28:09,136 INFO [train.py:873] (2/4) Epoch 7, batch 3200, loss[loss=0.1606, simple_loss=0.1683, pruned_loss=0.07651, over 6969.00 frames. ], tot_loss[loss=0.1597, simple_loss=0.1789, pruned_loss=0.0702, over 1940471.03 frames. ], batch size: 100, lr: 1.14e-02, grad_scale: 8.0 +2022-12-07 16:29:02,497 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.098e+02 2.245e+02 2.769e+02 3.901e+02 6.467e+02, threshold=5.537e+02, percent-clipped=1.0 +2022-12-07 16:29:40,598 INFO [train.py:873] (2/4) Epoch 7, batch 3300, loss[loss=0.2265, simple_loss=0.1885, pruned_loss=0.1323, over 1163.00 frames. ], tot_loss[loss=0.1588, simple_loss=0.1785, pruned_loss=0.06955, over 1963922.70 frames. ], batch size: 100, lr: 1.13e-02, grad_scale: 8.0 +2022-12-07 16:29:46,901 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48680.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:30:30,616 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48728.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:30:32,954 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.098e+02 2.201e+02 2.834e+02 3.394e+02 6.106e+02, threshold=5.668e+02, percent-clipped=1.0 +2022-12-07 16:30:53,232 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48753.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:31:10,408 INFO [train.py:873] (2/4) Epoch 7, batch 3400, loss[loss=0.1654, simple_loss=0.1734, pruned_loss=0.07865, over 3858.00 frames. ], tot_loss[loss=0.1587, simple_loss=0.1781, pruned_loss=0.06962, over 1955301.84 frames. ], batch size: 100, lr: 1.13e-02, grad_scale: 8.0 +2022-12-07 16:31:13,823 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48776.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:31:16,472 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=48779.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:31:54,737 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48821.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:32:04,511 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.534e+01 2.454e+02 2.984e+02 3.725e+02 7.522e+02, threshold=5.968e+02, percent-clipped=2.0 +2022-12-07 16:32:09,951 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=48838.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:32:36,360 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48868.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 16:32:40,438 INFO [train.py:873] (2/4) Epoch 7, batch 3500, loss[loss=0.1802, simple_loss=0.19, pruned_loss=0.08517, over 12789.00 frames. ], tot_loss[loss=0.1593, simple_loss=0.1783, pruned_loss=0.07011, over 1948508.08 frames. ], batch size: 100, lr: 1.13e-02, grad_scale: 4.0 +2022-12-07 16:32:48,519 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48882.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:32:57,362 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-12-07 16:33:03,327 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=48899.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:33:15,112 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.74 vs. limit=2.0 +2022-12-07 16:33:18,174 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=48916.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 16:33:20,878 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5757, 1.4526, 3.5677, 1.4922, 3.4574, 3.5937, 2.5031, 3.8930], + device='cuda:2'), covar=tensor([0.0191, 0.2954, 0.0330, 0.2402, 0.0600, 0.0338, 0.0995, 0.0164], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0160, 0.0148, 0.0167, 0.0160, 0.0160, 0.0132, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:33:31,826 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.472e+02 2.610e+02 3.212e+02 4.326e+02 7.674e+02, threshold=6.425e+02, percent-clipped=4.0 +2022-12-07 16:33:49,162 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9978, 2.0713, 2.2555, 1.4718, 1.6631, 1.9982, 1.2300, 1.9176], + device='cuda:2'), covar=tensor([0.0936, 0.1249, 0.0637, 0.1994, 0.2862, 0.0911, 0.4644, 0.0896], + device='cuda:2'), in_proj_covar=tensor([0.0073, 0.0087, 0.0079, 0.0088, 0.0112, 0.0071, 0.0129, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:34:08,388 INFO [train.py:873] (2/4) Epoch 7, batch 3600, loss[loss=0.1483, simple_loss=0.1402, pruned_loss=0.07822, over 1240.00 frames. ], tot_loss[loss=0.16, simple_loss=0.1787, pruned_loss=0.07064, over 1967354.27 frames. ], batch size: 100, lr: 1.13e-02, grad_scale: 8.0 +2022-12-07 16:34:14,426 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=48980.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:34:27,456 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2241, 1.4705, 2.4773, 1.3821, 2.4964, 2.4096, 1.8646, 2.5894], + device='cuda:2'), covar=tensor([0.0229, 0.1916, 0.0317, 0.1613, 0.0320, 0.0547, 0.0906, 0.0218], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0159, 0.0147, 0.0167, 0.0160, 0.0159, 0.0131, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:34:35,658 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3955, 5.1279, 4.9325, 5.5239, 5.1553, 4.6194, 5.5575, 5.3395], + device='cuda:2'), covar=tensor([0.0733, 0.0486, 0.0577, 0.0510, 0.0546, 0.0343, 0.0497, 0.0713], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0105, 0.0121, 0.0125, 0.0122, 0.0096, 0.0136, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 16:34:53,956 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-12-07 16:34:58,402 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49028.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:35:01,629 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.848e+01 2.321e+02 2.995e+02 3.518e+02 8.417e+02, threshold=5.990e+02, percent-clipped=3.0 +2022-12-07 16:35:13,237 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3239, 1.2165, 1.1480, 1.0755, 1.1142, 0.5754, 1.5044, 1.5135], + device='cuda:2'), covar=tensor([0.1429, 0.1633, 0.0657, 0.1445, 0.1556, 0.0526, 0.0817, 0.0614], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0020, 0.0021, 0.0019, 0.0027, 0.0020, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.2352e-05, 9.2106e-05, 9.2068e-05, 9.4620e-05, 9.1484e-05, 1.1687e-04, + 9.7828e-05, 9.5003e-05], device='cuda:2') +2022-12-07 16:35:20,055 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49053.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:35:20,077 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7679, 1.8374, 1.9642, 1.9531, 1.7980, 2.1274, 1.8221, 0.8951], + device='cuda:2'), covar=tensor([0.1957, 0.0969, 0.1033, 0.0664, 0.1065, 0.0585, 0.1729, 0.3315], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0065, 0.0052, 0.0055, 0.0080, 0.0061, 0.0089, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 16:35:38,275 INFO [train.py:873] (2/4) Epoch 7, batch 3700, loss[loss=0.175, simple_loss=0.1926, pruned_loss=0.07872, over 12739.00 frames. ], tot_loss[loss=0.1598, simple_loss=0.1786, pruned_loss=0.07051, over 1932972.16 frames. ], batch size: 100, lr: 1.13e-02, grad_scale: 8.0 +2022-12-07 16:35:43,609 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49079.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:36:02,765 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49101.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:36:26,220 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49127.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:36:30,495 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.168e+02 2.323e+02 3.026e+02 3.847e+02 8.325e+02, threshold=6.052e+02, percent-clipped=4.0 +2022-12-07 16:37:07,567 INFO [train.py:873] (2/4) Epoch 7, batch 3800, loss[loss=0.1723, simple_loss=0.185, pruned_loss=0.07986, over 10332.00 frames. ], tot_loss[loss=0.1585, simple_loss=0.1779, pruned_loss=0.06949, over 1929075.48 frames. ], batch size: 100, lr: 1.13e-02, grad_scale: 8.0 +2022-12-07 16:37:11,107 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.46 vs. limit=5.0 +2022-12-07 16:37:11,421 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49177.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:37:26,390 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49194.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:38:01,123 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.147e+01 2.278e+02 2.831e+02 3.417e+02 7.148e+02, threshold=5.663e+02, percent-clipped=1.0 +2022-12-07 16:38:04,704 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=49236.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:38:08,203 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0352, 2.9308, 2.9256, 3.0956, 2.5273, 3.2515, 2.8479, 1.3835], + device='cuda:2'), covar=tensor([0.2274, 0.1908, 0.1563, 0.1051, 0.1029, 0.0645, 0.2110, 0.3668], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0067, 0.0053, 0.0055, 0.0081, 0.0062, 0.0089, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 16:38:37,955 INFO [train.py:873] (2/4) Epoch 7, batch 3900, loss[loss=0.158, simple_loss=0.1428, pruned_loss=0.08658, over 1275.00 frames. ], tot_loss[loss=0.1565, simple_loss=0.1765, pruned_loss=0.0683, over 1912388.96 frames. ], batch size: 100, lr: 1.13e-02, grad_scale: 8.0 +2022-12-07 16:38:59,151 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=49297.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:39:30,001 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.098e+02 2.072e+02 2.772e+02 3.353e+02 6.720e+02, threshold=5.544e+02, percent-clipped=1.0 +2022-12-07 16:40:05,910 INFO [train.py:873] (2/4) Epoch 7, batch 4000, loss[loss=0.1839, simple_loss=0.1622, pruned_loss=0.1028, over 1261.00 frames. ], tot_loss[loss=0.1561, simple_loss=0.1764, pruned_loss=0.06792, over 1962583.43 frames. ], batch size: 100, lr: 1.13e-02, grad_scale: 8.0 +2022-12-07 16:40:09,530 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.58 vs. limit=5.0 +2022-12-07 16:40:49,573 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=49422.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:40:58,792 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.814e+01 2.339e+02 2.890e+02 3.562e+02 6.238e+02, threshold=5.779e+02, percent-clipped=1.0 +2022-12-07 16:40:59,036 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=49432.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 16:41:04,261 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1931, 2.6756, 4.0231, 3.0164, 4.0308, 3.9883, 3.7502, 3.3227], + device='cuda:2'), covar=tensor([0.0540, 0.4038, 0.0908, 0.2311, 0.0813, 0.0810, 0.1986, 0.3196], + device='cuda:2'), in_proj_covar=tensor([0.0308, 0.0321, 0.0373, 0.0305, 0.0358, 0.0299, 0.0353, 0.0329], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:41:34,938 INFO [train.py:873] (2/4) Epoch 7, batch 4100, loss[loss=0.1801, simple_loss=0.1739, pruned_loss=0.09314, over 2611.00 frames. ], tot_loss[loss=0.1565, simple_loss=0.1767, pruned_loss=0.06813, over 1986056.28 frames. ], batch size: 100, lr: 1.13e-02, grad_scale: 8.0 +2022-12-07 16:41:38,588 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49477.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:41:43,969 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=49483.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:41:52,446 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=49493.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 16:41:53,177 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49494.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:42:20,789 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49525.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:42:26,656 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.128e+02 2.350e+02 2.777e+02 3.332e+02 5.900e+02, threshold=5.554e+02, percent-clipped=2.0 +2022-12-07 16:42:29,355 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7760, 2.7457, 2.0920, 2.1527, 2.6753, 2.7261, 2.7864, 2.7642], + device='cuda:2'), covar=tensor([0.1549, 0.0916, 0.3763, 0.4942, 0.1564, 0.1417, 0.1825, 0.1600], + device='cuda:2'), in_proj_covar=tensor([0.0325, 0.0231, 0.0389, 0.0491, 0.0289, 0.0357, 0.0350, 0.0314], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:42:35,642 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49542.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:42:58,369 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.53 vs. limit=5.0 +2022-12-07 16:43:03,206 INFO [train.py:873] (2/4) Epoch 7, batch 4200, loss[loss=0.1659, simple_loss=0.1779, pruned_loss=0.07699, over 9503.00 frames. ], tot_loss[loss=0.1599, simple_loss=0.1785, pruned_loss=0.07063, over 1955958.78 frames. ], batch size: 100, lr: 1.12e-02, grad_scale: 4.0 +2022-12-07 16:43:20,442 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49592.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:43:28,083 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.94 vs. limit=2.0 +2022-12-07 16:43:32,166 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5780, 3.3122, 2.9374, 2.1134, 3.1399, 3.1776, 3.6174, 2.8333], + device='cuda:2'), covar=tensor([0.0585, 0.2566, 0.1191, 0.2585, 0.0888, 0.0722, 0.0914, 0.1706], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0192, 0.0123, 0.0128, 0.0113, 0.0115, 0.0095, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 16:43:57,174 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.547e+02 2.513e+02 3.123e+02 4.078e+02 1.360e+03, threshold=6.245e+02, percent-clipped=14.0 +2022-12-07 16:44:28,197 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1493, 3.0579, 2.7214, 2.8366, 3.1058, 3.0588, 3.1579, 3.1390], + device='cuda:2'), covar=tensor([0.1029, 0.0602, 0.2264, 0.2843, 0.0795, 0.0947, 0.1094, 0.0932], + device='cuda:2'), in_proj_covar=tensor([0.0327, 0.0231, 0.0392, 0.0493, 0.0291, 0.0361, 0.0352, 0.0314], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:44:32,551 INFO [train.py:873] (2/4) Epoch 7, batch 4300, loss[loss=0.1523, simple_loss=0.1786, pruned_loss=0.06299, over 14395.00 frames. ], tot_loss[loss=0.1603, simple_loss=0.1791, pruned_loss=0.07079, over 1978610.88 frames. ], batch size: 41, lr: 1.12e-02, grad_scale: 4.0 +2022-12-07 16:45:25,734 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 2.563e+02 3.565e+02 4.633e+02 7.506e+02, threshold=7.130e+02, percent-clipped=5.0 +2022-12-07 16:45:56,327 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=49766.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:46:02,008 INFO [train.py:873] (2/4) Epoch 7, batch 4400, loss[loss=0.1462, simple_loss=0.1776, pruned_loss=0.05743, over 14281.00 frames. ], tot_loss[loss=0.1608, simple_loss=0.1796, pruned_loss=0.07105, over 1964639.98 frames. ], batch size: 44, lr: 1.12e-02, grad_scale: 8.0 +2022-12-07 16:46:06,965 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49778.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:46:15,370 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=49788.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 16:46:50,900 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=49827.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:46:55,944 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.417e+01 2.268e+02 2.875e+02 3.664e+02 5.837e+02, threshold=5.750e+02, percent-clipped=0.0 +2022-12-07 16:47:22,765 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2488, 1.2390, 1.3346, 1.1908, 1.4807, 0.6773, 1.6715, 1.4641], + device='cuda:2'), covar=tensor([0.0933, 0.1025, 0.0981, 0.1778, 0.1589, 0.0555, 0.0485, 0.0857], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0020, 0.0020, 0.0020, 0.0028, 0.0021, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.2977e-05, 9.3787e-05, 9.4120e-05, 9.5207e-05, 9.4529e-05, 1.2157e-04, + 1.0056e-04, 9.7176e-05], device='cuda:2') +2022-12-07 16:47:31,489 INFO [train.py:873] (2/4) Epoch 7, batch 4500, loss[loss=0.1541, simple_loss=0.1779, pruned_loss=0.06518, over 14269.00 frames. ], tot_loss[loss=0.1586, simple_loss=0.1782, pruned_loss=0.06955, over 1947828.48 frames. ], batch size: 46, lr: 1.12e-02, grad_scale: 8.0 +2022-12-07 16:47:48,516 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=49892.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:48:01,215 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7633, 0.7240, 0.6336, 0.6816, 0.8666, 0.0318, 0.7203, 0.7599], + device='cuda:2'), covar=tensor([0.0248, 0.0219, 0.0173, 0.0166, 0.0189, 0.0190, 0.0435, 0.0216], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0020, 0.0020, 0.0020, 0.0020, 0.0028, 0.0021, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.2583e-05, 9.2869e-05, 9.3321e-05, 9.3744e-05, 9.3745e-05, 1.2090e-04, + 9.9445e-05, 9.6949e-05], device='cuda:2') +2022-12-07 16:48:25,117 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.380e+02 2.362e+02 2.957e+02 3.544e+02 6.398e+02, threshold=5.915e+02, percent-clipped=1.0 +2022-12-07 16:48:31,734 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=49940.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:49:00,841 INFO [train.py:873] (2/4) Epoch 7, batch 4600, loss[loss=0.1468, simple_loss=0.1772, pruned_loss=0.05821, over 14278.00 frames. ], tot_loss[loss=0.1584, simple_loss=0.1781, pruned_loss=0.06939, over 1942198.27 frames. ], batch size: 76, lr: 1.12e-02, grad_scale: 8.0 +2022-12-07 16:49:23,595 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9431, 1.6076, 2.0168, 1.8229, 2.1334, 1.7927, 1.7893, 2.0152], + device='cuda:2'), covar=tensor([0.0297, 0.1202, 0.0143, 0.0342, 0.0208, 0.0455, 0.0187, 0.0268], + device='cuda:2'), in_proj_covar=tensor([0.0312, 0.0320, 0.0377, 0.0305, 0.0359, 0.0303, 0.0358, 0.0333], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:49:58,507 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.255e+02 2.301e+02 2.897e+02 3.880e+02 6.571e+02, threshold=5.793e+02, percent-clipped=2.0 +2022-12-07 16:50:19,689 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6107, 1.8728, 2.6276, 2.6813, 2.5686, 1.9107, 2.6359, 2.1934], + device='cuda:2'), covar=tensor([0.0149, 0.0414, 0.0209, 0.0167, 0.0190, 0.0611, 0.0118, 0.0366], + device='cuda:2'), in_proj_covar=tensor([0.0229, 0.0219, 0.0329, 0.0268, 0.0214, 0.0264, 0.0228, 0.0253], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 16:50:32,300 INFO [train.py:873] (2/4) Epoch 7, batch 4700, loss[loss=0.1734, simple_loss=0.1922, pruned_loss=0.0773, over 14207.00 frames. ], tot_loss[loss=0.1591, simple_loss=0.1781, pruned_loss=0.06999, over 1904866.76 frames. ], batch size: 89, lr: 1.12e-02, grad_scale: 4.0 +2022-12-07 16:50:37,339 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50078.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:50:40,423 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-12-07 16:50:45,852 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50088.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 16:51:15,691 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=50122.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:51:19,178 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50126.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:51:20,966 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50128.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:51:25,948 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.013e+02 2.271e+02 2.971e+02 3.641e+02 8.155e+02, threshold=5.943e+02, percent-clipped=3.0 +2022-12-07 16:51:27,662 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50136.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 16:51:36,204 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.77 vs. limit=5.0 +2022-12-07 16:52:00,691 INFO [train.py:873] (2/4) Epoch 7, batch 4800, loss[loss=0.1389, simple_loss=0.1597, pruned_loss=0.05905, over 14349.00 frames. ], tot_loss[loss=0.1584, simple_loss=0.1777, pruned_loss=0.06952, over 1947401.20 frames. ], batch size: 73, lr: 1.12e-02, grad_scale: 8.0 +2022-12-07 16:52:02,661 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3454, 1.6530, 1.6519, 1.2553, 1.5481, 1.0972, 1.1972, 1.1918], + device='cuda:2'), covar=tensor([0.0926, 0.0963, 0.0600, 0.1155, 0.0828, 0.0450, 0.0422, 0.0781], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0010, 0.0010, 0.0011, 0.0016, 0.0013, 0.0017], + device='cuda:2'), out_proj_covar=tensor([7.1819e-05, 7.8595e-05, 7.1490e-05, 7.3141e-05, 7.5858e-05, 1.0601e-04, + 8.9525e-05, 1.0361e-04], device='cuda:2') +2022-12-07 16:52:14,898 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50189.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:52:21,176 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1708, 1.8897, 2.1425, 2.3490, 2.0269, 1.8678, 2.2658, 2.1055], + device='cuda:2'), covar=tensor([0.0097, 0.0194, 0.0090, 0.0073, 0.0119, 0.0274, 0.0108, 0.0141], + device='cuda:2'), in_proj_covar=tensor([0.0229, 0.0217, 0.0327, 0.0268, 0.0211, 0.0262, 0.0226, 0.0254], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-07 16:52:22,008 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50197.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 16:52:25,708 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-07 16:52:54,459 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.360e+02 2.403e+02 3.099e+02 4.047e+02 7.554e+02, threshold=6.197e+02, percent-clipped=4.0 +2022-12-07 16:53:09,680 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2692, 2.2077, 3.1703, 2.5446, 3.1984, 3.1344, 3.0580, 2.5552], + device='cuda:2'), covar=tensor([0.0576, 0.2726, 0.0809, 0.1905, 0.0636, 0.0714, 0.1086, 0.2301], + device='cuda:2'), in_proj_covar=tensor([0.0308, 0.0320, 0.0375, 0.0303, 0.0360, 0.0300, 0.0350, 0.0331], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:53:15,913 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50258.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 16:53:29,361 INFO [train.py:873] (2/4) Epoch 7, batch 4900, loss[loss=0.2093, simple_loss=0.2149, pruned_loss=0.1018, over 11174.00 frames. ], tot_loss[loss=0.1576, simple_loss=0.1777, pruned_loss=0.0688, over 2030233.19 frames. ], batch size: 100, lr: 1.12e-02, grad_scale: 8.0 +2022-12-07 16:54:23,754 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.482e+02 2.354e+02 3.057e+02 3.894e+02 8.460e+02, threshold=6.115e+02, percent-clipped=5.0 +2022-12-07 16:54:28,234 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1863, 1.2971, 1.4514, 0.9808, 1.0123, 1.3245, 0.8338, 1.0373], + device='cuda:2'), covar=tensor([0.1222, 0.2292, 0.0614, 0.1553, 0.2396, 0.0565, 0.2064, 0.0981], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0086, 0.0077, 0.0088, 0.0110, 0.0071, 0.0129, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:54:58,365 INFO [train.py:873] (2/4) Epoch 7, batch 5000, loss[loss=0.1474, simple_loss=0.1774, pruned_loss=0.05876, over 14346.00 frames. ], tot_loss[loss=0.157, simple_loss=0.1773, pruned_loss=0.06835, over 1970968.42 frames. ], batch size: 55, lr: 1.12e-02, grad_scale: 8.0 +2022-12-07 16:55:28,262 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6824, 4.4718, 4.2133, 4.3384, 4.4258, 4.5417, 4.7246, 4.6954], + device='cuda:2'), covar=tensor([0.0906, 0.0480, 0.1630, 0.2532, 0.0604, 0.0717, 0.0836, 0.0906], + device='cuda:2'), in_proj_covar=tensor([0.0325, 0.0233, 0.0386, 0.0485, 0.0286, 0.0356, 0.0353, 0.0312], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:55:42,536 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50422.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:55:50,514 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2611, 1.2492, 1.3081, 1.1114, 0.9858, 0.8399, 1.0102, 0.7710], + device='cuda:2'), covar=tensor([0.0370, 0.0446, 0.0235, 0.0279, 0.0361, 0.0353, 0.0288, 0.0556], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0010, 0.0011, 0.0012, 0.0016, 0.0013, 0.0018], + device='cuda:2'), out_proj_covar=tensor([7.4011e-05, 8.0057e-05, 7.2044e-05, 7.4659e-05, 7.7014e-05, 1.0769e-04, + 9.1861e-05, 1.0494e-04], device='cuda:2') +2022-12-07 16:55:50,966 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-12-07 16:55:52,925 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 2.513e+02 3.076e+02 4.134e+02 7.861e+02, threshold=6.153e+02, percent-clipped=5.0 +2022-12-07 16:55:58,028 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3614, 1.8820, 2.3864, 2.4241, 2.2014, 1.8421, 2.4536, 1.9955], + device='cuda:2'), covar=tensor([0.0143, 0.0316, 0.0163, 0.0119, 0.0203, 0.0493, 0.0135, 0.0302], + device='cuda:2'), in_proj_covar=tensor([0.0232, 0.0219, 0.0329, 0.0270, 0.0214, 0.0264, 0.0228, 0.0254], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 16:56:23,424 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9789, 1.8978, 2.0397, 2.0222, 2.0523, 1.7556, 1.2373, 1.7826], + device='cuda:2'), covar=tensor([0.0428, 0.0406, 0.0459, 0.0311, 0.0363, 0.0964, 0.2030, 0.0391], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0146, 0.0128, 0.0123, 0.0176, 0.0122, 0.0151, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 16:56:24,881 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50470.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:56:27,487 INFO [train.py:873] (2/4) Epoch 7, batch 5100, loss[loss=0.1948, simple_loss=0.1878, pruned_loss=0.1009, over 9499.00 frames. ], tot_loss[loss=0.1564, simple_loss=0.1766, pruned_loss=0.06805, over 1961425.86 frames. ], batch size: 100, lr: 1.11e-02, grad_scale: 8.0 +2022-12-07 16:56:32,088 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2191, 1.2394, 1.4621, 0.9791, 0.9707, 1.3453, 0.8968, 1.1239], + device='cuda:2'), covar=tensor([0.1417, 0.2158, 0.0533, 0.1680, 0.2170, 0.0511, 0.1813, 0.0964], + device='cuda:2'), in_proj_covar=tensor([0.0072, 0.0084, 0.0076, 0.0087, 0.0107, 0.0070, 0.0127, 0.0075], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:56:37,633 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=50484.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:56:47,783 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.44 vs. limit=5.0 +2022-12-07 16:56:51,086 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9083, 1.2921, 2.8036, 2.4842, 2.6389, 2.7755, 2.0415, 2.7452], + device='cuda:2'), covar=tensor([0.0821, 0.1064, 0.0102, 0.0285, 0.0250, 0.0125, 0.0361, 0.0152], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0155, 0.0111, 0.0154, 0.0127, 0.0125, 0.0102, 0.0108], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 16:57:20,713 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.225e+02 2.452e+02 3.012e+02 3.839e+02 6.174e+02, threshold=6.024e+02, percent-clipped=1.0 +2022-12-07 16:57:30,254 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9941, 2.6039, 3.7219, 2.6830, 2.2265, 2.8301, 1.4943, 3.2301], + device='cuda:2'), covar=tensor([0.1111, 0.1753, 0.0774, 0.1875, 0.2704, 0.0980, 0.5569, 0.0873], + device='cuda:2'), in_proj_covar=tensor([0.0073, 0.0086, 0.0078, 0.0088, 0.0109, 0.0071, 0.0129, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:57:30,332 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1373, 3.5881, 2.8483, 4.4397, 4.1451, 4.2299, 3.5191, 2.9643], + device='cuda:2'), covar=tensor([0.0941, 0.1407, 0.4210, 0.0423, 0.0641, 0.1086, 0.1345, 0.3841], + device='cuda:2'), in_proj_covar=tensor([0.0240, 0.0301, 0.0281, 0.0207, 0.0270, 0.0268, 0.0252, 0.0266], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 16:57:32,029 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50546.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:57:35,509 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5863, 2.1747, 3.1174, 2.1667, 1.9537, 2.5290, 1.2529, 2.8464], + device='cuda:2'), covar=tensor([0.1479, 0.3154, 0.1125, 0.2697, 0.4029, 0.1531, 0.7945, 0.1153], + device='cuda:2'), in_proj_covar=tensor([0.0073, 0.0086, 0.0078, 0.0088, 0.0109, 0.0071, 0.0129, 0.0077], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:57:38,431 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=50553.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 16:57:55,459 INFO [train.py:873] (2/4) Epoch 7, batch 5200, loss[loss=0.1522, simple_loss=0.1815, pruned_loss=0.06149, over 14301.00 frames. ], tot_loss[loss=0.1573, simple_loss=0.1775, pruned_loss=0.06854, over 1999322.15 frames. ], batch size: 60, lr: 1.11e-02, grad_scale: 8.0 +2022-12-07 16:58:26,211 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50607.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 16:58:35,862 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0180, 3.2852, 2.9432, 2.9706, 2.0258, 3.3336, 2.9235, 1.2414], + device='cuda:2'), covar=tensor([0.2323, 0.0755, 0.1694, 0.1569, 0.1287, 0.0416, 0.1430, 0.3370], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0063, 0.0051, 0.0054, 0.0079, 0.0059, 0.0088, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 16:58:49,801 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.178e+02 2.234e+02 2.881e+02 3.626e+02 9.172e+02, threshold=5.763e+02, percent-clipped=1.0 +2022-12-07 16:58:56,059 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7715, 1.9093, 3.7528, 2.5570, 3.6212, 1.7250, 2.8164, 3.5779], + device='cuda:2'), covar=tensor([0.0611, 0.4780, 0.0652, 0.7197, 0.0471, 0.4157, 0.1352, 0.0365], + device='cuda:2'), in_proj_covar=tensor([0.0223, 0.0236, 0.0178, 0.0322, 0.0195, 0.0242, 0.0227, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 16:59:15,534 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7738, 3.4889, 3.2881, 3.3305, 3.6180, 3.6199, 3.7086, 3.6988], + device='cuda:2'), covar=tensor([0.0929, 0.0711, 0.2198, 0.2961, 0.0808, 0.0824, 0.1129, 0.0939], + device='cuda:2'), in_proj_covar=tensor([0.0331, 0.0239, 0.0395, 0.0499, 0.0296, 0.0369, 0.0366, 0.0317], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 16:59:24,244 INFO [train.py:873] (2/4) Epoch 7, batch 5300, loss[loss=0.1295, simple_loss=0.1592, pruned_loss=0.0499, over 14340.00 frames. ], tot_loss[loss=0.1562, simple_loss=0.1769, pruned_loss=0.06779, over 2008580.87 frames. ], batch size: 55, lr: 1.11e-02, grad_scale: 8.0 +2022-12-07 17:00:16,768 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.527e+02 2.441e+02 3.172e+02 3.955e+02 6.877e+02, threshold=6.344e+02, percent-clipped=8.0 +2022-12-07 17:00:49,949 INFO [train.py:873] (2/4) Epoch 7, batch 5400, loss[loss=0.1545, simple_loss=0.1634, pruned_loss=0.07282, over 3876.00 frames. ], tot_loss[loss=0.157, simple_loss=0.1773, pruned_loss=0.06841, over 1989198.66 frames. ], batch size: 100, lr: 1.11e-02, grad_scale: 8.0 +2022-12-07 17:00:58,301 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50782.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:00:59,861 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50784.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:01:20,924 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50808.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 17:01:41,868 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50832.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:01:43,629 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.515e+02 2.469e+02 3.143e+02 3.850e+02 7.689e+02, threshold=6.287e+02, percent-clipped=2.0 +2022-12-07 17:01:51,274 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50843.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:01:54,178 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1201, 1.8525, 2.0934, 1.4466, 1.8270, 2.1017, 2.3139, 1.8756], + device='cuda:2'), covar=tensor([0.0819, 0.1021, 0.1076, 0.2091, 0.1053, 0.0725, 0.0520, 0.1621], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0196, 0.0125, 0.0128, 0.0117, 0.0117, 0.0098, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 17:01:59,981 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=50853.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 17:02:05,413 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8661, 1.2692, 2.0615, 1.1937, 2.0086, 2.0461, 1.6205, 2.0355], + device='cuda:2'), covar=tensor([0.0310, 0.1715, 0.0332, 0.1704, 0.0399, 0.0387, 0.1010, 0.0418], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0158, 0.0149, 0.0169, 0.0160, 0.0161, 0.0131, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:02:08,391 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-12-07 17:02:14,108 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=50869.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 17:02:17,733 INFO [train.py:873] (2/4) Epoch 7, batch 5500, loss[loss=0.1154, simple_loss=0.1543, pruned_loss=0.03828, over 14179.00 frames. ], tot_loss[loss=0.1562, simple_loss=0.1767, pruned_loss=0.06784, over 1961052.92 frames. ], batch size: 35, lr: 1.11e-02, grad_scale: 8.0 +2022-12-07 17:02:35,627 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-12-07 17:02:42,256 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=50901.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:02:43,090 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=50902.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:03:10,733 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.245e+02 2.288e+02 2.905e+02 3.483e+02 6.290e+02, threshold=5.809e+02, percent-clipped=1.0 +2022-12-07 17:03:21,358 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=50946.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:03:33,128 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-07 17:03:44,815 INFO [train.py:873] (2/4) Epoch 7, batch 5600, loss[loss=0.1299, simple_loss=0.1638, pruned_loss=0.04798, over 14307.00 frames. ], tot_loss[loss=0.1571, simple_loss=0.1769, pruned_loss=0.06862, over 1901787.19 frames. ], batch size: 39, lr: 1.11e-02, grad_scale: 8.0 +2022-12-07 17:03:57,631 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1929, 1.8705, 2.2231, 2.3075, 1.9516, 1.9525, 2.2635, 2.1628], + device='cuda:2'), covar=tensor([0.0097, 0.0206, 0.0099, 0.0077, 0.0150, 0.0268, 0.0122, 0.0137], + device='cuda:2'), in_proj_covar=tensor([0.0230, 0.0220, 0.0327, 0.0270, 0.0215, 0.0261, 0.0229, 0.0253], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:04:15,893 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51007.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:04:18,209 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2315, 2.9671, 2.8764, 2.1878, 2.6805, 2.9968, 3.3546, 2.6071], + device='cuda:2'), covar=tensor([0.0777, 0.2024, 0.1189, 0.2236, 0.1246, 0.0804, 0.0630, 0.1599], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0195, 0.0125, 0.0128, 0.0118, 0.0118, 0.0097, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 17:04:23,041 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7089, 0.7085, 0.6401, 0.8016, 0.7554, 0.1191, 0.6831, 0.7635], + device='cuda:2'), covar=tensor([0.0130, 0.0339, 0.0182, 0.0240, 0.0133, 0.0127, 0.0502, 0.0312], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0021, 0.0021, 0.0020, 0.0021, 0.0029, 0.0021, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.4980e-05, 9.8992e-05, 9.7363e-05, 9.6661e-05, 9.8414e-05, 1.2635e-04, + 1.0173e-04, 1.0070e-04], device='cuda:2') +2022-12-07 17:04:38,633 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.538e+02 2.390e+02 3.229e+02 4.054e+02 7.897e+02, threshold=6.457e+02, percent-clipped=5.0 +2022-12-07 17:04:47,537 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6171, 3.3796, 3.3435, 3.7043, 3.2273, 3.1568, 3.6771, 3.6173], + device='cuda:2'), covar=tensor([0.0773, 0.0804, 0.0858, 0.0747, 0.0939, 0.0733, 0.0815, 0.0840], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0111, 0.0124, 0.0127, 0.0127, 0.0098, 0.0139, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 17:05:03,784 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51062.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:05:13,367 INFO [train.py:873] (2/4) Epoch 7, batch 5700, loss[loss=0.1692, simple_loss=0.1908, pruned_loss=0.07378, over 14114.00 frames. ], tot_loss[loss=0.1567, simple_loss=0.1766, pruned_loss=0.06841, over 1907652.80 frames. ], batch size: 29, lr: 1.11e-02, grad_scale: 8.0 +2022-12-07 17:05:14,034 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.42 vs. limit=5.0 +2022-12-07 17:05:56,762 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51123.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:05:58,090 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-12-07 17:06:05,989 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.357e+02 2.301e+02 2.940e+02 3.681e+02 9.913e+02, threshold=5.881e+02, percent-clipped=3.0 +2022-12-07 17:06:09,463 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51138.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:06:31,991 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51164.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:06:39,065 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51172.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:06:39,721 INFO [train.py:873] (2/4) Epoch 7, batch 5800, loss[loss=0.1874, simple_loss=0.194, pruned_loss=0.09042, over 9503.00 frames. ], tot_loss[loss=0.1574, simple_loss=0.1772, pruned_loss=0.06883, over 1917938.41 frames. ], batch size: 100, lr: 1.11e-02, grad_scale: 8.0 +2022-12-07 17:06:42,748 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.36 vs. limit=5.0 +2022-12-07 17:07:05,816 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51202.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:07:21,518 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.5525, 5.2089, 5.0323, 5.5285, 5.1309, 4.4412, 5.5421, 5.4789], + device='cuda:2'), covar=tensor([0.0478, 0.0485, 0.0537, 0.0425, 0.0536, 0.0419, 0.0478, 0.0510], + device='cuda:2'), in_proj_covar=tensor([0.0120, 0.0112, 0.0126, 0.0129, 0.0128, 0.0100, 0.0141, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 17:07:24,901 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7130, 2.6126, 1.9216, 2.7415, 2.5650, 2.6148, 2.3759, 2.1340], + device='cuda:2'), covar=tensor([0.0646, 0.1159, 0.2961, 0.0374, 0.0913, 0.0676, 0.1338, 0.2727], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0292, 0.0275, 0.0204, 0.0269, 0.0267, 0.0249, 0.0264], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:07:33,322 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51233.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:07:33,900 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.270e+02 2.265e+02 2.888e+02 3.967e+02 8.048e+02, threshold=5.777e+02, percent-clipped=2.0 +2022-12-07 17:07:48,092 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51250.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:07:57,588 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.79 vs. limit=2.0 +2022-12-07 17:08:08,702 INFO [train.py:873] (2/4) Epoch 7, batch 5900, loss[loss=0.1254, simple_loss=0.1631, pruned_loss=0.04387, over 14392.00 frames. ], tot_loss[loss=0.1555, simple_loss=0.1759, pruned_loss=0.06751, over 1917527.04 frames. ], batch size: 41, lr: 1.11e-02, grad_scale: 8.0 +2022-12-07 17:08:34,039 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51302.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:08:43,252 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.50 vs. limit=5.0 +2022-12-07 17:09:01,665 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.784e+01 2.421e+02 2.907e+02 3.542e+02 5.776e+02, threshold=5.815e+02, percent-clipped=0.0 +2022-12-07 17:09:35,644 INFO [train.py:873] (2/4) Epoch 7, batch 6000, loss[loss=0.1406, simple_loss=0.1746, pruned_loss=0.05328, over 14585.00 frames. ], tot_loss[loss=0.1548, simple_loss=0.1756, pruned_loss=0.06702, over 1930369.05 frames. ], batch size: 22, lr: 1.10e-02, grad_scale: 8.0 +2022-12-07 17:09:35,645 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 17:09:56,739 INFO [train.py:905] (2/4) Epoch 7, validation: loss=0.1227, simple_loss=0.1653, pruned_loss=0.04007, over 857387.00 frames. +2022-12-07 17:09:56,739 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 17:10:04,644 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51382.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:10:13,365 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51392.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:10:14,607 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-12-07 17:10:36,270 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51418.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:10:49,690 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.039e+02 2.553e+02 3.284e+02 4.114e+02 1.252e+03, threshold=6.568e+02, percent-clipped=8.0 +2022-12-07 17:10:53,244 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51438.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:10:57,513 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51443.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:11:06,384 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51453.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:11:15,136 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2592, 1.2626, 1.3005, 1.0688, 1.2623, 0.5559, 1.1135, 0.8361], + device='cuda:2'), covar=tensor([0.0214, 0.0337, 0.0284, 0.0428, 0.0398, 0.0373, 0.0243, 0.0530], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0011, 0.0011, 0.0012, 0.0018, 0.0014, 0.0019], + device='cuda:2'), out_proj_covar=tensor([7.8156e-05, 8.5211e-05, 7.5876e-05, 7.9163e-05, 8.0348e-05, 1.1675e-04, + 9.5503e-05, 1.1195e-04], device='cuda:2') +2022-12-07 17:11:15,932 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51464.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 17:11:21,685 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0682, 3.5355, 2.6683, 4.2259, 4.0134, 4.0849, 3.6036, 2.6058], + device='cuda:2'), covar=tensor([0.0588, 0.1131, 0.3475, 0.0436, 0.0606, 0.1256, 0.0847, 0.3491], + device='cuda:2'), in_proj_covar=tensor([0.0239, 0.0297, 0.0278, 0.0207, 0.0271, 0.0269, 0.0251, 0.0268], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:11:23,461 INFO [train.py:873] (2/4) Epoch 7, batch 6100, loss[loss=0.1484, simple_loss=0.142, pruned_loss=0.07735, over 1284.00 frames. ], tot_loss[loss=0.1559, simple_loss=0.1764, pruned_loss=0.06773, over 1964103.71 frames. ], batch size: 100, lr: 1.10e-02, grad_scale: 8.0 +2022-12-07 17:11:34,637 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51486.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:11:42,571 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51495.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:11:46,009 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0034, 1.9278, 1.6428, 2.0598, 1.8453, 2.0125, 1.8460, 1.7312], + device='cuda:2'), covar=tensor([0.0548, 0.0871, 0.1635, 0.0346, 0.0504, 0.0318, 0.1107, 0.0649], + device='cuda:2'), in_proj_covar=tensor([0.0241, 0.0299, 0.0280, 0.0208, 0.0273, 0.0270, 0.0252, 0.0268], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:11:57,337 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51512.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:12:05,715 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9491, 1.8370, 1.5765, 2.0434, 1.8165, 1.9502, 1.8474, 1.7286], + device='cuda:2'), covar=tensor([0.0593, 0.0945, 0.1781, 0.0362, 0.0557, 0.0415, 0.1269, 0.0724], + device='cuda:2'), in_proj_covar=tensor([0.0241, 0.0299, 0.0281, 0.0208, 0.0272, 0.0270, 0.0252, 0.0269], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:12:10,906 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51528.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 17:12:16,320 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.479e+02 2.483e+02 3.170e+02 3.695e+02 8.405e+02, threshold=6.340e+02, percent-clipped=2.0 +2022-12-07 17:12:29,868 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-07 17:12:35,784 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51556.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:12:38,579 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-12-07 17:12:50,515 INFO [train.py:873] (2/4) Epoch 7, batch 6200, loss[loss=0.1364, simple_loss=0.152, pruned_loss=0.06042, over 4968.00 frames. ], tot_loss[loss=0.1566, simple_loss=0.1767, pruned_loss=0.06827, over 1903506.34 frames. ], batch size: 100, lr: 1.10e-02, grad_scale: 8.0 +2022-12-07 17:12:51,209 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.45 vs. limit=5.0 +2022-12-07 17:13:16,634 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51602.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:13:21,548 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7999, 3.6474, 3.5932, 4.0030, 3.7272, 3.5081, 3.9903, 3.4001], + device='cuda:2'), covar=tensor([0.0687, 0.1154, 0.0423, 0.0454, 0.0811, 0.1355, 0.0604, 0.0523], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0235, 0.0163, 0.0154, 0.0159, 0.0127, 0.0238, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 17:13:22,491 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7271, 3.3351, 2.9102, 2.2483, 2.9736, 3.3814, 3.8360, 2.6957], + device='cuda:2'), covar=tensor([0.0763, 0.2807, 0.1460, 0.2625, 0.1317, 0.0691, 0.0696, 0.2042], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0196, 0.0124, 0.0128, 0.0115, 0.0119, 0.0097, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 17:13:33,016 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-12-07 17:13:44,671 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.366e+02 2.329e+02 2.979e+02 3.715e+02 8.886e+02, threshold=5.958e+02, percent-clipped=2.0 +2022-12-07 17:13:45,611 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1374, 2.9366, 2.7117, 2.8159, 3.0919, 3.0523, 3.1315, 3.1088], + device='cuda:2'), covar=tensor([0.1137, 0.0810, 0.2378, 0.2965, 0.0859, 0.0934, 0.1161, 0.1017], + device='cuda:2'), in_proj_covar=tensor([0.0333, 0.0235, 0.0398, 0.0496, 0.0288, 0.0369, 0.0357, 0.0312], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:13:48,396 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7247, 3.1904, 4.2507, 3.4270, 4.4550, 4.2760, 4.2537, 3.9294], + device='cuda:2'), covar=tensor([0.0503, 0.3122, 0.1259, 0.1949, 0.0762, 0.0785, 0.1904, 0.1721], + device='cuda:2'), in_proj_covar=tensor([0.0303, 0.0326, 0.0378, 0.0300, 0.0364, 0.0295, 0.0350, 0.0327], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:13:58,954 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51650.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:14:19,073 INFO [train.py:873] (2/4) Epoch 7, batch 6300, loss[loss=0.1472, simple_loss=0.1761, pruned_loss=0.05912, over 14222.00 frames. ], tot_loss[loss=0.1562, simple_loss=0.1766, pruned_loss=0.06788, over 1909859.87 frames. ], batch size: 37, lr: 1.10e-02, grad_scale: 8.0 +2022-12-07 17:14:58,583 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51718.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:15:12,417 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.367e+02 2.206e+02 2.744e+02 3.241e+02 7.232e+02, threshold=5.488e+02, percent-clipped=1.0 +2022-12-07 17:15:16,222 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51738.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:15:19,754 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9886, 3.0025, 3.8647, 2.5504, 2.5822, 2.8684, 2.1265, 3.3458], + device='cuda:2'), covar=tensor([0.1413, 0.1089, 0.0958, 0.2749, 0.2518, 0.1211, 0.4405, 0.1346], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0086, 0.0081, 0.0089, 0.0110, 0.0072, 0.0131, 0.0080], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:15:23,529 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-07 17:15:24,004 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=51747.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:15:24,668 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51748.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:15:40,151 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51766.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:15:46,558 INFO [train.py:873] (2/4) Epoch 7, batch 6400, loss[loss=0.1385, simple_loss=0.1698, pruned_loss=0.05361, over 14521.00 frames. ], tot_loss[loss=0.1554, simple_loss=0.1762, pruned_loss=0.06725, over 1908708.81 frames. ], batch size: 43, lr: 1.10e-02, grad_scale: 8.0 +2022-12-07 17:16:17,607 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=51808.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:16:34,989 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=51828.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:16:39,968 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.142e+02 2.371e+02 3.096e+02 4.074e+02 1.030e+03, threshold=6.192e+02, percent-clipped=3.0 +2022-12-07 17:16:55,656 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=51851.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:17:14,510 INFO [train.py:873] (2/4) Epoch 7, batch 6500, loss[loss=0.1376, simple_loss=0.1649, pruned_loss=0.05512, over 13997.00 frames. ], tot_loss[loss=0.1554, simple_loss=0.1763, pruned_loss=0.06727, over 1945945.91 frames. ], batch size: 19, lr: 1.10e-02, grad_scale: 8.0 +2022-12-07 17:17:17,358 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=51876.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 17:17:59,144 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8130, 1.4481, 3.7660, 3.6078, 3.6932, 3.8802, 3.2291, 3.8719], + device='cuda:2'), covar=tensor([0.1249, 0.1422, 0.0099, 0.0184, 0.0172, 0.0102, 0.0215, 0.0119], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0157, 0.0113, 0.0155, 0.0129, 0.0127, 0.0105, 0.0111], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:18:07,971 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.565e+02 2.306e+02 2.865e+02 3.658e+02 7.175e+02, threshold=5.730e+02, percent-clipped=1.0 +2022-12-07 17:18:25,037 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.28 vs. limit=2.0 +2022-12-07 17:18:42,427 INFO [train.py:873] (2/4) Epoch 7, batch 6600, loss[loss=0.1598, simple_loss=0.1836, pruned_loss=0.06802, over 14272.00 frames. ], tot_loss[loss=0.1548, simple_loss=0.1756, pruned_loss=0.06698, over 1892360.29 frames. ], batch size: 57, lr: 1.10e-02, grad_scale: 8.0 +2022-12-07 17:19:03,758 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1429, 2.8149, 2.6893, 1.9904, 2.4679, 2.7487, 3.3063, 2.5293], + device='cuda:2'), covar=tensor([0.0691, 0.1675, 0.1431, 0.2598, 0.1226, 0.0678, 0.0478, 0.1721], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0195, 0.0123, 0.0129, 0.0118, 0.0118, 0.0097, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 17:19:16,545 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0948, 1.6733, 1.6658, 1.7330, 1.5771, 1.7739, 1.3796, 1.1788], + device='cuda:2'), covar=tensor([0.1909, 0.0779, 0.0327, 0.0380, 0.1306, 0.0367, 0.1841, 0.2070], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0063, 0.0053, 0.0055, 0.0081, 0.0058, 0.0086, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 17:19:35,920 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.186e+02 2.181e+02 2.663e+02 3.409e+02 8.463e+02, threshold=5.326e+02, percent-clipped=3.0 +2022-12-07 17:19:39,792 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52038.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:19:41,547 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52040.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:19:42,914 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.78 vs. limit=2.0 +2022-12-07 17:19:48,537 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52048.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:19:59,203 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4825, 1.2992, 1.3906, 1.3013, 1.4848, 1.1767, 1.6259, 1.3813], + device='cuda:2'), covar=tensor([0.1564, 0.1741, 0.0537, 0.0822, 0.0845, 0.0445, 0.0348, 0.0775], + device='cuda:2'), in_proj_covar=tensor([0.0011, 0.0012, 0.0011, 0.0011, 0.0011, 0.0017, 0.0014, 0.0018], + device='cuda:2'), out_proj_covar=tensor([7.7377e-05, 8.2969e-05, 7.5421e-05, 7.6800e-05, 7.7753e-05, 1.1295e-04, + 9.4753e-05, 1.0880e-04], device='cuda:2') +2022-12-07 17:20:10,105 INFO [train.py:873] (2/4) Epoch 7, batch 6700, loss[loss=0.1378, simple_loss=0.1619, pruned_loss=0.05688, over 14586.00 frames. ], tot_loss[loss=0.1554, simple_loss=0.176, pruned_loss=0.06733, over 1935286.56 frames. ], batch size: 34, lr: 1.10e-02, grad_scale: 16.0 +2022-12-07 17:20:21,602 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52086.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:20:30,295 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52096.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:20:35,004 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52101.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:20:36,551 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52103.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:21:03,676 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.373e+02 2.435e+02 3.169e+02 4.506e+02 1.155e+03, threshold=6.339e+02, percent-clipped=9.0 +2022-12-07 17:21:18,745 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52151.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:21:26,930 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-12-07 17:21:38,017 INFO [train.py:873] (2/4) Epoch 7, batch 6800, loss[loss=0.1438, simple_loss=0.1664, pruned_loss=0.06061, over 14032.00 frames. ], tot_loss[loss=0.1555, simple_loss=0.176, pruned_loss=0.06752, over 1970908.43 frames. ], batch size: 26, lr: 1.10e-02, grad_scale: 16.0 +2022-12-07 17:21:55,393 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52193.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:22:00,769 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52199.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:22:00,830 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9215, 1.5764, 2.7231, 2.4982, 2.6909, 2.7077, 2.0619, 2.7471], + device='cuda:2'), covar=tensor([0.0908, 0.0943, 0.0112, 0.0306, 0.0254, 0.0115, 0.0399, 0.0165], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0154, 0.0113, 0.0155, 0.0129, 0.0127, 0.0104, 0.0110], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:22:17,049 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4297, 3.1009, 3.1509, 3.4259, 3.2470, 3.3779, 3.4945, 2.8457], + device='cuda:2'), covar=tensor([0.0487, 0.1217, 0.0473, 0.0616, 0.0851, 0.0435, 0.0671, 0.0626], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0240, 0.0167, 0.0158, 0.0160, 0.0133, 0.0245, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 17:22:17,690 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.79 vs. limit=2.0 +2022-12-07 17:22:20,979 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7643, 3.1873, 4.4145, 3.2455, 4.4390, 4.3497, 4.2485, 3.9168], + device='cuda:2'), covar=tensor([0.0553, 0.2854, 0.1144, 0.2235, 0.0896, 0.0834, 0.1893, 0.2001], + device='cuda:2'), in_proj_covar=tensor([0.0317, 0.0333, 0.0387, 0.0312, 0.0375, 0.0304, 0.0359, 0.0334], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:22:32,532 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.294e+02 2.590e+02 3.090e+02 3.890e+02 7.235e+02, threshold=6.180e+02, percent-clipped=2.0 +2022-12-07 17:22:49,447 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52254.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:23:05,760 INFO [train.py:873] (2/4) Epoch 7, batch 6900, loss[loss=0.156, simple_loss=0.1644, pruned_loss=0.07382, over 7768.00 frames. ], tot_loss[loss=0.1564, simple_loss=0.1763, pruned_loss=0.06826, over 1933978.96 frames. ], batch size: 100, lr: 1.10e-02, grad_scale: 8.0 +2022-12-07 17:23:15,340 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9470, 1.8313, 2.1641, 1.4687, 1.5290, 1.9106, 1.1320, 1.7295], + device='cuda:2'), covar=tensor([0.1558, 0.2973, 0.1051, 0.3459, 0.4380, 0.1180, 0.7069, 0.1664], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0087, 0.0081, 0.0089, 0.0110, 0.0073, 0.0128, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:23:29,663 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0087, 2.0094, 2.0616, 2.0321, 2.0374, 1.7292, 1.2778, 1.8523], + device='cuda:2'), covar=tensor([0.0387, 0.0295, 0.0437, 0.0299, 0.0347, 0.0805, 0.1860, 0.0344], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0145, 0.0127, 0.0122, 0.0176, 0.0119, 0.0151, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:23:40,095 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52312.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:24:00,341 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.245e+02 2.451e+02 3.077e+02 3.642e+02 5.911e+02, threshold=6.154e+02, percent-clipped=0.0 +2022-12-07 17:24:03,283 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9761, 2.0704, 1.8873, 2.1123, 1.6869, 1.9689, 2.0219, 2.0623], + device='cuda:2'), covar=tensor([0.1009, 0.1143, 0.1226, 0.0903, 0.1558, 0.0856, 0.1217, 0.1034], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0113, 0.0125, 0.0130, 0.0127, 0.0100, 0.0141, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 17:24:19,542 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8701, 3.4740, 3.0417, 2.3841, 3.1911, 3.4294, 3.8707, 2.9033], + device='cuda:2'), covar=tensor([0.0418, 0.1726, 0.0931, 0.1733, 0.0850, 0.0396, 0.0536, 0.1240], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0188, 0.0121, 0.0124, 0.0114, 0.0117, 0.0098, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0006, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 17:24:33,250 INFO [train.py:873] (2/4) Epoch 7, batch 7000, loss[loss=0.1587, simple_loss=0.1855, pruned_loss=0.0659, over 14355.00 frames. ], tot_loss[loss=0.1564, simple_loss=0.1762, pruned_loss=0.06832, over 1922181.95 frames. ], batch size: 73, lr: 1.09e-02, grad_scale: 8.0 +2022-12-07 17:24:33,449 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52373.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:24:53,448 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52396.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:25:00,113 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52403.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:25:27,918 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.282e+02 2.412e+02 3.030e+02 3.769e+02 8.602e+02, threshold=6.059e+02, percent-clipped=2.0 +2022-12-07 17:25:41,254 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8668, 1.5763, 2.1421, 1.6914, 1.9172, 1.4574, 1.6987, 1.6896], + device='cuda:2'), covar=tensor([0.1564, 0.1995, 0.0228, 0.1600, 0.0470, 0.1220, 0.0839, 0.0536], + device='cuda:2'), in_proj_covar=tensor([0.0228, 0.0235, 0.0178, 0.0318, 0.0197, 0.0246, 0.0227, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:25:41,995 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52451.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:26:01,214 INFO [train.py:873] (2/4) Epoch 7, batch 7100, loss[loss=0.1702, simple_loss=0.1846, pruned_loss=0.07795, over 14469.00 frames. ], tot_loss[loss=0.1564, simple_loss=0.1765, pruned_loss=0.06811, over 1977912.15 frames. ], batch size: 51, lr: 1.09e-02, grad_scale: 8.0 +2022-12-07 17:26:56,578 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.032e+02 2.463e+02 3.010e+02 3.918e+02 9.338e+02, threshold=6.019e+02, percent-clipped=5.0 +2022-12-07 17:27:04,568 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6394, 4.4308, 4.1143, 4.2442, 4.4251, 4.5605, 4.6993, 4.6509], + device='cuda:2'), covar=tensor([0.0821, 0.0463, 0.1738, 0.2481, 0.0658, 0.0594, 0.0861, 0.0814], + device='cuda:2'), in_proj_covar=tensor([0.0327, 0.0234, 0.0391, 0.0488, 0.0282, 0.0358, 0.0355, 0.0308], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:27:08,058 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52549.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:27:13,428 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52555.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:27:29,108 INFO [train.py:873] (2/4) Epoch 7, batch 7200, loss[loss=0.1685, simple_loss=0.1847, pruned_loss=0.07618, over 14298.00 frames. ], tot_loss[loss=0.1558, simple_loss=0.1763, pruned_loss=0.0677, over 1986869.72 frames. ], batch size: 66, lr: 1.09e-02, grad_scale: 8.0 +2022-12-07 17:28:07,084 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52616.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 17:28:24,359 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.010e+02 2.308e+02 2.919e+02 3.605e+02 5.197e+02, threshold=5.838e+02, percent-clipped=0.0 +2022-12-07 17:28:29,967 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-07 17:28:52,705 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52668.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:28:56,838 INFO [train.py:873] (2/4) Epoch 7, batch 7300, loss[loss=0.1554, simple_loss=0.1761, pruned_loss=0.06738, over 14296.00 frames. ], tot_loss[loss=0.1544, simple_loss=0.1752, pruned_loss=0.06682, over 1980128.15 frames. ], batch size: 69, lr: 1.09e-02, grad_scale: 8.0 +2022-12-07 17:29:15,386 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-12-07 17:29:16,727 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52696.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:29:51,340 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.524e+02 2.341e+02 3.110e+02 4.071e+02 1.233e+03, threshold=6.219e+02, percent-clipped=5.0 +2022-12-07 17:29:58,694 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52744.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:30:05,614 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=52752.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:30:13,267 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7149, 2.9404, 4.3550, 3.2268, 4.4245, 4.3520, 4.1415, 3.9151], + device='cuda:2'), covar=tensor([0.0532, 0.3053, 0.0945, 0.1998, 0.0916, 0.0701, 0.1691, 0.1962], + device='cuda:2'), in_proj_covar=tensor([0.0314, 0.0327, 0.0385, 0.0310, 0.0375, 0.0305, 0.0354, 0.0332], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:30:16,578 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.67 vs. limit=5.0 +2022-12-07 17:30:24,392 INFO [train.py:873] (2/4) Epoch 7, batch 7400, loss[loss=0.2987, simple_loss=0.2366, pruned_loss=0.1804, over 1270.00 frames. ], tot_loss[loss=0.1542, simple_loss=0.1753, pruned_loss=0.06649, over 2056763.15 frames. ], batch size: 100, lr: 1.09e-02, grad_scale: 8.0 +2022-12-07 17:31:00,105 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=52813.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:31:02,257 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-12-07 17:31:19,608 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.081e+01 2.208e+02 2.890e+02 3.954e+02 8.452e+02, threshold=5.779e+02, percent-clipped=3.0 +2022-12-07 17:31:22,355 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3020, 4.0313, 3.7123, 3.8333, 4.0658, 4.1494, 4.2675, 4.2390], + device='cuda:2'), covar=tensor([0.0756, 0.0510, 0.2059, 0.2685, 0.0721, 0.0729, 0.1027, 0.0795], + device='cuda:2'), in_proj_covar=tensor([0.0318, 0.0231, 0.0388, 0.0482, 0.0280, 0.0354, 0.0349, 0.0304], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:31:31,469 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52849.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:31:52,181 INFO [train.py:873] (2/4) Epoch 7, batch 7500, loss[loss=0.1389, simple_loss=0.1651, pruned_loss=0.05635, over 14286.00 frames. ], tot_loss[loss=0.155, simple_loss=0.1757, pruned_loss=0.06719, over 2046209.46 frames. ], batch size: 31, lr: 1.09e-02, grad_scale: 8.0 +2022-12-07 17:32:13,103 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=52897.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:32:23,973 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0678, 1.6483, 2.0549, 1.3977, 1.6927, 2.0102, 1.9618, 1.7777], + device='cuda:2'), covar=tensor([0.1091, 0.1177, 0.0967, 0.1833, 0.1552, 0.0799, 0.0626, 0.1943], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0194, 0.0124, 0.0127, 0.0118, 0.0120, 0.0101, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 17:32:24,666 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=52911.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 17:32:33,943 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4097, 0.9909, 1.4090, 0.9203, 1.0441, 1.3780, 1.1119, 1.0528], + device='cuda:2'), covar=tensor([0.0457, 0.0948, 0.0388, 0.0695, 0.0699, 0.0564, 0.0430, 0.1075], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0192, 0.0122, 0.0125, 0.0117, 0.0119, 0.0100, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 17:33:18,934 INFO [train.py:873] (2/4) Epoch 8, batch 0, loss[loss=0.2071, simple_loss=0.221, pruned_loss=0.09664, over 14324.00 frames. ], tot_loss[loss=0.2071, simple_loss=0.221, pruned_loss=0.09664, over 14324.00 frames. ], batch size: 55, lr: 1.03e-02, grad_scale: 8.0 +2022-12-07 17:33:18,935 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 17:33:22,596 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9054, 4.4172, 4.6284, 4.8182, 4.5201, 4.2391, 4.7712, 4.5716], + device='cuda:2'), covar=tensor([0.0272, 0.0657, 0.0311, 0.0356, 0.0641, 0.0221, 0.0653, 0.0329], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0240, 0.0164, 0.0155, 0.0157, 0.0128, 0.0244, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 17:33:26,262 INFO [train.py:905] (2/4) Epoch 8, validation: loss=0.1282, simple_loss=0.1716, pruned_loss=0.04242, over 857387.00 frames. +2022-12-07 17:33:26,262 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 17:33:27,144 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.973e+01 1.673e+02 2.697e+02 3.774e+02 7.847e+02, threshold=5.395e+02, percent-clipped=3.0 +2022-12-07 17:33:53,671 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9338, 4.9081, 3.9167, 4.1833, 4.4546, 4.9218, 5.0527, 5.0642], + device='cuda:2'), covar=tensor([0.1073, 0.0496, 0.3081, 0.3628, 0.1196, 0.0819, 0.1096, 0.1010], + device='cuda:2'), in_proj_covar=tensor([0.0325, 0.0237, 0.0398, 0.0492, 0.0286, 0.0361, 0.0357, 0.0311], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:33:56,328 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=52968.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:34:38,737 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53016.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:34:55,995 INFO [train.py:873] (2/4) Epoch 8, batch 100, loss[loss=0.1224, simple_loss=0.1625, pruned_loss=0.04118, over 13941.00 frames. ], tot_loss[loss=0.1534, simple_loss=0.1753, pruned_loss=0.06576, over 869349.89 frames. ], batch size: 23, lr: 1.02e-02, grad_scale: 8.0 +2022-12-07 17:34:56,725 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.667e+02 2.496e+02 3.096e+02 4.127e+02 8.461e+02, threshold=6.192e+02, percent-clipped=10.0 +2022-12-07 17:35:06,713 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3197, 4.8013, 4.6965, 5.2629, 4.8175, 4.4110, 5.2283, 4.2956], + device='cuda:2'), covar=tensor([0.0262, 0.0906, 0.0294, 0.0379, 0.0814, 0.0426, 0.0542, 0.0491], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0240, 0.0164, 0.0155, 0.0158, 0.0128, 0.0243, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 17:35:13,505 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53056.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:35:19,599 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1774, 1.2931, 1.4780, 0.8823, 0.8907, 1.3161, 0.8360, 1.1987], + device='cuda:2'), covar=tensor([0.1682, 0.2533, 0.0546, 0.2506, 0.3100, 0.0912, 0.2060, 0.0963], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0090, 0.0081, 0.0091, 0.0111, 0.0074, 0.0133, 0.0080], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:35:44,529 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 17:35:59,472 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53108.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:36:07,683 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53117.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:36:10,206 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3773, 4.1279, 4.0885, 4.4266, 3.9350, 3.6904, 4.4362, 4.3567], + device='cuda:2'), covar=tensor([0.0659, 0.0650, 0.0698, 0.0605, 0.0744, 0.0696, 0.0578, 0.0660], + device='cuda:2'), in_proj_covar=tensor([0.0118, 0.0110, 0.0120, 0.0126, 0.0127, 0.0097, 0.0137, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 17:36:23,090 INFO [train.py:873] (2/4) Epoch 8, batch 200, loss[loss=0.1755, simple_loss=0.1808, pruned_loss=0.08514, over 7811.00 frames. ], tot_loss[loss=0.1515, simple_loss=0.1735, pruned_loss=0.06478, over 1281468.68 frames. ], batch size: 100, lr: 1.02e-02, grad_scale: 8.0 +2022-12-07 17:36:23,899 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.126e+02 2.183e+02 2.918e+02 3.622e+02 1.019e+03, threshold=5.836e+02, percent-clipped=2.0 +2022-12-07 17:37:30,636 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53211.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 17:37:52,052 INFO [train.py:873] (2/4) Epoch 8, batch 300, loss[loss=0.1659, simple_loss=0.1791, pruned_loss=0.07636, over 10347.00 frames. ], tot_loss[loss=0.1526, simple_loss=0.1741, pruned_loss=0.06552, over 1535485.20 frames. ], batch size: 100, lr: 1.02e-02, grad_scale: 8.0 +2022-12-07 17:37:52,977 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.915e+01 2.253e+02 2.827e+02 3.791e+02 8.401e+02, threshold=5.654e+02, percent-clipped=6.0 +2022-12-07 17:37:57,595 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53241.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:37:59,276 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4597, 2.2622, 2.4177, 1.5989, 2.0924, 2.3550, 2.5852, 2.1819], + device='cuda:2'), covar=tensor([0.0606, 0.1221, 0.0956, 0.1893, 0.0941, 0.0639, 0.0533, 0.1373], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0192, 0.0126, 0.0127, 0.0116, 0.0120, 0.0101, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 17:38:01,111 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3481, 2.0296, 4.7792, 4.4096, 4.4001, 4.9465, 4.7324, 4.9548], + device='cuda:2'), covar=tensor([0.1180, 0.1162, 0.0066, 0.0123, 0.0117, 0.0073, 0.0063, 0.0078], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0155, 0.0115, 0.0159, 0.0130, 0.0128, 0.0107, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:38:10,726 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.39 vs. limit=5.0 +2022-12-07 17:38:12,805 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53259.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:38:50,110 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53302.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:38:51,846 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53304.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:39:02,086 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53315.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:39:18,815 INFO [train.py:873] (2/4) Epoch 8, batch 400, loss[loss=0.1914, simple_loss=0.1658, pruned_loss=0.1085, over 1292.00 frames. ], tot_loss[loss=0.1513, simple_loss=0.1733, pruned_loss=0.06464, over 1698370.33 frames. ], batch size: 100, lr: 1.02e-02, grad_scale: 8.0 +2022-12-07 17:39:19,681 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.217e+02 2.324e+02 2.854e+02 3.563e+02 7.977e+02, threshold=5.709e+02, percent-clipped=8.0 +2022-12-07 17:39:42,907 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9609, 3.2238, 2.3967, 4.2449, 3.9138, 3.9304, 3.1763, 2.6190], + device='cuda:2'), covar=tensor([0.0841, 0.1889, 0.5467, 0.0503, 0.0931, 0.1791, 0.1592, 0.4820], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0296, 0.0281, 0.0211, 0.0273, 0.0268, 0.0250, 0.0268], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:39:43,846 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8433, 2.2754, 3.6783, 3.8623, 3.7854, 2.3911, 3.8549, 2.9786], + device='cuda:2'), covar=tensor([0.0226, 0.0581, 0.0627, 0.0235, 0.0223, 0.0907, 0.0199, 0.0549], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0222, 0.0332, 0.0273, 0.0220, 0.0268, 0.0238, 0.0257], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:39:45,334 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53365.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:39:47,001 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=53367.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:39:49,755 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1153, 1.2547, 1.3321, 1.0537, 1.1296, 0.7279, 1.4733, 1.4123], + device='cuda:2'), covar=tensor([0.2964, 0.1736, 0.0960, 0.1992, 0.2886, 0.0938, 0.1269, 0.1647], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0022, 0.0022, 0.0023, 0.0032, 0.0023, 0.0023], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 17:39:54,798 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53376.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:40:23,180 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53408.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:40:26,500 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53412.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:40:40,714 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=53428.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 17:40:43,192 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8447, 2.0351, 2.8456, 2.9077, 2.8388, 2.1001, 2.9411, 2.2390], + device='cuda:2'), covar=tensor([0.0170, 0.0416, 0.0264, 0.0167, 0.0192, 0.0586, 0.0134, 0.0415], + device='cuda:2'), in_proj_covar=tensor([0.0241, 0.0224, 0.0333, 0.0275, 0.0222, 0.0270, 0.0239, 0.0260], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:40:47,041 INFO [train.py:873] (2/4) Epoch 8, batch 500, loss[loss=0.1817, simple_loss=0.1706, pruned_loss=0.0964, over 3895.00 frames. ], tot_loss[loss=0.1528, simple_loss=0.1745, pruned_loss=0.06559, over 1810055.46 frames. ], batch size: 100, lr: 1.02e-02, grad_scale: 4.0 +2022-12-07 17:40:48,831 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.378e+02 2.493e+02 3.208e+02 4.073e+02 9.438e+02, threshold=6.416e+02, percent-clipped=6.0 +2022-12-07 17:40:55,832 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1390, 1.8825, 3.1612, 2.2636, 3.0928, 1.8491, 2.4118, 3.1539], + device='cuda:2'), covar=tensor([0.0715, 0.4781, 0.0439, 0.6858, 0.0696, 0.3783, 0.1578, 0.0436], + device='cuda:2'), in_proj_covar=tensor([0.0226, 0.0236, 0.0180, 0.0317, 0.0200, 0.0242, 0.0231, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:41:05,151 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53456.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:41:08,416 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9507, 1.6565, 3.9782, 3.7042, 3.7509, 4.0862, 3.4943, 4.0582], + device='cuda:2'), covar=tensor([0.1270, 0.1370, 0.0128, 0.0214, 0.0191, 0.0121, 0.0220, 0.0132], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0154, 0.0115, 0.0158, 0.0131, 0.0128, 0.0106, 0.0113], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:42:11,063 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.37 vs. limit=5.0 +2022-12-07 17:42:13,824 INFO [train.py:873] (2/4) Epoch 8, batch 600, loss[loss=0.1363, simple_loss=0.1668, pruned_loss=0.05295, over 14255.00 frames. ], tot_loss[loss=0.1532, simple_loss=0.1741, pruned_loss=0.06613, over 1795577.48 frames. ], batch size: 37, lr: 1.02e-02, grad_scale: 4.0 +2022-12-07 17:42:15,675 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.499e+01 2.137e+02 2.538e+02 3.360e+02 7.193e+02, threshold=5.075e+02, percent-clipped=1.0 +2022-12-07 17:42:18,394 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0119, 1.0487, 1.7481, 1.4389, 1.1427, 0.9956, 1.1762, 0.7687], + device='cuda:2'), covar=tensor([0.1160, 0.1453, 0.0508, 0.0588, 0.0963, 0.0521, 0.0601, 0.1336], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0011, 0.0011, 0.0012, 0.0018, 0.0014, 0.0018], + device='cuda:2'), out_proj_covar=tensor([8.2049e-05, 8.7644e-05, 7.8608e-05, 8.1624e-05, 8.3194e-05, 1.1854e-04, + 1.0130e-04, 1.1331e-04], device='cuda:2') +2022-12-07 17:42:42,551 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9396, 2.6975, 3.5997, 2.5253, 2.2627, 2.8240, 1.3443, 3.0485], + device='cuda:2'), covar=tensor([0.1572, 0.1382, 0.0764, 0.1877, 0.2337, 0.1067, 0.4829, 0.1105], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0084, 0.0077, 0.0086, 0.0107, 0.0072, 0.0125, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:43:08,697 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53597.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:43:14,894 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0443, 2.0818, 1.9073, 2.1615, 1.7993, 1.9705, 2.0606, 2.1193], + device='cuda:2'), covar=tensor([0.0875, 0.1060, 0.1094, 0.0830, 0.1321, 0.0844, 0.1048, 0.0831], + device='cuda:2'), in_proj_covar=tensor([0.0119, 0.0111, 0.0121, 0.0126, 0.0126, 0.0097, 0.0137, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 17:43:39,734 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2658, 1.3305, 1.4750, 0.8689, 0.8980, 1.3835, 0.7402, 1.2290], + device='cuda:2'), covar=tensor([0.1476, 0.2492, 0.0709, 0.2484, 0.3362, 0.0632, 0.1921, 0.0925], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0084, 0.0078, 0.0086, 0.0107, 0.0072, 0.0126, 0.0076], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:43:42,307 INFO [train.py:873] (2/4) Epoch 8, batch 700, loss[loss=0.2035, simple_loss=0.1869, pruned_loss=0.11, over 3871.00 frames. ], tot_loss[loss=0.1535, simple_loss=0.1742, pruned_loss=0.06642, over 1881554.85 frames. ], batch size: 100, lr: 1.02e-02, grad_scale: 4.0 +2022-12-07 17:43:44,307 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.331e+02 2.400e+02 3.018e+02 4.076e+02 1.039e+03, threshold=6.035e+02, percent-clipped=12.0 +2022-12-07 17:44:04,154 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53660.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:44:13,796 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53671.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:44:49,912 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53712.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:44:53,161 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5605, 1.5839, 2.7419, 1.3396, 2.8143, 2.7673, 2.0487, 2.8721], + device='cuda:2'), covar=tensor([0.0261, 0.2259, 0.0308, 0.2067, 0.0327, 0.0431, 0.0960, 0.0207], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0159, 0.0152, 0.0167, 0.0162, 0.0162, 0.0131, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:44:54,648 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.30 vs. limit=5.0 +2022-12-07 17:44:59,945 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=53723.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 17:45:10,197 INFO [train.py:873] (2/4) Epoch 8, batch 800, loss[loss=0.1695, simple_loss=0.1825, pruned_loss=0.07829, over 6887.00 frames. ], tot_loss[loss=0.1538, simple_loss=0.1742, pruned_loss=0.06665, over 1850218.21 frames. ], batch size: 100, lr: 1.02e-02, grad_scale: 8.0 +2022-12-07 17:45:11,875 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.469e+01 2.335e+02 2.764e+02 3.237e+02 6.593e+02, threshold=5.529e+02, percent-clipped=1.0 +2022-12-07 17:45:32,809 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53760.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:45:43,305 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3783, 1.3756, 3.4002, 1.3806, 3.2558, 3.4988, 2.1125, 3.6955], + device='cuda:2'), covar=tensor([0.0215, 0.3161, 0.0331, 0.2423, 0.0758, 0.0343, 0.1024, 0.0157], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0160, 0.0151, 0.0166, 0.0162, 0.0162, 0.0131, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:46:24,246 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.97 vs. limit=2.0 +2022-12-07 17:46:38,789 INFO [train.py:873] (2/4) Epoch 8, batch 900, loss[loss=0.1664, simple_loss=0.1716, pruned_loss=0.08053, over 4919.00 frames. ], tot_loss[loss=0.1514, simple_loss=0.1734, pruned_loss=0.06474, over 1948365.82 frames. ], batch size: 100, lr: 1.02e-02, grad_scale: 8.0 +2022-12-07 17:46:40,850 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.375e+02 2.164e+02 2.734e+02 3.492e+02 7.190e+02, threshold=5.467e+02, percent-clipped=1.0 +2022-12-07 17:47:32,855 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53897.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:47:46,353 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.76 vs. limit=5.0 +2022-12-07 17:48:05,623 INFO [train.py:873] (2/4) Epoch 8, batch 1000, loss[loss=0.1588, simple_loss=0.1765, pruned_loss=0.07058, over 14286.00 frames. ], tot_loss[loss=0.1513, simple_loss=0.1726, pruned_loss=0.06498, over 1888549.54 frames. ], batch size: 39, lr: 1.02e-02, grad_scale: 8.0 +2022-12-07 17:48:07,331 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.511e+01 2.177e+02 2.724e+02 3.766e+02 8.147e+02, threshold=5.449e+02, percent-clipped=5.0 +2022-12-07 17:48:14,326 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=53945.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:48:27,797 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53960.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:48:37,350 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=53971.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:49:09,286 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=54008.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:49:18,699 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=54019.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:49:22,110 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=54023.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:49:32,903 INFO [train.py:873] (2/4) Epoch 8, batch 1100, loss[loss=0.1504, simple_loss=0.1715, pruned_loss=0.06461, over 6945.00 frames. ], tot_loss[loss=0.1519, simple_loss=0.1734, pruned_loss=0.06518, over 1979092.96 frames. ], batch size: 100, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 17:49:33,918 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.79 vs. limit=5.0 +2022-12-07 17:49:34,931 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.169e+02 2.125e+02 2.772e+02 3.608e+02 1.248e+03, threshold=5.543e+02, percent-clipped=5.0 +2022-12-07 17:49:39,309 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0176, 2.0648, 1.9098, 2.1377, 1.7103, 1.8910, 2.0504, 2.1009], + device='cuda:2'), covar=tensor([0.0887, 0.0927, 0.1149, 0.0783, 0.1276, 0.0765, 0.1078, 0.0904], + device='cuda:2'), in_proj_covar=tensor([0.0121, 0.0112, 0.0124, 0.0128, 0.0128, 0.0098, 0.0141, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 17:50:04,034 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=54071.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 17:50:59,597 INFO [train.py:873] (2/4) Epoch 8, batch 1200, loss[loss=0.1633, simple_loss=0.1929, pruned_loss=0.06685, over 14228.00 frames. ], tot_loss[loss=0.1537, simple_loss=0.1746, pruned_loss=0.06642, over 1964895.68 frames. ], batch size: 46, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 17:51:01,572 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3579, 3.0521, 2.3425, 3.4792, 3.1313, 3.3331, 2.8665, 2.4778], + device='cuda:2'), covar=tensor([0.0669, 0.1445, 0.3441, 0.0423, 0.0809, 0.0989, 0.1243, 0.3537], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0298, 0.0278, 0.0213, 0.0278, 0.0272, 0.0250, 0.0269], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:51:02,200 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.095e+02 2.508e+02 3.088e+02 3.577e+02 7.086e+02, threshold=6.177e+02, percent-clipped=4.0 +2022-12-07 17:51:21,112 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.47 vs. limit=5.0 +2022-12-07 17:51:34,071 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2196, 2.2161, 3.0216, 2.3461, 3.1147, 3.0164, 2.8778, 2.5732], + device='cuda:2'), covar=tensor([0.0583, 0.2680, 0.0790, 0.1990, 0.0654, 0.0748, 0.1054, 0.2132], + device='cuda:2'), in_proj_covar=tensor([0.0317, 0.0323, 0.0381, 0.0308, 0.0371, 0.0303, 0.0354, 0.0330], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:51:58,780 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9502, 1.4051, 2.9102, 2.6870, 2.8681, 2.9219, 2.1819, 2.9523], + device='cuda:2'), covar=tensor([0.0965, 0.1191, 0.0111, 0.0291, 0.0246, 0.0122, 0.0359, 0.0143], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0156, 0.0115, 0.0158, 0.0132, 0.0128, 0.0106, 0.0112], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 17:52:21,151 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5049, 3.2206, 3.1240, 3.4378, 3.2895, 3.4216, 3.5105, 2.9162], + device='cuda:2'), covar=tensor([0.0428, 0.1041, 0.0468, 0.0538, 0.0822, 0.0399, 0.0620, 0.0592], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0241, 0.0165, 0.0155, 0.0162, 0.0130, 0.0241, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 17:52:27,388 INFO [train.py:873] (2/4) Epoch 8, batch 1300, loss[loss=0.2054, simple_loss=0.2013, pruned_loss=0.1047, over 8584.00 frames. ], tot_loss[loss=0.1529, simple_loss=0.174, pruned_loss=0.06589, over 1932540.39 frames. ], batch size: 100, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 17:52:30,351 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.247e+02 2.196e+02 2.858e+02 3.785e+02 8.723e+02, threshold=5.716e+02, percent-clipped=2.0 +2022-12-07 17:52:40,827 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1659, 2.4946, 2.3613, 2.6054, 1.9339, 2.7719, 2.2572, 1.0887], + device='cuda:2'), covar=tensor([0.3284, 0.0791, 0.1590, 0.0776, 0.1476, 0.0551, 0.1766, 0.4173], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0064, 0.0053, 0.0054, 0.0080, 0.0060, 0.0086, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 17:53:24,684 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3434, 1.1922, 1.0142, 1.0564, 1.2194, 0.6094, 1.1788, 1.4269], + device='cuda:2'), covar=tensor([0.0989, 0.1031, 0.1224, 0.1533, 0.1401, 0.0982, 0.0774, 0.1122], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0022, 0.0023, 0.0021, 0.0022, 0.0032, 0.0022, 0.0022], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 17:53:38,682 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7335, 4.4019, 4.3219, 4.8114, 4.3060, 3.8832, 4.7852, 4.6223], + device='cuda:2'), covar=tensor([0.0639, 0.0652, 0.0763, 0.0537, 0.0774, 0.0616, 0.0597, 0.0740], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0112, 0.0125, 0.0130, 0.0129, 0.0100, 0.0141, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 17:53:53,234 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7078, 1.2393, 3.6133, 1.3060, 3.5894, 3.7224, 2.5192, 4.0324], + device='cuda:2'), covar=tensor([0.0203, 0.3034, 0.0343, 0.2444, 0.0473, 0.0348, 0.0848, 0.0139], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0159, 0.0150, 0.0167, 0.0163, 0.0162, 0.0133, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:53:54,867 INFO [train.py:873] (2/4) Epoch 8, batch 1400, loss[loss=0.1849, simple_loss=0.1942, pruned_loss=0.08782, over 8605.00 frames. ], tot_loss[loss=0.1527, simple_loss=0.1742, pruned_loss=0.06566, over 1967132.14 frames. ], batch size: 100, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 17:53:57,392 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.247e+02 2.292e+02 2.799e+02 3.377e+02 6.597e+02, threshold=5.599e+02, percent-clipped=2.0 +2022-12-07 17:54:25,616 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8822, 1.6262, 1.9689, 1.7477, 2.0981, 1.8134, 1.7058, 1.8867], + device='cuda:2'), covar=tensor([0.0460, 0.1155, 0.0166, 0.0374, 0.0225, 0.0444, 0.0163, 0.0326], + device='cuda:2'), in_proj_covar=tensor([0.0316, 0.0323, 0.0380, 0.0307, 0.0372, 0.0301, 0.0355, 0.0328], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:54:39,803 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.61 vs. limit=5.0 +2022-12-07 17:55:22,060 INFO [train.py:873] (2/4) Epoch 8, batch 1500, loss[loss=0.137, simple_loss=0.1694, pruned_loss=0.05228, over 14511.00 frames. ], tot_loss[loss=0.1523, simple_loss=0.174, pruned_loss=0.06536, over 1993437.33 frames. ], batch size: 49, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 17:55:25,015 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.978e+01 2.195e+02 2.660e+02 3.596e+02 7.442e+02, threshold=5.321e+02, percent-clipped=3.0 +2022-12-07 17:55:37,261 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9556, 1.8625, 3.1108, 2.2793, 2.9942, 1.9081, 2.5128, 3.0094], + device='cuda:2'), covar=tensor([0.0714, 0.4340, 0.0408, 0.5318, 0.0648, 0.3163, 0.1250, 0.0445], + device='cuda:2'), in_proj_covar=tensor([0.0226, 0.0232, 0.0177, 0.0310, 0.0201, 0.0235, 0.0224, 0.0183], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 17:56:05,607 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8487, 2.6859, 2.7022, 2.8581, 2.7777, 2.7551, 2.9819, 2.4217], + device='cuda:2'), covar=tensor([0.0721, 0.1250, 0.0521, 0.0624, 0.0852, 0.0583, 0.0707, 0.0628], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0241, 0.0166, 0.0157, 0.0162, 0.0131, 0.0243, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 17:56:18,963 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3170, 2.7325, 3.8986, 2.9817, 3.9809, 3.9288, 3.9038, 3.3736], + device='cuda:2'), covar=tensor([0.0500, 0.2937, 0.0954, 0.2052, 0.0832, 0.0723, 0.1508, 0.2104], + device='cuda:2'), in_proj_covar=tensor([0.0321, 0.0326, 0.0383, 0.0311, 0.0377, 0.0306, 0.0357, 0.0331], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 17:56:49,480 INFO [train.py:873] (2/4) Epoch 8, batch 1600, loss[loss=0.1776, simple_loss=0.1935, pruned_loss=0.08089, over 13925.00 frames. ], tot_loss[loss=0.1533, simple_loss=0.1744, pruned_loss=0.06617, over 1968405.45 frames. ], batch size: 23, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 17:56:51,890 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.201e+02 2.581e+02 3.079e+02 3.798e+02 6.375e+02, threshold=6.158e+02, percent-clipped=8.0 +2022-12-07 17:57:07,549 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-12-07 17:57:39,647 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.16 vs. limit=2.0 +2022-12-07 17:58:17,001 INFO [train.py:873] (2/4) Epoch 8, batch 1700, loss[loss=0.173, simple_loss=0.182, pruned_loss=0.08201, over 8648.00 frames. ], tot_loss[loss=0.1531, simple_loss=0.1745, pruned_loss=0.06586, over 2008427.77 frames. ], batch size: 100, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 17:58:19,791 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.211e+02 2.375e+02 2.929e+02 3.612e+02 8.453e+02, threshold=5.858e+02, percent-clipped=6.0 +2022-12-07 17:58:42,687 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=54664.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 17:58:43,513 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3965, 1.7862, 1.8163, 1.8732, 1.6948, 1.9209, 1.5599, 1.1020], + device='cuda:2'), covar=tensor([0.1511, 0.0543, 0.0281, 0.0378, 0.0884, 0.0486, 0.1276, 0.2359], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0063, 0.0054, 0.0055, 0.0080, 0.0060, 0.0085, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 17:58:56,776 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1267, 1.9388, 2.0622, 2.1198, 2.0373, 2.0520, 2.1839, 1.8304], + device='cuda:2'), covar=tensor([0.0800, 0.1244, 0.0587, 0.0720, 0.0836, 0.0642, 0.0807, 0.0669], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0239, 0.0165, 0.0156, 0.0163, 0.0129, 0.0243, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 17:59:36,553 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=54725.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 17:59:44,437 INFO [train.py:873] (2/4) Epoch 8, batch 1800, loss[loss=0.1775, simple_loss=0.1903, pruned_loss=0.08234, over 13558.00 frames. ], tot_loss[loss=0.1517, simple_loss=0.1733, pruned_loss=0.0651, over 1920497.73 frames. ], batch size: 100, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 17:59:46,925 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.019e+02 2.304e+02 2.792e+02 3.452e+02 5.919e+02, threshold=5.584e+02, percent-clipped=1.0 +2022-12-07 18:00:06,645 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1536, 1.1040, 1.1403, 0.9783, 1.0161, 0.8497, 0.9473, 0.7238], + device='cuda:2'), covar=tensor([0.0262, 0.0360, 0.0309, 0.0307, 0.0258, 0.0448, 0.0350, 0.0699], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0011, 0.0012, 0.0012, 0.0018, 0.0015, 0.0019], + device='cuda:2'), out_proj_covar=tensor([8.3580e-05, 9.0876e-05, 8.1277e-05, 8.4964e-05, 8.3831e-05, 1.2304e-04, + 1.0357e-04, 1.1980e-04], device='cuda:2') +2022-12-07 18:01:11,092 INFO [train.py:873] (2/4) Epoch 8, batch 1900, loss[loss=0.1636, simple_loss=0.1804, pruned_loss=0.07343, over 14487.00 frames. ], tot_loss[loss=0.1533, simple_loss=0.1741, pruned_loss=0.06621, over 1923727.13 frames. ], batch size: 49, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 18:01:13,579 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.876e+01 2.431e+02 3.063e+02 3.945e+02 8.424e+02, threshold=6.126e+02, percent-clipped=6.0 +2022-12-07 18:01:19,281 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.67 vs. limit=2.0 +2022-12-07 18:01:27,289 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0766, 3.1457, 2.9331, 3.1865, 2.3927, 3.3464, 2.8021, 1.4622], + device='cuda:2'), covar=tensor([0.2197, 0.0913, 0.1209, 0.0775, 0.1119, 0.0491, 0.1390, 0.3288], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0065, 0.0055, 0.0056, 0.0084, 0.0061, 0.0089, 0.0101], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 18:01:35,132 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=54862.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:01:59,636 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8892, 4.7322, 4.4410, 5.0527, 4.5008, 4.2996, 5.0257, 4.8684], + device='cuda:2'), covar=tensor([0.0692, 0.0603, 0.0690, 0.0520, 0.0609, 0.0433, 0.0575, 0.0667], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0113, 0.0127, 0.0129, 0.0131, 0.0100, 0.0144, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 18:02:29,217 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=54923.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 18:02:39,005 INFO [train.py:873] (2/4) Epoch 8, batch 2000, loss[loss=0.1372, simple_loss=0.1707, pruned_loss=0.05187, over 14162.00 frames. ], tot_loss[loss=0.1532, simple_loss=0.1742, pruned_loss=0.06608, over 1897804.44 frames. ], batch size: 35, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 18:02:40,915 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6761, 2.4921, 4.7056, 3.1328, 4.4723, 2.1534, 3.3169, 4.4460], + device='cuda:2'), covar=tensor([0.0353, 0.4909, 0.0343, 0.8225, 0.0483, 0.4211, 0.1287, 0.0277], + device='cuda:2'), in_proj_covar=tensor([0.0229, 0.0236, 0.0180, 0.0313, 0.0204, 0.0239, 0.0224, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:02:41,554 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.941e+01 2.269e+02 2.935e+02 3.791e+02 7.498e+02, threshold=5.870e+02, percent-clipped=3.0 +2022-12-07 18:03:02,302 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2073, 1.2139, 1.1238, 1.0050, 1.3618, 0.6776, 1.2319, 1.2921], + device='cuda:2'), covar=tensor([0.0891, 0.0589, 0.0510, 0.0641, 0.0487, 0.0766, 0.0495, 0.0525], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0021, 0.0021, 0.0019, 0.0021, 0.0030, 0.0020, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.9794e-05, 1.0048e-04, 1.0086e-04, 9.6445e-05, 1.0052e-04, 1.3043e-04, + 1.0293e-04, 1.0311e-04], device='cuda:2') +2022-12-07 18:03:57,232 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=55020.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 18:04:09,837 INFO [train.py:873] (2/4) Epoch 8, batch 2100, loss[loss=0.1555, simple_loss=0.1821, pruned_loss=0.06451, over 14476.00 frames. ], tot_loss[loss=0.1533, simple_loss=0.1742, pruned_loss=0.06619, over 1921368.09 frames. ], batch size: 51, lr: 1.01e-02, grad_scale: 8.0 +2022-12-07 18:04:12,640 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.250e+02 2.381e+02 2.883e+02 3.565e+02 6.243e+02, threshold=5.765e+02, percent-clipped=1.0 +2022-12-07 18:04:52,980 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-07 18:05:16,617 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.64 vs. limit=2.0 +2022-12-07 18:05:37,628 INFO [train.py:873] (2/4) Epoch 8, batch 2200, loss[loss=0.1659, simple_loss=0.1699, pruned_loss=0.08093, over 4963.00 frames. ], tot_loss[loss=0.1535, simple_loss=0.1744, pruned_loss=0.06623, over 2013220.76 frames. ], batch size: 100, lr: 1.00e-02, grad_scale: 8.0 +2022-12-07 18:05:39,867 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.445e+02 2.363e+02 3.011e+02 4.055e+02 7.336e+02, threshold=6.021e+02, percent-clipped=10.0 +2022-12-07 18:05:48,535 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=55148.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:06:43,456 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=55209.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:06:44,990 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=55211.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:06:51,031 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=55218.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 18:07:05,472 INFO [train.py:873] (2/4) Epoch 8, batch 2300, loss[loss=0.1706, simple_loss=0.1601, pruned_loss=0.09057, over 2573.00 frames. ], tot_loss[loss=0.1518, simple_loss=0.1733, pruned_loss=0.06517, over 1973077.02 frames. ], batch size: 100, lr: 1.00e-02, grad_scale: 8.0 +2022-12-07 18:07:08,045 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.599e+02 2.336e+02 2.949e+02 3.747e+02 7.724e+02, threshold=5.898e+02, percent-clipped=2.0 +2022-12-07 18:07:39,195 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=55272.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:08:21,838 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=55320.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 18:08:27,470 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7646, 2.0069, 2.8426, 2.8873, 2.8130, 2.0875, 2.8887, 2.1227], + device='cuda:2'), covar=tensor([0.0192, 0.0500, 0.0301, 0.0226, 0.0216, 0.0645, 0.0178, 0.0498], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0223, 0.0341, 0.0281, 0.0225, 0.0275, 0.0245, 0.0260], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:08:34,797 INFO [train.py:873] (2/4) Epoch 8, batch 2400, loss[loss=0.1322, simple_loss=0.1641, pruned_loss=0.0502, over 13971.00 frames. ], tot_loss[loss=0.1519, simple_loss=0.1735, pruned_loss=0.06511, over 2029836.82 frames. ], batch size: 19, lr: 1.00e-02, grad_scale: 8.0 +2022-12-07 18:08:35,832 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0754, 2.1397, 3.0821, 3.1822, 3.0930, 2.1749, 3.0505, 2.4582], + device='cuda:2'), covar=tensor([0.0208, 0.0452, 0.0454, 0.0251, 0.0205, 0.0716, 0.0184, 0.0497], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0222, 0.0338, 0.0280, 0.0224, 0.0273, 0.0243, 0.0259], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:08:37,246 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.077e+02 2.285e+02 2.947e+02 4.114e+02 7.749e+02, threshold=5.894e+02, percent-clipped=7.0 +2022-12-07 18:09:04,222 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=55368.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 18:10:03,414 INFO [train.py:873] (2/4) Epoch 8, batch 2500, loss[loss=0.1221, simple_loss=0.15, pruned_loss=0.04712, over 14619.00 frames. ], tot_loss[loss=0.151, simple_loss=0.1729, pruned_loss=0.06452, over 2032798.14 frames. ], batch size: 21, lr: 1.00e-02, grad_scale: 8.0 +2022-12-07 18:10:05,936 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.292e+02 2.135e+02 2.789e+02 3.558e+02 8.555e+02, threshold=5.578e+02, percent-clipped=3.0 +2022-12-07 18:10:23,026 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8875, 3.5684, 3.1846, 2.4696, 3.2395, 3.5389, 3.7100, 3.1030], + device='cuda:2'), covar=tensor([0.0717, 0.2292, 0.1244, 0.2277, 0.0968, 0.0659, 0.1090, 0.1488], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0191, 0.0127, 0.0127, 0.0118, 0.0124, 0.0104, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 18:11:03,341 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=55504.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:11:15,534 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=55518.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 18:11:30,376 INFO [train.py:873] (2/4) Epoch 8, batch 2600, loss[loss=0.1556, simple_loss=0.1798, pruned_loss=0.06569, over 13927.00 frames. ], tot_loss[loss=0.1514, simple_loss=0.1729, pruned_loss=0.06494, over 1989959.64 frames. ], batch size: 23, lr: 1.00e-02, grad_scale: 8.0 +2022-12-07 18:11:32,918 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.429e+02 2.392e+02 2.780e+02 3.615e+02 5.645e+02, threshold=5.559e+02, percent-clipped=1.0 +2022-12-07 18:11:42,345 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5826, 2.7504, 2.5375, 2.7226, 2.0602, 2.8639, 2.6786, 1.2014], + device='cuda:2'), covar=tensor([0.1942, 0.0749, 0.1306, 0.0653, 0.1100, 0.0514, 0.0987, 0.3190], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0064, 0.0054, 0.0057, 0.0082, 0.0062, 0.0085, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 18:11:47,701 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5147, 1.5670, 4.1236, 1.7551, 4.1689, 4.3754, 3.8564, 4.8266], + device='cuda:2'), covar=tensor([0.0169, 0.2970, 0.0382, 0.2243, 0.0309, 0.0306, 0.0368, 0.0120], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0160, 0.0153, 0.0167, 0.0164, 0.0165, 0.0132, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 18:11:57,671 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=55566.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:11:58,688 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=55567.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:12:02,242 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7942, 1.9502, 2.6719, 2.1133, 2.6290, 2.4291, 2.4321, 2.3121], + device='cuda:2'), covar=tensor([0.0596, 0.3695, 0.1191, 0.2265, 0.0856, 0.1187, 0.1326, 0.2268], + device='cuda:2'), in_proj_covar=tensor([0.0321, 0.0325, 0.0387, 0.0305, 0.0373, 0.0307, 0.0363, 0.0331], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:12:44,772 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4613, 2.4442, 1.9479, 2.5369, 2.2565, 2.3831, 2.2226, 2.2069], + device='cuda:2'), covar=tensor([0.0623, 0.0676, 0.2131, 0.0359, 0.0722, 0.0653, 0.1084, 0.1138], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0299, 0.0275, 0.0216, 0.0274, 0.0273, 0.0253, 0.0267], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:12:57,537 INFO [train.py:873] (2/4) Epoch 8, batch 2700, loss[loss=0.1949, simple_loss=0.2014, pruned_loss=0.09422, over 8570.00 frames. ], tot_loss[loss=0.1523, simple_loss=0.1739, pruned_loss=0.06539, over 2003536.31 frames. ], batch size: 100, lr: 1.00e-02, grad_scale: 8.0 +2022-12-07 18:13:00,127 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.995e+01 2.381e+02 3.010e+02 3.860e+02 1.123e+03, threshold=6.020e+02, percent-clipped=6.0 +2022-12-07 18:14:09,885 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.81 vs. limit=2.0 +2022-12-07 18:14:27,410 INFO [train.py:873] (2/4) Epoch 8, batch 2800, loss[loss=0.1507, simple_loss=0.1752, pruned_loss=0.06311, over 14303.00 frames. ], tot_loss[loss=0.1493, simple_loss=0.1723, pruned_loss=0.06314, over 2019160.62 frames. ], batch size: 46, lr: 9.99e-03, grad_scale: 8.0 +2022-12-07 18:14:30,866 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.032e+02 2.464e+02 2.891e+02 3.921e+02 7.148e+02, threshold=5.781e+02, percent-clipped=1.0 +2022-12-07 18:14:38,403 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=55748.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:14:54,474 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-12-07 18:15:28,239 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=55804.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:15:32,927 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=55809.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:15:54,463 INFO [train.py:873] (2/4) Epoch 8, batch 2900, loss[loss=0.1631, simple_loss=0.1794, pruned_loss=0.07342, over 9502.00 frames. ], tot_loss[loss=0.1514, simple_loss=0.1736, pruned_loss=0.06458, over 1996352.71 frames. ], batch size: 100, lr: 9.99e-03, grad_scale: 8.0 +2022-12-07 18:15:57,950 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.733e+01 2.419e+02 2.995e+02 3.748e+02 5.974e+02, threshold=5.989e+02, percent-clipped=2.0 +2022-12-07 18:16:06,762 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=55848.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:16:09,823 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=55852.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:16:22,879 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=55867.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:16:59,401 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=55909.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:17:04,326 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=55915.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:17:22,327 INFO [train.py:873] (2/4) Epoch 8, batch 3000, loss[loss=0.1613, simple_loss=0.1792, pruned_loss=0.07164, over 14247.00 frames. ], tot_loss[loss=0.1512, simple_loss=0.1734, pruned_loss=0.06452, over 2016008.97 frames. ], batch size: 99, lr: 9.98e-03, grad_scale: 8.0 +2022-12-07 18:17:22,327 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 18:17:42,268 INFO [train.py:905] (2/4) Epoch 8, validation: loss=0.1226, simple_loss=0.1659, pruned_loss=0.03968, over 857387.00 frames. +2022-12-07 18:17:42,268 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 18:17:45,688 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.182e+02 2.436e+02 3.032e+02 3.912e+02 1.128e+03, threshold=6.064e+02, percent-clipped=7.0 +2022-12-07 18:18:06,876 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7341, 1.1278, 1.2553, 1.2629, 1.1503, 1.3477, 1.0519, 0.8914], + device='cuda:2'), covar=tensor([0.2497, 0.0823, 0.0367, 0.0244, 0.0934, 0.0430, 0.1239, 0.1174], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0065, 0.0054, 0.0057, 0.0082, 0.0062, 0.0087, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 18:18:26,860 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0596, 2.1155, 1.9906, 2.2203, 1.7695, 1.8919, 2.1586, 2.1883], + device='cuda:2'), covar=tensor([0.0888, 0.0972, 0.1157, 0.0807, 0.1434, 0.0874, 0.1015, 0.0820], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0113, 0.0123, 0.0128, 0.0127, 0.0099, 0.0139, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 18:18:51,127 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9524, 1.9730, 1.6582, 2.0993, 1.8131, 1.9271, 1.8315, 1.7993], + device='cuda:2'), covar=tensor([0.0779, 0.0882, 0.1834, 0.0357, 0.0689, 0.0567, 0.1333, 0.0623], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0294, 0.0273, 0.0217, 0.0278, 0.0272, 0.0255, 0.0264], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:18:51,944 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5778, 1.8894, 1.8943, 1.9589, 1.7398, 2.0150, 1.5869, 1.1523], + device='cuda:2'), covar=tensor([0.1816, 0.0763, 0.0470, 0.0558, 0.1069, 0.0578, 0.1642, 0.2778], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0065, 0.0053, 0.0056, 0.0082, 0.0062, 0.0086, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 18:19:11,704 INFO [train.py:873] (2/4) Epoch 8, batch 3100, loss[loss=0.1605, simple_loss=0.1797, pruned_loss=0.07065, over 8650.00 frames. ], tot_loss[loss=0.151, simple_loss=0.1732, pruned_loss=0.06442, over 1963464.69 frames. ], batch size: 100, lr: 9.97e-03, grad_scale: 8.0 +2022-12-07 18:19:15,078 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.402e+01 2.241e+02 2.784e+02 3.482e+02 7.776e+02, threshold=5.568e+02, percent-clipped=3.0 +2022-12-07 18:20:05,876 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.10 vs. limit=5.0 +2022-12-07 18:20:11,519 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=56104.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:20:18,449 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-12-07 18:20:23,011 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.54 vs. limit=2.0 +2022-12-07 18:20:38,661 INFO [train.py:873] (2/4) Epoch 8, batch 3200, loss[loss=0.151, simple_loss=0.1712, pruned_loss=0.06536, over 10307.00 frames. ], tot_loss[loss=0.1527, simple_loss=0.1741, pruned_loss=0.0656, over 1878900.02 frames. ], batch size: 100, lr: 9.96e-03, grad_scale: 8.0 +2022-12-07 18:20:40,922 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-12-07 18:20:42,717 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.378e+02 2.292e+02 3.048e+02 3.641e+02 9.234e+02, threshold=6.095e+02, percent-clipped=4.0 +2022-12-07 18:20:47,436 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56145.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:21:40,231 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=56204.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:21:42,139 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=56206.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:22:08,272 INFO [train.py:873] (2/4) Epoch 8, batch 3300, loss[loss=0.1671, simple_loss=0.1529, pruned_loss=0.09066, over 2622.00 frames. ], tot_loss[loss=0.1506, simple_loss=0.1728, pruned_loss=0.06422, over 1979537.84 frames. ], batch size: 100, lr: 9.95e-03, grad_scale: 8.0 +2022-12-07 18:22:09,856 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0309, 1.2776, 4.0284, 1.4875, 3.9316, 4.0874, 3.1292, 4.4562], + device='cuda:2'), covar=tensor([0.0214, 0.3374, 0.0361, 0.2487, 0.0346, 0.0359, 0.0629, 0.0141], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0160, 0.0151, 0.0168, 0.0163, 0.0165, 0.0130, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 18:22:12,322 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.476e+02 2.398e+02 3.040e+02 3.643e+02 7.318e+02, threshold=6.080e+02, percent-clipped=1.0 +2022-12-07 18:23:35,251 INFO [train.py:873] (2/4) Epoch 8, batch 3400, loss[loss=0.1623, simple_loss=0.1797, pruned_loss=0.07252, over 11214.00 frames. ], tot_loss[loss=0.1502, simple_loss=0.1728, pruned_loss=0.06386, over 2021123.28 frames. ], batch size: 100, lr: 9.94e-03, grad_scale: 8.0 +2022-12-07 18:23:39,210 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-12-07 18:23:39,443 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.326e+02 2.408e+02 2.894e+02 3.622e+02 6.016e+02, threshold=5.789e+02, percent-clipped=0.0 +2022-12-07 18:24:26,469 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5180, 1.4414, 2.8510, 1.3900, 2.8634, 2.8035, 2.0901, 2.8539], + device='cuda:2'), covar=tensor([0.0384, 0.3541, 0.0369, 0.2510, 0.0402, 0.0499, 0.0974, 0.0360], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0160, 0.0151, 0.0168, 0.0163, 0.0166, 0.0131, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 18:24:27,829 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.07 vs. limit=2.0 +2022-12-07 18:24:31,187 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1238, 1.5664, 1.6605, 1.6167, 1.5866, 1.6946, 1.2925, 1.0970], + device='cuda:2'), covar=tensor([0.2044, 0.0893, 0.0525, 0.0332, 0.1096, 0.0646, 0.2203, 0.2377], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0067, 0.0055, 0.0058, 0.0084, 0.0065, 0.0089, 0.0102], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 18:24:36,615 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=56404.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:25:03,324 INFO [train.py:873] (2/4) Epoch 8, batch 3500, loss[loss=0.1655, simple_loss=0.1828, pruned_loss=0.07409, over 11987.00 frames. ], tot_loss[loss=0.1514, simple_loss=0.173, pruned_loss=0.06487, over 1934298.32 frames. ], batch size: 100, lr: 9.93e-03, grad_scale: 8.0 +2022-12-07 18:25:07,749 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.106e+02 2.219e+02 2.782e+02 3.549e+02 6.206e+02, threshold=5.564e+02, percent-clipped=1.0 +2022-12-07 18:25:18,098 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=56452.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:26:01,150 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=56501.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:26:04,237 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=56504.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:26:20,813 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56523.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 18:26:30,710 INFO [train.py:873] (2/4) Epoch 8, batch 3600, loss[loss=0.1363, simple_loss=0.1698, pruned_loss=0.05145, over 14409.00 frames. ], tot_loss[loss=0.1514, simple_loss=0.173, pruned_loss=0.06495, over 1850182.37 frames. ], batch size: 41, lr: 9.92e-03, grad_scale: 8.0 +2022-12-07 18:26:31,412 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.66 vs. limit=2.0 +2022-12-07 18:26:35,120 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.906e+01 2.280e+02 2.872e+02 3.796e+02 9.491e+02, threshold=5.743e+02, percent-clipped=4.0 +2022-12-07 18:26:46,156 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=56552.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:27:02,067 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2648, 1.3451, 1.3512, 1.1359, 1.4097, 0.6802, 1.1890, 1.1884], + device='cuda:2'), covar=tensor([0.0849, 0.0755, 0.0581, 0.0885, 0.0880, 0.0899, 0.0931, 0.0917], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0022, 0.0021, 0.0022, 0.0031, 0.0022, 0.0022], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 18:27:08,489 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.38 vs. limit=2.0 +2022-12-07 18:27:14,200 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=56584.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 18:27:39,559 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3048, 1.3494, 1.5021, 1.1989, 1.4601, 0.8627, 1.0536, 0.8503], + device='cuda:2'), covar=tensor([0.0275, 0.0623, 0.0412, 0.0399, 0.0573, 0.0457, 0.0481, 0.0797], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0014, 0.0012, 0.0012, 0.0012, 0.0019, 0.0015, 0.0020], + device='cuda:2'), out_proj_covar=tensor([8.8153e-05, 9.5220e-05, 8.5386e-05, 8.9196e-05, 8.6953e-05, 1.2911e-04, + 1.0872e-04, 1.2377e-04], device='cuda:2') +2022-12-07 18:27:59,552 INFO [train.py:873] (2/4) Epoch 8, batch 3700, loss[loss=0.1475, simple_loss=0.1774, pruned_loss=0.0588, over 14250.00 frames. ], tot_loss[loss=0.1504, simple_loss=0.1726, pruned_loss=0.06406, over 1946348.13 frames. ], batch size: 44, lr: 9.92e-03, grad_scale: 8.0 +2022-12-07 18:28:03,913 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.177e+02 2.170e+02 2.783e+02 3.492e+02 8.002e+02, threshold=5.566e+02, percent-clipped=4.0 +2022-12-07 18:28:47,280 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2427, 1.2696, 1.4765, 1.0688, 1.0840, 1.3875, 0.9683, 1.0949], + device='cuda:2'), covar=tensor([0.1346, 0.2254, 0.0653, 0.1654, 0.2133, 0.0579, 0.1983, 0.1022], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0086, 0.0080, 0.0088, 0.0109, 0.0075, 0.0126, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 18:29:06,565 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2523, 2.2341, 3.0050, 2.4018, 3.0612, 3.0026, 2.9811, 2.4950], + device='cuda:2'), covar=tensor([0.0490, 0.2794, 0.0795, 0.2072, 0.0709, 0.0793, 0.1063, 0.2043], + device='cuda:2'), in_proj_covar=tensor([0.0315, 0.0319, 0.0384, 0.0302, 0.0367, 0.0303, 0.0357, 0.0326], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:29:21,029 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=10.37 vs. limit=5.0 +2022-12-07 18:29:25,510 INFO [train.py:873] (2/4) Epoch 8, batch 3800, loss[loss=0.1667, simple_loss=0.1442, pruned_loss=0.09459, over 1260.00 frames. ], tot_loss[loss=0.1506, simple_loss=0.173, pruned_loss=0.06412, over 1943358.21 frames. ], batch size: 100, lr: 9.91e-03, grad_scale: 8.0 +2022-12-07 18:29:29,583 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.270e+02 2.607e+02 3.160e+02 3.908e+02 8.359e+02, threshold=6.320e+02, percent-clipped=5.0 +2022-12-07 18:30:24,863 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=56801.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:30:26,557 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56803.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:30:54,312 INFO [train.py:873] (2/4) Epoch 8, batch 3900, loss[loss=0.1763, simple_loss=0.1569, pruned_loss=0.09785, over 2619.00 frames. ], tot_loss[loss=0.15, simple_loss=0.1724, pruned_loss=0.06379, over 1968804.24 frames. ], batch size: 100, lr: 9.90e-03, grad_scale: 8.0 +2022-12-07 18:30:58,940 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.085e+02 2.187e+02 2.677e+02 3.401e+02 6.261e+02, threshold=5.355e+02, percent-clipped=0.0 +2022-12-07 18:31:06,449 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=56849.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:31:19,823 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=56864.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:31:33,509 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=56879.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 18:31:59,188 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56908.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:32:18,792 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9001, 1.3335, 2.0869, 1.2953, 2.0387, 2.0733, 1.7902, 2.1011], + device='cuda:2'), covar=tensor([0.0263, 0.1606, 0.0313, 0.1627, 0.0353, 0.0382, 0.0650, 0.0270], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0159, 0.0150, 0.0168, 0.0163, 0.0162, 0.0130, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 18:32:22,734 INFO [train.py:873] (2/4) Epoch 8, batch 4000, loss[loss=0.1971, simple_loss=0.1835, pruned_loss=0.1054, over 3884.00 frames. ], tot_loss[loss=0.1495, simple_loss=0.1724, pruned_loss=0.06328, over 2006831.70 frames. ], batch size: 100, lr: 9.89e-03, grad_scale: 8.0 +2022-12-07 18:32:27,090 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.130e+02 2.242e+02 2.985e+02 3.676e+02 7.027e+02, threshold=5.970e+02, percent-clipped=5.0 +2022-12-07 18:32:53,039 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=56969.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:33:07,021 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8165, 2.8326, 2.9952, 2.8660, 2.8644, 2.7418, 1.4810, 2.6322], + device='cuda:2'), covar=tensor([0.0349, 0.0361, 0.0375, 0.0314, 0.0330, 0.0690, 0.2581, 0.0335], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0152, 0.0131, 0.0127, 0.0183, 0.0124, 0.0150, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:33:14,738 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56993.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:33:16,325 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=56995.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:33:51,681 INFO [train.py:873] (2/4) Epoch 8, batch 4100, loss[loss=0.1924, simple_loss=0.1888, pruned_loss=0.09803, over 5936.00 frames. ], tot_loss[loss=0.1492, simple_loss=0.1721, pruned_loss=0.06314, over 2032546.46 frames. ], batch size: 100, lr: 9.88e-03, grad_scale: 8.0 +2022-12-07 18:33:56,138 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.189e+02 2.377e+02 3.103e+02 3.855e+02 1.239e+03, threshold=6.205e+02, percent-clipped=7.0 +2022-12-07 18:34:08,934 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57054.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:34:10,588 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57056.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:34:11,413 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2561, 1.3330, 3.3006, 1.3927, 3.1229, 3.3777, 2.2924, 3.5526], + device='cuda:2'), covar=tensor([0.0245, 0.3291, 0.0452, 0.2651, 0.0790, 0.0356, 0.0903, 0.0203], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0159, 0.0150, 0.0169, 0.0162, 0.0163, 0.0131, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 18:34:46,391 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4743, 2.8773, 4.1455, 3.2281, 4.2526, 4.2013, 3.9729, 3.8087], + device='cuda:2'), covar=tensor([0.0557, 0.2911, 0.0832, 0.1903, 0.0926, 0.0830, 0.1790, 0.1830], + device='cuda:2'), in_proj_covar=tensor([0.0316, 0.0315, 0.0383, 0.0302, 0.0369, 0.0302, 0.0352, 0.0326], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:34:54,200 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8167, 3.5894, 3.2673, 3.4507, 3.6904, 3.7292, 3.8134, 3.7866], + device='cuda:2'), covar=tensor([0.0819, 0.0596, 0.2210, 0.2763, 0.0735, 0.0770, 0.0875, 0.0858], + device='cuda:2'), in_proj_covar=tensor([0.0339, 0.0240, 0.0401, 0.0512, 0.0294, 0.0371, 0.0367, 0.0319], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:35:20,886 INFO [train.py:873] (2/4) Epoch 8, batch 4200, loss[loss=0.1352, simple_loss=0.1646, pruned_loss=0.05289, over 14298.00 frames. ], tot_loss[loss=0.1484, simple_loss=0.1713, pruned_loss=0.06272, over 2008947.39 frames. ], batch size: 63, lr: 9.87e-03, grad_scale: 8.0 +2022-12-07 18:35:25,040 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.452e+02 2.260e+02 2.916e+02 3.546e+02 6.128e+02, threshold=5.833e+02, percent-clipped=0.0 +2022-12-07 18:35:30,875 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-12-07 18:35:42,102 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57159.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:35:59,642 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57179.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 18:36:05,055 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1037, 1.2830, 1.5086, 0.9930, 0.9029, 1.2780, 0.9799, 1.1872], + device='cuda:2'), covar=tensor([0.1594, 0.2142, 0.0588, 0.1825, 0.2585, 0.0742, 0.1994, 0.1038], + device='cuda:2'), in_proj_covar=tensor([0.0073, 0.0086, 0.0079, 0.0088, 0.0110, 0.0075, 0.0125, 0.0078], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 18:36:06,035 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57186.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:36:15,678 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57197.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:36:28,393 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9864, 2.1189, 1.9153, 2.1581, 1.7424, 1.8919, 2.0872, 2.0826], + device='cuda:2'), covar=tensor([0.0941, 0.0837, 0.0972, 0.0763, 0.1326, 0.0940, 0.0899, 0.0926], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0114, 0.0123, 0.0130, 0.0129, 0.0101, 0.0142, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-07 18:36:41,992 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57227.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 18:36:48,821 INFO [train.py:873] (2/4) Epoch 8, batch 4300, loss[loss=0.1404, simple_loss=0.1679, pruned_loss=0.05641, over 14278.00 frames. ], tot_loss[loss=0.1504, simple_loss=0.1725, pruned_loss=0.06411, over 1981904.36 frames. ], batch size: 44, lr: 9.86e-03, grad_scale: 8.0 +2022-12-07 18:36:53,099 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.149e+02 2.274e+02 2.646e+02 3.498e+02 8.636e+02, threshold=5.293e+02, percent-clipped=3.0 +2022-12-07 18:36:57,051 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57244.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:36:57,895 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3367, 1.6197, 1.5977, 1.3189, 1.5749, 1.1827, 1.1261, 0.9847], + device='cuda:2'), covar=tensor([0.0277, 0.0379, 0.0347, 0.0338, 0.0326, 0.0271, 0.0278, 0.0620], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0011, 0.0012, 0.0012, 0.0019, 0.0015, 0.0019], + device='cuda:2'), out_proj_covar=tensor([8.6180e-05, 9.2878e-05, 8.3882e-05, 8.7420e-05, 8.5445e-05, 1.2821e-04, + 1.0729e-04, 1.2256e-04], device='cuda:2') +2022-12-07 18:36:59,608 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57247.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:37:03,335 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.62 vs. limit=2.0 +2022-12-07 18:37:08,930 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57258.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 18:37:14,067 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57264.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:37:19,643 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0902, 4.9583, 4.4956, 4.6690, 4.6711, 5.0226, 5.1451, 5.1088], + device='cuda:2'), covar=tensor([0.0961, 0.0431, 0.2167, 0.2791, 0.0750, 0.0807, 0.0902, 0.0820], + device='cuda:2'), in_proj_covar=tensor([0.0342, 0.0243, 0.0401, 0.0515, 0.0296, 0.0374, 0.0372, 0.0323], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:37:26,891 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-07 18:37:27,269 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8806, 2.0758, 2.1695, 2.3146, 1.8883, 2.2869, 2.0127, 1.1728], + device='cuda:2'), covar=tensor([0.1424, 0.1032, 0.0587, 0.0381, 0.1096, 0.0526, 0.1528, 0.2862], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0066, 0.0055, 0.0057, 0.0084, 0.0063, 0.0086, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 18:37:49,453 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57305.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:38:15,736 INFO [train.py:873] (2/4) Epoch 8, batch 4400, loss[loss=0.1407, simple_loss=0.1709, pruned_loss=0.05528, over 14265.00 frames. ], tot_loss[loss=0.1499, simple_loss=0.1721, pruned_loss=0.06384, over 1949485.61 frames. ], batch size: 31, lr: 9.86e-03, grad_scale: 8.0 +2022-12-07 18:38:19,656 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.383e+01 2.226e+02 2.824e+02 3.594e+02 6.936e+02, threshold=5.649e+02, percent-clipped=3.0 +2022-12-07 18:38:27,787 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57349.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:38:29,484 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57351.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:38:57,055 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6784, 2.3367, 2.4953, 1.5497, 2.2733, 2.5702, 2.7357, 2.1894], + device='cuda:2'), covar=tensor([0.0847, 0.1477, 0.1203, 0.2368, 0.1084, 0.0704, 0.0776, 0.1854], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0186, 0.0127, 0.0126, 0.0120, 0.0126, 0.0103, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 18:39:10,829 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57398.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:39:18,378 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1894, 2.9684, 2.7208, 2.8228, 3.0848, 3.0703, 3.1325, 3.1201], + device='cuda:2'), covar=tensor([0.0951, 0.0684, 0.2435, 0.2780, 0.0908, 0.0838, 0.1380, 0.0913], + device='cuda:2'), in_proj_covar=tensor([0.0346, 0.0245, 0.0406, 0.0516, 0.0298, 0.0380, 0.0377, 0.0324], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:39:30,472 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5782, 3.3518, 3.0050, 2.2838, 3.0836, 3.3725, 3.5540, 2.7863], + device='cuda:2'), covar=tensor([0.0580, 0.1951, 0.1111, 0.1878, 0.0862, 0.0486, 0.0712, 0.1447], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0188, 0.0127, 0.0126, 0.0120, 0.0126, 0.0104, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 18:39:43,851 INFO [train.py:873] (2/4) Epoch 8, batch 4500, loss[loss=0.1166, simple_loss=0.1533, pruned_loss=0.03997, over 14279.00 frames. ], tot_loss[loss=0.1496, simple_loss=0.1721, pruned_loss=0.06355, over 1977636.75 frames. ], batch size: 44, lr: 9.85e-03, grad_scale: 8.0 +2022-12-07 18:39:47,868 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.244e+02 2.294e+02 2.953e+02 3.643e+02 6.271e+02, threshold=5.907e+02, percent-clipped=3.0 +2022-12-07 18:39:59,247 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1247, 1.3872, 3.3325, 1.4068, 3.0279, 3.2401, 2.3964, 3.4649], + device='cuda:2'), covar=tensor([0.0222, 0.2738, 0.0293, 0.2304, 0.0952, 0.0357, 0.0761, 0.0176], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0159, 0.0152, 0.0170, 0.0165, 0.0166, 0.0132, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 18:40:04,374 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57459.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:40:04,431 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57459.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:40:06,996 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57462.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:40:32,978 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3931, 2.4199, 2.9406, 1.9947, 1.8837, 2.5970, 1.2951, 2.6437], + device='cuda:2'), covar=tensor([0.1487, 0.2072, 0.0996, 0.2411, 0.3626, 0.1480, 0.6337, 0.1284], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0087, 0.0083, 0.0090, 0.0112, 0.0077, 0.0128, 0.0079], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 18:40:35,517 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57495.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:40:45,774 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57507.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:40:49,263 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7747, 0.6676, 0.7649, 0.6641, 0.7100, 0.5007, 0.4378, 0.6850], + device='cuda:2'), covar=tensor([0.0119, 0.0133, 0.0080, 0.0078, 0.0168, 0.0294, 0.0183, 0.0267], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0011, 0.0012, 0.0012, 0.0019, 0.0015, 0.0019], + device='cuda:2'), out_proj_covar=tensor([8.7153e-05, 9.3021e-05, 8.3708e-05, 8.8140e-05, 8.6758e-05, 1.2891e-04, + 1.0876e-04, 1.2352e-04], device='cuda:2') +2022-12-07 18:40:57,039 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.51 vs. limit=5.0 +2022-12-07 18:41:00,055 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57523.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:41:10,290 INFO [train.py:873] (2/4) Epoch 8, batch 4600, loss[loss=0.161, simple_loss=0.1787, pruned_loss=0.07167, over 7803.00 frames. ], tot_loss[loss=0.1512, simple_loss=0.1731, pruned_loss=0.06462, over 1962968.56 frames. ], batch size: 100, lr: 9.84e-03, grad_scale: 8.0 +2022-12-07 18:41:14,686 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.272e+02 2.523e+02 3.027e+02 3.692e+02 7.181e+02, threshold=6.055e+02, percent-clipped=3.0 +2022-12-07 18:41:16,403 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57542.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:41:26,159 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57553.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 18:41:29,262 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7737, 2.8133, 2.7246, 2.6752, 2.1904, 3.1455, 2.7717, 1.3043], + device='cuda:2'), covar=tensor([0.3211, 0.1051, 0.1880, 0.1627, 0.1397, 0.0841, 0.1844, 0.3778], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0067, 0.0055, 0.0056, 0.0084, 0.0063, 0.0086, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 18:41:29,299 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57556.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:41:36,008 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57564.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:42:07,626 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57600.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:42:18,285 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57612.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:42:35,967 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8199, 2.4785, 3.4426, 2.7556, 3.6910, 3.4652, 3.4673, 3.0039], + device='cuda:2'), covar=tensor([0.0562, 0.3065, 0.0982, 0.2127, 0.0685, 0.0795, 0.1556, 0.2183], + device='cuda:2'), in_proj_covar=tensor([0.0316, 0.0318, 0.0382, 0.0306, 0.0367, 0.0300, 0.0352, 0.0324], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:42:38,955 INFO [train.py:873] (2/4) Epoch 8, batch 4700, loss[loss=0.1743, simple_loss=0.1689, pruned_loss=0.08983, over 2611.00 frames. ], tot_loss[loss=0.1501, simple_loss=0.1724, pruned_loss=0.06385, over 2014250.41 frames. ], batch size: 100, lr: 9.83e-03, grad_scale: 8.0 +2022-12-07 18:42:39,976 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7869, 1.6216, 2.0834, 1.6817, 1.9121, 1.4661, 1.6428, 1.8698], + device='cuda:2'), covar=tensor([0.1964, 0.1855, 0.0245, 0.1522, 0.0734, 0.0839, 0.0743, 0.0320], + device='cuda:2'), in_proj_covar=tensor([0.0231, 0.0225, 0.0182, 0.0308, 0.0200, 0.0232, 0.0221, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:42:43,279 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.474e+02 2.451e+02 2.895e+02 3.498e+02 7.656e+02, threshold=5.791e+02, percent-clipped=2.0 +2022-12-07 18:42:46,176 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-12-07 18:42:51,313 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57649.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:42:52,971 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57651.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:42:57,921 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.84 vs. limit=5.0 +2022-12-07 18:43:25,658 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7814, 3.7861, 3.3463, 2.8319, 3.3268, 3.5779, 4.0025, 3.1023], + device='cuda:2'), covar=tensor([0.0850, 0.1929, 0.1142, 0.1709, 0.1106, 0.0663, 0.0907, 0.1678], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0188, 0.0127, 0.0125, 0.0119, 0.0125, 0.0104, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 18:43:33,483 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57697.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:43:35,284 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57699.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:44:05,060 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=57733.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:44:06,682 INFO [train.py:873] (2/4) Epoch 8, batch 4800, loss[loss=0.1496, simple_loss=0.1452, pruned_loss=0.077, over 1262.00 frames. ], tot_loss[loss=0.1511, simple_loss=0.1726, pruned_loss=0.0648, over 1913187.73 frames. ], batch size: 100, lr: 9.82e-03, grad_scale: 8.0 +2022-12-07 18:44:10,753 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.453e+02 2.410e+02 2.943e+02 4.035e+02 9.695e+02, threshold=5.886e+02, percent-clipped=7.0 +2022-12-07 18:44:23,490 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57754.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:44:40,318 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.14 vs. limit=2.0 +2022-12-07 18:44:58,951 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=57794.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:45:04,757 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.19 vs. limit=2.0 +2022-12-07 18:45:11,091 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.00 vs. limit=5.0 +2022-12-07 18:45:20,524 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57818.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:45:36,248 INFO [train.py:873] (2/4) Epoch 8, batch 4900, loss[loss=0.1584, simple_loss=0.1754, pruned_loss=0.07071, over 14533.00 frames. ], tot_loss[loss=0.1519, simple_loss=0.173, pruned_loss=0.06538, over 1949274.42 frames. ], batch size: 49, lr: 9.81e-03, grad_scale: 8.0 +2022-12-07 18:45:40,366 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.284e+02 2.301e+02 2.954e+02 3.459e+02 6.054e+02, threshold=5.907e+02, percent-clipped=1.0 +2022-12-07 18:45:42,310 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57842.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:45:42,334 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4208, 1.0774, 1.3102, 0.9662, 1.1152, 1.3359, 1.1557, 1.2032], + device='cuda:2'), covar=tensor([0.0327, 0.0706, 0.0530, 0.0471, 0.0822, 0.0514, 0.0387, 0.1010], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0188, 0.0127, 0.0125, 0.0119, 0.0126, 0.0105, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 18:45:44,001 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4460, 1.8219, 1.9346, 1.9525, 1.6690, 1.9811, 1.7593, 1.0842], + device='cuda:2'), covar=tensor([0.1562, 0.1265, 0.0731, 0.0494, 0.1218, 0.0743, 0.1395, 0.3021], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0067, 0.0056, 0.0057, 0.0086, 0.0063, 0.0086, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 18:45:50,042 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=57851.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:45:51,644 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57853.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 18:46:24,224 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57890.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:46:33,375 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=57900.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:46:34,124 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57901.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:46:44,941 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9657, 3.4047, 2.7899, 4.1415, 3.9483, 3.9634, 3.3572, 2.8602], + device='cuda:2'), covar=tensor([0.0716, 0.1716, 0.3864, 0.0459, 0.0935, 0.2438, 0.1334, 0.4112], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0295, 0.0276, 0.0220, 0.0278, 0.0276, 0.0251, 0.0264], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:47:03,921 INFO [train.py:873] (2/4) Epoch 8, batch 5000, loss[loss=0.1045, simple_loss=0.145, pruned_loss=0.03199, over 14271.00 frames. ], tot_loss[loss=0.1492, simple_loss=0.1717, pruned_loss=0.06336, over 2005762.87 frames. ], batch size: 31, lr: 9.80e-03, grad_scale: 8.0 +2022-12-07 18:47:08,079 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.420e+02 2.289e+02 3.008e+02 3.693e+02 7.100e+02, threshold=6.016e+02, percent-clipped=4.0 +2022-12-07 18:47:15,004 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=57948.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:47:59,245 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6453, 3.1769, 2.4241, 3.7148, 3.4815, 3.5675, 3.0802, 2.5178], + device='cuda:2'), covar=tensor([0.0668, 0.1654, 0.4925, 0.0448, 0.0952, 0.1131, 0.1442, 0.4311], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0292, 0.0275, 0.0220, 0.0279, 0.0275, 0.0251, 0.0263], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:48:04,494 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1946, 3.1759, 3.3303, 3.1829, 3.2541, 2.7899, 1.3606, 3.0302], + device='cuda:2'), covar=tensor([0.0299, 0.0358, 0.0359, 0.0385, 0.0281, 0.0868, 0.3013, 0.0280], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0155, 0.0132, 0.0128, 0.0184, 0.0126, 0.0152, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:48:31,879 INFO [train.py:873] (2/4) Epoch 8, batch 5100, loss[loss=0.1622, simple_loss=0.1613, pruned_loss=0.08149, over 4942.00 frames. ], tot_loss[loss=0.1507, simple_loss=0.1722, pruned_loss=0.06463, over 1973494.03 frames. ], batch size: 100, lr: 9.80e-03, grad_scale: 8.0 +2022-12-07 18:48:36,395 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.248e+02 2.346e+02 3.025e+02 3.888e+02 7.693e+02, threshold=6.049e+02, percent-clipped=2.0 +2022-12-07 18:48:48,821 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58054.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:48:54,933 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0773, 3.1120, 3.2425, 3.0273, 3.1634, 2.7977, 1.3586, 2.9339], + device='cuda:2'), covar=tensor([0.0336, 0.0390, 0.0406, 0.0451, 0.0361, 0.0906, 0.3221, 0.0328], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0156, 0.0133, 0.0129, 0.0186, 0.0127, 0.0153, 0.0173], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:49:19,603 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=58089.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:49:30,697 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58102.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:49:45,300 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58118.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:49:59,577 INFO [train.py:873] (2/4) Epoch 8, batch 5200, loss[loss=0.1768, simple_loss=0.1826, pruned_loss=0.08554, over 4979.00 frames. ], tot_loss[loss=0.151, simple_loss=0.1726, pruned_loss=0.06466, over 1960295.38 frames. ], batch size: 100, lr: 9.79e-03, grad_scale: 16.0 +2022-12-07 18:50:04,075 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.185e+02 2.199e+02 2.811e+02 3.642e+02 7.366e+02, threshold=5.622e+02, percent-clipped=3.0 +2022-12-07 18:50:05,173 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58141.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:50:07,749 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58144.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:50:13,748 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58151.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:50:26,679 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58166.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:50:56,069 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58199.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:50:59,039 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58202.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:51:01,532 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58205.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:51:27,321 INFO [train.py:873] (2/4) Epoch 8, batch 5300, loss[loss=0.1427, simple_loss=0.1766, pruned_loss=0.05441, over 13966.00 frames. ], tot_loss[loss=0.15, simple_loss=0.1724, pruned_loss=0.06383, over 2001175.77 frames. ], batch size: 26, lr: 9.78e-03, grad_scale: 8.0 +2022-12-07 18:51:32,495 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.244e+02 2.430e+02 3.290e+02 3.970e+02 9.558e+02, threshold=6.580e+02, percent-clipped=6.0 +2022-12-07 18:51:57,935 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4722, 2.2348, 3.3833, 3.5120, 3.4170, 2.2047, 3.3492, 2.5617], + device='cuda:2'), covar=tensor([0.0245, 0.0534, 0.0534, 0.0312, 0.0244, 0.0915, 0.0221, 0.0652], + device='cuda:2'), in_proj_covar=tensor([0.0247, 0.0227, 0.0339, 0.0285, 0.0226, 0.0273, 0.0246, 0.0258], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:52:55,482 INFO [train.py:873] (2/4) Epoch 8, batch 5400, loss[loss=0.1233, simple_loss=0.1611, pruned_loss=0.04275, over 14328.00 frames. ], tot_loss[loss=0.1496, simple_loss=0.1721, pruned_loss=0.06358, over 1971994.87 frames. ], batch size: 55, lr: 9.77e-03, grad_scale: 8.0 +2022-12-07 18:53:00,691 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.457e+02 2.324e+02 2.944e+02 3.627e+02 6.890e+02, threshold=5.887e+02, percent-clipped=1.0 +2022-12-07 18:53:43,268 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58389.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:54:23,916 INFO [train.py:873] (2/4) Epoch 8, batch 5500, loss[loss=0.1584, simple_loss=0.1781, pruned_loss=0.0694, over 14249.00 frames. ], tot_loss[loss=0.1487, simple_loss=0.1716, pruned_loss=0.06293, over 1961566.30 frames. ], batch size: 69, lr: 9.76e-03, grad_scale: 8.0 +2022-12-07 18:54:26,057 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58437.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:54:29,550 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.638e+01 2.350e+02 2.867e+02 3.886e+02 1.534e+03, threshold=5.734e+02, percent-clipped=10.0 +2022-12-07 18:55:18,764 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=58497.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:55:21,726 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=58500.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:55:32,277 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1868, 4.2992, 4.5319, 3.9074, 4.3065, 4.6107, 1.6115, 4.0981], + device='cuda:2'), covar=tensor([0.0233, 0.0303, 0.0391, 0.0498, 0.0339, 0.0204, 0.3343, 0.0287], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0156, 0.0132, 0.0130, 0.0186, 0.0128, 0.0155, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:55:52,342 INFO [train.py:873] (2/4) Epoch 8, batch 5600, loss[loss=0.1332, simple_loss=0.1644, pruned_loss=0.05103, over 14532.00 frames. ], tot_loss[loss=0.1491, simple_loss=0.1721, pruned_loss=0.06302, over 2038430.02 frames. ], batch size: 43, lr: 9.75e-03, grad_scale: 8.0 +2022-12-07 18:55:57,837 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.352e+02 2.292e+02 2.814e+02 3.387e+02 5.498e+02, threshold=5.627e+02, percent-clipped=0.0 +2022-12-07 18:56:57,734 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2003, 1.5444, 1.3287, 1.3443, 1.3536, 0.9840, 0.7666, 0.8245], + device='cuda:2'), covar=tensor([0.0722, 0.0896, 0.0490, 0.0446, 0.0744, 0.0337, 0.0674, 0.1018], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0011, 0.0012, 0.0012, 0.0019, 0.0015, 0.0019], + device='cuda:2'), out_proj_covar=tensor([8.7117e-05, 9.2707e-05, 8.2621e-05, 8.7873e-05, 8.6509e-05, 1.2835e-04, + 1.0768e-04, 1.2251e-04], device='cuda:2') +2022-12-07 18:57:21,324 INFO [train.py:873] (2/4) Epoch 8, batch 5700, loss[loss=0.1812, simple_loss=0.1861, pruned_loss=0.0881, over 7783.00 frames. ], tot_loss[loss=0.1513, simple_loss=0.1732, pruned_loss=0.06463, over 1956333.52 frames. ], batch size: 100, lr: 9.75e-03, grad_scale: 8.0 +2022-12-07 18:57:26,806 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.215e+02 2.316e+02 2.828e+02 3.502e+02 7.696e+02, threshold=5.656e+02, percent-clipped=3.0 +2022-12-07 18:58:00,440 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6080, 2.5287, 2.7120, 2.6929, 2.6302, 2.4486, 1.4398, 2.4400], + device='cuda:2'), covar=tensor([0.0379, 0.0467, 0.0416, 0.0403, 0.0394, 0.1050, 0.2431, 0.0354], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0154, 0.0129, 0.0128, 0.0184, 0.0128, 0.0153, 0.0173], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 18:58:02,434 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58681.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:58:17,254 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.15 vs. limit=2.0 +2022-12-07 18:58:49,966 INFO [train.py:873] (2/4) Epoch 8, batch 5800, loss[loss=0.2037, simple_loss=0.171, pruned_loss=0.1182, over 1257.00 frames. ], tot_loss[loss=0.15, simple_loss=0.1723, pruned_loss=0.06388, over 1900797.89 frames. ], batch size: 100, lr: 9.74e-03, grad_scale: 8.0 +2022-12-07 18:58:55,175 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.238e+02 2.215e+02 2.766e+02 3.426e+02 6.991e+02, threshold=5.532e+02, percent-clipped=1.0 +2022-12-07 18:58:56,257 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58742.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:58:58,309 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8541, 2.6150, 4.9297, 3.2304, 4.5506, 2.5427, 3.4997, 4.5812], + device='cuda:2'), covar=tensor([0.0303, 0.3884, 0.0206, 0.6796, 0.0401, 0.3105, 0.1035, 0.0246], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0235, 0.0186, 0.0320, 0.0208, 0.0239, 0.0227, 0.0195], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 18:59:25,704 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58775.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:59:45,974 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58797.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 18:59:48,893 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=58800.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:00:19,702 INFO [train.py:873] (2/4) Epoch 8, batch 5900, loss[loss=0.1679, simple_loss=0.1831, pruned_loss=0.07633, over 9494.00 frames. ], tot_loss[loss=0.1492, simple_loss=0.172, pruned_loss=0.0632, over 1955026.97 frames. ], batch size: 100, lr: 9.73e-03, grad_scale: 8.0 +2022-12-07 19:00:20,777 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58836.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:00:25,228 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.436e+02 2.290e+02 2.920e+02 3.537e+02 8.146e+02, threshold=5.840e+02, percent-clipped=6.0 +2022-12-07 19:00:29,004 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58845.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:00:31,571 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=58848.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:00:56,404 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.85 vs. limit=2.0 +2022-12-07 19:01:22,548 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=58906.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:01:47,423 INFO [train.py:873] (2/4) Epoch 8, batch 6000, loss[loss=0.2169, simple_loss=0.2123, pruned_loss=0.1107, over 8651.00 frames. ], tot_loss[loss=0.1469, simple_loss=0.1705, pruned_loss=0.0617, over 1976377.07 frames. ], batch size: 100, lr: 9.72e-03, grad_scale: 8.0 +2022-12-07 19:01:47,423 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 19:02:10,413 INFO [train.py:905] (2/4) Epoch 8, validation: loss=0.1228, simple_loss=0.1649, pruned_loss=0.04039, over 857387.00 frames. +2022-12-07 19:02:10,414 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 19:02:16,061 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.501e+02 2.300e+02 3.027e+02 3.751e+02 1.020e+03, threshold=6.055e+02, percent-clipped=3.0 +2022-12-07 19:02:35,956 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-12-07 19:02:38,943 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=58967.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:03:38,338 INFO [train.py:873] (2/4) Epoch 8, batch 6100, loss[loss=0.1335, simple_loss=0.1343, pruned_loss=0.06636, over 2618.00 frames. ], tot_loss[loss=0.1476, simple_loss=0.1709, pruned_loss=0.06216, over 1973552.15 frames. ], batch size: 100, lr: 9.71e-03, grad_scale: 8.0 +2022-12-07 19:03:40,173 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59037.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:03:43,431 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.600e+02 2.326e+02 2.830e+02 3.436e+02 6.113e+02, threshold=5.661e+02, percent-clipped=1.0 +2022-12-07 19:05:02,953 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59131.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:05:06,213 INFO [train.py:873] (2/4) Epoch 8, batch 6200, loss[loss=0.1341, simple_loss=0.1742, pruned_loss=0.04705, over 14023.00 frames. ], tot_loss[loss=0.1471, simple_loss=0.1704, pruned_loss=0.06188, over 1950470.92 frames. ], batch size: 22, lr: 9.71e-03, grad_scale: 8.0 +2022-12-07 19:05:11,776 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.400e+01 2.402e+02 3.111e+02 3.796e+02 6.098e+02, threshold=6.221e+02, percent-clipped=5.0 +2022-12-07 19:05:32,165 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=59164.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:05:50,731 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3323, 3.0058, 2.8081, 1.8706, 2.8127, 3.0990, 3.3174, 2.6859], + device='cuda:2'), covar=tensor([0.0545, 0.1435, 0.1009, 0.2079, 0.0968, 0.0531, 0.0678, 0.1351], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0186, 0.0129, 0.0126, 0.0122, 0.0128, 0.0105, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 19:06:26,862 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=59225.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:06:35,507 INFO [train.py:873] (2/4) Epoch 8, batch 6300, loss[loss=0.1546, simple_loss=0.145, pruned_loss=0.0821, over 1237.00 frames. ], tot_loss[loss=0.1469, simple_loss=0.1704, pruned_loss=0.06169, over 1934483.45 frames. ], batch size: 100, lr: 9.70e-03, grad_scale: 8.0 +2022-12-07 19:06:40,625 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.486e+02 2.206e+02 2.756e+02 3.412e+02 6.293e+02, threshold=5.512e+02, percent-clipped=2.0 +2022-12-07 19:07:00,054 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59262.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:07:20,960 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9139, 1.6953, 1.9711, 1.6721, 2.1076, 1.7282, 1.6178, 1.8788], + device='cuda:2'), covar=tensor([0.0458, 0.1274, 0.0220, 0.0405, 0.0275, 0.0597, 0.0264, 0.0390], + device='cuda:2'), in_proj_covar=tensor([0.0323, 0.0324, 0.0395, 0.0308, 0.0381, 0.0308, 0.0355, 0.0326], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:07:51,734 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.99 vs. limit=5.0 +2022-12-07 19:08:04,133 INFO [train.py:873] (2/4) Epoch 8, batch 6400, loss[loss=0.1505, simple_loss=0.161, pruned_loss=0.06998, over 4936.00 frames. ], tot_loss[loss=0.1468, simple_loss=0.1703, pruned_loss=0.06161, over 1970008.72 frames. ], batch size: 100, lr: 9.69e-03, grad_scale: 8.0 +2022-12-07 19:08:06,046 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=59337.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:08:09,584 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.267e+01 2.304e+02 2.776e+02 3.758e+02 5.907e+02, threshold=5.553e+02, percent-clipped=4.0 +2022-12-07 19:08:15,119 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9902, 2.8394, 2.5181, 2.6488, 2.9014, 2.8880, 2.9393, 2.9798], + device='cuda:2'), covar=tensor([0.1127, 0.0830, 0.2494, 0.3305, 0.1036, 0.1053, 0.1455, 0.0943], + device='cuda:2'), in_proj_covar=tensor([0.0336, 0.0239, 0.0398, 0.0510, 0.0285, 0.0376, 0.0366, 0.0323], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:08:48,444 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=59385.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:09:25,840 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3000, 4.9342, 4.7806, 5.3737, 4.8331, 4.6465, 5.3656, 5.2239], + device='cuda:2'), covar=tensor([0.0621, 0.0662, 0.0808, 0.0532, 0.0950, 0.0423, 0.0543, 0.0678], + device='cuda:2'), in_proj_covar=tensor([0.0123, 0.0117, 0.0129, 0.0135, 0.0131, 0.0104, 0.0144, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 19:09:29,207 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=59431.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:09:32,600 INFO [train.py:873] (2/4) Epoch 8, batch 6500, loss[loss=0.146, simple_loss=0.1754, pruned_loss=0.05824, over 14221.00 frames. ], tot_loss[loss=0.1465, simple_loss=0.1703, pruned_loss=0.06133, over 1981725.99 frames. ], batch size: 89, lr: 9.68e-03, grad_scale: 8.0 +2022-12-07 19:09:37,914 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.592e+02 2.343e+02 2.966e+02 3.672e+02 1.051e+03, threshold=5.932e+02, percent-clipped=2.0 +2022-12-07 19:10:10,831 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=59479.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:10:11,801 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1566, 3.4326, 4.1682, 3.0115, 2.4635, 3.4520, 2.1545, 3.5716], + device='cuda:2'), covar=tensor([0.1158, 0.0884, 0.0479, 0.1690, 0.2623, 0.0898, 0.4452, 0.0838], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0093, 0.0083, 0.0091, 0.0112, 0.0078, 0.0132, 0.0081], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 19:10:31,168 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=59502.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:10:46,667 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59520.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:10:59,606 INFO [train.py:873] (2/4) Epoch 8, batch 6600, loss[loss=0.1692, simple_loss=0.181, pruned_loss=0.07868, over 11947.00 frames. ], tot_loss[loss=0.1469, simple_loss=0.1701, pruned_loss=0.06183, over 1986755.61 frames. ], batch size: 100, lr: 9.67e-03, grad_scale: 8.0 +2022-12-07 19:11:04,956 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.142e+02 2.283e+02 2.818e+02 3.412e+02 6.206e+02, threshold=5.635e+02, percent-clipped=2.0 +2022-12-07 19:11:23,725 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=59562.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:11:24,548 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=59563.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:11:48,140 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.46 vs. limit=2.0 +2022-12-07 19:12:05,792 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=59610.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:12:27,867 INFO [train.py:873] (2/4) Epoch 8, batch 6700, loss[loss=0.1296, simple_loss=0.162, pruned_loss=0.04863, over 14166.00 frames. ], tot_loss[loss=0.1467, simple_loss=0.1701, pruned_loss=0.06162, over 1970884.02 frames. ], batch size: 35, lr: 9.66e-03, grad_scale: 8.0 +2022-12-07 19:12:32,694 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.081e+02 2.361e+02 2.930e+02 3.597e+02 6.848e+02, threshold=5.861e+02, percent-clipped=3.0 +2022-12-07 19:12:59,864 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4042, 2.4333, 1.9431, 2.5449, 2.3084, 2.4394, 2.1819, 2.1407], + device='cuda:2'), covar=tensor([0.0651, 0.0726, 0.2004, 0.0466, 0.0842, 0.0571, 0.1253, 0.1173], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0293, 0.0275, 0.0222, 0.0286, 0.0275, 0.0253, 0.0264], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:13:10,699 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-12-07 19:13:15,597 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2062, 2.2743, 4.2219, 2.8445, 4.1216, 2.0615, 3.2697, 4.0445], + device='cuda:2'), covar=tensor([0.0486, 0.4333, 0.0403, 0.7598, 0.0444, 0.3761, 0.1260, 0.0463], + device='cuda:2'), in_proj_covar=tensor([0.0231, 0.0230, 0.0184, 0.0308, 0.0205, 0.0232, 0.0223, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:13:55,716 INFO [train.py:873] (2/4) Epoch 8, batch 6800, loss[loss=0.1279, simple_loss=0.1607, pruned_loss=0.04753, over 14285.00 frames. ], tot_loss[loss=0.1458, simple_loss=0.1698, pruned_loss=0.06087, over 1987819.16 frames. ], batch size: 63, lr: 9.66e-03, grad_scale: 8.0 +2022-12-07 19:14:01,554 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.385e+02 2.278e+02 2.761e+02 3.768e+02 7.478e+02, threshold=5.523e+02, percent-clipped=3.0 +2022-12-07 19:14:51,290 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.64 vs. limit=2.0 +2022-12-07 19:15:12,051 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=59820.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:15:17,886 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7852, 2.6766, 1.9549, 2.8702, 2.5916, 2.7502, 2.3125, 2.0966], + device='cuda:2'), covar=tensor([0.0796, 0.1444, 0.4112, 0.0600, 0.0991, 0.0853, 0.1758, 0.3522], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0294, 0.0276, 0.0225, 0.0289, 0.0278, 0.0255, 0.0263], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:15:25,705 INFO [train.py:873] (2/4) Epoch 8, batch 6900, loss[loss=0.162, simple_loss=0.1891, pruned_loss=0.06744, over 14466.00 frames. ], tot_loss[loss=0.1465, simple_loss=0.1703, pruned_loss=0.06135, over 1997842.15 frames. ], batch size: 51, lr: 9.65e-03, grad_scale: 8.0 +2022-12-07 19:15:30,915 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 2.523e+02 3.039e+02 3.738e+02 1.408e+03, threshold=6.079e+02, percent-clipped=6.0 +2022-12-07 19:15:36,917 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=59848.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:15:45,656 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=59858.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:15:54,683 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=59868.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:16:13,238 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=59889.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:16:31,694 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=59909.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:16:38,546 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2045, 2.9408, 5.2523, 3.5451, 4.8433, 2.6321, 4.0931, 4.7674], + device='cuda:2'), covar=tensor([0.0337, 0.3838, 0.0415, 0.7439, 0.0428, 0.3217, 0.0951, 0.0204], + device='cuda:2'), in_proj_covar=tensor([0.0233, 0.0230, 0.0183, 0.0309, 0.0205, 0.0233, 0.0223, 0.0192], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:16:53,936 INFO [train.py:873] (2/4) Epoch 8, batch 7000, loss[loss=0.1153, simple_loss=0.1519, pruned_loss=0.03935, over 14045.00 frames. ], tot_loss[loss=0.1472, simple_loss=0.1706, pruned_loss=0.06187, over 2059844.20 frames. ], batch size: 19, lr: 9.64e-03, grad_scale: 8.0 +2022-12-07 19:16:59,264 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.368e+02 2.338e+02 3.066e+02 3.722e+02 8.266e+02, threshold=6.131e+02, percent-clipped=2.0 +2022-12-07 19:17:08,045 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=59950.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:17:22,100 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.68 vs. limit=2.0 +2022-12-07 19:17:50,673 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7902, 0.7975, 0.6432, 0.7376, 0.7914, 0.1425, 0.6737, 0.8443], + device='cuda:2'), covar=tensor([0.0295, 0.0315, 0.0305, 0.0142, 0.0174, 0.0179, 0.0481, 0.0315], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0024, 0.0023, 0.0022, 0.0024, 0.0033, 0.0022, 0.0023], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 19:18:26,315 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.65 vs. limit=5.0 +2022-12-07 19:18:27,445 INFO [train.py:873] (2/4) Epoch 8, batch 7100, loss[loss=0.2099, simple_loss=0.1748, pruned_loss=0.1225, over 1267.00 frames. ], tot_loss[loss=0.1475, simple_loss=0.1711, pruned_loss=0.06193, over 2041550.33 frames. ], batch size: 100, lr: 9.63e-03, grad_scale: 8.0 +2022-12-07 19:18:31,888 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60040.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:18:32,561 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.335e+02 2.178e+02 2.684e+02 3.271e+02 7.336e+02, threshold=5.369e+02, percent-clipped=1.0 +2022-12-07 19:18:42,299 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60052.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:18:54,032 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60065.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:19:05,152 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60078.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:19:06,893 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1430, 0.9668, 1.1121, 1.0487, 1.1643, 0.6129, 1.0975, 1.1082], + device='cuda:2'), covar=tensor([0.0674, 0.1177, 0.0596, 0.0788, 0.0601, 0.1199, 0.0779, 0.0824], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0024, 0.0024, 0.0022, 0.0025, 0.0033, 0.0023, 0.0024], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 19:19:10,091 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6149, 2.2864, 2.9722, 1.8157, 1.9135, 2.5913, 1.4152, 2.5044], + device='cuda:2'), covar=tensor([0.0962, 0.1836, 0.0714, 0.2517, 0.2745, 0.0958, 0.4946, 0.1339], + device='cuda:2'), in_proj_covar=tensor([0.0074, 0.0094, 0.0084, 0.0091, 0.0111, 0.0078, 0.0131, 0.0081], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 19:19:24,580 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60101.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 19:19:35,595 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60113.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 19:19:46,343 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60126.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:19:53,838 INFO [train.py:873] (2/4) Epoch 8, batch 7200, loss[loss=0.1219, simple_loss=0.1392, pruned_loss=0.05231, over 3902.00 frames. ], tot_loss[loss=0.1486, simple_loss=0.1717, pruned_loss=0.06277, over 2004249.91 frames. ], batch size: 100, lr: 9.63e-03, grad_scale: 8.0 +2022-12-07 19:19:57,783 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60139.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:19:59,259 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.579e+01 2.475e+02 2.936e+02 3.540e+02 1.198e+03, threshold=5.873e+02, percent-clipped=8.0 +2022-12-07 19:20:10,989 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6730, 1.8592, 1.9883, 2.1658, 1.6887, 2.0910, 1.7723, 1.0005], + device='cuda:2'), covar=tensor([0.1708, 0.1039, 0.0568, 0.0464, 0.1196, 0.0837, 0.1590, 0.3193], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0066, 0.0055, 0.0055, 0.0085, 0.0065, 0.0088, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 19:20:14,377 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60158.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:20:26,321 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4819, 1.2991, 1.3052, 1.3796, 1.5757, 0.8635, 1.2741, 0.8403], + device='cuda:2'), covar=tensor([0.0363, 0.0921, 0.0453, 0.0483, 0.0272, 0.0383, 0.0299, 0.0587], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0013, 0.0011, 0.0012, 0.0012, 0.0019, 0.0016, 0.0020], + device='cuda:2'), out_proj_covar=tensor([9.1527e-05, 9.6601e-05, 8.5596e-05, 9.2197e-05, 8.6832e-05, 1.3437e-04, + 1.1316e-04, 1.2715e-04], device='cuda:2') +2022-12-07 19:20:32,256 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1419, 1.5749, 1.6926, 1.7339, 1.5668, 1.7909, 1.4105, 1.2370], + device='cuda:2'), covar=tensor([0.1732, 0.0742, 0.0432, 0.0289, 0.0915, 0.0509, 0.1556, 0.1767], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0066, 0.0055, 0.0056, 0.0085, 0.0065, 0.0088, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 19:20:49,243 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60198.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:20:54,637 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60204.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:20:56,407 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60206.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:21:03,451 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6075, 1.3157, 3.5635, 1.4845, 3.4404, 3.6581, 2.5293, 3.9258], + device='cuda:2'), covar=tensor([0.0241, 0.3142, 0.0367, 0.2426, 0.0689, 0.0363, 0.0808, 0.0174], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0159, 0.0155, 0.0169, 0.0168, 0.0166, 0.0133, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 19:21:22,474 INFO [train.py:873] (2/4) Epoch 8, batch 7300, loss[loss=0.1991, simple_loss=0.1767, pruned_loss=0.1107, over 2628.00 frames. ], tot_loss[loss=0.1477, simple_loss=0.1707, pruned_loss=0.06236, over 1986812.58 frames. ], batch size: 100, lr: 9.62e-03, grad_scale: 16.0 +2022-12-07 19:21:28,209 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.538e+01 2.297e+02 3.128e+02 4.022e+02 9.031e+02, threshold=6.256e+02, percent-clipped=2.0 +2022-12-07 19:21:30,674 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60245.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:21:43,249 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60259.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:22:17,590 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1851, 4.6795, 4.6319, 5.1441, 4.7840, 4.4347, 5.1598, 4.2698], + device='cuda:2'), covar=tensor([0.0383, 0.1178, 0.0331, 0.0462, 0.0786, 0.0509, 0.0524, 0.0558], + device='cuda:2'), in_proj_covar=tensor([0.0157, 0.0246, 0.0166, 0.0163, 0.0164, 0.0132, 0.0250, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 19:22:49,539 INFO [train.py:873] (2/4) Epoch 8, batch 7400, loss[loss=0.207, simple_loss=0.2027, pruned_loss=0.1056, over 9485.00 frames. ], tot_loss[loss=0.1478, simple_loss=0.1704, pruned_loss=0.06255, over 1985056.54 frames. ], batch size: 100, lr: 9.61e-03, grad_scale: 8.0 +2022-12-07 19:22:56,039 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.153e+02 2.246e+02 2.782e+02 3.435e+02 7.808e+02, threshold=5.565e+02, percent-clipped=3.0 +2022-12-07 19:23:36,481 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5703, 4.2368, 3.9403, 4.1498, 4.2880, 4.4330, 4.5542, 4.5103], + device='cuda:2'), covar=tensor([0.0677, 0.0458, 0.1890, 0.2649, 0.0684, 0.0596, 0.0754, 0.0718], + device='cuda:2'), in_proj_covar=tensor([0.0341, 0.0239, 0.0398, 0.0514, 0.0295, 0.0375, 0.0367, 0.0325], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:23:42,331 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6663, 3.4483, 3.1741, 2.4317, 3.2035, 3.3630, 3.6651, 3.0221], + device='cuda:2'), covar=tensor([0.0632, 0.1632, 0.1049, 0.1887, 0.0821, 0.0693, 0.0762, 0.1464], + device='cuda:2'), in_proj_covar=tensor([0.0122, 0.0182, 0.0126, 0.0122, 0.0121, 0.0126, 0.0103, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0004, 0.0005], + device='cuda:2') +2022-12-07 19:23:43,976 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60396.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 19:23:54,333 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60408.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 19:24:06,087 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60421.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:24:17,783 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60434.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:24:18,543 INFO [train.py:873] (2/4) Epoch 8, batch 7500, loss[loss=0.1373, simple_loss=0.1641, pruned_loss=0.05526, over 14154.00 frames. ], tot_loss[loss=0.1461, simple_loss=0.1692, pruned_loss=0.06145, over 1997824.67 frames. ], batch size: 35, lr: 9.60e-03, grad_scale: 8.0 +2022-12-07 19:24:24,273 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.353e+02 2.510e+02 2.850e+02 3.552e+02 6.747e+02, threshold=5.701e+02, percent-clipped=5.0 +2022-12-07 19:24:28,298 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.75 vs. limit=2.0 +2022-12-07 19:25:46,339 INFO [train.py:873] (2/4) Epoch 9, batch 0, loss[loss=0.1415, simple_loss=0.169, pruned_loss=0.05695, over 14273.00 frames. ], tot_loss[loss=0.1415, simple_loss=0.169, pruned_loss=0.05695, over 14273.00 frames. ], batch size: 57, lr: 9.08e-03, grad_scale: 8.0 +2022-12-07 19:25:46,340 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 19:25:53,546 INFO [train.py:905] (2/4) Epoch 9, validation: loss=0.1275, simple_loss=0.1706, pruned_loss=0.04216, over 857387.00 frames. +2022-12-07 19:25:53,547 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 19:26:00,156 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60504.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:26:10,750 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3094, 2.1078, 2.2387, 1.5840, 1.9165, 2.2452, 2.3578, 2.0193], + device='cuda:2'), covar=tensor([0.0787, 0.1092, 0.1000, 0.1612, 0.1235, 0.0643, 0.0498, 0.1433], + device='cuda:2'), in_proj_covar=tensor([0.0124, 0.0187, 0.0128, 0.0124, 0.0124, 0.0128, 0.0105, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 19:26:33,659 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.503e+01 2.217e+02 2.837e+02 3.988e+02 6.329e+02, threshold=5.675e+02, percent-clipped=3.0 +2022-12-07 19:26:36,533 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60545.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:26:43,323 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60552.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:26:45,481 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=60554.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:27:19,869 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60593.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:27:23,500 INFO [train.py:873] (2/4) Epoch 9, batch 100, loss[loss=0.1341, simple_loss=0.1651, pruned_loss=0.05153, over 14257.00 frames. ], tot_loss[loss=0.1467, simple_loss=0.1714, pruned_loss=0.06103, over 883238.93 frames. ], batch size: 80, lr: 9.08e-03, grad_scale: 8.0 +2022-12-07 19:27:58,048 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4789, 4.9524, 4.9335, 5.3327, 5.1105, 4.6141, 5.3903, 4.4868], + device='cuda:2'), covar=tensor([0.0275, 0.0869, 0.0275, 0.0454, 0.0637, 0.0403, 0.0476, 0.0464], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0243, 0.0165, 0.0161, 0.0162, 0.0132, 0.0249, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 19:28:03,330 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.184e+02 2.443e+02 2.952e+02 3.884e+02 8.156e+02, threshold=5.905e+02, percent-clipped=9.0 +2022-12-07 19:28:04,992 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-12-07 19:28:51,035 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60696.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:28:51,887 INFO [train.py:873] (2/4) Epoch 9, batch 200, loss[loss=0.1948, simple_loss=0.1975, pruned_loss=0.09608, over 8626.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.1696, pruned_loss=0.05997, over 1302023.77 frames. ], batch size: 100, lr: 9.07e-03, grad_scale: 8.0 +2022-12-07 19:29:01,434 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60708.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:29:11,264 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.5647, 4.7173, 5.2383, 5.5369, 5.3021, 4.4910, 5.6393, 4.3189], + device='cuda:2'), covar=tensor([0.1058, 0.2193, 0.0740, 0.1084, 0.1182, 0.0642, 0.1071, 0.1089], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0242, 0.0164, 0.0159, 0.0161, 0.0132, 0.0247, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 19:29:11,344 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=60718.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:29:12,216 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3817, 1.6216, 1.6678, 1.3925, 1.2377, 1.0246, 1.1945, 1.0353], + device='cuda:2'), covar=tensor([0.0506, 0.0459, 0.0372, 0.0705, 0.0495, 0.0411, 0.0310, 0.0577], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0011, 0.0012, 0.0012, 0.0019, 0.0015, 0.0019], + device='cuda:2'), out_proj_covar=tensor([8.9124e-05, 9.5668e-05, 8.5254e-05, 9.0823e-05, 8.6887e-05, 1.3391e-04, + 1.1241e-04, 1.2611e-04], device='cuda:2') +2022-12-07 19:29:13,741 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60721.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:29:24,906 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60734.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:29:32,171 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.211e+01 2.496e+02 3.042e+02 4.274e+02 1.188e+03, threshold=6.084e+02, percent-clipped=4.0 +2022-12-07 19:29:33,109 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60744.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:29:44,435 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60756.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:29:55,852 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60769.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:30:04,792 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=60779.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:30:07,105 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60782.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:30:20,884 INFO [train.py:873] (2/4) Epoch 9, batch 300, loss[loss=0.147, simple_loss=0.1711, pruned_loss=0.06141, over 11979.00 frames. ], tot_loss[loss=0.1462, simple_loss=0.1703, pruned_loss=0.06109, over 1584825.89 frames. ], batch size: 100, lr: 9.06e-03, grad_scale: 4.0 +2022-12-07 19:31:01,440 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.410e+02 2.299e+02 2.848e+02 3.507e+02 1.292e+03, threshold=5.696e+02, percent-clipped=2.0 +2022-12-07 19:31:11,045 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=60854.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:31:49,309 INFO [train.py:873] (2/4) Epoch 9, batch 400, loss[loss=0.1692, simple_loss=0.1778, pruned_loss=0.08031, over 8604.00 frames. ], tot_loss[loss=0.1457, simple_loss=0.1699, pruned_loss=0.06076, over 1779524.75 frames. ], batch size: 100, lr: 9.06e-03, grad_scale: 8.0 +2022-12-07 19:31:53,798 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=60902.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:32:09,323 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4587, 2.6153, 4.4076, 4.5558, 4.5464, 2.6920, 4.4181, 3.4757], + device='cuda:2'), covar=tensor([0.0202, 0.0610, 0.0677, 0.0235, 0.0170, 0.0890, 0.0194, 0.0572], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0232, 0.0344, 0.0292, 0.0232, 0.0276, 0.0255, 0.0261], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:32:29,748 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.639e+01 2.151e+02 2.975e+02 3.759e+02 9.050e+02, threshold=5.950e+02, percent-clipped=6.0 +2022-12-07 19:32:45,763 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9785, 3.6339, 2.8098, 4.2251, 3.9695, 4.0026, 3.4416, 2.9153], + device='cuda:2'), covar=tensor([0.0912, 0.1397, 0.3909, 0.0432, 0.0850, 0.1663, 0.1189, 0.3855], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0296, 0.0276, 0.0230, 0.0294, 0.0280, 0.0253, 0.0268], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 19:33:01,684 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-07 19:33:11,472 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7422, 1.1377, 1.2859, 1.2897, 1.1192, 1.3780, 1.0958, 0.8897], + device='cuda:2'), covar=tensor([0.2662, 0.0934, 0.0306, 0.0298, 0.1376, 0.0432, 0.1793, 0.1303], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0068, 0.0055, 0.0057, 0.0086, 0.0065, 0.0091, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 19:33:12,438 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0732, 2.4784, 3.7861, 2.6640, 3.8273, 3.7223, 3.6128, 3.1737], + device='cuda:2'), covar=tensor([0.0654, 0.3452, 0.1030, 0.2325, 0.0789, 0.0862, 0.1750, 0.2382], + device='cuda:2'), in_proj_covar=tensor([0.0330, 0.0324, 0.0399, 0.0313, 0.0374, 0.0313, 0.0364, 0.0326], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:33:18,101 INFO [train.py:873] (2/4) Epoch 9, batch 500, loss[loss=0.1383, simple_loss=0.1326, pruned_loss=0.07205, over 1199.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.1692, pruned_loss=0.06019, over 1817279.21 frames. ], batch size: 100, lr: 9.05e-03, grad_scale: 8.0 +2022-12-07 19:33:30,975 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2791, 2.9425, 2.2624, 3.3023, 3.0500, 3.0983, 2.5960, 2.1516], + device='cuda:2'), covar=tensor([0.0731, 0.1701, 0.4418, 0.0560, 0.1239, 0.1222, 0.2075, 0.4646], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0293, 0.0274, 0.0230, 0.0292, 0.0279, 0.0252, 0.0266], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 19:33:31,410 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.21 vs. limit=5.0 +2022-12-07 19:33:59,480 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.027e+02 2.291e+02 2.820e+02 3.548e+02 5.650e+02, threshold=5.639e+02, percent-clipped=0.0 +2022-12-07 19:34:14,613 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=61061.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:34:25,958 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=61074.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:34:45,881 INFO [train.py:873] (2/4) Epoch 9, batch 600, loss[loss=0.1359, simple_loss=0.1687, pruned_loss=0.0515, over 14471.00 frames. ], tot_loss[loss=0.1459, simple_loss=0.1695, pruned_loss=0.06112, over 1854513.06 frames. ], batch size: 24, lr: 9.04e-03, grad_scale: 8.0 +2022-12-07 19:35:08,482 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=61122.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:35:25,943 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.178e+01 2.284e+02 2.726e+02 3.406e+02 8.336e+02, threshold=5.452e+02, percent-clipped=5.0 +2022-12-07 19:35:45,891 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2252, 2.7366, 3.8470, 3.1127, 3.9883, 3.9069, 3.7788, 3.2556], + device='cuda:2'), covar=tensor([0.0644, 0.2812, 0.1172, 0.1678, 0.0884, 0.0762, 0.1416, 0.1793], + device='cuda:2'), in_proj_covar=tensor([0.0326, 0.0317, 0.0393, 0.0306, 0.0371, 0.0308, 0.0355, 0.0320], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:36:08,711 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3473, 1.7220, 1.8818, 1.5308, 1.4623, 1.3738, 1.3478, 1.3276], + device='cuda:2'), covar=tensor([0.0634, 0.0770, 0.0489, 0.0859, 0.0851, 0.0535, 0.0500, 0.0929], + device='cuda:2'), in_proj_covar=tensor([0.0012, 0.0013, 0.0011, 0.0012, 0.0012, 0.0019, 0.0016, 0.0020], + device='cuda:2'), out_proj_covar=tensor([9.1573e-05, 9.7359e-05, 8.5892e-05, 9.2981e-05, 8.8692e-05, 1.3402e-04, + 1.1352e-04, 1.2739e-04], device='cuda:2') +2022-12-07 19:36:13,117 INFO [train.py:873] (2/4) Epoch 9, batch 700, loss[loss=0.1574, simple_loss=0.1704, pruned_loss=0.07224, over 5946.00 frames. ], tot_loss[loss=0.1466, simple_loss=0.17, pruned_loss=0.06154, over 1875747.28 frames. ], batch size: 100, lr: 9.03e-03, grad_scale: 8.0 +2022-12-07 19:36:26,110 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7581, 1.5826, 1.7463, 2.0785, 1.4605, 1.7775, 1.7105, 1.9214], + device='cuda:2'), covar=tensor([0.0109, 0.0171, 0.0086, 0.0083, 0.0170, 0.0187, 0.0121, 0.0086], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0231, 0.0345, 0.0293, 0.0233, 0.0279, 0.0256, 0.0262], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:36:28,781 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1212, 3.1457, 3.1282, 3.0981, 2.5020, 3.1565, 2.8516, 1.4222], + device='cuda:2'), covar=tensor([0.2399, 0.0827, 0.2023, 0.0827, 0.1175, 0.0764, 0.1655, 0.3379], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0068, 0.0055, 0.0056, 0.0086, 0.0064, 0.0089, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 19:36:54,420 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.093e+02 2.220e+02 2.923e+02 3.750e+02 6.888e+02, threshold=5.845e+02, percent-clipped=6.0 +2022-12-07 19:37:41,706 INFO [train.py:873] (2/4) Epoch 9, batch 800, loss[loss=0.1682, simple_loss=0.1591, pruned_loss=0.08867, over 1232.00 frames. ], tot_loss[loss=0.147, simple_loss=0.1701, pruned_loss=0.06193, over 1901616.61 frames. ], batch size: 100, lr: 9.03e-03, grad_scale: 8.0 +2022-12-07 19:37:43,789 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7140, 2.1247, 3.5855, 3.8703, 3.6764, 2.2066, 3.7022, 2.6255], + device='cuda:2'), covar=tensor([0.0279, 0.0772, 0.0656, 0.0342, 0.0295, 0.1177, 0.0224, 0.0902], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0234, 0.0348, 0.0296, 0.0236, 0.0281, 0.0258, 0.0265], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:38:05,749 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-12-07 19:38:22,019 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.383e+02 2.254e+02 2.833e+02 3.522e+02 6.123e+02, threshold=5.665e+02, percent-clipped=1.0 +2022-12-07 19:38:39,116 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8259, 3.5079, 3.3539, 3.4803, 3.7220, 3.7179, 3.8386, 3.7856], + device='cuda:2'), covar=tensor([0.0771, 0.0593, 0.1883, 0.2489, 0.0678, 0.0715, 0.0836, 0.0847], + device='cuda:2'), in_proj_covar=tensor([0.0340, 0.0241, 0.0399, 0.0513, 0.0291, 0.0374, 0.0368, 0.0329], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:38:49,291 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=61374.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:39:09,294 INFO [train.py:873] (2/4) Epoch 9, batch 900, loss[loss=0.1585, simple_loss=0.1776, pruned_loss=0.06969, over 12738.00 frames. ], tot_loss[loss=0.1467, simple_loss=0.1699, pruned_loss=0.06173, over 1921909.97 frames. ], batch size: 100, lr: 9.02e-03, grad_scale: 8.0 +2022-12-07 19:39:27,310 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=61417.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:39:31,537 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=61422.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:39:50,648 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.310e+02 2.290e+02 3.289e+02 4.035e+02 9.123e+02, threshold=6.578e+02, percent-clipped=5.0 +2022-12-07 19:39:53,268 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3694, 3.7781, 3.0566, 4.5621, 4.2116, 4.2748, 3.7078, 3.1130], + device='cuda:2'), covar=tensor([0.1082, 0.1546, 0.4315, 0.0737, 0.0840, 0.3040, 0.1526, 0.4262], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0294, 0.0273, 0.0230, 0.0291, 0.0279, 0.0254, 0.0267], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 19:40:27,244 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8234, 1.4778, 3.7657, 1.6195, 3.7374, 3.8355, 2.6701, 4.1997], + device='cuda:2'), covar=tensor([0.0196, 0.2951, 0.0344, 0.2240, 0.0417, 0.0336, 0.0798, 0.0132], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0159, 0.0157, 0.0173, 0.0171, 0.0171, 0.0138, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 19:40:37,283 INFO [train.py:873] (2/4) Epoch 9, batch 1000, loss[loss=0.164, simple_loss=0.1592, pruned_loss=0.08437, over 2625.00 frames. ], tot_loss[loss=0.1457, simple_loss=0.1692, pruned_loss=0.06112, over 1922352.50 frames. ], batch size: 100, lr: 9.01e-03, grad_scale: 8.0 +2022-12-07 19:41:16,883 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.92 vs. limit=2.0 +2022-12-07 19:41:17,398 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6498, 1.3532, 3.6342, 1.5327, 3.6139, 3.6377, 2.5823, 3.9887], + device='cuda:2'), covar=tensor([0.0226, 0.3137, 0.0384, 0.2447, 0.0488, 0.0390, 0.0832, 0.0167], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0158, 0.0157, 0.0172, 0.0171, 0.0169, 0.0136, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 19:41:18,076 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.146e+02 2.101e+02 2.828e+02 3.770e+02 7.982e+02, threshold=5.656e+02, percent-clipped=1.0 +2022-12-07 19:41:19,561 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.78 vs. limit=2.0 +2022-12-07 19:42:00,336 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4112, 2.3340, 2.4839, 2.4636, 2.4180, 2.1709, 1.3794, 2.2147], + device='cuda:2'), covar=tensor([0.0391, 0.0431, 0.0441, 0.0328, 0.0399, 0.1049, 0.2270, 0.0356], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0157, 0.0133, 0.0129, 0.0185, 0.0127, 0.0152, 0.0173], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:42:05,668 INFO [train.py:873] (2/4) Epoch 9, batch 1100, loss[loss=0.1223, simple_loss=0.1529, pruned_loss=0.0458, over 14050.00 frames. ], tot_loss[loss=0.1447, simple_loss=0.1683, pruned_loss=0.06051, over 1928759.31 frames. ], batch size: 19, lr: 9.00e-03, grad_scale: 8.0 +2022-12-07 19:42:26,657 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-07 19:42:46,207 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.208e+02 2.543e+02 3.100e+02 3.894e+02 7.246e+02, threshold=6.200e+02, percent-clipped=7.0 +2022-12-07 19:42:48,114 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3131, 1.5160, 1.3027, 1.1364, 1.4384, 0.5917, 1.4372, 1.4126], + device='cuda:2'), covar=tensor([0.2345, 0.1132, 0.0898, 0.2252, 0.1322, 0.0834, 0.0907, 0.1419], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0023, 0.0023, 0.0021, 0.0023, 0.0033, 0.0024, 0.0024], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 19:43:05,612 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7591, 1.6646, 3.6311, 1.6305, 3.6632, 3.6474, 2.6061, 4.0781], + device='cuda:2'), covar=tensor([0.0205, 0.2660, 0.0339, 0.2179, 0.0464, 0.0381, 0.0839, 0.0151], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0158, 0.0155, 0.0172, 0.0170, 0.0169, 0.0136, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 19:43:32,679 INFO [train.py:873] (2/4) Epoch 9, batch 1200, loss[loss=0.1319, simple_loss=0.1651, pruned_loss=0.0493, over 14252.00 frames. ], tot_loss[loss=0.1449, simple_loss=0.1687, pruned_loss=0.06057, over 1952733.22 frames. ], batch size: 66, lr: 9.00e-03, grad_scale: 8.0 +2022-12-07 19:43:33,361 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-07 19:43:37,475 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-12-07 19:43:49,867 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=61717.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:44:03,834 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=61732.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:44:13,226 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.333e+02 2.189e+02 2.580e+02 3.174e+02 5.497e+02, threshold=5.160e+02, percent-clipped=0.0 +2022-12-07 19:44:13,400 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=61743.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:44:32,972 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=61765.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:44:56,452 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=61793.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:44:59,743 INFO [train.py:873] (2/4) Epoch 9, batch 1300, loss[loss=0.1424, simple_loss=0.1744, pruned_loss=0.0552, over 14248.00 frames. ], tot_loss[loss=0.1447, simple_loss=0.1685, pruned_loss=0.06045, over 1953042.55 frames. ], batch size: 89, lr: 8.99e-03, grad_scale: 8.0 +2022-12-07 19:45:07,295 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=61804.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:45:41,029 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 2.302e+02 2.881e+02 3.480e+02 7.451e+02, threshold=5.762e+02, percent-clipped=5.0 +2022-12-07 19:46:04,691 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7729, 2.0408, 2.6950, 2.2526, 2.7475, 2.4815, 2.4377, 2.3378], + device='cuda:2'), covar=tensor([0.0541, 0.2642, 0.0899, 0.1684, 0.0507, 0.1016, 0.0763, 0.1648], + device='cuda:2'), in_proj_covar=tensor([0.0327, 0.0318, 0.0390, 0.0305, 0.0373, 0.0306, 0.0358, 0.0321], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:46:28,405 INFO [train.py:873] (2/4) Epoch 9, batch 1400, loss[loss=0.1718, simple_loss=0.1529, pruned_loss=0.09529, over 1219.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.1688, pruned_loss=0.06036, over 1988410.39 frames. ], batch size: 100, lr: 8.98e-03, grad_scale: 8.0 +2022-12-07 19:46:42,921 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=61914.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:46:53,118 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4742, 1.8241, 2.4430, 2.1033, 2.5035, 2.2554, 2.2034, 2.1140], + device='cuda:2'), covar=tensor([0.0452, 0.2547, 0.0893, 0.1528, 0.0485, 0.0918, 0.0814, 0.1530], + device='cuda:2'), in_proj_covar=tensor([0.0329, 0.0321, 0.0392, 0.0308, 0.0375, 0.0308, 0.0360, 0.0323], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:47:08,292 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.377e+02 2.184e+02 2.770e+02 3.491e+02 7.127e+02, threshold=5.539e+02, percent-clipped=2.0 +2022-12-07 19:47:37,245 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=61975.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:47:39,730 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6356, 2.5387, 2.6976, 2.6963, 2.6243, 2.4694, 1.3869, 2.4115], + device='cuda:2'), covar=tensor([0.0416, 0.0471, 0.0460, 0.0331, 0.0377, 0.0909, 0.2672, 0.0375], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0156, 0.0131, 0.0128, 0.0185, 0.0127, 0.0153, 0.0173], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:47:56,375 INFO [train.py:873] (2/4) Epoch 9, batch 1500, loss[loss=0.1711, simple_loss=0.1625, pruned_loss=0.08985, over 1248.00 frames. ], tot_loss[loss=0.1444, simple_loss=0.1684, pruned_loss=0.0602, over 1936523.18 frames. ], batch size: 100, lr: 8.98e-03, grad_scale: 8.0 +2022-12-07 19:48:37,573 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.350e+02 2.345e+02 2.854e+02 3.466e+02 6.884e+02, threshold=5.708e+02, percent-clipped=1.0 +2022-12-07 19:49:18,208 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62088.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:49:25,701 INFO [train.py:873] (2/4) Epoch 9, batch 1600, loss[loss=0.1488, simple_loss=0.1498, pruned_loss=0.0739, over 2615.00 frames. ], tot_loss[loss=0.1443, simple_loss=0.1681, pruned_loss=0.0603, over 1902122.54 frames. ], batch size: 100, lr: 8.97e-03, grad_scale: 8.0 +2022-12-07 19:49:27,543 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62099.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:49:48,697 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62122.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:50:07,094 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.173e+02 2.069e+02 2.487e+02 3.323e+02 7.707e+02, threshold=4.973e+02, percent-clipped=4.0 +2022-12-07 19:50:08,937 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3698, 0.8741, 1.2466, 0.8667, 0.9456, 1.3209, 1.0293, 1.0254], + device='cuda:2'), covar=tensor([0.0436, 0.0867, 0.0569, 0.0401, 0.1044, 0.0659, 0.0379, 0.1213], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0186, 0.0128, 0.0125, 0.0125, 0.0130, 0.0107, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 19:50:28,517 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-12-07 19:50:42,578 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62183.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:50:54,728 INFO [train.py:873] (2/4) Epoch 9, batch 1700, loss[loss=0.1223, simple_loss=0.1484, pruned_loss=0.0481, over 13664.00 frames. ], tot_loss[loss=0.1453, simple_loss=0.1689, pruned_loss=0.06085, over 1914329.08 frames. ], batch size: 17, lr: 8.96e-03, grad_scale: 8.0 +2022-12-07 19:51:13,755 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3319, 1.4221, 2.5596, 1.4040, 2.5292, 2.4705, 1.8677, 2.6278], + device='cuda:2'), covar=tensor([0.0262, 0.2041, 0.0260, 0.1667, 0.0345, 0.0472, 0.1026, 0.0243], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0156, 0.0154, 0.0170, 0.0166, 0.0166, 0.0133, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 19:51:35,609 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.148e+02 2.316e+02 3.145e+02 3.790e+02 7.968e+02, threshold=6.289e+02, percent-clipped=7.0 +2022-12-07 19:51:52,016 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-12-07 19:51:59,346 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62270.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:52:01,204 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3701, 2.3112, 3.1126, 2.5110, 3.0884, 3.1321, 3.0208, 2.6287], + device='cuda:2'), covar=tensor([0.0704, 0.2886, 0.0989, 0.1866, 0.0702, 0.0937, 0.1252, 0.1846], + device='cuda:2'), in_proj_covar=tensor([0.0330, 0.0319, 0.0396, 0.0310, 0.0372, 0.0308, 0.0359, 0.0324], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:52:01,971 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62273.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:52:06,704 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-12-07 19:52:23,520 INFO [train.py:873] (2/4) Epoch 9, batch 1800, loss[loss=0.169, simple_loss=0.1831, pruned_loss=0.07747, over 6929.00 frames. ], tot_loss[loss=0.1446, simple_loss=0.1684, pruned_loss=0.06042, over 1899443.34 frames. ], batch size: 100, lr: 8.95e-03, grad_scale: 8.0 +2022-12-07 19:52:56,785 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62334.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:53:00,408 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2985, 2.1693, 3.2593, 3.3410, 3.2573, 2.2589, 3.2795, 2.5776], + device='cuda:2'), covar=tensor([0.0275, 0.0638, 0.0597, 0.0289, 0.0293, 0.0863, 0.0228, 0.0661], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0235, 0.0351, 0.0295, 0.0239, 0.0282, 0.0261, 0.0265], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:53:04,486 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.227e+02 2.235e+02 2.778e+02 3.404e+02 6.725e+02, threshold=5.556e+02, percent-clipped=1.0 +2022-12-07 19:53:12,098 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.80 vs. limit=2.0 +2022-12-07 19:53:19,867 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1184, 1.8116, 3.3101, 2.3015, 3.1379, 1.7578, 2.6158, 3.1528], + device='cuda:2'), covar=tensor([0.0714, 0.4721, 0.0374, 0.6687, 0.0725, 0.3621, 0.1295, 0.0450], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0227, 0.0187, 0.0307, 0.0209, 0.0228, 0.0218, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:53:30,862 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9386, 1.5314, 3.8366, 3.6047, 3.6439, 3.8802, 3.1917, 3.9404], + device='cuda:2'), covar=tensor([0.1376, 0.1438, 0.0104, 0.0238, 0.0211, 0.0109, 0.0299, 0.0112], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0156, 0.0120, 0.0162, 0.0140, 0.0133, 0.0115, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:53:37,408 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0688, 4.8201, 4.5842, 5.1252, 4.6775, 4.3811, 5.1272, 4.9669], + device='cuda:2'), covar=tensor([0.0553, 0.0621, 0.0774, 0.0498, 0.0798, 0.0535, 0.0556, 0.0631], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0121, 0.0130, 0.0138, 0.0135, 0.0108, 0.0152, 0.0129], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 19:53:44,376 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62388.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:53:46,993 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62391.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:53:52,239 INFO [train.py:873] (2/4) Epoch 9, batch 1900, loss[loss=0.148, simple_loss=0.1726, pruned_loss=0.06174, over 14282.00 frames. ], tot_loss[loss=0.1449, simple_loss=0.1688, pruned_loss=0.06053, over 1929322.96 frames. ], batch size: 35, lr: 8.95e-03, grad_scale: 8.0 +2022-12-07 19:53:54,343 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62399.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:54:26,834 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62436.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:54:33,571 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.440e+02 2.190e+02 2.770e+02 3.608e+02 5.661e+02, threshold=5.540e+02, percent-clipped=1.0 +2022-12-07 19:54:37,182 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62447.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:54:41,706 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62452.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:55:04,154 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62478.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:55:20,939 INFO [train.py:873] (2/4) Epoch 9, batch 2000, loss[loss=0.1971, simple_loss=0.1653, pruned_loss=0.1145, over 1272.00 frames. ], tot_loss[loss=0.1453, simple_loss=0.1693, pruned_loss=0.06069, over 1956573.51 frames. ], batch size: 100, lr: 8.94e-03, grad_scale: 8.0 +2022-12-07 19:55:24,571 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5284, 3.1178, 3.0617, 1.8842, 3.0039, 3.2985, 3.4864, 2.7767], + device='cuda:2'), covar=tensor([0.0812, 0.2275, 0.1369, 0.3044, 0.1305, 0.0801, 0.1547, 0.1872], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0184, 0.0126, 0.0122, 0.0123, 0.0129, 0.0105, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 19:55:45,757 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0831, 3.2481, 3.1726, 3.2909, 2.2845, 3.4073, 3.0336, 1.4143], + device='cuda:2'), covar=tensor([0.2111, 0.0848, 0.1134, 0.0485, 0.1173, 0.0610, 0.1323, 0.3470], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0068, 0.0056, 0.0058, 0.0088, 0.0067, 0.0091, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 19:56:01,038 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.163e+02 2.315e+02 2.841e+02 3.698e+02 9.209e+02, threshold=5.682e+02, percent-clipped=7.0 +2022-12-07 19:56:02,972 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62545.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:56:24,918 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62570.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:56:37,494 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62585.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:56:41,027 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0129, 2.0519, 3.0396, 3.1314, 3.0767, 2.2330, 3.0753, 2.3941], + device='cuda:2'), covar=tensor([0.0257, 0.0615, 0.0462, 0.0274, 0.0244, 0.0799, 0.0231, 0.0579], + device='cuda:2'), in_proj_covar=tensor([0.0257, 0.0236, 0.0352, 0.0296, 0.0239, 0.0282, 0.0262, 0.0266], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:56:47,848 INFO [train.py:873] (2/4) Epoch 9, batch 2100, loss[loss=0.1359, simple_loss=0.1646, pruned_loss=0.05356, over 14188.00 frames. ], tot_loss[loss=0.1449, simple_loss=0.1688, pruned_loss=0.06052, over 2030265.51 frames. ], batch size: 99, lr: 8.93e-03, grad_scale: 8.0 +2022-12-07 19:56:56,372 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62606.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:57:07,124 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62618.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:57:10,694 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9239, 1.6827, 3.2674, 2.2985, 3.1236, 1.6919, 2.4886, 2.9499], + device='cuda:2'), covar=tensor([0.0989, 0.5424, 0.0565, 0.6013, 0.0805, 0.4304, 0.1505, 0.0649], + device='cuda:2'), in_proj_covar=tensor([0.0237, 0.0228, 0.0189, 0.0307, 0.0210, 0.0228, 0.0218, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 19:57:13,973 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5684, 1.1412, 2.0757, 1.8920, 1.9690, 2.0534, 1.5266, 2.0971], + device='cuda:2'), covar=tensor([0.0546, 0.0941, 0.0134, 0.0261, 0.0286, 0.0150, 0.0430, 0.0167], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0157, 0.0120, 0.0161, 0.0140, 0.0134, 0.0115, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:57:16,342 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62629.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:57:16,677 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.76 vs. limit=5.0 +2022-12-07 19:57:28,901 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.193e+02 2.143e+02 2.774e+02 3.519e+02 5.598e+02, threshold=5.548e+02, percent-clipped=0.0 +2022-12-07 19:57:31,562 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62646.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:57:48,815 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62666.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:58:15,320 INFO [train.py:873] (2/4) Epoch 9, batch 2200, loss[loss=0.17, simple_loss=0.1726, pruned_loss=0.08375, over 4963.00 frames. ], tot_loss[loss=0.1446, simple_loss=0.169, pruned_loss=0.06015, over 2024479.11 frames. ], batch size: 100, lr: 8.93e-03, grad_scale: 8.0 +2022-12-07 19:58:42,009 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62727.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:58:46,861 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.53 vs. limit=2.0 +2022-12-07 19:58:47,417 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2851, 3.4014, 3.5786, 3.3110, 3.4328, 3.2710, 1.4098, 3.3087], + device='cuda:2'), covar=tensor([0.0336, 0.0322, 0.0384, 0.0417, 0.0349, 0.0533, 0.3287, 0.0286], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0153, 0.0130, 0.0125, 0.0185, 0.0126, 0.0151, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 19:58:56,803 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.196e+02 2.557e+02 3.266e+02 4.324e+02 1.294e+03, threshold=6.533e+02, percent-clipped=13.0 +2022-12-07 19:58:59,500 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62747.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:59:09,715 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8083, 2.7665, 2.0749, 2.8929, 2.6645, 2.8104, 2.4867, 2.2723], + device='cuda:2'), covar=tensor([0.0741, 0.1236, 0.2916, 0.0599, 0.1259, 0.0933, 0.1364, 0.2641], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0296, 0.0274, 0.0230, 0.0293, 0.0279, 0.0254, 0.0264], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 19:59:21,927 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=62773.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:59:26,028 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62778.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 19:59:41,695 INFO [train.py:873] (2/4) Epoch 9, batch 2300, loss[loss=0.1237, simple_loss=0.1605, pruned_loss=0.04347, over 14287.00 frames. ], tot_loss[loss=0.1426, simple_loss=0.1676, pruned_loss=0.05878, over 1999369.15 frames. ], batch size: 44, lr: 8.92e-03, grad_scale: 8.0 +2022-12-07 19:59:59,844 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.6845, 1.1563, 1.3132, 1.3025, 1.2482, 1.3346, 1.0978, 0.9653], + device='cuda:2'), covar=tensor([0.2099, 0.0845, 0.0269, 0.0264, 0.0956, 0.0586, 0.1572, 0.0917], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0069, 0.0056, 0.0058, 0.0089, 0.0068, 0.0092, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:00:07,228 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62826.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:00:14,356 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=62834.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:00:23,176 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.000e+02 2.029e+02 2.536e+02 3.250e+02 8.518e+02, threshold=5.072e+02, percent-clipped=1.0 +2022-12-07 20:00:24,312 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4645, 3.0724, 2.3890, 3.5133, 3.3955, 3.4156, 2.9276, 2.3637], + device='cuda:2'), covar=tensor([0.0934, 0.1600, 0.3497, 0.0574, 0.0845, 0.1352, 0.1372, 0.4179], + device='cuda:2'), in_proj_covar=tensor([0.0257, 0.0296, 0.0275, 0.0231, 0.0292, 0.0279, 0.0254, 0.0263], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 20:00:31,379 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2952, 1.3699, 1.2387, 1.0165, 1.3719, 0.7203, 1.2733, 1.5316], + device='cuda:2'), covar=tensor([0.0911, 0.1390, 0.0631, 0.2109, 0.1657, 0.1031, 0.1056, 0.0679], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0023, 0.0022, 0.0022, 0.0023, 0.0033, 0.0023, 0.0024], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 20:00:41,407 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1345, 2.2857, 2.4008, 2.4289, 1.9526, 2.4442, 2.1910, 1.2759], + device='cuda:2'), covar=tensor([0.1621, 0.0748, 0.0623, 0.0513, 0.1164, 0.0489, 0.1242, 0.2725], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0069, 0.0055, 0.0058, 0.0089, 0.0068, 0.0091, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:01:09,230 INFO [train.py:873] (2/4) Epoch 9, batch 2400, loss[loss=0.1756, simple_loss=0.1844, pruned_loss=0.08341, over 7763.00 frames. ], tot_loss[loss=0.1432, simple_loss=0.1681, pruned_loss=0.05908, over 2023926.82 frames. ], batch size: 100, lr: 8.91e-03, grad_scale: 8.0 +2022-12-07 20:01:12,677 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62901.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:01:36,923 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=62929.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:01:47,423 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=62941.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:01:49,790 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.047e+02 2.251e+02 2.910e+02 3.795e+02 1.080e+03, threshold=5.819e+02, percent-clipped=6.0 +2022-12-07 20:02:18,740 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=62977.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:02:35,717 INFO [train.py:873] (2/4) Epoch 9, batch 2500, loss[loss=0.1439, simple_loss=0.1684, pruned_loss=0.05967, over 14256.00 frames. ], tot_loss[loss=0.1434, simple_loss=0.1679, pruned_loss=0.05945, over 1969487.37 frames. ], batch size: 80, lr: 8.90e-03, grad_scale: 8.0 +2022-12-07 20:02:36,758 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4619, 1.9988, 2.5171, 2.5962, 2.4098, 1.9832, 2.5665, 2.2367], + device='cuda:2'), covar=tensor([0.0242, 0.0506, 0.0273, 0.0220, 0.0259, 0.0626, 0.0220, 0.0358], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0238, 0.0353, 0.0300, 0.0242, 0.0285, 0.0266, 0.0268], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 20:02:58,105 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63022.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:03:17,451 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.170e+02 2.289e+02 2.848e+02 3.593e+02 5.714e+02, threshold=5.696e+02, percent-clipped=0.0 +2022-12-07 20:03:20,522 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63047.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:04:02,994 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63095.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:04:04,718 INFO [train.py:873] (2/4) Epoch 9, batch 2600, loss[loss=0.1556, simple_loss=0.1767, pruned_loss=0.06725, over 14239.00 frames. ], tot_loss[loss=0.1438, simple_loss=0.168, pruned_loss=0.05984, over 1952366.22 frames. ], batch size: 35, lr: 8.90e-03, grad_scale: 8.0 +2022-12-07 20:04:32,677 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63129.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:04:45,882 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.145e+02 2.604e+02 3.223e+02 3.963e+02 1.117e+03, threshold=6.445e+02, percent-clipped=7.0 +2022-12-07 20:04:54,187 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.29 vs. limit=5.0 +2022-12-07 20:05:27,446 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-07 20:05:32,671 INFO [train.py:873] (2/4) Epoch 9, batch 2700, loss[loss=0.1542, simple_loss=0.1777, pruned_loss=0.0653, over 9494.00 frames. ], tot_loss[loss=0.1443, simple_loss=0.168, pruned_loss=0.06029, over 1884149.21 frames. ], batch size: 100, lr: 8.89e-03, grad_scale: 8.0 +2022-12-07 20:05:36,681 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63201.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:05:54,588 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63221.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:06:10,878 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1473, 1.9405, 2.0728, 2.1453, 2.0841, 2.0243, 2.1996, 1.8333], + device='cuda:2'), covar=tensor([0.0791, 0.1395, 0.0686, 0.0722, 0.0833, 0.0720, 0.0866, 0.0662], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0252, 0.0175, 0.0167, 0.0168, 0.0137, 0.0256, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 20:06:11,821 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63241.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:06:14,513 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.252e+02 2.383e+02 3.002e+02 3.565e+02 6.845e+02, threshold=6.004e+02, percent-clipped=1.0 +2022-12-07 20:06:19,153 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63249.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:06:47,809 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63282.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:06:48,818 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7212, 1.7715, 1.6887, 1.5526, 1.4037, 1.2179, 1.3364, 0.9842], + device='cuda:2'), covar=tensor([0.0242, 0.0474, 0.0280, 0.0437, 0.0457, 0.0396, 0.0412, 0.0698], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0013, 0.0012, 0.0020, 0.0016, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.4880e-05, 1.0193e-04, 8.9125e-05, 9.9296e-05, 9.3032e-05, 1.4049e-04, + 1.1759e-04, 1.3436e-04], device='cuda:2') +2022-12-07 20:06:54,050 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63289.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:07:00,708 INFO [train.py:873] (2/4) Epoch 9, batch 2800, loss[loss=0.1551, simple_loss=0.1726, pruned_loss=0.06883, over 5979.00 frames. ], tot_loss[loss=0.1433, simple_loss=0.1677, pruned_loss=0.05939, over 1907696.00 frames. ], batch size: 100, lr: 8.88e-03, grad_scale: 8.0 +2022-12-07 20:07:22,861 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63322.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:07:42,340 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.098e+02 2.099e+02 2.541e+02 3.230e+02 5.695e+02, threshold=5.082e+02, percent-clipped=0.0 +2022-12-07 20:07:44,402 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9157, 4.6318, 4.2024, 4.4989, 4.5639, 4.8013, 4.8343, 4.8256], + device='cuda:2'), covar=tensor([0.0794, 0.0475, 0.2352, 0.2580, 0.0716, 0.0673, 0.1303, 0.1020], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0242, 0.0407, 0.0521, 0.0298, 0.0386, 0.0381, 0.0342], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:08:04,957 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63370.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:08:09,933 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6548, 3.3282, 3.1458, 2.3288, 3.1107, 3.4285, 3.5133, 2.8610], + device='cuda:2'), covar=tensor([0.0612, 0.1954, 0.1026, 0.1867, 0.0988, 0.0651, 0.0816, 0.1424], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0187, 0.0130, 0.0125, 0.0123, 0.0131, 0.0106, 0.0135], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:08:27,863 INFO [train.py:873] (2/4) Epoch 9, batch 2900, loss[loss=0.1725, simple_loss=0.1531, pruned_loss=0.09592, over 1269.00 frames. ], tot_loss[loss=0.1434, simple_loss=0.1683, pruned_loss=0.05925, over 1988876.37 frames. ], batch size: 100, lr: 8.88e-03, grad_scale: 8.0 +2022-12-07 20:08:48,394 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63420.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:08:49,964 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63422.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:08:55,940 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63429.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:09:09,074 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.322e+02 2.350e+02 2.806e+02 3.593e+02 6.969e+02, threshold=5.612e+02, percent-clipped=3.0 +2022-12-07 20:09:37,904 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63477.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:09:41,707 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63481.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:09:43,380 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63483.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:09:55,508 INFO [train.py:873] (2/4) Epoch 9, batch 3000, loss[loss=0.1367, simple_loss=0.1296, pruned_loss=0.07187, over 2592.00 frames. ], tot_loss[loss=0.1428, simple_loss=0.1676, pruned_loss=0.05901, over 1987223.27 frames. ], batch size: 100, lr: 8.87e-03, grad_scale: 8.0 +2022-12-07 20:09:55,508 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 20:10:05,839 INFO [train.py:905] (2/4) Epoch 9, validation: loss=0.124, simple_loss=0.1667, pruned_loss=0.04063, over 857387.00 frames. +2022-12-07 20:10:05,840 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17750MB +2022-12-07 20:10:12,595 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-12-07 20:10:46,176 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.041e+02 2.483e+02 2.932e+02 3.564e+02 6.010e+02, threshold=5.864e+02, percent-clipped=1.0 +2022-12-07 20:11:06,982 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9694, 4.7872, 4.2087, 4.3205, 4.6722, 4.8561, 5.0775, 4.9638], + device='cuda:2'), covar=tensor([0.1513, 0.1083, 0.3335, 0.4326, 0.1094, 0.1290, 0.1694, 0.1419], + device='cuda:2'), in_proj_covar=tensor([0.0352, 0.0240, 0.0403, 0.0516, 0.0296, 0.0387, 0.0377, 0.0336], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:11:08,828 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63569.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:11:15,615 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63577.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:11:23,888 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63586.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:11:33,372 INFO [train.py:873] (2/4) Epoch 9, batch 3100, loss[loss=0.1535, simple_loss=0.1767, pruned_loss=0.06512, over 14661.00 frames. ], tot_loss[loss=0.143, simple_loss=0.168, pruned_loss=0.05898, over 1990183.32 frames. ], batch size: 33, lr: 8.86e-03, grad_scale: 8.0 +2022-12-07 20:12:03,510 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63630.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:12:15,374 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.211e+02 2.304e+02 2.853e+02 3.296e+02 8.516e+02, threshold=5.706e+02, percent-clipped=4.0 +2022-12-07 20:12:18,617 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63647.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:12:55,986 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6272, 2.3497, 3.3910, 2.6816, 3.4898, 3.3307, 3.2803, 2.8806], + device='cuda:2'), covar=tensor([0.0716, 0.2913, 0.1016, 0.1745, 0.0786, 0.0833, 0.1488, 0.2000], + device='cuda:2'), in_proj_covar=tensor([0.0330, 0.0318, 0.0391, 0.0303, 0.0376, 0.0308, 0.0358, 0.0324], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:13:02,478 INFO [train.py:873] (2/4) Epoch 9, batch 3200, loss[loss=0.1334, simple_loss=0.1691, pruned_loss=0.04885, over 14067.00 frames. ], tot_loss[loss=0.1426, simple_loss=0.1676, pruned_loss=0.05878, over 1985154.52 frames. ], batch size: 22, lr: 8.86e-03, grad_scale: 8.0 +2022-12-07 20:13:39,013 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4147, 2.5962, 2.4799, 2.7151, 2.0391, 2.8411, 2.4890, 1.1838], + device='cuda:2'), covar=tensor([0.1719, 0.0808, 0.1145, 0.0680, 0.1203, 0.0657, 0.1203, 0.3364], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0069, 0.0054, 0.0057, 0.0086, 0.0066, 0.0092, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:13:43,328 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.200e+02 2.396e+02 2.898e+02 3.837e+02 7.541e+02, threshold=5.796e+02, percent-clipped=6.0 +2022-12-07 20:14:11,277 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63776.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:14:13,008 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63778.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:14:26,734 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1298, 1.1853, 0.9676, 1.0661, 1.0664, 0.6210, 0.8581, 0.9207], + device='cuda:2'), covar=tensor([0.0376, 0.0578, 0.0664, 0.0433, 0.0491, 0.0811, 0.0815, 0.0726], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0023, 0.0023, 0.0022, 0.0024, 0.0033, 0.0023, 0.0024], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 20:14:29,970 INFO [train.py:873] (2/4) Epoch 9, batch 3300, loss[loss=0.1347, simple_loss=0.1649, pruned_loss=0.05228, over 14173.00 frames. ], tot_loss[loss=0.1423, simple_loss=0.167, pruned_loss=0.05876, over 1938444.62 frames. ], batch size: 89, lr: 8.85e-03, grad_scale: 8.0 +2022-12-07 20:14:41,634 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63810.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:15:10,960 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.258e+02 2.144e+02 2.539e+02 3.073e+02 7.001e+02, threshold=5.078e+02, percent-clipped=2.0 +2022-12-07 20:15:11,967 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63845.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:15:30,574 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.5429, 5.4091, 5.0680, 5.5907, 5.1549, 4.7607, 5.6083, 5.4700], + device='cuda:2'), covar=tensor([0.0597, 0.0556, 0.0658, 0.0427, 0.0686, 0.0384, 0.0521, 0.0551], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0120, 0.0130, 0.0134, 0.0133, 0.0107, 0.0149, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 20:15:34,991 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63871.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:15:40,211 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=63877.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:15:55,705 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.35 vs. limit=5.0 +2022-12-07 20:15:56,935 INFO [train.py:873] (2/4) Epoch 9, batch 3400, loss[loss=0.1238, simple_loss=0.1502, pruned_loss=0.04866, over 6937.00 frames. ], tot_loss[loss=0.1419, simple_loss=0.1668, pruned_loss=0.0585, over 1940162.43 frames. ], batch size: 100, lr: 8.84e-03, grad_scale: 8.0 +2022-12-07 20:16:05,645 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63906.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:16:14,100 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=63916.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:16:20,075 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1790, 3.4109, 3.1109, 2.9946, 2.5015, 3.5054, 3.0865, 1.5491], + device='cuda:2'), covar=tensor([0.2516, 0.0629, 0.1339, 0.1346, 0.1065, 0.0556, 0.1386, 0.3312], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0070, 0.0054, 0.0058, 0.0087, 0.0067, 0.0093, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:16:21,654 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=63925.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:16:21,711 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63925.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:16:36,687 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=63942.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:16:38,551 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.322e+02 2.128e+02 2.612e+02 3.647e+02 5.829e+02, threshold=5.224e+02, percent-clipped=2.0 +2022-12-07 20:17:07,539 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=63977.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:17:25,059 INFO [train.py:873] (2/4) Epoch 9, batch 3500, loss[loss=0.1863, simple_loss=0.1612, pruned_loss=0.1057, over 1293.00 frames. ], tot_loss[loss=0.1415, simple_loss=0.1667, pruned_loss=0.05813, over 1988388.93 frames. ], batch size: 100, lr: 8.83e-03, grad_scale: 8.0 +2022-12-07 20:17:48,243 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64023.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:18:06,277 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.249e+02 2.173e+02 2.574e+02 3.096e+02 7.573e+02, threshold=5.148e+02, percent-clipped=3.0 +2022-12-07 20:18:33,644 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64076.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:18:35,473 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64078.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:18:40,801 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64084.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:18:51,456 INFO [train.py:873] (2/4) Epoch 9, batch 3600, loss[loss=0.1215, simple_loss=0.1534, pruned_loss=0.04474, over 14266.00 frames. ], tot_loss[loss=0.1418, simple_loss=0.1671, pruned_loss=0.05828, over 2019168.37 frames. ], batch size: 25, lr: 8.83e-03, grad_scale: 8.0 +2022-12-07 20:18:57,806 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1156, 2.0613, 1.7386, 1.8156, 2.0377, 2.0681, 2.0887, 2.0489], + device='cuda:2'), covar=tensor([0.1312, 0.0781, 0.2806, 0.3201, 0.1268, 0.1168, 0.1467, 0.1269], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0242, 0.0409, 0.0518, 0.0296, 0.0385, 0.0379, 0.0342], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:19:15,031 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64124.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:19:16,699 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64126.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:19:22,123 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3659, 3.0271, 2.8910, 1.8483, 2.8054, 3.0598, 3.3650, 2.7682], + device='cuda:2'), covar=tensor([0.0722, 0.1856, 0.1220, 0.2449, 0.1154, 0.0696, 0.0822, 0.1564], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0187, 0.0131, 0.0125, 0.0126, 0.0133, 0.0108, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:19:32,556 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.135e+01 2.573e+02 3.171e+02 4.040e+02 8.310e+02, threshold=6.341e+02, percent-clipped=10.0 +2022-12-07 20:19:41,450 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0681, 1.3172, 3.7890, 1.7249, 3.8648, 4.1107, 3.2094, 4.4173], + device='cuda:2'), covar=tensor([0.0232, 0.3190, 0.0449, 0.2357, 0.0454, 0.0374, 0.0669, 0.0161], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0155, 0.0156, 0.0169, 0.0167, 0.0164, 0.0134, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 20:19:47,514 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8254, 1.1921, 2.1080, 1.1923, 2.0095, 2.0653, 1.7586, 2.1059], + device='cuda:2'), covar=tensor([0.0281, 0.1753, 0.0268, 0.1857, 0.0388, 0.0405, 0.0862, 0.0310], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0154, 0.0156, 0.0169, 0.0167, 0.0164, 0.0133, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 20:19:51,884 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64166.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:19:58,076 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7932, 0.7517, 0.4788, 0.7410, 0.7027, 0.5701, 0.4323, 0.5689], + device='cuda:2'), covar=tensor([0.0108, 0.0116, 0.0115, 0.0105, 0.0159, 0.0316, 0.0192, 0.0290], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0013, 0.0012, 0.0020, 0.0016, 0.0020], + device='cuda:2'), out_proj_covar=tensor([9.4481e-05, 1.0153e-04, 8.9427e-05, 9.7568e-05, 9.3192e-05, 1.4215e-04, + 1.1869e-04, 1.3440e-04], device='cuda:2') +2022-12-07 20:20:11,137 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.22 vs. limit=2.0 +2022-12-07 20:20:18,988 INFO [train.py:873] (2/4) Epoch 9, batch 3700, loss[loss=0.1421, simple_loss=0.1633, pruned_loss=0.06044, over 11184.00 frames. ], tot_loss[loss=0.1424, simple_loss=0.1671, pruned_loss=0.05886, over 1925678.34 frames. ], batch size: 100, lr: 8.82e-03, grad_scale: 8.0 +2022-12-07 20:20:22,583 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64201.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:20:44,488 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64225.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:20:50,549 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64232.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:20:58,956 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64242.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:21:00,575 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.090e+02 2.323e+02 2.858e+02 3.638e+02 5.583e+02, threshold=5.716e+02, percent-clipped=0.0 +2022-12-07 20:21:25,189 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64272.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:21:25,988 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64273.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:21:41,388 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64290.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:21:44,049 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64293.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:21:47,279 INFO [train.py:873] (2/4) Epoch 9, batch 3800, loss[loss=0.1295, simple_loss=0.1517, pruned_loss=0.05362, over 5990.00 frames. ], tot_loss[loss=0.142, simple_loss=0.1666, pruned_loss=0.05863, over 1900082.82 frames. ], batch size: 100, lr: 8.81e-03, grad_scale: 8.0 +2022-12-07 20:22:28,598 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.324e+02 2.269e+02 2.644e+02 3.472e+02 7.922e+02, threshold=5.288e+02, percent-clipped=2.0 +2022-12-07 20:22:29,737 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64345.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:22:30,853 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2250, 1.4057, 3.4529, 1.4226, 3.1447, 3.4142, 2.2973, 3.5948], + device='cuda:2'), covar=tensor([0.0238, 0.2927, 0.0284, 0.2366, 0.0897, 0.0311, 0.0863, 0.0196], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0156, 0.0156, 0.0169, 0.0167, 0.0166, 0.0134, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 20:22:59,833 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64379.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:23:16,039 INFO [train.py:873] (2/4) Epoch 9, batch 3900, loss[loss=0.1949, simple_loss=0.1695, pruned_loss=0.1101, over 1205.00 frames. ], tot_loss[loss=0.141, simple_loss=0.1662, pruned_loss=0.05788, over 1954185.25 frames. ], batch size: 100, lr: 8.81e-03, grad_scale: 8.0 +2022-12-07 20:23:24,354 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64406.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:23:57,837 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.354e+02 2.154e+02 2.432e+02 3.241e+02 6.644e+02, threshold=4.864e+02, percent-clipped=3.0 +2022-12-07 20:24:17,335 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64465.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:24:18,148 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64466.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:24:40,870 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9705, 2.0875, 1.9414, 2.1326, 1.7613, 1.9062, 2.0736, 2.0634], + device='cuda:2'), covar=tensor([0.1171, 0.1191, 0.1264, 0.0952, 0.1453, 0.0925, 0.1200, 0.0981], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0120, 0.0132, 0.0136, 0.0132, 0.0108, 0.0151, 0.0132], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 20:24:45,108 INFO [train.py:873] (2/4) Epoch 9, batch 4000, loss[loss=0.1112, simple_loss=0.1468, pruned_loss=0.03777, over 11146.00 frames. ], tot_loss[loss=0.1425, simple_loss=0.1671, pruned_loss=0.0589, over 1979347.32 frames. ], batch size: 14, lr: 8.80e-03, grad_scale: 8.0 +2022-12-07 20:24:48,746 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64501.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:25:00,309 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64514.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:25:11,093 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64526.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:25:23,130 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.20 vs. limit=5.0 +2022-12-07 20:25:26,781 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.50 vs. limit=5.0 +2022-12-07 20:25:26,921 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.253e+02 2.384e+02 2.847e+02 3.821e+02 7.495e+02, threshold=5.695e+02, percent-clipped=8.0 +2022-12-07 20:25:31,862 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64549.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:25:51,530 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64572.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:26:06,380 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64588.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:26:07,636 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64589.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:26:14,196 INFO [train.py:873] (2/4) Epoch 9, batch 4100, loss[loss=0.2079, simple_loss=0.1809, pruned_loss=0.1175, over 1246.00 frames. ], tot_loss[loss=0.1419, simple_loss=0.1669, pruned_loss=0.05845, over 2016537.82 frames. ], batch size: 100, lr: 8.79e-03, grad_scale: 8.0 +2022-12-07 20:26:34,416 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64620.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:26:40,883 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.81 vs. limit=2.0 +2022-12-07 20:26:55,696 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.345e+01 2.214e+02 2.881e+02 3.617e+02 6.891e+02, threshold=5.761e+02, percent-clipped=1.0 +2022-12-07 20:26:59,483 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64648.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:27:01,282 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64650.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:27:26,361 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64679.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:27:41,954 INFO [train.py:873] (2/4) Epoch 9, batch 4200, loss[loss=0.1095, simple_loss=0.1508, pruned_loss=0.03413, over 14266.00 frames. ], tot_loss[loss=0.1429, simple_loss=0.1677, pruned_loss=0.05907, over 1959454.97 frames. ], batch size: 60, lr: 8.79e-03, grad_scale: 8.0 +2022-12-07 20:27:45,394 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64701.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:27:46,943 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64703.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:27:52,738 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64709.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:28:07,778 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64727.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:28:09,681 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3070, 2.5435, 4.3114, 4.3515, 4.2940, 2.4512, 4.3586, 3.4003], + device='cuda:2'), covar=tensor([0.0264, 0.0644, 0.0653, 0.0272, 0.0231, 0.1127, 0.0208, 0.0609], + device='cuda:2'), in_proj_covar=tensor([0.0263, 0.0238, 0.0352, 0.0300, 0.0242, 0.0286, 0.0270, 0.0269], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 20:28:12,268 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5437, 2.2677, 2.9291, 1.9364, 1.9161, 2.5252, 1.5023, 2.4093], + device='cuda:2'), covar=tensor([0.0739, 0.1408, 0.0566, 0.1852, 0.2532, 0.0780, 0.3728, 0.0977], + device='cuda:2'), in_proj_covar=tensor([0.0075, 0.0090, 0.0085, 0.0091, 0.0109, 0.0078, 0.0125, 0.0082], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 20:28:23,777 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.263e+02 2.280e+02 2.745e+02 3.613e+02 6.444e+02, threshold=5.491e+02, percent-clipped=3.0 +2022-12-07 20:28:23,960 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64745.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:28:30,398 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64752.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:28:40,124 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64764.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:28:42,421 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-12-07 20:29:09,365 INFO [train.py:873] (2/4) Epoch 9, batch 4300, loss[loss=0.1733, simple_loss=0.1564, pruned_loss=0.09511, over 1257.00 frames. ], tot_loss[loss=0.1432, simple_loss=0.1681, pruned_loss=0.05918, over 1971773.57 frames. ], batch size: 100, lr: 8.78e-03, grad_scale: 8.0 +2022-12-07 20:29:17,546 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64806.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:29:23,573 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=64813.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:29:30,669 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64821.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:29:51,885 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.265e+02 2.326e+02 2.910e+02 3.508e+02 7.763e+02, threshold=5.820e+02, percent-clipped=3.0 +2022-12-07 20:30:29,583 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=64888.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:30:37,025 INFO [train.py:873] (2/4) Epoch 9, batch 4400, loss[loss=0.137, simple_loss=0.1692, pruned_loss=0.05241, over 14255.00 frames. ], tot_loss[loss=0.1423, simple_loss=0.1674, pruned_loss=0.0586, over 1960096.04 frames. ], batch size: 25, lr: 8.77e-03, grad_scale: 8.0 +2022-12-07 20:30:57,992 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7241, 2.3360, 3.4901, 2.6235, 3.6071, 3.3855, 3.2843, 2.6317], + device='cuda:2'), covar=tensor([0.0817, 0.2932, 0.1136, 0.1951, 0.0798, 0.0937, 0.1196, 0.2143], + device='cuda:2'), in_proj_covar=tensor([0.0328, 0.0313, 0.0392, 0.0304, 0.0370, 0.0308, 0.0358, 0.0317], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:31:06,574 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8814, 3.2947, 3.2356, 2.9753, 2.2101, 3.1837, 3.1009, 1.4794], + device='cuda:2'), covar=tensor([0.2852, 0.0697, 0.0845, 0.0864, 0.1401, 0.0542, 0.1175, 0.3239], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0070, 0.0056, 0.0059, 0.0088, 0.0067, 0.0092, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:31:10,768 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=64936.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:31:18,743 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.162e+02 2.178e+02 2.781e+02 3.445e+02 8.866e+02, threshold=5.563e+02, percent-clipped=5.0 +2022-12-07 20:31:18,903 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=64945.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:31:45,862 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=64976.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:32:04,728 INFO [train.py:873] (2/4) Epoch 9, batch 4500, loss[loss=0.1346, simple_loss=0.1734, pruned_loss=0.04794, over 14388.00 frames. ], tot_loss[loss=0.1423, simple_loss=0.1677, pruned_loss=0.05849, over 2005331.29 frames. ], batch size: 53, lr: 8.77e-03, grad_scale: 8.0 +2022-12-07 20:32:11,690 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65001.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:32:14,095 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65004.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:32:43,536 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65037.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:32:50,125 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.126e+02 2.276e+02 2.761e+02 3.494e+02 6.954e+02, threshold=5.522e+02, percent-clipped=1.0 +2022-12-07 20:32:51,442 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=10.87 vs. limit=5.0 +2022-12-07 20:32:53,584 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65049.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:32:58,618 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=65055.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:33:02,166 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65059.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:33:35,182 INFO [train.py:873] (2/4) Epoch 9, batch 4600, loss[loss=0.1348, simple_loss=0.1498, pruned_loss=0.0599, over 3862.00 frames. ], tot_loss[loss=0.1423, simple_loss=0.1677, pruned_loss=0.05848, over 1996793.88 frames. ], batch size: 100, lr: 8.76e-03, grad_scale: 8.0 +2022-12-07 20:33:39,059 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65101.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:33:45,155 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65108.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:33:52,349 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65116.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:33:56,530 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65121.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:34:17,217 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.207e+01 2.339e+02 3.104e+02 3.874e+02 1.034e+03, threshold=6.208e+02, percent-clipped=6.0 +2022-12-07 20:34:38,797 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65169.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:35:03,850 INFO [train.py:873] (2/4) Epoch 9, batch 4700, loss[loss=0.1712, simple_loss=0.1509, pruned_loss=0.09577, over 1277.00 frames. ], tot_loss[loss=0.1419, simple_loss=0.1673, pruned_loss=0.05828, over 1985573.36 frames. ], batch size: 100, lr: 8.75e-03, grad_scale: 8.0 +2022-12-07 20:35:33,194 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5483, 3.8869, 3.1485, 4.8146, 4.2465, 4.4920, 3.9820, 3.4890], + device='cuda:2'), covar=tensor([0.0750, 0.1468, 0.4175, 0.0576, 0.0963, 0.1835, 0.1130, 0.3376], + device='cuda:2'), in_proj_covar=tensor([0.0258, 0.0296, 0.0276, 0.0230, 0.0295, 0.0287, 0.0255, 0.0260], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 20:35:46,990 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.148e+02 2.284e+02 2.894e+02 3.637e+02 5.922e+02, threshold=5.788e+02, percent-clipped=0.0 +2022-12-07 20:35:47,178 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65245.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:36:13,636 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4560, 3.0230, 2.9449, 1.8137, 3.0815, 3.3606, 3.6195, 2.6271], + device='cuda:2'), covar=tensor([0.0764, 0.2088, 0.1182, 0.2523, 0.1078, 0.0632, 0.0723, 0.1694], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0181, 0.0132, 0.0123, 0.0126, 0.0132, 0.0107, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:36:20,239 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8125, 2.4217, 2.5741, 1.6635, 2.2812, 2.5656, 2.8748, 2.3115], + device='cuda:2'), covar=tensor([0.0707, 0.1173, 0.1101, 0.1945, 0.1059, 0.0652, 0.0543, 0.1704], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0181, 0.0131, 0.0123, 0.0126, 0.0132, 0.0107, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:36:21,932 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=65285.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:36:28,432 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65293.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:36:31,782 INFO [train.py:873] (2/4) Epoch 9, batch 4800, loss[loss=0.162, simple_loss=0.1639, pruned_loss=0.08003, over 3901.00 frames. ], tot_loss[loss=0.1417, simple_loss=0.1672, pruned_loss=0.0581, over 2003090.79 frames. ], batch size: 100, lr: 8.75e-03, grad_scale: 8.0 +2022-12-07 20:36:38,322 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65304.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:36:48,344 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9282, 4.6245, 4.3956, 4.5727, 4.5962, 4.8508, 4.9543, 4.9147], + device='cuda:2'), covar=tensor([0.0933, 0.0535, 0.2068, 0.2538, 0.0681, 0.0656, 0.0828, 0.0862], + device='cuda:2'), in_proj_covar=tensor([0.0355, 0.0242, 0.0412, 0.0525, 0.0298, 0.0394, 0.0381, 0.0343], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:37:02,583 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65332.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:37:04,245 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8904, 1.6515, 1.9168, 1.6834, 2.0849, 1.7166, 1.6238, 1.7756], + device='cuda:2'), covar=tensor([0.0435, 0.1223, 0.0192, 0.0383, 0.0239, 0.0637, 0.0215, 0.0298], + device='cuda:2'), in_proj_covar=tensor([0.0335, 0.0317, 0.0393, 0.0307, 0.0373, 0.0312, 0.0359, 0.0321], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:37:13,781 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.537e+02 2.162e+02 2.794e+02 3.355e+02 5.841e+02, threshold=5.587e+02, percent-clipped=1.0 +2022-12-07 20:37:14,882 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65346.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:37:16,345 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9983, 2.0830, 2.3147, 2.2599, 1.9129, 2.2731, 2.1337, 1.2208], + device='cuda:2'), covar=tensor([0.1547, 0.1030, 0.0585, 0.0635, 0.1255, 0.0575, 0.1282, 0.2697], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0070, 0.0055, 0.0058, 0.0087, 0.0067, 0.0092, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:37:20,534 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65352.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:37:26,008 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65359.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:37:58,499 INFO [train.py:873] (2/4) Epoch 9, batch 4900, loss[loss=0.1176, simple_loss=0.1532, pruned_loss=0.04105, over 13572.00 frames. ], tot_loss[loss=0.1416, simple_loss=0.1671, pruned_loss=0.05804, over 1992338.38 frames. ], batch size: 17, lr: 8.74e-03, grad_scale: 8.0 +2022-12-07 20:38:02,249 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65401.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:38:07,257 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65407.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:38:08,248 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65408.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:38:10,653 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65411.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:38:40,336 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.955e+01 2.399e+02 2.803e+02 3.609e+02 7.825e+02, threshold=5.606e+02, percent-clipped=2.0 +2022-12-07 20:38:43,754 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65449.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:38:49,500 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65456.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:38:55,877 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3634, 2.0866, 2.3085, 1.3722, 2.0956, 2.3848, 2.4932, 2.1297], + device='cuda:2'), covar=tensor([0.0781, 0.0841, 0.1123, 0.1976, 0.1232, 0.0716, 0.0686, 0.1426], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0184, 0.0133, 0.0125, 0.0127, 0.0134, 0.0109, 0.0135], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:39:15,594 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-12-07 20:39:25,268 INFO [train.py:873] (2/4) Epoch 9, batch 5000, loss[loss=0.1612, simple_loss=0.1778, pruned_loss=0.07233, over 9502.00 frames. ], tot_loss[loss=0.142, simple_loss=0.1671, pruned_loss=0.05842, over 2005737.25 frames. ], batch size: 100, lr: 8.73e-03, grad_scale: 8.0 +2022-12-07 20:39:27,674 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-12-07 20:39:51,724 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.68 vs. limit=2.0 +2022-12-07 20:39:56,460 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3469, 5.0555, 4.9757, 5.3983, 4.9571, 4.6315, 5.4281, 5.2764], + device='cuda:2'), covar=tensor([0.0608, 0.0667, 0.0671, 0.0509, 0.0604, 0.0399, 0.0479, 0.0611], + device='cuda:2'), in_proj_covar=tensor([0.0125, 0.0119, 0.0131, 0.0135, 0.0131, 0.0106, 0.0148, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 20:39:56,550 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=65532.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 20:40:07,679 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.681e+01 2.367e+02 2.989e+02 3.809e+02 1.039e+03, threshold=5.978e+02, percent-clipped=5.0 +2022-12-07 20:40:40,472 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7990, 3.8523, 4.0910, 3.5721, 3.9155, 4.0320, 1.5671, 3.7018], + device='cuda:2'), covar=tensor([0.0247, 0.0289, 0.0340, 0.0427, 0.0293, 0.0278, 0.3000, 0.0247], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0156, 0.0132, 0.0127, 0.0188, 0.0128, 0.0153, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 20:40:50,490 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65593.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 20:40:53,901 INFO [train.py:873] (2/4) Epoch 9, batch 5100, loss[loss=0.1427, simple_loss=0.1666, pruned_loss=0.05941, over 14241.00 frames. ], tot_loss[loss=0.1413, simple_loss=0.1667, pruned_loss=0.05791, over 2038310.40 frames. ], batch size: 46, lr: 8.73e-03, grad_scale: 8.0 +2022-12-07 20:41:25,527 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65632.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:41:33,044 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65641.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:41:36,042 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.19 vs. limit=2.0 +2022-12-07 20:41:36,453 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.407e+02 2.306e+02 2.958e+02 3.852e+02 6.809e+02, threshold=5.917e+02, percent-clipped=2.0 +2022-12-07 20:41:45,336 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8143, 2.5614, 2.6645, 1.6775, 2.3587, 2.6031, 2.9208, 2.3624], + device='cuda:2'), covar=tensor([0.0736, 0.1465, 0.1011, 0.1948, 0.1098, 0.0769, 0.0616, 0.1511], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0186, 0.0134, 0.0127, 0.0128, 0.0136, 0.0110, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:41:51,268 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8582, 1.3921, 3.1128, 1.4154, 3.2494, 3.0973, 2.1751, 3.2790], + device='cuda:2'), covar=tensor([0.0298, 0.2862, 0.0301, 0.2264, 0.0267, 0.0392, 0.0831, 0.0185], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0156, 0.0154, 0.0169, 0.0166, 0.0168, 0.0133, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 20:42:01,149 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8518, 2.4942, 3.6567, 2.7849, 3.6228, 3.6426, 3.4280, 2.9331], + device='cuda:2'), covar=tensor([0.0803, 0.2943, 0.0960, 0.1828, 0.0741, 0.0752, 0.1239, 0.1718], + device='cuda:2'), in_proj_covar=tensor([0.0333, 0.0315, 0.0393, 0.0305, 0.0373, 0.0309, 0.0355, 0.0317], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:42:07,915 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65680.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:42:22,491 INFO [train.py:873] (2/4) Epoch 9, batch 5200, loss[loss=0.1165, simple_loss=0.1528, pruned_loss=0.04013, over 14291.00 frames. ], tot_loss[loss=0.1412, simple_loss=0.1669, pruned_loss=0.05781, over 2055542.72 frames. ], batch size: 39, lr: 8.72e-03, grad_scale: 8.0 +2022-12-07 20:42:33,714 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6798, 3.3871, 3.1897, 3.3389, 3.5306, 3.5292, 3.6304, 3.6316], + device='cuda:2'), covar=tensor([0.0823, 0.0678, 0.2167, 0.2625, 0.0795, 0.0901, 0.1112, 0.0906], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0244, 0.0410, 0.0527, 0.0300, 0.0398, 0.0386, 0.0343], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:42:35,052 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.59 vs. limit=5.0 +2022-12-07 20:42:35,341 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65711.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:42:57,153 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3173, 1.5368, 3.3420, 1.4268, 3.2608, 3.3972, 2.4875, 3.6479], + device='cuda:2'), covar=tensor([0.0254, 0.2985, 0.0393, 0.2528, 0.0694, 0.0413, 0.0862, 0.0191], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0157, 0.0156, 0.0169, 0.0167, 0.0170, 0.0136, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 20:43:04,805 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.244e+02 2.172e+02 3.153e+02 3.803e+02 8.139e+02, threshold=6.306e+02, percent-clipped=3.0 +2022-12-07 20:43:17,694 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65759.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:43:50,697 INFO [train.py:873] (2/4) Epoch 9, batch 5300, loss[loss=0.1509, simple_loss=0.1811, pruned_loss=0.06039, over 14087.00 frames. ], tot_loss[loss=0.1413, simple_loss=0.167, pruned_loss=0.05778, over 2046097.67 frames. ], batch size: 29, lr: 8.71e-03, grad_scale: 8.0 +2022-12-07 20:44:26,675 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=65838.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 20:44:32,252 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.588e+01 2.128e+02 2.955e+02 3.598e+02 1.062e+03, threshold=5.909e+02, percent-clipped=4.0 +2022-12-07 20:44:37,238 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.52 vs. limit=5.0 +2022-12-07 20:45:09,766 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=65888.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 20:45:17,380 INFO [train.py:873] (2/4) Epoch 9, batch 5400, loss[loss=0.1389, simple_loss=0.1625, pruned_loss=0.0576, over 11157.00 frames. ], tot_loss[loss=0.1419, simple_loss=0.1674, pruned_loss=0.05816, over 2053026.64 frames. ], batch size: 100, lr: 8.71e-03, grad_scale: 8.0 +2022-12-07 20:45:19,340 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=65899.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:45:56,515 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=65941.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:46:00,156 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.009e+02 2.028e+02 2.560e+02 3.349e+02 6.726e+02, threshold=5.119e+02, percent-clipped=1.0 +2022-12-07 20:46:33,130 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-12-07 20:46:38,493 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=65989.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:46:45,301 INFO [train.py:873] (2/4) Epoch 9, batch 5500, loss[loss=0.1452, simple_loss=0.1528, pruned_loss=0.06882, over 4932.00 frames. ], tot_loss[loss=0.14, simple_loss=0.1659, pruned_loss=0.05699, over 2002460.99 frames. ], batch size: 100, lr: 8.70e-03, grad_scale: 8.0 +2022-12-07 20:47:26,601 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6493, 1.9863, 2.6396, 2.7607, 2.5367, 1.9886, 2.6793, 2.2475], + device='cuda:2'), covar=tensor([0.0264, 0.0642, 0.0323, 0.0245, 0.0311, 0.0799, 0.0269, 0.0509], + device='cuda:2'), in_proj_covar=tensor([0.0266, 0.0239, 0.0352, 0.0301, 0.0242, 0.0285, 0.0272, 0.0267], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 20:47:27,194 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.123e+02 2.224e+02 2.797e+02 3.647e+02 1.046e+03, threshold=5.594e+02, percent-clipped=7.0 +2022-12-07 20:47:41,997 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8398, 0.8885, 0.7246, 0.7642, 0.7062, 0.5418, 0.5114, 0.5809], + device='cuda:2'), covar=tensor([0.0096, 0.0089, 0.0109, 0.0085, 0.0121, 0.0347, 0.0173, 0.0347], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0013, 0.0013, 0.0021, 0.0016, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.6032e-05, 1.0550e-04, 9.3177e-05, 9.9410e-05, 9.6554e-05, 1.4509e-04, + 1.2123e-04, 1.4115e-04], device='cuda:2') +2022-12-07 20:48:11,089 INFO [train.py:873] (2/4) Epoch 9, batch 5600, loss[loss=0.1307, simple_loss=0.1597, pruned_loss=0.05079, over 14365.00 frames. ], tot_loss[loss=0.1409, simple_loss=0.1666, pruned_loss=0.05762, over 2078498.72 frames. ], batch size: 55, lr: 8.69e-03, grad_scale: 8.0 +2022-12-07 20:48:52,446 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66144.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:48:53,079 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.414e+02 2.262e+02 2.866e+02 3.556e+02 6.245e+02, threshold=5.731e+02, percent-clipped=4.0 +2022-12-07 20:49:07,451 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66160.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:49:32,064 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66188.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 20:49:37,651 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66194.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 20:49:40,171 INFO [train.py:873] (2/4) Epoch 9, batch 5700, loss[loss=0.2122, simple_loss=0.1733, pruned_loss=0.1256, over 1253.00 frames. ], tot_loss[loss=0.1409, simple_loss=0.1663, pruned_loss=0.0577, over 2007419.09 frames. ], batch size: 100, lr: 8.69e-03, grad_scale: 8.0 +2022-12-07 20:49:47,515 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66205.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 20:50:01,125 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66221.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 20:50:07,104 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.34 vs. limit=5.0 +2022-12-07 20:50:15,200 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66236.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:50:23,973 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.364e+02 2.292e+02 2.851e+02 3.352e+02 5.991e+02, threshold=5.702e+02, percent-clipped=1.0 +2022-12-07 20:50:34,220 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66258.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:51:08,643 INFO [train.py:873] (2/4) Epoch 9, batch 5800, loss[loss=0.1672, simple_loss=0.1818, pruned_loss=0.07636, over 11144.00 frames. ], tot_loss[loss=0.1413, simple_loss=0.1661, pruned_loss=0.05819, over 1963880.19 frames. ], batch size: 100, lr: 8.68e-03, grad_scale: 4.0 +2022-12-07 20:51:26,374 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.37 vs. limit=5.0 +2022-12-07 20:51:28,582 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66319.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:51:51,562 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.104e+02 2.481e+02 2.990e+02 4.125e+02 8.241e+02, threshold=5.981e+02, percent-clipped=12.0 +2022-12-07 20:52:36,914 INFO [train.py:873] (2/4) Epoch 9, batch 5900, loss[loss=0.1528, simple_loss=0.1745, pruned_loss=0.06551, over 14469.00 frames. ], tot_loss[loss=0.1409, simple_loss=0.1662, pruned_loss=0.05779, over 1997174.73 frames. ], batch size: 49, lr: 8.67e-03, grad_scale: 4.0 +2022-12-07 20:53:19,921 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.110e+02 2.452e+02 3.007e+02 3.915e+02 6.171e+02, threshold=6.013e+02, percent-clipped=2.0 +2022-12-07 20:53:27,744 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4658, 1.9366, 3.5382, 2.5236, 3.4463, 1.7893, 2.7630, 3.3197], + device='cuda:2'), covar=tensor([0.0688, 0.4322, 0.0525, 0.5974, 0.0578, 0.3727, 0.1273, 0.0608], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0223, 0.0193, 0.0303, 0.0212, 0.0232, 0.0222, 0.0199], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:54:02,084 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66494.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:54:04,616 INFO [train.py:873] (2/4) Epoch 9, batch 6000, loss[loss=0.1602, simple_loss=0.1744, pruned_loss=0.07303, over 6969.00 frames. ], tot_loss[loss=0.142, simple_loss=0.1668, pruned_loss=0.05866, over 1973954.93 frames. ], batch size: 100, lr: 8.67e-03, grad_scale: 8.0 +2022-12-07 20:54:04,616 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 20:54:15,608 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8957, 1.9577, 4.6497, 2.5036, 4.3784, 4.9466, 4.4156, 5.2522], + device='cuda:2'), covar=tensor([0.0109, 0.2485, 0.0190, 0.1533, 0.0213, 0.0155, 0.0188, 0.0080], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0158, 0.0157, 0.0169, 0.0168, 0.0170, 0.0136, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 20:54:16,408 INFO [train.py:905] (2/4) Epoch 9, validation: loss=0.1244, simple_loss=0.166, pruned_loss=0.04137, over 857387.00 frames. +2022-12-07 20:54:16,408 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17779MB +2022-12-07 20:54:19,132 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66500.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:54:32,878 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66516.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 20:54:37,343 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66521.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 20:54:43,698 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-07 20:54:56,088 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66542.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:54:59,452 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.842e+01 2.449e+02 2.904e+02 3.531e+02 7.481e+02, threshold=5.809e+02, percent-clipped=4.0 +2022-12-07 20:55:05,631 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7291, 3.3349, 3.4960, 3.7266, 3.6486, 3.4587, 3.7629, 3.1601], + device='cuda:2'), covar=tensor([0.1074, 0.1920, 0.0870, 0.0985, 0.1225, 0.1703, 0.1150, 0.1175], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0253, 0.0177, 0.0173, 0.0173, 0.0139, 0.0261, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 20:55:32,219 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66582.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 20:55:42,517 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1467, 3.2395, 3.1399, 3.0192, 2.5418, 3.3976, 3.0551, 1.3759], + device='cuda:2'), covar=tensor([0.2306, 0.0875, 0.1773, 0.1091, 0.1093, 0.0414, 0.1553, 0.3114], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0070, 0.0057, 0.0057, 0.0086, 0.0065, 0.0091, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 20:55:45,073 INFO [train.py:873] (2/4) Epoch 9, batch 6100, loss[loss=0.1144, simple_loss=0.1502, pruned_loss=0.0393, over 13951.00 frames. ], tot_loss[loss=0.1413, simple_loss=0.1666, pruned_loss=0.05801, over 1985883.35 frames. ], batch size: 19, lr: 8.66e-03, grad_scale: 8.0 +2022-12-07 20:55:58,467 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5591, 4.2694, 3.9590, 4.1664, 4.3364, 4.3932, 4.5510, 4.5313], + device='cuda:2'), covar=tensor([0.0711, 0.0588, 0.2197, 0.2657, 0.0685, 0.0727, 0.0999, 0.0711], + device='cuda:2'), in_proj_covar=tensor([0.0346, 0.0242, 0.0408, 0.0525, 0.0302, 0.0391, 0.0381, 0.0343], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 20:56:00,270 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66614.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:56:28,746 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.254e+02 2.301e+02 2.963e+02 3.616e+02 7.389e+02, threshold=5.926e+02, percent-clipped=2.0 +2022-12-07 20:56:54,765 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2799, 5.0632, 4.8419, 5.3574, 4.8072, 4.6525, 5.4258, 5.2513], + device='cuda:2'), covar=tensor([0.0668, 0.0685, 0.0707, 0.0518, 0.0544, 0.0382, 0.0540, 0.0602], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0120, 0.0131, 0.0139, 0.0132, 0.0108, 0.0152, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 20:57:13,996 INFO [train.py:873] (2/4) Epoch 9, batch 6200, loss[loss=0.1408, simple_loss=0.1679, pruned_loss=0.05685, over 14268.00 frames. ], tot_loss[loss=0.142, simple_loss=0.1673, pruned_loss=0.05841, over 1995083.67 frames. ], batch size: 37, lr: 8.66e-03, grad_scale: 8.0 +2022-12-07 20:57:20,011 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2284, 1.9426, 2.2050, 2.4190, 1.9848, 1.9545, 2.2161, 2.1805], + device='cuda:2'), covar=tensor([0.0140, 0.0301, 0.0152, 0.0132, 0.0206, 0.0368, 0.0186, 0.0187], + device='cuda:2'), in_proj_covar=tensor([0.0267, 0.0238, 0.0355, 0.0299, 0.0242, 0.0287, 0.0270, 0.0268], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 20:57:40,203 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-07 20:57:44,104 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66731.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:57:50,537 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9737, 2.8610, 2.1987, 3.0270, 2.7941, 2.8947, 2.6368, 2.3749], + device='cuda:2'), covar=tensor([0.0845, 0.1390, 0.3138, 0.0615, 0.1081, 0.0979, 0.1335, 0.2677], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0299, 0.0274, 0.0232, 0.0296, 0.0284, 0.0256, 0.0261], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 20:57:57,359 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.480e+01 2.278e+02 2.813e+02 3.570e+02 7.547e+02, threshold=5.626e+02, percent-clipped=4.0 +2022-12-07 20:58:37,876 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=66792.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:58:42,094 INFO [train.py:873] (2/4) Epoch 9, batch 6300, loss[loss=0.1576, simple_loss=0.1787, pruned_loss=0.06822, over 14235.00 frames. ], tot_loss[loss=0.1424, simple_loss=0.1671, pruned_loss=0.05886, over 1918371.61 frames. ], batch size: 94, lr: 8.65e-03, grad_scale: 8.0 +2022-12-07 20:58:45,093 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66800.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 20:58:59,256 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66816.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 20:59:08,391 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5549, 4.3344, 4.2533, 4.6414, 4.1549, 3.8309, 4.6417, 4.5378], + device='cuda:2'), covar=tensor([0.0657, 0.0619, 0.0706, 0.0519, 0.0649, 0.0543, 0.0596, 0.0586], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0121, 0.0133, 0.0140, 0.0134, 0.0108, 0.0152, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 20:59:25,550 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.076e+02 2.178e+02 2.874e+02 3.614e+02 8.081e+02, threshold=5.749e+02, percent-clipped=6.0 +2022-12-07 20:59:27,380 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66848.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 20:59:42,265 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66864.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 20:59:53,390 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=66877.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 21:00:10,822 INFO [train.py:873] (2/4) Epoch 9, batch 6400, loss[loss=0.1655, simple_loss=0.1533, pruned_loss=0.08882, over 2655.00 frames. ], tot_loss[loss=0.1412, simple_loss=0.1666, pruned_loss=0.05794, over 1936950.55 frames. ], batch size: 100, lr: 8.64e-03, grad_scale: 8.0 +2022-12-07 21:00:26,480 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=66914.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:00:54,936 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.137e+02 2.352e+02 2.835e+02 3.387e+02 8.466e+02, threshold=5.670e+02, percent-clipped=3.0 +2022-12-07 21:01:04,697 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=66957.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:01:09,005 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=66962.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:01:32,940 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9674, 1.3148, 3.8307, 1.6191, 3.7793, 4.0055, 2.9079, 4.3009], + device='cuda:2'), covar=tensor([0.0204, 0.3228, 0.0456, 0.2327, 0.0422, 0.0351, 0.0712, 0.0151], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0158, 0.0158, 0.0170, 0.0168, 0.0168, 0.0137, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 21:01:40,465 INFO [train.py:873] (2/4) Epoch 9, batch 6500, loss[loss=0.1346, simple_loss=0.1346, pruned_loss=0.06728, over 2670.00 frames. ], tot_loss[loss=0.1411, simple_loss=0.1665, pruned_loss=0.05789, over 1926348.47 frames. ], batch size: 100, lr: 8.64e-03, grad_scale: 8.0 +2022-12-07 21:01:59,264 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67018.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 21:02:02,108 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9000, 2.1123, 2.6895, 2.2825, 2.7708, 2.7136, 2.6205, 2.4770], + device='cuda:2'), covar=tensor([0.0570, 0.2483, 0.0753, 0.1571, 0.0521, 0.0831, 0.0959, 0.1667], + device='cuda:2'), in_proj_covar=tensor([0.0335, 0.0322, 0.0402, 0.0309, 0.0380, 0.0314, 0.0366, 0.0323], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:02:24,260 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.174e+02 2.311e+02 2.795e+02 3.453e+02 6.517e+02, threshold=5.590e+02, percent-clipped=2.0 +2022-12-07 21:02:33,299 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8966, 1.9150, 1.6607, 2.0234, 1.8883, 1.9737, 1.8334, 1.8668], + device='cuda:2'), covar=tensor([0.1078, 0.0729, 0.1605, 0.0527, 0.0664, 0.0484, 0.1363, 0.0761], + device='cuda:2'), in_proj_covar=tensor([0.0261, 0.0302, 0.0279, 0.0237, 0.0302, 0.0293, 0.0258, 0.0264], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 21:02:54,810 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-12-07 21:02:57,868 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67083.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:03:01,180 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=67087.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 21:03:10,250 INFO [train.py:873] (2/4) Epoch 9, batch 6600, loss[loss=0.146, simple_loss=0.1704, pruned_loss=0.06084, over 14186.00 frames. ], tot_loss[loss=0.1403, simple_loss=0.1659, pruned_loss=0.05733, over 2004899.18 frames. ], batch size: 84, lr: 8.63e-03, grad_scale: 8.0 +2022-12-07 21:03:45,642 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-12-07 21:03:53,418 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67144.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:03:54,921 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.422e+02 2.185e+02 2.883e+02 3.643e+02 7.006e+02, threshold=5.766e+02, percent-clipped=2.0 +2022-12-07 21:04:22,355 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=67177.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 21:04:40,788 INFO [train.py:873] (2/4) Epoch 9, batch 6700, loss[loss=0.113, simple_loss=0.1577, pruned_loss=0.03415, over 14323.00 frames. ], tot_loss[loss=0.1415, simple_loss=0.1668, pruned_loss=0.05811, over 2032871.80 frames. ], batch size: 31, lr: 8.62e-03, grad_scale: 8.0 +2022-12-07 21:05:06,519 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=67225.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 21:05:24,547 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.307e+02 2.308e+02 2.851e+02 3.420e+02 6.457e+02, threshold=5.703e+02, percent-clipped=2.0 +2022-12-07 21:05:34,460 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.71 vs. limit=2.0 +2022-12-07 21:06:10,530 INFO [train.py:873] (2/4) Epoch 9, batch 6800, loss[loss=0.2292, simple_loss=0.1853, pruned_loss=0.1365, over 1189.00 frames. ], tot_loss[loss=0.1404, simple_loss=0.1659, pruned_loss=0.05744, over 2011126.43 frames. ], batch size: 100, lr: 8.62e-03, grad_scale: 8.0 +2022-12-07 21:06:23,320 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.96 vs. limit=5.0 +2022-12-07 21:06:25,256 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=67313.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 21:06:25,337 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6257, 3.4374, 2.9806, 2.3338, 3.0946, 3.3636, 3.6692, 2.8993], + device='cuda:2'), covar=tensor([0.0606, 0.1694, 0.1165, 0.1845, 0.0926, 0.0683, 0.0691, 0.1481], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0181, 0.0131, 0.0124, 0.0124, 0.0134, 0.0110, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 21:06:30,759 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6710, 1.3940, 1.3695, 1.2692, 1.6499, 0.8024, 1.4010, 1.7559], + device='cuda:2'), covar=tensor([0.0591, 0.0756, 0.0721, 0.1276, 0.1507, 0.0767, 0.1095, 0.1124], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0023, 0.0026, 0.0023, 0.0025, 0.0034, 0.0024, 0.0025], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 21:06:42,915 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0147, 2.6921, 2.7035, 1.9291, 2.5887, 2.7843, 3.0491, 2.5051], + device='cuda:2'), covar=tensor([0.0700, 0.1275, 0.1139, 0.1794, 0.1027, 0.0732, 0.0641, 0.1537], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0182, 0.0131, 0.0124, 0.0124, 0.0135, 0.0111, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 21:06:54,533 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.150e+02 2.355e+02 2.871e+02 3.825e+02 6.310e+02, threshold=5.742e+02, percent-clipped=3.0 +2022-12-07 21:07:31,828 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=67387.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 21:07:40,407 INFO [train.py:873] (2/4) Epoch 9, batch 6900, loss[loss=0.1304, simple_loss=0.1656, pruned_loss=0.0476, over 14571.00 frames. ], tot_loss[loss=0.1403, simple_loss=0.1663, pruned_loss=0.0572, over 2043953.99 frames. ], batch size: 34, lr: 8.61e-03, grad_scale: 8.0 +2022-12-07 21:08:08,885 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-12-07 21:08:14,506 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=67435.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 21:08:17,927 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=67439.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:08:21,571 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3948, 4.8457, 4.8082, 5.3657, 4.9203, 4.5046, 5.3414, 4.4213], + device='cuda:2'), covar=tensor([0.0328, 0.1241, 0.0353, 0.0410, 0.0785, 0.0472, 0.0535, 0.0522], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0256, 0.0179, 0.0174, 0.0171, 0.0141, 0.0262, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 21:08:24,081 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.180e+02 2.447e+02 3.065e+02 3.875e+02 6.583e+02, threshold=6.130e+02, percent-clipped=2.0 +2022-12-07 21:08:43,822 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-12-07 21:09:01,406 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.81 vs. limit=2.0 +2022-12-07 21:09:08,979 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6361, 1.3879, 1.3859, 1.4849, 1.8178, 0.8486, 1.6013, 1.9863], + device='cuda:2'), covar=tensor([0.1083, 0.1192, 0.1557, 0.1987, 0.1172, 0.0625, 0.1053, 0.1011], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0023, 0.0026, 0.0023, 0.0024, 0.0034, 0.0024, 0.0025], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 21:09:09,668 INFO [train.py:873] (2/4) Epoch 9, batch 7000, loss[loss=0.1287, simple_loss=0.1658, pruned_loss=0.0458, over 14207.00 frames. ], tot_loss[loss=0.1402, simple_loss=0.1662, pruned_loss=0.05709, over 2063071.04 frames. ], batch size: 25, lr: 8.60e-03, grad_scale: 8.0 +2022-12-07 21:09:24,651 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67513.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:09:45,833 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0849, 4.1529, 4.1047, 3.9420, 4.0530, 4.4039, 1.6316, 3.8861], + device='cuda:2'), covar=tensor([0.0350, 0.0392, 0.0725, 0.0565, 0.0515, 0.0348, 0.3994, 0.0411], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0159, 0.0130, 0.0130, 0.0187, 0.0129, 0.0153, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:09:53,897 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.263e+02 2.336e+02 2.835e+02 3.380e+02 6.445e+02, threshold=5.670e+02, percent-clipped=1.0 +2022-12-07 21:10:19,384 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67574.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 21:10:40,042 INFO [train.py:873] (2/4) Epoch 9, batch 7100, loss[loss=0.1573, simple_loss=0.1743, pruned_loss=0.07021, over 14469.00 frames. ], tot_loss[loss=0.1402, simple_loss=0.1664, pruned_loss=0.05701, over 2070343.09 frames. ], batch size: 51, lr: 8.60e-03, grad_scale: 8.0 +2022-12-07 21:10:55,502 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=67613.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:11:27,145 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.458e+02 2.435e+02 3.463e+02 4.295e+02 1.529e+03, threshold=6.926e+02, percent-clipped=8.0 +2022-12-07 21:11:27,907 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.84 vs. limit=5.0 +2022-12-07 21:11:42,155 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=67661.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:12:00,567 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67680.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:12:11,882 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67692.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:12:16,510 INFO [train.py:873] (2/4) Epoch 9, batch 7200, loss[loss=0.1853, simple_loss=0.1641, pruned_loss=0.1033, over 2576.00 frames. ], tot_loss[loss=0.1394, simple_loss=0.1659, pruned_loss=0.05649, over 1982691.98 frames. ], batch size: 100, lr: 8.59e-03, grad_scale: 8.0 +2022-12-07 21:12:57,742 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=67739.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:12:59,668 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67741.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:13:04,118 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.489e+02 2.252e+02 2.773e+02 3.514e+02 7.822e+02, threshold=5.546e+02, percent-clipped=1.0 +2022-12-07 21:13:11,090 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=67753.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 21:13:21,555 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-07 21:13:43,175 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=67787.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:13:52,644 INFO [train.py:873] (2/4) Epoch 9, batch 7300, loss[loss=0.144, simple_loss=0.1677, pruned_loss=0.06018, over 5991.00 frames. ], tot_loss[loss=0.1394, simple_loss=0.1656, pruned_loss=0.05661, over 1974446.19 frames. ], batch size: 100, lr: 8.58e-03, grad_scale: 8.0 +2022-12-07 21:14:39,306 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.094e+02 2.117e+02 2.726e+02 3.356e+02 6.858e+02, threshold=5.453e+02, percent-clipped=1.0 +2022-12-07 21:14:55,086 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7123, 2.2842, 3.5904, 3.8027, 3.6129, 2.3774, 3.5452, 2.9064], + device='cuda:2'), covar=tensor([0.0287, 0.0692, 0.0547, 0.0327, 0.0307, 0.1037, 0.0306, 0.0723], + device='cuda:2'), in_proj_covar=tensor([0.0268, 0.0240, 0.0357, 0.0297, 0.0243, 0.0290, 0.0270, 0.0268], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:14:55,911 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9000, 0.8550, 0.8818, 0.8835, 1.0755, 0.5913, 0.9194, 1.0432], + device='cuda:2'), covar=tensor([0.0581, 0.0712, 0.0608, 0.0629, 0.0389, 0.0724, 0.0850, 0.0708], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0023, 0.0025, 0.0022, 0.0024, 0.0034, 0.0024, 0.0025], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 21:15:01,881 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=67869.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 21:15:28,016 INFO [train.py:873] (2/4) Epoch 9, batch 7400, loss[loss=0.1557, simple_loss=0.159, pruned_loss=0.07623, over 3830.00 frames. ], tot_loss[loss=0.1396, simple_loss=0.1656, pruned_loss=0.0568, over 1918706.58 frames. ], batch size: 100, lr: 8.58e-03, grad_scale: 8.0 +2022-12-07 21:16:01,026 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.11 vs. limit=2.0 +2022-12-07 21:16:15,148 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.327e+02 2.396e+02 2.952e+02 3.564e+02 5.471e+02, threshold=5.904e+02, percent-clipped=1.0 +2022-12-07 21:16:18,530 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.86 vs. limit=2.0 +2022-12-07 21:16:31,132 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-12-07 21:16:47,297 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=67981.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:17:02,496 INFO [train.py:873] (2/4) Epoch 9, batch 7500, loss[loss=0.1461, simple_loss=0.1755, pruned_loss=0.05834, over 14294.00 frames. ], tot_loss[loss=0.139, simple_loss=0.1653, pruned_loss=0.05633, over 1968561.13 frames. ], batch size: 76, lr: 8.57e-03, grad_scale: 8.0 +2022-12-07 21:17:31,985 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8159, 3.8784, 4.0852, 3.7490, 3.8856, 3.9831, 1.5100, 3.7043], + device='cuda:2'), covar=tensor([0.0279, 0.0323, 0.0360, 0.0422, 0.0317, 0.0341, 0.3192, 0.0271], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0158, 0.0130, 0.0128, 0.0184, 0.0127, 0.0152, 0.0173], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:17:38,182 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68036.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:17:42,605 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68042.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:17:44,527 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.588e+01 2.266e+02 2.777e+02 3.907e+02 7.467e+02, threshold=5.554e+02, percent-clipped=5.0 +2022-12-07 21:17:45,644 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68048.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 21:18:40,270 INFO [train.py:873] (2/4) Epoch 10, batch 0, loss[loss=0.1982, simple_loss=0.2102, pruned_loss=0.09313, over 14266.00 frames. ], tot_loss[loss=0.1982, simple_loss=0.2102, pruned_loss=0.09313, over 14266.00 frames. ], batch size: 63, lr: 8.15e-03, grad_scale: 8.0 +2022-12-07 21:18:40,271 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 21:18:48,291 INFO [train.py:905] (2/4) Epoch 10, validation: loss=0.1297, simple_loss=0.1728, pruned_loss=0.04327, over 857387.00 frames. +2022-12-07 21:18:48,293 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17779MB +2022-12-07 21:18:54,443 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-12-07 21:19:16,398 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=68087.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 21:19:32,590 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.61 vs. limit=2.0 +2022-12-07 21:19:33,609 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-12-07 21:19:42,873 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.67 vs. limit=2.0 +2022-12-07 21:20:12,821 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.622e+01 2.492e+02 3.120e+02 3.862e+02 1.100e+03, threshold=6.240e+02, percent-clipped=9.0 +2022-12-07 21:20:14,303 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68148.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 21:20:24,790 INFO [train.py:873] (2/4) Epoch 10, batch 100, loss[loss=0.1308, simple_loss=0.1622, pruned_loss=0.04969, over 14283.00 frames. ], tot_loss[loss=0.1409, simple_loss=0.167, pruned_loss=0.05743, over 816585.08 frames. ], batch size: 66, lr: 8.14e-03, grad_scale: 4.0 +2022-12-07 21:20:34,377 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68169.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:21:11,451 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0845, 2.7190, 5.1506, 3.3978, 4.7500, 2.2576, 4.0800, 4.6951], + device='cuda:2'), covar=tensor([0.0347, 0.4125, 0.0223, 0.7261, 0.0514, 0.3659, 0.0954, 0.0278], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0225, 0.0194, 0.0303, 0.0214, 0.0227, 0.0219, 0.0199], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:21:19,968 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68217.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:21:43,220 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2012, 2.0784, 4.9947, 4.5063, 4.3240, 5.1201, 4.9336, 5.0904], + device='cuda:2'), covar=tensor([0.1293, 0.1261, 0.0071, 0.0132, 0.0175, 0.0071, 0.0072, 0.0093], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0122, 0.0166, 0.0142, 0.0135, 0.0115, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:21:48,936 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.438e+02 2.529e+02 3.051e+02 3.928e+02 8.118e+02, threshold=6.102e+02, percent-clipped=5.0 +2022-12-07 21:21:55,899 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.06 vs. limit=5.0 +2022-12-07 21:21:59,799 INFO [train.py:873] (2/4) Epoch 10, batch 200, loss[loss=0.2343, simple_loss=0.1938, pruned_loss=0.1374, over 1230.00 frames. ], tot_loss[loss=0.1383, simple_loss=0.1644, pruned_loss=0.05605, over 1269437.78 frames. ], batch size: 100, lr: 8.14e-03, grad_scale: 4.0 +2022-12-07 21:22:38,749 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6889, 2.4658, 3.3295, 2.1620, 2.0561, 2.5502, 1.4024, 2.8270], + device='cuda:2'), covar=tensor([0.1347, 0.1509, 0.0863, 0.2387, 0.2798, 0.1633, 0.4757, 0.1525], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0093, 0.0087, 0.0094, 0.0113, 0.0081, 0.0129, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0005, 0.0004], + device='cuda:2') +2022-12-07 21:23:12,796 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68336.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:23:13,617 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68337.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:23:17,732 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.85 vs. limit=5.0 +2022-12-07 21:23:23,110 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.156e+02 2.072e+02 2.682e+02 3.537e+02 7.107e+02, threshold=5.365e+02, percent-clipped=2.0 +2022-12-07 21:23:24,312 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68348.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:23:34,545 INFO [train.py:873] (2/4) Epoch 10, batch 300, loss[loss=0.1348, simple_loss=0.1456, pruned_loss=0.06199, over 3868.00 frames. ], tot_loss[loss=0.1382, simple_loss=0.1644, pruned_loss=0.05604, over 1554249.12 frames. ], batch size: 100, lr: 8.13e-03, grad_scale: 4.0 +2022-12-07 21:23:38,529 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6118, 2.2247, 3.6496, 3.7726, 3.7254, 2.3597, 3.6615, 2.8886], + device='cuda:2'), covar=tensor([0.0316, 0.0746, 0.0720, 0.0388, 0.0271, 0.1085, 0.0294, 0.0714], + device='cuda:2'), in_proj_covar=tensor([0.0269, 0.0240, 0.0360, 0.0299, 0.0243, 0.0292, 0.0272, 0.0270], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:23:57,987 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68384.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:24:10,421 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68396.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:24:13,355 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=68399.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:24:25,471 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1427, 3.8947, 3.6180, 3.8007, 3.9950, 4.0263, 4.1797, 4.1412], + device='cuda:2'), covar=tensor([0.0908, 0.0648, 0.2363, 0.2874, 0.0764, 0.0846, 0.1021, 0.0917], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0248, 0.0417, 0.0528, 0.0309, 0.0399, 0.0386, 0.0351], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:24:36,532 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([6.0805, 5.4595, 5.4246, 5.9911, 5.5922, 4.7862, 5.9401, 4.7199], + device='cuda:2'), covar=tensor([0.0219, 0.0736, 0.0261, 0.0336, 0.0655, 0.0341, 0.0407, 0.0500], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0247, 0.0172, 0.0167, 0.0167, 0.0135, 0.0253, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 21:24:56,070 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68443.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 21:24:59,357 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.145e+02 2.338e+02 2.858e+02 3.586e+02 7.542e+02, threshold=5.716e+02, percent-clipped=9.0 +2022-12-07 21:25:10,936 INFO [train.py:873] (2/4) Epoch 10, batch 400, loss[loss=0.1161, simple_loss=0.1477, pruned_loss=0.0422, over 14634.00 frames. ], tot_loss[loss=0.1378, simple_loss=0.1642, pruned_loss=0.05576, over 1747737.13 frames. ], batch size: 33, lr: 8.12e-03, grad_scale: 8.0 +2022-12-07 21:25:12,104 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68460.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:26:19,667 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7173, 0.7451, 0.6268, 0.7607, 0.6785, 0.3621, 0.4068, 0.5374], + device='cuda:2'), covar=tensor([0.0154, 0.0147, 0.0122, 0.0115, 0.0164, 0.0352, 0.0234, 0.0362], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0013, 0.0013, 0.0021, 0.0017, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.0069e-04, 1.0861e-04, 9.6545e-05, 1.0336e-04, 1.0046e-04, 1.5022e-04, + 1.2663e-04, 1.4514e-04], device='cuda:2') +2022-12-07 21:26:35,456 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.366e+02 2.435e+02 2.815e+02 3.563e+02 7.495e+02, threshold=5.630e+02, percent-clipped=4.0 +2022-12-07 21:26:47,610 INFO [train.py:873] (2/4) Epoch 10, batch 500, loss[loss=0.161, simple_loss=0.1718, pruned_loss=0.07505, over 7804.00 frames. ], tot_loss[loss=0.1392, simple_loss=0.1651, pruned_loss=0.05662, over 1859835.13 frames. ], batch size: 100, lr: 8.12e-03, grad_scale: 8.0 +2022-12-07 21:27:05,928 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=68578.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:28:01,890 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68637.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:28:03,528 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68639.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:28:10,692 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.029e+02 2.141e+02 2.717e+02 3.371e+02 6.429e+02, threshold=5.434e+02, percent-clipped=1.0 +2022-12-07 21:28:21,922 INFO [train.py:873] (2/4) Epoch 10, batch 600, loss[loss=0.1205, simple_loss=0.1518, pruned_loss=0.0446, over 13964.00 frames. ], tot_loss[loss=0.1412, simple_loss=0.166, pruned_loss=0.05823, over 1837381.38 frames. ], batch size: 20, lr: 8.11e-03, grad_scale: 8.0 +2022-12-07 21:28:37,742 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4684, 2.7033, 4.1901, 3.1437, 4.1523, 3.8334, 3.9020, 3.5051], + device='cuda:2'), covar=tensor([0.0656, 0.3673, 0.1048, 0.2212, 0.1007, 0.1058, 0.1875, 0.2017], + device='cuda:2'), in_proj_covar=tensor([0.0341, 0.0326, 0.0406, 0.0311, 0.0379, 0.0320, 0.0368, 0.0323], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:28:46,815 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68685.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:28:46,958 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=68685.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:29:02,536 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.33 vs. limit=2.0 +2022-12-07 21:29:31,319 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4226, 2.3777, 2.5213, 2.5431, 2.4667, 2.1134, 1.3981, 2.1879], + device='cuda:2'), covar=tensor([0.0421, 0.0461, 0.0425, 0.0304, 0.0354, 0.1093, 0.2213, 0.0399], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0158, 0.0130, 0.0129, 0.0185, 0.0127, 0.0154, 0.0175], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:29:41,787 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=68743.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 21:29:44,921 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=68746.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:29:45,570 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.363e+02 2.288e+02 3.002e+02 3.760e+02 7.437e+02, threshold=6.005e+02, percent-clipped=6.0 +2022-12-07 21:29:53,300 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68755.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:29:57,179 INFO [train.py:873] (2/4) Epoch 10, batch 700, loss[loss=0.1303, simple_loss=0.1608, pruned_loss=0.04992, over 14266.00 frames. ], tot_loss[loss=0.139, simple_loss=0.1648, pruned_loss=0.05657, over 1869356.60 frames. ], batch size: 80, lr: 8.11e-03, grad_scale: 8.0 +2022-12-07 21:30:27,494 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=68791.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 21:31:18,241 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7504, 0.8046, 0.7409, 0.7602, 0.6549, 0.3661, 0.3856, 0.5017], + device='cuda:2'), covar=tensor([0.0117, 0.0117, 0.0111, 0.0120, 0.0142, 0.0332, 0.0259, 0.0335], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0013, 0.0013, 0.0021, 0.0017, 0.0021], + device='cuda:2'), out_proj_covar=tensor([1.0016e-04, 1.0761e-04, 9.5933e-05, 1.0303e-04, 1.0005e-04, 1.4937e-04, + 1.2672e-04, 1.4383e-04], device='cuda:2') +2022-12-07 21:31:20,804 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.536e+02 2.251e+02 2.713e+02 3.475e+02 5.744e+02, threshold=5.426e+02, percent-clipped=0.0 +2022-12-07 21:31:32,390 INFO [train.py:873] (2/4) Epoch 10, batch 800, loss[loss=0.1896, simple_loss=0.1736, pruned_loss=0.1029, over 1253.00 frames. ], tot_loss[loss=0.137, simple_loss=0.1637, pruned_loss=0.05517, over 1903146.43 frames. ], batch size: 100, lr: 8.10e-03, grad_scale: 8.0 +2022-12-07 21:32:17,990 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-12-07 21:32:43,333 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=68934.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:32:55,163 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.186e+02 2.204e+02 2.859e+02 3.964e+02 7.134e+02, threshold=5.717e+02, percent-clipped=4.0 +2022-12-07 21:33:06,464 INFO [train.py:873] (2/4) Epoch 10, batch 900, loss[loss=0.1687, simple_loss=0.1793, pruned_loss=0.07903, over 7794.00 frames. ], tot_loss[loss=0.1368, simple_loss=0.1639, pruned_loss=0.05485, over 1956473.28 frames. ], batch size: 100, lr: 8.09e-03, grad_scale: 8.0 +2022-12-07 21:33:11,507 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8009, 2.7652, 2.6892, 2.9451, 2.4803, 2.4906, 2.9048, 2.8888], + device='cuda:2'), covar=tensor([0.0663, 0.0798, 0.0801, 0.0547, 0.1057, 0.0759, 0.0673, 0.0605], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0121, 0.0130, 0.0136, 0.0132, 0.0108, 0.0151, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 21:33:59,175 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8207, 1.5719, 1.8382, 2.0286, 1.2907, 1.7799, 1.8129, 1.9349], + device='cuda:2'), covar=tensor([0.0077, 0.0131, 0.0083, 0.0071, 0.0175, 0.0180, 0.0102, 0.0079], + device='cuda:2'), in_proj_covar=tensor([0.0265, 0.0240, 0.0359, 0.0302, 0.0245, 0.0291, 0.0274, 0.0273], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:34:24,728 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69041.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:34:26,716 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69043.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:34:30,091 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.213e+02 2.392e+02 2.888e+02 3.362e+02 7.513e+02, threshold=5.776e+02, percent-clipped=4.0 +2022-12-07 21:34:31,164 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1692, 1.9440, 2.2355, 2.3342, 1.9301, 1.9387, 2.2344, 2.1539], + device='cuda:2'), covar=tensor([0.0162, 0.0310, 0.0139, 0.0147, 0.0247, 0.0393, 0.0186, 0.0165], + device='cuda:2'), in_proj_covar=tensor([0.0265, 0.0240, 0.0358, 0.0301, 0.0244, 0.0290, 0.0274, 0.0272], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:34:37,729 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69055.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:34:41,174 INFO [train.py:873] (2/4) Epoch 10, batch 1000, loss[loss=0.2061, simple_loss=0.1758, pruned_loss=0.1182, over 1251.00 frames. ], tot_loss[loss=0.1388, simple_loss=0.1647, pruned_loss=0.0564, over 1885986.90 frames. ], batch size: 100, lr: 8.09e-03, grad_scale: 8.0 +2022-12-07 21:35:22,729 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69103.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:35:23,592 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69104.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:35:24,451 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6739, 2.7577, 2.6449, 2.9263, 2.3836, 2.6432, 2.7614, 2.8403], + device='cuda:2'), covar=tensor([0.0857, 0.0915, 0.0905, 0.0688, 0.1117, 0.0688, 0.0953, 0.0712], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0120, 0.0128, 0.0134, 0.0131, 0.0106, 0.0149, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 21:36:04,001 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.114e+02 2.200e+02 2.692e+02 3.189e+02 5.615e+02, threshold=5.384e+02, percent-clipped=0.0 +2022-12-07 21:36:09,170 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8644, 4.5989, 4.2384, 4.4495, 4.5322, 4.7692, 4.7987, 4.8158], + device='cuda:2'), covar=tensor([0.0719, 0.0445, 0.2244, 0.2558, 0.0736, 0.0645, 0.0900, 0.0752], + device='cuda:2'), in_proj_covar=tensor([0.0352, 0.0247, 0.0414, 0.0532, 0.0308, 0.0403, 0.0389, 0.0347], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:36:15,496 INFO [train.py:873] (2/4) Epoch 10, batch 1100, loss[loss=0.1404, simple_loss=0.1703, pruned_loss=0.0553, over 14338.00 frames. ], tot_loss[loss=0.1367, simple_loss=0.1634, pruned_loss=0.05496, over 1894737.73 frames. ], batch size: 66, lr: 8.08e-03, grad_scale: 8.0 +2022-12-07 21:36:18,785 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69162.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:36:24,399 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69168.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:36:31,195 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6864, 2.6346, 2.0002, 2.7278, 2.4828, 2.6522, 2.3416, 2.1737], + device='cuda:2'), covar=tensor([0.0969, 0.1150, 0.3350, 0.0625, 0.0849, 0.0812, 0.1450, 0.2257], + device='cuda:2'), in_proj_covar=tensor([0.0257, 0.0292, 0.0269, 0.0236, 0.0292, 0.0284, 0.0255, 0.0256], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 21:37:16,976 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69223.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:37:18,939 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69225.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:37:22,798 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69229.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:37:27,490 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69234.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:37:37,879 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69245.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 21:37:39,427 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.224e+02 2.182e+02 2.689e+02 3.231e+02 7.876e+02, threshold=5.378e+02, percent-clipped=5.0 +2022-12-07 21:37:51,106 INFO [train.py:873] (2/4) Epoch 10, batch 1200, loss[loss=0.1653, simple_loss=0.1786, pruned_loss=0.07599, over 14174.00 frames. ], tot_loss[loss=0.1382, simple_loss=0.1644, pruned_loss=0.05603, over 1919238.72 frames. ], batch size: 99, lr: 8.08e-03, grad_scale: 8.0 +2022-12-07 21:37:51,282 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3170, 2.0829, 2.2630, 1.5879, 1.9816, 2.3136, 2.3855, 1.9880], + device='cuda:2'), covar=tensor([0.0576, 0.0625, 0.0885, 0.1405, 0.1057, 0.0597, 0.0489, 0.1332], + device='cuda:2'), in_proj_covar=tensor([0.0131, 0.0184, 0.0133, 0.0125, 0.0125, 0.0135, 0.0111, 0.0135], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 21:38:03,923 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8930, 1.5308, 2.1263, 1.6473, 1.9478, 1.5083, 1.6543, 1.9575], + device='cuda:2'), covar=tensor([0.1977, 0.2734, 0.0356, 0.2240, 0.1053, 0.1378, 0.0800, 0.0658], + device='cuda:2'), in_proj_covar=tensor([0.0241, 0.0222, 0.0193, 0.0300, 0.0214, 0.0228, 0.0217, 0.0201], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:38:04,677 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69273.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:38:13,069 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69282.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:38:16,844 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69286.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:38:36,120 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69306.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 21:38:42,991 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5866, 4.0177, 3.0394, 4.9140, 4.4095, 4.7098, 3.9754, 3.5428], + device='cuda:2'), covar=tensor([0.0854, 0.1259, 0.4306, 0.0503, 0.0862, 0.1879, 0.1498, 0.3368], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0292, 0.0270, 0.0236, 0.0293, 0.0285, 0.0257, 0.0257], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 21:38:49,568 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69320.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:39:02,181 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69334.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 21:39:09,002 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69341.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:39:14,524 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.241e+02 2.336e+02 2.846e+02 3.621e+02 9.526e+02, threshold=5.693e+02, percent-clipped=8.0 +2022-12-07 21:39:25,827 INFO [train.py:873] (2/4) Epoch 10, batch 1300, loss[loss=0.1322, simple_loss=0.1706, pruned_loss=0.04692, over 14543.00 frames. ], tot_loss[loss=0.1375, simple_loss=0.1642, pruned_loss=0.05538, over 1970644.15 frames. ], batch size: 43, lr: 8.07e-03, grad_scale: 8.0 +2022-12-07 21:39:34,523 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69368.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:39:46,922 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69381.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:39:54,588 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69389.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:39:59,631 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69394.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:40:03,975 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69399.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:40:32,773 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69429.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:40:39,707 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7678, 1.4818, 1.7552, 2.0162, 1.2493, 1.7063, 1.8412, 1.8546], + device='cuda:2'), covar=tensor([0.0103, 0.0193, 0.0112, 0.0086, 0.0216, 0.0240, 0.0129, 0.0094], + device='cuda:2'), in_proj_covar=tensor([0.0270, 0.0241, 0.0362, 0.0304, 0.0248, 0.0291, 0.0276, 0.0273], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:40:42,330 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7238, 3.5707, 3.3084, 3.4321, 3.7053, 3.6534, 3.7700, 3.7345], + device='cuda:2'), covar=tensor([0.0979, 0.0574, 0.1836, 0.2522, 0.0692, 0.0834, 0.0957, 0.0824], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0247, 0.0408, 0.0526, 0.0303, 0.0398, 0.0382, 0.0339], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:40:49,782 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.321e+02 2.414e+02 2.899e+02 3.828e+02 8.841e+02, threshold=5.798e+02, percent-clipped=2.0 +2022-12-07 21:40:57,545 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69455.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:40:57,563 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0078, 1.9852, 1.6252, 2.0093, 1.8637, 1.9653, 1.7529, 1.7809], + device='cuda:2'), covar=tensor([0.0740, 0.0761, 0.1756, 0.0470, 0.0775, 0.0476, 0.1512, 0.0855], + device='cuda:2'), in_proj_covar=tensor([0.0261, 0.0297, 0.0274, 0.0240, 0.0297, 0.0288, 0.0259, 0.0260], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 21:41:01,165 INFO [train.py:873] (2/4) Epoch 10, batch 1400, loss[loss=0.1413, simple_loss=0.1681, pruned_loss=0.05723, over 11991.00 frames. ], tot_loss[loss=0.1374, simple_loss=0.1639, pruned_loss=0.05544, over 1982167.24 frames. ], batch size: 100, lr: 8.07e-03, grad_scale: 8.0 +2022-12-07 21:41:07,915 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9437, 1.8680, 1.9271, 2.0822, 1.9513, 1.6265, 1.1855, 1.7686], + device='cuda:2'), covar=tensor([0.0601, 0.0647, 0.0538, 0.0361, 0.0507, 0.1260, 0.2256, 0.0501], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0160, 0.0131, 0.0131, 0.0188, 0.0130, 0.0154, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:41:42,919 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2019, 1.2396, 1.4034, 0.9739, 0.8937, 1.2848, 0.8091, 1.2535], + device='cuda:2'), covar=tensor([0.1267, 0.2531, 0.0844, 0.2054, 0.2794, 0.0836, 0.1955, 0.1010], + device='cuda:2'), in_proj_covar=tensor([0.0077, 0.0092, 0.0086, 0.0092, 0.0112, 0.0078, 0.0127, 0.0083], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0005, 0.0003], + device='cuda:2') +2022-12-07 21:41:57,175 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69518.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:42:02,573 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69524.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:42:22,844 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-12-07 21:42:24,549 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.449e+02 2.299e+02 2.933e+02 3.622e+02 8.157e+02, threshold=5.865e+02, percent-clipped=2.0 +2022-12-07 21:42:35,973 INFO [train.py:873] (2/4) Epoch 10, batch 1500, loss[loss=0.1577, simple_loss=0.1509, pruned_loss=0.0823, over 1248.00 frames. ], tot_loss[loss=0.136, simple_loss=0.163, pruned_loss=0.05447, over 1979339.26 frames. ], batch size: 100, lr: 8.06e-03, grad_scale: 8.0 +2022-12-07 21:42:57,190 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69581.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:43:15,896 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69601.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 21:43:33,110 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.20 vs. limit=2.0 +2022-12-07 21:43:42,190 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69629.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 21:43:59,276 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.210e+02 2.142e+02 2.757e+02 3.564e+02 6.285e+02, threshold=5.515e+02, percent-clipped=1.0 +2022-12-07 21:44:10,750 INFO [train.py:873] (2/4) Epoch 10, batch 1600, loss[loss=0.1009, simple_loss=0.1411, pruned_loss=0.03039, over 14244.00 frames. ], tot_loss[loss=0.1364, simple_loss=0.1633, pruned_loss=0.05471, over 2015314.03 frames. ], batch size: 25, lr: 8.05e-03, grad_scale: 8.0 +2022-12-07 21:44:19,181 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7287, 2.2804, 3.8278, 3.9297, 3.8279, 2.3981, 3.7909, 3.0391], + device='cuda:2'), covar=tensor([0.0305, 0.0733, 0.0541, 0.0294, 0.0239, 0.1056, 0.0256, 0.0707], + device='cuda:2'), in_proj_covar=tensor([0.0272, 0.0243, 0.0365, 0.0307, 0.0249, 0.0294, 0.0279, 0.0274], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:44:24,817 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=69674.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 21:44:26,600 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69676.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:44:48,113 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69699.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:45:11,237 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69724.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:45:15,583 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.11 vs. limit=2.0 +2022-12-07 21:45:21,926 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=69735.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 21:45:33,106 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.146e+01 2.291e+02 2.751e+02 3.642e+02 1.145e+03, threshold=5.503e+02, percent-clipped=2.0 +2022-12-07 21:45:33,205 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69747.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:45:36,273 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=69750.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:45:45,009 INFO [train.py:873] (2/4) Epoch 10, batch 1700, loss[loss=0.1515, simple_loss=0.1751, pruned_loss=0.06401, over 10327.00 frames. ], tot_loss[loss=0.1372, simple_loss=0.164, pruned_loss=0.05518, over 2011128.50 frames. ], batch size: 100, lr: 8.05e-03, grad_scale: 8.0 +2022-12-07 21:46:19,938 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1704, 2.9372, 2.2532, 3.1904, 3.0435, 3.1059, 2.7154, 2.3533], + device='cuda:2'), covar=tensor([0.0842, 0.1331, 0.3128, 0.0582, 0.0964, 0.0951, 0.1356, 0.2942], + device='cuda:2'), in_proj_covar=tensor([0.0263, 0.0297, 0.0271, 0.0241, 0.0297, 0.0291, 0.0258, 0.0259], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 21:46:25,006 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8270, 1.5991, 2.1007, 1.6981, 1.9777, 1.4660, 1.6331, 1.9550], + device='cuda:2'), covar=tensor([0.2318, 0.2276, 0.0341, 0.1499, 0.0858, 0.1140, 0.1090, 0.0673], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0221, 0.0192, 0.0302, 0.0217, 0.0226, 0.0219, 0.0204], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:46:40,883 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69818.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:46:46,192 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69824.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:47:08,312 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.377e+02 2.221e+02 2.761e+02 3.390e+02 7.818e+02, threshold=5.522e+02, percent-clipped=1.0 +2022-12-07 21:47:10,423 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2635, 1.8279, 2.2455, 1.9437, 2.3105, 2.0745, 1.9559, 2.0007], + device='cuda:2'), covar=tensor([0.0431, 0.1883, 0.0427, 0.0799, 0.0395, 0.0724, 0.0377, 0.0705], + device='cuda:2'), in_proj_covar=tensor([0.0341, 0.0322, 0.0396, 0.0305, 0.0380, 0.0313, 0.0360, 0.0318], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:47:19,440 INFO [train.py:873] (2/4) Epoch 10, batch 1800, loss[loss=0.1686, simple_loss=0.1727, pruned_loss=0.08229, over 4991.00 frames. ], tot_loss[loss=0.1386, simple_loss=0.1647, pruned_loss=0.05628, over 2003956.57 frames. ], batch size: 100, lr: 8.04e-03, grad_scale: 8.0 +2022-12-07 21:47:26,412 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69866.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:47:30,563 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1490, 2.0144, 1.7575, 1.8057, 2.0790, 2.0425, 2.1016, 2.0481], + device='cuda:2'), covar=tensor([0.1111, 0.0779, 0.2757, 0.3031, 0.1083, 0.1203, 0.1532, 0.1233], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0247, 0.0417, 0.0533, 0.0314, 0.0408, 0.0385, 0.0350], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:47:32,300 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69872.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:47:40,927 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69881.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:47:51,815 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-07 21:47:59,771 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69901.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 21:48:26,756 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69929.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:48:26,820 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69929.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 21:48:43,306 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 2.361e+02 2.916e+02 3.771e+02 6.970e+02, threshold=5.833e+02, percent-clipped=3.0 +2022-12-07 21:48:45,629 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69949.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 21:48:55,136 INFO [train.py:873] (2/4) Epoch 10, batch 1900, loss[loss=0.1183, simple_loss=0.1512, pruned_loss=0.04268, over 14257.00 frames. ], tot_loss[loss=0.1385, simple_loss=0.1645, pruned_loss=0.05626, over 2039651.80 frames. ], batch size: 57, lr: 8.04e-03, grad_scale: 8.0 +2022-12-07 21:49:04,047 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0911, 2.8370, 5.2069, 3.4696, 4.6746, 2.5762, 3.8732, 4.8000], + device='cuda:2'), covar=tensor([0.0351, 0.3527, 0.0188, 0.6703, 0.0448, 0.3113, 0.0958, 0.0286], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0221, 0.0192, 0.0301, 0.0216, 0.0226, 0.0219, 0.0203], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:49:11,055 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=69976.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:49:11,853 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=69977.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:50:02,129 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70024.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:50:02,233 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70024.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:50:08,090 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=70030.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 21:50:08,934 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6002, 4.4412, 4.2499, 4.7246, 4.2527, 4.0188, 4.6847, 4.5939], + device='cuda:2'), covar=tensor([0.0633, 0.0646, 0.0722, 0.0488, 0.0748, 0.0677, 0.0589, 0.0567], + device='cuda:2'), in_proj_covar=tensor([0.0127, 0.0120, 0.0130, 0.0137, 0.0132, 0.0109, 0.0151, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 21:50:24,070 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.053e+02 2.126e+02 2.650e+02 3.300e+02 6.481e+02, threshold=5.300e+02, percent-clipped=2.0 +2022-12-07 21:50:27,048 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70050.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:50:35,177 INFO [train.py:873] (2/4) Epoch 10, batch 2000, loss[loss=0.1377, simple_loss=0.1693, pruned_loss=0.05305, over 11174.00 frames. ], tot_loss[loss=0.1384, simple_loss=0.1644, pruned_loss=0.05622, over 2019306.43 frames. ], batch size: 100, lr: 8.03e-03, grad_scale: 8.0 +2022-12-07 21:50:48,017 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70072.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:51:12,854 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70098.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:51:24,537 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=70110.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:51:25,341 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1758, 2.0288, 2.6607, 1.5759, 1.8265, 2.4618, 1.2824, 2.4048], + device='cuda:2'), covar=tensor([0.1251, 0.1903, 0.0718, 0.2960, 0.2861, 0.0739, 0.4558, 0.0988], + device='cuda:2'), in_proj_covar=tensor([0.0079, 0.0093, 0.0087, 0.0094, 0.0113, 0.0079, 0.0128, 0.0084], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0005, 0.0003], + device='cuda:2') +2022-12-07 21:51:59,346 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.415e+02 2.238e+02 2.810e+02 3.599e+02 8.591e+02, threshold=5.620e+02, percent-clipped=6.0 +2022-12-07 21:52:10,761 INFO [train.py:873] (2/4) Epoch 10, batch 2100, loss[loss=0.103, simple_loss=0.1459, pruned_loss=0.03003, over 14391.00 frames. ], tot_loss[loss=0.1364, simple_loss=0.1632, pruned_loss=0.05477, over 2028576.14 frames. ], batch size: 41, lr: 8.03e-03, grad_scale: 16.0 +2022-12-07 21:52:21,438 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5471, 1.0893, 2.0335, 1.8264, 1.9342, 2.0709, 1.4800, 2.0190], + device='cuda:2'), covar=tensor([0.0699, 0.1162, 0.0169, 0.0377, 0.0452, 0.0191, 0.0486, 0.0259], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0157, 0.0121, 0.0165, 0.0141, 0.0133, 0.0114, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:52:22,433 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=70171.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:53:31,574 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8281, 1.2150, 2.0533, 1.2101, 2.0458, 2.0449, 1.8082, 2.1182], + device='cuda:2'), covar=tensor([0.0321, 0.1952, 0.0384, 0.1922, 0.0410, 0.0469, 0.0769, 0.0290], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0159, 0.0156, 0.0169, 0.0169, 0.0168, 0.0135, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 21:53:34,941 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.579e+01 2.218e+02 2.863e+02 3.700e+02 6.629e+02, threshold=5.727e+02, percent-clipped=4.0 +2022-12-07 21:53:39,749 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1369, 3.0764, 2.9318, 3.2538, 2.7007, 2.8362, 3.2334, 3.1762], + device='cuda:2'), covar=tensor([0.0713, 0.0949, 0.0907, 0.0691, 0.1246, 0.0753, 0.0817, 0.0764], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0123, 0.0133, 0.0139, 0.0134, 0.0111, 0.0153, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 21:53:45,273 INFO [train.py:873] (2/4) Epoch 10, batch 2200, loss[loss=0.2237, simple_loss=0.1876, pruned_loss=0.1299, over 1256.00 frames. ], tot_loss[loss=0.1365, simple_loss=0.163, pruned_loss=0.05497, over 1926237.40 frames. ], batch size: 100, lr: 8.02e-03, grad_scale: 8.0 +2022-12-07 21:53:46,702 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=70260.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:54:14,810 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0466, 2.0683, 4.1223, 2.8476, 3.9414, 2.0102, 3.0866, 3.9217], + device='cuda:2'), covar=tensor([0.0641, 0.4449, 0.0482, 0.6470, 0.0594, 0.3792, 0.1203, 0.0486], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0222, 0.0192, 0.0299, 0.0216, 0.0227, 0.0219, 0.0204], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:54:26,479 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5865, 4.6793, 4.7966, 4.3724, 4.6550, 5.1115, 1.7547, 4.3650], + device='cuda:2'), covar=tensor([0.0327, 0.0405, 0.0683, 0.0437, 0.0485, 0.0205, 0.4142, 0.0357], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0158, 0.0132, 0.0133, 0.0189, 0.0129, 0.0153, 0.0176], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:54:43,806 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=70321.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:54:52,750 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70330.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 21:55:09,312 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.038e+02 2.253e+02 2.794e+02 3.378e+02 9.827e+02, threshold=5.588e+02, percent-clipped=3.0 +2022-12-07 21:55:19,407 INFO [train.py:873] (2/4) Epoch 10, batch 2300, loss[loss=0.1655, simple_loss=0.1722, pruned_loss=0.07943, over 3860.00 frames. ], tot_loss[loss=0.1375, simple_loss=0.1635, pruned_loss=0.05574, over 1958012.55 frames. ], batch size: 100, lr: 8.01e-03, grad_scale: 8.0 +2022-12-07 21:55:33,949 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3258, 5.1182, 4.8466, 5.3531, 4.8933, 4.6869, 5.4166, 5.1949], + device='cuda:2'), covar=tensor([0.0569, 0.0567, 0.0777, 0.0452, 0.0670, 0.0435, 0.0471, 0.0597], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0122, 0.0132, 0.0138, 0.0132, 0.0110, 0.0151, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 21:55:37,486 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70378.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 21:55:46,284 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3383, 1.4872, 3.8818, 2.1403, 4.0642, 4.2594, 3.5663, 4.7884], + device='cuda:2'), covar=tensor([0.0204, 0.3143, 0.0473, 0.2167, 0.0374, 0.0423, 0.0563, 0.0125], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0159, 0.0156, 0.0168, 0.0169, 0.0169, 0.0134, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 21:55:51,311 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8502, 2.0865, 3.9227, 2.7006, 3.8134, 2.0440, 3.0316, 3.7684], + device='cuda:2'), covar=tensor([0.0618, 0.4624, 0.0417, 0.6642, 0.0506, 0.3544, 0.1282, 0.0428], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0219, 0.0191, 0.0299, 0.0214, 0.0224, 0.0218, 0.0204], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 21:56:02,237 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0820, 1.3767, 3.2119, 1.5923, 3.0506, 3.2573, 2.3059, 3.4510], + device='cuda:2'), covar=tensor([0.0261, 0.3133, 0.0445, 0.2187, 0.0967, 0.0396, 0.0872, 0.0192], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0159, 0.0156, 0.0168, 0.0169, 0.0169, 0.0135, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 21:56:22,027 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1743, 2.2473, 3.9484, 4.0474, 4.1004, 2.3954, 3.8934, 3.1300], + device='cuda:2'), covar=tensor([0.0257, 0.0818, 0.0735, 0.0342, 0.0249, 0.1147, 0.0397, 0.0739], + device='cuda:2'), in_proj_covar=tensor([0.0269, 0.0240, 0.0360, 0.0304, 0.0247, 0.0292, 0.0277, 0.0273], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:56:40,810 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1830, 3.9442, 3.8451, 4.2677, 4.0031, 3.8080, 4.3288, 3.5823], + device='cuda:2'), covar=tensor([0.0533, 0.1036, 0.0453, 0.0510, 0.0896, 0.0954, 0.0539, 0.0620], + device='cuda:2'), in_proj_covar=tensor([0.0158, 0.0257, 0.0178, 0.0175, 0.0168, 0.0137, 0.0262, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 21:56:43,385 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.243e+01 2.133e+02 2.592e+02 3.520e+02 8.171e+02, threshold=5.183e+02, percent-clipped=4.0 +2022-12-07 21:56:53,785 INFO [train.py:873] (2/4) Epoch 10, batch 2400, loss[loss=0.1369, simple_loss=0.1589, pruned_loss=0.05746, over 6919.00 frames. ], tot_loss[loss=0.1369, simple_loss=0.1635, pruned_loss=0.05514, over 1930180.30 frames. ], batch size: 100, lr: 8.01e-03, grad_scale: 8.0 +2022-12-07 21:56:55,880 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8617, 2.5309, 2.6822, 1.8705, 2.4062, 2.5560, 2.9745, 2.3906], + device='cuda:2'), covar=tensor([0.0775, 0.1173, 0.1124, 0.1687, 0.1068, 0.0766, 0.0585, 0.1487], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0187, 0.0135, 0.0126, 0.0128, 0.0137, 0.0113, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0005], + device='cuda:2') +2022-12-07 21:57:00,671 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=70466.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:57:12,189 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3806, 2.3628, 4.2410, 4.2723, 4.3592, 2.5365, 4.3107, 3.3675], + device='cuda:2'), covar=tensor([0.0303, 0.0847, 0.0680, 0.0366, 0.0266, 0.1235, 0.0322, 0.0745], + device='cuda:2'), in_proj_covar=tensor([0.0268, 0.0240, 0.0360, 0.0303, 0.0246, 0.0291, 0.0277, 0.0272], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 21:57:39,327 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-07 21:58:18,045 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.315e+02 2.395e+02 3.014e+02 3.583e+02 8.568e+02, threshold=6.028e+02, percent-clipped=4.0 +2022-12-07 21:58:28,700 INFO [train.py:873] (2/4) Epoch 10, batch 2500, loss[loss=0.1016, simple_loss=0.1393, pruned_loss=0.03191, over 14619.00 frames. ], tot_loss[loss=0.1374, simple_loss=0.1638, pruned_loss=0.05548, over 1983612.28 frames. ], batch size: 21, lr: 8.00e-03, grad_scale: 8.0 +2022-12-07 21:58:37,070 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1937, 1.3947, 1.3584, 1.3957, 1.4164, 1.3031, 1.1971, 1.0898], + device='cuda:2'), covar=tensor([0.0573, 0.1051, 0.0498, 0.0591, 0.0566, 0.0449, 0.0409, 0.0774], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0015, 0.0012, 0.0013, 0.0013, 0.0021, 0.0017, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.0236e-04, 1.1110e-04, 9.6603e-05, 1.0567e-04, 1.0181e-04, 1.5212e-04, + 1.2883e-04, 1.4771e-04], device='cuda:2') +2022-12-07 21:58:43,700 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.97 vs. limit=2.0 +2022-12-07 21:58:52,009 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-07 21:59:22,823 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=70616.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 21:59:41,275 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3554, 3.6793, 3.1697, 3.3565, 2.4726, 3.5558, 3.3357, 1.5357], + device='cuda:2'), covar=tensor([0.2132, 0.0695, 0.1580, 0.0813, 0.1218, 0.0590, 0.1404, 0.3004], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0074, 0.0059, 0.0062, 0.0089, 0.0069, 0.0094, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 21:59:53,709 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.258e+02 2.220e+02 2.772e+02 3.507e+02 5.637e+02, threshold=5.543e+02, percent-clipped=0.0 +2022-12-07 22:00:03,811 INFO [train.py:873] (2/4) Epoch 10, batch 2600, loss[loss=0.1322, simple_loss=0.1336, pruned_loss=0.06538, over 2629.00 frames. ], tot_loss[loss=0.1359, simple_loss=0.1632, pruned_loss=0.05428, over 2028238.10 frames. ], batch size: 100, lr: 8.00e-03, grad_scale: 8.0 +2022-12-07 22:00:42,404 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.58 vs. limit=2.0 +2022-12-07 22:01:27,326 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.690e+02 2.257e+02 2.909e+02 3.862e+02 1.406e+03, threshold=5.817e+02, percent-clipped=9.0 +2022-12-07 22:01:38,026 INFO [train.py:873] (2/4) Epoch 10, batch 2700, loss[loss=0.1328, simple_loss=0.1668, pruned_loss=0.04934, over 14384.00 frames. ], tot_loss[loss=0.136, simple_loss=0.1632, pruned_loss=0.05439, over 2054765.47 frames. ], batch size: 55, lr: 7.99e-03, grad_scale: 8.0 +2022-12-07 22:01:45,064 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70766.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:01:48,078 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.56 vs. limit=2.0 +2022-12-07 22:01:56,036 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6267, 1.2814, 1.5727, 1.3358, 1.5524, 0.8484, 1.5557, 1.5095], + device='cuda:2'), covar=tensor([0.1057, 0.1037, 0.0862, 0.2310, 0.0752, 0.1048, 0.0714, 0.0840], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0023, 0.0025, 0.0023, 0.0024, 0.0035, 0.0023, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 22:01:57,807 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1263, 2.6848, 3.7233, 2.4874, 2.2636, 3.1155, 1.7265, 3.0221], + device='cuda:2'), covar=tensor([0.0794, 0.1615, 0.0513, 0.1901, 0.2252, 0.0697, 0.4045, 0.0840], + device='cuda:2'), in_proj_covar=tensor([0.0076, 0.0092, 0.0084, 0.0091, 0.0109, 0.0077, 0.0124, 0.0082], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 22:02:30,607 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70814.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:03:03,173 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.053e+02 2.251e+02 2.633e+02 3.197e+02 4.846e+02, threshold=5.267e+02, percent-clipped=0.0 +2022-12-07 22:03:13,059 INFO [train.py:873] (2/4) Epoch 10, batch 2800, loss[loss=0.1604, simple_loss=0.1752, pruned_loss=0.0728, over 5979.00 frames. ], tot_loss[loss=0.1365, simple_loss=0.1636, pruned_loss=0.05467, over 2064945.93 frames. ], batch size: 100, lr: 7.99e-03, grad_scale: 8.0 +2022-12-07 22:03:52,558 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5427, 1.5423, 1.5984, 1.5010, 1.3990, 1.4078, 0.9325, 0.9992], + device='cuda:2'), covar=tensor([0.0214, 0.0275, 0.0207, 0.0282, 0.0295, 0.0252, 0.0267, 0.0444], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0013, 0.0013, 0.0021, 0.0017, 0.0021], + device='cuda:2'), out_proj_covar=tensor([9.9634e-05, 1.0808e-04, 9.4366e-05, 1.0299e-04, 1.0099e-04, 1.4963e-04, + 1.2673e-04, 1.4421e-04], device='cuda:2') +2022-12-07 22:04:08,136 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=70916.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:04:38,139 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.263e+02 2.119e+02 2.647e+02 3.407e+02 7.312e+02, threshold=5.294e+02, percent-clipped=4.0 +2022-12-07 22:04:48,431 INFO [train.py:873] (2/4) Epoch 10, batch 2900, loss[loss=0.1332, simple_loss=0.1642, pruned_loss=0.05104, over 14025.00 frames. ], tot_loss[loss=0.1372, simple_loss=0.1637, pruned_loss=0.05534, over 1918271.98 frames. ], batch size: 29, lr: 7.98e-03, grad_scale: 8.0 +2022-12-07 22:04:52,929 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=70964.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:05:19,756 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=70992.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:05:21,595 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9865, 1.7158, 4.1746, 3.9299, 3.9812, 4.3372, 3.5670, 4.2955], + device='cuda:2'), covar=tensor([0.1398, 0.1517, 0.0105, 0.0217, 0.0177, 0.0083, 0.0240, 0.0115], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0161, 0.0123, 0.0168, 0.0143, 0.0134, 0.0117, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 22:05:22,386 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5355, 4.2184, 4.1071, 4.5308, 4.2847, 4.0548, 4.5903, 3.8454], + device='cuda:2'), covar=tensor([0.0382, 0.1013, 0.0366, 0.0478, 0.0799, 0.0853, 0.0515, 0.0554], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0258, 0.0178, 0.0176, 0.0167, 0.0139, 0.0264, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 22:05:55,231 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=71030.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:06:12,367 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.268e+02 2.365e+02 2.871e+02 3.545e+02 4.870e+02, threshold=5.742e+02, percent-clipped=0.0 +2022-12-07 22:06:17,173 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=71053.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:06:22,389 INFO [train.py:873] (2/4) Epoch 10, batch 3000, loss[loss=0.1489, simple_loss=0.1683, pruned_loss=0.06477, over 14542.00 frames. ], tot_loss[loss=0.1372, simple_loss=0.1642, pruned_loss=0.05512, over 2014356.70 frames. ], batch size: 34, lr: 7.98e-03, grad_scale: 8.0 +2022-12-07 22:06:22,389 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 22:06:39,844 INFO [train.py:905] (2/4) Epoch 10, validation: loss=0.1251, simple_loss=0.1669, pruned_loss=0.04162, over 857387.00 frames. +2022-12-07 22:06:39,845 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17779MB +2022-12-07 22:07:03,315 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3493, 3.3017, 4.1973, 2.7775, 2.5178, 3.1704, 1.9295, 3.3415], + device='cuda:2'), covar=tensor([0.0904, 0.0826, 0.0462, 0.2468, 0.2311, 0.1058, 0.3785, 0.0896], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0094, 0.0086, 0.0094, 0.0113, 0.0079, 0.0127, 0.0084], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0005, 0.0003], + device='cuda:2') +2022-12-07 22:07:10,037 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=71091.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:07:34,571 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.49 vs. limit=5.0 +2022-12-07 22:07:50,111 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8448, 1.4497, 1.6409, 1.3078, 1.4503, 0.8230, 1.6453, 1.7754], + device='cuda:2'), covar=tensor([0.0841, 0.1078, 0.0920, 0.2502, 0.1798, 0.0758, 0.0931, 0.0715], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0024, 0.0026, 0.0023, 0.0025, 0.0035, 0.0024, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 22:08:04,100 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.301e+01 2.256e+02 2.878e+02 3.587e+02 6.087e+02, threshold=5.757e+02, percent-clipped=3.0 +2022-12-07 22:08:14,733 INFO [train.py:873] (2/4) Epoch 10, batch 3100, loss[loss=0.1616, simple_loss=0.1499, pruned_loss=0.08668, over 1255.00 frames. ], tot_loss[loss=0.1378, simple_loss=0.1644, pruned_loss=0.05559, over 2010358.83 frames. ], batch size: 100, lr: 7.97e-03, grad_scale: 8.0 +2022-12-07 22:08:31,872 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8907, 2.6880, 2.3930, 2.5834, 2.8058, 2.8350, 2.8485, 2.8347], + device='cuda:2'), covar=tensor([0.1014, 0.0806, 0.2408, 0.2590, 0.1003, 0.0922, 0.1203, 0.0995], + device='cuda:2'), in_proj_covar=tensor([0.0356, 0.0245, 0.0413, 0.0533, 0.0310, 0.0402, 0.0382, 0.0347], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:09:38,773 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.252e+02 2.264e+02 3.028e+02 3.832e+02 6.668e+02, threshold=6.056e+02, percent-clipped=4.0 +2022-12-07 22:09:40,230 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-12-07 22:09:49,303 INFO [train.py:873] (2/4) Epoch 10, batch 3200, loss[loss=0.1349, simple_loss=0.1626, pruned_loss=0.05355, over 14428.00 frames. ], tot_loss[loss=0.1375, simple_loss=0.1641, pruned_loss=0.0555, over 2006607.58 frames. ], batch size: 73, lr: 7.96e-03, grad_scale: 8.0 +2022-12-07 22:10:48,876 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.8196, 5.5364, 5.0963, 5.8491, 5.3520, 4.9779, 5.8554, 5.6999], + device='cuda:2'), covar=tensor([0.0452, 0.0539, 0.0633, 0.0422, 0.0597, 0.0354, 0.0486, 0.0510], + device='cuda:2'), in_proj_covar=tensor([0.0126, 0.0120, 0.0129, 0.0136, 0.0132, 0.0110, 0.0153, 0.0130], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 22:10:59,384 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4795, 1.6098, 4.1359, 1.8795, 4.1421, 4.3254, 3.6424, 4.7528], + device='cuda:2'), covar=tensor([0.0162, 0.2794, 0.0402, 0.2207, 0.0357, 0.0374, 0.0463, 0.0130], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0156, 0.0156, 0.0167, 0.0168, 0.0168, 0.0133, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 22:11:13,450 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.358e+02 2.362e+02 2.792e+02 3.327e+02 7.945e+02, threshold=5.585e+02, percent-clipped=1.0 +2022-12-07 22:11:13,613 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=71348.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:11:24,232 INFO [train.py:873] (2/4) Epoch 10, batch 3300, loss[loss=0.1423, simple_loss=0.1689, pruned_loss=0.05781, over 14420.00 frames. ], tot_loss[loss=0.1376, simple_loss=0.1636, pruned_loss=0.05583, over 1973958.03 frames. ], batch size: 53, lr: 7.96e-03, grad_scale: 8.0 +2022-12-07 22:11:40,381 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6038, 1.6035, 2.8024, 2.1006, 2.7077, 1.7248, 2.2778, 2.5872], + device='cuda:2'), covar=tensor([0.1076, 0.4446, 0.0570, 0.3827, 0.0856, 0.3504, 0.1201, 0.0780], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0223, 0.0196, 0.0297, 0.0215, 0.0226, 0.0219, 0.0203], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:11:47,603 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.30 vs. limit=5.0 +2022-12-07 22:11:49,885 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=71386.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:11:55,971 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8954, 2.7211, 2.4367, 2.6069, 2.8035, 2.8020, 2.8250, 2.8512], + device='cuda:2'), covar=tensor([0.1075, 0.0890, 0.2649, 0.2808, 0.0939, 0.1100, 0.1650, 0.0979], + device='cuda:2'), in_proj_covar=tensor([0.0359, 0.0246, 0.0415, 0.0534, 0.0313, 0.0402, 0.0383, 0.0346], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:12:49,123 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.00 vs. limit=5.0 +2022-12-07 22:12:49,402 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.381e+02 2.167e+02 2.744e+02 3.343e+02 6.323e+02, threshold=5.487e+02, percent-clipped=2.0 +2022-12-07 22:12:59,541 INFO [train.py:873] (2/4) Epoch 10, batch 3400, loss[loss=0.1284, simple_loss=0.1591, pruned_loss=0.04888, over 14275.00 frames. ], tot_loss[loss=0.1363, simple_loss=0.163, pruned_loss=0.0548, over 2015069.50 frames. ], batch size: 57, lr: 7.95e-03, grad_scale: 8.0 +2022-12-07 22:13:51,714 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7789, 0.7947, 0.6083, 0.7072, 0.6519, 0.4735, 0.3538, 0.6387], + device='cuda:2'), covar=tensor([0.0145, 0.0137, 0.0129, 0.0112, 0.0122, 0.0304, 0.0246, 0.0285], + device='cuda:2'), in_proj_covar=tensor([0.0013, 0.0014, 0.0012, 0.0013, 0.0013, 0.0021, 0.0017, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.0289e-04, 1.1049e-04, 9.7170e-05, 1.0405e-04, 1.0157e-04, 1.5310e-04, + 1.2831e-04, 1.4770e-04], device='cuda:2') +2022-12-07 22:14:16,674 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2673, 1.8035, 2.2332, 1.8832, 2.3184, 2.1126, 2.0247, 2.0510], + device='cuda:2'), covar=tensor([0.0493, 0.1693, 0.0341, 0.0821, 0.0340, 0.0662, 0.0427, 0.0632], + device='cuda:2'), in_proj_covar=tensor([0.0338, 0.0316, 0.0395, 0.0305, 0.0371, 0.0312, 0.0358, 0.0315], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:14:22,583 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.256e+02 2.276e+02 2.825e+02 3.448e+02 6.629e+02, threshold=5.650e+02, percent-clipped=5.0 +2022-12-07 22:14:33,335 INFO [train.py:873] (2/4) Epoch 10, batch 3500, loss[loss=0.199, simple_loss=0.167, pruned_loss=0.1155, over 1293.00 frames. ], tot_loss[loss=0.1345, simple_loss=0.1617, pruned_loss=0.05368, over 1992683.81 frames. ], batch size: 100, lr: 7.95e-03, grad_scale: 8.0 +2022-12-07 22:14:38,688 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.01 vs. limit=5.0 +2022-12-07 22:15:15,299 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0508, 2.3528, 3.3116, 2.0057, 2.0835, 2.6310, 1.4668, 2.5767], + device='cuda:2'), covar=tensor([0.0599, 0.1304, 0.0490, 0.2623, 0.2215, 0.1129, 0.4240, 0.1279], + device='cuda:2'), in_proj_covar=tensor([0.0077, 0.0092, 0.0085, 0.0092, 0.0110, 0.0079, 0.0124, 0.0083], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 22:15:27,745 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=71616.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:15:57,136 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.705e+02 2.664e+02 3.016e+02 4.035e+02 8.090e+02, threshold=6.032e+02, percent-clipped=4.0 +2022-12-07 22:15:57,290 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=71648.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:16:07,709 INFO [train.py:873] (2/4) Epoch 10, batch 3600, loss[loss=0.1554, simple_loss=0.1757, pruned_loss=0.06758, over 14129.00 frames. ], tot_loss[loss=0.1342, simple_loss=0.1615, pruned_loss=0.05345, over 2002825.21 frames. ], batch size: 99, lr: 7.94e-03, grad_scale: 8.0 +2022-12-07 22:16:25,246 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=71677.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:16:33,289 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=71686.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:16:42,663 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=71696.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:17:18,937 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=71734.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:17:19,037 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7700, 0.8730, 0.6638, 0.7228, 0.7350, 0.4665, 0.3707, 0.6671], + device='cuda:2'), covar=tensor([0.0093, 0.0103, 0.0084, 0.0079, 0.0098, 0.0257, 0.0162, 0.0201], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0015, 0.0013, 0.0013, 0.0013, 0.0022, 0.0018, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.0475e-04, 1.1307e-04, 9.8373e-05, 1.0615e-04, 1.0339e-04, 1.5669e-04, + 1.3043e-04, 1.5041e-04], device='cuda:2') +2022-12-07 22:17:19,984 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4068, 1.5003, 4.2590, 2.1090, 4.1640, 4.2640, 3.8332, 4.7611], + device='cuda:2'), covar=tensor([0.0191, 0.2961, 0.0283, 0.2010, 0.0352, 0.0362, 0.0372, 0.0149], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0158, 0.0157, 0.0169, 0.0170, 0.0170, 0.0135, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 22:17:31,779 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.595e+01 2.418e+02 2.819e+02 3.742e+02 8.382e+02, threshold=5.637e+02, percent-clipped=6.0 +2022-12-07 22:17:42,531 INFO [train.py:873] (2/4) Epoch 10, batch 3700, loss[loss=0.1691, simple_loss=0.1803, pruned_loss=0.07896, over 9502.00 frames. ], tot_loss[loss=0.1359, simple_loss=0.1625, pruned_loss=0.05458, over 1999223.87 frames. ], batch size: 100, lr: 7.94e-03, grad_scale: 8.0 +2022-12-07 22:17:45,343 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1641, 4.2409, 4.6041, 3.8206, 4.3961, 4.5723, 1.5569, 4.0918], + device='cuda:2'), covar=tensor([0.0276, 0.0319, 0.0308, 0.0503, 0.0290, 0.0197, 0.3329, 0.0253], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0160, 0.0132, 0.0135, 0.0192, 0.0130, 0.0154, 0.0177], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 22:18:57,258 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0025, 1.9834, 1.9600, 2.1080, 1.9348, 1.7843, 1.2157, 1.7118], + device='cuda:2'), covar=tensor([0.0619, 0.0650, 0.0745, 0.0379, 0.0675, 0.1175, 0.2578, 0.0700], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0160, 0.0134, 0.0136, 0.0192, 0.0130, 0.0154, 0.0178], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 22:19:05,971 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.143e+02 2.175e+02 2.639e+02 3.434e+02 5.313e+02, threshold=5.277e+02, percent-clipped=0.0 +2022-12-07 22:19:16,067 INFO [train.py:873] (2/4) Epoch 10, batch 3800, loss[loss=0.1204, simple_loss=0.1613, pruned_loss=0.03973, over 14566.00 frames. ], tot_loss[loss=0.1355, simple_loss=0.1623, pruned_loss=0.05433, over 1976178.48 frames. ], batch size: 34, lr: 7.93e-03, grad_scale: 8.0 +2022-12-07 22:19:52,390 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-12-07 22:20:12,801 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=71919.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:20:15,402 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0929, 1.4948, 4.0403, 1.9477, 3.9483, 4.1105, 3.1176, 4.4567], + device='cuda:2'), covar=tensor([0.0209, 0.3108, 0.0324, 0.2097, 0.0385, 0.0405, 0.0631, 0.0150], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0159, 0.0158, 0.0170, 0.0172, 0.0171, 0.0136, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 22:20:18,111 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1759, 3.4944, 3.3322, 3.2839, 2.5661, 3.4736, 3.2293, 1.7296], + device='cuda:2'), covar=tensor([0.2080, 0.0496, 0.0941, 0.0912, 0.1081, 0.0497, 0.1343, 0.2752], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0073, 0.0059, 0.0062, 0.0089, 0.0071, 0.0092, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 22:20:33,789 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5377, 1.7443, 1.2347, 1.5946, 1.4542, 0.8330, 1.4156, 1.6893], + device='cuda:2'), covar=tensor([0.2210, 0.0612, 0.3519, 0.1013, 0.1380, 0.0964, 0.1625, 0.1010], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0027, 0.0023, 0.0025, 0.0036, 0.0025, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 22:20:40,196 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.242e+02 2.292e+02 2.855e+02 3.626e+02 5.894e+02, threshold=5.709e+02, percent-clipped=2.0 +2022-12-07 22:20:42,688 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0941, 3.5599, 2.7788, 4.3541, 4.1323, 4.1624, 3.6249, 2.9261], + device='cuda:2'), covar=tensor([0.0946, 0.1452, 0.4029, 0.0465, 0.0752, 0.1148, 0.1253, 0.3909], + device='cuda:2'), in_proj_covar=tensor([0.0263, 0.0297, 0.0270, 0.0245, 0.0299, 0.0289, 0.0258, 0.0258], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 22:20:43,464 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1448, 2.0164, 1.7767, 1.8793, 2.0695, 2.0969, 2.1097, 2.0827], + device='cuda:2'), covar=tensor([0.0995, 0.0787, 0.2728, 0.2741, 0.1081, 0.1166, 0.1378, 0.1039], + device='cuda:2'), in_proj_covar=tensor([0.0358, 0.0247, 0.0417, 0.0534, 0.0313, 0.0403, 0.0381, 0.0347], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:20:49,796 INFO [train.py:873] (2/4) Epoch 10, batch 3900, loss[loss=0.1603, simple_loss=0.1663, pruned_loss=0.07717, over 3899.00 frames. ], tot_loss[loss=0.1342, simple_loss=0.1612, pruned_loss=0.05359, over 1967934.97 frames. ], batch size: 100, lr: 7.93e-03, grad_scale: 4.0 +2022-12-07 22:21:01,560 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=71972.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:21:09,378 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=71980.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:21:49,006 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2516, 1.6421, 2.4773, 2.0313, 2.2898, 1.6512, 1.9097, 2.1988], + device='cuda:2'), covar=tensor([0.1809, 0.3088, 0.0374, 0.2219, 0.0917, 0.2007, 0.1293, 0.0833], + device='cuda:2'), in_proj_covar=tensor([0.0240, 0.0220, 0.0196, 0.0298, 0.0213, 0.0222, 0.0216, 0.0202], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:22:07,717 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-07 22:22:14,704 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.192e+02 2.367e+02 2.840e+02 3.454e+02 7.877e+02, threshold=5.680e+02, percent-clipped=2.0 +2022-12-07 22:22:21,640 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.17 vs. limit=2.0 +2022-12-07 22:22:23,929 INFO [train.py:873] (2/4) Epoch 10, batch 4000, loss[loss=0.1448, simple_loss=0.1694, pruned_loss=0.0601, over 11989.00 frames. ], tot_loss[loss=0.1343, simple_loss=0.1614, pruned_loss=0.0536, over 1958005.38 frames. ], batch size: 100, lr: 7.92e-03, grad_scale: 8.0 +2022-12-07 22:23:29,172 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=72129.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:23:29,193 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7656, 1.6468, 1.7854, 2.0157, 1.9222, 1.5146, 1.5708, 1.4724], + device='cuda:2'), covar=tensor([0.0576, 0.1725, 0.0546, 0.0590, 0.0649, 0.0523, 0.0440, 0.0625], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0015, 0.0013, 0.0014, 0.0014, 0.0022, 0.0018, 0.0023], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 22:23:45,802 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-07 22:23:48,077 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.453e+01 2.051e+02 2.497e+02 3.120e+02 6.082e+02, threshold=4.994e+02, percent-clipped=1.0 +2022-12-07 22:23:57,368 INFO [train.py:873] (2/4) Epoch 10, batch 4100, loss[loss=0.1628, simple_loss=0.1493, pruned_loss=0.08816, over 1218.00 frames. ], tot_loss[loss=0.136, simple_loss=0.1624, pruned_loss=0.05478, over 1929128.90 frames. ], batch size: 100, lr: 7.91e-03, grad_scale: 4.0 +2022-12-07 22:24:15,390 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-12-07 22:24:26,477 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=72190.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:25:22,724 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.251e+02 2.564e+02 3.159e+02 3.830e+02 6.918e+02, threshold=6.317e+02, percent-clipped=10.0 +2022-12-07 22:25:25,506 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9098, 3.7444, 3.5676, 3.9945, 3.6900, 3.4634, 4.0363, 3.3146], + device='cuda:2'), covar=tensor([0.0612, 0.0892, 0.0466, 0.0449, 0.0745, 0.1492, 0.0539, 0.0545], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0250, 0.0174, 0.0171, 0.0166, 0.0137, 0.0259, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 22:25:31,385 INFO [train.py:873] (2/4) Epoch 10, batch 4200, loss[loss=0.1215, simple_loss=0.1574, pruned_loss=0.04284, over 14269.00 frames. ], tot_loss[loss=0.1365, simple_loss=0.163, pruned_loss=0.05501, over 1993717.13 frames. ], batch size: 28, lr: 7.91e-03, grad_scale: 4.0 +2022-12-07 22:25:43,107 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8573, 3.9420, 4.2085, 3.5202, 3.9971, 4.0440, 1.7149, 3.7535], + device='cuda:2'), covar=tensor([0.0291, 0.0339, 0.0306, 0.0469, 0.0339, 0.0316, 0.2919, 0.0270], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0158, 0.0133, 0.0132, 0.0190, 0.0129, 0.0153, 0.0177], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 22:25:44,359 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=72272.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:25:46,883 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=72275.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:26:00,514 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4875, 1.4624, 4.0792, 1.8621, 4.1929, 4.3972, 3.8094, 4.7409], + device='cuda:2'), covar=tensor([0.0177, 0.3202, 0.0621, 0.2211, 0.0363, 0.0408, 0.0389, 0.0155], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0156, 0.0157, 0.0166, 0.0170, 0.0171, 0.0133, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 22:26:27,857 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=72320.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:26:33,212 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=72326.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:26:53,173 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.88 vs. limit=2.0 +2022-12-07 22:26:55,037 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.448e+02 2.250e+02 2.962e+02 3.836e+02 8.638e+02, threshold=5.923e+02, percent-clipped=4.0 +2022-12-07 22:27:03,694 INFO [train.py:873] (2/4) Epoch 10, batch 4300, loss[loss=0.152, simple_loss=0.1524, pruned_loss=0.07585, over 2661.00 frames. ], tot_loss[loss=0.1365, simple_loss=0.1633, pruned_loss=0.05489, over 1994705.79 frames. ], batch size: 100, lr: 7.90e-03, grad_scale: 4.0 +2022-12-07 22:27:05,066 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.15 vs. limit=2.0 +2022-12-07 22:27:27,862 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1224, 2.0633, 4.1428, 2.7128, 3.8984, 1.9879, 3.0385, 3.9774], + device='cuda:2'), covar=tensor([0.0679, 0.4681, 0.0424, 0.7550, 0.0716, 0.4047, 0.1458, 0.0417], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0220, 0.0198, 0.0301, 0.0216, 0.0225, 0.0219, 0.0202], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:27:29,473 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=72387.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:28:04,511 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9642, 1.9320, 1.5988, 1.9895, 1.9108, 1.9730, 1.8305, 1.7973], + device='cuda:2'), covar=tensor([0.0742, 0.0770, 0.1848, 0.0512, 0.0722, 0.0489, 0.1288, 0.0629], + device='cuda:2'), in_proj_covar=tensor([0.0266, 0.0301, 0.0273, 0.0248, 0.0305, 0.0292, 0.0260, 0.0259], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 22:28:26,475 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 2.139e+02 2.736e+02 3.399e+02 6.791e+02, threshold=5.473e+02, percent-clipped=1.0 +2022-12-07 22:28:34,597 INFO [train.py:873] (2/4) Epoch 10, batch 4400, loss[loss=0.1183, simple_loss=0.1579, pruned_loss=0.03936, over 14265.00 frames. ], tot_loss[loss=0.1357, simple_loss=0.163, pruned_loss=0.05424, over 1986281.14 frames. ], batch size: 31, lr: 7.90e-03, grad_scale: 8.0 +2022-12-07 22:28:58,335 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=72485.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:29:45,387 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6760, 1.6784, 1.8562, 1.4758, 1.6055, 1.4500, 1.2352, 1.0030], + device='cuda:2'), covar=tensor([0.0284, 0.0404, 0.0303, 0.0639, 0.0450, 0.0353, 0.0343, 0.0640], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0015, 0.0013, 0.0013, 0.0014, 0.0022, 0.0018, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.0542e-04, 1.1432e-04, 9.9101e-05, 1.0655e-04, 1.0460e-04, 1.5734e-04, + 1.3284e-04, 1.5234e-04], device='cuda:2') +2022-12-07 22:29:56,357 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.072e+02 2.111e+02 2.675e+02 3.314e+02 5.959e+02, threshold=5.350e+02, percent-clipped=1.0 +2022-12-07 22:30:03,710 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8166, 1.6370, 1.6858, 1.7384, 1.6055, 0.9398, 1.4786, 1.8336], + device='cuda:2'), covar=tensor([0.0746, 0.1100, 0.0541, 0.1367, 0.1282, 0.0787, 0.1384, 0.0891], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0026, 0.0024, 0.0025, 0.0036, 0.0025, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 22:30:05,239 INFO [train.py:873] (2/4) Epoch 10, batch 4500, loss[loss=0.1585, simple_loss=0.1692, pruned_loss=0.0739, over 7771.00 frames. ], tot_loss[loss=0.1347, simple_loss=0.162, pruned_loss=0.05369, over 1978218.87 frames. ], batch size: 100, lr: 7.89e-03, grad_scale: 8.0 +2022-12-07 22:30:19,391 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=72575.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:30:20,952 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.33 vs. limit=5.0 +2022-12-07 22:31:03,241 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=72623.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:31:07,357 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6000, 1.6515, 1.5423, 1.4199, 1.4222, 1.1751, 0.9120, 1.0635], + device='cuda:2'), covar=tensor([0.0138, 0.0242, 0.0255, 0.0181, 0.0204, 0.0300, 0.0260, 0.0419], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0015, 0.0012, 0.0013, 0.0014, 0.0022, 0.0018, 0.0022], + device='cuda:2'), out_proj_covar=tensor([1.0511e-04, 1.1379e-04, 9.8579e-05, 1.0591e-04, 1.0485e-04, 1.5624e-04, + 1.3138e-04, 1.5197e-04], device='cuda:2') +2022-12-07 22:31:18,196 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0839, 0.9396, 0.8872, 0.9921, 1.0316, 0.5153, 0.8264, 1.0889], + device='cuda:2'), covar=tensor([0.0368, 0.0545, 0.0289, 0.0417, 0.0320, 0.0427, 0.0914, 0.0477], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0025, 0.0027, 0.0024, 0.0025, 0.0037, 0.0025, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 22:31:27,222 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.124e+02 2.389e+02 3.143e+02 3.960e+02 1.878e+03, threshold=6.285e+02, percent-clipped=10.0 +2022-12-07 22:31:35,245 INFO [train.py:873] (2/4) Epoch 10, batch 4600, loss[loss=0.1396, simple_loss=0.1737, pruned_loss=0.0528, over 14636.00 frames. ], tot_loss[loss=0.1349, simple_loss=0.1625, pruned_loss=0.05366, over 2019755.00 frames. ], batch size: 22, lr: 7.89e-03, grad_scale: 8.0 +2022-12-07 22:31:56,344 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=72682.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:31:56,640 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.53 vs. limit=2.0 +2022-12-07 22:32:57,788 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.181e+02 2.006e+02 2.571e+02 3.192e+02 6.224e+02, threshold=5.142e+02, percent-clipped=0.0 +2022-12-07 22:33:06,369 INFO [train.py:873] (2/4) Epoch 10, batch 4700, loss[loss=0.1347, simple_loss=0.1395, pruned_loss=0.06496, over 3894.00 frames. ], tot_loss[loss=0.1337, simple_loss=0.1615, pruned_loss=0.0529, over 1994325.42 frames. ], batch size: 100, lr: 7.88e-03, grad_scale: 8.0 +2022-12-07 22:33:29,652 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=72785.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:33:37,788 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.10 vs. limit=5.0 +2022-12-07 22:34:12,941 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=72833.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:34:27,384 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.194e+02 2.219e+02 2.922e+02 3.522e+02 5.753e+02, threshold=5.843e+02, percent-clipped=6.0 +2022-12-07 22:34:35,287 INFO [train.py:873] (2/4) Epoch 10, batch 4800, loss[loss=0.1481, simple_loss=0.1633, pruned_loss=0.06644, over 5998.00 frames. ], tot_loss[loss=0.1329, simple_loss=0.1612, pruned_loss=0.05233, over 2015339.32 frames. ], batch size: 100, lr: 7.88e-03, grad_scale: 8.0 +2022-12-07 22:34:37,157 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3195, 3.4345, 3.5421, 3.2653, 3.4288, 3.2549, 1.4325, 3.2368], + device='cuda:2'), covar=tensor([0.0341, 0.0336, 0.0390, 0.0426, 0.0387, 0.0491, 0.3116, 0.0316], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0161, 0.0135, 0.0134, 0.0195, 0.0130, 0.0155, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 22:34:49,300 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5880, 1.6791, 4.2773, 2.1103, 4.2478, 4.3965, 4.0660, 4.9518], + device='cuda:2'), covar=tensor([0.0191, 0.2949, 0.0346, 0.2089, 0.0340, 0.0382, 0.0297, 0.0131], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0157, 0.0156, 0.0169, 0.0171, 0.0170, 0.0133, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 22:35:06,537 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=72894.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:35:21,177 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3226, 3.6376, 3.1447, 3.3711, 2.7210, 3.4393, 3.1587, 1.7067], + device='cuda:2'), covar=tensor([0.2147, 0.1176, 0.1221, 0.0679, 0.1049, 0.0642, 0.1731, 0.2776], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0075, 0.0061, 0.0063, 0.0090, 0.0072, 0.0093, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 22:35:38,410 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8348, 1.7237, 3.1344, 2.2489, 2.9286, 1.8235, 2.3953, 2.8659], + device='cuda:2'), covar=tensor([0.0937, 0.4035, 0.0498, 0.4262, 0.0807, 0.3272, 0.1179, 0.0576], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0222, 0.0198, 0.0302, 0.0216, 0.0227, 0.0222, 0.0205], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:35:56,441 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.266e+02 2.330e+02 2.761e+02 3.317e+02 8.665e+02, threshold=5.521e+02, percent-clipped=2.0 +2022-12-07 22:35:57,452 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1711, 3.1886, 3.3377, 3.1878, 3.2587, 2.9208, 1.4146, 3.0754], + device='cuda:2'), covar=tensor([0.0348, 0.0381, 0.0395, 0.0406, 0.0375, 0.0894, 0.3191, 0.0304], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0162, 0.0136, 0.0135, 0.0196, 0.0132, 0.0156, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 22:36:01,284 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=72955.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:36:04,982 INFO [train.py:873] (2/4) Epoch 10, batch 4900, loss[loss=0.141, simple_loss=0.166, pruned_loss=0.05806, over 6926.00 frames. ], tot_loss[loss=0.1347, simple_loss=0.1618, pruned_loss=0.05375, over 1972877.81 frames. ], batch size: 100, lr: 7.87e-03, grad_scale: 8.0 +2022-12-07 22:36:09,392 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=72964.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 22:36:24,861 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=72982.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:36:47,689 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0729, 2.0504, 4.5851, 4.2332, 4.2010, 4.6964, 4.1471, 4.6239], + device='cuda:2'), covar=tensor([0.1305, 0.1234, 0.0073, 0.0157, 0.0169, 0.0078, 0.0148, 0.0096], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0122, 0.0167, 0.0142, 0.0133, 0.0116, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 22:36:47,721 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9550, 3.3811, 3.1446, 3.1530, 2.4899, 3.4009, 3.1797, 1.5577], + device='cuda:2'), covar=tensor([0.2095, 0.0670, 0.1134, 0.0690, 0.0995, 0.0595, 0.0923, 0.2595], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0074, 0.0059, 0.0062, 0.0089, 0.0071, 0.0091, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 22:36:49,084 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-12-07 22:37:03,945 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=73025.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 22:37:07,508 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7734, 0.7372, 0.7674, 0.7840, 0.8606, 0.2540, 0.7036, 0.8748], + device='cuda:2'), covar=tensor([0.0297, 0.0515, 0.0406, 0.0396, 0.0316, 0.0263, 0.0691, 0.0569], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0025, 0.0027, 0.0024, 0.0026, 0.0037, 0.0025, 0.0027], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 22:37:08,119 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=73030.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:37:08,493 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.11 vs. limit=2.0 +2022-12-07 22:37:26,027 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.386e+02 2.275e+02 2.705e+02 3.390e+02 8.104e+02, threshold=5.410e+02, percent-clipped=1.0 +2022-12-07 22:37:33,745 INFO [train.py:873] (2/4) Epoch 10, batch 5000, loss[loss=0.123, simple_loss=0.1617, pruned_loss=0.04213, over 14317.00 frames. ], tot_loss[loss=0.1347, simple_loss=0.1618, pruned_loss=0.05385, over 1946339.47 frames. ], batch size: 28, lr: 7.87e-03, grad_scale: 8.0 +2022-12-07 22:37:52,938 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1600, 2.6983, 4.0161, 2.8353, 4.0299, 3.8810, 3.6974, 3.4265], + device='cuda:2'), covar=tensor([0.0667, 0.3191, 0.0924, 0.2145, 0.0710, 0.0838, 0.1908, 0.1914], + device='cuda:2'), in_proj_covar=tensor([0.0341, 0.0314, 0.0398, 0.0303, 0.0374, 0.0317, 0.0360, 0.0313], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:38:12,891 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.60 vs. limit=2.0 +2022-12-07 22:38:54,056 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.200e+01 2.088e+02 2.701e+02 3.421e+02 5.931e+02, threshold=5.402e+02, percent-clipped=1.0 +2022-12-07 22:39:01,816 INFO [train.py:873] (2/4) Epoch 10, batch 5100, loss[loss=0.1115, simple_loss=0.153, pruned_loss=0.03501, over 14243.00 frames. ], tot_loss[loss=0.135, simple_loss=0.1619, pruned_loss=0.05407, over 1911162.63 frames. ], batch size: 35, lr: 7.86e-03, grad_scale: 4.0 +2022-12-07 22:40:22,029 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=73250.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:40:22,701 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.094e+02 2.487e+02 3.025e+02 3.884e+02 1.180e+03, threshold=6.049e+02, percent-clipped=5.0 +2022-12-07 22:40:29,837 INFO [train.py:873] (2/4) Epoch 10, batch 5200, loss[loss=0.1651, simple_loss=0.1834, pruned_loss=0.07344, over 10372.00 frames. ], tot_loss[loss=0.1374, simple_loss=0.1635, pruned_loss=0.05569, over 1903124.18 frames. ], batch size: 100, lr: 7.85e-03, grad_scale: 8.0 +2022-12-07 22:41:00,596 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=73293.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:41:25,167 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=73320.0, num_to_drop=1, layers_to_drop={0} +2022-12-07 22:41:52,768 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.271e+02 2.362e+02 2.946e+02 3.664e+02 8.798e+02, threshold=5.892e+02, percent-clipped=2.0 +2022-12-07 22:41:54,679 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=73354.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:41:59,546 INFO [train.py:873] (2/4) Epoch 10, batch 5300, loss[loss=0.1735, simple_loss=0.1609, pruned_loss=0.09301, over 1207.00 frames. ], tot_loss[loss=0.1349, simple_loss=0.1623, pruned_loss=0.05379, over 1945176.70 frames. ], batch size: 100, lr: 7.85e-03, grad_scale: 4.0 +2022-12-07 22:42:28,626 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7793, 3.6219, 3.5345, 3.9343, 3.6465, 3.5722, 3.9407, 3.2637], + device='cuda:2'), covar=tensor([0.0926, 0.1012, 0.0451, 0.0421, 0.0783, 0.1096, 0.0549, 0.0586], + device='cuda:2'), in_proj_covar=tensor([0.0164, 0.0260, 0.0181, 0.0176, 0.0173, 0.0143, 0.0268, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 22:43:11,544 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.64 vs. limit=2.0 +2022-12-07 22:43:12,043 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2827, 5.0947, 4.9087, 5.3521, 4.8473, 4.6060, 5.3645, 5.1893], + device='cuda:2'), covar=tensor([0.0649, 0.0641, 0.0648, 0.0474, 0.0683, 0.0479, 0.0613, 0.0584], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0120, 0.0129, 0.0138, 0.0133, 0.0108, 0.0152, 0.0128], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 22:43:21,801 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.441e+02 2.215e+02 2.822e+02 3.427e+02 6.404e+02, threshold=5.643e+02, percent-clipped=2.0 +2022-12-07 22:43:27,865 INFO [train.py:873] (2/4) Epoch 10, batch 5400, loss[loss=0.1286, simple_loss=0.1609, pruned_loss=0.04811, over 14326.00 frames. ], tot_loss[loss=0.1345, simple_loss=0.1621, pruned_loss=0.05344, over 1926271.17 frames. ], batch size: 37, lr: 7.84e-03, grad_scale: 4.0 +2022-12-07 22:43:39,238 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.65 vs. limit=2.0 +2022-12-07 22:44:48,839 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=73550.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:44:50,661 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.270e+02 2.129e+02 2.539e+02 3.119e+02 4.865e+02, threshold=5.078e+02, percent-clipped=0.0 +2022-12-07 22:44:57,303 INFO [train.py:873] (2/4) Epoch 10, batch 5500, loss[loss=0.126, simple_loss=0.1402, pruned_loss=0.05594, over 4932.00 frames. ], tot_loss[loss=0.1341, simple_loss=0.1621, pruned_loss=0.0531, over 1948769.77 frames. ], batch size: 100, lr: 7.84e-03, grad_scale: 4.0 +2022-12-07 22:44:59,702 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.95 vs. limit=5.0 +2022-12-07 22:45:03,652 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.71 vs. limit=2.0 +2022-12-07 22:45:24,685 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=73590.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:45:31,978 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=73598.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:45:50,959 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=73620.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 22:46:13,451 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.42 vs. limit=5.0 +2022-12-07 22:46:17,403 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=73649.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:46:19,116 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=73651.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:46:19,731 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.890e+01 2.291e+02 2.969e+02 3.792e+02 1.060e+03, threshold=5.938e+02, percent-clipped=6.0 +2022-12-07 22:46:25,680 INFO [train.py:873] (2/4) Epoch 10, batch 5600, loss[loss=0.1116, simple_loss=0.15, pruned_loss=0.03664, over 14365.00 frames. ], tot_loss[loss=0.1344, simple_loss=0.162, pruned_loss=0.05345, over 1975558.63 frames. ], batch size: 55, lr: 7.83e-03, grad_scale: 8.0 +2022-12-07 22:46:33,860 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=73668.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 22:47:30,767 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=73732.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:47:48,145 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.384e+02 2.591e+02 3.035e+02 3.423e+02 1.031e+03, threshold=6.069e+02, percent-clipped=1.0 +2022-12-07 22:47:55,249 INFO [train.py:873] (2/4) Epoch 10, batch 5700, loss[loss=0.1471, simple_loss=0.1769, pruned_loss=0.05866, over 14114.00 frames. ], tot_loss[loss=0.1357, simple_loss=0.1627, pruned_loss=0.0543, over 1993117.49 frames. ], batch size: 29, lr: 7.83e-03, grad_scale: 8.0 +2022-12-07 22:47:59,540 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8561, 1.4483, 3.2121, 2.9792, 3.1017, 3.2647, 2.4012, 3.2090], + device='cuda:2'), covar=tensor([0.1111, 0.1334, 0.0124, 0.0278, 0.0232, 0.0119, 0.0364, 0.0151], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0160, 0.0123, 0.0166, 0.0142, 0.0133, 0.0117, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 22:48:02,224 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9892, 0.9875, 1.0594, 0.8139, 0.6981, 0.6658, 0.9079, 0.7978], + device='cuda:2'), covar=tensor([0.0242, 0.0208, 0.0188, 0.0276, 0.0277, 0.0593, 0.0369, 0.0585], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0015, 0.0013, 0.0014, 0.0014, 0.0023, 0.0018, 0.0023], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 22:48:24,558 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=73793.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:48:55,199 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9159, 4.0528, 4.2593, 3.6925, 4.1557, 4.3073, 1.6591, 3.9092], + device='cuda:2'), covar=tensor([0.0278, 0.0308, 0.0393, 0.0503, 0.0292, 0.0241, 0.3171, 0.0264], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0161, 0.0135, 0.0133, 0.0193, 0.0130, 0.0155, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 22:49:17,092 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.176e+02 2.156e+02 2.743e+02 3.331e+02 7.800e+02, threshold=5.486e+02, percent-clipped=2.0 +2022-12-07 22:49:23,293 INFO [train.py:873] (2/4) Epoch 10, batch 5800, loss[loss=0.1485, simple_loss=0.1663, pruned_loss=0.06531, over 11962.00 frames. ], tot_loss[loss=0.1351, simple_loss=0.1623, pruned_loss=0.05392, over 1955411.14 frames. ], batch size: 100, lr: 7.82e-03, grad_scale: 8.0 +2022-12-07 22:50:40,896 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=73946.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:50:44,020 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=73949.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:50:46,443 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.240e+02 2.192e+02 2.728e+02 3.266e+02 1.115e+03, threshold=5.456e+02, percent-clipped=1.0 +2022-12-07 22:50:52,926 INFO [train.py:873] (2/4) Epoch 10, batch 5900, loss[loss=0.1253, simple_loss=0.1617, pruned_loss=0.04449, over 14265.00 frames. ], tot_loss[loss=0.1346, simple_loss=0.1618, pruned_loss=0.05368, over 2020518.63 frames. ], batch size: 76, lr: 7.82e-03, grad_scale: 8.0 +2022-12-07 22:51:14,526 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-12-07 22:51:26,784 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=73997.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:51:43,169 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.17 vs. limit=2.0 +2022-12-07 22:51:45,021 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.73 vs. limit=2.0 +2022-12-07 22:52:03,078 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6400, 2.3979, 2.2319, 2.3722, 2.5379, 2.5383, 2.5941, 2.5721], + device='cuda:2'), covar=tensor([0.1070, 0.1045, 0.2477, 0.2851, 0.1174, 0.1328, 0.1529, 0.0959], + device='cuda:2'), in_proj_covar=tensor([0.0356, 0.0252, 0.0417, 0.0550, 0.0318, 0.0408, 0.0384, 0.0351], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:52:14,955 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 2.132e+02 2.894e+02 3.484e+02 7.158e+02, threshold=5.787e+02, percent-clipped=1.0 +2022-12-07 22:52:17,647 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2926, 2.0755, 2.6136, 1.8033, 1.6589, 2.4355, 1.3222, 2.3696], + device='cuda:2'), covar=tensor([0.1017, 0.1640, 0.0716, 0.1864, 0.2693, 0.0815, 0.4117, 0.0873], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0094, 0.0088, 0.0094, 0.0114, 0.0078, 0.0126, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0005, 0.0004], + device='cuda:2') +2022-12-07 22:52:20,978 INFO [train.py:873] (2/4) Epoch 10, batch 6000, loss[loss=0.1611, simple_loss=0.1767, pruned_loss=0.07276, over 13545.00 frames. ], tot_loss[loss=0.1358, simple_loss=0.1623, pruned_loss=0.05461, over 1951117.48 frames. ], batch size: 100, lr: 7.81e-03, grad_scale: 8.0 +2022-12-07 22:52:20,978 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 22:52:36,792 INFO [train.py:905] (2/4) Epoch 10, validation: loss=0.1251, simple_loss=0.167, pruned_loss=0.04163, over 857387.00 frames. +2022-12-07 22:52:36,793 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17779MB +2022-12-07 22:52:41,999 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74064.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:53:02,980 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=74088.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:53:35,925 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=74125.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:53:41,276 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1701, 2.0451, 4.3115, 3.0805, 4.0888, 2.1484, 3.2411, 4.0417], + device='cuda:2'), covar=tensor([0.0635, 0.4586, 0.0401, 0.6100, 0.0623, 0.3296, 0.1330, 0.0397], + device='cuda:2'), in_proj_covar=tensor([0.0243, 0.0218, 0.0194, 0.0295, 0.0216, 0.0221, 0.0217, 0.0200], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 22:53:42,020 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9681, 4.6355, 4.4999, 4.9494, 4.6526, 4.3165, 4.9230, 4.1244], + device='cuda:2'), covar=tensor([0.0326, 0.0945, 0.0361, 0.0477, 0.0783, 0.0578, 0.0552, 0.0525], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0259, 0.0181, 0.0178, 0.0173, 0.0144, 0.0268, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 22:54:00,210 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.199e+02 2.430e+02 2.883e+02 3.808e+02 1.343e+03, threshold=5.765e+02, percent-clipped=6.0 +2022-12-07 22:54:06,429 INFO [train.py:873] (2/4) Epoch 10, batch 6100, loss[loss=0.2152, simple_loss=0.2142, pruned_loss=0.1081, over 9533.00 frames. ], tot_loss[loss=0.1354, simple_loss=0.1622, pruned_loss=0.05428, over 1930884.82 frames. ], batch size: 100, lr: 7.81e-03, grad_scale: 8.0 +2022-12-07 22:54:27,218 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-07 22:55:18,956 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74240.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:55:24,044 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=74246.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:55:29,019 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.083e+02 2.250e+02 2.648e+02 3.376e+02 5.030e+02, threshold=5.296e+02, percent-clipped=0.0 +2022-12-07 22:55:35,300 INFO [train.py:873] (2/4) Epoch 10, batch 6200, loss[loss=0.1415, simple_loss=0.1661, pruned_loss=0.05845, over 11165.00 frames. ], tot_loss[loss=0.1338, simple_loss=0.1611, pruned_loss=0.05323, over 1920188.20 frames. ], batch size: 100, lr: 7.80e-03, grad_scale: 8.0 +2022-12-07 22:55:54,484 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3679, 2.0021, 2.3542, 2.4689, 2.2641, 1.9791, 2.4303, 2.1932], + device='cuda:2'), covar=tensor([0.0235, 0.0537, 0.0260, 0.0248, 0.0315, 0.0633, 0.0254, 0.0337], + device='cuda:2'), in_proj_covar=tensor([0.0270, 0.0242, 0.0361, 0.0309, 0.0250, 0.0291, 0.0284, 0.0272], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 22:56:06,633 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=74294.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:56:13,583 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=74301.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:56:58,218 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.014e+02 2.284e+02 2.758e+02 3.409e+02 5.517e+02, threshold=5.515e+02, percent-clipped=2.0 +2022-12-07 22:57:04,257 INFO [train.py:873] (2/4) Epoch 10, batch 6300, loss[loss=0.1383, simple_loss=0.1369, pruned_loss=0.06991, over 1237.00 frames. ], tot_loss[loss=0.134, simple_loss=0.1617, pruned_loss=0.05321, over 1960739.79 frames. ], batch size: 100, lr: 7.80e-03, grad_scale: 8.0 +2022-12-07 22:57:30,130 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=74388.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:57:33,396 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6725, 1.9452, 1.9491, 2.0578, 1.8340, 2.1589, 1.7768, 1.1321], + device='cuda:2'), covar=tensor([0.1433, 0.1135, 0.1383, 0.0891, 0.1364, 0.0788, 0.1698, 0.3179], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0074, 0.0059, 0.0062, 0.0089, 0.0071, 0.0092, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0007, 0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 22:57:44,544 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.43 vs. limit=5.0 +2022-12-07 22:57:52,531 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4055, 2.0998, 2.3161, 1.5137, 2.0490, 2.3054, 2.4068, 1.9564], + device='cuda:2'), covar=tensor([0.0665, 0.0810, 0.0903, 0.1733, 0.1128, 0.0676, 0.0466, 0.1507], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0186, 0.0135, 0.0126, 0.0133, 0.0140, 0.0114, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 22:57:58,125 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=74420.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:58:11,980 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=74436.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 22:58:25,308 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.400e+02 2.328e+02 2.936e+02 3.672e+02 8.689e+02, threshold=5.873e+02, percent-clipped=4.0 +2022-12-07 22:58:32,160 INFO [train.py:873] (2/4) Epoch 10, batch 6400, loss[loss=0.1628, simple_loss=0.1492, pruned_loss=0.08819, over 1208.00 frames. ], tot_loss[loss=0.1349, simple_loss=0.162, pruned_loss=0.05385, over 1970784.25 frames. ], batch size: 100, lr: 7.79e-03, grad_scale: 8.0 +2022-12-07 22:58:50,665 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0063, 2.6985, 2.7876, 1.8570, 2.5970, 2.6493, 3.1156, 2.4584], + device='cuda:2'), covar=tensor([0.0681, 0.0890, 0.0905, 0.1537, 0.0944, 0.0693, 0.0546, 0.1295], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0186, 0.0135, 0.0126, 0.0132, 0.0141, 0.0115, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 22:59:53,825 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.408e+02 2.247e+02 2.848e+02 3.548e+02 7.995e+02, threshold=5.696e+02, percent-clipped=2.0 +2022-12-07 22:59:59,603 INFO [train.py:873] (2/4) Epoch 10, batch 6500, loss[loss=0.09943, simple_loss=0.1395, pruned_loss=0.02969, over 13873.00 frames. ], tot_loss[loss=0.1349, simple_loss=0.1621, pruned_loss=0.05387, over 1992363.63 frames. ], batch size: 20, lr: 7.79e-03, grad_scale: 8.0 +2022-12-07 23:00:31,661 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74595.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:00:32,435 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=74596.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:00:56,873 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.11 vs. limit=2.0 +2022-12-07 23:01:09,600 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4666, 2.1767, 3.3989, 3.4876, 3.4200, 2.2901, 3.3917, 2.6823], + device='cuda:2'), covar=tensor([0.0322, 0.0772, 0.0626, 0.0392, 0.0315, 0.1081, 0.0356, 0.0701], + device='cuda:2'), in_proj_covar=tensor([0.0273, 0.0245, 0.0366, 0.0311, 0.0252, 0.0293, 0.0284, 0.0272], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:01:13,414 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.77 vs. limit=2.0 +2022-12-07 23:01:22,224 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.260e+02 2.214e+02 2.798e+02 3.905e+02 7.921e+02, threshold=5.597e+02, percent-clipped=3.0 +2022-12-07 23:01:25,285 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=74656.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:01:28,053 INFO [train.py:873] (2/4) Epoch 10, batch 6600, loss[loss=0.1332, simple_loss=0.1613, pruned_loss=0.05255, over 6932.00 frames. ], tot_loss[loss=0.1354, simple_loss=0.1621, pruned_loss=0.05432, over 1924758.46 frames. ], batch size: 100, lr: 7.78e-03, grad_scale: 4.0 +2022-12-07 23:02:22,396 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=74720.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:02:39,755 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-12-07 23:02:51,954 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.183e+02 2.052e+02 2.533e+02 3.147e+02 6.445e+02, threshold=5.065e+02, percent-clipped=3.0 +2022-12-07 23:02:57,100 INFO [train.py:873] (2/4) Epoch 10, batch 6700, loss[loss=0.1396, simple_loss=0.1651, pruned_loss=0.05703, over 14183.00 frames. ], tot_loss[loss=0.1352, simple_loss=0.1625, pruned_loss=0.054, over 1954853.12 frames. ], batch size: 99, lr: 7.78e-03, grad_scale: 4.0 +2022-12-07 23:03:05,000 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=74768.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:03:46,760 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-07 23:04:19,048 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.400e+02 2.393e+02 2.874e+02 3.943e+02 8.045e+02, threshold=5.748e+02, percent-clipped=8.0 +2022-12-07 23:04:24,814 INFO [train.py:873] (2/4) Epoch 10, batch 6800, loss[loss=0.1472, simple_loss=0.1706, pruned_loss=0.06192, over 14593.00 frames. ], tot_loss[loss=0.1342, simple_loss=0.1617, pruned_loss=0.05341, over 1970924.71 frames. ], batch size: 23, lr: 7.77e-03, grad_scale: 8.0 +2022-12-07 23:04:28,713 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1270, 2.4079, 4.1766, 4.1470, 4.1620, 2.4387, 4.2742, 3.3050], + device='cuda:2'), covar=tensor([0.0289, 0.0778, 0.0599, 0.0336, 0.0275, 0.1235, 0.0251, 0.0655], + device='cuda:2'), in_proj_covar=tensor([0.0273, 0.0246, 0.0365, 0.0308, 0.0252, 0.0293, 0.0284, 0.0273], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:04:56,616 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=74896.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:05:08,605 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74909.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:05:38,779 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=74944.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:05:45,361 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=74951.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:05:47,782 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.213e+02 2.283e+02 2.713e+02 3.632e+02 6.989e+02, threshold=5.426e+02, percent-clipped=3.0 +2022-12-07 23:05:52,206 INFO [train.py:873] (2/4) Epoch 10, batch 6900, loss[loss=0.1249, simple_loss=0.1407, pruned_loss=0.05454, over 3881.00 frames. ], tot_loss[loss=0.1335, simple_loss=0.161, pruned_loss=0.05303, over 1961777.89 frames. ], batch size: 100, lr: 7.77e-03, grad_scale: 4.0 +2022-12-07 23:05:56,660 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=74964.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:06:01,553 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=74970.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:06:01,832 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-07 23:06:27,054 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.73 vs. limit=5.0 +2022-12-07 23:06:52,420 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=75025.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:07:16,831 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 2.278e+02 2.897e+02 3.473e+02 8.232e+02, threshold=5.794e+02, percent-clipped=1.0 +2022-12-07 23:07:21,375 INFO [train.py:873] (2/4) Epoch 10, batch 7000, loss[loss=0.1678, simple_loss=0.185, pruned_loss=0.07526, over 9497.00 frames. ], tot_loss[loss=0.1346, simple_loss=0.1619, pruned_loss=0.05364, over 1940747.89 frames. ], batch size: 100, lr: 7.76e-03, grad_scale: 4.0 +2022-12-07 23:08:39,655 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-12-07 23:08:45,734 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.467e+02 2.185e+02 2.832e+02 3.460e+02 7.150e+02, threshold=5.664e+02, percent-clipped=2.0 +2022-12-07 23:08:49,892 INFO [train.py:873] (2/4) Epoch 10, batch 7100, loss[loss=0.1431, simple_loss=0.1453, pruned_loss=0.07045, over 2671.00 frames. ], tot_loss[loss=0.134, simple_loss=0.1617, pruned_loss=0.05314, over 2001893.73 frames. ], batch size: 100, lr: 7.76e-03, grad_scale: 4.0 +2022-12-07 23:09:09,142 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5330, 3.7121, 3.8522, 3.4884, 3.6937, 3.7617, 1.3873, 3.5096], + device='cuda:2'), covar=tensor([0.0329, 0.0328, 0.0390, 0.0485, 0.0340, 0.0407, 0.3343, 0.0303], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0160, 0.0133, 0.0133, 0.0191, 0.0129, 0.0153, 0.0179], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:09:51,380 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=75228.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:10:09,433 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9892, 1.3665, 3.8049, 1.5721, 3.7902, 3.9897, 3.0472, 4.3217], + device='cuda:2'), covar=tensor([0.0254, 0.3580, 0.0600, 0.2643, 0.0506, 0.0449, 0.0713, 0.0217], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0158, 0.0158, 0.0169, 0.0170, 0.0174, 0.0132, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 23:10:11,098 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75251.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:10:13,567 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.083e+02 2.322e+02 2.903e+02 3.521e+02 5.847e+02, threshold=5.807e+02, percent-clipped=2.0 +2022-12-07 23:10:18,336 INFO [train.py:873] (2/4) Epoch 10, batch 7200, loss[loss=0.1188, simple_loss=0.1555, pruned_loss=0.04107, over 13871.00 frames. ], tot_loss[loss=0.134, simple_loss=0.1617, pruned_loss=0.05316, over 1978223.80 frames. ], batch size: 23, lr: 7.75e-03, grad_scale: 8.0 +2022-12-07 23:10:23,942 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=75265.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:10:28,267 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=75270.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:10:44,796 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=75289.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:10:53,491 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75299.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:11:06,249 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3733, 4.0638, 4.0096, 4.3402, 4.0173, 3.4699, 4.3699, 4.2118], + device='cuda:2'), covar=tensor([0.0593, 0.0807, 0.0793, 0.0616, 0.0802, 0.0877, 0.0778, 0.0771], + device='cuda:2'), in_proj_covar=tensor([0.0129, 0.0124, 0.0134, 0.0141, 0.0135, 0.0112, 0.0158, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 23:11:11,934 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=75320.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:11:21,425 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=75331.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 23:11:41,267 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.97 vs. limit=5.0 +2022-12-07 23:11:41,652 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.115e+02 2.194e+02 2.882e+02 3.919e+02 8.679e+02, threshold=5.764e+02, percent-clipped=4.0 +2022-12-07 23:11:42,699 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9902, 1.9735, 1.7400, 2.0273, 1.8806, 1.9973, 1.8507, 1.8106], + device='cuda:2'), covar=tensor([0.0857, 0.0978, 0.1915, 0.0596, 0.0846, 0.0633, 0.1497, 0.0770], + device='cuda:2'), in_proj_covar=tensor([0.0267, 0.0301, 0.0269, 0.0249, 0.0309, 0.0295, 0.0256, 0.0257], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 23:11:46,063 INFO [train.py:873] (2/4) Epoch 10, batch 7300, loss[loss=0.1624, simple_loss=0.1555, pruned_loss=0.08468, over 2658.00 frames. ], tot_loss[loss=0.1338, simple_loss=0.1615, pruned_loss=0.05309, over 1955256.54 frames. ], batch size: 100, lr: 7.75e-03, grad_scale: 8.0 +2022-12-07 23:11:55,223 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.31 vs. limit=5.0 +2022-12-07 23:12:33,110 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.78 vs. limit=2.0 +2022-12-07 23:12:58,431 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2001, 2.9159, 2.3677, 3.3044, 3.0880, 3.1467, 2.7752, 2.3155], + device='cuda:2'), covar=tensor([0.0685, 0.1395, 0.3039, 0.0583, 0.0895, 0.0942, 0.1392, 0.3387], + device='cuda:2'), in_proj_covar=tensor([0.0264, 0.0295, 0.0265, 0.0244, 0.0304, 0.0289, 0.0251, 0.0255], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 23:13:09,406 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.408e+02 2.421e+02 2.835e+02 3.319e+02 6.698e+02, threshold=5.670e+02, percent-clipped=3.0 +2022-12-07 23:13:13,358 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8015, 1.7985, 2.1736, 1.8274, 1.6329, 1.0862, 1.7642, 1.9965], + device='cuda:2'), covar=tensor([0.0790, 0.0768, 0.0767, 0.1890, 0.1647, 0.0719, 0.0935, 0.0711], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0025, 0.0027, 0.0024, 0.0025, 0.0037, 0.0025, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 23:13:14,008 INFO [train.py:873] (2/4) Epoch 10, batch 7400, loss[loss=0.1214, simple_loss=0.1536, pruned_loss=0.04467, over 14269.00 frames. ], tot_loss[loss=0.1345, simple_loss=0.1617, pruned_loss=0.05363, over 1933711.07 frames. ], batch size: 28, lr: 7.74e-03, grad_scale: 8.0 +2022-12-07 23:13:21,985 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-07 23:13:23,494 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=10.71 vs. limit=5.0 +2022-12-07 23:14:38,882 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.227e+02 2.214e+02 2.894e+02 3.789e+02 9.059e+02, threshold=5.789e+02, percent-clipped=5.0 +2022-12-07 23:14:42,301 INFO [train.py:873] (2/4) Epoch 10, batch 7500, loss[loss=0.1405, simple_loss=0.1511, pruned_loss=0.06493, over 3921.00 frames. ], tot_loss[loss=0.1342, simple_loss=0.1613, pruned_loss=0.05359, over 1934219.21 frames. ], batch size: 100, lr: 7.73e-03, grad_scale: 4.0 +2022-12-07 23:14:47,596 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75565.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:14:50,063 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5659, 1.4242, 2.7194, 1.4450, 2.7510, 2.7183, 2.0007, 2.8547], + device='cuda:2'), covar=tensor([0.0281, 0.2421, 0.0359, 0.1840, 0.0373, 0.0489, 0.0940, 0.0263], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0156, 0.0156, 0.0168, 0.0170, 0.0174, 0.0131, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 23:15:04,142 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=75584.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:15:20,140 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4197, 1.0453, 1.3118, 0.8890, 1.0364, 1.3799, 1.0931, 1.0877], + device='cuda:2'), covar=tensor([0.0403, 0.0683, 0.0567, 0.0422, 0.0876, 0.0536, 0.0353, 0.1141], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0183, 0.0134, 0.0125, 0.0133, 0.0140, 0.0114, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 23:15:21,335 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-12-07 23:15:21,604 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8525, 1.5711, 2.1239, 1.6980, 1.9673, 1.4483, 1.7205, 1.9379], + device='cuda:2'), covar=tensor([0.2082, 0.2132, 0.0326, 0.1712, 0.0962, 0.1189, 0.0831, 0.0541], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0222, 0.0200, 0.0298, 0.0220, 0.0226, 0.0221, 0.0209], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:15:24,291 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75613.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:16:08,895 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75620.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:16:09,627 INFO [train.py:873] (2/4) Epoch 11, batch 0, loss[loss=0.1457, simple_loss=0.1736, pruned_loss=0.05883, over 14393.00 frames. ], tot_loss[loss=0.1457, simple_loss=0.1736, pruned_loss=0.05883, over 14393.00 frames. ], batch size: 53, lr: 7.38e-03, grad_scale: 8.0 +2022-12-07 23:16:09,627 INFO [train.py:896] (2/4) Computing validation loss +2022-12-07 23:16:16,872 INFO [train.py:905] (2/4) Epoch 11, validation: loss=0.1341, simple_loss=0.1756, pruned_loss=0.0463, over 857387.00 frames. +2022-12-07 23:16:16,873 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17779MB +2022-12-07 23:16:21,386 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=75626.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 23:16:47,142 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.744e+01 1.774e+02 2.985e+02 4.009e+02 1.076e+03, threshold=5.970e+02, percent-clipped=9.0 +2022-12-07 23:16:49,870 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0401, 2.0996, 4.0394, 2.6990, 3.9171, 2.1772, 3.0634, 3.9101], + device='cuda:2'), covar=tensor([0.0613, 0.4778, 0.0507, 0.7072, 0.0567, 0.3561, 0.1376, 0.0452], + device='cuda:2'), in_proj_covar=tensor([0.0247, 0.0221, 0.0200, 0.0297, 0.0220, 0.0224, 0.0220, 0.0208], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:16:53,314 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3613, 2.0435, 2.7521, 2.0152, 1.6723, 2.6001, 1.1594, 2.4604], + device='cuda:2'), covar=tensor([0.1297, 0.2558, 0.0948, 0.1704, 0.3374, 0.1164, 0.5874, 0.1245], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0095, 0.0089, 0.0096, 0.0115, 0.0081, 0.0127, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0005, 0.0004], + device='cuda:2') +2022-12-07 23:16:58,378 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75668.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:17:45,592 INFO [train.py:873] (2/4) Epoch 11, batch 100, loss[loss=0.1248, simple_loss=0.1594, pruned_loss=0.0451, over 14196.00 frames. ], tot_loss[loss=0.1308, simple_loss=0.1602, pruned_loss=0.05064, over 908058.43 frames. ], batch size: 89, lr: 7.38e-03, grad_scale: 8.0 +2022-12-07 23:17:57,376 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.76 vs. limit=2.0 +2022-12-07 23:18:04,386 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0126, 1.8372, 4.5347, 4.1580, 4.1091, 4.6086, 4.3027, 4.5421], + device='cuda:2'), covar=tensor([0.1377, 0.1312, 0.0078, 0.0143, 0.0186, 0.0091, 0.0106, 0.0104], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0159, 0.0124, 0.0167, 0.0142, 0.0135, 0.0117, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:18:11,546 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8935, 1.7193, 2.0043, 1.6371, 1.6206, 0.9743, 1.8300, 1.9792], + device='cuda:2'), covar=tensor([0.0915, 0.1292, 0.1366, 0.1533, 0.2177, 0.0890, 0.1341, 0.0955], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0027, 0.0024, 0.0025, 0.0037, 0.0026, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 23:18:14,579 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.558e+02 2.145e+02 2.643e+02 3.199e+02 5.049e+02, threshold=5.286e+02, percent-clipped=0.0 +2022-12-07 23:18:59,083 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7157, 1.1173, 1.3229, 1.2679, 1.0442, 1.2989, 1.1536, 0.9577], + device='cuda:2'), covar=tensor([0.2505, 0.0848, 0.0396, 0.0501, 0.1603, 0.0767, 0.2190, 0.1111], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0074, 0.0059, 0.0064, 0.0090, 0.0072, 0.0093, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-07 23:19:05,341 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.64 vs. limit=2.0 +2022-12-07 23:19:13,452 INFO [train.py:873] (2/4) Epoch 11, batch 200, loss[loss=0.1225, simple_loss=0.1571, pruned_loss=0.04394, over 14277.00 frames. ], tot_loss[loss=0.1314, simple_loss=0.1607, pruned_loss=0.05104, over 1377563.00 frames. ], batch size: 28, lr: 7.37e-03, grad_scale: 8.0 +2022-12-07 23:19:20,578 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1385, 2.0813, 4.8394, 4.3670, 4.2741, 4.9117, 4.6546, 4.8839], + device='cuda:2'), covar=tensor([0.1286, 0.1258, 0.0070, 0.0165, 0.0176, 0.0084, 0.0090, 0.0089], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0159, 0.0124, 0.0166, 0.0142, 0.0135, 0.0117, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:19:32,304 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6536, 2.4415, 2.2037, 2.2943, 2.5307, 2.5410, 2.5915, 2.5604], + device='cuda:2'), covar=tensor([0.0857, 0.0679, 0.2318, 0.2485, 0.0989, 0.0975, 0.1165, 0.0938], + device='cuda:2'), in_proj_covar=tensor([0.0352, 0.0244, 0.0419, 0.0539, 0.0312, 0.0400, 0.0380, 0.0348], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:19:43,431 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.111e+02 2.275e+02 2.829e+02 3.715e+02 6.696e+02, threshold=5.659e+02, percent-clipped=6.0 +2022-12-07 23:19:51,462 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-12-07 23:20:08,908 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75884.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:20:41,222 INFO [train.py:873] (2/4) Epoch 11, batch 300, loss[loss=0.1476, simple_loss=0.1739, pruned_loss=0.06063, over 13522.00 frames. ], tot_loss[loss=0.1312, simple_loss=0.1601, pruned_loss=0.05116, over 1615963.68 frames. ], batch size: 100, lr: 7.37e-03, grad_scale: 8.0 +2022-12-07 23:20:46,321 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=75926.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 23:20:51,520 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75932.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:21:11,324 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.372e+02 2.182e+02 2.664e+02 3.344e+02 7.501e+02, threshold=5.329e+02, percent-clipped=3.0 +2022-12-07 23:21:16,329 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9121, 1.5731, 1.9016, 2.0590, 1.5164, 1.7710, 1.9074, 1.9454], + device='cuda:2'), covar=tensor([0.0120, 0.0175, 0.0112, 0.0112, 0.0244, 0.0256, 0.0140, 0.0110], + device='cuda:2'), in_proj_covar=tensor([0.0276, 0.0246, 0.0365, 0.0311, 0.0255, 0.0298, 0.0287, 0.0275], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:21:22,823 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0905, 2.4512, 3.8149, 2.7065, 3.9363, 3.8085, 3.6555, 3.1308], + device='cuda:2'), covar=tensor([0.0748, 0.3219, 0.1105, 0.2297, 0.0955, 0.0867, 0.1410, 0.2021], + device='cuda:2'), in_proj_covar=tensor([0.0344, 0.0317, 0.0395, 0.0302, 0.0375, 0.0318, 0.0357, 0.0317], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:21:28,769 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=75974.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:21:51,858 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4252, 3.8866, 3.2810, 4.8346, 4.1454, 4.4478, 4.0659, 3.4589], + device='cuda:2'), covar=tensor([0.0780, 0.1248, 0.3574, 0.0386, 0.1361, 0.1720, 0.1149, 0.2767], + device='cuda:2'), in_proj_covar=tensor([0.0265, 0.0297, 0.0269, 0.0247, 0.0306, 0.0291, 0.0254, 0.0256], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 23:22:10,197 INFO [train.py:873] (2/4) Epoch 11, batch 400, loss[loss=0.1072, simple_loss=0.1432, pruned_loss=0.03564, over 14010.00 frames. ], tot_loss[loss=0.1305, simple_loss=0.1595, pruned_loss=0.05076, over 1803677.02 frames. ], batch size: 22, lr: 7.36e-03, grad_scale: 8.0 +2022-12-07 23:22:40,269 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.302e+02 2.357e+02 2.915e+02 3.546e+02 6.707e+02, threshold=5.830e+02, percent-clipped=5.0 +2022-12-07 23:23:07,529 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1095, 1.4096, 3.8948, 1.6979, 3.9594, 4.1987, 3.2370, 4.4388], + device='cuda:2'), covar=tensor([0.0195, 0.3176, 0.0386, 0.2339, 0.0335, 0.0335, 0.0642, 0.0159], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0156, 0.0157, 0.0168, 0.0171, 0.0174, 0.0132, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 23:23:38,046 INFO [train.py:873] (2/4) Epoch 11, batch 500, loss[loss=0.1481, simple_loss=0.1742, pruned_loss=0.06097, over 14295.00 frames. ], tot_loss[loss=0.1314, simple_loss=0.16, pruned_loss=0.05139, over 1855735.01 frames. ], batch size: 80, lr: 7.36e-03, grad_scale: 8.0 +2022-12-07 23:24:07,887 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.432e+01 2.214e+02 2.601e+02 3.343e+02 5.170e+02, threshold=5.201e+02, percent-clipped=0.0 +2022-12-07 23:24:43,340 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3458, 1.6655, 2.6430, 2.0840, 2.4329, 1.6640, 2.0497, 2.4094], + device='cuda:2'), covar=tensor([0.1337, 0.4168, 0.0516, 0.3251, 0.0960, 0.3234, 0.1100, 0.0672], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0222, 0.0201, 0.0297, 0.0217, 0.0222, 0.0216, 0.0209], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:25:05,642 INFO [train.py:873] (2/4) Epoch 11, batch 600, loss[loss=0.1296, simple_loss=0.1624, pruned_loss=0.04842, over 14220.00 frames. ], tot_loss[loss=0.1334, simple_loss=0.161, pruned_loss=0.05297, over 1849098.70 frames. ], batch size: 80, lr: 7.35e-03, grad_scale: 8.0 +2022-12-07 23:25:08,436 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2842, 2.2117, 4.3434, 2.8782, 4.1096, 2.0222, 3.1835, 4.0518], + device='cuda:2'), covar=tensor([0.0575, 0.5005, 0.0421, 0.8455, 0.0550, 0.4234, 0.1499, 0.0392], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0222, 0.0201, 0.0297, 0.0217, 0.0223, 0.0217, 0.0209], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:25:15,595 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-12-07 23:25:35,308 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.207e+02 2.199e+02 2.747e+02 3.310e+02 7.404e+02, threshold=5.494e+02, percent-clipped=4.0 +2022-12-07 23:25:47,480 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=76269.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 23:26:09,598 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=76294.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:26:24,160 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5971, 2.9191, 4.3636, 3.3563, 4.5512, 4.2536, 4.1467, 3.8606], + device='cuda:2'), covar=tensor([0.0541, 0.2675, 0.0891, 0.1677, 0.0672, 0.0756, 0.1646, 0.1777], + device='cuda:2'), in_proj_covar=tensor([0.0348, 0.0320, 0.0399, 0.0305, 0.0382, 0.0322, 0.0360, 0.0316], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:26:32,274 INFO [train.py:873] (2/4) Epoch 11, batch 700, loss[loss=0.1777, simple_loss=0.1586, pruned_loss=0.09842, over 1217.00 frames. ], tot_loss[loss=0.1315, simple_loss=0.1596, pruned_loss=0.05173, over 1882140.88 frames. ], batch size: 100, lr: 7.35e-03, grad_scale: 8.0 +2022-12-07 23:26:34,406 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=76323.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 23:26:40,828 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=76330.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 23:26:57,433 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4230, 1.0944, 1.2772, 0.9035, 1.2195, 1.4264, 1.1351, 1.0543], + device='cuda:2'), covar=tensor([0.0377, 0.0765, 0.0602, 0.0535, 0.0731, 0.0512, 0.0388, 0.1209], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0183, 0.0135, 0.0125, 0.0135, 0.0140, 0.0115, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0005, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 23:27:01,830 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.209e+02 2.063e+02 2.502e+02 2.955e+02 5.307e+02, threshold=5.004e+02, percent-clipped=0.0 +2022-12-07 23:27:02,095 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=76355.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:27:27,286 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=76384.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 23:27:59,960 INFO [train.py:873] (2/4) Epoch 11, batch 800, loss[loss=0.1104, simple_loss=0.1462, pruned_loss=0.0373, over 14538.00 frames. ], tot_loss[loss=0.1323, simple_loss=0.1602, pruned_loss=0.0522, over 1985884.29 frames. ], batch size: 34, lr: 7.34e-03, grad_scale: 8.0 +2022-12-07 23:28:29,451 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.454e+02 2.357e+02 2.806e+02 3.598e+02 6.382e+02, threshold=5.612e+02, percent-clipped=5.0 +2022-12-07 23:28:38,571 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.10 vs. limit=5.0 +2022-12-07 23:29:20,863 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5082, 2.2747, 4.5485, 3.0213, 4.2539, 2.0665, 3.4454, 4.3157], + device='cuda:2'), covar=tensor([0.0443, 0.4202, 0.0287, 0.7161, 0.0492, 0.3519, 0.1112, 0.0354], + device='cuda:2'), in_proj_covar=tensor([0.0244, 0.0219, 0.0199, 0.0292, 0.0216, 0.0221, 0.0216, 0.0209], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:29:26,690 INFO [train.py:873] (2/4) Epoch 11, batch 900, loss[loss=0.1746, simple_loss=0.1585, pruned_loss=0.09537, over 1247.00 frames. ], tot_loss[loss=0.1326, simple_loss=0.1606, pruned_loss=0.05229, over 1936885.58 frames. ], batch size: 100, lr: 7.34e-03, grad_scale: 8.0 +2022-12-07 23:29:56,493 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.334e+02 2.233e+02 2.846e+02 3.366e+02 5.399e+02, threshold=5.691e+02, percent-clipped=0.0 +2022-12-07 23:30:38,302 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0837, 4.8673, 4.4866, 4.6635, 4.7547, 4.9762, 5.0910, 5.0932], + device='cuda:2'), covar=tensor([0.0832, 0.0455, 0.2176, 0.2735, 0.0655, 0.0881, 0.0786, 0.0699], + device='cuda:2'), in_proj_covar=tensor([0.0358, 0.0243, 0.0413, 0.0536, 0.0311, 0.0398, 0.0382, 0.0342], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:30:54,356 INFO [train.py:873] (2/4) Epoch 11, batch 1000, loss[loss=0.1503, simple_loss=0.165, pruned_loss=0.06777, over 5958.00 frames. ], tot_loss[loss=0.1301, simple_loss=0.159, pruned_loss=0.05057, over 2010127.55 frames. ], batch size: 100, lr: 7.33e-03, grad_scale: 8.0 +2022-12-07 23:30:57,807 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=76625.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 23:31:19,967 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=76650.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:31:24,209 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.288e+02 2.175e+02 2.720e+02 3.257e+02 6.419e+02, threshold=5.440e+02, percent-clipped=2.0 +2022-12-07 23:31:44,651 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=76679.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 23:32:18,901 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9990, 1.9027, 1.7074, 1.9965, 1.8498, 1.9208, 1.7892, 1.6627], + device='cuda:2'), covar=tensor([0.0914, 0.0937, 0.1734, 0.0592, 0.0802, 0.0628, 0.1703, 0.0673], + device='cuda:2'), in_proj_covar=tensor([0.0265, 0.0296, 0.0268, 0.0247, 0.0307, 0.0291, 0.0256, 0.0253], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-07 23:32:21,331 INFO [train.py:873] (2/4) Epoch 11, batch 1100, loss[loss=0.1372, simple_loss=0.1551, pruned_loss=0.05968, over 6006.00 frames. ], tot_loss[loss=0.1306, simple_loss=0.1595, pruned_loss=0.05083, over 2064405.75 frames. ], batch size: 100, lr: 7.33e-03, grad_scale: 8.0 +2022-12-07 23:32:50,792 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.125e+02 2.272e+02 2.937e+02 3.615e+02 8.026e+02, threshold=5.874e+02, percent-clipped=3.0 +2022-12-07 23:33:37,362 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8084, 1.5024, 3.1177, 2.8654, 3.0577, 3.1293, 2.2922, 3.1207], + device='cuda:2'), covar=tensor([0.1182, 0.1291, 0.0125, 0.0298, 0.0240, 0.0144, 0.0382, 0.0158], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0159, 0.0123, 0.0166, 0.0140, 0.0134, 0.0116, 0.0116], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:33:49,947 INFO [train.py:873] (2/4) Epoch 11, batch 1200, loss[loss=0.1705, simple_loss=0.1597, pruned_loss=0.09064, over 2627.00 frames. ], tot_loss[loss=0.1311, simple_loss=0.1599, pruned_loss=0.0512, over 2049457.33 frames. ], batch size: 100, lr: 7.32e-03, grad_scale: 8.0 +2022-12-07 23:34:05,479 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=76839.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:34:19,397 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.367e+02 2.391e+02 2.914e+02 3.659e+02 7.427e+02, threshold=5.827e+02, percent-clipped=2.0 +2022-12-07 23:34:41,032 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.78 vs. limit=2.0 +2022-12-07 23:34:59,297 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=76900.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:35:05,291 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5176, 3.2582, 3.2355, 3.4847, 3.3309, 3.4722, 3.5239, 2.9677], + device='cuda:2'), covar=tensor([0.0436, 0.1043, 0.0519, 0.0510, 0.0790, 0.0384, 0.0676, 0.0612], + device='cuda:2'), in_proj_covar=tensor([0.0163, 0.0258, 0.0182, 0.0177, 0.0173, 0.0142, 0.0264, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 23:35:17,349 INFO [train.py:873] (2/4) Epoch 11, batch 1300, loss[loss=0.1128, simple_loss=0.1506, pruned_loss=0.03744, over 14030.00 frames. ], tot_loss[loss=0.131, simple_loss=0.1595, pruned_loss=0.05128, over 1989536.72 frames. ], batch size: 22, lr: 7.32e-03, grad_scale: 8.0 +2022-12-07 23:35:21,012 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=76925.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 23:35:42,625 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=76950.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:35:46,323 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([6.0067, 5.6345, 5.4268, 6.1219, 5.5164, 5.2361, 6.1351, 5.8335], + device='cuda:2'), covar=tensor([0.0691, 0.0604, 0.0851, 0.0463, 0.0673, 0.0335, 0.0510, 0.0727], + device='cuda:2'), in_proj_covar=tensor([0.0128, 0.0123, 0.0132, 0.0140, 0.0133, 0.0109, 0.0154, 0.0133], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-07 23:35:46,764 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.86 vs. limit=2.0 +2022-12-07 23:35:47,102 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.371e+02 2.104e+02 2.573e+02 3.185e+02 4.865e+02, threshold=5.146e+02, percent-clipped=0.0 +2022-12-07 23:35:59,494 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7928, 1.5153, 3.2819, 2.9754, 3.1763, 3.3127, 2.5743, 3.2974], + device='cuda:2'), covar=tensor([0.1310, 0.1415, 0.0128, 0.0266, 0.0245, 0.0130, 0.0333, 0.0158], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0160, 0.0124, 0.0167, 0.0141, 0.0135, 0.0117, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:36:02,808 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=76973.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 23:36:08,396 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=76979.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 23:36:24,630 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=76998.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:36:45,316 INFO [train.py:873] (2/4) Epoch 11, batch 1400, loss[loss=0.199, simple_loss=0.165, pruned_loss=0.1166, over 1219.00 frames. ], tot_loss[loss=0.1317, simple_loss=0.1602, pruned_loss=0.05161, over 1993136.00 frames. ], batch size: 100, lr: 7.31e-03, grad_scale: 8.0 +2022-12-07 23:36:50,503 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77027.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 23:37:14,870 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.323e+02 2.332e+02 2.864e+02 3.537e+02 8.782e+02, threshold=5.729e+02, percent-clipped=8.0 +2022-12-07 23:37:49,627 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77095.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:38:08,157 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1124, 2.3434, 4.1646, 4.3475, 4.2362, 2.3797, 4.2635, 3.2604], + device='cuda:2'), covar=tensor([0.0285, 0.0889, 0.0592, 0.0279, 0.0286, 0.1438, 0.0303, 0.0734], + device='cuda:2'), in_proj_covar=tensor([0.0277, 0.0248, 0.0367, 0.0313, 0.0258, 0.0298, 0.0290, 0.0276], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:38:12,200 INFO [train.py:873] (2/4) Epoch 11, batch 1500, loss[loss=0.1164, simple_loss=0.1249, pruned_loss=0.05397, over 2619.00 frames. ], tot_loss[loss=0.1316, simple_loss=0.16, pruned_loss=0.05156, over 1963688.90 frames. ], batch size: 100, lr: 7.31e-03, grad_scale: 8.0 +2022-12-07 23:38:41,967 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.264e+02 2.474e+02 2.983e+02 4.107e+02 8.785e+02, threshold=5.967e+02, percent-clipped=9.0 +2022-12-07 23:38:43,013 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77156.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 23:39:17,211 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77195.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:39:32,228 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77211.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:39:35,767 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4185, 3.4841, 3.6863, 3.4867, 3.5719, 3.4065, 1.5843, 3.3908], + device='cuda:2'), covar=tensor([0.0335, 0.0380, 0.0349, 0.0411, 0.0335, 0.0526, 0.2960, 0.0293], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0166, 0.0137, 0.0136, 0.0195, 0.0132, 0.0156, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:39:36,667 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2937, 1.1372, 1.2797, 1.2987, 1.2881, 0.8269, 1.1519, 1.2812], + device='cuda:2'), covar=tensor([0.0758, 0.0784, 0.0681, 0.0572, 0.0648, 0.0962, 0.0891, 0.0616], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0026, 0.0029, 0.0025, 0.0027, 0.0039, 0.0027, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 23:39:40,747 INFO [train.py:873] (2/4) Epoch 11, batch 1600, loss[loss=0.1479, simple_loss=0.1699, pruned_loss=0.06299, over 13925.00 frames. ], tot_loss[loss=0.1328, simple_loss=0.1607, pruned_loss=0.05251, over 1906973.79 frames. ], batch size: 23, lr: 7.31e-03, grad_scale: 8.0 +2022-12-07 23:39:57,561 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5700, 1.4883, 1.6472, 1.3683, 1.3906, 1.2565, 1.0728, 1.0540], + device='cuda:2'), covar=tensor([0.0188, 0.0303, 0.0221, 0.0292, 0.0228, 0.0314, 0.0263, 0.0480], + device='cuda:2'), in_proj_covar=tensor([0.0014, 0.0016, 0.0013, 0.0014, 0.0014, 0.0024, 0.0019, 0.0024], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-12-07 23:40:10,411 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.174e+02 1.982e+02 2.587e+02 3.263e+02 7.799e+02, threshold=5.175e+02, percent-clipped=3.0 +2022-12-07 23:40:25,856 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77272.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:40:26,996 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77273.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:40:30,437 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77277.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:41:08,996 INFO [train.py:873] (2/4) Epoch 11, batch 1700, loss[loss=0.1426, simple_loss=0.1675, pruned_loss=0.05887, over 14257.00 frames. ], tot_loss[loss=0.1327, simple_loss=0.1609, pruned_loss=0.05229, over 1938259.14 frames. ], batch size: 80, lr: 7.30e-03, grad_scale: 8.0 +2022-12-07 23:41:20,900 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77334.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:41:24,084 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77338.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:41:39,032 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.284e+02 2.095e+02 2.641e+02 3.320e+02 6.995e+02, threshold=5.282e+02, percent-clipped=0.0 +2022-12-07 23:41:42,780 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1431, 3.4536, 3.2419, 3.4634, 2.5178, 3.4541, 3.1481, 1.5090], + device='cuda:2'), covar=tensor([0.2180, 0.0622, 0.1275, 0.0819, 0.1091, 0.0684, 0.1242, 0.2684], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0078, 0.0061, 0.0065, 0.0093, 0.0075, 0.0096, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-07 23:42:37,341 INFO [train.py:873] (2/4) Epoch 11, batch 1800, loss[loss=0.1606, simple_loss=0.1498, pruned_loss=0.08571, over 1332.00 frames. ], tot_loss[loss=0.1318, simple_loss=0.1602, pruned_loss=0.05168, over 1953265.31 frames. ], batch size: 100, lr: 7.30e-03, grad_scale: 8.0 +2022-12-07 23:43:04,198 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77451.0, num_to_drop=1, layers_to_drop={3} +2022-12-07 23:43:07,257 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.178e+02 2.136e+02 2.704e+02 3.283e+02 4.688e+02, threshold=5.409e+02, percent-clipped=1.0 +2022-12-07 23:43:42,131 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.75 vs. limit=2.0 +2022-12-07 23:43:42,419 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77495.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:43:42,891 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.96 vs. limit=2.0 +2022-12-07 23:43:54,233 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-12-07 23:44:05,110 INFO [train.py:873] (2/4) Epoch 11, batch 1900, loss[loss=0.1284, simple_loss=0.1611, pruned_loss=0.04786, over 14282.00 frames. ], tot_loss[loss=0.1316, simple_loss=0.16, pruned_loss=0.05162, over 1964587.73 frames. ], batch size: 35, lr: 7.29e-03, grad_scale: 8.0 +2022-12-07 23:44:24,709 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77543.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:44:36,350 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.774e+01 2.327e+02 2.859e+02 3.630e+02 7.780e+02, threshold=5.718e+02, percent-clipped=3.0 +2022-12-07 23:44:45,939 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77567.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:44:52,102 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.46 vs. limit=2.0 +2022-12-07 23:45:29,331 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77616.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:45:33,421 INFO [train.py:873] (2/4) Epoch 11, batch 2000, loss[loss=0.126, simple_loss=0.1281, pruned_loss=0.06193, over 2727.00 frames. ], tot_loss[loss=0.1324, simple_loss=0.1601, pruned_loss=0.05232, over 1955275.17 frames. ], batch size: 100, lr: 7.29e-03, grad_scale: 8.0 +2022-12-07 23:45:40,512 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77629.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:45:44,075 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77633.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:46:04,193 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.424e+02 2.333e+02 2.909e+02 3.438e+02 9.067e+02, threshold=5.818e+02, percent-clipped=5.0 +2022-12-07 23:46:21,814 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=77677.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:46:21,835 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77677.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:46:24,409 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0481, 1.9667, 2.0495, 2.0794, 2.0257, 1.5877, 1.3118, 1.8252], + device='cuda:2'), covar=tensor([0.0518, 0.0485, 0.0540, 0.0367, 0.0473, 0.1498, 0.1907, 0.0475], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0164, 0.0137, 0.0137, 0.0195, 0.0133, 0.0156, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:46:28,477 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.76 vs. limit=5.0 +2022-12-07 23:46:59,457 INFO [train.py:873] (2/4) Epoch 11, batch 2100, loss[loss=0.123, simple_loss=0.154, pruned_loss=0.04599, over 14188.00 frames. ], tot_loss[loss=0.1323, simple_loss=0.1602, pruned_loss=0.05222, over 1996463.20 frames. ], batch size: 99, lr: 7.28e-03, grad_scale: 8.0 +2022-12-07 23:47:07,548 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1259, 1.9710, 2.0551, 2.1836, 2.0962, 2.0328, 2.2322, 1.8447], + device='cuda:2'), covar=tensor([0.0881, 0.1211, 0.0668, 0.0692, 0.0931, 0.0665, 0.0798, 0.0696], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0259, 0.0183, 0.0180, 0.0174, 0.0143, 0.0265, 0.0158], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-07 23:47:14,326 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=77738.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:47:25,619 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77751.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 23:47:29,645 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.109e+02 2.408e+02 3.077e+02 4.325e+02 1.206e+03, threshold=6.153e+02, percent-clipped=10.0 +2022-12-07 23:48:07,060 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77799.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:48:26,369 INFO [train.py:873] (2/4) Epoch 11, batch 2200, loss[loss=0.1074, simple_loss=0.1445, pruned_loss=0.03519, over 14544.00 frames. ], tot_loss[loss=0.1328, simple_loss=0.1606, pruned_loss=0.05248, over 1996437.41 frames. ], batch size: 43, lr: 7.28e-03, grad_scale: 8.0 +2022-12-07 23:48:27,875 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-07 23:48:57,072 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.095e+02 2.100e+02 2.823e+02 3.472e+02 8.203e+02, threshold=5.646e+02, percent-clipped=4.0 +2022-12-07 23:49:06,604 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77867.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:49:35,597 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.66 vs. limit=2.0 +2022-12-07 23:49:48,093 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77915.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:49:53,347 INFO [train.py:873] (2/4) Epoch 11, batch 2300, loss[loss=0.1153, simple_loss=0.1566, pruned_loss=0.03696, over 14287.00 frames. ], tot_loss[loss=0.1314, simple_loss=0.1594, pruned_loss=0.05173, over 1939327.08 frames. ], batch size: 25, lr: 7.27e-03, grad_scale: 8.0 +2022-12-07 23:50:00,424 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77929.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:50:04,137 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=77933.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:50:23,859 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.278e+02 2.313e+02 2.906e+02 3.798e+02 8.346e+02, threshold=5.811e+02, percent-clipped=6.0 +2022-12-07 23:50:38,070 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=77972.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:50:42,331 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77977.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:50:45,713 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=77981.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:51:21,251 INFO [train.py:873] (2/4) Epoch 11, batch 2400, loss[loss=0.1813, simple_loss=0.1602, pruned_loss=0.1012, over 1291.00 frames. ], tot_loss[loss=0.1331, simple_loss=0.1606, pruned_loss=0.05277, over 1935394.30 frames. ], batch size: 100, lr: 7.27e-03, grad_scale: 8.0 +2022-12-07 23:51:28,803 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.33 vs. limit=2.0 +2022-12-07 23:51:32,079 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=78033.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:51:38,425 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.68 vs. limit=2.0 +2022-12-07 23:51:51,986 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.111e+02 2.203e+02 2.586e+02 3.154e+02 7.524e+02, threshold=5.173e+02, percent-clipped=2.0 +2022-12-07 23:52:04,975 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.52 vs. limit=5.0 +2022-12-07 23:52:36,019 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.10 vs. limit=2.0 +2022-12-07 23:52:42,828 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0918, 2.1091, 4.0601, 2.7175, 3.9460, 1.9475, 3.0929, 3.9180], + device='cuda:2'), covar=tensor([0.0685, 0.4611, 0.0535, 0.7485, 0.0574, 0.3834, 0.1443, 0.0399], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0218, 0.0200, 0.0293, 0.0219, 0.0219, 0.0213, 0.0205], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:52:47,523 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2259, 1.5918, 1.7833, 1.7430, 1.5902, 1.7185, 1.3522, 1.2705], + device='cuda:2'), covar=tensor([0.1596, 0.0925, 0.0471, 0.0381, 0.1254, 0.0642, 0.2125, 0.1541], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0077, 0.0061, 0.0066, 0.0093, 0.0075, 0.0096, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-07 23:52:49,055 INFO [train.py:873] (2/4) Epoch 11, batch 2500, loss[loss=0.1203, simple_loss=0.1593, pruned_loss=0.04071, over 14241.00 frames. ], tot_loss[loss=0.1325, simple_loss=0.1604, pruned_loss=0.05231, over 2010070.09 frames. ], batch size: 37, lr: 7.26e-03, grad_scale: 8.0 +2022-12-07 23:52:49,616 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.85 vs. limit=2.0 +2022-12-07 23:53:19,543 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.400e+02 2.052e+02 2.774e+02 3.543e+02 6.822e+02, threshold=5.549e+02, percent-clipped=2.0 +2022-12-07 23:53:21,005 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.25 vs. limit=2.0 +2022-12-07 23:53:23,414 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9547, 2.6315, 3.6866, 3.0461, 3.8165, 3.7561, 3.6179, 3.1162], + device='cuda:2'), covar=tensor([0.0657, 0.2754, 0.1010, 0.1647, 0.0797, 0.0803, 0.1404, 0.1984], + device='cuda:2'), in_proj_covar=tensor([0.0342, 0.0311, 0.0390, 0.0297, 0.0374, 0.0315, 0.0358, 0.0311], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:53:27,073 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.62 vs. limit=2.0 +2022-12-07 23:53:31,326 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5102, 2.0563, 2.3253, 1.3517, 2.2013, 2.4329, 2.6142, 2.0993], + device='cuda:2'), covar=tensor([0.0943, 0.1246, 0.1147, 0.2377, 0.1252, 0.0888, 0.0696, 0.1778], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0182, 0.0134, 0.0124, 0.0134, 0.0139, 0.0116, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-07 23:53:45,711 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6218, 4.3476, 4.0414, 4.2029, 4.3683, 4.4968, 4.6204, 4.5294], + device='cuda:2'), covar=tensor([0.0804, 0.0460, 0.1994, 0.2591, 0.0680, 0.0674, 0.0794, 0.0935], + device='cuda:2'), in_proj_covar=tensor([0.0362, 0.0247, 0.0421, 0.0540, 0.0315, 0.0403, 0.0380, 0.0348], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:54:16,599 INFO [train.py:873] (2/4) Epoch 11, batch 2600, loss[loss=0.1426, simple_loss=0.1722, pruned_loss=0.05656, over 9435.00 frames. ], tot_loss[loss=0.1329, simple_loss=0.1605, pruned_loss=0.05263, over 1989397.69 frames. ], batch size: 100, lr: 7.26e-03, grad_scale: 8.0 +2022-12-07 23:54:21,919 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6991, 2.6653, 2.8104, 2.6921, 2.7092, 2.4024, 1.4658, 2.4677], + device='cuda:2'), covar=tensor([0.0423, 0.0416, 0.0406, 0.0416, 0.0355, 0.1049, 0.2461, 0.0368], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0165, 0.0137, 0.0138, 0.0195, 0.0134, 0.0155, 0.0182], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-07 23:54:26,475 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=78232.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:54:47,973 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.357e+02 2.197e+02 2.874e+02 3.504e+02 7.274e+02, threshold=5.748e+02, percent-clipped=5.0 +2022-12-07 23:54:53,433 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-07 23:55:00,962 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=78272.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:55:19,425 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=78293.0, num_to_drop=1, layers_to_drop={1} +2022-12-07 23:55:42,563 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=78320.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:55:43,479 INFO [train.py:873] (2/4) Epoch 11, batch 2700, loss[loss=0.1578, simple_loss=0.1581, pruned_loss=0.07874, over 3825.00 frames. ], tot_loss[loss=0.132, simple_loss=0.1603, pruned_loss=0.05185, over 2009939.57 frames. ], batch size: 100, lr: 7.25e-03, grad_scale: 4.0 +2022-12-07 23:55:54,629 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=78333.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:56:15,226 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.898e+01 2.288e+02 2.843e+02 3.409e+02 7.759e+02, threshold=5.687e+02, percent-clipped=1.0 +2022-12-07 23:56:36,327 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=78381.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:56:43,875 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1424, 2.1199, 2.3473, 1.5069, 1.6302, 2.2331, 1.4312, 2.0980], + device='cuda:2'), covar=tensor([0.0851, 0.1534, 0.0846, 0.2278, 0.2792, 0.0925, 0.3628, 0.1088], + device='cuda:2'), in_proj_covar=tensor([0.0078, 0.0094, 0.0089, 0.0092, 0.0111, 0.0081, 0.0123, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0005, 0.0004], + device='cuda:2') +2022-12-07 23:56:56,436 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-12-07 23:57:11,628 INFO [train.py:873] (2/4) Epoch 11, batch 2800, loss[loss=0.1377, simple_loss=0.1643, pruned_loss=0.05551, over 14148.00 frames. ], tot_loss[loss=0.1317, simple_loss=0.1603, pruned_loss=0.05159, over 2000154.08 frames. ], batch size: 99, lr: 7.25e-03, grad_scale: 8.0 +2022-12-07 23:57:43,401 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.203e+02 2.311e+02 2.686e+02 3.123e+02 7.325e+02, threshold=5.372e+02, percent-clipped=4.0 +2022-12-07 23:57:46,077 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=78460.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:58:03,176 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1034, 1.9136, 3.3049, 2.3918, 3.0885, 1.8372, 2.5175, 3.0680], + device='cuda:2'), covar=tensor([0.0913, 0.4066, 0.0485, 0.4808, 0.0828, 0.3429, 0.1420, 0.0567], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0217, 0.0200, 0.0292, 0.0219, 0.0218, 0.0213, 0.0202], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-07 23:58:15,655 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1822, 1.3979, 3.2779, 1.3810, 3.0579, 3.2487, 2.1089, 3.4317], + device='cuda:2'), covar=tensor([0.0249, 0.2950, 0.0354, 0.2342, 0.1149, 0.0444, 0.1079, 0.0239], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0157, 0.0157, 0.0168, 0.0170, 0.0174, 0.0132, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-07 23:58:40,124 INFO [train.py:873] (2/4) Epoch 11, batch 2900, loss[loss=0.1475, simple_loss=0.1684, pruned_loss=0.06326, over 13861.00 frames. ], tot_loss[loss=0.1322, simple_loss=0.1604, pruned_loss=0.05198, over 2013569.69 frames. ], batch size: 20, lr: 7.24e-03, grad_scale: 8.0 +2022-12-07 23:58:40,264 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=78521.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:59:09,869 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0352, 1.1072, 0.9266, 1.0226, 1.1231, 0.6457, 0.9402, 1.0436], + device='cuda:2'), covar=tensor([0.0782, 0.0925, 0.0735, 0.0718, 0.0463, 0.0886, 0.0940, 0.1212], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0025, 0.0028, 0.0025, 0.0026, 0.0038, 0.0026, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-07 23:59:10,562 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=78556.0, num_to_drop=0, layers_to_drop=set() +2022-12-07 23:59:11,185 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.326e+02 2.237e+02 2.916e+02 3.680e+02 6.575e+02, threshold=5.831e+02, percent-clipped=6.0 +2022-12-07 23:59:38,760 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=78588.0, num_to_drop=1, layers_to_drop={2} +2022-12-07 23:59:40,633 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2813, 1.3946, 1.4942, 1.3138, 1.2805, 1.1712, 1.1018, 1.0324], + device='cuda:2'), covar=tensor([0.0214, 0.0204, 0.0126, 0.0159, 0.0261, 0.0360, 0.0208, 0.0412], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0016, 0.0014, 0.0014, 0.0014, 0.0024, 0.0019, 0.0024], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-12-08 00:00:05,416 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=78617.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:00:08,925 INFO [train.py:873] (2/4) Epoch 11, batch 3000, loss[loss=0.1224, simple_loss=0.1564, pruned_loss=0.04415, over 14192.00 frames. ], tot_loss[loss=0.1319, simple_loss=0.1601, pruned_loss=0.05186, over 1950657.19 frames. ], batch size: 80, lr: 7.24e-03, grad_scale: 8.0 +2022-12-08 00:00:08,926 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 00:00:17,380 INFO [train.py:905] (2/4) Epoch 11, validation: loss=0.1282, simple_loss=0.1681, pruned_loss=0.04413, over 857387.00 frames. +2022-12-08 00:00:17,380 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17779MB +2022-12-08 00:00:37,802 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8824, 1.5859, 4.1301, 3.9396, 3.8683, 4.2299, 3.6248, 4.2009], + device='cuda:2'), covar=tensor([0.1425, 0.1535, 0.0110, 0.0175, 0.0201, 0.0107, 0.0213, 0.0120], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0158, 0.0123, 0.0164, 0.0142, 0.0134, 0.0118, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 00:00:48,735 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.362e+02 2.287e+02 2.909e+02 3.866e+02 8.652e+02, threshold=5.818e+02, percent-clipped=3.0 +2022-12-08 00:01:46,388 INFO [train.py:873] (2/4) Epoch 11, batch 3100, loss[loss=0.111, simple_loss=0.143, pruned_loss=0.03951, over 13997.00 frames. ], tot_loss[loss=0.1319, simple_loss=0.16, pruned_loss=0.05192, over 1981956.43 frames. ], batch size: 19, lr: 7.24e-03, grad_scale: 8.0 +2022-12-08 00:01:46,512 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.9632, 5.4682, 5.3961, 5.9528, 5.4777, 4.7866, 5.9104, 4.9518], + device='cuda:2'), covar=tensor([0.0260, 0.0703, 0.0326, 0.0314, 0.0640, 0.0335, 0.0456, 0.0465], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0257, 0.0180, 0.0175, 0.0172, 0.0140, 0.0262, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 00:02:12,556 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.49 vs. limit=2.0 +2022-12-08 00:02:18,621 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.341e+02 2.078e+02 2.524e+02 3.137e+02 7.110e+02, threshold=5.049e+02, percent-clipped=1.0 +2022-12-08 00:02:49,022 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8725, 0.7955, 0.6894, 0.8378, 0.8214, 0.2685, 0.7351, 0.8702], + device='cuda:2'), covar=tensor([0.0236, 0.0461, 0.0457, 0.0305, 0.0245, 0.0352, 0.0782, 0.0571], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0025, 0.0027, 0.0024, 0.0026, 0.0037, 0.0026, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0001], + device='cuda:2') +2022-12-08 00:03:11,339 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=78816.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:03:15,287 INFO [train.py:873] (2/4) Epoch 11, batch 3200, loss[loss=0.1205, simple_loss=0.1541, pruned_loss=0.04343, over 10307.00 frames. ], tot_loss[loss=0.1327, simple_loss=0.1605, pruned_loss=0.05241, over 1902948.11 frames. ], batch size: 100, lr: 7.23e-03, grad_scale: 8.0 +2022-12-08 00:03:46,998 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.001e+02 2.170e+02 2.756e+02 3.335e+02 6.522e+02, threshold=5.512e+02, percent-clipped=3.0 +2022-12-08 00:03:58,473 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.76 vs. limit=2.0 +2022-12-08 00:04:14,738 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=78888.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:04:17,354 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3370, 3.4079, 4.1597, 2.9080, 2.5705, 3.3698, 1.9930, 3.5038], + device='cuda:2'), covar=tensor([0.0997, 0.0883, 0.0561, 0.1767, 0.2086, 0.0955, 0.3767, 0.1192], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0095, 0.0091, 0.0094, 0.0113, 0.0082, 0.0125, 0.0088], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 00:04:35,372 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=78912.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:04:43,623 INFO [train.py:873] (2/4) Epoch 11, batch 3300, loss[loss=0.09446, simple_loss=0.139, pruned_loss=0.02494, over 14072.00 frames. ], tot_loss[loss=0.1316, simple_loss=0.1601, pruned_loss=0.05158, over 2002388.21 frames. ], batch size: 26, lr: 7.23e-03, grad_scale: 8.0 +2022-12-08 00:04:56,438 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=78936.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:05:15,459 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.234e+02 2.245e+02 2.789e+02 3.616e+02 5.775e+02, threshold=5.578e+02, percent-clipped=1.0 +2022-12-08 00:05:19,546 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.02 vs. limit=5.0 +2022-12-08 00:05:22,654 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=78965.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:05:48,553 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2261, 1.9008, 2.3450, 1.9488, 2.4282, 2.0637, 2.1112, 2.0936], + device='cuda:2'), covar=tensor([0.0483, 0.1799, 0.0458, 0.0804, 0.0408, 0.0771, 0.0390, 0.0737], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0320, 0.0400, 0.0308, 0.0383, 0.0324, 0.0371, 0.0317], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 00:06:10,658 INFO [train.py:873] (2/4) Epoch 11, batch 3400, loss[loss=0.125, simple_loss=0.1565, pruned_loss=0.04674, over 14278.00 frames. ], tot_loss[loss=0.131, simple_loss=0.1595, pruned_loss=0.05125, over 1985142.32 frames. ], batch size: 63, lr: 7.22e-03, grad_scale: 4.0 +2022-12-08 00:06:15,560 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7638, 3.5477, 3.5188, 3.8336, 3.4250, 3.1912, 3.8140, 3.7431], + device='cuda:2'), covar=tensor([0.0712, 0.0901, 0.0838, 0.0696, 0.0996, 0.0783, 0.0696, 0.0782], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0127, 0.0134, 0.0145, 0.0135, 0.0112, 0.0158, 0.0135], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 00:06:15,669 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79026.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:06:43,352 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.385e+02 2.208e+02 2.821e+02 3.585e+02 7.488e+02, threshold=5.641e+02, percent-clipped=3.0 +2022-12-08 00:07:09,643 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0994, 2.9110, 3.8819, 2.7330, 2.5037, 3.3729, 1.8814, 3.0577], + device='cuda:2'), covar=tensor([0.2331, 0.1302, 0.0614, 0.1950, 0.2282, 0.0798, 0.4184, 0.1488], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0095, 0.0091, 0.0095, 0.0113, 0.0082, 0.0125, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 00:07:34,142 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79116.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:07:38,503 INFO [train.py:873] (2/4) Epoch 11, batch 3500, loss[loss=0.192, simple_loss=0.172, pruned_loss=0.106, over 1228.00 frames. ], tot_loss[loss=0.1299, simple_loss=0.1586, pruned_loss=0.05057, over 1964333.77 frames. ], batch size: 100, lr: 7.22e-03, grad_scale: 4.0 +2022-12-08 00:08:10,356 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.248e+02 2.398e+02 2.883e+02 3.497e+02 6.738e+02, threshold=5.767e+02, percent-clipped=3.0 +2022-12-08 00:08:15,407 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=79164.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:08:35,211 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7648, 1.2098, 1.3285, 1.1815, 1.0742, 1.3308, 1.0628, 0.9128], + device='cuda:2'), covar=tensor([0.1892, 0.0736, 0.0350, 0.0389, 0.1930, 0.0608, 0.1547, 0.1511], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0077, 0.0061, 0.0065, 0.0093, 0.0074, 0.0096, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 00:08:57,448 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79212.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:09:04,926 INFO [train.py:873] (2/4) Epoch 11, batch 3600, loss[loss=0.158, simple_loss=0.1535, pruned_loss=0.08129, over 3854.00 frames. ], tot_loss[loss=0.1293, simple_loss=0.1583, pruned_loss=0.05015, over 1937586.38 frames. ], batch size: 100, lr: 7.21e-03, grad_scale: 8.0 +2022-12-08 00:09:30,677 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-12-08 00:09:37,406 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.384e+02 2.170e+02 2.534e+02 3.229e+02 7.657e+02, threshold=5.069e+02, percent-clipped=4.0 +2022-12-08 00:09:39,211 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=79260.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:10:01,636 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79285.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 00:10:09,638 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-12-08 00:10:13,378 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.6945, 5.2876, 5.0732, 5.6014, 5.1895, 4.6346, 5.6955, 5.5438], + device='cuda:2'), covar=tensor([0.0433, 0.0504, 0.0584, 0.0430, 0.0668, 0.0440, 0.0463, 0.0494], + device='cuda:2'), in_proj_covar=tensor([0.0130, 0.0126, 0.0132, 0.0142, 0.0133, 0.0110, 0.0154, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 00:10:32,864 INFO [train.py:873] (2/4) Epoch 11, batch 3700, loss[loss=0.1814, simple_loss=0.1564, pruned_loss=0.1032, over 1227.00 frames. ], tot_loss[loss=0.1312, simple_loss=0.1591, pruned_loss=0.05162, over 1859762.59 frames. ], batch size: 100, lr: 7.21e-03, grad_scale: 8.0 +2022-12-08 00:10:32,975 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79321.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:10:43,199 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79333.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:10:54,301 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79346.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 00:11:04,339 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.257e+02 2.300e+02 2.713e+02 3.691e+02 7.855e+02, threshold=5.426e+02, percent-clipped=7.0 +2022-12-08 00:11:09,821 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79364.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:11:19,327 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79375.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 00:11:25,673 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3944, 1.7740, 1.8332, 1.7487, 1.6174, 1.6288, 1.2995, 1.1997], + device='cuda:2'), covar=tensor([0.0535, 0.0614, 0.0484, 0.0317, 0.0364, 0.0319, 0.0356, 0.0636], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0016, 0.0014, 0.0014, 0.0015, 0.0024, 0.0019, 0.0025], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-12-08 00:11:35,785 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79394.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:11:58,465 INFO [train.py:873] (2/4) Epoch 11, batch 3800, loss[loss=0.1572, simple_loss=0.1365, pruned_loss=0.08896, over 1260.00 frames. ], tot_loss[loss=0.132, simple_loss=0.1597, pruned_loss=0.05218, over 1836089.84 frames. ], batch size: 100, lr: 7.20e-03, grad_scale: 8.0 +2022-12-08 00:12:00,722 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0 +2022-12-08 00:12:02,435 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79425.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:12:11,688 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79436.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 00:12:30,564 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.086e+02 2.258e+02 2.922e+02 3.679e+02 8.434e+02, threshold=5.844e+02, percent-clipped=6.0 +2022-12-08 00:12:32,112 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.51 vs. limit=2.0 +2022-12-08 00:12:45,960 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9007, 1.2079, 1.3363, 1.2728, 0.9722, 1.2881, 1.0440, 0.9269], + device='cuda:2'), covar=tensor([0.1745, 0.0991, 0.0488, 0.0388, 0.1696, 0.0711, 0.1435, 0.1158], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0077, 0.0061, 0.0064, 0.0091, 0.0074, 0.0094, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 00:12:53,238 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5577, 2.1590, 2.4545, 1.5584, 2.1602, 2.4600, 2.6296, 2.1680], + device='cuda:2'), covar=tensor([0.0814, 0.0954, 0.1092, 0.1745, 0.1000, 0.0713, 0.0621, 0.1498], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0182, 0.0135, 0.0124, 0.0132, 0.0140, 0.0118, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 00:13:17,547 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.23 vs. limit=5.0 +2022-12-08 00:13:26,571 INFO [train.py:873] (2/4) Epoch 11, batch 3900, loss[loss=0.1806, simple_loss=0.1635, pruned_loss=0.09883, over 1205.00 frames. ], tot_loss[loss=0.1313, simple_loss=0.1594, pruned_loss=0.05161, over 1845759.26 frames. ], batch size: 100, lr: 7.20e-03, grad_scale: 8.0 +2022-12-08 00:13:59,196 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.032e+02 2.132e+02 2.623e+02 3.165e+02 6.507e+02, threshold=5.247e+02, percent-clipped=1.0 +2022-12-08 00:14:22,793 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79585.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:14:54,980 INFO [train.py:873] (2/4) Epoch 11, batch 4000, loss[loss=0.1147, simple_loss=0.1486, pruned_loss=0.04042, over 14050.00 frames. ], tot_loss[loss=0.13, simple_loss=0.1585, pruned_loss=0.05078, over 1889285.70 frames. ], batch size: 19, lr: 7.19e-03, grad_scale: 8.0 +2022-12-08 00:14:55,097 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79621.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:15:12,989 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79641.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 00:15:17,458 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79646.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:15:19,663 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.23 vs. limit=5.0 +2022-12-08 00:15:24,458 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3974, 3.0977, 2.4425, 3.6307, 3.4488, 3.4384, 3.0343, 2.3697], + device='cuda:2'), covar=tensor([0.1052, 0.1673, 0.3822, 0.0565, 0.0876, 0.1281, 0.1394, 0.4002], + device='cuda:2'), in_proj_covar=tensor([0.0267, 0.0294, 0.0270, 0.0249, 0.0307, 0.0292, 0.0253, 0.0254], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 00:15:26,934 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79657.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:15:27,678 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.890e+01 2.190e+02 2.797e+02 3.644e+02 7.365e+02, threshold=5.595e+02, percent-clipped=3.0 +2022-12-08 00:15:37,286 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=79669.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:15:55,350 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79689.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:16:11,781 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-12-08 00:16:21,021 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79718.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:16:23,029 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79720.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:16:23,757 INFO [train.py:873] (2/4) Epoch 11, batch 4100, loss[loss=0.1455, simple_loss=0.172, pruned_loss=0.05943, over 14160.00 frames. ], tot_loss[loss=0.131, simple_loss=0.1592, pruned_loss=0.05137, over 1882161.33 frames. ], batch size: 99, lr: 7.19e-03, grad_scale: 8.0 +2022-12-08 00:16:32,663 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79731.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 00:16:56,455 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.197e+02 2.218e+02 2.886e+02 3.615e+02 6.897e+02, threshold=5.771e+02, percent-clipped=4.0 +2022-12-08 00:17:00,664 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=79762.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:17:37,161 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3496, 2.1595, 2.5866, 1.5772, 1.7197, 2.3574, 1.3318, 2.2437], + device='cuda:2'), covar=tensor([0.0862, 0.1473, 0.0966, 0.2042, 0.2643, 0.1168, 0.4522, 0.1080], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0096, 0.0092, 0.0095, 0.0116, 0.0083, 0.0127, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 00:17:52,398 INFO [train.py:873] (2/4) Epoch 11, batch 4200, loss[loss=0.1339, simple_loss=0.1694, pruned_loss=0.04919, over 14303.00 frames. ], tot_loss[loss=0.13, simple_loss=0.1588, pruned_loss=0.05064, over 1965469.05 frames. ], batch size: 76, lr: 7.19e-03, grad_scale: 8.0 +2022-12-08 00:17:54,311 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=79823.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:18:25,211 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.418e+02 2.219e+02 2.651e+02 3.356e+02 1.038e+03, threshold=5.303e+02, percent-clipped=1.0 +2022-12-08 00:18:44,365 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.87 vs. limit=5.0 +2022-12-08 00:19:21,013 INFO [train.py:873] (2/4) Epoch 11, batch 4300, loss[loss=0.1342, simple_loss=0.1617, pruned_loss=0.05333, over 10376.00 frames. ], tot_loss[loss=0.1295, simple_loss=0.1586, pruned_loss=0.05017, over 1995624.02 frames. ], batch size: 100, lr: 7.18e-03, grad_scale: 8.0 +2022-12-08 00:19:38,637 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=79941.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:19:38,680 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79941.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 00:19:53,275 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9385, 1.8291, 4.3286, 4.0555, 3.9895, 4.4185, 3.8350, 4.4537], + device='cuda:2'), covar=tensor([0.1482, 0.1438, 0.0102, 0.0182, 0.0191, 0.0111, 0.0221, 0.0109], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0159, 0.0126, 0.0166, 0.0142, 0.0135, 0.0117, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 00:19:53,915 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.285e+02 2.309e+02 2.745e+02 3.460e+02 6.233e+02, threshold=5.491e+02, percent-clipped=1.0 +2022-12-08 00:20:21,853 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=79989.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 00:20:21,929 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=79989.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:20:47,237 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80013.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:20:52,344 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6101, 1.3674, 3.5256, 1.6071, 3.6137, 3.7001, 2.6109, 3.9796], + device='cuda:2'), covar=tensor([0.0249, 0.3185, 0.0465, 0.2341, 0.0584, 0.0415, 0.0925, 0.0173], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0160, 0.0162, 0.0170, 0.0174, 0.0176, 0.0136, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 00:20:53,264 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80020.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:20:53,977 INFO [train.py:873] (2/4) Epoch 11, batch 4400, loss[loss=0.1351, simple_loss=0.1639, pruned_loss=0.05315, over 14156.00 frames. ], tot_loss[loss=0.1293, simple_loss=0.1585, pruned_loss=0.05006, over 1963107.66 frames. ], batch size: 84, lr: 7.18e-03, grad_scale: 8.0 +2022-12-08 00:21:03,089 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80031.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 00:21:08,528 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80037.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:21:26,665 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.889e+01 2.388e+02 2.902e+02 3.759e+02 8.737e+02, threshold=5.804e+02, percent-clipped=8.0 +2022-12-08 00:21:35,080 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80068.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:21:44,990 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80079.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 00:22:08,079 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0821, 2.0170, 4.1146, 2.7472, 3.8896, 1.9872, 3.0120, 3.8311], + device='cuda:2'), covar=tensor([0.0697, 0.4590, 0.0415, 0.6950, 0.0735, 0.4033, 0.1378, 0.0566], + device='cuda:2'), in_proj_covar=tensor([0.0242, 0.0213, 0.0198, 0.0287, 0.0216, 0.0216, 0.0211, 0.0200], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 00:22:19,512 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80118.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:22:21,959 INFO [train.py:873] (2/4) Epoch 11, batch 4500, loss[loss=0.1266, simple_loss=0.1588, pruned_loss=0.0472, over 14212.00 frames. ], tot_loss[loss=0.1304, simple_loss=0.1592, pruned_loss=0.05085, over 1973300.12 frames. ], batch size: 94, lr: 7.17e-03, grad_scale: 8.0 +2022-12-08 00:22:54,366 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.548e+02 2.254e+02 2.921e+02 3.671e+02 7.258e+02, threshold=5.842e+02, percent-clipped=4.0 +2022-12-08 00:23:48,491 INFO [train.py:873] (2/4) Epoch 11, batch 4600, loss[loss=0.14, simple_loss=0.1625, pruned_loss=0.05872, over 14040.00 frames. ], tot_loss[loss=0.1311, simple_loss=0.1594, pruned_loss=0.05142, over 1961657.76 frames. ], batch size: 19, lr: 7.17e-03, grad_scale: 8.0 +2022-12-08 00:24:06,600 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80241.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:24:21,385 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.285e+02 2.172e+02 2.626e+02 3.163e+02 7.453e+02, threshold=5.252e+02, percent-clipped=2.0 +2022-12-08 00:24:42,979 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80283.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:24:48,268 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80289.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:24:55,093 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1741, 2.7535, 3.6343, 2.4509, 2.2301, 3.2987, 1.8509, 2.9610], + device='cuda:2'), covar=tensor([0.0913, 0.1238, 0.0501, 0.2081, 0.2379, 0.0662, 0.3443, 0.0896], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0093, 0.0091, 0.0093, 0.0113, 0.0082, 0.0123, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 00:25:02,006 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8824, 2.5293, 2.6700, 1.7180, 2.3406, 2.6850, 2.9176, 2.4086], + device='cuda:2'), covar=tensor([0.0857, 0.1277, 0.0992, 0.1834, 0.1190, 0.0878, 0.0634, 0.1442], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0178, 0.0134, 0.0123, 0.0132, 0.0138, 0.0116, 0.0136], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0007, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 00:25:05,403 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9265, 2.1158, 2.2651, 2.3147, 1.9377, 2.2305, 1.9492, 1.3558], + device='cuda:2'), covar=tensor([0.1249, 0.0950, 0.0712, 0.0459, 0.1149, 0.0659, 0.1279, 0.2226], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0076, 0.0060, 0.0063, 0.0091, 0.0074, 0.0094, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 00:25:09,168 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80313.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:25:16,059 INFO [train.py:873] (2/4) Epoch 11, batch 4700, loss[loss=0.1266, simple_loss=0.1649, pruned_loss=0.04417, over 14242.00 frames. ], tot_loss[loss=0.1309, simple_loss=0.1594, pruned_loss=0.05124, over 2000727.44 frames. ], batch size: 35, lr: 7.16e-03, grad_scale: 8.0 +2022-12-08 00:25:36,877 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80344.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:25:49,170 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.156e+02 2.137e+02 2.805e+02 3.644e+02 8.749e+02, threshold=5.610e+02, percent-clipped=7.0 +2022-12-08 00:25:51,863 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80361.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:26:03,787 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5662, 2.2120, 4.6170, 3.1037, 4.3951, 2.2484, 3.3265, 4.2486], + device='cuda:2'), covar=tensor([0.0408, 0.4264, 0.0313, 0.5742, 0.0428, 0.3217, 0.1314, 0.0324], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0216, 0.0199, 0.0290, 0.0216, 0.0218, 0.0212, 0.0203], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 00:26:20,340 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80393.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:26:42,686 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80418.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:26:45,212 INFO [train.py:873] (2/4) Epoch 11, batch 4800, loss[loss=0.1328, simple_loss=0.1643, pruned_loss=0.05065, over 14362.00 frames. ], tot_loss[loss=0.13, simple_loss=0.1589, pruned_loss=0.05056, over 1989792.82 frames. ], batch size: 66, lr: 7.16e-03, grad_scale: 8.0 +2022-12-08 00:26:54,960 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1197, 3.4196, 3.2192, 3.3850, 2.6585, 3.4513, 3.1191, 1.8118], + device='cuda:2'), covar=tensor([0.2203, 0.0774, 0.1236, 0.0948, 0.0996, 0.0497, 0.1284, 0.2490], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0076, 0.0060, 0.0063, 0.0091, 0.0073, 0.0094, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0004, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 00:27:14,131 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80454.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:27:17,206 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.361e+02 2.171e+02 2.910e+02 3.468e+02 6.443e+02, threshold=5.821e+02, percent-clipped=1.0 +2022-12-08 00:27:21,542 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80463.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:27:23,912 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80466.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:27:44,093 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2774, 3.0329, 3.0305, 3.2661, 3.1603, 3.2793, 3.3202, 2.8059], + device='cuda:2'), covar=tensor([0.0505, 0.0946, 0.0496, 0.0526, 0.0703, 0.0380, 0.0597, 0.0547], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0255, 0.0179, 0.0176, 0.0171, 0.0141, 0.0260, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 00:27:49,347 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80494.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:27:49,405 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6702, 2.0449, 2.6416, 2.7578, 2.5825, 2.1394, 2.7312, 2.2925], + device='cuda:2'), covar=tensor([0.0312, 0.0796, 0.0430, 0.0359, 0.0420, 0.0859, 0.0298, 0.0579], + device='cuda:2'), in_proj_covar=tensor([0.0278, 0.0248, 0.0363, 0.0314, 0.0255, 0.0296, 0.0291, 0.0275], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 00:28:12,376 INFO [train.py:873] (2/4) Epoch 11, batch 4900, loss[loss=0.1042, simple_loss=0.1504, pruned_loss=0.029, over 14292.00 frames. ], tot_loss[loss=0.1309, simple_loss=0.1592, pruned_loss=0.05129, over 1948677.88 frames. ], batch size: 35, lr: 7.15e-03, grad_scale: 8.0 +2022-12-08 00:28:15,366 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80524.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:28:42,488 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80555.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:28:44,859 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.308e+02 2.165e+02 2.757e+02 3.267e+02 6.600e+02, threshold=5.514e+02, percent-clipped=1.0 +2022-12-08 00:29:05,884 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.20 vs. limit=5.0 +2022-12-08 00:29:41,618 INFO [train.py:873] (2/4) Epoch 11, batch 5000, loss[loss=0.1458, simple_loss=0.1473, pruned_loss=0.07214, over 2674.00 frames. ], tot_loss[loss=0.1307, simple_loss=0.1592, pruned_loss=0.05109, over 1981468.95 frames. ], batch size: 100, lr: 7.15e-03, grad_scale: 8.0 +2022-12-08 00:29:57,624 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80639.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:29:59,500 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7530, 1.8399, 1.7633, 1.9547, 2.1851, 1.3293, 1.5015, 1.7041], + device='cuda:2'), covar=tensor([0.0351, 0.0582, 0.0644, 0.0292, 0.0316, 0.0494, 0.0545, 0.0604], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0016, 0.0014, 0.0015, 0.0015, 0.0025, 0.0020, 0.0025], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-12-08 00:30:14,652 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.349e+02 2.220e+02 2.746e+02 3.480e+02 5.243e+02, threshold=5.491e+02, percent-clipped=0.0 +2022-12-08 00:30:41,021 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0608, 2.2364, 3.0024, 3.1065, 2.9967, 2.1666, 2.9912, 2.5135], + device='cuda:2'), covar=tensor([0.0301, 0.0764, 0.0541, 0.0433, 0.0354, 0.1121, 0.0327, 0.0730], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0252, 0.0366, 0.0319, 0.0258, 0.0300, 0.0293, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 00:30:44,494 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8896, 1.2952, 2.0440, 1.2488, 1.9562, 2.0754, 1.6509, 2.1353], + device='cuda:2'), covar=tensor([0.0293, 0.1839, 0.0358, 0.1774, 0.0516, 0.0487, 0.1118, 0.0316], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0156, 0.0157, 0.0167, 0.0170, 0.0173, 0.0134, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 00:31:10,694 INFO [train.py:873] (2/4) Epoch 11, batch 5100, loss[loss=0.1262, simple_loss=0.1562, pruned_loss=0.04808, over 14249.00 frames. ], tot_loss[loss=0.13, simple_loss=0.1588, pruned_loss=0.0506, over 1975209.13 frames. ], batch size: 63, lr: 7.15e-03, grad_scale: 8.0 +2022-12-08 00:31:24,285 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80736.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:31:35,057 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80749.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:31:43,013 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.337e+02 2.114e+02 2.657e+02 3.455e+02 5.035e+02, threshold=5.315e+02, percent-clipped=0.0 +2022-12-08 00:31:56,843 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=80773.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:32:09,756 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.53 vs. limit=2.0 +2022-12-08 00:32:11,100 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3923, 4.1082, 3.9529, 4.4835, 4.1194, 3.9489, 4.4610, 3.7734], + device='cuda:2'), covar=tensor([0.0572, 0.1140, 0.0419, 0.0428, 0.0904, 0.0771, 0.0573, 0.0592], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0260, 0.0182, 0.0178, 0.0174, 0.0144, 0.0265, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 00:32:17,357 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80797.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:32:33,881 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.86 vs. limit=2.0 +2022-12-08 00:32:35,256 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4609, 5.0046, 4.8473, 5.5110, 5.0387, 4.5339, 5.4139, 4.6695], + device='cuda:2'), covar=tensor([0.0351, 0.1113, 0.0354, 0.0400, 0.0821, 0.0430, 0.0524, 0.0457], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0259, 0.0181, 0.0178, 0.0173, 0.0144, 0.0265, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 00:32:36,948 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80819.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:32:38,684 INFO [train.py:873] (2/4) Epoch 11, batch 5200, loss[loss=0.1856, simple_loss=0.1865, pruned_loss=0.09235, over 8597.00 frames. ], tot_loss[loss=0.1315, simple_loss=0.1596, pruned_loss=0.05172, over 1936807.56 frames. ], batch size: 100, lr: 7.14e-03, grad_scale: 8.0 +2022-12-08 00:32:50,337 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=80834.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:33:04,590 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=80850.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:33:11,236 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.384e+02 2.222e+02 2.775e+02 3.597e+02 5.956e+02, threshold=5.550e+02, percent-clipped=3.0 +2022-12-08 00:33:56,029 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9042, 1.5897, 2.0918, 1.7184, 1.9562, 1.5037, 1.6774, 1.9483], + device='cuda:2'), covar=tensor([0.1801, 0.3084, 0.0408, 0.2046, 0.1214, 0.1555, 0.1070, 0.0960], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0215, 0.0198, 0.0291, 0.0216, 0.0219, 0.0214, 0.0204], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 00:34:04,557 INFO [train.py:873] (2/4) Epoch 11, batch 5300, loss[loss=0.1641, simple_loss=0.153, pruned_loss=0.08758, over 1202.00 frames. ], tot_loss[loss=0.1316, simple_loss=0.1598, pruned_loss=0.05172, over 1964421.14 frames. ], batch size: 100, lr: 7.14e-03, grad_scale: 8.0 +2022-12-08 00:34:11,666 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.15 vs. limit=5.0 +2022-12-08 00:34:20,770 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=80939.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:34:36,058 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.08 vs. limit=2.0 +2022-12-08 00:34:37,143 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.222e+02 2.266e+02 2.683e+02 3.445e+02 6.231e+02, threshold=5.367e+02, percent-clipped=3.0 +2022-12-08 00:34:56,756 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0567, 1.7906, 4.4111, 4.1532, 4.1089, 4.4614, 4.0162, 4.4685], + device='cuda:2'), covar=tensor([0.1440, 0.1522, 0.0105, 0.0185, 0.0204, 0.0117, 0.0185, 0.0126], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0159, 0.0125, 0.0166, 0.0141, 0.0135, 0.0117, 0.0117], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 00:34:58,551 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-12-08 00:35:02,411 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=80987.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:35:32,523 INFO [train.py:873] (2/4) Epoch 11, batch 5400, loss[loss=0.1266, simple_loss=0.1561, pruned_loss=0.0486, over 11157.00 frames. ], tot_loss[loss=0.1301, simple_loss=0.1589, pruned_loss=0.05063, over 1963461.24 frames. ], batch size: 100, lr: 7.13e-03, grad_scale: 16.0 +2022-12-08 00:35:58,332 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81049.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:36:05,569 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.933e+01 1.955e+02 2.740e+02 3.056e+02 5.675e+02, threshold=5.479e+02, percent-clipped=1.0 +2022-12-08 00:36:11,730 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1158, 1.8429, 2.1016, 1.5826, 2.0798, 0.9083, 1.8726, 1.9885], + device='cuda:2'), covar=tensor([0.0921, 0.1109, 0.0898, 0.3107, 0.1113, 0.1076, 0.0930, 0.1289], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0026, 0.0028, 0.0026, 0.0027, 0.0040, 0.0027, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0001, 0.0002], + device='cuda:2') +2022-12-08 00:36:35,744 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81092.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:36:39,872 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81097.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:36:56,335 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81116.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:36:58,927 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81119.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:37:00,421 INFO [train.py:873] (2/4) Epoch 11, batch 5500, loss[loss=0.1157, simple_loss=0.134, pruned_loss=0.04873, over 4932.00 frames. ], tot_loss[loss=0.1291, simple_loss=0.1587, pruned_loss=0.04974, over 1985691.95 frames. ], batch size: 100, lr: 7.13e-03, grad_scale: 8.0 +2022-12-08 00:37:05,653 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-08 00:37:07,713 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81129.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:37:21,048 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.09 vs. limit=2.0 +2022-12-08 00:37:25,911 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81150.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:37:33,796 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.607e+02 2.115e+02 2.510e+02 3.295e+02 5.828e+02, threshold=5.019e+02, percent-clipped=2.0 +2022-12-08 00:37:40,956 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81167.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:37:49,658 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=81177.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:38:07,659 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81198.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:38:10,650 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3753, 1.3438, 2.5825, 1.4768, 2.5141, 2.4927, 1.8170, 2.6442], + device='cuda:2'), covar=tensor([0.0256, 0.2282, 0.0322, 0.1578, 0.0380, 0.0421, 0.0962, 0.0242], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0157, 0.0159, 0.0170, 0.0170, 0.0173, 0.0134, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 00:38:28,045 INFO [train.py:873] (2/4) Epoch 11, batch 5600, loss[loss=0.1411, simple_loss=0.1587, pruned_loss=0.06171, over 6023.00 frames. ], tot_loss[loss=0.1312, simple_loss=0.1596, pruned_loss=0.0514, over 1973214.46 frames. ], batch size: 100, lr: 7.12e-03, grad_scale: 8.0 +2022-12-08 00:38:45,364 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.02 vs. limit=5.0 +2022-12-08 00:38:56,439 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5630, 1.8975, 2.5072, 2.1212, 2.5469, 2.5082, 2.2633, 2.3184], + device='cuda:2'), covar=tensor([0.0576, 0.2806, 0.0772, 0.1623, 0.0458, 0.1019, 0.0695, 0.1452], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0318, 0.0402, 0.0308, 0.0380, 0.0324, 0.0368, 0.0316], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 00:39:02,406 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.421e+02 2.253e+02 2.769e+02 3.640e+02 8.621e+02, threshold=5.537e+02, percent-clipped=7.0 +2022-12-08 00:39:07,311 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.80 vs. limit=2.0 +2022-12-08 00:39:37,318 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81299.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:39:38,912 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81301.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:39:55,991 INFO [train.py:873] (2/4) Epoch 11, batch 5700, loss[loss=0.1111, simple_loss=0.1499, pruned_loss=0.03612, over 14403.00 frames. ], tot_loss[loss=0.1318, simple_loss=0.1596, pruned_loss=0.05197, over 1898455.28 frames. ], batch size: 53, lr: 7.12e-03, grad_scale: 8.0 +2022-12-08 00:40:28,773 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.149e+01 2.189e+02 2.731e+02 3.467e+02 4.923e+02, threshold=5.462e+02, percent-clipped=0.0 +2022-12-08 00:40:29,848 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=81360.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:40:31,343 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=81362.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:40:57,939 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81392.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:41:23,789 INFO [train.py:873] (2/4) Epoch 11, batch 5800, loss[loss=0.1172, simple_loss=0.1226, pruned_loss=0.05586, over 2626.00 frames. ], tot_loss[loss=0.1301, simple_loss=0.1586, pruned_loss=0.05081, over 1933051.44 frames. ], batch size: 100, lr: 7.11e-03, grad_scale: 8.0 +2022-12-08 00:41:30,589 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81429.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:41:40,079 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81440.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:41:56,861 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.164e+02 2.147e+02 2.711e+02 3.541e+02 7.030e+02, threshold=5.421e+02, percent-clipped=5.0 +2022-12-08 00:42:08,746 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81472.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:42:12,697 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81477.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:42:14,542 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6717, 1.7614, 1.8975, 1.2796, 1.2507, 1.7894, 1.0644, 1.7400], + device='cuda:2'), covar=tensor([0.1488, 0.2415, 0.1069, 0.2521, 0.3292, 0.0948, 0.3493, 0.0919], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0096, 0.0091, 0.0094, 0.0113, 0.0083, 0.0125, 0.0086], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 00:42:51,273 INFO [train.py:873] (2/4) Epoch 11, batch 5900, loss[loss=0.1214, simple_loss=0.1594, pruned_loss=0.04175, over 14097.00 frames. ], tot_loss[loss=0.1295, simple_loss=0.1585, pruned_loss=0.05032, over 1921561.10 frames. ], batch size: 29, lr: 7.11e-03, grad_scale: 8.0 +2022-12-08 00:43:01,630 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.62 vs. limit=2.0 +2022-12-08 00:43:24,408 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.261e+02 2.246e+02 2.674e+02 3.531e+02 7.780e+02, threshold=5.349e+02, percent-clipped=2.0 +2022-12-08 00:43:32,781 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1366, 1.3568, 2.4756, 1.3154, 2.3094, 2.3433, 1.8773, 2.2544], + device='cuda:2'), covar=tensor([0.0625, 0.3161, 0.0572, 0.2536, 0.0894, 0.0823, 0.1385, 0.0760], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0158, 0.0160, 0.0170, 0.0172, 0.0172, 0.0135, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 00:44:19,736 INFO [train.py:873] (2/4) Epoch 11, batch 6000, loss[loss=0.1394, simple_loss=0.161, pruned_loss=0.05888, over 6969.00 frames. ], tot_loss[loss=0.13, simple_loss=0.1586, pruned_loss=0.05075, over 1912884.96 frames. ], batch size: 100, lr: 7.11e-03, grad_scale: 8.0 +2022-12-08 00:44:19,736 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 00:44:28,211 INFO [train.py:905] (2/4) Epoch 11, validation: loss=0.1286, simple_loss=0.169, pruned_loss=0.04409, over 857387.00 frames. +2022-12-08 00:44:28,213 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17841MB +2022-12-08 00:44:53,209 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4862, 2.7390, 2.7568, 2.7367, 2.2536, 2.8265, 2.5944, 1.4369], + device='cuda:2'), covar=tensor([0.1428, 0.1018, 0.0717, 0.0733, 0.1041, 0.0537, 0.0971, 0.2378], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0077, 0.0061, 0.0065, 0.0092, 0.0075, 0.0094, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 00:44:58,288 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81655.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:45:00,135 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=81657.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:45:01,820 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.338e+02 2.152e+02 2.716e+02 3.284e+02 6.077e+02, threshold=5.431e+02, percent-clipped=1.0 +2022-12-08 00:45:11,791 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81670.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:45:12,705 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2761, 1.8528, 2.2429, 1.9594, 2.3162, 2.1114, 2.0030, 2.1004], + device='cuda:2'), covar=tensor([0.0460, 0.1640, 0.0364, 0.0760, 0.0336, 0.0817, 0.0371, 0.0625], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0314, 0.0399, 0.0306, 0.0377, 0.0321, 0.0366, 0.0312], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 00:45:55,908 INFO [train.py:873] (2/4) Epoch 11, batch 6100, loss[loss=0.1708, simple_loss=0.1855, pruned_loss=0.07804, over 9533.00 frames. ], tot_loss[loss=0.1297, simple_loss=0.1586, pruned_loss=0.05043, over 1939652.50 frames. ], batch size: 100, lr: 7.10e-03, grad_scale: 8.0 +2022-12-08 00:46:04,807 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=81731.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 00:46:29,256 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.111e+02 2.193e+02 2.528e+02 3.178e+02 6.202e+02, threshold=5.055e+02, percent-clipped=2.0 +2022-12-08 00:46:40,565 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81772.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:46:54,378 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.71 vs. limit=2.0 +2022-12-08 00:47:23,442 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=81820.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:47:24,229 INFO [train.py:873] (2/4) Epoch 11, batch 6200, loss[loss=0.1826, simple_loss=0.187, pruned_loss=0.08911, over 8595.00 frames. ], tot_loss[loss=0.1297, simple_loss=0.1583, pruned_loss=0.05056, over 1930677.75 frames. ], batch size: 100, lr: 7.10e-03, grad_scale: 8.0 +2022-12-08 00:47:58,180 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.426e+02 2.324e+02 2.708e+02 3.471e+02 1.552e+03, threshold=5.417e+02, percent-clipped=7.0 +2022-12-08 00:48:04,850 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 00:48:52,506 INFO [train.py:873] (2/4) Epoch 11, batch 6300, loss[loss=0.128, simple_loss=0.1551, pruned_loss=0.05043, over 14250.00 frames. ], tot_loss[loss=0.129, simple_loss=0.1583, pruned_loss=0.04981, over 2038224.85 frames. ], batch size: 80, lr: 7.09e-03, grad_scale: 8.0 +2022-12-08 00:49:22,564 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81955.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:49:24,331 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=81957.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:49:25,808 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.231e+02 2.360e+02 2.936e+02 3.429e+02 7.128e+02, threshold=5.872e+02, percent-clipped=5.0 +2022-12-08 00:49:50,531 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=81987.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:50:04,799 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82003.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:50:06,453 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82005.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:50:20,252 INFO [train.py:873] (2/4) Epoch 11, batch 6400, loss[loss=0.1989, simple_loss=0.1801, pruned_loss=0.1088, over 1311.00 frames. ], tot_loss[loss=0.1277, simple_loss=0.1575, pruned_loss=0.04894, over 1992646.39 frames. ], batch size: 100, lr: 7.09e-03, grad_scale: 8.0 +2022-12-08 00:50:24,737 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82026.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 00:50:43,841 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82048.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:50:49,994 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82055.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:50:53,435 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.418e+02 2.188e+02 2.645e+02 3.155e+02 5.407e+02, threshold=5.290e+02, percent-clipped=0.0 +2022-12-08 00:51:43,651 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82116.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:51:47,738 INFO [train.py:873] (2/4) Epoch 11, batch 6500, loss[loss=0.149, simple_loss=0.1729, pruned_loss=0.06256, over 9538.00 frames. ], tot_loss[loss=0.1284, simple_loss=0.158, pruned_loss=0.0494, over 2002713.12 frames. ], batch size: 100, lr: 7.08e-03, grad_scale: 8.0 +2022-12-08 00:52:02,556 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 00:52:20,600 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.664e+01 2.301e+02 3.022e+02 3.856e+02 8.544e+02, threshold=6.044e+02, percent-clipped=9.0 +2022-12-08 00:52:21,285 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.84 vs. limit=2.0 +2022-12-08 00:53:14,952 INFO [train.py:873] (2/4) Epoch 11, batch 6600, loss[loss=0.1258, simple_loss=0.1556, pruned_loss=0.04803, over 10319.00 frames. ], tot_loss[loss=0.127, simple_loss=0.1572, pruned_loss=0.04846, over 2032517.10 frames. ], batch size: 100, lr: 7.08e-03, grad_scale: 8.0 +2022-12-08 00:53:48,294 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.365e+02 2.109e+02 2.550e+02 3.112e+02 5.386e+02, threshold=5.100e+02, percent-clipped=0.0 +2022-12-08 00:54:43,678 INFO [train.py:873] (2/4) Epoch 11, batch 6700, loss[loss=0.1116, simple_loss=0.1494, pruned_loss=0.03691, over 14166.00 frames. ], tot_loss[loss=0.1275, simple_loss=0.1576, pruned_loss=0.0487, over 2023446.76 frames. ], batch size: 35, lr: 7.08e-03, grad_scale: 8.0 +2022-12-08 00:54:48,095 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=82326.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:55:02,410 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82343.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:55:05,115 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82346.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:55:16,416 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.150e+02 2.173e+02 2.670e+02 3.578e+02 7.367e+02, threshold=5.339e+02, percent-clipped=2.0 +2022-12-08 00:55:29,452 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82374.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:55:33,742 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7143, 1.6358, 1.7478, 1.7780, 1.8476, 1.5021, 1.3163, 1.2157], + device='cuda:2'), covar=tensor([0.0432, 0.0395, 0.0557, 0.0340, 0.0376, 0.0337, 0.0323, 0.0750], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0016, 0.0014, 0.0015, 0.0015, 0.0025, 0.0020, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 00:55:59,141 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82407.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:56:02,567 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82411.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:56:11,100 INFO [train.py:873] (2/4) Epoch 11, batch 6800, loss[loss=0.1573, simple_loss=0.1721, pruned_loss=0.07128, over 5996.00 frames. ], tot_loss[loss=0.1275, simple_loss=0.1578, pruned_loss=0.04859, over 2027307.09 frames. ], batch size: 100, lr: 7.07e-03, grad_scale: 8.0 +2022-12-08 00:56:43,725 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.232e+02 2.471e+02 3.089e+02 4.263e+02 8.393e+02, threshold=6.178e+02, percent-clipped=10.0 +2022-12-08 00:57:01,404 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.67 vs. limit=2.0 +2022-12-08 00:57:14,082 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82493.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 00:57:25,015 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8585, 1.5364, 1.8881, 1.4828, 1.6770, 1.0090, 1.6495, 2.0124], + device='cuda:2'), covar=tensor([0.1050, 0.0852, 0.0801, 0.2173, 0.1747, 0.0939, 0.0752, 0.0603], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0026, 0.0029, 0.0026, 0.0028, 0.0039, 0.0027, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 00:57:35,482 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6846, 4.3723, 4.2598, 4.7594, 4.4478, 4.1987, 4.6904, 4.0707], + device='cuda:2'), covar=tensor([0.0337, 0.0899, 0.0354, 0.0385, 0.0703, 0.0706, 0.0539, 0.0465], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0258, 0.0181, 0.0181, 0.0173, 0.0145, 0.0268, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 00:57:37,891 INFO [train.py:873] (2/4) Epoch 11, batch 6900, loss[loss=0.1449, simple_loss=0.176, pruned_loss=0.05693, over 14466.00 frames. ], tot_loss[loss=0.1288, simple_loss=0.1584, pruned_loss=0.04958, over 2021310.02 frames. ], batch size: 51, lr: 7.07e-03, grad_scale: 8.0 +2022-12-08 00:58:03,895 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.32 vs. limit=5.0 +2022-12-08 00:58:06,348 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82554.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 00:58:11,135 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.611e+02 2.298e+02 3.049e+02 3.803e+02 8.603e+02, threshold=6.098e+02, percent-clipped=2.0 +2022-12-08 00:58:20,039 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3042, 4.0929, 3.8295, 3.9521, 4.1777, 4.2131, 4.2953, 4.3042], + device='cuda:2'), covar=tensor([0.0862, 0.0553, 0.1860, 0.2565, 0.0694, 0.0759, 0.0997, 0.0784], + device='cuda:2'), in_proj_covar=tensor([0.0371, 0.0261, 0.0432, 0.0546, 0.0326, 0.0419, 0.0393, 0.0367], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 00:58:20,125 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82569.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:58:28,893 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6240, 1.6632, 2.8606, 1.9910, 2.6750, 1.7030, 2.1534, 2.6099], + device='cuda:2'), covar=tensor([0.1340, 0.4691, 0.0707, 0.5440, 0.1007, 0.4029, 0.1509, 0.0988], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0213, 0.0200, 0.0289, 0.0217, 0.0217, 0.0216, 0.0206], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 00:59:01,401 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0608, 3.2442, 3.0567, 3.1703, 2.3800, 3.1954, 3.0665, 1.6524], + device='cuda:2'), covar=tensor([0.1493, 0.1146, 0.1315, 0.0549, 0.0970, 0.0444, 0.1120, 0.2169], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0076, 0.0061, 0.0064, 0.0092, 0.0075, 0.0093, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 00:59:06,537 INFO [train.py:873] (2/4) Epoch 11, batch 7000, loss[loss=0.08659, simple_loss=0.1222, pruned_loss=0.0255, over 13584.00 frames. ], tot_loss[loss=0.1292, simple_loss=0.1584, pruned_loss=0.04995, over 1971054.21 frames. ], batch size: 17, lr: 7.06e-03, grad_scale: 8.0 +2022-12-08 00:59:11,592 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.70 vs. limit=5.0 +2022-12-08 00:59:14,700 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=82630.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:59:26,125 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=82643.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 00:59:39,576 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.371e+02 2.154e+02 2.641e+02 3.288e+02 6.152e+02, threshold=5.282e+02, percent-clipped=1.0 +2022-12-08 00:59:58,003 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4630, 3.3277, 3.0723, 3.1528, 3.4100, 3.3774, 3.4611, 3.4455], + device='cuda:2'), covar=tensor([0.1065, 0.0716, 0.2351, 0.2728, 0.0935, 0.1086, 0.1162, 0.0878], + device='cuda:2'), in_proj_covar=tensor([0.0370, 0.0260, 0.0432, 0.0547, 0.0327, 0.0422, 0.0388, 0.0365], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:00:08,002 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82691.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:00:17,858 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82702.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:00:25,995 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=82711.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:00:34,804 INFO [train.py:873] (2/4) Epoch 11, batch 7100, loss[loss=0.132, simple_loss=0.1679, pruned_loss=0.0481, over 14487.00 frames. ], tot_loss[loss=0.1299, simple_loss=0.1589, pruned_loss=0.05048, over 1971196.61 frames. ], batch size: 49, lr: 7.06e-03, grad_scale: 8.0 +2022-12-08 01:01:07,646 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.406e+02 2.263e+02 2.729e+02 3.602e+02 7.969e+02, threshold=5.458e+02, percent-clipped=3.0 +2022-12-08 01:01:07,748 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=82759.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:01:16,052 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3615, 1.4298, 2.5710, 1.4940, 2.4839, 2.5051, 1.9086, 2.6253], + device='cuda:2'), covar=tensor([0.0274, 0.2262, 0.0351, 0.1682, 0.0452, 0.0434, 0.0993, 0.0262], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0157, 0.0159, 0.0169, 0.0170, 0.0172, 0.0132, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 01:01:18,364 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.34 vs. limit=2.0 +2022-12-08 01:01:25,456 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8751, 2.6591, 2.7345, 2.9183, 2.7941, 2.8471, 2.9465, 2.4709], + device='cuda:2'), covar=tensor([0.0713, 0.1231, 0.0595, 0.0579, 0.0838, 0.0573, 0.0746, 0.0730], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0259, 0.0182, 0.0182, 0.0175, 0.0147, 0.0270, 0.0162], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 01:01:39,260 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7817, 0.7920, 0.7130, 0.7774, 0.7209, 0.4118, 0.5434, 0.6518], + device='cuda:2'), covar=tensor([0.0110, 0.0124, 0.0102, 0.0129, 0.0151, 0.0338, 0.0189, 0.0256], + device='cuda:2'), in_proj_covar=tensor([0.0015, 0.0016, 0.0014, 0.0015, 0.0015, 0.0025, 0.0020, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:01:43,754 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0445, 1.1202, 1.0025, 1.0386, 1.0720, 0.5467, 0.9176, 1.0971], + device='cuda:2'), covar=tensor([0.0504, 0.0792, 0.0558, 0.0649, 0.0399, 0.0685, 0.1053, 0.0791], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0027, 0.0029, 0.0026, 0.0028, 0.0040, 0.0028, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:01:47,615 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-12-08 01:02:03,355 INFO [train.py:873] (2/4) Epoch 11, batch 7200, loss[loss=0.1227, simple_loss=0.1545, pruned_loss=0.04545, over 14369.00 frames. ], tot_loss[loss=0.1302, simple_loss=0.159, pruned_loss=0.05072, over 1916848.27 frames. ], batch size: 41, lr: 7.05e-03, grad_scale: 8.0 +2022-12-08 01:02:09,694 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7003, 2.4044, 3.4716, 2.7685, 3.4422, 3.4707, 3.2984, 2.9028], + device='cuda:2'), covar=tensor([0.0723, 0.3042, 0.0972, 0.1904, 0.0826, 0.0871, 0.1300, 0.2074], + device='cuda:2'), in_proj_covar=tensor([0.0348, 0.0312, 0.0399, 0.0306, 0.0376, 0.0323, 0.0363, 0.0311], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:02:28,438 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82849.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 01:02:36,830 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.101e+02 2.276e+02 3.049e+02 3.669e+02 5.112e+02, threshold=6.097e+02, percent-clipped=0.0 +2022-12-08 01:03:12,144 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1261, 2.4079, 2.4259, 2.4969, 1.9823, 2.5199, 2.2551, 1.3517], + device='cuda:2'), covar=tensor([0.1280, 0.0868, 0.0935, 0.0494, 0.1138, 0.0537, 0.1214, 0.2335], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0077, 0.0062, 0.0065, 0.0094, 0.0076, 0.0094, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 01:03:31,180 INFO [train.py:873] (2/4) Epoch 11, batch 7300, loss[loss=0.1422, simple_loss=0.164, pruned_loss=0.06023, over 5991.00 frames. ], tot_loss[loss=0.129, simple_loss=0.1578, pruned_loss=0.05012, over 1863190.39 frames. ], batch size: 100, lr: 7.05e-03, grad_scale: 8.0 +2022-12-08 01:03:34,875 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=82925.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:03:40,123 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2679, 1.6941, 2.5355, 2.0316, 2.3226, 1.7002, 1.9457, 2.2386], + device='cuda:2'), covar=tensor([0.1632, 0.2760, 0.0413, 0.2166, 0.0838, 0.2188, 0.1025, 0.1021], + device='cuda:2'), in_proj_covar=tensor([0.0245, 0.0215, 0.0199, 0.0290, 0.0217, 0.0217, 0.0216, 0.0205], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:04:04,503 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.016e+02 2.470e+02 3.084e+02 3.689e+02 1.024e+03, threshold=6.168e+02, percent-clipped=3.0 +2022-12-08 01:04:08,531 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=82963.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:04:09,405 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5369, 2.3514, 3.3994, 2.5814, 3.4220, 3.3623, 3.1607, 2.8550], + device='cuda:2'), covar=tensor([0.0888, 0.2833, 0.1221, 0.2077, 0.0868, 0.0872, 0.1432, 0.1867], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0314, 0.0402, 0.0307, 0.0378, 0.0324, 0.0366, 0.0313], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:04:21,045 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.49 vs. limit=5.0 +2022-12-08 01:04:43,197 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83002.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:04:43,289 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2106, 2.1064, 3.1552, 3.3400, 3.2310, 2.1539, 3.2270, 2.4003], + device='cuda:2'), covar=tensor([0.0390, 0.0867, 0.0677, 0.0357, 0.0369, 0.1306, 0.0338, 0.0888], + device='cuda:2'), in_proj_covar=tensor([0.0283, 0.0248, 0.0365, 0.0311, 0.0257, 0.0295, 0.0293, 0.0275], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 01:04:59,090 INFO [train.py:873] (2/4) Epoch 11, batch 7400, loss[loss=0.1447, simple_loss=0.1679, pruned_loss=0.0608, over 11994.00 frames. ], tot_loss[loss=0.1291, simple_loss=0.1581, pruned_loss=0.05008, over 1931175.08 frames. ], batch size: 100, lr: 7.05e-03, grad_scale: 8.0 +2022-12-08 01:05:02,198 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=83024.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:05:25,165 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83050.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:05:32,577 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.163e+02 2.140e+02 2.552e+02 3.210e+02 1.054e+03, threshold=5.104e+02, percent-clipped=1.0 +2022-12-08 01:05:40,678 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6197, 2.2543, 2.4814, 1.5742, 2.1793, 2.4344, 2.6305, 2.2243], + device='cuda:2'), covar=tensor([0.0725, 0.0818, 0.0995, 0.1824, 0.1141, 0.0744, 0.0699, 0.1447], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0181, 0.0136, 0.0126, 0.0134, 0.0141, 0.0120, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 01:05:43,620 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-12-08 01:06:26,610 INFO [train.py:873] (2/4) Epoch 11, batch 7500, loss[loss=0.1549, simple_loss=0.1743, pruned_loss=0.06779, over 10383.00 frames. ], tot_loss[loss=0.1299, simple_loss=0.1583, pruned_loss=0.05075, over 1945956.05 frames. ], batch size: 100, lr: 7.04e-03, grad_scale: 16.0 +2022-12-08 01:06:50,680 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83149.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 01:06:57,278 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3211, 2.9677, 2.8820, 1.8281, 2.7844, 3.1399, 3.3288, 2.5800], + device='cuda:2'), covar=tensor([0.0612, 0.1354, 0.1044, 0.2105, 0.0915, 0.0618, 0.0709, 0.1428], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0181, 0.0136, 0.0127, 0.0134, 0.0141, 0.0119, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 01:06:58,814 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.424e+02 2.439e+02 2.915e+02 3.570e+02 7.862e+02, threshold=5.829e+02, percent-clipped=6.0 +2022-12-08 01:07:53,460 INFO [train.py:873] (2/4) Epoch 12, batch 0, loss[loss=0.1403, simple_loss=0.1695, pruned_loss=0.05562, over 14253.00 frames. ], tot_loss[loss=0.1403, simple_loss=0.1695, pruned_loss=0.05562, over 14253.00 frames. ], batch size: 46, lr: 6.74e-03, grad_scale: 16.0 +2022-12-08 01:07:53,460 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 01:07:57,202 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1150, 1.5392, 4.1567, 2.0706, 4.1463, 4.2386, 3.0351, 4.5152], + device='cuda:2'), covar=tensor([0.0156, 0.3026, 0.0207, 0.1796, 0.0195, 0.0207, 0.0565, 0.0102], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0158, 0.0159, 0.0168, 0.0169, 0.0173, 0.0133, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 01:07:59,123 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7332, 4.9381, 4.6400, 4.9129, 4.2225, 4.3586, 4.7565, 4.4587], + device='cuda:2'), covar=tensor([0.0666, 0.0434, 0.0836, 0.0682, 0.1275, 0.0453, 0.0526, 0.1032], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0127, 0.0134, 0.0143, 0.0135, 0.0112, 0.0155, 0.0131], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 01:08:00,686 INFO [train.py:905] (2/4) Epoch 12, validation: loss=0.1326, simple_loss=0.1738, pruned_loss=0.04568, over 857387.00 frames. +2022-12-08 01:08:00,686 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17841MB +2022-12-08 01:08:12,339 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83196.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:08:13,525 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83197.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 01:08:26,206 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0887, 2.1996, 4.1233, 2.8466, 3.9503, 2.1823, 2.9547, 3.9443], + device='cuda:2'), covar=tensor([0.0491, 0.4098, 0.0445, 0.6406, 0.0494, 0.3232, 0.1423, 0.0364], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0217, 0.0202, 0.0289, 0.0219, 0.0219, 0.0218, 0.0207], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:08:38,636 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83225.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:08:48,951 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.66 vs. limit=2.0 +2022-12-08 01:09:07,330 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=83257.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:09:08,851 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.351e+01 1.849e+02 2.933e+02 3.942e+02 9.490e+02, threshold=5.866e+02, percent-clipped=6.0 +2022-12-08 01:09:21,074 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83273.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:09:30,011 INFO [train.py:873] (2/4) Epoch 12, batch 100, loss[loss=0.1267, simple_loss=0.1608, pruned_loss=0.04625, over 14278.00 frames. ], tot_loss[loss=0.1256, simple_loss=0.1569, pruned_loss=0.0471, over 872276.71 frames. ], batch size: 44, lr: 6.74e-03, grad_scale: 16.0 +2022-12-08 01:09:30,900 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83284.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:09:53,196 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5142, 1.4014, 3.5878, 1.6028, 3.4952, 3.6272, 2.5842, 3.9100], + device='cuda:2'), covar=tensor([0.0245, 0.3498, 0.0447, 0.2424, 0.0727, 0.0482, 0.0979, 0.0196], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0158, 0.0159, 0.0169, 0.0170, 0.0174, 0.0133, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 01:10:00,920 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=83319.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:10:14,807 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.01 vs. limit=2.0 +2022-12-08 01:10:24,246 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=83345.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:10:36,280 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.323e+02 2.243e+02 2.760e+02 3.223e+02 5.001e+02, threshold=5.521e+02, percent-clipped=0.0 +2022-12-08 01:10:53,278 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6763, 4.6994, 5.0955, 4.3192, 4.8698, 5.2347, 1.9718, 4.5255], + device='cuda:2'), covar=tensor([0.0229, 0.0272, 0.0336, 0.0323, 0.0283, 0.0113, 0.2888, 0.0282], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0165, 0.0137, 0.0133, 0.0197, 0.0131, 0.0154, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:10:57,437 INFO [train.py:873] (2/4) Epoch 12, batch 200, loss[loss=0.1503, simple_loss=0.1592, pruned_loss=0.0707, over 4938.00 frames. ], tot_loss[loss=0.126, simple_loss=0.1566, pruned_loss=0.04771, over 1296113.93 frames. ], batch size: 100, lr: 6.74e-03, grad_scale: 16.0 +2022-12-08 01:11:55,174 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2795, 4.0684, 3.7393, 3.9180, 4.1297, 4.1474, 4.2966, 4.2609], + device='cuda:2'), covar=tensor([0.0838, 0.0610, 0.2068, 0.2739, 0.0751, 0.0809, 0.0950, 0.0804], + device='cuda:2'), in_proj_covar=tensor([0.0370, 0.0260, 0.0433, 0.0545, 0.0325, 0.0419, 0.0394, 0.0364], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:12:04,538 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.103e+02 2.214e+02 2.674e+02 3.340e+02 6.097e+02, threshold=5.348e+02, percent-clipped=3.0 +2022-12-08 01:12:06,231 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0054, 2.0075, 2.0693, 2.1310, 2.0082, 1.6593, 1.3584, 1.8118], + device='cuda:2'), covar=tensor([0.0561, 0.0462, 0.0471, 0.0323, 0.0511, 0.1368, 0.2068, 0.0428], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0165, 0.0138, 0.0134, 0.0197, 0.0132, 0.0154, 0.0181], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:12:24,718 INFO [train.py:873] (2/4) Epoch 12, batch 300, loss[loss=0.1739, simple_loss=0.1628, pruned_loss=0.09246, over 1193.00 frames. ], tot_loss[loss=0.1253, simple_loss=0.1561, pruned_loss=0.04731, over 1574884.43 frames. ], batch size: 100, lr: 6.73e-03, grad_scale: 8.0 +2022-12-08 01:13:03,855 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4575, 4.2441, 4.1386, 4.5319, 4.0880, 3.8404, 4.4784, 4.3433], + device='cuda:2'), covar=tensor([0.0646, 0.0730, 0.0727, 0.0533, 0.0771, 0.0537, 0.0613, 0.0718], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0129, 0.0136, 0.0144, 0.0137, 0.0113, 0.0156, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 01:13:17,138 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4287, 2.7678, 2.6761, 2.7006, 2.1964, 2.7272, 2.6000, 1.2914], + device='cuda:2'), covar=tensor([0.1668, 0.0817, 0.0811, 0.0706, 0.1082, 0.0573, 0.0920, 0.2744], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0077, 0.0063, 0.0064, 0.0092, 0.0076, 0.0094, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 01:13:25,766 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=83552.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:13:32,202 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.217e+02 2.254e+02 2.674e+02 3.399e+02 5.874e+02, threshold=5.349e+02, percent-clipped=4.0 +2022-12-08 01:13:43,725 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-12-08 01:13:53,020 INFO [train.py:873] (2/4) Epoch 12, batch 400, loss[loss=0.1681, simple_loss=0.1584, pruned_loss=0.0889, over 1167.00 frames. ], tot_loss[loss=0.1261, simple_loss=0.1565, pruned_loss=0.04781, over 1705007.98 frames. ], batch size: 100, lr: 6.73e-03, grad_scale: 8.0 +2022-12-08 01:13:58,377 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0007, 2.0123, 1.9810, 1.6800, 2.1134, 0.9840, 1.8508, 2.0144], + device='cuda:2'), covar=tensor([0.1194, 0.0701, 0.1270, 0.1210, 0.1419, 0.1029, 0.1075, 0.0753], + device='cuda:2'), in_proj_covar=tensor([0.0026, 0.0026, 0.0028, 0.0026, 0.0028, 0.0039, 0.0027, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:14:13,913 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0588, 1.9038, 4.5522, 4.0748, 4.1359, 4.5983, 4.3129, 4.6669], + device='cuda:2'), covar=tensor([0.1378, 0.1389, 0.0092, 0.0199, 0.0198, 0.0112, 0.0162, 0.0100], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0159, 0.0129, 0.0167, 0.0144, 0.0139, 0.0120, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:14:16,714 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.8387, 5.4688, 5.0553, 5.6593, 5.3378, 5.1335, 5.7777, 5.6822], + device='cuda:2'), covar=tensor([0.0485, 0.0547, 0.0863, 0.0626, 0.0587, 0.0442, 0.0432, 0.0481], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0129, 0.0135, 0.0144, 0.0137, 0.0113, 0.0155, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 01:14:25,416 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83619.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:14:43,810 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=83640.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:14:55,798 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3325, 1.4171, 2.5422, 1.4915, 2.4976, 2.4643, 2.0137, 2.6097], + device='cuda:2'), covar=tensor([0.0291, 0.2367, 0.0386, 0.1842, 0.0434, 0.0541, 0.1075, 0.0333], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0158, 0.0160, 0.0171, 0.0171, 0.0175, 0.0135, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:15:01,083 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.765e+01 2.109e+02 2.628e+02 3.451e+02 6.835e+02, threshold=5.257e+02, percent-clipped=4.0 +2022-12-08 01:15:07,461 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83667.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:15:21,386 INFO [train.py:873] (2/4) Epoch 12, batch 500, loss[loss=0.1303, simple_loss=0.1684, pruned_loss=0.04609, over 14088.00 frames. ], tot_loss[loss=0.1264, simple_loss=0.1566, pruned_loss=0.04809, over 1791359.21 frames. ], batch size: 29, lr: 6.72e-03, grad_scale: 8.0 +2022-12-08 01:15:25,865 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8744, 1.6313, 3.3909, 3.1036, 3.2430, 3.3920, 2.8397, 3.4428], + device='cuda:2'), covar=tensor([0.1431, 0.1493, 0.0135, 0.0313, 0.0237, 0.0140, 0.0280, 0.0140], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0160, 0.0130, 0.0168, 0.0145, 0.0140, 0.0121, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:15:51,856 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.03 vs. limit=5.0 +2022-12-08 01:16:28,956 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.131e+02 2.083e+02 2.759e+02 3.343e+02 7.485e+02, threshold=5.518e+02, percent-clipped=2.0 +2022-12-08 01:16:29,118 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83760.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:16:49,377 INFO [train.py:873] (2/4) Epoch 12, batch 600, loss[loss=0.104, simple_loss=0.1421, pruned_loss=0.0329, over 11978.00 frames. ], tot_loss[loss=0.1278, simple_loss=0.1571, pruned_loss=0.04925, over 1820929.80 frames. ], batch size: 100, lr: 6.72e-03, grad_scale: 8.0 +2022-12-08 01:17:07,192 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8343, 1.9754, 5.0311, 4.5996, 4.4415, 5.2255, 4.8782, 5.2194], + device='cuda:2'), covar=tensor([0.2038, 0.1951, 0.0156, 0.0254, 0.0265, 0.0177, 0.0139, 0.0200], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0129, 0.0167, 0.0145, 0.0139, 0.0121, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:17:22,705 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=83821.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:17:25,641 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8813, 1.2540, 2.0322, 1.3023, 1.9594, 2.0702, 1.6316, 2.1429], + device='cuda:2'), covar=tensor([0.0296, 0.1981, 0.0476, 0.1726, 0.0532, 0.0527, 0.1121, 0.0351], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0159, 0.0162, 0.0172, 0.0172, 0.0175, 0.0136, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:17:27,514 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9167, 1.5468, 3.0866, 1.6260, 3.1724, 3.0284, 2.1167, 3.2300], + device='cuda:2'), covar=tensor([0.0263, 0.2700, 0.0403, 0.1945, 0.0331, 0.0448, 0.1128, 0.0209], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0159, 0.0162, 0.0172, 0.0172, 0.0175, 0.0136, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:17:28,401 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5023, 2.0053, 2.4340, 2.5954, 2.4772, 2.0020, 2.5197, 2.2443], + device='cuda:2'), covar=tensor([0.0358, 0.0696, 0.0454, 0.0329, 0.0430, 0.0942, 0.0311, 0.0501], + device='cuda:2'), in_proj_covar=tensor([0.0284, 0.0250, 0.0369, 0.0317, 0.0259, 0.0297, 0.0296, 0.0276], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 01:17:50,249 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83852.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:17:57,119 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.264e+02 2.241e+02 2.685e+02 3.305e+02 1.014e+03, threshold=5.370e+02, percent-clipped=1.0 +2022-12-08 01:18:07,030 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3444, 2.6396, 4.1406, 3.1306, 4.1221, 3.9008, 3.9179, 3.4550], + device='cuda:2'), covar=tensor([0.0739, 0.3218, 0.0951, 0.1841, 0.0873, 0.0943, 0.1463, 0.1787], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0315, 0.0392, 0.0300, 0.0375, 0.0319, 0.0365, 0.0309], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:18:17,199 INFO [train.py:873] (2/4) Epoch 12, batch 700, loss[loss=0.09999, simple_loss=0.1381, pruned_loss=0.03092, over 13931.00 frames. ], tot_loss[loss=0.127, simple_loss=0.1565, pruned_loss=0.0487, over 1886042.26 frames. ], batch size: 20, lr: 6.72e-03, grad_scale: 8.0 +2022-12-08 01:18:32,205 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83900.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:19:06,789 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=83940.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:19:06,868 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9617, 2.0942, 2.7192, 2.2912, 2.8287, 2.7884, 2.6417, 2.3922], + device='cuda:2'), covar=tensor([0.0662, 0.2726, 0.0845, 0.1721, 0.0770, 0.0913, 0.0885, 0.1608], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0313, 0.0391, 0.0300, 0.0374, 0.0317, 0.0364, 0.0308], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:19:14,788 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83949.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:19:23,841 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.357e+02 2.241e+02 2.717e+02 3.335e+02 9.146e+02, threshold=5.433e+02, percent-clipped=9.0 +2022-12-08 01:19:44,264 INFO [train.py:873] (2/4) Epoch 12, batch 800, loss[loss=0.1053, simple_loss=0.1457, pruned_loss=0.03247, over 14263.00 frames. ], tot_loss[loss=0.128, simple_loss=0.1572, pruned_loss=0.04933, over 1975421.33 frames. ], batch size: 63, lr: 6.71e-03, grad_scale: 8.0 +2022-12-08 01:19:46,354 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=83985.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:19:48,770 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=83988.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:20:08,154 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84010.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 01:20:14,422 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0385, 1.7577, 3.2674, 2.3308, 3.1137, 1.7661, 2.3052, 3.1215], + device='cuda:2'), covar=tensor([0.0857, 0.5029, 0.0659, 0.6217, 0.0859, 0.4159, 0.1921, 0.0591], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0217, 0.0202, 0.0288, 0.0220, 0.0217, 0.0218, 0.0206], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:20:37,343 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5619, 4.3011, 4.2463, 4.6513, 4.1077, 3.7153, 4.6172, 4.4497], + device='cuda:2'), covar=tensor([0.0643, 0.0686, 0.0670, 0.0518, 0.0862, 0.0675, 0.0545, 0.0719], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0129, 0.0135, 0.0144, 0.0138, 0.0114, 0.0157, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 01:20:40,296 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84046.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:20:52,177 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.103e+02 2.165e+02 2.773e+02 3.425e+02 7.521e+02, threshold=5.546e+02, percent-clipped=4.0 +2022-12-08 01:21:05,966 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7784, 0.7267, 0.7516, 0.7618, 0.6013, 0.5045, 0.6408, 0.7537], + device='cuda:2'), covar=tensor([0.0125, 0.0110, 0.0095, 0.0110, 0.0138, 0.0283, 0.0159, 0.0193], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0016, 0.0014, 0.0016, 0.0015, 0.0026, 0.0020, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:21:10,338 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-12-08 01:21:11,644 INFO [train.py:873] (2/4) Epoch 12, batch 900, loss[loss=0.1511, simple_loss=0.1715, pruned_loss=0.06541, over 8611.00 frames. ], tot_loss[loss=0.1277, simple_loss=0.1566, pruned_loss=0.04938, over 1938257.47 frames. ], batch size: 100, lr: 6.71e-03, grad_scale: 8.0 +2022-12-08 01:21:24,394 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84097.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:21:40,366 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84116.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:22:11,934 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5409, 3.8546, 3.4646, 3.8170, 2.6020, 3.7733, 3.6078, 2.0481], + device='cuda:2'), covar=tensor([0.1449, 0.0619, 0.1296, 0.0413, 0.0989, 0.0448, 0.0869, 0.2258], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0078, 0.0062, 0.0064, 0.0093, 0.0076, 0.0094, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 01:22:17,048 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84158.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:22:18,574 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.043e+02 2.201e+02 2.633e+02 3.303e+02 7.770e+02, threshold=5.265e+02, percent-clipped=5.0 +2022-12-08 01:22:38,829 INFO [train.py:873] (2/4) Epoch 12, batch 1000, loss[loss=0.1298, simple_loss=0.1565, pruned_loss=0.05157, over 14286.00 frames. ], tot_loss[loss=0.1277, simple_loss=0.1572, pruned_loss=0.0491, over 1992824.33 frames. ], batch size: 39, lr: 6.70e-03, grad_scale: 8.0 +2022-12-08 01:22:52,891 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84199.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:23:00,472 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3928, 2.0739, 2.3000, 1.5845, 2.0443, 2.3212, 2.3925, 1.9989], + device='cuda:2'), covar=tensor([0.0648, 0.0693, 0.0919, 0.1414, 0.1001, 0.0640, 0.0532, 0.1322], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0180, 0.0138, 0.0125, 0.0135, 0.0142, 0.0121, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 01:23:20,906 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.62 vs. limit=2.0 +2022-12-08 01:23:25,698 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84236.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:23:46,259 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.153e+02 2.176e+02 2.741e+02 3.221e+02 6.313e+02, threshold=5.481e+02, percent-clipped=1.0 +2022-12-08 01:23:46,450 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84260.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:24:06,155 INFO [train.py:873] (2/4) Epoch 12, batch 1100, loss[loss=0.145, simple_loss=0.1472, pruned_loss=0.07143, over 2602.00 frames. ], tot_loss[loss=0.1278, simple_loss=0.1571, pruned_loss=0.04923, over 1958105.13 frames. ], batch size: 100, lr: 6.70e-03, grad_scale: 8.0 +2022-12-08 01:24:11,140 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4939, 1.5479, 1.7674, 1.5188, 1.5577, 1.2893, 1.5358, 1.0266], + device='cuda:2'), covar=tensor([0.0306, 0.0438, 0.0277, 0.0413, 0.0325, 0.0362, 0.0249, 0.0543], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0016, 0.0015, 0.0015, 0.0015, 0.0026, 0.0021, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:24:18,764 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84297.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:24:25,600 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84305.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 01:24:52,910 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84336.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:24:56,743 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84341.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:25:13,031 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.329e+02 2.219e+02 2.734e+02 3.210e+02 6.975e+02, threshold=5.467e+02, percent-clipped=2.0 +2022-12-08 01:25:33,229 INFO [train.py:873] (2/4) Epoch 12, batch 1200, loss[loss=0.1192, simple_loss=0.1529, pruned_loss=0.04282, over 14294.00 frames. ], tot_loss[loss=0.1286, simple_loss=0.1574, pruned_loss=0.04985, over 1873846.10 frames. ], batch size: 63, lr: 6.70e-03, grad_scale: 8.0 +2022-12-08 01:25:37,186 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-12-08 01:25:45,490 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84397.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:26:02,540 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84416.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:26:25,265 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6081, 2.0037, 2.6950, 2.7766, 2.6352, 1.9047, 2.7642, 2.1215], + device='cuda:2'), covar=tensor([0.0360, 0.0859, 0.0563, 0.0370, 0.0412, 0.1137, 0.0260, 0.0812], + device='cuda:2'), in_proj_covar=tensor([0.0288, 0.0252, 0.0374, 0.0322, 0.0263, 0.0298, 0.0300, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:26:34,083 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84453.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:26:40,236 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.339e+02 2.298e+02 2.755e+02 3.792e+02 6.830e+02, threshold=5.510e+02, percent-clipped=5.0 +2022-12-08 01:26:43,682 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84464.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:27:00,265 INFO [train.py:873] (2/4) Epoch 12, batch 1300, loss[loss=0.1196, simple_loss=0.154, pruned_loss=0.04265, over 14173.00 frames. ], tot_loss[loss=0.128, simple_loss=0.157, pruned_loss=0.04945, over 1905344.22 frames. ], batch size: 84, lr: 6.69e-03, grad_scale: 8.0 +2022-12-08 01:27:36,878 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84525.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:28:03,418 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84555.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:28:07,761 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.298e+02 2.228e+02 2.625e+02 3.303e+02 8.483e+02, threshold=5.249e+02, percent-clipped=5.0 +2022-12-08 01:28:28,447 INFO [train.py:873] (2/4) Epoch 12, batch 1400, loss[loss=0.1117, simple_loss=0.1493, pruned_loss=0.03704, over 14192.00 frames. ], tot_loss[loss=0.1277, simple_loss=0.1573, pruned_loss=0.04908, over 1945735.14 frames. ], batch size: 57, lr: 6.69e-03, grad_scale: 8.0 +2022-12-08 01:28:31,083 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84586.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:28:36,476 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84592.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:28:48,176 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84605.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:28:52,510 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3591, 2.2426, 4.3668, 3.0380, 4.0803, 2.1988, 3.3300, 4.1520], + device='cuda:2'), covar=tensor([0.0525, 0.3951, 0.0366, 0.5753, 0.0842, 0.3100, 0.1167, 0.0370], + device='cuda:2'), in_proj_covar=tensor([0.0250, 0.0216, 0.0202, 0.0291, 0.0223, 0.0217, 0.0217, 0.0206], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:29:00,440 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8526, 3.2926, 2.6532, 4.0648, 3.8705, 3.8363, 3.4056, 2.6453], + device='cuda:2'), covar=tensor([0.0992, 0.1690, 0.3965, 0.0615, 0.0900, 0.1641, 0.1350, 0.3911], + device='cuda:2'), in_proj_covar=tensor([0.0266, 0.0292, 0.0265, 0.0255, 0.0307, 0.0293, 0.0252, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:29:12,511 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84633.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:29:19,419 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84641.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:29:20,978 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1503, 2.0043, 2.1030, 2.2021, 2.1012, 2.0738, 2.2257, 1.8830], + device='cuda:2'), covar=tensor([0.0968, 0.1395, 0.0702, 0.0799, 0.1150, 0.0751, 0.0829, 0.0787], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0258, 0.0183, 0.0181, 0.0175, 0.0146, 0.0270, 0.0159], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 01:29:29,702 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84653.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:29:35,865 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.687e+01 2.089e+02 2.664e+02 3.503e+02 6.178e+02, threshold=5.327e+02, percent-clipped=6.0 +2022-12-08 01:29:55,407 INFO [train.py:873] (2/4) Epoch 12, batch 1500, loss[loss=0.1133, simple_loss=0.1478, pruned_loss=0.0394, over 14531.00 frames. ], tot_loss[loss=0.1264, simple_loss=0.1565, pruned_loss=0.04813, over 1937999.96 frames. ], batch size: 43, lr: 6.68e-03, grad_scale: 8.0 +2022-12-08 01:30:00,989 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84689.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:30:03,579 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84692.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:30:05,701 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84694.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:30:33,259 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3428, 3.7868, 3.5625, 3.5670, 2.5286, 3.6244, 3.4075, 1.9431], + device='cuda:2'), covar=tensor([0.1698, 0.0548, 0.1030, 0.0884, 0.1003, 0.0599, 0.1114, 0.2464], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0079, 0.0062, 0.0065, 0.0093, 0.0077, 0.0095, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 01:30:53,253 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-08 01:30:56,767 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84753.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:31:02,108 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.488e+02 2.424e+02 2.855e+02 3.432e+02 8.447e+02, threshold=5.711e+02, percent-clipped=4.0 +2022-12-08 01:31:04,030 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8823, 1.5782, 1.9655, 1.7105, 2.0425, 1.7897, 1.6641, 1.7979], + device='cuda:2'), covar=tensor([0.0640, 0.1395, 0.0327, 0.0515, 0.0436, 0.0764, 0.0328, 0.0375], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0311, 0.0393, 0.0301, 0.0370, 0.0317, 0.0361, 0.0307], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:31:22,263 INFO [train.py:873] (2/4) Epoch 12, batch 1600, loss[loss=0.1567, simple_loss=0.1416, pruned_loss=0.08596, over 1228.00 frames. ], tot_loss[loss=0.1284, simple_loss=0.1578, pruned_loss=0.04952, over 1901478.60 frames. ], batch size: 100, lr: 6.68e-03, grad_scale: 8.0 +2022-12-08 01:31:26,615 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6909, 1.6415, 1.8954, 1.5952, 1.5898, 1.4520, 1.4301, 1.0246], + device='cuda:2'), covar=tensor([0.0233, 0.0580, 0.0231, 0.0323, 0.0340, 0.0429, 0.0283, 0.0484], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0016, 0.0015, 0.0015, 0.0015, 0.0026, 0.0021, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:31:36,934 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84800.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:31:37,622 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84801.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:32:25,373 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84855.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:32:29,445 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.720e+01 2.080e+02 2.547e+02 3.159e+02 6.486e+02, threshold=5.095e+02, percent-clipped=3.0 +2022-12-08 01:32:30,507 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=84861.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:32:46,521 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6244, 2.1934, 3.5553, 3.6885, 3.4844, 2.2549, 3.5597, 2.7321], + device='cuda:2'), covar=tensor([0.0366, 0.0909, 0.0666, 0.0397, 0.0425, 0.1357, 0.0364, 0.0872], + device='cuda:2'), in_proj_covar=tensor([0.0284, 0.0251, 0.0369, 0.0320, 0.0260, 0.0296, 0.0299, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 01:32:47,200 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84881.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:32:48,865 INFO [train.py:873] (2/4) Epoch 12, batch 1700, loss[loss=0.2015, simple_loss=0.1782, pruned_loss=0.1124, over 1170.00 frames. ], tot_loss[loss=0.1266, simple_loss=0.1569, pruned_loss=0.04818, over 1950678.95 frames. ], batch size: 100, lr: 6.68e-03, grad_scale: 4.0 +2022-12-08 01:32:49,052 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5029, 1.8550, 2.4618, 2.0673, 2.4817, 2.2121, 2.2180, 2.2330], + device='cuda:2'), covar=tensor([0.0666, 0.2342, 0.0643, 0.1323, 0.0449, 0.1024, 0.0607, 0.0966], + device='cuda:2'), in_proj_covar=tensor([0.0350, 0.0311, 0.0394, 0.0299, 0.0368, 0.0316, 0.0362, 0.0305], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:32:57,294 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84892.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:33:06,787 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84903.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:33:11,743 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 01:33:23,414 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.32 vs. limit=5.0 +2022-12-08 01:33:39,690 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=84940.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:33:49,414 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=84951.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:33:52,733 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5164, 2.8563, 4.3108, 3.3166, 4.3121, 4.1796, 4.0075, 3.6971], + device='cuda:2'), covar=tensor([0.0769, 0.2970, 0.0937, 0.1705, 0.0728, 0.0861, 0.1900, 0.1720], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0311, 0.0396, 0.0301, 0.0372, 0.0317, 0.0364, 0.0306], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:33:57,762 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.082e+02 2.143e+02 2.620e+02 3.286e+02 6.897e+02, threshold=5.240e+02, percent-clipped=6.0 +2022-12-08 01:34:08,112 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4654, 3.2331, 3.1372, 2.3542, 2.9002, 3.0838, 3.5477, 2.7361], + device='cuda:2'), covar=tensor([0.0591, 0.0982, 0.0898, 0.1506, 0.0942, 0.0720, 0.0651, 0.1420], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0181, 0.0137, 0.0125, 0.0134, 0.0143, 0.0121, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 01:34:11,876 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=11.10 vs. limit=5.0 +2022-12-08 01:34:17,445 INFO [train.py:873] (2/4) Epoch 12, batch 1800, loss[loss=0.1473, simple_loss=0.1606, pruned_loss=0.067, over 4991.00 frames. ], tot_loss[loss=0.1262, simple_loss=0.1566, pruned_loss=0.04787, over 1947049.15 frames. ], batch size: 100, lr: 6.67e-03, grad_scale: 4.0 +2022-12-08 01:34:19,390 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5905, 2.3832, 4.5867, 3.0334, 4.2990, 2.2550, 3.2234, 4.3770], + device='cuda:2'), covar=tensor([0.0580, 0.4038, 0.0356, 0.6992, 0.0615, 0.3645, 0.1594, 0.0329], + device='cuda:2'), in_proj_covar=tensor([0.0247, 0.0212, 0.0199, 0.0287, 0.0221, 0.0216, 0.0214, 0.0204], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:34:22,817 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=84989.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:34:25,409 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=84992.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:34:47,151 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85012.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:35:10,617 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85040.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:35:29,534 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.171e+02 2.196e+02 2.693e+02 3.379e+02 9.036e+02, threshold=5.386e+02, percent-clipped=1.0 +2022-12-08 01:35:42,637 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5336, 3.6045, 3.7874, 3.4380, 3.6088, 3.5547, 1.5293, 3.4169], + device='cuda:2'), covar=tensor([0.0318, 0.0339, 0.0357, 0.0435, 0.0354, 0.0438, 0.3133, 0.0282], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0170, 0.0140, 0.0137, 0.0201, 0.0135, 0.0157, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:35:48,290 INFO [train.py:873] (2/4) Epoch 12, batch 1900, loss[loss=0.1304, simple_loss=0.1608, pruned_loss=0.04999, over 14281.00 frames. ], tot_loss[loss=0.1274, simple_loss=0.157, pruned_loss=0.04893, over 2002488.50 frames. ], batch size: 60, lr: 6.67e-03, grad_scale: 4.0 +2022-12-08 01:36:08,135 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.33 vs. limit=5.0 +2022-12-08 01:36:09,512 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85107.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:36:16,192 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85115.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:36:44,517 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0294, 2.1730, 2.3458, 2.3565, 1.9630, 2.2822, 2.0154, 1.3529], + device='cuda:2'), covar=tensor([0.1139, 0.0872, 0.0700, 0.0496, 0.1187, 0.0691, 0.1185, 0.2347], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0078, 0.0062, 0.0065, 0.0093, 0.0076, 0.0094, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 01:36:45,789 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.71 vs. limit=2.0 +2022-12-08 01:36:46,364 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4553, 3.1939, 5.5701, 4.0947, 5.0267, 2.7214, 4.4150, 5.1223], + device='cuda:2'), covar=tensor([0.0615, 0.3002, 0.0223, 0.5404, 0.0574, 0.2733, 0.0929, 0.0300], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0213, 0.0201, 0.0286, 0.0222, 0.0218, 0.0215, 0.0206], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:36:52,427 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85156.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:36:56,466 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.007e+02 2.116e+02 2.633e+02 3.281e+02 1.062e+03, threshold=5.267e+02, percent-clipped=7.0 +2022-12-08 01:37:03,248 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85168.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 01:37:10,218 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85176.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:37:14,103 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85181.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:37:15,663 INFO [train.py:873] (2/4) Epoch 12, batch 2000, loss[loss=0.1482, simple_loss=0.1383, pruned_loss=0.07909, over 1229.00 frames. ], tot_loss[loss=0.1263, simple_loss=0.1563, pruned_loss=0.04814, over 1985014.08 frames. ], batch size: 100, lr: 6.66e-03, grad_scale: 8.0 +2022-12-08 01:37:36,134 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5169, 2.1598, 3.4609, 3.5834, 3.4307, 2.2069, 3.4926, 2.7379], + device='cuda:2'), covar=tensor([0.0355, 0.0972, 0.0773, 0.0427, 0.0395, 0.1369, 0.0410, 0.0859], + device='cuda:2'), in_proj_covar=tensor([0.0284, 0.0253, 0.0369, 0.0320, 0.0261, 0.0299, 0.0300, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:37:56,094 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85229.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:38:04,469 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=10.11 vs. limit=5.0 +2022-12-08 01:38:06,798 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7314, 1.9791, 2.7111, 2.8218, 2.7290, 2.0967, 2.7569, 2.2746], + device='cuda:2'), covar=tensor([0.0346, 0.0850, 0.0538, 0.0410, 0.0395, 0.1083, 0.0426, 0.0744], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0251, 0.0367, 0.0318, 0.0259, 0.0296, 0.0297, 0.0277], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 01:38:09,015 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-12-08 01:38:25,281 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.573e+01 2.203e+02 2.703e+02 3.626e+02 1.921e+03, threshold=5.405e+02, percent-clipped=8.0 +2022-12-08 01:38:43,506 INFO [train.py:873] (2/4) Epoch 12, batch 2100, loss[loss=0.1094, simple_loss=0.1486, pruned_loss=0.03509, over 14391.00 frames. ], tot_loss[loss=0.1249, simple_loss=0.1557, pruned_loss=0.04705, over 2001086.66 frames. ], batch size: 41, lr: 6.66e-03, grad_scale: 4.0 +2022-12-08 01:38:49,018 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85289.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:38:49,855 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5556, 1.1970, 1.4385, 1.3736, 1.5147, 0.8749, 1.3400, 1.2940], + device='cuda:2'), covar=tensor([0.0823, 0.1196, 0.0843, 0.1052, 0.0888, 0.1070, 0.0771, 0.0867], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0026, 0.0029, 0.0026, 0.0028, 0.0039, 0.0027, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:39:04,453 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85307.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:39:30,443 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85337.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:39:51,892 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.818e+01 2.339e+02 2.945e+02 3.643e+02 8.230e+02, threshold=5.891e+02, percent-clipped=7.0 +2022-12-08 01:40:10,082 INFO [train.py:873] (2/4) Epoch 12, batch 2200, loss[loss=0.1007, simple_loss=0.1383, pruned_loss=0.03158, over 13919.00 frames. ], tot_loss[loss=0.1278, simple_loss=0.157, pruned_loss=0.04923, over 1962004.26 frames. ], batch size: 23, lr: 6.66e-03, grad_scale: 4.0 +2022-12-08 01:40:19,726 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.34 vs. limit=2.0 +2022-12-08 01:40:21,068 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1672, 1.2398, 1.1454, 0.9928, 0.9866, 0.9136, 0.9400, 0.9229], + device='cuda:2'), covar=tensor([0.0186, 0.0264, 0.0192, 0.0216, 0.0213, 0.0451, 0.0286, 0.0441], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0017, 0.0015, 0.0016, 0.0016, 0.0026, 0.0021, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:40:39,602 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 01:40:49,623 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85428.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:40:50,590 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7028, 1.7329, 1.6258, 1.7756, 1.6358, 1.5891, 1.2619, 1.2015], + device='cuda:2'), covar=tensor([0.0391, 0.0361, 0.0524, 0.0284, 0.0355, 0.0292, 0.0322, 0.0653], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0017, 0.0015, 0.0016, 0.0015, 0.0026, 0.0021, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:41:14,575 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85456.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:41:19,365 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.272e+02 2.150e+02 2.815e+02 3.417e+02 5.996e+02, threshold=5.631e+02, percent-clipped=2.0 +2022-12-08 01:41:20,077 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85463.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 01:41:20,105 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8066, 1.3828, 4.0108, 1.8570, 3.9360, 4.0516, 3.3935, 4.2914], + device='cuda:2'), covar=tensor([0.0340, 0.4162, 0.0474, 0.2646, 0.0494, 0.0473, 0.0572, 0.0285], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0157, 0.0160, 0.0168, 0.0167, 0.0173, 0.0133, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 01:41:26,811 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85471.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:41:36,990 INFO [train.py:873] (2/4) Epoch 12, batch 2300, loss[loss=0.1691, simple_loss=0.1563, pruned_loss=0.09092, over 1221.00 frames. ], tot_loss[loss=0.1259, simple_loss=0.1559, pruned_loss=0.0479, over 1937989.84 frames. ], batch size: 100, lr: 6.65e-03, grad_scale: 4.0 +2022-12-08 01:41:37,185 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85483.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:41:42,418 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85489.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:41:55,419 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85504.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:42:11,901 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2817, 1.7901, 2.2802, 1.9535, 2.3521, 2.0479, 2.0953, 2.1571], + device='cuda:2'), covar=tensor([0.0495, 0.1559, 0.0467, 0.0672, 0.0292, 0.0749, 0.0391, 0.0627], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0316, 0.0402, 0.0306, 0.0377, 0.0323, 0.0373, 0.0312], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:42:18,026 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85530.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:42:30,162 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85544.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:42:45,173 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.046e+02 2.375e+02 2.779e+02 3.534e+02 5.701e+02, threshold=5.557e+02, percent-clipped=1.0 +2022-12-08 01:42:53,238 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8292, 1.6656, 1.8385, 2.0166, 1.4876, 1.7411, 1.7881, 1.9074], + device='cuda:2'), covar=tensor([0.0160, 0.0287, 0.0146, 0.0159, 0.0236, 0.0295, 0.0164, 0.0135], + device='cuda:2'), in_proj_covar=tensor([0.0284, 0.0251, 0.0368, 0.0318, 0.0260, 0.0300, 0.0297, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:43:03,728 INFO [train.py:873] (2/4) Epoch 12, batch 2400, loss[loss=0.1009, simple_loss=0.14, pruned_loss=0.03094, over 13810.00 frames. ], tot_loss[loss=0.1264, simple_loss=0.1563, pruned_loss=0.04824, over 1955722.84 frames. ], batch size: 23, lr: 6.65e-03, grad_scale: 8.0 +2022-12-08 01:43:10,495 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85591.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:43:24,431 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85607.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:43:40,314 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85625.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:44:07,098 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85655.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:44:12,664 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.126e+02 2.042e+02 2.557e+02 3.044e+02 5.063e+02, threshold=5.114e+02, percent-clipped=0.0 +2022-12-08 01:44:30,326 INFO [train.py:873] (2/4) Epoch 12, batch 2500, loss[loss=0.1432, simple_loss=0.1707, pruned_loss=0.05788, over 14499.00 frames. ], tot_loss[loss=0.1261, simple_loss=0.1564, pruned_loss=0.04786, over 2024662.41 frames. ], batch size: 49, lr: 6.65e-03, grad_scale: 8.0 +2022-12-08 01:44:33,003 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85686.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 01:44:48,831 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8086, 1.5665, 2.0771, 1.6961, 1.9245, 1.4871, 1.6322, 1.9525], + device='cuda:2'), covar=tensor([0.2421, 0.2550, 0.0370, 0.1607, 0.1160, 0.1363, 0.1163, 0.0693], + device='cuda:2'), in_proj_covar=tensor([0.0250, 0.0214, 0.0203, 0.0288, 0.0223, 0.0220, 0.0215, 0.0209], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 01:44:57,297 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9065, 3.9968, 4.2949, 3.5988, 4.1002, 4.3051, 1.6070, 3.8548], + device='cuda:2'), covar=tensor([0.0296, 0.0333, 0.0374, 0.0603, 0.0348, 0.0224, 0.3060, 0.0296], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0171, 0.0142, 0.0139, 0.0203, 0.0136, 0.0159, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 01:45:11,648 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8633, 1.2824, 2.0155, 1.3142, 1.9931, 2.0906, 1.7382, 2.1510], + device='cuda:2'), covar=tensor([0.0292, 0.1956, 0.0442, 0.1767, 0.0490, 0.0494, 0.0891, 0.0350], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0158, 0.0160, 0.0168, 0.0170, 0.0174, 0.0133, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 01:45:15,168 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1786, 1.5582, 1.7828, 1.6572, 1.6063, 1.6674, 1.4075, 1.1844], + device='cuda:2'), covar=tensor([0.1631, 0.1324, 0.0410, 0.0484, 0.1401, 0.0913, 0.2398, 0.1888], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0079, 0.0064, 0.0067, 0.0094, 0.0078, 0.0096, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 01:45:24,563 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.74 vs. limit=5.0 +2022-12-08 01:45:32,707 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.63 vs. limit=2.0 +2022-12-08 01:45:40,095 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.041e+02 2.190e+02 2.751e+02 3.272e+02 6.413e+02, threshold=5.502e+02, percent-clipped=4.0 +2022-12-08 01:45:40,249 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85763.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:45:47,365 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=85771.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:45:58,014 INFO [train.py:873] (2/4) Epoch 12, batch 2600, loss[loss=0.1381, simple_loss=0.1705, pruned_loss=0.05288, over 9435.00 frames. ], tot_loss[loss=0.1254, simple_loss=0.1555, pruned_loss=0.04766, over 1931254.78 frames. ], batch size: 100, lr: 6.64e-03, grad_scale: 4.0 +2022-12-08 01:45:58,997 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85784.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:46:07,198 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.62 vs. limit=2.0 +2022-12-08 01:46:17,553 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4132, 1.0130, 1.2513, 0.8617, 1.0605, 1.4056, 1.0495, 1.1107], + device='cuda:2'), covar=tensor([0.0409, 0.0867, 0.0633, 0.0551, 0.1132, 0.0779, 0.0496, 0.1139], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0181, 0.0138, 0.0126, 0.0136, 0.0144, 0.0121, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 01:46:22,745 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85811.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:46:29,905 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=85819.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:46:39,133 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1695, 2.2089, 5.2518, 4.6352, 4.5360, 5.2962, 5.0766, 5.3759], + device='cuda:2'), covar=tensor([0.1308, 0.1222, 0.0066, 0.0158, 0.0174, 0.0081, 0.0071, 0.0082], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0157, 0.0127, 0.0166, 0.0143, 0.0138, 0.0118, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 01:46:46,847 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85839.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:47:08,285 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.414e+01 2.205e+02 2.825e+02 3.419e+02 7.121e+02, threshold=5.651e+02, percent-clipped=1.0 +2022-12-08 01:47:18,509 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85875.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:47:25,018 INFO [train.py:873] (2/4) Epoch 12, batch 2700, loss[loss=0.1236, simple_loss=0.1466, pruned_loss=0.05026, over 5035.00 frames. ], tot_loss[loss=0.1257, simple_loss=0.1558, pruned_loss=0.04777, over 1952992.93 frames. ], batch size: 100, lr: 6.64e-03, grad_scale: 4.0 +2022-12-08 01:47:27,748 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85886.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:47:58,895 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.7732, 5.5685, 5.1161, 5.7485, 5.2956, 5.1417, 5.6993, 5.5833], + device='cuda:2'), covar=tensor([0.0504, 0.0665, 0.0691, 0.0501, 0.0732, 0.0353, 0.0532, 0.0710], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0129, 0.0136, 0.0148, 0.0139, 0.0113, 0.0156, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 01:48:06,431 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85930.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 01:48:10,316 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.14 vs. limit=2.0 +2022-12-08 01:48:11,818 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85936.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:48:16,770 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.35 vs. limit=5.0 +2022-12-08 01:48:29,644 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=85957.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:48:34,528 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.895e+01 2.358e+02 3.006e+02 3.476e+02 6.598e+02, threshold=6.011e+02, percent-clipped=3.0 +2022-12-08 01:48:51,231 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=85981.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 01:48:52,817 INFO [train.py:873] (2/4) Epoch 12, batch 2800, loss[loss=0.1242, simple_loss=0.1614, pruned_loss=0.04346, over 14065.00 frames. ], tot_loss[loss=0.1263, simple_loss=0.1562, pruned_loss=0.04816, over 1945891.36 frames. ], batch size: 29, lr: 6.63e-03, grad_scale: 8.0 +2022-12-08 01:48:59,897 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=85991.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 01:49:09,705 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86002.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:49:24,440 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86018.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:49:53,848 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86052.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:50:03,560 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.107e+02 2.317e+02 2.627e+02 3.139e+02 9.999e+02, threshold=5.255e+02, percent-clipped=3.0 +2022-12-08 01:50:03,795 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86063.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:50:08,679 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86069.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:50:20,499 INFO [train.py:873] (2/4) Epoch 12, batch 2900, loss[loss=0.1079, simple_loss=0.1495, pruned_loss=0.03311, over 14575.00 frames. ], tot_loss[loss=0.127, simple_loss=0.1571, pruned_loss=0.04851, over 1996067.45 frames. ], batch size: 22, lr: 6.63e-03, grad_scale: 8.0 +2022-12-08 01:50:21,322 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86084.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:50:46,812 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86113.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:51:01,279 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86130.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:51:03,163 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86132.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:51:09,827 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86139.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:51:30,509 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.360e+01 2.186e+02 2.706e+02 3.386e+02 6.755e+02, threshold=5.412e+02, percent-clipped=5.0 +2022-12-08 01:51:47,632 INFO [train.py:873] (2/4) Epoch 12, batch 3000, loss[loss=0.1254, simple_loss=0.1534, pruned_loss=0.04866, over 14406.00 frames. ], tot_loss[loss=0.1269, simple_loss=0.1566, pruned_loss=0.04857, over 1980326.37 frames. ], batch size: 53, lr: 6.63e-03, grad_scale: 4.0 +2022-12-08 01:51:47,632 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 01:51:51,874 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4614, 3.1419, 3.8850, 2.6000, 2.5642, 3.4601, 2.2118, 3.6539], + device='cuda:2'), covar=tensor([0.0725, 0.0665, 0.0382, 0.1631, 0.1948, 0.0574, 0.2869, 0.0501], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0096, 0.0089, 0.0094, 0.0113, 0.0082, 0.0121, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 01:51:56,040 INFO [train.py:905] (2/4) Epoch 12, validation: loss=0.1299, simple_loss=0.1698, pruned_loss=0.04501, over 857387.00 frames. +2022-12-08 01:51:56,041 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 01:51:58,786 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86186.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:51:59,530 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86187.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:52:04,328 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.61 vs. limit=2.0 +2022-12-08 01:52:09,639 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9885, 2.6832, 2.7648, 1.7732, 2.5202, 2.7586, 3.0273, 2.4178], + device='cuda:2'), covar=tensor([0.0756, 0.1141, 0.0948, 0.1621, 0.1161, 0.0831, 0.0690, 0.1378], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0180, 0.0138, 0.0126, 0.0135, 0.0142, 0.0120, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 01:52:21,401 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-12-08 01:52:38,274 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86231.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:52:41,124 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86234.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:52:52,099 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86246.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:53:00,227 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.23 vs. limit=5.0 +2022-12-08 01:53:07,351 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.699e+01 2.315e+02 2.814e+02 3.910e+02 8.994e+02, threshold=5.629e+02, percent-clipped=7.0 +2022-12-08 01:53:23,046 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86281.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:53:24,631 INFO [train.py:873] (2/4) Epoch 12, batch 3100, loss[loss=0.1755, simple_loss=0.1638, pruned_loss=0.09365, over 2643.00 frames. ], tot_loss[loss=0.1279, simple_loss=0.1572, pruned_loss=0.04931, over 1944631.48 frames. ], batch size: 100, lr: 6.62e-03, grad_scale: 4.0 +2022-12-08 01:53:27,207 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86286.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 01:53:45,086 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86307.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:53:50,987 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86313.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:54:04,961 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86329.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:54:12,579 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9161, 2.6613, 2.7351, 2.9156, 2.7840, 2.7756, 2.9503, 2.4809], + device='cuda:2'), covar=tensor([0.0815, 0.1367, 0.0604, 0.0640, 0.0846, 0.0690, 0.0777, 0.0818], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0262, 0.0185, 0.0181, 0.0176, 0.0148, 0.0271, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 01:54:31,025 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86358.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:54:36,315 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.153e+02 2.157e+02 2.639e+02 3.347e+02 7.885e+02, threshold=5.278e+02, percent-clipped=4.0 +2022-12-08 01:54:51,747 INFO [train.py:873] (2/4) Epoch 12, batch 3200, loss[loss=0.1222, simple_loss=0.1576, pruned_loss=0.04335, over 14597.00 frames. ], tot_loss[loss=0.1278, simple_loss=0.1572, pruned_loss=0.04926, over 1924329.77 frames. ], batch size: 30, lr: 6.62e-03, grad_scale: 8.0 +2022-12-08 01:55:14,595 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86408.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:55:21,653 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.61 vs. limit=5.0 +2022-12-08 01:55:28,861 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86425.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:55:42,168 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86440.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:55:42,868 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1317, 4.6428, 4.5381, 5.0568, 4.6428, 4.2828, 4.9712, 4.1943], + device='cuda:2'), covar=tensor([0.0313, 0.0996, 0.0387, 0.0447, 0.0879, 0.0569, 0.0545, 0.0641], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0263, 0.0185, 0.0182, 0.0176, 0.0147, 0.0271, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 01:55:46,322 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86445.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:56:02,446 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.374e+02 2.245e+02 2.734e+02 3.789e+02 7.990e+02, threshold=5.469e+02, percent-clipped=5.0 +2022-12-08 01:56:20,045 INFO [train.py:873] (2/4) Epoch 12, batch 3300, loss[loss=0.1145, simple_loss=0.1522, pruned_loss=0.03843, over 14173.00 frames. ], tot_loss[loss=0.1271, simple_loss=0.1568, pruned_loss=0.04869, over 1959389.84 frames. ], batch size: 89, lr: 6.61e-03, grad_scale: 8.0 +2022-12-08 01:56:29,917 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.09 vs. limit=5.0 +2022-12-08 01:56:35,374 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86501.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:56:39,498 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86506.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 01:56:42,598 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-12-08 01:57:01,830 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86531.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:57:01,854 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3862, 2.5088, 2.5750, 2.6199, 2.2495, 2.6108, 2.3912, 1.4594], + device='cuda:2'), covar=tensor([0.1313, 0.0898, 0.0868, 0.0624, 0.0918, 0.0687, 0.1154, 0.2379], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0080, 0.0063, 0.0067, 0.0092, 0.0078, 0.0095, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 01:57:30,031 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.019e+02 2.093e+02 2.507e+02 3.209e+02 6.718e+02, threshold=5.013e+02, percent-clipped=2.0 +2022-12-08 01:57:42,890 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86579.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:57:43,866 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86580.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:57:46,321 INFO [train.py:873] (2/4) Epoch 12, batch 3400, loss[loss=0.1256, simple_loss=0.1261, pruned_loss=0.06253, over 1244.00 frames. ], tot_loss[loss=0.1272, simple_loss=0.1564, pruned_loss=0.04901, over 1876668.92 frames. ], batch size: 100, lr: 6.61e-03, grad_scale: 8.0 +2022-12-08 01:57:48,934 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86586.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 01:57:52,578 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-08 01:58:03,566 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86602.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:58:05,353 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7856, 0.7579, 0.7125, 0.8568, 0.7315, 0.1884, 0.7253, 0.8492], + device='cuda:2'), covar=tensor([0.0190, 0.0576, 0.0398, 0.0284, 0.0363, 0.0269, 0.0469, 0.0478], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0027, 0.0030, 0.0026, 0.0029, 0.0040, 0.0028, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 01:58:13,192 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86613.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:58:31,348 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86634.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 01:58:38,017 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86641.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:58:52,749 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86658.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:58:55,356 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86661.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:58:57,881 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.288e+02 2.267e+02 2.858e+02 3.502e+02 8.320e+02, threshold=5.717e+02, percent-clipped=7.0 +2022-12-08 01:59:14,486 INFO [train.py:873] (2/4) Epoch 12, batch 3500, loss[loss=0.1811, simple_loss=0.1572, pruned_loss=0.1024, over 1229.00 frames. ], tot_loss[loss=0.1259, simple_loss=0.1558, pruned_loss=0.04798, over 1942643.82 frames. ], batch size: 100, lr: 6.61e-03, grad_scale: 8.0 +2022-12-08 01:59:25,922 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.66 vs. limit=2.0 +2022-12-08 01:59:27,701 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.66 vs. limit=2.0 +2022-12-08 01:59:31,509 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0789, 2.2599, 2.3101, 2.4324, 2.0152, 2.3708, 2.2022, 1.2662], + device='cuda:2'), covar=tensor([0.1535, 0.0977, 0.1121, 0.0662, 0.1019, 0.0661, 0.1289, 0.2641], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0080, 0.0064, 0.0067, 0.0092, 0.0078, 0.0096, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 01:59:34,038 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86706.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:59:35,786 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86708.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 01:59:51,299 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86725.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:00:17,814 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86756.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:00:24,901 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.146e+02 2.023e+02 2.697e+02 3.316e+02 6.067e+02, threshold=5.393e+02, percent-clipped=1.0 +2022-12-08 02:00:32,575 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86773.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:00:41,038 INFO [train.py:873] (2/4) Epoch 12, batch 3600, loss[loss=0.122, simple_loss=0.1603, pruned_loss=0.04184, over 14409.00 frames. ], tot_loss[loss=0.1257, simple_loss=0.1559, pruned_loss=0.04775, over 1960608.45 frames. ], batch size: 41, lr: 6.60e-03, grad_scale: 8.0 +2022-12-08 02:00:52,490 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86796.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:00:57,634 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86801.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 02:01:08,615 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86814.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:01:15,523 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6512, 2.4781, 3.3211, 2.3554, 1.8712, 2.8567, 1.3359, 2.8704], + device='cuda:2'), covar=tensor([0.1200, 0.1652, 0.0720, 0.2046, 0.3049, 0.1156, 0.5214, 0.1198], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0097, 0.0090, 0.0096, 0.0114, 0.0085, 0.0123, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 02:01:51,778 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.100e+02 2.201e+02 2.694e+02 3.354e+02 5.968e+02, threshold=5.388e+02, percent-clipped=2.0 +2022-12-08 02:02:01,313 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86875.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:02:01,601 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-12-08 02:02:04,444 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=86878.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:02:08,671 INFO [train.py:873] (2/4) Epoch 12, batch 3700, loss[loss=0.1328, simple_loss=0.1507, pruned_loss=0.05741, over 6030.00 frames. ], tot_loss[loss=0.1267, simple_loss=0.1565, pruned_loss=0.04839, over 1979859.52 frames. ], batch size: 100, lr: 6.60e-03, grad_scale: 8.0 +2022-12-08 02:02:24,531 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=86902.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:02:32,393 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.71 vs. limit=2.0 +2022-12-08 02:02:54,793 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=86936.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:02:57,544 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=86939.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:03:06,362 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=86950.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:03:19,132 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.304e+02 2.133e+02 2.735e+02 3.535e+02 1.030e+03, threshold=5.470e+02, percent-clipped=7.0 +2022-12-08 02:03:35,280 INFO [train.py:873] (2/4) Epoch 12, batch 3800, loss[loss=0.09935, simple_loss=0.142, pruned_loss=0.02835, over 14097.00 frames. ], tot_loss[loss=0.1256, simple_loss=0.1557, pruned_loss=0.04771, over 2014409.93 frames. ], batch size: 29, lr: 6.60e-03, grad_scale: 4.0 +2022-12-08 02:03:49,732 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3574, 3.1941, 3.1084, 3.4507, 3.0019, 2.8913, 3.4396, 3.3395], + device='cuda:2'), covar=tensor([0.0786, 0.0996, 0.0938, 0.0744, 0.1243, 0.0924, 0.0705, 0.0823], + device='cuda:2'), in_proj_covar=tensor([0.0133, 0.0129, 0.0137, 0.0147, 0.0138, 0.0112, 0.0154, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 02:03:51,923 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9376, 1.8426, 4.0446, 3.8677, 3.8547, 4.1296, 3.5759, 4.1526], + device='cuda:2'), covar=tensor([0.1451, 0.1365, 0.0106, 0.0172, 0.0192, 0.0118, 0.0210, 0.0100], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0157, 0.0125, 0.0163, 0.0142, 0.0138, 0.0118, 0.0118], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 02:03:53,136 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87002.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:04:45,855 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87063.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 02:04:47,287 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.323e+02 2.236e+02 2.550e+02 3.017e+02 4.626e+02, threshold=5.100e+02, percent-clipped=0.0 +2022-12-08 02:05:03,582 INFO [train.py:873] (2/4) Epoch 12, batch 3900, loss[loss=0.1527, simple_loss=0.146, pruned_loss=0.07971, over 1234.00 frames. ], tot_loss[loss=0.1246, simple_loss=0.1553, pruned_loss=0.04691, over 2059230.24 frames. ], batch size: 100, lr: 6.59e-03, grad_scale: 4.0 +2022-12-08 02:05:14,064 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87096.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:05:14,088 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9039, 3.2982, 3.1497, 3.0187, 2.3365, 3.2320, 2.9344, 1.5489], + device='cuda:2'), covar=tensor([0.1861, 0.0540, 0.1182, 0.1018, 0.1136, 0.0565, 0.1268, 0.2421], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0078, 0.0062, 0.0067, 0.0091, 0.0077, 0.0094, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 02:05:18,280 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87101.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:05:56,028 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87144.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:06:00,195 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87149.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:06:14,161 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.297e+02 2.280e+02 2.775e+02 3.261e+02 6.407e+02, threshold=5.550e+02, percent-clipped=4.0 +2022-12-08 02:06:14,379 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7564, 2.4315, 2.6142, 1.6679, 2.2163, 2.4675, 2.7345, 2.3195], + device='cuda:2'), covar=tensor([0.0803, 0.1071, 0.0967, 0.1631, 0.1333, 0.0810, 0.0674, 0.1356], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0180, 0.0138, 0.0126, 0.0135, 0.0144, 0.0122, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 02:06:18,653 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87170.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:06:29,721 INFO [train.py:873] (2/4) Epoch 12, batch 4000, loss[loss=0.1022, simple_loss=0.1394, pruned_loss=0.03247, over 13929.00 frames. ], tot_loss[loss=0.124, simple_loss=0.1551, pruned_loss=0.04647, over 2036354.61 frames. ], batch size: 26, lr: 6.59e-03, grad_scale: 8.0 +2022-12-08 02:07:10,565 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87230.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:07:14,065 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87234.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:07:15,995 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87236.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:07:40,897 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.276e+02 2.153e+02 2.853e+02 3.504e+02 7.130e+02, threshold=5.705e+02, percent-clipped=5.0 +2022-12-08 02:07:47,928 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9670, 2.1152, 3.9419, 2.7394, 3.8177, 1.8793, 3.0315, 3.8315], + device='cuda:2'), covar=tensor([0.0728, 0.4279, 0.0595, 0.6315, 0.0621, 0.3722, 0.1369, 0.0516], + device='cuda:2'), in_proj_covar=tensor([0.0246, 0.0211, 0.0203, 0.0284, 0.0223, 0.0214, 0.0214, 0.0206], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:07:57,458 INFO [train.py:873] (2/4) Epoch 12, batch 4100, loss[loss=0.1569, simple_loss=0.1567, pruned_loss=0.07853, over 2611.00 frames. ], tot_loss[loss=0.1237, simple_loss=0.155, pruned_loss=0.04622, over 2045890.21 frames. ], batch size: 100, lr: 6.58e-03, grad_scale: 8.0 +2022-12-08 02:07:58,285 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87284.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:08:04,410 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87291.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:08:18,594 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1699, 1.2610, 1.4528, 1.0334, 0.8778, 1.2112, 0.8503, 1.2941], + device='cuda:2'), covar=tensor([0.1961, 0.2933, 0.0937, 0.2932, 0.3444, 0.1148, 0.1742, 0.1094], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0097, 0.0091, 0.0096, 0.0115, 0.0085, 0.0123, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 02:08:24,645 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8734, 2.7722, 2.4462, 2.6292, 2.8072, 2.8479, 2.8762, 2.8321], + device='cuda:2'), covar=tensor([0.1199, 0.0685, 0.2477, 0.2634, 0.0954, 0.1037, 0.1298, 0.1048], + device='cuda:2'), in_proj_covar=tensor([0.0375, 0.0258, 0.0432, 0.0550, 0.0326, 0.0421, 0.0385, 0.0360], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:08:25,513 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2658, 3.0723, 2.7377, 2.9212, 3.1408, 3.1676, 3.2249, 3.1562], + device='cuda:2'), covar=tensor([0.0868, 0.0613, 0.2179, 0.2464, 0.0758, 0.0912, 0.1100, 0.0982], + device='cuda:2'), in_proj_covar=tensor([0.0375, 0.0258, 0.0432, 0.0550, 0.0326, 0.0421, 0.0385, 0.0360], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:09:02,736 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87358.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 02:09:06,541 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87362.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:09:09,786 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.248e+02 2.165e+02 2.894e+02 3.729e+02 1.916e+03, threshold=5.789e+02, percent-clipped=7.0 +2022-12-08 02:09:21,639 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1470, 3.2323, 4.0268, 2.9080, 2.3939, 3.1287, 2.0436, 3.4291], + device='cuda:2'), covar=tensor([0.1469, 0.1185, 0.0475, 0.1894, 0.2320, 0.1202, 0.3236, 0.1043], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0097, 0.0091, 0.0096, 0.0115, 0.0085, 0.0123, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 02:09:24,062 INFO [train.py:873] (2/4) Epoch 12, batch 4200, loss[loss=0.1615, simple_loss=0.1734, pruned_loss=0.07481, over 9503.00 frames. ], tot_loss[loss=0.1241, simple_loss=0.1551, pruned_loss=0.04651, over 2030457.11 frames. ], batch size: 100, lr: 6.58e-03, grad_scale: 4.0 +2022-12-08 02:09:28,336 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7587, 1.9921, 2.6288, 2.2081, 2.6566, 2.5555, 2.4347, 2.3619], + device='cuda:2'), covar=tensor([0.0710, 0.2524, 0.0760, 0.1523, 0.0611, 0.1066, 0.0829, 0.1346], + device='cuda:2'), in_proj_covar=tensor([0.0356, 0.0314, 0.0399, 0.0305, 0.0377, 0.0324, 0.0371, 0.0311], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:09:58,975 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87423.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:10:32,211 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87461.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:10:34,411 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-12-08 02:10:36,121 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.802e+01 2.101e+02 2.786e+02 3.317e+02 5.801e+02, threshold=5.572e+02, percent-clipped=2.0 +2022-12-08 02:10:38,717 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9234, 1.7698, 3.8735, 3.5556, 3.7338, 3.9326, 3.3368, 3.9490], + device='cuda:2'), covar=tensor([0.1494, 0.1490, 0.0115, 0.0239, 0.0200, 0.0120, 0.0242, 0.0115], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0158, 0.0126, 0.0166, 0.0144, 0.0140, 0.0120, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 02:10:39,558 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87470.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:10:51,436 INFO [train.py:873] (2/4) Epoch 12, batch 4300, loss[loss=0.1321, simple_loss=0.1594, pruned_loss=0.05237, over 10376.00 frames. ], tot_loss[loss=0.1258, simple_loss=0.1559, pruned_loss=0.04781, over 1971263.21 frames. ], batch size: 100, lr: 6.58e-03, grad_scale: 4.0 +2022-12-08 02:11:10,447 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87505.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:11:21,417 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87518.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:11:25,982 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87522.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:11:36,359 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87534.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:12:04,323 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.237e+02 2.182e+02 2.709e+02 3.348e+02 1.136e+03, threshold=5.418e+02, percent-clipped=3.0 +2022-12-08 02:12:04,580 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87566.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 02:12:18,389 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87582.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:12:19,184 INFO [train.py:873] (2/4) Epoch 12, batch 4400, loss[loss=0.1453, simple_loss=0.1583, pruned_loss=0.06611, over 3864.00 frames. ], tot_loss[loss=0.1248, simple_loss=0.1554, pruned_loss=0.04715, over 2023255.18 frames. ], batch size: 100, lr: 6.57e-03, grad_scale: 8.0 +2022-12-08 02:12:19,949 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87584.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:12:21,508 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87586.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:12:24,156 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87589.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:12:30,099 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8794, 3.0231, 4.5673, 3.5345, 4.6028, 4.5342, 4.3095, 4.0400], + device='cuda:2'), covar=tensor([0.0652, 0.2714, 0.0779, 0.1555, 0.0742, 0.0642, 0.1631, 0.1462], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0314, 0.0397, 0.0303, 0.0374, 0.0321, 0.0371, 0.0309], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:12:38,803 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=87605.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:13:13,708 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87645.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 02:13:18,228 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87650.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:13:24,946 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87658.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:13:31,927 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=87666.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 02:13:32,488 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.847e+01 2.130e+02 2.616e+02 3.250e+02 5.248e+02, threshold=5.232e+02, percent-clipped=0.0 +2022-12-08 02:13:32,651 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3268, 1.7534, 1.8175, 1.8107, 1.5916, 1.8343, 1.6581, 1.1410], + device='cuda:2'), covar=tensor([0.1967, 0.1807, 0.0964, 0.1208, 0.1544, 0.1175, 0.1718, 0.3037], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0080, 0.0065, 0.0068, 0.0094, 0.0079, 0.0096, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 02:13:46,591 INFO [train.py:873] (2/4) Epoch 12, batch 4500, loss[loss=0.1459, simple_loss=0.1605, pruned_loss=0.06561, over 5970.00 frames. ], tot_loss[loss=0.1245, simple_loss=0.1551, pruned_loss=0.04695, over 2002320.77 frames. ], batch size: 100, lr: 6.57e-03, grad_scale: 4.0 +2022-12-08 02:14:06,149 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4651, 1.4585, 1.5680, 1.4021, 1.5177, 0.9558, 1.2552, 1.4596], + device='cuda:2'), covar=tensor([0.0820, 0.0639, 0.0683, 0.0847, 0.0850, 0.0951, 0.0900, 0.0810], + device='cuda:2'), in_proj_covar=tensor([0.0027, 0.0027, 0.0029, 0.0026, 0.0028, 0.0039, 0.0028, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 02:14:06,767 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87706.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:14:17,064 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87718.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:14:17,999 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8576, 1.9684, 4.8975, 4.5383, 4.2682, 4.9824, 4.8052, 5.0400], + device='cuda:2'), covar=tensor([0.1785, 0.1664, 0.0140, 0.0218, 0.0242, 0.0182, 0.0139, 0.0139], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0160, 0.0128, 0.0168, 0.0145, 0.0142, 0.0121, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 02:14:59,665 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.324e+02 2.389e+02 2.965e+02 3.391e+02 5.507e+02, threshold=5.931e+02, percent-clipped=1.0 +2022-12-08 02:15:13,022 INFO [train.py:873] (2/4) Epoch 12, batch 4600, loss[loss=0.1138, simple_loss=0.1388, pruned_loss=0.04435, over 4986.00 frames. ], tot_loss[loss=0.125, simple_loss=0.1556, pruned_loss=0.04724, over 2014938.91 frames. ], batch size: 100, lr: 6.57e-03, grad_scale: 4.0 +2022-12-08 02:15:13,203 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1045, 2.5220, 2.6702, 1.6355, 2.7413, 2.9629, 3.1512, 2.2246], + device='cuda:2'), covar=tensor([0.0793, 0.1415, 0.1248, 0.2141, 0.1138, 0.0729, 0.1063, 0.1800], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0179, 0.0137, 0.0124, 0.0135, 0.0144, 0.0122, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 02:15:43,367 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87817.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:16:16,422 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9065, 4.6471, 4.2862, 4.4673, 4.5428, 4.7218, 4.8773, 4.8318], + device='cuda:2'), covar=tensor([0.0813, 0.0469, 0.1806, 0.2757, 0.0765, 0.0862, 0.0929, 0.0812], + device='cuda:2'), in_proj_covar=tensor([0.0375, 0.0257, 0.0431, 0.0556, 0.0330, 0.0424, 0.0389, 0.0365], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:16:21,489 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87861.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 02:16:26,308 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.265e+02 2.249e+02 2.775e+02 3.473e+02 5.522e+02, threshold=5.551e+02, percent-clipped=0.0 +2022-12-08 02:16:40,651 INFO [train.py:873] (2/4) Epoch 12, batch 4700, loss[loss=0.1406, simple_loss=0.1427, pruned_loss=0.06925, over 3813.00 frames. ], tot_loss[loss=0.1246, simple_loss=0.1551, pruned_loss=0.04711, over 2007505.98 frames. ], batch size: 100, lr: 6.56e-03, grad_scale: 4.0 +2022-12-08 02:16:43,349 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=87886.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:17:25,818 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=87934.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:17:31,121 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87940.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:17:35,294 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87945.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:17:47,346 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.8810, 5.3814, 5.3382, 5.9026, 5.4353, 4.6517, 5.7743, 4.6854], + device='cuda:2'), covar=tensor([0.0326, 0.0844, 0.0347, 0.0318, 0.0762, 0.0397, 0.0440, 0.0538], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0263, 0.0188, 0.0183, 0.0177, 0.0152, 0.0271, 0.0163], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 02:17:49,006 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=87961.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 02:17:54,230 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.984e+01 2.201e+02 2.736e+02 3.426e+02 5.956e+02, threshold=5.473e+02, percent-clipped=2.0 +2022-12-08 02:18:08,687 INFO [train.py:873] (2/4) Epoch 12, batch 4800, loss[loss=0.1128, simple_loss=0.1473, pruned_loss=0.03914, over 11915.00 frames. ], tot_loss[loss=0.1238, simple_loss=0.1545, pruned_loss=0.04653, over 1990135.38 frames. ], batch size: 100, lr: 6.56e-03, grad_scale: 8.0 +2022-12-08 02:18:23,374 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.79 vs. limit=2.0 +2022-12-08 02:18:39,769 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88018.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:18:43,430 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.80 vs. limit=2.0 +2022-12-08 02:18:56,419 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.56 vs. limit=2.0 +2022-12-08 02:19:21,834 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88066.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:19:22,466 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.123e+02 2.264e+02 2.796e+02 3.655e+02 7.619e+02, threshold=5.592e+02, percent-clipped=2.0 +2022-12-08 02:19:36,610 INFO [train.py:873] (2/4) Epoch 12, batch 4900, loss[loss=0.1055, simple_loss=0.1483, pruned_loss=0.03138, over 14355.00 frames. ], tot_loss[loss=0.1242, simple_loss=0.155, pruned_loss=0.04666, over 2002150.40 frames. ], batch size: 28, lr: 6.55e-03, grad_scale: 8.0 +2022-12-08 02:19:51,936 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=88100.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:20:06,839 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88117.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:20:24,564 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.64 vs. limit=2.0 +2022-12-08 02:20:45,891 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88161.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 02:20:45,935 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=88161.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:20:49,494 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88165.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:20:51,096 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.528e+02 2.168e+02 2.607e+02 3.248e+02 6.908e+02, threshold=5.213e+02, percent-clipped=2.0 +2022-12-08 02:21:05,441 INFO [train.py:873] (2/4) Epoch 12, batch 5000, loss[loss=0.1508, simple_loss=0.1685, pruned_loss=0.06653, over 9502.00 frames. ], tot_loss[loss=0.1231, simple_loss=0.1546, pruned_loss=0.04576, over 2044806.04 frames. ], batch size: 100, lr: 6.55e-03, grad_scale: 8.0 +2022-12-08 02:21:28,461 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88209.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:21:42,394 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.91 vs. limit=2.0 +2022-12-08 02:21:55,831 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88240.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:22:00,524 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88245.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:22:14,878 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88261.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:22:19,851 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.015e+02 2.163e+02 2.571e+02 3.278e+02 8.920e+02, threshold=5.143e+02, percent-clipped=2.0 +2022-12-08 02:22:33,614 INFO [train.py:873] (2/4) Epoch 12, batch 5100, loss[loss=0.1067, simple_loss=0.1481, pruned_loss=0.03263, over 14243.00 frames. ], tot_loss[loss=0.1231, simple_loss=0.1544, pruned_loss=0.04589, over 2059699.64 frames. ], batch size: 35, lr: 6.55e-03, grad_scale: 8.0 +2022-12-08 02:22:38,217 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88288.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:22:41,185 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=88291.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:22:42,681 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88293.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:22:46,520 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0661, 1.8135, 1.4187, 2.2217, 1.8816, 1.7059, 1.4085, 1.4338], + device='cuda:2'), covar=tensor([0.0368, 0.0810, 0.0834, 0.0211, 0.0344, 0.0576, 0.0535, 0.0615], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0017, 0.0015, 0.0016, 0.0016, 0.0027, 0.0022, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 02:22:56,552 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88309.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:23:13,182 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8057, 3.7747, 3.2855, 2.6386, 3.2736, 3.6183, 4.0943, 2.9757], + device='cuda:2'), covar=tensor([0.0595, 0.1292, 0.0922, 0.1532, 0.0812, 0.0681, 0.0688, 0.1340], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0180, 0.0136, 0.0125, 0.0137, 0.0145, 0.0121, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 02:23:14,812 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.8226, 5.5980, 5.2516, 5.8840, 5.3312, 4.9438, 5.8311, 5.7618], + device='cuda:2'), covar=tensor([0.0502, 0.0613, 0.0726, 0.0390, 0.0693, 0.0443, 0.0508, 0.0444], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0129, 0.0137, 0.0148, 0.0138, 0.0113, 0.0156, 0.0135], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 02:23:17,457 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 02:23:34,141 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=88352.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 02:23:43,748 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6240, 3.3869, 3.1597, 2.3284, 3.0938, 3.3471, 3.7446, 2.9037], + device='cuda:2'), covar=tensor([0.0648, 0.1617, 0.1022, 0.1768, 0.0956, 0.0811, 0.0842, 0.1365], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0179, 0.0137, 0.0125, 0.0136, 0.0145, 0.0121, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 02:23:45,386 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7357, 3.5249, 3.1937, 3.4011, 3.6121, 3.6087, 3.6987, 3.6725], + device='cuda:2'), covar=tensor([0.0857, 0.0606, 0.2216, 0.2595, 0.0823, 0.0920, 0.1059, 0.0884], + device='cuda:2'), in_proj_covar=tensor([0.0373, 0.0261, 0.0434, 0.0554, 0.0329, 0.0426, 0.0387, 0.0366], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:23:46,914 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.222e+02 2.245e+02 2.755e+02 3.354e+02 6.298e+02, threshold=5.511e+02, percent-clipped=3.0 +2022-12-08 02:24:01,892 INFO [train.py:873] (2/4) Epoch 12, batch 5200, loss[loss=0.1423, simple_loss=0.1619, pruned_loss=0.06138, over 4979.00 frames. ], tot_loss[loss=0.1242, simple_loss=0.1553, pruned_loss=0.04657, over 2035185.19 frames. ], batch size: 100, lr: 6.54e-03, grad_scale: 8.0 +2022-12-08 02:24:12,170 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3862, 5.1322, 4.9539, 5.4741, 4.9536, 4.8265, 5.4650, 5.3550], + device='cuda:2'), covar=tensor([0.0567, 0.0719, 0.0742, 0.0417, 0.0791, 0.0440, 0.0509, 0.0460], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0129, 0.0137, 0.0147, 0.0137, 0.0113, 0.0156, 0.0134], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 02:24:38,001 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3480, 3.0897, 2.4305, 3.4447, 3.2632, 3.2744, 2.9630, 2.3323], + device='cuda:2'), covar=tensor([0.0750, 0.1507, 0.3483, 0.0666, 0.0943, 0.1064, 0.1355, 0.3603], + device='cuda:2'), in_proj_covar=tensor([0.0265, 0.0296, 0.0267, 0.0264, 0.0312, 0.0295, 0.0255, 0.0251], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 02:24:50,748 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3424, 4.6837, 4.7325, 5.3195, 4.9409, 4.5006, 5.2251, 4.2917], + device='cuda:2'), covar=tensor([0.0319, 0.1270, 0.0368, 0.0392, 0.0713, 0.0476, 0.0611, 0.0549], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0263, 0.0186, 0.0182, 0.0175, 0.0149, 0.0271, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 02:24:52,035 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6254, 1.6415, 1.7374, 1.6901, 1.7052, 1.5765, 1.4045, 1.0356], + device='cuda:2'), covar=tensor([0.0306, 0.0416, 0.0255, 0.0225, 0.0253, 0.0265, 0.0264, 0.0499], + device='cuda:2'), in_proj_covar=tensor([0.0016, 0.0017, 0.0015, 0.0016, 0.0016, 0.0027, 0.0021, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 02:25:06,282 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=88456.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:25:15,464 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.239e+02 2.113e+02 2.685e+02 3.247e+02 4.954e+02, threshold=5.369e+02, percent-clipped=0.0 +2022-12-08 02:25:29,582 INFO [train.py:873] (2/4) Epoch 12, batch 5300, loss[loss=0.129, simple_loss=0.1286, pruned_loss=0.06473, over 1301.00 frames. ], tot_loss[loss=0.1239, simple_loss=0.155, pruned_loss=0.04633, over 2024566.36 frames. ], batch size: 100, lr: 6.54e-03, grad_scale: 8.0 +2022-12-08 02:25:53,481 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1559, 1.9939, 1.7993, 1.9003, 2.0594, 2.0661, 2.0448, 2.0698], + device='cuda:2'), covar=tensor([0.1133, 0.0930, 0.2712, 0.2575, 0.1246, 0.1305, 0.1704, 0.1225], + device='cuda:2'), in_proj_covar=tensor([0.0374, 0.0261, 0.0433, 0.0555, 0.0329, 0.0429, 0.0387, 0.0366], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:26:03,983 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5245, 3.2567, 3.2295, 3.5250, 3.3502, 3.5121, 3.5577, 2.9203], + device='cuda:2'), covar=tensor([0.0417, 0.0954, 0.0425, 0.0449, 0.0667, 0.0315, 0.0523, 0.0591], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0261, 0.0184, 0.0181, 0.0174, 0.0149, 0.0268, 0.0160], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 02:26:43,276 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.234e+01 2.154e+02 2.570e+02 3.365e+02 7.012e+02, threshold=5.140e+02, percent-clipped=5.0 +2022-12-08 02:26:44,871 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.75 vs. limit=5.0 +2022-12-08 02:26:54,620 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2966, 1.7513, 2.4936, 2.0602, 2.3105, 1.6454, 2.0428, 2.2875], + device='cuda:2'), covar=tensor([0.2062, 0.3191, 0.0565, 0.2144, 0.1117, 0.2090, 0.1045, 0.0832], + device='cuda:2'), in_proj_covar=tensor([0.0247, 0.0210, 0.0204, 0.0283, 0.0223, 0.0212, 0.0212, 0.0207], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:26:57,901 INFO [train.py:873] (2/4) Epoch 12, batch 5400, loss[loss=0.1313, simple_loss=0.1358, pruned_loss=0.06342, over 2666.00 frames. ], tot_loss[loss=0.1239, simple_loss=0.1546, pruned_loss=0.04658, over 1984739.07 frames. ], batch size: 100, lr: 6.54e-03, grad_scale: 8.0 +2022-12-08 02:27:53,998 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=88647.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 02:28:07,066 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3354, 1.7076, 4.3206, 2.0261, 4.2128, 4.3601, 3.6673, 4.7703], + device='cuda:2'), covar=tensor([0.0206, 0.2969, 0.0324, 0.2003, 0.0352, 0.0345, 0.0470, 0.0134], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0157, 0.0160, 0.0167, 0.0168, 0.0177, 0.0132, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 02:28:11,936 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.088e+01 2.108e+02 2.673e+02 3.283e+02 5.238e+02, threshold=5.346e+02, percent-clipped=1.0 +2022-12-08 02:28:20,278 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.53 vs. limit=2.0 +2022-12-08 02:28:26,024 INFO [train.py:873] (2/4) Epoch 12, batch 5500, loss[loss=0.1202, simple_loss=0.1545, pruned_loss=0.04292, over 14393.00 frames. ], tot_loss[loss=0.1226, simple_loss=0.1536, pruned_loss=0.0458, over 1934186.78 frames. ], batch size: 44, lr: 6.53e-03, grad_scale: 8.0 +2022-12-08 02:28:40,991 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9924, 2.6854, 3.4886, 2.3644, 2.1965, 3.0245, 1.7126, 3.0425], + device='cuda:2'), covar=tensor([0.1034, 0.1267, 0.0626, 0.2406, 0.2462, 0.1164, 0.3626, 0.0976], + device='cuda:2'), in_proj_covar=tensor([0.0080, 0.0097, 0.0090, 0.0097, 0.0115, 0.0084, 0.0121, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 02:29:07,611 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.12 vs. limit=2.0 +2022-12-08 02:29:15,509 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.43 vs. limit=5.0 +2022-12-08 02:29:24,630 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5016, 2.3475, 3.3184, 2.5247, 3.3802, 3.2633, 3.1172, 2.7832], + device='cuda:2'), covar=tensor([0.0910, 0.2676, 0.1397, 0.1921, 0.0988, 0.0906, 0.1700, 0.1608], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0316, 0.0398, 0.0305, 0.0376, 0.0323, 0.0367, 0.0309], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:29:28,915 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3081, 1.3624, 2.5379, 1.5165, 2.4939, 2.4689, 1.8949, 2.6261], + device='cuda:2'), covar=tensor([0.0260, 0.2345, 0.0349, 0.1622, 0.0449, 0.0483, 0.1016, 0.0270], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0157, 0.0159, 0.0167, 0.0168, 0.0176, 0.0131, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 02:29:29,717 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88756.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:29:38,830 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.261e+02 2.096e+02 2.766e+02 3.641e+02 8.752e+02, threshold=5.532e+02, percent-clipped=4.0 +2022-12-08 02:29:53,659 INFO [train.py:873] (2/4) Epoch 12, batch 5600, loss[loss=0.1351, simple_loss=0.1636, pruned_loss=0.05331, over 14164.00 frames. ], tot_loss[loss=0.1247, simple_loss=0.1547, pruned_loss=0.04732, over 1929266.90 frames. ], batch size: 99, lr: 6.53e-03, grad_scale: 8.0 +2022-12-08 02:30:12,323 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88804.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:30:32,634 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.82 vs. limit=2.0 +2022-12-08 02:31:04,884 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.21 vs. limit=2.0 +2022-12-08 02:31:06,999 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.050e+02 2.217e+02 2.679e+02 3.309e+02 6.646e+02, threshold=5.358e+02, percent-clipped=2.0 +2022-12-08 02:31:20,615 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.85 vs. limit=5.0 +2022-12-08 02:31:20,847 INFO [train.py:873] (2/4) Epoch 12, batch 5700, loss[loss=0.1388, simple_loss=0.1481, pruned_loss=0.06478, over 3896.00 frames. ], tot_loss[loss=0.1253, simple_loss=0.1549, pruned_loss=0.04783, over 1876041.39 frames. ], batch size: 100, lr: 6.52e-03, grad_scale: 8.0 +2022-12-08 02:31:23,555 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3108, 1.9837, 2.3450, 2.3811, 2.2201, 1.9305, 2.4520, 2.1727], + device='cuda:2'), covar=tensor([0.0301, 0.0672, 0.0379, 0.0367, 0.0418, 0.0838, 0.0375, 0.0469], + device='cuda:2'), in_proj_covar=tensor([0.0285, 0.0252, 0.0367, 0.0320, 0.0262, 0.0297, 0.0299, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 02:31:43,944 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-08 02:31:46,989 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1834, 1.9042, 1.9670, 2.1142, 1.8849, 1.1888, 1.6686, 2.0345], + device='cuda:2'), covar=tensor([0.0625, 0.0966, 0.0638, 0.1090, 0.0946, 0.0868, 0.1052, 0.0453], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0028, 0.0030, 0.0027, 0.0029, 0.0041, 0.0029, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 02:32:10,948 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 02:32:16,543 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=88947.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:32:32,860 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.260e+02 2.272e+02 2.731e+02 3.384e+02 1.131e+03, threshold=5.463e+02, percent-clipped=4.0 +2022-12-08 02:32:47,398 INFO [train.py:873] (2/4) Epoch 12, batch 5800, loss[loss=0.1447, simple_loss=0.1671, pruned_loss=0.06112, over 6906.00 frames. ], tot_loss[loss=0.1251, simple_loss=0.1551, pruned_loss=0.04755, over 1909604.84 frames. ], batch size: 100, lr: 6.52e-03, grad_scale: 8.0 +2022-12-08 02:32:57,686 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=88995.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 02:33:19,369 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8230, 1.6428, 1.9865, 1.6841, 2.0015, 1.8216, 1.6878, 1.8958], + device='cuda:2'), covar=tensor([0.0675, 0.1262, 0.0323, 0.0435, 0.0334, 0.0720, 0.0288, 0.0415], + device='cuda:2'), in_proj_covar=tensor([0.0355, 0.0319, 0.0400, 0.0306, 0.0380, 0.0324, 0.0368, 0.0311], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:33:22,797 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-12-08 02:33:26,343 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89027.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:34:01,515 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.293e+02 2.084e+02 2.490e+02 3.041e+02 5.969e+02, threshold=4.980e+02, percent-clipped=1.0 +2022-12-08 02:34:06,897 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2348, 2.4262, 2.3941, 2.5760, 2.0270, 2.5536, 2.3686, 1.4431], + device='cuda:2'), covar=tensor([0.1305, 0.0849, 0.0863, 0.0547, 0.1142, 0.0811, 0.1198, 0.2440], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0079, 0.0065, 0.0067, 0.0093, 0.0078, 0.0094, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 02:34:15,418 INFO [train.py:873] (2/4) Epoch 12, batch 5900, loss[loss=0.1086, simple_loss=0.1497, pruned_loss=0.03372, over 14306.00 frames. ], tot_loss[loss=0.1251, simple_loss=0.1555, pruned_loss=0.04735, over 1993119.37 frames. ], batch size: 28, lr: 6.52e-03, grad_scale: 8.0 +2022-12-08 02:34:20,322 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89088.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:34:43,022 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.66 vs. limit=5.0 +2022-12-08 02:35:29,921 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.366e+02 2.297e+02 2.664e+02 3.393e+02 7.245e+02, threshold=5.328e+02, percent-clipped=3.0 +2022-12-08 02:35:31,822 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89169.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:35:44,588 INFO [train.py:873] (2/4) Epoch 12, batch 6000, loss[loss=0.1091, simple_loss=0.1515, pruned_loss=0.03339, over 14585.00 frames. ], tot_loss[loss=0.1247, simple_loss=0.1549, pruned_loss=0.04725, over 1950084.46 frames. ], batch size: 22, lr: 6.51e-03, grad_scale: 8.0 +2022-12-08 02:35:44,588 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 02:35:52,921 INFO [train.py:905] (2/4) Epoch 12, validation: loss=0.1296, simple_loss=0.1695, pruned_loss=0.04492, over 857387.00 frames. +2022-12-08 02:35:52,922 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 02:35:58,312 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9103, 1.5322, 4.5147, 4.1460, 4.0777, 4.5605, 4.1845, 4.5630], + device='cuda:2'), covar=tensor([0.1565, 0.1678, 0.0103, 0.0183, 0.0209, 0.0115, 0.0129, 0.0121], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0157, 0.0126, 0.0163, 0.0144, 0.0139, 0.0120, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 02:36:20,791 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9712, 1.5640, 4.1590, 3.8623, 3.9133, 4.2047, 3.7082, 4.2159], + device='cuda:2'), covar=tensor([0.1335, 0.1468, 0.0099, 0.0201, 0.0184, 0.0108, 0.0190, 0.0113], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0156, 0.0125, 0.0163, 0.0143, 0.0138, 0.0119, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 02:36:29,154 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9118, 1.3009, 2.0203, 1.3361, 1.9631, 2.0424, 1.7658, 2.1371], + device='cuda:2'), covar=tensor([0.0317, 0.2028, 0.0423, 0.1801, 0.0564, 0.0593, 0.0891, 0.0365], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0155, 0.0159, 0.0168, 0.0167, 0.0176, 0.0131, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 02:36:34,389 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89230.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:37:06,543 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.059e+02 2.275e+02 2.670e+02 3.684e+02 9.566e+02, threshold=5.341e+02, percent-clipped=7.0 +2022-12-08 02:37:21,417 INFO [train.py:873] (2/4) Epoch 12, batch 6100, loss[loss=0.1513, simple_loss=0.1803, pruned_loss=0.06113, over 13922.00 frames. ], tot_loss[loss=0.125, simple_loss=0.1553, pruned_loss=0.04737, over 1940370.59 frames. ], batch size: 26, lr: 6.51e-03, grad_scale: 8.0 +2022-12-08 02:38:34,355 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.186e+02 2.240e+02 2.644e+02 3.331e+02 6.090e+02, threshold=5.289e+02, percent-clipped=8.0 +2022-12-08 02:38:40,683 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89374.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:38:48,058 INFO [train.py:873] (2/4) Epoch 12, batch 6200, loss[loss=0.1313, simple_loss=0.1559, pruned_loss=0.05334, over 14268.00 frames. ], tot_loss[loss=0.1251, simple_loss=0.1552, pruned_loss=0.04754, over 1921300.68 frames. ], batch size: 89, lr: 6.51e-03, grad_scale: 8.0 +2022-12-08 02:38:48,165 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89383.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:39:15,235 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6972, 3.8715, 4.0796, 3.5181, 3.9051, 3.9332, 1.4574, 3.7225], + device='cuda:2'), covar=tensor([0.0301, 0.0317, 0.0327, 0.0509, 0.0330, 0.0325, 0.3087, 0.0271], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0169, 0.0140, 0.0139, 0.0198, 0.0136, 0.0156, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 02:39:24,618 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89425.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:39:33,851 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89435.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:39:37,330 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89439.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 02:40:00,789 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.227e+02 2.324e+02 2.890e+02 3.661e+02 8.443e+02, threshold=5.779e+02, percent-clipped=4.0 +2022-12-08 02:40:15,103 INFO [train.py:873] (2/4) Epoch 12, batch 6300, loss[loss=0.1769, simple_loss=0.1564, pruned_loss=0.09871, over 1275.00 frames. ], tot_loss[loss=0.1239, simple_loss=0.1544, pruned_loss=0.04669, over 1913878.90 frames. ], batch size: 100, lr: 6.50e-03, grad_scale: 8.0 +2022-12-08 02:40:17,847 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89486.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:40:29,978 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89500.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 02:40:37,527 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89509.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:40:51,607 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89525.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:41:26,476 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0010, 2.4519, 3.7151, 2.9250, 3.7779, 3.5453, 3.5867, 3.1207], + device='cuda:2'), covar=tensor([0.0731, 0.3165, 0.1018, 0.1918, 0.0854, 0.1087, 0.1376, 0.2222], + device='cuda:2'), in_proj_covar=tensor([0.0350, 0.0315, 0.0395, 0.0305, 0.0376, 0.0320, 0.0364, 0.0308], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:41:27,939 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.271e+01 2.078e+02 2.565e+02 3.125e+02 6.537e+02, threshold=5.131e+02, percent-clipped=2.0 +2022-12-08 02:41:30,687 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89570.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:41:41,870 INFO [train.py:873] (2/4) Epoch 12, batch 6400, loss[loss=0.1448, simple_loss=0.1428, pruned_loss=0.07337, over 2655.00 frames. ], tot_loss[loss=0.1238, simple_loss=0.1543, pruned_loss=0.04661, over 1882032.24 frames. ], batch size: 100, lr: 6.50e-03, grad_scale: 8.0 +2022-12-08 02:42:30,076 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0 +2022-12-08 02:42:56,147 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.041e+02 2.320e+02 2.969e+02 3.886e+02 7.379e+02, threshold=5.938e+02, percent-clipped=5.0 +2022-12-08 02:43:07,280 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89680.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:43:09,696 INFO [train.py:873] (2/4) Epoch 12, batch 6500, loss[loss=0.1089, simple_loss=0.1477, pruned_loss=0.03503, over 13941.00 frames. ], tot_loss[loss=0.1239, simple_loss=0.1547, pruned_loss=0.04653, over 1948069.94 frames. ], batch size: 23, lr: 6.50e-03, grad_scale: 8.0 +2022-12-08 02:43:09,883 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=89683.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:43:50,442 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89730.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:43:51,228 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=89731.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:44:00,198 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89741.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:44:23,990 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.437e+02 2.343e+02 2.899e+02 3.659e+02 1.529e+03, threshold=5.799e+02, percent-clipped=6.0 +2022-12-08 02:44:25,805 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4384, 1.8774, 3.4382, 2.4662, 3.3193, 1.8384, 2.7574, 3.3427], + device='cuda:2'), covar=tensor([0.0823, 0.4794, 0.0676, 0.5913, 0.0903, 0.3613, 0.1359, 0.0529], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0214, 0.0205, 0.0289, 0.0225, 0.0214, 0.0214, 0.0210], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:44:35,260 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89781.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:44:36,902 INFO [train.py:873] (2/4) Epoch 12, batch 6600, loss[loss=0.1223, simple_loss=0.1431, pruned_loss=0.05079, over 3852.00 frames. ], tot_loss[loss=0.1243, simple_loss=0.1548, pruned_loss=0.04694, over 1914986.33 frames. ], batch size: 100, lr: 6.49e-03, grad_scale: 8.0 +2022-12-08 02:44:38,759 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9044, 2.3391, 2.4910, 1.4976, 2.4243, 2.7874, 2.9464, 2.2040], + device='cuda:2'), covar=tensor([0.0669, 0.1207, 0.1040, 0.1975, 0.1085, 0.0674, 0.0573, 0.1407], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0174, 0.0135, 0.0123, 0.0133, 0.0143, 0.0121, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 02:44:47,410 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89795.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 02:45:13,779 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=89825.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:45:48,198 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=89865.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:45:48,293 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=89865.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:45:50,624 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.462e+01 2.267e+02 2.755e+02 3.423e+02 7.380e+02, threshold=5.509e+02, percent-clipped=4.0 +2022-12-08 02:45:52,408 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8610, 1.3132, 2.0035, 1.2167, 1.9729, 2.0609, 1.7113, 2.1381], + device='cuda:2'), covar=tensor([0.0317, 0.2115, 0.0502, 0.1933, 0.0566, 0.0599, 0.0994, 0.0373], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0157, 0.0160, 0.0169, 0.0168, 0.0178, 0.0133, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:45:55,159 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=89873.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:46:04,380 INFO [train.py:873] (2/4) Epoch 12, batch 6700, loss[loss=0.115, simple_loss=0.1524, pruned_loss=0.03878, over 14275.00 frames. ], tot_loss[loss=0.124, simple_loss=0.1548, pruned_loss=0.04665, over 1984970.10 frames. ], batch size: 44, lr: 6.49e-03, grad_scale: 8.0 +2022-12-08 02:46:08,918 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.56 vs. limit=2.0 +2022-12-08 02:46:41,442 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=89926.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 02:47:13,359 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-12-08 02:47:17,827 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.103e+02 2.198e+02 2.718e+02 3.192e+02 1.007e+03, threshold=5.437e+02, percent-clipped=1.0 +2022-12-08 02:47:30,904 INFO [train.py:873] (2/4) Epoch 12, batch 6800, loss[loss=0.1272, simple_loss=0.162, pruned_loss=0.04618, over 14593.00 frames. ], tot_loss[loss=0.1233, simple_loss=0.154, pruned_loss=0.04628, over 2006293.72 frames. ], batch size: 23, lr: 6.48e-03, grad_scale: 8.0 +2022-12-08 02:47:41,516 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6448, 1.6944, 1.7929, 1.2943, 1.2462, 1.6762, 1.0846, 1.6916], + device='cuda:2'), covar=tensor([0.1375, 0.2312, 0.0929, 0.2140, 0.2998, 0.0907, 0.2943, 0.1107], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0099, 0.0092, 0.0098, 0.0117, 0.0085, 0.0124, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 02:48:15,284 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90030.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:48:20,869 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90036.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:48:47,747 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.338e+02 2.301e+02 2.958e+02 4.111e+02 8.301e+02, threshold=5.916e+02, percent-clipped=6.0 +2022-12-08 02:48:57,173 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90078.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:48:59,884 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90081.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:49:01,450 INFO [train.py:873] (2/4) Epoch 12, batch 6900, loss[loss=0.1283, simple_loss=0.1583, pruned_loss=0.04915, over 14281.00 frames. ], tot_loss[loss=0.1236, simple_loss=0.1544, pruned_loss=0.04638, over 2022827.74 frames. ], batch size: 60, lr: 6.48e-03, grad_scale: 8.0 +2022-12-08 02:49:06,710 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4080, 2.5415, 5.2262, 4.5790, 4.5534, 5.3111, 5.0292, 5.3629], + device='cuda:2'), covar=tensor([0.1172, 0.1097, 0.0071, 0.0164, 0.0174, 0.0079, 0.0094, 0.0078], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0158, 0.0127, 0.0164, 0.0143, 0.0139, 0.0120, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 02:49:11,904 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90095.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:49:41,778 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90129.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:49:42,822 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90130.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:49:53,759 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90143.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 02:49:55,549 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2395, 1.7996, 2.1337, 1.9367, 2.2476, 2.0985, 2.0217, 2.1123], + device='cuda:2'), covar=tensor([0.0449, 0.1570, 0.0387, 0.0792, 0.0365, 0.0726, 0.0393, 0.0610], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0315, 0.0398, 0.0304, 0.0373, 0.0321, 0.0364, 0.0310], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:50:13,213 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90165.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:50:15,737 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.569e+02 2.354e+02 2.821e+02 3.688e+02 2.263e+03, threshold=5.641e+02, percent-clipped=6.0 +2022-12-08 02:50:28,849 INFO [train.py:873] (2/4) Epoch 12, batch 7000, loss[loss=0.1375, simple_loss=0.1407, pruned_loss=0.0672, over 2618.00 frames. ], tot_loss[loss=0.1243, simple_loss=0.1546, pruned_loss=0.04698, over 1960721.40 frames. ], batch size: 100, lr: 6.48e-03, grad_scale: 8.0 +2022-12-08 02:50:35,978 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90191.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:50:55,446 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90213.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:50:59,933 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7106, 3.4034, 3.3256, 3.7056, 3.4471, 3.6432, 3.6806, 3.0183], + device='cuda:2'), covar=tensor([0.0482, 0.1125, 0.0507, 0.0437, 0.0824, 0.0376, 0.0652, 0.0632], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0258, 0.0186, 0.0181, 0.0175, 0.0147, 0.0267, 0.0161], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 02:51:02,619 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90221.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 02:51:03,797 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9567, 2.4752, 3.7228, 2.8834, 3.7670, 3.6268, 3.5301, 3.1722], + device='cuda:2'), covar=tensor([0.0657, 0.2765, 0.0865, 0.1620, 0.0679, 0.0818, 0.1481, 0.1819], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0315, 0.0397, 0.0303, 0.0372, 0.0319, 0.0362, 0.0308], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:51:11,954 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90232.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:51:27,833 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7844, 3.5578, 3.2934, 2.4668, 3.3307, 3.5950, 3.9640, 2.9284], + device='cuda:2'), covar=tensor([0.0599, 0.1129, 0.0818, 0.1283, 0.0748, 0.0537, 0.0510, 0.1118], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0178, 0.0136, 0.0124, 0.0134, 0.0146, 0.0123, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 02:51:30,693 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.17 vs. limit=2.0 +2022-12-08 02:51:44,201 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.424e+02 1.997e+02 2.595e+02 3.353e+02 6.185e+02, threshold=5.189e+02, percent-clipped=2.0 +2022-12-08 02:51:56,323 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.64 vs. limit=5.0 +2022-12-08 02:51:57,580 INFO [train.py:873] (2/4) Epoch 12, batch 7100, loss[loss=0.1269, simple_loss=0.1562, pruned_loss=0.04875, over 14217.00 frames. ], tot_loss[loss=0.124, simple_loss=0.1544, pruned_loss=0.04674, over 1956782.07 frames. ], batch size: 35, lr: 6.47e-03, grad_scale: 8.0 +2022-12-08 02:52:06,230 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90293.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:52:10,429 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7921, 1.4698, 3.7071, 1.6268, 3.7513, 3.8528, 2.8440, 4.2117], + device='cuda:2'), covar=tensor([0.0224, 0.3166, 0.0392, 0.2397, 0.0468, 0.0378, 0.0658, 0.0138], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0157, 0.0162, 0.0169, 0.0169, 0.0179, 0.0132, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:52:40,683 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6323, 1.7615, 1.8455, 1.3399, 1.3559, 1.6709, 1.0987, 1.7722], + device='cuda:2'), covar=tensor([0.1457, 0.2051, 0.0862, 0.2464, 0.2711, 0.0910, 0.2576, 0.0955], + device='cuda:2'), in_proj_covar=tensor([0.0081, 0.0098, 0.0091, 0.0096, 0.0115, 0.0084, 0.0122, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 02:52:44,265 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90336.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:52:57,287 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90351.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:53:12,653 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.266e+02 2.346e+02 2.734e+02 3.324e+02 7.118e+02, threshold=5.469e+02, percent-clipped=5.0 +2022-12-08 02:53:14,483 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6797, 4.7575, 5.1426, 4.1143, 4.9514, 5.2459, 2.1474, 4.6247], + device='cuda:2'), covar=tensor([0.0264, 0.0260, 0.0311, 0.0420, 0.0231, 0.0115, 0.2814, 0.0231], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0169, 0.0142, 0.0140, 0.0201, 0.0135, 0.0157, 0.0187], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 02:53:26,144 INFO [train.py:873] (2/4) Epoch 12, batch 7200, loss[loss=0.2007, simple_loss=0.1741, pruned_loss=0.1137, over 1239.00 frames. ], tot_loss[loss=0.1243, simple_loss=0.1545, pruned_loss=0.04706, over 1876485.20 frames. ], batch size: 100, lr: 6.47e-03, grad_scale: 8.0 +2022-12-08 02:53:26,989 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90384.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:53:51,946 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90412.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:53:55,183 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8725, 1.6501, 1.7841, 1.5150, 1.8758, 1.1346, 1.5499, 1.8331], + device='cuda:2'), covar=tensor([0.1186, 0.1206, 0.0917, 0.2104, 0.1062, 0.0922, 0.0820, 0.0548], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0028, 0.0030, 0.0027, 0.0028, 0.0040, 0.0028, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 02:54:07,531 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8083, 2.1132, 1.9380, 1.6043, 1.6261, 1.7969, 1.6339, 1.7643], + device='cuda:2'), covar=tensor([0.0630, 0.0793, 0.0433, 0.0630, 0.0636, 0.0513, 0.0594, 0.0479], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0018, 0.0015, 0.0016, 0.0016, 0.0027, 0.0022, 0.0027], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 02:54:33,017 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90459.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:54:40,950 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.493e+02 2.297e+02 2.801e+02 3.599e+02 6.534e+02, threshold=5.602e+02, percent-clipped=3.0 +2022-12-08 02:54:54,141 INFO [train.py:873] (2/4) Epoch 12, batch 7300, loss[loss=0.1719, simple_loss=0.1834, pruned_loss=0.08017, over 8629.00 frames. ], tot_loss[loss=0.1243, simple_loss=0.1539, pruned_loss=0.04734, over 1834224.31 frames. ], batch size: 100, lr: 6.47e-03, grad_scale: 8.0 +2022-12-08 02:54:56,704 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90486.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:55:05,358 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90496.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:55:23,902 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7952, 0.8072, 0.6227, 0.8273, 0.7834, 0.2537, 0.7670, 0.8573], + device='cuda:2'), covar=tensor([0.0267, 0.0418, 0.0319, 0.0300, 0.0249, 0.0236, 0.0704, 0.0561], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0028, 0.0030, 0.0027, 0.0029, 0.0040, 0.0028, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 02:55:26,544 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90520.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:55:27,622 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90521.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 02:55:29,557 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.03 vs. limit=5.0 +2022-12-08 02:55:46,161 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3227, 1.4676, 3.3672, 1.5152, 3.1262, 3.4475, 2.5476, 3.6520], + device='cuda:2'), covar=tensor([0.0266, 0.3180, 0.0490, 0.2518, 0.0948, 0.0434, 0.0816, 0.0234], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0156, 0.0161, 0.0169, 0.0169, 0.0178, 0.0133, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 02:55:58,869 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90557.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:56:05,132 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7222, 1.6868, 1.6798, 1.4450, 1.4700, 1.4141, 0.9908, 1.1582], + device='cuda:2'), covar=tensor([0.0208, 0.0268, 0.0206, 0.0226, 0.0246, 0.0289, 0.0245, 0.0469], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0018, 0.0015, 0.0016, 0.0016, 0.0027, 0.0022, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 02:56:08,322 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.346e+02 2.439e+02 2.820e+02 3.761e+02 1.640e+03, threshold=5.641e+02, percent-clipped=6.0 +2022-12-08 02:56:09,298 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90569.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:56:14,447 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8970, 2.7508, 2.4700, 2.5920, 2.8139, 2.8343, 2.8420, 2.8522], + device='cuda:2'), covar=tensor([0.0977, 0.0764, 0.2310, 0.2806, 0.0996, 0.1138, 0.1420, 0.0948], + device='cuda:2'), in_proj_covar=tensor([0.0372, 0.0260, 0.0430, 0.0543, 0.0324, 0.0425, 0.0386, 0.0362], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 02:56:21,034 INFO [train.py:873] (2/4) Epoch 12, batch 7400, loss[loss=0.1074, simple_loss=0.1458, pruned_loss=0.03452, over 14269.00 frames. ], tot_loss[loss=0.1248, simple_loss=0.1546, pruned_loss=0.04754, over 1877354.17 frames. ], batch size: 28, lr: 6.46e-03, grad_scale: 8.0 +2022-12-08 02:56:25,655 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90588.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:56:28,366 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2551, 2.2020, 3.2524, 3.3389, 3.2449, 2.2527, 3.1794, 2.5374], + device='cuda:2'), covar=tensor([0.0382, 0.0931, 0.0697, 0.0518, 0.0405, 0.1276, 0.0362, 0.0862], + device='cuda:2'), in_proj_covar=tensor([0.0287, 0.0254, 0.0370, 0.0324, 0.0265, 0.0301, 0.0299, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 02:57:36,485 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.166e+02 2.056e+02 2.705e+02 3.388e+02 5.902e+02, threshold=5.409e+02, percent-clipped=1.0 +2022-12-08 02:57:50,343 INFO [train.py:873] (2/4) Epoch 12, batch 7500, loss[loss=0.128, simple_loss=0.1297, pruned_loss=0.06311, over 1165.00 frames. ], tot_loss[loss=0.1252, simple_loss=0.155, pruned_loss=0.04768, over 1909094.52 frames. ], batch size: 100, lr: 6.46e-03, grad_scale: 8.0 +2022-12-08 02:58:10,924 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90707.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 02:59:13,085 INFO [train.py:873] (2/4) Epoch 13, batch 0, loss[loss=0.158, simple_loss=0.1885, pruned_loss=0.06381, over 14494.00 frames. ], tot_loss[loss=0.158, simple_loss=0.1885, pruned_loss=0.06381, over 14494.00 frames. ], batch size: 24, lr: 6.21e-03, grad_scale: 8.0 +2022-12-08 02:59:13,085 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 02:59:20,360 INFO [train.py:905] (2/4) Epoch 13, validation: loss=0.1364, simple_loss=0.1777, pruned_loss=0.04756, over 857387.00 frames. +2022-12-08 02:59:20,361 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 02:59:27,103 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.61 vs. limit=2.0 +2022-12-08 02:59:28,878 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-08 02:59:41,457 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 6.584e+01 1.659e+02 2.772e+02 3.641e+02 1.065e+03, threshold=5.544e+02, percent-clipped=8.0 +2022-12-08 02:59:48,150 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9706, 3.2311, 2.9892, 3.1727, 2.4056, 3.3312, 3.1207, 1.6795], + device='cuda:2'), covar=tensor([0.1768, 0.0763, 0.1414, 0.1084, 0.1047, 0.0593, 0.0945, 0.2365], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0080, 0.0064, 0.0067, 0.0093, 0.0078, 0.0094, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 02:59:57,642 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90786.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:00:16,028 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7434, 1.8947, 2.5620, 2.2750, 2.6113, 2.5112, 2.4103, 2.3224], + device='cuda:2'), covar=tensor([0.0704, 0.2719, 0.0900, 0.1495, 0.0598, 0.1101, 0.0831, 0.1347], + device='cuda:2'), in_proj_covar=tensor([0.0350, 0.0315, 0.0398, 0.0304, 0.0377, 0.0321, 0.0366, 0.0306], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:00:24,320 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90815.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:00:40,590 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90834.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:00:50,553 INFO [train.py:873] (2/4) Epoch 13, batch 100, loss[loss=0.1229, simple_loss=0.1568, pruned_loss=0.04449, over 14196.00 frames. ], tot_loss[loss=0.1222, simple_loss=0.1549, pruned_loss=0.04479, over 895288.53 frames. ], batch size: 89, lr: 6.20e-03, grad_scale: 8.0 +2022-12-08 03:00:54,318 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=90849.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:00:56,694 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=90852.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:01:09,979 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.361e+02 2.335e+02 2.702e+02 3.282e+02 9.825e+02, threshold=5.404e+02, percent-clipped=3.0 +2022-12-08 03:01:27,452 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=90888.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:01:30,057 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8768, 1.5217, 3.0651, 2.8441, 2.9806, 3.1079, 2.2532, 3.0520], + device='cuda:2'), covar=tensor([0.1216, 0.1388, 0.0162, 0.0331, 0.0306, 0.0151, 0.0471, 0.0202], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0157, 0.0126, 0.0165, 0.0143, 0.0139, 0.0120, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 03:01:46,447 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=90910.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:02:07,623 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7575, 1.5811, 1.8311, 2.0592, 1.3600, 1.7836, 1.7379, 1.8400], + device='cuda:2'), covar=tensor([0.0173, 0.0248, 0.0133, 0.0141, 0.0256, 0.0282, 0.0176, 0.0112], + device='cuda:2'), in_proj_covar=tensor([0.0287, 0.0254, 0.0371, 0.0325, 0.0265, 0.0300, 0.0300, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 03:02:09,078 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=90936.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:02:16,741 INFO [train.py:873] (2/4) Epoch 13, batch 200, loss[loss=0.1866, simple_loss=0.1909, pruned_loss=0.09119, over 8626.00 frames. ], tot_loss[loss=0.1241, simple_loss=0.1551, pruned_loss=0.04655, over 1320278.48 frames. ], batch size: 100, lr: 6.20e-03, grad_scale: 8.0 +2022-12-08 03:02:32,242 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6330, 1.7524, 1.8276, 1.3410, 1.2768, 1.7440, 1.0232, 1.6868], + device='cuda:2'), covar=tensor([0.1333, 0.2220, 0.0850, 0.2285, 0.2894, 0.0883, 0.3069, 0.1264], + device='cuda:2'), in_proj_covar=tensor([0.0082, 0.0098, 0.0090, 0.0097, 0.0114, 0.0084, 0.0123, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 03:02:36,077 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.982e+01 2.220e+02 2.675e+02 3.390e+02 6.533e+02, threshold=5.349e+02, percent-clipped=6.0 +2022-12-08 03:02:39,128 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8145, 1.5019, 1.8587, 1.6623, 1.8666, 1.1070, 1.5106, 1.6467], + device='cuda:2'), covar=tensor([0.0898, 0.0883, 0.0626, 0.0594, 0.0585, 0.0848, 0.0925, 0.0625], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0028, 0.0030, 0.0027, 0.0028, 0.0040, 0.0028, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 03:02:39,141 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7805, 1.7946, 1.9883, 1.6335, 1.7063, 1.5184, 1.3834, 1.1874], + device='cuda:2'), covar=tensor([0.0293, 0.0323, 0.0254, 0.0379, 0.0297, 0.0351, 0.0274, 0.0452], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0018, 0.0015, 0.0016, 0.0016, 0.0027, 0.0022, 0.0026], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 03:02:48,924 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8485, 1.5678, 3.6431, 3.3943, 3.5015, 3.7102, 3.0841, 3.6607], + device='cuda:2'), covar=tensor([0.1427, 0.1571, 0.0124, 0.0240, 0.0231, 0.0131, 0.0257, 0.0143], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0158, 0.0127, 0.0166, 0.0144, 0.0140, 0.0120, 0.0119], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 03:02:49,868 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8667, 1.4830, 1.7951, 1.7986, 1.8626, 1.0016, 1.5238, 1.5655], + device='cuda:2'), covar=tensor([0.0587, 0.1084, 0.0493, 0.1240, 0.0771, 0.0962, 0.0791, 0.0708], + device='cuda:2'), in_proj_covar=tensor([0.0028, 0.0028, 0.0030, 0.0027, 0.0029, 0.0040, 0.0028, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 03:03:11,305 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91007.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:03:44,491 INFO [train.py:873] (2/4) Epoch 13, batch 300, loss[loss=0.1301, simple_loss=0.1316, pruned_loss=0.06424, over 2598.00 frames. ], tot_loss[loss=0.123, simple_loss=0.1545, pruned_loss=0.04577, over 1647753.29 frames. ], batch size: 100, lr: 6.20e-03, grad_scale: 8.0 +2022-12-08 03:03:53,666 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91055.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:04:03,034 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8823, 3.6331, 3.3875, 3.5782, 3.8130, 3.8341, 3.8678, 3.8965], + device='cuda:2'), covar=tensor([0.0926, 0.0629, 0.2007, 0.2480, 0.0711, 0.0774, 0.1065, 0.0754], + device='cuda:2'), in_proj_covar=tensor([0.0376, 0.0263, 0.0439, 0.0549, 0.0329, 0.0426, 0.0391, 0.0362], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:04:04,582 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.277e+02 2.005e+02 2.425e+02 3.069e+02 6.147e+02, threshold=4.849e+02, percent-clipped=1.0 +2022-12-08 03:04:16,168 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.50 vs. limit=2.0 +2022-12-08 03:04:46,143 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91115.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:05:05,336 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6677, 3.5365, 3.3998, 3.7448, 3.3705, 2.9801, 3.7507, 3.5909], + device='cuda:2'), covar=tensor([0.0723, 0.0881, 0.0949, 0.0704, 0.0978, 0.1013, 0.0640, 0.0818], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0133, 0.0143, 0.0152, 0.0142, 0.0118, 0.0161, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 03:05:13,074 INFO [train.py:873] (2/4) Epoch 13, batch 400, loss[loss=0.1638, simple_loss=0.1521, pruned_loss=0.08776, over 1159.00 frames. ], tot_loss[loss=0.1231, simple_loss=0.1542, pruned_loss=0.04598, over 1762568.98 frames. ], batch size: 100, lr: 6.19e-03, grad_scale: 8.0 +2022-12-08 03:05:16,744 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8803, 3.6305, 3.3116, 2.8185, 3.2952, 3.5843, 3.9563, 3.0486], + device='cuda:2'), covar=tensor([0.0463, 0.1184, 0.0863, 0.1215, 0.0755, 0.0531, 0.0836, 0.1202], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0178, 0.0138, 0.0125, 0.0135, 0.0146, 0.0124, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 03:05:19,350 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91152.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:05:28,732 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91163.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:05:33,351 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.258e+02 2.284e+02 2.800e+02 3.397e+02 6.782e+02, threshold=5.601e+02, percent-clipped=4.0 +2022-12-08 03:06:01,549 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91200.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:06:06,355 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=91205.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:06:39,703 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4277, 4.6039, 4.8279, 4.1929, 4.5993, 4.8806, 1.7610, 4.3310], + device='cuda:2'), covar=tensor([0.0242, 0.0281, 0.0322, 0.0373, 0.0274, 0.0155, 0.3239, 0.0272], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0167, 0.0140, 0.0137, 0.0198, 0.0133, 0.0155, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 03:06:41,226 INFO [train.py:873] (2/4) Epoch 13, batch 500, loss[loss=0.1164, simple_loss=0.1311, pruned_loss=0.05088, over 2548.00 frames. ], tot_loss[loss=0.1229, simple_loss=0.1542, pruned_loss=0.0458, over 1840620.59 frames. ], batch size: 100, lr: 6.19e-03, grad_scale: 8.0 +2022-12-08 03:07:01,832 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.375e+02 2.197e+02 2.790e+02 3.555e+02 6.703e+02, threshold=5.580e+02, percent-clipped=3.0 +2022-12-08 03:07:16,577 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91285.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:07:21,701 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.86 vs. limit=2.0 +2022-12-08 03:07:45,460 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8088, 2.1453, 2.0871, 2.1713, 1.8916, 2.1993, 2.0367, 1.1662], + device='cuda:2'), covar=tensor([0.1590, 0.1118, 0.1004, 0.0979, 0.1327, 0.0918, 0.1407, 0.2779], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0080, 0.0064, 0.0067, 0.0094, 0.0078, 0.0093, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 03:08:09,275 INFO [train.py:873] (2/4) Epoch 13, batch 600, loss[loss=0.09467, simple_loss=0.1424, pruned_loss=0.02345, over 14214.00 frames. ], tot_loss[loss=0.1228, simple_loss=0.1541, pruned_loss=0.04575, over 1902742.30 frames. ], batch size: 25, lr: 6.19e-03, grad_scale: 8.0 +2022-12-08 03:08:10,275 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=91346.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:08:21,124 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9179, 1.6046, 3.5538, 3.2953, 3.4069, 3.5412, 2.8453, 3.5727], + device='cuda:2'), covar=tensor([0.1396, 0.1557, 0.0122, 0.0276, 0.0251, 0.0154, 0.0356, 0.0141], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0129, 0.0167, 0.0147, 0.0142, 0.0123, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 03:08:21,142 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5066, 2.3443, 2.6572, 1.8126, 1.8505, 2.4386, 1.4571, 2.3046], + device='cuda:2'), covar=tensor([0.0612, 0.1299, 0.0878, 0.2659, 0.2548, 0.0866, 0.3673, 0.1021], + device='cuda:2'), in_proj_covar=tensor([0.0082, 0.0098, 0.0091, 0.0097, 0.0114, 0.0086, 0.0123, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 03:08:26,519 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6132, 1.3941, 3.6967, 1.6279, 3.5315, 3.7644, 2.4724, 4.0157], + device='cuda:2'), covar=tensor([0.0253, 0.3166, 0.0361, 0.2228, 0.0634, 0.0374, 0.0948, 0.0180], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0156, 0.0157, 0.0166, 0.0166, 0.0175, 0.0132, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 03:08:29,164 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.092e+02 2.257e+02 2.792e+02 3.414e+02 7.089e+02, threshold=5.584e+02, percent-clipped=8.0 +2022-12-08 03:08:39,699 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9094, 2.7322, 2.7098, 2.9228, 2.7539, 2.8274, 2.9688, 2.4683], + device='cuda:2'), covar=tensor([0.0544, 0.0959, 0.0595, 0.0576, 0.0851, 0.0491, 0.0650, 0.0646], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0263, 0.0187, 0.0183, 0.0178, 0.0149, 0.0269, 0.0164], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 03:09:00,320 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1311, 3.1103, 2.9195, 3.2803, 2.8621, 2.7899, 3.2007, 3.1582], + device='cuda:2'), covar=tensor([0.0780, 0.0840, 0.1037, 0.0600, 0.1170, 0.0826, 0.0725, 0.0763], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0131, 0.0142, 0.0151, 0.0140, 0.0117, 0.0160, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 03:09:35,347 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6878, 1.7060, 2.9283, 2.1283, 2.7142, 1.7467, 2.2631, 2.7306], + device='cuda:2'), covar=tensor([0.1206, 0.4563, 0.0674, 0.4329, 0.1166, 0.3658, 0.1419, 0.1019], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0214, 0.0208, 0.0291, 0.0232, 0.0215, 0.0213, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:09:36,012 INFO [train.py:873] (2/4) Epoch 13, batch 700, loss[loss=0.1367, simple_loss=0.1578, pruned_loss=0.0578, over 5946.00 frames. ], tot_loss[loss=0.1224, simple_loss=0.1539, pruned_loss=0.04542, over 2004010.45 frames. ], batch size: 100, lr: 6.18e-03, grad_scale: 8.0 +2022-12-08 03:09:55,978 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.741e+01 2.087e+02 2.616e+02 3.337e+02 7.440e+02, threshold=5.233e+02, percent-clipped=3.0 +2022-12-08 03:10:28,587 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91505.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:11:01,221 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91542.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 03:11:03,623 INFO [train.py:873] (2/4) Epoch 13, batch 800, loss[loss=0.1187, simple_loss=0.1552, pruned_loss=0.04107, over 14263.00 frames. ], tot_loss[loss=0.122, simple_loss=0.1534, pruned_loss=0.04536, over 1943154.02 frames. ], batch size: 63, lr: 6.18e-03, grad_scale: 8.0 +2022-12-08 03:11:10,462 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91553.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:11:23,656 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.089e+02 2.194e+02 2.551e+02 3.061e+02 6.051e+02, threshold=5.101e+02, percent-clipped=1.0 +2022-12-08 03:11:54,254 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=91603.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 03:12:23,658 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.41 vs. limit=5.0 +2022-12-08 03:12:27,369 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=91641.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:12:30,941 INFO [train.py:873] (2/4) Epoch 13, batch 900, loss[loss=0.1188, simple_loss=0.1312, pruned_loss=0.05315, over 3879.00 frames. ], tot_loss[loss=0.1219, simple_loss=0.1534, pruned_loss=0.04518, over 2002875.53 frames. ], batch size: 100, lr: 6.18e-03, grad_scale: 16.0 +2022-12-08 03:12:31,121 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7582, 2.7493, 2.0865, 2.8561, 2.6643, 2.7109, 2.5161, 2.2332], + device='cuda:2'), covar=tensor([0.0962, 0.1241, 0.3473, 0.0761, 0.1190, 0.0878, 0.1566, 0.2487], + device='cuda:2'), in_proj_covar=tensor([0.0269, 0.0297, 0.0269, 0.0262, 0.0315, 0.0296, 0.0258, 0.0249], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:12:33,571 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.43 vs. limit=2.0 +2022-12-08 03:12:52,063 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.003e+02 2.371e+02 2.875e+02 3.801e+02 8.432e+02, threshold=5.750e+02, percent-clipped=7.0 +2022-12-08 03:13:41,401 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.25 vs. limit=5.0 +2022-12-08 03:13:58,931 INFO [train.py:873] (2/4) Epoch 13, batch 1000, loss[loss=0.1151, simple_loss=0.1418, pruned_loss=0.04418, over 6917.00 frames. ], tot_loss[loss=0.1232, simple_loss=0.154, pruned_loss=0.04621, over 1912143.10 frames. ], batch size: 100, lr: 6.17e-03, grad_scale: 8.0 +2022-12-08 03:14:19,310 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.021e+02 2.131e+02 2.597e+02 3.397e+02 6.255e+02, threshold=5.195e+02, percent-clipped=2.0 +2022-12-08 03:14:44,186 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1927, 4.7339, 4.6323, 5.1218, 4.7691, 4.4540, 5.1241, 4.3198], + device='cuda:2'), covar=tensor([0.0338, 0.1115, 0.0330, 0.0390, 0.0755, 0.0521, 0.0475, 0.0479], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0266, 0.0188, 0.0184, 0.0180, 0.0149, 0.0271, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 03:15:01,000 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9235, 1.5149, 3.4907, 3.2296, 3.3194, 3.5051, 2.7962, 3.4909], + device='cuda:2'), covar=tensor([0.1547, 0.1665, 0.0119, 0.0264, 0.0303, 0.0142, 0.0301, 0.0140], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0129, 0.0167, 0.0146, 0.0141, 0.0122, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 03:15:09,050 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.86 vs. limit=5.0 +2022-12-08 03:15:26,317 INFO [train.py:873] (2/4) Epoch 13, batch 1100, loss[loss=0.1121, simple_loss=0.1297, pruned_loss=0.04719, over 3841.00 frames. ], tot_loss[loss=0.1239, simple_loss=0.1541, pruned_loss=0.04684, over 1843686.47 frames. ], batch size: 100, lr: 6.17e-03, grad_scale: 8.0 +2022-12-08 03:15:36,872 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.59 vs. limit=5.0 +2022-12-08 03:15:46,818 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7364, 3.0632, 2.7882, 3.0058, 2.3149, 3.0757, 2.8404, 1.3772], + device='cuda:2'), covar=tensor([0.1903, 0.0737, 0.1211, 0.0692, 0.1130, 0.0627, 0.1199, 0.2763], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0081, 0.0065, 0.0067, 0.0095, 0.0079, 0.0095, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 03:15:47,465 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.763e+01 2.120e+02 2.631e+02 3.193e+02 5.598e+02, threshold=5.262e+02, percent-clipped=1.0 +2022-12-08 03:15:48,018 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-08 03:16:07,096 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91891.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:16:13,635 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=91898.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 03:16:51,346 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=91941.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:16:51,450 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7803, 3.0989, 4.5380, 3.3982, 4.5323, 4.4273, 4.3323, 4.0604], + device='cuda:2'), covar=tensor([0.0695, 0.2735, 0.0715, 0.1638, 0.0798, 0.0866, 0.1496, 0.1432], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0312, 0.0391, 0.0300, 0.0373, 0.0320, 0.0359, 0.0306], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:16:54,656 INFO [train.py:873] (2/4) Epoch 13, batch 1200, loss[loss=0.1504, simple_loss=0.1448, pruned_loss=0.07797, over 2627.00 frames. ], tot_loss[loss=0.1227, simple_loss=0.1542, pruned_loss=0.04557, over 1958241.11 frames. ], batch size: 100, lr: 6.17e-03, grad_scale: 8.0 +2022-12-08 03:17:00,858 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=91952.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:17:03,181 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91955.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:17:15,484 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.061e+02 2.334e+02 2.878e+02 3.521e+02 1.024e+03, threshold=5.756e+02, percent-clipped=7.0 +2022-12-08 03:17:26,232 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=91981.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 03:17:33,103 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=91989.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:17:57,356 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92016.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:18:19,101 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5764, 2.4028, 3.6123, 3.7395, 3.6956, 2.2360, 3.7584, 2.5998], + device='cuda:2'), covar=tensor([0.0442, 0.1018, 0.0827, 0.0533, 0.0417, 0.1639, 0.0414, 0.1127], + device='cuda:2'), in_proj_covar=tensor([0.0288, 0.0256, 0.0374, 0.0327, 0.0267, 0.0302, 0.0304, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0003, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 03:18:19,861 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92042.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 03:18:20,296 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-12-08 03:18:22,473 INFO [train.py:873] (2/4) Epoch 13, batch 1300, loss[loss=0.1904, simple_loss=0.1794, pruned_loss=0.1007, over 3918.00 frames. ], tot_loss[loss=0.1221, simple_loss=0.1537, pruned_loss=0.04519, over 2033612.48 frames. ], batch size: 100, lr: 6.16e-03, grad_scale: 8.0 +2022-12-08 03:18:44,122 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.023e+02 2.102e+02 2.497e+02 3.213e+02 6.107e+02, threshold=4.993e+02, percent-clipped=1.0 +2022-12-08 03:19:52,208 INFO [train.py:873] (2/4) Epoch 13, batch 1400, loss[loss=0.09536, simple_loss=0.1434, pruned_loss=0.02366, over 14252.00 frames. ], tot_loss[loss=0.1217, simple_loss=0.1535, pruned_loss=0.04493, over 2042678.22 frames. ], batch size: 35, lr: 6.16e-03, grad_scale: 8.0 +2022-12-08 03:19:53,527 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-08 03:20:13,182 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.308e+02 2.362e+02 2.970e+02 3.546e+02 6.469e+02, threshold=5.940e+02, percent-clipped=3.0 +2022-12-08 03:20:22,954 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7722, 3.0600, 2.9024, 3.0613, 2.3196, 3.1880, 2.8728, 1.6141], + device='cuda:2'), covar=tensor([0.1726, 0.0959, 0.1121, 0.0643, 0.1065, 0.0427, 0.1255, 0.2416], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0081, 0.0066, 0.0067, 0.0095, 0.0080, 0.0095, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 03:20:32,954 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.42 vs. limit=5.0 +2022-12-08 03:20:39,433 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=92198.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 03:20:54,219 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0207, 1.9671, 2.2096, 2.0376, 2.1474, 1.4008, 1.8179, 2.1679], + device='cuda:2'), covar=tensor([0.0930, 0.0867, 0.0742, 0.0907, 0.1943, 0.0807, 0.1500, 0.0926], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0029, 0.0031, 0.0028, 0.0029, 0.0042, 0.0030, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 03:21:08,425 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0498, 1.9181, 4.7598, 2.5788, 4.5054, 4.9243, 4.6087, 5.4876], + device='cuda:2'), covar=tensor([0.0196, 0.2995, 0.0353, 0.1891, 0.0242, 0.0373, 0.0254, 0.0123], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0157, 0.0159, 0.0167, 0.0168, 0.0178, 0.0134, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:21:14,526 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2082, 2.9470, 2.3152, 3.2584, 3.0792, 3.2025, 2.8038, 2.3214], + device='cuda:2'), covar=tensor([0.0905, 0.1448, 0.3176, 0.0655, 0.0890, 0.1057, 0.1390, 0.3063], + device='cuda:2'), in_proj_covar=tensor([0.0269, 0.0295, 0.0265, 0.0261, 0.0309, 0.0296, 0.0257, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:21:16,235 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6172, 1.4791, 1.9679, 1.5733, 1.7203, 1.3018, 1.6382, 1.1174], + device='cuda:2'), covar=tensor([0.0272, 0.0445, 0.0201, 0.0340, 0.0251, 0.0373, 0.0315, 0.0670], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0018, 0.0016, 0.0017, 0.0017, 0.0027, 0.0022, 0.0027], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 03:21:21,064 INFO [train.py:873] (2/4) Epoch 13, batch 1500, loss[loss=0.1207, simple_loss=0.1255, pruned_loss=0.05792, over 2599.00 frames. ], tot_loss[loss=0.1207, simple_loss=0.1528, pruned_loss=0.04427, over 2066860.74 frames. ], batch size: 100, lr: 6.16e-03, grad_scale: 8.0 +2022-12-08 03:21:22,064 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=92246.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 03:21:22,851 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=92247.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:21:41,369 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.124e+02 2.104e+02 2.663e+02 3.435e+02 6.153e+02, threshold=5.325e+02, percent-clipped=2.0 +2022-12-08 03:21:51,792 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9756, 3.0886, 3.1884, 3.0256, 3.1372, 2.9999, 1.5076, 2.8883], + device='cuda:2'), covar=tensor([0.0354, 0.0353, 0.0384, 0.0380, 0.0326, 0.0615, 0.2828, 0.0303], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0167, 0.0141, 0.0139, 0.0199, 0.0133, 0.0156, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 03:22:18,313 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=92311.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:22:30,384 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2470, 1.7281, 2.4492, 1.9960, 2.3122, 1.6912, 1.9977, 2.2416], + device='cuda:2'), covar=tensor([0.1717, 0.3578, 0.0566, 0.2176, 0.1154, 0.2137, 0.1036, 0.0846], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0214, 0.0210, 0.0289, 0.0231, 0.0215, 0.0214, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:22:40,846 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=92337.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 03:22:48,258 INFO [train.py:873] (2/4) Epoch 13, batch 1600, loss[loss=0.1036, simple_loss=0.1216, pruned_loss=0.0428, over 3865.00 frames. ], tot_loss[loss=0.1224, simple_loss=0.1535, pruned_loss=0.04563, over 2052512.08 frames. ], batch size: 100, lr: 6.15e-03, grad_scale: 8.0 +2022-12-08 03:22:53,315 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9615, 1.8565, 1.6582, 1.9743, 1.8618, 1.8220, 1.8246, 1.7920], + device='cuda:2'), covar=tensor([0.0859, 0.0884, 0.1966, 0.0686, 0.1008, 0.0715, 0.1626, 0.1028], + device='cuda:2'), in_proj_covar=tensor([0.0273, 0.0298, 0.0269, 0.0264, 0.0314, 0.0299, 0.0262, 0.0251], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:22:59,226 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8285, 1.1604, 1.2717, 1.2205, 1.0205, 1.3077, 1.0744, 0.9402], + device='cuda:2'), covar=tensor([0.2094, 0.0872, 0.0454, 0.0432, 0.1687, 0.0718, 0.1461, 0.1362], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0082, 0.0066, 0.0068, 0.0096, 0.0081, 0.0096, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 03:23:08,502 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.316e+02 2.214e+02 2.735e+02 3.327e+02 6.728e+02, threshold=5.470e+02, percent-clipped=2.0 +2022-12-08 03:23:11,247 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4047, 2.3767, 1.9769, 2.4692, 2.2808, 2.3524, 2.1474, 2.0356], + device='cuda:2'), covar=tensor([0.0745, 0.0897, 0.2042, 0.0636, 0.0870, 0.0641, 0.1472, 0.1311], + device='cuda:2'), in_proj_covar=tensor([0.0274, 0.0298, 0.0270, 0.0264, 0.0315, 0.0299, 0.0262, 0.0251], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:24:14,891 INFO [train.py:873] (2/4) Epoch 13, batch 1700, loss[loss=0.1443, simple_loss=0.1427, pruned_loss=0.07292, over 2576.00 frames. ], tot_loss[loss=0.123, simple_loss=0.1537, pruned_loss=0.04614, over 1936013.46 frames. ], batch size: 100, lr: 6.15e-03, grad_scale: 8.0 +2022-12-08 03:24:36,007 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.170e+02 2.177e+02 2.526e+02 3.126e+02 5.467e+02, threshold=5.052e+02, percent-clipped=0.0 +2022-12-08 03:25:19,516 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4097, 2.2168, 3.3497, 3.5227, 3.3267, 2.2237, 3.4125, 2.6330], + device='cuda:2'), covar=tensor([0.0437, 0.1019, 0.0911, 0.0509, 0.0512, 0.1458, 0.0434, 0.1013], + device='cuda:2'), in_proj_covar=tensor([0.0289, 0.0256, 0.0372, 0.0326, 0.0266, 0.0302, 0.0304, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 03:25:39,963 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8793, 2.0309, 1.8457, 2.0752, 1.6927, 1.9792, 1.9800, 1.9843], + device='cuda:2'), covar=tensor([0.1185, 0.1169, 0.1320, 0.0952, 0.1739, 0.0922, 0.1366, 0.1117], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0132, 0.0140, 0.0151, 0.0140, 0.0118, 0.0159, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 03:25:42,443 INFO [train.py:873] (2/4) Epoch 13, batch 1800, loss[loss=0.1218, simple_loss=0.1557, pruned_loss=0.04397, over 14242.00 frames. ], tot_loss[loss=0.1217, simple_loss=0.1531, pruned_loss=0.04519, over 1949897.76 frames. ], batch size: 80, lr: 6.15e-03, grad_scale: 8.0 +2022-12-08 03:25:44,264 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=92547.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:26:02,904 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.175e+02 2.080e+02 2.746e+02 3.507e+02 5.512e+02, threshold=5.493e+02, percent-clipped=3.0 +2022-12-08 03:26:11,969 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9799, 3.8035, 3.4549, 2.8223, 3.3708, 3.6292, 3.9677, 3.2152], + device='cuda:2'), covar=tensor([0.0506, 0.1013, 0.0872, 0.1303, 0.0764, 0.0608, 0.0641, 0.1012], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0178, 0.0138, 0.0125, 0.0137, 0.0147, 0.0123, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 03:26:25,749 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=92595.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:26:34,675 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-12-08 03:26:39,633 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=92611.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:26:52,231 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92625.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:27:02,613 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=92637.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 03:27:09,285 INFO [train.py:873] (2/4) Epoch 13, batch 1900, loss[loss=0.1756, simple_loss=0.1683, pruned_loss=0.09146, over 1271.00 frames. ], tot_loss[loss=0.123, simple_loss=0.1536, pruned_loss=0.04617, over 1905177.47 frames. ], batch size: 100, lr: 6.14e-03, grad_scale: 4.0 +2022-12-08 03:27:21,753 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=92659.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:27:31,531 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.247e+02 2.202e+02 2.634e+02 3.218e+02 6.502e+02, threshold=5.268e+02, percent-clipped=3.0 +2022-12-08 03:27:44,398 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.68 vs. limit=2.0 +2022-12-08 03:27:44,649 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=92685.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 03:27:45,536 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92686.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:28:15,773 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92720.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:28:37,931 INFO [train.py:873] (2/4) Epoch 13, batch 2000, loss[loss=0.1296, simple_loss=0.1611, pruned_loss=0.04902, over 11174.00 frames. ], tot_loss[loss=0.1223, simple_loss=0.1532, pruned_loss=0.04572, over 1873148.37 frames. ], batch size: 100, lr: 6.14e-03, grad_scale: 8.0 +2022-12-08 03:28:40,249 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-12-08 03:28:59,689 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.406e+02 2.366e+02 2.802e+02 3.411e+02 7.982e+02, threshold=5.603e+02, percent-clipped=8.0 +2022-12-08 03:29:10,418 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92781.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:29:54,408 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92831.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:30:06,187 INFO [train.py:873] (2/4) Epoch 13, batch 2100, loss[loss=0.1435, simple_loss=0.1662, pruned_loss=0.06042, over 7764.00 frames. ], tot_loss[loss=0.1219, simple_loss=0.1527, pruned_loss=0.04549, over 1857405.98 frames. ], batch size: 100, lr: 6.14e-03, grad_scale: 8.0 +2022-12-08 03:30:11,747 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3427, 1.4749, 2.5216, 1.5486, 2.4516, 2.4798, 1.9312, 2.6276], + device='cuda:2'), covar=tensor([0.0319, 0.2313, 0.0369, 0.1707, 0.0533, 0.0549, 0.1084, 0.0269], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0157, 0.0160, 0.0166, 0.0169, 0.0177, 0.0133, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:30:20,713 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.12 vs. limit=2.0 +2022-12-08 03:30:27,468 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.90 vs. limit=5.0 +2022-12-08 03:30:28,666 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.062e+02 2.147e+02 2.777e+02 3.486e+02 6.981e+02, threshold=5.554e+02, percent-clipped=1.0 +2022-12-08 03:30:48,217 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92892.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:30:51,325 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1318, 3.2414, 3.4062, 3.2597, 3.2983, 2.9340, 1.5040, 3.1247], + device='cuda:2'), covar=tensor([0.0380, 0.0363, 0.0382, 0.0353, 0.0336, 0.0682, 0.2920, 0.0291], + device='cuda:2'), in_proj_covar=tensor([0.0161, 0.0168, 0.0142, 0.0139, 0.0200, 0.0134, 0.0156, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 03:30:54,502 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92899.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:31:11,398 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8562, 2.3985, 3.3908, 2.2688, 2.0593, 3.0333, 1.4313, 2.8378], + device='cuda:2'), covar=tensor([0.1090, 0.1198, 0.0635, 0.2656, 0.2517, 0.0901, 0.4588, 0.1210], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0098, 0.0091, 0.0100, 0.0116, 0.0087, 0.0124, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 03:31:22,050 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92931.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:31:34,897 INFO [train.py:873] (2/4) Epoch 13, batch 2200, loss[loss=0.1275, simple_loss=0.1538, pruned_loss=0.05056, over 11133.00 frames. ], tot_loss[loss=0.1224, simple_loss=0.1538, pruned_loss=0.04554, over 1966130.65 frames. ], batch size: 100, lr: 6.13e-03, grad_scale: 8.0 +2022-12-08 03:31:47,575 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92960.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:31:56,269 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.342e+02 2.331e+02 2.968e+02 3.670e+02 7.983e+02, threshold=5.936e+02, percent-clipped=6.0 +2022-12-08 03:32:01,007 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=92975.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:32:06,287 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=92981.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:32:16,163 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=92992.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:32:31,707 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93010.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:32:55,221 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93036.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:32:55,242 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93036.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:33:02,673 INFO [train.py:873] (2/4) Epoch 13, batch 2300, loss[loss=0.1041, simple_loss=0.1406, pruned_loss=0.0338, over 14217.00 frames. ], tot_loss[loss=0.1207, simple_loss=0.1528, pruned_loss=0.04432, over 2027452.74 frames. ], batch size: 35, lr: 6.13e-03, grad_scale: 8.0 +2022-12-08 03:33:25,584 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.411e+02 2.107e+02 2.508e+02 3.191e+02 7.606e+02, threshold=5.016e+02, percent-clipped=1.0 +2022-12-08 03:33:26,608 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93071.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:33:30,962 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93076.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:33:33,354 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1030, 2.5689, 3.4295, 2.1481, 2.1326, 2.9324, 1.6255, 2.7011], + device='cuda:2'), covar=tensor([0.0942, 0.1517, 0.0555, 0.2141, 0.2286, 0.0956, 0.3812, 0.1186], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0097, 0.0090, 0.0098, 0.0115, 0.0086, 0.0123, 0.0089], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 03:33:49,134 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93097.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 03:33:57,491 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4216, 2.8360, 4.1618, 3.1489, 4.1552, 4.0693, 3.9331, 3.5248], + device='cuda:2'), covar=tensor([0.0737, 0.3041, 0.1293, 0.1925, 0.0864, 0.0948, 0.2107, 0.1870], + device='cuda:2'), in_proj_covar=tensor([0.0343, 0.0307, 0.0389, 0.0296, 0.0370, 0.0315, 0.0356, 0.0300], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:34:05,053 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93115.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:34:31,778 INFO [train.py:873] (2/4) Epoch 13, batch 2400, loss[loss=0.1128, simple_loss=0.1509, pruned_loss=0.03734, over 14041.00 frames. ], tot_loss[loss=0.1204, simple_loss=0.1527, pruned_loss=0.04407, over 2066323.61 frames. ], batch size: 26, lr: 6.13e-03, grad_scale: 8.0 +2022-12-08 03:34:50,163 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8858, 1.7054, 1.9180, 1.6751, 1.9445, 1.7970, 1.6264, 1.8911], + device='cuda:2'), covar=tensor([0.0455, 0.0933, 0.0264, 0.0480, 0.0475, 0.0801, 0.0322, 0.0380], + device='cuda:2'), in_proj_covar=tensor([0.0346, 0.0309, 0.0391, 0.0299, 0.0373, 0.0318, 0.0358, 0.0304], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:34:53,122 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.501e+02 2.312e+02 2.703e+02 3.635e+02 6.826e+02, threshold=5.406e+02, percent-clipped=5.0 +2022-12-08 03:34:58,902 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93176.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:35:09,392 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93187.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:35:59,852 INFO [train.py:873] (2/4) Epoch 13, batch 2500, loss[loss=0.1815, simple_loss=0.1744, pruned_loss=0.09431, over 1254.00 frames. ], tot_loss[loss=0.1205, simple_loss=0.1528, pruned_loss=0.0441, over 2018355.43 frames. ], batch size: 100, lr: 6.12e-03, grad_scale: 8.0 +2022-12-08 03:36:09,016 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93255.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:36:22,499 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.150e+02 2.321e+02 2.910e+02 3.520e+02 9.929e+02, threshold=5.819e+02, percent-clipped=5.0 +2022-12-08 03:36:32,050 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93281.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:36:37,123 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93287.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:37:15,079 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93329.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:37:16,865 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93331.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:37:28,230 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6667, 3.4061, 3.3389, 3.6248, 3.4249, 3.6914, 3.7177, 3.0998], + device='cuda:2'), covar=tensor([0.0464, 0.1013, 0.0468, 0.0588, 0.0893, 0.0388, 0.0551, 0.0601], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0273, 0.0194, 0.0192, 0.0186, 0.0155, 0.0279, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 03:37:28,916 INFO [train.py:873] (2/4) Epoch 13, batch 2600, loss[loss=0.117, simple_loss=0.1476, pruned_loss=0.04316, over 6030.00 frames. ], tot_loss[loss=0.1209, simple_loss=0.1528, pruned_loss=0.04449, over 2023628.23 frames. ], batch size: 100, lr: 6.12e-03, grad_scale: 8.0 +2022-12-08 03:37:47,444 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93366.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:37:50,652 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.166e+02 2.180e+02 2.788e+02 3.347e+02 5.990e+02, threshold=5.575e+02, percent-clipped=1.0 +2022-12-08 03:37:55,998 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93376.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:38:10,531 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93392.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 03:38:22,970 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4608, 3.9230, 3.1311, 4.8657, 4.2592, 4.6054, 4.0592, 3.4610], + device='cuda:2'), covar=tensor([0.0823, 0.1188, 0.3491, 0.0394, 0.0872, 0.1447, 0.1070, 0.2654], + device='cuda:2'), in_proj_covar=tensor([0.0270, 0.0295, 0.0267, 0.0263, 0.0314, 0.0298, 0.0257, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:38:39,204 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93424.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:38:56,284 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.72 vs. limit=2.0 +2022-12-08 03:38:57,480 INFO [train.py:873] (2/4) Epoch 13, batch 2700, loss[loss=0.1096, simple_loss=0.149, pruned_loss=0.0351, over 14653.00 frames. ], tot_loss[loss=0.1206, simple_loss=0.1523, pruned_loss=0.04444, over 1942936.94 frames. ], batch size: 33, lr: 6.12e-03, grad_scale: 8.0 +2022-12-08 03:39:04,081 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7418, 2.3565, 3.1689, 2.1080, 1.9999, 2.8851, 1.3815, 2.6671], + device='cuda:2'), covar=tensor([0.1128, 0.1668, 0.0757, 0.2087, 0.2608, 0.0796, 0.4057, 0.1088], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0098, 0.0090, 0.0099, 0.0116, 0.0086, 0.0123, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 03:39:19,854 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.738e+01 2.118e+02 2.554e+02 3.202e+02 5.751e+02, threshold=5.109e+02, percent-clipped=2.0 +2022-12-08 03:39:20,919 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=93471.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:39:22,762 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3505, 4.0899, 3.7572, 3.9583, 4.1469, 4.2476, 4.3262, 4.2983], + device='cuda:2'), covar=tensor([0.0764, 0.0477, 0.2023, 0.2686, 0.0713, 0.0797, 0.0766, 0.0764], + device='cuda:2'), in_proj_covar=tensor([0.0377, 0.0266, 0.0447, 0.0568, 0.0333, 0.0435, 0.0390, 0.0375], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:39:34,820 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93487.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:40:16,923 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93535.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:40:25,982 INFO [train.py:873] (2/4) Epoch 13, batch 2800, loss[loss=0.1342, simple_loss=0.1637, pruned_loss=0.05239, over 12739.00 frames. ], tot_loss[loss=0.1214, simple_loss=0.1531, pruned_loss=0.04482, over 1969465.50 frames. ], batch size: 100, lr: 6.11e-03, grad_scale: 8.0 +2022-12-08 03:40:31,007 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1309, 2.0638, 2.0250, 2.3094, 2.3019, 1.1597, 1.9786, 2.3394], + device='cuda:2'), covar=tensor([0.0711, 0.0942, 0.1778, 0.0652, 0.1239, 0.0905, 0.0688, 0.0528], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0029, 0.0031, 0.0027, 0.0029, 0.0041, 0.0029, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 03:40:35,203 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93555.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:40:46,086 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.27 vs. limit=2.0 +2022-12-08 03:40:47,337 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.714e+01 2.417e+02 3.010e+02 3.786e+02 1.160e+03, threshold=6.020e+02, percent-clipped=14.0 +2022-12-08 03:41:02,898 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93587.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:41:16,928 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93603.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:41:27,891 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9609, 1.6490, 1.9978, 1.3939, 1.6481, 2.0312, 1.7943, 1.7268], + device='cuda:2'), covar=tensor([0.0950, 0.0814, 0.0779, 0.1224, 0.1371, 0.0921, 0.0703, 0.1482], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0174, 0.0137, 0.0125, 0.0137, 0.0146, 0.0123, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 03:41:41,550 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93631.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:41:44,878 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93635.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:41:53,319 INFO [train.py:873] (2/4) Epoch 13, batch 2900, loss[loss=0.1141, simple_loss=0.1475, pruned_loss=0.04035, over 14420.00 frames. ], tot_loss[loss=0.1207, simple_loss=0.1525, pruned_loss=0.04446, over 1988405.71 frames. ], batch size: 24, lr: 6.11e-03, grad_scale: 8.0 +2022-12-08 03:42:11,973 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93666.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:42:15,458 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.161e+02 2.101e+02 2.613e+02 3.235e+02 8.582e+02, threshold=5.226e+02, percent-clipped=1.0 +2022-12-08 03:42:23,190 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93679.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:42:34,309 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93692.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:42:35,490 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4366, 1.3418, 1.4952, 1.2558, 1.2173, 1.0450, 1.0998, 1.0860], + device='cuda:2'), covar=tensor([0.0179, 0.0379, 0.0163, 0.0227, 0.0256, 0.0455, 0.0321, 0.0457], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0018, 0.0016, 0.0017, 0.0017, 0.0028, 0.0023, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 03:42:46,696 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9069, 1.3243, 2.0434, 1.3493, 1.9903, 2.0932, 1.6766, 2.1448], + device='cuda:2'), covar=tensor([0.0268, 0.1834, 0.0415, 0.1715, 0.0502, 0.0479, 0.0983, 0.0286], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0158, 0.0160, 0.0170, 0.0171, 0.0179, 0.0136, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:42:53,844 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93714.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:43:05,858 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6358, 2.7150, 2.7801, 2.7767, 2.6969, 2.4176, 1.4911, 2.4470], + device='cuda:2'), covar=tensor([0.0479, 0.0426, 0.0463, 0.0377, 0.0363, 0.0993, 0.2569, 0.0382], + device='cuda:2'), in_proj_covar=tensor([0.0162, 0.0167, 0.0140, 0.0139, 0.0199, 0.0134, 0.0155, 0.0186], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 03:43:16,480 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93740.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:43:21,028 INFO [train.py:873] (2/4) Epoch 13, batch 3000, loss[loss=0.1673, simple_loss=0.1863, pruned_loss=0.07417, over 13853.00 frames. ], tot_loss[loss=0.1219, simple_loss=0.1534, pruned_loss=0.04523, over 1979290.78 frames. ], batch size: 23, lr: 6.11e-03, grad_scale: 8.0 +2022-12-08 03:43:21,029 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 03:43:29,437 INFO [train.py:905] (2/4) Epoch 13, validation: loss=0.1304, simple_loss=0.1697, pruned_loss=0.04555, over 857387.00 frames. +2022-12-08 03:43:29,438 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 03:43:52,064 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.563e+02 2.240e+02 2.864e+02 3.755e+02 7.950e+02, threshold=5.728e+02, percent-clipped=3.0 +2022-12-08 03:43:52,306 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93770.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:43:53,107 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=93771.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:44:05,640 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3945, 1.9412, 2.2918, 2.3132, 2.3496, 1.0807, 2.0621, 2.0506], + device='cuda:2'), covar=tensor([0.0707, 0.0943, 0.0596, 0.0856, 0.1038, 0.0882, 0.0908, 0.0926], + device='cuda:2'), in_proj_covar=tensor([0.0029, 0.0029, 0.0031, 0.0027, 0.0029, 0.0041, 0.0029, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 03:44:20,893 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4553, 4.0204, 3.2160, 4.7844, 4.2468, 4.4848, 4.0652, 3.4354], + device='cuda:2'), covar=tensor([0.0916, 0.1233, 0.3807, 0.0540, 0.1217, 0.1706, 0.1211, 0.3107], + device='cuda:2'), in_proj_covar=tensor([0.0274, 0.0297, 0.0270, 0.0268, 0.0317, 0.0300, 0.0258, 0.0248], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:44:35,612 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=93819.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:44:43,399 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8723, 1.4514, 2.9879, 2.6397, 2.8705, 2.9772, 2.2223, 2.9520], + device='cuda:2'), covar=tensor([0.1147, 0.1367, 0.0149, 0.0418, 0.0316, 0.0171, 0.0560, 0.0206], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0128, 0.0166, 0.0145, 0.0141, 0.0123, 0.0120], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 03:44:46,127 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=93831.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:44:58,705 INFO [train.py:873] (2/4) Epoch 13, batch 3100, loss[loss=0.08323, simple_loss=0.1274, pruned_loss=0.01952, over 13997.00 frames. ], tot_loss[loss=0.1216, simple_loss=0.1529, pruned_loss=0.04519, over 1968146.22 frames. ], batch size: 19, lr: 6.10e-03, grad_scale: 8.0 +2022-12-08 03:44:58,790 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0977, 2.8998, 2.6241, 2.8000, 2.9780, 3.0114, 3.0432, 3.0332], + device='cuda:2'), covar=tensor([0.0841, 0.0768, 0.2658, 0.2794, 0.0958, 0.1086, 0.1390, 0.0914], + device='cuda:2'), in_proj_covar=tensor([0.0378, 0.0266, 0.0446, 0.0566, 0.0333, 0.0438, 0.0388, 0.0372], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:45:16,999 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5064, 4.3041, 4.1822, 4.5034, 4.1636, 3.7565, 4.5934, 4.4139], + device='cuda:2'), covar=tensor([0.0577, 0.0781, 0.0822, 0.0600, 0.0762, 0.0660, 0.0543, 0.0602], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0131, 0.0140, 0.0152, 0.0140, 0.0117, 0.0159, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 03:45:19,448 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.360e+02 2.395e+02 2.887e+02 3.517e+02 6.635e+02, threshold=5.774e+02, percent-clipped=1.0 +2022-12-08 03:46:25,613 INFO [train.py:873] (2/4) Epoch 13, batch 3200, loss[loss=0.1437, simple_loss=0.1437, pruned_loss=0.07187, over 1229.00 frames. ], tot_loss[loss=0.1199, simple_loss=0.1519, pruned_loss=0.04392, over 1990361.80 frames. ], batch size: 100, lr: 6.10e-03, grad_scale: 8.0 +2022-12-08 03:46:32,452 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-12-08 03:46:37,229 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2934, 4.0290, 3.7132, 3.8836, 4.1074, 4.1900, 4.2445, 4.2173], + device='cuda:2'), covar=tensor([0.0788, 0.0533, 0.2014, 0.2694, 0.0796, 0.0860, 0.0927, 0.0888], + device='cuda:2'), in_proj_covar=tensor([0.0376, 0.0265, 0.0443, 0.0563, 0.0332, 0.0437, 0.0387, 0.0371], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:46:48,615 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.501e+02 2.002e+02 2.396e+02 3.046e+02 4.955e+02, threshold=4.792e+02, percent-clipped=0.0 +2022-12-08 03:46:57,605 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=93981.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:47:50,707 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=94042.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:47:53,370 INFO [train.py:873] (2/4) Epoch 13, batch 3300, loss[loss=0.1309, simple_loss=0.1647, pruned_loss=0.04853, over 14363.00 frames. ], tot_loss[loss=0.1203, simple_loss=0.1523, pruned_loss=0.04409, over 2047963.20 frames. ], batch size: 73, lr: 6.10e-03, grad_scale: 4.0 +2022-12-08 03:48:12,360 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1153, 1.0228, 1.0938, 0.8605, 0.8681, 0.6910, 0.8129, 0.7958], + device='cuda:2'), covar=tensor([0.0189, 0.0202, 0.0200, 0.0224, 0.0213, 0.0470, 0.0293, 0.0453], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0018, 0.0016, 0.0017, 0.0017, 0.0028, 0.0023, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 03:48:15,567 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.372e+02 2.207e+02 2.819e+02 3.660e+02 9.376e+02, threshold=5.637e+02, percent-clipped=6.0 +2022-12-08 03:49:03,565 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=94126.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:49:19,899 INFO [train.py:873] (2/4) Epoch 13, batch 3400, loss[loss=0.1359, simple_loss=0.1552, pruned_loss=0.05831, over 5994.00 frames. ], tot_loss[loss=0.1218, simple_loss=0.153, pruned_loss=0.04531, over 1996465.90 frames. ], batch size: 100, lr: 6.09e-03, grad_scale: 4.0 +2022-12-08 03:49:42,650 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.317e+02 2.362e+02 2.991e+02 3.918e+02 1.858e+03, threshold=5.982e+02, percent-clipped=6.0 +2022-12-08 03:50:28,478 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=94223.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 03:50:36,977 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5698, 2.6275, 2.7595, 2.7109, 2.7287, 2.4382, 1.4947, 2.4874], + device='cuda:2'), covar=tensor([0.0502, 0.0498, 0.0479, 0.0390, 0.0435, 0.0967, 0.2653, 0.0387], + device='cuda:2'), in_proj_covar=tensor([0.0160, 0.0167, 0.0139, 0.0137, 0.0197, 0.0134, 0.0154, 0.0185], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 03:50:47,830 INFO [train.py:873] (2/4) Epoch 13, batch 3500, loss[loss=0.1106, simple_loss=0.1234, pruned_loss=0.04886, over 2659.00 frames. ], tot_loss[loss=0.1208, simple_loss=0.1525, pruned_loss=0.04461, over 1981345.94 frames. ], batch size: 100, lr: 6.09e-03, grad_scale: 4.0 +2022-12-08 03:50:50,333 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7926, 1.7490, 4.7504, 2.0037, 4.2931, 4.7803, 4.2659, 5.1762], + device='cuda:2'), covar=tensor([0.0196, 0.3135, 0.0248, 0.2114, 0.0329, 0.0292, 0.0341, 0.0140], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0156, 0.0157, 0.0166, 0.0168, 0.0175, 0.0135, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 03:51:10,109 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.171e+02 2.271e+02 2.788e+02 3.634e+02 1.227e+03, threshold=5.575e+02, percent-clipped=1.0 +2022-12-08 03:51:22,039 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=94284.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 03:52:07,999 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=94337.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:52:14,720 INFO [train.py:873] (2/4) Epoch 13, batch 3600, loss[loss=0.1388, simple_loss=0.1646, pruned_loss=0.05655, over 14217.00 frames. ], tot_loss[loss=0.1207, simple_loss=0.1522, pruned_loss=0.04461, over 1912970.82 frames. ], batch size: 94, lr: 6.09e-03, grad_scale: 8.0 +2022-12-08 03:52:17,400 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=94348.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:52:37,865 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.263e+02 2.011e+02 2.604e+02 3.365e+02 5.868e+02, threshold=5.208e+02, percent-clipped=1.0 +2022-12-08 03:53:10,937 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=94409.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 03:53:25,261 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=94426.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:53:41,903 INFO [train.py:873] (2/4) Epoch 13, batch 3700, loss[loss=0.0993, simple_loss=0.1447, pruned_loss=0.02694, over 14250.00 frames. ], tot_loss[loss=0.1221, simple_loss=0.1533, pruned_loss=0.04545, over 1908155.88 frames. ], batch size: 44, lr: 6.08e-03, grad_scale: 4.0 +2022-12-08 03:54:04,972 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.075e+02 2.353e+02 2.765e+02 3.627e+02 6.924e+02, threshold=5.531e+02, percent-clipped=2.0 +2022-12-08 03:54:06,735 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=94474.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:54:14,116 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3843, 2.1980, 2.3309, 1.5438, 2.0538, 2.3487, 2.4114, 2.0362], + device='cuda:2'), covar=tensor([0.0742, 0.0542, 0.0761, 0.1345, 0.1047, 0.0640, 0.0518, 0.1217], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0177, 0.0139, 0.0126, 0.0138, 0.0148, 0.0124, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 03:55:09,009 INFO [train.py:873] (2/4) Epoch 13, batch 3800, loss[loss=0.1207, simple_loss=0.1582, pruned_loss=0.0416, over 14263.00 frames. ], tot_loss[loss=0.1214, simple_loss=0.1531, pruned_loss=0.04489, over 1960336.29 frames. ], batch size: 60, lr: 6.08e-03, grad_scale: 4.0 +2022-12-08 03:55:18,547 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3987, 4.9533, 4.9276, 5.3644, 5.0258, 4.5715, 5.3681, 4.4458], + device='cuda:2'), covar=tensor([0.0357, 0.1127, 0.0319, 0.0420, 0.0756, 0.0486, 0.0477, 0.0500], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0271, 0.0192, 0.0187, 0.0183, 0.0151, 0.0273, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 03:55:32,196 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.222e+02 2.117e+02 2.466e+02 3.115e+02 4.944e+02, threshold=4.932e+02, percent-clipped=0.0 +2022-12-08 03:55:38,496 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=94579.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 03:55:46,496 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5396, 1.8128, 1.9615, 1.9194, 1.8435, 1.9147, 1.6189, 1.3286], + device='cuda:2'), covar=tensor([0.1314, 0.1966, 0.0672, 0.0714, 0.1262, 0.0764, 0.1907, 0.2508], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0083, 0.0066, 0.0069, 0.0097, 0.0082, 0.0097, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 03:56:29,432 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=94637.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:56:36,276 INFO [train.py:873] (2/4) Epoch 13, batch 3900, loss[loss=0.1009, simple_loss=0.1437, pruned_loss=0.02903, over 13954.00 frames. ], tot_loss[loss=0.1204, simple_loss=0.1521, pruned_loss=0.04429, over 2002914.47 frames. ], batch size: 26, lr: 6.08e-03, grad_scale: 4.0 +2022-12-08 03:56:41,308 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.16 vs. limit=5.0 +2022-12-08 03:56:59,462 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.067e+02 2.050e+02 2.441e+02 3.382e+02 7.650e+02, threshold=4.881e+02, percent-clipped=6.0 +2022-12-08 03:57:11,472 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=94685.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 03:57:27,713 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=94704.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 03:57:29,007 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.52 vs. limit=5.0 +2022-12-08 03:57:41,095 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3011, 2.3184, 4.3275, 3.0576, 4.1323, 2.1562, 3.2568, 4.0822], + device='cuda:2'), covar=tensor([0.0535, 0.3776, 0.0355, 0.5658, 0.0523, 0.3054, 0.1217, 0.0485], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0207, 0.0206, 0.0279, 0.0224, 0.0208, 0.0210, 0.0208], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 03:57:43,834 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.10 vs. limit=2.0 +2022-12-08 03:58:02,598 INFO [train.py:873] (2/4) Epoch 13, batch 4000, loss[loss=0.1339, simple_loss=0.1611, pruned_loss=0.05334, over 11989.00 frames. ], tot_loss[loss=0.1206, simple_loss=0.1523, pruned_loss=0.04446, over 2010973.25 frames. ], batch size: 100, lr: 6.07e-03, grad_scale: 8.0 +2022-12-08 03:58:26,194 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.300e+02 2.218e+02 2.831e+02 3.684e+02 6.338e+02, threshold=5.663e+02, percent-clipped=4.0 +2022-12-08 03:58:47,415 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8128, 1.3768, 3.7686, 1.7631, 3.7950, 3.8570, 2.7045, 4.1983], + device='cuda:2'), covar=tensor([0.0230, 0.3206, 0.0360, 0.2152, 0.0438, 0.0390, 0.0791, 0.0165], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0155, 0.0156, 0.0164, 0.0165, 0.0173, 0.0133, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003], + device='cuda:2') +2022-12-08 03:59:11,754 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1026, 2.0938, 2.3797, 1.4638, 1.6334, 2.1720, 1.2879, 2.0675], + device='cuda:2'), covar=tensor([0.1117, 0.1577, 0.0906, 0.2481, 0.3099, 0.1180, 0.3913, 0.1213], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0098, 0.0090, 0.0097, 0.0115, 0.0085, 0.0122, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 03:59:29,380 INFO [train.py:873] (2/4) Epoch 13, batch 4100, loss[loss=0.2093, simple_loss=0.1835, pruned_loss=0.1176, over 1258.00 frames. ], tot_loss[loss=0.1207, simple_loss=0.1522, pruned_loss=0.04462, over 1978905.46 frames. ], batch size: 100, lr: 6.07e-03, grad_scale: 8.0 +2022-12-08 03:59:52,465 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.135e+02 2.091e+02 2.510e+02 3.105e+02 7.353e+02, threshold=5.020e+02, percent-clipped=2.0 +2022-12-08 03:59:58,836 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=94879.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:00:36,248 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9539, 2.6354, 4.9706, 3.3410, 4.7612, 2.2900, 3.6039, 4.6733], + device='cuda:2'), covar=tensor([0.0521, 0.4034, 0.0395, 0.6893, 0.0418, 0.3225, 0.1301, 0.0328], + device='cuda:2'), in_proj_covar=tensor([0.0248, 0.0208, 0.0205, 0.0281, 0.0225, 0.0208, 0.0209, 0.0208], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:00:40,994 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=94927.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 04:00:56,136 INFO [train.py:873] (2/4) Epoch 13, batch 4200, loss[loss=0.109, simple_loss=0.1453, pruned_loss=0.03638, over 14298.00 frames. ], tot_loss[loss=0.1211, simple_loss=0.1526, pruned_loss=0.0448, over 2002166.13 frames. ], batch size: 63, lr: 6.07e-03, grad_scale: 8.0 +2022-12-08 04:01:15,429 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.43 vs. limit=2.0 +2022-12-08 04:01:19,979 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.421e+02 2.402e+02 2.842e+02 3.518e+02 1.114e+03, threshold=5.683e+02, percent-clipped=6.0 +2022-12-08 04:01:51,671 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95004.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:02:09,773 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1404, 1.3281, 3.2975, 1.4367, 3.0008, 3.2674, 2.3322, 3.4657], + device='cuda:2'), covar=tensor([0.0330, 0.3409, 0.0400, 0.2566, 0.1179, 0.0476, 0.1019, 0.0284], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0157, 0.0158, 0.0167, 0.0167, 0.0176, 0.0134, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:02:17,104 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2885, 2.4296, 2.1652, 2.1631, 2.1424, 1.9894, 1.7325, 2.1576], + device='cuda:2'), covar=tensor([0.0331, 0.0461, 0.0483, 0.0288, 0.0629, 0.0528, 0.0528, 0.0572], + device='cuda:2'), in_proj_covar=tensor([0.0017, 0.0018, 0.0016, 0.0017, 0.0017, 0.0028, 0.0022, 0.0027], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 04:02:26,456 INFO [train.py:873] (2/4) Epoch 13, batch 4300, loss[loss=0.1231, simple_loss=0.1574, pruned_loss=0.04445, over 14395.00 frames. ], tot_loss[loss=0.122, simple_loss=0.1532, pruned_loss=0.04539, over 1999965.81 frames. ], batch size: 41, lr: 6.06e-03, grad_scale: 8.0 +2022-12-08 04:02:32,647 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=95052.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:02:42,210 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4978, 2.2264, 4.4772, 2.9923, 4.2712, 2.0785, 3.4002, 4.3149], + device='cuda:2'), covar=tensor([0.0491, 0.4169, 0.0414, 0.6749, 0.0542, 0.3482, 0.1291, 0.0357], + device='cuda:2'), in_proj_covar=tensor([0.0250, 0.0210, 0.0207, 0.0284, 0.0227, 0.0209, 0.0211, 0.0210], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:02:49,664 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.049e+02 2.351e+02 2.797e+02 3.272e+02 6.145e+02, threshold=5.594e+02, percent-clipped=1.0 +2022-12-08 04:02:57,700 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0249, 1.9938, 2.1975, 1.4602, 1.5235, 2.0119, 1.2410, 2.0096], + device='cuda:2'), covar=tensor([0.1019, 0.1559, 0.0821, 0.1923, 0.2683, 0.0954, 0.3771, 0.1057], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0097, 0.0090, 0.0097, 0.0114, 0.0085, 0.0121, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 04:03:18,412 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95105.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 04:03:53,023 INFO [train.py:873] (2/4) Epoch 13, batch 4400, loss[loss=0.1289, simple_loss=0.1311, pruned_loss=0.06331, over 2688.00 frames. ], tot_loss[loss=0.1233, simple_loss=0.1541, pruned_loss=0.04629, over 2003619.04 frames. ], batch size: 100, lr: 6.06e-03, grad_scale: 8.0 +2022-12-08 04:04:11,057 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95166.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:04:16,204 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.201e+02 2.204e+02 2.628e+02 3.192e+02 7.807e+02, threshold=5.256e+02, percent-clipped=2.0 +2022-12-08 04:05:20,306 INFO [train.py:873] (2/4) Epoch 13, batch 4500, loss[loss=0.1119, simple_loss=0.1483, pruned_loss=0.0378, over 14325.00 frames. ], tot_loss[loss=0.1217, simple_loss=0.1531, pruned_loss=0.04513, over 1998590.43 frames. ], batch size: 60, lr: 6.06e-03, grad_scale: 8.0 +2022-12-08 04:05:33,385 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95260.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:05:40,417 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95268.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:05:43,076 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4275, 2.1059, 3.3552, 3.4945, 3.3678, 2.2588, 3.3224, 2.5137], + device='cuda:2'), covar=tensor([0.0436, 0.1060, 0.0705, 0.0481, 0.0495, 0.1450, 0.0392, 0.1026], + device='cuda:2'), in_proj_covar=tensor([0.0291, 0.0254, 0.0368, 0.0324, 0.0265, 0.0301, 0.0303, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:05:43,636 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.015e+02 2.327e+02 3.103e+02 3.682e+02 7.295e+02, threshold=6.206e+02, percent-clipped=3.0 +2022-12-08 04:06:05,651 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7604, 1.0615, 1.2673, 1.1848, 1.0815, 1.3012, 1.0390, 0.8586], + device='cuda:2'), covar=tensor([0.1992, 0.1010, 0.0398, 0.0554, 0.1726, 0.1025, 0.1698, 0.1558], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0083, 0.0065, 0.0069, 0.0096, 0.0083, 0.0096, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0005, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 04:06:26,023 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95321.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:06:33,443 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95329.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:06:41,729 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95339.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:06:46,983 INFO [train.py:873] (2/4) Epoch 13, batch 4600, loss[loss=0.1286, simple_loss=0.1665, pruned_loss=0.04539, over 14636.00 frames. ], tot_loss[loss=0.1216, simple_loss=0.1534, pruned_loss=0.04493, over 2028491.56 frames. ], batch size: 22, lr: 6.05e-03, grad_scale: 8.0 +2022-12-08 04:06:59,376 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.50 vs. limit=2.0 +2022-12-08 04:07:10,470 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.352e+02 2.077e+02 2.550e+02 3.185e+02 6.474e+02, threshold=5.099e+02, percent-clipped=1.0 +2022-12-08 04:07:35,145 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95400.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:07:54,350 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0136, 4.6225, 4.4840, 5.0463, 4.6501, 4.3313, 5.0195, 4.1492], + device='cuda:2'), covar=tensor([0.0332, 0.0872, 0.0362, 0.0354, 0.0823, 0.0598, 0.0450, 0.0466], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0270, 0.0190, 0.0186, 0.0183, 0.0150, 0.0275, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 04:08:14,532 INFO [train.py:873] (2/4) Epoch 13, batch 4700, loss[loss=0.1658, simple_loss=0.1486, pruned_loss=0.0915, over 1246.00 frames. ], tot_loss[loss=0.122, simple_loss=0.1535, pruned_loss=0.0452, over 1989887.13 frames. ], batch size: 100, lr: 6.05e-03, grad_scale: 4.0 +2022-12-08 04:08:28,729 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95461.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:08:39,145 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.429e+02 2.071e+02 2.759e+02 3.394e+02 8.943e+02, threshold=5.519e+02, percent-clipped=4.0 +2022-12-08 04:09:41,415 INFO [train.py:873] (2/4) Epoch 13, batch 4800, loss[loss=0.1224, simple_loss=0.1345, pruned_loss=0.0552, over 3788.00 frames. ], tot_loss[loss=0.1208, simple_loss=0.1529, pruned_loss=0.04439, over 2062287.21 frames. ], batch size: 100, lr: 6.05e-03, grad_scale: 8.0 +2022-12-08 04:09:51,399 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4311, 2.1493, 2.3157, 1.3940, 2.0615, 2.3795, 2.5151, 2.0546], + device='cuda:2'), covar=tensor([0.0790, 0.1044, 0.1016, 0.1927, 0.1128, 0.0880, 0.0588, 0.1474], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0175, 0.0138, 0.0125, 0.0137, 0.0148, 0.0123, 0.0137], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 04:10:05,764 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.497e+02 2.302e+02 2.972e+02 3.662e+02 7.055e+02, threshold=5.944e+02, percent-clipped=3.0 +2022-12-08 04:10:17,123 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0298, 2.0421, 4.0952, 2.8885, 3.9168, 2.0745, 3.1547, 3.8890], + device='cuda:2'), covar=tensor([0.0657, 0.4573, 0.0480, 0.5928, 0.0590, 0.3508, 0.1347, 0.0417], + device='cuda:2'), in_proj_covar=tensor([0.0250, 0.0210, 0.0208, 0.0284, 0.0226, 0.0210, 0.0211, 0.0210], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:10:43,506 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95616.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:10:44,470 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95617.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:10:50,232 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95624.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:11:00,217 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6914, 4.4430, 4.2009, 4.7358, 4.4400, 4.1734, 4.7704, 4.0461], + device='cuda:2'), covar=tensor([0.0419, 0.0896, 0.0387, 0.0442, 0.0742, 0.0657, 0.0463, 0.0439], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0270, 0.0189, 0.0187, 0.0184, 0.0151, 0.0275, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 04:11:08,088 INFO [train.py:873] (2/4) Epoch 13, batch 4900, loss[loss=0.1081, simple_loss=0.1514, pruned_loss=0.03237, over 14292.00 frames. ], tot_loss[loss=0.1208, simple_loss=0.1528, pruned_loss=0.04438, over 2020340.18 frames. ], batch size: 35, lr: 6.05e-03, grad_scale: 8.0 +2022-12-08 04:11:32,372 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.329e+02 2.158e+02 2.610e+02 3.322e+02 1.003e+03, threshold=5.220e+02, percent-clipped=4.0 +2022-12-08 04:11:36,519 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95678.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:11:51,539 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95695.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:12:06,296 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8192, 1.5047, 1.8202, 1.7678, 1.7927, 1.5100, 1.4284, 1.1681], + device='cuda:2'), covar=tensor([0.0247, 0.0560, 0.0367, 0.0218, 0.0213, 0.0326, 0.0278, 0.0513], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0016, 0.0017, 0.0017, 0.0029, 0.0023, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 04:12:35,017 INFO [train.py:873] (2/4) Epoch 13, batch 5000, loss[loss=0.1202, simple_loss=0.1473, pruned_loss=0.04661, over 14010.00 frames. ], tot_loss[loss=0.1205, simple_loss=0.1528, pruned_loss=0.04413, over 1995236.84 frames. ], batch size: 22, lr: 6.04e-03, grad_scale: 8.0 +2022-12-08 04:12:49,280 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95761.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 04:12:59,678 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.354e+02 2.284e+02 2.864e+02 3.553e+02 7.569e+02, threshold=5.729e+02, percent-clipped=2.0 +2022-12-08 04:13:14,429 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1262, 2.1023, 3.0324, 3.1618, 3.0559, 2.1730, 3.0608, 2.3515], + device='cuda:2'), covar=tensor([0.0432, 0.0979, 0.0734, 0.0516, 0.0488, 0.1396, 0.0420, 0.0979], + device='cuda:2'), in_proj_covar=tensor([0.0288, 0.0252, 0.0366, 0.0323, 0.0265, 0.0299, 0.0302, 0.0277], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:13:31,720 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=95809.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:13:34,698 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95812.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:13:49,499 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4776, 1.9039, 4.3692, 2.0086, 4.1900, 4.5708, 4.1220, 4.8897], + device='cuda:2'), covar=tensor([0.0211, 0.3082, 0.0353, 0.2325, 0.0382, 0.0339, 0.0336, 0.0170], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0157, 0.0159, 0.0166, 0.0167, 0.0175, 0.0134, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:14:01,044 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-12-08 04:14:03,055 INFO [train.py:873] (2/4) Epoch 13, batch 5100, loss[loss=0.1228, simple_loss=0.1553, pruned_loss=0.04512, over 14371.00 frames. ], tot_loss[loss=0.1209, simple_loss=0.1527, pruned_loss=0.04453, over 1983073.21 frames. ], batch size: 18, lr: 6.04e-03, grad_scale: 8.0 +2022-12-08 04:14:05,080 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95847.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:14:27,856 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.147e+02 2.050e+02 2.508e+02 3.173e+02 5.976e+02, threshold=5.017e+02, percent-clipped=1.0 +2022-12-08 04:14:28,051 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95873.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 04:14:59,251 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=95908.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 04:14:59,622 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.61 vs. limit=2.0 +2022-12-08 04:15:06,251 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95916.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:15:13,258 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95924.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:15:31,805 INFO [train.py:873] (2/4) Epoch 13, batch 5200, loss[loss=0.152, simple_loss=0.15, pruned_loss=0.07699, over 2614.00 frames. ], tot_loss[loss=0.1209, simple_loss=0.1527, pruned_loss=0.04453, over 1887482.40 frames. ], batch size: 100, lr: 6.04e-03, grad_scale: 8.0 +2022-12-08 04:15:48,364 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=95964.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:15:52,879 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8273, 3.6025, 3.5172, 3.8706, 3.6891, 3.6237, 3.9248, 3.2873], + device='cuda:2'), covar=tensor([0.0655, 0.1157, 0.0498, 0.0585, 0.0805, 0.1214, 0.0596, 0.0607], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0271, 0.0191, 0.0188, 0.0185, 0.0152, 0.0279, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 04:15:55,697 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=95972.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:15:56,531 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.143e+02 2.184e+02 2.660e+02 3.283e+02 4.676e+02, threshold=5.321e+02, percent-clipped=0.0 +2022-12-08 04:15:56,629 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=95973.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:16:03,041 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=95980.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:16:15,820 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=95995.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:16:56,441 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96041.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:16:58,082 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96043.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:16:59,770 INFO [train.py:873] (2/4) Epoch 13, batch 5300, loss[loss=0.1044, simple_loss=0.1421, pruned_loss=0.0334, over 14139.00 frames. ], tot_loss[loss=0.1215, simple_loss=0.153, pruned_loss=0.04496, over 1926885.08 frames. ], batch size: 84, lr: 6.03e-03, grad_scale: 8.0 +2022-12-08 04:17:12,948 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4965, 1.9397, 2.5297, 2.5923, 2.4674, 1.9653, 2.6077, 2.0568], + device='cuda:2'), covar=tensor([0.0401, 0.0862, 0.0527, 0.0461, 0.0541, 0.1188, 0.0400, 0.0730], + device='cuda:2'), in_proj_covar=tensor([0.0286, 0.0251, 0.0365, 0.0321, 0.0263, 0.0297, 0.0301, 0.0276], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:17:23,907 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.342e+02 2.223e+02 2.915e+02 3.617e+02 9.982e+02, threshold=5.829e+02, percent-clipped=6.0 +2022-12-08 04:17:29,934 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96080.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:17:38,225 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2918, 2.9023, 2.9382, 2.0806, 2.7221, 2.9203, 3.2784, 2.7370], + device='cuda:2'), covar=tensor([0.0760, 0.1288, 0.0877, 0.1495, 0.0967, 0.0819, 0.0744, 0.1154], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0179, 0.0140, 0.0127, 0.0138, 0.0151, 0.0125, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 04:18:05,072 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.26 vs. limit=5.0 +2022-12-08 04:18:23,880 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96141.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:18:27,060 INFO [train.py:873] (2/4) Epoch 13, batch 5400, loss[loss=0.1322, simple_loss=0.1343, pruned_loss=0.06504, over 2666.00 frames. ], tot_loss[loss=0.1213, simple_loss=0.1528, pruned_loss=0.04495, over 1868651.21 frames. ], batch size: 100, lr: 6.03e-03, grad_scale: 8.0 +2022-12-08 04:18:38,522 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4802, 2.3035, 3.3516, 3.5560, 3.3284, 2.2822, 3.4438, 2.6985], + device='cuda:2'), covar=tensor([0.0414, 0.0929, 0.0783, 0.0462, 0.0457, 0.1384, 0.0440, 0.0952], + device='cuda:2'), in_proj_covar=tensor([0.0289, 0.0253, 0.0368, 0.0323, 0.0265, 0.0299, 0.0302, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:18:44,253 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96165.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:18:46,927 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96168.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 04:18:51,338 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.196e+02 2.106e+02 2.759e+02 3.696e+02 7.492e+02, threshold=5.517e+02, percent-clipped=3.0 +2022-12-08 04:19:17,379 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96203.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 04:19:22,427 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7909, 2.0123, 2.7853, 2.1983, 2.7415, 2.6712, 2.5588, 2.4548], + device='cuda:2'), covar=tensor([0.0714, 0.3055, 0.0939, 0.1819, 0.0732, 0.1038, 0.1150, 0.1942], + device='cuda:2'), in_proj_covar=tensor([0.0348, 0.0312, 0.0395, 0.0303, 0.0374, 0.0322, 0.0362, 0.0305], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:19:37,235 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96226.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:19:53,397 INFO [train.py:873] (2/4) Epoch 13, batch 5500, loss[loss=0.1682, simple_loss=0.1821, pruned_loss=0.07714, over 10347.00 frames. ], tot_loss[loss=0.1203, simple_loss=0.1522, pruned_loss=0.04415, over 1989852.30 frames. ], batch size: 100, lr: 6.03e-03, grad_scale: 8.0 +2022-12-08 04:20:17,918 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.945e+01 2.434e+02 2.937e+02 3.811e+02 8.980e+02, threshold=5.874e+02, percent-clipped=5.0 +2022-12-08 04:20:18,398 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96273.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:21:00,282 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96321.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:21:02,941 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96324.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:21:13,614 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96336.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:21:21,090 INFO [train.py:873] (2/4) Epoch 13, batch 5600, loss[loss=0.1399, simple_loss=0.1629, pruned_loss=0.05846, over 14516.00 frames. ], tot_loss[loss=0.1226, simple_loss=0.1537, pruned_loss=0.0458, over 2007296.46 frames. ], batch size: 49, lr: 6.02e-03, grad_scale: 8.0 +2022-12-08 04:21:45,614 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.402e+02 2.301e+02 2.704e+02 3.458e+02 6.924e+02, threshold=5.409e+02, percent-clipped=1.0 +2022-12-08 04:21:56,141 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96385.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:22:00,535 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96390.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:22:08,403 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4644, 3.8068, 2.9238, 4.7048, 4.2001, 4.4897, 3.9578, 3.1207], + device='cuda:2'), covar=tensor([0.0763, 0.1374, 0.3590, 0.0488, 0.1005, 0.1386, 0.1186, 0.3646], + device='cuda:2'), in_proj_covar=tensor([0.0275, 0.0293, 0.0268, 0.0270, 0.0316, 0.0299, 0.0258, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:22:27,103 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-12-08 04:22:31,156 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2670, 2.1017, 3.1462, 3.2968, 3.1994, 2.2467, 3.2099, 2.4917], + device='cuda:2'), covar=tensor([0.0433, 0.1060, 0.0766, 0.0474, 0.0506, 0.1439, 0.0467, 0.0973], + device='cuda:2'), in_proj_covar=tensor([0.0289, 0.0253, 0.0370, 0.0322, 0.0266, 0.0300, 0.0303, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:22:41,419 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96436.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:22:42,382 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1235, 2.8059, 2.8624, 1.9802, 2.6582, 2.9814, 3.1659, 2.5472], + device='cuda:2'), covar=tensor([0.0735, 0.1061, 0.0890, 0.1400, 0.0832, 0.0661, 0.0588, 0.1187], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0178, 0.0139, 0.0126, 0.0137, 0.0150, 0.0125, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 04:22:48,807 INFO [train.py:873] (2/4) Epoch 13, batch 5700, loss[loss=0.1733, simple_loss=0.1586, pruned_loss=0.09404, over 1244.00 frames. ], tot_loss[loss=0.1226, simple_loss=0.1533, pruned_loss=0.04591, over 1913626.44 frames. ], batch size: 100, lr: 6.02e-03, grad_scale: 8.0 +2022-12-08 04:22:54,745 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96451.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:23:09,367 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96468.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 04:23:13,581 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.334e+02 2.382e+02 2.762e+02 3.323e+02 6.415e+02, threshold=5.524e+02, percent-clipped=3.0 +2022-12-08 04:23:40,464 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96503.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:23:51,381 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96516.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:23:55,870 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96521.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:24:17,061 INFO [train.py:873] (2/4) Epoch 13, batch 5800, loss[loss=0.1004, simple_loss=0.1431, pruned_loss=0.02891, over 14289.00 frames. ], tot_loss[loss=0.1222, simple_loss=0.1532, pruned_loss=0.04559, over 1889536.47 frames. ], batch size: 25, lr: 6.02e-03, grad_scale: 4.0 +2022-12-08 04:24:22,206 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96551.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:24:42,624 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.152e+02 2.229e+02 2.647e+02 3.440e+02 6.528e+02, threshold=5.293e+02, percent-clipped=3.0 +2022-12-08 04:25:37,054 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96636.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:25:44,389 INFO [train.py:873] (2/4) Epoch 13, batch 5900, loss[loss=0.1673, simple_loss=0.1467, pruned_loss=0.09391, over 1245.00 frames. ], tot_loss[loss=0.1204, simple_loss=0.1522, pruned_loss=0.04427, over 1968062.92 frames. ], batch size: 100, lr: 6.01e-03, grad_scale: 4.0 +2022-12-08 04:26:10,181 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.137e+02 2.158e+02 2.566e+02 3.278e+02 1.082e+03, threshold=5.131e+02, percent-clipped=5.0 +2022-12-08 04:26:15,613 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96680.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:26:18,911 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96684.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:26:42,763 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0744, 3.6181, 2.9126, 4.4974, 4.0998, 4.1639, 3.8276, 3.1257], + device='cuda:2'), covar=tensor([0.1248, 0.1587, 0.3834, 0.0564, 0.1182, 0.1845, 0.1267, 0.3418], + device='cuda:2'), in_proj_covar=tensor([0.0274, 0.0292, 0.0268, 0.0269, 0.0316, 0.0298, 0.0257, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:27:03,979 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96736.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:27:11,548 INFO [train.py:873] (2/4) Epoch 13, batch 6000, loss[loss=0.1326, simple_loss=0.1631, pruned_loss=0.051, over 14514.00 frames. ], tot_loss[loss=0.1203, simple_loss=0.1522, pruned_loss=0.04418, over 1951484.67 frames. ], batch size: 51, lr: 6.01e-03, grad_scale: 8.0 +2022-12-08 04:27:11,548 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 04:27:19,804 INFO [train.py:905] (2/4) Epoch 13, validation: loss=0.132, simple_loss=0.1717, pruned_loss=0.04613, over 857387.00 frames. +2022-12-08 04:27:19,805 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 04:27:20,823 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=96746.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:27:45,011 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.225e+02 2.016e+02 2.616e+02 3.028e+02 5.890e+02, threshold=5.232e+02, percent-clipped=1.0 +2022-12-08 04:27:46,854 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=96776.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:27:53,300 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96784.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:28:25,754 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96821.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:28:40,221 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=96837.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:28:47,194 INFO [train.py:873] (2/4) Epoch 13, batch 6100, loss[loss=0.1074, simple_loss=0.1454, pruned_loss=0.03473, over 13925.00 frames. ], tot_loss[loss=0.1196, simple_loss=0.1519, pruned_loss=0.04365, over 2041105.23 frames. ], batch size: 23, lr: 6.01e-03, grad_scale: 8.0 +2022-12-08 04:29:00,541 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3716, 4.1599, 3.8325, 3.9997, 4.2269, 4.2693, 4.4235, 4.3820], + device='cuda:2'), covar=tensor([0.0979, 0.0530, 0.2143, 0.3070, 0.0796, 0.0894, 0.0863, 0.0816], + device='cuda:2'), in_proj_covar=tensor([0.0377, 0.0267, 0.0440, 0.0554, 0.0331, 0.0431, 0.0382, 0.0372], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:29:04,921 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8830, 1.6199, 1.8565, 2.0287, 1.4642, 1.8121, 1.8597, 1.9339], + device='cuda:2'), covar=tensor([0.0145, 0.0239, 0.0141, 0.0118, 0.0249, 0.0245, 0.0167, 0.0116], + device='cuda:2'), in_proj_covar=tensor([0.0290, 0.0254, 0.0369, 0.0323, 0.0266, 0.0299, 0.0303, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:29:06,883 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1396, 3.0524, 2.9116, 3.2300, 2.8309, 2.8110, 3.2545, 3.1216], + device='cuda:2'), covar=tensor([0.0721, 0.0885, 0.0965, 0.0643, 0.1078, 0.0800, 0.0660, 0.0774], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0135, 0.0143, 0.0153, 0.0141, 0.0119, 0.0161, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:29:07,681 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=96869.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:29:12,499 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.404e+02 2.137e+02 2.568e+02 3.326e+02 6.550e+02, threshold=5.137e+02, percent-clipped=4.0 +2022-12-08 04:29:12,716 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3539, 1.8961, 2.3356, 2.4210, 2.1864, 1.9008, 2.4527, 2.1347], + device='cuda:2'), covar=tensor([0.0398, 0.0836, 0.0408, 0.0344, 0.0506, 0.0996, 0.0387, 0.0550], + device='cuda:2'), in_proj_covar=tensor([0.0291, 0.0255, 0.0370, 0.0324, 0.0266, 0.0300, 0.0304, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:29:21,373 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.11 vs. limit=2.0 +2022-12-08 04:30:13,979 INFO [train.py:873] (2/4) Epoch 13, batch 6200, loss[loss=0.1241, simple_loss=0.1532, pruned_loss=0.0475, over 14268.00 frames. ], tot_loss[loss=0.1207, simple_loss=0.1527, pruned_loss=0.04435, over 2008477.42 frames. ], batch size: 89, lr: 6.00e-03, grad_scale: 8.0 +2022-12-08 04:30:39,952 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.066e+02 2.243e+02 2.655e+02 3.164e+02 5.951e+02, threshold=5.310e+02, percent-clipped=2.0 +2022-12-08 04:30:45,342 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=96980.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:31:03,643 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97000.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:31:09,990 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9594, 1.8150, 2.0513, 1.9101, 1.8921, 1.6252, 1.3795, 1.2000], + device='cuda:2'), covar=tensor([0.0251, 0.0342, 0.0241, 0.0200, 0.0295, 0.0331, 0.0297, 0.0565], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0016, 0.0017, 0.0018, 0.0029, 0.0023, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 04:31:14,670 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-12-08 04:31:28,046 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97028.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:31:43,228 INFO [train.py:873] (2/4) Epoch 13, batch 6300, loss[loss=0.2329, simple_loss=0.1979, pruned_loss=0.1339, over 1168.00 frames. ], tot_loss[loss=0.12, simple_loss=0.1523, pruned_loss=0.04388, over 1977661.66 frames. ], batch size: 100, lr: 6.00e-03, grad_scale: 4.0 +2022-12-08 04:31:44,293 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97046.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:31:50,431 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.51 vs. limit=2.0 +2022-12-08 04:31:57,036 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97061.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:32:09,641 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.237e+02 2.227e+02 2.717e+02 3.666e+02 6.842e+02, threshold=5.435e+02, percent-clipped=6.0 +2022-12-08 04:32:25,896 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97094.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:32:30,530 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.44 vs. limit=2.0 +2022-12-08 04:32:58,842 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97132.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:33:09,963 INFO [train.py:873] (2/4) Epoch 13, batch 6400, loss[loss=0.1153, simple_loss=0.1478, pruned_loss=0.04137, over 13883.00 frames. ], tot_loss[loss=0.1187, simple_loss=0.1513, pruned_loss=0.04303, over 1993047.21 frames. ], batch size: 23, lr: 6.00e-03, grad_scale: 8.0 +2022-12-08 04:33:36,266 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.400e+02 2.249e+02 2.635e+02 3.271e+02 6.214e+02, threshold=5.270e+02, percent-clipped=1.0 +2022-12-08 04:34:37,204 INFO [train.py:873] (2/4) Epoch 13, batch 6500, loss[loss=0.1446, simple_loss=0.1656, pruned_loss=0.06179, over 8634.00 frames. ], tot_loss[loss=0.1194, simple_loss=0.1516, pruned_loss=0.04361, over 1934458.58 frames. ], batch size: 100, lr: 6.00e-03, grad_scale: 8.0 +2022-12-08 04:34:50,003 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97260.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:35:03,586 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.464e+02 2.327e+02 2.807e+02 3.571e+02 7.532e+02, threshold=5.613e+02, percent-clipped=5.0 +2022-12-08 04:35:44,109 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97321.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:35:49,639 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6504, 2.3367, 3.5797, 3.7320, 3.5976, 2.3569, 3.6125, 2.7409], + device='cuda:2'), covar=tensor([0.0459, 0.0968, 0.0901, 0.0569, 0.0467, 0.1477, 0.0494, 0.0961], + device='cuda:2'), in_proj_covar=tensor([0.0289, 0.0254, 0.0369, 0.0323, 0.0265, 0.0298, 0.0304, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:36:02,214 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97342.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:36:04,580 INFO [train.py:873] (2/4) Epoch 13, batch 6600, loss[loss=0.1502, simple_loss=0.1673, pruned_loss=0.0665, over 10319.00 frames. ], tot_loss[loss=0.1193, simple_loss=0.1513, pruned_loss=0.04361, over 2004979.36 frames. ], batch size: 100, lr: 5.99e-03, grad_scale: 8.0 +2022-12-08 04:36:06,447 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97347.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:36:14,720 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97356.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:36:30,433 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97374.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:36:31,019 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.234e+02 2.194e+02 2.777e+02 3.424e+02 5.452e+02, threshold=5.554e+02, percent-clipped=0.0 +2022-12-08 04:36:55,767 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97403.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 04:37:00,398 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97408.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:37:02,950 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2129, 2.6251, 5.0482, 3.4288, 4.9349, 2.5228, 3.8661, 4.8252], + device='cuda:2'), covar=tensor([0.0401, 0.4059, 0.0644, 0.6574, 0.0501, 0.3530, 0.1208, 0.0390], + device='cuda:2'), in_proj_covar=tensor([0.0255, 0.0210, 0.0213, 0.0286, 0.0228, 0.0212, 0.0212, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:37:09,869 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3448, 2.3100, 2.2307, 1.9649, 2.4848, 2.1323, 1.8929, 2.0806], + device='cuda:2'), covar=tensor([0.0194, 0.0576, 0.0382, 0.0330, 0.0275, 0.0446, 0.0476, 0.0505], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0016, 0.0017, 0.0018, 0.0029, 0.0023, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 04:37:20,871 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97432.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:37:23,925 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97435.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:37:32,757 INFO [train.py:873] (2/4) Epoch 13, batch 6700, loss[loss=0.09836, simple_loss=0.1365, pruned_loss=0.03012, over 14234.00 frames. ], tot_loss[loss=0.1199, simple_loss=0.1514, pruned_loss=0.04416, over 1929270.19 frames. ], batch size: 69, lr: 5.99e-03, grad_scale: 8.0 +2022-12-08 04:37:59,105 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.315e+02 2.128e+02 2.579e+02 3.037e+02 5.604e+02, threshold=5.159e+02, percent-clipped=1.0 +2022-12-08 04:38:03,372 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97480.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:38:10,483 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-12-08 04:38:17,295 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.67 vs. limit=2.0 +2022-12-08 04:38:20,531 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-12-08 04:38:42,418 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3585, 3.6528, 3.5509, 3.4677, 2.6870, 3.6845, 3.3191, 1.9588], + device='cuda:2'), covar=tensor([0.1737, 0.0637, 0.0759, 0.0633, 0.0954, 0.0484, 0.0953, 0.2112], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0085, 0.0065, 0.0069, 0.0097, 0.0082, 0.0098, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 04:38:59,023 INFO [train.py:873] (2/4) Epoch 13, batch 6800, loss[loss=0.1343, simple_loss=0.1595, pruned_loss=0.05454, over 4942.00 frames. ], tot_loss[loss=0.1212, simple_loss=0.1521, pruned_loss=0.04511, over 1906432.34 frames. ], batch size: 100, lr: 5.99e-03, grad_scale: 8.0 +2022-12-08 04:39:08,484 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97555.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:39:11,093 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5130, 2.2251, 2.4235, 1.7030, 2.1539, 2.3789, 2.4924, 2.1631], + device='cuda:2'), covar=tensor([0.0828, 0.0833, 0.1026, 0.1502, 0.1179, 0.0880, 0.0735, 0.1404], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0177, 0.0139, 0.0127, 0.0138, 0.0148, 0.0125, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 04:39:24,725 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8334, 1.2130, 2.0063, 1.3104, 1.9459, 2.0692, 1.7249, 2.1158], + device='cuda:2'), covar=tensor([0.0376, 0.2334, 0.0566, 0.1870, 0.0603, 0.0590, 0.1172, 0.0402], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0158, 0.0158, 0.0169, 0.0167, 0.0176, 0.0134, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:39:25,406 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.218e+02 2.339e+02 3.043e+02 3.694e+02 1.051e+03, threshold=6.086e+02, percent-clipped=5.0 +2022-12-08 04:39:38,390 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4645, 1.7587, 2.6941, 2.1472, 2.5531, 1.6926, 2.2424, 2.4607], + device='cuda:2'), covar=tensor([0.1566, 0.3760, 0.0669, 0.3551, 0.1071, 0.3030, 0.1126, 0.0642], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0208, 0.0212, 0.0283, 0.0228, 0.0211, 0.0209, 0.0215], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:40:01,868 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97616.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:40:01,947 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97616.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:40:26,027 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5493, 1.3447, 2.7746, 1.5284, 2.7850, 2.7145, 1.9221, 2.8960], + device='cuda:2'), covar=tensor([0.0287, 0.2812, 0.0407, 0.1926, 0.0419, 0.0507, 0.1138, 0.0243], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0158, 0.0158, 0.0170, 0.0167, 0.0176, 0.0134, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:40:26,466 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.32 vs. limit=5.0 +2022-12-08 04:40:27,572 INFO [train.py:873] (2/4) Epoch 13, batch 6900, loss[loss=0.1006, simple_loss=0.1466, pruned_loss=0.02735, over 14511.00 frames. ], tot_loss[loss=0.1195, simple_loss=0.1511, pruned_loss=0.04397, over 1957753.21 frames. ], batch size: 22, lr: 5.98e-03, grad_scale: 8.0 +2022-12-08 04:40:29,368 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7624, 1.1275, 1.2894, 1.2373, 1.1227, 1.2750, 1.0572, 0.9680], + device='cuda:2'), covar=tensor([0.2337, 0.0970, 0.0362, 0.0542, 0.1574, 0.0927, 0.2328, 0.1476], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0085, 0.0065, 0.0069, 0.0097, 0.0082, 0.0098, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 04:40:36,920 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97656.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:40:49,048 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2291, 1.9082, 3.4403, 2.5020, 3.2609, 1.8655, 2.5764, 3.1758], + device='cuda:2'), covar=tensor([0.0832, 0.4246, 0.0603, 0.4867, 0.0752, 0.3321, 0.1400, 0.0820], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0208, 0.0212, 0.0282, 0.0228, 0.0210, 0.0208, 0.0214], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:40:52,668 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97674.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 04:40:53,268 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.208e+02 2.182e+02 2.726e+02 3.211e+02 6.785e+02, threshold=5.451e+02, percent-clipped=1.0 +2022-12-08 04:41:02,855 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.58 vs. limit=2.0 +2022-12-08 04:41:13,562 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97698.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:41:17,875 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97703.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:41:18,592 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97704.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:41:41,326 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97730.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:41:45,643 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97735.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:41:53,813 INFO [train.py:873] (2/4) Epoch 13, batch 7000, loss[loss=0.1358, simple_loss=0.147, pruned_loss=0.06229, over 3876.00 frames. ], tot_loss[loss=0.1189, simple_loss=0.151, pruned_loss=0.0434, over 1931635.84 frames. ], batch size: 100, lr: 5.98e-03, grad_scale: 8.0 +2022-12-08 04:41:56,731 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5492, 2.2163, 2.4485, 1.6236, 2.1111, 2.4482, 2.5222, 2.1761], + device='cuda:2'), covar=tensor([0.0762, 0.0810, 0.0867, 0.1540, 0.1039, 0.0843, 0.0662, 0.1243], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0177, 0.0139, 0.0126, 0.0139, 0.0150, 0.0125, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 04:42:20,482 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.837e+01 2.234e+02 2.676e+02 3.367e+02 8.383e+02, threshold=5.352e+02, percent-clipped=3.0 +2022-12-08 04:43:02,898 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-08 04:43:15,558 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.80 vs. limit=2.0 +2022-12-08 04:43:18,034 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3619, 2.3682, 1.9822, 2.4391, 2.2935, 2.3587, 2.1764, 1.9916], + device='cuda:2'), covar=tensor([0.1061, 0.0896, 0.1932, 0.0788, 0.1057, 0.0768, 0.1331, 0.1377], + device='cuda:2'), in_proj_covar=tensor([0.0273, 0.0292, 0.0266, 0.0271, 0.0313, 0.0296, 0.0254, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:43:23,311 INFO [train.py:873] (2/4) Epoch 13, batch 7100, loss[loss=0.1145, simple_loss=0.1539, pruned_loss=0.03752, over 14239.00 frames. ], tot_loss[loss=0.1194, simple_loss=0.1514, pruned_loss=0.04368, over 1942013.88 frames. ], batch size: 69, lr: 5.98e-03, grad_scale: 8.0 +2022-12-08 04:43:49,282 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.222e+02 2.322e+02 2.890e+02 3.385e+02 9.839e+02, threshold=5.780e+02, percent-clipped=7.0 +2022-12-08 04:43:55,926 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2133, 2.8810, 2.8112, 2.0563, 2.6969, 3.0176, 3.2223, 2.4869], + device='cuda:2'), covar=tensor([0.0695, 0.0932, 0.1177, 0.1596, 0.0920, 0.0657, 0.0661, 0.1470], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0176, 0.0139, 0.0127, 0.0138, 0.0149, 0.0125, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 04:44:01,063 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=97888.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:44:01,980 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8791, 1.8843, 2.1831, 2.1711, 2.3891, 2.0053, 1.9372, 1.9026], + device='cuda:2'), covar=tensor([0.0473, 0.0831, 0.0352, 0.0541, 0.0325, 0.0461, 0.0473, 0.0597], + device='cuda:2'), in_proj_covar=tensor([0.0018, 0.0019, 0.0016, 0.0017, 0.0017, 0.0029, 0.0023, 0.0028], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 04:44:19,441 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1287, 2.0643, 1.8247, 1.8956, 2.0349, 2.1089, 2.0985, 2.0747], + device='cuda:2'), covar=tensor([0.1171, 0.0939, 0.2704, 0.2656, 0.1415, 0.1215, 0.1588, 0.1168], + device='cuda:2'), in_proj_covar=tensor([0.0381, 0.0269, 0.0441, 0.0556, 0.0336, 0.0434, 0.0387, 0.0373], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:44:21,114 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=97911.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:44:26,047 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97916.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:44:49,858 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1157, 1.3412, 4.0429, 1.6564, 3.8015, 4.1011, 3.3531, 4.3763], + device='cuda:2'), covar=tensor([0.0257, 0.3882, 0.0440, 0.2869, 0.0596, 0.0412, 0.0650, 0.0249], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0157, 0.0157, 0.0168, 0.0167, 0.0175, 0.0133, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:44:51,432 INFO [train.py:873] (2/4) Epoch 13, batch 7200, loss[loss=0.1002, simple_loss=0.1253, pruned_loss=0.03761, over 3902.00 frames. ], tot_loss[loss=0.1195, simple_loss=0.1517, pruned_loss=0.04369, over 2015462.29 frames. ], batch size: 100, lr: 5.97e-03, grad_scale: 8.0 +2022-12-08 04:44:54,809 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=97949.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:44:55,756 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6745, 2.3972, 3.0844, 2.0090, 1.9363, 2.7474, 1.5457, 2.7038], + device='cuda:2'), covar=tensor([0.1214, 0.1522, 0.0613, 0.2694, 0.2493, 0.0880, 0.3436, 0.0935], + device='cuda:2'), in_proj_covar=tensor([0.0084, 0.0099, 0.0090, 0.0098, 0.0114, 0.0086, 0.0122, 0.0090], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 04:45:08,591 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=97964.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:45:18,186 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.225e+02 2.164e+02 2.816e+02 3.534e+02 1.223e+03, threshold=5.633e+02, percent-clipped=3.0 +2022-12-08 04:45:38,800 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=97998.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:45:43,500 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98003.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:46:07,194 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98030.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 04:46:07,220 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98030.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:46:20,635 INFO [train.py:873] (2/4) Epoch 13, batch 7300, loss[loss=0.1213, simple_loss=0.1523, pruned_loss=0.04515, over 14296.00 frames. ], tot_loss[loss=0.1174, simple_loss=0.1499, pruned_loss=0.04243, over 2014361.24 frames. ], batch size: 69, lr: 5.97e-03, grad_scale: 8.0 +2022-12-08 04:46:21,868 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98046.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:46:26,101 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98051.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:46:42,974 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8545, 2.7270, 2.4507, 2.5653, 2.7746, 2.8101, 2.8424, 2.8075], + device='cuda:2'), covar=tensor([0.1167, 0.0901, 0.2379, 0.2521, 0.1045, 0.1153, 0.1553, 0.1069], + device='cuda:2'), in_proj_covar=tensor([0.0384, 0.0267, 0.0442, 0.0557, 0.0335, 0.0435, 0.0386, 0.0374], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:46:46,648 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.423e+02 2.194e+02 2.668e+02 3.260e+02 4.997e+02, threshold=5.336e+02, percent-clipped=0.0 +2022-12-08 04:46:50,043 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98078.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:47:14,635 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98106.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:47:46,040 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.68 vs. limit=2.0 +2022-12-08 04:47:48,798 INFO [train.py:873] (2/4) Epoch 13, batch 7400, loss[loss=0.1105, simple_loss=0.1469, pruned_loss=0.03707, over 14264.00 frames. ], tot_loss[loss=0.1184, simple_loss=0.1503, pruned_loss=0.04321, over 1960979.45 frames. ], batch size: 76, lr: 5.97e-03, grad_scale: 8.0 +2022-12-08 04:48:07,790 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1390, 2.0533, 1.7814, 1.8684, 2.0472, 2.0814, 2.0567, 2.0698], + device='cuda:2'), covar=tensor([0.1178, 0.1029, 0.2800, 0.3014, 0.1432, 0.1183, 0.2086, 0.1250], + device='cuda:2'), in_proj_covar=tensor([0.0381, 0.0266, 0.0439, 0.0554, 0.0334, 0.0433, 0.0386, 0.0372], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:48:08,747 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98167.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:48:15,367 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.067e+02 1.931e+02 2.491e+02 3.118e+02 6.123e+02, threshold=4.982e+02, percent-clipped=1.0 +2022-12-08 04:48:48,125 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98211.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:49:08,877 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1720, 1.4270, 1.6832, 1.7104, 1.4994, 1.6173, 1.3184, 1.2857], + device='cuda:2'), covar=tensor([0.1691, 0.1469, 0.0417, 0.0537, 0.1297, 0.0994, 0.1964, 0.1707], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0084, 0.0066, 0.0069, 0.0097, 0.0082, 0.0097, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 04:49:16,930 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98244.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:49:17,649 INFO [train.py:873] (2/4) Epoch 13, batch 7500, loss[loss=0.09305, simple_loss=0.1373, pruned_loss=0.0244, over 14305.00 frames. ], tot_loss[loss=0.1192, simple_loss=0.1509, pruned_loss=0.04379, over 1964234.70 frames. ], batch size: 25, lr: 5.96e-03, grad_scale: 4.0 +2022-12-08 04:49:29,612 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98259.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:49:44,017 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.582e+02 2.297e+02 2.932e+02 3.500e+02 6.688e+02, threshold=5.864e+02, percent-clipped=4.0 +2022-12-08 04:50:44,935 INFO [train.py:873] (2/4) Epoch 14, batch 0, loss[loss=0.1275, simple_loss=0.1731, pruned_loss=0.0409, over 14012.00 frames. ], tot_loss[loss=0.1275, simple_loss=0.1731, pruned_loss=0.0409, over 14012.00 frames. ], batch size: 22, lr: 5.75e-03, grad_scale: 8.0 +2022-12-08 04:50:44,935 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 04:50:49,729 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1281, 4.2655, 4.3386, 3.7213, 4.2613, 4.3508, 1.9625, 4.0993], + device='cuda:2'), covar=tensor([0.0179, 0.0214, 0.0255, 0.0385, 0.0231, 0.0314, 0.2691, 0.0215], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0170, 0.0142, 0.0140, 0.0201, 0.0135, 0.0158, 0.0188], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 04:50:52,247 INFO [train.py:905] (2/4) Epoch 14, validation: loss=0.1389, simple_loss=0.1805, pruned_loss=0.04866, over 857387.00 frames. +2022-12-08 04:50:52,247 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 04:51:13,036 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98330.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:51:53,528 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.725e+01 2.112e+02 3.078e+02 4.479e+02 1.243e+03, threshold=6.157e+02, percent-clipped=15.0 +2022-12-08 04:51:55,468 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98378.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 04:52:20,898 INFO [train.py:873] (2/4) Epoch 14, batch 100, loss[loss=0.1371, simple_loss=0.1575, pruned_loss=0.05837, over 13874.00 frames. ], tot_loss[loss=0.1204, simple_loss=0.1526, pruned_loss=0.04412, over 832646.11 frames. ], batch size: 20, lr: 5.74e-03, grad_scale: 8.0 +2022-12-08 04:52:22,357 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8623, 3.4624, 3.3421, 2.5452, 3.3418, 3.4991, 3.8310, 3.1136], + device='cuda:2'), covar=tensor([0.0473, 0.1217, 0.0792, 0.1322, 0.0642, 0.0616, 0.0669, 0.0972], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0175, 0.0136, 0.0124, 0.0137, 0.0147, 0.0124, 0.0138], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 04:52:24,315 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98410.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:53:10,374 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98462.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:53:18,281 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98471.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:53:22,339 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.180e+02 2.038e+02 2.657e+02 3.395e+02 7.955e+02, threshold=5.314e+02, percent-clipped=1.0 +2022-12-08 04:53:49,904 INFO [train.py:873] (2/4) Epoch 14, batch 200, loss[loss=0.1426, simple_loss=0.1677, pruned_loss=0.05876, over 11201.00 frames. ], tot_loss[loss=0.1211, simple_loss=0.1525, pruned_loss=0.04491, over 1252151.10 frames. ], batch size: 100, lr: 5.74e-03, grad_scale: 8.0 +2022-12-08 04:54:08,124 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8280, 3.5535, 3.5816, 3.8609, 3.4656, 3.2056, 3.9090, 3.7353], + device='cuda:2'), covar=tensor([0.0695, 0.0990, 0.0922, 0.0698, 0.1090, 0.0855, 0.0648, 0.0809], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0136, 0.0144, 0.0154, 0.0143, 0.0119, 0.0163, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:54:22,187 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98543.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:54:22,925 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98544.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:54:50,566 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.342e+02 2.243e+02 2.774e+02 3.161e+02 6.652e+02, threshold=5.548e+02, percent-clipped=3.0 +2022-12-08 04:55:03,797 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98592.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:55:14,501 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98604.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:55:16,907 INFO [train.py:873] (2/4) Epoch 14, batch 300, loss[loss=0.1333, simple_loss=0.1639, pruned_loss=0.05138, over 13522.00 frames. ], tot_loss[loss=0.1211, simple_loss=0.1522, pruned_loss=0.04504, over 1524051.72 frames. ], batch size: 100, lr: 5.74e-03, grad_scale: 8.0 +2022-12-08 04:55:23,235 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3848, 2.3607, 1.8610, 2.4328, 2.2786, 2.3195, 2.1764, 2.0044], + device='cuda:2'), covar=tensor([0.0748, 0.0776, 0.1993, 0.0629, 0.0940, 0.0657, 0.1226, 0.1227], + device='cuda:2'), in_proj_covar=tensor([0.0273, 0.0290, 0.0265, 0.0270, 0.0313, 0.0296, 0.0255, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:55:27,705 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1934, 3.9102, 3.8204, 4.2647, 3.8447, 3.5465, 4.2447, 4.0615], + device='cuda:2'), covar=tensor([0.0590, 0.0889, 0.0809, 0.0534, 0.0848, 0.0799, 0.0578, 0.0703], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0137, 0.0145, 0.0155, 0.0144, 0.0119, 0.0163, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 04:56:18,353 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.000e+02 2.234e+02 2.680e+02 3.385e+02 6.957e+02, threshold=5.360e+02, percent-clipped=3.0 +2022-12-08 04:56:45,721 INFO [train.py:873] (2/4) Epoch 14, batch 400, loss[loss=0.1215, simple_loss=0.152, pruned_loss=0.04547, over 5938.00 frames. ], tot_loss[loss=0.1192, simple_loss=0.1512, pruned_loss=0.04359, over 1757049.04 frames. ], batch size: 100, lr: 5.73e-03, grad_scale: 8.0 +2022-12-08 04:57:18,380 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=2.51 vs. limit=2.0 +2022-12-08 04:57:33,727 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=98762.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:57:36,962 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98766.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:57:46,467 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.225e+02 2.221e+02 2.679e+02 3.803e+02 8.740e+02, threshold=5.359e+02, percent-clipped=4.0 +2022-12-08 04:57:59,059 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7576, 2.9237, 4.4965, 3.3384, 4.4375, 4.4385, 4.2844, 3.9834], + device='cuda:2'), covar=tensor([0.0878, 0.3127, 0.1003, 0.1987, 0.0767, 0.0791, 0.1699, 0.1659], + device='cuda:2'), in_proj_covar=tensor([0.0348, 0.0312, 0.0390, 0.0299, 0.0369, 0.0317, 0.0360, 0.0301], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:58:13,697 INFO [train.py:873] (2/4) Epoch 14, batch 500, loss[loss=0.128, simple_loss=0.1421, pruned_loss=0.05699, over 5009.00 frames. ], tot_loss[loss=0.1185, simple_loss=0.1509, pruned_loss=0.04304, over 1832195.64 frames. ], batch size: 100, lr: 5.73e-03, grad_scale: 8.0 +2022-12-08 04:58:16,990 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=98810.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:58:43,047 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4114, 1.3833, 3.6457, 1.6673, 3.3821, 3.5866, 2.5872, 3.7373], + device='cuda:2'), covar=tensor([0.0348, 0.4189, 0.0458, 0.2834, 0.0846, 0.0509, 0.1060, 0.0322], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0159, 0.0161, 0.0173, 0.0169, 0.0179, 0.0136, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:59:08,838 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3976, 1.9400, 3.5293, 2.4835, 3.3254, 1.8072, 2.6025, 3.3431], + device='cuda:2'), covar=tensor([0.0713, 0.3999, 0.0524, 0.4935, 0.0809, 0.3343, 0.1432, 0.0659], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0207, 0.0209, 0.0280, 0.0227, 0.0207, 0.0206, 0.0209], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 04:59:14,756 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.381e+02 1.935e+02 2.381e+02 2.894e+02 6.356e+02, threshold=4.763e+02, percent-clipped=3.0 +2022-12-08 04:59:26,751 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98890.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:59:31,761 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98895.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:59:35,185 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=98899.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 04:59:42,007 INFO [train.py:873] (2/4) Epoch 14, batch 600, loss[loss=0.1085, simple_loss=0.1118, pruned_loss=0.05257, over 1240.00 frames. ], tot_loss[loss=0.118, simple_loss=0.1503, pruned_loss=0.04283, over 1858404.93 frames. ], batch size: 100, lr: 5.73e-03, grad_scale: 8.0 +2022-12-08 05:00:15,386 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 05:00:20,916 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98951.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:00:25,439 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=98956.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:00:38,401 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=98971.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:00:43,054 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.204e+01 1.954e+02 2.310e+02 3.088e+02 7.406e+02, threshold=4.620e+02, percent-clipped=2.0 +2022-12-08 05:01:09,650 INFO [train.py:873] (2/4) Epoch 14, batch 700, loss[loss=0.1089, simple_loss=0.1472, pruned_loss=0.03536, over 14561.00 frames. ], tot_loss[loss=0.118, simple_loss=0.1504, pruned_loss=0.04283, over 1885658.65 frames. ], batch size: 34, lr: 5.73e-03, grad_scale: 8.0 +2022-12-08 05:01:32,187 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99032.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:01:48,597 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99051.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:02:01,353 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99066.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:02:09,758 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.403e+02 2.198e+02 2.677e+02 3.490e+02 1.171e+03, threshold=5.353e+02, percent-clipped=9.0 +2022-12-08 05:02:37,295 INFO [train.py:873] (2/4) Epoch 14, batch 800, loss[loss=0.1521, simple_loss=0.1471, pruned_loss=0.07851, over 2583.00 frames. ], tot_loss[loss=0.1186, simple_loss=0.1507, pruned_loss=0.04327, over 1893219.69 frames. ], batch size: 100, lr: 5.72e-03, grad_scale: 8.0 +2022-12-08 05:02:37,529 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8352, 2.5418, 3.6367, 2.7941, 3.5991, 3.5480, 3.4945, 3.0270], + device='cuda:2'), covar=tensor([0.0878, 0.2696, 0.0966, 0.1802, 0.0768, 0.0906, 0.1320, 0.1855], + device='cuda:2'), in_proj_covar=tensor([0.0350, 0.0311, 0.0393, 0.0300, 0.0372, 0.0318, 0.0360, 0.0304], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:02:41,497 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99112.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:02:43,048 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99114.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:03:38,209 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.475e+02 2.181e+02 2.713e+02 3.365e+02 5.447e+02, threshold=5.425e+02, percent-clipped=1.0 +2022-12-08 05:03:58,148 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99199.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:04:05,016 INFO [train.py:873] (2/4) Epoch 14, batch 900, loss[loss=0.135, simple_loss=0.159, pruned_loss=0.05547, over 14253.00 frames. ], tot_loss[loss=0.1184, simple_loss=0.1509, pruned_loss=0.04292, over 1999210.71 frames. ], batch size: 37, lr: 5.72e-03, grad_scale: 8.0 +2022-12-08 05:04:24,474 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99229.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:04:39,124 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99246.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:04:39,934 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99247.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:04:42,706 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7908, 1.8171, 1.9786, 1.9683, 2.1073, 1.2922, 1.6054, 1.7499], + device='cuda:2'), covar=tensor([0.1217, 0.0985, 0.0613, 0.0693, 0.0690, 0.0924, 0.0973, 0.1241], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0030, 0.0033, 0.0028, 0.0030, 0.0043, 0.0031, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 05:04:43,448 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99251.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:04:58,850 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99268.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:05:02,336 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6062, 3.2769, 2.4369, 3.7490, 3.5134, 3.5892, 3.1882, 2.4840], + device='cuda:2'), covar=tensor([0.0794, 0.1377, 0.3787, 0.0671, 0.1022, 0.1371, 0.1314, 0.3683], + device='cuda:2'), in_proj_covar=tensor([0.0273, 0.0291, 0.0264, 0.0271, 0.0314, 0.0296, 0.0253, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 05:05:04,173 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2272, 2.1570, 4.9695, 4.4998, 4.3211, 4.9868, 4.8271, 5.0725], + device='cuda:2'), covar=tensor([0.1361, 0.1294, 0.0101, 0.0186, 0.0212, 0.0125, 0.0122, 0.0111], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0158, 0.0127, 0.0167, 0.0146, 0.0140, 0.0120, 0.0121], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 05:05:05,667 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.191e+02 2.212e+02 2.795e+02 3.543e+02 9.051e+02, threshold=5.590e+02, percent-clipped=4.0 +2022-12-08 05:05:06,188 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-08 05:05:06,725 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8457, 1.6780, 2.0310, 1.7751, 1.8330, 1.5967, 0.8054, 1.0718], + device='cuda:2'), covar=tensor([0.0209, 0.0492, 0.0314, 0.0296, 0.0257, 0.0332, 0.0311, 0.0597], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0019, 0.0017, 0.0018, 0.0018, 0.0030, 0.0024, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 05:05:17,585 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99290.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:05:32,839 INFO [train.py:873] (2/4) Epoch 14, batch 1000, loss[loss=0.1257, simple_loss=0.1584, pruned_loss=0.04652, over 14390.00 frames. ], tot_loss[loss=0.1188, simple_loss=0.1512, pruned_loss=0.04318, over 1922551.38 frames. ], batch size: 53, lr: 5.72e-03, grad_scale: 8.0 +2022-12-08 05:05:47,439 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99324.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:05:49,848 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99327.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:05:51,547 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99329.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:06:01,248 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.70 vs. limit=5.0 +2022-12-08 05:06:01,391 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.67 vs. limit=2.0 +2022-12-08 05:06:06,536 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-12-08 05:06:06,568 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.48 vs. limit=5.0 +2022-12-08 05:06:32,478 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.188e+02 2.112e+02 2.804e+02 3.540e+02 1.187e+03, threshold=5.607e+02, percent-clipped=4.0 +2022-12-08 05:06:41,193 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99385.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:06:55,059 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.10 vs. limit=2.0 +2022-12-08 05:06:59,882 INFO [train.py:873] (2/4) Epoch 14, batch 1100, loss[loss=0.1077, simple_loss=0.1471, pruned_loss=0.03412, over 14307.00 frames. ], tot_loss[loss=0.119, simple_loss=0.1513, pruned_loss=0.04339, over 1943378.95 frames. ], batch size: 46, lr: 5.71e-03, grad_scale: 8.0 +2022-12-08 05:06:59,959 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99407.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:07:10,245 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8753, 2.7261, 2.7157, 2.9210, 2.7791, 2.8101, 2.9776, 2.4937], + device='cuda:2'), covar=tensor([0.0704, 0.1048, 0.0574, 0.0535, 0.0882, 0.0597, 0.0608, 0.0661], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0270, 0.0191, 0.0188, 0.0183, 0.0152, 0.0277, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 05:07:12,237 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99420.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:07:53,804 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99468.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:08:00,246 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.366e+02 2.195e+02 2.512e+02 3.214e+02 5.194e+02, threshold=5.024e+02, percent-clipped=0.0 +2022-12-08 05:08:04,845 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99481.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:08:27,692 INFO [train.py:873] (2/4) Epoch 14, batch 1200, loss[loss=0.1098, simple_loss=0.1405, pruned_loss=0.0396, over 14293.00 frames. ], tot_loss[loss=0.1194, simple_loss=0.1517, pruned_loss=0.04357, over 1915641.34 frames. ], batch size: 25, lr: 5.71e-03, grad_scale: 8.0 +2022-12-08 05:08:46,694 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99529.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:09:01,577 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99546.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:09:05,788 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99551.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:09:20,238 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99568.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:09:23,011 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3561, 0.9939, 1.3123, 0.9346, 1.0411, 1.4145, 1.0213, 1.1129], + device='cuda:2'), covar=tensor([0.0319, 0.0720, 0.0510, 0.0429, 0.0831, 0.0587, 0.0460, 0.1042], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0174, 0.0136, 0.0124, 0.0137, 0.0147, 0.0125, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 05:09:25,332 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0085, 2.0325, 2.0671, 2.0647, 2.0205, 1.5893, 1.2757, 1.7973], + device='cuda:2'), covar=tensor([0.0598, 0.0488, 0.0498, 0.0366, 0.0427, 0.1381, 0.2108, 0.0468], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0172, 0.0144, 0.0142, 0.0203, 0.0137, 0.0159, 0.0191], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 05:09:28,000 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.218e+02 2.156e+02 2.823e+02 3.405e+02 6.610e+02, threshold=5.645e+02, percent-clipped=3.0 +2022-12-08 05:09:35,450 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99585.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:09:38,990 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99589.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:09:42,817 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-08 05:09:43,002 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99594.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:09:47,147 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99599.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:09:51,462 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.91 vs. limit=5.0 +2022-12-08 05:09:54,365 INFO [train.py:873] (2/4) Epoch 14, batch 1300, loss[loss=0.1225, simple_loss=0.156, pruned_loss=0.04449, over 14284.00 frames. ], tot_loss[loss=0.1178, simple_loss=0.1509, pruned_loss=0.0424, over 1984632.84 frames. ], batch size: 80, lr: 5.71e-03, grad_scale: 4.0 +2022-12-08 05:09:54,787 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.22 vs. limit=2.0 +2022-12-08 05:10:10,436 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99624.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:10:12,993 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99627.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:10:14,819 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99629.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:10:32,915 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99650.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:10:54,762 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99675.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:10:56,552 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.319e+02 2.185e+02 2.869e+02 3.573e+02 7.828e+02, threshold=5.739e+02, percent-clipped=4.0 +2022-12-08 05:10:59,277 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99680.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:10:59,363 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=99680.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:11:23,270 INFO [train.py:873] (2/4) Epoch 14, batch 1400, loss[loss=0.1665, simple_loss=0.1568, pruned_loss=0.08816, over 1191.00 frames. ], tot_loss[loss=0.1188, simple_loss=0.1515, pruned_loss=0.04308, over 1972041.15 frames. ], batch size: 100, lr: 5.71e-03, grad_scale: 4.0 +2022-12-08 05:11:23,411 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99707.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:11:52,923 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=99741.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:12:01,314 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-12-08 05:12:05,058 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99755.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:12:14,693 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7337, 3.5100, 3.2868, 3.4292, 3.6502, 3.6418, 3.7191, 3.6900], + device='cuda:2'), covar=tensor([0.0915, 0.0753, 0.2040, 0.2740, 0.0806, 0.0958, 0.1062, 0.0954], + device='cuda:2'), in_proj_covar=tensor([0.0381, 0.0267, 0.0441, 0.0562, 0.0339, 0.0441, 0.0388, 0.0379], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:12:23,719 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99776.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:12:24,429 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.204e+02 2.049e+02 2.625e+02 3.154e+02 6.097e+02, threshold=5.251e+02, percent-clipped=2.0 +2022-12-08 05:12:40,772 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.81 vs. limit=2.0 +2022-12-08 05:12:50,315 INFO [train.py:873] (2/4) Epoch 14, batch 1500, loss[loss=0.1142, simple_loss=0.1561, pruned_loss=0.03611, over 14220.00 frames. ], tot_loss[loss=0.1183, simple_loss=0.1508, pruned_loss=0.04286, over 1955618.44 frames. ], batch size: 46, lr: 5.70e-03, grad_scale: 4.0 +2022-12-08 05:13:06,060 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99824.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:13:36,947 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4608, 3.9627, 3.0875, 4.7849, 4.3128, 4.6046, 3.9786, 3.2014], + device='cuda:2'), covar=tensor([0.0663, 0.1102, 0.3145, 0.0499, 0.0810, 0.1078, 0.0997, 0.2970], + device='cuda:2'), in_proj_covar=tensor([0.0270, 0.0288, 0.0261, 0.0267, 0.0310, 0.0292, 0.0251, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 05:13:44,241 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0163, 3.7973, 3.6818, 4.1297, 3.8401, 3.5799, 4.0910, 3.4746], + device='cuda:2'), covar=tensor([0.0653, 0.1058, 0.0470, 0.0439, 0.0826, 0.1605, 0.0601, 0.0572], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0275, 0.0194, 0.0193, 0.0185, 0.0156, 0.0283, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 05:13:49,916 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.22 vs. limit=5.0 +2022-12-08 05:13:51,262 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3167, 2.6792, 2.6593, 2.6456, 2.2055, 2.6413, 2.3715, 1.5464], + device='cuda:2'), covar=tensor([0.1233, 0.0674, 0.0839, 0.0796, 0.1020, 0.0609, 0.1134, 0.2145], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0084, 0.0067, 0.0070, 0.0098, 0.0084, 0.0098, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 05:13:51,974 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.709e+01 2.163e+02 2.561e+02 3.336e+02 7.134e+02, threshold=5.122e+02, percent-clipped=2.0 +2022-12-08 05:13:58,832 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99885.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:14:18,361 INFO [train.py:873] (2/4) Epoch 14, batch 1600, loss[loss=0.1649, simple_loss=0.1831, pruned_loss=0.07332, over 14276.00 frames. ], tot_loss[loss=0.1191, simple_loss=0.1513, pruned_loss=0.04346, over 2032321.86 frames. ], batch size: 76, lr: 5.70e-03, grad_scale: 8.0 +2022-12-08 05:14:32,761 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99924.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:14:32,798 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99924.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:14:35,213 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8747, 5.0096, 5.3065, 4.3058, 5.0509, 5.3831, 1.9866, 4.7780], + device='cuda:2'), covar=tensor([0.0239, 0.0255, 0.0345, 0.0471, 0.0289, 0.0114, 0.2830, 0.0272], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0172, 0.0144, 0.0141, 0.0203, 0.0136, 0.0158, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 05:14:40,526 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99933.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:14:44,820 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.99 vs. limit=5.0 +2022-12-08 05:14:52,030 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=99945.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:14:58,412 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2486, 1.7380, 2.4491, 2.0461, 2.3573, 1.5836, 1.9748, 2.2735], + device='cuda:2'), covar=tensor([0.1790, 0.3488, 0.0525, 0.2275, 0.1094, 0.2288, 0.1138, 0.1090], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0209, 0.0211, 0.0282, 0.0227, 0.0209, 0.0207, 0.0213], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:15:15,686 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=99972.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:15:20,191 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.257e+02 2.180e+02 2.741e+02 3.315e+02 6.557e+02, threshold=5.482e+02, percent-clipped=1.0 +2022-12-08 05:15:23,095 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=99980.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:15:50,679 INFO [train.py:873] (2/4) Epoch 14, batch 1700, loss[loss=0.111, simple_loss=0.1455, pruned_loss=0.03823, over 14498.00 frames. ], tot_loss[loss=0.1178, simple_loss=0.1509, pruned_loss=0.0424, over 2076443.52 frames. ], batch size: 51, lr: 5.70e-03, grad_scale: 8.0 +2022-12-08 05:15:50,897 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=100007.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 05:16:09,970 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100028.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:16:16,895 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=100036.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:16:36,582 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.55 vs. limit=5.0 +2022-12-08 05:16:45,312 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=100068.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 05:16:46,639 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9620, 1.5298, 4.7042, 2.2323, 4.4247, 4.8568, 4.4756, 5.3084], + device='cuda:2'), covar=tensor([0.0184, 0.3244, 0.0362, 0.2041, 0.0305, 0.0334, 0.0276, 0.0157], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0157, 0.0161, 0.0170, 0.0170, 0.0179, 0.0135, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:16:51,813 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100076.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:16:52,533 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.360e+02 2.197e+02 2.632e+02 3.389e+02 8.054e+02, threshold=5.264e+02, percent-clipped=5.0 +2022-12-08 05:17:19,682 INFO [train.py:873] (2/4) Epoch 14, batch 1800, loss[loss=0.1435, simple_loss=0.1661, pruned_loss=0.06039, over 6929.00 frames. ], tot_loss[loss=0.1173, simple_loss=0.1506, pruned_loss=0.042, over 2087063.93 frames. ], batch size: 100, lr: 5.69e-03, grad_scale: 8.0 +2022-12-08 05:17:26,519 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-12-08 05:17:34,625 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100124.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:17:34,729 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100124.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:18:17,541 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100172.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 05:18:21,976 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.282e+02 2.047e+02 2.482e+02 3.140e+02 5.159e+02, threshold=4.965e+02, percent-clipped=0.0 +2022-12-08 05:18:48,335 INFO [train.py:873] (2/4) Epoch 14, batch 1900, loss[loss=0.145, simple_loss=0.1338, pruned_loss=0.07815, over 1291.00 frames. ], tot_loss[loss=0.1167, simple_loss=0.1498, pruned_loss=0.04174, over 2039302.39 frames. ], batch size: 100, lr: 5.69e-03, grad_scale: 8.0 +2022-12-08 05:19:03,394 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100224.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:19:15,283 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=100238.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:19:21,257 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100245.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:19:45,365 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100272.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:19:49,479 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.349e+02 2.314e+02 2.818e+02 3.736e+02 1.298e+03, threshold=5.635e+02, percent-clipped=11.0 +2022-12-08 05:19:50,443 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3516, 3.2061, 3.1625, 3.4495, 3.0583, 2.9078, 3.4089, 3.2907], + device='cuda:2'), covar=tensor([0.0799, 0.1098, 0.0911, 0.0696, 0.1122, 0.0782, 0.0772, 0.0900], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0136, 0.0141, 0.0153, 0.0142, 0.0118, 0.0161, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 05:20:03,313 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100293.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:20:08,773 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=100299.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:20:15,747 INFO [train.py:873] (2/4) Epoch 14, batch 2000, loss[loss=0.09439, simple_loss=0.1356, pruned_loss=0.02657, over 13935.00 frames. ], tot_loss[loss=0.1175, simple_loss=0.1503, pruned_loss=0.04231, over 1948025.00 frames. ], batch size: 20, lr: 5.69e-03, grad_scale: 8.0 +2022-12-08 05:20:41,581 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100336.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:20:44,117 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8853, 4.5526, 4.3079, 4.4835, 4.5710, 4.7771, 4.8665, 4.8087], + device='cuda:2'), covar=tensor([0.0722, 0.0466, 0.2078, 0.2579, 0.0719, 0.0732, 0.0860, 0.0793], + device='cuda:2'), in_proj_covar=tensor([0.0378, 0.0267, 0.0439, 0.0556, 0.0338, 0.0436, 0.0382, 0.0378], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:20:58,897 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5574, 3.4152, 3.3170, 3.6272, 3.2404, 3.1640, 3.6303, 3.4680], + device='cuda:2'), covar=tensor([0.0688, 0.1001, 0.0807, 0.0632, 0.0980, 0.0678, 0.0749, 0.0818], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0136, 0.0141, 0.0153, 0.0142, 0.0117, 0.0161, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 05:21:05,148 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=100363.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:21:17,436 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.095e+02 1.945e+02 2.468e+02 3.146e+02 6.937e+02, threshold=4.935e+02, percent-clipped=2.0 +2022-12-08 05:21:23,784 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100384.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:21:39,983 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=100402.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:21:44,076 INFO [train.py:873] (2/4) Epoch 14, batch 2100, loss[loss=0.1547, simple_loss=0.149, pruned_loss=0.08016, over 1291.00 frames. ], tot_loss[loss=0.1173, simple_loss=0.1502, pruned_loss=0.04218, over 1900688.84 frames. ], batch size: 100, lr: 5.69e-03, grad_scale: 8.0 +2022-12-08 05:22:34,535 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=100463.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 05:22:46,712 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.244e+02 2.182e+02 2.587e+02 3.120e+02 6.565e+02, threshold=5.173e+02, percent-clipped=5.0 +2022-12-08 05:23:13,307 INFO [train.py:873] (2/4) Epoch 14, batch 2200, loss[loss=0.1495, simple_loss=0.1433, pruned_loss=0.0778, over 1245.00 frames. ], tot_loss[loss=0.1178, simple_loss=0.1505, pruned_loss=0.0425, over 1950178.38 frames. ], batch size: 100, lr: 5.68e-03, grad_scale: 8.0 +2022-12-08 05:23:30,500 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0801, 3.8948, 3.5739, 3.6137, 4.0480, 4.0356, 4.1500, 4.1035], + device='cuda:2'), covar=tensor([0.1191, 0.0728, 0.2447, 0.3637, 0.0955, 0.1167, 0.1194, 0.1229], + device='cuda:2'), in_proj_covar=tensor([0.0380, 0.0269, 0.0441, 0.0559, 0.0340, 0.0439, 0.0382, 0.0379], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:23:47,417 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0177, 1.9714, 1.6493, 1.7331, 2.0059, 1.9896, 2.0169, 1.9607], + device='cuda:2'), covar=tensor([0.1446, 0.1139, 0.3384, 0.3582, 0.1528, 0.1731, 0.1898, 0.1590], + device='cuda:2'), in_proj_covar=tensor([0.0382, 0.0269, 0.0442, 0.0561, 0.0341, 0.0440, 0.0382, 0.0379], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:24:15,306 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.135e+02 2.000e+02 2.520e+02 3.271e+02 5.030e+02, threshold=5.040e+02, percent-clipped=0.0 +2022-12-08 05:24:30,064 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=100594.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:24:40,165 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.81 vs. limit=5.0 +2022-12-08 05:24:41,407 INFO [train.py:873] (2/4) Epoch 14, batch 2300, loss[loss=0.1225, simple_loss=0.1462, pruned_loss=0.0494, over 6908.00 frames. ], tot_loss[loss=0.1171, simple_loss=0.1498, pruned_loss=0.04217, over 1958594.93 frames. ], batch size: 100, lr: 5.68e-03, grad_scale: 4.0 +2022-12-08 05:25:05,974 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.63 vs. limit=2.0 +2022-12-08 05:25:31,003 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100663.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:25:44,077 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.238e+02 1.999e+02 2.526e+02 3.011e+02 4.450e+02, threshold=5.052e+02, percent-clipped=0.0 +2022-12-08 05:26:09,988 INFO [train.py:873] (2/4) Epoch 14, batch 2400, loss[loss=0.1984, simple_loss=0.1787, pruned_loss=0.1091, over 1267.00 frames. ], tot_loss[loss=0.1174, simple_loss=0.15, pruned_loss=0.04242, over 1989940.91 frames. ], batch size: 100, lr: 5.68e-03, grad_scale: 8.0 +2022-12-08 05:26:12,763 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3426, 4.4439, 4.7805, 3.9798, 4.5236, 4.6900, 1.9013, 4.3031], + device='cuda:2'), covar=tensor([0.0273, 0.0303, 0.0283, 0.0486, 0.0271, 0.0183, 0.2980, 0.0267], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0175, 0.0144, 0.0143, 0.0204, 0.0140, 0.0160, 0.0193], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 05:26:13,587 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100711.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:26:19,225 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-12-08 05:26:27,767 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1897, 2.0511, 2.3810, 1.4431, 1.6594, 2.2252, 1.3340, 2.1648], + device='cuda:2'), covar=tensor([0.0727, 0.1631, 0.0743, 0.2492, 0.2535, 0.0858, 0.3156, 0.0812], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0099, 0.0090, 0.0098, 0.0115, 0.0087, 0.0120, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 05:26:30,434 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=100730.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 05:26:55,101 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=100758.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:26:59,600 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1944, 1.9824, 2.2044, 2.3674, 1.9771, 1.9290, 2.2339, 2.2260], + device='cuda:2'), covar=tensor([0.0245, 0.0444, 0.0263, 0.0240, 0.0375, 0.0643, 0.0317, 0.0264], + device='cuda:2'), in_proj_covar=tensor([0.0287, 0.0254, 0.0369, 0.0327, 0.0266, 0.0301, 0.0304, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 05:27:12,429 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8947, 2.5497, 2.6639, 1.8210, 2.4406, 2.6338, 2.9596, 2.4454], + device='cuda:2'), covar=tensor([0.0736, 0.0887, 0.1003, 0.1538, 0.1024, 0.0810, 0.0589, 0.1240], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0175, 0.0137, 0.0125, 0.0138, 0.0150, 0.0127, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 05:27:13,004 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.419e+02 2.043e+02 2.633e+02 3.365e+02 7.380e+02, threshold=5.267e+02, percent-clipped=6.0 +2022-12-08 05:27:20,506 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8824, 0.7864, 0.8118, 0.6945, 0.7503, 0.4414, 0.7252, 0.6575], + device='cuda:2'), covar=tensor([0.0135, 0.0161, 0.0127, 0.0126, 0.0170, 0.0323, 0.0192, 0.0288], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0019, 0.0017, 0.0018, 0.0018, 0.0029, 0.0024, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 05:27:24,742 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=100791.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 05:27:30,542 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8349, 2.3305, 3.7137, 3.9584, 3.7438, 2.3671, 3.7894, 2.9016], + device='cuda:2'), covar=tensor([0.0405, 0.1056, 0.0839, 0.0437, 0.0432, 0.1490, 0.0385, 0.0915], + device='cuda:2'), in_proj_covar=tensor([0.0289, 0.0256, 0.0371, 0.0329, 0.0267, 0.0303, 0.0306, 0.0283], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 05:27:38,944 INFO [train.py:873] (2/4) Epoch 14, batch 2500, loss[loss=0.121, simple_loss=0.156, pruned_loss=0.04303, over 14449.00 frames. ], tot_loss[loss=0.1172, simple_loss=0.1499, pruned_loss=0.04224, over 2032376.62 frames. ], batch size: 51, lr: 5.67e-03, grad_scale: 8.0 +2022-12-08 05:28:01,712 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9278, 1.2676, 2.0326, 1.3203, 1.9847, 2.0993, 1.7690, 2.1600], + device='cuda:2'), covar=tensor([0.0318, 0.2037, 0.0471, 0.1790, 0.0572, 0.0494, 0.0980, 0.0326], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0157, 0.0160, 0.0169, 0.0169, 0.0178, 0.0133, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:28:41,716 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.248e+02 2.248e+02 2.659e+02 3.410e+02 6.665e+02, threshold=5.318e+02, percent-clipped=2.0 +2022-12-08 05:28:55,509 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=100894.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:29:06,707 INFO [train.py:873] (2/4) Epoch 14, batch 2600, loss[loss=0.1338, simple_loss=0.1644, pruned_loss=0.05155, over 9435.00 frames. ], tot_loss[loss=0.1178, simple_loss=0.1504, pruned_loss=0.04265, over 2047183.51 frames. ], batch size: 100, lr: 5.67e-03, grad_scale: 8.0 +2022-12-08 05:29:37,209 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=100942.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:30:08,943 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.168e+02 2.110e+02 2.658e+02 3.162e+02 7.088e+02, threshold=5.316e+02, percent-clipped=3.0 +2022-12-08 05:30:34,935 INFO [train.py:873] (2/4) Epoch 14, batch 2700, loss[loss=0.115, simple_loss=0.1409, pruned_loss=0.04457, over 5035.00 frames. ], tot_loss[loss=0.1175, simple_loss=0.15, pruned_loss=0.04252, over 2009910.78 frames. ], batch size: 100, lr: 5.67e-03, grad_scale: 8.0 +2022-12-08 05:31:15,094 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.83 vs. limit=2.0 +2022-12-08 05:31:20,789 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=101058.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:31:34,511 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9546, 3.4712, 2.8638, 3.4229, 2.4398, 3.4896, 3.1605, 1.7150], + device='cuda:2'), covar=tensor([0.1698, 0.0812, 0.1538, 0.0955, 0.1082, 0.0448, 0.1049, 0.2496], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0083, 0.0065, 0.0069, 0.0096, 0.0082, 0.0097, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 05:31:38,598 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.153e+02 2.205e+02 2.633e+02 3.607e+02 6.784e+02, threshold=5.265e+02, percent-clipped=5.0 +2022-12-08 05:31:45,674 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=101086.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:31:55,292 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.62 vs. limit=2.0 +2022-12-08 05:31:57,463 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2407, 1.6170, 1.7897, 1.7820, 1.6560, 1.7476, 1.4461, 1.2733], + device='cuda:2'), covar=tensor([0.1362, 0.1150, 0.0500, 0.0646, 0.1241, 0.0874, 0.1881, 0.2043], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0083, 0.0065, 0.0069, 0.0096, 0.0082, 0.0098, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 05:32:03,783 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=101106.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:32:04,578 INFO [train.py:873] (2/4) Epoch 14, batch 2800, loss[loss=0.1279, simple_loss=0.1575, pruned_loss=0.04912, over 14241.00 frames. ], tot_loss[loss=0.1186, simple_loss=0.1505, pruned_loss=0.04332, over 1926146.82 frames. ], batch size: 69, lr: 5.67e-03, grad_scale: 8.0 +2022-12-08 05:32:04,796 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=101107.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:32:58,708 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=101168.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:33:07,500 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.225e+02 2.080e+02 2.537e+02 3.233e+02 5.412e+02, threshold=5.073e+02, percent-clipped=2.0 +2022-12-08 05:33:33,045 INFO [train.py:873] (2/4) Epoch 14, batch 2900, loss[loss=0.1008, simple_loss=0.1318, pruned_loss=0.03491, over 5987.00 frames. ], tot_loss[loss=0.1197, simple_loss=0.1513, pruned_loss=0.04402, over 1861798.48 frames. ], batch size: 100, lr: 5.66e-03, grad_scale: 4.0 +2022-12-08 05:33:35,341 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.80 vs. limit=5.0 +2022-12-08 05:33:37,635 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9107, 2.4338, 3.7032, 2.7463, 3.6299, 3.6705, 3.5583, 3.1149], + device='cuda:2'), covar=tensor([0.0768, 0.2937, 0.1056, 0.2041, 0.1202, 0.0886, 0.1567, 0.1770], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0315, 0.0397, 0.0305, 0.0378, 0.0323, 0.0361, 0.0303], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:33:59,795 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5797, 3.2086, 3.0101, 2.1765, 2.9846, 3.3079, 3.4775, 2.7072], + device='cuda:2'), covar=tensor([0.0679, 0.1179, 0.0899, 0.1483, 0.1003, 0.0672, 0.0725, 0.1381], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0174, 0.0136, 0.0126, 0.0138, 0.0149, 0.0127, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0005, 0.0006], + device='cuda:2') +2022-12-08 05:34:36,774 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.297e+02 2.145e+02 2.615e+02 3.247e+02 7.058e+02, threshold=5.230e+02, percent-clipped=3.0 +2022-12-08 05:35:01,745 INFO [train.py:873] (2/4) Epoch 14, batch 3000, loss[loss=0.1381, simple_loss=0.1578, pruned_loss=0.05919, over 6951.00 frames. ], tot_loss[loss=0.1202, simple_loss=0.1513, pruned_loss=0.04459, over 1840134.29 frames. ], batch size: 100, lr: 5.66e-03, grad_scale: 4.0 +2022-12-08 05:35:01,745 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 05:35:10,212 INFO [train.py:905] (2/4) Epoch 14, validation: loss=0.134, simple_loss=0.1722, pruned_loss=0.04793, over 857387.00 frames. +2022-12-08 05:35:10,212 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 05:35:18,924 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([6.1781, 5.6554, 5.5959, 6.0734, 5.5928, 4.8941, 6.0484, 5.0099], + device='cuda:2'), covar=tensor([0.0298, 0.0778, 0.0320, 0.0513, 0.0849, 0.0299, 0.0487, 0.0462], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0275, 0.0192, 0.0192, 0.0183, 0.0153, 0.0283, 0.0168], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 05:36:12,836 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.532e+02 2.305e+02 3.036e+02 3.734e+02 6.961e+02, threshold=6.071e+02, percent-clipped=4.0 +2022-12-08 05:36:18,763 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=101386.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:36:28,725 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.95 vs. limit=5.0 +2022-12-08 05:36:37,210 INFO [train.py:873] (2/4) Epoch 14, batch 3100, loss[loss=0.1393, simple_loss=0.1374, pruned_loss=0.07063, over 1255.00 frames. ], tot_loss[loss=0.1191, simple_loss=0.1504, pruned_loss=0.04393, over 1827268.52 frames. ], batch size: 100, lr: 5.66e-03, grad_scale: 4.0 +2022-12-08 05:37:00,779 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=101434.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 05:37:26,092 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=101463.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:37:39,842 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.043e+01 2.198e+02 2.600e+02 3.209e+02 7.473e+02, threshold=5.200e+02, percent-clipped=3.0 +2022-12-08 05:37:39,977 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.5924, 5.0569, 4.9660, 5.5167, 5.1503, 4.6276, 5.5095, 4.4360], + device='cuda:2'), covar=tensor([0.0295, 0.0910, 0.0319, 0.0377, 0.0728, 0.0408, 0.0408, 0.0488], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0272, 0.0191, 0.0190, 0.0181, 0.0152, 0.0280, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 05:37:42,324 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.77 vs. limit=5.0 +2022-12-08 05:38:04,116 INFO [train.py:873] (2/4) Epoch 14, batch 3200, loss[loss=0.1685, simple_loss=0.1568, pruned_loss=0.09012, over 1239.00 frames. ], tot_loss[loss=0.1188, simple_loss=0.1507, pruned_loss=0.04351, over 1878028.78 frames. ], batch size: 100, lr: 5.65e-03, grad_scale: 8.0 +2022-12-08 05:39:08,338 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.211e+02 2.131e+02 2.492e+02 3.275e+02 6.158e+02, threshold=4.985e+02, percent-clipped=2.0 +2022-12-08 05:39:32,255 INFO [train.py:873] (2/4) Epoch 14, batch 3300, loss[loss=0.1481, simple_loss=0.1676, pruned_loss=0.06427, over 8560.00 frames. ], tot_loss[loss=0.1188, simple_loss=0.1509, pruned_loss=0.0434, over 1891788.48 frames. ], batch size: 100, lr: 5.65e-03, grad_scale: 4.0 +2022-12-08 05:39:54,507 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.27 vs. limit=2.0 +2022-12-08 05:40:36,072 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.167e+02 2.269e+02 2.791e+02 3.629e+02 1.218e+03, threshold=5.581e+02, percent-clipped=7.0 +2022-12-08 05:40:39,680 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=101684.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:40:45,481 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.21 vs. limit=5.0 +2022-12-08 05:40:46,370 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-12-08 05:40:59,540 INFO [train.py:873] (2/4) Epoch 14, batch 3400, loss[loss=0.1113, simple_loss=0.1458, pruned_loss=0.03841, over 11214.00 frames. ], tot_loss[loss=0.1183, simple_loss=0.1501, pruned_loss=0.04327, over 1922801.47 frames. ], batch size: 100, lr: 5.65e-03, grad_scale: 4.0 +2022-12-08 05:41:32,675 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=101745.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:41:48,192 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=101763.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:42:02,724 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.178e+02 2.274e+02 2.665e+02 3.242e+02 5.348e+02, threshold=5.329e+02, percent-clipped=0.0 +2022-12-08 05:42:11,129 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0102, 3.2847, 3.1077, 3.3353, 2.5214, 3.2498, 3.0103, 1.8048], + device='cuda:2'), covar=tensor([0.1720, 0.0741, 0.1216, 0.0617, 0.1059, 0.0629, 0.1131, 0.2427], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0084, 0.0066, 0.0071, 0.0096, 0.0083, 0.0098, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 05:42:26,266 INFO [train.py:873] (2/4) Epoch 14, batch 3500, loss[loss=0.1174, simple_loss=0.1386, pruned_loss=0.04814, over 6020.00 frames. ], tot_loss[loss=0.1179, simple_loss=0.1499, pruned_loss=0.04296, over 1920652.83 frames. ], batch size: 100, lr: 5.65e-03, grad_scale: 4.0 +2022-12-08 05:42:29,431 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=101811.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:42:33,276 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.60 vs. limit=2.0 +2022-12-08 05:42:54,245 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6070, 3.2537, 2.5884, 3.6956, 3.5848, 3.5577, 3.1056, 2.5754], + device='cuda:2'), covar=tensor([0.0801, 0.1352, 0.3012, 0.0634, 0.0835, 0.1266, 0.1362, 0.3065], + device='cuda:2'), in_proj_covar=tensor([0.0272, 0.0293, 0.0262, 0.0271, 0.0316, 0.0296, 0.0258, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:43:06,976 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6126, 1.7478, 4.2663, 2.1666, 4.2416, 4.5967, 4.0183, 5.0138], + device='cuda:2'), covar=tensor([0.0219, 0.3107, 0.0437, 0.2122, 0.0380, 0.0357, 0.0449, 0.0157], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0157, 0.0160, 0.0170, 0.0170, 0.0179, 0.0134, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:43:20,521 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2447, 2.1011, 2.1601, 2.2622, 2.1392, 2.1841, 2.3027, 1.9263], + device='cuda:2'), covar=tensor([0.0828, 0.1225, 0.0669, 0.0737, 0.0982, 0.0695, 0.0775, 0.0724], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0269, 0.0189, 0.0188, 0.0180, 0.0152, 0.0277, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 05:43:21,428 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-12-08 05:43:29,694 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.267e+02 2.210e+02 2.811e+02 3.369e+02 8.008e+02, threshold=5.622e+02, percent-clipped=4.0 +2022-12-08 05:43:34,337 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.17 vs. limit=2.0 +2022-12-08 05:43:53,155 INFO [train.py:873] (2/4) Epoch 14, batch 3600, loss[loss=0.08377, simple_loss=0.1264, pruned_loss=0.02055, over 14265.00 frames. ], tot_loss[loss=0.1177, simple_loss=0.15, pruned_loss=0.04273, over 1931139.85 frames. ], batch size: 35, lr: 5.64e-03, grad_scale: 8.0 +2022-12-08 05:44:05,122 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=101920.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:44:57,577 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.281e+02 2.121e+02 2.665e+02 3.565e+02 6.931e+02, threshold=5.329e+02, percent-clipped=3.0 +2022-12-08 05:44:58,630 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=101981.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 05:45:00,791 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 05:45:03,548 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=101987.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:45:21,976 INFO [train.py:873] (2/4) Epoch 14, batch 3700, loss[loss=0.1049, simple_loss=0.1449, pruned_loss=0.0325, over 14669.00 frames. ], tot_loss[loss=0.1172, simple_loss=0.1501, pruned_loss=0.04213, over 1991067.12 frames. ], batch size: 23, lr: 5.64e-03, grad_scale: 8.0 +2022-12-08 05:45:35,340 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2344, 2.0783, 3.1753, 3.3245, 3.1714, 2.2282, 3.1449, 2.4147], + device='cuda:2'), covar=tensor([0.0434, 0.1122, 0.0819, 0.0484, 0.0533, 0.1435, 0.0467, 0.1026], + device='cuda:2'), in_proj_covar=tensor([0.0291, 0.0257, 0.0374, 0.0329, 0.0269, 0.0303, 0.0305, 0.0283], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 05:45:51,392 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=102040.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:45:58,295 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=102048.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:46:02,156 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8812, 2.3690, 4.9043, 3.2924, 4.6053, 2.1315, 3.5511, 4.6709], + device='cuda:2'), covar=tensor([0.0493, 0.4429, 0.0301, 0.6683, 0.0579, 0.3825, 0.1346, 0.0317], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0205, 0.0210, 0.0277, 0.0229, 0.0209, 0.0204, 0.0213], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:46:08,850 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3406, 2.1867, 4.9186, 4.4758, 4.2894, 5.0384, 4.7960, 4.9877], + device='cuda:2'), covar=tensor([0.1411, 0.1419, 0.0115, 0.0201, 0.0229, 0.0123, 0.0129, 0.0129], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0157, 0.0128, 0.0167, 0.0144, 0.0138, 0.0121, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 05:46:26,402 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.122e+02 2.149e+02 2.707e+02 3.533e+02 7.252e+02, threshold=5.415e+02, percent-clipped=4.0 +2022-12-08 05:46:29,114 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7716, 0.8233, 0.6984, 0.8233, 0.7757, 0.3872, 0.7015, 0.8388], + device='cuda:2'), covar=tensor([0.0354, 0.0581, 0.0522, 0.0396, 0.0275, 0.0282, 0.0700, 0.0771], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0030, 0.0033, 0.0029, 0.0030, 0.0043, 0.0031, 0.0033], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 05:46:45,326 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-12-08 05:46:49,972 INFO [train.py:873] (2/4) Epoch 14, batch 3800, loss[loss=0.1148, simple_loss=0.1513, pruned_loss=0.03913, over 13893.00 frames. ], tot_loss[loss=0.1174, simple_loss=0.15, pruned_loss=0.04247, over 1930701.30 frames. ], batch size: 20, lr: 5.64e-03, grad_scale: 8.0 +2022-12-08 05:47:56,011 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.049e+02 2.208e+02 2.673e+02 3.606e+02 1.148e+03, threshold=5.346e+02, percent-clipped=3.0 +2022-12-08 05:48:19,968 INFO [train.py:873] (2/4) Epoch 14, batch 3900, loss[loss=0.1368, simple_loss=0.1406, pruned_loss=0.06654, over 2648.00 frames. ], tot_loss[loss=0.1161, simple_loss=0.1493, pruned_loss=0.04142, over 1989680.99 frames. ], batch size: 100, lr: 5.64e-03, grad_scale: 4.0 +2022-12-08 05:48:31,233 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-12-08 05:49:22,101 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=102276.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 05:49:26,436 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.176e+02 2.029e+02 2.446e+02 2.972e+02 7.280e+02, threshold=4.892e+02, percent-clipped=2.0 +2022-12-08 05:49:49,241 INFO [train.py:873] (2/4) Epoch 14, batch 4000, loss[loss=0.09768, simple_loss=0.1388, pruned_loss=0.02827, over 14539.00 frames. ], tot_loss[loss=0.1155, simple_loss=0.1489, pruned_loss=0.04103, over 1970232.21 frames. ], batch size: 49, lr: 5.63e-03, grad_scale: 8.0 +2022-12-08 05:50:08,493 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7287, 2.7496, 4.4405, 3.3390, 4.4929, 4.4854, 4.2643, 4.1151], + device='cuda:2'), covar=tensor([0.0582, 0.3238, 0.0977, 0.1720, 0.0627, 0.0795, 0.1682, 0.1402], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0313, 0.0393, 0.0300, 0.0370, 0.0322, 0.0363, 0.0299], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:50:18,937 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=102340.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:50:21,385 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=102343.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:50:49,682 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0235, 1.9319, 3.1611, 2.3749, 3.0199, 2.0077, 2.5095, 3.0644], + device='cuda:2'), covar=tensor([0.0903, 0.3888, 0.0715, 0.3966, 0.0801, 0.2831, 0.1256, 0.0623], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0206, 0.0212, 0.0279, 0.0229, 0.0209, 0.0205, 0.0213], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:50:54,070 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.83 vs. limit=5.0 +2022-12-08 05:50:55,302 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.337e+02 2.024e+02 2.365e+02 2.996e+02 7.946e+02, threshold=4.730e+02, percent-clipped=3.0 +2022-12-08 05:51:01,406 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=102388.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:51:18,485 INFO [train.py:873] (2/4) Epoch 14, batch 4100, loss[loss=0.1435, simple_loss=0.1695, pruned_loss=0.05874, over 14160.00 frames. ], tot_loss[loss=0.1174, simple_loss=0.1502, pruned_loss=0.0423, over 1952022.04 frames. ], batch size: 99, lr: 5.63e-03, grad_scale: 8.0 +2022-12-08 05:51:26,753 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8282, 1.7901, 2.1182, 1.9727, 1.7780, 1.6755, 1.6347, 1.3975], + device='cuda:2'), covar=tensor([0.0265, 0.0488, 0.0276, 0.0222, 0.0353, 0.0304, 0.0238, 0.0450], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0019, 0.0017, 0.0018, 0.0018, 0.0029, 0.0024, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 05:51:39,903 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6023, 3.7121, 3.8803, 3.5112, 3.7395, 3.6863, 1.5836, 3.5694], + device='cuda:2'), covar=tensor([0.0352, 0.0372, 0.0385, 0.0505, 0.0371, 0.0436, 0.3206, 0.0286], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0174, 0.0143, 0.0143, 0.0205, 0.0140, 0.0160, 0.0193], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 05:51:41,607 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9815, 1.8584, 2.0499, 2.1731, 2.1545, 1.8544, 1.7556, 1.7091], + device='cuda:2'), covar=tensor([0.0338, 0.0856, 0.0507, 0.0380, 0.0314, 0.0467, 0.0403, 0.0535], + device='cuda:2'), in_proj_covar=tensor([0.0019, 0.0019, 0.0017, 0.0018, 0.0018, 0.0029, 0.0024, 0.0029], + device='cuda:2'), out_proj_covar=tensor([0.0001, 0.0002, 0.0001, 0.0001, 0.0001, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 05:52:23,527 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.525e+01 2.118e+02 2.762e+02 3.343e+02 8.705e+02, threshold=5.523e+02, percent-clipped=3.0 +2022-12-08 05:52:28,157 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7302, 1.3359, 1.6993, 1.2855, 1.4765, 1.7581, 1.4710, 1.5348], + device='cuda:2'), covar=tensor([0.0871, 0.1101, 0.0843, 0.1004, 0.1631, 0.1062, 0.0977, 0.1887], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0177, 0.0137, 0.0127, 0.0140, 0.0152, 0.0129, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 05:52:46,042 INFO [train.py:873] (2/4) Epoch 14, batch 4200, loss[loss=0.111, simple_loss=0.1473, pruned_loss=0.03739, over 14286.00 frames. ], tot_loss[loss=0.1175, simple_loss=0.1503, pruned_loss=0.0423, over 1970322.31 frames. ], batch size: 31, lr: 5.63e-03, grad_scale: 8.0 +2022-12-08 05:52:47,038 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4143, 3.1756, 2.9490, 3.0954, 3.3421, 3.3639, 3.3824, 3.3531], + device='cuda:2'), covar=tensor([0.0949, 0.0797, 0.2482, 0.2958, 0.0887, 0.0996, 0.1183, 0.0952], + device='cuda:2'), in_proj_covar=tensor([0.0383, 0.0268, 0.0443, 0.0567, 0.0343, 0.0439, 0.0392, 0.0379], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 05:52:59,462 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4215, 1.0329, 1.2296, 0.8968, 1.1530, 1.4017, 1.0580, 1.0859], + device='cuda:2'), covar=tensor([0.0431, 0.0942, 0.0705, 0.0450, 0.0937, 0.0720, 0.0526, 0.1238], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0176, 0.0137, 0.0127, 0.0139, 0.0152, 0.0128, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 05:53:45,733 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=102576.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 05:53:49,866 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.062e+02 2.234e+02 2.939e+02 3.624e+02 9.310e+02, threshold=5.879e+02, percent-clipped=3.0 +2022-12-08 05:53:55,946 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8268, 1.9442, 2.0649, 1.4092, 1.4437, 1.9915, 1.3454, 1.8302], + device='cuda:2'), covar=tensor([0.1216, 0.1653, 0.0850, 0.2762, 0.2973, 0.0919, 0.3221, 0.1050], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0101, 0.0092, 0.0101, 0.0118, 0.0088, 0.0123, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 05:54:13,399 INFO [train.py:873] (2/4) Epoch 14, batch 4300, loss[loss=0.08608, simple_loss=0.1393, pruned_loss=0.01645, over 14002.00 frames. ], tot_loss[loss=0.1187, simple_loss=0.151, pruned_loss=0.04323, over 1960292.93 frames. ], batch size: 26, lr: 5.62e-03, grad_scale: 8.0 +2022-12-08 05:54:27,874 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=102624.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 05:54:34,159 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-12-08 05:54:43,666 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1803, 3.5700, 3.4130, 3.4503, 2.5714, 3.4091, 3.2888, 1.8269], + device='cuda:2'), covar=tensor([0.1535, 0.0761, 0.1324, 0.0721, 0.1042, 0.0688, 0.0933, 0.2409], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0086, 0.0068, 0.0072, 0.0097, 0.0085, 0.0099, 0.0100], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 05:54:44,478 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=102643.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:55:16,065 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.389e+01 1.915e+02 2.366e+02 2.869e+02 5.820e+02, threshold=4.732e+02, percent-clipped=0.0 +2022-12-08 05:55:25,245 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=102691.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:55:38,798 INFO [train.py:873] (2/4) Epoch 14, batch 4400, loss[loss=0.1528, simple_loss=0.1742, pruned_loss=0.06566, over 8575.00 frames. ], tot_loss[loss=0.1182, simple_loss=0.1509, pruned_loss=0.04274, over 1990594.95 frames. ], batch size: 100, lr: 5.62e-03, grad_scale: 8.0 +2022-12-08 05:56:10,086 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8926, 1.7904, 1.8397, 1.8864, 1.8515, 1.2458, 1.7279, 1.9164], + device='cuda:2'), covar=tensor([0.0760, 0.1200, 0.0961, 0.1194, 0.1255, 0.0873, 0.0806, 0.0920], + device='cuda:2'), in_proj_covar=tensor([0.0030, 0.0030, 0.0033, 0.0028, 0.0030, 0.0042, 0.0031, 0.0033], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 05:56:37,329 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3615, 3.2211, 3.1091, 3.4679, 3.0446, 2.8445, 3.3834, 3.3313], + device='cuda:2'), covar=tensor([0.0650, 0.1065, 0.0961, 0.0764, 0.0999, 0.0948, 0.0720, 0.0871], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0134, 0.0140, 0.0153, 0.0141, 0.0119, 0.0160, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 05:56:44,117 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.277e+02 2.029e+02 2.612e+02 3.153e+02 6.081e+02, threshold=5.224e+02, percent-clipped=1.0 +2022-12-08 05:57:07,620 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.34 vs. limit=2.0 +2022-12-08 05:57:07,856 INFO [train.py:873] (2/4) Epoch 14, batch 4500, loss[loss=0.1196, simple_loss=0.162, pruned_loss=0.03857, over 14388.00 frames. ], tot_loss[loss=0.1165, simple_loss=0.1497, pruned_loss=0.04162, over 1999734.91 frames. ], batch size: 53, lr: 5.62e-03, grad_scale: 8.0 +2022-12-08 05:57:16,815 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5442, 2.0508, 2.5732, 2.6156, 2.4736, 2.0475, 2.6313, 2.1992], + device='cuda:2'), covar=tensor([0.0399, 0.0892, 0.0527, 0.0403, 0.0535, 0.1051, 0.0421, 0.0615], + device='cuda:2'), in_proj_covar=tensor([0.0287, 0.0254, 0.0369, 0.0323, 0.0266, 0.0300, 0.0304, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 05:57:18,439 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0054, 2.1252, 1.9761, 2.2221, 1.8210, 1.9414, 2.0972, 2.1131], + device='cuda:2'), covar=tensor([0.1004, 0.1180, 0.1241, 0.0923, 0.1309, 0.0937, 0.1008, 0.1038], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0135, 0.0140, 0.0153, 0.0142, 0.0119, 0.0161, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 05:57:46,981 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=102851.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:58:12,667 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.348e+02 2.344e+02 2.809e+02 3.466e+02 7.090e+02, threshold=5.617e+02, percent-clipped=4.0 +2022-12-08 05:58:18,961 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=102887.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:58:26,277 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8070, 2.5120, 3.4379, 2.4483, 2.1048, 3.2649, 1.8045, 3.1415], + device='cuda:2'), covar=tensor([0.1617, 0.1448, 0.0701, 0.2099, 0.2525, 0.0781, 0.3333, 0.0969], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0101, 0.0092, 0.0100, 0.0117, 0.0088, 0.0122, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 05:58:36,493 INFO [train.py:873] (2/4) Epoch 14, batch 4600, loss[loss=0.1546, simple_loss=0.147, pruned_loss=0.08114, over 1221.00 frames. ], tot_loss[loss=0.1163, simple_loss=0.1497, pruned_loss=0.04148, over 1964978.16 frames. ], batch size: 100, lr: 5.62e-03, grad_scale: 8.0 +2022-12-08 05:58:40,671 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=102912.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:59:12,746 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=102948.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 05:59:41,555 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.429e+02 1.979e+02 2.486e+02 3.145e+02 5.919e+02, threshold=4.973e+02, percent-clipped=1.0 +2022-12-08 05:59:42,571 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3822, 3.9528, 3.6520, 3.6284, 2.4908, 3.8765, 3.6071, 2.0710], + device='cuda:2'), covar=tensor([0.1645, 0.0488, 0.1221, 0.0791, 0.1200, 0.0480, 0.0995, 0.2208], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0085, 0.0067, 0.0072, 0.0096, 0.0083, 0.0098, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:00:04,664 INFO [train.py:873] (2/4) Epoch 14, batch 4700, loss[loss=0.1092, simple_loss=0.1468, pruned_loss=0.03581, over 14311.00 frames. ], tot_loss[loss=0.1161, simple_loss=0.1496, pruned_loss=0.04123, over 2009469.05 frames. ], batch size: 31, lr: 5.61e-03, grad_scale: 8.0 +2022-12-08 06:00:29,752 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1211, 3.1405, 3.3060, 3.1395, 3.2321, 2.8551, 1.5493, 3.0371], + device='cuda:2'), covar=tensor([0.0399, 0.0399, 0.0404, 0.0407, 0.0368, 0.0911, 0.2793, 0.0338], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0173, 0.0143, 0.0141, 0.0204, 0.0139, 0.0158, 0.0191], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 06:01:01,905 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-08 06:01:10,017 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.112e+02 2.100e+02 2.622e+02 3.309e+02 6.848e+02, threshold=5.244e+02, percent-clipped=4.0 +2022-12-08 06:01:25,786 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103098.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:01:33,434 INFO [train.py:873] (2/4) Epoch 14, batch 4800, loss[loss=0.1411, simple_loss=0.1399, pruned_loss=0.07117, over 2559.00 frames. ], tot_loss[loss=0.1158, simple_loss=0.1492, pruned_loss=0.04123, over 2008095.79 frames. ], batch size: 100, lr: 5.61e-03, grad_scale: 8.0 +2022-12-08 06:01:40,059 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1320, 2.0778, 4.1821, 2.9286, 3.9821, 1.9894, 3.0637, 3.9783], + device='cuda:2'), covar=tensor([0.0870, 0.4391, 0.0498, 0.5859, 0.0837, 0.3790, 0.1402, 0.0531], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0206, 0.0212, 0.0277, 0.0229, 0.0211, 0.0205, 0.0213], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:01:55,882 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103132.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:02:19,494 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103159.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:02:28,051 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2267, 2.6552, 3.9666, 2.9659, 4.0451, 3.7911, 3.7913, 3.2924], + device='cuda:2'), covar=tensor([0.0754, 0.3073, 0.1138, 0.1924, 0.0895, 0.1109, 0.1664, 0.1952], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0315, 0.0396, 0.0300, 0.0374, 0.0322, 0.0366, 0.0303], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:02:39,048 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.284e+02 2.240e+02 2.787e+02 3.398e+02 6.860e+02, threshold=5.574e+02, percent-clipped=4.0 +2022-12-08 06:02:49,669 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103193.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:03:02,608 INFO [train.py:873] (2/4) Epoch 14, batch 4900, loss[loss=0.1055, simple_loss=0.1541, pruned_loss=0.02847, over 14555.00 frames. ], tot_loss[loss=0.1166, simple_loss=0.1497, pruned_loss=0.04174, over 2042285.81 frames. ], batch size: 43, lr: 5.61e-03, grad_scale: 8.0 +2022-12-08 06:03:02,700 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103207.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:03:33,987 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103243.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:04:06,509 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.337e+02 2.061e+02 2.525e+02 3.223e+02 5.765e+02, threshold=5.049e+02, percent-clipped=2.0 +2022-12-08 06:04:29,070 INFO [train.py:873] (2/4) Epoch 14, batch 5000, loss[loss=0.122, simple_loss=0.1568, pruned_loss=0.04361, over 14240.00 frames. ], tot_loss[loss=0.1169, simple_loss=0.1502, pruned_loss=0.0418, over 2038791.28 frames. ], batch size: 76, lr: 5.61e-03, grad_scale: 8.0 +2022-12-08 06:04:56,010 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-08 06:05:05,904 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8720, 3.0381, 4.7205, 3.1366, 4.6550, 4.6046, 4.3834, 3.8992], + device='cuda:2'), covar=tensor([0.0671, 0.3239, 0.0783, 0.2166, 0.0731, 0.0799, 0.1548, 0.2219], + device='cuda:2'), in_proj_covar=tensor([0.0355, 0.0316, 0.0398, 0.0303, 0.0375, 0.0325, 0.0367, 0.0302], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:05:20,836 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103366.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 06:05:33,575 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.214e+02 2.256e+02 2.699e+02 3.304e+02 5.196e+02, threshold=5.398e+02, percent-clipped=1.0 +2022-12-08 06:05:56,321 INFO [train.py:873] (2/4) Epoch 14, batch 5100, loss[loss=0.1279, simple_loss=0.1419, pruned_loss=0.05693, over 4942.00 frames. ], tot_loss[loss=0.117, simple_loss=0.1502, pruned_loss=0.04185, over 2057727.62 frames. ], batch size: 100, lr: 5.60e-03, grad_scale: 4.0 +2022-12-08 06:06:13,827 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103427.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 06:06:37,911 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103454.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:06:53,002 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8912, 4.4456, 4.3457, 4.8352, 4.4472, 4.2695, 4.8086, 4.1668], + device='cuda:2'), covar=tensor([0.0407, 0.1133, 0.0427, 0.0448, 0.0918, 0.0660, 0.0543, 0.0486], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0270, 0.0191, 0.0192, 0.0183, 0.0155, 0.0282, 0.0167], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 06:07:01,407 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.337e+02 2.364e+02 2.994e+02 3.604e+02 8.056e+02, threshold=5.989e+02, percent-clipped=5.0 +2022-12-08 06:07:06,915 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103488.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:07:23,314 INFO [train.py:873] (2/4) Epoch 14, batch 5200, loss[loss=0.1056, simple_loss=0.1449, pruned_loss=0.03317, over 14457.00 frames. ], tot_loss[loss=0.1172, simple_loss=0.1506, pruned_loss=0.04189, over 2073477.74 frames. ], batch size: 51, lr: 5.60e-03, grad_scale: 8.0 +2022-12-08 06:07:23,439 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=103507.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:07:47,468 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-12-08 06:07:55,045 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=103543.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:08:04,930 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=103555.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:08:28,605 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.299e+02 2.069e+02 2.548e+02 2.976e+02 7.652e+02, threshold=5.095e+02, percent-clipped=1.0 +2022-12-08 06:08:36,187 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=103591.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:08:50,464 INFO [train.py:873] (2/4) Epoch 14, batch 5300, loss[loss=0.125, simple_loss=0.1487, pruned_loss=0.0507, over 6003.00 frames. ], tot_loss[loss=0.1164, simple_loss=0.1498, pruned_loss=0.04152, over 2018422.59 frames. ], batch size: 100, lr: 5.60e-03, grad_scale: 8.0 +2022-12-08 06:09:55,896 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.249e+02 2.008e+02 2.559e+02 3.052e+02 6.405e+02, threshold=5.118e+02, percent-clipped=1.0 +2022-12-08 06:10:17,829 INFO [train.py:873] (2/4) Epoch 14, batch 5400, loss[loss=0.1038, simple_loss=0.1465, pruned_loss=0.03057, over 13869.00 frames. ], tot_loss[loss=0.1158, simple_loss=0.1494, pruned_loss=0.04107, over 1970183.29 frames. ], batch size: 23, lr: 5.59e-03, grad_scale: 4.0 +2022-12-08 06:10:26,173 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 06:10:30,740 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=103722.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 06:10:47,331 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103740.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:10:47,583 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-12-08 06:10:59,280 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=103754.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:11:09,135 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103765.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:11:25,002 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.416e+02 2.268e+02 2.622e+02 3.465e+02 5.931e+02, threshold=5.245e+02, percent-clipped=4.0 +2022-12-08 06:11:29,613 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=103788.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:11:29,651 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1036, 2.6661, 2.7782, 1.8893, 2.5984, 2.7735, 3.1267, 2.3859], + device='cuda:2'), covar=tensor([0.0688, 0.0903, 0.0918, 0.1567, 0.1040, 0.0752, 0.0662, 0.1511], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0177, 0.0138, 0.0127, 0.0140, 0.0152, 0.0128, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:11:41,181 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103801.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:11:41,966 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=103802.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:11:46,664 INFO [train.py:873] (2/4) Epoch 14, batch 5500, loss[loss=0.1119, simple_loss=0.1165, pruned_loss=0.05367, over 2568.00 frames. ], tot_loss[loss=0.1159, simple_loss=0.1495, pruned_loss=0.04117, over 1963113.59 frames. ], batch size: 100, lr: 5.59e-03, grad_scale: 4.0 +2022-12-08 06:11:58,806 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=10.12 vs. limit=5.0 +2022-12-08 06:12:03,742 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103826.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:12:12,182 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=103836.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:12:34,515 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8665, 2.7198, 2.7318, 2.9017, 2.7448, 2.8777, 2.9907, 2.4574], + device='cuda:2'), covar=tensor([0.0802, 0.1048, 0.0626, 0.0637, 0.0945, 0.0548, 0.0657, 0.0655], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0267, 0.0190, 0.0189, 0.0183, 0.0154, 0.0279, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 06:12:47,299 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103876.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:12:49,124 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3902, 2.5849, 4.3274, 4.5413, 4.3802, 2.4982, 4.4620, 3.3820], + device='cuda:2'), covar=tensor([0.0357, 0.0988, 0.0964, 0.0342, 0.0352, 0.1667, 0.0393, 0.0852], + device='cuda:2'), in_proj_covar=tensor([0.0290, 0.0254, 0.0370, 0.0326, 0.0267, 0.0301, 0.0304, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 06:12:53,008 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.081e+02 2.148e+02 2.635e+02 3.186e+02 7.478e+02, threshold=5.269e+02, percent-clipped=1.0 +2022-12-08 06:13:00,163 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3160, 5.1564, 4.8903, 5.3055, 4.9060, 4.7775, 5.3676, 5.1132], + device='cuda:2'), covar=tensor([0.0663, 0.0722, 0.0785, 0.0537, 0.0831, 0.0475, 0.0598, 0.0678], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0139, 0.0144, 0.0159, 0.0146, 0.0122, 0.0167, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 06:13:14,657 INFO [train.py:873] (2/4) Epoch 14, batch 5600, loss[loss=0.1439, simple_loss=0.1449, pruned_loss=0.0715, over 1308.00 frames. ], tot_loss[loss=0.1167, simple_loss=0.15, pruned_loss=0.04168, over 1919507.22 frames. ], batch size: 100, lr: 5.59e-03, grad_scale: 8.0 +2022-12-08 06:13:20,224 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.07 vs. limit=5.0 +2022-12-08 06:13:21,580 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.9892, 5.3151, 5.3760, 5.8564, 5.4643, 4.8249, 5.8565, 4.7943], + device='cuda:2'), covar=tensor([0.0338, 0.1158, 0.0283, 0.0476, 0.0843, 0.0377, 0.0474, 0.0508], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0267, 0.0190, 0.0189, 0.0182, 0.0153, 0.0279, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 06:13:41,238 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=103937.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 06:13:53,413 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.09 vs. limit=2.0 +2022-12-08 06:14:21,889 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.081e+02 2.071e+02 2.747e+02 3.245e+02 6.809e+02, threshold=5.494e+02, percent-clipped=2.0 +2022-12-08 06:14:27,033 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103990.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:14:33,017 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=103997.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:14:42,175 INFO [train.py:873] (2/4) Epoch 14, batch 5700, loss[loss=0.1753, simple_loss=0.1538, pruned_loss=0.09846, over 1253.00 frames. ], tot_loss[loss=0.1174, simple_loss=0.15, pruned_loss=0.04238, over 1907424.60 frames. ], batch size: 100, lr: 5.59e-03, grad_scale: 4.0 +2022-12-08 06:14:55,474 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104022.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 06:15:02,136 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4958, 4.2684, 4.1506, 4.5179, 4.0420, 3.8796, 4.5315, 4.3628], + device='cuda:2'), covar=tensor([0.0559, 0.0765, 0.0697, 0.0525, 0.0812, 0.0586, 0.0514, 0.0647], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0138, 0.0143, 0.0157, 0.0144, 0.0121, 0.0166, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 06:15:18,592 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=104049.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:15:20,320 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=104051.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:15:23,247 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2992, 3.7894, 2.9648, 4.5236, 4.1885, 4.4218, 3.8449, 3.0701], + device='cuda:2'), covar=tensor([0.0657, 0.1146, 0.3326, 0.0560, 0.1028, 0.0995, 0.1068, 0.2958], + device='cuda:2'), in_proj_covar=tensor([0.0275, 0.0292, 0.0264, 0.0274, 0.0320, 0.0298, 0.0255, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:15:26,884 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=104058.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:15:36,678 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104070.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 06:15:49,005 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.572e+02 2.112e+02 2.664e+02 3.363e+02 7.605e+02, threshold=5.327e+02, percent-clipped=4.0 +2022-12-08 06:15:54,281 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3209, 2.3594, 1.9322, 2.3260, 2.2573, 2.3117, 2.1609, 2.0503], + device='cuda:2'), covar=tensor([0.1032, 0.0756, 0.2002, 0.0780, 0.1302, 0.0662, 0.1344, 0.1310], + device='cuda:2'), in_proj_covar=tensor([0.0275, 0.0293, 0.0265, 0.0275, 0.0321, 0.0298, 0.0257, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:15:55,803 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3639, 2.3805, 2.0107, 2.3266, 2.2361, 2.2917, 2.1778, 2.0379], + device='cuda:2'), covar=tensor([0.1074, 0.0886, 0.1920, 0.0936, 0.1516, 0.0675, 0.1548, 0.1410], + device='cuda:2'), in_proj_covar=tensor([0.0275, 0.0293, 0.0265, 0.0275, 0.0321, 0.0298, 0.0256, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:15:59,742 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104096.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:16:08,847 INFO [train.py:873] (2/4) Epoch 14, batch 5800, loss[loss=0.09911, simple_loss=0.1402, pruned_loss=0.02903, over 14215.00 frames. ], tot_loss[loss=0.1176, simple_loss=0.1499, pruned_loss=0.04268, over 1861877.96 frames. ], batch size: 37, lr: 5.58e-03, grad_scale: 4.0 +2022-12-08 06:16:11,443 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=104110.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:16:20,701 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104121.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:16:37,989 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-08 06:16:43,314 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8370, 1.4759, 3.8066, 1.7658, 3.7713, 3.9306, 2.8856, 4.2411], + device='cuda:2'), covar=tensor([0.0213, 0.3141, 0.0383, 0.2132, 0.0542, 0.0367, 0.0753, 0.0169], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0156, 0.0159, 0.0169, 0.0168, 0.0178, 0.0132, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:17:04,351 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.51 vs. limit=2.0 +2022-12-08 06:17:16,434 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.093e+02 2.209e+02 2.706e+02 3.376e+02 1.574e+03, threshold=5.411e+02, percent-clipped=5.0 +2022-12-08 06:17:36,929 INFO [train.py:873] (2/4) Epoch 14, batch 5900, loss[loss=0.148, simple_loss=0.134, pruned_loss=0.08105, over 1372.00 frames. ], tot_loss[loss=0.1171, simple_loss=0.1496, pruned_loss=0.04227, over 1902245.69 frames. ], batch size: 100, lr: 5.58e-03, grad_scale: 4.0 +2022-12-08 06:17:58,497 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104232.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 06:18:12,727 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6662, 1.9592, 3.7210, 2.6215, 3.6405, 2.0072, 2.7627, 3.5941], + device='cuda:2'), covar=tensor([0.0741, 0.4536, 0.0552, 0.5701, 0.0758, 0.3258, 0.1450, 0.0591], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0206, 0.0210, 0.0277, 0.0228, 0.0210, 0.0205, 0.0216], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:18:43,106 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.381e+02 2.217e+02 2.661e+02 3.155e+02 4.728e+02, threshold=5.321e+02, percent-clipped=0.0 +2022-12-08 06:19:04,081 INFO [train.py:873] (2/4) Epoch 14, batch 6000, loss[loss=0.09311, simple_loss=0.1333, pruned_loss=0.02645, over 13944.00 frames. ], tot_loss[loss=0.1174, simple_loss=0.1497, pruned_loss=0.04258, over 1907633.44 frames. ], batch size: 20, lr: 5.58e-03, grad_scale: 8.0 +2022-12-08 06:19:04,081 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 06:19:12,427 INFO [train.py:905] (2/4) Epoch 14, validation: loss=0.1346, simple_loss=0.1729, pruned_loss=0.04809, over 857387.00 frames. +2022-12-08 06:19:12,428 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 06:19:40,788 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.93 vs. limit=2.0 +2022-12-08 06:19:45,967 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104346.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:19:52,299 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104353.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:20:14,448 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.9711, 5.7593, 5.4917, 6.0227, 5.5690, 5.3956, 6.0418, 5.8575], + device='cuda:2'), covar=tensor([0.0530, 0.0534, 0.0673, 0.0368, 0.0587, 0.0366, 0.0462, 0.0430], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0137, 0.0141, 0.0156, 0.0142, 0.0120, 0.0165, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 06:20:18,638 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.302e+02 2.150e+02 2.718e+02 3.385e+02 8.730e+02, threshold=5.435e+02, percent-clipped=4.0 +2022-12-08 06:20:21,934 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6821, 2.3117, 2.5891, 1.7231, 2.2830, 2.5398, 2.7224, 2.3248], + device='cuda:2'), covar=tensor([0.0792, 0.0855, 0.0884, 0.1584, 0.1183, 0.0654, 0.0742, 0.1349], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0177, 0.0137, 0.0127, 0.0141, 0.0152, 0.0129, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:20:29,321 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104396.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:20:37,605 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=104405.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:20:39,258 INFO [train.py:873] (2/4) Epoch 14, batch 6100, loss[loss=0.09405, simple_loss=0.1375, pruned_loss=0.02529, over 13925.00 frames. ], tot_loss[loss=0.1169, simple_loss=0.1498, pruned_loss=0.042, over 1949937.90 frames. ], batch size: 23, lr: 5.58e-03, grad_scale: 8.0 +2022-12-08 06:20:51,229 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104421.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:21:11,707 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104444.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:21:32,841 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104469.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:21:46,358 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.494e+02 2.353e+02 3.023e+02 3.584e+02 7.529e+02, threshold=6.047e+02, percent-clipped=6.0 +2022-12-08 06:22:05,691 INFO [train.py:873] (2/4) Epoch 14, batch 6200, loss[loss=0.134, simple_loss=0.1541, pruned_loss=0.05699, over 9478.00 frames. ], tot_loss[loss=0.1176, simple_loss=0.1502, pruned_loss=0.04255, over 1888931.38 frames. ], batch size: 100, lr: 5.57e-03, grad_scale: 8.0 +2022-12-08 06:22:24,129 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.79 vs. limit=2.0 +2022-12-08 06:22:27,808 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104532.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 06:22:29,322 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0256, 3.7877, 3.6876, 4.0895, 3.7012, 3.4475, 4.0960, 3.9469], + device='cuda:2'), covar=tensor([0.0657, 0.0821, 0.0845, 0.0643, 0.0797, 0.0793, 0.0617, 0.0680], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0136, 0.0140, 0.0155, 0.0141, 0.0120, 0.0163, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 06:23:06,661 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9760, 1.9765, 1.4730, 1.5479, 1.9280, 1.9945, 1.9713, 1.9107], + device='cuda:2'), covar=tensor([0.1632, 0.0915, 0.3857, 0.3943, 0.1739, 0.1549, 0.2203, 0.1793], + device='cuda:2'), in_proj_covar=tensor([0.0383, 0.0266, 0.0449, 0.0569, 0.0344, 0.0442, 0.0393, 0.0385], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:23:09,248 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104580.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:23:12,641 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.150e+02 2.145e+02 2.527e+02 3.084e+02 9.249e+02, threshold=5.054e+02, percent-clipped=2.0 +2022-12-08 06:23:22,798 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5921, 2.2469, 2.4855, 1.6231, 2.1886, 2.4696, 2.5915, 2.2213], + device='cuda:2'), covar=tensor([0.0886, 0.0869, 0.1022, 0.1495, 0.1229, 0.0813, 0.0776, 0.1450], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0177, 0.0137, 0.0127, 0.0141, 0.0152, 0.0129, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:23:33,265 INFO [train.py:873] (2/4) Epoch 14, batch 6300, loss[loss=0.1372, simple_loss=0.1359, pruned_loss=0.0692, over 2601.00 frames. ], tot_loss[loss=0.116, simple_loss=0.1493, pruned_loss=0.04133, over 1901758.17 frames. ], batch size: 100, lr: 5.57e-03, grad_scale: 8.0 +2022-12-08 06:24:07,330 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104646.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:24:07,356 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4265, 1.1179, 1.2101, 0.8640, 1.0779, 1.4211, 0.9743, 1.1254], + device='cuda:2'), covar=tensor([0.0385, 0.0648, 0.0720, 0.0426, 0.0950, 0.0686, 0.0515, 0.1010], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0175, 0.0136, 0.0126, 0.0139, 0.0151, 0.0127, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:24:13,599 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104653.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:24:40,916 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.535e+01 2.052e+02 2.556e+02 3.229e+02 6.200e+02, threshold=5.113e+02, percent-clipped=1.0 +2022-12-08 06:24:49,223 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104694.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:24:55,254 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104701.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:24:58,744 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=104705.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:25:00,317 INFO [train.py:873] (2/4) Epoch 14, batch 6400, loss[loss=0.1158, simple_loss=0.1324, pruned_loss=0.04962, over 3870.00 frames. ], tot_loss[loss=0.1151, simple_loss=0.1487, pruned_loss=0.04074, over 1965102.39 frames. ], batch size: 100, lr: 5.57e-03, grad_scale: 8.0 +2022-12-08 06:25:10,729 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3795, 3.9533, 3.0883, 4.6228, 4.2117, 4.4617, 3.9598, 3.1247], + device='cuda:2'), covar=tensor([0.0789, 0.1119, 0.3405, 0.0827, 0.1048, 0.1211, 0.1073, 0.3278], + device='cuda:2'), in_proj_covar=tensor([0.0276, 0.0293, 0.0264, 0.0276, 0.0321, 0.0301, 0.0257, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:25:26,302 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0833, 4.9832, 4.7361, 5.1281, 4.7908, 4.5887, 5.2198, 4.8754], + device='cuda:2'), covar=tensor([0.0721, 0.0750, 0.0748, 0.0610, 0.0840, 0.0545, 0.0560, 0.0691], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0138, 0.0142, 0.0157, 0.0144, 0.0121, 0.0165, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 06:25:40,646 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=104753.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:26:07,308 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.338e+02 2.032e+02 2.617e+02 3.308e+02 7.760e+02, threshold=5.234e+02, percent-clipped=3.0 +2022-12-08 06:26:27,855 INFO [train.py:873] (2/4) Epoch 14, batch 6500, loss[loss=0.1218, simple_loss=0.1481, pruned_loss=0.04772, over 4997.00 frames. ], tot_loss[loss=0.1161, simple_loss=0.1493, pruned_loss=0.04138, over 1967253.07 frames. ], batch size: 100, lr: 5.57e-03, grad_scale: 8.0 +2022-12-08 06:27:28,480 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4435, 3.9111, 3.1049, 4.7335, 4.2640, 4.4802, 4.0180, 3.1720], + device='cuda:2'), covar=tensor([0.1088, 0.1194, 0.3405, 0.0474, 0.0897, 0.1188, 0.1021, 0.3214], + device='cuda:2'), in_proj_covar=tensor([0.0275, 0.0292, 0.0262, 0.0275, 0.0321, 0.0299, 0.0255, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:27:35,547 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.169e+02 2.183e+02 2.623e+02 3.627e+02 5.274e+02, threshold=5.247e+02, percent-clipped=1.0 +2022-12-08 06:27:54,697 INFO [train.py:873] (2/4) Epoch 14, batch 6600, loss[loss=0.1145, simple_loss=0.1503, pruned_loss=0.03937, over 14195.00 frames. ], tot_loss[loss=0.1162, simple_loss=0.1491, pruned_loss=0.04167, over 1986550.52 frames. ], batch size: 46, lr: 5.56e-03, grad_scale: 8.0 +2022-12-08 06:28:43,000 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1586, 3.5121, 3.1501, 3.3095, 2.6145, 3.4342, 3.2017, 1.7748], + device='cuda:2'), covar=tensor([0.1820, 0.0693, 0.1306, 0.0712, 0.1007, 0.0588, 0.1154, 0.2329], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0085, 0.0067, 0.0071, 0.0097, 0.0083, 0.0098, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:28:58,170 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-12-08 06:28:58,842 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1641, 2.9586, 2.3745, 3.2428, 3.1040, 3.1531, 2.7068, 2.3235], + device='cuda:2'), covar=tensor([0.0810, 0.1535, 0.3061, 0.0708, 0.1028, 0.0980, 0.1484, 0.2940], + device='cuda:2'), in_proj_covar=tensor([0.0274, 0.0291, 0.0261, 0.0274, 0.0319, 0.0297, 0.0253, 0.0244], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:29:02,029 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.045e+02 2.257e+02 2.787e+02 3.469e+02 5.148e+02, threshold=5.574e+02, percent-clipped=0.0 +2022-12-08 06:29:27,172 INFO [train.py:873] (2/4) Epoch 14, batch 6700, loss[loss=0.1195, simple_loss=0.1484, pruned_loss=0.04526, over 14313.00 frames. ], tot_loss[loss=0.116, simple_loss=0.1491, pruned_loss=0.04148, over 2019775.03 frames. ], batch size: 46, lr: 5.56e-03, grad_scale: 8.0 +2022-12-08 06:29:40,161 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8456, 3.5361, 3.3161, 2.6463, 3.2908, 3.5384, 3.8175, 3.1797], + device='cuda:2'), covar=tensor([0.0585, 0.1488, 0.0928, 0.1390, 0.0835, 0.0645, 0.0777, 0.1092], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0175, 0.0137, 0.0126, 0.0140, 0.0151, 0.0128, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:30:35,608 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.155e+02 2.089e+02 2.481e+02 3.177e+02 6.321e+02, threshold=4.962e+02, percent-clipped=1.0 +2022-12-08 06:30:54,205 INFO [train.py:873] (2/4) Epoch 14, batch 6800, loss[loss=0.109, simple_loss=0.1461, pruned_loss=0.03595, over 14264.00 frames. ], tot_loss[loss=0.1158, simple_loss=0.1491, pruned_loss=0.04121, over 2025109.66 frames. ], batch size: 57, lr: 5.56e-03, grad_scale: 8.0 +2022-12-08 06:31:04,225 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0837, 1.9417, 4.5913, 4.2665, 4.1441, 4.6437, 4.3396, 4.6627], + device='cuda:2'), covar=tensor([0.1365, 0.1404, 0.0103, 0.0166, 0.0210, 0.0108, 0.0130, 0.0118], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0157, 0.0128, 0.0168, 0.0145, 0.0139, 0.0122, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 06:31:41,731 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4882, 1.1047, 2.0271, 1.7963, 1.8401, 2.0695, 1.4110, 2.0884], + device='cuda:2'), covar=tensor([0.0891, 0.1423, 0.0272, 0.0613, 0.0750, 0.0321, 0.0748, 0.0295], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0158, 0.0129, 0.0169, 0.0146, 0.0140, 0.0123, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 06:32:01,691 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.287e+02 2.161e+02 2.579e+02 3.505e+02 6.694e+02, threshold=5.159e+02, percent-clipped=6.0 +2022-12-08 06:32:21,683 INFO [train.py:873] (2/4) Epoch 14, batch 6900, loss[loss=0.1055, simple_loss=0.1471, pruned_loss=0.03193, over 14281.00 frames. ], tot_loss[loss=0.1153, simple_loss=0.1489, pruned_loss=0.04085, over 2001010.61 frames. ], batch size: 60, lr: 5.55e-03, grad_scale: 8.0 +2022-12-08 06:33:29,080 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.242e+02 2.377e+02 2.807e+02 3.459e+02 9.593e+02, threshold=5.615e+02, percent-clipped=4.0 +2022-12-08 06:33:48,171 INFO [train.py:873] (2/4) Epoch 14, batch 7000, loss[loss=0.1308, simple_loss=0.1568, pruned_loss=0.05236, over 8569.00 frames. ], tot_loss[loss=0.1168, simple_loss=0.15, pruned_loss=0.04179, over 1988523.07 frames. ], batch size: 100, lr: 5.55e-03, grad_scale: 8.0 +2022-12-08 06:34:21,708 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1608, 4.9230, 4.5770, 4.7884, 4.7889, 5.0333, 5.1461, 5.1065], + device='cuda:2'), covar=tensor([0.0649, 0.0401, 0.1859, 0.2366, 0.0675, 0.0681, 0.0691, 0.0678], + device='cuda:2'), in_proj_covar=tensor([0.0377, 0.0261, 0.0436, 0.0555, 0.0336, 0.0435, 0.0380, 0.0376], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:34:56,769 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.288e+01 2.265e+02 2.868e+02 3.572e+02 1.076e+03, threshold=5.736e+02, percent-clipped=4.0 +2022-12-08 06:34:58,913 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.56 vs. limit=5.0 +2022-12-08 06:34:59,499 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=105388.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:35:09,476 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3949, 1.0348, 1.2484, 0.8614, 1.1564, 1.4107, 1.0490, 1.0842], + device='cuda:2'), covar=tensor([0.0477, 0.0842, 0.0659, 0.0499, 0.0931, 0.0681, 0.0661, 0.1176], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0175, 0.0137, 0.0125, 0.0140, 0.0152, 0.0128, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:35:16,755 INFO [train.py:873] (2/4) Epoch 14, batch 7100, loss[loss=0.1391, simple_loss=0.1557, pruned_loss=0.06125, over 3852.00 frames. ], tot_loss[loss=0.116, simple_loss=0.1494, pruned_loss=0.04127, over 1952487.45 frames. ], batch size: 100, lr: 5.55e-03, grad_scale: 8.0 +2022-12-08 06:35:16,840 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3894, 2.3590, 2.3192, 2.4960, 2.3669, 2.0670, 1.5146, 2.1002], + device='cuda:2'), covar=tensor([0.0758, 0.0732, 0.0761, 0.0550, 0.0672, 0.1723, 0.2943, 0.0744], + device='cuda:2'), in_proj_covar=tensor([0.0168, 0.0175, 0.0144, 0.0145, 0.0204, 0.0140, 0.0158, 0.0193], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 06:35:53,793 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=105449.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 06:36:25,709 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.061e+02 2.137e+02 2.673e+02 3.373e+02 4.675e+02, threshold=5.347e+02, percent-clipped=1.0 +2022-12-08 06:36:25,874 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=105486.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:36:39,598 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=105502.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:36:43,630 INFO [train.py:873] (2/4) Epoch 14, batch 7200, loss[loss=0.1162, simple_loss=0.1465, pruned_loss=0.04289, over 14163.00 frames. ], tot_loss[loss=0.1157, simple_loss=0.1494, pruned_loss=0.04103, over 2059046.99 frames. ], batch size: 99, lr: 5.55e-03, grad_scale: 8.0 +2022-12-08 06:36:59,681 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.03 vs. limit=2.0 +2022-12-08 06:37:17,822 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=105547.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:37:32,153 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=105563.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:37:51,599 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.085e+02 2.276e+02 2.781e+02 3.572e+02 7.841e+02, threshold=5.562e+02, percent-clipped=2.0 +2022-12-08 06:38:10,277 INFO [train.py:873] (2/4) Epoch 14, batch 7300, loss[loss=0.1006, simple_loss=0.1407, pruned_loss=0.03022, over 14292.00 frames. ], tot_loss[loss=0.1161, simple_loss=0.1491, pruned_loss=0.04158, over 1983948.09 frames. ], batch size: 76, lr: 5.54e-03, grad_scale: 8.0 +2022-12-08 06:38:55,871 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5604, 4.0180, 3.1263, 4.9689, 4.3181, 4.6912, 3.9840, 3.4607], + device='cuda:2'), covar=tensor([0.0659, 0.1287, 0.3808, 0.0481, 0.0779, 0.1071, 0.1234, 0.2880], + device='cuda:2'), in_proj_covar=tensor([0.0275, 0.0292, 0.0263, 0.0276, 0.0323, 0.0299, 0.0256, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:39:20,008 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.394e+02 2.190e+02 2.577e+02 3.146e+02 6.129e+02, threshold=5.153e+02, percent-clipped=1.0 +2022-12-08 06:39:22,168 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6257, 2.4787, 2.2021, 2.3425, 2.5221, 2.5767, 2.5680, 2.5721], + device='cuda:2'), covar=tensor([0.1117, 0.0966, 0.2918, 0.2658, 0.1332, 0.1288, 0.1589, 0.1129], + device='cuda:2'), in_proj_covar=tensor([0.0376, 0.0260, 0.0438, 0.0555, 0.0336, 0.0433, 0.0379, 0.0379], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 06:39:38,185 INFO [train.py:873] (2/4) Epoch 14, batch 7400, loss[loss=0.1382, simple_loss=0.1673, pruned_loss=0.05452, over 10360.00 frames. ], tot_loss[loss=0.1153, simple_loss=0.1487, pruned_loss=0.04092, over 1976153.58 frames. ], batch size: 100, lr: 5.54e-03, grad_scale: 8.0 +2022-12-08 06:40:00,269 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.46 vs. limit=2.0 +2022-12-08 06:40:11,134 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=105744.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 06:40:11,664 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-12-08 06:40:48,154 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.057e+02 2.114e+02 2.694e+02 3.325e+02 8.535e+02, threshold=5.387e+02, percent-clipped=3.0 +2022-12-08 06:40:53,330 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=105792.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:41:00,012 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=105799.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:41:07,344 INFO [train.py:873] (2/4) Epoch 14, batch 7500, loss[loss=0.1083, simple_loss=0.15, pruned_loss=0.03336, over 14659.00 frames. ], tot_loss[loss=0.1164, simple_loss=0.1492, pruned_loss=0.04176, over 1907305.73 frames. ], batch size: 23, lr: 5.54e-03, grad_scale: 8.0 +2022-12-08 06:41:33,706 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1396, 2.0189, 2.0949, 2.1783, 2.0737, 2.1108, 2.2303, 1.9402], + device='cuda:2'), covar=tensor([0.0881, 0.1394, 0.0720, 0.0830, 0.0977, 0.0713, 0.0895, 0.0721], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0269, 0.0193, 0.0190, 0.0183, 0.0153, 0.0280, 0.0165], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 06:41:36,988 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=105842.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:41:43,494 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.90 vs. limit=2.0 +2022-12-08 06:41:44,731 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=105853.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:41:47,025 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=105858.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:41:48,043 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=105860.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:42:35,201 INFO [train.py:873] (2/4) Epoch 15, batch 0, loss[loss=0.1067, simple_loss=0.1504, pruned_loss=0.03155, over 14494.00 frames. ], tot_loss[loss=0.1067, simple_loss=0.1504, pruned_loss=0.03155, over 14494.00 frames. ], batch size: 24, lr: 5.35e-03, grad_scale: 8.0 +2022-12-08 06:42:35,202 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 06:42:39,191 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9708, 3.2800, 3.1873, 3.3465, 2.3823, 3.3054, 3.0107, 1.7246], + device='cuda:2'), covar=tensor([0.0920, 0.0635, 0.0581, 0.0446, 0.0887, 0.0456, 0.0803, 0.2049], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0085, 0.0067, 0.0070, 0.0096, 0.0083, 0.0098, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:42:42,692 INFO [train.py:905] (2/4) Epoch 15, validation: loss=0.1381, simple_loss=0.1782, pruned_loss=0.049, over 857387.00 frames. +2022-12-08 06:42:42,693 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 06:42:43,714 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9125, 3.0062, 3.1229, 2.9031, 3.0658, 2.8808, 1.4952, 2.8679], + device='cuda:2'), covar=tensor([0.0423, 0.0417, 0.0363, 0.0503, 0.0347, 0.0792, 0.2826, 0.0332], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0173, 0.0143, 0.0144, 0.0203, 0.0139, 0.0158, 0.0191], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 06:42:58,149 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.674e+01 2.091e+02 3.002e+02 4.364e+02 1.099e+03, threshold=6.004e+02, percent-clipped=13.0 +2022-12-08 06:43:08,551 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.82 vs. limit=2.0 +2022-12-08 06:44:07,307 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3214, 2.1229, 3.2763, 3.3981, 3.3003, 2.2747, 3.2642, 2.5471], + device='cuda:2'), covar=tensor([0.0418, 0.1135, 0.0734, 0.0471, 0.0444, 0.1509, 0.0406, 0.0882], + device='cuda:2'), in_proj_covar=tensor([0.0290, 0.0256, 0.0372, 0.0326, 0.0269, 0.0302, 0.0306, 0.0277], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 06:44:11,365 INFO [train.py:873] (2/4) Epoch 15, batch 100, loss[loss=0.1151, simple_loss=0.1512, pruned_loss=0.03946, over 14537.00 frames. ], tot_loss[loss=0.1172, simple_loss=0.1506, pruned_loss=0.04185, over 873819.63 frames. ], batch size: 34, lr: 5.35e-03, grad_scale: 8.0 +2022-12-08 06:44:26,579 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.268e+02 2.570e+02 2.979e+02 3.765e+02 9.423e+02, threshold=5.958e+02, percent-clipped=4.0 +2022-12-08 06:45:18,336 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=106044.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:45:40,385 INFO [train.py:873] (2/4) Epoch 15, batch 200, loss[loss=0.1001, simple_loss=0.135, pruned_loss=0.03262, over 13957.00 frames. ], tot_loss[loss=0.114, simple_loss=0.1478, pruned_loss=0.04012, over 1289237.51 frames. ], batch size: 19, lr: 5.34e-03, grad_scale: 8.0 +2022-12-08 06:45:47,372 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=106077.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:45:55,352 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.497e+02 2.218e+02 2.817e+02 3.547e+02 5.569e+02, threshold=5.634e+02, percent-clipped=0.0 +2022-12-08 06:46:00,763 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=106092.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:46:01,417 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.93 vs. limit=2.0 +2022-12-08 06:46:30,056 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.66 vs. limit=2.0 +2022-12-08 06:46:41,859 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=106138.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:46:44,994 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=106142.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:46:50,486 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=106148.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:46:56,476 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=106155.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:46:59,195 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=106158.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:47:08,664 INFO [train.py:873] (2/4) Epoch 15, batch 300, loss[loss=0.1113, simple_loss=0.1426, pruned_loss=0.03998, over 14244.00 frames. ], tot_loss[loss=0.1136, simple_loss=0.1475, pruned_loss=0.03991, over 1530342.14 frames. ], batch size: 80, lr: 5.34e-03, grad_scale: 8.0 +2022-12-08 06:47:23,719 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.045e+02 2.130e+02 2.529e+02 3.253e+02 6.005e+02, threshold=5.059e+02, percent-clipped=1.0 +2022-12-08 06:47:27,436 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=106190.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:47:42,121 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=106206.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:47:52,807 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5753, 1.7915, 1.9837, 1.9222, 1.8341, 1.9816, 1.6913, 1.3966], + device='cuda:2'), covar=tensor([0.1356, 0.1237, 0.0605, 0.0721, 0.1130, 0.0700, 0.1760, 0.1898], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0085, 0.0068, 0.0070, 0.0096, 0.0083, 0.0098, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0005, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 06:48:03,856 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.34 vs. limit=5.0 +2022-12-08 06:48:36,974 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=106268.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:48:37,738 INFO [train.py:873] (2/4) Epoch 15, batch 400, loss[loss=0.1187, simple_loss=0.1225, pruned_loss=0.05738, over 2562.00 frames. ], tot_loss[loss=0.1143, simple_loss=0.1476, pruned_loss=0.04046, over 1675018.41 frames. ], batch size: 100, lr: 5.34e-03, grad_scale: 8.0 +2022-12-08 06:48:52,653 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.320e+02 2.076e+02 2.571e+02 3.282e+02 8.496e+02, threshold=5.142e+02, percent-clipped=2.0 +2022-12-08 06:48:59,800 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.38 vs. limit=2.0 +2022-12-08 06:49:30,141 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=106329.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:49:36,240 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=106336.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:50:06,073 INFO [train.py:873] (2/4) Epoch 15, batch 500, loss[loss=0.1201, simple_loss=0.152, pruned_loss=0.04408, over 11972.00 frames. ], tot_loss[loss=0.114, simple_loss=0.1478, pruned_loss=0.04015, over 1812546.94 frames. ], batch size: 100, lr: 5.34e-03, grad_scale: 8.0 +2022-12-08 06:50:20,650 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2298, 1.9810, 2.2542, 2.3436, 2.0037, 1.9499, 2.1277, 2.2001], + device='cuda:2'), covar=tensor([0.0230, 0.0563, 0.0294, 0.0240, 0.0378, 0.0680, 0.0328, 0.0349], + device='cuda:2'), in_proj_covar=tensor([0.0290, 0.0256, 0.0371, 0.0326, 0.0269, 0.0300, 0.0306, 0.0277], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 06:50:21,314 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.193e+02 2.064e+02 2.651e+02 3.398e+02 5.648e+02, threshold=5.301e+02, percent-clipped=3.0 +2022-12-08 06:50:31,268 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=106397.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:51:03,961 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=106433.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:51:17,621 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=106448.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:51:21,210 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9605, 1.9572, 2.0816, 1.9671, 1.9433, 1.7484, 1.3776, 1.2977], + device='cuda:2'), covar=tensor([0.0309, 0.0340, 0.0430, 0.0302, 0.0271, 0.0312, 0.0342, 0.0666], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0018, 0.0019, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 06:51:22,799 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=106454.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:51:23,712 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=106455.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:51:36,096 INFO [train.py:873] (2/4) Epoch 15, batch 600, loss[loss=0.153, simple_loss=0.17, pruned_loss=0.06799, over 8654.00 frames. ], tot_loss[loss=0.1143, simple_loss=0.1478, pruned_loss=0.04042, over 1807536.83 frames. ], batch size: 100, lr: 5.33e-03, grad_scale: 8.0 +2022-12-08 06:51:51,213 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.287e+02 2.156e+02 2.802e+02 3.401e+02 5.738e+02, threshold=5.604e+02, percent-clipped=2.0 +2022-12-08 06:51:52,318 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-12-08 06:52:00,110 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=106496.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:52:06,416 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=106503.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:52:17,439 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=106515.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:52:26,190 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4812, 1.5259, 1.7016, 1.4425, 1.4623, 1.4671, 1.4685, 1.2081], + device='cuda:2'), covar=tensor([0.0160, 0.0274, 0.0155, 0.0206, 0.0195, 0.0304, 0.0198, 0.0378], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0018, 0.0019, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 06:53:05,158 INFO [train.py:873] (2/4) Epoch 15, batch 700, loss[loss=0.107, simple_loss=0.1453, pruned_loss=0.03433, over 14399.00 frames. ], tot_loss[loss=0.115, simple_loss=0.1483, pruned_loss=0.04083, over 1875644.94 frames. ], batch size: 73, lr: 5.33e-03, grad_scale: 8.0 +2022-12-08 06:53:19,980 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.992e+01 2.009e+02 2.731e+02 3.273e+02 5.771e+02, threshold=5.463e+02, percent-clipped=3.0 +2022-12-08 06:53:53,118 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=106624.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:53:54,835 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8129, 1.8413, 1.7131, 1.8583, 1.9215, 1.0970, 1.6969, 1.7817], + device='cuda:2'), covar=tensor([0.0817, 0.0727, 0.0747, 0.0808, 0.0735, 0.0798, 0.0588, 0.0784], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0031, 0.0035, 0.0029, 0.0031, 0.0044, 0.0032, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 06:53:58,613 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.66 vs. limit=5.0 +2022-12-08 06:54:32,759 INFO [train.py:873] (2/4) Epoch 15, batch 800, loss[loss=0.1049, simple_loss=0.1515, pruned_loss=0.02912, over 14098.00 frames. ], tot_loss[loss=0.1145, simple_loss=0.1479, pruned_loss=0.0405, over 1904000.48 frames. ], batch size: 29, lr: 5.33e-03, grad_scale: 8.0 +2022-12-08 06:54:47,970 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.357e+02 2.230e+02 2.748e+02 3.448e+02 5.498e+02, threshold=5.496e+02, percent-clipped=1.0 +2022-12-08 06:54:53,537 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=106692.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:55:30,031 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=106733.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:56:02,204 INFO [train.py:873] (2/4) Epoch 15, batch 900, loss[loss=0.1284, simple_loss=0.1556, pruned_loss=0.05065, over 10328.00 frames. ], tot_loss[loss=0.1141, simple_loss=0.1481, pruned_loss=0.04004, over 1969159.57 frames. ], batch size: 100, lr: 5.33e-03, grad_scale: 8.0 +2022-12-08 06:56:03,558 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-08 06:56:12,938 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=106781.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:56:16,790 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.105e+02 2.159e+02 2.563e+02 3.204e+02 7.042e+02, threshold=5.126e+02, percent-clipped=4.0 +2022-12-08 06:56:32,577 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-08 06:56:38,475 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=106810.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:56:46,352 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8197, 0.8203, 0.7580, 0.8192, 0.8411, 0.2873, 0.7546, 0.8793], + device='cuda:2'), covar=tensor([0.0468, 0.0388, 0.0465, 0.0389, 0.0261, 0.0252, 0.0861, 0.0622], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0031, 0.0035, 0.0029, 0.0031, 0.0044, 0.0032, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 06:57:21,741 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.30 vs. limit=2.0 +2022-12-08 06:57:30,126 INFO [train.py:873] (2/4) Epoch 15, batch 1000, loss[loss=0.1116, simple_loss=0.1525, pruned_loss=0.03532, over 14201.00 frames. ], tot_loss[loss=0.1138, simple_loss=0.148, pruned_loss=0.03983, over 2001263.32 frames. ], batch size: 46, lr: 5.32e-03, grad_scale: 8.0 +2022-12-08 06:57:44,838 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.212e+02 2.224e+02 2.651e+02 3.456e+02 5.513e+02, threshold=5.301e+02, percent-clipped=2.0 +2022-12-08 06:57:48,813 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.94 vs. limit=2.0 +2022-12-08 06:58:18,720 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=106924.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:58:28,212 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-12-08 06:58:34,767 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9828, 1.9912, 1.9458, 2.0388, 1.9121, 1.6935, 1.3064, 1.6879], + device='cuda:2'), covar=tensor([0.0747, 0.0621, 0.0692, 0.0463, 0.0614, 0.1314, 0.2721, 0.0657], + device='cuda:2'), in_proj_covar=tensor([0.0165, 0.0170, 0.0142, 0.0143, 0.0201, 0.0137, 0.0156, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 06:58:58,493 INFO [train.py:873] (2/4) Epoch 15, batch 1100, loss[loss=0.1212, simple_loss=0.1584, pruned_loss=0.04206, over 14382.00 frames. ], tot_loss[loss=0.1146, simple_loss=0.1482, pruned_loss=0.04052, over 1977228.88 frames. ], batch size: 73, lr: 5.32e-03, grad_scale: 8.0 +2022-12-08 06:59:01,480 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=106972.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:59:13,540 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.148e+02 2.263e+02 2.855e+02 3.604e+02 7.949e+02, threshold=5.710e+02, percent-clipped=5.0 +2022-12-08 06:59:19,075 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=106992.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 06:59:47,398 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.28 vs. limit=5.0 +2022-12-08 07:00:02,194 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=107040.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:00:15,630 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.77 vs. limit=2.0 +2022-12-08 07:00:21,088 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2175, 2.9065, 2.2971, 3.2971, 3.1104, 3.1225, 2.7613, 2.2027], + device='cuda:2'), covar=tensor([0.0912, 0.1678, 0.3821, 0.0716, 0.1073, 0.1502, 0.1686, 0.3745], + device='cuda:2'), in_proj_covar=tensor([0.0277, 0.0294, 0.0263, 0.0277, 0.0322, 0.0301, 0.0257, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:00:28,133 INFO [train.py:873] (2/4) Epoch 15, batch 1200, loss[loss=0.1148, simple_loss=0.148, pruned_loss=0.04081, over 14468.00 frames. ], tot_loss[loss=0.1148, simple_loss=0.1485, pruned_loss=0.04059, over 1974583.03 frames. ], batch size: 51, lr: 5.32e-03, grad_scale: 8.0 +2022-12-08 07:00:35,924 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-12-08 07:00:42,788 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.313e+02 2.082e+02 2.691e+02 3.251e+02 9.387e+02, threshold=5.382e+02, percent-clipped=1.0 +2022-12-08 07:01:04,304 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=107110.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:01:30,749 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=107140.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 07:01:41,297 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=107152.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:01:46,369 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=107158.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:01:56,711 INFO [train.py:873] (2/4) Epoch 15, batch 1300, loss[loss=0.1482, simple_loss=0.1584, pruned_loss=0.06894, over 4993.00 frames. ], tot_loss[loss=0.1138, simple_loss=0.1477, pruned_loss=0.03995, over 1965593.40 frames. ], batch size: 100, lr: 5.32e-03, grad_scale: 8.0 +2022-12-08 07:02:12,107 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.125e+02 2.037e+02 2.513e+02 3.265e+02 6.018e+02, threshold=5.026e+02, percent-clipped=1.0 +2022-12-08 07:02:25,574 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=107201.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 07:02:36,911 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=107213.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 07:03:26,079 INFO [train.py:873] (2/4) Epoch 15, batch 1400, loss[loss=0.1153, simple_loss=0.1518, pruned_loss=0.03938, over 14593.00 frames. ], tot_loss[loss=0.1146, simple_loss=0.1483, pruned_loss=0.04048, over 1974134.64 frames. ], batch size: 34, lr: 5.31e-03, grad_scale: 4.0 +2022-12-08 07:03:41,695 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.373e+02 2.191e+02 2.804e+02 3.719e+02 1.162e+03, threshold=5.608e+02, percent-clipped=5.0 +2022-12-08 07:04:40,919 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.82 vs. limit=2.0 +2022-12-08 07:04:54,157 INFO [train.py:873] (2/4) Epoch 15, batch 1500, loss[loss=0.1441, simple_loss=0.1634, pruned_loss=0.06238, over 8602.00 frames. ], tot_loss[loss=0.1153, simple_loss=0.1488, pruned_loss=0.04087, over 1961071.85 frames. ], batch size: 100, lr: 5.31e-03, grad_scale: 4.0 +2022-12-08 07:05:10,372 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.112e+02 2.250e+02 2.599e+02 3.364e+02 1.423e+03, threshold=5.197e+02, percent-clipped=1.0 +2022-12-08 07:06:23,611 INFO [train.py:873] (2/4) Epoch 15, batch 1600, loss[loss=0.1067, simple_loss=0.1437, pruned_loss=0.03484, over 14150.00 frames. ], tot_loss[loss=0.1153, simple_loss=0.1487, pruned_loss=0.04098, over 1996909.06 frames. ], batch size: 35, lr: 5.31e-03, grad_scale: 8.0 +2022-12-08 07:06:28,155 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2168, 2.2725, 4.2151, 2.8195, 4.0023, 2.2184, 3.0724, 4.0001], + device='cuda:2'), covar=tensor([0.0586, 0.4128, 0.0489, 0.6338, 0.0676, 0.3267, 0.1322, 0.0500], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0209, 0.0213, 0.0280, 0.0230, 0.0210, 0.0207, 0.0215], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:06:39,091 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.559e+01 2.229e+02 2.676e+02 3.216e+02 6.513e+02, threshold=5.353e+02, percent-clipped=3.0 +2022-12-08 07:06:47,593 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=107496.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 07:06:58,123 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=107508.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 07:06:59,838 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=107510.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:07:11,283 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-12-08 07:07:39,445 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8580, 1.8461, 1.6003, 1.8518, 1.8000, 1.8278, 1.8644, 1.6894], + device='cuda:2'), covar=tensor([0.1032, 0.0960, 0.1939, 0.0800, 0.1104, 0.0711, 0.1268, 0.0991], + device='cuda:2'), in_proj_covar=tensor([0.0275, 0.0290, 0.0263, 0.0275, 0.0319, 0.0297, 0.0256, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:07:51,202 INFO [train.py:873] (2/4) Epoch 15, batch 1700, loss[loss=0.09514, simple_loss=0.1402, pruned_loss=0.02505, over 14201.00 frames. ], tot_loss[loss=0.1147, simple_loss=0.1485, pruned_loss=0.0405, over 2015086.76 frames. ], batch size: 35, lr: 5.31e-03, grad_scale: 8.0 +2022-12-08 07:07:53,431 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=107571.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:08:07,304 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.130e+02 2.065e+02 2.454e+02 2.883e+02 5.170e+02, threshold=4.908e+02, percent-clipped=0.0 +2022-12-08 07:08:35,490 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7388, 1.7495, 1.9878, 1.9046, 1.7393, 1.7702, 1.5741, 1.4815], + device='cuda:2'), covar=tensor([0.0443, 0.0496, 0.0403, 0.0392, 0.0355, 0.0574, 0.0484, 0.0744], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0017, 0.0019, 0.0019, 0.0030, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:08:49,364 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.82 vs. limit=2.0 +2022-12-08 07:09:11,888 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=107659.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:09:20,341 INFO [train.py:873] (2/4) Epoch 15, batch 1800, loss[loss=0.1629, simple_loss=0.1469, pruned_loss=0.08939, over 1332.00 frames. ], tot_loss[loss=0.1155, simple_loss=0.1492, pruned_loss=0.04092, over 2013192.21 frames. ], batch size: 100, lr: 5.30e-03, grad_scale: 8.0 +2022-12-08 07:09:35,244 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.000e+02 2.283e+02 3.065e+02 3.850e+02 1.119e+03, threshold=6.129e+02, percent-clipped=8.0 +2022-12-08 07:10:04,765 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=107720.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:10:19,539 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3976, 1.6969, 2.6165, 2.0802, 2.5068, 1.7306, 2.0497, 2.4475], + device='cuda:2'), covar=tensor([0.1955, 0.3845, 0.0867, 0.3311, 0.1166, 0.2910, 0.1253, 0.0886], + device='cuda:2'), in_proj_covar=tensor([0.0249, 0.0204, 0.0210, 0.0274, 0.0225, 0.0206, 0.0203, 0.0211], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:10:20,189 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=107737.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:10:48,100 INFO [train.py:873] (2/4) Epoch 15, batch 1900, loss[loss=0.1159, simple_loss=0.1545, pruned_loss=0.03864, over 14566.00 frames. ], tot_loss[loss=0.1144, simple_loss=0.1487, pruned_loss=0.04006, over 2076331.13 frames. ], batch size: 23, lr: 5.30e-03, grad_scale: 8.0 +2022-12-08 07:11:04,226 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.371e+02 2.201e+02 2.753e+02 3.141e+02 7.986e+02, threshold=5.507e+02, percent-clipped=3.0 +2022-12-08 07:11:06,928 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9805, 1.4935, 3.1375, 2.8292, 3.0390, 3.1945, 2.4421, 3.1062], + device='cuda:2'), covar=tensor([0.1203, 0.1523, 0.0146, 0.0369, 0.0287, 0.0173, 0.0409, 0.0199], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0130, 0.0171, 0.0148, 0.0142, 0.0124, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 07:11:07,826 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6738, 4.7517, 5.1554, 4.3398, 4.9520, 5.1930, 2.0044, 4.6071], + device='cuda:2'), covar=tensor([0.0253, 0.0312, 0.0273, 0.0497, 0.0241, 0.0137, 0.2778, 0.0230], + device='cuda:2'), in_proj_covar=tensor([0.0166, 0.0171, 0.0143, 0.0143, 0.0201, 0.0140, 0.0158, 0.0189], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 07:11:12,169 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=107796.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 07:11:13,996 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=107798.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 07:11:22,871 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=107808.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:11:54,592 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=107844.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 07:12:04,107 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-12-08 07:12:05,293 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=107856.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:12:07,030 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9598, 1.9380, 1.9953, 2.0735, 1.9745, 1.7160, 1.3019, 1.7654], + device='cuda:2'), covar=tensor([0.0762, 0.0732, 0.0594, 0.0404, 0.0570, 0.1302, 0.2439, 0.0600], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0172, 0.0144, 0.0144, 0.0202, 0.0140, 0.0158, 0.0190], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 07:12:13,851 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=107866.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:12:16,484 INFO [train.py:873] (2/4) Epoch 15, batch 2000, loss[loss=0.1298, simple_loss=0.1635, pruned_loss=0.04803, over 14280.00 frames. ], tot_loss[loss=0.1159, simple_loss=0.1495, pruned_loss=0.04115, over 2019231.79 frames. ], batch size: 80, lr: 5.30e-03, grad_scale: 8.0 +2022-12-08 07:12:31,885 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.235e+02 1.995e+02 2.775e+02 3.641e+02 9.267e+02, threshold=5.550e+02, percent-clipped=5.0 +2022-12-08 07:12:50,837 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8578, 4.5954, 4.4231, 4.9757, 4.6376, 4.2830, 4.9373, 4.1332], + device='cuda:2'), covar=tensor([0.0430, 0.0903, 0.0389, 0.0356, 0.0715, 0.0631, 0.0464, 0.0476], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0272, 0.0196, 0.0194, 0.0183, 0.0154, 0.0283, 0.0168], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 07:13:43,637 INFO [train.py:873] (2/4) Epoch 15, batch 2100, loss[loss=0.1046, simple_loss=0.14, pruned_loss=0.0346, over 14541.00 frames. ], tot_loss[loss=0.1148, simple_loss=0.1484, pruned_loss=0.04059, over 2005302.20 frames. ], batch size: 34, lr: 5.30e-03, grad_scale: 8.0 +2022-12-08 07:13:56,075 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8262, 0.8477, 0.7804, 0.8124, 0.8799, 0.3737, 0.7546, 0.9028], + device='cuda:2'), covar=tensor([0.0385, 0.0331, 0.0455, 0.0341, 0.0238, 0.0269, 0.0772, 0.0505], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0031, 0.0034, 0.0029, 0.0031, 0.0044, 0.0032, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:13:59,442 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.122e+02 1.955e+02 2.546e+02 3.190e+02 6.620e+02, threshold=5.091e+02, percent-clipped=1.0 +2022-12-08 07:14:01,974 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 07:14:11,842 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2135, 2.2324, 1.9537, 2.2609, 2.0059, 1.2989, 1.8245, 2.0967], + device='cuda:2'), covar=tensor([0.0725, 0.0545, 0.0992, 0.1304, 0.1005, 0.0872, 0.0865, 0.0987], + device='cuda:2'), in_proj_covar=tensor([0.0031, 0.0031, 0.0035, 0.0029, 0.0032, 0.0044, 0.0032, 0.0035], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:14:14,321 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0653, 3.8063, 3.7878, 4.0986, 3.7040, 3.3866, 4.1320, 3.9947], + device='cuda:2'), covar=tensor([0.0627, 0.0924, 0.0804, 0.0632, 0.0884, 0.0749, 0.0593, 0.0696], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0137, 0.0140, 0.0154, 0.0143, 0.0120, 0.0162, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 07:14:24,595 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=108015.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:15:11,289 INFO [train.py:873] (2/4) Epoch 15, batch 2200, loss[loss=0.111, simple_loss=0.1487, pruned_loss=0.03669, over 14152.00 frames. ], tot_loss[loss=0.1149, simple_loss=0.1484, pruned_loss=0.04069, over 1958677.53 frames. ], batch size: 37, lr: 5.29e-03, grad_scale: 8.0 +2022-12-08 07:15:24,560 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.51 vs. limit=5.0 +2022-12-08 07:15:26,510 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.457e+02 2.145e+02 2.568e+02 3.392e+02 6.463e+02, threshold=5.135e+02, percent-clipped=2.0 +2022-12-08 07:15:31,892 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.95 vs. limit=5.0 +2022-12-08 07:15:32,302 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=108093.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 07:15:45,011 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.22 vs. limit=5.0 +2022-12-08 07:16:19,347 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.42 vs. limit=5.0 +2022-12-08 07:16:36,093 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=108166.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:16:39,020 INFO [train.py:873] (2/4) Epoch 15, batch 2300, loss[loss=0.1318, simple_loss=0.1497, pruned_loss=0.05692, over 5961.00 frames. ], tot_loss[loss=0.1144, simple_loss=0.1478, pruned_loss=0.04049, over 1920808.08 frames. ], batch size: 100, lr: 5.29e-03, grad_scale: 8.0 +2022-12-08 07:16:54,803 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.282e+02 2.030e+02 2.596e+02 3.385e+02 5.799e+02, threshold=5.191e+02, percent-clipped=3.0 +2022-12-08 07:16:59,036 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.35 vs. limit=5.0 +2022-12-08 07:17:18,545 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=108214.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:17:19,422 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4674, 5.2666, 4.9842, 5.5580, 4.9794, 4.7515, 5.6026, 5.3550], + device='cuda:2'), covar=tensor([0.0581, 0.0592, 0.0747, 0.0448, 0.0663, 0.0495, 0.0424, 0.0576], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0136, 0.0141, 0.0154, 0.0142, 0.0120, 0.0162, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 07:17:28,365 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1518, 2.0926, 4.7036, 4.3202, 4.2716, 4.7403, 4.3083, 4.7572], + device='cuda:2'), covar=tensor([0.1459, 0.1344, 0.0101, 0.0179, 0.0189, 0.0114, 0.0154, 0.0115], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0157, 0.0129, 0.0170, 0.0147, 0.0141, 0.0123, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 07:18:06,535 INFO [train.py:873] (2/4) Epoch 15, batch 2400, loss[loss=0.1077, simple_loss=0.1541, pruned_loss=0.03061, over 14537.00 frames. ], tot_loss[loss=0.1144, simple_loss=0.1482, pruned_loss=0.04026, over 2014512.86 frames. ], batch size: 43, lr: 5.29e-03, grad_scale: 8.0 +2022-12-08 07:18:22,283 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.295e+02 2.197e+02 2.869e+02 3.470e+02 6.788e+02, threshold=5.738e+02, percent-clipped=3.0 +2022-12-08 07:18:26,536 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.36 vs. limit=5.0 +2022-12-08 07:18:47,588 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=108315.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:18:55,760 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=108325.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:19:29,553 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=108363.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:19:34,879 INFO [train.py:873] (2/4) Epoch 15, batch 2500, loss[loss=0.09182, simple_loss=0.1349, pruned_loss=0.02437, over 14090.00 frames. ], tot_loss[loss=0.114, simple_loss=0.1481, pruned_loss=0.03998, over 2011052.61 frames. ], batch size: 26, lr: 5.29e-03, grad_scale: 8.0 +2022-12-08 07:19:50,375 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=108386.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:19:50,975 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.050e+02 2.088e+02 2.601e+02 3.226e+02 6.971e+02, threshold=5.203e+02, percent-clipped=3.0 +2022-12-08 07:19:56,363 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=108393.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:20:31,694 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=108432.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:20:38,935 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=108441.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:21:03,649 INFO [train.py:873] (2/4) Epoch 15, batch 2600, loss[loss=0.1105, simple_loss=0.1475, pruned_loss=0.03676, over 14239.00 frames. ], tot_loss[loss=0.1141, simple_loss=0.1482, pruned_loss=0.04001, over 2033560.68 frames. ], batch size: 94, lr: 5.28e-03, grad_scale: 8.0 +2022-12-08 07:21:19,264 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.775e+01 2.028e+02 2.663e+02 3.413e+02 6.606e+02, threshold=5.327e+02, percent-clipped=6.0 +2022-12-08 07:21:24,906 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=108493.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 07:21:30,214 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.46 vs. limit=2.0 +2022-12-08 07:21:55,229 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2403, 1.4623, 1.6632, 1.6630, 1.5937, 1.6576, 1.3924, 1.3011], + device='cuda:2'), covar=tensor([0.1321, 0.1220, 0.0386, 0.0552, 0.1126, 0.0840, 0.1401, 0.1545], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0087, 0.0067, 0.0070, 0.0097, 0.0085, 0.0099, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 07:22:14,092 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0588, 1.2429, 1.3945, 1.0262, 0.8464, 1.1304, 0.9064, 1.1876], + device='cuda:2'), covar=tensor([0.2558, 0.3353, 0.1095, 0.2616, 0.3891, 0.1390, 0.2362, 0.1594], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0101, 0.0093, 0.0099, 0.0118, 0.0089, 0.0122, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 07:22:32,113 INFO [train.py:873] (2/4) Epoch 15, batch 2700, loss[loss=0.1195, simple_loss=0.144, pruned_loss=0.04749, over 5035.00 frames. ], tot_loss[loss=0.1147, simple_loss=0.1484, pruned_loss=0.04055, over 1965400.91 frames. ], batch size: 100, lr: 5.28e-03, grad_scale: 4.0 +2022-12-08 07:22:39,943 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=108577.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:22:49,628 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.289e+02 2.154e+02 2.629e+02 3.316e+02 6.561e+02, threshold=5.258e+02, percent-clipped=1.0 +2022-12-08 07:22:58,594 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.15 vs. limit=2.0 +2022-12-08 07:23:34,043 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=108638.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:23:38,794 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.45 vs. limit=5.0 +2022-12-08 07:24:01,227 INFO [train.py:873] (2/4) Epoch 15, batch 2800, loss[loss=0.1154, simple_loss=0.1197, pruned_loss=0.05548, over 1261.00 frames. ], tot_loss[loss=0.1147, simple_loss=0.1482, pruned_loss=0.04061, over 1923404.71 frames. ], batch size: 100, lr: 5.28e-03, grad_scale: 8.0 +2022-12-08 07:24:12,087 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=108681.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:24:17,713 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.202e+02 2.137e+02 2.731e+02 3.357e+02 8.128e+02, threshold=5.462e+02, percent-clipped=3.0 +2022-12-08 07:24:55,430 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 07:25:29,353 INFO [train.py:873] (2/4) Epoch 15, batch 2900, loss[loss=0.1218, simple_loss=0.1528, pruned_loss=0.0454, over 9502.00 frames. ], tot_loss[loss=0.1133, simple_loss=0.1472, pruned_loss=0.03974, over 1939124.57 frames. ], batch size: 100, lr: 5.28e-03, grad_scale: 8.0 +2022-12-08 07:25:45,502 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.212e+02 2.119e+02 2.529e+02 3.118e+02 5.786e+02, threshold=5.058e+02, percent-clipped=1.0 +2022-12-08 07:25:45,649 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=108788.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 07:25:48,312 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=108791.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:26:42,268 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=108852.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:26:49,513 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0553, 1.4469, 4.0661, 1.9328, 3.9019, 4.1716, 3.2423, 4.4374], + device='cuda:2'), covar=tensor([0.0201, 0.3120, 0.0314, 0.2169, 0.0418, 0.0306, 0.0670, 0.0151], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0155, 0.0159, 0.0168, 0.0166, 0.0176, 0.0132, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:26:56,662 INFO [train.py:873] (2/4) Epoch 15, batch 3000, loss[loss=0.1082, simple_loss=0.1428, pruned_loss=0.03679, over 14542.00 frames. ], tot_loss[loss=0.1137, simple_loss=0.1477, pruned_loss=0.03986, over 1972949.20 frames. ], batch size: 34, lr: 5.27e-03, grad_scale: 8.0 +2022-12-08 07:26:56,662 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 07:27:05,082 INFO [train.py:905] (2/4) Epoch 15, validation: loss=0.1368, simple_loss=0.1737, pruned_loss=0.04998, over 857387.00 frames. +2022-12-08 07:27:05,083 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 07:27:07,982 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0739, 1.1763, 1.2013, 1.0199, 1.0020, 0.8395, 0.9929, 0.8129], + device='cuda:2'), covar=tensor([0.0273, 0.0247, 0.0190, 0.0247, 0.0259, 0.0445, 0.0300, 0.0523], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0018, 0.0019, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:27:21,818 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.554e+02 2.224e+02 2.739e+02 3.295e+02 6.855e+02, threshold=5.478e+02, percent-clipped=4.0 +2022-12-08 07:27:24,769 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.90 vs. limit=5.0 +2022-12-08 07:28:00,928 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=108933.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:28:32,783 INFO [train.py:873] (2/4) Epoch 15, batch 3100, loss[loss=0.1081, simple_loss=0.1405, pruned_loss=0.03778, over 14244.00 frames. ], tot_loss[loss=0.1146, simple_loss=0.1482, pruned_loss=0.04046, over 1970676.95 frames. ], batch size: 69, lr: 5.27e-03, grad_scale: 8.0 +2022-12-08 07:28:42,826 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=108981.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:28:48,816 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.271e+02 2.020e+02 2.542e+02 3.088e+02 8.012e+02, threshold=5.083e+02, percent-clipped=4.0 +2022-12-08 07:29:06,715 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.64 vs. limit=2.0 +2022-12-08 07:29:25,240 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=109029.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:29:32,428 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=109037.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 07:29:36,561 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7922, 1.3810, 1.7101, 1.2397, 1.5166, 1.8944, 1.4959, 1.5526], + device='cuda:2'), covar=tensor([0.0554, 0.0934, 0.0695, 0.0799, 0.1505, 0.0772, 0.0780, 0.1564], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0176, 0.0140, 0.0128, 0.0142, 0.0153, 0.0130, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 07:29:52,814 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4974, 5.3646, 4.9786, 5.5189, 5.0111, 4.8353, 5.5220, 5.3698], + device='cuda:2'), covar=tensor([0.0531, 0.0565, 0.0789, 0.0439, 0.0667, 0.0485, 0.0510, 0.0509], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0140, 0.0144, 0.0158, 0.0144, 0.0122, 0.0166, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 07:30:00,241 INFO [train.py:873] (2/4) Epoch 15, batch 3200, loss[loss=0.09275, simple_loss=0.1312, pruned_loss=0.02715, over 13907.00 frames. ], tot_loss[loss=0.1152, simple_loss=0.1485, pruned_loss=0.04094, over 1943126.81 frames. ], batch size: 20, lr: 5.27e-03, grad_scale: 8.0 +2022-12-08 07:30:16,994 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.196e+02 2.174e+02 2.531e+02 2.915e+02 6.335e+02, threshold=5.062e+02, percent-clipped=1.0 +2022-12-08 07:30:17,207 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=109088.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:30:25,649 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=109098.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 07:30:58,800 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=109136.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:31:08,412 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=109147.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:31:27,857 INFO [train.py:873] (2/4) Epoch 15, batch 3300, loss[loss=0.108, simple_loss=0.1439, pruned_loss=0.03602, over 14214.00 frames. ], tot_loss[loss=0.1144, simple_loss=0.1476, pruned_loss=0.04055, over 1859743.77 frames. ], batch size: 94, lr: 5.27e-03, grad_scale: 8.0 +2022-12-08 07:31:42,537 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1246, 1.1466, 1.1474, 0.9797, 0.9843, 0.7983, 0.8222, 0.8074], + device='cuda:2'), covar=tensor([0.0185, 0.0238, 0.0169, 0.0202, 0.0224, 0.0436, 0.0276, 0.0402], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0018, 0.0020, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:31:44,098 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.026e+02 2.178e+02 2.677e+02 3.330e+02 7.070e+02, threshold=5.354e+02, percent-clipped=2.0 +2022-12-08 07:31:49,526 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=109194.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:32:08,708 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7353, 1.9785, 2.1643, 2.0128, 2.3081, 1.9390, 1.7620, 1.5400], + device='cuda:2'), covar=tensor([0.0409, 0.0784, 0.0362, 0.0304, 0.0164, 0.0360, 0.0345, 0.0527], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0018, 0.0019, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:32:23,678 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=109233.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:32:26,226 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0608, 1.2363, 1.3649, 0.9257, 0.8841, 1.1083, 0.8735, 1.2005], + device='cuda:2'), covar=tensor([0.2112, 0.3089, 0.1171, 0.2917, 0.3827, 0.1470, 0.2228, 0.1664], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0102, 0.0094, 0.0100, 0.0119, 0.0089, 0.0122, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 07:32:43,214 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=109255.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:32:54,412 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1223, 1.1664, 1.2401, 0.9484, 0.9172, 0.8460, 0.9918, 0.8929], + device='cuda:2'), covar=tensor([0.0226, 0.0260, 0.0213, 0.0254, 0.0290, 0.0448, 0.0267, 0.0451], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0018, 0.0019, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:32:55,129 INFO [train.py:873] (2/4) Epoch 15, batch 3400, loss[loss=0.1235, simple_loss=0.1447, pruned_loss=0.05116, over 3858.00 frames. ], tot_loss[loss=0.1143, simple_loss=0.1476, pruned_loss=0.04045, over 1872883.07 frames. ], batch size: 100, lr: 5.26e-03, grad_scale: 4.0 +2022-12-08 07:33:05,753 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=109281.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:33:12,976 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.873e+01 2.037e+02 2.446e+02 3.255e+02 7.608e+02, threshold=4.893e+02, percent-clipped=4.0 +2022-12-08 07:33:13,242 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9429, 2.3830, 4.9752, 3.3030, 4.4901, 1.9245, 3.6802, 4.6257], + device='cuda:2'), covar=tensor([0.0467, 0.4295, 0.0378, 0.6259, 0.0768, 0.3850, 0.1250, 0.0365], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0205, 0.0213, 0.0278, 0.0231, 0.0207, 0.0206, 0.0215], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:33:17,251 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=109294.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:33:38,167 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=109318.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:33:44,030 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4685, 3.8820, 3.6373, 3.7378, 2.6801, 3.9869, 3.5040, 2.0720], + device='cuda:2'), covar=tensor([0.1588, 0.0873, 0.1012, 0.0794, 0.0916, 0.0399, 0.1203, 0.1960], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0087, 0.0069, 0.0071, 0.0098, 0.0086, 0.0101, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 07:34:03,781 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3506, 3.4268, 3.6353, 3.2875, 3.5016, 3.3109, 1.5202, 3.2694], + device='cuda:2'), covar=tensor([0.0399, 0.0424, 0.0373, 0.0502, 0.0370, 0.0548, 0.3068, 0.0341], + device='cuda:2'), in_proj_covar=tensor([0.0167, 0.0172, 0.0144, 0.0144, 0.0202, 0.0139, 0.0157, 0.0192], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 07:34:10,917 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=109355.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:34:20,893 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.89 vs. limit=5.0 +2022-12-08 07:34:23,256 INFO [train.py:873] (2/4) Epoch 15, batch 3500, loss[loss=0.1749, simple_loss=0.1528, pruned_loss=0.09844, over 1229.00 frames. ], tot_loss[loss=0.1132, simple_loss=0.1471, pruned_loss=0.03969, over 1918889.54 frames. ], batch size: 100, lr: 5.26e-03, grad_scale: 4.0 +2022-12-08 07:34:32,378 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=109379.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 07:34:40,313 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.249e+02 2.264e+02 2.745e+02 3.644e+02 1.157e+03, threshold=5.490e+02, percent-clipped=6.0 +2022-12-08 07:34:44,039 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=109393.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 07:35:03,660 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.38 vs. limit=2.0 +2022-12-08 07:35:31,954 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=109447.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:35:50,860 INFO [train.py:873] (2/4) Epoch 15, batch 3600, loss[loss=0.09922, simple_loss=0.1357, pruned_loss=0.03139, over 10311.00 frames. ], tot_loss[loss=0.1132, simple_loss=0.1472, pruned_loss=0.03965, over 1969401.94 frames. ], batch size: 100, lr: 5.26e-03, grad_scale: 8.0 +2022-12-08 07:36:08,875 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.258e+02 2.104e+02 2.724e+02 3.497e+02 9.894e+02, threshold=5.448e+02, percent-clipped=5.0 +2022-12-08 07:36:14,319 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=109495.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:36:20,562 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8218, 1.5271, 1.7284, 1.9474, 1.4858, 1.6925, 1.5725, 1.8364], + device='cuda:2'), covar=tensor([0.0233, 0.0320, 0.0216, 0.0182, 0.0291, 0.0405, 0.0301, 0.0218], + device='cuda:2'), in_proj_covar=tensor([0.0290, 0.0255, 0.0371, 0.0328, 0.0269, 0.0302, 0.0308, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 07:37:02,927 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=109550.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:37:19,588 INFO [train.py:873] (2/4) Epoch 15, batch 3700, loss[loss=0.1005, simple_loss=0.1294, pruned_loss=0.03578, over 4976.00 frames. ], tot_loss[loss=0.1141, simple_loss=0.1475, pruned_loss=0.04029, over 1945938.68 frames. ], batch size: 100, lr: 5.26e-03, grad_scale: 8.0 +2022-12-08 07:37:28,835 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=109579.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:37:37,110 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.397e+02 2.068e+02 2.547e+02 3.153e+02 5.371e+02, threshold=5.093e+02, percent-clipped=0.0 +2022-12-08 07:38:13,909 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1945, 2.0544, 2.1243, 2.2015, 2.0906, 2.0826, 2.2601, 1.9539], + device='cuda:2'), covar=tensor([0.0952, 0.1504, 0.0730, 0.0821, 0.1196, 0.0886, 0.0848, 0.0740], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0274, 0.0199, 0.0196, 0.0186, 0.0156, 0.0288, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 07:38:23,086 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=109640.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:38:31,041 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8218, 1.6205, 1.6471, 1.8037, 1.8505, 1.1313, 1.5893, 1.6597], + device='cuda:2'), covar=tensor([0.0513, 0.0766, 0.0620, 0.0565, 0.0840, 0.0832, 0.0589, 0.0600], + device='cuda:2'), in_proj_covar=tensor([0.0032, 0.0032, 0.0035, 0.0030, 0.0031, 0.0044, 0.0032, 0.0035], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:38:32,136 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=109650.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:38:39,885 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=109659.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:38:46,646 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2329, 2.1999, 3.1467, 3.3199, 3.1813, 2.2720, 3.2347, 2.5185], + device='cuda:2'), covar=tensor([0.0470, 0.1142, 0.0836, 0.0480, 0.0516, 0.1413, 0.0457, 0.0962], + device='cuda:2'), in_proj_covar=tensor([0.0289, 0.0254, 0.0369, 0.0325, 0.0267, 0.0300, 0.0306, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 07:38:48,153 INFO [train.py:873] (2/4) Epoch 15, batch 3800, loss[loss=0.1342, simple_loss=0.16, pruned_loss=0.05419, over 13893.00 frames. ], tot_loss[loss=0.1146, simple_loss=0.1482, pruned_loss=0.04053, over 1996470.83 frames. ], batch size: 20, lr: 5.26e-03, grad_scale: 8.0 +2022-12-08 07:38:53,067 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=109674.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 07:39:04,197 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7765, 1.7930, 1.5625, 1.8668, 1.7365, 1.7534, 1.8022, 1.7100], + device='cuda:2'), covar=tensor([0.1492, 0.0993, 0.2122, 0.0737, 0.1328, 0.0760, 0.1518, 0.0972], + device='cuda:2'), in_proj_covar=tensor([0.0278, 0.0291, 0.0263, 0.0279, 0.0324, 0.0304, 0.0259, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:39:06,071 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.110e+02 2.193e+02 2.777e+02 3.474e+02 7.448e+02, threshold=5.555e+02, percent-clipped=3.0 +2022-12-08 07:39:10,084 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=109693.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 07:39:33,975 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=109720.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:39:52,261 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=109741.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 07:39:58,564 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2714, 2.6658, 4.0676, 3.0737, 4.1243, 4.0596, 3.8655, 3.4055], + device='cuda:2'), covar=tensor([0.0729, 0.3146, 0.0954, 0.1692, 0.0755, 0.0873, 0.1228, 0.1665], + device='cuda:2'), in_proj_covar=tensor([0.0352, 0.0315, 0.0392, 0.0298, 0.0373, 0.0322, 0.0359, 0.0301], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:40:16,359 INFO [train.py:873] (2/4) Epoch 15, batch 3900, loss[loss=0.146, simple_loss=0.1619, pruned_loss=0.06508, over 6042.00 frames. ], tot_loss[loss=0.1145, simple_loss=0.1483, pruned_loss=0.04037, over 2005742.86 frames. ], batch size: 100, lr: 5.25e-03, grad_scale: 8.0 +2022-12-08 07:40:34,144 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.655e+02 2.235e+02 2.763e+02 3.267e+02 1.043e+03, threshold=5.525e+02, percent-clipped=3.0 +2022-12-08 07:40:37,838 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1480, 2.1406, 2.1935, 1.8833, 1.8675, 1.9673, 1.8192, 1.9424], + device='cuda:2'), covar=tensor([0.0285, 0.0832, 0.0363, 0.0393, 0.0374, 0.0528, 0.0482, 0.0703], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0021, 0.0018, 0.0020, 0.0019, 0.0032, 0.0026, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:40:39,718 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3256, 3.6113, 2.9206, 4.5868, 4.1630, 4.3712, 3.6951, 3.0228], + device='cuda:2'), covar=tensor([0.0746, 0.1456, 0.3650, 0.0571, 0.0979, 0.1187, 0.1255, 0.3492], + device='cuda:2'), in_proj_covar=tensor([0.0279, 0.0293, 0.0264, 0.0281, 0.0325, 0.0306, 0.0259, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:40:47,761 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8940, 2.7401, 2.7332, 2.9317, 2.8145, 2.8404, 3.0060, 2.4861], + device='cuda:2'), covar=tensor([0.0796, 0.1052, 0.0661, 0.0641, 0.0855, 0.0561, 0.0643, 0.0663], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0272, 0.0198, 0.0195, 0.0185, 0.0156, 0.0287, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 07:40:47,828 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7829, 1.4025, 1.7015, 1.2330, 1.4814, 1.8092, 1.4887, 1.6058], + device='cuda:2'), covar=tensor([0.0795, 0.0858, 0.0633, 0.0753, 0.1490, 0.0741, 0.0908, 0.1572], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0173, 0.0139, 0.0127, 0.0141, 0.0153, 0.0130, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 07:41:28,410 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=109850.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:41:45,245 INFO [train.py:873] (2/4) Epoch 15, batch 4000, loss[loss=0.1139, simple_loss=0.1374, pruned_loss=0.04521, over 4996.00 frames. ], tot_loss[loss=0.1137, simple_loss=0.1477, pruned_loss=0.03984, over 2020450.50 frames. ], batch size: 100, lr: 5.25e-03, grad_scale: 8.0 +2022-12-08 07:41:53,661 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-12-08 07:42:02,601 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.030e+02 2.144e+02 2.556e+02 3.390e+02 6.366e+02, threshold=5.112e+02, percent-clipped=4.0 +2022-12-08 07:42:09,461 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.19 vs. limit=5.0 +2022-12-08 07:42:10,890 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=109898.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:42:43,623 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=109935.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:42:56,987 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=109950.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:43:13,349 INFO [train.py:873] (2/4) Epoch 15, batch 4100, loss[loss=0.1023, simple_loss=0.1441, pruned_loss=0.03031, over 14282.00 frames. ], tot_loss[loss=0.1137, simple_loss=0.1474, pruned_loss=0.03998, over 1933187.51 frames. ], batch size: 35, lr: 5.25e-03, grad_scale: 8.0 +2022-12-08 07:43:18,154 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=109974.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:43:31,528 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.583e+01 2.156e+02 2.609e+02 3.160e+02 7.009e+02, threshold=5.218e+02, percent-clipped=5.0 +2022-12-08 07:43:37,793 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 07:43:39,871 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=109998.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:43:59,014 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=110015.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:44:00,724 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110017.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:44:05,368 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=110022.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:44:23,950 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-12-08 07:44:24,397 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110044.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:44:46,256 INFO [train.py:873] (2/4) Epoch 15, batch 4200, loss[loss=0.09374, simple_loss=0.1393, pruned_loss=0.0241, over 14281.00 frames. ], tot_loss[loss=0.1129, simple_loss=0.1471, pruned_loss=0.03936, over 1951890.03 frames. ], batch size: 25, lr: 5.25e-03, grad_scale: 8.0 +2022-12-08 07:44:54,519 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110078.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:45:03,731 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.112e+02 2.176e+02 2.682e+02 3.403e+02 7.328e+02, threshold=5.365e+02, percent-clipped=6.0 +2022-12-08 07:45:18,556 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110105.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:45:22,721 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110110.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:45:27,150 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110115.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:45:41,753 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6182, 2.4708, 2.1936, 2.3563, 2.5357, 2.5794, 2.5655, 2.5759], + device='cuda:2'), covar=tensor([0.1298, 0.0942, 0.3206, 0.2920, 0.1280, 0.1309, 0.1937, 0.1168], + device='cuda:2'), in_proj_covar=tensor([0.0381, 0.0258, 0.0443, 0.0557, 0.0336, 0.0437, 0.0387, 0.0382], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:46:03,060 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.18 vs. limit=5.0 +2022-12-08 07:46:13,635 INFO [train.py:873] (2/4) Epoch 15, batch 4300, loss[loss=0.1152, simple_loss=0.1506, pruned_loss=0.03988, over 14278.00 frames. ], tot_loss[loss=0.114, simple_loss=0.1476, pruned_loss=0.04023, over 1913715.94 frames. ], batch size: 44, lr: 5.24e-03, grad_scale: 4.0 +2022-12-08 07:46:15,682 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110171.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:46:20,331 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110176.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:46:28,015 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4174, 5.2433, 5.0731, 5.4851, 5.0866, 4.9129, 5.5217, 5.2602], + device='cuda:2'), covar=tensor([0.0613, 0.0652, 0.0659, 0.0409, 0.0630, 0.0365, 0.0447, 0.0500], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0139, 0.0143, 0.0157, 0.0145, 0.0122, 0.0167, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 07:46:32,113 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.012e+02 2.281e+02 2.830e+02 3.301e+02 1.295e+03, threshold=5.661e+02, percent-clipped=5.0 +2022-12-08 07:47:10,793 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1910, 2.6351, 4.3145, 4.4429, 4.3061, 2.5678, 4.5237, 3.5848], + device='cuda:2'), covar=tensor([0.0450, 0.1092, 0.0882, 0.0350, 0.0402, 0.1692, 0.0318, 0.0744], + device='cuda:2'), in_proj_covar=tensor([0.0291, 0.0256, 0.0371, 0.0327, 0.0269, 0.0302, 0.0308, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 07:47:12,453 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=110235.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:47:37,815 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8777, 1.7679, 2.0409, 1.8060, 1.8630, 1.6580, 1.4892, 1.3021], + device='cuda:2'), covar=tensor([0.0192, 0.0569, 0.0244, 0.0273, 0.0197, 0.0330, 0.0309, 0.0403], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0021, 0.0018, 0.0020, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:47:41,819 INFO [train.py:873] (2/4) Epoch 15, batch 4400, loss[loss=0.1029, simple_loss=0.1407, pruned_loss=0.03261, over 14290.00 frames. ], tot_loss[loss=0.1141, simple_loss=0.1479, pruned_loss=0.04014, over 1970167.31 frames. ], batch size: 46, lr: 5.24e-03, grad_scale: 8.0 +2022-12-08 07:47:50,045 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7977, 1.3434, 2.5346, 2.2296, 2.4209, 2.5367, 1.8174, 2.5469], + device='cuda:2'), covar=tensor([0.0982, 0.1270, 0.0200, 0.0472, 0.0462, 0.0235, 0.0699, 0.0242], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0160, 0.0130, 0.0170, 0.0148, 0.0142, 0.0124, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 07:47:54,413 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=110283.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:47:54,493 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110283.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:48:00,235 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.123e+01 2.204e+02 2.570e+02 3.174e+02 5.425e+02, threshold=5.140e+02, percent-clipped=0.0 +2022-12-08 07:48:12,333 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110303.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:48:22,531 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=110315.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:48:25,423 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110318.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:48:48,613 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110344.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:48:50,191 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1569, 2.0504, 1.7799, 1.8979, 2.0721, 2.1039, 2.0858, 2.0511], + device='cuda:2'), covar=tensor([0.1138, 0.0907, 0.2889, 0.2838, 0.1184, 0.1187, 0.1700, 0.1234], + device='cuda:2'), in_proj_covar=tensor([0.0382, 0.0259, 0.0444, 0.0559, 0.0336, 0.0440, 0.0388, 0.0383], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:49:05,185 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=110363.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:49:06,180 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110364.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:49:10,337 INFO [train.py:873] (2/4) Epoch 15, batch 4500, loss[loss=0.109, simple_loss=0.1498, pruned_loss=0.03413, over 14368.00 frames. ], tot_loss[loss=0.113, simple_loss=0.1475, pruned_loss=0.03927, over 2000532.33 frames. ], batch size: 55, lr: 5.24e-03, grad_scale: 8.0 +2022-12-08 07:49:14,176 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=110373.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:49:19,990 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110379.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:49:29,134 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.277e+02 2.023e+02 2.685e+02 3.179e+02 5.561e+02, threshold=5.371e+02, percent-clipped=3.0 +2022-12-08 07:49:37,163 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0038, 1.6658, 2.0029, 1.3791, 1.7901, 2.1270, 1.9290, 1.8241], + device='cuda:2'), covar=tensor([0.0928, 0.0803, 0.1041, 0.1503, 0.1425, 0.0868, 0.0785, 0.1593], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0171, 0.0137, 0.0126, 0.0139, 0.0151, 0.0129, 0.0139], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 07:49:38,330 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=110400.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:49:46,897 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8711, 1.8429, 1.6261, 1.9485, 1.8393, 1.7976, 1.8692, 1.7466], + device='cuda:2'), covar=tensor([0.0985, 0.0855, 0.1762, 0.0676, 0.0947, 0.0677, 0.1283, 0.0908], + device='cuda:2'), in_proj_covar=tensor([0.0280, 0.0292, 0.0264, 0.0282, 0.0326, 0.0305, 0.0258, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:49:51,260 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.51 vs. limit=5.0 +2022-12-08 07:50:36,602 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=110466.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:50:39,119 INFO [train.py:873] (2/4) Epoch 15, batch 4600, loss[loss=0.1063, simple_loss=0.1466, pruned_loss=0.03305, over 14459.00 frames. ], tot_loss[loss=0.1138, simple_loss=0.148, pruned_loss=0.03986, over 1957521.71 frames. ], batch size: 24, lr: 5.24e-03, grad_scale: 4.0 +2022-12-08 07:50:40,986 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=110471.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:50:58,403 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.443e+02 2.355e+02 2.918e+02 3.425e+02 9.705e+02, threshold=5.836e+02, percent-clipped=5.0 +2022-12-08 07:51:07,435 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.38 vs. limit=2.0 +2022-12-08 07:51:23,310 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([6.1604, 5.6090, 5.7562, 6.1533, 5.8071, 4.7976, 6.1042, 5.1127], + device='cuda:2'), covar=tensor([0.0271, 0.0898, 0.0295, 0.0373, 0.0615, 0.0332, 0.0428, 0.0490], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0274, 0.0199, 0.0196, 0.0188, 0.0157, 0.0289, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 07:51:49,548 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7748, 3.0690, 3.0350, 3.0557, 2.3532, 3.1691, 2.9207, 1.5877], + device='cuda:2'), covar=tensor([0.1410, 0.0723, 0.0627, 0.0571, 0.1008, 0.0434, 0.0882, 0.2221], + device='cuda:2'), in_proj_covar=tensor([0.0141, 0.0088, 0.0068, 0.0071, 0.0097, 0.0085, 0.0099, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 07:51:52,862 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6169, 4.3918, 4.0114, 4.1295, 4.4695, 4.5855, 4.7238, 4.5790], + device='cuda:2'), covar=tensor([0.1407, 0.0566, 0.2776, 0.3499, 0.0944, 0.1289, 0.0989, 0.1307], + device='cuda:2'), in_proj_covar=tensor([0.0374, 0.0255, 0.0433, 0.0544, 0.0331, 0.0432, 0.0380, 0.0375], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0003, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:52:06,648 INFO [train.py:873] (2/4) Epoch 15, batch 4700, loss[loss=0.1313, simple_loss=0.1652, pruned_loss=0.04871, over 14154.00 frames. ], tot_loss[loss=0.1144, simple_loss=0.1482, pruned_loss=0.04028, over 1986749.01 frames. ], batch size: 84, lr: 5.23e-03, grad_scale: 4.0 +2022-12-08 07:52:26,207 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.145e+02 2.034e+02 2.470e+02 3.158e+02 1.519e+03, threshold=4.939e+02, percent-clipped=4.0 +2022-12-08 07:52:44,588 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110611.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:53:00,364 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8765, 1.6597, 3.7231, 3.3419, 3.5030, 3.7341, 3.0863, 3.6872], + device='cuda:2'), covar=tensor([0.1617, 0.1606, 0.0126, 0.0318, 0.0247, 0.0150, 0.0295, 0.0158], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0160, 0.0130, 0.0171, 0.0148, 0.0142, 0.0123, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 07:53:09,061 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=110639.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:53:09,538 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-12-08 07:53:22,729 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.49 vs. limit=5.0 +2022-12-08 07:53:26,643 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=110659.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:53:34,855 INFO [train.py:873] (2/4) Epoch 15, batch 4800, loss[loss=0.1155, simple_loss=0.1551, pruned_loss=0.03797, over 14256.00 frames. ], tot_loss[loss=0.1135, simple_loss=0.1473, pruned_loss=0.03985, over 1986887.54 frames. ], batch size: 76, lr: 5.23e-03, grad_scale: 8.0 +2022-12-08 07:53:37,707 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110672.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:53:38,566 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=110673.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:53:39,409 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=110674.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:53:53,745 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.088e+02 2.309e+02 2.792e+02 3.592e+02 7.510e+02, threshold=5.583e+02, percent-clipped=5.0 +2022-12-08 07:54:01,964 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=110700.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:54:09,614 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.87 vs. limit=5.0 +2022-12-08 07:54:20,442 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=110721.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:54:21,909 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.86 vs. limit=2.0 +2022-12-08 07:54:33,612 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110736.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:54:44,441 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=110748.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:54:50,834 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7264, 3.4302, 2.6053, 3.8667, 3.8001, 3.7190, 3.3731, 2.7153], + device='cuda:2'), covar=tensor([0.0814, 0.1240, 0.3234, 0.0573, 0.0898, 0.1266, 0.1076, 0.3057], + device='cuda:2'), in_proj_covar=tensor([0.0283, 0.0295, 0.0266, 0.0283, 0.0328, 0.0308, 0.0259, 0.0248], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:54:59,406 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.36 vs. limit=2.0 +2022-12-08 07:55:00,702 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=110766.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:55:03,184 INFO [train.py:873] (2/4) Epoch 15, batch 4900, loss[loss=0.1091, simple_loss=0.1514, pruned_loss=0.03344, over 14289.00 frames. ], tot_loss[loss=0.1131, simple_loss=0.1471, pruned_loss=0.03961, over 1953526.91 frames. ], batch size: 25, lr: 5.23e-03, grad_scale: 8.0 +2022-12-08 07:55:04,783 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=110771.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:55:19,502 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3412, 1.4358, 3.3484, 1.6349, 3.2001, 3.4232, 2.3987, 3.6569], + device='cuda:2'), covar=tensor([0.0251, 0.3206, 0.0413, 0.2310, 0.0926, 0.0419, 0.1011, 0.0210], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0155, 0.0159, 0.0170, 0.0168, 0.0178, 0.0133, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:55:22,715 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.185e+02 1.962e+02 2.570e+02 3.049e+02 5.982e+02, threshold=5.139e+02, percent-clipped=2.0 +2022-12-08 07:55:28,568 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110797.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 07:55:38,245 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8187, 2.4695, 3.1475, 2.0410, 1.9846, 2.7631, 1.6800, 2.6510], + device='cuda:2'), covar=tensor([0.0953, 0.1263, 0.0612, 0.2090, 0.2335, 0.0846, 0.3127, 0.0983], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0101, 0.0093, 0.0099, 0.0116, 0.0088, 0.0120, 0.0092], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 07:55:43,508 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=110814.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:55:48,356 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=110819.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:56:31,189 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110868.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:56:31,860 INFO [train.py:873] (2/4) Epoch 15, batch 5000, loss[loss=0.1183, simple_loss=0.1425, pruned_loss=0.04704, over 4971.00 frames. ], tot_loss[loss=0.1131, simple_loss=0.1471, pruned_loss=0.03958, over 1956366.69 frames. ], batch size: 100, lr: 5.23e-03, grad_scale: 4.0 +2022-12-08 07:56:50,267 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-12-08 07:56:52,356 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.709e+01 2.105e+02 2.624e+02 3.225e+02 7.187e+02, threshold=5.248e+02, percent-clipped=2.0 +2022-12-08 07:57:25,027 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=110929.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:57:30,536 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2601, 1.7030, 2.4455, 2.0148, 2.2653, 1.6414, 1.9832, 2.2490], + device='cuda:2'), covar=tensor([0.2061, 0.3600, 0.0723, 0.1910, 0.1136, 0.2181, 0.1126, 0.0897], + device='cuda:2'), in_proj_covar=tensor([0.0250, 0.0204, 0.0212, 0.0274, 0.0230, 0.0207, 0.0205, 0.0215], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:57:33,690 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=110939.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:57:34,721 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0852, 2.0822, 1.9374, 2.0607, 1.9312, 1.8934, 1.7093, 1.7196], + device='cuda:2'), covar=tensor([0.0242, 0.0392, 0.0433, 0.0283, 0.0282, 0.0458, 0.0368, 0.0485], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0018, 0.0019, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 07:57:44,149 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.95 vs. limit=5.0 +2022-12-08 07:57:50,996 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=110958.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:57:51,663 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=110959.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:57:58,605 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=110967.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:58:00,303 INFO [train.py:873] (2/4) Epoch 15, batch 5100, loss[loss=0.1063, simple_loss=0.1459, pruned_loss=0.03338, over 14259.00 frames. ], tot_loss[loss=0.1134, simple_loss=0.1474, pruned_loss=0.03968, over 1975970.27 frames. ], batch size: 76, lr: 5.22e-03, grad_scale: 4.0 +2022-12-08 07:58:04,959 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=110974.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:58:16,272 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=110987.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:58:20,532 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.198e+02 2.177e+02 2.703e+02 3.385e+02 6.811e+02, threshold=5.407e+02, percent-clipped=6.0 +2022-12-08 07:58:33,413 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=111007.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:58:44,088 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=111019.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:58:46,940 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=111022.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 07:58:58,774 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5269, 4.3138, 4.0470, 4.1770, 4.3468, 4.4535, 4.5363, 4.5104], + device='cuda:2'), covar=tensor([0.0827, 0.0499, 0.2130, 0.2774, 0.0806, 0.0847, 0.0876, 0.0843], + device='cuda:2'), in_proj_covar=tensor([0.0383, 0.0264, 0.0444, 0.0560, 0.0342, 0.0443, 0.0389, 0.0383], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:59:05,924 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8891, 2.2070, 2.2481, 2.3063, 1.9878, 2.2366, 2.0650, 1.4616], + device='cuda:2'), covar=tensor([0.1177, 0.1078, 0.0736, 0.0568, 0.1017, 0.0820, 0.1079, 0.1934], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0088, 0.0069, 0.0072, 0.0097, 0.0086, 0.0100, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 07:59:28,414 INFO [train.py:873] (2/4) Epoch 15, batch 5200, loss[loss=0.101, simple_loss=0.1427, pruned_loss=0.02972, over 14616.00 frames. ], tot_loss[loss=0.114, simple_loss=0.1479, pruned_loss=0.04005, over 1995960.34 frames. ], batch size: 33, lr: 5.22e-03, grad_scale: 8.0 +2022-12-08 07:59:44,059 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8818, 2.6836, 2.4250, 2.5631, 2.8234, 2.8333, 2.8297, 2.8220], + device='cuda:2'), covar=tensor([0.1122, 0.0842, 0.2428, 0.2801, 0.0980, 0.1192, 0.1415, 0.1011], + device='cuda:2'), in_proj_covar=tensor([0.0382, 0.0264, 0.0444, 0.0560, 0.0342, 0.0443, 0.0389, 0.0383], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 07:59:45,326 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 07:59:48,545 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.014e+01 2.007e+02 2.591e+02 3.079e+02 5.022e+02, threshold=5.182e+02, percent-clipped=0.0 +2022-12-08 07:59:48,653 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=111092.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 08:00:30,499 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4325, 4.0966, 3.9696, 4.4635, 4.1253, 3.9085, 4.4518, 3.7535], + device='cuda:2'), covar=tensor([0.0404, 0.0939, 0.0439, 0.0389, 0.0781, 0.0893, 0.0513, 0.0527], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0270, 0.0197, 0.0194, 0.0185, 0.0155, 0.0285, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 08:00:54,708 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0604, 1.3086, 1.3238, 1.0009, 0.8815, 1.1642, 0.9038, 1.2384], + device='cuda:2'), covar=tensor([0.1874, 0.2434, 0.0999, 0.2325, 0.2831, 0.1277, 0.1950, 0.1163], + device='cuda:2'), in_proj_covar=tensor([0.0083, 0.0100, 0.0092, 0.0098, 0.0114, 0.0087, 0.0119, 0.0091], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 08:00:57,147 INFO [train.py:873] (2/4) Epoch 15, batch 5300, loss[loss=0.1134, simple_loss=0.1412, pruned_loss=0.04273, over 6003.00 frames. ], tot_loss[loss=0.113, simple_loss=0.1473, pruned_loss=0.03936, over 2035598.65 frames. ], batch size: 100, lr: 5.22e-03, grad_scale: 4.0 +2022-12-08 08:01:00,065 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8088, 4.4178, 4.2951, 4.8221, 4.4400, 4.2078, 4.7934, 4.0177], + device='cuda:2'), covar=tensor([0.0382, 0.0991, 0.0397, 0.0378, 0.0775, 0.0680, 0.0494, 0.0515], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0271, 0.0197, 0.0194, 0.0185, 0.0155, 0.0285, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 08:01:18,476 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.451e+02 2.026e+02 2.377e+02 2.924e+02 1.186e+03, threshold=4.754e+02, percent-clipped=4.0 +2022-12-08 08:01:39,083 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5044, 3.2065, 2.5625, 3.6143, 3.5206, 3.4929, 3.0777, 2.5353], + device='cuda:2'), covar=tensor([0.0882, 0.1344, 0.2933, 0.0601, 0.0790, 0.1076, 0.1296, 0.2671], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0292, 0.0260, 0.0278, 0.0321, 0.0302, 0.0255, 0.0243], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:01:46,175 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=111224.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:01:46,998 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=111225.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:02:08,611 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9333, 2.6992, 2.7348, 2.9415, 2.8120, 2.8456, 2.9866, 2.4578], + device='cuda:2'), covar=tensor([0.0690, 0.1108, 0.0598, 0.0618, 0.0889, 0.0528, 0.0692, 0.0685], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0270, 0.0197, 0.0194, 0.0184, 0.0154, 0.0285, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 08:02:24,007 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=111267.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:02:25,564 INFO [train.py:873] (2/4) Epoch 15, batch 5400, loss[loss=0.1146, simple_loss=0.1568, pruned_loss=0.03621, over 14192.00 frames. ], tot_loss[loss=0.1136, simple_loss=0.1474, pruned_loss=0.03993, over 1976839.55 frames. ], batch size: 35, lr: 5.22e-03, grad_scale: 4.0 +2022-12-08 08:02:34,562 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.40 vs. limit=5.0 +2022-12-08 08:02:40,907 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=111286.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:02:46,809 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.248e+02 2.071e+02 2.546e+02 3.124e+02 5.888e+02, threshold=5.092e+02, percent-clipped=1.0 +2022-12-08 08:02:49,760 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9709, 2.9305, 2.3623, 2.4626, 2.9333, 2.9935, 3.0620, 3.0025], + device='cuda:2'), covar=tensor([0.1598, 0.0883, 0.3290, 0.4382, 0.1368, 0.1483, 0.1488, 0.1440], + device='cuda:2'), in_proj_covar=tensor([0.0380, 0.0262, 0.0443, 0.0558, 0.0340, 0.0442, 0.0387, 0.0384], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:03:05,550 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=111314.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:03:06,439 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=111315.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:03:07,386 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5022, 1.3999, 3.5326, 1.6425, 3.4085, 3.5951, 2.5324, 3.8887], + device='cuda:2'), covar=tensor([0.0238, 0.3212, 0.0384, 0.2245, 0.0691, 0.0435, 0.0886, 0.0178], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0155, 0.0159, 0.0170, 0.0167, 0.0179, 0.0132, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:03:53,190 INFO [train.py:873] (2/4) Epoch 15, batch 5500, loss[loss=0.145, simple_loss=0.1674, pruned_loss=0.06133, over 10347.00 frames. ], tot_loss[loss=0.1127, simple_loss=0.147, pruned_loss=0.03917, over 1997674.02 frames. ], batch size: 100, lr: 5.21e-03, grad_scale: 4.0 +2022-12-08 08:04:13,967 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=111392.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 08:04:14,625 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.096e+02 2.065e+02 2.464e+02 3.206e+02 5.854e+02, threshold=4.928e+02, percent-clipped=3.0 +2022-12-08 08:04:17,479 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9902, 3.7475, 3.6723, 4.0339, 3.7751, 3.5635, 4.0726, 3.3702], + device='cuda:2'), covar=tensor([0.0563, 0.0842, 0.0463, 0.0434, 0.0794, 0.1371, 0.0516, 0.0571], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0272, 0.0197, 0.0195, 0.0187, 0.0155, 0.0287, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 08:04:56,359 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=111440.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:04:59,443 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.42 vs. limit=5.0 +2022-12-08 08:05:20,914 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=111468.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:05:21,570 INFO [train.py:873] (2/4) Epoch 15, batch 5600, loss[loss=0.1392, simple_loss=0.1611, pruned_loss=0.05862, over 14516.00 frames. ], tot_loss[loss=0.1137, simple_loss=0.1477, pruned_loss=0.03982, over 1996379.94 frames. ], batch size: 49, lr: 5.21e-03, grad_scale: 8.0 +2022-12-08 08:05:42,277 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.367e+02 2.013e+02 2.578e+02 3.220e+02 8.105e+02, threshold=5.156e+02, percent-clipped=4.0 +2022-12-08 08:05:58,793 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.30 vs. limit=5.0 +2022-12-08 08:06:10,293 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=111524.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:06:14,507 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=111529.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:06:48,896 INFO [train.py:873] (2/4) Epoch 15, batch 5700, loss[loss=0.1871, simple_loss=0.1608, pruned_loss=0.1067, over 1253.00 frames. ], tot_loss[loss=0.1138, simple_loss=0.1477, pruned_loss=0.03997, over 1963259.19 frames. ], batch size: 100, lr: 5.21e-03, grad_scale: 4.0 +2022-12-08 08:06:51,468 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=111572.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:06:59,438 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=111581.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:07:10,365 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.509e+01 2.357e+02 2.792e+02 3.438e+02 6.748e+02, threshold=5.583e+02, percent-clipped=1.0 +2022-12-08 08:07:28,709 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=111614.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:07:40,377 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.09 vs. limit=2.0 +2022-12-08 08:07:52,120 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4774, 3.2142, 2.6204, 3.6244, 3.5147, 3.5351, 3.1243, 2.4732], + device='cuda:2'), covar=tensor([0.0815, 0.1353, 0.2787, 0.0674, 0.1034, 0.1082, 0.1179, 0.3109], + device='cuda:2'), in_proj_covar=tensor([0.0280, 0.0293, 0.0261, 0.0281, 0.0325, 0.0302, 0.0256, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:08:04,541 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.69 vs. limit=5.0 +2022-12-08 08:08:10,712 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=111662.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:08:17,304 INFO [train.py:873] (2/4) Epoch 15, batch 5800, loss[loss=0.1111, simple_loss=0.1506, pruned_loss=0.0358, over 14285.00 frames. ], tot_loss[loss=0.1126, simple_loss=0.1469, pruned_loss=0.03921, over 1941863.65 frames. ], batch size: 31, lr: 5.21e-03, grad_scale: 4.0 +2022-12-08 08:08:28,861 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4909, 3.3064, 2.6365, 3.6230, 3.5285, 3.5193, 3.1467, 2.5760], + device='cuda:2'), covar=tensor([0.0869, 0.1181, 0.2981, 0.0693, 0.0849, 0.1105, 0.1188, 0.2998], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0293, 0.0261, 0.0281, 0.0325, 0.0302, 0.0257, 0.0247], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:08:38,928 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.252e+02 2.211e+02 2.861e+02 3.266e+02 8.310e+02, threshold=5.722e+02, percent-clipped=4.0 +2022-12-08 08:08:59,994 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4138, 3.8517, 3.4799, 3.5244, 2.7039, 3.7150, 3.5502, 1.8659], + device='cuda:2'), covar=tensor([0.1463, 0.0504, 0.1163, 0.0839, 0.0942, 0.0351, 0.0986, 0.2091], + device='cuda:2'), in_proj_covar=tensor([0.0142, 0.0089, 0.0069, 0.0072, 0.0098, 0.0086, 0.0100, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 08:09:18,116 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7804, 2.7686, 2.6326, 2.8897, 2.4629, 2.5550, 2.8487, 2.7757], + device='cuda:2'), covar=tensor([0.0836, 0.1176, 0.0908, 0.0777, 0.1150, 0.0792, 0.0929, 0.0907], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0141, 0.0145, 0.0159, 0.0146, 0.0122, 0.0167, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 08:09:44,964 INFO [train.py:873] (2/4) Epoch 15, batch 5900, loss[loss=0.1209, simple_loss=0.1518, pruned_loss=0.04498, over 9494.00 frames. ], tot_loss[loss=0.1128, simple_loss=0.147, pruned_loss=0.03933, over 1886398.09 frames. ], batch size: 100, lr: 5.21e-03, grad_scale: 4.0 +2022-12-08 08:10:04,851 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=111791.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 08:10:07,183 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.002e+02 2.082e+02 2.433e+02 3.039e+02 5.689e+02, threshold=4.867e+02, percent-clipped=0.0 +2022-12-08 08:10:33,500 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=111824.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:10:33,588 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6727, 1.6069, 1.7682, 1.5462, 1.4497, 1.4601, 1.3734, 1.2774], + device='cuda:2'), covar=tensor([0.0174, 0.0190, 0.0169, 0.0177, 0.0193, 0.0320, 0.0243, 0.0375], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0021, 0.0018, 0.0019, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 08:10:57,509 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=111852.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 08:11:12,396 INFO [train.py:873] (2/4) Epoch 15, batch 6000, loss[loss=0.1036, simple_loss=0.1445, pruned_loss=0.03138, over 14269.00 frames. ], tot_loss[loss=0.1115, simple_loss=0.1459, pruned_loss=0.03852, over 1930151.74 frames. ], batch size: 46, lr: 5.20e-03, grad_scale: 8.0 +2022-12-08 08:11:12,397 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 08:11:18,771 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0215, 4.9387, 4.7601, 5.1051, 4.3841, 4.7903, 5.1663, 4.6928], + device='cuda:2'), covar=tensor([0.0527, 0.0443, 0.0950, 0.0692, 0.1577, 0.0333, 0.0418, 0.0879], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0140, 0.0145, 0.0158, 0.0145, 0.0122, 0.0166, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 08:11:19,952 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6858, 4.1588, 3.4620, 4.9204, 4.5982, 4.8032, 4.2601, 3.4664], + device='cuda:2'), covar=tensor([0.0430, 0.0958, 0.3344, 0.0510, 0.0638, 0.0607, 0.0916, 0.3337], + device='cuda:2'), in_proj_covar=tensor([0.0278, 0.0290, 0.0259, 0.0280, 0.0322, 0.0301, 0.0255, 0.0246], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:11:20,777 INFO [train.py:905] (2/4) Epoch 15, validation: loss=0.1363, simple_loss=0.1737, pruned_loss=0.04946, over 857387.00 frames. +2022-12-08 08:11:20,778 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 08:11:31,601 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=111881.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:11:42,937 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.209e+02 2.150e+02 2.606e+02 3.231e+02 6.699e+02, threshold=5.212e+02, percent-clipped=6.0 +2022-12-08 08:11:50,257 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5376, 1.5023, 1.5950, 1.4146, 1.3226, 1.3969, 1.2656, 1.1812], + device='cuda:2'), covar=tensor([0.0197, 0.0276, 0.0168, 0.0189, 0.0197, 0.0301, 0.0222, 0.0343], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0021, 0.0018, 0.0019, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 08:12:13,674 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=111929.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:12:15,759 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4294, 2.5164, 2.5866, 2.5259, 2.5161, 2.1635, 1.5334, 2.2697], + device='cuda:2'), covar=tensor([0.0603, 0.0452, 0.0392, 0.0411, 0.0400, 0.1441, 0.2417, 0.0413], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0176, 0.0145, 0.0147, 0.0207, 0.0142, 0.0160, 0.0196], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 08:12:30,681 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2922, 1.4184, 3.3217, 1.4920, 3.0950, 3.3963, 2.3888, 3.5808], + device='cuda:2'), covar=tensor([0.0292, 0.3440, 0.0502, 0.2593, 0.1080, 0.0463, 0.1037, 0.0277], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0157, 0.0160, 0.0170, 0.0168, 0.0180, 0.0133, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:12:48,570 INFO [train.py:873] (2/4) Epoch 15, batch 6100, loss[loss=0.1012, simple_loss=0.1459, pruned_loss=0.02822, over 14234.00 frames. ], tot_loss[loss=0.112, simple_loss=0.1463, pruned_loss=0.03886, over 1931469.94 frames. ], batch size: 37, lr: 5.20e-03, grad_scale: 4.0 +2022-12-08 08:13:10,955 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.382e+02 2.212e+02 2.710e+02 3.481e+02 4.963e+02, threshold=5.420e+02, percent-clipped=0.0 +2022-12-08 08:13:14,081 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5383, 2.2769, 2.7244, 1.7279, 1.9000, 2.3464, 1.4473, 2.3024], + device='cuda:2'), covar=tensor([0.0749, 0.1268, 0.0760, 0.2537, 0.2213, 0.1148, 0.3431, 0.1061], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0101, 0.0094, 0.0100, 0.0116, 0.0090, 0.0121, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 08:13:35,167 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8233, 0.8049, 0.7507, 0.8331, 0.8169, 0.5749, 0.4872, 0.7030], + device='cuda:2'), covar=tensor([0.0143, 0.0141, 0.0137, 0.0136, 0.0142, 0.0276, 0.0177, 0.0249], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0020, 0.0018, 0.0019, 0.0019, 0.0031, 0.0025, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 08:14:13,869 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=112066.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:14:16,351 INFO [train.py:873] (2/4) Epoch 15, batch 6200, loss[loss=0.1145, simple_loss=0.1475, pruned_loss=0.04075, over 13886.00 frames. ], tot_loss[loss=0.1135, simple_loss=0.1469, pruned_loss=0.04001, over 1906358.42 frames. ], batch size: 19, lr: 5.20e-03, grad_scale: 4.0 +2022-12-08 08:14:39,267 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.058e+02 2.103e+02 2.700e+02 3.313e+02 6.071e+02, threshold=5.399e+02, percent-clipped=3.0 +2022-12-08 08:15:05,075 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=112124.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:15:07,689 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=112127.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:15:23,626 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1357, 4.8811, 4.7054, 5.1413, 4.8253, 4.3539, 5.2009, 4.8988], + device='cuda:2'), covar=tensor([0.0631, 0.0798, 0.0841, 0.0719, 0.0808, 0.0665, 0.0583, 0.0849], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0141, 0.0145, 0.0159, 0.0146, 0.0123, 0.0168, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 08:15:25,364 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=112147.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 08:15:34,291 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8753, 2.0972, 3.9262, 2.8221, 3.7264, 1.9995, 2.8900, 3.7651], + device='cuda:2'), covar=tensor([0.0634, 0.3924, 0.0515, 0.5183, 0.0681, 0.3294, 0.1474, 0.0470], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0204, 0.0214, 0.0274, 0.0232, 0.0205, 0.0207, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:15:44,577 INFO [train.py:873] (2/4) Epoch 15, batch 6300, loss[loss=0.1217, simple_loss=0.1449, pruned_loss=0.04922, over 4929.00 frames. ], tot_loss[loss=0.1132, simple_loss=0.1468, pruned_loss=0.03979, over 1880134.31 frames. ], batch size: 100, lr: 5.20e-03, grad_scale: 4.0 +2022-12-08 08:15:47,438 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=112172.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:16:01,009 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1649, 1.5758, 2.4481, 1.9535, 2.2538, 1.6244, 1.8622, 2.1079], + device='cuda:2'), covar=tensor([0.2252, 0.3537, 0.0701, 0.1923, 0.1549, 0.2148, 0.1247, 0.1324], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0205, 0.0214, 0.0275, 0.0233, 0.0205, 0.0207, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:16:07,628 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.242e+02 2.167e+02 2.529e+02 3.043e+02 7.690e+02, threshold=5.057e+02, percent-clipped=2.0 +2022-12-08 08:16:35,853 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5628, 4.6702, 5.0190, 4.1744, 4.7135, 5.0328, 1.8981, 4.3468], + device='cuda:2'), covar=tensor([0.0258, 0.0314, 0.0269, 0.0399, 0.0287, 0.0190, 0.2964, 0.0266], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0174, 0.0144, 0.0146, 0.0205, 0.0140, 0.0159, 0.0194], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 08:16:47,895 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9309, 1.3612, 2.0056, 1.3070, 1.9476, 2.0679, 1.7876, 2.1291], + device='cuda:2'), covar=tensor([0.0322, 0.1967, 0.0519, 0.1825, 0.0563, 0.0571, 0.0879, 0.0394], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0156, 0.0161, 0.0168, 0.0167, 0.0179, 0.0132, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:17:13,385 INFO [train.py:873] (2/4) Epoch 15, batch 6400, loss[loss=0.1065, simple_loss=0.1433, pruned_loss=0.03489, over 13530.00 frames. ], tot_loss[loss=0.1122, simple_loss=0.1462, pruned_loss=0.03917, over 1897427.35 frames. ], batch size: 100, lr: 5.19e-03, grad_scale: 8.0 +2022-12-08 08:17:35,983 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.228e+02 2.167e+02 2.735e+02 3.656e+02 1.132e+03, threshold=5.470e+02, percent-clipped=11.0 +2022-12-08 08:17:55,269 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6121, 2.2799, 2.9741, 1.8183, 1.9319, 2.6730, 1.5406, 2.6969], + device='cuda:2'), covar=tensor([0.1133, 0.1691, 0.0671, 0.2186, 0.2336, 0.0925, 0.3099, 0.1115], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0102, 0.0094, 0.0100, 0.0117, 0.0090, 0.0121, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 08:18:14,310 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8841, 1.5035, 3.1292, 2.8573, 2.9765, 3.1625, 2.2713, 3.0851], + device='cuda:2'), covar=tensor([0.1306, 0.1502, 0.0161, 0.0335, 0.0334, 0.0168, 0.0539, 0.0211], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0157, 0.0130, 0.0169, 0.0145, 0.0141, 0.0124, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 08:18:29,145 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8180, 1.5697, 2.0842, 1.6386, 1.8871, 1.4181, 1.6519, 1.9208], + device='cuda:2'), covar=tensor([0.3035, 0.2474, 0.0655, 0.1629, 0.1196, 0.1413, 0.1141, 0.0783], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0208, 0.0217, 0.0279, 0.0235, 0.0209, 0.0209, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 08:18:41,070 INFO [train.py:873] (2/4) Epoch 15, batch 6500, loss[loss=0.09708, simple_loss=0.1321, pruned_loss=0.031, over 13997.00 frames. ], tot_loss[loss=0.1129, simple_loss=0.1462, pruned_loss=0.03982, over 1871172.18 frames. ], batch size: 19, lr: 5.19e-03, grad_scale: 8.0 +2022-12-08 08:18:59,392 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=112389.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:19:04,119 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.333e+02 2.163e+02 2.578e+02 3.269e+02 5.158e+02, threshold=5.155e+02, percent-clipped=0.0 +2022-12-08 08:19:28,069 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=112422.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:19:36,693 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.10 vs. limit=2.0 +2022-12-08 08:19:50,529 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=112447.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 08:19:53,050 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=112450.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:20:09,726 INFO [train.py:873] (2/4) Epoch 15, batch 6600, loss[loss=0.1173, simple_loss=0.1529, pruned_loss=0.04085, over 6932.00 frames. ], tot_loss[loss=0.1125, simple_loss=0.146, pruned_loss=0.03946, over 1900663.31 frames. ], batch size: 100, lr: 5.19e-03, grad_scale: 8.0 +2022-12-08 08:20:32,877 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 2.043e+02 2.558e+02 3.259e+02 6.600e+02, threshold=5.115e+02, percent-clipped=3.0 +2022-12-08 08:20:32,962 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=112495.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 08:21:32,161 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=112562.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:21:37,743 INFO [train.py:873] (2/4) Epoch 15, batch 6700, loss[loss=0.1038, simple_loss=0.1388, pruned_loss=0.03444, over 14234.00 frames. ], tot_loss[loss=0.112, simple_loss=0.146, pruned_loss=0.03897, over 1931416.62 frames. ], batch size: 69, lr: 5.19e-03, grad_scale: 8.0 +2022-12-08 08:21:44,892 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=112577.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:21:56,174 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5280, 3.1882, 3.0684, 2.1468, 2.9468, 3.1797, 3.5471, 2.8298], + device='cuda:2'), covar=tensor([0.0672, 0.1281, 0.0952, 0.1540, 0.1054, 0.0743, 0.0642, 0.1187], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0174, 0.0138, 0.0126, 0.0140, 0.0153, 0.0132, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 08:22:01,003 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.453e+02 2.231e+02 2.690e+02 3.411e+02 7.258e+02, threshold=5.380e+02, percent-clipped=8.0 +2022-12-08 08:22:16,157 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.90 vs. limit=5.0 +2022-12-08 08:22:20,696 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=112617.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:22:21,161 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.63 vs. limit=2.0 +2022-12-08 08:22:26,042 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=112623.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:22:39,578 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=112638.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 08:23:06,603 INFO [train.py:873] (2/4) Epoch 15, batch 6800, loss[loss=0.09706, simple_loss=0.137, pruned_loss=0.02858, over 13949.00 frames. ], tot_loss[loss=0.1126, simple_loss=0.1463, pruned_loss=0.0394, over 1943965.71 frames. ], batch size: 23, lr: 5.18e-03, grad_scale: 8.0 +2022-12-08 08:23:14,340 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=112678.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 08:23:28,860 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.402e+02 2.249e+02 2.719e+02 3.532e+02 9.106e+02, threshold=5.438e+02, percent-clipped=9.0 +2022-12-08 08:23:33,035 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.53 vs. limit=5.0 +2022-12-08 08:23:52,943 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=112722.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:24:00,693 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8234, 3.6694, 3.2734, 2.4239, 3.3142, 3.5671, 3.8928, 3.1950], + device='cuda:2'), covar=tensor([0.0672, 0.1005, 0.1055, 0.1510, 0.1058, 0.0800, 0.1053, 0.1247], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0175, 0.0140, 0.0127, 0.0142, 0.0155, 0.0133, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 08:24:03,828 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-08 08:24:13,151 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=112745.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:24:16,275 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-12-08 08:24:34,753 INFO [train.py:873] (2/4) Epoch 15, batch 6900, loss[loss=0.1058, simple_loss=0.1439, pruned_loss=0.03385, over 14281.00 frames. ], tot_loss[loss=0.1122, simple_loss=0.1462, pruned_loss=0.03909, over 1954478.01 frames. ], batch size: 60, lr: 5.18e-03, grad_scale: 8.0 +2022-12-08 08:24:35,722 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=112770.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:24:57,285 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.416e+02 2.169e+02 2.514e+02 3.179e+02 8.231e+02, threshold=5.028e+02, percent-clipped=2.0 +2022-12-08 08:25:04,985 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1078, 1.8677, 1.9128, 1.9073, 1.9093, 1.2566, 2.0158, 2.1487], + device='cuda:2'), covar=tensor([0.1080, 0.1231, 0.1173, 0.1700, 0.1745, 0.0997, 0.0834, 0.1430], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0033, 0.0036, 0.0031, 0.0033, 0.0047, 0.0034, 0.0037], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 08:25:39,176 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7228, 1.4781, 3.4478, 3.1907, 3.2618, 3.4869, 2.5152, 3.4187], + device='cuda:2'), covar=tensor([0.2174, 0.2328, 0.0243, 0.0455, 0.0409, 0.0251, 0.0791, 0.0253], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0160, 0.0132, 0.0172, 0.0148, 0.0143, 0.0126, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 08:26:02,335 INFO [train.py:873] (2/4) Epoch 15, batch 7000, loss[loss=0.2019, simple_loss=0.1741, pruned_loss=0.1149, over 1220.00 frames. ], tot_loss[loss=0.1125, simple_loss=0.1468, pruned_loss=0.0391, over 2027793.76 frames. ], batch size: 100, lr: 5.18e-03, grad_scale: 8.0 +2022-12-08 08:26:09,556 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.6640, 5.1852, 5.0051, 5.5944, 5.1407, 4.6823, 5.5601, 4.6873], + device='cuda:2'), covar=tensor([0.0325, 0.0824, 0.0344, 0.0351, 0.0799, 0.0346, 0.0448, 0.0452], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0266, 0.0194, 0.0190, 0.0183, 0.0153, 0.0278, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 08:26:25,454 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.347e+02 2.195e+02 2.715e+02 3.350e+02 5.262e+02, threshold=5.431e+02, percent-clipped=3.0 +2022-12-08 08:26:46,180 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=112918.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:26:49,251 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=112921.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:26:55,142 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 08:26:59,926 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=112933.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 08:27:31,512 INFO [train.py:873] (2/4) Epoch 15, batch 7100, loss[loss=0.09605, simple_loss=0.1347, pruned_loss=0.02868, over 14235.00 frames. ], tot_loss[loss=0.1116, simple_loss=0.1463, pruned_loss=0.03838, over 2050042.43 frames. ], batch size: 63, lr: 5.18e-03, grad_scale: 8.0 +2022-12-08 08:27:35,233 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=112973.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 08:27:43,081 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=112982.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:27:54,134 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.485e+02 2.181e+02 2.817e+02 3.355e+02 7.638e+02, threshold=5.634e+02, percent-clipped=2.0 +2022-12-08 08:28:15,457 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-12-08 08:28:18,152 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1155, 1.9332, 2.0731, 2.1137, 2.0578, 2.0305, 2.1975, 1.8596], + device='cuda:2'), covar=tensor([0.1019, 0.1323, 0.0711, 0.0870, 0.1010, 0.0779, 0.0858, 0.0699], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0266, 0.0193, 0.0189, 0.0182, 0.0153, 0.0278, 0.0166], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 08:28:38,901 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=113045.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:28:59,914 INFO [train.py:873] (2/4) Epoch 15, batch 7200, loss[loss=0.1304, simple_loss=0.1504, pruned_loss=0.0552, over 5940.00 frames. ], tot_loss[loss=0.1125, simple_loss=0.1469, pruned_loss=0.03905, over 2037019.85 frames. ], batch size: 100, lr: 5.18e-03, grad_scale: 8.0 +2022-12-08 08:29:21,098 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=113093.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:29:23,117 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.106e+02 2.228e+02 2.733e+02 3.389e+02 6.488e+02, threshold=5.466e+02, percent-clipped=1.0 +2022-12-08 08:29:53,623 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3305, 2.9610, 2.9301, 2.0819, 2.8580, 3.0547, 3.3565, 2.7659], + device='cuda:2'), covar=tensor([0.0765, 0.1041, 0.0930, 0.1455, 0.0950, 0.0629, 0.0732, 0.1150], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0175, 0.0139, 0.0127, 0.0142, 0.0155, 0.0131, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 08:30:00,579 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4229, 5.3417, 5.1183, 5.4527, 5.1321, 4.9435, 5.5491, 5.2340], + device='cuda:2'), covar=tensor([0.0657, 0.0726, 0.0661, 0.0551, 0.0672, 0.0492, 0.0494, 0.0644], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0142, 0.0146, 0.0161, 0.0147, 0.0124, 0.0168, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 08:30:28,629 INFO [train.py:873] (2/4) Epoch 15, batch 7300, loss[loss=0.1251, simple_loss=0.1426, pruned_loss=0.05381, over 4993.00 frames. ], tot_loss[loss=0.1122, simple_loss=0.1466, pruned_loss=0.03888, over 1982344.10 frames. ], batch size: 100, lr: 5.17e-03, grad_scale: 4.0 +2022-12-08 08:30:52,686 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.169e+02 2.057e+02 2.452e+02 3.236e+02 1.011e+03, threshold=4.903e+02, percent-clipped=3.0 +2022-12-08 08:30:53,726 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9689, 1.5677, 3.3109, 3.0009, 3.1841, 3.3287, 2.7048, 3.2778], + device='cuda:2'), covar=tensor([0.1550, 0.1708, 0.0153, 0.0377, 0.0318, 0.0188, 0.0385, 0.0210], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0160, 0.0132, 0.0172, 0.0148, 0.0143, 0.0125, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 08:31:05,519 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7838, 3.6172, 3.5298, 3.8728, 3.4210, 3.2900, 3.8956, 3.7624], + device='cuda:2'), covar=tensor([0.0669, 0.0975, 0.0885, 0.0612, 0.1097, 0.0699, 0.0586, 0.0699], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0140, 0.0145, 0.0159, 0.0145, 0.0122, 0.0166, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 08:31:12,614 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=113218.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:31:25,889 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=113233.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:31:26,810 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=113234.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:31:55,291 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=113266.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:31:57,795 INFO [train.py:873] (2/4) Epoch 15, batch 7400, loss[loss=0.1167, simple_loss=0.1501, pruned_loss=0.04169, over 14264.00 frames. ], tot_loss[loss=0.1116, simple_loss=0.1461, pruned_loss=0.0385, over 1996795.15 frames. ], batch size: 76, lr: 5.17e-03, grad_scale: 4.0 +2022-12-08 08:32:01,586 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=113273.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 08:32:05,482 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=113277.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:32:05,581 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9077, 1.6403, 1.8659, 1.6393, 1.9500, 1.7230, 1.5683, 1.8121], + device='cuda:2'), covar=tensor([0.0575, 0.1377, 0.0472, 0.0565, 0.0569, 0.0849, 0.0337, 0.0494], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0314, 0.0398, 0.0301, 0.0373, 0.0327, 0.0364, 0.0303], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:32:08,865 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=113281.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:32:21,750 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=113295.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:32:22,353 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.108e+02 1.996e+02 2.516e+02 3.094e+02 5.431e+02, threshold=5.033e+02, percent-clipped=1.0 +2022-12-08 08:32:44,274 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=113321.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:32:47,355 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1950, 1.9577, 2.1943, 2.2920, 1.9549, 1.9750, 2.1691, 2.0718], + device='cuda:2'), covar=tensor([0.0337, 0.0567, 0.0350, 0.0322, 0.0548, 0.0814, 0.0448, 0.0425], + device='cuda:2'), in_proj_covar=tensor([0.0290, 0.0258, 0.0373, 0.0327, 0.0271, 0.0305, 0.0311, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 08:33:25,700 INFO [train.py:873] (2/4) Epoch 15, batch 7500, loss[loss=0.09924, simple_loss=0.1374, pruned_loss=0.03056, over 14154.00 frames. ], tot_loss[loss=0.1119, simple_loss=0.1464, pruned_loss=0.03873, over 1996209.76 frames. ], batch size: 35, lr: 5.17e-03, grad_scale: 4.0 +2022-12-08 08:33:48,620 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.106e+02 2.151e+02 2.525e+02 3.725e+02 6.084e+02, threshold=5.049e+02, percent-clipped=3.0 +2022-12-08 08:34:54,703 INFO [train.py:873] (2/4) Epoch 16, batch 0, loss[loss=0.1182, simple_loss=0.1665, pruned_loss=0.03491, over 14257.00 frames. ], tot_loss[loss=0.1182, simple_loss=0.1665, pruned_loss=0.03491, over 14257.00 frames. ], batch size: 39, lr: 5.00e-03, grad_scale: 8.0 +2022-12-08 08:34:54,703 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 08:35:01,997 INFO [train.py:905] (2/4) Epoch 16, validation: loss=0.1445, simple_loss=0.1858, pruned_loss=0.05158, over 857387.00 frames. +2022-12-08 08:35:01,998 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 08:36:01,325 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.044e+01 2.013e+02 2.778e+02 4.051e+02 1.103e+03, threshold=5.556e+02, percent-clipped=15.0 +2022-12-08 08:36:14,914 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9914, 1.5470, 3.9023, 3.5565, 3.6828, 3.9705, 3.1895, 3.9356], + device='cuda:2'), covar=tensor([0.1514, 0.1675, 0.0117, 0.0276, 0.0243, 0.0146, 0.0347, 0.0141], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0160, 0.0132, 0.0171, 0.0149, 0.0143, 0.0125, 0.0127], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 08:36:31,859 INFO [train.py:873] (2/4) Epoch 16, batch 100, loss[loss=0.132, simple_loss=0.1567, pruned_loss=0.05361, over 13528.00 frames. ], tot_loss[loss=0.1126, simple_loss=0.1479, pruned_loss=0.03864, over 891620.45 frames. ], batch size: 100, lr: 5.00e-03, grad_scale: 4.0 +2022-12-08 08:36:40,705 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8260, 0.8652, 0.7889, 0.8704, 0.8522, 0.4885, 0.5033, 0.6288], + device='cuda:2'), covar=tensor([0.0136, 0.0152, 0.0112, 0.0144, 0.0147, 0.0245, 0.0177, 0.0223], + device='cuda:2'), in_proj_covar=tensor([0.0020, 0.0021, 0.0018, 0.0020, 0.0019, 0.0031, 0.0026, 0.0030], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 08:36:57,666 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0153, 1.8854, 1.9003, 1.9084, 1.9664, 1.2630, 1.5773, 1.8652], + device='cuda:2'), covar=tensor([0.0896, 0.0603, 0.0675, 0.0738, 0.0980, 0.0860, 0.0852, 0.0667], + device='cuda:2'), in_proj_covar=tensor([0.0033, 0.0033, 0.0036, 0.0031, 0.0032, 0.0046, 0.0034, 0.0036], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 08:37:13,461 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=113577.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:37:25,077 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=113590.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:37:31,132 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.249e+02 2.137e+02 2.757e+02 3.201e+02 6.700e+02, threshold=5.514e+02, percent-clipped=1.0 +2022-12-08 08:37:49,461 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=113617.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:37:56,669 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=113625.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:38:01,677 INFO [train.py:873] (2/4) Epoch 16, batch 200, loss[loss=0.1134, simple_loss=0.1515, pruned_loss=0.03765, over 14138.00 frames. ], tot_loss[loss=0.1119, simple_loss=0.1465, pruned_loss=0.03866, over 1323724.16 frames. ], batch size: 84, lr: 5.00e-03, grad_scale: 4.0 +2022-12-08 08:38:02,143 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.80 vs. limit=5.0 +2022-12-08 08:38:14,873 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=113646.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:38:43,107 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=113678.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:38:54,156 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.79 vs. limit=2.0 +2022-12-08 08:38:59,611 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.233e+02 2.172e+02 2.621e+02 3.364e+02 2.345e+03, threshold=5.243e+02, percent-clipped=6.0 +2022-12-08 08:39:09,158 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=113707.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:39:30,011 INFO [train.py:873] (2/4) Epoch 16, batch 300, loss[loss=0.1177, simple_loss=0.1296, pruned_loss=0.05284, over 2632.00 frames. ], tot_loss[loss=0.113, simple_loss=0.1468, pruned_loss=0.03958, over 1594150.31 frames. ], batch size: 100, lr: 5.00e-03, grad_scale: 4.0 +2022-12-08 08:39:37,273 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9167, 2.9000, 4.5752, 3.3970, 4.6589, 4.4521, 4.3747, 4.1842], + device='cuda:2'), covar=tensor([0.0563, 0.3033, 0.1028, 0.1665, 0.0745, 0.0897, 0.1463, 0.1473], + device='cuda:2'), in_proj_covar=tensor([0.0347, 0.0308, 0.0387, 0.0293, 0.0364, 0.0318, 0.0355, 0.0296], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:40:28,277 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.134e+02 1.970e+02 2.457e+02 3.061e+02 5.224e+02, threshold=4.913e+02, percent-clipped=0.0 +2022-12-08 08:40:57,904 INFO [train.py:873] (2/4) Epoch 16, batch 400, loss[loss=0.1007, simple_loss=0.1422, pruned_loss=0.02958, over 14206.00 frames. ], tot_loss[loss=0.1111, simple_loss=0.1457, pruned_loss=0.0383, over 1788459.29 frames. ], batch size: 60, lr: 4.99e-03, grad_scale: 4.0 +2022-12-08 08:41:49,060 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-08 08:41:50,239 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=113890.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:41:56,841 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.079e+02 2.042e+02 2.566e+02 3.444e+02 7.891e+02, threshold=5.133e+02, percent-clipped=8.0 +2022-12-08 08:42:25,888 INFO [train.py:873] (2/4) Epoch 16, batch 500, loss[loss=0.106, simple_loss=0.1394, pruned_loss=0.03626, over 6928.00 frames. ], tot_loss[loss=0.1115, simple_loss=0.1461, pruned_loss=0.03848, over 1877578.80 frames. ], batch size: 100, lr: 4.99e-03, grad_scale: 4.0 +2022-12-08 08:42:32,152 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=113938.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:43:02,304 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=113973.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:43:24,305 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.003e+02 2.124e+02 2.547e+02 2.966e+02 5.657e+02, threshold=5.095e+02, percent-clipped=2.0 +2022-12-08 08:43:28,065 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=114002.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:43:53,991 INFO [train.py:873] (2/4) Epoch 16, batch 600, loss[loss=0.1328, simple_loss=0.159, pruned_loss=0.05336, over 4987.00 frames. ], tot_loss[loss=0.1117, simple_loss=0.1457, pruned_loss=0.03881, over 1858142.84 frames. ], batch size: 100, lr: 4.99e-03, grad_scale: 4.0 +2022-12-08 08:44:12,337 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.17 vs. limit=5.0 +2022-12-08 08:44:43,662 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1774, 2.6379, 5.2643, 3.6241, 4.9212, 2.5297, 3.9569, 4.9781], + device='cuda:2'), covar=tensor([0.0396, 0.3710, 0.0263, 0.5922, 0.0450, 0.3117, 0.1181, 0.0274], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0203, 0.0214, 0.0273, 0.0231, 0.0206, 0.0204, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:44:52,422 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.322e+02 2.140e+02 2.657e+02 3.260e+02 5.874e+02, threshold=5.314e+02, percent-clipped=5.0 +2022-12-08 08:45:18,379 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=114127.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:45:21,946 INFO [train.py:873] (2/4) Epoch 16, batch 700, loss[loss=0.1056, simple_loss=0.1429, pruned_loss=0.0341, over 14266.00 frames. ], tot_loss[loss=0.1112, simple_loss=0.1453, pruned_loss=0.03856, over 1883506.31 frames. ], batch size: 80, lr: 4.99e-03, grad_scale: 4.0 +2022-12-08 08:45:46,768 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-08 08:46:12,423 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=114188.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:46:20,991 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.316e+02 2.016e+02 2.560e+02 3.238e+02 1.602e+03, threshold=5.120e+02, percent-clipped=5.0 +2022-12-08 08:46:45,247 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-12-08 08:46:50,893 INFO [train.py:873] (2/4) Epoch 16, batch 800, loss[loss=0.1003, simple_loss=0.1427, pruned_loss=0.02892, over 13988.00 frames. ], tot_loss[loss=0.1123, simple_loss=0.1461, pruned_loss=0.03932, over 1954105.38 frames. ], batch size: 22, lr: 4.98e-03, grad_scale: 8.0 +2022-12-08 08:47:28,633 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=114273.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:47:39,600 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8616, 1.3444, 2.5051, 2.2567, 2.3421, 2.5413, 1.6899, 2.5007], + device='cuda:2'), covar=tensor([0.0959, 0.1359, 0.0222, 0.0456, 0.0494, 0.0239, 0.0766, 0.0275], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0157, 0.0130, 0.0169, 0.0146, 0.0142, 0.0124, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 08:47:49,697 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7975, 2.3323, 3.7374, 3.9040, 3.6805, 2.2199, 3.8866, 2.8924], + device='cuda:2'), covar=tensor([0.0466, 0.1158, 0.0898, 0.0439, 0.0561, 0.1791, 0.0396, 0.1015], + device='cuda:2'), in_proj_covar=tensor([0.0290, 0.0258, 0.0372, 0.0328, 0.0271, 0.0305, 0.0309, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 08:47:50,274 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.518e+01 2.109e+02 2.499e+02 3.135e+02 7.663e+02, threshold=4.999e+02, percent-clipped=3.0 +2022-12-08 08:47:54,017 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=114302.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:48:04,358 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.12 vs. limit=2.0 +2022-12-08 08:48:06,309 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.46 vs. limit=2.0 +2022-12-08 08:48:08,408 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8094, 1.5502, 3.8448, 3.5043, 3.6267, 3.8925, 3.1932, 3.8108], + device='cuda:2'), covar=tensor([0.1653, 0.1653, 0.0122, 0.0248, 0.0252, 0.0146, 0.0269, 0.0153], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0157, 0.0130, 0.0168, 0.0145, 0.0141, 0.0123, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 08:48:09,891 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=114321.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:48:18,624 INFO [train.py:873] (2/4) Epoch 16, batch 900, loss[loss=0.1257, simple_loss=0.1554, pruned_loss=0.04801, over 8611.00 frames. ], tot_loss[loss=0.1119, simple_loss=0.1461, pruned_loss=0.03887, over 1977761.32 frames. ], batch size: 100, lr: 4.98e-03, grad_scale: 4.0 +2022-12-08 08:48:35,143 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=114350.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:48:45,992 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8050, 1.3527, 2.5150, 2.2529, 2.3773, 2.5659, 1.6823, 2.5352], + device='cuda:2'), covar=tensor([0.0955, 0.1287, 0.0231, 0.0486, 0.0483, 0.0232, 0.0782, 0.0266], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0158, 0.0130, 0.0169, 0.0146, 0.0142, 0.0124, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 08:49:17,094 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.318e+02 2.233e+02 2.852e+02 3.607e+02 7.620e+02, threshold=5.705e+02, percent-clipped=7.0 +2022-12-08 08:49:45,663 INFO [train.py:873] (2/4) Epoch 16, batch 1000, loss[loss=0.1185, simple_loss=0.1514, pruned_loss=0.04286, over 14390.00 frames. ], tot_loss[loss=0.1112, simple_loss=0.146, pruned_loss=0.03823, over 2022875.35 frames. ], batch size: 53, lr: 4.98e-03, grad_scale: 4.0 +2022-12-08 08:50:30,792 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=114483.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:50:43,864 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-12-08 08:50:45,034 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.412e+02 2.045e+02 2.472e+02 3.169e+02 1.892e+03, threshold=4.944e+02, percent-clipped=6.0 +2022-12-08 08:51:13,053 INFO [train.py:873] (2/4) Epoch 16, batch 1100, loss[loss=0.1099, simple_loss=0.147, pruned_loss=0.03642, over 14171.00 frames. ], tot_loss[loss=0.1114, simple_loss=0.146, pruned_loss=0.03846, over 2047287.52 frames. ], batch size: 99, lr: 4.98e-03, grad_scale: 4.0 +2022-12-08 08:51:15,244 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.74 vs. limit=2.0 +2022-12-08 08:52:12,292 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.217e+02 2.285e+02 2.811e+02 4.159e+02 1.083e+03, threshold=5.623e+02, percent-clipped=11.0 +2022-12-08 08:52:21,748 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-12-08 08:52:32,263 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0459, 1.8650, 1.9396, 1.8876, 2.0325, 1.3350, 1.8512, 1.8903], + device='cuda:2'), covar=tensor([0.0749, 0.0894, 0.0928, 0.1049, 0.0698, 0.0963, 0.0666, 0.0693], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0033, 0.0036, 0.0031, 0.0032, 0.0046, 0.0034, 0.0037], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 08:52:41,093 INFO [train.py:873] (2/4) Epoch 16, batch 1200, loss[loss=0.1227, simple_loss=0.1557, pruned_loss=0.04485, over 14636.00 frames. ], tot_loss[loss=0.1126, simple_loss=0.1464, pruned_loss=0.03941, over 1960323.94 frames. ], batch size: 33, lr: 4.98e-03, grad_scale: 8.0 +2022-12-08 08:53:04,813 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.31 vs. limit=5.0 +2022-12-08 08:53:28,938 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9162, 4.6851, 4.5062, 4.8949, 4.5079, 4.4028, 4.9744, 4.7940], + device='cuda:2'), covar=tensor([0.0529, 0.0772, 0.0692, 0.0486, 0.0790, 0.0481, 0.0504, 0.0569], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0142, 0.0146, 0.0161, 0.0146, 0.0124, 0.0167, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 08:53:41,425 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.092e+02 1.943e+02 2.456e+02 3.315e+02 7.474e+02, threshold=4.912e+02, percent-clipped=2.0 +2022-12-08 08:53:56,742 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=114716.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:54:00,114 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0514, 1.1563, 1.2656, 1.0661, 1.0719, 0.9654, 1.0004, 0.8715], + device='cuda:2'), covar=tensor([0.0309, 0.0250, 0.0220, 0.0255, 0.0332, 0.0472, 0.0356, 0.0506], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0018, 0.0020, 0.0020, 0.0032, 0.0026, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0001, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 08:54:09,741 INFO [train.py:873] (2/4) Epoch 16, batch 1300, loss[loss=0.106, simple_loss=0.1465, pruned_loss=0.03277, over 14248.00 frames. ], tot_loss[loss=0.1116, simple_loss=0.1455, pruned_loss=0.03885, over 1926147.38 frames. ], batch size: 89, lr: 4.97e-03, grad_scale: 8.0 +2022-12-08 08:54:10,057 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.26 vs. limit=2.0 +2022-12-08 08:54:25,305 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8224, 1.8298, 1.6180, 1.8804, 1.7556, 1.8207, 1.7319, 1.7216], + device='cuda:2'), covar=tensor([0.1241, 0.0924, 0.2331, 0.0890, 0.1296, 0.0757, 0.1711, 0.1069], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0293, 0.0261, 0.0280, 0.0320, 0.0300, 0.0254, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:54:34,652 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1134, 1.1044, 1.0620, 1.1503, 1.2747, 0.8759, 0.9427, 1.1034], + device='cuda:2'), covar=tensor([0.0788, 0.1383, 0.0663, 0.0656, 0.0559, 0.1039, 0.1273, 0.0868], + device='cuda:2'), in_proj_covar=tensor([0.0034, 0.0033, 0.0036, 0.0032, 0.0033, 0.0047, 0.0034, 0.0037], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 08:54:44,432 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9909, 3.8111, 3.4994, 3.6673, 3.9039, 3.9384, 3.9776, 3.9676], + device='cuda:2'), covar=tensor([0.0903, 0.0569, 0.2205, 0.2791, 0.0772, 0.0844, 0.1089, 0.0947], + device='cuda:2'), in_proj_covar=tensor([0.0387, 0.0266, 0.0445, 0.0567, 0.0343, 0.0442, 0.0391, 0.0389], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:54:49,235 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9046, 1.2908, 2.0298, 1.3108, 1.9876, 2.0399, 1.7148, 2.1488], + device='cuda:2'), covar=tensor([0.0310, 0.2151, 0.0501, 0.1988, 0.0557, 0.0613, 0.1180, 0.0390], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0154, 0.0158, 0.0167, 0.0165, 0.0176, 0.0132, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:54:50,264 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=114777.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:54:56,101 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=114783.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:55:09,769 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.131e+02 2.098e+02 2.728e+02 3.262e+02 8.407e+02, threshold=5.455e+02, percent-clipped=4.0 +2022-12-08 08:55:19,972 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5112, 1.1095, 2.0156, 1.7604, 1.8098, 2.0837, 1.3146, 2.0449], + device='cuda:2'), covar=tensor([0.0962, 0.1489, 0.0308, 0.0591, 0.0719, 0.0271, 0.0817, 0.0316], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0159, 0.0131, 0.0170, 0.0147, 0.0142, 0.0125, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 08:55:22,668 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9712, 1.8475, 3.0830, 2.3008, 2.9605, 1.8329, 2.4599, 2.9879], + device='cuda:2'), covar=tensor([0.1150, 0.3899, 0.0673, 0.3724, 0.1053, 0.3342, 0.1243, 0.0712], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0206, 0.0216, 0.0275, 0.0233, 0.0206, 0.0206, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 08:55:38,357 INFO [train.py:873] (2/4) Epoch 16, batch 1400, loss[loss=0.09443, simple_loss=0.1364, pruned_loss=0.02624, over 14227.00 frames. ], tot_loss[loss=0.1117, simple_loss=0.1458, pruned_loss=0.03882, over 1935243.11 frames. ], batch size: 94, lr: 4.97e-03, grad_scale: 8.0 +2022-12-08 08:55:38,726 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=114831.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:56:14,463 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=114871.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:56:38,333 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.172e+02 2.101e+02 2.672e+02 3.477e+02 6.209e+02, threshold=5.345e+02, percent-clipped=3.0 +2022-12-08 08:57:02,159 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 08:57:06,856 INFO [train.py:873] (2/4) Epoch 16, batch 1500, loss[loss=0.1245, simple_loss=0.1631, pruned_loss=0.04297, over 13883.00 frames. ], tot_loss[loss=0.1106, simple_loss=0.1452, pruned_loss=0.03801, over 1965409.96 frames. ], batch size: 23, lr: 4.97e-03, grad_scale: 8.0 +2022-12-08 08:57:07,947 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=114932.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:57:31,856 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1152, 4.8778, 4.4402, 4.6991, 4.7714, 5.0117, 5.0498, 5.0772], + device='cuda:2'), covar=tensor([0.0659, 0.0429, 0.2169, 0.2540, 0.0823, 0.0789, 0.0795, 0.0704], + device='cuda:2'), in_proj_covar=tensor([0.0387, 0.0264, 0.0444, 0.0565, 0.0344, 0.0442, 0.0389, 0.0388], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:57:39,543 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3449, 2.2083, 3.2749, 2.4377, 3.2426, 3.1728, 3.0509, 2.7921], + device='cuda:2'), covar=tensor([0.0750, 0.3325, 0.1016, 0.1859, 0.0821, 0.0932, 0.1086, 0.1876], + device='cuda:2'), in_proj_covar=tensor([0.0355, 0.0314, 0.0394, 0.0298, 0.0371, 0.0324, 0.0363, 0.0301], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 08:58:07,018 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.294e+02 2.137e+02 2.682e+02 3.588e+02 7.078e+02, threshold=5.364e+02, percent-clipped=5.0 +2022-12-08 08:58:30,105 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-12-08 08:58:39,010 INFO [train.py:873] (2/4) Epoch 16, batch 1600, loss[loss=0.121, simple_loss=0.1318, pruned_loss=0.05514, over 2615.00 frames. ], tot_loss[loss=0.111, simple_loss=0.1454, pruned_loss=0.03823, over 1931134.86 frames. ], batch size: 100, lr: 4.97e-03, grad_scale: 8.0 +2022-12-08 08:59:15,635 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=115072.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:59:20,432 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.68 vs. limit=2.0 +2022-12-08 08:59:27,724 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=115086.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 08:59:28,608 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6017, 1.8483, 1.9980, 2.0003, 1.8067, 1.9308, 1.7231, 1.3754], + device='cuda:2'), covar=tensor([0.1027, 0.1401, 0.0536, 0.0707, 0.1191, 0.0934, 0.1530, 0.2191], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0088, 0.0069, 0.0072, 0.0098, 0.0087, 0.0100, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 08:59:37,191 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5344, 2.2996, 4.5833, 2.9942, 4.3423, 2.1918, 3.2537, 4.3173], + device='cuda:2'), covar=tensor([0.0570, 0.4308, 0.0437, 0.7107, 0.0723, 0.3346, 0.1612, 0.0613], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0207, 0.0217, 0.0277, 0.0235, 0.0206, 0.0207, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 08:59:39,794 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.499e+02 2.275e+02 2.932e+02 3.523e+02 7.695e+02, threshold=5.865e+02, percent-clipped=6.0 +2022-12-08 09:00:07,469 INFO [train.py:873] (2/4) Epoch 16, batch 1700, loss[loss=0.1319, simple_loss=0.1374, pruned_loss=0.06321, over 2621.00 frames. ], tot_loss[loss=0.1119, simple_loss=0.1462, pruned_loss=0.03882, over 1932934.92 frames. ], batch size: 100, lr: 4.96e-03, grad_scale: 4.0 +2022-12-08 09:00:21,738 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=115147.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:01:09,162 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.404e+01 2.088e+02 2.519e+02 3.191e+02 6.447e+02, threshold=5.039e+02, percent-clipped=1.0 +2022-12-08 09:01:12,646 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 09:01:33,712 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=115227.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:01:36,925 INFO [train.py:873] (2/4) Epoch 16, batch 1800, loss[loss=0.1098, simple_loss=0.1487, pruned_loss=0.03549, over 14273.00 frames. ], tot_loss[loss=0.1118, simple_loss=0.1465, pruned_loss=0.03859, over 1969104.93 frames. ], batch size: 28, lr: 4.96e-03, grad_scale: 4.0 +2022-12-08 09:02:04,417 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4573, 2.1966, 2.4110, 1.5926, 2.0321, 2.3345, 2.4503, 2.1139], + device='cuda:2'), covar=tensor([0.0735, 0.0615, 0.0815, 0.1370, 0.1137, 0.0711, 0.0716, 0.1253], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0174, 0.0138, 0.0126, 0.0141, 0.0153, 0.0132, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 09:02:38,560 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.091e+02 1.953e+02 2.460e+02 2.998e+02 4.316e+02, threshold=4.920e+02, percent-clipped=0.0 +2022-12-08 09:03:02,679 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.09 vs. limit=5.0 +2022-12-08 09:03:06,683 INFO [train.py:873] (2/4) Epoch 16, batch 1900, loss[loss=0.08937, simple_loss=0.1352, pruned_loss=0.02178, over 14036.00 frames. ], tot_loss[loss=0.1112, simple_loss=0.1463, pruned_loss=0.03806, over 2023238.63 frames. ], batch size: 26, lr: 4.96e-03, grad_scale: 4.0 +2022-12-08 09:03:41,636 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=115370.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:03:42,550 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=115371.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:03:43,300 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=115372.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:04:08,304 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.218e+02 2.103e+02 2.568e+02 3.222e+02 1.079e+03, threshold=5.135e+02, percent-clipped=6.0 +2022-12-08 09:04:26,541 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=115420.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:04:36,094 INFO [train.py:873] (2/4) Epoch 16, batch 2000, loss[loss=0.1415, simple_loss=0.1377, pruned_loss=0.07269, over 1229.00 frames. ], tot_loss[loss=0.1119, simple_loss=0.1467, pruned_loss=0.03852, over 2009586.51 frames. ], batch size: 100, lr: 4.96e-03, grad_scale: 8.0 +2022-12-08 09:04:36,214 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=115431.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:04:37,395 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=115432.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:04:45,853 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=115442.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:05:36,028 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.145e+01 1.975e+02 2.632e+02 3.322e+02 6.213e+02, threshold=5.264e+02, percent-clipped=1.0 +2022-12-08 09:05:59,698 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=115527.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:06:03,272 INFO [train.py:873] (2/4) Epoch 16, batch 2100, loss[loss=0.1077, simple_loss=0.1193, pruned_loss=0.04802, over 2678.00 frames. ], tot_loss[loss=0.1116, simple_loss=0.1459, pruned_loss=0.03865, over 1951572.88 frames. ], batch size: 100, lr: 4.96e-03, grad_scale: 4.0 +2022-12-08 09:06:28,161 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4164, 4.9435, 4.8265, 5.3905, 4.9473, 4.5770, 5.3305, 4.3618], + device='cuda:2'), covar=tensor([0.0344, 0.0978, 0.0389, 0.0388, 0.0748, 0.0390, 0.0501, 0.0510], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0269, 0.0195, 0.0191, 0.0182, 0.0156, 0.0280, 0.0168], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 09:06:42,175 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=115575.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:07:05,100 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.092e+02 1.971e+02 2.573e+02 2.986e+02 1.089e+03, threshold=5.147e+02, percent-clipped=6.0 +2022-12-08 09:07:11,014 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6127, 1.4144, 2.9512, 2.6413, 2.8852, 3.0144, 2.0450, 2.9055], + device='cuda:2'), covar=tensor([0.2059, 0.2119, 0.0311, 0.0703, 0.0502, 0.0325, 0.0924, 0.0341], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0132, 0.0171, 0.0147, 0.0143, 0.0126, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 09:07:31,194 INFO [train.py:873] (2/4) Epoch 16, batch 2200, loss[loss=0.1023, simple_loss=0.1375, pruned_loss=0.03359, over 13919.00 frames. ], tot_loss[loss=0.1111, simple_loss=0.1456, pruned_loss=0.03831, over 1954979.48 frames. ], batch size: 23, lr: 4.95e-03, grad_scale: 4.0 +2022-12-08 09:07:36,052 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5163, 2.3409, 3.4277, 3.5790, 3.4252, 2.2868, 3.4481, 2.6505], + device='cuda:2'), covar=tensor([0.0444, 0.1089, 0.0887, 0.0479, 0.0540, 0.1607, 0.0458, 0.0990], + device='cuda:2'), in_proj_covar=tensor([0.0291, 0.0259, 0.0377, 0.0329, 0.0271, 0.0307, 0.0310, 0.0282], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:07:49,923 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5044, 2.2573, 4.5318, 3.0855, 4.1607, 2.1105, 3.3911, 4.3257], + device='cuda:2'), covar=tensor([0.0431, 0.3996, 0.0324, 0.5213, 0.0686, 0.3090, 0.1162, 0.0336], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0204, 0.0215, 0.0275, 0.0233, 0.0204, 0.0205, 0.0216], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:08:32,754 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.142e+02 2.098e+02 2.631e+02 3.239e+02 6.147e+02, threshold=5.263e+02, percent-clipped=2.0 +2022-12-08 09:08:54,838 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=115726.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:08:55,683 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=115727.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:08:58,973 INFO [train.py:873] (2/4) Epoch 16, batch 2300, loss[loss=0.127, simple_loss=0.152, pruned_loss=0.05098, over 3860.00 frames. ], tot_loss[loss=0.1101, simple_loss=0.1446, pruned_loss=0.03783, over 1932266.55 frames. ], batch size: 100, lr: 4.95e-03, grad_scale: 4.0 +2022-12-08 09:09:08,591 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=115742.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:09:12,334 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-08 09:09:21,056 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0063, 2.0767, 1.8463, 2.1234, 1.6931, 2.0343, 2.0414, 2.0013], + device='cuda:2'), covar=tensor([0.1047, 0.1271, 0.1235, 0.0945, 0.1555, 0.0836, 0.1194, 0.1050], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0140, 0.0143, 0.0157, 0.0145, 0.0122, 0.0165, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:09:26,219 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2193, 3.9652, 3.8352, 4.2747, 3.9939, 3.8322, 4.2868, 3.6550], + device='cuda:2'), covar=tensor([0.0499, 0.0942, 0.0460, 0.0415, 0.0766, 0.1183, 0.0511, 0.0494], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0267, 0.0194, 0.0191, 0.0182, 0.0154, 0.0278, 0.0167], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 09:09:50,756 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=115790.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:10:00,477 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.978e+01 2.138e+02 2.481e+02 3.114e+02 7.710e+02, threshold=4.963e+02, percent-clipped=3.0 +2022-12-08 09:10:04,451 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.46 vs. limit=2.0 +2022-12-08 09:10:26,759 INFO [train.py:873] (2/4) Epoch 16, batch 2400, loss[loss=0.1188, simple_loss=0.1216, pruned_loss=0.058, over 1297.00 frames. ], tot_loss[loss=0.1114, simple_loss=0.1456, pruned_loss=0.03862, over 1942113.64 frames. ], batch size: 100, lr: 4.95e-03, grad_scale: 8.0 +2022-12-08 09:10:59,183 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.40 vs. limit=5.0 +2022-12-08 09:11:18,766 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9624, 1.7901, 3.1197, 2.2936, 2.9512, 1.9114, 2.4255, 2.9870], + device='cuda:2'), covar=tensor([0.0965, 0.4026, 0.0685, 0.3901, 0.0991, 0.2944, 0.1284, 0.0671], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0204, 0.0216, 0.0277, 0.0234, 0.0205, 0.0205, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 09:11:19,615 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=115892.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 09:11:27,448 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.676e+01 2.182e+02 2.667e+02 3.375e+02 7.141e+02, threshold=5.334e+02, percent-clipped=4.0 +2022-12-08 09:11:52,391 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1835, 2.2021, 1.9733, 2.0897, 1.9679, 1.3110, 1.6264, 2.0005], + device='cuda:2'), covar=tensor([0.0651, 0.0971, 0.0622, 0.0472, 0.0941, 0.0828, 0.0893, 0.0894], + device='cuda:2'), in_proj_covar=tensor([0.0033, 0.0033, 0.0036, 0.0031, 0.0032, 0.0045, 0.0034, 0.0037], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 09:11:54,003 INFO [train.py:873] (2/4) Epoch 16, batch 2500, loss[loss=0.143, simple_loss=0.1633, pruned_loss=0.0613, over 7814.00 frames. ], tot_loss[loss=0.1118, simple_loss=0.1462, pruned_loss=0.03868, over 2038723.01 frames. ], batch size: 100, lr: 4.95e-03, grad_scale: 8.0 +2022-12-08 09:12:13,588 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=115953.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 09:12:51,070 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.92 vs. limit=2.0 +2022-12-08 09:12:56,497 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.515e+02 2.068e+02 2.436e+02 3.138e+02 6.345e+02, threshold=4.873e+02, percent-clipped=1.0 +2022-12-08 09:13:07,032 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=116014.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:13:17,539 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=116026.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:13:18,395 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=116027.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:13:21,685 INFO [train.py:873] (2/4) Epoch 16, batch 2600, loss[loss=0.1017, simple_loss=0.1372, pruned_loss=0.03306, over 13954.00 frames. ], tot_loss[loss=0.1119, simple_loss=0.1463, pruned_loss=0.03877, over 2017829.22 frames. ], batch size: 19, lr: 4.95e-03, grad_scale: 4.0 +2022-12-08 09:13:42,466 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.52 vs. limit=5.0 +2022-12-08 09:13:59,635 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=116074.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:14:00,552 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=116075.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:14:00,679 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=116075.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 09:14:23,797 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.166e+02 2.134e+02 2.737e+02 3.520e+02 9.731e+02, threshold=5.473e+02, percent-clipped=5.0 +2022-12-08 09:14:26,811 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=116105.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:14:36,337 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=116116.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:14:46,344 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.07 vs. limit=5.0 +2022-12-08 09:14:49,179 INFO [train.py:873] (2/4) Epoch 16, batch 2700, loss[loss=0.1265, simple_loss=0.1588, pruned_loss=0.04706, over 14270.00 frames. ], tot_loss[loss=0.1116, simple_loss=0.1464, pruned_loss=0.03837, over 2054702.47 frames. ], batch size: 76, lr: 4.94e-03, grad_scale: 4.0 +2022-12-08 09:15:20,081 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=116166.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:15:29,387 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=116177.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:15:51,299 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.057e+02 2.028e+02 2.481e+02 3.066e+02 5.287e+02, threshold=4.961e+02, percent-clipped=0.0 +2022-12-08 09:16:05,763 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.59 vs. limit=2.0 +2022-12-08 09:16:16,289 INFO [train.py:873] (2/4) Epoch 16, batch 2800, loss[loss=0.1048, simple_loss=0.1443, pruned_loss=0.03269, over 14492.00 frames. ], tot_loss[loss=0.1104, simple_loss=0.1456, pruned_loss=0.03756, over 2075630.59 frames. ], batch size: 49, lr: 4.94e-03, grad_scale: 8.0 +2022-12-08 09:16:31,707 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=116248.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 09:16:51,871 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8849, 2.7276, 2.4488, 2.6332, 2.8331, 2.8102, 2.8342, 2.8664], + device='cuda:2'), covar=tensor([0.1112, 0.0682, 0.2387, 0.2640, 0.1012, 0.1246, 0.1387, 0.0948], + device='cuda:2'), in_proj_covar=tensor([0.0389, 0.0269, 0.0449, 0.0565, 0.0348, 0.0446, 0.0391, 0.0392], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:17:17,065 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3224, 4.0387, 3.7763, 3.9399, 4.1515, 4.2060, 4.2644, 4.2744], + device='cuda:2'), covar=tensor([0.0758, 0.0547, 0.2180, 0.2529, 0.0694, 0.0856, 0.0927, 0.0834], + device='cuda:2'), in_proj_covar=tensor([0.0389, 0.0270, 0.0450, 0.0565, 0.0349, 0.0446, 0.0392, 0.0393], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:17:19,503 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.418e+02 2.003e+02 2.440e+02 3.142e+02 5.921e+02, threshold=4.880e+02, percent-clipped=3.0 +2022-12-08 09:17:23,788 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9150, 3.0504, 4.7152, 3.5354, 4.6837, 4.6537, 4.5304, 4.1964], + device='cuda:2'), covar=tensor([0.0613, 0.2854, 0.0717, 0.1567, 0.0610, 0.0724, 0.1207, 0.1513], + device='cuda:2'), in_proj_covar=tensor([0.0356, 0.0315, 0.0396, 0.0301, 0.0371, 0.0325, 0.0365, 0.0302], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:17:37,628 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=116323.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:17:44,502 INFO [train.py:873] (2/4) Epoch 16, batch 2900, loss[loss=0.1162, simple_loss=0.1458, pruned_loss=0.0433, over 13861.00 frames. ], tot_loss[loss=0.1096, simple_loss=0.145, pruned_loss=0.03713, over 2081395.37 frames. ], batch size: 20, lr: 4.94e-03, grad_scale: 4.0 +2022-12-08 09:18:18,476 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=116370.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 09:18:30,372 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=116384.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:18:45,975 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0304, 2.3763, 4.0192, 4.1578, 3.9372, 2.3533, 4.0814, 3.0380], + device='cuda:2'), covar=tensor([0.0454, 0.1220, 0.0832, 0.0443, 0.0508, 0.1869, 0.0450, 0.1054], + device='cuda:2'), in_proj_covar=tensor([0.0291, 0.0259, 0.0374, 0.0328, 0.0271, 0.0305, 0.0308, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:18:47,320 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.284e+02 2.101e+02 2.585e+02 3.248e+02 5.004e+02, threshold=5.170e+02, percent-clipped=1.0 +2022-12-08 09:19:04,116 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8349, 4.7349, 4.4486, 4.9177, 4.4306, 4.2877, 4.9649, 4.6020], + device='cuda:2'), covar=tensor([0.0689, 0.0884, 0.0802, 0.0612, 0.0755, 0.0599, 0.0623, 0.0850], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0143, 0.0145, 0.0159, 0.0147, 0.0124, 0.0168, 0.0148], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:19:11,752 INFO [train.py:873] (2/4) Epoch 16, batch 3000, loss[loss=0.1045, simple_loss=0.1285, pruned_loss=0.04024, over 4989.00 frames. ], tot_loss[loss=0.1111, simple_loss=0.1455, pruned_loss=0.0384, over 1944245.19 frames. ], batch size: 100, lr: 4.94e-03, grad_scale: 4.0 +2022-12-08 09:19:11,752 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 09:19:20,202 INFO [train.py:905] (2/4) Epoch 16, validation: loss=0.1369, simple_loss=0.1741, pruned_loss=0.04986, over 857387.00 frames. +2022-12-08 09:19:20,203 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 09:19:33,002 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=116446.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:19:45,751 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=116461.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:19:55,328 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=116472.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:20:01,030 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.91 vs. limit=5.0 +2022-12-08 09:20:12,480 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4378, 1.4027, 3.5204, 1.5351, 3.3953, 3.6128, 2.5239, 3.8148], + device='cuda:2'), covar=tensor([0.0260, 0.3158, 0.0415, 0.2349, 0.0741, 0.0382, 0.0961, 0.0197], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0155, 0.0160, 0.0169, 0.0166, 0.0178, 0.0134, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:20:22,410 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.224e+02 2.145e+02 2.640e+02 3.351e+02 6.789e+02, threshold=5.280e+02, percent-clipped=3.0 +2022-12-08 09:20:24,202 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=116505.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:20:26,043 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=116507.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 09:20:46,691 INFO [train.py:873] (2/4) Epoch 16, batch 3100, loss[loss=0.135, simple_loss=0.1595, pruned_loss=0.05526, over 8650.00 frames. ], tot_loss[loss=0.111, simple_loss=0.1454, pruned_loss=0.03835, over 1933391.28 frames. ], batch size: 100, lr: 4.93e-03, grad_scale: 4.0 +2022-12-08 09:21:01,888 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=116548.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 09:21:17,440 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=116566.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:21:33,369 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2085, 1.6355, 4.1715, 1.9793, 4.0977, 4.2926, 3.3832, 4.5789], + device='cuda:2'), covar=tensor([0.0204, 0.2960, 0.0371, 0.2102, 0.0359, 0.0381, 0.0659, 0.0153], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0154, 0.0159, 0.0168, 0.0165, 0.0178, 0.0133, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:21:43,625 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=116596.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 09:21:49,522 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.191e+02 2.181e+02 2.695e+02 3.453e+02 7.270e+02, threshold=5.390e+02, percent-clipped=4.0 +2022-12-08 09:22:02,701 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=116618.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:22:04,716 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.77 vs. limit=2.0 +2022-12-08 09:22:13,977 INFO [train.py:873] (2/4) Epoch 16, batch 3200, loss[loss=0.1346, simple_loss=0.1317, pruned_loss=0.06872, over 1283.00 frames. ], tot_loss[loss=0.1102, simple_loss=0.1453, pruned_loss=0.03752, over 2071930.74 frames. ], batch size: 100, lr: 4.93e-03, grad_scale: 8.0 +2022-12-08 09:22:32,322 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4739, 1.9318, 2.4616, 2.0900, 2.5042, 2.3808, 2.3330, 2.2927], + device='cuda:2'), covar=tensor([0.0607, 0.2225, 0.0678, 0.1331, 0.0515, 0.0961, 0.0624, 0.0995], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0309, 0.0391, 0.0296, 0.0368, 0.0323, 0.0359, 0.0300], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:22:44,494 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4928, 2.1535, 2.3984, 1.6506, 2.1578, 2.4256, 2.5515, 2.1819], + device='cuda:2'), covar=tensor([0.0865, 0.0666, 0.0978, 0.1449, 0.1149, 0.0766, 0.0710, 0.1322], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0174, 0.0139, 0.0126, 0.0142, 0.0153, 0.0133, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 09:22:45,956 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-12-08 09:22:48,048 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=116670.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 09:22:55,626 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=116679.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:22:55,734 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=116679.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:23:16,519 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.234e+02 2.227e+02 2.604e+02 3.188e+02 6.019e+02, threshold=5.207e+02, percent-clipped=1.0 +2022-12-08 09:23:17,685 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=116704.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:23:29,704 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=116718.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:23:41,061 INFO [train.py:873] (2/4) Epoch 16, batch 3300, loss[loss=0.2152, simple_loss=0.1842, pruned_loss=0.1232, over 1197.00 frames. ], tot_loss[loss=0.1099, simple_loss=0.145, pruned_loss=0.03745, over 2076908.61 frames. ], batch size: 100, lr: 4.93e-03, grad_scale: 8.0 +2022-12-08 09:23:57,990 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.45 vs. limit=2.0 +2022-12-08 09:24:02,496 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.06 vs. limit=5.0 +2022-12-08 09:24:07,214 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=116761.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:24:10,661 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=116765.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:24:16,748 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=116772.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:24:27,330 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8407, 1.6620, 1.9003, 1.6438, 1.9906, 1.7527, 1.6180, 1.8306], + device='cuda:2'), covar=tensor([0.0698, 0.1476, 0.0529, 0.0574, 0.0572, 0.0875, 0.0341, 0.0543], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0310, 0.0390, 0.0296, 0.0366, 0.0322, 0.0360, 0.0301], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:24:43,060 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=116802.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 09:24:43,115 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=116802.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:24:43,679 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.185e+02 1.987e+02 2.396e+02 2.955e+02 5.766e+02, threshold=4.792e+02, percent-clipped=2.0 +2022-12-08 09:24:48,968 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=116809.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:24:58,344 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=116820.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:25:03,218 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6956, 1.6049, 1.6762, 1.4909, 1.4565, 1.3781, 1.2497, 1.1407], + device='cuda:2'), covar=tensor([0.0143, 0.0175, 0.0147, 0.0159, 0.0163, 0.0311, 0.0202, 0.0349], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0019, 0.0020, 0.0020, 0.0033, 0.0026, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 09:25:08,247 INFO [train.py:873] (2/4) Epoch 16, batch 3400, loss[loss=0.1049, simple_loss=0.1422, pruned_loss=0.03376, over 14192.00 frames. ], tot_loss[loss=0.1096, simple_loss=0.1446, pruned_loss=0.03731, over 2067048.59 frames. ], batch size: 84, lr: 4.93e-03, grad_scale: 8.0 +2022-12-08 09:25:34,082 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=116861.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:25:35,779 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=116863.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:26:10,488 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.102e+02 2.045e+02 2.562e+02 3.256e+02 5.626e+02, threshold=5.125e+02, percent-clipped=2.0 +2022-12-08 09:26:35,746 INFO [train.py:873] (2/4) Epoch 16, batch 3500, loss[loss=0.1068, simple_loss=0.1483, pruned_loss=0.03262, over 14316.00 frames. ], tot_loss[loss=0.109, simple_loss=0.1441, pruned_loss=0.03698, over 2026892.97 frames. ], batch size: 31, lr: 4.93e-03, grad_scale: 8.0 +2022-12-08 09:26:41,572 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7864, 1.3729, 1.6668, 1.2584, 1.4947, 1.8464, 1.5290, 1.4943], + device='cuda:2'), covar=tensor([0.0894, 0.0898, 0.0816, 0.0912, 0.1485, 0.0863, 0.0931, 0.1858], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0174, 0.0140, 0.0126, 0.0142, 0.0152, 0.0132, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 09:27:12,605 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=116974.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:27:14,319 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6088, 3.4874, 3.3530, 3.6855, 3.1828, 3.1065, 3.6722, 3.5053], + device='cuda:2'), covar=tensor([0.0618, 0.0907, 0.0745, 0.0559, 0.0927, 0.0705, 0.0594, 0.0687], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0143, 0.0144, 0.0159, 0.0146, 0.0123, 0.0167, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:27:16,939 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=116979.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:27:38,633 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.091e+02 2.141e+02 2.605e+02 3.184e+02 8.184e+02, threshold=5.210e+02, percent-clipped=2.0 +2022-12-08 09:27:55,166 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9053, 1.3184, 2.0152, 1.3248, 1.9216, 2.0224, 1.5864, 2.0998], + device='cuda:2'), covar=tensor([0.0370, 0.2234, 0.0488, 0.1747, 0.0641, 0.0623, 0.1275, 0.0485], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0155, 0.0159, 0.0168, 0.0167, 0.0179, 0.0133, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:27:58,644 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=117027.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:28:02,017 INFO [train.py:873] (2/4) Epoch 16, batch 3600, loss[loss=0.1458, simple_loss=0.1396, pruned_loss=0.076, over 1294.00 frames. ], tot_loss[loss=0.1089, simple_loss=0.1438, pruned_loss=0.03695, over 1977297.76 frames. ], batch size: 100, lr: 4.92e-03, grad_scale: 8.0 +2022-12-08 09:28:27,278 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=117060.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:28:29,372 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.49 vs. limit=2.0 +2022-12-08 09:29:03,728 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=117102.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:29:06,091 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.352e+02 2.154e+02 2.650e+02 3.296e+02 7.753e+02, threshold=5.300e+02, percent-clipped=2.0 +2022-12-08 09:29:26,745 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3464, 2.3451, 4.3408, 4.5699, 4.2734, 2.5538, 4.5244, 3.5197], + device='cuda:2'), covar=tensor([0.0368, 0.1226, 0.0820, 0.0337, 0.0448, 0.1752, 0.0359, 0.0812], + device='cuda:2'), in_proj_covar=tensor([0.0292, 0.0259, 0.0375, 0.0327, 0.0272, 0.0305, 0.0309, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:29:28,982 INFO [train.py:873] (2/4) Epoch 16, batch 3700, loss[loss=0.1306, simple_loss=0.1515, pruned_loss=0.05481, over 3903.00 frames. ], tot_loss[loss=0.11, simple_loss=0.1445, pruned_loss=0.03772, over 1958950.80 frames. ], batch size: 100, lr: 4.92e-03, grad_scale: 4.0 +2022-12-08 09:29:45,677 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=117150.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:29:52,522 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=117158.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:29:55,073 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=117161.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:30:16,970 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8820, 1.3816, 2.0379, 1.3435, 2.0028, 2.0270, 1.5829, 2.1494], + device='cuda:2'), covar=tensor([0.0288, 0.2102, 0.0534, 0.1846, 0.0560, 0.0668, 0.1373, 0.0413], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0156, 0.0160, 0.0169, 0.0167, 0.0180, 0.0134, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:30:26,144 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.10 vs. limit=2.0 +2022-12-08 09:30:34,101 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.316e+02 2.079e+02 2.495e+02 3.268e+02 6.334e+02, threshold=4.990e+02, percent-clipped=4.0 +2022-12-08 09:30:38,003 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=117209.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:30:48,157 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=117221.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:30:57,051 INFO [train.py:873] (2/4) Epoch 16, batch 3800, loss[loss=0.1102, simple_loss=0.1443, pruned_loss=0.03801, over 13544.00 frames. ], tot_loss[loss=0.1098, simple_loss=0.1444, pruned_loss=0.03756, over 1920401.19 frames. ], batch size: 100, lr: 4.92e-03, grad_scale: 4.0 +2022-12-08 09:31:06,016 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9560, 3.6082, 2.7845, 4.1535, 3.9894, 3.9680, 3.5685, 2.8564], + device='cuda:2'), covar=tensor([0.0767, 0.1118, 0.3198, 0.0505, 0.0788, 0.1336, 0.1086, 0.3069], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0293, 0.0260, 0.0282, 0.0320, 0.0300, 0.0254, 0.0243], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:31:31,294 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3878, 3.2834, 3.1734, 3.4935, 3.0529, 3.0226, 3.4599, 3.3323], + device='cuda:2'), covar=tensor([0.0654, 0.0966, 0.0909, 0.0588, 0.1042, 0.0749, 0.0633, 0.0753], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0141, 0.0144, 0.0157, 0.0144, 0.0122, 0.0166, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:31:34,992 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=117274.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:31:36,631 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8762, 1.3710, 2.0102, 1.2521, 1.9888, 2.0181, 1.4795, 2.1427], + device='cuda:2'), covar=tensor([0.0372, 0.2114, 0.0548, 0.2020, 0.0607, 0.0682, 0.1613, 0.0420], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0156, 0.0160, 0.0170, 0.0168, 0.0181, 0.0134, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:31:41,903 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=117282.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:32:02,069 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.166e+02 2.002e+02 2.660e+02 3.371e+02 8.415e+02, threshold=5.321e+02, percent-clipped=3.0 +2022-12-08 09:32:17,217 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=117322.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:32:23,699 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=117329.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:32:25,199 INFO [train.py:873] (2/4) Epoch 16, batch 3900, loss[loss=0.1407, simple_loss=0.1585, pruned_loss=0.06147, over 7795.00 frames. ], tot_loss[loss=0.1096, simple_loss=0.1445, pruned_loss=0.03734, over 1928403.57 frames. ], batch size: 100, lr: 4.92e-03, grad_scale: 4.0 +2022-12-08 09:32:51,236 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=117360.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:33:11,257 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8116, 2.7508, 2.1290, 2.7969, 2.6901, 2.7288, 2.4527, 2.1903], + device='cuda:2'), covar=tensor([0.0946, 0.1106, 0.2730, 0.0816, 0.1079, 0.0732, 0.1375, 0.2375], + device='cuda:2'), in_proj_covar=tensor([0.0285, 0.0293, 0.0262, 0.0285, 0.0323, 0.0301, 0.0257, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:33:17,666 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=117390.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:33:26,884 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.5693, 5.3580, 5.0719, 5.6273, 5.1013, 4.8335, 5.6057, 5.3345], + device='cuda:2'), covar=tensor([0.0596, 0.0791, 0.0750, 0.0454, 0.0784, 0.0487, 0.0583, 0.0652], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0141, 0.0144, 0.0158, 0.0144, 0.0123, 0.0166, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:33:30,801 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.106e+02 1.882e+02 2.302e+02 2.791e+02 6.246e+02, threshold=4.605e+02, percent-clipped=1.0 +2022-12-08 09:33:33,455 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=117408.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:33:53,717 INFO [train.py:873] (2/4) Epoch 16, batch 4000, loss[loss=0.09552, simple_loss=0.1414, pruned_loss=0.02483, over 14261.00 frames. ], tot_loss[loss=0.1086, simple_loss=0.1436, pruned_loss=0.0368, over 1915125.54 frames. ], batch size: 39, lr: 4.92e-03, grad_scale: 8.0 +2022-12-08 09:34:02,045 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0089, 2.1763, 2.9905, 3.0803, 3.0405, 2.2097, 2.9922, 2.4441], + device='cuda:2'), covar=tensor([0.0499, 0.1184, 0.0741, 0.0561, 0.0516, 0.1474, 0.0472, 0.0877], + device='cuda:2'), in_proj_covar=tensor([0.0294, 0.0261, 0.0377, 0.0330, 0.0273, 0.0307, 0.0310, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:34:13,011 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7299, 4.4080, 4.2346, 4.7730, 4.4559, 4.1559, 4.7342, 3.9489], + device='cuda:2'), covar=tensor([0.0393, 0.0996, 0.0425, 0.0373, 0.0749, 0.0724, 0.0516, 0.0565], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0270, 0.0198, 0.0193, 0.0183, 0.0156, 0.0282, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 09:34:17,480 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=117458.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:34:53,298 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1884, 2.1990, 3.0361, 2.4363, 3.0591, 3.0230, 2.8900, 2.6203], + device='cuda:2'), covar=tensor([0.0962, 0.2935, 0.1140, 0.1846, 0.0832, 0.0969, 0.1196, 0.1817], + device='cuda:2'), in_proj_covar=tensor([0.0350, 0.0311, 0.0388, 0.0296, 0.0365, 0.0321, 0.0361, 0.0299], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:34:58,356 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.16 vs. limit=2.0 +2022-12-08 09:34:58,776 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.434e+02 2.165e+02 2.594e+02 3.323e+02 1.177e+03, threshold=5.188e+02, percent-clipped=6.0 +2022-12-08 09:35:00,062 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=117506.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:35:01,052 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0616, 2.2235, 2.3731, 2.3628, 2.0275, 2.4043, 2.2158, 1.3442], + device='cuda:2'), covar=tensor([0.1054, 0.1171, 0.0601, 0.0589, 0.1120, 0.0624, 0.1288, 0.2141], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0089, 0.0070, 0.0073, 0.0099, 0.0088, 0.0101, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 09:35:14,597 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4751, 2.1148, 5.2044, 4.7107, 4.5990, 5.2753, 4.9742, 5.2492], + device='cuda:2'), covar=tensor([0.1300, 0.1341, 0.0077, 0.0163, 0.0183, 0.0100, 0.0127, 0.0109], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0131, 0.0169, 0.0147, 0.0142, 0.0125, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 09:35:22,148 INFO [train.py:873] (2/4) Epoch 16, batch 4100, loss[loss=0.1515, simple_loss=0.1458, pruned_loss=0.07864, over 1246.00 frames. ], tot_loss[loss=0.1088, simple_loss=0.1443, pruned_loss=0.03668, over 1996594.57 frames. ], batch size: 100, lr: 4.91e-03, grad_scale: 8.0 +2022-12-08 09:35:25,453 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0384, 1.6303, 2.0054, 1.4557, 1.7543, 2.0785, 1.8894, 1.8424], + device='cuda:2'), covar=tensor([0.1111, 0.0855, 0.0997, 0.1317, 0.1618, 0.1005, 0.0847, 0.1580], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0173, 0.0139, 0.0125, 0.0141, 0.0152, 0.0132, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 09:35:32,210 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4633, 3.3844, 4.1642, 3.0172, 2.5991, 3.4075, 1.9293, 3.3858], + device='cuda:2'), covar=tensor([0.0890, 0.0837, 0.0402, 0.1332, 0.1837, 0.0692, 0.3074, 0.1223], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0103, 0.0095, 0.0102, 0.0117, 0.0090, 0.0121, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 09:36:02,167 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=117577.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:36:27,067 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.327e+02 2.013e+02 2.651e+02 3.289e+02 6.397e+02, threshold=5.302e+02, percent-clipped=7.0 +2022-12-08 09:36:49,549 INFO [train.py:873] (2/4) Epoch 16, batch 4200, loss[loss=0.142, simple_loss=0.1473, pruned_loss=0.06832, over 2648.00 frames. ], tot_loss[loss=0.1094, simple_loss=0.1448, pruned_loss=0.03696, over 2008347.44 frames. ], batch size: 100, lr: 4.91e-03, grad_scale: 8.0 +2022-12-08 09:37:16,872 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2717, 3.0546, 3.0375, 3.2793, 3.1150, 3.2197, 3.3549, 2.8027], + device='cuda:2'), covar=tensor([0.0513, 0.1026, 0.0609, 0.0530, 0.0788, 0.0436, 0.0615, 0.0656], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0274, 0.0200, 0.0193, 0.0185, 0.0157, 0.0283, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 09:37:37,443 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=117685.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:37:54,401 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.263e+02 2.184e+02 2.504e+02 3.069e+02 5.464e+02, threshold=5.008e+02, percent-clipped=1.0 +2022-12-08 09:38:17,332 INFO [train.py:873] (2/4) Epoch 16, batch 4300, loss[loss=0.1326, simple_loss=0.1438, pruned_loss=0.06068, over 2661.00 frames. ], tot_loss[loss=0.1094, simple_loss=0.1446, pruned_loss=0.03709, over 1950648.81 frames. ], batch size: 100, lr: 4.91e-03, grad_scale: 8.0 +2022-12-08 09:39:22,644 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0996, 3.8734, 3.8305, 4.1347, 3.7630, 3.4927, 4.1567, 3.9983], + device='cuda:2'), covar=tensor([0.0672, 0.0958, 0.0850, 0.0661, 0.0904, 0.0780, 0.0643, 0.0744], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0143, 0.0146, 0.0161, 0.0146, 0.0124, 0.0169, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 09:39:23,470 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.271e+02 2.083e+02 2.539e+02 3.211e+02 7.278e+02, threshold=5.077e+02, percent-clipped=7.0 +2022-12-08 09:39:32,408 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1104, 2.2423, 2.3574, 2.4121, 2.0713, 2.4126, 2.2831, 1.3840], + device='cuda:2'), covar=tensor([0.0917, 0.1092, 0.0707, 0.0455, 0.0867, 0.0572, 0.1054, 0.1849], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0088, 0.0069, 0.0072, 0.0098, 0.0087, 0.0100, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 09:39:46,097 INFO [train.py:873] (2/4) Epoch 16, batch 4400, loss[loss=0.1218, simple_loss=0.159, pruned_loss=0.04232, over 14255.00 frames. ], tot_loss[loss=0.1096, simple_loss=0.1446, pruned_loss=0.0373, over 1906468.95 frames. ], batch size: 25, lr: 4.91e-03, grad_scale: 8.0 +2022-12-08 09:40:09,272 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1580, 2.0612, 1.7603, 1.9053, 2.0758, 2.0942, 2.1122, 2.0705], + device='cuda:2'), covar=tensor([0.1094, 0.0860, 0.2688, 0.2816, 0.1226, 0.1324, 0.1421, 0.1282], + device='cuda:2'), in_proj_covar=tensor([0.0388, 0.0268, 0.0448, 0.0563, 0.0347, 0.0443, 0.0384, 0.0390], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:40:26,826 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=117877.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:40:50,806 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.193e+02 2.093e+02 2.556e+02 3.184e+02 6.806e+02, threshold=5.111e+02, percent-clipped=1.0 +2022-12-08 09:41:09,106 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=117925.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:41:14,050 INFO [train.py:873] (2/4) Epoch 16, batch 4500, loss[loss=0.1025, simple_loss=0.1466, pruned_loss=0.02919, over 14368.00 frames. ], tot_loss[loss=0.1101, simple_loss=0.1447, pruned_loss=0.0378, over 1851327.80 frames. ], batch size: 55, lr: 4.91e-03, grad_scale: 8.0 +2022-12-08 09:42:01,165 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=117985.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:42:18,931 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.027e+02 2.156e+02 2.695e+02 3.844e+02 9.084e+02, threshold=5.389e+02, percent-clipped=7.0 +2022-12-08 09:42:30,474 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8928, 1.6659, 1.9109, 1.6294, 1.9769, 1.8197, 1.6694, 1.9004], + device='cuda:2'), covar=tensor([0.0621, 0.1288, 0.0458, 0.0582, 0.0578, 0.0748, 0.0389, 0.0489], + device='cuda:2'), in_proj_covar=tensor([0.0352, 0.0310, 0.0393, 0.0299, 0.0367, 0.0323, 0.0361, 0.0299], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:42:41,604 INFO [train.py:873] (2/4) Epoch 16, batch 4600, loss[loss=0.1151, simple_loss=0.1466, pruned_loss=0.04175, over 14179.00 frames. ], tot_loss[loss=0.1109, simple_loss=0.1451, pruned_loss=0.03833, over 1854068.17 frames. ], batch size: 99, lr: 4.90e-03, grad_scale: 8.0 +2022-12-08 09:42:43,371 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=118033.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:42:49,949 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=118040.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:42:53,201 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1191, 1.9971, 2.0982, 2.1632, 2.0666, 2.0639, 2.2323, 1.8979], + device='cuda:2'), covar=tensor([0.0900, 0.1320, 0.0799, 0.0776, 0.0975, 0.0686, 0.0865, 0.0755], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0272, 0.0198, 0.0191, 0.0183, 0.0157, 0.0282, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 09:43:42,459 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=118101.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:43:45,464 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.349e+02 2.071e+02 2.553e+02 3.122e+02 5.122e+02, threshold=5.107e+02, percent-clipped=0.0 +2022-12-08 09:44:08,312 INFO [train.py:873] (2/4) Epoch 16, batch 4700, loss[loss=0.09623, simple_loss=0.1374, pruned_loss=0.02753, over 14311.00 frames. ], tot_loss[loss=0.1104, simple_loss=0.1447, pruned_loss=0.03807, over 1836664.16 frames. ], batch size: 31, lr: 4.90e-03, grad_scale: 8.0 +2022-12-08 09:44:40,836 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=118168.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:44:59,216 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8639, 2.7679, 2.0467, 2.9227, 2.7490, 2.7922, 2.4562, 2.2265], + device='cuda:2'), covar=tensor([0.1115, 0.1371, 0.2994, 0.0786, 0.1126, 0.1189, 0.1405, 0.2674], + device='cuda:2'), in_proj_covar=tensor([0.0287, 0.0293, 0.0261, 0.0284, 0.0323, 0.0301, 0.0254, 0.0244], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:45:03,863 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.55 vs. limit=5.0 +2022-12-08 09:45:13,138 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.145e+02 2.277e+02 2.708e+02 3.256e+02 5.977e+02, threshold=5.416e+02, percent-clipped=1.0 +2022-12-08 09:45:34,224 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=118229.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 09:45:35,650 INFO [train.py:873] (2/4) Epoch 16, batch 4800, loss[loss=0.1152, simple_loss=0.1501, pruned_loss=0.04015, over 14643.00 frames. ], tot_loss[loss=0.1107, simple_loss=0.1453, pruned_loss=0.03804, over 1929928.78 frames. ], batch size: 23, lr: 4.90e-03, grad_scale: 8.0 +2022-12-08 09:45:51,609 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5830, 1.5064, 1.6325, 1.3910, 1.3762, 1.2743, 1.3174, 1.1473], + device='cuda:2'), covar=tensor([0.0184, 0.0216, 0.0165, 0.0207, 0.0199, 0.0347, 0.0226, 0.0348], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0019, 0.0020, 0.0020, 0.0032, 0.0026, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 09:46:03,569 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9796, 1.8962, 1.8688, 1.8772, 1.7897, 1.7002, 1.2023, 1.1147], + device='cuda:2'), covar=tensor([0.0171, 0.0343, 0.0265, 0.0233, 0.0287, 0.0340, 0.0315, 0.0577], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0019, 0.0020, 0.0020, 0.0032, 0.0026, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 09:46:08,809 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=118269.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:46:26,219 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=118289.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:46:39,568 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.170e+02 2.128e+02 2.595e+02 3.127e+02 6.386e+02, threshold=5.189e+02, percent-clipped=3.0 +2022-12-08 09:47:00,673 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=118329.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:47:01,613 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=118330.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:47:02,256 INFO [train.py:873] (2/4) Epoch 16, batch 4900, loss[loss=0.09057, simple_loss=0.141, pruned_loss=0.02006, over 14289.00 frames. ], tot_loss[loss=0.1106, simple_loss=0.1453, pruned_loss=0.03797, over 1944722.90 frames. ], batch size: 25, lr: 4.90e-03, grad_scale: 8.0 +2022-12-08 09:47:09,483 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-12-08 09:47:15,804 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9397, 2.3850, 2.4107, 2.3839, 1.9588, 2.4122, 2.2769, 1.2809], + device='cuda:2'), covar=tensor([0.1163, 0.0980, 0.0772, 0.0843, 0.1166, 0.0972, 0.1315, 0.2499], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0088, 0.0069, 0.0073, 0.0099, 0.0088, 0.0101, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 09:47:18,870 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=118350.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:47:53,425 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=118390.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:47:58,337 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=118396.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:48:06,579 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.054e+02 2.113e+02 2.635e+02 3.192e+02 6.518e+02, threshold=5.270e+02, percent-clipped=3.0 +2022-12-08 09:48:29,344 INFO [train.py:873] (2/4) Epoch 16, batch 5000, loss[loss=0.1198, simple_loss=0.1498, pruned_loss=0.04484, over 6933.00 frames. ], tot_loss[loss=0.1109, simple_loss=0.1454, pruned_loss=0.03819, over 1889465.70 frames. ], batch size: 100, lr: 4.90e-03, grad_scale: 8.0 +2022-12-08 09:49:34,278 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.516e+01 2.019e+02 2.454e+02 2.999e+02 6.382e+02, threshold=4.907e+02, percent-clipped=1.0 +2022-12-08 09:49:50,393 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=118524.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 09:49:56,306 INFO [train.py:873] (2/4) Epoch 16, batch 5100, loss[loss=0.09126, simple_loss=0.1381, pruned_loss=0.02219, over 14524.00 frames. ], tot_loss[loss=0.111, simple_loss=0.1455, pruned_loss=0.03824, over 1945086.62 frames. ], batch size: 34, lr: 4.89e-03, grad_scale: 4.0 +2022-12-08 09:51:01,828 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.319e+02 2.028e+02 2.469e+02 3.033e+02 6.307e+02, threshold=4.938e+02, percent-clipped=1.0 +2022-12-08 09:51:03,856 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7816, 3.4192, 2.7798, 3.9517, 3.8184, 3.7715, 3.2841, 2.8112], + device='cuda:2'), covar=tensor([0.0792, 0.1261, 0.3165, 0.0521, 0.0735, 0.1185, 0.1215, 0.2771], + device='cuda:2'), in_proj_covar=tensor([0.0284, 0.0291, 0.0259, 0.0282, 0.0321, 0.0299, 0.0252, 0.0243], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:51:08,014 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5936, 1.6487, 1.7210, 1.4105, 1.3872, 1.3183, 1.3836, 1.0904], + device='cuda:2'), covar=tensor([0.0176, 0.0223, 0.0155, 0.0176, 0.0178, 0.0319, 0.0218, 0.0330], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0022, 0.0019, 0.0021, 0.0020, 0.0033, 0.0027, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 09:51:18,452 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=118625.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:51:23,927 INFO [train.py:873] (2/4) Epoch 16, batch 5200, loss[loss=0.1551, simple_loss=0.1678, pruned_loss=0.07124, over 8597.00 frames. ], tot_loss[loss=0.1115, simple_loss=0.146, pruned_loss=0.0385, over 1927753.42 frames. ], batch size: 100, lr: 4.89e-03, grad_scale: 8.0 +2022-12-08 09:51:24,073 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=118631.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:51:36,587 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=118645.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:52:12,060 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=118685.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:52:18,146 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=118692.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:52:21,216 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=118696.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:52:21,286 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2525, 1.8431, 2.0849, 1.9023, 2.3030, 2.1070, 1.9794, 2.0779], + device='cuda:2'), covar=tensor([0.0618, 0.1850, 0.0599, 0.1081, 0.0506, 0.0905, 0.0505, 0.0723], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0312, 0.0397, 0.0301, 0.0372, 0.0325, 0.0362, 0.0300], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 09:52:29,669 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.100e+02 1.977e+02 2.482e+02 3.057e+02 6.335e+02, threshold=4.964e+02, percent-clipped=4.0 +2022-12-08 09:52:35,885 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2981, 2.1432, 4.9685, 4.3640, 4.2495, 5.0174, 4.7612, 5.0698], + device='cuda:2'), covar=tensor([0.1481, 0.1424, 0.0086, 0.0217, 0.0226, 0.0128, 0.0141, 0.0087], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0161, 0.0133, 0.0173, 0.0151, 0.0145, 0.0126, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 09:52:51,985 INFO [train.py:873] (2/4) Epoch 16, batch 5300, loss[loss=0.1014, simple_loss=0.1417, pruned_loss=0.03055, over 14250.00 frames. ], tot_loss[loss=0.1113, simple_loss=0.1459, pruned_loss=0.03836, over 1935080.35 frames. ], batch size: 39, lr: 4.89e-03, grad_scale: 8.0 +2022-12-08 09:53:00,846 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9826, 1.9711, 2.0501, 2.0431, 2.0080, 1.6412, 1.2629, 1.8425], + device='cuda:2'), covar=tensor([0.0751, 0.0712, 0.0530, 0.0440, 0.0558, 0.1436, 0.2430, 0.0523], + device='cuda:2'), in_proj_covar=tensor([0.0170, 0.0172, 0.0145, 0.0145, 0.0204, 0.0141, 0.0156, 0.0192], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 09:53:02,120 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-12-08 09:53:03,348 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=118744.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:53:11,091 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-12-08 09:53:32,623 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4874, 4.2318, 4.1008, 4.5615, 4.3012, 3.9851, 4.5598, 3.8454], + device='cuda:2'), covar=tensor([0.0418, 0.0896, 0.0424, 0.0399, 0.0660, 0.0842, 0.0503, 0.0491], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0272, 0.0197, 0.0190, 0.0182, 0.0155, 0.0282, 0.0168], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 09:53:38,864 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0903, 1.3130, 1.3370, 1.0394, 0.8510, 1.1764, 0.9001, 1.2164], + device='cuda:2'), covar=tensor([0.1892, 0.2063, 0.1142, 0.2006, 0.2766, 0.1074, 0.1963, 0.1087], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0102, 0.0095, 0.0102, 0.0117, 0.0090, 0.0121, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 09:53:58,970 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.455e+02 1.964e+02 2.394e+02 2.893e+02 4.670e+02, threshold=4.788e+02, percent-clipped=0.0 +2022-12-08 09:54:06,646 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5310, 2.2450, 4.5125, 3.0731, 4.3084, 1.9909, 3.2592, 4.3734], + device='cuda:2'), covar=tensor([0.0493, 0.3887, 0.0357, 0.5571, 0.0519, 0.3337, 0.1310, 0.0403], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0204, 0.0217, 0.0277, 0.0236, 0.0206, 0.0203, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 09:54:14,784 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=118824.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:54:19,979 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.65 vs. limit=2.0 +2022-12-08 09:54:20,955 INFO [train.py:873] (2/4) Epoch 16, batch 5400, loss[loss=0.1179, simple_loss=0.1564, pruned_loss=0.03973, over 14282.00 frames. ], tot_loss[loss=0.1099, simple_loss=0.1452, pruned_loss=0.0373, over 1961659.73 frames. ], batch size: 63, lr: 4.89e-03, grad_scale: 8.0 +2022-12-08 09:54:35,751 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.71 vs. limit=5.0 +2022-12-08 09:54:57,518 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=118872.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:55:00,628 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-12-08 09:55:27,018 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.618e+01 2.141e+02 2.825e+02 3.414e+02 6.875e+02, threshold=5.649e+02, percent-clipped=6.0 +2022-12-08 09:55:44,000 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=118925.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:55:49,204 INFO [train.py:873] (2/4) Epoch 16, batch 5500, loss[loss=0.1112, simple_loss=0.1159, pruned_loss=0.0532, over 1327.00 frames. ], tot_loss[loss=0.1094, simple_loss=0.1447, pruned_loss=0.03705, over 1935126.22 frames. ], batch size: 100, lr: 4.88e-03, grad_scale: 8.0 +2022-12-08 09:56:01,647 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=118945.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:56:26,061 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=118973.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:56:36,631 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=118985.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:56:38,467 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=118987.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:56:41,596 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.59 vs. limit=2.0 +2022-12-08 09:56:43,681 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=118993.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:56:55,520 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.279e+02 2.181e+02 2.740e+02 3.306e+02 6.665e+02, threshold=5.480e+02, percent-clipped=3.0 +2022-12-08 09:57:11,821 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 09:57:17,518 INFO [train.py:873] (2/4) Epoch 16, batch 5600, loss[loss=0.1251, simple_loss=0.1263, pruned_loss=0.06196, over 1230.00 frames. ], tot_loss[loss=0.11, simple_loss=0.1447, pruned_loss=0.03761, over 1855269.51 frames. ], batch size: 100, lr: 4.88e-03, grad_scale: 8.0 +2022-12-08 09:57:19,246 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=119033.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:57:55,370 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=119075.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:58:11,147 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=119093.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:58:21,972 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.337e+02 2.186e+02 2.774e+02 3.458e+02 6.793e+02, threshold=5.547e+02, percent-clipped=4.0 +2022-12-08 09:58:43,947 INFO [train.py:873] (2/4) Epoch 16, batch 5700, loss[loss=0.09506, simple_loss=0.1344, pruned_loss=0.02787, over 14666.00 frames. ], tot_loss[loss=0.1099, simple_loss=0.1449, pruned_loss=0.03744, over 1939981.94 frames. ], batch size: 23, lr: 4.88e-03, grad_scale: 8.0 +2022-12-08 09:58:48,226 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9465, 1.8857, 1.8539, 1.7965, 1.8371, 1.2389, 1.4933, 1.6736], + device='cuda:2'), covar=tensor([0.0494, 0.0559, 0.0533, 0.0883, 0.0592, 0.0795, 0.0704, 0.0565], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0034, 0.0037, 0.0032, 0.0033, 0.0046, 0.0035, 0.0038], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 09:58:48,240 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=119136.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 09:59:04,229 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=119154.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 09:59:42,578 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8820, 2.7277, 2.7228, 2.9262, 2.7894, 2.8099, 2.9968, 2.4660], + device='cuda:2'), covar=tensor([0.0635, 0.1037, 0.0644, 0.0604, 0.0906, 0.0534, 0.0680, 0.0697], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0275, 0.0199, 0.0192, 0.0186, 0.0156, 0.0286, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 09:59:50,092 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.363e+02 2.314e+02 2.766e+02 3.252e+02 7.674e+02, threshold=5.531e+02, percent-clipped=3.0 +2022-12-08 10:00:11,371 INFO [train.py:873] (2/4) Epoch 16, batch 5800, loss[loss=0.1341, simple_loss=0.1575, pruned_loss=0.05536, over 11144.00 frames. ], tot_loss[loss=0.111, simple_loss=0.1459, pruned_loss=0.03806, over 1932074.12 frames. ], batch size: 100, lr: 4.88e-03, grad_scale: 8.0 +2022-12-08 10:01:00,905 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=119287.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:01:09,521 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6350, 4.7012, 5.0624, 4.1291, 4.8250, 5.1813, 1.8824, 4.5155], + device='cuda:2'), covar=tensor([0.0315, 0.0271, 0.0307, 0.0461, 0.0260, 0.0129, 0.3019, 0.0275], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0172, 0.0145, 0.0145, 0.0203, 0.0140, 0.0155, 0.0192], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 10:01:17,095 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.032e+02 2.269e+02 2.799e+02 3.367e+02 7.852e+02, threshold=5.598e+02, percent-clipped=4.0 +2022-12-08 10:01:25,095 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1811, 3.9627, 3.8265, 4.2686, 3.9317, 3.7502, 4.2676, 3.5643], + device='cuda:2'), covar=tensor([0.0549, 0.0929, 0.0488, 0.0430, 0.0880, 0.1207, 0.0559, 0.0576], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0272, 0.0198, 0.0190, 0.0184, 0.0155, 0.0284, 0.0167], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 10:01:34,052 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9344, 1.9433, 3.9607, 2.5723, 3.7907, 1.8415, 2.8226, 3.7737], + device='cuda:2'), covar=tensor([0.0925, 0.4670, 0.0632, 0.6788, 0.0961, 0.4042, 0.1633, 0.0893], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0203, 0.0217, 0.0276, 0.0236, 0.0208, 0.0202, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 10:01:38,988 INFO [train.py:873] (2/4) Epoch 16, batch 5900, loss[loss=0.1214, simple_loss=0.1378, pruned_loss=0.05251, over 5008.00 frames. ], tot_loss[loss=0.1105, simple_loss=0.1456, pruned_loss=0.03775, over 1937473.19 frames. ], batch size: 100, lr: 4.88e-03, grad_scale: 4.0 +2022-12-08 10:01:42,492 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=119335.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:02:45,457 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.33 vs. limit=2.0 +2022-12-08 10:02:45,651 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.338e+02 2.147e+02 2.622e+02 3.177e+02 5.866e+02, threshold=5.244e+02, percent-clipped=1.0 +2022-12-08 10:02:52,014 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6206, 1.5896, 1.6760, 1.4187, 1.4649, 1.3617, 1.3077, 1.1583], + device='cuda:2'), covar=tensor([0.0199, 0.0260, 0.0143, 0.0219, 0.0191, 0.0283, 0.0207, 0.0343], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0019, 0.0020, 0.0020, 0.0032, 0.0027, 0.0031], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 10:03:05,593 INFO [train.py:873] (2/4) Epoch 16, batch 6000, loss[loss=0.1395, simple_loss=0.142, pruned_loss=0.0685, over 1275.00 frames. ], tot_loss[loss=0.1123, simple_loss=0.1463, pruned_loss=0.03913, over 1857836.76 frames. ], batch size: 100, lr: 4.87e-03, grad_scale: 4.0 +2022-12-08 10:03:05,593 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 10:03:12,941 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1146, 3.1499, 3.3017, 2.3095, 2.0469, 2.7468, 1.5813, 2.6067], + device='cuda:2'), covar=tensor([0.0467, 0.0469, 0.0475, 0.1921, 0.2414, 0.0635, 0.3366, 0.1014], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0101, 0.0095, 0.0101, 0.0117, 0.0090, 0.0121, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 10:03:14,044 INFO [train.py:905] (2/4) Epoch 16, validation: loss=0.1378, simple_loss=0.175, pruned_loss=0.05031, over 857387.00 frames. +2022-12-08 10:03:14,044 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 10:03:14,151 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=119431.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:03:29,679 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=119449.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 10:04:20,754 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.903e+01 2.074e+02 2.533e+02 3.058e+02 7.007e+02, threshold=5.066e+02, percent-clipped=2.0 +2022-12-08 10:04:26,249 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=119514.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:04:41,255 INFO [train.py:873] (2/4) Epoch 16, batch 6100, loss[loss=0.112, simple_loss=0.1424, pruned_loss=0.04078, over 14308.00 frames. ], tot_loss[loss=0.1105, simple_loss=0.1454, pruned_loss=0.03785, over 1950098.46 frames. ], batch size: 60, lr: 4.87e-03, grad_scale: 4.0 +2022-12-08 10:04:49,895 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1696, 5.0227, 4.7619, 5.1592, 4.7767, 4.5304, 5.2633, 4.9285], + device='cuda:2'), covar=tensor([0.0560, 0.0797, 0.0769, 0.0504, 0.0736, 0.0596, 0.0501, 0.0654], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0142, 0.0146, 0.0158, 0.0146, 0.0122, 0.0166, 0.0146], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 10:05:19,850 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=119575.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:05:48,895 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.289e+02 1.899e+02 2.518e+02 3.457e+02 7.704e+02, threshold=5.035e+02, percent-clipped=3.0 +2022-12-08 10:06:09,184 INFO [train.py:873] (2/4) Epoch 16, batch 6200, loss[loss=0.1115, simple_loss=0.1525, pruned_loss=0.03528, over 14413.00 frames. ], tot_loss[loss=0.1109, simple_loss=0.1455, pruned_loss=0.03821, over 1967275.49 frames. ], batch size: 41, lr: 4.87e-03, grad_scale: 4.0 +2022-12-08 10:06:28,825 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 10:06:38,610 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=119665.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:06:50,166 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.55 vs. limit=2.0 +2022-12-08 10:07:16,891 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.045e+02 2.165e+02 2.712e+02 3.301e+02 6.973e+02, threshold=5.424e+02, percent-clipped=2.0 +2022-12-08 10:07:27,586 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.51 vs. limit=5.0 +2022-12-08 10:07:32,958 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=119726.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:07:37,355 INFO [train.py:873] (2/4) Epoch 16, batch 6300, loss[loss=0.1041, simple_loss=0.144, pruned_loss=0.03208, over 14222.00 frames. ], tot_loss[loss=0.1116, simple_loss=0.1462, pruned_loss=0.03853, over 1992826.24 frames. ], batch size: 37, lr: 4.87e-03, grad_scale: 4.0 +2022-12-08 10:07:37,498 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=119731.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:07:53,329 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=119749.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 10:07:57,988 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.75 vs. limit=5.0 +2022-12-08 10:08:19,728 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=119779.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:08:32,292 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8680, 1.7036, 3.8822, 3.5902, 3.6608, 3.9586, 3.2174, 3.9417], + device='cuda:2'), covar=tensor([0.1641, 0.1619, 0.0131, 0.0258, 0.0259, 0.0148, 0.0276, 0.0142], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0159, 0.0130, 0.0169, 0.0148, 0.0142, 0.0123, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 10:08:35,908 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=119797.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:08:39,933 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3332, 3.5626, 3.3974, 3.6174, 2.7101, 3.4827, 3.4172, 1.8702], + device='cuda:2'), covar=tensor([0.1324, 0.0794, 0.1028, 0.0557, 0.0902, 0.0616, 0.0930, 0.1913], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0087, 0.0069, 0.0074, 0.0100, 0.0089, 0.0101, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 10:08:46,037 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.350e+02 2.194e+02 2.675e+02 3.528e+02 1.237e+03, threshold=5.350e+02, percent-clipped=2.0 +2022-12-08 10:09:05,871 INFO [train.py:873] (2/4) Epoch 16, batch 6400, loss[loss=0.1352, simple_loss=0.1559, pruned_loss=0.05723, over 4952.00 frames. ], tot_loss[loss=0.112, simple_loss=0.1464, pruned_loss=0.03884, over 1967356.89 frames. ], batch size: 100, lr: 4.87e-03, grad_scale: 8.0 +2022-12-08 10:09:40,233 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=119870.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:10:13,248 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.254e+02 2.000e+02 2.417e+02 3.320e+02 6.714e+02, threshold=4.834e+02, percent-clipped=5.0 +2022-12-08 10:10:33,940 INFO [train.py:873] (2/4) Epoch 16, batch 6500, loss[loss=0.09081, simple_loss=0.1362, pruned_loss=0.0227, over 14609.00 frames. ], tot_loss[loss=0.1112, simple_loss=0.1458, pruned_loss=0.03829, over 1983326.83 frames. ], batch size: 22, lr: 4.86e-03, grad_scale: 8.0 +2022-12-08 10:11:01,991 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0746, 2.1922, 2.0346, 2.2513, 1.8917, 2.1272, 2.1550, 2.1160], + device='cuda:2'), covar=tensor([0.0989, 0.1019, 0.1153, 0.0858, 0.1484, 0.0856, 0.1060, 0.0948], + device='cuda:2'), in_proj_covar=tensor([0.0143, 0.0141, 0.0146, 0.0158, 0.0146, 0.0121, 0.0167, 0.0147], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 10:11:27,182 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0579, 1.7992, 3.1370, 2.3365, 3.0495, 1.9274, 2.5033, 3.0399], + device='cuda:2'), covar=tensor([0.1107, 0.3920, 0.0641, 0.3748, 0.0907, 0.3078, 0.1296, 0.0724], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0203, 0.0217, 0.0271, 0.0236, 0.0206, 0.0202, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 10:11:44,556 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.130e+02 1.947e+02 2.582e+02 3.066e+02 4.807e+02, threshold=5.164e+02, percent-clipped=0.0 +2022-12-08 10:11:55,917 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=120021.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:11:58,689 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8757, 2.4839, 2.6268, 1.8473, 2.4289, 2.6503, 2.8723, 2.3957], + device='cuda:2'), covar=tensor([0.0674, 0.0784, 0.0979, 0.1419, 0.0905, 0.0736, 0.0650, 0.1228], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0172, 0.0141, 0.0125, 0.0142, 0.0155, 0.0133, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 10:12:04,537 INFO [train.py:873] (2/4) Epoch 16, batch 6600, loss[loss=0.104, simple_loss=0.1411, pruned_loss=0.03349, over 6932.00 frames. ], tot_loss[loss=0.1107, simple_loss=0.1455, pruned_loss=0.03794, over 2012895.91 frames. ], batch size: 100, lr: 4.86e-03, grad_scale: 8.0 +2022-12-08 10:12:16,219 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1554, 3.5404, 3.3013, 3.5441, 2.7025, 3.4618, 3.3266, 1.6719], + device='cuda:2'), covar=tensor([0.1278, 0.0599, 0.0928, 0.0419, 0.0862, 0.0488, 0.0867, 0.2236], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0088, 0.0070, 0.0074, 0.0100, 0.0089, 0.0101, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 10:12:55,998 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.20 vs. limit=2.0 +2022-12-08 10:13:12,086 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.190e+01 2.035e+02 2.583e+02 3.251e+02 5.143e+02, threshold=5.166e+02, percent-clipped=0.0 +2022-12-08 10:13:14,912 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6026, 2.3031, 4.5920, 3.1675, 4.3240, 2.2473, 3.3837, 4.4212], + device='cuda:2'), covar=tensor([0.0528, 0.4195, 0.0348, 0.5792, 0.0647, 0.3503, 0.1428, 0.0361], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0203, 0.0217, 0.0271, 0.0235, 0.0206, 0.0201, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 10:13:32,126 INFO [train.py:873] (2/4) Epoch 16, batch 6700, loss[loss=0.1131, simple_loss=0.1469, pruned_loss=0.03958, over 14275.00 frames. ], tot_loss[loss=0.1111, simple_loss=0.1457, pruned_loss=0.03824, over 1937089.83 frames. ], batch size: 44, lr: 4.86e-03, grad_scale: 8.0 +2022-12-08 10:13:40,563 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3584, 2.5133, 2.5798, 2.7292, 2.2142, 2.6540, 2.5297, 1.4286], + device='cuda:2'), covar=tensor([0.1050, 0.0764, 0.0833, 0.0498, 0.0973, 0.0542, 0.1052, 0.1944], + device='cuda:2'), in_proj_covar=tensor([0.0140, 0.0088, 0.0070, 0.0074, 0.0101, 0.0089, 0.0102, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 10:14:06,016 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=120170.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:14:39,090 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.095e+02 2.165e+02 2.458e+02 2.899e+02 9.707e+02, threshold=4.916e+02, percent-clipped=4.0 +2022-12-08 10:14:48,307 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=120218.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:14:59,587 INFO [train.py:873] (2/4) Epoch 16, batch 6800, loss[loss=0.113, simple_loss=0.1503, pruned_loss=0.03779, over 14593.00 frames. ], tot_loss[loss=0.1102, simple_loss=0.1452, pruned_loss=0.03757, over 1944130.22 frames. ], batch size: 23, lr: 4.86e-03, grad_scale: 8.0 +2022-12-08 10:15:37,688 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.69 vs. limit=2.0 +2022-12-08 10:16:06,648 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.522e+02 2.118e+02 2.648e+02 3.259e+02 6.407e+02, threshold=5.296e+02, percent-clipped=3.0 +2022-12-08 10:16:18,542 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=120321.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:16:27,165 INFO [train.py:873] (2/4) Epoch 16, batch 6900, loss[loss=0.08874, simple_loss=0.1172, pruned_loss=0.03013, over 3881.00 frames. ], tot_loss[loss=0.11, simple_loss=0.1447, pruned_loss=0.03763, over 1901559.99 frames. ], batch size: 100, lr: 4.86e-03, grad_scale: 8.0 +2022-12-08 10:16:50,351 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.40 vs. limit=5.0 +2022-12-08 10:17:00,689 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=120369.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:17:22,621 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6551, 3.3708, 4.2559, 3.1583, 2.6840, 3.6083, 2.1121, 3.6520], + device='cuda:2'), covar=tensor([0.0746, 0.1048, 0.0517, 0.1420, 0.1749, 0.0690, 0.2901, 0.1193], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0101, 0.0094, 0.0099, 0.0116, 0.0089, 0.0118, 0.0093], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 10:17:34,671 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.039e+02 2.083e+02 2.449e+02 3.128e+02 5.961e+02, threshold=4.898e+02, percent-clipped=1.0 +2022-12-08 10:17:55,021 INFO [train.py:873] (2/4) Epoch 16, batch 7000, loss[loss=0.06719, simple_loss=0.1088, pruned_loss=0.01278, over 13584.00 frames. ], tot_loss[loss=0.1092, simple_loss=0.144, pruned_loss=0.0372, over 1861454.97 frames. ], batch size: 17, lr: 4.85e-03, grad_scale: 8.0 +2022-12-08 10:18:19,161 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8303, 1.5539, 1.8000, 1.9625, 1.4625, 1.7127, 1.8009, 1.8743], + device='cuda:2'), covar=tensor([0.0226, 0.0392, 0.0211, 0.0187, 0.0413, 0.0472, 0.0283, 0.0223], + device='cuda:2'), in_proj_covar=tensor([0.0293, 0.0259, 0.0375, 0.0330, 0.0272, 0.0305, 0.0310, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 10:18:30,665 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9907, 1.5103, 3.5361, 3.2082, 3.4056, 3.6161, 2.8342, 3.5579], + device='cuda:2'), covar=tensor([0.1597, 0.1838, 0.0152, 0.0341, 0.0287, 0.0183, 0.0371, 0.0173], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0159, 0.0130, 0.0170, 0.0147, 0.0142, 0.0125, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 10:18:52,461 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5234, 2.8478, 4.3051, 3.1343, 4.3300, 4.0743, 4.0663, 3.5929], + device='cuda:2'), covar=tensor([0.0755, 0.3062, 0.0964, 0.2030, 0.0907, 0.1069, 0.1806, 0.1710], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0312, 0.0394, 0.0302, 0.0370, 0.0325, 0.0362, 0.0302], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 10:19:01,660 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.060e+01 2.199e+02 2.820e+02 3.385e+02 6.936e+02, threshold=5.640e+02, percent-clipped=6.0 +2022-12-08 10:19:21,718 INFO [train.py:873] (2/4) Epoch 16, batch 7100, loss[loss=0.1548, simple_loss=0.1416, pruned_loss=0.08401, over 1258.00 frames. ], tot_loss[loss=0.1093, simple_loss=0.1443, pruned_loss=0.03717, over 1907363.12 frames. ], batch size: 100, lr: 4.85e-03, grad_scale: 8.0 +2022-12-08 10:19:51,046 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3916, 2.4421, 2.5309, 2.4873, 2.5151, 2.1384, 1.4763, 2.2582], + device='cuda:2'), covar=tensor([0.0557, 0.0456, 0.0415, 0.0379, 0.0417, 0.1205, 0.2360, 0.0441], + device='cuda:2'), in_proj_covar=tensor([0.0169, 0.0172, 0.0144, 0.0144, 0.0203, 0.0139, 0.0156, 0.0191], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 10:20:28,194 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.979e+01 2.056e+02 2.495e+02 3.157e+02 6.004e+02, threshold=4.990e+02, percent-clipped=1.0 +2022-12-08 10:20:48,803 INFO [train.py:873] (2/4) Epoch 16, batch 7200, loss[loss=0.1132, simple_loss=0.1468, pruned_loss=0.03984, over 8603.00 frames. ], tot_loss[loss=0.11, simple_loss=0.1447, pruned_loss=0.03767, over 1875266.52 frames. ], batch size: 100, lr: 4.85e-03, grad_scale: 8.0 +2022-12-08 10:21:09,665 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7604, 1.0271, 1.2556, 1.2055, 0.9645, 1.3029, 1.0609, 0.8340], + device='cuda:2'), covar=tensor([0.1949, 0.1183, 0.0414, 0.0489, 0.1877, 0.0929, 0.1341, 0.1419], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0088, 0.0070, 0.0074, 0.0101, 0.0089, 0.0101, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 10:21:27,822 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=120676.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:21:55,118 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.284e+02 2.056e+02 2.553e+02 2.947e+02 6.025e+02, threshold=5.105e+02, percent-clipped=4.0 +2022-12-08 10:22:15,426 INFO [train.py:873] (2/4) Epoch 16, batch 7300, loss[loss=0.1076, simple_loss=0.1465, pruned_loss=0.03437, over 14403.00 frames. ], tot_loss[loss=0.1093, simple_loss=0.144, pruned_loss=0.03732, over 1882551.15 frames. ], batch size: 53, lr: 4.85e-03, grad_scale: 8.0 +2022-12-08 10:22:19,078 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0958, 1.9878, 1.9694, 1.7424, 1.7844, 1.1937, 1.7427, 1.9223], + device='cuda:2'), covar=tensor([0.0793, 0.0666, 0.0495, 0.1364, 0.1034, 0.0792, 0.0866, 0.0847], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0035, 0.0038, 0.0033, 0.0034, 0.0047, 0.0036, 0.0038], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 10:22:20,977 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=120737.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:22:23,592 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6358, 4.1743, 3.8209, 3.8688, 2.9371, 4.0819, 3.8252, 2.2412], + device='cuda:2'), covar=tensor([0.1747, 0.0594, 0.1372, 0.0695, 0.0897, 0.0334, 0.0885, 0.1914], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0087, 0.0069, 0.0074, 0.0100, 0.0088, 0.0101, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 10:22:28,633 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2310, 3.0421, 2.8180, 2.9309, 3.1684, 3.1577, 3.1746, 3.2017], + device='cuda:2'), covar=tensor([0.0909, 0.0746, 0.2136, 0.2506, 0.0922, 0.0953, 0.1211, 0.0926], + device='cuda:2'), in_proj_covar=tensor([0.0391, 0.0271, 0.0451, 0.0567, 0.0349, 0.0443, 0.0391, 0.0389], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 10:22:45,014 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8223, 1.5817, 1.7941, 1.9754, 1.3856, 1.7506, 1.7085, 1.9201], + device='cuda:2'), covar=tensor([0.0218, 0.0402, 0.0230, 0.0177, 0.0377, 0.0428, 0.0280, 0.0192], + device='cuda:2'), in_proj_covar=tensor([0.0293, 0.0259, 0.0374, 0.0329, 0.0273, 0.0305, 0.0310, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 10:22:59,799 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5402, 2.9657, 5.2592, 4.6381, 4.5306, 5.3419, 5.0687, 5.3986], + device='cuda:2'), covar=tensor([0.1319, 0.1007, 0.0080, 0.0179, 0.0210, 0.0093, 0.0094, 0.0082], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0159, 0.0131, 0.0169, 0.0148, 0.0143, 0.0126, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 10:23:22,446 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.271e+02 2.181e+02 2.578e+02 3.347e+02 7.648e+02, threshold=5.157e+02, percent-clipped=5.0 +2022-12-08 10:23:42,646 INFO [train.py:873] (2/4) Epoch 16, batch 7400, loss[loss=0.1236, simple_loss=0.1532, pruned_loss=0.047, over 6933.00 frames. ], tot_loss[loss=0.1093, simple_loss=0.144, pruned_loss=0.03728, over 1855682.67 frames. ], batch size: 100, lr: 4.85e-03, grad_scale: 8.0 +2022-12-08 10:24:49,561 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.390e+02 2.203e+02 2.672e+02 3.215e+02 8.256e+02, threshold=5.345e+02, percent-clipped=4.0 +2022-12-08 10:25:09,742 INFO [train.py:873] (2/4) Epoch 16, batch 7500, loss[loss=0.108, simple_loss=0.141, pruned_loss=0.03748, over 6000.00 frames. ], tot_loss[loss=0.1103, simple_loss=0.1448, pruned_loss=0.03793, over 1857110.84 frames. ], batch size: 100, lr: 4.84e-03, grad_scale: 8.0 +2022-12-08 10:26:35,449 INFO [train.py:873] (2/4) Epoch 17, batch 0, loss[loss=0.1092, simple_loss=0.1563, pruned_loss=0.031, over 14541.00 frames. ], tot_loss[loss=0.1092, simple_loss=0.1563, pruned_loss=0.031, over 14541.00 frames. ], batch size: 43, lr: 4.70e-03, grad_scale: 8.0 +2022-12-08 10:26:35,449 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 10:26:42,839 INFO [train.py:905] (2/4) Epoch 17, validation: loss=0.1441, simple_loss=0.1813, pruned_loss=0.05348, over 857387.00 frames. +2022-12-08 10:26:42,840 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 10:26:44,738 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6063, 1.4662, 1.5173, 1.5302, 1.5949, 1.0381, 1.3663, 1.5074], + device='cuda:2'), covar=tensor([0.0729, 0.0821, 0.0722, 0.1154, 0.0933, 0.1050, 0.0991, 0.0701], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0034, 0.0037, 0.0032, 0.0033, 0.0046, 0.0035, 0.0037], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 10:26:52,031 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=121003.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:26:56,392 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.236e+01 1.830e+02 2.606e+02 3.490e+02 1.014e+03, threshold=5.212e+02, percent-clipped=7.0 +2022-12-08 10:27:03,009 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1613, 3.9838, 3.8068, 4.2305, 3.8258, 3.5142, 4.2283, 4.0389], + device='cuda:2'), covar=tensor([0.0640, 0.0865, 0.0881, 0.0537, 0.0855, 0.0772, 0.0607, 0.0666], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0143, 0.0147, 0.0160, 0.0148, 0.0123, 0.0170, 0.0149], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 10:27:17,857 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=121032.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:27:44,415 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2785, 2.2281, 3.2179, 3.3337, 3.2697, 2.2618, 3.2199, 2.5017], + device='cuda:2'), covar=tensor([0.0527, 0.1218, 0.0895, 0.0545, 0.0563, 0.1657, 0.0562, 0.1084], + device='cuda:2'), in_proj_covar=tensor([0.0294, 0.0260, 0.0376, 0.0333, 0.0275, 0.0308, 0.0313, 0.0282], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 10:27:46,283 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=121064.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:28:11,823 INFO [train.py:873] (2/4) Epoch 17, batch 100, loss[loss=0.09697, simple_loss=0.1408, pruned_loss=0.02658, over 14278.00 frames. ], tot_loss[loss=0.1093, simple_loss=0.1456, pruned_loss=0.03655, over 896418.59 frames. ], batch size: 44, lr: 4.69e-03, grad_scale: 8.0 +2022-12-08 10:28:13,847 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 10:28:24,822 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.457e+02 2.352e+02 2.943e+02 3.420e+02 5.915e+02, threshold=5.886e+02, percent-clipped=2.0 +2022-12-08 10:29:05,067 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1737, 1.0329, 1.1788, 1.0061, 0.9406, 0.7184, 0.9296, 0.8298], + device='cuda:2'), covar=tensor([0.0211, 0.0168, 0.0198, 0.0210, 0.0233, 0.0423, 0.0252, 0.0361], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0019, 0.0020, 0.0020, 0.0032, 0.0027, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 10:29:13,703 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=121164.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:29:39,130 INFO [train.py:873] (2/4) Epoch 17, batch 200, loss[loss=0.09757, simple_loss=0.1338, pruned_loss=0.03069, over 13957.00 frames. ], tot_loss[loss=0.1096, simple_loss=0.1445, pruned_loss=0.03737, over 1230706.79 frames. ], batch size: 19, lr: 4.69e-03, grad_scale: 8.0 +2022-12-08 10:29:52,502 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.399e+02 2.132e+02 2.532e+02 3.029e+02 6.779e+02, threshold=5.065e+02, percent-clipped=1.0 +2022-12-08 10:30:07,054 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=121225.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:30:35,625 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.76 vs. limit=5.0 +2022-12-08 10:31:06,419 INFO [train.py:873] (2/4) Epoch 17, batch 300, loss[loss=0.08296, simple_loss=0.124, pruned_loss=0.02098, over 13919.00 frames. ], tot_loss[loss=0.1093, simple_loss=0.1441, pruned_loss=0.03725, over 1529825.63 frames. ], batch size: 19, lr: 4.69e-03, grad_scale: 8.0 +2022-12-08 10:31:09,471 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4275, 3.9721, 3.5692, 3.6338, 2.8119, 3.7159, 3.6059, 2.2456], + device='cuda:2'), covar=tensor([0.1705, 0.0622, 0.1147, 0.0610, 0.0842, 0.0528, 0.1048, 0.1782], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0088, 0.0069, 0.0073, 0.0099, 0.0087, 0.0100, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 10:31:19,308 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.075e+02 2.023e+02 2.561e+02 3.068e+02 6.366e+02, threshold=5.121e+02, percent-clipped=2.0 +2022-12-08 10:31:22,051 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=121311.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 10:31:40,182 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=121332.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:32:03,736 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=121359.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:32:14,962 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=121372.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 10:32:22,008 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=121380.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:32:22,187 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5575, 2.1863, 4.5868, 3.1289, 4.3955, 2.1726, 3.2659, 4.4423], + device='cuda:2'), covar=tensor([0.0455, 0.3811, 0.0334, 0.5283, 0.0496, 0.3113, 0.1335, 0.0351], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0204, 0.0219, 0.0275, 0.0238, 0.0208, 0.0203, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 10:32:26,605 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4588, 3.7687, 2.9436, 4.6223, 4.3123, 4.4738, 3.7179, 3.2591], + device='cuda:2'), covar=tensor([0.0645, 0.1263, 0.3550, 0.0454, 0.0732, 0.0932, 0.1276, 0.2654], + device='cuda:2'), in_proj_covar=tensor([0.0283, 0.0292, 0.0261, 0.0286, 0.0325, 0.0301, 0.0255, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 10:32:33,192 INFO [train.py:873] (2/4) Epoch 17, batch 400, loss[loss=0.1291, simple_loss=0.138, pruned_loss=0.0601, over 2533.00 frames. ], tot_loss[loss=0.1097, simple_loss=0.1443, pruned_loss=0.03757, over 1688398.35 frames. ], batch size: 100, lr: 4.69e-03, grad_scale: 8.0 +2022-12-08 10:32:45,763 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=121407.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:32:47,376 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.084e+02 2.168e+02 2.580e+02 3.311e+02 5.353e+02, threshold=5.159e+02, percent-clipped=2.0 +2022-12-08 10:32:52,922 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0470, 1.0600, 1.1613, 0.9308, 0.9717, 0.6371, 0.7492, 0.8037], + device='cuda:2'), covar=tensor([0.0254, 0.0221, 0.0205, 0.0246, 0.0260, 0.0435, 0.0315, 0.0400], + device='cuda:2'), in_proj_covar=tensor([0.0021, 0.0021, 0.0020, 0.0021, 0.0020, 0.0032, 0.0027, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 10:33:38,830 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=121468.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:34:00,287 INFO [train.py:873] (2/4) Epoch 17, batch 500, loss[loss=0.1672, simple_loss=0.1538, pruned_loss=0.09029, over 1312.00 frames. ], tot_loss[loss=0.1099, simple_loss=0.1447, pruned_loss=0.03754, over 1802721.35 frames. ], batch size: 100, lr: 4.69e-03, grad_scale: 8.0 +2022-12-08 10:34:14,595 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.238e+02 2.218e+02 2.633e+02 3.024e+02 9.780e+02, threshold=5.266e+02, percent-clipped=2.0 +2022-12-08 10:34:23,966 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=121520.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:34:28,411 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8062, 0.8654, 0.6456, 0.8879, 0.8649, 0.3612, 0.7528, 0.8765], + device='cuda:2'), covar=tensor([0.0391, 0.0521, 0.0452, 0.0375, 0.0273, 0.0349, 0.0862, 0.0642], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0034, 0.0038, 0.0032, 0.0033, 0.0047, 0.0035, 0.0037], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 10:34:57,984 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8266, 3.5730, 3.4352, 3.8166, 3.5412, 3.7343, 3.8350, 3.2218], + device='cuda:2'), covar=tensor([0.0442, 0.1054, 0.0570, 0.0495, 0.0855, 0.0369, 0.0590, 0.0603], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0277, 0.0200, 0.0194, 0.0187, 0.0157, 0.0290, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 10:35:22,057 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.38 vs. limit=2.0 +2022-12-08 10:35:25,051 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9626, 1.5831, 3.1036, 2.8233, 2.9554, 3.1640, 2.2968, 3.0960], + device='cuda:2'), covar=tensor([0.1267, 0.1445, 0.0192, 0.0408, 0.0389, 0.0208, 0.0567, 0.0209], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0159, 0.0130, 0.0169, 0.0149, 0.0144, 0.0125, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 10:35:27,466 INFO [train.py:873] (2/4) Epoch 17, batch 600, loss[loss=0.08588, simple_loss=0.1263, pruned_loss=0.02271, over 13964.00 frames. ], tot_loss[loss=0.11, simple_loss=0.1448, pruned_loss=0.03765, over 1850685.71 frames. ], batch size: 20, lr: 4.69e-03, grad_scale: 8.0 +2022-12-08 10:35:41,061 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.467e+02 2.288e+02 2.719e+02 3.282e+02 7.294e+02, threshold=5.437e+02, percent-clipped=3.0 +2022-12-08 10:36:25,624 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=121659.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:36:32,554 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=121667.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 10:36:55,477 INFO [train.py:873] (2/4) Epoch 17, batch 700, loss[loss=0.08713, simple_loss=0.1289, pruned_loss=0.02269, over 13931.00 frames. ], tot_loss[loss=0.1101, simple_loss=0.1446, pruned_loss=0.03777, over 1890011.61 frames. ], batch size: 20, lr: 4.68e-03, grad_scale: 8.0 +2022-12-08 10:37:08,193 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=121707.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:37:09,981 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.050e+02 2.051e+02 2.543e+02 3.291e+02 5.675e+02, threshold=5.087e+02, percent-clipped=1.0 +2022-12-08 10:37:19,854 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-12-08 10:37:57,037 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=121763.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:38:17,305 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3001, 1.3788, 1.2035, 1.4122, 1.4334, 0.9596, 1.1556, 1.2513], + device='cuda:2'), covar=tensor([0.0694, 0.0664, 0.0663, 0.0497, 0.0508, 0.0805, 0.0897, 0.0639], + device='cuda:2'), in_proj_covar=tensor([0.0035, 0.0034, 0.0038, 0.0032, 0.0033, 0.0046, 0.0035, 0.0037], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 10:38:23,115 INFO [train.py:873] (2/4) Epoch 17, batch 800, loss[loss=0.1141, simple_loss=0.1493, pruned_loss=0.03942, over 14422.00 frames. ], tot_loss[loss=0.1105, simple_loss=0.1446, pruned_loss=0.03817, over 1847011.77 frames. ], batch size: 51, lr: 4.68e-03, grad_scale: 8.0 +2022-12-08 10:38:37,355 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.375e+02 2.062e+02 2.469e+02 3.248e+02 9.190e+02, threshold=4.938e+02, percent-clipped=3.0 +2022-12-08 10:38:47,589 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=121820.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:39:29,510 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=121868.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:39:51,406 INFO [train.py:873] (2/4) Epoch 17, batch 900, loss[loss=0.1103, simple_loss=0.1394, pruned_loss=0.04061, over 4919.00 frames. ], tot_loss[loss=0.1095, simple_loss=0.1438, pruned_loss=0.03756, over 1833021.41 frames. ], batch size: 100, lr: 4.68e-03, grad_scale: 8.0 +2022-12-08 10:40:04,968 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.307e+02 2.025e+02 2.484e+02 3.273e+02 4.983e+02, threshold=4.968e+02, percent-clipped=1.0 +2022-12-08 10:40:28,158 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.09 vs. limit=5.0 +2022-12-08 10:40:40,924 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=121950.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:40:55,441 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=121967.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 10:41:17,349 INFO [train.py:873] (2/4) Epoch 17, batch 1000, loss[loss=0.1129, simple_loss=0.1536, pruned_loss=0.03607, over 14201.00 frames. ], tot_loss[loss=0.1091, simple_loss=0.1441, pruned_loss=0.03703, over 1910802.16 frames. ], batch size: 46, lr: 4.68e-03, grad_scale: 8.0 +2022-12-08 10:41:31,553 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.080e+02 2.007e+02 2.510e+02 3.013e+02 6.170e+02, threshold=5.019e+02, percent-clipped=2.0 +2022-12-08 10:41:33,515 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=122011.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:41:36,916 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=122015.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 10:41:44,686 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.72 vs. limit=5.0 +2022-12-08 10:42:14,080 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.68 vs. limit=5.0 +2022-12-08 10:42:18,945 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=122063.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:42:44,898 INFO [train.py:873] (2/4) Epoch 17, batch 1100, loss[loss=0.1034, simple_loss=0.1376, pruned_loss=0.03463, over 13915.00 frames. ], tot_loss[loss=0.1098, simple_loss=0.1446, pruned_loss=0.03747, over 1954175.34 frames. ], batch size: 20, lr: 4.68e-03, grad_scale: 8.0 +2022-12-08 10:42:59,064 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.272e+02 2.098e+02 2.553e+02 3.146e+02 5.520e+02, threshold=5.106e+02, percent-clipped=3.0 +2022-12-08 10:43:00,814 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=122111.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:43:05,860 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=122117.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:43:59,276 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=122178.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 10:44:12,206 INFO [train.py:873] (2/4) Epoch 17, batch 1200, loss[loss=0.1142, simple_loss=0.154, pruned_loss=0.03724, over 14252.00 frames. ], tot_loss[loss=0.1087, simple_loss=0.1439, pruned_loss=0.0367, over 1980046.09 frames. ], batch size: 66, lr: 4.67e-03, grad_scale: 8.0 +2022-12-08 10:44:13,798 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=122195.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:44:26,232 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.259e+02 1.994e+02 2.399e+02 3.114e+02 6.684e+02, threshold=4.798e+02, percent-clipped=5.0 +2022-12-08 10:44:53,019 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4386, 3.7856, 2.9021, 4.5218, 4.2043, 4.3485, 3.7929, 3.1817], + device='cuda:2'), covar=tensor([0.0554, 0.1229, 0.3276, 0.0627, 0.1013, 0.1564, 0.1202, 0.2884], + device='cuda:2'), in_proj_covar=tensor([0.0279, 0.0289, 0.0257, 0.0281, 0.0320, 0.0299, 0.0252, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 10:44:54,843 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7520, 1.3721, 2.9635, 1.5053, 2.9621, 2.8927, 2.1653, 3.0539], + device='cuda:2'), covar=tensor([0.0340, 0.3024, 0.0444, 0.2186, 0.0401, 0.0517, 0.1059, 0.0298], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0157, 0.0161, 0.0169, 0.0168, 0.0181, 0.0133, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 10:45:07,634 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=122256.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:45:39,708 INFO [train.py:873] (2/4) Epoch 17, batch 1300, loss[loss=0.08525, simple_loss=0.1294, pruned_loss=0.02054, over 14167.00 frames. ], tot_loss[loss=0.1084, simple_loss=0.1437, pruned_loss=0.03657, over 1944075.90 frames. ], batch size: 29, lr: 4.67e-03, grad_scale: 8.0 +2022-12-08 10:45:52,044 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=122306.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:45:54,349 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.486e+02 2.064e+02 2.522e+02 2.867e+02 6.773e+02, threshold=5.044e+02, percent-clipped=2.0 +2022-12-08 10:46:03,016 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=122319.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:46:46,277 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2533, 4.1810, 4.6554, 3.7454, 3.4563, 4.0920, 2.3434, 4.3909], + device='cuda:2'), covar=tensor([0.0680, 0.0648, 0.0336, 0.1088, 0.1471, 0.0843, 0.2158, 0.0487], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0101, 0.0095, 0.0099, 0.0117, 0.0090, 0.0119, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 10:46:56,010 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=122380.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:47:07,175 INFO [train.py:873] (2/4) Epoch 17, batch 1400, loss[loss=0.1752, simple_loss=0.1658, pruned_loss=0.09234, over 1259.00 frames. ], tot_loss[loss=0.1078, simple_loss=0.1437, pruned_loss=0.03599, over 1987213.32 frames. ], batch size: 100, lr: 4.67e-03, grad_scale: 8.0 +2022-12-08 10:47:13,146 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-12-08 10:47:21,307 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.812e+01 2.172e+02 2.630e+02 3.463e+02 6.435e+02, threshold=5.259e+02, percent-clipped=4.0 +2022-12-08 10:48:13,755 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=122468.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:48:17,805 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=122473.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 10:48:35,290 INFO [train.py:873] (2/4) Epoch 17, batch 1500, loss[loss=0.1051, simple_loss=0.1431, pruned_loss=0.03357, over 14271.00 frames. ], tot_loss[loss=0.1084, simple_loss=0.144, pruned_loss=0.03633, over 2021383.66 frames. ], batch size: 66, lr: 4.67e-03, grad_scale: 8.0 +2022-12-08 10:48:49,380 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.412e+02 2.116e+02 2.482e+02 3.540e+02 1.353e+03, threshold=4.965e+02, percent-clipped=3.0 +2022-12-08 10:49:02,870 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.20 vs. limit=5.0 +2022-12-08 10:49:06,491 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=122529.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:49:14,426 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6325, 1.7738, 1.8696, 1.3736, 1.2796, 1.6767, 1.1669, 1.6877], + device='cuda:2'), covar=tensor([0.1581, 0.2000, 0.0964, 0.2462, 0.2724, 0.1114, 0.2422, 0.1142], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0101, 0.0095, 0.0099, 0.0117, 0.0090, 0.0118, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 10:49:26,299 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=122551.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:50:03,167 INFO [train.py:873] (2/4) Epoch 17, batch 1600, loss[loss=0.136, simple_loss=0.1393, pruned_loss=0.06641, over 2634.00 frames. ], tot_loss[loss=0.1079, simple_loss=0.1436, pruned_loss=0.03611, over 1994811.76 frames. ], batch size: 100, lr: 4.67e-03, grad_scale: 8.0 +2022-12-08 10:50:14,918 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=122606.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:50:17,233 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.386e+01 1.886e+02 2.427e+02 3.060e+02 1.117e+03, threshold=4.854e+02, percent-clipped=4.0 +2022-12-08 10:50:55,388 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 10:50:56,450 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=122654.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:51:06,462 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-12-08 10:51:15,255 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=122675.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:51:30,440 INFO [train.py:873] (2/4) Epoch 17, batch 1700, loss[loss=0.09266, simple_loss=0.1399, pruned_loss=0.0227, over 14201.00 frames. ], tot_loss[loss=0.1082, simple_loss=0.144, pruned_loss=0.03618, over 1998803.01 frames. ], batch size: 35, lr: 4.66e-03, grad_scale: 4.0 +2022-12-08 10:51:42,253 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3622, 1.0929, 1.2112, 0.8289, 1.1819, 1.4046, 1.0746, 1.0926], + device='cuda:2'), covar=tensor([0.0527, 0.0768, 0.0794, 0.0525, 0.1069, 0.0805, 0.0710, 0.1382], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0173, 0.0140, 0.0127, 0.0144, 0.0155, 0.0135, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 10:51:45,734 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.060e+02 2.028e+02 2.462e+02 2.959e+02 4.889e+02, threshold=4.923e+02, percent-clipped=1.0 +2022-12-08 10:51:47,052 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 10:51:56,147 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5526, 1.7687, 1.8558, 1.3259, 1.2810, 1.6490, 1.1525, 1.6715], + device='cuda:2'), covar=tensor([0.1936, 0.2178, 0.1025, 0.2713, 0.2940, 0.1375, 0.2811, 0.1171], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0102, 0.0096, 0.0100, 0.0117, 0.0092, 0.0119, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 10:52:40,021 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=122773.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 10:52:57,842 INFO [train.py:873] (2/4) Epoch 17, batch 1800, loss[loss=0.1182, simple_loss=0.1507, pruned_loss=0.04284, over 8617.00 frames. ], tot_loss[loss=0.1086, simple_loss=0.1439, pruned_loss=0.03662, over 1938067.75 frames. ], batch size: 100, lr: 4.66e-03, grad_scale: 4.0 +2022-12-08 10:53:00,008 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-12-08 10:53:12,808 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.140e+02 2.194e+02 2.593e+02 3.400e+02 7.475e+02, threshold=5.185e+02, percent-clipped=4.0 +2022-12-08 10:53:15,771 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.31 vs. limit=2.0 +2022-12-08 10:53:22,454 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=122821.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:53:25,023 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=122824.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:53:48,784 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=122851.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:54:00,355 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.74 vs. limit=5.0 +2022-12-08 10:54:07,176 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=122872.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:54:14,491 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-08 10:54:23,079 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0962, 1.6470, 3.9789, 3.6652, 3.7798, 4.0374, 3.4557, 4.0016], + device='cuda:2'), covar=tensor([0.1470, 0.1695, 0.0123, 0.0235, 0.0233, 0.0147, 0.0257, 0.0132], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0159, 0.0130, 0.0168, 0.0148, 0.0144, 0.0124, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 10:54:25,553 INFO [train.py:873] (2/4) Epoch 17, batch 1900, loss[loss=0.09166, simple_loss=0.1364, pruned_loss=0.02348, over 14675.00 frames. ], tot_loss[loss=0.1081, simple_loss=0.1433, pruned_loss=0.03639, over 1902864.11 frames. ], batch size: 33, lr: 4.66e-03, grad_scale: 4.0 +2022-12-08 10:54:30,911 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=122899.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:54:38,782 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0006, 1.7065, 1.9508, 1.9927, 1.9114, 2.0089, 2.0501, 1.7356], + device='cuda:2'), covar=tensor([0.1841, 0.3125, 0.1707, 0.1682, 0.2120, 0.1377, 0.1803, 0.1720], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0278, 0.0202, 0.0196, 0.0187, 0.0157, 0.0290, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 10:54:40,367 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.411e+02 2.015e+02 2.576e+02 3.102e+02 5.463e+02, threshold=5.151e+02, percent-clipped=2.0 +2022-12-08 10:54:49,324 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9486, 2.4919, 3.7544, 2.8487, 3.7712, 3.6230, 3.5625, 3.2422], + device='cuda:2'), covar=tensor([0.0854, 0.3191, 0.1131, 0.1983, 0.0962, 0.1050, 0.1527, 0.1661], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0312, 0.0390, 0.0299, 0.0366, 0.0321, 0.0357, 0.0297], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 10:55:00,440 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=122933.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:55:33,673 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8223, 3.0701, 4.6218, 3.5630, 4.5988, 4.5505, 4.4785, 4.0670], + device='cuda:2'), covar=tensor([0.0841, 0.2899, 0.0891, 0.1670, 0.0747, 0.0810, 0.1134, 0.1682], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0312, 0.0391, 0.0299, 0.0365, 0.0321, 0.0357, 0.0298], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 10:55:37,071 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=122975.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:55:52,710 INFO [train.py:873] (2/4) Epoch 17, batch 2000, loss[loss=0.1213, simple_loss=0.1538, pruned_loss=0.04443, over 14280.00 frames. ], tot_loss[loss=0.1085, simple_loss=0.1434, pruned_loss=0.03683, over 1850876.68 frames. ], batch size: 80, lr: 4.66e-03, grad_scale: 8.0 +2022-12-08 10:56:08,074 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.251e+02 2.103e+02 2.584e+02 3.281e+02 5.069e+02, threshold=5.168e+02, percent-clipped=0.0 +2022-12-08 10:56:19,424 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=123023.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:56:23,335 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8923, 4.7561, 4.4317, 4.9248, 4.4962, 4.2731, 4.9996, 4.6761], + device='cuda:2'), covar=tensor([0.0597, 0.0770, 0.0846, 0.0602, 0.0774, 0.0577, 0.0584, 0.0774], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0144, 0.0147, 0.0162, 0.0149, 0.0123, 0.0170, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 10:57:13,528 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-12-08 10:57:20,549 INFO [train.py:873] (2/4) Epoch 17, batch 2100, loss[loss=0.1157, simple_loss=0.1401, pruned_loss=0.04565, over 4941.00 frames. ], tot_loss[loss=0.1093, simple_loss=0.1438, pruned_loss=0.03739, over 1842977.62 frames. ], batch size: 100, lr: 4.66e-03, grad_scale: 4.0 +2022-12-08 10:57:29,934 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8374, 2.6895, 3.4003, 2.3475, 2.1852, 2.9347, 1.5664, 3.1788], + device='cuda:2'), covar=tensor([0.1103, 0.1173, 0.0505, 0.1577, 0.1933, 0.0896, 0.3038, 0.0651], + device='cuda:2'), in_proj_covar=tensor([0.0085, 0.0101, 0.0095, 0.0100, 0.0116, 0.0091, 0.0117, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 10:57:36,150 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.184e+02 2.207e+02 2.592e+02 3.533e+02 7.488e+02, threshold=5.184e+02, percent-clipped=8.0 +2022-12-08 10:57:47,440 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=123124.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:57:51,328 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.86 vs. limit=5.0 +2022-12-08 10:58:05,029 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-12-08 10:58:14,598 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7411, 3.5666, 3.4214, 3.7542, 3.2743, 3.3079, 3.8289, 3.6237], + device='cuda:2'), covar=tensor([0.0687, 0.0990, 0.0935, 0.0690, 0.1071, 0.0659, 0.0596, 0.0760], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0144, 0.0147, 0.0162, 0.0149, 0.0123, 0.0170, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 10:58:29,183 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=123172.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 10:58:46,982 INFO [train.py:873] (2/4) Epoch 17, batch 2200, loss[loss=0.1266, simple_loss=0.148, pruned_loss=0.05261, over 6937.00 frames. ], tot_loss[loss=0.1101, simple_loss=0.1445, pruned_loss=0.03788, over 1896837.10 frames. ], batch size: 100, lr: 4.65e-03, grad_scale: 4.0 +2022-12-08 10:59:03,099 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.154e+02 2.073e+02 2.495e+02 2.880e+02 5.648e+02, threshold=4.991e+02, percent-clipped=1.0 +2022-12-08 10:59:17,867 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=123228.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:00:13,629 INFO [train.py:873] (2/4) Epoch 17, batch 2300, loss[loss=0.1265, simple_loss=0.148, pruned_loss=0.05247, over 5961.00 frames. ], tot_loss[loss=0.109, simple_loss=0.1436, pruned_loss=0.03721, over 1902452.02 frames. ], batch size: 100, lr: 4.65e-03, grad_scale: 4.0 +2022-12-08 11:00:29,911 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.110e+02 2.209e+02 2.627e+02 3.184e+02 5.736e+02, threshold=5.255e+02, percent-clipped=2.0 +2022-12-08 11:00:32,674 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=123314.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:01:17,719 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6727, 2.6742, 2.0413, 2.7580, 2.6712, 2.7103, 2.4189, 2.1446], + device='cuda:2'), covar=tensor([0.1316, 0.1319, 0.3087, 0.0967, 0.1168, 0.0977, 0.1467, 0.2287], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0291, 0.0259, 0.0286, 0.0322, 0.0301, 0.0253, 0.0240], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:01:26,425 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=123375.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:01:41,813 INFO [train.py:873] (2/4) Epoch 17, batch 2400, loss[loss=0.1076, simple_loss=0.1472, pruned_loss=0.03398, over 14217.00 frames. ], tot_loss[loss=0.1096, simple_loss=0.1444, pruned_loss=0.03741, over 1976515.18 frames. ], batch size: 94, lr: 4.65e-03, grad_scale: 8.0 +2022-12-08 11:01:58,033 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.281e+02 2.088e+02 2.563e+02 3.123e+02 4.912e+02, threshold=5.125e+02, percent-clipped=0.0 +2022-12-08 11:02:12,668 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=123428.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:02:37,700 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9250, 3.4667, 3.2721, 3.3748, 2.4650, 3.2455, 3.1156, 1.7081], + device='cuda:2'), covar=tensor([0.1666, 0.0613, 0.0896, 0.0607, 0.1104, 0.0549, 0.1195, 0.2207], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0087, 0.0069, 0.0074, 0.0098, 0.0087, 0.0100, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 11:02:39,457 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5482, 2.2922, 3.4029, 2.6364, 3.3743, 3.3053, 3.2404, 2.7758], + device='cuda:2'), covar=tensor([0.0758, 0.2933, 0.0981, 0.1826, 0.0821, 0.1064, 0.1208, 0.1776], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0310, 0.0390, 0.0299, 0.0366, 0.0321, 0.0358, 0.0298], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:03:07,242 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=123489.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:03:10,661 INFO [train.py:873] (2/4) Epoch 17, batch 2500, loss[loss=0.1462, simple_loss=0.1654, pruned_loss=0.06352, over 7814.00 frames. ], tot_loss[loss=0.1102, simple_loss=0.1447, pruned_loss=0.03788, over 1895709.61 frames. ], batch size: 100, lr: 4.65e-03, grad_scale: 8.0 +2022-12-08 11:03:26,845 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.447e+02 2.132e+02 2.678e+02 3.392e+02 6.335e+02, threshold=5.356e+02, percent-clipped=3.0 +2022-12-08 11:03:34,259 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.09 vs. limit=2.0 +2022-12-08 11:03:36,857 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5063, 3.5056, 3.2480, 3.5710, 3.1613, 3.1772, 3.5829, 3.4228], + device='cuda:2'), covar=tensor([0.0780, 0.0870, 0.0995, 0.0763, 0.1041, 0.0805, 0.0787, 0.0897], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0143, 0.0147, 0.0162, 0.0148, 0.0123, 0.0170, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 11:03:42,090 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=123528.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:04:06,584 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8434, 1.5477, 2.8230, 2.4962, 2.6766, 2.8329, 2.0461, 2.7771], + device='cuda:2'), covar=tensor([0.1241, 0.1389, 0.0210, 0.0515, 0.0521, 0.0224, 0.0709, 0.0272], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0156, 0.0130, 0.0167, 0.0147, 0.0142, 0.0123, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 11:04:15,083 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.44 vs. limit=5.0 +2022-12-08 11:04:23,968 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=123576.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:04:39,203 INFO [train.py:873] (2/4) Epoch 17, batch 2600, loss[loss=0.09973, simple_loss=0.1322, pruned_loss=0.03363, over 6030.00 frames. ], tot_loss[loss=0.1081, simple_loss=0.1435, pruned_loss=0.03631, over 1960699.86 frames. ], batch size: 100, lr: 4.65e-03, grad_scale: 8.0 +2022-12-08 11:04:55,081 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.145e+02 1.985e+02 2.407e+02 3.072e+02 5.323e+02, threshold=4.814e+02, percent-clipped=0.0 +2022-12-08 11:05:46,088 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=123670.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:06:06,727 INFO [train.py:873] (2/4) Epoch 17, batch 2700, loss[loss=0.09626, simple_loss=0.1392, pruned_loss=0.02668, over 14295.00 frames. ], tot_loss[loss=0.1077, simple_loss=0.1431, pruned_loss=0.03615, over 1897388.23 frames. ], batch size: 25, lr: 4.65e-03, grad_scale: 8.0 +2022-12-08 11:06:22,183 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.075e+02 2.020e+02 2.571e+02 3.169e+02 5.832e+02, threshold=5.143e+02, percent-clipped=4.0 +2022-12-08 11:07:25,890 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=123784.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:07:33,392 INFO [train.py:873] (2/4) Epoch 17, batch 2800, loss[loss=0.1195, simple_loss=0.1531, pruned_loss=0.04293, over 14241.00 frames. ], tot_loss[loss=0.1087, simple_loss=0.1439, pruned_loss=0.0367, over 1918522.32 frames. ], batch size: 69, lr: 4.64e-03, grad_scale: 8.0 +2022-12-08 11:07:49,825 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.157e+02 2.052e+02 2.657e+02 3.438e+02 8.382e+02, threshold=5.313e+02, percent-clipped=4.0 +2022-12-08 11:08:35,239 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8707, 2.0578, 3.9166, 2.7478, 3.7450, 2.0093, 3.0232, 3.7802], + device='cuda:2'), covar=tensor([0.0707, 0.3847, 0.0453, 0.5182, 0.0643, 0.3173, 0.1273, 0.0497], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0201, 0.0217, 0.0270, 0.0236, 0.0205, 0.0202, 0.0216], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 11:08:43,036 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=123872.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:08:48,707 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.44 vs. limit=5.0 +2022-12-08 11:09:01,389 INFO [train.py:873] (2/4) Epoch 17, batch 2900, loss[loss=0.09534, simple_loss=0.1354, pruned_loss=0.02765, over 13861.00 frames. ], tot_loss[loss=0.1085, simple_loss=0.1437, pruned_loss=0.03663, over 2003329.83 frames. ], batch size: 20, lr: 4.64e-03, grad_scale: 8.0 +2022-12-08 11:09:11,364 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.50 vs. limit=2.0 +2022-12-08 11:09:16,751 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.276e+02 2.204e+02 2.576e+02 3.173e+02 6.433e+02, threshold=5.152e+02, percent-clipped=4.0 +2022-12-08 11:09:20,738 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1646, 2.3676, 2.4376, 2.3900, 1.9659, 1.9376, 1.6490, 1.8956], + device='cuda:2'), covar=tensor([0.0249, 0.0248, 0.0178, 0.0327, 0.0330, 0.0501, 0.0481, 0.0411], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0021, 0.0020, 0.0021, 0.0021, 0.0033, 0.0027, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 11:09:36,549 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=123933.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:10:06,017 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.46 vs. limit=2.0 +2022-12-08 11:10:08,208 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=123970.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:10:27,941 INFO [train.py:873] (2/4) Epoch 17, batch 3000, loss[loss=0.09956, simple_loss=0.1358, pruned_loss=0.03165, over 14406.00 frames. ], tot_loss[loss=0.1091, simple_loss=0.1443, pruned_loss=0.03689, over 2038907.19 frames. ], batch size: 53, lr: 4.64e-03, grad_scale: 8.0 +2022-12-08 11:10:27,942 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 11:10:36,501 INFO [train.py:905] (2/4) Epoch 17, validation: loss=0.1392, simple_loss=0.1759, pruned_loss=0.05127, over 857387.00 frames. +2022-12-08 11:10:36,502 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 11:10:52,776 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.289e+02 2.049e+02 2.779e+02 3.287e+02 7.415e+02, threshold=5.559e+02, percent-clipped=4.0 +2022-12-08 11:10:58,412 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=124018.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:11:14,556 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=124036.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:11:37,541 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=124062.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:11:40,825 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.82 vs. limit=2.0 +2022-12-08 11:11:41,887 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4838, 2.6217, 4.2989, 4.5174, 4.2472, 2.5300, 4.4332, 3.4627], + device='cuda:2'), covar=tensor([0.0392, 0.1227, 0.0778, 0.0339, 0.0533, 0.1936, 0.0440, 0.0901], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0258, 0.0376, 0.0333, 0.0272, 0.0307, 0.0315, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 11:11:57,231 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=124084.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:11:58,097 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8667, 1.5902, 3.8341, 3.5070, 3.6081, 3.8578, 3.1755, 3.8745], + device='cuda:2'), covar=tensor([0.1604, 0.1591, 0.0118, 0.0295, 0.0256, 0.0139, 0.0341, 0.0133], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0157, 0.0130, 0.0168, 0.0147, 0.0143, 0.0124, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 11:12:04,347 INFO [train.py:873] (2/4) Epoch 17, batch 3100, loss[loss=0.09702, simple_loss=0.1391, pruned_loss=0.02748, over 14016.00 frames. ], tot_loss[loss=0.1087, simple_loss=0.1438, pruned_loss=0.03677, over 1994501.52 frames. ], batch size: 29, lr: 4.64e-03, grad_scale: 4.0 +2022-12-08 11:12:08,173 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=124097.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:12:20,721 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.093e+02 2.198e+02 2.691e+02 3.318e+02 1.640e+03, threshold=5.381e+02, percent-clipped=4.0 +2022-12-08 11:12:31,003 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=124123.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:12:38,790 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=124132.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:13:31,917 INFO [train.py:873] (2/4) Epoch 17, batch 3200, loss[loss=0.1089, simple_loss=0.1486, pruned_loss=0.03461, over 14405.00 frames. ], tot_loss[loss=0.1083, simple_loss=0.1435, pruned_loss=0.03654, over 2005817.72 frames. ], batch size: 53, lr: 4.64e-03, grad_scale: 8.0 +2022-12-08 11:13:48,753 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.337e+02 1.975e+02 2.521e+02 3.067e+02 5.500e+02, threshold=5.042e+02, percent-clipped=1.0 +2022-12-08 11:14:02,438 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=124228.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:14:21,399 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.37 vs. limit=2.0 +2022-12-08 11:14:40,352 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=124272.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:14:40,571 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.15 vs. limit=2.0 +2022-12-08 11:14:58,461 INFO [train.py:873] (2/4) Epoch 17, batch 3300, loss[loss=0.09355, simple_loss=0.1366, pruned_loss=0.02527, over 14173.00 frames. ], tot_loss[loss=0.1077, simple_loss=0.1433, pruned_loss=0.03609, over 2086514.32 frames. ], batch size: 89, lr: 4.63e-03, grad_scale: 8.0 +2022-12-08 11:15:14,505 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.244e+02 2.029e+02 2.735e+02 3.344e+02 8.067e+02, threshold=5.469e+02, percent-clipped=3.0 +2022-12-08 11:15:33,693 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=124333.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:15:47,161 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=124348.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 11:16:25,287 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=124392.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:16:26,083 INFO [train.py:873] (2/4) Epoch 17, batch 3400, loss[loss=0.1113, simple_loss=0.1414, pruned_loss=0.04065, over 4955.00 frames. ], tot_loss[loss=0.1079, simple_loss=0.1433, pruned_loss=0.03628, over 2027036.93 frames. ], batch size: 100, lr: 4.63e-03, grad_scale: 4.0 +2022-12-08 11:16:40,697 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=124409.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 11:16:43,978 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.203e+02 2.119e+02 2.705e+02 3.387e+02 6.415e+02, threshold=5.409e+02, percent-clipped=2.0 +2022-12-08 11:16:48,648 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=124418.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:16:57,569 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9675, 2.1310, 2.8155, 2.3648, 2.8375, 2.8173, 2.7244, 2.5022], + device='cuda:2'), covar=tensor([0.0741, 0.2630, 0.0923, 0.1430, 0.0649, 0.0964, 0.0902, 0.1257], + device='cuda:2'), in_proj_covar=tensor([0.0355, 0.0312, 0.0393, 0.0301, 0.0367, 0.0323, 0.0363, 0.0298], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:17:00,946 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=124432.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:17:54,881 INFO [train.py:873] (2/4) Epoch 17, batch 3500, loss[loss=0.1089, simple_loss=0.147, pruned_loss=0.0354, over 14316.00 frames. ], tot_loss[loss=0.1076, simple_loss=0.1431, pruned_loss=0.03598, over 2028613.28 frames. ], batch size: 31, lr: 4.63e-03, grad_scale: 4.0 +2022-12-08 11:17:55,012 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=124493.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:18:12,217 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.247e+02 2.179e+02 2.842e+02 3.919e+02 2.207e+03, threshold=5.685e+02, percent-clipped=7.0 +2022-12-08 11:18:19,160 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-12-08 11:18:25,960 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=124528.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:19:04,504 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6713, 4.7661, 5.0747, 4.3730, 4.8525, 5.0419, 1.9224, 4.4907], + device='cuda:2'), covar=tensor([0.0278, 0.0275, 0.0346, 0.0314, 0.0251, 0.0175, 0.2951, 0.0279], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0174, 0.0146, 0.0147, 0.0207, 0.0144, 0.0158, 0.0193], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 11:19:08,278 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=124576.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:19:22,686 INFO [train.py:873] (2/4) Epoch 17, batch 3600, loss[loss=0.1582, simple_loss=0.1475, pruned_loss=0.08444, over 1228.00 frames. ], tot_loss[loss=0.1082, simple_loss=0.1436, pruned_loss=0.03643, over 1979838.13 frames. ], batch size: 100, lr: 4.63e-03, grad_scale: 8.0 +2022-12-08 11:19:40,800 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.418e+02 1.900e+02 2.307e+02 2.789e+02 7.825e+02, threshold=4.613e+02, percent-clipped=1.0 +2022-12-08 11:19:53,713 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=124628.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:20:02,955 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=124638.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:20:24,709 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=124663.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:20:50,231 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=124692.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:20:50,939 INFO [train.py:873] (2/4) Epoch 17, batch 3700, loss[loss=0.08977, simple_loss=0.1339, pruned_loss=0.02281, over 14669.00 frames. ], tot_loss[loss=0.1086, simple_loss=0.1438, pruned_loss=0.03664, over 1980514.55 frames. ], batch size: 23, lr: 4.63e-03, grad_scale: 8.0 +2022-12-08 11:20:56,534 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=124699.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:21:00,479 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=124704.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 11:21:07,859 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.074e+02 2.026e+02 2.516e+02 3.153e+02 6.578e+02, threshold=5.031e+02, percent-clipped=7.0 +2022-12-08 11:21:12,574 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=124718.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:21:17,746 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=124724.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:21:31,750 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=124740.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:21:54,339 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=124766.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:22:13,586 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=124788.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:22:16,061 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0710, 2.0663, 4.6262, 4.2208, 4.1607, 4.7320, 4.2978, 4.7136], + device='cuda:2'), covar=tensor([0.1483, 0.1362, 0.0108, 0.0202, 0.0223, 0.0115, 0.0168, 0.0100], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0160, 0.0132, 0.0170, 0.0149, 0.0145, 0.0126, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 11:22:17,562 INFO [train.py:873] (2/4) Epoch 17, batch 3800, loss[loss=0.1311, simple_loss=0.1585, pruned_loss=0.05188, over 12744.00 frames. ], tot_loss[loss=0.1075, simple_loss=0.1431, pruned_loss=0.03593, over 1966550.68 frames. ], batch size: 100, lr: 4.62e-03, grad_scale: 8.0 +2022-12-08 11:22:35,562 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.197e+02 2.150e+02 2.611e+02 3.247e+02 6.025e+02, threshold=5.221e+02, percent-clipped=4.0 +2022-12-08 11:22:41,899 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=124820.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:23:15,147 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3401, 1.9273, 2.2810, 1.5822, 2.0633, 2.3489, 2.2326, 2.0021], + device='cuda:2'), covar=tensor([0.0889, 0.0592, 0.1014, 0.1396, 0.1299, 0.0994, 0.0824, 0.1461], + device='cuda:2'), in_proj_covar=tensor([0.0152, 0.0171, 0.0141, 0.0127, 0.0143, 0.0154, 0.0135, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 11:23:16,942 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1528, 2.0609, 4.1458, 2.8046, 4.0249, 1.9061, 3.0836, 4.0314], + device='cuda:2'), covar=tensor([0.0620, 0.3886, 0.0443, 0.5710, 0.0630, 0.3414, 0.1384, 0.0438], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0200, 0.0217, 0.0270, 0.0235, 0.0205, 0.0203, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 11:23:35,451 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=124881.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:23:45,836 INFO [train.py:873] (2/4) Epoch 17, batch 3900, loss[loss=0.1649, simple_loss=0.1527, pruned_loss=0.08849, over 1318.00 frames. ], tot_loss[loss=0.1064, simple_loss=0.1426, pruned_loss=0.03508, over 2016123.35 frames. ], batch size: 100, lr: 4.62e-03, grad_scale: 4.0 +2022-12-08 11:24:04,347 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.083e+02 2.040e+02 2.475e+02 3.105e+02 8.016e+02, threshold=4.950e+02, percent-clipped=1.0 +2022-12-08 11:24:04,447 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9386, 2.7408, 2.7440, 2.9390, 2.7798, 2.8599, 3.0079, 2.5279], + device='cuda:2'), covar=tensor([0.0687, 0.1251, 0.0655, 0.0650, 0.1082, 0.0669, 0.0656, 0.0772], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0278, 0.0201, 0.0194, 0.0186, 0.0155, 0.0289, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 11:24:17,283 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=124928.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:24:35,576 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9772, 2.0327, 2.2200, 1.4064, 1.6142, 1.9713, 1.3524, 2.0054], + device='cuda:2'), covar=tensor([0.1219, 0.1570, 0.1023, 0.2325, 0.2387, 0.1035, 0.3250, 0.1005], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0101, 0.0095, 0.0100, 0.0115, 0.0091, 0.0118, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 11:24:38,114 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4019, 2.5707, 4.2666, 4.5037, 4.3512, 2.4780, 4.5161, 3.3044], + device='cuda:2'), covar=tensor([0.0406, 0.1211, 0.0996, 0.0530, 0.0451, 0.2115, 0.0405, 0.1005], + device='cuda:2'), in_proj_covar=tensor([0.0293, 0.0257, 0.0373, 0.0329, 0.0270, 0.0306, 0.0312, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 11:24:59,255 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=124976.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:25:13,842 INFO [train.py:873] (2/4) Epoch 17, batch 4000, loss[loss=0.1134, simple_loss=0.1451, pruned_loss=0.04086, over 14202.00 frames. ], tot_loss[loss=0.1065, simple_loss=0.1428, pruned_loss=0.03514, over 2036295.88 frames. ], batch size: 89, lr: 4.62e-03, grad_scale: 8.0 +2022-12-08 11:25:14,808 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=124994.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:25:27,904 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=125004.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 11:25:36,307 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.370e+02 2.040e+02 2.496e+02 2.950e+02 4.930e+02, threshold=4.992e+02, percent-clipped=0.0 +2022-12-08 11:25:40,941 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=125019.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:26:02,897 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.21 vs. limit=5.0 +2022-12-08 11:26:08,853 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9374, 2.7553, 2.7664, 2.9570, 2.8329, 2.8927, 3.0104, 2.5813], + device='cuda:2'), covar=tensor([0.0676, 0.1169, 0.0638, 0.0582, 0.0806, 0.0543, 0.0695, 0.0665], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0278, 0.0201, 0.0194, 0.0186, 0.0156, 0.0289, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 11:26:09,626 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=125052.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 11:26:12,255 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6186, 4.6311, 4.8504, 4.3770, 4.6899, 5.0996, 1.8417, 4.2805], + device='cuda:2'), covar=tensor([0.0413, 0.0428, 0.0549, 0.0471, 0.0454, 0.0152, 0.3780, 0.0431], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0176, 0.0147, 0.0149, 0.0208, 0.0144, 0.0160, 0.0195], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 11:26:40,557 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=125088.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:26:44,719 INFO [train.py:873] (2/4) Epoch 17, batch 4100, loss[loss=0.131, simple_loss=0.1389, pruned_loss=0.06154, over 2611.00 frames. ], tot_loss[loss=0.1063, simple_loss=0.1427, pruned_loss=0.03499, over 2051182.74 frames. ], batch size: 100, lr: 4.62e-03, grad_scale: 4.0 +2022-12-08 11:26:51,289 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9085, 2.4863, 3.7336, 2.8469, 3.8097, 3.6607, 3.5208, 3.1309], + device='cuda:2'), covar=tensor([0.0854, 0.3030, 0.0986, 0.1749, 0.0785, 0.1069, 0.1686, 0.1637], + device='cuda:2'), in_proj_covar=tensor([0.0352, 0.0308, 0.0391, 0.0300, 0.0365, 0.0321, 0.0361, 0.0297], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:27:03,766 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.387e+02 2.146e+02 2.604e+02 3.171e+02 6.787e+02, threshold=5.207e+02, percent-clipped=4.0 +2022-12-08 11:27:15,914 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=125128.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:27:22,530 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=125136.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:27:57,306 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=125176.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:28:01,409 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-12-08 11:28:08,770 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=125189.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:28:11,908 INFO [train.py:873] (2/4) Epoch 17, batch 4200, loss[loss=0.09395, simple_loss=0.13, pruned_loss=0.02897, over 3891.00 frames. ], tot_loss[loss=0.107, simple_loss=0.1429, pruned_loss=0.03552, over 2014122.09 frames. ], batch size: 100, lr: 4.62e-03, grad_scale: 4.0 +2022-12-08 11:28:31,909 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.327e+02 2.000e+02 2.432e+02 3.049e+02 5.744e+02, threshold=4.863e+02, percent-clipped=3.0 +2022-12-08 11:28:55,710 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4203, 1.3941, 3.5104, 1.6098, 3.2386, 3.4899, 2.5197, 3.7724], + device='cuda:2'), covar=tensor([0.0274, 0.3108, 0.0404, 0.2211, 0.0905, 0.0469, 0.0969, 0.0201], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0156, 0.0158, 0.0166, 0.0167, 0.0179, 0.0132, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:29:40,188 INFO [train.py:873] (2/4) Epoch 17, batch 4300, loss[loss=0.08982, simple_loss=0.1349, pruned_loss=0.02237, over 14550.00 frames. ], tot_loss[loss=0.1077, simple_loss=0.1435, pruned_loss=0.0359, over 2058930.67 frames. ], batch size: 43, lr: 4.62e-03, grad_scale: 4.0 +2022-12-08 11:29:41,228 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=125294.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:29:58,839 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.107e+02 2.137e+02 2.626e+02 3.159e+02 7.655e+02, threshold=5.251e+02, percent-clipped=3.0 +2022-12-08 11:30:02,806 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=125319.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:30:14,368 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6146, 2.5772, 1.9669, 2.6596, 2.4179, 2.5500, 2.3292, 2.1804], + device='cuda:2'), covar=tensor([0.1075, 0.1327, 0.2641, 0.0854, 0.1390, 0.0936, 0.1662, 0.2112], + device='cuda:2'), in_proj_covar=tensor([0.0285, 0.0292, 0.0261, 0.0287, 0.0325, 0.0302, 0.0255, 0.0243], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:30:23,030 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=125342.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:30:34,895 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=125356.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:30:44,403 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=125367.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:30:52,422 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=125376.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:31:07,133 INFO [train.py:873] (2/4) Epoch 17, batch 4400, loss[loss=0.1043, simple_loss=0.1179, pruned_loss=0.04538, over 2668.00 frames. ], tot_loss[loss=0.1072, simple_loss=0.1432, pruned_loss=0.03561, over 2062456.29 frames. ], batch size: 100, lr: 4.61e-03, grad_scale: 8.0 +2022-12-08 11:31:26,863 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.784e+01 2.190e+02 2.587e+02 3.207e+02 7.157e+02, threshold=5.175e+02, percent-clipped=2.0 +2022-12-08 11:31:28,760 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=125417.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 11:31:32,451 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=125421.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:31:46,376 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=125437.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:32:09,126 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.79 vs. limit=5.0 +2022-12-08 11:32:20,321 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=125476.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:32:25,348 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=125482.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:32:26,964 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=125484.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:32:35,102 INFO [train.py:873] (2/4) Epoch 17, batch 4500, loss[loss=0.1355, simple_loss=0.1387, pruned_loss=0.06615, over 2643.00 frames. ], tot_loss[loss=0.1064, simple_loss=0.1429, pruned_loss=0.03492, over 2110110.37 frames. ], batch size: 100, lr: 4.61e-03, grad_scale: 4.0 +2022-12-08 11:32:35,595 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-12-08 11:32:54,649 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.256e+02 2.002e+02 2.492e+02 3.086e+02 6.444e+02, threshold=4.984e+02, percent-clipped=2.0 +2022-12-08 11:33:01,478 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=125524.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:34:01,919 INFO [train.py:873] (2/4) Epoch 17, batch 4600, loss[loss=0.12, simple_loss=0.15, pruned_loss=0.04502, over 7803.00 frames. ], tot_loss[loss=0.1078, simple_loss=0.1432, pruned_loss=0.03622, over 1966662.22 frames. ], batch size: 100, lr: 4.61e-03, grad_scale: 4.0 +2022-12-08 11:34:11,358 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=125603.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:34:22,482 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.136e+02 2.137e+02 2.692e+02 3.129e+02 4.972e+02, threshold=5.384e+02, percent-clipped=0.0 +2022-12-08 11:34:52,694 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4434, 3.2043, 2.5195, 3.5183, 3.3574, 3.4324, 3.0819, 2.5606], + device='cuda:2'), covar=tensor([0.0804, 0.1255, 0.2975, 0.0676, 0.0972, 0.0929, 0.1221, 0.2760], + device='cuda:2'), in_proj_covar=tensor([0.0285, 0.0292, 0.0261, 0.0287, 0.0323, 0.0303, 0.0257, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:35:05,234 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=125664.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:35:23,925 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.26 vs. limit=5.0 +2022-12-08 11:35:30,463 INFO [train.py:873] (2/4) Epoch 17, batch 4700, loss[loss=0.1054, simple_loss=0.1435, pruned_loss=0.03367, over 14304.00 frames. ], tot_loss[loss=0.1079, simple_loss=0.1432, pruned_loss=0.03626, over 1931661.05 frames. ], batch size: 44, lr: 4.61e-03, grad_scale: 4.0 +2022-12-08 11:35:47,161 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=125712.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 11:35:50,739 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.171e+02 2.065e+02 2.672e+02 3.737e+02 1.296e+03, threshold=5.344e+02, percent-clipped=9.0 +2022-12-08 11:35:57,878 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=125725.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:36:04,515 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=125732.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:36:17,092 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9966, 1.8988, 1.9861, 1.8758, 1.8438, 1.6920, 1.2721, 1.2767], + device='cuda:2'), covar=tensor([0.0156, 0.0280, 0.0216, 0.0216, 0.0203, 0.0268, 0.0288, 0.0446], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0020, 0.0021, 0.0021, 0.0033, 0.0028, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 11:36:44,463 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=125777.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:36:50,812 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=125784.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:36:52,585 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=125786.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:36:58,264 INFO [train.py:873] (2/4) Epoch 17, batch 4800, loss[loss=0.09256, simple_loss=0.1286, pruned_loss=0.02825, over 14349.00 frames. ], tot_loss[loss=0.1081, simple_loss=0.1435, pruned_loss=0.03636, over 1950419.23 frames. ], batch size: 73, lr: 4.61e-03, grad_scale: 8.0 +2022-12-08 11:37:04,746 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9065, 3.0279, 3.1324, 2.9596, 3.0845, 2.9347, 1.4612, 2.8691], + device='cuda:2'), covar=tensor([0.0451, 0.0342, 0.0346, 0.0427, 0.0345, 0.0661, 0.2610, 0.0330], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0174, 0.0146, 0.0146, 0.0205, 0.0141, 0.0157, 0.0193], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 11:37:17,308 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8856, 1.8611, 1.6443, 1.9314, 1.7214, 1.8249, 1.8218, 1.6961], + device='cuda:2'), covar=tensor([0.1000, 0.0809, 0.1572, 0.0763, 0.1118, 0.0614, 0.1262, 0.1088], + device='cuda:2'), in_proj_covar=tensor([0.0285, 0.0291, 0.0261, 0.0287, 0.0322, 0.0303, 0.0256, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:37:19,644 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 1.900e+02 2.405e+02 3.076e+02 6.689e+02, threshold=4.810e+02, percent-clipped=2.0 +2022-12-08 11:37:32,564 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=125832.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:37:54,523 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.96 vs. limit=2.0 +2022-12-08 11:37:58,374 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8826, 4.9557, 5.3202, 4.5163, 5.1754, 5.4871, 2.1307, 4.8649], + device='cuda:2'), covar=tensor([0.0242, 0.0271, 0.0325, 0.0487, 0.0226, 0.0104, 0.2739, 0.0247], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0175, 0.0147, 0.0147, 0.0206, 0.0142, 0.0158, 0.0194], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 11:38:26,387 INFO [train.py:873] (2/4) Epoch 17, batch 4900, loss[loss=0.1349, simple_loss=0.1389, pruned_loss=0.06547, over 2561.00 frames. ], tot_loss[loss=0.1082, simple_loss=0.1436, pruned_loss=0.03641, over 1939627.32 frames. ], batch size: 100, lr: 4.60e-03, grad_scale: 4.0 +2022-12-08 11:38:47,417 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.413e+02 2.229e+02 2.578e+02 3.069e+02 1.052e+03, threshold=5.155e+02, percent-clipped=5.0 +2022-12-08 11:39:24,546 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=125959.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:39:40,110 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.95 vs. limit=5.0 +2022-12-08 11:39:46,496 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-12-08 11:39:54,287 INFO [train.py:873] (2/4) Epoch 17, batch 5000, loss[loss=0.09786, simple_loss=0.1435, pruned_loss=0.02611, over 14317.00 frames. ], tot_loss[loss=0.1082, simple_loss=0.1436, pruned_loss=0.0364, over 1955790.36 frames. ], batch size: 28, lr: 4.60e-03, grad_scale: 4.0 +2022-12-08 11:39:59,042 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=125998.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:40:11,508 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=126012.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 11:40:15,916 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.327e+02 2.233e+02 2.614e+02 3.180e+02 5.706e+02, threshold=5.227e+02, percent-clipped=5.0 +2022-12-08 11:40:28,954 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=126032.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:40:45,167 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8887, 2.1250, 2.0803, 2.0605, 2.0822, 1.8465, 1.6550, 1.3100], + device='cuda:2'), covar=tensor([0.0307, 0.0314, 0.0308, 0.0239, 0.0297, 0.0337, 0.0338, 0.0575], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0020, 0.0021, 0.0021, 0.0033, 0.0027, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 11:40:52,781 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=126059.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:40:53,516 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=126060.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:41:08,262 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=126077.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:41:11,123 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=126080.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:41:12,008 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=126081.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:41:22,064 INFO [train.py:873] (2/4) Epoch 17, batch 5100, loss[loss=0.1221, simple_loss=0.1509, pruned_loss=0.04663, over 14380.00 frames. ], tot_loss[loss=0.1081, simple_loss=0.1432, pruned_loss=0.03645, over 1977531.46 frames. ], batch size: 53, lr: 4.60e-03, grad_scale: 4.0 +2022-12-08 11:41:42,936 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.364e+02 2.125e+02 2.682e+02 3.243e+02 7.763e+02, threshold=5.364e+02, percent-clipped=5.0 +2022-12-08 11:41:50,069 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=126125.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:41:57,393 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1474, 3.0628, 2.8965, 3.2620, 2.7846, 2.8054, 3.2423, 3.1444], + device='cuda:2'), covar=tensor([0.0740, 0.1109, 0.1108, 0.0723, 0.1429, 0.0906, 0.0784, 0.0858], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0148, 0.0151, 0.0165, 0.0152, 0.0126, 0.0173, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 11:42:10,224 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0353, 3.7851, 3.7885, 4.0658, 3.6558, 3.4665, 4.1248, 3.9097], + device='cuda:2'), covar=tensor([0.0623, 0.0951, 0.0843, 0.0621, 0.0926, 0.0810, 0.0587, 0.0762], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0148, 0.0151, 0.0165, 0.0152, 0.0125, 0.0173, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 11:42:23,475 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2704, 3.7582, 2.8309, 4.5300, 4.1715, 4.3536, 3.7241, 3.1057], + device='cuda:2'), covar=tensor([0.0823, 0.1232, 0.3315, 0.0533, 0.0919, 0.0792, 0.1205, 0.2685], + device='cuda:2'), in_proj_covar=tensor([0.0284, 0.0290, 0.0261, 0.0288, 0.0323, 0.0303, 0.0255, 0.0244], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:42:49,534 INFO [train.py:873] (2/4) Epoch 17, batch 5200, loss[loss=0.1342, simple_loss=0.1308, pruned_loss=0.06877, over 2595.00 frames. ], tot_loss[loss=0.1088, simple_loss=0.1437, pruned_loss=0.03699, over 1965128.68 frames. ], batch size: 100, lr: 4.60e-03, grad_scale: 8.0 +2022-12-08 11:43:10,801 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.250e+02 2.118e+02 2.718e+02 3.349e+02 1.527e+03, threshold=5.437e+02, percent-clipped=4.0 +2022-12-08 11:43:16,182 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.79 vs. limit=2.0 +2022-12-08 11:43:48,207 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=126259.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:44:17,889 INFO [train.py:873] (2/4) Epoch 17, batch 5300, loss[loss=0.09983, simple_loss=0.1448, pruned_loss=0.02744, over 13966.00 frames. ], tot_loss[loss=0.1076, simple_loss=0.1435, pruned_loss=0.03585, over 2019856.06 frames. ], batch size: 26, lr: 4.60e-03, grad_scale: 8.0 +2022-12-08 11:44:30,682 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=126307.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:44:39,316 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.051e+02 2.097e+02 2.584e+02 3.197e+02 6.466e+02, threshold=5.169e+02, percent-clipped=5.0 +2022-12-08 11:45:03,514 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.75 vs. limit=2.0 +2022-12-08 11:45:11,257 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-12-08 11:45:12,377 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=126354.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:45:35,460 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=126380.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:45:36,244 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=126381.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:45:46,524 INFO [train.py:873] (2/4) Epoch 17, batch 5400, loss[loss=0.1501, simple_loss=0.1462, pruned_loss=0.077, over 1251.00 frames. ], tot_loss[loss=0.1072, simple_loss=0.143, pruned_loss=0.03573, over 1933409.08 frames. ], batch size: 100, lr: 4.60e-03, grad_scale: 8.0 +2022-12-08 11:46:02,141 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4794, 2.7822, 4.3621, 3.3824, 4.3169, 4.1456, 4.1327, 3.7203], + device='cuda:2'), covar=tensor([0.0668, 0.2893, 0.0813, 0.1488, 0.0690, 0.0886, 0.1317, 0.1653], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0310, 0.0395, 0.0299, 0.0365, 0.0324, 0.0362, 0.0297], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:46:08,053 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.423e+02 2.193e+02 2.737e+02 3.423e+02 8.531e+02, threshold=5.474e+02, percent-clipped=2.0 +2022-12-08 11:46:19,126 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=126429.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:46:29,866 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=126441.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:46:37,539 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=126450.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:46:48,729 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0200, 3.2284, 3.1098, 3.2625, 2.5369, 3.4096, 3.0562, 1.8811], + device='cuda:2'), covar=tensor([0.1275, 0.0786, 0.0967, 0.0549, 0.0925, 0.0473, 0.1100, 0.1759], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0089, 0.0070, 0.0076, 0.0100, 0.0089, 0.0102, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 11:46:56,676 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.11 vs. limit=2.0 +2022-12-08 11:47:14,891 INFO [train.py:873] (2/4) Epoch 17, batch 5500, loss[loss=0.1109, simple_loss=0.146, pruned_loss=0.03786, over 14393.00 frames. ], tot_loss[loss=0.1057, simple_loss=0.142, pruned_loss=0.03466, over 1974062.09 frames. ], batch size: 44, lr: 4.59e-03, grad_scale: 8.0 +2022-12-08 11:47:30,640 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-08 11:47:31,189 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=126511.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:47:36,284 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.274e+02 2.086e+02 2.565e+02 2.972e+02 5.597e+02, threshold=5.130e+02, percent-clipped=1.0 +2022-12-08 11:47:53,979 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9788, 1.9896, 4.0767, 2.8162, 3.8904, 1.9459, 3.0878, 3.8685], + device='cuda:2'), covar=tensor([0.0722, 0.3899, 0.0449, 0.4615, 0.0721, 0.3263, 0.1344, 0.0548], + device='cuda:2'), in_proj_covar=tensor([0.0257, 0.0204, 0.0223, 0.0275, 0.0241, 0.0207, 0.0206, 0.0223], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 11:48:42,923 INFO [train.py:873] (2/4) Epoch 17, batch 5600, loss[loss=0.1125, simple_loss=0.1462, pruned_loss=0.03936, over 14516.00 frames. ], tot_loss[loss=0.1071, simple_loss=0.1427, pruned_loss=0.03575, over 1950189.45 frames. ], batch size: 49, lr: 4.59e-03, grad_scale: 8.0 +2022-12-08 11:49:04,888 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.254e+02 1.973e+02 2.464e+02 3.253e+02 5.344e+02, threshold=4.927e+02, percent-clipped=2.0 +2022-12-08 11:49:36,665 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=126654.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:50:10,195 INFO [train.py:873] (2/4) Epoch 17, batch 5700, loss[loss=0.1261, simple_loss=0.1378, pruned_loss=0.05718, over 3896.00 frames. ], tot_loss[loss=0.107, simple_loss=0.1427, pruned_loss=0.03562, over 1895583.48 frames. ], batch size: 100, lr: 4.59e-03, grad_scale: 8.0 +2022-12-08 11:50:14,960 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9156, 1.7719, 3.0684, 2.2196, 2.9690, 1.8129, 2.4194, 2.8910], + device='cuda:2'), covar=tensor([0.1055, 0.4088, 0.0688, 0.4216, 0.1108, 0.3237, 0.1378, 0.0699], + device='cuda:2'), in_proj_covar=tensor([0.0255, 0.0202, 0.0223, 0.0274, 0.0241, 0.0206, 0.0205, 0.0222], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 11:50:18,646 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=126702.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:50:22,263 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8098, 1.6374, 2.0958, 1.6359, 1.9285, 1.4727, 1.6806, 1.9343], + device='cuda:2'), covar=tensor([0.3120, 0.2160, 0.0623, 0.1879, 0.1356, 0.1058, 0.1261, 0.0782], + device='cuda:2'), in_proj_covar=tensor([0.0255, 0.0202, 0.0223, 0.0274, 0.0241, 0.0206, 0.0205, 0.0223], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 11:50:32,277 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.662e+01 2.224e+02 2.546e+02 3.152e+02 6.937e+02, threshold=5.092e+02, percent-clipped=3.0 +2022-12-08 11:50:48,010 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=126736.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:51:15,766 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.74 vs. limit=5.0 +2022-12-08 11:51:36,148 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7065, 1.7193, 1.7859, 1.6898, 1.6777, 1.6161, 1.5821, 1.2512], + device='cuda:2'), covar=tensor([0.0209, 0.0290, 0.0208, 0.0182, 0.0219, 0.0312, 0.0266, 0.0427], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0020, 0.0021, 0.0021, 0.0033, 0.0027, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 11:51:37,726 INFO [train.py:873] (2/4) Epoch 17, batch 5800, loss[loss=0.09538, simple_loss=0.1378, pruned_loss=0.02649, over 14032.00 frames. ], tot_loss[loss=0.1084, simple_loss=0.1434, pruned_loss=0.03671, over 1828975.16 frames. ], batch size: 26, lr: 4.59e-03, grad_scale: 4.0 +2022-12-08 11:51:44,452 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9889, 2.9307, 2.2730, 3.0736, 2.8935, 2.9487, 2.6607, 2.3364], + device='cuda:2'), covar=tensor([0.0880, 0.1157, 0.2764, 0.0760, 0.1135, 0.1165, 0.1252, 0.2435], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0287, 0.0259, 0.0286, 0.0321, 0.0300, 0.0252, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:51:49,400 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=126806.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:51:58,007 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=126816.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:52:01,107 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.070e+02 2.153e+02 2.501e+02 2.856e+02 4.579e+02, threshold=5.001e+02, percent-clipped=0.0 +2022-12-08 11:52:06,928 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4639, 2.2208, 2.8412, 1.6959, 1.9031, 2.5002, 1.3943, 2.5027], + device='cuda:2'), covar=tensor([0.0877, 0.1297, 0.0631, 0.2248, 0.2041, 0.1072, 0.3113, 0.0852], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0102, 0.0096, 0.0101, 0.0116, 0.0091, 0.0118, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 11:52:11,827 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1724, 1.4492, 1.6940, 1.6529, 1.5653, 1.6238, 1.3863, 1.2944], + device='cuda:2'), covar=tensor([0.1562, 0.1401, 0.0508, 0.0609, 0.1524, 0.1147, 0.1780, 0.1619], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0089, 0.0069, 0.0075, 0.0099, 0.0089, 0.0101, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 11:52:45,947 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8457, 1.3958, 2.7965, 2.4762, 2.6864, 2.7896, 1.9207, 2.8041], + device='cuda:2'), covar=tensor([0.1283, 0.1616, 0.0206, 0.0487, 0.0448, 0.0240, 0.0725, 0.0220], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0158, 0.0130, 0.0168, 0.0147, 0.0143, 0.0125, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 11:52:51,097 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=126877.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:53:03,712 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.32 vs. limit=2.0 +2022-12-08 11:53:04,763 INFO [train.py:873] (2/4) Epoch 17, batch 5900, loss[loss=0.09727, simple_loss=0.1408, pruned_loss=0.02686, over 14392.00 frames. ], tot_loss[loss=0.1069, simple_loss=0.1427, pruned_loss=0.03551, over 1930659.30 frames. ], batch size: 41, lr: 4.59e-03, grad_scale: 2.0 +2022-12-08 11:53:29,087 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.268e+02 2.272e+02 2.688e+02 3.245e+02 5.334e+02, threshold=5.377e+02, percent-clipped=3.0 +2022-12-08 11:53:38,152 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8075, 1.1259, 1.2823, 1.2635, 0.9521, 1.3211, 1.1065, 0.8558], + device='cuda:2'), covar=tensor([0.1901, 0.1092, 0.0468, 0.0435, 0.2099, 0.0999, 0.1852, 0.1541], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0089, 0.0069, 0.0075, 0.0099, 0.0089, 0.0101, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 11:54:32,808 INFO [train.py:873] (2/4) Epoch 17, batch 6000, loss[loss=0.09635, simple_loss=0.1432, pruned_loss=0.02473, over 14585.00 frames. ], tot_loss[loss=0.107, simple_loss=0.1426, pruned_loss=0.0357, over 1926745.57 frames. ], batch size: 22, lr: 4.58e-03, grad_scale: 4.0 +2022-12-08 11:54:32,809 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 11:54:41,592 INFO [train.py:905] (2/4) Epoch 17, validation: loss=0.1381, simple_loss=0.176, pruned_loss=0.05009, over 857387.00 frames. +2022-12-08 11:54:41,593 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 11:54:50,267 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-12-08 11:55:00,315 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.83 vs. limit=5.0 +2022-12-08 11:55:05,926 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.176e+02 1.999e+02 2.582e+02 3.270e+02 6.669e+02, threshold=5.164e+02, percent-clipped=4.0 +2022-12-08 11:55:14,954 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.25 vs. limit=5.0 +2022-12-08 11:55:20,215 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=127036.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:55:29,016 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2633, 2.9638, 3.7438, 2.6147, 2.3842, 3.2442, 1.8321, 3.2629], + device='cuda:2'), covar=tensor([0.0838, 0.0995, 0.0567, 0.1531, 0.1873, 0.0836, 0.2812, 0.1005], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0102, 0.0096, 0.0101, 0.0116, 0.0091, 0.0118, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 11:55:41,171 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6042, 1.3732, 3.6005, 1.5170, 3.4834, 3.6928, 2.6395, 3.9284], + device='cuda:2'), covar=tensor([0.0232, 0.3350, 0.0389, 0.2505, 0.0649, 0.0424, 0.0922, 0.0187], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0154, 0.0157, 0.0166, 0.0165, 0.0177, 0.0131, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:55:51,417 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7519, 2.7048, 2.1284, 2.8252, 2.5533, 2.6984, 2.3838, 2.2365], + device='cuda:2'), covar=tensor([0.1066, 0.1035, 0.2710, 0.0851, 0.1140, 0.0902, 0.1513, 0.1900], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0286, 0.0259, 0.0284, 0.0322, 0.0300, 0.0251, 0.0240], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 11:56:01,543 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=127084.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:56:09,390 INFO [train.py:873] (2/4) Epoch 17, batch 6100, loss[loss=0.1082, simple_loss=0.1148, pruned_loss=0.05082, over 1344.00 frames. ], tot_loss[loss=0.1063, simple_loss=0.1424, pruned_loss=0.03514, over 1927924.03 frames. ], batch size: 100, lr: 4.58e-03, grad_scale: 4.0 +2022-12-08 11:56:21,142 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=127106.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:56:33,264 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.291e+02 2.062e+02 2.398e+02 3.031e+02 5.398e+02, threshold=4.796e+02, percent-clipped=2.0 +2022-12-08 11:56:39,401 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2093, 1.4813, 1.7086, 1.7113, 1.5664, 1.6229, 1.3973, 1.3052], + device='cuda:2'), covar=tensor([0.1343, 0.1025, 0.0560, 0.0553, 0.1230, 0.1004, 0.1628, 0.1417], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0088, 0.0069, 0.0075, 0.0098, 0.0089, 0.0100, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 11:57:02,819 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=127154.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:57:07,807 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-08 11:57:10,899 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6755, 3.3477, 2.6881, 3.7711, 3.6182, 3.6351, 3.0833, 2.6625], + device='cuda:2'), covar=tensor([0.0759, 0.1241, 0.2970, 0.0673, 0.1003, 0.1245, 0.1381, 0.3093], + device='cuda:2'), in_proj_covar=tensor([0.0279, 0.0284, 0.0258, 0.0283, 0.0320, 0.0299, 0.0250, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 11:57:18,833 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=127172.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:57:37,194 INFO [train.py:873] (2/4) Epoch 17, batch 6200, loss[loss=0.08307, simple_loss=0.126, pruned_loss=0.02009, over 13886.00 frames. ], tot_loss[loss=0.1059, simple_loss=0.1419, pruned_loss=0.03495, over 1882227.56 frames. ], batch size: 19, lr: 4.58e-03, grad_scale: 4.0 +2022-12-08 11:57:40,318 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0550, 1.8754, 4.6856, 4.3458, 4.2328, 4.8281, 4.5765, 4.8056], + device='cuda:2'), covar=tensor([0.1636, 0.1718, 0.0117, 0.0227, 0.0249, 0.0149, 0.0158, 0.0142], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0159, 0.0130, 0.0169, 0.0147, 0.0143, 0.0126, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 11:58:01,695 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.348e+02 2.117e+02 2.626e+02 3.289e+02 8.585e+02, threshold=5.253e+02, percent-clipped=4.0 +2022-12-08 11:58:27,577 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=127249.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:59:05,809 INFO [train.py:873] (2/4) Epoch 17, batch 6300, loss[loss=0.1121, simple_loss=0.1443, pruned_loss=0.03993, over 14250.00 frames. ], tot_loss[loss=0.1067, simple_loss=0.1425, pruned_loss=0.03547, over 1932661.04 frames. ], batch size: 80, lr: 4.58e-03, grad_scale: 4.0 +2022-12-08 11:59:06,852 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9809, 2.0484, 3.9668, 2.7675, 3.8794, 1.8489, 2.9334, 3.8976], + device='cuda:2'), covar=tensor([0.0707, 0.3984, 0.0575, 0.5047, 0.0750, 0.3488, 0.1387, 0.0482], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0201, 0.0220, 0.0271, 0.0238, 0.0204, 0.0202, 0.0222], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 11:59:21,206 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=127310.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 11:59:29,648 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.358e+02 2.074e+02 2.479e+02 3.103e+02 6.398e+02, threshold=4.958e+02, percent-clipped=2.0 +2022-12-08 12:00:01,155 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0369, 2.1044, 1.9927, 2.1709, 2.0159, 1.5213, 1.8470, 2.1110], + device='cuda:2'), covar=tensor([0.0874, 0.0587, 0.0774, 0.0737, 0.0781, 0.0743, 0.0695, 0.0578], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0035, 0.0039, 0.0033, 0.0034, 0.0048, 0.0036, 0.0038], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 12:00:10,927 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.70 vs. limit=2.0 +2022-12-08 12:00:33,559 INFO [train.py:873] (2/4) Epoch 17, batch 6400, loss[loss=0.1088, simple_loss=0.1462, pruned_loss=0.03563, over 14178.00 frames. ], tot_loss[loss=0.1063, simple_loss=0.1424, pruned_loss=0.03508, over 1997541.35 frames. ], batch size: 84, lr: 4.58e-03, grad_scale: 8.0 +2022-12-08 12:00:58,034 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.677e+01 2.186e+02 2.674e+02 3.273e+02 5.591e+02, threshold=5.347e+02, percent-clipped=3.0 +2022-12-08 12:01:01,726 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9427, 3.1951, 3.1478, 3.1736, 2.4376, 3.2950, 3.1242, 1.7622], + device='cuda:2'), covar=tensor([0.1050, 0.0961, 0.1075, 0.0774, 0.0912, 0.0429, 0.0717, 0.1898], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0089, 0.0070, 0.0075, 0.0098, 0.0089, 0.0100, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 12:01:04,631 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=127427.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 12:01:43,648 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=127472.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:01:43,732 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9047, 3.5967, 2.8210, 4.2003, 4.0108, 4.0730, 3.4679, 2.9234], + device='cuda:2'), covar=tensor([0.0846, 0.1220, 0.3300, 0.0573, 0.0981, 0.1114, 0.1326, 0.3189], + device='cuda:2'), in_proj_covar=tensor([0.0283, 0.0287, 0.0261, 0.0286, 0.0323, 0.0302, 0.0252, 0.0243], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:01:47,159 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6225, 1.7129, 2.8089, 2.1370, 2.7228, 1.7524, 2.3592, 2.6257], + device='cuda:2'), covar=tensor([0.1397, 0.4040, 0.0939, 0.2978, 0.1132, 0.3197, 0.1151, 0.1179], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0203, 0.0223, 0.0275, 0.0241, 0.0207, 0.0205, 0.0225], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 12:01:57,831 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=127488.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 12:01:58,802 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3225, 2.4838, 4.2313, 4.4215, 4.2620, 2.4158, 4.3849, 3.4247], + device='cuda:2'), covar=tensor([0.0424, 0.1249, 0.0998, 0.0430, 0.0454, 0.1956, 0.0442, 0.0845], + device='cuda:2'), in_proj_covar=tensor([0.0296, 0.0260, 0.0377, 0.0332, 0.0271, 0.0308, 0.0313, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 12:02:01,915 INFO [train.py:873] (2/4) Epoch 17, batch 6500, loss[loss=0.1478, simple_loss=0.1409, pruned_loss=0.07733, over 1190.00 frames. ], tot_loss[loss=0.1065, simple_loss=0.1425, pruned_loss=0.03529, over 1948121.15 frames. ], batch size: 100, lr: 4.58e-03, grad_scale: 8.0 +2022-12-08 12:02:25,883 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.047e+02 2.139e+02 2.589e+02 3.452e+02 6.629e+02, threshold=5.179e+02, percent-clipped=2.0 +2022-12-08 12:02:25,983 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=127520.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:02:34,197 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=127529.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:03:27,638 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=127590.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:03:29,963 INFO [train.py:873] (2/4) Epoch 17, batch 6600, loss[loss=0.131, simple_loss=0.1621, pruned_loss=0.04994, over 14195.00 frames. ], tot_loss[loss=0.1066, simple_loss=0.1426, pruned_loss=0.03533, over 1985459.92 frames. ], batch size: 46, lr: 4.57e-03, grad_scale: 8.0 +2022-12-08 12:03:37,320 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4809, 2.0531, 2.4353, 2.5609, 2.3736, 1.9851, 2.5997, 2.1926], + device='cuda:2'), covar=tensor([0.0542, 0.1070, 0.0636, 0.0546, 0.0740, 0.1466, 0.0571, 0.0933], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0259, 0.0375, 0.0330, 0.0270, 0.0306, 0.0310, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 12:03:40,657 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=127605.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:03:54,226 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.226e+02 2.073e+02 2.578e+02 3.202e+02 5.753e+02, threshold=5.156e+02, percent-clipped=2.0 +2022-12-08 12:04:19,909 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9952, 2.6158, 2.8255, 1.8397, 2.4159, 2.7525, 2.9226, 2.4807], + device='cuda:2'), covar=tensor([0.0682, 0.0809, 0.0803, 0.1331, 0.1011, 0.0692, 0.0702, 0.1183], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0171, 0.0141, 0.0126, 0.0144, 0.0156, 0.0136, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 12:04:28,896 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4062, 5.2079, 4.8030, 5.0581, 5.0204, 5.3754, 5.3923, 5.3984], + device='cuda:2'), covar=tensor([0.0730, 0.0351, 0.1876, 0.2419, 0.0621, 0.0606, 0.0773, 0.0703], + device='cuda:2'), in_proj_covar=tensor([0.0391, 0.0272, 0.0451, 0.0568, 0.0350, 0.0450, 0.0390, 0.0399], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:04:30,657 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=127662.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:04:57,441 INFO [train.py:873] (2/4) Epoch 17, batch 6700, loss[loss=0.1576, simple_loss=0.1464, pruned_loss=0.08444, over 1230.00 frames. ], tot_loss[loss=0.1065, simple_loss=0.1425, pruned_loss=0.03522, over 1964911.17 frames. ], batch size: 100, lr: 4.57e-03, grad_scale: 4.0 +2022-12-08 12:05:21,963 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.328e+02 2.055e+02 2.568e+02 3.169e+02 6.326e+02, threshold=5.135e+02, percent-clipped=1.0 +2022-12-08 12:05:23,964 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=127723.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:06:12,257 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3825, 2.4205, 2.5693, 2.5152, 2.4603, 2.1639, 1.4902, 2.2309], + device='cuda:2'), covar=tensor([0.0676, 0.0563, 0.0429, 0.0382, 0.0436, 0.1323, 0.2619, 0.0478], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0177, 0.0148, 0.0149, 0.0209, 0.0144, 0.0159, 0.0196], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 12:06:13,111 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=127779.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:06:16,524 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=127783.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 12:06:25,292 INFO [train.py:873] (2/4) Epoch 17, batch 6800, loss[loss=0.1229, simple_loss=0.1545, pruned_loss=0.04568, over 14593.00 frames. ], tot_loss[loss=0.1076, simple_loss=0.1428, pruned_loss=0.03621, over 1893398.29 frames. ], batch size: 23, lr: 4.57e-03, grad_scale: 8.0 +2022-12-08 12:06:50,326 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.244e+02 2.046e+02 2.510e+02 3.119e+02 5.930e+02, threshold=5.019e+02, percent-clipped=2.0 +2022-12-08 12:07:07,196 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=127840.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:07:16,085 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=127850.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:07:21,212 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1265, 2.1624, 2.9802, 2.3009, 3.0307, 2.9686, 2.9066, 2.5246], + device='cuda:2'), covar=tensor([0.1005, 0.3035, 0.1321, 0.2060, 0.0826, 0.1184, 0.1271, 0.2082], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0312, 0.0393, 0.0300, 0.0366, 0.0324, 0.0362, 0.0299], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:07:29,035 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0513, 2.0226, 2.0950, 2.0905, 1.9980, 1.6747, 1.2781, 1.8491], + device='cuda:2'), covar=tensor([0.0714, 0.0661, 0.0493, 0.0433, 0.0532, 0.1542, 0.2467, 0.0529], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0177, 0.0148, 0.0149, 0.0208, 0.0144, 0.0158, 0.0196], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 12:07:34,814 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.59 vs. limit=5.0 +2022-12-08 12:07:35,232 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=127872.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:07:46,423 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=127885.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:07:53,423 INFO [train.py:873] (2/4) Epoch 17, batch 6900, loss[loss=0.1192, simple_loss=0.1591, pruned_loss=0.03968, over 14202.00 frames. ], tot_loss[loss=0.1076, simple_loss=0.1429, pruned_loss=0.03613, over 1914799.40 frames. ], batch size: 60, lr: 4.57e-03, grad_scale: 8.0 +2022-12-08 12:08:03,984 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=127905.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:08:09,291 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=127911.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:08:17,922 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.924e+01 2.068e+02 2.418e+02 3.139e+02 1.064e+03, threshold=4.836e+02, percent-clipped=7.0 +2022-12-08 12:08:28,444 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=127933.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:08:34,707 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.10 vs. limit=2.0 +2022-12-08 12:08:46,263 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=127953.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:08:46,346 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7934, 1.3316, 1.7124, 1.2492, 1.5059, 1.8487, 1.5102, 1.5517], + device='cuda:2'), covar=tensor([0.0873, 0.0876, 0.0766, 0.0933, 0.1630, 0.0870, 0.0839, 0.1706], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0170, 0.0139, 0.0125, 0.0143, 0.0155, 0.0135, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 12:09:20,969 INFO [train.py:873] (2/4) Epoch 17, batch 7000, loss[loss=0.1095, simple_loss=0.153, pruned_loss=0.03306, over 14355.00 frames. ], tot_loss[loss=0.108, simple_loss=0.1432, pruned_loss=0.03644, over 1893419.07 frames. ], batch size: 73, lr: 4.57e-03, grad_scale: 8.0 +2022-12-08 12:09:25,450 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8349, 1.5297, 2.0715, 1.6444, 1.9114, 1.4068, 1.6538, 1.9148], + device='cuda:2'), covar=tensor([0.2888, 0.2738, 0.0548, 0.1667, 0.1214, 0.1440, 0.1279, 0.0990], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0200, 0.0221, 0.0269, 0.0238, 0.0202, 0.0203, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 12:09:26,737 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5151, 2.0832, 2.4853, 2.5819, 2.3675, 2.0378, 2.5238, 2.2735], + device='cuda:2'), covar=tensor([0.0477, 0.1168, 0.0590, 0.0460, 0.0628, 0.1307, 0.0517, 0.0755], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0261, 0.0377, 0.0333, 0.0271, 0.0307, 0.0313, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 12:09:43,999 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=128018.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:09:47,264 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.330e+02 2.259e+02 2.605e+02 3.114e+02 4.629e+02, threshold=5.210e+02, percent-clipped=0.0 +2022-12-08 12:10:41,148 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=128083.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 12:10:49,318 INFO [train.py:873] (2/4) Epoch 17, batch 7100, loss[loss=0.1294, simple_loss=0.1476, pruned_loss=0.05566, over 3852.00 frames. ], tot_loss[loss=0.1091, simple_loss=0.1438, pruned_loss=0.03718, over 1859679.20 frames. ], batch size: 100, lr: 4.56e-03, grad_scale: 4.0 +2022-12-08 12:11:01,725 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9810, 2.4105, 3.8056, 2.6731, 3.8376, 3.6639, 3.6170, 3.2048], + device='cuda:2'), covar=tensor([0.0931, 0.3415, 0.1173, 0.2173, 0.0880, 0.1255, 0.1674, 0.2166], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0311, 0.0394, 0.0299, 0.0365, 0.0323, 0.0361, 0.0299], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:11:14,254 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.191e+02 2.219e+02 2.800e+02 3.533e+02 6.463e+02, threshold=5.601e+02, percent-clipped=4.0 +2022-12-08 12:11:19,814 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1409, 3.9630, 3.8330, 4.1380, 3.7759, 3.6879, 4.2156, 3.9972], + device='cuda:2'), covar=tensor([0.0641, 0.0848, 0.0879, 0.0578, 0.0871, 0.0688, 0.0616, 0.0834], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0145, 0.0148, 0.0164, 0.0149, 0.0124, 0.0170, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 12:11:19,850 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7627, 3.5140, 3.3384, 3.4744, 3.6907, 3.7127, 3.7458, 3.7262], + device='cuda:2'), covar=tensor([0.0809, 0.0641, 0.1844, 0.2175, 0.0724, 0.0845, 0.0916, 0.0820], + device='cuda:2'), in_proj_covar=tensor([0.0392, 0.0275, 0.0452, 0.0568, 0.0353, 0.0452, 0.0392, 0.0399], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:11:22,288 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=128131.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 12:11:25,692 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=128135.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:11:51,014 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1266, 1.9977, 1.8250, 1.9105, 2.0691, 2.1115, 2.1106, 2.0767], + device='cuda:2'), covar=tensor([0.1345, 0.0984, 0.2666, 0.2713, 0.1369, 0.1222, 0.1528, 0.1184], + device='cuda:2'), in_proj_covar=tensor([0.0390, 0.0273, 0.0448, 0.0564, 0.0350, 0.0449, 0.0389, 0.0396], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:12:09,581 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=128185.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:12:16,541 INFO [train.py:873] (2/4) Epoch 17, batch 7200, loss[loss=0.1562, simple_loss=0.1451, pruned_loss=0.08364, over 2576.00 frames. ], tot_loss[loss=0.1086, simple_loss=0.1436, pruned_loss=0.03679, over 1875886.43 frames. ], batch size: 100, lr: 4.56e-03, grad_scale: 8.0 +2022-12-08 12:12:28,375 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=128206.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:12:42,993 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.381e+02 2.064e+02 2.461e+02 3.111e+02 6.198e+02, threshold=4.923e+02, percent-clipped=2.0 +2022-12-08 12:12:47,591 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=128228.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:12:52,155 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=128233.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:13:44,322 INFO [train.py:873] (2/4) Epoch 17, batch 7300, loss[loss=0.1106, simple_loss=0.1437, pruned_loss=0.03874, over 14296.00 frames. ], tot_loss[loss=0.1069, simple_loss=0.1425, pruned_loss=0.03571, over 1917205.95 frames. ], batch size: 69, lr: 4.56e-03, grad_scale: 4.0 +2022-12-08 12:14:06,065 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=128318.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:14:08,060 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=128320.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:14:10,504 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.359e+02 2.143e+02 2.591e+02 3.352e+02 1.032e+03, threshold=5.183e+02, percent-clipped=4.0 +2022-12-08 12:14:47,729 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=128366.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:15:00,906 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=128381.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:15:11,327 INFO [train.py:873] (2/4) Epoch 17, batch 7400, loss[loss=0.09769, simple_loss=0.1257, pruned_loss=0.03483, over 4986.00 frames. ], tot_loss[loss=0.1073, simple_loss=0.1425, pruned_loss=0.03599, over 1902224.68 frames. ], batch size: 100, lr: 4.56e-03, grad_scale: 4.0 +2022-12-08 12:15:29,208 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=128413.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:15:33,842 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.63 vs. limit=2.0 +2022-12-08 12:15:36,100 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4580, 1.1486, 2.0318, 1.7498, 1.8493, 2.0334, 1.3823, 2.0561], + device='cuda:2'), covar=tensor([0.0870, 0.1482, 0.0292, 0.0637, 0.0682, 0.0385, 0.0818, 0.0335], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0159, 0.0131, 0.0169, 0.0149, 0.0144, 0.0127, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 12:15:37,701 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.490e+02 2.104e+02 2.629e+02 3.153e+02 1.005e+03, threshold=5.259e+02, percent-clipped=2.0 +2022-12-08 12:15:48,068 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=128435.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:15:52,525 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9339, 3.0043, 3.1479, 2.9752, 3.0555, 2.8740, 1.5490, 2.8449], + device='cuda:2'), covar=tensor([0.0470, 0.0432, 0.0355, 0.0419, 0.0348, 0.0820, 0.2678, 0.0333], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0174, 0.0146, 0.0147, 0.0206, 0.0142, 0.0156, 0.0192], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 12:16:07,112 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9295, 2.0612, 2.7900, 2.2865, 2.8868, 2.7923, 2.6878, 2.4409], + device='cuda:2'), covar=tensor([0.0912, 0.2965, 0.1309, 0.1816, 0.0769, 0.1153, 0.1134, 0.1602], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0311, 0.0393, 0.0300, 0.0366, 0.0323, 0.0362, 0.0298], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:16:22,010 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=128474.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:16:29,533 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=128483.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:16:38,247 INFO [train.py:873] (2/4) Epoch 17, batch 7500, loss[loss=0.1015, simple_loss=0.1373, pruned_loss=0.03292, over 6000.00 frames. ], tot_loss[loss=0.1069, simple_loss=0.1424, pruned_loss=0.03572, over 1927539.07 frames. ], batch size: 100, lr: 4.56e-03, grad_scale: 4.0 +2022-12-08 12:16:49,194 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=128506.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:16:51,967 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1119, 1.9598, 2.1511, 2.2408, 1.9009, 1.9081, 2.1549, 2.0466], + device='cuda:2'), covar=tensor([0.0438, 0.0899, 0.0412, 0.0420, 0.0655, 0.1080, 0.0545, 0.0550], + device='cuda:2'), in_proj_covar=tensor([0.0296, 0.0262, 0.0379, 0.0334, 0.0272, 0.0309, 0.0314, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 12:16:54,340 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.24 vs. limit=2.0 +2022-12-08 12:17:04,009 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.021e+02 2.073e+02 2.518e+02 3.139e+02 6.346e+02, threshold=5.035e+02, percent-clipped=2.0 +2022-12-08 12:17:08,347 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=128528.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:18:06,085 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=128554.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:18:06,903 INFO [train.py:873] (2/4) Epoch 18, batch 0, loss[loss=0.1178, simple_loss=0.1638, pruned_loss=0.0359, over 14268.00 frames. ], tot_loss[loss=0.1178, simple_loss=0.1638, pruned_loss=0.0359, over 14268.00 frames. ], batch size: 76, lr: 4.43e-03, grad_scale: 8.0 +2022-12-08 12:18:06,903 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 12:18:14,461 INFO [train.py:905] (2/4) Epoch 18, validation: loss=0.1457, simple_loss=0.1856, pruned_loss=0.05295, over 857387.00 frames. +2022-12-08 12:18:14,462 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 12:18:33,526 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=128576.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:18:54,876 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2092, 3.7887, 2.9708, 4.3776, 4.1688, 4.2866, 3.7706, 3.0869], + device='cuda:2'), covar=tensor([0.0735, 0.1188, 0.3205, 0.0557, 0.0861, 0.0976, 0.1073, 0.2787], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0287, 0.0258, 0.0286, 0.0321, 0.0302, 0.0251, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:19:16,502 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 5.261e+01 1.855e+02 2.639e+02 3.708e+02 1.096e+03, threshold=5.279e+02, percent-clipped=10.0 +2022-12-08 12:19:24,549 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9206, 2.7719, 2.7288, 2.9481, 2.7940, 2.8307, 3.0000, 2.5099], + device='cuda:2'), covar=tensor([0.0732, 0.1131, 0.0601, 0.0606, 0.0881, 0.0622, 0.0684, 0.0664], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0279, 0.0203, 0.0195, 0.0186, 0.0159, 0.0289, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 12:19:28,461 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.55 vs. limit=2.0 +2022-12-08 12:19:43,877 INFO [train.py:873] (2/4) Epoch 18, batch 100, loss[loss=0.1007, simple_loss=0.1374, pruned_loss=0.03197, over 14145.00 frames. ], tot_loss[loss=0.1057, simple_loss=0.143, pruned_loss=0.03422, over 924297.42 frames. ], batch size: 99, lr: 4.42e-03, grad_scale: 4.0 +2022-12-08 12:20:01,882 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=128676.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:20:39,605 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8954, 2.1724, 2.2007, 2.0341, 1.9575, 1.7520, 1.6514, 1.3654], + device='cuda:2'), covar=tensor([0.0285, 0.0279, 0.0216, 0.0395, 0.0278, 0.0312, 0.0356, 0.0530], + device='cuda:2'), in_proj_covar=tensor([0.0022, 0.0022, 0.0020, 0.0021, 0.0021, 0.0033, 0.0028, 0.0032], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 12:20:42,730 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.284e+02 2.050e+02 2.470e+02 3.144e+02 6.150e+02, threshold=4.940e+02, percent-clipped=2.0 +2022-12-08 12:21:02,406 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2686, 4.0329, 3.7319, 3.9252, 4.1025, 4.2074, 4.2257, 4.2553], + device='cuda:2'), covar=tensor([0.0830, 0.0496, 0.1969, 0.2497, 0.0756, 0.0839, 0.0854, 0.0748], + device='cuda:2'), in_proj_covar=tensor([0.0392, 0.0275, 0.0451, 0.0565, 0.0351, 0.0452, 0.0391, 0.0399], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:21:09,630 INFO [train.py:873] (2/4) Epoch 18, batch 200, loss[loss=0.1309, simple_loss=0.1312, pruned_loss=0.06526, over 2642.00 frames. ], tot_loss[loss=0.1049, simple_loss=0.1415, pruned_loss=0.03412, over 1318698.80 frames. ], batch size: 100, lr: 4.42e-03, grad_scale: 4.0 +2022-12-08 12:21:22,385 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=128769.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:21:32,334 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=128781.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:22:09,531 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.012e+02 1.948e+02 2.392e+02 2.813e+02 5.621e+02, threshold=4.784e+02, percent-clipped=1.0 +2022-12-08 12:22:25,612 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=128842.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:22:37,495 INFO [train.py:873] (2/4) Epoch 18, batch 300, loss[loss=0.1124, simple_loss=0.1414, pruned_loss=0.04172, over 10347.00 frames. ], tot_loss[loss=0.105, simple_loss=0.1416, pruned_loss=0.03421, over 1614759.73 frames. ], batch size: 100, lr: 4.42e-03, grad_scale: 4.0 +2022-12-08 12:22:59,018 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8970, 1.5615, 3.0791, 1.6834, 3.1585, 3.0066, 2.2154, 3.2409], + device='cuda:2'), covar=tensor([0.0310, 0.2812, 0.0418, 0.2003, 0.0393, 0.0516, 0.1144, 0.0263], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0157, 0.0160, 0.0168, 0.0169, 0.0180, 0.0133, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:23:08,222 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=128890.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 12:23:11,740 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2762, 2.6016, 4.0462, 3.0889, 4.0968, 3.8807, 3.8951, 3.5405], + device='cuda:2'), covar=tensor([0.0833, 0.3011, 0.0901, 0.1613, 0.0728, 0.0991, 0.1306, 0.1506], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0311, 0.0391, 0.0297, 0.0363, 0.0322, 0.0360, 0.0297], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:23:37,894 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.345e+02 2.277e+02 2.735e+02 3.513e+02 6.767e+02, threshold=5.469e+02, percent-clipped=7.0 +2022-12-08 12:24:02,454 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=128951.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 12:24:05,633 INFO [train.py:873] (2/4) Epoch 18, batch 400, loss[loss=0.1089, simple_loss=0.1399, pruned_loss=0.03897, over 14634.00 frames. ], tot_loss[loss=0.1059, simple_loss=0.142, pruned_loss=0.03486, over 1759318.07 frames. ], batch size: 33, lr: 4.42e-03, grad_scale: 8.0 +2022-12-08 12:24:24,394 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=128976.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:25:01,539 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.71 vs. limit=2.0 +2022-12-08 12:25:06,981 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.216e+02 2.168e+02 2.627e+02 3.251e+02 6.252e+02, threshold=5.254e+02, percent-clipped=2.0 +2022-12-08 12:25:07,086 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=129024.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:25:34,889 INFO [train.py:873] (2/4) Epoch 18, batch 500, loss[loss=0.1494, simple_loss=0.1452, pruned_loss=0.07681, over 1312.00 frames. ], tot_loss[loss=0.1057, simple_loss=0.1417, pruned_loss=0.03489, over 1894062.81 frames. ], batch size: 100, lr: 4.42e-03, grad_scale: 8.0 +2022-12-08 12:25:47,381 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=129069.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:26:26,480 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.4222, 5.1152, 4.8569, 4.9998, 5.0639, 5.3380, 5.3672, 5.3921], + device='cuda:2'), covar=tensor([0.0570, 0.0390, 0.1742, 0.2282, 0.0593, 0.0664, 0.0806, 0.0612], + device='cuda:2'), in_proj_covar=tensor([0.0393, 0.0277, 0.0454, 0.0571, 0.0351, 0.0456, 0.0393, 0.0402], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:26:29,827 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=129117.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:26:35,675 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.427e+02 2.085e+02 2.575e+02 3.232e+02 8.773e+02, threshold=5.149e+02, percent-clipped=4.0 +2022-12-08 12:26:47,861 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=129137.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:27:03,407 INFO [train.py:873] (2/4) Epoch 18, batch 600, loss[loss=0.1362, simple_loss=0.1589, pruned_loss=0.05678, over 8654.00 frames. ], tot_loss[loss=0.1061, simple_loss=0.1421, pruned_loss=0.03503, over 1969550.22 frames. ], batch size: 100, lr: 4.42e-03, grad_scale: 8.0 +2022-12-08 12:27:10,452 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9071, 1.7990, 3.1127, 2.2873, 2.9360, 1.8267, 2.4123, 2.9499], + device='cuda:2'), covar=tensor([0.1076, 0.3851, 0.0695, 0.3669, 0.1035, 0.3117, 0.1291, 0.0875], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0201, 0.0221, 0.0273, 0.0241, 0.0205, 0.0203, 0.0223], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 12:28:04,835 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.209e+01 2.130e+02 2.657e+02 3.284e+02 6.073e+02, threshold=5.313e+02, percent-clipped=6.0 +2022-12-08 12:28:12,918 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0913, 1.3018, 1.3200, 0.9775, 0.9064, 1.0879, 0.8353, 1.1905], + device='cuda:2'), covar=tensor([0.1879, 0.3006, 0.1266, 0.2792, 0.2882, 0.1482, 0.1896, 0.1423], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0102, 0.0096, 0.0100, 0.0115, 0.0091, 0.0118, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 12:28:24,379 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=129246.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 12:28:32,539 INFO [train.py:873] (2/4) Epoch 18, batch 700, loss[loss=0.09074, simple_loss=0.1355, pruned_loss=0.02296, over 14561.00 frames. ], tot_loss[loss=0.1045, simple_loss=0.141, pruned_loss=0.03405, over 1925087.38 frames. ], batch size: 34, lr: 4.41e-03, grad_scale: 4.0 +2022-12-08 12:28:51,084 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.78 vs. limit=2.0 +2022-12-08 12:29:33,262 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.087e+02 1.934e+02 2.414e+02 2.791e+02 7.591e+02, threshold=4.829e+02, percent-clipped=2.0 +2022-12-08 12:29:59,450 INFO [train.py:873] (2/4) Epoch 18, batch 800, loss[loss=0.104, simple_loss=0.1489, pruned_loss=0.02953, over 14098.00 frames. ], tot_loss[loss=0.1052, simple_loss=0.1417, pruned_loss=0.03435, over 1948696.98 frames. ], batch size: 29, lr: 4.41e-03, grad_scale: 8.0 +2022-12-08 12:30:51,480 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=129413.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 12:31:01,565 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.411e+02 2.286e+02 2.776e+02 3.751e+02 1.143e+03, threshold=5.551e+02, percent-clipped=10.0 +2022-12-08 12:31:11,962 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=129437.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:31:27,672 INFO [train.py:873] (2/4) Epoch 18, batch 900, loss[loss=0.07964, simple_loss=0.1161, pruned_loss=0.02157, over 6008.00 frames. ], tot_loss[loss=0.1057, simple_loss=0.1419, pruned_loss=0.03471, over 1963316.40 frames. ], batch size: 100, lr: 4.41e-03, grad_scale: 8.0 +2022-12-08 12:31:44,560 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=129474.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 12:31:53,753 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=129485.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:32:28,738 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.182e+02 2.020e+02 2.482e+02 3.043e+02 9.483e+02, threshold=4.965e+02, percent-clipped=2.0 +2022-12-08 12:32:48,076 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=129546.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 12:32:55,378 INFO [train.py:873] (2/4) Epoch 18, batch 1000, loss[loss=0.08873, simple_loss=0.1308, pruned_loss=0.02333, over 13825.00 frames. ], tot_loss[loss=0.106, simple_loss=0.142, pruned_loss=0.03505, over 1926668.08 frames. ], batch size: 20, lr: 4.41e-03, grad_scale: 8.0 +2022-12-08 12:33:12,997 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4677, 2.2245, 2.4298, 1.6075, 2.1278, 2.3864, 2.4306, 2.1346], + device='cuda:2'), covar=tensor([0.0784, 0.0633, 0.0886, 0.1322, 0.1132, 0.0768, 0.0707, 0.1265], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0171, 0.0141, 0.0125, 0.0143, 0.0155, 0.0136, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 12:33:29,297 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=129594.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 12:33:45,410 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=129612.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:33:46,286 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6267, 2.5347, 2.0819, 2.6610, 2.4755, 2.5432, 2.3123, 2.1464], + device='cuda:2'), covar=tensor([0.1178, 0.1320, 0.2958, 0.1023, 0.1360, 0.1338, 0.1661, 0.2379], + device='cuda:2'), in_proj_covar=tensor([0.0276, 0.0285, 0.0256, 0.0285, 0.0318, 0.0299, 0.0249, 0.0239], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 12:33:56,804 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.083e+02 2.171e+02 2.641e+02 3.416e+02 7.976e+02, threshold=5.283e+02, percent-clipped=6.0 +2022-12-08 12:34:23,017 INFO [train.py:873] (2/4) Epoch 18, batch 1100, loss[loss=0.09275, simple_loss=0.118, pruned_loss=0.03376, over 3841.00 frames. ], tot_loss[loss=0.1065, simple_loss=0.1422, pruned_loss=0.03539, over 1966763.30 frames. ], batch size: 100, lr: 4.41e-03, grad_scale: 8.0 +2022-12-08 12:34:39,290 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=129673.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:34:44,493 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.20 vs. limit=5.0 +2022-12-08 12:35:23,667 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.177e+02 2.155e+02 2.731e+02 3.358e+02 7.442e+02, threshold=5.463e+02, percent-clipped=7.0 +2022-12-08 12:35:28,152 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-08 12:35:50,418 INFO [train.py:873] (2/4) Epoch 18, batch 1200, loss[loss=0.1098, simple_loss=0.1516, pruned_loss=0.03405, over 14399.00 frames. ], tot_loss[loss=0.107, simple_loss=0.1427, pruned_loss=0.03567, over 1976352.48 frames. ], batch size: 41, lr: 4.41e-03, grad_scale: 8.0 +2022-12-08 12:36:00,301 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.68 vs. limit=2.0 +2022-12-08 12:36:02,211 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=129769.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 12:36:18,979 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.41 vs. limit=5.0 +2022-12-08 12:36:21,546 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=129791.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:36:51,111 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 2.078e+02 2.653e+02 3.269e+02 7.697e+02, threshold=5.307e+02, percent-clipped=3.0 +2022-12-08 12:37:08,583 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9612, 1.7090, 4.2486, 4.0034, 3.9272, 4.4166, 3.8355, 4.4187], + device='cuda:2'), covar=tensor([0.1635, 0.1664, 0.0131, 0.0228, 0.0266, 0.0138, 0.0219, 0.0130], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0159, 0.0131, 0.0168, 0.0148, 0.0144, 0.0127, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 12:37:14,931 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=129852.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:37:17,148 INFO [train.py:873] (2/4) Epoch 18, batch 1300, loss[loss=0.1421, simple_loss=0.1556, pruned_loss=0.06433, over 4993.00 frames. ], tot_loss[loss=0.1061, simple_loss=0.1421, pruned_loss=0.03511, over 1979698.77 frames. ], batch size: 100, lr: 4.40e-03, grad_scale: 8.0 +2022-12-08 12:37:20,550 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 12:37:33,285 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3016, 3.4054, 3.5608, 3.2847, 3.4710, 3.1973, 1.5127, 3.2593], + device='cuda:2'), covar=tensor([0.0403, 0.0371, 0.0330, 0.0446, 0.0325, 0.0719, 0.3093, 0.0306], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0178, 0.0147, 0.0150, 0.0209, 0.0145, 0.0159, 0.0197], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 12:38:18,569 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.477e+02 2.020e+02 2.586e+02 3.074e+02 6.619e+02, threshold=5.173e+02, percent-clipped=1.0 +2022-12-08 12:38:25,459 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-12-08 12:38:27,143 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.20 vs. limit=5.0 +2022-12-08 12:38:32,396 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0974, 1.8763, 3.2021, 2.3767, 3.0245, 1.8652, 2.5234, 3.0279], + device='cuda:2'), covar=tensor([0.1156, 0.3836, 0.0650, 0.3744, 0.1195, 0.2991, 0.1275, 0.0814], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0198, 0.0221, 0.0269, 0.0238, 0.0204, 0.0201, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 12:38:44,885 INFO [train.py:873] (2/4) Epoch 18, batch 1400, loss[loss=0.09291, simple_loss=0.1369, pruned_loss=0.02444, over 14260.00 frames. ], tot_loss[loss=0.1054, simple_loss=0.1416, pruned_loss=0.03454, over 2000815.47 frames. ], batch size: 28, lr: 4.40e-03, grad_scale: 8.0 +2022-12-08 12:38:55,884 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=129968.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:39:13,975 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0428, 1.8550, 4.6471, 4.2554, 4.1353, 4.7417, 4.2676, 4.8110], + device='cuda:2'), covar=tensor([0.1508, 0.1499, 0.0098, 0.0210, 0.0223, 0.0126, 0.0145, 0.0094], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0158, 0.0131, 0.0168, 0.0147, 0.0143, 0.0127, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 12:39:19,139 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8714, 1.5804, 1.8889, 1.5947, 1.9942, 1.7646, 1.6714, 1.8698], + device='cuda:2'), covar=tensor([0.0540, 0.1585, 0.0451, 0.0599, 0.0467, 0.0821, 0.0287, 0.0443], + device='cuda:2'), in_proj_covar=tensor([0.0352, 0.0309, 0.0389, 0.0294, 0.0362, 0.0320, 0.0357, 0.0295], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:39:28,322 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=130000.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:39:30,083 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=130002.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:39:32,641 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9996, 1.7794, 1.9237, 1.7788, 1.9082, 1.1894, 1.7853, 1.8882], + device='cuda:2'), covar=tensor([0.1050, 0.1130, 0.0726, 0.1452, 0.1067, 0.1160, 0.0798, 0.0749], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0035, 0.0040, 0.0033, 0.0035, 0.0049, 0.0037, 0.0039], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 12:39:49,592 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9468, 3.5303, 3.1682, 2.2093, 3.4384, 3.7223, 4.2157, 3.1416], + device='cuda:2'), covar=tensor([0.0511, 0.1202, 0.0807, 0.1481, 0.0816, 0.0528, 0.0553, 0.0947], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0171, 0.0141, 0.0125, 0.0143, 0.0155, 0.0135, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 12:39:51,055 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.109e+02 2.021e+02 2.494e+02 2.988e+02 5.995e+02, threshold=4.988e+02, percent-clipped=3.0 +2022-12-08 12:40:15,734 INFO [train.py:873] (2/4) Epoch 18, batch 1500, loss[loss=0.09531, simple_loss=0.1393, pruned_loss=0.02566, over 14511.00 frames. ], tot_loss[loss=0.1053, simple_loss=0.1415, pruned_loss=0.03455, over 1981519.67 frames. ], batch size: 49, lr: 4.40e-03, grad_scale: 4.0 +2022-12-08 12:40:21,367 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=130061.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:40:23,161 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=130063.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 12:40:28,204 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9065, 1.6101, 3.5936, 3.3338, 3.3851, 3.6212, 2.9536, 3.6270], + device='cuda:2'), covar=tensor([0.1726, 0.1809, 0.0161, 0.0325, 0.0313, 0.0194, 0.0357, 0.0172], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0159, 0.0132, 0.0169, 0.0148, 0.0144, 0.0127, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 12:40:28,210 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=130069.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 12:41:09,676 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=130117.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 12:41:17,027 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.082e+02 2.103e+02 2.601e+02 2.993e+02 6.866e+02, threshold=5.203e+02, percent-clipped=5.0 +2022-12-08 12:41:32,316 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5507, 1.3837, 3.5922, 1.7136, 3.4350, 3.6257, 2.5791, 3.8798], + device='cuda:2'), covar=tensor([0.0236, 0.3239, 0.0405, 0.2176, 0.0754, 0.0390, 0.0980, 0.0185], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0155, 0.0159, 0.0167, 0.0168, 0.0179, 0.0134, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:41:35,572 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=130147.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:41:42,392 INFO [train.py:873] (2/4) Epoch 18, batch 1600, loss[loss=0.1298, simple_loss=0.1515, pruned_loss=0.0541, over 7804.00 frames. ], tot_loss[loss=0.1061, simple_loss=0.1419, pruned_loss=0.03511, over 1997136.63 frames. ], batch size: 100, lr: 4.40e-03, grad_scale: 8.0 +2022-12-08 12:42:30,816 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9802, 1.6711, 3.6853, 3.4338, 3.5074, 3.7011, 3.0356, 3.7173], + device='cuda:2'), covar=tensor([0.1596, 0.1668, 0.0139, 0.0282, 0.0281, 0.0172, 0.0300, 0.0144], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0159, 0.0132, 0.0168, 0.0148, 0.0144, 0.0127, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 12:42:40,184 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-12-08 12:42:44,999 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.342e+02 2.040e+02 2.513e+02 3.158e+02 8.589e+02, threshold=5.027e+02, percent-clipped=3.0 +2022-12-08 12:42:51,119 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2535, 2.8525, 2.9437, 1.9464, 2.7055, 2.9730, 3.2566, 2.6613], + device='cuda:2'), covar=tensor([0.0694, 0.0825, 0.0863, 0.1444, 0.0993, 0.0739, 0.0612, 0.1189], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0171, 0.0141, 0.0125, 0.0143, 0.0155, 0.0135, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 12:43:04,642 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6982, 1.8009, 1.8576, 1.3093, 1.3175, 1.6158, 1.2068, 1.6831], + device='cuda:2'), covar=tensor([0.1287, 0.2218, 0.1035, 0.2556, 0.2823, 0.1204, 0.2709, 0.1111], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0103, 0.0095, 0.0101, 0.0115, 0.0092, 0.0118, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 12:43:10,263 INFO [train.py:873] (2/4) Epoch 18, batch 1700, loss[loss=0.1129, simple_loss=0.12, pruned_loss=0.05285, over 1214.00 frames. ], tot_loss[loss=0.1064, simple_loss=0.1422, pruned_loss=0.03529, over 1984559.40 frames. ], batch size: 100, lr: 4.40e-03, grad_scale: 8.0 +2022-12-08 12:43:22,163 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=130268.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:43:31,605 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8578, 1.8489, 1.6511, 1.9222, 1.7937, 1.8468, 1.8204, 1.6684], + device='cuda:2'), covar=tensor([0.1168, 0.0877, 0.2198, 0.1008, 0.1222, 0.0755, 0.1631, 0.1176], + device='cuda:2'), in_proj_covar=tensor([0.0278, 0.0284, 0.0255, 0.0286, 0.0318, 0.0299, 0.0250, 0.0240], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 12:43:38,515 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7014, 3.0966, 2.8596, 3.0789, 2.3670, 3.2280, 2.9614, 1.6125], + device='cuda:2'), covar=tensor([0.1207, 0.0600, 0.0868, 0.0495, 0.0880, 0.0381, 0.0802, 0.1872], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0090, 0.0070, 0.0076, 0.0100, 0.0090, 0.0100, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 12:43:39,840 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.63 vs. limit=2.0 +2022-12-08 12:44:04,008 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=130316.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:44:12,323 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.118e+02 2.048e+02 2.469e+02 3.168e+02 7.190e+02, threshold=4.938e+02, percent-clipped=8.0 +2022-12-08 12:44:37,344 INFO [train.py:873] (2/4) Epoch 18, batch 1800, loss[loss=0.1343, simple_loss=0.1346, pruned_loss=0.06698, over 1253.00 frames. ], tot_loss[loss=0.107, simple_loss=0.1428, pruned_loss=0.03555, over 1969935.61 frames. ], batch size: 100, lr: 4.40e-03, grad_scale: 8.0 +2022-12-08 12:44:38,341 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=130356.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:44:38,902 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.14 vs. limit=5.0 +2022-12-08 12:44:40,086 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=130358.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 12:44:54,488 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.98 vs. limit=5.0 +2022-12-08 12:45:03,061 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=130384.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:45:39,184 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.710e+01 2.001e+02 2.479e+02 3.133e+02 6.689e+02, threshold=4.957e+02, percent-clipped=4.0 +2022-12-08 12:45:44,525 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=130432.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:45:47,248 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.01 vs. limit=5.0 +2022-12-08 12:45:49,408 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=130438.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:45:55,527 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=130445.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:45:57,232 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=130447.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:46:04,201 INFO [train.py:873] (2/4) Epoch 18, batch 1900, loss[loss=0.116, simple_loss=0.1305, pruned_loss=0.05078, over 4943.00 frames. ], tot_loss[loss=0.1073, simple_loss=0.1425, pruned_loss=0.03601, over 1917404.04 frames. ], batch size: 100, lr: 4.39e-03, grad_scale: 8.0 +2022-12-08 12:46:27,172 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2870, 2.4354, 1.7527, 2.3731, 2.4318, 2.0669, 2.0710, 2.2719], + device='cuda:2'), covar=tensor([0.0292, 0.0742, 0.0643, 0.0465, 0.0258, 0.0437, 0.0476, 0.0399], + device='cuda:2'), in_proj_covar=tensor([0.0023, 0.0023, 0.0020, 0.0022, 0.0021, 0.0034, 0.0028, 0.0033], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 12:46:32,212 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7132, 1.4320, 3.0057, 1.5437, 3.0125, 2.9527, 2.1476, 3.0727], + device='cuda:2'), covar=tensor([0.0410, 0.3875, 0.0504, 0.2546, 0.0573, 0.0609, 0.1083, 0.0428], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0155, 0.0159, 0.0168, 0.0168, 0.0179, 0.0133, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:46:37,639 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=130493.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:46:39,392 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=130495.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:46:42,975 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=130499.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:47:05,390 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4952, 4.2135, 4.0222, 4.4926, 4.2166, 4.0314, 4.5369, 3.7690], + device='cuda:2'), covar=tensor([0.0438, 0.0976, 0.0457, 0.0463, 0.0774, 0.0901, 0.0518, 0.0529], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0278, 0.0200, 0.0194, 0.0185, 0.0159, 0.0287, 0.0168], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 12:47:06,079 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.096e+02 2.089e+02 2.666e+02 3.206e+02 5.999e+02, threshold=5.333e+02, percent-clipped=5.0 +2022-12-08 12:47:25,628 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=130548.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:47:31,378 INFO [train.py:873] (2/4) Epoch 18, batch 2000, loss[loss=0.1016, simple_loss=0.1399, pruned_loss=0.03164, over 14238.00 frames. ], tot_loss[loss=0.1082, simple_loss=0.1434, pruned_loss=0.03648, over 1996620.09 frames. ], batch size: 89, lr: 4.39e-03, grad_scale: 8.0 +2022-12-08 12:47:40,291 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=130565.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:48:18,807 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=130609.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 12:48:34,463 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=130626.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:48:35,243 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.254e+02 2.007e+02 2.517e+02 3.001e+02 4.503e+02, threshold=5.034e+02, percent-clipped=0.0 +2022-12-08 12:48:59,208 INFO [train.py:873] (2/4) Epoch 18, batch 2100, loss[loss=0.0968, simple_loss=0.1353, pruned_loss=0.02914, over 14109.00 frames. ], tot_loss[loss=0.1069, simple_loss=0.1428, pruned_loss=0.03555, over 1987377.30 frames. ], batch size: 29, lr: 4.39e-03, grad_scale: 4.0 +2022-12-08 12:49:00,454 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=130656.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:49:02,065 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=130658.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 12:49:25,478 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8679, 1.6691, 1.8193, 1.7965, 1.8341, 1.1459, 1.5897, 1.7734], + device='cuda:2'), covar=tensor([0.0699, 0.0744, 0.0565, 0.0624, 0.0568, 0.0867, 0.0816, 0.0497], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0035, 0.0040, 0.0033, 0.0034, 0.0048, 0.0036, 0.0038], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 12:49:41,924 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=130704.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:49:43,637 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=130706.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:49:55,923 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7860, 3.5687, 3.3115, 2.4626, 3.1528, 3.4567, 3.7674, 3.1122], + device='cuda:2'), covar=tensor([0.0544, 0.0963, 0.0845, 0.1263, 0.1035, 0.0571, 0.0804, 0.1012], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0171, 0.0141, 0.0126, 0.0144, 0.0156, 0.0136, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 12:50:01,667 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.093e+02 2.249e+02 2.636e+02 3.244e+02 1.045e+03, threshold=5.271e+02, percent-clipped=2.0 +2022-12-08 12:50:13,018 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=130740.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:50:16,900 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6977, 1.7707, 1.8469, 1.3200, 1.3073, 1.6138, 1.1694, 1.7150], + device='cuda:2'), covar=tensor([0.1379, 0.2377, 0.1081, 0.2436, 0.2882, 0.1225, 0.2846, 0.1187], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0103, 0.0095, 0.0101, 0.0115, 0.0091, 0.0117, 0.0094], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 12:50:26,107 INFO [train.py:873] (2/4) Epoch 18, batch 2200, loss[loss=0.1164, simple_loss=0.1406, pruned_loss=0.04617, over 4963.00 frames. ], tot_loss[loss=0.1073, simple_loss=0.1431, pruned_loss=0.03574, over 1977741.93 frames. ], batch size: 100, lr: 4.39e-03, grad_scale: 4.0 +2022-12-08 12:50:33,405 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.85 vs. limit=2.0 +2022-12-08 12:50:55,150 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=130788.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:51:00,294 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=130794.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:51:03,812 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7076, 1.4625, 3.6664, 1.8879, 3.6218, 3.7875, 2.8084, 4.0482], + device='cuda:2'), covar=tensor([0.0239, 0.3120, 0.0395, 0.1975, 0.0611, 0.0422, 0.0771, 0.0195], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0154, 0.0158, 0.0166, 0.0166, 0.0177, 0.0132, 0.0151], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:51:29,862 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.247e+02 2.113e+02 2.522e+02 3.160e+02 5.458e+02, threshold=5.044e+02, percent-clipped=1.0 +2022-12-08 12:51:53,885 INFO [train.py:873] (2/4) Epoch 18, batch 2300, loss[loss=0.1159, simple_loss=0.1418, pruned_loss=0.04502, over 7773.00 frames. ], tot_loss[loss=0.1057, simple_loss=0.1418, pruned_loss=0.03478, over 1964155.75 frames. ], batch size: 100, lr: 4.39e-03, grad_scale: 4.0 +2022-12-08 12:52:30,457 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.88 vs. limit=2.0 +2022-12-08 12:52:37,130 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=130904.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 12:52:51,926 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=130921.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:52:56,900 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.171e+02 2.034e+02 2.487e+02 3.081e+02 6.323e+02, threshold=4.975e+02, percent-clipped=2.0 +2022-12-08 12:53:14,323 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4999, 1.0689, 2.0270, 1.8108, 1.8726, 2.0413, 1.4324, 2.0617], + device='cuda:2'), covar=tensor([0.0912, 0.1671, 0.0293, 0.0608, 0.0723, 0.0396, 0.0758, 0.0361], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0158, 0.0131, 0.0168, 0.0147, 0.0144, 0.0126, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 12:53:22,078 INFO [train.py:873] (2/4) Epoch 18, batch 2400, loss[loss=0.1245, simple_loss=0.1429, pruned_loss=0.05301, over 5960.00 frames. ], tot_loss[loss=0.1058, simple_loss=0.1421, pruned_loss=0.03473, over 1961082.14 frames. ], batch size: 100, lr: 4.39e-03, grad_scale: 8.0 +2022-12-08 12:53:43,940 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1465, 1.1188, 1.0604, 1.1131, 1.2210, 0.8133, 0.9523, 1.1380], + device='cuda:2'), covar=tensor([0.0805, 0.0990, 0.0866, 0.0687, 0.0800, 0.0980, 0.1317, 0.1018], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0035, 0.0040, 0.0033, 0.0035, 0.0049, 0.0037, 0.0039], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 12:54:25,833 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.164e+02 2.032e+02 2.544e+02 3.351e+02 8.852e+02, threshold=5.088e+02, percent-clipped=2.0 +2022-12-08 12:54:37,338 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=131040.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:54:50,165 INFO [train.py:873] (2/4) Epoch 18, batch 2500, loss[loss=0.1217, simple_loss=0.1543, pruned_loss=0.04453, over 14420.00 frames. ], tot_loss[loss=0.1063, simple_loss=0.1423, pruned_loss=0.03515, over 1951555.23 frames. ], batch size: 73, lr: 4.38e-03, grad_scale: 8.0 +2022-12-08 12:55:04,375 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4797, 1.6590, 1.9248, 1.8683, 1.7984, 1.8623, 1.6053, 1.3284], + device='cuda:2'), covar=tensor([0.1081, 0.1375, 0.0468, 0.0614, 0.1154, 0.0907, 0.1441, 0.1889], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0091, 0.0071, 0.0077, 0.0100, 0.0091, 0.0102, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 12:55:19,756 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=131088.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:55:19,809 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=131088.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:55:25,096 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=131094.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:55:54,052 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.009e+02 2.192e+02 2.699e+02 3.352e+02 8.933e+02, threshold=5.398e+02, percent-clipped=5.0 +2022-12-08 12:56:01,756 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=131136.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:56:07,938 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=131142.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:56:11,569 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3238, 2.6434, 5.2433, 3.6124, 5.0005, 2.5268, 3.8602, 5.0450], + device='cuda:2'), covar=tensor([0.0426, 0.3574, 0.0347, 0.5736, 0.0427, 0.3238, 0.1202, 0.0329], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0199, 0.0220, 0.0269, 0.0236, 0.0202, 0.0200, 0.0223], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 12:56:19,382 INFO [train.py:873] (2/4) Epoch 18, batch 2600, loss[loss=0.09774, simple_loss=0.137, pruned_loss=0.02924, over 13959.00 frames. ], tot_loss[loss=0.1049, simple_loss=0.141, pruned_loss=0.03437, over 1978617.01 frames. ], batch size: 26, lr: 4.38e-03, grad_scale: 8.0 +2022-12-08 12:56:25,636 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7806, 1.6792, 2.9068, 2.1297, 2.8433, 1.7246, 2.2374, 2.7344], + device='cuda:2'), covar=tensor([0.1195, 0.4447, 0.0760, 0.4166, 0.1086, 0.3574, 0.1317, 0.1040], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0199, 0.0220, 0.0269, 0.0236, 0.0202, 0.0200, 0.0223], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 12:56:35,678 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=131174.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:57:02,613 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=131204.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 12:57:17,467 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=131221.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:57:22,470 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.383e+02 2.143e+02 2.636e+02 3.180e+02 5.935e+02, threshold=5.272e+02, percent-clipped=1.0 +2022-12-08 12:57:29,990 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=131235.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 12:57:44,013 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=131252.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:57:46,574 INFO [train.py:873] (2/4) Epoch 18, batch 2700, loss[loss=0.08575, simple_loss=0.1294, pruned_loss=0.02106, over 14653.00 frames. ], tot_loss[loss=0.1052, simple_loss=0.1413, pruned_loss=0.03452, over 1981214.19 frames. ], batch size: 33, lr: 4.38e-03, grad_scale: 4.0 +2022-12-08 12:57:47,284 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-12-08 12:57:50,591 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2111, 1.7962, 2.1906, 1.8145, 2.2541, 2.0844, 2.0489, 2.1211], + device='cuda:2'), covar=tensor([0.0626, 0.2095, 0.0681, 0.0963, 0.0521, 0.1000, 0.0557, 0.0746], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0309, 0.0391, 0.0294, 0.0366, 0.0321, 0.0359, 0.0298], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:57:59,031 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=131269.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 12:58:02,050 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2197, 3.9689, 3.7359, 3.8967, 4.1102, 4.1737, 4.1923, 4.1919], + device='cuda:2'), covar=tensor([0.0811, 0.0604, 0.2025, 0.2529, 0.0726, 0.0786, 0.0853, 0.0791], + device='cuda:2'), in_proj_covar=tensor([0.0392, 0.0277, 0.0452, 0.0571, 0.0352, 0.0451, 0.0395, 0.0399], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:58:20,066 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-08 12:58:50,253 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.346e+01 2.091e+02 2.717e+02 3.519e+02 6.308e+02, threshold=5.434e+02, percent-clipped=4.0 +2022-12-08 12:58:52,272 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5095, 2.3083, 2.4018, 2.1933, 2.3849, 1.5077, 2.3693, 2.5072], + device='cuda:2'), covar=tensor([0.0937, 0.0757, 0.0746, 0.2425, 0.1341, 0.0794, 0.1168, 0.0805], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0035, 0.0040, 0.0033, 0.0034, 0.0048, 0.0037, 0.0039], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 12:58:58,947 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.20 vs. limit=2.0 +2022-12-08 12:59:14,301 INFO [train.py:873] (2/4) Epoch 18, batch 2800, loss[loss=0.1274, simple_loss=0.1198, pruned_loss=0.06749, over 1288.00 frames. ], tot_loss[loss=0.1064, simple_loss=0.1421, pruned_loss=0.03535, over 1929064.44 frames. ], batch size: 100, lr: 4.38e-03, grad_scale: 8.0 +2022-12-08 12:59:16,257 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6197, 3.8083, 3.9673, 3.6847, 3.8357, 3.8376, 1.5420, 3.6584], + device='cuda:2'), covar=tensor([0.0418, 0.0365, 0.0341, 0.0439, 0.0321, 0.0428, 0.3197, 0.0302], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0177, 0.0146, 0.0150, 0.0208, 0.0143, 0.0159, 0.0195], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 12:59:17,871 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0494, 4.7560, 4.4188, 4.6470, 4.7320, 4.9640, 4.9823, 4.9902], + device='cuda:2'), covar=tensor([0.0776, 0.0429, 0.1967, 0.2728, 0.0763, 0.0790, 0.0840, 0.0836], + device='cuda:2'), in_proj_covar=tensor([0.0393, 0.0278, 0.0452, 0.0572, 0.0353, 0.0452, 0.0395, 0.0401], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 12:59:52,727 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3185, 2.9918, 3.9487, 2.9557, 2.4760, 3.2868, 1.6216, 3.5473], + device='cuda:2'), covar=tensor([0.1025, 0.1187, 0.0453, 0.1609, 0.1955, 0.0826, 0.3432, 0.1168], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0104, 0.0095, 0.0101, 0.0116, 0.0092, 0.0118, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 13:00:02,576 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=131409.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:00:08,453 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=131416.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:00:18,983 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.304e+02 1.962e+02 2.375e+02 2.873e+02 4.439e+02, threshold=4.749e+02, percent-clipped=0.0 +2022-12-08 13:00:42,313 INFO [train.py:873] (2/4) Epoch 18, batch 2900, loss[loss=0.1677, simple_loss=0.1497, pruned_loss=0.09281, over 1269.00 frames. ], tot_loss[loss=0.1066, simple_loss=0.1421, pruned_loss=0.03555, over 1871962.17 frames. ], batch size: 100, lr: 4.38e-03, grad_scale: 8.0 +2022-12-08 13:00:55,780 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=131470.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:01:02,106 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=131477.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:01:15,250 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=131492.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:01:20,278 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9920, 2.6154, 3.5007, 2.4588, 2.2990, 3.1495, 1.7279, 3.1368], + device='cuda:2'), covar=tensor([0.0832, 0.1452, 0.0521, 0.1960, 0.2022, 0.0684, 0.2919, 0.0776], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0104, 0.0096, 0.0101, 0.0116, 0.0092, 0.0118, 0.0095], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 13:01:46,460 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.231e+02 2.257e+02 2.729e+02 3.510e+02 6.783e+02, threshold=5.458e+02, percent-clipped=7.0 +2022-12-08 13:01:48,338 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=131530.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 13:01:53,570 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8380, 1.7462, 1.8469, 1.9067, 1.8343, 1.2303, 1.5462, 1.7373], + device='cuda:2'), covar=tensor([0.0723, 0.0753, 0.0599, 0.0653, 0.1027, 0.0780, 0.0878, 0.0662], + device='cuda:2'), in_proj_covar=tensor([0.0036, 0.0034, 0.0039, 0.0033, 0.0034, 0.0048, 0.0036, 0.0039], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 13:02:08,659 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=131553.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:02:10,262 INFO [train.py:873] (2/4) Epoch 18, batch 3000, loss[loss=0.09663, simple_loss=0.1347, pruned_loss=0.0293, over 14406.00 frames. ], tot_loss[loss=0.1057, simple_loss=0.1413, pruned_loss=0.03502, over 1867382.19 frames. ], batch size: 53, lr: 4.38e-03, grad_scale: 8.0 +2022-12-08 13:02:10,262 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 13:02:18,730 INFO [train.py:905] (2/4) Epoch 18, validation: loss=0.1388, simple_loss=0.176, pruned_loss=0.05082, over 857387.00 frames. +2022-12-08 13:02:18,731 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 13:03:22,346 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.140e+02 2.067e+02 2.535e+02 3.249e+02 7.779e+02, threshold=5.070e+02, percent-clipped=4.0 +2022-12-08 13:03:38,421 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3269, 2.6214, 2.5696, 2.7341, 2.2311, 2.7328, 2.4950, 1.5379], + device='cuda:2'), covar=tensor([0.0967, 0.0777, 0.0854, 0.0655, 0.0967, 0.0684, 0.1091, 0.1915], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0091, 0.0071, 0.0077, 0.0100, 0.0091, 0.0102, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 13:03:44,985 INFO [train.py:873] (2/4) Epoch 18, batch 3100, loss[loss=0.117, simple_loss=0.1504, pruned_loss=0.0418, over 14258.00 frames. ], tot_loss[loss=0.1062, simple_loss=0.1418, pruned_loss=0.03534, over 1897850.52 frames. ], batch size: 63, lr: 4.37e-03, grad_scale: 8.0 +2022-12-08 13:04:49,759 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.195e+02 2.137e+02 2.592e+02 3.013e+02 4.856e+02, threshold=5.183e+02, percent-clipped=0.0 +2022-12-08 13:05:12,291 INFO [train.py:873] (2/4) Epoch 18, batch 3200, loss[loss=0.1197, simple_loss=0.1568, pruned_loss=0.04132, over 14405.00 frames. ], tot_loss[loss=0.1064, simple_loss=0.1418, pruned_loss=0.03555, over 1897211.05 frames. ], batch size: 53, lr: 4.37e-03, grad_scale: 8.0 +2022-12-08 13:05:20,990 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=131765.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:05:27,338 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=131772.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:05:37,280 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.25 vs. limit=2.0 +2022-12-08 13:05:53,628 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3313, 4.0069, 3.7917, 3.9854, 4.1706, 4.2224, 4.2803, 4.2861], + device='cuda:2'), covar=tensor([0.0732, 0.0558, 0.1868, 0.2198, 0.0703, 0.0797, 0.0798, 0.0765], + device='cuda:2'), in_proj_covar=tensor([0.0396, 0.0278, 0.0452, 0.0573, 0.0352, 0.0454, 0.0395, 0.0402], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:06:17,105 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.234e+02 2.113e+02 2.491e+02 2.980e+02 5.007e+02, threshold=4.982e+02, percent-clipped=0.0 +2022-12-08 13:06:18,126 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=131830.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:06:34,200 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=131848.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:06:40,049 INFO [train.py:873] (2/4) Epoch 18, batch 3300, loss[loss=0.1555, simple_loss=0.1506, pruned_loss=0.08017, over 1243.00 frames. ], tot_loss[loss=0.1055, simple_loss=0.1413, pruned_loss=0.03488, over 1917512.99 frames. ], batch size: 100, lr: 4.37e-03, grad_scale: 8.0 +2022-12-08 13:06:41,807 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=131857.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:06:57,346 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0937, 2.1551, 1.9919, 2.2017, 1.8574, 2.0539, 2.1623, 2.0888], + device='cuda:2'), covar=tensor([0.0909, 0.1144, 0.1126, 0.0905, 0.1395, 0.0831, 0.1108, 0.1075], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0146, 0.0150, 0.0164, 0.0149, 0.0124, 0.0171, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 13:07:00,050 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=131878.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:07:01,958 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0895, 2.2568, 2.4345, 2.4935, 2.0909, 2.4295, 2.3459, 1.4452], + device='cuda:2'), covar=tensor([0.1096, 0.1181, 0.0668, 0.0535, 0.0986, 0.0671, 0.0938, 0.2047], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0092, 0.0071, 0.0077, 0.0100, 0.0091, 0.0103, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 13:07:32,527 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.67 vs. limit=2.0 +2022-12-08 13:07:34,711 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=131918.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:07:44,549 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.221e+02 2.033e+02 2.408e+02 3.060e+02 5.712e+02, threshold=4.815e+02, percent-clipped=3.0 +2022-12-08 13:07:52,999 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2267, 2.2362, 2.5357, 1.7020, 1.7517, 2.2799, 1.4550, 2.2205], + device='cuda:2'), covar=tensor([0.1044, 0.1448, 0.0713, 0.2445, 0.2459, 0.1193, 0.3337, 0.1121], + device='cuda:2'), in_proj_covar=tensor([0.0086, 0.0104, 0.0096, 0.0102, 0.0117, 0.0093, 0.0119, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 13:08:06,598 INFO [train.py:873] (2/4) Epoch 18, batch 3400, loss[loss=0.1225, simple_loss=0.131, pruned_loss=0.05702, over 2658.00 frames. ], tot_loss[loss=0.1054, simple_loss=0.1414, pruned_loss=0.03469, over 1950430.45 frames. ], batch size: 100, lr: 4.37e-03, grad_scale: 8.0 +2022-12-08 13:08:25,029 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3415, 2.7203, 4.2038, 3.1030, 4.2404, 3.9912, 3.9949, 3.5552], + device='cuda:2'), covar=tensor([0.0726, 0.3047, 0.1025, 0.1815, 0.0748, 0.1089, 0.1509, 0.1718], + device='cuda:2'), in_proj_covar=tensor([0.0352, 0.0309, 0.0389, 0.0295, 0.0365, 0.0318, 0.0360, 0.0296], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:08:42,108 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0007, 1.6080, 3.9065, 1.8880, 3.8989, 4.1077, 2.9485, 4.3471], + device='cuda:2'), covar=tensor([0.0197, 0.2949, 0.0416, 0.2009, 0.0429, 0.0320, 0.0760, 0.0161], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0156, 0.0160, 0.0167, 0.0169, 0.0178, 0.0134, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:08:50,338 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8186, 1.5810, 2.0600, 1.6070, 1.8698, 1.4616, 1.6622, 1.8970], + device='cuda:2'), covar=tensor([0.3325, 0.2707, 0.0766, 0.2346, 0.1637, 0.1428, 0.1204, 0.1242], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0199, 0.0219, 0.0269, 0.0235, 0.0201, 0.0199, 0.0221], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 13:09:10,665 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.372e+02 2.058e+02 2.441e+02 2.998e+02 5.403e+02, threshold=4.883e+02, percent-clipped=4.0 +2022-12-08 13:09:12,607 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=132031.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:09:14,011 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.79 vs. limit=2.0 +2022-12-08 13:09:32,587 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=132054.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:09:33,285 INFO [train.py:873] (2/4) Epoch 18, batch 3500, loss[loss=0.09155, simple_loss=0.1383, pruned_loss=0.0224, over 14316.00 frames. ], tot_loss[loss=0.1062, simple_loss=0.142, pruned_loss=0.03518, over 1954577.16 frames. ], batch size: 31, lr: 4.37e-03, grad_scale: 8.0 +2022-12-08 13:09:42,099 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=132065.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:09:43,380 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.04 vs. limit=5.0 +2022-12-08 13:09:47,866 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=132072.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:09:51,224 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8722, 5.0073, 5.2870, 4.4605, 5.0875, 5.3708, 2.2693, 4.7878], + device='cuda:2'), covar=tensor([0.0259, 0.0277, 0.0333, 0.0349, 0.0282, 0.0119, 0.2799, 0.0261], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0178, 0.0148, 0.0150, 0.0209, 0.0143, 0.0159, 0.0197], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 13:10:05,484 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=132092.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:10:10,773 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.54 vs. limit=2.0 +2022-12-08 13:10:16,831 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2049, 3.9219, 3.7480, 3.8897, 4.1114, 4.1396, 4.1833, 4.1857], + device='cuda:2'), covar=tensor([0.0888, 0.0600, 0.1861, 0.2389, 0.0664, 0.0795, 0.0892, 0.0829], + device='cuda:2'), in_proj_covar=tensor([0.0394, 0.0278, 0.0450, 0.0572, 0.0350, 0.0451, 0.0393, 0.0401], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:10:23,478 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=132113.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:10:25,335 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=132115.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 13:10:29,245 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=132120.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:10:37,537 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.237e+02 2.039e+02 2.680e+02 3.337e+02 6.531e+02, threshold=5.359e+02, percent-clipped=3.0 +2022-12-08 13:10:54,152 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=132148.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:10:59,847 INFO [train.py:873] (2/4) Epoch 18, batch 3600, loss[loss=0.1432, simple_loss=0.1498, pruned_loss=0.06835, over 2662.00 frames. ], tot_loss[loss=0.105, simple_loss=0.1413, pruned_loss=0.0344, over 1956753.11 frames. ], batch size: 100, lr: 4.37e-03, grad_scale: 8.0 +2022-12-08 13:11:00,784 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1911, 4.8778, 4.5750, 4.7987, 4.8718, 5.0612, 5.1195, 5.1537], + device='cuda:2'), covar=tensor([0.0749, 0.0476, 0.2006, 0.2609, 0.0747, 0.0772, 0.0792, 0.0753], + device='cuda:2'), in_proj_covar=tensor([0.0395, 0.0278, 0.0450, 0.0572, 0.0351, 0.0452, 0.0394, 0.0401], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:11:35,841 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=132196.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:11:51,520 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=132213.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:12:05,413 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.022e+02 2.046e+02 2.523e+02 3.076e+02 9.449e+02, threshold=5.046e+02, percent-clipped=2.0 +2022-12-08 13:12:09,715 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5172, 1.6833, 1.9553, 1.9161, 1.7751, 1.8842, 1.5496, 1.3709], + device='cuda:2'), covar=tensor([0.0904, 0.1237, 0.0541, 0.0668, 0.1172, 0.0776, 0.1453, 0.1849], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0091, 0.0072, 0.0077, 0.0100, 0.0091, 0.0103, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 13:12:28,037 INFO [train.py:873] (2/4) Epoch 18, batch 3700, loss[loss=0.1067, simple_loss=0.1454, pruned_loss=0.03398, over 12739.00 frames. ], tot_loss[loss=0.1055, simple_loss=0.1412, pruned_loss=0.03487, over 1913553.11 frames. ], batch size: 100, lr: 4.36e-03, grad_scale: 8.0 +2022-12-08 13:12:36,252 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7938, 2.2244, 4.8249, 3.2555, 4.6001, 2.2720, 3.6845, 4.6094], + device='cuda:2'), covar=tensor([0.0484, 0.4129, 0.0321, 0.5504, 0.0598, 0.3314, 0.1227, 0.0392], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0200, 0.0220, 0.0271, 0.0238, 0.0202, 0.0200, 0.0222], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 13:12:45,535 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3415, 2.9180, 2.9366, 2.0114, 2.7613, 3.0243, 3.3148, 2.6365], + device='cuda:2'), covar=tensor([0.0735, 0.0889, 0.0891, 0.1296, 0.1062, 0.0717, 0.0636, 0.1144], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0173, 0.0143, 0.0127, 0.0146, 0.0157, 0.0138, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 13:13:13,028 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.58 vs. limit=5.0 +2022-12-08 13:13:15,302 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0406, 2.2540, 4.0602, 4.2933, 4.0766, 2.3047, 4.3219, 2.9049], + device='cuda:2'), covar=tensor([0.0505, 0.1516, 0.0891, 0.0447, 0.0545, 0.2397, 0.0397, 0.1311], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0260, 0.0377, 0.0331, 0.0273, 0.0307, 0.0313, 0.0277], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 13:13:32,949 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.341e+02 2.108e+02 2.462e+02 3.226e+02 6.885e+02, threshold=4.924e+02, percent-clipped=3.0 +2022-12-08 13:13:54,143 INFO [train.py:873] (2/4) Epoch 18, batch 3800, loss[loss=0.1016, simple_loss=0.1405, pruned_loss=0.03139, over 14222.00 frames. ], tot_loss[loss=0.1052, simple_loss=0.1417, pruned_loss=0.03438, over 1982554.21 frames. ], batch size: 94, lr: 4.36e-03, grad_scale: 4.0 +2022-12-08 13:14:22,271 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=132387.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:14:36,841 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.62 vs. limit=5.0 +2022-12-08 13:14:42,696 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=132410.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 13:14:59,236 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.308e+02 2.016e+02 2.539e+02 3.051e+02 5.858e+02, threshold=5.078e+02, percent-clipped=3.0 +2022-12-08 13:15:09,086 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.12 vs. limit=2.0 +2022-12-08 13:15:21,340 INFO [train.py:873] (2/4) Epoch 18, batch 3900, loss[loss=0.1389, simple_loss=0.1381, pruned_loss=0.06988, over 1262.00 frames. ], tot_loss[loss=0.1045, simple_loss=0.1411, pruned_loss=0.03401, over 1975582.74 frames. ], batch size: 100, lr: 4.36e-03, grad_scale: 4.0 +2022-12-08 13:15:49,866 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4928, 4.2306, 4.0297, 4.5414, 4.1847, 4.0531, 4.5865, 3.7479], + device='cuda:2'), covar=tensor([0.0423, 0.0904, 0.0444, 0.0419, 0.0779, 0.0977, 0.0520, 0.0555], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0276, 0.0202, 0.0196, 0.0184, 0.0161, 0.0289, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 13:16:12,130 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=132513.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:16:25,773 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.17 vs. limit=2.0 +2022-12-08 13:16:26,818 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.188e+02 2.073e+02 2.605e+02 3.188e+02 7.778e+02, threshold=5.211e+02, percent-clipped=4.0 +2022-12-08 13:16:26,982 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0059, 4.8259, 4.5281, 4.6917, 4.8224, 4.9953, 5.0292, 5.0518], + device='cuda:2'), covar=tensor([0.0798, 0.0373, 0.1856, 0.2509, 0.0661, 0.0700, 0.0708, 0.0687], + device='cuda:2'), in_proj_covar=tensor([0.0394, 0.0276, 0.0453, 0.0573, 0.0352, 0.0453, 0.0393, 0.0400], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:16:37,263 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1006, 2.4446, 3.9879, 4.1778, 3.9272, 2.4406, 4.1588, 3.1031], + device='cuda:2'), covar=tensor([0.0414, 0.1288, 0.0827, 0.0414, 0.0524, 0.1925, 0.0479, 0.1016], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0261, 0.0377, 0.0331, 0.0273, 0.0309, 0.0313, 0.0277], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 13:16:48,241 INFO [train.py:873] (2/4) Epoch 18, batch 4000, loss[loss=0.1227, simple_loss=0.1528, pruned_loss=0.04633, over 14539.00 frames. ], tot_loss[loss=0.1055, simple_loss=0.1418, pruned_loss=0.03459, over 1935196.09 frames. ], batch size: 49, lr: 4.36e-03, grad_scale: 8.0 +2022-12-08 13:16:53,534 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=132561.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:17:54,707 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.290e+02 2.137e+02 2.506e+02 3.319e+02 7.426e+02, threshold=5.012e+02, percent-clipped=3.0 +2022-12-08 13:18:16,276 INFO [train.py:873] (2/4) Epoch 18, batch 4100, loss[loss=0.09911, simple_loss=0.1395, pruned_loss=0.02936, over 14254.00 frames. ], tot_loss[loss=0.1056, simple_loss=0.1421, pruned_loss=0.0346, over 1966450.68 frames. ], batch size: 76, lr: 4.36e-03, grad_scale: 8.0 +2022-12-08 13:18:20,660 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=132660.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:18:24,079 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9921, 2.6814, 4.9380, 3.4568, 4.7575, 2.3983, 3.6917, 4.7489], + device='cuda:2'), covar=tensor([0.0429, 0.3394, 0.0371, 0.5221, 0.0499, 0.3103, 0.1241, 0.0566], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0199, 0.0220, 0.0270, 0.0239, 0.0202, 0.0201, 0.0221], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 13:18:44,616 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=132687.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:19:05,242 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=132710.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 13:19:14,426 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=132721.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:19:15,206 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.2938, 5.0580, 4.7303, 4.8834, 4.9948, 5.1969, 5.2460, 5.2249], + device='cuda:2'), covar=tensor([0.0770, 0.0385, 0.1983, 0.2386, 0.0635, 0.0845, 0.0673, 0.0911], + device='cuda:2'), in_proj_covar=tensor([0.0393, 0.0275, 0.0448, 0.0569, 0.0349, 0.0450, 0.0389, 0.0398], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:19:15,341 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9866, 2.5638, 4.9440, 3.4156, 4.7027, 2.3587, 3.6432, 4.7886], + device='cuda:2'), covar=tensor([0.0399, 0.3218, 0.0313, 0.4835, 0.0513, 0.2950, 0.1237, 0.0442], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0198, 0.0220, 0.0269, 0.0238, 0.0202, 0.0201, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 13:19:22,531 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.248e+02 2.213e+02 2.659e+02 3.609e+02 5.635e+02, threshold=5.318e+02, percent-clipped=5.0 +2022-12-08 13:19:27,153 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=132735.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:19:36,797 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=132746.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:19:44,648 INFO [train.py:873] (2/4) Epoch 18, batch 4200, loss[loss=0.1098, simple_loss=0.1467, pruned_loss=0.03641, over 11178.00 frames. ], tot_loss[loss=0.1053, simple_loss=0.1417, pruned_loss=0.03442, over 1959968.65 frames. ], batch size: 100, lr: 4.36e-03, grad_scale: 8.0 +2022-12-08 13:19:47,327 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=132758.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:20:23,898 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.61 vs. limit=2.0 +2022-12-08 13:20:30,905 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=132807.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:20:42,233 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8391, 1.7033, 1.9710, 1.5958, 1.9119, 1.7595, 1.6399, 1.8253], + device='cuda:2'), covar=tensor([0.0729, 0.1320, 0.0541, 0.0583, 0.0574, 0.0928, 0.0392, 0.0463], + device='cuda:2'), in_proj_covar=tensor([0.0350, 0.0305, 0.0387, 0.0293, 0.0363, 0.0315, 0.0356, 0.0295], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:20:50,972 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.381e+02 2.134e+02 2.571e+02 3.230e+02 5.703e+02, threshold=5.142e+02, percent-clipped=2.0 +2022-12-08 13:21:08,008 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5719, 1.8962, 2.5397, 2.0852, 2.5025, 2.4345, 2.3149, 2.3019], + device='cuda:2'), covar=tensor([0.0643, 0.2279, 0.0713, 0.1388, 0.0564, 0.1023, 0.0775, 0.1074], + device='cuda:2'), in_proj_covar=tensor([0.0347, 0.0303, 0.0384, 0.0290, 0.0359, 0.0312, 0.0353, 0.0292], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:21:13,203 INFO [train.py:873] (2/4) Epoch 18, batch 4300, loss[loss=0.08511, simple_loss=0.1366, pruned_loss=0.0168, over 14002.00 frames. ], tot_loss[loss=0.1055, simple_loss=0.142, pruned_loss=0.0345, over 1972266.85 frames. ], batch size: 26, lr: 4.35e-03, grad_scale: 4.0 +2022-12-08 13:22:20,490 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.080e+02 2.027e+02 2.464e+02 3.102e+02 5.439e+02, threshold=4.929e+02, percent-clipped=1.0 +2022-12-08 13:22:33,793 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.47 vs. limit=2.0 +2022-12-08 13:22:41,319 INFO [train.py:873] (2/4) Epoch 18, batch 4400, loss[loss=0.08839, simple_loss=0.1358, pruned_loss=0.02049, over 14265.00 frames. ], tot_loss[loss=0.106, simple_loss=0.1421, pruned_loss=0.03493, over 1934630.26 frames. ], batch size: 31, lr: 4.35e-03, grad_scale: 8.0 +2022-12-08 13:23:23,122 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6638, 4.4251, 4.1210, 4.3274, 4.4704, 4.5772, 4.6516, 4.6492], + device='cuda:2'), covar=tensor([0.0866, 0.0508, 0.1987, 0.2452, 0.0676, 0.0779, 0.0762, 0.0798], + device='cuda:2'), in_proj_covar=tensor([0.0393, 0.0276, 0.0453, 0.0572, 0.0351, 0.0456, 0.0393, 0.0402], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:23:35,956 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=133016.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:23:41,643 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.30 vs. limit=5.0 +2022-12-08 13:23:48,922 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.037e+02 1.881e+02 2.190e+02 2.815e+02 5.316e+02, threshold=4.380e+02, percent-clipped=1.0 +2022-12-08 13:23:56,580 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5381, 3.2678, 2.5555, 3.6594, 3.5134, 3.5158, 3.0742, 2.5753], + device='cuda:2'), covar=tensor([0.0902, 0.1268, 0.2984, 0.0686, 0.0866, 0.1097, 0.1364, 0.2990], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0288, 0.0258, 0.0291, 0.0321, 0.0303, 0.0253, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:24:10,921 INFO [train.py:873] (2/4) Epoch 18, batch 4500, loss[loss=0.09488, simple_loss=0.1355, pruned_loss=0.02713, over 13888.00 frames. ], tot_loss[loss=0.1055, simple_loss=0.1418, pruned_loss=0.03462, over 1942251.79 frames. ], batch size: 20, lr: 4.35e-03, grad_scale: 8.0 +2022-12-08 13:24:24,960 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8960, 0.8188, 0.7075, 0.8522, 0.8385, 0.3840, 0.8043, 0.8553], + device='cuda:2'), covar=tensor([0.0364, 0.0541, 0.0747, 0.0473, 0.0417, 0.0340, 0.0844, 0.0752], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0035, 0.0040, 0.0034, 0.0034, 0.0049, 0.0036, 0.0038], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 13:24:31,945 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9453, 4.1390, 4.1499, 3.7832, 4.0672, 4.2466, 1.6147, 3.8551], + device='cuda:2'), covar=tensor([0.0492, 0.0558, 0.0611, 0.0704, 0.0544, 0.0353, 0.3859, 0.0415], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0177, 0.0147, 0.0149, 0.0208, 0.0141, 0.0159, 0.0196], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 13:24:52,389 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=133102.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:25:15,646 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0842, 3.3910, 3.2094, 3.2564, 2.5233, 3.3788, 3.3911, 1.8990], + device='cuda:2'), covar=tensor([0.1196, 0.0656, 0.0806, 0.0682, 0.0868, 0.0460, 0.0590, 0.1754], + device='cuda:2'), in_proj_covar=tensor([0.0139, 0.0091, 0.0071, 0.0077, 0.0100, 0.0092, 0.0102, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 13:25:17,480 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.181e+02 2.087e+02 2.404e+02 3.093e+02 4.922e+02, threshold=4.807e+02, percent-clipped=5.0 +2022-12-08 13:25:24,239 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0946, 2.1870, 3.0880, 3.1737, 3.0481, 2.2021, 3.0636, 2.4572], + device='cuda:2'), covar=tensor([0.0466, 0.1218, 0.0874, 0.0523, 0.0606, 0.1660, 0.0553, 0.1076], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0261, 0.0377, 0.0332, 0.0274, 0.0310, 0.0313, 0.0277], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 13:25:38,359 INFO [train.py:873] (2/4) Epoch 18, batch 4600, loss[loss=0.08603, simple_loss=0.1218, pruned_loss=0.02513, over 4986.00 frames. ], tot_loss[loss=0.106, simple_loss=0.1423, pruned_loss=0.03487, over 2000341.89 frames. ], batch size: 100, lr: 4.35e-03, grad_scale: 8.0 +2022-12-08 13:25:40,309 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8970, 1.6484, 2.0280, 1.6281, 2.0197, 1.8317, 1.7174, 1.8861], + device='cuda:2'), covar=tensor([0.0693, 0.1750, 0.0543, 0.0622, 0.0513, 0.0999, 0.0424, 0.0464], + device='cuda:2'), in_proj_covar=tensor([0.0352, 0.0308, 0.0390, 0.0296, 0.0364, 0.0318, 0.0357, 0.0296], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:26:07,746 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0016, 2.5836, 3.8525, 2.9325, 3.8222, 3.6976, 3.6256, 3.3225], + device='cuda:2'), covar=tensor([0.0787, 0.2757, 0.0960, 0.1721, 0.0892, 0.1020, 0.1478, 0.1480], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0308, 0.0390, 0.0296, 0.0364, 0.0318, 0.0358, 0.0296], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:26:14,774 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8338, 1.5569, 2.0505, 1.6344, 1.9050, 1.4436, 1.6785, 1.9276], + device='cuda:2'), covar=tensor([0.2988, 0.2905, 0.0564, 0.2140, 0.1603, 0.1393, 0.1249, 0.0977], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0201, 0.0222, 0.0272, 0.0240, 0.0203, 0.0203, 0.0222], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 13:26:23,694 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0894, 3.5928, 2.8479, 4.3459, 4.0788, 4.0792, 3.6186, 3.0030], + device='cuda:2'), covar=tensor([0.0607, 0.1133, 0.2818, 0.0459, 0.0742, 0.1469, 0.1087, 0.2635], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0288, 0.0259, 0.0290, 0.0320, 0.0302, 0.0253, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:26:44,621 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.241e+02 2.089e+02 2.539e+02 2.980e+02 5.650e+02, threshold=5.079e+02, percent-clipped=2.0 +2022-12-08 13:27:05,959 INFO [train.py:873] (2/4) Epoch 18, batch 4700, loss[loss=0.1045, simple_loss=0.145, pruned_loss=0.03195, over 14207.00 frames. ], tot_loss[loss=0.1066, simple_loss=0.1427, pruned_loss=0.03524, over 1989186.53 frames. ], batch size: 89, lr: 4.35e-03, grad_scale: 8.0 +2022-12-08 13:27:25,468 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.34 vs. limit=5.0 +2022-12-08 13:27:31,976 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.49 vs. limit=2.0 +2022-12-08 13:27:58,979 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=133316.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:28:12,073 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.321e+02 2.140e+02 2.537e+02 3.389e+02 1.353e+03, threshold=5.074e+02, percent-clipped=8.0 +2022-12-08 13:28:32,822 INFO [train.py:873] (2/4) Epoch 18, batch 4800, loss[loss=0.121, simple_loss=0.149, pruned_loss=0.0465, over 5998.00 frames. ], tot_loss[loss=0.105, simple_loss=0.1415, pruned_loss=0.03427, over 1957383.77 frames. ], batch size: 100, lr: 4.35e-03, grad_scale: 8.0 +2022-12-08 13:28:40,336 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=133364.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:28:49,155 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=133374.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 13:28:51,118 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=133376.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:28:53,890 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.15 vs. limit=2.0 +2022-12-08 13:29:12,167 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1057, 2.5368, 5.0776, 3.4872, 4.8179, 2.2126, 3.8294, 4.8700], + device='cuda:2'), covar=tensor([0.0376, 0.3480, 0.0337, 0.5100, 0.0490, 0.3187, 0.1097, 0.0291], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0198, 0.0221, 0.0269, 0.0239, 0.0202, 0.0202, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 13:29:13,882 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=133402.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:29:16,563 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.23 vs. limit=2.0 +2022-12-08 13:29:38,999 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.315e+02 1.939e+02 2.430e+02 2.952e+02 7.648e+02, threshold=4.861e+02, percent-clipped=3.0 +2022-12-08 13:29:42,983 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=133435.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 13:29:44,661 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=133437.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:29:55,366 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=133450.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:29:59,647 INFO [train.py:873] (2/4) Epoch 18, batch 4900, loss[loss=0.1033, simple_loss=0.1442, pruned_loss=0.03124, over 14412.00 frames. ], tot_loss[loss=0.1063, simple_loss=0.1422, pruned_loss=0.03522, over 1887781.08 frames. ], batch size: 41, lr: 4.34e-03, grad_scale: 8.0 +2022-12-08 13:30:16,301 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=133474.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:30:35,584 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.45 vs. limit=5.0 +2022-12-08 13:30:36,480 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.24 vs. limit=2.0 +2022-12-08 13:31:05,365 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.072e+02 2.069e+02 2.523e+02 3.509e+02 9.363e+02, threshold=5.045e+02, percent-clipped=9.0 +2022-12-08 13:31:09,170 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0446, 3.1350, 3.2437, 3.1168, 3.1510, 3.0418, 1.5615, 2.9304], + device='cuda:2'), covar=tensor([0.0485, 0.0411, 0.0348, 0.0438, 0.0357, 0.0672, 0.2811, 0.0330], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0179, 0.0149, 0.0151, 0.0210, 0.0143, 0.0160, 0.0199], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 13:31:09,245 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=133535.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:31:26,068 INFO [train.py:873] (2/4) Epoch 18, batch 5000, loss[loss=0.1296, simple_loss=0.1684, pruned_loss=0.04546, over 14346.00 frames. ], tot_loss[loss=0.1065, simple_loss=0.1419, pruned_loss=0.03553, over 1808891.37 frames. ], batch size: 25, lr: 4.34e-03, grad_scale: 4.0 +2022-12-08 13:32:32,831 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.157e+02 2.004e+02 2.618e+02 3.128e+02 4.580e+02, threshold=5.235e+02, percent-clipped=0.0 +2022-12-08 13:32:50,866 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=133652.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:32:53,181 INFO [train.py:873] (2/4) Epoch 18, batch 5100, loss[loss=0.1064, simple_loss=0.1418, pruned_loss=0.03554, over 14241.00 frames. ], tot_loss[loss=0.1059, simple_loss=0.1417, pruned_loss=0.03504, over 1909598.70 frames. ], batch size: 46, lr: 4.34e-03, grad_scale: 4.0 +2022-12-08 13:33:35,068 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.50 vs. limit=5.0 +2022-12-08 13:33:44,276 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=133713.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:33:58,867 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=133730.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 13:34:00,753 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.056e+02 2.175e+02 2.651e+02 3.112e+02 1.000e+03, threshold=5.302e+02, percent-clipped=1.0 +2022-12-08 13:34:00,889 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=133732.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:34:21,230 INFO [train.py:873] (2/4) Epoch 18, batch 5200, loss[loss=0.09964, simple_loss=0.135, pruned_loss=0.03212, over 14201.00 frames. ], tot_loss[loss=0.106, simple_loss=0.1418, pruned_loss=0.03511, over 1931817.13 frames. ], batch size: 94, lr: 4.34e-03, grad_scale: 8.0 +2022-12-08 13:35:21,819 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.72 vs. limit=2.0 +2022-12-08 13:35:27,056 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=133830.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:35:29,273 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.226e+02 1.917e+02 2.403e+02 3.012e+02 6.543e+02, threshold=4.806e+02, percent-clipped=3.0 +2022-12-08 13:35:33,998 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-12-08 13:35:40,667 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.49 vs. limit=2.0 +2022-12-08 13:35:48,361 INFO [train.py:873] (2/4) Epoch 18, batch 5300, loss[loss=0.08373, simple_loss=0.1204, pruned_loss=0.02352, over 13912.00 frames. ], tot_loss[loss=0.1054, simple_loss=0.1414, pruned_loss=0.03467, over 1924585.12 frames. ], batch size: 20, lr: 4.34e-03, grad_scale: 4.0 +2022-12-08 13:35:56,466 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.46 vs. limit=5.0 +2022-12-08 13:36:13,049 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.42 vs. limit=5.0 +2022-12-08 13:36:20,621 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2663, 2.1506, 2.4430, 1.4969, 1.7054, 2.1344, 1.4164, 2.2693], + device='cuda:2'), covar=tensor([0.1182, 0.1476, 0.0710, 0.2649, 0.2437, 0.0930, 0.2800, 0.0924], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0104, 0.0096, 0.0101, 0.0117, 0.0092, 0.0118, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 13:36:30,305 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0233, 1.6394, 3.6135, 3.3312, 3.4657, 3.6221, 3.0459, 3.6648], + device='cuda:2'), covar=tensor([0.1496, 0.1697, 0.0147, 0.0299, 0.0288, 0.0183, 0.0295, 0.0137], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0156, 0.0129, 0.0166, 0.0146, 0.0142, 0.0124, 0.0122], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 13:36:39,176 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.18 vs. limit=5.0 +2022-12-08 13:36:55,585 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.086e+02 2.076e+02 2.589e+02 3.060e+02 6.026e+02, threshold=5.178e+02, percent-clipped=1.0 +2022-12-08 13:37:02,992 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0517, 2.9120, 2.2417, 3.0917, 2.9560, 2.9753, 2.6999, 2.3435], + device='cuda:2'), covar=tensor([0.0827, 0.1165, 0.2722, 0.0802, 0.1036, 0.0855, 0.1187, 0.2446], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0287, 0.0258, 0.0291, 0.0319, 0.0302, 0.0253, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:37:14,386 INFO [train.py:873] (2/4) Epoch 18, batch 5400, loss[loss=0.138, simple_loss=0.1427, pruned_loss=0.06661, over 1172.00 frames. ], tot_loss[loss=0.1052, simple_loss=0.1415, pruned_loss=0.03447, over 1944059.99 frames. ], batch size: 100, lr: 4.34e-03, grad_scale: 4.0 +2022-12-08 13:37:51,052 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=133997.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:38:00,626 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=134008.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:38:20,343 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=134030.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 13:38:22,055 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=134032.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:38:22,722 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.308e+02 1.858e+02 2.390e+02 3.308e+02 6.761e+02, threshold=4.781e+02, percent-clipped=5.0 +2022-12-08 13:38:42,050 INFO [train.py:873] (2/4) Epoch 18, batch 5500, loss[loss=0.119, simple_loss=0.1147, pruned_loss=0.06167, over 1289.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1406, pruned_loss=0.03349, over 1957492.57 frames. ], batch size: 100, lr: 4.33e-03, grad_scale: 4.0 +2022-12-08 13:38:45,228 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=134058.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:38:47,203 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.69 vs. limit=5.0 +2022-12-08 13:38:49,773 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 13:39:02,235 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=134078.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 13:39:03,900 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=134080.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:39:47,458 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=134130.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:39:50,150 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.270e+02 2.212e+02 2.722e+02 3.305e+02 6.043e+02, threshold=5.445e+02, percent-clipped=3.0 +2022-12-08 13:40:09,467 INFO [train.py:873] (2/4) Epoch 18, batch 5600, loss[loss=0.1011, simple_loss=0.1421, pruned_loss=0.03009, over 14178.00 frames. ], tot_loss[loss=0.104, simple_loss=0.1408, pruned_loss=0.03355, over 1976520.84 frames. ], batch size: 84, lr: 4.33e-03, grad_scale: 8.0 +2022-12-08 13:40:24,809 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7494, 3.0409, 2.9937, 3.1230, 2.3718, 3.1321, 2.9086, 1.8065], + device='cuda:2'), covar=tensor([0.1079, 0.1093, 0.0856, 0.0420, 0.0868, 0.0463, 0.0804, 0.1670], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0090, 0.0070, 0.0077, 0.0099, 0.0091, 0.0101, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 13:40:29,740 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=134178.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:40:54,073 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-08 13:41:18,751 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.315e+02 1.953e+02 2.384e+02 3.110e+02 5.351e+02, threshold=4.768e+02, percent-clipped=0.0 +2022-12-08 13:41:28,552 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7304, 1.4078, 2.8812, 1.5064, 2.9081, 2.8087, 2.0584, 2.9975], + device='cuda:2'), covar=tensor([0.0349, 0.3003, 0.0493, 0.2199, 0.0479, 0.0567, 0.1152, 0.0359], + device='cuda:2'), in_proj_covar=tensor([0.0171, 0.0153, 0.0159, 0.0166, 0.0166, 0.0177, 0.0131, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:41:35,903 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0135, 1.8879, 1.8989, 1.9795, 2.0264, 1.4168, 1.6201, 1.8948], + device='cuda:2'), covar=tensor([0.0722, 0.0645, 0.0680, 0.0641, 0.0464, 0.0820, 0.0868, 0.0638], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0035, 0.0040, 0.0034, 0.0034, 0.0049, 0.0036, 0.0039], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 13:41:38,338 INFO [train.py:873] (2/4) Epoch 18, batch 5700, loss[loss=0.08963, simple_loss=0.1242, pruned_loss=0.02754, over 6920.00 frames. ], tot_loss[loss=0.1046, simple_loss=0.1414, pruned_loss=0.03387, over 1991278.62 frames. ], batch size: 100, lr: 4.33e-03, grad_scale: 8.0 +2022-12-08 13:41:40,021 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1748, 1.1419, 0.9783, 1.1389, 1.1851, 0.8110, 1.0330, 1.1550], + device='cuda:2'), covar=tensor([0.0526, 0.0536, 0.0544, 0.0481, 0.0433, 0.0448, 0.1045, 0.0725], + device='cuda:2'), in_proj_covar=tensor([0.0037, 0.0035, 0.0040, 0.0034, 0.0034, 0.0048, 0.0036, 0.0039], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 13:42:12,853 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6431, 3.4689, 4.2011, 3.0040, 2.5464, 3.6142, 2.1292, 3.4793], + device='cuda:2'), covar=tensor([0.0722, 0.0840, 0.0477, 0.1309, 0.1812, 0.0663, 0.2713, 0.1122], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0103, 0.0096, 0.0101, 0.0116, 0.0092, 0.0118, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 13:42:17,167 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5204, 4.1930, 4.0274, 4.5386, 4.2106, 4.0024, 4.5450, 3.7447], + device='cuda:2'), covar=tensor([0.0405, 0.1005, 0.0467, 0.0445, 0.0878, 0.0893, 0.0587, 0.0564], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0275, 0.0200, 0.0195, 0.0184, 0.0157, 0.0288, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0002, 0.0002, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 13:42:21,770 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=134304.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:42:25,148 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=134308.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:42:26,071 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=134309.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:42:34,125 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.48 vs. limit=2.0 +2022-12-08 13:42:47,525 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.380e+02 2.136e+02 2.600e+02 3.296e+02 5.182e+02, threshold=5.200e+02, percent-clipped=3.0 +2022-12-08 13:42:49,345 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2721, 2.3012, 1.9166, 2.3557, 2.2704, 2.2599, 2.1099, 2.0286], + device='cuda:2'), covar=tensor([0.1006, 0.0940, 0.1904, 0.0789, 0.1206, 0.0763, 0.1487, 0.1310], + device='cuda:2'), in_proj_covar=tensor([0.0286, 0.0292, 0.0261, 0.0294, 0.0325, 0.0306, 0.0256, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:42:55,818 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0857, 4.9339, 4.6699, 5.0646, 4.6865, 4.3991, 5.1546, 4.8925], + device='cuda:2'), covar=tensor([0.0549, 0.0707, 0.0825, 0.0564, 0.0821, 0.0571, 0.0557, 0.0734], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0149, 0.0152, 0.0167, 0.0152, 0.0128, 0.0174, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 13:43:05,200 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=134353.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:43:06,875 INFO [train.py:873] (2/4) Epoch 18, batch 5800, loss[loss=0.09321, simple_loss=0.1382, pruned_loss=0.02411, over 14266.00 frames. ], tot_loss[loss=0.1054, simple_loss=0.1417, pruned_loss=0.03456, over 1942367.66 frames. ], batch size: 39, lr: 4.33e-03, grad_scale: 8.0 +2022-12-08 13:43:07,830 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=134356.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:43:16,078 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=134365.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:43:20,571 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=134370.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:44:17,490 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.465e+02 2.136e+02 2.508e+02 3.176e+02 6.308e+02, threshold=5.017e+02, percent-clipped=2.0 +2022-12-08 13:44:37,072 INFO [train.py:873] (2/4) Epoch 18, batch 5900, loss[loss=0.09454, simple_loss=0.1209, pruned_loss=0.03409, over 5008.00 frames. ], tot_loss[loss=0.1048, simple_loss=0.1409, pruned_loss=0.03433, over 1894585.67 frames. ], batch size: 100, lr: 4.33e-03, grad_scale: 8.0 +2022-12-08 13:45:21,196 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.67 vs. limit=2.0 +2022-12-08 13:45:41,712 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.42 vs. limit=2.0 +2022-12-08 13:45:45,583 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.224e+02 2.002e+02 2.502e+02 3.052e+02 5.386e+02, threshold=5.003e+02, percent-clipped=2.0 +2022-12-08 13:46:04,787 INFO [train.py:873] (2/4) Epoch 18, batch 6000, loss[loss=0.09503, simple_loss=0.1413, pruned_loss=0.02437, over 14585.00 frames. ], tot_loss[loss=0.1051, simple_loss=0.1408, pruned_loss=0.03468, over 1868581.11 frames. ], batch size: 22, lr: 4.33e-03, grad_scale: 8.0 +2022-12-08 13:46:04,787 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 13:46:17,879 INFO [train.py:905] (2/4) Epoch 18, validation: loss=0.1412, simple_loss=0.1783, pruned_loss=0.05203, over 857387.00 frames. +2022-12-08 13:46:17,880 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 13:47:04,027 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7835, 2.4179, 3.6470, 2.7579, 3.5665, 3.4774, 3.3997, 2.9979], + device='cuda:2'), covar=tensor([0.0883, 0.2823, 0.1007, 0.1786, 0.0771, 0.1048, 0.1384, 0.1750], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0310, 0.0391, 0.0301, 0.0365, 0.0322, 0.0362, 0.0297], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:47:26,072 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.309e+02 2.089e+02 2.569e+02 3.442e+02 1.078e+03, threshold=5.138e+02, percent-clipped=4.0 +2022-12-08 13:47:44,031 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=134653.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:47:45,553 INFO [train.py:873] (2/4) Epoch 18, batch 6100, loss[loss=0.1381, simple_loss=0.1354, pruned_loss=0.07041, over 1284.00 frames. ], tot_loss[loss=0.1052, simple_loss=0.141, pruned_loss=0.03475, over 1875925.30 frames. ], batch size: 100, lr: 4.33e-03, grad_scale: 8.0 +2022-12-08 13:47:49,985 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=134660.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:47:54,220 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=134665.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:48:25,747 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=134701.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:48:53,587 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.283e+02 2.174e+02 2.543e+02 3.201e+02 1.223e+03, threshold=5.086e+02, percent-clipped=2.0 +2022-12-08 13:49:12,510 INFO [train.py:873] (2/4) Epoch 18, batch 6200, loss[loss=0.1006, simple_loss=0.1438, pruned_loss=0.02868, over 14413.00 frames. ], tot_loss[loss=0.1049, simple_loss=0.141, pruned_loss=0.03437, over 1956780.83 frames. ], batch size: 41, lr: 4.32e-03, grad_scale: 8.0 +2022-12-08 13:49:14,523 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1877, 1.4543, 3.2805, 1.5782, 3.0845, 3.2877, 2.4048, 3.5008], + device='cuda:2'), covar=tensor([0.0264, 0.3186, 0.0446, 0.2323, 0.1191, 0.0483, 0.1034, 0.0250], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0157, 0.0163, 0.0169, 0.0171, 0.0182, 0.0133, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:49:20,762 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3940, 4.2546, 4.1028, 4.4106, 4.0893, 3.7489, 4.4819, 4.2328], + device='cuda:2'), covar=tensor([0.0668, 0.0891, 0.0884, 0.0656, 0.0765, 0.0607, 0.0644, 0.0828], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0148, 0.0151, 0.0166, 0.0151, 0.0127, 0.0173, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 13:49:52,987 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.54 vs. limit=2.0 +2022-12-08 13:50:14,352 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4437, 2.9351, 4.3024, 3.2211, 4.2489, 4.1085, 4.0478, 3.6940], + device='cuda:2'), covar=tensor([0.0790, 0.2642, 0.0939, 0.1725, 0.0772, 0.0949, 0.1432, 0.1608], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0308, 0.0391, 0.0301, 0.0363, 0.0323, 0.0362, 0.0297], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:50:20,437 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.216e+02 2.042e+02 2.445e+02 3.055e+02 6.497e+02, threshold=4.889e+02, percent-clipped=2.0 +2022-12-08 13:50:40,211 INFO [train.py:873] (2/4) Epoch 18, batch 6300, loss[loss=0.1343, simple_loss=0.134, pruned_loss=0.06727, over 1282.00 frames. ], tot_loss[loss=0.1041, simple_loss=0.1409, pruned_loss=0.03361, over 1963988.46 frames. ], batch size: 100, lr: 4.32e-03, grad_scale: 8.0 +2022-12-08 13:50:46,331 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=134862.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:51:39,636 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=134923.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:51:47,986 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.134e+02 2.107e+02 2.599e+02 3.257e+02 7.432e+02, threshold=5.197e+02, percent-clipped=1.0 +2022-12-08 13:51:53,688 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 13:52:07,217 INFO [train.py:873] (2/4) Epoch 18, batch 6400, loss[loss=0.1134, simple_loss=0.147, pruned_loss=0.03995, over 14178.00 frames. ], tot_loss[loss=0.1041, simple_loss=0.141, pruned_loss=0.03364, over 1949503.14 frames. ], batch size: 84, lr: 4.32e-03, grad_scale: 8.0 +2022-12-08 13:52:12,161 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=134960.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:52:16,279 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=134965.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:52:57,647 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=135008.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:53:01,790 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=135013.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:53:18,809 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.337e+02 2.100e+02 2.565e+02 3.370e+02 5.862e+02, threshold=5.131e+02, percent-clipped=3.0 +2022-12-08 13:53:38,284 INFO [train.py:873] (2/4) Epoch 18, batch 6500, loss[loss=0.1018, simple_loss=0.1407, pruned_loss=0.03146, over 12785.00 frames. ], tot_loss[loss=0.1046, simple_loss=0.1411, pruned_loss=0.0341, over 1953288.69 frames. ], batch size: 100, lr: 4.32e-03, grad_scale: 8.0 +2022-12-08 13:54:20,385 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5034, 3.2811, 3.1845, 3.5232, 3.3175, 3.4628, 3.5586, 2.9766], + device='cuda:2'), covar=tensor([0.0540, 0.1046, 0.0618, 0.0506, 0.0807, 0.0410, 0.0650, 0.0655], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0281, 0.0204, 0.0201, 0.0188, 0.0162, 0.0295, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 13:54:44,770 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.12 vs. limit=2.0 +2022-12-08 13:54:46,810 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.352e+02 2.038e+02 2.650e+02 3.158e+02 7.242e+02, threshold=5.300e+02, percent-clipped=3.0 +2022-12-08 13:55:05,525 INFO [train.py:873] (2/4) Epoch 18, batch 6600, loss[loss=0.1691, simple_loss=0.1579, pruned_loss=0.09017, over 1224.00 frames. ], tot_loss[loss=0.1048, simple_loss=0.1408, pruned_loss=0.03439, over 1891609.36 frames. ], batch size: 100, lr: 4.32e-03, grad_scale: 8.0 +2022-12-08 13:55:20,586 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=135171.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:55:35,659 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1708, 2.2478, 4.9523, 4.5223, 4.3665, 5.0855, 4.7142, 5.0983], + device='cuda:2'), covar=tensor([0.1500, 0.1352, 0.0097, 0.0187, 0.0239, 0.0112, 0.0135, 0.0104], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0157, 0.0131, 0.0169, 0.0147, 0.0143, 0.0126, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 13:56:01,516 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=135218.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:56:13,611 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=135232.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 13:56:14,315 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.473e+02 2.150e+02 2.615e+02 3.167e+02 5.492e+02, threshold=5.230e+02, percent-clipped=1.0 +2022-12-08 13:56:34,472 INFO [train.py:873] (2/4) Epoch 18, batch 6700, loss[loss=0.08515, simple_loss=0.1152, pruned_loss=0.02754, over 4928.00 frames. ], tot_loss[loss=0.1049, simple_loss=0.1407, pruned_loss=0.0345, over 1834769.51 frames. ], batch size: 100, lr: 4.32e-03, grad_scale: 8.0 +2022-12-08 13:56:45,687 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1560, 2.1904, 4.8746, 4.4031, 4.2648, 4.9682, 4.7355, 5.0452], + device='cuda:2'), covar=tensor([0.1628, 0.1452, 0.0124, 0.0256, 0.0265, 0.0149, 0.0114, 0.0108], + device='cuda:2'), in_proj_covar=tensor([0.0144, 0.0156, 0.0130, 0.0168, 0.0146, 0.0142, 0.0125, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 13:57:41,809 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.031e+02 2.097e+02 2.445e+02 2.909e+02 5.288e+02, threshold=4.890e+02, percent-clipped=1.0 +2022-12-08 13:58:00,109 INFO [train.py:873] (2/4) Epoch 18, batch 6800, loss[loss=0.0828, simple_loss=0.13, pruned_loss=0.01782, over 14285.00 frames. ], tot_loss[loss=0.1044, simple_loss=0.1406, pruned_loss=0.0341, over 1890909.14 frames. ], batch size: 63, lr: 4.31e-03, grad_scale: 8.0 +2022-12-08 13:58:53,839 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3849, 4.1968, 3.7780, 3.9066, 4.2539, 4.3123, 4.4425, 4.4002], + device='cuda:2'), covar=tensor([0.1464, 0.0710, 0.2748, 0.3889, 0.1033, 0.1381, 0.1187, 0.1235], + device='cuda:2'), in_proj_covar=tensor([0.0395, 0.0278, 0.0453, 0.0578, 0.0354, 0.0458, 0.0388, 0.0402], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:59:07,784 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.077e+02 2.213e+02 2.677e+02 3.514e+02 1.198e+03, threshold=5.353e+02, percent-clipped=7.0 +2022-12-08 13:59:26,336 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6129, 2.4660, 2.2236, 2.3840, 2.5477, 2.5929, 2.6041, 2.6079], + device='cuda:2'), covar=tensor([0.1086, 0.0888, 0.2584, 0.2480, 0.1104, 0.1044, 0.1383, 0.0960], + device='cuda:2'), in_proj_covar=tensor([0.0396, 0.0278, 0.0453, 0.0577, 0.0354, 0.0457, 0.0387, 0.0403], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 13:59:27,059 INFO [train.py:873] (2/4) Epoch 18, batch 6900, loss[loss=0.1147, simple_loss=0.1261, pruned_loss=0.05164, over 2632.00 frames. ], tot_loss[loss=0.1049, simple_loss=0.1412, pruned_loss=0.03429, over 1929172.66 frames. ], batch size: 100, lr: 4.31e-03, grad_scale: 8.0 +2022-12-08 13:59:32,018 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-12-08 14:00:21,726 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=135518.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:00:29,522 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=135527.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:00:35,294 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.613e+02 2.207e+02 2.585e+02 3.410e+02 8.303e+02, threshold=5.169e+02, percent-clipped=6.0 +2022-12-08 14:00:35,516 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0406, 2.8613, 2.3133, 3.1280, 3.0236, 3.0159, 2.6683, 2.2308], + device='cuda:2'), covar=tensor([0.0978, 0.1259, 0.2677, 0.0866, 0.1000, 0.0914, 0.1332, 0.2812], + device='cuda:2'), in_proj_covar=tensor([0.0283, 0.0286, 0.0259, 0.0290, 0.0321, 0.0303, 0.0252, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:00:53,014 INFO [train.py:873] (2/4) Epoch 18, batch 7000, loss[loss=0.1352, simple_loss=0.1531, pruned_loss=0.05864, over 7798.00 frames. ], tot_loss[loss=0.1054, simple_loss=0.1415, pruned_loss=0.03459, over 1970009.45 frames. ], batch size: 100, lr: 4.31e-03, grad_scale: 4.0 +2022-12-08 14:01:03,614 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=135566.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:01:59,461 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=135630.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:02:01,197 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1254, 1.9367, 4.7298, 4.3666, 4.1948, 4.8188, 4.3976, 4.8313], + device='cuda:2'), covar=tensor([0.1616, 0.1653, 0.0118, 0.0234, 0.0265, 0.0150, 0.0145, 0.0121], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0156, 0.0130, 0.0167, 0.0146, 0.0142, 0.0126, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 14:02:02,411 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.259e+02 2.039e+02 2.453e+02 3.005e+02 1.669e+03, threshold=4.906e+02, percent-clipped=2.0 +2022-12-08 14:02:21,586 INFO [train.py:873] (2/4) Epoch 18, batch 7100, loss[loss=0.1377, simple_loss=0.1334, pruned_loss=0.07096, over 1276.00 frames. ], tot_loss[loss=0.1052, simple_loss=0.1413, pruned_loss=0.03448, over 2011368.28 frames. ], batch size: 100, lr: 4.31e-03, grad_scale: 4.0 +2022-12-08 14:02:53,423 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=135691.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:03:16,343 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 14:03:30,725 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.202e+02 1.933e+02 2.466e+02 3.185e+02 5.052e+02, threshold=4.932e+02, percent-clipped=2.0 +2022-12-08 14:03:38,846 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8030, 2.4856, 2.6368, 1.8235, 2.3477, 2.5601, 2.8143, 2.3391], + device='cuda:2'), covar=tensor([0.0786, 0.0660, 0.0943, 0.1395, 0.1151, 0.0886, 0.0762, 0.1312], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0171, 0.0140, 0.0127, 0.0146, 0.0156, 0.0138, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:03:48,866 INFO [train.py:873] (2/4) Epoch 18, batch 7200, loss[loss=0.1066, simple_loss=0.1403, pruned_loss=0.03639, over 14163.00 frames. ], tot_loss[loss=0.1062, simple_loss=0.142, pruned_loss=0.03522, over 1975149.79 frames. ], batch size: 99, lr: 4.31e-03, grad_scale: 8.0 +2022-12-08 14:04:02,691 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8245, 1.1286, 1.2882, 1.2530, 1.0199, 1.2818, 1.1605, 0.7614], + device='cuda:2'), covar=tensor([0.1943, 0.0962, 0.0530, 0.0478, 0.1684, 0.1055, 0.1503, 0.1605], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0090, 0.0071, 0.0077, 0.0102, 0.0092, 0.0102, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:04:29,199 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3721, 1.0187, 1.2270, 0.8630, 1.0957, 1.3706, 1.0094, 1.1268], + device='cuda:2'), covar=tensor([0.0536, 0.0955, 0.0820, 0.0552, 0.1306, 0.0775, 0.0656, 0.1266], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0170, 0.0140, 0.0127, 0.0146, 0.0157, 0.0138, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:04:45,733 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.90 vs. limit=2.0 +2022-12-08 14:04:52,973 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=135827.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:04:58,962 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.262e+02 2.075e+02 2.523e+02 3.199e+02 6.288e+02, threshold=5.045e+02, percent-clipped=2.0 +2022-12-08 14:05:18,068 INFO [train.py:873] (2/4) Epoch 18, batch 7300, loss[loss=0.09699, simple_loss=0.1343, pruned_loss=0.02983, over 14281.00 frames. ], tot_loss[loss=0.1048, simple_loss=0.1409, pruned_loss=0.03436, over 1957482.81 frames. ], batch size: 44, lr: 4.31e-03, grad_scale: 8.0 +2022-12-08 14:05:35,237 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=135875.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:06:27,400 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.731e+01 2.137e+02 2.530e+02 3.235e+02 6.685e+02, threshold=5.060e+02, percent-clipped=2.0 +2022-12-08 14:06:45,614 INFO [train.py:873] (2/4) Epoch 18, batch 7400, loss[loss=0.1218, simple_loss=0.1488, pruned_loss=0.04737, over 5996.00 frames. ], tot_loss[loss=0.1052, simple_loss=0.1412, pruned_loss=0.03461, over 1920103.48 frames. ], batch size: 100, lr: 4.30e-03, grad_scale: 8.0 +2022-12-08 14:07:09,844 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=135982.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:07:13,170 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=135986.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:07:20,259 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=135994.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:07:35,285 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.24 vs. limit=2.0 +2022-12-08 14:07:41,138 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.41 vs. limit=5.0 +2022-12-08 14:07:53,773 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.8560, 5.6919, 5.2831, 5.7958, 5.3583, 5.3880, 5.8946, 5.6514], + device='cuda:2'), covar=tensor([0.0536, 0.0737, 0.0716, 0.0491, 0.0603, 0.0383, 0.0494, 0.0566], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0149, 0.0151, 0.0167, 0.0152, 0.0127, 0.0174, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 14:07:55,414 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.302e+02 2.076e+02 2.483e+02 2.988e+02 6.746e+02, threshold=4.966e+02, percent-clipped=1.0 +2022-12-08 14:08:04,516 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=136043.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:08:07,967 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3628, 4.5048, 4.6641, 3.9058, 4.5293, 4.7714, 1.7139, 4.2080], + device='cuda:2'), covar=tensor([0.0299, 0.0352, 0.0349, 0.0459, 0.0274, 0.0184, 0.3109, 0.0281], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0178, 0.0149, 0.0151, 0.0209, 0.0142, 0.0159, 0.0198], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 14:08:14,775 INFO [train.py:873] (2/4) Epoch 18, batch 7500, loss[loss=0.09756, simple_loss=0.1387, pruned_loss=0.02821, over 14082.00 frames. ], tot_loss[loss=0.1054, simple_loss=0.1413, pruned_loss=0.03475, over 1941507.77 frames. ], batch size: 29, lr: 4.30e-03, grad_scale: 8.0 +2022-12-08 14:08:14,998 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=136055.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:08:52,618 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=8.27 vs. limit=5.0 +2022-12-08 14:09:45,356 INFO [train.py:873] (2/4) Epoch 19, batch 0, loss[loss=0.1297, simple_loss=0.1656, pruned_loss=0.04691, over 13931.00 frames. ], tot_loss[loss=0.1297, simple_loss=0.1656, pruned_loss=0.04691, over 13931.00 frames. ], batch size: 26, lr: 4.19e-03, grad_scale: 8.0 +2022-12-08 14:09:45,357 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 14:09:53,121 INFO [train.py:905] (2/4) Epoch 19, validation: loss=0.1445, simple_loss=0.1825, pruned_loss=0.05324, over 857387.00 frames. +2022-12-08 14:09:53,122 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 14:10:08,830 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.063e+01 1.495e+02 2.536e+02 3.389e+02 8.096e+02, threshold=5.072e+02, percent-clipped=9.0 +2022-12-08 14:10:16,284 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=136142.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:11:10,147 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=136203.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:11:22,946 INFO [train.py:873] (2/4) Epoch 19, batch 100, loss[loss=0.115, simple_loss=0.1412, pruned_loss=0.04441, over 11180.00 frames. ], tot_loss[loss=0.1048, simple_loss=0.1419, pruned_loss=0.0339, over 922299.16 frames. ], batch size: 100, lr: 4.18e-03, grad_scale: 8.0 +2022-12-08 14:11:23,503 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8995, 2.4147, 4.8653, 3.4131, 4.6746, 2.3389, 3.6042, 4.7274], + device='cuda:2'), covar=tensor([0.0433, 0.3613, 0.0319, 0.4732, 0.0479, 0.3043, 0.1239, 0.0329], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0199, 0.0219, 0.0268, 0.0238, 0.0203, 0.0201, 0.0221], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 14:11:23,848 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.66 vs. limit=2.0 +2022-12-08 14:11:35,962 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0028, 1.9855, 2.0992, 2.0883, 2.0429, 1.6861, 1.2700, 1.8780], + device='cuda:2'), covar=tensor([0.0812, 0.0627, 0.0494, 0.0397, 0.0485, 0.1396, 0.2385, 0.0480], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0176, 0.0148, 0.0150, 0.0208, 0.0141, 0.0158, 0.0196], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 14:11:37,513 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.402e+02 2.463e+02 2.960e+02 3.618e+02 1.073e+03, threshold=5.920e+02, percent-clipped=4.0 +2022-12-08 14:12:21,379 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5223, 3.3339, 3.1046, 3.2533, 3.4541, 3.4557, 3.4890, 3.4962], + device='cuda:2'), covar=tensor([0.0950, 0.0682, 0.2210, 0.2441, 0.0849, 0.0923, 0.1035, 0.0853], + device='cuda:2'), in_proj_covar=tensor([0.0398, 0.0278, 0.0453, 0.0572, 0.0352, 0.0457, 0.0389, 0.0402], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:12:22,987 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=136286.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:12:24,657 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7059, 1.9589, 2.6248, 2.0677, 2.6566, 2.5601, 2.4142, 2.2757], + device='cuda:2'), covar=tensor([0.0831, 0.2656, 0.1022, 0.1792, 0.0789, 0.1202, 0.0990, 0.1446], + device='cuda:2'), in_proj_covar=tensor([0.0347, 0.0303, 0.0386, 0.0294, 0.0358, 0.0317, 0.0355, 0.0291], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:12:50,169 INFO [train.py:873] (2/4) Epoch 19, batch 200, loss[loss=0.09497, simple_loss=0.138, pruned_loss=0.02596, over 14277.00 frames. ], tot_loss[loss=0.1033, simple_loss=0.1401, pruned_loss=0.03323, over 1343594.92 frames. ], batch size: 28, lr: 4.18e-03, grad_scale: 4.0 +2022-12-08 14:13:05,477 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=136334.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:13:06,200 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.191e+02 2.129e+02 2.646e+02 3.149e+02 5.242e+02, threshold=5.292e+02, percent-clipped=0.0 +2022-12-08 14:13:06,392 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=136335.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:13:09,289 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=136338.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:13:19,468 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=136350.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:13:59,636 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=136396.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:14:18,704 INFO [train.py:873] (2/4) Epoch 19, batch 300, loss[loss=0.105, simple_loss=0.143, pruned_loss=0.0335, over 14225.00 frames. ], tot_loss[loss=0.1032, simple_loss=0.1396, pruned_loss=0.03345, over 1556017.22 frames. ], batch size: 94, lr: 4.18e-03, grad_scale: 4.0 +2022-12-08 14:14:33,863 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.272e+02 1.988e+02 2.460e+02 2.943e+02 5.876e+02, threshold=4.921e+02, percent-clipped=3.0 +2022-12-08 14:14:43,569 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6385, 2.1407, 4.6261, 2.9838, 4.4157, 2.1647, 3.3106, 4.4478], + device='cuda:2'), covar=tensor([0.0543, 0.4274, 0.0591, 0.5962, 0.0527, 0.3376, 0.1560, 0.0454], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0198, 0.0217, 0.0269, 0.0238, 0.0202, 0.0199, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 14:14:53,407 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.77 vs. limit=5.0 +2022-12-08 14:15:29,507 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=136498.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:15:45,666 INFO [train.py:873] (2/4) Epoch 19, batch 400, loss[loss=0.103, simple_loss=0.1401, pruned_loss=0.033, over 14312.00 frames. ], tot_loss[loss=0.1034, simple_loss=0.1401, pruned_loss=0.03333, over 1728624.18 frames. ], batch size: 31, lr: 4.18e-03, grad_scale: 8.0 +2022-12-08 14:16:01,299 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.382e+02 2.033e+02 2.479e+02 3.164e+02 1.103e+03, threshold=4.959e+02, percent-clipped=2.0 +2022-12-08 14:16:12,636 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.41 vs. limit=2.0 +2022-12-08 14:16:51,188 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3824, 4.1465, 3.8662, 4.0049, 4.2054, 4.2770, 4.3621, 4.3357], + device='cuda:2'), covar=tensor([0.0745, 0.0490, 0.1834, 0.2401, 0.0665, 0.0826, 0.0699, 0.0805], + device='cuda:2'), in_proj_covar=tensor([0.0399, 0.0278, 0.0456, 0.0574, 0.0355, 0.0459, 0.0390, 0.0404], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:17:14,357 INFO [train.py:873] (2/4) Epoch 19, batch 500, loss[loss=0.1239, simple_loss=0.1584, pruned_loss=0.04467, over 14295.00 frames. ], tot_loss[loss=0.1034, simple_loss=0.1405, pruned_loss=0.03319, over 1831412.05 frames. ], batch size: 80, lr: 4.18e-03, grad_scale: 8.0 +2022-12-08 14:17:30,016 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.075e+02 1.937e+02 2.457e+02 2.957e+02 6.738e+02, threshold=4.914e+02, percent-clipped=4.0 +2022-12-08 14:17:32,490 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=136638.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:17:34,059 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8154, 2.8489, 2.6911, 2.9438, 2.5437, 2.6815, 2.8807, 2.7836], + device='cuda:2'), covar=tensor([0.0845, 0.1116, 0.1001, 0.0674, 0.1272, 0.0735, 0.0910, 0.0927], + device='cuda:2'), in_proj_covar=tensor([0.0151, 0.0151, 0.0152, 0.0167, 0.0154, 0.0128, 0.0176, 0.0157], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 14:17:36,774 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=136643.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:17:42,868 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=136650.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:18:08,120 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8331, 2.8373, 2.1836, 2.9169, 2.7822, 2.8280, 2.5703, 2.2879], + device='cuda:2'), covar=tensor([0.1057, 0.1213, 0.2777, 0.1016, 0.1228, 0.0988, 0.1383, 0.2658], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0286, 0.0257, 0.0290, 0.0321, 0.0301, 0.0254, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:18:13,559 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=136686.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:18:18,021 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=136691.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:18:24,484 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=136698.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:18:29,943 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=136704.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:18:40,756 INFO [train.py:873] (2/4) Epoch 19, batch 600, loss[loss=0.1144, simple_loss=0.1494, pruned_loss=0.03976, over 4987.00 frames. ], tot_loss[loss=0.1032, simple_loss=0.14, pruned_loss=0.03325, over 1875312.87 frames. ], batch size: 100, lr: 4.18e-03, grad_scale: 4.0 +2022-12-08 14:18:49,657 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.27 vs. limit=5.0 +2022-12-08 14:18:57,079 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.012e+02 2.028e+02 2.404e+02 2.918e+02 4.938e+02, threshold=4.808e+02, percent-clipped=1.0 +2022-12-08 14:19:51,214 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=136798.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:19:58,166 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7619, 1.0833, 1.2838, 1.2221, 1.0588, 1.2537, 1.0660, 0.8056], + device='cuda:2'), covar=tensor([0.1917, 0.1189, 0.0403, 0.0538, 0.1658, 0.1035, 0.1523, 0.1509], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0092, 0.0071, 0.0077, 0.0102, 0.0093, 0.0103, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:19:58,178 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=136806.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:20:08,084 INFO [train.py:873] (2/4) Epoch 19, batch 700, loss[loss=0.1194, simple_loss=0.1495, pruned_loss=0.04463, over 14279.00 frames. ], tot_loss[loss=0.1034, simple_loss=0.1404, pruned_loss=0.03322, over 1955175.67 frames. ], batch size: 66, lr: 4.17e-03, grad_scale: 4.0 +2022-12-08 14:20:17,982 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7436, 4.0831, 3.9040, 3.7780, 2.8899, 3.9591, 3.7314, 2.1260], + device='cuda:2'), covar=tensor([0.1181, 0.0493, 0.0595, 0.0864, 0.0818, 0.0390, 0.1023, 0.1783], + device='cuda:2'), in_proj_covar=tensor([0.0138, 0.0092, 0.0071, 0.0077, 0.0102, 0.0093, 0.0103, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:20:24,140 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.847e+01 1.962e+02 2.462e+02 3.266e+02 7.518e+02, threshold=4.924e+02, percent-clipped=4.0 +2022-12-08 14:20:32,597 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=136846.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:20:51,838 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=136867.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:21:25,516 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5052, 1.9620, 3.5281, 2.5592, 3.4659, 1.8729, 2.7895, 3.4892], + device='cuda:2'), covar=tensor([0.0840, 0.3876, 0.0630, 0.4569, 0.0877, 0.3393, 0.1384, 0.0719], + device='cuda:2'), in_proj_covar=tensor([0.0255, 0.0198, 0.0219, 0.0271, 0.0239, 0.0203, 0.0201, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 14:21:31,972 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.70 vs. limit=2.0 +2022-12-08 14:21:35,622 INFO [train.py:873] (2/4) Epoch 19, batch 800, loss[loss=0.09832, simple_loss=0.1382, pruned_loss=0.02921, over 14263.00 frames. ], tot_loss[loss=0.1036, simple_loss=0.1401, pruned_loss=0.03355, over 1966466.98 frames. ], batch size: 63, lr: 4.17e-03, grad_scale: 8.0 +2022-12-08 14:21:38,430 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6193, 3.6580, 4.3350, 3.2898, 2.6498, 3.6337, 2.1401, 3.7762], + device='cuda:2'), covar=tensor([0.1668, 0.0695, 0.0433, 0.1076, 0.1788, 0.0667, 0.2639, 0.1101], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0103, 0.0096, 0.0100, 0.0116, 0.0093, 0.0117, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 14:21:42,185 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-12-08 14:21:44,339 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2441, 3.7649, 2.8087, 4.4359, 4.1167, 4.2957, 3.7094, 3.1023], + device='cuda:2'), covar=tensor([0.0715, 0.1173, 0.3197, 0.0500, 0.1056, 0.1017, 0.1192, 0.2893], + device='cuda:2'), in_proj_covar=tensor([0.0280, 0.0285, 0.0257, 0.0288, 0.0320, 0.0301, 0.0253, 0.0240], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:21:50,644 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6022, 1.4102, 3.6502, 1.7312, 3.5500, 3.7320, 2.6916, 3.9678], + device='cuda:2'), covar=tensor([0.0271, 0.3261, 0.0436, 0.2268, 0.0752, 0.0424, 0.0917, 0.0203], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0154, 0.0161, 0.0168, 0.0168, 0.0179, 0.0132, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:21:52,113 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.236e+02 2.241e+02 2.727e+02 3.173e+02 1.328e+03, threshold=5.453e+02, percent-clipped=3.0 +2022-12-08 14:21:54,384 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3541, 2.1170, 4.3387, 3.0790, 4.1913, 2.0005, 3.2526, 4.2190], + device='cuda:2'), covar=tensor([0.0550, 0.3816, 0.0419, 0.4994, 0.0690, 0.3104, 0.1342, 0.0461], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0198, 0.0218, 0.0270, 0.0239, 0.0202, 0.0200, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 14:22:16,794 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4823, 1.1156, 1.9917, 1.8227, 1.8482, 2.0697, 1.4638, 2.0478], + device='cuda:2'), covar=tensor([0.0928, 0.1665, 0.0388, 0.0667, 0.0839, 0.0408, 0.0918, 0.0437], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0156, 0.0131, 0.0170, 0.0147, 0.0142, 0.0127, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 14:22:40,628 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=136991.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:22:47,260 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=136999.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:23:03,205 INFO [train.py:873] (2/4) Epoch 19, batch 900, loss[loss=0.106, simple_loss=0.1481, pruned_loss=0.03193, over 14237.00 frames. ], tot_loss[loss=0.1034, simple_loss=0.1399, pruned_loss=0.03341, over 1986097.71 frames. ], batch size: 46, lr: 4.17e-03, grad_scale: 4.0 +2022-12-08 14:23:20,504 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.232e+02 2.006e+02 2.396e+02 3.046e+02 6.535e+02, threshold=4.793e+02, percent-clipped=3.0 +2022-12-08 14:23:22,046 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=137039.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:23:44,863 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1622, 2.4866, 4.0092, 2.7410, 3.9689, 3.8631, 3.8556, 3.2899], + device='cuda:2'), covar=tensor([0.0713, 0.3483, 0.0870, 0.2148, 0.0823, 0.0953, 0.1464, 0.2282], + device='cuda:2'), in_proj_covar=tensor([0.0350, 0.0308, 0.0388, 0.0296, 0.0361, 0.0321, 0.0359, 0.0294], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:23:53,618 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7336, 3.3566, 2.5298, 3.8649, 3.7332, 3.7833, 3.3276, 2.6020], + device='cuda:2'), covar=tensor([0.0768, 0.1303, 0.3261, 0.0613, 0.0848, 0.1034, 0.1121, 0.3196], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0285, 0.0258, 0.0288, 0.0319, 0.0300, 0.0253, 0.0240], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:24:29,764 INFO [train.py:873] (2/4) Epoch 19, batch 1000, loss[loss=0.1149, simple_loss=0.1491, pruned_loss=0.04037, over 14539.00 frames. ], tot_loss[loss=0.1042, simple_loss=0.1407, pruned_loss=0.03388, over 1989400.38 frames. ], batch size: 24, lr: 4.17e-03, grad_scale: 4.0 +2022-12-08 14:24:47,893 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.038e+02 2.060e+02 2.475e+02 3.449e+02 7.129e+02, threshold=4.949e+02, percent-clipped=6.0 +2022-12-08 14:25:04,787 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.69 vs. limit=2.0 +2022-12-08 14:25:09,573 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=137162.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:25:46,345 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1241, 1.2232, 1.2998, 1.0630, 0.8271, 1.1307, 0.8635, 1.1387], + device='cuda:2'), covar=tensor([0.1951, 0.2923, 0.1491, 0.2863, 0.3191, 0.1442, 0.2186, 0.1512], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0104, 0.0097, 0.0102, 0.0117, 0.0093, 0.0118, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 14:25:48,797 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2633, 3.7743, 3.2361, 3.5147, 2.7595, 3.7133, 3.5311, 2.0575], + device='cuda:2'), covar=tensor([0.1287, 0.0682, 0.1774, 0.0708, 0.0812, 0.0425, 0.0640, 0.1789], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0091, 0.0070, 0.0077, 0.0101, 0.0092, 0.0102, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:25:49,203 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.74 vs. limit=2.0 +2022-12-08 14:25:57,947 INFO [train.py:873] (2/4) Epoch 19, batch 1100, loss[loss=0.09988, simple_loss=0.1372, pruned_loss=0.03129, over 14260.00 frames. ], tot_loss[loss=0.1042, simple_loss=0.1408, pruned_loss=0.0338, over 1992807.45 frames. ], batch size: 39, lr: 4.17e-03, grad_scale: 4.0 +2022-12-08 14:26:15,919 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.041e+02 1.998e+02 2.599e+02 3.181e+02 8.198e+02, threshold=5.197e+02, percent-clipped=8.0 +2022-12-08 14:26:21,161 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9819, 2.0109, 2.0575, 2.0665, 1.9950, 1.6640, 1.3282, 1.8023], + device='cuda:2'), covar=tensor([0.0782, 0.0625, 0.0500, 0.0469, 0.0483, 0.1524, 0.2424, 0.0480], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0175, 0.0147, 0.0148, 0.0208, 0.0142, 0.0158, 0.0195], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 14:26:21,701 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.91 vs. limit=2.0 +2022-12-08 14:26:27,186 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6866, 2.3746, 2.5743, 1.7160, 2.1852, 2.5945, 2.7059, 2.2688], + device='cuda:2'), covar=tensor([0.0838, 0.0733, 0.0898, 0.1442, 0.1214, 0.0873, 0.0753, 0.1296], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0168, 0.0140, 0.0125, 0.0145, 0.0154, 0.0138, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:26:27,223 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5107, 1.9611, 3.5694, 2.6199, 3.4209, 1.9619, 2.7831, 3.5198], + device='cuda:2'), covar=tensor([0.0728, 0.3849, 0.0550, 0.4509, 0.0873, 0.3119, 0.1374, 0.0600], + device='cuda:2'), in_proj_covar=tensor([0.0256, 0.0200, 0.0220, 0.0273, 0.0241, 0.0203, 0.0203, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 14:26:59,450 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0364, 1.7963, 2.1456, 1.9265, 2.0900, 1.3557, 1.7092, 1.8620], + device='cuda:2'), covar=tensor([0.0579, 0.0859, 0.0429, 0.0625, 0.0569, 0.0774, 0.0860, 0.0576], + device='cuda:2'), in_proj_covar=tensor([0.0038, 0.0036, 0.0042, 0.0035, 0.0036, 0.0051, 0.0038, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 14:27:04,817 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3082, 2.6650, 4.2162, 3.1936, 4.1808, 3.8811, 3.9461, 3.5526], + device='cuda:2'), covar=tensor([0.0689, 0.3164, 0.0880, 0.1766, 0.0752, 0.1116, 0.1359, 0.1662], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0312, 0.0392, 0.0301, 0.0367, 0.0325, 0.0363, 0.0297], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:27:10,319 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=137299.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:27:25,930 INFO [train.py:873] (2/4) Epoch 19, batch 1200, loss[loss=0.1155, simple_loss=0.1466, pruned_loss=0.0422, over 14174.00 frames. ], tot_loss[loss=0.1042, simple_loss=0.1412, pruned_loss=0.03361, over 2055576.79 frames. ], batch size: 99, lr: 4.17e-03, grad_scale: 8.0 +2022-12-08 14:27:43,345 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.217e+02 2.257e+02 2.745e+02 3.542e+02 9.944e+02, threshold=5.490e+02, percent-clipped=8.0 +2022-12-08 14:27:51,881 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=137347.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:27:54,792 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9700, 2.0451, 2.1740, 1.3997, 1.6461, 2.0389, 1.4041, 2.0379], + device='cuda:2'), covar=tensor([0.1231, 0.1506, 0.0910, 0.2659, 0.2607, 0.1060, 0.3014, 0.1025], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0103, 0.0096, 0.0101, 0.0116, 0.0092, 0.0117, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 14:28:26,772 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4243, 2.3898, 2.2001, 2.8613, 2.0486, 2.2745, 2.4980, 2.3273], + device='cuda:2'), covar=tensor([0.0367, 0.0619, 0.0464, 0.0136, 0.0411, 0.0389, 0.0303, 0.0452], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0021, 0.0022, 0.0022, 0.0035, 0.0029, 0.0033], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 14:28:33,200 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3167, 2.2765, 1.9314, 2.3960, 2.2404, 2.2711, 2.0754, 1.9651], + device='cuda:2'), covar=tensor([0.1102, 0.0884, 0.2107, 0.0868, 0.1200, 0.0704, 0.1489, 0.1514], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0286, 0.0258, 0.0291, 0.0319, 0.0302, 0.0254, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:28:37,168 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1506, 4.1938, 4.3829, 3.9175, 4.2234, 4.3593, 1.8221, 4.0194], + device='cuda:2'), covar=tensor([0.0291, 0.0323, 0.0294, 0.0426, 0.0275, 0.0276, 0.2910, 0.0270], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0175, 0.0147, 0.0148, 0.0208, 0.0142, 0.0158, 0.0195], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 14:28:54,282 INFO [train.py:873] (2/4) Epoch 19, batch 1300, loss[loss=0.1347, simple_loss=0.1516, pruned_loss=0.05891, over 4993.00 frames. ], tot_loss[loss=0.104, simple_loss=0.1407, pruned_loss=0.03364, over 2039660.54 frames. ], batch size: 100, lr: 4.17e-03, grad_scale: 8.0 +2022-12-08 14:28:54,460 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0524, 2.7406, 3.5682, 2.4514, 2.3745, 3.2464, 1.9018, 3.0872], + device='cuda:2'), covar=tensor([0.0886, 0.0934, 0.0539, 0.1742, 0.1746, 0.0637, 0.2693, 0.0936], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0103, 0.0096, 0.0101, 0.0116, 0.0092, 0.0117, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 14:29:12,660 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.353e+02 1.901e+02 2.314e+02 2.897e+02 7.672e+02, threshold=4.628e+02, percent-clipped=3.0 +2022-12-08 14:29:33,979 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=137462.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:29:49,735 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5722, 1.5308, 1.6170, 1.4135, 1.4188, 1.3809, 1.3610, 1.2346], + device='cuda:2'), covar=tensor([0.0208, 0.0249, 0.0186, 0.0213, 0.0207, 0.0325, 0.0212, 0.0343], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0021, 0.0023, 0.0022, 0.0035, 0.0029, 0.0033], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 14:30:15,564 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=137510.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:30:15,690 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5785, 3.2060, 2.9562, 3.0453, 2.2809, 3.2310, 3.0663, 1.4738], + device='cuda:2'), covar=tensor([0.1366, 0.0818, 0.1189, 0.0687, 0.1111, 0.0511, 0.0878, 0.2283], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0092, 0.0071, 0.0078, 0.0101, 0.0093, 0.0102, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:30:21,588 INFO [train.py:873] (2/4) Epoch 19, batch 1400, loss[loss=0.127, simple_loss=0.1289, pruned_loss=0.06259, over 1236.00 frames. ], tot_loss[loss=0.1035, simple_loss=0.1404, pruned_loss=0.03333, over 1979306.66 frames. ], batch size: 100, lr: 4.16e-03, grad_scale: 4.0 +2022-12-08 14:30:40,256 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.322e+02 2.087e+02 2.410e+02 2.920e+02 5.806e+02, threshold=4.820e+02, percent-clipped=3.0 +2022-12-08 14:30:45,421 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.78 vs. limit=5.0 +2022-12-08 14:31:18,728 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1657, 2.2323, 4.6727, 4.2355, 4.1371, 4.7150, 4.2719, 4.7067], + device='cuda:2'), covar=tensor([0.1382, 0.1298, 0.0101, 0.0231, 0.0227, 0.0130, 0.0215, 0.0101], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0156, 0.0130, 0.0169, 0.0147, 0.0142, 0.0126, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 14:31:29,132 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0385, 2.1557, 1.9381, 2.1594, 1.8104, 2.0452, 2.1171, 2.0485], + device='cuda:2'), covar=tensor([0.0993, 0.1271, 0.1306, 0.1053, 0.1668, 0.1015, 0.1256, 0.1124], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0149, 0.0151, 0.0166, 0.0153, 0.0128, 0.0175, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 14:31:49,006 INFO [train.py:873] (2/4) Epoch 19, batch 1500, loss[loss=0.1229, simple_loss=0.132, pruned_loss=0.05694, over 2650.00 frames. ], tot_loss[loss=0.1044, simple_loss=0.1405, pruned_loss=0.03414, over 1934332.95 frames. ], batch size: 100, lr: 4.16e-03, grad_scale: 4.0 +2022-12-08 14:32:07,872 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.098e+02 1.988e+02 2.718e+02 3.293e+02 9.468e+02, threshold=5.435e+02, percent-clipped=6.0 +2022-12-08 14:32:08,949 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5664, 2.8075, 4.4598, 3.3893, 4.3631, 4.2374, 4.1544, 3.7487], + device='cuda:2'), covar=tensor([0.0698, 0.2892, 0.0760, 0.1387, 0.0662, 0.0840, 0.1483, 0.1710], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0312, 0.0391, 0.0299, 0.0366, 0.0324, 0.0363, 0.0297], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:32:14,212 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9509, 1.5984, 1.9719, 1.3857, 1.6835, 2.0296, 1.8474, 1.7674], + device='cuda:2'), covar=tensor([0.0848, 0.0686, 0.0813, 0.1220, 0.1465, 0.0903, 0.0805, 0.1467], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0170, 0.0142, 0.0126, 0.0146, 0.0156, 0.0139, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:32:54,123 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.10 vs. limit=2.0 +2022-12-08 14:33:15,527 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1137, 1.9755, 2.0487, 2.1262, 2.0085, 2.0185, 2.1784, 1.8779], + device='cuda:2'), covar=tensor([0.1189, 0.1404, 0.0835, 0.0995, 0.1115, 0.0847, 0.1049, 0.0737], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0275, 0.0201, 0.0199, 0.0186, 0.0159, 0.0291, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 14:33:18,036 INFO [train.py:873] (2/4) Epoch 19, batch 1600, loss[loss=0.08736, simple_loss=0.1229, pruned_loss=0.0259, over 4983.00 frames. ], tot_loss[loss=0.1035, simple_loss=0.1399, pruned_loss=0.03351, over 1887235.42 frames. ], batch size: 100, lr: 4.16e-03, grad_scale: 8.0 +2022-12-08 14:33:18,622 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.57 vs. limit=2.0 +2022-12-08 14:33:27,886 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.19 vs. limit=2.0 +2022-12-08 14:33:36,640 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 2.099e+02 2.503e+02 3.045e+02 5.451e+02, threshold=5.005e+02, percent-clipped=1.0 +2022-12-08 14:33:56,135 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5367, 1.6558, 4.4202, 2.5121, 4.3108, 4.7019, 4.1690, 5.0247], + device='cuda:2'), covar=tensor([0.0215, 0.3043, 0.0375, 0.1699, 0.0311, 0.0297, 0.0319, 0.0138], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0154, 0.0162, 0.0167, 0.0167, 0.0179, 0.0132, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:34:25,169 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8359, 1.3556, 2.5338, 2.2510, 2.3912, 2.5203, 1.7874, 2.5401], + device='cuda:2'), covar=tensor([0.1011, 0.1426, 0.0234, 0.0549, 0.0510, 0.0288, 0.0810, 0.0305], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0156, 0.0131, 0.0169, 0.0148, 0.0142, 0.0127, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 14:34:26,094 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8057, 2.6733, 2.1089, 2.7639, 2.6700, 2.6992, 2.4292, 2.1756], + device='cuda:2'), covar=tensor([0.1085, 0.1306, 0.2922, 0.1035, 0.1146, 0.1056, 0.1558, 0.2675], + device='cuda:2'), in_proj_covar=tensor([0.0283, 0.0289, 0.0262, 0.0293, 0.0325, 0.0305, 0.0257, 0.0245], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:34:46,402 INFO [train.py:873] (2/4) Epoch 19, batch 1700, loss[loss=0.1302, simple_loss=0.1637, pruned_loss=0.04834, over 14372.00 frames. ], tot_loss[loss=0.1043, simple_loss=0.1406, pruned_loss=0.03403, over 1906497.21 frames. ], batch size: 73, lr: 4.16e-03, grad_scale: 8.0 +2022-12-08 14:34:46,551 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1095, 2.3515, 2.3931, 2.4630, 2.0343, 2.5147, 2.3298, 1.4714], + device='cuda:2'), covar=tensor([0.0786, 0.0918, 0.0694, 0.0511, 0.1025, 0.0550, 0.0877, 0.1813], + device='cuda:2'), in_proj_covar=tensor([0.0137, 0.0091, 0.0071, 0.0078, 0.0101, 0.0093, 0.0102, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:35:05,301 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.215e+02 2.077e+02 2.557e+02 3.257e+02 5.458e+02, threshold=5.115e+02, percent-clipped=1.0 +2022-12-08 14:35:07,809 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-08 14:35:08,169 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9562, 1.9993, 2.1481, 1.4285, 1.6295, 2.0179, 1.3563, 2.0388], + device='cuda:2'), covar=tensor([0.1310, 0.1956, 0.0914, 0.2559, 0.2512, 0.1050, 0.3278, 0.0985], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0103, 0.0097, 0.0101, 0.0116, 0.0093, 0.0118, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 14:35:38,962 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.59 vs. limit=2.0 +2022-12-08 14:35:53,028 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=137892.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:35:55,485 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=137895.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:35:57,223 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8269, 3.0151, 4.6393, 3.6129, 4.5946, 4.5362, 4.4427, 4.0684], + device='cuda:2'), covar=tensor([0.0784, 0.3020, 0.0846, 0.1709, 0.0678, 0.0925, 0.1601, 0.1416], + device='cuda:2'), in_proj_covar=tensor([0.0354, 0.0310, 0.0390, 0.0299, 0.0364, 0.0322, 0.0361, 0.0296], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:36:14,639 INFO [train.py:873] (2/4) Epoch 19, batch 1800, loss[loss=0.1235, simple_loss=0.1499, pruned_loss=0.04855, over 6929.00 frames. ], tot_loss[loss=0.1045, simple_loss=0.1408, pruned_loss=0.03407, over 1965418.49 frames. ], batch size: 100, lr: 4.16e-03, grad_scale: 8.0 +2022-12-08 14:36:33,146 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.219e+02 2.138e+02 2.501e+02 2.995e+02 6.998e+02, threshold=5.002e+02, percent-clipped=2.0 +2022-12-08 14:36:46,945 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=137953.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:36:49,323 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=137956.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:37:05,402 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3430, 3.0965, 2.8497, 3.0565, 3.2581, 3.2724, 3.3007, 3.3252], + device='cuda:2'), covar=tensor([0.0982, 0.0764, 0.2352, 0.2688, 0.0959, 0.1053, 0.1271, 0.0939], + device='cuda:2'), in_proj_covar=tensor([0.0404, 0.0281, 0.0461, 0.0583, 0.0362, 0.0468, 0.0398, 0.0409], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:37:42,664 INFO [train.py:873] (2/4) Epoch 19, batch 1900, loss[loss=0.146, simple_loss=0.1451, pruned_loss=0.0735, over 1271.00 frames. ], tot_loss[loss=0.1043, simple_loss=0.1407, pruned_loss=0.03392, over 2009274.51 frames. ], batch size: 100, lr: 4.16e-03, grad_scale: 8.0 +2022-12-08 14:38:00,918 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.15 vs. limit=5.0 +2022-12-08 14:38:01,168 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.215e+02 2.140e+02 2.687e+02 3.163e+02 5.638e+02, threshold=5.373e+02, percent-clipped=1.0 +2022-12-08 14:38:15,766 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4877, 4.5649, 4.8571, 4.1530, 4.6163, 4.8865, 1.8802, 4.3391], + device='cuda:2'), covar=tensor([0.0331, 0.0330, 0.0293, 0.0595, 0.0265, 0.0156, 0.3000, 0.0331], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0175, 0.0147, 0.0149, 0.0209, 0.0143, 0.0159, 0.0196], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 14:38:27,888 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1704, 2.0059, 2.0388, 2.0714, 2.1304, 1.9237, 1.8406, 1.4732], + device='cuda:2'), covar=tensor([0.0199, 0.0590, 0.0298, 0.0239, 0.0249, 0.0290, 0.0314, 0.0525], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0023, 0.0021, 0.0023, 0.0022, 0.0035, 0.0029, 0.0033], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 14:38:39,946 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2712, 2.2247, 2.2028, 2.3369, 2.1961, 1.4721, 2.1321, 2.2953], + device='cuda:2'), covar=tensor([0.0806, 0.0645, 0.0712, 0.0908, 0.1030, 0.0886, 0.0932, 0.0936], + device='cuda:2'), in_proj_covar=tensor([0.0038, 0.0036, 0.0041, 0.0034, 0.0036, 0.0050, 0.0038, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 14:38:46,738 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9810, 1.8236, 4.1078, 3.8064, 3.8909, 4.2045, 3.4626, 4.1703], + device='cuda:2'), covar=tensor([0.1599, 0.1579, 0.0126, 0.0266, 0.0246, 0.0149, 0.0220, 0.0131], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0156, 0.0131, 0.0170, 0.0148, 0.0142, 0.0127, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 14:39:09,901 INFO [train.py:873] (2/4) Epoch 19, batch 2000, loss[loss=0.1113, simple_loss=0.142, pruned_loss=0.04032, over 13935.00 frames. ], tot_loss[loss=0.1047, simple_loss=0.1411, pruned_loss=0.03418, over 1953657.31 frames. ], batch size: 20, lr: 4.15e-03, grad_scale: 8.0 +2022-12-08 14:39:28,120 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.191e+02 1.998e+02 2.546e+02 3.326e+02 7.451e+02, threshold=5.092e+02, percent-clipped=3.0 +2022-12-08 14:40:37,637 INFO [train.py:873] (2/4) Epoch 19, batch 2100, loss[loss=0.1222, simple_loss=0.1548, pruned_loss=0.04484, over 9501.00 frames. ], tot_loss[loss=0.1037, simple_loss=0.1404, pruned_loss=0.03347, over 1940794.45 frames. ], batch size: 100, lr: 4.15e-03, grad_scale: 8.0 +2022-12-08 14:40:56,379 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.080e+02 1.995e+02 2.366e+02 2.965e+02 6.942e+02, threshold=4.733e+02, percent-clipped=2.0 +2022-12-08 14:40:56,588 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6108, 2.5506, 2.0355, 2.6623, 2.4421, 2.4925, 2.3034, 2.1505], + device='cuda:2'), covar=tensor([0.1130, 0.1153, 0.2511, 0.0964, 0.1351, 0.0906, 0.1474, 0.1888], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0288, 0.0259, 0.0292, 0.0322, 0.0303, 0.0255, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:41:04,815 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=138248.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:41:07,326 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=138251.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:41:11,975 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.74 vs. limit=2.0 +2022-12-08 14:41:24,563 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=138270.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:42:05,016 INFO [train.py:873] (2/4) Epoch 19, batch 2200, loss[loss=0.1069, simple_loss=0.1338, pruned_loss=0.04001, over 4963.00 frames. ], tot_loss[loss=0.1043, simple_loss=0.1407, pruned_loss=0.03397, over 1954291.86 frames. ], batch size: 100, lr: 4.15e-03, grad_scale: 8.0 +2022-12-08 14:42:09,666 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=138322.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:42:17,302 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=138331.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:42:22,959 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.147e+02 2.085e+02 2.687e+02 3.331e+02 7.282e+02, threshold=5.373e+02, percent-clipped=5.0 +2022-12-08 14:43:02,713 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=138383.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:43:14,904 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.6191, 4.6715, 5.0285, 4.1734, 4.8086, 5.1169, 2.0431, 4.4473], + device='cuda:2'), covar=tensor([0.0380, 0.0435, 0.0375, 0.0517, 0.0312, 0.0167, 0.3018, 0.0318], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0177, 0.0149, 0.0151, 0.0210, 0.0144, 0.0159, 0.0198], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 14:43:15,713 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8839, 1.9237, 1.7075, 1.9600, 1.7918, 1.7184, 1.2151, 1.5828], + device='cuda:2'), covar=tensor([0.1184, 0.1029, 0.1139, 0.0664, 0.1165, 0.1590, 0.3398, 0.1376], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0177, 0.0149, 0.0151, 0.0210, 0.0144, 0.0159, 0.0198], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 14:43:32,136 INFO [train.py:873] (2/4) Epoch 19, batch 2300, loss[loss=0.1288, simple_loss=0.1353, pruned_loss=0.06115, over 2573.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1404, pruned_loss=0.03357, over 1966228.20 frames. ], batch size: 100, lr: 4.15e-03, grad_scale: 8.0 +2022-12-08 14:43:50,854 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.988e+01 2.098e+02 2.453e+02 3.240e+02 6.762e+02, threshold=4.906e+02, percent-clipped=1.0 +2022-12-08 14:44:46,874 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.6457, 1.6324, 1.7040, 1.7985, 1.8109, 1.1964, 1.6170, 1.5828], + device='cuda:2'), covar=tensor([0.0617, 0.0600, 0.0675, 0.0542, 0.0492, 0.0788, 0.0623, 0.0612], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0036, 0.0042, 0.0035, 0.0037, 0.0051, 0.0038, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 14:45:01,196 INFO [train.py:873] (2/4) Epoch 19, batch 2400, loss[loss=0.08744, simple_loss=0.1348, pruned_loss=0.02006, over 14287.00 frames. ], tot_loss[loss=0.1033, simple_loss=0.14, pruned_loss=0.03328, over 1968399.79 frames. ], batch size: 39, lr: 4.15e-03, grad_scale: 8.0 +2022-12-08 14:45:18,700 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.048e+02 2.033e+02 2.636e+02 3.505e+02 6.447e+02, threshold=5.271e+02, percent-clipped=6.0 +2022-12-08 14:45:28,188 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=138548.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:45:30,909 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=138551.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:45:31,783 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7814, 2.6401, 2.0984, 2.8047, 2.6585, 2.7064, 2.3892, 2.1070], + device='cuda:2'), covar=tensor([0.1221, 0.1325, 0.2943, 0.1055, 0.1221, 0.0921, 0.1501, 0.2667], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0288, 0.0260, 0.0292, 0.0322, 0.0303, 0.0256, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:45:52,820 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.95 vs. limit=2.0 +2022-12-08 14:46:04,974 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3598, 3.1108, 3.0469, 2.1618, 2.8632, 3.1116, 3.4892, 2.7430], + device='cuda:2'), covar=tensor([0.0642, 0.0685, 0.0794, 0.1249, 0.0907, 0.0696, 0.0587, 0.1060], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0167, 0.0139, 0.0124, 0.0143, 0.0154, 0.0137, 0.0141], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:46:07,277 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-12-08 14:46:08,894 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.11 vs. limit=5.0 +2022-12-08 14:46:09,973 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=138596.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:46:12,444 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=138599.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:46:28,222 INFO [train.py:873] (2/4) Epoch 19, batch 2500, loss[loss=0.1009, simple_loss=0.1431, pruned_loss=0.02937, over 14241.00 frames. ], tot_loss[loss=0.1032, simple_loss=0.1402, pruned_loss=0.03307, over 2036681.81 frames. ], batch size: 37, lr: 4.15e-03, grad_scale: 4.0 +2022-12-08 14:46:36,697 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=138626.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:46:47,813 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.473e+02 2.139e+02 2.554e+02 3.094e+02 5.985e+02, threshold=5.107e+02, percent-clipped=2.0 +2022-12-08 14:47:21,852 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=138678.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:47:24,826 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=138681.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:47:29,033 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=138686.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:47:38,274 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=138697.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:47:50,439 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.1169, 4.8115, 4.5067, 4.6516, 4.7371, 4.9533, 5.0066, 5.0101], + device='cuda:2'), covar=tensor([0.0612, 0.0419, 0.1938, 0.2484, 0.0666, 0.0719, 0.0938, 0.0782], + device='cuda:2'), in_proj_covar=tensor([0.0398, 0.0275, 0.0450, 0.0569, 0.0356, 0.0461, 0.0389, 0.0402], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:47:55,443 INFO [train.py:873] (2/4) Epoch 19, batch 2600, loss[loss=0.1122, simple_loss=0.1464, pruned_loss=0.03895, over 11181.00 frames. ], tot_loss[loss=0.1046, simple_loss=0.1408, pruned_loss=0.03415, over 1942822.86 frames. ], batch size: 100, lr: 4.15e-03, grad_scale: 2.0 +2022-12-08 14:48:15,370 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.052e+02 2.043e+02 2.512e+02 3.155e+02 8.164e+02, threshold=5.025e+02, percent-clipped=4.0 +2022-12-08 14:48:17,547 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=138742.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:48:21,891 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=138747.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:48:31,226 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0635, 2.1166, 1.9232, 2.1537, 1.8181, 2.0256, 2.1201, 2.0621], + device='cuda:2'), covar=tensor([0.0927, 0.1162, 0.1112, 0.0838, 0.1312, 0.0957, 0.1079, 0.1021], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0148, 0.0150, 0.0166, 0.0152, 0.0126, 0.0174, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 14:48:31,322 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=138758.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:48:57,348 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.42 vs. limit=5.0 +2022-12-08 14:49:23,300 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=2.00 vs. limit=2.0 +2022-12-08 14:49:23,532 INFO [train.py:873] (2/4) Epoch 19, batch 2700, loss[loss=0.1617, simple_loss=0.151, pruned_loss=0.08619, over 1230.00 frames. ], tot_loss[loss=0.1041, simple_loss=0.1404, pruned_loss=0.03386, over 1897522.58 frames. ], batch size: 100, lr: 4.14e-03, grad_scale: 2.0 +2022-12-08 14:49:43,636 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.077e+02 2.157e+02 2.518e+02 3.000e+02 5.718e+02, threshold=5.036e+02, percent-clipped=2.0 +2022-12-08 14:50:44,383 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=138909.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:50:51,208 INFO [train.py:873] (2/4) Epoch 19, batch 2800, loss[loss=0.113, simple_loss=0.1281, pruned_loss=0.04893, over 2640.00 frames. ], tot_loss[loss=0.1046, simple_loss=0.1409, pruned_loss=0.03413, over 1898770.64 frames. ], batch size: 100, lr: 4.14e-03, grad_scale: 4.0 +2022-12-08 14:50:59,230 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=138926.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:51:10,963 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.214e+02 2.185e+02 2.764e+02 3.570e+02 5.389e+02, threshold=5.528e+02, percent-clipped=1.0 +2022-12-08 14:51:37,507 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=138970.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 14:51:38,297 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4862, 2.1986, 2.4325, 1.6314, 2.0537, 2.3620, 2.4837, 2.1989], + device='cuda:2'), covar=tensor([0.0847, 0.0733, 0.0954, 0.1386, 0.1377, 0.0865, 0.0711, 0.1322], + device='cuda:2'), in_proj_covar=tensor([0.0153, 0.0166, 0.0139, 0.0124, 0.0142, 0.0154, 0.0136, 0.0140], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:51:40,713 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=138974.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:51:44,297 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=138978.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:51:46,941 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5906, 2.0571, 3.5653, 2.6153, 3.5406, 1.9227, 2.7335, 3.5318], + device='cuda:2'), covar=tensor([0.0804, 0.3531, 0.0619, 0.4559, 0.0712, 0.2887, 0.1358, 0.0589], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0197, 0.0218, 0.0266, 0.0239, 0.0200, 0.0200, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 14:51:52,032 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0742, 2.8104, 3.6577, 2.4916, 2.3719, 3.1449, 1.7614, 3.2067], + device='cuda:2'), covar=tensor([0.0747, 0.1037, 0.0496, 0.1927, 0.1856, 0.0778, 0.2775, 0.0896], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0103, 0.0096, 0.0101, 0.0116, 0.0092, 0.0117, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 14:52:18,363 INFO [train.py:873] (2/4) Epoch 19, batch 2900, loss[loss=0.1314, simple_loss=0.1246, pruned_loss=0.06909, over 1252.00 frames. ], tot_loss[loss=0.1041, simple_loss=0.1405, pruned_loss=0.03385, over 1905616.78 frames. ], batch size: 100, lr: 4.14e-03, grad_scale: 4.0 +2022-12-08 14:52:26,194 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=139026.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:52:35,644 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=139037.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:52:38,099 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.182e+02 1.948e+02 2.422e+02 2.806e+02 5.439e+02, threshold=4.845e+02, percent-clipped=0.0 +2022-12-08 14:52:39,978 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=139042.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:52:49,569 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=139053.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:53:45,725 INFO [train.py:873] (2/4) Epoch 19, batch 3000, loss[loss=0.09704, simple_loss=0.1352, pruned_loss=0.02944, over 14406.00 frames. ], tot_loss[loss=0.1045, simple_loss=0.141, pruned_loss=0.03401, over 1940851.31 frames. ], batch size: 53, lr: 4.14e-03, grad_scale: 4.0 +2022-12-08 14:53:45,726 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 14:53:50,406 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2039, 4.3254, 4.3932, 3.7650, 4.3113, 4.3730, 1.8451, 4.0652], + device='cuda:2'), covar=tensor([0.0229, 0.0234, 0.0258, 0.0384, 0.0219, 0.0198, 0.3068, 0.0255], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0178, 0.0149, 0.0152, 0.0212, 0.0145, 0.0159, 0.0199], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 14:53:54,214 INFO [train.py:905] (2/4) Epoch 19, validation: loss=0.142, simple_loss=0.1782, pruned_loss=0.05288, over 857387.00 frames. +2022-12-08 14:53:54,215 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 14:54:02,387 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.18 vs. limit=2.0 +2022-12-08 14:54:14,227 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.279e+02 2.026e+02 2.406e+02 3.345e+02 1.533e+03, threshold=4.812e+02, percent-clipped=5.0 +2022-12-08 14:54:20,793 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.15 vs. limit=5.0 +2022-12-08 14:54:30,081 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8312, 2.7836, 2.0617, 2.9036, 2.7281, 2.7713, 2.5318, 2.1861], + device='cuda:2'), covar=tensor([0.1015, 0.1092, 0.2927, 0.0820, 0.1130, 0.1350, 0.1415, 0.2522], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0288, 0.0257, 0.0291, 0.0322, 0.0302, 0.0255, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:54:41,362 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7850, 3.4846, 3.2354, 2.6458, 3.2036, 3.4839, 3.8362, 3.1082], + device='cuda:2'), covar=tensor([0.0545, 0.0908, 0.0781, 0.1057, 0.0909, 0.0606, 0.0742, 0.0877], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0168, 0.0140, 0.0125, 0.0144, 0.0155, 0.0138, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 14:55:22,156 INFO [train.py:873] (2/4) Epoch 19, batch 3100, loss[loss=0.09662, simple_loss=0.1251, pruned_loss=0.03408, over 5988.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1401, pruned_loss=0.03375, over 1885673.26 frames. ], batch size: 100, lr: 4.14e-03, grad_scale: 4.0 +2022-12-08 14:55:28,559 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.9248, 0.7660, 0.8050, 0.8304, 0.8115, 0.6257, 0.8161, 0.8054], + device='cuda:2'), covar=tensor([0.0188, 0.0191, 0.0157, 0.0177, 0.0172, 0.0308, 0.0224, 0.0257], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0021, 0.0023, 0.0022, 0.0036, 0.0029, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 14:55:38,231 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7783, 0.8282, 0.7018, 0.8661, 0.8468, 0.3699, 0.7965, 0.8595], + device='cuda:2'), covar=tensor([0.0465, 0.0520, 0.0575, 0.0538, 0.0416, 0.0432, 0.1014, 0.0793], + device='cuda:2'), in_proj_covar=tensor([0.0038, 0.0036, 0.0041, 0.0035, 0.0036, 0.0050, 0.0038, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 14:55:40,026 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139238.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:55:41,531 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.046e+02 2.036e+02 2.564e+02 2.982e+02 6.918e+02, threshold=5.128e+02, percent-clipped=2.0 +2022-12-08 14:56:03,630 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=139265.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 14:56:13,035 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139276.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:56:24,423 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139289.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:56:33,160 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139299.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:56:48,456 INFO [train.py:873] (2/4) Epoch 19, batch 3200, loss[loss=0.1161, simple_loss=0.1542, pruned_loss=0.03899, over 14507.00 frames. ], tot_loss[loss=0.104, simple_loss=0.1407, pruned_loss=0.03361, over 1894356.03 frames. ], batch size: 49, lr: 4.14e-03, grad_scale: 8.0 +2022-12-08 14:57:06,565 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=139337.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:57:06,603 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139337.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 14:57:08,977 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.774e+01 1.930e+02 2.439e+02 3.004e+02 8.294e+02, threshold=4.879e+02, percent-clipped=2.0 +2022-12-08 14:57:10,907 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=139342.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:57:17,677 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139350.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:57:20,037 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=139353.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:57:22,980 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8464, 1.7386, 1.6277, 1.8885, 1.7046, 1.8399, 1.8274, 1.6100], + device='cuda:2'), covar=tensor([0.1370, 0.1188, 0.1941, 0.1017, 0.1309, 0.0788, 0.1718, 0.1113], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0289, 0.0258, 0.0293, 0.0324, 0.0304, 0.0256, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:57:41,221 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8107, 1.7241, 3.8845, 3.6246, 3.7541, 3.9983, 3.1765, 3.9467], + device='cuda:2'), covar=tensor([0.2052, 0.1887, 0.0217, 0.0358, 0.0331, 0.0232, 0.0438, 0.0212], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0156, 0.0132, 0.0170, 0.0149, 0.0143, 0.0126, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 14:57:47,834 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=139385.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:57:52,092 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=139390.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:58:01,915 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=139401.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 14:58:16,252 INFO [train.py:873] (2/4) Epoch 19, batch 3300, loss[loss=0.08358, simple_loss=0.1094, pruned_loss=0.02886, over 2578.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1403, pruned_loss=0.03363, over 1881250.44 frames. ], batch size: 100, lr: 4.14e-03, grad_scale: 8.0 +2022-12-08 14:58:21,181 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5056, 2.4197, 2.0831, 2.1171, 2.4843, 2.4768, 2.5100, 2.5013], + device='cuda:2'), covar=tensor([0.1515, 0.1046, 0.2936, 0.3279, 0.1398, 0.1570, 0.1661, 0.1382], + device='cuda:2'), in_proj_covar=tensor([0.0397, 0.0272, 0.0447, 0.0568, 0.0354, 0.0458, 0.0388, 0.0401], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 14:58:35,037 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.212e+02 2.023e+02 2.414e+02 3.086e+02 5.603e+02, threshold=4.828e+02, percent-clipped=3.0 +2022-12-08 14:58:54,750 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.5517, 4.7036, 5.0111, 4.3099, 4.7982, 4.9416, 1.9902, 4.5088], + device='cuda:2'), covar=tensor([0.0299, 0.0268, 0.0316, 0.0420, 0.0286, 0.0186, 0.2911, 0.0273], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0179, 0.0150, 0.0152, 0.0213, 0.0145, 0.0159, 0.0199], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 14:59:41,882 INFO [train.py:873] (2/4) Epoch 19, batch 3400, loss[loss=0.09731, simple_loss=0.1369, pruned_loss=0.02886, over 14272.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1404, pruned_loss=0.03359, over 1947632.97 frames. ], batch size: 28, lr: 4.13e-03, grad_scale: 8.0 +2022-12-08 14:59:52,046 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.57 vs. limit=2.0 +2022-12-08 15:00:02,110 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.225e+02 2.047e+02 2.601e+02 3.313e+02 6.688e+02, threshold=5.202e+02, percent-clipped=6.0 +2022-12-08 15:00:24,201 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=139565.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 15:00:26,508 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.73 vs. limit=2.0 +2022-12-08 15:00:29,800 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139571.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:00:39,100 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139582.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:00:49,331 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=139594.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:01:06,700 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=139613.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:01:09,598 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.12 vs. limit=5.0 +2022-12-08 15:01:10,081 INFO [train.py:873] (2/4) Epoch 19, batch 3500, loss[loss=0.09787, simple_loss=0.1359, pruned_loss=0.02991, over 12789.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1401, pruned_loss=0.03379, over 1927251.70 frames. ], batch size: 100, lr: 4.13e-03, grad_scale: 8.0 +2022-12-08 15:01:23,124 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=139632.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 15:01:23,219 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139632.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:01:30,131 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.236e+02 2.003e+02 2.440e+02 2.964e+02 5.520e+02, threshold=4.880e+02, percent-clipped=2.0 +2022-12-08 15:01:32,907 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139643.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:01:34,851 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=139645.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:01:48,956 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9051, 1.3614, 2.0197, 1.3106, 1.9850, 2.0561, 1.6067, 2.1304], + device='cuda:2'), covar=tensor([0.0271, 0.2055, 0.0511, 0.1784, 0.0603, 0.0663, 0.1325, 0.0412], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0155, 0.0161, 0.0169, 0.0169, 0.0179, 0.0133, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:02:37,621 INFO [train.py:873] (2/4) Epoch 19, batch 3600, loss[loss=0.112, simple_loss=0.1445, pruned_loss=0.03975, over 14217.00 frames. ], tot_loss[loss=0.1032, simple_loss=0.1396, pruned_loss=0.03342, over 1910860.16 frames. ], batch size: 94, lr: 4.13e-03, grad_scale: 8.0 +2022-12-08 15:02:57,976 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.754e+01 2.074e+02 2.504e+02 3.273e+02 8.629e+02, threshold=5.008e+02, percent-clipped=8.0 +2022-12-08 15:03:22,059 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7175, 1.5273, 2.9115, 1.4945, 2.9456, 2.8567, 2.1751, 2.9860], + device='cuda:2'), covar=tensor([0.0330, 0.2818, 0.0418, 0.2263, 0.0427, 0.0528, 0.1156, 0.0341], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0154, 0.0160, 0.0168, 0.0168, 0.0179, 0.0133, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:03:24,051 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139769.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:03:41,901 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139789.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:03:55,263 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139804.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:03:57,329 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139806.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:04:07,190 INFO [train.py:873] (2/4) Epoch 19, batch 3700, loss[loss=0.1081, simple_loss=0.1443, pruned_loss=0.03596, over 14085.00 frames. ], tot_loss[loss=0.1025, simple_loss=0.1392, pruned_loss=0.03283, over 1934930.63 frames. ], batch size: 26, lr: 4.13e-03, grad_scale: 8.0 +2022-12-08 15:04:18,315 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139830.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:04:26,981 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.254e+02 1.959e+02 2.384e+02 2.847e+02 4.025e+02, threshold=4.768e+02, percent-clipped=0.0 +2022-12-08 15:04:36,845 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=139850.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:04:36,867 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139850.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:04:49,163 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139865.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:04:51,047 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139867.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:05:08,079 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7720, 4.4805, 4.2798, 4.7630, 4.4798, 4.2427, 4.7706, 4.0610], + device='cuda:2'), covar=tensor([0.0367, 0.0795, 0.0411, 0.0392, 0.0747, 0.0677, 0.0498, 0.0470], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0279, 0.0205, 0.0202, 0.0188, 0.0163, 0.0295, 0.0173], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 15:05:14,698 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=139894.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:05:29,296 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=139911.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:05:34,124 INFO [train.py:873] (2/4) Epoch 19, batch 3800, loss[loss=0.09119, simple_loss=0.1337, pruned_loss=0.02436, over 14222.00 frames. ], tot_loss[loss=0.1031, simple_loss=0.1398, pruned_loss=0.03317, over 1941798.10 frames. ], batch size: 94, lr: 4.13e-03, grad_scale: 8.0 +2022-12-08 15:05:43,841 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=139927.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:05:48,195 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=139932.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 15:05:53,165 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=139938.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:05:54,737 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.144e+02 2.015e+02 2.567e+02 3.063e+02 4.403e+02, threshold=5.135e+02, percent-clipped=0.0 +2022-12-08 15:05:55,374 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-12-08 15:05:56,566 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=139942.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:05:59,173 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=139945.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:06:29,963 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=139980.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:06:35,979 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0799, 1.4906, 3.1541, 1.6526, 3.1385, 3.2092, 2.3729, 3.4128], + device='cuda:2'), covar=tensor([0.0293, 0.2930, 0.0473, 0.2054, 0.0894, 0.0480, 0.1084, 0.0244], + device='cuda:2'), in_proj_covar=tensor([0.0172, 0.0154, 0.0160, 0.0168, 0.0167, 0.0179, 0.0133, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:06:41,160 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=139993.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:07:06,101 INFO [train.py:873] (2/4) Epoch 19, batch 3900, loss[loss=0.1092, simple_loss=0.1448, pruned_loss=0.0368, over 14581.00 frames. ], tot_loss[loss=0.1037, simple_loss=0.1399, pruned_loss=0.03372, over 1918686.57 frames. ], batch size: 43, lr: 4.13e-03, grad_scale: 8.0 +2022-12-08 15:07:25,637 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.340e+02 2.144e+02 2.574e+02 3.333e+02 6.487e+02, threshold=5.147e+02, percent-clipped=6.0 +2022-12-08 15:07:52,522 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5561, 1.7901, 2.0051, 1.9942, 1.7999, 2.0051, 1.7592, 1.3355], + device='cuda:2'), covar=tensor([0.1031, 0.1220, 0.0834, 0.0746, 0.1191, 0.0962, 0.1462, 0.2019], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0091, 0.0071, 0.0077, 0.0101, 0.0093, 0.0102, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 15:08:21,796 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140103.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:08:27,928 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140110.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:08:33,681 INFO [train.py:873] (2/4) Epoch 19, batch 4000, loss[loss=0.1076, simple_loss=0.1452, pruned_loss=0.035, over 14539.00 frames. ], tot_loss[loss=0.1029, simple_loss=0.1395, pruned_loss=0.03317, over 1923447.10 frames. ], batch size: 49, lr: 4.13e-03, grad_scale: 8.0 +2022-12-08 15:08:41,210 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140125.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:08:43,145 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1364, 2.6607, 4.0056, 3.0365, 4.0749, 3.9713, 3.7626, 3.3792], + device='cuda:2'), covar=tensor([0.0851, 0.3047, 0.1164, 0.1844, 0.0808, 0.0964, 0.1769, 0.1757], + device='cuda:2'), in_proj_covar=tensor([0.0346, 0.0307, 0.0386, 0.0295, 0.0360, 0.0320, 0.0359, 0.0294], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:08:54,675 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.294e+02 2.130e+02 2.579e+02 3.097e+02 6.234e+02, threshold=5.158e+02, percent-clipped=1.0 +2022-12-08 15:08:59,206 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140145.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:09:01,011 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0707, 1.7067, 2.1119, 1.9871, 2.0621, 1.5628, 1.4607, 1.2917], + device='cuda:2'), covar=tensor([0.0209, 0.0463, 0.0237, 0.0244, 0.0228, 0.0324, 0.0325, 0.0585], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0021, 0.0023, 0.0022, 0.0035, 0.0029, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 15:09:01,721 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7126, 1.0906, 1.2517, 1.1160, 0.9268, 1.2182, 0.9925, 0.7983], + device='cuda:2'), covar=tensor([0.1671, 0.1078, 0.0429, 0.0449, 0.2193, 0.1194, 0.1725, 0.1545], + device='cuda:2'), in_proj_covar=tensor([0.0136, 0.0091, 0.0072, 0.0077, 0.0101, 0.0093, 0.0102, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 15:09:06,065 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9590, 3.7196, 3.3517, 2.8644, 3.4043, 3.6566, 4.0716, 3.3139], + device='cuda:2'), covar=tensor([0.0536, 0.1075, 0.0762, 0.1033, 0.0695, 0.0552, 0.0729, 0.0766], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0168, 0.0140, 0.0124, 0.0143, 0.0155, 0.0138, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 15:09:12,063 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140160.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:09:14,040 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140162.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:09:15,880 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=140164.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:09:22,486 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=140171.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:09:38,453 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7858, 4.8945, 5.2477, 4.4991, 4.9722, 5.2818, 1.9452, 4.7016], + device='cuda:2'), covar=tensor([0.0343, 0.0301, 0.0316, 0.0390, 0.0317, 0.0140, 0.3065, 0.0294], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0177, 0.0148, 0.0151, 0.0212, 0.0144, 0.0159, 0.0198], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 15:09:52,824 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140206.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:10:02,908 INFO [train.py:873] (2/4) Epoch 19, batch 4100, loss[loss=0.101, simple_loss=0.124, pruned_loss=0.03897, over 3848.00 frames. ], tot_loss[loss=0.1039, simple_loss=0.1402, pruned_loss=0.03377, over 1850843.78 frames. ], batch size: 100, lr: 4.12e-03, grad_scale: 8.0 +2022-12-08 15:10:11,563 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140227.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:10:20,984 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140238.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:10:22,566 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.264e+02 2.026e+02 2.683e+02 3.306e+02 6.239e+02, threshold=5.366e+02, percent-clipped=2.0 +2022-12-08 15:10:39,313 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140259.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:10:52,822 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140275.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:10:56,056 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.89 vs. limit=2.0 +2022-12-08 15:11:02,829 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140286.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:11:08,743 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.47 vs. limit=2.0 +2022-12-08 15:11:29,480 INFO [train.py:873] (2/4) Epoch 19, batch 4200, loss[loss=0.0914, simple_loss=0.1344, pruned_loss=0.0242, over 13836.00 frames. ], tot_loss[loss=0.1033, simple_loss=0.14, pruned_loss=0.03326, over 1925157.06 frames. ], batch size: 20, lr: 4.12e-03, grad_scale: 8.0 +2022-12-08 15:11:30,413 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3296, 1.9292, 2.2038, 1.5196, 1.9324, 2.3380, 2.2951, 1.9297], + device='cuda:2'), covar=tensor([0.0719, 0.0683, 0.0790, 0.1267, 0.1278, 0.0775, 0.0738, 0.1326], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0168, 0.0140, 0.0124, 0.0143, 0.0155, 0.0138, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 15:11:31,941 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=140320.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:11:40,340 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3113, 2.5219, 4.3963, 4.5569, 4.5206, 2.4250, 4.5744, 3.4212], + device='cuda:2'), covar=tensor([0.0458, 0.1505, 0.0885, 0.0486, 0.0506, 0.2488, 0.0451, 0.1139], + device='cuda:2'), in_proj_covar=tensor([0.0294, 0.0262, 0.0378, 0.0333, 0.0273, 0.0310, 0.0314, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 15:11:49,279 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.317e+02 2.052e+02 2.429e+02 3.115e+02 5.934e+02, threshold=4.859e+02, percent-clipped=1.0 +2022-12-08 15:11:53,619 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1482, 1.8695, 4.6717, 4.1950, 4.1363, 4.7600, 4.3996, 4.7710], + device='cuda:2'), covar=tensor([0.1528, 0.1601, 0.0109, 0.0226, 0.0241, 0.0141, 0.0143, 0.0101], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0157, 0.0132, 0.0170, 0.0149, 0.0142, 0.0126, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 15:12:29,877 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140387.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:12:56,648 INFO [train.py:873] (2/4) Epoch 19, batch 4300, loss[loss=0.09314, simple_loss=0.1376, pruned_loss=0.02435, over 14550.00 frames. ], tot_loss[loss=0.1033, simple_loss=0.14, pruned_loss=0.03333, over 1912728.67 frames. ], batch size: 43, lr: 4.12e-03, grad_scale: 8.0 +2022-12-08 15:13:03,490 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140425.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:13:16,352 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.481e+01 2.038e+02 2.405e+02 2.912e+02 6.757e+02, threshold=4.809e+02, percent-clipped=4.0 +2022-12-08 15:13:16,647 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1909, 2.6169, 4.0952, 2.9473, 4.0192, 3.9119, 3.8309, 3.3565], + device='cuda:2'), covar=tensor([0.0873, 0.3071, 0.0957, 0.1981, 0.0906, 0.1119, 0.1471, 0.1805], + device='cuda:2'), in_proj_covar=tensor([0.0346, 0.0306, 0.0384, 0.0295, 0.0361, 0.0318, 0.0358, 0.0293], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:13:21,040 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140445.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:13:23,495 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=140448.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:13:33,459 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140459.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:13:34,405 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140460.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:13:36,169 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140462.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:13:39,458 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140466.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:13:45,319 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140473.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:13:59,372 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140489.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:14:03,130 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140493.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:14:14,396 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140506.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:14:16,041 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140508.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:14:16,954 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140509.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:14:17,789 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140510.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:14:23,510 INFO [train.py:873] (2/4) Epoch 19, batch 4400, loss[loss=0.0835, simple_loss=0.1279, pruned_loss=0.01956, over 13920.00 frames. ], tot_loss[loss=0.1034, simple_loss=0.1397, pruned_loss=0.03352, over 1865699.96 frames. ], batch size: 20, lr: 4.12e-03, grad_scale: 8.0 +2022-12-08 15:14:34,066 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3033, 4.0681, 3.8188, 3.9954, 4.1790, 4.2294, 4.3049, 4.2880], + device='cuda:2'), covar=tensor([0.0766, 0.0587, 0.1995, 0.2300, 0.0699, 0.0787, 0.0843, 0.0733], + device='cuda:2'), in_proj_covar=tensor([0.0396, 0.0272, 0.0448, 0.0567, 0.0355, 0.0459, 0.0390, 0.0400], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:14:43,959 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.073e+02 2.032e+02 2.425e+02 2.812e+02 5.381e+02, threshold=4.850e+02, percent-clipped=2.0 +2022-12-08 15:14:51,843 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3585, 2.0200, 2.3460, 2.4807, 2.1862, 1.9677, 2.3723, 2.1266], + device='cuda:2'), covar=tensor([0.0523, 0.0984, 0.0550, 0.0488, 0.0713, 0.1291, 0.0587, 0.0695], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0263, 0.0380, 0.0333, 0.0274, 0.0311, 0.0315, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 15:14:52,737 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=140550.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:14:56,036 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140554.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:15:09,641 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=140570.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:15:50,162 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140615.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:15:51,860 INFO [train.py:873] (2/4) Epoch 19, batch 4500, loss[loss=0.1075, simple_loss=0.1445, pruned_loss=0.03522, over 14269.00 frames. ], tot_loss[loss=0.1029, simple_loss=0.1399, pruned_loss=0.03294, over 1929550.73 frames. ], batch size: 46, lr: 4.12e-03, grad_scale: 4.0 +2022-12-08 15:16:12,052 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.060e+02 2.100e+02 2.600e+02 3.124e+02 5.170e+02, threshold=5.201e+02, percent-clipped=3.0 +2022-12-08 15:16:12,566 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.29 vs. limit=2.0 +2022-12-08 15:17:17,739 INFO [train.py:873] (2/4) Epoch 19, batch 4600, loss[loss=0.09412, simple_loss=0.138, pruned_loss=0.02511, over 14028.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1405, pruned_loss=0.03349, over 1923273.87 frames. ], batch size: 26, lr: 4.12e-03, grad_scale: 4.0 +2022-12-08 15:17:23,430 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140723.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:17:39,454 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.258e+02 2.073e+02 2.575e+02 3.535e+02 1.946e+03, threshold=5.149e+02, percent-clipped=6.0 +2022-12-08 15:17:41,343 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140743.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:17:55,007 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140759.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:17:59,688 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.39 vs. limit=5.0 +2022-12-08 15:18:00,952 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140766.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:18:17,074 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=140784.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:18:36,905 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140807.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:18:38,945 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140809.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:18:43,027 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140814.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:18:45,758 INFO [train.py:873] (2/4) Epoch 19, batch 4700, loss[loss=0.1787, simple_loss=0.1655, pruned_loss=0.09595, over 1202.00 frames. ], tot_loss[loss=0.1039, simple_loss=0.1406, pruned_loss=0.03358, over 1892650.81 frames. ], batch size: 100, lr: 4.11e-03, grad_scale: 4.0 +2022-12-08 15:19:06,571 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.370e+02 2.144e+02 2.562e+02 3.057e+02 6.222e+02, threshold=5.124e+02, percent-clipped=4.0 +2022-12-08 15:19:10,072 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140845.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:19:28,313 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=140865.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:19:32,482 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=140870.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:20:11,791 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=140915.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:20:13,320 INFO [train.py:873] (2/4) Epoch 19, batch 4800, loss[loss=0.1099, simple_loss=0.1458, pruned_loss=0.03698, over 14225.00 frames. ], tot_loss[loss=0.1045, simple_loss=0.1407, pruned_loss=0.03412, over 1894365.22 frames. ], batch size: 94, lr: 4.11e-03, grad_scale: 8.0 +2022-12-08 15:20:13,463 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140917.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:20:34,742 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.343e+02 2.115e+02 2.464e+02 2.978e+02 7.209e+02, threshold=4.928e+02, percent-clipped=2.0 +2022-12-08 15:20:40,044 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=140947.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:20:53,883 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=140963.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:21:07,376 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=140978.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:21:29,995 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9881, 1.9169, 4.3898, 4.0950, 4.0038, 4.4570, 3.9947, 4.4768], + device='cuda:2'), covar=tensor([0.1572, 0.1506, 0.0120, 0.0228, 0.0250, 0.0161, 0.0216, 0.0118], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0157, 0.0131, 0.0169, 0.0148, 0.0143, 0.0126, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 15:21:34,151 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=141008.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:21:41,896 INFO [train.py:873] (2/4) Epoch 19, batch 4900, loss[loss=0.1206, simple_loss=0.1483, pruned_loss=0.04643, over 3849.00 frames. ], tot_loss[loss=0.1054, simple_loss=0.1415, pruned_loss=0.03469, over 1921680.96 frames. ], batch size: 100, lr: 4.11e-03, grad_scale: 8.0 +2022-12-08 15:21:47,198 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6393, 2.7840, 2.8148, 2.7422, 2.7565, 2.5067, 1.5355, 2.5645], + device='cuda:2'), covar=tensor([0.0592, 0.0472, 0.0377, 0.0423, 0.0377, 0.1017, 0.2537, 0.0372], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0176, 0.0147, 0.0150, 0.0209, 0.0143, 0.0158, 0.0197], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 15:21:59,459 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.64 vs. limit=2.0 +2022-12-08 15:22:02,360 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.082e+02 2.070e+02 2.764e+02 3.531e+02 1.295e+03, threshold=5.527e+02, percent-clipped=7.0 +2022-12-08 15:22:04,123 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=141043.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:22:34,885 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=141079.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:22:39,352 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7467, 2.2305, 3.7135, 3.8439, 3.5690, 2.3053, 3.7522, 2.8217], + device='cuda:2'), covar=tensor([0.0512, 0.1439, 0.0866, 0.0532, 0.0676, 0.2091, 0.0525, 0.1205], + device='cuda:2'), in_proj_covar=tensor([0.0294, 0.0262, 0.0378, 0.0333, 0.0273, 0.0309, 0.0313, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 15:22:45,509 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=141091.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:23:08,041 INFO [train.py:873] (2/4) Epoch 19, batch 5000, loss[loss=0.07591, simple_loss=0.124, pruned_loss=0.01393, over 14271.00 frames. ], tot_loss[loss=0.1044, simple_loss=0.1408, pruned_loss=0.03401, over 1938886.60 frames. ], batch size: 31, lr: 4.11e-03, grad_scale: 8.0 +2022-12-08 15:23:24,673 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.49 vs. limit=2.0 +2022-12-08 15:23:28,878 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.297e+02 2.058e+02 2.477e+02 3.160e+02 7.208e+02, threshold=4.953e+02, percent-clipped=1.0 +2022-12-08 15:23:30,766 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=141143.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:23:31,964 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 15:23:32,320 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=141145.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:23:49,066 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=141165.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:23:49,125 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=141165.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:24:13,406 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=141193.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:24:22,794 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=141204.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:24:30,571 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=141213.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:24:34,007 INFO [train.py:873] (2/4) Epoch 19, batch 5100, loss[loss=0.1423, simple_loss=0.1428, pruned_loss=0.07095, over 2609.00 frames. ], tot_loss[loss=0.1051, simple_loss=0.1413, pruned_loss=0.03444, over 1956757.45 frames. ], batch size: 100, lr: 4.11e-03, grad_scale: 8.0 +2022-12-08 15:24:49,639 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.21 vs. limit=5.0 +2022-12-08 15:24:51,186 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.28 vs. limit=2.0 +2022-12-08 15:24:54,061 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.191e+02 2.078e+02 2.511e+02 3.062e+02 5.602e+02, threshold=5.021e+02, percent-clipped=1.0 +2022-12-08 15:25:21,745 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=141273.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:25:47,937 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=141303.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:25:59,795 INFO [train.py:873] (2/4) Epoch 19, batch 5200, loss[loss=0.1455, simple_loss=0.1671, pruned_loss=0.0619, over 10372.00 frames. ], tot_loss[loss=0.1049, simple_loss=0.1413, pruned_loss=0.03423, over 1935589.36 frames. ], batch size: 100, lr: 4.11e-03, grad_scale: 8.0 +2022-12-08 15:26:08,542 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.67 vs. limit=5.0 +2022-12-08 15:26:10,213 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=141329.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:26:20,993 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.019e+02 2.014e+02 2.549e+02 3.340e+02 7.347e+02, threshold=5.099e+02, percent-clipped=2.0 +2022-12-08 15:26:54,507 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=141379.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:27:04,092 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=141390.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:27:28,123 INFO [train.py:873] (2/4) Epoch 19, batch 5300, loss[loss=0.1083, simple_loss=0.1405, pruned_loss=0.03805, over 14226.00 frames. ], tot_loss[loss=0.1047, simple_loss=0.1408, pruned_loss=0.0343, over 1866042.60 frames. ], batch size: 89, lr: 4.11e-03, grad_scale: 8.0 +2022-12-08 15:27:28,252 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=141417.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 15:27:36,871 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=141427.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:27:48,828 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.160e+02 2.222e+02 2.556e+02 2.994e+02 5.858e+02, threshold=5.112e+02, percent-clipped=4.0 +2022-12-08 15:27:48,999 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=141441.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:28:10,400 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=141465.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:28:21,484 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=141478.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 15:28:33,022 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1806, 1.8522, 2.2356, 1.5953, 1.9272, 2.2617, 2.1531, 1.9268], + device='cuda:2'), covar=tensor([0.1076, 0.0749, 0.0995, 0.1459, 0.1593, 0.0943, 0.0929, 0.1608], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0170, 0.0141, 0.0126, 0.0145, 0.0157, 0.0140, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 15:28:39,744 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=141499.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:28:42,516 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=141502.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:28:43,357 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4485, 3.1598, 3.0760, 2.3286, 2.9571, 3.2327, 3.4806, 2.8001], + device='cuda:2'), covar=tensor([0.0618, 0.0945, 0.0804, 0.1170, 0.0808, 0.0697, 0.0675, 0.1097], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0170, 0.0141, 0.0126, 0.0145, 0.0156, 0.0140, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 15:28:51,512 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=141513.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:28:54,904 INFO [train.py:873] (2/4) Epoch 19, batch 5400, loss[loss=0.09838, simple_loss=0.1247, pruned_loss=0.03604, over 5975.00 frames. ], tot_loss[loss=0.1039, simple_loss=0.1404, pruned_loss=0.03368, over 1917353.68 frames. ], batch size: 100, lr: 4.10e-03, grad_scale: 8.0 +2022-12-08 15:29:16,144 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.046e+02 2.017e+02 2.549e+02 3.465e+02 7.895e+02, threshold=5.098e+02, percent-clipped=4.0 +2022-12-08 15:29:22,564 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9260, 1.5781, 3.0042, 1.6693, 3.1656, 3.0176, 2.2907, 3.2400], + device='cuda:2'), covar=tensor([0.0265, 0.2598, 0.0415, 0.1843, 0.0350, 0.0467, 0.1049, 0.0225], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0155, 0.0161, 0.0168, 0.0167, 0.0179, 0.0133, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:29:43,832 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=141573.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:30:10,157 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=141603.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:30:22,130 INFO [train.py:873] (2/4) Epoch 19, batch 5500, loss[loss=0.1627, simple_loss=0.1781, pruned_loss=0.07366, over 10347.00 frames. ], tot_loss[loss=0.1023, simple_loss=0.1395, pruned_loss=0.03257, over 2000076.54 frames. ], batch size: 100, lr: 4.10e-03, grad_scale: 8.0 +2022-12-08 15:30:25,891 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=141621.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:30:43,271 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.230e+02 2.074e+02 2.608e+02 3.444e+02 6.357e+02, threshold=5.216e+02, percent-clipped=6.0 +2022-12-08 15:30:52,122 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=141651.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:31:21,359 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=141685.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:31:24,428 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3310, 1.7081, 4.1667, 2.3105, 4.2465, 4.4997, 3.7824, 4.8293], + device='cuda:2'), covar=tensor([0.0218, 0.3070, 0.0372, 0.1867, 0.0310, 0.0378, 0.0545, 0.0159], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0155, 0.0161, 0.0168, 0.0166, 0.0178, 0.0133, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:31:48,910 INFO [train.py:873] (2/4) Epoch 19, batch 5600, loss[loss=0.08831, simple_loss=0.1322, pruned_loss=0.02223, over 14307.00 frames. ], tot_loss[loss=0.103, simple_loss=0.1397, pruned_loss=0.0332, over 1968557.05 frames. ], batch size: 39, lr: 4.10e-03, grad_scale: 8.0 +2022-12-08 15:31:57,299 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0132, 1.9116, 2.0065, 2.0100, 2.0718, 1.2945, 1.7123, 1.9571], + device='cuda:2'), covar=tensor([0.0628, 0.0786, 0.0533, 0.0850, 0.0765, 0.0809, 0.0750, 0.0547], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0036, 0.0042, 0.0035, 0.0037, 0.0051, 0.0038, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 15:32:09,418 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.697e+01 2.227e+02 2.853e+02 3.557e+02 6.540e+02, threshold=5.705e+02, percent-clipped=5.0 +2022-12-08 15:32:37,314 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=141773.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 15:32:58,662 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=141797.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:33:00,319 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=141799.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:33:16,298 INFO [train.py:873] (2/4) Epoch 19, batch 5700, loss[loss=0.123, simple_loss=0.1473, pruned_loss=0.04932, over 7783.00 frames. ], tot_loss[loss=0.1039, simple_loss=0.1403, pruned_loss=0.03373, over 1934576.86 frames. ], batch size: 100, lr: 4.10e-03, grad_scale: 8.0 +2022-12-08 15:33:29,967 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.54 vs. limit=5.0 +2022-12-08 15:33:35,608 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 15:33:36,775 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.084e+02 2.066e+02 2.688e+02 3.167e+02 5.846e+02, threshold=5.376e+02, percent-clipped=2.0 +2022-12-08 15:33:41,764 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=141847.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:34:41,490 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=141916.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:34:42,171 INFO [train.py:873] (2/4) Epoch 19, batch 5800, loss[loss=0.1049, simple_loss=0.1413, pruned_loss=0.03421, over 14246.00 frames. ], tot_loss[loss=0.104, simple_loss=0.1407, pruned_loss=0.03365, over 1978385.01 frames. ], batch size: 60, lr: 4.10e-03, grad_scale: 8.0 +2022-12-08 15:34:48,909 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.87 vs. limit=2.0 +2022-12-08 15:34:51,880 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=141928.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:35:03,272 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.310e+01 2.008e+02 2.595e+02 3.223e+02 5.261e+02, threshold=5.190e+02, percent-clipped=0.0 +2022-12-08 15:35:35,481 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=141977.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:35:43,128 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=141985.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:35:46,567 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=141989.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:35:59,853 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.0300, 2.2908, 4.9694, 3.4740, 4.7723, 1.9581, 3.6168, 4.8307], + device='cuda:2'), covar=tensor([0.0445, 0.3742, 0.0314, 0.4518, 0.0464, 0.3411, 0.1228, 0.0317], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0198, 0.0220, 0.0266, 0.0240, 0.0201, 0.0200, 0.0222], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 15:36:04,254 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.98 vs. limit=5.0 +2022-12-08 15:36:11,185 INFO [train.py:873] (2/4) Epoch 19, batch 5900, loss[loss=0.138, simple_loss=0.1324, pruned_loss=0.07175, over 1258.00 frames. ], tot_loss[loss=0.1034, simple_loss=0.1403, pruned_loss=0.03328, over 1971896.08 frames. ], batch size: 100, lr: 4.10e-03, grad_scale: 8.0 +2022-12-08 15:36:24,800 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=142033.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:36:31,707 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.115e+02 2.065e+02 2.424e+02 3.179e+02 4.807e+02, threshold=4.849e+02, percent-clipped=0.0 +2022-12-08 15:37:00,304 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=142073.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 15:37:13,823 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.7628, 5.2211, 5.1268, 5.6623, 5.3009, 4.9178, 5.6397, 4.8622], + device='cuda:2'), covar=tensor([0.0291, 0.0939, 0.0433, 0.0412, 0.0740, 0.0357, 0.0501, 0.0442], + device='cuda:2'), in_proj_covar=tensor([0.0182, 0.0281, 0.0205, 0.0203, 0.0188, 0.0162, 0.0295, 0.0172], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 15:37:18,667 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9617, 1.6033, 3.9492, 1.8678, 3.8703, 4.1260, 3.1534, 4.3494], + device='cuda:2'), covar=tensor([0.0221, 0.3109, 0.0382, 0.2151, 0.0497, 0.0324, 0.0730, 0.0152], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0156, 0.0162, 0.0169, 0.0167, 0.0179, 0.0133, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:37:21,243 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=142097.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:37:38,676 INFO [train.py:873] (2/4) Epoch 19, batch 6000, loss[loss=0.09304, simple_loss=0.1217, pruned_loss=0.03221, over 4991.00 frames. ], tot_loss[loss=0.1031, simple_loss=0.1402, pruned_loss=0.033, over 2004544.44 frames. ], batch size: 100, lr: 4.10e-03, grad_scale: 8.0 +2022-12-08 15:37:38,676 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 15:37:47,302 INFO [train.py:905] (2/4) Epoch 19, validation: loss=0.1418, simple_loss=0.1782, pruned_loss=0.05266, over 857387.00 frames. +2022-12-08 15:37:47,303 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 15:37:50,884 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=142121.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 15:38:08,227 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.241e+02 1.928e+02 2.466e+02 3.087e+02 6.019e+02, threshold=4.931e+02, percent-clipped=3.0 +2022-12-08 15:38:11,906 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=142145.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:38:41,805 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8580, 2.3234, 3.7831, 3.9568, 3.5975, 2.3477, 3.8112, 2.9072], + device='cuda:2'), covar=tensor([0.0548, 0.1387, 0.0928, 0.0513, 0.0694, 0.2181, 0.0543, 0.1145], + device='cuda:2'), in_proj_covar=tensor([0.0294, 0.0261, 0.0377, 0.0331, 0.0272, 0.0309, 0.0313, 0.0276], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 15:38:49,527 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6021, 2.5257, 2.5486, 2.3842, 2.4793, 1.6034, 2.6083, 2.7992], + device='cuda:2'), covar=tensor([0.0708, 0.0510, 0.0578, 0.1882, 0.0883, 0.0719, 0.0655, 0.0843], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0037, 0.0042, 0.0035, 0.0037, 0.0051, 0.0039, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 15:39:15,264 INFO [train.py:873] (2/4) Epoch 19, batch 6100, loss[loss=0.08253, simple_loss=0.1293, pruned_loss=0.01789, over 13925.00 frames. ], tot_loss[loss=0.1034, simple_loss=0.1404, pruned_loss=0.03323, over 2013755.70 frames. ], batch size: 23, lr: 4.09e-03, grad_scale: 8.0 +2022-12-08 15:39:23,739 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.77 vs. limit=2.0 +2022-12-08 15:39:36,597 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.188e+02 2.000e+02 2.533e+02 3.049e+02 1.114e+03, threshold=5.065e+02, percent-clipped=3.0 +2022-12-08 15:39:40,067 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.2124, 4.0371, 3.9077, 4.2181, 3.9324, 3.6948, 4.3107, 4.0739], + device='cuda:2'), covar=tensor([0.0657, 0.0899, 0.0842, 0.0591, 0.0851, 0.0734, 0.0606, 0.0742], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0148, 0.0150, 0.0165, 0.0152, 0.0127, 0.0171, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 15:40:03,718 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=142272.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:40:14,735 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=142284.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:40:17,747 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9619, 2.1874, 2.8987, 2.9742, 2.9005, 2.2656, 2.8884, 2.4562], + device='cuda:2'), covar=tensor([0.0472, 0.1219, 0.0811, 0.0547, 0.0619, 0.1503, 0.0527, 0.0892], + device='cuda:2'), in_proj_covar=tensor([0.0292, 0.0260, 0.0376, 0.0329, 0.0270, 0.0307, 0.0312, 0.0275], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 15:40:26,151 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1396, 2.1880, 3.0280, 2.3765, 3.0188, 2.9419, 2.8206, 2.5914], + device='cuda:2'), covar=tensor([0.0755, 0.2842, 0.0855, 0.1568, 0.0757, 0.1065, 0.1007, 0.1578], + device='cuda:2'), in_proj_covar=tensor([0.0350, 0.0310, 0.0387, 0.0297, 0.0364, 0.0323, 0.0361, 0.0296], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:40:42,868 INFO [train.py:873] (2/4) Epoch 19, batch 6200, loss[loss=0.1083, simple_loss=0.1487, pruned_loss=0.03395, over 14413.00 frames. ], tot_loss[loss=0.1022, simple_loss=0.1396, pruned_loss=0.03244, over 1953910.70 frames. ], batch size: 41, lr: 4.09e-03, grad_scale: 8.0 +2022-12-08 15:40:46,928 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0524, 2.0273, 2.3190, 1.5796, 1.6270, 2.1019, 1.3479, 2.2017], + device='cuda:2'), covar=tensor([0.1348, 0.2016, 0.1010, 0.2943, 0.3164, 0.1293, 0.4054, 0.1347], + device='cuda:2'), in_proj_covar=tensor([0.0089, 0.0105, 0.0099, 0.0102, 0.0117, 0.0093, 0.0118, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 15:41:04,427 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.271e+02 2.036e+02 2.651e+02 3.192e+02 8.704e+02, threshold=5.303e+02, percent-clipped=4.0 +2022-12-08 15:41:23,257 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.18 vs. limit=2.0 +2022-12-08 15:42:11,865 INFO [train.py:873] (2/4) Epoch 19, batch 6300, loss[loss=0.09394, simple_loss=0.1335, pruned_loss=0.02718, over 14250.00 frames. ], tot_loss[loss=0.1024, simple_loss=0.1393, pruned_loss=0.03272, over 1908472.70 frames. ], batch size: 80, lr: 4.09e-03, grad_scale: 8.0 +2022-12-08 15:42:32,398 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.018e+02 2.111e+02 2.568e+02 3.235e+02 6.197e+02, threshold=5.136e+02, percent-clipped=2.0 +2022-12-08 15:42:37,858 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-12-08 15:42:55,706 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.0819, 3.4255, 3.2148, 3.3019, 2.5933, 3.5018, 3.2800, 1.7473], + device='cuda:2'), covar=tensor([0.1102, 0.0622, 0.1023, 0.0634, 0.0856, 0.0396, 0.0847, 0.1728], + device='cuda:2'), in_proj_covar=tensor([0.0134, 0.0091, 0.0071, 0.0076, 0.0101, 0.0092, 0.0102, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 15:43:38,809 INFO [train.py:873] (2/4) Epoch 19, batch 6400, loss[loss=0.08813, simple_loss=0.135, pruned_loss=0.02064, over 14254.00 frames. ], tot_loss[loss=0.1023, simple_loss=0.1393, pruned_loss=0.03266, over 1963028.37 frames. ], batch size: 80, lr: 4.09e-03, grad_scale: 8.0 +2022-12-08 15:43:42,572 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9622, 2.6300, 3.2920, 2.1999, 2.1032, 2.8250, 1.5738, 2.7378], + device='cuda:2'), covar=tensor([0.0870, 0.1039, 0.0540, 0.1944, 0.1890, 0.0877, 0.3222, 0.0894], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0104, 0.0098, 0.0101, 0.0115, 0.0093, 0.0117, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 15:44:00,397 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.252e+02 1.982e+02 2.595e+02 3.310e+02 8.116e+02, threshold=5.191e+02, percent-clipped=3.0 +2022-12-08 15:44:14,510 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.70 vs. limit=2.0 +2022-12-08 15:44:27,720 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=142572.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:44:28,722 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=142573.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:44:38,068 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=142584.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:45:01,829 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.64 vs. limit=5.0 +2022-12-08 15:45:07,279 INFO [train.py:873] (2/4) Epoch 19, batch 6500, loss[loss=0.114, simple_loss=0.1519, pruned_loss=0.03807, over 14221.00 frames. ], tot_loss[loss=0.1042, simple_loss=0.1406, pruned_loss=0.03387, over 1959718.96 frames. ], batch size: 89, lr: 4.09e-03, grad_scale: 16.0 +2022-12-08 15:45:09,996 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=142620.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:45:19,990 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=142632.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:45:21,740 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=142634.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:45:27,970 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.114e+02 2.216e+02 2.673e+02 3.413e+02 8.329e+02, threshold=5.346e+02, percent-clipped=5.0 +2022-12-08 15:45:56,360 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8724, 3.2697, 2.9701, 3.3083, 2.4662, 3.3094, 3.0878, 1.6408], + device='cuda:2'), covar=tensor([0.1314, 0.0757, 0.1228, 0.0528, 0.0965, 0.0474, 0.0943, 0.2084], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0092, 0.0072, 0.0077, 0.0102, 0.0093, 0.0103, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0006, 0.0006, 0.0007, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 15:46:00,477 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5314, 3.2812, 3.2190, 3.5212, 3.2963, 3.5173, 3.5628, 3.0038], + device='cuda:2'), covar=tensor([0.0513, 0.0968, 0.0589, 0.0518, 0.0822, 0.0375, 0.0653, 0.0638], + device='cuda:2'), in_proj_covar=tensor([0.0184, 0.0283, 0.0207, 0.0206, 0.0189, 0.0164, 0.0299, 0.0174], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0004, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 15:46:13,088 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.67 vs. limit=2.0 +2022-12-08 15:46:14,158 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1677, 1.2905, 1.3094, 1.1420, 0.8654, 1.0614, 0.9295, 1.2379], + device='cuda:2'), covar=tensor([0.2193, 0.2426, 0.1496, 0.2447, 0.3156, 0.1480, 0.2098, 0.1646], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0104, 0.0098, 0.0102, 0.0116, 0.0093, 0.0117, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 15:46:34,427 INFO [train.py:873] (2/4) Epoch 19, batch 6600, loss[loss=0.1065, simple_loss=0.1341, pruned_loss=0.03949, over 3852.00 frames. ], tot_loss[loss=0.1038, simple_loss=0.1401, pruned_loss=0.03376, over 1950244.54 frames. ], batch size: 100, lr: 4.09e-03, grad_scale: 8.0 +2022-12-08 15:46:40,956 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=142724.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:46:56,547 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.271e+02 2.141e+02 2.630e+02 3.221e+02 9.486e+02, threshold=5.260e+02, percent-clipped=4.0 +2022-12-08 15:47:34,413 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=142785.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:48:02,763 INFO [train.py:873] (2/4) Epoch 19, batch 6700, loss[loss=0.1148, simple_loss=0.1428, pruned_loss=0.04336, over 8631.00 frames. ], tot_loss[loss=0.1028, simple_loss=0.1397, pruned_loss=0.03293, over 1992540.27 frames. ], batch size: 100, lr: 4.09e-03, grad_scale: 8.0 +2022-12-08 15:48:23,751 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.068e+02 1.995e+02 2.478e+02 2.934e+02 5.263e+02, threshold=4.957e+02, percent-clipped=1.0 +2022-12-08 15:48:29,141 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4468, 1.9016, 3.4152, 2.4490, 3.4282, 1.7474, 2.5005, 3.3397], + device='cuda:2'), covar=tensor([0.0791, 0.3866, 0.0692, 0.4916, 0.0778, 0.3408, 0.1718, 0.0794], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0197, 0.0217, 0.0264, 0.0238, 0.0198, 0.0199, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 15:49:28,807 INFO [train.py:873] (2/4) Epoch 19, batch 6800, loss[loss=0.1395, simple_loss=0.1471, pruned_loss=0.06591, over 1238.00 frames. ], tot_loss[loss=0.1026, simple_loss=0.1393, pruned_loss=0.03295, over 1963922.09 frames. ], batch size: 100, lr: 4.08e-03, grad_scale: 8.0 +2022-12-08 15:49:37,242 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7802, 1.8059, 1.6148, 1.9580, 1.7050, 1.9040, 1.7576, 1.6277], + device='cuda:2'), covar=tensor([0.1702, 0.1031, 0.2064, 0.0952, 0.1660, 0.0903, 0.1859, 0.1187], + device='cuda:2'), in_proj_covar=tensor([0.0285, 0.0290, 0.0260, 0.0294, 0.0324, 0.0304, 0.0259, 0.0244], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:49:39,777 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=142929.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:49:50,756 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.071e+02 1.868e+02 2.461e+02 3.045e+02 8.260e+02, threshold=4.922e+02, percent-clipped=6.0 +2022-12-08 15:49:53,454 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1149, 1.3103, 1.2794, 1.0621, 0.8115, 1.0564, 0.8921, 1.2052], + device='cuda:2'), covar=tensor([0.2251, 0.2742, 0.1502, 0.2310, 0.3603, 0.1637, 0.2391, 0.1711], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0104, 0.0097, 0.0101, 0.0115, 0.0092, 0.0116, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 15:50:48,205 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.58 vs. limit=5.0 +2022-12-08 15:50:56,727 INFO [train.py:873] (2/4) Epoch 19, batch 6900, loss[loss=0.1721, simple_loss=0.1628, pruned_loss=0.09066, over 1248.00 frames. ], tot_loss[loss=0.1033, simple_loss=0.1398, pruned_loss=0.0334, over 1904057.43 frames. ], batch size: 100, lr: 4.08e-03, grad_scale: 4.0 +2022-12-08 15:51:18,510 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.205e+02 2.146e+02 2.489e+02 3.023e+02 7.661e+02, threshold=4.978e+02, percent-clipped=3.0 +2022-12-08 15:51:51,287 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=143080.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:51:57,489 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.37 vs. limit=2.0 +2022-12-08 15:52:18,078 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.49 vs. limit=5.0 +2022-12-08 15:52:23,548 INFO [train.py:873] (2/4) Epoch 19, batch 7000, loss[loss=0.1042, simple_loss=0.1403, pruned_loss=0.03405, over 8569.00 frames. ], tot_loss[loss=0.1035, simple_loss=0.1397, pruned_loss=0.03363, over 1904548.60 frames. ], batch size: 100, lr: 4.08e-03, grad_scale: 4.0 +2022-12-08 15:52:46,400 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.312e+02 1.976e+02 2.529e+02 3.068e+02 1.002e+03, threshold=5.057e+02, percent-clipped=4.0 +2022-12-08 15:53:50,609 INFO [train.py:873] (2/4) Epoch 19, batch 7100, loss[loss=0.1021, simple_loss=0.1409, pruned_loss=0.03159, over 14217.00 frames. ], tot_loss[loss=0.102, simple_loss=0.1391, pruned_loss=0.03248, over 1976379.63 frames. ], batch size: 35, lr: 4.08e-03, grad_scale: 4.0 +2022-12-08 15:54:00,876 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=143229.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:54:12,485 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.837e+01 2.042e+02 2.415e+02 3.007e+02 1.079e+03, threshold=4.829e+02, percent-clipped=5.0 +2022-12-08 15:54:23,099 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7825, 2.7942, 2.6117, 2.8696, 2.4464, 2.6528, 2.8296, 2.7531], + device='cuda:2'), covar=tensor([0.0779, 0.1110, 0.1056, 0.0912, 0.1517, 0.0933, 0.0884, 0.1039], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0146, 0.0149, 0.0164, 0.0151, 0.0126, 0.0170, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 15:54:25,002 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9987, 1.9066, 1.9023, 2.0073, 2.0454, 1.7281, 1.6941, 1.3503], + device='cuda:2'), covar=tensor([0.0224, 0.0444, 0.0330, 0.0307, 0.0334, 0.0415, 0.0351, 0.0619], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0023, 0.0021, 0.0023, 0.0022, 0.0035, 0.0029, 0.0033], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 15:54:42,674 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=143277.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:54:53,655 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.7825, 0.8471, 0.6379, 0.8836, 0.8282, 0.5165, 0.7959, 0.8645], + device='cuda:2'), covar=tensor([0.0421, 0.0411, 0.0605, 0.0549, 0.0341, 0.0347, 0.1246, 0.0774], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0037, 0.0042, 0.0035, 0.0037, 0.0051, 0.0039, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 15:54:54,845 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.92 vs. limit=5.0 +2022-12-08 15:55:17,810 INFO [train.py:873] (2/4) Epoch 19, batch 7200, loss[loss=0.09615, simple_loss=0.1348, pruned_loss=0.02872, over 14213.00 frames. ], tot_loss[loss=0.1024, simple_loss=0.1393, pruned_loss=0.03277, over 1967129.74 frames. ], batch size: 94, lr: 4.08e-03, grad_scale: 8.0 +2022-12-08 15:55:27,908 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.24 vs. limit=2.0 +2022-12-08 15:55:36,755 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.4896, 1.6416, 1.8747, 1.9130, 1.8041, 1.8355, 1.6145, 1.4191], + device='cuda:2'), covar=tensor([0.0949, 0.1451, 0.0564, 0.0621, 0.1162, 0.0828, 0.1189, 0.1642], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0093, 0.0072, 0.0077, 0.0102, 0.0092, 0.0103, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0006, 0.0006, 0.0007, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 15:55:40,829 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.306e+02 2.274e+02 2.664e+02 3.314e+02 6.390e+02, threshold=5.328e+02, percent-clipped=7.0 +2022-12-08 15:56:13,754 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=143380.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:56:30,785 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=143400.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 15:56:40,970 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9499, 1.6933, 3.9133, 3.5818, 3.6717, 3.9192, 3.1135, 3.9062], + device='cuda:2'), covar=tensor([0.1630, 0.1675, 0.0131, 0.0274, 0.0263, 0.0157, 0.0329, 0.0137], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0156, 0.0131, 0.0168, 0.0148, 0.0142, 0.0125, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 15:56:46,184 INFO [train.py:873] (2/4) Epoch 19, batch 7300, loss[loss=0.1024, simple_loss=0.1385, pruned_loss=0.03315, over 6908.00 frames. ], tot_loss[loss=0.102, simple_loss=0.1393, pruned_loss=0.03242, over 1977318.70 frames. ], batch size: 100, lr: 4.08e-03, grad_scale: 8.0 +2022-12-08 15:56:55,456 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=143428.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 15:57:08,204 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.099e+02 2.101e+02 2.553e+02 3.059e+02 5.821e+02, threshold=5.106e+02, percent-clipped=2.0 +2022-12-08 15:57:24,959 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=143461.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 15:57:35,058 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=143473.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 15:58:13,342 INFO [train.py:873] (2/4) Epoch 19, batch 7400, loss[loss=0.08774, simple_loss=0.1355, pruned_loss=0.01999, over 14439.00 frames. ], tot_loss[loss=0.1027, simple_loss=0.1397, pruned_loss=0.03282, over 1978387.74 frames. ], batch size: 53, lr: 4.08e-03, grad_scale: 8.0 +2022-12-08 15:58:29,061 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=143534.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 15:58:31,080 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.59 vs. limit=5.0 +2022-12-08 15:58:36,574 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.395e+02 2.037e+02 2.578e+02 3.136e+02 8.501e+02, threshold=5.155e+02, percent-clipped=4.0 +2022-12-08 15:58:36,762 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7568, 1.7867, 1.6180, 1.8618, 1.6624, 1.7818, 1.8027, 1.6159], + device='cuda:2'), covar=tensor([0.1485, 0.1087, 0.2011, 0.1030, 0.1448, 0.0825, 0.1706, 0.1395], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0287, 0.0257, 0.0292, 0.0320, 0.0302, 0.0256, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 15:59:18,393 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.65 vs. limit=5.0 +2022-12-08 15:59:41,170 INFO [train.py:873] (2/4) Epoch 19, batch 7500, loss[loss=0.09817, simple_loss=0.1356, pruned_loss=0.03039, over 14276.00 frames. ], tot_loss[loss=0.1032, simple_loss=0.1398, pruned_loss=0.03331, over 1912663.51 frames. ], batch size: 63, lr: 4.07e-03, grad_scale: 8.0 +2022-12-08 16:00:03,426 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.287e+02 2.124e+02 2.640e+02 3.469e+02 7.311e+02, threshold=5.280e+02, percent-clipped=6.0 +2022-12-08 16:00:24,900 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9931, 1.9659, 1.9778, 2.0465, 1.9948, 1.7055, 1.2949, 1.8094], + device='cuda:2'), covar=tensor([0.0852, 0.0745, 0.0663, 0.0496, 0.0588, 0.1382, 0.2589, 0.0643], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0178, 0.0149, 0.0151, 0.0211, 0.0145, 0.0159, 0.0198], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 16:01:11,363 INFO [train.py:873] (2/4) Epoch 20, batch 0, loss[loss=0.1036, simple_loss=0.1473, pruned_loss=0.02994, over 14408.00 frames. ], tot_loss[loss=0.1036, simple_loss=0.1473, pruned_loss=0.02994, over 14408.00 frames. ], batch size: 41, lr: 3.97e-03, grad_scale: 8.0 +2022-12-08 16:01:11,363 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 16:01:18,884 INFO [train.py:905] (2/4) Epoch 20, validation: loss=0.1452, simple_loss=0.1824, pruned_loss=0.05396, over 857387.00 frames. +2022-12-08 16:01:18,885 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 16:01:23,329 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1490, 2.1139, 2.3882, 1.5217, 1.6685, 2.1775, 1.3696, 2.1454], + device='cuda:2'), covar=tensor([0.0944, 0.1717, 0.0838, 0.2402, 0.2601, 0.1124, 0.3027, 0.1251], + device='cuda:2'), in_proj_covar=tensor([0.0087, 0.0104, 0.0096, 0.0102, 0.0115, 0.0092, 0.0116, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 16:01:44,100 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.89 vs. limit=2.0 +2022-12-08 16:02:15,749 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.236e+01 1.946e+02 2.466e+02 3.510e+02 7.205e+02, threshold=4.933e+02, percent-clipped=3.0 +2022-12-08 16:02:27,768 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=143756.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 16:02:47,467 INFO [train.py:873] (2/4) Epoch 20, batch 100, loss[loss=0.08701, simple_loss=0.1032, pruned_loss=0.03539, over 2647.00 frames. ], tot_loss[loss=0.1012, simple_loss=0.1393, pruned_loss=0.03151, over 886842.78 frames. ], batch size: 100, lr: 3.97e-03, grad_scale: 8.0 +2022-12-08 16:02:51,763 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.2570, 1.4877, 3.3099, 1.5982, 3.1927, 3.3786, 2.4081, 3.5969], + device='cuda:2'), covar=tensor([0.0270, 0.3115, 0.0468, 0.2386, 0.0817, 0.0458, 0.0998, 0.0212], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0155, 0.0161, 0.0168, 0.0166, 0.0178, 0.0134, 0.0152], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:03:14,892 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1816, 1.9663, 2.2560, 2.3796, 1.9625, 1.9905, 2.2080, 2.1413], + device='cuda:2'), covar=tensor([0.0337, 0.0634, 0.0359, 0.0319, 0.0596, 0.0884, 0.0507, 0.0389], + device='cuda:2'), in_proj_covar=tensor([0.0296, 0.0263, 0.0379, 0.0335, 0.0274, 0.0311, 0.0315, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:03:32,004 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=143829.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 16:03:43,925 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.291e+02 2.111e+02 2.727e+02 3.354e+02 8.934e+02, threshold=5.454e+02, percent-clipped=6.0 +2022-12-08 16:03:47,935 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.05 vs. limit=5.0 +2022-12-08 16:04:15,260 INFO [train.py:873] (2/4) Epoch 20, batch 200, loss[loss=0.156, simple_loss=0.1437, pruned_loss=0.08415, over 1258.00 frames. ], tot_loss[loss=0.1031, simple_loss=0.1404, pruned_loss=0.03291, over 1323124.17 frames. ], batch size: 100, lr: 3.97e-03, grad_scale: 8.0 +2022-12-08 16:04:45,282 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.84 vs. limit=2.0 +2022-12-08 16:05:11,776 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.198e+02 1.866e+02 2.261e+02 3.101e+02 5.918e+02, threshold=4.522e+02, percent-clipped=1.0 +2022-12-08 16:05:20,158 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1180, 2.2561, 4.9110, 4.4546, 4.3176, 5.0033, 4.6725, 4.9748], + device='cuda:2'), covar=tensor([0.1614, 0.1452, 0.0098, 0.0236, 0.0231, 0.0119, 0.0138, 0.0115], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0158, 0.0133, 0.0170, 0.0150, 0.0144, 0.0127, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 16:05:43,235 INFO [train.py:873] (2/4) Epoch 20, batch 300, loss[loss=0.09524, simple_loss=0.1373, pruned_loss=0.02658, over 14225.00 frames. ], tot_loss[loss=0.1009, simple_loss=0.1391, pruned_loss=0.03131, over 1615011.24 frames. ], batch size: 94, lr: 3.96e-03, grad_scale: 8.0 +2022-12-08 16:06:39,226 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.977e+01 2.046e+02 2.429e+02 3.312e+02 9.305e+02, threshold=4.857e+02, percent-clipped=8.0 +2022-12-08 16:06:50,792 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=144056.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 16:07:05,022 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3539, 2.8279, 4.3492, 4.5204, 4.1613, 2.5782, 4.4271, 3.4430], + device='cuda:2'), covar=tensor([0.0482, 0.1228, 0.1022, 0.0401, 0.0596, 0.2009, 0.0518, 0.0982], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0263, 0.0377, 0.0332, 0.0273, 0.0309, 0.0315, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:07:10,843 INFO [train.py:873] (2/4) Epoch 20, batch 400, loss[loss=0.1023, simple_loss=0.1403, pruned_loss=0.03217, over 6939.00 frames. ], tot_loss[loss=0.1013, simple_loss=0.1388, pruned_loss=0.03191, over 1710001.80 frames. ], batch size: 100, lr: 3.96e-03, grad_scale: 8.0 +2022-12-08 16:07:32,241 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=144104.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 16:07:42,508 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.4272, 4.5457, 4.8747, 4.1374, 4.6313, 4.8960, 1.8552, 4.3965], + device='cuda:2'), covar=tensor([0.0347, 0.0389, 0.0308, 0.0422, 0.0293, 0.0166, 0.3125, 0.0272], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0177, 0.0148, 0.0150, 0.0211, 0.0144, 0.0158, 0.0197], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 16:07:54,124 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=144129.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 16:08:07,211 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.157e+02 2.224e+02 2.595e+02 3.172e+02 7.454e+02, threshold=5.190e+02, percent-clipped=6.0 +2022-12-08 16:08:11,947 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.36 vs. limit=5.0 +2022-12-08 16:08:35,610 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.41 vs. limit=5.0 +2022-12-08 16:08:35,921 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=144177.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 16:08:37,518 INFO [train.py:873] (2/4) Epoch 20, batch 500, loss[loss=0.1122, simple_loss=0.1312, pruned_loss=0.04658, over 3895.00 frames. ], tot_loss[loss=0.1017, simple_loss=0.1392, pruned_loss=0.03212, over 1822583.50 frames. ], batch size: 100, lr: 3.96e-03, grad_scale: 4.0 +2022-12-08 16:08:53,085 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.2218, 1.3420, 1.2365, 1.4510, 1.4959, 1.0107, 1.1747, 1.1701], + device='cuda:2'), covar=tensor([0.0742, 0.0626, 0.0714, 0.0481, 0.0467, 0.0890, 0.0940, 0.0727], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0037, 0.0042, 0.0035, 0.0037, 0.0051, 0.0039, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 16:09:34,775 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.234e+02 2.083e+02 2.620e+02 3.196e+02 5.834e+02, threshold=5.241e+02, percent-clipped=3.0 +2022-12-08 16:10:05,215 INFO [train.py:873] (2/4) Epoch 20, batch 600, loss[loss=0.114, simple_loss=0.1485, pruned_loss=0.03976, over 4987.00 frames. ], tot_loss[loss=0.1022, simple_loss=0.1393, pruned_loss=0.03257, over 1830967.42 frames. ], batch size: 100, lr: 3.96e-03, grad_scale: 4.0 +2022-12-08 16:10:19,296 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1275, 2.1202, 2.3404, 2.0498, 1.9927, 1.8442, 1.8310, 1.4939], + device='cuda:2'), covar=tensor([0.0242, 0.0382, 0.0243, 0.0298, 0.0231, 0.0377, 0.0325, 0.0430], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0022, 0.0023, 0.0022, 0.0035, 0.0029, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 16:10:34,816 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8019, 4.4443, 4.3554, 4.8022, 4.4123, 4.2131, 4.7968, 4.0157], + device='cuda:2'), covar=tensor([0.0330, 0.0830, 0.0388, 0.0356, 0.0771, 0.0691, 0.0466, 0.0504], + device='cuda:2'), in_proj_covar=tensor([0.0180, 0.0274, 0.0202, 0.0199, 0.0185, 0.0160, 0.0290, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 16:11:02,325 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.523e+02 2.036e+02 2.548e+02 2.967e+02 5.900e+02, threshold=5.097e+02, percent-clipped=2.0 +2022-12-08 16:11:33,560 INFO [train.py:873] (2/4) Epoch 20, batch 700, loss[loss=0.1215, simple_loss=0.1284, pruned_loss=0.05727, over 2565.00 frames. ], tot_loss[loss=0.1026, simple_loss=0.1392, pruned_loss=0.033, over 1843569.95 frames. ], batch size: 100, lr: 3.96e-03, grad_scale: 4.0 +2022-12-08 16:11:49,102 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8070, 4.4379, 4.3644, 4.8591, 4.4572, 4.1855, 4.8552, 4.0090], + device='cuda:2'), covar=tensor([0.0394, 0.1044, 0.0426, 0.0414, 0.0848, 0.0807, 0.0536, 0.0576], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0275, 0.0203, 0.0199, 0.0185, 0.0160, 0.0291, 0.0170], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 16:12:02,233 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=144411.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:12:31,476 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.258e+02 1.983e+02 2.498e+02 3.051e+02 6.384e+02, threshold=4.995e+02, percent-clipped=1.0 +2022-12-08 16:12:47,005 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9259, 4.7456, 4.3204, 4.5791, 4.6135, 4.8384, 4.9549, 4.9141], + device='cuda:2'), covar=tensor([0.0722, 0.0425, 0.2256, 0.2686, 0.0769, 0.0771, 0.0696, 0.0723], + device='cuda:2'), in_proj_covar=tensor([0.0395, 0.0281, 0.0455, 0.0574, 0.0357, 0.0464, 0.0396, 0.0398], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:12:56,197 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=144472.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:13:02,254 INFO [train.py:873] (2/4) Epoch 20, batch 800, loss[loss=0.09599, simple_loss=0.1355, pruned_loss=0.02825, over 14233.00 frames. ], tot_loss[loss=0.1028, simple_loss=0.1394, pruned_loss=0.03309, over 1895905.66 frames. ], batch size: 80, lr: 3.96e-03, grad_scale: 8.0 +2022-12-08 16:13:58,406 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.122e+02 2.092e+02 2.630e+02 3.168e+02 5.138e+02, threshold=5.259e+02, percent-clipped=1.0 +2022-12-08 16:14:06,930 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9695, 1.9752, 1.9968, 2.0483, 1.9185, 1.7132, 1.4048, 1.7431], + device='cuda:2'), covar=tensor([0.0949, 0.0785, 0.0662, 0.0520, 0.0622, 0.1665, 0.2615, 0.0815], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0176, 0.0148, 0.0150, 0.0210, 0.0144, 0.0159, 0.0198], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 16:14:23,707 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.41 vs. limit=2.0 +2022-12-08 16:14:29,538 INFO [train.py:873] (2/4) Epoch 20, batch 900, loss[loss=0.1004, simple_loss=0.142, pruned_loss=0.02939, over 14215.00 frames. ], tot_loss[loss=0.102, simple_loss=0.139, pruned_loss=0.03256, over 1898110.90 frames. ], batch size: 69, lr: 3.96e-03, grad_scale: 8.0 +2022-12-08 16:15:05,851 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.57 vs. limit=5.0 +2022-12-08 16:15:25,549 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.82 vs. limit=5.0 +2022-12-08 16:15:27,792 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.200e+02 2.044e+02 2.378e+02 2.904e+02 6.661e+02, threshold=4.757e+02, percent-clipped=2.0 +2022-12-08 16:15:58,946 INFO [train.py:873] (2/4) Epoch 20, batch 1000, loss[loss=0.1125, simple_loss=0.1481, pruned_loss=0.0385, over 14330.00 frames. ], tot_loss[loss=0.1014, simple_loss=0.1385, pruned_loss=0.03215, over 1916428.50 frames. ], batch size: 55, lr: 3.96e-03, grad_scale: 8.0 +2022-12-08 16:16:32,189 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9244, 3.6978, 3.4680, 2.8816, 3.4660, 3.6698, 4.1150, 3.2865], + device='cuda:2'), covar=tensor([0.0514, 0.0982, 0.0719, 0.1069, 0.0680, 0.0556, 0.0550, 0.0969], + device='cuda:2'), in_proj_covar=tensor([0.0157, 0.0171, 0.0143, 0.0126, 0.0147, 0.0158, 0.0141, 0.0145], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 16:16:56,772 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.347e+02 1.902e+02 2.363e+02 2.821e+02 5.781e+02, threshold=4.726e+02, percent-clipped=3.0 +2022-12-08 16:17:17,197 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=144767.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:17:27,648 INFO [train.py:873] (2/4) Epoch 20, batch 1100, loss[loss=0.1191, simple_loss=0.1292, pruned_loss=0.0545, over 2602.00 frames. ], tot_loss[loss=0.1015, simple_loss=0.1387, pruned_loss=0.0322, over 1877462.04 frames. ], batch size: 100, lr: 3.95e-03, grad_scale: 8.0 +2022-12-08 16:17:29,490 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2606, 1.4187, 2.4660, 1.4939, 2.3527, 2.4545, 1.7700, 2.5018], + device='cuda:2'), covar=tensor([0.0415, 0.3264, 0.0578, 0.2328, 0.0836, 0.0711, 0.1530, 0.0565], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0157, 0.0162, 0.0168, 0.0167, 0.0180, 0.0133, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:18:25,976 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.093e+02 2.171e+02 2.584e+02 3.315e+02 6.802e+02, threshold=5.169e+02, percent-clipped=3.0 +2022-12-08 16:18:56,031 INFO [train.py:873] (2/4) Epoch 20, batch 1200, loss[loss=0.09576, simple_loss=0.1388, pruned_loss=0.02634, over 14497.00 frames. ], tot_loss[loss=0.1022, simple_loss=0.139, pruned_loss=0.03272, over 1867982.36 frames. ], batch size: 49, lr: 3.95e-03, grad_scale: 8.0 +2022-12-08 16:19:28,382 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8868, 1.5772, 1.8893, 1.6564, 1.9882, 1.8454, 1.6439, 1.8265], + device='cuda:2'), covar=tensor([0.0749, 0.1612, 0.0579, 0.0631, 0.0709, 0.0881, 0.0376, 0.0506], + device='cuda:2'), in_proj_covar=tensor([0.0350, 0.0308, 0.0388, 0.0298, 0.0365, 0.0321, 0.0360, 0.0295], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:19:54,172 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.060e+02 1.946e+02 2.467e+02 2.965e+02 6.907e+02, threshold=4.934e+02, percent-clipped=1.0 +2022-12-08 16:20:00,895 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1879, 2.1327, 4.1753, 2.8686, 4.0541, 2.1437, 3.2063, 4.0889], + device='cuda:2'), covar=tensor([0.0594, 0.3676, 0.0425, 0.4657, 0.0566, 0.2898, 0.1267, 0.0464], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0197, 0.0219, 0.0263, 0.0237, 0.0198, 0.0200, 0.0217], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 16:20:03,146 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4974, 3.2567, 3.1986, 3.5078, 3.2904, 3.4544, 3.5614, 2.9499], + device='cuda:2'), covar=tensor([0.0517, 0.0978, 0.0519, 0.0488, 0.0819, 0.0410, 0.0578, 0.0585], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0271, 0.0199, 0.0198, 0.0183, 0.0159, 0.0288, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 16:20:08,721 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.7974, 5.6730, 5.2237, 5.8080, 5.3861, 5.2488, 5.8450, 5.5826], + device='cuda:2'), covar=tensor([0.0511, 0.0704, 0.0840, 0.0410, 0.0610, 0.0374, 0.0489, 0.0566], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0146, 0.0149, 0.0165, 0.0151, 0.0127, 0.0170, 0.0150], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:20:23,996 INFO [train.py:873] (2/4) Epoch 20, batch 1300, loss[loss=0.1275, simple_loss=0.1426, pruned_loss=0.05619, over 3918.00 frames. ], tot_loss[loss=0.1022, simple_loss=0.1391, pruned_loss=0.03265, over 1884524.88 frames. ], batch size: 100, lr: 3.95e-03, grad_scale: 8.0 +2022-12-08 16:20:39,302 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=144996.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:20:41,707 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0287, 1.4404, 3.8767, 1.8494, 3.9593, 4.1310, 3.2082, 4.4673], + device='cuda:2'), covar=tensor([0.0252, 0.3256, 0.0466, 0.2268, 0.0452, 0.0377, 0.0710, 0.0165], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0156, 0.0162, 0.0169, 0.0167, 0.0181, 0.0133, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:21:10,358 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=145027.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:21:26,336 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.036e+02 2.136e+02 2.595e+02 3.096e+02 6.124e+02, threshold=5.190e+02, percent-clipped=3.0 +2022-12-08 16:21:36,856 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=145057.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 16:21:37,629 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4396, 1.8568, 2.3857, 2.0113, 2.4784, 2.3573, 2.2291, 2.1928], + device='cuda:2'), covar=tensor([0.0757, 0.2485, 0.0944, 0.1121, 0.0560, 0.1260, 0.0771, 0.1045], + device='cuda:2'), in_proj_covar=tensor([0.0353, 0.0310, 0.0391, 0.0301, 0.0367, 0.0323, 0.0362, 0.0297], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:21:44,337 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=145066.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:21:45,187 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=145067.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:21:55,282 INFO [train.py:873] (2/4) Epoch 20, batch 1400, loss[loss=0.09662, simple_loss=0.113, pruned_loss=0.04011, over 2654.00 frames. ], tot_loss[loss=0.1028, simple_loss=0.1394, pruned_loss=0.03307, over 1914727.50 frames. ], batch size: 100, lr: 3.95e-03, grad_scale: 8.0 +2022-12-08 16:21:59,920 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8436, 2.4848, 2.6490, 1.7999, 2.3879, 2.5646, 2.8026, 2.3839], + device='cuda:2'), covar=tensor([0.0780, 0.0658, 0.0992, 0.1361, 0.1040, 0.0955, 0.0671, 0.1321], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0170, 0.0143, 0.0126, 0.0146, 0.0158, 0.0141, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 16:22:03,042 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=145088.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:22:26,071 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=145115.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:22:35,822 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.98 vs. limit=2.0 +2022-12-08 16:22:36,122 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=145127.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:22:52,064 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.811e+01 2.056e+02 2.726e+02 3.195e+02 8.712e+02, threshold=5.453e+02, percent-clipped=3.0 +2022-12-08 16:23:20,286 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.18 vs. limit=2.0 +2022-12-08 16:23:20,738 INFO [train.py:873] (2/4) Epoch 20, batch 1500, loss[loss=0.09052, simple_loss=0.1056, pruned_loss=0.03772, over 2619.00 frames. ], tot_loss[loss=0.1026, simple_loss=0.1389, pruned_loss=0.03318, over 1864963.74 frames. ], batch size: 100, lr: 3.95e-03, grad_scale: 4.0 +2022-12-08 16:24:19,276 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.332e+01 2.024e+02 2.567e+02 3.376e+02 1.316e+03, threshold=5.133e+02, percent-clipped=5.0 +2022-12-08 16:24:25,324 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6715, 3.5951, 3.3194, 3.7232, 3.3497, 3.3055, 3.6809, 3.5774], + device='cuda:2'), covar=tensor([0.0666, 0.0979, 0.1067, 0.0658, 0.0996, 0.0660, 0.0690, 0.0876], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0149, 0.0152, 0.0167, 0.0152, 0.0129, 0.0173, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:24:48,114 INFO [train.py:873] (2/4) Epoch 20, batch 1600, loss[loss=0.09037, simple_loss=0.1137, pruned_loss=0.03352, over 3865.00 frames. ], tot_loss[loss=0.1026, simple_loss=0.1391, pruned_loss=0.03303, over 1872583.39 frames. ], batch size: 100, lr: 3.95e-03, grad_scale: 8.0 +2022-12-08 16:25:01,256 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1823, 2.0652, 1.8353, 1.9106, 2.0760, 2.1412, 2.0958, 2.1151], + device='cuda:2'), covar=tensor([0.1021, 0.0962, 0.2684, 0.2483, 0.1375, 0.1275, 0.1527, 0.1048], + device='cuda:2'), in_proj_covar=tensor([0.0395, 0.0279, 0.0454, 0.0573, 0.0357, 0.0462, 0.0396, 0.0398], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:25:38,435 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9041, 2.0202, 2.7510, 2.2179, 2.8190, 2.7342, 2.6154, 2.4187], + device='cuda:2'), covar=tensor([0.0991, 0.3010, 0.1177, 0.1775, 0.0745, 0.1180, 0.1029, 0.1716], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0307, 0.0387, 0.0298, 0.0363, 0.0320, 0.0358, 0.0294], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:25:46,272 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.072e+02 1.921e+02 2.439e+02 2.878e+02 7.278e+02, threshold=4.878e+02, percent-clipped=1.0 +2022-12-08 16:25:52,019 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=145352.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 16:26:08,283 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=145371.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:26:15,270 INFO [train.py:873] (2/4) Epoch 20, batch 1700, loss[loss=0.1254, simple_loss=0.1536, pruned_loss=0.04856, over 10327.00 frames. ], tot_loss[loss=0.1031, simple_loss=0.1394, pruned_loss=0.03337, over 1897819.53 frames. ], batch size: 100, lr: 3.95e-03, grad_scale: 8.0 +2022-12-08 16:26:18,710 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=145383.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:26:52,713 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=145422.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:27:02,004 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=145432.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:27:13,922 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.159e+01 2.091e+02 2.554e+02 3.165e+02 6.798e+02, threshold=5.109e+02, percent-clipped=6.0 +2022-12-08 16:27:25,169 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9999, 2.0417, 2.0884, 2.0722, 2.0596, 1.7316, 1.4273, 1.8580], + device='cuda:2'), covar=tensor([0.0685, 0.0588, 0.0446, 0.0393, 0.0394, 0.1248, 0.2096, 0.0507], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0177, 0.0149, 0.0151, 0.0210, 0.0144, 0.0159, 0.0198], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 16:27:34,598 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.52 vs. limit=2.0 +2022-12-08 16:27:42,941 INFO [train.py:873] (2/4) Epoch 20, batch 1800, loss[loss=0.1023, simple_loss=0.1459, pruned_loss=0.02933, over 14267.00 frames. ], tot_loss[loss=0.1016, simple_loss=0.1388, pruned_loss=0.0322, over 1950395.40 frames. ], batch size: 76, lr: 3.94e-03, grad_scale: 8.0 +2022-12-08 16:28:12,098 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.3984, 4.1804, 3.7865, 4.0643, 4.1714, 4.2981, 4.3657, 4.3523], + device='cuda:2'), covar=tensor([0.0756, 0.0462, 0.2154, 0.2388, 0.0761, 0.0772, 0.0942, 0.0777], + device='cuda:2'), in_proj_covar=tensor([0.0395, 0.0279, 0.0455, 0.0576, 0.0356, 0.0463, 0.0397, 0.0397], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:28:12,120 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9578, 4.0464, 4.2426, 3.7120, 4.0832, 4.1871, 1.6929, 3.8871], + device='cuda:2'), covar=tensor([0.0334, 0.0292, 0.0318, 0.0490, 0.0323, 0.0299, 0.3127, 0.0301], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0178, 0.0149, 0.0152, 0.0211, 0.0144, 0.0160, 0.0199], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 16:28:25,433 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.34 vs. limit=2.0 +2022-12-08 16:28:41,420 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.279e+02 2.270e+02 2.808e+02 3.331e+02 1.195e+03, threshold=5.616e+02, percent-clipped=5.0 +2022-12-08 16:29:10,284 INFO [train.py:873] (2/4) Epoch 20, batch 1900, loss[loss=0.09145, simple_loss=0.1192, pruned_loss=0.03185, over 4943.00 frames. ], tot_loss[loss=0.1015, simple_loss=0.1388, pruned_loss=0.03213, over 1977152.74 frames. ], batch size: 100, lr: 3.94e-03, grad_scale: 8.0 +2022-12-08 16:29:23,551 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=145594.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:29:44,343 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8068, 1.3144, 2.5021, 2.2818, 2.3497, 2.5264, 1.7424, 2.5288], + device='cuda:2'), covar=tensor([0.1098, 0.1572, 0.0279, 0.0515, 0.0663, 0.0316, 0.0917, 0.0328], + device='cuda:2'), in_proj_covar=tensor([0.0147, 0.0157, 0.0132, 0.0171, 0.0150, 0.0144, 0.0127, 0.0126], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 16:29:56,401 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8330, 4.4686, 4.3804, 4.8553, 4.4316, 4.2638, 4.8200, 4.1340], + device='cuda:2'), covar=tensor([0.0355, 0.0929, 0.0405, 0.0360, 0.0837, 0.0616, 0.0507, 0.0464], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0271, 0.0200, 0.0200, 0.0183, 0.0159, 0.0287, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 16:30:07,073 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=145643.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:30:09,685 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.288e+02 2.010e+02 2.453e+02 2.850e+02 4.730e+02, threshold=4.905e+02, percent-clipped=0.0 +2022-12-08 16:30:15,039 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=145652.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 16:30:17,538 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=145655.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 16:30:38,453 INFO [train.py:873] (2/4) Epoch 20, batch 2000, loss[loss=0.09402, simple_loss=0.1367, pruned_loss=0.02564, over 14238.00 frames. ], tot_loss[loss=0.1008, simple_loss=0.1384, pruned_loss=0.03163, over 2016099.03 frames. ], batch size: 89, lr: 3.94e-03, grad_scale: 8.0 +2022-12-08 16:30:42,044 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=145683.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:30:46,196 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.79 vs. limit=2.0 +2022-12-08 16:30:56,727 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=145700.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:31:00,306 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=145704.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:31:16,211 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=145722.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:31:20,716 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=145727.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:31:21,605 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7370, 2.4821, 3.0356, 1.9791, 2.0183, 2.7718, 1.6351, 2.7032], + device='cuda:2'), covar=tensor([0.0912, 0.1213, 0.0528, 0.2671, 0.1988, 0.0801, 0.2968, 0.0888], + device='cuda:2'), in_proj_covar=tensor([0.0089, 0.0103, 0.0096, 0.0102, 0.0115, 0.0093, 0.0117, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 16:31:24,000 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=145731.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:31:37,255 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.155e+02 1.878e+02 2.490e+02 3.077e+02 5.719e+02, threshold=4.981e+02, percent-clipped=2.0 +2022-12-08 16:31:52,297 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5802, 2.2453, 2.3575, 2.2731, 2.2657, 2.0512, 1.8417, 1.7806], + device='cuda:2'), covar=tensor([0.0154, 0.0338, 0.0246, 0.0311, 0.0327, 0.0464, 0.0458, 0.0452], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0022, 0.0023, 0.0023, 0.0036, 0.0030, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 16:31:58,227 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=145770.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:32:05,946 INFO [train.py:873] (2/4) Epoch 20, batch 2100, loss[loss=0.08089, simple_loss=0.1282, pruned_loss=0.0168, over 14391.00 frames. ], tot_loss[loss=0.101, simple_loss=0.1387, pruned_loss=0.03165, over 1994355.74 frames. ], batch size: 41, lr: 3.94e-03, grad_scale: 8.0 +2022-12-08 16:32:30,285 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4378, 2.2171, 2.0773, 2.3371, 2.1840, 1.4486, 1.9874, 2.1747], + device='cuda:2'), covar=tensor([0.0671, 0.0867, 0.0826, 0.0957, 0.0666, 0.0854, 0.0838, 0.0589], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0037, 0.0042, 0.0035, 0.0037, 0.0051, 0.0039, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 16:32:43,280 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=145821.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 16:33:04,938 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.307e+02 2.012e+02 2.500e+02 3.223e+02 4.888e+02, threshold=5.000e+02, percent-clipped=0.0 +2022-12-08 16:33:33,928 INFO [train.py:873] (2/4) Epoch 20, batch 2200, loss[loss=0.09836, simple_loss=0.1337, pruned_loss=0.0315, over 11133.00 frames. ], tot_loss[loss=0.1015, simple_loss=0.1387, pruned_loss=0.03212, over 1912810.43 frames. ], batch size: 100, lr: 3.94e-03, grad_scale: 8.0 +2022-12-08 16:33:36,638 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=145882.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 16:34:02,274 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.4139, 2.4239, 2.4281, 2.5504, 2.4649, 2.2386, 1.3929, 2.2248], + device='cuda:2'), covar=tensor([0.0733, 0.0689, 0.0584, 0.0421, 0.0586, 0.1286, 0.2823, 0.0619], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0177, 0.0148, 0.0150, 0.0208, 0.0144, 0.0158, 0.0197], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 16:34:32,821 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.066e+02 2.107e+02 2.702e+02 3.321e+02 7.589e+02, threshold=5.404e+02, percent-clipped=7.0 +2022-12-08 16:34:36,627 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=145950.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 16:34:53,256 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8960, 1.6480, 3.8116, 1.9778, 3.8263, 3.9534, 2.9342, 4.2167], + device='cuda:2'), covar=tensor([0.0207, 0.3110, 0.0384, 0.2111, 0.0526, 0.0391, 0.0802, 0.0173], + device='cuda:2'), in_proj_covar=tensor([0.0175, 0.0158, 0.0163, 0.0170, 0.0168, 0.0181, 0.0134, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:34:56,771 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7073, 1.4834, 2.7339, 2.4189, 2.5983, 2.7224, 1.8162, 2.7051], + device='cuda:2'), covar=tensor([0.1627, 0.1881, 0.0307, 0.0656, 0.0597, 0.0335, 0.1047, 0.0326], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0156, 0.0131, 0.0170, 0.0149, 0.0143, 0.0126, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 16:34:58,863 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.45 vs. limit=5.0 +2022-12-08 16:35:01,456 INFO [train.py:873] (2/4) Epoch 20, batch 2300, loss[loss=0.08617, simple_loss=0.1289, pruned_loss=0.0217, over 14217.00 frames. ], tot_loss[loss=0.1008, simple_loss=0.1388, pruned_loss=0.03139, over 2054833.78 frames. ], batch size: 35, lr: 3.94e-03, grad_scale: 8.0 +2022-12-08 16:35:19,656 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=145999.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:35:33,690 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8055, 4.1570, 3.6801, 3.9905, 3.0163, 4.1481, 4.0514, 2.3573], + device='cuda:2'), covar=tensor([0.0966, 0.0826, 0.1482, 0.0491, 0.0724, 0.0419, 0.0655, 0.1455], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0094, 0.0073, 0.0078, 0.0102, 0.0093, 0.0104, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0006, 0.0006, 0.0007, 0.0006, 0.0007, 0.0006], + device='cuda:2') +2022-12-08 16:35:38,446 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.33 vs. limit=2.0 +2022-12-08 16:35:40,102 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.9485, 2.4321, 3.8848, 4.0277, 3.7710, 2.2854, 3.9361, 3.0156], + device='cuda:2'), covar=tensor([0.0477, 0.1358, 0.0937, 0.0496, 0.0621, 0.2191, 0.0560, 0.1129], + device='cuda:2'), in_proj_covar=tensor([0.0296, 0.0263, 0.0378, 0.0333, 0.0273, 0.0309, 0.0314, 0.0279], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:35:44,218 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=146027.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:36:00,288 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.5113, 1.6272, 2.6612, 2.1625, 2.5573, 1.7158, 2.2263, 2.5771], + device='cuda:2'), covar=tensor([0.1577, 0.3781, 0.0731, 0.2591, 0.1401, 0.2696, 0.0979, 0.0859], + device='cuda:2'), in_proj_covar=tensor([0.0254, 0.0197, 0.0220, 0.0264, 0.0238, 0.0200, 0.0200, 0.0218], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0004, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 16:36:00,976 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.336e+02 2.169e+02 2.442e+02 2.958e+02 6.970e+02, threshold=4.884e+02, percent-clipped=2.0 +2022-12-08 16:36:26,862 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=146075.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:36:30,341 INFO [train.py:873] (2/4) Epoch 20, batch 2400, loss[loss=0.08367, simple_loss=0.1294, pruned_loss=0.01899, over 14388.00 frames. ], tot_loss[loss=0.1007, simple_loss=0.1386, pruned_loss=0.03139, over 2061020.33 frames. ], batch size: 41, lr: 3.94e-03, grad_scale: 8.0 +2022-12-08 16:36:39,314 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7246, 1.3087, 2.5064, 2.2230, 2.3549, 2.4969, 1.6688, 2.5274], + device='cuda:2'), covar=tensor([0.1130, 0.1542, 0.0281, 0.0617, 0.0633, 0.0292, 0.0924, 0.0312], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0156, 0.0131, 0.0170, 0.0149, 0.0143, 0.0126, 0.0125], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 16:37:02,096 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.05 vs. limit=5.0 +2022-12-08 16:37:29,039 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.227e+02 1.997e+02 2.339e+02 2.913e+02 8.016e+02, threshold=4.679e+02, percent-clipped=1.0 +2022-12-08 16:37:32,261 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=146149.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:37:44,122 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8830, 3.7264, 3.6279, 4.0254, 3.7476, 3.5470, 4.0548, 3.4302], + device='cuda:2'), covar=tensor([0.0669, 0.0979, 0.0536, 0.0459, 0.0785, 0.1526, 0.0539, 0.0551], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0276, 0.0203, 0.0202, 0.0186, 0.0162, 0.0290, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 16:37:56,484 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=146177.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 16:37:58,097 INFO [train.py:873] (2/4) Epoch 20, batch 2500, loss[loss=0.1075, simple_loss=0.1406, pruned_loss=0.03716, over 14256.00 frames. ], tot_loss[loss=0.1011, simple_loss=0.1388, pruned_loss=0.03168, over 2067382.44 frames. ], batch size: 80, lr: 3.93e-03, grad_scale: 8.0 +2022-12-08 16:38:26,664 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=146210.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:38:31,692 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8036, 4.6676, 4.2735, 4.4403, 4.5561, 4.7487, 4.8133, 4.7846], + device='cuda:2'), covar=tensor([0.0688, 0.0371, 0.1992, 0.2227, 0.0668, 0.0789, 0.0653, 0.0695], + device='cuda:2'), in_proj_covar=tensor([0.0394, 0.0280, 0.0451, 0.0573, 0.0357, 0.0463, 0.0396, 0.0397], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:38:43,312 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8535, 2.2759, 3.7433, 3.8835, 3.6473, 2.3266, 3.8196, 2.8391], + device='cuda:2'), covar=tensor([0.0504, 0.1455, 0.1058, 0.0558, 0.0692, 0.2152, 0.0507, 0.1266], + device='cuda:2'), in_proj_covar=tensor([0.0296, 0.0262, 0.0378, 0.0333, 0.0274, 0.0309, 0.0314, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:38:58,169 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.933e+01 2.087e+02 2.444e+02 3.038e+02 5.934e+02, threshold=4.888e+02, percent-clipped=3.0 +2022-12-08 16:39:01,820 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=146250.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:39:14,922 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8882, 4.0002, 4.0162, 3.8219, 3.8806, 4.1940, 1.5737, 3.6733], + device='cuda:2'), covar=tensor([0.0470, 0.0418, 0.0537, 0.0531, 0.0530, 0.0351, 0.3830, 0.0440], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0177, 0.0149, 0.0151, 0.0209, 0.0144, 0.0158, 0.0197], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 16:39:27,231 INFO [train.py:873] (2/4) Epoch 20, batch 2600, loss[loss=0.1036, simple_loss=0.1419, pruned_loss=0.03262, over 14239.00 frames. ], tot_loss[loss=0.1007, simple_loss=0.1383, pruned_loss=0.03153, over 2061254.53 frames. ], batch size: 94, lr: 3.93e-03, grad_scale: 8.0 +2022-12-08 16:39:43,724 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=146298.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:39:44,693 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=146299.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:40:04,351 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.35 vs. limit=2.0 +2022-12-08 16:40:25,672 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 8.546e+01 2.007e+02 2.593e+02 2.996e+02 8.778e+02, threshold=5.186e+02, percent-clipped=5.0 +2022-12-08 16:40:26,686 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=146347.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:40:32,147 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.87 vs. limit=5.0 +2022-12-08 16:40:54,845 INFO [train.py:873] (2/4) Epoch 20, batch 2700, loss[loss=0.1503, simple_loss=0.1419, pruned_loss=0.07935, over 1277.00 frames. ], tot_loss[loss=0.1005, simple_loss=0.1383, pruned_loss=0.03132, over 2054598.31 frames. ], batch size: 100, lr: 3.93e-03, grad_scale: 8.0 +2022-12-08 16:41:03,239 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.81 vs. limit=5.0 +2022-12-08 16:41:14,928 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.53 vs. limit=5.0 +2022-12-08 16:41:53,288 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.210e+02 2.024e+02 2.634e+02 3.181e+02 1.190e+03, threshold=5.268e+02, percent-clipped=3.0 +2022-12-08 16:42:11,452 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.8455, 5.6546, 5.2435, 5.8110, 5.3926, 5.2412, 5.9078, 5.5838], + device='cuda:2'), covar=tensor([0.0506, 0.0642, 0.0738, 0.0421, 0.0630, 0.0384, 0.0419, 0.0589], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0150, 0.0152, 0.0168, 0.0153, 0.0129, 0.0175, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:42:13,294 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3205, 2.5138, 2.5477, 2.5889, 2.1911, 2.5890, 2.4934, 1.5265], + device='cuda:2'), covar=tensor([0.0822, 0.1043, 0.0670, 0.0641, 0.1003, 0.0714, 0.0908, 0.1832], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0094, 0.0073, 0.0078, 0.0102, 0.0093, 0.0104, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0006, 0.0006, 0.0007, 0.0006, 0.0007, 0.0006], + device='cuda:2') +2022-12-08 16:42:20,370 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=146477.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 16:42:21,928 INFO [train.py:873] (2/4) Epoch 20, batch 2800, loss[loss=0.1099, simple_loss=0.1513, pruned_loss=0.03424, over 13917.00 frames. ], tot_loss[loss=0.1016, simple_loss=0.1391, pruned_loss=0.03202, over 2010140.15 frames. ], batch size: 23, lr: 3.93e-03, grad_scale: 8.0 +2022-12-08 16:42:44,683 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=146505.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:43:02,235 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=146525.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 16:43:06,042 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.36 vs. limit=2.0 +2022-12-08 16:43:20,289 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.384e+02 1.986e+02 2.533e+02 3.212e+02 8.104e+02, threshold=5.065e+02, percent-clipped=2.0 +2022-12-08 16:43:48,733 INFO [train.py:873] (2/4) Epoch 20, batch 2900, loss[loss=0.08834, simple_loss=0.1294, pruned_loss=0.02365, over 14420.00 frames. ], tot_loss[loss=0.1003, simple_loss=0.1383, pruned_loss=0.03118, over 2020522.21 frames. ], batch size: 24, lr: 3.93e-03, grad_scale: 8.0 +2022-12-08 16:43:59,408 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=146591.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:44:23,398 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9087, 1.8088, 2.1360, 2.0164, 1.8293, 1.7123, 1.6331, 1.4091], + device='cuda:2'), covar=tensor([0.0293, 0.0428, 0.0292, 0.0260, 0.0264, 0.0316, 0.0283, 0.0458], + device='cuda:2'), in_proj_covar=tensor([0.0024, 0.0024, 0.0022, 0.0023, 0.0022, 0.0035, 0.0029, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 16:44:30,284 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=146626.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:44:47,463 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.360e+02 2.123e+02 2.667e+02 3.296e+02 6.992e+02, threshold=5.334e+02, percent-clipped=2.0 +2022-12-08 16:44:52,711 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=146652.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:45:16,679 INFO [train.py:873] (2/4) Epoch 20, batch 3000, loss[loss=0.09009, simple_loss=0.1033, pruned_loss=0.03842, over 2592.00 frames. ], tot_loss[loss=0.1008, simple_loss=0.1385, pruned_loss=0.03158, over 2047926.36 frames. ], batch size: 100, lr: 3.93e-03, grad_scale: 8.0 +2022-12-08 16:45:16,679 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 16:45:25,135 INFO [train.py:905] (2/4) Epoch 20, validation: loss=0.1444, simple_loss=0.1794, pruned_loss=0.05469, over 857387.00 frames. +2022-12-08 16:45:25,136 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 16:45:27,216 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=7.45 vs. limit=5.0 +2022-12-08 16:45:32,112 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=146687.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 16:45:58,748 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.50 vs. limit=5.0 +2022-12-08 16:46:06,180 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2370, 1.8430, 2.2592, 1.6285, 1.9268, 2.2436, 2.1309, 1.9882], + device='cuda:2'), covar=tensor([0.0907, 0.0670, 0.0937, 0.1263, 0.1400, 0.1079, 0.0757, 0.1305], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0167, 0.0140, 0.0126, 0.0144, 0.0156, 0.0139, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 16:46:23,235 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.146e+02 2.030e+02 2.519e+02 3.160e+02 5.604e+02, threshold=5.039e+02, percent-clipped=1.0 +2022-12-08 16:46:27,163 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.47 vs. limit=2.0 +2022-12-08 16:46:42,121 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.62 vs. limit=5.0 +2022-12-08 16:46:52,204 INFO [train.py:873] (2/4) Epoch 20, batch 3100, loss[loss=0.08328, simple_loss=0.1267, pruned_loss=0.01994, over 14660.00 frames. ], tot_loss[loss=0.1021, simple_loss=0.1391, pruned_loss=0.03255, over 1944516.97 frames. ], batch size: 22, lr: 3.93e-03, grad_scale: 8.0 +2022-12-08 16:47:02,372 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3600, 2.7121, 2.7166, 2.7163, 2.2521, 2.7263, 2.6089, 1.5563], + device='cuda:2'), covar=tensor([0.0800, 0.0638, 0.0547, 0.0553, 0.0960, 0.0617, 0.0783, 0.1742], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0094, 0.0073, 0.0078, 0.0102, 0.0093, 0.0104, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0006, 0.0006, 0.0007, 0.0006, 0.0007, 0.0006], + device='cuda:2') +2022-12-08 16:47:14,755 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=146805.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:47:44,513 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9825, 4.9970, 5.3777, 4.3160, 5.1966, 5.4971, 2.1780, 4.8868], + device='cuda:2'), covar=tensor([0.0305, 0.0361, 0.0288, 0.0542, 0.0281, 0.0146, 0.2878, 0.0284], + device='cuda:2'), in_proj_covar=tensor([0.0176, 0.0177, 0.0148, 0.0151, 0.0210, 0.0143, 0.0158, 0.0196], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 16:47:45,357 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1489, 2.0766, 1.8371, 1.9306, 2.0969, 2.1442, 2.0962, 2.1046], + device='cuda:2'), covar=tensor([0.1219, 0.0970, 0.2692, 0.2470, 0.1425, 0.1339, 0.1701, 0.1142], + device='cuda:2'), in_proj_covar=tensor([0.0398, 0.0283, 0.0453, 0.0574, 0.0359, 0.0465, 0.0398, 0.0399], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:47:48,870 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=146844.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:47:50,641 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.355e+02 2.017e+02 2.599e+02 3.064e+02 9.305e+02, threshold=5.198e+02, percent-clipped=5.0 +2022-12-08 16:47:52,057 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7301, 2.3426, 3.7074, 3.8587, 3.6116, 2.2764, 3.6872, 2.8191], + device='cuda:2'), covar=tensor([0.0466, 0.1390, 0.0963, 0.0520, 0.0611, 0.2095, 0.0534, 0.1182], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0262, 0.0374, 0.0332, 0.0272, 0.0309, 0.0313, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:47:56,847 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=146853.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:48:12,364 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.71 vs. limit=2.0 +2022-12-08 16:48:19,590 INFO [train.py:873] (2/4) Epoch 20, batch 3200, loss[loss=0.1023, simple_loss=0.1447, pruned_loss=0.02995, over 14507.00 frames. ], tot_loss[loss=0.1021, simple_loss=0.1392, pruned_loss=0.03246, over 1910009.21 frames. ], batch size: 49, lr: 3.93e-03, grad_scale: 8.0 +2022-12-08 16:48:34,937 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.37 vs. limit=2.0 +2022-12-08 16:48:36,691 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.33 vs. limit=2.0 +2022-12-08 16:48:43,314 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=146905.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:49:07,554 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.28 vs. limit=5.0 +2022-12-08 16:49:18,825 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 2.042e+02 2.442e+02 3.021e+02 7.374e+02, threshold=4.884e+02, percent-clipped=4.0 +2022-12-08 16:49:19,700 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=146947.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:49:47,632 INFO [train.py:873] (2/4) Epoch 20, batch 3300, loss[loss=0.07985, simple_loss=0.1144, pruned_loss=0.02268, over 3898.00 frames. ], tot_loss[loss=0.101, simple_loss=0.1384, pruned_loss=0.03176, over 1973819.26 frames. ], batch size: 100, lr: 3.92e-03, grad_scale: 8.0 +2022-12-08 16:49:50,219 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=146982.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 16:49:55,841 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.62 vs. limit=2.0 +2022-12-08 16:50:47,403 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.340e+02 2.122e+02 2.526e+02 3.271e+02 7.839e+02, threshold=5.052e+02, percent-clipped=6.0 +2022-12-08 16:50:49,510 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.13 vs. limit=2.0 +2022-12-08 16:50:55,743 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9479, 2.0422, 4.8145, 4.3426, 4.1882, 4.8748, 4.5931, 4.9093], + device='cuda:2'), covar=tensor([0.1654, 0.1542, 0.0100, 0.0222, 0.0259, 0.0132, 0.0125, 0.0102], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0157, 0.0131, 0.0170, 0.0149, 0.0143, 0.0125, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 16:50:59,270 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8459, 2.4865, 2.4727, 2.3620, 2.4720, 1.5629, 2.3972, 2.8360], + device='cuda:2'), covar=tensor([0.0726, 0.0567, 0.0670, 0.1487, 0.0786, 0.0719, 0.0691, 0.0495], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0037, 0.0042, 0.0035, 0.0037, 0.0051, 0.0039, 0.0041], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 16:51:04,945 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6029, 3.3319, 3.1421, 2.5236, 2.9734, 3.2582, 3.4839, 2.9801], + device='cuda:2'), covar=tensor([0.0525, 0.0654, 0.0708, 0.0906, 0.0905, 0.0643, 0.0628, 0.0853], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0167, 0.0140, 0.0125, 0.0144, 0.0155, 0.0139, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 16:51:14,965 INFO [train.py:873] (2/4) Epoch 20, batch 3400, loss[loss=0.1094, simple_loss=0.1482, pruned_loss=0.03535, over 14166.00 frames. ], tot_loss[loss=0.1016, simple_loss=0.1387, pruned_loss=0.03224, over 1954851.19 frames. ], batch size: 99, lr: 3.92e-03, grad_scale: 4.0 +2022-12-08 16:52:16,823 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.131e+02 1.939e+02 2.278e+02 2.850e+02 6.486e+02, threshold=4.556e+02, percent-clipped=1.0 +2022-12-08 16:52:45,347 INFO [train.py:873] (2/4) Epoch 20, batch 3500, loss[loss=0.1592, simple_loss=0.1448, pruned_loss=0.08681, over 1229.00 frames. ], tot_loss[loss=0.1016, simple_loss=0.1387, pruned_loss=0.03222, over 1962431.20 frames. ], batch size: 100, lr: 3.92e-03, grad_scale: 4.0 +2022-12-08 16:53:03,636 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=147200.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:53:11,761 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1756, 2.0791, 1.8602, 1.9560, 2.0883, 2.1455, 2.0752, 2.1123], + device='cuda:2'), covar=tensor([0.1234, 0.0967, 0.2715, 0.2532, 0.1329, 0.1302, 0.1964, 0.1129], + device='cuda:2'), in_proj_covar=tensor([0.0395, 0.0285, 0.0453, 0.0572, 0.0357, 0.0466, 0.0401, 0.0398], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:53:20,218 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=6.95 vs. limit=5.0 +2022-12-08 16:53:39,356 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=147241.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:53:44,667 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.154e+02 2.013e+02 2.574e+02 3.119e+02 7.865e+02, threshold=5.147e+02, percent-clipped=11.0 +2022-12-08 16:53:44,811 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=147247.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:54:12,621 INFO [train.py:873] (2/4) Epoch 20, batch 3600, loss[loss=0.0919, simple_loss=0.1363, pruned_loss=0.02375, over 14565.00 frames. ], tot_loss[loss=0.1012, simple_loss=0.1384, pruned_loss=0.03199, over 1967715.71 frames. ], batch size: 34, lr: 3.92e-03, grad_scale: 8.0 +2022-12-08 16:54:15,323 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=147282.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:54:26,919 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=147295.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:54:28,482 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=2.47 vs. limit=5.0 +2022-12-08 16:54:34,061 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=147302.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:54:40,874 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=147310.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:54:58,636 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=147330.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:55:14,168 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.052e+02 2.112e+02 2.582e+02 3.106e+02 5.993e+02, threshold=5.164e+02, percent-clipped=4.0 +2022-12-08 16:55:27,218 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.6160, 1.4320, 3.6624, 1.7785, 3.5251, 3.7105, 2.7573, 3.9531], + device='cuda:2'), covar=tensor([0.0260, 0.3193, 0.0452, 0.2250, 0.0679, 0.0442, 0.0851, 0.0223], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0157, 0.0162, 0.0171, 0.0167, 0.0180, 0.0134, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:55:29,289 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.42 vs. limit=2.0 +2022-12-08 16:55:34,655 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=147371.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:55:41,609 INFO [train.py:873] (2/4) Epoch 20, batch 3700, loss[loss=0.1199, simple_loss=0.1408, pruned_loss=0.04946, over 3903.00 frames. ], tot_loss[loss=0.1018, simple_loss=0.1387, pruned_loss=0.03249, over 1921355.97 frames. ], batch size: 100, lr: 3.92e-03, grad_scale: 4.0 +2022-12-08 16:56:06,143 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=147407.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:56:18,488 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8139, 1.5359, 2.0339, 1.6248, 1.9121, 1.4791, 1.6603, 1.9040], + device='cuda:2'), covar=tensor([0.3383, 0.2883, 0.0762, 0.2036, 0.1692, 0.1712, 0.1415, 0.1306], + device='cuda:2'), in_proj_covar=tensor([0.0253, 0.0198, 0.0222, 0.0266, 0.0239, 0.0200, 0.0200, 0.0221], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 16:56:29,353 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=147434.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:56:41,247 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.389e+02 2.266e+02 2.765e+02 3.309e+02 4.888e+02, threshold=5.530e+02, percent-clipped=0.0 +2022-12-08 16:56:58,654 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=147468.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:57:07,330 INFO [train.py:873] (2/4) Epoch 20, batch 3800, loss[loss=0.08811, simple_loss=0.1314, pruned_loss=0.02241, over 14222.00 frames. ], tot_loss[loss=0.1024, simple_loss=0.1393, pruned_loss=0.03272, over 1968586.46 frames. ], batch size: 94, lr: 3.92e-03, grad_scale: 4.0 +2022-12-08 16:57:21,653 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=147495.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:57:25,952 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=147500.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:57:33,198 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2537, 2.0225, 2.3103, 2.3920, 2.1627, 1.9900, 2.3373, 2.0825], + device='cuda:2'), covar=tensor([0.0490, 0.1227, 0.0612, 0.0546, 0.0798, 0.1452, 0.0607, 0.0786], + device='cuda:2'), in_proj_covar=tensor([0.0295, 0.0262, 0.0375, 0.0332, 0.0272, 0.0309, 0.0315, 0.0280], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:57:41,124 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.14 vs. limit=2.0 +2022-12-08 16:58:07,481 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 9.824e+01 2.099e+02 2.660e+02 3.307e+02 5.236e+02, threshold=5.319e+02, percent-clipped=0.0 +2022-12-08 16:58:07,569 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=147548.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:58:12,890 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.7879, 2.9101, 4.5918, 3.5972, 4.6386, 4.4973, 4.4163, 4.0602], + device='cuda:2'), covar=tensor([0.0688, 0.2951, 0.0776, 0.1573, 0.0647, 0.0891, 0.1451, 0.1537], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0306, 0.0389, 0.0297, 0.0365, 0.0320, 0.0358, 0.0293], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 16:58:25,693 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5462, 3.6460, 3.7952, 3.4926, 3.7143, 3.7043, 1.6213, 3.4925], + device='cuda:2'), covar=tensor([0.0355, 0.0388, 0.0353, 0.0513, 0.0310, 0.0413, 0.3011, 0.0331], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0179, 0.0151, 0.0153, 0.0212, 0.0146, 0.0159, 0.0199], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 16:58:34,641 INFO [train.py:873] (2/4) Epoch 20, batch 3900, loss[loss=0.09308, simple_loss=0.1368, pruned_loss=0.02465, over 13965.00 frames. ], tot_loss[loss=0.1012, simple_loss=0.1387, pruned_loss=0.03185, over 1959230.96 frames. ], batch size: 23, lr: 3.92e-03, grad_scale: 4.0 +2022-12-08 16:58:50,303 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=147597.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:58:54,879 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.7471, 5.5211, 5.1564, 5.7457, 5.3086, 5.0166, 5.7784, 5.4512], + device='cuda:2'), covar=tensor([0.0566, 0.0704, 0.0679, 0.0396, 0.0610, 0.0463, 0.0461, 0.0698], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0150, 0.0152, 0.0166, 0.0153, 0.0129, 0.0175, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 16:58:55,048 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.9764, 2.7324, 4.9797, 3.4277, 4.7684, 2.3546, 3.8758, 4.7497], + device='cuda:2'), covar=tensor([0.0366, 0.2854, 0.0266, 0.4306, 0.0366, 0.2886, 0.0981, 0.0302], + device='cuda:2'), in_proj_covar=tensor([0.0252, 0.0197, 0.0222, 0.0264, 0.0239, 0.0199, 0.0200, 0.0220], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 16:59:15,779 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.51 vs. limit=2.0 +2022-12-08 16:59:24,697 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8672, 3.4729, 3.2751, 2.2916, 3.2709, 3.4652, 3.9154, 3.1337], + device='cuda:2'), covar=tensor([0.0477, 0.0929, 0.0834, 0.1403, 0.0764, 0.0661, 0.0699, 0.0970], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0168, 0.0141, 0.0126, 0.0145, 0.0157, 0.0140, 0.0144], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 16:59:35,033 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.207e+02 1.905e+02 2.298e+02 3.086e+02 1.551e+03, threshold=4.597e+02, percent-clipped=3.0 +2022-12-08 16:59:45,257 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=147659.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 16:59:51,012 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=147666.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:00:01,238 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5695, 3.2260, 3.1043, 2.1479, 2.9228, 3.1784, 3.4791, 2.8652], + device='cuda:2'), covar=tensor([0.0514, 0.0686, 0.0702, 0.1157, 0.0785, 0.0648, 0.0763, 0.0936], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0167, 0.0141, 0.0125, 0.0145, 0.0157, 0.0139, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 17:00:01,897 INFO [train.py:873] (2/4) Epoch 20, batch 4000, loss[loss=0.09431, simple_loss=0.1357, pruned_loss=0.02645, over 14539.00 frames. ], tot_loss[loss=0.1006, simple_loss=0.1381, pruned_loss=0.03159, over 1975835.31 frames. ], batch size: 49, lr: 3.91e-03, grad_scale: 8.0 +2022-12-08 17:00:37,409 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=147720.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:01:01,456 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.046e+02 2.183e+02 2.642e+02 3.333e+02 1.015e+03, threshold=5.284e+02, percent-clipped=7.0 +2022-12-08 17:01:14,055 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=147763.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:01:27,729 INFO [train.py:873] (2/4) Epoch 20, batch 4100, loss[loss=0.1128, simple_loss=0.1364, pruned_loss=0.0446, over 5023.00 frames. ], tot_loss[loss=0.1019, simple_loss=0.139, pruned_loss=0.03236, over 1963879.47 frames. ], batch size: 100, lr: 3.91e-03, grad_scale: 8.0 +2022-12-08 17:01:37,715 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=147790.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:01:59,433 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=147815.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:02:27,883 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.192e+02 2.007e+02 2.574e+02 3.284e+02 8.087e+02, threshold=5.147e+02, percent-clipped=4.0 +2022-12-08 17:02:36,116 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=147857.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:02:52,206 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=147876.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:02:54,612 INFO [train.py:873] (2/4) Epoch 20, batch 4200, loss[loss=0.1686, simple_loss=0.1615, pruned_loss=0.08784, over 1211.00 frames. ], tot_loss[loss=0.1014, simple_loss=0.1386, pruned_loss=0.03204, over 1980560.10 frames. ], batch size: 100, lr: 3.91e-03, grad_scale: 8.0 +2022-12-08 17:03:10,441 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=147897.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:03:22,117 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1014, 1.4248, 3.1963, 1.6970, 3.1392, 3.2386, 2.4314, 3.4150], + device='cuda:2'), covar=tensor([0.0296, 0.2950, 0.0461, 0.1988, 0.0689, 0.0449, 0.0952, 0.0236], + device='cuda:2'), in_proj_covar=tensor([0.0174, 0.0157, 0.0161, 0.0170, 0.0167, 0.0181, 0.0133, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:03:22,162 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=147911.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:03:28,191 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=147918.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:03:52,200 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=147945.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:03:55,540 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.345e+02 1.943e+02 2.282e+02 3.039e+02 5.429e+02, threshold=4.564e+02, percent-clipped=1.0 +2022-12-08 17:04:09,964 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=147966.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:04:15,521 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=147972.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:04:21,717 INFO [train.py:873] (2/4) Epoch 20, batch 4300, loss[loss=0.1077, simple_loss=0.1444, pruned_loss=0.03549, over 11178.00 frames. ], tot_loss[loss=0.1012, simple_loss=0.1388, pruned_loss=0.03175, over 2081781.82 frames. ], batch size: 100, lr: 3.91e-03, grad_scale: 4.0 +2022-12-08 17:04:52,071 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=148014.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:04:52,937 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=148015.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:05:22,415 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.032e+02 2.071e+02 2.502e+02 3.240e+02 4.957e+02, threshold=5.004e+02, percent-clipped=3.0 +2022-12-08 17:05:35,028 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=148063.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:05:48,148 INFO [train.py:873] (2/4) Epoch 20, batch 4400, loss[loss=0.1163, simple_loss=0.1232, pruned_loss=0.05468, over 1202.00 frames. ], tot_loss[loss=0.1005, simple_loss=0.1383, pruned_loss=0.03138, over 2027485.21 frames. ], batch size: 100, lr: 3.91e-03, grad_scale: 8.0 +2022-12-08 17:05:50,384 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.55 vs. limit=2.0 +2022-12-08 17:05:58,157 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=148090.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:06:16,504 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=148111.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:06:32,559 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3386, 1.8741, 3.4204, 2.4608, 3.2997, 2.0052, 2.6367, 3.3247], + device='cuda:2'), covar=tensor([0.0828, 0.3919, 0.0653, 0.4049, 0.0827, 0.2930, 0.1312, 0.0583], + device='cuda:2'), in_proj_covar=tensor([0.0251, 0.0197, 0.0221, 0.0265, 0.0239, 0.0199, 0.0200, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 17:06:39,819 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=148138.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:06:49,733 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.267e+02 2.050e+02 2.451e+02 3.050e+02 8.671e+02, threshold=4.901e+02, percent-clipped=4.0 +2022-12-08 17:07:08,034 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=148171.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:07:15,618 INFO [train.py:873] (2/4) Epoch 20, batch 4500, loss[loss=0.09317, simple_loss=0.1393, pruned_loss=0.0235, over 14003.00 frames. ], tot_loss[loss=0.1, simple_loss=0.1378, pruned_loss=0.03109, over 1978149.55 frames. ], batch size: 22, lr: 3.91e-03, grad_scale: 4.0 +2022-12-08 17:07:18,972 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.1069, 1.3360, 1.2029, 1.0946, 0.8665, 1.0411, 0.8960, 1.1712], + device='cuda:2'), covar=tensor([0.2027, 0.2351, 0.1423, 0.2205, 0.3077, 0.1430, 0.1749, 0.1428], + device='cuda:2'), in_proj_covar=tensor([0.0088, 0.0103, 0.0097, 0.0101, 0.0116, 0.0093, 0.0116, 0.0097], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 17:07:40,992 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1265, 1.9834, 2.1004, 2.1726, 2.0699, 2.0375, 2.2031, 1.8931], + device='cuda:2'), covar=tensor([0.1090, 0.1397, 0.0848, 0.0820, 0.1018, 0.0812, 0.0864, 0.0750], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0276, 0.0204, 0.0201, 0.0185, 0.0162, 0.0291, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 17:07:45,038 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=148213.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:08:17,145 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.374e+02 2.029e+02 2.580e+02 3.112e+02 6.429e+02, threshold=5.160e+02, percent-clipped=2.0 +2022-12-08 17:08:32,468 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=148267.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:08:42,533 INFO [train.py:873] (2/4) Epoch 20, batch 4600, loss[loss=0.1018, simple_loss=0.1382, pruned_loss=0.03275, over 7803.00 frames. ], tot_loss[loss=0.1005, simple_loss=0.1379, pruned_loss=0.03157, over 1944457.88 frames. ], batch size: 100, lr: 3.91e-03, grad_scale: 4.0 +2022-12-08 17:08:49,640 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=148287.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:09:13,690 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=148315.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:09:42,681 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=148348.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:09:44,261 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.292e+02 2.179e+02 2.688e+02 3.360e+02 6.935e+02, threshold=5.375e+02, percent-clipped=4.0 +2022-12-08 17:09:55,123 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=148363.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:10:09,364 INFO [train.py:873] (2/4) Epoch 20, batch 4700, loss[loss=0.09362, simple_loss=0.1304, pruned_loss=0.0284, over 6964.00 frames. ], tot_loss[loss=0.1015, simple_loss=0.1387, pruned_loss=0.03221, over 1906226.70 frames. ], batch size: 100, lr: 3.91e-03, grad_scale: 4.0 +2022-12-08 17:10:47,659 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.6062, 2.4048, 2.5735, 2.6492, 2.4291, 1.6800, 2.4447, 2.5806], + device='cuda:2'), covar=tensor([0.1084, 0.1031, 0.0587, 0.0820, 0.1573, 0.0688, 0.0967, 0.0481], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0038, 0.0042, 0.0035, 0.0038, 0.0052, 0.0039, 0.0042], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 17:10:52,668 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=148429.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:11:10,544 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 2.097e+02 2.538e+02 3.093e+02 5.864e+02, threshold=5.077e+02, percent-clipped=2.0 +2022-12-08 17:11:28,991 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=148471.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:11:35,538 INFO [train.py:873] (2/4) Epoch 20, batch 4800, loss[loss=0.1104, simple_loss=0.1215, pruned_loss=0.04966, over 2636.00 frames. ], tot_loss[loss=0.1009, simple_loss=0.1382, pruned_loss=0.03181, over 1953610.27 frames. ], batch size: 100, lr: 3.90e-03, grad_scale: 8.0 +2022-12-08 17:11:45,471 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=148490.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:11:52,418 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=148498.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:12:05,545 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=148513.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:12:10,308 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=148519.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:12:11,186 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7953, 3.5657, 3.4140, 3.8206, 3.5273, 3.7466, 3.8558, 3.2055], + device='cuda:2'), covar=tensor([0.0534, 0.0939, 0.0560, 0.0471, 0.0812, 0.0340, 0.0545, 0.0547], + device='cuda:2'), in_proj_covar=tensor([0.0181, 0.0276, 0.0205, 0.0202, 0.0186, 0.0163, 0.0292, 0.0171], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 17:12:37,249 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.358e+02 2.096e+02 2.575e+02 3.014e+02 5.003e+02, threshold=5.149e+02, percent-clipped=0.0 +2022-12-08 17:12:45,177 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3798, 1.4885, 2.5148, 1.6258, 2.4843, 2.4663, 1.8903, 2.6491], + device='cuda:2'), covar=tensor([0.0302, 0.2450, 0.0452, 0.1655, 0.0520, 0.0625, 0.1339, 0.0319], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0157, 0.0161, 0.0169, 0.0166, 0.0181, 0.0133, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:12:45,234 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=148559.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:12:46,821 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=148561.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:12:52,324 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=148567.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:12:57,314 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7246, 3.5368, 3.3043, 3.4350, 3.6506, 3.6605, 3.7022, 3.7284], + device='cuda:2'), covar=tensor([0.0934, 0.0657, 0.1947, 0.2499, 0.0858, 0.0937, 0.1071, 0.0878], + device='cuda:2'), in_proj_covar=tensor([0.0401, 0.0286, 0.0457, 0.0577, 0.0366, 0.0468, 0.0401, 0.0402], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0005, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:13:02,973 INFO [train.py:873] (2/4) Epoch 20, batch 4900, loss[loss=0.1104, simple_loss=0.1338, pruned_loss=0.04352, over 4974.00 frames. ], tot_loss[loss=0.1008, simple_loss=0.1383, pruned_loss=0.03161, over 1976056.10 frames. ], batch size: 100, lr: 3.90e-03, grad_scale: 4.0 +2022-12-08 17:13:33,596 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=148615.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:13:40,209 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1892, 3.5918, 2.8553, 4.3486, 4.1313, 4.2006, 3.6343, 3.1078], + device='cuda:2'), covar=tensor([0.0649, 0.1281, 0.3023, 0.0566, 0.0663, 0.1098, 0.1106, 0.2384], + device='cuda:2'), in_proj_covar=tensor([0.0283, 0.0287, 0.0258, 0.0292, 0.0322, 0.0306, 0.0257, 0.0242], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:13:48,211 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.63 vs. limit=2.0 +2022-12-08 17:13:57,719 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=148643.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:14:04,643 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.369e+02 1.967e+02 2.434e+02 3.108e+02 5.007e+02, threshold=4.868e+02, percent-clipped=0.0 +2022-12-08 17:14:04,879 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=148651.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:14:21,831 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8853, 1.5100, 1.9056, 1.3558, 1.7159, 2.0532, 1.8342, 1.7493], + device='cuda:2'), covar=tensor([0.0945, 0.0705, 0.0871, 0.1257, 0.1635, 0.0913, 0.0839, 0.1727], + device='cuda:2'), in_proj_covar=tensor([0.0154, 0.0168, 0.0140, 0.0126, 0.0145, 0.0157, 0.0139, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 17:14:26,963 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1859, 2.2261, 2.1361, 2.2107, 2.1201, 1.5383, 1.7167, 2.1176], + device='cuda:2'), covar=tensor([0.0892, 0.0652, 0.0603, 0.1504, 0.0875, 0.0712, 0.1176, 0.0574], + device='cuda:2'), in_proj_covar=tensor([0.0039, 0.0038, 0.0042, 0.0035, 0.0037, 0.0051, 0.0040, 0.0042], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002], + device='cuda:2') +2022-12-08 17:14:29,320 INFO [train.py:873] (2/4) Epoch 20, batch 5000, loss[loss=0.119, simple_loss=0.1509, pruned_loss=0.04356, over 13521.00 frames. ], tot_loss[loss=0.1015, simple_loss=0.139, pruned_loss=0.03198, over 1980299.08 frames. ], batch size: 100, lr: 3.90e-03, grad_scale: 4.0 +2022-12-08 17:14:52,854 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8243, 1.4430, 2.5159, 2.2647, 2.3632, 2.5304, 1.8427, 2.5493], + device='cuda:2'), covar=tensor([0.1057, 0.1362, 0.0250, 0.0561, 0.0581, 0.0305, 0.0794, 0.0283], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0157, 0.0132, 0.0170, 0.0149, 0.0142, 0.0126, 0.0124], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 17:14:58,870 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=148712.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:15:02,398 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=148716.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:15:11,152 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 17:15:26,987 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.5213, 1.5841, 1.5515, 1.3477, 1.2973, 1.3265, 1.2771, 1.1062], + device='cuda:2'), covar=tensor([0.0220, 0.0259, 0.0215, 0.0255, 0.0260, 0.0428, 0.0289, 0.0420], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0024, 0.0022, 0.0024, 0.0023, 0.0036, 0.0030, 0.0035], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 17:15:29,559 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3035, 1.8101, 2.2481, 1.9370, 2.3246, 2.1808, 2.1128, 2.1253], + device='cuda:2'), covar=tensor([0.0681, 0.2470, 0.0879, 0.1358, 0.0668, 0.1264, 0.0842, 0.1082], + device='cuda:2'), in_proj_covar=tensor([0.0346, 0.0305, 0.0387, 0.0295, 0.0363, 0.0319, 0.0355, 0.0292], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:15:31,427 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.8104, 3.3671, 2.7482, 3.9356, 3.7902, 3.8232, 3.2983, 2.7096], + device='cuda:2'), covar=tensor([0.0750, 0.1290, 0.2771, 0.0560, 0.0797, 0.0927, 0.1171, 0.2807], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0285, 0.0257, 0.0291, 0.0322, 0.0303, 0.0256, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:15:33,633 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.005e+02 2.005e+02 2.534e+02 3.099e+02 5.530e+02, threshold=5.067e+02, percent-clipped=2.0 +2022-12-08 17:15:52,213 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.44 vs. limit=2.0 +2022-12-08 17:15:54,085 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.89 vs. limit=5.0 +2022-12-08 17:15:56,314 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=148777.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:15:58,227 INFO [train.py:873] (2/4) Epoch 20, batch 5100, loss[loss=0.1277, simple_loss=0.1232, pruned_loss=0.0661, over 1256.00 frames. ], tot_loss[loss=0.1013, simple_loss=0.1384, pruned_loss=0.03207, over 1942217.79 frames. ], batch size: 100, lr: 3.90e-03, grad_scale: 4.0 +2022-12-08 17:16:01,160 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1926, 4.3547, 4.5329, 4.0545, 4.3918, 4.6864, 1.8788, 4.1086], + device='cuda:2'), covar=tensor([0.0447, 0.0467, 0.0486, 0.0497, 0.0423, 0.0228, 0.3363, 0.0391], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0178, 0.0149, 0.0152, 0.0210, 0.0144, 0.0158, 0.0198], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 17:16:03,507 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([5.3300, 5.2427, 4.8960, 5.3550, 4.9869, 4.6958, 5.3971, 5.0738], + device='cuda:2'), covar=tensor([0.0581, 0.0810, 0.0826, 0.0507, 0.0665, 0.0501, 0.0565, 0.0785], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0151, 0.0152, 0.0169, 0.0153, 0.0130, 0.0176, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 17:16:03,540 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=148785.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:16:46,098 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.80 vs. limit=2.0 +2022-12-08 17:17:01,005 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.416e+02 2.285e+02 2.852e+02 3.620e+02 6.279e+02, threshold=5.703e+02, percent-clipped=4.0 +2022-12-08 17:17:03,878 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=148854.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:17:25,637 INFO [train.py:873] (2/4) Epoch 20, batch 5200, loss[loss=0.09639, simple_loss=0.1328, pruned_loss=0.03001, over 14225.00 frames. ], tot_loss[loss=0.1019, simple_loss=0.1389, pruned_loss=0.03242, over 1933671.74 frames. ], batch size: 57, lr: 3.90e-03, grad_scale: 8.0 +2022-12-08 17:17:30,966 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.10 vs. limit=2.0 +2022-12-08 17:18:21,950 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=148943.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:18:29,832 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.154e+02 1.963e+02 2.482e+02 3.012e+02 7.254e+02, threshold=4.965e+02, percent-clipped=1.0 +2022-12-08 17:18:48,602 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0200, 3.8559, 3.5282, 2.8056, 3.4150, 3.6992, 4.1026, 3.3920], + device='cuda:2'), covar=tensor([0.0584, 0.0867, 0.0779, 0.1058, 0.0880, 0.0516, 0.0701, 0.0994], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0168, 0.0141, 0.0126, 0.0146, 0.0157, 0.0140, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 17:18:52,371 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2061, 2.0790, 2.1823, 2.2788, 2.1818, 1.9779, 1.8178, 1.9846], + device='cuda:2'), covar=tensor([0.0308, 0.0409, 0.0366, 0.0338, 0.0297, 0.0574, 0.0611, 0.0618], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0024, 0.0022, 0.0024, 0.0023, 0.0036, 0.0030, 0.0035], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 17:18:53,064 INFO [train.py:873] (2/4) Epoch 20, batch 5300, loss[loss=0.09402, simple_loss=0.1403, pruned_loss=0.02388, over 14087.00 frames. ], tot_loss[loss=0.1004, simple_loss=0.1379, pruned_loss=0.03149, over 1983265.88 frames. ], batch size: 29, lr: 3.90e-03, grad_scale: 4.0 +2022-12-08 17:19:03,314 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=148991.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:19:10,144 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.8267, 2.7709, 2.6762, 2.9360, 2.5403, 2.6891, 2.8739, 2.7926], + device='cuda:2'), covar=tensor([0.0749, 0.1260, 0.1007, 0.0760, 0.1271, 0.0699, 0.0786, 0.0933], + device='cuda:2'), in_proj_covar=tensor([0.0150, 0.0152, 0.0153, 0.0169, 0.0154, 0.0131, 0.0177, 0.0156], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 17:19:16,982 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=149007.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:19:27,100 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.1519, 2.2317, 3.0566, 2.3813, 3.0568, 3.0181, 2.8892, 2.5370], + device='cuda:2'), covar=tensor([0.1050, 0.2710, 0.1304, 0.1868, 0.0938, 0.1063, 0.1215, 0.1795], + device='cuda:2'), in_proj_covar=tensor([0.0344, 0.0301, 0.0384, 0.0294, 0.0360, 0.0315, 0.0353, 0.0289], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:19:57,355 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.142e+02 2.154e+02 2.538e+02 2.985e+02 9.704e+02, threshold=5.077e+02, percent-clipped=4.0 +2022-12-08 17:19:59,550 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.7446, 1.6914, 1.5894, 1.8933, 1.7480, 1.7649, 1.7846, 1.6212], + device='cuda:2'), covar=tensor([0.1326, 0.1092, 0.1918, 0.0951, 0.1255, 0.0898, 0.1430, 0.1173], + device='cuda:2'), in_proj_covar=tensor([0.0281, 0.0285, 0.0257, 0.0290, 0.0322, 0.0303, 0.0255, 0.0241], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:20:12,417 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.83 vs. limit=2.0 +2022-12-08 17:20:14,471 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=149072.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:20:20,462 INFO [train.py:873] (2/4) Epoch 20, batch 5400, loss[loss=0.1532, simple_loss=0.1528, pruned_loss=0.07679, over 1179.00 frames. ], tot_loss[loss=0.09975, simple_loss=0.1374, pruned_loss=0.03106, over 1946240.52 frames. ], batch size: 100, lr: 3.90e-03, grad_scale: 2.0 +2022-12-08 17:20:24,454 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.75 vs. limit=2.0 +2022-12-08 17:20:25,685 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=149085.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:21:07,672 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=149133.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:21:25,134 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.118e+02 1.972e+02 2.518e+02 2.987e+02 6.531e+02, threshold=5.036e+02, percent-clipped=3.0 +2022-12-08 17:21:26,190 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=149154.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:21:48,146 INFO [train.py:873] (2/4) Epoch 20, batch 5500, loss[loss=0.08895, simple_loss=0.1268, pruned_loss=0.02555, over 14274.00 frames. ], tot_loss[loss=0.09892, simple_loss=0.1367, pruned_loss=0.0306, over 1998123.03 frames. ], batch size: 76, lr: 3.90e-03, grad_scale: 2.0 +2022-12-08 17:22:07,922 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=149202.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:22:44,089 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.87 vs. limit=5.0 +2022-12-08 17:22:52,116 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.123e+02 2.034e+02 2.619e+02 3.145e+02 6.128e+02, threshold=5.238e+02, percent-clipped=4.0 +2022-12-08 17:23:10,892 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.3945, 2.4557, 2.5433, 2.5115, 2.5040, 2.0848, 1.5554, 2.2585], + device='cuda:2'), covar=tensor([0.0680, 0.0567, 0.0439, 0.0399, 0.0416, 0.1488, 0.2362, 0.0472], + device='cuda:2'), in_proj_covar=tensor([0.0178, 0.0177, 0.0149, 0.0152, 0.0210, 0.0145, 0.0159, 0.0197], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 17:23:15,004 INFO [train.py:873] (2/4) Epoch 20, batch 5600, loss[loss=0.09822, simple_loss=0.1324, pruned_loss=0.03203, over 11164.00 frames. ], tot_loss[loss=0.1001, simple_loss=0.1376, pruned_loss=0.03127, over 1970425.90 frames. ], batch size: 100, lr: 3.89e-03, grad_scale: 4.0 +2022-12-08 17:23:18,514 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.9342, 1.6914, 3.9137, 3.6276, 3.6978, 3.9927, 3.3948, 3.9608], + device='cuda:2'), covar=tensor([0.1676, 0.1701, 0.0142, 0.0287, 0.0286, 0.0158, 0.0267, 0.0147], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0156, 0.0131, 0.0169, 0.0148, 0.0142, 0.0125, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 17:23:24,914 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0287, 1.9979, 4.6562, 4.2701, 4.1247, 4.7076, 4.3623, 4.7043], + device='cuda:2'), covar=tensor([0.1607, 0.1516, 0.0100, 0.0214, 0.0260, 0.0138, 0.0156, 0.0115], + device='cuda:2'), in_proj_covar=tensor([0.0146, 0.0156, 0.0131, 0.0169, 0.0148, 0.0142, 0.0125, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 17:23:27,937 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=3.16 vs. limit=5.0 +2022-12-08 17:23:39,926 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=149307.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:24:03,593 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.58 vs. limit=2.0 +2022-12-08 17:24:19,494 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.122e+02 1.989e+02 2.324e+02 2.779e+02 6.746e+02, threshold=4.648e+02, percent-clipped=1.0 +2022-12-08 17:24:21,361 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=149355.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:24:24,930 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2619, 1.8240, 2.2222, 1.5775, 1.9343, 2.2340, 2.1630, 1.9655], + device='cuda:2'), covar=tensor([0.1161, 0.0686, 0.0880, 0.1394, 0.1385, 0.0931, 0.0961, 0.1467], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0168, 0.0140, 0.0125, 0.0145, 0.0157, 0.0139, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 17:24:31,234 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.62 vs. limit=2.0 +2022-12-08 17:24:36,166 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=149372.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:24:37,120 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.3969, 3.7130, 3.1471, 3.3605, 2.5073, 3.6140, 3.5337, 2.0096], + device='cuda:2'), covar=tensor([0.1095, 0.0458, 0.1314, 0.0746, 0.0985, 0.0468, 0.0798, 0.1649], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0094, 0.0073, 0.0078, 0.0102, 0.0093, 0.0104, 0.0098], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0006, 0.0006, 0.0007, 0.0006, 0.0007, 0.0006], + device='cuda:2') +2022-12-08 17:24:41,110 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.1997, 3.9398, 3.7778, 4.1950, 3.9292, 3.7219, 4.2312, 3.6081], + device='cuda:2'), covar=tensor([0.0444, 0.0911, 0.0520, 0.0477, 0.0765, 0.1207, 0.0545, 0.0509], + device='cuda:2'), in_proj_covar=tensor([0.0179, 0.0274, 0.0203, 0.0201, 0.0186, 0.0162, 0.0291, 0.0169], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0003, 0.0003, 0.0003, 0.0002, 0.0002, 0.0004, 0.0002], + device='cuda:2') +2022-12-08 17:24:42,037 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8752, 1.5553, 1.8121, 1.9992, 1.4881, 1.7513, 1.5487, 1.8428], + device='cuda:2'), covar=tensor([0.0267, 0.0496, 0.0283, 0.0253, 0.0491, 0.0516, 0.0384, 0.0285], + device='cuda:2'), in_proj_covar=tensor([0.0293, 0.0261, 0.0375, 0.0331, 0.0271, 0.0307, 0.0314, 0.0278], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 17:24:42,617 INFO [train.py:873] (2/4) Epoch 20, batch 5700, loss[loss=0.1585, simple_loss=0.1528, pruned_loss=0.08212, over 1223.00 frames. ], tot_loss[loss=0.09997, simple_loss=0.1376, pruned_loss=0.03118, over 1920713.71 frames. ], batch size: 100, lr: 3.89e-03, grad_scale: 4.0 +2022-12-08 17:25:18,495 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=149420.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:25:32,428 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.3854, 0.9591, 1.1805, 0.7791, 1.1426, 1.3726, 1.1301, 1.1059], + device='cuda:2'), covar=tensor([0.0536, 0.1027, 0.0944, 0.0745, 0.1237, 0.0922, 0.0696, 0.1378], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0168, 0.0140, 0.0125, 0.0145, 0.0157, 0.0139, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 17:25:47,191 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 7.719e+01 2.141e+02 2.467e+02 2.944e+02 6.062e+02, threshold=4.935e+02, percent-clipped=6.0 +2022-12-08 17:26:10,321 INFO [train.py:873] (2/4) Epoch 20, batch 5800, loss[loss=0.1577, simple_loss=0.148, pruned_loss=0.08373, over 1377.00 frames. ], tot_loss[loss=0.1009, simple_loss=0.1381, pruned_loss=0.03185, over 1922985.96 frames. ], batch size: 100, lr: 3.89e-03, grad_scale: 4.0 +2022-12-08 17:26:40,924 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=149514.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:27:08,954 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8439, 1.6283, 3.5010, 3.1840, 3.2948, 3.5352, 2.8524, 3.5005], + device='cuda:2'), covar=tensor([0.1720, 0.1686, 0.0162, 0.0357, 0.0326, 0.0185, 0.0333, 0.0170], + device='cuda:2'), in_proj_covar=tensor([0.0145, 0.0156, 0.0131, 0.0169, 0.0148, 0.0141, 0.0124, 0.0123], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0004, 0.0003, 0.0004, 0.0004, 0.0003, 0.0003, 0.0003], + device='cuda:2') +2022-12-08 17:27:15,062 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.429e+02 2.003e+02 2.561e+02 2.963e+02 6.960e+02, threshold=5.123e+02, percent-clipped=3.0 +2022-12-08 17:27:25,782 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=149565.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:27:34,512 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=149575.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:27:38,145 INFO [train.py:873] (2/4) Epoch 20, batch 5900, loss[loss=0.09081, simple_loss=0.1344, pruned_loss=0.02359, over 14392.00 frames. ], tot_loss[loss=0.1009, simple_loss=0.1383, pruned_loss=0.03176, over 1950593.81 frames. ], batch size: 41, lr: 3.89e-03, grad_scale: 4.0 +2022-12-08 17:28:20,512 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.0858, 2.2057, 2.0092, 2.2573, 1.8779, 2.0877, 2.1360, 2.1220], + device='cuda:2'), covar=tensor([0.1042, 0.1193, 0.1233, 0.0807, 0.1628, 0.0899, 0.1060, 0.1155], + device='cuda:2'), in_proj_covar=tensor([0.0149, 0.0150, 0.0151, 0.0168, 0.0153, 0.0130, 0.0175, 0.0155], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 17:28:20,604 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=149626.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 17:28:43,783 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.268e+02 2.112e+02 2.459e+02 3.260e+02 5.779e+02, threshold=4.919e+02, percent-clipped=5.0 +2022-12-08 17:28:58,069 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 17:29:07,363 INFO [train.py:873] (2/4) Epoch 20, batch 6000, loss[loss=0.09832, simple_loss=0.1348, pruned_loss=0.03092, over 14165.00 frames. ], tot_loss[loss=0.1004, simple_loss=0.138, pruned_loss=0.03136, over 2037850.05 frames. ], batch size: 99, lr: 3.89e-03, grad_scale: 8.0 +2022-12-08 17:29:07,363 INFO [train.py:896] (2/4) Computing validation loss +2022-12-08 17:29:22,625 INFO [train.py:905] (2/4) Epoch 20, validation: loss=0.1445, simple_loss=0.1806, pruned_loss=0.0542, over 857387.00 frames. +2022-12-08 17:29:22,626 INFO [train.py:906] (2/4) Maximum memory allocated so far is 17969MB +2022-12-08 17:30:22,030 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=149746.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 17:30:27,889 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.146e+02 2.065e+02 2.623e+02 3.098e+02 9.406e+02, threshold=5.245e+02, percent-clipped=5.0 +2022-12-08 17:30:50,976 INFO [train.py:873] (2/4) Epoch 20, batch 6100, loss[loss=0.09939, simple_loss=0.1117, pruned_loss=0.04354, over 2618.00 frames. ], tot_loss[loss=0.1003, simple_loss=0.1382, pruned_loss=0.03119, over 2051187.63 frames. ], batch size: 100, lr: 3.89e-03, grad_scale: 8.0 +2022-12-08 17:31:16,033 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=149807.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 17:31:22,211 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7250, 4.3051, 3.8575, 3.6575, 2.8244, 4.0904, 3.9219, 2.2852], + device='cuda:2'), covar=tensor([0.1288, 0.0378, 0.0707, 0.0593, 0.0808, 0.0288, 0.0728, 0.1560], + device='cuda:2'), in_proj_covar=tensor([0.0132, 0.0092, 0.0071, 0.0077, 0.0100, 0.0091, 0.0102, 0.0096], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0005, 0.0006, 0.0006, 0.0006, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 17:31:26,032 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.9109, 2.8417, 2.1486, 3.0034, 2.8136, 2.9168, 2.5541, 2.2360], + device='cuda:2'), covar=tensor([0.1086, 0.1270, 0.3213, 0.0936, 0.1249, 0.1016, 0.1659, 0.2862], + device='cuda:2'), in_proj_covar=tensor([0.0282, 0.0285, 0.0257, 0.0291, 0.0323, 0.0303, 0.0256, 0.0243], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:31:34,039 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=5.03 vs. limit=5.0 +2022-12-08 17:31:45,817 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.38 vs. limit=2.0 +2022-12-08 17:31:56,634 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.295e+02 2.144e+02 2.583e+02 3.219e+02 5.785e+02, threshold=5.165e+02, percent-clipped=1.0 +2022-12-08 17:32:11,619 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=149870.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:32:19,292 INFO [train.py:873] (2/4) Epoch 20, batch 6200, loss[loss=0.09487, simple_loss=0.1394, pruned_loss=0.02518, over 14265.00 frames. ], tot_loss[loss=0.09989, simple_loss=0.1375, pruned_loss=0.03113, over 1946643.48 frames. ], batch size: 80, lr: 3.89e-03, grad_scale: 8.0 +2022-12-08 17:32:57,037 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=149921.0, num_to_drop=1, layers_to_drop={3} +2022-12-08 17:33:24,559 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.165e+02 2.069e+02 2.540e+02 3.004e+02 8.039e+02, threshold=5.080e+02, percent-clipped=2.0 +2022-12-08 17:33:47,752 INFO [train.py:873] (2/4) Epoch 20, batch 6300, loss[loss=0.1185, simple_loss=0.1239, pruned_loss=0.0565, over 2601.00 frames. ], tot_loss[loss=0.09965, simple_loss=0.1374, pruned_loss=0.03096, over 1947456.65 frames. ], batch size: 100, lr: 3.88e-03, grad_scale: 8.0 +2022-12-08 17:34:11,373 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.41 vs. limit=2.0 +2022-12-08 17:34:56,194 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.169e+02 2.076e+02 2.693e+02 3.406e+02 8.300e+02, threshold=5.385e+02, percent-clipped=3.0 +2022-12-08 17:35:18,743 INFO [train.py:873] (2/4) Epoch 20, batch 6400, loss[loss=0.1136, simple_loss=0.1443, pruned_loss=0.04143, over 7792.00 frames. ], tot_loss[loss=0.09959, simple_loss=0.1373, pruned_loss=0.03096, over 1948294.41 frames. ], batch size: 100, lr: 3.88e-03, grad_scale: 8.0 +2022-12-08 17:35:18,919 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=150079.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:35:39,427 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=150102.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 17:36:12,266 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=150140.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:36:21,422 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.8070, 4.5935, 4.4027, 4.8510, 4.4310, 4.0350, 4.8697, 4.6690], + device='cuda:2'), covar=tensor([0.0609, 0.0975, 0.0864, 0.0505, 0.0798, 0.0666, 0.0581, 0.0640], + device='cuda:2'), in_proj_covar=tensor([0.0148, 0.0149, 0.0150, 0.0168, 0.0151, 0.0129, 0.0174, 0.0154], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0002, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 17:36:23,032 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.274e+02 2.042e+02 2.543e+02 3.172e+02 7.027e+02, threshold=5.086e+02, percent-clipped=3.0 +2022-12-08 17:36:38,344 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=150170.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:36:46,072 INFO [train.py:873] (2/4) Epoch 20, batch 6500, loss[loss=0.1087, simple_loss=0.1471, pruned_loss=0.03512, over 14221.00 frames. ], tot_loss[loss=0.09973, simple_loss=0.1377, pruned_loss=0.03087, over 2003883.69 frames. ], batch size: 89, lr: 3.88e-03, grad_scale: 8.0 +2022-12-08 17:37:20,282 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=150218.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:37:23,053 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=150221.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:37:50,502 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.402e+02 2.155e+02 2.479e+02 3.022e+02 7.876e+02, threshold=4.957e+02, percent-clipped=4.0 +2022-12-08 17:38:04,373 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=150269.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:38:08,953 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=150274.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:38:11,240 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.31 vs. limit=5.0 +2022-12-08 17:38:13,125 INFO [train.py:873] (2/4) Epoch 20, batch 6600, loss[loss=0.116, simple_loss=0.15, pruned_loss=0.04101, over 10319.00 frames. ], tot_loss[loss=0.1002, simple_loss=0.1379, pruned_loss=0.03122, over 2022872.32 frames. ], batch size: 100, lr: 3.88e-03, grad_scale: 8.0 +2022-12-08 17:38:13,923 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.55 vs. limit=2.0 +2022-12-08 17:38:48,434 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.0928, 1.2961, 1.2596, 1.0681, 0.8146, 1.0444, 0.9531, 1.2084], + device='cuda:2'), covar=tensor([0.2287, 0.2940, 0.1549, 0.2433, 0.3223, 0.1612, 0.2369, 0.1559], + device='cuda:2'), in_proj_covar=tensor([0.0090, 0.0104, 0.0098, 0.0103, 0.0116, 0.0095, 0.0117, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0005, 0.0004, 0.0005, 0.0005, 0.0004, 0.0005, 0.0004], + device='cuda:2') +2022-12-08 17:39:02,513 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=150335.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:39:13,142 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([0.8793, 0.7374, 0.8112, 0.8413, 0.7809, 0.5436, 0.5126, 0.7304], + device='cuda:2'), covar=tensor([0.0162, 0.0173, 0.0168, 0.0154, 0.0161, 0.0321, 0.0239, 0.0235], + device='cuda:2'), in_proj_covar=tensor([0.0025, 0.0024, 0.0022, 0.0024, 0.0023, 0.0036, 0.0030, 0.0034], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0002, 0.0002, 0.0002, 0.0003, 0.0002, 0.0003], + device='cuda:2') +2022-12-08 17:39:18,408 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.127e+02 2.141e+02 2.452e+02 3.103e+02 5.286e+02, threshold=4.904e+02, percent-clipped=1.0 +2022-12-08 17:39:24,122 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.54 vs. limit=5.0 +2022-12-08 17:39:41,074 INFO [train.py:873] (2/4) Epoch 20, batch 6700, loss[loss=0.09925, simple_loss=0.1291, pruned_loss=0.03469, over 4937.00 frames. ], tot_loss[loss=0.09994, simple_loss=0.1378, pruned_loss=0.03101, over 2006002.24 frames. ], batch size: 100, lr: 3.88e-03, grad_scale: 8.0 +2022-12-08 17:39:46,470 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7285, 1.9356, 2.6513, 2.2033, 2.7006, 2.5950, 2.4599, 2.4130], + device='cuda:2'), covar=tensor([0.0773, 0.2773, 0.1023, 0.1574, 0.0685, 0.0933, 0.0924, 0.1462], + device='cuda:2'), in_proj_covar=tensor([0.0349, 0.0307, 0.0386, 0.0297, 0.0360, 0.0319, 0.0356, 0.0292], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:40:01,062 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=150402.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 17:40:29,822 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=150435.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:40:42,827 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=150450.0, num_to_drop=1, layers_to_drop={1} +2022-12-08 17:40:45,138 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.516e+02 2.053e+02 2.408e+02 2.928e+02 1.041e+03, threshold=4.817e+02, percent-clipped=2.0 +2022-12-08 17:40:47,464 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.39 vs. limit=2.0 +2022-12-08 17:41:07,584 INFO [train.py:873] (2/4) Epoch 20, batch 6800, loss[loss=0.09536, simple_loss=0.1329, pruned_loss=0.02889, over 13949.00 frames. ], tot_loss[loss=0.09978, simple_loss=0.1374, pruned_loss=0.03106, over 1962070.19 frames. ], batch size: 23, lr: 3.88e-03, grad_scale: 8.0 +2022-12-08 17:41:27,092 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.7222, 2.0149, 2.6470, 2.2463, 2.6808, 2.5888, 2.4571, 2.4915], + device='cuda:2'), covar=tensor([0.0709, 0.2541, 0.0874, 0.1601, 0.0626, 0.1157, 0.0829, 0.1400], + device='cuda:2'), in_proj_covar=tensor([0.0351, 0.0308, 0.0386, 0.0299, 0.0360, 0.0321, 0.0357, 0.0293], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0003, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:41:57,926 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=150537.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:42:12,046 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.252e+02 2.195e+02 2.476e+02 3.043e+02 9.162e+02, threshold=4.952e+02, percent-clipped=3.0 +2022-12-08 17:42:34,236 INFO [train.py:873] (2/4) Epoch 20, batch 6900, loss[loss=0.1209, simple_loss=0.1191, pruned_loss=0.06132, over 1255.00 frames. ], tot_loss[loss=0.1, simple_loss=0.1373, pruned_loss=0.03137, over 1923847.61 frames. ], batch size: 100, lr: 3.88e-03, grad_scale: 8.0 +2022-12-08 17:42:51,063 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=150598.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:42:56,739 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4530, 2.2274, 3.3542, 3.5347, 3.3243, 2.2909, 3.3587, 2.6439], + device='cuda:2'), covar=tensor([0.0568, 0.1507, 0.1047, 0.0638, 0.0698, 0.2017, 0.0635, 0.1203], + device='cuda:2'), in_proj_covar=tensor([0.0297, 0.0264, 0.0380, 0.0337, 0.0275, 0.0312, 0.0317, 0.0281], + device='cuda:2'), out_proj_covar=tensor([0.0002, 0.0002, 0.0003, 0.0003, 0.0002, 0.0003, 0.0003, 0.0002], + device='cuda:2') +2022-12-08 17:43:15,061 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.7304, 3.5012, 3.1766, 2.3003, 3.1421, 3.4485, 3.8002, 3.0481], + device='cuda:2'), covar=tensor([0.0614, 0.0711, 0.0849, 0.1294, 0.0876, 0.0563, 0.0649, 0.1008], + device='cuda:2'), in_proj_covar=tensor([0.0155, 0.0168, 0.0141, 0.0125, 0.0144, 0.0157, 0.0140, 0.0142], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 17:43:19,288 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=150630.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:43:40,044 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.249e+02 1.852e+02 2.409e+02 3.250e+02 6.967e+02, threshold=4.817e+02, percent-clipped=2.0 +2022-12-08 17:44:02,001 INFO [train.py:873] (2/4) Epoch 20, batch 7000, loss[loss=0.1635, simple_loss=0.1564, pruned_loss=0.08529, over 1193.00 frames. ], tot_loss[loss=0.09987, simple_loss=0.1374, pruned_loss=0.03117, over 1913528.46 frames. ], batch size: 100, lr: 3.88e-03, grad_scale: 4.0 +2022-12-08 17:44:48,530 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=150732.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:44:50,847 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=150735.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:45:06,721 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.355e+02 2.298e+02 2.589e+02 3.066e+02 6.150e+02, threshold=5.179e+02, percent-clipped=4.0 +2022-12-08 17:45:28,339 INFO [train.py:873] (2/4) Epoch 20, batch 7100, loss[loss=0.1061, simple_loss=0.1405, pruned_loss=0.03584, over 14469.00 frames. ], tot_loss[loss=0.09959, simple_loss=0.1373, pruned_loss=0.03096, over 1906876.47 frames. ], batch size: 51, lr: 3.87e-03, grad_scale: 4.0 +2022-12-08 17:45:31,478 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=150783.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:45:37,901 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.5425, 3.2900, 3.1877, 2.4931, 2.9853, 3.2360, 3.4718, 2.8564], + device='cuda:2'), covar=tensor([0.0599, 0.0834, 0.0768, 0.1064, 0.0855, 0.0740, 0.0778, 0.1060], + device='cuda:2'), in_proj_covar=tensor([0.0156, 0.0169, 0.0142, 0.0126, 0.0145, 0.0158, 0.0140, 0.0143], + device='cuda:2'), out_proj_covar=tensor([0.0006, 0.0007, 0.0006, 0.0005, 0.0006, 0.0007, 0.0006, 0.0006], + device='cuda:2') +2022-12-08 17:45:40,492 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=150793.0, num_to_drop=1, layers_to_drop={0} +2022-12-08 17:46:33,855 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.300e+02 2.271e+02 2.646e+02 3.296e+02 2.153e+03, threshold=5.293e+02, percent-clipped=6.0 +2022-12-08 17:46:49,594 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.2695, 2.5560, 2.5408, 2.6064, 2.2008, 2.6515, 2.5067, 1.3746], + device='cuda:2'), covar=tensor([0.0975, 0.0910, 0.0661, 0.0617, 0.0903, 0.0628, 0.0926, 0.1847], + device='cuda:2'), in_proj_covar=tensor([0.0135, 0.0094, 0.0073, 0.0079, 0.0102, 0.0094, 0.0104, 0.0099], + device='cuda:2'), out_proj_covar=tensor([0.0008, 0.0006, 0.0006, 0.0006, 0.0007, 0.0006, 0.0007, 0.0006], + device='cuda:2') +2022-12-08 17:46:55,672 INFO [train.py:873] (2/4) Epoch 20, batch 7200, loss[loss=0.1057, simple_loss=0.1309, pruned_loss=0.04027, over 6941.00 frames. ], tot_loss[loss=0.1011, simple_loss=0.1382, pruned_loss=0.03195, over 1911147.67 frames. ], batch size: 100, lr: 3.87e-03, grad_scale: 8.0 +2022-12-08 17:47:07,883 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=150893.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:47:14,109 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=150900.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:47:40,848 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=150930.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:48:01,353 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.240e+02 1.977e+02 2.439e+02 3.091e+02 5.350e+02, threshold=4.878e+02, percent-clipped=1.0 +2022-12-08 17:48:04,095 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([3.4743, 1.2951, 3.5203, 1.8259, 3.3453, 3.5966, 2.6675, 3.7641], + device='cuda:2'), covar=tensor([0.0322, 0.3668, 0.0541, 0.2427, 0.0881, 0.0506, 0.0939, 0.0309], + device='cuda:2'), in_proj_covar=tensor([0.0173, 0.0156, 0.0161, 0.0169, 0.0166, 0.0181, 0.0133, 0.0153], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:48:05,816 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([1.8441, 1.5890, 2.0543, 1.6490, 1.8939, 1.5234, 1.7089, 1.9482], + device='cuda:2'), covar=tensor([0.2963, 0.2689, 0.0766, 0.1856, 0.1520, 0.1358, 0.1151, 0.0969], + device='cuda:2'), in_proj_covar=tensor([0.0250, 0.0195, 0.0223, 0.0264, 0.0239, 0.0198, 0.0201, 0.0219], + device='cuda:2'), out_proj_covar=tensor([0.0005, 0.0004, 0.0005, 0.0005, 0.0005, 0.0004, 0.0004, 0.0005], + device='cuda:2') +2022-12-08 17:48:07,576 INFO [zipformer.py:626] (2/4) warmup_begin=3333.3, warmup_end=4000.0, batch_count=150961.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:48:22,754 INFO [zipformer.py:626] (2/4) warmup_begin=666.7, warmup_end=1333.3, batch_count=150978.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:48:23,472 INFO [train.py:873] (2/4) Epoch 20, batch 7300, loss[loss=0.116, simple_loss=0.1176, pruned_loss=0.05722, over 1237.00 frames. ], tot_loss[loss=0.09965, simple_loss=0.1372, pruned_loss=0.03103, over 1948162.76 frames. ], batch size: 100, lr: 3.87e-03, grad_scale: 8.0 +2022-12-08 17:48:37,486 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.01 vs. limit=5.0 +2022-12-08 17:49:29,353 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.047e+02 2.139e+02 2.523e+02 3.177e+02 9.954e+02, threshold=5.046e+02, percent-clipped=7.0 +2022-12-08 17:49:51,033 INFO [train.py:873] (2/4) Epoch 20, batch 7400, loss[loss=0.09323, simple_loss=0.1351, pruned_loss=0.0257, over 14527.00 frames. ], tot_loss[loss=0.1004, simple_loss=0.1382, pruned_loss=0.03126, over 1990661.42 frames. ], batch size: 49, lr: 3.87e-03, grad_scale: 8.0 +2022-12-08 17:49:59,301 INFO [zipformer.py:626] (2/4) warmup_begin=1333.3, warmup_end=2000.0, batch_count=151088.0, num_to_drop=1, layers_to_drop={2} +2022-12-08 17:49:59,683 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.27 vs. limit=2.0 +2022-12-08 17:50:03,884 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=192, metric=1.78 vs. limit=2.0 +2022-12-08 17:50:23,283 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([4.0158, 4.1280, 4.3111, 3.8103, 4.1698, 4.4429, 1.5993, 3.9325], + device='cuda:2'), covar=tensor([0.0407, 0.0422, 0.0408, 0.0524, 0.0394, 0.0275, 0.3653, 0.0344], + device='cuda:2'), in_proj_covar=tensor([0.0177, 0.0176, 0.0149, 0.0152, 0.0210, 0.0145, 0.0158, 0.0197], + device='cuda:2'), out_proj_covar=tensor([0.0003, 0.0003, 0.0003, 0.0003, 0.0004, 0.0003, 0.0003, 0.0004], + device='cuda:2') +2022-12-08 17:50:50,896 INFO [zipformer.py:1414] (2/4) attn_weights_entropy = tensor([2.1676, 2.0644, 1.8299, 1.9026, 2.0795, 2.1391, 2.1053, 2.0842], + device='cuda:2'), covar=tensor([0.1253, 0.0889, 0.2632, 0.2765, 0.1454, 0.1257, 0.1612, 0.1295], + device='cuda:2'), in_proj_covar=tensor([0.0400, 0.0284, 0.0454, 0.0574, 0.0363, 0.0463, 0.0404, 0.0406], + device='cuda:2'), out_proj_covar=tensor([0.0004, 0.0003, 0.0004, 0.0005, 0.0004, 0.0004, 0.0004, 0.0004], + device='cuda:2') +2022-12-08 17:50:55,892 INFO [optim.py:369] (2/4) Clipping_scale=2.0, grad-norm quartiles 1.036e+02 2.135e+02 2.491e+02 3.146e+02 1.107e+03, threshold=4.982e+02, percent-clipped=6.0 +2022-12-08 17:50:57,457 INFO [scaling.py:679] (2/4) Whitening: num_groups=8, num_channels=96, metric=1.40 vs. limit=2.0 +2022-12-08 17:51:14,740 INFO [scaling.py:679] (2/4) Whitening: num_groups=1, num_channels=384, metric=4.26 vs. limit=5.0 +2022-12-08 17:51:17,607 INFO [train.py:873] (2/4) Epoch 20, batch 7500, loss[loss=0.1079, simple_loss=0.1411, pruned_loss=0.03737, over 10383.00 frames. ], tot_loss[loss=0.1011, simple_loss=0.1387, pruned_loss=0.03174, over 1997310.44 frames. ], batch size: 100, lr: 3.87e-03, grad_scale: 8.0 +2022-12-08 17:51:21,841 INFO [zipformer.py:626] (2/4) warmup_begin=2666.7, warmup_end=3333.3, batch_count=151184.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:51:29,993 INFO [zipformer.py:626] (2/4) warmup_begin=2000.0, warmup_end=2666.7, batch_count=151193.0, num_to_drop=0, layers_to_drop=set() +2022-12-08 17:52:04,576 INFO [train.py:1091] (2/4) Done!