|
{ |
|
"best_metric": 0.6364516129032258, |
|
"best_model_checkpoint": "output_toy/checkpoint-15500", |
|
"epoch": 0.775, |
|
"global_step": 15500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.0, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 3.5788, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 13.754842758178711, |
|
"max_norm/layer2": 14.107428550720215, |
|
"max_norm/layer3": 14.075852394104004, |
|
"mean_norm": 8.452343925833702, |
|
"mean_norm/layer0": 8.452010810375214, |
|
"mean_norm/layer1": 8.451448321342468, |
|
"mean_norm/layer2": 8.451122224330902, |
|
"mean_norm/layer3": 8.454794347286224, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.03, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 2.2465, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 13.313672065734863, |
|
"max_norm/layer2": 13.402200698852539, |
|
"max_norm/layer3": 13.972790718078613, |
|
"mean_norm": 8.24767641723156, |
|
"mean_norm/layer0": 8.446629762649536, |
|
"mean_norm/layer1": 8.26882529258728, |
|
"mean_norm/layer2": 8.018843710422516, |
|
"mean_norm/layer3": 8.256406903266907, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.3564852016178642, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.31, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.8386362791061401, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.7699, |
|
"eval_samples_per_second": 803.731, |
|
"eval_steps_per_second": 1.57, |
|
"eval_transition_accuracy": 0.3554838709677419, |
|
"step": 500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.05, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.5981, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 13.26521110534668, |
|
"max_norm/layer2": 13.196029663085938, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.097165614366531, |
|
"mean_norm/layer0": 8.447738409042358, |
|
"mean_norm/layer1": 8.100942492485046, |
|
"mean_norm/layer2": 7.6819451451301575, |
|
"mean_norm/layer3": 8.158036410808563, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4204176054226132, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.58, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.4651859998703003, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.3952, |
|
"eval_samples_per_second": 811.185, |
|
"eval_steps_per_second": 1.584, |
|
"eval_transition_accuracy": 0.5014516129032258, |
|
"step": 1000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.07, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.3928, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 13.196317672729492, |
|
"max_norm/layer2": 12.821307182312012, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.05762867629528, |
|
"mean_norm/layer0": 8.449561476707458, |
|
"mean_norm/layer1": 8.045644223690033, |
|
"mean_norm/layer2": 7.585634410381317, |
|
"mean_norm/layer3": 8.149674594402313, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4378030131182333, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.79, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.354053258895874, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.3967, |
|
"eval_samples_per_second": 811.156, |
|
"eval_steps_per_second": 1.584, |
|
"eval_transition_accuracy": 0.555, |
|
"step": 1500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.1, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.3405, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 12.928963661193848, |
|
"max_norm/layer2": 12.629460334777832, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.056178480386734, |
|
"mean_norm/layer0": 8.451180398464203, |
|
"mean_norm/layer1": 8.046804785728455, |
|
"mean_norm/layer2": 7.5705525279045105, |
|
"mean_norm/layer3": 8.156176209449768, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4426980807086614, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.82, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.3263764381408691, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.4538, |
|
"eval_samples_per_second": 810.011, |
|
"eval_steps_per_second": 1.582, |
|
"eval_transition_accuracy": 0.5756451612903226, |
|
"step": 2000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.12, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.3189, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 12.928963661193848, |
|
"max_norm/layer2": 12.610198974609375, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.065410792827606, |
|
"mean_norm/layer0": 8.452252388000488, |
|
"mean_norm/layer1": 8.05913120508194, |
|
"mean_norm/layer2": 7.583977818489075, |
|
"mean_norm/layer3": 8.166281759738922, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4445690245140256, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.86, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.3187371492385864, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.0204, |
|
"eval_samples_per_second": 818.782, |
|
"eval_steps_per_second": 1.599, |
|
"eval_transition_accuracy": 0.5575806451612904, |
|
"step": 2500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.15, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.308, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 12.928963661193848, |
|
"max_norm/layer2": 12.610198974609375, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.076771080493927, |
|
"mean_norm/layer0": 8.453002035617828, |
|
"mean_norm/layer1": 8.070474624633789, |
|
"mean_norm/layer2": 7.606890320777893, |
|
"mean_norm/layer3": 8.176717340946198, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.44684558778297245, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.82, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.3064292669296265, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.4873, |
|
"eval_samples_per_second": 809.341, |
|
"eval_steps_per_second": 1.581, |
|
"eval_transition_accuracy": 0.557258064516129, |
|
"step": 3000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.17, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.3009, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 12.928963661193848, |
|
"max_norm/layer2": 12.480420112609863, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.087428167462349, |
|
"mean_norm/layer0": 8.45363199710846, |
|
"mean_norm/layer1": 8.081151723861694, |
|
"mean_norm/layer2": 7.628681242465973, |
|
"mean_norm/layer3": 8.186247706413269, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.44931342658095474, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.87, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2963054180145264, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.5536, |
|
"eval_samples_per_second": 808.018, |
|
"eval_steps_per_second": 1.578, |
|
"eval_transition_accuracy": 0.5762903225806452, |
|
"step": 3500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.2, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2965, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 12.945528030395508, |
|
"max_norm/layer2": 13.130059242248535, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.097855687141418, |
|
"mean_norm/layer0": 8.454240918159485, |
|
"mean_norm/layer1": 8.09092777967453, |
|
"mean_norm/layer2": 7.649804890155792, |
|
"mean_norm/layer3": 8.196449160575867, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4493698961152805, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.9, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2922283411026, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.3081, |
|
"eval_samples_per_second": 812.939, |
|
"eval_steps_per_second": 1.588, |
|
"eval_transition_accuracy": 0.567741935483871, |
|
"step": 4000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.23, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2919, |
|
"max_norm": 14.111713409423828, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 13.611448287963867, |
|
"max_norm/layer2": 13.752634048461914, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.10820010304451, |
|
"mean_norm/layer0": 8.454896569252014, |
|
"mean_norm/layer1": 8.099043369293213, |
|
"mean_norm/layer2": 7.671496093273163, |
|
"mean_norm/layer3": 8.20736438035965, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.449889175535187, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.91, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2880299091339111, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.5303, |
|
"eval_samples_per_second": 808.482, |
|
"eval_steps_per_second": 1.579, |
|
"eval_transition_accuracy": 0.5820967741935484, |
|
"step": 4500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.25, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2889, |
|
"max_norm": 14.629088401794434, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 14.212156295776367, |
|
"max_norm/layer2": 14.629088401794434, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.117734983563423, |
|
"mean_norm/layer0": 8.455368101596832, |
|
"mean_norm/layer1": 8.104798018932343, |
|
"mean_norm/layer2": 7.693721830844879, |
|
"mean_norm/layer3": 8.217051982879639, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45008597786970966, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.9, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2855565547943115, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.1444, |
|
"eval_samples_per_second": 816.252, |
|
"eval_steps_per_second": 1.594, |
|
"eval_transition_accuracy": 0.56, |
|
"step": 5000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.28, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2855, |
|
"max_norm": 15.594667434692383, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 14.633105278015137, |
|
"max_norm/layer2": 15.594667434692383, |
|
"max_norm/layer3": 13.539477348327637, |
|
"mean_norm": 8.12691231071949, |
|
"mean_norm/layer0": 8.45590054988861, |
|
"mean_norm/layer1": 8.110510230064392, |
|
"mean_norm/layer2": 7.71501624584198, |
|
"mean_norm/layer3": 8.226222217082977, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4503207469549705, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.9, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2815940380096436, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.4723, |
|
"eval_samples_per_second": 809.64, |
|
"eval_steps_per_second": 1.581, |
|
"eval_transition_accuracy": 0.6016129032258064, |
|
"step": 5500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.3, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2828, |
|
"max_norm": 16.523221969604492, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 15.140296936035156, |
|
"max_norm/layer2": 16.523221969604492, |
|
"max_norm/layer3": 15.213438987731934, |
|
"mean_norm": 8.136623591184616, |
|
"mean_norm/layer0": 8.456406712532043, |
|
"mean_norm/layer1": 8.11625623703003, |
|
"mean_norm/layer2": 7.738001704216003, |
|
"mean_norm/layer3": 8.235829710960388, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45019699457123524, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.87, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2844185829162598, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.4554, |
|
"eval_samples_per_second": 809.979, |
|
"eval_steps_per_second": 1.582, |
|
"eval_transition_accuracy": 0.5733870967741935, |
|
"step": 6000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.33, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2805, |
|
"max_norm": 17.3682861328125, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 15.587692260742188, |
|
"max_norm/layer2": 17.3682861328125, |
|
"max_norm/layer3": 16.75887107849121, |
|
"mean_norm": 8.145677655935287, |
|
"mean_norm/layer0": 8.456890940666199, |
|
"mean_norm/layer1": 8.120486974716187, |
|
"mean_norm/layer2": 7.760386824607849, |
|
"mean_norm/layer3": 8.244945883750916, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45155730960875984, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.95, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.277733325958252, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.4778, |
|
"eval_samples_per_second": 809.529, |
|
"eval_steps_per_second": 1.581, |
|
"eval_transition_accuracy": 0.6083870967741936, |
|
"step": 6500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.35, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2793, |
|
"max_norm": 18.367448806762695, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 16.05299949645996, |
|
"max_norm/layer2": 18.142847061157227, |
|
"max_norm/layer3": 18.367448806762695, |
|
"mean_norm": 8.15514886379242, |
|
"mean_norm/layer0": 8.45737361907959, |
|
"mean_norm/layer1": 8.127346932888031, |
|
"mean_norm/layer2": 7.781527459621429, |
|
"mean_norm/layer3": 8.254347443580627, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45114255890132876, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.93, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2795634269714355, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.6598, |
|
"eval_samples_per_second": 805.906, |
|
"eval_steps_per_second": 1.574, |
|
"eval_transition_accuracy": 0.5680645161290323, |
|
"step": 7000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.38, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2785, |
|
"max_norm": 20.019760131835938, |
|
"max_norm/layer0": 14.111713409423828, |
|
"max_norm/layer1": 16.42165756225586, |
|
"max_norm/layer2": 19.017709732055664, |
|
"max_norm/layer3": 20.019760131835938, |
|
"mean_norm": 8.164311796426773, |
|
"mean_norm/layer0": 8.457763373851776, |
|
"mean_norm/layer1": 8.13219028711319, |
|
"mean_norm/layer2": 7.803518235683441, |
|
"mean_norm/layer3": 8.263775289058685, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45188651497908466, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.95, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2748253345489502, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.4299, |
|
"eval_samples_per_second": 810.489, |
|
"eval_steps_per_second": 1.583, |
|
"eval_transition_accuracy": 0.5919354838709677, |
|
"step": 7500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.4, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2764, |
|
"max_norm": 21.821395874023438, |
|
"max_norm/layer0": 14.158592224121094, |
|
"max_norm/layer1": 16.7973690032959, |
|
"max_norm/layer2": 19.965808868408203, |
|
"max_norm/layer3": 21.821395874023438, |
|
"mean_norm": 8.173083677887917, |
|
"mean_norm/layer0": 8.458155512809753, |
|
"mean_norm/layer1": 8.137401163578033, |
|
"mean_norm/layer2": 7.8241875767707825, |
|
"mean_norm/layer3": 8.272590458393097, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.451844222902313, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.9, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.276716709136963, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.3344, |
|
"eval_samples_per_second": 812.407, |
|
"eval_steps_per_second": 1.587, |
|
"eval_transition_accuracy": 0.5759677419354838, |
|
"step": 8000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.42, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2763, |
|
"max_norm": 23.506425857543945, |
|
"max_norm/layer0": 14.451132774353027, |
|
"max_norm/layer1": 17.121503829956055, |
|
"max_norm/layer2": 20.906761169433594, |
|
"max_norm/layer3": 23.506425857543945, |
|
"mean_norm": 8.181086376309395, |
|
"mean_norm/layer0": 8.458472549915314, |
|
"mean_norm/layer1": 8.140588343143463, |
|
"mean_norm/layer2": 7.84406965970993, |
|
"mean_norm/layer3": 8.281214952468872, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45069200410617616, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.94, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2800538539886475, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.8942, |
|
"eval_samples_per_second": 801.287, |
|
"eval_steps_per_second": 1.565, |
|
"eval_transition_accuracy": 0.582741935483871, |
|
"step": 8500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.45, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2755, |
|
"max_norm": 25.07321548461914, |
|
"max_norm/layer0": 14.874269485473633, |
|
"max_norm/layer1": 17.39398956298828, |
|
"max_norm/layer2": 21.67055892944336, |
|
"max_norm/layer3": 25.07321548461914, |
|
"mean_norm": 8.188907638192177, |
|
"mean_norm/layer0": 8.458899140357971, |
|
"mean_norm/layer1": 8.143711388111115, |
|
"mean_norm/layer2": 7.863752484321594, |
|
"mean_norm/layer3": 8.289267539978027, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4516255536417323, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.9, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2754778861999512, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.3576, |
|
"eval_samples_per_second": 811.942, |
|
"eval_steps_per_second": 1.586, |
|
"eval_transition_accuracy": 0.5764516129032258, |
|
"step": 9000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.47, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2746, |
|
"max_norm": 26.98938751220703, |
|
"max_norm/layer0": 15.152251243591309, |
|
"max_norm/layer1": 17.66726303100586, |
|
"max_norm/layer2": 22.432802200317383, |
|
"max_norm/layer3": 26.98938751220703, |
|
"mean_norm": 8.197539746761322, |
|
"mean_norm/layer0": 8.4593066573143, |
|
"mean_norm/layer1": 8.148526132106781, |
|
"mean_norm/layer2": 7.884801626205444, |
|
"mean_norm/layer3": 8.297524571418762, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45229261503444884, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.9, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2736179828643799, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.4635, |
|
"eval_samples_per_second": 809.817, |
|
"eval_steps_per_second": 1.582, |
|
"eval_transition_accuracy": 0.5864516129032258, |
|
"step": 9500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.5, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2734, |
|
"max_norm": 28.549026489257812, |
|
"max_norm/layer0": 15.429606437683105, |
|
"max_norm/layer1": 17.9191837310791, |
|
"max_norm/layer2": 23.421247482299805, |
|
"max_norm/layer3": 28.549026489257812, |
|
"mean_norm": 8.206039026379585, |
|
"mean_norm/layer0": 8.459603905677795, |
|
"mean_norm/layer1": 8.153472065925598, |
|
"mean_norm/layer2": 7.905435502529144, |
|
"mean_norm/layer3": 8.305644631385803, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4518793061023622, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.91, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2739558219909668, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.3957, |
|
"eval_samples_per_second": 811.175, |
|
"eval_steps_per_second": 1.584, |
|
"eval_transition_accuracy": 0.5779032258064516, |
|
"step": 10000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.53, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2732, |
|
"max_norm": 30.3062801361084, |
|
"max_norm/layer0": 15.69118595123291, |
|
"max_norm/layer1": 18.259578704833984, |
|
"max_norm/layer2": 24.377281188964844, |
|
"max_norm/layer3": 30.3062801361084, |
|
"mean_norm": 8.214424923062325, |
|
"mean_norm/layer0": 8.459942817687988, |
|
"mean_norm/layer1": 8.156991243362427, |
|
"mean_norm/layer2": 7.927173614501953, |
|
"mean_norm/layer3": 8.31359201669693, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45159791961429624, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.89, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2743829488754272, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.5803, |
|
"eval_samples_per_second": 807.485, |
|
"eval_steps_per_second": 1.577, |
|
"eval_transition_accuracy": 0.5879032258064516, |
|
"step": 10500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.55, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2723, |
|
"max_norm": 31.752639770507812, |
|
"max_norm/layer0": 16.03121566772461, |
|
"max_norm/layer1": 18.51296043395996, |
|
"max_norm/layer2": 25.478057861328125, |
|
"max_norm/layer3": 31.752639770507812, |
|
"mean_norm": 8.222758993506432, |
|
"mean_norm/layer0": 8.460269570350647, |
|
"mean_norm/layer1": 8.1603884100914, |
|
"mean_norm/layer2": 7.949049711227417, |
|
"mean_norm/layer3": 8.321328282356262, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45252594234436516, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.89, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.269043207168579, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.1138, |
|
"eval_samples_per_second": 816.875, |
|
"eval_steps_per_second": 1.595, |
|
"eval_transition_accuracy": 0.5811290322580646, |
|
"step": 11000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.57, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2712, |
|
"max_norm": 33.44036102294922, |
|
"max_norm/layer0": 16.178028106689453, |
|
"max_norm/layer1": 18.79450798034668, |
|
"max_norm/layer2": 26.372129440307617, |
|
"max_norm/layer3": 33.44036102294922, |
|
"mean_norm": 8.230609133839607, |
|
"mean_norm/layer0": 8.460545778274536, |
|
"mean_norm/layer1": 8.163512825965881, |
|
"mean_norm/layer2": 7.969985008239746, |
|
"mean_norm/layer3": 8.328392922878265, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45260740265132876, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.93, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2705051898956299, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.2988, |
|
"eval_samples_per_second": 813.126, |
|
"eval_steps_per_second": 1.588, |
|
"eval_transition_accuracy": 0.5779032258064516, |
|
"step": 11500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.6, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2716, |
|
"max_norm": 35.007076263427734, |
|
"max_norm/layer0": 16.497554779052734, |
|
"max_norm/layer1": 18.986597061157227, |
|
"max_norm/layer2": 27.318687438964844, |
|
"max_norm/layer3": 35.007076263427734, |
|
"mean_norm": 8.238363325595856, |
|
"mean_norm/layer0": 8.460874915122986, |
|
"mean_norm/layer1": 8.166603803634644, |
|
"mean_norm/layer2": 7.990897297859192, |
|
"mean_norm/layer3": 8.335077285766602, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4526636318897638, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.89, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2700704336166382, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.2954, |
|
"eval_samples_per_second": 813.195, |
|
"eval_steps_per_second": 1.588, |
|
"eval_transition_accuracy": 0.5759677419354838, |
|
"step": 12000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.62, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2708, |
|
"max_norm": 36.530174255371094, |
|
"max_norm/layer0": 16.774707794189453, |
|
"max_norm/layer1": 19.286115646362305, |
|
"max_norm/layer2": 28.3411808013916, |
|
"max_norm/layer3": 36.530174255371094, |
|
"mean_norm": 8.246006086468697, |
|
"mean_norm/layer0": 8.461170196533203, |
|
"mean_norm/layer1": 8.169004082679749, |
|
"mean_norm/layer2": 8.012158274650574, |
|
"mean_norm/layer3": 8.341691792011261, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45218808632197344, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.95, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2715603113174438, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.6419, |
|
"eval_samples_per_second": 806.261, |
|
"eval_steps_per_second": 1.575, |
|
"eval_transition_accuracy": 0.5485483870967742, |
|
"step": 12500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.65, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2705, |
|
"max_norm": 38.14279556274414, |
|
"max_norm/layer0": 16.91411590576172, |
|
"max_norm/layer1": 19.68568229675293, |
|
"max_norm/layer2": 29.167861938476562, |
|
"max_norm/layer3": 38.14279556274414, |
|
"mean_norm": 8.253673061728477, |
|
"mean_norm/layer0": 8.461421430110931, |
|
"mean_norm/layer1": 8.171639680862427, |
|
"mean_norm/layer2": 8.033495247364044, |
|
"mean_norm/layer3": 8.348135888576508, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4528599536325049, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.93, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2675950527191162, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.2422, |
|
"eval_samples_per_second": 814.27, |
|
"eval_steps_per_second": 1.59, |
|
"eval_transition_accuracy": 0.5733870967741935, |
|
"step": 13000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.68, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2696, |
|
"max_norm": 39.6273307800293, |
|
"max_norm/layer0": 17.14823341369629, |
|
"max_norm/layer1": 19.711994171142578, |
|
"max_norm/layer2": 30.05324363708496, |
|
"max_norm/layer3": 39.6273307800293, |
|
"mean_norm": 8.261498123407364, |
|
"mean_norm/layer0": 8.46165120601654, |
|
"mean_norm/layer1": 8.17492812871933, |
|
"mean_norm/layer2": 8.055188477039337, |
|
"mean_norm/layer3": 8.354224681854248, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4518833911325049, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.91, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2716896533966064, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.327, |
|
"eval_samples_per_second": 812.557, |
|
"eval_steps_per_second": 1.587, |
|
"eval_transition_accuracy": 0.5993548387096774, |
|
"step": 13500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.7, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2687, |
|
"max_norm": 41.1092643737793, |
|
"max_norm/layer0": 17.324541091918945, |
|
"max_norm/layer1": 19.974578857421875, |
|
"max_norm/layer2": 30.87788200378418, |
|
"max_norm/layer3": 41.1092643737793, |
|
"mean_norm": 8.26871033012867, |
|
"mean_norm/layer0": 8.46183955669403, |
|
"mean_norm/layer1": 8.177218735218048, |
|
"mean_norm/layer2": 8.076003432273865, |
|
"mean_norm/layer3": 8.359779596328735, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45239882581815943, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.9, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2687005996704102, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.5627, |
|
"eval_samples_per_second": 807.837, |
|
"eval_steps_per_second": 1.578, |
|
"eval_transition_accuracy": 0.5756451612903226, |
|
"step": 14000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.72, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2685, |
|
"max_norm": 42.766014099121094, |
|
"max_norm/layer0": 17.42141342163086, |
|
"max_norm/layer1": 20.071725845336914, |
|
"max_norm/layer2": 31.913164138793945, |
|
"max_norm/layer3": 42.766014099121094, |
|
"mean_norm": 8.276251748204231, |
|
"mean_norm/layer0": 8.46206510066986, |
|
"mean_norm/layer1": 8.179498374462128, |
|
"mean_norm/layer2": 8.097876787185669, |
|
"mean_norm/layer3": 8.365566730499268, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45213714359313484, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.89, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2709327936172485, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.2398, |
|
"eval_samples_per_second": 814.319, |
|
"eval_steps_per_second": 1.59, |
|
"eval_transition_accuracy": 0.612741935483871, |
|
"step": 14500 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.75, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2685, |
|
"max_norm": 44.61077117919922, |
|
"max_norm/layer0": 17.665727615356445, |
|
"max_norm/layer1": 20.15743637084961, |
|
"max_norm/layer2": 32.966209411621094, |
|
"max_norm/layer3": 44.61077117919922, |
|
"mean_norm": 8.283790707588196, |
|
"mean_norm/layer0": 8.462257981300354, |
|
"mean_norm/layer1": 8.181303024291992, |
|
"mean_norm/layer2": 8.120182931423187, |
|
"mean_norm/layer3": 8.37141889333725, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.4519187146284449, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.91, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2706036567687988, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 40.0952, |
|
"eval_samples_per_second": 817.256, |
|
"eval_steps_per_second": 1.596, |
|
"eval_transition_accuracy": 0.587258064516129, |
|
"step": 15000 |
|
}, |
|
{ |
|
"MSE": 0.0, |
|
"MSE/layer0": 0.0, |
|
"MSE/layer1": 0.0, |
|
"MSE/layer2": 0.0, |
|
"MSE/layer3": 0.0, |
|
"dead_code_fraction": 1.0, |
|
"dead_code_fraction/layer0": 1.0, |
|
"dead_code_fraction/layer1": 1.0, |
|
"dead_code_fraction/layer2": 1.0, |
|
"dead_code_fraction/layer3": 1.0, |
|
"epoch": 0.78, |
|
"input_norm": 0.0, |
|
"input_norm/layer0": 0.0, |
|
"input_norm/layer1": 0.0, |
|
"input_norm/layer2": 0.0, |
|
"input_norm/layer3": 0.0, |
|
"learning_rate": 0.001, |
|
"loss": 1.2675, |
|
"max_norm": 46.33829879760742, |
|
"max_norm/layer0": 17.856664657592773, |
|
"max_norm/layer1": 20.084186553955078, |
|
"max_norm/layer2": 33.940242767333984, |
|
"max_norm/layer3": 46.33829879760742, |
|
"mean_norm": 8.291451185941696, |
|
"mean_norm/layer0": 8.462452054023743, |
|
"mean_norm/layer1": 8.18280303478241, |
|
"mean_norm/layer2": 8.143204748630524, |
|
"mean_norm/layer3": 8.377344906330109, |
|
"multicode_k": 1, |
|
"output_norm": 0.0, |
|
"output_norm/layer0": 0.0, |
|
"output_norm/layer1": 0.0, |
|
"output_norm/layer2": 0.0, |
|
"output_norm/layer3": 0.0, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"eval_MSE/layer0": 0.0, |
|
"eval_MSE/layer1": 0.0, |
|
"eval_MSE/layer2": 0.0, |
|
"eval_MSE/layer3": 0.0, |
|
"eval_accuracy": 0.45268790177472934, |
|
"eval_dead_code_fraction/layer0": 1.0, |
|
"eval_dead_code_fraction/layer1": 1.0, |
|
"eval_dead_code_fraction/layer2": 1.0, |
|
"eval_dead_code_fraction/layer3": 1.0, |
|
"eval_first_transition_accuracy": 0.96, |
|
"eval_input_norm/layer0": 0.0, |
|
"eval_input_norm/layer1": 0.0, |
|
"eval_input_norm/layer2": 0.0, |
|
"eval_input_norm/layer3": 0.0, |
|
"eval_loss": 1.2690919637680054, |
|
"eval_multicode_k": 1, |
|
"eval_output_norm/layer0": 0.0, |
|
"eval_output_norm/layer1": 0.0, |
|
"eval_output_norm/layer2": 0.0, |
|
"eval_output_norm/layer3": 0.0, |
|
"eval_runtime": 39.8501, |
|
"eval_samples_per_second": 822.281, |
|
"eval_steps_per_second": 1.606, |
|
"eval_transition_accuracy": 0.6364516129032258, |
|
"step": 15500 |
|
} |
|
], |
|
"max_steps": 20000, |
|
"num_train_epochs": 9223372036854775807, |
|
"total_flos": 9712749772800000.0, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|