fats-fme commited on
Commit
56f7a7a
·
verified ·
1 Parent(s): 1d97221

Training in progress, step 207, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d64006a8240d0814491b2db4b937e17fb7b606088a33ab275a336effb5b52496
3
  size 335922386
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:669a3e72f719737b15fab04bb2c0bb92048dfd22e9ef658a342e99d35c08129b
3
  size 335922386
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ac89b50eb49875d4fc6320c442b1f1a2bb0c6ca5dcf4534babea7e4fa581fbf
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b50dc51f8e724b3669be809cbc82ebe11f85d559dcac23ed00b062397afcd9e
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa2482d7eb8b9907f50055efed6d979a680e476b4380cec06a223fb30358eb52
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5ddb4f7d90f5558927f96b9e1767bd45cee0b3d26de04640235b5ab19fc8793
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05243bc9418b5d027b9cd58d0b804f8898dee9480e9cd6d09120cb4b16d4e2f3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89e4cd8ea348bd2f82958df5383cd58880eb3efd1ec75ed70ce4064026b49560
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5025034137460173,
5
  "eval_steps": 69,
6
- "global_step": 138,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -997,6 +997,497 @@
997
  "eval_samples_per_second": 8.378,
998
  "eval_steps_per_second": 2.099,
999
  "step": 138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1000
  }
1001
  ],
1002
  "logging_steps": 1,
@@ -1016,7 +1507,7 @@
1016
  "attributes": {}
1017
  }
1018
  },
1019
- "total_flos": 3.88127927801217e+17,
1020
  "train_batch_size": 2,
1021
  "trial_name": null,
1022
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7537551206190259,
5
  "eval_steps": 69,
6
+ "global_step": 207,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
997
  "eval_samples_per_second": 8.378,
998
  "eval_steps_per_second": 2.099,
999
  "step": 138
1000
+ },
1001
+ {
1002
+ "epoch": 0.5061447428311333,
1003
+ "grad_norm": NaN,
1004
+ "learning_rate": 0.00013170090808176883,
1005
+ "loss": 0.0,
1006
+ "step": 139
1007
+ },
1008
+ {
1009
+ "epoch": 0.5097860719162495,
1010
+ "grad_norm": NaN,
1011
+ "learning_rate": 0.00013036767451096148,
1012
+ "loss": 0.0,
1013
+ "step": 140
1014
+ },
1015
+ {
1016
+ "epoch": 0.5134274010013655,
1017
+ "grad_norm": NaN,
1018
+ "learning_rate": 0.00012902846772544624,
1019
+ "loss": 0.0,
1020
+ "step": 141
1021
+ },
1022
+ {
1023
+ "epoch": 0.5170687300864816,
1024
+ "grad_norm": NaN,
1025
+ "learning_rate": 0.00012768355114248494,
1026
+ "loss": 0.0,
1027
+ "step": 142
1028
+ },
1029
+ {
1030
+ "epoch": 0.5207100591715976,
1031
+ "grad_norm": NaN,
1032
+ "learning_rate": 0.00012633318930243648,
1033
+ "loss": 0.0,
1034
+ "step": 143
1035
+ },
1036
+ {
1037
+ "epoch": 0.5243513882567137,
1038
+ "grad_norm": NaN,
1039
+ "learning_rate": 0.0001249776478167227,
1040
+ "loss": 0.0,
1041
+ "step": 144
1042
+ },
1043
+ {
1044
+ "epoch": 0.5279927173418297,
1045
+ "grad_norm": NaN,
1046
+ "learning_rate": 0.00012361719331558345,
1047
+ "loss": 0.0,
1048
+ "step": 145
1049
+ },
1050
+ {
1051
+ "epoch": 0.5316340464269458,
1052
+ "grad_norm": NaN,
1053
+ "learning_rate": 0.00012225209339563145,
1054
+ "loss": 0.0,
1055
+ "step": 146
1056
+ },
1057
+ {
1058
+ "epoch": 0.535275375512062,
1059
+ "grad_norm": NaN,
1060
+ "learning_rate": 0.000120882616567217,
1061
+ "loss": 0.0,
1062
+ "step": 147
1063
+ },
1064
+ {
1065
+ "epoch": 0.538916704597178,
1066
+ "grad_norm": NaN,
1067
+ "learning_rate": 0.00011950903220161285,
1068
+ "loss": 0.0,
1069
+ "step": 148
1070
+ },
1071
+ {
1072
+ "epoch": 0.542558033682294,
1073
+ "grad_norm": NaN,
1074
+ "learning_rate": 0.00011813161047802985,
1075
+ "loss": 0.0,
1076
+ "step": 149
1077
+ },
1078
+ {
1079
+ "epoch": 0.5461993627674101,
1080
+ "grad_norm": NaN,
1081
+ "learning_rate": 0.00011675062233047364,
1082
+ "loss": 0.0,
1083
+ "step": 150
1084
+ },
1085
+ {
1086
+ "epoch": 0.5498406918525262,
1087
+ "grad_norm": NaN,
1088
+ "learning_rate": 0.000115366339394453,
1089
+ "loss": 0.0,
1090
+ "step": 151
1091
+ },
1092
+ {
1093
+ "epoch": 0.5534820209376422,
1094
+ "grad_norm": NaN,
1095
+ "learning_rate": 0.00011397903395354996,
1096
+ "loss": 0.0,
1097
+ "step": 152
1098
+ },
1099
+ {
1100
+ "epoch": 0.5571233500227583,
1101
+ "grad_norm": NaN,
1102
+ "learning_rate": 0.00011258897888586255,
1103
+ "loss": 0.0,
1104
+ "step": 153
1105
+ },
1106
+ {
1107
+ "epoch": 0.5607646791078744,
1108
+ "grad_norm": NaN,
1109
+ "learning_rate": 0.00011119644761033078,
1110
+ "loss": 0.0,
1111
+ "step": 154
1112
+ },
1113
+ {
1114
+ "epoch": 0.5644060081929905,
1115
+ "grad_norm": NaN,
1116
+ "learning_rate": 0.0001098017140329561,
1117
+ "loss": 0.0,
1118
+ "step": 155
1119
+ },
1120
+ {
1121
+ "epoch": 0.5680473372781065,
1122
+ "grad_norm": NaN,
1123
+ "learning_rate": 0.00010840505249292476,
1124
+ "loss": 0.0,
1125
+ "step": 156
1126
+ },
1127
+ {
1128
+ "epoch": 0.5716886663632226,
1129
+ "grad_norm": NaN,
1130
+ "learning_rate": 0.00010700673770864673,
1131
+ "loss": 0.0,
1132
+ "step": 157
1133
+ },
1134
+ {
1135
+ "epoch": 0.5753299954483386,
1136
+ "grad_norm": NaN,
1137
+ "learning_rate": 0.00010560704472371919,
1138
+ "loss": 0.0,
1139
+ "step": 158
1140
+ },
1141
+ {
1142
+ "epoch": 0.5789713245334547,
1143
+ "grad_norm": NaN,
1144
+ "learning_rate": 0.00010420624885282653,
1145
+ "loss": 0.0,
1146
+ "step": 159
1147
+ },
1148
+ {
1149
+ "epoch": 0.5826126536185707,
1150
+ "grad_norm": NaN,
1151
+ "learning_rate": 0.0001028046256275869,
1152
+ "loss": 0.0,
1153
+ "step": 160
1154
+ },
1155
+ {
1156
+ "epoch": 0.5862539827036869,
1157
+ "grad_norm": NaN,
1158
+ "learning_rate": 0.00010140245074235624,
1159
+ "loss": 0.0,
1160
+ "step": 161
1161
+ },
1162
+ {
1163
+ "epoch": 0.589895311788803,
1164
+ "grad_norm": NaN,
1165
+ "learning_rate": 0.0001,
1166
+ "loss": 0.0,
1167
+ "step": 162
1168
+ },
1169
+ {
1170
+ "epoch": 0.593536640873919,
1171
+ "grad_norm": NaN,
1172
+ "learning_rate": 9.859754925764378e-05,
1173
+ "loss": 0.0,
1174
+ "step": 163
1175
+ },
1176
+ {
1177
+ "epoch": 0.597177969959035,
1178
+ "grad_norm": NaN,
1179
+ "learning_rate": 9.719537437241312e-05,
1180
+ "loss": 0.0,
1181
+ "step": 164
1182
+ },
1183
+ {
1184
+ "epoch": 0.6008192990441511,
1185
+ "grad_norm": NaN,
1186
+ "learning_rate": 9.579375114717351e-05,
1187
+ "loss": 0.0,
1188
+ "step": 165
1189
+ },
1190
+ {
1191
+ "epoch": 0.6044606281292672,
1192
+ "grad_norm": NaN,
1193
+ "learning_rate": 9.439295527628081e-05,
1194
+ "loss": 0.0,
1195
+ "step": 166
1196
+ },
1197
+ {
1198
+ "epoch": 0.6081019572143832,
1199
+ "grad_norm": NaN,
1200
+ "learning_rate": 9.299326229135326e-05,
1201
+ "loss": 0.0,
1202
+ "step": 167
1203
+ },
1204
+ {
1205
+ "epoch": 0.6117432862994994,
1206
+ "grad_norm": NaN,
1207
+ "learning_rate": 9.159494750707526e-05,
1208
+ "loss": 0.0,
1209
+ "step": 168
1210
+ },
1211
+ {
1212
+ "epoch": 0.6153846153846154,
1213
+ "grad_norm": NaN,
1214
+ "learning_rate": 9.019828596704394e-05,
1215
+ "loss": 0.0,
1216
+ "step": 169
1217
+ },
1218
+ {
1219
+ "epoch": 0.6190259444697315,
1220
+ "grad_norm": NaN,
1221
+ "learning_rate": 8.880355238966923e-05,
1222
+ "loss": 0.0,
1223
+ "step": 170
1224
+ },
1225
+ {
1226
+ "epoch": 0.6226672735548475,
1227
+ "grad_norm": NaN,
1228
+ "learning_rate": 8.741102111413748e-05,
1229
+ "loss": 0.0,
1230
+ "step": 171
1231
+ },
1232
+ {
1233
+ "epoch": 0.6263086026399636,
1234
+ "grad_norm": NaN,
1235
+ "learning_rate": 8.602096604645009e-05,
1236
+ "loss": 0.0,
1237
+ "step": 172
1238
+ },
1239
+ {
1240
+ "epoch": 0.6299499317250796,
1241
+ "grad_norm": NaN,
1242
+ "learning_rate": 8.463366060554698e-05,
1243
+ "loss": 0.0,
1244
+ "step": 173
1245
+ },
1246
+ {
1247
+ "epoch": 0.6335912608101957,
1248
+ "grad_norm": NaN,
1249
+ "learning_rate": 8.324937766952638e-05,
1250
+ "loss": 0.0,
1251
+ "step": 174
1252
+ },
1253
+ {
1254
+ "epoch": 0.6372325898953118,
1255
+ "grad_norm": NaN,
1256
+ "learning_rate": 8.186838952197018e-05,
1257
+ "loss": 0.0,
1258
+ "step": 175
1259
+ },
1260
+ {
1261
+ "epoch": 0.6408739189804279,
1262
+ "grad_norm": NaN,
1263
+ "learning_rate": 8.049096779838719e-05,
1264
+ "loss": 0.0,
1265
+ "step": 176
1266
+ },
1267
+ {
1268
+ "epoch": 0.6445152480655439,
1269
+ "grad_norm": NaN,
1270
+ "learning_rate": 7.911738343278304e-05,
1271
+ "loss": 0.0,
1272
+ "step": 177
1273
+ },
1274
+ {
1275
+ "epoch": 0.64815657715066,
1276
+ "grad_norm": NaN,
1277
+ "learning_rate": 7.774790660436858e-05,
1278
+ "loss": 0.0,
1279
+ "step": 178
1280
+ },
1281
+ {
1282
+ "epoch": 0.651797906235776,
1283
+ "grad_norm": NaN,
1284
+ "learning_rate": 7.63828066844166e-05,
1285
+ "loss": 0.0,
1286
+ "step": 179
1287
+ },
1288
+ {
1289
+ "epoch": 0.6554392353208921,
1290
+ "grad_norm": NaN,
1291
+ "learning_rate": 7.502235218327731e-05,
1292
+ "loss": 0.0,
1293
+ "step": 180
1294
+ },
1295
+ {
1296
+ "epoch": 0.6590805644060082,
1297
+ "grad_norm": NaN,
1298
+ "learning_rate": 7.366681069756352e-05,
1299
+ "loss": 0.0,
1300
+ "step": 181
1301
+ },
1302
+ {
1303
+ "epoch": 0.6627218934911243,
1304
+ "grad_norm": NaN,
1305
+ "learning_rate": 7.231644885751507e-05,
1306
+ "loss": 0.0,
1307
+ "step": 182
1308
+ },
1309
+ {
1310
+ "epoch": 0.6663632225762404,
1311
+ "grad_norm": NaN,
1312
+ "learning_rate": 7.097153227455379e-05,
1313
+ "loss": 0.0,
1314
+ "step": 183
1315
+ },
1316
+ {
1317
+ "epoch": 0.6700045516613564,
1318
+ "grad_norm": NaN,
1319
+ "learning_rate": 6.963232548903853e-05,
1320
+ "loss": 0.0,
1321
+ "step": 184
1322
+ },
1323
+ {
1324
+ "epoch": 0.6736458807464725,
1325
+ "grad_norm": NaN,
1326
+ "learning_rate": 6.829909191823121e-05,
1327
+ "loss": 0.0,
1328
+ "step": 185
1329
+ },
1330
+ {
1331
+ "epoch": 0.6772872098315885,
1332
+ "grad_norm": NaN,
1333
+ "learning_rate": 6.697209380448333e-05,
1334
+ "loss": 0.0,
1335
+ "step": 186
1336
+ },
1337
+ {
1338
+ "epoch": 0.6809285389167046,
1339
+ "grad_norm": NaN,
1340
+ "learning_rate": 6.565159216365389e-05,
1341
+ "loss": 0.0,
1342
+ "step": 187
1343
+ },
1344
+ {
1345
+ "epoch": 0.6845698680018206,
1346
+ "grad_norm": NaN,
1347
+ "learning_rate": 6.43378467337687e-05,
1348
+ "loss": 0.0,
1349
+ "step": 188
1350
+ },
1351
+ {
1352
+ "epoch": 0.6882111970869367,
1353
+ "grad_norm": NaN,
1354
+ "learning_rate": 6.30311159239305e-05,
1355
+ "loss": 0.0,
1356
+ "step": 189
1357
+ },
1358
+ {
1359
+ "epoch": 0.6918525261720528,
1360
+ "grad_norm": NaN,
1361
+ "learning_rate": 6.173165676349103e-05,
1362
+ "loss": 0.0,
1363
+ "step": 190
1364
+ },
1365
+ {
1366
+ "epoch": 0.6954938552571689,
1367
+ "grad_norm": NaN,
1368
+ "learning_rate": 6.043972485149414e-05,
1369
+ "loss": 0.0,
1370
+ "step": 191
1371
+ },
1372
+ {
1373
+ "epoch": 0.6991351843422849,
1374
+ "grad_norm": NaN,
1375
+ "learning_rate": 5.9155574306400395e-05,
1376
+ "loss": 0.0,
1377
+ "step": 192
1378
+ },
1379
+ {
1380
+ "epoch": 0.702776513427401,
1381
+ "grad_norm": NaN,
1382
+ "learning_rate": 5.787945771610296e-05,
1383
+ "loss": 0.0,
1384
+ "step": 193
1385
+ },
1386
+ {
1387
+ "epoch": 0.706417842512517,
1388
+ "grad_norm": NaN,
1389
+ "learning_rate": 5.6611626088244194e-05,
1390
+ "loss": 0.0,
1391
+ "step": 194
1392
+ },
1393
+ {
1394
+ "epoch": 0.7100591715976331,
1395
+ "grad_norm": NaN,
1396
+ "learning_rate": 5.5352328800843724e-05,
1397
+ "loss": 0.0,
1398
+ "step": 195
1399
+ },
1400
+ {
1401
+ "epoch": 0.7137005006827492,
1402
+ "grad_norm": NaN,
1403
+ "learning_rate": 5.410181355324622e-05,
1404
+ "loss": 0.0,
1405
+ "step": 196
1406
+ },
1407
+ {
1408
+ "epoch": 0.7173418297678653,
1409
+ "grad_norm": NaN,
1410
+ "learning_rate": 5.286032631740023e-05,
1411
+ "loss": 0.0,
1412
+ "step": 197
1413
+ },
1414
+ {
1415
+ "epoch": 0.7209831588529814,
1416
+ "grad_norm": NaN,
1417
+ "learning_rate": 5.162811128947602e-05,
1418
+ "loss": 0.0,
1419
+ "step": 198
1420
+ },
1421
+ {
1422
+ "epoch": 0.7246244879380974,
1423
+ "grad_norm": NaN,
1424
+ "learning_rate": 5.0405410841833253e-05,
1425
+ "loss": 0.0,
1426
+ "step": 199
1427
+ },
1428
+ {
1429
+ "epoch": 0.7282658170232135,
1430
+ "grad_norm": NaN,
1431
+ "learning_rate": 4.919246547534708e-05,
1432
+ "loss": 0.0,
1433
+ "step": 200
1434
+ },
1435
+ {
1436
+ "epoch": 0.7319071461083295,
1437
+ "grad_norm": NaN,
1438
+ "learning_rate": 4.7989513772102537e-05,
1439
+ "loss": 0.0,
1440
+ "step": 201
1441
+ },
1442
+ {
1443
+ "epoch": 0.7355484751934456,
1444
+ "grad_norm": NaN,
1445
+ "learning_rate": 4.6796792348466356e-05,
1446
+ "loss": 0.0,
1447
+ "step": 202
1448
+ },
1449
+ {
1450
+ "epoch": 0.7391898042785616,
1451
+ "grad_norm": NaN,
1452
+ "learning_rate": 4.561453580854516e-05,
1453
+ "loss": 0.0,
1454
+ "step": 203
1455
+ },
1456
+ {
1457
+ "epoch": 0.7428311333636778,
1458
+ "grad_norm": NaN,
1459
+ "learning_rate": 4.444297669803981e-05,
1460
+ "loss": 0.0,
1461
+ "step": 204
1462
+ },
1463
+ {
1464
+ "epoch": 0.7464724624487938,
1465
+ "grad_norm": NaN,
1466
+ "learning_rate": 4.328234545850442e-05,
1467
+ "loss": 0.0,
1468
+ "step": 205
1469
+ },
1470
+ {
1471
+ "epoch": 0.7501137915339099,
1472
+ "grad_norm": NaN,
1473
+ "learning_rate": 4.213287038201943e-05,
1474
+ "loss": 0.0,
1475
+ "step": 206
1476
+ },
1477
+ {
1478
+ "epoch": 0.7537551206190259,
1479
+ "grad_norm": NaN,
1480
+ "learning_rate": 4.0994777566287204e-05,
1481
+ "loss": 0.0,
1482
+ "step": 207
1483
+ },
1484
+ {
1485
+ "epoch": 0.7537551206190259,
1486
+ "eval_loss": NaN,
1487
+ "eval_runtime": 55.3157,
1488
+ "eval_samples_per_second": 8.37,
1489
+ "eval_steps_per_second": 2.097,
1490
+ "step": 207
1491
  }
1492
  ],
1493
  "logging_steps": 1,
 
1507
  "attributes": {}
1508
  }
1509
  },
1510
+ "total_flos": 5.821918917018255e+17,
1511
  "train_batch_size": 2,
1512
  "trial_name": null,
1513
  "trial_params": null