Marcus2112 committed (verified)
Commit 29ca256 · 1 Parent(s): 019f845

Upload folder using huggingface_hub

checkpoint-1024/config.json ADDED
@@ -0,0 +1,32 @@
+ {
+ "_name_or_path": "/vol/tmp/koppelmm/pythia160m_dedup_untrained",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 0,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 2048,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "partial_rotary_factor": 0.25,
+ "rope_scaling": null,
+ "rope_theta": 10000,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.46.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
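
The config above describes a 12-layer, 768-hidden GPT-NeoX model in the Pythia-160M shape (untied embeddings, vocab 50304, float32). As a minimal sketch (assuming the checkpoint folder has been downloaded locally; the path below is a placeholder), it can be loaded with transformers:

# Minimal sketch: load this checkpoint's config and weights with transformers.
# "./checkpoint-1024" is a hypothetical local path to the folder shown in this commit.
from transformers import AutoConfig, AutoModelForCausalLM

ckpt_dir = "./checkpoint-1024"
config = AutoConfig.from_pretrained(ckpt_dir)
print(config.model_type, config.hidden_size, config.num_hidden_layers)  # gpt_neox 768 12

model = AutoModelForCausalLM.from_pretrained(ckpt_dir)   # reads model.safetensors
print(sum(p.numel() for p in model.parameters()))        # ~162M parameters (649 MB in float32)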
checkpoint-1024/generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 0,
+ "eos_token_id": 0,
+ "transformers_version": "4.46.2"
+ }
checkpoint-1024/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5f335ef94c0ce86cfda59e84cdd9bbef1b433fa1974f80e4b56d4498d8c3c6
+ size 649308728
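
model.safetensors is stored through Git LFS, so the diff only records a pointer (spec version, sha256 oid, and size in bytes, here about 649 MB); the tensor data itself lives in LFS storage. A minimal sketch of resolving the pointer with huggingface_hub, assuming a hypothetical repo id:

# Sketch: download the actual weights behind this LFS pointer.
# The repo_id below is a placeholder; use the repository this commit belongs to.
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="Marcus2112/your-repo-name",           # hypothetical repo id
    filename="checkpoint-1024/model.safetensors",  # resolved from the LFS pointer above
    revision="29ca256",                            # the commit shown at the top
)
print(path)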
checkpoint-1024/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82ea90bec2bbb7210ee07df2db4735d39bd992d4cb6e7f1989788717eedf65f1
+ size 1298707258
checkpoint-1024/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:726ae76b86900d8abab60df7866887a207be06427ba6369a4451cd2686c4ed71
+ size 14244
checkpoint-1024/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8ebd1a7d23da6a4b09d2235e23699a761e868e0b6ad2f2892a33dea66611ced6
+ size 1948045434
checkpoint-1024/trainer_state.json ADDED
@@ -0,0 +1,827 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 1.119799451652303,
+ "eval_steps": 100,
+ "global_step": 1024,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.010932788971549125,
+ "grad_norm": 167.07713317871094,
+ "learning_rate": 0.0005999985601583006,
+ "loss": 9.6698,
+ "step": 10
+ },
+ {
+ "epoch": 0.02186557794309825,
+ "grad_norm": 83.92709350585938,
+ "learning_rate": 0.0005998257958771109,
+ "loss": 8.2484,
+ "step": 20
+ },
+ {
+ "epoch": 0.032798366914647374,
+ "grad_norm": 127.91200256347656,
+ "learning_rate": 0.0005993652532642609,
+ "loss": 7.6452,
+ "step": 30
+ },
+ {
+ "epoch": 0.0437311558861965,
+ "grad_norm": 97.33670043945312,
+ "learning_rate": 0.0005986173743570491,
+ "loss": 7.4548,
+ "step": 40
+ },
+ {
+ "epoch": 0.054663944857745624,
+ "grad_norm": 127.0005874633789,
+ "learning_rate": 0.0005975828769834513,
+ "loss": 7.3226,
+ "step": 50
+ },
+ {
+ "epoch": 0.06559673382929475,
+ "grad_norm": 104.47633361816406,
+ "learning_rate": 0.0005962627540731365,
+ "loss": 7.204,
+ "step": 60
+ },
+ {
+ "epoch": 0.07652952280084388,
+ "grad_norm": 164.4477081298828,
+ "learning_rate": 0.0005946582727044349,
+ "loss": 7.1105,
+ "step": 70
+ },
+ {
+ "epoch": 0.087462311772393,
+ "grad_norm": 126.8350601196289,
+ "learning_rate": 0.0005927709728881719,
+ "loss": 7.0511,
+ "step": 80
+ },
+ {
+ "epoch": 0.09839510074394213,
+ "grad_norm": 158.55856323242188,
+ "learning_rate": 0.0005906026660895383,
+ "loss": 7.0642,
+ "step": 90
+ },
+ {
+ "epoch": 0.10932788971549125,
+ "grad_norm": 126.1555404663086,
+ "learning_rate": 0.0005881554334894116,
+ "loss": 7.031,
+ "step": 100
+ },
+ {
+ "epoch": 0.10932788971549125,
+ "eval_loss": 7.01555061340332,
+ "eval_runtime": 79.0984,
+ "eval_samples_per_second": 118.409,
+ "eval_steps_per_second": 14.804,
+ "step": 100
+ },
+ {
+ "epoch": 0.12026067868704038,
+ "grad_norm": 108.58393096923828,
+ "learning_rate": 0.0005854316239868012,
+ "loss": 7.0123,
+ "step": 110
+ },
+ {
+ "epoch": 0.1311934676585895,
+ "grad_norm": 178.0326690673828,
+ "learning_rate": 0.0005824338519443309,
+ "loss": 6.9897,
+ "step": 120
+ },
+ {
+ "epoch": 0.14212625663013861,
+ "grad_norm": 192.8655242919922,
+ "learning_rate": 0.0005791649946789259,
+ "loss": 7.0117,
+ "step": 130
+ },
+ {
+ "epoch": 0.15305904560168776,
+ "grad_norm": 143.3759002685547,
+ "learning_rate": 0.0005756281897001107,
+ "loss": 7.0073,
+ "step": 140
+ },
+ {
+ "epoch": 0.16399183457323688,
+ "grad_norm": 171.0679168701172,
+ "learning_rate": 0.0005718268316985698,
+ "loss": 6.9843,
+ "step": 150
+ },
+ {
+ "epoch": 0.174924623544786,
+ "grad_norm": 164.86534118652344,
+ "learning_rate": 0.0005677645692878606,
+ "loss": 7.0083,
+ "step": 160
+ },
+ {
+ "epoch": 0.1858574125163351,
+ "grad_norm": 125.85225677490234,
+ "learning_rate": 0.000563445301502407,
+ "loss": 7.02,
+ "step": 170
+ },
+ {
+ "epoch": 0.19679020148788426,
+ "grad_norm": 144.15589904785156,
+ "learning_rate": 0.0005588731740551344,
+ "loss": 6.9773,
+ "step": 180
+ },
+ {
+ "epoch": 0.20772299045943338,
+ "grad_norm": 108.05564880371094,
+ "learning_rate": 0.0005540525753583378,
+ "loss": 6.9632,
+ "step": 190
+ },
+ {
+ "epoch": 0.2186557794309825,
+ "grad_norm": 146.53924560546875,
+ "learning_rate": 0.0005489881323116018,
+ "loss": 6.929,
+ "step": 200
+ },
+ {
+ "epoch": 0.2186557794309825,
+ "eval_loss": 6.925621509552002,
+ "eval_runtime": 78.9467,
+ "eval_samples_per_second": 118.637,
+ "eval_steps_per_second": 14.833,
+ "step": 200
+ },
+ {
+ "epoch": 0.2295885684025316,
+ "grad_norm": 204.57968139648438,
+ "learning_rate": 0.0005436847058608189,
+ "loss": 6.9631,
+ "step": 210
+ },
+ {
+ "epoch": 0.24052135737408076,
+ "grad_norm": 171.31556701660156,
+ "learning_rate": 0.0005381473863325621,
+ "loss": 7.0389,
+ "step": 220
+ },
+ {
+ "epoch": 0.25145414634562985,
+ "grad_norm": 142.57449340820312,
+ "learning_rate": 0.0005323814885482963,
+ "loss": 6.967,
+ "step": 230
+ },
+ {
+ "epoch": 0.262386935317179,
+ "grad_norm": 119.19646453857422,
+ "learning_rate": 0.000526392546723115,
+ "loss": 6.9456,
+ "step": 240
+ },
+ {
+ "epoch": 0.27331972428872814,
+ "grad_norm": 153.62359619140625,
+ "learning_rate": 0.0005201863091538979,
+ "loss": 6.9686,
+ "step": 250
+ },
+ {
+ "epoch": 0.28425251326027723,
+ "grad_norm": 150.35699462890625,
+ "learning_rate": 0.000513768732701989,
+ "loss": 6.9846,
+ "step": 260
+ },
+ {
+ "epoch": 0.2951853022318264,
+ "grad_norm": 215.55368041992188,
+ "learning_rate": 0.0005071459770756929,
+ "loss": 6.9968,
+ "step": 270
+ },
+ {
+ "epoch": 0.3061180912033755,
+ "grad_norm": 107.55154418945312,
+ "learning_rate": 0.0005003243989180711,
+ "loss": 7.0033,
+ "step": 280
+ },
+ {
+ "epoch": 0.3170508801749246,
+ "grad_norm": 190.4154052734375,
+ "learning_rate": 0.0004933105457057203,
+ "loss": 6.9816,
+ "step": 290
+ },
+ {
+ "epoch": 0.32798366914647376,
+ "grad_norm": 159.7703094482422,
+ "learning_rate": 0.0004861111494643821,
+ "loss": 7.0486,
+ "step": 300
+ },
+ {
+ "epoch": 0.32798366914647376,
+ "eval_loss": 7.4869384765625,
+ "eval_runtime": 79.1717,
+ "eval_samples_per_second": 118.3,
+ "eval_steps_per_second": 14.791,
+ "step": 300
+ },
+ {
+ "epoch": 0.3389164581180229,
+ "grad_norm": 218.22604370117188,
+ "learning_rate": 0.0004794787611927562,
+ "loss": 7.2679,
+ "step": 310
+ },
+ {
+ "epoch": 0.349849247089572,
+ "grad_norm": 182.51431274414062,
+ "learning_rate": 0.0004719460124060748,
+ "loss": 7.1809,
+ "step": 320
+ },
+ {
+ "epoch": 0.36078203606112114,
+ "grad_norm": 137.0953826904297,
+ "learning_rate": 0.0004642482266637136,
+ "loss": 7.0417,
+ "step": 330
+ },
+ {
+ "epoch": 0.3717148250326702,
+ "grad_norm": 92.07840728759766,
+ "learning_rate": 0.0004563927924424775,
+ "loss": 6.9309,
+ "step": 340
+ },
+ {
+ "epoch": 0.38264761400421937,
+ "grad_norm": 147.35975646972656,
+ "learning_rate": 0.00044838724953309093,
+ "loss": 6.8844,
+ "step": 350
+ },
+ {
+ "epoch": 0.3935804029757685,
+ "grad_norm": 262.996337890625,
+ "learning_rate": 0.0004402392818033671,
+ "loss": 6.966,
+ "step": 360
+ },
+ {
+ "epoch": 0.4045131919473176,
+ "grad_norm": 155.3452606201172,
+ "learning_rate": 0.00043195670982308984,
+ "loss": 7.0715,
+ "step": 370
+ },
+ {
+ "epoch": 0.41544598091886675,
+ "grad_norm": 129.5069580078125,
+ "learning_rate": 0.00042354748335768664,
+ "loss": 7.0806,
+ "step": 380
+ },
+ {
+ "epoch": 0.4263787698904159,
+ "grad_norm": 92.96502685546875,
+ "learning_rate": 0.0004150196737378971,
+ "loss": 6.9999,
+ "step": 390
+ },
+ {
+ "epoch": 0.437311558861965,
+ "grad_norm": 120.41193389892578,
+ "learning_rate": 0.0004063814661127606,
+ "loss": 6.9339,
+ "step": 400
+ },
+ {
+ "epoch": 0.437311558861965,
+ "eval_loss": 6.931961536407471,
+ "eval_runtime": 78.8373,
+ "eval_samples_per_second": 118.802,
+ "eval_steps_per_second": 14.853,
+ "step": 400
+ },
+ {
+ "epoch": 0.44824434783351413,
+ "grad_norm": 188.7049560546875,
+ "learning_rate": 0.00039764115159335935,
+ "loss": 6.9242,
+ "step": 410
+ },
+ {
+ "epoch": 0.4591771368050632,
+ "grad_norm": 131.7518768310547,
+ "learning_rate": 0.0003888071192948565,
+ "loss": 6.9815,
+ "step": 420
+ },
+ {
+ "epoch": 0.47010992577661237,
+ "grad_norm": 247.91549682617188,
+ "learning_rate": 0.0003798878482844695,
+ "loss": 7.0838,
+ "step": 430
+ },
+ {
+ "epoch": 0.4810427147481615,
+ "grad_norm": 135.4517364501953,
+ "learning_rate": 0.000370891899443104,
+ "loss": 7.1813,
+ "step": 440
+ },
+ {
+ "epoch": 0.4919755037197106,
+ "grad_norm": 99.5172119140625,
+ "learning_rate": 0.00036182790724846315,
+ "loss": 7.1557,
+ "step": 450
+ },
+ {
+ "epoch": 0.5029082926912597,
+ "grad_norm": 165.1914825439453,
+ "learning_rate": 0.00035270457148751575,
+ "loss": 7.0382,
+ "step": 460
+ },
+ {
+ "epoch": 0.5138410816628088,
+ "grad_norm": 128.59959411621094,
+ "learning_rate": 0.00034353064890628107,
+ "loss": 7.0597,
+ "step": 470
+ },
+ {
+ "epoch": 0.524773870634358,
+ "grad_norm": 142.37147521972656,
+ "learning_rate": 0.00033431494480494175,
+ "loss": 7.092,
+ "step": 480
+ },
+ {
+ "epoch": 0.5357066596059071,
+ "grad_norm": 217.4059295654297,
+ "learning_rate": 0.0003250663045863544,
+ "loss": 7.0457,
+ "step": 490
+ },
+ {
+ "epoch": 0.5466394485774563,
+ "grad_norm": 125.81988525390625,
+ "learning_rate": 0.0003157936052660688,
+ "loss": 7.0112,
+ "step": 500
+ },
+ {
+ "epoch": 0.5466394485774563,
+ "eval_loss": 7.004736423492432,
+ "eval_runtime": 78.8041,
+ "eval_samples_per_second": 118.852,
+ "eval_steps_per_second": 14.86,
+ "step": 500
+ },
+ {
+ "epoch": 0.5575722375490054,
+ "grad_norm": 170.00523376464844,
+ "learning_rate": 0.0003065057469520046,
+ "loss": 7.0162,
+ "step": 510
+ },
+ {
+ "epoch": 0.5685050265205545,
+ "grad_norm": 216.81466674804688,
+ "learning_rate": 0.0002972116443019633,
+ "loss": 7.0584,
+ "step": 520
+ },
+ {
+ "epoch": 0.5794378154921036,
+ "grad_norm": 239.21087646484375,
+ "learning_rate": 0.0002879202179671755,
+ "loss": 7.1254,
+ "step": 530
+ },
+ {
+ "epoch": 0.5903706044636527,
+ "grad_norm": 190.0070343017578,
+ "learning_rate": 0.00027864038603009453,
+ "loss": 7.1717,
+ "step": 540
+ },
+ {
+ "epoch": 0.6013033934352019,
+ "grad_norm": 179.18785095214844,
+ "learning_rate": 0.00026938105544465745,
+ "loss": 7.1185,
+ "step": 550
+ },
+ {
+ "epoch": 0.612236182406751,
+ "grad_norm": 279.44781494140625,
+ "learning_rate": 0.0002601511134872255,
+ "loss": 7.0727,
+ "step": 560
+ },
+ {
+ "epoch": 0.6231689713783001,
+ "grad_norm": 227.90072631835938,
+ "learning_rate": 0.0002509594192264121,
+ "loss": 7.1088,
+ "step": 570
+ },
+ {
+ "epoch": 0.6341017603498492,
+ "grad_norm": 173.11819458007812,
+ "learning_rate": 0.0002418147950199862,
+ "loss": 7.0927,
+ "step": 580
+ },
+ {
+ "epoch": 0.6450345493213984,
+ "grad_norm": 164.40736389160156,
+ "learning_rate": 0.00023272601804700946,
+ "loss": 7.0701,
+ "step": 590
+ },
+ {
+ "epoch": 0.6559673382929475,
+ "grad_norm": 123.35533142089844,
+ "learning_rate": 0.0002237018118833387,
+ "loss": 7.0496,
+ "step": 600
+ },
+ {
+ "epoch": 0.6559673382929475,
+ "eval_loss": 7.052866458892822,
+ "eval_runtime": 78.8887,
+ "eval_samples_per_second": 118.724,
+ "eval_steps_per_second": 14.844,
+ "step": 600
+ },
+ {
+ "epoch": 0.6669001272644967,
+ "grad_norm": 225.67015075683594,
+ "learning_rate": 0.0002147508381285762,
+ "loss": 7.04,
+ "step": 610
+ },
+ {
+ "epoch": 0.6778329162360458,
+ "grad_norm": 140.2364501953125,
+ "learning_rate": 0.00020588168809250687,
+ "loss": 7.0902,
+ "step": 620
+ },
+ {
+ "epoch": 0.6887657052075948,
+ "grad_norm": 262.8550720214844,
+ "learning_rate": 0.00019710287454900033,
+ "loss": 7.1224,
+ "step": 630
+ },
+ {
+ "epoch": 0.699698494179144,
+ "grad_norm": 150.97813415527344,
+ "learning_rate": 0.00018842282356529402,
+ "loss": 7.1802,
+ "step": 640
+ },
+ {
+ "epoch": 0.7106312831506931,
+ "grad_norm": 452.73431396484375,
+ "learning_rate": 0.00017984986641449754,
+ "loss": 7.1497,
+ "step": 650
+ },
+ {
+ "epoch": 0.7215640721222423,
+ "grad_norm": 138.37220764160156,
+ "learning_rate": 0.00017139223157908368,
+ "loss": 7.1715,
+ "step": 660
+ },
+ {
+ "epoch": 0.7324968610937914,
+ "grad_norm": 144.21133422851562,
+ "learning_rate": 0.00016305803685303906,
+ "loss": 7.1458,
+ "step": 670
+ },
+ {
+ "epoch": 0.7434296500653405,
+ "grad_norm": 142.4859161376953,
+ "learning_rate": 0.00015485528155025473,
+ "loss": 7.1041,
+ "step": 680
+ },
+ {
+ "epoch": 0.7543624390368896,
+ "grad_norm": 190.189208984375,
+ "learning_rate": 0.00014679183882663872,
+ "loss": 7.0798,
+ "step": 690
+ },
+ {
+ "epoch": 0.7652952280084387,
+ "grad_norm": 160.14442443847656,
+ "learning_rate": 0.0001388754481233139,
+ "loss": 7.074,
+ "step": 700
+ },
+ {
+ "epoch": 0.7652952280084387,
+ "eval_loss": 7.0790934562683105,
+ "eval_runtime": 79.0053,
+ "eval_samples_per_second": 118.549,
+ "eval_steps_per_second": 14.822,
+ "step": 700
+ },
+ {
+ "epoch": 0.7762280169799879,
+ "grad_norm": 173.01499938964844,
+ "learning_rate": 0.0001311137077381614,
+ "loss": 7.0821,
+ "step": 710
+ },
+ {
+ "epoch": 0.787160805951537,
+ "grad_norm": 156.1138458251953,
+ "learning_rate": 0.00012351406753283216,
+ "loss": 7.0838,
+ "step": 720
+ },
+ {
+ "epoch": 0.7980935949230861,
+ "grad_norm": 161.9981689453125,
+ "learning_rate": 0.00011681901904809884,
+ "loss": 7.0639,
+ "step": 730
+ },
+ {
+ "epoch": 0.8090263838946352,
+ "grad_norm": 174.0237579345703,
+ "learning_rate": 0.00010954733067505213,
+ "loss": 7.0604,
+ "step": 740
+ },
+ {
+ "epoch": 0.8199591728661844,
+ "grad_norm": 141.823974609375,
+ "learning_rate": 0.0001024584422885053,
+ "loss": 7.0508,
+ "step": 750
+ },
+ {
+ "epoch": 0.8308919618377335,
+ "grad_norm": 121.39106750488281,
+ "learning_rate": 9.555915793434476e-05,
+ "loss": 7.0568,
+ "step": 760
+ },
+ {
+ "epoch": 0.8418247508092827,
+ "grad_norm": 178.37924194335938,
+ "learning_rate": 8.885609967300851e-05,
+ "loss": 7.0589,
+ "step": 770
+ },
+ {
+ "epoch": 0.8527575397808318,
+ "grad_norm": 304.8969421386719,
+ "learning_rate": 8.235570122350937e-05,
+ "loss": 7.0582,
+ "step": 780
+ },
+ {
+ "epoch": 0.8636903287523808,
+ "grad_norm": 128.75843811035156,
+ "learning_rate": 7.606420178823293e-05,
+ "loss": 7.0622,
+ "step": 790
+ },
+ {
+ "epoch": 0.87462311772393,
+ "grad_norm": 88.88775634765625,
+ "learning_rate": 6.998764006443615e-05,
+ "loss": 7.0664,
+ "step": 800
+ },
+ {
+ "epoch": 0.87462311772393,
+ "eval_loss": 7.048069477081299,
+ "eval_runtime": 78.7086,
+ "eval_samples_per_second": 118.996,
+ "eval_steps_per_second": 14.878,
+ "step": 800
+ },
+ {
+ "epoch": 0.8855559066954791,
+ "grad_norm": 131.33584594726562,
+ "learning_rate": 6.413184844819423e-05,
+ "loss": 7.0381,
+ "step": 810
+ },
+ {
+ "epoch": 0.8964886956670283,
+ "grad_norm": 176.8515625,
+ "learning_rate": 6e-05,
+ "loss": 7.0461,
+ "step": 820
+ },
+ {
+ "epoch": 0.9074214846385774,
+ "grad_norm": 128.32069396972656,
+ "learning_rate": 6e-05,
+ "loss": 7.0597,
+ "step": 830
+ },
+ {
+ "epoch": 0.9183542736101264,
+ "grad_norm": 150.107421875,
+ "learning_rate": 6e-05,
+ "loss": 7.0582,
+ "step": 840
+ },
+ {
+ "epoch": 0.9292870625816756,
+ "grad_norm": 174.95352172851562,
+ "learning_rate": 6e-05,
+ "loss": 7.0729,
+ "step": 850
+ },
+ {
+ "epoch": 0.9402198515532247,
+ "grad_norm": 209.878173828125,
+ "learning_rate": 6e-05,
+ "loss": 7.0949,
+ "step": 860
+ },
+ {
+ "epoch": 0.9511526405247739,
+ "grad_norm": 181.1326904296875,
+ "learning_rate": 6e-05,
+ "loss": 7.109,
+ "step": 870
+ },
+ {
+ "epoch": 0.962085429496323,
+ "grad_norm": 197.11639404296875,
+ "learning_rate": 6e-05,
+ "loss": 7.1132,
+ "step": 880
+ },
+ {
+ "epoch": 0.9730182184678722,
+ "grad_norm": 197.16473388671875,
+ "learning_rate": 6e-05,
+ "loss": 7.1008,
+ "step": 890
+ },
+ {
+ "epoch": 0.9839510074394212,
+ "grad_norm": 224.1211395263672,
+ "learning_rate": 6e-05,
+ "loss": 7.1024,
+ "step": 900
+ },
+ {
+ "epoch": 0.9839510074394212,
+ "eval_loss": 7.119234561920166,
+ "eval_runtime": 78.4745,
+ "eval_samples_per_second": 119.351,
+ "eval_steps_per_second": 14.922,
+ "step": 900
+ },
+ {
+ "epoch": 0.9948837964109704,
+ "grad_norm": 161.86753845214844,
+ "learning_rate": 6e-05,
+ "loss": 7.1127,
+ "step": 910
+ },
+ {
+ "epoch": 1.0060984463481923,
+ "grad_norm": 247.6467742919922,
+ "learning_rate": 6e-05,
+ "loss": 7.1115,
+ "step": 920
+ },
+ {
+ "epoch": 1.0170312353197413,
+ "grad_norm": 228.1467742919922,
+ "learning_rate": 6e-05,
+ "loss": 7.1172,
+ "step": 930
+ },
+ {
+ "epoch": 1.0279640242912904,
+ "grad_norm": 400.675537109375,
+ "learning_rate": 6e-05,
+ "loss": 7.1351,
+ "step": 940
+ },
+ {
+ "epoch": 1.0388968132628396,
+ "grad_norm": 293.3075866699219,
+ "learning_rate": 6e-05,
+ "loss": 7.1747,
+ "step": 950
+ },
+ {
+ "epoch": 1.0498296022343887,
+ "grad_norm": 439.60760498046875,
+ "learning_rate": 6e-05,
+ "loss": 7.1955,
+ "step": 960
+ },
+ {
+ "epoch": 1.0607623912059378,
+ "grad_norm": 336.15521240234375,
+ "learning_rate": 6e-05,
+ "loss": 7.2134,
+ "step": 970
+ },
+ {
+ "epoch": 1.071695180177487,
+ "grad_norm": 232.90606689453125,
+ "learning_rate": 6e-05,
+ "loss": 7.2589,
+ "step": 980
+ },
+ {
+ "epoch": 1.0826279691490361,
+ "grad_norm": 453.7010803222656,
+ "learning_rate": 6e-05,
+ "loss": 7.2537,
+ "step": 990
+ },
+ {
+ "epoch": 1.0935607581205853,
+ "grad_norm": 156.7413330078125,
+ "learning_rate": 6e-05,
+ "loss": 7.2678,
+ "step": 1000
+ },
+ {
+ "epoch": 1.0935607581205853,
+ "eval_loss": 7.271553993225098,
+ "eval_runtime": 78.5452,
+ "eval_samples_per_second": 119.243,
+ "eval_steps_per_second": 14.909,
+ "step": 1000
+ },
+ {
+ "epoch": 1.1044935470921344,
+ "grad_norm": 225.9600067138672,
+ "learning_rate": 6e-05,
+ "loss": 7.2489,
+ "step": 1010
+ },
+ {
+ "epoch": 1.1154263360636836,
+ "grad_norm": 258.6958312988281,
+ "learning_rate": 6e-05,
+ "loss": 7.2224,
+ "step": 1020
+ }
+ ],
+ "logging_steps": 10,
+ "max_steps": 1024,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 1024,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.383804151351214e+18,
+ "train_batch_size": 8,
+ "trial_name": null,
+ "trial_params": null
+ }
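
trainer_state.json keeps the Trainer's log_history: a loss/grad_norm/learning_rate entry every 10 optimizer steps and an eval entry every 100 steps, up to the final global step 1024. A minimal sketch, assuming the file is available locally, for pulling the train and eval loss curves out of it:

# Sketch: read loss curves from trainer_state.json (local path is a placeholder).
import json

with open("checkpoint-1024/trainer_state.json") as f:
    state = json.load(f)

train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
print(train[-1])  # (1020, 7.2224)
print(evals[-1])  # (1000, 7.271553993225098)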
checkpoint-1024/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95492cc580971074153fb0dee7b60aedfda648c5f8a7cb99bd7fbc9464d590d2
+ size 5304
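
training_args.bin is the pickled TrainingArguments object the Trainer was launched with, which is why it is a .bin loaded via torch rather than a JSON file. A minimal sketch for inspecting it, assuming torch and transformers are installed; recent PyTorch versions need weights_only=False to unpickle non-tensor objects:

# Sketch: inspect the saved TrainingArguments (path is a placeholder).
import torch

args = torch.load("checkpoint-1024/training_args.bin", weights_only=False)
print(args.per_device_train_batch_size, args.num_train_epochs, args.logging_steps)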
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+ "_name_or_path": "/vol/tmp/koppelmm/pythia160m_dedup_untrained",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 0,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "max_position_embeddings": 2048,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "partial_rotary_factor": 0.25,
+ "rope_scaling": null,
+ "rope_theta": 10000,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "float32",
+ "transformers_version": "4.46.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 0,
+ "eos_token_id": 0,
+ "transformers_version": "4.46.2"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5f335ef94c0ce86cfda59e84cdd9bbef1b433fa1974f80e4b56d4498d8c3c6
+ size 649308728
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95492cc580971074153fb0dee7b60aedfda648c5f8a7cb99bd7fbc9464d590d2
+ size 5304