{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 18819,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07970667941973537,
      "grad_norm": 14.617377281188965,
      "learning_rate": 4.867155534300441e-05,
      "loss": 4.5164,
      "step": 500
    },
    {
      "epoch": 0.15941335883947075,
      "grad_norm": 13.107848167419434,
      "learning_rate": 4.7343110686008824e-05,
      "loss": 2.9043,
      "step": 1000
    },
    {
      "epoch": 0.2391200382592061,
      "grad_norm": 18.4104061126709,
      "learning_rate": 4.601466602901323e-05,
      "loss": 2.5306,
      "step": 1500
    },
    {
      "epoch": 0.3188267176789415,
      "grad_norm": 11.45405101776123,
      "learning_rate": 4.4686221372017646e-05,
      "loss": 2.3636,
      "step": 2000
    },
    {
      "epoch": 0.39853339709867686,
      "grad_norm": 12.057182312011719,
      "learning_rate": 4.335777671502205e-05,
      "loss": 2.2193,
      "step": 2500
    },
    {
      "epoch": 0.4782400765184122,
      "grad_norm": 21.33402442932129,
      "learning_rate": 4.202933205802647e-05,
      "loss": 2.1756,
      "step": 3000
    },
    {
      "epoch": 0.5579467559381476,
      "grad_norm": 12.961982727050781,
      "learning_rate": 4.070088740103087e-05,
      "loss": 2.1019,
      "step": 3500
    },
    {
      "epoch": 0.637653435357883,
      "grad_norm": 13.711081504821777,
      "learning_rate": 3.937244274403529e-05,
      "loss": 2.0309,
      "step": 4000
    },
    {
      "epoch": 0.7173601147776184,
      "grad_norm": 10.16848087310791,
      "learning_rate": 3.8043998087039694e-05,
      "loss": 2.0221,
      "step": 4500
    },
    {
      "epoch": 0.7970667941973537,
      "grad_norm": 15.42959976196289,
      "learning_rate": 3.6715553430044105e-05,
      "loss": 1.968,
      "step": 5000
    },
    {
      "epoch": 0.8767734736170891,
      "grad_norm": 11.260416030883789,
      "learning_rate": 3.5387108773048516e-05,
      "loss": 1.9626,
      "step": 5500
    },
    {
      "epoch": 0.9564801530368244,
      "grad_norm": 14.895442008972168,
      "learning_rate": 3.405866411605293e-05,
      "loss": 1.9793,
      "step": 6000
    },
    {
      "epoch": 1.03618683245656,
      "grad_norm": 10.09549617767334,
      "learning_rate": 3.273021945905734e-05,
      "loss": 1.8907,
      "step": 6500
    },
    {
      "epoch": 1.1158935118762952,
      "grad_norm": 12.858028411865234,
      "learning_rate": 3.140177480206174e-05,
      "loss": 1.8524,
      "step": 7000
    },
    {
      "epoch": 1.1956001912960306,
      "grad_norm": 12.280284881591797,
      "learning_rate": 3.007333014506616e-05,
      "loss": 1.8517,
      "step": 7500
    },
    {
      "epoch": 1.275306870715766,
      "grad_norm": 10.79765796661377,
      "learning_rate": 2.8744885488070568e-05,
      "loss": 1.8522,
      "step": 8000
    },
    {
      "epoch": 1.3550135501355014,
      "grad_norm": 12.524330139160156,
      "learning_rate": 2.741644083107498e-05,
      "loss": 1.827,
      "step": 8500
    },
    {
      "epoch": 1.4347202295552366,
      "grad_norm": 13.051765441894531,
      "learning_rate": 2.6087996174079386e-05,
      "loss": 1.8092,
      "step": 9000
    },
    {
      "epoch": 1.514426908974972,
      "grad_norm": 11.686491012573242,
      "learning_rate": 2.4759551517083797e-05,
      "loss": 1.8128,
      "step": 9500
    },
    {
      "epoch": 1.5941335883947074,
      "grad_norm": 10.523648262023926,
      "learning_rate": 2.3431106860088208e-05,
      "loss": 1.7951,
      "step": 10000
    },
    {
      "epoch": 1.6738402678144428,
      "grad_norm": 12.37237548828125,
      "learning_rate": 2.210266220309262e-05,
      "loss": 1.7592,
      "step": 10500
    },
    {
      "epoch": 1.7535469472341783,
      "grad_norm": 13.389715194702148,
      "learning_rate": 2.077421754609703e-05,
      "loss": 1.7428,
      "step": 11000
    },
    {
      "epoch": 1.8332536266539137,
      "grad_norm": 10.13847541809082,
      "learning_rate": 1.944577288910144e-05,
      "loss": 1.7337,
      "step": 11500
    },
    {
      "epoch": 1.912960306073649,
      "grad_norm": 9.493837356567383,
      "learning_rate": 1.8117328232105852e-05,
      "loss": 1.7224,
      "step": 12000
    },
    {
      "epoch": 1.9926669854933845,
      "grad_norm": 9.915449142456055,
      "learning_rate": 1.6788883575110263e-05,
      "loss": 1.7306,
      "step": 12500
    },
    {
      "epoch": 2.07237366491312,
      "grad_norm": 10.596574783325195,
      "learning_rate": 1.5460438918114674e-05,
      "loss": 1.6675,
      "step": 13000
    },
    {
      "epoch": 2.152080344332855,
      "grad_norm": 11.658182144165039,
      "learning_rate": 1.4131994261119083e-05,
      "loss": 1.671,
      "step": 13500
    },
    {
      "epoch": 2.2317870237525903,
      "grad_norm": 10.158458709716797,
      "learning_rate": 1.2803549604123494e-05,
      "loss": 1.6658,
      "step": 14000
    },
    {
      "epoch": 2.3114937031723257,
      "grad_norm": 11.206422805786133,
      "learning_rate": 1.1475104947127904e-05,
      "loss": 1.666,
      "step": 14500
    },
    {
      "epoch": 2.391200382592061,
      "grad_norm": 10.595658302307129,
      "learning_rate": 1.0146660290132313e-05,
      "loss": 1.6523,
      "step": 15000
    },
    {
      "epoch": 2.4709070620117966,
      "grad_norm": 8.99512004852295,
      "learning_rate": 8.818215633136724e-06,
      "loss": 1.6529,
      "step": 15500
    },
    {
      "epoch": 2.550613741431532,
      "grad_norm": 8.853923797607422,
      "learning_rate": 7.489770976141135e-06,
      "loss": 1.6617,
      "step": 16000
    },
    {
      "epoch": 2.6303204208512674,
      "grad_norm": 9.30465316772461,
      "learning_rate": 6.161326319145545e-06,
      "loss": 1.6566,
      "step": 16500
    },
    {
      "epoch": 2.710027100271003,
      "grad_norm": 10.036943435668945,
      "learning_rate": 4.832881662149955e-06,
      "loss": 1.6423,
      "step": 17000
    },
    {
      "epoch": 2.789733779690738,
      "grad_norm": 12.505424499511719,
      "learning_rate": 3.504437005154365e-06,
      "loss": 1.6528,
      "step": 17500
    },
    {
      "epoch": 2.869440459110473,
      "grad_norm": 9.32070255279541,
      "learning_rate": 2.1759923481587757e-06,
      "loss": 1.6388,
      "step": 18000
    },
    {
      "epoch": 2.9491471385302086,
      "grad_norm": 13.217713356018066,
      "learning_rate": 8.475476911631861e-07,
      "loss": 1.6282,
      "step": 18500
    },
    {
      "epoch": 3.0,
      "step": 18819,
      "total_flos": 1.5511823227634976e+16,
      "train_loss": 1.9410085403942658,
      "train_runtime": 9037.5655,
      "train_samples_per_second": 8.329,
      "train_steps_per_second": 2.082
    }
  ],
  "logging_steps": 500,
  "max_steps": 18819,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5511823227634976e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}