BaoLocTown committed on
Commit
266b399
·
1 Parent(s): 4e28dc2

Model save

Browse files
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: hllj/mistral-vi-math
3
+ tags:
4
+ - generated_from_trainer
5
+ model-index:
6
+ - name: sft-mistral-7b-v01-v1
7
+ results: []
8
+ ---
9
+
10
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
11
+ should probably proofread and complete it, then remove this comment. -->
12
+
13
+ # sft-mistral-7b-v01-v1
14
+
15
+ This model is a fine-tuned version of [hllj/mistral-vi-math](https://huggingface.co/hllj/mistral-vi-math) on an unknown dataset.
16
+ It achieves the following results on the evaluation set:
17
+ - Loss: 0.4918
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 5e-05
37
+ - train_batch_size: 4
38
+ - eval_batch_size: 4
39
+ - seed: 42
40
+ - distributed_type: multi-GPU
41
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
+ - lr_scheduler_type: cosine
43
+ - lr_scheduler_warmup_ratio: 0.05
44
+ - num_epochs: 2
45
+ - mixed_precision_training: Native AMP
46
+
47
+ ### Training results
48
+
49
+ | Training Loss | Epoch | Step | Validation Loss |
50
+ |:-------------:|:-----:|:----:|:---------------:|
51
+ | 0.313 | 0.26 | 500 | 0.5063 |
52
+ | 0.279 | 1.07 | 1000 | 0.4892 |
53
+ | 0.2584 | 1.33 | 1500 | 0.4950 |
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - Transformers 4.35.2
59
+ - Pytorch 2.1.0
60
+ - Datasets 2.15.0
61
+ - Tokenizers 0.15.0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed3d959cc6ff0319ebeb80281e439f2cba867f20ea34b65ae440688b5b8c75ef
3
  size 872450448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50f9f18326b14f78e6e5baa1cd8a08bf2c26653a7be354015978e694797c82eb
3
  size 872450448
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.45,
3
+ "eval_loss": 0.491791695356369,
4
+ "eval_runtime": 105.74,
5
+ "eval_samples": 852,
6
+ "eval_samples_per_second": 8.058,
7
+ "eval_steps_per_second": 2.014,
8
+ "train_loss": 0.305820442848905,
9
+ "train_runtime": 6666.1209,
10
+ "train_samples": 7665,
11
+ "train_samples_per_second": 2.3,
12
+ "train_steps_per_second": 0.575
13
+ }
config_argument.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cache_dir: ./cache
2
+ ddp_find_unused_parameters: false
3
+ ddp_timeout: 30000
4
+ device_map: auto
5
+ do_eval: true
6
+ do_train: true
7
+ eval_steps: 500
8
+ evaluation_strategy: steps
9
+ fp16: true
10
+ gradient_accumulation_steps: 1
11
+ gradient_checkpointing: true
12
+ gradient_checkpointing_kwargs:
13
+ use_reentrant: false
14
+ hub_model_id: BaoLocTown/sft-mistral-7b-v01-v1
15
+ hub_strategy: every_save
16
+ learning_rate: 5.0e-05
17
+ log_level: info
18
+ logging_first_step: true
19
+ logging_steps: 10
20
+ logging_strategy: steps
21
+ lora_alpha: 128
22
+ lora_dropout: 0.05
23
+ lora_r: 256
24
+ lora_target_modules:
25
+ - q_proj
26
+ - k_proj
27
+ - v_proj
28
+ - o_proj
29
+ lr_scheduler_type: cosine
30
+ max_seq_length: 1024
31
+ model_name_or_path: hllj/mistral-vi-math
32
+ model_type: auto
33
+ num_train_epochs: 2
34
+ output_dir: outputs-sft-mistral-v01-v1
35
+ overwrite_output_dir: true
36
+ per_device_eval_batch_size: 4
37
+ per_device_train_batch_size: 4
38
+ preprocessing_num_workers: 4
39
+ push_to_hub: true
40
+ report_to: wandb
41
+ run_name: sft-mistral-7b-v01-v1
42
+ save_steps: 500
43
+ save_strategy: steps
44
+ save_total_limit: 13
45
+ seed: 42
46
+ torch_dtype: float16
47
+ train_file_dir: datasets/finetune
48
+ use_peft: true
49
+ warmup_ratio: 0.05
50
+ weight_decay: 0.05
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.45,
3
+ "eval_loss": 0.491791695356369,
4
+ "eval_runtime": 105.74,
5
+ "eval_samples": 852,
6
+ "eval_samples_per_second": 8.058,
7
+ "eval_steps_per_second": 2.014
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.45,
3
+ "train_loss": 0.305820442848905,
4
+ "train_runtime": 6666.1209,
5
+ "train_samples": 7665,
6
+ "train_samples_per_second": 2.3,
7
+ "train_steps_per_second": 0.575
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1084 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.4480959833072509,
5
+ "eval_steps": 500,
6
+ "global_step": 1718,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 2.604166666666667e-07,
14
+ "loss": 0.7574,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.01,
19
+ "learning_rate": 2.604166666666667e-06,
20
+ "loss": 0.7145,
21
+ "step": 10
22
+ },
23
+ {
24
+ "epoch": 0.01,
25
+ "learning_rate": 5.208333333333334e-06,
26
+ "loss": 0.6428,
27
+ "step": 20
28
+ },
29
+ {
30
+ "epoch": 0.02,
31
+ "learning_rate": 7.8125e-06,
32
+ "loss": 0.5893,
33
+ "step": 30
34
+ },
35
+ {
36
+ "epoch": 0.02,
37
+ "learning_rate": 1.0416666666666668e-05,
38
+ "loss": 0.537,
39
+ "step": 40
40
+ },
41
+ {
42
+ "epoch": 0.03,
43
+ "learning_rate": 1.3020833333333334e-05,
44
+ "loss": 0.4432,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.03,
49
+ "learning_rate": 1.5625e-05,
50
+ "loss": 0.4239,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.04,
55
+ "learning_rate": 1.8229166666666668e-05,
56
+ "loss": 0.3672,
57
+ "step": 70
58
+ },
59
+ {
60
+ "epoch": 0.04,
61
+ "learning_rate": 2.0833333333333336e-05,
62
+ "loss": 0.3798,
63
+ "step": 80
64
+ },
65
+ {
66
+ "epoch": 0.05,
67
+ "learning_rate": 2.34375e-05,
68
+ "loss": 0.3666,
69
+ "step": 90
70
+ },
71
+ {
72
+ "epoch": 0.05,
73
+ "learning_rate": 2.604166666666667e-05,
74
+ "loss": 0.341,
75
+ "step": 100
76
+ },
77
+ {
78
+ "epoch": 0.06,
79
+ "learning_rate": 2.8645833333333333e-05,
80
+ "loss": 0.3855,
81
+ "step": 110
82
+ },
83
+ {
84
+ "epoch": 0.06,
85
+ "learning_rate": 3.125e-05,
86
+ "loss": 0.3517,
87
+ "step": 120
88
+ },
89
+ {
90
+ "epoch": 0.07,
91
+ "learning_rate": 3.385416666666667e-05,
92
+ "loss": 0.3482,
93
+ "step": 130
94
+ },
95
+ {
96
+ "epoch": 0.07,
97
+ "learning_rate": 3.6458333333333336e-05,
98
+ "loss": 0.3465,
99
+ "step": 140
100
+ },
101
+ {
102
+ "epoch": 0.08,
103
+ "learning_rate": 3.90625e-05,
104
+ "loss": 0.3455,
105
+ "step": 150
106
+ },
107
+ {
108
+ "epoch": 0.08,
109
+ "learning_rate": 4.166666666666667e-05,
110
+ "loss": 0.3418,
111
+ "step": 160
112
+ },
113
+ {
114
+ "epoch": 0.09,
115
+ "learning_rate": 4.4270833333333337e-05,
116
+ "loss": 0.338,
117
+ "step": 170
118
+ },
119
+ {
120
+ "epoch": 0.09,
121
+ "learning_rate": 4.6875e-05,
122
+ "loss": 0.3419,
123
+ "step": 180
124
+ },
125
+ {
126
+ "epoch": 0.1,
127
+ "learning_rate": 4.947916666666667e-05,
128
+ "loss": 0.3593,
129
+ "step": 190
130
+ },
131
+ {
132
+ "epoch": 0.1,
133
+ "learning_rate": 4.99994047380455e-05,
134
+ "loss": 0.3421,
135
+ "step": 200
136
+ },
137
+ {
138
+ "epoch": 0.11,
139
+ "learning_rate": 4.999698653493815e-05,
140
+ "loss": 0.3433,
141
+ "step": 210
142
+ },
143
+ {
144
+ "epoch": 0.11,
145
+ "learning_rate": 4.999270836660003e-05,
146
+ "loss": 0.3644,
147
+ "step": 220
148
+ },
149
+ {
150
+ "epoch": 0.12,
151
+ "learning_rate": 4.998657055135927e-05,
152
+ "loss": 0.3415,
153
+ "step": 230
154
+ },
155
+ {
156
+ "epoch": 0.13,
157
+ "learning_rate": 4.9978573545915854e-05,
158
+ "loss": 0.337,
159
+ "step": 240
160
+ },
161
+ {
162
+ "epoch": 0.13,
163
+ "learning_rate": 4.996871794530757e-05,
164
+ "loss": 0.3445,
165
+ "step": 250
166
+ },
167
+ {
168
+ "epoch": 0.14,
169
+ "learning_rate": 4.9957004482865796e-05,
170
+ "loss": 0.3434,
171
+ "step": 260
172
+ },
173
+ {
174
+ "epoch": 0.14,
175
+ "learning_rate": 4.994343403016093e-05,
176
+ "loss": 0.3308,
177
+ "step": 270
178
+ },
179
+ {
180
+ "epoch": 0.15,
181
+ "learning_rate": 4.992800759693746e-05,
182
+ "loss": 0.3223,
183
+ "step": 280
184
+ },
185
+ {
186
+ "epoch": 0.15,
187
+ "learning_rate": 4.9910726331038935e-05,
188
+ "loss": 0.3283,
189
+ "step": 290
190
+ },
191
+ {
192
+ "epoch": 0.16,
193
+ "learning_rate": 4.989159151832251e-05,
194
+ "loss": 0.3344,
195
+ "step": 300
196
+ },
197
+ {
198
+ "epoch": 0.16,
199
+ "learning_rate": 4.987060458256324e-05,
200
+ "loss": 0.3395,
201
+ "step": 310
202
+ },
203
+ {
204
+ "epoch": 0.17,
205
+ "learning_rate": 4.98477670853482e-05,
206
+ "loss": 0.3255,
207
+ "step": 320
208
+ },
209
+ {
210
+ "epoch": 0.17,
211
+ "learning_rate": 4.982308072596025e-05,
212
+ "loss": 0.3075,
213
+ "step": 330
214
+ },
215
+ {
216
+ "epoch": 0.18,
217
+ "learning_rate": 4.979654734125161e-05,
218
+ "loss": 0.3294,
219
+ "step": 340
220
+ },
221
+ {
222
+ "epoch": 0.18,
223
+ "learning_rate": 4.976816890550717e-05,
224
+ "loss": 0.3196,
225
+ "step": 350
226
+ },
227
+ {
228
+ "epoch": 0.19,
229
+ "learning_rate": 4.9737947530297606e-05,
230
+ "loss": 0.3436,
231
+ "step": 360
232
+ },
233
+ {
234
+ "epoch": 0.19,
235
+ "learning_rate": 4.9705885464322266e-05,
236
+ "loss": 0.3114,
237
+ "step": 370
238
+ },
239
+ {
240
+ "epoch": 0.2,
241
+ "learning_rate": 4.967198509324183e-05,
242
+ "loss": 0.3257,
243
+ "step": 380
244
+ },
245
+ {
246
+ "epoch": 0.2,
247
+ "learning_rate": 4.9636248939500805e-05,
248
+ "loss": 0.3062,
249
+ "step": 390
250
+ },
251
+ {
252
+ "epoch": 0.21,
253
+ "learning_rate": 4.959867966213986e-05,
254
+ "loss": 0.3381,
255
+ "step": 400
256
+ },
257
+ {
258
+ "epoch": 0.21,
259
+ "learning_rate": 4.955928005659792e-05,
260
+ "loss": 0.3234,
261
+ "step": 410
262
+ },
263
+ {
264
+ "epoch": 0.22,
265
+ "learning_rate": 4.951805305450422e-05,
266
+ "loss": 0.3345,
267
+ "step": 420
268
+ },
269
+ {
270
+ "epoch": 0.22,
271
+ "learning_rate": 4.947500172346016e-05,
272
+ "loss": 0.317,
273
+ "step": 430
274
+ },
275
+ {
276
+ "epoch": 0.23,
277
+ "learning_rate": 4.9430129266811e-05,
278
+ "loss": 0.3306,
279
+ "step": 440
280
+ },
281
+ {
282
+ "epoch": 0.23,
283
+ "learning_rate": 4.9383439023407585e-05,
284
+ "loss": 0.3393,
285
+ "step": 450
286
+ },
287
+ {
288
+ "epoch": 0.24,
289
+ "learning_rate": 4.933493446735784e-05,
290
+ "loss": 0.3143,
291
+ "step": 460
292
+ },
293
+ {
294
+ "epoch": 0.25,
295
+ "learning_rate": 4.928461920776832e-05,
296
+ "loss": 0.3322,
297
+ "step": 470
298
+ },
299
+ {
300
+ "epoch": 0.25,
301
+ "learning_rate": 4.923249698847564e-05,
302
+ "loss": 0.2923,
303
+ "step": 480
304
+ },
305
+ {
306
+ "epoch": 0.26,
307
+ "learning_rate": 4.91785716877679e-05,
308
+ "loss": 0.3197,
309
+ "step": 490
310
+ },
311
+ {
312
+ "epoch": 0.26,
313
+ "learning_rate": 4.912284731809614e-05,
314
+ "loss": 0.313,
315
+ "step": 500
316
+ },
317
+ {
318
+ "epoch": 0.26,
319
+ "eval_loss": 0.5063315629959106,
320
+ "eval_runtime": 105.7257,
321
+ "eval_samples_per_second": 8.059,
322
+ "eval_steps_per_second": 2.015,
323
+ "step": 500
324
+ },
325
+ {
326
+ "epoch": 0.27,
327
+ "learning_rate": 4.906532802577575e-05,
328
+ "loss": 0.3151,
329
+ "step": 510
330
+ },
331
+ {
332
+ "epoch": 0.27,
333
+ "learning_rate": 4.9006018090677974e-05,
334
+ "loss": 0.3121,
335
+ "step": 520
336
+ },
337
+ {
338
+ "epoch": 0.28,
339
+ "learning_rate": 4.8944921925911444e-05,
340
+ "loss": 0.3254,
341
+ "step": 530
342
+ },
343
+ {
344
+ "epoch": 0.28,
345
+ "learning_rate": 4.888204407749382e-05,
346
+ "loss": 0.3487,
347
+ "step": 540
348
+ },
349
+ {
350
+ "epoch": 0.29,
351
+ "learning_rate": 4.881738922401353e-05,
352
+ "loss": 0.3261,
353
+ "step": 550
354
+ },
355
+ {
356
+ "epoch": 0.29,
357
+ "learning_rate": 4.8750962176281635e-05,
358
+ "loss": 0.334,
359
+ "step": 560
360
+ },
361
+ {
362
+ "epoch": 0.3,
363
+ "learning_rate": 4.868276787697389e-05,
364
+ "loss": 0.3191,
365
+ "step": 570
366
+ },
367
+ {
368
+ "epoch": 0.3,
369
+ "learning_rate": 4.861281140026296e-05,
370
+ "loss": 0.3118,
371
+ "step": 580
372
+ },
373
+ {
374
+ "epoch": 0.31,
375
+ "learning_rate": 4.854109795144084e-05,
376
+ "loss": 0.3249,
377
+ "step": 590
378
+ },
379
+ {
380
+ "epoch": 0.31,
381
+ "learning_rate": 4.8467632866531596e-05,
382
+ "loss": 0.3095,
383
+ "step": 600
384
+ },
385
+ {
386
+ "epoch": 0.32,
387
+ "learning_rate": 4.8392421611894275e-05,
388
+ "loss": 0.3334,
389
+ "step": 610
390
+ },
391
+ {
392
+ "epoch": 0.32,
393
+ "learning_rate": 4.83154697838162e-05,
394
+ "loss": 0.3193,
395
+ "step": 620
396
+ },
397
+ {
398
+ "epoch": 0.33,
399
+ "learning_rate": 4.8236783108096514e-05,
400
+ "loss": 0.297,
401
+ "step": 630
402
+ },
403
+ {
404
+ "epoch": 0.33,
405
+ "learning_rate": 4.815636743962022e-05,
406
+ "loss": 0.3232,
407
+ "step": 640
408
+ },
409
+ {
410
+ "epoch": 0.34,
411
+ "learning_rate": 4.8074228761922444e-05,
412
+ "loss": 0.3348,
413
+ "step": 650
414
+ },
415
+ {
416
+ "epoch": 0.34,
417
+ "learning_rate": 4.799037318674329e-05,
418
+ "loss": 0.3401,
419
+ "step": 660
420
+ },
421
+ {
422
+ "epoch": 0.35,
423
+ "learning_rate": 4.7904806953573e-05,
424
+ "loss": 0.3299,
425
+ "step": 670
426
+ },
427
+ {
428
+ "epoch": 0.35,
429
+ "learning_rate": 4.781753642918778e-05,
430
+ "loss": 0.3152,
431
+ "step": 680
432
+ },
433
+ {
434
+ "epoch": 0.36,
435
+ "learning_rate": 4.772856810717599e-05,
436
+ "loss": 0.3104,
437
+ "step": 690
438
+ },
439
+ {
440
+ "epoch": 0.37,
441
+ "learning_rate": 4.763790860745502e-05,
442
+ "loss": 0.3331,
443
+ "step": 700
444
+ },
445
+ {
446
+ "epoch": 0.37,
447
+ "learning_rate": 4.7545564675778677e-05,
448
+ "loss": 0.332,
449
+ "step": 710
450
+ },
451
+ {
452
+ "epoch": 0.38,
453
+ "learning_rate": 4.745154318323528e-05,
454
+ "loss": 0.3318,
455
+ "step": 720
456
+ },
457
+ {
458
+ "epoch": 0.38,
459
+ "learning_rate": 4.7355851125736394e-05,
460
+ "loss": 0.3166,
461
+ "step": 730
462
+ },
463
+ {
464
+ "epoch": 0.39,
465
+ "learning_rate": 4.725849562349627e-05,
466
+ "loss": 0.3055,
467
+ "step": 740
468
+ },
469
+ {
470
+ "epoch": 0.39,
471
+ "learning_rate": 4.715948392050206e-05,
472
+ "loss": 0.3213,
473
+ "step": 750
474
+ },
475
+ {
476
+ "epoch": 0.4,
477
+ "learning_rate": 4.7058823383974794e-05,
478
+ "loss": 0.315,
479
+ "step": 760
480
+ },
481
+ {
482
+ "epoch": 0.4,
483
+ "learning_rate": 4.6956521503821215e-05,
484
+ "loss": 0.3525,
485
+ "step": 770
486
+ },
487
+ {
488
+ "epoch": 0.41,
489
+ "learning_rate": 4.685258589207648e-05,
490
+ "loss": 0.3089,
491
+ "step": 780
492
+ },
493
+ {
494
+ "epoch": 0.41,
495
+ "learning_rate": 4.6747024282337735e-05,
496
+ "loss": 0.3047,
497
+ "step": 790
498
+ },
499
+ {
500
+ "epoch": 0.42,
501
+ "learning_rate": 4.6639844529188734e-05,
502
+ "loss": 0.3192,
503
+ "step": 800
504
+ },
505
+ {
506
+ "epoch": 0.42,
507
+ "learning_rate": 4.653105460761533e-05,
508
+ "loss": 0.319,
509
+ "step": 810
510
+ },
511
+ {
512
+ "epoch": 0.43,
513
+ "learning_rate": 4.642066261241212e-05,
514
+ "loss": 0.3179,
515
+ "step": 820
516
+ },
517
+ {
518
+ "epoch": 0.43,
519
+ "learning_rate": 4.630867675758013e-05,
520
+ "loss": 0.3061,
521
+ "step": 830
522
+ },
523
+ {
524
+ "epoch": 0.44,
525
+ "learning_rate": 4.61951053757156e-05,
526
+ "loss": 0.3123,
527
+ "step": 840
528
+ },
529
+ {
530
+ "epoch": 0.44,
531
+ "learning_rate": 4.6079956917390024e-05,
532
+ "loss": 0.2952,
533
+ "step": 850
534
+ },
535
+ {
536
+ "epoch": 1.0,
537
+ "learning_rate": 4.5963239950521306e-05,
538
+ "loss": 0.2996,
539
+ "step": 860
540
+ },
541
+ {
542
+ "epoch": 1.01,
543
+ "learning_rate": 4.5844963159736303e-05,
544
+ "loss": 0.285,
545
+ "step": 870
546
+ },
547
+ {
548
+ "epoch": 1.01,
549
+ "learning_rate": 4.572513534572457e-05,
550
+ "loss": 0.2925,
551
+ "step": 880
552
+ },
553
+ {
554
+ "epoch": 1.02,
555
+ "learning_rate": 4.560376542458354e-05,
556
+ "loss": 0.2836,
557
+ "step": 890
558
+ },
559
+ {
560
+ "epoch": 1.02,
561
+ "learning_rate": 4.5480862427155134e-05,
562
+ "loss": 0.2824,
563
+ "step": 900
564
+ },
565
+ {
566
+ "epoch": 1.03,
567
+ "learning_rate": 4.535643549835373e-05,
568
+ "loss": 0.2814,
569
+ "step": 910
570
+ },
571
+ {
572
+ "epoch": 1.03,
573
+ "learning_rate": 4.5230493896485774e-05,
574
+ "loss": 0.2838,
575
+ "step": 920
576
+ },
577
+ {
578
+ "epoch": 1.04,
579
+ "learning_rate": 4.5103046992560855e-05,
580
+ "loss": 0.2802,
581
+ "step": 930
582
+ },
583
+ {
584
+ "epoch": 1.04,
585
+ "learning_rate": 4.4974104269594444e-05,
586
+ "loss": 0.3026,
587
+ "step": 940
588
+ },
589
+ {
590
+ "epoch": 1.05,
591
+ "learning_rate": 4.484367532190228e-05,
592
+ "loss": 0.2613,
593
+ "step": 950
594
+ },
595
+ {
596
+ "epoch": 1.05,
597
+ "learning_rate": 4.47117698543865e-05,
598
+ "loss": 0.276,
599
+ "step": 960
600
+ },
601
+ {
602
+ "epoch": 1.06,
603
+ "learning_rate": 4.457839768181349e-05,
604
+ "loss": 0.2865,
605
+ "step": 970
606
+ },
607
+ {
608
+ "epoch": 1.06,
609
+ "learning_rate": 4.444356872808362e-05,
610
+ "loss": 0.2705,
611
+ "step": 980
612
+ },
613
+ {
614
+ "epoch": 1.07,
615
+ "learning_rate": 4.43072930254928e-05,
616
+ "loss": 0.2942,
617
+ "step": 990
618
+ },
619
+ {
620
+ "epoch": 1.07,
621
+ "learning_rate": 4.4169580713986037e-05,
622
+ "loss": 0.279,
623
+ "step": 1000
624
+ },
625
+ {
626
+ "epoch": 1.07,
627
+ "eval_loss": 0.48916903138160706,
628
+ "eval_runtime": 106.4721,
629
+ "eval_samples_per_second": 8.002,
630
+ "eval_steps_per_second": 2.001,
631
+ "step": 1000
632
+ },
633
+ {
634
+ "epoch": 1.08,
635
+ "learning_rate": 4.4030442040402915e-05,
636
+ "loss": 0.2714,
637
+ "step": 1010
638
+ },
639
+ {
640
+ "epoch": 1.08,
641
+ "learning_rate": 4.388988735771518e-05,
642
+ "loss": 0.2521,
643
+ "step": 1020
644
+ },
645
+ {
646
+ "epoch": 1.09,
647
+ "learning_rate": 4.374792712425637e-05,
648
+ "loss": 0.2797,
649
+ "step": 1030
650
+ },
651
+ {
652
+ "epoch": 1.09,
653
+ "learning_rate": 4.360457190294366e-05,
654
+ "loss": 0.2666,
655
+ "step": 1040
656
+ },
657
+ {
658
+ "epoch": 1.1,
659
+ "learning_rate": 4.345983236049189e-05,
660
+ "loss": 0.2916,
661
+ "step": 1050
662
+ },
663
+ {
664
+ "epoch": 1.1,
665
+ "learning_rate": 4.33137192666199e-05,
666
+ "loss": 0.2758,
667
+ "step": 1060
668
+ },
669
+ {
670
+ "epoch": 1.11,
671
+ "learning_rate": 4.3166243493249136e-05,
672
+ "loss": 0.2632,
673
+ "step": 1070
674
+ },
675
+ {
676
+ "epoch": 1.12,
677
+ "learning_rate": 4.301741601369475e-05,
678
+ "loss": 0.2913,
679
+ "step": 1080
680
+ },
681
+ {
682
+ "epoch": 1.12,
683
+ "learning_rate": 4.286724790184906e-05,
684
+ "loss": 0.2753,
685
+ "step": 1090
686
+ },
687
+ {
688
+ "epoch": 1.13,
689
+ "learning_rate": 4.27157503313576e-05,
690
+ "loss": 0.2565,
691
+ "step": 1100
692
+ },
693
+ {
694
+ "epoch": 1.13,
695
+ "learning_rate": 4.256293457478769e-05,
696
+ "loss": 0.2745,
697
+ "step": 1110
698
+ },
699
+ {
700
+ "epoch": 1.14,
701
+ "learning_rate": 4.24088120027897e-05,
702
+ "loss": 0.2663,
703
+ "step": 1120
704
+ },
705
+ {
706
+ "epoch": 1.14,
707
+ "learning_rate": 4.2253394083250946e-05,
708
+ "loss": 0.2836,
709
+ "step": 1130
710
+ },
711
+ {
712
+ "epoch": 1.15,
713
+ "learning_rate": 4.209669238044245e-05,
714
+ "loss": 0.2894,
715
+ "step": 1140
716
+ },
717
+ {
718
+ "epoch": 1.15,
719
+ "learning_rate": 4.19387185541584e-05,
720
+ "loss": 0.2705,
721
+ "step": 1150
722
+ },
723
+ {
724
+ "epoch": 1.16,
725
+ "learning_rate": 4.1779484358848644e-05,
726
+ "loss": 0.275,
727
+ "step": 1160
728
+ },
729
+ {
730
+ "epoch": 1.16,
731
+ "learning_rate": 4.161900164274403e-05,
732
+ "loss": 0.2566,
733
+ "step": 1170
734
+ },
735
+ {
736
+ "epoch": 1.17,
737
+ "learning_rate": 4.145728234697479e-05,
738
+ "loss": 0.2475,
739
+ "step": 1180
740
+ },
741
+ {
742
+ "epoch": 1.17,
743
+ "learning_rate": 4.12943385046821e-05,
744
+ "loss": 0.2853,
745
+ "step": 1190
746
+ },
747
+ {
748
+ "epoch": 1.18,
749
+ "learning_rate": 4.113018224012262e-05,
750
+ "loss": 0.2749,
751
+ "step": 1200
752
+ },
753
+ {
754
+ "epoch": 1.18,
755
+ "learning_rate": 4.0964825767766465e-05,
756
+ "loss": 0.2693,
757
+ "step": 1210
758
+ },
759
+ {
760
+ "epoch": 1.19,
761
+ "learning_rate": 4.079828139138827e-05,
762
+ "loss": 0.2938,
763
+ "step": 1220
764
+ },
765
+ {
766
+ "epoch": 1.19,
767
+ "learning_rate": 4.063056150315177e-05,
768
+ "loss": 0.2793,
769
+ "step": 1230
770
+ },
771
+ {
772
+ "epoch": 1.2,
773
+ "learning_rate": 4.046167858268766e-05,
774
+ "loss": 0.2647,
775
+ "step": 1240
776
+ },
777
+ {
778
+ "epoch": 1.2,
779
+ "learning_rate": 4.0291645196165026e-05,
780
+ "loss": 0.2764,
781
+ "step": 1250
782
+ },
783
+ {
784
+ "epoch": 1.21,
785
+ "learning_rate": 4.012047399535642e-05,
786
+ "loss": 0.2802,
787
+ "step": 1260
788
+ },
789
+ {
790
+ "epoch": 1.21,
791
+ "learning_rate": 3.9948177716696324e-05,
792
+ "loss": 0.278,
793
+ "step": 1270
794
+ },
795
+ {
796
+ "epoch": 1.22,
797
+ "learning_rate": 3.977476918033357e-05,
798
+ "loss": 0.275,
799
+ "step": 1280
800
+ },
801
+ {
802
+ "epoch": 1.22,
803
+ "learning_rate": 3.960026128917741e-05,
804
+ "loss": 0.2548,
805
+ "step": 1290
806
+ },
807
+ {
808
+ "epoch": 1.23,
809
+ "learning_rate": 3.942466702793738e-05,
810
+ "loss": 0.2629,
811
+ "step": 1300
812
+ },
813
+ {
814
+ "epoch": 1.24,
815
+ "learning_rate": 3.924799946215723e-05,
816
+ "loss": 0.2776,
817
+ "step": 1310
818
+ },
819
+ {
820
+ "epoch": 1.24,
821
+ "learning_rate": 3.9070271737242656e-05,
822
+ "loss": 0.2868,
823
+ "step": 1320
824
+ },
825
+ {
826
+ "epoch": 1.25,
827
+ "learning_rate": 3.889149707748327e-05,
828
+ "loss": 0.2874,
829
+ "step": 1330
830
+ },
831
+ {
832
+ "epoch": 1.25,
833
+ "learning_rate": 3.8711688785068555e-05,
834
+ "loss": 0.2624,
835
+ "step": 1340
836
+ },
837
+ {
838
+ "epoch": 1.26,
839
+ "learning_rate": 3.853086023909808e-05,
840
+ "loss": 0.2674,
841
+ "step": 1350
842
+ },
843
+ {
844
+ "epoch": 1.26,
845
+ "learning_rate": 3.8349024894586014e-05,
846
+ "loss": 0.2613,
847
+ "step": 1360
848
+ },
849
+ {
850
+ "epoch": 1.27,
851
+ "learning_rate": 3.816619628146e-05,
852
+ "loss": 0.2559,
853
+ "step": 1370
854
+ },
855
+ {
856
+ "epoch": 1.27,
857
+ "learning_rate": 3.798238800355436e-05,
858
+ "loss": 0.2713,
859
+ "step": 1380
860
+ },
861
+ {
862
+ "epoch": 1.28,
863
+ "learning_rate": 3.77976137375979e-05,
864
+ "loss": 0.2493,
865
+ "step": 1390
866
+ },
867
+ {
868
+ "epoch": 1.28,
869
+ "learning_rate": 3.761188723219628e-05,
870
+ "loss": 0.2425,
871
+ "step": 1400
872
+ },
873
+ {
874
+ "epoch": 1.29,
875
+ "learning_rate": 3.742522230680896e-05,
876
+ "loss": 0.2541,
877
+ "step": 1410
878
+ },
879
+ {
880
+ "epoch": 1.29,
881
+ "learning_rate": 3.7237632850720995e-05,
882
+ "loss": 0.278,
883
+ "step": 1420
884
+ },
885
+ {
886
+ "epoch": 1.3,
887
+ "learning_rate": 3.70491328220095e-05,
888
+ "loss": 0.2362,
889
+ "step": 1430
890
+ },
891
+ {
892
+ "epoch": 1.3,
893
+ "learning_rate": 3.685973624650511e-05,
894
+ "loss": 0.255,
895
+ "step": 1440
896
+ },
897
+ {
898
+ "epoch": 1.31,
899
+ "learning_rate": 3.6669457216748326e-05,
900
+ "loss": 0.2668,
901
+ "step": 1450
902
+ },
903
+ {
904
+ "epoch": 1.31,
905
+ "learning_rate": 3.647830989094095e-05,
906
+ "loss": 0.2416,
907
+ "step": 1460
908
+ },
909
+ {
910
+ "epoch": 1.32,
911
+ "learning_rate": 3.6286308491892565e-05,
912
+ "loss": 0.2492,
913
+ "step": 1470
914
+ },
915
+ {
916
+ "epoch": 1.32,
917
+ "learning_rate": 3.609346730596229e-05,
918
+ "loss": 0.2546,
919
+ "step": 1480
920
+ },
921
+ {
922
+ "epoch": 1.33,
923
+ "learning_rate": 3.589980068199575e-05,
924
+ "loss": 0.2502,
925
+ "step": 1490
926
+ },
927
+ {
928
+ "epoch": 1.33,
929
+ "learning_rate": 3.570532303025742e-05,
930
+ "loss": 0.2584,
931
+ "step": 1500
932
+ },
933
+ {
934
+ "epoch": 1.33,
935
+ "eval_loss": 0.4950372576713562,
936
+ "eval_runtime": 105.7937,
937
+ "eval_samples_per_second": 8.053,
938
+ "eval_steps_per_second": 2.013,
939
+ "step": 1500
940
+ },
941
+ {
942
+ "epoch": 1.34,
943
+ "learning_rate": 3.551004882135839e-05,
944
+ "loss": 0.2638,
945
+ "step": 1510
946
+ },
947
+ {
948
+ "epoch": 1.34,
949
+ "learning_rate": 3.531399258517962e-05,
950
+ "loss": 0.2383,
951
+ "step": 1520
952
+ },
953
+ {
954
+ "epoch": 1.35,
955
+ "learning_rate": 3.511716890979084e-05,
956
+ "loss": 0.2677,
957
+ "step": 1530
958
+ },
959
+ {
960
+ "epoch": 1.36,
961
+ "learning_rate": 3.4919592440365085e-05,
962
+ "loss": 0.255,
963
+ "step": 1540
964
+ },
965
+ {
966
+ "epoch": 1.36,
967
+ "learning_rate": 3.472127787808893e-05,
968
+ "loss": 0.2474,
969
+ "step": 1550
970
+ },
971
+ {
972
+ "epoch": 1.37,
973
+ "learning_rate": 3.452223997906871e-05,
974
+ "loss": 0.2474,
975
+ "step": 1560
976
+ },
977
+ {
978
+ "epoch": 1.37,
979
+ "learning_rate": 3.432249355323244e-05,
980
+ "loss": 0.2632,
981
+ "step": 1570
982
+ },
983
+ {
984
+ "epoch": 1.38,
985
+ "learning_rate": 3.412205346322794e-05,
986
+ "loss": 0.2338,
987
+ "step": 1580
988
+ },
989
+ {
990
+ "epoch": 1.38,
991
+ "learning_rate": 3.392093462331688e-05,
992
+ "loss": 0.2553,
993
+ "step": 1590
994
+ },
995
+ {
996
+ "epoch": 1.39,
997
+ "learning_rate": 3.371915199826506e-05,
998
+ "loss": 0.2519,
999
+ "step": 1600
1000
+ },
1001
+ {
1002
+ "epoch": 1.39,
1003
+ "learning_rate": 3.3516720602228945e-05,
1004
+ "loss": 0.2549,
1005
+ "step": 1610
1006
+ },
1007
+ {
1008
+ "epoch": 1.4,
1009
+ "learning_rate": 3.331365549763848e-05,
1010
+ "loss": 0.2802,
1011
+ "step": 1620
1012
+ },
1013
+ {
1014
+ "epoch": 1.4,
1015
+ "learning_rate": 3.310997179407631e-05,
1016
+ "loss": 0.2734,
1017
+ "step": 1630
1018
+ },
1019
+ {
1020
+ "epoch": 1.41,
1021
+ "learning_rate": 3.2905684647153536e-05,
1022
+ "loss": 0.2609,
1023
+ "step": 1640
1024
+ },
1025
+ {
1026
+ "epoch": 1.41,
1027
+ "learning_rate": 3.270080925738202e-05,
1028
+ "loss": 0.2383,
1029
+ "step": 1650
1030
+ },
1031
+ {
1032
+ "epoch": 1.42,
1033
+ "learning_rate": 3.2495360869043354e-05,
1034
+ "loss": 0.2576,
1035
+ "step": 1660
1036
+ },
1037
+ {
1038
+ "epoch": 1.42,
1039
+ "learning_rate": 3.2289354769054556e-05,
1040
+ "loss": 0.2305,
1041
+ "step": 1670
1042
+ },
1043
+ {
1044
+ "epoch": 1.43,
1045
+ "learning_rate": 3.2082806285830645e-05,
1046
+ "loss": 0.2508,
1047
+ "step": 1680
1048
+ },
1049
+ {
1050
+ "epoch": 1.43,
1051
+ "learning_rate": 3.187573078814402e-05,
1052
+ "loss": 0.2367,
1053
+ "step": 1690
1054
+ },
1055
+ {
1056
+ "epoch": 1.44,
1057
+ "learning_rate": 3.166814368398098e-05,
1058
+ "loss": 0.2335,
1059
+ "step": 1700
1060
+ },
1061
+ {
1062
+ "epoch": 1.44,
1063
+ "learning_rate": 3.146006041939522e-05,
1064
+ "loss": 0.2523,
1065
+ "step": 1710
1066
+ },
1067
+ {
1068
+ "epoch": 1.45,
1069
+ "step": 1718,
1070
+ "total_flos": 3.091617473007452e+17,
1071
+ "train_loss": 0.305820442848905,
1072
+ "train_runtime": 6666.1209,
1073
+ "train_samples_per_second": 2.3,
1074
+ "train_steps_per_second": 0.575
1075
+ }
1076
+ ],
1077
+ "logging_steps": 10,
1078
+ "max_steps": 3834,
1079
+ "num_train_epochs": 2,
1080
+ "save_steps": 500,
1081
+ "total_flos": 3.091617473007452e+17,
1082
+ "trial_name": null,
1083
+ "trial_params": null
1084
+ }