jikaixuan committed on
Commit
59b129b
·
verified ·
1 Parent(s): 7973cf8

Model save

Browse files
Files changed (5) hide show
  1. README.md +21 -24
  2. adapter_model.safetensors +1 -1
  3. all_results.json +2 -17
  4. train_results.json +2 -2
  5. trainer_state.json +681 -1497
README.md CHANGED
@@ -2,13 +2,10 @@
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - dpo
8
  - generated_from_trainer
9
  base_model: mistralai/Mistral-7B-v0.1
10
- datasets:
11
- - HuggingFaceH4/ultrafeedback_binarized
12
  model-index:
13
  - name: zephyr-7b
14
  results: []
@@ -19,19 +16,19 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # zephyr-7b
21
 
22
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-qlora](https://huggingface.co/alignment-handbook/zephyr-7b-sft-qlora) on the HuggingFaceH4/ultrafeedback_binarized dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 0.6906
25
- - Rewards/chosen: -0.3413
26
- - Rewards/rejected: -0.5652
27
- - Rewards/accuracies: 0.3631
28
- - Rewards/margins: 0.2239
29
- - Logps/rejected: -131.9189
30
- - Logps/chosen: -103.0295
31
- - Logits/rejected: -0.1381
32
- - Logits/chosen: -0.2453
33
- - Use Label: 15879.8574
34
- - Pred Label: 4192.1431
35
 
36
  ## Model description
37
 
@@ -68,15 +65,15 @@ The following hyperparameters were used during training:
68
 
69
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen | Use Label | Pred Label |
70
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|:----------:|:----------:|
71
- | 0.6818 | 0.1 | 100 | 0.6814 | -0.0056 | -0.0496 | 0.3393 | 0.0440 | -80.3582 | -69.4632 | -2.0664 | -2.0975 | 1833.4603 | 22.5397 |
72
- | 0.6818 | 0.21 | 200 | 0.6861 | -0.1358 | -0.2381 | 0.3373 | 0.1023 | -99.2068 | -82.4782 | -1.9938 | -2.0215 | 3701.2063 | 258.7936 |
73
- | 0.6848 | 0.31 | 300 | 0.6877 | -0.2068 | -0.3388 | 0.3413 | 0.1320 | -109.2766 | -89.5763 | -1.8828 | -1.9157 | 5437.8730 | 626.1270 |
74
- | 0.6857 | 0.42 | 400 | 0.6885 | -0.1802 | -0.3299 | 0.3532 | 0.1497 | -108.3913 | -86.9237 | -1.4031 | -1.4529 | 7112.4443 | 1055.5555 |
75
- | 0.6894 | 0.52 | 500 | 0.6892 | -0.2862 | -0.4559 | 0.3552 | 0.1697 | -120.9922 | -97.5203 | -0.5997 | -0.6889 | 8741.4287 | 1530.5714 |
76
- | 0.6881 | 0.63 | 600 | 0.6918 | -0.3826 | -0.6059 | 0.3532 | 0.2233 | -135.9845 | -107.1618 | -0.2548 | -0.3579 | 10293.6826 | 2082.3174 |
77
- | 0.6913 | 0.73 | 700 | 0.6899 | -0.3542 | -0.5787 | 0.3671 | 0.2244 | -133.2637 | -104.3247 | -0.2462 | -0.3470 | 11806.4766 | 2673.5239 |
78
- | 0.6893 | 0.84 | 800 | 0.6904 | -0.3443 | -0.5684 | 0.3631 | 0.2241 | -132.2416 | -103.3355 | -0.1293 | -0.2367 | 13331.9043 | 3252.0952 |
79
- | 0.689 | 0.94 | 900 | 0.6907 | -0.3413 | -0.5651 | 0.3631 | 0.2238 | -131.9111 | -103.0301 | -0.1367 | -0.2437 | 14866.4766 | 3821.5239 |
80
 
81
 
82
  ### Framework versions
 
2
  license: apache-2.0
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - dpo
7
  - generated_from_trainer
8
  base_model: mistralai/Mistral-7B-v0.1
 
 
9
  model-index:
10
  - name: zephyr-7b
11
  results: []
 
16
 
17
  # zephyr-7b
18
 
19
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unspecified dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.6918
22
+ - Rewards/chosen: -0.0863
23
+ - Rewards/rejected: -0.1983
24
+ - Rewards/accuracies: 0.3571
25
+ - Rewards/margins: 0.1120
26
+ - Logps/rejected: -95.2291
27
+ - Logps/chosen: -77.5275
28
+ - Logits/rejected: -1.9113
29
+ - Logits/chosen: -1.9391
30
+ - Use Label: 14335.7139
31
+ - Pred Label: 4352.2856
32
 
33
  ## Model description
34
 
 
65
 
66
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen | Use Label | Pred Label |
67
  |:-------------:|:-----:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|:----------:|:----------:|
68
+ | 0.6876 | 0.1 | 100 | 0.6896 | -0.0555 | -0.0989 | 0.3353 | 0.0434 | -85.2883 | -74.4495 | -2.0761 | -2.1076 | 1766.8572 | 89.1429 |
69
+ | 0.6892 | 0.21 | 200 | 0.6894 | -0.0049 | -0.0560 | 0.3492 | 0.0511 | -80.9954 | -69.3876 | -2.0287 | -2.0520 | 3500.8889 | 459.1111 |
70
+ | 0.6904 | 0.31 | 300 | 0.6909 | -0.0625 | -0.1410 | 0.3532 | 0.0785 | -89.5016 | -75.1524 | -1.9943 | -2.0164 | 5140.6826 | 923.3174 |
71
+ | 0.6906 | 0.42 | 400 | 0.6921 | -0.0637 | -0.1541 | 0.3512 | 0.0904 | -90.8064 | -75.2687 | -2.0248 | -2.0481 | 6695.4287 | 1472.5714 |
72
+ | 0.6903 | 0.52 | 500 | 0.6914 | -0.0747 | -0.1726 | 0.3492 | 0.0979 | -92.6561 | -76.3697 | -1.9801 | -2.0071 | 8246.2061 | 2025.7937 |
73
+ | 0.6903 | 0.63 | 600 | 0.6917 | -0.1005 | -0.2047 | 0.3552 | 0.1042 | -95.8670 | -78.9543 | -1.9601 | -1.9870 | 9772.0635 | 2603.9365 |
74
+ | 0.6917 | 0.73 | 700 | 0.6917 | -0.1117 | -0.2224 | 0.3512 | 0.1108 | -97.6411 | -80.0681 | -1.9401 | -1.9659 | 11284.7773 | 3195.2222 |
75
+ | 0.6912 | 0.84 | 800 | 0.6917 | -0.0869 | -0.1981 | 0.3631 | 0.1112 | -95.2089 | -77.5874 | -1.9144 | -1.9422 | 12826.8252 | 3757.1746 |
76
+ | 0.6914 | 0.94 | 900 | 0.6918 | -0.0863 | -0.1983 | 0.3571 | 0.1120 | -95.2291 | -77.5275 | -1.9113 | -1.9391 | 14335.7139 | 4352.2856 |
77
 
78
 
79
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39226468743b2eef561efc7ded35bd95e31454122bc7f6a650b598ae1273a8d0
3
  size 671150064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37ff211fdd55e89806d0731a069b1a1347270a24364fedc285bf81c9b757d749
3
  size 671150064
all_results.json CHANGED
@@ -1,22 +1,7 @@
1
  {
2
  "epoch": 1.0,
3
- "eval_logits/chosen": -0.24525223672389984,
4
- "eval_logits/rejected": -0.1380803883075714,
5
- "eval_logps/chosen": -103.02954864501953,
6
- "eval_logps/rejected": -131.91891479492188,
7
- "eval_loss": 0.6906174421310425,
8
- "eval_pred_label": 4192.14306640625,
9
- "eval_rewards/accuracies": 0.363095223903656,
10
- "eval_rewards/chosen": -0.3412899374961853,
11
- "eval_rewards/margins": 0.22391849756240845,
12
- "eval_rewards/rejected": -0.5652084350585938,
13
- "eval_runtime": 247.5585,
14
- "eval_samples": 2000,
15
- "eval_samples_per_second": 8.079,
16
- "eval_steps_per_second": 0.254,
17
- "eval_use_label": 15879.857421875,
18
- "train_loss": 0.6880922077838039,
19
- "train_runtime": 20023.3666,
20
  "train_samples": 61135,
21
  "train_samples_per_second": 3.053,
22
  "train_steps_per_second": 0.048
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.6906769273168754,
4
+ "train_runtime": 20027.4031,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  "train_samples": 61135,
6
  "train_samples_per_second": 3.053,
7
  "train_steps_per_second": 0.048
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.6880922077838039,
4
- "train_runtime": 20023.3666,
5
  "train_samples": 61135,
6
  "train_samples_per_second": 3.053,
7
  "train_steps_per_second": 0.048
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.6906769273168754,
4
+ "train_runtime": 20027.4031,
5
  "train_samples": 61135,
6
  "train_samples_per_second": 3.053,
7
  "train_steps_per_second": 0.048
trainer_state.json CHANGED
@@ -25,1798 +25,982 @@
25
  "step": 1,
26
  "use_label": 10.0
27
  },
28
- {
29
- "epoch": 0.01,
30
- "grad_norm": 0.60546875,
31
- "learning_rate": 5.208333333333334e-07,
32
- "logits/chosen": -2.2113068103790283,
33
- "logits/rejected": -2.2719719409942627,
34
- "logps/chosen": -57.57659149169922,
35
- "logps/rejected": -65.19544219970703,
36
- "loss": 0.693,
37
- "pred_label": 0.0,
38
- "rewards/accuracies": 0.2152777761220932,
39
- "rewards/chosen": 0.001057142741046846,
40
- "rewards/margins": 3.17241829179693e-05,
41
- "rewards/rejected": 0.001025418401695788,
42
- "step": 10,
43
- "use_label": 90.0
44
- },
45
  {
46
  "epoch": 0.02,
47
  "grad_norm": 0.6796875,
48
  "learning_rate": 1.0416666666666667e-06,
49
- "logits/chosen": -2.243159770965576,
50
- "logits/rejected": -2.2802278995513916,
51
- "logps/chosen": -56.544715881347656,
52
- "logps/rejected": -68.35901641845703,
53
- "loss": 0.6924,
54
  "pred_label": 0.0,
55
- "rewards/accuracies": 0.22499999403953552,
56
- "rewards/chosen": 0.006556531880050898,
57
- "rewards/margins": 0.001379690133035183,
58
- "rewards/rejected": 0.005176841747015715,
59
  "step": 20,
60
- "use_label": 242.0
61
- },
62
- {
63
- "epoch": 0.03,
64
- "grad_norm": 0.55078125,
65
- "learning_rate": 1.5625e-06,
66
- "logits/chosen": -2.2634024620056152,
67
- "logits/rejected": -2.2475943565368652,
68
- "logps/chosen": -53.98667526245117,
69
- "logps/rejected": -67.89213562011719,
70
- "loss": 0.692,
71
- "pred_label": 0.0,
72
- "rewards/accuracies": 0.2750000059604645,
73
- "rewards/chosen": 0.01648966409265995,
74
- "rewards/margins": 0.002599921775981784,
75
- "rewards/rejected": 0.013889740221202374,
76
- "step": 30,
77
- "use_label": 402.0
78
  },
79
  {
80
  "epoch": 0.04,
81
  "grad_norm": 0.6328125,
82
  "learning_rate": 2.0833333333333334e-06,
83
- "logits/chosen": -2.2825467586517334,
84
- "logits/rejected": -2.2754693031311035,
85
- "logps/chosen": -55.582061767578125,
86
- "logps/rejected": -66.59407043457031,
87
- "loss": 0.6909,
88
  "pred_label": 0.0,
89
- "rewards/accuracies": 0.21250000596046448,
90
- "rewards/chosen": 0.018406417220830917,
91
- "rewards/margins": 0.0006764450808987021,
92
- "rewards/rejected": 0.017729971557855606,
93
  "step": 40,
94
- "use_label": 562.0
95
- },
96
- {
97
- "epoch": 0.05,
98
- "grad_norm": 0.6015625,
99
- "learning_rate": 2.604166666666667e-06,
100
- "logits/chosen": -2.3444912433624268,
101
- "logits/rejected": -2.3341281414031982,
102
- "logps/chosen": -69.13630676269531,
103
- "logps/rejected": -84.64376831054688,
104
- "loss": 0.6889,
105
- "pred_label": 0.0,
106
- "rewards/accuracies": 0.2874999940395355,
107
- "rewards/chosen": 0.02657836303114891,
108
- "rewards/margins": 0.005359734408557415,
109
- "rewards/rejected": 0.021218623965978622,
110
- "step": 50,
111
- "use_label": 722.0
112
  },
113
  {
114
  "epoch": 0.06,
115
- "grad_norm": 0.72265625,
116
  "learning_rate": 3.125e-06,
117
- "logits/chosen": -2.3026936054229736,
118
- "logits/rejected": -2.309264659881592,
119
- "logps/chosen": -82.00704193115234,
120
- "logps/rejected": -90.7305908203125,
121
- "loss": 0.6874,
122
  "pred_label": 0.0,
123
- "rewards/accuracies": 0.34375,
124
- "rewards/chosen": 0.03688042238354683,
125
- "rewards/margins": 0.014220851473510265,
126
- "rewards/rejected": 0.02265957184135914,
127
  "step": 60,
128
- "use_label": 882.0
129
- },
130
- {
131
- "epoch": 0.07,
132
- "grad_norm": 0.79296875,
133
- "learning_rate": 3.6458333333333333e-06,
134
- "logits/chosen": -2.344853401184082,
135
- "logits/rejected": -2.3261306285858154,
136
- "logps/chosen": -77.20336151123047,
137
- "logps/rejected": -77.6347885131836,
138
- "loss": 0.6851,
139
- "pred_label": 0.0,
140
- "rewards/accuracies": 0.30000001192092896,
141
- "rewards/chosen": 0.02531364932656288,
142
- "rewards/margins": 0.01608472317457199,
143
- "rewards/rejected": 0.009228924289345741,
144
- "step": 70,
145
- "use_label": 1042.0
146
  },
147
  {
148
  "epoch": 0.08,
149
- "grad_norm": 0.80078125,
150
  "learning_rate": 4.166666666666667e-06,
151
- "logits/chosen": -2.241945743560791,
152
- "logits/rejected": -2.195178985595703,
153
- "logps/chosen": -81.6376953125,
154
- "logps/rejected": -89.05104064941406,
155
- "loss": 0.6814,
156
- "pred_label": 0.9750000238418579,
157
- "rewards/accuracies": 0.32499998807907104,
158
- "rewards/chosen": 0.004142354242503643,
159
- "rewards/margins": 0.025017932057380676,
160
- "rewards/rejected": -0.02087557688355446,
161
  "step": 80,
162
- "use_label": 1201.0250244140625
163
- },
164
- {
165
- "epoch": 0.09,
166
- "grad_norm": 1.578125,
167
- "learning_rate": 4.6875000000000004e-06,
168
- "logits/chosen": -2.1907405853271484,
169
- "logits/rejected": -2.232959270477295,
170
- "logps/chosen": -62.31688690185547,
171
- "logps/rejected": -80.38573455810547,
172
- "loss": 0.6812,
173
- "pred_label": 3.0999999046325684,
174
- "rewards/accuracies": 0.33125001192092896,
175
- "rewards/chosen": -0.012271342799067497,
176
- "rewards/margins": 0.04507603123784065,
177
- "rewards/rejected": -0.0573473684489727,
178
- "step": 90,
179
- "use_label": 1358.9000244140625
180
  },
181
  {
182
  "epoch": 0.1,
183
- "grad_norm": 0.796875,
184
  "learning_rate": 4.9997324926814375e-06,
185
- "logits/chosen": -2.132638454437256,
186
- "logits/rejected": -2.0995519161224365,
187
- "logps/chosen": -76.97563171386719,
188
- "logps/rejected": -79.27615356445312,
189
- "loss": 0.6818,
190
- "pred_label": 7.150000095367432,
191
- "rewards/accuracies": 0.3125,
192
- "rewards/chosen": -0.02400936186313629,
193
- "rewards/margins": 0.05036945268511772,
194
- "rewards/rejected": -0.07437881827354431,
195
  "step": 100,
196
- "use_label": 1514.8499755859375
197
  },
198
  {
199
  "epoch": 0.1,
200
- "eval_logits/chosen": -2.097480297088623,
201
- "eval_logits/rejected": -2.0663790702819824,
202
- "eval_logps/chosen": -69.46318054199219,
203
- "eval_logps/rejected": -80.35824584960938,
204
- "eval_loss": 0.6813791394233704,
205
- "eval_pred_label": 22.539682388305664,
206
- "eval_rewards/accuracies": 0.3392857015132904,
207
- "eval_rewards/chosen": -0.005626226309686899,
208
- "eval_rewards/margins": 0.04397555813193321,
209
- "eval_rewards/rejected": -0.04960178583860397,
210
- "eval_runtime": 245.3242,
211
- "eval_samples_per_second": 8.152,
212
- "eval_steps_per_second": 0.257,
213
- "eval_use_label": 1833.4603271484375,
214
  "step": 100
215
  },
216
- {
217
- "epoch": 0.12,
218
- "grad_norm": 1.1171875,
219
- "learning_rate": 4.996723692767927e-06,
220
- "logits/chosen": -2.114673137664795,
221
- "logits/rejected": -2.094468355178833,
222
- "logps/chosen": -63.9236946105957,
223
- "logps/rejected": -79.44518280029297,
224
- "loss": 0.6827,
225
- "pred_label": 34.0,
226
- "rewards/accuracies": 0.30000001192092896,
227
- "rewards/chosen": -0.02154584601521492,
228
- "rewards/margins": 0.04528125748038292,
229
- "rewards/rejected": -0.06682710349559784,
230
- "step": 110,
231
- "use_label": 2152.0
232
- },
233
  {
234
  "epoch": 0.13,
235
- "grad_norm": 1.0390625,
236
  "learning_rate": 4.9903757462135984e-06,
237
- "logits/chosen": -2.2926628589630127,
238
- "logits/rejected": -2.177788257598877,
239
- "logps/chosen": -83.48246002197266,
240
- "logps/rejected": -97.60291290283203,
241
- "loss": 0.683,
242
- "pred_label": 44.67499923706055,
243
- "rewards/accuracies": 0.3062500059604645,
244
- "rewards/chosen": -0.0941522866487503,
245
- "rewards/margins": 0.06425690650939941,
246
- "rewards/rejected": -0.15840919315814972,
247
  "step": 120,
248
- "use_label": 2301.324951171875
249
- },
250
- {
251
- "epoch": 0.14,
252
- "grad_norm": 0.546875,
253
- "learning_rate": 4.980697142834315e-06,
254
- "logits/chosen": -2.0968613624572754,
255
- "logits/rejected": -2.1124091148376465,
256
- "logps/chosen": -66.370849609375,
257
- "logps/rejected": -77.3319320678711,
258
- "loss": 0.6845,
259
- "pred_label": 57.57500076293945,
260
- "rewards/accuracies": 0.2750000059604645,
261
- "rewards/chosen": -0.07896758615970612,
262
- "rewards/margins": 0.04609644412994385,
263
- "rewards/rejected": -0.12506404519081116,
264
- "step": 130,
265
- "use_label": 2448.425048828125
266
  },
267
  {
268
  "epoch": 0.15,
269
- "grad_norm": 0.78515625,
270
  "learning_rate": 4.967700826904229e-06,
271
- "logits/chosen": -2.1041221618652344,
272
- "logits/rejected": -2.138929843902588,
273
- "logps/chosen": -68.11909484863281,
274
- "logps/rejected": -90.16340637207031,
275
- "loss": 0.6868,
276
- "pred_label": 73.75,
277
- "rewards/accuracies": 0.30000001192092896,
278
- "rewards/chosen": -0.08846104890108109,
279
- "rewards/margins": 0.0647779330611229,
280
- "rewards/rejected": -0.15323898196220398,
281
  "step": 140,
282
- "use_label": 2592.25
283
- },
284
- {
285
- "epoch": 0.16,
286
- "grad_norm": 1.1015625,
287
- "learning_rate": 4.951404179843963e-06,
288
- "logits/chosen": -2.1765952110290527,
289
- "logits/rejected": -2.125175714492798,
290
- "logps/chosen": -54.37804412841797,
291
- "logps/rejected": -58.982269287109375,
292
- "loss": 0.6809,
293
- "pred_label": 91.3499984741211,
294
- "rewards/accuracies": 0.30000001192092896,
295
- "rewards/chosen": -0.06883221119642258,
296
- "rewards/margins": 0.06803621351718903,
297
- "rewards/rejected": -0.136868417263031,
298
- "step": 150,
299
- "use_label": 2734.64990234375
300
  },
301
  {
302
  "epoch": 0.17,
303
- "grad_norm": 1.03125,
304
  "learning_rate": 4.931828996974498e-06,
305
- "logits/chosen": -2.2455694675445557,
306
- "logits/rejected": -2.213240623474121,
307
- "logps/chosen": -94.4081802368164,
308
- "logps/rejected": -107.48802185058594,
309
- "loss": 0.6857,
310
- "pred_label": 115.55000305175781,
311
- "rewards/accuracies": 0.35624998807907104,
312
- "rewards/chosen": -0.12804970145225525,
313
- "rewards/margins": 0.12874242663383484,
314
- "rewards/rejected": -0.2567921280860901,
315
  "step": 160,
316
- "use_label": 2870.449951171875
317
- },
318
- {
319
- "epoch": 0.18,
320
- "grad_norm": 1.1875,
321
- "learning_rate": 4.909001458367867e-06,
322
- "logits/chosen": -2.1201233863830566,
323
- "logits/rejected": -2.0822367668151855,
324
- "logps/chosen": -75.75311279296875,
325
- "logps/rejected": -87.55944061279297,
326
- "loss": 0.6869,
327
- "pred_label": 141.85000610351562,
328
- "rewards/accuracies": 0.33125001192092896,
329
- "rewards/chosen": -0.1179669052362442,
330
- "rewards/margins": 0.09383226186037064,
331
- "rewards/rejected": -0.21179917454719543,
332
- "step": 170,
333
- "use_label": 3004.14990234375
334
  },
335
  {
336
  "epoch": 0.19,
337
- "grad_norm": 1.4296875,
338
  "learning_rate": 4.882952093833628e-06,
339
- "logits/chosen": -2.1013779640197754,
340
- "logits/rejected": -2.121537685394287,
341
- "logps/chosen": -70.6474838256836,
342
- "logps/rejected": -89.79743957519531,
343
- "loss": 0.685,
344
- "pred_label": 161.3249969482422,
345
- "rewards/accuracies": 0.33125001192092896,
346
- "rewards/chosen": -0.08145526796579361,
347
- "rewards/margins": 0.08172430098056793,
348
- "rewards/rejected": -0.16317956149578094,
349
  "step": 180,
350
- "use_label": 3144.675048828125
351
- },
352
- {
353
- "epoch": 0.2,
354
- "grad_norm": 0.8515625,
355
- "learning_rate": 4.853715742087947e-06,
356
- "logits/chosen": -2.1533255577087402,
357
- "logits/rejected": -2.104222297668457,
358
- "logps/chosen": -87.3572998046875,
359
- "logps/rejected": -91.95249938964844,
360
- "loss": 0.6862,
361
- "pred_label": 181.39999389648438,
362
- "rewards/accuracies": 0.375,
363
- "rewards/chosen": -0.13474301993846893,
364
- "rewards/margins": 0.08988693356513977,
365
- "rewards/rejected": -0.2246299535036087,
366
- "step": 190,
367
- "use_label": 3284.60009765625
368
  },
369
  {
370
  "epoch": 0.21,
371
- "grad_norm": 0.96875,
372
  "learning_rate": 4.821331504159906e-06,
373
- "logits/chosen": -2.137516736984253,
374
- "logits/rejected": -2.13090443611145,
375
- "logps/chosen": -94.10081481933594,
376
- "logps/rejected": -95.15316009521484,
377
- "loss": 0.6818,
378
- "pred_label": 205.875,
379
- "rewards/accuracies": 0.38749998807907104,
380
- "rewards/chosen": -0.14046669006347656,
381
- "rewards/margins": 0.07937734574079514,
382
- "rewards/rejected": -0.2198440283536911,
383
  "step": 200,
384
- "use_label": 3420.125
385
  },
386
  {
387
  "epoch": 0.21,
388
- "eval_logits/chosen": -2.021465301513672,
389
- "eval_logits/rejected": -1.9937611818313599,
390
- "eval_logps/chosen": -82.4782485961914,
391
- "eval_logps/rejected": -99.20675659179688,
392
- "eval_loss": 0.6860649585723877,
393
- "eval_pred_label": 258.79364013671875,
394
- "eval_rewards/accuracies": 0.3373015820980072,
395
- "eval_rewards/chosen": -0.13577698171138763,
396
- "eval_rewards/margins": 0.10230996459722519,
397
- "eval_rewards/rejected": -0.23808695375919342,
398
- "eval_runtime": 245.9338,
399
- "eval_samples_per_second": 8.132,
400
- "eval_steps_per_second": 0.256,
401
- "eval_use_label": 3701.206298828125,
402
  "step": 200
403
  },
404
- {
405
- "epoch": 0.22,
406
- "grad_norm": 1.1484375,
407
- "learning_rate": 4.7858426910973435e-06,
408
- "logits/chosen": -2.1574149131774902,
409
- "logits/rejected": -2.1307334899902344,
410
- "logps/chosen": -77.64894104003906,
411
- "logps/rejected": -89.26710510253906,
412
- "loss": 0.6828,
413
- "pred_label": 313.32501220703125,
414
- "rewards/accuracies": 0.3687500059604645,
415
- "rewards/chosen": -0.09638272225856781,
416
- "rewards/margins": 0.12071452289819717,
417
- "rewards/rejected": -0.2170972377061844,
418
- "step": 210,
419
- "use_label": 3976.675048828125
420
- },
421
  {
422
  "epoch": 0.23,
423
- "grad_norm": 1.40625,
424
  "learning_rate": 4.747296766042161e-06,
425
- "logits/chosen": -2.1187565326690674,
426
- "logits/rejected": -2.102626323699951,
427
- "logps/chosen": -90.67762756347656,
428
- "logps/rejected": -96.60699462890625,
429
- "loss": 0.6884,
430
- "pred_label": 343.875,
431
- "rewards/accuracies": 0.35624998807907104,
432
- "rewards/chosen": -0.1462414264678955,
433
- "rewards/margins": 0.12368818372488022,
434
- "rewards/rejected": -0.2699296176433563,
435
  "step": 220,
436
- "use_label": 4106.125
437
- },
438
- {
439
- "epoch": 0.24,
440
- "grad_norm": 1.1484375,
441
- "learning_rate": 4.705745280752586e-06,
442
- "logits/chosen": -2.1437509059906006,
443
- "logits/rejected": -2.084073781967163,
444
- "logps/chosen": -90.86326599121094,
445
- "logps/rejected": -96.72235870361328,
446
- "loss": 0.6875,
447
- "pred_label": 378.6000061035156,
448
- "rewards/accuracies": 0.35624998807907104,
449
- "rewards/chosen": -0.12124122679233551,
450
- "rewards/margins": 0.11637073755264282,
451
- "rewards/rejected": -0.23761197924613953,
452
- "step": 230,
453
- "use_label": 4231.39990234375
454
  },
455
  {
456
  "epoch": 0.25,
457
- "grad_norm": 0.953125,
458
  "learning_rate": 4.661243806657256e-06,
459
- "logits/chosen": -2.1431565284729004,
460
- "logits/rejected": -2.1365227699279785,
461
- "logps/chosen": -71.16796875,
462
- "logps/rejected": -91.01861572265625,
463
- "loss": 0.6846,
464
- "pred_label": 403.125,
465
- "rewards/accuracies": 0.32499998807907104,
466
- "rewards/chosen": -0.07454425096511841,
467
- "rewards/margins": 0.09627760201692581,
468
- "rewards/rejected": -0.17082183063030243,
469
  "step": 240,
470
- "use_label": 4366.875
471
- },
472
- {
473
- "epoch": 0.26,
474
- "grad_norm": 0.890625,
475
- "learning_rate": 4.613851860533367e-06,
476
- "logits/chosen": -2.1595332622528076,
477
- "logits/rejected": -2.183953285217285,
478
- "logps/chosen": -71.86934661865234,
479
- "logps/rejected": -80.0597152709961,
480
- "loss": 0.6844,
481
- "pred_label": 422.25,
482
- "rewards/accuracies": 0.3125,
483
- "rewards/chosen": -0.06741674989461899,
484
- "rewards/margins": 0.08548234403133392,
485
- "rewards/rejected": -0.1528991013765335,
486
- "step": 250,
487
- "use_label": 4507.75
488
  },
489
  {
490
  "epoch": 0.27,
491
- "grad_norm": 1.0390625,
492
  "learning_rate": 4.563632824908252e-06,
493
- "logits/chosen": -2.1189560890197754,
494
- "logits/rejected": -2.071620464324951,
495
- "logps/chosen": -77.1129150390625,
496
- "logps/rejected": -101.45845031738281,
497
- "loss": 0.6837,
498
- "pred_label": 445.79998779296875,
499
- "rewards/accuracies": 0.3125,
500
- "rewards/chosen": -0.16171860694885254,
501
- "rewards/margins": 0.11343212425708771,
502
- "rewards/rejected": -0.27515071630477905,
503
  "step": 260,
504
- "use_label": 4644.2001953125
505
- },
506
- {
507
- "epoch": 0.28,
508
- "grad_norm": 1.0703125,
509
- "learning_rate": 4.510653863290871e-06,
510
- "logits/chosen": -2.1512458324432373,
511
- "logits/rejected": -2.164412021636963,
512
- "logps/chosen": -91.74055480957031,
513
- "logps/rejected": -95.13731384277344,
514
- "loss": 0.6883,
515
- "pred_label": 470.04998779296875,
516
- "rewards/accuracies": 0.3125,
517
- "rewards/chosen": -0.16311386227607727,
518
- "rewards/margins": 0.0933571308851242,
519
- "rewards/rejected": -0.2564709782600403,
520
- "step": 270,
521
- "use_label": 4779.9501953125
522
  },
523
  {
524
  "epoch": 0.29,
525
- "grad_norm": 0.8828125,
526
  "learning_rate": 4.454985830346574e-06,
527
- "logits/chosen": -2.0734293460845947,
528
- "logits/rejected": -2.1033730506896973,
529
- "logps/chosen": -76.7903823852539,
530
- "logps/rejected": -86.99803161621094,
531
- "loss": 0.6858,
532
- "pred_label": 494.9750061035156,
533
- "rewards/accuracies": 0.29374998807907104,
534
- "rewards/chosen": -0.15558014810085297,
535
- "rewards/margins": 0.050300367176532745,
536
- "rewards/rejected": -0.2058805227279663,
537
  "step": 280,
538
- "use_label": 4915.02490234375
539
- },
540
- {
541
- "epoch": 0.3,
542
- "grad_norm": 1.3125,
543
- "learning_rate": 4.396703177135262e-06,
544
- "logits/chosen": -1.9870249032974243,
545
- "logits/rejected": -1.956434965133667,
546
- "logps/chosen": -89.98160552978516,
547
- "logps/rejected": -99.75212097167969,
548
- "loss": 0.6905,
549
- "pred_label": 527.0499877929688,
550
- "rewards/accuracies": 0.36250001192092896,
551
- "rewards/chosen": -0.13706301152706146,
552
- "rewards/margins": 0.16557420790195465,
553
- "rewards/rejected": -0.3026372492313385,
554
- "step": 290,
555
- "use_label": 5042.9501953125
556
  },
557
  {
558
  "epoch": 0.31,
559
- "grad_norm": 1.6015625,
560
  "learning_rate": 4.335883851539693e-06,
561
- "logits/chosen": -1.9497883319854736,
562
- "logits/rejected": -1.964604377746582,
563
- "logps/chosen": -68.64933013916016,
564
- "logps/rejected": -91.48945617675781,
565
- "loss": 0.6848,
566
- "pred_label": 561.8499755859375,
567
- "rewards/accuracies": 0.33125001192092896,
568
- "rewards/chosen": -0.14721202850341797,
569
- "rewards/margins": 0.14547064900398254,
570
- "rewards/rejected": -0.2926826477050781,
571
  "step": 300,
572
- "use_label": 5168.14990234375
573
  },
574
  {
575
  "epoch": 0.31,
576
- "eval_logits/chosen": -1.9156862497329712,
577
- "eval_logits/rejected": -1.8827954530715942,
578
- "eval_logps/chosen": -89.57630920410156,
579
- "eval_logps/rejected": -109.2765884399414,
580
- "eval_loss": 0.6877307295799255,
581
- "eval_pred_label": 626.1270141601562,
582
- "eval_rewards/accuracies": 0.341269850730896,
583
- "eval_rewards/chosen": -0.20675767958164215,
584
- "eval_rewards/margins": 0.13202756643295288,
585
- "eval_rewards/rejected": -0.33878523111343384,
586
- "eval_runtime": 246.2269,
587
- "eval_samples_per_second": 8.123,
588
- "eval_steps_per_second": 0.256,
589
- "eval_use_label": 5437.873046875,
590
  "step": 300
591
  },
592
- {
593
- "epoch": 0.32,
594
- "grad_norm": 1.5,
595
- "learning_rate": 4.2726091940171055e-06,
596
- "logits/chosen": -2.043640613555908,
597
- "logits/rejected": -2.01674222946167,
598
- "logps/chosen": -72.24534606933594,
599
- "logps/rejected": -89.407470703125,
600
- "loss": 0.6865,
601
- "pred_label": 688.9500122070312,
602
- "rewards/accuracies": 0.29374998807907104,
603
- "rewards/chosen": -0.23255303502082825,
604
- "rewards/margins": 0.06651856750249863,
605
- "rewards/rejected": -0.29907160997390747,
606
- "step": 310,
607
- "use_label": 5705.0498046875
608
- },
609
  {
610
  "epoch": 0.33,
611
- "grad_norm": 1.1796875,
612
  "learning_rate": 4.206963828813555e-06,
613
- "logits/chosen": -1.9597671031951904,
614
- "logits/rejected": -1.9893718957901,
615
- "logps/chosen": -94.37977600097656,
616
- "logps/rejected": -118.25643157958984,
617
- "loss": 0.6871,
618
- "pred_label": 724.375,
619
- "rewards/accuracies": 0.36250001192092896,
620
- "rewards/chosen": -0.20438706874847412,
621
- "rewards/margins": 0.13566336035728455,
622
- "rewards/rejected": -0.34005045890808105,
623
  "step": 320,
624
- "use_label": 5829.625
625
- },
626
- {
627
- "epoch": 0.35,
628
- "grad_norm": 0.95703125,
629
- "learning_rate": 4.139035550786495e-06,
630
- "logits/chosen": -1.989506483078003,
631
- "logits/rejected": -1.9580066204071045,
632
- "logps/chosen": -73.50363159179688,
633
- "logps/rejected": -87.75289154052734,
634
- "loss": 0.683,
635
- "pred_label": 754.4500122070312,
636
- "rewards/accuracies": 0.36250001192092896,
637
- "rewards/chosen": -0.1003209576010704,
638
- "rewards/margins": 0.13466720283031464,
639
- "rewards/rejected": -0.23498816788196564,
640
- "step": 330,
641
- "use_label": 5959.5498046875
642
  },
643
  {
644
  "epoch": 0.36,
645
- "grad_norm": 1.0234375,
646
  "learning_rate": 4.068915207986931e-06,
647
- "logits/chosen": -2.0428695678710938,
648
- "logits/rejected": -2.016120195388794,
649
- "logps/chosen": -74.91081237792969,
650
- "logps/rejected": -93.89201354980469,
651
- "loss": 0.6894,
652
- "pred_label": 786.4749755859375,
653
  "rewards/accuracies": 0.3375000059604645,
654
- "rewards/chosen": -0.11903776973485947,
655
- "rewards/margins": 0.11223740875720978,
656
- "rewards/rejected": -0.23127520084381104,
657
  "step": 340,
658
- "use_label": 6087.52490234375
659
- },
660
- {
661
- "epoch": 0.37,
662
- "grad_norm": 0.984375,
663
- "learning_rate": 3.996696580158211e-06,
664
- "logits/chosen": -2.0441341400146484,
665
- "logits/rejected": -2.0229620933532715,
666
- "logps/chosen": -73.9575424194336,
667
- "logps/rejected": -86.34129333496094,
668
- "loss": 0.6869,
669
- "pred_label": 817.5250244140625,
670
- "rewards/accuracies": 0.3125,
671
- "rewards/chosen": -0.133123978972435,
672
- "rewards/margins": 0.08419892936944962,
673
- "rewards/rejected": -0.2173229157924652,
674
- "step": 350,
675
- "use_label": 6216.47509765625
676
  },
677
  {
678
  "epoch": 0.38,
679
- "grad_norm": 1.546875,
680
  "learning_rate": 3.922476253313921e-06,
681
- "logits/chosen": -2.0575146675109863,
682
- "logits/rejected": -2.054591417312622,
683
- "logps/chosen": -82.88232421875,
684
- "logps/rejected": -90.05668640136719,
685
- "loss": 0.6863,
686
- "pred_label": 848.6500244140625,
687
- "rewards/accuracies": 0.3375000059604645,
688
- "rewards/chosen": -0.13817565143108368,
689
- "rewards/margins": 0.11208128929138184,
690
- "rewards/rejected": -0.2502569556236267,
691
  "step": 360,
692
- "use_label": 6345.35009765625
693
- },
694
- {
695
- "epoch": 0.39,
696
- "grad_norm": 0.75,
697
- "learning_rate": 3.846353490562664e-06,
698
- "logits/chosen": -2.076312780380249,
699
- "logits/rejected": -1.9995708465576172,
700
- "logps/chosen": -85.83981323242188,
701
- "logps/rejected": -95.1656723022461,
702
- "loss": 0.6844,
703
- "pred_label": 880.4249877929688,
704
- "rewards/accuracies": 0.3687500059604645,
705
- "rewards/chosen": -0.11745607852935791,
706
- "rewards/margins": 0.14055705070495605,
707
- "rewards/rejected": -0.2580130994319916,
708
- "step": 370,
709
- "use_label": 6473.5751953125
710
  },
711
  {
712
  "epoch": 0.4,
713
- "grad_norm": 0.96484375,
714
  "learning_rate": 3.768430099352445e-06,
715
- "logits/chosen": -2.0079166889190674,
716
- "logits/rejected": -1.986297845840454,
717
- "logps/chosen": -76.30638122558594,
718
- "logps/rejected": -93.93800354003906,
719
- "loss": 0.6924,
720
- "pred_label": 912.5999755859375,
721
- "rewards/accuracies": 0.3062500059604645,
722
- "rewards/chosen": -0.1675274670124054,
723
- "rewards/margins": 0.08305275440216064,
724
- "rewards/rejected": -0.25058022141456604,
725
  "step": 380,
726
- "use_label": 6601.39990234375
727
- },
728
- {
729
- "epoch": 0.41,
730
- "grad_norm": 0.97265625,
731
- "learning_rate": 3.6888102953122307e-06,
732
- "logits/chosen": -1.9291635751724243,
733
- "logits/rejected": -1.914608359336853,
734
- "logps/chosen": -101.44157409667969,
735
- "logps/rejected": -96.10136413574219,
736
- "loss": 0.6878,
737
- "pred_label": 952.8250122070312,
738
- "rewards/accuracies": 0.3187499940395355,
739
- "rewards/chosen": -0.1657881736755371,
740
- "rewards/margins": 0.12364902347326279,
741
- "rewards/rejected": -0.2894372344017029,
742
- "step": 390,
743
- "use_label": 6721.1748046875
744
  },
745
  {
746
  "epoch": 0.42,
747
- "grad_norm": 1.296875,
748
  "learning_rate": 3.607600562872785e-06,
749
- "logits/chosen": -1.8988447189331055,
750
- "logits/rejected": -1.8926557302474976,
751
- "logps/chosen": -87.97608947753906,
752
- "logps/rejected": -108.15446472167969,
753
- "loss": 0.6857,
754
- "pred_label": 987.5999755859375,
755
- "rewards/accuracies": 0.34375,
756
- "rewards/chosen": -0.16945099830627441,
757
- "rewards/margins": 0.11657001823186874,
758
- "rewards/rejected": -0.28602102398872375,
759
  "step": 400,
760
- "use_label": 6846.39990234375
761
  },
762
  {
763
  "epoch": 0.42,
764
- "eval_logits/chosen": -1.4529144763946533,
765
- "eval_logits/rejected": -1.4031411409378052,
766
- "eval_logps/chosen": -86.92367553710938,
767
- "eval_logps/rejected": -108.39134979248047,
768
- "eval_loss": 0.6884719133377075,
769
- "eval_pred_label": 1055.5555419921875,
770
- "eval_rewards/accuracies": 0.3531745970249176,
771
- "eval_rewards/chosen": -0.18023118376731873,
772
- "eval_rewards/margins": 0.14970164000988007,
773
- "eval_rewards/rejected": -0.32993283867836,
774
- "eval_runtime": 246.35,
775
- "eval_samples_per_second": 8.119,
776
- "eval_steps_per_second": 0.256,
777
- "eval_use_label": 7112.4443359375,
778
  "step": 400
779
  },
780
- {
781
- "epoch": 0.43,
782
- "grad_norm": 1.28125,
783
- "learning_rate": 3.5249095128531863e-06,
784
- "logits/chosen": -1.289879560470581,
785
- "logits/rejected": -1.4085474014282227,
786
- "logps/chosen": -85.75054168701172,
787
- "logps/rejected": -96.24283599853516,
788
- "loss": 0.6874,
789
- "pred_label": 1135.699951171875,
790
- "rewards/accuracies": 0.38749998807907104,
791
- "rewards/chosen": -0.21242520213127136,
792
- "rewards/margins": 0.17107079923152924,
793
- "rewards/rejected": -0.3834960162639618,
794
- "step": 410,
795
- "use_label": 7362.2998046875
796
- },
797
  {
798
  "epoch": 0.44,
799
- "grad_norm": 0.97265625,
800
  "learning_rate": 3.4408477372034743e-06,
801
- "logits/chosen": -1.2336995601654053,
802
- "logits/rejected": -1.1623611450195312,
803
- "logps/chosen": -97.20266723632812,
804
- "logps/rejected": -117.6893081665039,
805
- "loss": 0.6882,
806
- "pred_label": 1171.425048828125,
807
- "rewards/accuracies": 0.3375000059604645,
808
- "rewards/chosen": -0.3355943560600281,
809
- "rewards/margins": 0.16045086085796356,
810
- "rewards/rejected": -0.49604520201683044,
811
  "step": 420,
812
- "use_label": 7486.5751953125
813
- },
814
- {
815
- "epoch": 0.45,
816
- "grad_norm": 1.1484375,
817
- "learning_rate": 3.355527661097728e-06,
818
- "logits/chosen": -1.3129976987838745,
819
- "logits/rejected": -1.2275488376617432,
820
- "logps/chosen": -106.88911437988281,
821
- "logps/rejected": -112.3751449584961,
822
- "loss": 0.6918,
823
- "pred_label": 1207.9749755859375,
824
- "rewards/accuracies": 0.3125,
825
- "rewards/chosen": -0.3042059540748596,
826
- "rewards/margins": 0.13597823679447174,
827
- "rewards/rejected": -0.44018417596817017,
828
- "step": 430,
829
- "use_label": 7610.02490234375
830
  },
831
  {
832
  "epoch": 0.46,
833
- "grad_norm": 1.5625,
834
  "learning_rate": 3.269063392575352e-06,
835
- "logits/chosen": -1.3159044981002808,
836
- "logits/rejected": -1.413769006729126,
837
- "logps/chosen": -90.12797546386719,
838
- "logps/rejected": -101.85379028320312,
839
- "loss": 0.6858,
840
- "pred_label": 1242.5,
841
  "rewards/accuracies": 0.33125001192092896,
842
- "rewards/chosen": -0.22682049870491028,
843
- "rewards/margins": 0.159098818898201,
844
- "rewards/rejected": -0.3859192728996277,
845
  "step": 440,
846
- "use_label": 7735.5
847
- },
848
- {
849
- "epoch": 0.47,
850
- "grad_norm": 1.375,
851
- "learning_rate": 3.181570569931697e-06,
852
- "logits/chosen": -1.4389588832855225,
853
- "logits/rejected": -1.5265202522277832,
854
- "logps/chosen": -96.37947845458984,
855
- "logps/rejected": -113.1718521118164,
856
- "loss": 0.6951,
857
- "pred_label": 1281.3499755859375,
858
- "rewards/accuracies": 0.3375000059604645,
859
- "rewards/chosen": -0.2355901300907135,
860
- "rewards/margins": 0.13590970635414124,
861
- "rewards/rejected": -0.37149983644485474,
862
- "step": 450,
863
- "use_label": 7856.64990234375
864
  },
865
  {
866
  "epoch": 0.48,
867
- "grad_norm": 1.015625,
868
  "learning_rate": 3.09316620706208e-06,
869
- "logits/chosen": -1.2455997467041016,
870
- "logits/rejected": -1.1902601718902588,
871
- "logps/chosen": -72.07853698730469,
872
- "logps/rejected": -84.86478424072266,
873
- "loss": 0.6842,
874
- "pred_label": 1311.824951171875,
875
- "rewards/accuracies": 0.29374998807907104,
876
- "rewards/chosen": -0.1508016437292099,
877
- "rewards/margins": 0.1797787994146347,
878
- "rewards/rejected": -0.330580472946167,
879
  "step": 460,
880
- "use_label": 7986.1748046875
881
- },
882
- {
883
- "epoch": 0.49,
884
- "grad_norm": 1.1015625,
885
- "learning_rate": 3.0039685369660785e-06,
886
- "logits/chosen": -1.175449252128601,
887
- "logits/rejected": -1.0759943723678589,
888
- "logps/chosen": -88.91249084472656,
889
- "logps/rejected": -110.02799987792969,
890
- "loss": 0.6873,
891
- "pred_label": 1345.1500244140625,
892
- "rewards/accuracies": 0.3687500059604645,
893
- "rewards/chosen": -0.22000393271446228,
894
- "rewards/margins": 0.1964809000492096,
895
- "rewards/rejected": -0.4164848327636719,
896
- "step": 470,
897
- "use_label": 8112.85009765625
898
  },
899
  {
900
  "epoch": 0.5,
901
- "grad_norm": 1.0859375,
902
  "learning_rate": 2.91409685362137e-06,
903
- "logits/chosen": -1.0014227628707886,
904
- "logits/rejected": -1.0880533456802368,
905
- "logps/chosen": -99.41879272460938,
906
- "logps/rejected": -120.02769470214844,
907
- "loss": 0.6868,
908
- "pred_label": 1391.25,
909
- "rewards/accuracies": 0.35624998807907104,
910
- "rewards/chosen": -0.24276605248451233,
911
- "rewards/margins": 0.17868337035179138,
912
- "rewards/rejected": -0.4214494228363037,
913
  "step": 480,
914
- "use_label": 8226.75
915
- },
916
- {
917
- "epoch": 0.51,
918
- "grad_norm": 1.4375,
919
- "learning_rate": 2.8236713524386085e-06,
920
- "logits/chosen": -1.0729541778564453,
921
- "logits/rejected": -0.9298813939094543,
922
- "logps/chosen": -88.73147583007812,
923
- "logps/rejected": -94.53245544433594,
924
- "loss": 0.6921,
925
- "pred_label": 1428.9000244140625,
926
- "rewards/accuracies": 0.26875001192092896,
927
- "rewards/chosen": -0.22107498347759247,
928
- "rewards/margins": 0.12524999678134918,
929
- "rewards/rejected": -0.34632498025894165,
930
- "step": 490,
931
- "use_label": 8349.099609375
932
  },
933
  {
934
  "epoch": 0.52,
935
- "grad_norm": 1.421875,
936
  "learning_rate": 2.7328129695107205e-06,
937
- "logits/chosen": -0.8902079463005066,
938
- "logits/rejected": -1.065393090248108,
939
- "logps/chosen": -113.58573150634766,
940
- "logps/rejected": -131.9083709716797,
941
- "loss": 0.6894,
942
- "pred_label": 1462.4000244140625,
943
- "rewards/accuracies": 0.41874998807907104,
944
- "rewards/chosen": -0.37447452545166016,
945
- "rewards/margins": 0.17800332605838776,
946
- "rewards/rejected": -0.5524778962135315,
947
  "step": 500,
948
- "use_label": 8475.599609375
949
  },
950
  {
951
  "epoch": 0.52,
952
- "eval_logits/chosen": -0.6888664960861206,
953
- "eval_logits/rejected": -0.5997034311294556,
954
- "eval_logps/chosen": -97.52025604248047,
955
- "eval_logps/rejected": -120.9921646118164,
956
- "eval_loss": 0.6891720294952393,
957
- "eval_pred_label": 1530.5714111328125,
958
- "eval_rewards/accuracies": 0.3551587164402008,
959
- "eval_rewards/chosen": -0.28619715571403503,
960
- "eval_rewards/margins": 0.1697438359260559,
961
- "eval_rewards/rejected": -0.45594096183776855,
962
- "eval_runtime": 246.2759,
963
- "eval_samples_per_second": 8.121,
964
- "eval_steps_per_second": 0.256,
965
- "eval_use_label": 8741.4287109375,
966
  "step": 500
967
  },
968
- {
969
- "epoch": 0.53,
970
- "grad_norm": 1.0078125,
971
- "learning_rate": 2.641643219871597e-06,
972
- "logits/chosen": -0.7708507776260376,
973
- "logits/rejected": -0.882653534412384,
974
- "logps/chosen": -90.50456237792969,
975
- "logps/rejected": -116.84162902832031,
976
- "loss": 0.686,
977
- "pred_label": 1610.5999755859375,
978
- "rewards/accuracies": 0.36250001192092896,
979
- "rewards/chosen": -0.2625977396965027,
980
- "rewards/margins": 0.20036396384239197,
981
- "rewards/rejected": -0.4629616141319275,
982
- "step": 510,
983
- "use_label": 8991.400390625
984
- },
985
  {
986
  "epoch": 0.54,
987
- "grad_norm": 1.4765625,
988
  "learning_rate": 2.5502840349805074e-06,
989
- "logits/chosen": -0.8800374865531921,
990
- "logits/rejected": -1.038163185119629,
991
- "logps/chosen": -100.99266052246094,
992
- "logps/rejected": -116.75798034667969,
993
- "loss": 0.6895,
994
- "pred_label": 1653.0,
995
- "rewards/accuracies": 0.36250001192092896,
996
- "rewards/chosen": -0.2859944701194763,
997
- "rewards/margins": 0.15662841498851776,
998
- "rewards/rejected": -0.4426229000091553,
999
  "step": 520,
1000
- "use_label": 9109.0
1001
- },
1002
- {
1003
- "epoch": 0.55,
1004
- "grad_norm": 1.3671875,
1005
- "learning_rate": 2.4588575996495797e-06,
1006
- "logits/chosen": -0.8304817080497742,
1007
- "logits/rejected": -0.7847825288772583,
1008
- "logps/chosen": -105.92545318603516,
1009
- "logps/rejected": -117.15931701660156,
1010
- "loss": 0.6895,
1011
- "pred_label": 1692.175048828125,
1012
- "rewards/accuracies": 0.36250001192092896,
1013
- "rewards/chosen": -0.316447913646698,
1014
- "rewards/margins": 0.17969803512096405,
1015
- "rewards/rejected": -0.49614596366882324,
1016
- "step": 530,
1017
- "use_label": 9229.8251953125
1018
  },
1019
  {
1020
  "epoch": 0.57,
1021
- "grad_norm": 2.03125,
1022
  "learning_rate": 2.367486188632446e-06,
1023
- "logits/chosen": -0.67156982421875,
1024
- "logits/rejected": -0.8070074319839478,
1025
- "logps/chosen": -112.666748046875,
1026
- "logps/rejected": -131.92593383789062,
1027
- "loss": 0.6896,
1028
- "pred_label": 1734.375,
1029
- "rewards/accuracies": 0.375,
1030
- "rewards/chosen": -0.35928016901016235,
1031
- "rewards/margins": 0.22706659138202667,
1032
- "rewards/rejected": -0.5863467454910278,
1033
  "step": 540,
1034
- "use_label": 9347.625
1035
- },
1036
- {
1037
- "epoch": 0.58,
1038
- "grad_norm": 1.796875,
1039
- "learning_rate": 2.276292003092593e-06,
1040
- "logits/chosen": -0.7944391369819641,
1041
- "logits/rejected": -0.7596977353096008,
1042
- "logps/chosen": -107.38740539550781,
1043
- "logps/rejected": -111.28292083740234,
1044
- "loss": 0.6887,
1045
- "pred_label": 1775.7249755859375,
1046
- "rewards/accuracies": 0.3125,
1047
- "rewards/chosen": -0.3932684063911438,
1048
- "rewards/margins": 0.12325477600097656,
1049
- "rewards/rejected": -0.5165232419967651,
1050
- "step": 550,
1051
- "use_label": 9466.275390625
1052
  },
1053
  {
1054
  "epoch": 0.59,
1055
- "grad_norm": 1.3515625,
1056
  "learning_rate": 2.1853970071701415e-06,
1057
- "logits/chosen": -0.7152852416038513,
1058
- "logits/rejected": -0.7174454927444458,
1059
- "logps/chosen": -104.6649398803711,
1060
- "logps/rejected": -117.61528015136719,
1061
- "loss": 0.6901,
1062
- "pred_label": 1814.375,
1063
- "rewards/accuracies": 0.32499998807907104,
1064
- "rewards/chosen": -0.3510952889919281,
1065
- "rewards/margins": 0.15508435666561127,
1066
- "rewards/rejected": -0.5061796307563782,
1067
  "step": 560,
1068
- "use_label": 9587.625
1069
- },
1070
- {
1071
- "epoch": 0.6,
1072
- "grad_norm": 2.125,
1073
- "learning_rate": 2.0949227648656194e-06,
1074
- "logits/chosen": -0.925454318523407,
1075
- "logits/rejected": -0.849765956401825,
1076
- "logps/chosen": -100.53346252441406,
1077
- "logps/rejected": -131.70309448242188,
1078
- "loss": 0.6872,
1079
- "pred_label": 1852.2249755859375,
1080
- "rewards/accuracies": 0.375,
1081
- "rewards/chosen": -0.3393338620662689,
1082
- "rewards/margins": 0.23398590087890625,
1083
- "rewards/rejected": -0.5733197927474976,
1084
- "step": 570,
1085
- "use_label": 9709.775390625
1086
  },
1087
  {
1088
  "epoch": 0.61,
1089
- "grad_norm": 1.15625,
1090
  "learning_rate": 2.00499027745888e-06,
1091
- "logits/chosen": -0.7680953145027161,
1092
- "logits/rejected": -0.8566532135009766,
1093
- "logps/chosen": -111.98583984375,
1094
- "logps/rejected": -131.1743927001953,
1095
- "loss": 0.6879,
1096
- "pred_label": 1893.7750244140625,
1097
- "rewards/accuracies": 0.34375,
1098
- "rewards/chosen": -0.37074294686317444,
1099
- "rewards/margins": 0.1566895693540573,
1100
- "rewards/rejected": -0.5274325013160706,
1101
  "step": 580,
1102
- "use_label": 9828.224609375
1103
- },
1104
- {
1105
- "epoch": 0.62,
1106
- "grad_norm": 1.1171875,
1107
- "learning_rate": 1.915719821680624e-06,
1108
- "logits/chosen": -0.8080962300300598,
1109
- "logits/rejected": -0.7905328869819641,
1110
- "logps/chosen": -125.2184066772461,
1111
- "logps/rejected": -148.79432678222656,
1112
- "loss": 0.6891,
1113
- "pred_label": 1939.25,
1114
- "rewards/accuracies": 0.40625,
1115
- "rewards/chosen": -0.4552985727787018,
1116
- "rewards/margins": 0.22290782630443573,
1117
- "rewards/rejected": -0.6782063245773315,
1118
- "step": 590,
1119
- "use_label": 9942.75
1120
  },
1121
  {
1122
  "epoch": 0.63,
1123
- "grad_norm": 1.9609375,
1124
  "learning_rate": 1.8272307888529276e-06,
1125
- "logits/chosen": -0.5244548320770264,
1126
- "logits/rejected": -0.7590290904045105,
1127
- "logps/chosen": -122.6807632446289,
1128
- "logps/rejected": -162.36203002929688,
1129
- "loss": 0.6881,
1130
- "pred_label": 1992.0,
1131
- "rewards/accuracies": 0.42500001192092896,
1132
- "rewards/chosen": -0.48354387283325195,
1133
- "rewards/margins": 0.23392179608345032,
1134
- "rewards/rejected": -0.7174656391143799,
1135
  "step": 600,
1136
- "use_label": 10050.0
1137
  },
1138
  {
1139
  "epoch": 0.63,
1140
- "eval_logits/chosen": -0.35794487595558167,
1141
- "eval_logits/rejected": -0.2547617554664612,
1142
- "eval_logps/chosen": -107.16178131103516,
1143
- "eval_logps/rejected": -135.9844512939453,
1144
- "eval_loss": 0.6918326616287231,
1145
- "eval_pred_label": 2082.3173828125,
1146
- "eval_rewards/accuracies": 0.3531745970249176,
1147
- "eval_rewards/chosen": -0.3826123774051666,
1148
- "eval_rewards/margins": 0.22325147688388824,
1149
- "eval_rewards/rejected": -0.6058638095855713,
1150
- "eval_runtime": 248.3104,
1151
- "eval_samples_per_second": 8.054,
1152
  "eval_steps_per_second": 0.254,
1153
- "eval_use_label": 10293.6826171875,
1154
  "step": 600
1155
  },
1156
- {
1157
- "epoch": 0.64,
1158
- "grad_norm": 1.515625,
1159
- "learning_rate": 1.739641525213929e-06,
1160
- "logits/chosen": -0.572044312953949,
1161
- "logits/rejected": -0.654716432094574,
1162
- "logps/chosen": -95.46563720703125,
1163
- "logps/rejected": -132.0639190673828,
1164
- "loss": 0.6926,
1165
- "pred_label": 2185.449951171875,
1166
- "rewards/accuracies": 0.33125001192092896,
1167
- "rewards/chosen": -0.3655874729156494,
1168
- "rewards/margins": 0.21378450095653534,
1169
- "rewards/rejected": -0.579371988773346,
1170
- "step": 610,
1171
- "use_label": 10520.5498046875
1172
- },
1173
  {
1174
  "epoch": 0.65,
1175
- "grad_norm": 1.0859375,
1176
  "learning_rate": 1.6530691736402317e-06,
1177
- "logits/chosen": -0.7425838708877563,
1178
- "logits/rejected": -0.7612688541412354,
1179
- "logps/chosen": -98.45491790771484,
1180
- "logps/rejected": -139.22779846191406,
1181
- "loss": 0.6874,
1182
- "pred_label": 2228.10009765625,
1183
- "rewards/accuracies": 0.39375001192092896,
1184
- "rewards/chosen": -0.3674684762954712,
1185
- "rewards/margins": 0.22383132576942444,
1186
- "rewards/rejected": -0.591299831867218,
1187
  "step": 620,
1188
- "use_label": 10637.900390625
1189
- },
1190
- {
1191
- "epoch": 0.66,
1192
- "grad_norm": 1.34375,
1193
- "learning_rate": 1.5676295169786864e-06,
1194
- "logits/chosen": -0.5626051425933838,
1195
- "logits/rejected": -0.7373117208480835,
1196
- "logps/chosen": -109.76419830322266,
1197
- "logps/rejected": -132.89573669433594,
1198
- "loss": 0.6861,
1199
- "pred_label": 2268.074951171875,
1200
- "rewards/accuracies": 0.35624998807907104,
1201
- "rewards/chosen": -0.3673921525478363,
1202
- "rewards/margins": 0.2162620723247528,
1203
- "rewards/rejected": -0.5836542844772339,
1204
- "step": 630,
1205
- "use_label": 10757.9248046875
1206
  },
1207
  {
1208
  "epoch": 0.67,
1209
- "grad_norm": 1.2578125,
1210
  "learning_rate": 1.4834368231970922e-06,
1211
- "logits/chosen": -0.70842045545578,
1212
- "logits/rejected": -0.5356844663619995,
1213
- "logps/chosen": -115.94453430175781,
1214
- "logps/rejected": -132.53977966308594,
1215
- "loss": 0.6881,
1216
- "pred_label": 2312.199951171875,
1217
- "rewards/accuracies": 0.38749998807907104,
1218
- "rewards/chosen": -0.4425238072872162,
1219
- "rewards/margins": 0.23113970458507538,
1220
- "rewards/rejected": -0.6736636161804199,
1221
  "step": 640,
1222
- "use_label": 10873.7998046875
1223
- },
1224
- {
1225
- "epoch": 0.68,
1226
- "grad_norm": 1.5,
1227
- "learning_rate": 1.4006036925609245e-06,
1228
- "logits/chosen": -0.7530516386032104,
1229
- "logits/rejected": -0.39667490124702454,
1230
- "logps/chosen": -117.97354888916016,
1231
- "logps/rejected": -148.5204620361328,
1232
- "loss": 0.6907,
1233
- "pred_label": 2364.60009765625,
1234
- "rewards/accuracies": 0.35624998807907104,
1235
- "rewards/chosen": -0.4478411078453064,
1236
- "rewards/margins": 0.25875502824783325,
1237
- "rewards/rejected": -0.7065961956977844,
1238
- "step": 650,
1239
- "use_label": 10981.400390625
1240
  },
1241
  {
1242
  "epoch": 0.69,
1243
- "grad_norm": 1.2109375,
1244
  "learning_rate": 1.3192409070404582e-06,
1245
- "logits/chosen": -0.4164413511753082,
1246
- "logits/rejected": -0.5387105345726013,
1247
- "logps/chosen": -93.08172607421875,
1248
- "logps/rejected": -106.9631576538086,
1249
- "loss": 0.6884,
1250
- "pred_label": 2410.39990234375,
1251
- "rewards/accuracies": 0.3062500059604645,
1252
- "rewards/chosen": -0.3495523929595947,
1253
- "rewards/margins": 0.1542079746723175,
1254
- "rewards/rejected": -0.5037603378295898,
1255
  "step": 660,
1256
- "use_label": 11095.599609375
1257
- },
1258
- {
1259
- "epoch": 0.7,
1260
- "grad_norm": 1.515625,
1261
- "learning_rate": 1.2394572821496953e-06,
1262
- "logits/chosen": -0.9564473032951355,
1263
- "logits/rejected": -1.0122594833374023,
1264
- "logps/chosen": -100.20994567871094,
1265
- "logps/rejected": -121.32554626464844,
1266
- "loss": 0.6935,
1267
- "pred_label": 2446.14990234375,
1268
- "rewards/accuracies": 0.34375,
1269
- "rewards/chosen": -0.3450331687927246,
1270
- "rewards/margins": 0.19006122648715973,
1271
- "rewards/rejected": -0.5350943803787231,
1272
- "step": 670,
1273
- "use_label": 11219.849609375
1274
  },
1275
  {
1276
  "epoch": 0.71,
1277
- "grad_norm": 1.546875,
1278
  "learning_rate": 1.1613595214152713e-06,
1279
- "logits/chosen": -0.588452935218811,
1280
- "logits/rejected": -0.6323766708374023,
1281
- "logps/chosen": -125.20991516113281,
1282
- "logps/rejected": -139.94993591308594,
1283
- "loss": 0.6902,
1284
- "pred_label": 2485.10009765625,
1285
- "rewards/accuracies": 0.38749998807907104,
1286
- "rewards/chosen": -0.3915707468986511,
1287
- "rewards/margins": 0.19166378676891327,
1288
- "rewards/rejected": -0.5832345485687256,
1289
  "step": 680,
1290
- "use_label": 11340.900390625
1291
- },
1292
- {
1293
- "epoch": 0.72,
1294
- "grad_norm": 1.578125,
1295
- "learning_rate": 1.0850520736699362e-06,
1296
- "logits/chosen": -0.6506579518318176,
1297
- "logits/rejected": -0.7167869806289673,
1298
- "logps/chosen": -144.53038024902344,
1299
- "logps/rejected": -167.38192749023438,
1300
- "loss": 0.6898,
1301
- "pred_label": 2534.75,
1302
- "rewards/accuracies": 0.39375001192092896,
1303
- "rewards/chosen": -0.42825189232826233,
1304
- "rewards/margins": 0.28569427132606506,
1305
- "rewards/rejected": -0.7139460444450378,
1306
- "step": 690,
1307
- "use_label": 11451.25
1308
  },
1309
  {
1310
  "epoch": 0.73,
1311
- "grad_norm": 1.59375,
1312
  "learning_rate": 1.0106369933615043e-06,
1313
- "logits/chosen": -0.8556931614875793,
1314
- "logits/rejected": -0.6913198232650757,
1315
- "logps/chosen": -105.3968505859375,
1316
- "logps/rejected": -124.95710754394531,
1317
- "loss": 0.6913,
1318
- "pred_label": 2580.824951171875,
1319
- "rewards/accuracies": 0.3375000059604645,
1320
- "rewards/chosen": -0.39049768447875977,
1321
- "rewards/margins": 0.17418017983436584,
1322
- "rewards/rejected": -0.564677894115448,
1323
  "step": 700,
1324
- "use_label": 11565.1748046875
1325
  },
1326
  {
1327
  "epoch": 0.73,
1328
- "eval_logits/chosen": -0.3469957709312439,
1329
- "eval_logits/rejected": -0.24619349837303162,
1330
- "eval_logps/chosen": -104.32471466064453,
1331
- "eval_logps/rejected": -133.26370239257812,
1332
- "eval_loss": 0.6898515224456787,
1333
- "eval_pred_label": 2673.52392578125,
1334
- "eval_rewards/accuracies": 0.3670634925365448,
1335
- "eval_rewards/chosen": -0.35424166917800903,
1336
- "eval_rewards/margins": 0.22441466152668,
1337
- "eval_rewards/rejected": -0.5786563754081726,
1338
- "eval_runtime": 248.2749,
1339
- "eval_samples_per_second": 8.056,
1340
  "eval_steps_per_second": 0.254,
1341
- "eval_use_label": 11806.4765625,
1342
  "step": 700
1343
  },
1344
- {
1345
- "epoch": 0.74,
1346
- "grad_norm": 1.03125,
1347
- "learning_rate": 9.382138040640714e-07,
1348
- "logits/chosen": -0.6519032716751099,
1349
- "logits/rejected": -0.637380063533783,
1350
- "logps/chosen": -102.23021697998047,
1351
- "logps/rejected": -127.60137939453125,
1352
- "loss": 0.6903,
1353
- "pred_label": 2771.699951171875,
1354
- "rewards/accuracies": 0.3499999940395355,
1355
- "rewards/chosen": -0.3915974497795105,
1356
- "rewards/margins": 0.21561889350414276,
1357
- "rewards/rejected": -0.6072162985801697,
1358
- "step": 710,
1359
- "use_label": 12038.2998046875
1360
- },
1361
  {
1362
  "epoch": 0.75,
1363
- "grad_norm": 1.609375,
1364
  "learning_rate": 8.678793653740633e-07,
1365
- "logits/chosen": -0.6509895324707031,
1366
- "logits/rejected": -0.6935362815856934,
1367
- "logps/chosen": -87.30061340332031,
1368
- "logps/rejected": -114.2796630859375,
1369
- "loss": 0.6903,
1370
- "pred_label": 2811.47509765625,
1371
- "rewards/accuracies": 0.29374998807907104,
1372
- "rewards/chosen": -0.30430155992507935,
1373
- "rewards/margins": 0.18221500515937805,
1374
- "rewards/rejected": -0.486516535282135,
1375
  "step": 720,
1376
- "use_label": 12158.525390625
1377
- },
1378
- {
1379
- "epoch": 0.76,
1380
- "grad_norm": 2.21875,
1381
- "learning_rate": 7.997277433690984e-07,
1382
- "logits/chosen": -0.6035222411155701,
1383
- "logits/rejected": -0.65208500623703,
1384
- "logps/chosen": -100.17440032958984,
1385
- "logps/rejected": -119.87808990478516,
1386
- "loss": 0.6865,
1387
- "pred_label": 2850.0,
1388
- "rewards/accuracies": 0.35624998807907104,
1389
- "rewards/chosen": -0.2982019782066345,
1390
- "rewards/margins": 0.2585477828979492,
1391
- "rewards/rejected": -0.5567497611045837,
1392
- "step": 730,
1393
- "use_label": 12280.0
1394
  },
1395
  {
1396
  "epoch": 0.77,
1397
- "grad_norm": 0.80859375,
1398
  "learning_rate": 7.338500848029603e-07,
1399
- "logits/chosen": -0.4770827293395996,
1400
- "logits/rejected": -0.5081530213356018,
1401
- "logps/chosen": -94.86068725585938,
1402
- "logps/rejected": -116.67037200927734,
1403
- "loss": 0.6916,
1404
- "pred_label": 2886.125,
1405
- "rewards/accuracies": 0.28125,
1406
- "rewards/chosen": -0.34235304594039917,
1407
- "rewards/margins": 0.19017408788204193,
1408
- "rewards/rejected": -0.5325270891189575,
1409
  "step": 740,
1410
- "use_label": 12403.875
1411
- },
1412
- {
1413
- "epoch": 0.79,
1414
- "grad_norm": 1.1015625,
1415
- "learning_rate": 6.70334495204884e-07,
1416
- "logits/chosen": -0.5357509851455688,
1417
- "logits/rejected": -0.594279408454895,
1418
- "logps/chosen": -119.76139831542969,
1419
- "logps/rejected": -145.1709747314453,
1420
- "loss": 0.6905,
1421
- "pred_label": 2929.22509765625,
1422
- "rewards/accuracies": 0.35624998807907104,
1423
- "rewards/chosen": -0.4223107397556305,
1424
- "rewards/margins": 0.18705633282661438,
1425
- "rewards/rejected": -0.6093670725822449,
1426
- "step": 750,
1427
- "use_label": 12520.775390625
1428
  },
1429
  {
1430
  "epoch": 0.8,
1431
- "grad_norm": 1.1640625,
1432
  "learning_rate": 6.092659210462232e-07,
1433
- "logits/chosen": -0.6737512350082397,
1434
- "logits/rejected": -0.6523575186729431,
1435
- "logps/chosen": -86.640625,
1436
- "logps/rejected": -124.01812744140625,
1437
- "loss": 0.6899,
1438
- "pred_label": 2976.050048828125,
1439
- "rewards/accuracies": 0.3125,
1440
- "rewards/chosen": -0.32672789692878723,
1441
- "rewards/margins": 0.1930442750453949,
1442
- "rewards/rejected": -0.5197721719741821,
1443
  "step": 760,
1444
- "use_label": 12633.9501953125
1445
- },
1446
- {
1447
- "epoch": 0.81,
1448
- "grad_norm": 1.4375,
1449
- "learning_rate": 5.507260361320738e-07,
1450
- "logits/chosen": -0.6238114833831787,
1451
- "logits/rejected": -0.6686199307441711,
1452
- "logps/chosen": -127.0525131225586,
1453
- "logps/rejected": -142.44747924804688,
1454
- "loss": 0.689,
1455
- "pred_label": 3021.85009765625,
1456
- "rewards/accuracies": 0.39375001192092896,
1457
- "rewards/chosen": -0.43505221605300903,
1458
- "rewards/margins": 0.25210094451904297,
1459
- "rewards/rejected": -0.687153160572052,
1460
- "step": 770,
1461
- "use_label": 12748.150390625
1462
  },
1463
  {
1464
  "epoch": 0.82,
1465
- "grad_norm": 1.7578125,
1466
  "learning_rate": 4.947931323697983e-07,
1467
- "logits/chosen": -0.6369722485542297,
1468
- "logits/rejected": -0.7722553014755249,
1469
- "logps/chosen": -112.76126861572266,
1470
- "logps/rejected": -133.56796264648438,
1471
- "loss": 0.6915,
1472
- "pred_label": 3075.72509765625,
1473
- "rewards/accuracies": 0.36250001192092896,
1474
- "rewards/chosen": -0.3996170461177826,
1475
- "rewards/margins": 0.22261002659797668,
1476
- "rewards/rejected": -0.6222270727157593,
1477
  "step": 780,
1478
- "use_label": 12854.275390625
1479
- },
1480
- {
1481
- "epoch": 0.83,
1482
- "grad_norm": 1.421875,
1483
- "learning_rate": 4.4154201506053985e-07,
1484
- "logits/chosen": -0.5256940126419067,
1485
- "logits/rejected": -0.467402845621109,
1486
- "logps/chosen": -95.73258209228516,
1487
- "logps/rejected": -103.3360366821289,
1488
- "loss": 0.6917,
1489
- "pred_label": 3123.85009765625,
1490
- "rewards/accuracies": 0.32499998807907104,
1491
- "rewards/chosen": -0.30898317694664,
1492
- "rewards/margins": 0.2029590606689453,
1493
- "rewards/rejected": -0.5119422674179077,
1494
- "step": 790,
1495
- "use_label": 12966.150390625
1496
  },
1497
  {
1498
  "epoch": 0.84,
1499
- "grad_norm": 1.359375,
1500
  "learning_rate": 3.910439028537638e-07,
1501
- "logits/chosen": -0.6677756905555725,
1502
- "logits/rejected": -0.607046902179718,
1503
- "logps/chosen": -92.61612701416016,
1504
- "logps/rejected": -115.20296478271484,
1505
- "loss": 0.6893,
1506
- "pred_label": 3166.449951171875,
1507
- "rewards/accuracies": 0.3499999940395355,
1508
- "rewards/chosen": -0.3256850242614746,
1509
- "rewards/margins": 0.20536477863788605,
1510
- "rewards/rejected": -0.5310498476028442,
1511
  "step": 800,
1512
- "use_label": 13083.5498046875
1513
  },
1514
  {
1515
  "epoch": 0.84,
1516
- "eval_logits/chosen": -0.23666124045848846,
1517
- "eval_logits/rejected": -0.1293245106935501,
1518
- "eval_logps/chosen": -103.33552551269531,
1519
- "eval_logps/rejected": -132.24159240722656,
1520
- "eval_loss": 0.6903889179229736,
1521
- "eval_pred_label": 3252.09521484375,
1522
  "eval_rewards/accuracies": 0.363095223903656,
1523
- "eval_rewards/chosen": -0.34434974193573,
1524
- "eval_rewards/margins": 0.22408555448055267,
1525
- "eval_rewards/rejected": -0.5684353113174438,
1526
- "eval_runtime": 248.2839,
1527
- "eval_samples_per_second": 8.055,
1528
  "eval_steps_per_second": 0.254,
1529
- "eval_use_label": 13331.904296875,
1530
  "step": 800
1531
  },
1532
- {
1533
- "epoch": 0.85,
1534
- "grad_norm": 1.3828125,
1535
- "learning_rate": 3.4336633249862084e-07,
1536
- "logits/chosen": -0.6630854606628418,
1537
- "logits/rejected": -0.6445407867431641,
1538
- "logps/chosen": -108.18148040771484,
1539
- "logps/rejected": -135.99142456054688,
1540
- "loss": 0.6901,
1541
- "pred_label": 3350.35009765625,
1542
- "rewards/accuracies": 0.34375,
1543
- "rewards/chosen": -0.3832666873931885,
1544
- "rewards/margins": 0.1908622682094574,
1545
- "rewards/rejected": -0.5741289258003235,
1546
- "step": 810,
1547
- "use_label": 13563.650390625
1548
- },
1549
  {
1550
  "epoch": 0.86,
1551
- "grad_norm": 1.3359375,
1552
  "learning_rate": 2.98573068519539e-07,
1553
- "logits/chosen": -0.6042599081993103,
1554
- "logits/rejected": -0.6371781826019287,
1555
- "logps/chosen": -94.31297302246094,
1556
- "logps/rejected": -101.22802734375,
1557
- "loss": 0.689,
1558
- "pred_label": 3393.47509765625,
1559
- "rewards/accuracies": 0.29374998807907104,
1560
- "rewards/chosen": -0.3432285487651825,
1561
- "rewards/margins": 0.13310988247394562,
1562
- "rewards/rejected": -0.4763384461402893,
1563
  "step": 820,
1564
- "use_label": 13680.525390625
1565
- },
1566
- {
1567
- "epoch": 0.87,
1568
- "grad_norm": 1.484375,
1569
- "learning_rate": 2.5672401793681854e-07,
1570
- "logits/chosen": -0.5476540923118591,
1571
- "logits/rejected": -0.43125781416893005,
1572
- "logps/chosen": -86.91058349609375,
1573
- "logps/rejected": -110.5887222290039,
1574
- "loss": 0.6923,
1575
- "pred_label": 3435.074951171875,
1576
- "rewards/accuracies": 0.36250001192092896,
1577
- "rewards/chosen": -0.2886909246444702,
1578
- "rewards/margins": 0.25071993470191956,
1579
- "rewards/rejected": -0.5394108295440674,
1580
- "step": 830,
1581
- "use_label": 13798.9248046875
1582
  },
1583
  {
1584
  "epoch": 0.88,
1585
- "grad_norm": 1.9296875,
1586
  "learning_rate": 2.178751501463036e-07,
1587
- "logits/chosen": -0.5565081834793091,
1588
- "logits/rejected": -0.6612057685852051,
1589
- "logps/chosen": -89.98490142822266,
1590
- "logps/rejected": -93.48139953613281,
1591
  "loss": 0.6915,
1592
- "pred_label": 3471.35009765625,
1593
- "rewards/accuracies": 0.24375000596046448,
1594
- "rewards/chosen": -0.306854248046875,
1595
- "rewards/margins": 0.09164027869701385,
1596
- "rewards/rejected": -0.39849454164505005,
1597
  "step": 840,
1598
- "use_label": 13922.650390625
1599
- },
1600
- {
1601
- "epoch": 0.89,
1602
- "grad_norm": 1.359375,
1603
- "learning_rate": 1.820784220652766e-07,
1604
- "logits/chosen": -0.6778563261032104,
1605
- "logits/rejected": -0.73534095287323,
1606
- "logps/chosen": -120.2663345336914,
1607
- "logps/rejected": -149.02294921875,
1608
- "loss": 0.6854,
1609
- "pred_label": 3509.0,
1610
- "rewards/accuracies": 0.41874998807907104,
1611
- "rewards/chosen": -0.36049091815948486,
1612
- "rewards/margins": 0.2984590530395508,
1613
- "rewards/rejected": -0.6589499711990356,
1614
- "step": 850,
1615
- "use_label": 14045.0
1616
  },
1617
  {
1618
  "epoch": 0.9,
1619
- "grad_norm": 1.796875,
1620
  "learning_rate": 1.4938170864468636e-07,
1621
- "logits/chosen": -0.5929479002952576,
1622
- "logits/rejected": -0.48117414116859436,
1623
- "logps/chosen": -115.10990142822266,
1624
- "logps/rejected": -133.1912841796875,
1625
- "loss": 0.6892,
1626
- "pred_label": 3556.324951171875,
1627
- "rewards/accuracies": 0.3812499940395355,
1628
- "rewards/chosen": -0.33908045291900635,
1629
- "rewards/margins": 0.23609444499015808,
1630
- "rewards/rejected": -0.5751749277114868,
1631
  "step": 860,
1632
- "use_label": 14157.6748046875
1633
- },
1634
- {
1635
- "epoch": 0.91,
1636
- "grad_norm": 1.7578125,
1637
- "learning_rate": 1.1982873884064466e-07,
1638
- "logits/chosen": -0.6633087992668152,
1639
- "logits/rejected": -0.6678288578987122,
1640
- "logps/chosen": -117.92154693603516,
1641
- "logps/rejected": -145.3701171875,
1642
- "loss": 0.6893,
1643
- "pred_label": 3603.75,
1644
- "rewards/accuracies": 0.375,
1645
- "rewards/chosen": -0.3660942316055298,
1646
- "rewards/margins": 0.2644110918045044,
1647
- "rewards/rejected": -0.6305053234100342,
1648
- "step": 870,
1649
- "use_label": 14270.25
1650
  },
1651
  {
1652
  "epoch": 0.92,
1653
- "grad_norm": 0.87890625,
1654
  "learning_rate": 9.345903713082305e-08,
1655
- "logits/chosen": -0.5895944237709045,
1656
- "logits/rejected": -0.5510295629501343,
1657
- "logps/chosen": -96.94719696044922,
1658
- "logps/rejected": -141.16554260253906,
1659
- "loss": 0.6891,
1660
- "pred_label": 3651.0,
1661
- "rewards/accuracies": 0.4312500059604645,
1662
- "rewards/chosen": -0.3419613242149353,
1663
- "rewards/margins": 0.32287630438804626,
1664
- "rewards/rejected": -0.6648377180099487,
1665
  "step": 880,
1666
- "use_label": 14383.0
1667
- },
1668
- {
1669
- "epoch": 0.93,
1670
- "grad_norm": 1.6484375,
1671
- "learning_rate": 7.030787065396866e-08,
1672
- "logits/chosen": -0.5159703493118286,
1673
- "logits/rejected": -0.5519541501998901,
1674
- "logps/chosen": -96.9026107788086,
1675
- "logps/rejected": -120.7626724243164,
1676
- "loss": 0.693,
1677
- "pred_label": 3690.675048828125,
1678
- "rewards/accuracies": 0.28125,
1679
- "rewards/chosen": -0.3307461142539978,
1680
- "rewards/margins": 0.1426464170217514,
1681
- "rewards/rejected": -0.4733925461769104,
1682
- "step": 890,
1683
- "use_label": 14503.3251953125
1684
  },
1685
  {
1686
  "epoch": 0.94,
1687
- "grad_norm": 1.9609375,
1688
  "learning_rate": 5.0406202043228604e-08,
1689
- "logits/chosen": -0.2721698582172394,
1690
- "logits/rejected": -0.407818466424942,
1691
- "logps/chosen": -104.2662582397461,
1692
- "logps/rejected": -149.70314025878906,
1693
- "loss": 0.689,
1694
- "pred_label": 3732.824951171875,
1695
- "rewards/accuracies": 0.39375001192092896,
1696
- "rewards/chosen": -0.3485477864742279,
1697
- "rewards/margins": 0.2633667290210724,
1698
- "rewards/rejected": -0.6119145154953003,
1699
  "step": 900,
1700
- "use_label": 14621.1748046875
1701
  },
1702
  {
1703
  "epoch": 0.94,
1704
- "eval_logits/chosen": -0.2437347173690796,
1705
- "eval_logits/rejected": -0.13671822845935822,
1706
- "eval_logps/chosen": -103.0300521850586,
1707
- "eval_logps/rejected": -131.91110229492188,
1708
- "eval_loss": 0.6907457709312439,
1709
- "eval_pred_label": 3821.52392578125,
1710
- "eval_rewards/accuracies": 0.363095223903656,
1711
- "eval_rewards/chosen": -0.3412950336933136,
1712
- "eval_rewards/margins": 0.22383520007133484,
1713
- "eval_rewards/rejected": -0.5651301741600037,
1714
- "eval_runtime": 248.2504,
1715
- "eval_samples_per_second": 8.056,
1716
  "eval_steps_per_second": 0.254,
1717
- "eval_use_label": 14866.4765625,
1718
  "step": 900
1719
  },
1720
- {
1721
- "epoch": 0.95,
1722
- "grad_norm": 1.171875,
1723
- "learning_rate": 3.378064801637687e-08,
1724
- "logits/chosen": -0.5370496511459351,
1725
- "logits/rejected": -0.5028234720230103,
1726
- "logps/chosen": -89.67744445800781,
1727
- "logps/rejected": -113.96895599365234,
1728
- "loss": 0.6882,
1729
- "pred_label": 3916.52490234375,
1730
- "rewards/accuracies": 0.3187499940395355,
1731
- "rewards/chosen": -0.2901899218559265,
1732
- "rewards/margins": 0.2133828103542328,
1733
- "rewards/rejected": -0.5035727024078369,
1734
- "step": 910,
1735
- "use_label": 15101.474609375
1736
- },
1737
  {
1738
  "epoch": 0.96,
1739
- "grad_norm": 1.3125,
1740
  "learning_rate": 2.0453443778310766e-08,
1741
- "logits/chosen": -0.43033066391944885,
1742
- "logits/rejected": -0.4173038899898529,
1743
- "logps/chosen": -80.09765625,
1744
- "logps/rejected": -120.93513488769531,
1745
- "loss": 0.6934,
1746
- "pred_label": 3958.0,
1747
- "rewards/accuracies": 0.3125,
1748
- "rewards/chosen": -0.26141807436943054,
1749
- "rewards/margins": 0.23344416916370392,
1750
- "rewards/rejected": -0.49486222863197327,
1751
  "step": 920,
1752
- "use_label": 15220.0
1753
- },
1754
- {
1755
- "epoch": 0.97,
1756
- "grad_norm": 2.109375,
1757
- "learning_rate": 1.0442413283435759e-08,
1758
- "logits/chosen": -0.4513850212097168,
1759
- "logits/rejected": -0.5099025964736938,
1760
- "logps/chosen": -92.44239807128906,
1761
- "logps/rejected": -119.61177062988281,
1762
- "loss": 0.6878,
1763
- "pred_label": 3998.60009765625,
1764
- "rewards/accuracies": 0.33125001192092896,
1765
- "rewards/chosen": -0.29288578033447266,
1766
- "rewards/margins": 0.20934204757213593,
1767
- "rewards/rejected": -0.502227783203125,
1768
- "step": 930,
1769
- "use_label": 15339.400390625
1770
  },
1771
  {
1772
  "epoch": 0.98,
1773
- "grad_norm": 1.25,
1774
  "learning_rate": 3.760945397705828e-09,
1775
- "logits/chosen": -0.3625331521034241,
1776
- "logits/rejected": -0.5358187556266785,
1777
- "logps/chosen": -103.41780090332031,
1778
- "logps/rejected": -130.23828125,
1779
- "loss": 0.691,
1780
- "pred_label": 4038.60009765625,
1781
- "rewards/accuracies": 0.3187499940395355,
1782
- "rewards/chosen": -0.34467238187789917,
1783
- "rewards/margins": 0.18087737262248993,
1784
- "rewards/rejected": -0.5255497694015503,
1785
  "step": 940,
1786
- "use_label": 15459.400390625
1787
- },
1788
- {
1789
- "epoch": 0.99,
1790
- "grad_norm": 1.59375,
1791
- "learning_rate": 4.1797599220405605e-10,
1792
- "logits/chosen": -0.674268901348114,
1793
- "logits/rejected": -0.7018919587135315,
1794
- "logps/chosen": -114.91938781738281,
1795
- "logps/rejected": -133.3175506591797,
1796
- "loss": 0.6895,
1797
- "pred_label": 4082.625,
1798
- "rewards/accuracies": 0.33125001192092896,
1799
- "rewards/chosen": -0.3830910325050354,
1800
- "rewards/margins": 0.1591145098209381,
1801
- "rewards/rejected": -0.5422054529190063,
1802
- "step": 950,
1803
- "use_label": 15575.375
1804
  },
1805
  {
1806
  "epoch": 1.0,
1807
  "step": 955,
1808
  "total_flos": 0.0,
1809
- "train_loss": 0.6880922077838039,
1810
- "train_runtime": 20023.3666,
1811
  "train_samples_per_second": 3.053,
1812
  "train_steps_per_second": 0.048
1813
  }
1814
  ],
1815
- "logging_steps": 10,
1816
  "max_steps": 955,
1817
  "num_input_tokens_seen": 0,
1818
  "num_train_epochs": 1,
1819
- "save_steps": 50,
1820
  "total_flos": 0.0,
1821
  "train_batch_size": 4,
1822
  "trial_name": null,
 
25
  "step": 1,
26
  "use_label": 10.0
27
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  {
29
  "epoch": 0.02,
30
  "grad_norm": 0.6796875,
31
  "learning_rate": 1.0416666666666667e-06,
32
+ "logits/chosen": -2.2281553745269775,
33
+ "logits/rejected": -2.276446580886841,
34
+ "logps/chosen": -57.036190032958984,
35
+ "logps/rejected": -66.88007354736328,
36
+ "loss": 0.6927,
37
  "pred_label": 0.0,
38
+ "rewards/accuracies": 0.24013157188892365,
39
+ "rewards/chosen": 0.003924594726413488,
40
+ "rewards/margins": 0.0009102027979679406,
41
+ "rewards/rejected": 0.0030143915209919214,
42
  "step": 20,
43
+ "use_label": 170.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  },
45
  {
46
  "epoch": 0.04,
47
  "grad_norm": 0.6328125,
48
  "learning_rate": 2.0833333333333334e-06,
49
+ "logits/chosen": -2.2738099098205566,
50
+ "logits/rejected": -2.2623789310455322,
51
+ "logps/chosen": -54.78137969970703,
52
+ "logps/rejected": -67.2437515258789,
53
+ "loss": 0.6914,
54
  "pred_label": 0.0,
55
+ "rewards/accuracies": 0.24687500298023224,
56
+ "rewards/chosen": 0.01747792772948742,
57
+ "rewards/margins": 0.001674558618105948,
58
+ "rewards/rejected": 0.015803368762135506,
59
  "step": 40,
60
+ "use_label": 482.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  },
62
  {
63
  "epoch": 0.06,
64
+ "grad_norm": 0.71875,
65
  "learning_rate": 3.125e-06,
66
+ "logits/chosen": -2.3237431049346924,
67
+ "logits/rejected": -2.321906089782715,
68
+ "logps/chosen": -75.5770034790039,
69
+ "logps/rejected": -87.68544006347656,
70
+ "loss": 0.6885,
71
  "pred_label": 0.0,
72
+ "rewards/accuracies": 0.3125,
73
+ "rewards/chosen": 0.031676117330789566,
74
+ "rewards/margins": 0.009719676338136196,
75
+ "rewards/rejected": 0.021956440061330795,
76
  "step": 60,
77
+ "use_label": 802.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  },
79
  {
80
  "epoch": 0.08,
81
+ "grad_norm": 0.73828125,
82
  "learning_rate": 4.166666666666667e-06,
83
+ "logits/chosen": -2.2948005199432373,
84
+ "logits/rejected": -2.2623462677001953,
85
+ "logps/chosen": -79.29240417480469,
86
+ "logps/rejected": -83.04844665527344,
87
+ "loss": 0.6876,
88
+ "pred_label": 5.800000190734863,
89
+ "rewards/accuracies": 0.3343749940395355,
90
+ "rewards/chosen": 0.016009245067834854,
91
+ "rewards/margins": 0.018887853249907494,
92
+ "rewards/rejected": -0.0028786074835807085,
93
  "step": 80,
94
+ "use_label": 1116.199951171875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  },
96
  {
97
  "epoch": 0.1,
98
+ "grad_norm": 0.6953125,
99
  "learning_rate": 4.9997324926814375e-06,
100
+ "logits/chosen": -2.2056884765625,
101
+ "logits/rejected": -2.210036039352417,
102
+ "logps/chosen": -68.87937927246094,
103
+ "logps/rejected": -77.87590026855469,
104
+ "loss": 0.6876,
105
+ "pred_label": 27.537500381469727,
106
+ "rewards/accuracies": 0.34062498807907104,
107
+ "rewards/chosen": -0.010471501387655735,
108
+ "rewards/margins": 0.03584115579724312,
109
+ "rewards/rejected": -0.04631265625357628,
110
  "step": 100,
111
+ "use_label": 1414.4625244140625
112
  },
113
  {
114
  "epoch": 0.1,
115
+ "eval_logits/chosen": -2.1076083183288574,
116
+ "eval_logits/rejected": -2.0761499404907227,
117
+ "eval_logps/chosen": -74.44951629638672,
118
+ "eval_logps/rejected": -85.2883071899414,
119
+ "eval_loss": 0.6895647048950195,
120
+ "eval_pred_label": 89.14286041259766,
121
+ "eval_rewards/accuracies": 0.335317462682724,
122
+ "eval_rewards/chosen": -0.05548960343003273,
123
+ "eval_rewards/margins": 0.04341282695531845,
124
+ "eval_rewards/rejected": -0.09890241920948029,
125
+ "eval_runtime": 247.5952,
126
+ "eval_samples_per_second": 8.078,
127
+ "eval_steps_per_second": 0.254,
128
+ "eval_use_label": 1766.857177734375,
129
  "step": 100
130
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  {
132
  "epoch": 0.13,
133
+ "grad_norm": 0.7578125,
134
  "learning_rate": 4.9903757462135984e-06,
135
+ "logits/chosen": -2.2542896270751953,
136
+ "logits/rejected": -2.1902401447296143,
137
+ "logps/chosen": -70.2941665649414,
138
+ "logps/rejected": -84.7874755859375,
139
+ "loss": 0.6884,
140
+ "pred_label": 155.6374969482422,
141
+ "rewards/accuracies": 0.3187499940395355,
142
+ "rewards/chosen": -0.023759985342621803,
143
+ "rewards/margins": 0.051492441445589066,
144
+ "rewards/rejected": -0.07525241374969482,
145
  "step": 120,
146
+ "use_label": 2110.362548828125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  },
148
  {
149
  "epoch": 0.15,
150
+ "grad_norm": 0.55859375,
151
  "learning_rate": 4.967700826904229e-06,
152
+ "logits/chosen": -2.1823272705078125,
153
+ "logits/rejected": -2.210157632827759,
154
+ "logps/chosen": -61.80498504638672,
155
+ "logps/rejected": -76.43424224853516,
156
+ "loss": 0.6907,
157
+ "pred_label": 204.22500610351562,
158
+ "rewards/accuracies": 0.26875001192092896,
159
+ "rewards/chosen": -0.029314354062080383,
160
+ "rewards/margins": 0.036702848970890045,
161
+ "rewards/rejected": -0.06601719558238983,
162
  "step": 140,
163
+ "use_label": 2381.77490234375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  },
165
  {
166
  "epoch": 0.17,
167
+ "grad_norm": 0.70703125,
168
  "learning_rate": 4.931828996974498e-06,
169
+ "logits/chosen": -2.251568555831909,
170
+ "logits/rejected": -2.220432996749878,
171
+ "logps/chosen": -66.60148620605469,
172
+ "logps/rejected": -71.53702545166016,
173
+ "loss": 0.69,
174
+ "pred_label": 257.2124938964844,
175
+ "rewards/accuracies": 0.3343749940395355,
176
+ "rewards/chosen": -0.020524730905890465,
177
+ "rewards/margins": 0.05932433158159256,
178
+ "rewards/rejected": -0.07984906435012817,
179
  "step": 160,
180
+ "use_label": 2648.78759765625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  },
182
  {
183
  "epoch": 0.19,
184
+ "grad_norm": 0.6796875,
185
  "learning_rate": 4.882952093833628e-06,
186
+ "logits/chosen": -2.114015817642212,
187
+ "logits/rejected": -2.126950740814209,
188
+ "logps/chosen": -66.40071868896484,
189
+ "logps/rejected": -78.54503631591797,
190
+ "loss": 0.6901,
191
+ "pred_label": 319.9624938964844,
192
+ "rewards/accuracies": 0.328125,
193
+ "rewards/chosen": -0.03171534463763237,
194
+ "rewards/margins": 0.0544399619102478,
195
+ "rewards/rejected": -0.08615531027317047,
196
  "step": 180,
197
+ "use_label": 2906.03759765625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  },
199
  {
200
  "epoch": 0.21,
201
+ "grad_norm": 0.9140625,
202
  "learning_rate": 4.821331504159906e-06,
203
+ "logits/chosen": -2.138213872909546,
204
+ "logits/rejected": -2.108750343322754,
205
+ "logps/chosen": -77.92289733886719,
206
+ "logps/rejected": -78.32075500488281,
207
+ "loss": 0.6892,
208
+ "pred_label": 383.5249938964844,
209
+ "rewards/accuracies": 0.37812501192092896,
210
+ "rewards/chosen": -0.009543296881020069,
211
+ "rewards/margins": 0.06037301942706108,
212
+ "rewards/rejected": -0.06991632282733917,
213
  "step": 200,
214
+ "use_label": 3162.47509765625
215
  },
216
  {
217
  "epoch": 0.21,
218
+ "eval_logits/chosen": -2.051973581314087,
219
+ "eval_logits/rejected": -2.028658390045166,
220
+ "eval_logps/chosen": -69.3875503540039,
221
+ "eval_logps/rejected": -80.99542999267578,
222
+ "eval_loss": 0.6893584132194519,
223
+ "eval_pred_label": 459.1111145019531,
224
+ "eval_rewards/accuracies": 0.3492063581943512,
225
+ "eval_rewards/chosen": -0.0048699695616960526,
226
+ "eval_rewards/margins": 0.05110359564423561,
227
+ "eval_rewards/rejected": -0.05597356706857681,
228
+ "eval_runtime": 247.8689,
229
+ "eval_samples_per_second": 8.069,
230
+ "eval_steps_per_second": 0.254,
231
+ "eval_use_label": 3500.888916015625,
232
  "step": 200
233
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  {
235
  "epoch": 0.23,
236
+ "grad_norm": 0.765625,
237
  "learning_rate": 4.747296766042161e-06,
238
+ "logits/chosen": -2.172316074371338,
239
+ "logits/rejected": -2.1599390506744385,
240
+ "logps/chosen": -73.75865173339844,
241
+ "logps/rejected": -76.45826721191406,
242
+ "loss": 0.6906,
243
+ "pred_label": 537.4000244140625,
244
+ "rewards/accuracies": 0.34375,
245
+ "rewards/chosen": -0.017265746369957924,
246
+ "rewards/margins": 0.061459798365831375,
247
+ "rewards/rejected": -0.07872554659843445,
248
  "step": 220,
249
+ "use_label": 3832.60009765625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  },
251
  {
252
  "epoch": 0.25,
253
+ "grad_norm": 0.671875,
254
  "learning_rate": 4.661243806657256e-06,
255
+ "logits/chosen": -2.1377243995666504,
256
+ "logits/rejected": -2.114131450653076,
257
+ "logps/chosen": -78.08522033691406,
258
+ "logps/rejected": -88.16291809082031,
259
+ "loss": 0.6906,
260
+ "pred_label": 610.8624877929688,
261
+ "rewards/accuracies": 0.3375000059604645,
262
+ "rewards/chosen": -0.06858871877193451,
263
+ "rewards/margins": 0.07855252921581268,
264
+ "rewards/rejected": -0.1471412628889084,
265
  "step": 240,
266
+ "use_label": 4079.137451171875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  },
268
  {
269
  "epoch": 0.27,
270
+ "grad_norm": 0.70703125,
271
  "learning_rate": 4.563632824908252e-06,
272
+ "logits/chosen": -2.1762757301330566,
273
+ "logits/rejected": -2.173243999481201,
274
+ "logps/chosen": -69.33678436279297,
275
+ "logps/rejected": -82.98787689208984,
276
+ "loss": 0.6907,
277
+ "pred_label": 682.2750244140625,
278
+ "rewards/accuracies": 0.33125001192092896,
279
+ "rewards/chosen": -0.06302420794963837,
280
+ "rewards/margins": 0.0732887014746666,
281
+ "rewards/rejected": -0.13631291687488556,
282
  "step": 260,
283
+ "use_label": 4327.72509765625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  },
285
  {
286
  "epoch": 0.29,
287
+ "grad_norm": 0.625,
288
  "learning_rate": 4.454985830346574e-06,
289
+ "logits/chosen": -2.16465425491333,
290
+ "logits/rejected": -2.1788923740386963,
291
+ "logps/chosen": -74.41441345214844,
292
+ "logps/rejected": -78.55416870117188,
293
+ "loss": 0.6892,
294
+ "pred_label": 749.125,
295
+ "rewards/accuracies": 0.3062500059604645,
296
+ "rewards/chosen": -0.06083650514483452,
297
+ "rewards/margins": 0.04520425945520401,
298
+ "rewards/rejected": -0.10604077577590942,
299
  "step": 280,
300
+ "use_label": 4580.875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
  },
302
  {
303
  "epoch": 0.31,
304
+ "grad_norm": 0.65234375,
305
  "learning_rate": 4.335883851539693e-06,
306
+ "logits/chosen": -2.0553781986236572,
307
+ "logits/rejected": -2.0573229789733887,
308
+ "logps/chosen": -69.96788024902344,
309
+ "logps/rejected": -80.52223205566406,
310
+ "loss": 0.6904,
311
+ "pred_label": 824.5499877929688,
312
+ "rewards/accuracies": 0.359375,
313
+ "rewards/chosen": -0.04866168648004532,
314
+ "rewards/margins": 0.09801270812749863,
315
+ "rewards/rejected": -0.14667439460754395,
316
  "step": 300,
317
+ "use_label": 4825.4501953125
318
  },
319
  {
320
  "epoch": 0.31,
321
+ "eval_logits/chosen": -2.0163989067077637,
322
+ "eval_logits/rejected": -1.9942671060562134,
323
+ "eval_logps/chosen": -75.15243530273438,
324
+ "eval_logps/rejected": -89.50163269042969,
325
+ "eval_loss": 0.6908969879150391,
326
+ "eval_pred_label": 923.3174438476562,
327
+ "eval_rewards/accuracies": 0.3531745970249176,
328
+ "eval_rewards/chosen": -0.06251893937587738,
329
+ "eval_rewards/margins": 0.07851671427488327,
330
+ "eval_rewards/rejected": -0.14103564620018005,
331
+ "eval_runtime": 247.8241,
332
+ "eval_samples_per_second": 8.07,
333
+ "eval_steps_per_second": 0.254,
334
+ "eval_use_label": 5140.6826171875,
335
  "step": 300
336
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  {
338
  "epoch": 0.33,
339
+ "grad_norm": 0.9140625,
340
  "learning_rate": 4.206963828813555e-06,
341
+ "logits/chosen": -2.065279483795166,
342
+ "logits/rejected": -2.0684821605682373,
343
+ "logps/chosen": -72.58639526367188,
344
+ "logps/rejected": -89.45655822753906,
345
+ "loss": 0.6899,
346
+ "pred_label": 1033.7874755859375,
347
+ "rewards/accuracies": 0.3125,
348
+ "rewards/chosen": -0.11120834201574326,
349
+ "rewards/margins": 0.0645986869931221,
350
+ "rewards/rejected": -0.17580702900886536,
351
  "step": 320,
352
+ "use_label": 5440.21240234375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
353
  },
354
  {
355
  "epoch": 0.36,
356
+ "grad_norm": 0.56640625,
357
  "learning_rate": 4.068915207986931e-06,
358
+ "logits/chosen": -2.033398151397705,
359
+ "logits/rejected": -1.991502046585083,
360
+ "logps/chosen": -71.1894760131836,
361
+ "logps/rejected": -84.0774154663086,
362
+ "loss": 0.6917,
363
+ "pred_label": 1122.112548828125,
364
  "rewards/accuracies": 0.3375000059604645,
365
+ "rewards/chosen": -0.07950185984373093,
366
+ "rewards/margins": 0.08617939054965973,
367
+ "rewards/rejected": -0.16568127274513245,
368
  "step": 340,
369
+ "use_label": 5671.8876953125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  },
371
  {
372
  "epoch": 0.38,
373
+ "grad_norm": 0.84765625,
374
  "learning_rate": 3.922476253313921e-06,
375
+ "logits/chosen": -2.0358688831329346,
376
+ "logits/rejected": -2.0224781036376953,
377
+ "logps/chosen": -76.57051849365234,
378
+ "logps/rejected": -84.2589340209961,
379
+ "loss": 0.6914,
380
+ "pred_label": 1204.4124755859375,
381
+ "rewards/accuracies": 0.31562501192092896,
382
+ "rewards/chosen": -0.11715561151504517,
383
+ "rewards/margins": 0.07723374664783478,
384
+ "rewards/rejected": -0.19438934326171875,
385
  "step": 360,
386
+ "use_label": 5909.58740234375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
387
  },
388
  {
389
  "epoch": 0.4,
390
+ "grad_norm": 0.55078125,
391
  "learning_rate": 3.768430099352445e-06,
392
+ "logits/chosen": -2.12782621383667,
393
+ "logits/rejected": -2.086026430130005,
394
+ "logps/chosen": -74.41622161865234,
395
+ "logps/rejected": -85.17180633544922,
396
+ "loss": 0.6918,
397
+ "pred_label": 1289.9375,
398
+ "rewards/accuracies": 0.3656249940395355,
399
+ "rewards/chosen": -0.07592298835515976,
400
+ "rewards/margins": 0.08457346260547638,
401
+ "rewards/rejected": -0.16049645841121674,
402
  "step": 380,
403
+ "use_label": 6144.0625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
  },
405
  {
406
  "epoch": 0.42,
407
+ "grad_norm": 0.73046875,
408
  "learning_rate": 3.607600562872785e-06,
409
+ "logits/chosen": -2.126784086227417,
410
+ "logits/rejected": -2.1261298656463623,
411
+ "logps/chosen": -83.82131958007812,
412
+ "logps/rejected": -86.00455474853516,
413
+ "loss": 0.6906,
414
+ "pred_label": 1373.137451171875,
415
+ "rewards/accuracies": 0.3375000059604645,
416
+ "rewards/chosen": -0.05874443054199219,
417
+ "rewards/margins": 0.06775099784135818,
418
+ "rewards/rejected": -0.12649545073509216,
419
  "step": 400,
420
+ "use_label": 6380.8623046875
421
  },
422
  {
423
  "epoch": 0.42,
424
+ "eval_logits/chosen": -2.0480618476867676,
425
+ "eval_logits/rejected": -2.0248324871063232,
426
+ "eval_logps/chosen": -75.26866149902344,
427
+ "eval_logps/rejected": -90.80635070800781,
428
+ "eval_loss": 0.6920759081840515,
429
+ "eval_pred_label": 1472.5714111328125,
430
+ "eval_rewards/accuracies": 0.3511904776096344,
431
+ "eval_rewards/chosen": -0.06368114054203033,
432
+ "eval_rewards/margins": 0.09040173143148422,
433
+ "eval_rewards/rejected": -0.15408287942409515,
434
+ "eval_runtime": 248.0088,
435
+ "eval_samples_per_second": 8.064,
436
+ "eval_steps_per_second": 0.254,
437
+ "eval_use_label": 6695.4287109375,
438
  "step": 400
439
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  {
441
  "epoch": 0.44,
442
+ "grad_norm": 0.78515625,
443
  "learning_rate": 3.4408477372034743e-06,
444
+ "logits/chosen": -2.055358409881592,
445
+ "logits/rejected": -2.068175792694092,
446
+ "logps/chosen": -70.47552490234375,
447
+ "logps/rejected": -79.02010345458984,
448
+ "loss": 0.6903,
449
+ "pred_label": 1589.0374755859375,
450
+ "rewards/accuracies": 0.3656249940395355,
451
+ "rewards/chosen": -0.06399895995855331,
452
+ "rewards/margins": 0.0963120311498642,
453
+ "rewards/rejected": -0.16031098365783691,
454
  "step": 420,
455
+ "use_label": 6988.96240234375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  },
457
  {
458
  "epoch": 0.46,
459
+ "grad_norm": 0.95703125,
460
  "learning_rate": 3.269063392575352e-06,
461
+ "logits/chosen": -2.0893940925598145,
462
+ "logits/rejected": -2.09212589263916,
463
+ "logps/chosen": -85.68560028076172,
464
+ "logps/rejected": -87.41291809082031,
465
+ "loss": 0.6912,
466
+ "pred_label": 1667.6875,
467
  "rewards/accuracies": 0.33125001192092896,
468
+ "rewards/chosen": -0.13728377223014832,
469
+ "rewards/margins": 0.07875251770019531,
470
+ "rewards/rejected": -0.21603628993034363,
471
  "step": 440,
472
+ "use_label": 7230.3125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
473
  },
474
  {
475
  "epoch": 0.48,
476
+ "grad_norm": 0.53515625,
477
  "learning_rate": 3.09316620706208e-06,
478
+ "logits/chosen": -2.079465389251709,
479
+ "logits/rejected": -2.091001033782959,
480
+ "logps/chosen": -73.67254638671875,
481
+ "logps/rejected": -81.05415344238281,
482
+ "loss": 0.6916,
483
+ "pred_label": 1751.75,
484
+ "rewards/accuracies": 0.30000001192092896,
485
+ "rewards/chosen": -0.0876312330365181,
486
+ "rewards/margins": 0.08376732468605042,
487
+ "rewards/rejected": -0.17139855027198792,
488
  "step": 460,
489
+ "use_label": 7466.25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
490
  },
491
  {
492
  "epoch": 0.5,
493
+ "grad_norm": 0.69921875,
494
  "learning_rate": 2.91409685362137e-06,
495
+ "logits/chosen": -2.0379364490509033,
496
+ "logits/rejected": -2.0492634773254395,
497
+ "logps/chosen": -77.06828308105469,
498
+ "logps/rejected": -89.38865661621094,
499
+ "loss": 0.6912,
500
+ "pred_label": 1832.6500244140625,
501
+ "rewards/accuracies": 0.36250001192092896,
502
+ "rewards/chosen": -0.06041146069765091,
503
+ "rewards/margins": 0.10216375440359116,
504
+ "rewards/rejected": -0.16257521510124207,
505
  "step": 480,
506
+ "use_label": 7705.35009765625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  },
508
  {
509
  "epoch": 0.52,
510
+ "grad_norm": 0.86328125,
511
  "learning_rate": 2.7328129695107205e-06,
512
+ "logits/chosen": -2.031346082687378,
513
+ "logits/rejected": -2.0272762775421143,
514
+ "logps/chosen": -79.55888366699219,
515
+ "logps/rejected": -84.47586822509766,
516
+ "loss": 0.6903,
517
+ "pred_label": 1919.5374755859375,
518
+ "rewards/accuracies": 0.36250001192092896,
519
+ "rewards/chosen": -0.08177755773067474,
520
+ "rewards/margins": 0.08017835766077042,
521
+ "rewards/rejected": -0.16195592284202576,
522
  "step": 500,
523
+ "use_label": 7938.46240234375
524
  },
525
  {
526
  "epoch": 0.52,
527
+ "eval_logits/chosen": -2.0070507526397705,
528
+ "eval_logits/rejected": -1.9800992012023926,
529
+ "eval_logps/chosen": -76.36968231201172,
530
+ "eval_logps/rejected": -92.65614318847656,
531
+ "eval_loss": 0.6914148926734924,
532
+ "eval_pred_label": 2025.793701171875,
533
+ "eval_rewards/accuracies": 0.3492063581943512,
534
+ "eval_rewards/chosen": -0.07469133287668228,
535
+ "eval_rewards/margins": 0.09788943827152252,
536
+ "eval_rewards/rejected": -0.1725807636976242,
537
+ "eval_runtime": 247.8554,
538
+ "eval_samples_per_second": 8.069,
539
+ "eval_steps_per_second": 0.254,
540
+ "eval_use_label": 8246.2060546875,
541
  "step": 500
542
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  {
544
  "epoch": 0.54,
545
+ "grad_norm": 0.78125,
546
  "learning_rate": 2.5502840349805074e-06,
547
+ "logits/chosen": -2.026449203491211,
548
+ "logits/rejected": -2.0701510906219482,
549
+ "logps/chosen": -75.1209487915039,
550
+ "logps/rejected": -88.01356506347656,
551
+ "loss": 0.6913,
552
+ "pred_label": 2148.887451171875,
553
+ "rewards/accuracies": 0.3531250059604645,
554
+ "rewards/chosen": -0.06801941990852356,
555
+ "rewards/margins": 0.09691040217876434,
556
+ "rewards/rejected": -0.1649298369884491,
557
  "step": 520,
558
+ "use_label": 8533.1123046875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
  },
560
  {
561
  "epoch": 0.57,
562
+ "grad_norm": 1.09375,
563
  "learning_rate": 2.367486188632446e-06,
564
+ "logits/chosen": -2.0245327949523926,
565
+ "logits/rejected": -2.0479135513305664,
566
+ "logps/chosen": -84.60169219970703,
567
+ "logps/rejected": -90.6330795288086,
568
+ "loss": 0.692,
569
+ "pred_label": 2235.550048828125,
570
+ "rewards/accuracies": 0.359375,
571
+ "rewards/chosen": -0.09091995656490326,
572
+ "rewards/margins": 0.11123095452785492,
573
+ "rewards/rejected": -0.20215091109275818,
574
  "step": 540,
575
+ "use_label": 8766.4501953125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
576
  },
577
  {
578
  "epoch": 0.59,
579
+ "grad_norm": 0.75390625,
580
  "learning_rate": 2.1853970071701415e-06,
581
+ "logits/chosen": -2.0177600383758545,
582
+ "logits/rejected": -2.016798257827759,
583
+ "logps/chosen": -78.94650268554688,
584
+ "logps/rejected": -80.36412811279297,
585
+ "loss": 0.6917,
586
+ "pred_label": 2319.53759765625,
587
+ "rewards/accuracies": 0.2874999940395355,
588
+ "rewards/chosen": -0.10138510167598724,
589
+ "rewards/margins": 0.06911652535200119,
590
+ "rewards/rejected": -0.17050163447856903,
591
  "step": 560,
592
+ "use_label": 9002.462890625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  },
594
  {
595
  "epoch": 0.61,
596
+ "grad_norm": 0.71875,
597
  "learning_rate": 2.00499027745888e-06,
598
+ "logits/chosen": -2.054065704345703,
599
+ "logits/rejected": -2.0555384159088135,
600
+ "logps/chosen": -80.3529281616211,
601
+ "logps/rejected": -95.12947082519531,
602
+ "loss": 0.6919,
603
+ "pred_label": 2401.675048828125,
604
+ "rewards/accuracies": 0.359375,
605
+ "rewards/chosen": -0.09597108513116837,
606
+ "rewards/margins": 0.09131233394145966,
607
+ "rewards/rejected": -0.18728342652320862,
608
  "step": 580,
609
+ "use_label": 9240.3251953125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
  },
611
  {
612
  "epoch": 0.63,
613
+ "grad_norm": 0.76171875,
614
  "learning_rate": 1.8272307888529276e-06,
615
+ "logits/chosen": -2.059126377105713,
616
+ "logits/rejected": -2.099806547164917,
617
+ "logps/chosen": -89.58797454833984,
618
+ "logps/rejected": -108.6166000366211,
619
+ "loss": 0.6903,
620
+ "pred_label": 2492.9375,
621
+ "rewards/accuracies": 0.41874998807907104,
622
+ "rewards/chosen": -0.12580521404743195,
623
+ "rewards/margins": 0.10241512209177017,
624
+ "rewards/rejected": -0.22822031378746033,
625
  "step": 600,
626
+ "use_label": 9469.0625
627
  },
628
  {
629
  "epoch": 0.63,
630
+ "eval_logits/chosen": -1.9870026111602783,
631
+ "eval_logits/rejected": -1.960112452507019,
632
+ "eval_logps/chosen": -78.95431518554688,
633
+ "eval_logps/rejected": -95.86695861816406,
634
+ "eval_loss": 0.6917396187782288,
635
+ "eval_pred_label": 2603.9365234375,
636
+ "eval_rewards/accuracies": 0.3551587164402008,
637
+ "eval_rewards/chosen": -0.1005377396941185,
638
+ "eval_rewards/margins": 0.104151152074337,
639
+ "eval_rewards/rejected": -0.2046888917684555,
640
+ "eval_runtime": 247.9642,
641
+ "eval_samples_per_second": 8.066,
642
  "eval_steps_per_second": 0.254,
643
+ "eval_use_label": 9772.0634765625,
644
  "step": 600
645
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
  {
647
  "epoch": 0.65,
648
+ "grad_norm": 0.5859375,
649
  "learning_rate": 1.6530691736402317e-06,
650
+ "logits/chosen": -1.9752880334854126,
651
+ "logits/rejected": -2.011981964111328,
652
+ "logps/chosen": -69.71615600585938,
653
+ "logps/rejected": -95.88337707519531,
654
+ "loss": 0.6918,
655
+ "pred_label": 2726.324951171875,
656
+ "rewards/accuracies": 0.34687501192092896,
657
+ "rewards/chosen": -0.09408678859472275,
658
+ "rewards/margins": 0.09362435340881348,
659
+ "rewards/rejected": -0.18771114945411682,
660
  "step": 620,
661
+ "use_label": 10059.6748046875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
662
  },
663
  {
664
  "epoch": 0.67,
665
+ "grad_norm": 0.73046875,
666
  "learning_rate": 1.4834368231970922e-06,
667
+ "logits/chosen": -2.0288071632385254,
668
+ "logits/rejected": -2.0409998893737793,
669
+ "logps/chosen": -82.56907653808594,
670
+ "logps/rejected": -90.75765228271484,
671
+ "loss": 0.6894,
672
+ "pred_label": 2805.512451171875,
673
+ "rewards/accuracies": 0.36250001192092896,
674
+ "rewards/chosen": -0.10210500657558441,
675
+ "rewards/margins": 0.10695278644561768,
676
+ "rewards/rejected": -0.2090577781200409,
677
  "step": 640,
678
+ "use_label": 10300.4873046875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
679
  },
680
  {
681
  "epoch": 0.69,
682
+ "grad_norm": 0.5625,
683
  "learning_rate": 1.3192409070404582e-06,
684
+ "logits/chosen": -2.055405855178833,
685
+ "logits/rejected": -2.0071816444396973,
686
+ "logps/chosen": -77.25361633300781,
687
+ "logps/rejected": -88.34065246582031,
688
+ "loss": 0.6915,
689
+ "pred_label": 2899.9375,
690
+ "rewards/accuracies": 0.34687501192092896,
691
+ "rewards/chosen": -0.11595650017261505,
692
+ "rewards/margins": 0.0952102541923523,
693
+ "rewards/rejected": -0.21116676926612854,
694
  "step": 660,
695
+ "use_label": 10526.0625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
696
  },
697
  {
698
  "epoch": 0.71,
699
+ "grad_norm": 0.67578125,
700
  "learning_rate": 1.1613595214152713e-06,
701
+ "logits/chosen": -2.056795597076416,
702
+ "logits/rejected": -2.071035861968994,
703
+ "logps/chosen": -88.15283203125,
704
+ "logps/rejected": -96.39839172363281,
705
+ "loss": 0.6918,
706
+ "pred_label": 2978.0625,
707
+ "rewards/accuracies": 0.3499999940395355,
708
+ "rewards/chosen": -0.12273094803094864,
709
+ "rewards/margins": 0.09404005855321884,
710
+ "rewards/rejected": -0.2167709767818451,
711
  "step": 680,
712
+ "use_label": 10767.9375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
  },
714
  {
715
  "epoch": 0.73,
716
+ "grad_norm": 0.74609375,
717
  "learning_rate": 1.0106369933615043e-06,
718
+ "logits/chosen": -2.0782313346862793,
719
+ "logits/rejected": -2.0467371940612793,
720
+ "logps/chosen": -97.93621826171875,
721
+ "logps/rejected": -106.91497802734375,
722
+ "loss": 0.6917,
723
+ "pred_label": 3075.71240234375,
724
+ "rewards/accuracies": 0.3687500059604645,
725
+ "rewards/chosen": -0.1391007900238037,
726
+ "rewards/margins": 0.10766571760177612,
727
+ "rewards/rejected": -0.24676652252674103,
728
  "step": 700,
729
+ "use_label": 10990.287109375
730
  },
731
  {
732
  "epoch": 0.73,
733
+ "eval_logits/chosen": -1.9658821821212769,
734
+ "eval_logits/rejected": -1.9401167631149292,
735
+ "eval_logps/chosen": -80.06806182861328,
736
+ "eval_logps/rejected": -97.64107513427734,
737
+ "eval_loss": 0.6917343735694885,
738
+ "eval_pred_label": 3195.22216796875,
739
+ "eval_rewards/accuracies": 0.3511904776096344,
740
+ "eval_rewards/chosen": -0.11167524009943008,
741
+ "eval_rewards/margins": 0.1107548326253891,
742
+ "eval_rewards/rejected": -0.2224300652742386,
743
+ "eval_runtime": 247.943,
744
+ "eval_samples_per_second": 8.066,
745
  "eval_steps_per_second": 0.254,
746
+ "eval_use_label": 11284.77734375,
747
  "step": 700
748
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
749
  {
750
  "epoch": 0.75,
751
+ "grad_norm": 0.72265625,
752
  "learning_rate": 8.678793653740633e-07,
753
+ "logits/chosen": -2.015249729156494,
754
+ "logits/rejected": -2.0358498096466064,
755
+ "logps/chosen": -70.9017562866211,
756
+ "logps/rejected": -86.4397201538086,
757
+ "loss": 0.6908,
758
+ "pred_label": 3306.39990234375,
759
+ "rewards/accuracies": 0.3187499940395355,
760
+ "rewards/chosen": -0.10931293666362762,
761
+ "rewards/margins": 0.0925455391407013,
762
+ "rewards/rejected": -0.20185847580432892,
763
  "step": 720,
764
+ "use_label": 11583.599609375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
765
  },
766
  {
767
  "epoch": 0.77,
768
+ "grad_norm": 0.83203125,
769
  "learning_rate": 7.338500848029603e-07,
770
+ "logits/chosen": -2.01334810256958,
771
+ "logits/rejected": -2.0296788215637207,
772
+ "logps/chosen": -74.19635772705078,
773
+ "logps/rejected": -83.99024200439453,
774
+ "loss": 0.6911,
775
+ "pred_label": 3386.16259765625,
776
+ "rewards/accuracies": 0.32499998807907104,
777
+ "rewards/chosen": -0.08706559240818024,
778
+ "rewards/margins": 0.11473299562931061,
779
+ "rewards/rejected": -0.20179858803749084,
780
  "step": 740,
781
+ "use_label": 11823.837890625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  },
783
  {
784
  "epoch": 0.8,
785
+ "grad_norm": 0.66015625,
786
  "learning_rate": 6.092659210462232e-07,
787
+ "logits/chosen": -2.052433967590332,
788
+ "logits/rejected": -2.060997724533081,
789
+ "logps/chosen": -76.93110656738281,
790
+ "logps/rejected": -97.30107879638672,
791
+ "loss": 0.6904,
792
+ "pred_label": 3466.5,
793
+ "rewards/accuracies": 0.33125001192092896,
794
+ "rewards/chosen": -0.11182014644145966,
795
+ "rewards/margins": 0.07981495559215546,
796
+ "rewards/rejected": -0.1916351020336151,
797
  "step": 760,
798
+ "use_label": 12063.5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
799
  },
800
  {
801
  "epoch": 0.82,
802
+ "grad_norm": 0.859375,
803
  "learning_rate": 4.947931323697983e-07,
804
+ "logits/chosen": -2.032320737838745,
805
+ "logits/rejected": -2.047227144241333,
806
+ "logps/chosen": -89.46810913085938,
807
+ "logps/rejected": -95.58660125732422,
808
+ "loss": 0.6913,
809
+ "pred_label": 3558.875,
810
+ "rewards/accuracies": 0.375,
811
+ "rewards/chosen": -0.11294672638177872,
812
+ "rewards/margins": 0.11753211170434952,
813
+ "rewards/rejected": -0.23047883808612823,
814
  "step": 780,
815
+ "use_label": 12291.125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
  },
817
  {
818
  "epoch": 0.84,
819
+ "grad_norm": 0.74609375,
820
  "learning_rate": 3.910439028537638e-07,
821
+ "logits/chosen": -2.010045289993286,
822
+ "logits/rejected": -1.989505410194397,
823
+ "logps/chosen": -70.47514343261719,
824
+ "logps/rejected": -75.11082458496094,
825
+ "loss": 0.6912,
826
+ "pred_label": 3649.22509765625,
827
+ "rewards/accuracies": 0.3656249940395355,
828
+ "rewards/chosen": -0.08034199476242065,
829
+ "rewards/margins": 0.0995674580335617,
830
+ "rewards/rejected": -0.17990948259830475,
831
  "step": 800,
832
+ "use_label": 12520.775390625
833
  },
834
  {
835
  "epoch": 0.84,
836
+ "eval_logits/chosen": -1.9421576261520386,
837
+ "eval_logits/rejected": -1.9144233465194702,
838
+ "eval_logps/chosen": -77.5874252319336,
839
+ "eval_logps/rejected": -95.20885467529297,
840
+ "eval_loss": 0.6917100548744202,
841
+ "eval_pred_label": 3757.174560546875,
842
  "eval_rewards/accuracies": 0.363095223903656,
843
+ "eval_rewards/chosen": -0.08686873316764832,
844
+ "eval_rewards/margins": 0.11123905330896378,
845
+ "eval_rewards/rejected": -0.19810780882835388,
846
+ "eval_runtime": 247.8932,
847
+ "eval_samples_per_second": 8.068,
848
  "eval_steps_per_second": 0.254,
849
+ "eval_use_label": 12826.8251953125,
850
  "step": 800
851
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
852
  {
853
  "epoch": 0.86,
854
+ "grad_norm": 0.828125,
855
  "learning_rate": 2.98573068519539e-07,
856
+ "logits/chosen": -2.035728931427002,
857
+ "logits/rejected": -2.029679775238037,
858
+ "logps/chosen": -74.97032165527344,
859
+ "logps/rejected": -84.2763900756836,
860
+ "loss": 0.6908,
861
+ "pred_label": 3872.199951171875,
862
+ "rewards/accuracies": 0.3343749940395355,
863
+ "rewards/chosen": -0.1004786491394043,
864
+ "rewards/margins": 0.08142165094614029,
865
+ "rewards/rejected": -0.181900292634964,
866
  "step": 820,
867
+ "use_label": 13121.7998046875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
868
  },
869
  {
870
  "epoch": 0.88,
871
+ "grad_norm": 0.6953125,
872
  "learning_rate": 2.178751501463036e-07,
873
+ "logits/chosen": -2.0276803970336914,
874
+ "logits/rejected": -2.0149848461151123,
875
+ "logps/chosen": -66.70552062988281,
876
+ "logps/rejected": -70.63726806640625,
877
  "loss": 0.6915,
878
+ "pred_label": 3954.60009765625,
879
+ "rewards/accuracies": 0.28437501192092896,
880
+ "rewards/chosen": -0.08035041391849518,
881
+ "rewards/margins": 0.07462439686059952,
882
+ "rewards/rejected": -0.1549748182296753,
883
  "step": 840,
884
+ "use_label": 13359.400390625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
885
  },
886
  {
887
  "epoch": 0.9,
888
+ "grad_norm": 0.7578125,
889
  "learning_rate": 1.4938170864468636e-07,
890
+ "logits/chosen": -2.048083543777466,
891
+ "logits/rejected": -2.0321922302246094,
892
+ "logps/chosen": -90.8042221069336,
893
+ "logps/rejected": -100.8233413696289,
894
+ "loss": 0.69,
895
+ "pred_label": 4041.72509765625,
896
+ "rewards/accuracies": 0.40625,
897
+ "rewards/chosen": -0.0809466689825058,
898
+ "rewards/margins": 0.1332779824733734,
899
+ "rewards/rejected": -0.2142246663570404,
900
  "step": 860,
901
+ "use_label": 13592.275390625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
  },
903
  {
904
  "epoch": 0.92,
905
+ "grad_norm": 0.5546875,
906
  "learning_rate": 9.345903713082305e-08,
907
+ "logits/chosen": -2.047487735748291,
908
+ "logits/rejected": -2.034466505050659,
909
+ "logps/chosen": -81.69231414794922,
910
+ "logps/rejected": -101.5263442993164,
911
+ "loss": 0.6915,
912
+ "pred_label": 4142.625,
913
+ "rewards/accuracies": 0.38749998807907104,
914
+ "rewards/chosen": -0.09660721570253372,
915
+ "rewards/margins": 0.13364934921264648,
916
+ "rewards/rejected": -0.23025652766227722,
917
  "step": 880,
918
+ "use_label": 13811.375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
919
  },
920
  {
921
  "epoch": 0.94,
922
+ "grad_norm": 0.7578125,
923
  "learning_rate": 5.0406202043228604e-08,
924
+ "logits/chosen": -1.9304163455963135,
925
+ "logits/rejected": -1.9657026529312134,
926
+ "logps/chosen": -75.30284118652344,
927
+ "logps/rejected": -99.71704864501953,
928
+ "loss": 0.6914,
929
+ "pred_label": 4235.9248046875,
930
+ "rewards/accuracies": 0.3375000059604645,
931
+ "rewards/chosen": -0.08683101832866669,
932
+ "rewards/margins": 0.10066400468349457,
933
+ "rewards/rejected": -0.18749502301216125,
934
  "step": 900,
935
+ "use_label": 14038.0751953125
936
  },
937
  {
938
  "epoch": 0.94,
939
+ "eval_logits/chosen": -1.939072847366333,
940
+ "eval_logits/rejected": -1.9112603664398193,
941
+ "eval_logps/chosen": -77.5274658203125,
942
+ "eval_logps/rejected": -95.22908020019531,
943
+ "eval_loss": 0.6917905211448669,
944
+ "eval_pred_label": 4352.28564453125,
945
+ "eval_rewards/accuracies": 0.3571428656578064,
946
+ "eval_rewards/chosen": -0.08626923710107803,
947
+ "eval_rewards/margins": 0.1120409369468689,
948
+ "eval_rewards/rejected": -0.19831016659736633,
949
+ "eval_runtime": 247.7794,
950
+ "eval_samples_per_second": 8.072,
951
  "eval_steps_per_second": 0.254,
952
+ "eval_use_label": 14335.7138671875,
953
  "step": 900
954
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
955
  {
956
  "epoch": 0.96,
957
+ "grad_norm": 0.80078125,
958
  "learning_rate": 2.0453443778310766e-08,
959
+ "logits/chosen": -1.9801095724105835,
960
+ "logits/rejected": -1.9714418649673462,
961
+ "logps/chosen": -63.8930778503418,
962
+ "logps/rejected": -85.15528869628906,
963
+ "loss": 0.6906,
964
+ "pred_label": 4473.8125,
965
+ "rewards/accuracies": 0.31562501192092896,
966
+ "rewards/chosen": -0.06585933268070221,
967
+ "rewards/margins": 0.11039040982723236,
968
+ "rewards/rejected": -0.17624975740909576,
969
  "step": 920,
970
+ "use_label": 14624.1875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
971
  },
972
  {
973
  "epoch": 0.98,
974
+ "grad_norm": 0.8359375,
975
  "learning_rate": 3.760945397705828e-09,
976
+ "logits/chosen": -1.9589160680770874,
977
+ "logits/rejected": -1.9971154928207397,
978
+ "logps/chosen": -74.0462646484375,
979
+ "logps/rejected": -91.64708709716797,
980
+ "loss": 0.6913,
981
+ "pred_label": 4558.71240234375,
982
+ "rewards/accuracies": 0.32499998807907104,
983
+ "rewards/chosen": -0.0799408107995987,
984
+ "rewards/margins": 0.10116855055093765,
985
+ "rewards/rejected": -0.18110935389995575,
986
  "step": 940,
987
+ "use_label": 14859.287109375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
988
  },
989
  {
990
  "epoch": 1.0,
991
  "step": 955,
992
  "total_flos": 0.0,
993
+ "train_loss": 0.6906769273168754,
994
+ "train_runtime": 20027.4031,
995
  "train_samples_per_second": 3.053,
996
  "train_steps_per_second": 0.048
997
  }
998
  ],
999
+ "logging_steps": 20,
1000
  "max_steps": 955,
1001
  "num_input_tokens_seen": 0,
1002
  "num_train_epochs": 1,
1003
+ "save_steps": 100,
1004
  "total_flos": 0.0,
1005
  "train_batch_size": 4,
1006
  "trial_name": null,