Marcus2112
committed on
Upload folder using huggingface_hub
- checkpoint-1024/config.json +32 -0
- checkpoint-1024/generation_config.json +6 -0
- checkpoint-1024/model.safetensors +3 -0
- checkpoint-1024/optimizer.pt +3 -0
- checkpoint-1024/rng_state.pth +3 -0
- checkpoint-1024/scheduler.pt +3 -0
- checkpoint-1024/trainer_state.json +827 -0
- checkpoint-1024/training_args.bin +3 -0
- config.json +32 -0
- generation_config.json +6 -0
- model.safetensors +3 -0
- training_args.bin +3 -0
checkpoint-1024/config.json
ADDED
@@ -0,0 +1,32 @@
+{
+  "_name_or_path": "/vol/tmp/koppelmm/pythia160m_dedup_untrained",
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.1,
+  "eos_token_id": 0,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "partial_rotary_factor": 0.25,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.25,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.2",
+  "use_cache": true,
+  "use_parallel_residual": true,
+  "vocab_size": 50304
+}
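This config describes a Pythia-160M-shaped GPT-NeoX model (12 layers, 12 heads, hidden size 768, vocab 50304). A minimal sketch of loading the checkpoint with transformers; the local path "checkpoint-1024" is an assumption, substitute the repo id or wherever this folder was downloaded:

```python
from transformers import AutoConfig, GPTNeoXForCausalLM

# Assumed local copy of the checkpoint-1024 folder from this commit.
config = AutoConfig.from_pretrained("checkpoint-1024")
model = GPTNeoXForCausalLM.from_pretrained("checkpoint-1024")

# Spot-check a few fields against the config.json shown above.
assert config.hidden_size == 768 and config.num_hidden_layers == 12

n_params = sum(p.numel() for p in model.parameters())
print(n_params)  # roughly 162M; at float32 this matches the ~649 MB model.safetensors
```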
checkpoint-1024/generation_config.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.46.2"
+}
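The generation config only pins the bos/eos token ids and is picked up by generate() automatically. Note that the upload ships no tokenizer files; assuming this is a Pythia-160M-deduped derivative (as the _name_or_path suggests), the base model's tokenizer can stand in. Continuing from the loading sketch above:

```python
from transformers import AutoTokenizer

# Assumption: the checkpoint uses the standard Pythia tokenizer,
# which is not part of this upload.
tok = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m-deduped")

inputs = tok("The quick brown fox", return_tensors="pt")
# bos/eos token id 0 are supplied by generation_config.json above.
out = model.generate(**inputs, max_new_tokens=20)
print(tok.decode(out[0]))
```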
checkpoint-1024/model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5f335ef94c0ce86cfda59e84cdd9bbef1b433fa1974f80e4b56d4498d8c3c6
+size 649308728
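This file (like the other binaries in this commit) is stored as a Git LFS pointer: the diff records only the spec version, the sha256 oid, and the byte size, while the payload itself is fetched at download time. A small sketch for verifying a downloaded file against its pointer; the local path is an assumption:

```python
import hashlib
import os

def verify_lfs_pointer(path: str, oid: str, size: int) -> bool:
    """Check that a downloaded file matches a Git LFS pointer's sha256 oid and size."""
    if os.path.getsize(path) != size:
        return False
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest() == oid

print(verify_lfs_pointer(
    "checkpoint-1024/model.safetensors",  # assumed local download path
    "9c5f335ef94c0ce86cfda59e84cdd9bbef1b433fa1974f80e4b56d4498d8c3c6",
    649308728,
))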
checkpoint-1024/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:82ea90bec2bbb7210ee07df2db4735d39bd992d4cb6e7f1989788717eedf65f1
+size 1298707258
checkpoint-1024/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:726ae76b86900d8abab60df7866887a207be06427ba6369a4451cd2686c4ed71
+size 14244
checkpoint-1024/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8ebd1a7d23da6a4b09d2235e23699a761e868e0b6ad2f2892a33dea66611ced6
+size 1948045434
checkpoint-1024/trainer_state.json
ADDED
@@ -0,0 +1,827 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.119799451652303,
+  "eval_steps": 100,
+  "global_step": 1024,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.010932788971549125,
+      "grad_norm": 167.07713317871094,
+      "learning_rate": 0.0005999985601583006,
+      "loss": 9.6698,
+      "step": 10
+    },
+    {
+      "epoch": 0.02186557794309825,
+      "grad_norm": 83.92709350585938,
+      "learning_rate": 0.0005998257958771109,
+      "loss": 8.2484,
+      "step": 20
+    },
+    {
+      "epoch": 0.032798366914647374,
+      "grad_norm": 127.91200256347656,
+      "learning_rate": 0.0005993652532642609,
+      "loss": 7.6452,
+      "step": 30
+    },
+    {
+      "epoch": 0.0437311558861965,
+      "grad_norm": 97.33670043945312,
+      "learning_rate": 0.0005986173743570491,
+      "loss": 7.4548,
+      "step": 40
+    },
+    {
+      "epoch": 0.054663944857745624,
+      "grad_norm": 127.0005874633789,
+      "learning_rate": 0.0005975828769834513,
+      "loss": 7.3226,
+      "step": 50
+    },
+    {
+      "epoch": 0.06559673382929475,
+      "grad_norm": 104.47633361816406,
+      "learning_rate": 0.0005962627540731365,
+      "loss": 7.204,
+      "step": 60
+    },
+    {
+      "epoch": 0.07652952280084388,
+      "grad_norm": 164.4477081298828,
+      "learning_rate": 0.0005946582727044349,
+      "loss": 7.1105,
+      "step": 70
+    },
+    {
+      "epoch": 0.087462311772393,
+      "grad_norm": 126.8350601196289,
+      "learning_rate": 0.0005927709728881719,
+      "loss": 7.0511,
+      "step": 80
+    },
+    {
+      "epoch": 0.09839510074394213,
+      "grad_norm": 158.55856323242188,
+      "learning_rate": 0.0005906026660895383,
+      "loss": 7.0642,
+      "step": 90
+    },
+    {
+      "epoch": 0.10932788971549125,
+      "grad_norm": 126.1555404663086,
+      "learning_rate": 0.0005881554334894116,
+      "loss": 7.031,
+      "step": 100
+    },
+    {
+      "epoch": 0.10932788971549125,
+      "eval_loss": 7.01555061340332,
+      "eval_runtime": 79.0984,
+      "eval_samples_per_second": 118.409,
+      "eval_steps_per_second": 14.804,
+      "step": 100
+    },
+    {
+      "epoch": 0.12026067868704038,
+      "grad_norm": 108.58393096923828,
+      "learning_rate": 0.0005854316239868012,
+      "loss": 7.0123,
+      "step": 110
+    },
+    {
+      "epoch": 0.1311934676585895,
+      "grad_norm": 178.0326690673828,
+      "learning_rate": 0.0005824338519443309,
+      "loss": 6.9897,
+      "step": 120
+    },
+    {
+      "epoch": 0.14212625663013861,
+      "grad_norm": 192.8655242919922,
+      "learning_rate": 0.0005791649946789259,
+      "loss": 7.0117,
+      "step": 130
+    },
+    {
+      "epoch": 0.15305904560168776,
+      "grad_norm": 143.3759002685547,
+      "learning_rate": 0.0005756281897001107,
+      "loss": 7.0073,
+      "step": 140
+    },
+    {
+      "epoch": 0.16399183457323688,
+      "grad_norm": 171.0679168701172,
+      "learning_rate": 0.0005718268316985698,
+      "loss": 6.9843,
+      "step": 150
+    },
+    {
+      "epoch": 0.174924623544786,
+      "grad_norm": 164.86534118652344,
+      "learning_rate": 0.0005677645692878606,
+      "loss": 7.0083,
+      "step": 160
+    },
+    {
+      "epoch": 0.1858574125163351,
+      "grad_norm": 125.85225677490234,
+      "learning_rate": 0.000563445301502407,
+      "loss": 7.02,
+      "step": 170
+    },
+    {
+      "epoch": 0.19679020148788426,
+      "grad_norm": 144.15589904785156,
+      "learning_rate": 0.0005588731740551344,
+      "loss": 6.9773,
+      "step": 180
+    },
+    {
+      "epoch": 0.20772299045943338,
+      "grad_norm": 108.05564880371094,
+      "learning_rate": 0.0005540525753583378,
+      "loss": 6.9632,
+      "step": 190
+    },
+    {
+      "epoch": 0.2186557794309825,
+      "grad_norm": 146.53924560546875,
+      "learning_rate": 0.0005489881323116018,
+      "loss": 6.929,
+      "step": 200
+    },
+    {
+      "epoch": 0.2186557794309825,
+      "eval_loss": 6.925621509552002,
+      "eval_runtime": 78.9467,
+      "eval_samples_per_second": 118.637,
+      "eval_steps_per_second": 14.833,
+      "step": 200
+    },
+    {
+      "epoch": 0.2295885684025316,
+      "grad_norm": 204.57968139648438,
+      "learning_rate": 0.0005436847058608189,
+      "loss": 6.9631,
+      "step": 210
+    },
+    {
+      "epoch": 0.24052135737408076,
+      "grad_norm": 171.31556701660156,
+      "learning_rate": 0.0005381473863325621,
+      "loss": 7.0389,
+      "step": 220
+    },
+    {
+      "epoch": 0.25145414634562985,
+      "grad_norm": 142.57449340820312,
+      "learning_rate": 0.0005323814885482963,
+      "loss": 6.967,
+      "step": 230
+    },
+    {
+      "epoch": 0.262386935317179,
+      "grad_norm": 119.19646453857422,
+      "learning_rate": 0.000526392546723115,
+      "loss": 6.9456,
+      "step": 240
+    },
+    {
+      "epoch": 0.27331972428872814,
+      "grad_norm": 153.62359619140625,
+      "learning_rate": 0.0005201863091538979,
+      "loss": 6.9686,
+      "step": 250
+    },
+    {
+      "epoch": 0.28425251326027723,
+      "grad_norm": 150.35699462890625,
+      "learning_rate": 0.000513768732701989,
+      "loss": 6.9846,
+      "step": 260
+    },
+    {
+      "epoch": 0.2951853022318264,
+      "grad_norm": 215.55368041992188,
+      "learning_rate": 0.0005071459770756929,
+      "loss": 6.9968,
+      "step": 270
+    },
+    {
+      "epoch": 0.3061180912033755,
+      "grad_norm": 107.55154418945312,
+      "learning_rate": 0.0005003243989180711,
+      "loss": 7.0033,
+      "step": 280
+    },
+    {
+      "epoch": 0.3170508801749246,
+      "grad_norm": 190.4154052734375,
+      "learning_rate": 0.0004933105457057203,
+      "loss": 6.9816,
+      "step": 290
+    },
+    {
+      "epoch": 0.32798366914647376,
+      "grad_norm": 159.7703094482422,
+      "learning_rate": 0.0004861111494643821,
+      "loss": 7.0486,
+      "step": 300
+    },
+    {
+      "epoch": 0.32798366914647376,
+      "eval_loss": 7.4869384765625,
+      "eval_runtime": 79.1717,
+      "eval_samples_per_second": 118.3,
+      "eval_steps_per_second": 14.791,
+      "step": 300
+    },
+    {
+      "epoch": 0.3389164581180229,
+      "grad_norm": 218.22604370117188,
+      "learning_rate": 0.0004794787611927562,
+      "loss": 7.2679,
+      "step": 310
+    },
+    {
+      "epoch": 0.349849247089572,
+      "grad_norm": 182.51431274414062,
+      "learning_rate": 0.0004719460124060748,
+      "loss": 7.1809,
+      "step": 320
+    },
+    {
+      "epoch": 0.36078203606112114,
+      "grad_norm": 137.0953826904297,
+      "learning_rate": 0.0004642482266637136,
+      "loss": 7.0417,
+      "step": 330
+    },
+    {
+      "epoch": 0.3717148250326702,
+      "grad_norm": 92.07840728759766,
+      "learning_rate": 0.0004563927924424775,
+      "loss": 6.9309,
+      "step": 340
+    },
+    {
+      "epoch": 0.38264761400421937,
+      "grad_norm": 147.35975646972656,
+      "learning_rate": 0.00044838724953309093,
+      "loss": 6.8844,
+      "step": 350
+    },
+    {
+      "epoch": 0.3935804029757685,
+      "grad_norm": 262.996337890625,
+      "learning_rate": 0.0004402392818033671,
+      "loss": 6.966,
+      "step": 360
+    },
+    {
+      "epoch": 0.4045131919473176,
+      "grad_norm": 155.3452606201172,
+      "learning_rate": 0.00043195670982308984,
+      "loss": 7.0715,
+      "step": 370
+    },
+    {
+      "epoch": 0.41544598091886675,
+      "grad_norm": 129.5069580078125,
+      "learning_rate": 0.00042354748335768664,
+      "loss": 7.0806,
+      "step": 380
+    },
+    {
+      "epoch": 0.4263787698904159,
+      "grad_norm": 92.96502685546875,
+      "learning_rate": 0.0004150196737378971,
+      "loss": 6.9999,
+      "step": 390
+    },
+    {
+      "epoch": 0.437311558861965,
+      "grad_norm": 120.41193389892578,
+      "learning_rate": 0.0004063814661127606,
+      "loss": 6.9339,
+      "step": 400
+    },
+    {
+      "epoch": 0.437311558861965,
+      "eval_loss": 6.931961536407471,
+      "eval_runtime": 78.8373,
+      "eval_samples_per_second": 118.802,
+      "eval_steps_per_second": 14.853,
+      "step": 400
+    },
+    {
+      "epoch": 0.44824434783351413,
+      "grad_norm": 188.7049560546875,
+      "learning_rate": 0.00039764115159335935,
+      "loss": 6.9242,
+      "step": 410
+    },
+    {
+      "epoch": 0.4591771368050632,
+      "grad_norm": 131.7518768310547,
+      "learning_rate": 0.0003888071192948565,
+      "loss": 6.9815,
+      "step": 420
+    },
+    {
+      "epoch": 0.47010992577661237,
+      "grad_norm": 247.91549682617188,
+      "learning_rate": 0.0003798878482844695,
+      "loss": 7.0838,
+      "step": 430
+    },
+    {
+      "epoch": 0.4810427147481615,
+      "grad_norm": 135.4517364501953,
+      "learning_rate": 0.000370891899443104,
+      "loss": 7.1813,
+      "step": 440
+    },
+    {
+      "epoch": 0.4919755037197106,
+      "grad_norm": 99.5172119140625,
+      "learning_rate": 0.00036182790724846315,
+      "loss": 7.1557,
+      "step": 450
+    },
+    {
+      "epoch": 0.5029082926912597,
+      "grad_norm": 165.1914825439453,
+      "learning_rate": 0.00035270457148751575,
+      "loss": 7.0382,
+      "step": 460
+    },
+    {
+      "epoch": 0.5138410816628088,
+      "grad_norm": 128.59959411621094,
+      "learning_rate": 0.00034353064890628107,
+      "loss": 7.0597,
+      "step": 470
+    },
+    {
+      "epoch": 0.524773870634358,
+      "grad_norm": 142.37147521972656,
+      "learning_rate": 0.00033431494480494175,
+      "loss": 7.092,
+      "step": 480
+    },
+    {
+      "epoch": 0.5357066596059071,
+      "grad_norm": 217.4059295654297,
+      "learning_rate": 0.0003250663045863544,
+      "loss": 7.0457,
+      "step": 490
+    },
+    {
+      "epoch": 0.5466394485774563,
+      "grad_norm": 125.81988525390625,
+      "learning_rate": 0.0003157936052660688,
+      "loss": 7.0112,
+      "step": 500
+    },
+    {
+      "epoch": 0.5466394485774563,
+      "eval_loss": 7.004736423492432,
+      "eval_runtime": 78.8041,
+      "eval_samples_per_second": 118.852,
+      "eval_steps_per_second": 14.86,
+      "step": 500
+    },
+    {
+      "epoch": 0.5575722375490054,
+      "grad_norm": 170.00523376464844,
+      "learning_rate": 0.0003065057469520046,
+      "loss": 7.0162,
+      "step": 510
+    },
+    {
+      "epoch": 0.5685050265205545,
+      "grad_norm": 216.81466674804688,
+      "learning_rate": 0.0002972116443019633,
+      "loss": 7.0584,
+      "step": 520
+    },
+    {
+      "epoch": 0.5794378154921036,
+      "grad_norm": 239.21087646484375,
+      "learning_rate": 0.0002879202179671755,
+      "loss": 7.1254,
+      "step": 530
+    },
+    {
+      "epoch": 0.5903706044636527,
+      "grad_norm": 190.0070343017578,
+      "learning_rate": 0.00027864038603009453,
+      "loss": 7.1717,
+      "step": 540
+    },
+    {
+      "epoch": 0.6013033934352019,
+      "grad_norm": 179.18785095214844,
+      "learning_rate": 0.00026938105544465745,
+      "loss": 7.1185,
+      "step": 550
+    },
+    {
+      "epoch": 0.612236182406751,
+      "grad_norm": 279.44781494140625,
+      "learning_rate": 0.0002601511134872255,
+      "loss": 7.0727,
+      "step": 560
+    },
+    {
+      "epoch": 0.6231689713783001,
+      "grad_norm": 227.90072631835938,
+      "learning_rate": 0.0002509594192264121,
+      "loss": 7.1088,
+      "step": 570
+    },
+    {
+      "epoch": 0.6341017603498492,
+      "grad_norm": 173.11819458007812,
+      "learning_rate": 0.0002418147950199862,
+      "loss": 7.0927,
+      "step": 580
+    },
+    {
+      "epoch": 0.6450345493213984,
+      "grad_norm": 164.40736389160156,
+      "learning_rate": 0.00023272601804700946,
+      "loss": 7.0701,
+      "step": 590
+    },
+    {
+      "epoch": 0.6559673382929475,
+      "grad_norm": 123.35533142089844,
+      "learning_rate": 0.0002237018118833387,
+      "loss": 7.0496,
+      "step": 600
+    },
+    {
+      "epoch": 0.6559673382929475,
+      "eval_loss": 7.052866458892822,
+      "eval_runtime": 78.8887,
+      "eval_samples_per_second": 118.724,
+      "eval_steps_per_second": 14.844,
+      "step": 600
+    },
+    {
+      "epoch": 0.6669001272644967,
+      "grad_norm": 225.67015075683594,
+      "learning_rate": 0.0002147508381285762,
+      "loss": 7.04,
+      "step": 610
+    },
+    {
+      "epoch": 0.6778329162360458,
+      "grad_norm": 140.2364501953125,
+      "learning_rate": 0.00020588168809250687,
+      "loss": 7.0902,
+      "step": 620
+    },
+    {
+      "epoch": 0.6887657052075948,
+      "grad_norm": 262.8550720214844,
+      "learning_rate": 0.00019710287454900033,
+      "loss": 7.1224,
+      "step": 630
+    },
+    {
+      "epoch": 0.699698494179144,
+      "grad_norm": 150.97813415527344,
+      "learning_rate": 0.00018842282356529402,
+      "loss": 7.1802,
+      "step": 640
+    },
+    {
+      "epoch": 0.7106312831506931,
+      "grad_norm": 452.73431396484375,
+      "learning_rate": 0.00017984986641449754,
+      "loss": 7.1497,
+      "step": 650
+    },
+    {
+      "epoch": 0.7215640721222423,
+      "grad_norm": 138.37220764160156,
+      "learning_rate": 0.00017139223157908368,
+      "loss": 7.1715,
+      "step": 660
+    },
+    {
+      "epoch": 0.7324968610937914,
+      "grad_norm": 144.21133422851562,
+      "learning_rate": 0.00016305803685303906,
+      "loss": 7.1458,
+      "step": 670
+    },
+    {
+      "epoch": 0.7434296500653405,
+      "grad_norm": 142.4859161376953,
+      "learning_rate": 0.00015485528155025473,
+      "loss": 7.1041,
+      "step": 680
+    },
+    {
+      "epoch": 0.7543624390368896,
+      "grad_norm": 190.189208984375,
+      "learning_rate": 0.00014679183882663872,
+      "loss": 7.0798,
+      "step": 690
+    },
+    {
+      "epoch": 0.7652952280084387,
+      "grad_norm": 160.14442443847656,
+      "learning_rate": 0.0001388754481233139,
+      "loss": 7.074,
+      "step": 700
+    },
+    {
+      "epoch": 0.7652952280084387,
+      "eval_loss": 7.0790934562683105,
+      "eval_runtime": 79.0053,
+      "eval_samples_per_second": 118.549,
+      "eval_steps_per_second": 14.822,
+      "step": 700
+    },
+    {
+      "epoch": 0.7762280169799879,
+      "grad_norm": 173.01499938964844,
+      "learning_rate": 0.0001311137077381614,
+      "loss": 7.0821,
+      "step": 710
+    },
+    {
+      "epoch": 0.787160805951537,
+      "grad_norm": 156.1138458251953,
+      "learning_rate": 0.00012351406753283216,
+      "loss": 7.0838,
+      "step": 720
+    },
+    {
+      "epoch": 0.7980935949230861,
+      "grad_norm": 161.9981689453125,
+      "learning_rate": 0.00011681901904809884,
+      "loss": 7.0639,
+      "step": 730
+    },
+    {
+      "epoch": 0.8090263838946352,
+      "grad_norm": 174.0237579345703,
+      "learning_rate": 0.00010954733067505213,
+      "loss": 7.0604,
+      "step": 740
+    },
+    {
+      "epoch": 0.8199591728661844,
+      "grad_norm": 141.823974609375,
+      "learning_rate": 0.0001024584422885053,
+      "loss": 7.0508,
+      "step": 750
+    },
+    {
+      "epoch": 0.8308919618377335,
+      "grad_norm": 121.39106750488281,
+      "learning_rate": 9.555915793434476e-05,
+      "loss": 7.0568,
+      "step": 760
+    },
+    {
+      "epoch": 0.8418247508092827,
+      "grad_norm": 178.37924194335938,
+      "learning_rate": 8.885609967300851e-05,
+      "loss": 7.0589,
+      "step": 770
+    },
+    {
+      "epoch": 0.8527575397808318,
+      "grad_norm": 304.8969421386719,
+      "learning_rate": 8.235570122350937e-05,
+      "loss": 7.0582,
+      "step": 780
+    },
+    {
+      "epoch": 0.8636903287523808,
+      "grad_norm": 128.75843811035156,
+      "learning_rate": 7.606420178823293e-05,
+      "loss": 7.0622,
+      "step": 790
+    },
+    {
+      "epoch": 0.87462311772393,
+      "grad_norm": 88.88775634765625,
+      "learning_rate": 6.998764006443615e-05,
+      "loss": 7.0664,
+      "step": 800
+    },
+    {
+      "epoch": 0.87462311772393,
+      "eval_loss": 7.048069477081299,
+      "eval_runtime": 78.7086,
+      "eval_samples_per_second": 118.996,
+      "eval_steps_per_second": 14.878,
+      "step": 800
+    },
+    {
+      "epoch": 0.8855559066954791,
+      "grad_norm": 131.33584594726562,
+      "learning_rate": 6.413184844819423e-05,
+      "loss": 7.0381,
+      "step": 810
+    },
+    {
+      "epoch": 0.8964886956670283,
+      "grad_norm": 176.8515625,
+      "learning_rate": 6e-05,
+      "loss": 7.0461,
+      "step": 820
+    },
+    {
+      "epoch": 0.9074214846385774,
+      "grad_norm": 128.32069396972656,
+      "learning_rate": 6e-05,
+      "loss": 7.0597,
+      "step": 830
+    },
+    {
+      "epoch": 0.9183542736101264,
+      "grad_norm": 150.107421875,
+      "learning_rate": 6e-05,
+      "loss": 7.0582,
+      "step": 840
+    },
+    {
+      "epoch": 0.9292870625816756,
+      "grad_norm": 174.95352172851562,
+      "learning_rate": 6e-05,
+      "loss": 7.0729,
+      "step": 850
+    },
+    {
+      "epoch": 0.9402198515532247,
+      "grad_norm": 209.878173828125,
+      "learning_rate": 6e-05,
+      "loss": 7.0949,
+      "step": 860
+    },
+    {
+      "epoch": 0.9511526405247739,
+      "grad_norm": 181.1326904296875,
+      "learning_rate": 6e-05,
+      "loss": 7.109,
+      "step": 870
+    },
+    {
+      "epoch": 0.962085429496323,
+      "grad_norm": 197.11639404296875,
+      "learning_rate": 6e-05,
+      "loss": 7.1132,
+      "step": 880
+    },
+    {
+      "epoch": 0.9730182184678722,
+      "grad_norm": 197.16473388671875,
+      "learning_rate": 6e-05,
+      "loss": 7.1008,
+      "step": 890
+    },
+    {
+      "epoch": 0.9839510074394212,
+      "grad_norm": 224.1211395263672,
+      "learning_rate": 6e-05,
+      "loss": 7.1024,
+      "step": 900
+    },
+    {
+      "epoch": 0.9839510074394212,
+      "eval_loss": 7.119234561920166,
+      "eval_runtime": 78.4745,
+      "eval_samples_per_second": 119.351,
+      "eval_steps_per_second": 14.922,
+      "step": 900
+    },
+    {
+      "epoch": 0.9948837964109704,
+      "grad_norm": 161.86753845214844,
+      "learning_rate": 6e-05,
+      "loss": 7.1127,
+      "step": 910
+    },
+    {
+      "epoch": 1.0060984463481923,
+      "grad_norm": 247.6467742919922,
+      "learning_rate": 6e-05,
+      "loss": 7.1115,
+      "step": 920
+    },
+    {
+      "epoch": 1.0170312353197413,
+      "grad_norm": 228.1467742919922,
+      "learning_rate": 6e-05,
+      "loss": 7.1172,
+      "step": 930
+    },
+    {
+      "epoch": 1.0279640242912904,
+      "grad_norm": 400.675537109375,
+      "learning_rate": 6e-05,
+      "loss": 7.1351,
+      "step": 940
+    },
+    {
+      "epoch": 1.0388968132628396,
+      "grad_norm": 293.3075866699219,
+      "learning_rate": 6e-05,
+      "loss": 7.1747,
+      "step": 950
+    },
+    {
+      "epoch": 1.0498296022343887,
+      "grad_norm": 439.60760498046875,
+      "learning_rate": 6e-05,
+      "loss": 7.1955,
+      "step": 960
+    },
+    {
+      "epoch": 1.0607623912059378,
+      "grad_norm": 336.15521240234375,
+      "learning_rate": 6e-05,
+      "loss": 7.2134,
+      "step": 970
+    },
+    {
+      "epoch": 1.071695180177487,
+      "grad_norm": 232.90606689453125,
+      "learning_rate": 6e-05,
+      "loss": 7.2589,
+      "step": 980
+    },
+    {
+      "epoch": 1.0826279691490361,
+      "grad_norm": 453.7010803222656,
+      "learning_rate": 6e-05,
+      "loss": 7.2537,
+      "step": 990
+    },
+    {
+      "epoch": 1.0935607581205853,
+      "grad_norm": 156.7413330078125,
+      "learning_rate": 6e-05,
+      "loss": 7.2678,
+      "step": 1000
+    },
+    {
+      "epoch": 1.0935607581205853,
+      "eval_loss": 7.271553993225098,
+      "eval_runtime": 78.5452,
+      "eval_samples_per_second": 119.243,
+      "eval_steps_per_second": 14.909,
+      "step": 1000
+    },
+    {
+      "epoch": 1.1044935470921344,
+      "grad_norm": 225.9600067138672,
+      "learning_rate": 6e-05,
+      "loss": 7.2489,
+      "step": 1010
+    },
+    {
+      "epoch": 1.1154263360636836,
+      "grad_norm": 258.6958312988281,
+      "learning_rate": 6e-05,
+      "loss": 7.2224,
+      "step": 1020
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1024,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 1024,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.383804151351214e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
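log_history above interleaves a training record every 10 steps (logging_steps) with an eval record every 100 steps (eval_steps); eval entries carry eval_loss instead of loss/grad_norm. A small sketch for splitting the two series apart, e.g. to plot the loss curves; the path is an assumption:

```python
import json

# Assumed local path to the file shown above.
with open("checkpoint-1024/trainer_state.json") as f:
    state = json.load(f)

train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(train[0], train[-1])  # (10, 9.6698) ... (1020, 7.2224)
print(evals[0], evals[-1])  # (100, 7.0155...) ... (1000, 7.2715...)
```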
checkpoint-1024/training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95492cc580971074153fb0dee7b60aedfda648c5f8a7cb99bd7fbc9464d590d2
+size 5304
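training_args.bin is a pickled TrainingArguments object rather than a tensor file, so it has to be unpickled to inspect the run's hyperparameters. A sketch, assuming a transformers install compatible with 4.46.2 (the version that wrote it); recent torch releases require weights_only=False to load arbitrary pickled objects:

```python
import torch

# Unpickling resolves transformers.TrainingArguments, so the library must be installed.
args = torch.load("checkpoint-1024/training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size, args.max_steps)
```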
config.json
ADDED
@@ -0,0 +1,32 @@
+{
+  "_name_or_path": "/vol/tmp/koppelmm/pythia160m_dedup_untrained",
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "classifier_dropout": 0.1,
+  "eos_token_id": 0,
+  "hidden_act": "gelu",
+  "hidden_dropout": 0.0,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "partial_rotary_factor": 0.25,
+  "rope_scaling": null,
+  "rope_theta": 10000,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.25,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.2",
+  "use_cache": true,
+  "use_parallel_residual": true,
+  "vocab_size": 50304
+}
generation_config.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "transformers_version": "4.46.2"
+}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5f335ef94c0ce86cfda59e84cdd9bbef1b433fa1974f80e4b56d4498d8c3c6
+size 649308728
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:95492cc580971074153fb0dee7b60aedfda648c5f8a7cb99bd7fbc9464d590d2
+size 5304