Training in progress, step 1000, checkpoint
Browse files- checkpoint-100/config.json +31 -0
- checkpoint-100/model.safetensors +3 -0
- checkpoint-100/training_args.bin +3 -0
- checkpoint-1000/config.json +31 -0
- checkpoint-1000/model.safetensors +3 -0
- checkpoint-1000/training_args.bin +3 -0
- checkpoint-200/config.json +31 -0
- checkpoint-200/model.safetensors +3 -0
- checkpoint-200/training_args.bin +3 -0
- checkpoint-300/config.json +31 -0
- checkpoint-300/model.safetensors +3 -0
- checkpoint-300/training_args.bin +3 -0
- checkpoint-400/config.json +31 -0
- checkpoint-400/model.safetensors +3 -0
- checkpoint-400/training_args.bin +3 -0
- checkpoint-500/config.json +31 -0
- checkpoint-500/model.safetensors +3 -0
- checkpoint-500/training_args.bin +3 -0
- checkpoint-600/config.json +31 -0
- checkpoint-600/model.safetensors +3 -0
- checkpoint-600/training_args.bin +3 -0
- checkpoint-700/config.json +31 -0
- checkpoint-700/model.safetensors +3 -0
- checkpoint-700/training_args.bin +3 -0
- checkpoint-800/config.json +31 -0
- checkpoint-800/model.safetensors +3 -0
- checkpoint-800/training_args.bin +3 -0
- checkpoint-900/config.json +31 -0
- checkpoint-900/model.safetensors +3 -0
- checkpoint-900/training_args.bin +3 -0
checkpoint-100/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-100/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:03e56f5b3dd4afc21190625e190827309ea556064666d746669af2409569127a
|
3 |
+
size 324662984
|
checkpoint-100/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-1000/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-1000/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a56aaeb73fd5a850495c644f8711e0fd2770f3e773c4f808af9f7d9b344fd53f
|
3 |
+
size 324662984
|
checkpoint-1000/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-200/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-200/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd23533d50f604282bb0a5f22d02f3f22b956993282389c73cafa912b443f37b
|
3 |
+
size 324662984
|
checkpoint-200/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-300/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-300/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:36f18c740a272ca7e32993e375d9b59d0a23be68c1052f6cb99f88c68dd8e9d9
|
3 |
+
size 324662984
|
checkpoint-300/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-400/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-400/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6c52b1b0659f152c1779a66ada1c70277cd1169830e5defefe2ebb2b81bc1e54
|
3 |
+
size 324662984
|
checkpoint-400/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-500/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-500/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6d46c2ed38a318af855fe6a7b7e3bd8d23f9c90968d76ea49c9c04a19a340fe6
|
3 |
+
size 324662984
|
checkpoint-500/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-600/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-600/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1aeacc2c448ac0651561ddccb1ae81763195f7fdf4bd769ac1edbbe4b2be2597
|
3 |
+
size 324662984
|
checkpoint-600/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-700/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-700/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:724bb60a2c9fe997583beba8959b4f2c8e454faacea8001b80b930465e690fed
|
3 |
+
size 324662984
|
checkpoint-700/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-800/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-800/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de87fc2b26d177e393555124766b0d1959f2ee409b61a092812128ab7e82fd73
|
3 |
+
size 324662984
|
checkpoint-800/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|
checkpoint-900/config.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
|
3 |
+
"architectures": [
|
4 |
+
"GPTNeoXForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_bias": true,
|
7 |
+
"attention_dropout": 0.0,
|
8 |
+
"bos_token_id": 0,
|
9 |
+
"classifier_dropout": 0.1,
|
10 |
+
"eos_token_id": 2,
|
11 |
+
"hidden_act": "gelu",
|
12 |
+
"hidden_dropout": 0.0,
|
13 |
+
"hidden_size": 768,
|
14 |
+
"initializer_range": 0.02,
|
15 |
+
"intermediate_size": 3072,
|
16 |
+
"layer_norm_eps": 1e-05,
|
17 |
+
"layer_norm_epsilon": 1e-05,
|
18 |
+
"max_position_embeddings": 1024,
|
19 |
+
"model_type": "gpt_neox",
|
20 |
+
"num_attention_heads": 12,
|
21 |
+
"num_hidden_layers": 12,
|
22 |
+
"rope_scaling": null,
|
23 |
+
"rotary_emb_base": 10000,
|
24 |
+
"rotary_pct": 0.25,
|
25 |
+
"tie_word_embeddings": false,
|
26 |
+
"torch_dtype": "bfloat16",
|
27 |
+
"transformers_version": "4.38.2",
|
28 |
+
"use_cache": true,
|
29 |
+
"use_parallel_residual": true,
|
30 |
+
"vocab_size": 50304
|
31 |
+
}
|
checkpoint-900/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef4d020683dcf2ce71ffebf9c5c3b3414691acb36632c56bf2296cd12a9d9ad4
|
3 |
+
size 324662984
|
checkpoint-900/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:de3f522b6b895157d4ae37816ea2b39e4b24555bc3782f9f18492c6709abd779
|
3 |
+
size 6520
|