svwingerden and georgeyw committed on
Commit 39f70ba · verified · 0 Parent(s):

Duplicate from georgeyw/gpt-2-small-log-spacing


Co-authored-by: George Wang <[email protected]>

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete list.
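Because this rendered view is truncated to 50 files, the full checkpoint set is easier to inspect programmatically. A minimal sketch, assuming the duplicated repo id `georgeyw/gpt-2-small-log-spacing` from the commit message and the `huggingface_hub` client:

```python
# Sketch: enumerate the checkpoint steps stored in the duplicated repo.
# The repo id is an assumption taken from the commit message above.
from huggingface_hub import list_repo_files

files = list_repo_files("georgeyw/gpt-2-small-log-spacing")

# Paths look like "checkpoints/checkpoint-100/config.json"; collect the step numbers.
steps = sorted(
    {
        int(path.split("/")[1].split("-")[1])
        for path in files
        if path.startswith("checkpoints/checkpoint-")
    }
)
print(f"{len(steps)} checkpoints; first: {steps[:3]}, last: {steps[-3:]}")
```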
Files changed (50)
  1. .gitattributes +35 -0
  2. checkpoints/checkpoint-100/config.json +31 -0
  3. checkpoints/checkpoint-100/model.safetensors +3 -0
  4. checkpoints/checkpoint-100/training_args.bin +3 -0
  5. checkpoints/checkpoint-1000/config.json +31 -0
  6. checkpoints/checkpoint-1000/model.safetensors +3 -0
  7. checkpoints/checkpoint-1000/training_args.bin +3 -0
  8. checkpoints/checkpoint-10000/config.json +31 -0
  9. checkpoints/checkpoint-10000/model.safetensors +3 -0
  10. checkpoints/checkpoint-10000/training_args.bin +3 -0
  11. checkpoints/checkpoint-100000/config.json +31 -0
  12. checkpoints/checkpoint-100000/model.safetensors +3 -0
  13. checkpoints/checkpoint-100000/training_args.bin +3 -0
  14. checkpoints/checkpoint-10200/config.json +31 -0
  15. checkpoints/checkpoint-10200/model.safetensors +3 -0
  16. checkpoints/checkpoint-10200/training_args.bin +3 -0
  17. checkpoints/checkpoint-102000/config.json +31 -0
  18. checkpoints/checkpoint-102000/model.safetensors +3 -0
  19. checkpoints/checkpoint-102000/training_args.bin +3 -0
  20. checkpoints/checkpoint-10400/config.json +31 -0
  21. checkpoints/checkpoint-10400/model.safetensors +3 -0
  22. checkpoints/checkpoint-10400/training_args.bin +3 -0
  23. checkpoints/checkpoint-104000/config.json +31 -0
  24. checkpoints/checkpoint-104000/model.safetensors +3 -0
  25. checkpoints/checkpoint-104000/training_args.bin +3 -0
  26. checkpoints/checkpoint-10600/config.json +31 -0
  27. checkpoints/checkpoint-10600/model.safetensors +3 -0
  28. checkpoints/checkpoint-10600/training_args.bin +3 -0
  29. checkpoints/checkpoint-106000/config.json +31 -0
  30. checkpoints/checkpoint-106000/model.safetensors +3 -0
  31. checkpoints/checkpoint-106000/training_args.bin +3 -0
  32. checkpoints/checkpoint-10800/config.json +31 -0
  33. checkpoints/checkpoint-10800/model.safetensors +3 -0
  34. checkpoints/checkpoint-10800/training_args.bin +3 -0
  35. checkpoints/checkpoint-108000/config.json +31 -0
  36. checkpoints/checkpoint-108000/model.safetensors +3 -0
  37. checkpoints/checkpoint-108000/training_args.bin +3 -0
  38. checkpoints/checkpoint-1100/config.json +31 -0
  39. checkpoints/checkpoint-1100/model.safetensors +3 -0
  40. checkpoints/checkpoint-1100/training_args.bin +3 -0
  41. checkpoints/checkpoint-11000/config.json +31 -0
  42. checkpoints/checkpoint-11000/model.safetensors +3 -0
  43. checkpoints/checkpoint-11000/training_args.bin +3 -0
  44. checkpoints/checkpoint-110000/config.json +31 -0
  45. checkpoints/checkpoint-110000/model.safetensors +3 -0
  46. checkpoints/checkpoint-110000/training_args.bin +3 -0
  47. checkpoints/checkpoint-11200/config.json +31 -0
  48. checkpoints/checkpoint-11200/model.safetensors +3 -0
  49. checkpoints/checkpoint-11200/training_args.bin +3 -0
  50. checkpoints/checkpoint-112000/config.json +31 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
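Every pattern above routes matching files through Git LFS, which is why the model.safetensors and training_args.bin entries below appear as small pointer files rather than raw weights. As a rough illustration (an fnmatch approximation, not exact gitattributes semantics), a Python sketch for checking whether a path falls under these rules:

```python
# Sketch: approximate the LFS-tracking decision implied by the .gitattributes above.
# fnmatch is only an approximation of gitattributes pattern matching.
from fnmatch import fnmatch

LFS_PATTERNS = [
    "*.7z", "*.arrow", "*.bin", "*.bz2", "*.ckpt", "*.ftz", "*.gz", "*.h5",
    "*.joblib", "*.lfs.*", "*.mlmodel", "*.model", "*.msgpack", "*.npy", "*.npz",
    "*.onnx", "*.ot", "*.parquet", "*.pb", "*.pickle", "*.pkl", "*.pt", "*.pth",
    "*.rar", "*.safetensors", "saved_model/**/*", "*.tar.*", "*.tar", "*.tflite",
    "*.tgz", "*.wasm", "*.xz", "*.zip", "*.zst", "*tfevents*",
]

def is_lfs_tracked(path: str) -> bool:
    name = path.rsplit("/", 1)[-1]
    return any(fnmatch(name, pat) or fnmatch(path, pat) for pat in LFS_PATTERNS)

assert is_lfs_tracked("checkpoints/checkpoint-100/model.safetensors")
assert not is_lfs_tracked("checkpoints/checkpoint-100/config.json")
```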
checkpoints/checkpoint-100/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
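This same 31-line config recurs unchanged in every checkpoint folder: a GPT-NeoX-architecture model at GPT-2-small scale (12 layers, 12 heads, hidden size 768, vocab size 50304, bfloat16). A minimal loading sketch, assuming the duplicated repo id from the commit message, the `checkpoints/checkpoint-<step>` layout shown in this diff, and transformers >= 4.38:

```python
# Sketch: load a single training checkpoint with transformers.
# repo_id and subfolder are assumptions based on the commit message and the paths above.
import torch
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "georgeyw/gpt-2-small-log-spacing"
subfolder = "checkpoints/checkpoint-100"  # any checkpoint directory listed in this diff

config = AutoConfig.from_pretrained(repo_id, subfolder=subfolder)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    subfolder=subfolder,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
)
print(config.model_type, sum(p.numel() for p in model.parameters()))
```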
checkpoints/checkpoint-100/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:091d1474cbb537e6bbe54b7cba183bbd4302acf95c5ca594348b4dd0f29a84bf
+ size 324662984
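The safetensors entry is a Git LFS pointer, not the weights themselves: three lines giving the spec version, the SHA-256 of the stored object, and its size in bytes (324,662,984 bytes, consistent with roughly 162M bfloat16 parameters at 2 bytes each). A small sketch for parsing such a pointer and verifying a downloaded blob against it; the file names are hypothetical:

```python
# Sketch: parse a Git LFS pointer file and verify a downloaded object against it.
import hashlib

def parse_lfs_pointer(text: str) -> dict:
    # A pointer file is three "key value" lines: version, oid, size.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size": int(fields["size"]),
    }

def matches_pointer(pointer: dict, blob_path: str, chunk_size: int = 1 << 20) -> bool:
    # Stream the blob and compare its byte count and SHA-256 with the pointer.
    digest, size = hashlib.sha256(), 0
    with open(blob_path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
            size += len(chunk)
    return size == pointer["size"] and digest.hexdigest() == pointer["sha256"]

# Hypothetical paths: a saved pointer file and the downloaded weights blob.
pointer = parse_lfs_pointer(open("model.safetensors.pointer").read())
print(matches_pointer(pointer, "model.safetensors"))
```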
checkpoints/checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-1000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3af65001b2df1d7aa05097c62001dbcbfe5b950f41419cd4655d4b801acf6149
+ size 324662984
checkpoints/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-10000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-10000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4d1f284240ac9ce60b1392b020805cb24e2b923e74f33fa362de8cfc19679b6
+ size 324662984
checkpoints/checkpoint-10000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-100000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-100000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:865ea2133301ad30c814dc2cbf48fc21135e4e1fd4008fb1c6b43073d7165845
+ size 324662984
checkpoints/checkpoint-100000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-10200/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-10200/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ea6ff7466539c09e743ab87b298b71ad9e5464a1a3042177a315ff303b29e50e
+ size 324662984
checkpoints/checkpoint-10200/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-102000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-102000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a966f69917ede86380febf310ccd5cdba623c5c394b1fd11904e154b7a65c54
+ size 324662984
checkpoints/checkpoint-102000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-10400/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-10400/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d2239fdddefa36ab4b4d8493b727787fd44af8b72d5b88b07c7dab4583a68b0f
+ size 324662984
checkpoints/checkpoint-10400/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-104000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-104000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:10d50bba2f464733f9f6b4aa3bdf71599bafb4ef4f7cf8e3cc5c206b104f7c46
+ size 324662984
checkpoints/checkpoint-104000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-10600/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-10600/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9708a16779c726d8080525b741eeab6ddf951985b8af7152428df1c55934b853
+ size 324662984
checkpoints/checkpoint-10600/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-106000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-106000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f4c0b84727b91fd39ec157ff163e070f76511f457cc3991681a1f9379551028
+ size 324662984
checkpoints/checkpoint-106000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-10800/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-10800/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1ea932ce8c955581505a875892eb511d3bf41f173f90f54f9afe37fcac1200f5
+ size 324662984
checkpoints/checkpoint-10800/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-108000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-108000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:601e60dbdee92703deae4062838b34cdef03e2cf019f41ac434edde21587a234
+ size 324662984
checkpoints/checkpoint-108000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-1100/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-1100/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8c860f0a48492875a266d2126e7f74ba82b96cd84ad6ec9dd66b682cd9ff335
+ size 324662984
checkpoints/checkpoint-1100/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-11000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-11000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4b0ea9baf8d066e6db2730073a7eb9c771776e6c1ee23dafb035b6383dd2555e
+ size 324662984
checkpoints/checkpoint-11000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-110000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-110000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d0842441145b1dbad4d4a60f0d8a462673032c2f645dda532c4be9ffe6a2d8b
+ size 324662984
checkpoints/checkpoint-110000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-11200/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }
checkpoints/checkpoint-11200/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:01bd627aba063c73e5d1fb4e7c9bd9884f073046822792f93540ca56e36a489a
+ size 324662984
checkpoints/checkpoint-11200/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b07671a2ec06f1e2e1209ff63c69f68592015234ed38d9978ac4e0899bcfabf5
+ size 6520
checkpoints/checkpoint-112000/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "georgeyw/gpt-2-small-init-seed-5",
+ "architectures": [
+ "GPTNeoXForCausalLM"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "bos_token_id": 0,
+ "classifier_dropout": 0.1,
+ "eos_token_id": 2,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.0,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-05,
+ "layer_norm_epsilon": 1e-05,
+ "max_position_embeddings": 1024,
+ "model_type": "gpt_neox",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "rope_scaling": null,
+ "rotary_emb_base": 10000,
+ "rotary_pct": 0.25,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.38.2",
+ "use_cache": true,
+ "use_parallel_residual": true,
+ "vocab_size": 50304
+ }