End of training

Files changed:
- README.md +12 -12
- adapter_config.json +2 -2
- adapter_model.bin +1 -1
- adapter_model.safetensors +1 -1
- config.json +1 -16
- training_args.bin +1 -1

README.md CHANGED
@@ -18,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.4.1`
 ```yaml
-adapter:
+adapter: lora
 base_model: EleutherAI/pythia-70m-deduped
 bf16: auto
 chat_template: llama3
@@ -43,7 +43,7 @@ early_stopping_patience: null
 eval_max_new_tokens: 128
 eval_table_size: null
 evals_per_epoch: 1
-flash_attention:
+flash_attention: true
 fp16: null
 fsdp: null
 fsdp_config: null
@@ -55,7 +55,7 @@ hub_repo: null
 hub_strategy: end
 hub_token: null
 learning_rate: 0.0001
-load_in_4bit:
+load_in_4bit: false
 load_in_8bit: false
 local_rank: null
 logging_steps: 1
@@ -66,7 +66,7 @@ lora_model_dir: null
 lora_r: 32
 lora_target_linear: true
 lr_scheduler: cosine
-max_steps:
+max_steps: 5000
 micro_batch_size: 1
 mlflow_experiment_name: /tmp/f72a6a3621325054_train_data.json
 model_type: AutoModelForCausalLM
@@ -78,7 +78,7 @@ resume_from_checkpoint: null
 s2_attention: null
 sample_packing: false
 saves_per_epoch: 1
-sequence_len:
+sequence_len: 4096
 special_tokens:
   pad_token: <|endoftext|>
 strict: false
@@ -105,7 +105,7 @@ xformers_attention: null
 
 This model is a fine-tuned version of [EleutherAI/pythia-70m-deduped](https://huggingface.co/EleutherAI/pythia-70m-deduped) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss:
+- Loss: 4.9195
 
 ## Model description
 
@@ -133,17 +133,17 @@ The following hyperparameters were used during training:
 - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 10
-- training_steps:
+- training_steps: 5000
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-
-
-
-
-
+| 142.114 | 0.0002 | 1 | 13.2448 |
+| 42.1552 | 0.1943 | 1250 | 4.9811 |
+| 41.2177 | 0.3886 | 2500 | 4.9747 |
+| 38.7129 | 0.5829 | 3750 | 4.8722 |
+| 43.7613 | 0.7772 | 5000 | 4.9195 |
 
 
 ### Framework versions
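The card itself doesn't show how the finished adapter is consumed. A minimal sketch of loading it for inference with `peft`, assuming the adapter was pushed to the Hub; `user/adapter-repo` is a placeholder, not the actual repo id:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")

# Attach the trained LoRA weights (adapter_config.json + adapter_model.safetensors)
# on top of the frozen base model. "user/adapter-repo" is a placeholder id.
model = PeftModel.from_pretrained(base, "user/adapter-repo")
model.eval()

inputs = tokenizer("Hello, world!", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```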
adapter_config.json CHANGED
@@ -20,10 +20,10 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "dense_4h_to_h",
     "query_key_value",
     "dense",
-    "dense_h_to_4h"
+    "dense_h_to_4h",
+    "dense_4h_to_h"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
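This change only reorders the `target_modules` list; PEFT matches these names as suffixes against the base model's submodule names, so the targeted set is unchanged. A quick sketch, assuming only `transformers` is installed, to confirm all four names exist in the GPT-NeoX architecture:

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
targets = {"query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"}

# Print every submodule whose final name component matches a LoRA target;
# PEFT resolves target_modules the same way (by name suffix).
for name, _ in model.named_modules():
    if name.rsplit(".", 1)[-1] in targets:
        print(name)  # e.g. gpt_neox.layers.0.attention.query_key_value
```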
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3f3dd9a0acd300ceafc8f03f6be90c676d98ac31877157d5cab0dfce9f6b106a
 size 6309118
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1bcdb1bf761872e624763685f42d3107f760b7abc8bdf6538bbd79adc754a150
 size 6298048
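Both weight files are Git LFS pointers: the repo stores only the blob's SHA-256 (`oid`) and size, and the hash can be checked against the downloaded file. A sketch, assuming the real file sits in the current directory:

```python
import hashlib

def sha256_of(path: str) -> str:
    """Stream the file so even large blobs hash in constant memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "1bcdb1bf761872e624763685f42d3107f760b7abc8bdf6538bbd79adc754a150"
assert sha256_of("adapter_model.safetensors") == expected
```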
config.json CHANGED
@@ -15,26 +15,11 @@
   "initializer_range": 0.02,
   "intermediate_size": 2048,
   "layer_norm_eps": 1e-05,
-  "max_position_embeddings":
+  "max_position_embeddings": 4096,
   "model_type": "gpt_neox",
   "num_attention_heads": 8,
   "num_hidden_layers": 6,
   "partial_rotary_factor": 0.25,
-  "quantization_config": {
-    "_load_in_4bit": true,
-    "_load_in_8bit": false,
-    "bnb_4bit_compute_dtype": "bfloat16",
-    "bnb_4bit_quant_storage": "bfloat16",
-    "bnb_4bit_quant_type": "nf4",
-    "bnb_4bit_use_double_quant": true,
-    "llm_int8_enable_fp32_cpu_offload": false,
-    "llm_int8_has_fp16_weight": false,
-    "llm_int8_skip_modules": null,
-    "llm_int8_threshold": 6.0,
-    "load_in_4bit": true,
-    "load_in_8bit": false,
-    "quant_method": "bitsandbytes"
-  },
   "rope_scaling": null,
   "rope_theta": 10000,
   "rotary_emb_base": 10000,
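The removed `quantization_config` block is what `transformers` serializes into `config.json` when a model has been loaded with bitsandbytes quantization. A sketch of the equivalent load-time configuration (an illustration of what that block corresponds to, not necessarily how this run was launched):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Mirror the removed quantization_config: 4-bit NF4 with double quantization
# and bfloat16 compute/storage.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/pythia-70m-deduped",
    quantization_config=bnb_config,
)
```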
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1f554cbf17735eb6b892a751bed034c6d57161dfff197b631ddddcdb54af7d0b
 size 6776
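`training_args.bin` is the pickled `TrainingArguments` object that the `Trainer` saves alongside checkpoints, and it can be inspected directly. A sketch, assuming the file has been downloaded locally:

```python
import torch

# TrainingArguments is a pickled Python object, not a tensor checkpoint,
# so newer torch versions need weights_only=False to unpickle it.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.max_steps, args.lr_scheduler_type)
```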