End of training

Files changed:
- README.md +12 -12
- adapter_config.json +2 -2
- adapter_model.bin +1 -1
- adapter_model.safetensors +1 -1
- config.json +1 -16
- training_args.bin +1 -1

README.md CHANGED
@@ -18,7 +18,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 axolotl version: `0.4.1`
 ```yaml
-adapter:
+adapter: lora
 base_model: EleutherAI/pythia-70m-deduped
 bf16: auto
 chat_template: llama3
@@ -43,7 +43,7 @@ early_stopping_patience: null
 eval_max_new_tokens: 128
 eval_table_size: null
 evals_per_epoch: 1
-flash_attention:
+flash_attention: true
 fp16: null
 fsdp: null
 fsdp_config: null
@@ -55,7 +55,7 @@ hub_repo: null
 hub_strategy: end
 hub_token: null
 learning_rate: 0.0001
-load_in_4bit:
+load_in_4bit: false
 load_in_8bit: false
 local_rank: null
 logging_steps: 1
@@ -66,7 +66,7 @@ lora_model_dir: null
 lora_r: 32
 lora_target_linear: true
 lr_scheduler: cosine
-max_steps:
+max_steps: 5000
 micro_batch_size: 1
 mlflow_experiment_name: /tmp/f72a6a3621325054_train_data.json
 model_type: AutoModelForCausalLM
@@ -78,7 +78,7 @@ resume_from_checkpoint: null
 s2_attention: null
 sample_packing: false
 saves_per_epoch: 1
-sequence_len:
+sequence_len: 4096
 special_tokens:
   pad_token: <|endoftext|>
 strict: false
@@ -105,7 +105,7 @@ xformers_attention: null
 
 This model is a fine-tuned version of [EleutherAI/pythia-70m-deduped](https://huggingface.co/EleutherAI/pythia-70m-deduped) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss:
+- Loss: 4.9195
 
 ## Model description
 
@@ -133,17 +133,17 @@ The following hyperparameters were used during training:
 - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 10
-- training_steps:
+- training_steps: 5000
 
 ### Training results
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-
-
-
-
-
+| 142.114 | 0.0002 | 1 | 13.2448 |
+| 42.1552 | 0.1943 | 1250 | 4.9811 |
+| 41.2177 | 0.3886 | 2500 | 4.9747 |
+| 38.7129 | 0.5829 | 3750 | 4.8722 |
+| 43.7613 | 0.7772 | 5000 | 4.9195 |
 
 
 ### Framework versions
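The card itself doesn't show how the finished adapter is consumed. A minimal sketch of loading it for inference with `peft`, assuming the adapter was pushed to the Hub; `user/adapter-repo` is a placeholder, not the actual repo id:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m-deduped")

# Attach the trained LoRA weights (adapter_config.json + adapter_model.safetensors)
# on top of the frozen base model. "user/adapter-repo" is a placeholder id.
model = PeftModel.from_pretrained(base, "user/adapter-repo")
model.eval()

inputs = tokenizer("Hello, world!", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```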
adapter_config.json CHANGED
@@ -20,10 +20,10 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "dense_4h_to_h",
     "query_key_value",
     "dense",
-    "dense_h_to_4h"
+    "dense_h_to_4h",
+    "dense_4h_to_h"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
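This change only reorders the `target_modules` list; PEFT matches these names as suffixes against the base model's submodule names, so the targeted set is unchanged. A quick sketch, assuming only `transformers` is installed, to confirm all four names exist in the GPT-NeoX architecture:

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped")
targets = {"query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"}

# Print every submodule whose final name component matches a LoRA target;
# PEFT resolves target_modules the same way (by name suffix).
for name, _ in model.named_modules():
    if name.rsplit(".", 1)[-1] in targets:
        print(name)  # e.g. gpt_neox.layers.0.attention.query_key_value
```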
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3f3dd9a0acd300ceafc8f03f6be90c676d98ac31877157d5cab0dfce9f6b106a
 size 6309118
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1bcdb1bf761872e624763685f42d3107f760b7abc8bdf6538bbd79adc754a150
 size 6298048
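Both weight files are Git LFS pointers: the repo stores only the blob's SHA-256 (`oid`) and size, and the hash can be checked against the downloaded file. A sketch, assuming the real file sits in the current directory:

```python
import hashlib

def sha256_of(path: str) -> str:
    """Stream the file so even large blobs hash in constant memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

expected = "1bcdb1bf761872e624763685f42d3107f760b7abc8bdf6538bbd79adc754a150"
assert sha256_of("adapter_model.safetensors") == expected
```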
config.json CHANGED
@@ -15,26 +15,11 @@
   "initializer_range": 0.02,
   "intermediate_size": 2048,
   "layer_norm_eps": 1e-05,
-  "max_position_embeddings":
+  "max_position_embeddings": 4096,
   "model_type": "gpt_neox",
   "num_attention_heads": 8,
   "num_hidden_layers": 6,
   "partial_rotary_factor": 0.25,
-  "quantization_config": {
-    "_load_in_4bit": true,
-    "_load_in_8bit": false,
-    "bnb_4bit_compute_dtype": "bfloat16",
-    "bnb_4bit_quant_storage": "bfloat16",
-    "bnb_4bit_quant_type": "nf4",
-    "bnb_4bit_use_double_quant": true,
-    "llm_int8_enable_fp32_cpu_offload": false,
-    "llm_int8_has_fp16_weight": false,
-    "llm_int8_skip_modules": null,
-    "llm_int8_threshold": 6.0,
-    "load_in_4bit": true,
-    "load_in_8bit": false,
-    "quant_method": "bitsandbytes"
-  },
   "rope_scaling": null,
   "rope_theta": 10000,
   "rotary_emb_base": 10000,
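The removed `quantization_config` block is what `transformers` serializes into `config.json` when a model has been loaded with bitsandbytes quantization. A sketch of the equivalent load-time configuration (an illustration of what that block corresponds to, not necessarily how this run was launched):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Mirror the removed quantization_config: 4-bit NF4 with double quantization
# and bfloat16 compute/storage.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_storage=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/pythia-70m-deduped",
    quantization_config=bnb_config,
)
```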
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:1f554cbf17735eb6b892a751bed034c6d57161dfff197b631ddddcdb54af7d0b
 size 6776
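`training_args.bin` is the pickled `TrainingArguments` object that the `Trainer` saves alongside checkpoints, and it can be inspected directly. A sketch, assuming the file has been downloaded locally:

```python
import torch

# TrainingArguments is a pickled Python object, not a tensor checkpoint,
# so newer torch versions need weights_only=False to unpickle it.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.max_steps, args.lr_scheduler_type)
```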