End of training

Browse files
- README.md +31 -31
- adapter_model.bin +2 -2

README.md CHANGED
@@ -21,7 +21,7 @@ axolotl version: `0.4.1`
 adapter: lora
 base_model: echarlaix/tiny-random-mistral
 bf16: auto
-chat_template:
+chat_template: chatml
 dataset_prepared_path: null
 datasets:
 - data_files:
@@ -42,31 +42,31 @@ early_stopping_patience: null
 eval_max_new_tokens: 128
 eval_table_size: null
 evals_per_epoch: 4
-flash_attention:
+flash_attention: true
 fp16: null
 fsdp: null
 fsdp_config: null
 gradient_accumulation_steps: 4
-gradient_checkpointing:
+gradient_checkpointing: false
 group_by_length: false
 hub_model_id: ardaspear/35064bc1-2c15-4036-bbb1-561a74589740
 hub_repo: null
 hub_strategy: checkpoint
 hub_token: null
-learning_rate: 0.
-load_in_4bit:
+learning_rate: 0.0002
+load_in_4bit: true
 load_in_8bit: false
-local_rank:
-logging_steps:
-lora_alpha:
-lora_dropout: 0.
-lora_fan_in_fan_out:
+local_rank: null
+logging_steps: 1
+lora_alpha: 32
+lora_dropout: 0.05
+lora_fan_in_fan_out: null
 lora_model_dir: null
-lora_r:
+lora_r: 16
 lora_target_linear: true
 lr_scheduler: cosine
 max_steps: 50
-micro_batch_size:
+micro_batch_size: 2
 mlflow_experiment_name: /tmp/a74ecd5c5b3909f6_train_data.json
 model_type: AutoModelForCausalLM
 num_epochs: 3
@@ -74,10 +74,10 @@ optimizer: adamw_bnb_8bit
 output_dir: miner_id_24
 pad_to_sequence_len: true
 resume_from_checkpoint: null
-s2_attention:
+s2_attention: null
 sample_packing: false
 saves_per_epoch: 4
-sequence_len:
+sequence_len: 4056
 strict: false
 tf32: false
 tokenizer_type: AutoTokenizer
@@ -91,7 +91,7 @@ wandb_project: Gradients-On-Two
 wandb_run: your_name
 wandb_runid: 35064bc1-2c15-4036-bbb1-561a74589740
 warmup_steps: 10
-weight_decay: 0.
+weight_decay: 0.0
 xformers_attention: null
 
 ```
@@ -102,7 +102,7 @@ xformers_attention: null
 
 This model is a fine-tuned version of [echarlaix/tiny-random-mistral](https://huggingface.co/echarlaix/tiny-random-mistral) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss:
+- Loss: 10.3595
 
 ## Model description
 
@@ -121,12 +121,12 @@ More information needed
 ### Training hyperparameters
 
 The following hyperparameters were used during training:
-- learning_rate: 0.
-- train_batch_size:
-- eval_batch_size:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 2
 - seed: 42
 - gradient_accumulation_steps: 4
-- total_train_batch_size:
+- total_train_batch_size: 8
 - optimizer: Use OptimizerNames.ADAMW_BNB with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_steps: 10
@@ -136,17 +136,17 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss |
 |:-------------:|:------:|:----:|:---------------:|
-
-
-
-
-
-
-
-
-
-
-
+| 41.5398 | 0.0002 | 1 | 10.3783 |
+| 41.5426 | 0.0008 | 5 | 10.3779 |
+| 41.5231 | 0.0016 | 10 | 10.3762 |
+| 41.4979 | 0.0024 | 15 | 10.3736 |
+| 41.4805 | 0.0033 | 20 | 10.3706 |
+| 41.4671 | 0.0041 | 25 | 10.3673 |
+| 41.4543 | 0.0049 | 30 | 10.3643 |
+| 41.4492 | 0.0057 | 35 | 10.3618 |
+| 41.4506 | 0.0065 | 40 | 10.3603 |
+| 41.4457 | 0.0073 | 45 | 10.3597 |
+| 41.4237 | 0.0082 | 50 | 10.3595 |
 
 
 ### Framework versions
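The README diff above fills in a complete LoRA fine-tuning config and its results, and the updated hyperparameters are self-consistent: total_train_batch_size 8 = train_batch_size 2 × gradient_accumulation_steps 4 on a single device. Since this commit publishes a LoRA adapter rather than merged weights, inference requires attaching the adapter to the base model. A minimal, hypothetical sketch with `transformers` and `peft` (repo ids taken from the config above; not part of this commit):

```python
# Hypothetical usage sketch: attach the published LoRA adapter to its base model.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "echarlaix/tiny-random-mistral"                       # base_model from the config
ADAPTER = "ardaspear/35064bc1-2c15-4036-bbb1-561a74589740"   # hub_model_id from the config

tokenizer = AutoTokenizer.from_pretrained(BASE)
base = AutoModelForCausalLM.from_pretrained(BASE)

# PeftModel wraps the base model and injects the trained LoRA weights
# (lora_r: 16, lora_alpha: 32, lora_dropout: 0.05 per the config above).
model = PeftModel.from_pretrained(base, ADAPTER)
model.eval()
```

Training used load_in_4bit: true, but a plain full-precision load as above should also work for this tiny test model; a quantized load would go through a `BitsAndBytesConfig` instead.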
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:b215d60527eb02c52436a7ef993abc191af4fb2747e66330b67f1e79dbc4096f
+size 65282
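The adapter_model.bin change is a Git LFS pointer update: it records the new blob by its sha256 oid and byte size (65282). A downloaded copy of the adapter can be checked against that oid with a short, hypothetical sketch (assumes `huggingface_hub` is installed):

```python
# Hypothetical integrity check: hash the downloaded blob and compare it
# to the sha256 oid recorded in the Git LFS pointer above.
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="ardaspear/35064bc1-2c15-4036-bbb1-561a74589740",
    filename="adapter_model.bin",
)
with open(path, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

assert digest == "b215d60527eb02c52436a7ef993abc191af4fb2747e66330b67f1e79dbc4096f"
```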