End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +899 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: oh_v1_w_v3_camel_biology_gpt-4o-mini
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # oh_v1_w_v3_camel_biology_gpt-4o-mini
-This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.5550

 base_model: meta-llama/Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: oh_v1_w_v3_camel_biology_gpt-4o-mini
 # oh_v1_w_v3_camel_biology_gpt-4o-mini
+This model is a fine-tuned version of [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) on the mlfoundations-dev/oh_v1_w_v3_camel_biology_gpt-4o-mini dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.5550

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9962358845671266,
+    "eval_loss": 0.555037796497345,
+    "eval_runtime": 213.0252,
+    "eval_samples_per_second": 50.388,
+    "eval_steps_per_second": 0.394,
+    "total_flos": 1999575711744000.0,
+    "train_loss": 0.5572832309620664,
+    "train_runtime": 35786.5658,
+    "train_samples_per_second": 17.096,
+    "train_steps_per_second": 0.033
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9962358845671266,
+    "eval_loss": 0.555037796497345,
+    "eval_runtime": 213.0252,
+    "eval_samples_per_second": 50.388,
+    "eval_steps_per_second": 0.394
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9962358845671266,
+    "total_flos": 1999575711744000.0,
+    "train_loss": 0.5572832309620664,
+    "train_runtime": 35786.5658,
+    "train_samples_per_second": 17.096,
+    "train_steps_per_second": 0.033
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,899 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9962358845671266,
+  "eval_steps": 500,
+  "global_step": 1194,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.025094102885821833,
+      "grad_norm": 11.718188698364322,
+      "learning_rate": 5e-06,
+      "loss": 0.9141,
+      "step": 10
+    },
+    {
+      "epoch": 0.050188205771643665,
+      "grad_norm": 2.128194193417522,
+      "learning_rate": 5e-06,
+      "loss": 0.7836,
+      "step": 20
+    },
+    {
+      "epoch": 0.07528230865746549,
+      "grad_norm": 2.3757769329973657,
+      "learning_rate": 5e-06,
+      "loss": 0.7498,
+      "step": 30
+    },
+    {
+      "epoch": 0.10037641154328733,
+      "grad_norm": 1.103206879933866,
+      "learning_rate": 5e-06,
+      "loss": 0.7188,
+      "step": 40
+    },
+    {
+      "epoch": 0.12547051442910917,
+      "grad_norm": 0.9274722897954563,
+      "learning_rate": 5e-06,
+      "loss": 0.7002,
+      "step": 50
+    },
+    {
+      "epoch": 0.15056461731493098,
+      "grad_norm": 0.84713375375135,
+      "learning_rate": 5e-06,
+      "loss": 0.6765,
+      "step": 60
+    },
+    {
+      "epoch": 0.17565872020075282,
+      "grad_norm": 0.7850634011267035,
+      "learning_rate": 5e-06,
+      "loss": 0.6679,
+      "step": 70
+    },
+    {
+      "epoch": 0.20075282308657466,
+      "grad_norm": 0.6763495874297807,
+      "learning_rate": 5e-06,
+      "loss": 0.6554,
+      "step": 80
+    },
+    {
+      "epoch": 0.2258469259723965,
+      "grad_norm": 0.6460889441423061,
+      "learning_rate": 5e-06,
+      "loss": 0.6537,
+      "step": 90
+    },
+    {
+      "epoch": 0.25094102885821834,
+      "grad_norm": 0.6381839585959109,
+      "learning_rate": 5e-06,
+      "loss": 0.6469,
+      "step": 100
+    },
+    {
+      "epoch": 0.27603513174404015,
+      "grad_norm": 0.5934854411636431,
+      "learning_rate": 5e-06,
+      "loss": 0.6408,
+      "step": 110
+    },
+    {
+      "epoch": 0.30112923462986196,
+      "grad_norm": 0.5908335131781053,
+      "learning_rate": 5e-06,
+      "loss": 0.6315,
+      "step": 120
+    },
+    {
+      "epoch": 0.32622333751568383,
+      "grad_norm": 0.6207583357857969,
+      "learning_rate": 5e-06,
+      "loss": 0.6295,
+      "step": 130
+    },
+    {
+      "epoch": 0.35131744040150564,
+      "grad_norm": 0.5777218463460926,
+      "learning_rate": 5e-06,
+      "loss": 0.6353,
+      "step": 140
+    },
+    {
+      "epoch": 0.37641154328732745,
+      "grad_norm": 0.8031543297119933,
+      "learning_rate": 5e-06,
+      "loss": 0.6257,
+      "step": 150
+    },
+    {
+      "epoch": 0.4015056461731493,
+      "grad_norm": 0.6565972172028122,
+      "learning_rate": 5e-06,
+      "loss": 0.6223,
+      "step": 160
+    },
+    {
+      "epoch": 0.42659974905897113,
+      "grad_norm": 0.5758668737369184,
+      "learning_rate": 5e-06,
+      "loss": 0.6238,
+      "step": 170
+    },
+    {
+      "epoch": 0.451693851944793,
+      "grad_norm": 0.5079092787071162,
+      "learning_rate": 5e-06,
+      "loss": 0.6143,
+      "step": 180
+    },
+    {
+      "epoch": 0.4767879548306148,
+      "grad_norm": 0.6379157115232267,
+      "learning_rate": 5e-06,
+      "loss": 0.6199,
+      "step": 190
+    },
+    {
+      "epoch": 0.5018820577164367,
+      "grad_norm": 0.5743755502713278,
+      "learning_rate": 5e-06,
+      "loss": 0.6129,
+      "step": 200
+    },
+    {
+      "epoch": 0.5269761606022585,
+      "grad_norm": 0.7914784193340709,
+      "learning_rate": 5e-06,
+      "loss": 0.609,
+      "step": 210
+    },
+    {
+      "epoch": 0.5520702634880803,
+      "grad_norm": 0.6052863906299388,
+      "learning_rate": 5e-06,
+      "loss": 0.6191,
+      "step": 220
+    },
+    {
+      "epoch": 0.5771643663739021,
+      "grad_norm": 0.6079363974587813,
+      "learning_rate": 5e-06,
+      "loss": 0.6036,
+      "step": 230
+    },
+    {
+      "epoch": 0.6022584692597239,
+      "grad_norm": 0.6317013898545292,
+      "learning_rate": 5e-06,
+      "loss": 0.6101,
+      "step": 240
+    },
+    {
+      "epoch": 0.6273525721455459,
+      "grad_norm": 0.6772767650378945,
+      "learning_rate": 5e-06,
+      "loss": 0.6066,
+      "step": 250
+    },
+    {
+      "epoch": 0.6524466750313677,
+      "grad_norm": 0.7659397239810531,
+      "learning_rate": 5e-06,
+      "loss": 0.6085,
+      "step": 260
+    },
+    {
+      "epoch": 0.6775407779171895,
+      "grad_norm": 0.8301922378020979,
+      "learning_rate": 5e-06,
+      "loss": 0.6054,
+      "step": 270
+    },
+    {
+      "epoch": 0.7026348808030113,
+      "grad_norm": 0.4548488940244719,
+      "learning_rate": 5e-06,
+      "loss": 0.6039,
+      "step": 280
+    },
+    {
+      "epoch": 0.7277289836888331,
+      "grad_norm": 0.6991563441452833,
+      "learning_rate": 5e-06,
+      "loss": 0.6041,
+      "step": 290
+    },
+    {
+      "epoch": 0.7528230865746549,
+      "grad_norm": 0.7252866874449959,
+      "learning_rate": 5e-06,
+      "loss": 0.5982,
+      "step": 300
+    },
+    {
+      "epoch": 0.7779171894604768,
+      "grad_norm": 0.5621384726961683,
+      "learning_rate": 5e-06,
+      "loss": 0.5984,
+      "step": 310
+    },
+    {
+      "epoch": 0.8030112923462986,
+      "grad_norm": 0.5544205898879206,
+      "learning_rate": 5e-06,
+      "loss": 0.5938,
+      "step": 320
+    },
+    {
+      "epoch": 0.8281053952321205,
+      "grad_norm": 0.6591982327209397,
+      "learning_rate": 5e-06,
+      "loss": 0.5949,
+      "step": 330
+    },
+    {
+      "epoch": 0.8531994981179423,
+      "grad_norm": 0.4966951651690375,
+      "learning_rate": 5e-06,
+      "loss": 0.5939,
+      "step": 340
+    },
+    {
+      "epoch": 0.8782936010037641,
+      "grad_norm": 0.7936946361545921,
+      "learning_rate": 5e-06,
+      "loss": 0.5952,
+      "step": 350
+    },
+    {
+      "epoch": 0.903387703889586,
+      "grad_norm": 0.6049775996861211,
+      "learning_rate": 5e-06,
+      "loss": 0.5828,
+      "step": 360
+    },
+    {
+      "epoch": 0.9284818067754078,
+      "grad_norm": 0.5343638273778424,
+      "learning_rate": 5e-06,
+      "loss": 0.5851,
+      "step": 370
+    },
+    {
+      "epoch": 0.9535759096612296,
+      "grad_norm": 0.5723040103769105,
+      "learning_rate": 5e-06,
+      "loss": 0.5912,
+      "step": 380
+    },
+    {
+      "epoch": 0.9786700125470514,
+      "grad_norm": 0.5959442881234533,
+      "learning_rate": 5e-06,
+      "loss": 0.5915,
+      "step": 390
+    },
+    {
+      "epoch": 0.998745294855709,
+      "eval_loss": 0.5871068239212036,
+      "eval_runtime": 212.6533,
+      "eval_samples_per_second": 50.477,
+      "eval_steps_per_second": 0.395,
+      "step": 398
+    },
+    {
+      "epoch": 1.0037641154328734,
+      "grad_norm": 0.5888812035860913,
+      "learning_rate": 5e-06,
+      "loss": 0.5826,
+      "step": 400
+    },
+    {
+      "epoch": 1.0288582183186952,
+      "grad_norm": 0.496571742956618,
+      "learning_rate": 5e-06,
+      "loss": 0.5559,
+      "step": 410
+    },
+    {
+      "epoch": 1.053952321204517,
+      "grad_norm": 0.6292730066506935,
+      "learning_rate": 5e-06,
+      "loss": 0.5513,
+      "step": 420
+    },
+    {
+      "epoch": 1.0790464240903388,
+      "grad_norm": 0.588641857548959,
+      "learning_rate": 5e-06,
+      "loss": 0.5507,
+      "step": 430
+    },
+    {
+      "epoch": 1.1041405269761606,
+      "grad_norm": 0.6769239280680559,
+      "learning_rate": 5e-06,
+      "loss": 0.5503,
+      "step": 440
+    },
+    {
+      "epoch": 1.1292346298619824,
+      "grad_norm": 0.5417691042097619,
+      "learning_rate": 5e-06,
+      "loss": 0.548,
+      "step": 450
+    },
+    {
+      "epoch": 1.1543287327478042,
+      "grad_norm": 0.5991902330546697,
+      "learning_rate": 5e-06,
+      "loss": 0.5384,
+      "step": 460
+    },
+    {
+      "epoch": 1.179422835633626,
+      "grad_norm": 0.5820599790063293,
+      "learning_rate": 5e-06,
+      "loss": 0.5487,
+      "step": 470
+    },
+    {
+      "epoch": 1.2045169385194479,
+      "grad_norm": 0.5519292876234776,
+      "learning_rate": 5e-06,
+      "loss": 0.5427,
+      "step": 480
+    },
+    {
+      "epoch": 1.2296110414052697,
+      "grad_norm": 0.49516603784025776,
+      "learning_rate": 5e-06,
+      "loss": 0.546,
+      "step": 490
+    },
+    {
+      "epoch": 1.2547051442910915,
+      "grad_norm": 0.5743423113593866,
+      "learning_rate": 5e-06,
+      "loss": 0.5393,
+      "step": 500
+    },
+    {
+      "epoch": 1.2797992471769133,
+      "grad_norm": 0.5535831554138544,
+      "learning_rate": 5e-06,
+      "loss": 0.547,
+      "step": 510
+    },
+    {
+      "epoch": 1.3048933500627353,
+      "grad_norm": 0.5141692394527988,
+      "learning_rate": 5e-06,
+      "loss": 0.5411,
+      "step": 520
+    },
+    {
+      "epoch": 1.3299874529485571,
+      "grad_norm": 0.4854378839815677,
+      "learning_rate": 5e-06,
+      "loss": 0.5442,
+      "step": 530
+    },
+    {
+      "epoch": 1.355081555834379,
+      "grad_norm": 0.46518640648830495,
+      "learning_rate": 5e-06,
+      "loss": 0.5427,
+      "step": 540
+    },
+    {
+      "epoch": 1.3801756587202008,
+      "grad_norm": 0.534031920116045,
+      "learning_rate": 5e-06,
+      "loss": 0.5417,
+      "step": 550
+    },
+    {
+      "epoch": 1.4052697616060226,
+      "grad_norm": 0.5911980981423581,
+      "learning_rate": 5e-06,
+      "loss": 0.5435,
+      "step": 560
+    },
+    {
+      "epoch": 1.4303638644918444,
+      "grad_norm": 0.5763366884781533,
+      "learning_rate": 5e-06,
+      "loss": 0.5435,
+      "step": 570
+    },
+    {
+      "epoch": 1.4554579673776662,
+      "grad_norm": 0.6044164192343925,
+      "learning_rate": 5e-06,
+      "loss": 0.5458,
+      "step": 580
+    },
+    {
+      "epoch": 1.480552070263488,
+      "grad_norm": 0.5136929530377343,
+      "learning_rate": 5e-06,
+      "loss": 0.5433,
+      "step": 590
+    },
+    {
+      "epoch": 1.50564617314931,
+      "grad_norm": 0.6480890499633274,
+      "learning_rate": 5e-06,
+      "loss": 0.5412,
+      "step": 600
+    },
+    {
+      "epoch": 1.5307402760351319,
+      "grad_norm": 0.5778909116986595,
+      "learning_rate": 5e-06,
+      "loss": 0.5366,
+      "step": 610
+    },
+    {
+      "epoch": 1.5558343789209537,
+      "grad_norm": 0.554397297340124,
+      "learning_rate": 5e-06,
+      "loss": 0.5424,
+      "step": 620
+    },
+    {
+      "epoch": 1.5809284818067755,
+      "grad_norm": 0.512796007100603,
+      "learning_rate": 5e-06,
+      "loss": 0.5408,
+      "step": 630
+    },
+    {
+      "epoch": 1.6060225846925973,
+      "grad_norm": 0.4758665935260214,
+      "learning_rate": 5e-06,
+      "loss": 0.5418,
+      "step": 640
+    },
+    {
+      "epoch": 1.631116687578419,
+      "grad_norm": 0.4968644389707275,
+      "learning_rate": 5e-06,
+      "loss": 0.535,
+      "step": 650
+    },
+    {
+      "epoch": 1.656210790464241,
+      "grad_norm": 0.5920011846841533,
+      "learning_rate": 5e-06,
+      "loss": 0.54,
+      "step": 660
+    },
+    {
+      "epoch": 1.6813048933500627,
+      "grad_norm": 0.5905169032972927,
+      "learning_rate": 5e-06,
+      "loss": 0.5381,
+      "step": 670
+    },
+    {
+      "epoch": 1.7063989962358845,
+      "grad_norm": 0.6296689735308911,
+      "learning_rate": 5e-06,
+      "loss": 0.5358,
+      "step": 680
+    },
+    {
+      "epoch": 1.7314930991217063,
+      "grad_norm": 0.47536620723191336,
+      "learning_rate": 5e-06,
+      "loss": 0.5379,
+      "step": 690
+    },
+    {
+      "epoch": 1.7565872020075282,
+      "grad_norm": 0.5683042454275448,
+      "learning_rate": 5e-06,
+      "loss": 0.5353,
+      "step": 700
+    },
+    {
+      "epoch": 1.78168130489335,
+      "grad_norm": 0.5865132456497922,
+      "learning_rate": 5e-06,
+      "loss": 0.5341,
+      "step": 710
+    },
+    {
+      "epoch": 1.8067754077791718,
+      "grad_norm": 0.5863809886365764,
+      "learning_rate": 5e-06,
+      "loss": 0.5385,
+      "step": 720
+    },
+    {
+      "epoch": 1.8318695106649936,
+      "grad_norm": 0.47137949702370147,
+      "learning_rate": 5e-06,
+      "loss": 0.5349,
+      "step": 730
+    },
+    {
+      "epoch": 1.8569636135508154,
+      "grad_norm": 0.5139487832974206,
+      "learning_rate": 5e-06,
+      "loss": 0.5351,
+      "step": 740
+    },
+    {
+      "epoch": 1.8820577164366374,
+      "grad_norm": 0.5728562358927324,
+      "learning_rate": 5e-06,
+      "loss": 0.5387,
+      "step": 750
+    },
+    {
+      "epoch": 1.9071518193224593,
+      "grad_norm": 0.6768594215912966,
+      "learning_rate": 5e-06,
+      "loss": 0.5364,
+      "step": 760
+    },
+    {
+      "epoch": 1.932245922208281,
+      "grad_norm": 0.5485345915469325,
+      "learning_rate": 5e-06,
+      "loss": 0.5394,
+      "step": 770
+    },
+    {
+      "epoch": 1.9573400250941029,
+      "grad_norm": 0.5148704568756698,
+      "learning_rate": 5e-06,
+      "loss": 0.5358,
+      "step": 780
+    },
+    {
+      "epoch": 1.9824341279799247,
+      "grad_norm": 0.48466535956929385,
+      "learning_rate": 5e-06,
+      "loss": 0.5304,
+      "step": 790
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.5621475577354431,
+      "eval_runtime": 213.3338,
+      "eval_samples_per_second": 50.316,
+      "eval_steps_per_second": 0.394,
+      "step": 797
+    },
+    {
+      "epoch": 2.0075282308657467,
+      "grad_norm": 0.6573006701729232,
+      "learning_rate": 5e-06,
+      "loss": 0.5167,
+      "step": 800
+    },
+    {
+      "epoch": 2.0326223337515685,
+      "grad_norm": 0.7871500030029548,
+      "learning_rate": 5e-06,
+      "loss": 0.4942,
+      "step": 810
+    },
+    {
+      "epoch": 2.0577164366373903,
+      "grad_norm": 0.5640563491273413,
+      "learning_rate": 5e-06,
+      "loss": 0.4956,
+      "step": 820
+    },
+    {
+      "epoch": 2.082810539523212,
+      "grad_norm": 0.7511306820222987,
+      "learning_rate": 5e-06,
+      "loss": 0.4938,
+      "step": 830
+    },
+    {
+      "epoch": 2.107904642409034,
+      "grad_norm": 0.5817278040608698,
+      "learning_rate": 5e-06,
+      "loss": 0.4925,
+      "step": 840
+    },
+    {
+      "epoch": 2.132998745294856,
+      "grad_norm": 0.5443042434696377,
+      "learning_rate": 5e-06,
+      "loss": 0.4924,
+      "step": 850
+    },
+    {
+      "epoch": 2.1580928481806776,
+      "grad_norm": 0.5307201281775036,
+      "learning_rate": 5e-06,
+      "loss": 0.5,
+      "step": 860
+    },
+    {
+      "epoch": 2.1831869510664994,
+      "grad_norm": 0.5134207088872962,
+      "learning_rate": 5e-06,
+      "loss": 0.4948,
+      "step": 870
+    },
+    {
+      "epoch": 2.208281053952321,
+      "grad_norm": 0.6926705508270693,
+      "learning_rate": 5e-06,
+      "loss": 0.497,
+      "step": 880
+    },
+    {
+      "epoch": 2.233375156838143,
+      "grad_norm": 0.6946700857047731,
+      "learning_rate": 5e-06,
+      "loss": 0.4922,
+      "step": 890
+    },
+    {
+      "epoch": 2.258469259723965,
+      "grad_norm": 0.5473927966647341,
+      "learning_rate": 5e-06,
+      "loss": 0.4992,
+      "step": 900
+    },
+    {
+      "epoch": 2.2835633626097867,
+      "grad_norm": 0.6500276609885082,
+      "learning_rate": 5e-06,
+      "loss": 0.4932,
+      "step": 910
+    },
+    {
+      "epoch": 2.3086574654956085,
+      "grad_norm": 0.5514076764002837,
+      "learning_rate": 5e-06,
+      "loss": 0.4915,
+      "step": 920
+    },
+    {
+      "epoch": 2.3337515683814303,
+      "grad_norm": 0.49943208945627254,
+      "learning_rate": 5e-06,
+      "loss": 0.4999,
+      "step": 930
+    },
+    {
+      "epoch": 2.358845671267252,
+      "grad_norm": 0.6060583960970398,
+      "learning_rate": 5e-06,
+      "loss": 0.4953,
+      "step": 940
+    },
+    {
+      "epoch": 2.383939774153074,
+      "grad_norm": 0.5320264242785274,
+      "learning_rate": 5e-06,
+      "loss": 0.4964,
+      "step": 950
+    },
+    {
+      "epoch": 2.4090338770388957,
+      "grad_norm": 0.5858837169303328,
+      "learning_rate": 5e-06,
+      "loss": 0.5006,
+      "step": 960
+    },
+    {
+      "epoch": 2.4341279799247175,
+      "grad_norm": 0.5441932891457427,
+      "learning_rate": 5e-06,
+      "loss": 0.4927,
+      "step": 970
+    },
+    {
+      "epoch": 2.4592220828105393,
+      "grad_norm": 0.49825625085166125,
+      "learning_rate": 5e-06,
+      "loss": 0.4986,
+      "step": 980
+    },
+    {
+      "epoch": 2.484316185696361,
+      "grad_norm": 0.5693348465052496,
+      "learning_rate": 5e-06,
+      "loss": 0.4923,
+      "step": 990
+    },
+    {
+      "epoch": 2.509410288582183,
+      "grad_norm": 0.5198895218070472,
+      "learning_rate": 5e-06,
+      "loss": 0.4961,
+      "step": 1000
+    },
+    {
+      "epoch": 2.5345043914680048,
+      "grad_norm": 0.5822359089459672,
+      "learning_rate": 5e-06,
+      "loss": 0.4948,
+      "step": 1010
+    },
+    {
+      "epoch": 2.5595984943538266,
+      "grad_norm": 0.7250677792140846,
+      "learning_rate": 5e-06,
+      "loss": 0.4988,
+      "step": 1020
+    },
+    {
+      "epoch": 2.584692597239649,
+      "grad_norm": 0.49974126038635164,
+      "learning_rate": 5e-06,
+      "loss": 0.4915,
+      "step": 1030
+    },
+    {
+      "epoch": 2.6097867001254706,
+      "grad_norm": 0.4964480117175315,
+      "learning_rate": 5e-06,
+      "loss": 0.4902,
+      "step": 1040
+    },
+    {
+      "epoch": 2.6348808030112925,
+      "grad_norm": 0.5288705472818386,
+      "learning_rate": 5e-06,
+      "loss": 0.497,
+      "step": 1050
+    },
+    {
+      "epoch": 2.6599749058971143,
+      "grad_norm": 0.4795064924818655,
+      "learning_rate": 5e-06,
+      "loss": 0.5,
+      "step": 1060
+    },
+    {
+      "epoch": 2.685069008782936,
+      "grad_norm": 0.5639285671354385,
+      "learning_rate": 5e-06,
+      "loss": 0.496,
+      "step": 1070
+    },
+    {
+      "epoch": 2.710163111668758,
+      "grad_norm": 0.4890610843848083,
+      "learning_rate": 5e-06,
+      "loss": 0.497,
+      "step": 1080
+    },
+    {
+      "epoch": 2.7352572145545797,
+      "grad_norm": 0.5718819316899499,
+      "learning_rate": 5e-06,
+      "loss": 0.4948,
+      "step": 1090
+    },
+    {
+      "epoch": 2.7603513174404015,
+      "grad_norm": 0.564322354041535,
+      "learning_rate": 5e-06,
+      "loss": 0.492,
+      "step": 1100
+    },
+    {
+      "epoch": 2.7854454203262233,
+      "grad_norm": 0.5775080175263282,
+      "learning_rate": 5e-06,
+      "loss": 0.4957,
+      "step": 1110
+    },
+    {
+      "epoch": 2.810539523212045,
+      "grad_norm": 0.553905770960013,
+      "learning_rate": 5e-06,
+      "loss": 0.4993,
+      "step": 1120
+    },
+    {
+      "epoch": 2.835633626097867,
+      "grad_norm": 0.5725357035362282,
+      "learning_rate": 5e-06,
+      "loss": 0.4972,
+      "step": 1130
+    },
+    {
+      "epoch": 2.8607277289836888,
+      "grad_norm": 0.5476057683573488,
+      "learning_rate": 5e-06,
+      "loss": 0.4928,
+      "step": 1140
+    },
+    {
+      "epoch": 2.8858218318695106,
+      "grad_norm": 0.49125993261554296,
+      "learning_rate": 5e-06,
+      "loss": 0.4908,
+      "step": 1150
+    },
+    {
+      "epoch": 2.9109159347553324,
+      "grad_norm": 0.530302463940866,
+      "learning_rate": 5e-06,
+      "loss": 0.4962,
+      "step": 1160
+    },
+    {
+      "epoch": 2.936010037641154,
+      "grad_norm": 0.47444202016080034,
+      "learning_rate": 5e-06,
+      "loss": 0.498,
+      "step": 1170
+    },
+    {
+      "epoch": 2.961104140526976,
+      "grad_norm": 0.49422829602754365,
+      "learning_rate": 5e-06,
+      "loss": 0.4885,
+      "step": 1180
+    },
+    {
+      "epoch": 2.9861982434127983,
+      "grad_norm": 0.47369939752959883,
+      "learning_rate": 5e-06,
+      "loss": 0.4907,
+      "step": 1190
+    },
+    {
+      "epoch": 2.9962358845671266,
+      "eval_loss": 0.555037796497345,
+      "eval_runtime": 214.1596,
+      "eval_samples_per_second": 50.121,
+      "eval_steps_per_second": 0.392,
+      "step": 1194
+    },
+    {
+      "epoch": 2.9962358845671266,
+      "step": 1194,
+      "total_flos": 1999575711744000.0,
+      "train_loss": 0.5572832309620664,
+      "train_runtime": 35786.5658,
+      "train_samples_per_second": 17.096,
+      "train_steps_per_second": 0.033
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1194,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1999575711744000.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed