{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9957446808510637,
  "eval_steps": 500,
  "global_step": 264,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11347517730496454,
      "grad_norm": 2.358112635127941,
      "learning_rate": 5e-06,
      "loss": 1.045,
      "step": 10
    },
    {
      "epoch": 0.22695035460992907,
      "grad_norm": 2.2025871190711044,
      "learning_rate": 5e-06,
      "loss": 0.939,
      "step": 20
    },
    {
      "epoch": 0.3404255319148936,
      "grad_norm": 1.183824809408691,
      "learning_rate": 5e-06,
      "loss": 0.8991,
      "step": 30
    },
    {
      "epoch": 0.45390070921985815,
      "grad_norm": 1.2551021476147783,
      "learning_rate": 5e-06,
      "loss": 0.8706,
      "step": 40
    },
    {
      "epoch": 0.5673758865248227,
      "grad_norm": 1.6978646589888085,
      "learning_rate": 5e-06,
      "loss": 0.8565,
      "step": 50
    },
    {
      "epoch": 0.6808510638297872,
      "grad_norm": 2.13778699873673,
      "learning_rate": 5e-06,
      "loss": 0.8386,
      "step": 60
    },
    {
      "epoch": 0.7943262411347518,
      "grad_norm": 0.993295806091264,
      "learning_rate": 5e-06,
      "loss": 0.8309,
      "step": 70
    },
    {
      "epoch": 0.9078014184397163,
      "grad_norm": 1.1398762974593635,
      "learning_rate": 5e-06,
      "loss": 0.823,
      "step": 80
    },
    {
      "epoch": 0.9985815602836879,
      "eval_loss": 0.8090236783027649,
      "eval_runtime": 63.3465,
      "eval_samples_per_second": 37.445,
      "eval_steps_per_second": 0.6,
      "step": 88
    },
    {
      "epoch": 1.0212765957446808,
      "grad_norm": 1.013500095467207,
      "learning_rate": 5e-06,
      "loss": 0.8742,
      "step": 90
    },
    {
      "epoch": 1.1347517730496455,
      "grad_norm": 1.056225323349834,
      "learning_rate": 5e-06,
      "loss": 0.7667,
      "step": 100
    },
    {
      "epoch": 1.24822695035461,
      "grad_norm": 0.7290196034792423,
      "learning_rate": 5e-06,
      "loss": 0.755,
      "step": 110
    },
    {
      "epoch": 1.3617021276595744,
      "grad_norm": 0.8838498260846974,
      "learning_rate": 5e-06,
      "loss": 0.7554,
      "step": 120
    },
    {
      "epoch": 1.475177304964539,
      "grad_norm": 0.821991213787815,
      "learning_rate": 5e-06,
      "loss": 0.7556,
      "step": 130
    },
    {
      "epoch": 1.5886524822695036,
      "grad_norm": 0.9855152726966359,
      "learning_rate": 5e-06,
      "loss": 0.7493,
      "step": 140
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 0.6490086567527167,
      "learning_rate": 5e-06,
      "loss": 0.7477,
      "step": 150
    },
    {
      "epoch": 1.8156028368794326,
      "grad_norm": 1.0694149262660388,
      "learning_rate": 5e-06,
      "loss": 0.7414,
      "step": 160
    },
    {
      "epoch": 1.9290780141843973,
      "grad_norm": 0.9645011140855406,
      "learning_rate": 5e-06,
      "loss": 0.7481,
      "step": 170
    },
    {
      "epoch": 1.9971631205673759,
      "eval_loss": 0.7898643016815186,
      "eval_runtime": 62.2492,
      "eval_samples_per_second": 38.105,
      "eval_steps_per_second": 0.61,
      "step": 176
    },
    {
      "epoch": 2.0425531914893615,
      "grad_norm": 1.581859190270789,
      "learning_rate": 5e-06,
      "loss": 0.7818,
      "step": 180
    },
    {
      "epoch": 2.1560283687943262,
      "grad_norm": 1.0466470957786433,
      "learning_rate": 5e-06,
      "loss": 0.6863,
      "step": 190
    },
    {
      "epoch": 2.269503546099291,
      "grad_norm": 0.9663026123669691,
      "learning_rate": 5e-06,
      "loss": 0.6798,
      "step": 200
    },
    {
      "epoch": 2.382978723404255,
      "grad_norm": 0.8243226264574698,
      "learning_rate": 5e-06,
      "loss": 0.6826,
      "step": 210
    },
    {
      "epoch": 2.49645390070922,
      "grad_norm": 1.0907354136557872,
      "learning_rate": 5e-06,
      "loss": 0.6839,
      "step": 220
    },
    {
      "epoch": 2.6099290780141846,
      "grad_norm": 0.7996806357479502,
      "learning_rate": 5e-06,
      "loss": 0.687,
      "step": 230
    },
    {
      "epoch": 2.723404255319149,
      "grad_norm": 0.9108831837931511,
      "learning_rate": 5e-06,
      "loss": 0.6902,
      "step": 240
    },
    {
      "epoch": 2.8368794326241136,
      "grad_norm": 0.8473372600949097,
      "learning_rate": 5e-06,
      "loss": 0.6873,
      "step": 250
    },
    {
      "epoch": 2.950354609929078,
      "grad_norm": 0.8244777156304377,
      "learning_rate": 5e-06,
      "loss": 0.6866,
      "step": 260
    },
    {
      "epoch": 2.9957446808510637,
      "eval_loss": 0.7845782041549683,
      "eval_runtime": 58.8988,
      "eval_samples_per_second": 40.272,
      "eval_steps_per_second": 0.645,
      "step": 264
    },
    {
      "epoch": 2.9957446808510637,
      "step": 264,
      "total_flos": 442000453140480.0,
      "train_loss": 0.7778021304896383,
      "train_runtime": 8882.1716,
      "train_samples_per_second": 15.22,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 10,
  "max_steps": 264,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 442000453140480.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}