{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.42103152724174225,
  "eval_steps": 500,
  "global_step": 21000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010024560172422436,
      "grad_norm": 0.551948070526123,
      "learning_rate": 2.9699260180042907e-05,
      "loss": 0.0661,
      "step": 500
    },
    {
      "epoch": 0.02004912034484487,
      "grad_norm": 0.3649902641773224,
      "learning_rate": 2.939852036008581e-05,
      "loss": 0.0601,
      "step": 1000
    },
    {
      "epoch": 0.030073680517267304,
      "grad_norm": 0.3754759430885315,
      "learning_rate": 2.909778054012872e-05,
      "loss": 0.0569,
      "step": 1500
    },
    {
      "epoch": 0.04009824068968974,
      "grad_norm": 0.4191639721393585,
      "learning_rate": 2.879704072017162e-05,
      "loss": 0.0568,
      "step": 2000
    },
    {
      "epoch": 0.050122800862112175,
      "grad_norm": 0.650822639465332,
      "learning_rate": 2.8496300900214528e-05,
      "loss": 0.0555,
      "step": 2500
    },
    {
      "epoch": 0.06014736103453461,
      "grad_norm": 0.3467857837677002,
      "learning_rate": 2.8195561080257434e-05,
      "loss": 0.0533,
      "step": 3000
    },
    {
      "epoch": 0.07017192120695705,
      "grad_norm": 0.4036310315132141,
      "learning_rate": 2.789482126030034e-05,
      "loss": 0.0528,
      "step": 3500
    },
    {
      "epoch": 0.08019648137937949,
      "grad_norm": 0.45284321904182434,
      "learning_rate": 2.7594081440343246e-05,
      "loss": 0.0517,
      "step": 4000
    },
    {
      "epoch": 0.09022104155180191,
      "grad_norm": 0.342809796333313,
      "learning_rate": 2.7293341620386152e-05,
      "loss": 0.0513,
      "step": 4500
    },
    {
      "epoch": 0.10024560172422435,
      "grad_norm": 0.2932626008987427,
      "learning_rate": 2.6992601800429055e-05,
      "loss": 0.0507,
      "step": 5000
    },
    {
      "epoch": 0.11027016189664679,
      "grad_norm": 0.355673223733902,
      "learning_rate": 2.669186198047196e-05,
      "loss": 0.0508,
      "step": 5500
    },
    {
      "epoch": 0.12029472206906922,
      "grad_norm": 0.3138273060321808,
      "learning_rate": 2.6391122160514867e-05,
      "loss": 0.0496,
      "step": 6000
    },
    {
      "epoch": 0.13031928224149164,
      "grad_norm": 0.44768887758255005,
      "learning_rate": 2.6090382340557773e-05,
      "loss": 0.0489,
      "step": 6500
    },
    {
      "epoch": 0.1403438424139141,
      "grad_norm": 0.34995996952056885,
      "learning_rate": 2.578964252060068e-05,
      "loss": 0.0489,
      "step": 7000
    },
    {
      "epoch": 0.15036840258633652,
      "grad_norm": 0.331546425819397,
      "learning_rate": 2.548890270064358e-05,
      "loss": 0.0482,
      "step": 7500
    },
    {
      "epoch": 0.16039296275875897,
      "grad_norm": 0.36192241311073303,
      "learning_rate": 2.518816288068649e-05,
      "loss": 0.0481,
      "step": 8000
    },
    {
      "epoch": 0.1704175229311814,
      "grad_norm": 0.3860616683959961,
      "learning_rate": 2.4887423060729397e-05,
      "loss": 0.0478,
      "step": 8500
    },
    {
      "epoch": 0.18044208310360382,
      "grad_norm": 0.2786683440208435,
      "learning_rate": 2.45866832407723e-05,
      "loss": 0.0473,
      "step": 9000
    },
    {
      "epoch": 0.19046664327602628,
      "grad_norm": 0.33059021830558777,
      "learning_rate": 2.4285943420815206e-05,
      "loss": 0.0474,
      "step": 9500
    },
    {
      "epoch": 0.2004912034484487,
      "grad_norm": 0.26813268661499023,
      "learning_rate": 2.398520360085811e-05,
      "loss": 0.0465,
      "step": 10000
    },
    {
      "epoch": 0.21051576362087113,
      "grad_norm": 0.29441842436790466,
      "learning_rate": 2.3684463780901018e-05,
      "loss": 0.0462,
      "step": 10500
    },
    {
      "epoch": 0.22054032379329358,
      "grad_norm": 0.35583028197288513,
      "learning_rate": 2.3383723960943924e-05,
      "loss": 0.0456,
      "step": 11000
    },
    {
      "epoch": 0.230564883965716,
      "grad_norm": 0.23940405249595642,
      "learning_rate": 2.3082984140986827e-05,
      "loss": 0.0456,
      "step": 11500
    },
    {
      "epoch": 0.24058944413813843,
      "grad_norm": 0.34972646832466125,
      "learning_rate": 2.2782244321029733e-05,
      "loss": 0.0456,
      "step": 12000
    },
    {
      "epoch": 0.25061400431056086,
      "grad_norm": 0.3413805663585663,
      "learning_rate": 2.248150450107264e-05,
      "loss": 0.045,
      "step": 12500
    },
    {
      "epoch": 0.2606385644829833,
      "grad_norm": 0.357909619808197,
      "learning_rate": 2.2180764681115545e-05,
      "loss": 0.0453,
      "step": 13000
    },
    {
      "epoch": 0.27066312465540576,
      "grad_norm": 0.28180328011512756,
      "learning_rate": 2.188002486115845e-05,
      "loss": 0.045,
      "step": 13500
    },
    {
      "epoch": 0.2806876848278282,
      "grad_norm": 0.2709687650203705,
      "learning_rate": 2.1579285041201354e-05,
      "loss": 0.0441,
      "step": 14000
    },
    {
      "epoch": 0.2907122450002506,
      "grad_norm": 0.2817750573158264,
      "learning_rate": 2.127854522124426e-05,
      "loss": 0.0447,
      "step": 14500
    },
    {
      "epoch": 0.30073680517267304,
      "grad_norm": 0.2984393835067749,
      "learning_rate": 2.097780540128717e-05,
      "loss": 0.0442,
      "step": 15000
    },
    {
      "epoch": 0.31076136534509546,
      "grad_norm": 0.25747916102409363,
      "learning_rate": 2.0677065581330072e-05,
      "loss": 0.0445,
      "step": 15500
    },
    {
      "epoch": 0.32078592551751794,
      "grad_norm": 0.37057626247406006,
      "learning_rate": 2.0376325761372978e-05,
      "loss": 0.044,
      "step": 16000
    },
    {
      "epoch": 0.33081048568994037,
      "grad_norm": 0.3557540774345398,
      "learning_rate": 2.0075585941415884e-05,
      "loss": 0.0434,
      "step": 16500
    },
    {
      "epoch": 0.3408350458623628,
      "grad_norm": 0.5469168424606323,
      "learning_rate": 1.9774846121458787e-05,
      "loss": 0.0429,
      "step": 17000
    },
    {
      "epoch": 0.3508596060347852,
      "grad_norm": 0.3066796064376831,
      "learning_rate": 1.9474106301501696e-05,
      "loss": 0.0432,
      "step": 17500
    },
    {
      "epoch": 0.36088416620720765,
      "grad_norm": 0.3197426497936249,
      "learning_rate": 1.91733664815446e-05,
      "loss": 0.0427,
      "step": 18000
    },
    {
      "epoch": 0.37090872637963007,
      "grad_norm": 0.2538721561431885,
      "learning_rate": 1.8872626661587505e-05,
      "loss": 0.0429,
      "step": 18500
    },
    {
      "epoch": 0.38093328655205255,
      "grad_norm": 0.26059648394584656,
      "learning_rate": 1.857188684163041e-05,
      "loss": 0.0429,
      "step": 19000
    },
    {
      "epoch": 0.390957846724475,
      "grad_norm": 0.3754318654537201,
      "learning_rate": 1.8271147021673317e-05,
      "loss": 0.043,
      "step": 19500
    },
    {
      "epoch": 0.4009824068968974,
      "grad_norm": 0.4147075116634369,
      "learning_rate": 1.7970407201716223e-05,
      "loss": 0.0422,
      "step": 20000
    },
    {
      "epoch": 0.41100696706931983,
      "grad_norm": 0.26918351650238037,
      "learning_rate": 1.766966738175913e-05,
      "loss": 0.0424,
      "step": 20500
    },
    {
      "epoch": 0.42103152724174225,
      "grad_norm": 0.35338133573532104,
      "learning_rate": 1.7368927561802032e-05,
      "loss": 0.0414,
      "step": 21000
    }
  ],
  "logging_steps": 500,
  "max_steps": 49877,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}