Training in progress, step 2036, checkpoint

Browse files

Files changed (8) hide show

last-checkpoint/config.json +28 -0
last-checkpoint/generation_config.json +12 -0
last-checkpoint/model.safetensors +3 -0
last-checkpoint/optimizer.pt +3 -0
last-checkpoint/rng_state.pth +3 -0
last-checkpoint/scheduler.pt +3 -0
last-checkpoint/trainer_state.json +1455 -0
last-checkpoint/training_args.bin +3 -0

last-checkpoint/config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5504,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 16,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": 32768,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

last-checkpoint/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "top_p": 0.8,
+  "transformers_version": "4.51.3"
+}

last-checkpoint/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e02ee0e3091a12bc5abb031e386a74a22cb9bdc622a9842d1c9722a565b8c7c1
+size 3673690696

last-checkpoint/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb86ff67aa4c157e4b6eff3a0004aa92b27aa127bec34b95e5c548603ce0a291
+size 7347565686

last-checkpoint/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
+size 14244

last-checkpoint/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09933cc0b85c7ce3d86d14a8c13caccc68575168a9217284ecd221f1cc79411a
+size 1064

last-checkpoint/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1455 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 2036,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.004911591355599214,
+      "grad_norm": 4.0625,
+      "learning_rate": 4.977897838899804e-05,
+      "loss": 2.9953,
+      "step": 10
+    },
+    {
+      "epoch": 0.009823182711198428,
+      "grad_norm": 1.6328125,
+      "learning_rate": 4.953339882121808e-05,
+      "loss": 0.0935,
+      "step": 20
+    },
+    {
+      "epoch": 0.014734774066797643,
+      "grad_norm": 1.578125,
+      "learning_rate": 4.928781925343812e-05,
+      "loss": 0.073,
+      "step": 30
+    },
+    {
+      "epoch": 0.019646365422396856,
+      "grad_norm": 1.2890625,
+      "learning_rate": 4.904223968565815e-05,
+      "loss": 0.0679,
+      "step": 40
+    },
+    {
+      "epoch": 0.02455795677799607,
+      "grad_norm": 1.5078125,
+      "learning_rate": 4.87966601178782e-05,
+      "loss": 0.055,
+      "step": 50
+    },
+    {
+      "epoch": 0.029469548133595286,
+      "grad_norm": 1.234375,
+      "learning_rate": 4.855108055009823e-05,
+      "loss": 0.0542,
+      "step": 60
+    },
+    {
+      "epoch": 0.0343811394891945,
+      "grad_norm": 0.984375,
+      "learning_rate": 4.830550098231827e-05,
+      "loss": 0.0458,
+      "step": 70
+    },
+    {
+      "epoch": 0.03929273084479371,
+      "grad_norm": 0.95703125,
+      "learning_rate": 4.805992141453831e-05,
+      "loss": 0.0485,
+      "step": 80
+    },
+    {
+      "epoch": 0.04420432220039293,
+      "grad_norm": 1.21875,
+      "learning_rate": 4.781434184675835e-05,
+      "loss": 0.0563,
+      "step": 90
+    },
+    {
+      "epoch": 0.04911591355599214,
+      "grad_norm": 0.85546875,
+      "learning_rate": 4.756876227897839e-05,
+      "loss": 0.0525,
+      "step": 100
+    },
+    {
+      "epoch": 0.054027504911591355,
+      "grad_norm": 1.09375,
+      "learning_rate": 4.732318271119843e-05,
+      "loss": 0.0501,
+      "step": 110
+    },
+    {
+      "epoch": 0.05893909626719057,
+      "grad_norm": 0.94921875,
+      "learning_rate": 4.7077603143418466e-05,
+      "loss": 0.0466,
+      "step": 120
+    },
+    {
+      "epoch": 0.06385068762278978,
+      "grad_norm": 0.90625,
+      "learning_rate": 4.683202357563851e-05,
+      "loss": 0.0472,
+      "step": 130
+    },
+    {
+      "epoch": 0.068762278978389,
+      "grad_norm": 1.1015625,
+      "learning_rate": 4.658644400785855e-05,
+      "loss": 0.0432,
+      "step": 140
+    },
+    {
+      "epoch": 0.07367387033398821,
+      "grad_norm": 1.1015625,
+      "learning_rate": 4.634086444007859e-05,
+      "loss": 0.0497,
+      "step": 150
+    },
+    {
+      "epoch": 0.07858546168958742,
+      "grad_norm": 1.3125,
+      "learning_rate": 4.609528487229863e-05,
+      "loss": 0.0406,
+      "step": 160
+    },
+    {
+      "epoch": 0.08349705304518663,
+      "grad_norm": 1.1015625,
+      "learning_rate": 4.584970530451866e-05,
+      "loss": 0.048,
+      "step": 170
+    },
+    {
+      "epoch": 0.08840864440078586,
+      "grad_norm": 0.875,
+      "learning_rate": 4.560412573673871e-05,
+      "loss": 0.0437,
+      "step": 180
+    },
+    {
+      "epoch": 0.09332023575638507,
+      "grad_norm": 0.8515625,
+      "learning_rate": 4.535854616895874e-05,
+      "loss": 0.0478,
+      "step": 190
+    },
+    {
+      "epoch": 0.09823182711198428,
+      "grad_norm": 0.8359375,
+      "learning_rate": 4.511296660117879e-05,
+      "loss": 0.0373,
+      "step": 200
+    },
+    {
+      "epoch": 0.1031434184675835,
+      "grad_norm": 0.6171875,
+      "learning_rate": 4.486738703339882e-05,
+      "loss": 0.0446,
+      "step": 210
+    },
+    {
+      "epoch": 0.10805500982318271,
+      "grad_norm": 0.81640625,
+      "learning_rate": 4.462180746561886e-05,
+      "loss": 0.0414,
+      "step": 220
+    },
+    {
+      "epoch": 0.11296660117878192,
+      "grad_norm": 0.70703125,
+      "learning_rate": 4.43762278978389e-05,
+      "loss": 0.0429,
+      "step": 230
+    },
+    {
+      "epoch": 0.11787819253438114,
+      "grad_norm": 1.046875,
+      "learning_rate": 4.413064833005894e-05,
+      "loss": 0.0386,
+      "step": 240
+    },
+    {
+      "epoch": 0.12278978388998035,
+      "grad_norm": 1.015625,
+      "learning_rate": 4.388506876227898e-05,
+      "loss": 0.0404,
+      "step": 250
+    },
+    {
+      "epoch": 0.12770137524557956,
+      "grad_norm": 1.0390625,
+      "learning_rate": 4.3639489194499023e-05,
+      "loss": 0.037,
+      "step": 260
+    },
+    {
+      "epoch": 0.13261296660117877,
+      "grad_norm": 0.9140625,
+      "learning_rate": 4.339390962671906e-05,
+      "loss": 0.0389,
+      "step": 270
+    },
+    {
+      "epoch": 0.137524557956778,
+      "grad_norm": 0.97265625,
+      "learning_rate": 4.31483300589391e-05,
+      "loss": 0.0416,
+      "step": 280
+    },
+    {
+      "epoch": 0.14243614931237722,
+      "grad_norm": 1.7421875,
+      "learning_rate": 4.290275049115914e-05,
+      "loss": 0.0377,
+      "step": 290
+    },
+    {
+      "epoch": 0.14734774066797643,
+      "grad_norm": 0.6015625,
+      "learning_rate": 4.265717092337918e-05,
+      "loss": 0.0327,
+      "step": 300
+    },
+    {
+      "epoch": 0.15225933202357564,
+      "grad_norm": 1.015625,
+      "learning_rate": 4.241159135559922e-05,
+      "loss": 0.0377,
+      "step": 310
+    },
+    {
+      "epoch": 0.15717092337917485,
+      "grad_norm": 0.87109375,
+      "learning_rate": 4.216601178781925e-05,
+      "loss": 0.038,
+      "step": 320
+    },
+    {
+      "epoch": 0.16208251473477406,
+      "grad_norm": 0.65625,
+      "learning_rate": 4.19204322200393e-05,
+      "loss": 0.0381,
+      "step": 330
+    },
+    {
+      "epoch": 0.16699410609037327,
+      "grad_norm": 0.6953125,
+      "learning_rate": 4.167485265225933e-05,
+      "loss": 0.0331,
+      "step": 340
+    },
+    {
+      "epoch": 0.1719056974459725,
+      "grad_norm": 0.76171875,
+      "learning_rate": 4.142927308447937e-05,
+      "loss": 0.0374,
+      "step": 350
+    },
+    {
+      "epoch": 0.17681728880157171,
+      "grad_norm": 0.90234375,
+      "learning_rate": 4.118369351669941e-05,
+      "loss": 0.0389,
+      "step": 360
+    },
+    {
+      "epoch": 0.18172888015717092,
+      "grad_norm": 1.1015625,
+      "learning_rate": 4.093811394891945e-05,
+      "loss": 0.0443,
+      "step": 370
+    },
+    {
+      "epoch": 0.18664047151277013,
+      "grad_norm": 0.6484375,
+      "learning_rate": 4.069253438113949e-05,
+      "loss": 0.0374,
+      "step": 380
+    },
+    {
+      "epoch": 0.19155206286836934,
+      "grad_norm": 0.47265625,
+      "learning_rate": 4.044695481335953e-05,
+      "loss": 0.0232,
+      "step": 390
+    },
+    {
+      "epoch": 0.19646365422396855,
+      "grad_norm": 0.92578125,
+      "learning_rate": 4.020137524557957e-05,
+      "loss": 0.043,
+      "step": 400
+    },
+    {
+      "epoch": 0.2013752455795678,
+      "grad_norm": 0.8125,
+      "learning_rate": 3.995579567779961e-05,
+      "loss": 0.0344,
+      "step": 410
+    },
+    {
+      "epoch": 0.206286836935167,
+      "grad_norm": 0.828125,
+      "learning_rate": 3.971021611001965e-05,
+      "loss": 0.0271,
+      "step": 420
+    },
+    {
+      "epoch": 0.2111984282907662,
+      "grad_norm": 0.953125,
+      "learning_rate": 3.946463654223969e-05,
+      "loss": 0.0378,
+      "step": 430
+    },
+    {
+      "epoch": 0.21611001964636542,
+      "grad_norm": 1.0,
+      "learning_rate": 3.921905697445973e-05,
+      "loss": 0.0356,
+      "step": 440
+    },
+    {
+      "epoch": 0.22102161100196463,
+      "grad_norm": 0.6953125,
+      "learning_rate": 3.897347740667976e-05,
+      "loss": 0.0336,
+      "step": 450
+    },
+    {
+      "epoch": 0.22593320235756384,
+      "grad_norm": 0.89453125,
+      "learning_rate": 3.872789783889981e-05,
+      "loss": 0.0334,
+      "step": 460
+    },
+    {
+      "epoch": 0.23084479371316308,
+      "grad_norm": 0.5390625,
+      "learning_rate": 3.848231827111984e-05,
+      "loss": 0.0316,
+      "step": 470
+    },
+    {
+      "epoch": 0.2357563850687623,
+      "grad_norm": 0.7421875,
+      "learning_rate": 3.823673870333989e-05,
+      "loss": 0.037,
+      "step": 480
+    },
+    {
+      "epoch": 0.2406679764243615,
+      "grad_norm": 0.65234375,
+      "learning_rate": 3.799115913555992e-05,
+      "loss": 0.0336,
+      "step": 490
+    },
+    {
+      "epoch": 0.2455795677799607,
+      "grad_norm": 0.67578125,
+      "learning_rate": 3.774557956777996e-05,
+      "loss": 0.0316,
+      "step": 500
+    },
+    {
+      "epoch": 0.2504911591355599,
+      "grad_norm": 0.89453125,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 0.0287,
+      "step": 510
+    },
+    {
+      "epoch": 0.2554027504911591,
+      "grad_norm": 0.58984375,
+      "learning_rate": 3.725442043222004e-05,
+      "loss": 0.0353,
+      "step": 520
+    },
+    {
+      "epoch": 0.26031434184675833,
+      "grad_norm": 0.828125,
+      "learning_rate": 3.7008840864440084e-05,
+      "loss": 0.0323,
+      "step": 530
+    },
+    {
+      "epoch": 0.26522593320235754,
+      "grad_norm": 0.640625,
+      "learning_rate": 3.676326129666012e-05,
+      "loss": 0.0372,
+      "step": 540
+    },
+    {
+      "epoch": 0.27013752455795675,
+      "grad_norm": 0.8671875,
+      "learning_rate": 3.651768172888016e-05,
+      "loss": 0.0267,
+      "step": 550
+    },
+    {
+      "epoch": 0.275049115913556,
+      "grad_norm": 1.0859375,
+      "learning_rate": 3.62721021611002e-05,
+      "loss": 0.0304,
+      "step": 560
+    },
+    {
+      "epoch": 0.27996070726915523,
+      "grad_norm": 0.5859375,
+      "learning_rate": 3.602652259332024e-05,
+      "loss": 0.0326,
+      "step": 570
+    },
+    {
+      "epoch": 0.28487229862475444,
+      "grad_norm": 0.62890625,
+      "learning_rate": 3.578094302554028e-05,
+      "loss": 0.0403,
+      "step": 580
+    },
+    {
+      "epoch": 0.28978388998035365,
+      "grad_norm": 0.8515625,
+      "learning_rate": 3.553536345776032e-05,
+      "loss": 0.0306,
+      "step": 590
+    },
+    {
+      "epoch": 0.29469548133595286,
+      "grad_norm": 0.78515625,
+      "learning_rate": 3.528978388998035e-05,
+      "loss": 0.028,
+      "step": 600
+    },
+    {
+      "epoch": 0.29960707269155207,
+      "grad_norm": 0.578125,
+      "learning_rate": 3.50442043222004e-05,
+      "loss": 0.029,
+      "step": 610
+    },
+    {
+      "epoch": 0.3045186640471513,
+      "grad_norm": 0.6875,
+      "learning_rate": 3.479862475442043e-05,
+      "loss": 0.0299,
+      "step": 620
+    },
+    {
+      "epoch": 0.3094302554027505,
+      "grad_norm": 0.94140625,
+      "learning_rate": 3.455304518664047e-05,
+      "loss": 0.0293,
+      "step": 630
+    },
+    {
+      "epoch": 0.3143418467583497,
+      "grad_norm": 0.84375,
+      "learning_rate": 3.4307465618860514e-05,
+      "loss": 0.0288,
+      "step": 640
+    },
+    {
+      "epoch": 0.3192534381139489,
+      "grad_norm": 0.462890625,
+      "learning_rate": 3.406188605108055e-05,
+      "loss": 0.0272,
+      "step": 650
+    },
+    {
+      "epoch": 0.3241650294695481,
+      "grad_norm": 1.484375,
+      "learning_rate": 3.3816306483300594e-05,
+      "loss": 0.0392,
+      "step": 660
+    },
+    {
+      "epoch": 0.3290766208251473,
+      "grad_norm": 0.6640625,
+      "learning_rate": 3.357072691552063e-05,
+      "loss": 0.032,
+      "step": 670
+    },
+    {
+      "epoch": 0.33398821218074654,
+      "grad_norm": 1.109375,
+      "learning_rate": 3.332514734774067e-05,
+      "loss": 0.0388,
+      "step": 680
+    },
+    {
+      "epoch": 0.3388998035363458,
+      "grad_norm": 1.390625,
+      "learning_rate": 3.307956777996071e-05,
+      "loss": 0.03,
+      "step": 690
+    },
+    {
+      "epoch": 0.343811394891945,
+      "grad_norm": 0.80859375,
+      "learning_rate": 3.283398821218075e-05,
+      "loss": 0.0316,
+      "step": 700
+    },
+    {
+      "epoch": 0.3487229862475442,
+      "grad_norm": 0.6796875,
+      "learning_rate": 3.258840864440079e-05,
+      "loss": 0.037,
+      "step": 710
+    },
+    {
+      "epoch": 0.35363457760314343,
+      "grad_norm": 0.78125,
+      "learning_rate": 3.234282907662083e-05,
+      "loss": 0.0302,
+      "step": 720
+    },
+    {
+      "epoch": 0.35854616895874264,
+      "grad_norm": 0.5546875,
+      "learning_rate": 3.209724950884086e-05,
+      "loss": 0.0294,
+      "step": 730
+    },
+    {
+      "epoch": 0.36345776031434185,
+      "grad_norm": 0.60546875,
+      "learning_rate": 3.185166994106091e-05,
+      "loss": 0.0303,
+      "step": 740
+    },
+    {
+      "epoch": 0.36836935166994106,
+      "grad_norm": 1.125,
+      "learning_rate": 3.160609037328094e-05,
+      "loss": 0.0304,
+      "step": 750
+    },
+    {
+      "epoch": 0.37328094302554027,
+      "grad_norm": 0.609375,
+      "learning_rate": 3.1360510805500984e-05,
+      "loss": 0.0303,
+      "step": 760
+    },
+    {
+      "epoch": 0.3781925343811395,
+      "grad_norm": 0.9609375,
+      "learning_rate": 3.1114931237721024e-05,
+      "loss": 0.0323,
+      "step": 770
+    },
+    {
+      "epoch": 0.3831041257367387,
+      "grad_norm": 0.796875,
+      "learning_rate": 3.086935166994106e-05,
+      "loss": 0.0324,
+      "step": 780
+    },
+    {
+      "epoch": 0.3880157170923379,
+      "grad_norm": 0.7421875,
+      "learning_rate": 3.0623772102161104e-05,
+      "loss": 0.0318,
+      "step": 790
+    },
+    {
+      "epoch": 0.3929273084479371,
+      "grad_norm": 0.6484375,
+      "learning_rate": 3.0378192534381138e-05,
+      "loss": 0.0295,
+      "step": 800
+    },
+    {
+      "epoch": 0.39783889980353637,
+      "grad_norm": 0.6484375,
+      "learning_rate": 3.013261296660118e-05,
+      "loss": 0.0267,
+      "step": 810
+    },
+    {
+      "epoch": 0.4027504911591356,
+      "grad_norm": 1.0234375,
+      "learning_rate": 2.988703339882122e-05,
+      "loss": 0.0324,
+      "step": 820
+    },
+    {
+      "epoch": 0.4076620825147348,
+      "grad_norm": 0.81640625,
+      "learning_rate": 2.964145383104126e-05,
+      "loss": 0.027,
+      "step": 830
+    },
+    {
+      "epoch": 0.412573673870334,
+      "grad_norm": 0.609375,
+      "learning_rate": 2.9395874263261296e-05,
+      "loss": 0.0294,
+      "step": 840
+    },
+    {
+      "epoch": 0.4174852652259332,
+      "grad_norm": 0.625,
+      "learning_rate": 2.915029469548134e-05,
+      "loss": 0.0265,
+      "step": 850
+    },
+    {
+      "epoch": 0.4223968565815324,
+      "grad_norm": 0.6640625,
+      "learning_rate": 2.8904715127701376e-05,
+      "loss": 0.027,
+      "step": 860
+    },
+    {
+      "epoch": 0.42730844793713163,
+      "grad_norm": 0.4921875,
+      "learning_rate": 2.865913555992142e-05,
+      "loss": 0.0308,
+      "step": 870
+    },
+    {
+      "epoch": 0.43222003929273084,
+      "grad_norm": 0.57421875,
+      "learning_rate": 2.8413555992141457e-05,
+      "loss": 0.0312,
+      "step": 880
+    },
+    {
+      "epoch": 0.43713163064833005,
+      "grad_norm": 0.8984375,
+      "learning_rate": 2.816797642436149e-05,
+      "loss": 0.0297,
+      "step": 890
+    },
+    {
+      "epoch": 0.44204322200392926,
+      "grad_norm": 0.5703125,
+      "learning_rate": 2.7922396856581534e-05,
+      "loss": 0.0308,
+      "step": 900
+    },
+    {
+      "epoch": 0.44695481335952847,
+      "grad_norm": 0.8203125,
+      "learning_rate": 2.767681728880157e-05,
+      "loss": 0.0307,
+      "step": 910
+    },
+    {
+      "epoch": 0.4518664047151277,
+      "grad_norm": 0.6015625,
+      "learning_rate": 2.7431237721021615e-05,
+      "loss": 0.031,
+      "step": 920
+    },
+    {
+      "epoch": 0.4567779960707269,
+      "grad_norm": 0.609375,
+      "learning_rate": 2.718565815324165e-05,
+      "loss": 0.0388,
+      "step": 930
+    },
+    {
+      "epoch": 0.46168958742632615,
+      "grad_norm": 0.578125,
+      "learning_rate": 2.6940078585461692e-05,
+      "loss": 0.0293,
+      "step": 940
+    },
+    {
+      "epoch": 0.46660117878192536,
+      "grad_norm": 0.90234375,
+      "learning_rate": 2.669449901768173e-05,
+      "loss": 0.0339,
+      "step": 950
+    },
+    {
+      "epoch": 0.4715127701375246,
+      "grad_norm": 0.5625,
+      "learning_rate": 2.6448919449901772e-05,
+      "loss": 0.0322,
+      "step": 960
+    },
+    {
+      "epoch": 0.4764243614931238,
+      "grad_norm": 0.5234375,
+      "learning_rate": 2.620333988212181e-05,
+      "loss": 0.0316,
+      "step": 970
+    },
+    {
+      "epoch": 0.481335952848723,
+      "grad_norm": 0.7734375,
+      "learning_rate": 2.595776031434185e-05,
+      "loss": 0.0329,
+      "step": 980
+    },
+    {
+      "epoch": 0.4862475442043222,
+      "grad_norm": 0.58984375,
+      "learning_rate": 2.5712180746561886e-05,
+      "loss": 0.0251,
+      "step": 990
+    },
+    {
+      "epoch": 0.4911591355599214,
+      "grad_norm": 0.9921875,
+      "learning_rate": 2.546660117878193e-05,
+      "loss": 0.0256,
+      "step": 1000
+    },
+    {
+      "epoch": 0.4960707269155206,
+      "grad_norm": 1.1015625,
+      "learning_rate": 2.5221021611001967e-05,
+      "loss": 0.0294,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5009823182711198,
+      "grad_norm": 0.6484375,
+      "learning_rate": 2.4975442043222004e-05,
+      "loss": 0.0251,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5058939096267191,
+      "grad_norm": 0.55859375,
+      "learning_rate": 2.4729862475442044e-05,
+      "loss": 0.0275,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5108055009823183,
+      "grad_norm": 0.5546875,
+      "learning_rate": 2.4484282907662084e-05,
+      "loss": 0.0261,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5157170923379175,
+      "grad_norm": 0.72265625,
+      "learning_rate": 2.4238703339882125e-05,
+      "loss": 0.0292,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5206286836935167,
+      "grad_norm": 0.640625,
+      "learning_rate": 2.3993123772102165e-05,
+      "loss": 0.0252,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5255402750491159,
+      "grad_norm": 0.671875,
+      "learning_rate": 2.37475442043222e-05,
+      "loss": 0.0305,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5304518664047151,
+      "grad_norm": 0.74609375,
+      "learning_rate": 2.350196463654224e-05,
+      "loss": 0.029,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5353634577603144,
+      "grad_norm": 0.62109375,
+      "learning_rate": 2.325638506876228e-05,
+      "loss": 0.0306,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5402750491159135,
+      "grad_norm": 0.5078125,
+      "learning_rate": 2.301080550098232e-05,
+      "loss": 0.0295,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5451866404715128,
+      "grad_norm": 0.55859375,
+      "learning_rate": 2.276522593320236e-05,
+      "loss": 0.0244,
+      "step": 1110
+    },
+    {
+      "epoch": 0.550098231827112,
+      "grad_norm": 0.7890625,
+      "learning_rate": 2.2519646365422397e-05,
+      "loss": 0.0261,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5550098231827112,
+      "grad_norm": 0.59375,
+      "learning_rate": 2.2274066797642437e-05,
+      "loss": 0.0227,
+      "step": 1130
+    },
+    {
+      "epoch": 0.5599214145383105,
+      "grad_norm": 0.44140625,
+      "learning_rate": 2.2028487229862477e-05,
+      "loss": 0.0244,
+      "step": 1140
+    },
+    {
+      "epoch": 0.5648330058939096,
+      "grad_norm": 0.63671875,
+      "learning_rate": 2.1782907662082517e-05,
+      "loss": 0.0235,
+      "step": 1150
+    },
+    {
+      "epoch": 0.5697445972495089,
+      "grad_norm": 0.60546875,
+      "learning_rate": 2.1537328094302554e-05,
+      "loss": 0.0297,
+      "step": 1160
+    },
+    {
+      "epoch": 0.574656188605108,
+      "grad_norm": 0.70703125,
+      "learning_rate": 2.1291748526522595e-05,
+      "loss": 0.0267,
+      "step": 1170
+    },
+    {
+      "epoch": 0.5795677799607073,
+      "grad_norm": 1.03125,
+      "learning_rate": 2.1046168958742635e-05,
+      "loss": 0.0276,
+      "step": 1180
+    },
+    {
+      "epoch": 0.5844793713163065,
+      "grad_norm": 0.5390625,
+      "learning_rate": 2.0800589390962675e-05,
+      "loss": 0.0281,
+      "step": 1190
+    },
+    {
+      "epoch": 0.5893909626719057,
+      "grad_norm": 0.6796875,
+      "learning_rate": 2.0555009823182712e-05,
+      "loss": 0.0306,
+      "step": 1200
+    },
+    {
+      "epoch": 0.5943025540275049,
+      "grad_norm": 0.46875,
+      "learning_rate": 2.030943025540275e-05,
+      "loss": 0.0253,
+      "step": 1210
+    },
+    {
+      "epoch": 0.5992141453831041,
+      "grad_norm": 0.625,
+      "learning_rate": 2.006385068762279e-05,
+      "loss": 0.0258,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6041257367387033,
+      "grad_norm": 0.67578125,
+      "learning_rate": 1.981827111984283e-05,
+      "loss": 0.0275,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6090373280943026,
+      "grad_norm": 0.78515625,
+      "learning_rate": 1.957269155206287e-05,
+      "loss": 0.0275,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6139489194499018,
+      "grad_norm": 0.8125,
+      "learning_rate": 1.932711198428291e-05,
+      "loss": 0.0304,
+      "step": 1250
+    },
+    {
+      "epoch": 0.618860510805501,
+      "grad_norm": 0.703125,
+      "learning_rate": 1.9081532416502947e-05,
+      "loss": 0.0234,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6237721021611002,
+      "grad_norm": 0.77734375,
+      "learning_rate": 1.8835952848722987e-05,
+      "loss": 0.0315,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6286836935166994,
+      "grad_norm": 0.82421875,
+      "learning_rate": 1.8590373280943028e-05,
+      "loss": 0.0293,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6335952848722987,
+      "grad_norm": 0.4609375,
+      "learning_rate": 1.8344793713163068e-05,
+      "loss": 0.0261,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6385068762278978,
+      "grad_norm": 0.85546875,
+      "learning_rate": 1.8099214145383105e-05,
+      "loss": 0.0264,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6434184675834971,
+      "grad_norm": 0.73828125,
+      "learning_rate": 1.7853634577603145e-05,
+      "loss": 0.0283,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6483300589390962,
+      "grad_norm": 1.0625,
+      "learning_rate": 1.7608055009823182e-05,
+      "loss": 0.0244,
+      "step": 1320
+    },
+    {
+      "epoch": 0.6532416502946955,
+      "grad_norm": 0.8203125,
+      "learning_rate": 1.7362475442043222e-05,
+      "loss": 0.0249,
+      "step": 1330
+    },
+    {
+      "epoch": 0.6581532416502947,
+      "grad_norm": 0.765625,
+      "learning_rate": 1.7116895874263263e-05,
+      "loss": 0.0246,
+      "step": 1340
+    },
+    {
+      "epoch": 0.6630648330058939,
+      "grad_norm": 0.77734375,
+      "learning_rate": 1.68713163064833e-05,
+      "loss": 0.0268,
+      "step": 1350
+    },
+    {
+      "epoch": 0.6679764243614931,
+      "grad_norm": 0.5390625,
+      "learning_rate": 1.662573673870334e-05,
+      "loss": 0.025,
+      "step": 1360
+    },
+    {
+      "epoch": 0.6728880157170923,
+      "grad_norm": 0.82421875,
+      "learning_rate": 1.638015717092338e-05,
+      "loss": 0.0209,
+      "step": 1370
+    },
+    {
+      "epoch": 0.6777996070726916,
+      "grad_norm": 0.5546875,
+      "learning_rate": 1.613457760314342e-05,
+      "loss": 0.0264,
+      "step": 1380
+    },
+    {
+      "epoch": 0.6827111984282908,
+      "grad_norm": 1.0703125,
+      "learning_rate": 1.5888998035363457e-05,
+      "loss": 0.0227,
+      "step": 1390
+    },
+    {
+      "epoch": 0.68762278978389,
+      "grad_norm": 0.66015625,
+      "learning_rate": 1.5643418467583497e-05,
+      "loss": 0.0313,
+      "step": 1400
+    },
+    {
+      "epoch": 0.6925343811394892,
+      "grad_norm": 0.7578125,
+      "learning_rate": 1.5397838899803538e-05,
+      "loss": 0.0242,
+      "step": 1410
+    },
+    {
+      "epoch": 0.6974459724950884,
+      "grad_norm": 0.6796875,
+      "learning_rate": 1.5152259332023578e-05,
+      "loss": 0.0274,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7023575638506876,
+      "grad_norm": 0.80078125,
+      "learning_rate": 1.4906679764243617e-05,
+      "loss": 0.0235,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7072691552062869,
+      "grad_norm": 0.66015625,
+      "learning_rate": 1.4661100196463657e-05,
+      "loss": 0.0259,
+      "step": 1440
+    },
+    {
+      "epoch": 0.712180746561886,
+      "grad_norm": 0.453125,
+      "learning_rate": 1.4415520628683694e-05,
+      "loss": 0.0246,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7170923379174853,
+      "grad_norm": 0.71484375,
+      "learning_rate": 1.4169941060903732e-05,
+      "loss": 0.0275,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7220039292730844,
+      "grad_norm": 1.1015625,
+      "learning_rate": 1.3924361493123773e-05,
+      "loss": 0.0308,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7269155206286837,
+      "grad_norm": 0.9765625,
+      "learning_rate": 1.3678781925343811e-05,
+      "loss": 0.029,
+      "step": 1480
+    },
+    {
+      "epoch": 0.731827111984283,
+      "grad_norm": 1.7578125,
+      "learning_rate": 1.3433202357563852e-05,
+      "loss": 0.0284,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7367387033398821,
+      "grad_norm": 0.7265625,
+      "learning_rate": 1.318762278978389e-05,
+      "loss": 0.0253,
+      "step": 1500
+    },
+    {
+      "epoch": 0.7416502946954814,
+      "grad_norm": 0.8515625,
+      "learning_rate": 1.294204322200393e-05,
+      "loss": 0.028,
+      "step": 1510
+    },
+    {
+      "epoch": 0.7465618860510805,
+      "grad_norm": 0.53515625,
+      "learning_rate": 1.2696463654223969e-05,
+      "loss": 0.0292,
+      "step": 1520
+    },
+    {
+      "epoch": 0.7514734774066798,
+      "grad_norm": 0.62890625,
+      "learning_rate": 1.245088408644401e-05,
+      "loss": 0.0259,
+      "step": 1530
+    },
+    {
+      "epoch": 0.756385068762279,
+      "grad_norm": 0.6171875,
+      "learning_rate": 1.2205304518664048e-05,
+      "loss": 0.0263,
+      "step": 1540
+    },
+    {
+      "epoch": 0.7612966601178782,
+      "grad_norm": 0.72265625,
+      "learning_rate": 1.1959724950884087e-05,
+      "loss": 0.0279,
+      "step": 1550
+    },
+    {
+      "epoch": 0.7662082514734774,
+      "grad_norm": 0.70703125,
+      "learning_rate": 1.1714145383104127e-05,
+      "loss": 0.0276,
+      "step": 1560
+    },
+    {
+      "epoch": 0.7711198428290766,
+      "grad_norm": 0.98828125,
+      "learning_rate": 1.1468565815324165e-05,
+      "loss": 0.0314,
+      "step": 1570
+    },
+    {
+      "epoch": 0.7760314341846758,
+      "grad_norm": 0.6640625,
+      "learning_rate": 1.1222986247544206e-05,
+      "loss": 0.0283,
+      "step": 1580
+    },
+    {
+      "epoch": 0.7809430255402751,
+      "grad_norm": 0.703125,
+      "learning_rate": 1.0977406679764244e-05,
+      "loss": 0.049,
+      "step": 1590
+    },
+    {
+      "epoch": 0.7858546168958742,
+      "grad_norm": 0.73828125,
+      "learning_rate": 1.0731827111984283e-05,
+      "loss": 0.0294,
+      "step": 1600
+    },
+    {
+      "epoch": 0.7907662082514735,
+      "grad_norm": 0.78125,
+      "learning_rate": 1.0486247544204323e-05,
+      "loss": 0.0257,
+      "step": 1610
+    },
+    {
+      "epoch": 0.7956777996070727,
+      "grad_norm": 0.69140625,
+      "learning_rate": 1.0240667976424362e-05,
+      "loss": 0.0316,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8005893909626719,
+      "grad_norm": 1.0234375,
+      "learning_rate": 9.995088408644402e-06,
+      "loss": 0.033,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8055009823182712,
+      "grad_norm": 0.77734375,
+      "learning_rate": 9.74950884086444e-06,
+      "loss": 0.0263,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8104125736738703,
+      "grad_norm": 0.69140625,
+      "learning_rate": 9.503929273084481e-06,
+      "loss": 0.0248,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8153241650294696,
+      "grad_norm": 0.80859375,
+      "learning_rate": 9.258349705304518e-06,
+      "loss": 0.0256,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8202357563850687,
+      "grad_norm": 0.498046875,
+      "learning_rate": 9.012770137524558e-06,
+      "loss": 0.0258,
+      "step": 1670
+    },
+    {
+      "epoch": 0.825147347740668,
+      "grad_norm": 0.64453125,
+      "learning_rate": 8.767190569744597e-06,
+      "loss": 0.0271,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8300589390962672,
+      "grad_norm": 0.7890625,
+      "learning_rate": 8.521611001964637e-06,
+      "loss": 0.0334,
+      "step": 1690
+    },
+    {
+      "epoch": 0.8349705304518664,
+      "grad_norm": 1.0078125,
+      "learning_rate": 8.276031434184677e-06,
+      "loss": 0.0245,
+      "step": 1700
+    },
+    {
+      "epoch": 0.8398821218074656,
+      "grad_norm": 0.63671875,
+      "learning_rate": 8.030451866404716e-06,
+      "loss": 0.0262,
+      "step": 1710
+    },
+    {
+      "epoch": 0.8447937131630648,
+      "grad_norm": 0.5390625,
+      "learning_rate": 7.784872298624756e-06,
+      "loss": 0.027,
+      "step": 1720
+    },
+    {
+      "epoch": 0.849705304518664,
+      "grad_norm": 0.6953125,
+      "learning_rate": 7.539292730844794e-06,
+      "loss": 0.0253,
+      "step": 1730
+    },
+    {
+      "epoch": 0.8546168958742633,
+      "grad_norm": 0.62109375,
+      "learning_rate": 7.293713163064833e-06,
+      "loss": 0.0247,
+      "step": 1740
+    },
+    {
+      "epoch": 0.8595284872298625,
+      "grad_norm": 1.140625,
+      "learning_rate": 7.048133595284873e-06,
+      "loss": 0.0281,
+      "step": 1750
+    },
+    {
+      "epoch": 0.8644400785854617,
+      "grad_norm": 0.59375,
+      "learning_rate": 6.802554027504912e-06,
+      "loss": 0.0269,
+      "step": 1760
+    },
+    {
+      "epoch": 0.869351669941061,
+      "grad_norm": 1.2421875,
+      "learning_rate": 6.556974459724952e-06,
+      "loss": 0.0255,
+      "step": 1770
+    },
+    {
+      "epoch": 0.8742632612966601,
+      "grad_norm": 0.7578125,
+      "learning_rate": 6.311394891944991e-06,
+      "loss": 0.0253,
+      "step": 1780
+    },
+    {
+      "epoch": 0.8791748526522594,
+      "grad_norm": 0.73828125,
+      "learning_rate": 6.06581532416503e-06,
+      "loss": 0.024,
+      "step": 1790
+    },
+    {
+      "epoch": 0.8840864440078585,
+      "grad_norm": 0.60546875,
+      "learning_rate": 5.820235756385069e-06,
+      "loss": 0.03,
+      "step": 1800
+    },
+    {
+      "epoch": 0.8889980353634578,
+      "grad_norm": 0.5546875,
+      "learning_rate": 5.5746561886051085e-06,
+      "loss": 0.0246,
+      "step": 1810
+    },
+    {
+      "epoch": 0.8939096267190569,
+      "grad_norm": 0.8671875,
+      "learning_rate": 5.329076620825148e-06,
+      "loss": 0.0249,
+      "step": 1820
+    },
+    {
+      "epoch": 0.8988212180746562,
+      "grad_norm": 0.55859375,
+      "learning_rate": 5.0834970530451866e-06,
+      "loss": 0.0267,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9037328094302554,
+      "grad_norm": 0.5703125,
+      "learning_rate": 4.837917485265226e-06,
+      "loss": 0.0232,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9086444007858546,
+      "grad_norm": 0.6953125,
+      "learning_rate": 4.5923379174852655e-06,
+      "loss": 0.0242,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9135559921414538,
+      "grad_norm": 0.71484375,
+      "learning_rate": 4.346758349705305e-06,
+      "loss": 0.0256,
+      "step": 1860
+    },
+    {
+      "epoch": 0.918467583497053,
+      "grad_norm": 1.125,
+      "learning_rate": 4.1011787819253435e-06,
+      "loss": 0.0318,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9233791748526523,
+      "grad_norm": 0.69921875,
+      "learning_rate": 3.855599214145384e-06,
+      "loss": 0.0268,
+      "step": 1880
+    },
+    {
+      "epoch": 0.9282907662082515,
+      "grad_norm": 0.58203125,
+      "learning_rate": 3.6100196463654228e-06,
+      "loss": 0.0249,
+      "step": 1890
+    },
+    {
+      "epoch": 0.9332023575638507,
+      "grad_norm": 0.66796875,
+      "learning_rate": 3.364440078585462e-06,
+      "loss": 0.0306,
+      "step": 1900
+    },
+    {
+      "epoch": 0.9381139489194499,
+      "grad_norm": 0.59765625,
+      "learning_rate": 3.1188605108055012e-06,
+      "loss": 0.0301,
+      "step": 1910
+    },
+    {
+      "epoch": 0.9430255402750491,
+      "grad_norm": 0.65625,
+      "learning_rate": 2.8732809430255403e-06,
+      "loss": 0.0248,
+      "step": 1920
+    },
+    {
+      "epoch": 0.9479371316306483,
+      "grad_norm": 0.76953125,
+      "learning_rate": 2.6277013752455797e-06,
+      "loss": 0.0267,
+      "step": 1930
+    },
+    {
+      "epoch": 0.9528487229862476,
+      "grad_norm": 0.61328125,
+      "learning_rate": 2.382121807465619e-06,
+      "loss": 0.0296,
+      "step": 1940
+    },
+    {
+      "epoch": 0.9577603143418467,
+      "grad_norm": 0.59375,
+      "learning_rate": 2.136542239685658e-06,
+      "loss": 0.0264,
+      "step": 1950
+    },
+    {
+      "epoch": 0.962671905697446,
+      "grad_norm": 0.51953125,
+      "learning_rate": 1.8909626719056976e-06,
+      "loss": 0.024,
+      "step": 1960
+    },
+    {
+      "epoch": 0.9675834970530451,
+      "grad_norm": 0.56640625,
+      "learning_rate": 1.6453831041257368e-06,
+      "loss": 0.0315,
+      "step": 1970
+    },
+    {
+      "epoch": 0.9724950884086444,
+      "grad_norm": 0.859375,
+      "learning_rate": 1.3998035363457763e-06,
+      "loss": 0.024,
+      "step": 1980
+    },
+    {
+      "epoch": 0.9774066797642437,
+      "grad_norm": 0.8125,
+      "learning_rate": 1.1542239685658153e-06,
+      "loss": 0.0282,
+      "step": 1990
+    },
+    {
+      "epoch": 0.9823182711198428,
+      "grad_norm": 0.6328125,
+      "learning_rate": 9.086444007858547e-07,
+      "loss": 0.0231,
+      "step": 2000
+    },
+    {
+      "epoch": 0.9872298624754421,
+      "grad_norm": 0.75390625,
+      "learning_rate": 6.630648330058939e-07,
+      "loss": 0.0247,
+      "step": 2010
+    },
+    {
+      "epoch": 0.9921414538310412,
+      "grad_norm": 0.5546875,
+      "learning_rate": 4.174852652259332e-07,
+      "loss": 0.0233,
+      "step": 2020
+    },
+    {
+      "epoch": 0.9970530451866405,
+      "grad_norm": 0.85546875,
+      "learning_rate": 1.719056974459725e-07,
+      "loss": 0.0366,
+      "step": 2030
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2036,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.9084808495824896e+16,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

last-checkpoint/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96b4692fcf47c9552185532e3f4edfd1a00198142146dd822a7aa35f5d117280
+size 5304