diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..2b7322fbd4524e95a5e1d53e532efb919fa152da 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/eval_state.json filter=lfs diff=lfs merge=lfs -text diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/eval_state.json b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/eval_state.json new file mode 100644 index 0000000000000000000000000000000000000000..46bc8e908d775d133918c94ba775838d9608266f --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/eval_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8485b1cf2642f7cefe47d3fd95275f3de050508345cfb05218748d709cbd820 +size 18994135 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/model.safetensors b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..66dda841806bfb9e0ceb418bdd47b63e568272b3 --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f357416e9a280bf1a2080d024ebc788cec32f2b5515c293a9568caf61e3e3ff +size 3423137083 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/optimizer.pt b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/optimizer.pt new file 
mode 100644 index 0000000000000000000000000000000000000000..8cc201c9da6515bee57b12f9e3fddeaa18955255 --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fbf2f631d8bcc7bf9fbd46fa93541f140330b59dbbd56cd64cd6580ff10ae04 +size 7395097 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_0.pth b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..d31bbb8b267e3ac8da2a20a603c7d5b67617535b --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cc6666df8de13dd8db7573a0e43bbfc8058643ac45741ab6b1969b9e738662c +size 15984 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_1.pth b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb67938f17f9cf0e73518b9539f2bc143bfd4df2 --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2d3f4f5aae54618adf56ae53085c2d4f218d8b40b215a46be8cdc35d3ba4754 +size 15984 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_2.pth b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6acf60f0b7420deae09e5b2062248917eb71b4e2 --- /dev/null +++ 
b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d68ca08d5c18d824959ccbd169c5aa0ec9b178764785a978d92cfa2b0d073352 +size 15984 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_3.pth b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..be9c2710cb8d741ef83f9ecd6b8287f9c6bca9dd --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b0952735cc588e4abffc0df9c079516b87c774315c2be85d7d1d072d5231bb +size 15984 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_4.pth b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..e7ed408be8883e86ce24d3d23d49c53a8e287666 --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22190fe0ee10b5a7cca4ee77b2af21105b67c66eff2bbe63a98619c2529b6d76 +size 15984 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_5.pth b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..dac0b0accd4265d162243294c1469f92d7f6bbdd --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_5.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0f3556a8fc86992d410487f00c532e174baf095db91d65a295c58b17ed567b1f +size 15984 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_6.pth b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..4b24714a749ad4261c78c4ac3df8a4c3dfa89814 --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ac8ead967914f548dd46bac88e2b966ceafbdea994b9021a09166b556a9ba7f +size 15984 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_7.pth b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..83e6dee433224ed04f0cad51feb7f5156e8b36e7 --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b8daea2dfada134daf6711992cc867a8d89ee22817a67d4e33909488403220c +size 15984 diff --git a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/scheduler.pt b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6688224b4261a8f55519ffdf9db60cc910fb9564 --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7192a361b1cb3687505c76aaf3eda85d2feed527438806167f530755dbfe8b3 +size 1064 diff --git 
a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/trainer_state.json b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ccd24a1828d909ea51aaf3c6cfea3d294045fb79 --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/trainer_state.json @@ -0,0 +1,74754 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9700871603310626, + "eval_steps": 512, + "global_step": 101376, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00029297590273200027, + "grad_norm": 48.20316696166992, + "learning_rate": 3.2552083333333335e-08, + "loss": 7.4372, + "step": 10 + }, + { + "epoch": 0.0005859518054640005, + "grad_norm": 46.522544860839844, + "learning_rate": 6.510416666666667e-08, + "loss": 7.4475, + "step": 20 + }, + { + "epoch": 0.0008789277081960009, + "grad_norm": 41.88468933105469, + "learning_rate": 9.765625e-08, + "loss": 7.3765, + "step": 30 + }, + { + "epoch": 0.001171903610928001, + "grad_norm": 37.764835357666016, + "learning_rate": 1.3020833333333334e-07, + "loss": 7.3299, + "step": 40 + }, + { + "epoch": 0.0014648795136600016, + "grad_norm": 32.434749603271484, + "learning_rate": 1.627604166666667e-07, + "loss": 7.314, + "step": 50 + }, + { + "epoch": 0.0017578554163920018, + "grad_norm": 25.51378631591797, + "learning_rate": 1.953125e-07, + "loss": 7.173, + "step": 60 + }, + { + "epoch": 0.002050831319124002, + "grad_norm": 20.874404907226562, + "learning_rate": 2.2786458333333333e-07, + "loss": 7.1222, + "step": 70 + }, + { + "epoch": 0.002343807221856002, + "grad_norm": 18.2292537689209, + "learning_rate": 2.604166666666667e-07, + "loss": 7.0695, + "step": 80 + }, + { + "epoch": 0.0026367831245880024, + "grad_norm": 15.813260078430176, + 
"learning_rate": 2.897135416666667e-07, + "loss": 7.0002, + "step": 90 + }, + { + "epoch": 0.002929759027320003, + "grad_norm": 15.147632598876953, + "learning_rate": 3.2226562500000005e-07, + "loss": 6.9427, + "step": 100 + }, + { + "epoch": 0.0032227349300520034, + "grad_norm": 13.248733520507812, + "learning_rate": 3.548177083333334e-07, + "loss": 6.8631, + "step": 110 + }, + { + "epoch": 0.0035157108327840037, + "grad_norm": 12.763676643371582, + "learning_rate": 3.873697916666667e-07, + "loss": 6.817, + "step": 120 + }, + { + "epoch": 0.003808686735516004, + "grad_norm": 11.738790512084961, + "learning_rate": 4.1992187500000004e-07, + "loss": 6.7696, + "step": 130 + }, + { + "epoch": 0.004101662638248004, + "grad_norm": 11.700736999511719, + "learning_rate": 4.524739583333334e-07, + "loss": 6.6989, + "step": 140 + }, + { + "epoch": 0.004394638540980004, + "grad_norm": 10.680469512939453, + "learning_rate": 4.850260416666666e-07, + "loss": 6.6532, + "step": 150 + }, + { + "epoch": 0.004687614443712004, + "grad_norm": 11.255241394042969, + "learning_rate": 5.175781250000001e-07, + "loss": 6.6257, + "step": 160 + }, + { + "epoch": 0.004980590346444005, + "grad_norm": 11.082075119018555, + "learning_rate": 5.501302083333334e-07, + "loss": 6.5903, + "step": 170 + }, + { + "epoch": 0.005273566249176005, + "grad_norm": 10.601607322692871, + "learning_rate": 5.826822916666667e-07, + "loss": 6.617, + "step": 180 + }, + { + "epoch": 0.005566542151908006, + "grad_norm": 11.014235496520996, + "learning_rate": 6.15234375e-07, + "loss": 6.5685, + "step": 190 + }, + { + "epoch": 0.005859518054640006, + "grad_norm": 10.156295776367188, + "learning_rate": 6.477864583333334e-07, + "loss": 6.5341, + "step": 200 + }, + { + "epoch": 0.0061524939573720065, + "grad_norm": 10.384119033813477, + "learning_rate": 6.803385416666667e-07, + "loss": 6.5067, + "step": 210 + }, + { + "epoch": 0.006445469860104007, + "grad_norm": 9.870872497558594, + "learning_rate": 7.128906250000001e-07, + 
"loss": 6.4943, + "step": 220 + }, + { + "epoch": 0.006738445762836007, + "grad_norm": 10.079934120178223, + "learning_rate": 7.454427083333333e-07, + "loss": 6.4647, + "step": 230 + }, + { + "epoch": 0.007031421665568007, + "grad_norm": 10.224433898925781, + "learning_rate": 7.779947916666668e-07, + "loss": 6.4322, + "step": 240 + }, + { + "epoch": 0.007324397568300008, + "grad_norm": 10.56871509552002, + "learning_rate": 8.105468750000001e-07, + "loss": 6.4616, + "step": 250 + }, + { + "epoch": 0.007617373471032008, + "grad_norm": 10.10598373413086, + "learning_rate": 8.430989583333334e-07, + "loss": 6.3948, + "step": 260 + }, + { + "epoch": 0.007910349373764008, + "grad_norm": 10.26723575592041, + "learning_rate": 8.756510416666668e-07, + "loss": 6.3759, + "step": 270 + }, + { + "epoch": 0.008203325276496008, + "grad_norm": 11.118165969848633, + "learning_rate": 9.08203125e-07, + "loss": 6.3491, + "step": 280 + }, + { + "epoch": 0.008496301179228009, + "grad_norm": 10.838608741760254, + "learning_rate": 9.407552083333334e-07, + "loss": 6.3464, + "step": 290 + }, + { + "epoch": 0.008789277081960008, + "grad_norm": 9.402033805847168, + "learning_rate": 9.733072916666667e-07, + "loss": 6.3568, + "step": 300 + }, + { + "epoch": 0.00908225298469201, + "grad_norm": 11.766927719116211, + "learning_rate": 1.005859375e-06, + "loss": 6.3475, + "step": 310 + }, + { + "epoch": 0.009375228887424009, + "grad_norm": 10.703299522399902, + "learning_rate": 1.0384114583333334e-06, + "loss": 6.287, + "step": 320 + }, + { + "epoch": 0.00966820479015601, + "grad_norm": 10.092641830444336, + "learning_rate": 1.0709635416666668e-06, + "loss": 6.3228, + "step": 330 + }, + { + "epoch": 0.00996118069288801, + "grad_norm": 11.809045791625977, + "learning_rate": 1.1035156250000001e-06, + "loss": 6.3037, + "step": 340 + }, + { + "epoch": 0.01025415659562001, + "grad_norm": 11.34934139251709, + "learning_rate": 1.1360677083333333e-06, + "loss": 6.2724, + "step": 350 + }, + { + "epoch": 
0.01054713249835201, + "grad_norm": 10.416779518127441, + "learning_rate": 1.1686197916666668e-06, + "loss": 6.3016, + "step": 360 + }, + { + "epoch": 0.01084010840108401, + "grad_norm": 10.535882949829102, + "learning_rate": 1.2011718750000002e-06, + "loss": 6.2196, + "step": 370 + }, + { + "epoch": 0.011133084303816012, + "grad_norm": 11.14434814453125, + "learning_rate": 1.2337239583333333e-06, + "loss": 6.2671, + "step": 380 + }, + { + "epoch": 0.011426060206548011, + "grad_norm": 10.4091796875, + "learning_rate": 1.2662760416666667e-06, + "loss": 6.2133, + "step": 390 + }, + { + "epoch": 0.011719036109280013, + "grad_norm": 12.410329818725586, + "learning_rate": 1.298828125e-06, + "loss": 6.2288, + "step": 400 + }, + { + "epoch": 0.012012012012012012, + "grad_norm": 11.35128402709961, + "learning_rate": 1.3313802083333336e-06, + "loss": 6.2245, + "step": 410 + }, + { + "epoch": 0.012304987914744013, + "grad_norm": 10.468694686889648, + "learning_rate": 1.3639322916666667e-06, + "loss": 6.2167, + "step": 420 + }, + { + "epoch": 0.012597963817476012, + "grad_norm": 10.45308780670166, + "learning_rate": 1.396484375e-06, + "loss": 6.185, + "step": 430 + }, + { + "epoch": 0.012890939720208014, + "grad_norm": 11.226188659667969, + "learning_rate": 1.4290364583333336e-06, + "loss": 6.2076, + "step": 440 + }, + { + "epoch": 0.013183915622940013, + "grad_norm": 9.819056510925293, + "learning_rate": 1.4615885416666668e-06, + "loss": 6.172, + "step": 450 + }, + { + "epoch": 0.013476891525672014, + "grad_norm": 11.891705513000488, + "learning_rate": 1.4941406250000001e-06, + "loss": 6.1635, + "step": 460 + }, + { + "epoch": 0.013769867428404014, + "grad_norm": 10.216228485107422, + "learning_rate": 1.5266927083333335e-06, + "loss": 6.138, + "step": 470 + }, + { + "epoch": 0.014062843331136015, + "grad_norm": 9.809833526611328, + "learning_rate": 1.5592447916666668e-06, + "loss": 6.1319, + "step": 480 + }, + { + "epoch": 0.014355819233868014, + "grad_norm": 
11.25721263885498, + "learning_rate": 1.5917968750000002e-06, + "loss": 6.1495, + "step": 490 + }, + { + "epoch": 0.014648795136600015, + "grad_norm": 11.98207950592041, + "learning_rate": 1.6243489583333335e-06, + "loss": 6.138, + "step": 500 + }, + { + "epoch": 0.014941771039332015, + "grad_norm": 11.844237327575684, + "learning_rate": 1.6569010416666666e-06, + "loss": 6.128, + "step": 510 + }, + { + "epoch": 0.015000366219878415, + "eval_bleu": 0.23257768784008367, + "eval_cap_loss": 1.3365554809570312, + "eval_con_loss": 2.440279483795166, + "eval_loss": 6.217114448547363, + "step": 512 + }, + { + "epoch": 0.015000366219878415, + "eval_bleu": 0.23257768784008367, + "eval_cap_loss": 1.3365554809570312, + "eval_con_loss": 2.440279483795166, + "eval_loss": 6.217114448547363, + "eval_runtime": 51.591, + "eval_samples_per_second": 387.665, + "eval_steps_per_second": 0.388, + "step": 512 + }, + { + "epoch": 0.015234746942064016, + "grad_norm": 11.644484519958496, + "learning_rate": 1.6894531250000002e-06, + "loss": 6.1331, + "step": 520 + }, + { + "epoch": 0.015527722844796015, + "grad_norm": 10.722804069519043, + "learning_rate": 1.7220052083333335e-06, + "loss": 6.0906, + "step": 530 + }, + { + "epoch": 0.015820698747528016, + "grad_norm": 11.453320503234863, + "learning_rate": 1.7545572916666667e-06, + "loss": 6.0702, + "step": 540 + }, + { + "epoch": 0.016113674650260017, + "grad_norm": 10.061469078063965, + "learning_rate": 1.7871093750000002e-06, + "loss": 6.1133, + "step": 550 + }, + { + "epoch": 0.016406650552992015, + "grad_norm": 11.591984748840332, + "learning_rate": 1.8196614583333336e-06, + "loss": 6.0991, + "step": 560 + }, + { + "epoch": 0.016699626455724016, + "grad_norm": 11.352781295776367, + "learning_rate": 1.8522135416666667e-06, + "loss": 6.0838, + "step": 570 + }, + { + "epoch": 0.016992602358456017, + "grad_norm": 10.743990898132324, + "learning_rate": 1.884765625e-06, + "loss": 6.075, + "step": 580 + }, + { + "epoch": 0.01728557826118802, + 
"grad_norm": 10.079157829284668, + "learning_rate": 1.9173177083333334e-06, + "loss": 6.0779, + "step": 590 + }, + { + "epoch": 0.017578554163920016, + "grad_norm": 11.9786376953125, + "learning_rate": 1.9498697916666666e-06, + "loss": 6.0798, + "step": 600 + }, + { + "epoch": 0.017871530066652017, + "grad_norm": 13.87370777130127, + "learning_rate": 1.982421875e-06, + "loss": 6.0622, + "step": 610 + }, + { + "epoch": 0.01816450596938402, + "grad_norm": 10.923470497131348, + "learning_rate": 2.0149739583333337e-06, + "loss": 6.02, + "step": 620 + }, + { + "epoch": 0.01845748187211602, + "grad_norm": 11.564534187316895, + "learning_rate": 2.047526041666667e-06, + "loss": 6.012, + "step": 630 + }, + { + "epoch": 0.018750457774848017, + "grad_norm": 12.440816879272461, + "learning_rate": 2.0800781250000004e-06, + "loss": 6.0227, + "step": 640 + }, + { + "epoch": 0.01904343367758002, + "grad_norm": 11.482194900512695, + "learning_rate": 2.1126302083333335e-06, + "loss": 5.971, + "step": 650 + }, + { + "epoch": 0.01933640958031202, + "grad_norm": 12.701305389404297, + "learning_rate": 2.1451822916666666e-06, + "loss": 6.0427, + "step": 660 + }, + { + "epoch": 0.01962938548304402, + "grad_norm": 12.747330665588379, + "learning_rate": 2.177734375e-06, + "loss": 5.9975, + "step": 670 + }, + { + "epoch": 0.01992236138577602, + "grad_norm": 12.912580490112305, + "learning_rate": 2.2102864583333338e-06, + "loss": 5.9849, + "step": 680 + }, + { + "epoch": 0.02021533728850802, + "grad_norm": 14.062628746032715, + "learning_rate": 2.242838541666667e-06, + "loss": 6.0129, + "step": 690 + }, + { + "epoch": 0.02050831319124002, + "grad_norm": 13.642184257507324, + "learning_rate": 2.275390625e-06, + "loss": 5.9899, + "step": 700 + }, + { + "epoch": 0.02080128909397202, + "grad_norm": 15.169952392578125, + "learning_rate": 2.3079427083333336e-06, + "loss": 5.9678, + "step": 710 + }, + { + "epoch": 0.02109426499670402, + "grad_norm": 12.565735816955566, + "learning_rate": 
2.3404947916666667e-06, + "loss": 5.9914, + "step": 720 + }, + { + "epoch": 0.02138724089943602, + "grad_norm": 12.790975570678711, + "learning_rate": 2.3730468750000003e-06, + "loss": 6.0071, + "step": 730 + }, + { + "epoch": 0.02168021680216802, + "grad_norm": 11.683467864990234, + "learning_rate": 2.4055989583333334e-06, + "loss": 5.972, + "step": 740 + }, + { + "epoch": 0.021973192704900023, + "grad_norm": 12.676822662353516, + "learning_rate": 2.4381510416666666e-06, + "loss": 5.9713, + "step": 750 + }, + { + "epoch": 0.022266168607632024, + "grad_norm": 13.526009559631348, + "learning_rate": 2.470703125e-06, + "loss": 5.9503, + "step": 760 + }, + { + "epoch": 0.02255914451036402, + "grad_norm": 11.41614055633545, + "learning_rate": 2.5032552083333333e-06, + "loss": 5.9367, + "step": 770 + }, + { + "epoch": 0.022852120413096023, + "grad_norm": 15.033052444458008, + "learning_rate": 2.5358072916666672e-06, + "loss": 5.95, + "step": 780 + }, + { + "epoch": 0.023145096315828024, + "grad_norm": 16.80222511291504, + "learning_rate": 2.5683593750000004e-06, + "loss": 5.9426, + "step": 790 + }, + { + "epoch": 0.023438072218560025, + "grad_norm": 15.397530555725098, + "learning_rate": 2.6009114583333335e-06, + "loss": 5.9578, + "step": 800 + }, + { + "epoch": 0.023731048121292023, + "grad_norm": 14.72960376739502, + "learning_rate": 2.633463541666667e-06, + "loss": 5.9393, + "step": 810 + }, + { + "epoch": 0.024024024024024024, + "grad_norm": 11.98216724395752, + "learning_rate": 2.6660156250000002e-06, + "loss": 5.9381, + "step": 820 + }, + { + "epoch": 0.024316999926756025, + "grad_norm": 12.152496337890625, + "learning_rate": 2.6985677083333334e-06, + "loss": 5.926, + "step": 830 + }, + { + "epoch": 0.024609975829488026, + "grad_norm": 16.589662551879883, + "learning_rate": 2.731119791666667e-06, + "loss": 5.9524, + "step": 840 + }, + { + "epoch": 0.024902951732220024, + "grad_norm": 11.710291862487793, + "learning_rate": 2.763671875e-06, + "loss": 5.8652, + 
"step": 850 + }, + { + "epoch": 0.025195927634952025, + "grad_norm": 11.27197265625, + "learning_rate": 2.796223958333333e-06, + "loss": 5.9197, + "step": 860 + }, + { + "epoch": 0.025488903537684026, + "grad_norm": 16.91851806640625, + "learning_rate": 2.828776041666667e-06, + "loss": 5.9474, + "step": 870 + }, + { + "epoch": 0.025781879440416027, + "grad_norm": 15.069924354553223, + "learning_rate": 2.8613281250000003e-06, + "loss": 5.8922, + "step": 880 + }, + { + "epoch": 0.026074855343148025, + "grad_norm": 12.613478660583496, + "learning_rate": 2.8938802083333334e-06, + "loss": 5.9162, + "step": 890 + }, + { + "epoch": 0.026367831245880026, + "grad_norm": 14.686683654785156, + "learning_rate": 2.926432291666667e-06, + "loss": 5.889, + "step": 900 + }, + { + "epoch": 0.026660807148612027, + "grad_norm": 13.94355583190918, + "learning_rate": 2.958984375e-06, + "loss": 5.8886, + "step": 910 + }, + { + "epoch": 0.026953783051344028, + "grad_norm": 11.737340927124023, + "learning_rate": 2.9915364583333333e-06, + "loss": 5.8833, + "step": 920 + }, + { + "epoch": 0.027246758954076026, + "grad_norm": 14.226699829101562, + "learning_rate": 3.0240885416666673e-06, + "loss": 5.8656, + "step": 930 + }, + { + "epoch": 0.027539734856808027, + "grad_norm": 12.62042236328125, + "learning_rate": 3.0566406250000004e-06, + "loss": 5.8816, + "step": 940 + }, + { + "epoch": 0.027832710759540028, + "grad_norm": 14.637236595153809, + "learning_rate": 3.0891927083333335e-06, + "loss": 5.8583, + "step": 950 + }, + { + "epoch": 0.02812568666227203, + "grad_norm": 12.873172760009766, + "learning_rate": 3.121744791666667e-06, + "loss": 5.8728, + "step": 960 + }, + { + "epoch": 0.028418662565004027, + "grad_norm": 17.045000076293945, + "learning_rate": 3.1542968750000002e-06, + "loss": 5.8593, + "step": 970 + }, + { + "epoch": 0.028711638467736028, + "grad_norm": 15.524353981018066, + "learning_rate": 3.1868489583333334e-06, + "loss": 5.8539, + "step": 980 + }, + { + "epoch": 
0.02900461437046803, + "grad_norm": 12.723761558532715, + "learning_rate": 3.219401041666667e-06, + "loss": 5.8682, + "step": 990 + }, + { + "epoch": 0.02929759027320003, + "grad_norm": 17.3734073638916, + "learning_rate": 3.251953125e-06, + "loss": 5.8715, + "step": 1000 + }, + { + "epoch": 0.029590566175932028, + "grad_norm": 16.617141723632812, + "learning_rate": 3.2845052083333336e-06, + "loss": 5.8321, + "step": 1010 + }, + { + "epoch": 0.02988354207866403, + "grad_norm": 15.51398754119873, + "learning_rate": 3.317057291666667e-06, + "loss": 5.8363, + "step": 1020 + }, + { + "epoch": 0.03000073243975683, + "eval_bleu": 0.24637953005366364, + "eval_cap_loss": 1.2843806743621826, + "eval_con_loss": 2.352034091949463, + "eval_loss": 5.988448619842529, + "step": 1024 + }, + { + "epoch": 0.03000073243975683, + "eval_bleu": 0.24637953005366364, + "eval_cap_loss": 1.2843806743621826, + "eval_con_loss": 2.352034091949463, + "eval_loss": 5.988448619842529, + "eval_runtime": 50.4651, + "eval_samples_per_second": 396.313, + "eval_steps_per_second": 0.396, + "step": 1024 + }, + { + "epoch": 0.03017651798139603, + "grad_norm": 14.9268159866333, + "learning_rate": 3.3496093750000003e-06, + "loss": 5.8534, + "step": 1030 + }, + { + "epoch": 0.03046949388412803, + "grad_norm": 17.17924690246582, + "learning_rate": 3.3821614583333335e-06, + "loss": 5.847, + "step": 1040 + }, + { + "epoch": 0.03076246978686003, + "grad_norm": 18.080604553222656, + "learning_rate": 3.414713541666667e-06, + "loss": 5.807, + "step": 1050 + }, + { + "epoch": 0.03105544568959203, + "grad_norm": 16.933380126953125, + "learning_rate": 3.447265625e-06, + "loss": 5.8291, + "step": 1060 + }, + { + "epoch": 0.03134842159232403, + "grad_norm": 14.726670265197754, + "learning_rate": 3.4798177083333333e-06, + "loss": 5.8225, + "step": 1070 + }, + { + "epoch": 0.03164139749505603, + "grad_norm": 15.118639945983887, + "learning_rate": 3.5123697916666673e-06, + "loss": 5.7973, + "step": 1080 + }, + { + "epoch": 
0.031934373397788034, + "grad_norm": 16.3491268157959, + "learning_rate": 3.5449218750000004e-06, + "loss": 5.8262, + "step": 1090 + }, + { + "epoch": 0.032227349300520035, + "grad_norm": 18.55785369873047, + "learning_rate": 3.5774739583333335e-06, + "loss": 5.8084, + "step": 1100 + }, + { + "epoch": 0.032520325203252036, + "grad_norm": 15.973133087158203, + "learning_rate": 3.610026041666667e-06, + "loss": 5.8072, + "step": 1110 + }, + { + "epoch": 0.03281330110598403, + "grad_norm": 18.468547821044922, + "learning_rate": 3.6425781250000002e-06, + "loss": 5.8132, + "step": 1120 + }, + { + "epoch": 0.03310627700871603, + "grad_norm": 18.206501007080078, + "learning_rate": 3.6751302083333334e-06, + "loss": 5.7978, + "step": 1130 + }, + { + "epoch": 0.03339925291144803, + "grad_norm": 19.087825775146484, + "learning_rate": 3.7076822916666674e-06, + "loss": 5.7825, + "step": 1140 + }, + { + "epoch": 0.033692228814180034, + "grad_norm": 17.680421829223633, + "learning_rate": 3.7402343750000005e-06, + "loss": 5.7801, + "step": 1150 + }, + { + "epoch": 0.033985204716912035, + "grad_norm": 17.65298843383789, + "learning_rate": 3.7727864583333336e-06, + "loss": 5.7952, + "step": 1160 + }, + { + "epoch": 0.034278180619644036, + "grad_norm": 18.93503189086914, + "learning_rate": 3.805338541666667e-06, + "loss": 5.8244, + "step": 1170 + }, + { + "epoch": 0.03457115652237604, + "grad_norm": 17.83639144897461, + "learning_rate": 3.837890625e-06, + "loss": 5.7536, + "step": 1180 + }, + { + "epoch": 0.03486413242510804, + "grad_norm": 17.4979305267334, + "learning_rate": 3.8704427083333335e-06, + "loss": 5.7827, + "step": 1190 + }, + { + "epoch": 0.03515710832784003, + "grad_norm": 21.07087516784668, + "learning_rate": 3.9029947916666674e-06, + "loss": 5.7959, + "step": 1200 + }, + { + "epoch": 0.03545008423057203, + "grad_norm": 19.510616302490234, + "learning_rate": 3.935546875000001e-06, + "loss": 5.7568, + "step": 1210 + }, + { + "epoch": 0.035743060133304035, + "grad_norm": 
16.684951782226562, + "learning_rate": 3.968098958333334e-06, + "loss": 5.7512, + "step": 1220 + }, + { + "epoch": 0.036036036036036036, + "grad_norm": 18.535463333129883, + "learning_rate": 4.000651041666667e-06, + "loss": 5.8065, + "step": 1230 + }, + { + "epoch": 0.03632901193876804, + "grad_norm": 21.191545486450195, + "learning_rate": 4.033203125e-06, + "loss": 5.7577, + "step": 1240 + }, + { + "epoch": 0.03662198784150004, + "grad_norm": 18.5275936126709, + "learning_rate": 4.065755208333333e-06, + "loss": 5.754, + "step": 1250 + }, + { + "epoch": 0.03691496374423204, + "grad_norm": 16.51927947998047, + "learning_rate": 4.098307291666667e-06, + "loss": 5.757, + "step": 1260 + }, + { + "epoch": 0.03720793964696404, + "grad_norm": 17.150163650512695, + "learning_rate": 4.130859375e-06, + "loss": 5.7567, + "step": 1270 + }, + { + "epoch": 0.037500915549696034, + "grad_norm": 20.011537551879883, + "learning_rate": 4.163411458333333e-06, + "loss": 5.7403, + "step": 1280 + }, + { + "epoch": 0.037793891452428036, + "grad_norm": 17.820201873779297, + "learning_rate": 4.195963541666667e-06, + "loss": 5.7314, + "step": 1290 + }, + { + "epoch": 0.03808686735516004, + "grad_norm": 18.819259643554688, + "learning_rate": 4.2285156250000005e-06, + "loss": 5.7749, + "step": 1300 + }, + { + "epoch": 0.03837984325789204, + "grad_norm": 23.51628875732422, + "learning_rate": 4.261067708333334e-06, + "loss": 5.7296, + "step": 1310 + }, + { + "epoch": 0.03867281916062404, + "grad_norm": 17.60201072692871, + "learning_rate": 4.293619791666668e-06, + "loss": 5.7486, + "step": 1320 + }, + { + "epoch": 0.03896579506335604, + "grad_norm": 23.809383392333984, + "learning_rate": 4.326171875000001e-06, + "loss": 5.7423, + "step": 1330 + }, + { + "epoch": 0.03925877096608804, + "grad_norm": 21.965349197387695, + "learning_rate": 4.358723958333334e-06, + "loss": 5.7375, + "step": 1340 + }, + { + "epoch": 0.03955174686882004, + "grad_norm": 22.151948928833008, + "learning_rate": 
4.3880208333333335e-06, + "loss": 5.7192, + "step": 1350 + }, + { + "epoch": 0.03984472277155204, + "grad_norm": 19.366968154907227, + "learning_rate": 4.4205729166666675e-06, + "loss": 5.7351, + "step": 1360 + }, + { + "epoch": 0.04013769867428404, + "grad_norm": 15.792627334594727, + "learning_rate": 4.453125000000001e-06, + "loss": 5.7161, + "step": 1370 + }, + { + "epoch": 0.04043067457701604, + "grad_norm": 22.35173225402832, + "learning_rate": 4.485677083333334e-06, + "loss": 5.7502, + "step": 1380 + }, + { + "epoch": 0.04072365047974804, + "grad_norm": 19.656108856201172, + "learning_rate": 4.518229166666667e-06, + "loss": 5.7109, + "step": 1390 + }, + { + "epoch": 0.04101662638248004, + "grad_norm": 20.084476470947266, + "learning_rate": 4.55078125e-06, + "loss": 5.7205, + "step": 1400 + }, + { + "epoch": 0.04130960228521204, + "grad_norm": 20.146299362182617, + "learning_rate": 4.583333333333333e-06, + "loss": 5.6854, + "step": 1410 + }, + { + "epoch": 0.04160257818794404, + "grad_norm": 21.842405319213867, + "learning_rate": 4.615885416666667e-06, + "loss": 5.7177, + "step": 1420 + }, + { + "epoch": 0.041895554090676045, + "grad_norm": 22.545310974121094, + "learning_rate": 4.6484375e-06, + "loss": 5.6652, + "step": 1430 + }, + { + "epoch": 0.04218852999340804, + "grad_norm": 22.35773468017578, + "learning_rate": 4.6809895833333335e-06, + "loss": 5.722, + "step": 1440 + }, + { + "epoch": 0.04248150589614004, + "grad_norm": 22.928674697875977, + "learning_rate": 4.7135416666666675e-06, + "loss": 5.6957, + "step": 1450 + }, + { + "epoch": 0.04277448179887204, + "grad_norm": 20.409374237060547, + "learning_rate": 4.746093750000001e-06, + "loss": 5.6632, + "step": 1460 + }, + { + "epoch": 0.04306745770160404, + "grad_norm": 19.346723556518555, + "learning_rate": 4.778645833333334e-06, + "loss": 5.7054, + "step": 1470 + }, + { + "epoch": 0.04336043360433604, + "grad_norm": 20.17963409423828, + "learning_rate": 4.811197916666667e-06, + "loss": 5.6633, + "step": 
1480 + }, + { + "epoch": 0.043653409507068044, + "grad_norm": 19.721141815185547, + "learning_rate": 4.84375e-06, + "loss": 5.663, + "step": 1490 + }, + { + "epoch": 0.043946385409800046, + "grad_norm": 23.123302459716797, + "learning_rate": 4.876302083333333e-06, + "loss": 5.7271, + "step": 1500 + }, + { + "epoch": 0.04423936131253205, + "grad_norm": 21.462923049926758, + "learning_rate": 4.908854166666667e-06, + "loss": 5.6763, + "step": 1510 + }, + { + "epoch": 0.04453233721526405, + "grad_norm": 19.878755569458008, + "learning_rate": 4.94140625e-06, + "loss": 5.6991, + "step": 1520 + }, + { + "epoch": 0.04482531311799604, + "grad_norm": 26.107383728027344, + "learning_rate": 4.973958333333333e-06, + "loss": 5.6856, + "step": 1530 + }, + { + "epoch": 0.045001098659635246, + "eval_bleu": 0.25304826700958255, + "eval_cap_loss": 1.2580227851867676, + "eval_con_loss": 2.3025479316711426, + "eval_loss": 5.8631181716918945, + "step": 1536 + }, + { + "epoch": 0.045001098659635246, + "eval_bleu": 0.25304826700958255, + "eval_cap_loss": 1.2580227851867676, + "eval_con_loss": 2.3025479316711426, + "eval_loss": 5.8631181716918945, + "eval_runtime": 52.2851, + "eval_samples_per_second": 382.518, + "eval_steps_per_second": 0.383, + "step": 1536 + }, + { + "epoch": 0.04511828902072804, + "grad_norm": 22.36623191833496, + "learning_rate": 5.0065104166666665e-06, + "loss": 5.6819, + "step": 1540 + }, + { + "epoch": 0.045411264923460044, + "grad_norm": 21.18299102783203, + "learning_rate": 5.0390625000000005e-06, + "loss": 5.6893, + "step": 1550 + }, + { + "epoch": 0.045704240826192045, + "grad_norm": 25.859066009521484, + "learning_rate": 5.0716145833333345e-06, + "loss": 5.6623, + "step": 1560 + }, + { + "epoch": 0.04599721672892405, + "grad_norm": 22.67959976196289, + "learning_rate": 5.104166666666667e-06, + "loss": 5.6731, + "step": 1570 + }, + { + "epoch": 0.04629019263165605, + "grad_norm": 22.98816680908203, + "learning_rate": 5.136718750000001e-06, + "loss": 5.6724, + 
"step": 1580 + }, + { + "epoch": 0.04658316853438805, + "grad_norm": 23.412900924682617, + "learning_rate": 5.169270833333334e-06, + "loss": 5.6471, + "step": 1590 + }, + { + "epoch": 0.04687614443712005, + "grad_norm": 18.646968841552734, + "learning_rate": 5.201822916666667e-06, + "loss": 5.6416, + "step": 1600 + }, + { + "epoch": 0.047169120339852044, + "grad_norm": 21.697702407836914, + "learning_rate": 5.234375e-06, + "loss": 5.6516, + "step": 1610 + }, + { + "epoch": 0.047462096242584045, + "grad_norm": 21.56520652770996, + "learning_rate": 5.266927083333334e-06, + "loss": 5.6984, + "step": 1620 + }, + { + "epoch": 0.047755072145316046, + "grad_norm": 24.679975509643555, + "learning_rate": 5.2994791666666664e-06, + "loss": 5.6887, + "step": 1630 + }, + { + "epoch": 0.04804804804804805, + "grad_norm": 20.804208755493164, + "learning_rate": 5.3320312500000004e-06, + "loss": 5.6934, + "step": 1640 + }, + { + "epoch": 0.04834102395078005, + "grad_norm": 17.328807830810547, + "learning_rate": 5.364583333333334e-06, + "loss": 5.6446, + "step": 1650 + }, + { + "epoch": 0.04863399985351205, + "grad_norm": 23.071088790893555, + "learning_rate": 5.397135416666667e-06, + "loss": 5.6356, + "step": 1660 + }, + { + "epoch": 0.04892697575624405, + "grad_norm": 25.15633773803711, + "learning_rate": 5.429687500000001e-06, + "loss": 5.6427, + "step": 1670 + }, + { + "epoch": 0.04921995165897605, + "grad_norm": 22.70755958557129, + "learning_rate": 5.462239583333334e-06, + "loss": 5.6285, + "step": 1680 + }, + { + "epoch": 0.049512927561708046, + "grad_norm": 21.863880157470703, + "learning_rate": 5.494791666666667e-06, + "loss": 5.6278, + "step": 1690 + }, + { + "epoch": 0.04980590346444005, + "grad_norm": 21.463104248046875, + "learning_rate": 5.52734375e-06, + "loss": 5.6323, + "step": 1700 + }, + { + "epoch": 0.05009887936717205, + "grad_norm": 23.337263107299805, + "learning_rate": 5.559895833333334e-06, + "loss": 5.6468, + "step": 1710 + }, + { + "epoch": 
0.05039185526990405, + "grad_norm": 24.13840675354004, + "learning_rate": 5.592447916666666e-06, + "loss": 5.6245, + "step": 1720 + }, + { + "epoch": 0.05068483117263605, + "grad_norm": 24.737327575683594, + "learning_rate": 5.625e-06, + "loss": 5.6231, + "step": 1730 + }, + { + "epoch": 0.05097780707536805, + "grad_norm": 26.099321365356445, + "learning_rate": 5.657552083333334e-06, + "loss": 5.6086, + "step": 1740 + }, + { + "epoch": 0.05127078297810005, + "grad_norm": 19.930755615234375, + "learning_rate": 5.690104166666667e-06, + "loss": 5.6239, + "step": 1750 + }, + { + "epoch": 0.051563758880832054, + "grad_norm": 25.58686637878418, + "learning_rate": 5.722656250000001e-06, + "loss": 5.6362, + "step": 1760 + }, + { + "epoch": 0.05185673478356405, + "grad_norm": 21.929969787597656, + "learning_rate": 5.755208333333335e-06, + "loss": 5.6107, + "step": 1770 + }, + { + "epoch": 0.05214971068629605, + "grad_norm": 21.737411499023438, + "learning_rate": 5.787760416666667e-06, + "loss": 5.6191, + "step": 1780 + }, + { + "epoch": 0.05244268658902805, + "grad_norm": 22.063091278076172, + "learning_rate": 5.820312500000001e-06, + "loss": 5.6112, + "step": 1790 + }, + { + "epoch": 0.05273566249176005, + "grad_norm": 18.80160903930664, + "learning_rate": 5.852864583333334e-06, + "loss": 5.6201, + "step": 1800 + }, + { + "epoch": 0.05302863839449205, + "grad_norm": 24.0883846282959, + "learning_rate": 5.885416666666667e-06, + "loss": 5.5952, + "step": 1810 + }, + { + "epoch": 0.053321614297224054, + "grad_norm": 24.626495361328125, + "learning_rate": 5.91796875e-06, + "loss": 5.6283, + "step": 1820 + }, + { + "epoch": 0.053614590199956055, + "grad_norm": 25.56309700012207, + "learning_rate": 5.950520833333334e-06, + "loss": 5.6029, + "step": 1830 + }, + { + "epoch": 0.053907566102688056, + "grad_norm": 23.022499084472656, + "learning_rate": 5.9830729166666665e-06, + "loss": 5.5876, + "step": 1840 + }, + { + "epoch": 0.05420054200542006, + "grad_norm": 24.237382888793945, 
+ "learning_rate": 6.0156250000000005e-06, + "loss": 5.5897, + "step": 1850 + }, + { + "epoch": 0.05449351790815205, + "grad_norm": 21.680416107177734, + "learning_rate": 6.0481770833333345e-06, + "loss": 5.5992, + "step": 1860 + }, + { + "epoch": 0.05478649381088405, + "grad_norm": 24.977060317993164, + "learning_rate": 6.080729166666667e-06, + "loss": 5.5702, + "step": 1870 + }, + { + "epoch": 0.055079469713616054, + "grad_norm": 26.246967315673828, + "learning_rate": 6.113281250000001e-06, + "loss": 5.5829, + "step": 1880 + }, + { + "epoch": 0.055372445616348055, + "grad_norm": 24.262577056884766, + "learning_rate": 6.145833333333334e-06, + "loss": 5.5948, + "step": 1890 + }, + { + "epoch": 0.055665421519080056, + "grad_norm": 21.6431827545166, + "learning_rate": 6.178385416666667e-06, + "loss": 5.611, + "step": 1900 + }, + { + "epoch": 0.05595839742181206, + "grad_norm": 20.2042179107666, + "learning_rate": 6.2109375e-06, + "loss": 5.5884, + "step": 1910 + }, + { + "epoch": 0.05625137332454406, + "grad_norm": 20.31168556213379, + "learning_rate": 6.243489583333334e-06, + "loss": 5.5825, + "step": 1920 + }, + { + "epoch": 0.05654434922727606, + "grad_norm": 25.361413955688477, + "learning_rate": 6.2760416666666665e-06, + "loss": 5.6058, + "step": 1930 + }, + { + "epoch": 0.056837325130008054, + "grad_norm": 24.311681747436523, + "learning_rate": 6.3085937500000005e-06, + "loss": 5.5625, + "step": 1940 + }, + { + "epoch": 0.057130301032740055, + "grad_norm": 21.9235782623291, + "learning_rate": 6.3411458333333344e-06, + "loss": 5.5785, + "step": 1950 + }, + { + "epoch": 0.057423276935472056, + "grad_norm": 24.413766860961914, + "learning_rate": 6.373697916666667e-06, + "loss": 5.5913, + "step": 1960 + }, + { + "epoch": 0.05771625283820406, + "grad_norm": 21.78108787536621, + "learning_rate": 6.406250000000001e-06, + "loss": 5.5939, + "step": 1970 + }, + { + "epoch": 0.05800922874093606, + "grad_norm": 22.68929100036621, + "learning_rate": 6.438802083333334e-06, + 
"loss": 5.5408, + "step": 1980 + }, + { + "epoch": 0.05830220464366806, + "grad_norm": 24.531097412109375, + "learning_rate": 6.471354166666667e-06, + "loss": 5.536, + "step": 1990 + }, + { + "epoch": 0.05859518054640006, + "grad_norm": 21.521728515625, + "learning_rate": 6.50390625e-06, + "loss": 5.5981, + "step": 2000 + }, + { + "epoch": 0.05888815644913206, + "grad_norm": 25.132810592651367, + "learning_rate": 6.536458333333334e-06, + "loss": 5.5907, + "step": 2010 + }, + { + "epoch": 0.059181132351864056, + "grad_norm": 24.116222381591797, + "learning_rate": 6.569010416666667e-06, + "loss": 5.5293, + "step": 2020 + }, + { + "epoch": 0.05947410825459606, + "grad_norm": 26.577117919921875, + "learning_rate": 6.6015625e-06, + "loss": 5.5355, + "step": 2030 + }, + { + "epoch": 0.05976708415732806, + "grad_norm": 21.08549690246582, + "learning_rate": 6.634114583333334e-06, + "loss": 5.5461, + "step": 2040 + }, + { + "epoch": 0.06000146487951366, + "eval_bleu": 0.2572165551739482, + "eval_cap_loss": 1.2404602766036987, + "eval_con_loss": 2.2569570541381836, + "eval_loss": 5.7543745040893555, + "step": 2048 + }, + { + "epoch": 0.06000146487951366, + "eval_bleu": 0.2572165551739482, + "eval_cap_loss": 1.2404602766036987, + "eval_con_loss": 2.2569570541381836, + "eval_loss": 5.7543745040893555, + "eval_runtime": 51.8366, + "eval_samples_per_second": 385.828, + "eval_steps_per_second": 0.386, + "step": 2048 + }, + { + "epoch": 0.06006006006006006, + "grad_norm": 26.911144256591797, + "learning_rate": 6.666666666666667e-06, + "loss": 5.5484, + "step": 2050 + }, + { + "epoch": 0.06035303596279206, + "grad_norm": 30.44013214111328, + "learning_rate": 6.699218750000001e-06, + "loss": 5.5702, + "step": 2060 + }, + { + "epoch": 0.06064601186552406, + "grad_norm": 22.895357131958008, + "learning_rate": 6.731770833333335e-06, + "loss": 5.5825, + "step": 2070 + }, + { + "epoch": 0.06093898776825606, + "grad_norm": 25.833789825439453, + "learning_rate": 6.764322916666667e-06, + 
"loss": 5.5347, + "step": 2080 + }, + { + "epoch": 0.061231963670988064, + "grad_norm": 22.49154281616211, + "learning_rate": 6.796875000000001e-06, + "loss": 5.5703, + "step": 2090 + }, + { + "epoch": 0.06152493957372006, + "grad_norm": 22.961759567260742, + "learning_rate": 6.829427083333334e-06, + "loss": 5.5393, + "step": 2100 + }, + { + "epoch": 0.06181791547645206, + "grad_norm": 23.915048599243164, + "learning_rate": 6.861979166666667e-06, + "loss": 5.5385, + "step": 2110 + }, + { + "epoch": 0.06211089137918406, + "grad_norm": 21.352157592773438, + "learning_rate": 6.89453125e-06, + "loss": 5.5022, + "step": 2120 + }, + { + "epoch": 0.06240386728191606, + "grad_norm": 21.622861862182617, + "learning_rate": 6.927083333333334e-06, + "loss": 5.5342, + "step": 2130 + }, + { + "epoch": 0.06269684318464806, + "grad_norm": 21.647544860839844, + "learning_rate": 6.9596354166666666e-06, + "loss": 5.5509, + "step": 2140 + }, + { + "epoch": 0.06298981908738006, + "grad_norm": 22.149627685546875, + "learning_rate": 6.9921875000000006e-06, + "loss": 5.5334, + "step": 2150 + }, + { + "epoch": 0.06328279499011207, + "grad_norm": 23.146835327148438, + "learning_rate": 7.0247395833333345e-06, + "loss": 5.52, + "step": 2160 + }, + { + "epoch": 0.06357577089284407, + "grad_norm": 23.91387176513672, + "learning_rate": 7.057291666666667e-06, + "loss": 5.5204, + "step": 2170 + }, + { + "epoch": 0.06386874679557607, + "grad_norm": 24.93581199645996, + "learning_rate": 7.089843750000001e-06, + "loss": 5.5222, + "step": 2180 + }, + { + "epoch": 0.06416172269830807, + "grad_norm": 21.40251350402832, + "learning_rate": 7.122395833333334e-06, + "loss": 5.5305, + "step": 2190 + }, + { + "epoch": 0.06445469860104007, + "grad_norm": 19.337371826171875, + "learning_rate": 7.154947916666667e-06, + "loss": 5.508, + "step": 2200 + }, + { + "epoch": 0.06474767450377207, + "grad_norm": 22.33907699584961, + "learning_rate": 7.1875e-06, + "loss": 5.5392, + "step": 2210 + }, + { + "epoch": 
0.06504065040650407, + "grad_norm": 24.831195831298828, + "learning_rate": 7.220052083333334e-06, + "loss": 5.5027, + "step": 2220 + }, + { + "epoch": 0.06533362630923606, + "grad_norm": 22.819625854492188, + "learning_rate": 7.2526041666666665e-06, + "loss": 5.5293, + "step": 2230 + }, + { + "epoch": 0.06562660221196806, + "grad_norm": 22.48828887939453, + "learning_rate": 7.2851562500000005e-06, + "loss": 5.5069, + "step": 2240 + }, + { + "epoch": 0.06591957811470006, + "grad_norm": 21.36451530456543, + "learning_rate": 7.3177083333333345e-06, + "loss": 5.5259, + "step": 2250 + }, + { + "epoch": 0.06621255401743206, + "grad_norm": 23.335607528686523, + "learning_rate": 7.350260416666667e-06, + "loss": 5.5199, + "step": 2260 + }, + { + "epoch": 0.06650552992016406, + "grad_norm": 25.552688598632812, + "learning_rate": 7.382812500000001e-06, + "loss": 5.4869, + "step": 2270 + }, + { + "epoch": 0.06679850582289606, + "grad_norm": 24.87629508972168, + "learning_rate": 7.415364583333335e-06, + "loss": 5.4851, + "step": 2280 + }, + { + "epoch": 0.06709148172562807, + "grad_norm": 24.045516967773438, + "learning_rate": 7.447916666666667e-06, + "loss": 5.4929, + "step": 2290 + }, + { + "epoch": 0.06738445762836007, + "grad_norm": 23.843168258666992, + "learning_rate": 7.480468750000001e-06, + "loss": 5.4727, + "step": 2300 + }, + { + "epoch": 0.06767743353109207, + "grad_norm": 21.426137924194336, + "learning_rate": 7.513020833333334e-06, + "loss": 5.4859, + "step": 2310 + }, + { + "epoch": 0.06797040943382407, + "grad_norm": 22.275087356567383, + "learning_rate": 7.545572916666667e-06, + "loss": 5.4765, + "step": 2320 + }, + { + "epoch": 0.06826338533655607, + "grad_norm": 24.421043395996094, + "learning_rate": 7.578125e-06, + "loss": 5.5189, + "step": 2330 + }, + { + "epoch": 0.06855636123928807, + "grad_norm": 20.434314727783203, + "learning_rate": 7.610677083333334e-06, + "loss": 5.4541, + "step": 2340 + }, + { + "epoch": 0.06884933714202007, + "grad_norm": 
21.95121192932129, + "learning_rate": 7.643229166666668e-06, + "loss": 5.4681, + "step": 2350 + }, + { + "epoch": 0.06914231304475207, + "grad_norm": 27.887935638427734, + "learning_rate": 7.67578125e-06, + "loss": 5.5049, + "step": 2360 + }, + { + "epoch": 0.06943528894748408, + "grad_norm": 20.818523406982422, + "learning_rate": 7.708333333333334e-06, + "loss": 5.4706, + "step": 2370 + }, + { + "epoch": 0.06972826485021608, + "grad_norm": 21.731536865234375, + "learning_rate": 7.740885416666667e-06, + "loss": 5.4526, + "step": 2380 + }, + { + "epoch": 0.07002124075294806, + "grad_norm": 26.632726669311523, + "learning_rate": 7.7734375e-06, + "loss": 5.4755, + "step": 2390 + }, + { + "epoch": 0.07031421665568006, + "grad_norm": 19.877174377441406, + "learning_rate": 7.805989583333335e-06, + "loss": 5.481, + "step": 2400 + }, + { + "epoch": 0.07060719255841207, + "grad_norm": 25.740798950195312, + "learning_rate": 7.838541666666666e-06, + "loss": 5.5085, + "step": 2410 + }, + { + "epoch": 0.07090016846114407, + "grad_norm": 22.787023544311523, + "learning_rate": 7.871093750000001e-06, + "loss": 5.4663, + "step": 2420 + }, + { + "epoch": 0.07119314436387607, + "grad_norm": 23.787769317626953, + "learning_rate": 7.903645833333334e-06, + "loss": 5.4369, + "step": 2430 + }, + { + "epoch": 0.07148612026660807, + "grad_norm": 23.968120574951172, + "learning_rate": 7.936197916666667e-06, + "loss": 5.4653, + "step": 2440 + }, + { + "epoch": 0.07177909616934007, + "grad_norm": 23.447175979614258, + "learning_rate": 7.96875e-06, + "loss": 5.453, + "step": 2450 + }, + { + "epoch": 0.07207207207207207, + "grad_norm": 22.971782684326172, + "learning_rate": 8.001302083333334e-06, + "loss": 5.4341, + "step": 2460 + }, + { + "epoch": 0.07236504797480407, + "grad_norm": 19.030176162719727, + "learning_rate": 8.033854166666667e-06, + "loss": 5.4399, + "step": 2470 + }, + { + "epoch": 0.07265802387753607, + "grad_norm": 23.737049102783203, + "learning_rate": 8.06640625e-06, + "loss": 
5.4175, + "step": 2480 + }, + { + "epoch": 0.07295099978026807, + "grad_norm": 25.660852432250977, + "learning_rate": 8.098958333333335e-06, + "loss": 5.4521, + "step": 2490 + }, + { + "epoch": 0.07324397568300008, + "grad_norm": 26.258888244628906, + "learning_rate": 8.131510416666666e-06, + "loss": 5.4517, + "step": 2500 + }, + { + "epoch": 0.07353695158573208, + "grad_norm": 22.775516510009766, + "learning_rate": 8.164062500000001e-06, + "loss": 5.4565, + "step": 2510 + }, + { + "epoch": 0.07382992748846408, + "grad_norm": 26.29474639892578, + "learning_rate": 8.196614583333334e-06, + "loss": 5.4583, + "step": 2520 + }, + { + "epoch": 0.07412290339119608, + "grad_norm": 23.633710861206055, + "learning_rate": 8.229166666666667e-06, + "loss": 5.467, + "step": 2530 + }, + { + "epoch": 0.07441587929392808, + "grad_norm": 23.748445510864258, + "learning_rate": 8.26171875e-06, + "loss": 5.3924, + "step": 2540 + }, + { + "epoch": 0.07470885519666007, + "grad_norm": 24.195919036865234, + "learning_rate": 8.294270833333334e-06, + "loss": 5.4563, + "step": 2550 + }, + { + "epoch": 0.07500183109939207, + "grad_norm": 24.763580322265625, + "learning_rate": 8.326822916666667e-06, + "loss": 5.4382, + "step": 2560 + }, + { + "epoch": 0.07500183109939207, + "eval_bleu": 0.2615075487130035, + "eval_cap_loss": 1.2303131818771362, + "eval_con_loss": 2.219594955444336, + "eval_loss": 5.669503211975098, + "step": 2560 + }, + { + "epoch": 0.07500183109939207, + "eval_bleu": 0.2615075487130035, + "eval_cap_loss": 1.2303131818771362, + "eval_con_loss": 2.219594955444336, + "eval_loss": 5.669503211975098, + "eval_runtime": 51.2202, + "eval_samples_per_second": 390.471, + "eval_steps_per_second": 0.39, + "step": 2560 + }, + { + "epoch": 0.07529480700212407, + "grad_norm": 22.41862678527832, + "learning_rate": 8.359375e-06, + "loss": 5.4426, + "step": 2570 + }, + { + "epoch": 0.07558778290485607, + "grad_norm": 23.58980369567871, + "learning_rate": 8.391927083333335e-06, + "loss": 5.4192, 
+ "step": 2580 + }, + { + "epoch": 0.07588075880758807, + "grad_norm": 23.08696937561035, + "learning_rate": 8.424479166666666e-06, + "loss": 5.4492, + "step": 2590 + }, + { + "epoch": 0.07617373471032007, + "grad_norm": 25.489398956298828, + "learning_rate": 8.457031250000001e-06, + "loss": 5.4225, + "step": 2600 + }, + { + "epoch": 0.07646671061305207, + "grad_norm": 22.986602783203125, + "learning_rate": 8.489583333333334e-06, + "loss": 5.3803, + "step": 2610 + }, + { + "epoch": 0.07675968651578408, + "grad_norm": 25.6961669921875, + "learning_rate": 8.522135416666667e-06, + "loss": 5.4634, + "step": 2620 + }, + { + "epoch": 0.07705266241851608, + "grad_norm": 24.774566650390625, + "learning_rate": 8.5546875e-06, + "loss": 5.4206, + "step": 2630 + }, + { + "epoch": 0.07734563832124808, + "grad_norm": 21.3874568939209, + "learning_rate": 8.587239583333335e-06, + "loss": 5.4022, + "step": 2640 + }, + { + "epoch": 0.07763861422398008, + "grad_norm": 25.12557029724121, + "learning_rate": 8.619791666666667e-06, + "loss": 5.4158, + "step": 2650 + }, + { + "epoch": 0.07793159012671208, + "grad_norm": 21.66535758972168, + "learning_rate": 8.652343750000002e-06, + "loss": 5.4371, + "step": 2660 + }, + { + "epoch": 0.07822456602944408, + "grad_norm": 22.703500747680664, + "learning_rate": 8.684895833333335e-06, + "loss": 5.4122, + "step": 2670 + }, + { + "epoch": 0.07851754193217608, + "grad_norm": 22.79484748840332, + "learning_rate": 8.717447916666668e-06, + "loss": 5.408, + "step": 2680 + }, + { + "epoch": 0.07881051783490808, + "grad_norm": 25.115394592285156, + "learning_rate": 8.750000000000001e-06, + "loss": 5.3726, + "step": 2690 + }, + { + "epoch": 0.07910349373764008, + "grad_norm": 24.24040985107422, + "learning_rate": 8.782552083333334e-06, + "loss": 5.393, + "step": 2700 + }, + { + "epoch": 0.07939646964037209, + "grad_norm": 22.403589248657227, + "learning_rate": 8.815104166666667e-06, + "loss": 5.3806, + "step": 2710 + }, + { + "epoch": 0.07968944554310407, 
+ "grad_norm": 19.791440963745117, + "learning_rate": 8.84765625e-06, + "loss": 5.405, + "step": 2720 + }, + { + "epoch": 0.07998242144583607, + "grad_norm": 23.934019088745117, + "learning_rate": 8.880208333333335e-06, + "loss": 5.348, + "step": 2730 + }, + { + "epoch": 0.08027539734856808, + "grad_norm": 25.01671600341797, + "learning_rate": 8.912760416666667e-06, + "loss": 5.3709, + "step": 2740 + }, + { + "epoch": 0.08056837325130008, + "grad_norm": 23.963924407958984, + "learning_rate": 8.945312500000001e-06, + "loss": 5.4186, + "step": 2750 + }, + { + "epoch": 0.08086134915403208, + "grad_norm": 23.48053550720215, + "learning_rate": 8.977864583333335e-06, + "loss": 5.3936, + "step": 2760 + }, + { + "epoch": 0.08115432505676408, + "grad_norm": 20.39649772644043, + "learning_rate": 9.010416666666668e-06, + "loss": 5.3798, + "step": 2770 + }, + { + "epoch": 0.08144730095949608, + "grad_norm": 22.49953842163086, + "learning_rate": 9.042968750000001e-06, + "loss": 5.3873, + "step": 2780 + }, + { + "epoch": 0.08174027686222808, + "grad_norm": 24.875600814819336, + "learning_rate": 9.075520833333334e-06, + "loss": 5.3846, + "step": 2790 + }, + { + "epoch": 0.08203325276496008, + "grad_norm": 22.56624984741211, + "learning_rate": 9.108072916666667e-06, + "loss": 5.3928, + "step": 2800 + }, + { + "epoch": 0.08232622866769208, + "grad_norm": 21.44034194946289, + "learning_rate": 9.140625e-06, + "loss": 5.4201, + "step": 2810 + }, + { + "epoch": 0.08261920457042408, + "grad_norm": 24.825634002685547, + "learning_rate": 9.173177083333335e-06, + "loss": 5.4193, + "step": 2820 + }, + { + "epoch": 0.08291218047315609, + "grad_norm": 24.458911895751953, + "learning_rate": 9.205729166666667e-06, + "loss": 5.3657, + "step": 2830 + }, + { + "epoch": 0.08320515637588809, + "grad_norm": 20.082881927490234, + "learning_rate": 9.238281250000001e-06, + "loss": 5.3951, + "step": 2840 + }, + { + "epoch": 0.08349813227862009, + "grad_norm": 23.895917892456055, + "learning_rate": 
9.270833333333334e-06, + "loss": 5.3789, + "step": 2850 + }, + { + "epoch": 0.08379110818135209, + "grad_norm": 24.58293342590332, + "learning_rate": 9.303385416666668e-06, + "loss": 5.3812, + "step": 2860 + }, + { + "epoch": 0.08408408408408409, + "grad_norm": 18.577123641967773, + "learning_rate": 9.3359375e-06, + "loss": 5.3498, + "step": 2870 + }, + { + "epoch": 0.08437705998681608, + "grad_norm": 24.70419692993164, + "learning_rate": 9.368489583333334e-06, + "loss": 5.3709, + "step": 2880 + }, + { + "epoch": 0.08467003588954808, + "grad_norm": 20.36850929260254, + "learning_rate": 9.401041666666667e-06, + "loss": 5.3801, + "step": 2890 + }, + { + "epoch": 0.08496301179228008, + "grad_norm": 21.98590660095215, + "learning_rate": 9.43359375e-06, + "loss": 5.3487, + "step": 2900 + }, + { + "epoch": 0.08525598769501208, + "grad_norm": 26.572702407836914, + "learning_rate": 9.466145833333335e-06, + "loss": 5.3308, + "step": 2910 + }, + { + "epoch": 0.08554896359774408, + "grad_norm": 21.940414428710938, + "learning_rate": 9.498697916666666e-06, + "loss": 5.3371, + "step": 2920 + }, + { + "epoch": 0.08584193950047608, + "grad_norm": 23.88878631591797, + "learning_rate": 9.531250000000001e-06, + "loss": 5.3399, + "step": 2930 + }, + { + "epoch": 0.08613491540320808, + "grad_norm": 20.75480079650879, + "learning_rate": 9.563802083333334e-06, + "loss": 5.3446, + "step": 2940 + }, + { + "epoch": 0.08642789130594009, + "grad_norm": 25.877717971801758, + "learning_rate": 9.596354166666668e-06, + "loss": 5.3422, + "step": 2950 + }, + { + "epoch": 0.08672086720867209, + "grad_norm": 25.82989501953125, + "learning_rate": 9.62890625e-06, + "loss": 5.3433, + "step": 2960 + }, + { + "epoch": 0.08701384311140409, + "grad_norm": 23.961721420288086, + "learning_rate": 9.661458333333334e-06, + "loss": 5.3187, + "step": 2970 + }, + { + "epoch": 0.08730681901413609, + "grad_norm": 20.745559692382812, + "learning_rate": 9.694010416666667e-06, + "loss": 5.3296, + "step": 2980 + }, + { 
+ "epoch": 0.08759979491686809, + "grad_norm": 21.30181312561035, + "learning_rate": 9.7265625e-06, + "loss": 5.3667, + "step": 2990 + }, + { + "epoch": 0.08789277081960009, + "grad_norm": 18.93926429748535, + "learning_rate": 9.759114583333335e-06, + "loss": 5.3582, + "step": 3000 + }, + { + "epoch": 0.08818574672233209, + "grad_norm": 22.419424057006836, + "learning_rate": 9.791666666666666e-06, + "loss": 5.3199, + "step": 3010 + }, + { + "epoch": 0.0884787226250641, + "grad_norm": 22.25169563293457, + "learning_rate": 9.824218750000001e-06, + "loss": 5.3293, + "step": 3020 + }, + { + "epoch": 0.0887716985277961, + "grad_norm": 20.132495880126953, + "learning_rate": 9.856770833333334e-06, + "loss": 5.3278, + "step": 3030 + }, + { + "epoch": 0.0890646744305281, + "grad_norm": 21.122909545898438, + "learning_rate": 9.889322916666667e-06, + "loss": 5.3521, + "step": 3040 + }, + { + "epoch": 0.08935765033326008, + "grad_norm": 22.90290641784668, + "learning_rate": 9.921875e-06, + "loss": 5.3111, + "step": 3050 + }, + { + "epoch": 0.08965062623599208, + "grad_norm": 23.73031997680664, + "learning_rate": 9.954427083333334e-06, + "loss": 5.3499, + "step": 3060 + }, + { + "epoch": 0.08994360213872409, + "grad_norm": 21.233112335205078, + "learning_rate": 9.986979166666667e-06, + "loss": 5.2991, + "step": 3070 + }, + { + "epoch": 0.09000219731927049, + "eval_bleu": 0.26372233991262334, + "eval_cap_loss": 1.209324598312378, + "eval_con_loss": 2.1802315711975098, + "eval_loss": 5.569787502288818, + "step": 3072 + }, + { + "epoch": 0.09000219731927049, + "eval_bleu": 0.26372233991262334, + "eval_cap_loss": 1.209324598312378, + "eval_con_loss": 2.1802315711975098, + "eval_loss": 5.569787502288818, + "eval_runtime": 52.3392, + "eval_samples_per_second": 382.123, + "eval_steps_per_second": 0.382, + "step": 3072 + }, + { + "epoch": 0.09023657804145609, + "grad_norm": 25.151317596435547, + "learning_rate": 9.999999909960341e-06, + "loss": 5.3401, + "step": 3080 + }, + { + 
"epoch": 0.09052955394418809, + "grad_norm": 20.83795928955078, + "learning_rate": 9.999999359717979e-06, + "loss": 5.2893, + "step": 3090 + }, + { + "epoch": 0.09082252984692009, + "grad_norm": 22.61281394958496, + "learning_rate": 9.999998309255346e-06, + "loss": 5.3172, + "step": 3100 + }, + { + "epoch": 0.09111550574965209, + "grad_norm": 22.616531372070312, + "learning_rate": 9.999996758572546e-06, + "loss": 5.3034, + "step": 3110 + }, + { + "epoch": 0.09140848165238409, + "grad_norm": 21.379077911376953, + "learning_rate": 9.999994707669735e-06, + "loss": 5.2862, + "step": 3120 + }, + { + "epoch": 0.09170145755511609, + "grad_norm": 22.076581954956055, + "learning_rate": 9.999992156547119e-06, + "loss": 5.2686, + "step": 3130 + }, + { + "epoch": 0.0919944334578481, + "grad_norm": 20.797449111938477, + "learning_rate": 9.999989105204949e-06, + "loss": 5.3185, + "step": 3140 + }, + { + "epoch": 0.0922874093605801, + "grad_norm": 21.81037712097168, + "learning_rate": 9.999985553643533e-06, + "loss": 5.302, + "step": 3150 + }, + { + "epoch": 0.0925803852633121, + "grad_norm": 20.627504348754883, + "learning_rate": 9.999981501863227e-06, + "loss": 5.2761, + "step": 3160 + }, + { + "epoch": 0.0928733611660441, + "grad_norm": 22.18275260925293, + "learning_rate": 9.999976949864436e-06, + "loss": 5.2835, + "step": 3170 + }, + { + "epoch": 0.0931663370687761, + "grad_norm": 21.9028263092041, + "learning_rate": 9.999971897647614e-06, + "loss": 5.2564, + "step": 3180 + }, + { + "epoch": 0.0934593129715081, + "grad_norm": 21.89460563659668, + "learning_rate": 9.999966345213268e-06, + "loss": 5.2502, + "step": 3190 + }, + { + "epoch": 0.0937522888742401, + "grad_norm": 21.234878540039062, + "learning_rate": 9.999960292561952e-06, + "loss": 5.2734, + "step": 3200 + }, + { + "epoch": 0.09404526477697209, + "grad_norm": 20.073348999023438, + "learning_rate": 9.999953739694275e-06, + "loss": 5.2758, + "step": 3210 + }, + { + "epoch": 0.09433824067970409, + "grad_norm": 
20.17218780517578, + "learning_rate": 9.999946686610889e-06, + "loss": 5.2368, + "step": 3220 + }, + { + "epoch": 0.09463121658243609, + "grad_norm": 20.26726722717285, + "learning_rate": 9.9999391333125e-06, + "loss": 5.2424, + "step": 3230 + }, + { + "epoch": 0.09492419248516809, + "grad_norm": 21.153366088867188, + "learning_rate": 9.999931079799864e-06, + "loss": 5.2465, + "step": 3240 + }, + { + "epoch": 0.09521716838790009, + "grad_norm": 18.874616622924805, + "learning_rate": 9.999922526073787e-06, + "loss": 5.2193, + "step": 3250 + }, + { + "epoch": 0.09551014429063209, + "grad_norm": 17.675384521484375, + "learning_rate": 9.999913472135126e-06, + "loss": 5.2224, + "step": 3260 + }, + { + "epoch": 0.0958031201933641, + "grad_norm": 21.024269104003906, + "learning_rate": 9.999903917984786e-06, + "loss": 5.2428, + "step": 3270 + }, + { + "epoch": 0.0960960960960961, + "grad_norm": 20.518754959106445, + "learning_rate": 9.99989386362372e-06, + "loss": 5.2426, + "step": 3280 + }, + { + "epoch": 0.0963890719988281, + "grad_norm": 21.240991592407227, + "learning_rate": 9.999883309052938e-06, + "loss": 5.2807, + "step": 3290 + }, + { + "epoch": 0.0966820479015601, + "grad_norm": 19.533811569213867, + "learning_rate": 9.999872254273494e-06, + "loss": 5.2563, + "step": 3300 + }, + { + "epoch": 0.0969750238042921, + "grad_norm": 17.755687713623047, + "learning_rate": 9.999860699286494e-06, + "loss": 5.2159, + "step": 3310 + }, + { + "epoch": 0.0972679997070241, + "grad_norm": 22.48589515686035, + "learning_rate": 9.999848644093094e-06, + "loss": 5.2441, + "step": 3320 + }, + { + "epoch": 0.0975609756097561, + "grad_norm": 21.51368522644043, + "learning_rate": 9.999836088694501e-06, + "loss": 5.2223, + "step": 3330 + }, + { + "epoch": 0.0978539515124881, + "grad_norm": 19.335752487182617, + "learning_rate": 9.99982303309197e-06, + "loss": 5.275, + "step": 3340 + }, + { + "epoch": 0.0981469274152201, + "grad_norm": 19.98351287841797, + "learning_rate": 
9.999809477286808e-06, + "loss": 5.2384, + "step": 3350 + }, + { + "epoch": 0.0984399033179521, + "grad_norm": 20.793277740478516, + "learning_rate": 9.99979542128037e-06, + "loss": 5.2495, + "step": 3360 + }, + { + "epoch": 0.0987328792206841, + "grad_norm": 20.067779541015625, + "learning_rate": 9.999780865074062e-06, + "loss": 5.2095, + "step": 3370 + }, + { + "epoch": 0.09902585512341609, + "grad_norm": 19.792373657226562, + "learning_rate": 9.999765808669342e-06, + "loss": 5.2424, + "step": 3380 + }, + { + "epoch": 0.0993188310261481, + "grad_norm": 23.113143920898438, + "learning_rate": 9.999750252067715e-06, + "loss": 5.2291, + "step": 3390 + }, + { + "epoch": 0.0996118069288801, + "grad_norm": 19.054210662841797, + "learning_rate": 9.999734195270739e-06, + "loss": 5.2192, + "step": 3400 + }, + { + "epoch": 0.0999047828316121, + "grad_norm": 19.916479110717773, + "learning_rate": 9.999717638280017e-06, + "loss": 5.2179, + "step": 3410 + }, + { + "epoch": 0.1001977587343441, + "grad_norm": 19.24924659729004, + "learning_rate": 9.999700581097211e-06, + "loss": 5.2443, + "step": 3420 + }, + { + "epoch": 0.1004907346370761, + "grad_norm": 17.995576858520508, + "learning_rate": 9.999683023724021e-06, + "loss": 5.2357, + "step": 3430 + }, + { + "epoch": 0.1007837105398081, + "grad_norm": 18.24885368347168, + "learning_rate": 9.999664966162209e-06, + "loss": 5.2375, + "step": 3440 + }, + { + "epoch": 0.1010766864425401, + "grad_norm": 19.51423454284668, + "learning_rate": 9.999646408413577e-06, + "loss": 5.2166, + "step": 3450 + }, + { + "epoch": 0.1013696623452721, + "grad_norm": 18.923749923706055, + "learning_rate": 9.999627350479984e-06, + "loss": 5.2025, + "step": 3460 + }, + { + "epoch": 0.1016626382480041, + "grad_norm": 19.007612228393555, + "learning_rate": 9.999607792363338e-06, + "loss": 5.1974, + "step": 3470 + }, + { + "epoch": 0.1019556141507361, + "grad_norm": 19.134946823120117, + "learning_rate": 9.999587734065592e-06, + "loss": 5.2433, + "step": 
3480 + }, + { + "epoch": 0.1022485900534681, + "grad_norm": 21.34571647644043, + "learning_rate": 9.999567175588756e-06, + "loss": 5.2211, + "step": 3490 + }, + { + "epoch": 0.1025415659562001, + "grad_norm": 17.870147705078125, + "learning_rate": 9.999546116934884e-06, + "loss": 5.2404, + "step": 3500 + }, + { + "epoch": 0.10283454185893211, + "grad_norm": 19.667341232299805, + "learning_rate": 9.999524558106086e-06, + "loss": 5.2069, + "step": 3510 + }, + { + "epoch": 0.10312751776166411, + "grad_norm": 20.671350479125977, + "learning_rate": 9.999502499104517e-06, + "loss": 5.1773, + "step": 3520 + }, + { + "epoch": 0.10342049366439611, + "grad_norm": 19.11866569519043, + "learning_rate": 9.999479939932384e-06, + "loss": 5.1846, + "step": 3530 + }, + { + "epoch": 0.1037134695671281, + "grad_norm": 18.82963752746582, + "learning_rate": 9.999456880591943e-06, + "loss": 5.1849, + "step": 3540 + }, + { + "epoch": 0.1040064454698601, + "grad_norm": 18.70763397216797, + "learning_rate": 9.999433321085502e-06, + "loss": 5.1744, + "step": 3550 + }, + { + "epoch": 0.1042994213725921, + "grad_norm": 18.025585174560547, + "learning_rate": 9.999409261415419e-06, + "loss": 5.1889, + "step": 3560 + }, + { + "epoch": 0.1045923972753241, + "grad_norm": 19.18975830078125, + "learning_rate": 9.999384701584098e-06, + "loss": 5.2142, + "step": 3570 + }, + { + "epoch": 0.1048853731780561, + "grad_norm": 19.676179885864258, + "learning_rate": 9.999359641594e-06, + "loss": 5.1612, + "step": 3580 + }, + { + "epoch": 0.1050025635391489, + "eval_bleu": 0.26600883974610773, + "eval_cap_loss": 1.1969388723373413, + "eval_con_loss": 2.1276140213012695, + "eval_loss": 5.45216703414917, + "step": 3584 + }, + { + "epoch": 0.1050025635391489, + "eval_bleu": 0.26600883974610773, + "eval_cap_loss": 1.1969388723373413, + "eval_con_loss": 2.1276140213012695, + "eval_loss": 5.45216703414917, + "eval_runtime": 53.4563, + "eval_samples_per_second": 374.138, + "eval_steps_per_second": 0.374, + "step": 
3584 + }, + { + "epoch": 0.1051783490807881, + "grad_norm": 16.88688087463379, + "learning_rate": 9.99933408144763e-06, + "loss": 5.163, + "step": 3590 + }, + { + "epoch": 0.1054713249835201, + "grad_norm": 20.0775146484375, + "learning_rate": 9.999308021147542e-06, + "loss": 5.1485, + "step": 3600 + }, + { + "epoch": 0.1057643008862521, + "grad_norm": 19.824451446533203, + "learning_rate": 9.99928146069635e-06, + "loss": 5.1617, + "step": 3610 + }, + { + "epoch": 0.1060572767889841, + "grad_norm": 18.529918670654297, + "learning_rate": 9.999254400096706e-06, + "loss": 5.1511, + "step": 3620 + }, + { + "epoch": 0.10635025269171611, + "grad_norm": 17.44389533996582, + "learning_rate": 9.99922683935132e-06, + "loss": 5.1767, + "step": 3630 + }, + { + "epoch": 0.10664322859444811, + "grad_norm": 18.364078521728516, + "learning_rate": 9.999198778462947e-06, + "loss": 5.1665, + "step": 3640 + }, + { + "epoch": 0.10693620449718011, + "grad_norm": 18.64165687561035, + "learning_rate": 9.999170217434395e-06, + "loss": 5.1844, + "step": 3650 + }, + { + "epoch": 0.10722918039991211, + "grad_norm": 19.501895904541016, + "learning_rate": 9.999141156268522e-06, + "loss": 5.1796, + "step": 3660 + }, + { + "epoch": 0.10752215630264411, + "grad_norm": 16.447555541992188, + "learning_rate": 9.999111594968237e-06, + "loss": 5.1688, + "step": 3670 + }, + { + "epoch": 0.10781513220537611, + "grad_norm": 19.774105072021484, + "learning_rate": 9.999081533536492e-06, + "loss": 5.1559, + "step": 3680 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 17.964204788208008, + "learning_rate": 9.999050971976301e-06, + "loss": 5.1324, + "step": 3690 + }, + { + "epoch": 0.10840108401084012, + "grad_norm": 16.136886596679688, + "learning_rate": 9.999019910290719e-06, + "loss": 5.1759, + "step": 3700 + }, + { + "epoch": 0.1086940599135721, + "grad_norm": 18.037179946899414, + "learning_rate": 9.99898834848285e-06, + "loss": 5.13, + "step": 3710 + }, + { + "epoch": 0.1089870358163041, + 
"grad_norm": 15.988410949707031, + "learning_rate": 9.998956286555859e-06, + "loss": 5.1611, + "step": 3720 + }, + { + "epoch": 0.1092800117190361, + "grad_norm": 19.511638641357422, + "learning_rate": 9.998923724512946e-06, + "loss": 5.159, + "step": 3730 + }, + { + "epoch": 0.1095729876217681, + "grad_norm": 21.107959747314453, + "learning_rate": 9.998890662357374e-06, + "loss": 5.1296, + "step": 3740 + }, + { + "epoch": 0.10986596352450011, + "grad_norm": 18.076534271240234, + "learning_rate": 9.998857100092448e-06, + "loss": 5.1202, + "step": 3750 + }, + { + "epoch": 0.11015893942723211, + "grad_norm": 20.109025955200195, + "learning_rate": 9.998823037721526e-06, + "loss": 5.1138, + "step": 3760 + }, + { + "epoch": 0.11045191532996411, + "grad_norm": 17.771940231323242, + "learning_rate": 9.998788475248015e-06, + "loss": 5.1344, + "step": 3770 + }, + { + "epoch": 0.11074489123269611, + "grad_norm": 16.287261962890625, + "learning_rate": 9.998753412675375e-06, + "loss": 5.1117, + "step": 3780 + }, + { + "epoch": 0.11103786713542811, + "grad_norm": 17.97237777709961, + "learning_rate": 9.998717850007113e-06, + "loss": 5.1268, + "step": 3790 + }, + { + "epoch": 0.11133084303816011, + "grad_norm": 15.676408767700195, + "learning_rate": 9.998681787246786e-06, + "loss": 5.1236, + "step": 3800 + }, + { + "epoch": 0.11162381894089211, + "grad_norm": 18.219141006469727, + "learning_rate": 9.998645224398002e-06, + "loss": 5.1614, + "step": 3810 + }, + { + "epoch": 0.11191679484362412, + "grad_norm": 19.31633949279785, + "learning_rate": 9.99860816146442e-06, + "loss": 5.1137, + "step": 3820 + }, + { + "epoch": 0.11220977074635612, + "grad_norm": 19.124250411987305, + "learning_rate": 9.998570598449747e-06, + "loss": 5.0968, + "step": 3830 + }, + { + "epoch": 0.11250274664908812, + "grad_norm": 19.174591064453125, + "learning_rate": 9.998532535357742e-06, + "loss": 5.1141, + "step": 3840 + }, + { + "epoch": 0.11279572255182012, + "grad_norm": 18.595333099365234, + 
"learning_rate": 9.998493972192211e-06, + "loss": 5.13, + "step": 3850 + }, + { + "epoch": 0.11308869845455212, + "grad_norm": 19.750028610229492, + "learning_rate": 9.998454908957012e-06, + "loss": 5.0935, + "step": 3860 + }, + { + "epoch": 0.1133816743572841, + "grad_norm": 15.438969612121582, + "learning_rate": 9.998415345656057e-06, + "loss": 5.0989, + "step": 3870 + }, + { + "epoch": 0.11367465026001611, + "grad_norm": 18.788066864013672, + "learning_rate": 9.998375282293298e-06, + "loss": 5.1343, + "step": 3880 + }, + { + "epoch": 0.11396762616274811, + "grad_norm": 17.722370147705078, + "learning_rate": 9.99833471887275e-06, + "loss": 5.1177, + "step": 3890 + }, + { + "epoch": 0.11426060206548011, + "grad_norm": 20.317222595214844, + "learning_rate": 9.998293655398466e-06, + "loss": 5.1218, + "step": 3900 + }, + { + "epoch": 0.11455357796821211, + "grad_norm": 16.26360321044922, + "learning_rate": 9.998252091874554e-06, + "loss": 5.1148, + "step": 3910 + }, + { + "epoch": 0.11484655387094411, + "grad_norm": 19.815101623535156, + "learning_rate": 9.998210028305176e-06, + "loss": 5.1183, + "step": 3920 + }, + { + "epoch": 0.11513952977367611, + "grad_norm": 17.211353302001953, + "learning_rate": 9.998167464694536e-06, + "loss": 5.0892, + "step": 3930 + }, + { + "epoch": 0.11543250567640811, + "grad_norm": 19.816131591796875, + "learning_rate": 9.998124401046896e-06, + "loss": 5.1082, + "step": 3940 + }, + { + "epoch": 0.11572548157914012, + "grad_norm": 16.622093200683594, + "learning_rate": 9.998080837366562e-06, + "loss": 5.0999, + "step": 3950 + }, + { + "epoch": 0.11601845748187212, + "grad_norm": 20.458711624145508, + "learning_rate": 9.998036773657891e-06, + "loss": 5.1027, + "step": 3960 + }, + { + "epoch": 0.11631143338460412, + "grad_norm": 17.497440338134766, + "learning_rate": 9.997992209925296e-06, + "loss": 5.0696, + "step": 3970 + }, + { + "epoch": 0.11660440928733612, + "grad_norm": 20.08896255493164, + "learning_rate": 9.997947146173231e-06, + 
"loss": 5.0719, + "step": 3980 + }, + { + "epoch": 0.11689738519006812, + "grad_norm": 18.08026123046875, + "learning_rate": 9.997901582406206e-06, + "loss": 5.06, + "step": 3990 + }, + { + "epoch": 0.11719036109280012, + "grad_norm": 19.14794921875, + "learning_rate": 9.99785551862878e-06, + "loss": 5.0701, + "step": 4000 + }, + { + "epoch": 0.11748333699553212, + "grad_norm": 16.507081985473633, + "learning_rate": 9.997808954845559e-06, + "loss": 5.0726, + "step": 4010 + }, + { + "epoch": 0.11777631289826412, + "grad_norm": 17.585416793823242, + "learning_rate": 9.997761891061204e-06, + "loss": 5.1009, + "step": 4020 + }, + { + "epoch": 0.11806928880099612, + "grad_norm": 16.75477409362793, + "learning_rate": 9.997714327280423e-06, + "loss": 5.0238, + "step": 4030 + }, + { + "epoch": 0.11836226470372811, + "grad_norm": 18.68478012084961, + "learning_rate": 9.997666263507973e-06, + "loss": 5.1002, + "step": 4040 + }, + { + "epoch": 0.11865524060646011, + "grad_norm": 17.89875030517578, + "learning_rate": 9.997617699748665e-06, + "loss": 5.0682, + "step": 4050 + }, + { + "epoch": 0.11894821650919211, + "grad_norm": 18.260862350463867, + "learning_rate": 9.997568636007355e-06, + "loss": 5.0752, + "step": 4060 + }, + { + "epoch": 0.11924119241192412, + "grad_norm": 17.00755500793457, + "learning_rate": 9.997519072288952e-06, + "loss": 5.0567, + "step": 4070 + }, + { + "epoch": 0.11953416831465612, + "grad_norm": 20.711755752563477, + "learning_rate": 9.997469008598415e-06, + "loss": 5.0585, + "step": 4080 + }, + { + "epoch": 0.11982714421738812, + "grad_norm": 18.54139518737793, + "learning_rate": 9.997418444940754e-06, + "loss": 5.0534, + "step": 4090 + }, + { + "epoch": 0.12000292975902732, + "eval_bleu": 0.2717564415185317, + "eval_cap_loss": 1.1772674322128296, + "eval_con_loss": 2.0771923065185547, + "eval_loss": 5.33165168762207, + "step": 4096 + }, + { + "epoch": 0.12000292975902732, + "eval_bleu": 0.2717564415185317, + "eval_cap_loss": 1.1772674322128296, + 
"eval_con_loss": 2.0771923065185547, + "eval_loss": 5.33165168762207, + "eval_runtime": 53.3732, + "eval_samples_per_second": 374.72, + "eval_steps_per_second": 0.375, + "step": 4096 + }, + { + "epoch": 0.12012012012012012, + "grad_norm": 16.994606018066406, + "learning_rate": 9.997367381321026e-06, + "loss": 5.0297, + "step": 4100 + }, + { + "epoch": 0.12041309602285212, + "grad_norm": 16.12767791748047, + "learning_rate": 9.997315817744338e-06, + "loss": 5.0589, + "step": 4110 + }, + { + "epoch": 0.12070607192558412, + "grad_norm": 16.71843147277832, + "learning_rate": 9.997263754215853e-06, + "loss": 5.0529, + "step": 4120 + }, + { + "epoch": 0.12099904782831612, + "grad_norm": 16.788002014160156, + "learning_rate": 9.997211190740776e-06, + "loss": 5.0692, + "step": 4130 + }, + { + "epoch": 0.12129202373104812, + "grad_norm": 16.887741088867188, + "learning_rate": 9.997158127324366e-06, + "loss": 5.051, + "step": 4140 + }, + { + "epoch": 0.12158499963378012, + "grad_norm": 19.649215698242188, + "learning_rate": 9.997104563971933e-06, + "loss": 5.0523, + "step": 4150 + }, + { + "epoch": 0.12187797553651213, + "grad_norm": 19.056303024291992, + "learning_rate": 9.997050500688837e-06, + "loss": 5.0396, + "step": 4160 + }, + { + "epoch": 0.12217095143924413, + "grad_norm": 17.124967575073242, + "learning_rate": 9.996995937480483e-06, + "loss": 5.0528, + "step": 4170 + }, + { + "epoch": 0.12246392734197613, + "grad_norm": 16.923276901245117, + "learning_rate": 9.996940874352332e-06, + "loss": 5.0528, + "step": 4180 + }, + { + "epoch": 0.12275690324470813, + "grad_norm": 18.588123321533203, + "learning_rate": 9.996885311309892e-06, + "loss": 5.0349, + "step": 4190 + }, + { + "epoch": 0.12304987914744012, + "grad_norm": 19.01314353942871, + "learning_rate": 9.996829248358722e-06, + "loss": 4.9961, + "step": 4200 + }, + { + "epoch": 0.12334285505017212, + "grad_norm": 16.205156326293945, + "learning_rate": 9.996772685504432e-06, + "loss": 5.0311, + "step": 4210 + }, + { 
+ "epoch": 0.12363583095290412, + "grad_norm": 16.15351104736328, + "learning_rate": 9.99671562275268e-06, + "loss": 5.024, + "step": 4220 + }, + { + "epoch": 0.12392880685563612, + "grad_norm": 17.564823150634766, + "learning_rate": 9.996658060109172e-06, + "loss": 5.0448, + "step": 4230 + }, + { + "epoch": 0.12422178275836812, + "grad_norm": 18.4870548248291, + "learning_rate": 9.99659999757967e-06, + "loss": 5.0593, + "step": 4240 + }, + { + "epoch": 0.12451475866110012, + "grad_norm": 19.27774429321289, + "learning_rate": 9.996541435169984e-06, + "loss": 5.041, + "step": 4250 + }, + { + "epoch": 0.12480773456383212, + "grad_norm": 17.379138946533203, + "learning_rate": 9.99648237288597e-06, + "loss": 5.0286, + "step": 4260 + }, + { + "epoch": 0.12510071046656412, + "grad_norm": 18.903453826904297, + "learning_rate": 9.996422810733537e-06, + "loss": 5.0156, + "step": 4270 + }, + { + "epoch": 0.12539368636929613, + "grad_norm": 18.697107315063477, + "learning_rate": 9.996362748718644e-06, + "loss": 5.0177, + "step": 4280 + }, + { + "epoch": 0.12568666227202813, + "grad_norm": 16.717966079711914, + "learning_rate": 9.996302186847303e-06, + "loss": 5.031, + "step": 4290 + }, + { + "epoch": 0.12597963817476013, + "grad_norm": 14.598145484924316, + "learning_rate": 9.996241125125567e-06, + "loss": 5.0026, + "step": 4300 + }, + { + "epoch": 0.12627261407749213, + "grad_norm": 18.60251808166504, + "learning_rate": 9.99617956355955e-06, + "loss": 5.0482, + "step": 4310 + }, + { + "epoch": 0.12656558998022413, + "grad_norm": 16.328943252563477, + "learning_rate": 9.99611750215541e-06, + "loss": 4.9797, + "step": 4320 + }, + { + "epoch": 0.12685856588295613, + "grad_norm": 18.68697166442871, + "learning_rate": 9.996054940919351e-06, + "loss": 4.9968, + "step": 4330 + }, + { + "epoch": 0.12715154178568813, + "grad_norm": 17.917234420776367, + "learning_rate": 9.99599187985764e-06, + "loss": 5.0054, + "step": 4340 + }, + { + "epoch": 0.12744451768842013, + "grad_norm": 
17.809913635253906, + "learning_rate": 9.99592831897658e-06, + "loss": 4.9881, + "step": 4350 + }, + { + "epoch": 0.12773749359115213, + "grad_norm": 17.518491744995117, + "learning_rate": 9.995864258282531e-06, + "loss": 4.9984, + "step": 4360 + }, + { + "epoch": 0.12803046949388414, + "grad_norm": 16.952993392944336, + "learning_rate": 9.995799697781902e-06, + "loss": 5.0149, + "step": 4370 + }, + { + "epoch": 0.12832344539661614, + "grad_norm": 17.15622901916504, + "learning_rate": 9.995734637481153e-06, + "loss": 4.9822, + "step": 4380 + }, + { + "epoch": 0.12861642129934814, + "grad_norm": 18.509479522705078, + "learning_rate": 9.995669077386792e-06, + "loss": 4.9844, + "step": 4390 + }, + { + "epoch": 0.12890939720208014, + "grad_norm": 17.454238891601562, + "learning_rate": 9.99560301750538e-06, + "loss": 5.0179, + "step": 4400 + }, + { + "epoch": 0.12920237310481214, + "grad_norm": 18.064794540405273, + "learning_rate": 9.995536457843522e-06, + "loss": 4.9887, + "step": 4410 + }, + { + "epoch": 0.12949534900754414, + "grad_norm": 18.08984375, + "learning_rate": 9.995469398407879e-06, + "loss": 4.9901, + "step": 4420 + }, + { + "epoch": 0.12978832491027614, + "grad_norm": 18.364177703857422, + "learning_rate": 9.99540183920516e-06, + "loss": 4.9987, + "step": 4430 + }, + { + "epoch": 0.13008130081300814, + "grad_norm": 16.756616592407227, + "learning_rate": 9.995333780242123e-06, + "loss": 4.9838, + "step": 4440 + }, + { + "epoch": 0.13037427671574012, + "grad_norm": 17.526830673217773, + "learning_rate": 9.995265221525579e-06, + "loss": 4.9848, + "step": 4450 + }, + { + "epoch": 0.13066725261847212, + "grad_norm": 17.102020263671875, + "learning_rate": 9.995196163062383e-06, + "loss": 5.0274, + "step": 4460 + }, + { + "epoch": 0.13096022852120412, + "grad_norm": 21.88545036315918, + "learning_rate": 9.99512660485945e-06, + "loss": 4.9917, + "step": 4470 + }, + { + "epoch": 0.13125320442393612, + "grad_norm": 16.729169845581055, + "learning_rate": 
9.995056546923734e-06, + "loss": 4.9802, + "step": 4480 + }, + { + "epoch": 0.13154618032666812, + "grad_norm": 15.648212432861328, + "learning_rate": 9.994985989262245e-06, + "loss": 4.9787, + "step": 4490 + }, + { + "epoch": 0.13183915622940012, + "grad_norm": 17.368152618408203, + "learning_rate": 9.99491493188204e-06, + "loss": 4.9864, + "step": 4500 + }, + { + "epoch": 0.13213213213213212, + "grad_norm": 16.556102752685547, + "learning_rate": 9.994843374790231e-06, + "loss": 4.9695, + "step": 4510 + }, + { + "epoch": 0.13242510803486413, + "grad_norm": 15.761122703552246, + "learning_rate": 9.994771317993977e-06, + "loss": 4.9773, + "step": 4520 + }, + { + "epoch": 0.13271808393759613, + "grad_norm": 16.42365074157715, + "learning_rate": 9.994698761500486e-06, + "loss": 4.9368, + "step": 4530 + }, + { + "epoch": 0.13301105984032813, + "grad_norm": 15.049148559570312, + "learning_rate": 9.994625705317014e-06, + "loss": 4.984, + "step": 4540 + }, + { + "epoch": 0.13330403574306013, + "grad_norm": 17.474950790405273, + "learning_rate": 9.994552149450874e-06, + "loss": 4.9791, + "step": 4550 + }, + { + "epoch": 0.13359701164579213, + "grad_norm": 18.190866470336914, + "learning_rate": 9.994478093909424e-06, + "loss": 4.9428, + "step": 4560 + }, + { + "epoch": 0.13388998754852413, + "grad_norm": 16.85440444946289, + "learning_rate": 9.99440353870007e-06, + "loss": 4.9718, + "step": 4570 + }, + { + "epoch": 0.13418296345125613, + "grad_norm": 17.271520614624023, + "learning_rate": 9.994328483830273e-06, + "loss": 4.9428, + "step": 4580 + }, + { + "epoch": 0.13447593935398813, + "grad_norm": 18.354938507080078, + "learning_rate": 9.994252929307542e-06, + "loss": 4.9608, + "step": 4590 + }, + { + "epoch": 0.13476891525672013, + "grad_norm": 16.88800811767578, + "learning_rate": 9.994176875139434e-06, + "loss": 4.9663, + "step": 4600 + }, + { + "epoch": 0.13500329597890573, + "eval_bleu": 0.27642388049089306, + "eval_cap_loss": 1.1635016202926636, + "eval_con_loss": 
2.0282726287841797, + "eval_loss": 5.2200469970703125, + "step": 4608 + }, + { + "epoch": 0.13500329597890573, + "eval_bleu": 0.27642388049089306, + "eval_cap_loss": 1.1635016202926636, + "eval_con_loss": 2.0282726287841797, + "eval_loss": 5.2200469970703125, + "eval_runtime": 51.4589, + "eval_samples_per_second": 388.659, + "eval_steps_per_second": 0.389, + "step": 4608 + }, + { + "epoch": 0.13506189115945214, + "grad_norm": 16.54267692565918, + "learning_rate": 9.994100321333561e-06, + "loss": 4.933, + "step": 4610 + }, + { + "epoch": 0.13535486706218414, + "grad_norm": 16.92894172668457, + "learning_rate": 9.994023267897579e-06, + "loss": 4.9679, + "step": 4620 + }, + { + "epoch": 0.13564784296491614, + "grad_norm": 17.28532600402832, + "learning_rate": 9.993945714839198e-06, + "loss": 4.9356, + "step": 4630 + }, + { + "epoch": 0.13594081886764814, + "grad_norm": 16.697650909423828, + "learning_rate": 9.993867662166176e-06, + "loss": 4.9688, + "step": 4640 + }, + { + "epoch": 0.13623379477038014, + "grad_norm": 16.56504249572754, + "learning_rate": 9.993789109886322e-06, + "loss": 4.944, + "step": 4650 + }, + { + "epoch": 0.13652677067311214, + "grad_norm": 13.984020233154297, + "learning_rate": 9.993710058007494e-06, + "loss": 4.9573, + "step": 4660 + }, + { + "epoch": 0.13681974657584414, + "grad_norm": 17.12899398803711, + "learning_rate": 9.993630506537602e-06, + "loss": 4.9494, + "step": 4670 + }, + { + "epoch": 0.13711272247857614, + "grad_norm": 17.706579208374023, + "learning_rate": 9.993550455484604e-06, + "loss": 4.9338, + "step": 4680 + }, + { + "epoch": 0.13740569838130814, + "grad_norm": 15.645454406738281, + "learning_rate": 9.99346990485651e-06, + "loss": 4.9553, + "step": 4690 + }, + { + "epoch": 0.13769867428404015, + "grad_norm": 14.2883939743042, + "learning_rate": 9.993388854661374e-06, + "loss": 4.9135, + "step": 4700 + }, + { + "epoch": 0.13799165018677215, + "grad_norm": 17.532794952392578, + "learning_rate": 9.993307304907311e-06, + 
"loss": 4.9226, + "step": 4710 + }, + { + "epoch": 0.13828462608950415, + "grad_norm": 17.306968688964844, + "learning_rate": 9.993225255602474e-06, + "loss": 4.9272, + "step": 4720 + }, + { + "epoch": 0.13857760199223615, + "grad_norm": 14.47683048248291, + "learning_rate": 9.993142706755076e-06, + "loss": 4.8986, + "step": 4730 + }, + { + "epoch": 0.13887057789496815, + "grad_norm": 18.44178581237793, + "learning_rate": 9.993059658373372e-06, + "loss": 4.92, + "step": 4740 + }, + { + "epoch": 0.13916355379770015, + "grad_norm": 15.797486305236816, + "learning_rate": 9.992976110465673e-06, + "loss": 4.9439, + "step": 4750 + }, + { + "epoch": 0.13945652970043215, + "grad_norm": 17.534671783447266, + "learning_rate": 9.992892063040338e-06, + "loss": 4.9139, + "step": 4760 + }, + { + "epoch": 0.13974950560316415, + "grad_norm": 18.164295196533203, + "learning_rate": 9.992807516105772e-06, + "loss": 4.9549, + "step": 4770 + }, + { + "epoch": 0.14004248150589613, + "grad_norm": 18.672147750854492, + "learning_rate": 9.992722469670435e-06, + "loss": 4.9006, + "step": 4780 + }, + { + "epoch": 0.14033545740862813, + "grad_norm": 16.766111373901367, + "learning_rate": 9.992636923742836e-06, + "loss": 4.9464, + "step": 4790 + }, + { + "epoch": 0.14062843331136013, + "grad_norm": 17.542278289794922, + "learning_rate": 9.992550878331533e-06, + "loss": 4.9256, + "step": 4800 + }, + { + "epoch": 0.14092140921409213, + "grad_norm": 17.49260711669922, + "learning_rate": 9.992464333445134e-06, + "loss": 4.9161, + "step": 4810 + }, + { + "epoch": 0.14121438511682413, + "grad_norm": 15.634653091430664, + "learning_rate": 9.992377289092299e-06, + "loss": 4.8982, + "step": 4820 + }, + { + "epoch": 0.14150736101955613, + "grad_norm": 16.71021270751953, + "learning_rate": 9.992289745281733e-06, + "loss": 4.9066, + "step": 4830 + }, + { + "epoch": 0.14180033692228813, + "grad_norm": 16.333215713500977, + "learning_rate": 9.992201702022198e-06, + "loss": 4.9075, + "step": 4840 + }, + { + 
"epoch": 0.14209331282502013, + "grad_norm": 16.565176010131836, + "learning_rate": 9.9921131593225e-06, + "loss": 4.8676, + "step": 4850 + }, + { + "epoch": 0.14238628872775214, + "grad_norm": 16.00234603881836, + "learning_rate": 9.992024117191498e-06, + "loss": 4.8749, + "step": 4860 + }, + { + "epoch": 0.14267926463048414, + "grad_norm": 17.031755447387695, + "learning_rate": 9.991934575638098e-06, + "loss": 4.9046, + "step": 4870 + }, + { + "epoch": 0.14297224053321614, + "grad_norm": 16.793439865112305, + "learning_rate": 9.991844534671262e-06, + "loss": 4.8979, + "step": 4880 + }, + { + "epoch": 0.14326521643594814, + "grad_norm": 16.339866638183594, + "learning_rate": 9.991753994299996e-06, + "loss": 4.8867, + "step": 4890 + }, + { + "epoch": 0.14355819233868014, + "grad_norm": 17.301761627197266, + "learning_rate": 9.991662954533357e-06, + "loss": 4.8984, + "step": 4900 + }, + { + "epoch": 0.14385116824141214, + "grad_norm": 17.42232894897461, + "learning_rate": 9.991571415380453e-06, + "loss": 4.8844, + "step": 4910 + }, + { + "epoch": 0.14414414414414414, + "grad_norm": 16.042024612426758, + "learning_rate": 9.991479376850444e-06, + "loss": 4.8885, + "step": 4920 + }, + { + "epoch": 0.14443712004687614, + "grad_norm": 15.88487434387207, + "learning_rate": 9.991386838952536e-06, + "loss": 4.9005, + "step": 4930 + }, + { + "epoch": 0.14473009594960815, + "grad_norm": 16.54978370666504, + "learning_rate": 9.99129380169599e-06, + "loss": 4.8941, + "step": 4940 + }, + { + "epoch": 0.14502307185234015, + "grad_norm": 15.41776180267334, + "learning_rate": 9.991200265090108e-06, + "loss": 4.9072, + "step": 4950 + }, + { + "epoch": 0.14531604775507215, + "grad_norm": 17.713272094726562, + "learning_rate": 9.991106229144254e-06, + "loss": 4.8701, + "step": 4960 + }, + { + "epoch": 0.14560902365780415, + "grad_norm": 16.966615676879883, + "learning_rate": 9.991011693867832e-06, + "loss": 4.874, + "step": 4970 + }, + { + "epoch": 0.14590199956053615, + "grad_norm": 
14.783260345458984, + "learning_rate": 9.9909166592703e-06, + "loss": 4.9263, + "step": 4980 + }, + { + "epoch": 0.14619497546326815, + "grad_norm": 16.12990951538086, + "learning_rate": 9.990821125361168e-06, + "loss": 4.9131, + "step": 4990 + }, + { + "epoch": 0.14648795136600015, + "grad_norm": 16.320158004760742, + "learning_rate": 9.99072509214999e-06, + "loss": 4.8968, + "step": 5000 + }, + { + "epoch": 0.14678092726873215, + "grad_norm": 16.19646453857422, + "learning_rate": 9.990628559646376e-06, + "loss": 4.883, + "step": 5010 + }, + { + "epoch": 0.14707390317146415, + "grad_norm": 17.177448272705078, + "learning_rate": 9.990531527859984e-06, + "loss": 4.881, + "step": 5020 + }, + { + "epoch": 0.14736687907419616, + "grad_norm": 17.157873153686523, + "learning_rate": 9.99043399680052e-06, + "loss": 4.9061, + "step": 5030 + }, + { + "epoch": 0.14765985497692816, + "grad_norm": 15.924799919128418, + "learning_rate": 9.990335966477743e-06, + "loss": 4.8872, + "step": 5040 + }, + { + "epoch": 0.14795283087966016, + "grad_norm": 15.931114196777344, + "learning_rate": 9.990237436901457e-06, + "loss": 4.8888, + "step": 5050 + }, + { + "epoch": 0.14824580678239216, + "grad_norm": 15.073715209960938, + "learning_rate": 9.990138408081523e-06, + "loss": 4.8579, + "step": 5060 + }, + { + "epoch": 0.14853878268512416, + "grad_norm": 14.154193878173828, + "learning_rate": 9.990038880027845e-06, + "loss": 4.8662, + "step": 5070 + }, + { + "epoch": 0.14883175858785616, + "grad_norm": 17.58271026611328, + "learning_rate": 9.989938852750382e-06, + "loss": 4.88, + "step": 5080 + }, + { + "epoch": 0.14912473449058816, + "grad_norm": 16.24661636352539, + "learning_rate": 9.989838326259142e-06, + "loss": 4.8335, + "step": 5090 + }, + { + "epoch": 0.14941771039332014, + "grad_norm": 17.41802215576172, + "learning_rate": 9.989737300564181e-06, + "loss": 4.8696, + "step": 5100 + }, + { + "epoch": 0.14971068629605214, + "grad_norm": 15.740774154663086, + "learning_rate": 
9.989635775675608e-06, + "loss": 4.8712, + "step": 5110 + }, + { + "epoch": 0.15000366219878414, + "grad_norm": 16.38467025756836, + "learning_rate": 9.989533751603578e-06, + "loss": 4.852, + "step": 5120 + }, + { + "epoch": 0.15000366219878414, + "eval_bleu": 0.276980505682513, + "eval_cap_loss": 1.1550207138061523, + "eval_con_loss": 1.994401454925537, + "eval_loss": 5.143824100494385, + "step": 5120 + }, + { + "epoch": 0.15000366219878414, + "eval_bleu": 0.276980505682513, + "eval_cap_loss": 1.1550207138061523, + "eval_con_loss": 1.994401454925537, + "eval_loss": 5.143824100494385, + "eval_runtime": 58.3268, + "eval_samples_per_second": 342.895, + "eval_steps_per_second": 0.343, + "step": 5120 + }, + { + "epoch": 0.15029663810151614, + "grad_norm": 16.030378341674805, + "learning_rate": 9.989431228358295e-06, + "loss": 4.8404, + "step": 5130 + }, + { + "epoch": 0.15058961400424814, + "grad_norm": 16.89830207824707, + "learning_rate": 9.989328205950021e-06, + "loss": 4.8701, + "step": 5140 + }, + { + "epoch": 0.15088258990698014, + "grad_norm": 15.026524543762207, + "learning_rate": 9.98922468438906e-06, + "loss": 4.8739, + "step": 5150 + }, + { + "epoch": 0.15117556580971214, + "grad_norm": 15.349937438964844, + "learning_rate": 9.98912066368577e-06, + "loss": 4.8463, + "step": 5160 + }, + { + "epoch": 0.15146854171244414, + "grad_norm": 16.31907844543457, + "learning_rate": 9.989016143850556e-06, + "loss": 4.8565, + "step": 5170 + }, + { + "epoch": 0.15176151761517614, + "grad_norm": 16.54384994506836, + "learning_rate": 9.988911124893877e-06, + "loss": 4.8688, + "step": 5180 + }, + { + "epoch": 0.15205449351790815, + "grad_norm": 17.365642547607422, + "learning_rate": 9.988805606826237e-06, + "loss": 4.8465, + "step": 5190 + }, + { + "epoch": 0.15234746942064015, + "grad_norm": 15.45792007446289, + "learning_rate": 9.988699589658195e-06, + "loss": 4.8396, + "step": 5200 + }, + { + "epoch": 0.15264044532337215, + "grad_norm": 17.521574020385742, + 
"learning_rate": 9.988593073400354e-06, + "loss": 4.8461, + "step": 5210 + }, + { + "epoch": 0.15293342122610415, + "grad_norm": 16.001142501831055, + "learning_rate": 9.988486058063374e-06, + "loss": 4.8305, + "step": 5220 + }, + { + "epoch": 0.15322639712883615, + "grad_norm": 14.591405868530273, + "learning_rate": 9.988378543657959e-06, + "loss": 4.807, + "step": 5230 + }, + { + "epoch": 0.15351937303156815, + "grad_norm": 15.390057563781738, + "learning_rate": 9.988270530194866e-06, + "loss": 4.7978, + "step": 5240 + }, + { + "epoch": 0.15381234893430015, + "grad_norm": 17.452856063842773, + "learning_rate": 9.988162017684899e-06, + "loss": 4.8423, + "step": 5250 + }, + { + "epoch": 0.15410532483703215, + "grad_norm": 14.04377555847168, + "learning_rate": 9.988053006138919e-06, + "loss": 4.8046, + "step": 5260 + }, + { + "epoch": 0.15439830073976415, + "grad_norm": 16.402835845947266, + "learning_rate": 9.987943495567826e-06, + "loss": 4.8379, + "step": 5270 + }, + { + "epoch": 0.15469127664249616, + "grad_norm": 16.739107131958008, + "learning_rate": 9.987833485982582e-06, + "loss": 4.8136, + "step": 5280 + }, + { + "epoch": 0.15498425254522816, + "grad_norm": 14.817536354064941, + "learning_rate": 9.987722977394187e-06, + "loss": 4.8295, + "step": 5290 + }, + { + "epoch": 0.15527722844796016, + "grad_norm": 15.700788497924805, + "learning_rate": 9.9876119698137e-06, + "loss": 4.8298, + "step": 5300 + }, + { + "epoch": 0.15557020435069216, + "grad_norm": 16.641481399536133, + "learning_rate": 9.987500463252225e-06, + "loss": 4.8158, + "step": 5310 + }, + { + "epoch": 0.15586318025342416, + "grad_norm": 14.261114120483398, + "learning_rate": 9.987388457720918e-06, + "loss": 4.8197, + "step": 5320 + }, + { + "epoch": 0.15615615615615616, + "grad_norm": 15.327747344970703, + "learning_rate": 9.987275953230987e-06, + "loss": 4.7981, + "step": 5330 + }, + { + "epoch": 0.15644913205888816, + "grad_norm": 15.78258228302002, + "learning_rate": 9.987162949793684e-06, + 
"loss": 4.798, + "step": 5340 + }, + { + "epoch": 0.15674210796162016, + "grad_norm": 17.259654998779297, + "learning_rate": 9.987049447420316e-06, + "loss": 4.834, + "step": 5350 + }, + { + "epoch": 0.15703508386435217, + "grad_norm": 17.969173431396484, + "learning_rate": 9.98693544612224e-06, + "loss": 4.7988, + "step": 5360 + }, + { + "epoch": 0.15732805976708417, + "grad_norm": 16.97390365600586, + "learning_rate": 9.986820945910856e-06, + "loss": 4.8046, + "step": 5370 + }, + { + "epoch": 0.15762103566981617, + "grad_norm": 16.34779930114746, + "learning_rate": 9.986705946797624e-06, + "loss": 4.782, + "step": 5380 + }, + { + "epoch": 0.15791401157254817, + "grad_norm": 16.498876571655273, + "learning_rate": 9.986590448794045e-06, + "loss": 4.8138, + "step": 5390 + }, + { + "epoch": 0.15820698747528017, + "grad_norm": 15.634765625, + "learning_rate": 9.986474451911678e-06, + "loss": 4.7883, + "step": 5400 + }, + { + "epoch": 0.15849996337801217, + "grad_norm": 16.45941162109375, + "learning_rate": 9.986357956162127e-06, + "loss": 4.8297, + "step": 5410 + }, + { + "epoch": 0.15879293928074417, + "grad_norm": 15.919486045837402, + "learning_rate": 9.986240961557044e-06, + "loss": 4.7891, + "step": 5420 + }, + { + "epoch": 0.15908591518347615, + "grad_norm": 15.361286163330078, + "learning_rate": 9.986123468108134e-06, + "loss": 4.8164, + "step": 5430 + }, + { + "epoch": 0.15937889108620815, + "grad_norm": 15.5082426071167, + "learning_rate": 9.986005475827154e-06, + "loss": 4.7963, + "step": 5440 + }, + { + "epoch": 0.15967186698894015, + "grad_norm": 17.08993148803711, + "learning_rate": 9.985886984725907e-06, + "loss": 4.75, + "step": 5450 + }, + { + "epoch": 0.15996484289167215, + "grad_norm": 16.1760311126709, + "learning_rate": 9.985767994816247e-06, + "loss": 4.8023, + "step": 5460 + }, + { + "epoch": 0.16025781879440415, + "grad_norm": 16.220029830932617, + "learning_rate": 9.98564850611008e-06, + "loss": 4.8043, + "step": 5470 + }, + { + "epoch": 
0.16055079469713615, + "grad_norm": 15.819811820983887, + "learning_rate": 9.985528518619355e-06, + "loss": 4.8206, + "step": 5480 + }, + { + "epoch": 0.16084377059986815, + "grad_norm": 16.26308822631836, + "learning_rate": 9.985408032356084e-06, + "loss": 4.7831, + "step": 5490 + }, + { + "epoch": 0.16113674650260015, + "grad_norm": 18.019954681396484, + "learning_rate": 9.985287047332315e-06, + "loss": 4.8097, + "step": 5500 + }, + { + "epoch": 0.16142972240533215, + "grad_norm": 17.41281509399414, + "learning_rate": 9.985165563560156e-06, + "loss": 4.7871, + "step": 5510 + }, + { + "epoch": 0.16172269830806416, + "grad_norm": 16.246726989746094, + "learning_rate": 9.985043581051755e-06, + "loss": 4.7534, + "step": 5520 + }, + { + "epoch": 0.16201567421079616, + "grad_norm": 14.95959758758545, + "learning_rate": 9.98492109981932e-06, + "loss": 4.7963, + "step": 5530 + }, + { + "epoch": 0.16230865011352816, + "grad_norm": 17.140554428100586, + "learning_rate": 9.984798119875104e-06, + "loss": 4.7917, + "step": 5540 + }, + { + "epoch": 0.16260162601626016, + "grad_norm": 14.25383472442627, + "learning_rate": 9.984674641231409e-06, + "loss": 4.7853, + "step": 5550 + }, + { + "epoch": 0.16289460191899216, + "grad_norm": 14.876585006713867, + "learning_rate": 9.984550663900591e-06, + "loss": 4.7779, + "step": 5560 + }, + { + "epoch": 0.16318757782172416, + "grad_norm": 17.04707145690918, + "learning_rate": 9.98442618789505e-06, + "loss": 4.7843, + "step": 5570 + }, + { + "epoch": 0.16348055372445616, + "grad_norm": 15.26906967163086, + "learning_rate": 9.98430121322724e-06, + "loss": 4.7533, + "step": 5580 + }, + { + "epoch": 0.16377352962718816, + "grad_norm": 16.480648040771484, + "learning_rate": 9.984175739909666e-06, + "loss": 4.7614, + "step": 5590 + }, + { + "epoch": 0.16406650552992016, + "grad_norm": 17.286853790283203, + "learning_rate": 9.984049767954878e-06, + "loss": 4.8246, + "step": 5600 + }, + { + "epoch": 0.16435948143265217, + "grad_norm": 
15.621716499328613, + "learning_rate": 9.983923297375482e-06, + "loss": 4.8034, + "step": 5610 + }, + { + "epoch": 0.16465245733538417, + "grad_norm": 15.769620895385742, + "learning_rate": 9.983796328184127e-06, + "loss": 4.7744, + "step": 5620 + }, + { + "epoch": 0.16494543323811617, + "grad_norm": 15.445747375488281, + "learning_rate": 9.983668860393518e-06, + "loss": 4.8168, + "step": 5630 + }, + { + "epoch": 0.16500402841866257, + "eval_bleu": 0.27986639541477826, + "eval_cap_loss": 1.1458417177200317, + "eval_con_loss": 1.9523842334747314, + "eval_loss": 5.050610542297363, + "step": 5632 + }, + { + "epoch": 0.16500402841866257, + "eval_bleu": 0.27986639541477826, + "eval_cap_loss": 1.1458417177200317, + "eval_con_loss": 1.9523842334747314, + "eval_loss": 5.050610542297363, + "eval_runtime": 57.0973, + "eval_samples_per_second": 350.279, + "eval_steps_per_second": 0.35, + "step": 5632 + }, + { + "epoch": 0.16523840914084817, + "grad_norm": 15.17313289642334, + "learning_rate": 9.983540894016406e-06, + "loss": 4.7629, + "step": 5640 + }, + { + "epoch": 0.16553138504358017, + "grad_norm": 14.035499572753906, + "learning_rate": 9.983412429065593e-06, + "loss": 4.7668, + "step": 5650 + }, + { + "epoch": 0.16582436094631217, + "grad_norm": 14.47353744506836, + "learning_rate": 9.983283465553933e-06, + "loss": 4.7574, + "step": 5660 + }, + { + "epoch": 0.16611733684904417, + "grad_norm": 16.319007873535156, + "learning_rate": 9.983154003494328e-06, + "loss": 4.7406, + "step": 5670 + }, + { + "epoch": 0.16641031275177617, + "grad_norm": 15.590869903564453, + "learning_rate": 9.983024042899727e-06, + "loss": 4.7956, + "step": 5680 + }, + { + "epoch": 0.16670328865450817, + "grad_norm": 17.164657592773438, + "learning_rate": 9.982893583783135e-06, + "loss": 4.7704, + "step": 5690 + }, + { + "epoch": 0.16699626455724018, + "grad_norm": 15.989297866821289, + "learning_rate": 9.982762626157603e-06, + "loss": 4.7543, + "step": 5700 + }, + { + "epoch": 0.16728924045997218, 
+ "grad_norm": 13.784409523010254, + "learning_rate": 9.982631170036233e-06, + "loss": 4.7577, + "step": 5710 + }, + { + "epoch": 0.16758221636270418, + "grad_norm": 15.928985595703125, + "learning_rate": 9.982499215432173e-06, + "loss": 4.7248, + "step": 5720 + }, + { + "epoch": 0.16787519226543618, + "grad_norm": 14.551563262939453, + "learning_rate": 9.982366762358626e-06, + "loss": 4.756, + "step": 5730 + }, + { + "epoch": 0.16816816816816818, + "grad_norm": 17.085264205932617, + "learning_rate": 9.982233810828846e-06, + "loss": 4.7673, + "step": 5740 + }, + { + "epoch": 0.16846114407090018, + "grad_norm": 13.516256332397461, + "learning_rate": 9.982100360856131e-06, + "loss": 4.7392, + "step": 5750 + }, + { + "epoch": 0.16875411997363216, + "grad_norm": 15.207222938537598, + "learning_rate": 9.981966412453833e-06, + "loss": 4.7625, + "step": 5760 + }, + { + "epoch": 0.16904709587636416, + "grad_norm": 15.476594924926758, + "learning_rate": 9.98183196563535e-06, + "loss": 4.7547, + "step": 5770 + }, + { + "epoch": 0.16934007177909616, + "grad_norm": 16.047700881958008, + "learning_rate": 9.98169702041414e-06, + "loss": 4.7229, + "step": 5780 + }, + { + "epoch": 0.16963304768182816, + "grad_norm": 16.775896072387695, + "learning_rate": 9.981561576803695e-06, + "loss": 4.7512, + "step": 5790 + }, + { + "epoch": 0.16992602358456016, + "grad_norm": 16.41984748840332, + "learning_rate": 9.98142563481757e-06, + "loss": 4.7089, + "step": 5800 + }, + { + "epoch": 0.17021899948729216, + "grad_norm": 16.64649200439453, + "learning_rate": 9.981289194469363e-06, + "loss": 4.7425, + "step": 5810 + }, + { + "epoch": 0.17051197539002416, + "grad_norm": 15.919966697692871, + "learning_rate": 9.981152255772726e-06, + "loss": 4.7599, + "step": 5820 + }, + { + "epoch": 0.17080495129275616, + "grad_norm": 15.645684242248535, + "learning_rate": 9.981014818741357e-06, + "loss": 4.7198, + "step": 5830 + }, + { + "epoch": 0.17109792719548816, + "grad_norm": 16.43389320373535, + 
"learning_rate": 9.980876883389008e-06, + "loss": 4.7382, + "step": 5840 + }, + { + "epoch": 0.17139090309822017, + "grad_norm": 16.09212303161621, + "learning_rate": 9.980738449729477e-06, + "loss": 4.7807, + "step": 5850 + }, + { + "epoch": 0.17168387900095217, + "grad_norm": 15.710827827453613, + "learning_rate": 9.980599517776612e-06, + "loss": 4.7686, + "step": 5860 + }, + { + "epoch": 0.17197685490368417, + "grad_norm": 14.70943832397461, + "learning_rate": 9.980460087544317e-06, + "loss": 4.7231, + "step": 5870 + }, + { + "epoch": 0.17226983080641617, + "grad_norm": 16.12755012512207, + "learning_rate": 9.980320159046538e-06, + "loss": 4.724, + "step": 5880 + }, + { + "epoch": 0.17256280670914817, + "grad_norm": 15.553915023803711, + "learning_rate": 9.980179732297272e-06, + "loss": 4.7181, + "step": 5890 + }, + { + "epoch": 0.17285578261188017, + "grad_norm": 14.851536750793457, + "learning_rate": 9.98003880731057e-06, + "loss": 4.7443, + "step": 5900 + }, + { + "epoch": 0.17314875851461217, + "grad_norm": 15.605088233947754, + "learning_rate": 9.979897384100533e-06, + "loss": 4.7722, + "step": 5910 + }, + { + "epoch": 0.17344173441734417, + "grad_norm": 14.485283851623535, + "learning_rate": 9.979755462681306e-06, + "loss": 4.7173, + "step": 5920 + }, + { + "epoch": 0.17373471032007617, + "grad_norm": 15.337462425231934, + "learning_rate": 9.979613043067091e-06, + "loss": 4.7172, + "step": 5930 + }, + { + "epoch": 0.17402768622280818, + "grad_norm": 14.763211250305176, + "learning_rate": 9.979470125272132e-06, + "loss": 4.7677, + "step": 5940 + }, + { + "epoch": 0.17432066212554018, + "grad_norm": 14.636540412902832, + "learning_rate": 9.97932670931073e-06, + "loss": 4.7094, + "step": 5950 + }, + { + "epoch": 0.17461363802827218, + "grad_norm": 13.65485668182373, + "learning_rate": 9.979182795197231e-06, + "loss": 4.733, + "step": 5960 + }, + { + "epoch": 0.17490661393100418, + "grad_norm": 14.75873851776123, + "learning_rate": 9.979038382946036e-06, + 
"loss": 4.7136, + "step": 5970 + }, + { + "epoch": 0.17519958983373618, + "grad_norm": 15.407052040100098, + "learning_rate": 9.978893472571588e-06, + "loss": 4.7456, + "step": 5980 + }, + { + "epoch": 0.17549256573646818, + "grad_norm": 16.18305015563965, + "learning_rate": 9.978748064088388e-06, + "loss": 4.7097, + "step": 5990 + }, + { + "epoch": 0.17578554163920018, + "grad_norm": 16.594491958618164, + "learning_rate": 9.978602157510984e-06, + "loss": 4.7028, + "step": 6000 + }, + { + "epoch": 0.17607851754193218, + "grad_norm": 16.053319931030273, + "learning_rate": 9.978455752853967e-06, + "loss": 4.7095, + "step": 6010 + }, + { + "epoch": 0.17637149344466418, + "grad_norm": 15.798147201538086, + "learning_rate": 9.978308850131991e-06, + "loss": 4.7106, + "step": 6020 + }, + { + "epoch": 0.17666446934739619, + "grad_norm": 16.6871395111084, + "learning_rate": 9.978161449359749e-06, + "loss": 4.703, + "step": 6030 + }, + { + "epoch": 0.1769574452501282, + "grad_norm": 16.79149627685547, + "learning_rate": 9.978013550551989e-06, + "loss": 4.7145, + "step": 6040 + }, + { + "epoch": 0.1772504211528602, + "grad_norm": 15.30480670928955, + "learning_rate": 9.977865153723508e-06, + "loss": 4.7031, + "step": 6050 + }, + { + "epoch": 0.1775433970555922, + "grad_norm": 15.303237915039062, + "learning_rate": 9.977716258889147e-06, + "loss": 4.7043, + "step": 6060 + }, + { + "epoch": 0.1778363729583242, + "grad_norm": 15.29787540435791, + "learning_rate": 9.97756686606381e-06, + "loss": 4.7056, + "step": 6070 + }, + { + "epoch": 0.1781293488610562, + "grad_norm": 14.0592041015625, + "learning_rate": 9.977416975262437e-06, + "loss": 4.7233, + "step": 6080 + }, + { + "epoch": 0.17842232476378816, + "grad_norm": 14.503782272338867, + "learning_rate": 9.977266586500026e-06, + "loss": 4.6912, + "step": 6090 + }, + { + "epoch": 0.17871530066652017, + "grad_norm": 14.904102325439453, + "learning_rate": 9.977115699791622e-06, + "loss": 4.6739, + "step": 6100 + }, + { + "epoch": 
0.17900827656925217, + "grad_norm": 13.647062301635742, + "learning_rate": 9.976964315152321e-06, + "loss": 4.7083, + "step": 6110 + }, + { + "epoch": 0.17930125247198417, + "grad_norm": 16.106170654296875, + "learning_rate": 9.976812432597267e-06, + "loss": 4.7091, + "step": 6120 + }, + { + "epoch": 0.17959422837471617, + "grad_norm": 15.82804012298584, + "learning_rate": 9.976660052141655e-06, + "loss": 4.7288, + "step": 6130 + }, + { + "epoch": 0.17988720427744817, + "grad_norm": 15.924703598022461, + "learning_rate": 9.976507173800731e-06, + "loss": 4.6788, + "step": 6140 + }, + { + "epoch": 0.18000439463854098, + "eval_bleu": 0.2845192269447127, + "eval_cap_loss": 1.1345463991165161, + "eval_con_loss": 1.9172676801681519, + "eval_loss": 4.969081878662109, + "step": 6144 + }, + { + "epoch": 0.18000439463854098, + "eval_bleu": 0.2845192269447127, + "eval_cap_loss": 1.1345463991165161, + "eval_con_loss": 1.9172676801681519, + "eval_loss": 4.969081878662109, + "eval_runtime": 57.2775, + "eval_samples_per_second": 349.177, + "eval_steps_per_second": 0.349, + "step": 6144 + }, + { + "epoch": 0.18018018018018017, + "grad_norm": 14.74857234954834, + "learning_rate": 9.97635379758979e-06, + "loss": 4.7153, + "step": 6150 + }, + { + "epoch": 0.18047315608291217, + "grad_norm": 16.93888282775879, + "learning_rate": 9.976199923524175e-06, + "loss": 4.6889, + "step": 6160 + }, + { + "epoch": 0.18076613198564417, + "grad_norm": 17.675434112548828, + "learning_rate": 9.976045551619279e-06, + "loss": 4.7079, + "step": 6170 + }, + { + "epoch": 0.18105910788837618, + "grad_norm": 16.49285316467285, + "learning_rate": 9.975890681890548e-06, + "loss": 4.6786, + "step": 6180 + }, + { + "epoch": 0.18135208379110818, + "grad_norm": 15.255967140197754, + "learning_rate": 9.975735314353475e-06, + "loss": 4.6742, + "step": 6190 + }, + { + "epoch": 0.18164505969384018, + "grad_norm": 15.80370044708252, + "learning_rate": 9.975579449023604e-06, + "loss": 4.6788, + "step": 6200 + }, + { + 
"epoch": 0.18193803559657218, + "grad_norm": 14.939014434814453, + "learning_rate": 9.97542308591653e-06, + "loss": 4.7091, + "step": 6210 + }, + { + "epoch": 0.18223101149930418, + "grad_norm": 15.409286499023438, + "learning_rate": 9.975266225047892e-06, + "loss": 4.6736, + "step": 6220 + }, + { + "epoch": 0.18252398740203618, + "grad_norm": 13.877237319946289, + "learning_rate": 9.975108866433385e-06, + "loss": 4.6798, + "step": 6230 + }, + { + "epoch": 0.18281696330476818, + "grad_norm": 16.21504020690918, + "learning_rate": 9.974951010088753e-06, + "loss": 4.6862, + "step": 6240 + }, + { + "epoch": 0.18310993920750018, + "grad_norm": 15.583576202392578, + "learning_rate": 9.974792656029789e-06, + "loss": 4.6932, + "step": 6250 + }, + { + "epoch": 0.18340291511023218, + "grad_norm": 15.107256889343262, + "learning_rate": 9.974633804272333e-06, + "loss": 4.6623, + "step": 6260 + }, + { + "epoch": 0.18369589101296419, + "grad_norm": 16.310813903808594, + "learning_rate": 9.97447445483228e-06, + "loss": 4.6885, + "step": 6270 + }, + { + "epoch": 0.1839888669156962, + "grad_norm": 16.671072006225586, + "learning_rate": 9.974314607725569e-06, + "loss": 4.6907, + "step": 6280 + }, + { + "epoch": 0.1842818428184282, + "grad_norm": 14.67841625213623, + "learning_rate": 9.974154262968193e-06, + "loss": 4.6988, + "step": 6290 + }, + { + "epoch": 0.1845748187211602, + "grad_norm": 13.555002212524414, + "learning_rate": 9.973993420576194e-06, + "loss": 4.6532, + "step": 6300 + }, + { + "epoch": 0.1848677946238922, + "grad_norm": 14.97858715057373, + "learning_rate": 9.973832080565663e-06, + "loss": 4.6692, + "step": 6310 + }, + { + "epoch": 0.1851607705266242, + "grad_norm": 13.087434768676758, + "learning_rate": 9.97367024295274e-06, + "loss": 4.6465, + "step": 6320 + }, + { + "epoch": 0.1854537464293562, + "grad_norm": 15.094242095947266, + "learning_rate": 9.973507907753618e-06, + "loss": 4.6635, + "step": 6330 + }, + { + "epoch": 0.1857467223320882, + "grad_norm": 
17.715232849121094, + "learning_rate": 9.973345074984537e-06, + "loss": 4.7024, + "step": 6340 + }, + { + "epoch": 0.1860396982348202, + "grad_norm": 18.374414443969727, + "learning_rate": 9.973181744661786e-06, + "loss": 4.6716, + "step": 6350 + }, + { + "epoch": 0.1863326741375522, + "grad_norm": 12.571276664733887, + "learning_rate": 9.973017916801708e-06, + "loss": 4.6933, + "step": 6360 + }, + { + "epoch": 0.1866256500402842, + "grad_norm": 14.855701446533203, + "learning_rate": 9.97285359142069e-06, + "loss": 4.6673, + "step": 6370 + }, + { + "epoch": 0.1869186259430162, + "grad_norm": 14.283609390258789, + "learning_rate": 9.972688768535174e-06, + "loss": 4.6444, + "step": 6380 + }, + { + "epoch": 0.1872116018457482, + "grad_norm": 14.322563171386719, + "learning_rate": 9.972523448161649e-06, + "loss": 4.6585, + "step": 6390 + }, + { + "epoch": 0.1875045777484802, + "grad_norm": 15.801717758178711, + "learning_rate": 9.97235763031665e-06, + "loss": 4.6637, + "step": 6400 + }, + { + "epoch": 0.1877975536512122, + "grad_norm": 14.852728843688965, + "learning_rate": 9.972191315016775e-06, + "loss": 4.6787, + "step": 6410 + }, + { + "epoch": 0.18809052955394417, + "grad_norm": 18.440458297729492, + "learning_rate": 9.972024502278657e-06, + "loss": 4.6718, + "step": 6420 + }, + { + "epoch": 0.18838350545667618, + "grad_norm": 15.053860664367676, + "learning_rate": 9.971857192118983e-06, + "loss": 4.6595, + "step": 6430 + }, + { + "epoch": 0.18867648135940818, + "grad_norm": 15.789196014404297, + "learning_rate": 9.971689384554496e-06, + "loss": 4.6616, + "step": 6440 + }, + { + "epoch": 0.18896945726214018, + "grad_norm": 15.545053482055664, + "learning_rate": 9.971521079601983e-06, + "loss": 4.6574, + "step": 6450 + }, + { + "epoch": 0.18926243316487218, + "grad_norm": 14.628952980041504, + "learning_rate": 9.97135227727828e-06, + "loss": 4.6432, + "step": 6460 + }, + { + "epoch": 0.18955540906760418, + "grad_norm": 14.379283905029297, + "learning_rate": 
9.971182977600274e-06, + "loss": 4.6227, + "step": 6470 + }, + { + "epoch": 0.18984838497033618, + "grad_norm": 14.687061309814453, + "learning_rate": 9.971013180584906e-06, + "loss": 4.6353, + "step": 6480 + }, + { + "epoch": 0.19014136087306818, + "grad_norm": 14.41526985168457, + "learning_rate": 9.970842886249161e-06, + "loss": 4.6496, + "step": 6490 + }, + { + "epoch": 0.19043433677580018, + "grad_norm": 14.891467094421387, + "learning_rate": 9.970672094610075e-06, + "loss": 4.6747, + "step": 6500 + }, + { + "epoch": 0.19072731267853218, + "grad_norm": 15.25101375579834, + "learning_rate": 9.970500805684738e-06, + "loss": 4.6197, + "step": 6510 + }, + { + "epoch": 0.19102028858126419, + "grad_norm": 14.67125415802002, + "learning_rate": 9.970329019490282e-06, + "loss": 4.6409, + "step": 6520 + }, + { + "epoch": 0.1913132644839962, + "grad_norm": 14.196409225463867, + "learning_rate": 9.970156736043897e-06, + "loss": 4.652, + "step": 6530 + }, + { + "epoch": 0.1916062403867282, + "grad_norm": 16.81563377380371, + "learning_rate": 9.969983955362818e-06, + "loss": 4.6303, + "step": 6540 + }, + { + "epoch": 0.1918992162894602, + "grad_norm": 15.9114351272583, + "learning_rate": 9.969810677464329e-06, + "loss": 4.634, + "step": 6550 + }, + { + "epoch": 0.1921921921921922, + "grad_norm": 14.85570240020752, + "learning_rate": 9.969636902365766e-06, + "loss": 4.6299, + "step": 6560 + }, + { + "epoch": 0.1924851680949242, + "grad_norm": 15.113425254821777, + "learning_rate": 9.969462630084517e-06, + "loss": 4.6258, + "step": 6570 + }, + { + "epoch": 0.1927781439976562, + "grad_norm": 15.140312194824219, + "learning_rate": 9.969287860638013e-06, + "loss": 4.6214, + "step": 6580 + }, + { + "epoch": 0.1930711199003882, + "grad_norm": 13.74913501739502, + "learning_rate": 9.969112594043742e-06, + "loss": 4.5978, + "step": 6590 + }, + { + "epoch": 0.1933640958031202, + "grad_norm": 14.54120922088623, + "learning_rate": 9.968936830319236e-06, + "loss": 4.6471, + "step": 6600 
+ }, + { + "epoch": 0.1936570717058522, + "grad_norm": 15.712841033935547, + "learning_rate": 9.96876056948208e-06, + "loss": 4.6342, + "step": 6610 + }, + { + "epoch": 0.1939500476085842, + "grad_norm": 16.07990074157715, + "learning_rate": 9.968583811549905e-06, + "loss": 4.602, + "step": 6620 + }, + { + "epoch": 0.1942430235113162, + "grad_norm": 13.635342597961426, + "learning_rate": 9.9684065565404e-06, + "loss": 4.5968, + "step": 6630 + }, + { + "epoch": 0.1945359994140482, + "grad_norm": 15.570161819458008, + "learning_rate": 9.968228804471295e-06, + "loss": 4.651, + "step": 6640 + }, + { + "epoch": 0.1948289753167802, + "grad_norm": 15.59768295288086, + "learning_rate": 9.968050555360374e-06, + "loss": 4.6101, + "step": 6650 + }, + { + "epoch": 0.1950047608584194, + "eval_bleu": 0.28511758721384334, + "eval_cap_loss": 1.1240849494934082, + "eval_con_loss": 1.883932113647461, + "eval_loss": 4.891949653625488, + "step": 6656 + }, + { + "epoch": 0.1950047608584194, + "eval_bleu": 0.28511758721384334, + "eval_cap_loss": 1.1240849494934082, + "eval_con_loss": 1.883932113647461, + "eval_loss": 4.891949653625488, + "eval_runtime": 56.6022, + "eval_samples_per_second": 353.343, + "eval_steps_per_second": 0.353, + "step": 6656 + }, + { + "epoch": 0.1951219512195122, + "grad_norm": 15.40671157836914, + "learning_rate": 9.967871809225468e-06, + "loss": 4.6038, + "step": 6660 + }, + { + "epoch": 0.1954149271222442, + "grad_norm": 16.90543556213379, + "learning_rate": 9.96769256608446e-06, + "loss": 4.6537, + "step": 6670 + }, + { + "epoch": 0.1957079030249762, + "grad_norm": 14.991811752319336, + "learning_rate": 9.967512825955286e-06, + "loss": 4.6168, + "step": 6680 + }, + { + "epoch": 0.1960008789277082, + "grad_norm": 16.15833854675293, + "learning_rate": 9.967332588855925e-06, + "loss": 4.6151, + "step": 6690 + }, + { + "epoch": 0.1962938548304402, + "grad_norm": 14.783873558044434, + "learning_rate": 9.967151854804406e-06, + "loss": 4.6191, + "step": 6700 + }, + 
{ + "epoch": 0.1965868307331722, + "grad_norm": 16.476539611816406, + "learning_rate": 9.966970623818814e-06, + "loss": 4.6287, + "step": 6710 + }, + { + "epoch": 0.1968798066359042, + "grad_norm": 15.758987426757812, + "learning_rate": 9.966788895917282e-06, + "loss": 4.6048, + "step": 6720 + }, + { + "epoch": 0.1971727825386362, + "grad_norm": 13.98912239074707, + "learning_rate": 9.966606671117984e-06, + "loss": 4.6109, + "step": 6730 + }, + { + "epoch": 0.1974657584413682, + "grad_norm": 16.502079010009766, + "learning_rate": 9.966423949439157e-06, + "loss": 4.6081, + "step": 6740 + }, + { + "epoch": 0.19775873434410018, + "grad_norm": 15.107707023620605, + "learning_rate": 9.966240730899078e-06, + "loss": 4.5966, + "step": 6750 + }, + { + "epoch": 0.19805171024683219, + "grad_norm": 15.08143424987793, + "learning_rate": 9.966057015516077e-06, + "loss": 4.6085, + "step": 6760 + }, + { + "epoch": 0.1983446861495642, + "grad_norm": 15.142578125, + "learning_rate": 9.965872803308535e-06, + "loss": 4.5992, + "step": 6770 + }, + { + "epoch": 0.1986376620522962, + "grad_norm": 14.24541187286377, + "learning_rate": 9.965688094294878e-06, + "loss": 4.5956, + "step": 6780 + }, + { + "epoch": 0.1989306379550282, + "grad_norm": 12.681796073913574, + "learning_rate": 9.96550288849359e-06, + "loss": 4.5953, + "step": 6790 + }, + { + "epoch": 0.1992236138577602, + "grad_norm": 15.333942413330078, + "learning_rate": 9.965317185923197e-06, + "loss": 4.5794, + "step": 6800 + }, + { + "epoch": 0.1995165897604922, + "grad_norm": 16.709064483642578, + "learning_rate": 9.965130986602279e-06, + "loss": 4.5707, + "step": 6810 + }, + { + "epoch": 0.1998095656632242, + "grad_norm": 13.78734302520752, + "learning_rate": 9.96494429054946e-06, + "loss": 4.613, + "step": 6820 + }, + { + "epoch": 0.2001025415659562, + "grad_norm": 16.76524543762207, + "learning_rate": 9.964757097783422e-06, + "loss": 4.6124, + "step": 6830 + }, + { + "epoch": 0.2003955174686882, + "grad_norm": 
14.452737808227539, + "learning_rate": 9.964569408322891e-06, + "loss": 4.5845, + "step": 6840 + }, + { + "epoch": 0.2006884933714202, + "grad_norm": 15.363730430603027, + "learning_rate": 9.964381222186646e-06, + "loss": 4.6043, + "step": 6850 + }, + { + "epoch": 0.2009814692741522, + "grad_norm": 14.215633392333984, + "learning_rate": 9.964192539393512e-06, + "loss": 4.5976, + "step": 6860 + }, + { + "epoch": 0.2012744451768842, + "grad_norm": 13.864556312561035, + "learning_rate": 9.964003359962365e-06, + "loss": 4.6, + "step": 6870 + }, + { + "epoch": 0.2015674210796162, + "grad_norm": 13.628000259399414, + "learning_rate": 9.963813683912132e-06, + "loss": 4.5879, + "step": 6880 + }, + { + "epoch": 0.2018603969823482, + "grad_norm": 15.548897743225098, + "learning_rate": 9.96362351126179e-06, + "loss": 4.5973, + "step": 6890 + }, + { + "epoch": 0.2021533728850802, + "grad_norm": 17.403642654418945, + "learning_rate": 9.963432842030363e-06, + "loss": 4.588, + "step": 6900 + }, + { + "epoch": 0.2024463487878122, + "grad_norm": 15.315057754516602, + "learning_rate": 9.96324167623693e-06, + "loss": 4.5808, + "step": 6910 + }, + { + "epoch": 0.2027393246905442, + "grad_norm": 16.22166633605957, + "learning_rate": 9.96305001390061e-06, + "loss": 4.5876, + "step": 6920 + }, + { + "epoch": 0.2030323005932762, + "grad_norm": 13.701642990112305, + "learning_rate": 9.962857855040581e-06, + "loss": 4.5611, + "step": 6930 + }, + { + "epoch": 0.2033252764960082, + "grad_norm": 13.780191421508789, + "learning_rate": 9.962665199676069e-06, + "loss": 4.5725, + "step": 6940 + }, + { + "epoch": 0.2036182523987402, + "grad_norm": 14.724891662597656, + "learning_rate": 9.962472047826347e-06, + "loss": 4.6118, + "step": 6950 + }, + { + "epoch": 0.2039112283014722, + "grad_norm": 15.459653854370117, + "learning_rate": 9.962278399510734e-06, + "loss": 4.5724, + "step": 6960 + }, + { + "epoch": 0.2042042042042042, + "grad_norm": 15.57671070098877, + "learning_rate": 
9.96208425474861e-06, + "loss": 4.5873, + "step": 6970 + }, + { + "epoch": 0.2044971801069362, + "grad_norm": 14.879781723022461, + "learning_rate": 9.961889613559396e-06, + "loss": 4.5875, + "step": 6980 + }, + { + "epoch": 0.2047901560096682, + "grad_norm": 15.176685333251953, + "learning_rate": 9.961694475962562e-06, + "loss": 4.62, + "step": 6990 + }, + { + "epoch": 0.2050831319124002, + "grad_norm": 15.677452087402344, + "learning_rate": 9.961498841977635e-06, + "loss": 4.5628, + "step": 7000 + }, + { + "epoch": 0.20537610781513221, + "grad_norm": 14.75327205657959, + "learning_rate": 9.961302711624183e-06, + "loss": 4.5744, + "step": 7010 + }, + { + "epoch": 0.20566908371786422, + "grad_norm": 15.789128303527832, + "learning_rate": 9.96110608492183e-06, + "loss": 4.5808, + "step": 7020 + }, + { + "epoch": 0.20596205962059622, + "grad_norm": 14.584884643554688, + "learning_rate": 9.960908961890244e-06, + "loss": 4.5344, + "step": 7030 + }, + { + "epoch": 0.20625503552332822, + "grad_norm": 16.667415618896484, + "learning_rate": 9.96071134254915e-06, + "loss": 4.5511, + "step": 7040 + }, + { + "epoch": 0.20654801142606022, + "grad_norm": 14.458191871643066, + "learning_rate": 9.960513226918317e-06, + "loss": 4.581, + "step": 7050 + }, + { + "epoch": 0.20684098732879222, + "grad_norm": 16.42026710510254, + "learning_rate": 9.960314615017565e-06, + "loss": 4.5861, + "step": 7060 + }, + { + "epoch": 0.20713396323152422, + "grad_norm": 15.60051155090332, + "learning_rate": 9.960115506866766e-06, + "loss": 4.579, + "step": 7070 + }, + { + "epoch": 0.2074269391342562, + "grad_norm": 16.610950469970703, + "learning_rate": 9.959915902485837e-06, + "loss": 4.5741, + "step": 7080 + }, + { + "epoch": 0.2077199150369882, + "grad_norm": 15.08176326751709, + "learning_rate": 9.959715801894748e-06, + "loss": 4.5809, + "step": 7090 + }, + { + "epoch": 0.2080128909397202, + "grad_norm": 14.390581130981445, + "learning_rate": 9.959515205113517e-06, + "loss": 4.5854, + "step": 
7100 + }, + { + "epoch": 0.2083058668424522, + "grad_norm": 15.166903495788574, + "learning_rate": 9.959314112162214e-06, + "loss": 4.547, + "step": 7110 + }, + { + "epoch": 0.2085988427451842, + "grad_norm": 14.44621467590332, + "learning_rate": 9.959112523060957e-06, + "loss": 4.5423, + "step": 7120 + }, + { + "epoch": 0.2088918186479162, + "grad_norm": 15.150703430175781, + "learning_rate": 9.958910437829914e-06, + "loss": 4.5495, + "step": 7130 + }, + { + "epoch": 0.2091847945506482, + "grad_norm": 13.500052452087402, + "learning_rate": 9.9587078564893e-06, + "loss": 4.5496, + "step": 7140 + }, + { + "epoch": 0.2094777704533802, + "grad_norm": 13.840226173400879, + "learning_rate": 9.958504779059386e-06, + "loss": 4.5406, + "step": 7150 + }, + { + "epoch": 0.2097707463561122, + "grad_norm": 16.082204818725586, + "learning_rate": 9.958301205560485e-06, + "loss": 4.4852, + "step": 7160 + }, + { + "epoch": 0.2100051270782978, + "eval_bleu": 0.2879774182479831, + "eval_cap_loss": 1.1206718683242798, + "eval_con_loss": 1.8598018884658813, + "eval_loss": 4.840275764465332, + "step": 7168 + }, + { + "epoch": 0.2100051270782978, + "eval_bleu": 0.2879774182479831, + "eval_cap_loss": 1.1206718683242798, + "eval_con_loss": 1.8598018884658813, + "eval_loss": 4.840275764465332, + "eval_runtime": 57.7975, + "eval_samples_per_second": 346.036, + "eval_steps_per_second": 0.346, + "step": 7168 + }, + { + "epoch": 0.2100637222588442, + "grad_norm": 14.636655807495117, + "learning_rate": 9.958097136012967e-06, + "loss": 4.5352, + "step": 7170 + }, + { + "epoch": 0.2103566981615762, + "grad_norm": 15.393104553222656, + "learning_rate": 9.957892570437243e-06, + "loss": 4.5552, + "step": 7180 + }, + { + "epoch": 0.2106496740643082, + "grad_norm": 15.030762672424316, + "learning_rate": 9.957687508853783e-06, + "loss": 4.5203, + "step": 7190 + }, + { + "epoch": 0.2109426499670402, + "grad_norm": 15.426992416381836, + "learning_rate": 9.957481951283103e-06, + "loss": 4.561, + "step": 
7200 + }, + { + "epoch": 0.2112356258697722, + "grad_norm": 13.718940734863281, + "learning_rate": 9.957275897745764e-06, + "loss": 4.5125, + "step": 7210 + }, + { + "epoch": 0.2115286017725042, + "grad_norm": 15.797172546386719, + "learning_rate": 9.95706934826238e-06, + "loss": 4.5573, + "step": 7220 + }, + { + "epoch": 0.2118215776752362, + "grad_norm": 13.598864555358887, + "learning_rate": 9.956862302853619e-06, + "loss": 4.5406, + "step": 7230 + }, + { + "epoch": 0.2121145535779682, + "grad_norm": 15.959887504577637, + "learning_rate": 9.956654761540192e-06, + "loss": 4.5844, + "step": 7240 + }, + { + "epoch": 0.2124075294807002, + "grad_norm": 13.924298286437988, + "learning_rate": 9.956446724342863e-06, + "loss": 4.5743, + "step": 7250 + }, + { + "epoch": 0.21270050538343221, + "grad_norm": 15.78144359588623, + "learning_rate": 9.956238191282445e-06, + "loss": 4.5756, + "step": 7260 + }, + { + "epoch": 0.21299348128616422, + "grad_norm": 15.623315811157227, + "learning_rate": 9.956029162379801e-06, + "loss": 4.5405, + "step": 7270 + }, + { + "epoch": 0.21328645718889622, + "grad_norm": 14.016671180725098, + "learning_rate": 9.955819637655841e-06, + "loss": 4.5417, + "step": 7280 + }, + { + "epoch": 0.21357943309162822, + "grad_norm": 14.254438400268555, + "learning_rate": 9.95560961713153e-06, + "loss": 4.5389, + "step": 7290 + }, + { + "epoch": 0.21387240899436022, + "grad_norm": 15.031105041503906, + "learning_rate": 9.955399100827877e-06, + "loss": 4.5357, + "step": 7300 + }, + { + "epoch": 0.21416538489709222, + "grad_norm": 17.03110122680664, + "learning_rate": 9.95518808876594e-06, + "loss": 4.5492, + "step": 7310 + }, + { + "epoch": 0.21445836079982422, + "grad_norm": 14.338513374328613, + "learning_rate": 9.954976580966837e-06, + "loss": 4.5636, + "step": 7320 + }, + { + "epoch": 0.21475133670255622, + "grad_norm": 15.351216316223145, + "learning_rate": 9.954764577451722e-06, + "loss": 4.5369, + "step": 7330 + }, + { + "epoch": 0.21504431260528822, 
+ "grad_norm": 15.944198608398438, + "learning_rate": 9.954552078241805e-06, + "loss": 4.5543, + "step": 7340 + }, + { + "epoch": 0.21533728850802022, + "grad_norm": 14.705706596374512, + "learning_rate": 9.954339083358351e-06, + "loss": 4.5368, + "step": 7350 + }, + { + "epoch": 0.21563026441075223, + "grad_norm": 13.835514068603516, + "learning_rate": 9.954125592822663e-06, + "loss": 4.5442, + "step": 7360 + }, + { + "epoch": 0.21592324031348423, + "grad_norm": 14.248089790344238, + "learning_rate": 9.9539116066561e-06, + "loss": 4.5444, + "step": 7370 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 14.896268844604492, + "learning_rate": 9.953697124880073e-06, + "loss": 4.5354, + "step": 7380 + }, + { + "epoch": 0.21650919211894823, + "grad_norm": 15.555803298950195, + "learning_rate": 9.953482147516036e-06, + "loss": 4.5345, + "step": 7390 + }, + { + "epoch": 0.21680216802168023, + "grad_norm": 14.556449890136719, + "learning_rate": 9.9532666745855e-06, + "loss": 4.5178, + "step": 7400 + }, + { + "epoch": 0.2170951439244122, + "grad_norm": 15.550827026367188, + "learning_rate": 9.953072325256474e-06, + "loss": 4.532, + "step": 7410 + }, + { + "epoch": 0.2173881198271442, + "grad_norm": 14.562829971313477, + "learning_rate": 9.952855910809017e-06, + "loss": 4.5118, + "step": 7420 + }, + { + "epoch": 0.2176810957298762, + "grad_norm": 15.32539176940918, + "learning_rate": 9.952639000857707e-06, + "loss": 4.5258, + "step": 7430 + }, + { + "epoch": 0.2179740716326082, + "grad_norm": 13.963956832885742, + "learning_rate": 9.95242159542425e-06, + "loss": 4.4989, + "step": 7440 + }, + { + "epoch": 0.2182670475353402, + "grad_norm": 13.074813842773438, + "learning_rate": 9.952203694530395e-06, + "loss": 4.4834, + "step": 7450 + }, + { + "epoch": 0.2185600234380722, + "grad_norm": 15.656610488891602, + "learning_rate": 9.95198529819794e-06, + "loss": 4.5103, + "step": 7460 + }, + { + "epoch": 0.2188529993408042, + "grad_norm": 15.539159774780273, + 
"learning_rate": 9.951766406448735e-06, + "loss": 4.4983, + "step": 7470 + }, + { + "epoch": 0.2191459752435362, + "grad_norm": 15.634493827819824, + "learning_rate": 9.951547019304679e-06, + "loss": 4.522, + "step": 7480 + }, + { + "epoch": 0.2194389511462682, + "grad_norm": 13.974465370178223, + "learning_rate": 9.951327136787721e-06, + "loss": 4.5001, + "step": 7490 + }, + { + "epoch": 0.21973192704900021, + "grad_norm": 12.787046432495117, + "learning_rate": 9.951106758919855e-06, + "loss": 4.4728, + "step": 7500 + }, + { + "epoch": 0.22002490295173222, + "grad_norm": 15.15350341796875, + "learning_rate": 9.950885885723134e-06, + "loss": 4.5167, + "step": 7510 + }, + { + "epoch": 0.22031787885446422, + "grad_norm": 16.609865188598633, + "learning_rate": 9.950664517219654e-06, + "loss": 4.5097, + "step": 7520 + }, + { + "epoch": 0.22061085475719622, + "grad_norm": 14.036136627197266, + "learning_rate": 9.950442653431558e-06, + "loss": 4.5314, + "step": 7530 + }, + { + "epoch": 0.22090383065992822, + "grad_norm": 15.331719398498535, + "learning_rate": 9.950220294381046e-06, + "loss": 4.5021, + "step": 7540 + }, + { + "epoch": 0.22119680656266022, + "grad_norm": 15.032087326049805, + "learning_rate": 9.949997440090362e-06, + "loss": 4.5274, + "step": 7550 + }, + { + "epoch": 0.22148978246539222, + "grad_norm": 13.857993125915527, + "learning_rate": 9.949774090581801e-06, + "loss": 4.5249, + "step": 7560 + }, + { + "epoch": 0.22178275836812422, + "grad_norm": 14.966826438903809, + "learning_rate": 9.949550245877708e-06, + "loss": 4.4948, + "step": 7570 + }, + { + "epoch": 0.22207573427085622, + "grad_norm": 15.412687301635742, + "learning_rate": 9.949325906000477e-06, + "loss": 4.5086, + "step": 7580 + }, + { + "epoch": 0.22236871017358822, + "grad_norm": 14.776711463928223, + "learning_rate": 9.949101070972556e-06, + "loss": 4.5146, + "step": 7590 + }, + { + "epoch": 0.22266168607632023, + "grad_norm": 13.505840301513672, + "learning_rate": 9.94887574081643e-06, + 
"loss": 4.5101, + "step": 7600 + }, + { + "epoch": 0.22295466197905223, + "grad_norm": 13.691288948059082, + "learning_rate": 9.94864991555465e-06, + "loss": 4.4918, + "step": 7610 + }, + { + "epoch": 0.22324763788178423, + "grad_norm": 15.487963676452637, + "learning_rate": 9.948423595209804e-06, + "loss": 4.498, + "step": 7620 + }, + { + "epoch": 0.22354061378451623, + "grad_norm": 16.36821746826172, + "learning_rate": 9.948196779804535e-06, + "loss": 4.4719, + "step": 7630 + }, + { + "epoch": 0.22383358968724823, + "grad_norm": 15.530467987060547, + "learning_rate": 9.947969469361536e-06, + "loss": 4.5146, + "step": 7640 + }, + { + "epoch": 0.22412656558998023, + "grad_norm": 12.983384132385254, + "learning_rate": 9.947741663903547e-06, + "loss": 4.4934, + "step": 7650 + }, + { + "epoch": 0.22441954149271223, + "grad_norm": 14.922785758972168, + "learning_rate": 9.947513363453359e-06, + "loss": 4.4865, + "step": 7660 + }, + { + "epoch": 0.22471251739544423, + "grad_norm": 13.989174842834473, + "learning_rate": 9.94728456803381e-06, + "loss": 4.515, + "step": 7670 + }, + { + "epoch": 0.22500549329817623, + "grad_norm": 15.519174575805664, + "learning_rate": 9.94705527766779e-06, + "loss": 4.5111, + "step": 7680 + }, + { + "epoch": 0.22500549329817623, + "eval_bleu": 0.29165363253981047, + "eval_cap_loss": 1.108701467514038, + "eval_con_loss": 1.8256480693817139, + "eval_loss": 4.759997367858887, + "step": 7680 + }, + { + "epoch": 0.22500549329817623, + "eval_bleu": 0.29165363253981047, + "eval_cap_loss": 1.108701467514038, + "eval_con_loss": 1.8256480693817139, + "eval_loss": 4.759997367858887, + "eval_runtime": 62.494, + "eval_samples_per_second": 320.031, + "eval_steps_per_second": 0.32, + "step": 7680 + }, + { + "epoch": 0.22529846920090824, + "grad_norm": 15.148163795471191, + "learning_rate": 9.94682549237824e-06, + "loss": 4.484, + "step": 7690 + }, + { + "epoch": 0.22559144510364024, + "grad_norm": 13.415874481201172, + "learning_rate": 
9.94659521218815e-06, + "loss": 4.4837, + "step": 7700 + }, + { + "epoch": 0.22588442100637224, + "grad_norm": 15.084836959838867, + "learning_rate": 9.946364437120556e-06, + "loss": 4.4937, + "step": 7710 + }, + { + "epoch": 0.22617739690910424, + "grad_norm": 13.113850593566895, + "learning_rate": 9.946133167198545e-06, + "loss": 4.4731, + "step": 7720 + }, + { + "epoch": 0.22647037281183624, + "grad_norm": 14.756386756896973, + "learning_rate": 9.945901402445255e-06, + "loss": 4.4692, + "step": 7730 + }, + { + "epoch": 0.2267633487145682, + "grad_norm": 13.989232063293457, + "learning_rate": 9.945669142883873e-06, + "loss": 4.5087, + "step": 7740 + }, + { + "epoch": 0.22705632461730021, + "grad_norm": 15.621318817138672, + "learning_rate": 9.945436388537635e-06, + "loss": 4.5025, + "step": 7750 + }, + { + "epoch": 0.22734930052003222, + "grad_norm": 14.540444374084473, + "learning_rate": 9.945203139429825e-06, + "loss": 4.5089, + "step": 7760 + }, + { + "epoch": 0.22764227642276422, + "grad_norm": 15.084650993347168, + "learning_rate": 9.94496939558378e-06, + "loss": 4.5023, + "step": 7770 + }, + { + "epoch": 0.22793525232549622, + "grad_norm": 15.844429969787598, + "learning_rate": 9.944735157022887e-06, + "loss": 4.4712, + "step": 7780 + }, + { + "epoch": 0.22822822822822822, + "grad_norm": 13.812984466552734, + "learning_rate": 9.944500423770576e-06, + "loss": 4.5083, + "step": 7790 + }, + { + "epoch": 0.22852120413096022, + "grad_norm": 15.112933158874512, + "learning_rate": 9.944265195850333e-06, + "loss": 4.4652, + "step": 7800 + }, + { + "epoch": 0.22881418003369222, + "grad_norm": 15.683182716369629, + "learning_rate": 9.94402947328569e-06, + "loss": 4.4808, + "step": 7810 + }, + { + "epoch": 0.22910715593642422, + "grad_norm": 15.049676895141602, + "learning_rate": 9.943793256100229e-06, + "loss": 4.4785, + "step": 7820 + }, + { + "epoch": 0.22940013183915622, + "grad_norm": 13.460580825805664, + "learning_rate": 9.943556544317586e-06, + "loss": 4.4542, 
+ "step": 7830 + }, + { + "epoch": 0.22969310774188822, + "grad_norm": 14.663485527038574, + "learning_rate": 9.943319337961437e-06, + "loss": 4.4761, + "step": 7840 + }, + { + "epoch": 0.22998608364462023, + "grad_norm": 16.721511840820312, + "learning_rate": 9.943081637055517e-06, + "loss": 4.4587, + "step": 7850 + }, + { + "epoch": 0.23027905954735223, + "grad_norm": 14.126707077026367, + "learning_rate": 9.942843441623607e-06, + "loss": 4.4415, + "step": 7860 + }, + { + "epoch": 0.23057203545008423, + "grad_norm": 14.20432186126709, + "learning_rate": 9.942604751689533e-06, + "loss": 4.489, + "step": 7870 + }, + { + "epoch": 0.23086501135281623, + "grad_norm": 13.60831069946289, + "learning_rate": 9.942365567277178e-06, + "loss": 4.4721, + "step": 7880 + }, + { + "epoch": 0.23115798725554823, + "grad_norm": 13.4131498336792, + "learning_rate": 9.94212588841047e-06, + "loss": 4.4893, + "step": 7890 + }, + { + "epoch": 0.23145096315828023, + "grad_norm": 14.092516899108887, + "learning_rate": 9.94188571511339e-06, + "loss": 4.471, + "step": 7900 + }, + { + "epoch": 0.23174393906101223, + "grad_norm": 15.796567916870117, + "learning_rate": 9.941645047409958e-06, + "loss": 4.4689, + "step": 7910 + }, + { + "epoch": 0.23203691496374423, + "grad_norm": 13.416455268859863, + "learning_rate": 9.941403885324262e-06, + "loss": 4.4871, + "step": 7920 + }, + { + "epoch": 0.23232989086647624, + "grad_norm": 15.874873161315918, + "learning_rate": 9.941162228880422e-06, + "loss": 4.4482, + "step": 7930 + }, + { + "epoch": 0.23262286676920824, + "grad_norm": 13.782623291015625, + "learning_rate": 9.940920078102614e-06, + "loss": 4.4706, + "step": 7940 + }, + { + "epoch": 0.23291584267194024, + "grad_norm": 14.142080307006836, + "learning_rate": 9.940677433015067e-06, + "loss": 4.4514, + "step": 7950 + }, + { + "epoch": 0.23320881857467224, + "grad_norm": 15.196389198303223, + "learning_rate": 9.940434293642056e-06, + "loss": 4.4447, + "step": 7960 + }, + { + "epoch": 
0.23350179447740424, + "grad_norm": 16.045225143432617, + "learning_rate": 9.940190660007903e-06, + "loss": 4.4732, + "step": 7970 + }, + { + "epoch": 0.23379477038013624, + "grad_norm": 15.268783569335938, + "learning_rate": 9.939946532136984e-06, + "loss": 4.4728, + "step": 7980 + }, + { + "epoch": 0.23408774628286824, + "grad_norm": 15.598456382751465, + "learning_rate": 9.939701910053723e-06, + "loss": 4.4549, + "step": 7990 + }, + { + "epoch": 0.23438072218560024, + "grad_norm": 14.332108497619629, + "learning_rate": 9.939456793782591e-06, + "loss": 4.4837, + "step": 8000 + }, + { + "epoch": 0.23467369808833224, + "grad_norm": 14.102240562438965, + "learning_rate": 9.939211183348111e-06, + "loss": 4.4859, + "step": 8010 + }, + { + "epoch": 0.23496667399106425, + "grad_norm": 14.677070617675781, + "learning_rate": 9.938965078774859e-06, + "loss": 4.4704, + "step": 8020 + }, + { + "epoch": 0.23525964989379625, + "grad_norm": 14.067479133605957, + "learning_rate": 9.938718480087449e-06, + "loss": 4.4267, + "step": 8030 + }, + { + "epoch": 0.23555262579652825, + "grad_norm": 15.1693696975708, + "learning_rate": 9.938471387310557e-06, + "loss": 4.4702, + "step": 8040 + }, + { + "epoch": 0.23584560169926025, + "grad_norm": 15.800459861755371, + "learning_rate": 9.9382238004689e-06, + "loss": 4.4692, + "step": 8050 + }, + { + "epoch": 0.23613857760199225, + "grad_norm": 15.882933616638184, + "learning_rate": 9.937975719587252e-06, + "loss": 4.4169, + "step": 8060 + }, + { + "epoch": 0.23643155350472422, + "grad_norm": 14.901392936706543, + "learning_rate": 9.937727144690428e-06, + "loss": 4.4706, + "step": 8070 + }, + { + "epoch": 0.23672452940745622, + "grad_norm": 15.296814918518066, + "learning_rate": 9.937478075803297e-06, + "loss": 4.4646, + "step": 8080 + }, + { + "epoch": 0.23701750531018823, + "grad_norm": 15.307802200317383, + "learning_rate": 9.937228512950778e-06, + "loss": 4.4298, + "step": 8090 + }, + { + "epoch": 0.23731048121292023, + "grad_norm": 
14.338201522827148, + "learning_rate": 9.936978456157838e-06, + "loss": 4.4661, + "step": 8100 + }, + { + "epoch": 0.23760345711565223, + "grad_norm": 15.279305458068848, + "learning_rate": 9.936727905449494e-06, + "loss": 4.4494, + "step": 8110 + }, + { + "epoch": 0.23789643301838423, + "grad_norm": 14.621094703674316, + "learning_rate": 9.93647686085081e-06, + "loss": 4.4239, + "step": 8120 + }, + { + "epoch": 0.23818940892111623, + "grad_norm": 14.962873458862305, + "learning_rate": 9.936225322386906e-06, + "loss": 4.4237, + "step": 8130 + }, + { + "epoch": 0.23848238482384823, + "grad_norm": 15.660168647766113, + "learning_rate": 9.935973290082942e-06, + "loss": 4.4262, + "step": 8140 + }, + { + "epoch": 0.23877536072658023, + "grad_norm": 14.76259708404541, + "learning_rate": 9.935720763964133e-06, + "loss": 4.4484, + "step": 8150 + }, + { + "epoch": 0.23906833662931223, + "grad_norm": 12.987884521484375, + "learning_rate": 9.935467744055748e-06, + "loss": 4.4465, + "step": 8160 + }, + { + "epoch": 0.23936131253204423, + "grad_norm": 13.972810745239258, + "learning_rate": 9.935214230383095e-06, + "loss": 4.426, + "step": 8170 + }, + { + "epoch": 0.23965428843477624, + "grad_norm": 14.446979522705078, + "learning_rate": 9.934960222971536e-06, + "loss": 4.4401, + "step": 8180 + }, + { + "epoch": 0.23994726433750824, + "grad_norm": 13.056072235107422, + "learning_rate": 9.934705721846487e-06, + "loss": 4.422, + "step": 8190 + }, + { + "epoch": 0.24000585951805464, + "eval_bleu": 0.29029961141051785, + "eval_cap_loss": 1.1036983728408813, + "eval_con_loss": 1.7988369464874268, + "eval_loss": 4.701372146606445, + "step": 8192 + }, + { + "epoch": 0.24000585951805464, + "eval_bleu": 0.29029961141051785, + "eval_cap_loss": 1.1036983728408813, + "eval_con_loss": 1.7988369464874268, + "eval_loss": 4.701372146606445, + "eval_runtime": 56.8234, + "eval_samples_per_second": 351.968, + "eval_steps_per_second": 0.352, + "step": 8192 + }, + { + "epoch": 0.24024024024024024, + 
"grad_norm": 13.84671688079834, + "learning_rate": 9.934450727033406e-06, + "loss": 4.4299, + "step": 8200 + }, + { + "epoch": 0.24053321614297224, + "grad_norm": 14.655871391296387, + "learning_rate": 9.934195238557805e-06, + "loss": 4.4375, + "step": 8210 + }, + { + "epoch": 0.24082619204570424, + "grad_norm": 13.949317932128906, + "learning_rate": 9.933939256445244e-06, + "loss": 4.434, + "step": 8220 + }, + { + "epoch": 0.24111916794843624, + "grad_norm": 14.066313743591309, + "learning_rate": 9.933682780721333e-06, + "loss": 4.4214, + "step": 8230 + }, + { + "epoch": 0.24141214385116824, + "grad_norm": 14.369131088256836, + "learning_rate": 9.93342581141173e-06, + "loss": 4.433, + "step": 8240 + }, + { + "epoch": 0.24170511975390024, + "grad_norm": 14.582708358764648, + "learning_rate": 9.933168348542144e-06, + "loss": 4.4379, + "step": 8250 + }, + { + "epoch": 0.24199809565663224, + "grad_norm": 14.857731819152832, + "learning_rate": 9.932910392138331e-06, + "loss": 4.4342, + "step": 8260 + }, + { + "epoch": 0.24229107155936425, + "grad_norm": 14.258744239807129, + "learning_rate": 9.9326519422261e-06, + "loss": 4.4182, + "step": 8270 + }, + { + "epoch": 0.24258404746209625, + "grad_norm": 14.235397338867188, + "learning_rate": 9.932392998831305e-06, + "loss": 4.3933, + "step": 8280 + }, + { + "epoch": 0.24287702336482825, + "grad_norm": 14.630913734436035, + "learning_rate": 9.932133561979858e-06, + "loss": 4.4066, + "step": 8290 + }, + { + "epoch": 0.24316999926756025, + "grad_norm": 14.415507316589355, + "learning_rate": 9.931873631697706e-06, + "loss": 4.3799, + "step": 8300 + }, + { + "epoch": 0.24346297517029225, + "grad_norm": 13.807605743408203, + "learning_rate": 9.931613208010857e-06, + "loss": 4.4139, + "step": 8310 + }, + { + "epoch": 0.24375595107302425, + "grad_norm": 15.441695213317871, + "learning_rate": 9.931352290945365e-06, + "loss": 4.3669, + "step": 8320 + }, + { + "epoch": 0.24404892697575625, + "grad_norm": 13.893861770629883, + 
"learning_rate": 9.931090880527332e-06, + "loss": 4.4301, + "step": 8330 + }, + { + "epoch": 0.24434190287848825, + "grad_norm": 14.244457244873047, + "learning_rate": 9.930828976782914e-06, + "loss": 4.4519, + "step": 8340 + }, + { + "epoch": 0.24463487878122026, + "grad_norm": 15.025555610656738, + "learning_rate": 9.930566579738309e-06, + "loss": 4.3991, + "step": 8350 + }, + { + "epoch": 0.24492785468395226, + "grad_norm": 13.174605369567871, + "learning_rate": 9.930303689419769e-06, + "loss": 4.4271, + "step": 8360 + }, + { + "epoch": 0.24522083058668426, + "grad_norm": 13.303424835205078, + "learning_rate": 9.930040305853596e-06, + "loss": 4.4109, + "step": 8370 + }, + { + "epoch": 0.24551380648941626, + "grad_norm": 14.110280990600586, + "learning_rate": 9.92977642906614e-06, + "loss": 4.3798, + "step": 8380 + }, + { + "epoch": 0.24580678239214823, + "grad_norm": 14.607367515563965, + "learning_rate": 9.9295120590838e-06, + "loss": 4.4208, + "step": 8390 + }, + { + "epoch": 0.24609975829488023, + "grad_norm": 14.500240325927734, + "learning_rate": 9.929247195933022e-06, + "loss": 4.3729, + "step": 8400 + }, + { + "epoch": 0.24639273419761223, + "grad_norm": 14.793094635009766, + "learning_rate": 9.928981839640309e-06, + "loss": 4.4051, + "step": 8410 + }, + { + "epoch": 0.24668571010034424, + "grad_norm": 13.748632431030273, + "learning_rate": 9.928715990232205e-06, + "loss": 4.4136, + "step": 8420 + }, + { + "epoch": 0.24697868600307624, + "grad_norm": 14.084193229675293, + "learning_rate": 9.928449647735306e-06, + "loss": 4.4092, + "step": 8430 + }, + { + "epoch": 0.24727166190580824, + "grad_norm": 15.121562004089355, + "learning_rate": 9.92818281217626e-06, + "loss": 4.4242, + "step": 8440 + }, + { + "epoch": 0.24756463780854024, + "grad_norm": 14.05024242401123, + "learning_rate": 9.927915483581762e-06, + "loss": 4.4163, + "step": 8450 + }, + { + "epoch": 0.24785761371127224, + "grad_norm": 15.426456451416016, + "learning_rate": 9.927647661978556e-06, + 
"loss": 4.4029, + "step": 8460 + }, + { + "epoch": 0.24815058961400424, + "grad_norm": 14.473140716552734, + "learning_rate": 9.927379347393435e-06, + "loss": 4.3929, + "step": 8470 + }, + { + "epoch": 0.24844356551673624, + "grad_norm": 14.672041893005371, + "learning_rate": 9.927110539853245e-06, + "loss": 4.4085, + "step": 8480 + }, + { + "epoch": 0.24873654141946824, + "grad_norm": 12.812533378601074, + "learning_rate": 9.926841239384875e-06, + "loss": 4.4149, + "step": 8490 + }, + { + "epoch": 0.24902951732220024, + "grad_norm": 12.583483695983887, + "learning_rate": 9.926571446015271e-06, + "loss": 4.378, + "step": 8500 + }, + { + "epoch": 0.24932249322493225, + "grad_norm": 14.042401313781738, + "learning_rate": 9.926301159771422e-06, + "loss": 4.3868, + "step": 8510 + }, + { + "epoch": 0.24961546912766425, + "grad_norm": 14.931730270385742, + "learning_rate": 9.926030380680367e-06, + "loss": 4.3742, + "step": 8520 + }, + { + "epoch": 0.24990844503039625, + "grad_norm": 15.099139213562012, + "learning_rate": 9.9257591087692e-06, + "loss": 4.3868, + "step": 8530 + }, + { + "epoch": 0.25020142093312825, + "grad_norm": 14.3392915725708, + "learning_rate": 9.925487344065054e-06, + "loss": 4.4299, + "step": 8540 + }, + { + "epoch": 0.2504943968358603, + "grad_norm": 13.977326393127441, + "learning_rate": 9.925215086595125e-06, + "loss": 4.3804, + "step": 8550 + }, + { + "epoch": 0.25078737273859225, + "grad_norm": 13.092670440673828, + "learning_rate": 9.924942336386645e-06, + "loss": 4.4004, + "step": 8560 + }, + { + "epoch": 0.2510803486413242, + "grad_norm": 13.439717292785645, + "learning_rate": 9.924669093466903e-06, + "loss": 4.3761, + "step": 8570 + }, + { + "epoch": 0.25137332454405625, + "grad_norm": 16.074674606323242, + "learning_rate": 9.924395357863237e-06, + "loss": 4.4061, + "step": 8580 + }, + { + "epoch": 0.2516663004467882, + "grad_norm": 14.22133731842041, + "learning_rate": 9.924121129603028e-06, + "loss": 4.3814, + "step": 8590 + }, + { + 
"epoch": 0.25195927634952026, + "grad_norm": 15.63421630859375, + "learning_rate": 9.923846408713715e-06, + "loss": 4.4249, + "step": 8600 + }, + { + "epoch": 0.25225225225225223, + "grad_norm": 14.840424537658691, + "learning_rate": 9.923571195222781e-06, + "loss": 4.368, + "step": 8610 + }, + { + "epoch": 0.25254522815498426, + "grad_norm": 15.775569915771484, + "learning_rate": 9.923295489157761e-06, + "loss": 4.4154, + "step": 8620 + }, + { + "epoch": 0.25283820405771623, + "grad_norm": 14.03663158416748, + "learning_rate": 9.923019290546235e-06, + "loss": 4.3954, + "step": 8630 + }, + { + "epoch": 0.25313117996044826, + "grad_norm": 14.050981521606445, + "learning_rate": 9.922742599415836e-06, + "loss": 4.3887, + "step": 8640 + }, + { + "epoch": 0.25342415586318023, + "grad_norm": 13.471781730651855, + "learning_rate": 9.922465415794245e-06, + "loss": 4.3704, + "step": 8650 + }, + { + "epoch": 0.25371713176591226, + "grad_norm": 13.828595161437988, + "learning_rate": 9.922187739709195e-06, + "loss": 4.3865, + "step": 8660 + }, + { + "epoch": 0.25401010766864424, + "grad_norm": 14.789974212646484, + "learning_rate": 9.921909571188462e-06, + "loss": 4.374, + "step": 8670 + }, + { + "epoch": 0.25430308357137626, + "grad_norm": 15.540875434875488, + "learning_rate": 9.921630910259878e-06, + "loss": 4.394, + "step": 8680 + }, + { + "epoch": 0.25459605947410824, + "grad_norm": 14.173938751220703, + "learning_rate": 9.92135175695132e-06, + "loss": 4.3685, + "step": 8690 + }, + { + "epoch": 0.25488903537684027, + "grad_norm": 12.630620956420898, + "learning_rate": 9.921072111290718e-06, + "loss": 4.3728, + "step": 8700 + }, + { + "epoch": 0.2550062257379331, + "eval_bleu": 0.2933061574841199, + "eval_cap_loss": 1.0996713638305664, + "eval_con_loss": 1.7639787197113037, + "eval_loss": 4.627628803253174, + "step": 8704 + }, + { + "epoch": 0.2550062257379331, + "eval_bleu": 0.2933061574841199, + "eval_cap_loss": 1.0996713638305664, + "eval_con_loss": 1.7639787197113037, 
+ "eval_loss": 4.627628803253174, + "eval_runtime": 54.9973, + "eval_samples_per_second": 363.654, + "eval_steps_per_second": 0.364, + "step": 8704 + }, + { + "epoch": 0.25518201127957224, + "grad_norm": 14.249696731567383, + "learning_rate": 9.920791973306046e-06, + "loss": 4.382, + "step": 8710 + }, + { + "epoch": 0.25547498718230427, + "grad_norm": 15.505410194396973, + "learning_rate": 9.920511343025329e-06, + "loss": 4.3581, + "step": 8720 + }, + { + "epoch": 0.25576796308503624, + "grad_norm": 13.93155288696289, + "learning_rate": 9.920230220476646e-06, + "loss": 4.412, + "step": 8730 + }, + { + "epoch": 0.25606093898776827, + "grad_norm": 13.193828582763672, + "learning_rate": 9.919948605688122e-06, + "loss": 4.3406, + "step": 8740 + }, + { + "epoch": 0.25635391489050025, + "grad_norm": 12.611982345581055, + "learning_rate": 9.919666498687925e-06, + "loss": 4.3431, + "step": 8750 + }, + { + "epoch": 0.2566468907932323, + "grad_norm": 13.342901229858398, + "learning_rate": 9.919383899504286e-06, + "loss": 4.3846, + "step": 8760 + }, + { + "epoch": 0.25693986669596425, + "grad_norm": 16.100717544555664, + "learning_rate": 9.919100808165471e-06, + "loss": 4.3677, + "step": 8770 + }, + { + "epoch": 0.2572328425986963, + "grad_norm": 14.11196231842041, + "learning_rate": 9.918817224699806e-06, + "loss": 4.4016, + "step": 8780 + }, + { + "epoch": 0.25752581850142825, + "grad_norm": 13.459188461303711, + "learning_rate": 9.91853314913566e-06, + "loss": 4.3767, + "step": 8790 + }, + { + "epoch": 0.2578187944041603, + "grad_norm": 13.886845588684082, + "learning_rate": 9.918248581501451e-06, + "loss": 4.4058, + "step": 8800 + }, + { + "epoch": 0.25811177030689225, + "grad_norm": 14.893227577209473, + "learning_rate": 9.917963521825653e-06, + "loss": 4.3645, + "step": 8810 + }, + { + "epoch": 0.2584047462096243, + "grad_norm": 14.968053817749023, + "learning_rate": 9.91767797013678e-06, + "loss": 4.3253, + "step": 8820 + }, + { + "epoch": 0.25869772211235625, + 
"grad_norm": 13.895403861999512, + "learning_rate": 9.917391926463402e-06, + "loss": 4.3494, + "step": 8830 + }, + { + "epoch": 0.2589906980150883, + "grad_norm": 14.576944351196289, + "learning_rate": 9.917105390834137e-06, + "loss": 4.3432, + "step": 8840 + }, + { + "epoch": 0.25928367391782026, + "grad_norm": 13.501432418823242, + "learning_rate": 9.91681836327765e-06, + "loss": 4.38, + "step": 8850 + }, + { + "epoch": 0.2595766498205523, + "grad_norm": 16.21778106689453, + "learning_rate": 9.916530843822655e-06, + "loss": 4.3295, + "step": 8860 + }, + { + "epoch": 0.25986962572328426, + "grad_norm": 14.649847984313965, + "learning_rate": 9.916242832497918e-06, + "loss": 4.3269, + "step": 8870 + }, + { + "epoch": 0.2601626016260163, + "grad_norm": 16.2540283203125, + "learning_rate": 9.915954329332251e-06, + "loss": 4.3724, + "step": 8880 + }, + { + "epoch": 0.26045557752874826, + "grad_norm": 14.151905059814453, + "learning_rate": 9.915665334354521e-06, + "loss": 4.3574, + "step": 8890 + }, + { + "epoch": 0.26074855343148023, + "grad_norm": 14.146515846252441, + "learning_rate": 9.915375847593637e-06, + "loss": 4.3558, + "step": 8900 + }, + { + "epoch": 0.26104152933421226, + "grad_norm": 13.869664192199707, + "learning_rate": 9.915085869078562e-06, + "loss": 4.3178, + "step": 8910 + }, + { + "epoch": 0.26133450523694424, + "grad_norm": 15.299714088439941, + "learning_rate": 9.914795398838306e-06, + "loss": 4.3828, + "step": 8920 + }, + { + "epoch": 0.26162748113967627, + "grad_norm": 15.967257499694824, + "learning_rate": 9.914504436901928e-06, + "loss": 4.3471, + "step": 8930 + }, + { + "epoch": 0.26192045704240824, + "grad_norm": 15.545003890991211, + "learning_rate": 9.91421298329854e-06, + "loss": 4.3832, + "step": 8940 + }, + { + "epoch": 0.26221343294514027, + "grad_norm": 13.543991088867188, + "learning_rate": 9.913921038057295e-06, + "loss": 4.3581, + "step": 8950 + }, + { + "epoch": 0.26250640884787224, + "grad_norm": 14.917508125305176, + 
"learning_rate": 9.913628601207405e-06, + "loss": 4.3636, + "step": 8960 + }, + { + "epoch": 0.26279938475060427, + "grad_norm": 15.01425838470459, + "learning_rate": 9.913335672778125e-06, + "loss": 4.3555, + "step": 8970 + }, + { + "epoch": 0.26309236065333624, + "grad_norm": 14.739155769348145, + "learning_rate": 9.91304225279876e-06, + "loss": 4.3488, + "step": 8980 + }, + { + "epoch": 0.2633853365560683, + "grad_norm": 16.197938919067383, + "learning_rate": 9.912748341298667e-06, + "loss": 4.3235, + "step": 8990 + }, + { + "epoch": 0.26367831245880025, + "grad_norm": 13.524062156677246, + "learning_rate": 9.912453938307248e-06, + "loss": 4.3771, + "step": 9000 + }, + { + "epoch": 0.2639712883615323, + "grad_norm": 14.033989906311035, + "learning_rate": 9.912159043853958e-06, + "loss": 4.3636, + "step": 9010 + }, + { + "epoch": 0.26426426426426425, + "grad_norm": 15.918126106262207, + "learning_rate": 9.911863657968296e-06, + "loss": 4.3542, + "step": 9020 + }, + { + "epoch": 0.2645572401669963, + "grad_norm": 14.741896629333496, + "learning_rate": 9.911567780679818e-06, + "loss": 4.3366, + "step": 9030 + }, + { + "epoch": 0.26485021606972825, + "grad_norm": 14.873620986938477, + "learning_rate": 9.911271412018123e-06, + "loss": 4.3503, + "step": 9040 + }, + { + "epoch": 0.2651431919724603, + "grad_norm": 14.796981811523438, + "learning_rate": 9.91097455201286e-06, + "loss": 4.353, + "step": 9050 + }, + { + "epoch": 0.26543616787519225, + "grad_norm": 13.028640747070312, + "learning_rate": 9.91067720069373e-06, + "loss": 4.348, + "step": 9060 + }, + { + "epoch": 0.2657291437779243, + "grad_norm": 15.33365535736084, + "learning_rate": 9.91037935809048e-06, + "loss": 4.3391, + "step": 9070 + }, + { + "epoch": 0.26602211968065625, + "grad_norm": 14.86514663696289, + "learning_rate": 9.910081024232908e-06, + "loss": 4.3311, + "step": 9080 + }, + { + "epoch": 0.2663150955833883, + "grad_norm": 11.930192947387695, + "learning_rate": 9.909782199150856e-06, + "loss": 
4.3313, + "step": 9090 + }, + { + "epoch": 0.26660807148612026, + "grad_norm": 12.77729606628418, + "learning_rate": 9.909482882874228e-06, + "loss": 4.3414, + "step": 9100 + }, + { + "epoch": 0.2669010473888523, + "grad_norm": 14.850422859191895, + "learning_rate": 9.909183075432965e-06, + "loss": 4.3672, + "step": 9110 + }, + { + "epoch": 0.26719402329158426, + "grad_norm": 13.311178207397461, + "learning_rate": 9.908882776857057e-06, + "loss": 4.348, + "step": 9120 + }, + { + "epoch": 0.2674869991943163, + "grad_norm": 14.393486022949219, + "learning_rate": 9.908581987176552e-06, + "loss": 4.3428, + "step": 9130 + }, + { + "epoch": 0.26777997509704826, + "grad_norm": 14.444518089294434, + "learning_rate": 9.908280706421543e-06, + "loss": 4.3487, + "step": 9140 + }, + { + "epoch": 0.2680729509997803, + "grad_norm": 13.894659042358398, + "learning_rate": 9.907978934622167e-06, + "loss": 4.335, + "step": 9150 + }, + { + "epoch": 0.26836592690251226, + "grad_norm": 16.001312255859375, + "learning_rate": 9.907676671808616e-06, + "loss": 4.3419, + "step": 9160 + }, + { + "epoch": 0.2686589028052443, + "grad_norm": 13.498442649841309, + "learning_rate": 9.907373918011132e-06, + "loss": 4.3344, + "step": 9170 + }, + { + "epoch": 0.26895187870797627, + "grad_norm": 14.883193016052246, + "learning_rate": 9.90707067326e-06, + "loss": 4.3263, + "step": 9180 + }, + { + "epoch": 0.2692448546107083, + "grad_norm": 14.060989379882812, + "learning_rate": 9.906766937585562e-06, + "loss": 4.3475, + "step": 9190 + }, + { + "epoch": 0.26953783051344027, + "grad_norm": 15.321045875549316, + "learning_rate": 9.906462711018201e-06, + "loss": 4.3214, + "step": 9200 + }, + { + "epoch": 0.2698308064161723, + "grad_norm": 14.382736206054688, + "learning_rate": 9.906157993588356e-06, + "loss": 4.3444, + "step": 9210 + }, + { + "epoch": 0.27000659195781146, + "eval_bleu": 0.29397792113243565, + "eval_cap_loss": 1.0916802883148193, + "eval_con_loss": 1.7469627857208252, + "eval_loss": 
4.585606098175049, + "step": 9216 + }, + { + "epoch": 0.27000659195781146, + "eval_bleu": 0.29397792113243565, + "eval_cap_loss": 1.0916802883148193, + "eval_con_loss": 1.7469627857208252, + "eval_loss": 4.585606098175049, + "eval_runtime": 52.8958, + "eval_samples_per_second": 378.102, + "eval_steps_per_second": 0.378, + "step": 9216 + }, + { + "epoch": 0.27012378231890427, + "grad_norm": 15.555707931518555, + "learning_rate": 9.90585278532651e-06, + "loss": 4.3285, + "step": 9220 + }, + { + "epoch": 0.27041675822163624, + "grad_norm": 15.94727897644043, + "learning_rate": 9.905547086263198e-06, + "loss": 4.3493, + "step": 9230 + }, + { + "epoch": 0.2707097341243683, + "grad_norm": 14.759271621704102, + "learning_rate": 9.905240896429004e-06, + "loss": 4.3149, + "step": 9240 + }, + { + "epoch": 0.27100271002710025, + "grad_norm": 13.352738380432129, + "learning_rate": 9.904934215854561e-06, + "loss": 4.3235, + "step": 9250 + }, + { + "epoch": 0.2712956859298323, + "grad_norm": 14.228279113769531, + "learning_rate": 9.904627044570549e-06, + "loss": 4.3385, + "step": 9260 + }, + { + "epoch": 0.27158866183256425, + "grad_norm": 13.891241073608398, + "learning_rate": 9.9043193826077e-06, + "loss": 4.3288, + "step": 9270 + }, + { + "epoch": 0.2718816377352963, + "grad_norm": 14.4035005569458, + "learning_rate": 9.904011229996793e-06, + "loss": 4.3337, + "step": 9280 + }, + { + "epoch": 0.27217461363802825, + "grad_norm": 14.615246772766113, + "learning_rate": 9.903702586768656e-06, + "loss": 4.3487, + "step": 9290 + }, + { + "epoch": 0.2724675895407603, + "grad_norm": 13.729811668395996, + "learning_rate": 9.903393452954169e-06, + "loss": 4.3035, + "step": 9300 + }, + { + "epoch": 0.27276056544349225, + "grad_norm": 14.009378433227539, + "learning_rate": 9.903083828584257e-06, + "loss": 4.2972, + "step": 9310 + }, + { + "epoch": 0.2730535413462243, + "grad_norm": 15.556389808654785, + "learning_rate": 9.902773713689897e-06, + "loss": 4.2998, + "step": 9320 + }, + { + 
"epoch": 0.27334651724895626, + "grad_norm": 14.040820121765137, + "learning_rate": 9.902463108302115e-06, + "loss": 4.3073, + "step": 9330 + }, + { + "epoch": 0.2736394931516883, + "grad_norm": 12.42579460144043, + "learning_rate": 9.902152012451983e-06, + "loss": 4.2972, + "step": 9340 + }, + { + "epoch": 0.27393246905442026, + "grad_norm": 13.483555793762207, + "learning_rate": 9.901840426170628e-06, + "loss": 4.3572, + "step": 9350 + }, + { + "epoch": 0.2742254449571523, + "grad_norm": 14.139854431152344, + "learning_rate": 9.901528349489218e-06, + "loss": 4.3301, + "step": 9360 + }, + { + "epoch": 0.27451842085988426, + "grad_norm": 13.920985221862793, + "learning_rate": 9.901215782438976e-06, + "loss": 4.3238, + "step": 9370 + }, + { + "epoch": 0.2748113967626163, + "grad_norm": 13.548871040344238, + "learning_rate": 9.900902725051174e-06, + "loss": 4.2841, + "step": 9380 + }, + { + "epoch": 0.27510437266534826, + "grad_norm": 15.188814163208008, + "learning_rate": 9.90058917735713e-06, + "loss": 4.3233, + "step": 9390 + }, + { + "epoch": 0.2753973485680803, + "grad_norm": 14.42529296875, + "learning_rate": 9.900275139388212e-06, + "loss": 4.3329, + "step": 9400 + }, + { + "epoch": 0.27569032447081226, + "grad_norm": 12.826614379882812, + "learning_rate": 9.899960611175841e-06, + "loss": 4.3121, + "step": 9410 + }, + { + "epoch": 0.2759833003735443, + "grad_norm": 14.142885208129883, + "learning_rate": 9.899645592751479e-06, + "loss": 4.315, + "step": 9420 + }, + { + "epoch": 0.27627627627627627, + "grad_norm": 15.172396659851074, + "learning_rate": 9.899330084146646e-06, + "loss": 4.2997, + "step": 9430 + }, + { + "epoch": 0.2765692521790083, + "grad_norm": 14.049992561340332, + "learning_rate": 9.899014085392903e-06, + "loss": 4.3083, + "step": 9440 + }, + { + "epoch": 0.27686222808174027, + "grad_norm": 13.017592430114746, + "learning_rate": 9.898697596521866e-06, + "loss": 4.2846, + "step": 9450 + }, + { + "epoch": 0.2771552039844723, + "grad_norm": 
14.073290824890137, + "learning_rate": 9.898380617565198e-06, + "loss": 4.3403, + "step": 9460 + }, + { + "epoch": 0.27744817988720427, + "grad_norm": 15.1058349609375, + "learning_rate": 9.898063148554612e-06, + "loss": 4.3237, + "step": 9470 + }, + { + "epoch": 0.2777411557899363, + "grad_norm": 14.08220100402832, + "learning_rate": 9.897745189521863e-06, + "loss": 4.3085, + "step": 9480 + }, + { + "epoch": 0.2780341316926683, + "grad_norm": 13.763900756835938, + "learning_rate": 9.897426740498768e-06, + "loss": 4.268, + "step": 9490 + }, + { + "epoch": 0.2783271075954003, + "grad_norm": 13.0377836227417, + "learning_rate": 9.897107801517183e-06, + "loss": 4.3181, + "step": 9500 + }, + { + "epoch": 0.2786200834981323, + "grad_norm": 13.38405704498291, + "learning_rate": 9.896788372609016e-06, + "loss": 4.3036, + "step": 9510 + }, + { + "epoch": 0.2789130594008643, + "grad_norm": 13.501379013061523, + "learning_rate": 9.896468453806223e-06, + "loss": 4.3443, + "step": 9520 + }, + { + "epoch": 0.2792060353035963, + "grad_norm": 12.542980194091797, + "learning_rate": 9.896148045140811e-06, + "loss": 4.2636, + "step": 9530 + }, + { + "epoch": 0.2794990112063283, + "grad_norm": 14.669488906860352, + "learning_rate": 9.895827146644837e-06, + "loss": 4.3224, + "step": 9540 + }, + { + "epoch": 0.2797919871090603, + "grad_norm": 13.875134468078613, + "learning_rate": 9.895505758350401e-06, + "loss": 4.2886, + "step": 9550 + }, + { + "epoch": 0.28008496301179225, + "grad_norm": 13.738343238830566, + "learning_rate": 9.895183880289658e-06, + "loss": 4.2874, + "step": 9560 + }, + { + "epoch": 0.2803779389145243, + "grad_norm": 15.915310859680176, + "learning_rate": 9.89486151249481e-06, + "loss": 4.3092, + "step": 9570 + }, + { + "epoch": 0.28067091481725626, + "grad_norm": 14.337254524230957, + "learning_rate": 9.894538654998109e-06, + "loss": 4.315, + "step": 9580 + }, + { + "epoch": 0.2809638907199883, + "grad_norm": 13.73699951171875, + "learning_rate": 
9.894215307831851e-06, + "loss": 4.2737, + "step": 9590 + }, + { + "epoch": 0.28125686662272026, + "grad_norm": 12.997262954711914, + "learning_rate": 9.893891471028392e-06, + "loss": 4.2865, + "step": 9600 + }, + { + "epoch": 0.2815498425254523, + "grad_norm": 14.547399520874023, + "learning_rate": 9.893567144620123e-06, + "loss": 4.2793, + "step": 9610 + }, + { + "epoch": 0.28184281842818426, + "grad_norm": 14.841904640197754, + "learning_rate": 9.893242328639494e-06, + "loss": 4.285, + "step": 9620 + }, + { + "epoch": 0.2821357943309163, + "grad_norm": 14.021801948547363, + "learning_rate": 9.892917023119002e-06, + "loss": 4.2774, + "step": 9630 + }, + { + "epoch": 0.28242877023364826, + "grad_norm": 13.662957191467285, + "learning_rate": 9.892591228091188e-06, + "loss": 4.2732, + "step": 9640 + }, + { + "epoch": 0.2827217461363803, + "grad_norm": 14.173666954040527, + "learning_rate": 9.89226494358865e-06, + "loss": 4.2545, + "step": 9650 + }, + { + "epoch": 0.28301472203911227, + "grad_norm": 14.063300132751465, + "learning_rate": 9.891938169644028e-06, + "loss": 4.2874, + "step": 9660 + }, + { + "epoch": 0.2833076979418443, + "grad_norm": 14.732041358947754, + "learning_rate": 9.891610906290016e-06, + "loss": 4.3185, + "step": 9670 + }, + { + "epoch": 0.28360067384457627, + "grad_norm": 15.460115432739258, + "learning_rate": 9.891283153559355e-06, + "loss": 4.2985, + "step": 9680 + }, + { + "epoch": 0.2838936497473083, + "grad_norm": 13.477441787719727, + "learning_rate": 9.890954911484833e-06, + "loss": 4.2896, + "step": 9690 + }, + { + "epoch": 0.28418662565004027, + "grad_norm": 13.448600769042969, + "learning_rate": 9.890626180099287e-06, + "loss": 4.276, + "step": 9700 + }, + { + "epoch": 0.2844796015527723, + "grad_norm": 15.115647315979004, + "learning_rate": 9.890296959435608e-06, + "loss": 4.2927, + "step": 9710 + }, + { + "epoch": 0.28477257745550427, + "grad_norm": 14.486165046691895, + "learning_rate": 9.889967249526733e-06, + "loss": 4.294, + 
"step": 9720 + }, + { + "epoch": 0.2850069581776899, + "eval_bleu": 0.2967971170918398, + "eval_cap_loss": 1.085601568222046, + "eval_con_loss": 1.7247345447540283, + "eval_loss": 4.535070419311523, + "step": 9728 + }, + { + "epoch": 0.2850069581776899, + "eval_bleu": 0.2967971170918398, + "eval_cap_loss": 1.085601568222046, + "eval_con_loss": 1.7247345447540283, + "eval_loss": 4.535070419311523, + "eval_runtime": 54.2463, + "eval_samples_per_second": 368.689, + "eval_steps_per_second": 0.369, + "step": 9728 + }, + { + "epoch": 0.2850655533582363, + "grad_norm": 13.825709342956543, + "learning_rate": 9.889637050405645e-06, + "loss": 4.2915, + "step": 9730 + }, + { + "epoch": 0.2853585292609683, + "grad_norm": 14.563608169555664, + "learning_rate": 9.889306362105377e-06, + "loss": 4.2675, + "step": 9740 + }, + { + "epoch": 0.2856515051637003, + "grad_norm": 14.051064491271973, + "learning_rate": 9.888975184659018e-06, + "loss": 4.2685, + "step": 9750 + }, + { + "epoch": 0.2859444810664323, + "grad_norm": 14.896685600280762, + "learning_rate": 9.888643518099696e-06, + "loss": 4.2823, + "step": 9760 + }, + { + "epoch": 0.2862374569691643, + "grad_norm": 14.46728515625, + "learning_rate": 9.888311362460592e-06, + "loss": 4.2739, + "step": 9770 + }, + { + "epoch": 0.2865304328718963, + "grad_norm": 13.626880645751953, + "learning_rate": 9.88797871777494e-06, + "loss": 4.2725, + "step": 9780 + }, + { + "epoch": 0.2868234087746283, + "grad_norm": 12.810077667236328, + "learning_rate": 9.887645584076014e-06, + "loss": 4.2795, + "step": 9790 + }, + { + "epoch": 0.2871163846773603, + "grad_norm": 12.882402420043945, + "learning_rate": 9.887311961397146e-06, + "loss": 4.2719, + "step": 9800 + }, + { + "epoch": 0.2874093605800923, + "grad_norm": 15.372468948364258, + "learning_rate": 9.88697784977171e-06, + "loss": 4.2509, + "step": 9810 + }, + { + "epoch": 0.2877023364828243, + "grad_norm": 15.10167407989502, + "learning_rate": 9.886643249233137e-06, + "loss": 4.2736, + 
"step": 9820 + }, + { + "epoch": 0.2879953123855563, + "grad_norm": 13.485176086425781, + "learning_rate": 9.886308159814896e-06, + "loss": 4.2843, + "step": 9830 + }, + { + "epoch": 0.2882882882882883, + "grad_norm": 15.346785545349121, + "learning_rate": 9.885972581550514e-06, + "loss": 4.2873, + "step": 9840 + }, + { + "epoch": 0.2885812641910203, + "grad_norm": 11.945778846740723, + "learning_rate": 9.885636514473563e-06, + "loss": 4.2561, + "step": 9850 + }, + { + "epoch": 0.2888742400937523, + "grad_norm": 15.248359680175781, + "learning_rate": 9.885299958617662e-06, + "loss": 4.2507, + "step": 9860 + }, + { + "epoch": 0.28916721599648426, + "grad_norm": 15.212157249450684, + "learning_rate": 9.884962914016484e-06, + "loss": 4.2911, + "step": 9870 + }, + { + "epoch": 0.2894601918992163, + "grad_norm": 12.31901741027832, + "learning_rate": 9.884625380703751e-06, + "loss": 4.2545, + "step": 9880 + }, + { + "epoch": 0.28975316780194826, + "grad_norm": 13.03384017944336, + "learning_rate": 9.884287358713226e-06, + "loss": 4.2667, + "step": 9890 + }, + { + "epoch": 0.2900461437046803, + "grad_norm": 13.680036544799805, + "learning_rate": 9.883948848078728e-06, + "loss": 4.2934, + "step": 9900 + }, + { + "epoch": 0.29033911960741227, + "grad_norm": 14.331565856933594, + "learning_rate": 9.883609848834123e-06, + "loss": 4.2397, + "step": 9910 + }, + { + "epoch": 0.2906320955101443, + "grad_norm": 12.314718246459961, + "learning_rate": 9.883270361013328e-06, + "loss": 4.2438, + "step": 9920 + }, + { + "epoch": 0.29092507141287627, + "grad_norm": 12.984600067138672, + "learning_rate": 9.882930384650303e-06, + "loss": 4.2567, + "step": 9930 + }, + { + "epoch": 0.2912180473156083, + "grad_norm": 13.481085777282715, + "learning_rate": 9.882589919779065e-06, + "loss": 4.2403, + "step": 9940 + }, + { + "epoch": 0.29151102321834027, + "grad_norm": 13.600337028503418, + "learning_rate": 9.882248966433669e-06, + "loss": 4.2417, + "step": 9950 + }, + { + "epoch": 
0.2918039991210723, + "grad_norm": 13.558712005615234, + "learning_rate": 9.881907524648232e-06, + "loss": 4.2874, + "step": 9960 + }, + { + "epoch": 0.2920969750238043, + "grad_norm": 16.10821533203125, + "learning_rate": 9.88156559445691e-06, + "loss": 4.2772, + "step": 9970 + }, + { + "epoch": 0.2923899509265363, + "grad_norm": 14.361590385437012, + "learning_rate": 9.881223175893912e-06, + "loss": 4.2619, + "step": 9980 + }, + { + "epoch": 0.2926829268292683, + "grad_norm": 14.739971160888672, + "learning_rate": 9.880880268993494e-06, + "loss": 4.2289, + "step": 9990 + }, + { + "epoch": 0.2929759027320003, + "grad_norm": 15.630659103393555, + "learning_rate": 9.880536873789962e-06, + "loss": 4.2958, + "step": 10000 + }, + { + "epoch": 0.2932688786347323, + "grad_norm": 13.611684799194336, + "learning_rate": 9.880192990317671e-06, + "loss": 4.2571, + "step": 10010 + }, + { + "epoch": 0.2935618545374643, + "grad_norm": 14.070144653320312, + "learning_rate": 9.879848618611026e-06, + "loss": 4.2275, + "step": 10020 + }, + { + "epoch": 0.2938548304401963, + "grad_norm": 14.600704193115234, + "learning_rate": 9.879503758704476e-06, + "loss": 4.2345, + "step": 10030 + }, + { + "epoch": 0.2941478063429283, + "grad_norm": 14.663426399230957, + "learning_rate": 9.879158410632525e-06, + "loss": 4.2171, + "step": 10040 + }, + { + "epoch": 0.2944407822456603, + "grad_norm": 13.32021427154541, + "learning_rate": 9.878812574429722e-06, + "loss": 4.2491, + "step": 10050 + }, + { + "epoch": 0.2947337581483923, + "grad_norm": 14.462124824523926, + "learning_rate": 9.878466250130665e-06, + "loss": 4.2507, + "step": 10060 + }, + { + "epoch": 0.2950267340511243, + "grad_norm": 15.131163597106934, + "learning_rate": 9.878119437770003e-06, + "loss": 4.2594, + "step": 10070 + }, + { + "epoch": 0.2953197099538563, + "grad_norm": 13.835153579711914, + "learning_rate": 9.877772137382433e-06, + "loss": 4.2553, + "step": 10080 + }, + { + "epoch": 0.2956126858565883, + "grad_norm": 
13.626821517944336, + "learning_rate": 9.8774243490027e-06, + "loss": 4.2345, + "step": 10090 + }, + { + "epoch": 0.2959056617593203, + "grad_norm": 14.184571266174316, + "learning_rate": 9.877076072665596e-06, + "loss": 4.2784, + "step": 10100 + }, + { + "epoch": 0.2961986376620523, + "grad_norm": 13.360740661621094, + "learning_rate": 9.876727308405967e-06, + "loss": 4.2535, + "step": 10110 + }, + { + "epoch": 0.2964916135647843, + "grad_norm": 13.56796932220459, + "learning_rate": 9.876378056258703e-06, + "loss": 4.2267, + "step": 10120 + }, + { + "epoch": 0.2967845894675163, + "grad_norm": 15.53066349029541, + "learning_rate": 9.876028316258745e-06, + "loss": 4.2468, + "step": 10130 + }, + { + "epoch": 0.2970775653702483, + "grad_norm": 13.847089767456055, + "learning_rate": 9.875678088441083e-06, + "loss": 4.2599, + "step": 10140 + }, + { + "epoch": 0.2973705412729803, + "grad_norm": 13.327733993530273, + "learning_rate": 9.875327372840753e-06, + "loss": 4.2454, + "step": 10150 + }, + { + "epoch": 0.2976635171757123, + "grad_norm": 14.269998550415039, + "learning_rate": 9.874976169492845e-06, + "loss": 4.2549, + "step": 10160 + }, + { + "epoch": 0.2979564930784443, + "grad_norm": 15.102872848510742, + "learning_rate": 9.874624478432494e-06, + "loss": 4.2537, + "step": 10170 + }, + { + "epoch": 0.2982494689811763, + "grad_norm": 13.668438911437988, + "learning_rate": 9.874272299694883e-06, + "loss": 4.2315, + "step": 10180 + }, + { + "epoch": 0.2985424448839083, + "grad_norm": 13.578145027160645, + "learning_rate": 9.873919633315246e-06, + "loss": 4.229, + "step": 10190 + }, + { + "epoch": 0.29883542078664027, + "grad_norm": 13.826428413391113, + "learning_rate": 9.873566479328867e-06, + "loss": 4.2453, + "step": 10200 + }, + { + "epoch": 0.2991283966893723, + "grad_norm": 13.105527877807617, + "learning_rate": 9.873212837771074e-06, + "loss": 4.2625, + "step": 10210 + }, + { + "epoch": 0.2994213725921043, + "grad_norm": 15.577691078186035, + "learning_rate": 
9.872858708677249e-06, + "loss": 4.2356, + "step": 10220 + }, + { + "epoch": 0.2997143484948363, + "grad_norm": 12.90665054321289, + "learning_rate": 9.87250409208282e-06, + "loss": 4.2322, + "step": 10230 + }, + { + "epoch": 0.3000073243975683, + "grad_norm": 13.697235107421875, + "learning_rate": 9.872148988023265e-06, + "loss": 4.2431, + "step": 10240 + }, + { + "epoch": 0.3000073243975683, + "eval_bleu": 0.2992839422459736, + "eval_cap_loss": 1.077258586883545, + "eval_con_loss": 1.7040789127349854, + "eval_loss": 4.485416412353516, + "step": 10240 + }, + { + "epoch": 0.3000073243975683, + "eval_bleu": 0.2992839422459736, + "eval_cap_loss": 1.077258586883545, + "eval_con_loss": 1.7040789127349854, + "eval_loss": 4.485416412353516, + "eval_runtime": 58.1895, + "eval_samples_per_second": 343.705, + "eval_steps_per_second": 0.344, + "step": 10240 + }, + { + "epoch": 0.3003003003003003, + "grad_norm": 13.052034378051758, + "learning_rate": 9.871793396534109e-06, + "loss": 4.2512, + "step": 10250 + }, + { + "epoch": 0.3005932762030323, + "grad_norm": 14.481064796447754, + "learning_rate": 9.871437317650926e-06, + "loss": 4.2256, + "step": 10260 + }, + { + "epoch": 0.3008862521057643, + "grad_norm": 13.707125663757324, + "learning_rate": 9.871080751409341e-06, + "loss": 4.2534, + "step": 10270 + }, + { + "epoch": 0.3011792280084963, + "grad_norm": 13.05585765838623, + "learning_rate": 9.870723697845026e-06, + "loss": 4.2549, + "step": 10280 + }, + { + "epoch": 0.3014722039112283, + "grad_norm": 15.04872989654541, + "learning_rate": 9.870366156993703e-06, + "loss": 4.223, + "step": 10290 + }, + { + "epoch": 0.3017651798139603, + "grad_norm": 14.226932525634766, + "learning_rate": 9.870008128891139e-06, + "loss": 4.2248, + "step": 10300 + }, + { + "epoch": 0.3020581557166923, + "grad_norm": 14.618051528930664, + "learning_rate": 9.869649613573157e-06, + "loss": 4.2466, + "step": 10310 + }, + { + "epoch": 0.3023511316194243, + "grad_norm": 14.710643768310547, + 
"learning_rate": 9.86929061107562e-06, + "loss": 4.217, + "step": 10320 + }, + { + "epoch": 0.3026441075221563, + "grad_norm": 12.669486045837402, + "learning_rate": 9.868931121434446e-06, + "loss": 4.2342, + "step": 10330 + }, + { + "epoch": 0.3029370834248883, + "grad_norm": 14.564130783081055, + "learning_rate": 9.8685711446856e-06, + "loss": 4.2149, + "step": 10340 + }, + { + "epoch": 0.3032300593276203, + "grad_norm": 13.864930152893066, + "learning_rate": 9.868210680865096e-06, + "loss": 4.2057, + "step": 10350 + }, + { + "epoch": 0.3035230352303523, + "grad_norm": 14.688220024108887, + "learning_rate": 9.867849730008994e-06, + "loss": 4.2286, + "step": 10360 + }, + { + "epoch": 0.3038160111330843, + "grad_norm": 14.4833402633667, + "learning_rate": 9.867488292153407e-06, + "loss": 4.2439, + "step": 10370 + }, + { + "epoch": 0.3041089870358163, + "grad_norm": 12.015641212463379, + "learning_rate": 9.867126367334495e-06, + "loss": 4.2256, + "step": 10380 + }, + { + "epoch": 0.3044019629385483, + "grad_norm": 14.051802635192871, + "learning_rate": 9.866763955588465e-06, + "loss": 4.2255, + "step": 10390 + }, + { + "epoch": 0.3046949388412803, + "grad_norm": 12.659721374511719, + "learning_rate": 9.866401056951575e-06, + "loss": 4.1965, + "step": 10400 + }, + { + "epoch": 0.3049879147440123, + "grad_norm": 14.10120677947998, + "learning_rate": 9.866037671460129e-06, + "loss": 4.2388, + "step": 10410 + }, + { + "epoch": 0.3052808906467443, + "grad_norm": 14.442778587341309, + "learning_rate": 9.865673799150485e-06, + "loss": 4.2058, + "step": 10420 + }, + { + "epoch": 0.3055738665494763, + "grad_norm": 14.678797721862793, + "learning_rate": 9.865309440059044e-06, + "loss": 4.1948, + "step": 10430 + }, + { + "epoch": 0.3058668424522083, + "grad_norm": 16.48200035095215, + "learning_rate": 9.864944594222259e-06, + "loss": 4.2191, + "step": 10440 + }, + { + "epoch": 0.3061598183549403, + "grad_norm": 15.201152801513672, + "learning_rate": 9.86457926167663e-06, + 
"loss": 4.2086, + "step": 10450 + }, + { + "epoch": 0.3064527942576723, + "grad_norm": 13.527443885803223, + "learning_rate": 9.864213442458705e-06, + "loss": 4.2116, + "step": 10460 + }, + { + "epoch": 0.30674577016040433, + "grad_norm": 12.240650177001953, + "learning_rate": 9.863847136605085e-06, + "loss": 4.2252, + "step": 10470 + }, + { + "epoch": 0.3070387460631363, + "grad_norm": 14.205978393554688, + "learning_rate": 9.863480344152414e-06, + "loss": 4.2231, + "step": 10480 + }, + { + "epoch": 0.30733172196586833, + "grad_norm": 13.153482437133789, + "learning_rate": 9.86311306513739e-06, + "loss": 4.2081, + "step": 10490 + }, + { + "epoch": 0.3076246978686003, + "grad_norm": 12.556747436523438, + "learning_rate": 9.862745299596754e-06, + "loss": 4.17, + "step": 10500 + }, + { + "epoch": 0.30791767377133233, + "grad_norm": 14.265571594238281, + "learning_rate": 9.862377047567301e-06, + "loss": 4.1909, + "step": 10510 + }, + { + "epoch": 0.3082106496740643, + "grad_norm": 14.491252899169922, + "learning_rate": 9.862008309085873e-06, + "loss": 4.2239, + "step": 10520 + }, + { + "epoch": 0.3085036255767963, + "grad_norm": 14.464677810668945, + "learning_rate": 9.861639084189358e-06, + "loss": 4.2325, + "step": 10530 + }, + { + "epoch": 0.3087966014795283, + "grad_norm": 12.68281078338623, + "learning_rate": 9.861269372914697e-06, + "loss": 4.1991, + "step": 10540 + }, + { + "epoch": 0.3090895773822603, + "grad_norm": 13.245171546936035, + "learning_rate": 9.860899175298876e-06, + "loss": 4.2382, + "step": 10550 + }, + { + "epoch": 0.3093825532849923, + "grad_norm": 14.007186889648438, + "learning_rate": 9.860528491378929e-06, + "loss": 4.2098, + "step": 10560 + }, + { + "epoch": 0.3096755291877243, + "grad_norm": 13.302517890930176, + "learning_rate": 9.860157321191944e-06, + "loss": 4.1855, + "step": 10570 + }, + { + "epoch": 0.3099685050904563, + "grad_norm": 13.241037368774414, + "learning_rate": 9.859785664775054e-06, + "loss": 4.1632, + "step": 10580 + }, 
+ { + "epoch": 0.3102614809931883, + "grad_norm": 14.82482624053955, + "learning_rate": 9.85941352216544e-06, + "loss": 4.2037, + "step": 10590 + }, + { + "epoch": 0.3105544568959203, + "grad_norm": 13.52000904083252, + "learning_rate": 9.859078178152779e-06, + "loss": 4.2053, + "step": 10600 + }, + { + "epoch": 0.3108474327986523, + "grad_norm": 13.891847610473633, + "learning_rate": 9.858705111879601e-06, + "loss": 4.1977, + "step": 10610 + }, + { + "epoch": 0.3111404087013843, + "grad_norm": 12.626907348632812, + "learning_rate": 9.858331559521801e-06, + "loss": 4.1966, + "step": 10620 + }, + { + "epoch": 0.3114333846041163, + "grad_norm": 16.07732582092285, + "learning_rate": 9.857957521116755e-06, + "loss": 4.2073, + "step": 10630 + }, + { + "epoch": 0.3117263605068483, + "grad_norm": 13.850656509399414, + "learning_rate": 9.857582996701878e-06, + "loss": 4.2023, + "step": 10640 + }, + { + "epoch": 0.3120193364095803, + "grad_norm": 14.266563415527344, + "learning_rate": 9.85720798631464e-06, + "loss": 4.1892, + "step": 10650 + }, + { + "epoch": 0.3123123123123123, + "grad_norm": 13.789026260375977, + "learning_rate": 9.856832489992562e-06, + "loss": 4.1951, + "step": 10660 + }, + { + "epoch": 0.3126052882150443, + "grad_norm": 14.479269981384277, + "learning_rate": 9.856456507773207e-06, + "loss": 4.2225, + "step": 10670 + }, + { + "epoch": 0.3128982641177763, + "grad_norm": 13.662842750549316, + "learning_rate": 9.85608003969419e-06, + "loss": 4.1986, + "step": 10680 + }, + { + "epoch": 0.3131912400205083, + "grad_norm": 13.206548690795898, + "learning_rate": 9.855703085793178e-06, + "loss": 4.2026, + "step": 10690 + }, + { + "epoch": 0.31348421592324033, + "grad_norm": 14.557141304016113, + "learning_rate": 9.855325646107878e-06, + "loss": 4.163, + "step": 10700 + }, + { + "epoch": 0.3137771918259723, + "grad_norm": 13.250200271606445, + "learning_rate": 9.854947720676053e-06, + "loss": 4.1846, + "step": 10710 + }, + { + "epoch": 0.31407016772870433, + 
"grad_norm": 16.079437255859375, + "learning_rate": 9.854569309535513e-06, + "loss": 4.1941, + "step": 10720 + }, + { + "epoch": 0.3143631436314363, + "grad_norm": 13.987637519836426, + "learning_rate": 9.854190412724114e-06, + "loss": 4.1871, + "step": 10730 + }, + { + "epoch": 0.31465611953416833, + "grad_norm": 15.212264060974121, + "learning_rate": 9.853811030279763e-06, + "loss": 4.1914, + "step": 10740 + }, + { + "epoch": 0.3149490954369003, + "grad_norm": 12.636719703674316, + "learning_rate": 9.853431162240414e-06, + "loss": 4.2047, + "step": 10750 + }, + { + "epoch": 0.3150076906174467, + "eval_bleu": 0.3003326634171149, + "eval_cap_loss": 1.0704299211502075, + "eval_con_loss": 1.6826057434082031, + "eval_loss": 4.435641288757324, + "step": 10752 + }, + { + "epoch": 0.3150076906174467, + "eval_bleu": 0.3003326634171149, + "eval_cap_loss": 1.0704299211502075, + "eval_con_loss": 1.6826057434082031, + "eval_loss": 4.435641288757324, + "eval_runtime": 55.3053, + "eval_samples_per_second": 361.629, + "eval_steps_per_second": 0.362, + "step": 10752 + }, + { + "epoch": 0.31524207133963233, + "grad_norm": 13.47153091430664, + "learning_rate": 9.853050808644073e-06, + "loss": 4.2018, + "step": 10760 + }, + { + "epoch": 0.3155350472423643, + "grad_norm": 14.186649322509766, + "learning_rate": 9.852669969528792e-06, + "loss": 4.1784, + "step": 10770 + }, + { + "epoch": 0.31582802314509634, + "grad_norm": 13.116604804992676, + "learning_rate": 9.852288644932668e-06, + "loss": 4.1797, + "step": 10780 + }, + { + "epoch": 0.3161209990478283, + "grad_norm": 13.540607452392578, + "learning_rate": 9.851906834893853e-06, + "loss": 4.1829, + "step": 10790 + }, + { + "epoch": 0.31641397495056034, + "grad_norm": 13.841035842895508, + "learning_rate": 9.851524539450547e-06, + "loss": 4.1711, + "step": 10800 + }, + { + "epoch": 0.3167069508532923, + "grad_norm": 12.5095796585083, + "learning_rate": 9.851141758640992e-06, + "loss": 4.1747, + "step": 10810 + }, + { + "epoch": 
0.31699992675602434, + "grad_norm": 14.508861541748047, + "learning_rate": 9.850758492503484e-06, + "loss": 4.1764, + "step": 10820 + }, + { + "epoch": 0.3172929026587563, + "grad_norm": 13.648031234741211, + "learning_rate": 9.85037474107637e-06, + "loss": 4.169, + "step": 10830 + }, + { + "epoch": 0.31758587856148834, + "grad_norm": 13.131007194519043, + "learning_rate": 9.849990504398037e-06, + "loss": 4.1978, + "step": 10840 + }, + { + "epoch": 0.3178788544642203, + "grad_norm": 14.422877311706543, + "learning_rate": 9.84960578250693e-06, + "loss": 4.204, + "step": 10850 + }, + { + "epoch": 0.3181718303669523, + "grad_norm": 13.398043632507324, + "learning_rate": 9.849220575441533e-06, + "loss": 4.1865, + "step": 10860 + }, + { + "epoch": 0.3184648062696843, + "grad_norm": 12.765737533569336, + "learning_rate": 9.84883488324039e-06, + "loss": 4.1584, + "step": 10870 + }, + { + "epoch": 0.3187577821724163, + "grad_norm": 13.52071762084961, + "learning_rate": 9.848448705942085e-06, + "loss": 4.1818, + "step": 10880 + }, + { + "epoch": 0.3190507580751483, + "grad_norm": 14.82532787322998, + "learning_rate": 9.848062043585248e-06, + "loss": 4.1835, + "step": 10890 + }, + { + "epoch": 0.3193437339778803, + "grad_norm": 13.742535591125488, + "learning_rate": 9.84767489620857e-06, + "loss": 4.1865, + "step": 10900 + }, + { + "epoch": 0.3196367098806123, + "grad_norm": 14.646089553833008, + "learning_rate": 9.847287263850776e-06, + "loss": 4.1785, + "step": 10910 + }, + { + "epoch": 0.3199296857833443, + "grad_norm": 13.050314903259277, + "learning_rate": 9.84689914655065e-06, + "loss": 4.1799, + "step": 10920 + }, + { + "epoch": 0.3202226616860763, + "grad_norm": 14.804306983947754, + "learning_rate": 9.84651054434702e-06, + "loss": 4.1677, + "step": 10930 + }, + { + "epoch": 0.3205156375888083, + "grad_norm": 12.387232780456543, + "learning_rate": 9.846121457278763e-06, + "loss": 4.1851, + "step": 10940 + }, + { + "epoch": 0.32080861349154033, + "grad_norm": 
13.390987396240234, + "learning_rate": 9.845731885384806e-06, + "loss": 4.1721, + "step": 10950 + }, + { + "epoch": 0.3211015893942723, + "grad_norm": 14.124608039855957, + "learning_rate": 9.845341828704123e-06, + "loss": 4.17, + "step": 10960 + }, + { + "epoch": 0.32139456529700433, + "grad_norm": 13.233084678649902, + "learning_rate": 9.844951287275736e-06, + "loss": 4.1473, + "step": 10970 + }, + { + "epoch": 0.3216875411997363, + "grad_norm": 14.897387504577637, + "learning_rate": 9.844560261138717e-06, + "loss": 4.1433, + "step": 10980 + }, + { + "epoch": 0.32198051710246833, + "grad_norm": 13.727469444274902, + "learning_rate": 9.844168750332185e-06, + "loss": 4.1944, + "step": 10990 + }, + { + "epoch": 0.3222734930052003, + "grad_norm": 12.94725513458252, + "learning_rate": 9.84377675489531e-06, + "loss": 4.1615, + "step": 11000 + }, + { + "epoch": 0.32256646890793234, + "grad_norm": 14.908360481262207, + "learning_rate": 9.843384274867307e-06, + "loss": 4.1801, + "step": 11010 + }, + { + "epoch": 0.3228594448106643, + "grad_norm": 14.32445240020752, + "learning_rate": 9.842991310287443e-06, + "loss": 4.1581, + "step": 11020 + }, + { + "epoch": 0.32315242071339634, + "grad_norm": 15.203826904296875, + "learning_rate": 9.84259786119503e-06, + "loss": 4.1621, + "step": 11030 + }, + { + "epoch": 0.3234453966161283, + "grad_norm": 13.81208324432373, + "learning_rate": 9.842203927629432e-06, + "loss": 4.1609, + "step": 11040 + }, + { + "epoch": 0.32373837251886034, + "grad_norm": 13.700340270996094, + "learning_rate": 9.841809509630059e-06, + "loss": 4.1676, + "step": 11050 + }, + { + "epoch": 0.3240313484215923, + "grad_norm": 13.212390899658203, + "learning_rate": 9.841414607236368e-06, + "loss": 4.1269, + "step": 11060 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 13.604581832885742, + "learning_rate": 9.84101922048787e-06, + "loss": 4.1433, + "step": 11070 + }, + { + "epoch": 0.3246173002270563, + "grad_norm": 13.770525932312012, + "learning_rate": 
9.84062334942412e-06, + "loss": 4.1489, + "step": 11080 + }, + { + "epoch": 0.32491027612978834, + "grad_norm": 11.826311111450195, + "learning_rate": 9.840226994084723e-06, + "loss": 4.1423, + "step": 11090 + }, + { + "epoch": 0.3252032520325203, + "grad_norm": 12.91601276397705, + "learning_rate": 9.839830154509329e-06, + "loss": 4.1813, + "step": 11100 + }, + { + "epoch": 0.32549622793525235, + "grad_norm": 13.569409370422363, + "learning_rate": 9.839432830737643e-06, + "loss": 4.1875, + "step": 11110 + }, + { + "epoch": 0.3257892038379843, + "grad_norm": 14.761457443237305, + "learning_rate": 9.839035022809414e-06, + "loss": 4.1704, + "step": 11120 + }, + { + "epoch": 0.32608217974071635, + "grad_norm": 13.20815372467041, + "learning_rate": 9.83863673076444e-06, + "loss": 4.1597, + "step": 11130 + }, + { + "epoch": 0.3263751556434483, + "grad_norm": 13.798486709594727, + "learning_rate": 9.838237954642569e-06, + "loss": 4.1657, + "step": 11140 + }, + { + "epoch": 0.32666813154618035, + "grad_norm": 12.35238265991211, + "learning_rate": 9.837838694483692e-06, + "loss": 4.1721, + "step": 11150 + }, + { + "epoch": 0.3269611074489123, + "grad_norm": 13.791654586791992, + "learning_rate": 9.837438950327759e-06, + "loss": 4.1627, + "step": 11160 + }, + { + "epoch": 0.32725408335164435, + "grad_norm": 14.955191612243652, + "learning_rate": 9.837038722214756e-06, + "loss": 4.1605, + "step": 11170 + }, + { + "epoch": 0.3275470592543763, + "grad_norm": 14.065284729003906, + "learning_rate": 9.836638010184726e-06, + "loss": 4.1267, + "step": 11180 + }, + { + "epoch": 0.3278400351571083, + "grad_norm": 15.941816329956055, + "learning_rate": 9.836236814277759e-06, + "loss": 4.1577, + "step": 11190 + }, + { + "epoch": 0.32813301105984033, + "grad_norm": 14.448884963989258, + "learning_rate": 9.835835134533991e-06, + "loss": 4.171, + "step": 11200 + }, + { + "epoch": 0.3284259869625723, + "grad_norm": 12.826201438903809, + "learning_rate": 9.835432970993605e-06, + "loss": 
4.1452, + "step": 11210 + }, + { + "epoch": 0.32871896286530433, + "grad_norm": 12.559414863586426, + "learning_rate": 9.835030323696841e-06, + "loss": 4.18, + "step": 11220 + }, + { + "epoch": 0.3290119387680363, + "grad_norm": 13.208146095275879, + "learning_rate": 9.83462719268398e-06, + "loss": 4.1611, + "step": 11230 + }, + { + "epoch": 0.32930491467076833, + "grad_norm": 13.794997215270996, + "learning_rate": 9.834223577995349e-06, + "loss": 4.1508, + "step": 11240 + }, + { + "epoch": 0.3295978905735003, + "grad_norm": 14.017576217651367, + "learning_rate": 9.833819479671328e-06, + "loss": 4.1525, + "step": 11250 + }, + { + "epoch": 0.32989086647623234, + "grad_norm": 13.273810386657715, + "learning_rate": 9.833414897752346e-06, + "loss": 4.1335, + "step": 11260 + }, + { + "epoch": 0.33000805683732515, + "eval_bleu": 0.3000988827773419, + "eval_cap_loss": 1.070772647857666, + "eval_con_loss": 1.6635377407073975, + "eval_loss": 4.397848129272461, + "step": 11264 + }, + { + "epoch": 0.33000805683732515, + "eval_bleu": 0.3000988827773419, + "eval_cap_loss": 1.070772647857666, + "eval_con_loss": 1.6635377407073975, + "eval_loss": 4.397848129272461, + "eval_runtime": 55.7818, + "eval_samples_per_second": 358.54, + "eval_steps_per_second": 0.359, + "step": 11264 + }, + { + "epoch": 0.3301838423789643, + "grad_norm": 13.645787239074707, + "learning_rate": 9.833009832278882e-06, + "loss": 4.15, + "step": 11270 + }, + { + "epoch": 0.33047681828169634, + "grad_norm": 13.787063598632812, + "learning_rate": 9.832604283291456e-06, + "loss": 4.1729, + "step": 11280 + }, + { + "epoch": 0.3307697941844283, + "grad_norm": 14.225363731384277, + "learning_rate": 9.832198250830644e-06, + "loss": 4.1415, + "step": 11290 + }, + { + "epoch": 0.33106277008716034, + "grad_norm": 13.682636260986328, + "learning_rate": 9.831791734937064e-06, + "loss": 4.1405, + "step": 11300 + }, + { + "epoch": 0.3313557459898923, + "grad_norm": 13.093945503234863, + "learning_rate": 
9.831384735651386e-06, + "loss": 4.1458, + "step": 11310 + }, + { + "epoch": 0.33164872189262434, + "grad_norm": 12.9299955368042, + "learning_rate": 9.830977253014332e-06, + "loss": 4.1278, + "step": 11320 + }, + { + "epoch": 0.3319416977953563, + "grad_norm": 14.764959335327148, + "learning_rate": 9.830569287066661e-06, + "loss": 4.1416, + "step": 11330 + }, + { + "epoch": 0.33223467369808835, + "grad_norm": 12.396750450134277, + "learning_rate": 9.830160837849194e-06, + "loss": 4.1482, + "step": 11340 + }, + { + "epoch": 0.3325276496008203, + "grad_norm": 13.550492286682129, + "learning_rate": 9.829751905402792e-06, + "loss": 4.1395, + "step": 11350 + }, + { + "epoch": 0.33282062550355235, + "grad_norm": 12.911694526672363, + "learning_rate": 9.829342489768363e-06, + "loss": 4.1056, + "step": 11360 + }, + { + "epoch": 0.3331136014062843, + "grad_norm": 13.220741271972656, + "learning_rate": 9.828932590986872e-06, + "loss": 4.146, + "step": 11370 + }, + { + "epoch": 0.33340657730901635, + "grad_norm": 14.705076217651367, + "learning_rate": 9.828522209099322e-06, + "loss": 4.1558, + "step": 11380 + }, + { + "epoch": 0.3336995532117483, + "grad_norm": 12.00400447845459, + "learning_rate": 9.828111344146774e-06, + "loss": 4.1219, + "step": 11390 + }, + { + "epoch": 0.33399252911448035, + "grad_norm": 13.537572860717773, + "learning_rate": 9.82769999617033e-06, + "loss": 4.1253, + "step": 11400 + }, + { + "epoch": 0.3342855050172123, + "grad_norm": 14.032508850097656, + "learning_rate": 9.827288165211143e-06, + "loss": 4.1554, + "step": 11410 + }, + { + "epoch": 0.33457848091994435, + "grad_norm": 13.306488990783691, + "learning_rate": 9.826875851310413e-06, + "loss": 4.1525, + "step": 11420 + }, + { + "epoch": 0.3348714568226763, + "grad_norm": 14.158988952636719, + "learning_rate": 9.826463054509393e-06, + "loss": 4.1186, + "step": 11430 + }, + { + "epoch": 0.33516443272540836, + "grad_norm": 13.326722145080566, + "learning_rate": 9.826049774849378e-06, + "loss": 
4.1353, + "step": 11440 + }, + { + "epoch": 0.33545740862814033, + "grad_norm": 14.499959945678711, + "learning_rate": 9.825636012371714e-06, + "loss": 4.1077, + "step": 11450 + }, + { + "epoch": 0.33575038453087236, + "grad_norm": 12.78305721282959, + "learning_rate": 9.825221767117797e-06, + "loss": 4.1306, + "step": 11460 + }, + { + "epoch": 0.33604336043360433, + "grad_norm": 13.72365665435791, + "learning_rate": 9.82480703912907e-06, + "loss": 4.1233, + "step": 11470 + }, + { + "epoch": 0.33633633633633636, + "grad_norm": 13.901249885559082, + "learning_rate": 9.824391828447022e-06, + "loss": 4.1306, + "step": 11480 + }, + { + "epoch": 0.33662931223906833, + "grad_norm": 15.145869255065918, + "learning_rate": 9.823976135113196e-06, + "loss": 4.1371, + "step": 11490 + }, + { + "epoch": 0.33692228814180036, + "grad_norm": 13.25826358795166, + "learning_rate": 9.823559959169176e-06, + "loss": 4.1577, + "step": 11500 + }, + { + "epoch": 0.33721526404453234, + "grad_norm": 13.113600730895996, + "learning_rate": 9.823143300656598e-06, + "loss": 4.1211, + "step": 11510 + }, + { + "epoch": 0.3375082399472643, + "grad_norm": 13.45997428894043, + "learning_rate": 9.822726159617148e-06, + "loss": 4.1142, + "step": 11520 + }, + { + "epoch": 0.33780121584999634, + "grad_norm": 12.294282913208008, + "learning_rate": 9.822308536092559e-06, + "loss": 4.1523, + "step": 11530 + }, + { + "epoch": 0.3380941917527283, + "grad_norm": 15.276236534118652, + "learning_rate": 9.82189043012461e-06, + "loss": 4.1202, + "step": 11540 + }, + { + "epoch": 0.33838716765546034, + "grad_norm": 14.358880996704102, + "learning_rate": 9.821471841755132e-06, + "loss": 4.1137, + "step": 11550 + }, + { + "epoch": 0.3386801435581923, + "grad_norm": 14.0089693069458, + "learning_rate": 9.821052771026001e-06, + "loss": 4.1066, + "step": 11560 + }, + { + "epoch": 0.33897311946092434, + "grad_norm": 12.772789001464844, + "learning_rate": 9.82063321797914e-06, + "loss": 4.1108, + "step": 11570 + }, + { + 
"epoch": 0.3392660953636563, + "grad_norm": 13.051017761230469, + "learning_rate": 9.820213182656528e-06, + "loss": 4.1373, + "step": 11580 + }, + { + "epoch": 0.33955907126638835, + "grad_norm": 15.207862854003906, + "learning_rate": 9.819792665100182e-06, + "loss": 4.122, + "step": 11590 + }, + { + "epoch": 0.3398520471691203, + "grad_norm": 13.350985527038574, + "learning_rate": 9.819371665352178e-06, + "loss": 4.0841, + "step": 11600 + }, + { + "epoch": 0.34014502307185235, + "grad_norm": 13.075576782226562, + "learning_rate": 9.81895018345463e-06, + "loss": 4.1211, + "step": 11610 + }, + { + "epoch": 0.3404379989745843, + "grad_norm": 12.48145866394043, + "learning_rate": 9.818528219449705e-06, + "loss": 4.1108, + "step": 11620 + }, + { + "epoch": 0.34073097487731635, + "grad_norm": 13.767172813415527, + "learning_rate": 9.81810577337962e-06, + "loss": 4.1318, + "step": 11630 + }, + { + "epoch": 0.3410239507800483, + "grad_norm": 13.285630226135254, + "learning_rate": 9.817682845286636e-06, + "loss": 4.1282, + "step": 11640 + }, + { + "epoch": 0.34131692668278035, + "grad_norm": 14.106114387512207, + "learning_rate": 9.817259435213066e-06, + "loss": 4.1213, + "step": 11650 + }, + { + "epoch": 0.3416099025855123, + "grad_norm": 14.200239181518555, + "learning_rate": 9.81683554320127e-06, + "loss": 4.1019, + "step": 11660 + }, + { + "epoch": 0.34190287848824436, + "grad_norm": 15.472943305969238, + "learning_rate": 9.816411169293656e-06, + "loss": 4.1228, + "step": 11670 + }, + { + "epoch": 0.34219585439097633, + "grad_norm": 12.73769474029541, + "learning_rate": 9.815986313532676e-06, + "loss": 4.1449, + "step": 11680 + }, + { + "epoch": 0.34248883029370836, + "grad_norm": 12.25682258605957, + "learning_rate": 9.815560975960842e-06, + "loss": 4.117, + "step": 11690 + }, + { + "epoch": 0.34278180619644033, + "grad_norm": 13.460747718811035, + "learning_rate": 9.8151351566207e-06, + "loss": 4.1212, + "step": 11700 + }, + { + "epoch": 0.34307478209917236, + 
"grad_norm": 12.001855850219727, + "learning_rate": 9.814708855554854e-06, + "loss": 4.0965, + "step": 11710 + }, + { + "epoch": 0.34336775800190433, + "grad_norm": 14.045724868774414, + "learning_rate": 9.814282072805949e-06, + "loss": 4.1171, + "step": 11720 + }, + { + "epoch": 0.34366073390463636, + "grad_norm": 13.378206253051758, + "learning_rate": 9.813854808416687e-06, + "loss": 4.1191, + "step": 11730 + }, + { + "epoch": 0.34395370980736834, + "grad_norm": 14.127825736999512, + "learning_rate": 9.81342706242981e-06, + "loss": 4.0928, + "step": 11740 + }, + { + "epoch": 0.34424668571010036, + "grad_norm": 11.63330078125, + "learning_rate": 9.812998834888113e-06, + "loss": 4.1376, + "step": 11750 + }, + { + "epoch": 0.34453966161283234, + "grad_norm": 13.031737327575684, + "learning_rate": 9.812570125834435e-06, + "loss": 4.092, + "step": 11760 + }, + { + "epoch": 0.34483263751556437, + "grad_norm": 13.600454330444336, + "learning_rate": 9.812140935311671e-06, + "loss": 4.0905, + "step": 11770 + }, + { + "epoch": 0.34500842305720353, + "eval_bleu": 0.30279247604597037, + "eval_cap_loss": 1.0633165836334229, + "eval_con_loss": 1.6454185247421265, + "eval_loss": 4.354153633117676, + "step": 11776 + }, + { + "epoch": 0.34500842305720353, + "eval_bleu": 0.30279247604597037, + "eval_cap_loss": 1.0633165836334229, + "eval_con_loss": 1.6454185247421265, + "eval_loss": 4.354153633117676, + "eval_runtime": 52.5611, + "eval_samples_per_second": 380.509, + "eval_steps_per_second": 0.381, + "step": 11776 + }, + { + "epoch": 0.34512561341829634, + "grad_norm": 13.767080307006836, + "learning_rate": 9.811711263362755e-06, + "loss": 4.1035, + "step": 11780 + }, + { + "epoch": 0.34541858932102837, + "grad_norm": 15.677536964416504, + "learning_rate": 9.811281110030673e-06, + "loss": 4.1192, + "step": 11790 + }, + { + "epoch": 0.34571156522376034, + "grad_norm": 14.811854362487793, + "learning_rate": 9.81085047535846e-06, + "loss": 4.097, + "step": 11800 + }, + { + "epoch": 
0.34600454112649237, + "grad_norm": 14.026432037353516, + "learning_rate": 9.8104193593892e-06, + "loss": 4.0869, + "step": 11810 + }, + { + "epoch": 0.34629751702922434, + "grad_norm": 13.73586654663086, + "learning_rate": 9.809987762166021e-06, + "loss": 4.1105, + "step": 11820 + }, + { + "epoch": 0.3465904929319564, + "grad_norm": 13.735369682312012, + "learning_rate": 9.809555683732105e-06, + "loss": 4.0946, + "step": 11830 + }, + { + "epoch": 0.34688346883468835, + "grad_norm": 13.300454139709473, + "learning_rate": 9.809123124130678e-06, + "loss": 4.1257, + "step": 11840 + }, + { + "epoch": 0.3471764447374203, + "grad_norm": 15.529960632324219, + "learning_rate": 9.80869008340501e-06, + "loss": 4.1146, + "step": 11850 + }, + { + "epoch": 0.34746942064015235, + "grad_norm": 12.313862800598145, + "learning_rate": 9.808256561598431e-06, + "loss": 4.0769, + "step": 11860 + }, + { + "epoch": 0.3477623965428843, + "grad_norm": 12.664068222045898, + "learning_rate": 9.80782255875431e-06, + "loss": 4.0906, + "step": 11870 + }, + { + "epoch": 0.34805537244561635, + "grad_norm": 14.186506271362305, + "learning_rate": 9.807388074916064e-06, + "loss": 4.0745, + "step": 11880 + }, + { + "epoch": 0.3483483483483483, + "grad_norm": 14.339221954345703, + "learning_rate": 9.806953110127162e-06, + "loss": 4.0783, + "step": 11890 + }, + { + "epoch": 0.34864132425108035, + "grad_norm": 13.744891166687012, + "learning_rate": 9.806517664431121e-06, + "loss": 4.0843, + "step": 11900 + }, + { + "epoch": 0.3489343001538123, + "grad_norm": 13.657973289489746, + "learning_rate": 9.806081737871504e-06, + "loss": 4.0987, + "step": 11910 + }, + { + "epoch": 0.34922727605654436, + "grad_norm": 13.857317924499512, + "learning_rate": 9.805645330491922e-06, + "loss": 4.1237, + "step": 11920 + }, + { + "epoch": 0.34952025195927633, + "grad_norm": 13.56093692779541, + "learning_rate": 9.805208442336035e-06, + "loss": 4.0871, + "step": 11930 + }, + { + "epoch": 0.34981322786200836, + 
"grad_norm": 13.381333351135254, + "learning_rate": 9.804771073447554e-06, + "loss": 4.1096, + "step": 11940 + }, + { + "epoch": 0.35010620376474033, + "grad_norm": 13.255882263183594, + "learning_rate": 9.804333223870231e-06, + "loss": 4.1043, + "step": 11950 + }, + { + "epoch": 0.35039917966747236, + "grad_norm": 13.525464057922363, + "learning_rate": 9.803894893647872e-06, + "loss": 4.1275, + "step": 11960 + }, + { + "epoch": 0.35069215557020433, + "grad_norm": 13.118865966796875, + "learning_rate": 9.80345608282433e-06, + "loss": 4.0894, + "step": 11970 + }, + { + "epoch": 0.35098513147293636, + "grad_norm": 13.779436111450195, + "learning_rate": 9.803016791443504e-06, + "loss": 4.0738, + "step": 11980 + }, + { + "epoch": 0.35127810737566834, + "grad_norm": 13.225838661193848, + "learning_rate": 9.802577019549345e-06, + "loss": 4.0759, + "step": 11990 + }, + { + "epoch": 0.35157108327840036, + "grad_norm": 13.550140380859375, + "learning_rate": 9.802136767185846e-06, + "loss": 4.0915, + "step": 12000 + }, + { + "epoch": 0.35186405918113234, + "grad_norm": 12.705817222595215, + "learning_rate": 9.801696034397055e-06, + "loss": 4.0779, + "step": 12010 + }, + { + "epoch": 0.35215703508386437, + "grad_norm": 13.83055591583252, + "learning_rate": 9.801254821227063e-06, + "loss": 4.0813, + "step": 12020 + }, + { + "epoch": 0.35245001098659634, + "grad_norm": 15.466154098510742, + "learning_rate": 9.80081312772001e-06, + "loss": 4.0929, + "step": 12030 + }, + { + "epoch": 0.35274298688932837, + "grad_norm": 13.061784744262695, + "learning_rate": 9.800370953920087e-06, + "loss": 4.0926, + "step": 12040 + }, + { + "epoch": 0.35303596279206034, + "grad_norm": 14.237512588500977, + "learning_rate": 9.79992829987153e-06, + "loss": 4.1099, + "step": 12050 + }, + { + "epoch": 0.35332893869479237, + "grad_norm": 12.60843563079834, + "learning_rate": 9.799485165618623e-06, + "loss": 4.1087, + "step": 12060 + }, + { + "epoch": 0.35362191459752434, + "grad_norm": 
12.664875030517578, + "learning_rate": 9.799041551205699e-06, + "loss": 4.0815, + "step": 12070 + }, + { + "epoch": 0.3539148905002564, + "grad_norm": 12.75241756439209, + "learning_rate": 9.798597456677139e-06, + "loss": 4.0918, + "step": 12080 + }, + { + "epoch": 0.35420786640298835, + "grad_norm": 13.4146089553833, + "learning_rate": 9.798152882077374e-06, + "loss": 4.0879, + "step": 12090 + }, + { + "epoch": 0.3545008423057204, + "grad_norm": 12.291645050048828, + "learning_rate": 9.797707827450879e-06, + "loss": 4.0729, + "step": 12100 + }, + { + "epoch": 0.35479381820845235, + "grad_norm": 13.107827186584473, + "learning_rate": 9.79726229284218e-06, + "loss": 4.0898, + "step": 12110 + }, + { + "epoch": 0.3550867941111844, + "grad_norm": 13.042969703674316, + "learning_rate": 9.796816278295849e-06, + "loss": 4.0922, + "step": 12120 + }, + { + "epoch": 0.35537977001391635, + "grad_norm": 11.727827072143555, + "learning_rate": 9.79636978385651e-06, + "loss": 4.1071, + "step": 12130 + }, + { + "epoch": 0.3556727459166484, + "grad_norm": 12.738091468811035, + "learning_rate": 9.795922809568828e-06, + "loss": 4.1062, + "step": 12140 + }, + { + "epoch": 0.35596572181938035, + "grad_norm": 15.461360931396484, + "learning_rate": 9.795475355477522e-06, + "loss": 4.1051, + "step": 12150 + }, + { + "epoch": 0.3562586977221124, + "grad_norm": 15.579025268554688, + "learning_rate": 9.795027421627359e-06, + "loss": 4.1189, + "step": 12160 + }, + { + "epoch": 0.35655167362484436, + "grad_norm": 14.419559478759766, + "learning_rate": 9.794579008063149e-06, + "loss": 4.1169, + "step": 12170 + }, + { + "epoch": 0.35684464952757633, + "grad_norm": 14.164288520812988, + "learning_rate": 9.794130114829755e-06, + "loss": 4.0735, + "step": 12180 + }, + { + "epoch": 0.35713762543030836, + "grad_norm": 12.268274307250977, + "learning_rate": 9.793680741972084e-06, + "loss": 4.0988, + "step": 12190 + }, + { + "epoch": 0.35743060133304033, + "grad_norm": 13.829787254333496, + 
"learning_rate": 9.793230889535097e-06, + "loss": 4.0542, + "step": 12200 + }, + { + "epoch": 0.35772357723577236, + "grad_norm": 13.024742126464844, + "learning_rate": 9.792780557563795e-06, + "loss": 4.0854, + "step": 12210 + }, + { + "epoch": 0.35801655313850433, + "grad_norm": 13.709090232849121, + "learning_rate": 9.792329746103236e-06, + "loss": 4.0577, + "step": 12220 + }, + { + "epoch": 0.35830952904123636, + "grad_norm": 14.142699241638184, + "learning_rate": 9.791878455198513e-06, + "loss": 4.1028, + "step": 12230 + }, + { + "epoch": 0.35860250494396834, + "grad_norm": 14.048547744750977, + "learning_rate": 9.791426684894783e-06, + "loss": 4.0843, + "step": 12240 + }, + { + "epoch": 0.35889548084670037, + "grad_norm": 14.887382507324219, + "learning_rate": 9.79097443523724e-06, + "loss": 4.0944, + "step": 12250 + }, + { + "epoch": 0.35918845674943234, + "grad_norm": 16.754152297973633, + "learning_rate": 9.790521706271128e-06, + "loss": 4.0963, + "step": 12260 + }, + { + "epoch": 0.35948143265216437, + "grad_norm": 12.898554801940918, + "learning_rate": 9.79006849804174e-06, + "loss": 4.0754, + "step": 12270 + }, + { + "epoch": 0.35977440855489634, + "grad_norm": 14.401851654052734, + "learning_rate": 9.789614810594417e-06, + "loss": 4.0485, + "step": 12280 + }, + { + "epoch": 0.36000878927708196, + "eval_bleu": 0.30163511086475614, + "eval_cap_loss": 1.0604357719421387, + "eval_con_loss": 1.629253625869751, + "eval_loss": 4.318943023681641, + "step": 12288 + }, + { + "epoch": 0.36000878927708196, + "eval_bleu": 0.30163511086475614, + "eval_cap_loss": 1.0604357719421387, + "eval_con_loss": 1.629253625869751, + "eval_loss": 4.318943023681641, + "eval_runtime": 54.988, + "eval_samples_per_second": 363.716, + "eval_steps_per_second": 0.364, + "step": 12288 + }, + { + "epoch": 0.36006738445762837, + "grad_norm": 12.511488914489746, + "learning_rate": 9.789160643974549e-06, + "loss": 4.0427, + "step": 12290 + }, + { + "epoch": 0.36036036036036034, + 
"grad_norm": 13.44015884399414, + "learning_rate": 9.78870599822757e-06, + "loss": 4.078, + "step": 12300 + }, + { + "epoch": 0.3606533362630924, + "grad_norm": 13.154059410095215, + "learning_rate": 9.788250873398968e-06, + "loss": 4.0676, + "step": 12310 + }, + { + "epoch": 0.36094631216582435, + "grad_norm": 13.73733139038086, + "learning_rate": 9.787795269534272e-06, + "loss": 4.0671, + "step": 12320 + }, + { + "epoch": 0.3612392880685564, + "grad_norm": 13.716389656066895, + "learning_rate": 9.787339186679067e-06, + "loss": 4.0804, + "step": 12330 + }, + { + "epoch": 0.36153226397128835, + "grad_norm": 12.069001197814941, + "learning_rate": 9.786882624878977e-06, + "loss": 4.1029, + "step": 12340 + }, + { + "epoch": 0.3618252398740204, + "grad_norm": 13.753615379333496, + "learning_rate": 9.78642558417968e-06, + "loss": 4.061, + "step": 12350 + }, + { + "epoch": 0.36211821577675235, + "grad_norm": 14.37895393371582, + "learning_rate": 9.785968064626901e-06, + "loss": 4.0753, + "step": 12360 + }, + { + "epoch": 0.3624111916794844, + "grad_norm": 14.359475135803223, + "learning_rate": 9.78551006626641e-06, + "loss": 4.0399, + "step": 12370 + }, + { + "epoch": 0.36270416758221635, + "grad_norm": 13.866453170776367, + "learning_rate": 9.78505158914403e-06, + "loss": 4.0554, + "step": 12380 + }, + { + "epoch": 0.3629971434849484, + "grad_norm": 13.182557106018066, + "learning_rate": 9.784592633305626e-06, + "loss": 4.083, + "step": 12390 + }, + { + "epoch": 0.36329011938768035, + "grad_norm": 15.119706153869629, + "learning_rate": 9.784133198797115e-06, + "loss": 4.0659, + "step": 12400 + }, + { + "epoch": 0.3635830952904124, + "grad_norm": 14.01385498046875, + "learning_rate": 9.78367328566446e-06, + "loss": 4.0677, + "step": 12410 + }, + { + "epoch": 0.36387607119314436, + "grad_norm": 13.483988761901855, + "learning_rate": 9.783212893953676e-06, + "loss": 4.103, + "step": 12420 + }, + { + "epoch": 0.3641690470958764, + "grad_norm": 14.396063804626465, + 
"learning_rate": 9.782752023710818e-06, + "loss": 4.0697, + "step": 12430 + }, + { + "epoch": 0.36446202299860836, + "grad_norm": 12.010749816894531, + "learning_rate": 9.782290674981996e-06, + "loss": 4.0663, + "step": 12440 + }, + { + "epoch": 0.3647549989013404, + "grad_norm": 14.130037307739258, + "learning_rate": 9.781828847813364e-06, + "loss": 4.0641, + "step": 12450 + }, + { + "epoch": 0.36504797480407236, + "grad_norm": 13.224223136901855, + "learning_rate": 9.781366542251126e-06, + "loss": 4.07, + "step": 12460 + }, + { + "epoch": 0.3653409507068044, + "grad_norm": 13.565084457397461, + "learning_rate": 9.78090375834153e-06, + "loss": 4.0649, + "step": 12470 + }, + { + "epoch": 0.36563392660953636, + "grad_norm": 15.427536964416504, + "learning_rate": 9.780440496130878e-06, + "loss": 4.0517, + "step": 12480 + }, + { + "epoch": 0.3659269025122684, + "grad_norm": 13.037681579589844, + "learning_rate": 9.779976755665516e-06, + "loss": 4.0715, + "step": 12490 + }, + { + "epoch": 0.36621987841500037, + "grad_norm": 12.87741470336914, + "learning_rate": 9.779512536991839e-06, + "loss": 4.0603, + "step": 12500 + }, + { + "epoch": 0.36651285431773234, + "grad_norm": 12.694574356079102, + "learning_rate": 9.779047840156288e-06, + "loss": 4.0874, + "step": 12510 + }, + { + "epoch": 0.36680583022046437, + "grad_norm": 14.191753387451172, + "learning_rate": 9.778582665205354e-06, + "loss": 4.08, + "step": 12520 + }, + { + "epoch": 0.36709880612319634, + "grad_norm": 13.041732788085938, + "learning_rate": 9.778117012185576e-06, + "loss": 4.0673, + "step": 12530 + }, + { + "epoch": 0.36739178202592837, + "grad_norm": 13.860508918762207, + "learning_rate": 9.777650881143536e-06, + "loss": 4.0754, + "step": 12540 + }, + { + "epoch": 0.36768475792866034, + "grad_norm": 15.75629997253418, + "learning_rate": 9.777184272125872e-06, + "loss": 4.0948, + "step": 12550 + }, + { + "epoch": 0.3679777338313924, + "grad_norm": 13.393556594848633, + "learning_rate": 
9.776717185179263e-06, + "loss": 4.0619, + "step": 12560 + }, + { + "epoch": 0.36827070973412435, + "grad_norm": 15.2987642288208, + "learning_rate": 9.776249620350438e-06, + "loss": 4.0628, + "step": 12570 + }, + { + "epoch": 0.3685636856368564, + "grad_norm": 13.164798736572266, + "learning_rate": 9.775781577686177e-06, + "loss": 4.0298, + "step": 12580 + }, + { + "epoch": 0.36885666153958835, + "grad_norm": 13.74841022491455, + "learning_rate": 9.7753130572333e-06, + "loss": 4.0608, + "step": 12590 + }, + { + "epoch": 0.3691496374423204, + "grad_norm": 12.722235679626465, + "learning_rate": 9.774844059038685e-06, + "loss": 4.0511, + "step": 12600 + }, + { + "epoch": 0.36944261334505235, + "grad_norm": 11.85464859008789, + "learning_rate": 9.77437458314925e-06, + "loss": 4.0752, + "step": 12610 + }, + { + "epoch": 0.3697355892477844, + "grad_norm": 13.926864624023438, + "learning_rate": 9.77390462961196e-06, + "loss": 4.0565, + "step": 12620 + }, + { + "epoch": 0.37002856515051635, + "grad_norm": 13.116537094116211, + "learning_rate": 9.77343419847384e-06, + "loss": 4.0589, + "step": 12630 + }, + { + "epoch": 0.3703215410532484, + "grad_norm": 12.458149909973145, + "learning_rate": 9.772963289781945e-06, + "loss": 4.0487, + "step": 12640 + }, + { + "epoch": 0.37061451695598036, + "grad_norm": 13.11514663696289, + "learning_rate": 9.77249190358339e-06, + "loss": 4.087, + "step": 12650 + }, + { + "epoch": 0.3709074928587124, + "grad_norm": 12.84043025970459, + "learning_rate": 9.772020039925334e-06, + "loss": 4.0307, + "step": 12660 + }, + { + "epoch": 0.37120046876144436, + "grad_norm": 14.010823249816895, + "learning_rate": 9.771547698854985e-06, + "loss": 4.03, + "step": 12670 + }, + { + "epoch": 0.3714934446641764, + "grad_norm": 14.785002708435059, + "learning_rate": 9.771074880419597e-06, + "loss": 4.0224, + "step": 12680 + }, + { + "epoch": 0.37178642056690836, + "grad_norm": 14.127418518066406, + "learning_rate": 9.770601584666474e-06, + "loss": 4.0342, + 
"step": 12690 + }, + { + "epoch": 0.3720793964696404, + "grad_norm": 12.625438690185547, + "learning_rate": 9.770127811642964e-06, + "loss": 4.0428, + "step": 12700 + }, + { + "epoch": 0.37237237237237236, + "grad_norm": 12.145669937133789, + "learning_rate": 9.769653561396468e-06, + "loss": 4.0425, + "step": 12710 + }, + { + "epoch": 0.3726653482751044, + "grad_norm": 14.074153900146484, + "learning_rate": 9.76917883397443e-06, + "loss": 4.0375, + "step": 12720 + }, + { + "epoch": 0.37295832417783636, + "grad_norm": 13.955367088317871, + "learning_rate": 9.768703629424344e-06, + "loss": 4.064, + "step": 12730 + }, + { + "epoch": 0.3732513000805684, + "grad_norm": 12.891091346740723, + "learning_rate": 9.768227947793752e-06, + "loss": 4.0458, + "step": 12740 + }, + { + "epoch": 0.37354427598330037, + "grad_norm": 14.333451271057129, + "learning_rate": 9.767751789130242e-06, + "loss": 4.0314, + "step": 12750 + }, + { + "epoch": 0.3738372518860324, + "grad_norm": 14.003671646118164, + "learning_rate": 9.767275153481452e-06, + "loss": 4.0109, + "step": 12760 + }, + { + "epoch": 0.37413022778876437, + "grad_norm": 13.920406341552734, + "learning_rate": 9.766798040895066e-06, + "loss": 4.0281, + "step": 12770 + }, + { + "epoch": 0.3744232036914964, + "grad_norm": 13.655296325683594, + "learning_rate": 9.766320451418818e-06, + "loss": 4.035, + "step": 12780 + }, + { + "epoch": 0.37471617959422837, + "grad_norm": 13.01014518737793, + "learning_rate": 9.765842385100486e-06, + "loss": 4.0275, + "step": 12790 + }, + { + "epoch": 0.3750091554969604, + "grad_norm": 12.783100128173828, + "learning_rate": 9.765363841987896e-06, + "loss": 4.0243, + "step": 12800 + }, + { + "epoch": 0.3750091554969604, + "eval_bleu": 0.3037291170968875, + "eval_cap_loss": 1.055818796157837, + "eval_con_loss": 1.616769790649414, + "eval_loss": 4.289358139038086, + "step": 12800 + }, + { + "epoch": 0.3750091554969604, + "eval_bleu": 0.3037291170968875, + "eval_cap_loss": 1.055818796157837, + 
"eval_con_loss": 1.616769790649414, + "eval_loss": 4.289358139038086, + "eval_runtime": 53.9671, + "eval_samples_per_second": 370.596, + "eval_steps_per_second": 0.371, + "step": 12800 + }, + { + "epoch": 0.3753021313996924, + "grad_norm": 12.354119300842285, + "learning_rate": 9.764884822128928e-06, + "loss": 4.0252, + "step": 12810 + }, + { + "epoch": 0.3755951073024244, + "grad_norm": 14.201190948486328, + "learning_rate": 9.764405325571502e-06, + "loss": 4.0684, + "step": 12820 + }, + { + "epoch": 0.3758880832051564, + "grad_norm": 13.605002403259277, + "learning_rate": 9.763925352363588e-06, + "loss": 4.0376, + "step": 12830 + }, + { + "epoch": 0.37618105910788835, + "grad_norm": 12.753727912902832, + "learning_rate": 9.763444902553208e-06, + "loss": 4.0295, + "step": 12840 + }, + { + "epoch": 0.3764740350106204, + "grad_norm": 14.643891334533691, + "learning_rate": 9.762963976188424e-06, + "loss": 4.0327, + "step": 12850 + }, + { + "epoch": 0.37676701091335235, + "grad_norm": 14.399361610412598, + "learning_rate": 9.762482573317353e-06, + "loss": 4.04, + "step": 12860 + }, + { + "epoch": 0.3770599868160844, + "grad_norm": 15.145318984985352, + "learning_rate": 9.762000693988157e-06, + "loss": 4.0371, + "step": 12870 + }, + { + "epoch": 0.37735296271881635, + "grad_norm": 13.717280387878418, + "learning_rate": 9.76151833824904e-06, + "loss": 4.0393, + "step": 12880 + }, + { + "epoch": 0.3776459386215484, + "grad_norm": 11.951484680175781, + "learning_rate": 9.761035506148264e-06, + "loss": 4.0509, + "step": 12890 + }, + { + "epoch": 0.37793891452428036, + "grad_norm": 13.567414283752441, + "learning_rate": 9.760552197734132e-06, + "loss": 4.0409, + "step": 12900 + }, + { + "epoch": 0.3782318904270124, + "grad_norm": 12.899617195129395, + "learning_rate": 9.760068413054997e-06, + "loss": 4.0208, + "step": 12910 + }, + { + "epoch": 0.37852486632974436, + "grad_norm": 13.079622268676758, + "learning_rate": 9.759584152159256e-06, + "loss": 4.0335, + "step": 12920 
+ }, + { + "epoch": 0.3788178422324764, + "grad_norm": 12.561617851257324, + "learning_rate": 9.75909941509536e-06, + "loss": 4.0169, + "step": 12930 + }, + { + "epoch": 0.37911081813520836, + "grad_norm": 12.922713279724121, + "learning_rate": 9.7586142019118e-06, + "loss": 4.0159, + "step": 12940 + }, + { + "epoch": 0.3794037940379404, + "grad_norm": 13.962514877319336, + "learning_rate": 9.758128512657124e-06, + "loss": 4.0097, + "step": 12950 + }, + { + "epoch": 0.37969676994067236, + "grad_norm": 13.77755069732666, + "learning_rate": 9.757642347379917e-06, + "loss": 4.0294, + "step": 12960 + }, + { + "epoch": 0.3799897458434044, + "grad_norm": 13.663959503173828, + "learning_rate": 9.75715570612882e-06, + "loss": 4.0395, + "step": 12970 + }, + { + "epoch": 0.38028272174613637, + "grad_norm": 13.678473472595215, + "learning_rate": 9.756668588952518e-06, + "loss": 4.0398, + "step": 12980 + }, + { + "epoch": 0.3805756976488684, + "grad_norm": 13.910072326660156, + "learning_rate": 9.756180995899745e-06, + "loss": 4.0524, + "step": 12990 + }, + { + "epoch": 0.38086867355160037, + "grad_norm": 12.315196990966797, + "learning_rate": 9.755692927019278e-06, + "loss": 4.0196, + "step": 13000 + }, + { + "epoch": 0.3811616494543324, + "grad_norm": 12.491225242614746, + "learning_rate": 9.755204382359951e-06, + "loss": 4.008, + "step": 13010 + }, + { + "epoch": 0.38145462535706437, + "grad_norm": 14.589888572692871, + "learning_rate": 9.754715361970635e-06, + "loss": 4.018, + "step": 13020 + }, + { + "epoch": 0.3817476012597964, + "grad_norm": 11.454557418823242, + "learning_rate": 9.754225865900258e-06, + "loss": 4.0418, + "step": 13030 + }, + { + "epoch": 0.38204057716252837, + "grad_norm": 14.500134468078613, + "learning_rate": 9.753735894197787e-06, + "loss": 4.014, + "step": 13040 + }, + { + "epoch": 0.3823335530652604, + "grad_norm": 13.424273490905762, + "learning_rate": 9.753245446912243e-06, + "loss": 3.9997, + "step": 13050 + }, + { + "epoch": 
0.3826265289679924, + "grad_norm": 14.233434677124023, + "learning_rate": 9.752754524092693e-06, + "loss": 4.0308, + "step": 13060 + }, + { + "epoch": 0.3829195048707244, + "grad_norm": 13.827391624450684, + "learning_rate": 9.75226312578825e-06, + "loss": 4.0443, + "step": 13070 + }, + { + "epoch": 0.3832124807734564, + "grad_norm": 13.50603199005127, + "learning_rate": 9.751771252048075e-06, + "loss": 4.0149, + "step": 13080 + }, + { + "epoch": 0.3835054566761884, + "grad_norm": 13.717662811279297, + "learning_rate": 9.751278902921378e-06, + "loss": 4.0163, + "step": 13090 + }, + { + "epoch": 0.3837984325789204, + "grad_norm": 12.70270824432373, + "learning_rate": 9.750786078457414e-06, + "loss": 4.0287, + "step": 13100 + }, + { + "epoch": 0.3840914084816524, + "grad_norm": 14.398499488830566, + "learning_rate": 9.75029277870549e-06, + "loss": 4.0224, + "step": 13110 + }, + { + "epoch": 0.3843843843843844, + "grad_norm": 13.074663162231445, + "learning_rate": 9.749799003714954e-06, + "loss": 4.0172, + "step": 13120 + }, + { + "epoch": 0.3846773602871164, + "grad_norm": 12.653932571411133, + "learning_rate": 9.749304753535209e-06, + "loss": 4.0323, + "step": 13130 + }, + { + "epoch": 0.3849703361898484, + "grad_norm": 13.735459327697754, + "learning_rate": 9.7488100282157e-06, + "loss": 3.9838, + "step": 13140 + }, + { + "epoch": 0.3852633120925804, + "grad_norm": 12.491053581237793, + "learning_rate": 9.74831482780592e-06, + "loss": 4.0251, + "step": 13150 + }, + { + "epoch": 0.3855562879953124, + "grad_norm": 11.977575302124023, + "learning_rate": 9.747819152355413e-06, + "loss": 4.0087, + "step": 13160 + }, + { + "epoch": 0.38584926389804436, + "grad_norm": 13.17866039276123, + "learning_rate": 9.74732300191377e-06, + "loss": 4.0368, + "step": 13170 + }, + { + "epoch": 0.3861422398007764, + "grad_norm": 13.284700393676758, + "learning_rate": 9.746826376530623e-06, + "loss": 4.0173, + "step": 13180 + }, + { + "epoch": 0.38643521570350836, + "grad_norm": 
12.339211463928223, + "learning_rate": 9.746329276255658e-06, + "loss": 4.0097, + "step": 13190 + }, + { + "epoch": 0.3867281916062404, + "grad_norm": 11.536840438842773, + "learning_rate": 9.745831701138609e-06, + "loss": 4.0039, + "step": 13200 + }, + { + "epoch": 0.38702116750897236, + "grad_norm": 11.838342666625977, + "learning_rate": 9.745333651229255e-06, + "loss": 3.9967, + "step": 13210 + }, + { + "epoch": 0.3873141434117044, + "grad_norm": 12.201598167419434, + "learning_rate": 9.744835126577422e-06, + "loss": 3.9861, + "step": 13220 + }, + { + "epoch": 0.38760711931443637, + "grad_norm": 12.539883613586426, + "learning_rate": 9.744336127232984e-06, + "loss": 4.0218, + "step": 13230 + }, + { + "epoch": 0.3879000952171684, + "grad_norm": 12.068538665771484, + "learning_rate": 9.743836653245865e-06, + "loss": 4.0046, + "step": 13240 + }, + { + "epoch": 0.38819307111990037, + "grad_norm": 13.35682487487793, + "learning_rate": 9.743336704666032e-06, + "loss": 3.9907, + "step": 13250 + }, + { + "epoch": 0.3884860470226324, + "grad_norm": 13.914070129394531, + "learning_rate": 9.742836281543504e-06, + "loss": 3.9855, + "step": 13260 + }, + { + "epoch": 0.38877902292536437, + "grad_norm": 13.104236602783203, + "learning_rate": 9.742335383928343e-06, + "loss": 3.9866, + "step": 13270 + }, + { + "epoch": 0.3890719988280964, + "grad_norm": 11.965299606323242, + "learning_rate": 9.741834011870664e-06, + "loss": 3.9804, + "step": 13280 + }, + { + "epoch": 0.3893649747308284, + "grad_norm": 12.189838409423828, + "learning_rate": 9.741332165420622e-06, + "loss": 4.0063, + "step": 13290 + }, + { + "epoch": 0.3896579506335604, + "grad_norm": 12.702860832214355, + "learning_rate": 9.740829844628428e-06, + "loss": 3.9808, + "step": 13300 + }, + { + "epoch": 0.3899509265362924, + "grad_norm": 12.524924278259277, + "learning_rate": 9.740327049544334e-06, + "loss": 4.0016, + "step": 13310 + }, + { + "epoch": 0.3900095217168388, + "eval_bleu": 0.3068043926132573, + 
"eval_cap_loss": 1.050140380859375, + "eval_con_loss": 1.597171664237976, + "eval_loss": 4.244483947753906, + "step": 13312 + }, + { + "epoch": 0.3900095217168388, + "eval_bleu": 0.3068043926132573, + "eval_cap_loss": 1.050140380859375, + "eval_con_loss": 1.597171664237976, + "eval_loss": 4.244483947753906, + "eval_runtime": 57.3616, + "eval_samples_per_second": 348.665, + "eval_steps_per_second": 0.349, + "step": 13312 + }, + { + "epoch": 0.3902439024390244, + "grad_norm": 12.767597198486328, + "learning_rate": 9.739823780218642e-06, + "loss": 4.0081, + "step": 13320 + }, + { + "epoch": 0.3905368783417564, + "grad_norm": 12.556464195251465, + "learning_rate": 9.7393200367017e-06, + "loss": 4.0317, + "step": 13330 + }, + { + "epoch": 0.3908298542444884, + "grad_norm": 12.900829315185547, + "learning_rate": 9.738815819043907e-06, + "loss": 4.0064, + "step": 13340 + }, + { + "epoch": 0.3911228301472204, + "grad_norm": 12.538851737976074, + "learning_rate": 9.738311127295706e-06, + "loss": 3.9958, + "step": 13350 + }, + { + "epoch": 0.3914158060499524, + "grad_norm": 14.001187324523926, + "learning_rate": 9.737805961507587e-06, + "loss": 4.016, + "step": 13360 + }, + { + "epoch": 0.3917087819526844, + "grad_norm": 13.8721342086792, + "learning_rate": 9.737300321730092e-06, + "loss": 4.0125, + "step": 13370 + }, + { + "epoch": 0.3920017578554164, + "grad_norm": 14.864441871643066, + "learning_rate": 9.736794208013804e-06, + "loss": 4.0058, + "step": 13380 + }, + { + "epoch": 0.3922947337581484, + "grad_norm": 14.432720184326172, + "learning_rate": 9.736287620409356e-06, + "loss": 4.0084, + "step": 13390 + }, + { + "epoch": 0.3925877096608804, + "grad_norm": 13.715950012207031, + "learning_rate": 9.735780558967434e-06, + "loss": 4.0141, + "step": 13400 + }, + { + "epoch": 0.3928806855636124, + "grad_norm": 13.522075653076172, + "learning_rate": 9.735273023738763e-06, + "loss": 3.9898, + "step": 13410 + }, + { + "epoch": 0.3931736614663444, + "grad_norm": 
12.504051208496094, + "learning_rate": 9.73476501477412e-06, + "loss": 3.9857, + "step": 13420 + }, + { + "epoch": 0.3934666373690764, + "grad_norm": 15.09616756439209, + "learning_rate": 9.734256532124326e-06, + "loss": 4.0076, + "step": 13430 + }, + { + "epoch": 0.3937596132718084, + "grad_norm": 14.942225456237793, + "learning_rate": 9.733747575840253e-06, + "loss": 4.0042, + "step": 13440 + }, + { + "epoch": 0.3940525891745404, + "grad_norm": 13.044072151184082, + "learning_rate": 9.73323814597282e-06, + "loss": 3.9926, + "step": 13450 + }, + { + "epoch": 0.3943455650772724, + "grad_norm": 13.563177108764648, + "learning_rate": 9.732728242572993e-06, + "loss": 3.9898, + "step": 13460 + }, + { + "epoch": 0.3946385409800044, + "grad_norm": 14.82840347290039, + "learning_rate": 9.732217865691782e-06, + "loss": 4.0099, + "step": 13470 + }, + { + "epoch": 0.3949315168827364, + "grad_norm": 13.946556091308594, + "learning_rate": 9.73170701538025e-06, + "loss": 3.9789, + "step": 13480 + }, + { + "epoch": 0.3952244927854684, + "grad_norm": 14.418631553649902, + "learning_rate": 9.731195691689504e-06, + "loss": 3.9824, + "step": 13490 + }, + { + "epoch": 0.39551746868820037, + "grad_norm": 12.178850173950195, + "learning_rate": 9.730683894670697e-06, + "loss": 3.9653, + "step": 13500 + }, + { + "epoch": 0.3958104445909324, + "grad_norm": 13.476447105407715, + "learning_rate": 9.730171624375032e-06, + "loss": 3.9951, + "step": 13510 + }, + { + "epoch": 0.39610342049366437, + "grad_norm": 13.152355194091797, + "learning_rate": 9.729658880853761e-06, + "loss": 3.9973, + "step": 13520 + }, + { + "epoch": 0.3963963963963964, + "grad_norm": 11.438815116882324, + "learning_rate": 9.729145664158178e-06, + "loss": 3.9944, + "step": 13530 + }, + { + "epoch": 0.3966893722991284, + "grad_norm": 13.173835754394531, + "learning_rate": 9.728631974339628e-06, + "loss": 3.9761, + "step": 13540 + }, + { + "epoch": 0.3969823482018604, + "grad_norm": 12.776753425598145, + "learning_rate": 
9.728117811449502e-06, + "loss": 3.9874, + "step": 13550 + }, + { + "epoch": 0.3972753241045924, + "grad_norm": 12.517105102539062, + "learning_rate": 9.727603175539242e-06, + "loss": 4.0108, + "step": 13560 + }, + { + "epoch": 0.3975683000073244, + "grad_norm": 12.524224281311035, + "learning_rate": 9.72708806666033e-06, + "loss": 3.9833, + "step": 13570 + }, + { + "epoch": 0.3978612759100564, + "grad_norm": 13.036337852478027, + "learning_rate": 9.726572484864305e-06, + "loss": 3.9899, + "step": 13580 + }, + { + "epoch": 0.3981542518127884, + "grad_norm": 12.69775104522705, + "learning_rate": 9.726056430202743e-06, + "loss": 3.9564, + "step": 13590 + }, + { + "epoch": 0.3984472277155204, + "grad_norm": 13.930957794189453, + "learning_rate": 9.725539902727273e-06, + "loss": 3.9672, + "step": 13600 + }, + { + "epoch": 0.3987402036182524, + "grad_norm": 13.399113655090332, + "learning_rate": 9.725022902489573e-06, + "loss": 3.9654, + "step": 13610 + }, + { + "epoch": 0.3990331795209844, + "grad_norm": 12.152693748474121, + "learning_rate": 9.724505429541362e-06, + "loss": 3.9774, + "step": 13620 + }, + { + "epoch": 0.3993261554237164, + "grad_norm": 12.852349281311035, + "learning_rate": 9.723987483934414e-06, + "loss": 3.9599, + "step": 13630 + }, + { + "epoch": 0.3996191313264484, + "grad_norm": 11.753015518188477, + "learning_rate": 9.723469065720546e-06, + "loss": 3.9715, + "step": 13640 + }, + { + "epoch": 0.3999121072291804, + "grad_norm": 13.074869155883789, + "learning_rate": 9.72295017495162e-06, + "loss": 3.9895, + "step": 13650 + }, + { + "epoch": 0.4002050831319124, + "grad_norm": 12.691691398620605, + "learning_rate": 9.722430811679548e-06, + "loss": 3.9885, + "step": 13660 + }, + { + "epoch": 0.4004980590346444, + "grad_norm": 14.116985321044922, + "learning_rate": 9.721910975956291e-06, + "loss": 3.9903, + "step": 13670 + }, + { + "epoch": 0.4007910349373764, + "grad_norm": 14.289801597595215, + "learning_rate": 9.721390667833856e-06, + "loss": 
3.9807, + "step": 13680 + }, + { + "epoch": 0.4010840108401084, + "grad_norm": 13.264803886413574, + "learning_rate": 9.720869887364295e-06, + "loss": 3.965, + "step": 13690 + }, + { + "epoch": 0.4013769867428404, + "grad_norm": 13.869277000427246, + "learning_rate": 9.72034863459971e-06, + "loss": 4.016, + "step": 13700 + }, + { + "epoch": 0.4016699626455724, + "grad_norm": 12.615703582763672, + "learning_rate": 9.719826909592249e-06, + "loss": 3.9753, + "step": 13710 + }, + { + "epoch": 0.4019629385483044, + "grad_norm": 12.570981979370117, + "learning_rate": 9.719304712394109e-06, + "loss": 3.9872, + "step": 13720 + }, + { + "epoch": 0.4022559144510364, + "grad_norm": 12.86910629272461, + "learning_rate": 9.718782043057529e-06, + "loss": 3.9637, + "step": 13730 + }, + { + "epoch": 0.4025488903537684, + "grad_norm": 11.601896286010742, + "learning_rate": 9.718258901634802e-06, + "loss": 4.007, + "step": 13740 + }, + { + "epoch": 0.4028418662565004, + "grad_norm": 14.902052879333496, + "learning_rate": 9.717735288178263e-06, + "loss": 3.9887, + "step": 13750 + }, + { + "epoch": 0.4031348421592324, + "grad_norm": 14.992273330688477, + "learning_rate": 9.7172112027403e-06, + "loss": 3.9668, + "step": 13760 + }, + { + "epoch": 0.4034278180619644, + "grad_norm": 12.984325408935547, + "learning_rate": 9.716686645373342e-06, + "loss": 3.982, + "step": 13770 + }, + { + "epoch": 0.4037207939646964, + "grad_norm": 13.332365989685059, + "learning_rate": 9.716161616129867e-06, + "loss": 3.9637, + "step": 13780 + }, + { + "epoch": 0.40401376986742843, + "grad_norm": 14.047605514526367, + "learning_rate": 9.715636115062403e-06, + "loss": 3.9812, + "step": 13790 + }, + { + "epoch": 0.4043067457701604, + "grad_norm": 13.957067489624023, + "learning_rate": 9.715110142223522e-06, + "loss": 3.9591, + "step": 13800 + }, + { + "epoch": 0.40459972167289243, + "grad_norm": 13.627537727355957, + "learning_rate": 9.714583697665846e-06, + "loss": 3.9758, + "step": 13810 + }, + { + 
"epoch": 0.4048926975756244, + "grad_norm": 14.328856468200684, + "learning_rate": 9.714056781442042e-06, + "loss": 3.9763, + "step": 13820 + }, + { + "epoch": 0.4050098879367172, + "eval_bleu": 0.30644337469940386, + "eval_cap_loss": 1.0484076738357544, + "eval_con_loss": 1.5799589157104492, + "eval_loss": 4.208325386047363, + "step": 13824 + }, + { + "epoch": 0.4050098879367172, + "eval_bleu": 0.30644337469940386, + "eval_cap_loss": 1.0484076738357544, + "eval_con_loss": 1.5799589157104492, + "eval_loss": 4.208325386047363, + "eval_runtime": 56.649, + "eval_samples_per_second": 353.051, + "eval_steps_per_second": 0.353, + "step": 13824 + }, + { + "epoch": 0.4051856734783564, + "grad_norm": 12.98856258392334, + "learning_rate": 9.713529393604824e-06, + "loss": 3.9882, + "step": 13830 + }, + { + "epoch": 0.4054786493810884, + "grad_norm": 14.523873329162598, + "learning_rate": 9.713001534206954e-06, + "loss": 3.9609, + "step": 13840 + }, + { + "epoch": 0.4057716252838204, + "grad_norm": 13.036930084228516, + "learning_rate": 9.712473203301243e-06, + "loss": 3.9456, + "step": 13850 + }, + { + "epoch": 0.4060646011865524, + "grad_norm": 14.863533020019531, + "learning_rate": 9.711944400940545e-06, + "loss": 3.9738, + "step": 13860 + }, + { + "epoch": 0.4063575770892844, + "grad_norm": 12.75907039642334, + "learning_rate": 9.711415127177766e-06, + "loss": 3.9657, + "step": 13870 + }, + { + "epoch": 0.4066505529920164, + "grad_norm": 11.190326690673828, + "learning_rate": 9.710885382065854e-06, + "loss": 3.9767, + "step": 13880 + }, + { + "epoch": 0.4069435288947484, + "grad_norm": 12.59727668762207, + "learning_rate": 9.71035516565781e-06, + "loss": 3.9708, + "step": 13890 + }, + { + "epoch": 0.4072365047974804, + "grad_norm": 12.060478210449219, + "learning_rate": 9.709824478006674e-06, + "loss": 3.9574, + "step": 13900 + }, + { + "epoch": 0.4075294807002124, + "grad_norm": 12.728986740112305, + "learning_rate": 9.709293319165544e-06, + "loss": 3.9542, + "step": 
13910 + }, + { + "epoch": 0.4078224566029444, + "grad_norm": 12.962320327758789, + "learning_rate": 9.708761689187557e-06, + "loss": 3.9767, + "step": 13920 + }, + { + "epoch": 0.4081154325056764, + "grad_norm": 11.967059135437012, + "learning_rate": 9.708229588125898e-06, + "loss": 3.964, + "step": 13930 + }, + { + "epoch": 0.4084084084084084, + "grad_norm": 12.3428955078125, + "learning_rate": 9.707697016033802e-06, + "loss": 3.967, + "step": 13940 + }, + { + "epoch": 0.4087013843111404, + "grad_norm": 12.987447738647461, + "learning_rate": 9.707163972964549e-06, + "loss": 3.9608, + "step": 13950 + }, + { + "epoch": 0.4089943602138724, + "grad_norm": 13.267395973205566, + "learning_rate": 9.706630458971469e-06, + "loss": 3.9793, + "step": 13960 + }, + { + "epoch": 0.4092873361166044, + "grad_norm": 13.640851020812988, + "learning_rate": 9.706096474107933e-06, + "loss": 3.9429, + "step": 13970 + }, + { + "epoch": 0.4095803120193364, + "grad_norm": 14.100242614746094, + "learning_rate": 9.705562018427366e-06, + "loss": 3.9816, + "step": 13980 + }, + { + "epoch": 0.4098732879220684, + "grad_norm": 13.45088005065918, + "learning_rate": 9.705027091983235e-06, + "loss": 3.9431, + "step": 13990 + }, + { + "epoch": 0.4101662638248004, + "grad_norm": 12.776249885559082, + "learning_rate": 9.704491694829059e-06, + "loss": 3.9871, + "step": 14000 + }, + { + "epoch": 0.4104592397275324, + "grad_norm": 13.447357177734375, + "learning_rate": 9.703955827018398e-06, + "loss": 3.962, + "step": 14010 + }, + { + "epoch": 0.41075221563026443, + "grad_norm": 12.1921968460083, + "learning_rate": 9.703419488604867e-06, + "loss": 4.0019, + "step": 14020 + }, + { + "epoch": 0.4110451915329964, + "grad_norm": 12.24876880645752, + "learning_rate": 9.702882679642119e-06, + "loss": 3.9684, + "step": 14030 + }, + { + "epoch": 0.41133816743572843, + "grad_norm": 11.933963775634766, + "learning_rate": 9.702345400183858e-06, + "loss": 3.9549, + "step": 14040 + }, + { + "epoch": 
0.4116311433384604, + "grad_norm": 12.713263511657715, + "learning_rate": 9.70180765028384e-06, + "loss": 3.9219, + "step": 14050 + }, + { + "epoch": 0.41192411924119243, + "grad_norm": 13.877272605895996, + "learning_rate": 9.70126942999586e-06, + "loss": 3.9649, + "step": 14060 + }, + { + "epoch": 0.4122170951439244, + "grad_norm": 12.693418502807617, + "learning_rate": 9.700730739373768e-06, + "loss": 3.9711, + "step": 14070 + }, + { + "epoch": 0.41251007104665643, + "grad_norm": 13.593183517456055, + "learning_rate": 9.700191578471452e-06, + "loss": 3.9436, + "step": 14080 + }, + { + "epoch": 0.4128030469493884, + "grad_norm": 13.615533828735352, + "learning_rate": 9.699651947342853e-06, + "loss": 3.9693, + "step": 14090 + }, + { + "epoch": 0.41309602285212044, + "grad_norm": 14.209673881530762, + "learning_rate": 9.699111846041961e-06, + "loss": 3.9578, + "step": 14100 + }, + { + "epoch": 0.4133889987548524, + "grad_norm": 13.380576133728027, + "learning_rate": 9.698571274622806e-06, + "loss": 3.9407, + "step": 14110 + }, + { + "epoch": 0.41368197465758444, + "grad_norm": 14.012070655822754, + "learning_rate": 9.698030233139472e-06, + "loss": 3.9756, + "step": 14120 + }, + { + "epoch": 0.4139749505603164, + "grad_norm": 13.098217964172363, + "learning_rate": 9.697488721646085e-06, + "loss": 3.9575, + "step": 14130 + }, + { + "epoch": 0.41426792646304844, + "grad_norm": 14.087658882141113, + "learning_rate": 9.69694674019682e-06, + "loss": 3.9538, + "step": 14140 + }, + { + "epoch": 0.4145609023657804, + "grad_norm": 13.098031997680664, + "learning_rate": 9.6964042888459e-06, + "loss": 3.973, + "step": 14150 + }, + { + "epoch": 0.4148538782685124, + "grad_norm": 12.544844627380371, + "learning_rate": 9.695861367647595e-06, + "loss": 3.9769, + "step": 14160 + }, + { + "epoch": 0.4151468541712444, + "grad_norm": 12.84732723236084, + "learning_rate": 9.695317976656219e-06, + "loss": 3.9532, + "step": 14170 + }, + { + "epoch": 0.4154398300739764, + "grad_norm": 
14.097789764404297, + "learning_rate": 9.694774115926135e-06, + "loss": 3.9342, + "step": 14180 + }, + { + "epoch": 0.4157328059767084, + "grad_norm": 12.757673263549805, + "learning_rate": 9.694229785511754e-06, + "loss": 3.9779, + "step": 14190 + }, + { + "epoch": 0.4160257818794404, + "grad_norm": 13.576018333435059, + "learning_rate": 9.693684985467533e-06, + "loss": 3.95, + "step": 14200 + }, + { + "epoch": 0.4163187577821724, + "grad_norm": 12.345698356628418, + "learning_rate": 9.693139715847976e-06, + "loss": 3.9325, + "step": 14210 + }, + { + "epoch": 0.4166117336849044, + "grad_norm": 12.660469055175781, + "learning_rate": 9.692593976707634e-06, + "loss": 3.9668, + "step": 14220 + }, + { + "epoch": 0.4169047095876364, + "grad_norm": 13.694536209106445, + "learning_rate": 9.692047768101105e-06, + "loss": 3.9442, + "step": 14230 + }, + { + "epoch": 0.4171976854903684, + "grad_norm": 14.081439971923828, + "learning_rate": 9.691501090083034e-06, + "loss": 3.9711, + "step": 14240 + }, + { + "epoch": 0.4174906613931004, + "grad_norm": 14.058470726013184, + "learning_rate": 9.690953942708113e-06, + "loss": 3.9282, + "step": 14250 + }, + { + "epoch": 0.4177836372958324, + "grad_norm": 12.327600479125977, + "learning_rate": 9.690406326031078e-06, + "loss": 3.9612, + "step": 14260 + }, + { + "epoch": 0.41807661319856443, + "grad_norm": 12.447441101074219, + "learning_rate": 9.68985824010672e-06, + "loss": 3.94, + "step": 14270 + }, + { + "epoch": 0.4183695891012964, + "grad_norm": 12.601686477661133, + "learning_rate": 9.689309684989868e-06, + "loss": 3.9248, + "step": 14280 + }, + { + "epoch": 0.41866256500402843, + "grad_norm": 11.888383865356445, + "learning_rate": 9.688760660735403e-06, + "loss": 3.9613, + "step": 14290 + }, + { + "epoch": 0.4189555409067604, + "grad_norm": 13.597185134887695, + "learning_rate": 9.688211167398252e-06, + "loss": 3.9442, + "step": 14300 + }, + { + "epoch": 0.41924851680949243, + "grad_norm": 13.086957931518555, + "learning_rate": 
9.687661205033387e-06, + "loss": 3.9496, + "step": 14310 + }, + { + "epoch": 0.4195414927122244, + "grad_norm": 12.359102249145508, + "learning_rate": 9.68711077369583e-06, + "loss": 3.943, + "step": 14320 + }, + { + "epoch": 0.41983446861495644, + "grad_norm": 13.263177871704102, + "learning_rate": 9.686559873440647e-06, + "loss": 3.9565, + "step": 14330 + }, + { + "epoch": 0.4200102541565956, + "eval_bleu": 0.3088076899768883, + "eval_cap_loss": 1.043797254562378, + "eval_con_loss": 1.5695452690124512, + "eval_loss": 4.182887554168701, + "step": 14336 + }, + { + "epoch": 0.4200102541565956, + "eval_bleu": 0.3088076899768883, + "eval_cap_loss": 1.043797254562378, + "eval_con_loss": 1.5695452690124512, + "eval_loss": 4.182887554168701, + "eval_runtime": 54.881, + "eval_samples_per_second": 364.425, + "eval_steps_per_second": 0.364, + "step": 14336 + }, + { + "epoch": 0.4201274445176884, + "grad_norm": 13.863434791564941, + "learning_rate": 9.686008504322953e-06, + "loss": 3.9339, + "step": 14340 + }, + { + "epoch": 0.42042042042042044, + "grad_norm": 14.271584510803223, + "learning_rate": 9.68545666639791e-06, + "loss": 3.9428, + "step": 14350 + }, + { + "epoch": 0.4207133963231524, + "grad_norm": 11.80805778503418, + "learning_rate": 9.684904359720724e-06, + "loss": 3.9312, + "step": 14360 + }, + { + "epoch": 0.42100637222588444, + "grad_norm": 13.337695121765137, + "learning_rate": 9.684351584346652e-06, + "loss": 3.9284, + "step": 14370 + }, + { + "epoch": 0.4212993481286164, + "grad_norm": 13.161843299865723, + "learning_rate": 9.683798340330996e-06, + "loss": 3.9541, + "step": 14380 + }, + { + "epoch": 0.42159232403134844, + "grad_norm": 12.166206359863281, + "learning_rate": 9.683244627729105e-06, + "loss": 3.9541, + "step": 14390 + }, + { + "epoch": 0.4218852999340804, + "grad_norm": 11.825681686401367, + "learning_rate": 9.682690446596372e-06, + "loss": 3.9342, + "step": 14400 + }, + { + "epoch": 0.42217827583681244, + "grad_norm": 13.017210006713867, + 
"learning_rate": 9.68213579698824e-06, + "loss": 3.9268, + "step": 14410 + }, + { + "epoch": 0.4224712517395444, + "grad_norm": 12.138360023498535, + "learning_rate": 9.681580678960204e-06, + "loss": 3.9419, + "step": 14420 + }, + { + "epoch": 0.42276422764227645, + "grad_norm": 13.356725692749023, + "learning_rate": 9.681025092567793e-06, + "loss": 3.9419, + "step": 14430 + }, + { + "epoch": 0.4230572035450084, + "grad_norm": 12.994601249694824, + "learning_rate": 9.680469037866593e-06, + "loss": 3.9214, + "step": 14440 + }, + { + "epoch": 0.42335017944774045, + "grad_norm": 14.457788467407227, + "learning_rate": 9.679912514912235e-06, + "loss": 3.9319, + "step": 14450 + }, + { + "epoch": 0.4236431553504724, + "grad_norm": 12.277192115783691, + "learning_rate": 9.679355523760395e-06, + "loss": 3.9221, + "step": 14460 + }, + { + "epoch": 0.42393613125320445, + "grad_norm": 11.69019889831543, + "learning_rate": 9.678798064466794e-06, + "loss": 3.9359, + "step": 14470 + }, + { + "epoch": 0.4242291071559364, + "grad_norm": 12.921104431152344, + "learning_rate": 9.678240137087206e-06, + "loss": 3.9302, + "step": 14480 + }, + { + "epoch": 0.4245220830586684, + "grad_norm": 11.972028732299805, + "learning_rate": 9.67768174167745e-06, + "loss": 3.9386, + "step": 14490 + }, + { + "epoch": 0.4248150589614004, + "grad_norm": 13.023443222045898, + "learning_rate": 9.677122878293385e-06, + "loss": 3.9005, + "step": 14500 + }, + { + "epoch": 0.4251080348641324, + "grad_norm": 12.876869201660156, + "learning_rate": 9.676563546990926e-06, + "loss": 3.9585, + "step": 14510 + }, + { + "epoch": 0.42540101076686443, + "grad_norm": 12.680730819702148, + "learning_rate": 9.676003747826028e-06, + "loss": 3.9431, + "step": 14520 + }, + { + "epoch": 0.4256939866695964, + "grad_norm": 13.523950576782227, + "learning_rate": 9.675443480854697e-06, + "loss": 3.9174, + "step": 14530 + }, + { + "epoch": 0.42598696257232843, + "grad_norm": 13.37937068939209, + "learning_rate": 
9.674882746132985e-06, + "loss": 3.9346, + "step": 14540 + }, + { + "epoch": 0.4262799384750604, + "grad_norm": 14.545207023620605, + "learning_rate": 9.674321543716989e-06, + "loss": 3.9394, + "step": 14550 + }, + { + "epoch": 0.42657291437779243, + "grad_norm": 12.317680358886719, + "learning_rate": 9.673759873662853e-06, + "loss": 3.9265, + "step": 14560 + }, + { + "epoch": 0.4268658902805244, + "grad_norm": 13.452386856079102, + "learning_rate": 9.673197736026772e-06, + "loss": 3.9032, + "step": 14570 + }, + { + "epoch": 0.42715886618325644, + "grad_norm": 12.942198753356934, + "learning_rate": 9.672635130864983e-06, + "loss": 3.9326, + "step": 14580 + }, + { + "epoch": 0.4274518420859884, + "grad_norm": 12.418012619018555, + "learning_rate": 9.672072058233768e-06, + "loss": 3.8998, + "step": 14590 + }, + { + "epoch": 0.42774481798872044, + "grad_norm": 11.917840957641602, + "learning_rate": 9.671508518189464e-06, + "loss": 3.91, + "step": 14600 + }, + { + "epoch": 0.4280377938914524, + "grad_norm": 13.576249122619629, + "learning_rate": 9.670944510788448e-06, + "loss": 3.9455, + "step": 14610 + }, + { + "epoch": 0.42833076979418444, + "grad_norm": 13.030149459838867, + "learning_rate": 9.670380036087146e-06, + "loss": 3.9458, + "step": 14620 + }, + { + "epoch": 0.4286237456969164, + "grad_norm": 13.074870109558105, + "learning_rate": 9.669815094142028e-06, + "loss": 3.91, + "step": 14630 + }, + { + "epoch": 0.42891672159964844, + "grad_norm": 12.572891235351562, + "learning_rate": 9.669249685009617e-06, + "loss": 3.8985, + "step": 14640 + }, + { + "epoch": 0.4292096975023804, + "grad_norm": 14.034156799316406, + "learning_rate": 9.668683808746475e-06, + "loss": 3.8992, + "step": 14650 + }, + { + "epoch": 0.42950267340511245, + "grad_norm": 12.868736267089844, + "learning_rate": 9.668117465409217e-06, + "loss": 3.9156, + "step": 14660 + }, + { + "epoch": 0.4297956493078444, + "grad_norm": 12.679798126220703, + "learning_rate": 9.6675506550545e-06, + "loss": 
3.9169, + "step": 14670 + }, + { + "epoch": 0.43008862521057645, + "grad_norm": 12.03018569946289, + "learning_rate": 9.666983377739035e-06, + "loss": 3.9755, + "step": 14680 + }, + { + "epoch": 0.4303816011133084, + "grad_norm": 13.368114471435547, + "learning_rate": 9.666415633519567e-06, + "loss": 3.9205, + "step": 14690 + }, + { + "epoch": 0.43067457701604045, + "grad_norm": 13.09943962097168, + "learning_rate": 9.665847422452904e-06, + "loss": 3.929, + "step": 14700 + }, + { + "epoch": 0.4309675529187724, + "grad_norm": 11.434608459472656, + "learning_rate": 9.665278744595885e-06, + "loss": 3.9062, + "step": 14710 + }, + { + "epoch": 0.43126052882150445, + "grad_norm": 13.88577938079834, + "learning_rate": 9.664709600005408e-06, + "loss": 3.9388, + "step": 14720 + }, + { + "epoch": 0.4315535047242364, + "grad_norm": 12.617305755615234, + "learning_rate": 9.66413998873841e-06, + "loss": 3.891, + "step": 14730 + }, + { + "epoch": 0.43184648062696845, + "grad_norm": 11.847375869750977, + "learning_rate": 9.663569910851876e-06, + "loss": 3.9141, + "step": 14740 + }, + { + "epoch": 0.4321394565297004, + "grad_norm": 13.348372459411621, + "learning_rate": 9.66299936640284e-06, + "loss": 3.9104, + "step": 14750 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 13.971686363220215, + "learning_rate": 9.662428355448383e-06, + "loss": 3.9265, + "step": 14760 + }, + { + "epoch": 0.43272540833516443, + "grad_norm": 11.930741310119629, + "learning_rate": 9.661856878045632e-06, + "loss": 3.9415, + "step": 14770 + }, + { + "epoch": 0.43301838423789646, + "grad_norm": 13.450057029724121, + "learning_rate": 9.661284934251755e-06, + "loss": 3.915, + "step": 14780 + }, + { + "epoch": 0.43331136014062843, + "grad_norm": 14.307111740112305, + "learning_rate": 9.660712524123976e-06, + "loss": 3.909, + "step": 14790 + }, + { + "epoch": 0.43360433604336046, + "grad_norm": 12.021349906921387, + "learning_rate": 9.660139647719562e-06, + "loss": 3.8929, + "step": 14800 + }, + { + 
"epoch": 0.43389731194609243, + "grad_norm": 13.417936325073242, + "learning_rate": 9.659566305095821e-06, + "loss": 3.9133, + "step": 14810 + }, + { + "epoch": 0.4341902878488244, + "grad_norm": 13.58305549621582, + "learning_rate": 9.658992496310117e-06, + "loss": 3.9216, + "step": 14820 + }, + { + "epoch": 0.43448326375155644, + "grad_norm": 13.478666305541992, + "learning_rate": 9.658418221419854e-06, + "loss": 3.919, + "step": 14830 + }, + { + "epoch": 0.4347762396542884, + "grad_norm": 13.809708595275879, + "learning_rate": 9.657843480482487e-06, + "loss": 3.9097, + "step": 14840 + }, + { + "epoch": 0.43501062037647403, + "eval_bleu": 0.3106169434472754, + "eval_cap_loss": 1.0382108688354492, + "eval_con_loss": 1.5557363033294678, + "eval_loss": 4.149683475494385, + "step": 14848 + }, + { + "epoch": 0.43501062037647403, + "eval_bleu": 0.3106169434472754, + "eval_cap_loss": 1.0382108688354492, + "eval_con_loss": 1.5557363033294678, + "eval_loss": 4.149683475494385, + "eval_runtime": 55.0446, + "eval_samples_per_second": 363.342, + "eval_steps_per_second": 0.363, + "step": 14848 + }, + { + "epoch": 0.43506921555702044, + "grad_norm": 12.70842456817627, + "learning_rate": 9.657268273555512e-06, + "loss": 3.8976, + "step": 14850 + }, + { + "epoch": 0.4353621914597524, + "grad_norm": 13.588570594787598, + "learning_rate": 9.656692600696478e-06, + "loss": 3.9058, + "step": 14860 + }, + { + "epoch": 0.43565516736248444, + "grad_norm": 12.849590301513672, + "learning_rate": 9.656116461962975e-06, + "loss": 3.9379, + "step": 14870 + }, + { + "epoch": 0.4359481432652164, + "grad_norm": 14.257112503051758, + "learning_rate": 9.655539857412646e-06, + "loss": 3.9099, + "step": 14880 + }, + { + "epoch": 0.43624111916794844, + "grad_norm": 13.854668617248535, + "learning_rate": 9.654962787103173e-06, + "loss": 3.8956, + "step": 14890 + }, + { + "epoch": 0.4365340950706804, + "grad_norm": 13.019905090332031, + "learning_rate": 9.65438525109229e-06, + "loss": 3.9169, + 
"step": 14900 + }, + { + "epoch": 0.43682707097341245, + "grad_norm": 13.815832138061523, + "learning_rate": 9.653807249437777e-06, + "loss": 3.8969, + "step": 14910 + }, + { + "epoch": 0.4371200468761444, + "grad_norm": 12.468979835510254, + "learning_rate": 9.653228782197458e-06, + "loss": 3.9065, + "step": 14920 + }, + { + "epoch": 0.43741302277887645, + "grad_norm": 12.771622657775879, + "learning_rate": 9.652649849429207e-06, + "loss": 3.9081, + "step": 14930 + }, + { + "epoch": 0.4377059986816084, + "grad_norm": 13.454721450805664, + "learning_rate": 9.65207045119094e-06, + "loss": 3.9352, + "step": 14940 + }, + { + "epoch": 0.43799897458434045, + "grad_norm": 12.93968391418457, + "learning_rate": 9.651490587540626e-06, + "loss": 3.915, + "step": 14950 + }, + { + "epoch": 0.4382919504870724, + "grad_norm": 11.73256778717041, + "learning_rate": 9.650910258536275e-06, + "loss": 3.9295, + "step": 14960 + }, + { + "epoch": 0.43858492638980445, + "grad_norm": 12.408387184143066, + "learning_rate": 9.650329464235943e-06, + "loss": 3.8809, + "step": 14970 + }, + { + "epoch": 0.4388779022925364, + "grad_norm": 13.645265579223633, + "learning_rate": 9.649748204697741e-06, + "loss": 3.8962, + "step": 14980 + }, + { + "epoch": 0.43917087819526845, + "grad_norm": 12.235239028930664, + "learning_rate": 9.649166479979815e-06, + "loss": 3.8991, + "step": 14990 + }, + { + "epoch": 0.43946385409800043, + "grad_norm": 12.040310859680176, + "learning_rate": 9.648584290140366e-06, + "loss": 3.8924, + "step": 15000 + }, + { + "epoch": 0.43975683000073246, + "grad_norm": 11.626635551452637, + "learning_rate": 9.648001635237638e-06, + "loss": 3.9376, + "step": 15010 + }, + { + "epoch": 0.44004980590346443, + "grad_norm": 11.834822654724121, + "learning_rate": 9.647418515329922e-06, + "loss": 3.914, + "step": 15020 + }, + { + "epoch": 0.44034278180619646, + "grad_norm": 13.972037315368652, + "learning_rate": 9.646834930475556e-06, + "loss": 3.8872, + "step": 15030 + }, + { + 
"epoch": 0.44063575770892843, + "grad_norm": 12.878727912902832, + "learning_rate": 9.646250880732923e-06, + "loss": 3.905, + "step": 15040 + }, + { + "epoch": 0.44092873361166046, + "grad_norm": 13.736748695373535, + "learning_rate": 9.645666366160456e-06, + "loss": 3.9026, + "step": 15050 + }, + { + "epoch": 0.44122170951439244, + "grad_norm": 13.733244895935059, + "learning_rate": 9.645081386816628e-06, + "loss": 3.9082, + "step": 15060 + }, + { + "epoch": 0.44151468541712446, + "grad_norm": 14.363865852355957, + "learning_rate": 9.644495942759968e-06, + "loss": 3.8988, + "step": 15070 + }, + { + "epoch": 0.44180766131985644, + "grad_norm": 13.080283164978027, + "learning_rate": 9.643910034049044e-06, + "loss": 3.8603, + "step": 15080 + }, + { + "epoch": 0.44210063722258847, + "grad_norm": 12.457635879516602, + "learning_rate": 9.643323660742471e-06, + "loss": 3.8688, + "step": 15090 + }, + { + "epoch": 0.44239361312532044, + "grad_norm": 12.717618942260742, + "learning_rate": 9.642736822898915e-06, + "loss": 3.8709, + "step": 15100 + }, + { + "epoch": 0.44268658902805247, + "grad_norm": 13.959824562072754, + "learning_rate": 9.642149520577084e-06, + "loss": 3.9152, + "step": 15110 + }, + { + "epoch": 0.44297956493078444, + "grad_norm": 12.388381004333496, + "learning_rate": 9.641561753835732e-06, + "loss": 3.9055, + "step": 15120 + }, + { + "epoch": 0.44327254083351647, + "grad_norm": 12.461764335632324, + "learning_rate": 9.640973522733667e-06, + "loss": 3.895, + "step": 15130 + }, + { + "epoch": 0.44356551673624844, + "grad_norm": 14.085334777832031, + "learning_rate": 9.640384827329734e-06, + "loss": 3.9092, + "step": 15140 + }, + { + "epoch": 0.4438584926389804, + "grad_norm": 12.342877388000488, + "learning_rate": 9.63979566768283e-06, + "loss": 3.9038, + "step": 15150 + }, + { + "epoch": 0.44415146854171245, + "grad_norm": 14.182404518127441, + "learning_rate": 9.639206043851896e-06, + "loss": 3.8998, + "step": 15160 + }, + { + "epoch": 
0.4444444444444444, + "grad_norm": 13.512377738952637, + "learning_rate": 9.638615955895921e-06, + "loss": 3.8962, + "step": 15170 + }, + { + "epoch": 0.44473742034717645, + "grad_norm": 13.556227684020996, + "learning_rate": 9.638025403873939e-06, + "loss": 3.8846, + "step": 15180 + }, + { + "epoch": 0.4450303962499084, + "grad_norm": 13.055577278137207, + "learning_rate": 9.637434387845033e-06, + "loss": 3.902, + "step": 15190 + }, + { + "epoch": 0.44532337215264045, + "grad_norm": 12.79382610321045, + "learning_rate": 9.63684290786833e-06, + "loss": 3.8811, + "step": 15200 + }, + { + "epoch": 0.4456163480553724, + "grad_norm": 12.744906425476074, + "learning_rate": 9.636250964003e-06, + "loss": 3.8892, + "step": 15210 + }, + { + "epoch": 0.44590932395810445, + "grad_norm": 12.822798728942871, + "learning_rate": 9.63565855630827e-06, + "loss": 3.8795, + "step": 15220 + }, + { + "epoch": 0.4462022998608364, + "grad_norm": 12.35411548614502, + "learning_rate": 9.635065684843403e-06, + "loss": 3.9154, + "step": 15230 + }, + { + "epoch": 0.44649527576356846, + "grad_norm": 12.43931770324707, + "learning_rate": 9.634472349667712e-06, + "loss": 3.8898, + "step": 15240 + }, + { + "epoch": 0.44678825166630043, + "grad_norm": 13.19000244140625, + "learning_rate": 9.63387855084056e-06, + "loss": 3.8979, + "step": 15250 + }, + { + "epoch": 0.44708122756903246, + "grad_norm": 11.096620559692383, + "learning_rate": 9.63328428842135e-06, + "loss": 3.8855, + "step": 15260 + }, + { + "epoch": 0.44737420347176443, + "grad_norm": 12.695682525634766, + "learning_rate": 9.632689562469535e-06, + "loss": 3.8785, + "step": 15270 + }, + { + "epoch": 0.44766717937449646, + "grad_norm": 12.943378448486328, + "learning_rate": 9.632094373044614e-06, + "loss": 3.8866, + "step": 15280 + }, + { + "epoch": 0.44796015527722843, + "grad_norm": 13.043712615966797, + "learning_rate": 9.631498720206132e-06, + "loss": 3.8771, + "step": 15290 + }, + { + "epoch": 0.44825313117996046, + "grad_norm": 
12.91523265838623, + "learning_rate": 9.63090260401368e-06, + "loss": 3.9106, + "step": 15300 + }, + { + "epoch": 0.44854610708269244, + "grad_norm": 13.029062271118164, + "learning_rate": 9.630306024526899e-06, + "loss": 3.8876, + "step": 15310 + }, + { + "epoch": 0.44883908298542446, + "grad_norm": 14.159468650817871, + "learning_rate": 9.62970898180547e-06, + "loss": 3.8854, + "step": 15320 + }, + { + "epoch": 0.44913205888815644, + "grad_norm": 12.461089134216309, + "learning_rate": 9.629111475909125e-06, + "loss": 3.8764, + "step": 15330 + }, + { + "epoch": 0.44942503479088847, + "grad_norm": 12.232316970825195, + "learning_rate": 9.628513506897638e-06, + "loss": 3.8874, + "step": 15340 + }, + { + "epoch": 0.44971801069362044, + "grad_norm": 13.839691162109375, + "learning_rate": 9.627915074830837e-06, + "loss": 3.9152, + "step": 15350 + }, + { + "epoch": 0.45001098659635247, + "grad_norm": 12.857536315917969, + "learning_rate": 9.627316179768588e-06, + "loss": 3.8617, + "step": 15360 + }, + { + "epoch": 0.45001098659635247, + "eval_bleu": 0.30991273140879994, + "eval_cap_loss": 1.0354771614074707, + "eval_con_loss": 1.5405142307281494, + "eval_loss": 4.1165056228637695, + "step": 15360 + }, + { + "epoch": 0.45001098659635247, + "eval_bleu": 0.30991273140879994, + "eval_cap_loss": 1.0354771614074707, + "eval_con_loss": 1.5405142307281494, + "eval_loss": 4.1165056228637695, + "eval_runtime": 56.7558, + "eval_samples_per_second": 352.387, + "eval_steps_per_second": 0.352, + "step": 15360 + }, + { + "epoch": 0.45030396249908444, + "grad_norm": 12.386975288391113, + "learning_rate": 9.626716821770809e-06, + "loss": 3.8561, + "step": 15370 + }, + { + "epoch": 0.45059693840181647, + "grad_norm": 13.914283752441406, + "learning_rate": 9.626117000897461e-06, + "loss": 3.8735, + "step": 15380 + }, + { + "epoch": 0.45088991430454844, + "grad_norm": 14.302947044372559, + "learning_rate": 9.625516717208555e-06, + "loss": 3.8829, + "step": 15390 + }, + { + "epoch": 
0.4511828902072805, + "grad_norm": 13.369491577148438, + "learning_rate": 9.62491597076414e-06, + "loss": 3.8811, + "step": 15400 + }, + { + "epoch": 0.45147586611001245, + "grad_norm": 13.496856689453125, + "learning_rate": 9.624314761624323e-06, + "loss": 3.8789, + "step": 15410 + }, + { + "epoch": 0.4517688420127445, + "grad_norm": 14.037216186523438, + "learning_rate": 9.623713089849248e-06, + "loss": 3.9102, + "step": 15420 + }, + { + "epoch": 0.45206181791547645, + "grad_norm": 12.899063110351562, + "learning_rate": 9.623110955499111e-06, + "loss": 3.8918, + "step": 15430 + }, + { + "epoch": 0.4523547938182085, + "grad_norm": 12.388032913208008, + "learning_rate": 9.622508358634151e-06, + "loss": 3.8982, + "step": 15440 + }, + { + "epoch": 0.45264776972094045, + "grad_norm": 12.748863220214844, + "learning_rate": 9.621905299314654e-06, + "loss": 3.8808, + "step": 15450 + }, + { + "epoch": 0.4529407456236725, + "grad_norm": 11.457345962524414, + "learning_rate": 9.621301777600952e-06, + "loss": 3.8283, + "step": 15460 + }, + { + "epoch": 0.45323372152640445, + "grad_norm": 11.653406143188477, + "learning_rate": 9.620697793553424e-06, + "loss": 3.8907, + "step": 15470 + }, + { + "epoch": 0.4535266974291364, + "grad_norm": 12.980907440185547, + "learning_rate": 9.620093347232495e-06, + "loss": 3.9016, + "step": 15480 + }, + { + "epoch": 0.45381967333186846, + "grad_norm": 12.598738670349121, + "learning_rate": 9.619488438698638e-06, + "loss": 3.892, + "step": 15490 + }, + { + "epoch": 0.45411264923460043, + "grad_norm": 14.784530639648438, + "learning_rate": 9.618883068012369e-06, + "loss": 3.8658, + "step": 15500 + }, + { + "epoch": 0.45440562513733246, + "grad_norm": 13.36193561553955, + "learning_rate": 9.618277235234252e-06, + "loss": 3.8809, + "step": 15510 + }, + { + "epoch": 0.45469860104006443, + "grad_norm": 13.215031623840332, + "learning_rate": 9.617670940424897e-06, + "loss": 3.8791, + "step": 15520 + }, + { + "epoch": 0.45499157694279646, + 
"grad_norm": 12.432334899902344, + "learning_rate": 9.617064183644959e-06, + "loss": 3.8687, + "step": 15530 + }, + { + "epoch": 0.45528455284552843, + "grad_norm": 13.419756889343262, + "learning_rate": 9.616456964955143e-06, + "loss": 3.8561, + "step": 15540 + }, + { + "epoch": 0.45557752874826046, + "grad_norm": 14.198217391967773, + "learning_rate": 9.615849284416196e-06, + "loss": 3.8374, + "step": 15550 + }, + { + "epoch": 0.45587050465099244, + "grad_norm": 13.341034889221191, + "learning_rate": 9.61524114208891e-06, + "loss": 3.8672, + "step": 15560 + }, + { + "epoch": 0.45616348055372447, + "grad_norm": 13.24734878540039, + "learning_rate": 9.614632538034134e-06, + "loss": 3.8554, + "step": 15570 + }, + { + "epoch": 0.45645645645645644, + "grad_norm": 13.988813400268555, + "learning_rate": 9.614023472312748e-06, + "loss": 3.8438, + "step": 15580 + }, + { + "epoch": 0.45674943235918847, + "grad_norm": 12.77743148803711, + "learning_rate": 9.613413944985688e-06, + "loss": 3.8712, + "step": 15590 + }, + { + "epoch": 0.45704240826192044, + "grad_norm": 13.114425659179688, + "learning_rate": 9.612803956113932e-06, + "loss": 3.8856, + "step": 15600 + }, + { + "epoch": 0.45733538416465247, + "grad_norm": 12.867426872253418, + "learning_rate": 9.612193505758508e-06, + "loss": 3.8564, + "step": 15610 + }, + { + "epoch": 0.45762836006738444, + "grad_norm": 13.418704986572266, + "learning_rate": 9.611582593980486e-06, + "loss": 3.8795, + "step": 15620 + }, + { + "epoch": 0.45792133597011647, + "grad_norm": 12.30764389038086, + "learning_rate": 9.610971220840985e-06, + "loss": 3.8514, + "step": 15630 + }, + { + "epoch": 0.45821431187284845, + "grad_norm": 13.288269996643066, + "learning_rate": 9.610359386401172e-06, + "loss": 3.898, + "step": 15640 + }, + { + "epoch": 0.4585072877755805, + "grad_norm": 13.008383750915527, + "learning_rate": 9.609747090722252e-06, + "loss": 3.9064, + "step": 15650 + }, + { + "epoch": 0.45880026367831245, + "grad_norm": 
12.465888977050781, + "learning_rate": 9.609134333865486e-06, + "loss": 3.8595, + "step": 15660 + }, + { + "epoch": 0.4590932395810445, + "grad_norm": 12.648077964782715, + "learning_rate": 9.608521115892173e-06, + "loss": 3.8667, + "step": 15670 + }, + { + "epoch": 0.45938621548377645, + "grad_norm": 12.491808891296387, + "learning_rate": 9.607907436863666e-06, + "loss": 3.8453, + "step": 15680 + }, + { + "epoch": 0.4596791913865085, + "grad_norm": 13.049433708190918, + "learning_rate": 9.607293296841356e-06, + "loss": 3.871, + "step": 15690 + }, + { + "epoch": 0.45997216728924045, + "grad_norm": 12.739462852478027, + "learning_rate": 9.606678695886686e-06, + "loss": 3.8537, + "step": 15700 + }, + { + "epoch": 0.4602651431919725, + "grad_norm": 14.891251564025879, + "learning_rate": 9.606063634061142e-06, + "loss": 3.8939, + "step": 15710 + }, + { + "epoch": 0.46055811909470445, + "grad_norm": 12.327018737792969, + "learning_rate": 9.605448111426262e-06, + "loss": 3.8615, + "step": 15720 + }, + { + "epoch": 0.4608510949974365, + "grad_norm": 13.55538272857666, + "learning_rate": 9.604832128043617e-06, + "loss": 3.8473, + "step": 15730 + }, + { + "epoch": 0.46114407090016846, + "grad_norm": 11.633563041687012, + "learning_rate": 9.60421568397484e-06, + "loss": 3.8257, + "step": 15740 + }, + { + "epoch": 0.4614370468029005, + "grad_norm": 14.611474990844727, + "learning_rate": 9.6035987792816e-06, + "loss": 3.856, + "step": 15750 + }, + { + "epoch": 0.46173002270563246, + "grad_norm": 11.973952293395996, + "learning_rate": 9.602981414025613e-06, + "loss": 3.8505, + "step": 15760 + }, + { + "epoch": 0.4620229986083645, + "grad_norm": 13.513167381286621, + "learning_rate": 9.602363588268644e-06, + "loss": 3.883, + "step": 15770 + }, + { + "epoch": 0.46231597451109646, + "grad_norm": 14.033339500427246, + "learning_rate": 9.601745302072503e-06, + "loss": 3.8385, + "step": 15780 + }, + { + "epoch": 0.4626089504138285, + "grad_norm": 14.335535049438477, + 
"learning_rate": 9.601126555499047e-06, + "loss": 3.8793, + "step": 15790 + }, + { + "epoch": 0.46290192631656046, + "grad_norm": 13.554957389831543, + "learning_rate": 9.600507348610175e-06, + "loss": 3.8492, + "step": 15800 + }, + { + "epoch": 0.46319490221929244, + "grad_norm": 12.270334243774414, + "learning_rate": 9.599887681467837e-06, + "loss": 3.8581, + "step": 15810 + }, + { + "epoch": 0.46348787812202447, + "grad_norm": 14.33385181427002, + "learning_rate": 9.599267554134029e-06, + "loss": 3.8582, + "step": 15820 + }, + { + "epoch": 0.46378085402475644, + "grad_norm": 12.546133995056152, + "learning_rate": 9.598646966670787e-06, + "loss": 3.8711, + "step": 15830 + }, + { + "epoch": 0.46407382992748847, + "grad_norm": 12.715847969055176, + "learning_rate": 9.598025919140197e-06, + "loss": 3.8644, + "step": 15840 + }, + { + "epoch": 0.46436680583022044, + "grad_norm": 14.147093772888184, + "learning_rate": 9.597404411604393e-06, + "loss": 3.8535, + "step": 15850 + }, + { + "epoch": 0.46465978173295247, + "grad_norm": 13.567423820495605, + "learning_rate": 9.596782444125556e-06, + "loss": 3.8393, + "step": 15860 + }, + { + "epoch": 0.46495275763568444, + "grad_norm": 12.369508743286133, + "learning_rate": 9.596160016765906e-06, + "loss": 3.8743, + "step": 15870 + }, + { + "epoch": 0.46501135281623085, + "eval_bleu": 0.31290022497496717, + "eval_cap_loss": 1.029679775238037, + "eval_con_loss": 1.528221845626831, + "eval_loss": 4.086123466491699, + "step": 15872 + }, + { + "epoch": 0.46501135281623085, + "eval_bleu": 0.31290022497496717, + "eval_cap_loss": 1.029679775238037, + "eval_con_loss": 1.528221845626831, + "eval_loss": 4.086123466491699, + "eval_runtime": 55.6455, + "eval_samples_per_second": 359.418, + "eval_steps_per_second": 0.359, + "step": 15872 + }, + { + "epoch": 0.4652457335384165, + "grad_norm": 12.870928764343262, + "learning_rate": 9.595537129587716e-06, + "loss": 3.8638, + "step": 15880 + }, + { + "epoch": 0.46553870944114845, + 
"grad_norm": 13.248882293701172, + "learning_rate": 9.594913782653298e-06, + "loss": 3.8657, + "step": 15890 + }, + { + "epoch": 0.4658316853438805, + "grad_norm": 13.04900074005127, + "learning_rate": 9.594289976025017e-06, + "loss": 3.8739, + "step": 15900 + }, + { + "epoch": 0.46612466124661245, + "grad_norm": 13.727812767028809, + "learning_rate": 9.59366570976528e-06, + "loss": 3.8714, + "step": 15910 + }, + { + "epoch": 0.4664176371493445, + "grad_norm": 13.01041030883789, + "learning_rate": 9.593040983936545e-06, + "loss": 3.8451, + "step": 15920 + }, + { + "epoch": 0.46671061305207645, + "grad_norm": 13.478998184204102, + "learning_rate": 9.592415798601308e-06, + "loss": 3.8264, + "step": 15930 + }, + { + "epoch": 0.4670035889548085, + "grad_norm": 11.906020164489746, + "learning_rate": 9.591790153822116e-06, + "loss": 3.8516, + "step": 15940 + }, + { + "epoch": 0.46729656485754045, + "grad_norm": 12.0791597366333, + "learning_rate": 9.591226680747993e-06, + "loss": 3.8261, + "step": 15950 + }, + { + "epoch": 0.4675895407602725, + "grad_norm": 13.951678276062012, + "learning_rate": 9.590600163197764e-06, + "loss": 3.8337, + "step": 15960 + }, + { + "epoch": 0.46788251666300446, + "grad_norm": 12.204731941223145, + "learning_rate": 9.589973186385227e-06, + "loss": 3.878, + "step": 15970 + }, + { + "epoch": 0.4681754925657365, + "grad_norm": 11.8947114944458, + "learning_rate": 9.589345750373103e-06, + "loss": 3.8331, + "step": 15980 + }, + { + "epoch": 0.46846846846846846, + "grad_norm": 12.71855354309082, + "learning_rate": 9.588717855224164e-06, + "loss": 3.8483, + "step": 15990 + }, + { + "epoch": 0.4687614443712005, + "grad_norm": 12.174749374389648, + "learning_rate": 9.588089501001227e-06, + "loss": 3.8434, + "step": 16000 + }, + { + "epoch": 0.46905442027393246, + "grad_norm": 12.966104507446289, + "learning_rate": 9.587460687767156e-06, + "loss": 3.8562, + "step": 16010 + }, + { + "epoch": 0.4693473961766645, + "grad_norm": 12.508505821228027, + 
"learning_rate": 9.586831415584861e-06, + "loss": 3.8348, + "step": 16020 + }, + { + "epoch": 0.46964037207939646, + "grad_norm": 13.395581245422363, + "learning_rate": 9.586201684517295e-06, + "loss": 3.863, + "step": 16030 + }, + { + "epoch": 0.4699333479821285, + "grad_norm": 13.867630004882812, + "learning_rate": 9.585571494627459e-06, + "loss": 3.8661, + "step": 16040 + }, + { + "epoch": 0.47022632388486046, + "grad_norm": 12.956842422485352, + "learning_rate": 9.5849408459784e-06, + "loss": 3.8415, + "step": 16050 + }, + { + "epoch": 0.4705192997875925, + "grad_norm": 12.768647193908691, + "learning_rate": 9.58430973863321e-06, + "loss": 3.8372, + "step": 16060 + }, + { + "epoch": 0.47081227569032447, + "grad_norm": 13.562782287597656, + "learning_rate": 9.58367817265503e-06, + "loss": 3.8443, + "step": 16070 + }, + { + "epoch": 0.4711052515930565, + "grad_norm": 13.498738288879395, + "learning_rate": 9.583046148107042e-06, + "loss": 3.8464, + "step": 16080 + }, + { + "epoch": 0.47139822749578847, + "grad_norm": 13.13698673248291, + "learning_rate": 9.582413665052478e-06, + "loss": 3.8334, + "step": 16090 + }, + { + "epoch": 0.4716912033985205, + "grad_norm": 12.319607734680176, + "learning_rate": 9.581780723554613e-06, + "loss": 3.8413, + "step": 16100 + }, + { + "epoch": 0.47198417930125247, + "grad_norm": 14.159520149230957, + "learning_rate": 9.581147323676769e-06, + "loss": 3.8516, + "step": 16110 + }, + { + "epoch": 0.4722771552039845, + "grad_norm": 12.395110130310059, + "learning_rate": 9.580513465482316e-06, + "loss": 3.8484, + "step": 16120 + }, + { + "epoch": 0.4725701311067165, + "grad_norm": 12.348849296569824, + "learning_rate": 9.579879149034663e-06, + "loss": 3.835, + "step": 16130 + }, + { + "epoch": 0.47286310700944845, + "grad_norm": 13.880234718322754, + "learning_rate": 9.579244374397274e-06, + "loss": 3.8035, + "step": 16140 + }, + { + "epoch": 0.4731560829121805, + "grad_norm": 10.762567520141602, + "learning_rate": 
9.578609141633653e-06, + "loss": 3.8277, + "step": 16150 + }, + { + "epoch": 0.47344905881491245, + "grad_norm": 12.6214017868042, + "learning_rate": 9.577973450807352e-06, + "loss": 3.857, + "step": 16160 + }, + { + "epoch": 0.4737420347176445, + "grad_norm": 13.143202781677246, + "learning_rate": 9.577337301981967e-06, + "loss": 3.8493, + "step": 16170 + }, + { + "epoch": 0.47403501062037645, + "grad_norm": 13.10555362701416, + "learning_rate": 9.576700695221142e-06, + "loss": 3.8296, + "step": 16180 + }, + { + "epoch": 0.4743279865231085, + "grad_norm": 14.688979148864746, + "learning_rate": 9.576063630588563e-06, + "loss": 3.8434, + "step": 16190 + }, + { + "epoch": 0.47462096242584045, + "grad_norm": 12.744941711425781, + "learning_rate": 9.57542610814797e-06, + "loss": 3.8211, + "step": 16200 + }, + { + "epoch": 0.4749139383285725, + "grad_norm": 13.980661392211914, + "learning_rate": 9.574788127963137e-06, + "loss": 3.8573, + "step": 16210 + }, + { + "epoch": 0.47520691423130446, + "grad_norm": 14.443212509155273, + "learning_rate": 9.574149690097894e-06, + "loss": 3.8246, + "step": 16220 + }, + { + "epoch": 0.4754998901340365, + "grad_norm": 12.516701698303223, + "learning_rate": 9.573510794616112e-06, + "loss": 3.8079, + "step": 16230 + }, + { + "epoch": 0.47579286603676846, + "grad_norm": 13.318535804748535, + "learning_rate": 9.57287144158171e-06, + "loss": 3.8124, + "step": 16240 + }, + { + "epoch": 0.4760858419395005, + "grad_norm": 12.787925720214844, + "learning_rate": 9.572231631058649e-06, + "loss": 3.8258, + "step": 16250 + }, + { + "epoch": 0.47637881784223246, + "grad_norm": 11.757196426391602, + "learning_rate": 9.57159136311094e-06, + "loss": 3.8376, + "step": 16260 + }, + { + "epoch": 0.4766717937449645, + "grad_norm": 12.259482383728027, + "learning_rate": 9.570950637802636e-06, + "loss": 3.8206, + "step": 16270 + }, + { + "epoch": 0.47696476964769646, + "grad_norm": 12.122371673583984, + "learning_rate": 9.570309455197839e-06, + "loss": 
3.8268, + "step": 16280 + }, + { + "epoch": 0.4772577455504285, + "grad_norm": 12.85489273071289, + "learning_rate": 9.569667815360697e-06, + "loss": 3.8174, + "step": 16290 + }, + { + "epoch": 0.47755072145316046, + "grad_norm": 12.712169647216797, + "learning_rate": 9.569025718355401e-06, + "loss": 3.8451, + "step": 16300 + }, + { + "epoch": 0.4778436973558925, + "grad_norm": 12.851548194885254, + "learning_rate": 9.568383164246189e-06, + "loss": 3.8455, + "step": 16310 + }, + { + "epoch": 0.47813667325862447, + "grad_norm": 11.80063533782959, + "learning_rate": 9.567740153097345e-06, + "loss": 3.8166, + "step": 16320 + }, + { + "epoch": 0.4784296491613565, + "grad_norm": 12.955425262451172, + "learning_rate": 9.567096684973197e-06, + "loss": 3.8402, + "step": 16330 + }, + { + "epoch": 0.47872262506408847, + "grad_norm": 11.494199752807617, + "learning_rate": 9.566452759938124e-06, + "loss": 3.8157, + "step": 16340 + }, + { + "epoch": 0.4790156009668205, + "grad_norm": 12.77529239654541, + "learning_rate": 9.56580837805654e-06, + "loss": 3.8141, + "step": 16350 + }, + { + "epoch": 0.47930857686955247, + "grad_norm": 12.29283618927002, + "learning_rate": 9.565163539392919e-06, + "loss": 3.805, + "step": 16360 + }, + { + "epoch": 0.4796015527722845, + "grad_norm": 12.3599214553833, + "learning_rate": 9.56451824401177e-06, + "loss": 3.8298, + "step": 16370 + }, + { + "epoch": 0.4798945286750165, + "grad_norm": 12.386861801147461, + "learning_rate": 9.563872491977647e-06, + "loss": 3.8417, + "step": 16380 + }, + { + "epoch": 0.4800117190361093, + "eval_bleu": 0.3134705834106447, + "eval_cap_loss": 1.025575876235962, + "eval_con_loss": 1.512519121170044, + "eval_loss": 4.050614356994629, + "step": 16384 + }, + { + "epoch": 0.4800117190361093, + "eval_bleu": 0.3134705834106447, + "eval_cap_loss": 1.025575876235962, + "eval_con_loss": 1.512519121170044, + "eval_loss": 4.050614356994629, + "eval_runtime": 58.7385, + "eval_samples_per_second": 340.492, + 
"eval_steps_per_second": 0.34, + "step": 16384 + }, + { + "epoch": 0.4801875045777485, + "grad_norm": 12.73007583618164, + "learning_rate": 9.563226283355163e-06, + "loss": 3.8211, + "step": 16390 + }, + { + "epoch": 0.4804804804804805, + "grad_norm": 12.01456069946289, + "learning_rate": 9.56257961820896e-06, + "loss": 3.8269, + "step": 16400 + }, + { + "epoch": 0.4807734563832125, + "grad_norm": 12.748785018920898, + "learning_rate": 9.561932496603735e-06, + "loss": 3.8323, + "step": 16410 + }, + { + "epoch": 0.4810664322859445, + "grad_norm": 11.706099510192871, + "learning_rate": 9.561284918604228e-06, + "loss": 3.8383, + "step": 16420 + }, + { + "epoch": 0.4813594081886765, + "grad_norm": 12.476700782775879, + "learning_rate": 9.560636884275227e-06, + "loss": 3.8125, + "step": 16430 + }, + { + "epoch": 0.4816523840914085, + "grad_norm": 12.666441917419434, + "learning_rate": 9.559988393681563e-06, + "loss": 3.8313, + "step": 16440 + }, + { + "epoch": 0.48194535999414045, + "grad_norm": 12.520492553710938, + "learning_rate": 9.559339446888113e-06, + "loss": 3.8231, + "step": 16450 + }, + { + "epoch": 0.4822383358968725, + "grad_norm": 13.634404182434082, + "learning_rate": 9.558690043959802e-06, + "loss": 3.8099, + "step": 16460 + }, + { + "epoch": 0.48253131179960446, + "grad_norm": 13.344803810119629, + "learning_rate": 9.558040184961597e-06, + "loss": 3.8195, + "step": 16470 + }, + { + "epoch": 0.4828242877023365, + "grad_norm": 12.489740371704102, + "learning_rate": 9.557389869958513e-06, + "loss": 3.8471, + "step": 16480 + }, + { + "epoch": 0.48311726360506846, + "grad_norm": 12.91768741607666, + "learning_rate": 9.556739099015613e-06, + "loss": 3.7906, + "step": 16490 + }, + { + "epoch": 0.4834102395078005, + "grad_norm": 12.437487602233887, + "learning_rate": 9.556087872197997e-06, + "loss": 3.8059, + "step": 16500 + }, + { + "epoch": 0.48370321541053246, + "grad_norm": 12.062070846557617, + "learning_rate": 9.555436189570822e-06, + "loss": 3.8287, + 
"step": 16510 + }, + { + "epoch": 0.4839961913132645, + "grad_norm": 14.523857116699219, + "learning_rate": 9.554784051199283e-06, + "loss": 3.8201, + "step": 16520 + }, + { + "epoch": 0.48428916721599646, + "grad_norm": 12.258399963378906, + "learning_rate": 9.55413145714862e-06, + "loss": 3.8082, + "step": 16530 + }, + { + "epoch": 0.4845821431187285, + "grad_norm": 12.995464324951172, + "learning_rate": 9.553478407484126e-06, + "loss": 3.8006, + "step": 16540 + }, + { + "epoch": 0.48487511902146047, + "grad_norm": 13.999052047729492, + "learning_rate": 9.552824902271132e-06, + "loss": 3.8352, + "step": 16550 + }, + { + "epoch": 0.4851680949241925, + "grad_norm": 12.441825866699219, + "learning_rate": 9.552170941575017e-06, + "loss": 3.8437, + "step": 16560 + }, + { + "epoch": 0.48546107082692447, + "grad_norm": 12.055156707763672, + "learning_rate": 9.551516525461204e-06, + "loss": 3.8439, + "step": 16570 + }, + { + "epoch": 0.4857540467296565, + "grad_norm": 12.1935453414917, + "learning_rate": 9.550861653995169e-06, + "loss": 3.8242, + "step": 16580 + }, + { + "epoch": 0.48604702263238847, + "grad_norm": 11.426250457763672, + "learning_rate": 9.550206327242423e-06, + "loss": 3.809, + "step": 16590 + }, + { + "epoch": 0.4863399985351205, + "grad_norm": 13.215505599975586, + "learning_rate": 9.54955054526853e-06, + "loss": 3.8195, + "step": 16600 + }, + { + "epoch": 0.4866329744378525, + "grad_norm": 12.164697647094727, + "learning_rate": 9.548894308139097e-06, + "loss": 3.8099, + "step": 16610 + }, + { + "epoch": 0.4869259503405845, + "grad_norm": 13.718069076538086, + "learning_rate": 9.548237615919776e-06, + "loss": 3.8022, + "step": 16620 + }, + { + "epoch": 0.4872189262433165, + "grad_norm": 12.486952781677246, + "learning_rate": 9.547580468676266e-06, + "loss": 3.7983, + "step": 16630 + }, + { + "epoch": 0.4875119021460485, + "grad_norm": 14.245753288269043, + "learning_rate": 9.546922866474309e-06, + "loss": 3.8366, + "step": 16640 + }, + { + "epoch": 
0.4878048780487805, + "grad_norm": 13.181937217712402, + "learning_rate": 9.546264809379694e-06, + "loss": 3.8019, + "step": 16650 + }, + { + "epoch": 0.4880978539515125, + "grad_norm": 12.745935440063477, + "learning_rate": 9.54560629745826e-06, + "loss": 3.816, + "step": 16660 + }, + { + "epoch": 0.4883908298542445, + "grad_norm": 13.355567932128906, + "learning_rate": 9.54494733077588e-06, + "loss": 3.8359, + "step": 16670 + }, + { + "epoch": 0.4886838057569765, + "grad_norm": 11.320769309997559, + "learning_rate": 9.544287909398486e-06, + "loss": 3.8142, + "step": 16680 + }, + { + "epoch": 0.4889767816597085, + "grad_norm": 10.650129318237305, + "learning_rate": 9.543628033392047e-06, + "loss": 3.7971, + "step": 16690 + }, + { + "epoch": 0.4892697575624405, + "grad_norm": 12.264425277709961, + "learning_rate": 9.54296770282258e-06, + "loss": 3.8302, + "step": 16700 + }, + { + "epoch": 0.4895627334651725, + "grad_norm": 12.909140586853027, + "learning_rate": 9.542306917756146e-06, + "loss": 3.816, + "step": 16710 + }, + { + "epoch": 0.4898557093679045, + "grad_norm": 11.933881759643555, + "learning_rate": 9.541645678258856e-06, + "loss": 3.8115, + "step": 16720 + }, + { + "epoch": 0.4901486852706365, + "grad_norm": 13.582850456237793, + "learning_rate": 9.540983984396857e-06, + "loss": 3.8107, + "step": 16730 + }, + { + "epoch": 0.4904416611733685, + "grad_norm": 12.017635345458984, + "learning_rate": 9.540321836236352e-06, + "loss": 3.7986, + "step": 16740 + }, + { + "epoch": 0.4907346370761005, + "grad_norm": 13.445158958435059, + "learning_rate": 9.539659233843585e-06, + "loss": 3.8086, + "step": 16750 + }, + { + "epoch": 0.4910276129788325, + "grad_norm": 13.889506340026855, + "learning_rate": 9.538996177284845e-06, + "loss": 3.8255, + "step": 16760 + }, + { + "epoch": 0.4913205888815645, + "grad_norm": 13.768719673156738, + "learning_rate": 9.538332666626465e-06, + "loss": 3.8235, + "step": 16770 + }, + { + "epoch": 0.49161356478429646, + "grad_norm": 
10.755965232849121, + "learning_rate": 9.537668701934829e-06, + "loss": 3.7738, + "step": 16780 + }, + { + "epoch": 0.4919065406870285, + "grad_norm": 12.216082572937012, + "learning_rate": 9.537004283276357e-06, + "loss": 3.7848, + "step": 16790 + }, + { + "epoch": 0.49219951658976047, + "grad_norm": 12.524221420288086, + "learning_rate": 9.536339410717525e-06, + "loss": 3.8125, + "step": 16800 + }, + { + "epoch": 0.4924924924924925, + "grad_norm": 11.165016174316406, + "learning_rate": 9.53567408432485e-06, + "loss": 3.7881, + "step": 16810 + }, + { + "epoch": 0.49278546839522447, + "grad_norm": 12.873505592346191, + "learning_rate": 9.53500830416489e-06, + "loss": 3.78, + "step": 16820 + }, + { + "epoch": 0.4930784442979565, + "grad_norm": 13.288555145263672, + "learning_rate": 9.534342070304256e-06, + "loss": 3.8224, + "step": 16830 + }, + { + "epoch": 0.49337142020068847, + "grad_norm": 12.237349510192871, + "learning_rate": 9.533675382809598e-06, + "loss": 3.82, + "step": 16840 + }, + { + "epoch": 0.4936643961034205, + "grad_norm": 11.81434154510498, + "learning_rate": 9.533008241747615e-06, + "loss": 3.7985, + "step": 16850 + }, + { + "epoch": 0.4939573720061525, + "grad_norm": 11.68295669555664, + "learning_rate": 9.532340647185052e-06, + "loss": 3.7984, + "step": 16860 + }, + { + "epoch": 0.4942503479088845, + "grad_norm": 12.838603019714355, + "learning_rate": 9.531672599188695e-06, + "loss": 3.8133, + "step": 16870 + }, + { + "epoch": 0.4945433238116165, + "grad_norm": 12.799830436706543, + "learning_rate": 9.531004097825383e-06, + "loss": 3.7972, + "step": 16880 + }, + { + "epoch": 0.4948362997143485, + "grad_norm": 12.976740837097168, + "learning_rate": 9.53033514316199e-06, + "loss": 3.8095, + "step": 16890 + }, + { + "epoch": 0.4950120852559877, + "eval_bleu": 0.3150356714902134, + "eval_cap_loss": 1.0226800441741943, + "eval_con_loss": 1.499431848526001, + "eval_loss": 4.021543979644775, + "step": 16896 + }, + { + "epoch": 0.4950120852559877, + 
"eval_bleu": 0.3150356714902134, + "eval_cap_loss": 1.0226800441741943, + "eval_con_loss": 1.499431848526001, + "eval_loss": 4.021543979644775, + "eval_runtime": 61.6484, + "eval_samples_per_second": 324.42, + "eval_steps_per_second": 0.324, + "step": 16896 + }, + { + "epoch": 0.4951292756170805, + "grad_norm": 13.799283027648926, + "learning_rate": 9.529665735265444e-06, + "loss": 3.8048, + "step": 16900 + }, + { + "epoch": 0.4954222515198125, + "grad_norm": 10.675951957702637, + "learning_rate": 9.528995874202716e-06, + "loss": 3.7811, + "step": 16910 + }, + { + "epoch": 0.4957152274225445, + "grad_norm": 12.918570518493652, + "learning_rate": 9.528325560040819e-06, + "loss": 3.7958, + "step": 16920 + }, + { + "epoch": 0.4960082033252765, + "grad_norm": 12.740899085998535, + "learning_rate": 9.527654792846814e-06, + "loss": 3.8091, + "step": 16930 + }, + { + "epoch": 0.4963011792280085, + "grad_norm": 13.17920207977295, + "learning_rate": 9.52698357268781e-06, + "loss": 3.7693, + "step": 16940 + }, + { + "epoch": 0.4965941551307405, + "grad_norm": 12.394055366516113, + "learning_rate": 9.52631189963096e-06, + "loss": 3.808, + "step": 16950 + }, + { + "epoch": 0.4968871310334725, + "grad_norm": 13.100262641906738, + "learning_rate": 9.525639773743454e-06, + "loss": 3.7822, + "step": 16960 + }, + { + "epoch": 0.4971801069362045, + "grad_norm": 12.355093955993652, + "learning_rate": 9.524967195092541e-06, + "loss": 3.7768, + "step": 16970 + }, + { + "epoch": 0.4974730828389365, + "grad_norm": 11.899687767028809, + "learning_rate": 9.524294163745504e-06, + "loss": 3.7771, + "step": 16980 + }, + { + "epoch": 0.4977660587416685, + "grad_norm": 13.220376014709473, + "learning_rate": 9.523620679769678e-06, + "loss": 3.79, + "step": 16990 + }, + { + "epoch": 0.4980590346444005, + "grad_norm": 15.651598930358887, + "learning_rate": 9.522946743232442e-06, + "loss": 3.8131, + "step": 17000 + }, + { + "epoch": 0.4983520105471325, + "grad_norm": 13.147910118103027, + 
"learning_rate": 9.522272354201217e-06, + "loss": 3.7904, + "step": 17010 + }, + { + "epoch": 0.4986449864498645, + "grad_norm": 11.790806770324707, + "learning_rate": 9.521597512743474e-06, + "loss": 3.791, + "step": 17020 + }, + { + "epoch": 0.4989379623525965, + "grad_norm": 12.252983093261719, + "learning_rate": 9.520922218926725e-06, + "loss": 3.8089, + "step": 17030 + }, + { + "epoch": 0.4992309382553285, + "grad_norm": 12.286520957946777, + "learning_rate": 9.520246472818528e-06, + "loss": 3.7769, + "step": 17040 + }, + { + "epoch": 0.4995239141580605, + "grad_norm": 12.930675506591797, + "learning_rate": 9.51957027448649e-06, + "loss": 3.8053, + "step": 17050 + }, + { + "epoch": 0.4998168900607925, + "grad_norm": 10.338582038879395, + "learning_rate": 9.51889362399826e-06, + "loss": 3.7892, + "step": 17060 + }, + { + "epoch": 0.5001098659635245, + "grad_norm": 12.74913215637207, + "learning_rate": 9.518216521421533e-06, + "loss": 3.7921, + "step": 17070 + }, + { + "epoch": 0.5004028418662565, + "grad_norm": 11.731492042541504, + "learning_rate": 9.517538966824049e-06, + "loss": 3.7952, + "step": 17080 + }, + { + "epoch": 0.5006958177689885, + "grad_norm": 12.390288352966309, + "learning_rate": 9.516860960273593e-06, + "loss": 3.7744, + "step": 17090 + }, + { + "epoch": 0.5009887936717206, + "grad_norm": 11.386237144470215, + "learning_rate": 9.516182501837995e-06, + "loss": 3.7715, + "step": 17100 + }, + { + "epoch": 0.5012817695744525, + "grad_norm": 11.949684143066406, + "learning_rate": 9.515503591585131e-06, + "loss": 3.7703, + "step": 17110 + }, + { + "epoch": 0.5015747454771845, + "grad_norm": 12.438505172729492, + "learning_rate": 9.514824229582922e-06, + "loss": 3.8126, + "step": 17120 + }, + { + "epoch": 0.5018677213799165, + "grad_norm": 13.342231750488281, + "learning_rate": 9.514144415899333e-06, + "loss": 3.7643, + "step": 17130 + }, + { + "epoch": 0.5021606972826484, + "grad_norm": 12.836976051330566, + "learning_rate": 9.513464150602381e-06, 
+ "loss": 3.7879, + "step": 17140 + }, + { + "epoch": 0.5024536731853805, + "grad_norm": 12.068809509277344, + "learning_rate": 9.512783433760114e-06, + "loss": 3.7804, + "step": 17150 + }, + { + "epoch": 0.5027466490881125, + "grad_norm": 12.179290771484375, + "learning_rate": 9.512102265440641e-06, + "loss": 3.7989, + "step": 17160 + }, + { + "epoch": 0.5030396249908445, + "grad_norm": 13.228604316711426, + "learning_rate": 9.511420645712104e-06, + "loss": 3.7817, + "step": 17170 + }, + { + "epoch": 0.5033326008935765, + "grad_norm": 13.702258110046387, + "learning_rate": 9.510738574642697e-06, + "loss": 3.7865, + "step": 17180 + }, + { + "epoch": 0.5036255767963085, + "grad_norm": 13.382811546325684, + "learning_rate": 9.510056052300656e-06, + "loss": 3.773, + "step": 17190 + }, + { + "epoch": 0.5039185526990405, + "grad_norm": 12.464451789855957, + "learning_rate": 9.509373078754264e-06, + "loss": 3.7928, + "step": 17200 + }, + { + "epoch": 0.5042115286017725, + "grad_norm": 13.766175270080566, + "learning_rate": 9.508689654071851e-06, + "loss": 3.7874, + "step": 17210 + }, + { + "epoch": 0.5045045045045045, + "grad_norm": 12.5075044631958, + "learning_rate": 9.508005778321787e-06, + "loss": 3.7653, + "step": 17220 + }, + { + "epoch": 0.5047974804072365, + "grad_norm": 12.109238624572754, + "learning_rate": 9.507321451572489e-06, + "loss": 3.7729, + "step": 17230 + }, + { + "epoch": 0.5050904563099685, + "grad_norm": 11.771467208862305, + "learning_rate": 9.50663667389242e-06, + "loss": 3.8035, + "step": 17240 + }, + { + "epoch": 0.5053834322127005, + "grad_norm": 14.459776878356934, + "learning_rate": 9.50595144535009e-06, + "loss": 3.7857, + "step": 17250 + }, + { + "epoch": 0.5056764081154325, + "grad_norm": 13.635942459106445, + "learning_rate": 9.50526576601405e-06, + "loss": 3.7855, + "step": 17260 + }, + { + "epoch": 0.5059693840181645, + "grad_norm": 13.693647384643555, + "learning_rate": 9.504579635952899e-06, + "loss": 3.7934, + "step": 17270 + }, + { 
+ "epoch": 0.5062623599208965, + "grad_norm": 12.186750411987305, + "learning_rate": 9.50389305523528e-06, + "loss": 3.7831, + "step": 17280 + }, + { + "epoch": 0.5065553358236285, + "grad_norm": 11.567293167114258, + "learning_rate": 9.503206023929882e-06, + "loss": 3.79, + "step": 17290 + }, + { + "epoch": 0.5068483117263605, + "grad_norm": 13.164304733276367, + "learning_rate": 9.502518542105438e-06, + "loss": 3.7922, + "step": 17300 + }, + { + "epoch": 0.5071412876290925, + "grad_norm": 12.798477172851562, + "learning_rate": 9.501830609830726e-06, + "loss": 3.7682, + "step": 17310 + }, + { + "epoch": 0.5074342635318245, + "grad_norm": 14.381230354309082, + "learning_rate": 9.501142227174571e-06, + "loss": 3.8096, + "step": 17320 + }, + { + "epoch": 0.5077272394345566, + "grad_norm": 12.597195625305176, + "learning_rate": 9.50045339420584e-06, + "loss": 3.772, + "step": 17330 + }, + { + "epoch": 0.5080202153372885, + "grad_norm": 13.472010612487793, + "learning_rate": 9.499764110993447e-06, + "loss": 3.7687, + "step": 17340 + }, + { + "epoch": 0.5083131912400205, + "grad_norm": 11.305741310119629, + "learning_rate": 9.49907437760635e-06, + "loss": 3.776, + "step": 17350 + }, + { + "epoch": 0.5086061671427525, + "grad_norm": 13.371347427368164, + "learning_rate": 9.498384194113555e-06, + "loss": 3.7542, + "step": 17360 + }, + { + "epoch": 0.5088991430454846, + "grad_norm": 11.5806245803833, + "learning_rate": 9.497693560584109e-06, + "loss": 3.7891, + "step": 17370 + }, + { + "epoch": 0.5091921189482165, + "grad_norm": 12.749152183532715, + "learning_rate": 9.497002477087106e-06, + "loss": 3.8078, + "step": 17380 + }, + { + "epoch": 0.5094850948509485, + "grad_norm": 13.044577598571777, + "learning_rate": 9.496310943691685e-06, + "loss": 3.7754, + "step": 17390 + }, + { + "epoch": 0.5097780707536805, + "grad_norm": 12.294549942016602, + "learning_rate": 9.49561896046703e-06, + "loss": 3.78, + "step": 17400 + }, + { + "epoch": 0.5100124514758662, + "eval_bleu": 
0.3147736845932444, + "eval_cap_loss": 1.0206326246261597, + "eval_con_loss": 1.4907307624816895, + "eval_loss": 4.002094268798828, + "step": 17408 + }, + { + "epoch": 0.5100124514758662, + "eval_bleu": 0.3147736845932444, + "eval_cap_loss": 1.0206326246261597, + "eval_con_loss": 1.4907307624816895, + "eval_loss": 4.002094268798828, + "eval_runtime": 55.5324, + "eval_samples_per_second": 360.15, + "eval_steps_per_second": 0.36, + "step": 17408 + }, + { + "epoch": 0.5100710466564126, + "grad_norm": 12.556663513183594, + "learning_rate": 9.494926527482369e-06, + "loss": 3.7611, + "step": 17410 + }, + { + "epoch": 0.5103640225591445, + "grad_norm": 13.128988265991211, + "learning_rate": 9.494233644806977e-06, + "loss": 3.8031, + "step": 17420 + }, + { + "epoch": 0.5106569984618765, + "grad_norm": 14.584733009338379, + "learning_rate": 9.493540312510173e-06, + "loss": 3.7595, + "step": 17430 + }, + { + "epoch": 0.5109499743646085, + "grad_norm": 12.446176528930664, + "learning_rate": 9.492846530661318e-06, + "loss": 3.7865, + "step": 17440 + }, + { + "epoch": 0.5112429502673405, + "grad_norm": 12.582476615905762, + "learning_rate": 9.492152299329825e-06, + "loss": 3.7822, + "step": 17450 + }, + { + "epoch": 0.5115359261700725, + "grad_norm": 12.848374366760254, + "learning_rate": 9.491457618585142e-06, + "loss": 3.7741, + "step": 17460 + }, + { + "epoch": 0.5118289020728045, + "grad_norm": 12.001612663269043, + "learning_rate": 9.490762488496774e-06, + "loss": 3.7631, + "step": 17470 + }, + { + "epoch": 0.5121218779755365, + "grad_norm": 12.023472785949707, + "learning_rate": 9.490066909134262e-06, + "loss": 3.7433, + "step": 17480 + }, + { + "epoch": 0.5124148538782685, + "grad_norm": 11.757722854614258, + "learning_rate": 9.489370880567192e-06, + "loss": 3.7671, + "step": 17490 + }, + { + "epoch": 0.5127078297810005, + "grad_norm": 12.97678279876709, + "learning_rate": 9.488674402865202e-06, + "loss": 3.7597, + "step": 17500 + }, + { + "epoch": 0.5130008056837325, + 
"grad_norm": 12.124809265136719, + "learning_rate": 9.487977476097968e-06, + "loss": 3.7607, + "step": 17510 + }, + { + "epoch": 0.5132937815864645, + "grad_norm": 13.446130752563477, + "learning_rate": 9.487280100335215e-06, + "loss": 3.7974, + "step": 17520 + }, + { + "epoch": 0.5135867574891965, + "grad_norm": 11.125080108642578, + "learning_rate": 9.486582275646707e-06, + "loss": 3.7669, + "step": 17530 + }, + { + "epoch": 0.5138797333919285, + "grad_norm": 13.059073448181152, + "learning_rate": 9.485884002102264e-06, + "loss": 3.778, + "step": 17540 + }, + { + "epoch": 0.5141727092946605, + "grad_norm": 13.029889106750488, + "learning_rate": 9.485185279771738e-06, + "loss": 3.7746, + "step": 17550 + }, + { + "epoch": 0.5144656851973926, + "grad_norm": 12.312271118164062, + "learning_rate": 9.484486108725034e-06, + "loss": 3.7729, + "step": 17560 + }, + { + "epoch": 0.5147586611001245, + "grad_norm": 12.720874786376953, + "learning_rate": 9.483786489032101e-06, + "loss": 3.7808, + "step": 17570 + }, + { + "epoch": 0.5150516370028565, + "grad_norm": 11.529013633728027, + "learning_rate": 9.483086420762933e-06, + "loss": 3.7414, + "step": 17580 + }, + { + "epoch": 0.5153446129055885, + "grad_norm": 12.184840202331543, + "learning_rate": 9.482385903987563e-06, + "loss": 3.7785, + "step": 17590 + }, + { + "epoch": 0.5156375888083206, + "grad_norm": 12.902552604675293, + "learning_rate": 9.481684938776077e-06, + "loss": 3.7712, + "step": 17600 + }, + { + "epoch": 0.5159305647110525, + "grad_norm": 12.609933853149414, + "learning_rate": 9.480983525198604e-06, + "loss": 3.7864, + "step": 17610 + }, + { + "epoch": 0.5162235406137845, + "grad_norm": 13.083603858947754, + "learning_rate": 9.480281663325312e-06, + "loss": 3.7746, + "step": 17620 + }, + { + "epoch": 0.5165165165165165, + "grad_norm": 12.398283958435059, + "learning_rate": 9.479579353226421e-06, + "loss": 3.7872, + "step": 17630 + }, + { + "epoch": 0.5168094924192486, + "grad_norm": 12.669676780700684, + 
"learning_rate": 9.478876594972191e-06, + "loss": 3.765, + "step": 17640 + }, + { + "epoch": 0.5171024683219805, + "grad_norm": 13.49669361114502, + "learning_rate": 9.478173388632932e-06, + "loss": 3.7639, + "step": 17650 + }, + { + "epoch": 0.5173954442247125, + "grad_norm": 15.181371688842773, + "learning_rate": 9.477469734278991e-06, + "loss": 3.7912, + "step": 17660 + }, + { + "epoch": 0.5176884201274445, + "grad_norm": 13.106871604919434, + "learning_rate": 9.476765631980768e-06, + "loss": 3.7732, + "step": 17670 + }, + { + "epoch": 0.5179813960301766, + "grad_norm": 11.880818367004395, + "learning_rate": 9.476061081808706e-06, + "loss": 3.7333, + "step": 17680 + }, + { + "epoch": 0.5182743719329085, + "grad_norm": 12.301383018493652, + "learning_rate": 9.475356083833286e-06, + "loss": 3.7476, + "step": 17690 + }, + { + "epoch": 0.5185673478356405, + "grad_norm": 13.800215721130371, + "learning_rate": 9.47465063812504e-06, + "loss": 3.7787, + "step": 17700 + }, + { + "epoch": 0.5188603237383725, + "grad_norm": 12.738388061523438, + "learning_rate": 9.473944744754547e-06, + "loss": 3.747, + "step": 17710 + }, + { + "epoch": 0.5191532996411046, + "grad_norm": 11.85040283203125, + "learning_rate": 9.473238403792424e-06, + "loss": 3.7586, + "step": 17720 + }, + { + "epoch": 0.5194462755438365, + "grad_norm": 12.227934837341309, + "learning_rate": 9.472531615309337e-06, + "loss": 3.7698, + "step": 17730 + }, + { + "epoch": 0.5197392514465685, + "grad_norm": 12.908365249633789, + "learning_rate": 9.471824379375998e-06, + "loss": 3.7421, + "step": 17740 + }, + { + "epoch": 0.5200322273493005, + "grad_norm": 11.947480201721191, + "learning_rate": 9.47111669606316e-06, + "loss": 3.7548, + "step": 17750 + }, + { + "epoch": 0.5203252032520326, + "grad_norm": 12.009449005126953, + "learning_rate": 9.470408565441621e-06, + "loss": 3.7643, + "step": 17760 + }, + { + "epoch": 0.5206181791547645, + "grad_norm": 13.337617874145508, + "learning_rate": 9.46969998758223e-06, + 
"loss": 3.7623, + "step": 17770 + }, + { + "epoch": 0.5209111550574965, + "grad_norm": 11.900442123413086, + "learning_rate": 9.46899096255587e-06, + "loss": 3.77, + "step": 17780 + }, + { + "epoch": 0.5212041309602286, + "grad_norm": 11.899338722229004, + "learning_rate": 9.46828149043348e-06, + "loss": 3.7453, + "step": 17790 + }, + { + "epoch": 0.5214971068629605, + "grad_norm": 11.550363540649414, + "learning_rate": 9.467571571286036e-06, + "loss": 3.7394, + "step": 17800 + }, + { + "epoch": 0.5217900827656925, + "grad_norm": 12.235040664672852, + "learning_rate": 9.46686120518456e-06, + "loss": 3.7247, + "step": 17810 + }, + { + "epoch": 0.5220830586684245, + "grad_norm": 11.594467163085938, + "learning_rate": 9.466150392200125e-06, + "loss": 3.763, + "step": 17820 + }, + { + "epoch": 0.5223760345711566, + "grad_norm": 12.492470741271973, + "learning_rate": 9.465439132403836e-06, + "loss": 3.7733, + "step": 17830 + }, + { + "epoch": 0.5226690104738885, + "grad_norm": 11.954293251037598, + "learning_rate": 9.464727425866856e-06, + "loss": 3.7409, + "step": 17840 + }, + { + "epoch": 0.5229619863766205, + "grad_norm": 12.516684532165527, + "learning_rate": 9.464015272660386e-06, + "loss": 3.7097, + "step": 17850 + }, + { + "epoch": 0.5232549622793525, + "grad_norm": 12.554028511047363, + "learning_rate": 9.46330267285567e-06, + "loss": 3.7703, + "step": 17860 + }, + { + "epoch": 0.5235479381820846, + "grad_norm": 13.302252769470215, + "learning_rate": 9.462589626524004e-06, + "loss": 3.7522, + "step": 17870 + }, + { + "epoch": 0.5238409140848165, + "grad_norm": 12.34770393371582, + "learning_rate": 9.46187613373672e-06, + "loss": 3.7625, + "step": 17880 + }, + { + "epoch": 0.5241338899875485, + "grad_norm": 12.916218757629395, + "learning_rate": 9.4611621945652e-06, + "loss": 3.7635, + "step": 17890 + }, + { + "epoch": 0.5244268658902805, + "grad_norm": 11.596075057983398, + "learning_rate": 9.46044780908087e-06, + "loss": 3.7612, + "step": 17900 + }, + { + 
"epoch": 0.5247198417930126, + "grad_norm": 12.768528938293457, + "learning_rate": 9.4597329773552e-06, + "loss": 3.7541, + "step": 17910 + }, + { + "epoch": 0.5250128176957445, + "grad_norm": 13.072789192199707, + "learning_rate": 9.459017699459705e-06, + "loss": 3.7316, + "step": 17920 + }, + { + "epoch": 0.5250128176957445, + "eval_bleu": 0.315987736703674, + "eval_cap_loss": 1.0172476768493652, + "eval_con_loss": 1.4836292266845703, + "eval_loss": 3.984506368637085, + "step": 17920 + }, + { + "epoch": 0.5250128176957445, + "eval_bleu": 0.315987736703674, + "eval_cap_loss": 1.0172476768493652, + "eval_con_loss": 1.4836292266845703, + "eval_loss": 3.984506368637085, + "eval_runtime": 57.9118, + "eval_samples_per_second": 345.353, + "eval_steps_per_second": 0.345, + "step": 17920 + }, + { + "epoch": 0.5253057935984765, + "grad_norm": 12.524395942687988, + "learning_rate": 9.458301975465942e-06, + "loss": 3.7488, + "step": 17930 + }, + { + "epoch": 0.5255987695012085, + "grad_norm": 13.071651458740234, + "learning_rate": 9.457585805445516e-06, + "loss": 3.7594, + "step": 17940 + }, + { + "epoch": 0.5258917454039406, + "grad_norm": 11.59280776977539, + "learning_rate": 9.456869189470079e-06, + "loss": 3.76, + "step": 17950 + }, + { + "epoch": 0.5261847213066725, + "grad_norm": 12.818239212036133, + "learning_rate": 9.45615212761132e-06, + "loss": 3.7523, + "step": 17960 + }, + { + "epoch": 0.5264776972094045, + "grad_norm": 12.801309585571289, + "learning_rate": 9.455434619940977e-06, + "loss": 3.7561, + "step": 17970 + }, + { + "epoch": 0.5267706731121365, + "grad_norm": 13.469890594482422, + "learning_rate": 9.454716666530834e-06, + "loss": 3.7416, + "step": 17980 + }, + { + "epoch": 0.5270636490148686, + "grad_norm": 12.08674144744873, + "learning_rate": 9.453998267452717e-06, + "loss": 3.7359, + "step": 17990 + }, + { + "epoch": 0.5273566249176005, + "grad_norm": 12.288737297058105, + "learning_rate": 9.4532794227785e-06, + "loss": 3.7585, + "step": 18000 + }, + 
{ + "epoch": 0.5276496008203325, + "grad_norm": 12.986377716064453, + "learning_rate": 9.452560132580094e-06, + "loss": 3.7601, + "step": 18010 + }, + { + "epoch": 0.5279425767230645, + "grad_norm": 12.60971736907959, + "learning_rate": 9.451840396929466e-06, + "loss": 3.7432, + "step": 18020 + }, + { + "epoch": 0.5282355526257966, + "grad_norm": 12.252971649169922, + "learning_rate": 9.451120215898617e-06, + "loss": 3.7486, + "step": 18030 + }, + { + "epoch": 0.5285285285285285, + "grad_norm": 11.8507080078125, + "learning_rate": 9.450399589559598e-06, + "loss": 3.7458, + "step": 18040 + }, + { + "epoch": 0.5288215044312605, + "grad_norm": 12.301631927490234, + "learning_rate": 9.449678517984503e-06, + "loss": 3.7654, + "step": 18050 + }, + { + "epoch": 0.5291144803339926, + "grad_norm": 12.33114242553711, + "learning_rate": 9.448957001245472e-06, + "loss": 3.7434, + "step": 18060 + }, + { + "epoch": 0.5294074562367246, + "grad_norm": 11.904088020324707, + "learning_rate": 9.448235039414685e-06, + "loss": 3.7794, + "step": 18070 + }, + { + "epoch": 0.5297004321394565, + "grad_norm": 13.017590522766113, + "learning_rate": 9.447512632564378e-06, + "loss": 3.7616, + "step": 18080 + }, + { + "epoch": 0.5299934080421885, + "grad_norm": 12.047432899475098, + "learning_rate": 9.446789780766814e-06, + "loss": 3.7354, + "step": 18090 + }, + { + "epoch": 0.5302863839449206, + "grad_norm": 12.136282920837402, + "learning_rate": 9.446066484094316e-06, + "loss": 3.7591, + "step": 18100 + }, + { + "epoch": 0.5305793598476525, + "grad_norm": 13.044297218322754, + "learning_rate": 9.445342742619244e-06, + "loss": 3.7365, + "step": 18110 + }, + { + "epoch": 0.5308723357503845, + "grad_norm": 12.449051856994629, + "learning_rate": 9.444618556414002e-06, + "loss": 3.7409, + "step": 18120 + }, + { + "epoch": 0.5311653116531165, + "grad_norm": 12.789710998535156, + "learning_rate": 9.443893925551045e-06, + "loss": 3.7211, + "step": 18130 + }, + { + "epoch": 0.5314582875558486, + 
"grad_norm": 12.846992492675781, + "learning_rate": 9.443168850102864e-06, + "loss": 3.7343, + "step": 18140 + }, + { + "epoch": 0.5317512634585805, + "grad_norm": 12.468252182006836, + "learning_rate": 9.442443330142002e-06, + "loss": 3.7558, + "step": 18150 + }, + { + "epoch": 0.5320442393613125, + "grad_norm": 12.18035888671875, + "learning_rate": 9.441717365741039e-06, + "loss": 3.7359, + "step": 18160 + }, + { + "epoch": 0.5323372152640445, + "grad_norm": 13.089679718017578, + "learning_rate": 9.440990956972604e-06, + "loss": 3.7152, + "step": 18170 + }, + { + "epoch": 0.5326301911667766, + "grad_norm": 12.144460678100586, + "learning_rate": 9.440264103909374e-06, + "loss": 3.7426, + "step": 18180 + }, + { + "epoch": 0.5329231670695085, + "grad_norm": 13.148693084716797, + "learning_rate": 9.439536806624062e-06, + "loss": 3.7246, + "step": 18190 + }, + { + "epoch": 0.5332161429722405, + "grad_norm": 12.658452033996582, + "learning_rate": 9.43880906518943e-06, + "loss": 3.7519, + "step": 18200 + }, + { + "epoch": 0.5335091188749725, + "grad_norm": 12.348062515258789, + "learning_rate": 9.438080879678289e-06, + "loss": 3.7348, + "step": 18210 + }, + { + "epoch": 0.5338020947777046, + "grad_norm": 12.014951705932617, + "learning_rate": 9.437352250163486e-06, + "loss": 3.7375, + "step": 18220 + }, + { + "epoch": 0.5340950706804365, + "grad_norm": 11.912907600402832, + "learning_rate": 9.436623176717914e-06, + "loss": 3.743, + "step": 18230 + }, + { + "epoch": 0.5343880465831685, + "grad_norm": 12.611083984375, + "learning_rate": 9.435893659414517e-06, + "loss": 3.7403, + "step": 18240 + }, + { + "epoch": 0.5346810224859005, + "grad_norm": 12.619928359985352, + "learning_rate": 9.435163698326276e-06, + "loss": 3.7366, + "step": 18250 + }, + { + "epoch": 0.5349739983886326, + "grad_norm": 13.599273681640625, + "learning_rate": 9.434433293526218e-06, + "loss": 3.726, + "step": 18260 + }, + { + "epoch": 0.5352669742913645, + "grad_norm": 11.670703887939453, + 
"learning_rate": 9.433702445087417e-06, + "loss": 3.7233, + "step": 18270 + }, + { + "epoch": 0.5355599501940965, + "grad_norm": 11.018282890319824, + "learning_rate": 9.432971153082994e-06, + "loss": 3.7554, + "step": 18280 + }, + { + "epoch": 0.5358529260968286, + "grad_norm": 13.009459495544434, + "learning_rate": 9.432239417586107e-06, + "loss": 3.7242, + "step": 18290 + }, + { + "epoch": 0.5361459019995606, + "grad_norm": 11.907285690307617, + "learning_rate": 9.43150723866996e-06, + "loss": 3.7357, + "step": 18300 + }, + { + "epoch": 0.5364388779022925, + "grad_norm": 13.305582046508789, + "learning_rate": 9.430774616407806e-06, + "loss": 3.7395, + "step": 18310 + }, + { + "epoch": 0.5367318538050245, + "grad_norm": 12.457352638244629, + "learning_rate": 9.430041550872937e-06, + "loss": 3.7516, + "step": 18320 + }, + { + "epoch": 0.5370248297077566, + "grad_norm": 11.213765144348145, + "learning_rate": 9.429308042138695e-06, + "loss": 3.7236, + "step": 18330 + }, + { + "epoch": 0.5373178056104886, + "grad_norm": 11.428751945495605, + "learning_rate": 9.42857409027846e-06, + "loss": 3.7616, + "step": 18340 + }, + { + "epoch": 0.5376107815132205, + "grad_norm": 12.407445907592773, + "learning_rate": 9.427839695365662e-06, + "loss": 3.7286, + "step": 18350 + }, + { + "epoch": 0.5379037574159525, + "grad_norm": 12.543622970581055, + "learning_rate": 9.427104857473773e-06, + "loss": 3.7218, + "step": 18360 + }, + { + "epoch": 0.5381967333186846, + "grad_norm": 12.962395668029785, + "learning_rate": 9.426369576676306e-06, + "loss": 3.7185, + "step": 18370 + }, + { + "epoch": 0.5384897092214166, + "grad_norm": 12.097414016723633, + "learning_rate": 9.425633853046826e-06, + "loss": 3.7309, + "step": 18380 + }, + { + "epoch": 0.5387826851241485, + "grad_norm": 12.712462425231934, + "learning_rate": 9.424897686658933e-06, + "loss": 3.7058, + "step": 18390 + }, + { + "epoch": 0.5390756610268805, + "grad_norm": 12.062601089477539, + "learning_rate": 
9.424161077586279e-06, + "loss": 3.7722, + "step": 18400 + }, + { + "epoch": 0.5393686369296126, + "grad_norm": 11.407721519470215, + "learning_rate": 9.423424025902556e-06, + "loss": 3.7414, + "step": 18410 + }, + { + "epoch": 0.5396616128323446, + "grad_norm": 11.823223114013672, + "learning_rate": 9.422686531681506e-06, + "loss": 3.749, + "step": 18420 + }, + { + "epoch": 0.5399545887350765, + "grad_norm": 13.222211837768555, + "learning_rate": 9.421948594996905e-06, + "loss": 3.738, + "step": 18430 + }, + { + "epoch": 0.5400131839156229, + "eval_bleu": 0.3161871399059885, + "eval_cap_loss": 1.0144855976104736, + "eval_con_loss": 1.4702420234680176, + "eval_loss": 3.9549694061279297, + "step": 18432 + }, + { + "epoch": 0.5400131839156229, + "eval_bleu": 0.3161871399059885, + "eval_cap_loss": 1.0144855976104736, + "eval_con_loss": 1.4702420234680176, + "eval_loss": 3.9549694061279297, + "eval_runtime": 59.729, + "eval_samples_per_second": 334.846, + "eval_steps_per_second": 0.335, + "step": 18432 + }, + { + "epoch": 0.5402475646378085, + "grad_norm": 13.78310489654541, + "learning_rate": 9.421210215922582e-06, + "loss": 3.7443, + "step": 18440 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 12.20008659362793, + "learning_rate": 9.420471394532407e-06, + "loss": 3.7125, + "step": 18450 + }, + { + "epoch": 0.5408335164432725, + "grad_norm": 11.729195594787598, + "learning_rate": 9.419732130900294e-06, + "loss": 3.7252, + "step": 18460 + }, + { + "epoch": 0.5411264923460045, + "grad_norm": 11.4656400680542, + "learning_rate": 9.418992425100202e-06, + "loss": 3.7115, + "step": 18470 + }, + { + "epoch": 0.5414194682487365, + "grad_norm": 12.386667251586914, + "learning_rate": 9.418252277206136e-06, + "loss": 3.7253, + "step": 18480 + }, + { + "epoch": 0.5417124441514686, + "grad_norm": 12.249430656433105, + "learning_rate": 9.417511687292143e-06, + "loss": 3.724, + "step": 18490 + }, + { + "epoch": 0.5420054200542005, + "grad_norm": 11.752549171447754, + 
"learning_rate": 9.416770655432315e-06, + "loss": 3.6953, + "step": 18500 + }, + { + "epoch": 0.5422983959569325, + "grad_norm": 13.226070404052734, + "learning_rate": 9.416029181700785e-06, + "loss": 3.7095, + "step": 18510 + }, + { + "epoch": 0.5425913718596646, + "grad_norm": 12.145491600036621, + "learning_rate": 9.415287266171734e-06, + "loss": 3.7302, + "step": 18520 + }, + { + "epoch": 0.5428843477623966, + "grad_norm": 11.357887268066406, + "learning_rate": 9.414544908919387e-06, + "loss": 3.7149, + "step": 18530 + }, + { + "epoch": 0.5431773236651285, + "grad_norm": 12.814656257629395, + "learning_rate": 9.413802110018013e-06, + "loss": 3.734, + "step": 18540 + }, + { + "epoch": 0.5434702995678605, + "grad_norm": 11.341935157775879, + "learning_rate": 9.413058869541924e-06, + "loss": 3.7179, + "step": 18550 + }, + { + "epoch": 0.5437632754705926, + "grad_norm": 12.684646606445312, + "learning_rate": 9.412315187565479e-06, + "loss": 3.7261, + "step": 18560 + }, + { + "epoch": 0.5440562513733246, + "grad_norm": 12.601358413696289, + "learning_rate": 9.411571064163075e-06, + "loss": 3.7459, + "step": 18570 + }, + { + "epoch": 0.5443492272760565, + "grad_norm": 13.085491180419922, + "learning_rate": 9.410826499409158e-06, + "loss": 3.7679, + "step": 18580 + }, + { + "epoch": 0.5446422031787885, + "grad_norm": 11.82050895690918, + "learning_rate": 9.410081493378221e-06, + "loss": 3.73, + "step": 18590 + }, + { + "epoch": 0.5449351790815206, + "grad_norm": 11.26595401763916, + "learning_rate": 9.409336046144793e-06, + "loss": 3.7408, + "step": 18600 + }, + { + "epoch": 0.5452281549842526, + "grad_norm": 11.363204002380371, + "learning_rate": 9.408590157783452e-06, + "loss": 3.7128, + "step": 18610 + }, + { + "epoch": 0.5455211308869845, + "grad_norm": 13.701397895812988, + "learning_rate": 9.407843828368823e-06, + "loss": 3.7442, + "step": 18620 + }, + { + "epoch": 0.5458141067897165, + "grad_norm": 12.180946350097656, + "learning_rate": 9.40709705797557e-06, + 
"loss": 3.7416, + "step": 18630 + }, + { + "epoch": 0.5461070826924486, + "grad_norm": 12.46260929107666, + "learning_rate": 9.4063498466784e-06, + "loss": 3.7365, + "step": 18640 + }, + { + "epoch": 0.5464000585951806, + "grad_norm": 11.752039909362793, + "learning_rate": 9.405602194552073e-06, + "loss": 3.6783, + "step": 18650 + }, + { + "epoch": 0.5466930344979125, + "grad_norm": 13.761226654052734, + "learning_rate": 9.404854101671382e-06, + "loss": 3.7412, + "step": 18660 + }, + { + "epoch": 0.5469860104006445, + "grad_norm": 13.635090827941895, + "learning_rate": 9.404105568111173e-06, + "loss": 3.7139, + "step": 18670 + }, + { + "epoch": 0.5472789863033766, + "grad_norm": 12.169047355651855, + "learning_rate": 9.403356593946328e-06, + "loss": 3.7021, + "step": 18680 + }, + { + "epoch": 0.5475719622061086, + "grad_norm": 13.116341590881348, + "learning_rate": 9.402607179251782e-06, + "loss": 3.7114, + "step": 18690 + }, + { + "epoch": 0.5478649381088405, + "grad_norm": 13.136465072631836, + "learning_rate": 9.401857324102508e-06, + "loss": 3.7127, + "step": 18700 + }, + { + "epoch": 0.5481579140115725, + "grad_norm": 11.660703659057617, + "learning_rate": 9.401107028573521e-06, + "loss": 3.7374, + "step": 18710 + }, + { + "epoch": 0.5484508899143046, + "grad_norm": 13.457521438598633, + "learning_rate": 9.40035629273989e-06, + "loss": 3.7364, + "step": 18720 + }, + { + "epoch": 0.5487438658170366, + "grad_norm": 12.782115936279297, + "learning_rate": 9.399605116676718e-06, + "loss": 3.6902, + "step": 18730 + }, + { + "epoch": 0.5490368417197685, + "grad_norm": 12.968854904174805, + "learning_rate": 9.398853500459155e-06, + "loss": 3.7211, + "step": 18740 + }, + { + "epoch": 0.5493298176225005, + "grad_norm": 12.522647857666016, + "learning_rate": 9.398101444162395e-06, + "loss": 3.7003, + "step": 18750 + }, + { + "epoch": 0.5496227935252326, + "grad_norm": 12.474125862121582, + "learning_rate": 9.397348947861681e-06, + "loss": 3.7032, + "step": 18760 + }, + { 
+ "epoch": 0.5499157694279645, + "grad_norm": 13.324197769165039, + "learning_rate": 9.396596011632293e-06, + "loss": 3.7203, + "step": 18770 + }, + { + "epoch": 0.5502087453306965, + "grad_norm": 12.310253143310547, + "learning_rate": 9.395842635549558e-06, + "loss": 3.728, + "step": 18780 + }, + { + "epoch": 0.5505017212334286, + "grad_norm": 11.453460693359375, + "learning_rate": 9.395088819688846e-06, + "loss": 3.721, + "step": 18790 + }, + { + "epoch": 0.5507946971361606, + "grad_norm": 11.788787841796875, + "learning_rate": 9.394334564125575e-06, + "loss": 3.7398, + "step": 18800 + }, + { + "epoch": 0.5510876730388925, + "grad_norm": 13.622300148010254, + "learning_rate": 9.3935798689352e-06, + "loss": 3.7237, + "step": 18810 + }, + { + "epoch": 0.5513806489416245, + "grad_norm": 13.199414253234863, + "learning_rate": 9.392824734193225e-06, + "loss": 3.7318, + "step": 18820 + }, + { + "epoch": 0.5516736248443566, + "grad_norm": 12.314921379089355, + "learning_rate": 9.392069159975199e-06, + "loss": 3.6909, + "step": 18830 + }, + { + "epoch": 0.5519666007470886, + "grad_norm": 11.291685104370117, + "learning_rate": 9.391313146356709e-06, + "loss": 3.725, + "step": 18840 + }, + { + "epoch": 0.5522595766498205, + "grad_norm": 12.74377155303955, + "learning_rate": 9.390556693413391e-06, + "loss": 3.7238, + "step": 18850 + }, + { + "epoch": 0.5525525525525525, + "grad_norm": 12.634119033813477, + "learning_rate": 9.389799801220926e-06, + "loss": 3.6978, + "step": 18860 + }, + { + "epoch": 0.5528455284552846, + "grad_norm": 12.394245147705078, + "learning_rate": 9.389042469855033e-06, + "loss": 3.6979, + "step": 18870 + }, + { + "epoch": 0.5531385043580166, + "grad_norm": 12.734098434448242, + "learning_rate": 9.388284699391482e-06, + "loss": 3.7026, + "step": 18880 + }, + { + "epoch": 0.5534314802607485, + "grad_norm": 10.56346321105957, + "learning_rate": 9.38752648990608e-06, + "loss": 3.7326, + "step": 18890 + }, + { + "epoch": 0.5537244561634805, + 
"grad_norm": 11.621086120605469, + "learning_rate": 9.386767841474683e-06, + "loss": 3.7313, + "step": 18900 + }, + { + "epoch": 0.5540174320662126, + "grad_norm": 11.55963134765625, + "learning_rate": 9.386008754173191e-06, + "loss": 3.7307, + "step": 18910 + }, + { + "epoch": 0.5543104079689446, + "grad_norm": 11.130087852478027, + "learning_rate": 9.385249228077541e-06, + "loss": 3.7098, + "step": 18920 + }, + { + "epoch": 0.5546033838716765, + "grad_norm": 12.346723556518555, + "learning_rate": 9.384489263263725e-06, + "loss": 3.733, + "step": 18930 + }, + { + "epoch": 0.5548963597744085, + "grad_norm": 12.546368598937988, + "learning_rate": 9.38372885980777e-06, + "loss": 3.7072, + "step": 18940 + }, + { + "epoch": 0.5550135501355014, + "eval_bleu": 0.3178461818211461, + "eval_cap_loss": 1.0102349519729614, + "eval_con_loss": 1.4634044170379639, + "eval_loss": 3.9370439052581787, + "step": 18944 + }, + { + "epoch": 0.5550135501355014, + "eval_bleu": 0.3178461818211461, + "eval_cap_loss": 1.0102349519729614, + "eval_con_loss": 1.4634044170379639, + "eval_loss": 3.9370439052581787, + "eval_runtime": 55.4033, + "eval_samples_per_second": 360.989, + "eval_steps_per_second": 0.361, + "step": 18944 + }, + { + "epoch": 0.5551893356771406, + "grad_norm": 11.550348281860352, + "learning_rate": 9.382968017785751e-06, + "loss": 3.7141, + "step": 18950 + }, + { + "epoch": 0.5554823115798726, + "grad_norm": 11.703598022460938, + "learning_rate": 9.382206737273784e-06, + "loss": 3.6967, + "step": 18960 + }, + { + "epoch": 0.5557752874826045, + "grad_norm": 11.935429573059082, + "learning_rate": 9.381445018348032e-06, + "loss": 3.7319, + "step": 18970 + }, + { + "epoch": 0.5560682633853365, + "grad_norm": 12.152336120605469, + "learning_rate": 9.380682861084703e-06, + "loss": 3.6964, + "step": 18980 + }, + { + "epoch": 0.5563612392880686, + "grad_norm": 13.230203628540039, + "learning_rate": 9.37992026556004e-06, + "loss": 3.6798, + "step": 18990 + }, + { + "epoch": 
0.5566542151908006, + "grad_norm": 11.52493667602539, + "learning_rate": 9.37915723185034e-06, + "loss": 3.7069, + "step": 19000 + }, + { + "epoch": 0.5569471910935325, + "grad_norm": 11.565300941467285, + "learning_rate": 9.378393760031941e-06, + "loss": 3.7044, + "step": 19010 + }, + { + "epoch": 0.5572401669962646, + "grad_norm": 11.290327072143555, + "learning_rate": 9.37762985018122e-06, + "loss": 3.6876, + "step": 19020 + }, + { + "epoch": 0.5575331428989966, + "grad_norm": 12.274311065673828, + "learning_rate": 9.376865502374607e-06, + "loss": 3.6935, + "step": 19030 + }, + { + "epoch": 0.5578261188017286, + "grad_norm": 12.581189155578613, + "learning_rate": 9.376100716688566e-06, + "loss": 3.7076, + "step": 19040 + }, + { + "epoch": 0.5581190947044605, + "grad_norm": 11.736187934875488, + "learning_rate": 9.375335493199612e-06, + "loss": 3.7211, + "step": 19050 + }, + { + "epoch": 0.5584120706071926, + "grad_norm": 11.709111213684082, + "learning_rate": 9.374569831984298e-06, + "loss": 3.7152, + "step": 19060 + }, + { + "epoch": 0.5587050465099246, + "grad_norm": 12.220419883728027, + "learning_rate": 9.373803733119226e-06, + "loss": 3.7058, + "step": 19070 + }, + { + "epoch": 0.5589980224126566, + "grad_norm": 13.571879386901855, + "learning_rate": 9.373037196681041e-06, + "loss": 3.7128, + "step": 19080 + }, + { + "epoch": 0.5592909983153885, + "grad_norm": 13.272566795349121, + "learning_rate": 9.372270222746427e-06, + "loss": 3.6831, + "step": 19090 + }, + { + "epoch": 0.5595839742181206, + "grad_norm": 12.057178497314453, + "learning_rate": 9.371502811392119e-06, + "loss": 3.7025, + "step": 19100 + }, + { + "epoch": 0.5598769501208526, + "grad_norm": 12.053723335266113, + "learning_rate": 9.370734962694888e-06, + "loss": 3.6897, + "step": 19110 + }, + { + "epoch": 0.5601699260235845, + "grad_norm": 12.18799114227295, + "learning_rate": 9.369966676731554e-06, + "loss": 3.7009, + "step": 19120 + }, + { + "epoch": 0.5604629019263165, + "grad_norm": 
12.969797134399414, + "learning_rate": 9.369197953578982e-06, + "loss": 3.6958, + "step": 19130 + }, + { + "epoch": 0.5607558778290486, + "grad_norm": 12.683464050292969, + "learning_rate": 9.368428793314077e-06, + "loss": 3.6982, + "step": 19140 + }, + { + "epoch": 0.5610488537317806, + "grad_norm": 11.335609436035156, + "learning_rate": 9.367659196013786e-06, + "loss": 3.7058, + "step": 19150 + }, + { + "epoch": 0.5613418296345125, + "grad_norm": 11.525291442871094, + "learning_rate": 9.366889161755105e-06, + "loss": 3.7056, + "step": 19160 + }, + { + "epoch": 0.5616348055372445, + "grad_norm": 12.230759620666504, + "learning_rate": 9.366118690615074e-06, + "loss": 3.7114, + "step": 19170 + }, + { + "epoch": 0.5619277814399766, + "grad_norm": 12.226967811584473, + "learning_rate": 9.36534778267077e-06, + "loss": 3.6924, + "step": 19180 + }, + { + "epoch": 0.5622207573427086, + "grad_norm": 11.236692428588867, + "learning_rate": 9.364576437999317e-06, + "loss": 3.7128, + "step": 19190 + }, + { + "epoch": 0.5625137332454405, + "grad_norm": 10.907571792602539, + "learning_rate": 9.363804656677889e-06, + "loss": 3.6879, + "step": 19200 + }, + { + "epoch": 0.5628067091481725, + "grad_norm": 12.895543098449707, + "learning_rate": 9.363032438783693e-06, + "loss": 3.703, + "step": 19210 + }, + { + "epoch": 0.5630996850509046, + "grad_norm": 12.606322288513184, + "learning_rate": 9.362259784393986e-06, + "loss": 3.6998, + "step": 19220 + }, + { + "epoch": 0.5633926609536366, + "grad_norm": 12.693120002746582, + "learning_rate": 9.361486693586068e-06, + "loss": 3.7134, + "step": 19230 + }, + { + "epoch": 0.5636856368563685, + "grad_norm": 12.872469902038574, + "learning_rate": 9.360713166437282e-06, + "loss": 3.7045, + "step": 19240 + }, + { + "epoch": 0.5639786127591006, + "grad_norm": 14.158641815185547, + "learning_rate": 9.359939203025015e-06, + "loss": 3.7001, + "step": 19250 + }, + { + "epoch": 0.5642715886618326, + "grad_norm": 13.34499454498291, + "learning_rate": 
9.359164803426698e-06, + "loss": 3.7279, + "step": 19260 + }, + { + "epoch": 0.5645645645645646, + "grad_norm": 11.248991012573242, + "learning_rate": 9.358389967719802e-06, + "loss": 3.7047, + "step": 19270 + }, + { + "epoch": 0.5648575404672965, + "grad_norm": 12.39539623260498, + "learning_rate": 9.357614695981851e-06, + "loss": 3.6921, + "step": 19280 + }, + { + "epoch": 0.5651505163700286, + "grad_norm": 12.154510498046875, + "learning_rate": 9.356838988290401e-06, + "loss": 3.7082, + "step": 19290 + }, + { + "epoch": 0.5654434922727606, + "grad_norm": 12.088459968566895, + "learning_rate": 9.356062844723059e-06, + "loss": 3.6884, + "step": 19300 + }, + { + "epoch": 0.5657364681754926, + "grad_norm": 11.856938362121582, + "learning_rate": 9.355286265357472e-06, + "loss": 3.7081, + "step": 19310 + }, + { + "epoch": 0.5660294440782245, + "grad_norm": 11.795360565185547, + "learning_rate": 9.354509250271334e-06, + "loss": 3.7096, + "step": 19320 + }, + { + "epoch": 0.5663224199809566, + "grad_norm": 11.134729385375977, + "learning_rate": 9.353731799542378e-06, + "loss": 3.6845, + "step": 19330 + }, + { + "epoch": 0.5666153958836886, + "grad_norm": 12.291970252990723, + "learning_rate": 9.352953913248387e-06, + "loss": 3.7066, + "step": 19340 + }, + { + "epoch": 0.5669083717864206, + "grad_norm": 12.423921585083008, + "learning_rate": 9.352175591467181e-06, + "loss": 3.6837, + "step": 19350 + }, + { + "epoch": 0.5672013476891525, + "grad_norm": 11.766033172607422, + "learning_rate": 9.35139683427663e-06, + "loss": 3.6854, + "step": 19360 + }, + { + "epoch": 0.5674943235918846, + "grad_norm": 10.61786937713623, + "learning_rate": 9.350617641754641e-06, + "loss": 3.6997, + "step": 19370 + }, + { + "epoch": 0.5677872994946166, + "grad_norm": 11.644923210144043, + "learning_rate": 9.349838013979166e-06, + "loss": 3.6777, + "step": 19380 + }, + { + "epoch": 0.5680802753973486, + "grad_norm": 12.591412544250488, + "learning_rate": 9.349057951028206e-06, + "loss": 
3.6743, + "step": 19390 + }, + { + "epoch": 0.5683732513000805, + "grad_norm": 12.076478958129883, + "learning_rate": 9.348277452979798e-06, + "loss": 3.6945, + "step": 19400 + }, + { + "epoch": 0.5686662272028126, + "grad_norm": 12.244016647338867, + "learning_rate": 9.347496519912032e-06, + "loss": 3.6984, + "step": 19410 + }, + { + "epoch": 0.5689592031055446, + "grad_norm": 11.891851425170898, + "learning_rate": 9.34671515190303e-06, + "loss": 3.6841, + "step": 19420 + }, + { + "epoch": 0.5692521790082765, + "grad_norm": 11.580571174621582, + "learning_rate": 9.345933349030966e-06, + "loss": 3.6805, + "step": 19430 + }, + { + "epoch": 0.5695451549110085, + "grad_norm": 11.364910125732422, + "learning_rate": 9.345151111374054e-06, + "loss": 3.7051, + "step": 19440 + }, + { + "epoch": 0.5698381308137406, + "grad_norm": 13.36971664428711, + "learning_rate": 9.34436843901055e-06, + "loss": 3.6806, + "step": 19450 + }, + { + "epoch": 0.5700139163553798, + "eval_bleu": 0.31726118323744257, + "eval_cap_loss": 1.0107594728469849, + "eval_con_loss": 1.45866060256958, + "eval_loss": 3.9280805587768555, + "step": 19456 + }, + { + "epoch": 0.5700139163553798, + "eval_bleu": 0.31726118323744257, + "eval_cap_loss": 1.0107594728469849, + "eval_con_loss": 1.45866060256958, + "eval_loss": 3.9280805587768555, + "eval_runtime": 62.3859, + "eval_samples_per_second": 320.585, + "eval_steps_per_second": 0.321, + "step": 19456 + }, + { + "epoch": 0.5701311067164726, + "grad_norm": 12.287253379821777, + "learning_rate": 9.34358533201876e-06, + "loss": 3.7221, + "step": 19460 + }, + { + "epoch": 0.5704240826192045, + "grad_norm": 12.451722145080566, + "learning_rate": 9.342801790477027e-06, + "loss": 3.6974, + "step": 19470 + }, + { + "epoch": 0.5707170585219365, + "grad_norm": 12.286231994628906, + "learning_rate": 9.34201781446374e-06, + "loss": 3.7068, + "step": 19480 + }, + { + "epoch": 0.5710100344246686, + "grad_norm": 10.99072265625, + "learning_rate": 9.34123340405733e-06, + 
"loss": 3.681, + "step": 19490 + }, + { + "epoch": 0.5713030103274006, + "grad_norm": 11.874903678894043, + "learning_rate": 9.340448559336273e-06, + "loss": 3.6847, + "step": 19500 + }, + { + "epoch": 0.5715959862301325, + "grad_norm": 11.480705261230469, + "learning_rate": 9.339663280379092e-06, + "loss": 3.692, + "step": 19510 + }, + { + "epoch": 0.5718889621328646, + "grad_norm": 12.475991249084473, + "learning_rate": 9.338877567264341e-06, + "loss": 3.6549, + "step": 19520 + }, + { + "epoch": 0.5721819380355966, + "grad_norm": 12.255062103271484, + "learning_rate": 9.338091420070637e-06, + "loss": 3.6885, + "step": 19530 + }, + { + "epoch": 0.5724749139383286, + "grad_norm": 12.586803436279297, + "learning_rate": 9.337304838876618e-06, + "loss": 3.7042, + "step": 19540 + }, + { + "epoch": 0.5727678898410605, + "grad_norm": 13.611072540283203, + "learning_rate": 9.336517823760986e-06, + "loss": 3.6544, + "step": 19550 + }, + { + "epoch": 0.5730608657437926, + "grad_norm": 11.580041885375977, + "learning_rate": 9.335730374802472e-06, + "loss": 3.6575, + "step": 19560 + }, + { + "epoch": 0.5733538416465246, + "grad_norm": 12.800311088562012, + "learning_rate": 9.334942492079856e-06, + "loss": 3.6551, + "step": 19570 + }, + { + "epoch": 0.5736468175492566, + "grad_norm": 11.869354248046875, + "learning_rate": 9.334154175671964e-06, + "loss": 3.6926, + "step": 19580 + }, + { + "epoch": 0.5739397934519885, + "grad_norm": 12.215054512023926, + "learning_rate": 9.333365425657658e-06, + "loss": 3.7054, + "step": 19590 + }, + { + "epoch": 0.5742327693547206, + "grad_norm": 13.545782089233398, + "learning_rate": 9.332576242115852e-06, + "loss": 3.6871, + "step": 19600 + }, + { + "epoch": 0.5745257452574526, + "grad_norm": 13.413002967834473, + "learning_rate": 9.331786625125497e-06, + "loss": 3.6937, + "step": 19610 + }, + { + "epoch": 0.5748187211601846, + "grad_norm": 10.575613975524902, + "learning_rate": 9.330996574765589e-06, + "loss": 3.687, + "step": 19620 + }, + 
{ + "epoch": 0.5751116970629165, + "grad_norm": 12.264909744262695, + "learning_rate": 9.33020609111517e-06, + "loss": 3.6875, + "step": 19630 + }, + { + "epoch": 0.5754046729656486, + "grad_norm": 12.727313995361328, + "learning_rate": 9.32941517425332e-06, + "loss": 3.6854, + "step": 19640 + }, + { + "epoch": 0.5756976488683806, + "grad_norm": 11.011251449584961, + "learning_rate": 9.32862382425917e-06, + "loss": 3.6849, + "step": 19650 + }, + { + "epoch": 0.5759906247711126, + "grad_norm": 12.46858024597168, + "learning_rate": 9.327832041211883e-06, + "loss": 3.6628, + "step": 19660 + }, + { + "epoch": 0.5762836006738445, + "grad_norm": 13.320219993591309, + "learning_rate": 9.32703982519068e-06, + "loss": 3.6762, + "step": 19670 + }, + { + "epoch": 0.5765765765765766, + "grad_norm": 11.114240646362305, + "learning_rate": 9.326247176274815e-06, + "loss": 3.6879, + "step": 19680 + }, + { + "epoch": 0.5768695524793086, + "grad_norm": 12.371867179870605, + "learning_rate": 9.325454094543583e-06, + "loss": 3.6794, + "step": 19690 + }, + { + "epoch": 0.5771625283820406, + "grad_norm": 13.167166709899902, + "learning_rate": 9.324660580076332e-06, + "loss": 3.6673, + "step": 19700 + }, + { + "epoch": 0.5774555042847725, + "grad_norm": 13.96160888671875, + "learning_rate": 9.32386663295245e-06, + "loss": 3.6697, + "step": 19710 + }, + { + "epoch": 0.5777484801875046, + "grad_norm": 11.380837440490723, + "learning_rate": 9.323072253251361e-06, + "loss": 3.6779, + "step": 19720 + }, + { + "epoch": 0.5780414560902366, + "grad_norm": 11.755537033081055, + "learning_rate": 9.322277441052542e-06, + "loss": 3.6749, + "step": 19730 + }, + { + "epoch": 0.5783344319929685, + "grad_norm": 11.155204772949219, + "learning_rate": 9.321482196435507e-06, + "loss": 3.6757, + "step": 19740 + }, + { + "epoch": 0.5786274078957006, + "grad_norm": 11.928263664245605, + "learning_rate": 9.32068651947982e-06, + "loss": 3.6918, + "step": 19750 + }, + { + "epoch": 0.5789203837984326, + 
"grad_norm": 12.875592231750488, + "learning_rate": 9.319890410265078e-06, + "loss": 3.6565, + "step": 19760 + }, + { + "epoch": 0.5792133597011646, + "grad_norm": 12.68001937866211, + "learning_rate": 9.319093868870928e-06, + "loss": 3.6798, + "step": 19770 + }, + { + "epoch": 0.5795063356038965, + "grad_norm": 11.756572723388672, + "learning_rate": 9.318296895377064e-06, + "loss": 3.6688, + "step": 19780 + }, + { + "epoch": 0.5797993115066286, + "grad_norm": 11.354010581970215, + "learning_rate": 9.317499489863213e-06, + "loss": 3.6705, + "step": 19790 + }, + { + "epoch": 0.5800922874093606, + "grad_norm": 12.283101081848145, + "learning_rate": 9.316701652409153e-06, + "loss": 3.6556, + "step": 19800 + }, + { + "epoch": 0.5803852633120926, + "grad_norm": 12.482980728149414, + "learning_rate": 9.315903383094704e-06, + "loss": 3.6822, + "step": 19810 + }, + { + "epoch": 0.5806782392148245, + "grad_norm": 11.72366714477539, + "learning_rate": 9.315104681999725e-06, + "loss": 3.6856, + "step": 19820 + }, + { + "epoch": 0.5809712151175566, + "grad_norm": 12.604576110839844, + "learning_rate": 9.314305549204125e-06, + "loss": 3.7019, + "step": 19830 + }, + { + "epoch": 0.5812641910202886, + "grad_norm": 11.171608924865723, + "learning_rate": 9.31350598478785e-06, + "loss": 3.6741, + "step": 19840 + }, + { + "epoch": 0.5815571669230206, + "grad_norm": 12.60439395904541, + "learning_rate": 9.312705988830893e-06, + "loss": 3.6678, + "step": 19850 + }, + { + "epoch": 0.5818501428257525, + "grad_norm": 13.851433753967285, + "learning_rate": 9.311905561413288e-06, + "loss": 3.6554, + "step": 19860 + }, + { + "epoch": 0.5821431187284846, + "grad_norm": 12.76992416381836, + "learning_rate": 9.311104702615113e-06, + "loss": 3.6681, + "step": 19870 + }, + { + "epoch": 0.5824360946312166, + "grad_norm": 12.430328369140625, + "learning_rate": 9.31030341251649e-06, + "loss": 3.6487, + "step": 19880 + }, + { + "epoch": 0.5827290705339486, + "grad_norm": 13.05876636505127, + 
"learning_rate": 9.309501691197583e-06, + "loss": 3.6681, + "step": 19890 + }, + { + "epoch": 0.5830220464366805, + "grad_norm": 11.855293273925781, + "learning_rate": 9.308699538738598e-06, + "loss": 3.6902, + "step": 19900 + }, + { + "epoch": 0.5833150223394126, + "grad_norm": 10.999730110168457, + "learning_rate": 9.307896955219787e-06, + "loss": 3.6859, + "step": 19910 + }, + { + "epoch": 0.5836079982421446, + "grad_norm": 12.29326343536377, + "learning_rate": 9.307093940721444e-06, + "loss": 3.684, + "step": 19920 + }, + { + "epoch": 0.5839009741448766, + "grad_norm": 13.782124519348145, + "learning_rate": 9.306290495323904e-06, + "loss": 3.6872, + "step": 19930 + }, + { + "epoch": 0.5841939500476085, + "grad_norm": 12.460262298583984, + "learning_rate": 9.30548661910755e-06, + "loss": 3.6458, + "step": 19940 + }, + { + "epoch": 0.5844869259503406, + "grad_norm": 12.165443420410156, + "learning_rate": 9.304762762229212e-06, + "loss": 3.6733, + "step": 19950 + }, + { + "epoch": 0.5847799018530726, + "grad_norm": 12.441862106323242, + "learning_rate": 9.303958067678708e-06, + "loss": 3.6771, + "step": 19960 + }, + { + "epoch": 0.5850142825752582, + "eval_bleu": 0.32022092831422627, + "eval_cap_loss": 1.0045104026794434, + "eval_con_loss": 1.4381303787231445, + "eval_loss": 3.8807711601257324, + "step": 19968 + }, + { + "epoch": 0.5850142825752582, + "eval_bleu": 0.32022092831422627, + "eval_cap_loss": 1.0045104026794434, + "eval_con_loss": 1.4381303787231445, + "eval_loss": 3.8807711601257324, + "eval_runtime": 54.4256, + "eval_samples_per_second": 367.474, + "eval_steps_per_second": 0.367, + "step": 19968 + }, + { + "epoch": 0.5850728777558046, + "grad_norm": 12.201432228088379, + "learning_rate": 9.303152942542735e-06, + "loss": 3.7014, + "step": 19970 + }, + { + "epoch": 0.5853658536585366, + "grad_norm": 11.18786334991455, + "learning_rate": 9.302347386901839e-06, + "loss": 3.6725, + "step": 19980 + }, + { + "epoch": 0.5856588295612686, + "grad_norm": 
11.972962379455566, + "learning_rate": 9.301541400836612e-06, + "loss": 3.6662, + "step": 19990 + }, + { + "epoch": 0.5859518054640006, + "grad_norm": 12.112826347351074, + "learning_rate": 9.300734984427689e-06, + "loss": 3.6879, + "step": 20000 + }, + { + "epoch": 0.5862447813667326, + "grad_norm": 12.868903160095215, + "learning_rate": 9.299928137755745e-06, + "loss": 3.6757, + "step": 20010 + }, + { + "epoch": 0.5865377572694646, + "grad_norm": 12.41089153289795, + "learning_rate": 9.299120860901501e-06, + "loss": 3.6701, + "step": 20020 + }, + { + "epoch": 0.5868307331721966, + "grad_norm": 13.6754732131958, + "learning_rate": 9.298313153945722e-06, + "loss": 3.6723, + "step": 20030 + }, + { + "epoch": 0.5871237090749286, + "grad_norm": 11.452674865722656, + "learning_rate": 9.297505016969213e-06, + "loss": 3.6522, + "step": 20040 + }, + { + "epoch": 0.5874166849776606, + "grad_norm": 12.108133316040039, + "learning_rate": 9.296696450052823e-06, + "loss": 3.6532, + "step": 20050 + }, + { + "epoch": 0.5877096608803926, + "grad_norm": 12.817689895629883, + "learning_rate": 9.295887453277443e-06, + "loss": 3.6432, + "step": 20060 + }, + { + "epoch": 0.5880026367831246, + "grad_norm": 12.16918659210205, + "learning_rate": 9.295078026724011e-06, + "loss": 3.6688, + "step": 20070 + }, + { + "epoch": 0.5882956126858566, + "grad_norm": 12.849663734436035, + "learning_rate": 9.294268170473505e-06, + "loss": 3.6663, + "step": 20080 + }, + { + "epoch": 0.5885885885885885, + "grad_norm": 11.897428512573242, + "learning_rate": 9.293457884606945e-06, + "loss": 3.6382, + "step": 20090 + }, + { + "epoch": 0.5888815644913206, + "grad_norm": 11.82168960571289, + "learning_rate": 9.292647169205395e-06, + "loss": 3.6724, + "step": 20100 + }, + { + "epoch": 0.5891745403940526, + "grad_norm": 11.885394096374512, + "learning_rate": 9.291836024349964e-06, + "loss": 3.6492, + "step": 20110 + }, + { + "epoch": 0.5894675162967846, + "grad_norm": 11.791131973266602, + "learning_rate": 
9.2910244501218e-06, + "loss": 3.6909, + "step": 20120 + }, + { + "epoch": 0.5897604921995165, + "grad_norm": 10.793614387512207, + "learning_rate": 9.2902124466021e-06, + "loss": 3.6528, + "step": 20130 + }, + { + "epoch": 0.5900534681022486, + "grad_norm": 13.295828819274902, + "learning_rate": 9.289400013872097e-06, + "loss": 3.6731, + "step": 20140 + }, + { + "epoch": 0.5903464440049806, + "grad_norm": 12.320313453674316, + "learning_rate": 9.288587152013072e-06, + "loss": 3.6488, + "step": 20150 + }, + { + "epoch": 0.5906394199077126, + "grad_norm": 12.421635627746582, + "learning_rate": 9.287773861106343e-06, + "loss": 3.6352, + "step": 20160 + }, + { + "epoch": 0.5909323958104445, + "grad_norm": 12.253501892089844, + "learning_rate": 9.286960141233279e-06, + "loss": 3.6518, + "step": 20170 + }, + { + "epoch": 0.5912253717131766, + "grad_norm": 10.98945426940918, + "learning_rate": 9.286145992475286e-06, + "loss": 3.6624, + "step": 20180 + }, + { + "epoch": 0.5915183476159086, + "grad_norm": 12.923944473266602, + "learning_rate": 9.285331414913816e-06, + "loss": 3.6333, + "step": 20190 + }, + { + "epoch": 0.5918113235186406, + "grad_norm": 12.987103462219238, + "learning_rate": 9.284516408630362e-06, + "loss": 3.6722, + "step": 20200 + }, + { + "epoch": 0.5921042994213725, + "grad_norm": 11.012072563171387, + "learning_rate": 9.283700973706459e-06, + "loss": 3.6615, + "step": 20210 + }, + { + "epoch": 0.5923972753241046, + "grad_norm": 11.005080223083496, + "learning_rate": 9.282885110223689e-06, + "loss": 3.6677, + "step": 20220 + }, + { + "epoch": 0.5926902512268366, + "grad_norm": 12.741060256958008, + "learning_rate": 9.282068818263674e-06, + "loss": 3.6897, + "step": 20230 + }, + { + "epoch": 0.5929832271295686, + "grad_norm": 12.707334518432617, + "learning_rate": 9.281252097908078e-06, + "loss": 3.6619, + "step": 20240 + }, + { + "epoch": 0.5932762030323006, + "grad_norm": 11.913028717041016, + "learning_rate": 9.280434949238609e-06, + "loss": 3.6786, 
+ "step": 20250 + }, + { + "epoch": 0.5935691789350326, + "grad_norm": 12.709822654724121, + "learning_rate": 9.279617372337018e-06, + "loss": 3.6526, + "step": 20260 + }, + { + "epoch": 0.5938621548377646, + "grad_norm": 11.840387344360352, + "learning_rate": 9.278799367285098e-06, + "loss": 3.6492, + "step": 20270 + }, + { + "epoch": 0.5941551307404966, + "grad_norm": 11.17848014831543, + "learning_rate": 9.277980934164689e-06, + "loss": 3.6383, + "step": 20280 + }, + { + "epoch": 0.5944481066432286, + "grad_norm": 12.12531566619873, + "learning_rate": 9.277162073057667e-06, + "loss": 3.6352, + "step": 20290 + }, + { + "epoch": 0.5947410825459606, + "grad_norm": 13.658660888671875, + "learning_rate": 9.276342784045954e-06, + "loss": 3.6507, + "step": 20300 + }, + { + "epoch": 0.5950340584486926, + "grad_norm": 12.33107852935791, + "learning_rate": 9.275523067211516e-06, + "loss": 3.6181, + "step": 20310 + }, + { + "epoch": 0.5953270343514246, + "grad_norm": 11.637406349182129, + "learning_rate": 9.274702922636362e-06, + "loss": 3.6288, + "step": 20320 + }, + { + "epoch": 0.5956200102541566, + "grad_norm": 13.579793930053711, + "learning_rate": 9.273882350402541e-06, + "loss": 3.6415, + "step": 20330 + }, + { + "epoch": 0.5959129861568886, + "grad_norm": 11.346091270446777, + "learning_rate": 9.273061350592149e-06, + "loss": 3.6676, + "step": 20340 + }, + { + "epoch": 0.5962059620596206, + "grad_norm": 12.196907997131348, + "learning_rate": 9.272239923287318e-06, + "loss": 3.6959, + "step": 20350 + }, + { + "epoch": 0.5964989379623526, + "grad_norm": 12.703727722167969, + "learning_rate": 9.27141806857023e-06, + "loss": 3.6667, + "step": 20360 + }, + { + "epoch": 0.5967919138650846, + "grad_norm": 11.83812141418457, + "learning_rate": 9.270595786523104e-06, + "loss": 3.6743, + "step": 20370 + }, + { + "epoch": 0.5970848897678166, + "grad_norm": 13.267533302307129, + "learning_rate": 9.269773077228209e-06, + "loss": 3.6486, + "step": 20380 + }, + { + "epoch": 
0.5973778656705486, + "grad_norm": 11.889276504516602, + "learning_rate": 9.268949940767847e-06, + "loss": 3.6362, + "step": 20390 + }, + { + "epoch": 0.5976708415732805, + "grad_norm": 12.96410083770752, + "learning_rate": 9.268126377224371e-06, + "loss": 3.6286, + "step": 20400 + }, + { + "epoch": 0.5979638174760126, + "grad_norm": 11.716621398925781, + "learning_rate": 9.267302386680172e-06, + "loss": 3.6302, + "step": 20410 + }, + { + "epoch": 0.5982567933787446, + "grad_norm": 13.96494197845459, + "learning_rate": 9.266477969217686e-06, + "loss": 3.6333, + "step": 20420 + }, + { + "epoch": 0.5985497692814766, + "grad_norm": 12.223847389221191, + "learning_rate": 9.265653124919394e-06, + "loss": 3.6512, + "step": 20430 + }, + { + "epoch": 0.5988427451842085, + "grad_norm": 12.67033863067627, + "learning_rate": 9.264827853867812e-06, + "loss": 3.6547, + "step": 20440 + }, + { + "epoch": 0.5991357210869406, + "grad_norm": 12.962199211120605, + "learning_rate": 9.264002156145506e-06, + "loss": 3.6498, + "step": 20450 + }, + { + "epoch": 0.5994286969896726, + "grad_norm": 11.710027694702148, + "learning_rate": 9.263176031835082e-06, + "loss": 3.6826, + "step": 20460 + }, + { + "epoch": 0.5997216728924046, + "grad_norm": 11.347014427185059, + "learning_rate": 9.262349481019187e-06, + "loss": 3.652, + "step": 20470 + }, + { + "epoch": 0.6000146487951366, + "grad_norm": 12.297945976257324, + "learning_rate": 9.261522503780517e-06, + "loss": 3.6503, + "step": 20480 + }, + { + "epoch": 0.6000146487951366, + "eval_bleu": 0.3195333184028556, + "eval_cap_loss": 1.0012454986572266, + "eval_con_loss": 1.428514838218689, + "eval_loss": 3.8582754135131836, + "step": 20480 + }, + { + "epoch": 0.6000146487951366, + "eval_bleu": 0.3195333184028556, + "eval_cap_loss": 1.0012454986572266, + "eval_con_loss": 1.428514838218689, + "eval_loss": 3.8582754135131836, + "eval_runtime": 56.5698, + "eval_samples_per_second": 353.546, + "eval_steps_per_second": 0.354, + "step": 20480 + }, + { 
+ "epoch": 0.6003076246978686, + "grad_norm": 12.244312286376953, + "learning_rate": 9.260695100201801e-06, + "loss": 3.6136, + "step": 20490 + }, + { + "epoch": 0.6006006006006006, + "grad_norm": 12.633452415466309, + "learning_rate": 9.25986727036582e-06, + "loss": 3.6347, + "step": 20500 + }, + { + "epoch": 0.6008935765033326, + "grad_norm": 10.428262710571289, + "learning_rate": 9.259039014355388e-06, + "loss": 3.6608, + "step": 20510 + }, + { + "epoch": 0.6011865524060646, + "grad_norm": 12.299054145812988, + "learning_rate": 9.258210332253374e-06, + "loss": 3.647, + "step": 20520 + }, + { + "epoch": 0.6014795283087966, + "grad_norm": 10.863203048706055, + "learning_rate": 9.257381224142679e-06, + "loss": 3.6344, + "step": 20530 + }, + { + "epoch": 0.6017725042115286, + "grad_norm": 12.184762001037598, + "learning_rate": 9.25655169010625e-06, + "loss": 3.6463, + "step": 20540 + }, + { + "epoch": 0.6020654801142606, + "grad_norm": 11.954679489135742, + "learning_rate": 9.255721730227076e-06, + "loss": 3.6534, + "step": 20550 + }, + { + "epoch": 0.6023584560169926, + "grad_norm": 12.836435317993164, + "learning_rate": 9.254891344588192e-06, + "loss": 3.6566, + "step": 20560 + }, + { + "epoch": 0.6026514319197246, + "grad_norm": 11.634071350097656, + "learning_rate": 9.254060533272674e-06, + "loss": 3.6593, + "step": 20570 + }, + { + "epoch": 0.6029444078224566, + "grad_norm": 13.392931938171387, + "learning_rate": 9.253229296363637e-06, + "loss": 3.6524, + "step": 20580 + }, + { + "epoch": 0.6032373837251886, + "grad_norm": 11.901637077331543, + "learning_rate": 9.252397633944243e-06, + "loss": 3.6423, + "step": 20590 + }, + { + "epoch": 0.6035303596279206, + "grad_norm": 11.204300880432129, + "learning_rate": 9.251565546097693e-06, + "loss": 3.6352, + "step": 20600 + }, + { + "epoch": 0.6038233355306526, + "grad_norm": 12.280657768249512, + "learning_rate": 9.250733032907236e-06, + "loss": 3.6265, + "step": 20610 + }, + { + "epoch": 0.6041163114333846, + 
"grad_norm": 11.942256927490234, + "learning_rate": 9.249900094456157e-06, + "loss": 3.6717, + "step": 20620 + }, + { + "epoch": 0.6044092873361167, + "grad_norm": 11.202262878417969, + "learning_rate": 9.249066730827787e-06, + "loss": 3.636, + "step": 20630 + }, + { + "epoch": 0.6047022632388486, + "grad_norm": 11.289079666137695, + "learning_rate": 9.248232942105499e-06, + "loss": 3.6323, + "step": 20640 + }, + { + "epoch": 0.6049952391415806, + "grad_norm": 13.243307113647461, + "learning_rate": 9.24739872837271e-06, + "loss": 3.6543, + "step": 20650 + }, + { + "epoch": 0.6052882150443126, + "grad_norm": 11.3878812789917, + "learning_rate": 9.246564089712877e-06, + "loss": 3.6621, + "step": 20660 + }, + { + "epoch": 0.6055811909470447, + "grad_norm": 12.326394081115723, + "learning_rate": 9.2457290262095e-06, + "loss": 3.6205, + "step": 20670 + }, + { + "epoch": 0.6058741668497766, + "grad_norm": 11.634307861328125, + "learning_rate": 9.244893537946123e-06, + "loss": 3.664, + "step": 20680 + }, + { + "epoch": 0.6061671427525086, + "grad_norm": 13.576642990112305, + "learning_rate": 9.244057625006332e-06, + "loss": 3.631, + "step": 20690 + }, + { + "epoch": 0.6064601186552406, + "grad_norm": 11.611746788024902, + "learning_rate": 9.243221287473755e-06, + "loss": 3.6513, + "step": 20700 + }, + { + "epoch": 0.6067530945579727, + "grad_norm": 11.92854118347168, + "learning_rate": 9.242384525432062e-06, + "loss": 3.644, + "step": 20710 + }, + { + "epoch": 0.6070460704607046, + "grad_norm": 12.210738182067871, + "learning_rate": 9.241547338964967e-06, + "loss": 3.6084, + "step": 20720 + }, + { + "epoch": 0.6073390463634366, + "grad_norm": 11.947059631347656, + "learning_rate": 9.240709728156226e-06, + "loss": 3.6536, + "step": 20730 + }, + { + "epoch": 0.6076320222661686, + "grad_norm": 11.654147148132324, + "learning_rate": 9.239871693089634e-06, + "loss": 3.6269, + "step": 20740 + }, + { + "epoch": 0.6079249981689006, + "grad_norm": 12.169292449951172, + 
"learning_rate": 9.239033233849033e-06, + "loss": 3.6296, + "step": 20750 + }, + { + "epoch": 0.6082179740716326, + "grad_norm": 11.723299026489258, + "learning_rate": 9.238194350518308e-06, + "loss": 3.6355, + "step": 20760 + }, + { + "epoch": 0.6085109499743646, + "grad_norm": 12.083683013916016, + "learning_rate": 9.237355043181381e-06, + "loss": 3.6665, + "step": 20770 + }, + { + "epoch": 0.6088039258770966, + "grad_norm": 12.60084056854248, + "learning_rate": 9.236515311922222e-06, + "loss": 3.6512, + "step": 20780 + }, + { + "epoch": 0.6090969017798286, + "grad_norm": 12.647174835205078, + "learning_rate": 9.235675156824842e-06, + "loss": 3.6615, + "step": 20790 + }, + { + "epoch": 0.6093898776825606, + "grad_norm": 12.15182113647461, + "learning_rate": 9.23483457797329e-06, + "loss": 3.6369, + "step": 20800 + }, + { + "epoch": 0.6096828535852926, + "grad_norm": 11.744132995605469, + "learning_rate": 9.233993575451663e-06, + "loss": 3.6328, + "step": 20810 + }, + { + "epoch": 0.6099758294880246, + "grad_norm": 11.581949234008789, + "learning_rate": 9.2331521493441e-06, + "loss": 3.6321, + "step": 20820 + }, + { + "epoch": 0.6102688053907566, + "grad_norm": 12.125349998474121, + "learning_rate": 9.232310299734777e-06, + "loss": 3.6385, + "step": 20830 + }, + { + "epoch": 0.6105617812934886, + "grad_norm": 12.889867782592773, + "learning_rate": 9.231468026707918e-06, + "loss": 3.6264, + "step": 20840 + }, + { + "epoch": 0.6108547571962206, + "grad_norm": 11.773714065551758, + "learning_rate": 9.230625330347787e-06, + "loss": 3.6069, + "step": 20850 + }, + { + "epoch": 0.6111477330989527, + "grad_norm": 12.397059440612793, + "learning_rate": 9.229782210738692e-06, + "loss": 3.6161, + "step": 20860 + }, + { + "epoch": 0.6114407090016846, + "grad_norm": 11.431992530822754, + "learning_rate": 9.228938667964981e-06, + "loss": 3.6432, + "step": 20870 + }, + { + "epoch": 0.6117336849044166, + "grad_norm": 12.197656631469727, + "learning_rate": 9.228094702111047e-06, + 
"loss": 3.6521, + "step": 20880 + }, + { + "epoch": 0.6120266608071486, + "grad_norm": 13.042503356933594, + "learning_rate": 9.227250313261321e-06, + "loss": 3.6671, + "step": 20890 + }, + { + "epoch": 0.6123196367098807, + "grad_norm": 10.397649765014648, + "learning_rate": 9.226405501500282e-06, + "loss": 3.595, + "step": 20900 + }, + { + "epoch": 0.6126126126126126, + "grad_norm": 12.071754455566406, + "learning_rate": 9.225560266912446e-06, + "loss": 3.6429, + "step": 20910 + }, + { + "epoch": 0.6129055885153446, + "grad_norm": 11.059619903564453, + "learning_rate": 9.224714609582375e-06, + "loss": 3.6123, + "step": 20920 + }, + { + "epoch": 0.6131985644180766, + "grad_norm": 10.756893157958984, + "learning_rate": 9.223868529594671e-06, + "loss": 3.632, + "step": 20930 + }, + { + "epoch": 0.6134915403208087, + "grad_norm": 12.069351196289062, + "learning_rate": 9.22302202703398e-06, + "loss": 3.6355, + "step": 20940 + }, + { + "epoch": 0.6137845162235406, + "grad_norm": 11.944613456726074, + "learning_rate": 9.222175101984989e-06, + "loss": 3.64, + "step": 20950 + }, + { + "epoch": 0.6140774921262726, + "grad_norm": 11.713436126708984, + "learning_rate": 9.22132775453243e-06, + "loss": 3.6321, + "step": 20960 + }, + { + "epoch": 0.6143704680290046, + "grad_norm": 11.210960388183594, + "learning_rate": 9.220479984761072e-06, + "loss": 3.5971, + "step": 20970 + }, + { + "epoch": 0.6146634439317367, + "grad_norm": 11.892902374267578, + "learning_rate": 9.219631792755732e-06, + "loss": 3.6165, + "step": 20980 + }, + { + "epoch": 0.6149564198344686, + "grad_norm": 12.963580131530762, + "learning_rate": 9.218783178601265e-06, + "loss": 3.6634, + "step": 20990 + }, + { + "epoch": 0.615015015015015, + "eval_bleu": 0.3215032538021188, + "eval_cap_loss": 1.000899076461792, + "eval_con_loss": 1.424477219581604, + "eval_loss": 3.849853515625, + "step": 20992 + }, + { + "epoch": 0.615015015015015, + "eval_bleu": 0.3215032538021188, + "eval_cap_loss": 1.000899076461792, + 
"eval_con_loss": 1.424477219581604, + "eval_loss": 3.849853515625, + "eval_runtime": 56.2129, + "eval_samples_per_second": 355.79, + "eval_steps_per_second": 0.356, + "step": 20992 + }, + { + "epoch": 0.6152493957372006, + "grad_norm": 12.543732643127441, + "learning_rate": 9.217934142382569e-06, + "loss": 3.6146, + "step": 21000 + }, + { + "epoch": 0.6155423716399326, + "grad_norm": 11.927351951599121, + "learning_rate": 9.217084684184588e-06, + "loss": 3.6166, + "step": 21010 + }, + { + "epoch": 0.6158353475426647, + "grad_norm": 11.568089485168457, + "learning_rate": 9.216234804092302e-06, + "loss": 3.6055, + "step": 21020 + }, + { + "epoch": 0.6161283234453966, + "grad_norm": 11.599075317382812, + "learning_rate": 9.215384502190738e-06, + "loss": 3.6101, + "step": 21030 + }, + { + "epoch": 0.6164212993481286, + "grad_norm": 12.428668975830078, + "learning_rate": 9.214533778564965e-06, + "loss": 3.6306, + "step": 21040 + }, + { + "epoch": 0.6167142752508606, + "grad_norm": 11.721837043762207, + "learning_rate": 9.213682633300089e-06, + "loss": 3.6184, + "step": 21050 + }, + { + "epoch": 0.6170072511535926, + "grad_norm": 13.001051902770996, + "learning_rate": 9.212831066481267e-06, + "loss": 3.6158, + "step": 21060 + }, + { + "epoch": 0.6173002270563246, + "grad_norm": 12.545944213867188, + "learning_rate": 9.211979078193689e-06, + "loss": 3.6152, + "step": 21070 + }, + { + "epoch": 0.6175932029590566, + "grad_norm": 11.943947792053223, + "learning_rate": 9.211126668522593e-06, + "loss": 3.6396, + "step": 21080 + }, + { + "epoch": 0.6178861788617886, + "grad_norm": 13.08146858215332, + "learning_rate": 9.21027383755326e-06, + "loss": 3.6094, + "step": 21090 + }, + { + "epoch": 0.6181791547645206, + "grad_norm": 11.68384838104248, + "learning_rate": 9.209420585371005e-06, + "loss": 3.6427, + "step": 21100 + }, + { + "epoch": 0.6184721306672526, + "grad_norm": 12.743232727050781, + "learning_rate": 9.208566912061197e-06, + "loss": 3.6271, + "step": 21110 + }, + { 
+ "epoch": 0.6187651065699846, + "grad_norm": 11.323323249816895, + "learning_rate": 9.207712817709237e-06, + "loss": 3.6222, + "step": 21120 + }, + { + "epoch": 0.6190580824727167, + "grad_norm": 11.694019317626953, + "learning_rate": 9.206858302400573e-06, + "loss": 3.6272, + "step": 21130 + }, + { + "epoch": 0.6193510583754486, + "grad_norm": 11.199543952941895, + "learning_rate": 9.206003366220693e-06, + "loss": 3.6419, + "step": 21140 + }, + { + "epoch": 0.6196440342781806, + "grad_norm": 12.101473808288574, + "learning_rate": 9.205148009255132e-06, + "loss": 3.6341, + "step": 21150 + }, + { + "epoch": 0.6199370101809126, + "grad_norm": 12.12792682647705, + "learning_rate": 9.204292231589457e-06, + "loss": 3.629, + "step": 21160 + }, + { + "epoch": 0.6202299860836447, + "grad_norm": 11.80396556854248, + "learning_rate": 9.203436033309292e-06, + "loss": 3.6173, + "step": 21170 + }, + { + "epoch": 0.6205229619863766, + "grad_norm": 12.644844055175781, + "learning_rate": 9.202579414500287e-06, + "loss": 3.6132, + "step": 21180 + }, + { + "epoch": 0.6208159378891086, + "grad_norm": 11.702146530151367, + "learning_rate": 9.201722375248146e-06, + "loss": 3.5783, + "step": 21190 + }, + { + "epoch": 0.6211089137918406, + "grad_norm": 12.039628028869629, + "learning_rate": 9.200864915638608e-06, + "loss": 3.6379, + "step": 21200 + }, + { + "epoch": 0.6214018896945727, + "grad_norm": 11.779828071594238, + "learning_rate": 9.20000703575746e-06, + "loss": 3.6297, + "step": 21210 + }, + { + "epoch": 0.6216948655973046, + "grad_norm": 11.873653411865234, + "learning_rate": 9.199148735690523e-06, + "loss": 3.6259, + "step": 21220 + }, + { + "epoch": 0.6219878415000366, + "grad_norm": 11.242782592773438, + "learning_rate": 9.198290015523672e-06, + "loss": 3.6276, + "step": 21230 + }, + { + "epoch": 0.6222808174027686, + "grad_norm": 12.194059371948242, + "learning_rate": 9.19743087534281e-06, + "loss": 3.6068, + "step": 21240 + }, + { + "epoch": 0.6225737933055007, + 
"grad_norm": 11.609164237976074, + "learning_rate": 9.19657131523389e-06, + "loss": 3.6093, + "step": 21250 + }, + { + "epoch": 0.6228667692082326, + "grad_norm": 11.491744995117188, + "learning_rate": 9.195711335282909e-06, + "loss": 3.624, + "step": 21260 + }, + { + "epoch": 0.6231597451109646, + "grad_norm": 11.640286445617676, + "learning_rate": 9.194850935575902e-06, + "loss": 3.632, + "step": 21270 + }, + { + "epoch": 0.6234527210136966, + "grad_norm": 11.731718063354492, + "learning_rate": 9.193990116198946e-06, + "loss": 3.619, + "step": 21280 + }, + { + "epoch": 0.6237456969164287, + "grad_norm": 11.25540542602539, + "learning_rate": 9.193128877238159e-06, + "loss": 3.601, + "step": 21290 + }, + { + "epoch": 0.6240386728191606, + "grad_norm": 11.142794609069824, + "learning_rate": 9.192267218779706e-06, + "loss": 3.6157, + "step": 21300 + }, + { + "epoch": 0.6243316487218926, + "grad_norm": 13.453524589538574, + "learning_rate": 9.191405140909789e-06, + "loss": 3.6018, + "step": 21310 + }, + { + "epoch": 0.6246246246246246, + "grad_norm": 12.304743766784668, + "learning_rate": 9.190542643714655e-06, + "loss": 3.6221, + "step": 21320 + }, + { + "epoch": 0.6249176005273567, + "grad_norm": 12.29442024230957, + "learning_rate": 9.189679727280592e-06, + "loss": 3.6403, + "step": 21330 + }, + { + "epoch": 0.6252105764300886, + "grad_norm": 11.77631664276123, + "learning_rate": 9.188816391693928e-06, + "loss": 3.6132, + "step": 21340 + }, + { + "epoch": 0.6255035523328206, + "grad_norm": 11.280939102172852, + "learning_rate": 9.187952637041036e-06, + "loss": 3.6089, + "step": 21350 + }, + { + "epoch": 0.6257965282355527, + "grad_norm": 11.543425559997559, + "learning_rate": 9.187088463408326e-06, + "loss": 3.605, + "step": 21360 + }, + { + "epoch": 0.6260895041382847, + "grad_norm": 11.036011695861816, + "learning_rate": 9.18622387088226e-06, + "loss": 3.5895, + "step": 21370 + }, + { + "epoch": 0.6263824800410166, + "grad_norm": 13.40429401397705, + 
"learning_rate": 9.18535885954933e-06, + "loss": 3.6354, + "step": 21380 + }, + { + "epoch": 0.6266754559437486, + "grad_norm": 11.441117286682129, + "learning_rate": 9.184493429496078e-06, + "loss": 3.6181, + "step": 21390 + }, + { + "epoch": 0.6269684318464807, + "grad_norm": 10.900825500488281, + "learning_rate": 9.183627580809083e-06, + "loss": 3.6178, + "step": 21400 + }, + { + "epoch": 0.6272614077492126, + "grad_norm": 10.824607849121094, + "learning_rate": 9.18276131357497e-06, + "loss": 3.6192, + "step": 21410 + }, + { + "epoch": 0.6275543836519446, + "grad_norm": 12.112313270568848, + "learning_rate": 9.181894627880402e-06, + "loss": 3.6, + "step": 21420 + }, + { + "epoch": 0.6278473595546766, + "grad_norm": 10.622360229492188, + "learning_rate": 9.181027523812088e-06, + "loss": 3.6066, + "step": 21430 + }, + { + "epoch": 0.6281403354574087, + "grad_norm": 12.055435180664062, + "learning_rate": 9.180160001456774e-06, + "loss": 3.5851, + "step": 21440 + }, + { + "epoch": 0.6284333113601406, + "grad_norm": 11.955007553100586, + "learning_rate": 9.179292060901252e-06, + "loss": 3.6176, + "step": 21450 + }, + { + "epoch": 0.6287262872628726, + "grad_norm": 12.099366188049316, + "learning_rate": 9.178423702232356e-06, + "loss": 3.6038, + "step": 21460 + }, + { + "epoch": 0.6290192631656046, + "grad_norm": 10.881095886230469, + "learning_rate": 9.177554925536958e-06, + "loss": 3.6203, + "step": 21470 + }, + { + "epoch": 0.6293122390683367, + "grad_norm": 13.016655921936035, + "learning_rate": 9.176685730901973e-06, + "loss": 3.6266, + "step": 21480 + }, + { + "epoch": 0.6296052149710686, + "grad_norm": 12.323745727539062, + "learning_rate": 9.17581611841436e-06, + "loss": 3.6247, + "step": 21490 + }, + { + "epoch": 0.6298981908738006, + "grad_norm": 11.986446380615234, + "learning_rate": 9.17494608816112e-06, + "loss": 3.5728, + "step": 21500 + }, + { + "epoch": 0.6300153812348934, + "eval_bleu": 0.32101938413462283, + "eval_cap_loss": 0.9966208934783936, + 
"eval_con_loss": 1.4165992736816406, + "eval_loss": 3.8298192024230957, + "step": 21504 + }, + { + "epoch": 0.6300153812348934, + "eval_bleu": 0.32101938413462283, + "eval_cap_loss": 0.9966208934783936, + "eval_con_loss": 1.4165992736816406, + "eval_loss": 3.8298192024230957, + "eval_runtime": 55.0742, + "eval_samples_per_second": 363.146, + "eval_steps_per_second": 0.363, + "step": 21504 + }, + { + "epoch": 0.6301911667765326, + "grad_norm": 10.917587280273438, + "learning_rate": 9.174075640229293e-06, + "loss": 3.6128, + "step": 21510 + }, + { + "epoch": 0.6304841426792647, + "grad_norm": 11.829084396362305, + "learning_rate": 9.173204774705962e-06, + "loss": 3.6072, + "step": 21520 + }, + { + "epoch": 0.6307771185819966, + "grad_norm": 10.583032608032227, + "learning_rate": 9.172333491678252e-06, + "loss": 3.6144, + "step": 21530 + }, + { + "epoch": 0.6310700944847286, + "grad_norm": 11.038076400756836, + "learning_rate": 9.171461791233329e-06, + "loss": 3.5999, + "step": 21540 + }, + { + "epoch": 0.6313630703874606, + "grad_norm": 10.925472259521484, + "learning_rate": 9.170589673458402e-06, + "loss": 3.596, + "step": 21550 + }, + { + "epoch": 0.6316560462901927, + "grad_norm": 12.048918724060059, + "learning_rate": 9.169717138440721e-06, + "loss": 3.6023, + "step": 21560 + }, + { + "epoch": 0.6319490221929246, + "grad_norm": 12.356877326965332, + "learning_rate": 9.16884418626758e-06, + "loss": 3.5931, + "step": 21570 + }, + { + "epoch": 0.6322419980956566, + "grad_norm": 12.882843017578125, + "learning_rate": 9.167970817026311e-06, + "loss": 3.6236, + "step": 21580 + }, + { + "epoch": 0.6325349739983886, + "grad_norm": 13.169841766357422, + "learning_rate": 9.167097030804289e-06, + "loss": 3.6217, + "step": 21590 + }, + { + "epoch": 0.6328279499011207, + "grad_norm": 11.477314949035645, + "learning_rate": 9.166222827688931e-06, + "loss": 3.6091, + "step": 21600 + }, + { + "epoch": 0.6331209258038526, + "grad_norm": 12.381752967834473, + "learning_rate": 
9.165348207767697e-06, + "loss": 3.6086, + "step": 21610 + }, + { + "epoch": 0.6334139017065846, + "grad_norm": 12.097752571105957, + "learning_rate": 9.164473171128087e-06, + "loss": 3.6239, + "step": 21620 + }, + { + "epoch": 0.6337068776093167, + "grad_norm": 9.904582977294922, + "learning_rate": 9.163597717857643e-06, + "loss": 3.5972, + "step": 21630 + }, + { + "epoch": 0.6339998535120487, + "grad_norm": 11.079866409301758, + "learning_rate": 9.16272184804395e-06, + "loss": 3.5975, + "step": 21640 + }, + { + "epoch": 0.6342928294147806, + "grad_norm": 11.997457504272461, + "learning_rate": 9.161845561774632e-06, + "loss": 3.5764, + "step": 21650 + }, + { + "epoch": 0.6345858053175126, + "grad_norm": 11.304207801818848, + "learning_rate": 9.160968859137356e-06, + "loss": 3.62, + "step": 21660 + }, + { + "epoch": 0.6348787812202447, + "grad_norm": 11.322297096252441, + "learning_rate": 9.160091740219832e-06, + "loss": 3.6102, + "step": 21670 + }, + { + "epoch": 0.6351717571229767, + "grad_norm": 12.131288528442383, + "learning_rate": 9.159214205109813e-06, + "loss": 3.617, + "step": 21680 + }, + { + "epoch": 0.6354647330257086, + "grad_norm": 11.10956859588623, + "learning_rate": 9.158336253895085e-06, + "loss": 3.5952, + "step": 21690 + }, + { + "epoch": 0.6357577089284406, + "grad_norm": 11.842206001281738, + "learning_rate": 9.157457886663487e-06, + "loss": 3.588, + "step": 21700 + }, + { + "epoch": 0.6360506848311727, + "grad_norm": 13.637805938720703, + "learning_rate": 9.156579103502893e-06, + "loss": 3.5867, + "step": 21710 + }, + { + "epoch": 0.6363436607339046, + "grad_norm": 11.101129531860352, + "learning_rate": 9.155699904501219e-06, + "loss": 3.5967, + "step": 21720 + }, + { + "epoch": 0.6366366366366366, + "grad_norm": 12.11279582977295, + "learning_rate": 9.154820289746426e-06, + "loss": 3.5789, + "step": 21730 + }, + { + "epoch": 0.6369296125393686, + "grad_norm": 11.261101722717285, + "learning_rate": 9.153940259326511e-06, + "loss": 3.6056, + 
"step": 21740 + }, + { + "epoch": 0.6372225884421007, + "grad_norm": 11.495380401611328, + "learning_rate": 9.153059813329518e-06, + "loss": 3.5909, + "step": 21750 + }, + { + "epoch": 0.6375155643448326, + "grad_norm": 12.346381187438965, + "learning_rate": 9.152178951843532e-06, + "loss": 3.6136, + "step": 21760 + }, + { + "epoch": 0.6378085402475646, + "grad_norm": 12.389922142028809, + "learning_rate": 9.151297674956673e-06, + "loss": 3.6104, + "step": 21770 + }, + { + "epoch": 0.6381015161502966, + "grad_norm": 12.121881484985352, + "learning_rate": 9.15041598275711e-06, + "loss": 3.5877, + "step": 21780 + }, + { + "epoch": 0.6383944920530287, + "grad_norm": 11.496201515197754, + "learning_rate": 9.149533875333055e-06, + "loss": 3.5951, + "step": 21790 + }, + { + "epoch": 0.6386874679557606, + "grad_norm": 10.91817569732666, + "learning_rate": 9.148651352772751e-06, + "loss": 3.5942, + "step": 21800 + }, + { + "epoch": 0.6389804438584926, + "grad_norm": 11.771505355834961, + "learning_rate": 9.147768415164495e-06, + "loss": 3.6119, + "step": 21810 + }, + { + "epoch": 0.6392734197612246, + "grad_norm": 10.545681953430176, + "learning_rate": 9.146885062596615e-06, + "loss": 3.592, + "step": 21820 + }, + { + "epoch": 0.6395663956639567, + "grad_norm": 12.464387893676758, + "learning_rate": 9.146001295157487e-06, + "loss": 3.5853, + "step": 21830 + }, + { + "epoch": 0.6398593715666886, + "grad_norm": 11.95300006866455, + "learning_rate": 9.145117112935526e-06, + "loss": 3.589, + "step": 21840 + }, + { + "epoch": 0.6401523474694206, + "grad_norm": 11.78608226776123, + "learning_rate": 9.144232516019194e-06, + "loss": 3.5959, + "step": 21850 + }, + { + "epoch": 0.6404453233721527, + "grad_norm": 11.548820495605469, + "learning_rate": 9.143347504496982e-06, + "loss": 3.5821, + "step": 21860 + }, + { + "epoch": 0.6407382992748847, + "grad_norm": 12.452394485473633, + "learning_rate": 9.142462078457435e-06, + "loss": 3.613, + "step": 21870 + }, + { + "epoch": 
0.6410312751776166, + "grad_norm": 11.322920799255371, + "learning_rate": 9.141576237989133e-06, + "loss": 3.5649, + "step": 21880 + }, + { + "epoch": 0.6413242510803486, + "grad_norm": 10.871248245239258, + "learning_rate": 9.1406899831807e-06, + "loss": 3.5844, + "step": 21890 + }, + { + "epoch": 0.6416172269830807, + "grad_norm": 12.54699993133545, + "learning_rate": 9.139803314120798e-06, + "loss": 3.5795, + "step": 21900 + }, + { + "epoch": 0.6419102028858127, + "grad_norm": 11.612568855285645, + "learning_rate": 9.138916230898137e-06, + "loss": 3.611, + "step": 21910 + }, + { + "epoch": 0.6422031787885446, + "grad_norm": 10.339073181152344, + "learning_rate": 9.138028733601462e-06, + "loss": 3.6181, + "step": 21920 + }, + { + "epoch": 0.6424961546912766, + "grad_norm": 11.853331565856934, + "learning_rate": 9.137140822319564e-06, + "loss": 3.5813, + "step": 21930 + }, + { + "epoch": 0.6427891305940087, + "grad_norm": 11.740256309509277, + "learning_rate": 9.13625249714127e-06, + "loss": 3.6115, + "step": 21940 + }, + { + "epoch": 0.6430821064967407, + "grad_norm": null, + "learning_rate": 9.13545265067284e-06, + "loss": 3.5997, + "step": 21950 + }, + { + "epoch": 0.6433750823994726, + "grad_norm": 11.400784492492676, + "learning_rate": 9.134563539336272e-06, + "loss": 3.5554, + "step": 21960 + }, + { + "epoch": 0.6436680583022046, + "grad_norm": 10.92160415649414, + "learning_rate": 9.133674014361151e-06, + "loss": 3.5944, + "step": 21970 + }, + { + "epoch": 0.6439610342049367, + "grad_norm": 11.19637393951416, + "learning_rate": 9.13278407583647e-06, + "loss": 3.5873, + "step": 21980 + }, + { + "epoch": 0.6442540101076687, + "grad_norm": 12.37358283996582, + "learning_rate": 9.131893723851262e-06, + "loss": 3.5942, + "step": 21990 + }, + { + "epoch": 0.6445469860104006, + "grad_norm": 10.806342124938965, + "learning_rate": 9.131002958494599e-06, + "loss": 3.5874, + "step": 22000 + }, + { + "epoch": 0.6448399619131326, + "grad_norm": 13.16428279876709, + 
"learning_rate": 9.130111779855599e-06, + "loss": 3.6023, + "step": 22010 + }, + { + "epoch": 0.6450157474547719, + "eval_bleu": 0.32283621491455416, + "eval_cap_loss": 0.9934353828430176, + "eval_con_loss": 1.4037630558013916, + "eval_loss": 3.800961494445801, + "step": 22016 + }, + { + "epoch": 0.6450157474547719, + "eval_bleu": 0.32283621491455416, + "eval_cap_loss": 0.9934353828430176, + "eval_con_loss": 1.4037630558013916, + "eval_loss": 3.800961494445801, + "eval_runtime": 56.568, + "eval_samples_per_second": 353.557, + "eval_steps_per_second": 0.354, + "step": 22016 + }, + { + "epoch": 0.6451329378158647, + "grad_norm": 12.814852714538574, + "learning_rate": 9.129220188023419e-06, + "loss": 3.5965, + "step": 22020 + }, + { + "epoch": 0.6454259137185967, + "grad_norm": 10.753159523010254, + "learning_rate": 9.128328183087256e-06, + "loss": 3.615, + "step": 22030 + }, + { + "epoch": 0.6457188896213286, + "grad_norm": 11.08842945098877, + "learning_rate": 9.127435765136353e-06, + "loss": 3.5984, + "step": 22040 + }, + { + "epoch": 0.6460118655240606, + "grad_norm": 11.846715927124023, + "learning_rate": 9.126542934259986e-06, + "loss": 3.5881, + "step": 22050 + }, + { + "epoch": 0.6463048414267927, + "grad_norm": 11.025135040283203, + "learning_rate": 9.125649690547483e-06, + "loss": 3.5917, + "step": 22060 + }, + { + "epoch": 0.6465978173295246, + "grad_norm": 12.575222969055176, + "learning_rate": 9.124756034088203e-06, + "loss": 3.5862, + "step": 22070 + }, + { + "epoch": 0.6468907932322566, + "grad_norm": 11.381425857543945, + "learning_rate": 9.123861964971552e-06, + "loss": 3.5766, + "step": 22080 + }, + { + "epoch": 0.6471837691349887, + "grad_norm": 13.602008819580078, + "learning_rate": 9.122967483286981e-06, + "loss": 3.5825, + "step": 22090 + }, + { + "epoch": 0.6474767450377207, + "grad_norm": 11.36999797821045, + "learning_rate": 9.122072589123973e-06, + "loss": 3.5592, + "step": 22100 + }, + { + "epoch": 0.6477697209404526, + "grad_norm": 
11.763554573059082, + "learning_rate": 9.121177282572057e-06, + "loss": 3.6031, + "step": 22110 + }, + { + "epoch": 0.6480626968431846, + "grad_norm": 12.355710983276367, + "learning_rate": 9.120281563720804e-06, + "loss": 3.6003, + "step": 22120 + }, + { + "epoch": 0.6483556727459167, + "grad_norm": 11.473296165466309, + "learning_rate": 9.119385432659827e-06, + "loss": 3.5904, + "step": 22130 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 11.16880989074707, + "learning_rate": 9.118488889478774e-06, + "loss": 3.5959, + "step": 22140 + }, + { + "epoch": 0.6489416245513806, + "grad_norm": 11.470967292785645, + "learning_rate": 9.117591934267343e-06, + "loss": 3.5979, + "step": 22150 + }, + { + "epoch": 0.6492346004541126, + "grad_norm": 11.18114948272705, + "learning_rate": 9.116694567115269e-06, + "loss": 3.6048, + "step": 22160 + }, + { + "epoch": 0.6495275763568447, + "grad_norm": 11.360272407531738, + "learning_rate": 9.115796788112325e-06, + "loss": 3.6026, + "step": 22170 + }, + { + "epoch": 0.6498205522595767, + "grad_norm": 11.764567375183105, + "learning_rate": 9.114898597348332e-06, + "loss": 3.5852, + "step": 22180 + }, + { + "epoch": 0.6501135281623086, + "grad_norm": 11.228961944580078, + "learning_rate": 9.113999994913148e-06, + "loss": 3.57, + "step": 22190 + }, + { + "epoch": 0.6504065040650406, + "grad_norm": 10.347782135009766, + "learning_rate": 9.11310098089667e-06, + "loss": 3.5557, + "step": 22200 + }, + { + "epoch": 0.6506994799677727, + "grad_norm": 12.078365325927734, + "learning_rate": 9.112201555388843e-06, + "loss": 3.5878, + "step": 22210 + }, + { + "epoch": 0.6509924558705047, + "grad_norm": 12.9426908493042, + "learning_rate": 9.111301718479646e-06, + "loss": 3.5752, + "step": 22220 + }, + { + "epoch": 0.6512854317732366, + "grad_norm": 12.47465705871582, + "learning_rate": 9.110401470259104e-06, + "loss": 3.5811, + "step": 22230 + }, + { + "epoch": 0.6515784076759686, + "grad_norm": 13.109038352966309, + "learning_rate": 
9.10950081081728e-06, + "loss": 3.5806, + "step": 22240 + }, + { + "epoch": 0.6518713835787007, + "grad_norm": 12.873588562011719, + "learning_rate": 9.108599740244283e-06, + "loss": 3.583, + "step": 22250 + }, + { + "epoch": 0.6521643594814327, + "grad_norm": 11.528298377990723, + "learning_rate": 9.107698258630255e-06, + "loss": 3.5643, + "step": 22260 + }, + { + "epoch": 0.6524573353841646, + "grad_norm": 11.466741561889648, + "learning_rate": 9.106796366065388e-06, + "loss": 3.5888, + "step": 22270 + }, + { + "epoch": 0.6527503112868966, + "grad_norm": 11.326302528381348, + "learning_rate": 9.105894062639908e-06, + "loss": 3.5777, + "step": 22280 + }, + { + "epoch": 0.6530432871896287, + "grad_norm": 13.084019660949707, + "learning_rate": 9.104991348444088e-06, + "loss": 3.5886, + "step": 22290 + }, + { + "epoch": 0.6533362630923607, + "grad_norm": 12.057794570922852, + "learning_rate": 9.104088223568236e-06, + "loss": 3.5932, + "step": 22300 + }, + { + "epoch": 0.6536292389950926, + "grad_norm": 11.126609802246094, + "learning_rate": 9.10318468810271e-06, + "loss": 3.6194, + "step": 22310 + }, + { + "epoch": 0.6539222148978246, + "grad_norm": 11.155805587768555, + "learning_rate": 9.102280742137894e-06, + "loss": 3.5595, + "step": 22320 + }, + { + "epoch": 0.6542151908005567, + "grad_norm": 12.56799602508545, + "learning_rate": 9.10137638576423e-06, + "loss": 3.551, + "step": 22330 + }, + { + "epoch": 0.6545081667032887, + "grad_norm": 12.380681991577148, + "learning_rate": 9.100471619072191e-06, + "loss": 3.571, + "step": 22340 + }, + { + "epoch": 0.6548011426060206, + "grad_norm": 11.106124877929688, + "learning_rate": 9.099566442152295e-06, + "loss": 3.589, + "step": 22350 + }, + { + "epoch": 0.6550941185087527, + "grad_norm": 12.7269868850708, + "learning_rate": 9.098660855095095e-06, + "loss": 3.6128, + "step": 22360 + }, + { + "epoch": 0.6553870944114847, + "grad_norm": 10.847662925720215, + "learning_rate": 9.097754857991198e-06, + "loss": 3.5775, + 
"step": 22370 + }, + { + "epoch": 0.6556800703142166, + "grad_norm": 13.040546417236328, + "learning_rate": 9.096848450931236e-06, + "loss": 3.5743, + "step": 22380 + }, + { + "epoch": 0.6559730462169486, + "grad_norm": 11.13732624053955, + "learning_rate": 9.095941634005892e-06, + "loss": 3.5667, + "step": 22390 + }, + { + "epoch": 0.6562660221196807, + "grad_norm": 11.89230728149414, + "learning_rate": 9.095034407305887e-06, + "loss": 3.5811, + "step": 22400 + }, + { + "epoch": 0.6565589980224127, + "grad_norm": 11.92296028137207, + "learning_rate": 9.094126770921988e-06, + "loss": 3.5809, + "step": 22410 + }, + { + "epoch": 0.6568519739251446, + "grad_norm": 11.759135246276855, + "learning_rate": 9.093218724944992e-06, + "loss": 3.5794, + "step": 22420 + }, + { + "epoch": 0.6571449498278766, + "grad_norm": 11.8651123046875, + "learning_rate": 9.092310269465748e-06, + "loss": 3.5801, + "step": 22430 + }, + { + "epoch": 0.6574379257306087, + "grad_norm": 11.239107131958008, + "learning_rate": 9.09140140457514e-06, + "loss": 3.5516, + "step": 22440 + }, + { + "epoch": 0.6577309016333407, + "grad_norm": 10.271649360656738, + "learning_rate": 9.090492130364093e-06, + "loss": 3.5996, + "step": 22450 + }, + { + "epoch": 0.6580238775360726, + "grad_norm": 12.690140724182129, + "learning_rate": 9.08958244692358e-06, + "loss": 3.5822, + "step": 22460 + }, + { + "epoch": 0.6583168534388046, + "grad_norm": 12.418766975402832, + "learning_rate": 9.088672354344604e-06, + "loss": 3.5798, + "step": 22470 + }, + { + "epoch": 0.6586098293415367, + "grad_norm": 11.596649169921875, + "learning_rate": 9.087761852718215e-06, + "loss": 3.6048, + "step": 22480 + }, + { + "epoch": 0.6589028052442687, + "grad_norm": 11.291959762573242, + "learning_rate": 9.086850942135507e-06, + "loss": 3.5644, + "step": 22490 + }, + { + "epoch": 0.6591957811470006, + "grad_norm": 12.085200309753418, + "learning_rate": 9.085939622687607e-06, + "loss": 3.5802, + "step": 22500 + }, + { + "epoch": 
0.6594887570497326, + "grad_norm": 11.619619369506836, + "learning_rate": 9.08502789446569e-06, + "loss": 3.584, + "step": 22510 + }, + { + "epoch": 0.6597817329524647, + "grad_norm": 12.509965896606445, + "learning_rate": 9.084115757560966e-06, + "loss": 3.6117, + "step": 22520 + }, + { + "epoch": 0.6600161136746503, + "eval_bleu": 0.32451381446501376, + "eval_cap_loss": 0.990376353263855, + "eval_con_loss": 1.3913713693618774, + "eval_loss": 3.7731194496154785, + "step": 22528 + }, + { + "epoch": 0.6600161136746503, + "eval_bleu": 0.32451381446501376, + "eval_cap_loss": 0.990376353263855, + "eval_con_loss": 1.3913713693618774, + "eval_loss": 3.7731194496154785, + "eval_runtime": 57.5673, + "eval_samples_per_second": 347.419, + "eval_steps_per_second": 0.347, + "step": 22528 + }, + { + "epoch": 0.6600747088551967, + "grad_norm": 11.301281929016113, + "learning_rate": 9.083203212064693e-06, + "loss": 3.5721, + "step": 22530 + }, + { + "epoch": 0.6603676847579286, + "grad_norm": 11.300043106079102, + "learning_rate": 9.082290258068163e-06, + "loss": 3.5866, + "step": 22540 + }, + { + "epoch": 0.6606606606606606, + "grad_norm": 12.064274787902832, + "learning_rate": 9.081376895662712e-06, + "loss": 3.5439, + "step": 22550 + }, + { + "epoch": 0.6609536365633927, + "grad_norm": 11.745564460754395, + "learning_rate": 9.080463124939715e-06, + "loss": 3.5912, + "step": 22560 + }, + { + "epoch": 0.6612466124661247, + "grad_norm": 11.100678443908691, + "learning_rate": 9.079548945990593e-06, + "loss": 3.5652, + "step": 22570 + }, + { + "epoch": 0.6615395883688566, + "grad_norm": 12.319958686828613, + "learning_rate": 9.078634358906802e-06, + "loss": 3.567, + "step": 22580 + }, + { + "epoch": 0.6618325642715887, + "grad_norm": 11.273195266723633, + "learning_rate": 9.077719363779839e-06, + "loss": 3.5618, + "step": 22590 + }, + { + "epoch": 0.6621255401743207, + "grad_norm": 10.787589073181152, + "learning_rate": 9.076803960701248e-06, + "loss": 3.5934, + "step": 22600 + }, 
+ { + "epoch": 0.6624185160770527, + "grad_norm": 12.794881820678711, + "learning_rate": 9.075888149762606e-06, + "loss": 3.5732, + "step": 22610 + }, + { + "epoch": 0.6627114919797846, + "grad_norm": 11.270796775817871, + "learning_rate": 9.074971931055537e-06, + "loss": 3.5705, + "step": 22620 + }, + { + "epoch": 0.6630044678825167, + "grad_norm": 12.602149963378906, + "learning_rate": 9.0740553046717e-06, + "loss": 3.5674, + "step": 22630 + }, + { + "epoch": 0.6632974437852487, + "grad_norm": 12.223329544067383, + "learning_rate": 9.073138270702804e-06, + "loss": 3.5507, + "step": 22640 + }, + { + "epoch": 0.6635904196879807, + "grad_norm": 11.966939926147461, + "learning_rate": 9.072220829240587e-06, + "loss": 3.5686, + "step": 22650 + }, + { + "epoch": 0.6638833955907126, + "grad_norm": 11.045984268188477, + "learning_rate": 9.071302980376836e-06, + "loss": 3.5623, + "step": 22660 + }, + { + "epoch": 0.6641763714934447, + "grad_norm": 13.496212005615234, + "learning_rate": 9.070384724203377e-06, + "loss": 3.5802, + "step": 22670 + }, + { + "epoch": 0.6644693473961767, + "grad_norm": 12.249011039733887, + "learning_rate": 9.069466060812074e-06, + "loss": 3.5903, + "step": 22680 + }, + { + "epoch": 0.6647623232989087, + "grad_norm": 12.208293914794922, + "learning_rate": 9.068546990294834e-06, + "loss": 3.5495, + "step": 22690 + }, + { + "epoch": 0.6650552992016406, + "grad_norm": 10.76752758026123, + "learning_rate": 9.067627512743607e-06, + "loss": 3.5559, + "step": 22700 + }, + { + "epoch": 0.6653482751043727, + "grad_norm": 11.438199043273926, + "learning_rate": 9.06670762825038e-06, + "loss": 3.5494, + "step": 22710 + }, + { + "epoch": 0.6656412510071047, + "grad_norm": 10.30392074584961, + "learning_rate": 9.065787336907182e-06, + "loss": 3.5547, + "step": 22720 + }, + { + "epoch": 0.6659342269098366, + "grad_norm": 11.770495414733887, + "learning_rate": 9.06486663880608e-06, + "loss": 3.5921, + "step": 22730 + }, + { + "epoch": 0.6662272028125686, + 
"grad_norm": 12.053719520568848, + "learning_rate": 9.063945534039189e-06, + "loss": 3.5442, + "step": 22740 + }, + { + "epoch": 0.6665201787153007, + "grad_norm": 11.134612083435059, + "learning_rate": 9.063024022698657e-06, + "loss": 3.5645, + "step": 22750 + }, + { + "epoch": 0.6668131546180327, + "grad_norm": 12.690330505371094, + "learning_rate": 9.062102104876678e-06, + "loss": 3.573, + "step": 22760 + }, + { + "epoch": 0.6671061305207646, + "grad_norm": 11.42208480834961, + "learning_rate": 9.061179780665481e-06, + "loss": 3.5462, + "step": 22770 + }, + { + "epoch": 0.6673991064234966, + "grad_norm": 12.180785179138184, + "learning_rate": 9.060257050157342e-06, + "loss": 3.5711, + "step": 22780 + }, + { + "epoch": 0.6676920823262287, + "grad_norm": 11.46288776397705, + "learning_rate": 9.059333913444574e-06, + "loss": 3.5374, + "step": 22790 + }, + { + "epoch": 0.6679850582289607, + "grad_norm": 10.881182670593262, + "learning_rate": 9.05841037061953e-06, + "loss": 3.5868, + "step": 22800 + }, + { + "epoch": 0.6682780341316926, + "grad_norm": 12.550612449645996, + "learning_rate": 9.057486421774609e-06, + "loss": 3.5699, + "step": 22810 + }, + { + "epoch": 0.6685710100344247, + "grad_norm": 11.49695873260498, + "learning_rate": 9.05656206700224e-06, + "loss": 3.5467, + "step": 22820 + }, + { + "epoch": 0.6688639859371567, + "grad_norm": 11.75654125213623, + "learning_rate": 9.055637306394907e-06, + "loss": 3.5759, + "step": 22830 + }, + { + "epoch": 0.6691569618398887, + "grad_norm": 12.677846908569336, + "learning_rate": 9.05471214004512e-06, + "loss": 3.5681, + "step": 22840 + }, + { + "epoch": 0.6694499377426206, + "grad_norm": 11.263339042663574, + "learning_rate": 9.053786568045439e-06, + "loss": 3.5518, + "step": 22850 + }, + { + "epoch": 0.6697429136453527, + "grad_norm": 12.035893440246582, + "learning_rate": 9.052860590488463e-06, + "loss": 3.5602, + "step": 22860 + }, + { + "epoch": 0.6700358895480847, + "grad_norm": 12.186220169067383, + 
"learning_rate": 9.051934207466831e-06, + "loss": 3.5663, + "step": 22870 + }, + { + "epoch": 0.6703288654508167, + "grad_norm": 12.43410873413086, + "learning_rate": 9.05100741907322e-06, + "loss": 3.5641, + "step": 22880 + }, + { + "epoch": 0.6706218413535486, + "grad_norm": 11.319164276123047, + "learning_rate": 9.050080225400348e-06, + "loss": 3.5403, + "step": 22890 + }, + { + "epoch": 0.6709148172562807, + "grad_norm": 10.504433631896973, + "learning_rate": 9.049152626540981e-06, + "loss": 3.5764, + "step": 22900 + }, + { + "epoch": 0.6712077931590127, + "grad_norm": 12.127219200134277, + "learning_rate": 9.048224622587915e-06, + "loss": 3.565, + "step": 22910 + }, + { + "epoch": 0.6715007690617447, + "grad_norm": 10.89371109008789, + "learning_rate": 9.047296213633994e-06, + "loss": 3.5671, + "step": 22920 + }, + { + "epoch": 0.6717937449644766, + "grad_norm": 12.661711692810059, + "learning_rate": 9.046367399772097e-06, + "loss": 3.56, + "step": 22930 + }, + { + "epoch": 0.6720867208672087, + "grad_norm": 12.787631034851074, + "learning_rate": 9.04543818109515e-06, + "loss": 3.5924, + "step": 22940 + }, + { + "epoch": 0.6723796967699407, + "grad_norm": 11.804879188537598, + "learning_rate": 9.044508557696111e-06, + "loss": 3.5564, + "step": 22950 + }, + { + "epoch": 0.6726726726726727, + "grad_norm": 10.194284439086914, + "learning_rate": 9.043578529667988e-06, + "loss": 3.5518, + "step": 22960 + }, + { + "epoch": 0.6729656485754046, + "grad_norm": 11.180122375488281, + "learning_rate": 9.042648097103822e-06, + "loss": 3.5487, + "step": 22970 + }, + { + "epoch": 0.6732586244781367, + "grad_norm": 13.240753173828125, + "learning_rate": 9.041717260096698e-06, + "loss": 3.5708, + "step": 22980 + }, + { + "epoch": 0.6735516003808687, + "grad_norm": 14.285050392150879, + "learning_rate": 9.040786018739742e-06, + "loss": 3.5748, + "step": 22990 + }, + { + "epoch": 0.6738445762836007, + "grad_norm": 12.4741792678833, + "learning_rate": 9.039854373126116e-06, + 
"loss": 3.58, + "step": 23000 + }, + { + "epoch": 0.6741375521863326, + "grad_norm": 11.462026596069336, + "learning_rate": 9.03892232334903e-06, + "loss": 3.5311, + "step": 23010 + }, + { + "epoch": 0.6744305280890647, + "grad_norm": 11.28841495513916, + "learning_rate": 9.037989869501727e-06, + "loss": 3.5579, + "step": 23020 + }, + { + "epoch": 0.6747235039917967, + "grad_norm": 13.008068084716797, + "learning_rate": 9.037057011677492e-06, + "loss": 3.5637, + "step": 23030 + }, + { + "epoch": 0.6750164798945286, + "grad_norm": 11.17167854309082, + "learning_rate": 9.036123749969656e-06, + "loss": 3.5134, + "step": 23040 + }, + { + "epoch": 0.6750164798945286, + "eval_bleu": 0.32374948096556244, + "eval_cap_loss": 0.9894413948059082, + "eval_con_loss": 1.3911685943603516, + "eval_loss": 3.771778106689453, + "step": 23040 + }, + { + "epoch": 0.6750164798945286, + "eval_bleu": 0.32374948096556244, + "eval_cap_loss": 0.9894413948059082, + "eval_con_loss": 1.3911685943603516, + "eval_loss": 3.771778106689453, + "eval_runtime": 56.8404, + "eval_samples_per_second": 351.862, + "eval_steps_per_second": 0.352, + "step": 23040 + }, + { + "epoch": 0.6753094557972606, + "grad_norm": 11.977163314819336, + "learning_rate": 9.035190084471584e-06, + "loss": 3.5638, + "step": 23050 + }, + { + "epoch": 0.6756024316999927, + "grad_norm": 10.93957233428955, + "learning_rate": 9.034256015276685e-06, + "loss": 3.5565, + "step": 23060 + }, + { + "epoch": 0.6758954076027247, + "grad_norm": 11.552677154541016, + "learning_rate": 9.033321542478405e-06, + "loss": 3.5629, + "step": 23070 + }, + { + "epoch": 0.6761883835054566, + "grad_norm": 11.644500732421875, + "learning_rate": 9.032386666170234e-06, + "loss": 3.5787, + "step": 23080 + }, + { + "epoch": 0.6764813594081887, + "grad_norm": 11.82899284362793, + "learning_rate": 9.0314513864457e-06, + "loss": 3.5432, + "step": 23090 + }, + { + "epoch": 0.6767743353109207, + "grad_norm": 11.889198303222656, + "learning_rate": 
9.030515703398372e-06, + "loss": 3.5692, + "step": 23100 + }, + { + "epoch": 0.6770673112136527, + "grad_norm": 11.738245964050293, + "learning_rate": 9.029579617121861e-06, + "loss": 3.5595, + "step": 23110 + }, + { + "epoch": 0.6773602871163846, + "grad_norm": 12.107752799987793, + "learning_rate": 9.028643127709815e-06, + "loss": 3.5483, + "step": 23120 + }, + { + "epoch": 0.6776532630191167, + "grad_norm": 12.133642196655273, + "learning_rate": 9.027706235255927e-06, + "loss": 3.5399, + "step": 23130 + }, + { + "epoch": 0.6779462389218487, + "grad_norm": 11.127131462097168, + "learning_rate": 9.026768939853923e-06, + "loss": 3.5571, + "step": 23140 + }, + { + "epoch": 0.6782392148245807, + "grad_norm": 12.601760864257812, + "learning_rate": 9.025831241597579e-06, + "loss": 3.5884, + "step": 23150 + }, + { + "epoch": 0.6785321907273126, + "grad_norm": 11.417027473449707, + "learning_rate": 9.024893140580703e-06, + "loss": 3.5495, + "step": 23160 + }, + { + "epoch": 0.6788251666300447, + "grad_norm": 12.241321563720703, + "learning_rate": 9.023954636897147e-06, + "loss": 3.5383, + "step": 23170 + }, + { + "epoch": 0.6791181425327767, + "grad_norm": 11.754558563232422, + "learning_rate": 9.023015730640802e-06, + "loss": 3.5464, + "step": 23180 + }, + { + "epoch": 0.6794111184355087, + "grad_norm": 13.375649452209473, + "learning_rate": 9.022076421905604e-06, + "loss": 3.5378, + "step": 23190 + }, + { + "epoch": 0.6797040943382406, + "grad_norm": 11.442964553833008, + "learning_rate": 9.021136710785518e-06, + "loss": 3.5725, + "step": 23200 + }, + { + "epoch": 0.6799970702409727, + "grad_norm": 11.870409965515137, + "learning_rate": 9.020196597374563e-06, + "loss": 3.5503, + "step": 23210 + }, + { + "epoch": 0.6802900461437047, + "grad_norm": 11.691259384155273, + "learning_rate": 9.019256081766789e-06, + "loss": 3.5419, + "step": 23220 + }, + { + "epoch": 0.6805830220464367, + "grad_norm": 10.010117530822754, + "learning_rate": 9.01831516405629e-06, + "loss": 
3.534, + "step": 23230 + }, + { + "epoch": 0.6808759979491686, + "grad_norm": 11.62779712677002, + "learning_rate": 9.017373844337198e-06, + "loss": 3.5612, + "step": 23240 + }, + { + "epoch": 0.6811689738519007, + "grad_norm": 11.563738822937012, + "learning_rate": 9.016432122703688e-06, + "loss": 3.5687, + "step": 23250 + }, + { + "epoch": 0.6814619497546327, + "grad_norm": 10.802560806274414, + "learning_rate": 9.01548999924997e-06, + "loss": 3.5155, + "step": 23260 + }, + { + "epoch": 0.6817549256573647, + "grad_norm": 11.790979385375977, + "learning_rate": 9.014547474070305e-06, + "loss": 3.5307, + "step": 23270 + }, + { + "epoch": 0.6820479015600966, + "grad_norm": 11.588765144348145, + "learning_rate": 9.01360454725898e-06, + "loss": 3.5265, + "step": 23280 + }, + { + "epoch": 0.6823408774628287, + "grad_norm": 12.38562297821045, + "learning_rate": 9.012661218910333e-06, + "loss": 3.5596, + "step": 23290 + }, + { + "epoch": 0.6826338533655607, + "grad_norm": 11.396895408630371, + "learning_rate": 9.011717489118737e-06, + "loss": 3.5362, + "step": 23300 + }, + { + "epoch": 0.6829268292682927, + "grad_norm": 11.696661949157715, + "learning_rate": 9.010773357978607e-06, + "loss": 3.5499, + "step": 23310 + }, + { + "epoch": 0.6832198051710247, + "grad_norm": 11.15822696685791, + "learning_rate": 9.009828825584397e-06, + "loss": 3.5702, + "step": 23320 + }, + { + "epoch": 0.6835127810737567, + "grad_norm": 10.614365577697754, + "learning_rate": 9.008883892030601e-06, + "loss": 3.5304, + "step": 23330 + }, + { + "epoch": 0.6838057569764887, + "grad_norm": 10.907793045043945, + "learning_rate": 9.00793855741176e-06, + "loss": 3.5318, + "step": 23340 + }, + { + "epoch": 0.6840987328792207, + "grad_norm": 11.754155158996582, + "learning_rate": 9.00699282182244e-06, + "loss": 3.5473, + "step": 23350 + }, + { + "epoch": 0.6843917087819527, + "grad_norm": 11.004071235656738, + "learning_rate": 9.006046685357264e-06, + "loss": 3.5431, + "step": 23360 + }, + { + "epoch": 
0.6846846846846847, + "grad_norm": 11.14345645904541, + "learning_rate": 9.005100148110883e-06, + "loss": 3.5299, + "step": 23370 + }, + { + "epoch": 0.6849776605874167, + "grad_norm": 10.923399925231934, + "learning_rate": 9.004153210177994e-06, + "loss": 3.5411, + "step": 23380 + }, + { + "epoch": 0.6852706364901486, + "grad_norm": 13.415475845336914, + "learning_rate": 9.003205871653331e-06, + "loss": 3.5338, + "step": 23390 + }, + { + "epoch": 0.6855636123928807, + "grad_norm": 11.027688026428223, + "learning_rate": 9.002258132631672e-06, + "loss": 3.5374, + "step": 23400 + }, + { + "epoch": 0.6858565882956127, + "grad_norm": 10.945868492126465, + "learning_rate": 9.001309993207831e-06, + "loss": 3.5502, + "step": 23410 + }, + { + "epoch": 0.6861495641983447, + "grad_norm": 12.538110733032227, + "learning_rate": 9.000361453476665e-06, + "loss": 3.5251, + "step": 23420 + }, + { + "epoch": 0.6864425401010766, + "grad_norm": 12.780710220336914, + "learning_rate": 8.999412513533067e-06, + "loss": 3.5646, + "step": 23430 + }, + { + "epoch": 0.6867355160038087, + "grad_norm": 10.319843292236328, + "learning_rate": 8.998463173471976e-06, + "loss": 3.5382, + "step": 23440 + }, + { + "epoch": 0.6870284919065407, + "grad_norm": 11.973115921020508, + "learning_rate": 8.997513433388367e-06, + "loss": 3.5461, + "step": 23450 + }, + { + "epoch": 0.6873214678092727, + "grad_norm": 11.290740966796875, + "learning_rate": 8.996563293377254e-06, + "loss": 3.5262, + "step": 23460 + }, + { + "epoch": 0.6876144437120046, + "grad_norm": 12.843335151672363, + "learning_rate": 8.995612753533695e-06, + "loss": 3.5681, + "step": 23470 + }, + { + "epoch": 0.6879074196147367, + "grad_norm": 11.567956924438477, + "learning_rate": 8.994661813952786e-06, + "loss": 3.5219, + "step": 23480 + }, + { + "epoch": 0.6882003955174687, + "grad_norm": 12.18268871307373, + "learning_rate": 8.99371047472966e-06, + "loss": 3.5468, + "step": 23490 + }, + { + "epoch": 0.6884933714202007, + "grad_norm": 
10.647245407104492, + "learning_rate": 8.992758735959497e-06, + "loss": 3.5482, + "step": 23500 + }, + { + "epoch": 0.6887863473229326, + "grad_norm": 11.767166137695312, + "learning_rate": 8.99180659773751e-06, + "loss": 3.5469, + "step": 23510 + }, + { + "epoch": 0.6890793232256647, + "grad_norm": 12.31812858581543, + "learning_rate": 8.990854060158956e-06, + "loss": 3.5219, + "step": 23520 + }, + { + "epoch": 0.6893722991283967, + "grad_norm": 12.207271575927734, + "learning_rate": 8.989901123319128e-06, + "loss": 3.5757, + "step": 23530 + }, + { + "epoch": 0.6896652750311287, + "grad_norm": 11.787867546081543, + "learning_rate": 8.988947787313365e-06, + "loss": 3.5495, + "step": 23540 + }, + { + "epoch": 0.6899582509338607, + "grad_norm": 11.442538261413574, + "learning_rate": 8.987994052237042e-06, + "loss": 3.5225, + "step": 23550 + }, + { + "epoch": 0.6900168461144071, + "eval_bleu": 0.3252668354817666, + "eval_cap_loss": 0.9869741797447205, + "eval_con_loss": 1.378373622894287, + "eval_loss": 3.7437212467193604, + "step": 23552 + }, + { + "epoch": 0.6900168461144071, + "eval_bleu": 0.3252668354817666, + "eval_cap_loss": 0.9869741797447205, + "eval_con_loss": 1.378373622894287, + "eval_loss": 3.7437212467193604, + "eval_runtime": 57.2221, + "eval_samples_per_second": 349.516, + "eval_steps_per_second": 0.35, + "step": 23552 + }, + { + "epoch": 0.6902512268365927, + "grad_norm": 11.601426124572754, + "learning_rate": 8.987039918185574e-06, + "loss": 3.5665, + "step": 23560 + }, + { + "epoch": 0.6905442027393247, + "grad_norm": 12.820832252502441, + "learning_rate": 8.986085385254417e-06, + "loss": 3.5696, + "step": 23570 + }, + { + "epoch": 0.6908371786420567, + "grad_norm": 11.382893562316895, + "learning_rate": 8.985130453539065e-06, + "loss": 3.5327, + "step": 23580 + }, + { + "epoch": 0.6911301545447887, + "grad_norm": 10.666020393371582, + "learning_rate": 8.984175123135054e-06, + "loss": 3.5461, + "step": 23590 + }, + { + "epoch": 0.6914231304475207, + 
"grad_norm": 10.526397705078125, + "learning_rate": 8.98321939413796e-06, + "loss": 3.5451, + "step": 23600 + }, + { + "epoch": 0.6917161063502527, + "grad_norm": 11.398719787597656, + "learning_rate": 8.982263266643398e-06, + "loss": 3.5369, + "step": 23610 + }, + { + "epoch": 0.6920090822529847, + "grad_norm": 11.166108131408691, + "learning_rate": 8.98130674074702e-06, + "loss": 3.5085, + "step": 23620 + }, + { + "epoch": 0.6923020581557167, + "grad_norm": 11.612008094787598, + "learning_rate": 8.980349816544525e-06, + "loss": 3.5308, + "step": 23630 + }, + { + "epoch": 0.6925950340584487, + "grad_norm": 12.255270004272461, + "learning_rate": 8.979392494131644e-06, + "loss": 3.5206, + "step": 23640 + }, + { + "epoch": 0.6928880099611807, + "grad_norm": 11.136868476867676, + "learning_rate": 8.978434773604152e-06, + "loss": 3.5448, + "step": 23650 + }, + { + "epoch": 0.6931809858639127, + "grad_norm": 12.762613296508789, + "learning_rate": 8.977476655057866e-06, + "loss": 3.5227, + "step": 23660 + }, + { + "epoch": 0.6934739617666447, + "grad_norm": 11.93790054321289, + "learning_rate": 8.976518138588636e-06, + "loss": 3.5566, + "step": 23670 + }, + { + "epoch": 0.6937669376693767, + "grad_norm": 10.496298789978027, + "learning_rate": 8.97555922429236e-06, + "loss": 3.533, + "step": 23680 + }, + { + "epoch": 0.6940599135721087, + "grad_norm": 11.73210334777832, + "learning_rate": 8.97459991226497e-06, + "loss": 3.5054, + "step": 23690 + }, + { + "epoch": 0.6943528894748406, + "grad_norm": 11.950751304626465, + "learning_rate": 8.97364020260244e-06, + "loss": 3.5205, + "step": 23700 + }, + { + "epoch": 0.6946458653775727, + "grad_norm": 11.159188270568848, + "learning_rate": 8.97268009540078e-06, + "loss": 3.5282, + "step": 23710 + }, + { + "epoch": 0.6949388412803047, + "grad_norm": 12.479546546936035, + "learning_rate": 8.971719590756047e-06, + "loss": 3.5506, + "step": 23720 + }, + { + "epoch": 0.6952318171830367, + "grad_norm": 12.985175132751465, + 
"learning_rate": 8.970758688764332e-06, + "loss": 3.5341, + "step": 23730 + }, + { + "epoch": 0.6955247930857686, + "grad_norm": 11.912491798400879, + "learning_rate": 8.969797389521769e-06, + "loss": 3.543, + "step": 23740 + }, + { + "epoch": 0.6958177689885007, + "grad_norm": 11.449748039245605, + "learning_rate": 8.968835693124528e-06, + "loss": 3.5253, + "step": 23750 + }, + { + "epoch": 0.6961107448912327, + "grad_norm": 12.25414752960205, + "learning_rate": 8.967873599668822e-06, + "loss": 3.5247, + "step": 23760 + }, + { + "epoch": 0.6964037207939647, + "grad_norm": 11.14195728302002, + "learning_rate": 8.966911109250905e-06, + "loss": 3.4928, + "step": 23770 + }, + { + "epoch": 0.6966966966966966, + "grad_norm": 12.085360527038574, + "learning_rate": 8.965948221967064e-06, + "loss": 3.5422, + "step": 23780 + }, + { + "epoch": 0.6969896725994287, + "grad_norm": 12.482154846191406, + "learning_rate": 8.964984937913632e-06, + "loss": 3.531, + "step": 23790 + }, + { + "epoch": 0.6972826485021607, + "grad_norm": 10.276180267333984, + "learning_rate": 8.964021257186984e-06, + "loss": 3.5497, + "step": 23800 + }, + { + "epoch": 0.6975756244048927, + "grad_norm": 11.919193267822266, + "learning_rate": 8.963057179883523e-06, + "loss": 3.5373, + "step": 23810 + }, + { + "epoch": 0.6978686003076247, + "grad_norm": 11.846817016601562, + "learning_rate": 8.962092706099706e-06, + "loss": 3.5158, + "step": 23820 + }, + { + "epoch": 0.6981615762103567, + "grad_norm": 11.80537223815918, + "learning_rate": 8.961127835932018e-06, + "loss": 3.5328, + "step": 23830 + }, + { + "epoch": 0.6984545521130887, + "grad_norm": 11.922037124633789, + "learning_rate": 8.960162569476992e-06, + "loss": 3.5458, + "step": 23840 + }, + { + "epoch": 0.6987475280158207, + "grad_norm": 11.537664413452148, + "learning_rate": 8.959196906831196e-06, + "loss": 3.5131, + "step": 23850 + }, + { + "epoch": 0.6990405039185527, + "grad_norm": 11.719078063964844, + "learning_rate": 8.958230848091237e-06, + 
"loss": 3.5185, + "step": 23860 + }, + { + "epoch": 0.6993334798212847, + "grad_norm": 11.290695190429688, + "learning_rate": 8.957264393353766e-06, + "loss": 3.5235, + "step": 23870 + }, + { + "epoch": 0.6996264557240167, + "grad_norm": 11.267977714538574, + "learning_rate": 8.956297542715469e-06, + "loss": 3.5241, + "step": 23880 + }, + { + "epoch": 0.6999194316267487, + "grad_norm": 12.17104434967041, + "learning_rate": 8.955330296273078e-06, + "loss": 3.5371, + "step": 23890 + }, + { + "epoch": 0.7002124075294807, + "grad_norm": 11.23952865600586, + "learning_rate": 8.954362654123355e-06, + "loss": 3.528, + "step": 23900 + }, + { + "epoch": 0.7005053834322127, + "grad_norm": 12.074586868286133, + "learning_rate": 8.953394616363107e-06, + "loss": 3.5306, + "step": 23910 + }, + { + "epoch": 0.7007983593349447, + "grad_norm": 11.214056015014648, + "learning_rate": 8.952426183089184e-06, + "loss": 3.5309, + "step": 23920 + }, + { + "epoch": 0.7010913352376767, + "grad_norm": 11.174503326416016, + "learning_rate": 8.951457354398471e-06, + "loss": 3.5113, + "step": 23930 + }, + { + "epoch": 0.7013843111404087, + "grad_norm": 10.873713493347168, + "learning_rate": 8.95048813038789e-06, + "loss": 3.4848, + "step": 23940 + }, + { + "epoch": 0.7016772870431407, + "grad_norm": 12.8607816696167, + "learning_rate": 8.949518511154413e-06, + "loss": 3.5511, + "step": 23950 + }, + { + "epoch": 0.7019702629458727, + "grad_norm": 11.42567253112793, + "learning_rate": 8.948645516008876e-06, + "loss": 3.5115, + "step": 23960 + }, + { + "epoch": 0.7022632388486048, + "grad_norm": 10.109893798828125, + "learning_rate": 8.947675146119168e-06, + "loss": 3.5059, + "step": 23970 + }, + { + "epoch": 0.7025562147513367, + "grad_norm": 10.815864562988281, + "learning_rate": 8.946704381287981e-06, + "loss": 3.5308, + "step": 23980 + }, + { + "epoch": 0.7028491906540687, + "grad_norm": 11.041875839233398, + "learning_rate": 8.945733221612439e-06, + "loss": 3.5356, + "step": 23990 + }, + { + 
"epoch": 0.7031421665568007, + "grad_norm": 12.12874698638916, + "learning_rate": 8.944761667189695e-06, + "loss": 3.5232, + "step": 24000 + }, + { + "epoch": 0.7034351424595328, + "grad_norm": 12.049763679504395, + "learning_rate": 8.943789718116949e-06, + "loss": 3.517, + "step": 24010 + }, + { + "epoch": 0.7037281183622647, + "grad_norm": 11.179682731628418, + "learning_rate": 8.94281737449144e-06, + "loss": 3.5213, + "step": 24020 + }, + { + "epoch": 0.7040210942649967, + "grad_norm": 11.710222244262695, + "learning_rate": 8.941844636410444e-06, + "loss": 3.5259, + "step": 24030 + }, + { + "epoch": 0.7043140701677287, + "grad_norm": 12.28187084197998, + "learning_rate": 8.94087150397128e-06, + "loss": 3.509, + "step": 24040 + }, + { + "epoch": 0.7046070460704607, + "grad_norm": 11.873539924621582, + "learning_rate": 8.9398979772713e-06, + "loss": 3.522, + "step": 24050 + }, + { + "epoch": 0.7049000219731927, + "grad_norm": 10.503429412841797, + "learning_rate": 8.938924056407906e-06, + "loss": 3.5021, + "step": 24060 + }, + { + "epoch": 0.7050172123342855, + "eval_bleu": 0.32661118778173165, + "eval_cap_loss": 0.9821113348007202, + "eval_con_loss": 1.3665580749511719, + "eval_loss": 3.7152276039123535, + "step": 24064 + }, + { + "epoch": 0.7050172123342855, + "eval_bleu": 0.32661118778173165, + "eval_cap_loss": 0.9821113348007202, + "eval_con_loss": 1.3665580749511719, + "eval_loss": 3.7152276039123535, + "eval_runtime": 54.4744, + "eval_samples_per_second": 367.145, + "eval_steps_per_second": 0.367, + "step": 24064 + }, + { + "epoch": 0.7051929978759247, + "grad_norm": 13.224287033081055, + "learning_rate": 8.937949741478524e-06, + "loss": 3.536, + "step": 24070 + }, + { + "epoch": 0.7054859737786567, + "grad_norm": 12.883806228637695, + "learning_rate": 8.936975032580637e-06, + "loss": 3.505, + "step": 24080 + }, + { + "epoch": 0.7057789496813887, + "grad_norm": 12.830772399902344, + "learning_rate": 8.935999929811754e-06, + "loss": 3.5385, + "step": 24090 + 
}, + { + "epoch": 0.7060719255841207, + "grad_norm": 12.296327590942383, + "learning_rate": 8.93502443326943e-06, + "loss": 3.5206, + "step": 24100 + }, + { + "epoch": 0.7063649014868527, + "grad_norm": 11.085996627807617, + "learning_rate": 8.934048543051256e-06, + "loss": 3.4988, + "step": 24110 + }, + { + "epoch": 0.7066578773895847, + "grad_norm": 11.030000686645508, + "learning_rate": 8.933072259254867e-06, + "loss": 3.5313, + "step": 24120 + }, + { + "epoch": 0.7069508532923167, + "grad_norm": 12.153203010559082, + "learning_rate": 8.93209558197793e-06, + "loss": 3.501, + "step": 24130 + }, + { + "epoch": 0.7072438291950487, + "grad_norm": 11.430927276611328, + "learning_rate": 8.931118511318162e-06, + "loss": 3.5439, + "step": 24140 + }, + { + "epoch": 0.7075368050977807, + "grad_norm": 11.973697662353516, + "learning_rate": 8.930141047373306e-06, + "loss": 3.5181, + "step": 24150 + }, + { + "epoch": 0.7078297810005127, + "grad_norm": 11.437094688415527, + "learning_rate": 8.929163190241157e-06, + "loss": 3.5025, + "step": 24160 + }, + { + "epoch": 0.7081227569032447, + "grad_norm": 11.59714126586914, + "learning_rate": 8.928184940019541e-06, + "loss": 3.5344, + "step": 24170 + }, + { + "epoch": 0.7084157328059767, + "grad_norm": 12.436626434326172, + "learning_rate": 8.927206296806328e-06, + "loss": 3.4996, + "step": 24180 + }, + { + "epoch": 0.7087087087087087, + "grad_norm": 11.834846496582031, + "learning_rate": 8.926227260699423e-06, + "loss": 3.522, + "step": 24190 + }, + { + "epoch": 0.7090016846114408, + "grad_norm": 11.203320503234863, + "learning_rate": 8.925247831796775e-06, + "loss": 3.5264, + "step": 24200 + }, + { + "epoch": 0.7092946605141727, + "grad_norm": 11.450814247131348, + "learning_rate": 8.924268010196369e-06, + "loss": 3.5094, + "step": 24210 + }, + { + "epoch": 0.7095876364169047, + "grad_norm": 11.18714714050293, + "learning_rate": 8.92328779599623e-06, + "loss": 3.5244, + "step": 24220 + }, + { + "epoch": 0.7098806123196367, + 
"grad_norm": 10.967910766601562, + "learning_rate": 8.922307189294422e-06, + "loss": 3.4797, + "step": 24230 + }, + { + "epoch": 0.7101735882223688, + "grad_norm": 12.430017471313477, + "learning_rate": 8.921326190189051e-06, + "loss": 3.519, + "step": 24240 + }, + { + "epoch": 0.7104665641251007, + "grad_norm": 13.725675582885742, + "learning_rate": 8.92034479877826e-06, + "loss": 3.5337, + "step": 24250 + }, + { + "epoch": 0.7107595400278327, + "grad_norm": 11.078914642333984, + "learning_rate": 8.91936301516023e-06, + "loss": 3.5057, + "step": 24260 + }, + { + "epoch": 0.7110525159305647, + "grad_norm": 11.018447875976562, + "learning_rate": 8.918380839433182e-06, + "loss": 3.5428, + "step": 24270 + }, + { + "epoch": 0.7113454918332968, + "grad_norm": 11.555411338806152, + "learning_rate": 8.917398271695379e-06, + "loss": 3.5269, + "step": 24280 + }, + { + "epoch": 0.7116384677360287, + "grad_norm": 11.29918384552002, + "learning_rate": 8.91641531204512e-06, + "loss": 3.5147, + "step": 24290 + }, + { + "epoch": 0.7119314436387607, + "grad_norm": 12.266815185546875, + "learning_rate": 8.915431960580742e-06, + "loss": 3.5358, + "step": 24300 + }, + { + "epoch": 0.7122244195414927, + "grad_norm": 10.661211967468262, + "learning_rate": 8.91444821740063e-06, + "loss": 3.5227, + "step": 24310 + }, + { + "epoch": 0.7125173954442248, + "grad_norm": 10.535469055175781, + "learning_rate": 8.913464082603195e-06, + "loss": 3.4935, + "step": 24320 + }, + { + "epoch": 0.7128103713469567, + "grad_norm": 11.3489351272583, + "learning_rate": 8.912479556286897e-06, + "loss": 3.5052, + "step": 24330 + }, + { + "epoch": 0.7131033472496887, + "grad_norm": 11.908455848693848, + "learning_rate": 8.91149463855023e-06, + "loss": 3.5189, + "step": 24340 + }, + { + "epoch": 0.7133963231524207, + "grad_norm": 11.79290771484375, + "learning_rate": 8.910509329491732e-06, + "loss": 3.5446, + "step": 24350 + }, + { + "epoch": 0.7136892990551527, + "grad_norm": 11.956856727600098, + 
"learning_rate": 8.909523629209974e-06, + "loss": 3.4904, + "step": 24360 + }, + { + "epoch": 0.7139822749578847, + "grad_norm": 10.926562309265137, + "learning_rate": 8.908537537803574e-06, + "loss": 3.4972, + "step": 24370 + }, + { + "epoch": 0.7142752508606167, + "grad_norm": 12.97148609161377, + "learning_rate": 8.907551055371181e-06, + "loss": 3.5392, + "step": 24380 + }, + { + "epoch": 0.7145682267633487, + "grad_norm": 12.822108268737793, + "learning_rate": 8.906564182011489e-06, + "loss": 3.5283, + "step": 24390 + }, + { + "epoch": 0.7148612026660807, + "grad_norm": 10.966043472290039, + "learning_rate": 8.905576917823224e-06, + "loss": 3.5499, + "step": 24400 + }, + { + "epoch": 0.7151541785688127, + "grad_norm": 12.285103797912598, + "learning_rate": 8.904589262905162e-06, + "loss": 3.506, + "step": 24410 + }, + { + "epoch": 0.7154471544715447, + "grad_norm": 10.480936050415039, + "learning_rate": 8.903601217356106e-06, + "loss": 3.5219, + "step": 24420 + }, + { + "epoch": 0.7157401303742768, + "grad_norm": 11.951239585876465, + "learning_rate": 8.902612781274911e-06, + "loss": 3.5458, + "step": 24430 + }, + { + "epoch": 0.7160331062770087, + "grad_norm": 11.74111557006836, + "learning_rate": 8.90162395476046e-06, + "loss": 3.5172, + "step": 24440 + }, + { + "epoch": 0.7163260821797407, + "grad_norm": 10.48538875579834, + "learning_rate": 8.90063473791168e-06, + "loss": 3.5103, + "step": 24450 + }, + { + "epoch": 0.7166190580824727, + "grad_norm": 11.668442726135254, + "learning_rate": 8.899645130827534e-06, + "loss": 3.5059, + "step": 24460 + }, + { + "epoch": 0.7169120339852048, + "grad_norm": 12.590505599975586, + "learning_rate": 8.89865513360703e-06, + "loss": 3.5036, + "step": 24470 + }, + { + "epoch": 0.7172050098879367, + "grad_norm": 11.579195976257324, + "learning_rate": 8.89766474634921e-06, + "loss": 3.5234, + "step": 24480 + }, + { + "epoch": 0.7174979857906687, + "grad_norm": 10.53331184387207, + "learning_rate": 8.896673969153157e-06, + 
"loss": 3.4951, + "step": 24490 + }, + { + "epoch": 0.7177909616934007, + "grad_norm": 11.60143756866455, + "learning_rate": 8.895682802117991e-06, + "loss": 3.4924, + "step": 24500 + }, + { + "epoch": 0.7180839375961328, + "grad_norm": 10.502588272094727, + "learning_rate": 8.894691245342873e-06, + "loss": 3.5109, + "step": 24510 + }, + { + "epoch": 0.7183769134988647, + "grad_norm": 12.179255485534668, + "learning_rate": 8.893699298927e-06, + "loss": 3.5129, + "step": 24520 + }, + { + "epoch": 0.7186698894015967, + "grad_norm": 11.437222480773926, + "learning_rate": 8.892706962969615e-06, + "loss": 3.4896, + "step": 24530 + }, + { + "epoch": 0.7189628653043287, + "grad_norm": 10.996716499328613, + "learning_rate": 8.891714237569992e-06, + "loss": 3.493, + "step": 24540 + }, + { + "epoch": 0.7192558412070608, + "grad_norm": 12.065309524536133, + "learning_rate": 8.89072112282745e-06, + "loss": 3.534, + "step": 24550 + }, + { + "epoch": 0.7195488171097927, + "grad_norm": 11.60893726348877, + "learning_rate": 8.889727618841342e-06, + "loss": 3.5082, + "step": 24560 + }, + { + "epoch": 0.7198417930125247, + "grad_norm": 12.71548843383789, + "learning_rate": 8.88873372571106e-06, + "loss": 3.5342, + "step": 24570 + }, + { + "epoch": 0.7200175785541639, + "eval_bleu": 0.3266108934243602, + "eval_cap_loss": 0.9809491634368896, + "eval_con_loss": 1.3629838228225708, + "eval_loss": 3.7069168090820312, + "step": 24576 + }, + { + "epoch": 0.7200175785541639, + "eval_bleu": 0.3266108934243602, + "eval_cap_loss": 0.9809491634368896, + "eval_con_loss": 1.3629838228225708, + "eval_loss": 3.7069168090820312, + "eval_runtime": 60.6089, + "eval_samples_per_second": 329.985, + "eval_steps_per_second": 0.33, + "step": 24576 + }, + { + "epoch": 0.7201347689152567, + "grad_norm": 11.98422622680664, + "learning_rate": 8.887739443536043e-06, + "loss": 3.5125, + "step": 24580 + }, + { + "epoch": 0.7204277448179888, + "grad_norm": 10.895346641540527, + "learning_rate": 
8.886744772415758e-06, + "loss": 3.5174, + "step": 24590 + }, + { + "epoch": 0.7207207207207207, + "grad_norm": 10.164366722106934, + "learning_rate": 8.88574971244972e-06, + "loss": 3.5121, + "step": 24600 + }, + { + "epoch": 0.7210136966234527, + "grad_norm": 11.261341094970703, + "learning_rate": 8.884754263737473e-06, + "loss": 3.5064, + "step": 24610 + }, + { + "epoch": 0.7213066725261847, + "grad_norm": 11.922615051269531, + "learning_rate": 8.883758426378613e-06, + "loss": 3.5132, + "step": 24620 + }, + { + "epoch": 0.7215996484289168, + "grad_norm": 11.948569297790527, + "learning_rate": 8.882762200472761e-06, + "loss": 3.5096, + "step": 24630 + }, + { + "epoch": 0.7218926243316487, + "grad_norm": 11.485897064208984, + "learning_rate": 8.881765586119589e-06, + "loss": 3.5101, + "step": 24640 + }, + { + "epoch": 0.7221856002343807, + "grad_norm": 10.856199264526367, + "learning_rate": 8.8807685834188e-06, + "loss": 3.5123, + "step": 24650 + }, + { + "epoch": 0.7224785761371127, + "grad_norm": 11.789773941040039, + "learning_rate": 8.879771192470136e-06, + "loss": 3.5042, + "step": 24660 + }, + { + "epoch": 0.7227715520398448, + "grad_norm": 11.700227737426758, + "learning_rate": 8.878773413373383e-06, + "loss": 3.5027, + "step": 24670 + }, + { + "epoch": 0.7230645279425767, + "grad_norm": 12.389700889587402, + "learning_rate": 8.877775246228361e-06, + "loss": 3.4779, + "step": 24680 + }, + { + "epoch": 0.7233575038453087, + "grad_norm": 12.176299095153809, + "learning_rate": 8.876776691134933e-06, + "loss": 3.5277, + "step": 24690 + }, + { + "epoch": 0.7236504797480408, + "grad_norm": 11.476484298706055, + "learning_rate": 8.875777748192997e-06, + "loss": 3.5056, + "step": 24700 + }, + { + "epoch": 0.7239434556507727, + "grad_norm": 11.658970832824707, + "learning_rate": 8.87477841750249e-06, + "loss": 3.5501, + "step": 24710 + }, + { + "epoch": 0.7242364315535047, + "grad_norm": 11.678295135498047, + "learning_rate": 8.873778699163393e-06, + "loss": 3.4939, 
+ "step": 24720 + }, + { + "epoch": 0.7245294074562367, + "grad_norm": 11.874053955078125, + "learning_rate": 8.872778593275717e-06, + "loss": 3.5125, + "step": 24730 + }, + { + "epoch": 0.7248223833589688, + "grad_norm": 11.426264762878418, + "learning_rate": 8.87177809993952e-06, + "loss": 3.5253, + "step": 24740 + }, + { + "epoch": 0.7251153592617007, + "grad_norm": 11.655733108520508, + "learning_rate": 8.870777219254895e-06, + "loss": 3.5094, + "step": 24750 + }, + { + "epoch": 0.7254083351644327, + "grad_norm": 11.991338729858398, + "learning_rate": 8.869775951321976e-06, + "loss": 3.5117, + "step": 24760 + }, + { + "epoch": 0.7257013110671647, + "grad_norm": 11.764793395996094, + "learning_rate": 8.86877429624093e-06, + "loss": 3.4717, + "step": 24770 + }, + { + "epoch": 0.7259942869698968, + "grad_norm": 12.00718879699707, + "learning_rate": 8.867772254111966e-06, + "loss": 3.5088, + "step": 24780 + }, + { + "epoch": 0.7262872628726287, + "grad_norm": 11.261914253234863, + "learning_rate": 8.866769825035337e-06, + "loss": 3.4891, + "step": 24790 + }, + { + "epoch": 0.7265802387753607, + "grad_norm": 11.745342254638672, + "learning_rate": 8.865767009111327e-06, + "loss": 3.4875, + "step": 24800 + }, + { + "epoch": 0.7268732146780927, + "grad_norm": 10.66648006439209, + "learning_rate": 8.864763806440263e-06, + "loss": 3.5133, + "step": 24810 + }, + { + "epoch": 0.7271661905808248, + "grad_norm": 10.829792022705078, + "learning_rate": 8.863760217122509e-06, + "loss": 3.4997, + "step": 24820 + }, + { + "epoch": 0.7274591664835567, + "grad_norm": 11.719536781311035, + "learning_rate": 8.862756241258467e-06, + "loss": 3.5042, + "step": 24830 + }, + { + "epoch": 0.7277521423862887, + "grad_norm": 11.84765911102295, + "learning_rate": 8.861751878948583e-06, + "loss": 3.5201, + "step": 24840 + }, + { + "epoch": 0.7280451182890207, + "grad_norm": 10.58285140991211, + "learning_rate": 8.860747130293332e-06, + "loss": 3.502, + "step": 24850 + }, + { + "epoch": 
0.7283380941917528, + "grad_norm": 11.238059043884277, + "learning_rate": 8.859741995393236e-06, + "loss": 3.4946, + "step": 24860 + }, + { + "epoch": 0.7286310700944847, + "grad_norm": 11.634581565856934, + "learning_rate": 8.858736474348853e-06, + "loss": 3.4959, + "step": 24870 + }, + { + "epoch": 0.7289240459972167, + "grad_norm": 11.845142364501953, + "learning_rate": 8.857730567260779e-06, + "loss": 3.5019, + "step": 24880 + }, + { + "epoch": 0.7292170218999487, + "grad_norm": 11.800474166870117, + "learning_rate": 8.856724274229646e-06, + "loss": 3.4742, + "step": 24890 + }, + { + "epoch": 0.7295099978026808, + "grad_norm": 11.491097450256348, + "learning_rate": 8.855717595356135e-06, + "loss": 3.4944, + "step": 24900 + }, + { + "epoch": 0.7298029737054127, + "grad_norm": 11.339518547058105, + "learning_rate": 8.85471053074095e-06, + "loss": 3.4951, + "step": 24910 + }, + { + "epoch": 0.7300959496081447, + "grad_norm": 12.660045623779297, + "learning_rate": 8.853703080484848e-06, + "loss": 3.4895, + "step": 24920 + }, + { + "epoch": 0.7303889255108768, + "grad_norm": 11.755243301391602, + "learning_rate": 8.852695244688615e-06, + "loss": 3.4865, + "step": 24930 + }, + { + "epoch": 0.7306819014136088, + "grad_norm": 11.596108436584473, + "learning_rate": 8.85168702345308e-06, + "loss": 3.5089, + "step": 24940 + }, + { + "epoch": 0.7309748773163407, + "grad_norm": 9.963410377502441, + "learning_rate": 8.850678416879111e-06, + "loss": 3.4873, + "step": 24950 + }, + { + "epoch": 0.7312678532190727, + "grad_norm": 11.311752319335938, + "learning_rate": 8.849669425067609e-06, + "loss": 3.4879, + "step": 24960 + }, + { + "epoch": 0.7315608291218048, + "grad_norm": 11.752996444702148, + "learning_rate": 8.848660048119521e-06, + "loss": 3.4996, + "step": 24970 + }, + { + "epoch": 0.7318538050245368, + "grad_norm": 11.519777297973633, + "learning_rate": 8.847650286135832e-06, + "loss": 3.4954, + "step": 24980 + }, + { + "epoch": 0.7321467809272687, + "grad_norm": 
9.960298538208008, + "learning_rate": 8.846640139217555e-06, + "loss": 3.4863, + "step": 24990 + }, + { + "epoch": 0.7324397568300007, + "grad_norm": 12.422157287597656, + "learning_rate": 8.845629607465756e-06, + "loss": 3.4724, + "step": 25000 + }, + { + "epoch": 0.7327327327327328, + "grad_norm": 11.697816848754883, + "learning_rate": 8.844618690981527e-06, + "loss": 3.4986, + "step": 25010 + }, + { + "epoch": 0.7330257086354647, + "grad_norm": 10.965132713317871, + "learning_rate": 8.843607389866011e-06, + "loss": 3.465, + "step": 25020 + }, + { + "epoch": 0.7333186845381967, + "grad_norm": 11.560293197631836, + "learning_rate": 8.842595704220379e-06, + "loss": 3.4767, + "step": 25030 + }, + { + "epoch": 0.7336116604409287, + "grad_norm": 12.621792793273926, + "learning_rate": 8.841583634145841e-06, + "loss": 3.4903, + "step": 25040 + }, + { + "epoch": 0.7339046363436608, + "grad_norm": 11.40277099609375, + "learning_rate": 8.840571179743653e-06, + "loss": 3.5096, + "step": 25050 + }, + { + "epoch": 0.7341976122463927, + "grad_norm": 11.389336585998535, + "learning_rate": 8.839558341115104e-06, + "loss": 3.4882, + "step": 25060 + }, + { + "epoch": 0.7344905881491247, + "grad_norm": 10.850617408752441, + "learning_rate": 8.838545118361523e-06, + "loss": 3.5032, + "step": 25070 + }, + { + "epoch": 0.7347835640518567, + "grad_norm": 11.464756965637207, + "learning_rate": 8.837531511584276e-06, + "loss": 3.4889, + "step": 25080 + }, + { + "epoch": 0.7350179447740424, + "eval_bleu": 0.32625002537676046, + "eval_cap_loss": 0.9810991287231445, + "eval_con_loss": 1.3564846515655518, + "eval_loss": 3.694068431854248, + "step": 25088 + }, + { + "epoch": 0.7350179447740424, + "eval_bleu": 0.32625002537676046, + "eval_cap_loss": 0.9810991287231445, + "eval_con_loss": 1.3564846515655518, + "eval_loss": 3.694068431854248, + "eval_runtime": 57.1031, + "eval_samples_per_second": 350.244, + "eval_steps_per_second": 0.35, + "step": 25088 + }, + { + "epoch": 0.7350765399545888, + 
"grad_norm": 11.319315910339355, + "learning_rate": 8.836517520884768e-06, + "loss": 3.4779, + "step": 25090 + }, + { + "epoch": 0.7353695158573207, + "grad_norm": 11.707077980041504, + "learning_rate": 8.835503146364442e-06, + "loss": 3.4906, + "step": 25100 + }, + { + "epoch": 0.7356624917600527, + "grad_norm": 11.228324890136719, + "learning_rate": 8.834488388124785e-06, + "loss": 3.5008, + "step": 25110 + }, + { + "epoch": 0.7359554676627847, + "grad_norm": 11.0851411819458, + "learning_rate": 8.83347324626731e-06, + "loss": 3.5014, + "step": 25120 + }, + { + "epoch": 0.7362484435655168, + "grad_norm": 11.547858238220215, + "learning_rate": 8.832457720893583e-06, + "loss": 3.5109, + "step": 25130 + }, + { + "epoch": 0.7365414194682487, + "grad_norm": 10.812849998474121, + "learning_rate": 8.831441812105197e-06, + "loss": 3.4833, + "step": 25140 + }, + { + "epoch": 0.7368343953709807, + "grad_norm": 12.422633171081543, + "learning_rate": 8.830425520003788e-06, + "loss": 3.4948, + "step": 25150 + }, + { + "epoch": 0.7371273712737128, + "grad_norm": 10.860272407531738, + "learning_rate": 8.829408844691032e-06, + "loss": 3.4899, + "step": 25160 + }, + { + "epoch": 0.7374203471764448, + "grad_norm": 11.92484188079834, + "learning_rate": 8.82839178626864e-06, + "loss": 3.4986, + "step": 25170 + }, + { + "epoch": 0.7377133230791767, + "grad_norm": 11.050969123840332, + "learning_rate": 8.827374344838363e-06, + "loss": 3.4809, + "step": 25180 + }, + { + "epoch": 0.7380062989819087, + "grad_norm": 12.11875057220459, + "learning_rate": 8.82635652050199e-06, + "loss": 3.4865, + "step": 25190 + }, + { + "epoch": 0.7382992748846408, + "grad_norm": 11.547417640686035, + "learning_rate": 8.825338313361347e-06, + "loss": 3.499, + "step": 25200 + }, + { + "epoch": 0.7385922507873728, + "grad_norm": 11.478644371032715, + "learning_rate": 8.824319723518301e-06, + "loss": 3.5062, + "step": 25210 + }, + { + "epoch": 0.7388852266901047, + "grad_norm": 11.303799629211426, + 
"learning_rate": 8.823300751074757e-06, + "loss": 3.4613, + "step": 25220 + }, + { + "epoch": 0.7391782025928367, + "grad_norm": 10.800552368164062, + "learning_rate": 8.822281396132656e-06, + "loss": 3.4854, + "step": 25230 + }, + { + "epoch": 0.7394711784955688, + "grad_norm": 11.215572357177734, + "learning_rate": 8.821261658793976e-06, + "loss": 3.5012, + "step": 25240 + }, + { + "epoch": 0.7397641543983008, + "grad_norm": 10.876871109008789, + "learning_rate": 8.82024153916074e-06, + "loss": 3.494, + "step": 25250 + }, + { + "epoch": 0.7400571303010327, + "grad_norm": 11.532855033874512, + "learning_rate": 8.819221037335001e-06, + "loss": 3.4839, + "step": 25260 + }, + { + "epoch": 0.7403501062037647, + "grad_norm": 11.356911659240723, + "learning_rate": 8.818200153418856e-06, + "loss": 3.4757, + "step": 25270 + }, + { + "epoch": 0.7406430821064968, + "grad_norm": 11.101908683776855, + "learning_rate": 8.817178887514438e-06, + "loss": 3.4899, + "step": 25280 + }, + { + "epoch": 0.7409360580092288, + "grad_norm": 11.81002140045166, + "learning_rate": 8.81615723972392e-06, + "loss": 3.5058, + "step": 25290 + }, + { + "epoch": 0.7412290339119607, + "grad_norm": 11.097179412841797, + "learning_rate": 8.81513521014951e-06, + "loss": 3.4747, + "step": 25300 + }, + { + "epoch": 0.7415220098146927, + "grad_norm": 11.508158683776855, + "learning_rate": 8.814112798893456e-06, + "loss": 3.4824, + "step": 25310 + }, + { + "epoch": 0.7418149857174248, + "grad_norm": 11.45166301727295, + "learning_rate": 8.813090006058045e-06, + "loss": 3.5045, + "step": 25320 + }, + { + "epoch": 0.7421079616201568, + "grad_norm": 10.907182693481445, + "learning_rate": 8.812066831745602e-06, + "loss": 3.4825, + "step": 25330 + }, + { + "epoch": 0.7424009375228887, + "grad_norm": 10.463828086853027, + "learning_rate": 8.81104327605849e-06, + "loss": 3.4892, + "step": 25340 + }, + { + "epoch": 0.7426939134256207, + "grad_norm": 9.91385555267334, + "learning_rate": 8.810019339099107e-06, + 
"loss": 3.4885, + "step": 25350 + }, + { + "epoch": 0.7429868893283528, + "grad_norm": 11.646224975585938, + "learning_rate": 8.808995020969893e-06, + "loss": 3.4764, + "step": 25360 + }, + { + "epoch": 0.7432798652310847, + "grad_norm": 10.931553840637207, + "learning_rate": 8.807970321773325e-06, + "loss": 3.4943, + "step": 25370 + }, + { + "epoch": 0.7435728411338167, + "grad_norm": 13.621423721313477, + "learning_rate": 8.806945241611918e-06, + "loss": 3.4824, + "step": 25380 + }, + { + "epoch": 0.7438658170365487, + "grad_norm": 11.316859245300293, + "learning_rate": 8.805919780588225e-06, + "loss": 3.4769, + "step": 25390 + }, + { + "epoch": 0.7441587929392808, + "grad_norm": 10.772239685058594, + "learning_rate": 8.804893938804839e-06, + "loss": 3.4778, + "step": 25400 + }, + { + "epoch": 0.7444517688420127, + "grad_norm": 11.187529563903809, + "learning_rate": 8.803867716364388e-06, + "loss": 3.4635, + "step": 25410 + }, + { + "epoch": 0.7447447447447447, + "grad_norm": 11.486611366271973, + "learning_rate": 8.80284111336954e-06, + "loss": 3.5133, + "step": 25420 + }, + { + "epoch": 0.7450377206474768, + "grad_norm": 10.777658462524414, + "learning_rate": 8.801814129922998e-06, + "loss": 3.467, + "step": 25430 + }, + { + "epoch": 0.7453306965502088, + "grad_norm": 11.051767349243164, + "learning_rate": 8.80078676612751e-06, + "loss": 3.4985, + "step": 25440 + }, + { + "epoch": 0.7456236724529407, + "grad_norm": 11.127635955810547, + "learning_rate": 8.799759022085853e-06, + "loss": 3.4919, + "step": 25450 + }, + { + "epoch": 0.7459166483556727, + "grad_norm": 10.56207275390625, + "learning_rate": 8.798730897900852e-06, + "loss": 3.4691, + "step": 25460 + }, + { + "epoch": 0.7462096242584048, + "grad_norm": 11.330588340759277, + "learning_rate": 8.79770239367536e-06, + "loss": 3.4811, + "step": 25470 + }, + { + "epoch": 0.7465026001611368, + "grad_norm": 11.958491325378418, + "learning_rate": 8.796673509512274e-06, + "loss": 3.4745, + "step": 25480 + }, + { 
+ "epoch": 0.7467955760638687, + "grad_norm": 11.908867835998535, + "learning_rate": 8.795644245514529e-06, + "loss": 3.4547, + "step": 25490 + }, + { + "epoch": 0.7470885519666007, + "grad_norm": 10.803675651550293, + "learning_rate": 8.794614601785097e-06, + "loss": 3.4947, + "step": 25500 + }, + { + "epoch": 0.7473815278693328, + "grad_norm": 12.811369895935059, + "learning_rate": 8.793584578426985e-06, + "loss": 3.5073, + "step": 25510 + }, + { + "epoch": 0.7476745037720648, + "grad_norm": 11.034014701843262, + "learning_rate": 8.792554175543244e-06, + "loss": 3.4787, + "step": 25520 + }, + { + "epoch": 0.7479674796747967, + "grad_norm": 10.212488174438477, + "learning_rate": 8.791523393236957e-06, + "loss": 3.478, + "step": 25530 + }, + { + "epoch": 0.7482604555775287, + "grad_norm": 10.470259666442871, + "learning_rate": 8.790492231611249e-06, + "loss": 3.5142, + "step": 25540 + }, + { + "epoch": 0.7485534314802608, + "grad_norm": 11.49903392791748, + "learning_rate": 8.78946069076928e-06, + "loss": 3.4811, + "step": 25550 + }, + { + "epoch": 0.7488464073829928, + "grad_norm": 10.950384140014648, + "learning_rate": 8.788428770814255e-06, + "loss": 3.4899, + "step": 25560 + }, + { + "epoch": 0.7491393832857247, + "grad_norm": 11.397994041442871, + "learning_rate": 8.787396471849402e-06, + "loss": 3.4977, + "step": 25570 + }, + { + "epoch": 0.7494323591884567, + "grad_norm": 10.550149917602539, + "learning_rate": 8.786363793978008e-06, + "loss": 3.4759, + "step": 25580 + }, + { + "epoch": 0.7497253350911888, + "grad_norm": 11.568855285644531, + "learning_rate": 8.785330737303375e-06, + "loss": 3.4716, + "step": 25590 + }, + { + "epoch": 0.7500183109939208, + "grad_norm": 11.43117904663086, + "learning_rate": 8.784297301928862e-06, + "loss": 3.4817, + "step": 25600 + }, + { + "epoch": 0.7500183109939208, + "eval_bleu": 0.32823002285660385, + "eval_cap_loss": 0.9771826267242432, + "eval_con_loss": 1.350681185722351, + "eval_loss": 3.6785449981689453, + "step": 
25600 + }, + { + "epoch": 0.7500183109939208, + "eval_bleu": 0.32823002285660385, + "eval_cap_loss": 0.9771826267242432, + "eval_con_loss": 1.350681185722351, + "eval_loss": 3.6785449981689453, + "eval_runtime": 55.3272, + "eval_samples_per_second": 361.486, + "eval_steps_per_second": 0.361, + "step": 25600 + }, + { + "epoch": 0.7503112868966527, + "grad_norm": 11.698363304138184, + "learning_rate": 8.783263487957856e-06, + "loss": 3.4999, + "step": 25610 + }, + { + "epoch": 0.7506042627993847, + "grad_norm": 11.297891616821289, + "learning_rate": 8.782229295493781e-06, + "loss": 3.4742, + "step": 25620 + }, + { + "epoch": 0.7508972387021168, + "grad_norm": 11.811670303344727, + "learning_rate": 8.781194724640105e-06, + "loss": 3.48, + "step": 25630 + }, + { + "epoch": 0.7511902146048488, + "grad_norm": 11.780673027038574, + "learning_rate": 8.780159775500331e-06, + "loss": 3.4922, + "step": 25640 + }, + { + "epoch": 0.7514831905075807, + "grad_norm": 10.605260848999023, + "learning_rate": 8.779124448177998e-06, + "loss": 3.4988, + "step": 25650 + }, + { + "epoch": 0.7517761664103128, + "grad_norm": 12.532360076904297, + "learning_rate": 8.778088742776685e-06, + "loss": 3.4783, + "step": 25660 + }, + { + "epoch": 0.7520691423130448, + "grad_norm": 11.456196784973145, + "learning_rate": 8.777052659400006e-06, + "loss": 3.4766, + "step": 25670 + }, + { + "epoch": 0.7523621182157767, + "grad_norm": 11.14148998260498, + "learning_rate": 8.776016198151617e-06, + "loss": 3.467, + "step": 25680 + }, + { + "epoch": 0.7526550941185087, + "grad_norm": 10.795299530029297, + "learning_rate": 8.774979359135211e-06, + "loss": 3.4823, + "step": 25690 + }, + { + "epoch": 0.7529480700212408, + "grad_norm": 10.761544227600098, + "learning_rate": 8.773942142454518e-06, + "loss": 3.4881, + "step": 25700 + }, + { + "epoch": 0.7532410459239728, + "grad_norm": 12.081098556518555, + "learning_rate": 8.772904548213301e-06, + "loss": 3.4878, + "step": 25710 + }, + { + "epoch": 
0.7535340218267047, + "grad_norm": 11.590167999267578, + "learning_rate": 8.77186657651537e-06, + "loss": 3.4611, + "step": 25720 + }, + { + "epoch": 0.7538269977294367, + "grad_norm": 10.403399467468262, + "learning_rate": 8.770828227464564e-06, + "loss": 3.4538, + "step": 25730 + }, + { + "epoch": 0.7541199736321688, + "grad_norm": 11.072731971740723, + "learning_rate": 8.769789501164767e-06, + "loss": 3.4733, + "step": 25740 + }, + { + "epoch": 0.7544129495349008, + "grad_norm": 11.21470832824707, + "learning_rate": 8.768750397719893e-06, + "loss": 3.4709, + "step": 25750 + }, + { + "epoch": 0.7547059254376327, + "grad_norm": 11.890630722045898, + "learning_rate": 8.767710917233903e-06, + "loss": 3.4585, + "step": 25760 + }, + { + "epoch": 0.7549989013403647, + "grad_norm": 11.753706932067871, + "learning_rate": 8.766671059810788e-06, + "loss": 3.4993, + "step": 25770 + }, + { + "epoch": 0.7552918772430968, + "grad_norm": 12.383502960205078, + "learning_rate": 8.76563082555458e-06, + "loss": 3.4613, + "step": 25780 + }, + { + "epoch": 0.7555848531458288, + "grad_norm": 11.437092781066895, + "learning_rate": 8.764590214569351e-06, + "loss": 3.4844, + "step": 25790 + }, + { + "epoch": 0.7558778290485607, + "grad_norm": 11.36427116394043, + "learning_rate": 8.763549226959204e-06, + "loss": 3.4842, + "step": 25800 + }, + { + "epoch": 0.7561708049512927, + "grad_norm": 11.537652969360352, + "learning_rate": 8.762507862828284e-06, + "loss": 3.4933, + "step": 25810 + }, + { + "epoch": 0.7564637808540248, + "grad_norm": 11.793440818786621, + "learning_rate": 8.761466122280776e-06, + "loss": 3.4738, + "step": 25820 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 10.76569652557373, + "learning_rate": 8.760424005420898e-06, + "loss": 3.4632, + "step": 25830 + }, + { + "epoch": 0.7570497326594887, + "grad_norm": 11.551819801330566, + "learning_rate": 8.759381512352909e-06, + "loss": 3.4713, + "step": 25840 + }, + { + "epoch": 0.7573427085622207, + "grad_norm": 
11.402678489685059, + "learning_rate": 8.758338643181102e-06, + "loss": 3.445, + "step": 25850 + }, + { + "epoch": 0.7576356844649528, + "grad_norm": 10.763091087341309, + "learning_rate": 8.757295398009812e-06, + "loss": 3.4504, + "step": 25860 + }, + { + "epoch": 0.7579286603676848, + "grad_norm": 12.878908157348633, + "learning_rate": 8.756251776943409e-06, + "loss": 3.4754, + "step": 25870 + }, + { + "epoch": 0.7582216362704167, + "grad_norm": 11.509954452514648, + "learning_rate": 8.755207780086302e-06, + "loss": 3.4822, + "step": 25880 + }, + { + "epoch": 0.7585146121731488, + "grad_norm": 10.94515323638916, + "learning_rate": 8.754163407542932e-06, + "loss": 3.455, + "step": 25890 + }, + { + "epoch": 0.7588075880758808, + "grad_norm": 11.554261207580566, + "learning_rate": 8.753118659417787e-06, + "loss": 3.493, + "step": 25900 + }, + { + "epoch": 0.7591005639786128, + "grad_norm": 11.854109764099121, + "learning_rate": 8.752073535815388e-06, + "loss": 3.5074, + "step": 25910 + }, + { + "epoch": 0.7593935398813447, + "grad_norm": 10.814809799194336, + "learning_rate": 8.751028036840292e-06, + "loss": 3.4786, + "step": 25920 + }, + { + "epoch": 0.7596865157840768, + "grad_norm": 11.409500122070312, + "learning_rate": 8.749982162597093e-06, + "loss": 3.4552, + "step": 25930 + }, + { + "epoch": 0.7599794916868088, + "grad_norm": 11.884573936462402, + "learning_rate": 8.748935913190428e-06, + "loss": 3.4699, + "step": 25940 + }, + { + "epoch": 0.7602724675895408, + "grad_norm": 11.875904083251953, + "learning_rate": 8.747889288724966e-06, + "loss": 3.4761, + "step": 25950 + }, + { + "epoch": 0.7605654434922727, + "grad_norm": 13.119754791259766, + "learning_rate": 8.74694700611732e-06, + "loss": 3.4673, + "step": 25960 + }, + { + "epoch": 0.7608584193950048, + "grad_norm": 12.724875450134277, + "learning_rate": 8.74589966932865e-06, + "loss": 3.4788, + "step": 25970 + }, + { + "epoch": 0.7611513952977368, + "grad_norm": 10.832013130187988, + "learning_rate": 
8.744851957784939e-06, + "loss": 3.4389, + "step": 25980 + }, + { + "epoch": 0.7614443712004688, + "grad_norm": 11.404438018798828, + "learning_rate": 8.743803871591008e-06, + "loss": 3.4841, + "step": 25990 + }, + { + "epoch": 0.7617373471032007, + "grad_norm": 11.715265274047852, + "learning_rate": 8.74275541085171e-06, + "loss": 3.4944, + "step": 26000 + }, + { + "epoch": 0.7620303230059328, + "grad_norm": 11.192203521728516, + "learning_rate": 8.74170657567194e-06, + "loss": 3.4723, + "step": 26010 + }, + { + "epoch": 0.7623232989086648, + "grad_norm": 10.69011116027832, + "learning_rate": 8.740657366156625e-06, + "loss": 3.4693, + "step": 26020 + }, + { + "epoch": 0.7626162748113967, + "grad_norm": 11.886370658874512, + "learning_rate": 8.739607782410733e-06, + "loss": 3.4759, + "step": 26030 + }, + { + "epoch": 0.7629092507141287, + "grad_norm": 11.649091720581055, + "learning_rate": 8.73855782453927e-06, + "loss": 3.4474, + "step": 26040 + }, + { + "epoch": 0.7632022266168608, + "grad_norm": 10.688881874084473, + "learning_rate": 8.737507492647274e-06, + "loss": 3.4523, + "step": 26050 + }, + { + "epoch": 0.7634952025195928, + "grad_norm": 10.733915328979492, + "learning_rate": 8.736456786839828e-06, + "loss": 3.4535, + "step": 26060 + }, + { + "epoch": 0.7637881784223247, + "grad_norm": 12.836689949035645, + "learning_rate": 8.73540570722205e-06, + "loss": 3.4618, + "step": 26070 + }, + { + "epoch": 0.7640811543250567, + "grad_norm": 13.102336883544922, + "learning_rate": 8.734354253899091e-06, + "loss": 3.4469, + "step": 26080 + }, + { + "epoch": 0.7643741302277888, + "grad_norm": 10.777247428894043, + "learning_rate": 8.733302426976144e-06, + "loss": 3.4575, + "step": 26090 + }, + { + "epoch": 0.7646671061305208, + "grad_norm": 10.423601150512695, + "learning_rate": 8.73225022655844e-06, + "loss": 3.4528, + "step": 26100 + }, + { + "epoch": 0.7649600820332527, + "grad_norm": 11.488088607788086, + "learning_rate": 8.73119765275124e-06, + "loss": 3.4491, + 
"step": 26110 + }, + { + "epoch": 0.7650186772137991, + "eval_bleu": 0.3289464233960948, + "eval_cap_loss": 0.9752053618431091, + "eval_con_loss": 1.34415602684021, + "eval_loss": 3.6635172367095947, + "step": 26112 + }, + { + "epoch": 0.7650186772137991, + "eval_bleu": 0.3289464233960948, + "eval_cap_loss": 0.9752053618431091, + "eval_con_loss": 1.34415602684021, + "eval_loss": 3.6635172367095947, + "eval_runtime": 57.0494, + "eval_samples_per_second": 350.573, + "eval_steps_per_second": 0.351, + "step": 26112 + }, + { + "epoch": 0.7652530579359847, + "grad_norm": 13.124210357666016, + "learning_rate": 8.730144705659853e-06, + "loss": 3.486, + "step": 26120 + }, + { + "epoch": 0.7655460338387168, + "grad_norm": 11.48253345489502, + "learning_rate": 8.72909138538962e-06, + "loss": 3.4623, + "step": 26130 + }, + { + "epoch": 0.7658390097414488, + "grad_norm": 11.946202278137207, + "learning_rate": 8.728037692045916e-06, + "loss": 3.47, + "step": 26140 + }, + { + "epoch": 0.7661319856441807, + "grad_norm": 11.51906681060791, + "learning_rate": 8.726983625734158e-06, + "loss": 3.4561, + "step": 26150 + }, + { + "epoch": 0.7664249615469128, + "grad_norm": 11.512109756469727, + "learning_rate": 8.725929186559801e-06, + "loss": 3.4395, + "step": 26160 + }, + { + "epoch": 0.7667179374496448, + "grad_norm": 10.6419095993042, + "learning_rate": 8.724874374628333e-06, + "loss": 3.4635, + "step": 26170 + }, + { + "epoch": 0.7670109133523768, + "grad_norm": 10.842241287231445, + "learning_rate": 8.723819190045284e-06, + "loss": 3.4596, + "step": 26180 + }, + { + "epoch": 0.7673038892551087, + "grad_norm": 9.546341896057129, + "learning_rate": 8.722763632916218e-06, + "loss": 3.4592, + "step": 26190 + }, + { + "epoch": 0.7675968651578408, + "grad_norm": 10.597944259643555, + "learning_rate": 8.721707703346736e-06, + "loss": 3.4825, + "step": 26200 + }, + { + "epoch": 0.7678898410605728, + "grad_norm": 10.417927742004395, + "learning_rate": 8.720651401442479e-06, + "loss": 
3.4642, + "step": 26210 + }, + { + "epoch": 0.7681828169633048, + "grad_norm": 10.518254280090332, + "learning_rate": 8.719594727309122e-06, + "loss": 3.4195, + "step": 26220 + }, + { + "epoch": 0.7684757928660367, + "grad_norm": 12.032302856445312, + "learning_rate": 8.718537681052382e-06, + "loss": 3.4425, + "step": 26230 + }, + { + "epoch": 0.7687687687687688, + "grad_norm": 11.250432014465332, + "learning_rate": 8.717480262778007e-06, + "loss": 3.4568, + "step": 26240 + }, + { + "epoch": 0.7690617446715008, + "grad_norm": 10.934386253356934, + "learning_rate": 8.716422472591786e-06, + "loss": 3.4489, + "step": 26250 + }, + { + "epoch": 0.7693547205742328, + "grad_norm": 11.181798934936523, + "learning_rate": 8.715364310599546e-06, + "loss": 3.4661, + "step": 26260 + }, + { + "epoch": 0.7696476964769647, + "grad_norm": 11.368573188781738, + "learning_rate": 8.71430577690715e-06, + "loss": 3.4624, + "step": 26270 + }, + { + "epoch": 0.7699406723796968, + "grad_norm": 11.75428581237793, + "learning_rate": 8.713246871620498e-06, + "loss": 3.4494, + "step": 26280 + }, + { + "epoch": 0.7702336482824288, + "grad_norm": 11.066402435302734, + "learning_rate": 8.712187594845525e-06, + "loss": 3.4436, + "step": 26290 + }, + { + "epoch": 0.7705266241851608, + "grad_norm": 11.713147163391113, + "learning_rate": 8.711127946688207e-06, + "loss": 3.4558, + "step": 26300 + }, + { + "epoch": 0.7708196000878927, + "grad_norm": 10.547122955322266, + "learning_rate": 8.710067927254555e-06, + "loss": 3.4686, + "step": 26310 + }, + { + "epoch": 0.7711125759906248, + "grad_norm": 10.781330108642578, + "learning_rate": 8.709007536650617e-06, + "loss": 3.4661, + "step": 26320 + }, + { + "epoch": 0.7714055518933568, + "grad_norm": 12.703742980957031, + "learning_rate": 8.707946774982483e-06, + "loss": 3.4667, + "step": 26330 + }, + { + "epoch": 0.7716985277960887, + "grad_norm": 12.119895935058594, + "learning_rate": 8.70688564235627e-06, + "loss": 3.4649, + "step": 26340 + }, + { + 
"epoch": 0.7719915036988207, + "grad_norm": 11.754212379455566, + "learning_rate": 8.705824138878141e-06, + "loss": 3.4647, + "step": 26350 + }, + { + "epoch": 0.7722844796015528, + "grad_norm": 12.61511516571045, + "learning_rate": 8.704762264654294e-06, + "loss": 3.4504, + "step": 26360 + }, + { + "epoch": 0.7725774555042848, + "grad_norm": 11.199565887451172, + "learning_rate": 8.70370001979096e-06, + "loss": 3.4533, + "step": 26370 + }, + { + "epoch": 0.7728704314070167, + "grad_norm": 11.562153816223145, + "learning_rate": 8.702637404394413e-06, + "loss": 3.4518, + "step": 26380 + }, + { + "epoch": 0.7731634073097488, + "grad_norm": 11.896048545837402, + "learning_rate": 8.701574418570962e-06, + "loss": 3.455, + "step": 26390 + }, + { + "epoch": 0.7734563832124808, + "grad_norm": 11.03256893157959, + "learning_rate": 8.70051106242695e-06, + "loss": 3.4569, + "step": 26400 + }, + { + "epoch": 0.7737493591152128, + "grad_norm": 10.823976516723633, + "learning_rate": 8.699447336068762e-06, + "loss": 3.4609, + "step": 26410 + }, + { + "epoch": 0.7740423350179447, + "grad_norm": 11.43677043914795, + "learning_rate": 8.698383239602815e-06, + "loss": 3.4577, + "step": 26420 + }, + { + "epoch": 0.7743353109206768, + "grad_norm": 11.049436569213867, + "learning_rate": 8.697318773135567e-06, + "loss": 3.4567, + "step": 26430 + }, + { + "epoch": 0.7746282868234088, + "grad_norm": 11.20285415649414, + "learning_rate": 8.696253936773511e-06, + "loss": 3.4718, + "step": 26440 + }, + { + "epoch": 0.7749212627261408, + "grad_norm": 11.59225845336914, + "learning_rate": 8.695188730623176e-06, + "loss": 3.4238, + "step": 26450 + }, + { + "epoch": 0.7752142386288727, + "grad_norm": 11.872284889221191, + "learning_rate": 8.694123154791134e-06, + "loss": 3.4867, + "step": 26460 + }, + { + "epoch": 0.7755072145316048, + "grad_norm": 10.188353538513184, + "learning_rate": 8.693057209383985e-06, + "loss": 3.4545, + "step": 26470 + }, + { + "epoch": 0.7758001904343368, + "grad_norm": 
11.370489120483398, + "learning_rate": 8.691990894508373e-06, + "loss": 3.4512, + "step": 26480 + }, + { + "epoch": 0.7760931663370688, + "grad_norm": 11.16467571258545, + "learning_rate": 8.690924210270976e-06, + "loss": 3.4663, + "step": 26490 + }, + { + "epoch": 0.7763861422398007, + "grad_norm": 11.555164337158203, + "learning_rate": 8.68985715677851e-06, + "loss": 3.4563, + "step": 26500 + }, + { + "epoch": 0.7766791181425328, + "grad_norm": 10.904953956604004, + "learning_rate": 8.688789734137727e-06, + "loss": 3.459, + "step": 26510 + }, + { + "epoch": 0.7769720940452648, + "grad_norm": 10.818974494934082, + "learning_rate": 8.687721942455414e-06, + "loss": 3.4666, + "step": 26520 + }, + { + "epoch": 0.7772650699479968, + "grad_norm": 10.468856811523438, + "learning_rate": 8.6866537818384e-06, + "loss": 3.4551, + "step": 26530 + }, + { + "epoch": 0.7775580458507287, + "grad_norm": 11.789722442626953, + "learning_rate": 8.685585252393547e-06, + "loss": 3.4511, + "step": 26540 + }, + { + "epoch": 0.7778510217534608, + "grad_norm": 12.754485130310059, + "learning_rate": 8.684516354227756e-06, + "loss": 3.4541, + "step": 26550 + }, + { + "epoch": 0.7781439976561928, + "grad_norm": 10.684979438781738, + "learning_rate": 8.683447087447962e-06, + "loss": 3.4731, + "step": 26560 + }, + { + "epoch": 0.7784369735589248, + "grad_norm": 10.487730979919434, + "learning_rate": 8.682377452161141e-06, + "loss": 3.4645, + "step": 26570 + }, + { + "epoch": 0.7787299494616567, + "grad_norm": 11.414506912231445, + "learning_rate": 8.681307448474303e-06, + "loss": 3.4432, + "step": 26580 + }, + { + "epoch": 0.7790229253643888, + "grad_norm": 12.111502647399902, + "learning_rate": 8.680237076494495e-06, + "loss": 3.467, + "step": 26590 + }, + { + "epoch": 0.7793159012671208, + "grad_norm": 10.689593315124512, + "learning_rate": 8.679166336328801e-06, + "loss": 3.4724, + "step": 26600 + }, + { + "epoch": 0.7796088771698528, + "grad_norm": 10.96544075012207, + "learning_rate": 
8.678095228084343e-06, + "loss": 3.4218, + "step": 26610 + }, + { + "epoch": 0.7799018530725847, + "grad_norm": 12.896249771118164, + "learning_rate": 8.677023751868279e-06, + "loss": 3.4587, + "step": 26620 + }, + { + "epoch": 0.7800190434336776, + "eval_bleu": 0.3291366221399193, + "eval_cap_loss": 0.9721251130104065, + "eval_con_loss": 1.3325741291046143, + "eval_loss": 3.6372733116149902, + "step": 26624 + }, + { + "epoch": 0.7800190434336776, + "eval_bleu": 0.3291366221399193, + "eval_cap_loss": 0.9721251130104065, + "eval_con_loss": 1.3325741291046143, + "eval_loss": 3.6372733116149902, + "eval_runtime": 55.3123, + "eval_samples_per_second": 361.583, + "eval_steps_per_second": 0.362, + "step": 26624 + }, + { + "epoch": 0.7801948289753168, + "grad_norm": 11.176589012145996, + "learning_rate": 8.675951907787804e-06, + "loss": 3.4542, + "step": 26630 + }, + { + "epoch": 0.7804878048780488, + "grad_norm": 13.271219253540039, + "learning_rate": 8.674879695950148e-06, + "loss": 3.4139, + "step": 26640 + }, + { + "epoch": 0.7807807807807807, + "grad_norm": 11.149083137512207, + "learning_rate": 8.673807116462582e-06, + "loss": 3.4322, + "step": 26650 + }, + { + "epoch": 0.7810737566835128, + "grad_norm": 11.477712631225586, + "learning_rate": 8.672734169432411e-06, + "loss": 3.448, + "step": 26660 + }, + { + "epoch": 0.7813667325862448, + "grad_norm": 11.119912147521973, + "learning_rate": 8.671660854966975e-06, + "loss": 3.4845, + "step": 26670 + }, + { + "epoch": 0.7816597084889768, + "grad_norm": 10.530683517456055, + "learning_rate": 8.670587173173652e-06, + "loss": 3.4479, + "step": 26680 + }, + { + "epoch": 0.7819526843917087, + "grad_norm": 11.172308921813965, + "learning_rate": 8.669513124159861e-06, + "loss": 3.4545, + "step": 26690 + }, + { + "epoch": 0.7822456602944408, + "grad_norm": 11.49101734161377, + "learning_rate": 8.66843870803305e-06, + "loss": 3.4821, + "step": 26700 + }, + { + "epoch": 0.7825386361971728, + "grad_norm": 11.83443832397461, + 
"learning_rate": 8.667363924900713e-06, + "loss": 3.4251, + "step": 26710 + }, + { + "epoch": 0.7828316120999048, + "grad_norm": 12.641042709350586, + "learning_rate": 8.666288774870373e-06, + "loss": 3.4508, + "step": 26720 + }, + { + "epoch": 0.7831245880026367, + "grad_norm": 9.623673439025879, + "learning_rate": 8.665213258049592e-06, + "loss": 3.4386, + "step": 26730 + }, + { + "epoch": 0.7834175639053688, + "grad_norm": 10.645828247070312, + "learning_rate": 8.664137374545969e-06, + "loss": 3.4282, + "step": 26740 + }, + { + "epoch": 0.7837105398081008, + "grad_norm": 10.716808319091797, + "learning_rate": 8.66306112446714e-06, + "loss": 3.4667, + "step": 26750 + }, + { + "epoch": 0.7840035157108328, + "grad_norm": 10.653939247131348, + "learning_rate": 8.661984507920778e-06, + "loss": 3.4506, + "step": 26760 + }, + { + "epoch": 0.7842964916135647, + "grad_norm": 10.51550579071045, + "learning_rate": 8.660907525014593e-06, + "loss": 3.4184, + "step": 26770 + }, + { + "epoch": 0.7845894675162968, + "grad_norm": 11.3683443069458, + "learning_rate": 8.659830175856327e-06, + "loss": 3.4532, + "step": 26780 + }, + { + "epoch": 0.7848824434190288, + "grad_norm": 10.404030799865723, + "learning_rate": 8.658752460553768e-06, + "loss": 3.4102, + "step": 26790 + }, + { + "epoch": 0.7851754193217608, + "grad_norm": 10.873587608337402, + "learning_rate": 8.65767437921473e-06, + "loss": 3.4483, + "step": 26800 + }, + { + "epoch": 0.7854683952244927, + "grad_norm": 12.176032066345215, + "learning_rate": 8.656595931947071e-06, + "loss": 3.4571, + "step": 26810 + }, + { + "epoch": 0.7857613711272248, + "grad_norm": 11.40821361541748, + "learning_rate": 8.655517118858682e-06, + "loss": 3.4346, + "step": 26820 + }, + { + "epoch": 0.7860543470299568, + "grad_norm": 11.977571487426758, + "learning_rate": 8.654437940057495e-06, + "loss": 3.4589, + "step": 26830 + }, + { + "epoch": 0.7863473229326888, + "grad_norm": 10.764180183410645, + "learning_rate": 8.65335839565147e-06, + 
"loss": 3.4387, + "step": 26840 + }, + { + "epoch": 0.7866402988354207, + "grad_norm": 12.157068252563477, + "learning_rate": 8.652278485748616e-06, + "loss": 3.4495, + "step": 26850 + }, + { + "epoch": 0.7869332747381528, + "grad_norm": 10.716468811035156, + "learning_rate": 8.651198210456964e-06, + "loss": 3.4397, + "step": 26860 + }, + { + "epoch": 0.7872262506408848, + "grad_norm": 11.624465942382812, + "learning_rate": 8.650117569884596e-06, + "loss": 3.4508, + "step": 26870 + }, + { + "epoch": 0.7875192265436168, + "grad_norm": 10.926051139831543, + "learning_rate": 8.649036564139618e-06, + "loss": 3.4438, + "step": 26880 + }, + { + "epoch": 0.7878122024463488, + "grad_norm": 10.390655517578125, + "learning_rate": 8.647955193330182e-06, + "loss": 3.4637, + "step": 26890 + }, + { + "epoch": 0.7881051783490808, + "grad_norm": 12.522567749023438, + "learning_rate": 8.64687345756447e-06, + "loss": 3.462, + "step": 26900 + }, + { + "epoch": 0.7883981542518128, + "grad_norm": 11.01937484741211, + "learning_rate": 8.645791356950705e-06, + "loss": 3.4313, + "step": 26910 + }, + { + "epoch": 0.7886911301545448, + "grad_norm": 10.78005599975586, + "learning_rate": 8.644708891597147e-06, + "loss": 3.4578, + "step": 26920 + }, + { + "epoch": 0.7889841060572768, + "grad_norm": 10.558764457702637, + "learning_rate": 8.643626061612086e-06, + "loss": 3.4422, + "step": 26930 + }, + { + "epoch": 0.7892770819600088, + "grad_norm": 10.019665718078613, + "learning_rate": 8.642542867103854e-06, + "loss": 3.4559, + "step": 26940 + }, + { + "epoch": 0.7895700578627408, + "grad_norm": 10.737624168395996, + "learning_rate": 8.64145930818082e-06, + "loss": 3.444, + "step": 26950 + }, + { + "epoch": 0.7898630337654728, + "grad_norm": 10.590888023376465, + "learning_rate": 8.640375384951385e-06, + "loss": 3.4475, + "step": 26960 + }, + { + "epoch": 0.7901560096682048, + "grad_norm": 10.456374168395996, + "learning_rate": 8.639291097523991e-06, + "loss": 3.4393, + "step": 26970 + }, + { + 
"epoch": 0.7904489855709368, + "grad_norm": 11.308018684387207, + "learning_rate": 8.638206446007114e-06, + "loss": 3.4608, + "step": 26980 + }, + { + "epoch": 0.7907419614736688, + "grad_norm": 10.529512405395508, + "learning_rate": 8.637121430509266e-06, + "loss": 3.438, + "step": 26990 + }, + { + "epoch": 0.7910349373764007, + "grad_norm": 10.99781608581543, + "learning_rate": 8.636036051138998e-06, + "loss": 3.4281, + "step": 27000 + }, + { + "epoch": 0.7913279132791328, + "grad_norm": 12.488692283630371, + "learning_rate": 8.634950308004894e-06, + "loss": 3.4567, + "step": 27010 + }, + { + "epoch": 0.7916208891818648, + "grad_norm": 11.831522941589355, + "learning_rate": 8.633864201215579e-06, + "loss": 3.4439, + "step": 27020 + }, + { + "epoch": 0.7919138650845968, + "grad_norm": 11.509474754333496, + "learning_rate": 8.632777730879707e-06, + "loss": 3.468, + "step": 27030 + }, + { + "epoch": 0.7922068409873287, + "grad_norm": 11.728483200073242, + "learning_rate": 8.631690897105978e-06, + "loss": 3.4435, + "step": 27040 + }, + { + "epoch": 0.7924998168900608, + "grad_norm": 11.334634780883789, + "learning_rate": 8.630603700003118e-06, + "loss": 3.4277, + "step": 27050 + }, + { + "epoch": 0.7927927927927928, + "grad_norm": 10.944825172424316, + "learning_rate": 8.6295161396799e-06, + "loss": 3.4357, + "step": 27060 + }, + { + "epoch": 0.7930857686955248, + "grad_norm": 11.790753364562988, + "learning_rate": 8.628428216245123e-06, + "loss": 3.4288, + "step": 27070 + }, + { + "epoch": 0.7933787445982567, + "grad_norm": 11.833585739135742, + "learning_rate": 8.627339929807632e-06, + "loss": 3.4337, + "step": 27080 + }, + { + "epoch": 0.7936717205009888, + "grad_norm": 12.438934326171875, + "learning_rate": 8.6262512804763e-06, + "loss": 3.4296, + "step": 27090 + }, + { + "epoch": 0.7939646964037208, + "grad_norm": 11.137435913085938, + "learning_rate": 8.625162268360044e-06, + "loss": 3.4275, + "step": 27100 + }, + { + "epoch": 0.7942576723064528, + "grad_norm": 
11.420584678649902, + "learning_rate": 8.624072893567807e-06, + "loss": 3.4483, + "step": 27110 + }, + { + "epoch": 0.7945506482091848, + "grad_norm": 9.912652969360352, + "learning_rate": 8.62298315620858e-06, + "loss": 3.4251, + "step": 27120 + }, + { + "epoch": 0.7948436241119168, + "grad_norm": 10.92386245727539, + "learning_rate": 8.621893056391382e-06, + "loss": 3.4415, + "step": 27130 + }, + { + "epoch": 0.795019409653556, + "eval_bleu": 0.32941831069387506, + "eval_cap_loss": 0.9708003401756287, + "eval_con_loss": 1.32943856716156, + "eval_loss": 3.6296772956848145, + "step": 27136 + }, + { + "epoch": 0.795019409653556, + "eval_bleu": 0.32941831069387506, + "eval_cap_loss": 0.9708003401756287, + "eval_con_loss": 1.32943856716156, + "eval_loss": 3.6296772956848145, + "eval_runtime": 56.0905, + "eval_samples_per_second": 356.567, + "eval_steps_per_second": 0.357, + "step": 27136 + }, + { + "epoch": 0.7951366000146488, + "grad_norm": 10.998274803161621, + "learning_rate": 8.62080259422527e-06, + "loss": 3.4513, + "step": 27140 + }, + { + "epoch": 0.7954295759173808, + "grad_norm": 10.845314025878906, + "learning_rate": 8.619711769819341e-06, + "loss": 3.4332, + "step": 27150 + }, + { + "epoch": 0.7957225518201128, + "grad_norm": 11.327783584594727, + "learning_rate": 8.618620583282726e-06, + "loss": 3.4295, + "step": 27160 + }, + { + "epoch": 0.7960155277228448, + "grad_norm": 11.487968444824219, + "learning_rate": 8.617529034724588e-06, + "loss": 3.4208, + "step": 27170 + }, + { + "epoch": 0.7963085036255768, + "grad_norm": 10.954069137573242, + "learning_rate": 8.616437124254133e-06, + "loss": 3.4296, + "step": 27180 + }, + { + "epoch": 0.7966014795283088, + "grad_norm": 12.158836364746094, + "learning_rate": 8.615344851980601e-06, + "loss": 3.452, + "step": 27190 + }, + { + "epoch": 0.7968944554310408, + "grad_norm": 11.096596717834473, + "learning_rate": 8.614252218013264e-06, + "loss": 3.4418, + "step": 27200 + }, + { + "epoch": 0.7971874313337728, + 
"grad_norm": 11.277837753295898, + "learning_rate": 8.613159222461434e-06, + "loss": 3.4371, + "step": 27210 + }, + { + "epoch": 0.7974804072365048, + "grad_norm": 11.230831146240234, + "learning_rate": 8.612065865434461e-06, + "loss": 3.4591, + "step": 27220 + }, + { + "epoch": 0.7977733831392368, + "grad_norm": 11.153265953063965, + "learning_rate": 8.61097214704173e-06, + "loss": 3.4385, + "step": 27230 + }, + { + "epoch": 0.7980663590419688, + "grad_norm": 10.436962127685547, + "learning_rate": 8.609878067392655e-06, + "loss": 3.4332, + "step": 27240 + }, + { + "epoch": 0.7983593349447008, + "grad_norm": 11.217723846435547, + "learning_rate": 8.608783626596698e-06, + "loss": 3.4155, + "step": 27250 + }, + { + "epoch": 0.7986523108474328, + "grad_norm": 11.409324645996094, + "learning_rate": 8.60768882476335e-06, + "loss": 3.4043, + "step": 27260 + }, + { + "epoch": 0.7989452867501649, + "grad_norm": 10.64237117767334, + "learning_rate": 8.606593662002138e-06, + "loss": 3.425, + "step": 27270 + }, + { + "epoch": 0.7992382626528968, + "grad_norm": 10.663687705993652, + "learning_rate": 8.605498138422626e-06, + "loss": 3.449, + "step": 27280 + }, + { + "epoch": 0.7995312385556288, + "grad_norm": 12.237227439880371, + "learning_rate": 8.604402254134418e-06, + "loss": 3.4356, + "step": 27290 + }, + { + "epoch": 0.7998242144583608, + "grad_norm": 10.481889724731445, + "learning_rate": 8.603306009247148e-06, + "loss": 3.4288, + "step": 27300 + }, + { + "epoch": 0.8001171903610927, + "grad_norm": 11.623735427856445, + "learning_rate": 8.60220940387049e-06, + "loss": 3.4287, + "step": 27310 + }, + { + "epoch": 0.8004101662638248, + "grad_norm": 10.31981372833252, + "learning_rate": 8.60111243811415e-06, + "loss": 3.4426, + "step": 27320 + }, + { + "epoch": 0.8007031421665568, + "grad_norm": 11.484318733215332, + "learning_rate": 8.600015112087878e-06, + "loss": 3.4273, + "step": 27330 + }, + { + "epoch": 0.8009961180692888, + "grad_norm": 11.661853790283203, + 
"learning_rate": 8.59891742590145e-06, + "loss": 3.4384, + "step": 27340 + }, + { + "epoch": 0.8012890939720207, + "grad_norm": 11.352492332458496, + "learning_rate": 8.597819379664686e-06, + "loss": 3.4245, + "step": 27350 + }, + { + "epoch": 0.8015820698747528, + "grad_norm": 11.187575340270996, + "learning_rate": 8.59672097348744e-06, + "loss": 3.4332, + "step": 27360 + }, + { + "epoch": 0.8018750457774848, + "grad_norm": 11.44735336303711, + "learning_rate": 8.595622207479597e-06, + "loss": 3.4479, + "step": 27370 + }, + { + "epoch": 0.8021680216802168, + "grad_norm": 10.897500038146973, + "learning_rate": 8.594523081751085e-06, + "loss": 3.4596, + "step": 27380 + }, + { + "epoch": 0.8024609975829488, + "grad_norm": 11.722445487976074, + "learning_rate": 8.593423596411866e-06, + "loss": 3.4105, + "step": 27390 + }, + { + "epoch": 0.8027539734856808, + "grad_norm": 11.81982707977295, + "learning_rate": 8.592323751571931e-06, + "loss": 3.4443, + "step": 27400 + }, + { + "epoch": 0.8030469493884128, + "grad_norm": 11.484415054321289, + "learning_rate": 8.591223547341321e-06, + "loss": 3.452, + "step": 27410 + }, + { + "epoch": 0.8033399252911448, + "grad_norm": 10.912803649902344, + "learning_rate": 8.5901229838301e-06, + "loss": 3.4382, + "step": 27420 + }, + { + "epoch": 0.8036329011938768, + "grad_norm": 11.070491790771484, + "learning_rate": 8.589022061148373e-06, + "loss": 3.4321, + "step": 27430 + }, + { + "epoch": 0.8039258770966088, + "grad_norm": 11.018624305725098, + "learning_rate": 8.587920779406283e-06, + "loss": 3.4249, + "step": 27440 + }, + { + "epoch": 0.8042188529993408, + "grad_norm": 11.007036209106445, + "learning_rate": 8.586819138714004e-06, + "loss": 3.4493, + "step": 27450 + }, + { + "epoch": 0.8045118289020728, + "grad_norm": 12.281662940979004, + "learning_rate": 8.58571713918175e-06, + "loss": 3.4283, + "step": 27460 + }, + { + "epoch": 0.8048048048048048, + "grad_norm": 10.73933219909668, + "learning_rate": 8.584614780919772e-06, + 
"loss": 3.4199, + "step": 27470 + }, + { + "epoch": 0.8050977807075368, + "grad_norm": 12.679369926452637, + "learning_rate": 8.58351206403835e-06, + "loss": 3.4298, + "step": 27480 + }, + { + "epoch": 0.8053907566102688, + "grad_norm": 11.82202434539795, + "learning_rate": 8.582408988647806e-06, + "loss": 3.4274, + "step": 27490 + }, + { + "epoch": 0.8056837325130008, + "grad_norm": 10.874645233154297, + "learning_rate": 8.581305554858497e-06, + "loss": 3.4359, + "step": 27500 + }, + { + "epoch": 0.8059767084157328, + "grad_norm": 11.034425735473633, + "learning_rate": 8.580201762780815e-06, + "loss": 3.4539, + "step": 27510 + }, + { + "epoch": 0.8062696843184648, + "grad_norm": 10.626480102539062, + "learning_rate": 8.579097612525187e-06, + "loss": 3.4294, + "step": 27520 + }, + { + "epoch": 0.8065626602211968, + "grad_norm": 10.908427238464355, + "learning_rate": 8.577993104202075e-06, + "loss": 3.415, + "step": 27530 + }, + { + "epoch": 0.8068556361239289, + "grad_norm": 12.155229568481445, + "learning_rate": 8.576888237921983e-06, + "loss": 3.4333, + "step": 27540 + }, + { + "epoch": 0.8071486120266608, + "grad_norm": 11.533793449401855, + "learning_rate": 8.575783013795443e-06, + "loss": 3.3985, + "step": 27550 + }, + { + "epoch": 0.8074415879293928, + "grad_norm": 10.276236534118652, + "learning_rate": 8.574677431933026e-06, + "loss": 3.4413, + "step": 27560 + }, + { + "epoch": 0.8077345638321248, + "grad_norm": 11.020095825195312, + "learning_rate": 8.573571492445341e-06, + "loss": 3.4267, + "step": 27570 + }, + { + "epoch": 0.8080275397348569, + "grad_norm": 11.835783004760742, + "learning_rate": 8.57246519544303e-06, + "loss": 3.4088, + "step": 27580 + }, + { + "epoch": 0.8083205156375888, + "grad_norm": 10.780755043029785, + "learning_rate": 8.571358541036768e-06, + "loss": 3.434, + "step": 27590 + }, + { + "epoch": 0.8086134915403208, + "grad_norm": 11.46541976928711, + "learning_rate": 8.570251529337275e-06, + "loss": 3.4322, + "step": 27600 + }, + { + 
"epoch": 0.8089064674430528, + "grad_norm": 11.093564987182617, + "learning_rate": 8.569144160455295e-06, + "loss": 3.4175, + "step": 27610 + }, + { + "epoch": 0.8091994433457849, + "grad_norm": 10.39914321899414, + "learning_rate": 8.56803643450162e-06, + "loss": 3.4082, + "step": 27620 + }, + { + "epoch": 0.8094924192485168, + "grad_norm": 10.990504264831543, + "learning_rate": 8.566928351587067e-06, + "loss": 3.4363, + "step": 27630 + }, + { + "epoch": 0.8097853951512488, + "grad_norm": 10.286308288574219, + "learning_rate": 8.565819911822495e-06, + "loss": 3.4395, + "step": 27640 + }, + { + "epoch": 0.8100197758734344, + "eval_bleu": 0.3306820764403684, + "eval_cap_loss": 0.9698754549026489, + "eval_con_loss": 1.3272812366485596, + "eval_loss": 3.6244380474090576, + "step": 27648 + }, + { + "epoch": 0.8100197758734344, + "eval_bleu": 0.3306820764403684, + "eval_cap_loss": 0.9698754549026489, + "eval_con_loss": 1.3272812366485596, + "eval_loss": 3.6244380474090576, + "eval_runtime": 57.6925, + "eval_samples_per_second": 346.666, + "eval_steps_per_second": 0.347, + "step": 27648 + }, + { + "epoch": 0.8100783710539808, + "grad_norm": 10.147148132324219, + "learning_rate": 8.564711115318794e-06, + "loss": 3.4035, + "step": 27650 + }, + { + "epoch": 0.8103713469567128, + "grad_norm": 10.633777618408203, + "learning_rate": 8.563601962186897e-06, + "loss": 3.4185, + "step": 27660 + }, + { + "epoch": 0.8106643228594448, + "grad_norm": 12.023749351501465, + "learning_rate": 8.562492452537763e-06, + "loss": 3.4421, + "step": 27670 + }, + { + "epoch": 0.8109572987621768, + "grad_norm": 11.260320663452148, + "learning_rate": 8.561382586482395e-06, + "loss": 3.4019, + "step": 27680 + }, + { + "epoch": 0.8112502746649088, + "grad_norm": 10.968564987182617, + "learning_rate": 8.560272364131832e-06, + "loss": 3.4132, + "step": 27690 + }, + { + "epoch": 0.8115432505676408, + "grad_norm": 11.029776573181152, + "learning_rate": 8.559161785597139e-06, + "loss": 3.4103, + "step": 
27700 + }, + { + "epoch": 0.8118362264703728, + "grad_norm": 10.013534545898438, + "learning_rate": 8.558050850989423e-06, + "loss": 3.4303, + "step": 27710 + }, + { + "epoch": 0.8121292023731048, + "grad_norm": 10.708664894104004, + "learning_rate": 8.556939560419833e-06, + "loss": 3.4232, + "step": 27720 + }, + { + "epoch": 0.8124221782758368, + "grad_norm": 11.95893383026123, + "learning_rate": 8.555827913999539e-06, + "loss": 3.4138, + "step": 27730 + }, + { + "epoch": 0.8127151541785688, + "grad_norm": 11.294939994812012, + "learning_rate": 8.554715911839759e-06, + "loss": 3.449, + "step": 27740 + }, + { + "epoch": 0.8130081300813008, + "grad_norm": 10.605596542358398, + "learning_rate": 8.55360355405174e-06, + "loss": 3.419, + "step": 27750 + }, + { + "epoch": 0.8133011059840328, + "grad_norm": 11.485698699951172, + "learning_rate": 8.552490840746773e-06, + "loss": 3.4386, + "step": 27760 + }, + { + "epoch": 0.8135940818867649, + "grad_norm": 10.629921913146973, + "learning_rate": 8.551377772036169e-06, + "loss": 3.4322, + "step": 27770 + }, + { + "epoch": 0.8138870577894968, + "grad_norm": 11.084019660949707, + "learning_rate": 8.550264348031291e-06, + "loss": 3.4212, + "step": 27780 + }, + { + "epoch": 0.8141800336922288, + "grad_norm": 11.169535636901855, + "learning_rate": 8.549150568843527e-06, + "loss": 3.4293, + "step": 27790 + }, + { + "epoch": 0.8144730095949608, + "grad_norm": 10.311744689941406, + "learning_rate": 8.548036434584304e-06, + "loss": 3.4219, + "step": 27800 + }, + { + "epoch": 0.8147659854976929, + "grad_norm": 10.653726577758789, + "learning_rate": 8.546921945365087e-06, + "loss": 3.4412, + "step": 27810 + }, + { + "epoch": 0.8150589614004248, + "grad_norm": 11.418914794921875, + "learning_rate": 8.545807101297373e-06, + "loss": 3.3969, + "step": 27820 + }, + { + "epoch": 0.8153519373031568, + "grad_norm": 10.957916259765625, + "learning_rate": 8.544691902492694e-06, + "loss": 3.4052, + "step": 27830 + }, + { + "epoch": 
0.8156449132058888, + "grad_norm": 10.943279266357422, + "learning_rate": 8.54357634906262e-06, + "loss": 3.4133, + "step": 27840 + }, + { + "epoch": 0.8159378891086209, + "grad_norm": 12.057724952697754, + "learning_rate": 8.542460441118756e-06, + "loss": 3.4159, + "step": 27850 + }, + { + "epoch": 0.8162308650113528, + "grad_norm": 11.460907936096191, + "learning_rate": 8.541344178772742e-06, + "loss": 3.4112, + "step": 27860 + }, + { + "epoch": 0.8165238409140848, + "grad_norm": 10.443692207336426, + "learning_rate": 8.540227562136252e-06, + "loss": 3.4069, + "step": 27870 + }, + { + "epoch": 0.8168168168168168, + "grad_norm": 11.829421043395996, + "learning_rate": 8.539110591320998e-06, + "loss": 3.4368, + "step": 27880 + }, + { + "epoch": 0.8171097927195489, + "grad_norm": 10.94896125793457, + "learning_rate": 8.537993266438726e-06, + "loss": 3.4162, + "step": 27890 + }, + { + "epoch": 0.8174027686222808, + "grad_norm": 11.435680389404297, + "learning_rate": 8.536875587601219e-06, + "loss": 3.4201, + "step": 27900 + }, + { + "epoch": 0.8176957445250128, + "grad_norm": 10.854945182800293, + "learning_rate": 8.535757554920292e-06, + "loss": 3.4125, + "step": 27910 + }, + { + "epoch": 0.8179887204277448, + "grad_norm": 12.309894561767578, + "learning_rate": 8.534639168507799e-06, + "loss": 3.4177, + "step": 27920 + }, + { + "epoch": 0.8182816963304769, + "grad_norm": 11.913461685180664, + "learning_rate": 8.533520428475626e-06, + "loss": 3.4086, + "step": 27930 + }, + { + "epoch": 0.8185746722332088, + "grad_norm": 10.189269065856934, + "learning_rate": 8.5324013349357e-06, + "loss": 3.4165, + "step": 27940 + }, + { + "epoch": 0.8188676481359408, + "grad_norm": 9.904362678527832, + "learning_rate": 8.531281887999977e-06, + "loss": 3.3687, + "step": 27950 + }, + { + "epoch": 0.8191606240386728, + "grad_norm": 10.801505088806152, + "learning_rate": 8.530274083696981e-06, + "loss": 3.4411, + "step": 27960 + }, + { + "epoch": 0.8194535999414048, + "grad_norm": 
13.132245063781738, + "learning_rate": 8.529153965617818e-06, + "loss": 3.4171, + "step": 27970 + }, + { + "epoch": 0.8197465758441368, + "grad_norm": 11.889565467834473, + "learning_rate": 8.528033494467738e-06, + "loss": 3.4241, + "step": 27980 + }, + { + "epoch": 0.8200395517468688, + "grad_norm": 10.951362609863281, + "learning_rate": 8.526912670358836e-06, + "loss": 3.4101, + "step": 27990 + }, + { + "epoch": 0.8203325276496009, + "grad_norm": 11.180742263793945, + "learning_rate": 8.525791493403249e-06, + "loss": 3.4141, + "step": 28000 + }, + { + "epoch": 0.8206255035523328, + "grad_norm": 10.415032386779785, + "learning_rate": 8.524669963713139e-06, + "loss": 3.4189, + "step": 28010 + }, + { + "epoch": 0.8209184794550648, + "grad_norm": 10.63390827178955, + "learning_rate": 8.52354808140071e-06, + "loss": 3.4158, + "step": 28020 + }, + { + "epoch": 0.8212114553577968, + "grad_norm": 11.038694381713867, + "learning_rate": 8.522425846578199e-06, + "loss": 3.4066, + "step": 28030 + }, + { + "epoch": 0.8215044312605289, + "grad_norm": 9.925301551818848, + "learning_rate": 8.521303259357883e-06, + "loss": 3.4194, + "step": 28040 + }, + { + "epoch": 0.8217974071632608, + "grad_norm": 10.0213623046875, + "learning_rate": 8.520180319852066e-06, + "loss": 3.4005, + "step": 28050 + }, + { + "epoch": 0.8220903830659928, + "grad_norm": 11.096454620361328, + "learning_rate": 8.51905702817309e-06, + "loss": 3.4068, + "step": 28060 + }, + { + "epoch": 0.8223833589687248, + "grad_norm": 11.440340042114258, + "learning_rate": 8.517933384433337e-06, + "loss": 3.4004, + "step": 28070 + }, + { + "epoch": 0.8226763348714569, + "grad_norm": 9.80125617980957, + "learning_rate": 8.516809388745221e-06, + "loss": 3.3856, + "step": 28080 + }, + { + "epoch": 0.8229693107741888, + "grad_norm": 11.615192413330078, + "learning_rate": 8.515685041221192e-06, + "loss": 3.4129, + "step": 28090 + }, + { + "epoch": 0.8232622866769208, + "grad_norm": 10.482958793640137, + "learning_rate": 
8.51456034197373e-06, + "loss": 3.4271, + "step": 28100 + }, + { + "epoch": 0.8235552625796528, + "grad_norm": 10.660367012023926, + "learning_rate": 8.513435291115357e-06, + "loss": 3.3946, + "step": 28110 + }, + { + "epoch": 0.8238482384823849, + "grad_norm": 11.922951698303223, + "learning_rate": 8.51230988875863e-06, + "loss": 3.3836, + "step": 28120 + }, + { + "epoch": 0.8241412143851168, + "grad_norm": 11.054594993591309, + "learning_rate": 8.511184135016134e-06, + "loss": 3.3952, + "step": 28130 + }, + { + "epoch": 0.8244341902878488, + "grad_norm": 11.795433044433594, + "learning_rate": 8.510058030000498e-06, + "loss": 3.411, + "step": 28140 + }, + { + "epoch": 0.8247271661905808, + "grad_norm": 10.349347114562988, + "learning_rate": 8.50893157382438e-06, + "loss": 3.3923, + "step": 28150 + }, + { + "epoch": 0.8250201420933129, + "grad_norm": 11.792015075683594, + "learning_rate": 8.507804766600475e-06, + "loss": 3.439, + "step": 28160 + }, + { + "epoch": 0.8250201420933129, + "eval_bleu": 0.3310835905038692, + "eval_cap_loss": 0.9661195874214172, + "eval_con_loss": 1.317266583442688, + "eval_loss": 3.6006526947021484, + "step": 28160 + }, + { + "epoch": 0.8250201420933129, + "eval_bleu": 0.3310835905038692, + "eval_cap_loss": 0.9661195874214172, + "eval_con_loss": 1.317266583442688, + "eval_loss": 3.6006526947021484, + "eval_runtime": 54.9285, + "eval_samples_per_second": 364.11, + "eval_steps_per_second": 0.364, + "step": 28160 + }, + { + "epoch": 0.8253131179960448, + "grad_norm": 12.161031723022461, + "learning_rate": 8.506677608441513e-06, + "loss": 3.4409, + "step": 28170 + }, + { + "epoch": 0.8256060938987768, + "grad_norm": 10.559107780456543, + "learning_rate": 8.505550099460264e-06, + "loss": 3.4064, + "step": 28180 + }, + { + "epoch": 0.8258990698015088, + "grad_norm": 10.795823097229004, + "learning_rate": 8.504422239769523e-06, + "loss": 3.3553, + "step": 28190 + }, + { + "epoch": 0.8261920457042409, + "grad_norm": 11.263769149780273, + 
"learning_rate": 8.50329402948213e-06, + "loss": 3.4001, + "step": 28200 + }, + { + "epoch": 0.8264850216069728, + "grad_norm": 11.530445098876953, + "learning_rate": 8.502165468710952e-06, + "loss": 3.4014, + "step": 28210 + }, + { + "epoch": 0.8267779975097048, + "grad_norm": 10.156315803527832, + "learning_rate": 8.501036557568896e-06, + "loss": 3.3768, + "step": 28220 + }, + { + "epoch": 0.8270709734124368, + "grad_norm": 11.144845008850098, + "learning_rate": 8.499907296168906e-06, + "loss": 3.4328, + "step": 28230 + }, + { + "epoch": 0.8273639493151689, + "grad_norm": 12.24344539642334, + "learning_rate": 8.498777684623952e-06, + "loss": 3.4304, + "step": 28240 + }, + { + "epoch": 0.8276569252179008, + "grad_norm": 11.366966247558594, + "learning_rate": 8.49764772304705e-06, + "loss": 3.4246, + "step": 28250 + }, + { + "epoch": 0.8279499011206328, + "grad_norm": 10.940164566040039, + "learning_rate": 8.496517411551244e-06, + "loss": 3.4151, + "step": 28260 + }, + { + "epoch": 0.8282428770233649, + "grad_norm": 11.459993362426758, + "learning_rate": 8.495386750249614e-06, + "loss": 3.4173, + "step": 28270 + }, + { + "epoch": 0.8285358529260969, + "grad_norm": 11.954769134521484, + "learning_rate": 8.494255739255279e-06, + "loss": 3.4205, + "step": 28280 + }, + { + "epoch": 0.8288288288288288, + "grad_norm": 11.715622901916504, + "learning_rate": 8.493124378681387e-06, + "loss": 3.3992, + "step": 28290 + }, + { + "epoch": 0.8291218047315608, + "grad_norm": 12.472573280334473, + "learning_rate": 8.491992668641125e-06, + "loss": 3.374, + "step": 28300 + }, + { + "epoch": 0.8294147806342929, + "grad_norm": 11.412153244018555, + "learning_rate": 8.490860609247713e-06, + "loss": 3.4, + "step": 28310 + }, + { + "epoch": 0.8297077565370248, + "grad_norm": 10.361682891845703, + "learning_rate": 8.489728200614408e-06, + "loss": 3.4034, + "step": 28320 + }, + { + "epoch": 0.8300007324397568, + "grad_norm": 10.442243576049805, + "learning_rate": 8.488595442854502e-06, + 
"loss": 3.4014, + "step": 28330 + }, + { + "epoch": 0.8302937083424888, + "grad_norm": 10.4912691116333, + "learning_rate": 8.487462336081317e-06, + "loss": 3.4202, + "step": 28340 + }, + { + "epoch": 0.8305866842452209, + "grad_norm": 11.179561614990234, + "learning_rate": 8.486328880408216e-06, + "loss": 3.426, + "step": 28350 + }, + { + "epoch": 0.8308796601479528, + "grad_norm": 10.629716873168945, + "learning_rate": 8.485195075948594e-06, + "loss": 3.4078, + "step": 28360 + }, + { + "epoch": 0.8311726360506848, + "grad_norm": 11.621929168701172, + "learning_rate": 8.484060922815882e-06, + "loss": 3.4004, + "step": 28370 + }, + { + "epoch": 0.8314656119534168, + "grad_norm": 11.643034934997559, + "learning_rate": 8.482926421123545e-06, + "loss": 3.3922, + "step": 28380 + }, + { + "epoch": 0.8317585878561489, + "grad_norm": 11.021538734436035, + "learning_rate": 8.481791570985084e-06, + "loss": 3.4073, + "step": 28390 + }, + { + "epoch": 0.8320515637588808, + "grad_norm": 10.193912506103516, + "learning_rate": 8.480656372514031e-06, + "loss": 3.4218, + "step": 28400 + }, + { + "epoch": 0.8323445396616128, + "grad_norm": 11.869057655334473, + "learning_rate": 8.479520825823961e-06, + "loss": 3.4033, + "step": 28410 + }, + { + "epoch": 0.8326375155643448, + "grad_norm": 11.047660827636719, + "learning_rate": 8.478384931028473e-06, + "loss": 3.3831, + "step": 28420 + }, + { + "epoch": 0.8329304914670769, + "grad_norm": 10.710238456726074, + "learning_rate": 8.47724868824121e-06, + "loss": 3.4016, + "step": 28430 + }, + { + "epoch": 0.8332234673698088, + "grad_norm": 11.564374923706055, + "learning_rate": 8.476112097575845e-06, + "loss": 3.4063, + "step": 28440 + }, + { + "epoch": 0.8335164432725408, + "grad_norm": 11.009642601013184, + "learning_rate": 8.474975159146089e-06, + "loss": 3.3983, + "step": 28450 + }, + { + "epoch": 0.8338094191752728, + "grad_norm": 11.137458801269531, + "learning_rate": 8.473837873065685e-06, + "loss": 3.3959, + "step": 28460 + }, + { 
+ "epoch": 0.8341023950780049, + "grad_norm": 10.783406257629395, + "learning_rate": 8.47270023944841e-06, + "loss": 3.386, + "step": 28470 + }, + { + "epoch": 0.8343953709807368, + "grad_norm": 10.205510139465332, + "learning_rate": 8.47156225840808e-06, + "loss": 3.4206, + "step": 28480 + }, + { + "epoch": 0.8346883468834688, + "grad_norm": 11.447782516479492, + "learning_rate": 8.47042393005854e-06, + "loss": 3.4035, + "step": 28490 + }, + { + "epoch": 0.8349813227862009, + "grad_norm": 11.303844451904297, + "learning_rate": 8.469285254513678e-06, + "loss": 3.3782, + "step": 28500 + }, + { + "epoch": 0.8352742986889329, + "grad_norm": 10.645294189453125, + "learning_rate": 8.468146231887406e-06, + "loss": 3.4089, + "step": 28510 + }, + { + "epoch": 0.8355672745916648, + "grad_norm": 11.685564041137695, + "learning_rate": 8.46700686229368e-06, + "loss": 3.3969, + "step": 28520 + }, + { + "epoch": 0.8358602504943968, + "grad_norm": 11.159017562866211, + "learning_rate": 8.465867145846488e-06, + "loss": 3.4228, + "step": 28530 + }, + { + "epoch": 0.8361532263971289, + "grad_norm": 11.266557693481445, + "learning_rate": 8.464727082659849e-06, + "loss": 3.4116, + "step": 28540 + }, + { + "epoch": 0.8364462022998609, + "grad_norm": 10.361024856567383, + "learning_rate": 8.463586672847821e-06, + "loss": 3.3694, + "step": 28550 + }, + { + "epoch": 0.8367391782025928, + "grad_norm": 10.323500633239746, + "learning_rate": 8.462445916524496e-06, + "loss": 3.3905, + "step": 28560 + }, + { + "epoch": 0.8370321541053248, + "grad_norm": 11.78026294708252, + "learning_rate": 8.461304813804e-06, + "loss": 3.4025, + "step": 28570 + }, + { + "epoch": 0.8373251300080569, + "grad_norm": 11.250649452209473, + "learning_rate": 8.460163364800488e-06, + "loss": 3.4045, + "step": 28580 + }, + { + "epoch": 0.8376181059107889, + "grad_norm": 10.715672492980957, + "learning_rate": 8.459021569628165e-06, + "loss": 3.4103, + "step": 28590 + }, + { + "epoch": 0.8379110818135208, + "grad_norm": 
10.997001647949219, + "learning_rate": 8.457879428401252e-06, + "loss": 3.3996, + "step": 28600 + }, + { + "epoch": 0.8382040577162528, + "grad_norm": 10.520708084106445, + "learning_rate": 8.45673694123402e-06, + "loss": 3.4039, + "step": 28610 + }, + { + "epoch": 0.8384970336189849, + "grad_norm": 13.065520286560059, + "learning_rate": 8.455594108240763e-06, + "loss": 3.3963, + "step": 28620 + }, + { + "epoch": 0.8387900095217168, + "grad_norm": 10.99803638458252, + "learning_rate": 8.454450929535818e-06, + "loss": 3.404, + "step": 28630 + }, + { + "epoch": 0.8390829854244488, + "grad_norm": 10.821595191955566, + "learning_rate": 8.453307405233552e-06, + "loss": 3.4236, + "step": 28640 + }, + { + "epoch": 0.8393759613271808, + "grad_norm": 11.579680442810059, + "learning_rate": 8.452163535448368e-06, + "loss": 3.3895, + "step": 28650 + }, + { + "epoch": 0.8396689372299129, + "grad_norm": 10.697815895080566, + "learning_rate": 8.451019320294702e-06, + "loss": 3.3872, + "step": 28660 + }, + { + "epoch": 0.8399619131326448, + "grad_norm": 11.16554069519043, + "learning_rate": 8.449874759887029e-06, + "loss": 3.4067, + "step": 28670 + }, + { + "epoch": 0.8400205083131912, + "eval_bleu": 0.3323786513267122, + "eval_cap_loss": 0.9664551019668579, + "eval_con_loss": 1.3081214427947998, + "eval_loss": 3.582697868347168, + "step": 28672 + }, + { + "epoch": 0.8400205083131912, + "eval_bleu": 0.3323786513267122, + "eval_cap_loss": 0.9664551019668579, + "eval_con_loss": 1.3081214427947998, + "eval_loss": 3.582697868347168, + "eval_runtime": 66.754, + "eval_samples_per_second": 299.607, + "eval_steps_per_second": 0.3, + "step": 28672 + }, + { + "epoch": 0.8402548890353768, + "grad_norm": 10.63290786743164, + "learning_rate": 8.448729854339852e-06, + "loss": 3.4046, + "step": 28680 + }, + { + "epoch": 0.8405478649381088, + "grad_norm": 11.328673362731934, + "learning_rate": 8.447584603767714e-06, + "loss": 3.4044, + "step": 28690 + }, + { + "epoch": 0.8408408408408409, + 
"grad_norm": 11.391138076782227, + "learning_rate": 8.446439008285191e-06, + "loss": 3.3879, + "step": 28700 + }, + { + "epoch": 0.8411338167435728, + "grad_norm": 10.56144905090332, + "learning_rate": 8.44529306800689e-06, + "loss": 3.3759, + "step": 28710 + }, + { + "epoch": 0.8414267926463048, + "grad_norm": 12.193046569824219, + "learning_rate": 8.44414678304746e-06, + "loss": 3.4039, + "step": 28720 + }, + { + "epoch": 0.8417197685490369, + "grad_norm": 10.579451560974121, + "learning_rate": 8.443000153521578e-06, + "loss": 3.4215, + "step": 28730 + }, + { + "epoch": 0.8420127444517689, + "grad_norm": 10.62297534942627, + "learning_rate": 8.441853179543957e-06, + "loss": 3.4187, + "step": 28740 + }, + { + "epoch": 0.8423057203545008, + "grad_norm": 11.862225532531738, + "learning_rate": 8.440705861229344e-06, + "loss": 3.4256, + "step": 28750 + }, + { + "epoch": 0.8425986962572328, + "grad_norm": 11.011163711547852, + "learning_rate": 8.439558198692523e-06, + "loss": 3.3929, + "step": 28760 + }, + { + "epoch": 0.8428916721599649, + "grad_norm": 10.805420875549316, + "learning_rate": 8.438410192048311e-06, + "loss": 3.3967, + "step": 28770 + }, + { + "epoch": 0.8431846480626969, + "grad_norm": 11.329970359802246, + "learning_rate": 8.437261841411559e-06, + "loss": 3.3817, + "step": 28780 + }, + { + "epoch": 0.8434776239654288, + "grad_norm": 11.522541999816895, + "learning_rate": 8.436113146897152e-06, + "loss": 3.3579, + "step": 28790 + }, + { + "epoch": 0.8437705998681608, + "grad_norm": 11.737577438354492, + "learning_rate": 8.43496410862001e-06, + "loss": 3.3953, + "step": 28800 + }, + { + "epoch": 0.8440635757708929, + "grad_norm": 11.651056289672852, + "learning_rate": 8.433814726695089e-06, + "loss": 3.3993, + "step": 28810 + }, + { + "epoch": 0.8443565516736249, + "grad_norm": 11.610539436340332, + "learning_rate": 8.432665001237375e-06, + "loss": 3.3967, + "step": 28820 + }, + { + "epoch": 0.8446495275763568, + "grad_norm": 10.78317642211914, + 
"learning_rate": 8.431514932361895e-06, + "loss": 3.3864, + "step": 28830 + }, + { + "epoch": 0.8449425034790888, + "grad_norm": 10.283501625061035, + "learning_rate": 8.430364520183702e-06, + "loss": 3.4056, + "step": 28840 + }, + { + "epoch": 0.8452354793818209, + "grad_norm": 10.43968391418457, + "learning_rate": 8.429213764817893e-06, + "loss": 3.3702, + "step": 28850 + }, + { + "epoch": 0.8455284552845529, + "grad_norm": 10.653421401977539, + "learning_rate": 8.42806266637959e-06, + "loss": 3.3993, + "step": 28860 + }, + { + "epoch": 0.8458214311872848, + "grad_norm": 10.866873741149902, + "learning_rate": 8.426911224983956e-06, + "loss": 3.4157, + "step": 28870 + }, + { + "epoch": 0.8461144070900168, + "grad_norm": 11.091384887695312, + "learning_rate": 8.425759440746184e-06, + "loss": 3.412, + "step": 28880 + }, + { + "epoch": 0.8464073829927489, + "grad_norm": 11.045618057250977, + "learning_rate": 8.424607313781505e-06, + "loss": 3.393, + "step": 28890 + }, + { + "epoch": 0.8467003588954809, + "grad_norm": 12.048754692077637, + "learning_rate": 8.423454844205183e-06, + "loss": 3.3955, + "step": 28900 + }, + { + "epoch": 0.8469933347982128, + "grad_norm": 10.34331226348877, + "learning_rate": 8.422302032132513e-06, + "loss": 3.3804, + "step": 28910 + }, + { + "epoch": 0.8472863107009448, + "grad_norm": 10.156455993652344, + "learning_rate": 8.421148877678829e-06, + "loss": 3.3755, + "step": 28920 + }, + { + "epoch": 0.8475792866036769, + "grad_norm": 11.355900764465332, + "learning_rate": 8.419995380959496e-06, + "loss": 3.3761, + "step": 28930 + }, + { + "epoch": 0.8478722625064089, + "grad_norm": 11.64748764038086, + "learning_rate": 8.418841542089916e-06, + "loss": 3.3814, + "step": 28940 + }, + { + "epoch": 0.8481652384091408, + "grad_norm": 10.446391105651855, + "learning_rate": 8.417687361185523e-06, + "loss": 3.3916, + "step": 28950 + }, + { + "epoch": 0.8484582143118728, + "grad_norm": 10.507338523864746, + "learning_rate": 8.416532838361787e-06, + 
"loss": 3.4002, + "step": 28960 + }, + { + "epoch": 0.8487511902146049, + "grad_norm": 10.079800605773926, + "learning_rate": 8.415377973734207e-06, + "loss": 3.3931, + "step": 28970 + }, + { + "epoch": 0.8490441661173368, + "grad_norm": 11.283857345581055, + "learning_rate": 8.414222767418325e-06, + "loss": 3.364, + "step": 28980 + }, + { + "epoch": 0.8493371420200688, + "grad_norm": 11.404581069946289, + "learning_rate": 8.413067219529712e-06, + "loss": 3.4, + "step": 28990 + }, + { + "epoch": 0.8496301179228009, + "grad_norm": 10.98245620727539, + "learning_rate": 8.411911330183971e-06, + "loss": 3.4068, + "step": 29000 + }, + { + "epoch": 0.8499230938255329, + "grad_norm": 10.755043983459473, + "learning_rate": 8.410755099496747e-06, + "loss": 3.4027, + "step": 29010 + }, + { + "epoch": 0.8502160697282648, + "grad_norm": 11.57300090789795, + "learning_rate": 8.409598527583707e-06, + "loss": 3.3895, + "step": 29020 + }, + { + "epoch": 0.8505090456309968, + "grad_norm": 10.978017807006836, + "learning_rate": 8.408441614560563e-06, + "loss": 3.3928, + "step": 29030 + }, + { + "epoch": 0.8508020215337289, + "grad_norm": 10.684417724609375, + "learning_rate": 8.40728436054306e-06, + "loss": 3.3831, + "step": 29040 + }, + { + "epoch": 0.8510949974364609, + "grad_norm": 10.927193641662598, + "learning_rate": 8.406126765646969e-06, + "loss": 3.4103, + "step": 29050 + }, + { + "epoch": 0.8513879733391928, + "grad_norm": 11.108163833618164, + "learning_rate": 8.404968829988102e-06, + "loss": 3.3887, + "step": 29060 + }, + { + "epoch": 0.8516809492419248, + "grad_norm": 10.621505737304688, + "learning_rate": 8.403810553682307e-06, + "loss": 3.388, + "step": 29070 + }, + { + "epoch": 0.8519739251446569, + "grad_norm": 10.56363296508789, + "learning_rate": 8.40265193684546e-06, + "loss": 3.3993, + "step": 29080 + }, + { + "epoch": 0.8522669010473889, + "grad_norm": 10.040878295898438, + "learning_rate": 8.401492979593474e-06, + "loss": 3.3693, + "step": 29090 + }, + { + 
"epoch": 0.8525598769501208, + "grad_norm": 12.43188190460205, + "learning_rate": 8.400333682042295e-06, + "loss": 3.3789, + "step": 29100 + }, + { + "epoch": 0.8528528528528528, + "grad_norm": 11.146537780761719, + "learning_rate": 8.399174044307904e-06, + "loss": 3.3707, + "step": 29110 + }, + { + "epoch": 0.8531458287555849, + "grad_norm": 10.575197219848633, + "learning_rate": 8.398014066506316e-06, + "loss": 3.3784, + "step": 29120 + }, + { + "epoch": 0.8534388046583169, + "grad_norm": 10.218196868896484, + "learning_rate": 8.396853748753582e-06, + "loss": 3.4019, + "step": 29130 + }, + { + "epoch": 0.8537317805610488, + "grad_norm": 10.522954940795898, + "learning_rate": 8.395693091165783e-06, + "loss": 3.385, + "step": 29140 + }, + { + "epoch": 0.8540247564637808, + "grad_norm": 11.071453094482422, + "learning_rate": 8.394532093859036e-06, + "loss": 3.39, + "step": 29150 + }, + { + "epoch": 0.8543177323665129, + "grad_norm": 11.317244529724121, + "learning_rate": 8.393370756949492e-06, + "loss": 3.3829, + "step": 29160 + }, + { + "epoch": 0.8546107082692449, + "grad_norm": 11.438992500305176, + "learning_rate": 8.392209080553335e-06, + "loss": 3.3951, + "step": 29170 + }, + { + "epoch": 0.8549036841719768, + "grad_norm": 11.306117057800293, + "learning_rate": 8.391047064786787e-06, + "loss": 3.4088, + "step": 29180 + }, + { + "epoch": 0.8550208745330696, + "eval_bleu": 0.33281076842104107, + "eval_cap_loss": 0.961840033531189, + "eval_con_loss": 1.3046846389770508, + "eval_loss": 3.571209192276001, + "step": 29184 + }, + { + "epoch": 0.8550208745330696, + "eval_bleu": 0.33281076842104107, + "eval_cap_loss": 0.961840033531189, + "eval_con_loss": 1.3046846389770508, + "eval_loss": 3.571209192276001, + "eval_runtime": 55.8931, + "eval_samples_per_second": 357.826, + "eval_steps_per_second": 0.358, + "step": 29184 + }, + { + "epoch": 0.8551966600747088, + "grad_norm": 11.735923767089844, + "learning_rate": 8.389884709766097e-06, + "loss": 3.4115, + "step": 29190 
+ }, + { + "epoch": 0.8554896359774409, + "grad_norm": 10.874889373779297, + "learning_rate": 8.388722015607553e-06, + "loss": 3.376, + "step": 29200 + }, + { + "epoch": 0.8557826118801729, + "grad_norm": 11.839069366455078, + "learning_rate": 8.387558982427477e-06, + "loss": 3.3813, + "step": 29210 + }, + { + "epoch": 0.8560755877829048, + "grad_norm": 9.50670337677002, + "learning_rate": 8.386395610342222e-06, + "loss": 3.3812, + "step": 29220 + }, + { + "epoch": 0.8563685636856369, + "grad_norm": 11.641897201538086, + "learning_rate": 8.385231899468178e-06, + "loss": 3.3685, + "step": 29230 + }, + { + "epoch": 0.8566615395883689, + "grad_norm": 10.47932243347168, + "learning_rate": 8.384067849921765e-06, + "loss": 3.3756, + "step": 29240 + }, + { + "epoch": 0.8569545154911009, + "grad_norm": 10.291852951049805, + "learning_rate": 8.382903461819441e-06, + "loss": 3.3925, + "step": 29250 + }, + { + "epoch": 0.8572474913938328, + "grad_norm": 11.128689765930176, + "learning_rate": 8.381738735277696e-06, + "loss": 3.4137, + "step": 29260 + }, + { + "epoch": 0.8575404672965649, + "grad_norm": 10.559643745422363, + "learning_rate": 8.380573670413052e-06, + "loss": 3.3855, + "step": 29270 + }, + { + "epoch": 0.8578334431992969, + "grad_norm": 10.903712272644043, + "learning_rate": 8.37940826734207e-06, + "loss": 3.3829, + "step": 29280 + }, + { + "epoch": 0.8581264191020288, + "grad_norm": 10.692764282226562, + "learning_rate": 8.378242526181341e-06, + "loss": 3.3956, + "step": 29290 + }, + { + "epoch": 0.8584193950047608, + "grad_norm": 11.0559720993042, + "learning_rate": 8.37707644704749e-06, + "loss": 3.4011, + "step": 29300 + }, + { + "epoch": 0.8587123709074929, + "grad_norm": 10.319923400878906, + "learning_rate": 8.375910030057174e-06, + "loss": 3.3952, + "step": 29310 + }, + { + "epoch": 0.8590053468102249, + "grad_norm": 11.0838041305542, + "learning_rate": 8.374743275327088e-06, + "loss": 3.3708, + "step": 29320 + }, + { + "epoch": 0.8592983227129568, + 
"grad_norm": 11.440781593322754, + "learning_rate": 8.373576182973962e-06, + "loss": 3.3518, + "step": 29330 + }, + { + "epoch": 0.8595912986156888, + "grad_norm": 10.120816230773926, + "learning_rate": 8.37240875311455e-06, + "loss": 3.4109, + "step": 29340 + }, + { + "epoch": 0.8598842745184209, + "grad_norm": 11.246816635131836, + "learning_rate": 8.371240985865652e-06, + "loss": 3.3771, + "step": 29350 + }, + { + "epoch": 0.8601772504211529, + "grad_norm": 11.254953384399414, + "learning_rate": 8.370072881344093e-06, + "loss": 3.3481, + "step": 29360 + }, + { + "epoch": 0.8604702263238848, + "grad_norm": 10.469719886779785, + "learning_rate": 8.368904439666739e-06, + "loss": 3.3865, + "step": 29370 + }, + { + "epoch": 0.8607632022266168, + "grad_norm": 11.026680946350098, + "learning_rate": 8.36773566095048e-06, + "loss": 3.3927, + "step": 29380 + }, + { + "epoch": 0.8610561781293489, + "grad_norm": 10.402448654174805, + "learning_rate": 8.366566545312248e-06, + "loss": 3.3833, + "step": 29390 + }, + { + "epoch": 0.8613491540320809, + "grad_norm": 10.14723014831543, + "learning_rate": 8.365397092869007e-06, + "loss": 3.3737, + "step": 29400 + }, + { + "epoch": 0.8616421299348128, + "grad_norm": 10.904383659362793, + "learning_rate": 8.364227303737755e-06, + "loss": 3.3636, + "step": 29410 + }, + { + "epoch": 0.8619351058375448, + "grad_norm": 11.157063484191895, + "learning_rate": 8.363057178035518e-06, + "loss": 3.3743, + "step": 29420 + }, + { + "epoch": 0.8622280817402769, + "grad_norm": 11.179853439331055, + "learning_rate": 8.361886715879364e-06, + "loss": 3.3937, + "step": 29430 + }, + { + "epoch": 0.8625210576430089, + "grad_norm": 10.853559494018555, + "learning_rate": 8.360715917386388e-06, + "loss": 3.3756, + "step": 29440 + }, + { + "epoch": 0.8628140335457408, + "grad_norm": 12.377105712890625, + "learning_rate": 8.359544782673726e-06, + "loss": 3.3633, + "step": 29450 + }, + { + "epoch": 0.8631070094484729, + "grad_norm": 12.135165214538574, + 
"learning_rate": 8.358373311858536e-06, + "loss": 3.4042, + "step": 29460 + }, + { + "epoch": 0.8633999853512049, + "grad_norm": 10.837443351745605, + "learning_rate": 8.357201505058024e-06, + "loss": 3.3779, + "step": 29470 + }, + { + "epoch": 0.8636929612539369, + "grad_norm": 10.91527271270752, + "learning_rate": 8.356029362389418e-06, + "loss": 3.3621, + "step": 29480 + }, + { + "epoch": 0.8639859371566688, + "grad_norm": 11.752740859985352, + "learning_rate": 8.354856883969984e-06, + "loss": 3.3635, + "step": 29490 + }, + { + "epoch": 0.8642789130594009, + "grad_norm": 11.46235179901123, + "learning_rate": 8.353684069917022e-06, + "loss": 3.3722, + "step": 29500 + }, + { + "epoch": 0.8645718889621329, + "grad_norm": 10.231536865234375, + "learning_rate": 8.352510920347868e-06, + "loss": 3.3592, + "step": 29510 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 11.671590805053711, + "learning_rate": 8.351337435379885e-06, + "loss": 3.3758, + "step": 29520 + }, + { + "epoch": 0.8651578407675968, + "grad_norm": 11.296802520751953, + "learning_rate": 8.350163615130474e-06, + "loss": 3.3711, + "step": 29530 + }, + { + "epoch": 0.8654508166703289, + "grad_norm": 11.545193672180176, + "learning_rate": 8.34898945971707e-06, + "loss": 3.3831, + "step": 29540 + }, + { + "epoch": 0.8657437925730609, + "grad_norm": 10.37607192993164, + "learning_rate": 8.347814969257138e-06, + "loss": 3.3735, + "step": 29550 + }, + { + "epoch": 0.8660367684757929, + "grad_norm": 12.001282691955566, + "learning_rate": 8.34664014386818e-06, + "loss": 3.3975, + "step": 29560 + }, + { + "epoch": 0.8663297443785248, + "grad_norm": 11.215582847595215, + "learning_rate": 8.345464983667732e-06, + "loss": 3.4069, + "step": 29570 + }, + { + "epoch": 0.8666227202812569, + "grad_norm": 11.42966079711914, + "learning_rate": 8.344289488773361e-06, + "loss": 3.3739, + "step": 29580 + }, + { + "epoch": 0.8669156961839889, + "grad_norm": 10.603336334228516, + "learning_rate": 8.343113659302666e-06, + 
"loss": 3.3715, + "step": 29590 + }, + { + "epoch": 0.8672086720867209, + "grad_norm": 10.892402648925781, + "learning_rate": 8.341937495373284e-06, + "loss": 3.3987, + "step": 29600 + }, + { + "epoch": 0.8675016479894528, + "grad_norm": 10.772066116333008, + "learning_rate": 8.340760997102883e-06, + "loss": 3.3688, + "step": 29610 + }, + { + "epoch": 0.8677946238921849, + "grad_norm": 11.466285705566406, + "learning_rate": 8.339584164609164e-06, + "loss": 3.3774, + "step": 29620 + }, + { + "epoch": 0.8680875997949169, + "grad_norm": 10.38890552520752, + "learning_rate": 8.338406998009863e-06, + "loss": 3.37, + "step": 29630 + }, + { + "epoch": 0.8683805756976488, + "grad_norm": 10.548385620117188, + "learning_rate": 8.337229497422748e-06, + "loss": 3.3836, + "step": 29640 + }, + { + "epoch": 0.8686735516003808, + "grad_norm": 10.456439971923828, + "learning_rate": 8.336051662965619e-06, + "loss": 3.3523, + "step": 29650 + }, + { + "epoch": 0.8689665275031129, + "grad_norm": 11.013508796691895, + "learning_rate": 8.334873494756315e-06, + "loss": 3.3904, + "step": 29660 + }, + { + "epoch": 0.8692595034058449, + "grad_norm": 10.943793296813965, + "learning_rate": 8.333694992912702e-06, + "loss": 3.3797, + "step": 29670 + }, + { + "epoch": 0.8695524793085768, + "grad_norm": 10.760442733764648, + "learning_rate": 8.332516157552684e-06, + "loss": 3.3574, + "step": 29680 + }, + { + "epoch": 0.8698454552113088, + "grad_norm": 11.023306846618652, + "learning_rate": 8.331336988794193e-06, + "loss": 3.3608, + "step": 29690 + }, + { + "epoch": 0.8700212407529481, + "eval_bleu": 0.3331718658569224, + "eval_cap_loss": 0.9618687629699707, + "eval_con_loss": 1.2964668273925781, + "eval_loss": 3.554802656173706, + "step": 29696 + }, + { + "epoch": 0.8700212407529481, + "eval_bleu": 0.3331718658569224, + "eval_cap_loss": 0.9618687629699707, + "eval_con_loss": 1.2964668273925781, + "eval_loss": 3.554802656173706, + "eval_runtime": 57.1287, + "eval_samples_per_second": 350.087, + 
"eval_steps_per_second": 0.35, + "step": 29696 + }, + { + "epoch": 0.8701384311140409, + "grad_norm": 11.158675193786621, + "learning_rate": 8.330157486755204e-06, + "loss": 3.3755, + "step": 29700 + }, + { + "epoch": 0.8704314070167729, + "grad_norm": 11.10872745513916, + "learning_rate": 8.328977651553713e-06, + "loss": 3.3645, + "step": 29710 + }, + { + "epoch": 0.8707243829195048, + "grad_norm": 10.27478313446045, + "learning_rate": 8.32779748330776e-06, + "loss": 3.3471, + "step": 29720 + }, + { + "epoch": 0.8710173588222369, + "grad_norm": 11.428064346313477, + "learning_rate": 8.326616982135412e-06, + "loss": 3.3454, + "step": 29730 + }, + { + "epoch": 0.8713103347249689, + "grad_norm": 10.515045166015625, + "learning_rate": 8.325436148154768e-06, + "loss": 3.3654, + "step": 29740 + }, + { + "epoch": 0.8716033106277009, + "grad_norm": 10.74237060546875, + "learning_rate": 8.32425498148397e-06, + "loss": 3.3895, + "step": 29750 + }, + { + "epoch": 0.8718962865304328, + "grad_norm": 11.028646469116211, + "learning_rate": 8.323073482241183e-06, + "loss": 3.3789, + "step": 29760 + }, + { + "epoch": 0.8721892624331649, + "grad_norm": 10.529708862304688, + "learning_rate": 8.321891650544606e-06, + "loss": 3.3827, + "step": 29770 + }, + { + "epoch": 0.8724822383358969, + "grad_norm": 11.3214111328125, + "learning_rate": 8.32070948651248e-06, + "loss": 3.3637, + "step": 29780 + }, + { + "epoch": 0.8727752142386289, + "grad_norm": 10.530867576599121, + "learning_rate": 8.319526990263071e-06, + "loss": 3.3719, + "step": 29790 + }, + { + "epoch": 0.8730681901413608, + "grad_norm": 11.517393112182617, + "learning_rate": 8.31834416191468e-06, + "loss": 3.3845, + "step": 29800 + }, + { + "epoch": 0.8733611660440929, + "grad_norm": 10.259695053100586, + "learning_rate": 8.317161001585643e-06, + "loss": 3.3568, + "step": 29810 + }, + { + "epoch": 0.8736541419468249, + "grad_norm": 11.420149803161621, + "learning_rate": 8.315977509394329e-06, + "loss": 3.3732, + "step": 
29820 + }, + { + "epoch": 0.8739471178495569, + "grad_norm": 11.657552719116211, + "learning_rate": 8.314793685459135e-06, + "loss": 3.3829, + "step": 29830 + }, + { + "epoch": 0.8742400937522888, + "grad_norm": 10.584972381591797, + "learning_rate": 8.313609529898502e-06, + "loss": 3.3568, + "step": 29840 + }, + { + "epoch": 0.8745330696550209, + "grad_norm": 10.239253044128418, + "learning_rate": 8.312425042830892e-06, + "loss": 3.3837, + "step": 29850 + }, + { + "epoch": 0.8748260455577529, + "grad_norm": 11.099543571472168, + "learning_rate": 8.31124022437481e-06, + "loss": 3.3733, + "step": 29860 + }, + { + "epoch": 0.8751190214604849, + "grad_norm": 10.32801342010498, + "learning_rate": 8.310055074648789e-06, + "loss": 3.3617, + "step": 29870 + }, + { + "epoch": 0.8754119973632168, + "grad_norm": 10.694561004638672, + "learning_rate": 8.308869593771395e-06, + "loss": 3.3632, + "step": 29880 + }, + { + "epoch": 0.8757049732659489, + "grad_norm": 11.176569938659668, + "learning_rate": 8.307683781861228e-06, + "loss": 3.356, + "step": 29890 + }, + { + "epoch": 0.8759979491686809, + "grad_norm": 10.668281555175781, + "learning_rate": 8.306497639036923e-06, + "loss": 3.374, + "step": 29900 + }, + { + "epoch": 0.8762909250714129, + "grad_norm": 10.938618659973145, + "learning_rate": 8.305311165417147e-06, + "loss": 3.3671, + "step": 29910 + }, + { + "epoch": 0.8765839009741448, + "grad_norm": 11.486509323120117, + "learning_rate": 8.304124361120598e-06, + "loss": 3.3612, + "step": 29920 + }, + { + "epoch": 0.8768768768768769, + "grad_norm": 10.901534080505371, + "learning_rate": 8.30293722626601e-06, + "loss": 3.3453, + "step": 29930 + }, + { + "epoch": 0.8771698527796089, + "grad_norm": 10.841011047363281, + "learning_rate": 8.301749760972146e-06, + "loss": 3.362, + "step": 29940 + }, + { + "epoch": 0.8774628286823408, + "grad_norm": 11.640337944030762, + "learning_rate": 8.30056196535781e-06, + "loss": 3.3876, + "step": 29950 + }, + { + "epoch": 
0.8777558045850729, + "grad_norm": 11.587151527404785, + "learning_rate": 8.299492666979114e-06, + "loss": 3.3418, + "step": 29960 + }, + { + "epoch": 0.8780487804878049, + "grad_norm": 10.065569877624512, + "learning_rate": 8.298304244083282e-06, + "loss": 3.3829, + "step": 29970 + }, + { + "epoch": 0.8783417563905369, + "grad_norm": 11.404741287231445, + "learning_rate": 8.29711549121168e-06, + "loss": 3.3572, + "step": 29980 + }, + { + "epoch": 0.8786347322932688, + "grad_norm": 11.547711372375488, + "learning_rate": 8.295926408483235e-06, + "loss": 3.3817, + "step": 29990 + }, + { + "epoch": 0.8789277081960009, + "grad_norm": 10.273787498474121, + "learning_rate": 8.294736996016905e-06, + "loss": 3.3686, + "step": 30000 + }, + { + "epoch": 0.8792206840987329, + "grad_norm": 10.919272422790527, + "learning_rate": 8.293547253931687e-06, + "loss": 3.3654, + "step": 30010 + }, + { + "epoch": 0.8795136600014649, + "grad_norm": 10.960151672363281, + "learning_rate": 8.292357182346605e-06, + "loss": 3.3715, + "step": 30020 + }, + { + "epoch": 0.8798066359041968, + "grad_norm": 11.728584289550781, + "learning_rate": 8.291166781380721e-06, + "loss": 3.3565, + "step": 30030 + }, + { + "epoch": 0.8800996118069289, + "grad_norm": 10.396467208862305, + "learning_rate": 8.289976051153126e-06, + "loss": 3.3767, + "step": 30040 + }, + { + "epoch": 0.8803925877096609, + "grad_norm": 11.445452690124512, + "learning_rate": 8.288784991782945e-06, + "loss": 3.3685, + "step": 30050 + }, + { + "epoch": 0.8806855636123929, + "grad_norm": 11.595305442810059, + "learning_rate": 8.287593603389339e-06, + "loss": 3.3619, + "step": 30060 + }, + { + "epoch": 0.8809785395151248, + "grad_norm": 10.420138359069824, + "learning_rate": 8.286401886091495e-06, + "loss": 3.3511, + "step": 30070 + }, + { + "epoch": 0.8812715154178569, + "grad_norm": 10.805855751037598, + "learning_rate": 8.285209840008641e-06, + "loss": 3.3668, + "step": 30080 + }, + { + "epoch": 0.8815644913205889, + "grad_norm": 
10.25864028930664, + "learning_rate": 8.284017465260035e-06, + "loss": 3.3611, + "step": 30090 + }, + { + "epoch": 0.8818574672233209, + "grad_norm": 11.125330924987793, + "learning_rate": 8.282824761964963e-06, + "loss": 3.3817, + "step": 30100 + }, + { + "epoch": 0.8821504431260528, + "grad_norm": 10.698201179504395, + "learning_rate": 8.281631730242749e-06, + "loss": 3.3704, + "step": 30110 + }, + { + "epoch": 0.8824434190287849, + "grad_norm": 9.5828857421875, + "learning_rate": 8.280438370212751e-06, + "loss": 3.3522, + "step": 30120 + }, + { + "epoch": 0.8827363949315169, + "grad_norm": 10.810783386230469, + "learning_rate": 8.279244681994357e-06, + "loss": 3.3812, + "step": 30130 + }, + { + "epoch": 0.8830293708342489, + "grad_norm": 10.337356567382812, + "learning_rate": 8.278050665706986e-06, + "loss": 3.3573, + "step": 30140 + }, + { + "epoch": 0.8833223467369808, + "grad_norm": 10.310083389282227, + "learning_rate": 8.276856321470095e-06, + "loss": 3.3561, + "step": 30150 + }, + { + "epoch": 0.8836153226397129, + "grad_norm": 11.838809967041016, + "learning_rate": 8.275661649403169e-06, + "loss": 3.3459, + "step": 30160 + }, + { + "epoch": 0.8839082985424449, + "grad_norm": 10.702436447143555, + "learning_rate": 8.27446664962573e-06, + "loss": 3.3689, + "step": 30170 + }, + { + "epoch": 0.8842012744451769, + "grad_norm": 9.795417785644531, + "learning_rate": 8.273271322257329e-06, + "loss": 3.3349, + "step": 30180 + }, + { + "epoch": 0.8844942503479089, + "grad_norm": 10.537034034729004, + "learning_rate": 8.272075667417552e-06, + "loss": 3.3596, + "step": 30190 + }, + { + "epoch": 0.8847872262506409, + "grad_norm": 12.37125301361084, + "learning_rate": 8.270879685226017e-06, + "loss": 3.3771, + "step": 30200 + }, + { + "epoch": 0.8850216069728265, + "eval_bleu": 0.3336200247375735, + "eval_cap_loss": 0.957573652267456, + "eval_con_loss": 1.297119379043579, + "eval_loss": 3.5518126487731934, + "step": 30208 + }, + { + "epoch": 0.8850216069728265, + 
"eval_bleu": 0.3336200247375735, + "eval_cap_loss": 0.957573652267456, + "eval_con_loss": 1.297119379043579, + "eval_loss": 3.5518126487731934, + "eval_runtime": 60.5564, + "eval_samples_per_second": 330.271, + "eval_steps_per_second": 0.33, + "step": 30208 + }, + { + "epoch": 0.8850802021533729, + "grad_norm": 11.35163402557373, + "learning_rate": 8.269683375802372e-06, + "loss": 3.3489, + "step": 30210 + }, + { + "epoch": 0.8853731780561049, + "grad_norm": 11.578800201416016, + "learning_rate": 8.268486739266308e-06, + "loss": 3.3674, + "step": 30220 + }, + { + "epoch": 0.8856661539588369, + "grad_norm": 10.513941764831543, + "learning_rate": 8.267289775737535e-06, + "loss": 3.359, + "step": 30230 + }, + { + "epoch": 0.8859591298615689, + "grad_norm": 11.75010871887207, + "learning_rate": 8.266092485335805e-06, + "loss": 3.3763, + "step": 30240 + }, + { + "epoch": 0.8862521057643009, + "grad_norm": 11.764273643493652, + "learning_rate": 8.264894868180898e-06, + "loss": 3.3502, + "step": 30250 + }, + { + "epoch": 0.8865450816670329, + "grad_norm": 11.533585548400879, + "learning_rate": 8.26369692439263e-06, + "loss": 3.371, + "step": 30260 + }, + { + "epoch": 0.8868380575697649, + "grad_norm": 11.605118751525879, + "learning_rate": 8.262498654090846e-06, + "loss": 3.3628, + "step": 30270 + }, + { + "epoch": 0.8871310334724969, + "grad_norm": 11.501765251159668, + "learning_rate": 8.26130005739543e-06, + "loss": 3.381, + "step": 30280 + }, + { + "epoch": 0.8874240093752289, + "grad_norm": 11.384305000305176, + "learning_rate": 8.26010113442629e-06, + "loss": 3.3822, + "step": 30290 + }, + { + "epoch": 0.8877169852779608, + "grad_norm": 10.195365905761719, + "learning_rate": 8.258901885303374e-06, + "loss": 3.37, + "step": 30300 + }, + { + "epoch": 0.8880099611806929, + "grad_norm": 10.441720008850098, + "learning_rate": 8.25770231014666e-06, + "loss": 3.3514, + "step": 30310 + }, + { + "epoch": 0.8883029370834249, + "grad_norm": 10.517163276672363, + 
"learning_rate": 8.256502409076154e-06, + "loss": 3.3868, + "step": 30320 + }, + { + "epoch": 0.8885959129861569, + "grad_norm": 11.692268371582031, + "learning_rate": 8.255302182211904e-06, + "loss": 3.3664, + "step": 30330 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 10.180502891540527, + "learning_rate": 8.254101629673983e-06, + "loss": 3.3716, + "step": 30340 + }, + { + "epoch": 0.8891818647916209, + "grad_norm": 10.401322364807129, + "learning_rate": 8.2529007515825e-06, + "loss": 3.3802, + "step": 30350 + }, + { + "epoch": 0.8894748406943529, + "grad_norm": 11.941841125488281, + "learning_rate": 8.251699548057596e-06, + "loss": 3.36, + "step": 30360 + }, + { + "epoch": 0.8897678165970849, + "grad_norm": 10.77122974395752, + "learning_rate": 8.250498019219445e-06, + "loss": 3.3705, + "step": 30370 + }, + { + "epoch": 0.8900607924998168, + "grad_norm": 10.484074592590332, + "learning_rate": 8.24929616518825e-06, + "loss": 3.3656, + "step": 30380 + }, + { + "epoch": 0.8903537684025489, + "grad_norm": 10.346179008483887, + "learning_rate": 8.248093986084252e-06, + "loss": 3.3623, + "step": 30390 + }, + { + "epoch": 0.8906467443052809, + "grad_norm": 10.254220008850098, + "learning_rate": 8.246891482027722e-06, + "loss": 3.3306, + "step": 30400 + }, + { + "epoch": 0.8909397202080129, + "grad_norm": 11.11728286743164, + "learning_rate": 8.245688653138959e-06, + "loss": 3.3276, + "step": 30410 + }, + { + "epoch": 0.8912326961107448, + "grad_norm": 11.544373512268066, + "learning_rate": 8.244485499538305e-06, + "loss": 3.3613, + "step": 30420 + }, + { + "epoch": 0.8915256720134769, + "grad_norm": 12.357559204101562, + "learning_rate": 8.243282021346125e-06, + "loss": 3.3418, + "step": 30430 + }, + { + "epoch": 0.8918186479162089, + "grad_norm": 11.418152809143066, + "learning_rate": 8.242078218682821e-06, + "loss": 3.3517, + "step": 30440 + }, + { + "epoch": 0.8921116238189409, + "grad_norm": 10.797380447387695, + "learning_rate": 8.240874091668824e-06, + 
"loss": 3.3473, + "step": 30450 + }, + { + "epoch": 0.8924045997216729, + "grad_norm": 11.474592208862305, + "learning_rate": 8.239669640424603e-06, + "loss": 3.3574, + "step": 30460 + }, + { + "epoch": 0.8926975756244049, + "grad_norm": 11.103228569030762, + "learning_rate": 8.238464865070655e-06, + "loss": 3.3505, + "step": 30470 + }, + { + "epoch": 0.8929905515271369, + "grad_norm": 10.879101753234863, + "learning_rate": 8.23725976572751e-06, + "loss": 3.3641, + "step": 30480 + }, + { + "epoch": 0.8932835274298689, + "grad_norm": 11.550766944885254, + "learning_rate": 8.23605434251573e-06, + "loss": 3.371, + "step": 30490 + }, + { + "epoch": 0.8935765033326009, + "grad_norm": 11.858642578125, + "learning_rate": 8.234848595555915e-06, + "loss": 3.3346, + "step": 30500 + }, + { + "epoch": 0.8938694792353329, + "grad_norm": 10.627043724060059, + "learning_rate": 8.233642524968688e-06, + "loss": 3.3488, + "step": 30510 + }, + { + "epoch": 0.8941624551380649, + "grad_norm": 12.227088928222656, + "learning_rate": 8.23243613087471e-06, + "loss": 3.3461, + "step": 30520 + }, + { + "epoch": 0.8944554310407969, + "grad_norm": 11.371790885925293, + "learning_rate": 8.231229413394675e-06, + "loss": 3.3678, + "step": 30530 + }, + { + "epoch": 0.8947484069435289, + "grad_norm": 10.951679229736328, + "learning_rate": 8.230022372649309e-06, + "loss": 3.3518, + "step": 30540 + }, + { + "epoch": 0.8950413828462609, + "grad_norm": 10.429259300231934, + "learning_rate": 8.228815008759365e-06, + "loss": 3.4027, + "step": 30550 + }, + { + "epoch": 0.8953343587489929, + "grad_norm": 11.419511795043945, + "learning_rate": 8.227607321845638e-06, + "loss": 3.3686, + "step": 30560 + }, + { + "epoch": 0.895627334651725, + "grad_norm": 10.719669342041016, + "learning_rate": 8.226399312028945e-06, + "loss": 3.3568, + "step": 30570 + }, + { + "epoch": 0.8959203105544569, + "grad_norm": 10.916747093200684, + "learning_rate": 8.225190979430145e-06, + "loss": 3.3541, + "step": 30580 + }, + { + 
"epoch": 0.8962132864571889, + "grad_norm": 10.462042808532715, + "learning_rate": 8.223982324170119e-06, + "loss": 3.3581, + "step": 30590 + }, + { + "epoch": 0.8965062623599209, + "grad_norm": 11.036065101623535, + "learning_rate": 8.22277334636979e-06, + "loss": 3.356, + "step": 30600 + }, + { + "epoch": 0.8967992382626528, + "grad_norm": 11.561779022216797, + "learning_rate": 8.221564046150108e-06, + "loss": 3.3445, + "step": 30610 + }, + { + "epoch": 0.8970922141653849, + "grad_norm": 10.075597763061523, + "learning_rate": 8.220354423632056e-06, + "loss": 3.3545, + "step": 30620 + }, + { + "epoch": 0.8973851900681169, + "grad_norm": 10.882328987121582, + "learning_rate": 8.21914447893665e-06, + "loss": 3.3442, + "step": 30630 + }, + { + "epoch": 0.8976781659708489, + "grad_norm": 10.637252807617188, + "learning_rate": 8.217934212184938e-06, + "loss": 3.3547, + "step": 30640 + }, + { + "epoch": 0.8979711418735808, + "grad_norm": 11.376338958740234, + "learning_rate": 8.216723623497998e-06, + "loss": 3.3414, + "step": 30650 + }, + { + "epoch": 0.8982641177763129, + "grad_norm": 11.676483154296875, + "learning_rate": 8.215512712996943e-06, + "loss": 3.3437, + "step": 30660 + }, + { + "epoch": 0.8985570936790449, + "grad_norm": 11.949992179870605, + "learning_rate": 8.21430148080292e-06, + "loss": 3.3513, + "step": 30670 + }, + { + "epoch": 0.8988500695817769, + "grad_norm": 10.224363327026367, + "learning_rate": 8.2130899270371e-06, + "loss": 3.3516, + "step": 30680 + }, + { + "epoch": 0.8991430454845089, + "grad_norm": 11.194315910339355, + "learning_rate": 8.2118780518207e-06, + "loss": 3.3629, + "step": 30690 + }, + { + "epoch": 0.8994360213872409, + "grad_norm": 10.477357864379883, + "learning_rate": 8.210665855274954e-06, + "loss": 3.3555, + "step": 30700 + }, + { + "epoch": 0.8997289972899729, + "grad_norm": 10.744463920593262, + "learning_rate": 8.209453337521138e-06, + "loss": 3.3704, + "step": 30710 + }, + { + "epoch": 0.9000219731927049, + "grad_norm": 
11.484292984008789, + "learning_rate": 8.208240498680556e-06, + "loss": 3.3264, + "step": 30720 + }, + { + "epoch": 0.9000219731927049, + "eval_bleu": 0.33297462204295514, + "eval_cap_loss": 0.9590845108032227, + "eval_con_loss": 1.2920234203338623, + "eval_loss": 3.543130874633789, + "step": 30720 + }, + { + "epoch": 0.9000219731927049, + "eval_bleu": 0.33297462204295514, + "eval_cap_loss": 0.9590845108032227, + "eval_con_loss": 1.2920234203338623, + "eval_loss": 3.543130874633789, + "eval_runtime": 65.0639, + "eval_samples_per_second": 307.39, + "eval_steps_per_second": 0.307, + "step": 30720 + }, + { + "epoch": 0.9003149490954369, + "grad_norm": 12.36959171295166, + "learning_rate": 8.207027338874546e-06, + "loss": 3.3314, + "step": 30730 + }, + { + "epoch": 0.9006079249981689, + "grad_norm": 11.095726013183594, + "learning_rate": 8.205813858224478e-06, + "loss": 3.3492, + "step": 30740 + }, + { + "epoch": 0.9009009009009009, + "grad_norm": 11.221593856811523, + "learning_rate": 8.204600056851753e-06, + "loss": 3.3402, + "step": 30750 + }, + { + "epoch": 0.9011938768036329, + "grad_norm": 12.373833656311035, + "learning_rate": 8.203385934877803e-06, + "loss": 3.3754, + "step": 30760 + }, + { + "epoch": 0.9014868527063649, + "grad_norm": 10.966906547546387, + "learning_rate": 8.202171492424098e-06, + "loss": 3.3593, + "step": 30770 + }, + { + "epoch": 0.9017798286090969, + "grad_norm": 11.160589218139648, + "learning_rate": 8.20095672961213e-06, + "loss": 3.364, + "step": 30780 + }, + { + "epoch": 0.9020728045118289, + "grad_norm": 10.270081520080566, + "learning_rate": 8.199741646563435e-06, + "loss": 3.3363, + "step": 30790 + }, + { + "epoch": 0.902365780414561, + "grad_norm": 11.395282745361328, + "learning_rate": 8.19852624339957e-06, + "loss": 3.3422, + "step": 30800 + }, + { + "epoch": 0.9026587563172929, + "grad_norm": 11.36366081237793, + "learning_rate": 8.19731052024213e-06, + "loss": 3.3336, + "step": 30810 + }, + { + "epoch": 0.9029517322200249, + 
"grad_norm": 11.945984840393066, + "learning_rate": 8.196094477212741e-06, + "loss": 3.3665, + "step": 30820 + }, + { + "epoch": 0.9032447081227569, + "grad_norm": 10.891108512878418, + "learning_rate": 8.194878114433062e-06, + "loss": 3.3548, + "step": 30830 + }, + { + "epoch": 0.903537684025489, + "grad_norm": 10.005308151245117, + "learning_rate": 8.193661432024783e-06, + "loss": 3.3288, + "step": 30840 + }, + { + "epoch": 0.9038306599282209, + "grad_norm": 10.809102058410645, + "learning_rate": 8.192444430109626e-06, + "loss": 3.34, + "step": 30850 + }, + { + "epoch": 0.9041236358309529, + "grad_norm": 11.127388954162598, + "learning_rate": 8.19122710880934e-06, + "loss": 3.3382, + "step": 30860 + }, + { + "epoch": 0.9044166117336849, + "grad_norm": 10.717702865600586, + "learning_rate": 8.19000946824572e-06, + "loss": 3.3361, + "step": 30870 + }, + { + "epoch": 0.904709587636417, + "grad_norm": 10.935877799987793, + "learning_rate": 8.188791508540574e-06, + "loss": 3.3513, + "step": 30880 + }, + { + "epoch": 0.9050025635391489, + "grad_norm": 10.84599494934082, + "learning_rate": 8.187573229815757e-06, + "loss": 3.3335, + "step": 30890 + }, + { + "epoch": 0.9052955394418809, + "grad_norm": 10.283318519592285, + "learning_rate": 8.186354632193151e-06, + "loss": 3.3561, + "step": 30900 + }, + { + "epoch": 0.9055885153446129, + "grad_norm": 10.34257984161377, + "learning_rate": 8.185135715794668e-06, + "loss": 3.3599, + "step": 30910 + }, + { + "epoch": 0.905881491247345, + "grad_norm": 10.169829368591309, + "learning_rate": 8.183916480742252e-06, + "loss": 3.3611, + "step": 30920 + }, + { + "epoch": 0.9061744671500769, + "grad_norm": 11.101421356201172, + "learning_rate": 8.182696927157883e-06, + "loss": 3.349, + "step": 30930 + }, + { + "epoch": 0.9064674430528089, + "grad_norm": 10.266020774841309, + "learning_rate": 8.181477055163566e-06, + "loss": 3.3482, + "step": 30940 + }, + { + "epoch": 0.9067604189555409, + "grad_norm": 10.23483943939209, + 
"learning_rate": 8.180256864881348e-06, + "loss": 3.3499, + "step": 30950 + }, + { + "epoch": 0.9070533948582729, + "grad_norm": 11.132615089416504, + "learning_rate": 8.179036356433296e-06, + "loss": 3.3458, + "step": 30960 + }, + { + "epoch": 0.9073463707610049, + "grad_norm": 10.921674728393555, + "learning_rate": 8.17781552994152e-06, + "loss": 3.3395, + "step": 30970 + }, + { + "epoch": 0.9076393466637369, + "grad_norm": 10.196318626403809, + "learning_rate": 8.176594385528151e-06, + "loss": 3.3263, + "step": 30980 + }, + { + "epoch": 0.9079323225664689, + "grad_norm": 11.190627098083496, + "learning_rate": 8.17537292331536e-06, + "loss": 3.3459, + "step": 30990 + }, + { + "epoch": 0.9082252984692009, + "grad_norm": 10.071724891662598, + "learning_rate": 8.174151143425348e-06, + "loss": 3.3341, + "step": 31000 + }, + { + "epoch": 0.9085182743719329, + "grad_norm": 11.593085289001465, + "learning_rate": 8.172929045980345e-06, + "loss": 3.3589, + "step": 31010 + }, + { + "epoch": 0.9088112502746649, + "grad_norm": 10.566102981567383, + "learning_rate": 8.171706631102616e-06, + "loss": 3.3562, + "step": 31020 + }, + { + "epoch": 0.909104226177397, + "grad_norm": 9.7378511428833, + "learning_rate": 8.170483898914455e-06, + "loss": 3.3457, + "step": 31030 + }, + { + "epoch": 0.9093972020801289, + "grad_norm": 9.868982315063477, + "learning_rate": 8.16926084953819e-06, + "loss": 3.321, + "step": 31040 + }, + { + "epoch": 0.9096901779828609, + "grad_norm": 10.682579040527344, + "learning_rate": 8.16803748309618e-06, + "loss": 3.3313, + "step": 31050 + }, + { + "epoch": 0.9099831538855929, + "grad_norm": 10.412065505981445, + "learning_rate": 8.166813799710814e-06, + "loss": 3.3449, + "step": 31060 + }, + { + "epoch": 0.910276129788325, + "grad_norm": 11.274150848388672, + "learning_rate": 8.165589799504516e-06, + "loss": 3.3718, + "step": 31070 + }, + { + "epoch": 0.9105691056910569, + "grad_norm": 10.222894668579102, + "learning_rate": 8.16436548259974e-06, + 
"loss": 3.3492, + "step": 31080 + }, + { + "epoch": 0.9108620815937889, + "grad_norm": 10.156421661376953, + "learning_rate": 8.16314084911897e-06, + "loss": 3.3643, + "step": 31090 + }, + { + "epoch": 0.9111550574965209, + "grad_norm": 10.174240112304688, + "learning_rate": 8.161915899184725e-06, + "loss": 3.3538, + "step": 31100 + }, + { + "epoch": 0.911448033399253, + "grad_norm": 10.48812484741211, + "learning_rate": 8.160690632919553e-06, + "loss": 3.3619, + "step": 31110 + }, + { + "epoch": 0.9117410093019849, + "grad_norm": 11.857813835144043, + "learning_rate": 8.159465050446034e-06, + "loss": 3.3494, + "step": 31120 + }, + { + "epoch": 0.9120339852047169, + "grad_norm": 9.652013778686523, + "learning_rate": 8.158239151886783e-06, + "loss": 3.3266, + "step": 31130 + }, + { + "epoch": 0.9123269611074489, + "grad_norm": 11.881214141845703, + "learning_rate": 8.157012937364442e-06, + "loss": 3.337, + "step": 31140 + }, + { + "epoch": 0.912619937010181, + "grad_norm": 10.999650001525879, + "learning_rate": 8.155786407001685e-06, + "loss": 3.3542, + "step": 31150 + }, + { + "epoch": 0.9129129129129129, + "grad_norm": 11.81067180633545, + "learning_rate": 8.15455956092122e-06, + "loss": 3.3231, + "step": 31160 + }, + { + "epoch": 0.9132058888156449, + "grad_norm": 11.092147827148438, + "learning_rate": 8.153332399245788e-06, + "loss": 3.3324, + "step": 31170 + }, + { + "epoch": 0.9134988647183769, + "grad_norm": 10.243229866027832, + "learning_rate": 8.152104922098158e-06, + "loss": 3.337, + "step": 31180 + }, + { + "epoch": 0.913791840621109, + "grad_norm": 9.969902992248535, + "learning_rate": 8.150877129601129e-06, + "loss": 3.3409, + "step": 31190 + }, + { + "epoch": 0.9140848165238409, + "grad_norm": 9.988624572753906, + "learning_rate": 8.14964902187754e-06, + "loss": 3.3492, + "step": 31200 + }, + { + "epoch": 0.9143777924265729, + "grad_norm": 10.963723182678223, + "learning_rate": 8.148420599050248e-06, + "loss": 3.3122, + "step": 31210 + }, + { + 
"epoch": 0.9146707683293049, + "grad_norm": 11.160887718200684, + "learning_rate": 8.147191861242158e-06, + "loss": 3.351, + "step": 31220 + }, + { + "epoch": 0.914963744232037, + "grad_norm": 11.263592720031738, + "learning_rate": 8.145962808576194e-06, + "loss": 3.3437, + "step": 31230 + }, + { + "epoch": 0.9150223394125833, + "eval_bleu": 0.33486966821176095, + "eval_cap_loss": 0.955473780632019, + "eval_con_loss": 1.286455750465393, + "eval_loss": 3.5283854007720947, + "step": 31232 + }, + { + "epoch": 0.9150223394125833, + "eval_bleu": 0.33486966821176095, + "eval_cap_loss": 0.955473780632019, + "eval_con_loss": 1.286455750465393, + "eval_loss": 3.5283854007720947, + "eval_runtime": 55.3287, + "eval_samples_per_second": 361.476, + "eval_steps_per_second": 0.361, + "step": 31232 + }, + { + "epoch": 0.9152567201347689, + "grad_norm": 10.23577880859375, + "learning_rate": 8.144733441175314e-06, + "loss": 3.3195, + "step": 31240 + }, + { + "epoch": 0.9155496960375009, + "grad_norm": 11.612424850463867, + "learning_rate": 8.143503759162509e-06, + "loss": 3.3506, + "step": 31250 + }, + { + "epoch": 0.9158426719402329, + "grad_norm": 9.771659851074219, + "learning_rate": 8.142273762660805e-06, + "loss": 3.3461, + "step": 31260 + }, + { + "epoch": 0.9161356478429649, + "grad_norm": 11.110848426818848, + "learning_rate": 8.141043451793252e-06, + "loss": 3.3299, + "step": 31270 + }, + { + "epoch": 0.9164286237456969, + "grad_norm": 11.279923439025879, + "learning_rate": 8.139812826682937e-06, + "loss": 3.3244, + "step": 31280 + }, + { + "epoch": 0.9167215996484289, + "grad_norm": 10.593377113342285, + "learning_rate": 8.138581887452977e-06, + "loss": 3.3351, + "step": 31290 + }, + { + "epoch": 0.917014575551161, + "grad_norm": 11.220492362976074, + "learning_rate": 8.13735063422652e-06, + "loss": 3.3443, + "step": 31300 + }, + { + "epoch": 0.9173075514538929, + "grad_norm": 12.235224723815918, + "learning_rate": 8.136119067126745e-06, + "loss": 3.3388, + "step": 31310 + 
}, + { + "epoch": 0.9176005273566249, + "grad_norm": 10.413290023803711, + "learning_rate": 8.134887186276863e-06, + "loss": 3.3503, + "step": 31320 + }, + { + "epoch": 0.9178935032593569, + "grad_norm": 12.296113014221191, + "learning_rate": 8.133654991800118e-06, + "loss": 3.3547, + "step": 31330 + }, + { + "epoch": 0.918186479162089, + "grad_norm": 10.58515739440918, + "learning_rate": 8.13242248381978e-06, + "loss": 3.3204, + "step": 31340 + }, + { + "epoch": 0.9184794550648209, + "grad_norm": 11.940128326416016, + "learning_rate": 8.131189662459158e-06, + "loss": 3.3224, + "step": 31350 + }, + { + "epoch": 0.9187724309675529, + "grad_norm": 11.84405517578125, + "learning_rate": 8.129956527841588e-06, + "loss": 3.3433, + "step": 31360 + }, + { + "epoch": 0.9190654068702849, + "grad_norm": 10.385124206542969, + "learning_rate": 8.128723080090435e-06, + "loss": 3.3324, + "step": 31370 + }, + { + "epoch": 0.919358382773017, + "grad_norm": 10.557710647583008, + "learning_rate": 8.1274893193291e-06, + "loss": 3.3306, + "step": 31380 + }, + { + "epoch": 0.9196513586757489, + "grad_norm": 10.431175231933594, + "learning_rate": 8.126255245681014e-06, + "loss": 3.3308, + "step": 31390 + }, + { + "epoch": 0.9199443345784809, + "grad_norm": 10.708431243896484, + "learning_rate": 8.125020859269638e-06, + "loss": 3.3343, + "step": 31400 + }, + { + "epoch": 0.9202373104812129, + "grad_norm": 12.333030700683594, + "learning_rate": 8.123786160218466e-06, + "loss": 3.3171, + "step": 31410 + }, + { + "epoch": 0.920530286383945, + "grad_norm": 11.155685424804688, + "learning_rate": 8.12255114865102e-06, + "loss": 3.3277, + "step": 31420 + }, + { + "epoch": 0.9208232622866769, + "grad_norm": 10.68431282043457, + "learning_rate": 8.121315824690856e-06, + "loss": 3.3405, + "step": 31430 + }, + { + "epoch": 0.9211162381894089, + "grad_norm": 11.0316162109375, + "learning_rate": 8.120080188461564e-06, + "loss": 3.3035, + "step": 31440 + }, + { + "epoch": 0.9214092140921409, + 
"grad_norm": 10.157567977905273, + "learning_rate": 8.11884424008676e-06, + "loss": 3.3098, + "step": 31450 + }, + { + "epoch": 0.921702189994873, + "grad_norm": 10.289789199829102, + "learning_rate": 8.117607979690092e-06, + "loss": 3.328, + "step": 31460 + }, + { + "epoch": 0.9219951658976049, + "grad_norm": 10.274734497070312, + "learning_rate": 8.116371407395243e-06, + "loss": 3.3242, + "step": 31470 + }, + { + "epoch": 0.9222881418003369, + "grad_norm": 10.449366569519043, + "learning_rate": 8.115134523325923e-06, + "loss": 3.346, + "step": 31480 + }, + { + "epoch": 0.9225811177030689, + "grad_norm": 10.659029006958008, + "learning_rate": 8.113897327605875e-06, + "loss": 3.3441, + "step": 31490 + }, + { + "epoch": 0.922874093605801, + "grad_norm": 10.089306831359863, + "learning_rate": 8.112659820358874e-06, + "loss": 3.3494, + "step": 31500 + }, + { + "epoch": 0.9231670695085329, + "grad_norm": 10.577277183532715, + "learning_rate": 8.111422001708725e-06, + "loss": 3.3314, + "step": 31510 + }, + { + "epoch": 0.9234600454112649, + "grad_norm": 11.09656810760498, + "learning_rate": 8.110183871779263e-06, + "loss": 3.3248, + "step": 31520 + }, + { + "epoch": 0.923753021313997, + "grad_norm": 11.671804428100586, + "learning_rate": 8.108945430694359e-06, + "loss": 3.3122, + "step": 31530 + }, + { + "epoch": 0.924045997216729, + "grad_norm": 10.281843185424805, + "learning_rate": 8.107706678577906e-06, + "loss": 3.3526, + "step": 31540 + }, + { + "epoch": 0.9243389731194609, + "grad_norm": 10.599287033081055, + "learning_rate": 8.10646761555384e-06, + "loss": 3.3594, + "step": 31550 + }, + { + "epoch": 0.9246319490221929, + "grad_norm": 10.392760276794434, + "learning_rate": 8.105228241746117e-06, + "loss": 3.3301, + "step": 31560 + }, + { + "epoch": 0.924924924924925, + "grad_norm": 10.65473461151123, + "learning_rate": 8.103988557278732e-06, + "loss": 3.3249, + "step": 31570 + }, + { + "epoch": 0.925217900827657, + "grad_norm": 10.607946395874023, + 
"learning_rate": 8.102748562275708e-06, + "loss": 3.3349, + "step": 31580 + }, + { + "epoch": 0.9255108767303889, + "grad_norm": 10.497553825378418, + "learning_rate": 8.101508256861099e-06, + "loss": 3.358, + "step": 31590 + }, + { + "epoch": 0.9258038526331209, + "grad_norm": 11.192273139953613, + "learning_rate": 8.100267641158988e-06, + "loss": 3.341, + "step": 31600 + }, + { + "epoch": 0.926096828535853, + "grad_norm": 10.933640480041504, + "learning_rate": 8.099026715293492e-06, + "loss": 3.3008, + "step": 31610 + }, + { + "epoch": 0.9263898044385849, + "grad_norm": 10.651044845581055, + "learning_rate": 8.09778547938876e-06, + "loss": 3.3157, + "step": 31620 + }, + { + "epoch": 0.9266827803413169, + "grad_norm": 11.229990005493164, + "learning_rate": 8.09654393356897e-06, + "loss": 3.3232, + "step": 31630 + }, + { + "epoch": 0.9269757562440489, + "grad_norm": 12.084211349487305, + "learning_rate": 8.09530207795833e-06, + "loss": 3.3423, + "step": 31640 + }, + { + "epoch": 0.927268732146781, + "grad_norm": 10.976543426513672, + "learning_rate": 8.094059912681081e-06, + "loss": 3.3282, + "step": 31650 + }, + { + "epoch": 0.9275617080495129, + "grad_norm": 10.227568626403809, + "learning_rate": 8.092817437861495e-06, + "loss": 3.3462, + "step": 31660 + }, + { + "epoch": 0.9278546839522449, + "grad_norm": 10.992524147033691, + "learning_rate": 8.091574653623873e-06, + "loss": 3.3369, + "step": 31670 + }, + { + "epoch": 0.9281476598549769, + "grad_norm": 10.8491792678833, + "learning_rate": 8.090331560092546e-06, + "loss": 3.3352, + "step": 31680 + }, + { + "epoch": 0.928440635757709, + "grad_norm": 11.028058052062988, + "learning_rate": 8.089088157391885e-06, + "loss": 3.3036, + "step": 31690 + }, + { + "epoch": 0.9287336116604409, + "grad_norm": 10.876716613769531, + "learning_rate": 8.087844445646278e-06, + "loss": 3.3452, + "step": 31700 + }, + { + "epoch": 0.9290265875631729, + "grad_norm": 11.116703987121582, + "learning_rate": 8.086600424980153e-06, + 
"loss": 3.3104, + "step": 31710 + }, + { + "epoch": 0.9293195634659049, + "grad_norm": 10.900169372558594, + "learning_rate": 8.08535609551797e-06, + "loss": 3.3367, + "step": 31720 + }, + { + "epoch": 0.929612539368637, + "grad_norm": 10.327814102172852, + "learning_rate": 8.084111457384213e-06, + "loss": 3.3586, + "step": 31730 + }, + { + "epoch": 0.9299055152713689, + "grad_norm": 11.269112586975098, + "learning_rate": 8.082866510703402e-06, + "loss": 3.3147, + "step": 31740 + }, + { + "epoch": 0.9300227056324617, + "eval_bleu": 0.3351585898609697, + "eval_cap_loss": 0.9546217322349548, + "eval_con_loss": 1.2799651622772217, + "eval_loss": 3.514552116394043, + "step": 31744 + }, + { + "epoch": 0.9300227056324617, + "eval_bleu": 0.3351585898609697, + "eval_cap_loss": 0.9546217322349548, + "eval_con_loss": 1.2799651622772217, + "eval_loss": 3.514552116394043, + "eval_runtime": 61.1667, + "eval_samples_per_second": 326.975, + "eval_steps_per_second": 0.327, + "step": 31744 + }, + { + "epoch": 0.9301984911741009, + "grad_norm": 9.38675308227539, + "learning_rate": 8.081621255600088e-06, + "loss": 3.3194, + "step": 31750 + }, + { + "epoch": 0.930491467076833, + "grad_norm": 11.749146461486816, + "learning_rate": 8.08037569219885e-06, + "loss": 3.332, + "step": 31760 + }, + { + "epoch": 0.930784442979565, + "grad_norm": 10.64948558807373, + "learning_rate": 8.0791298206243e-06, + "loss": 3.3213, + "step": 31770 + }, + { + "epoch": 0.9310774188822969, + "grad_norm": 11.445023536682129, + "learning_rate": 8.077883641001075e-06, + "loss": 3.3456, + "step": 31780 + }, + { + "epoch": 0.9313703947850289, + "grad_norm": 10.436490058898926, + "learning_rate": 8.076637153453855e-06, + "loss": 3.326, + "step": 31790 + }, + { + "epoch": 0.931663370687761, + "grad_norm": 11.10351276397705, + "learning_rate": 8.075390358107341e-06, + "loss": 3.3085, + "step": 31800 + }, + { + "epoch": 0.931956346590493, + "grad_norm": 10.10634708404541, + "learning_rate": 8.074143255086268e-06, + 
"loss": 3.3131, + "step": 31810 + }, + { + "epoch": 0.9322493224932249, + "grad_norm": 11.251418113708496, + "learning_rate": 8.072895844515398e-06, + "loss": 3.3154, + "step": 31820 + }, + { + "epoch": 0.9325422983959569, + "grad_norm": 10.407242774963379, + "learning_rate": 8.071648126519533e-06, + "loss": 3.302, + "step": 31830 + }, + { + "epoch": 0.932835274298689, + "grad_norm": 11.221912384033203, + "learning_rate": 8.070400101223495e-06, + "loss": 3.3142, + "step": 31840 + }, + { + "epoch": 0.933128250201421, + "grad_norm": 11.28333854675293, + "learning_rate": 8.06915176875214e-06, + "loss": 3.336, + "step": 31850 + }, + { + "epoch": 0.9334212261041529, + "grad_norm": 9.819619178771973, + "learning_rate": 8.067903129230363e-06, + "loss": 3.3277, + "step": 31860 + }, + { + "epoch": 0.9337142020068849, + "grad_norm": 11.96299934387207, + "learning_rate": 8.066654182783079e-06, + "loss": 3.3102, + "step": 31870 + }, + { + "epoch": 0.934007177909617, + "grad_norm": 10.512228965759277, + "learning_rate": 8.065404929535237e-06, + "loss": 3.3431, + "step": 31880 + }, + { + "epoch": 0.934300153812349, + "grad_norm": 10.681817054748535, + "learning_rate": 8.064155369611819e-06, + "loss": 3.3431, + "step": 31890 + }, + { + "epoch": 0.9345931297150809, + "grad_norm": 11.337838172912598, + "learning_rate": 8.062905503137832e-06, + "loss": 3.3343, + "step": 31900 + }, + { + "epoch": 0.9348861056178129, + "grad_norm": 10.590629577636719, + "learning_rate": 8.061655330238324e-06, + "loss": 3.3187, + "step": 31910 + }, + { + "epoch": 0.935179081520545, + "grad_norm": 10.531058311462402, + "learning_rate": 8.060404851038364e-06, + "loss": 3.3328, + "step": 31920 + }, + { + "epoch": 0.9354720574232769, + "grad_norm": 10.715607643127441, + "learning_rate": 8.059154065663054e-06, + "loss": 3.3418, + "step": 31930 + }, + { + "epoch": 0.9357650333260089, + "grad_norm": 11.476642608642578, + "learning_rate": 8.057902974237531e-06, + "loss": 3.3369, + "step": 31940 + }, + { + 
"epoch": 0.9360580092287409, + "grad_norm": 11.194515228271484, + "learning_rate": 8.056651576886957e-06, + "loss": 3.3286, + "step": 31950 + }, + { + "epoch": 0.936350985131473, + "grad_norm": 10.588412284851074, + "learning_rate": 8.055525057808995e-06, + "loss": 3.3423, + "step": 31960 + }, + { + "epoch": 0.9366439610342049, + "grad_norm": 11.974124908447266, + "learning_rate": 8.054273079545762e-06, + "loss": 3.3457, + "step": 31970 + }, + { + "epoch": 0.9369369369369369, + "grad_norm": 10.717418670654297, + "learning_rate": 8.053020795720627e-06, + "loss": 3.3203, + "step": 31980 + }, + { + "epoch": 0.9372299128396689, + "grad_norm": 9.269216537475586, + "learning_rate": 8.051768206458874e-06, + "loss": 3.3304, + "step": 31990 + }, + { + "epoch": 0.937522888742401, + "grad_norm": 10.276286125183105, + "learning_rate": 8.050515311885818e-06, + "loss": 3.3246, + "step": 32000 + }, + { + "epoch": 0.9378158646451329, + "grad_norm": 10.924190521240234, + "learning_rate": 8.049262112126803e-06, + "loss": 3.3217, + "step": 32010 + }, + { + "epoch": 0.9381088405478649, + "grad_norm": 10.820732116699219, + "learning_rate": 8.048008607307206e-06, + "loss": 3.3203, + "step": 32020 + }, + { + "epoch": 0.938401816450597, + "grad_norm": 10.1741361618042, + "learning_rate": 8.04675479755243e-06, + "loss": 3.3016, + "step": 32030 + }, + { + "epoch": 0.938694792353329, + "grad_norm": 10.783428192138672, + "learning_rate": 8.04550068298791e-06, + "loss": 3.3484, + "step": 32040 + }, + { + "epoch": 0.9389877682560609, + "grad_norm": 10.645389556884766, + "learning_rate": 8.044246263739116e-06, + "loss": 3.3294, + "step": 32050 + }, + { + "epoch": 0.9392807441587929, + "grad_norm": 10.495849609375, + "learning_rate": 8.042991539931546e-06, + "loss": 3.3211, + "step": 32060 + }, + { + "epoch": 0.939573720061525, + "grad_norm": 11.200181007385254, + "learning_rate": 8.041736511690726e-06, + "loss": 3.3311, + "step": 32070 + }, + { + "epoch": 0.939866695964257, + "grad_norm": 
9.991130828857422, + "learning_rate": 8.040481179142212e-06, + "loss": 3.2992, + "step": 32080 + }, + { + "epoch": 0.9401596718669889, + "grad_norm": 10.435827255249023, + "learning_rate": 8.039225542411598e-06, + "loss": 3.3155, + "step": 32090 + }, + { + "epoch": 0.9404526477697209, + "grad_norm": 11.033099174499512, + "learning_rate": 8.037969601624495e-06, + "loss": 3.3316, + "step": 32100 + }, + { + "epoch": 0.940745623672453, + "grad_norm": 11.093291282653809, + "learning_rate": 8.036713356906561e-06, + "loss": 3.3366, + "step": 32110 + }, + { + "epoch": 0.941038599575185, + "grad_norm": 10.146787643432617, + "learning_rate": 8.035456808383468e-06, + "loss": 3.3336, + "step": 32120 + }, + { + "epoch": 0.9413315754779169, + "grad_norm": 9.978157997131348, + "learning_rate": 8.034199956180934e-06, + "loss": 3.3093, + "step": 32130 + }, + { + "epoch": 0.9416245513806489, + "grad_norm": 11.027117729187012, + "learning_rate": 8.032942800424692e-06, + "loss": 3.3236, + "step": 32140 + }, + { + "epoch": 0.941917527283381, + "grad_norm": 10.961113929748535, + "learning_rate": 8.03168534124052e-06, + "loss": 3.3129, + "step": 32150 + }, + { + "epoch": 0.942210503186113, + "grad_norm": 10.640993118286133, + "learning_rate": 8.030427578754213e-06, + "loss": 3.3389, + "step": 32160 + }, + { + "epoch": 0.9425034790888449, + "grad_norm": 10.322227478027344, + "learning_rate": 8.029169513091608e-06, + "loss": 3.3337, + "step": 32170 + }, + { + "epoch": 0.9427964549915769, + "grad_norm": 10.221193313598633, + "learning_rate": 8.027911144378563e-06, + "loss": 3.3168, + "step": 32180 + }, + { + "epoch": 0.943089430894309, + "grad_norm": 10.694893836975098, + "learning_rate": 8.026652472740974e-06, + "loss": 3.3242, + "step": 32190 + }, + { + "epoch": 0.943382406797041, + "grad_norm": 10.990355491638184, + "learning_rate": 8.02539349830476e-06, + "loss": 3.3443, + "step": 32200 + }, + { + "epoch": 0.9436753826997729, + "grad_norm": 11.260810852050781, + "learning_rate": 
8.024134221195876e-06, + "loss": 3.3108, + "step": 32210 + }, + { + "epoch": 0.9439683586025049, + "grad_norm": 10.403327941894531, + "learning_rate": 8.022874641540304e-06, + "loss": 3.3176, + "step": 32220 + }, + { + "epoch": 0.944261334505237, + "grad_norm": 11.21558952331543, + "learning_rate": 8.02161475946406e-06, + "loss": 3.3242, + "step": 32230 + }, + { + "epoch": 0.944554310407969, + "grad_norm": 9.90707778930664, + "learning_rate": 8.020354575093183e-06, + "loss": 3.3413, + "step": 32240 + }, + { + "epoch": 0.9448472863107009, + "grad_norm": 10.530220031738281, + "learning_rate": 8.019094088553753e-06, + "loss": 3.3097, + "step": 32250 + }, + { + "epoch": 0.9450230718523401, + "eval_bleu": 0.3357178905698592, + "eval_cap_loss": 0.9515960216522217, + "eval_con_loss": 1.2720509767532349, + "eval_loss": 3.4956979751586914, + "step": 32256 + }, + { + "epoch": 0.9450230718523401, + "eval_bleu": 0.3357178905698592, + "eval_cap_loss": 0.9515960216522217, + "eval_con_loss": 1.2720509767532349, + "eval_loss": 3.4956979751586914, + "eval_runtime": 64.3256, + "eval_samples_per_second": 310.918, + "eval_steps_per_second": 0.311, + "step": 32256 + }, + { + "epoch": 0.945140262213433, + "grad_norm": 10.040410995483398, + "learning_rate": 8.017833299971868e-06, + "loss": 3.3245, + "step": 32260 + }, + { + "epoch": 0.945433238116165, + "grad_norm": 11.011148452758789, + "learning_rate": 8.01657220947367e-06, + "loss": 3.3218, + "step": 32270 + }, + { + "epoch": 0.9457262140188969, + "grad_norm": 10.638496398925781, + "learning_rate": 8.015310817185315e-06, + "loss": 3.332, + "step": 32280 + }, + { + "epoch": 0.9460191899216289, + "grad_norm": 11.365195274353027, + "learning_rate": 8.014049123233003e-06, + "loss": 3.305, + "step": 32290 + }, + { + "epoch": 0.946312165824361, + "grad_norm": 10.817123413085938, + "learning_rate": 8.01278712774296e-06, + "loss": 3.2989, + "step": 32300 + }, + { + "epoch": 0.946605141727093, + "grad_norm": 11.624929428100586, + 
"learning_rate": 8.011524830841437e-06, + "loss": 3.3142, + "step": 32310 + }, + { + "epoch": 0.9468981176298249, + "grad_norm": 11.657447814941406, + "learning_rate": 8.010262232654722e-06, + "loss": 3.3166, + "step": 32320 + }, + { + "epoch": 0.9471910935325569, + "grad_norm": 9.559881210327148, + "learning_rate": 8.00899933330913e-06, + "loss": 3.2943, + "step": 32330 + }, + { + "epoch": 0.947484069435289, + "grad_norm": 10.641284942626953, + "learning_rate": 8.007736132931006e-06, + "loss": 3.3288, + "step": 32340 + }, + { + "epoch": 0.947777045338021, + "grad_norm": 10.616351127624512, + "learning_rate": 8.006472631646726e-06, + "loss": 3.3217, + "step": 32350 + }, + { + "epoch": 0.9480700212407529, + "grad_norm": 10.667989730834961, + "learning_rate": 8.005208829582697e-06, + "loss": 3.3371, + "step": 32360 + }, + { + "epoch": 0.9483629971434849, + "grad_norm": 10.22488021850586, + "learning_rate": 8.003944726865354e-06, + "loss": 3.3047, + "step": 32370 + }, + { + "epoch": 0.948655973046217, + "grad_norm": 10.79202938079834, + "learning_rate": 8.002680323621163e-06, + "loss": 3.32, + "step": 32380 + }, + { + "epoch": 0.948948948948949, + "grad_norm": 10.107049942016602, + "learning_rate": 8.00141561997662e-06, + "loss": 3.3218, + "step": 32390 + }, + { + "epoch": 0.9492419248516809, + "grad_norm": 10.72153377532959, + "learning_rate": 8.000150616058248e-06, + "loss": 3.3322, + "step": 32400 + }, + { + "epoch": 0.9495349007544129, + "grad_norm": 11.145442008972168, + "learning_rate": 7.99888531199261e-06, + "loss": 3.3234, + "step": 32410 + }, + { + "epoch": 0.949827876657145, + "grad_norm": 11.403520584106445, + "learning_rate": 7.997619707906286e-06, + "loss": 3.3037, + "step": 32420 + }, + { + "epoch": 0.950120852559877, + "grad_norm": 10.39166259765625, + "learning_rate": 7.996353803925896e-06, + "loss": 3.3358, + "step": 32430 + }, + { + "epoch": 0.9504138284626089, + "grad_norm": 11.27517032623291, + "learning_rate": 7.995087600178083e-06, + "loss": 
3.3093, + "step": 32440 + }, + { + "epoch": 0.9507068043653409, + "grad_norm": 10.542811393737793, + "learning_rate": 7.993821096789527e-06, + "loss": 3.3242, + "step": 32450 + }, + { + "epoch": 0.950999780268073, + "grad_norm": 10.166762351989746, + "learning_rate": 7.99255429388693e-06, + "loss": 3.303, + "step": 32460 + }, + { + "epoch": 0.951292756170805, + "grad_norm": 10.878371238708496, + "learning_rate": 7.991287191597032e-06, + "loss": 3.3233, + "step": 32470 + }, + { + "epoch": 0.9515857320735369, + "grad_norm": 11.122732162475586, + "learning_rate": 7.990019790046595e-06, + "loss": 3.3181, + "step": 32480 + }, + { + "epoch": 0.951878707976269, + "grad_norm": 10.865165710449219, + "learning_rate": 7.988752089362419e-06, + "loss": 3.3035, + "step": 32490 + }, + { + "epoch": 0.952171683879001, + "grad_norm": 11.030380249023438, + "learning_rate": 7.987484089671327e-06, + "loss": 3.304, + "step": 32500 + }, + { + "epoch": 0.952464659781733, + "grad_norm": 10.428754806518555, + "learning_rate": 7.986215791100176e-06, + "loss": 3.2968, + "step": 32510 + }, + { + "epoch": 0.9527576356844649, + "grad_norm": 10.867204666137695, + "learning_rate": 7.984947193775853e-06, + "loss": 3.3142, + "step": 32520 + }, + { + "epoch": 0.953050611587197, + "grad_norm": 11.898416519165039, + "learning_rate": 7.98367829782527e-06, + "loss": 3.3091, + "step": 32530 + }, + { + "epoch": 0.953343587489929, + "grad_norm": 11.0288667678833, + "learning_rate": 7.982409103375377e-06, + "loss": 3.3072, + "step": 32540 + }, + { + "epoch": 0.953636563392661, + "grad_norm": 10.859001159667969, + "learning_rate": 7.981139610553146e-06, + "loss": 3.3224, + "step": 32550 + }, + { + "epoch": 0.9539295392953929, + "grad_norm": 12.132184982299805, + "learning_rate": 7.979869819485584e-06, + "loss": 3.3012, + "step": 32560 + }, + { + "epoch": 0.954222515198125, + "grad_norm": 11.021322250366211, + "learning_rate": 7.978599730299726e-06, + "loss": 3.2878, + "step": 32570 + }, + { + "epoch": 
0.954515491100857, + "grad_norm": 10.047565460205078, + "learning_rate": 7.977329343122637e-06, + "loss": 3.2991, + "step": 32580 + }, + { + "epoch": 0.9548084670035889, + "grad_norm": 10.324673652648926, + "learning_rate": 7.97605865808141e-06, + "loss": 3.3053, + "step": 32590 + }, + { + "epoch": 0.9551014429063209, + "grad_norm": 10.63984489440918, + "learning_rate": 7.974787675303168e-06, + "loss": 3.3166, + "step": 32600 + }, + { + "epoch": 0.955394418809053, + "grad_norm": 11.176955223083496, + "learning_rate": 7.973516394915073e-06, + "loss": 3.3066, + "step": 32610 + }, + { + "epoch": 0.955687394711785, + "grad_norm": 11.27182674407959, + "learning_rate": 7.9722448170443e-06, + "loss": 3.3169, + "step": 32620 + }, + { + "epoch": 0.9559803706145169, + "grad_norm": 11.687674522399902, + "learning_rate": 7.970972941818067e-06, + "loss": 3.3096, + "step": 32630 + }, + { + "epoch": 0.9562733465172489, + "grad_norm": 10.969283103942871, + "learning_rate": 7.96970076936362e-06, + "loss": 3.3213, + "step": 32640 + }, + { + "epoch": 0.956566322419981, + "grad_norm": 10.383974075317383, + "learning_rate": 7.968428299808228e-06, + "loss": 3.3108, + "step": 32650 + }, + { + "epoch": 0.956859298322713, + "grad_norm": 11.310135841369629, + "learning_rate": 7.967155533279197e-06, + "loss": 3.3335, + "step": 32660 + }, + { + "epoch": 0.9571522742254449, + "grad_norm": 10.714847564697266, + "learning_rate": 7.965882469903855e-06, + "loss": 3.3187, + "step": 32670 + }, + { + "epoch": 0.9574452501281769, + "grad_norm": 11.69101619720459, + "learning_rate": 7.964609109809571e-06, + "loss": 3.3365, + "step": 32680 + }, + { + "epoch": 0.957738226030909, + "grad_norm": 10.687361717224121, + "learning_rate": 7.963335453123734e-06, + "loss": 3.3101, + "step": 32690 + }, + { + "epoch": 0.958031201933641, + "grad_norm": 10.6578369140625, + "learning_rate": 7.962061499973765e-06, + "loss": 3.32, + "step": 32700 + }, + { + "epoch": 0.9583241778363729, + "grad_norm": 10.931675910949707, 
+ "learning_rate": 7.960787250487117e-06, + "loss": 3.3177, + "step": 32710 + }, + { + "epoch": 0.9586171537391049, + "grad_norm": 10.002975463867188, + "learning_rate": 7.959512704791269e-06, + "loss": 3.2968, + "step": 32720 + }, + { + "epoch": 0.958910129641837, + "grad_norm": 10.312373161315918, + "learning_rate": 7.958237863013734e-06, + "loss": 3.299, + "step": 32730 + }, + { + "epoch": 0.959203105544569, + "grad_norm": 10.839104652404785, + "learning_rate": 7.95696272528205e-06, + "loss": 3.3158, + "step": 32740 + }, + { + "epoch": 0.9594960814473009, + "grad_norm": 10.2455415725708, + "learning_rate": 7.95568729172379e-06, + "loss": 3.3022, + "step": 32750 + }, + { + "epoch": 0.959789057350033, + "grad_norm": 10.484140396118164, + "learning_rate": 7.954411562466552e-06, + "loss": 3.3105, + "step": 32760 + }, + { + "epoch": 0.9600234380722186, + "eval_bleu": 0.3358343063713027, + "eval_cap_loss": 0.9498487114906311, + "eval_con_loss": 1.2708873748779297, + "eval_loss": 3.4916234016418457, + "step": 32768 + }, + { + "epoch": 0.9600234380722186, + "eval_bleu": 0.3358343063713027, + "eval_cap_loss": 0.9498487114906311, + "eval_con_loss": 1.2708873748779297, + "eval_loss": 3.4916234016418457, + "eval_runtime": 55.8961, + "eval_samples_per_second": 357.807, + "eval_steps_per_second": 0.358, + "step": 32768 + }, + { + "epoch": 0.960082033252765, + "grad_norm": 10.772202491760254, + "learning_rate": 7.953135537637964e-06, + "loss": 3.3093, + "step": 32770 + }, + { + "epoch": 0.960375009155497, + "grad_norm": 10.11131477355957, + "learning_rate": 7.951859217365687e-06, + "loss": 3.3202, + "step": 32780 + }, + { + "epoch": 0.9606679850582289, + "grad_norm": 10.681462287902832, + "learning_rate": 7.950582601777407e-06, + "loss": 3.2973, + "step": 32790 + }, + { + "epoch": 0.960960960960961, + "grad_norm": 10.1140775680542, + "learning_rate": 7.949305691000844e-06, + "loss": 3.3014, + "step": 32800 + }, + { + "epoch": 0.961253936863693, + "grad_norm": 
10.078682899475098, + "learning_rate": 7.948028485163744e-06, + "loss": 3.3168, + "step": 32810 + }, + { + "epoch": 0.961546912766425, + "grad_norm": 10.086588859558105, + "learning_rate": 7.946750984393883e-06, + "loss": 3.3356, + "step": 32820 + }, + { + "epoch": 0.9618398886691569, + "grad_norm": 10.177552223205566, + "learning_rate": 7.945473188819068e-06, + "loss": 3.3033, + "step": 32830 + }, + { + "epoch": 0.962132864571889, + "grad_norm": 10.718594551086426, + "learning_rate": 7.944195098567138e-06, + "loss": 3.3033, + "step": 32840 + }, + { + "epoch": 0.962425840474621, + "grad_norm": 10.508200645446777, + "learning_rate": 7.942916713765954e-06, + "loss": 3.2886, + "step": 32850 + }, + { + "epoch": 0.962718816377353, + "grad_norm": 10.85108757019043, + "learning_rate": 7.941638034543412e-06, + "loss": 3.3444, + "step": 32860 + }, + { + "epoch": 0.9630117922800849, + "grad_norm": 10.75610065460205, + "learning_rate": 7.940359061027439e-06, + "loss": 3.3017, + "step": 32870 + }, + { + "epoch": 0.963304768182817, + "grad_norm": 11.24431037902832, + "learning_rate": 7.939079793345984e-06, + "loss": 3.3327, + "step": 32880 + }, + { + "epoch": 0.963597744085549, + "grad_norm": 11.376026153564453, + "learning_rate": 7.937800231627035e-06, + "loss": 3.3315, + "step": 32890 + }, + { + "epoch": 0.9638907199882809, + "grad_norm": 10.34264087677002, + "learning_rate": 7.9365203759986e-06, + "loss": 3.3161, + "step": 32900 + }, + { + "epoch": 0.9641836958910129, + "grad_norm": 10.801630973815918, + "learning_rate": 7.935240226588724e-06, + "loss": 3.2865, + "step": 32910 + }, + { + "epoch": 0.964476671793745, + "grad_norm": 11.370366096496582, + "learning_rate": 7.933959783525478e-06, + "loss": 3.3103, + "step": 32920 + }, + { + "epoch": 0.964769647696477, + "grad_norm": 10.81047248840332, + "learning_rate": 7.932679046936962e-06, + "loss": 3.2996, + "step": 32930 + }, + { + "epoch": 0.9650626235992089, + "grad_norm": 10.510452270507812, + "learning_rate": 
7.931398016951306e-06, + "loss": 3.34, + "step": 32940 + }, + { + "epoch": 0.9653555995019409, + "grad_norm": 10.766789436340332, + "learning_rate": 7.930116693696668e-06, + "loss": 3.2998, + "step": 32950 + }, + { + "epoch": 0.965648575404673, + "grad_norm": 10.514841079711914, + "learning_rate": 7.92883507730124e-06, + "loss": 3.3119, + "step": 32960 + }, + { + "epoch": 0.965941551307405, + "grad_norm": 10.91912841796875, + "learning_rate": 7.927553167893241e-06, + "loss": 3.2871, + "step": 32970 + }, + { + "epoch": 0.9662345272101369, + "grad_norm": 11.75261402130127, + "learning_rate": 7.926270965600913e-06, + "loss": 3.2857, + "step": 32980 + }, + { + "epoch": 0.966527503112869, + "grad_norm": 11.241813659667969, + "learning_rate": 7.924988470552537e-06, + "loss": 3.294, + "step": 32990 + }, + { + "epoch": 0.966820479015601, + "grad_norm": 10.638099670410156, + "learning_rate": 7.92370568287642e-06, + "loss": 3.3077, + "step": 33000 + }, + { + "epoch": 0.967113454918333, + "grad_norm": 10.63379192352295, + "learning_rate": 7.922422602700892e-06, + "loss": 3.3091, + "step": 33010 + }, + { + "epoch": 0.9674064308210649, + "grad_norm": 11.274679183959961, + "learning_rate": 7.921139230154321e-06, + "loss": 3.3103, + "step": 33020 + }, + { + "epoch": 0.967699406723797, + "grad_norm": 10.127496719360352, + "learning_rate": 7.919855565365102e-06, + "loss": 3.2998, + "step": 33030 + }, + { + "epoch": 0.967992382626529, + "grad_norm": 10.40744400024414, + "learning_rate": 7.918571608461657e-06, + "loss": 3.2918, + "step": 33040 + }, + { + "epoch": 0.968285358529261, + "grad_norm": 11.052331924438477, + "learning_rate": 7.917287359572436e-06, + "loss": 3.3042, + "step": 33050 + }, + { + "epoch": 0.9685783344319929, + "grad_norm": 10.48709487915039, + "learning_rate": 7.916002818825924e-06, + "loss": 3.2869, + "step": 33060 + }, + { + "epoch": 0.968871310334725, + "grad_norm": 11.571605682373047, + "learning_rate": 7.914717986350631e-06, + "loss": 3.3273, + "step": 
33070 + }, + { + "epoch": 0.969164286237457, + "grad_norm": 10.583534240722656, + "learning_rate": 7.913432862275094e-06, + "loss": 3.3226, + "step": 33080 + }, + { + "epoch": 0.969457262140189, + "grad_norm": 11.326142311096191, + "learning_rate": 7.912147446727884e-06, + "loss": 3.3129, + "step": 33090 + }, + { + "epoch": 0.9697502380429209, + "grad_norm": 10.263561248779297, + "learning_rate": 7.910861739837598e-06, + "loss": 3.3114, + "step": 33100 + }, + { + "epoch": 0.970043213945653, + "grad_norm": 9.975799560546875, + "learning_rate": 7.909575741732867e-06, + "loss": 3.3106, + "step": 33110 + }, + { + "epoch": 0.970336189848385, + "grad_norm": 11.55835247039795, + "learning_rate": 7.908289452542343e-06, + "loss": 3.3041, + "step": 33120 + }, + { + "epoch": 0.970629165751117, + "grad_norm": 10.901909828186035, + "learning_rate": 7.907002872394716e-06, + "loss": 3.2954, + "step": 33130 + }, + { + "epoch": 0.9709221416538489, + "grad_norm": 10.594654083251953, + "learning_rate": 7.905716001418695e-06, + "loss": 3.3387, + "step": 33140 + }, + { + "epoch": 0.971215117556581, + "grad_norm": 10.813980102539062, + "learning_rate": 7.904428839743031e-06, + "loss": 3.301, + "step": 33150 + }, + { + "epoch": 0.971508093459313, + "grad_norm": 10.636871337890625, + "learning_rate": 7.903141387496488e-06, + "loss": 3.3134, + "step": 33160 + }, + { + "epoch": 0.971801069362045, + "grad_norm": 10.761366844177246, + "learning_rate": 7.901853644807878e-06, + "loss": 3.2974, + "step": 33170 + }, + { + "epoch": 0.9720940452647769, + "grad_norm": 10.680479049682617, + "learning_rate": 7.900565611806024e-06, + "loss": 3.2846, + "step": 33180 + }, + { + "epoch": 0.972387021167509, + "grad_norm": 10.6324462890625, + "learning_rate": 7.899277288619788e-06, + "loss": 3.3159, + "step": 33190 + }, + { + "epoch": 0.972679997070241, + "grad_norm": 10.370050430297852, + "learning_rate": 7.897988675378063e-06, + "loss": 3.3182, + "step": 33200 + }, + { + "epoch": 0.972972972972973, + 
"grad_norm": 10.28499698638916, + "learning_rate": 7.896699772209762e-06, + "loss": 3.2976, + "step": 33210 + }, + { + "epoch": 0.973265948875705, + "grad_norm": 10.145495414733887, + "learning_rate": 7.895410579243836e-06, + "loss": 3.3053, + "step": 33220 + }, + { + "epoch": 0.973558924778437, + "grad_norm": 11.063908576965332, + "learning_rate": 7.894121096609258e-06, + "loss": 3.2995, + "step": 33230 + }, + { + "epoch": 0.973851900681169, + "grad_norm": 10.25475025177002, + "learning_rate": 7.892831324435035e-06, + "loss": 3.2971, + "step": 33240 + }, + { + "epoch": 0.9741448765839009, + "grad_norm": 11.035176277160645, + "learning_rate": 7.891541262850201e-06, + "loss": 3.3056, + "step": 33250 + }, + { + "epoch": 0.974437852486633, + "grad_norm": 10.746499061584473, + "learning_rate": 7.89025091198382e-06, + "loss": 3.2957, + "step": 33260 + }, + { + "epoch": 0.974730828389365, + "grad_norm": 11.991039276123047, + "learning_rate": 7.88896027196498e-06, + "loss": 3.3042, + "step": 33270 + }, + { + "epoch": 0.975023804292097, + "grad_norm": 10.05875015258789, + "learning_rate": 7.887669342922806e-06, + "loss": 3.3054, + "step": 33280 + }, + { + "epoch": 0.975023804292097, + "eval_bleu": 0.33670584819400373, + "eval_cap_loss": 0.9488292932510376, + "eval_con_loss": 1.2619242668151855, + "eval_loss": 3.4726779460906982, + "step": 33280 + }, + { + "epoch": 0.975023804292097, + "eval_bleu": 0.33670584819400373, + "eval_cap_loss": 0.9488292932510376, + "eval_con_loss": 1.2619242668151855, + "eval_loss": 3.4726779460906982, + "eval_runtime": 55.8685, + "eval_samples_per_second": 357.983, + "eval_steps_per_second": 0.358, + "step": 33280 + }, + { + "epoch": 0.9753167801948289, + "grad_norm": 11.158945083618164, + "learning_rate": 7.886378124986445e-06, + "loss": 3.3005, + "step": 33290 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 10.184383392333984, + "learning_rate": 7.885086618285076e-06, + "loss": 3.301, + "step": 33300 + }, + { + "epoch": 
0.975902732000293, + "grad_norm": 10.212288856506348, + "learning_rate": 7.883794822947909e-06, + "loss": 3.288, + "step": 33310 + }, + { + "epoch": 0.976195707903025, + "grad_norm": 10.997262954711914, + "learning_rate": 7.882502739104178e-06, + "loss": 3.315, + "step": 33320 + }, + { + "epoch": 0.9764886838057569, + "grad_norm": 10.055501937866211, + "learning_rate": 7.88121036688315e-06, + "loss": 3.2785, + "step": 33330 + }, + { + "epoch": 0.976781659708489, + "grad_norm": 10.727088928222656, + "learning_rate": 7.87991770641412e-06, + "loss": 3.3167, + "step": 33340 + }, + { + "epoch": 0.977074635611221, + "grad_norm": 10.58312702178955, + "learning_rate": 7.878624757826407e-06, + "loss": 3.3088, + "step": 33350 + }, + { + "epoch": 0.977367611513953, + "grad_norm": 10.56566047668457, + "learning_rate": 7.877331521249368e-06, + "loss": 3.2933, + "step": 33360 + }, + { + "epoch": 0.9776605874166849, + "grad_norm": 11.569347381591797, + "learning_rate": 7.876037996812377e-06, + "loss": 3.2927, + "step": 33370 + }, + { + "epoch": 0.977953563319417, + "grad_norm": 12.011153221130371, + "learning_rate": 7.87474418464485e-06, + "loss": 3.2902, + "step": 33380 + }, + { + "epoch": 0.978246539222149, + "grad_norm": 11.125604629516602, + "learning_rate": 7.873450084876222e-06, + "loss": 3.2951, + "step": 33390 + }, + { + "epoch": 0.978539515124881, + "grad_norm": 10.2149019241333, + "learning_rate": 7.872155697635962e-06, + "loss": 3.313, + "step": 33400 + }, + { + "epoch": 0.9788324910276129, + "grad_norm": 10.83493423461914, + "learning_rate": 7.870861023053561e-06, + "loss": 3.3174, + "step": 33410 + }, + { + "epoch": 0.979125466930345, + "grad_norm": 10.550976753234863, + "learning_rate": 7.86956606125855e-06, + "loss": 3.285, + "step": 33420 + }, + { + "epoch": 0.979418442833077, + "grad_norm": 10.414582252502441, + "learning_rate": 7.868270812380477e-06, + "loss": 3.2743, + "step": 33430 + }, + { + "epoch": 0.979711418735809, + "grad_norm": 10.319621086120605, + 
"learning_rate": 7.866975276548926e-06, + "loss": 3.2887, + "step": 33440 + }, + { + "epoch": 0.9800043946385409, + "grad_norm": 10.384062767028809, + "learning_rate": 7.86567945389351e-06, + "loss": 3.2886, + "step": 33450 + }, + { + "epoch": 0.980297370541273, + "grad_norm": 11.36227798461914, + "learning_rate": 7.864383344543866e-06, + "loss": 3.2803, + "step": 33460 + }, + { + "epoch": 0.980590346444005, + "grad_norm": 9.625499725341797, + "learning_rate": 7.863086948629661e-06, + "loss": 3.3062, + "step": 33470 + }, + { + "epoch": 0.980883322346737, + "grad_norm": 9.772439956665039, + "learning_rate": 7.861790266280593e-06, + "loss": 3.2851, + "step": 33480 + }, + { + "epoch": 0.981176298249469, + "grad_norm": 10.4647855758667, + "learning_rate": 7.860493297626388e-06, + "loss": 3.3039, + "step": 33490 + }, + { + "epoch": 0.981469274152201, + "grad_norm": 10.802864074707031, + "learning_rate": 7.859196042796799e-06, + "loss": 3.2997, + "step": 33500 + }, + { + "epoch": 0.981762250054933, + "grad_norm": 10.691347122192383, + "learning_rate": 7.857898501921608e-06, + "loss": 3.2945, + "step": 33510 + }, + { + "epoch": 0.982055225957665, + "grad_norm": 10.313589096069336, + "learning_rate": 7.856600675130628e-06, + "loss": 3.3059, + "step": 33520 + }, + { + "epoch": 0.982348201860397, + "grad_norm": 10.345171928405762, + "learning_rate": 7.855302562553698e-06, + "loss": 3.2703, + "step": 33530 + }, + { + "epoch": 0.982641177763129, + "grad_norm": 11.079147338867188, + "learning_rate": 7.854004164320686e-06, + "loss": 3.2863, + "step": 33540 + }, + { + "epoch": 0.982934153665861, + "grad_norm": 9.624765396118164, + "learning_rate": 7.85270548056149e-06, + "loss": 3.2911, + "step": 33550 + }, + { + "epoch": 0.9832271295685929, + "grad_norm": 10.949725151062012, + "learning_rate": 7.851406511406036e-06, + "loss": 3.3013, + "step": 33560 + }, + { + "epoch": 0.983520105471325, + "grad_norm": 10.479231834411621, + "learning_rate": 7.850107256984276e-06, + "loss": 
3.3082, + "step": 33570 + }, + { + "epoch": 0.983813081374057, + "grad_norm": 9.782393455505371, + "learning_rate": 7.848807717426194e-06, + "loss": 3.3124, + "step": 33580 + }, + { + "epoch": 0.984106057276789, + "grad_norm": 10.457330703735352, + "learning_rate": 7.847507892861804e-06, + "loss": 3.3064, + "step": 33590 + }, + { + "epoch": 0.9843990331795209, + "grad_norm": 9.954065322875977, + "learning_rate": 7.846207783421141e-06, + "loss": 3.292, + "step": 33600 + }, + { + "epoch": 0.984692009082253, + "grad_norm": 9.698078155517578, + "learning_rate": 7.844907389234273e-06, + "loss": 3.3122, + "step": 33610 + }, + { + "epoch": 0.984984984984985, + "grad_norm": 10.40878677368164, + "learning_rate": 7.843606710431302e-06, + "loss": 3.3135, + "step": 33620 + }, + { + "epoch": 0.985277960887717, + "grad_norm": 11.082322120666504, + "learning_rate": 7.84230574714235e-06, + "loss": 3.2996, + "step": 33630 + }, + { + "epoch": 0.9855709367904489, + "grad_norm": 10.973125457763672, + "learning_rate": 7.84100449949757e-06, + "loss": 3.2757, + "step": 33640 + }, + { + "epoch": 0.985863912693181, + "grad_norm": 10.465559005737305, + "learning_rate": 7.839702967627145e-06, + "loss": 3.3073, + "step": 33650 + }, + { + "epoch": 0.986156888595913, + "grad_norm": 10.878790855407715, + "learning_rate": 7.838401151661285e-06, + "loss": 3.2969, + "step": 33660 + }, + { + "epoch": 0.986449864498645, + "grad_norm": 10.461912155151367, + "learning_rate": 7.837099051730229e-06, + "loss": 3.2776, + "step": 33670 + }, + { + "epoch": 0.9867428404013769, + "grad_norm": 10.694326400756836, + "learning_rate": 7.835796667964246e-06, + "loss": 3.2725, + "step": 33680 + }, + { + "epoch": 0.987035816304109, + "grad_norm": 10.151097297668457, + "learning_rate": 7.834494000493632e-06, + "loss": 3.2721, + "step": 33690 + }, + { + "epoch": 0.987328792206841, + "grad_norm": 12.18084716796875, + "learning_rate": 7.833191049448706e-06, + "loss": 3.3261, + "step": 33700 + }, + { + "epoch": 
0.987621768109573, + "grad_norm": 11.854268074035645, + "learning_rate": 7.831887814959826e-06, + "loss": 3.299, + "step": 33710 + }, + { + "epoch": 0.987914744012305, + "grad_norm": 10.105633735656738, + "learning_rate": 7.830584297157371e-06, + "loss": 3.2863, + "step": 33720 + }, + { + "epoch": 0.988207719915037, + "grad_norm": 11.147832870483398, + "learning_rate": 7.829280496171751e-06, + "loss": 3.2905, + "step": 33730 + }, + { + "epoch": 0.988500695817769, + "grad_norm": 9.4248046875, + "learning_rate": 7.827976412133404e-06, + "loss": 3.2942, + "step": 33740 + }, + { + "epoch": 0.988793671720501, + "grad_norm": 11.087652206420898, + "learning_rate": 7.826672045172795e-06, + "loss": 3.2869, + "step": 33750 + }, + { + "epoch": 0.989086647623233, + "grad_norm": 10.918313026428223, + "learning_rate": 7.825367395420414e-06, + "loss": 3.2982, + "step": 33760 + }, + { + "epoch": 0.989379623525965, + "grad_norm": 9.490056037902832, + "learning_rate": 7.82406246300679e-06, + "loss": 3.2681, + "step": 33770 + }, + { + "epoch": 0.989672599428697, + "grad_norm": 10.822103500366211, + "learning_rate": 7.822757248062472e-06, + "loss": 3.3104, + "step": 33780 + }, + { + "epoch": 0.989965575331429, + "grad_norm": 11.577491760253906, + "learning_rate": 7.821451750718036e-06, + "loss": 3.3027, + "step": 33790 + }, + { + "epoch": 0.9900241705119754, + "eval_bleu": 0.3365688145723216, + "eval_cap_loss": 0.9497959613800049, + "eval_con_loss": 1.2610042095184326, + "eval_loss": 3.471804141998291, + "step": 33792 + }, + { + "epoch": 0.9900241705119754, + "eval_bleu": 0.3365688145723216, + "eval_cap_loss": 0.9497959613800049, + "eval_con_loss": 1.2610042095184326, + "eval_loss": 3.471804141998291, + "eval_runtime": 56.1304, + "eval_samples_per_second": 356.313, + "eval_steps_per_second": 0.356, + "step": 33792 + }, + { + "epoch": 0.990258551234161, + "grad_norm": 10.270682334899902, + "learning_rate": 7.820145971104093e-06, + "loss": 3.293, + "step": 33800 + }, + { + "epoch": 
0.990551527136893, + "grad_norm": 10.472042083740234, + "learning_rate": 7.818839909351278e-06, + "loss": 3.2827, + "step": 33810 + }, + { + "epoch": 0.990844503039625, + "grad_norm": 11.610106468200684, + "learning_rate": 7.817533565590251e-06, + "loss": 3.2752, + "step": 33820 + }, + { + "epoch": 0.991137478942357, + "grad_norm": 10.43080997467041, + "learning_rate": 7.816226939951707e-06, + "loss": 3.2844, + "step": 33830 + }, + { + "epoch": 0.991430454845089, + "grad_norm": 9.712448120117188, + "learning_rate": 7.814920032566367e-06, + "loss": 3.3069, + "step": 33840 + }, + { + "epoch": 0.991723430747821, + "grad_norm": 10.602527618408203, + "learning_rate": 7.81361284356498e-06, + "loss": 3.2975, + "step": 33850 + }, + { + "epoch": 0.992016406650553, + "grad_norm": 10.134773254394531, + "learning_rate": 7.812305373078318e-06, + "loss": 3.2898, + "step": 33860 + }, + { + "epoch": 0.992309382553285, + "grad_norm": 10.09359359741211, + "learning_rate": 7.810997621237188e-06, + "loss": 3.2793, + "step": 33870 + }, + { + "epoch": 0.992602358456017, + "grad_norm": 9.609041213989258, + "learning_rate": 7.809689588172424e-06, + "loss": 3.2692, + "step": 33880 + }, + { + "epoch": 0.992895334358749, + "grad_norm": 11.214205741882324, + "learning_rate": 7.808381274014885e-06, + "loss": 3.2799, + "step": 33890 + }, + { + "epoch": 0.993188310261481, + "grad_norm": 11.359757423400879, + "learning_rate": 7.807072678895462e-06, + "loss": 3.2875, + "step": 33900 + }, + { + "epoch": 0.9934812861642129, + "grad_norm": 11.128511428833008, + "learning_rate": 7.805763802945072e-06, + "loss": 3.2876, + "step": 33910 + }, + { + "epoch": 0.993774262066945, + "grad_norm": 10.624732971191406, + "learning_rate": 7.804454646294657e-06, + "loss": 3.2937, + "step": 33920 + }, + { + "epoch": 0.994067237969677, + "grad_norm": 11.785651206970215, + "learning_rate": 7.803145209075195e-06, + "loss": 3.2761, + "step": 33930 + }, + { + "epoch": 0.994360213872409, + "grad_norm": 10.744977951049805, 
+ "learning_rate": 7.801835491417683e-06, + "loss": 3.266, + "step": 33940 + }, + { + "epoch": 0.994653189775141, + "grad_norm": 10.555571556091309, + "learning_rate": 7.800525493453158e-06, + "loss": 3.2991, + "step": 33950 + }, + { + "epoch": 0.994946165677873, + "grad_norm": 11.071915626525879, + "learning_rate": 7.799346255730898e-06, + "loss": 3.2823, + "step": 33960 + }, + { + "epoch": 0.995239141580605, + "grad_norm": 10.716797828674316, + "learning_rate": 7.798035725544123e-06, + "loss": 3.2915, + "step": 33970 + }, + { + "epoch": 0.995532117483337, + "grad_norm": 11.03663158416748, + "learning_rate": 7.796724915430472e-06, + "loss": 3.3051, + "step": 33980 + }, + { + "epoch": 0.995825093386069, + "grad_norm": 10.15781307220459, + "learning_rate": 7.795413825521086e-06, + "loss": 3.3114, + "step": 33990 + }, + { + "epoch": 0.996118069288801, + "grad_norm": 11.091156005859375, + "learning_rate": 7.79410245594713e-06, + "loss": 3.2965, + "step": 34000 + }, + { + "epoch": 0.996411045191533, + "grad_norm": 10.130446434020996, + "learning_rate": 7.792790806839803e-06, + "loss": 3.2707, + "step": 34010 + }, + { + "epoch": 0.996704021094265, + "grad_norm": 10.01889419555664, + "learning_rate": 7.791478878330323e-06, + "loss": 3.2955, + "step": 34020 + }, + { + "epoch": 0.996996996996997, + "grad_norm": 12.314446449279785, + "learning_rate": 7.790166670549944e-06, + "loss": 3.3008, + "step": 34030 + }, + { + "epoch": 0.997289972899729, + "grad_norm": 11.598437309265137, + "learning_rate": 7.788854183629941e-06, + "loss": 3.3033, + "step": 34040 + }, + { + "epoch": 0.997582948802461, + "grad_norm": 10.49874210357666, + "learning_rate": 7.787541417701625e-06, + "loss": 3.265, + "step": 34050 + }, + { + "epoch": 0.997875924705193, + "grad_norm": 10.71951675415039, + "learning_rate": 7.786228372896324e-06, + "loss": 3.2986, + "step": 34060 + }, + { + "epoch": 0.998168900607925, + "grad_norm": 9.745684623718262, + "learning_rate": 7.78491504934541e-06, + "loss": 3.2981, 
+ "step": 34070 + }, + { + "epoch": 0.998461876510657, + "grad_norm": 10.39040470123291, + "learning_rate": 7.783601447180264e-06, + "loss": 3.2893, + "step": 34080 + }, + { + "epoch": 0.998754852413389, + "grad_norm": 10.207707405090332, + "learning_rate": 7.782287566532308e-06, + "loss": 3.2527, + "step": 34090 + }, + { + "epoch": 0.999047828316121, + "grad_norm": 10.04611587524414, + "learning_rate": 7.780973407532987e-06, + "loss": 3.2771, + "step": 34100 + }, + { + "epoch": 0.999340804218853, + "grad_norm": 9.11734676361084, + "learning_rate": 7.77965897031378e-06, + "loss": 3.2987, + "step": 34110 + }, + { + "epoch": 0.999633780121585, + "grad_norm": 11.055103302001953, + "learning_rate": 7.778344255006179e-06, + "loss": 3.3032, + "step": 34120 + }, + { + "epoch": 0.999926756024317, + "grad_norm": 11.929482460021973, + "learning_rate": 7.777029261741722e-06, + "loss": 3.2849, + "step": 34130 + }, + { + "epoch": 1.000219731927049, + "grad_norm": 19.192760467529297, + "learning_rate": 7.77571399065196e-06, + "loss": 3.2711, + "step": 34140 + }, + { + "epoch": 1.000512707829781, + "grad_norm": 20.37128448486328, + "learning_rate": 7.774398441868483e-06, + "loss": 3.3021, + "step": 34150 + }, + { + "epoch": 1.000805683732513, + "grad_norm": 22.83686637878418, + "learning_rate": 7.773082615522903e-06, + "loss": 3.2863, + "step": 34160 + }, + { + "epoch": 1.001098659635245, + "grad_norm": 19.462738037109375, + "learning_rate": 7.771766511746857e-06, + "loss": 3.3347, + "step": 34170 + }, + { + "epoch": 1.001391635537977, + "grad_norm": 19.480838775634766, + "learning_rate": 7.770450130672016e-06, + "loss": 3.3073, + "step": 34180 + }, + { + "epoch": 1.001684611440709, + "grad_norm": 21.243896484375, + "learning_rate": 7.769133472430077e-06, + "loss": 3.328, + "step": 34190 + }, + { + "epoch": 1.0019775873434411, + "grad_norm": 21.352018356323242, + "learning_rate": 7.767816537152763e-06, + "loss": 3.338, + "step": 34200 + }, + { + "epoch": 1.002270563246173, + 
"grad_norm": 22.632112503051758, + "learning_rate": 7.766499324971825e-06, + "loss": 3.3429, + "step": 34210 + }, + { + "epoch": 1.002563539148905, + "grad_norm": 23.145586013793945, + "learning_rate": 7.765181836019042e-06, + "loss": 3.3087, + "step": 34220 + }, + { + "epoch": 1.002856515051637, + "grad_norm": 16.683856964111328, + "learning_rate": 7.76386407042622e-06, + "loss": 3.3272, + "step": 34230 + }, + { + "epoch": 1.003149490954369, + "grad_norm": 19.852367401123047, + "learning_rate": 7.7625460283252e-06, + "loss": 3.3297, + "step": 34240 + }, + { + "epoch": 1.003442466857101, + "grad_norm": 21.324113845825195, + "learning_rate": 7.761227709847837e-06, + "loss": 3.3144, + "step": 34250 + }, + { + "epoch": 1.003735442759833, + "grad_norm": 17.350297927856445, + "learning_rate": 7.759909115126021e-06, + "loss": 3.325, + "step": 34260 + }, + { + "epoch": 1.004028418662565, + "grad_norm": 20.306419372558594, + "learning_rate": 7.758590244291674e-06, + "loss": 3.3183, + "step": 34270 + }, + { + "epoch": 1.004321394565297, + "grad_norm": 17.0413875579834, + "learning_rate": 7.75727109747674e-06, + "loss": 3.3516, + "step": 34280 + }, + { + "epoch": 1.004614370468029, + "grad_norm": 19.87367820739746, + "learning_rate": 7.755951674813191e-06, + "loss": 3.3209, + "step": 34290 + }, + { + "epoch": 1.004907346370761, + "grad_norm": 20.74055290222168, + "learning_rate": 7.754631976433027e-06, + "loss": 3.3392, + "step": 34300 + }, + { + "epoch": 1.0050245367318538, + "eval_bleu": 0.33445048332354826, + "eval_cap_loss": 0.9538121819496155, + "eval_con_loss": 1.2847559452056885, + "eval_loss": 3.5233240127563477, + "step": 34304 + }, + { + "epoch": 1.0050245367318538, + "eval_bleu": 0.33445048332354826, + "eval_cap_loss": 0.9538121819496155, + "eval_con_loss": 1.2847559452056885, + "eval_loss": 3.5233240127563477, + "eval_runtime": 50.8602, + "eval_samples_per_second": 393.235, + "eval_steps_per_second": 0.393, + "step": 34304 + }, + { + "epoch": 1.005200322273493, + 
"grad_norm": 23.372175216674805, + "learning_rate": 7.753312002468276e-06, + "loss": 3.3237, + "step": 34310 + }, + { + "epoch": 1.005493298176225, + "grad_norm": 20.47061538696289, + "learning_rate": 7.751991753050993e-06, + "loss": 3.3421, + "step": 34320 + }, + { + "epoch": 1.005786274078957, + "grad_norm": 24.491361618041992, + "learning_rate": 7.750671228313265e-06, + "loss": 3.3549, + "step": 34330 + }, + { + "epoch": 1.006079249981689, + "grad_norm": 18.604042053222656, + "learning_rate": 7.749350428387197e-06, + "loss": 3.3345, + "step": 34340 + }, + { + "epoch": 1.006372225884421, + "grad_norm": 19.923099517822266, + "learning_rate": 7.748029353404931e-06, + "loss": 3.3223, + "step": 34350 + }, + { + "epoch": 1.006665201787153, + "grad_norm": 18.66996192932129, + "learning_rate": 7.74670800349863e-06, + "loss": 3.3267, + "step": 34360 + }, + { + "epoch": 1.006958177689885, + "grad_norm": 22.195268630981445, + "learning_rate": 7.745386378800492e-06, + "loss": 3.3474, + "step": 34370 + }, + { + "epoch": 1.007251153592617, + "grad_norm": 16.231006622314453, + "learning_rate": 7.744064479442733e-06, + "loss": 3.3518, + "step": 34380 + }, + { + "epoch": 1.007544129495349, + "grad_norm": 19.545818328857422, + "learning_rate": 7.742742305557603e-06, + "loss": 3.3019, + "step": 34390 + }, + { + "epoch": 1.007837105398081, + "grad_norm": 19.54378318786621, + "learning_rate": 7.741419857277377e-06, + "loss": 3.3353, + "step": 34400 + }, + { + "epoch": 1.008130081300813, + "grad_norm": 21.117918014526367, + "learning_rate": 7.74009713473436e-06, + "loss": 3.3595, + "step": 34410 + }, + { + "epoch": 1.008423057203545, + "grad_norm": 19.042123794555664, + "learning_rate": 7.73877413806088e-06, + "loss": 3.324, + "step": 34420 + }, + { + "epoch": 1.008716033106277, + "grad_norm": 17.128761291503906, + "learning_rate": 7.737450867389297e-06, + "loss": 3.3389, + "step": 34430 + }, + { + "epoch": 1.009009009009009, + "grad_norm": 19.45064926147461, + "learning_rate": 
7.736127322851994e-06, + "loss": 3.3332, + "step": 34440 + }, + { + "epoch": 1.009301984911741, + "grad_norm": 20.403148651123047, + "learning_rate": 7.734803504581387e-06, + "loss": 3.3318, + "step": 34450 + }, + { + "epoch": 1.009594960814473, + "grad_norm": 20.980112075805664, + "learning_rate": 7.733479412709913e-06, + "loss": 3.3545, + "step": 34460 + }, + { + "epoch": 1.0098879367172051, + "grad_norm": 17.09270477294922, + "learning_rate": 7.732155047370042e-06, + "loss": 3.3563, + "step": 34470 + }, + { + "epoch": 1.010180912619937, + "grad_norm": 20.11016845703125, + "learning_rate": 7.730830408694267e-06, + "loss": 3.3379, + "step": 34480 + }, + { + "epoch": 1.010473888522669, + "grad_norm": 19.615079879760742, + "learning_rate": 7.72950549681511e-06, + "loss": 3.3405, + "step": 34490 + }, + { + "epoch": 1.010766864425401, + "grad_norm": 18.091997146606445, + "learning_rate": 7.728180311865124e-06, + "loss": 3.3451, + "step": 34500 + }, + { + "epoch": 1.011059840328133, + "grad_norm": 20.775362014770508, + "learning_rate": 7.726854853976882e-06, + "loss": 3.3547, + "step": 34510 + }, + { + "epoch": 1.011352816230865, + "grad_norm": 18.420459747314453, + "learning_rate": 7.725529123282991e-06, + "loss": 3.343, + "step": 34520 + }, + { + "epoch": 1.011645792133597, + "grad_norm": 18.662891387939453, + "learning_rate": 7.72420311991608e-06, + "loss": 3.3166, + "step": 34530 + }, + { + "epoch": 1.011938768036329, + "grad_norm": 19.124298095703125, + "learning_rate": 7.722876844008809e-06, + "loss": 3.3288, + "step": 34540 + }, + { + "epoch": 1.012231743939061, + "grad_norm": 20.721303939819336, + "learning_rate": 7.721550295693865e-06, + "loss": 3.3498, + "step": 34550 + }, + { + "epoch": 1.012524719841793, + "grad_norm": 16.22440528869629, + "learning_rate": 7.720223475103959e-06, + "loss": 3.3526, + "step": 34560 + }, + { + "epoch": 1.012817695744525, + "grad_norm": 16.184846878051758, + "learning_rate": 7.718896382371834e-06, + "loss": 3.3433, + "step": 
34570 + }, + { + "epoch": 1.013110671647257, + "grad_norm": 19.79545021057129, + "learning_rate": 7.717569017630256e-06, + "loss": 3.3319, + "step": 34580 + }, + { + "epoch": 1.013403647549989, + "grad_norm": 21.502546310424805, + "learning_rate": 7.716241381012021e-06, + "loss": 3.349, + "step": 34590 + }, + { + "epoch": 1.013696623452721, + "grad_norm": 20.224124908447266, + "learning_rate": 7.714913472649951e-06, + "loss": 3.352, + "step": 34600 + }, + { + "epoch": 1.013989599355453, + "grad_norm": 18.305368423461914, + "learning_rate": 7.713585292676894e-06, + "loss": 3.3418, + "step": 34610 + }, + { + "epoch": 1.014282575258185, + "grad_norm": 18.19740104675293, + "learning_rate": 7.712256841225729e-06, + "loss": 3.3356, + "step": 34620 + }, + { + "epoch": 1.014575551160917, + "grad_norm": 20.008472442626953, + "learning_rate": 7.710928118429357e-06, + "loss": 3.3345, + "step": 34630 + }, + { + "epoch": 1.014868527063649, + "grad_norm": 19.193523406982422, + "learning_rate": 7.70959912442071e-06, + "loss": 3.3286, + "step": 34640 + }, + { + "epoch": 1.015161502966381, + "grad_norm": 22.420698165893555, + "learning_rate": 7.708269859332747e-06, + "loss": 3.3651, + "step": 34650 + }, + { + "epoch": 1.015454478869113, + "grad_norm": 23.36321449279785, + "learning_rate": 7.706940323298453e-06, + "loss": 3.3418, + "step": 34660 + }, + { + "epoch": 1.015747454771845, + "grad_norm": 17.765302658081055, + "learning_rate": 7.705610516450838e-06, + "loss": 3.3417, + "step": 34670 + }, + { + "epoch": 1.016040430674577, + "grad_norm": 20.175874710083008, + "learning_rate": 7.704280438922943e-06, + "loss": 3.3597, + "step": 34680 + }, + { + "epoch": 1.016333406577309, + "grad_norm": 21.293703079223633, + "learning_rate": 7.702950090847833e-06, + "loss": 3.352, + "step": 34690 + }, + { + "epoch": 1.016626382480041, + "grad_norm": 21.252613067626953, + "learning_rate": 7.701619472358605e-06, + "loss": 3.3554, + "step": 34700 + }, + { + "epoch": 1.016919358382773, + 
"grad_norm": 19.15425682067871, + "learning_rate": 7.700288583588374e-06, + "loss": 3.3471, + "step": 34710 + }, + { + "epoch": 1.017212334285505, + "grad_norm": 23.633546829223633, + "learning_rate": 7.698957424670293e-06, + "loss": 3.3516, + "step": 34720 + }, + { + "epoch": 1.017505310188237, + "grad_norm": 21.28249740600586, + "learning_rate": 7.697625995737532e-06, + "loss": 3.3367, + "step": 34730 + }, + { + "epoch": 1.0177982860909691, + "grad_norm": 15.5936918258667, + "learning_rate": 7.696294296923296e-06, + "loss": 3.3221, + "step": 34740 + }, + { + "epoch": 1.018091261993701, + "grad_norm": 19.128419876098633, + "learning_rate": 7.69496232836081e-06, + "loss": 3.3434, + "step": 34750 + }, + { + "epoch": 1.018384237896433, + "grad_norm": 19.37213706970215, + "learning_rate": 7.693630090183334e-06, + "loss": 3.354, + "step": 34760 + }, + { + "epoch": 1.018677213799165, + "grad_norm": 20.548912048339844, + "learning_rate": 7.692297582524149e-06, + "loss": 3.3222, + "step": 34770 + }, + { + "epoch": 1.018970189701897, + "grad_norm": 18.954238891601562, + "learning_rate": 7.690964805516562e-06, + "loss": 3.3419, + "step": 34780 + }, + { + "epoch": 1.019263165604629, + "grad_norm": 16.885345458984375, + "learning_rate": 7.689631759293912e-06, + "loss": 3.3468, + "step": 34790 + }, + { + "epoch": 1.019556141507361, + "grad_norm": 19.393463134765625, + "learning_rate": 7.688298443989562e-06, + "loss": 3.3582, + "step": 34800 + }, + { + "epoch": 1.019849117410093, + "grad_norm": 20.19963264465332, + "learning_rate": 7.686964859736899e-06, + "loss": 3.3454, + "step": 34810 + }, + { + "epoch": 1.0200249029517323, + "eval_bleu": 0.33369937372475345, + "eval_cap_loss": 0.9560219645500183, + "eval_con_loss": 1.2896748781204224, + "eval_loss": 3.535371780395508, + "step": 34816 + }, + { + "epoch": 1.0200249029517323, + "eval_bleu": 0.33369937372475345, + "eval_cap_loss": 0.9560219645500183, + "eval_con_loss": 1.2896748781204224, + "eval_loss": 3.535371780395508, + 
"eval_runtime": 53.2908, + "eval_samples_per_second": 375.299, + "eval_steps_per_second": 0.375, + "step": 34816 + }, + { + "epoch": 1.0201420933128251, + "grad_norm": 17.88078498840332, + "learning_rate": 7.685631006669346e-06, + "loss": 3.3235, + "step": 34820 + }, + { + "epoch": 1.020435069215557, + "grad_norm": 17.237613677978516, + "learning_rate": 7.684296884920344e-06, + "loss": 3.3508, + "step": 34830 + }, + { + "epoch": 1.020728045118289, + "grad_norm": 20.71148109436035, + "learning_rate": 7.682962494623363e-06, + "loss": 3.3361, + "step": 34840 + }, + { + "epoch": 1.021021021021021, + "grad_norm": 20.758567810058594, + "learning_rate": 7.6816278359119e-06, + "loss": 3.3508, + "step": 34850 + }, + { + "epoch": 1.021313996923753, + "grad_norm": 21.89423370361328, + "learning_rate": 7.680292908919485e-06, + "loss": 3.3455, + "step": 34860 + }, + { + "epoch": 1.021606972826485, + "grad_norm": 19.34027671813965, + "learning_rate": 7.678957713779665e-06, + "loss": 3.3493, + "step": 34870 + }, + { + "epoch": 1.021899948729217, + "grad_norm": 19.80699920654297, + "learning_rate": 7.677622250626018e-06, + "loss": 3.3366, + "step": 34880 + }, + { + "epoch": 1.022192924631949, + "grad_norm": 19.704044342041016, + "learning_rate": 7.676286519592151e-06, + "loss": 3.3246, + "step": 34890 + }, + { + "epoch": 1.022485900534681, + "grad_norm": 18.417631149291992, + "learning_rate": 7.674950520811698e-06, + "loss": 3.3115, + "step": 34900 + }, + { + "epoch": 1.022778876437413, + "grad_norm": 17.710063934326172, + "learning_rate": 7.673614254418313e-06, + "loss": 3.3476, + "step": 34910 + }, + { + "epoch": 1.023071852340145, + "grad_norm": 18.29934310913086, + "learning_rate": 7.672277720545685e-06, + "loss": 3.3643, + "step": 34920 + }, + { + "epoch": 1.0233648282428771, + "grad_norm": 20.713642120361328, + "learning_rate": 7.670940919327523e-06, + "loss": 3.3633, + "step": 34930 + }, + { + "epoch": 1.023657804145609, + "grad_norm": 22.59192657470703, + "learning_rate": 
7.66960385089757e-06, + "loss": 3.3362, + "step": 34940 + }, + { + "epoch": 1.023950780048341, + "grad_norm": 19.572622299194336, + "learning_rate": 7.66826651538959e-06, + "loss": 3.3375, + "step": 34950 + }, + { + "epoch": 1.024243755951073, + "grad_norm": 17.579980850219727, + "learning_rate": 7.666928912937375e-06, + "loss": 3.3403, + "step": 34960 + }, + { + "epoch": 1.024536731853805, + "grad_norm": 17.715930938720703, + "learning_rate": 7.665591043674745e-06, + "loss": 3.3689, + "step": 34970 + }, + { + "epoch": 1.024829707756537, + "grad_norm": 18.465736389160156, + "learning_rate": 7.664252907735546e-06, + "loss": 3.3663, + "step": 34980 + }, + { + "epoch": 1.025122683659269, + "grad_norm": 18.926280975341797, + "learning_rate": 7.662914505253648e-06, + "loss": 3.3576, + "step": 34990 + }, + { + "epoch": 1.025415659562001, + "grad_norm": 19.522558212280273, + "learning_rate": 7.661575836362954e-06, + "loss": 3.3164, + "step": 35000 + }, + { + "epoch": 1.0257086354647331, + "grad_norm": 19.865947723388672, + "learning_rate": 7.660236901197387e-06, + "loss": 3.365, + "step": 35010 + }, + { + "epoch": 1.026001611367465, + "grad_norm": 22.916128158569336, + "learning_rate": 7.6588976998909e-06, + "loss": 3.353, + "step": 35020 + }, + { + "epoch": 1.026294587270197, + "grad_norm": 18.41000747680664, + "learning_rate": 7.657558232577476e-06, + "loss": 3.3354, + "step": 35030 + }, + { + "epoch": 1.026587563172929, + "grad_norm": 18.92070198059082, + "learning_rate": 7.656218499391116e-06, + "loss": 3.3289, + "step": 35040 + }, + { + "epoch": 1.026880539075661, + "grad_norm": 17.873868942260742, + "learning_rate": 7.654878500465853e-06, + "loss": 3.3431, + "step": 35050 + }, + { + "epoch": 1.027173514978393, + "grad_norm": 19.325651168823242, + "learning_rate": 7.653538235935748e-06, + "loss": 3.3524, + "step": 35060 + }, + { + "epoch": 1.027466490881125, + "grad_norm": 15.669853210449219, + "learning_rate": 7.652197705934884e-06, + "loss": 3.3462, + "step": 35070 
+ }, + { + "epoch": 1.027759466783857, + "grad_norm": 17.992027282714844, + "learning_rate": 7.650856910597376e-06, + "loss": 3.3428, + "step": 35080 + }, + { + "epoch": 1.0280524426865891, + "grad_norm": 18.97701644897461, + "learning_rate": 7.64951585005736e-06, + "loss": 3.3528, + "step": 35090 + }, + { + "epoch": 1.028345418589321, + "grad_norm": 21.134693145751953, + "learning_rate": 7.648174524449004e-06, + "loss": 3.3334, + "step": 35100 + }, + { + "epoch": 1.028638394492053, + "grad_norm": 24.515932083129883, + "learning_rate": 7.646832933906496e-06, + "loss": 3.338, + "step": 35110 + }, + { + "epoch": 1.028931370394785, + "grad_norm": 22.08915138244629, + "learning_rate": 7.645491078564056e-06, + "loss": 3.3469, + "step": 35120 + }, + { + "epoch": 1.029224346297517, + "grad_norm": 21.297761917114258, + "learning_rate": 7.64414895855593e-06, + "loss": 3.3477, + "step": 35130 + }, + { + "epoch": 1.029517322200249, + "grad_norm": 17.0189151763916, + "learning_rate": 7.642806574016387e-06, + "loss": 3.3462, + "step": 35140 + }, + { + "epoch": 1.029810298102981, + "grad_norm": 19.614681243896484, + "learning_rate": 7.641463925079727e-06, + "loss": 3.3383, + "step": 35150 + }, + { + "epoch": 1.030103274005713, + "grad_norm": 18.223953247070312, + "learning_rate": 7.64012101188027e-06, + "loss": 3.337, + "step": 35160 + }, + { + "epoch": 1.0303962499084451, + "grad_norm": 17.995193481445312, + "learning_rate": 7.638777834552372e-06, + "loss": 3.3517, + "step": 35170 + }, + { + "epoch": 1.030689225811177, + "grad_norm": 19.292991638183594, + "learning_rate": 7.637434393230404e-06, + "loss": 3.3591, + "step": 35180 + }, + { + "epoch": 1.030982201713909, + "grad_norm": 20.426342010498047, + "learning_rate": 7.636090688048775e-06, + "loss": 3.3404, + "step": 35190 + }, + { + "epoch": 1.0312751776166411, + "grad_norm": 17.0275821685791, + "learning_rate": 7.634746719141912e-06, + "loss": 3.3389, + "step": 35200 + }, + { + "epoch": 1.031568153519373, + "grad_norm": 
20.931915283203125, + "learning_rate": 7.633402486644268e-06, + "loss": 3.3686, + "step": 35210 + }, + { + "epoch": 1.031861129422105, + "grad_norm": 18.826269149780273, + "learning_rate": 7.632057990690331e-06, + "loss": 3.3617, + "step": 35220 + }, + { + "epoch": 1.032154105324837, + "grad_norm": 20.004894256591797, + "learning_rate": 7.630713231414609e-06, + "loss": 3.3441, + "step": 35230 + }, + { + "epoch": 1.032447081227569, + "grad_norm": 19.91136360168457, + "learning_rate": 7.629368208951633e-06, + "loss": 3.3718, + "step": 35240 + }, + { + "epoch": 1.032740057130301, + "grad_norm": 16.239463806152344, + "learning_rate": 7.628022923435967e-06, + "loss": 3.3229, + "step": 35250 + }, + { + "epoch": 1.033033033033033, + "grad_norm": 19.766357421875, + "learning_rate": 7.6266773750022005e-06, + "loss": 3.3717, + "step": 35260 + }, + { + "epoch": 1.033326008935765, + "grad_norm": 18.903902053833008, + "learning_rate": 7.625331563784947e-06, + "loss": 3.3409, + "step": 35270 + }, + { + "epoch": 1.0336189848384971, + "grad_norm": 19.651927947998047, + "learning_rate": 7.623985489918845e-06, + "loss": 3.3438, + "step": 35280 + }, + { + "epoch": 1.033911960741229, + "grad_norm": 17.456857681274414, + "learning_rate": 7.622639153538563e-06, + "loss": 3.3635, + "step": 35290 + }, + { + "epoch": 1.034204936643961, + "grad_norm": 18.109460830688477, + "learning_rate": 7.621292554778792e-06, + "loss": 3.3397, + "step": 35300 + }, + { + "epoch": 1.034497912546693, + "grad_norm": 19.673891067504883, + "learning_rate": 7.6199456937742535e-06, + "loss": 3.327, + "step": 35310 + }, + { + "epoch": 1.034790888449425, + "grad_norm": 17.840951919555664, + "learning_rate": 7.618598570659691e-06, + "loss": 3.3558, + "step": 35320 + }, + { + "epoch": 1.0350252691716106, + "eval_bleu": 0.33307848462425094, + "eval_cap_loss": 0.9567497968673706, + "eval_con_loss": 1.2909367084503174, + "eval_loss": 3.5386228561401367, + "step": 35328 + }, + { + "epoch": 1.0350252691716106, + 
"eval_bleu": 0.33307848462425094, + "eval_cap_loss": 0.9567497968673706, + "eval_con_loss": 1.2909367084503174, + "eval_loss": 3.5386228561401367, + "eval_runtime": 52.6128, + "eval_samples_per_second": 380.135, + "eval_steps_per_second": 0.38, + "step": 35328 + }, + { + "epoch": 1.035083864352157, + "grad_norm": 17.16716194152832, + "learning_rate": 7.617251185569876e-06, + "loss": 3.3436, + "step": 35330 + }, + { + "epoch": 1.035376840254889, + "grad_norm": 16.327804565429688, + "learning_rate": 7.615903538639611e-06, + "loss": 3.3293, + "step": 35340 + }, + { + "epoch": 1.035669816157621, + "grad_norm": 20.811317443847656, + "learning_rate": 7.614555630003715e-06, + "loss": 3.3568, + "step": 35350 + }, + { + "epoch": 1.0359627920603531, + "grad_norm": 19.79791259765625, + "learning_rate": 7.613207459797039e-06, + "loss": 3.3602, + "step": 35360 + }, + { + "epoch": 1.036255767963085, + "grad_norm": 19.961666107177734, + "learning_rate": 7.61185902815446e-06, + "loss": 3.3609, + "step": 35370 + }, + { + "epoch": 1.036548743865817, + "grad_norm": 18.636112213134766, + "learning_rate": 7.610510335210881e-06, + "loss": 3.3411, + "step": 35380 + }, + { + "epoch": 1.036841719768549, + "grad_norm": 17.189720153808594, + "learning_rate": 7.609161381101232e-06, + "loss": 3.3528, + "step": 35390 + }, + { + "epoch": 1.037134695671281, + "grad_norm": 20.13739776611328, + "learning_rate": 7.607812165960466e-06, + "loss": 3.3597, + "step": 35400 + }, + { + "epoch": 1.037427671574013, + "grad_norm": 20.081708908081055, + "learning_rate": 7.606462689923564e-06, + "loss": 3.3475, + "step": 35410 + }, + { + "epoch": 1.037720647476745, + "grad_norm": 20.783039093017578, + "learning_rate": 7.605112953125533e-06, + "loss": 3.3501, + "step": 35420 + }, + { + "epoch": 1.038013623379477, + "grad_norm": 18.112899780273438, + "learning_rate": 7.603762955701407e-06, + "loss": 3.34, + "step": 35430 + }, + { + "epoch": 1.0383065992822091, + "grad_norm": 22.475502014160156, + "learning_rate": 
7.602412697786246e-06, + "loss": 3.3362, + "step": 35440 + }, + { + "epoch": 1.038599575184941, + "grad_norm": 15.89416790008545, + "learning_rate": 7.601062179515133e-06, + "loss": 3.3583, + "step": 35450 + }, + { + "epoch": 1.038892551087673, + "grad_norm": 21.100889205932617, + "learning_rate": 7.5997114010231795e-06, + "loss": 3.3675, + "step": 35460 + }, + { + "epoch": 1.0391855269904051, + "grad_norm": 20.078472137451172, + "learning_rate": 7.598360362445526e-06, + "loss": 3.3584, + "step": 35470 + }, + { + "epoch": 1.039478502893137, + "grad_norm": 19.427175521850586, + "learning_rate": 7.597009063917333e-06, + "loss": 3.3306, + "step": 35480 + }, + { + "epoch": 1.039771478795869, + "grad_norm": 19.08909034729004, + "learning_rate": 7.5956575055737914e-06, + "loss": 3.359, + "step": 35490 + }, + { + "epoch": 1.040064454698601, + "grad_norm": 20.023094177246094, + "learning_rate": 7.594305687550114e-06, + "loss": 3.3326, + "step": 35500 + }, + { + "epoch": 1.040357430601333, + "grad_norm": 18.87820816040039, + "learning_rate": 7.592953609981546e-06, + "loss": 3.3324, + "step": 35510 + }, + { + "epoch": 1.040650406504065, + "grad_norm": 21.821317672729492, + "learning_rate": 7.591601273003353e-06, + "loss": 3.3434, + "step": 35520 + }, + { + "epoch": 1.040943382406797, + "grad_norm": 20.293304443359375, + "learning_rate": 7.590248676750827e-06, + "loss": 3.3358, + "step": 35530 + }, + { + "epoch": 1.041236358309529, + "grad_norm": 20.458812713623047, + "learning_rate": 7.588895821359288e-06, + "loss": 3.3741, + "step": 35540 + }, + { + "epoch": 1.0415293342122611, + "grad_norm": 20.703853607177734, + "learning_rate": 7.587542706964081e-06, + "loss": 3.3478, + "step": 35550 + }, + { + "epoch": 1.041822310114993, + "grad_norm": 19.502212524414062, + "learning_rate": 7.586189333700579e-06, + "loss": 3.3533, + "step": 35560 + }, + { + "epoch": 1.042115286017725, + "grad_norm": 17.913732528686523, + "learning_rate": 7.584835701704176e-06, + "loss": 3.3525, + 
"step": 35570 + }, + { + "epoch": 1.042408261920457, + "grad_norm": 20.260725021362305, + "learning_rate": 7.583481811110298e-06, + "loss": 3.3459, + "step": 35580 + }, + { + "epoch": 1.042701237823189, + "grad_norm": 16.90032958984375, + "learning_rate": 7.58212766205439e-06, + "loss": 3.3392, + "step": 35590 + }, + { + "epoch": 1.042994213725921, + "grad_norm": 21.544597625732422, + "learning_rate": 7.580773254671932e-06, + "loss": 3.3331, + "step": 35600 + }, + { + "epoch": 1.043287189628653, + "grad_norm": 22.047393798828125, + "learning_rate": 7.579418589098419e-06, + "loss": 3.3522, + "step": 35610 + }, + { + "epoch": 1.043580165531385, + "grad_norm": 18.632551193237305, + "learning_rate": 7.578063665469378e-06, + "loss": 3.3306, + "step": 35620 + }, + { + "epoch": 1.0438731414341171, + "grad_norm": 17.440853118896484, + "learning_rate": 7.576708483920364e-06, + "loss": 3.3498, + "step": 35630 + }, + { + "epoch": 1.044166117336849, + "grad_norm": 17.52276039123535, + "learning_rate": 7.575353044586953e-06, + "loss": 3.3701, + "step": 35640 + }, + { + "epoch": 1.044459093239581, + "grad_norm": 18.57608985900879, + "learning_rate": 7.573997347604749e-06, + "loss": 3.3207, + "step": 35650 + }, + { + "epoch": 1.0447520691423131, + "grad_norm": 16.86911392211914, + "learning_rate": 7.5726413931093814e-06, + "loss": 3.3459, + "step": 35660 + }, + { + "epoch": 1.045045045045045, + "grad_norm": 18.12965202331543, + "learning_rate": 7.571285181236505e-06, + "loss": 3.3469, + "step": 35670 + }, + { + "epoch": 1.045338020947777, + "grad_norm": 19.28858757019043, + "learning_rate": 7.569928712121803e-06, + "loss": 3.3499, + "step": 35680 + }, + { + "epoch": 1.045630996850509, + "grad_norm": 20.251413345336914, + "learning_rate": 7.568571985900979e-06, + "loss": 3.3409, + "step": 35690 + }, + { + "epoch": 1.045923972753241, + "grad_norm": 19.827865600585938, + "learning_rate": 7.567215002709765e-06, + "loss": 3.344, + "step": 35700 + }, + { + "epoch": 1.0462169486559731, 
+ "grad_norm": 18.978538513183594, + "learning_rate": 7.565857762683921e-06, + "loss": 3.3352, + "step": 35710 + }, + { + "epoch": 1.046509924558705, + "grad_norm": 18.794118881225586, + "learning_rate": 7.564500265959234e-06, + "loss": 3.3578, + "step": 35720 + }, + { + "epoch": 1.046802900461437, + "grad_norm": 19.712379455566406, + "learning_rate": 7.563142512671506e-06, + "loss": 3.3321, + "step": 35730 + }, + { + "epoch": 1.0470958763641691, + "grad_norm": 18.939695358276367, + "learning_rate": 7.561784502956577e-06, + "loss": 3.3349, + "step": 35740 + }, + { + "epoch": 1.047388852266901, + "grad_norm": 17.662818908691406, + "learning_rate": 7.560426236950309e-06, + "loss": 3.3476, + "step": 35750 + }, + { + "epoch": 1.047681828169633, + "grad_norm": 20.31946563720703, + "learning_rate": 7.559067714788586e-06, + "loss": 3.3194, + "step": 35760 + }, + { + "epoch": 1.047974804072365, + "grad_norm": 15.43947982788086, + "learning_rate": 7.55770893660732e-06, + "loss": 3.3375, + "step": 35770 + }, + { + "epoch": 1.048267779975097, + "grad_norm": 20.37908172607422, + "learning_rate": 7.556349902542448e-06, + "loss": 3.3371, + "step": 35780 + }, + { + "epoch": 1.0485607558778292, + "grad_norm": 19.579689025878906, + "learning_rate": 7.554990612729936e-06, + "loss": 3.3701, + "step": 35790 + }, + { + "epoch": 1.048853731780561, + "grad_norm": 22.033966064453125, + "learning_rate": 7.553631067305771e-06, + "loss": 3.3316, + "step": 35800 + }, + { + "epoch": 1.049146707683293, + "grad_norm": 19.860498428344727, + "learning_rate": 7.552271266405968e-06, + "loss": 3.3158, + "step": 35810 + }, + { + "epoch": 1.0494396835860251, + "grad_norm": 16.40704917907715, + "learning_rate": 7.550911210166567e-06, + "loss": 3.3432, + "step": 35820 + }, + { + "epoch": 1.049732659488757, + "grad_norm": 17.960208892822266, + "learning_rate": 7.549550898723633e-06, + "loss": 3.368, + "step": 35830 + }, + { + "epoch": 1.050025635391489, + "grad_norm": 16.149944305419922, + 
"learning_rate": 7.548190332213258e-06, + "loss": 3.366, + "step": 35840 + }, + { + "epoch": 1.050025635391489, + "eval_bleu": 0.3330599949721904, + "eval_cap_loss": 0.9568620920181274, + "eval_con_loss": 1.2888412475585938, + "eval_loss": 3.5345444679260254, + "step": 35840 + }, + { + "epoch": 1.050025635391489, + "eval_bleu": 0.3330599949721904, + "eval_cap_loss": 0.9568620920181274, + "eval_con_loss": 1.2888412475585938, + "eval_loss": 3.5345444679260254, + "eval_runtime": 53.0223, + "eval_samples_per_second": 377.2, + "eval_steps_per_second": 0.377, + "step": 35840 + }, + { + "epoch": 1.050318611294221, + "grad_norm": 19.967378616333008, + "learning_rate": 7.546829510771558e-06, + "loss": 3.339, + "step": 35850 + }, + { + "epoch": 1.050611587196953, + "grad_norm": 15.947935104370117, + "learning_rate": 7.5454684345346765e-06, + "loss": 3.3616, + "step": 35860 + }, + { + "epoch": 1.0509045630996852, + "grad_norm": 14.9132719039917, + "learning_rate": 7.544107103638778e-06, + "loss": 3.3411, + "step": 35870 + }, + { + "epoch": 1.051197539002417, + "grad_norm": 20.516347885131836, + "learning_rate": 7.542745518220058e-06, + "loss": 3.3899, + "step": 35880 + }, + { + "epoch": 1.051490514905149, + "grad_norm": 18.797348022460938, + "learning_rate": 7.541383678414735e-06, + "loss": 3.335, + "step": 35890 + }, + { + "epoch": 1.0517834908078811, + "grad_norm": 24.02049446105957, + "learning_rate": 7.540021584359053e-06, + "loss": 3.3374, + "step": 35900 + }, + { + "epoch": 1.052076466710613, + "grad_norm": 18.418066024780273, + "learning_rate": 7.53865923618928e-06, + "loss": 3.3495, + "step": 35910 + }, + { + "epoch": 1.052369442613345, + "grad_norm": 18.31145477294922, + "learning_rate": 7.5372966340417105e-06, + "loss": 3.3428, + "step": 35920 + }, + { + "epoch": 1.0526624185160771, + "grad_norm": 17.568050384521484, + "learning_rate": 7.535933778052669e-06, + "loss": 3.3275, + "step": 35930 + }, + { + "epoch": 1.052955394418809, + "grad_norm": 17.188425064086914, + 
"learning_rate": 7.534570668358496e-06, + "loss": 3.3489, + "step": 35940 + }, + { + "epoch": 1.053248370321541, + "grad_norm": 18.073511123657227, + "learning_rate": 7.5332073050955655e-06, + "loss": 3.3379, + "step": 35950 + }, + { + "epoch": 1.053541346224273, + "grad_norm": 17.153745651245117, + "learning_rate": 7.5319800614703706e-06, + "loss": 3.3186, + "step": 35960 + }, + { + "epoch": 1.053834322127005, + "grad_norm": 23.351716995239258, + "learning_rate": 7.530616216802593e-06, + "loss": 3.3394, + "step": 35970 + }, + { + "epoch": 1.0541272980297371, + "grad_norm": 18.376482009887695, + "learning_rate": 7.529252118961675e-06, + "loss": 3.3279, + "step": 35980 + }, + { + "epoch": 1.054420273932469, + "grad_norm": 17.883342742919922, + "learning_rate": 7.52788776808409e-06, + "loss": 3.3356, + "step": 35990 + }, + { + "epoch": 1.054713249835201, + "grad_norm": 19.97841453552246, + "learning_rate": 7.52652316430633e-06, + "loss": 3.3614, + "step": 36000 + }, + { + "epoch": 1.0550062257379331, + "grad_norm": 18.05038833618164, + "learning_rate": 7.525158307764917e-06, + "loss": 3.3471, + "step": 36010 + }, + { + "epoch": 1.055299201640665, + "grad_norm": 21.22806739807129, + "learning_rate": 7.523793198596397e-06, + "loss": 3.3378, + "step": 36020 + }, + { + "epoch": 1.055592177543397, + "grad_norm": 19.749300003051758, + "learning_rate": 7.5224278369373385e-06, + "loss": 3.3509, + "step": 36030 + }, + { + "epoch": 1.055885153446129, + "grad_norm": 18.568077087402344, + "learning_rate": 7.521062222924343e-06, + "loss": 3.3411, + "step": 36040 + }, + { + "epoch": 1.056178129348861, + "grad_norm": 18.853492736816406, + "learning_rate": 7.519696356694026e-06, + "loss": 3.3648, + "step": 36050 + }, + { + "epoch": 1.0564711052515932, + "grad_norm": 18.88772964477539, + "learning_rate": 7.51833023838304e-06, + "loss": 3.365, + "step": 36060 + }, + { + "epoch": 1.056764081154325, + "grad_norm": 16.72566795349121, + "learning_rate": 7.516963868128054e-06, + "loss": 
3.3669, + "step": 36070 + }, + { + "epoch": 1.057057057057057, + "grad_norm": 17.8383731842041, + "learning_rate": 7.5155972460657645e-06, + "loss": 3.3665, + "step": 36080 + }, + { + "epoch": 1.0573500329597891, + "grad_norm": 16.795856475830078, + "learning_rate": 7.5142303723328954e-06, + "loss": 3.369, + "step": 36090 + }, + { + "epoch": 1.057643008862521, + "grad_norm": 19.431795120239258, + "learning_rate": 7.512863247066193e-06, + "loss": 3.3431, + "step": 36100 + }, + { + "epoch": 1.057935984765253, + "grad_norm": 16.677698135375977, + "learning_rate": 7.511495870402432e-06, + "loss": 3.3303, + "step": 36110 + }, + { + "epoch": 1.058228960667985, + "grad_norm": 16.9414005279541, + "learning_rate": 7.510128242478409e-06, + "loss": 3.3611, + "step": 36120 + }, + { + "epoch": 1.058521936570717, + "grad_norm": 15.648551940917969, + "learning_rate": 7.5087603634309495e-06, + "loss": 3.3317, + "step": 36130 + }, + { + "epoch": 1.0588149124734492, + "grad_norm": 19.05082130432129, + "learning_rate": 7.507392233396897e-06, + "loss": 3.3494, + "step": 36140 + }, + { + "epoch": 1.059107888376181, + "grad_norm": 16.048053741455078, + "learning_rate": 7.506023852513128e-06, + "loss": 3.3514, + "step": 36150 + }, + { + "epoch": 1.059400864278913, + "grad_norm": 17.867368698120117, + "learning_rate": 7.504655220916541e-06, + "loss": 3.3237, + "step": 36160 + }, + { + "epoch": 1.0596938401816451, + "grad_norm": 18.585763931274414, + "learning_rate": 7.503286338744059e-06, + "loss": 3.3478, + "step": 36170 + }, + { + "epoch": 1.059986816084377, + "grad_norm": 18.546977996826172, + "learning_rate": 7.501917206132629e-06, + "loss": 3.3396, + "step": 36180 + }, + { + "epoch": 1.060279791987109, + "grad_norm": 20.085996627807617, + "learning_rate": 7.5005478232192265e-06, + "loss": 3.3468, + "step": 36190 + }, + { + "epoch": 1.0605727678898411, + "grad_norm": 19.707231521606445, + "learning_rate": 7.49917819014085e-06, + "loss": 3.3422, + "step": 36200 + }, + { + "epoch": 
1.060865743792573, + "grad_norm": 19.91645050048828, + "learning_rate": 7.497808307034523e-06, + "loss": 3.3411, + "step": 36210 + }, + { + "epoch": 1.061158719695305, + "grad_norm": 19.169145584106445, + "learning_rate": 7.496438174037293e-06, + "loss": 3.3343, + "step": 36220 + }, + { + "epoch": 1.061451695598037, + "grad_norm": 19.346113204956055, + "learning_rate": 7.495067791286235e-06, + "loss": 3.3352, + "step": 36230 + }, + { + "epoch": 1.061744671500769, + "grad_norm": 17.414592742919922, + "learning_rate": 7.493697158918447e-06, + "loss": 3.3542, + "step": 36240 + }, + { + "epoch": 1.0620376474035012, + "grad_norm": 20.98406982421875, + "learning_rate": 7.492326277071052e-06, + "loss": 3.336, + "step": 36250 + }, + { + "epoch": 1.062330623306233, + "grad_norm": 19.437503814697266, + "learning_rate": 7.490955145881202e-06, + "loss": 3.3567, + "step": 36260 + }, + { + "epoch": 1.062623599208965, + "grad_norm": 17.463520050048828, + "learning_rate": 7.4895837654860645e-06, + "loss": 3.3519, + "step": 36270 + }, + { + "epoch": 1.0629165751116971, + "grad_norm": 20.05430793762207, + "learning_rate": 7.488212136022844e-06, + "loss": 3.3505, + "step": 36280 + }, + { + "epoch": 1.063209551014429, + "grad_norm": 17.301809310913086, + "learning_rate": 7.48684025762876e-06, + "loss": 3.3449, + "step": 36290 + }, + { + "epoch": 1.063502526917161, + "grad_norm": 18.856496810913086, + "learning_rate": 7.4854681304410636e-06, + "loss": 3.3328, + "step": 36300 + }, + { + "epoch": 1.063795502819893, + "grad_norm": 19.524782180786133, + "learning_rate": 7.484095754597025e-06, + "loss": 3.3335, + "step": 36310 + }, + { + "epoch": 1.064088478722625, + "grad_norm": 18.66658592224121, + "learning_rate": 7.482723130233944e-06, + "loss": 3.338, + "step": 36320 + }, + { + "epoch": 1.0643814546253572, + "grad_norm": 15.884167671203613, + "learning_rate": 7.481350257489145e-06, + "loss": 3.3612, + "step": 36330 + }, + { + "epoch": 1.064674430528089, + "grad_norm": 
19.698657989501953, + "learning_rate": 7.479977136499973e-06, + "loss": 3.3219, + "step": 36340 + }, + { + "epoch": 1.064967406430821, + "grad_norm": 19.638080596923828, + "learning_rate": 7.478603767403801e-06, + "loss": 3.3394, + "step": 36350 + }, + { + "epoch": 1.0650260016113675, + "eval_bleu": 0.33321479239266794, + "eval_cap_loss": 0.9570866823196411, + "eval_con_loss": 1.28947114944458, + "eval_loss": 3.5360288619995117, + "step": 36352 + }, + { + "epoch": 1.0650260016113675, + "eval_bleu": 0.33321479239266794, + "eval_cap_loss": 0.9570866823196411, + "eval_con_loss": 1.28947114944458, + "eval_loss": 3.5360288619995117, + "eval_runtime": 51.202, + "eval_samples_per_second": 390.61, + "eval_steps_per_second": 0.391, + "step": 36352 + }, + { + "epoch": 1.0652603823335531, + "grad_norm": 20.4095516204834, + "learning_rate": 7.4772301503380285e-06, + "loss": 3.3414, + "step": 36360 + }, + { + "epoch": 1.065553358236285, + "grad_norm": 17.45343589782715, + "learning_rate": 7.4758562854400755e-06, + "loss": 3.3608, + "step": 36370 + }, + { + "epoch": 1.065846334139017, + "grad_norm": 19.93836784362793, + "learning_rate": 7.474482172847391e-06, + "loss": 3.3548, + "step": 36380 + }, + { + "epoch": 1.066139310041749, + "grad_norm": 20.051942825317383, + "learning_rate": 7.473107812697442e-06, + "loss": 3.3307, + "step": 36390 + }, + { + "epoch": 1.066432285944481, + "grad_norm": 21.40591812133789, + "learning_rate": 7.471733205127733e-06, + "loss": 3.3356, + "step": 36400 + }, + { + "epoch": 1.0667252618472132, + "grad_norm": 18.732120513916016, + "learning_rate": 7.470358350275779e-06, + "loss": 3.3429, + "step": 36410 + }, + { + "epoch": 1.067018237749945, + "grad_norm": 20.487184524536133, + "learning_rate": 7.468983248279129e-06, + "loss": 3.3403, + "step": 36420 + }, + { + "epoch": 1.067311213652677, + "grad_norm": 15.0443754196167, + "learning_rate": 7.467607899275352e-06, + "loss": 3.3436, + "step": 36430 + }, + { + "epoch": 1.0676041895554091, + 
"grad_norm": 16.03028678894043, + "learning_rate": 7.466232303402046e-06, + "loss": 3.3304, + "step": 36440 + }, + { + "epoch": 1.067897165458141, + "grad_norm": 18.003795623779297, + "learning_rate": 7.464856460796829e-06, + "loss": 3.3472, + "step": 36450 + }, + { + "epoch": 1.068190141360873, + "grad_norm": 20.416091918945312, + "learning_rate": 7.463480371597347e-06, + "loss": 3.3432, + "step": 36460 + }, + { + "epoch": 1.0684831172636051, + "grad_norm": 24.008834838867188, + "learning_rate": 7.4621040359412675e-06, + "loss": 3.3454, + "step": 36470 + }, + { + "epoch": 1.068776093166337, + "grad_norm": 18.9836368560791, + "learning_rate": 7.460727453966287e-06, + "loss": 3.3246, + "step": 36480 + }, + { + "epoch": 1.069069069069069, + "grad_norm": 18.989730834960938, + "learning_rate": 7.459350625810124e-06, + "loss": 3.3414, + "step": 36490 + }, + { + "epoch": 1.069362044971801, + "grad_norm": 20.362789154052734, + "learning_rate": 7.457973551610523e-06, + "loss": 3.3315, + "step": 36500 + }, + { + "epoch": 1.069655020874533, + "grad_norm": 16.88204002380371, + "learning_rate": 7.4565962315052495e-06, + "loss": 3.3537, + "step": 36510 + }, + { + "epoch": 1.0699479967772652, + "grad_norm": 18.67716407775879, + "learning_rate": 7.455218665632098e-06, + "loss": 3.3398, + "step": 36520 + }, + { + "epoch": 1.070240972679997, + "grad_norm": 16.861858367919922, + "learning_rate": 7.453840854128884e-06, + "loss": 3.3617, + "step": 36530 + }, + { + "epoch": 1.070533948582729, + "grad_norm": 17.781545639038086, + "learning_rate": 7.452462797133453e-06, + "loss": 3.367, + "step": 36540 + }, + { + "epoch": 1.0708269244854611, + "grad_norm": 17.34689712524414, + "learning_rate": 7.451084494783668e-06, + "loss": 3.3472, + "step": 36550 + }, + { + "epoch": 1.071119900388193, + "grad_norm": 18.133285522460938, + "learning_rate": 7.44970594721742e-06, + "loss": 3.3491, + "step": 36560 + }, + { + "epoch": 1.0714128762909252, + "grad_norm": 17.948139190673828, + "learning_rate": 
7.448327154572628e-06, + "loss": 3.3552, + "step": 36570 + }, + { + "epoch": 1.071705852193657, + "grad_norm": 19.131656646728516, + "learning_rate": 7.446948116987227e-06, + "loss": 3.3206, + "step": 36580 + }, + { + "epoch": 1.071998828096389, + "grad_norm": 20.31876564025879, + "learning_rate": 7.445568834599186e-06, + "loss": 3.3293, + "step": 36590 + }, + { + "epoch": 1.0722918039991212, + "grad_norm": 19.38796615600586, + "learning_rate": 7.444189307546491e-06, + "loss": 3.3418, + "step": 36600 + }, + { + "epoch": 1.072584779901853, + "grad_norm": 16.627607345581055, + "learning_rate": 7.442809535967157e-06, + "loss": 3.3454, + "step": 36610 + }, + { + "epoch": 1.072877755804585, + "grad_norm": 18.684566497802734, + "learning_rate": 7.441429519999222e-06, + "loss": 3.3611, + "step": 36620 + }, + { + "epoch": 1.0731707317073171, + "grad_norm": 19.23566246032715, + "learning_rate": 7.440049259780748e-06, + "loss": 3.3253, + "step": 36630 + }, + { + "epoch": 1.073463707610049, + "grad_norm": 17.945642471313477, + "learning_rate": 7.438668755449822e-06, + "loss": 3.3237, + "step": 36640 + }, + { + "epoch": 1.073756683512781, + "grad_norm": 16.83313751220703, + "learning_rate": 7.437288007144554e-06, + "loss": 3.3373, + "step": 36650 + }, + { + "epoch": 1.0740496594155131, + "grad_norm": 20.981416702270508, + "learning_rate": 7.435907015003083e-06, + "loss": 3.3252, + "step": 36660 + }, + { + "epoch": 1.074342635318245, + "grad_norm": 17.718673706054688, + "learning_rate": 7.4345257791635665e-06, + "loss": 3.3066, + "step": 36670 + }, + { + "epoch": 1.0746356112209772, + "grad_norm": 21.161766052246094, + "learning_rate": 7.433144299764187e-06, + "loss": 3.3343, + "step": 36680 + }, + { + "epoch": 1.074928587123709, + "grad_norm": 20.84720802307129, + "learning_rate": 7.431762576943157e-06, + "loss": 3.3581, + "step": 36690 + }, + { + "epoch": 1.075221563026441, + "grad_norm": 18.618045806884766, + "learning_rate": 7.430380610838709e-06, + "loss": 3.3285, + 
"step": 36700 + }, + { + "epoch": 1.0755145389291731, + "grad_norm": 18.560277938842773, + "learning_rate": 7.4289984015890995e-06, + "loss": 3.3499, + "step": 36710 + }, + { + "epoch": 1.075807514831905, + "grad_norm": 15.868610382080078, + "learning_rate": 7.4276159493326095e-06, + "loss": 3.3537, + "step": 36720 + }, + { + "epoch": 1.076100490734637, + "grad_norm": 18.51148796081543, + "learning_rate": 7.426233254207547e-06, + "loss": 3.3512, + "step": 36730 + }, + { + "epoch": 1.0763934666373691, + "grad_norm": 18.39826011657715, + "learning_rate": 7.4248503163522425e-06, + "loss": 3.3298, + "step": 36740 + }, + { + "epoch": 1.076686442540101, + "grad_norm": 17.42144203186035, + "learning_rate": 7.423467135905049e-06, + "loss": 3.3391, + "step": 36750 + }, + { + "epoch": 1.0769794184428332, + "grad_norm": 16.287397384643555, + "learning_rate": 7.422083713004346e-06, + "loss": 3.3414, + "step": 36760 + }, + { + "epoch": 1.077272394345565, + "grad_norm": 15.153436660766602, + "learning_rate": 7.420700047788539e-06, + "loss": 3.3282, + "step": 36770 + }, + { + "epoch": 1.077565370248297, + "grad_norm": 18.307723999023438, + "learning_rate": 7.419316140396053e-06, + "loss": 3.3316, + "step": 36780 + }, + { + "epoch": 1.0778583461510292, + "grad_norm": 20.48295783996582, + "learning_rate": 7.417931990965341e-06, + "loss": 3.3299, + "step": 36790 + }, + { + "epoch": 1.078151322053761, + "grad_norm": 17.25931739807129, + "learning_rate": 7.416547599634878e-06, + "loss": 3.3498, + "step": 36800 + }, + { + "epoch": 1.078444297956493, + "grad_norm": 18.081539154052734, + "learning_rate": 7.415162966543164e-06, + "loss": 3.3554, + "step": 36810 + }, + { + "epoch": 1.0787372738592251, + "grad_norm": 19.587825775146484, + "learning_rate": 7.413778091828725e-06, + "loss": 3.3539, + "step": 36820 + }, + { + "epoch": 1.079030249761957, + "grad_norm": 17.16874122619629, + "learning_rate": 7.412392975630107e-06, + "loss": 3.3585, + "step": 36830 + }, + { + "epoch": 
1.0793232256646892, + "grad_norm": 17.484411239624023, + "learning_rate": 7.411007618085885e-06, + "loss": 3.3489, + "step": 36840 + }, + { + "epoch": 1.079616201567421, + "grad_norm": 16.405282974243164, + "learning_rate": 7.409622019334654e-06, + "loss": 3.3377, + "step": 36850 + }, + { + "epoch": 1.079909177470153, + "grad_norm": 19.033382415771484, + "learning_rate": 7.4082361795150355e-06, + "loss": 3.3393, + "step": 36860 + }, + { + "epoch": 1.0800263678312458, + "eval_bleu": 0.333615387446105, + "eval_cap_loss": 0.9566165804862976, + "eval_con_loss": 1.2879023551940918, + "eval_loss": 3.532421350479126, + "step": 36864 + }, + { + "epoch": 1.0800263678312458, + "eval_bleu": 0.333615387446105, + "eval_cap_loss": 0.9566165804862976, + "eval_con_loss": 1.2879023551940918, + "eval_loss": 3.532421350479126, + "eval_runtime": 52.2835, + "eval_samples_per_second": 382.53, + "eval_steps_per_second": 0.383, + "step": 36864 + }, + { + "epoch": 1.0802021533728852, + "grad_norm": 18.40435791015625, + "learning_rate": 7.4068500987656765e-06, + "loss": 3.3258, + "step": 36870 + }, + { + "epoch": 1.080495129275617, + "grad_norm": 15.80107307434082, + "learning_rate": 7.405463777225244e-06, + "loss": 3.3641, + "step": 36880 + }, + { + "epoch": 1.080788105178349, + "grad_norm": 19.45716094970703, + "learning_rate": 7.40407721503243e-06, + "loss": 3.3479, + "step": 36890 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 19.60926628112793, + "learning_rate": 7.402690412325955e-06, + "loss": 3.3481, + "step": 36900 + }, + { + "epoch": 1.081374056983813, + "grad_norm": 17.716554641723633, + "learning_rate": 7.401303369244559e-06, + "loss": 3.3685, + "step": 36910 + }, + { + "epoch": 1.081667032886545, + "grad_norm": 17.24224090576172, + "learning_rate": 7.3999160859270054e-06, + "loss": 3.3322, + "step": 36920 + }, + { + "epoch": 1.0819600087892771, + "grad_norm": 19.677734375, + "learning_rate": 7.398528562512089e-06, + "loss": 3.3475, + "step": 36930 + }, + { + "epoch": 
1.082252984692009, + "grad_norm": 18.099506378173828, + "learning_rate": 7.397140799138618e-06, + "loss": 3.3178, + "step": 36940 + }, + { + "epoch": 1.0825459605947412, + "grad_norm": 19.11260414123535, + "learning_rate": 7.395752795945432e-06, + "loss": 3.3455, + "step": 36950 + }, + { + "epoch": 1.082838936497473, + "grad_norm": 17.090864181518555, + "learning_rate": 7.394364553071392e-06, + "loss": 3.3617, + "step": 36960 + }, + { + "epoch": 1.083131912400205, + "grad_norm": 16.426485061645508, + "learning_rate": 7.392976070655384e-06, + "loss": 3.3727, + "step": 36970 + }, + { + "epoch": 1.0834248883029372, + "grad_norm": 23.187049865722656, + "learning_rate": 7.391587348836318e-06, + "loss": 3.3631, + "step": 36980 + }, + { + "epoch": 1.083717864205669, + "grad_norm": 16.79970359802246, + "learning_rate": 7.390198387753125e-06, + "loss": 3.3467, + "step": 36990 + }, + { + "epoch": 1.084010840108401, + "grad_norm": 19.386512756347656, + "learning_rate": 7.388809187544764e-06, + "loss": 3.3353, + "step": 37000 + }, + { + "epoch": 1.0843038160111331, + "grad_norm": 18.671693801879883, + "learning_rate": 7.3874197483502175e-06, + "loss": 3.3331, + "step": 37010 + }, + { + "epoch": 1.084596791913865, + "grad_norm": 21.45449447631836, + "learning_rate": 7.386030070308489e-06, + "loss": 3.3529, + "step": 37020 + }, + { + "epoch": 1.0848897678165972, + "grad_norm": 20.421438217163086, + "learning_rate": 7.384640153558606e-06, + "loss": 3.3441, + "step": 37030 + }, + { + "epoch": 1.085182743719329, + "grad_norm": 17.647212982177734, + "learning_rate": 7.383249998239624e-06, + "loss": 3.3494, + "step": 37040 + }, + { + "epoch": 1.085475719622061, + "grad_norm": 17.703529357910156, + "learning_rate": 7.38185960449062e-06, + "loss": 3.3412, + "step": 37050 + }, + { + "epoch": 1.0857686955247932, + "grad_norm": 20.975393295288086, + "learning_rate": 7.380468972450694e-06, + "loss": 3.3554, + "step": 37060 + }, + { + "epoch": 1.086061671427525, + "grad_norm": 
16.5207462310791, + "learning_rate": 7.37907810225897e-06, + "loss": 3.3553, + "step": 37070 + }, + { + "epoch": 1.086354647330257, + "grad_norm": 21.27530288696289, + "learning_rate": 7.3776869940545935e-06, + "loss": 3.3467, + "step": 37080 + }, + { + "epoch": 1.0866476232329891, + "grad_norm": 20.977697372436523, + "learning_rate": 7.376295647976743e-06, + "loss": 3.322, + "step": 37090 + }, + { + "epoch": 1.086940599135721, + "grad_norm": 22.42572021484375, + "learning_rate": 7.37490406416461e-06, + "loss": 3.3269, + "step": 37100 + }, + { + "epoch": 1.0872335750384532, + "grad_norm": 18.77800178527832, + "learning_rate": 7.373512242757414e-06, + "loss": 3.3592, + "step": 37110 + }, + { + "epoch": 1.087526550941185, + "grad_norm": 16.92587661743164, + "learning_rate": 7.372120183894401e-06, + "loss": 3.3439, + "step": 37120 + }, + { + "epoch": 1.087819526843917, + "grad_norm": 18.280254364013672, + "learning_rate": 7.370727887714837e-06, + "loss": 3.342, + "step": 37130 + }, + { + "epoch": 1.0881125027466492, + "grad_norm": 18.638525009155273, + "learning_rate": 7.369335354358013e-06, + "loss": 3.3362, + "step": 37140 + }, + { + "epoch": 1.088405478649381, + "grad_norm": 17.21963882446289, + "learning_rate": 7.3679425839632435e-06, + "loss": 3.3364, + "step": 37150 + }, + { + "epoch": 1.088698454552113, + "grad_norm": 19.972251892089844, + "learning_rate": 7.366549576669866e-06, + "loss": 3.3426, + "step": 37160 + }, + { + "epoch": 1.0889914304548451, + "grad_norm": 17.21381187438965, + "learning_rate": 7.365156332617244e-06, + "loss": 3.328, + "step": 37170 + }, + { + "epoch": 1.089284406357577, + "grad_norm": 18.347925186157227, + "learning_rate": 7.363762851944763e-06, + "loss": 3.3428, + "step": 37180 + }, + { + "epoch": 1.089577382260309, + "grad_norm": 17.351421356201172, + "learning_rate": 7.362369134791833e-06, + "loss": 3.331, + "step": 37190 + }, + { + "epoch": 1.0898703581630411, + "grad_norm": 17.141586303710938, + "learning_rate": 
7.360975181297885e-06, + "loss": 3.339, + "step": 37200 + }, + { + "epoch": 1.090163334065773, + "grad_norm": 17.82767677307129, + "learning_rate": 7.359580991602378e-06, + "loss": 3.3402, + "step": 37210 + }, + { + "epoch": 1.0904563099685052, + "grad_norm": 19.893905639648438, + "learning_rate": 7.358186565844793e-06, + "loss": 3.3557, + "step": 37220 + }, + { + "epoch": 1.090749285871237, + "grad_norm": 19.04560661315918, + "learning_rate": 7.356791904164631e-06, + "loss": 3.348, + "step": 37230 + }, + { + "epoch": 1.091042261773969, + "grad_norm": 15.87691593170166, + "learning_rate": 7.355397006701422e-06, + "loss": 3.3436, + "step": 37240 + }, + { + "epoch": 1.0913352376767012, + "grad_norm": 16.3347225189209, + "learning_rate": 7.354001873594716e-06, + "loss": 3.3251, + "step": 37250 + }, + { + "epoch": 1.091628213579433, + "grad_norm": 19.537885665893555, + "learning_rate": 7.35260650498409e-06, + "loss": 3.324, + "step": 37260 + }, + { + "epoch": 1.091921189482165, + "grad_norm": 15.312085151672363, + "learning_rate": 7.35121090100914e-06, + "loss": 3.3192, + "step": 37270 + }, + { + "epoch": 1.0922141653848971, + "grad_norm": 16.17685317993164, + "learning_rate": 7.349815061809489e-06, + "loss": 3.3318, + "step": 37280 + }, + { + "epoch": 1.092507141287629, + "grad_norm": 20.59050178527832, + "learning_rate": 7.34841898752478e-06, + "loss": 3.318, + "step": 37290 + }, + { + "epoch": 1.0928001171903612, + "grad_norm": 16.061643600463867, + "learning_rate": 7.347022678294687e-06, + "loss": 3.3533, + "step": 37300 + }, + { + "epoch": 1.093093093093093, + "grad_norm": 17.12520408630371, + "learning_rate": 7.345626134258897e-06, + "loss": 3.3457, + "step": 37310 + }, + { + "epoch": 1.093386068995825, + "grad_norm": 17.60364532470703, + "learning_rate": 7.34422935555713e-06, + "loss": 3.362, + "step": 37320 + }, + { + "epoch": 1.0936790448985572, + "grad_norm": 19.599456787109375, + "learning_rate": 7.342832342329122e-06, + "loss": 3.3509, + "step": 37330 + }, 
+ { + "epoch": 1.093972020801289, + "grad_norm": 19.36039924621582, + "learning_rate": 7.341435094714639e-06, + "loss": 3.3272, + "step": 37340 + }, + { + "epoch": 1.094264996704021, + "grad_norm": 17.12041473388672, + "learning_rate": 7.340037612853466e-06, + "loss": 3.3182, + "step": 37350 + }, + { + "epoch": 1.0945579726067531, + "grad_norm": 18.671287536621094, + "learning_rate": 7.338639896885413e-06, + "loss": 3.3131, + "step": 37360 + }, + { + "epoch": 1.094850948509485, + "grad_norm": 17.270963668823242, + "learning_rate": 7.337241946950312e-06, + "loss": 3.3565, + "step": 37370 + }, + { + "epoch": 1.0950267340511244, + "eval_bleu": 0.3334284919395401, + "eval_cap_loss": 0.9566031098365784, + "eval_con_loss": 1.2845962047576904, + "eval_loss": 3.5257954597473145, + "step": 37376 + }, + { + "epoch": 1.0950267340511244, + "eval_bleu": 0.3334284919395401, + "eval_cap_loss": 0.9566031098365784, + "eval_con_loss": 1.2845962047576904, + "eval_loss": 3.5257954597473145, + "eval_runtime": 52.5321, + "eval_samples_per_second": 380.72, + "eval_steps_per_second": 0.381, + "step": 37376 + }, + { + "epoch": 1.0951439244122172, + "grad_norm": 20.06707763671875, + "learning_rate": 7.335843763188021e-06, + "loss": 3.3344, + "step": 37380 + }, + { + "epoch": 1.0954369003149491, + "grad_norm": 21.120752334594727, + "learning_rate": 7.3344453457384205e-06, + "loss": 3.3502, + "step": 37390 + }, + { + "epoch": 1.095729876217681, + "grad_norm": 16.555164337158203, + "learning_rate": 7.333046694741412e-06, + "loss": 3.3193, + "step": 37400 + }, + { + "epoch": 1.0960228521204132, + "grad_norm": 17.57384490966797, + "learning_rate": 7.331647810336922e-06, + "loss": 3.3325, + "step": 37410 + }, + { + "epoch": 1.096315828023145, + "grad_norm": 18.182308197021484, + "learning_rate": 7.3302486926649045e-06, + "loss": 3.3312, + "step": 37420 + }, + { + "epoch": 1.096608803925877, + "grad_norm": 18.079206466674805, + "learning_rate": 7.32884934186533e-06, + "loss": 3.3259, + "step": 
37430 + }, + { + "epoch": 1.0969017798286091, + "grad_norm": 12.205955505371094, + "learning_rate": 7.327449758078194e-06, + "loss": 3.3463, + "step": 37440 + }, + { + "epoch": 1.097194755731341, + "grad_norm": 17.09302520751953, + "learning_rate": 7.326049941443517e-06, + "loss": 3.342, + "step": 37450 + }, + { + "epoch": 1.097487731634073, + "grad_norm": 17.259912490844727, + "learning_rate": 7.324649892101345e-06, + "loss": 3.3128, + "step": 37460 + }, + { + "epoch": 1.0977807075368051, + "grad_norm": 16.625329971313477, + "learning_rate": 7.323249610191743e-06, + "loss": 3.3331, + "step": 37470 + }, + { + "epoch": 1.098073683439537, + "grad_norm": 18.079097747802734, + "learning_rate": 7.321849095854801e-06, + "loss": 3.3254, + "step": 37480 + }, + { + "epoch": 1.0983666593422692, + "grad_norm": 19.568092346191406, + "learning_rate": 7.32044834923063e-06, + "loss": 3.3267, + "step": 37490 + }, + { + "epoch": 1.098659635245001, + "grad_norm": 14.497773170471191, + "learning_rate": 7.31904737045937e-06, + "loss": 3.3413, + "step": 37500 + }, + { + "epoch": 1.098952611147733, + "grad_norm": 17.445369720458984, + "learning_rate": 7.31764615968118e-06, + "loss": 3.3382, + "step": 37510 + }, + { + "epoch": 1.0992455870504652, + "grad_norm": 16.610734939575195, + "learning_rate": 7.31624471703624e-06, + "loss": 3.3529, + "step": 37520 + }, + { + "epoch": 1.099538562953197, + "grad_norm": 17.85201644897461, + "learning_rate": 7.314843042664759e-06, + "loss": 3.3323, + "step": 37530 + }, + { + "epoch": 1.0998315388559292, + "grad_norm": 19.929550170898438, + "learning_rate": 7.3134411367069645e-06, + "loss": 3.3415, + "step": 37540 + }, + { + "epoch": 1.1001245147586611, + "grad_norm": 18.183835983276367, + "learning_rate": 7.312038999303108e-06, + "loss": 3.332, + "step": 37550 + }, + { + "epoch": 1.100417490661393, + "grad_norm": 18.96469497680664, + "learning_rate": 7.3106366305934694e-06, + "loss": 3.3369, + "step": 37560 + }, + { + "epoch": 1.1007104665641252, + 
"grad_norm": 19.335058212280273, + "learning_rate": 7.309234030718343e-06, + "loss": 3.3315, + "step": 37570 + }, + { + "epoch": 1.101003442466857, + "grad_norm": 18.493600845336914, + "learning_rate": 7.307831199818052e-06, + "loss": 3.3083, + "step": 37580 + }, + { + "epoch": 1.101296418369589, + "grad_norm": 22.1346435546875, + "learning_rate": 7.306428138032941e-06, + "loss": 3.3032, + "step": 37590 + }, + { + "epoch": 1.1015893942723212, + "grad_norm": 21.593568801879883, + "learning_rate": 7.30502484550338e-06, + "loss": 3.3455, + "step": 37600 + }, + { + "epoch": 1.101882370175053, + "grad_norm": 19.12135124206543, + "learning_rate": 7.303621322369757e-06, + "loss": 3.3463, + "step": 37610 + }, + { + "epoch": 1.102175346077785, + "grad_norm": 16.79543113708496, + "learning_rate": 7.302217568772488e-06, + "loss": 3.3259, + "step": 37620 + }, + { + "epoch": 1.1024683219805171, + "grad_norm": 18.984712600708008, + "learning_rate": 7.30081358485201e-06, + "loss": 3.3156, + "step": 37630 + }, + { + "epoch": 1.102761297883249, + "grad_norm": 19.44772720336914, + "learning_rate": 7.299409370748782e-06, + "loss": 3.334, + "step": 37640 + }, + { + "epoch": 1.1030542737859812, + "grad_norm": 17.35723304748535, + "learning_rate": 7.2980049266032905e-06, + "loss": 3.3275, + "step": 37650 + }, + { + "epoch": 1.1033472496887131, + "grad_norm": 17.868938446044922, + "learning_rate": 7.2966002525560375e-06, + "loss": 3.3344, + "step": 37660 + }, + { + "epoch": 1.103640225591445, + "grad_norm": 19.06653594970703, + "learning_rate": 7.295195348747555e-06, + "loss": 3.3322, + "step": 37670 + }, + { + "epoch": 1.1039332014941772, + "grad_norm": 18.805675506591797, + "learning_rate": 7.293790215318397e-06, + "loss": 3.3723, + "step": 37680 + }, + { + "epoch": 1.104226177396909, + "grad_norm": 18.664060592651367, + "learning_rate": 7.292384852409134e-06, + "loss": 3.3131, + "step": 37690 + }, + { + "epoch": 1.104519153299641, + "grad_norm": 17.711902618408203, + "learning_rate": 
7.290979260160368e-06, + "loss": 3.3418, + "step": 37700 + }, + { + "epoch": 1.1048121292023731, + "grad_norm": 18.8179988861084, + "learning_rate": 7.289573438712718e-06, + "loss": 3.3351, + "step": 37710 + }, + { + "epoch": 1.105105105105105, + "grad_norm": 18.010629653930664, + "learning_rate": 7.288167388206831e-06, + "loss": 3.3079, + "step": 37720 + }, + { + "epoch": 1.1053980810078372, + "grad_norm": 17.84392547607422, + "learning_rate": 7.28676110878337e-06, + "loss": 3.3404, + "step": 37730 + }, + { + "epoch": 1.1056910569105691, + "grad_norm": 17.92461395263672, + "learning_rate": 7.2853546005830266e-06, + "loss": 3.3316, + "step": 37740 + }, + { + "epoch": 1.105984032813301, + "grad_norm": 18.337005615234375, + "learning_rate": 7.283947863746516e-06, + "loss": 3.3253, + "step": 37750 + }, + { + "epoch": 1.1062770087160332, + "grad_norm": 17.439226150512695, + "learning_rate": 7.282540898414571e-06, + "loss": 3.3605, + "step": 37760 + }, + { + "epoch": 1.106569984618765, + "grad_norm": 16.34686851501465, + "learning_rate": 7.281133704727951e-06, + "loss": 3.3188, + "step": 37770 + }, + { + "epoch": 1.106862960521497, + "grad_norm": 17.74872589111328, + "learning_rate": 7.279726282827436e-06, + "loss": 3.3427, + "step": 37780 + }, + { + "epoch": 1.1071559364242292, + "grad_norm": 15.54246997833252, + "learning_rate": 7.278318632853832e-06, + "loss": 3.3458, + "step": 37790 + }, + { + "epoch": 1.107448912326961, + "grad_norm": 20.553958892822266, + "learning_rate": 7.2769107549479675e-06, + "loss": 3.313, + "step": 37800 + }, + { + "epoch": 1.1077418882296932, + "grad_norm": 18.178590774536133, + "learning_rate": 7.275502649250689e-06, + "loss": 3.3431, + "step": 37810 + }, + { + "epoch": 1.1080348641324251, + "grad_norm": 17.225021362304688, + "learning_rate": 7.2740943159028695e-06, + "loss": 3.3425, + "step": 37820 + }, + { + "epoch": 1.108327840035157, + "grad_norm": 16.429174423217773, + "learning_rate": 7.272685755045408e-06, + "loss": 3.3429, + 
"step": 37830 + }, + { + "epoch": 1.1086208159378892, + "grad_norm": 17.760128021240234, + "learning_rate": 7.271276966819219e-06, + "loss": 3.3295, + "step": 37840 + }, + { + "epoch": 1.108913791840621, + "grad_norm": 18.77180290222168, + "learning_rate": 7.269867951365244e-06, + "loss": 3.3354, + "step": 37850 + }, + { + "epoch": 1.109206767743353, + "grad_norm": 18.45425033569336, + "learning_rate": 7.2684587088244465e-06, + "loss": 3.3125, + "step": 37860 + }, + { + "epoch": 1.1094997436460852, + "grad_norm": 16.960874557495117, + "learning_rate": 7.267049239337814e-06, + "loss": 3.3316, + "step": 37870 + }, + { + "epoch": 1.109792719548817, + "grad_norm": 18.430389404296875, + "learning_rate": 7.265639543046357e-06, + "loss": 3.3216, + "step": 37880 + }, + { + "epoch": 1.1100271002710027, + "eval_bleu": 0.3334318150466994, + "eval_cap_loss": 0.9557788968086243, + "eval_con_loss": 1.2850019931793213, + "eval_loss": 3.525782585144043, + "step": 37888 + }, + { + "epoch": 1.1100271002710027, + "eval_bleu": 0.3334318150466994, + "eval_cap_loss": 0.9557788968086243, + "eval_con_loss": 1.2850019931793213, + "eval_loss": 3.525782585144043, + "eval_runtime": 52.5262, + "eval_samples_per_second": 380.763, + "eval_steps_per_second": 0.381, + "step": 37888 + }, + { + "epoch": 1.110085695451549, + "grad_norm": 17.733976364135742, + "learning_rate": 7.264229620091104e-06, + "loss": 3.3463, + "step": 37890 + }, + { + "epoch": 1.1103786713542811, + "grad_norm": 20.646562576293945, + "learning_rate": 7.2628194706131086e-06, + "loss": 3.3602, + "step": 37900 + }, + { + "epoch": 1.110671647257013, + "grad_norm": 19.940444946289062, + "learning_rate": 7.261409094753451e-06, + "loss": 3.3176, + "step": 37910 + }, + { + "epoch": 1.1109646231597452, + "grad_norm": 20.42207908630371, + "learning_rate": 7.259998492653231e-06, + "loss": 3.3444, + "step": 37920 + }, + { + "epoch": 1.1112575990624771, + "grad_norm": 14.358384132385254, + "learning_rate": 7.2585876644535705e-06, + "loss": 
3.3358, + "step": 37930 + }, + { + "epoch": 1.111550574965209, + "grad_norm": 20.570837020874023, + "learning_rate": 7.257176610295613e-06, + "loss": 3.3246, + "step": 37940 + }, + { + "epoch": 1.1118435508679412, + "grad_norm": 22.00197982788086, + "learning_rate": 7.255765330320527e-06, + "loss": 3.3332, + "step": 37950 + }, + { + "epoch": 1.112136526770673, + "grad_norm": 16.24378776550293, + "learning_rate": 7.254494985385998e-06, + "loss": 3.339, + "step": 37960 + }, + { + "epoch": 1.112429502673405, + "grad_norm": 19.605199813842773, + "learning_rate": 7.253083276747366e-06, + "loss": 3.3593, + "step": 37970 + }, + { + "epoch": 1.1127224785761372, + "grad_norm": 18.793472290039062, + "learning_rate": 7.251671342701118e-06, + "loss": 3.3427, + "step": 37980 + }, + { + "epoch": 1.113015454478869, + "grad_norm": 14.334705352783203, + "learning_rate": 7.250259183388512e-06, + "loss": 3.3638, + "step": 37990 + }, + { + "epoch": 1.1133084303816012, + "grad_norm": 21.72418212890625, + "learning_rate": 7.248846798950825e-06, + "loss": 3.3232, + "step": 38000 + }, + { + "epoch": 1.1136014062843331, + "grad_norm": 17.338104248046875, + "learning_rate": 7.247434189529358e-06, + "loss": 3.3395, + "step": 38010 + }, + { + "epoch": 1.113894382187065, + "grad_norm": 16.147491455078125, + "learning_rate": 7.246021355265432e-06, + "loss": 3.3138, + "step": 38020 + }, + { + "epoch": 1.1141873580897972, + "grad_norm": 19.68994140625, + "learning_rate": 7.244608296300396e-06, + "loss": 3.3281, + "step": 38030 + }, + { + "epoch": 1.114480333992529, + "grad_norm": 17.661779403686523, + "learning_rate": 7.243195012775617e-06, + "loss": 3.3379, + "step": 38040 + }, + { + "epoch": 1.114773309895261, + "grad_norm": 19.68119239807129, + "learning_rate": 7.241781504832486e-06, + "loss": 3.3283, + "step": 38050 + }, + { + "epoch": 1.1150662857979932, + "grad_norm": 18.162817001342773, + "learning_rate": 7.240367772612413e-06, + "loss": 3.3227, + "step": 38060 + }, + { + "epoch": 
1.115359261700725, + "grad_norm": 21.64845085144043, + "learning_rate": 7.238953816256839e-06, + "loss": 3.3241, + "step": 38070 + }, + { + "epoch": 1.1156522376034572, + "grad_norm": 15.09078598022461, + "learning_rate": 7.237539635907217e-06, + "loss": 3.3231, + "step": 38080 + }, + { + "epoch": 1.1159452135061891, + "grad_norm": 18.291955947875977, + "learning_rate": 7.23612523170503e-06, + "loss": 3.3117, + "step": 38090 + }, + { + "epoch": 1.116238189408921, + "grad_norm": 19.198806762695312, + "learning_rate": 7.234710603791779e-06, + "loss": 3.3294, + "step": 38100 + }, + { + "epoch": 1.1165311653116532, + "grad_norm": 20.317277908325195, + "learning_rate": 7.233295752308992e-06, + "loss": 3.3389, + "step": 38110 + }, + { + "epoch": 1.1168241412143851, + "grad_norm": 18.48377799987793, + "learning_rate": 7.231880677398214e-06, + "loss": 3.317, + "step": 38120 + }, + { + "epoch": 1.117117117117117, + "grad_norm": 19.020492553710938, + "learning_rate": 7.230465379201015e-06, + "loss": 3.3424, + "step": 38130 + }, + { + "epoch": 1.1174100930198492, + "grad_norm": 21.24330711364746, + "learning_rate": 7.229049857858988e-06, + "loss": 3.3061, + "step": 38140 + }, + { + "epoch": 1.117703068922581, + "grad_norm": 19.46956443786621, + "learning_rate": 7.227634113513747e-06, + "loss": 3.3563, + "step": 38150 + }, + { + "epoch": 1.117996044825313, + "grad_norm": 15.503252029418945, + "learning_rate": 7.226218146306929e-06, + "loss": 3.3161, + "step": 38160 + }, + { + "epoch": 1.1182890207280451, + "grad_norm": 17.12249755859375, + "learning_rate": 7.224801956380194e-06, + "loss": 3.3435, + "step": 38170 + }, + { + "epoch": 1.118581996630777, + "grad_norm": 19.513729095458984, + "learning_rate": 7.223385543875221e-06, + "loss": 3.3306, + "step": 38180 + }, + { + "epoch": 1.1188749725335092, + "grad_norm": 18.875566482543945, + "learning_rate": 7.2219689089337165e-06, + "loss": 3.3296, + "step": 38190 + }, + { + "epoch": 1.1191679484362411, + "grad_norm": 
19.783138275146484, + "learning_rate": 7.220552051697405e-06, + "loss": 3.3265, + "step": 38200 + }, + { + "epoch": 1.119460924338973, + "grad_norm": 19.24566078186035, + "learning_rate": 7.219134972308035e-06, + "loss": 3.3445, + "step": 38210 + }, + { + "epoch": 1.1197539002417052, + "grad_norm": 18.48334503173828, + "learning_rate": 7.217717670907376e-06, + "loss": 3.3507, + "step": 38220 + }, + { + "epoch": 1.120046876144437, + "grad_norm": 14.632134437561035, + "learning_rate": 7.216300147637222e-06, + "loss": 3.3307, + "step": 38230 + }, + { + "epoch": 1.1203398520471692, + "grad_norm": 17.077560424804688, + "learning_rate": 7.214882402639386e-06, + "loss": 3.3422, + "step": 38240 + }, + { + "epoch": 1.1206328279499012, + "grad_norm": 18.71262550354004, + "learning_rate": 7.2134644360557075e-06, + "loss": 3.3424, + "step": 38250 + }, + { + "epoch": 1.120925803852633, + "grad_norm": 18.8005313873291, + "learning_rate": 7.2120462480280425e-06, + "loss": 3.3301, + "step": 38260 + }, + { + "epoch": 1.1212187797553652, + "grad_norm": 17.21078872680664, + "learning_rate": 7.210627838698274e-06, + "loss": 3.3577, + "step": 38270 + }, + { + "epoch": 1.1215117556580971, + "grad_norm": 20.39728355407715, + "learning_rate": 7.209209208208307e-06, + "loss": 3.3007, + "step": 38280 + }, + { + "epoch": 1.121804731560829, + "grad_norm": 17.911457061767578, + "learning_rate": 7.207790356700066e-06, + "loss": 3.333, + "step": 38290 + }, + { + "epoch": 1.1220977074635612, + "grad_norm": 16.1473445892334, + "learning_rate": 7.2063712843154975e-06, + "loss": 3.3359, + "step": 38300 + }, + { + "epoch": 1.122390683366293, + "grad_norm": 19.131954193115234, + "learning_rate": 7.204951991196571e-06, + "loss": 3.2981, + "step": 38310 + }, + { + "epoch": 1.122683659269025, + "grad_norm": 15.536737442016602, + "learning_rate": 7.20353247748528e-06, + "loss": 3.3304, + "step": 38320 + }, + { + "epoch": 1.1229766351717572, + "grad_norm": 20.175432205200195, + "learning_rate": 
7.202112743323637e-06, + "loss": 3.3373, + "step": 38330 + }, + { + "epoch": 1.123269611074489, + "grad_norm": 17.88945198059082, + "learning_rate": 7.2006927888536805e-06, + "loss": 3.3316, + "step": 38340 + }, + { + "epoch": 1.1235625869772212, + "grad_norm": 18.489238739013672, + "learning_rate": 7.199272614217465e-06, + "loss": 3.3056, + "step": 38350 + }, + { + "epoch": 1.1238555628799531, + "grad_norm": 18.116178512573242, + "learning_rate": 7.197852219557073e-06, + "loss": 3.3256, + "step": 38360 + }, + { + "epoch": 1.124148538782685, + "grad_norm": 17.165624618530273, + "learning_rate": 7.196431605014606e-06, + "loss": 3.3296, + "step": 38370 + }, + { + "epoch": 1.1244415146854172, + "grad_norm": 17.105070114135742, + "learning_rate": 7.1950107707321884e-06, + "loss": 3.3133, + "step": 38380 + }, + { + "epoch": 1.1247344905881491, + "grad_norm": 16.84476089477539, + "learning_rate": 7.193589716851963e-06, + "loss": 3.3062, + "step": 38390 + }, + { + "epoch": 1.125027466490881, + "grad_norm": 18.35101318359375, + "learning_rate": 7.1921684435161034e-06, + "loss": 3.3099, + "step": 38400 + }, + { + "epoch": 1.125027466490881, + "eval_bleu": 0.3340683155751731, + "eval_cap_loss": 0.9552081823348999, + "eval_con_loss": 1.281868815422058, + "eval_loss": 3.5189456939697266, + "step": 38400 + }, + { + "epoch": 1.125027466490881, + "eval_bleu": 0.3340683155751731, + "eval_cap_loss": 0.9552081823348999, + "eval_con_loss": 1.281868815422058, + "eval_loss": 3.5189456939697266, + "eval_runtime": 52.5428, + "eval_samples_per_second": 380.642, + "eval_steps_per_second": 0.381, + "step": 38400 + }, + { + "epoch": 1.1253204423936132, + "grad_norm": 19.884462356567383, + "learning_rate": 7.190746950866796e-06, + "loss": 3.2936, + "step": 38410 + }, + { + "epoch": 1.125613418296345, + "grad_norm": 16.896360397338867, + "learning_rate": 7.1893252390462545e-06, + "loss": 3.3251, + "step": 38420 + }, + { + "epoch": 1.125906394199077, + "grad_norm": 15.690694808959961, + 
"learning_rate": 7.187903308196709e-06, + "loss": 3.3139, + "step": 38430 + }, + { + "epoch": 1.1261993701018091, + "grad_norm": 15.34272289276123, + "learning_rate": 7.1864811584604185e-06, + "loss": 3.3331, + "step": 38440 + }, + { + "epoch": 1.126492346004541, + "grad_norm": 18.535125732421875, + "learning_rate": 7.185058789979662e-06, + "loss": 3.3224, + "step": 38450 + }, + { + "epoch": 1.1267853219072732, + "grad_norm": 18.401269912719727, + "learning_rate": 7.183636202896737e-06, + "loss": 3.3387, + "step": 38460 + }, + { + "epoch": 1.1270782978100051, + "grad_norm": 18.905824661254883, + "learning_rate": 7.182213397353962e-06, + "loss": 3.3323, + "step": 38470 + }, + { + "epoch": 1.127371273712737, + "grad_norm": 15.417428016662598, + "learning_rate": 7.180790373493686e-06, + "loss": 3.3337, + "step": 38480 + }, + { + "epoch": 1.1276642496154692, + "grad_norm": 18.19550895690918, + "learning_rate": 7.17936713145827e-06, + "loss": 3.3375, + "step": 38490 + }, + { + "epoch": 1.127957225518201, + "grad_norm": 16.392681121826172, + "learning_rate": 7.1779436713901045e-06, + "loss": 3.3455, + "step": 38500 + }, + { + "epoch": 1.1282502014209332, + "grad_norm": 14.740063667297363, + "learning_rate": 7.176519993431592e-06, + "loss": 3.3193, + "step": 38510 + }, + { + "epoch": 1.1285431773236652, + "grad_norm": 17.760122299194336, + "learning_rate": 7.175096097725169e-06, + "loss": 3.3322, + "step": 38520 + }, + { + "epoch": 1.128836153226397, + "grad_norm": 16.422306060791016, + "learning_rate": 7.173671984413287e-06, + "loss": 3.3293, + "step": 38530 + }, + { + "epoch": 1.1291291291291292, + "grad_norm": 17.75336456298828, + "learning_rate": 7.172247653638418e-06, + "loss": 3.3247, + "step": 38540 + }, + { + "epoch": 1.1294221050318611, + "grad_norm": 14.914653778076172, + "learning_rate": 7.1708231055430585e-06, + "loss": 3.3117, + "step": 38550 + }, + { + "epoch": 1.129715080934593, + "grad_norm": 16.69765853881836, + "learning_rate": 7.1693983402697265e-06, + 
"loss": 3.3218, + "step": 38560 + }, + { + "epoch": 1.1300080568373252, + "grad_norm": 18.18131446838379, + "learning_rate": 7.167973357960962e-06, + "loss": 3.3082, + "step": 38570 + }, + { + "epoch": 1.130301032740057, + "grad_norm": 17.76541519165039, + "learning_rate": 7.166548158759325e-06, + "loss": 3.3259, + "step": 38580 + }, + { + "epoch": 1.130594008642789, + "grad_norm": 20.512060165405273, + "learning_rate": 7.165122742807397e-06, + "loss": 3.34, + "step": 38590 + }, + { + "epoch": 1.1308869845455212, + "grad_norm": 16.360416412353516, + "learning_rate": 7.163697110247783e-06, + "loss": 3.3154, + "step": 38600 + }, + { + "epoch": 1.131179960448253, + "grad_norm": 20.934017181396484, + "learning_rate": 7.162271261223112e-06, + "loss": 3.3065, + "step": 38610 + }, + { + "epoch": 1.1314729363509852, + "grad_norm": 20.316009521484375, + "learning_rate": 7.160845195876029e-06, + "loss": 3.3471, + "step": 38620 + }, + { + "epoch": 1.1317659122537171, + "grad_norm": 17.654447555541992, + "learning_rate": 7.159418914349204e-06, + "loss": 3.3308, + "step": 38630 + }, + { + "epoch": 1.132058888156449, + "grad_norm": 17.632490158081055, + "learning_rate": 7.157992416785328e-06, + "loss": 3.3086, + "step": 38640 + }, + { + "epoch": 1.1323518640591812, + "grad_norm": 19.7943115234375, + "learning_rate": 7.156565703327114e-06, + "loss": 3.3243, + "step": 38650 + }, + { + "epoch": 1.1326448399619131, + "grad_norm": 21.49497413635254, + "learning_rate": 7.155138774117296e-06, + "loss": 3.3277, + "step": 38660 + }, + { + "epoch": 1.132937815864645, + "grad_norm": 19.20389747619629, + "learning_rate": 7.1537116292986295e-06, + "loss": 3.3261, + "step": 38670 + }, + { + "epoch": 1.1332307917673772, + "grad_norm": 19.02892303466797, + "learning_rate": 7.152284269013892e-06, + "loss": 3.3099, + "step": 38680 + }, + { + "epoch": 1.133523767670109, + "grad_norm": 19.919904708862305, + "learning_rate": 7.150856693405881e-06, + "loss": 3.3117, + "step": 38690 + }, + { + 
"epoch": 1.133816743572841, + "grad_norm": 17.8795166015625, + "learning_rate": 7.14942890261742e-06, + "loss": 3.3317, + "step": 38700 + }, + { + "epoch": 1.1341097194755732, + "grad_norm": 16.070690155029297, + "learning_rate": 7.14800089679135e-06, + "loss": 3.3248, + "step": 38710 + }, + { + "epoch": 1.134402695378305, + "grad_norm": 20.470508575439453, + "learning_rate": 7.146572676070532e-06, + "loss": 3.315, + "step": 38720 + }, + { + "epoch": 1.1346956712810372, + "grad_norm": 23.15511131286621, + "learning_rate": 7.1451442405978545e-06, + "loss": 3.3488, + "step": 38730 + }, + { + "epoch": 1.1349886471837691, + "grad_norm": 20.39939308166504, + "learning_rate": 7.1437155905162224e-06, + "loss": 3.3043, + "step": 38740 + }, + { + "epoch": 1.135281623086501, + "grad_norm": 18.315322875976562, + "learning_rate": 7.142286725968564e-06, + "loss": 3.3331, + "step": 38750 + }, + { + "epoch": 1.1355745989892332, + "grad_norm": 18.945730209350586, + "learning_rate": 7.140857647097825e-06, + "loss": 3.3335, + "step": 38760 + }, + { + "epoch": 1.135867574891965, + "grad_norm": 18.31247329711914, + "learning_rate": 7.139428354046983e-06, + "loss": 3.3315, + "step": 38770 + }, + { + "epoch": 1.1361605507946972, + "grad_norm": 18.423263549804688, + "learning_rate": 7.137998846959026e-06, + "loss": 3.3233, + "step": 38780 + }, + { + "epoch": 1.1364535266974292, + "grad_norm": 18.229602813720703, + "learning_rate": 7.13656912597697e-06, + "loss": 3.3261, + "step": 38790 + }, + { + "epoch": 1.136746502600161, + "grad_norm": 17.708213806152344, + "learning_rate": 7.135139191243847e-06, + "loss": 3.3254, + "step": 38800 + }, + { + "epoch": 1.1370394785028932, + "grad_norm": 19.105056762695312, + "learning_rate": 7.1337090429027164e-06, + "loss": 3.3275, + "step": 38810 + }, + { + "epoch": 1.1373324544056251, + "grad_norm": 20.880502700805664, + "learning_rate": 7.132278681096655e-06, + "loss": 3.3245, + "step": 38820 + }, + { + "epoch": 1.137625430308357, + "grad_norm": 
19.463207244873047, + "learning_rate": 7.130848105968762e-06, + "loss": 3.3489, + "step": 38830 + }, + { + "epoch": 1.1379184062110892, + "grad_norm": 13.776983261108398, + "learning_rate": 7.129417317662156e-06, + "loss": 3.3161, + "step": 38840 + }, + { + "epoch": 1.1382113821138211, + "grad_norm": 18.634994506835938, + "learning_rate": 7.127986316319983e-06, + "loss": 3.3444, + "step": 38850 + }, + { + "epoch": 1.138504358016553, + "grad_norm": 15.940793991088867, + "learning_rate": 7.126555102085403e-06, + "loss": 3.3201, + "step": 38860 + }, + { + "epoch": 1.1387973339192852, + "grad_norm": 15.280556678771973, + "learning_rate": 7.125123675101602e-06, + "loss": 3.3207, + "step": 38870 + }, + { + "epoch": 1.139090309822017, + "grad_norm": 16.3870792388916, + "learning_rate": 7.123692035511783e-06, + "loss": 3.3116, + "step": 38880 + }, + { + "epoch": 1.1393832857247492, + "grad_norm": 17.961681365966797, + "learning_rate": 7.122260183459178e-06, + "loss": 3.3328, + "step": 38890 + }, + { + "epoch": 1.1396762616274811, + "grad_norm": 15.37101936340332, + "learning_rate": 7.120828119087031e-06, + "loss": 3.3339, + "step": 38900 + }, + { + "epoch": 1.139969237530213, + "grad_norm": 19.444564819335938, + "learning_rate": 7.119395842538615e-06, + "loss": 3.3429, + "step": 38910 + }, + { + "epoch": 1.1400278327107596, + "eval_bleu": 0.33412587633024043, + "eval_cap_loss": 0.9544479846954346, + "eval_con_loss": 1.2771906852722168, + "eval_loss": 3.508829116821289, + "step": 38912 + }, + { + "epoch": 1.1400278327107596, + "eval_bleu": 0.33412587633024043, + "eval_cap_loss": 0.9544479846954346, + "eval_con_loss": 1.2771906852722168, + "eval_loss": 3.508829116821289, + "eval_runtime": 52.7452, + "eval_samples_per_second": 379.181, + "eval_steps_per_second": 0.379, + "step": 38912 + }, + { + "epoch": 1.1402622134329452, + "grad_norm": 19.883134841918945, + "learning_rate": 7.1179633539572165e-06, + "loss": 3.3334, + "step": 38920 + }, + { + "epoch": 1.1405551893356771, + 
"grad_norm": 18.49119758605957, + "learning_rate": 7.116530653486151e-06, + "loss": 3.3086, + "step": 38930 + }, + { + "epoch": 1.1408481652384093, + "grad_norm": 17.83989143371582, + "learning_rate": 7.115097741268751e-06, + "loss": 3.3362, + "step": 38940 + }, + { + "epoch": 1.1411411411411412, + "grad_norm": 16.086877822875977, + "learning_rate": 7.11366461744837e-06, + "loss": 3.3205, + "step": 38950 + }, + { + "epoch": 1.141434117043873, + "grad_norm": 17.11916732788086, + "learning_rate": 7.1122312821683826e-06, + "loss": 3.3247, + "step": 38960 + }, + { + "epoch": 1.1417270929466052, + "grad_norm": 16.958219528198242, + "learning_rate": 7.110797735572188e-06, + "loss": 3.3154, + "step": 38970 + }, + { + "epoch": 1.1420200688493372, + "grad_norm": 17.941871643066406, + "learning_rate": 7.109363977803204e-06, + "loss": 3.3141, + "step": 38980 + }, + { + "epoch": 1.142313044752069, + "grad_norm": 17.001983642578125, + "learning_rate": 7.107930009004867e-06, + "loss": 3.2893, + "step": 38990 + }, + { + "epoch": 1.1426060206548012, + "grad_norm": 16.088111877441406, + "learning_rate": 7.106495829320639e-06, + "loss": 3.3008, + "step": 39000 + }, + { + "epoch": 1.1428989965575331, + "grad_norm": 17.10287094116211, + "learning_rate": 7.105061438894001e-06, + "loss": 3.3132, + "step": 39010 + }, + { + "epoch": 1.143191972460265, + "grad_norm": 16.587482452392578, + "learning_rate": 7.103626837868455e-06, + "loss": 3.3299, + "step": 39020 + }, + { + "epoch": 1.1434849483629972, + "grad_norm": 14.258821487426758, + "learning_rate": 7.102192026387524e-06, + "loss": 3.3257, + "step": 39030 + }, + { + "epoch": 1.143777924265729, + "grad_norm": 18.910335540771484, + "learning_rate": 7.100757004594753e-06, + "loss": 3.3278, + "step": 39040 + }, + { + "epoch": 1.1440709001684612, + "grad_norm": 15.763808250427246, + "learning_rate": 7.099321772633706e-06, + "loss": 3.3222, + "step": 39050 + }, + { + "epoch": 1.1443638760711932, + "grad_norm": 16.027652740478516, + 
"learning_rate": 7.097886330647971e-06, + "loss": 3.3313, + "step": 39060 + }, + { + "epoch": 1.144656851973925, + "grad_norm": 15.955085754394531, + "learning_rate": 7.096450678781156e-06, + "loss": 3.3424, + "step": 39070 + }, + { + "epoch": 1.1449498278766572, + "grad_norm": 19.155364990234375, + "learning_rate": 7.0950148171768864e-06, + "loss": 3.3138, + "step": 39080 + }, + { + "epoch": 1.1452428037793891, + "grad_norm": 19.38562774658203, + "learning_rate": 7.093578745978816e-06, + "loss": 3.3284, + "step": 39090 + }, + { + "epoch": 1.145535779682121, + "grad_norm": 13.635222434997559, + "learning_rate": 7.092142465330612e-06, + "loss": 3.3119, + "step": 39100 + }, + { + "epoch": 1.1458287555848532, + "grad_norm": 18.32695198059082, + "learning_rate": 7.0907059753759665e-06, + "loss": 3.3397, + "step": 39110 + }, + { + "epoch": 1.1461217314875851, + "grad_norm": 17.796649932861328, + "learning_rate": 7.08926927625859e-06, + "loss": 3.3219, + "step": 39120 + }, + { + "epoch": 1.146414707390317, + "grad_norm": 18.72443389892578, + "learning_rate": 7.087832368122219e-06, + "loss": 3.3272, + "step": 39130 + }, + { + "epoch": 1.1467076832930492, + "grad_norm": 18.64318084716797, + "learning_rate": 7.0863952511106075e-06, + "loss": 3.3195, + "step": 39140 + }, + { + "epoch": 1.147000659195781, + "grad_norm": 18.380603790283203, + "learning_rate": 7.084957925367528e-06, + "loss": 3.3242, + "step": 39150 + }, + { + "epoch": 1.1472936350985132, + "grad_norm": 22.85987663269043, + "learning_rate": 7.083520391036778e-06, + "loss": 3.3476, + "step": 39160 + }, + { + "epoch": 1.1475866110012451, + "grad_norm": 18.993438720703125, + "learning_rate": 7.082082648262174e-06, + "loss": 3.3414, + "step": 39170 + }, + { + "epoch": 1.147879586903977, + "grad_norm": 16.985153198242188, + "learning_rate": 7.0806446971875545e-06, + "loss": 3.3338, + "step": 39180 + }, + { + "epoch": 1.1481725628067092, + "grad_norm": 18.9093017578125, + "learning_rate": 7.079206537956776e-06, + 
"loss": 3.3162, + "step": 39190 + }, + { + "epoch": 1.1484655387094411, + "grad_norm": 21.323936462402344, + "learning_rate": 7.077768170713718e-06, + "loss": 3.3293, + "step": 39200 + }, + { + "epoch": 1.1487585146121733, + "grad_norm": 17.492507934570312, + "learning_rate": 7.076329595602283e-06, + "loss": 3.2936, + "step": 39210 + }, + { + "epoch": 1.1490514905149052, + "grad_norm": 19.73670768737793, + "learning_rate": 7.07489081276639e-06, + "loss": 3.3278, + "step": 39220 + }, + { + "epoch": 1.149344466417637, + "grad_norm": 18.58633041381836, + "learning_rate": 7.073451822349981e-06, + "loss": 3.3276, + "step": 39230 + }, + { + "epoch": 1.1496374423203692, + "grad_norm": 19.370681762695312, + "learning_rate": 7.072012624497018e-06, + "loss": 3.3487, + "step": 39240 + }, + { + "epoch": 1.1499304182231012, + "grad_norm": 21.584285736083984, + "learning_rate": 7.0705732193514865e-06, + "loss": 3.319, + "step": 39250 + }, + { + "epoch": 1.150223394125833, + "grad_norm": 17.817094802856445, + "learning_rate": 7.069133607057388e-06, + "loss": 3.3168, + "step": 39260 + }, + { + "epoch": 1.1505163700285652, + "grad_norm": 19.901596069335938, + "learning_rate": 7.067693787758747e-06, + "loss": 3.3109, + "step": 39270 + }, + { + "epoch": 1.1508093459312971, + "grad_norm": 17.345796585083008, + "learning_rate": 7.066253761599609e-06, + "loss": 3.335, + "step": 39280 + }, + { + "epoch": 1.151102321834029, + "grad_norm": 17.831867218017578, + "learning_rate": 7.0648135287240425e-06, + "loss": 3.3229, + "step": 39290 + }, + { + "epoch": 1.1513952977367612, + "grad_norm": 19.2534122467041, + "learning_rate": 7.063373089276132e-06, + "loss": 3.3209, + "step": 39300 + }, + { + "epoch": 1.151688273639493, + "grad_norm": 18.197145462036133, + "learning_rate": 7.061932443399984e-06, + "loss": 3.3253, + "step": 39310 + }, + { + "epoch": 1.1519812495422252, + "grad_norm": 18.836538314819336, + "learning_rate": 7.060491591239731e-06, + "loss": 3.3293, + "step": 39320 + }, + { + 
"epoch": 1.1522742254449572, + "grad_norm": 18.515016555786133, + "learning_rate": 7.059050532939517e-06, + "loss": 3.3298, + "step": 39330 + }, + { + "epoch": 1.152567201347689, + "grad_norm": 20.271406173706055, + "learning_rate": 7.057609268643513e-06, + "loss": 3.308, + "step": 39340 + }, + { + "epoch": 1.1528601772504212, + "grad_norm": 19.593198776245117, + "learning_rate": 7.05616779849591e-06, + "loss": 3.2919, + "step": 39350 + }, + { + "epoch": 1.1531531531531531, + "grad_norm": 14.93291187286377, + "learning_rate": 7.054726122640918e-06, + "loss": 3.3064, + "step": 39360 + }, + { + "epoch": 1.153446129055885, + "grad_norm": 17.17276954650879, + "learning_rate": 7.053284241222765e-06, + "loss": 3.322, + "step": 39370 + }, + { + "epoch": 1.1537391049586172, + "grad_norm": 17.10282325744629, + "learning_rate": 7.0518421543857085e-06, + "loss": 3.3195, + "step": 39380 + }, + { + "epoch": 1.1540320808613491, + "grad_norm": 17.719053268432617, + "learning_rate": 7.050399862274015e-06, + "loss": 3.3088, + "step": 39390 + }, + { + "epoch": 1.154325056764081, + "grad_norm": 18.000043869018555, + "learning_rate": 7.048957365031983e-06, + "loss": 3.3121, + "step": 39400 + }, + { + "epoch": 1.1546180326668132, + "grad_norm": 20.758928298950195, + "learning_rate": 7.047514662803919e-06, + "loss": 3.3299, + "step": 39410 + }, + { + "epoch": 1.154911008569545, + "grad_norm": 16.86815071105957, + "learning_rate": 7.046071755734163e-06, + "loss": 3.3141, + "step": 39420 + }, + { + "epoch": 1.155028198930638, + "eval_bleu": 0.33422729472056045, + "eval_cap_loss": 0.9543795585632324, + "eval_con_loss": 1.2772746086120605, + "eval_loss": 3.5089285373687744, + "step": 39424 + }, + { + "epoch": 1.155028198930638, + "eval_bleu": 0.33422729472056045, + "eval_cap_loss": 0.9543795585632324, + "eval_con_loss": 1.2772746086120605, + "eval_loss": 3.5089285373687744, + "eval_runtime": 56.2363, + "eval_samples_per_second": 355.642, + "eval_steps_per_second": 0.356, + "step": 39424 + 
}, + { + "epoch": 1.1552039844722772, + "grad_norm": 15.231229782104492, + "learning_rate": 7.044628643967066e-06, + "loss": 3.3345, + "step": 39430 + }, + { + "epoch": 1.1554969603750092, + "grad_norm": 16.0593318939209, + "learning_rate": 7.043185327647004e-06, + "loss": 3.3219, + "step": 39440 + }, + { + "epoch": 1.155789936277741, + "grad_norm": 19.4714298248291, + "learning_rate": 7.041741806918372e-06, + "loss": 3.2971, + "step": 39450 + }, + { + "epoch": 1.1560829121804732, + "grad_norm": 14.796954154968262, + "learning_rate": 7.040298081925585e-06, + "loss": 3.3169, + "step": 39460 + }, + { + "epoch": 1.1563758880832051, + "grad_norm": 14.494242668151855, + "learning_rate": 7.038854152813079e-06, + "loss": 3.308, + "step": 39470 + }, + { + "epoch": 1.1566688639859373, + "grad_norm": 15.204977989196777, + "learning_rate": 7.037410019725312e-06, + "loss": 3.3234, + "step": 39480 + }, + { + "epoch": 1.1569618398886692, + "grad_norm": 16.744937896728516, + "learning_rate": 7.035965682806759e-06, + "loss": 3.3202, + "step": 39490 + }, + { + "epoch": 1.157254815791401, + "grad_norm": 16.16558074951172, + "learning_rate": 7.034521142201919e-06, + "loss": 3.3221, + "step": 39500 + }, + { + "epoch": 1.1575477916941332, + "grad_norm": 19.9268741607666, + "learning_rate": 7.033076398055309e-06, + "loss": 3.3293, + "step": 39510 + }, + { + "epoch": 1.1578407675968652, + "grad_norm": 18.412107467651367, + "learning_rate": 7.031631450511468e-06, + "loss": 3.3216, + "step": 39520 + }, + { + "epoch": 1.158133743499597, + "grad_norm": 19.607479095458984, + "learning_rate": 7.030186299714953e-06, + "loss": 3.293, + "step": 39530 + }, + { + "epoch": 1.1584267194023292, + "grad_norm": 15.866424560546875, + "learning_rate": 7.028740945810343e-06, + "loss": 3.3114, + "step": 39540 + }, + { + "epoch": 1.1587196953050611, + "grad_norm": 16.69890594482422, + "learning_rate": 7.027295388942238e-06, + "loss": 3.3159, + "step": 39550 + }, + { + "epoch": 1.159012671207793, + 
"grad_norm": 17.313901901245117, + "learning_rate": 7.025849629255257e-06, + "loss": 3.3452, + "step": 39560 + }, + { + "epoch": 1.1593056471105252, + "grad_norm": 16.57794952392578, + "learning_rate": 7.0244036668940395e-06, + "loss": 3.3114, + "step": 39570 + }, + { + "epoch": 1.1595986230132571, + "grad_norm": 17.6519832611084, + "learning_rate": 7.022957502003245e-06, + "loss": 3.3172, + "step": 39580 + }, + { + "epoch": 1.1598915989159893, + "grad_norm": 19.82004737854004, + "learning_rate": 7.021511134727554e-06, + "loss": 3.3126, + "step": 39590 + }, + { + "epoch": 1.1601845748187212, + "grad_norm": 18.396484375, + "learning_rate": 7.0200645652116685e-06, + "loss": 3.3075, + "step": 39600 + }, + { + "epoch": 1.160477550721453, + "grad_norm": 16.236665725708008, + "learning_rate": 7.018617793600306e-06, + "loss": 3.3134, + "step": 39610 + }, + { + "epoch": 1.1607705266241852, + "grad_norm": 17.056045532226562, + "learning_rate": 7.017170820038211e-06, + "loss": 3.3238, + "step": 39620 + }, + { + "epoch": 1.1610635025269171, + "grad_norm": 14.941996574401855, + "learning_rate": 7.015723644670143e-06, + "loss": 3.3021, + "step": 39630 + }, + { + "epoch": 1.1613564784296493, + "grad_norm": 19.748397827148438, + "learning_rate": 7.014276267640882e-06, + "loss": 3.3077, + "step": 39640 + }, + { + "epoch": 1.1616494543323812, + "grad_norm": 21.247922897338867, + "learning_rate": 7.012828689095231e-06, + "loss": 3.3379, + "step": 39650 + }, + { + "epoch": 1.1619424302351131, + "grad_norm": 17.181455612182617, + "learning_rate": 7.011380909178012e-06, + "loss": 3.3043, + "step": 39660 + }, + { + "epoch": 1.162235406137845, + "grad_norm": 17.8524227142334, + "learning_rate": 7.0099329280340656e-06, + "loss": 3.3036, + "step": 39670 + }, + { + "epoch": 1.1625283820405772, + "grad_norm": 17.02459716796875, + "learning_rate": 7.008484745808254e-06, + "loss": 3.2883, + "step": 39680 + }, + { + "epoch": 1.162821357943309, + "grad_norm": 17.35309600830078, + 
"learning_rate": 7.007036362645461e-06, + "loss": 3.3207, + "step": 39690 + }, + { + "epoch": 1.1631143338460412, + "grad_norm": 18.450342178344727, + "learning_rate": 7.005587778690585e-06, + "loss": 3.306, + "step": 39700 + }, + { + "epoch": 1.1634073097487732, + "grad_norm": 18.752456665039062, + "learning_rate": 7.0041389940885516e-06, + "loss": 3.3022, + "step": 39710 + }, + { + "epoch": 1.163700285651505, + "grad_norm": 15.904338836669922, + "learning_rate": 7.002690008984303e-06, + "loss": 3.3212, + "step": 39720 + }, + { + "epoch": 1.1639932615542372, + "grad_norm": 19.34502601623535, + "learning_rate": 7.001240823522799e-06, + "loss": 3.3161, + "step": 39730 + }, + { + "epoch": 1.1642862374569691, + "grad_norm": 14.553836822509766, + "learning_rate": 6.999791437849024e-06, + "loss": 3.3157, + "step": 39740 + }, + { + "epoch": 1.1645792133597013, + "grad_norm": 18.021535873413086, + "learning_rate": 6.998341852107981e-06, + "loss": 3.3306, + "step": 39750 + }, + { + "epoch": 1.1648721892624332, + "grad_norm": 18.449241638183594, + "learning_rate": 6.99689206644469e-06, + "loss": 3.3231, + "step": 39760 + }, + { + "epoch": 1.165165165165165, + "grad_norm": 18.069169998168945, + "learning_rate": 6.995442081004197e-06, + "loss": 3.3192, + "step": 39770 + }, + { + "epoch": 1.1654581410678972, + "grad_norm": 20.332073211669922, + "learning_rate": 6.99399189593156e-06, + "loss": 3.3357, + "step": 39780 + }, + { + "epoch": 1.1657511169706292, + "grad_norm": 18.184534072875977, + "learning_rate": 6.992541511371864e-06, + "loss": 3.3177, + "step": 39790 + }, + { + "epoch": 1.166044092873361, + "grad_norm": 19.750764846801758, + "learning_rate": 6.991090927470212e-06, + "loss": 3.3182, + "step": 39800 + }, + { + "epoch": 1.1663370687760932, + "grad_norm": 16.255611419677734, + "learning_rate": 6.989640144371726e-06, + "loss": 3.3036, + "step": 39810 + }, + { + "epoch": 1.1666300446788251, + "grad_norm": 16.773160934448242, + "learning_rate": 6.988189162221546e-06, + 
"loss": 3.3046, + "step": 39820 + }, + { + "epoch": 1.166923020581557, + "grad_norm": 16.815509796142578, + "learning_rate": 6.986737981164838e-06, + "loss": 3.3117, + "step": 39830 + }, + { + "epoch": 1.1672159964842892, + "grad_norm": 20.255435943603516, + "learning_rate": 6.98528660134678e-06, + "loss": 3.2881, + "step": 39840 + }, + { + "epoch": 1.1675089723870211, + "grad_norm": 17.565074920654297, + "learning_rate": 6.983835022912577e-06, + "loss": 3.3271, + "step": 39850 + }, + { + "epoch": 1.1678019482897533, + "grad_norm": 16.90127944946289, + "learning_rate": 6.9823832460074495e-06, + "loss": 3.3362, + "step": 39860 + }, + { + "epoch": 1.1680949241924852, + "grad_norm": 18.30064582824707, + "learning_rate": 6.980931270776638e-06, + "loss": 3.3136, + "step": 39870 + }, + { + "epoch": 1.168387900095217, + "grad_norm": 17.6611328125, + "learning_rate": 6.979479097365407e-06, + "loss": 3.3257, + "step": 39880 + }, + { + "epoch": 1.1686808759979492, + "grad_norm": 20.08228302001953, + "learning_rate": 6.978026725919036e-06, + "loss": 3.3452, + "step": 39890 + }, + { + "epoch": 1.1689738519006811, + "grad_norm": 18.630353927612305, + "learning_rate": 6.976574156582825e-06, + "loss": 3.3244, + "step": 39900 + }, + { + "epoch": 1.1692668278034133, + "grad_norm": 17.640586853027344, + "learning_rate": 6.975121389502097e-06, + "loss": 3.3178, + "step": 39910 + }, + { + "epoch": 1.1695598037061452, + "grad_norm": 19.515047073364258, + "learning_rate": 6.973668424822192e-06, + "loss": 3.3182, + "step": 39920 + }, + { + "epoch": 1.1698527796088771, + "grad_norm": 17.15389060974121, + "learning_rate": 6.9722152626884705e-06, + "loss": 3.3292, + "step": 39930 + }, + { + "epoch": 1.1700285651505165, + "eval_bleu": 0.3345371338853413, + "eval_cap_loss": 0.9529820084571838, + "eval_con_loss": 1.2746397256851196, + "eval_loss": 3.502261161804199, + "step": 39936 + }, + { + "epoch": 1.1700285651505165, + "eval_bleu": 0.3345371338853413, + "eval_cap_loss": 0.9529820084571838, 
+ "eval_con_loss": 1.2746397256851196, + "eval_loss": 3.502261161804199, + "eval_runtime": 52.5367, + "eval_samples_per_second": 380.686, + "eval_steps_per_second": 0.381, + "step": 39936 + }, + { + "epoch": 1.1701457555116093, + "grad_norm": 17.592750549316406, + "learning_rate": 6.970761903246313e-06, + "loss": 3.3443, + "step": 39940 + }, + { + "epoch": 1.1704387314143412, + "grad_norm": 17.460769653320312, + "learning_rate": 6.969308346641119e-06, + "loss": 3.3001, + "step": 39950 + }, + { + "epoch": 1.170731707317073, + "grad_norm": 16.121400833129883, + "learning_rate": 6.967999977242237e-06, + "loss": 3.3286, + "step": 39960 + }, + { + "epoch": 1.1710246832198052, + "grad_norm": 19.817781448364258, + "learning_rate": 6.966546046427921e-06, + "loss": 3.3078, + "step": 39970 + }, + { + "epoch": 1.1713176591225372, + "grad_norm": 18.397701263427734, + "learning_rate": 6.9650919188723424e-06, + "loss": 3.3296, + "step": 39980 + }, + { + "epoch": 1.171610635025269, + "grad_norm": 16.102401733398438, + "learning_rate": 6.963637594720974e-06, + "loss": 3.3178, + "step": 39990 + }, + { + "epoch": 1.1719036109280012, + "grad_norm": 19.00397491455078, + "learning_rate": 6.962183074119313e-06, + "loss": 3.2882, + "step": 40000 + }, + { + "epoch": 1.1721965868307331, + "grad_norm": 15.362652778625488, + "learning_rate": 6.960728357212878e-06, + "loss": 3.2942, + "step": 40010 + }, + { + "epoch": 1.1724895627334653, + "grad_norm": 16.653541564941406, + "learning_rate": 6.9592734441472036e-06, + "loss": 3.3267, + "step": 40020 + }, + { + "epoch": 1.1727825386361972, + "grad_norm": 15.589285850524902, + "learning_rate": 6.957818335067844e-06, + "loss": 3.3398, + "step": 40030 + }, + { + "epoch": 1.173075514538929, + "grad_norm": 17.836063385009766, + "learning_rate": 6.956363030120377e-06, + "loss": 3.3181, + "step": 40040 + }, + { + "epoch": 1.1733684904416612, + "grad_norm": 17.135635375976562, + "learning_rate": 6.954907529450394e-06, + "loss": 3.3441, + "step": 40050 + 
}, + { + "epoch": 1.1736614663443932, + "grad_norm": 17.471649169921875, + "learning_rate": 6.953451833203512e-06, + "loss": 3.2782, + "step": 40060 + }, + { + "epoch": 1.173954442247125, + "grad_norm": 17.591819763183594, + "learning_rate": 6.951995941525362e-06, + "loss": 3.3173, + "step": 40070 + }, + { + "epoch": 1.1742474181498572, + "grad_norm": 17.490449905395508, + "learning_rate": 6.950539854561601e-06, + "loss": 3.3063, + "step": 40080 + }, + { + "epoch": 1.1745403940525891, + "grad_norm": 17.678009033203125, + "learning_rate": 6.949083572457899e-06, + "loss": 3.3366, + "step": 40090 + }, + { + "epoch": 1.174833369955321, + "grad_norm": 20.20042610168457, + "learning_rate": 6.947627095359948e-06, + "loss": 3.3319, + "step": 40100 + }, + { + "epoch": 1.1751263458580532, + "grad_norm": 16.84571075439453, + "learning_rate": 6.946170423413461e-06, + "loss": 3.3195, + "step": 40110 + }, + { + "epoch": 1.1754193217607851, + "grad_norm": 17.98654556274414, + "learning_rate": 6.944713556764171e-06, + "loss": 3.3009, + "step": 40120 + }, + { + "epoch": 1.1757122976635173, + "grad_norm": 15.81982421875, + "learning_rate": 6.943256495557827e-06, + "loss": 3.3182, + "step": 40130 + }, + { + "epoch": 1.1760052735662492, + "grad_norm": 18.812442779541016, + "learning_rate": 6.941799239940197e-06, + "loss": 3.3001, + "step": 40140 + }, + { + "epoch": 1.176298249468981, + "grad_norm": 16.991470336914062, + "learning_rate": 6.940341790057076e-06, + "loss": 3.3416, + "step": 40150 + }, + { + "epoch": 1.1765912253717132, + "grad_norm": 16.156085968017578, + "learning_rate": 6.93888414605427e-06, + "loss": 3.3092, + "step": 40160 + }, + { + "epoch": 1.1768842012744452, + "grad_norm": 14.752495765686035, + "learning_rate": 6.9374263080776085e-06, + "loss": 3.2941, + "step": 40170 + }, + { + "epoch": 1.1771771771771773, + "grad_norm": 18.229839324951172, + "learning_rate": 6.935968276272938e-06, + "loss": 3.3167, + "step": 40180 + }, + { + "epoch": 1.1774701530799092, + 
"grad_norm": 17.9815731048584, + "learning_rate": 6.934510050786127e-06, + "loss": 3.3047, + "step": 40190 + }, + { + "epoch": 1.1777631289826411, + "grad_norm": 17.555774688720703, + "learning_rate": 6.933051631763063e-06, + "loss": 3.3064, + "step": 40200 + }, + { + "epoch": 1.1780561048853733, + "grad_norm": 18.029132843017578, + "learning_rate": 6.931593019349653e-06, + "loss": 3.3083, + "step": 40210 + }, + { + "epoch": 1.1783490807881052, + "grad_norm": 18.050737380981445, + "learning_rate": 6.9301342136918195e-06, + "loss": 3.322, + "step": 40220 + }, + { + "epoch": 1.178642056690837, + "grad_norm": 16.14980697631836, + "learning_rate": 6.928675214935509e-06, + "loss": 3.3162, + "step": 40230 + }, + { + "epoch": 1.1789350325935692, + "grad_norm": 18.84735679626465, + "learning_rate": 6.927216023226687e-06, + "loss": 3.3283, + "step": 40240 + }, + { + "epoch": 1.1792280084963012, + "grad_norm": 18.793102264404297, + "learning_rate": 6.925756638711335e-06, + "loss": 3.3117, + "step": 40250 + }, + { + "epoch": 1.179520984399033, + "grad_norm": 17.16503143310547, + "learning_rate": 6.924297061535456e-06, + "loss": 3.3097, + "step": 40260 + }, + { + "epoch": 1.1798139603017652, + "grad_norm": 17.87147331237793, + "learning_rate": 6.922837291845073e-06, + "loss": 3.297, + "step": 40270 + }, + { + "epoch": 1.1801069362044971, + "grad_norm": 15.7445068359375, + "learning_rate": 6.921377329786228e-06, + "loss": 3.3004, + "step": 40280 + }, + { + "epoch": 1.1803999121072293, + "grad_norm": 16.483705520629883, + "learning_rate": 6.919917175504978e-06, + "loss": 3.3198, + "step": 40290 + }, + { + "epoch": 1.1806928880099612, + "grad_norm": 17.794710159301758, + "learning_rate": 6.918456829147406e-06, + "loss": 3.3147, + "step": 40300 + }, + { + "epoch": 1.1809858639126931, + "grad_norm": 17.550865173339844, + "learning_rate": 6.91699629085961e-06, + "loss": 3.3098, + "step": 40310 + }, + { + "epoch": 1.1812788398154253, + "grad_norm": 17.180509567260742, + 
"learning_rate": 6.915535560787708e-06, + "loss": 3.3216, + "step": 40320 + }, + { + "epoch": 1.1815718157181572, + "grad_norm": 16.76283073425293, + "learning_rate": 6.914074639077838e-06, + "loss": 3.3112, + "step": 40330 + }, + { + "epoch": 1.181864791620889, + "grad_norm": 17.896596908569336, + "learning_rate": 6.912613525876155e-06, + "loss": 3.3171, + "step": 40340 + }, + { + "epoch": 1.1821577675236212, + "grad_norm": 18.961687088012695, + "learning_rate": 6.911152221328837e-06, + "loss": 3.3154, + "step": 40350 + }, + { + "epoch": 1.1824507434263531, + "grad_norm": 17.198745727539062, + "learning_rate": 6.909690725582078e-06, + "loss": 3.3409, + "step": 40360 + }, + { + "epoch": 1.182743719329085, + "grad_norm": 15.888729095458984, + "learning_rate": 6.908229038782092e-06, + "loss": 3.2912, + "step": 40370 + }, + { + "epoch": 1.1830366952318172, + "grad_norm": 22.201114654541016, + "learning_rate": 6.906767161075111e-06, + "loss": 3.323, + "step": 40380 + }, + { + "epoch": 1.1833296711345491, + "grad_norm": 17.32027816772461, + "learning_rate": 6.905305092607387e-06, + "loss": 3.3253, + "step": 40390 + }, + { + "epoch": 1.1836226470372813, + "grad_norm": 23.450618743896484, + "learning_rate": 6.903842833525194e-06, + "loss": 3.3122, + "step": 40400 + }, + { + "epoch": 1.1839156229400132, + "grad_norm": 19.57669448852539, + "learning_rate": 6.9023803839748204e-06, + "loss": 3.2898, + "step": 40410 + }, + { + "epoch": 1.184208598842745, + "grad_norm": 17.771835327148438, + "learning_rate": 6.900917744102575e-06, + "loss": 3.3194, + "step": 40420 + }, + { + "epoch": 1.1845015747454772, + "grad_norm": 14.66722583770752, + "learning_rate": 6.899454914054787e-06, + "loss": 3.326, + "step": 40430 + }, + { + "epoch": 1.1847945506482092, + "grad_norm": 16.584835052490234, + "learning_rate": 6.897991893977805e-06, + "loss": 3.309, + "step": 40440 + }, + { + "epoch": 1.1850289313703948, + "eval_bleu": 0.334232401746803, + "eval_cap_loss": 0.9527198076248169, + 
"eval_con_loss": 1.274674654006958, + "eval_loss": 3.5020689964294434, + "step": 40448 + }, + { + "epoch": 1.1850289313703948, + "eval_bleu": 0.334232401746803, + "eval_cap_loss": 0.9527198076248169, + "eval_con_loss": 1.274674654006958, + "eval_loss": 3.5020689964294434, + "eval_runtime": 54.467, + "eval_samples_per_second": 367.195, + "eval_steps_per_second": 0.367, + "step": 40448 + }, + { + "epoch": 1.1850875265509413, + "grad_norm": 15.947677612304688, + "learning_rate": 6.896528684017995e-06, + "loss": 3.3257, + "step": 40450 + }, + { + "epoch": 1.1853805024536732, + "grad_norm": 13.758484840393066, + "learning_rate": 6.8950652843217405e-06, + "loss": 3.3243, + "step": 40460 + }, + { + "epoch": 1.1856734783564051, + "grad_norm": 17.411088943481445, + "learning_rate": 6.893601695035447e-06, + "loss": 3.3066, + "step": 40470 + }, + { + "epoch": 1.1859664542591373, + "grad_norm": 18.457561492919922, + "learning_rate": 6.892137916305539e-06, + "loss": 3.303, + "step": 40480 + }, + { + "epoch": 1.1862594301618692, + "grad_norm": 16.29061508178711, + "learning_rate": 6.890673948278458e-06, + "loss": 3.3205, + "step": 40490 + }, + { + "epoch": 1.186552406064601, + "grad_norm": 21.107646942138672, + "learning_rate": 6.889209791100666e-06, + "loss": 3.3013, + "step": 40500 + }, + { + "epoch": 1.1868453819673332, + "grad_norm": 17.90639877319336, + "learning_rate": 6.887745444918641e-06, + "loss": 3.301, + "step": 40510 + }, + { + "epoch": 1.1871383578700652, + "grad_norm": 14.676240921020508, + "learning_rate": 6.886280909878886e-06, + "loss": 3.2975, + "step": 40520 + }, + { + "epoch": 1.187431333772797, + "grad_norm": 15.30368709564209, + "learning_rate": 6.884816186127916e-06, + "loss": 3.3329, + "step": 40530 + }, + { + "epoch": 1.1877243096755292, + "grad_norm": 20.474903106689453, + "learning_rate": 6.883351273812269e-06, + "loss": 3.2956, + "step": 40540 + }, + { + "epoch": 1.1880172855782611, + "grad_norm": 17.787263870239258, + "learning_rate": 
6.8818861730785e-06, + "loss": 3.3101, + "step": 40550 + }, + { + "epoch": 1.1883102614809933, + "grad_norm": 20.016164779663086, + "learning_rate": 6.880420884073185e-06, + "loss": 3.3082, + "step": 40560 + }, + { + "epoch": 1.1886032373837252, + "grad_norm": 16.083316802978516, + "learning_rate": 6.878955406942915e-06, + "loss": 3.2713, + "step": 40570 + }, + { + "epoch": 1.1888962132864571, + "grad_norm": 16.030494689941406, + "learning_rate": 6.877489741834307e-06, + "loss": 3.331, + "step": 40580 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 18.984928131103516, + "learning_rate": 6.876023888893986e-06, + "loss": 3.3207, + "step": 40590 + }, + { + "epoch": 1.1894821650919212, + "grad_norm": 17.681161880493164, + "learning_rate": 6.8745578482686075e-06, + "loss": 3.3103, + "step": 40600 + }, + { + "epoch": 1.1897751409946533, + "grad_norm": 17.216638565063477, + "learning_rate": 6.8730916201048356e-06, + "loss": 3.3155, + "step": 40610 + }, + { + "epoch": 1.1900681168973852, + "grad_norm": 18.57621955871582, + "learning_rate": 6.871625204549362e-06, + "loss": 3.3284, + "step": 40620 + }, + { + "epoch": 1.1903610928001171, + "grad_norm": 16.92151641845703, + "learning_rate": 6.870158601748889e-06, + "loss": 3.3067, + "step": 40630 + }, + { + "epoch": 1.190654068702849, + "grad_norm": 19.278696060180664, + "learning_rate": 6.868691811850143e-06, + "loss": 3.3063, + "step": 40640 + }, + { + "epoch": 1.1909470446055812, + "grad_norm": 20.217060089111328, + "learning_rate": 6.867224834999868e-06, + "loss": 3.2975, + "step": 40650 + }, + { + "epoch": 1.1912400205083131, + "grad_norm": 15.085759162902832, + "learning_rate": 6.865757671344827e-06, + "loss": 3.3164, + "step": 40660 + }, + { + "epoch": 1.1915329964110453, + "grad_norm": 20.753459930419922, + "learning_rate": 6.864290321031799e-06, + "loss": 3.3277, + "step": 40670 + }, + { + "epoch": 1.1918259723137772, + "grad_norm": 18.02787971496582, + "learning_rate": 6.862822784207586e-06, + "loss": 3.3229, 
+ "step": 40680 + }, + { + "epoch": 1.192118948216509, + "grad_norm": 17.75383186340332, + "learning_rate": 6.861355061019004e-06, + "loss": 3.3133, + "step": 40690 + }, + { + "epoch": 1.1924119241192412, + "grad_norm": 16.298458099365234, + "learning_rate": 6.859887151612892e-06, + "loss": 3.2819, + "step": 40700 + }, + { + "epoch": 1.1927049000219732, + "grad_norm": 14.97705364227295, + "learning_rate": 6.858419056136104e-06, + "loss": 3.3089, + "step": 40710 + }, + { + "epoch": 1.1929978759247053, + "grad_norm": 18.544527053833008, + "learning_rate": 6.856950774735516e-06, + "loss": 3.3203, + "step": 40720 + }, + { + "epoch": 1.1932908518274372, + "grad_norm": 18.82259750366211, + "learning_rate": 6.855482307558019e-06, + "loss": 3.3195, + "step": 40730 + }, + { + "epoch": 1.1935838277301691, + "grad_norm": 17.55775260925293, + "learning_rate": 6.854013654750527e-06, + "loss": 3.3021, + "step": 40740 + }, + { + "epoch": 1.1938768036329013, + "grad_norm": 15.536916732788086, + "learning_rate": 6.852544816459968e-06, + "loss": 3.2981, + "step": 40750 + }, + { + "epoch": 1.1941697795356332, + "grad_norm": 18.039188385009766, + "learning_rate": 6.851075792833288e-06, + "loss": 3.309, + "step": 40760 + }, + { + "epoch": 1.194462755438365, + "grad_norm": 18.825183868408203, + "learning_rate": 6.849606584017459e-06, + "loss": 3.3152, + "step": 40770 + }, + { + "epoch": 1.1947557313410972, + "grad_norm": 17.519262313842773, + "learning_rate": 6.848137190159465e-06, + "loss": 3.2913, + "step": 40780 + }, + { + "epoch": 1.1950487072438292, + "grad_norm": 20.46907615661621, + "learning_rate": 6.84666761140631e-06, + "loss": 3.3048, + "step": 40790 + }, + { + "epoch": 1.195341683146561, + "grad_norm": 15.947521209716797, + "learning_rate": 6.845197847905015e-06, + "loss": 3.3036, + "step": 40800 + }, + { + "epoch": 1.1956346590492932, + "grad_norm": 18.29218864440918, + "learning_rate": 6.843727899802624e-06, + "loss": 3.3092, + "step": 40810 + }, + { + "epoch": 
1.1959276349520251, + "grad_norm": 17.262096405029297, + "learning_rate": 6.842257767246195e-06, + "loss": 3.3103, + "step": 40820 + }, + { + "epoch": 1.1962206108547573, + "grad_norm": 15.927123069763184, + "learning_rate": 6.840787450382805e-06, + "loss": 3.3031, + "step": 40830 + }, + { + "epoch": 1.1965135867574892, + "grad_norm": 17.621702194213867, + "learning_rate": 6.839316949359551e-06, + "loss": 3.304, + "step": 40840 + }, + { + "epoch": 1.1968065626602211, + "grad_norm": 18.185943603515625, + "learning_rate": 6.8378462643235514e-06, + "loss": 3.3102, + "step": 40850 + }, + { + "epoch": 1.1970995385629533, + "grad_norm": 14.704607009887695, + "learning_rate": 6.836375395421934e-06, + "loss": 3.3097, + "step": 40860 + }, + { + "epoch": 1.1973925144656852, + "grad_norm": 15.23694896697998, + "learning_rate": 6.834904342801855e-06, + "loss": 3.3212, + "step": 40870 + }, + { + "epoch": 1.1976854903684173, + "grad_norm": 16.330429077148438, + "learning_rate": 6.8334331066104805e-06, + "loss": 3.2997, + "step": 40880 + }, + { + "epoch": 1.1979784662711492, + "grad_norm": 16.82853126525879, + "learning_rate": 6.831961686995002e-06, + "loss": 3.3174, + "step": 40890 + }, + { + "epoch": 1.1982714421738812, + "grad_norm": 15.725966453552246, + "learning_rate": 6.830490084102626e-06, + "loss": 3.3194, + "step": 40900 + }, + { + "epoch": 1.1985644180766133, + "grad_norm": 14.721405982971191, + "learning_rate": 6.829018298080576e-06, + "loss": 3.3188, + "step": 40910 + }, + { + "epoch": 1.1988573939793452, + "grad_norm": 17.18648338317871, + "learning_rate": 6.8275463290760975e-06, + "loss": 3.3132, + "step": 40920 + }, + { + "epoch": 1.1991503698820771, + "grad_norm": 17.74485206604004, + "learning_rate": 6.8260741772364505e-06, + "loss": 3.2933, + "step": 40930 + }, + { + "epoch": 1.1994433457848093, + "grad_norm": 17.180591583251953, + "learning_rate": 6.824601842708917e-06, + "loss": 3.3121, + "step": 40940 + }, + { + "epoch": 1.1997363216875412, + "grad_norm": 
19.411909103393555, + "learning_rate": 6.8231293256407935e-06, + "loss": 3.3241, + "step": 40950 + }, + { + "epoch": 1.200029297590273, + "grad_norm": 20.259302139282227, + "learning_rate": 6.821656626179396e-06, + "loss": 3.3347, + "step": 40960 + }, + { + "epoch": 1.200029297590273, + "eval_bleu": 0.3344168478571948, + "eval_cap_loss": 0.9520607590675354, + "eval_con_loss": 1.2693358659744263, + "eval_loss": 3.4907326698303223, + "step": 40960 + }, + { + "epoch": 1.200029297590273, + "eval_bleu": 0.3344168478571948, + "eval_cap_loss": 0.9520607590675354, + "eval_con_loss": 1.2693358659744263, + "eval_loss": 3.4907326698303223, + "eval_runtime": 57.0867, + "eval_samples_per_second": 350.345, + "eval_steps_per_second": 0.35, + "step": 40960 + }, + { + "epoch": 1.2003222734930052, + "grad_norm": 22.447847366333008, + "learning_rate": 6.820183744472062e-06, + "loss": 3.3136, + "step": 40970 + }, + { + "epoch": 1.2006152493957372, + "grad_norm": 16.67936897277832, + "learning_rate": 6.8187106806661425e-06, + "loss": 3.2893, + "step": 40980 + }, + { + "epoch": 1.2009082252984693, + "grad_norm": 16.5426082611084, + "learning_rate": 6.81723743490901e-06, + "loss": 3.2896, + "step": 40990 + }, + { + "epoch": 1.2012012012012012, + "grad_norm": 18.066246032714844, + "learning_rate": 6.815764007348052e-06, + "loss": 3.3212, + "step": 41000 + }, + { + "epoch": 1.2014941771039331, + "grad_norm": 18.840003967285156, + "learning_rate": 6.81429039813068e-06, + "loss": 3.3037, + "step": 41010 + }, + { + "epoch": 1.2017871530066653, + "grad_norm": 17.551025390625, + "learning_rate": 6.812816607404316e-06, + "loss": 3.2732, + "step": 41020 + }, + { + "epoch": 1.2020801289093972, + "grad_norm": 14.56613826751709, + "learning_rate": 6.811342635316406e-06, + "loss": 3.3258, + "step": 41030 + }, + { + "epoch": 1.2023731048121291, + "grad_norm": 17.500913619995117, + "learning_rate": 6.80986848201441e-06, + "loss": 3.303, + "step": 41040 + }, + { + "epoch": 1.2026660807148613, + 
"grad_norm": 16.465261459350586, + "learning_rate": 6.808394147645812e-06, + "loss": 3.2976, + "step": 41050 + }, + { + "epoch": 1.2029590566175932, + "grad_norm": 15.795876502990723, + "learning_rate": 6.806919632358108e-06, + "loss": 3.3015, + "step": 41060 + }, + { + "epoch": 1.203252032520325, + "grad_norm": 15.727105140686035, + "learning_rate": 6.805444936298814e-06, + "loss": 3.3237, + "step": 41070 + }, + { + "epoch": 1.2035450084230572, + "grad_norm": 20.4886417388916, + "learning_rate": 6.803970059615465e-06, + "loss": 3.2986, + "step": 41080 + }, + { + "epoch": 1.2038379843257891, + "grad_norm": 16.222875595092773, + "learning_rate": 6.802495002455615e-06, + "loss": 3.3042, + "step": 41090 + }, + { + "epoch": 1.2041309602285213, + "grad_norm": 18.282386779785156, + "learning_rate": 6.801019764966834e-06, + "loss": 3.3106, + "step": 41100 + }, + { + "epoch": 1.2044239361312532, + "grad_norm": 19.83264923095703, + "learning_rate": 6.799544347296711e-06, + "loss": 3.295, + "step": 41110 + }, + { + "epoch": 1.2047169120339851, + "grad_norm": 20.766517639160156, + "learning_rate": 6.798068749592851e-06, + "loss": 3.3079, + "step": 41120 + }, + { + "epoch": 1.2050098879367173, + "grad_norm": 21.317276000976562, + "learning_rate": 6.796592972002881e-06, + "loss": 3.2849, + "step": 41130 + }, + { + "epoch": 1.2053028638394492, + "grad_norm": 20.7138614654541, + "learning_rate": 6.795117014674444e-06, + "loss": 3.3177, + "step": 41140 + }, + { + "epoch": 1.2055958397421813, + "grad_norm": 19.04650115966797, + "learning_rate": 6.793640877755199e-06, + "loss": 3.3003, + "step": 41150 + }, + { + "epoch": 1.2058888156449132, + "grad_norm": 15.909958839416504, + "learning_rate": 6.792164561392825e-06, + "loss": 3.3158, + "step": 41160 + }, + { + "epoch": 1.2061817915476452, + "grad_norm": 19.342927932739258, + "learning_rate": 6.790688065735019e-06, + "loss": 3.3016, + "step": 41170 + }, + { + "epoch": 1.2064747674503773, + "grad_norm": 18.354707717895508, + 
"learning_rate": 6.789211390929497e-06, + "loss": 3.3218, + "step": 41180 + }, + { + "epoch": 1.2067677433531092, + "grad_norm": 17.41854476928711, + "learning_rate": 6.78773453712399e-06, + "loss": 3.3101, + "step": 41190 + }, + { + "epoch": 1.2070607192558411, + "grad_norm": 18.91446304321289, + "learning_rate": 6.786257504466247e-06, + "loss": 3.3047, + "step": 41200 + }, + { + "epoch": 1.2073536951585733, + "grad_norm": 16.763778686523438, + "learning_rate": 6.7847802931040404e-06, + "loss": 3.3209, + "step": 41210 + }, + { + "epoch": 1.2076466710613052, + "grad_norm": 17.584850311279297, + "learning_rate": 6.783302903185154e-06, + "loss": 3.3045, + "step": 41220 + }, + { + "epoch": 1.207939646964037, + "grad_norm": 19.43440055847168, + "learning_rate": 6.781825334857391e-06, + "loss": 3.3367, + "step": 41230 + }, + { + "epoch": 1.2082326228667692, + "grad_norm": 17.06128692626953, + "learning_rate": 6.780347588268574e-06, + "loss": 3.3051, + "step": 41240 + }, + { + "epoch": 1.2085255987695012, + "grad_norm": 17.741609573364258, + "learning_rate": 6.778869663566545e-06, + "loss": 3.2862, + "step": 41250 + }, + { + "epoch": 1.2088185746722333, + "grad_norm": 16.792743682861328, + "learning_rate": 6.777391560899161e-06, + "loss": 3.2983, + "step": 41260 + }, + { + "epoch": 1.2091115505749652, + "grad_norm": 16.931241989135742, + "learning_rate": 6.775913280414294e-06, + "loss": 3.2922, + "step": 41270 + }, + { + "epoch": 1.2094045264776971, + "grad_norm": 16.783811569213867, + "learning_rate": 6.7744348222598386e-06, + "loss": 3.2948, + "step": 41280 + }, + { + "epoch": 1.2096975023804293, + "grad_norm": 18.801855087280273, + "learning_rate": 6.772956186583708e-06, + "loss": 3.3185, + "step": 41290 + }, + { + "epoch": 1.2099904782831612, + "grad_norm": 18.295665740966797, + "learning_rate": 6.77147737353383e-06, + "loss": 3.3207, + "step": 41300 + }, + { + "epoch": 1.2102834541858931, + "grad_norm": 15.898606300354004, + "learning_rate": 6.7699983832581495e-06, 
+ "loss": 3.3103, + "step": 41310 + }, + { + "epoch": 1.2105764300886253, + "grad_norm": 20.930681228637695, + "learning_rate": 6.768519215904632e-06, + "loss": 3.3293, + "step": 41320 + }, + { + "epoch": 1.2108694059913572, + "grad_norm": 16.368621826171875, + "learning_rate": 6.767039871621259e-06, + "loss": 3.2905, + "step": 41330 + }, + { + "epoch": 1.211162381894089, + "grad_norm": 17.95341682434082, + "learning_rate": 6.765560350556031e-06, + "loss": 3.2995, + "step": 41340 + }, + { + "epoch": 1.2114553577968212, + "grad_norm": 19.878990173339844, + "learning_rate": 6.764080652856964e-06, + "loss": 3.3046, + "step": 41350 + }, + { + "epoch": 1.2117483336995531, + "grad_norm": 16.766382217407227, + "learning_rate": 6.762600778672095e-06, + "loss": 3.3196, + "step": 41360 + }, + { + "epoch": 1.2120413096022853, + "grad_norm": 17.783525466918945, + "learning_rate": 6.7611207281494735e-06, + "loss": 3.2986, + "step": 41370 + }, + { + "epoch": 1.2123342855050172, + "grad_norm": 17.180015563964844, + "learning_rate": 6.759640501437172e-06, + "loss": 3.3278, + "step": 41380 + }, + { + "epoch": 1.2126272614077491, + "grad_norm": 16.839805603027344, + "learning_rate": 6.758160098683279e-06, + "loss": 3.313, + "step": 41390 + }, + { + "epoch": 1.2129202373104813, + "grad_norm": 15.608368873596191, + "learning_rate": 6.756679520035898e-06, + "loss": 3.2967, + "step": 41400 + }, + { + "epoch": 1.2132132132132132, + "grad_norm": 15.978321075439453, + "learning_rate": 6.755198765643153e-06, + "loss": 3.3403, + "step": 41410 + }, + { + "epoch": 1.2135061891159453, + "grad_norm": 20.89664077758789, + "learning_rate": 6.753717835653185e-06, + "loss": 3.3196, + "step": 41420 + }, + { + "epoch": 1.2137991650186772, + "grad_norm": 15.96187973022461, + "learning_rate": 6.752236730214152e-06, + "loss": 3.2787, + "step": 41430 + }, + { + "epoch": 1.2140921409214092, + "grad_norm": 17.916128158569336, + "learning_rate": 6.750755449474229e-06, + "loss": 3.3091, + "step": 41440 + }, + 
{ + "epoch": 1.2143851168241413, + "grad_norm": 17.127281188964844, + "learning_rate": 6.749273993581611e-06, + "loss": 3.2995, + "step": 41450 + }, + { + "epoch": 1.2146780927268732, + "grad_norm": 20.12791633605957, + "learning_rate": 6.747792362684508e-06, + "loss": 3.319, + "step": 41460 + }, + { + "epoch": 1.2149710686296051, + "grad_norm": 16.696470260620117, + "learning_rate": 6.746310556931148e-06, + "loss": 3.3112, + "step": 41470 + }, + { + "epoch": 1.2150296638101516, + "eval_bleu": 0.33530170374174734, + "eval_cap_loss": 0.9518038034439087, + "eval_con_loss": 1.2656300067901611, + "eval_loss": 3.4830641746520996, + "step": 41472 + }, + { + "epoch": 1.2150296638101516, + "eval_bleu": 0.33530170374174734, + "eval_cap_loss": 0.9518038034439087, + "eval_con_loss": 1.2656300067901611, + "eval_loss": 3.4830641746520996, + "eval_runtime": 55.8257, + "eval_samples_per_second": 358.258, + "eval_steps_per_second": 0.358, + "step": 41472 + }, + { + "epoch": 1.2152640445323373, + "grad_norm": 15.540353775024414, + "learning_rate": 6.744828576469778e-06, + "loss": 3.3124, + "step": 41480 + }, + { + "epoch": 1.2155570204350692, + "grad_norm": 15.36385440826416, + "learning_rate": 6.74334642144866e-06, + "loss": 3.3315, + "step": 41490 + }, + { + "epoch": 1.215849996337801, + "grad_norm": 15.082806587219238, + "learning_rate": 6.7418640920160745e-06, + "loss": 3.305, + "step": 41500 + }, + { + "epoch": 1.2161429722405332, + "grad_norm": 18.660297393798828, + "learning_rate": 6.740381588320323e-06, + "loss": 3.3222, + "step": 41510 + }, + { + "epoch": 1.2164359481432652, + "grad_norm": 20.185691833496094, + "learning_rate": 6.738898910509718e-06, + "loss": 3.3072, + "step": 41520 + }, + { + "epoch": 1.2167289240459973, + "grad_norm": 19.81439781188965, + "learning_rate": 6.737416058732593e-06, + "loss": 3.2799, + "step": 41530 + }, + { + "epoch": 1.2170218999487292, + "grad_norm": 17.647214889526367, + "learning_rate": 6.7359330331373e-06, + "loss": 3.3353, + "step": 
41540 + }, + { + "epoch": 1.2173148758514611, + "grad_norm": 19.022336959838867, + "learning_rate": 6.734449833872205e-06, + "loss": 3.3092, + "step": 41550 + }, + { + "epoch": 1.2176078517541933, + "grad_norm": 19.543556213378906, + "learning_rate": 6.732966461085695e-06, + "loss": 3.2962, + "step": 41560 + }, + { + "epoch": 1.2179008276569252, + "grad_norm": 20.719486236572266, + "learning_rate": 6.731482914926171e-06, + "loss": 3.3339, + "step": 41570 + }, + { + "epoch": 1.2181938035596573, + "grad_norm": 17.436012268066406, + "learning_rate": 6.729999195542054e-06, + "loss": 3.3037, + "step": 41580 + }, + { + "epoch": 1.2184867794623893, + "grad_norm": 18.267616271972656, + "learning_rate": 6.728515303081782e-06, + "loss": 3.3045, + "step": 41590 + }, + { + "epoch": 1.2187797553651212, + "grad_norm": 18.363962173461914, + "learning_rate": 6.727031237693809e-06, + "loss": 3.3054, + "step": 41600 + }, + { + "epoch": 1.2190727312678533, + "grad_norm": 18.636924743652344, + "learning_rate": 6.725546999526606e-06, + "loss": 3.2884, + "step": 41610 + }, + { + "epoch": 1.2193657071705852, + "grad_norm": 19.811599731445312, + "learning_rate": 6.7240625887286634e-06, + "loss": 3.2939, + "step": 41620 + }, + { + "epoch": 1.2196586830733172, + "grad_norm": 17.09603500366211, + "learning_rate": 6.722578005448488e-06, + "loss": 3.2998, + "step": 41630 + }, + { + "epoch": 1.2199516589760493, + "grad_norm": 17.14892578125, + "learning_rate": 6.721093249834601e-06, + "loss": 3.3013, + "step": 41640 + }, + { + "epoch": 1.2202446348787812, + "grad_norm": 19.14449691772461, + "learning_rate": 6.719608322035547e-06, + "loss": 3.3129, + "step": 41650 + }, + { + "epoch": 1.2205376107815131, + "grad_norm": 15.702230453491211, + "learning_rate": 6.718123222199882e-06, + "loss": 3.286, + "step": 41660 + }, + { + "epoch": 1.2208305866842453, + "grad_norm": 17.17837142944336, + "learning_rate": 6.716637950476182e-06, + "loss": 3.3275, + "step": 41670 + }, + { + "epoch": 
1.2211235625869772, + "grad_norm": 19.814682006835938, + "learning_rate": 6.715152507013039e-06, + "loss": 3.3023, + "step": 41680 + }, + { + "epoch": 1.2214165384897093, + "grad_norm": 19.421611785888672, + "learning_rate": 6.713666891959063e-06, + "loss": 3.3169, + "step": 41690 + }, + { + "epoch": 1.2217095143924412, + "grad_norm": 15.807023048400879, + "learning_rate": 6.712181105462882e-06, + "loss": 3.2888, + "step": 41700 + }, + { + "epoch": 1.2220024902951732, + "grad_norm": 20.48529624938965, + "learning_rate": 6.71069514767314e-06, + "loss": 3.2997, + "step": 41710 + }, + { + "epoch": 1.2222954661979053, + "grad_norm": 19.887859344482422, + "learning_rate": 6.709209018738496e-06, + "loss": 3.2954, + "step": 41720 + }, + { + "epoch": 1.2225884421006372, + "grad_norm": 18.61648941040039, + "learning_rate": 6.707722718807631e-06, + "loss": 3.2936, + "step": 41730 + }, + { + "epoch": 1.2228814180033691, + "grad_norm": 16.1044921875, + "learning_rate": 6.70623624802924e-06, + "loss": 3.3035, + "step": 41740 + }, + { + "epoch": 1.2231743939061013, + "grad_norm": 18.692405700683594, + "learning_rate": 6.704749606552034e-06, + "loss": 3.2985, + "step": 41750 + }, + { + "epoch": 1.2234673698088332, + "grad_norm": 14.078256607055664, + "learning_rate": 6.7032627945247456e-06, + "loss": 3.2737, + "step": 41760 + }, + { + "epoch": 1.2237603457115651, + "grad_norm": 16.3837947845459, + "learning_rate": 6.701775812096119e-06, + "loss": 3.29, + "step": 41770 + }, + { + "epoch": 1.2240533216142973, + "grad_norm": 16.21749496459961, + "learning_rate": 6.700288659414917e-06, + "loss": 3.3041, + "step": 41780 + }, + { + "epoch": 1.2243462975170292, + "grad_norm": 16.90768814086914, + "learning_rate": 6.698801336629923e-06, + "loss": 3.3236, + "step": 41790 + }, + { + "epoch": 1.2246392734197613, + "grad_norm": 17.057531356811523, + "learning_rate": 6.697313843889934e-06, + "loss": 3.2995, + "step": 41800 + }, + { + "epoch": 1.2249322493224932, + "grad_norm": 
16.444137573242188, + "learning_rate": 6.695826181343766e-06, + "loss": 3.3016, + "step": 41810 + }, + { + "epoch": 1.2252252252252251, + "grad_norm": 19.36460304260254, + "learning_rate": 6.694338349140248e-06, + "loss": 3.2954, + "step": 41820 + }, + { + "epoch": 1.2255182011279573, + "grad_norm": 20.046091079711914, + "learning_rate": 6.692850347428231e-06, + "loss": 3.311, + "step": 41830 + }, + { + "epoch": 1.2258111770306892, + "grad_norm": 16.85747528076172, + "learning_rate": 6.6913621763565806e-06, + "loss": 3.3165, + "step": 41840 + }, + { + "epoch": 1.2261041529334213, + "grad_norm": 17.8056583404541, + "learning_rate": 6.6898738360741785e-06, + "loss": 3.3027, + "step": 41850 + }, + { + "epoch": 1.2263971288361533, + "grad_norm": 15.772611618041992, + "learning_rate": 6.688385326729924e-06, + "loss": 3.3056, + "step": 41860 + }, + { + "epoch": 1.2266901047388852, + "grad_norm": 18.427980422973633, + "learning_rate": 6.686896648472736e-06, + "loss": 3.2874, + "step": 41870 + }, + { + "epoch": 1.2269830806416173, + "grad_norm": 16.704309463500977, + "learning_rate": 6.685407801451546e-06, + "loss": 3.3236, + "step": 41880 + }, + { + "epoch": 1.2272760565443492, + "grad_norm": 17.334285736083984, + "learning_rate": 6.683918785815304e-06, + "loss": 3.3072, + "step": 41890 + }, + { + "epoch": 1.2275690324470812, + "grad_norm": 20.21170997619629, + "learning_rate": 6.682429601712976e-06, + "loss": 3.328, + "step": 41900 + }, + { + "epoch": 1.2278620083498133, + "grad_norm": 16.391767501831055, + "learning_rate": 6.680940249293551e-06, + "loss": 3.2897, + "step": 41910 + }, + { + "epoch": 1.2281549842525452, + "grad_norm": 18.522775650024414, + "learning_rate": 6.679450728706025e-06, + "loss": 3.3, + "step": 41920 + }, + { + "epoch": 1.2284479601552771, + "grad_norm": 14.450841903686523, + "learning_rate": 6.677961040099419e-06, + "loss": 3.3001, + "step": 41930 + }, + { + "epoch": 1.2287409360580093, + "grad_norm": 15.396918296813965, + "learning_rate": 
6.676471183622764e-06, + "loss": 3.2997, + "step": 41940 + }, + { + "epoch": 1.2290339119607412, + "grad_norm": 18.016033172607422, + "learning_rate": 6.674981159425114e-06, + "loss": 3.2989, + "step": 41950 + }, + { + "epoch": 1.2293268878634733, + "grad_norm": NaN, + "learning_rate": 6.673639994368984e-06, + "loss": 3.2958, + "step": 41960 + }, + { + "epoch": 1.2296198637662052, + "grad_norm": 16.893169403076172, + "learning_rate": 6.672149651912137e-06, + "loss": 3.3042, + "step": 41970 + }, + { + "epoch": 1.2299128396689372, + "grad_norm": 17.391016006469727, + "learning_rate": 6.67065914216664e-06, + "loss": 3.2918, + "step": 41980 + }, + { + "epoch": 1.23003003003003, + "eval_bleu": 0.3350129465065168, + "eval_cap_loss": 0.9510658383369446, + "eval_con_loss": 1.2665514945983887, + "eval_loss": 3.4841690063476562, + "step": 41984 + }, + { + "epoch": 1.23003003003003, + "eval_bleu": 0.3350129465065168, + "eval_cap_loss": 0.9510658383369446, + "eval_con_loss": 1.2665514945983887, + "eval_loss": 3.4841690063476562, + "eval_runtime": 56.5039, + "eval_samples_per_second": 353.958, + "eval_steps_per_second": 0.354, + "step": 41984 + }, + { + "epoch": 1.2302058155716693, + "grad_norm": 16.699758529663086, + "learning_rate": 6.669168465281607e-06, + "loss": 3.3232, + "step": 41990 + }, + { + "epoch": 1.2304987914744012, + "grad_norm": 16.772693634033203, + "learning_rate": 6.667677621406172e-06, + "loss": 3.2826, + "step": 42000 + }, + { + "epoch": 1.2307917673771331, + "grad_norm": 19.713825225830078, + "learning_rate": 6.666186610689485e-06, + "loss": 3.2821, + "step": 42010 + }, + { + "epoch": 1.2310847432798653, + "grad_norm": 17.820253372192383, + "learning_rate": 6.664695433280712e-06, + "loss": 3.3284, + "step": 42020 + }, + { + "epoch": 1.2313777191825972, + "grad_norm": 18.2928524017334, + "learning_rate": 6.663204089329038e-06, + "loss": 3.3174, + "step": 42030 + }, + { + "epoch": 1.2316706950853291, + "grad_norm": 16.676973342895508, + "learning_rate": 
6.6617125789836625e-06, + "loss": 3.3077, + "step": 42040 + }, + { + "epoch": 1.2319636709880613, + "grad_norm": 15.973733901977539, + "learning_rate": 6.6602209023938e-06, + "loss": 3.3, + "step": 42050 + }, + { + "epoch": 1.2322566468907932, + "grad_norm": 16.140832901000977, + "learning_rate": 6.658729059708689e-06, + "loss": 3.296, + "step": 42060 + }, + { + "epoch": 1.2325496227935253, + "grad_norm": 17.6358699798584, + "learning_rate": 6.657237051077575e-06, + "loss": 3.3244, + "step": 42070 + }, + { + "epoch": 1.2328425986962572, + "grad_norm": 20.190038681030273, + "learning_rate": 6.655744876649727e-06, + "loss": 3.3198, + "step": 42080 + }, + { + "epoch": 1.2331355745989891, + "grad_norm": 16.700599670410156, + "learning_rate": 6.6542525365744245e-06, + "loss": 3.3008, + "step": 42090 + }, + { + "epoch": 1.2334285505017213, + "grad_norm": 16.305917739868164, + "learning_rate": 6.652760031000973e-06, + "loss": 3.2754, + "step": 42100 + }, + { + "epoch": 1.2337215264044532, + "grad_norm": 17.447572708129883, + "learning_rate": 6.651267360078685e-06, + "loss": 3.3253, + "step": 42110 + }, + { + "epoch": 1.2340145023071853, + "grad_norm": 17.396142959594727, + "learning_rate": 6.649774523956893e-06, + "loss": 3.2798, + "step": 42120 + }, + { + "epoch": 1.2343074782099173, + "grad_norm": 19.251346588134766, + "learning_rate": 6.64828152278495e-06, + "loss": 3.3064, + "step": 42130 + }, + { + "epoch": 1.2346004541126492, + "grad_norm": 20.60921287536621, + "learning_rate": 6.646788356712217e-06, + "loss": 3.2956, + "step": 42140 + }, + { + "epoch": 1.2348934300153813, + "grad_norm": 16.48649024963379, + "learning_rate": 6.645295025888081e-06, + "loss": 3.323, + "step": 42150 + }, + { + "epoch": 1.2351864059181132, + "grad_norm": 15.218493461608887, + "learning_rate": 6.643801530461939e-06, + "loss": 3.2955, + "step": 42160 + }, + { + "epoch": 1.2354793818208452, + "grad_norm": 18.10065460205078, + "learning_rate": 6.642307870583206e-06, + "loss": 3.2839, + 
"step": 42170 + }, + { + "epoch": 1.2357723577235773, + "grad_norm": 16.703874588012695, + "learning_rate": 6.640814046401312e-06, + "loss": 3.3162, + "step": 42180 + }, + { + "epoch": 1.2360653336263092, + "grad_norm": 19.473052978515625, + "learning_rate": 6.639320058065709e-06, + "loss": 3.2872, + "step": 42190 + }, + { + "epoch": 1.2363583095290411, + "grad_norm": 20.627796173095703, + "learning_rate": 6.6378259057258594e-06, + "loss": 3.3166, + "step": 42200 + }, + { + "epoch": 1.2366512854317733, + "grad_norm": 18.690216064453125, + "learning_rate": 6.636331589531246e-06, + "loss": 3.3006, + "step": 42210 + }, + { + "epoch": 1.2369442613345052, + "grad_norm": 17.266380310058594, + "learning_rate": 6.634837109631363e-06, + "loss": 3.3157, + "step": 42220 + }, + { + "epoch": 1.2372372372372373, + "grad_norm": 16.495540618896484, + "learning_rate": 6.633342466175728e-06, + "loss": 3.2557, + "step": 42230 + }, + { + "epoch": 1.2375302131399692, + "grad_norm": 15.678487777709961, + "learning_rate": 6.6318476593138706e-06, + "loss": 3.2816, + "step": 42240 + }, + { + "epoch": 1.2378231890427012, + "grad_norm": 18.367000579833984, + "learning_rate": 6.630352689195334e-06, + "loss": 3.2958, + "step": 42250 + }, + { + "epoch": 1.2381161649454333, + "grad_norm": 20.71607780456543, + "learning_rate": 6.628857555969683e-06, + "loss": 3.2873, + "step": 42260 + }, + { + "epoch": 1.2384091408481652, + "grad_norm": 18.068899154663086, + "learning_rate": 6.6273622597865e-06, + "loss": 3.2944, + "step": 42270 + }, + { + "epoch": 1.2387021167508971, + "grad_norm": 18.948368072509766, + "learning_rate": 6.625866800795375e-06, + "loss": 3.3142, + "step": 42280 + }, + { + "epoch": 1.2389950926536293, + "grad_norm": 15.59411907196045, + "learning_rate": 6.624371179145925e-06, + "loss": 3.3079, + "step": 42290 + }, + { + "epoch": 1.2392880685563612, + "grad_norm": 16.079561233520508, + "learning_rate": 6.6228753949877734e-06, + "loss": 3.3113, + "step": 42300 + }, + { + "epoch": 
1.2395810444590931, + "grad_norm": 17.83519744873047, + "learning_rate": 6.621379448470567e-06, + "loss": 3.2931, + "step": 42310 + }, + { + "epoch": 1.2398740203618253, + "grad_norm": 18.376989364624023, + "learning_rate": 6.619883339743967e-06, + "loss": 3.2989, + "step": 42320 + }, + { + "epoch": 1.2401669962645572, + "grad_norm": 19.169633865356445, + "learning_rate": 6.6183870689576505e-06, + "loss": 3.3098, + "step": 42330 + }, + { + "epoch": 1.2404599721672893, + "grad_norm": 18.232816696166992, + "learning_rate": 6.616890636261306e-06, + "loss": 3.3111, + "step": 42340 + }, + { + "epoch": 1.2407529480700212, + "grad_norm": 15.973055839538574, + "learning_rate": 6.615394041804648e-06, + "loss": 3.2852, + "step": 42350 + }, + { + "epoch": 1.2410459239727532, + "grad_norm": 16.888999938964844, + "learning_rate": 6.6138972857374e-06, + "loss": 3.2936, + "step": 42360 + }, + { + "epoch": 1.2413388998754853, + "grad_norm": 16.06661033630371, + "learning_rate": 6.6124003682093044e-06, + "loss": 3.3016, + "step": 42370 + }, + { + "epoch": 1.2416318757782172, + "grad_norm": 15.80553150177002, + "learning_rate": 6.610903289370115e-06, + "loss": 3.2931, + "step": 42380 + }, + { + "epoch": 1.2419248516809493, + "grad_norm": 18.913835525512695, + "learning_rate": 6.609406049369611e-06, + "loss": 3.2861, + "step": 42390 + }, + { + "epoch": 1.2422178275836813, + "grad_norm": 18.464998245239258, + "learning_rate": 6.607908648357579e-06, + "loss": 3.2895, + "step": 42400 + }, + { + "epoch": 1.2425108034864132, + "grad_norm": 18.139406204223633, + "learning_rate": 6.606411086483828e-06, + "loss": 3.2843, + "step": 42410 + }, + { + "epoch": 1.2428037793891453, + "grad_norm": 16.639162063598633, + "learning_rate": 6.6049133638981755e-06, + "loss": 3.3163, + "step": 42420 + }, + { + "epoch": 1.2430967552918772, + "grad_norm": 14.972046852111816, + "learning_rate": 6.603415480750465e-06, + "loss": 3.2917, + "step": 42430 + }, + { + "epoch": 1.2433897311946092, + "grad_norm": 
19.026458740234375, + "learning_rate": 6.601917437190548e-06, + "loss": 3.2899, + "step": 42440 + }, + { + "epoch": 1.2436827070973413, + "grad_norm": 18.263408660888672, + "learning_rate": 6.600419233368295e-06, + "loss": 3.3037, + "step": 42450 + }, + { + "epoch": 1.2439756830000732, + "grad_norm": 17.735429763793945, + "learning_rate": 6.598920869433592e-06, + "loss": 3.2947, + "step": 42460 + }, + { + "epoch": 1.2442686589028051, + "grad_norm": 16.279130935668945, + "learning_rate": 6.597422345536342e-06, + "loss": 3.3134, + "step": 42470 + }, + { + "epoch": 1.2445616348055373, + "grad_norm": 18.151077270507812, + "learning_rate": 6.595923661826464e-06, + "loss": 3.2862, + "step": 42480 + }, + { + "epoch": 1.2448546107082692, + "grad_norm": 16.095537185668945, + "learning_rate": 6.594424818453891e-06, + "loss": 3.3112, + "step": 42490 + }, + { + "epoch": 1.2450303962499085, + "eval_bleu": 0.3356308990258134, + "eval_cap_loss": 0.9500963687896729, + "eval_con_loss": 1.2658045291900635, + "eval_loss": 3.4817051887512207, + "step": 42496 + }, + { + "epoch": 1.2450303962499085, + "eval_bleu": 0.3356308990258134, + "eval_cap_loss": 0.9500963687896729, + "eval_con_loss": 1.2658045291900635, + "eval_loss": 3.4817051887512207, + "eval_runtime": 67.2624, + "eval_samples_per_second": 297.343, + "eval_steps_per_second": 0.297, + "step": 42496 + }, + { + "epoch": 1.2451475866110013, + "grad_norm": 19.031124114990234, + "learning_rate": 6.592925815568576e-06, + "loss": 3.319, + "step": 42500 + }, + { + "epoch": 1.2454405625137333, + "grad_norm": 16.384918212890625, + "learning_rate": 6.591426653320482e-06, + "loss": 3.3062, + "step": 42510 + }, + { + "epoch": 1.2457335384164652, + "grad_norm": 16.579248428344727, + "learning_rate": 6.589927331859594e-06, + "loss": 3.3042, + "step": 42520 + }, + { + "epoch": 1.2460265143191973, + "grad_norm": 17.409385681152344, + "learning_rate": 6.588427851335909e-06, + "loss": 3.2804, + "step": 42530 + }, + { + "epoch": 
1.2463194902219292, + "grad_norm": 15.045611381530762, + "learning_rate": 6.586928211899442e-06, + "loss": 3.3028, + "step": 42540 + }, + { + "epoch": 1.2466124661246614, + "grad_norm": 14.860201835632324, + "learning_rate": 6.58542841370022e-06, + "loss": 3.292, + "step": 42550 + }, + { + "epoch": 1.2469054420273933, + "grad_norm": 16.867563247680664, + "learning_rate": 6.583928456888293e-06, + "loss": 3.3007, + "step": 42560 + }, + { + "epoch": 1.2471984179301252, + "grad_norm": 18.30900001525879, + "learning_rate": 6.582428341613721e-06, + "loss": 3.3166, + "step": 42570 + }, + { + "epoch": 1.2474913938328573, + "grad_norm": 20.76506805419922, + "learning_rate": 6.580928068026582e-06, + "loss": 3.3016, + "step": 42580 + }, + { + "epoch": 1.2477843697355893, + "grad_norm": 15.25455093383789, + "learning_rate": 6.579427636276968e-06, + "loss": 3.2612, + "step": 42590 + }, + { + "epoch": 1.2480773456383212, + "grad_norm": 20.221158981323242, + "learning_rate": 6.57792704651499e-06, + "loss": 3.2661, + "step": 42600 + }, + { + "epoch": 1.2483703215410533, + "grad_norm": 16.697877883911133, + "learning_rate": 6.576426298890773e-06, + "loss": 3.3002, + "step": 42610 + }, + { + "epoch": 1.2486632974437852, + "grad_norm": 18.69090461730957, + "learning_rate": 6.574925393554456e-06, + "loss": 3.3088, + "step": 42620 + }, + { + "epoch": 1.2489562733465172, + "grad_norm": 16.67581558227539, + "learning_rate": 6.5734243306561985e-06, + "loss": 3.2998, + "step": 42630 + }, + { + "epoch": 1.2492492492492493, + "grad_norm": 17.068695068359375, + "learning_rate": 6.571923110346171e-06, + "loss": 3.2904, + "step": 42640 + }, + { + "epoch": 1.2495422251519812, + "grad_norm": 18.133926391601562, + "learning_rate": 6.570421732774564e-06, + "loss": 3.3112, + "step": 42650 + }, + { + "epoch": 1.2498352010547134, + "grad_norm": 19.810691833496094, + "learning_rate": 6.5689201980915775e-06, + "loss": 3.3081, + "step": 42660 + }, + { + "epoch": 1.2501281769574453, + "grad_norm": 
19.24250030517578, + "learning_rate": 6.567418506447432e-06, + "loss": 3.2879, + "step": 42670 + }, + { + "epoch": 1.2504211528601772, + "grad_norm": 19.425575256347656, + "learning_rate": 6.565916657992366e-06, + "loss": 3.3026, + "step": 42680 + }, + { + "epoch": 1.2507141287629093, + "grad_norm": 17.904170989990234, + "learning_rate": 6.5644146528766295e-06, + "loss": 3.2923, + "step": 42690 + }, + { + "epoch": 1.2510071046656412, + "grad_norm": 16.226943969726562, + "learning_rate": 6.562912491250487e-06, + "loss": 3.2795, + "step": 42700 + }, + { + "epoch": 1.2513000805683734, + "grad_norm": 17.325698852539062, + "learning_rate": 6.5614101732642225e-06, + "loss": 3.3099, + "step": 42710 + }, + { + "epoch": 1.2515930564711053, + "grad_norm": 19.391447067260742, + "learning_rate": 6.559907699068133e-06, + "loss": 3.2813, + "step": 42720 + }, + { + "epoch": 1.2518860323738372, + "grad_norm": 16.211261749267578, + "learning_rate": 6.558405068812533e-06, + "loss": 3.2938, + "step": 42730 + }, + { + "epoch": 1.2521790082765691, + "grad_norm": 14.789870262145996, + "learning_rate": 6.556902282647753e-06, + "loss": 3.2815, + "step": 42740 + }, + { + "epoch": 1.2524719841793013, + "grad_norm": 18.215913772583008, + "learning_rate": 6.555399340724135e-06, + "loss": 3.2782, + "step": 42750 + }, + { + "epoch": 1.2527649600820332, + "grad_norm": 19.541584014892578, + "learning_rate": 6.553896243192042e-06, + "loss": 3.3007, + "step": 42760 + }, + { + "epoch": 1.2530579359847653, + "grad_norm": 19.31566047668457, + "learning_rate": 6.552392990201848e-06, + "loss": 3.313, + "step": 42770 + }, + { + "epoch": 1.2533509118874973, + "grad_norm": 14.898724555969238, + "learning_rate": 6.5508895819039456e-06, + "loss": 3.282, + "step": 42780 + }, + { + "epoch": 1.2536438877902292, + "grad_norm": 19.836576461791992, + "learning_rate": 6.5493860184487405e-06, + "loss": 3.2876, + "step": 42790 + }, + { + "epoch": 1.2539368636929613, + "grad_norm": 17.153745651245117, + 
"learning_rate": 6.547882299986658e-06, + "loss": 3.27, + "step": 42800 + }, + { + "epoch": 1.2542298395956932, + "grad_norm": 17.420310974121094, + "learning_rate": 6.546378426668134e-06, + "loss": 3.2912, + "step": 42810 + }, + { + "epoch": 1.2545228154984254, + "grad_norm": 19.326034545898438, + "learning_rate": 6.5448743986436225e-06, + "loss": 3.2962, + "step": 42820 + }, + { + "epoch": 1.2548157914011573, + "grad_norm": 17.918872833251953, + "learning_rate": 6.543370216063593e-06, + "loss": 3.2815, + "step": 42830 + }, + { + "epoch": 1.2551087673038892, + "grad_norm": 18.99216651916504, + "learning_rate": 6.541865879078528e-06, + "loss": 3.2867, + "step": 42840 + }, + { + "epoch": 1.2554017432066211, + "grad_norm": 18.85188865661621, + "learning_rate": 6.540361387838933e-06, + "loss": 3.2841, + "step": 42850 + }, + { + "epoch": 1.2556947191093533, + "grad_norm": 15.88124942779541, + "learning_rate": 6.538856742495319e-06, + "loss": 3.2844, + "step": 42860 + }, + { + "epoch": 1.2559876950120852, + "grad_norm": 16.538667678833008, + "learning_rate": 6.537351943198216e-06, + "loss": 3.2942, + "step": 42870 + }, + { + "epoch": 1.2562806709148173, + "grad_norm": 14.501191139221191, + "learning_rate": 6.5358469900981726e-06, + "loss": 3.278, + "step": 42880 + }, + { + "epoch": 1.2565736468175492, + "grad_norm": 16.262788772583008, + "learning_rate": 6.534341883345749e-06, + "loss": 3.2816, + "step": 42890 + }, + { + "epoch": 1.2568666227202812, + "grad_norm": 17.81420135498047, + "learning_rate": 6.532836623091525e-06, + "loss": 3.3044, + "step": 42900 + }, + { + "epoch": 1.2571595986230133, + "grad_norm": 17.918407440185547, + "learning_rate": 6.531331209486089e-06, + "loss": 3.2699, + "step": 42910 + }, + { + "epoch": 1.2574525745257452, + "grad_norm": 18.649972915649414, + "learning_rate": 6.529825642680051e-06, + "loss": 3.3008, + "step": 42920 + }, + { + "epoch": 1.2577455504284774, + "grad_norm": 15.804500579833984, + "learning_rate": 6.528319922824033e-06, + 
"loss": 3.303, + "step": 42930 + }, + { + "epoch": 1.2580385263312093, + "grad_norm": 18.412899017333984, + "learning_rate": 6.5268140500686764e-06, + "loss": 3.2878, + "step": 42940 + }, + { + "epoch": 1.2583315022339412, + "grad_norm": 20.39000701904297, + "learning_rate": 6.525308024564631e-06, + "loss": 3.3081, + "step": 42950 + }, + { + "epoch": 1.2586244781366733, + "grad_norm": 19.03835678100586, + "learning_rate": 6.523801846462565e-06, + "loss": 3.3218, + "step": 42960 + }, + { + "epoch": 1.2589174540394052, + "grad_norm": 16.592430114746094, + "learning_rate": 6.522295515913168e-06, + "loss": 3.297, + "step": 42970 + }, + { + "epoch": 1.2592104299421374, + "grad_norm": 17.665632247924805, + "learning_rate": 6.520789033067134e-06, + "loss": 3.3106, + "step": 42980 + }, + { + "epoch": 1.2595034058448693, + "grad_norm": 15.410162925720215, + "learning_rate": 6.51928239807518e-06, + "loss": 3.2961, + "step": 42990 + }, + { + "epoch": 1.2597963817476012, + "grad_norm": 16.413599014282227, + "learning_rate": 6.517775611088037e-06, + "loss": 3.294, + "step": 43000 + }, + { + "epoch": 1.2600307624697868, + "eval_bleu": 0.3355337953715731, + "eval_cap_loss": 0.949788510799408, + "eval_con_loss": 1.2647836208343506, + "eval_loss": 3.479355812072754, + "step": 43008 + }, + { + "epoch": 1.2600307624697868, + "eval_bleu": 0.3355337953715731, + "eval_cap_loss": 0.949788510799408, + "eval_con_loss": 1.2647836208343506, + "eval_loss": 3.479355812072754, + "eval_runtime": 53.2716, + "eval_samples_per_second": 375.435, + "eval_steps_per_second": 0.375, + "step": 43008 + }, + { + "epoch": 1.2600893576503331, + "grad_norm": 18.218059539794922, + "learning_rate": 6.516268672256447e-06, + "loss": 3.2841, + "step": 43010 + }, + { + "epoch": 1.2603823335530653, + "grad_norm": 16.27004051208496, + "learning_rate": 6.5147615817311725e-06, + "loss": 3.2962, + "step": 43020 + }, + { + "epoch": 1.2606753094557972, + "grad_norm": 21.72547721862793, + "learning_rate": 
6.51325433966299e-06, + "loss": 3.307, + "step": 43030 + }, + { + "epoch": 1.2609682853585293, + "grad_norm": 17.32815933227539, + "learning_rate": 6.511746946202686e-06, + "loss": 3.285, + "step": 43040 + }, + { + "epoch": 1.2612612612612613, + "grad_norm": 17.479663848876953, + "learning_rate": 6.510239401501071e-06, + "loss": 3.3041, + "step": 43050 + }, + { + "epoch": 1.2615542371639932, + "grad_norm": 17.041973114013672, + "learning_rate": 6.508731705708964e-06, + "loss": 3.329, + "step": 43060 + }, + { + "epoch": 1.2618472130667253, + "grad_norm": 16.369653701782227, + "learning_rate": 6.507223858977198e-06, + "loss": 3.3069, + "step": 43070 + }, + { + "epoch": 1.2621401889694572, + "grad_norm": 17.746599197387695, + "learning_rate": 6.50571586145663e-06, + "loss": 3.3072, + "step": 43080 + }, + { + "epoch": 1.2624331648721894, + "grad_norm": 15.97793960571289, + "learning_rate": 6.504207713298123e-06, + "loss": 3.2638, + "step": 43090 + }, + { + "epoch": 1.2627261407749213, + "grad_norm": 17.23325538635254, + "learning_rate": 6.502699414652558e-06, + "loss": 3.2936, + "step": 43100 + }, + { + "epoch": 1.2630191166776532, + "grad_norm": 17.686616897583008, + "learning_rate": 6.5011909656708305e-06, + "loss": 3.2702, + "step": 43110 + }, + { + "epoch": 1.2633120925803851, + "grad_norm": 15.213828086853027, + "learning_rate": 6.4996823665038546e-06, + "loss": 3.2889, + "step": 43120 + }, + { + "epoch": 1.2636050684831173, + "grad_norm": 17.58605194091797, + "learning_rate": 6.4981736173025545e-06, + "loss": 3.281, + "step": 43130 + }, + { + "epoch": 1.2638980443858492, + "grad_norm": 15.62355899810791, + "learning_rate": 6.496664718217873e-06, + "loss": 3.285, + "step": 43140 + }, + { + "epoch": 1.2641910202885813, + "grad_norm": 18.043285369873047, + "learning_rate": 6.4951556694007644e-06, + "loss": 3.2873, + "step": 43150 + }, + { + "epoch": 1.2644839961913132, + "grad_norm": 18.595787048339844, + "learning_rate": 6.493646471002203e-06, + "loss": 3.3053, + 
"step": 43160 + }, + { + "epoch": 1.2647769720940452, + "grad_norm": 17.516368865966797, + "learning_rate": 6.4921371231731735e-06, + "loss": 3.262, + "step": 43170 + }, + { + "epoch": 1.2650699479967773, + "grad_norm": 20.362316131591797, + "learning_rate": 6.490627626064677e-06, + "loss": 3.2802, + "step": 43180 + }, + { + "epoch": 1.2653629238995092, + "grad_norm": 18.533184051513672, + "learning_rate": 6.4891179798277294e-06, + "loss": 3.3054, + "step": 43190 + }, + { + "epoch": 1.2656558998022414, + "grad_norm": 18.27627182006836, + "learning_rate": 6.487608184613363e-06, + "loss": 3.2778, + "step": 43200 + }, + { + "epoch": 1.2659488757049733, + "grad_norm": 15.816596984863281, + "learning_rate": 6.486098240572624e-06, + "loss": 3.2787, + "step": 43210 + }, + { + "epoch": 1.2662418516077052, + "grad_norm": 19.09499168395996, + "learning_rate": 6.484588147856572e-06, + "loss": 3.3152, + "step": 43220 + }, + { + "epoch": 1.2665348275104373, + "grad_norm": 21.491819381713867, + "learning_rate": 6.483077906616285e-06, + "loss": 3.2842, + "step": 43230 + }, + { + "epoch": 1.2668278034131693, + "grad_norm": 17.41731071472168, + "learning_rate": 6.48156751700285e-06, + "loss": 3.3111, + "step": 43240 + }, + { + "epoch": 1.2671207793159014, + "grad_norm": 17.446102142333984, + "learning_rate": 6.480056979167377e-06, + "loss": 3.2937, + "step": 43250 + }, + { + "epoch": 1.2674137552186333, + "grad_norm": 19.8627872467041, + "learning_rate": 6.4785462932609836e-06, + "loss": 3.271, + "step": 43260 + }, + { + "epoch": 1.2677067311213652, + "grad_norm": 20.367727279663086, + "learning_rate": 6.477035459434805e-06, + "loss": 3.2926, + "step": 43270 + }, + { + "epoch": 1.2679997070240971, + "grad_norm": 15.29594898223877, + "learning_rate": 6.4755244778399905e-06, + "loss": 3.2988, + "step": 43280 + }, + { + "epoch": 1.2682926829268293, + "grad_norm": 16.895328521728516, + "learning_rate": 6.474013348627708e-06, + "loss": 3.2996, + "step": 43290 + }, + { + "epoch": 
1.2685856588295612, + "grad_norm": 16.447696685791016, + "learning_rate": 6.472502071949134e-06, + "loss": 3.2734, + "step": 43300 + }, + { + "epoch": 1.2688786347322933, + "grad_norm": 16.25102996826172, + "learning_rate": 6.470990647955464e-06, + "loss": 3.2963, + "step": 43310 + }, + { + "epoch": 1.2691716106350253, + "grad_norm": 15.6679048538208, + "learning_rate": 6.469479076797906e-06, + "loss": 3.3, + "step": 43320 + }, + { + "epoch": 1.2694645865377572, + "grad_norm": 17.562606811523438, + "learning_rate": 6.467967358627686e-06, + "loss": 3.2861, + "step": 43330 + }, + { + "epoch": 1.2697575624404893, + "grad_norm": 16.30111312866211, + "learning_rate": 6.46645549359604e-06, + "loss": 3.3127, + "step": 43340 + }, + { + "epoch": 1.2700505383432212, + "grad_norm": 19.677661895751953, + "learning_rate": 6.464943481854222e-06, + "loss": 3.2889, + "step": 43350 + }, + { + "epoch": 1.2703435142459534, + "grad_norm": 15.65476131439209, + "learning_rate": 6.4634313235535e-06, + "loss": 3.2945, + "step": 43360 + }, + { + "epoch": 1.2706364901486853, + "grad_norm": 18.69684600830078, + "learning_rate": 6.461919018845156e-06, + "loss": 3.295, + "step": 43370 + }, + { + "epoch": 1.2709294660514172, + "grad_norm": 18.802112579345703, + "learning_rate": 6.460406567880488e-06, + "loss": 3.2908, + "step": 43380 + }, + { + "epoch": 1.2712224419541494, + "grad_norm": 15.031940460205078, + "learning_rate": 6.458893970810808e-06, + "loss": 3.2682, + "step": 43390 + }, + { + "epoch": 1.2715154178568813, + "grad_norm": 18.08688735961914, + "learning_rate": 6.457381227787439e-06, + "loss": 3.272, + "step": 43400 + }, + { + "epoch": 1.2718083937596134, + "grad_norm": 19.657264709472656, + "learning_rate": 6.455868338961727e-06, + "loss": 3.2863, + "step": 43410 + }, + { + "epoch": 1.2721013696623453, + "grad_norm": 17.15200424194336, + "learning_rate": 6.454355304485024e-06, + "loss": 3.289, + "step": 43420 + }, + { + "epoch": 1.2723943455650772, + "grad_norm": 
18.204465866088867, + "learning_rate": 6.452842124508701e-06, + "loss": 3.2944, + "step": 43430 + }, + { + "epoch": 1.2726873214678092, + "grad_norm": 18.980440139770508, + "learning_rate": 6.451328799184142e-06, + "loss": 3.2936, + "step": 43440 + }, + { + "epoch": 1.2729802973705413, + "grad_norm": 19.81885528564453, + "learning_rate": 6.449815328662749e-06, + "loss": 3.2899, + "step": 43450 + }, + { + "epoch": 1.2732732732732732, + "grad_norm": 16.285547256469727, + "learning_rate": 6.448301713095934e-06, + "loss": 3.2883, + "step": 43460 + }, + { + "epoch": 1.2735662491760054, + "grad_norm": 15.083161354064941, + "learning_rate": 6.446787952635124e-06, + "loss": 3.2944, + "step": 43470 + }, + { + "epoch": 1.2738592250787373, + "grad_norm": 14.919405937194824, + "learning_rate": 6.445274047431762e-06, + "loss": 3.2843, + "step": 43480 + }, + { + "epoch": 1.2741522009814692, + "grad_norm": 18.198335647583008, + "learning_rate": 6.443759997637306e-06, + "loss": 3.2962, + "step": 43490 + }, + { + "epoch": 1.2744451768842013, + "grad_norm": 16.713027954101562, + "learning_rate": 6.442245803403231e-06, + "loss": 3.2805, + "step": 43500 + }, + { + "epoch": 1.2747381527869333, + "grad_norm": 17.483642578125, + "learning_rate": 6.440731464881017e-06, + "loss": 3.2833, + "step": 43510 + }, + { + "epoch": 1.2750311286896654, + "grad_norm": 18.284425735473633, + "learning_rate": 6.4392169822221675e-06, + "loss": 3.2889, + "step": 43520 + }, + { + "epoch": 1.2750311286896654, + "eval_bleu": 0.33541517866720794, + "eval_cap_loss": 0.9490867257118225, + "eval_con_loss": 1.2625164985656738, + "eval_loss": 3.4741196632385254, + "step": 43520 + }, + { + "epoch": 1.2750311286896654, + "eval_bleu": 0.33541517866720794, + "eval_cap_loss": 0.9490867257118225, + "eval_con_loss": 1.2625164985656738, + "eval_loss": 3.4741196632385254, + "eval_runtime": 56.3708, + "eval_samples_per_second": 354.793, + "eval_steps_per_second": 0.355, + "step": 43520 + }, + { + "epoch": 
1.2753241045923973, + "grad_norm": 17.925201416015625, + "learning_rate": 6.4377023555781995e-06, + "loss": 3.2873, + "step": 43530 + }, + { + "epoch": 1.2756170804951292, + "grad_norm": 16.298555374145508, + "learning_rate": 6.436187585100638e-06, + "loss": 3.3313, + "step": 43540 + }, + { + "epoch": 1.2759100563978611, + "grad_norm": 12.936400413513184, + "learning_rate": 6.434672670941031e-06, + "loss": 3.2687, + "step": 43550 + }, + { + "epoch": 1.2762030323005933, + "grad_norm": 18.128612518310547, + "learning_rate": 6.4331576132509335e-06, + "loss": 3.2581, + "step": 43560 + }, + { + "epoch": 1.2764960082033252, + "grad_norm": 16.425832748413086, + "learning_rate": 6.4316424121819195e-06, + "loss": 3.3048, + "step": 43570 + }, + { + "epoch": 1.2767889841060573, + "grad_norm": 17.999160766601562, + "learning_rate": 6.430127067885577e-06, + "loss": 3.2921, + "step": 43580 + }, + { + "epoch": 1.2770819600087893, + "grad_norm": 16.435895919799805, + "learning_rate": 6.428611580513505e-06, + "loss": 3.2883, + "step": 43590 + }, + { + "epoch": 1.2773749359115212, + "grad_norm": 15.971597671508789, + "learning_rate": 6.427095950217321e-06, + "loss": 3.2784, + "step": 43600 + }, + { + "epoch": 1.2776679118142533, + "grad_norm": 18.12330436706543, + "learning_rate": 6.425580177148653e-06, + "loss": 3.2944, + "step": 43610 + }, + { + "epoch": 1.2779608877169852, + "grad_norm": 18.160400390625, + "learning_rate": 6.424064261459147e-06, + "loss": 3.2957, + "step": 43620 + }, + { + "epoch": 1.2782538636197174, + "grad_norm": 18.57019805908203, + "learning_rate": 6.422548203300459e-06, + "loss": 3.313, + "step": 43630 + }, + { + "epoch": 1.2785468395224493, + "grad_norm": 16.393856048583984, + "learning_rate": 6.4210320028242646e-06, + "loss": 3.2727, + "step": 43640 + }, + { + "epoch": 1.2788398154251812, + "grad_norm": 17.847209930419922, + "learning_rate": 6.419515660182247e-06, + "loss": 3.2773, + "step": 43650 + }, + { + "epoch": 1.2791327913279134, + "grad_norm": 
16.593114852905273, + "learning_rate": 6.41799917552611e-06, + "loss": 3.2904, + "step": 43660 + }, + { + "epoch": 1.2794257672306453, + "grad_norm": 15.865588188171387, + "learning_rate": 6.416482549007569e-06, + "loss": 3.2713, + "step": 43670 + }, + { + "epoch": 1.2797187431333774, + "grad_norm": 18.093822479248047, + "learning_rate": 6.4149657807783525e-06, + "loss": 3.2857, + "step": 43680 + }, + { + "epoch": 1.2800117190361093, + "grad_norm": 16.73385238647461, + "learning_rate": 6.4134488709902034e-06, + "loss": 3.2656, + "step": 43690 + }, + { + "epoch": 1.2803046949388412, + "grad_norm": 17.700817108154297, + "learning_rate": 6.411931819794881e-06, + "loss": 3.2649, + "step": 43700 + }, + { + "epoch": 1.2805976708415732, + "grad_norm": 18.171188354492188, + "learning_rate": 6.410414627344157e-06, + "loss": 3.2802, + "step": 43710 + }, + { + "epoch": 1.2808906467443053, + "grad_norm": 20.75981903076172, + "learning_rate": 6.4088972937898175e-06, + "loss": 3.2916, + "step": 43720 + }, + { + "epoch": 1.2811836226470372, + "grad_norm": 14.300901412963867, + "learning_rate": 6.407379819283661e-06, + "loss": 3.2676, + "step": 43730 + }, + { + "epoch": 1.2814765985497694, + "grad_norm": 19.302183151245117, + "learning_rate": 6.405862203977505e-06, + "loss": 3.2785, + "step": 43740 + }, + { + "epoch": 1.2817695744525013, + "grad_norm": 16.735506057739258, + "learning_rate": 6.404344448023175e-06, + "loss": 3.2914, + "step": 43750 + }, + { + "epoch": 1.2820625503552332, + "grad_norm": 20.21074104309082, + "learning_rate": 6.4028265515725165e-06, + "loss": 3.2913, + "step": 43760 + }, + { + "epoch": 1.2823555262579653, + "grad_norm": 15.653417587280273, + "learning_rate": 6.401308514777382e-06, + "loss": 3.2986, + "step": 43770 + }, + { + "epoch": 1.2826485021606973, + "grad_norm": 15.59525203704834, + "learning_rate": 6.399790337789646e-06, + "loss": 3.2958, + "step": 43780 + }, + { + "epoch": 1.2829414780634294, + "grad_norm": 17.92841339111328, + "learning_rate": 
6.3982720207611895e-06, + "loss": 3.2878, + "step": 43790 + }, + { + "epoch": 1.2832344539661613, + "grad_norm": 15.808941841125488, + "learning_rate": 6.396753563843915e-06, + "loss": 3.2822, + "step": 43800 + }, + { + "epoch": 1.2835274298688932, + "grad_norm": 16.7301025390625, + "learning_rate": 6.395234967189731e-06, + "loss": 3.2895, + "step": 43810 + }, + { + "epoch": 1.2838204057716252, + "grad_norm": 17.927518844604492, + "learning_rate": 6.3937162309505676e-06, + "loss": 3.2788, + "step": 43820 + }, + { + "epoch": 1.2841133816743573, + "grad_norm": 18.351322174072266, + "learning_rate": 6.392197355278363e-06, + "loss": 3.2718, + "step": 43830 + }, + { + "epoch": 1.2844063575770892, + "grad_norm": 19.87566375732422, + "learning_rate": 6.390678340325074e-06, + "loss": 3.2969, + "step": 43840 + }, + { + "epoch": 1.2846993334798213, + "grad_norm": 18.191049575805664, + "learning_rate": 6.389159186242666e-06, + "loss": 3.303, + "step": 43850 + }, + { + "epoch": 1.2849923093825533, + "grad_norm": 15.793567657470703, + "learning_rate": 6.387639893183124e-06, + "loss": 3.2801, + "step": 43860 + }, + { + "epoch": 1.2852852852852852, + "grad_norm": 18.00504493713379, + "learning_rate": 6.386120461298444e-06, + "loss": 3.304, + "step": 43870 + }, + { + "epoch": 1.2855782611880173, + "grad_norm": 17.708621978759766, + "learning_rate": 6.384600890740634e-06, + "loss": 3.2653, + "step": 43880 + }, + { + "epoch": 1.2858712370907492, + "grad_norm": 17.972612380981445, + "learning_rate": 6.38308118166172e-06, + "loss": 3.3049, + "step": 43890 + }, + { + "epoch": 1.2861642129934814, + "grad_norm": 16.46506118774414, + "learning_rate": 6.381561334213739e-06, + "loss": 3.272, + "step": 43900 + }, + { + "epoch": 1.2864571888962133, + "grad_norm": 17.447912216186523, + "learning_rate": 6.380041348548745e-06, + "loss": 3.2792, + "step": 43910 + }, + { + "epoch": 1.2867501647989452, + "grad_norm": 16.716840744018555, + "learning_rate": 6.378521224818798e-06, + "loss": 3.2869, + 
"step": 43920 + }, + { + "epoch": 1.2870431407016774, + "grad_norm": 17.617164611816406, + "learning_rate": 6.377000963175983e-06, + "loss": 3.2956, + "step": 43930 + }, + { + "epoch": 1.2873361166044093, + "grad_norm": 17.259357452392578, + "learning_rate": 6.375480563772391e-06, + "loss": 3.2874, + "step": 43940 + }, + { + "epoch": 1.2876290925071414, + "grad_norm": 16.654752731323242, + "learning_rate": 6.373960026760129e-06, + "loss": 3.2654, + "step": 43950 + }, + { + "epoch": 1.2879220684098733, + "grad_norm": 14.737160682678223, + "learning_rate": 6.372439352291317e-06, + "loss": 3.281, + "step": 43960 + }, + { + "epoch": 1.2882150443126053, + "grad_norm": 17.91364860534668, + "learning_rate": 6.371070627869777e-06, + "loss": 3.301, + "step": 43970 + }, + { + "epoch": 1.2885080202153372, + "grad_norm": 19.437728881835938, + "learning_rate": 6.3695496926526635e-06, + "loss": 3.2848, + "step": 43980 + }, + { + "epoch": 1.2888009961180693, + "grad_norm": 17.33213996887207, + "learning_rate": 6.36802862042023e-06, + "loss": 3.3065, + "step": 43990 + }, + { + "epoch": 1.2890939720208012, + "grad_norm": 16.979280471801758, + "learning_rate": 6.366507411324648e-06, + "loss": 3.2735, + "step": 44000 + }, + { + "epoch": 1.2893869479235334, + "grad_norm": 21.276334762573242, + "learning_rate": 6.364986065518106e-06, + "loss": 3.2807, + "step": 44010 + }, + { + "epoch": 1.2896799238262653, + "grad_norm": 14.173625946044922, + "learning_rate": 6.363464583152807e-06, + "loss": 3.2603, + "step": 44020 + }, + { + "epoch": 1.2899728997289972, + "grad_norm": 16.72439956665039, + "learning_rate": 6.361942964380967e-06, + "loss": 3.2846, + "step": 44030 + }, + { + "epoch": 1.2900314949095437, + "eval_bleu": 0.3358645047415229, + "eval_cap_loss": 0.948165774345398, + "eval_con_loss": 1.2624199390411377, + "eval_loss": 3.473005533218384, + "step": 44032 + }, + { + "epoch": 1.2900314949095437, + "eval_bleu": 0.3358645047415229, + "eval_cap_loss": 0.948165774345398, + 
"eval_con_loss": 1.2624199390411377, + "eval_loss": 3.473005533218384, + "eval_runtime": 54.8745, + "eval_samples_per_second": 364.468, + "eval_steps_per_second": 0.364, + "step": 44032 + }, + { + "epoch": 1.2902658756317293, + "grad_norm": 16.973936080932617, + "learning_rate": 6.360421209354812e-06, + "loss": 3.2808, + "step": 44040 + }, + { + "epoch": 1.2905588515344613, + "grad_norm": 18.285110473632812, + "learning_rate": 6.358899318226587e-06, + "loss": 3.27, + "step": 44050 + }, + { + "epoch": 1.2908518274371934, + "grad_norm": 17.555795669555664, + "learning_rate": 6.357377291148547e-06, + "loss": 3.2635, + "step": 44060 + }, + { + "epoch": 1.2911448033399253, + "grad_norm": 17.518306732177734, + "learning_rate": 6.355855128272962e-06, + "loss": 3.3012, + "step": 44070 + }, + { + "epoch": 1.2914377792426572, + "grad_norm": 16.627347946166992, + "learning_rate": 6.354332829752116e-06, + "loss": 3.2699, + "step": 44080 + }, + { + "epoch": 1.2917307551453892, + "grad_norm": 16.822729110717773, + "learning_rate": 6.352810395738304e-06, + "loss": 3.2823, + "step": 44090 + }, + { + "epoch": 1.2920237310481213, + "grad_norm": 17.837337493896484, + "learning_rate": 6.351287826383838e-06, + "loss": 3.2864, + "step": 44100 + }, + { + "epoch": 1.2923167069508532, + "grad_norm": 16.970958709716797, + "learning_rate": 6.3497651218410425e-06, + "loss": 3.2932, + "step": 44110 + }, + { + "epoch": 1.2926096828535854, + "grad_norm": 19.401355743408203, + "learning_rate": 6.348242282262256e-06, + "loss": 3.2946, + "step": 44120 + }, + { + "epoch": 1.2929026587563173, + "grad_norm": 17.533756256103516, + "learning_rate": 6.346719307799825e-06, + "loss": 3.2675, + "step": 44130 + }, + { + "epoch": 1.2931956346590492, + "grad_norm": 16.406158447265625, + "learning_rate": 6.345196198606118e-06, + "loss": 3.2762, + "step": 44140 + }, + { + "epoch": 1.2934886105617813, + "grad_norm": 15.512602806091309, + "learning_rate": 6.343672954833513e-06, + "loss": 3.2739, + "step": 44150 + 
}, + { + "epoch": 1.2937815864645132, + "grad_norm": 16.911334991455078, + "learning_rate": 6.342149576634401e-06, + "loss": 3.2668, + "step": 44160 + }, + { + "epoch": 1.2940745623672454, + "grad_norm": 18.067626953125, + "learning_rate": 6.340626064161185e-06, + "loss": 3.31, + "step": 44170 + }, + { + "epoch": 1.2943675382699773, + "grad_norm": 18.174604415893555, + "learning_rate": 6.339102417566285e-06, + "loss": 3.276, + "step": 44180 + }, + { + "epoch": 1.2946605141727092, + "grad_norm": 17.073450088500977, + "learning_rate": 6.337578637002134e-06, + "loss": 3.2736, + "step": 44190 + }, + { + "epoch": 1.2949534900754414, + "grad_norm": 20.70328712463379, + "learning_rate": 6.336054722621177e-06, + "loss": 3.2962, + "step": 44200 + }, + { + "epoch": 1.2952464659781733, + "grad_norm": 15.987021446228027, + "learning_rate": 6.334530674575868e-06, + "loss": 3.2804, + "step": 44210 + }, + { + "epoch": 1.2955394418809054, + "grad_norm": 16.349340438842773, + "learning_rate": 6.3330064930186855e-06, + "loss": 3.28, + "step": 44220 + }, + { + "epoch": 1.2958324177836373, + "grad_norm": 15.96300983428955, + "learning_rate": 6.3314821781021105e-06, + "loss": 3.289, + "step": 44230 + }, + { + "epoch": 1.2961253936863693, + "grad_norm": 19.61104393005371, + "learning_rate": 6.329957729978642e-06, + "loss": 3.2641, + "step": 44240 + }, + { + "epoch": 1.2964183695891012, + "grad_norm": 15.405271530151367, + "learning_rate": 6.328433148800794e-06, + "loss": 3.2803, + "step": 44250 + }, + { + "epoch": 1.2967113454918333, + "grad_norm": 17.88445472717285, + "learning_rate": 6.32690843472109e-06, + "loss": 3.2734, + "step": 44260 + }, + { + "epoch": 1.2970043213945652, + "grad_norm": 14.261899948120117, + "learning_rate": 6.3253835878920725e-06, + "loss": 3.2714, + "step": 44270 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 21.379013061523438, + "learning_rate": 6.323858608466288e-06, + "loss": 3.2685, + "step": 44280 + }, + { + "epoch": 1.2975902732000293, + 
"grad_norm": 20.204193115234375, + "learning_rate": 6.322333496596304e-06, + "loss": 3.3022, + "step": 44290 + }, + { + "epoch": 1.2978832491027612, + "grad_norm": 17.52553939819336, + "learning_rate": 6.320808252434698e-06, + "loss": 3.2595, + "step": 44300 + }, + { + "epoch": 1.2981762250054933, + "grad_norm": 17.28678321838379, + "learning_rate": 6.319282876134064e-06, + "loss": 3.305, + "step": 44310 + }, + { + "epoch": 1.2984692009082253, + "grad_norm": 19.371509552001953, + "learning_rate": 6.317757367847005e-06, + "loss": 3.258, + "step": 44320 + }, + { + "epoch": 1.2987621768109574, + "grad_norm": 16.741029739379883, + "learning_rate": 6.316231727726139e-06, + "loss": 3.2904, + "step": 44330 + }, + { + "epoch": 1.2990551527136893, + "grad_norm": 16.11528968811035, + "learning_rate": 6.314705955924096e-06, + "loss": 3.2621, + "step": 44340 + }, + { + "epoch": 1.2993481286164212, + "grad_norm": 15.514640808105469, + "learning_rate": 6.313180052593524e-06, + "loss": 3.2686, + "step": 44350 + }, + { + "epoch": 1.2996411045191534, + "grad_norm": 18.397117614746094, + "learning_rate": 6.3116540178870786e-06, + "loss": 3.2725, + "step": 44360 + }, + { + "epoch": 1.2999340804218853, + "grad_norm": 15.54427719116211, + "learning_rate": 6.31012785195743e-06, + "loss": 3.2763, + "step": 44370 + }, + { + "epoch": 1.3002270563246174, + "grad_norm": 16.39841651916504, + "learning_rate": 6.308601554957262e-06, + "loss": 3.2602, + "step": 44380 + }, + { + "epoch": 1.3005200322273494, + "grad_norm": 17.714582443237305, + "learning_rate": 6.307075127039273e-06, + "loss": 3.2969, + "step": 44390 + }, + { + "epoch": 1.3008130081300813, + "grad_norm": 17.01740074157715, + "learning_rate": 6.305548568356172e-06, + "loss": 3.2944, + "step": 44400 + }, + { + "epoch": 1.3011059840328132, + "grad_norm": 16.379850387573242, + "learning_rate": 6.304021879060682e-06, + "loss": 3.2853, + "step": 44410 + }, + { + "epoch": 1.3013989599355453, + "grad_norm": 16.299421310424805, + 
"learning_rate": 6.302495059305539e-06, + "loss": 3.2875, + "step": 44420 + }, + { + "epoch": 1.3016919358382772, + "grad_norm": 16.935901641845703, + "learning_rate": 6.300968109243494e-06, + "loss": 3.273, + "step": 44430 + }, + { + "epoch": 1.3019849117410094, + "grad_norm": 17.84444236755371, + "learning_rate": 6.2994410290273065e-06, + "loss": 3.2851, + "step": 44440 + }, + { + "epoch": 1.3022778876437413, + "grad_norm": 18.628753662109375, + "learning_rate": 6.297913818809755e-06, + "loss": 3.2934, + "step": 44450 + }, + { + "epoch": 1.3025708635464732, + "grad_norm": 17.137296676635742, + "learning_rate": 6.2963864787436245e-06, + "loss": 3.304, + "step": 44460 + }, + { + "epoch": 1.3028638394492054, + "grad_norm": 15.141984939575195, + "learning_rate": 6.29485900898172e-06, + "loss": 3.2889, + "step": 44470 + }, + { + "epoch": 1.3031568153519373, + "grad_norm": 16.291576385498047, + "learning_rate": 6.2933314096768525e-06, + "loss": 3.2772, + "step": 44480 + }, + { + "epoch": 1.3034497912546694, + "grad_norm": 15.361674308776855, + "learning_rate": 6.29180368098185e-06, + "loss": 3.2926, + "step": 44490 + }, + { + "epoch": 1.3037427671574013, + "grad_norm": 17.328899383544922, + "learning_rate": 6.290275823049555e-06, + "loss": 3.285, + "step": 44500 + }, + { + "epoch": 1.3040357430601333, + "grad_norm": 16.528976440429688, + "learning_rate": 6.2887478360328165e-06, + "loss": 3.2713, + "step": 44510 + }, + { + "epoch": 1.3043287189628652, + "grad_norm": 17.483169555664062, + "learning_rate": 6.287219720084505e-06, + "loss": 3.2541, + "step": 44520 + }, + { + "epoch": 1.3046216948655973, + "grad_norm": 17.305744171142578, + "learning_rate": 6.285691475357497e-06, + "loss": 3.2689, + "step": 44530 + }, + { + "epoch": 1.3049146707683292, + "grad_norm": 18.147069931030273, + "learning_rate": 6.284163102004683e-06, + "loss": 3.3032, + "step": 44540 + }, + { + "epoch": 1.305031861129422, + "eval_bleu": 0.336704111254563, + "eval_cap_loss": 0.9472922086715698, + 
"eval_con_loss": 1.2576887607574463, + "eval_loss": 3.462669610977173, + "step": 44544 + }, + { + "epoch": 1.305031861129422, + "eval_bleu": 0.336704111254563, + "eval_cap_loss": 0.9472922086715698, + "eval_con_loss": 1.2576887607574463, + "eval_loss": 3.462669610977173, + "eval_runtime": 55.5373, + "eval_samples_per_second": 360.119, + "eval_steps_per_second": 0.36, + "step": 44544 + }, + { + "epoch": 1.3052076466710614, + "grad_norm": 15.553308486938477, + "learning_rate": 6.28263460017897e-06, + "loss": 3.2786, + "step": 44550 + }, + { + "epoch": 1.3055006225737933, + "grad_norm": 15.944170951843262, + "learning_rate": 6.281105970033277e-06, + "loss": 3.3052, + "step": 44560 + }, + { + "epoch": 1.3057935984765252, + "grad_norm": 18.56805419921875, + "learning_rate": 6.27957721172053e-06, + "loss": 3.2843, + "step": 44570 + }, + { + "epoch": 1.3060865743792573, + "grad_norm": 20.8170166015625, + "learning_rate": 6.278048325393674e-06, + "loss": 3.2521, + "step": 44580 + }, + { + "epoch": 1.3063795502819893, + "grad_norm": 17.9276065826416, + "learning_rate": 6.276519311205667e-06, + "loss": 3.2593, + "step": 44590 + }, + { + "epoch": 1.3066725261847214, + "grad_norm": 16.4718074798584, + "learning_rate": 6.274990169309475e-06, + "loss": 3.2808, + "step": 44600 + }, + { + "epoch": 1.3069655020874533, + "grad_norm": 17.74094581604004, + "learning_rate": 6.2734608998580815e-06, + "loss": 3.3097, + "step": 44610 + }, + { + "epoch": 1.3072584779901852, + "grad_norm": 18.4493465423584, + "learning_rate": 6.2719315030044805e-06, + "loss": 3.3002, + "step": 44620 + }, + { + "epoch": 1.3075514538929174, + "grad_norm": 18.24626350402832, + "learning_rate": 6.270401978901678e-06, + "loss": 3.273, + "step": 44630 + }, + { + "epoch": 1.3078444297956493, + "grad_norm": 19.151813507080078, + "learning_rate": 6.2688723277026955e-06, + "loss": 3.2537, + "step": 44640 + }, + { + "epoch": 1.3081374056983814, + "grad_norm": 21.9906005859375, + "learning_rate": 6.267342549560564e-06, 
+ "loss": 3.2951, + "step": 44650 + }, + { + "epoch": 1.3084303816011134, + "grad_norm": 19.077987670898438, + "learning_rate": 6.26581264462833e-06, + "loss": 3.2576, + "step": 44660 + }, + { + "epoch": 1.3087233575038453, + "grad_norm": 19.97502326965332, + "learning_rate": 6.264282613059048e-06, + "loss": 3.271, + "step": 44670 + }, + { + "epoch": 1.3090163334065772, + "grad_norm": 18.71693229675293, + "learning_rate": 6.262752455005794e-06, + "loss": 3.2702, + "step": 44680 + }, + { + "epoch": 1.3093093093093093, + "grad_norm": 18.590652465820312, + "learning_rate": 6.261222170621648e-06, + "loss": 3.28, + "step": 44690 + }, + { + "epoch": 1.3096022852120413, + "grad_norm": 17.753116607666016, + "learning_rate": 6.2596917600597075e-06, + "loss": 3.2482, + "step": 44700 + }, + { + "epoch": 1.3098952611147734, + "grad_norm": 17.342653274536133, + "learning_rate": 6.258161223473077e-06, + "loss": 3.2676, + "step": 44710 + }, + { + "epoch": 1.3101882370175053, + "grad_norm": 17.832468032836914, + "learning_rate": 6.256630561014882e-06, + "loss": 3.2888, + "step": 44720 + }, + { + "epoch": 1.3104812129202372, + "grad_norm": 17.62263298034668, + "learning_rate": 6.255099772838256e-06, + "loss": 3.2656, + "step": 44730 + }, + { + "epoch": 1.3107741888229694, + "grad_norm": 14.441282272338867, + "learning_rate": 6.253568859096343e-06, + "loss": 3.2732, + "step": 44740 + }, + { + "epoch": 1.3110671647257013, + "grad_norm": 17.420230865478516, + "learning_rate": 6.252037819942303e-06, + "loss": 3.2836, + "step": 44750 + }, + { + "epoch": 1.3113601406284334, + "grad_norm": 16.644664764404297, + "learning_rate": 6.250506655529306e-06, + "loss": 3.2737, + "step": 44760 + }, + { + "epoch": 1.3116531165311653, + "grad_norm": 16.368465423583984, + "learning_rate": 6.2489753660105375e-06, + "loss": 3.25, + "step": 44770 + }, + { + "epoch": 1.3119460924338973, + "grad_norm": 14.422226905822754, + "learning_rate": 6.247443951539195e-06, + "loss": 3.2586, + "step": 44780 + }, + { 
+ "epoch": 1.3122390683366292, + "grad_norm": 15.821616172790527, + "learning_rate": 6.245912412268484e-06, + "loss": 3.28, + "step": 44790 + }, + { + "epoch": 1.3125320442393613, + "grad_norm": 16.299142837524414, + "learning_rate": 6.2443807483516296e-06, + "loss": 3.2662, + "step": 44800 + }, + { + "epoch": 1.3128250201420932, + "grad_norm": 19.10049057006836, + "learning_rate": 6.242848959941864e-06, + "loss": 3.2737, + "step": 44810 + }, + { + "epoch": 1.3131179960448254, + "grad_norm": 14.942424774169922, + "learning_rate": 6.241317047192432e-06, + "loss": 3.2668, + "step": 44820 + }, + { + "epoch": 1.3134109719475573, + "grad_norm": 18.253860473632812, + "learning_rate": 6.239785010256595e-06, + "loss": 3.2743, + "step": 44830 + }, + { + "epoch": 1.3137039478502892, + "grad_norm": 17.590394973754883, + "learning_rate": 6.238252849287622e-06, + "loss": 3.2897, + "step": 44840 + }, + { + "epoch": 1.3139969237530214, + "grad_norm": 14.87265396118164, + "learning_rate": 6.2367205644388e-06, + "loss": 3.2746, + "step": 44850 + }, + { + "epoch": 1.3142898996557533, + "grad_norm": 13.381335258483887, + "learning_rate": 6.235188155863421e-06, + "loss": 3.2486, + "step": 44860 + }, + { + "epoch": 1.3145828755584854, + "grad_norm": 17.103015899658203, + "learning_rate": 6.233655623714795e-06, + "loss": 3.2869, + "step": 44870 + }, + { + "epoch": 1.3148758514612173, + "grad_norm": 18.583229064941406, + "learning_rate": 6.232122968146244e-06, + "loss": 3.2714, + "step": 44880 + }, + { + "epoch": 1.3151688273639492, + "grad_norm": 16.955068588256836, + "learning_rate": 6.230590189311099e-06, + "loss": 3.2703, + "step": 44890 + }, + { + "epoch": 1.3154618032666814, + "grad_norm": 17.693748474121094, + "learning_rate": 6.229057287362708e-06, + "loss": 3.2721, + "step": 44900 + }, + { + "epoch": 1.3157547791694133, + "grad_norm": 17.47372817993164, + "learning_rate": 6.227524262454424e-06, + "loss": 3.2992, + "step": 44910 + }, + { + "epoch": 1.3160477550721454, + 
"grad_norm": 17.12322235107422, + "learning_rate": 6.225991114739622e-06, + "loss": 3.2633, + "step": 44920 + }, + { + "epoch": 1.3163407309748774, + "grad_norm": 14.411422729492188, + "learning_rate": 6.224457844371683e-06, + "loss": 3.2617, + "step": 44930 + }, + { + "epoch": 1.3166337068776093, + "grad_norm": 18.859195709228516, + "learning_rate": 6.222924451504001e-06, + "loss": 3.2652, + "step": 44940 + }, + { + "epoch": 1.3169266827803412, + "grad_norm": 17.89043426513672, + "learning_rate": 6.221390936289981e-06, + "loss": 3.2826, + "step": 44950 + }, + { + "epoch": 1.3172196586830733, + "grad_norm": 19.699783325195312, + "learning_rate": 6.2198572988830465e-06, + "loss": 3.2657, + "step": 44960 + }, + { + "epoch": 1.3175126345858053, + "grad_norm": 19.99451446533203, + "learning_rate": 6.218323539436626e-06, + "loss": 3.2846, + "step": 44970 + }, + { + "epoch": 1.3178056104885374, + "grad_norm": 15.056050300598145, + "learning_rate": 6.216789658104163e-06, + "loss": 3.2947, + "step": 44980 + }, + { + "epoch": 1.3180985863912693, + "grad_norm": 19.155832290649414, + "learning_rate": 6.215255655039113e-06, + "loss": 3.2686, + "step": 44990 + }, + { + "epoch": 1.3183915622940012, + "grad_norm": 18.927173614501953, + "learning_rate": 6.213721530394946e-06, + "loss": 3.2768, + "step": 45000 + }, + { + "epoch": 1.3186845381967334, + "grad_norm": 18.559823989868164, + "learning_rate": 6.21218728432514e-06, + "loss": 3.2822, + "step": 45010 + }, + { + "epoch": 1.3189775140994653, + "grad_norm": 15.124588966369629, + "learning_rate": 6.210652916983189e-06, + "loss": 3.2572, + "step": 45020 + }, + { + "epoch": 1.3192704900021974, + "grad_norm": 16.05169105529785, + "learning_rate": 6.209118428522594e-06, + "loss": 3.2599, + "step": 45030 + }, + { + "epoch": 1.3195634659049293, + "grad_norm": 14.44285774230957, + "learning_rate": 6.207583819096874e-06, + "loss": 3.2794, + "step": 45040 + }, + { + "epoch": 1.3198564418076613, + "grad_norm": 18.138643264770508, + 
"learning_rate": 6.2060490888595594e-06, + "loss": 3.2728, + "step": 45050 + }, + { + "epoch": 1.3200322273493006, + "eval_bleu": 0.3365861960821116, + "eval_cap_loss": 0.9471063017845154, + "eval_con_loss": 1.2527868747711182, + "eval_loss": 3.4526801109313965, + "step": 45056 + }, + { + "epoch": 1.3200322273493006, + "eval_bleu": 0.3365861960821116, + "eval_cap_loss": 0.9471063017845154, + "eval_con_loss": 1.2527868747711182, + "eval_loss": 3.4526801109313965, + "eval_runtime": 54.6005, + "eval_samples_per_second": 366.297, + "eval_steps_per_second": 0.366, + "step": 45056 + }, + { + "epoch": 1.3201494177103934, + "grad_norm": 14.915727615356445, + "learning_rate": 6.204514237964187e-06, + "loss": 3.2727, + "step": 45060 + }, + { + "epoch": 1.3204423936131253, + "grad_norm": 16.601613998413086, + "learning_rate": 6.202979266564311e-06, + "loss": 3.2522, + "step": 45070 + }, + { + "epoch": 1.3207353695158572, + "grad_norm": 17.894668579101562, + "learning_rate": 6.201444174813497e-06, + "loss": 3.2615, + "step": 45080 + }, + { + "epoch": 1.3210283454185894, + "grad_norm": 20.968605041503906, + "learning_rate": 6.199908962865322e-06, + "loss": 3.2607, + "step": 45090 + }, + { + "epoch": 1.3213213213213213, + "grad_norm": 17.657482147216797, + "learning_rate": 6.198373630873373e-06, + "loss": 3.2637, + "step": 45100 + }, + { + "epoch": 1.3216142972240532, + "grad_norm": 16.658363342285156, + "learning_rate": 6.196838178991251e-06, + "loss": 3.288, + "step": 45110 + }, + { + "epoch": 1.3219072731267854, + "grad_norm": 15.887762069702148, + "learning_rate": 6.195302607372572e-06, + "loss": 3.2797, + "step": 45120 + }, + { + "epoch": 1.3222002490295173, + "grad_norm": 19.88873863220215, + "learning_rate": 6.193766916170958e-06, + "loss": 3.2649, + "step": 45130 + }, + { + "epoch": 1.3224932249322494, + "grad_norm": 18.584827423095703, + "learning_rate": 6.1922311055400464e-06, + "loss": 3.2583, + "step": 45140 + }, + { + "epoch": 1.3227862008349813, + "grad_norm": 
15.825933456420898, + "learning_rate": 6.190695175633484e-06, + "loss": 3.2809, + "step": 45150 + }, + { + "epoch": 1.3230791767377132, + "grad_norm": 19.398649215698242, + "learning_rate": 6.189159126604936e-06, + "loss": 3.2478, + "step": 45160 + }, + { + "epoch": 1.3233721526404454, + "grad_norm": 14.373698234558105, + "learning_rate": 6.187622958608072e-06, + "loss": 3.2666, + "step": 45170 + }, + { + "epoch": 1.3236651285431773, + "grad_norm": 18.67801856994629, + "learning_rate": 6.1860866717965764e-06, + "loss": 3.2614, + "step": 45180 + }, + { + "epoch": 1.3239581044459094, + "grad_norm": 19.09300422668457, + "learning_rate": 6.184550266324145e-06, + "loss": 3.2894, + "step": 45190 + }, + { + "epoch": 1.3242510803486414, + "grad_norm": 18.213260650634766, + "learning_rate": 6.183013742344489e-06, + "loss": 3.2668, + "step": 45200 + }, + { + "epoch": 1.3245440562513733, + "grad_norm": 18.092185974121094, + "learning_rate": 6.181477100011326e-06, + "loss": 3.2784, + "step": 45210 + }, + { + "epoch": 1.3248370321541052, + "grad_norm": 19.1761531829834, + "learning_rate": 6.179940339478389e-06, + "loss": 3.2794, + "step": 45220 + }, + { + "epoch": 1.3251300080568373, + "grad_norm": 18.452226638793945, + "learning_rate": 6.1784034608994205e-06, + "loss": 3.3003, + "step": 45230 + }, + { + "epoch": 1.3254229839595693, + "grad_norm": 18.06001091003418, + "learning_rate": 6.176866464428177e-06, + "loss": 3.2761, + "step": 45240 + }, + { + "epoch": 1.3257159598623014, + "grad_norm": 16.557048797607422, + "learning_rate": 6.175329350218426e-06, + "loss": 3.2885, + "step": 45250 + }, + { + "epoch": 1.3260089357650333, + "grad_norm": 18.666200637817383, + "learning_rate": 6.173792118423948e-06, + "loss": 3.2927, + "step": 45260 + }, + { + "epoch": 1.3263019116677652, + "grad_norm": 17.380786895751953, + "learning_rate": 6.17225476919853e-06, + "loss": 3.273, + "step": 45270 + }, + { + "epoch": 1.3265948875704974, + "grad_norm": 18.216358184814453, + "learning_rate": 
6.1707173026959785e-06, + "loss": 3.2622, + "step": 45280 + }, + { + "epoch": 1.3268878634732293, + "grad_norm": 18.12831687927246, + "learning_rate": 6.169179719070106e-06, + "loss": 3.2985, + "step": 45290 + }, + { + "epoch": 1.3271808393759614, + "grad_norm": 16.825437545776367, + "learning_rate": 6.16764201847474e-06, + "loss": 3.2696, + "step": 45300 + }, + { + "epoch": 1.3274738152786933, + "grad_norm": 17.188793182373047, + "learning_rate": 6.166104201063716e-06, + "loss": 3.2839, + "step": 45310 + }, + { + "epoch": 1.3277667911814253, + "grad_norm": 14.799108505249023, + "learning_rate": 6.164566266990884e-06, + "loss": 3.2764, + "step": 45320 + }, + { + "epoch": 1.3280597670841574, + "grad_norm": 14.904187202453613, + "learning_rate": 6.16302821641011e-06, + "loss": 3.2618, + "step": 45330 + }, + { + "epoch": 1.3283527429868893, + "grad_norm": 19.2086181640625, + "learning_rate": 6.161490049475261e-06, + "loss": 3.2848, + "step": 45340 + }, + { + "epoch": 1.3286457188896215, + "grad_norm": 17.934778213500977, + "learning_rate": 6.159951766340223e-06, + "loss": 3.2679, + "step": 45350 + }, + { + "epoch": 1.3289386947923534, + "grad_norm": 16.16594123840332, + "learning_rate": 6.158413367158892e-06, + "loss": 3.2899, + "step": 45360 + }, + { + "epoch": 1.3292316706950853, + "grad_norm": 17.06792449951172, + "learning_rate": 6.156874852085177e-06, + "loss": 3.2465, + "step": 45370 + }, + { + "epoch": 1.3295246465978172, + "grad_norm": 15.126568794250488, + "learning_rate": 6.155336221272997e-06, + "loss": 3.2827, + "step": 45380 + }, + { + "epoch": 1.3298176225005494, + "grad_norm": 19.611886978149414, + "learning_rate": 6.153797474876282e-06, + "loss": 3.2845, + "step": 45390 + }, + { + "epoch": 1.3301105984032813, + "grad_norm": 16.178600311279297, + "learning_rate": 6.152258613048974e-06, + "loss": 3.2552, + "step": 45400 + }, + { + "epoch": 1.3304035743060134, + "grad_norm": 18.29557228088379, + "learning_rate": 6.150719635945029e-06, + "loss": 3.2678, + 
"step": 45410 + }, + { + "epoch": 1.3306965502087453, + "grad_norm": 19.36787223815918, + "learning_rate": 6.149180543718412e-06, + "loss": 3.2775, + "step": 45420 + }, + { + "epoch": 1.3309895261114773, + "grad_norm": 17.259260177612305, + "learning_rate": 6.147641336523098e-06, + "loss": 3.2635, + "step": 45430 + }, + { + "epoch": 1.3312825020142094, + "grad_norm": 14.494851112365723, + "learning_rate": 6.146102014513078e-06, + "loss": 3.2695, + "step": 45440 + }, + { + "epoch": 1.3315754779169413, + "grad_norm": 17.285762786865234, + "learning_rate": 6.14456257784235e-06, + "loss": 3.2273, + "step": 45450 + }, + { + "epoch": 1.3318684538196734, + "grad_norm": 18.811172485351562, + "learning_rate": 6.143023026664928e-06, + "loss": 3.2662, + "step": 45460 + }, + { + "epoch": 1.3321614297224054, + "grad_norm": 16.23650360107422, + "learning_rate": 6.141483361134832e-06, + "loss": 3.2644, + "step": 45470 + }, + { + "epoch": 1.3324544056251373, + "grad_norm": 15.014397621154785, + "learning_rate": 6.139943581406098e-06, + "loss": 3.2687, + "step": 45480 + }, + { + "epoch": 1.3327473815278692, + "grad_norm": 16.95566749572754, + "learning_rate": 6.138403687632773e-06, + "loss": 3.2723, + "step": 45490 + }, + { + "epoch": 1.3330403574306013, + "grad_norm": 16.311534881591797, + "learning_rate": 6.136863679968913e-06, + "loss": 3.2766, + "step": 45500 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 18.592580795288086, + "learning_rate": 6.135323558568587e-06, + "loss": 3.271, + "step": 45510 + }, + { + "epoch": 1.3336263092360654, + "grad_norm": 19.162569046020508, + "learning_rate": 6.133783323585871e-06, + "loss": 3.2437, + "step": 45520 + }, + { + "epoch": 1.3339192851387973, + "grad_norm": 20.575923919677734, + "learning_rate": 6.132242975174863e-06, + "loss": 3.2698, + "step": 45530 + }, + { + "epoch": 1.3342122610415292, + "grad_norm": 16.93717384338379, + "learning_rate": 6.1307025134896635e-06, + "loss": 3.2641, + "step": 45540 + }, + { + "epoch": 
1.3345052369442614, + "grad_norm": 14.525680541992188, + "learning_rate": 6.129161938684385e-06, + "loss": 3.2768, + "step": 45550 + }, + { + "epoch": 1.3347982128469933, + "grad_norm": 14.889464378356934, + "learning_rate": 6.127621250913152e-06, + "loss": 3.2597, + "step": 45560 + }, + { + "epoch": 1.335032593569179, + "eval_bleu": 0.3373075945173827, + "eval_cap_loss": 0.946201741695404, + "eval_con_loss": 1.2545839548110962, + "eval_loss": 3.455369710922241, + "step": 45568 + }, + { + "epoch": 1.335032593569179, + "eval_bleu": 0.3373075945173827, + "eval_cap_loss": 0.946201741695404, + "eval_con_loss": 1.2545839548110962, + "eval_loss": 3.455369710922241, + "eval_runtime": 57.9604, + "eval_samples_per_second": 345.063, + "eval_steps_per_second": 0.345, + "step": 45568 + }, + { + "epoch": 1.3350911887497254, + "grad_norm": 17.270280838012695, + "learning_rate": 6.1260804503301045e-06, + "loss": 3.2738, + "step": 45570 + }, + { + "epoch": 1.3353841646524574, + "grad_norm": 15.034247398376465, + "learning_rate": 6.1245395370893875e-06, + "loss": 3.277, + "step": 45580 + }, + { + "epoch": 1.3356771405551893, + "grad_norm": 17.220420837402344, + "learning_rate": 6.122998511345163e-06, + "loss": 3.2727, + "step": 45590 + }, + { + "epoch": 1.3359701164579214, + "grad_norm": 18.578264236450195, + "learning_rate": 6.1214573732515985e-06, + "loss": 3.2832, + "step": 45600 + }, + { + "epoch": 1.3362630923606533, + "grad_norm": 19.620573043823242, + "learning_rate": 6.119916122962878e-06, + "loss": 3.2765, + "step": 45610 + }, + { + "epoch": 1.3365560682633855, + "grad_norm": 19.3465576171875, + "learning_rate": 6.118374760633193e-06, + "loss": 3.2664, + "step": 45620 + }, + { + "epoch": 1.3368490441661174, + "grad_norm": 16.6912841796875, + "learning_rate": 6.116833286416749e-06, + "loss": 3.248, + "step": 45630 + }, + { + "epoch": 1.3371420200688493, + "grad_norm": 19.254791259765625, + "learning_rate": 6.11529170046776e-06, + "loss": 3.2797, + "step": 45640 + }, + { + 
"epoch": 1.3374349959715812, + "grad_norm": 16.222034454345703, + "learning_rate": 6.113750002940454e-06, + "loss": 3.2603, + "step": 45650 + }, + { + "epoch": 1.3377279718743134, + "grad_norm": 16.62348747253418, + "learning_rate": 6.112208193989068e-06, + "loss": 3.2702, + "step": 45660 + }, + { + "epoch": 1.3380209477770453, + "grad_norm": 18.31853675842285, + "learning_rate": 6.110666273767849e-06, + "loss": 3.269, + "step": 45670 + }, + { + "epoch": 1.3383139236797774, + "grad_norm": 16.70162582397461, + "learning_rate": 6.1091242424310595e-06, + "loss": 3.2967, + "step": 45680 + }, + { + "epoch": 1.3386068995825093, + "grad_norm": 19.051942825317383, + "learning_rate": 6.107582100132969e-06, + "loss": 3.2644, + "step": 45690 + }, + { + "epoch": 1.3388998754852413, + "grad_norm": 19.484615325927734, + "learning_rate": 6.106039847027861e-06, + "loss": 3.2758, + "step": 45700 + }, + { + "epoch": 1.3391928513879734, + "grad_norm": 17.382007598876953, + "learning_rate": 6.10449748327003e-06, + "loss": 3.2603, + "step": 45710 + }, + { + "epoch": 1.3394858272907053, + "grad_norm": 19.198501586914062, + "learning_rate": 6.102955009013777e-06, + "loss": 3.2693, + "step": 45720 + }, + { + "epoch": 1.3397788031934375, + "grad_norm": 16.56517791748047, + "learning_rate": 6.101412424413419e-06, + "loss": 3.2446, + "step": 45730 + }, + { + "epoch": 1.3400717790961694, + "grad_norm": 15.090256690979004, + "learning_rate": 6.099869729623282e-06, + "loss": 3.2671, + "step": 45740 + }, + { + "epoch": 1.3403647549989013, + "grad_norm": 18.98227882385254, + "learning_rate": 6.098326924797705e-06, + "loss": 3.2703, + "step": 45750 + }, + { + "epoch": 1.3406577309016332, + "grad_norm": 14.867993354797363, + "learning_rate": 6.096784010091035e-06, + "loss": 3.2859, + "step": 45760 + }, + { + "epoch": 1.3409507068043653, + "grad_norm": 17.525196075439453, + "learning_rate": 6.095240985657631e-06, + "loss": 3.2927, + "step": 45770 + }, + { + "epoch": 1.3412436827070973, + 
"grad_norm": 16.832483291625977, + "learning_rate": 6.093697851651865e-06, + "loss": 3.2434, + "step": 45780 + }, + { + "epoch": 1.3415366586098294, + "grad_norm": 15.795870780944824, + "learning_rate": 6.092154608228118e-06, + "loss": 3.2805, + "step": 45790 + }, + { + "epoch": 1.3418296345125613, + "grad_norm": 18.566272735595703, + "learning_rate": 6.090611255540782e-06, + "loss": 3.2807, + "step": 45800 + }, + { + "epoch": 1.3421226104152932, + "grad_norm": 16.926944732666016, + "learning_rate": 6.089067793744258e-06, + "loss": 3.2771, + "step": 45810 + }, + { + "epoch": 1.3424155863180254, + "grad_norm": 15.022124290466309, + "learning_rate": 6.087524222992965e-06, + "loss": 3.2739, + "step": 45820 + }, + { + "epoch": 1.3427085622207573, + "grad_norm": 19.525300979614258, + "learning_rate": 6.085980543441325e-06, + "loss": 3.2642, + "step": 45830 + }, + { + "epoch": 1.3430015381234894, + "grad_norm": 17.683073043823242, + "learning_rate": 6.084436755243774e-06, + "loss": 3.2608, + "step": 45840 + }, + { + "epoch": 1.3432945140262214, + "grad_norm": 17.718639373779297, + "learning_rate": 6.082892858554758e-06, + "loss": 3.2559, + "step": 45850 + }, + { + "epoch": 1.3435874899289533, + "grad_norm": 17.860591888427734, + "learning_rate": 6.081348853528738e-06, + "loss": 3.2511, + "step": 45860 + }, + { + "epoch": 1.3438804658316854, + "grad_norm": 20.60076332092285, + "learning_rate": 6.079804740320181e-06, + "loss": 3.2647, + "step": 45870 + }, + { + "epoch": 1.3441734417344173, + "grad_norm": 17.043519973754883, + "learning_rate": 6.078260519083565e-06, + "loss": 3.2624, + "step": 45880 + }, + { + "epoch": 1.3444664176371495, + "grad_norm": 15.955348014831543, + "learning_rate": 6.0767161899733806e-06, + "loss": 3.273, + "step": 45890 + }, + { + "epoch": 1.3447593935398814, + "grad_norm": 17.927343368530273, + "learning_rate": 6.075171753144131e-06, + "loss": 3.2679, + "step": 45900 + }, + { + "epoch": 1.3450523694426133, + "grad_norm": 16.65511131286621, + 
"learning_rate": 6.073627208750326e-06, + "loss": 3.2657, + "step": 45910 + }, + { + "epoch": 1.3453453453453452, + "grad_norm": 18.020259857177734, + "learning_rate": 6.072082556946487e-06, + "loss": 3.2598, + "step": 45920 + }, + { + "epoch": 1.3456383212480774, + "grad_norm": 16.844703674316406, + "learning_rate": 6.07053779788715e-06, + "loss": 3.2731, + "step": 45930 + }, + { + "epoch": 1.3459312971508093, + "grad_norm": 17.340831756591797, + "learning_rate": 6.068992931726856e-06, + "loss": 3.253, + "step": 45940 + }, + { + "epoch": 1.3462242730535414, + "grad_norm": 16.363014221191406, + "learning_rate": 6.067447958620163e-06, + "loss": 3.2666, + "step": 45950 + }, + { + "epoch": 1.3465172489562733, + "grad_norm": 17.576168060302734, + "learning_rate": 6.065902878721634e-06, + "loss": 3.2467, + "step": 45960 + }, + { + "epoch": 1.3468102248590053, + "grad_norm": 17.06229019165039, + "learning_rate": 6.064512215633696e-06, + "loss": 3.2462, + "step": 45970 + }, + { + "epoch": 1.3471032007617374, + "grad_norm": 17.955429077148438, + "learning_rate": 6.0629669332565456e-06, + "loss": 3.2514, + "step": 45980 + }, + { + "epoch": 1.3473961766644693, + "grad_norm": 18.941984176635742, + "learning_rate": 6.0614215445358605e-06, + "loss": 3.2865, + "step": 45990 + }, + { + "epoch": 1.3476891525672015, + "grad_norm": 16.866483688354492, + "learning_rate": 6.059876049626245e-06, + "loss": 3.2585, + "step": 46000 + }, + { + "epoch": 1.3479821284699334, + "grad_norm": 18.648269653320312, + "learning_rate": 6.05833044868232e-06, + "loss": 3.2654, + "step": 46010 + }, + { + "epoch": 1.3482751043726653, + "grad_norm": 16.27410888671875, + "learning_rate": 6.056784741858713e-06, + "loss": 3.2448, + "step": 46020 + }, + { + "epoch": 1.3485680802753974, + "grad_norm": 17.450347900390625, + "learning_rate": 6.055238929310063e-06, + "loss": 3.2534, + "step": 46030 + }, + { + "epoch": 1.3488610561781293, + "grad_norm": 17.198875427246094, + "learning_rate": 6.053693011191017e-06, 
+ "loss": 3.2442, + "step": 46040 + }, + { + "epoch": 1.3491540320808615, + "grad_norm": 19.818012237548828, + "learning_rate": 6.052146987656237e-06, + "loss": 3.2478, + "step": 46050 + }, + { + "epoch": 1.3494470079835934, + "grad_norm": 17.539918899536133, + "learning_rate": 6.0506008588603945e-06, + "loss": 3.2536, + "step": 46060 + }, + { + "epoch": 1.3497399838863253, + "grad_norm": 15.794025421142578, + "learning_rate": 6.049054624958169e-06, + "loss": 3.2441, + "step": 46070 + }, + { + "epoch": 1.3500329597890572, + "grad_norm": 17.185012817382812, + "learning_rate": 6.047508286104251e-06, + "loss": 3.2703, + "step": 46080 + }, + { + "epoch": 1.3500329597890572, + "eval_bleu": 0.3370649711407426, + "eval_cap_loss": 0.9455769658088684, + "eval_con_loss": 1.2522292137145996, + "eval_loss": 3.450035572052002, + "step": 46080 + }, + { + "epoch": 1.3500329597890572, + "eval_bleu": 0.3370649711407426, + "eval_cap_loss": 0.9455769658088684, + "eval_con_loss": 1.2522292137145996, + "eval_loss": 3.450035572052002, + "eval_runtime": 57.3313, + "eval_samples_per_second": 348.85, + "eval_steps_per_second": 0.349, + "step": 46080 + }, + { + "epoch": 1.3503259356917894, + "grad_norm": 19.189699172973633, + "learning_rate": 6.045961842453343e-06, + "loss": 3.268, + "step": 46090 + }, + { + "epoch": 1.3506189115945213, + "grad_norm": 19.493680953979492, + "learning_rate": 6.0444152941601586e-06, + "loss": 3.2679, + "step": 46100 + }, + { + "epoch": 1.3509118874972534, + "grad_norm": 17.398571014404297, + "learning_rate": 6.042868641379421e-06, + "loss": 3.268, + "step": 46110 + }, + { + "epoch": 1.3512048633999854, + "grad_norm": 18.889917373657227, + "learning_rate": 6.041321884265863e-06, + "loss": 3.2616, + "step": 46120 + }, + { + "epoch": 1.3514978393027173, + "grad_norm": 19.15239906311035, + "learning_rate": 6.039775022974227e-06, + "loss": 3.2717, + "step": 46130 + }, + { + "epoch": 1.3517908152054494, + "grad_norm": 17.761823654174805, + "learning_rate": 
6.03822805765927e-06, + "loss": 3.2622, + "step": 46140 + }, + { + "epoch": 1.3520837911081813, + "grad_norm": 14.950602531433105, + "learning_rate": 6.036680988475756e-06, + "loss": 3.2501, + "step": 46150 + }, + { + "epoch": 1.3523767670109135, + "grad_norm": 17.454700469970703, + "learning_rate": 6.035133815578459e-06, + "loss": 3.2634, + "step": 46160 + }, + { + "epoch": 1.3526697429136454, + "grad_norm": 18.184253692626953, + "learning_rate": 6.033586539122164e-06, + "loss": 3.271, + "step": 46170 + }, + { + "epoch": 1.3529627188163773, + "grad_norm": 13.572525024414062, + "learning_rate": 6.032039159261669e-06, + "loss": 3.2629, + "step": 46180 + }, + { + "epoch": 1.3532556947191092, + "grad_norm": 15.85556411743164, + "learning_rate": 6.030491676151778e-06, + "loss": 3.2456, + "step": 46190 + }, + { + "epoch": 1.3535486706218414, + "grad_norm": 16.957103729248047, + "learning_rate": 6.028944089947309e-06, + "loss": 3.2628, + "step": 46200 + }, + { + "epoch": 1.3538416465245733, + "grad_norm": 18.379690170288086, + "learning_rate": 6.027396400803089e-06, + "loss": 3.2879, + "step": 46210 + }, + { + "epoch": 1.3541346224273054, + "grad_norm": 16.762252807617188, + "learning_rate": 6.0258486088739535e-06, + "loss": 3.2745, + "step": 46220 + }, + { + "epoch": 1.3544275983300373, + "grad_norm": 17.36819839477539, + "learning_rate": 6.0243007143147506e-06, + "loss": 3.2879, + "step": 46230 + }, + { + "epoch": 1.3547205742327693, + "grad_norm": 15.146291732788086, + "learning_rate": 6.022752717280339e-06, + "loss": 3.2728, + "step": 46240 + }, + { + "epoch": 1.3550135501355014, + "grad_norm": 18.52228355407715, + "learning_rate": 6.021204617925584e-06, + "loss": 3.2678, + "step": 46250 + }, + { + "epoch": 1.3553065260382333, + "grad_norm": 16.747394561767578, + "learning_rate": 6.019656416405368e-06, + "loss": 3.2667, + "step": 46260 + }, + { + "epoch": 1.3555995019409655, + "grad_norm": 16.15129852294922, + "learning_rate": 6.0181081128745766e-06, + "loss": 
3.2586, + "step": 46270 + }, + { + "epoch": 1.3558924778436974, + "grad_norm": 20.302988052368164, + "learning_rate": 6.0165597074881084e-06, + "loss": 3.2358, + "step": 46280 + }, + { + "epoch": 1.3561854537464293, + "grad_norm": 15.913785934448242, + "learning_rate": 6.015011200400872e-06, + "loss": 3.2227, + "step": 46290 + }, + { + "epoch": 1.3564784296491614, + "grad_norm": 17.155412673950195, + "learning_rate": 6.013462591767787e-06, + "loss": 3.2346, + "step": 46300 + }, + { + "epoch": 1.3567714055518934, + "grad_norm": 16.087961196899414, + "learning_rate": 6.011913881743783e-06, + "loss": 3.2427, + "step": 46310 + }, + { + "epoch": 1.3570643814546255, + "grad_norm": 17.84566307067871, + "learning_rate": 6.010365070483798e-06, + "loss": 3.269, + "step": 46320 + }, + { + "epoch": 1.3573573573573574, + "grad_norm": 18.80278205871582, + "learning_rate": 6.008816158142782e-06, + "loss": 3.2528, + "step": 46330 + }, + { + "epoch": 1.3576503332600893, + "grad_norm": 15.227240562438965, + "learning_rate": 6.007267144875694e-06, + "loss": 3.2542, + "step": 46340 + }, + { + "epoch": 1.3579433091628212, + "grad_norm": 16.85003089904785, + "learning_rate": 6.005718030837505e-06, + "loss": 3.2533, + "step": 46350 + }, + { + "epoch": 1.3582362850655534, + "grad_norm": 18.518863677978516, + "learning_rate": 6.004168816183193e-06, + "loss": 3.2669, + "step": 46360 + }, + { + "epoch": 1.3585292609682853, + "grad_norm": 16.297365188598633, + "learning_rate": 6.002619501067749e-06, + "loss": 3.2426, + "step": 46370 + }, + { + "epoch": 1.3588222368710174, + "grad_norm": 19.521209716796875, + "learning_rate": 6.001070085646172e-06, + "loss": 3.26, + "step": 46380 + }, + { + "epoch": 1.3591152127737494, + "grad_norm": 15.491192817687988, + "learning_rate": 5.999520570073471e-06, + "loss": 3.2608, + "step": 46390 + }, + { + "epoch": 1.3594081886764813, + "grad_norm": 19.52519416809082, + "learning_rate": 5.997970954504669e-06, + "loss": 3.2948, + "step": 46400 + }, + { + 
"epoch": 1.3597011645792134, + "grad_norm": 16.736650466918945, + "learning_rate": 5.996421239094793e-06, + "loss": 3.2668, + "step": 46410 + }, + { + "epoch": 1.3599941404819453, + "grad_norm": 17.15010643005371, + "learning_rate": 5.994871423998882e-06, + "loss": 3.2698, + "step": 46420 + }, + { + "epoch": 1.3602871163846775, + "grad_norm": 17.47751235961914, + "learning_rate": 5.99332150937199e-06, + "loss": 3.27, + "step": 46430 + }, + { + "epoch": 1.3605800922874094, + "grad_norm": 18.76993179321289, + "learning_rate": 5.991771495369173e-06, + "loss": 3.2763, + "step": 46440 + }, + { + "epoch": 1.3608730681901413, + "grad_norm": 15.820785522460938, + "learning_rate": 5.990221382145502e-06, + "loss": 3.2643, + "step": 46450 + }, + { + "epoch": 1.3611660440928732, + "grad_norm": 15.582209587097168, + "learning_rate": 5.988671169856056e-06, + "loss": 3.2751, + "step": 46460 + }, + { + "epoch": 1.3614590199956054, + "grad_norm": 20.00847816467285, + "learning_rate": 5.987120858655926e-06, + "loss": 3.2774, + "step": 46470 + }, + { + "epoch": 1.3617519958983373, + "grad_norm": 16.56260108947754, + "learning_rate": 5.985570448700209e-06, + "loss": 3.2732, + "step": 46480 + }, + { + "epoch": 1.3620449718010694, + "grad_norm": 16.4959659576416, + "learning_rate": 5.984019940144018e-06, + "loss": 3.2407, + "step": 46490 + }, + { + "epoch": 1.3623379477038013, + "grad_norm": 18.351438522338867, + "learning_rate": 5.982469333142468e-06, + "loss": 3.2616, + "step": 46500 + }, + { + "epoch": 1.3626309236065333, + "grad_norm": 18.39299201965332, + "learning_rate": 5.980918627850692e-06, + "loss": 3.2644, + "step": 46510 + }, + { + "epoch": 1.3629238995092654, + "grad_norm": 16.476900100708008, + "learning_rate": 5.979367824423825e-06, + "loss": 3.2529, + "step": 46520 + }, + { + "epoch": 1.3632168754119973, + "grad_norm": 16.342748641967773, + "learning_rate": 5.977816923017018e-06, + "loss": 3.258, + "step": 46530 + }, + { + "epoch": 1.3635098513147295, + "grad_norm": 
16.63872718811035, + "learning_rate": 5.976265923785428e-06, + "loss": 3.2596, + "step": 46540 + }, + { + "epoch": 1.3638028272174614, + "grad_norm": 17.747220993041992, + "learning_rate": 5.974714826884226e-06, + "loss": 3.2653, + "step": 46550 + }, + { + "epoch": 1.3640958031201933, + "grad_norm": 17.34913444519043, + "learning_rate": 5.973163632468588e-06, + "loss": 3.2583, + "step": 46560 + }, + { + "epoch": 1.3643887790229254, + "grad_norm": 15.247611999511719, + "learning_rate": 5.971612340693702e-06, + "loss": 3.2434, + "step": 46570 + }, + { + "epoch": 1.3646817549256574, + "grad_norm": 15.57809829711914, + "learning_rate": 5.9700609517147655e-06, + "loss": 3.2671, + "step": 46580 + }, + { + "epoch": 1.3649747308283895, + "grad_norm": 22.038612365722656, + "learning_rate": 5.9685094656869865e-06, + "loss": 3.2593, + "step": 46590 + }, + { + "epoch": 1.3650333260089358, + "eval_bleu": 0.3368766601057798, + "eval_cap_loss": 0.944898247718811, + "eval_con_loss": 1.2468113899230957, + "eval_loss": 3.438520908355713, + "step": 46592 + }, + { + "epoch": 1.3650333260089358, + "eval_bleu": 0.3368766601057798, + "eval_cap_loss": 0.944898247718811, + "eval_con_loss": 1.2468113899230957, + "eval_loss": 3.438520908355713, + "eval_runtime": 60.4786, + "eval_samples_per_second": 330.695, + "eval_steps_per_second": 0.331, + "step": 46592 + }, + { + "epoch": 1.3652677067311214, + "grad_norm": 16.2918701171875, + "learning_rate": 5.9669578827655815e-06, + "loss": 3.2531, + "step": 46600 + }, + { + "epoch": 1.3655606826338533, + "grad_norm": 19.84616470336914, + "learning_rate": 5.965406203105777e-06, + "loss": 3.2733, + "step": 46610 + }, + { + "epoch": 1.3658536585365852, + "grad_norm": 18.334409713745117, + "learning_rate": 5.963854426862808e-06, + "loss": 3.2693, + "step": 46620 + }, + { + "epoch": 1.3661466344393174, + "grad_norm": 17.985382080078125, + "learning_rate": 5.9623025541919235e-06, + "loss": 3.2729, + "step": 46630 + }, + { + "epoch": 1.3664396103420493, + 
"grad_norm": 15.605181694030762, + "learning_rate": 5.9607505852483775e-06, + "loss": 3.2706, + "step": 46640 + }, + { + "epoch": 1.3667325862447814, + "grad_norm": 17.74521827697754, + "learning_rate": 5.959198520187435e-06, + "loss": 3.2423, + "step": 46650 + }, + { + "epoch": 1.3670255621475134, + "grad_norm": 15.489283561706543, + "learning_rate": 5.957646359164371e-06, + "loss": 3.2477, + "step": 46660 + }, + { + "epoch": 1.3673185380502453, + "grad_norm": 16.297531127929688, + "learning_rate": 5.956094102334471e-06, + "loss": 3.2563, + "step": 46670 + }, + { + "epoch": 1.3676115139529774, + "grad_norm": 16.725669860839844, + "learning_rate": 5.9545417498530275e-06, + "loss": 3.2728, + "step": 46680 + }, + { + "epoch": 1.3679044898557093, + "grad_norm": 15.398550987243652, + "learning_rate": 5.952989301875346e-06, + "loss": 3.2467, + "step": 46690 + }, + { + "epoch": 1.3681974657584415, + "grad_norm": 20.37610626220703, + "learning_rate": 5.951436758556738e-06, + "loss": 3.2574, + "step": 46700 + }, + { + "epoch": 1.3684904416611734, + "grad_norm": 18.70783805847168, + "learning_rate": 5.949884120052527e-06, + "loss": 3.2613, + "step": 46710 + }, + { + "epoch": 1.3687834175639053, + "grad_norm": 17.446378707885742, + "learning_rate": 5.948331386518045e-06, + "loss": 3.2669, + "step": 46720 + }, + { + "epoch": 1.3690763934666372, + "grad_norm": 18.407329559326172, + "learning_rate": 5.946778558108634e-06, + "loss": 3.2539, + "step": 46730 + }, + { + "epoch": 1.3693693693693694, + "grad_norm": 19.24036407470703, + "learning_rate": 5.945225634979647e-06, + "loss": 3.2561, + "step": 46740 + }, + { + "epoch": 1.3696623452721013, + "grad_norm": 20.732839584350586, + "learning_rate": 5.94367261728644e-06, + "loss": 3.2545, + "step": 46750 + }, + { + "epoch": 1.3699553211748334, + "grad_norm": 15.241096496582031, + "learning_rate": 5.942119505184389e-06, + "loss": 3.2684, + "step": 46760 + }, + { + "epoch": 1.3702482970775653, + "grad_norm": 15.862488746643066, + 
"learning_rate": 5.940566298828871e-06, + "loss": 3.2528, + "step": 46770 + }, + { + "epoch": 1.3705412729802973, + "grad_norm": 17.419109344482422, + "learning_rate": 5.939012998375274e-06, + "loss": 3.2481, + "step": 46780 + }, + { + "epoch": 1.3708342488830294, + "grad_norm": 20.877321243286133, + "learning_rate": 5.937459603978997e-06, + "loss": 3.2513, + "step": 46790 + }, + { + "epoch": 1.3711272247857613, + "grad_norm": 16.838497161865234, + "learning_rate": 5.935906115795449e-06, + "loss": 3.2505, + "step": 46800 + }, + { + "epoch": 1.3714202006884935, + "grad_norm": 16.685522079467773, + "learning_rate": 5.934352533980047e-06, + "loss": 3.2345, + "step": 46810 + }, + { + "epoch": 1.3717131765912254, + "grad_norm": 17.679738998413086, + "learning_rate": 5.932798858688217e-06, + "loss": 3.2641, + "step": 46820 + }, + { + "epoch": 1.3720061524939573, + "grad_norm": 17.67915916442871, + "learning_rate": 5.9312450900753935e-06, + "loss": 3.2623, + "step": 46830 + }, + { + "epoch": 1.3722991283966894, + "grad_norm": 19.77245330810547, + "learning_rate": 5.929691228297027e-06, + "loss": 3.2381, + "step": 46840 + }, + { + "epoch": 1.3725921042994214, + "grad_norm": 20.051511764526367, + "learning_rate": 5.928137273508566e-06, + "loss": 3.2648, + "step": 46850 + }, + { + "epoch": 1.3728850802021535, + "grad_norm": 16.422840118408203, + "learning_rate": 5.9265832258654785e-06, + "loss": 3.2463, + "step": 46860 + }, + { + "epoch": 1.3731780561048854, + "grad_norm": 19.00783348083496, + "learning_rate": 5.925029085523236e-06, + "loss": 3.2537, + "step": 46870 + }, + { + "epoch": 1.3734710320076173, + "grad_norm": 16.26688003540039, + "learning_rate": 5.923474852637322e-06, + "loss": 3.285, + "step": 46880 + }, + { + "epoch": 1.3737640079103492, + "grad_norm": 14.914787292480469, + "learning_rate": 5.921920527363228e-06, + "loss": 3.2705, + "step": 46890 + }, + { + "epoch": 1.3740569838130814, + "grad_norm": 15.451395988464355, + "learning_rate": 5.920366109856454e-06, 
+ "loss": 3.2356, + "step": 46900 + }, + { + "epoch": 1.3743499597158133, + "grad_norm": 20.318910598754883, + "learning_rate": 5.918811600272511e-06, + "loss": 3.2572, + "step": 46910 + }, + { + "epoch": 1.3746429356185454, + "grad_norm": 16.904691696166992, + "learning_rate": 5.917256998766919e-06, + "loss": 3.2592, + "step": 46920 + }, + { + "epoch": 1.3749359115212774, + "grad_norm": 17.342164993286133, + "learning_rate": 5.915702305495207e-06, + "loss": 3.2568, + "step": 46930 + }, + { + "epoch": 1.3752288874240093, + "grad_norm": 19.799659729003906, + "learning_rate": 5.914147520612912e-06, + "loss": 3.255, + "step": 46940 + }, + { + "epoch": 1.3755218633267414, + "grad_norm": 17.29310417175293, + "learning_rate": 5.912592644275579e-06, + "loss": 3.2564, + "step": 46950 + }, + { + "epoch": 1.3758148392294733, + "grad_norm": 16.52950096130371, + "learning_rate": 5.911037676638767e-06, + "loss": 3.2606, + "step": 46960 + }, + { + "epoch": 1.3761078151322055, + "grad_norm": 14.681723594665527, + "learning_rate": 5.909482617858041e-06, + "loss": 3.244, + "step": 46970 + }, + { + "epoch": 1.3764007910349374, + "grad_norm": 19.213260650634766, + "learning_rate": 5.907927468088976e-06, + "loss": 3.2645, + "step": 46980 + }, + { + "epoch": 1.3766937669376693, + "grad_norm": 16.84784507751465, + "learning_rate": 5.9063722274871516e-06, + "loss": 3.2409, + "step": 46990 + }, + { + "epoch": 1.3769867428404015, + "grad_norm": 15.76114559173584, + "learning_rate": 5.904816896208164e-06, + "loss": 3.2531, + "step": 47000 + }, + { + "epoch": 1.3772797187431334, + "grad_norm": 19.568464279174805, + "learning_rate": 5.903261474407616e-06, + "loss": 3.2697, + "step": 47010 + }, + { + "epoch": 1.3775726946458655, + "grad_norm": 15.766106605529785, + "learning_rate": 5.901705962241115e-06, + "loss": 3.2604, + "step": 47020 + }, + { + "epoch": 1.3778656705485974, + "grad_norm": 19.51219940185547, + "learning_rate": 5.900150359864281e-06, + "loss": 3.2687, + "step": 47030 + }, + { 
+ "epoch": 1.3781586464513293, + "grad_norm": 16.14290428161621, + "learning_rate": 5.898594667432746e-06, + "loss": 3.2612, + "step": 47040 + }, + { + "epoch": 1.3784516223540613, + "grad_norm": 19.221294403076172, + "learning_rate": 5.8970388851021445e-06, + "loss": 3.2444, + "step": 47050 + }, + { + "epoch": 1.3787445982567934, + "grad_norm": 17.762252807617188, + "learning_rate": 5.895483013028125e-06, + "loss": 3.2641, + "step": 47060 + }, + { + "epoch": 1.3790375741595253, + "grad_norm": 16.967998504638672, + "learning_rate": 5.893927051366341e-06, + "loss": 3.2289, + "step": 47070 + }, + { + "epoch": 1.3793305500622575, + "grad_norm": 15.990864753723145, + "learning_rate": 5.8923710002724595e-06, + "loss": 3.2559, + "step": 47080 + }, + { + "epoch": 1.3796235259649894, + "grad_norm": 17.078466415405273, + "learning_rate": 5.8908148599021555e-06, + "loss": 3.2559, + "step": 47090 + }, + { + "epoch": 1.3799165018677213, + "grad_norm": 15.33562183380127, + "learning_rate": 5.889258630411109e-06, + "loss": 3.239, + "step": 47100 + }, + { + "epoch": 1.3800336922288141, + "eval_bleu": 0.337788059478279, + "eval_cap_loss": 0.9441919922828674, + "eval_con_loss": 1.2473801374435425, + "eval_loss": 3.4389522075653076, + "step": 47104 + }, + { + "epoch": 1.3800336922288141, + "eval_bleu": 0.337788059478279, + "eval_cap_loss": 0.9441919922828674, + "eval_con_loss": 1.2473801374435425, + "eval_loss": 3.4389522075653076, + "eval_runtime": 56.2519, + "eval_samples_per_second": 355.544, + "eval_steps_per_second": 0.356, + "step": 47104 + }, + { + "epoch": 1.3802094777704534, + "grad_norm": 17.772226333618164, + "learning_rate": 5.887702311955011e-06, + "loss": 3.2265, + "step": 47110 + }, + { + "epoch": 1.3805024536731854, + "grad_norm": 14.79086971282959, + "learning_rate": 5.886145904689565e-06, + "loss": 3.2649, + "step": 47120 + }, + { + "epoch": 1.3807954295759175, + "grad_norm": 20.172073364257812, + "learning_rate": 5.884589408770478e-06, + "loss": 3.2365, + "step": 
47130 + }, + { + "epoch": 1.3810884054786494, + "grad_norm": 16.86201286315918, + "learning_rate": 5.883032824353469e-06, + "loss": 3.2711, + "step": 47140 + }, + { + "epoch": 1.3813813813813813, + "grad_norm": 18.801847457885742, + "learning_rate": 5.881476151594264e-06, + "loss": 3.2667, + "step": 47150 + }, + { + "epoch": 1.3816743572841133, + "grad_norm": 18.41217041015625, + "learning_rate": 5.879919390648601e-06, + "loss": 3.2717, + "step": 47160 + }, + { + "epoch": 1.3819673331868454, + "grad_norm": 17.884252548217773, + "learning_rate": 5.878362541672222e-06, + "loss": 3.2789, + "step": 47170 + }, + { + "epoch": 1.3822603090895773, + "grad_norm": 17.315195083618164, + "learning_rate": 5.876805604820883e-06, + "loss": 3.2456, + "step": 47180 + }, + { + "epoch": 1.3825532849923095, + "grad_norm": 19.095687866210938, + "learning_rate": 5.875248580250344e-06, + "loss": 3.2423, + "step": 47190 + }, + { + "epoch": 1.3828462608950414, + "grad_norm": 16.726001739501953, + "learning_rate": 5.873691468116378e-06, + "loss": 3.2723, + "step": 47200 + }, + { + "epoch": 1.3831392367977733, + "grad_norm": 15.513148307800293, + "learning_rate": 5.872134268574764e-06, + "loss": 3.2614, + "step": 47210 + }, + { + "epoch": 1.3834322127005054, + "grad_norm": 17.2137393951416, + "learning_rate": 5.870576981781291e-06, + "loss": 3.2455, + "step": 47220 + }, + { + "epoch": 1.3837251886032373, + "grad_norm": 15.763782501220703, + "learning_rate": 5.8690196078917535e-06, + "loss": 3.2457, + "step": 47230 + }, + { + "epoch": 1.3840181645059695, + "grad_norm": 17.24970817565918, + "learning_rate": 5.867462147061963e-06, + "loss": 3.2251, + "step": 47240 + }, + { + "epoch": 1.3843111404087014, + "grad_norm": 18.32120132446289, + "learning_rate": 5.865904599447729e-06, + "loss": 3.2508, + "step": 47250 + }, + { + "epoch": 1.3846041163114333, + "grad_norm": 17.16779899597168, + "learning_rate": 5.864346965204879e-06, + "loss": 3.245, + "step": 47260 + }, + { + "epoch": 
1.3848970922141655, + "grad_norm": 17.48908805847168, + "learning_rate": 5.862789244489242e-06, + "loss": 3.2304, + "step": 47270 + }, + { + "epoch": 1.3851900681168974, + "grad_norm": 16.088417053222656, + "learning_rate": 5.86123143745666e-06, + "loss": 3.2791, + "step": 47280 + }, + { + "epoch": 1.3854830440196295, + "grad_norm": 16.190486907958984, + "learning_rate": 5.859673544262982e-06, + "loss": 3.244, + "step": 47290 + }, + { + "epoch": 1.3857760199223614, + "grad_norm": 15.303278923034668, + "learning_rate": 5.8581155650640665e-06, + "loss": 3.2434, + "step": 47300 + }, + { + "epoch": 1.3860689958250934, + "grad_norm": 17.17697525024414, + "learning_rate": 5.856557500015779e-06, + "loss": 3.2687, + "step": 47310 + }, + { + "epoch": 1.3863619717278253, + "grad_norm": 18.87099838256836, + "learning_rate": 5.8549993492739964e-06, + "loss": 3.2593, + "step": 47320 + }, + { + "epoch": 1.3866549476305574, + "grad_norm": 14.031722068786621, + "learning_rate": 5.853441112994601e-06, + "loss": 3.2431, + "step": 47330 + }, + { + "epoch": 1.3869479235332893, + "grad_norm": 16.9622859954834, + "learning_rate": 5.851882791333486e-06, + "loss": 3.2363, + "step": 47340 + }, + { + "epoch": 1.3872408994360215, + "grad_norm": 17.35539436340332, + "learning_rate": 5.850324384446551e-06, + "loss": 3.2756, + "step": 47350 + }, + { + "epoch": 1.3875338753387534, + "grad_norm": 17.667207717895508, + "learning_rate": 5.848765892489706e-06, + "loss": 3.2745, + "step": 47360 + }, + { + "epoch": 1.3878268512414853, + "grad_norm": 17.597055435180664, + "learning_rate": 5.84720731561887e-06, + "loss": 3.2502, + "step": 47370 + }, + { + "epoch": 1.3881198271442174, + "grad_norm": 19.82908821105957, + "learning_rate": 5.845648653989968e-06, + "loss": 3.2458, + "step": 47380 + }, + { + "epoch": 1.3884128030469494, + "grad_norm": 17.115602493286133, + "learning_rate": 5.844089907758935e-06, + "loss": 3.2339, + "step": 47390 + }, + { + "epoch": 1.3887057789496815, + "grad_norm": 
14.834869384765625, + "learning_rate": 5.8425310770817145e-06, + "loss": 3.2591, + "step": 47400 + }, + { + "epoch": 1.3889987548524134, + "grad_norm": 18.50284767150879, + "learning_rate": 5.840972162114259e-06, + "loss": 3.2496, + "step": 47410 + }, + { + "epoch": 1.3892917307551453, + "grad_norm": 18.166845321655273, + "learning_rate": 5.839413163012528e-06, + "loss": 3.2609, + "step": 47420 + }, + { + "epoch": 1.3895847066578773, + "grad_norm": 16.356090545654297, + "learning_rate": 5.837854079932489e-06, + "loss": 3.2427, + "step": 47430 + }, + { + "epoch": 1.3898776825606094, + "grad_norm": 15.510551452636719, + "learning_rate": 5.83629491303012e-06, + "loss": 3.2293, + "step": 47440 + }, + { + "epoch": 1.3901706584633413, + "grad_norm": 18.7862606048584, + "learning_rate": 5.834735662461407e-06, + "loss": 3.2738, + "step": 47450 + }, + { + "epoch": 1.3904636343660735, + "grad_norm": 17.872285842895508, + "learning_rate": 5.833176328382344e-06, + "loss": 3.2369, + "step": 47460 + }, + { + "epoch": 1.3907566102688054, + "grad_norm": 14.947635650634766, + "learning_rate": 5.831616910948932e-06, + "loss": 3.2622, + "step": 47470 + }, + { + "epoch": 1.3910495861715373, + "grad_norm": 20.054277420043945, + "learning_rate": 5.8300574103171806e-06, + "loss": 3.2243, + "step": 47480 + }, + { + "epoch": 1.3913425620742694, + "grad_norm": 16.411367416381836, + "learning_rate": 5.828497826643111e-06, + "loss": 3.2496, + "step": 47490 + }, + { + "epoch": 1.3916355379770013, + "grad_norm": 15.302019119262695, + "learning_rate": 5.8269381600827505e-06, + "loss": 3.2483, + "step": 47500 + }, + { + "epoch": 1.3919285138797335, + "grad_norm": 16.009397506713867, + "learning_rate": 5.825378410792132e-06, + "loss": 3.2526, + "step": 47510 + }, + { + "epoch": 1.3922214897824654, + "grad_norm": 17.2564697265625, + "learning_rate": 5.8238185789273e-06, + "loss": 3.2496, + "step": 47520 + }, + { + "epoch": 1.3925144656851973, + "grad_norm": 20.279979705810547, + "learning_rate": 
5.822258664644306e-06, + "loss": 3.2493, + "step": 47530 + }, + { + "epoch": 1.3928074415879295, + "grad_norm": 15.345588684082031, + "learning_rate": 5.820698668099212e-06, + "loss": 3.2578, + "step": 47540 + }, + { + "epoch": 1.3931004174906614, + "grad_norm": 17.41375160217285, + "learning_rate": 5.8191385894480855e-06, + "loss": 3.2453, + "step": 47550 + }, + { + "epoch": 1.3933933933933935, + "grad_norm": 15.901433944702148, + "learning_rate": 5.817578428847002e-06, + "loss": 3.2532, + "step": 47560 + }, + { + "epoch": 1.3936863692961254, + "grad_norm": 16.978595733642578, + "learning_rate": 5.816018186452048e-06, + "loss": 3.2621, + "step": 47570 + }, + { + "epoch": 1.3939793451988574, + "grad_norm": 15.497074127197266, + "learning_rate": 5.814457862419316e-06, + "loss": 3.2769, + "step": 47580 + }, + { + "epoch": 1.3942723211015893, + "grad_norm": 18.465791702270508, + "learning_rate": 5.8128974569049056e-06, + "loss": 3.2205, + "step": 47590 + }, + { + "epoch": 1.3945652970043214, + "grad_norm": 15.216771125793457, + "learning_rate": 5.811336970064928e-06, + "loss": 3.2408, + "step": 47600 + }, + { + "epoch": 1.3948582729070533, + "grad_norm": 15.47849178314209, + "learning_rate": 5.8097764020555006e-06, + "loss": 3.2652, + "step": 47610 + }, + { + "epoch": 1.3950340584486927, + "eval_bleu": 0.33757574038876265, + "eval_cap_loss": 0.9439195394515991, + "eval_con_loss": 1.2457709312438965, + "eval_loss": 3.4354615211486816, + "step": 47616 + }, + { + "epoch": 1.3950340584486927, + "eval_bleu": 0.33757574038876265, + "eval_cap_loss": 0.9439195394515991, + "eval_con_loss": 1.2457709312438965, + "eval_loss": 3.4354615211486816, + "eval_runtime": 53.5316, + "eval_samples_per_second": 373.611, + "eval_steps_per_second": 0.374, + "step": 47616 + }, + { + "epoch": 1.3951512488097855, + "grad_norm": 16.443431854248047, + "learning_rate": 5.8082157530327476e-06, + "loss": 3.245, + "step": 47620 + }, + { + "epoch": 1.3954442247125174, + "grad_norm": 
17.883525848388672, + "learning_rate": 5.806655023152804e-06, + "loss": 3.2508, + "step": 47630 + }, + { + "epoch": 1.3957372006152493, + "grad_norm": 18.06635093688965, + "learning_rate": 5.80509421257181e-06, + "loss": 3.2814, + "step": 47640 + }, + { + "epoch": 1.3960301765179814, + "grad_norm": 16.10955810546875, + "learning_rate": 5.803533321445918e-06, + "loss": 3.2521, + "step": 47650 + }, + { + "epoch": 1.3963231524207134, + "grad_norm": 13.786580085754395, + "learning_rate": 5.801972349931284e-06, + "loss": 3.2563, + "step": 47660 + }, + { + "epoch": 1.3966161283234455, + "grad_norm": 19.067947387695312, + "learning_rate": 5.800411298184074e-06, + "loss": 3.2479, + "step": 47670 + }, + { + "epoch": 1.3969091042261774, + "grad_norm": 16.91391372680664, + "learning_rate": 5.798850166360461e-06, + "loss": 3.2416, + "step": 47680 + }, + { + "epoch": 1.3972020801289093, + "grad_norm": 18.03921127319336, + "learning_rate": 5.7972889546166314e-06, + "loss": 3.2489, + "step": 47690 + }, + { + "epoch": 1.3974950560316415, + "grad_norm": 16.741430282592773, + "learning_rate": 5.79572766310877e-06, + "loss": 3.251, + "step": 47700 + }, + { + "epoch": 1.3977880319343734, + "grad_norm": 16.44737434387207, + "learning_rate": 5.794166291993078e-06, + "loss": 3.2493, + "step": 47710 + }, + { + "epoch": 1.3980810078371053, + "grad_norm": 20.4849853515625, + "learning_rate": 5.792604841425757e-06, + "loss": 3.274, + "step": 47720 + }, + { + "epoch": 1.3983739837398375, + "grad_norm": 18.72747802734375, + "learning_rate": 5.791043311563027e-06, + "loss": 3.2355, + "step": 47730 + }, + { + "epoch": 1.3986669596425694, + "grad_norm": 16.754291534423828, + "learning_rate": 5.789481702561106e-06, + "loss": 3.2446, + "step": 47740 + }, + { + "epoch": 1.3989599355453013, + "grad_norm": 16.643190383911133, + "learning_rate": 5.787920014576225e-06, + "loss": 3.2554, + "step": 47750 + }, + { + "epoch": 1.3992529114480334, + "grad_norm": 18.588237762451172, + "learning_rate": 
5.786358247764619e-06, + "loss": 3.2275, + "step": 47760 + }, + { + "epoch": 1.3995458873507653, + "grad_norm": 16.795270919799805, + "learning_rate": 5.784796402282538e-06, + "loss": 3.2469, + "step": 47770 + }, + { + "epoch": 1.3998388632534975, + "grad_norm": 21.325531005859375, + "learning_rate": 5.783234478286233e-06, + "loss": 3.2549, + "step": 47780 + }, + { + "epoch": 1.4001318391562294, + "grad_norm": 17.84745216369629, + "learning_rate": 5.781672475931964e-06, + "loss": 3.2393, + "step": 47790 + }, + { + "epoch": 1.4004248150589613, + "grad_norm": 16.710439682006836, + "learning_rate": 5.780110395376002e-06, + "loss": 3.2579, + "step": 47800 + }, + { + "epoch": 1.4007177909616935, + "grad_norm": 16.273653030395508, + "learning_rate": 5.778548236774622e-06, + "loss": 3.2644, + "step": 47810 + }, + { + "epoch": 1.4010107668644254, + "grad_norm": 14.540459632873535, + "learning_rate": 5.776986000284111e-06, + "loss": 3.2583, + "step": 47820 + }, + { + "epoch": 1.4013037427671575, + "grad_norm": 17.55532455444336, + "learning_rate": 5.775423686060759e-06, + "loss": 3.2557, + "step": 47830 + }, + { + "epoch": 1.4015967186698894, + "grad_norm": 15.630904197692871, + "learning_rate": 5.7738612942608695e-06, + "loss": 3.235, + "step": 47840 + }, + { + "epoch": 1.4018896945726214, + "grad_norm": 20.582962036132812, + "learning_rate": 5.7722988250407474e-06, + "loss": 3.2305, + "step": 47850 + }, + { + "epoch": 1.4021826704753533, + "grad_norm": 17.813190460205078, + "learning_rate": 5.770736278556711e-06, + "loss": 3.2433, + "step": 47860 + }, + { + "epoch": 1.4024756463780854, + "grad_norm": 19.775148391723633, + "learning_rate": 5.769173654965082e-06, + "loss": 3.2415, + "step": 47870 + }, + { + "epoch": 1.4027686222808173, + "grad_norm": 16.599201202392578, + "learning_rate": 5.767610954422193e-06, + "loss": 3.263, + "step": 47880 + }, + { + "epoch": 1.4030615981835495, + "grad_norm": 19.833044052124023, + "learning_rate": 5.766048177084379e-06, + "loss": 3.25, 
+ "step": 47890 + }, + { + "epoch": 1.4033545740862814, + "grad_norm": 14.4520902633667, + "learning_rate": 5.764485323107992e-06, + "loss": 3.249, + "step": 47900 + }, + { + "epoch": 1.4036475499890133, + "grad_norm": 15.923966407775879, + "learning_rate": 5.762922392649385e-06, + "loss": 3.2441, + "step": 47910 + }, + { + "epoch": 1.4039405258917454, + "grad_norm": 20.48739242553711, + "learning_rate": 5.761359385864917e-06, + "loss": 3.2495, + "step": 47920 + }, + { + "epoch": 1.4042335017944774, + "grad_norm": 18.05710792541504, + "learning_rate": 5.759796302910959e-06, + "loss": 3.2623, + "step": 47930 + }, + { + "epoch": 1.4045264776972095, + "grad_norm": 19.279848098754883, + "learning_rate": 5.758233143943891e-06, + "loss": 3.2512, + "step": 47940 + }, + { + "epoch": 1.4048194535999414, + "grad_norm": 14.394857406616211, + "learning_rate": 5.756669909120094e-06, + "loss": 3.2267, + "step": 47950 + }, + { + "epoch": 1.4051124295026733, + "grad_norm": 19.91413688659668, + "learning_rate": 5.755106598595962e-06, + "loss": 3.2712, + "step": 47960 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 17.71310806274414, + "learning_rate": 5.75369955452972e-06, + "loss": 3.273, + "step": 47970 + }, + { + "epoch": 1.4056983813081374, + "grad_norm": 15.861708641052246, + "learning_rate": 5.75213610060584e-06, + "loss": 3.226, + "step": 47980 + }, + { + "epoch": 1.4059913572108695, + "grad_norm": 16.777170181274414, + "learning_rate": 5.750572571435205e-06, + "loss": 3.2549, + "step": 47990 + }, + { + "epoch": 1.4062843331136015, + "grad_norm": 17.6169490814209, + "learning_rate": 5.749008967174234e-06, + "loss": 3.2415, + "step": 48000 + }, + { + "epoch": 1.4065773090163334, + "grad_norm": 16.28682518005371, + "learning_rate": 5.747445287979363e-06, + "loss": 3.25, + "step": 48010 + }, + { + "epoch": 1.4068702849190653, + "grad_norm": 17.55040168762207, + "learning_rate": 5.745881534007025e-06, + "loss": 3.2637, + "step": 48020 + }, + { + "epoch": 
1.4071632608217974, + "grad_norm": 18.54486656188965, + "learning_rate": 5.744317705413664e-06, + "loss": 3.242, + "step": 48030 + }, + { + "epoch": 1.4074562367245294, + "grad_norm": 17.23825454711914, + "learning_rate": 5.742753802355733e-06, + "loss": 3.2294, + "step": 48040 + }, + { + "epoch": 1.4077492126272615, + "grad_norm": 16.905088424682617, + "learning_rate": 5.74118982498969e-06, + "loss": 3.2592, + "step": 48050 + }, + { + "epoch": 1.4080421885299934, + "grad_norm": 19.430301666259766, + "learning_rate": 5.739625773472003e-06, + "loss": 3.2345, + "step": 48060 + }, + { + "epoch": 1.4083351644327253, + "grad_norm": 16.58437156677246, + "learning_rate": 5.738061647959146e-06, + "loss": 3.241, + "step": 48070 + }, + { + "epoch": 1.4086281403354575, + "grad_norm": 19.216508865356445, + "learning_rate": 5.736497448607597e-06, + "loss": 3.2516, + "step": 48080 + }, + { + "epoch": 1.4089211162381894, + "grad_norm": 18.58104705810547, + "learning_rate": 5.73493317557385e-06, + "loss": 3.2414, + "step": 48090 + }, + { + "epoch": 1.4092140921409215, + "grad_norm": 17.848299026489258, + "learning_rate": 5.7333688290144e-06, + "loss": 3.2673, + "step": 48100 + }, + { + "epoch": 1.4095070680436534, + "grad_norm": 17.58653450012207, + "learning_rate": 5.731804409085749e-06, + "loss": 3.2409, + "step": 48110 + }, + { + "epoch": 1.4098000439463854, + "grad_norm": 20.400325775146484, + "learning_rate": 5.730239915944407e-06, + "loss": 3.234, + "step": 48120 + }, + { + "epoch": 1.410034424668571, + "eval_bleu": 0.3383900853874161, + "eval_cap_loss": 0.9428002834320068, + "eval_con_loss": 1.2429399490356445, + "eval_loss": 3.428680419921875, + "step": 48128 + }, + { + "epoch": 1.410034424668571, + "eval_bleu": 0.3383900853874161, + "eval_cap_loss": 0.9428002834320068, + "eval_con_loss": 1.2429399490356445, + "eval_loss": 3.428680419921875, + "eval_runtime": 52.986, + "eval_samples_per_second": 377.458, + "eval_steps_per_second": 0.377, + "step": 48128 + }, + { + "epoch": 
1.4100930198491173, + "grad_norm": 20.28081512451172, + "learning_rate": 5.728675349746893e-06, + "loss": 3.2651, + "step": 48130 + }, + { + "epoch": 1.4103859957518494, + "grad_norm": 18.230934143066406, + "learning_rate": 5.727110710649735e-06, + "loss": 3.2492, + "step": 48140 + }, + { + "epoch": 1.4106789716545813, + "grad_norm": 22.18732452392578, + "learning_rate": 5.725545998809465e-06, + "loss": 3.2372, + "step": 48150 + }, + { + "epoch": 1.4109719475573135, + "grad_norm": 18.487417221069336, + "learning_rate": 5.723981214382621e-06, + "loss": 3.2495, + "step": 48160 + }, + { + "epoch": 1.4112649234600454, + "grad_norm": 17.93792724609375, + "learning_rate": 5.722416357525752e-06, + "loss": 3.2502, + "step": 48170 + }, + { + "epoch": 1.4115578993627773, + "grad_norm": 17.90898323059082, + "learning_rate": 5.720851428395413e-06, + "loss": 3.2511, + "step": 48180 + }, + { + "epoch": 1.4118508752655095, + "grad_norm": 20.855484008789062, + "learning_rate": 5.719286427148165e-06, + "loss": 3.2667, + "step": 48190 + }, + { + "epoch": 1.4121438511682414, + "grad_norm": 18.550941467285156, + "learning_rate": 5.717721353940578e-06, + "loss": 3.2458, + "step": 48200 + }, + { + "epoch": 1.4124368270709735, + "grad_norm": 17.809423446655273, + "learning_rate": 5.716156208929226e-06, + "loss": 3.2458, + "step": 48210 + }, + { + "epoch": 1.4127298029737054, + "grad_norm": 16.013629913330078, + "learning_rate": 5.714590992270695e-06, + "loss": 3.25, + "step": 48220 + }, + { + "epoch": 1.4130227788764373, + "grad_norm": 16.553974151611328, + "learning_rate": 5.713025704121574e-06, + "loss": 3.2358, + "step": 48230 + }, + { + "epoch": 1.4133157547791695, + "grad_norm": 15.818912506103516, + "learning_rate": 5.711460344638463e-06, + "loss": 3.2352, + "step": 48240 + }, + { + "epoch": 1.4136087306819014, + "grad_norm": 20.338848114013672, + "learning_rate": 5.709894913977964e-06, + "loss": 3.2255, + "step": 48250 + }, + { + "epoch": 1.4139017065846335, + "grad_norm": 
18.559036254882812, + "learning_rate": 5.7083294122966915e-06, + "loss": 3.2585, + "step": 48260 + }, + { + "epoch": 1.4141946824873655, + "grad_norm": 14.978217124938965, + "learning_rate": 5.706763839751264e-06, + "loss": 3.2563, + "step": 48270 + }, + { + "epoch": 1.4144876583900974, + "grad_norm": 16.451221466064453, + "learning_rate": 5.705198196498307e-06, + "loss": 3.2522, + "step": 48280 + }, + { + "epoch": 1.4147806342928293, + "grad_norm": 17.59916114807129, + "learning_rate": 5.703632482694453e-06, + "loss": 3.251, + "step": 48290 + }, + { + "epoch": 1.4150736101955614, + "grad_norm": 18.750646591186523, + "learning_rate": 5.7020666984963445e-06, + "loss": 3.2482, + "step": 48300 + }, + { + "epoch": 1.4153665860982934, + "grad_norm": 17.897058486938477, + "learning_rate": 5.700500844060629e-06, + "loss": 3.2568, + "step": 48310 + }, + { + "epoch": 1.4156595620010255, + "grad_norm": 18.692039489746094, + "learning_rate": 5.698934919543959e-06, + "loss": 3.234, + "step": 48320 + }, + { + "epoch": 1.4159525379037574, + "grad_norm": 14.250751495361328, + "learning_rate": 5.697368925102996e-06, + "loss": 3.2682, + "step": 48330 + }, + { + "epoch": 1.4162455138064893, + "grad_norm": 16.241838455200195, + "learning_rate": 5.695802860894411e-06, + "loss": 3.2635, + "step": 48340 + }, + { + "epoch": 1.4165384897092215, + "grad_norm": 16.013864517211914, + "learning_rate": 5.694236727074878e-06, + "loss": 3.2264, + "step": 48350 + }, + { + "epoch": 1.4168314656119534, + "grad_norm": 17.869163513183594, + "learning_rate": 5.692670523801079e-06, + "loss": 3.2447, + "step": 48360 + }, + { + "epoch": 1.4171244415146855, + "grad_norm": 16.786203384399414, + "learning_rate": 5.6911042512297025e-06, + "loss": 3.2461, + "step": 48370 + }, + { + "epoch": 1.4174174174174174, + "grad_norm": 15.726099967956543, + "learning_rate": 5.689537909517446e-06, + "loss": 3.2236, + "step": 48380 + }, + { + "epoch": 1.4177103933201494, + "grad_norm": 15.717976570129395, + 
"learning_rate": 5.687971498821013e-06, + "loss": 3.215, + "step": 48390 + }, + { + "epoch": 1.4180033692228813, + "grad_norm": 15.054158210754395, + "learning_rate": 5.686405019297115e-06, + "loss": 3.24, + "step": 48400 + }, + { + "epoch": 1.4182963451256134, + "grad_norm": 17.101398468017578, + "learning_rate": 5.684838471102464e-06, + "loss": 3.2401, + "step": 48410 + }, + { + "epoch": 1.4185893210283453, + "grad_norm": 15.891361236572266, + "learning_rate": 5.683271854393788e-06, + "loss": 3.2593, + "step": 48420 + }, + { + "epoch": 1.4188822969310775, + "grad_norm": 18.34932518005371, + "learning_rate": 5.681705169327817e-06, + "loss": 3.2371, + "step": 48430 + }, + { + "epoch": 1.4191752728338094, + "grad_norm": 17.669885635375977, + "learning_rate": 5.680138416061289e-06, + "loss": 3.2622, + "step": 48440 + }, + { + "epoch": 1.4194682487365413, + "grad_norm": 15.206664085388184, + "learning_rate": 5.678571594750946e-06, + "loss": 3.2328, + "step": 48450 + }, + { + "epoch": 1.4197612246392735, + "grad_norm": 19.32114028930664, + "learning_rate": 5.677004705553542e-06, + "loss": 3.2348, + "step": 48460 + }, + { + "epoch": 1.4200542005420054, + "grad_norm": 14.38621997833252, + "learning_rate": 5.675437748625833e-06, + "loss": 3.2316, + "step": 48470 + }, + { + "epoch": 1.4203471764447375, + "grad_norm": 20.13926887512207, + "learning_rate": 5.673870724124584e-06, + "loss": 3.2546, + "step": 48480 + }, + { + "epoch": 1.4206401523474694, + "grad_norm": 17.6055965423584, + "learning_rate": 5.672303632206568e-06, + "loss": 3.2364, + "step": 48490 + }, + { + "epoch": 1.4209331282502013, + "grad_norm": 18.898744583129883, + "learning_rate": 5.6707364730285605e-06, + "loss": 3.2373, + "step": 48500 + }, + { + "epoch": 1.4212261041529335, + "grad_norm": 17.295475006103516, + "learning_rate": 5.66916924674735e-06, + "loss": 3.2422, + "step": 48510 + }, + { + "epoch": 1.4215190800556654, + "grad_norm": 15.80126953125, + "learning_rate": 5.667601953519727e-06, + "loss": 
3.2386, + "step": 48520 + }, + { + "epoch": 1.4218120559583975, + "grad_norm": 15.238672256469727, + "learning_rate": 5.666034593502489e-06, + "loss": 3.242, + "step": 48530 + }, + { + "epoch": 1.4221050318611295, + "grad_norm": 14.986542701721191, + "learning_rate": 5.66446716685244e-06, + "loss": 3.2203, + "step": 48540 + }, + { + "epoch": 1.4223980077638614, + "grad_norm": 16.746877670288086, + "learning_rate": 5.662899673726394e-06, + "loss": 3.2633, + "step": 48550 + }, + { + "epoch": 1.4226909836665933, + "grad_norm": 19.29926300048828, + "learning_rate": 5.661332114281168e-06, + "loss": 3.2518, + "step": 48560 + }, + { + "epoch": 1.4229839595693254, + "grad_norm": 16.957033157348633, + "learning_rate": 5.659764488673589e-06, + "loss": 3.2285, + "step": 48570 + }, + { + "epoch": 1.4232769354720574, + "grad_norm": 18.053237915039062, + "learning_rate": 5.6581967970604845e-06, + "loss": 3.2273, + "step": 48580 + }, + { + "epoch": 1.4235699113747895, + "grad_norm": 18.152006149291992, + "learning_rate": 5.6566290395986975e-06, + "loss": 3.2556, + "step": 48590 + }, + { + "epoch": 1.4238628872775214, + "grad_norm": 21.37352752685547, + "learning_rate": 5.65506121644507e-06, + "loss": 3.2445, + "step": 48600 + }, + { + "epoch": 1.4241558631802533, + "grad_norm": 16.686120986938477, + "learning_rate": 5.653493327756455e-06, + "loss": 3.2155, + "step": 48610 + }, + { + "epoch": 1.4244488390829855, + "grad_norm": 17.603601455688477, + "learning_rate": 5.651925373689708e-06, + "loss": 3.2635, + "step": 48620 + }, + { + "epoch": 1.4247418149857174, + "grad_norm": 16.766494750976562, + "learning_rate": 5.650357354401696e-06, + "loss": 3.2342, + "step": 48630 + }, + { + "epoch": 1.4250347908884495, + "grad_norm": 17.601085662841797, + "learning_rate": 5.64878927004929e-06, + "loss": 3.2183, + "step": 48640 + }, + { + "epoch": 1.4250347908884495, + "eval_bleu": 0.33836323312409816, + "eval_cap_loss": 0.9418238997459412, + "eval_con_loss": 1.2384276390075684, + 
"eval_loss": 3.4186792373657227, + "step": 48640 + }, + { + "epoch": 1.4250347908884495, + "eval_bleu": 0.33836323312409816, + "eval_cap_loss": 0.9418238997459412, + "eval_con_loss": 1.2384276390075684, + "eval_loss": 3.4186792373657227, + "eval_runtime": 58.4977, + "eval_samples_per_second": 341.894, + "eval_steps_per_second": 0.342, + "step": 48640 + }, + { + "epoch": 1.4253277667911814, + "grad_norm": 18.462390899658203, + "learning_rate": 5.6472211207893655e-06, + "loss": 3.2277, + "step": 48650 + }, + { + "epoch": 1.4256207426939134, + "grad_norm": 16.304031372070312, + "learning_rate": 5.645652906778808e-06, + "loss": 3.2405, + "step": 48660 + }, + { + "epoch": 1.4259137185966455, + "grad_norm": 16.58443260192871, + "learning_rate": 5.644084628174508e-06, + "loss": 3.2228, + "step": 48670 + }, + { + "epoch": 1.4262066944993774, + "grad_norm": 16.75217628479004, + "learning_rate": 5.642516285133361e-06, + "loss": 3.2155, + "step": 48680 + }, + { + "epoch": 1.4264996704021096, + "grad_norm": 18.68769645690918, + "learning_rate": 5.640947877812274e-06, + "loss": 3.258, + "step": 48690 + }, + { + "epoch": 1.4267926463048415, + "grad_norm": 19.50636863708496, + "learning_rate": 5.639379406368152e-06, + "loss": 3.2338, + "step": 48700 + }, + { + "epoch": 1.4270856222075734, + "grad_norm": 17.829133987426758, + "learning_rate": 5.637810870957915e-06, + "loss": 3.2416, + "step": 48710 + }, + { + "epoch": 1.4273785981103053, + "grad_norm": 19.45341682434082, + "learning_rate": 5.636242271738483e-06, + "loss": 3.2311, + "step": 48720 + }, + { + "epoch": 1.4276715740130375, + "grad_norm": 16.942115783691406, + "learning_rate": 5.634673608866789e-06, + "loss": 3.2419, + "step": 48730 + }, + { + "epoch": 1.4279645499157694, + "grad_norm": 18.626853942871094, + "learning_rate": 5.633104882499762e-06, + "loss": 3.2289, + "step": 48740 + }, + { + "epoch": 1.4282575258185015, + "grad_norm": 15.749075889587402, + "learning_rate": 5.631536092794351e-06, + "loss": 3.2335, + 
"step": 48750 + }, + { + "epoch": 1.4285505017212334, + "grad_norm": 16.230165481567383, + "learning_rate": 5.6299672399075e-06, + "loss": 3.2117, + "step": 48760 + }, + { + "epoch": 1.4288434776239654, + "grad_norm": 18.409439086914062, + "learning_rate": 5.628398323996163e-06, + "loss": 3.2537, + "step": 48770 + }, + { + "epoch": 1.4291364535266975, + "grad_norm": 16.681684494018555, + "learning_rate": 5.626829345217303e-06, + "loss": 3.2206, + "step": 48780 + }, + { + "epoch": 1.4294294294294294, + "grad_norm": 14.955892562866211, + "learning_rate": 5.625260303727885e-06, + "loss": 3.262, + "step": 48790 + }, + { + "epoch": 1.4297224053321616, + "grad_norm": 17.938335418701172, + "learning_rate": 5.6236911996848846e-06, + "loss": 3.263, + "step": 48800 + }, + { + "epoch": 1.4300153812348935, + "grad_norm": 18.617910385131836, + "learning_rate": 5.622122033245278e-06, + "loss": 3.2554, + "step": 48810 + }, + { + "epoch": 1.4303083571376254, + "grad_norm": 15.768575668334961, + "learning_rate": 5.620552804566053e-06, + "loss": 3.2232, + "step": 48820 + }, + { + "epoch": 1.4306013330403573, + "grad_norm": 16.066795349121094, + "learning_rate": 5.6189835138042024e-06, + "loss": 3.2632, + "step": 48830 + }, + { + "epoch": 1.4308943089430894, + "grad_norm": 18.54822540283203, + "learning_rate": 5.6174141611167235e-06, + "loss": 3.2421, + "step": 48840 + }, + { + "epoch": 1.4311872848458214, + "grad_norm": 16.072484970092773, + "learning_rate": 5.6158447466606205e-06, + "loss": 3.2328, + "step": 48850 + }, + { + "epoch": 1.4314802607485535, + "grad_norm": 15.282363891601562, + "learning_rate": 5.614275270592904e-06, + "loss": 3.2363, + "step": 48860 + }, + { + "epoch": 1.4317732366512854, + "grad_norm": 18.00886344909668, + "learning_rate": 5.612705733070592e-06, + "loss": 3.2647, + "step": 48870 + }, + { + "epoch": 1.4320662125540173, + "grad_norm": 15.890471458435059, + "learning_rate": 5.611136134250705e-06, + "loss": 3.2525, + "step": 48880 + }, + { + "epoch": 
1.4323591884567495, + "grad_norm": 16.711389541625977, + "learning_rate": 5.6095664742902746e-06, + "loss": 3.2397, + "step": 48890 + }, + { + "epoch": 1.4326521643594814, + "grad_norm": 16.670618057250977, + "learning_rate": 5.607996753346336e-06, + "loss": 3.2424, + "step": 48900 + }, + { + "epoch": 1.4329451402622135, + "grad_norm": 19.325172424316406, + "learning_rate": 5.606426971575926e-06, + "loss": 3.2532, + "step": 48910 + }, + { + "epoch": 1.4332381161649455, + "grad_norm": 15.319558143615723, + "learning_rate": 5.604857129136098e-06, + "loss": 3.2279, + "step": 48920 + }, + { + "epoch": 1.4335310920676774, + "grad_norm": 16.901350021362305, + "learning_rate": 5.603287226183902e-06, + "loss": 3.242, + "step": 48930 + }, + { + "epoch": 1.4338240679704095, + "grad_norm": 15.435789108276367, + "learning_rate": 5.601717262876399e-06, + "loss": 3.2379, + "step": 48940 + }, + { + "epoch": 1.4341170438731414, + "grad_norm": 18.26671600341797, + "learning_rate": 5.600147239370652e-06, + "loss": 3.2373, + "step": 48950 + }, + { + "epoch": 1.4344100197758736, + "grad_norm": 15.028426170349121, + "learning_rate": 5.598577155823736e-06, + "loss": 3.2373, + "step": 48960 + }, + { + "epoch": 1.4347029956786055, + "grad_norm": 14.334989547729492, + "learning_rate": 5.5970070123927255e-06, + "loss": 3.2513, + "step": 48970 + }, + { + "epoch": 1.4349959715813374, + "grad_norm": 17.33523178100586, + "learning_rate": 5.595436809234707e-06, + "loss": 3.2148, + "step": 48980 + }, + { + "epoch": 1.4352889474840693, + "grad_norm": 18.061885833740234, + "learning_rate": 5.593866546506767e-06, + "loss": 3.2513, + "step": 48990 + }, + { + "epoch": 1.4355819233868015, + "grad_norm": 17.24138832092285, + "learning_rate": 5.592296224366002e-06, + "loss": 3.2118, + "step": 49000 + }, + { + "epoch": 1.4358748992895334, + "grad_norm": 16.99064826965332, + "learning_rate": 5.590725842969514e-06, + "loss": 3.2175, + "step": 49010 + }, + { + "epoch": 1.4361678751922655, + "grad_norm": 
17.817825317382812, + "learning_rate": 5.589155402474411e-06, + "loss": 3.231, + "step": 49020 + }, + { + "epoch": 1.4364608510949974, + "grad_norm": 18.12320327758789, + "learning_rate": 5.5875849030378036e-06, + "loss": 3.257, + "step": 49030 + }, + { + "epoch": 1.4367538269977294, + "grad_norm": 17.253002166748047, + "learning_rate": 5.586014344816815e-06, + "loss": 3.2565, + "step": 49040 + }, + { + "epoch": 1.4370468029004615, + "grad_norm": 17.508291244506836, + "learning_rate": 5.584443727968566e-06, + "loss": 3.2459, + "step": 49050 + }, + { + "epoch": 1.4373397788031934, + "grad_norm": 17.807466506958008, + "learning_rate": 5.5828730526501906e-06, + "loss": 3.2324, + "step": 49060 + }, + { + "epoch": 1.4376327547059256, + "grad_norm": 17.913352966308594, + "learning_rate": 5.5813023190188235e-06, + "loss": 3.2321, + "step": 49070 + }, + { + "epoch": 1.4379257306086575, + "grad_norm": 15.848189353942871, + "learning_rate": 5.5797315272316074e-06, + "loss": 3.2349, + "step": 49080 + }, + { + "epoch": 1.4382187065113894, + "grad_norm": 15.89129638671875, + "learning_rate": 5.578160677445693e-06, + "loss": 3.2436, + "step": 49090 + }, + { + "epoch": 1.4385116824141213, + "grad_norm": 16.15705680847168, + "learning_rate": 5.576589769818233e-06, + "loss": 3.2475, + "step": 49100 + }, + { + "epoch": 1.4388046583168534, + "grad_norm": 19.58823013305664, + "learning_rate": 5.575018804506386e-06, + "loss": 3.248, + "step": 49110 + }, + { + "epoch": 1.4390976342195854, + "grad_norm": 18.075237274169922, + "learning_rate": 5.573447781667321e-06, + "loss": 3.216, + "step": 49120 + }, + { + "epoch": 1.4393906101223175, + "grad_norm": 17.724712371826172, + "learning_rate": 5.571876701458205e-06, + "loss": 3.2236, + "step": 49130 + }, + { + "epoch": 1.4396835860250494, + "grad_norm": 14.998577117919922, + "learning_rate": 5.57030556403622e-06, + "loss": 3.2333, + "step": 49140 + }, + { + "epoch": 1.4399765619277813, + "grad_norm": 18.223073959350586, + "learning_rate": 
5.568734369558545e-06, + "loss": 3.2497, + "step": 49150 + }, + { + "epoch": 1.4400351571083279, + "eval_bleu": 0.33864891626220367, + "eval_cap_loss": 0.9415960311889648, + "eval_con_loss": 1.2402079105377197, + "eval_loss": 3.4220118522644043, + "step": 49152 + }, + { + "epoch": 1.4400351571083279, + "eval_bleu": 0.33864891626220367, + "eval_cap_loss": 0.9415960311889648, + "eval_con_loss": 1.2402079105377197, + "eval_loss": 3.4220118522644043, + "eval_runtime": 54.1247, + "eval_samples_per_second": 369.517, + "eval_steps_per_second": 0.37, + "step": 49152 + }, + { + "epoch": 1.4402695378305135, + "grad_norm": 15.78544807434082, + "learning_rate": 5.567163118182373e-06, + "loss": 3.2107, + "step": 49160 + }, + { + "epoch": 1.4405625137332454, + "grad_norm": 16.464570999145508, + "learning_rate": 5.565591810064894e-06, + "loss": 3.2182, + "step": 49170 + }, + { + "epoch": 1.4408554896359775, + "grad_norm": 17.43706512451172, + "learning_rate": 5.564020445363309e-06, + "loss": 3.2501, + "step": 49180 + }, + { + "epoch": 1.4411484655387095, + "grad_norm": 17.19083023071289, + "learning_rate": 5.562449024234826e-06, + "loss": 3.2187, + "step": 49190 + }, + { + "epoch": 1.4414414414414414, + "grad_norm": 18.852033615112305, + "learning_rate": 5.560877546836655e-06, + "loss": 3.2692, + "step": 49200 + }, + { + "epoch": 1.4417344173441735, + "grad_norm": 14.316603660583496, + "learning_rate": 5.559306013326014e-06, + "loss": 3.239, + "step": 49210 + }, + { + "epoch": 1.4420273932469054, + "grad_norm": 17.988950729370117, + "learning_rate": 5.557734423860122e-06, + "loss": 3.2509, + "step": 49220 + }, + { + "epoch": 1.4423203691496376, + "grad_norm": 17.519926071166992, + "learning_rate": 5.556162778596213e-06, + "loss": 3.231, + "step": 49230 + }, + { + "epoch": 1.4426133450523695, + "grad_norm": 17.609548568725586, + "learning_rate": 5.554591077691516e-06, + "loss": 3.2741, + "step": 49240 + }, + { + "epoch": 1.4429063209551014, + "grad_norm": 15.219289779663086, + 
"learning_rate": 5.553019321303271e-06, + "loss": 3.2248, + "step": 49250 + }, + { + "epoch": 1.4431992968578333, + "grad_norm": 15.767775535583496, + "learning_rate": 5.551447509588725e-06, + "loss": 3.1989, + "step": 49260 + }, + { + "epoch": 1.4434922727605655, + "grad_norm": 16.065717697143555, + "learning_rate": 5.549875642705127e-06, + "loss": 3.2462, + "step": 49270 + }, + { + "epoch": 1.4437852486632974, + "grad_norm": 16.609806060791016, + "learning_rate": 5.548303720809734e-06, + "loss": 3.2351, + "step": 49280 + }, + { + "epoch": 1.4440782245660295, + "grad_norm": 16.5321044921875, + "learning_rate": 5.546731744059807e-06, + "loss": 3.2205, + "step": 49290 + }, + { + "epoch": 1.4443712004687614, + "grad_norm": 19.93864631652832, + "learning_rate": 5.545159712612611e-06, + "loss": 3.2495, + "step": 49300 + }, + { + "epoch": 1.4446641763714934, + "grad_norm": 15.85244369506836, + "learning_rate": 5.543587626625421e-06, + "loss": 3.2407, + "step": 49310 + }, + { + "epoch": 1.4449571522742255, + "grad_norm": 17.315473556518555, + "learning_rate": 5.542015486255514e-06, + "loss": 3.2281, + "step": 49320 + }, + { + "epoch": 1.4452501281769574, + "grad_norm": 16.15890884399414, + "learning_rate": 5.540443291660173e-06, + "loss": 3.2234, + "step": 49330 + }, + { + "epoch": 1.4455431040796896, + "grad_norm": 16.193574905395508, + "learning_rate": 5.538871042996688e-06, + "loss": 3.2438, + "step": 49340 + }, + { + "epoch": 1.4458360799824215, + "grad_norm": 17.243375778198242, + "learning_rate": 5.53729874042235e-06, + "loss": 3.2494, + "step": 49350 + }, + { + "epoch": 1.4461290558851534, + "grad_norm": 17.051414489746094, + "learning_rate": 5.535726384094462e-06, + "loss": 3.2314, + "step": 49360 + }, + { + "epoch": 1.4464220317878853, + "grad_norm": 13.087298393249512, + "learning_rate": 5.534153974170329e-06, + "loss": 3.2383, + "step": 49370 + }, + { + "epoch": 1.4467150076906174, + "grad_norm": 17.826826095581055, + "learning_rate": 5.532581510807258e-06, + 
"loss": 3.2373, + "step": 49380 + }, + { + "epoch": 1.4470079835933494, + "grad_norm": 17.864274978637695, + "learning_rate": 5.531008994162566e-06, + "loss": 3.2195, + "step": 49390 + }, + { + "epoch": 1.4473009594960815, + "grad_norm": 16.2109317779541, + "learning_rate": 5.529436424393576e-06, + "loss": 3.2306, + "step": 49400 + }, + { + "epoch": 1.4475939353988134, + "grad_norm": 19.447341918945312, + "learning_rate": 5.527863801657612e-06, + "loss": 3.238, + "step": 49410 + }, + { + "epoch": 1.4478869113015453, + "grad_norm": 16.983539581298828, + "learning_rate": 5.5262911261120075e-06, + "loss": 3.2634, + "step": 49420 + }, + { + "epoch": 1.4481798872042775, + "grad_norm": 19.075668334960938, + "learning_rate": 5.524718397914097e-06, + "loss": 3.2279, + "step": 49430 + }, + { + "epoch": 1.4484728631070094, + "grad_norm": 15.520550727844238, + "learning_rate": 5.523145617221224e-06, + "loss": 3.2246, + "step": 49440 + }, + { + "epoch": 1.4487658390097415, + "grad_norm": 18.868066787719727, + "learning_rate": 5.521572784190736e-06, + "loss": 3.242, + "step": 49450 + }, + { + "epoch": 1.4490588149124735, + "grad_norm": 17.389720916748047, + "learning_rate": 5.5199998989799866e-06, + "loss": 3.2372, + "step": 49460 + }, + { + "epoch": 1.4493517908152054, + "grad_norm": 15.235702514648438, + "learning_rate": 5.5184269617463295e-06, + "loss": 3.2417, + "step": 49470 + }, + { + "epoch": 1.4496447667179375, + "grad_norm": 15.525327682495117, + "learning_rate": 5.516853972647132e-06, + "loss": 3.243, + "step": 49480 + }, + { + "epoch": 1.4499377426206694, + "grad_norm": 16.236759185791016, + "learning_rate": 5.515280931839762e-06, + "loss": 3.224, + "step": 49490 + }, + { + "epoch": 1.4502307185234016, + "grad_norm": 14.618476867675781, + "learning_rate": 5.513707839481591e-06, + "loss": 3.2238, + "step": 49500 + }, + { + "epoch": 1.4505236944261335, + "grad_norm": 18.10725212097168, + "learning_rate": 5.512134695729998e-06, + "loss": 3.2473, + "step": 49510 + }, + { 
+ "epoch": 1.4508166703288654, + "grad_norm": 17.996673583984375, + "learning_rate": 5.5105615007423675e-06, + "loss": 3.226, + "step": 49520 + }, + { + "epoch": 1.4511096462315973, + "grad_norm": 16.650999069213867, + "learning_rate": 5.508988254676087e-06, + "loss": 3.2192, + "step": 49530 + }, + { + "epoch": 1.4514026221343295, + "grad_norm": 17.731914520263672, + "learning_rate": 5.507414957688554e-06, + "loss": 3.2546, + "step": 49540 + }, + { + "epoch": 1.4516955980370614, + "grad_norm": 17.08881187438965, + "learning_rate": 5.505841609937162e-06, + "loss": 3.2229, + "step": 49550 + }, + { + "epoch": 1.4519885739397935, + "grad_norm": 16.467451095581055, + "learning_rate": 5.504268211579318e-06, + "loss": 3.214, + "step": 49560 + }, + { + "epoch": 1.4522815498425254, + "grad_norm": 17.816503524780273, + "learning_rate": 5.502694762772432e-06, + "loss": 3.2356, + "step": 49570 + }, + { + "epoch": 1.4525745257452574, + "grad_norm": 16.48475456237793, + "learning_rate": 5.5011212636739175e-06, + "loss": 3.2414, + "step": 49580 + }, + { + "epoch": 1.4528675016479895, + "grad_norm": 17.016080856323242, + "learning_rate": 5.499547714441195e-06, + "loss": 3.2356, + "step": 49590 + }, + { + "epoch": 1.4531604775507214, + "grad_norm": 20.23782730102539, + "learning_rate": 5.497974115231685e-06, + "loss": 3.2344, + "step": 49600 + }, + { + "epoch": 1.4534534534534536, + "grad_norm": 16.861961364746094, + "learning_rate": 5.4964004662028206e-06, + "loss": 3.2388, + "step": 49610 + }, + { + "epoch": 1.4537464293561855, + "grad_norm": 15.671738624572754, + "learning_rate": 5.494826767512034e-06, + "loss": 3.2174, + "step": 49620 + }, + { + "epoch": 1.4540394052589174, + "grad_norm": 14.451844215393066, + "learning_rate": 5.493253019316765e-06, + "loss": 3.237, + "step": 49630 + }, + { + "epoch": 1.4543323811616495, + "grad_norm": 17.057729721069336, + "learning_rate": 5.491679221774458e-06, + "loss": 3.2383, + "step": 49640 + }, + { + "epoch": 1.4546253570643815, + 
"grad_norm": 18.398462295532227, + "learning_rate": 5.490105375042562e-06, + "loss": 3.2146, + "step": 49650 + }, + { + "epoch": 1.4549183329671136, + "grad_norm": 15.467896461486816, + "learning_rate": 5.48853147927853e-06, + "loss": 3.2369, + "step": 49660 + }, + { + "epoch": 1.4550355233282062, + "eval_bleu": 0.3389174365771333, + "eval_cap_loss": 0.9409569501876831, + "eval_con_loss": 1.234084963798523, + "eval_loss": 3.4091267585754395, + "step": 49664 + }, + { + "epoch": 1.4550355233282062, + "eval_bleu": 0.3389174365771333, + "eval_cap_loss": 0.9409569501876831, + "eval_con_loss": 1.234084963798523, + "eval_loss": 3.4091267585754395, + "eval_runtime": 51.8097, + "eval_samples_per_second": 386.028, + "eval_steps_per_second": 0.386, + "step": 49664 + }, + { + "epoch": 1.4552113088698455, + "grad_norm": 17.54368019104004, + "learning_rate": 5.486957534639823e-06, + "loss": 3.2393, + "step": 49670 + }, + { + "epoch": 1.4555042847725774, + "grad_norm": 17.96737289428711, + "learning_rate": 5.485383541283903e-06, + "loss": 3.2454, + "step": 49680 + }, + { + "epoch": 1.4557972606753093, + "grad_norm": 16.296083450317383, + "learning_rate": 5.48380949936824e-06, + "loss": 3.2133, + "step": 49690 + }, + { + "epoch": 1.4560902365780415, + "grad_norm": 17.900014877319336, + "learning_rate": 5.482235409050307e-06, + "loss": 3.2522, + "step": 49700 + }, + { + "epoch": 1.4563832124807734, + "grad_norm": 19.959524154663086, + "learning_rate": 5.480661270487582e-06, + "loss": 3.2257, + "step": 49710 + }, + { + "epoch": 1.4566761883835055, + "grad_norm": 15.009293556213379, + "learning_rate": 5.4790870838375475e-06, + "loss": 3.2323, + "step": 49720 + }, + { + "epoch": 1.4569691642862375, + "grad_norm": 17.285091400146484, + "learning_rate": 5.4775128492576945e-06, + "loss": 3.2396, + "step": 49730 + }, + { + "epoch": 1.4572621401889694, + "grad_norm": 18.514005661010742, + "learning_rate": 5.475938566905513e-06, + "loss": 3.2447, + "step": 49740 + }, + { + "epoch": 
1.4575551160917015, + "grad_norm": 18.97156524658203, + "learning_rate": 5.474364236938502e-06, + "loss": 3.241, + "step": 49750 + }, + { + "epoch": 1.4578480919944334, + "grad_norm": 14.599343299865723, + "learning_rate": 5.472789859514162e-06, + "loss": 3.2236, + "step": 49760 + }, + { + "epoch": 1.4581410678971656, + "grad_norm": 13.622815132141113, + "learning_rate": 5.471215434790002e-06, + "loss": 3.2288, + "step": 49770 + }, + { + "epoch": 1.4584340437998975, + "grad_norm": 19.893877029418945, + "learning_rate": 5.469640962923535e-06, + "loss": 3.2175, + "step": 49780 + }, + { + "epoch": 1.4587270197026294, + "grad_norm": 19.45740509033203, + "learning_rate": 5.468066444072276e-06, + "loss": 3.2427, + "step": 49790 + }, + { + "epoch": 1.4590199956053613, + "grad_norm": 16.543161392211914, + "learning_rate": 5.466491878393743e-06, + "loss": 3.2411, + "step": 49800 + }, + { + "epoch": 1.4593129715080935, + "grad_norm": 14.789901733398438, + "learning_rate": 5.464917266045468e-06, + "loss": 3.2199, + "step": 49810 + }, + { + "epoch": 1.4596059474108254, + "grad_norm": 15.166975021362305, + "learning_rate": 5.463342607184978e-06, + "loss": 3.236, + "step": 49820 + }, + { + "epoch": 1.4598989233135575, + "grad_norm": 16.798051834106445, + "learning_rate": 5.461767901969809e-06, + "loss": 3.2341, + "step": 49830 + }, + { + "epoch": 1.4601918992162894, + "grad_norm": 14.766622543334961, + "learning_rate": 5.4601931505575e-06, + "loss": 3.2247, + "step": 49840 + }, + { + "epoch": 1.4604848751190214, + "grad_norm": 17.38770294189453, + "learning_rate": 5.4586183531055984e-06, + "loss": 3.2014, + "step": 49850 + }, + { + "epoch": 1.4607778510217535, + "grad_norm": 13.47825813293457, + "learning_rate": 5.45704350977165e-06, + "loss": 3.2172, + "step": 49860 + }, + { + "epoch": 1.4610708269244854, + "grad_norm": 21.413076400756836, + "learning_rate": 5.455468620713211e-06, + "loss": 3.2593, + "step": 49870 + }, + { + "epoch": 1.4613638028272176, + "grad_norm": 
20.120084762573242, + "learning_rate": 5.453893686087838e-06, + "loss": 3.2513, + "step": 49880 + }, + { + "epoch": 1.4616567787299495, + "grad_norm": 16.67658805847168, + "learning_rate": 5.452318706053095e-06, + "loss": 3.2455, + "step": 49890 + }, + { + "epoch": 1.4619497546326814, + "grad_norm": 16.1076602935791, + "learning_rate": 5.450743680766549e-06, + "loss": 3.2254, + "step": 49900 + }, + { + "epoch": 1.4622427305354135, + "grad_norm": 18.72467613220215, + "learning_rate": 5.449168610385772e-06, + "loss": 3.2327, + "step": 49910 + }, + { + "epoch": 1.4625357064381455, + "grad_norm": 15.442710876464844, + "learning_rate": 5.447593495068339e-06, + "loss": 3.2189, + "step": 49920 + }, + { + "epoch": 1.4628286823408776, + "grad_norm": 18.997718811035156, + "learning_rate": 5.446018334971833e-06, + "loss": 3.2331, + "step": 49930 + }, + { + "epoch": 1.4631216582436095, + "grad_norm": 16.802555084228516, + "learning_rate": 5.444443130253839e-06, + "loss": 3.2431, + "step": 49940 + }, + { + "epoch": 1.4634146341463414, + "grad_norm": 14.805130004882812, + "learning_rate": 5.442867881071947e-06, + "loss": 3.2561, + "step": 49950 + }, + { + "epoch": 1.4637076100490733, + "grad_norm": 19.70696449279785, + "learning_rate": 5.441292587583751e-06, + "loss": 3.2569, + "step": 49960 + }, + { + "epoch": 1.4640005859518055, + "grad_norm": 17.498920440673828, + "learning_rate": 5.439874785692739e-06, + "loss": 3.2409, + "step": 49970 + }, + { + "epoch": 1.4642935618545374, + "grad_norm": 16.4594669342041, + "learning_rate": 5.4382994084567545e-06, + "loss": 3.2377, + "step": 49980 + }, + { + "epoch": 1.4645865377572695, + "grad_norm": 15.979717254638672, + "learning_rate": 5.436723987371513e-06, + "loss": 3.2619, + "step": 49990 + }, + { + "epoch": 1.4648795136600015, + "grad_norm": 15.181096076965332, + "learning_rate": 5.435148522594627e-06, + "loss": 3.2297, + "step": 50000 + }, + { + "epoch": 1.4651724895627334, + "grad_norm": 16.82904815673828, + "learning_rate": 
5.433573014283714e-06, + "loss": 3.2267, + "step": 50010 + }, + { + "epoch": 1.4654654654654655, + "grad_norm": 17.214984893798828, + "learning_rate": 5.4319974625963935e-06, + "loss": 3.2409, + "step": 50020 + }, + { + "epoch": 1.4657584413681974, + "grad_norm": 16.722585678100586, + "learning_rate": 5.43042186769029e-06, + "loss": 3.2318, + "step": 50030 + }, + { + "epoch": 1.4660514172709296, + "grad_norm": 20.678672790527344, + "learning_rate": 5.42884622972303e-06, + "loss": 3.2212, + "step": 50040 + }, + { + "epoch": 1.4663443931736615, + "grad_norm": 20.44985580444336, + "learning_rate": 5.427270548852251e-06, + "loss": 3.232, + "step": 50050 + }, + { + "epoch": 1.4666373690763934, + "grad_norm": 18.468454360961914, + "learning_rate": 5.4256948252355876e-06, + "loss": 3.2451, + "step": 50060 + }, + { + "epoch": 1.4669303449791253, + "grad_norm": 17.54448127746582, + "learning_rate": 5.424119059030684e-06, + "loss": 3.2161, + "step": 50070 + }, + { + "epoch": 1.4672233208818575, + "grad_norm": 21.186513900756836, + "learning_rate": 5.4225432503951825e-06, + "loss": 3.241, + "step": 50080 + }, + { + "epoch": 1.4675162967845894, + "grad_norm": 18.174640655517578, + "learning_rate": 5.420967399486738e-06, + "loss": 3.2018, + "step": 50090 + }, + { + "epoch": 1.4678092726873215, + "grad_norm": 18.581279754638672, + "learning_rate": 5.4193915064630005e-06, + "loss": 3.2152, + "step": 50100 + }, + { + "epoch": 1.4681022485900534, + "grad_norm": 15.660799980163574, + "learning_rate": 5.417815571481633e-06, + "loss": 3.2343, + "step": 50110 + }, + { + "epoch": 1.4683952244927854, + "grad_norm": 15.336440086364746, + "learning_rate": 5.416239594700294e-06, + "loss": 3.2428, + "step": 50120 + }, + { + "epoch": 1.4686882003955175, + "grad_norm": 17.974586486816406, + "learning_rate": 5.414663576276655e-06, + "loss": 3.223, + "step": 50130 + }, + { + "epoch": 1.4689811762982494, + "grad_norm": 16.171823501586914, + "learning_rate": 5.413087516368385e-06, + "loss": 
3.2452, + "step": 50140 + }, + { + "epoch": 1.4692741522009816, + "grad_norm": 17.62972068786621, + "learning_rate": 5.41151141513316e-06, + "loss": 3.2394, + "step": 50150 + }, + { + "epoch": 1.4695671281037135, + "grad_norm": 16.804332733154297, + "learning_rate": 5.409935272728658e-06, + "loss": 3.2301, + "step": 50160 + }, + { + "epoch": 1.4698601040064454, + "grad_norm": 16.846939086914062, + "learning_rate": 5.408359089312565e-06, + "loss": 3.2132, + "step": 50170 + }, + { + "epoch": 1.4700358895480847, + "eval_bleu": 0.3389613257337674, + "eval_cap_loss": 0.9405626058578491, + "eval_con_loss": 1.2345869541168213, + "eval_loss": 3.4097366333007812, + "step": 50176 + }, + { + "epoch": 1.4700358895480847, + "eval_bleu": 0.3389613257337674, + "eval_cap_loss": 0.9405626058578491, + "eval_con_loss": 1.2345869541168213, + "eval_loss": 3.4097366333007812, + "eval_runtime": 53.9017, + "eval_samples_per_second": 371.046, + "eval_steps_per_second": 0.371, + "step": 50176 + }, + { + "epoch": 1.4701530799091775, + "grad_norm": 17.29787254333496, + "learning_rate": 5.406782865042568e-06, + "loss": 3.2256, + "step": 50180 + }, + { + "epoch": 1.4704460558119095, + "grad_norm": 14.522734642028809, + "learning_rate": 5.405206600076358e-06, + "loss": 3.2323, + "step": 50190 + }, + { + "epoch": 1.4707390317146416, + "grad_norm": 17.767288208007812, + "learning_rate": 5.403630294571631e-06, + "loss": 3.2416, + "step": 50200 + }, + { + "epoch": 1.4710320076173735, + "grad_norm": 19.018169403076172, + "learning_rate": 5.402053948686089e-06, + "loss": 3.2358, + "step": 50210 + }, + { + "epoch": 1.4713249835201054, + "grad_norm": 16.813257217407227, + "learning_rate": 5.400477562577435e-06, + "loss": 3.2314, + "step": 50220 + }, + { + "epoch": 1.4716179594228374, + "grad_norm": 14.79941463470459, + "learning_rate": 5.398901136403376e-06, + "loss": 3.2318, + "step": 50230 + }, + { + "epoch": 1.4719109353255695, + "grad_norm": 18.589977264404297, + "learning_rate": 
5.397324670321625e-06, + "loss": 3.2052, + "step": 50240 + }, + { + "epoch": 1.4722039112283014, + "grad_norm": 20.231353759765625, + "learning_rate": 5.395748164489896e-06, + "loss": 3.2314, + "step": 50250 + }, + { + "epoch": 1.4724968871310335, + "grad_norm": 12.861088752746582, + "learning_rate": 5.394171619065914e-06, + "loss": 3.2234, + "step": 50260 + }, + { + "epoch": 1.4727898630337655, + "grad_norm": 14.690145492553711, + "learning_rate": 5.3925950342074005e-06, + "loss": 3.224, + "step": 50270 + }, + { + "epoch": 1.4730828389364974, + "grad_norm": 14.429949760437012, + "learning_rate": 5.391018410072082e-06, + "loss": 3.2236, + "step": 50280 + }, + { + "epoch": 1.4733758148392295, + "grad_norm": 18.815519332885742, + "learning_rate": 5.389441746817689e-06, + "loss": 3.2348, + "step": 50290 + }, + { + "epoch": 1.4736687907419614, + "grad_norm": 18.768674850463867, + "learning_rate": 5.387865044601963e-06, + "loss": 3.2134, + "step": 50300 + }, + { + "epoch": 1.4739617666446936, + "grad_norm": 16.700557708740234, + "learning_rate": 5.386288303582639e-06, + "loss": 3.233, + "step": 50310 + }, + { + "epoch": 1.4742547425474255, + "grad_norm": 16.51518440246582, + "learning_rate": 5.384711523917463e-06, + "loss": 3.2343, + "step": 50320 + }, + { + "epoch": 1.4745477184501574, + "grad_norm": 15.109516143798828, + "learning_rate": 5.383134705764181e-06, + "loss": 3.2335, + "step": 50330 + }, + { + "epoch": 1.4748406943528893, + "grad_norm": 20.62188720703125, + "learning_rate": 5.381557849280544e-06, + "loss": 3.2348, + "step": 50340 + }, + { + "epoch": 1.4751336702556215, + "grad_norm": 19.037067413330078, + "learning_rate": 5.3799809546243085e-06, + "loss": 3.2093, + "step": 50350 + }, + { + "epoch": 1.4754266461583534, + "grad_norm": 17.20465660095215, + "learning_rate": 5.378404021953233e-06, + "loss": 3.2181, + "step": 50360 + }, + { + "epoch": 1.4757196220610855, + "grad_norm": 15.572236061096191, + "learning_rate": 5.376827051425078e-06, + "loss": 
3.2147, + "step": 50370 + }, + { + "epoch": 1.4760125979638175, + "grad_norm": 17.82308578491211, + "learning_rate": 5.3752500431976155e-06, + "loss": 3.2268, + "step": 50380 + }, + { + "epoch": 1.4763055738665494, + "grad_norm": 16.917009353637695, + "learning_rate": 5.373672997428611e-06, + "loss": 3.2346, + "step": 50390 + }, + { + "epoch": 1.4765985497692815, + "grad_norm": 20.556238174438477, + "learning_rate": 5.37209591427584e-06, + "loss": 3.2109, + "step": 50400 + }, + { + "epoch": 1.4768915256720134, + "grad_norm": 20.5754451751709, + "learning_rate": 5.3705187938970794e-06, + "loss": 3.2027, + "step": 50410 + }, + { + "epoch": 1.4771845015747456, + "grad_norm": 14.749821662902832, + "learning_rate": 5.368941636450113e-06, + "loss": 3.1929, + "step": 50420 + }, + { + "epoch": 1.4774774774774775, + "grad_norm": 17.699892044067383, + "learning_rate": 5.367364442092724e-06, + "loss": 3.2113, + "step": 50430 + }, + { + "epoch": 1.4777704533802094, + "grad_norm": 17.021833419799805, + "learning_rate": 5.3657872109827026e-06, + "loss": 3.2114, + "step": 50440 + }, + { + "epoch": 1.4780634292829415, + "grad_norm": 16.21695327758789, + "learning_rate": 5.364209943277841e-06, + "loss": 3.2102, + "step": 50450 + }, + { + "epoch": 1.4783564051856735, + "grad_norm": 15.3029146194458, + "learning_rate": 5.362632639135934e-06, + "loss": 3.2308, + "step": 50460 + }, + { + "epoch": 1.4786493810884056, + "grad_norm": 16.933082580566406, + "learning_rate": 5.3610552987147845e-06, + "loss": 3.1939, + "step": 50470 + }, + { + "epoch": 1.4789423569911375, + "grad_norm": 17.91149139404297, + "learning_rate": 5.359477922172194e-06, + "loss": 3.2209, + "step": 50480 + }, + { + "epoch": 1.4792353328938694, + "grad_norm": 19.163490295410156, + "learning_rate": 5.3579005096659695e-06, + "loss": 3.2443, + "step": 50490 + }, + { + "epoch": 1.4795283087966014, + "grad_norm": 18.516735076904297, + "learning_rate": 5.356323061353923e-06, + "loss": 3.2311, + "step": 50500 + }, + { + 
"epoch": 1.4798212846993335, + "grad_norm": 17.580167770385742, + "learning_rate": 5.354745577393868e-06, + "loss": 3.2386, + "step": 50510 + }, + { + "epoch": 1.4801142606020654, + "grad_norm": 17.397602081298828, + "learning_rate": 5.353168057943623e-06, + "loss": 3.2275, + "step": 50520 + }, + { + "epoch": 1.4804072365047976, + "grad_norm": 13.045212745666504, + "learning_rate": 5.351590503161007e-06, + "loss": 3.2358, + "step": 50530 + }, + { + "epoch": 1.4807002124075295, + "grad_norm": 16.360454559326172, + "learning_rate": 5.350012913203849e-06, + "loss": 3.2412, + "step": 50540 + }, + { + "epoch": 1.4809931883102614, + "grad_norm": 15.270153045654297, + "learning_rate": 5.348435288229976e-06, + "loss": 3.2362, + "step": 50550 + }, + { + "epoch": 1.4812861642129935, + "grad_norm": 17.858150482177734, + "learning_rate": 5.34685762839722e-06, + "loss": 3.2196, + "step": 50560 + }, + { + "epoch": 1.4815791401157254, + "grad_norm": 16.2511043548584, + "learning_rate": 5.345279933863413e-06, + "loss": 3.2321, + "step": 50570 + }, + { + "epoch": 1.4818721160184576, + "grad_norm": 17.158470153808594, + "learning_rate": 5.3437022047864e-06, + "loss": 3.2256, + "step": 50580 + }, + { + "epoch": 1.4821650919211895, + "grad_norm": 12.019499778747559, + "learning_rate": 5.342124441324021e-06, + "loss": 3.2435, + "step": 50590 + }, + { + "epoch": 1.4824580678239214, + "grad_norm": 17.033523559570312, + "learning_rate": 5.340546643634118e-06, + "loss": 3.2392, + "step": 50600 + }, + { + "epoch": 1.4827510437266536, + "grad_norm": 17.760276794433594, + "learning_rate": 5.338968811874547e-06, + "loss": 3.2392, + "step": 50610 + }, + { + "epoch": 1.4830440196293855, + "grad_norm": 18.250930786132812, + "learning_rate": 5.337390946203155e-06, + "loss": 3.2508, + "step": 50620 + }, + { + "epoch": 1.4833369955321176, + "grad_norm": 15.987285614013672, + "learning_rate": 5.3358130467778015e-06, + "loss": 3.2186, + "step": 50630 + }, + { + "epoch": 1.4836299714348495, + 
"grad_norm": 16.633502960205078, + "learning_rate": 5.334235113756346e-06, + "loss": 3.2143, + "step": 50640 + }, + { + "epoch": 1.4839229473375815, + "grad_norm": 18.35929298400879, + "learning_rate": 5.332657147296649e-06, + "loss": 3.2267, + "step": 50650 + }, + { + "epoch": 1.4842159232403134, + "grad_norm": 16.554889678955078, + "learning_rate": 5.331079147556578e-06, + "loss": 3.236, + "step": 50660 + }, + { + "epoch": 1.4845088991430455, + "grad_norm": 14.604026794433594, + "learning_rate": 5.329501114694004e-06, + "loss": 3.228, + "step": 50670 + }, + { + "epoch": 1.4848018750457774, + "grad_norm": 15.146485328674316, + "learning_rate": 5.327923048866797e-06, + "loss": 3.1967, + "step": 50680 + }, + { + "epoch": 1.485036255767963, + "eval_bleu": 0.33921718157684627, + "eval_cap_loss": 0.9395105242729187, + "eval_con_loss": 1.231065034866333, + "eval_loss": 3.4016406536102295, + "step": 50688 + }, + { + "epoch": 1.485036255767963, + "eval_bleu": 0.33921718157684627, + "eval_cap_loss": 0.9395105242729187, + "eval_con_loss": 1.231065034866333, + "eval_loss": 3.4016406536102295, + "eval_runtime": 55.7907, + "eval_samples_per_second": 358.483, + "eval_steps_per_second": 0.358, + "step": 50688 + }, + { + "epoch": 1.485109499743646, + "grad_norm": 8.415544509887695, + "learning_rate": 5.326344950232835e-06, + "loss": 3.2441, + "step": 50690 + }, + { + "epoch": 1.485402475646378, + "grad_norm": 8.368660926818848, + "learning_rate": 5.324766818949996e-06, + "loss": 3.2108, + "step": 50700 + }, + { + "epoch": 1.48569545154911, + "grad_norm": 8.313858032226562, + "learning_rate": 5.3231886551761635e-06, + "loss": 3.2124, + "step": 50710 + }, + { + "epoch": 1.485988427451842, + "grad_norm": 8.788347244262695, + "learning_rate": 5.321610459069225e-06, + "loss": 3.1814, + "step": 50720 + }, + { + "epoch": 1.4862814033545741, + "grad_norm": 8.221402168273926, + "learning_rate": 5.320032230787067e-06, + "loss": 3.1811, + "step": 50730 + }, + { + "epoch": 1.486574379257306, 
+ "grad_norm": 8.491487503051758, + "learning_rate": 5.318453970487582e-06, + "loss": 3.1915, + "step": 50740 + }, + { + "epoch": 1.486867355160038, + "grad_norm": 8.41349983215332, + "learning_rate": 5.316875678328669e-06, + "loss": 3.2079, + "step": 50750 + }, + { + "epoch": 1.4871603310627701, + "grad_norm": 8.302898406982422, + "learning_rate": 5.315297354468223e-06, + "loss": 3.1892, + "step": 50760 + }, + { + "epoch": 1.487453306965502, + "grad_norm": 9.065145492553711, + "learning_rate": 5.313718999064148e-06, + "loss": 3.2074, + "step": 50770 + }, + { + "epoch": 1.4877462828682342, + "grad_norm": 8.137097358703613, + "learning_rate": 5.312140612274347e-06, + "loss": 3.1719, + "step": 50780 + }, + { + "epoch": 1.488039258770966, + "grad_norm": 8.648456573486328, + "learning_rate": 5.310562194256731e-06, + "loss": 3.1764, + "step": 50790 + }, + { + "epoch": 1.488332234673698, + "grad_norm": 8.566890716552734, + "learning_rate": 5.308983745169209e-06, + "loss": 3.1678, + "step": 50800 + }, + { + "epoch": 1.4886252105764302, + "grad_norm": 8.393436431884766, + "learning_rate": 5.307405265169697e-06, + "loss": 3.171, + "step": 50810 + }, + { + "epoch": 1.488918186479162, + "grad_norm": 8.321046829223633, + "learning_rate": 5.305826754416111e-06, + "loss": 3.1585, + "step": 50820 + }, + { + "epoch": 1.4892111623818942, + "grad_norm": 8.157771110534668, + "learning_rate": 5.304248213066374e-06, + "loss": 3.1688, + "step": 50830 + }, + { + "epoch": 1.4895041382846261, + "grad_norm": 8.934114456176758, + "learning_rate": 5.302669641278408e-06, + "loss": 3.1963, + "step": 50840 + }, + { + "epoch": 1.489797114187358, + "grad_norm": 8.325257301330566, + "learning_rate": 5.30109103921014e-06, + "loss": 3.1714, + "step": 50850 + }, + { + "epoch": 1.49009009009009, + "grad_norm": 8.640647888183594, + "learning_rate": 5.299512407019499e-06, + "loss": 3.1797, + "step": 50860 + }, + { + "epoch": 1.490383065992822, + "grad_norm": 8.52275276184082, + "learning_rate": 
5.297933744864419e-06, + "loss": 3.1663, + "step": 50870 + }, + { + "epoch": 1.490676041895554, + "grad_norm": 8.383081436157227, + "learning_rate": 5.296355052902837e-06, + "loss": 3.1684, + "step": 50880 + }, + { + "epoch": 1.4909690177982862, + "grad_norm": 8.574543952941895, + "learning_rate": 5.294776331292689e-06, + "loss": 3.1782, + "step": 50890 + }, + { + "epoch": 1.491261993701018, + "grad_norm": 8.7151460647583, + "learning_rate": 5.293197580191917e-06, + "loss": 3.1836, + "step": 50900 + }, + { + "epoch": 1.49155496960375, + "grad_norm": 8.758657455444336, + "learning_rate": 5.291618799758468e-06, + "loss": 3.1773, + "step": 50910 + }, + { + "epoch": 1.4918479455064821, + "grad_norm": 8.065762519836426, + "learning_rate": 5.290039990150287e-06, + "loss": 3.1743, + "step": 50920 + }, + { + "epoch": 1.492140921409214, + "grad_norm": 9.01583194732666, + "learning_rate": 5.288461151525327e-06, + "loss": 3.1748, + "step": 50930 + }, + { + "epoch": 1.4924338973119462, + "grad_norm": 8.86422348022461, + "learning_rate": 5.286882284041539e-06, + "loss": 3.1854, + "step": 50940 + }, + { + "epoch": 1.4927268732146781, + "grad_norm": 8.77214527130127, + "learning_rate": 5.285303387856882e-06, + "loss": 3.1833, + "step": 50950 + }, + { + "epoch": 1.49301984911741, + "grad_norm": 8.107643127441406, + "learning_rate": 5.283724463129311e-06, + "loss": 3.1695, + "step": 50960 + }, + { + "epoch": 1.4933128250201422, + "grad_norm": 8.571802139282227, + "learning_rate": 5.282145510016793e-06, + "loss": 3.1754, + "step": 50970 + }, + { + "epoch": 1.493605800922874, + "grad_norm": 8.631721496582031, + "learning_rate": 5.280566528677289e-06, + "loss": 3.1621, + "step": 50980 + }, + { + "epoch": 1.493898776825606, + "grad_norm": 8.471765518188477, + "learning_rate": 5.27898751926877e-06, + "loss": 3.1635, + "step": 50990 + }, + { + "epoch": 1.4941917527283382, + "grad_norm": 8.372424125671387, + "learning_rate": 5.277408481949204e-06, + "loss": 3.203, + "step": 51000 + }, + { 
+ "epoch": 1.49448472863107, + "grad_norm": 8.98491382598877, + "learning_rate": 5.275829416876566e-06, + "loss": 3.1644, + "step": 51010 + }, + { + "epoch": 1.494777704533802, + "grad_norm": 8.78726577758789, + "learning_rate": 5.2742503242088295e-06, + "loss": 3.1761, + "step": 51020 + }, + { + "epoch": 1.4950706804365341, + "grad_norm": 9.033971786499023, + "learning_rate": 5.272671204103977e-06, + "loss": 3.1706, + "step": 51030 + }, + { + "epoch": 1.495363656339266, + "grad_norm": 8.851226806640625, + "learning_rate": 5.2710920567199875e-06, + "loss": 3.1949, + "step": 51040 + }, + { + "epoch": 1.4956566322419982, + "grad_norm": 8.421777725219727, + "learning_rate": 5.2695128822148466e-06, + "loss": 3.1652, + "step": 51050 + }, + { + "epoch": 1.49594960814473, + "grad_norm": 8.683267593383789, + "learning_rate": 5.2679336807465395e-06, + "loss": 3.167, + "step": 51060 + }, + { + "epoch": 1.496242584047462, + "grad_norm": 8.49428939819336, + "learning_rate": 5.266354452473058e-06, + "loss": 3.1864, + "step": 51070 + }, + { + "epoch": 1.4965355599501942, + "grad_norm": 8.566364288330078, + "learning_rate": 5.264775197552394e-06, + "loss": 3.1727, + "step": 51080 + }, + { + "epoch": 1.496828535852926, + "grad_norm": 9.127594947814941, + "learning_rate": 5.263195916142543e-06, + "loss": 3.1775, + "step": 51090 + }, + { + "epoch": 1.4971215117556582, + "grad_norm": 8.131237983703613, + "learning_rate": 5.261616608401501e-06, + "loss": 3.1457, + "step": 51100 + }, + { + "epoch": 1.4974144876583901, + "grad_norm": 8.394726753234863, + "learning_rate": 5.26003727448727e-06, + "loss": 3.1547, + "step": 51110 + }, + { + "epoch": 1.497707463561122, + "grad_norm": 8.440760612487793, + "learning_rate": 5.258457914557852e-06, + "loss": 3.1742, + "step": 51120 + }, + { + "epoch": 1.498000439463854, + "grad_norm": 8.351314544677734, + "learning_rate": 5.256878528771253e-06, + "loss": 3.1576, + "step": 51130 + }, + { + "epoch": 1.4982934153665861, + "grad_norm": 
8.005354881286621, + "learning_rate": 5.255299117285482e-06, + "loss": 3.1557, + "step": 51140 + }, + { + "epoch": 1.498586391269318, + "grad_norm": 8.03897476196289, + "learning_rate": 5.253719680258548e-06, + "loss": 3.1548, + "step": 51150 + }, + { + "epoch": 1.4988793671720502, + "grad_norm": 8.413506507873535, + "learning_rate": 5.252140217848466e-06, + "loss": 3.1736, + "step": 51160 + }, + { + "epoch": 1.499172343074782, + "grad_norm": 7.949096202850342, + "learning_rate": 5.250560730213251e-06, + "loss": 3.1608, + "step": 51170 + }, + { + "epoch": 1.499465318977514, + "grad_norm": 8.87627124786377, + "learning_rate": 5.24898121751092e-06, + "loss": 3.1583, + "step": 51180 + }, + { + "epoch": 1.4997582948802461, + "grad_norm": 9.263818740844727, + "learning_rate": 5.247401679899496e-06, + "loss": 3.171, + "step": 51190 + }, + { + "epoch": 1.500051270782978, + "grad_norm": 8.80300235748291, + "learning_rate": 5.245822117537004e-06, + "loss": 3.1642, + "step": 51200 + }, + { + "epoch": 1.500051270782978, + "eval_bleu": 0.343144011941446, + "eval_cap_loss": 0.9311057329177856, + "eval_con_loss": 1.204994797706604, + "eval_loss": 3.341095209121704, + "step": 51200 + }, + { + "epoch": 1.500051270782978, + "eval_bleu": 0.343144011941446, + "eval_cap_loss": 0.9311057329177856, + "eval_con_loss": 1.204994797706604, + "eval_loss": 3.341095209121704, + "eval_runtime": 52.0288, + "eval_samples_per_second": 384.403, + "eval_steps_per_second": 0.384, + "step": 51200 + }, + { + "epoch": 1.5003442466857102, + "grad_norm": 8.962583541870117, + "learning_rate": 5.244242530581466e-06, + "loss": 3.1799, + "step": 51210 + }, + { + "epoch": 1.5006372225884421, + "grad_norm": 8.126555442810059, + "learning_rate": 5.242662919190911e-06, + "loss": 3.1597, + "step": 51220 + }, + { + "epoch": 1.500930198491174, + "grad_norm": 8.259430885314941, + "learning_rate": 5.241083283523371e-06, + "loss": 3.1591, + "step": 51230 + }, + { + "epoch": 1.501223174393906, + "grad_norm": 
8.72915267944336, + "learning_rate": 5.239503623736879e-06, + "loss": 3.1764, + "step": 51240 + }, + { + "epoch": 1.501516150296638, + "grad_norm": 8.366486549377441, + "learning_rate": 5.237923939989471e-06, + "loss": 3.1624, + "step": 51250 + }, + { + "epoch": 1.5018091261993702, + "grad_norm": 8.372663497924805, + "learning_rate": 5.236344232439182e-06, + "loss": 3.1503, + "step": 51260 + }, + { + "epoch": 1.5021021021021022, + "grad_norm": 8.36845588684082, + "learning_rate": 5.2347645012440574e-06, + "loss": 3.1696, + "step": 51270 + }, + { + "epoch": 1.502395078004834, + "grad_norm": 8.385665893554688, + "learning_rate": 5.2331847465621365e-06, + "loss": 3.1415, + "step": 51280 + }, + { + "epoch": 1.502688053907566, + "grad_norm": 8.666770935058594, + "learning_rate": 5.231604968551466e-06, + "loss": 3.176, + "step": 51290 + }, + { + "epoch": 1.5029810298102981, + "grad_norm": 8.856642723083496, + "learning_rate": 5.23002516737009e-06, + "loss": 3.1566, + "step": 51300 + }, + { + "epoch": 1.50327400571303, + "grad_norm": 8.299357414245605, + "learning_rate": 5.228445343176063e-06, + "loss": 3.1535, + "step": 51310 + }, + { + "epoch": 1.5035669816157622, + "grad_norm": 9.027207374572754, + "learning_rate": 5.226865496127432e-06, + "loss": 3.1649, + "step": 51320 + }, + { + "epoch": 1.503859957518494, + "grad_norm": 8.50649642944336, + "learning_rate": 5.225285626382257e-06, + "loss": 3.1515, + "step": 51330 + }, + { + "epoch": 1.504152933421226, + "grad_norm": 7.993706703186035, + "learning_rate": 5.223705734098591e-06, + "loss": 3.1491, + "step": 51340 + }, + { + "epoch": 1.504445909323958, + "grad_norm": 8.441570281982422, + "learning_rate": 5.222125819434492e-06, + "loss": 3.1869, + "step": 51350 + }, + { + "epoch": 1.50473888522669, + "grad_norm": 8.774001121520996, + "learning_rate": 5.220545882548024e-06, + "loss": 3.1694, + "step": 51360 + }, + { + "epoch": 1.5050318611294222, + "grad_norm": 8.727777481079102, + "learning_rate": 5.218965923597249e-06, + 
"loss": 3.1558, + "step": 51370 + }, + { + "epoch": 1.5053248370321541, + "grad_norm": 8.88333511352539, + "learning_rate": 5.217385942740232e-06, + "loss": 3.1418, + "step": 51380 + }, + { + "epoch": 1.505617812934886, + "grad_norm": 8.753259658813477, + "learning_rate": 5.21580594013504e-06, + "loss": 3.1623, + "step": 51390 + }, + { + "epoch": 1.505910788837618, + "grad_norm": 8.280718803405762, + "learning_rate": 5.214225915939746e-06, + "loss": 3.1617, + "step": 51400 + }, + { + "epoch": 1.5062037647403501, + "grad_norm": 8.465285301208496, + "learning_rate": 5.212645870312419e-06, + "loss": 3.1448, + "step": 51410 + }, + { + "epoch": 1.5064967406430823, + "grad_norm": 8.657845497131348, + "learning_rate": 5.211065803411135e-06, + "loss": 3.1599, + "step": 51420 + }, + { + "epoch": 1.5067897165458142, + "grad_norm": 8.191426277160645, + "learning_rate": 5.209485715393968e-06, + "loss": 3.1851, + "step": 51430 + }, + { + "epoch": 1.507082692448546, + "grad_norm": 7.674312114715576, + "learning_rate": 5.207905606419e-06, + "loss": 3.1612, + "step": 51440 + }, + { + "epoch": 1.507375668351278, + "grad_norm": 8.865761756896973, + "learning_rate": 5.2063254766443084e-06, + "loss": 3.1627, + "step": 51450 + }, + { + "epoch": 1.5076686442540101, + "grad_norm": 8.28426456451416, + "learning_rate": 5.204745326227978e-06, + "loss": 3.1511, + "step": 51460 + }, + { + "epoch": 1.507961620156742, + "grad_norm": 7.9443511962890625, + "learning_rate": 5.20316515532809e-06, + "loss": 3.1391, + "step": 51470 + }, + { + "epoch": 1.5082545960594742, + "grad_norm": 9.252354621887207, + "learning_rate": 5.201584964102736e-06, + "loss": 3.1707, + "step": 51480 + }, + { + "epoch": 1.5085475719622061, + "grad_norm": 8.72398567199707, + "learning_rate": 5.200004752710002e-06, + "loss": 3.1679, + "step": 51490 + }, + { + "epoch": 1.508840547864938, + "grad_norm": 9.1123046875, + "learning_rate": 5.198424521307979e-06, + "loss": 3.1696, + "step": 51500 + }, + { + "epoch": 
1.50913352376767, + "grad_norm": 8.858030319213867, + "learning_rate": 5.196844270054758e-06, + "loss": 3.1629, + "step": 51510 + }, + { + "epoch": 1.509426499670402, + "grad_norm": 9.246156692504883, + "learning_rate": 5.195263999108438e-06, + "loss": 3.1695, + "step": 51520 + }, + { + "epoch": 1.5097194755731342, + "grad_norm": 8.601280212402344, + "learning_rate": 5.1936837086271116e-06, + "loss": 3.1589, + "step": 51530 + }, + { + "epoch": 1.5100124514758662, + "grad_norm": 8.30382251739502, + "learning_rate": 5.192103398768881e-06, + "loss": 3.1701, + "step": 51540 + }, + { + "epoch": 1.510305427378598, + "grad_norm": 8.211824417114258, + "learning_rate": 5.190523069691843e-06, + "loss": 3.1768, + "step": 51550 + }, + { + "epoch": 1.51059840328133, + "grad_norm": 9.411044120788574, + "learning_rate": 5.188942721554103e-06, + "loss": 3.1413, + "step": 51560 + }, + { + "epoch": 1.5108913791840621, + "grad_norm": 8.656189918518066, + "learning_rate": 5.187362354513766e-06, + "loss": 3.1674, + "step": 51570 + }, + { + "epoch": 1.5111843550867943, + "grad_norm": 8.616340637207031, + "learning_rate": 5.185781968728936e-06, + "loss": 3.1705, + "step": 51580 + }, + { + "epoch": 1.5114773309895262, + "grad_norm": 8.598649024963379, + "learning_rate": 5.184201564357721e-06, + "loss": 3.1605, + "step": 51590 + }, + { + "epoch": 1.511770306892258, + "grad_norm": 8.633376121520996, + "learning_rate": 5.182621141558234e-06, + "loss": 3.15, + "step": 51600 + }, + { + "epoch": 1.51206328279499, + "grad_norm": 8.758502006530762, + "learning_rate": 5.181040700488585e-06, + "loss": 3.1517, + "step": 51610 + }, + { + "epoch": 1.5123562586977222, + "grad_norm": 8.571746826171875, + "learning_rate": 5.179460241306889e-06, + "loss": 3.1546, + "step": 51620 + }, + { + "epoch": 1.512649234600454, + "grad_norm": 8.795798301696777, + "learning_rate": 5.177879764171257e-06, + "loss": 3.1522, + "step": 51630 + }, + { + "epoch": 1.5129422105031862, + "grad_norm": 8.405508041381836, + 
"learning_rate": 5.176299269239814e-06, + "loss": 3.1399, + "step": 51640 + }, + { + "epoch": 1.5132351864059181, + "grad_norm": 8.311934471130371, + "learning_rate": 5.1747187566706735e-06, + "loss": 3.165, + "step": 51650 + }, + { + "epoch": 1.51352816230865, + "grad_norm": 9.228873252868652, + "learning_rate": 5.173138226621958e-06, + "loss": 3.1586, + "step": 51660 + }, + { + "epoch": 1.513821138211382, + "grad_norm": 8.888557434082031, + "learning_rate": 5.171557679251788e-06, + "loss": 3.1717, + "step": 51670 + }, + { + "epoch": 1.5141141141141141, + "grad_norm": 8.390716552734375, + "learning_rate": 5.169977114718292e-06, + "loss": 3.1603, + "step": 51680 + }, + { + "epoch": 1.5144070900168463, + "grad_norm": 8.7487211227417, + "learning_rate": 5.168396533179593e-06, + "loss": 3.1634, + "step": 51690 + }, + { + "epoch": 1.5147000659195782, + "grad_norm": 8.517693519592285, + "learning_rate": 5.166815934793822e-06, + "loss": 3.1428, + "step": 51700 + }, + { + "epoch": 1.51499304182231, + "grad_norm": 8.817834854125977, + "learning_rate": 5.165235319719106e-06, + "loss": 3.1397, + "step": 51710 + }, + { + "epoch": 1.5150516370028564, + "eval_bleu": 0.34363256786246393, + "eval_cap_loss": 0.9289023280143738, + "eval_con_loss": 1.1994819641113281, + "eval_loss": 3.3278660774230957, + "step": 51712 + }, + { + "epoch": 1.5150516370028564, + "eval_bleu": 0.34363256786246393, + "eval_cap_loss": 0.9289023280143738, + "eval_con_loss": 1.1994819641113281, + "eval_loss": 3.3278660774230957, + "eval_runtime": 50.9929, + "eval_samples_per_second": 392.211, + "eval_steps_per_second": 0.392, + "step": 51712 + }, + { + "epoch": 1.515286017725042, + "grad_norm": 8.245842933654785, + "learning_rate": 5.1636546881135754e-06, + "loss": 3.1347, + "step": 51720 + }, + { + "epoch": 1.5155789936277742, + "grad_norm": 9.126054763793945, + "learning_rate": 5.162074040135364e-06, + "loss": 3.1656, + "step": 51730 + }, + { + "epoch": 1.515871969530506, + "grad_norm": 8.243552207946777, 
+ "learning_rate": 5.160493375942608e-06, + "loss": 3.1748, + "step": 51740 + }, + { + "epoch": 1.5161649454332382, + "grad_norm": 8.808847427368164, + "learning_rate": 5.158912695693441e-06, + "loss": 3.1528, + "step": 51750 + }, + { + "epoch": 1.5164579213359701, + "grad_norm": 8.545345306396484, + "learning_rate": 5.157331999546e-06, + "loss": 3.153, + "step": 51760 + }, + { + "epoch": 1.516750897238702, + "grad_norm": 8.486441612243652, + "learning_rate": 5.155751287658428e-06, + "loss": 3.1445, + "step": 51770 + }, + { + "epoch": 1.517043873141434, + "grad_norm": 8.672775268554688, + "learning_rate": 5.1541705601888634e-06, + "loss": 3.1731, + "step": 51780 + }, + { + "epoch": 1.517336849044166, + "grad_norm": 8.314386367797852, + "learning_rate": 5.1525898172954496e-06, + "loss": 3.1386, + "step": 51790 + }, + { + "epoch": 1.5176298249468982, + "grad_norm": 8.530346870422363, + "learning_rate": 5.1510090591363285e-06, + "loss": 3.1609, + "step": 51800 + }, + { + "epoch": 1.5179228008496302, + "grad_norm": 8.079116821289062, + "learning_rate": 5.149428285869647e-06, + "loss": 3.1296, + "step": 51810 + }, + { + "epoch": 1.518215776752362, + "grad_norm": 8.18740177154541, + "learning_rate": 5.147847497653553e-06, + "loss": 3.146, + "step": 51820 + }, + { + "epoch": 1.518508752655094, + "grad_norm": 9.312529563903809, + "learning_rate": 5.146266694646194e-06, + "loss": 3.1508, + "step": 51830 + }, + { + "epoch": 1.5188017285578261, + "grad_norm": 8.314557075500488, + "learning_rate": 5.144685877005719e-06, + "loss": 3.1373, + "step": 51840 + }, + { + "epoch": 1.5190947044605583, + "grad_norm": 8.486719131469727, + "learning_rate": 5.143105044890281e-06, + "loss": 3.1527, + "step": 51850 + }, + { + "epoch": 1.5193876803632902, + "grad_norm": 8.21899127960205, + "learning_rate": 5.141524198458032e-06, + "loss": 3.1575, + "step": 51860 + }, + { + "epoch": 1.5196806562660221, + "grad_norm": 8.579495429992676, + "learning_rate": 5.139943337867127e-06, + "loss": 
3.1453, + "step": 51870 + }, + { + "epoch": 1.519973632168754, + "grad_norm": 8.538172721862793, + "learning_rate": 5.13836246327572e-06, + "loss": 3.1371, + "step": 51880 + }, + { + "epoch": 1.5202666080714862, + "grad_norm": 9.172334671020508, + "learning_rate": 5.13678157484197e-06, + "loss": 3.1682, + "step": 51890 + }, + { + "epoch": 1.520559583974218, + "grad_norm": 8.843663215637207, + "learning_rate": 5.1352006727240364e-06, + "loss": 3.1574, + "step": 51900 + }, + { + "epoch": 1.5208525598769502, + "grad_norm": 8.520994186401367, + "learning_rate": 5.1336197570800775e-06, + "loss": 3.1604, + "step": 51910 + }, + { + "epoch": 1.5211455357796821, + "grad_norm": 9.039958000183105, + "learning_rate": 5.132038828068252e-06, + "loss": 3.1562, + "step": 51920 + }, + { + "epoch": 1.521438511682414, + "grad_norm": 8.635611534118652, + "learning_rate": 5.130457885846727e-06, + "loss": 3.1424, + "step": 51930 + }, + { + "epoch": 1.521731487585146, + "grad_norm": 8.60450267791748, + "learning_rate": 5.128876930573665e-06, + "loss": 3.1408, + "step": 51940 + }, + { + "epoch": 1.5220244634878781, + "grad_norm": 8.449539184570312, + "learning_rate": 5.12729596240723e-06, + "loss": 3.1467, + "step": 51950 + }, + { + "epoch": 1.5223174393906103, + "grad_norm": 9.111474990844727, + "learning_rate": 5.125714981505589e-06, + "loss": 3.1415, + "step": 51960 + }, + { + "epoch": 1.5226104152933422, + "grad_norm": 9.152364730834961, + "learning_rate": 5.124133988026909e-06, + "loss": 3.1466, + "step": 51970 + }, + { + "epoch": 1.522903391196074, + "grad_norm": 8.263641357421875, + "learning_rate": 5.122552982129362e-06, + "loss": 3.1445, + "step": 51980 + }, + { + "epoch": 1.523196367098806, + "grad_norm": 8.702726364135742, + "learning_rate": 5.1209719639711155e-06, + "loss": 3.1551, + "step": 51990 + }, + { + "epoch": 1.5234893430015382, + "grad_norm": 8.297789573669434, + "learning_rate": 5.119390933710341e-06, + "loss": 3.1363, + "step": 52000 + }, + { + "epoch": 
1.52378231890427, + "grad_norm": 8.523819923400879, + "learning_rate": 5.117809891505212e-06, + "loss": 3.1602, + "step": 52010 + }, + { + "epoch": 1.5240752948070022, + "grad_norm": 8.850852012634277, + "learning_rate": 5.116228837513904e-06, + "loss": 3.161, + "step": 52020 + }, + { + "epoch": 1.5243682707097341, + "grad_norm": 9.088247299194336, + "learning_rate": 5.114647771894589e-06, + "loss": 3.1542, + "step": 52030 + }, + { + "epoch": 1.524661246612466, + "grad_norm": 8.670654296875, + "learning_rate": 5.113066694805444e-06, + "loss": 3.1404, + "step": 52040 + }, + { + "epoch": 1.524954222515198, + "grad_norm": 8.610963821411133, + "learning_rate": 5.111485606404647e-06, + "loss": 3.1363, + "step": 52050 + }, + { + "epoch": 1.52524719841793, + "grad_norm": 8.180251121520996, + "learning_rate": 5.109904506850377e-06, + "loss": 3.1483, + "step": 52060 + }, + { + "epoch": 1.5255401743206622, + "grad_norm": 8.560254096984863, + "learning_rate": 5.108323396300814e-06, + "loss": 3.131, + "step": 52070 + }, + { + "epoch": 1.5258331502233942, + "grad_norm": 8.017560005187988, + "learning_rate": 5.106742274914137e-06, + "loss": 3.1556, + "step": 52080 + }, + { + "epoch": 1.526126126126126, + "grad_norm": 8.402871131896973, + "learning_rate": 5.1051611428485285e-06, + "loss": 3.1518, + "step": 52090 + }, + { + "epoch": 1.526419102028858, + "grad_norm": 8.732322692871094, + "learning_rate": 5.103580000262172e-06, + "loss": 3.1566, + "step": 52100 + }, + { + "epoch": 1.5267120779315901, + "grad_norm": 8.78670883178711, + "learning_rate": 5.101998847313251e-06, + "loss": 3.1465, + "step": 52110 + }, + { + "epoch": 1.5270050538343223, + "grad_norm": 8.677411079406738, + "learning_rate": 5.100417684159951e-06, + "loss": 3.1526, + "step": 52120 + }, + { + "epoch": 1.5272980297370542, + "grad_norm": 8.522546768188477, + "learning_rate": 5.098836510960456e-06, + "loss": 3.1685, + "step": 52130 + }, + { + "epoch": 1.5275910056397861, + "grad_norm": 8.726868629455566, + 
"learning_rate": 5.097255327872955e-06, + "loss": 3.1287, + "step": 52140 + }, + { + "epoch": 1.527883981542518, + "grad_norm": 8.805760383605957, + "learning_rate": 5.095674135055636e-06, + "loss": 3.1668, + "step": 52150 + }, + { + "epoch": 1.5281769574452502, + "grad_norm": 7.955661296844482, + "learning_rate": 5.094092932666688e-06, + "loss": 3.1318, + "step": 52160 + }, + { + "epoch": 1.528469933347982, + "grad_norm": 8.77187728881836, + "learning_rate": 5.092511720864298e-06, + "loss": 3.1689, + "step": 52170 + }, + { + "epoch": 1.5287629092507142, + "grad_norm": 8.962908744812012, + "learning_rate": 5.090930499806661e-06, + "loss": 3.1564, + "step": 52180 + }, + { + "epoch": 1.5290558851534461, + "grad_norm": 8.00936222076416, + "learning_rate": 5.089349269651967e-06, + "loss": 3.1328, + "step": 52190 + }, + { + "epoch": 1.529348861056178, + "grad_norm": 8.817808151245117, + "learning_rate": 5.087768030558409e-06, + "loss": 3.1298, + "step": 52200 + }, + { + "epoch": 1.52964183695891, + "grad_norm": 8.624449729919434, + "learning_rate": 5.086186782684178e-06, + "loss": 3.1366, + "step": 52210 + }, + { + "epoch": 1.5299348128616421, + "grad_norm": 8.55356216430664, + "learning_rate": 5.0846055261874724e-06, + "loss": 3.1691, + "step": 52220 + }, + { + "epoch": 1.530052003222735, + "eval_bleu": 0.34436980056874317, + "eval_cap_loss": 0.9271775484085083, + "eval_con_loss": 1.194507360458374, + "eval_loss": 3.316192626953125, + "step": 52224 + }, + { + "epoch": 1.530052003222735, + "eval_bleu": 0.34436980056874317, + "eval_cap_loss": 0.9271775484085083, + "eval_con_loss": 1.194507360458374, + "eval_loss": 3.316192626953125, + "eval_runtime": 51.8233, + "eval_samples_per_second": 385.927, + "eval_steps_per_second": 0.386, + "step": 52224 + }, + { + "epoch": 1.5302277887643743, + "grad_norm": 9.149628639221191, + "learning_rate": 5.083024261226486e-06, + "loss": 3.1435, + "step": 52230 + }, + { + "epoch": 1.5305207646671062, + "grad_norm": 9.055959701538086, + 
"learning_rate": 5.081442987959415e-06, + "loss": 3.1234, + "step": 52240 + }, + { + "epoch": 1.530813740569838, + "grad_norm": 8.225736618041992, + "learning_rate": 5.079861706544454e-06, + "loss": 3.1708, + "step": 52250 + }, + { + "epoch": 1.53110671647257, + "grad_norm": 8.899113655090332, + "learning_rate": 5.078280417139806e-06, + "loss": 3.1359, + "step": 52260 + }, + { + "epoch": 1.5313996923753022, + "grad_norm": 8.581658363342285, + "learning_rate": 5.0766991199036655e-06, + "loss": 3.1323, + "step": 52270 + }, + { + "epoch": 1.5316926682780343, + "grad_norm": 9.151034355163574, + "learning_rate": 5.075117814994234e-06, + "loss": 3.1556, + "step": 52280 + }, + { + "epoch": 1.5319856441807662, + "grad_norm": 9.301283836364746, + "learning_rate": 5.073536502569708e-06, + "loss": 3.1504, + "step": 52290 + }, + { + "epoch": 1.5322786200834981, + "grad_norm": 8.476110458374023, + "learning_rate": 5.071955182788294e-06, + "loss": 3.1327, + "step": 52300 + }, + { + "epoch": 1.53257159598623, + "grad_norm": 8.198807716369629, + "learning_rate": 5.070373855808189e-06, + "loss": 3.1379, + "step": 52310 + }, + { + "epoch": 1.532864571888962, + "grad_norm": 9.101912498474121, + "learning_rate": 5.068792521787599e-06, + "loss": 3.138, + "step": 52320 + }, + { + "epoch": 1.533157547791694, + "grad_norm": 9.076054573059082, + "learning_rate": 5.067211180884723e-06, + "loss": 3.1385, + "step": 52330 + }, + { + "epoch": 1.5334505236944262, + "grad_norm": 8.718988418579102, + "learning_rate": 5.06562983325777e-06, + "loss": 3.146, + "step": 52340 + }, + { + "epoch": 1.5337434995971582, + "grad_norm": 8.250903129577637, + "learning_rate": 5.06404847906494e-06, + "loss": 3.1457, + "step": 52350 + }, + { + "epoch": 1.53403647549989, + "grad_norm": 8.724946022033691, + "learning_rate": 5.062467118464441e-06, + "loss": 3.1586, + "step": 52360 + }, + { + "epoch": 1.534329451402622, + "grad_norm": 8.416396141052246, + "learning_rate": 5.060885751614475e-06, + "loss": 3.1734, + 
"step": 52370 + }, + { + "epoch": 1.5346224273053541, + "grad_norm": 8.567972183227539, + "learning_rate": 5.059304378673252e-06, + "loss": 3.1535, + "step": 52380 + }, + { + "epoch": 1.5349154032080863, + "grad_norm": 8.963924407958984, + "learning_rate": 5.057722999798977e-06, + "loss": 3.127, + "step": 52390 + }, + { + "epoch": 1.5352083791108182, + "grad_norm": 8.335789680480957, + "learning_rate": 5.056141615149858e-06, + "loss": 3.1366, + "step": 52400 + }, + { + "epoch": 1.5355013550135501, + "grad_norm": 9.127753257751465, + "learning_rate": 5.0545602248841054e-06, + "loss": 3.1352, + "step": 52410 + }, + { + "epoch": 1.535794330916282, + "grad_norm": 9.238720893859863, + "learning_rate": 5.052978829159925e-06, + "loss": 3.1528, + "step": 52420 + }, + { + "epoch": 1.5360873068190142, + "grad_norm": 8.673076629638672, + "learning_rate": 5.051397428135527e-06, + "loss": 3.1469, + "step": 52430 + }, + { + "epoch": 1.536380282721746, + "grad_norm": 8.44206714630127, + "learning_rate": 5.049816021969122e-06, + "loss": 3.1491, + "step": 52440 + }, + { + "epoch": 1.5366732586244782, + "grad_norm": 8.544922828674316, + "learning_rate": 5.048234610818919e-06, + "loss": 3.1376, + "step": 52450 + }, + { + "epoch": 1.5369662345272102, + "grad_norm": 8.648707389831543, + "learning_rate": 5.046653194843129e-06, + "loss": 3.1534, + "step": 52460 + }, + { + "epoch": 1.537259210429942, + "grad_norm": 8.40310001373291, + "learning_rate": 5.045071774199963e-06, + "loss": 3.1439, + "step": 52470 + }, + { + "epoch": 1.537552186332674, + "grad_norm": 8.907357215881348, + "learning_rate": 5.043490349047635e-06, + "loss": 3.1357, + "step": 52480 + }, + { + "epoch": 1.5378451622354061, + "grad_norm": 8.576940536499023, + "learning_rate": 5.041908919544356e-06, + "loss": 3.1462, + "step": 52490 + }, + { + "epoch": 1.5381381381381383, + "grad_norm": 8.455103874206543, + "learning_rate": 5.040327485848336e-06, + "loss": 3.149, + "step": 52500 + }, + { + "epoch": 1.5384311140408702, + 
"grad_norm": 8.787654876708984, + "learning_rate": 5.038746048117792e-06, + "loss": 3.1403, + "step": 52510 + }, + { + "epoch": 1.538724089943602, + "grad_norm": 9.253552436828613, + "learning_rate": 5.037164606510935e-06, + "loss": 3.1688, + "step": 52520 + }, + { + "epoch": 1.539017065846334, + "grad_norm": 8.291232109069824, + "learning_rate": 5.03558316118598e-06, + "loss": 3.1324, + "step": 52530 + }, + { + "epoch": 1.5393100417490662, + "grad_norm": 8.741731643676758, + "learning_rate": 5.0340017123011396e-06, + "loss": 3.1317, + "step": 52540 + }, + { + "epoch": 1.5396030176517983, + "grad_norm": 9.061458587646484, + "learning_rate": 5.032420260014632e-06, + "loss": 3.137, + "step": 52550 + }, + { + "epoch": 1.5398959935545302, + "grad_norm": 8.961209297180176, + "learning_rate": 5.030838804484668e-06, + "loss": 3.1597, + "step": 52560 + }, + { + "epoch": 1.5401889694572621, + "grad_norm": 8.349135398864746, + "learning_rate": 5.029257345869466e-06, + "loss": 3.1389, + "step": 52570 + }, + { + "epoch": 1.540481945359994, + "grad_norm": 8.638431549072266, + "learning_rate": 5.027675884327238e-06, + "loss": 3.1399, + "step": 52580 + }, + { + "epoch": 1.5407749212627262, + "grad_norm": 8.463245391845703, + "learning_rate": 5.0260944200162034e-06, + "loss": 3.1607, + "step": 52590 + }, + { + "epoch": 1.5410678971654581, + "grad_norm": 9.045578956604004, + "learning_rate": 5.024512953094577e-06, + "loss": 3.1496, + "step": 52600 + }, + { + "epoch": 1.5413608730681903, + "grad_norm": 8.613021850585938, + "learning_rate": 5.022931483720574e-06, + "loss": 3.1257, + "step": 52610 + }, + { + "epoch": 1.5416538489709222, + "grad_norm": 8.69627857208252, + "learning_rate": 5.021350012052413e-06, + "loss": 3.1194, + "step": 52620 + }, + { + "epoch": 1.541946824873654, + "grad_norm": 9.103736877441406, + "learning_rate": 5.01976853824831e-06, + "loss": 3.156, + "step": 52630 + }, + { + "epoch": 1.542239800776386, + "grad_norm": 8.247775077819824, + "learning_rate": 
5.018187062466482e-06, + "loss": 3.1326, + "step": 52640 + }, + { + "epoch": 1.5425327766791181, + "grad_norm": 8.589775085449219, + "learning_rate": 5.016605584865146e-06, + "loss": 3.1676, + "step": 52650 + }, + { + "epoch": 1.5428257525818503, + "grad_norm": 8.163162231445312, + "learning_rate": 5.015024105602518e-06, + "loss": 3.126, + "step": 52660 + }, + { + "epoch": 1.5431187284845822, + "grad_norm": 8.839664459228516, + "learning_rate": 5.013442624836821e-06, + "loss": 3.1304, + "step": 52670 + }, + { + "epoch": 1.5434117043873141, + "grad_norm": 8.555083274841309, + "learning_rate": 5.0118611427262665e-06, + "loss": 3.133, + "step": 52680 + }, + { + "epoch": 1.543704680290046, + "grad_norm": 8.902806282043457, + "learning_rate": 5.010279659429076e-06, + "loss": 3.1266, + "step": 52690 + }, + { + "epoch": 1.5439976561927782, + "grad_norm": 9.16134262084961, + "learning_rate": 5.008698175103466e-06, + "loss": 3.1587, + "step": 52700 + }, + { + "epoch": 1.54429063209551, + "grad_norm": 8.348450660705566, + "learning_rate": 5.0071166899076565e-06, + "loss": 3.1455, + "step": 52710 + }, + { + "epoch": 1.5445836079982422, + "grad_norm": 8.488837242126465, + "learning_rate": 5.005535203999863e-06, + "loss": 3.1394, + "step": 52720 + }, + { + "epoch": 1.5448765839009742, + "grad_norm": 8.696962356567383, + "learning_rate": 5.003953717538306e-06, + "loss": 3.1338, + "step": 52730 + }, + { + "epoch": 1.5450523694426135, + "eval_bleu": 0.3452042130314188, + "eval_cap_loss": 0.9253991842269897, + "eval_con_loss": 1.191490888595581, + "eval_loss": 3.3083810806274414, + "step": 52736 + }, + { + "epoch": 1.5450523694426135, + "eval_bleu": 0.3452042130314188, + "eval_cap_loss": 0.9253991842269897, + "eval_con_loss": 1.191490888595581, + "eval_loss": 3.3083810806274414, + "eval_runtime": 52.2106, + "eval_samples_per_second": 383.064, + "eval_steps_per_second": 0.383, + "step": 52736 + }, + { + "epoch": 1.545169559803706, + "grad_norm": 8.996966361999512, + "learning_rate": 
5.002372230681202e-06, + "loss": 3.1353, + "step": 52740 + }, + { + "epoch": 1.545462535706438, + "grad_norm": 9.460991859436035, + "learning_rate": 5.000790743586771e-06, + "loss": 3.1526, + "step": 52750 + }, + { + "epoch": 1.5457555116091701, + "grad_norm": 8.428253173828125, + "learning_rate": 4.99920925641323e-06, + "loss": 3.1489, + "step": 52760 + }, + { + "epoch": 1.5460484875119023, + "grad_norm": 8.566868782043457, + "learning_rate": 4.997627769318801e-06, + "loss": 3.1396, + "step": 52770 + }, + { + "epoch": 1.5463414634146342, + "grad_norm": 8.697120666503906, + "learning_rate": 4.996046282461696e-06, + "loss": 3.1393, + "step": 52780 + }, + { + "epoch": 1.546634439317366, + "grad_norm": 9.219966888427734, + "learning_rate": 4.99446479600014e-06, + "loss": 3.1555, + "step": 52790 + }, + { + "epoch": 1.546927415220098, + "grad_norm": 8.700045585632324, + "learning_rate": 4.992883310092346e-06, + "loss": 3.1429, + "step": 52800 + }, + { + "epoch": 1.5472203911228302, + "grad_norm": 8.881845474243164, + "learning_rate": 4.991301824896537e-06, + "loss": 3.1326, + "step": 52810 + }, + { + "epoch": 1.5475133670255623, + "grad_norm": 8.960335731506348, + "learning_rate": 4.989720340570925e-06, + "loss": 3.1203, + "step": 52820 + }, + { + "epoch": 1.5478063429282942, + "grad_norm": 8.947568893432617, + "learning_rate": 4.988138857273736e-06, + "loss": 3.1519, + "step": 52830 + }, + { + "epoch": 1.5480993188310261, + "grad_norm": 9.077143669128418, + "learning_rate": 4.986557375163182e-06, + "loss": 3.1283, + "step": 52840 + }, + { + "epoch": 1.548392294733758, + "grad_norm": 8.680407524108887, + "learning_rate": 4.9849758943974835e-06, + "loss": 3.1465, + "step": 52850 + }, + { + "epoch": 1.5486852706364902, + "grad_norm": 8.61997127532959, + "learning_rate": 4.9833944151348555e-06, + "loss": 3.1441, + "step": 52860 + }, + { + "epoch": 1.5489782465392221, + "grad_norm": 8.608501434326172, + "learning_rate": 4.981812937533521e-06, + "loss": 3.1229, + "step": 
52870 + }, + { + "epoch": 1.5492712224419543, + "grad_norm": 8.763042449951172, + "learning_rate": 4.980231461751692e-06, + "loss": 3.1571, + "step": 52880 + }, + { + "epoch": 1.5495641983446862, + "grad_norm": 8.299022674560547, + "learning_rate": 4.97864998794759e-06, + "loss": 3.1398, + "step": 52890 + }, + { + "epoch": 1.549857174247418, + "grad_norm": 9.28212833404541, + "learning_rate": 4.977068516279427e-06, + "loss": 3.1553, + "step": 52900 + }, + { + "epoch": 1.55015015015015, + "grad_norm": 8.193346977233887, + "learning_rate": 4.975487046905426e-06, + "loss": 3.1434, + "step": 52910 + }, + { + "epoch": 1.5504431260528821, + "grad_norm": 8.48239517211914, + "learning_rate": 4.973905579983798e-06, + "loss": 3.1395, + "step": 52920 + }, + { + "epoch": 1.5507361019556143, + "grad_norm": 9.234841346740723, + "learning_rate": 4.9723241156727645e-06, + "loss": 3.1392, + "step": 52930 + }, + { + "epoch": 1.5510290778583462, + "grad_norm": 8.49924373626709, + "learning_rate": 4.970742654130536e-06, + "loss": 3.1358, + "step": 52940 + }, + { + "epoch": 1.5513220537610781, + "grad_norm": 9.255303382873535, + "learning_rate": 4.969161195515332e-06, + "loss": 3.132, + "step": 52950 + }, + { + "epoch": 1.55161502966381, + "grad_norm": 8.466821670532227, + "learning_rate": 4.9675797399853695e-06, + "loss": 3.1456, + "step": 52960 + }, + { + "epoch": 1.5519080055665422, + "grad_norm": 8.632437705993652, + "learning_rate": 4.9659982876988596e-06, + "loss": 3.1297, + "step": 52970 + }, + { + "epoch": 1.552200981469274, + "grad_norm": 8.507126808166504, + "learning_rate": 4.964416838814021e-06, + "loss": 3.1352, + "step": 52980 + }, + { + "epoch": 1.5524939573720062, + "grad_norm": 8.573017120361328, + "learning_rate": 4.962835393489065e-06, + "loss": 3.1548, + "step": 52990 + }, + { + "epoch": 1.5527869332747382, + "grad_norm": 8.868476867675781, + "learning_rate": 4.961253951882209e-06, + "loss": 3.1229, + "step": 53000 + }, + { + "epoch": 1.55307990917747, + 
"grad_norm": 9.047771453857422, + "learning_rate": 4.959672514151665e-06, + "loss": 3.1161, + "step": 53010 + }, + { + "epoch": 1.553372885080202, + "grad_norm": 8.491437911987305, + "learning_rate": 4.958091080455646e-06, + "loss": 3.1423, + "step": 53020 + }, + { + "epoch": 1.5536658609829341, + "grad_norm": 8.679669380187988, + "learning_rate": 4.956509650952365e-06, + "loss": 3.1488, + "step": 53030 + }, + { + "epoch": 1.5539588368856663, + "grad_norm": 8.788909912109375, + "learning_rate": 4.954928225800038e-06, + "loss": 3.1239, + "step": 53040 + }, + { + "epoch": 1.5542518127883982, + "grad_norm": 8.765093803405762, + "learning_rate": 4.953346805156872e-06, + "loss": 3.129, + "step": 53050 + }, + { + "epoch": 1.55454478869113, + "grad_norm": 8.706318855285645, + "learning_rate": 4.951765389181082e-06, + "loss": 3.1593, + "step": 53060 + }, + { + "epoch": 1.554837764593862, + "grad_norm": 8.883589744567871, + "learning_rate": 4.950183978030878e-06, + "loss": 3.147, + "step": 53070 + }, + { + "epoch": 1.5551307404965942, + "grad_norm": 9.107340812683105, + "learning_rate": 4.948602571864474e-06, + "loss": 3.1394, + "step": 53080 + }, + { + "epoch": 1.5554237163993263, + "grad_norm": 8.658973693847656, + "learning_rate": 4.947021170840076e-06, + "loss": 3.134, + "step": 53090 + }, + { + "epoch": 1.5557166923020582, + "grad_norm": 9.581003189086914, + "learning_rate": 4.945439775115895e-06, + "loss": 3.1581, + "step": 53100 + }, + { + "epoch": 1.5560096682047901, + "grad_norm": 8.62735366821289, + "learning_rate": 4.943858384850142e-06, + "loss": 3.1347, + "step": 53110 + }, + { + "epoch": 1.556302644107522, + "grad_norm": 8.733503341674805, + "learning_rate": 4.942277000201024e-06, + "loss": 3.1249, + "step": 53120 + }, + { + "epoch": 1.5565956200102542, + "grad_norm": 9.136136054992676, + "learning_rate": 4.94069562132675e-06, + "loss": 3.1224, + "step": 53130 + }, + { + "epoch": 1.5568885959129861, + "grad_norm": 8.135148048400879, + "learning_rate": 
4.939114248385527e-06, + "loss": 3.1552, + "step": 53140 + }, + { + "epoch": 1.5571815718157183, + "grad_norm": 8.362598419189453, + "learning_rate": 4.937532881535561e-06, + "loss": 3.1389, + "step": 53150 + }, + { + "epoch": 1.5574745477184502, + "grad_norm": 8.381772994995117, + "learning_rate": 4.935951520935062e-06, + "loss": 3.1518, + "step": 53160 + }, + { + "epoch": 1.557767523621182, + "grad_norm": 8.905563354492188, + "learning_rate": 4.934370166742231e-06, + "loss": 3.159, + "step": 53170 + }, + { + "epoch": 1.558060499523914, + "grad_norm": 8.785085678100586, + "learning_rate": 4.932788819115278e-06, + "loss": 3.1566, + "step": 53180 + }, + { + "epoch": 1.5583534754266462, + "grad_norm": 8.839600563049316, + "learning_rate": 4.931207478212402e-06, + "loss": 3.132, + "step": 53190 + }, + { + "epoch": 1.5586464513293783, + "grad_norm": 8.775846481323242, + "learning_rate": 4.9296261441918125e-06, + "loss": 3.142, + "step": 53200 + }, + { + "epoch": 1.5589394272321102, + "grad_norm": 8.691015243530273, + "learning_rate": 4.928044817211708e-06, + "loss": 3.1296, + "step": 53210 + }, + { + "epoch": 1.5592324031348421, + "grad_norm": 8.70347785949707, + "learning_rate": 4.926463497430293e-06, + "loss": 3.1359, + "step": 53220 + }, + { + "epoch": 1.559525379037574, + "grad_norm": 8.494046211242676, + "learning_rate": 4.924882185005768e-06, + "loss": 3.1377, + "step": 53230 + }, + { + "epoch": 1.5598183549403062, + "grad_norm": 8.275067329406738, + "learning_rate": 4.923300880096337e-06, + "loss": 3.1237, + "step": 53240 + }, + { + "epoch": 1.5600527356624918, + "eval_bleu": 0.3453466718990472, + "eval_cap_loss": 0.9246909618377686, + "eval_con_loss": 1.190596103668213, + "eval_loss": 3.3058834075927734, + "step": 53248 + }, + { + "epoch": 1.5600527356624918, + "eval_bleu": 0.3453466718990472, + "eval_cap_loss": 0.9246909618377686, + "eval_con_loss": 1.190596103668213, + "eval_loss": 3.3058834075927734, + "eval_runtime": 52.0199, + "eval_samples_per_second": 
384.468, + "eval_steps_per_second": 0.384, + "step": 53248 + }, + { + "epoch": 1.5601113308430383, + "grad_norm": 8.632277488708496, + "learning_rate": 4.921719582860196e-06, + "loss": 3.1472, + "step": 53250 + }, + { + "epoch": 1.5604043067457702, + "grad_norm": 8.933531761169434, + "learning_rate": 4.920138293455547e-06, + "loss": 3.1598, + "step": 53260 + }, + { + "epoch": 1.5606972826485022, + "grad_norm": 9.010202407836914, + "learning_rate": 4.9185570120405866e-06, + "loss": 3.1358, + "step": 53270 + }, + { + "epoch": 1.560990258551234, + "grad_norm": 9.282668113708496, + "learning_rate": 4.916975738773517e-06, + "loss": 3.1534, + "step": 53280 + }, + { + "epoch": 1.561283234453966, + "grad_norm": 9.180595397949219, + "learning_rate": 4.915394473812529e-06, + "loss": 3.1277, + "step": 53290 + }, + { + "epoch": 1.5615762103566981, + "grad_norm": 7.872917175292969, + "learning_rate": 4.9138132173158224e-06, + "loss": 3.1162, + "step": 53300 + }, + { + "epoch": 1.5618691862594303, + "grad_norm": 8.093706130981445, + "learning_rate": 4.912231969441593e-06, + "loss": 3.1348, + "step": 53310 + }, + { + "epoch": 1.5621621621621622, + "grad_norm": 8.82567310333252, + "learning_rate": 4.910650730348034e-06, + "loss": 3.1449, + "step": 53320 + }, + { + "epoch": 1.5624551380648941, + "grad_norm": 8.526106834411621, + "learning_rate": 4.909069500193341e-06, + "loss": 3.1142, + "step": 53330 + }, + { + "epoch": 1.562748113967626, + "grad_norm": 8.563192367553711, + "learning_rate": 4.907488279135703e-06, + "loss": 3.1392, + "step": 53340 + }, + { + "epoch": 1.5630410898703582, + "grad_norm": 8.01716423034668, + "learning_rate": 4.9059070673333145e-06, + "loss": 3.1308, + "step": 53350 + }, + { + "epoch": 1.5633340657730903, + "grad_norm": 8.194097518920898, + "learning_rate": 4.904325864944364e-06, + "loss": 3.1209, + "step": 53360 + }, + { + "epoch": 1.5636270416758222, + "grad_norm": 8.973724365234375, + "learning_rate": 4.902744672127046e-06, + "loss": 3.132, + "step": 
53370 + }, + { + "epoch": 1.5639200175785541, + "grad_norm": 8.889386177062988, + "learning_rate": 4.901163489039545e-06, + "loss": 3.1394, + "step": 53380 + }, + { + "epoch": 1.564212993481286, + "grad_norm": 8.06000804901123, + "learning_rate": 4.899582315840051e-06, + "loss": 3.1186, + "step": 53390 + }, + { + "epoch": 1.5645059693840182, + "grad_norm": 9.817858695983887, + "learning_rate": 4.898001152686749e-06, + "loss": 3.135, + "step": 53400 + }, + { + "epoch": 1.5647989452867501, + "grad_norm": 8.635472297668457, + "learning_rate": 4.89641999973783e-06, + "loss": 3.1375, + "step": 53410 + }, + { + "epoch": 1.5650919211894823, + "grad_norm": 9.054786682128906, + "learning_rate": 4.894838857151472e-06, + "loss": 3.1288, + "step": 53420 + }, + { + "epoch": 1.5653848970922142, + "grad_norm": 8.41107177734375, + "learning_rate": 4.893257725085865e-06, + "loss": 3.1446, + "step": 53430 + }, + { + "epoch": 1.565677872994946, + "grad_norm": 8.770251274108887, + "learning_rate": 4.891676603699187e-06, + "loss": 3.1421, + "step": 53440 + }, + { + "epoch": 1.565970848897678, + "grad_norm": 8.895161628723145, + "learning_rate": 4.890095493149624e-06, + "loss": 3.1522, + "step": 53450 + }, + { + "epoch": 1.5662638248004102, + "grad_norm": 8.919896125793457, + "learning_rate": 4.888514393595354e-06, + "loss": 3.1325, + "step": 53460 + }, + { + "epoch": 1.5665568007031423, + "grad_norm": 8.881046295166016, + "learning_rate": 4.886933305194558e-06, + "loss": 3.1373, + "step": 53470 + }, + { + "epoch": 1.5668497766058742, + "grad_norm": 8.369608879089355, + "learning_rate": 4.885352228105413e-06, + "loss": 3.1097, + "step": 53480 + }, + { + "epoch": 1.5671427525086061, + "grad_norm": 8.275466918945312, + "learning_rate": 4.883771162486099e-06, + "loss": 3.1276, + "step": 53490 + }, + { + "epoch": 1.567435728411338, + "grad_norm": 8.973822593688965, + "learning_rate": 4.882190108494789e-06, + "loss": 3.1284, + "step": 53500 + }, + { + "epoch": 1.5677287043140702, + 
"grad_norm": 9.57668399810791, + "learning_rate": 4.880609066289661e-06, + "loss": 3.1318, + "step": 53510 + }, + { + "epoch": 1.5680216802168023, + "grad_norm": 8.951374053955078, + "learning_rate": 4.879028036028885e-06, + "loss": 3.1264, + "step": 53520 + }, + { + "epoch": 1.5683146561195342, + "grad_norm": 8.668938636779785, + "learning_rate": 4.8774470178706405e-06, + "loss": 3.1142, + "step": 53530 + }, + { + "epoch": 1.5686076320222662, + "grad_norm": 9.891193389892578, + "learning_rate": 4.875866011973092e-06, + "loss": 3.1471, + "step": 53540 + }, + { + "epoch": 1.568900607924998, + "grad_norm": 8.649956703186035, + "learning_rate": 4.874285018494413e-06, + "loss": 3.1303, + "step": 53550 + }, + { + "epoch": 1.5691935838277302, + "grad_norm": 8.161383628845215, + "learning_rate": 4.8727040375927715e-06, + "loss": 3.1352, + "step": 53560 + }, + { + "epoch": 1.5694865597304621, + "grad_norm": 9.190947532653809, + "learning_rate": 4.8711230694263375e-06, + "loss": 3.1428, + "step": 53570 + }, + { + "epoch": 1.5697795356331943, + "grad_norm": 8.868623733520508, + "learning_rate": 4.869542114153275e-06, + "loss": 3.1254, + "step": 53580 + }, + { + "epoch": 1.5700725115359262, + "grad_norm": 9.10586929321289, + "learning_rate": 4.8679611719317495e-06, + "loss": 3.1296, + "step": 53590 + }, + { + "epoch": 1.5703654874386581, + "grad_norm": 8.545889854431152, + "learning_rate": 4.866380242919925e-06, + "loss": 3.1189, + "step": 53600 + }, + { + "epoch": 1.57065846334139, + "grad_norm": 8.834019660949707, + "learning_rate": 4.864799327275966e-06, + "loss": 3.1487, + "step": 53610 + }, + { + "epoch": 1.5709514392441222, + "grad_norm": 8.809355735778809, + "learning_rate": 4.863218425158031e-06, + "loss": 3.1255, + "step": 53620 + }, + { + "epoch": 1.5712444151468543, + "grad_norm": 8.93606185913086, + "learning_rate": 4.861637536724281e-06, + "loss": 3.1272, + "step": 53630 + }, + { + "epoch": 1.5715373910495862, + "grad_norm": 8.875829696655273, + "learning_rate": 
4.860056662132874e-06, + "loss": 3.1114, + "step": 53640 + }, + { + "epoch": 1.5718303669523181, + "grad_norm": 8.597573280334473, + "learning_rate": 4.858475801541971e-06, + "loss": 3.1276, + "step": 53650 + }, + { + "epoch": 1.57212334285505, + "grad_norm": 8.55723762512207, + "learning_rate": 4.856894955109721e-06, + "loss": 3.1532, + "step": 53660 + }, + { + "epoch": 1.5724163187577822, + "grad_norm": 8.949463844299316, + "learning_rate": 4.855314122994282e-06, + "loss": 3.1391, + "step": 53670 + }, + { + "epoch": 1.5727092946605141, + "grad_norm": 9.095958709716797, + "learning_rate": 4.853733305353808e-06, + "loss": 3.1122, + "step": 53680 + }, + { + "epoch": 1.5730022705632463, + "grad_norm": 8.607802391052246, + "learning_rate": 4.852152502346447e-06, + "loss": 3.1292, + "step": 53690 + }, + { + "epoch": 1.5732952464659782, + "grad_norm": 8.805545806884766, + "learning_rate": 4.850571714130354e-06, + "loss": 3.1369, + "step": 53700 + }, + { + "epoch": 1.57358822236871, + "grad_norm": 8.516944885253906, + "learning_rate": 4.848990940863672e-06, + "loss": 3.1283, + "step": 53710 + }, + { + "epoch": 1.573881198271442, + "grad_norm": 8.937950134277344, + "learning_rate": 4.847410182704551e-06, + "loss": 3.1279, + "step": 53720 + }, + { + "epoch": 1.5741741741741742, + "grad_norm": 8.712188720703125, + "learning_rate": 4.845829439811136e-06, + "loss": 3.1409, + "step": 53730 + }, + { + "epoch": 1.5744671500769063, + "grad_norm": 8.838842391967773, + "learning_rate": 4.844248712341573e-06, + "loss": 3.1438, + "step": 53740 + }, + { + "epoch": 1.5747601259796382, + "grad_norm": 8.260781288146973, + "learning_rate": 4.842668000454e-06, + "loss": 3.1299, + "step": 53750 + }, + { + "epoch": 1.5750531018823701, + "grad_norm": 9.017898559570312, + "learning_rate": 4.8410873043065605e-06, + "loss": 3.1304, + "step": 53760 + }, + { + "epoch": 1.5750531018823701, + "eval_bleu": 0.345642563929523, + "eval_cap_loss": 0.9230955839157104, + "eval_con_loss": 
1.1846985816955566, + "eval_loss": 3.2924928665161133, + "step": 53760 + }, + { + "epoch": 1.5750531018823701, + "eval_bleu": 0.345642563929523, + "eval_cap_loss": 0.9230955839157104, + "eval_con_loss": 1.1846985816955566, + "eval_loss": 3.2924928665161133, + "eval_runtime": 52.0376, + "eval_samples_per_second": 384.337, + "eval_steps_per_second": 0.384, + "step": 53760 + }, + { + "epoch": 1.575346077785102, + "grad_norm": 9.671822547912598, + "learning_rate": 4.839506624057393e-06, + "loss": 3.1296, + "step": 53770 + }, + { + "epoch": 1.5756390536878342, + "grad_norm": 8.853106498718262, + "learning_rate": 4.837925959864637e-06, + "loss": 3.133, + "step": 53780 + }, + { + "epoch": 1.5759320295905663, + "grad_norm": 8.422102928161621, + "learning_rate": 4.836345311886425e-06, + "loss": 3.1311, + "step": 53790 + }, + { + "epoch": 1.5762250054932982, + "grad_norm": 9.18836498260498, + "learning_rate": 4.834764680280895e-06, + "loss": 3.1321, + "step": 53800 + }, + { + "epoch": 1.5765179813960302, + "grad_norm": 9.228401184082031, + "learning_rate": 4.833184065206179e-06, + "loss": 3.1135, + "step": 53810 + }, + { + "epoch": 1.576810957298762, + "grad_norm": 8.864761352539062, + "learning_rate": 4.8316034668204074e-06, + "loss": 3.116, + "step": 53820 + }, + { + "epoch": 1.5771039332014942, + "grad_norm": 9.04131031036377, + "learning_rate": 4.830022885281709e-06, + "loss": 3.1417, + "step": 53830 + }, + { + "epoch": 1.5773969091042261, + "grad_norm": 9.058087348937988, + "learning_rate": 4.828442320748213e-06, + "loss": 3.1415, + "step": 53840 + }, + { + "epoch": 1.5776898850069583, + "grad_norm": 9.108179092407227, + "learning_rate": 4.826861773378044e-06, + "loss": 3.1329, + "step": 53850 + }, + { + "epoch": 1.5779828609096902, + "grad_norm": 8.682344436645508, + "learning_rate": 4.82528124332933e-06, + "loss": 3.1131, + "step": 53860 + }, + { + "epoch": 1.5782758368124221, + "grad_norm": 8.668892860412598, + "learning_rate": 4.823700730760188e-06, + "loss": 
3.1346, + "step": 53870 + }, + { + "epoch": 1.578568812715154, + "grad_norm": 8.691253662109375, + "learning_rate": 4.822120235828744e-06, + "loss": 3.134, + "step": 53880 + }, + { + "epoch": 1.5788617886178862, + "grad_norm": 8.46118450164795, + "learning_rate": 4.820539758693113e-06, + "loss": 3.12, + "step": 53890 + }, + { + "epoch": 1.5791547645206183, + "grad_norm": 8.75467300415039, + "learning_rate": 4.818959299511417e-06, + "loss": 3.1413, + "step": 53900 + }, + { + "epoch": 1.5794477404233502, + "grad_norm": 9.081381797790527, + "learning_rate": 4.817378858441767e-06, + "loss": 3.1116, + "step": 53910 + }, + { + "epoch": 1.5797407163260822, + "grad_norm": 8.702916145324707, + "learning_rate": 4.81579843564228e-06, + "loss": 3.1152, + "step": 53920 + }, + { + "epoch": 1.580033692228814, + "grad_norm": 8.908595085144043, + "learning_rate": 4.814218031271065e-06, + "loss": 3.1091, + "step": 53930 + }, + { + "epoch": 1.5803266681315462, + "grad_norm": 8.581450462341309, + "learning_rate": 4.812637645486237e-06, + "loss": 3.1193, + "step": 53940 + }, + { + "epoch": 1.5806196440342781, + "grad_norm": 9.389884948730469, + "learning_rate": 4.8110572784458974e-06, + "loss": 3.1297, + "step": 53950 + }, + { + "epoch": 1.5809126199370103, + "grad_norm": 8.224224090576172, + "learning_rate": 4.8094769303081585e-06, + "loss": 3.1125, + "step": 53960 + }, + { + "epoch": 1.5812055958397422, + "grad_norm": 9.154450416564941, + "learning_rate": 4.8078966012311206e-06, + "loss": 3.1411, + "step": 53970 + }, + { + "epoch": 1.581498571742474, + "grad_norm": 8.72181224822998, + "learning_rate": 4.80631629137289e-06, + "loss": 3.1245, + "step": 53980 + }, + { + "epoch": 1.581791547645206, + "grad_norm": 8.983288764953613, + "learning_rate": 4.804736000891564e-06, + "loss": 3.1047, + "step": 53990 + }, + { + "epoch": 1.5820845235479382, + "grad_norm": 8.921116828918457, + "learning_rate": 4.803155729945243e-06, + "loss": 3.1221, + "step": 54000 + }, + { + "epoch": 
1.5823774994506703, + "grad_norm": 10.184905052185059, + "learning_rate": 4.801575478692023e-06, + "loss": 3.1318, + "step": 54010 + }, + { + "epoch": 1.5826704753534022, + "grad_norm": 8.856364250183105, + "learning_rate": 4.799995247289999e-06, + "loss": 3.1343, + "step": 54020 + }, + { + "epoch": 1.5829634512561341, + "grad_norm": 8.686835289001465, + "learning_rate": 4.7984150358972655e-06, + "loss": 3.1373, + "step": 54030 + }, + { + "epoch": 1.583256427158866, + "grad_norm": 8.59578800201416, + "learning_rate": 4.79683484467191e-06, + "loss": 3.1308, + "step": 54040 + }, + { + "epoch": 1.5835494030615982, + "grad_norm": 8.635054588317871, + "learning_rate": 4.795254673772024e-06, + "loss": 3.1141, + "step": 54050 + }, + { + "epoch": 1.5838423789643303, + "grad_norm": 8.783279418945312, + "learning_rate": 4.7936745233556915e-06, + "loss": 3.1143, + "step": 54060 + }, + { + "epoch": 1.5841353548670623, + "grad_norm": 8.53049373626709, + "learning_rate": 4.792094393581002e-06, + "loss": 3.1145, + "step": 54070 + }, + { + "epoch": 1.5844283307697942, + "grad_norm": 8.980721473693848, + "learning_rate": 4.790514284606032e-06, + "loss": 3.1287, + "step": 54080 + }, + { + "epoch": 1.584721306672526, + "grad_norm": 8.251287460327148, + "learning_rate": 4.788934196588867e-06, + "loss": 3.1072, + "step": 54090 + }, + { + "epoch": 1.5850142825752582, + "grad_norm": 9.10707950592041, + "learning_rate": 4.787354129687581e-06, + "loss": 3.1031, + "step": 54100 + }, + { + "epoch": 1.5853072584779901, + "grad_norm": 9.345111846923828, + "learning_rate": 4.785774084060256e-06, + "loss": 3.139, + "step": 54110 + }, + { + "epoch": 1.5856002343807223, + "grad_norm": 8.683091163635254, + "learning_rate": 4.78419405986496e-06, + "loss": 3.139, + "step": 54120 + }, + { + "epoch": 1.5858932102834542, + "grad_norm": 8.543965339660645, + "learning_rate": 4.782614057259769e-06, + "loss": 3.1441, + "step": 54130 + }, + { + "epoch": 1.5861861861861861, + "grad_norm": 9.361129760742188, + 
"learning_rate": 4.7810340764027515e-06, + "loss": 3.1357, + "step": 54140 + }, + { + "epoch": 1.586479162088918, + "grad_norm": 8.413372039794922, + "learning_rate": 4.779454117451978e-06, + "loss": 3.1267, + "step": 54150 + }, + { + "epoch": 1.5867721379916502, + "grad_norm": 9.389697074890137, + "learning_rate": 4.777874180565508e-06, + "loss": 3.1396, + "step": 54160 + }, + { + "epoch": 1.5870651138943823, + "grad_norm": 8.702374458312988, + "learning_rate": 4.776294265901411e-06, + "loss": 3.131, + "step": 54170 + }, + { + "epoch": 1.5873580897971142, + "grad_norm": 8.748509407043457, + "learning_rate": 4.774714373617743e-06, + "loss": 3.1236, + "step": 54180 + }, + { + "epoch": 1.5876510656998462, + "grad_norm": 8.872577667236328, + "learning_rate": 4.773134503872569e-06, + "loss": 3.1076, + "step": 54190 + }, + { + "epoch": 1.587944041602578, + "grad_norm": 8.466779708862305, + "learning_rate": 4.771554656823939e-06, + "loss": 3.1049, + "step": 54200 + }, + { + "epoch": 1.5882370175053102, + "grad_norm": 8.763105392456055, + "learning_rate": 4.769974832629911e-06, + "loss": 3.1158, + "step": 54210 + }, + { + "epoch": 1.5885299934080424, + "grad_norm": 8.648475646972656, + "learning_rate": 4.768395031448536e-06, + "loss": 3.1455, + "step": 54220 + }, + { + "epoch": 1.5888229693107743, + "grad_norm": 8.779388427734375, + "learning_rate": 4.766815253437866e-06, + "loss": 3.1161, + "step": 54230 + }, + { + "epoch": 1.5891159452135062, + "grad_norm": 8.522500038146973, + "learning_rate": 4.765235498755944e-06, + "loss": 3.1204, + "step": 54240 + }, + { + "epoch": 1.589408921116238, + "grad_norm": 9.79067611694336, + "learning_rate": 4.763655767560819e-06, + "loss": 3.1182, + "step": 54250 + }, + { + "epoch": 1.5897018970189702, + "grad_norm": 8.930092811584473, + "learning_rate": 4.762076060010531e-06, + "loss": 3.1083, + "step": 54260 + }, + { + "epoch": 1.5899948729217022, + "grad_norm": 8.862858772277832, + "learning_rate": 4.760496376263123e-06, + "loss": 
3.1331, + "step": 54270 + }, + { + "epoch": 1.5900534681022487, + "eval_bleu": 0.34626386374227097, + "eval_cap_loss": 0.9217615127563477, + "eval_con_loss": 1.1830617189407349, + "eval_loss": 3.2878847122192383, + "step": 54272 + }, + { + "epoch": 1.5900534681022487, + "eval_bleu": 0.34626386374227097, + "eval_cap_loss": 0.9217615127563477, + "eval_con_loss": 1.1830617189407349, + "eval_loss": 3.2878847122192383, + "eval_runtime": 55.8558, + "eval_samples_per_second": 358.065, + "eval_steps_per_second": 0.358, + "step": 54272 + }, + { + "epoch": 1.5902878488244343, + "grad_norm": 8.42785358428955, + "learning_rate": 4.758916716476631e-06, + "loss": 3.126, + "step": 54280 + }, + { + "epoch": 1.5905808247271662, + "grad_norm": 9.090117454528809, + "learning_rate": 4.757337080809091e-06, + "loss": 3.1196, + "step": 54290 + }, + { + "epoch": 1.5908738006298981, + "grad_norm": 9.270843505859375, + "learning_rate": 4.755757469418536e-06, + "loss": 3.1407, + "step": 54300 + }, + { + "epoch": 1.59116677653263, + "grad_norm": 8.275261878967285, + "learning_rate": 4.754177882462999e-06, + "loss": 3.091, + "step": 54310 + }, + { + "epoch": 1.5914597524353622, + "grad_norm": 8.573863983154297, + "learning_rate": 4.7525983201005046e-06, + "loss": 3.1262, + "step": 54320 + }, + { + "epoch": 1.5917527283380943, + "grad_norm": 9.204020500183105, + "learning_rate": 4.7510187824890816e-06, + "loss": 3.1164, + "step": 54330 + }, + { + "epoch": 1.5920457042408263, + "grad_norm": 9.0060453414917, + "learning_rate": 4.749439269786751e-06, + "loss": 3.117, + "step": 54340 + }, + { + "epoch": 1.5923386801435582, + "grad_norm": 8.706339836120605, + "learning_rate": 4.7478597821515375e-06, + "loss": 3.1154, + "step": 54350 + }, + { + "epoch": 1.59263165604629, + "grad_norm": 8.318801879882812, + "learning_rate": 4.746280319741454e-06, + "loss": 3.1222, + "step": 54360 + }, + { + "epoch": 1.5929246319490222, + "grad_norm": 8.885135650634766, + "learning_rate": 4.74470088271452e-06, + 
"loss": 3.1086, + "step": 54370 + }, + { + "epoch": 1.5932176078517541, + "grad_norm": 8.702425003051758, + "learning_rate": 4.743121471228748e-06, + "loss": 3.1401, + "step": 54380 + }, + { + "epoch": 1.5935105837544863, + "grad_norm": 8.726239204406738, + "learning_rate": 4.741542085442148e-06, + "loss": 3.1034, + "step": 54390 + }, + { + "epoch": 1.5938035596572182, + "grad_norm": 8.3672513961792, + "learning_rate": 4.739962725512732e-06, + "loss": 3.1333, + "step": 54400 + }, + { + "epoch": 1.5940965355599501, + "grad_norm": 9.054116249084473, + "learning_rate": 4.7383833915985e-06, + "loss": 3.1319, + "step": 54410 + }, + { + "epoch": 1.594389511462682, + "grad_norm": 9.176050186157227, + "learning_rate": 4.736804083857458e-06, + "loss": 3.1418, + "step": 54420 + }, + { + "epoch": 1.5946824873654142, + "grad_norm": 8.905774116516113, + "learning_rate": 4.735224802447605e-06, + "loss": 3.1134, + "step": 54430 + }, + { + "epoch": 1.5949754632681463, + "grad_norm": 8.317906379699707, + "learning_rate": 4.733645547526943e-06, + "loss": 3.1073, + "step": 54440 + }, + { + "epoch": 1.5952684391708782, + "grad_norm": 8.247818946838379, + "learning_rate": 4.7320663192534605e-06, + "loss": 3.114, + "step": 54450 + }, + { + "epoch": 1.5955614150736102, + "grad_norm": 8.392147064208984, + "learning_rate": 4.730487117785155e-06, + "loss": 3.1319, + "step": 54460 + }, + { + "epoch": 1.595854390976342, + "grad_norm": 9.000086784362793, + "learning_rate": 4.7289079432800124e-06, + "loss": 3.1104, + "step": 54470 + }, + { + "epoch": 1.5961473668790742, + "grad_norm": 8.714783668518066, + "learning_rate": 4.7273287958960245e-06, + "loss": 3.1198, + "step": 54480 + }, + { + "epoch": 1.5964403427818064, + "grad_norm": 8.855203628540039, + "learning_rate": 4.7257496757911705e-06, + "loss": 3.1218, + "step": 54490 + }, + { + "epoch": 1.5967333186845383, + "grad_norm": 8.723742485046387, + "learning_rate": 4.724170583123436e-06, + "loss": 3.1202, + "step": 54500 + }, + { + "epoch": 
1.5970262945872702, + "grad_norm": 9.644450187683105, + "learning_rate": 4.722591518050797e-06, + "loss": 3.1428, + "step": 54510 + }, + { + "epoch": 1.597319270490002, + "grad_norm": 9.58849811553955, + "learning_rate": 4.721012480731232e-06, + "loss": 3.132, + "step": 54520 + }, + { + "epoch": 1.5976122463927342, + "grad_norm": 9.17578411102295, + "learning_rate": 4.719433471322711e-06, + "loss": 3.1157, + "step": 54530 + }, + { + "epoch": 1.5979052222954662, + "grad_norm": 8.296476364135742, + "learning_rate": 4.717854489983209e-06, + "loss": 3.1129, + "step": 54540 + }, + { + "epoch": 1.5981981981981983, + "grad_norm": 8.861602783203125, + "learning_rate": 4.716275536870689e-06, + "loss": 3.1213, + "step": 54550 + }, + { + "epoch": 1.5984911741009302, + "grad_norm": 8.424731254577637, + "learning_rate": 4.714696612143121e-06, + "loss": 3.1194, + "step": 54560 + }, + { + "epoch": 1.5987841500036621, + "grad_norm": 9.751250267028809, + "learning_rate": 4.7131177159584625e-06, + "loss": 3.1295, + "step": 54570 + }, + { + "epoch": 1.599077125906394, + "grad_norm": 8.713706970214844, + "learning_rate": 4.711538848474676e-06, + "loss": 3.1171, + "step": 54580 + }, + { + "epoch": 1.5993701018091262, + "grad_norm": 8.936087608337402, + "learning_rate": 4.709960009849714e-06, + "loss": 3.117, + "step": 54590 + }, + { + "epoch": 1.5996630777118583, + "grad_norm": 9.26085376739502, + "learning_rate": 4.708381200241535e-06, + "loss": 3.1119, + "step": 54600 + }, + { + "epoch": 1.5999560536145903, + "grad_norm": 8.372782707214355, + "learning_rate": 4.706802419808084e-06, + "loss": 3.1192, + "step": 54610 + }, + { + "epoch": 1.6002490295173222, + "grad_norm": 9.083720207214355, + "learning_rate": 4.705223668707313e-06, + "loss": 3.132, + "step": 54620 + }, + { + "epoch": 1.600542005420054, + "grad_norm": 8.970640182495117, + "learning_rate": 4.703644947097165e-06, + "loss": 3.1059, + "step": 54630 + }, + { + "epoch": 1.6008349813227862, + "grad_norm": 9.123103141784668, + 
"learning_rate": 4.702066255135582e-06, + "loss": 3.108, + "step": 54640 + }, + { + "epoch": 1.6011279572255182, + "grad_norm": 8.843426704406738, + "learning_rate": 4.7004875929805025e-06, + "loss": 3.122, + "step": 54650 + }, + { + "epoch": 1.6014209331282503, + "grad_norm": 8.570196151733398, + "learning_rate": 4.698908960789862e-06, + "loss": 3.1202, + "step": 54660 + }, + { + "epoch": 1.6017139090309822, + "grad_norm": 8.72919750213623, + "learning_rate": 4.6973303587215926e-06, + "loss": 3.1308, + "step": 54670 + }, + { + "epoch": 1.6020068849337141, + "grad_norm": 8.954269409179688, + "learning_rate": 4.695751786933629e-06, + "loss": 3.1282, + "step": 54680 + }, + { + "epoch": 1.602299860836446, + "grad_norm": 8.647355079650879, + "learning_rate": 4.6941732455838904e-06, + "loss": 3.1084, + "step": 54690 + }, + { + "epoch": 1.6025928367391782, + "grad_norm": 8.7269287109375, + "learning_rate": 4.692594734830304e-06, + "loss": 3.0977, + "step": 54700 + }, + { + "epoch": 1.6028858126419103, + "grad_norm": 8.377601623535156, + "learning_rate": 4.691016254830792e-06, + "loss": 3.1169, + "step": 54710 + }, + { + "epoch": 1.6031787885446422, + "grad_norm": 9.14828109741211, + "learning_rate": 4.689437805743272e-06, + "loss": 3.1346, + "step": 54720 + }, + { + "epoch": 1.6034717644473742, + "grad_norm": 8.910527229309082, + "learning_rate": 4.687859387725655e-06, + "loss": 3.1106, + "step": 54730 + }, + { + "epoch": 1.603764740350106, + "grad_norm": 8.5636568069458, + "learning_rate": 4.686281000935854e-06, + "loss": 3.1073, + "step": 54740 + }, + { + "epoch": 1.6040577162528382, + "grad_norm": 9.365283012390137, + "learning_rate": 4.684702645531778e-06, + "loss": 3.1132, + "step": 54750 + }, + { + "epoch": 1.6043506921555704, + "grad_norm": 8.96485424041748, + "learning_rate": 4.683124321671331e-06, + "loss": 3.1107, + "step": 54760 + }, + { + "epoch": 1.6046436680583023, + "grad_norm": 9.21026611328125, + "learning_rate": 4.6815460295124185e-06, + "loss": 3.1189, 
+ "step": 54770 + }, + { + "epoch": 1.6049366439610342, + "grad_norm": 9.11426830291748, + "learning_rate": 4.679967769212934e-06, + "loss": 3.1338, + "step": 54780 + }, + { + "epoch": 1.605053834322127, + "eval_bleu": 0.34647747848286276, + "eval_cap_loss": 0.9212445616722107, + "eval_con_loss": 1.1792466640472412, + "eval_loss": 3.279737949371338, + "step": 54784 + }, + { + "epoch": 1.605053834322127, + "eval_bleu": 0.34647747848286276, + "eval_cap_loss": 0.9212445616722107, + "eval_con_loss": 1.1792466640472412, + "eval_loss": 3.279737949371338, + "eval_runtime": 54.3103, + "eval_samples_per_second": 368.254, + "eval_steps_per_second": 0.368, + "step": 54784 + }, + { + "epoch": 1.605229619863766, + "grad_norm": 9.27445125579834, + "learning_rate": 4.678389540930776e-06, + "loss": 3.1099, + "step": 54790 + }, + { + "epoch": 1.6055225957664983, + "grad_norm": 9.221430778503418, + "learning_rate": 4.6768113448238364e-06, + "loss": 3.1061, + "step": 54800 + }, + { + "epoch": 1.6058155716692302, + "grad_norm": 8.983186721801758, + "learning_rate": 4.675233181050006e-06, + "loss": 3.1097, + "step": 54810 + }, + { + "epoch": 1.6061085475719623, + "grad_norm": 9.601192474365234, + "learning_rate": 4.673655049767167e-06, + "loss": 3.1299, + "step": 54820 + }, + { + "epoch": 1.6064015234746942, + "grad_norm": 9.18031120300293, + "learning_rate": 4.672076951133205e-06, + "loss": 3.113, + "step": 54830 + }, + { + "epoch": 1.6066944993774261, + "grad_norm": 8.955723762512207, + "learning_rate": 4.670498885305997e-06, + "loss": 3.1173, + "step": 54840 + }, + { + "epoch": 1.606987475280158, + "grad_norm": 8.819525718688965, + "learning_rate": 4.668920852443424e-06, + "loss": 3.1097, + "step": 54850 + }, + { + "epoch": 1.6072804511828902, + "grad_norm": 9.385075569152832, + "learning_rate": 4.667342852703353e-06, + "loss": 3.1081, + "step": 54860 + }, + { + "epoch": 1.6075734270856223, + "grad_norm": 9.467814445495605, + "learning_rate": 4.665764886243656e-06, + "loss": 3.1242, 
+ "step": 54870 + }, + { + "epoch": 1.6078664029883543, + "grad_norm": 8.606614112854004, + "learning_rate": 4.664186953222199e-06, + "loss": 3.1322, + "step": 54880 + }, + { + "epoch": 1.6081593788910862, + "grad_norm": 9.020720481872559, + "learning_rate": 4.662609053796847e-06, + "loss": 3.1156, + "step": 54890 + }, + { + "epoch": 1.608452354793818, + "grad_norm": 8.820096015930176, + "learning_rate": 4.661031188125456e-06, + "loss": 3.1187, + "step": 54900 + }, + { + "epoch": 1.6087453306965502, + "grad_norm": 9.421317100524902, + "learning_rate": 4.659453356365883e-06, + "loss": 3.1164, + "step": 54910 + }, + { + "epoch": 1.6090383065992824, + "grad_norm": 8.506755828857422, + "learning_rate": 4.657875558675982e-06, + "loss": 3.1296, + "step": 54920 + }, + { + "epoch": 1.6093312825020143, + "grad_norm": 8.948088645935059, + "learning_rate": 4.656297795213603e-06, + "loss": 3.1327, + "step": 54930 + }, + { + "epoch": 1.6096242584047462, + "grad_norm": 8.957830429077148, + "learning_rate": 4.654720066136588e-06, + "loss": 3.1178, + "step": 54940 + }, + { + "epoch": 1.6099172343074781, + "grad_norm": 8.709362983703613, + "learning_rate": 4.6531423716027826e-06, + "loss": 3.1155, + "step": 54950 + }, + { + "epoch": 1.61021021021021, + "grad_norm": 8.466486930847168, + "learning_rate": 4.651564711770025e-06, + "loss": 3.0927, + "step": 54960 + }, + { + "epoch": 1.6105031861129422, + "grad_norm": 8.207919120788574, + "learning_rate": 4.649987086796153e-06, + "loss": 3.0903, + "step": 54970 + }, + { + "epoch": 1.6107961620156743, + "grad_norm": 9.090028762817383, + "learning_rate": 4.648409496838994e-06, + "loss": 3.1134, + "step": 54980 + }, + { + "epoch": 1.6110891379184062, + "grad_norm": 8.447914123535156, + "learning_rate": 4.6468319420563795e-06, + "loss": 3.1174, + "step": 54990 + }, + { + "epoch": 1.6113821138211382, + "grad_norm": 9.174086570739746, + "learning_rate": 4.645254422606134e-06, + "loss": 3.0959, + "step": 55000 + }, + { + "epoch": 
1.61167508972387, + "grad_norm": 9.150630950927734, + "learning_rate": 4.64367693864608e-06, + "loss": 3.1235, + "step": 55010 + }, + { + "epoch": 1.6119680656266022, + "grad_norm": 8.982342720031738, + "learning_rate": 4.642099490334032e-06, + "loss": 3.1093, + "step": 55020 + }, + { + "epoch": 1.6122610415293344, + "grad_norm": 8.477967262268066, + "learning_rate": 4.640522077827808e-06, + "loss": 3.0998, + "step": 55030 + }, + { + "epoch": 1.6125540174320663, + "grad_norm": 9.707470893859863, + "learning_rate": 4.638944701285217e-06, + "loss": 3.1197, + "step": 55040 + }, + { + "epoch": 1.6128469933347982, + "grad_norm": 9.103780746459961, + "learning_rate": 4.637367360864068e-06, + "loss": 3.1163, + "step": 55050 + }, + { + "epoch": 1.6131399692375301, + "grad_norm": 9.28167724609375, + "learning_rate": 4.635790056722162e-06, + "loss": 3.126, + "step": 55060 + }, + { + "epoch": 1.6134329451402623, + "grad_norm": 8.74531078338623, + "learning_rate": 4.634212789017299e-06, + "loss": 3.1103, + "step": 55070 + }, + { + "epoch": 1.6137259210429942, + "grad_norm": 8.51749324798584, + "learning_rate": 4.632635557907277e-06, + "loss": 3.126, + "step": 55080 + }, + { + "epoch": 1.6140188969457263, + "grad_norm": 8.994915008544922, + "learning_rate": 4.63105836354989e-06, + "loss": 3.1106, + "step": 55090 + }, + { + "epoch": 1.6143118728484582, + "grad_norm": 9.3167085647583, + "learning_rate": 4.629481206102922e-06, + "loss": 3.1044, + "step": 55100 + }, + { + "epoch": 1.6146048487511901, + "grad_norm": 9.146857261657715, + "learning_rate": 4.627904085724161e-06, + "loss": 3.12, + "step": 55110 + }, + { + "epoch": 1.614897824653922, + "grad_norm": 8.92854118347168, + "learning_rate": 4.626327002571391e-06, + "loss": 3.0906, + "step": 55120 + }, + { + "epoch": 1.6151908005566542, + "grad_norm": 8.738101959228516, + "learning_rate": 4.624749956802385e-06, + "loss": 3.124, + "step": 55130 + }, + { + "epoch": 1.6154837764593863, + "grad_norm": 9.105076789855957, + 
"learning_rate": 4.6231729485749225e-06, + "loss": 3.117, + "step": 55140 + }, + { + "epoch": 1.6157767523621183, + "grad_norm": 8.889777183532715, + "learning_rate": 4.6215959780467685e-06, + "loss": 3.1137, + "step": 55150 + }, + { + "epoch": 1.6160697282648502, + "grad_norm": 10.046481132507324, + "learning_rate": 4.620019045375693e-06, + "loss": 3.1326, + "step": 55160 + }, + { + "epoch": 1.616362704167582, + "grad_norm": 8.799893379211426, + "learning_rate": 4.6184421507194566e-06, + "loss": 3.1153, + "step": 55170 + }, + { + "epoch": 1.6166556800703142, + "grad_norm": 8.5267333984375, + "learning_rate": 4.616865294235821e-06, + "loss": 3.1346, + "step": 55180 + }, + { + "epoch": 1.6169486559730464, + "grad_norm": 8.714025497436523, + "learning_rate": 4.615288476082538e-06, + "loss": 3.1071, + "step": 55190 + }, + { + "epoch": 1.6172416318757783, + "grad_norm": 8.495280265808105, + "learning_rate": 4.6137116964173616e-06, + "loss": 3.1061, + "step": 55200 + }, + { + "epoch": 1.6175346077785102, + "grad_norm": 8.468865394592285, + "learning_rate": 4.612134955398037e-06, + "loss": 3.1179, + "step": 55210 + }, + { + "epoch": 1.6178275836812421, + "grad_norm": 8.368931770324707, + "learning_rate": 4.610558253182312e-06, + "loss": 3.1049, + "step": 55220 + }, + { + "epoch": 1.6181205595839743, + "grad_norm": 9.348690032958984, + "learning_rate": 4.60898158992792e-06, + "loss": 3.0985, + "step": 55230 + }, + { + "epoch": 1.6184135354867062, + "grad_norm": 8.620522499084473, + "learning_rate": 4.607404965792601e-06, + "loss": 3.1277, + "step": 55240 + }, + { + "epoch": 1.6187065113894383, + "grad_norm": 8.436800956726074, + "learning_rate": 4.605828380934085e-06, + "loss": 3.1105, + "step": 55250 + }, + { + "epoch": 1.6189994872921702, + "grad_norm": 8.498716354370117, + "learning_rate": 4.604251835510104e-06, + "loss": 3.1099, + "step": 55260 + }, + { + "epoch": 1.6192924631949022, + "grad_norm": 8.865447998046875, + "learning_rate": 4.602675329678377e-06, + "loss": 
3.1183, + "step": 55270 + }, + { + "epoch": 1.619585439097634, + "grad_norm": 8.552040100097656, + "learning_rate": 4.601098863596626e-06, + "loss": 3.1006, + "step": 55280 + }, + { + "epoch": 1.6198784150003662, + "grad_norm": 8.655166625976562, + "learning_rate": 4.599522437422566e-06, + "loss": 3.1006, + "step": 55290 + }, + { + "epoch": 1.6200542005420053, + "eval_bleu": 0.3471166953369565, + "eval_cap_loss": 0.9201269149780273, + "eval_con_loss": 1.1792407035827637, + "eval_loss": 3.2786083221435547, + "step": 55296 + }, + { + "epoch": 1.6200542005420053, + "eval_bleu": 0.3471166953369565, + "eval_cap_loss": 0.9201269149780273, + "eval_con_loss": 1.1792407035827637, + "eval_loss": 3.2786083221435547, + "eval_runtime": 52.4957, + "eval_samples_per_second": 380.983, + "eval_steps_per_second": 0.381, + "step": 55296 + }, + { + "epoch": 1.6201713909030984, + "grad_norm": 8.974627494812012, + "learning_rate": 4.597946051313913e-06, + "loss": 3.1325, + "step": 55300 + }, + { + "epoch": 1.6204643668058303, + "grad_norm": 8.608529090881348, + "learning_rate": 4.59636970542837e-06, + "loss": 3.1177, + "step": 55310 + }, + { + "epoch": 1.6207573427085622, + "grad_norm": 8.823287010192871, + "learning_rate": 4.5947933999236445e-06, + "loss": 3.0858, + "step": 55320 + }, + { + "epoch": 1.6210503186112941, + "grad_norm": 8.684788703918457, + "learning_rate": 4.5932171349574336e-06, + "loss": 3.1153, + "step": 55330 + }, + { + "epoch": 1.6213432945140263, + "grad_norm": 8.761777877807617, + "learning_rate": 4.591640910687437e-06, + "loss": 3.1197, + "step": 55340 + }, + { + "epoch": 1.6216362704167582, + "grad_norm": 8.588440895080566, + "learning_rate": 4.5900647272713436e-06, + "loss": 3.1027, + "step": 55350 + }, + { + "epoch": 1.6219292463194903, + "grad_norm": 8.507891654968262, + "learning_rate": 4.588488584866843e-06, + "loss": 3.1069, + "step": 55360 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 8.728935241699219, + "learning_rate": 4.586912483631616e-06, + 
"loss": 3.0903, + "step": 55370 + }, + { + "epoch": 1.6225151981249541, + "grad_norm": 8.687084197998047, + "learning_rate": 4.585336423723348e-06, + "loss": 3.1227, + "step": 55380 + }, + { + "epoch": 1.622808174027686, + "grad_norm": 9.126538276672363, + "learning_rate": 4.583760405299707e-06, + "loss": 3.1387, + "step": 55390 + }, + { + "epoch": 1.6231011499304182, + "grad_norm": 9.119478225708008, + "learning_rate": 4.5821844285183695e-06, + "loss": 3.126, + "step": 55400 + }, + { + "epoch": 1.6233941258331503, + "grad_norm": 8.599806785583496, + "learning_rate": 4.580608493537e-06, + "loss": 3.1072, + "step": 55410 + }, + { + "epoch": 1.6236871017358823, + "grad_norm": 8.994921684265137, + "learning_rate": 4.579032600513265e-06, + "loss": 3.1231, + "step": 55420 + }, + { + "epoch": 1.6239800776386142, + "grad_norm": 8.634134292602539, + "learning_rate": 4.577456749604819e-06, + "loss": 3.1137, + "step": 55430 + }, + { + "epoch": 1.624273053541346, + "grad_norm": 9.4020414352417, + "learning_rate": 4.575880940969319e-06, + "loss": 3.1182, + "step": 55440 + }, + { + "epoch": 1.6245660294440782, + "grad_norm": 8.916646003723145, + "learning_rate": 4.574305174764413e-06, + "loss": 3.1098, + "step": 55450 + }, + { + "epoch": 1.6248590053468104, + "grad_norm": 9.797038078308105, + "learning_rate": 4.572729451147749e-06, + "loss": 3.1052, + "step": 55460 + }, + { + "epoch": 1.6251519812495423, + "grad_norm": 9.142083168029785, + "learning_rate": 4.5711537702769715e-06, + "loss": 3.1045, + "step": 55470 + }, + { + "epoch": 1.6254449571522742, + "grad_norm": 8.922483444213867, + "learning_rate": 4.569578132309712e-06, + "loss": 3.111, + "step": 55480 + }, + { + "epoch": 1.6257379330550061, + "grad_norm": 8.797048568725586, + "learning_rate": 4.568002537403608e-06, + "loss": 3.1099, + "step": 55490 + }, + { + "epoch": 1.6260309089577383, + "grad_norm": 8.905427932739258, + "learning_rate": 4.566426985716287e-06, + "loss": 3.1211, + "step": 55500 + }, + { + "epoch": 
1.6263238848604702, + "grad_norm": 9.52366828918457, + "learning_rate": 4.564851477405374e-06, + "loss": 3.1119, + "step": 55510 + }, + { + "epoch": 1.6266168607632023, + "grad_norm": 8.83592414855957, + "learning_rate": 4.563276012628489e-06, + "loss": 3.0996, + "step": 55520 + }, + { + "epoch": 1.6269098366659343, + "grad_norm": 9.365194320678711, + "learning_rate": 4.561700591543248e-06, + "loss": 3.1195, + "step": 55530 + }, + { + "epoch": 1.6272028125686662, + "grad_norm": 8.709277153015137, + "learning_rate": 4.560125214307261e-06, + "loss": 3.1137, + "step": 55540 + }, + { + "epoch": 1.627495788471398, + "grad_norm": 9.073613166809082, + "learning_rate": 4.558549881078139e-06, + "loss": 3.0997, + "step": 55550 + }, + { + "epoch": 1.6277887643741302, + "grad_norm": 8.658278465270996, + "learning_rate": 4.5569745920134804e-06, + "loss": 3.1013, + "step": 55560 + }, + { + "epoch": 1.6280817402768624, + "grad_norm": 9.00124454498291, + "learning_rate": 4.555399347270887e-06, + "loss": 3.1197, + "step": 55570 + }, + { + "epoch": 1.6283747161795943, + "grad_norm": 8.986186981201172, + "learning_rate": 4.553824147007949e-06, + "loss": 3.0999, + "step": 55580 + }, + { + "epoch": 1.6286676920823262, + "grad_norm": 9.3468017578125, + "learning_rate": 4.552248991382261e-06, + "loss": 3.1129, + "step": 55590 + }, + { + "epoch": 1.6289606679850581, + "grad_norm": 9.068061828613281, + "learning_rate": 4.550673880551403e-06, + "loss": 3.1052, + "step": 55600 + }, + { + "epoch": 1.6292536438877903, + "grad_norm": 8.323390007019043, + "learning_rate": 4.549098814672958e-06, + "loss": 3.0957, + "step": 55610 + }, + { + "epoch": 1.6295466197905222, + "grad_norm": 9.311666488647461, + "learning_rate": 4.547523793904501e-06, + "loss": 3.1036, + "step": 55620 + }, + { + "epoch": 1.6298395956932543, + "grad_norm": 9.073163986206055, + "learning_rate": 4.545948818403605e-06, + "loss": 3.1159, + "step": 55630 + }, + { + "epoch": 1.6301325715959862, + "grad_norm": 8.342765808105469, 
+ "learning_rate": 4.544373888327835e-06, + "loss": 3.1088, + "step": 55640 + }, + { + "epoch": 1.6304255474987182, + "grad_norm": 8.819703102111816, + "learning_rate": 4.542799003834755e-06, + "loss": 3.0859, + "step": 55650 + }, + { + "epoch": 1.63071852340145, + "grad_norm": 9.014324188232422, + "learning_rate": 4.541224165081921e-06, + "loss": 3.1171, + "step": 55660 + }, + { + "epoch": 1.6310114993041822, + "grad_norm": 8.845229148864746, + "learning_rate": 4.53964937222689e-06, + "loss": 3.0945, + "step": 55670 + }, + { + "epoch": 1.6313044752069144, + "grad_norm": 8.794533729553223, + "learning_rate": 4.538074625427206e-06, + "loss": 3.1081, + "step": 55680 + }, + { + "epoch": 1.6315974511096463, + "grad_norm": 9.219206809997559, + "learning_rate": 4.536499924840416e-06, + "loss": 3.1063, + "step": 55690 + }, + { + "epoch": 1.6318904270123782, + "grad_norm": 9.237812995910645, + "learning_rate": 4.534925270624057e-06, + "loss": 3.1005, + "step": 55700 + }, + { + "epoch": 1.63218340291511, + "grad_norm": 8.5232572555542, + "learning_rate": 4.533350662935669e-06, + "loss": 3.0789, + "step": 55710 + }, + { + "epoch": 1.6324763788178422, + "grad_norm": 9.473200798034668, + "learning_rate": 4.531776101932775e-06, + "loss": 3.1258, + "step": 55720 + }, + { + "epoch": 1.6327693547205744, + "grad_norm": 9.105447769165039, + "learning_rate": 4.530201587772906e-06, + "loss": 3.1131, + "step": 55730 + }, + { + "epoch": 1.6330623306233063, + "grad_norm": 8.913201332092285, + "learning_rate": 4.52862712061358e-06, + "loss": 3.1204, + "step": 55740 + }, + { + "epoch": 1.6333553065260382, + "grad_norm": 9.43756103515625, + "learning_rate": 4.527052700612315e-06, + "loss": 3.104, + "step": 55750 + }, + { + "epoch": 1.6336482824287701, + "grad_norm": 8.907894134521484, + "learning_rate": 4.525478327926621e-06, + "loss": 3.1303, + "step": 55760 + }, + { + "epoch": 1.6339412583315023, + "grad_norm": 9.320003509521484, + "learning_rate": 4.5239040027140055e-06, + "loss": 
3.0999, + "step": 55770 + }, + { + "epoch": 1.6342342342342342, + "grad_norm": 8.691184043884277, + "learning_rate": 4.522329725131968e-06, + "loss": 3.1081, + "step": 55780 + }, + { + "epoch": 1.6345272101369663, + "grad_norm": 9.434588432312012, + "learning_rate": 4.52075549533801e-06, + "loss": 3.1311, + "step": 55790 + }, + { + "epoch": 1.6348201860396983, + "grad_norm": 8.625839233398438, + "learning_rate": 4.51918131348962e-06, + "loss": 3.1254, + "step": 55800 + }, + { + "epoch": 1.6350545667618839, + "eval_bleu": 0.34734387356873675, + "eval_cap_loss": 0.9186502695083618, + "eval_con_loss": 1.1756497621536255, + "eval_loss": 3.2699496746063232, + "step": 55808 + }, + { + "epoch": 1.6350545667618839, + "eval_bleu": 0.34734387356873675, + "eval_cap_loss": 0.9186502695083618, + "eval_con_loss": 1.1756497621536255, + "eval_loss": 3.2699496746063232, + "eval_runtime": 71.7362, + "eval_samples_per_second": 278.799, + "eval_steps_per_second": 0.279, + "step": 55808 + }, + { + "epoch": 1.6351131619424302, + "grad_norm": 9.08464241027832, + "learning_rate": 4.517607179744287e-06, + "loss": 3.1041, + "step": 55810 + }, + { + "epoch": 1.635406137845162, + "grad_norm": 8.831562995910645, + "learning_rate": 4.516033094259494e-06, + "loss": 3.0999, + "step": 55820 + }, + { + "epoch": 1.6356991137478942, + "grad_norm": 9.125024795532227, + "learning_rate": 4.514459057192718e-06, + "loss": 3.1301, + "step": 55830 + }, + { + "epoch": 1.6359920896506264, + "grad_norm": 8.883073806762695, + "learning_rate": 4.512885068701436e-06, + "loss": 3.1026, + "step": 55840 + }, + { + "epoch": 1.6362850655533583, + "grad_norm": 9.509873390197754, + "learning_rate": 4.5113111289431085e-06, + "loss": 3.1041, + "step": 55850 + }, + { + "epoch": 1.6365780414560902, + "grad_norm": 8.803288459777832, + "learning_rate": 4.5097372380752055e-06, + "loss": 3.1171, + "step": 55860 + }, + { + "epoch": 1.6368710173588221, + "grad_norm": 8.750347137451172, + "learning_rate": 4.508163396255182e-06, + 
"loss": 3.0864, + "step": 55870 + }, + { + "epoch": 1.6371639932615543, + "grad_norm": 9.240811347961426, + "learning_rate": 4.506589603640496e-06, + "loss": 3.1142, + "step": 55880 + }, + { + "epoch": 1.6374569691642864, + "grad_norm": 8.365375518798828, + "learning_rate": 4.505015860388589e-06, + "loss": 3.1092, + "step": 55890 + }, + { + "epoch": 1.6377499450670183, + "grad_norm": 9.353466033935547, + "learning_rate": 4.50344216665691e-06, + "loss": 3.1192, + "step": 55900 + }, + { + "epoch": 1.6380429209697502, + "grad_norm": 9.483393669128418, + "learning_rate": 4.501868522602896e-06, + "loss": 3.1, + "step": 55910 + }, + { + "epoch": 1.6383358968724822, + "grad_norm": 9.684605598449707, + "learning_rate": 4.500294928383982e-06, + "loss": 3.1241, + "step": 55920 + }, + { + "epoch": 1.638628872775214, + "grad_norm": 9.307060241699219, + "learning_rate": 4.498721384157595e-06, + "loss": 3.1162, + "step": 55930 + }, + { + "epoch": 1.6389218486779462, + "grad_norm": 8.829492568969727, + "learning_rate": 4.49714789008116e-06, + "loss": 3.095, + "step": 55940 + }, + { + "epoch": 1.6392148245806784, + "grad_norm": 9.568678855895996, + "learning_rate": 4.495574446312094e-06, + "loss": 3.1133, + "step": 55950 + }, + { + "epoch": 1.6395078004834103, + "grad_norm": 9.020308494567871, + "learning_rate": 4.494001053007814e-06, + "loss": 3.1098, + "step": 55960 + }, + { + "epoch": 1.6398007763861422, + "grad_norm": 8.927383422851562, + "learning_rate": 4.492427710325724e-06, + "loss": 3.1309, + "step": 55970 + }, + { + "epoch": 1.640093752288874, + "grad_norm": 8.510459899902344, + "learning_rate": 4.490854418423233e-06, + "loss": 3.0877, + "step": 55980 + }, + { + "epoch": 1.6403867281916062, + "grad_norm": 9.356813430786133, + "learning_rate": 4.489281177457734e-06, + "loss": 3.1126, + "step": 55990 + }, + { + "epoch": 1.6406797040943384, + "grad_norm": 9.120984077453613, + "learning_rate": 4.487707987586627e-06, + "loss": 3.1216, + "step": 56000 + }, + { + "epoch": 
1.6409726799970703, + "grad_norm": 9.449435234069824, + "learning_rate": 4.486134848967292e-06, + "loss": 3.1093, + "step": 56010 + }, + { + "epoch": 1.6412656558998022, + "grad_norm": 8.89202880859375, + "learning_rate": 4.48456176175712e-06, + "loss": 3.123, + "step": 56020 + }, + { + "epoch": 1.6415586318025341, + "grad_norm": 8.250785827636719, + "learning_rate": 4.482988726113484e-06, + "loss": 3.1126, + "step": 56030 + }, + { + "epoch": 1.6418516077052663, + "grad_norm": 8.269648551940918, + "learning_rate": 4.481415742193761e-06, + "loss": 3.101, + "step": 56040 + }, + { + "epoch": 1.6421445836079982, + "grad_norm": 9.499271392822266, + "learning_rate": 4.479842810155313e-06, + "loss": 3.1102, + "step": 56050 + }, + { + "epoch": 1.6424375595107303, + "grad_norm": 8.447493553161621, + "learning_rate": 4.478269930155508e-06, + "loss": 3.0931, + "step": 56060 + }, + { + "epoch": 1.6427305354134623, + "grad_norm": 8.89946460723877, + "learning_rate": 4.4766971023517005e-06, + "loss": 3.1069, + "step": 56070 + }, + { + "epoch": 1.6430235113161942, + "grad_norm": 9.195661544799805, + "learning_rate": 4.475124326901245e-06, + "loss": 3.1066, + "step": 56080 + }, + { + "epoch": 1.643316487218926, + "grad_norm": 8.60022258758545, + "learning_rate": 4.473551603961483e-06, + "loss": 3.1195, + "step": 56090 + }, + { + "epoch": 1.6436094631216582, + "grad_norm": 8.349206924438477, + "learning_rate": 4.471978933689764e-06, + "loss": 3.1168, + "step": 56100 + }, + { + "epoch": 1.6439024390243904, + "grad_norm": 9.013644218444824, + "learning_rate": 4.470406316243416e-06, + "loss": 3.1227, + "step": 56110 + }, + { + "epoch": 1.6441954149271223, + "grad_norm": 8.592193603515625, + "learning_rate": 4.468833751779777e-06, + "loss": 3.0938, + "step": 56120 + }, + { + "epoch": 1.6444883908298542, + "grad_norm": 8.995079040527344, + "learning_rate": 4.4672612404561675e-06, + "loss": 3.1017, + "step": 56130 + }, + { + "epoch": 1.6447813667325861, + "grad_norm": 8.654196739196777, 
+ "learning_rate": 4.465688782429913e-06, + "loss": 3.0975, + "step": 56140 + }, + { + "epoch": 1.6450743426353183, + "grad_norm": 8.689680099487305, + "learning_rate": 4.464116377858324e-06, + "loss": 3.1219, + "step": 56150 + }, + { + "epoch": 1.6453673185380504, + "grad_norm": 9.33357048034668, + "learning_rate": 4.462544026898713e-06, + "loss": 3.105, + "step": 56160 + }, + { + "epoch": 1.6456602944407823, + "grad_norm": 8.696653366088867, + "learning_rate": 4.460971729708382e-06, + "loss": 3.0983, + "step": 56170 + }, + { + "epoch": 1.6459532703435142, + "grad_norm": 8.977508544921875, + "learning_rate": 4.459399486444631e-06, + "loss": 3.1153, + "step": 56180 + }, + { + "epoch": 1.6462462462462462, + "grad_norm": 8.759178161621094, + "learning_rate": 4.457827297264757e-06, + "loss": 3.0987, + "step": 56190 + }, + { + "epoch": 1.6465392221489783, + "grad_norm": 8.771377563476562, + "learning_rate": 4.456255162326043e-06, + "loss": 3.1113, + "step": 56200 + }, + { + "epoch": 1.6468321980517102, + "grad_norm": 10.03372573852539, + "learning_rate": 4.454683081785775e-06, + "loss": 3.0971, + "step": 56210 + }, + { + "epoch": 1.6471251739544424, + "grad_norm": 9.112407684326172, + "learning_rate": 4.4531110558012285e-06, + "loss": 3.1103, + "step": 56220 + }, + { + "epoch": 1.6474181498571743, + "grad_norm": 8.93600845336914, + "learning_rate": 4.451539084529679e-06, + "loss": 3.0935, + "step": 56230 + }, + { + "epoch": 1.6477111257599062, + "grad_norm": 8.834790229797363, + "learning_rate": 4.449967168128387e-06, + "loss": 3.1044, + "step": 56240 + }, + { + "epoch": 1.648004101662638, + "grad_norm": 8.676046371459961, + "learning_rate": 4.4483953067546184e-06, + "loss": 3.0942, + "step": 56250 + }, + { + "epoch": 1.6482970775653702, + "grad_norm": 8.993964195251465, + "learning_rate": 4.446823500565625e-06, + "loss": 3.1162, + "step": 56260 + }, + { + "epoch": 1.6485900534681024, + "grad_norm": 9.031542778015137, + "learning_rate": 4.445251749718662e-06, + "loss": 
3.1054, + "step": 56270 + }, + { + "epoch": 1.6488830293708343, + "grad_norm": 8.672524452209473, + "learning_rate": 4.443680054370968e-06, + "loss": 3.1052, + "step": 56280 + }, + { + "epoch": 1.6491760052735662, + "grad_norm": 9.013548851013184, + "learning_rate": 4.442108414679784e-06, + "loss": 3.1054, + "step": 56290 + }, + { + "epoch": 1.6494689811762981, + "grad_norm": 9.19996166229248, + "learning_rate": 4.440536830802342e-06, + "loss": 3.1034, + "step": 56300 + }, + { + "epoch": 1.6497619570790303, + "grad_norm": 9.440655708312988, + "learning_rate": 4.438965302895874e-06, + "loss": 3.0808, + "step": 56310 + }, + { + "epoch": 1.6500549329817622, + "grad_norm": 8.722078323364258, + "learning_rate": 4.437393831117596e-06, + "loss": 3.1175, + "step": 56320 + }, + { + "epoch": 1.6500549329817622, + "eval_bleu": 0.34715274914019706, + "eval_cap_loss": 0.9183895587921143, + "eval_con_loss": 1.1743032932281494, + "eval_loss": 3.266996145248413, + "step": 56320 + }, + { + "epoch": 1.6500549329817622, + "eval_bleu": 0.34715274914019706, + "eval_cap_loss": 0.9183895587921143, + "eval_con_loss": 1.1743032932281494, + "eval_loss": 3.266996145248413, + "eval_runtime": 55.8482, + "eval_samples_per_second": 358.113, + "eval_steps_per_second": 0.358, + "step": 56320 + }, + { + "epoch": 1.6503479088844943, + "grad_norm": 8.990741729736328, + "learning_rate": 4.435822415624729e-06, + "loss": 3.1226, + "step": 56330 + }, + { + "epoch": 1.6506408847872263, + "grad_norm": 9.315949440002441, + "learning_rate": 4.434251056574481e-06, + "loss": 3.0917, + "step": 56340 + }, + { + "epoch": 1.6509338606899582, + "grad_norm": 8.33463191986084, + "learning_rate": 4.43267975412406e-06, + "loss": 3.0868, + "step": 56350 + }, + { + "epoch": 1.65122683659269, + "grad_norm": 8.731338500976562, + "learning_rate": 4.431108508430662e-06, + "loss": 3.0905, + "step": 56360 + }, + { + "epoch": 1.6515198124954222, + "grad_norm": 8.861903190612793, + "learning_rate": 4.4295373196514844e-06, + 
"loss": 3.1221, + "step": 56370 + }, + { + "epoch": 1.6518127883981544, + "grad_norm": 9.127852439880371, + "learning_rate": 4.427966187943712e-06, + "loss": 3.0927, + "step": 56380 + }, + { + "epoch": 1.6521057643008863, + "grad_norm": 8.947826385498047, + "learning_rate": 4.42639511346453e-06, + "loss": 3.1343, + "step": 56390 + }, + { + "epoch": 1.6523987402036182, + "grad_norm": 10.0328950881958, + "learning_rate": 4.424824096371113e-06, + "loss": 3.1055, + "step": 56400 + }, + { + "epoch": 1.6526917161063501, + "grad_norm": 9.085613250732422, + "learning_rate": 4.4232531368206325e-06, + "loss": 3.1112, + "step": 56410 + }, + { + "epoch": 1.6529846920090823, + "grad_norm": 9.240286827087402, + "learning_rate": 4.4216822349702535e-06, + "loss": 3.0945, + "step": 56420 + }, + { + "epoch": 1.6532776679118144, + "grad_norm": 8.663528442382812, + "learning_rate": 4.420111390977138e-06, + "loss": 3.0926, + "step": 56430 + }, + { + "epoch": 1.6535706438145463, + "grad_norm": 8.975990295410156, + "learning_rate": 4.4185406049984355e-06, + "loss": 3.0771, + "step": 56440 + }, + { + "epoch": 1.6538636197172782, + "grad_norm": 9.561355590820312, + "learning_rate": 4.416969877191297e-06, + "loss": 3.098, + "step": 56450 + }, + { + "epoch": 1.6541565956200102, + "grad_norm": 8.945834159851074, + "learning_rate": 4.4153992077128615e-06, + "loss": 3.1232, + "step": 56460 + }, + { + "epoch": 1.6544495715227423, + "grad_norm": 9.002303123474121, + "learning_rate": 4.41382859672027e-06, + "loss": 3.1039, + "step": 56470 + }, + { + "epoch": 1.6547425474254742, + "grad_norm": 8.356863975524902, + "learning_rate": 4.412258044370647e-06, + "loss": 3.1112, + "step": 56480 + }, + { + "epoch": 1.6550355233282064, + "grad_norm": 8.325749397277832, + "learning_rate": 4.4106875508211204e-06, + "loss": 3.1057, + "step": 56490 + }, + { + "epoch": 1.6553284992309383, + "grad_norm": 9.084931373596191, + "learning_rate": 4.409117116228808e-06, + "loss": 3.1138, + "step": 56500 + }, + { + 
"epoch": 1.6556214751336702, + "grad_norm": 9.279823303222656, + "learning_rate": 4.407546740750825e-06, + "loss": 3.1162, + "step": 56510 + }, + { + "epoch": 1.6559144510364021, + "grad_norm": 8.901307106018066, + "learning_rate": 4.405976424544272e-06, + "loss": 3.1099, + "step": 56520 + }, + { + "epoch": 1.6562074269391343, + "grad_norm": 9.391672134399414, + "learning_rate": 4.4044061677662545e-06, + "loss": 3.103, + "step": 56530 + }, + { + "epoch": 1.6565004028418664, + "grad_norm": 9.2474365234375, + "learning_rate": 4.402835970573867e-06, + "loss": 3.1034, + "step": 56540 + }, + { + "epoch": 1.6567933787445983, + "grad_norm": 9.102275848388672, + "learning_rate": 4.401265833124197e-06, + "loss": 3.0951, + "step": 56550 + }, + { + "epoch": 1.6570863546473302, + "grad_norm": 8.53316593170166, + "learning_rate": 4.399695755574329e-06, + "loss": 3.1006, + "step": 56560 + }, + { + "epoch": 1.6573793305500621, + "grad_norm": 8.831528663635254, + "learning_rate": 4.398125738081339e-06, + "loss": 3.0879, + "step": 56570 + }, + { + "epoch": 1.6576723064527943, + "grad_norm": 8.784735679626465, + "learning_rate": 4.396555780802298e-06, + "loss": 3.1063, + "step": 56580 + }, + { + "epoch": 1.6579652823555262, + "grad_norm": 8.81678295135498, + "learning_rate": 4.394985883894271e-06, + "loss": 3.0862, + "step": 56590 + }, + { + "epoch": 1.6582582582582583, + "grad_norm": 9.463736534118652, + "learning_rate": 4.393416047514318e-06, + "loss": 3.1089, + "step": 56600 + }, + { + "epoch": 1.6585512341609903, + "grad_norm": 8.193624496459961, + "learning_rate": 4.39184627181949e-06, + "loss": 3.1042, + "step": 56610 + }, + { + "epoch": 1.6588442100637222, + "grad_norm": 9.258109092712402, + "learning_rate": 4.390276556966834e-06, + "loss": 3.1178, + "step": 56620 + }, + { + "epoch": 1.659137185966454, + "grad_norm": 8.91843032836914, + "learning_rate": 4.388706903113391e-06, + "loss": 3.1076, + "step": 56630 + }, + { + "epoch": 1.6594301618691862, + "grad_norm": 
8.552559852600098, + "learning_rate": 4.387137310416197e-06, + "loss": 3.1151, + "step": 56640 + }, + { + "epoch": 1.6597231377719184, + "grad_norm": 8.916926383972168, + "learning_rate": 4.385567779032278e-06, + "loss": 3.1345, + "step": 56650 + }, + { + "epoch": 1.6600161136746503, + "grad_norm": 8.583518028259277, + "learning_rate": 4.383998309118657e-06, + "loss": 3.1062, + "step": 56660 + }, + { + "epoch": 1.6603090895773822, + "grad_norm": 9.046131134033203, + "learning_rate": 4.3824289008323505e-06, + "loss": 3.0971, + "step": 56670 + }, + { + "epoch": 1.6606020654801141, + "grad_norm": 8.765929222106934, + "learning_rate": 4.38085955433037e-06, + "loss": 3.119, + "step": 56680 + }, + { + "epoch": 1.6608950413828463, + "grad_norm": 8.526586532592773, + "learning_rate": 4.379447195433949e-06, + "loss": 3.0995, + "step": 56690 + }, + { + "epoch": 1.6611880172855784, + "grad_norm": 8.824836730957031, + "learning_rate": 4.377877966754724e-06, + "loss": 3.0989, + "step": 56700 + }, + { + "epoch": 1.6614809931883103, + "grad_norm": 9.485621452331543, + "learning_rate": 4.376308800315118e-06, + "loss": 3.1041, + "step": 56710 + }, + { + "epoch": 1.6617739690910422, + "grad_norm": 9.092209815979004, + "learning_rate": 4.374739696272118e-06, + "loss": 3.1195, + "step": 56720 + }, + { + "epoch": 1.6620669449937742, + "grad_norm": 8.925345420837402, + "learning_rate": 4.3731706547826995e-06, + "loss": 3.1201, + "step": 56730 + }, + { + "epoch": 1.6623599208965063, + "grad_norm": 8.91881275177002, + "learning_rate": 4.371601676003839e-06, + "loss": 3.1005, + "step": 56740 + }, + { + "epoch": 1.6626528967992382, + "grad_norm": 8.478368759155273, + "learning_rate": 4.370032760092501e-06, + "loss": 3.0797, + "step": 56750 + }, + { + "epoch": 1.6629458727019704, + "grad_norm": 9.568140029907227, + "learning_rate": 4.368463907205651e-06, + "loss": 3.0771, + "step": 56760 + }, + { + "epoch": 1.6632388486047023, + "grad_norm": 9.68337345123291, + "learning_rate": 
4.366895117500239e-06, + "loss": 3.0899, + "step": 56770 + }, + { + "epoch": 1.6635318245074342, + "grad_norm": 8.716207504272461, + "learning_rate": 4.365326391133214e-06, + "loss": 3.111, + "step": 56780 + }, + { + "epoch": 1.6638248004101661, + "grad_norm": 8.566407203674316, + "learning_rate": 4.363757728261518e-06, + "loss": 3.0767, + "step": 56790 + }, + { + "epoch": 1.6641177763128983, + "grad_norm": 8.698427200317383, + "learning_rate": 4.362189129042088e-06, + "loss": 3.1067, + "step": 56800 + }, + { + "epoch": 1.6644107522156304, + "grad_norm": 8.045969009399414, + "learning_rate": 4.36062059363185e-06, + "loss": 3.0933, + "step": 56810 + }, + { + "epoch": 1.6647037281183623, + "grad_norm": 7.9251627922058105, + "learning_rate": 4.359052122187729e-06, + "loss": 3.0838, + "step": 56820 + }, + { + "epoch": 1.6649967040210942, + "grad_norm": 9.22426700592041, + "learning_rate": 4.3574837148666394e-06, + "loss": 3.1104, + "step": 56830 + }, + { + "epoch": 1.6650552992016405, + "eval_bleu": 0.3477126915896502, + "eval_cap_loss": 0.9173703193664551, + "eval_con_loss": 1.172691822052002, + "eval_loss": 3.262754201889038, + "step": 56832 + }, + { + "epoch": 1.6650552992016405, + "eval_bleu": 0.3477126915896502, + "eval_cap_loss": 0.9173703193664551, + "eval_con_loss": 1.172691822052002, + "eval_loss": 3.262754201889038, + "eval_runtime": 58.8975, + "eval_samples_per_second": 339.573, + "eval_steps_per_second": 0.34, + "step": 56832 + }, + { + "epoch": 1.6652896799238261, + "grad_norm": 9.27273941040039, + "learning_rate": 4.355915371825495e-06, + "loss": 3.0878, + "step": 56840 + }, + { + "epoch": 1.6655826558265583, + "grad_norm": 9.171077728271484, + "learning_rate": 4.354347093221194e-06, + "loss": 3.1032, + "step": 56850 + }, + { + "epoch": 1.6658756317292904, + "grad_norm": 8.688098907470703, + "learning_rate": 4.352778879210635e-06, + "loss": 3.1166, + "step": 56860 + }, + { + "epoch": 1.6661686076320223, + "grad_norm": 8.864920616149902, + "learning_rate": 
4.351210729950711e-06, + "loss": 3.0839, + "step": 56870 + }, + { + "epoch": 1.6664615835347543, + "grad_norm": 9.47545337677002, + "learning_rate": 4.349642645598304e-06, + "loss": 3.1178, + "step": 56880 + }, + { + "epoch": 1.6667545594374862, + "grad_norm": 9.324642181396484, + "learning_rate": 4.348074626310293e-06, + "loss": 3.1106, + "step": 56890 + }, + { + "epoch": 1.667047535340218, + "grad_norm": 7.876544952392578, + "learning_rate": 4.346506672243546e-06, + "loss": 3.0922, + "step": 56900 + }, + { + "epoch": 1.6673405112429502, + "grad_norm": 9.172675132751465, + "learning_rate": 4.3449387835549305e-06, + "loss": 3.0906, + "step": 56910 + }, + { + "epoch": 1.6676334871456824, + "grad_norm": 8.584565162658691, + "learning_rate": 4.343370960401303e-06, + "loss": 3.1139, + "step": 56920 + }, + { + "epoch": 1.6679264630484143, + "grad_norm": 8.971969604492188, + "learning_rate": 4.341803202939516e-06, + "loss": 3.0844, + "step": 56930 + }, + { + "epoch": 1.6682194389511462, + "grad_norm": 8.680039405822754, + "learning_rate": 4.340235511326413e-06, + "loss": 3.0976, + "step": 56940 + }, + { + "epoch": 1.6685124148538781, + "grad_norm": 8.659380912780762, + "learning_rate": 4.338667885718833e-06, + "loss": 3.0854, + "step": 56950 + }, + { + "epoch": 1.6688053907566103, + "grad_norm": 8.440585136413574, + "learning_rate": 4.337100326273607e-06, + "loss": 3.0626, + "step": 56960 + }, + { + "epoch": 1.6690983666593424, + "grad_norm": 9.095687866210938, + "learning_rate": 4.335532833147562e-06, + "loss": 3.0959, + "step": 56970 + }, + { + "epoch": 1.6693913425620743, + "grad_norm": 8.809881210327148, + "learning_rate": 4.333965406497513e-06, + "loss": 3.0858, + "step": 56980 + }, + { + "epoch": 1.6696843184648062, + "grad_norm": 8.542991638183594, + "learning_rate": 4.332398046480275e-06, + "loss": 3.0972, + "step": 56990 + }, + { + "epoch": 1.6699772943675382, + "grad_norm": 8.634902000427246, + "learning_rate": 4.33083075325265e-06, + "loss": 3.0874, + "step": 
57000 + }, + { + "epoch": 1.6702702702702703, + "grad_norm": 9.332106590270996, + "learning_rate": 4.32926352697144e-06, + "loss": 3.0869, + "step": 57010 + }, + { + "epoch": 1.6705632461730022, + "grad_norm": 8.531357765197754, + "learning_rate": 4.327696367793434e-06, + "loss": 3.0879, + "step": 57020 + }, + { + "epoch": 1.6708562220757344, + "grad_norm": 9.200474739074707, + "learning_rate": 4.326129275875417e-06, + "loss": 3.0895, + "step": 57030 + }, + { + "epoch": 1.6711491979784663, + "grad_norm": 8.7991361618042, + "learning_rate": 4.324562251374168e-06, + "loss": 3.0952, + "step": 57040 + }, + { + "epoch": 1.6714421738811982, + "grad_norm": 9.385889053344727, + "learning_rate": 4.322995294446461e-06, + "loss": 3.1087, + "step": 57050 + }, + { + "epoch": 1.6717351497839301, + "grad_norm": 9.304838180541992, + "learning_rate": 4.321428405249056e-06, + "loss": 3.0758, + "step": 57060 + }, + { + "epoch": 1.6720281256866623, + "grad_norm": 8.99569034576416, + "learning_rate": 4.319861583938714e-06, + "loss": 3.1212, + "step": 57070 + }, + { + "epoch": 1.6723211015893944, + "grad_norm": 8.860766410827637, + "learning_rate": 4.318294830672184e-06, + "loss": 3.1084, + "step": 57080 + }, + { + "epoch": 1.6726140774921263, + "grad_norm": 9.05104923248291, + "learning_rate": 4.316728145606214e-06, + "loss": 3.0975, + "step": 57090 + }, + { + "epoch": 1.6729070533948582, + "grad_norm": 9.7015962600708, + "learning_rate": 4.315161528897538e-06, + "loss": 3.0957, + "step": 57100 + }, + { + "epoch": 1.6732000292975902, + "grad_norm": 8.838701248168945, + "learning_rate": 4.313594980702888e-06, + "loss": 3.0965, + "step": 57110 + }, + { + "epoch": 1.6734930052003223, + "grad_norm": 9.465991973876953, + "learning_rate": 4.312028501178988e-06, + "loss": 3.0943, + "step": 57120 + }, + { + "epoch": 1.6737859811030544, + "grad_norm": 9.513385772705078, + "learning_rate": 4.310462090482556e-06, + "loss": 3.0948, + "step": 57130 + }, + { + "epoch": 1.6740789570057864, + 
"grad_norm": 9.256674766540527, + "learning_rate": 4.308895748770299e-06, + "loss": 3.0917, + "step": 57140 + }, + { + "epoch": 1.6743719329085183, + "grad_norm": 9.186577796936035, + "learning_rate": 4.307329476198924e-06, + "loss": 3.1048, + "step": 57150 + }, + { + "epoch": 1.6746649088112502, + "grad_norm": 8.370290756225586, + "learning_rate": 4.305763272925123e-06, + "loss": 3.0737, + "step": 57160 + }, + { + "epoch": 1.6749578847139823, + "grad_norm": 8.466390609741211, + "learning_rate": 4.304197139105591e-06, + "loss": 3.0991, + "step": 57170 + }, + { + "epoch": 1.6752508606167142, + "grad_norm": 8.912835121154785, + "learning_rate": 4.3026310748970045e-06, + "loss": 3.1091, + "step": 57180 + }, + { + "epoch": 1.6755438365194464, + "grad_norm": 9.447407722473145, + "learning_rate": 4.301065080456043e-06, + "loss": 3.1178, + "step": 57190 + }, + { + "epoch": 1.6758368124221783, + "grad_norm": 8.684920310974121, + "learning_rate": 4.299499155939373e-06, + "loss": 3.0992, + "step": 57200 + }, + { + "epoch": 1.6761297883249102, + "grad_norm": 9.279104232788086, + "learning_rate": 4.297933301503657e-06, + "loss": 3.0945, + "step": 57210 + }, + { + "epoch": 1.6764227642276421, + "grad_norm": 8.926016807556152, + "learning_rate": 4.296367517305548e-06, + "loss": 3.0971, + "step": 57220 + }, + { + "epoch": 1.6767157401303743, + "grad_norm": 8.536849975585938, + "learning_rate": 4.294801803501695e-06, + "loss": 3.0987, + "step": 57230 + }, + { + "epoch": 1.6770087160331064, + "grad_norm": 8.977547645568848, + "learning_rate": 4.293236160248738e-06, + "loss": 3.0865, + "step": 57240 + }, + { + "epoch": 1.6773016919358383, + "grad_norm": 9.404254913330078, + "learning_rate": 4.2916705877033085e-06, + "loss": 3.1302, + "step": 57250 + }, + { + "epoch": 1.6775946678385703, + "grad_norm": 8.61915397644043, + "learning_rate": 4.290105086022037e-06, + "loss": 3.079, + "step": 57260 + }, + { + "epoch": 1.6778876437413022, + "grad_norm": 8.84057331085205, + "learning_rate": 
4.288539655361538e-06, + "loss": 3.1026, + "step": 57270 + }, + { + "epoch": 1.6781806196440343, + "grad_norm": 9.294628143310547, + "learning_rate": 4.286974295878427e-06, + "loss": 3.1052, + "step": 57280 + }, + { + "epoch": 1.6784735955467662, + "grad_norm": 8.410283088684082, + "learning_rate": 4.285409007729306e-06, + "loss": 3.0828, + "step": 57290 + }, + { + "epoch": 1.6787665714494984, + "grad_norm": 8.53775405883789, + "learning_rate": 4.283843791070776e-06, + "loss": 3.0843, + "step": 57300 + }, + { + "epoch": 1.6790595473522303, + "grad_norm": 8.971404075622559, + "learning_rate": 4.282278646059424e-06, + "loss": 3.0854, + "step": 57310 + }, + { + "epoch": 1.6793525232549622, + "grad_norm": 9.022726058959961, + "learning_rate": 4.2807135728518365e-06, + "loss": 3.1077, + "step": 57320 + }, + { + "epoch": 1.6796454991576941, + "grad_norm": 8.603121757507324, + "learning_rate": 4.279148571604588e-06, + "loss": 3.079, + "step": 57330 + }, + { + "epoch": 1.6799384750604263, + "grad_norm": 8.77421760559082, + "learning_rate": 4.27758364247425e-06, + "loss": 3.0705, + "step": 57340 + }, + { + "epoch": 1.680055665421519, + "eval_bleu": 0.3477099364082082, + "eval_cap_loss": 0.9168689250946045, + "eval_con_loss": 1.1720480918884277, + "eval_loss": 3.260964870452881, + "step": 57344 + }, + { + "epoch": 1.680055665421519, + "eval_bleu": 0.3477099364082082, + "eval_cap_loss": 0.9168689250946045, + "eval_con_loss": 1.1720480918884277, + "eval_loss": 3.260964870452881, + "eval_runtime": 53.1006, + "eval_samples_per_second": 376.643, + "eval_steps_per_second": 0.377, + "step": 57344 + }, + { + "epoch": 1.6802314509631584, + "grad_norm": 8.955487251281738, + "learning_rate": 4.27601878561738e-06, + "loss": 3.0928, + "step": 57350 + }, + { + "epoch": 1.6805244268658903, + "grad_norm": 8.836312294006348, + "learning_rate": 4.2744540011905365e-06, + "loss": 3.0939, + "step": 57360 + }, + { + "epoch": 1.6808174027686222, + "grad_norm": 9.554643630981445, + "learning_rate": 
4.272889289350265e-06, + "loss": 3.1007, + "step": 57370 + }, + { + "epoch": 1.6811103786713542, + "grad_norm": 9.356534957885742, + "learning_rate": 4.271324650253108e-06, + "loss": 3.093, + "step": 57380 + }, + { + "epoch": 1.6814033545740863, + "grad_norm": 9.396164894104004, + "learning_rate": 4.269760084055596e-06, + "loss": 3.1115, + "step": 57390 + }, + { + "epoch": 1.6816963304768184, + "grad_norm": 8.74732494354248, + "learning_rate": 4.268195590914254e-06, + "loss": 3.0817, + "step": 57400 + }, + { + "epoch": 1.6819893063795504, + "grad_norm": 9.471535682678223, + "learning_rate": 4.266631170985602e-06, + "loss": 3.109, + "step": 57410 + }, + { + "epoch": 1.6822822822822823, + "grad_norm": 9.002546310424805, + "learning_rate": 4.2650668244261516e-06, + "loss": 3.096, + "step": 57420 + }, + { + "epoch": 1.6825752581850142, + "grad_norm": 9.498472213745117, + "learning_rate": 4.263502551392404e-06, + "loss": 3.0823, + "step": 57430 + }, + { + "epoch": 1.6828682340877463, + "grad_norm": 9.281999588012695, + "learning_rate": 4.261938352040857e-06, + "loss": 3.0856, + "step": 57440 + }, + { + "epoch": 1.6831612099904782, + "grad_norm": 9.03374195098877, + "learning_rate": 4.260374226527998e-06, + "loss": 3.0942, + "step": 57450 + }, + { + "epoch": 1.6834541858932104, + "grad_norm": 8.489023208618164, + "learning_rate": 4.2588101750103126e-06, + "loss": 3.0879, + "step": 57460 + }, + { + "epoch": 1.6837471617959423, + "grad_norm": 8.754354476928711, + "learning_rate": 4.257246197644269e-06, + "loss": 3.0795, + "step": 57470 + }, + { + "epoch": 1.6840401376986742, + "grad_norm": 8.81434154510498, + "learning_rate": 4.255682294586338e-06, + "loss": 3.1016, + "step": 57480 + }, + { + "epoch": 1.6843331136014061, + "grad_norm": 8.771100997924805, + "learning_rate": 4.254118465992976e-06, + "loss": 3.0657, + "step": 57490 + }, + { + "epoch": 1.6846260895041383, + "grad_norm": 8.384522438049316, + "learning_rate": 4.252554712020639e-06, + "loss": 3.0881, + "step": 
57500 + }, + { + "epoch": 1.6849190654068704, + "grad_norm": 8.319351196289062, + "learning_rate": 4.250991032825767e-06, + "loss": 3.0776, + "step": 57510 + }, + { + "epoch": 1.6852120413096023, + "grad_norm": 8.86658763885498, + "learning_rate": 4.2494274285647985e-06, + "loss": 3.0955, + "step": 57520 + }, + { + "epoch": 1.6855050172123343, + "grad_norm": 8.664041519165039, + "learning_rate": 4.247863899394162e-06, + "loss": 3.1057, + "step": 57530 + }, + { + "epoch": 1.6857979931150662, + "grad_norm": 9.024576187133789, + "learning_rate": 4.246300445470282e-06, + "loss": 3.0892, + "step": 57540 + }, + { + "epoch": 1.6860909690177983, + "grad_norm": 9.074055671691895, + "learning_rate": 4.2447370669495694e-06, + "loss": 3.1022, + "step": 57550 + }, + { + "epoch": 1.6863839449205302, + "grad_norm": 8.997655868530273, + "learning_rate": 4.243173763988433e-06, + "loss": 3.0769, + "step": 57560 + }, + { + "epoch": 1.6866769208232624, + "grad_norm": 8.860915184020996, + "learning_rate": 4.24161053674327e-06, + "loss": 3.0956, + "step": 57570 + }, + { + "epoch": 1.6869698967259943, + "grad_norm": 8.769539833068848, + "learning_rate": 4.240047385370475e-06, + "loss": 3.1091, + "step": 57580 + }, + { + "epoch": 1.6872628726287262, + "grad_norm": 8.932976722717285, + "learning_rate": 4.238484310026431e-06, + "loss": 3.0709, + "step": 57590 + }, + { + "epoch": 1.6875558485314581, + "grad_norm": 9.070908546447754, + "learning_rate": 4.236921310867512e-06, + "loss": 3.0857, + "step": 57600 + }, + { + "epoch": 1.6878488244341903, + "grad_norm": 9.106833457946777, + "learning_rate": 4.2353583880500905e-06, + "loss": 3.1168, + "step": 57610 + }, + { + "epoch": 1.6881418003369224, + "grad_norm": 9.479940414428711, + "learning_rate": 4.233795541730523e-06, + "loss": 3.128, + "step": 57620 + }, + { + "epoch": 1.6884347762396543, + "grad_norm": 9.081696510314941, + "learning_rate": 4.2322327720651705e-06, + "loss": 3.0891, + "step": 57630 + }, + { + "epoch": 1.6887277521423862, + 
"grad_norm": 8.81502628326416, + "learning_rate": 4.23067007921037e-06, + "loss": 3.0815, + "step": 57640 + }, + { + "epoch": 1.6890207280451182, + "grad_norm": 8.46205997467041, + "learning_rate": 4.229107463322466e-06, + "loss": 3.1045, + "step": 57650 + }, + { + "epoch": 1.6893137039478503, + "grad_norm": 8.932291984558105, + "learning_rate": 4.227544924557785e-06, + "loss": 3.0713, + "step": 57660 + }, + { + "epoch": 1.6896066798505824, + "grad_norm": 9.562417984008789, + "learning_rate": 4.225982463072654e-06, + "loss": 3.085, + "step": 57670 + }, + { + "epoch": 1.6898996557533144, + "grad_norm": 9.296741485595703, + "learning_rate": 4.224420079023383e-06, + "loss": 3.1068, + "step": 57680 + }, + { + "epoch": 1.6901926316560463, + "grad_norm": 8.602065086364746, + "learning_rate": 4.2228577725662826e-06, + "loss": 3.098, + "step": 57690 + }, + { + "epoch": 1.6904856075587782, + "grad_norm": 9.327439308166504, + "learning_rate": 4.221295543857651e-06, + "loss": 3.0998, + "step": 57700 + }, + { + "epoch": 1.6907785834615103, + "grad_norm": 8.597208976745605, + "learning_rate": 4.219733393053782e-06, + "loss": 3.0862, + "step": 57710 + }, + { + "epoch": 1.6910715593642422, + "grad_norm": 8.715561866760254, + "learning_rate": 4.218171320310956e-06, + "loss": 3.1049, + "step": 57720 + }, + { + "epoch": 1.6913645352669744, + "grad_norm": 8.793405532836914, + "learning_rate": 4.216609325785452e-06, + "loss": 3.0868, + "step": 57730 + }, + { + "epoch": 1.6916575111697063, + "grad_norm": 8.972160339355469, + "learning_rate": 4.2150474096335356e-06, + "loss": 3.0775, + "step": 57740 + }, + { + "epoch": 1.6919504870724382, + "grad_norm": 9.146408081054688, + "learning_rate": 4.213485572011472e-06, + "loss": 3.1112, + "step": 57750 + }, + { + "epoch": 1.6922434629751701, + "grad_norm": 9.033902168273926, + "learning_rate": 4.211923813075508e-06, + "loss": 3.0914, + "step": 57760 + }, + { + "epoch": 1.6925364388779023, + "grad_norm": 9.036552429199219, + "learning_rate": 
4.210362132981892e-06, + "loss": 3.0753, + "step": 57770 + }, + { + "epoch": 1.6928294147806344, + "grad_norm": 8.691794395446777, + "learning_rate": 4.208800531886859e-06, + "loss": 3.0956, + "step": 57780 + }, + { + "epoch": 1.6931223906833663, + "grad_norm": 8.643278121948242, + "learning_rate": 4.207239009946641e-06, + "loss": 3.0831, + "step": 57790 + }, + { + "epoch": 1.6934153665860983, + "grad_norm": 8.855677604675293, + "learning_rate": 4.205677567317455e-06, + "loss": 3.1202, + "step": 57800 + }, + { + "epoch": 1.6937083424888302, + "grad_norm": 9.578110694885254, + "learning_rate": 4.204116204155516e-06, + "loss": 3.1345, + "step": 57810 + }, + { + "epoch": 1.6940013183915623, + "grad_norm": 8.757585525512695, + "learning_rate": 4.202554920617029e-06, + "loss": 3.0943, + "step": 57820 + }, + { + "epoch": 1.6942942942942945, + "grad_norm": 8.616838455200195, + "learning_rate": 4.200993716858192e-06, + "loss": 3.0872, + "step": 57830 + }, + { + "epoch": 1.6945872701970264, + "grad_norm": 8.756989479064941, + "learning_rate": 4.199432593035192e-06, + "loss": 3.0923, + "step": 57840 + }, + { + "epoch": 1.6948802460997583, + "grad_norm": 9.004810333251953, + "learning_rate": 4.197871549304212e-06, + "loss": 3.1015, + "step": 57850 + }, + { + "epoch": 1.6950560316413976, + "eval_bleu": 0.34833823233466976, + "eval_cap_loss": 0.9157131910324097, + "eval_con_loss": 1.1687355041503906, + "eval_loss": 3.2531840801239014, + "step": 57856 + }, + { + "epoch": 1.6950560316413976, + "eval_bleu": 0.34833823233466976, + "eval_cap_loss": 0.9157131910324097, + "eval_con_loss": 1.1687355041503906, + "eval_loss": 3.2531840801239014, + "eval_runtime": 55.7527, + "eval_samples_per_second": 358.727, + "eval_steps_per_second": 0.359, + "step": 57856 + }, + { + "epoch": 1.6951732220024902, + "grad_norm": 9.255110740661621, + "learning_rate": 4.1963105858214235e-06, + "loss": 3.09, + "step": 57860 + }, + { + "epoch": 1.6954661979052223, + "grad_norm": 8.54899787902832, + 
"learning_rate": 4.194749702742996e-06, + "loss": 3.0944, + "step": 57870 + }, + { + "epoch": 1.6957591738079543, + "grad_norm": 8.778107643127441, + "learning_rate": 4.19318890022508e-06, + "loss": 3.0695, + "step": 57880 + }, + { + "epoch": 1.6960521497106864, + "grad_norm": 8.5987548828125, + "learning_rate": 4.191628178423829e-06, + "loss": 3.0942, + "step": 57890 + }, + { + "epoch": 1.6963451256134183, + "grad_norm": 9.51515007019043, + "learning_rate": 4.190067537495382e-06, + "loss": 3.0779, + "step": 57900 + }, + { + "epoch": 1.6966381015161502, + "grad_norm": 8.827274322509766, + "learning_rate": 4.188506977595875e-06, + "loss": 3.0961, + "step": 57910 + }, + { + "epoch": 1.6969310774188822, + "grad_norm": 9.760221481323242, + "learning_rate": 4.186946498881429e-06, + "loss": 3.1008, + "step": 57920 + }, + { + "epoch": 1.6972240533216143, + "grad_norm": 9.313610076904297, + "learning_rate": 4.185386101508162e-06, + "loss": 3.0826, + "step": 57930 + }, + { + "epoch": 1.6975170292243464, + "grad_norm": 8.27818489074707, + "learning_rate": 4.1838257856321825e-06, + "loss": 3.0938, + "step": 57940 + }, + { + "epoch": 1.6978100051270784, + "grad_norm": 8.48028564453125, + "learning_rate": 4.18226555140959e-06, + "loss": 3.093, + "step": 57950 + }, + { + "epoch": 1.6981029810298103, + "grad_norm": 8.830694198608398, + "learning_rate": 4.180705398996481e-06, + "loss": 3.0713, + "step": 57960 + }, + { + "epoch": 1.6983959569325422, + "grad_norm": 8.8165283203125, + "learning_rate": 4.179145328548931e-06, + "loss": 3.0817, + "step": 57970 + }, + { + "epoch": 1.6986889328352743, + "grad_norm": 8.416308403015137, + "learning_rate": 4.177585340223026e-06, + "loss": 3.0841, + "step": 57980 + }, + { + "epoch": 1.6989819087380063, + "grad_norm": 8.905762672424316, + "learning_rate": 4.176025434174826e-06, + "loss": 3.0749, + "step": 57990 + }, + { + "epoch": 1.6992748846407384, + "grad_norm": 8.66970443725586, + "learning_rate": 4.1744656105603944e-06, + "loss": 3.091, + 
"step": 58000 + }, + { + "epoch": 1.6995678605434703, + "grad_norm": 8.98485279083252, + "learning_rate": 4.1729058695357775e-06, + "loss": 3.1186, + "step": 58010 + }, + { + "epoch": 1.6998608364462022, + "grad_norm": 9.28170394897461, + "learning_rate": 4.171346211257025e-06, + "loss": 3.0999, + "step": 58020 + }, + { + "epoch": 1.7001538123489341, + "grad_norm": 8.890033721923828, + "learning_rate": 4.169786635880166e-06, + "loss": 3.1047, + "step": 58030 + }, + { + "epoch": 1.7004467882516663, + "grad_norm": 8.79651165008545, + "learning_rate": 4.16822714356123e-06, + "loss": 3.0892, + "step": 58040 + }, + { + "epoch": 1.7007397641543984, + "grad_norm": 8.654054641723633, + "learning_rate": 4.16666773445623e-06, + "loss": 3.0838, + "step": 58050 + }, + { + "epoch": 1.7010327400571303, + "grad_norm": 9.562790870666504, + "learning_rate": 4.1651084087211835e-06, + "loss": 3.057, + "step": 58060 + }, + { + "epoch": 1.7013257159598623, + "grad_norm": 8.782421112060547, + "learning_rate": 4.163549166512084e-06, + "loss": 3.0907, + "step": 58070 + }, + { + "epoch": 1.7016186918625942, + "grad_norm": 8.99947738647461, + "learning_rate": 4.161990007984932e-06, + "loss": 3.0927, + "step": 58080 + }, + { + "epoch": 1.7019116677653263, + "grad_norm": 9.368387222290039, + "learning_rate": 4.160430933295703e-06, + "loss": 3.0753, + "step": 58090 + }, + { + "epoch": 1.7022046436680585, + "grad_norm": 9.584991455078125, + "learning_rate": 4.158871942600382e-06, + "loss": 3.0855, + "step": 58100 + }, + { + "epoch": 1.7024976195707904, + "grad_norm": 9.962206840515137, + "learning_rate": 4.15731303605493e-06, + "loss": 3.092, + "step": 58110 + }, + { + "epoch": 1.7027905954735223, + "grad_norm": 9.826904296875, + "learning_rate": 4.155754213815313e-06, + "loss": 3.0893, + "step": 58120 + }, + { + "epoch": 1.7030835713762542, + "grad_norm": 8.39582633972168, + "learning_rate": 4.154195476037474e-06, + "loss": 3.0786, + "step": 58130 + }, + { + "epoch": 1.7033765472789864, + 
"grad_norm": 8.96886920928955, + "learning_rate": 4.1526368228773625e-06, + "loss": 3.0813, + "step": 58140 + }, + { + "epoch": 1.7036695231817183, + "grad_norm": 8.59756088256836, + "learning_rate": 4.151078254490908e-06, + "loss": 3.0985, + "step": 58150 + }, + { + "epoch": 1.7039624990844504, + "grad_norm": 8.923428535461426, + "learning_rate": 4.149519771034039e-06, + "loss": 3.0774, + "step": 58160 + }, + { + "epoch": 1.7042554749871823, + "grad_norm": 8.95309829711914, + "learning_rate": 4.147961372662669e-06, + "loss": 3.091, + "step": 58170 + }, + { + "epoch": 1.7045484508899142, + "grad_norm": 8.82087230682373, + "learning_rate": 4.146403059532712e-06, + "loss": 3.0836, + "step": 58180 + }, + { + "epoch": 1.7048414267926462, + "grad_norm": 8.97415542602539, + "learning_rate": 4.144844831800062e-06, + "loss": 3.0955, + "step": 58190 + }, + { + "epoch": 1.7051344026953783, + "grad_norm": 8.72177791595459, + "learning_rate": 4.143286689620614e-06, + "loss": 3.0822, + "step": 58200 + }, + { + "epoch": 1.7054273785981104, + "grad_norm": 9.473910331726074, + "learning_rate": 4.141728633150249e-06, + "loss": 3.081, + "step": 58210 + }, + { + "epoch": 1.7057203545008424, + "grad_norm": 8.91490650177002, + "learning_rate": 4.140170662544845e-06, + "loss": 3.0821, + "step": 58220 + }, + { + "epoch": 1.7060133304035743, + "grad_norm": 9.112236022949219, + "learning_rate": 4.138612777960262e-06, + "loss": 3.0985, + "step": 58230 + }, + { + "epoch": 1.7063063063063062, + "grad_norm": 9.241869926452637, + "learning_rate": 4.137054979552363e-06, + "loss": 3.1191, + "step": 58240 + }, + { + "epoch": 1.7065992822090383, + "grad_norm": 9.423279762268066, + "learning_rate": 4.13549726747699e-06, + "loss": 3.084, + "step": 58250 + }, + { + "epoch": 1.7068922581117703, + "grad_norm": 8.841160774230957, + "learning_rate": 4.13393964188999e-06, + "loss": 3.0975, + "step": 58260 + }, + { + "epoch": 1.7071852340145024, + "grad_norm": 9.520442962646484, + "learning_rate": 
4.132382102947188e-06, + "loss": 3.0812, + "step": 58270 + }, + { + "epoch": 1.7074782099172343, + "grad_norm": 8.864082336425781, + "learning_rate": 4.1308246508044125e-06, + "loss": 3.0837, + "step": 58280 + }, + { + "epoch": 1.7077711858199662, + "grad_norm": 8.8252534866333, + "learning_rate": 4.1292672856174695e-06, + "loss": 3.0894, + "step": 58290 + }, + { + "epoch": 1.7080641617226981, + "grad_norm": 8.829981803894043, + "learning_rate": 4.12771000754217e-06, + "loss": 3.0847, + "step": 58300 + }, + { + "epoch": 1.7083571376254303, + "grad_norm": 9.41779613494873, + "learning_rate": 4.126152816734313e-06, + "loss": 3.0795, + "step": 58310 + }, + { + "epoch": 1.7086501135281624, + "grad_norm": 8.6283540725708, + "learning_rate": 4.124595713349678e-06, + "loss": 3.0776, + "step": 58320 + }, + { + "epoch": 1.7089430894308943, + "grad_norm": 8.572906494140625, + "learning_rate": 4.12303869754405e-06, + "loss": 3.0914, + "step": 58330 + }, + { + "epoch": 1.7092360653336263, + "grad_norm": 9.039346694946289, + "learning_rate": 4.121481769473197e-06, + "loss": 3.0954, + "step": 58340 + }, + { + "epoch": 1.7095290412363582, + "grad_norm": 9.125765800476074, + "learning_rate": 4.119924929292882e-06, + "loss": 3.079, + "step": 58350 + }, + { + "epoch": 1.7098220171390903, + "grad_norm": 8.600706100463867, + "learning_rate": 4.118368177158855e-06, + "loss": 3.0887, + "step": 58360 + }, + { + "epoch": 1.710056397861276, + "eval_bleu": 0.34825894180964206, + "eval_cap_loss": 0.9152491092681885, + "eval_con_loss": 1.1675446033477783, + "eval_loss": 3.250338315963745, + "step": 58368 + }, + { + "epoch": 1.710056397861276, + "eval_bleu": 0.34825894180964206, + "eval_cap_loss": 0.9152491092681885, + "eval_con_loss": 1.1675446033477783, + "eval_loss": 3.250338315963745, + "eval_runtime": 58.9427, + "eval_samples_per_second": 339.313, + "eval_steps_per_second": 0.339, + "step": 58368 + }, + { + "epoch": 1.7101149930418225, + "grad_norm": 8.746832847595215, + "learning_rate": 
4.116811513226862e-06, + "loss": 3.0953, + "step": 58370 + }, + { + "epoch": 1.7104079689445544, + "grad_norm": 9.148788452148438, + "learning_rate": 4.115254937652638e-06, + "loss": 3.0975, + "step": 58380 + }, + { + "epoch": 1.7107009448472863, + "grad_norm": 9.187371253967285, + "learning_rate": 4.1136984505919085e-06, + "loss": 3.0802, + "step": 58390 + }, + { + "epoch": 1.7109939207500182, + "grad_norm": 8.672272682189941, + "learning_rate": 4.11214205220039e-06, + "loss": 3.0696, + "step": 58400 + }, + { + "epoch": 1.7112868966527504, + "grad_norm": 8.734050750732422, + "learning_rate": 4.110585742633792e-06, + "loss": 3.0958, + "step": 58410 + }, + { + "epoch": 1.7115798725554823, + "grad_norm": 8.748642921447754, + "learning_rate": 4.109029522047812e-06, + "loss": 3.086, + "step": 58420 + }, + { + "epoch": 1.7118728484582144, + "grad_norm": 9.27337646484375, + "learning_rate": 4.107473390598145e-06, + "loss": 3.0729, + "step": 58430 + }, + { + "epoch": 1.7121658243609463, + "grad_norm": 9.325095176696777, + "learning_rate": 4.105917348440469e-06, + "loss": 3.0802, + "step": 58440 + }, + { + "epoch": 1.7124588002636782, + "grad_norm": 9.258429527282715, + "learning_rate": 4.104361395730458e-06, + "loss": 3.0743, + "step": 58450 + }, + { + "epoch": 1.7127517761664102, + "grad_norm": 9.201744079589844, + "learning_rate": 4.102805532623775e-06, + "loss": 3.0751, + "step": 58460 + }, + { + "epoch": 1.7130447520691423, + "grad_norm": 9.097555160522461, + "learning_rate": 4.101249759276077e-06, + "loss": 3.0675, + "step": 58470 + }, + { + "epoch": 1.7133377279718744, + "grad_norm": 9.062970161437988, + "learning_rate": 4.099694075843007e-06, + "loss": 3.109, + "step": 58480 + }, + { + "epoch": 1.7136307038746064, + "grad_norm": 9.46957778930664, + "learning_rate": 4.098138482480205e-06, + "loss": 3.0953, + "step": 58490 + }, + { + "epoch": 1.7139236797773383, + "grad_norm": 9.5542573928833, + "learning_rate": 4.096582979343296e-06, + "loss": 3.0797, + "step": 
58500 + }, + { + "epoch": 1.7142166556800702, + "grad_norm": 9.27653980255127, + "learning_rate": 4.095027566587901e-06, + "loss": 3.1062, + "step": 58510 + }, + { + "epoch": 1.7145096315828023, + "grad_norm": 9.480071067810059, + "learning_rate": 4.093472244369628e-06, + "loss": 3.0954, + "step": 58520 + }, + { + "epoch": 1.7148026074855345, + "grad_norm": 8.805956840515137, + "learning_rate": 4.091917012844079e-06, + "loss": 3.0778, + "step": 58530 + }, + { + "epoch": 1.7150955833882664, + "grad_norm": 9.408530235290527, + "learning_rate": 4.090361872166845e-06, + "loss": 3.0813, + "step": 58540 + }, + { + "epoch": 1.7153885592909983, + "grad_norm": 8.910560607910156, + "learning_rate": 4.08880682249351e-06, + "loss": 3.0825, + "step": 58550 + }, + { + "epoch": 1.7156815351937302, + "grad_norm": 9.260397911071777, + "learning_rate": 4.087251863979646e-06, + "loss": 3.0867, + "step": 58560 + }, + { + "epoch": 1.7159745110964622, + "grad_norm": 9.085143089294434, + "learning_rate": 4.085696996780818e-06, + "loss": 3.0751, + "step": 58570 + }, + { + "epoch": 1.7162674869991943, + "grad_norm": 9.514814376831055, + "learning_rate": 4.0841422210525806e-06, + "loss": 3.0786, + "step": 58580 + }, + { + "epoch": 1.7165604629019264, + "grad_norm": 9.15192985534668, + "learning_rate": 4.082587536950482e-06, + "loss": 3.0942, + "step": 58590 + }, + { + "epoch": 1.7168534388046583, + "grad_norm": 8.169109344482422, + "learning_rate": 4.081032944630056e-06, + "loss": 3.0698, + "step": 58600 + }, + { + "epoch": 1.7171464147073903, + "grad_norm": 8.923704147338867, + "learning_rate": 4.079478444246832e-06, + "loss": 3.1091, + "step": 58610 + }, + { + "epoch": 1.7174393906101222, + "grad_norm": 8.852508544921875, + "learning_rate": 4.077924035956328e-06, + "loss": 3.1043, + "step": 58620 + }, + { + "epoch": 1.7177323665128543, + "grad_norm": 9.33465576171875, + "learning_rate": 4.076369719914055e-06, + "loss": 3.0933, + "step": 58630 + }, + { + "epoch": 1.7180253424155865, + 
"grad_norm": 9.214475631713867, + "learning_rate": 4.074815496275511e-06, + "loss": 3.0851, + "step": 58640 + }, + { + "epoch": 1.7183183183183184, + "grad_norm": 9.039851188659668, + "learning_rate": 4.073261365196186e-06, + "loss": 3.0901, + "step": 58650 + }, + { + "epoch": 1.7186112942210503, + "grad_norm": 9.324823379516602, + "learning_rate": 4.071707326831565e-06, + "loss": 3.0622, + "step": 58660 + }, + { + "epoch": 1.7189042701237822, + "grad_norm": 8.437095642089844, + "learning_rate": 4.070153381337118e-06, + "loss": 3.0804, + "step": 58670 + }, + { + "epoch": 1.7191972460265144, + "grad_norm": 9.036991119384766, + "learning_rate": 4.068599528868311e-06, + "loss": 3.0891, + "step": 58680 + }, + { + "epoch": 1.7194902219292463, + "grad_norm": NaN, + "learning_rate": 4.067201141311785e-06, + "loss": 3.0944, + "step": 58690 + }, + { + "epoch": 1.7197831978319784, + "grad_norm": 8.809744834899902, + "learning_rate": 4.065647466019955e-06, + "loss": 3.0872, + "step": 58700 + }, + { + "epoch": 1.7200761737347103, + "grad_norm": 9.254232406616211, + "learning_rate": 4.064093884204552e-06, + "loss": 3.0957, + "step": 58710 + }, + { + "epoch": 1.7203691496374423, + "grad_norm": 9.142960548400879, + "learning_rate": 4.062540396021006e-06, + "loss": 3.0794, + "step": 58720 + }, + { + "epoch": 1.7206621255401742, + "grad_norm": 8.543123245239258, + "learning_rate": 4.060987001624728e-06, + "loss": 3.0897, + "step": 58730 + }, + { + "epoch": 1.7209551014429063, + "grad_norm": 8.667753219604492, + "learning_rate": 4.059433701171131e-06, + "loss": 3.1037, + "step": 58740 + }, + { + "epoch": 1.7212480773456384, + "grad_norm": 8.456661224365234, + "learning_rate": 4.057880494815612e-06, + "loss": 3.0676, + "step": 58750 + }, + { + "epoch": 1.7215410532483704, + "grad_norm": 9.069618225097656, + "learning_rate": 4.056327382713561e-06, + "loss": 3.052, + "step": 58760 + }, + { + "epoch": 1.7218340291511023, + "grad_norm": 8.646257400512695, + "learning_rate": 
4.054774365020355e-06, + "loss": 3.1004, + "step": 58770 + }, + { + "epoch": 1.7221270050538342, + "grad_norm": 8.644089698791504, + "learning_rate": 4.053221441891366e-06, + "loss": 3.0664, + "step": 58780 + }, + { + "epoch": 1.7224199809565663, + "grad_norm": 9.301020622253418, + "learning_rate": 4.051668613481956e-06, + "loss": 3.083, + "step": 58790 + }, + { + "epoch": 1.7227129568592985, + "grad_norm": 8.926610946655273, + "learning_rate": 4.050115879947475e-06, + "loss": 3.0957, + "step": 58800 + }, + { + "epoch": 1.7230059327620304, + "grad_norm": 8.84366226196289, + "learning_rate": 4.048563241443264e-06, + "loss": 3.0518, + "step": 58810 + }, + { + "epoch": 1.7232989086647623, + "grad_norm": 8.572774887084961, + "learning_rate": 4.047010698124656e-06, + "loss": 3.0654, + "step": 58820 + }, + { + "epoch": 1.7235918845674942, + "grad_norm": 8.862808227539062, + "learning_rate": 4.045458250146973e-06, + "loss": 3.0878, + "step": 58830 + }, + { + "epoch": 1.7238848604702264, + "grad_norm": 10.250845909118652, + "learning_rate": 4.0439058976655325e-06, + "loss": 3.0954, + "step": 58840 + }, + { + "epoch": 1.7241778363729583, + "grad_norm": 8.999458312988281, + "learning_rate": 4.042353640835631e-06, + "loss": 3.0858, + "step": 58850 + }, + { + "epoch": 1.7244708122756904, + "grad_norm": 8.38154125213623, + "learning_rate": 4.0408014798125675e-06, + "loss": 3.045, + "step": 58860 + }, + { + "epoch": 1.7247637881784224, + "grad_norm": 8.538734436035156, + "learning_rate": 4.039249414751624e-06, + "loss": 3.0742, + "step": 58870 + }, + { + "epoch": 1.7250567640811543, + "grad_norm": 8.912181854248047, + "learning_rate": 4.037697445808079e-06, + "loss": 3.0762, + "step": 58880 + }, + { + "epoch": 1.7250567640811543, + "eval_bleu": 0.34847606882017995, + "eval_cap_loss": 0.914316713809967, + "eval_con_loss": 1.1640183925628662, + "eval_loss": 3.2423534393310547, + "step": 58880 + }, + { + "epoch": 1.7250567640811543, + "eval_bleu": 0.34847606882017995, + 
"eval_cap_loss": 0.914316713809967, + "eval_con_loss": 1.1640183925628662, + "eval_loss": 3.2423534393310547, + "eval_runtime": 57.3058, + "eval_samples_per_second": 349.005, + "eval_steps_per_second": 0.349, + "step": 58880 + }, + { + "epoch": 1.7253497399838862, + "grad_norm": 9.133298873901367, + "learning_rate": 4.036145573137193e-06, + "loss": 3.0703, + "step": 58890 + }, + { + "epoch": 1.7256427158866183, + "grad_norm": 9.411772727966309, + "learning_rate": 4.034593796894226e-06, + "loss": 3.0982, + "step": 58900 + }, + { + "epoch": 1.7259356917893505, + "grad_norm": 9.035560607910156, + "learning_rate": 4.03304211723442e-06, + "loss": 3.0642, + "step": 58910 + }, + { + "epoch": 1.7262286676920824, + "grad_norm": 8.966349601745605, + "learning_rate": 4.031490534313016e-06, + "loss": 3.0776, + "step": 58920 + }, + { + "epoch": 1.7265216435948143, + "grad_norm": 9.760088920593262, + "learning_rate": 4.029939048285236e-06, + "loss": 3.0964, + "step": 58930 + }, + { + "epoch": 1.7268146194975462, + "grad_norm": 9.465763092041016, + "learning_rate": 4.0283876593062995e-06, + "loss": 3.0823, + "step": 58940 + }, + { + "epoch": 1.7271075954002784, + "grad_norm": 8.913122177124023, + "learning_rate": 4.026836367531413e-06, + "loss": 3.0985, + "step": 58950 + }, + { + "epoch": 1.7274005713030103, + "grad_norm": 9.246867179870605, + "learning_rate": 4.025285173115776e-06, + "loss": 3.0746, + "step": 58960 + }, + { + "epoch": 1.7276935472057424, + "grad_norm": 8.681418418884277, + "learning_rate": 4.0237340762145726e-06, + "loss": 3.089, + "step": 58970 + }, + { + "epoch": 1.7279865231084743, + "grad_norm": 9.602179527282715, + "learning_rate": 4.022183076982984e-06, + "loss": 3.0626, + "step": 58980 + }, + { + "epoch": 1.7282794990112063, + "grad_norm": 9.338997840881348, + "learning_rate": 4.020632175576177e-06, + "loss": 3.0947, + "step": 58990 + }, + { + "epoch": 1.7285724749139382, + "grad_norm": 9.193803787231445, + "learning_rate": 4.019081372149309e-06, + 
"loss": 3.0985, + "step": 59000 + }, + { + "epoch": 1.7288654508166703, + "grad_norm": 9.196210861206055, + "learning_rate": 4.017530666857534e-06, + "loss": 3.097, + "step": 59010 + }, + { + "epoch": 1.7291584267194025, + "grad_norm": 8.590088844299316, + "learning_rate": 4.0159800598559835e-06, + "loss": 3.0806, + "step": 59020 + }, + { + "epoch": 1.7294514026221344, + "grad_norm": 9.28098201751709, + "learning_rate": 4.0144295512997914e-06, + "loss": 3.0797, + "step": 59030 + }, + { + "epoch": 1.7297443785248663, + "grad_norm": 9.129977226257324, + "learning_rate": 4.012879141344075e-06, + "loss": 3.0892, + "step": 59040 + }, + { + "epoch": 1.7300373544275982, + "grad_norm": 9.066824913024902, + "learning_rate": 4.011328830143945e-06, + "loss": 3.0764, + "step": 59050 + }, + { + "epoch": 1.7303303303303303, + "grad_norm": 8.742634773254395, + "learning_rate": 4.009778617854499e-06, + "loss": 3.0921, + "step": 59060 + }, + { + "epoch": 1.7306233062330625, + "grad_norm": 9.190409660339355, + "learning_rate": 4.008228504630828e-06, + "loss": 3.0899, + "step": 59070 + }, + { + "epoch": 1.7309162821357944, + "grad_norm": 8.848410606384277, + "learning_rate": 4.00667849062801e-06, + "loss": 3.0784, + "step": 59080 + }, + { + "epoch": 1.7312092580385263, + "grad_norm": 9.639668464660645, + "learning_rate": 4.005128576001118e-06, + "loss": 3.0865, + "step": 59090 + }, + { + "epoch": 1.7315022339412582, + "grad_norm": 8.612578392028809, + "learning_rate": 4.003578760905208e-06, + "loss": 3.0845, + "step": 59100 + }, + { + "epoch": 1.7317952098439904, + "grad_norm": 9.21455192565918, + "learning_rate": 4.002029045495332e-06, + "loss": 3.0885, + "step": 59110 + }, + { + "epoch": 1.7320881857467223, + "grad_norm": 8.867390632629395, + "learning_rate": 4.0004794299265286e-06, + "loss": 3.0513, + "step": 59120 + }, + { + "epoch": 1.7323811616494544, + "grad_norm": 9.12234878540039, + "learning_rate": 3.99892991435383e-06, + "loss": 3.0761, + "step": 59130 + }, + { + "epoch": 
1.7326741375521864, + "grad_norm": 8.959196090698242, + "learning_rate": 3.997380498932252e-06, + "loss": 3.0869, + "step": 59140 + }, + { + "epoch": 1.7329671134549183, + "grad_norm": 9.34020709991455, + "learning_rate": 3.995831183816808e-06, + "loss": 3.0933, + "step": 59150 + }, + { + "epoch": 1.7332600893576502, + "grad_norm": 8.738700866699219, + "learning_rate": 3.994281969162496e-06, + "loss": 3.0636, + "step": 59160 + }, + { + "epoch": 1.7335530652603823, + "grad_norm": 9.604740142822266, + "learning_rate": 3.992732855124308e-06, + "loss": 3.099, + "step": 59170 + }, + { + "epoch": 1.7338460411631145, + "grad_norm": 8.75337028503418, + "learning_rate": 3.99118384185722e-06, + "loss": 3.0905, + "step": 59180 + }, + { + "epoch": 1.7341390170658464, + "grad_norm": 9.28864574432373, + "learning_rate": 3.989634929516204e-06, + "loss": 3.0769, + "step": 59190 + }, + { + "epoch": 1.7344319929685783, + "grad_norm": 9.173728942871094, + "learning_rate": 3.988086118256219e-06, + "loss": 3.0856, + "step": 59200 + }, + { + "epoch": 1.7347249688713102, + "grad_norm": 8.531831741333008, + "learning_rate": 3.986537408232216e-06, + "loss": 3.0731, + "step": 59210 + }, + { + "epoch": 1.7350179447740424, + "grad_norm": 8.877666473388672, + "learning_rate": 3.98498879959913e-06, + "loss": 3.0597, + "step": 59220 + }, + { + "epoch": 1.7353109206767743, + "grad_norm": 9.280668258666992, + "learning_rate": 3.983440292511894e-06, + "loss": 3.0862, + "step": 59230 + }, + { + "epoch": 1.7356038965795064, + "grad_norm": 8.272043228149414, + "learning_rate": 3.981891887125425e-06, + "loss": 3.0636, + "step": 59240 + }, + { + "epoch": 1.7358968724822383, + "grad_norm": 8.873201370239258, + "learning_rate": 3.980343583594634e-06, + "loss": 3.0897, + "step": 59250 + }, + { + "epoch": 1.7361898483849703, + "grad_norm": 9.09018611907959, + "learning_rate": 3.978795382074416e-06, + "loss": 3.0899, + "step": 59260 + }, + { + "epoch": 1.7364828242877022, + "grad_norm": 8.856014251708984, + 
"learning_rate": 3.977247282719664e-06, + "loss": 3.0924, + "step": 59270 + }, + { + "epoch": 1.7367758001904343, + "grad_norm": 9.85999870300293, + "learning_rate": 3.97569928568525e-06, + "loss": 3.0992, + "step": 59280 + }, + { + "epoch": 1.7370687760931665, + "grad_norm": 8.98985767364502, + "learning_rate": 3.974151391126049e-06, + "loss": 3.0828, + "step": 59290 + }, + { + "epoch": 1.7373617519958984, + "grad_norm": 9.419718742370605, + "learning_rate": 3.972603599196914e-06, + "loss": 3.0808, + "step": 59300 + }, + { + "epoch": 1.7376547278986303, + "grad_norm": 9.277263641357422, + "learning_rate": 3.9710559100526926e-06, + "loss": 3.0807, + "step": 59310 + }, + { + "epoch": 1.7379477038013622, + "grad_norm": 8.949945449829102, + "learning_rate": 3.9695083238482235e-06, + "loss": 3.0795, + "step": 59320 + }, + { + "epoch": 1.7382406797040943, + "grad_norm": 9.562213897705078, + "learning_rate": 3.967960840738334e-06, + "loss": 3.0793, + "step": 59330 + }, + { + "epoch": 1.7385336556068265, + "grad_norm": 8.875813484191895, + "learning_rate": 3.966413460877838e-06, + "loss": 3.0852, + "step": 59340 + }, + { + "epoch": 1.7388266315095584, + "grad_norm": 8.936612129211426, + "learning_rate": 3.964866184421543e-06, + "loss": 3.0828, + "step": 59350 + }, + { + "epoch": 1.7391196074122903, + "grad_norm": 9.404308319091797, + "learning_rate": 3.963319011524246e-06, + "loss": 3.0797, + "step": 59360 + }, + { + "epoch": 1.7394125833150222, + "grad_norm": 9.843257904052734, + "learning_rate": 3.96177194234073e-06, + "loss": 3.0864, + "step": 59370 + }, + { + "epoch": 1.7397055592177544, + "grad_norm": 8.511406898498535, + "learning_rate": 3.960224977025774e-06, + "loss": 3.0688, + "step": 59380 + }, + { + "epoch": 1.7399985351204863, + "grad_norm": 8.69702434539795, + "learning_rate": 3.958678115734139e-06, + "loss": 3.0915, + "step": 59390 + }, + { + "epoch": 1.7400571303010328, + "eval_bleu": 0.3490322930844425, + "eval_cap_loss": 0.9134423732757568, + 
"eval_con_loss": 1.1615283489227295, + "eval_loss": 3.236499071121216, + "step": 59392 + }, + { + "epoch": 1.7400571303010328, + "eval_bleu": 0.3490322930844425, + "eval_cap_loss": 0.9134423732757568, + "eval_con_loss": 1.1615283489227295, + "eval_loss": 3.236499071121216, + "eval_runtime": 55.6977, + "eval_samples_per_second": 359.082, + "eval_steps_per_second": 0.359, + "step": 59392 + }, + { + "epoch": 1.7402915110232184, + "grad_norm": 8.997987747192383, + "learning_rate": 3.957131358620581e-06, + "loss": 3.0667, + "step": 59400 + }, + { + "epoch": 1.7405844869259504, + "grad_norm": 9.395491600036621, + "learning_rate": 3.955584705839841e-06, + "loss": 3.1012, + "step": 59410 + }, + { + "epoch": 1.7408774628286823, + "grad_norm": 8.807588577270508, + "learning_rate": 3.954038157546659e-06, + "loss": 3.0835, + "step": 59420 + }, + { + "epoch": 1.7411704387314142, + "grad_norm": 9.188193321228027, + "learning_rate": 3.952491713895751e-06, + "loss": 3.0835, + "step": 59430 + }, + { + "epoch": 1.7414634146341463, + "grad_norm": 9.105667114257812, + "learning_rate": 3.950945375041833e-06, + "loss": 3.0717, + "step": 59440 + }, + { + "epoch": 1.7417563905368785, + "grad_norm": 9.30809497833252, + "learning_rate": 3.949399141139606e-06, + "loss": 3.0896, + "step": 59450 + }, + { + "epoch": 1.7420493664396104, + "grad_norm": 9.062102317810059, + "learning_rate": 3.947853012343763e-06, + "loss": 3.0564, + "step": 59460 + }, + { + "epoch": 1.7423423423423423, + "grad_norm": 9.097018241882324, + "learning_rate": 3.9463069888089835e-06, + "loss": 3.0997, + "step": 59470 + }, + { + "epoch": 1.7426353182450742, + "grad_norm": 9.096291542053223, + "learning_rate": 3.944761070689938e-06, + "loss": 3.0622, + "step": 59480 + }, + { + "epoch": 1.7429282941478064, + "grad_norm": 8.63574504852295, + "learning_rate": 3.943215258141287e-06, + "loss": 3.0841, + "step": 59490 + }, + { + "epoch": 1.7432212700505385, + "grad_norm": 9.4409818649292, + "learning_rate": 
3.9416695513176815e-06, + "loss": 3.0714, + "step": 59500 + }, + { + "epoch": 1.7435142459532704, + "grad_norm": 9.508157730102539, + "learning_rate": 3.940123950373756e-06, + "loss": 3.0699, + "step": 59510 + }, + { + "epoch": 1.7438072218560023, + "grad_norm": 9.018285751342773, + "learning_rate": 3.938578455464143e-06, + "loss": 3.0889, + "step": 59520 + }, + { + "epoch": 1.7441001977587343, + "grad_norm": 9.584528923034668, + "learning_rate": 3.937033066743456e-06, + "loss": 3.0741, + "step": 59530 + }, + { + "epoch": 1.7443931736614662, + "grad_norm": 8.488497734069824, + "learning_rate": 3.9354877843663074e-06, + "loss": 3.0895, + "step": 59540 + }, + { + "epoch": 1.7446861495641983, + "grad_norm": 9.249126434326172, + "learning_rate": 3.933942608487288e-06, + "loss": 3.0964, + "step": 59550 + }, + { + "epoch": 1.7449791254669305, + "grad_norm": 9.218165397644043, + "learning_rate": 3.932397539260987e-06, + "loss": 3.0852, + "step": 59560 + }, + { + "epoch": 1.7452721013696624, + "grad_norm": 8.591341018676758, + "learning_rate": 3.930852576841977e-06, + "loss": 3.0959, + "step": 59570 + }, + { + "epoch": 1.7455650772723943, + "grad_norm": 8.794452667236328, + "learning_rate": 3.929307721384827e-06, + "loss": 3.0862, + "step": 59580 + }, + { + "epoch": 1.7458580531751262, + "grad_norm": 8.828221321105957, + "learning_rate": 3.927762973044084e-06, + "loss": 3.0887, + "step": 59590 + }, + { + "epoch": 1.7461510290778584, + "grad_norm": 9.423272132873535, + "learning_rate": 3.926218331974295e-06, + "loss": 3.1066, + "step": 59600 + }, + { + "epoch": 1.7464440049805905, + "grad_norm": 9.365294456481934, + "learning_rate": 3.924673798329991e-06, + "loss": 3.086, + "step": 59610 + }, + { + "epoch": 1.7467369808833224, + "grad_norm": 9.745279312133789, + "learning_rate": 3.923129372265696e-06, + "loss": 3.0902, + "step": 59620 + }, + { + "epoch": 1.7470299567860543, + "grad_norm": 9.626408576965332, + "learning_rate": 3.921585053935916e-06, + "loss": 3.0527, + 
"step": 59630 + }, + { + "epoch": 1.7473229326887862, + "grad_norm": 9.154479026794434, + "learning_rate": 3.920040843495154e-06, + "loss": 3.0796, + "step": 59640 + }, + { + "epoch": 1.7476159085915184, + "grad_norm": 8.77637004852295, + "learning_rate": 3.918496741097898e-06, + "loss": 3.0598, + "step": 59650 + }, + { + "epoch": 1.7479088844942503, + "grad_norm": 10.068869590759277, + "learning_rate": 3.9169527468986285e-06, + "loss": 3.0577, + "step": 59660 + }, + { + "epoch": 1.7482018603969824, + "grad_norm": 8.659213066101074, + "learning_rate": 3.915408861051809e-06, + "loss": 3.0549, + "step": 59670 + }, + { + "epoch": 1.7484948362997144, + "grad_norm": 8.97026538848877, + "learning_rate": 3.913865083711899e-06, + "loss": 3.077, + "step": 59680 + }, + { + "epoch": 1.7487878122024463, + "grad_norm": 9.120222091674805, + "learning_rate": 3.9123214150333435e-06, + "loss": 3.0723, + "step": 59690 + }, + { + "epoch": 1.7490807881051782, + "grad_norm": 8.584049224853516, + "learning_rate": 3.910777855170577e-06, + "loss": 3.0852, + "step": 59700 + }, + { + "epoch": 1.7493737640079103, + "grad_norm": 9.729670524597168, + "learning_rate": 3.909234404278026e-06, + "loss": 3.0593, + "step": 59710 + }, + { + "epoch": 1.7496667399106425, + "grad_norm": 8.882246017456055, + "learning_rate": 3.907691062510099e-06, + "loss": 3.0701, + "step": 59720 + }, + { + "epoch": 1.7499597158133744, + "grad_norm": 8.798897743225098, + "learning_rate": 3.906147830021203e-06, + "loss": 3.0867, + "step": 59730 + }, + { + "epoch": 1.7502526917161063, + "grad_norm": 8.6824951171875, + "learning_rate": 3.904604706965726e-06, + "loss": 3.069, + "step": 59740 + }, + { + "epoch": 1.7505456676188382, + "grad_norm": 8.740169525146484, + "learning_rate": 3.90306169349805e-06, + "loss": 3.0815, + "step": 59750 + }, + { + "epoch": 1.7508386435215704, + "grad_norm": 8.928925514221191, + "learning_rate": 3.9015187897725424e-06, + "loss": 3.0994, + "step": 59760 + }, + { + "epoch": 
1.7511316194243025, + "grad_norm": 9.457514762878418, + "learning_rate": 3.8999759959435636e-06, + "loss": 3.0816, + "step": 59770 + }, + { + "epoch": 1.7514245953270344, + "grad_norm": 9.068278312683105, + "learning_rate": 3.89843331216546e-06, + "loss": 3.0703, + "step": 59780 + }, + { + "epoch": 1.7517175712297663, + "grad_norm": 8.999881744384766, + "learning_rate": 3.896890738592569e-06, + "loss": 3.0899, + "step": 59790 + }, + { + "epoch": 1.7520105471324983, + "grad_norm": 8.890965461730957, + "learning_rate": 3.895348275379213e-06, + "loss": 3.0943, + "step": 59800 + }, + { + "epoch": 1.7523035230352304, + "grad_norm": 9.369621276855469, + "learning_rate": 3.89380592267971e-06, + "loss": 3.0753, + "step": 59810 + }, + { + "epoch": 1.7525964989379623, + "grad_norm": 9.457769393920898, + "learning_rate": 3.89226368064836e-06, + "loss": 3.0681, + "step": 59820 + }, + { + "epoch": 1.7528894748406945, + "grad_norm": 9.289687156677246, + "learning_rate": 3.890721549439459e-06, + "loss": 3.0811, + "step": 59830 + }, + { + "epoch": 1.7531824507434264, + "grad_norm": 8.974608421325684, + "learning_rate": 3.889179529207285e-06, + "loss": 3.0902, + "step": 59840 + }, + { + "epoch": 1.7534754266461583, + "grad_norm": 8.87979507446289, + "learning_rate": 3.887637620106109e-06, + "loss": 3.0736, + "step": 59850 + }, + { + "epoch": 1.7537684025488902, + "grad_norm": 9.060302734375, + "learning_rate": 3.886095822290189e-06, + "loss": 3.0775, + "step": 59860 + }, + { + "epoch": 1.7540613784516224, + "grad_norm": 9.323887825012207, + "learning_rate": 3.884554135913774e-06, + "loss": 3.0826, + "step": 59870 + }, + { + "epoch": 1.7543543543543545, + "grad_norm": 9.735852241516113, + "learning_rate": 3.8830125611311e-06, + "loss": 3.0499, + "step": 59880 + }, + { + "epoch": 1.7546473302570864, + "grad_norm": 8.829174041748047, + "learning_rate": 3.881471098096392e-06, + "loss": 3.0751, + "step": 59890 + }, + { + "epoch": 1.7549403061598183, + "grad_norm": 8.77865219116211, + 
"learning_rate": 3.879929746963864e-06, + "loss": 3.0729, + "step": 59900 + }, + { + "epoch": 1.7550574965209111, + "eval_bleu": 0.349038849570229, + "eval_cap_loss": 0.9124710559844971, + "eval_con_loss": 1.1584361791610718, + "eval_loss": 3.2293434143066406, + "step": 59904 + }, + { + "epoch": 1.7550574965209111, + "eval_bleu": 0.349038849570229, + "eval_cap_loss": 0.9124710559844971, + "eval_con_loss": 1.1584361791610718, + "eval_loss": 3.2293434143066406, + "eval_runtime": 57.3584, + "eval_samples_per_second": 348.685, + "eval_steps_per_second": 0.349, + "step": 59904 + }, + { + "epoch": 1.7552332820625502, + "grad_norm": 8.517390251159668, + "learning_rate": 3.878388507887722e-06, + "loss": 3.0907, + "step": 59910 + }, + { + "epoch": 1.7555262579652824, + "grad_norm": 9.025131225585938, + "learning_rate": 3.876847381022153e-06, + "loss": 3.0793, + "step": 59920 + }, + { + "epoch": 1.7558192338680143, + "grad_norm": 8.684404373168945, + "learning_rate": 3.875306366521341e-06, + "loss": 3.0845, + "step": 59930 + }, + { + "epoch": 1.7561122097707464, + "grad_norm": 9.188104629516602, + "learning_rate": 3.873765464539454e-06, + "loss": 3.0764, + "step": 59940 + }, + { + "epoch": 1.7564051856734784, + "grad_norm": 9.489201545715332, + "learning_rate": 3.872224675230653e-06, + "loss": 3.0937, + "step": 59950 + }, + { + "epoch": 1.7566981615762103, + "grad_norm": 9.291435241699219, + "learning_rate": 3.8706839987490785e-06, + "loss": 3.1127, + "step": 59960 + }, + { + "epoch": 1.7569911374789422, + "grad_norm": 9.73381519317627, + "learning_rate": 3.869143435248872e-06, + "loss": 3.0633, + "step": 59970 + }, + { + "epoch": 1.7572841133816743, + "grad_norm": 8.635599136352539, + "learning_rate": 3.867602984884155e-06, + "loss": 3.1063, + "step": 59980 + }, + { + "epoch": 1.7575770892844065, + "grad_norm": 8.843602180480957, + "learning_rate": 3.866062647809043e-06, + "loss": 3.0761, + "step": 59990 + }, + { + "epoch": 1.7578700651871384, + "grad_norm": 
8.970982551574707, + "learning_rate": 3.864522424177634e-06, + "loss": 3.0752, + "step": 60000 + }, + { + "epoch": 1.7581630410898703, + "grad_norm": 9.19855785369873, + "learning_rate": 3.86298231414402e-06, + "loss": 3.084, + "step": 60010 + }, + { + "epoch": 1.7584560169926022, + "grad_norm": 9.082979202270508, + "learning_rate": 3.861442317862279e-06, + "loss": 3.1019, + "step": 60020 + }, + { + "epoch": 1.7587489928953344, + "grad_norm": 9.229846000671387, + "learning_rate": 3.859902435486482e-06, + "loss": 3.0782, + "step": 60030 + }, + { + "epoch": 1.7590419687980665, + "grad_norm": 9.613600730895996, + "learning_rate": 3.85836266717068e-06, + "loss": 3.0824, + "step": 60040 + }, + { + "epoch": 1.7593349447007984, + "grad_norm": 9.333344459533691, + "learning_rate": 3.85682301306892e-06, + "loss": 3.0872, + "step": 60050 + }, + { + "epoch": 1.7596279206035303, + "grad_norm": 8.895462036132812, + "learning_rate": 3.855283473335236e-06, + "loss": 3.0683, + "step": 60060 + }, + { + "epoch": 1.7599208965062623, + "grad_norm": 8.911349296569824, + "learning_rate": 3.853744048123648e-06, + "loss": 3.0864, + "step": 60070 + }, + { + "epoch": 1.7602138724089944, + "grad_norm": 8.595193862915039, + "learning_rate": 3.85220473758817e-06, + "loss": 3.0756, + "step": 60080 + }, + { + "epoch": 1.7605068483117263, + "grad_norm": 9.232812881469727, + "learning_rate": 3.850665541882794e-06, + "loss": 3.07, + "step": 60090 + }, + { + "epoch": 1.7607998242144585, + "grad_norm": 9.591363906860352, + "learning_rate": 3.849126461161517e-06, + "loss": 3.0741, + "step": 60100 + }, + { + "epoch": 1.7610928001171904, + "grad_norm": 8.798230171203613, + "learning_rate": 3.847587495578306e-06, + "loss": 3.0719, + "step": 60110 + }, + { + "epoch": 1.7613857760199223, + "grad_norm": 8.565958976745605, + "learning_rate": 3.846048645287132e-06, + "loss": 3.082, + "step": 60120 + }, + { + "epoch": 1.7616787519226542, + "grad_norm": 9.24624252319336, + "learning_rate": 
3.844509910441941e-06, + "loss": 3.0816, + "step": 60130 + }, + { + "epoch": 1.7619717278253864, + "grad_norm": 8.96744155883789, + "learning_rate": 3.842971291196682e-06, + "loss": 3.0941, + "step": 60140 + }, + { + "epoch": 1.7622647037281185, + "grad_norm": 9.278365135192871, + "learning_rate": 3.841432787705278e-06, + "loss": 3.0718, + "step": 60150 + }, + { + "epoch": 1.7625576796308504, + "grad_norm": 9.488468170166016, + "learning_rate": 3.839894400121653e-06, + "loss": 3.1154, + "step": 60160 + }, + { + "epoch": 1.7628506555335823, + "grad_norm": 8.614862442016602, + "learning_rate": 3.8383561285997074e-06, + "loss": 3.0401, + "step": 60170 + }, + { + "epoch": 1.7631436314363143, + "grad_norm": 9.7229642868042, + "learning_rate": 3.836817973293342e-06, + "loss": 3.09, + "step": 60180 + }, + { + "epoch": 1.7634366073390464, + "grad_norm": 8.526725769042969, + "learning_rate": 3.8352799343564355e-06, + "loss": 3.0541, + "step": 60190 + }, + { + "epoch": 1.7637295832417783, + "grad_norm": 8.802919387817383, + "learning_rate": 3.833742011942864e-06, + "loss": 3.0733, + "step": 60200 + }, + { + "epoch": 1.7640225591445104, + "grad_norm": 9.490668296813965, + "learning_rate": 3.832204206206481e-06, + "loss": 3.0725, + "step": 60210 + }, + { + "epoch": 1.7643155350472424, + "grad_norm": 8.814671516418457, + "learning_rate": 3.830666517301143e-06, + "loss": 3.0884, + "step": 60220 + }, + { + "epoch": 1.7646085109499743, + "grad_norm": 9.50935173034668, + "learning_rate": 3.82912894538068e-06, + "loss": 3.0849, + "step": 60230 + }, + { + "epoch": 1.7649014868527062, + "grad_norm": 9.645551681518555, + "learning_rate": 3.827591490598922e-06, + "loss": 3.0956, + "step": 60240 + }, + { + "epoch": 1.7651944627554383, + "grad_norm": 9.424745559692383, + "learning_rate": 3.826054153109676e-06, + "loss": 3.0646, + "step": 60250 + }, + { + "epoch": 1.7654874386581705, + "grad_norm": 9.342972755432129, + "learning_rate": 3.82451693306675e-06, + "loss": 3.0768, + "step": 
60260 + }, + { + "epoch": 1.7657804145609024, + "grad_norm": 9.535086631774902, + "learning_rate": 3.82297983062393e-06, + "loss": 3.0759, + "step": 60270 + }, + { + "epoch": 1.7660733904636343, + "grad_norm": 8.821154594421387, + "learning_rate": 3.821442845934996e-06, + "loss": 3.0781, + "step": 60280 + }, + { + "epoch": 1.7663663663663662, + "grad_norm": 9.323271751403809, + "learning_rate": 3.8199059791537105e-06, + "loss": 3.081, + "step": 60290 + }, + { + "epoch": 1.7666593422690984, + "grad_norm": 8.900598526000977, + "learning_rate": 3.8183692304338335e-06, + "loss": 3.0464, + "step": 60300 + }, + { + "epoch": 1.7669523181718305, + "grad_norm": 9.275443077087402, + "learning_rate": 3.816832599929103e-06, + "loss": 3.0798, + "step": 60310 + }, + { + "epoch": 1.7672452940745624, + "grad_norm": 9.178145408630371, + "learning_rate": 3.815296087793251e-06, + "loss": 3.0738, + "step": 60320 + }, + { + "epoch": 1.7675382699772944, + "grad_norm": 9.022836685180664, + "learning_rate": 3.813759694179997e-06, + "loss": 3.0697, + "step": 60330 + }, + { + "epoch": 1.7678312458800263, + "grad_norm": 9.078303337097168, + "learning_rate": 3.81222341924305e-06, + "loss": 3.0683, + "step": 60340 + }, + { + "epoch": 1.7681242217827584, + "grad_norm": 8.7505464553833, + "learning_rate": 3.8106872631361003e-06, + "loss": 3.0835, + "step": 60350 + }, + { + "epoch": 1.7684171976854903, + "grad_norm": 8.392800331115723, + "learning_rate": 3.809151226012835e-06, + "loss": 3.0534, + "step": 60360 + }, + { + "epoch": 1.7687101735882225, + "grad_norm": 9.099273681640625, + "learning_rate": 3.807615308026924e-06, + "loss": 3.0804, + "step": 60370 + }, + { + "epoch": 1.7690031494909544, + "grad_norm": 8.861050605773926, + "learning_rate": 3.8060795093320293e-06, + "loss": 3.0577, + "step": 60380 + }, + { + "epoch": 1.7692961253936863, + "grad_norm": 9.65778923034668, + "learning_rate": 3.804543830081794e-06, + "loss": 3.0873, + "step": 60390 + }, + { + "epoch": 1.7695891012964182, + 
"grad_norm": 9.045199394226074, + "learning_rate": 3.8030082704298578e-06, + "loss": 3.0966, + "step": 60400 + }, + { + "epoch": 1.7698820771991504, + "grad_norm": 9.560547828674316, + "learning_rate": 3.801472830529841e-06, + "loss": 3.0704, + "step": 60410 + }, + { + "epoch": 1.7700578627407895, + "eval_bleu": 0.34967086864309915, + "eval_cap_loss": 0.9120594263076782, + "eval_con_loss": 1.1585536003112793, + "eval_loss": 3.2291669845581055, + "step": 60416 + }, + { + "epoch": 1.7700578627407895, + "eval_bleu": 0.34967086864309915, + "eval_cap_loss": 0.9120594263076782, + "eval_con_loss": 1.1585536003112793, + "eval_loss": 3.2291669845581055, + "eval_runtime": 58.0495, + "eval_samples_per_second": 344.534, + "eval_steps_per_second": 0.345, + "step": 60416 + }, + { + "epoch": 1.7701750531018825, + "grad_norm": 9.030055046081543, + "learning_rate": 3.7999375105353563e-06, + "loss": 3.0766, + "step": 60420 + }, + { + "epoch": 1.7704680290046144, + "grad_norm": 8.597670555114746, + "learning_rate": 3.7984023106000073e-06, + "loss": 3.0675, + "step": 60430 + }, + { + "epoch": 1.7707610049073463, + "grad_norm": 9.031255722045898, + "learning_rate": 3.7968672308773757e-06, + "loss": 3.0771, + "step": 60440 + }, + { + "epoch": 1.7710539808100783, + "grad_norm": 8.881298065185547, + "learning_rate": 3.7953322715210406e-06, + "loss": 3.0712, + "step": 60450 + }, + { + "epoch": 1.7713469567128104, + "grad_norm": 9.751913070678711, + "learning_rate": 3.7937974326845636e-06, + "loss": 3.0636, + "step": 60460 + }, + { + "epoch": 1.7716399326155425, + "grad_norm": 9.312051773071289, + "learning_rate": 3.7922627145214997e-06, + "loss": 3.0722, + "step": 60470 + }, + { + "epoch": 1.7719329085182745, + "grad_norm": 8.704583168029785, + "learning_rate": 3.790728117185383e-06, + "loss": 3.0741, + "step": 60480 + }, + { + "epoch": 1.7722258844210064, + "grad_norm": 8.669824600219727, + "learning_rate": 3.7891936408297457e-06, + "loss": 3.0759, + "step": 60490 + }, + { + "epoch": 
1.7725188603237383, + "grad_norm": 9.093440055847168, + "learning_rate": 3.7876592856081e-06, + "loss": 3.0603, + "step": 60500 + }, + { + "epoch": 1.7728118362264704, + "grad_norm": 9.109284400939941, + "learning_rate": 3.7861250516739523e-06, + "loss": 3.0638, + "step": 60510 + }, + { + "epoch": 1.7731048121292023, + "grad_norm": 9.364057540893555, + "learning_rate": 3.7845909391807894e-06, + "loss": 3.0672, + "step": 60520 + }, + { + "epoch": 1.7733977880319345, + "grad_norm": 8.856925964355469, + "learning_rate": 3.7830569482820932e-06, + "loss": 3.0873, + "step": 60530 + }, + { + "epoch": 1.7736907639346664, + "grad_norm": 9.146403312683105, + "learning_rate": 3.7815230791313283e-06, + "loss": 3.0713, + "step": 60540 + }, + { + "epoch": 1.7739837398373983, + "grad_norm": 8.834148406982422, + "learning_rate": 3.779989331881953e-06, + "loss": 3.0693, + "step": 60550 + }, + { + "epoch": 1.7742767157401302, + "grad_norm": 9.310224533081055, + "learning_rate": 3.7784557066874045e-06, + "loss": 3.0791, + "step": 60560 + }, + { + "epoch": 1.7745696916428624, + "grad_norm": 9.294276237487793, + "learning_rate": 3.7769222037011167e-06, + "loss": 3.0978, + "step": 60570 + }, + { + "epoch": 1.7748626675455945, + "grad_norm": 8.72270393371582, + "learning_rate": 3.7753888230765045e-06, + "loss": 3.0543, + "step": 60580 + }, + { + "epoch": 1.7751556434483264, + "grad_norm": 9.45362377166748, + "learning_rate": 3.773855564966977e-06, + "loss": 3.0728, + "step": 60590 + }, + { + "epoch": 1.7754486193510584, + "grad_norm": 8.940171241760254, + "learning_rate": 3.7723224295259247e-06, + "loss": 3.0821, + "step": 60600 + }, + { + "epoch": 1.7757415952537903, + "grad_norm": 9.512040138244629, + "learning_rate": 3.7707894169067307e-06, + "loss": 3.0649, + "step": 60610 + }, + { + "epoch": 1.7760345711565224, + "grad_norm": 8.848993301391602, + "learning_rate": 3.769256527262761e-06, + "loss": 3.0611, + "step": 60620 + }, + { + "epoch": 1.7763275470592543, + "grad_norm": 
8.538473129272461, + "learning_rate": 3.7677237607473767e-06, + "loss": 3.0718, + "step": 60630 + }, + { + "epoch": 1.7766205229619865, + "grad_norm": 9.144926071166992, + "learning_rate": 3.7661911175139172e-06, + "loss": 3.0747, + "step": 60640 + }, + { + "epoch": 1.7769134988647184, + "grad_norm": 9.341351509094238, + "learning_rate": 3.764658597715717e-06, + "loss": 3.0681, + "step": 60650 + }, + { + "epoch": 1.7772064747674503, + "grad_norm": 9.116084098815918, + "learning_rate": 3.763126201506095e-06, + "loss": 3.0807, + "step": 60660 + }, + { + "epoch": 1.7774994506701822, + "grad_norm": 9.852286338806152, + "learning_rate": 3.7615939290383596e-06, + "loss": 3.0614, + "step": 60670 + }, + { + "epoch": 1.7777924265729144, + "grad_norm": 9.434694290161133, + "learning_rate": 3.7600617804658025e-06, + "loss": 3.1045, + "step": 60680 + }, + { + "epoch": 1.7780854024756465, + "grad_norm": 9.417901992797852, + "learning_rate": 3.7585297559417094e-06, + "loss": 3.0687, + "step": 60690 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 9.261198997497559, + "learning_rate": 3.757151040058138e-06, + "loss": 3.0626, + "step": 60700 + }, + { + "epoch": 1.7786713542811103, + "grad_norm": 8.68742561340332, + "learning_rate": 3.755619251648373e-06, + "loss": 3.0755, + "step": 60710 + }, + { + "epoch": 1.7789643301838423, + "grad_norm": 9.020288467407227, + "learning_rate": 3.754087587731517e-06, + "loss": 3.0798, + "step": 60720 + }, + { + "epoch": 1.7792573060865744, + "grad_norm": 9.010156631469727, + "learning_rate": 3.752556048460807e-06, + "loss": 3.0739, + "step": 60730 + }, + { + "epoch": 1.7795502819893065, + "grad_norm": 9.18504810333252, + "learning_rate": 3.751024633989463e-06, + "loss": 3.0804, + "step": 60740 + }, + { + "epoch": 1.7798432578920385, + "grad_norm": 8.675297737121582, + "learning_rate": 3.7494933444706944e-06, + "loss": 3.0537, + "step": 60750 + }, + { + "epoch": 1.7801362337947704, + "grad_norm": 8.985271453857422, + "learning_rate": 
3.7479621800576994e-06, + "loss": 3.0805, + "step": 60760 + }, + { + "epoch": 1.7804292096975023, + "grad_norm": 9.341015815734863, + "learning_rate": 3.746431140903658e-06, + "loss": 3.0652, + "step": 60770 + }, + { + "epoch": 1.7807221856002344, + "grad_norm": 8.648244857788086, + "learning_rate": 3.744900227161745e-06, + "loss": 3.0824, + "step": 60780 + }, + { + "epoch": 1.7810151615029663, + "grad_norm": 8.518485069274902, + "learning_rate": 3.743369438985117e-06, + "loss": 3.0564, + "step": 60790 + }, + { + "epoch": 1.7813081374056985, + "grad_norm": 9.71821403503418, + "learning_rate": 3.741838776526924e-06, + "loss": 3.0828, + "step": 60800 + }, + { + "epoch": 1.7816011133084304, + "grad_norm": 9.138811111450195, + "learning_rate": 3.7403082399402946e-06, + "loss": 3.0609, + "step": 60810 + }, + { + "epoch": 1.7818940892111623, + "grad_norm": 9.414584159851074, + "learning_rate": 3.7387778293783527e-06, + "loss": 3.0588, + "step": 60820 + }, + { + "epoch": 1.7821870651138942, + "grad_norm": 8.977625846862793, + "learning_rate": 3.737247544994206e-06, + "loss": 3.0585, + "step": 60830 + }, + { + "epoch": 1.7824800410166264, + "grad_norm": 8.744295120239258, + "learning_rate": 3.7357173869409526e-06, + "loss": 3.0616, + "step": 60840 + }, + { + "epoch": 1.7827730169193585, + "grad_norm": 9.502336502075195, + "learning_rate": 3.7341873553716723e-06, + "loss": 3.0546, + "step": 60850 + }, + { + "epoch": 1.7830659928220904, + "grad_norm": 8.646567344665527, + "learning_rate": 3.7326574504394376e-06, + "loss": 3.0754, + "step": 60860 + }, + { + "epoch": 1.7833589687248224, + "grad_norm": 9.122082710266113, + "learning_rate": 3.7311276722973053e-06, + "loss": 3.0699, + "step": 60870 + }, + { + "epoch": 1.7836519446275543, + "grad_norm": 8.556193351745605, + "learning_rate": 3.7295980210983233e-06, + "loss": 3.0595, + "step": 60880 + }, + { + "epoch": 1.7839449205302864, + "grad_norm": 8.486488342285156, + "learning_rate": 3.7280684969955204e-06, + "loss": 3.0623, 
+ "step": 60890 + }, + { + "epoch": 1.7842378964330183, + "grad_norm": 9.344343185424805, + "learning_rate": 3.7265391001419193e-06, + "loss": 3.066, + "step": 60900 + }, + { + "epoch": 1.7845308723357505, + "grad_norm": 9.03391170501709, + "learning_rate": 3.725009830690525e-06, + "loss": 3.0646, + "step": 60910 + }, + { + "epoch": 1.7848238482384824, + "grad_norm": 9.428199768066406, + "learning_rate": 3.7234806887943355e-06, + "loss": 3.0493, + "step": 60920 + }, + { + "epoch": 1.785058228960668, + "eval_bleu": 0.34928839558661384, + "eval_cap_loss": 0.9118969440460205, + "eval_con_loss": 1.1588138341903687, + "eval_loss": 3.229524612426758, + "step": 60928 + }, + { + "epoch": 1.785058228960668, + "eval_bleu": 0.34928839558661384, + "eval_cap_loss": 0.9118969440460205, + "eval_con_loss": 1.1588138341903687, + "eval_loss": 3.229524612426758, + "eval_runtime": 63.0202, + "eval_samples_per_second": 317.359, + "eval_steps_per_second": 0.317, + "step": 60928 + }, + { + "epoch": 1.7851168241412143, + "grad_norm": 8.877456665039062, + "learning_rate": 3.721951674606327e-06, + "loss": 3.0679, + "step": 60930 + }, + { + "epoch": 1.7854098000439462, + "grad_norm": 8.540168762207031, + "learning_rate": 3.720422788279472e-06, + "loss": 3.0721, + "step": 60940 + }, + { + "epoch": 1.7857027759466784, + "grad_norm": 8.901036262512207, + "learning_rate": 3.7188940299667254e-06, + "loss": 3.0701, + "step": 60950 + }, + { + "epoch": 1.7859957518494105, + "grad_norm": 9.10660171508789, + "learning_rate": 3.7173653998210313e-06, + "loss": 3.1105, + "step": 60960 + }, + { + "epoch": 1.7862887277521424, + "grad_norm": 9.587334632873535, + "learning_rate": 3.7158368979953187e-06, + "loss": 3.0752, + "step": 60970 + }, + { + "epoch": 1.7865817036548743, + "grad_norm": 8.990840911865234, + "learning_rate": 3.714308524642506e-06, + "loss": 3.0432, + "step": 60980 + }, + { + "epoch": 1.7868746795576063, + "grad_norm": 8.457100868225098, + "learning_rate": 3.7127802799154965e-06, + "loss": 
3.0706, + "step": 60990 + }, + { + "epoch": 1.7871676554603384, + "grad_norm": 8.739151000976562, + "learning_rate": 3.711252163967185e-06, + "loss": 3.0523, + "step": 61000 + }, + { + "epoch": 1.7874606313630705, + "grad_norm": 9.280622482299805, + "learning_rate": 3.709724176950448e-06, + "loss": 3.0611, + "step": 61010 + }, + { + "epoch": 1.7877536072658025, + "grad_norm": 8.521021842956543, + "learning_rate": 3.7081963190181513e-06, + "loss": 3.0515, + "step": 61020 + }, + { + "epoch": 1.7880465831685344, + "grad_norm": 8.55663013458252, + "learning_rate": 3.7066685903231492e-06, + "loss": 3.072, + "step": 61030 + }, + { + "epoch": 1.7883395590712663, + "grad_norm": 9.469326972961426, + "learning_rate": 3.7051409910182833e-06, + "loss": 3.0648, + "step": 61040 + }, + { + "epoch": 1.7886325349739984, + "grad_norm": 9.217070579528809, + "learning_rate": 3.7036135212563768e-06, + "loss": 3.0621, + "step": 61050 + }, + { + "epoch": 1.7889255108767304, + "grad_norm": 8.98384952545166, + "learning_rate": 3.7020861811902477e-06, + "loss": 3.0699, + "step": 61060 + }, + { + "epoch": 1.7892184867794625, + "grad_norm": 9.467822074890137, + "learning_rate": 3.7005589709726943e-06, + "loss": 3.0794, + "step": 61070 + }, + { + "epoch": 1.7895114626821944, + "grad_norm": 9.069388389587402, + "learning_rate": 3.699031890756509e-06, + "loss": 3.082, + "step": 61080 + }, + { + "epoch": 1.7898044385849263, + "grad_norm": 9.436028480529785, + "learning_rate": 3.6975049406944623e-06, + "loss": 3.074, + "step": 61090 + }, + { + "epoch": 1.7900974144876582, + "grad_norm": 8.99055290222168, + "learning_rate": 3.695978120939319e-06, + "loss": 3.0581, + "step": 61100 + }, + { + "epoch": 1.7903903903903904, + "grad_norm": 8.604195594787598, + "learning_rate": 3.6944514316438294e-06, + "loss": 3.0517, + "step": 61110 + }, + { + "epoch": 1.7906833662931225, + "grad_norm": 8.978041648864746, + "learning_rate": 3.692924872960727e-06, + "loss": 3.0623, + "step": 61120 + }, + { + "epoch": 
1.7909763421958544, + "grad_norm": 10.293723106384277, + "learning_rate": 3.6913984450427393e-06, + "loss": 3.0612, + "step": 61130 + }, + { + "epoch": 1.7912693180985864, + "grad_norm": 8.855380058288574, + "learning_rate": 3.689872148042571e-06, + "loss": 3.0651, + "step": 61140 + }, + { + "epoch": 1.7915622940013183, + "grad_norm": 9.07919979095459, + "learning_rate": 3.6883459821129227e-06, + "loss": 3.0643, + "step": 61150 + }, + { + "epoch": 1.7918552699040504, + "grad_norm": 9.121639251708984, + "learning_rate": 3.686819947406476e-06, + "loss": 3.0937, + "step": 61160 + }, + { + "epoch": 1.7921482458067826, + "grad_norm": 9.081001281738281, + "learning_rate": 3.6852940440759044e-06, + "loss": 3.0754, + "step": 61170 + }, + { + "epoch": 1.7924412217095145, + "grad_norm": 9.132932662963867, + "learning_rate": 3.683768272273862e-06, + "loss": 3.0666, + "step": 61180 + }, + { + "epoch": 1.7927341976122464, + "grad_norm": 9.698857307434082, + "learning_rate": 3.6822426321529967e-06, + "loss": 3.0761, + "step": 61190 + }, + { + "epoch": 1.7930271735149783, + "grad_norm": 9.066696166992188, + "learning_rate": 3.6807171238659368e-06, + "loss": 3.0584, + "step": 61200 + }, + { + "epoch": 1.7933201494177102, + "grad_norm": 9.118417739868164, + "learning_rate": 3.6791917475653027e-06, + "loss": 3.0936, + "step": 61210 + }, + { + "epoch": 1.7936131253204424, + "grad_norm": 8.487163543701172, + "learning_rate": 3.6776665034036973e-06, + "loss": 3.0672, + "step": 61220 + }, + { + "epoch": 1.7939061012231745, + "grad_norm": 8.474424362182617, + "learning_rate": 3.6761413915337135e-06, + "loss": 3.0581, + "step": 61230 + }, + { + "epoch": 1.7941990771259064, + "grad_norm": 8.809235572814941, + "learning_rate": 3.6746164121079287e-06, + "loss": 3.051, + "step": 61240 + }, + { + "epoch": 1.7944920530286383, + "grad_norm": 8.57335090637207, + "learning_rate": 3.67309156527891e-06, + "loss": 3.0917, + "step": 61250 + }, + { + "epoch": 1.7947850289313703, + "grad_norm": 
8.52562427520752, + "learning_rate": 3.6715668511992063e-06, + "loss": 3.0637, + "step": 61260 + }, + { + "epoch": 1.7950780048341024, + "grad_norm": 8.899408340454102, + "learning_rate": 3.670042270021359e-06, + "loss": 3.0783, + "step": 61270 + }, + { + "epoch": 1.7953709807368345, + "grad_norm": 8.948980331420898, + "learning_rate": 3.668517821897891e-06, + "loss": 3.0782, + "step": 61280 + }, + { + "epoch": 1.7956639566395665, + "grad_norm": 9.244172096252441, + "learning_rate": 3.666993506981318e-06, + "loss": 3.0809, + "step": 61290 + }, + { + "epoch": 1.7959569325422984, + "grad_norm": 9.21287727355957, + "learning_rate": 3.6654693254241334e-06, + "loss": 3.0553, + "step": 61300 + }, + { + "epoch": 1.7962499084450303, + "grad_norm": 8.725174903869629, + "learning_rate": 3.6639452773788266e-06, + "loss": 3.0725, + "step": 61310 + }, + { + "epoch": 1.7965428843477624, + "grad_norm": 9.298426628112793, + "learning_rate": 3.6624213629978667e-06, + "loss": 3.0787, + "step": 61320 + }, + { + "epoch": 1.7968358602504944, + "grad_norm": 8.804049491882324, + "learning_rate": 3.6608975824337167e-06, + "loss": 3.0635, + "step": 61330 + }, + { + "epoch": 1.7971288361532265, + "grad_norm": 8.397180557250977, + "learning_rate": 3.659373935838817e-06, + "loss": 3.0894, + "step": 61340 + }, + { + "epoch": 1.7974218120559584, + "grad_norm": 8.540148735046387, + "learning_rate": 3.657850423365602e-06, + "loss": 3.0574, + "step": 61350 + }, + { + "epoch": 1.7977147879586903, + "grad_norm": 10.148384094238281, + "learning_rate": 3.656327045166488e-06, + "loss": 3.0785, + "step": 61360 + }, + { + "epoch": 1.7980077638614222, + "grad_norm": 9.039490699768066, + "learning_rate": 3.654803801393884e-06, + "loss": 3.0755, + "step": 61370 + }, + { + "epoch": 1.7983007397641544, + "grad_norm": 9.693076133728027, + "learning_rate": 3.653280692200177e-06, + "loss": 3.0914, + "step": 61380 + }, + { + "epoch": 1.7985937156668865, + "grad_norm": 9.037335395812988, + "learning_rate": 
3.6517577177377472e-06, + "loss": 3.0881, + "step": 61390 + }, + { + "epoch": 1.7988866915696184, + "grad_norm": 9.223164558410645, + "learning_rate": 3.650234878158958e-06, + "loss": 3.1023, + "step": 61400 + }, + { + "epoch": 1.7991796674723504, + "grad_norm": 8.63864517211914, + "learning_rate": 3.6487121736161647e-06, + "loss": 3.0683, + "step": 61410 + }, + { + "epoch": 1.7994726433750823, + "grad_norm": 9.221535682678223, + "learning_rate": 3.6471896042616984e-06, + "loss": 3.0842, + "step": 61420 + }, + { + "epoch": 1.7997656192778144, + "grad_norm": 9.18408203125, + "learning_rate": 3.6456671702478874e-06, + "loss": 3.0791, + "step": 61430 + }, + { + "epoch": 1.8000585951805466, + "grad_norm": 9.815741539001465, + "learning_rate": 3.6441448717270395e-06, + "loss": 3.0815, + "step": 61440 + }, + { + "epoch": 1.8000585951805466, + "eval_bleu": 0.35013451143650587, + "eval_cap_loss": 0.9107074737548828, + "eval_con_loss": 1.1555373668670654, + "eval_loss": 3.2217819690704346, + "step": 61440 + }, + { + "epoch": 1.8000585951805466, + "eval_bleu": 0.35013451143650587, + "eval_cap_loss": 0.9107074737548828, + "eval_con_loss": 1.1555373668670654, + "eval_loss": 3.2217819690704346, + "eval_runtime": 60.2723, + "eval_samples_per_second": 331.827, + "eval_steps_per_second": 0.332, + "step": 61440 + }, + { + "epoch": 1.8003515710832785, + "grad_norm": 9.8917875289917, + "learning_rate": 3.642622708851456e-06, + "loss": 3.0777, + "step": 61450 + }, + { + "epoch": 1.8006445469860104, + "grad_norm": 9.00200080871582, + "learning_rate": 3.641100681773415e-06, + "loss": 3.0853, + "step": 61460 + }, + { + "epoch": 1.8009375228887423, + "grad_norm": 8.453060150146484, + "learning_rate": 3.6395787906451886e-06, + "loss": 3.0707, + "step": 61470 + }, + { + "epoch": 1.8012304987914745, + "grad_norm": 8.918872833251953, + "learning_rate": 3.6380570356190346e-06, + "loss": 3.0754, + "step": 61480 + }, + { + "epoch": 1.8015234746942064, + "grad_norm": 8.409501075744629, + 
"learning_rate": 3.636535416847192e-06, + "loss": 3.0716, + "step": 61490 + }, + { + "epoch": 1.8018164505969385, + "grad_norm": 9.053424835205078, + "learning_rate": 3.635013934481895e-06, + "loss": 3.09, + "step": 61500 + }, + { + "epoch": 1.8021094264996704, + "grad_norm": 9.277589797973633, + "learning_rate": 3.6334925886753536e-06, + "loss": 3.0586, + "step": 61510 + }, + { + "epoch": 1.8024024024024023, + "grad_norm": 9.095685958862305, + "learning_rate": 3.6319713795797718e-06, + "loss": 3.0547, + "step": 61520 + }, + { + "epoch": 1.8026953783051343, + "grad_norm": 8.734920501708984, + "learning_rate": 3.630450307347336e-06, + "loss": 3.0691, + "step": 61530 + }, + { + "epoch": 1.8029883542078664, + "grad_norm": 9.357439994812012, + "learning_rate": 3.628929372130225e-06, + "loss": 3.0607, + "step": 61540 + }, + { + "epoch": 1.8032813301105985, + "grad_norm": 9.264031410217285, + "learning_rate": 3.6274085740805932e-06, + "loss": 3.059, + "step": 61550 + }, + { + "epoch": 1.8035743060133305, + "grad_norm": 9.57312297821045, + "learning_rate": 3.6258879133505916e-06, + "loss": 3.0581, + "step": 61560 + }, + { + "epoch": 1.8038672819160624, + "grad_norm": 8.674677848815918, + "learning_rate": 3.6243673900923504e-06, + "loss": 3.0554, + "step": 61570 + }, + { + "epoch": 1.8041602578187943, + "grad_norm": 8.786955833435059, + "learning_rate": 3.622847004457993e-06, + "loss": 3.0839, + "step": 61580 + }, + { + "epoch": 1.8044532337215264, + "grad_norm": 8.698802947998047, + "learning_rate": 3.62132675659962e-06, + "loss": 3.0823, + "step": 61590 + }, + { + "epoch": 1.8047462096242584, + "grad_norm": 9.285799026489258, + "learning_rate": 3.6198066466693265e-06, + "loss": 3.0805, + "step": 61600 + }, + { + "epoch": 1.8050391855269905, + "grad_norm": 8.380745887756348, + "learning_rate": 3.618286674819189e-06, + "loss": 3.0595, + "step": 61610 + }, + { + "epoch": 1.8053321614297224, + "grad_norm": 9.088539123535156, + "learning_rate": 3.6167668412012747e-06, + 
"loss": 3.0735, + "step": 61620 + }, + { + "epoch": 1.8056251373324543, + "grad_norm": 8.765156745910645, + "learning_rate": 3.6152471459676288e-06, + "loss": 3.082, + "step": 61630 + }, + { + "epoch": 1.8059181132351863, + "grad_norm": 9.00057601928711, + "learning_rate": 3.613727589270292e-06, + "loss": 3.0621, + "step": 61640 + }, + { + "epoch": 1.8062110891379184, + "grad_norm": 9.296948432922363, + "learning_rate": 3.6122081712612834e-06, + "loss": 3.0616, + "step": 61650 + }, + { + "epoch": 1.8065040650406505, + "grad_norm": 9.05984115600586, + "learning_rate": 3.6106888920926162e-06, + "loss": 3.0536, + "step": 61660 + }, + { + "epoch": 1.8067970409433824, + "grad_norm": 9.001459121704102, + "learning_rate": 3.6091697519162815e-06, + "loss": 3.0733, + "step": 61670 + }, + { + "epoch": 1.8070900168461144, + "grad_norm": 9.000568389892578, + "learning_rate": 3.607650750884262e-06, + "loss": 3.0425, + "step": 61680 + }, + { + "epoch": 1.8073829927488463, + "grad_norm": 8.55111312866211, + "learning_rate": 3.606131889148523e-06, + "loss": 3.0773, + "step": 61690 + }, + { + "epoch": 1.8076759686515784, + "grad_norm": 9.542771339416504, + "learning_rate": 3.604613166861022e-06, + "loss": 3.0593, + "step": 61700 + }, + { + "epoch": 1.8079689445543106, + "grad_norm": 8.932451248168945, + "learning_rate": 3.6030945841736927e-06, + "loss": 3.0728, + "step": 61710 + }, + { + "epoch": 1.8082619204570425, + "grad_norm": 8.570428848266602, + "learning_rate": 3.601576141238463e-06, + "loss": 3.0791, + "step": 61720 + }, + { + "epoch": 1.8085548963597744, + "grad_norm": 8.956692695617676, + "learning_rate": 3.6000578382072438e-06, + "loss": 3.0707, + "step": 61730 + }, + { + "epoch": 1.8088478722625063, + "grad_norm": 8.907549858093262, + "learning_rate": 3.598539675231934e-06, + "loss": 3.0529, + "step": 61740 + }, + { + "epoch": 1.8091408481652385, + "grad_norm": 8.477505683898926, + "learning_rate": 3.597021652464413e-06, + "loss": 3.0483, + "step": 61750 + }, + { + 
"epoch": 1.8094338240679704, + "grad_norm": 8.615493774414062, + "learning_rate": 3.595503770056553e-06, + "loss": 3.0799, + "step": 61760 + }, + { + "epoch": 1.8097267999707025, + "grad_norm": 8.611026763916016, + "learning_rate": 3.5939860281602073e-06, + "loss": 3.065, + "step": 61770 + }, + { + "epoch": 1.8100197758734344, + "grad_norm": 9.358122825622559, + "learning_rate": 3.5924684269272202e-06, + "loss": 3.0503, + "step": 61780 + }, + { + "epoch": 1.8103127517761664, + "grad_norm": 9.120122909545898, + "learning_rate": 3.590950966509414e-06, + "loss": 3.0708, + "step": 61790 + }, + { + "epoch": 1.8106057276788983, + "grad_norm": 9.840282440185547, + "learning_rate": 3.589433647058605e-06, + "loss": 3.0864, + "step": 61800 + }, + { + "epoch": 1.8108987035816304, + "grad_norm": 8.774749755859375, + "learning_rate": 3.58791646872659e-06, + "loss": 3.0606, + "step": 61810 + }, + { + "epoch": 1.8111916794843625, + "grad_norm": 8.634186744689941, + "learning_rate": 3.5863994316651553e-06, + "loss": 3.0574, + "step": 61820 + }, + { + "epoch": 1.8114846553870945, + "grad_norm": 8.808683395385742, + "learning_rate": 3.584882536026072e-06, + "loss": 3.0753, + "step": 61830 + }, + { + "epoch": 1.8117776312898264, + "grad_norm": 8.764113426208496, + "learning_rate": 3.5833657819610933e-06, + "loss": 3.0593, + "step": 61840 + }, + { + "epoch": 1.8120706071925583, + "grad_norm": 9.523192405700684, + "learning_rate": 3.5818491696219654e-06, + "loss": 3.0822, + "step": 61850 + }, + { + "epoch": 1.8123635830952904, + "grad_norm": 9.140434265136719, + "learning_rate": 3.5803326991604128e-06, + "loss": 3.0595, + "step": 61860 + }, + { + "epoch": 1.8126565589980224, + "grad_norm": 9.291216850280762, + "learning_rate": 3.578816370728153e-06, + "loss": 3.0912, + "step": 61870 + }, + { + "epoch": 1.8129495349007545, + "grad_norm": 9.640287399291992, + "learning_rate": 3.5773001844768824e-06, + "loss": 3.0455, + "step": 61880 + }, + { + "epoch": 1.8132425108034864, + "grad_norm": 
9.000645637512207, + "learning_rate": 3.575784140558288e-06, + "loss": 3.0442, + "step": 61890 + }, + { + "epoch": 1.8135354867062183, + "grad_norm": 8.841487884521484, + "learning_rate": 3.5742682391240403e-06, + "loss": 3.0706, + "step": 61900 + }, + { + "epoch": 1.8138284626089503, + "grad_norm": 8.714720726013184, + "learning_rate": 3.5727524803257984e-06, + "loss": 3.0549, + "step": 61910 + }, + { + "epoch": 1.8141214385116824, + "grad_norm": 9.666903495788574, + "learning_rate": 3.571236864315202e-06, + "loss": 3.0773, + "step": 61920 + }, + { + "epoch": 1.8144144144144145, + "grad_norm": 8.660313606262207, + "learning_rate": 3.569721391243881e-06, + "loss": 3.0446, + "step": 61930 + }, + { + "epoch": 1.8147073903171465, + "grad_norm": 8.620244979858398, + "learning_rate": 3.5682060612634483e-06, + "loss": 3.045, + "step": 61940 + }, + { + "epoch": 1.8150003662198784, + "grad_norm": 10.07860279083252, + "learning_rate": 3.566690874525507e-06, + "loss": 3.0586, + "step": 61950 + }, + { + "epoch": 1.8150589614004247, + "eval_bleu": 0.3502836733243732, + "eval_cap_loss": 0.9099849462509155, + "eval_con_loss": 1.15474534034729, + "eval_loss": 3.219475746154785, + "step": 61952 + }, + { + "epoch": 1.8150589614004247, + "eval_bleu": 0.3502836733243732, + "eval_cap_loss": 0.9099849462509155, + "eval_con_loss": 1.15474534034729, + "eval_loss": 3.219475746154785, + "eval_runtime": 63.8302, + "eval_samples_per_second": 313.331, + "eval_steps_per_second": 0.313, + "step": 61952 + }, + { + "epoch": 1.8152933421226103, + "grad_norm": 9.05897331237793, + "learning_rate": 3.5651758311816376e-06, + "loss": 3.0645, + "step": 61960 + }, + { + "epoch": 1.8155863180253424, + "grad_norm": 8.720466613769531, + "learning_rate": 3.5636609313834147e-06, + "loss": 3.056, + "step": 61970 + }, + { + "epoch": 1.8158792939280746, + "grad_norm": 8.811544418334961, + "learning_rate": 3.5621461752823936e-06, + "loss": 3.0518, + "step": 61980 + }, + { + "epoch": 1.8161722698308065, + 
"grad_norm": 8.353322982788086, + "learning_rate": 3.5606315630301193e-06, + "loss": 3.0466, + "step": 61990 + }, + { + "epoch": 1.8164652457335384, + "grad_norm": 9.354757308959961, + "learning_rate": 3.559117094778115e-06, + "loss": 3.0519, + "step": 62000 + }, + { + "epoch": 1.8167582216362703, + "grad_norm": 9.23709487915039, + "learning_rate": 3.557602770677898e-06, + "loss": 3.0668, + "step": 62010 + }, + { + "epoch": 1.8170511975390025, + "grad_norm": 9.462357521057129, + "learning_rate": 3.556088590880966e-06, + "loss": 3.0613, + "step": 62020 + }, + { + "epoch": 1.8173441734417344, + "grad_norm": 8.840486526489258, + "learning_rate": 3.5545745555388056e-06, + "loss": 3.0607, + "step": 62030 + }, + { + "epoch": 1.8176371493444665, + "grad_norm": 9.356768608093262, + "learning_rate": 3.553060664802883e-06, + "loss": 3.0745, + "step": 62040 + }, + { + "epoch": 1.8179301252471984, + "grad_norm": 9.424819946289062, + "learning_rate": 3.551546918824659e-06, + "loss": 3.0663, + "step": 62050 + }, + { + "epoch": 1.8182231011499304, + "grad_norm": 8.701571464538574, + "learning_rate": 3.5500333177555703e-06, + "loss": 3.0549, + "step": 62060 + }, + { + "epoch": 1.8185160770526623, + "grad_norm": 8.604852676391602, + "learning_rate": 3.5485198617470474e-06, + "loss": 3.0556, + "step": 62070 + }, + { + "epoch": 1.8188090529553944, + "grad_norm": 7.553781986236572, + "learning_rate": 3.5470065509504996e-06, + "loss": 3.05, + "step": 62080 + }, + { + "epoch": 1.8191020288581266, + "grad_norm": 9.10896110534668, + "learning_rate": 3.545493385517327e-06, + "loss": 3.0686, + "step": 62090 + }, + { + "epoch": 1.8193950047608585, + "grad_norm": 8.961837768554688, + "learning_rate": 3.5439803655989113e-06, + "loss": 3.0692, + "step": 62100 + }, + { + "epoch": 1.8196879806635904, + "grad_norm": 9.11472225189209, + "learning_rate": 3.5424674913466227e-06, + "loss": 3.0536, + "step": 62110 + }, + { + "epoch": 1.8199809565663223, + "grad_norm": 9.716005325317383, + 
"learning_rate": 3.5409547629118124e-06, + "loss": 3.0767, + "step": 62120 + }, + { + "epoch": 1.8202739324690544, + "grad_norm": 9.082351684570312, + "learning_rate": 3.5394421804458244e-06, + "loss": 3.0526, + "step": 62130 + }, + { + "epoch": 1.8205669083717866, + "grad_norm": 9.320725440979004, + "learning_rate": 3.5379297440999795e-06, + "loss": 3.0804, + "step": 62140 + }, + { + "epoch": 1.8208598842745185, + "grad_norm": 8.869011878967285, + "learning_rate": 3.5364174540255907e-06, + "loss": 3.0683, + "step": 62150 + }, + { + "epoch": 1.8211528601772504, + "grad_norm": 8.808306694030762, + "learning_rate": 3.5349053103739496e-06, + "loss": 3.0612, + "step": 62160 + }, + { + "epoch": 1.8214458360799823, + "grad_norm": 9.253647804260254, + "learning_rate": 3.533393313296344e-06, + "loss": 3.0782, + "step": 62170 + }, + { + "epoch": 1.8217388119827143, + "grad_norm": 8.912796974182129, + "learning_rate": 3.5318814629440334e-06, + "loss": 3.0416, + "step": 62180 + }, + { + "epoch": 1.8220317878854464, + "grad_norm": 9.065220832824707, + "learning_rate": 3.530369759468273e-06, + "loss": 3.0453, + "step": 62190 + }, + { + "epoch": 1.8223247637881785, + "grad_norm": 9.641283988952637, + "learning_rate": 3.5288582030202995e-06, + "loss": 3.0628, + "step": 62200 + }, + { + "epoch": 1.8226177396909105, + "grad_norm": 9.329473495483398, + "learning_rate": 3.527346793751334e-06, + "loss": 3.0537, + "step": 62210 + }, + { + "epoch": 1.8229107155936424, + "grad_norm": 8.911675453186035, + "learning_rate": 3.525835531812587e-06, + "loss": 3.0486, + "step": 62220 + }, + { + "epoch": 1.8232036914963743, + "grad_norm": 9.260193824768066, + "learning_rate": 3.524324417355247e-06, + "loss": 3.0547, + "step": 62230 + }, + { + "epoch": 1.8234966673991064, + "grad_norm": 8.410517692565918, + "learning_rate": 3.5228134505304957e-06, + "loss": 3.0507, + "step": 62240 + }, + { + "epoch": 1.8237896433018386, + "grad_norm": 9.546628952026367, + "learning_rate": 3.521302631489493e-06, + 
"loss": 3.0645, + "step": 62250 + }, + { + "epoch": 1.8240826192045705, + "grad_norm": 9.619589805603027, + "learning_rate": 3.519791960383392e-06, + "loss": 3.0758, + "step": 62260 + }, + { + "epoch": 1.8243755951073024, + "grad_norm": 9.151453018188477, + "learning_rate": 3.518281437363321e-06, + "loss": 3.0773, + "step": 62270 + }, + { + "epoch": 1.8246685710100343, + "grad_norm": 9.226609230041504, + "learning_rate": 3.516771062580403e-06, + "loss": 3.0387, + "step": 62280 + }, + { + "epoch": 1.8249615469127665, + "grad_norm": 8.901361465454102, + "learning_rate": 3.515260836185739e-06, + "loss": 3.056, + "step": 62290 + }, + { + "epoch": 1.8252545228154984, + "grad_norm": 8.64664363861084, + "learning_rate": 3.513750758330422e-06, + "loss": 3.0582, + "step": 62300 + }, + { + "epoch": 1.8255474987182305, + "grad_norm": 8.816914558410645, + "learning_rate": 3.5122408291655215e-06, + "loss": 3.0581, + "step": 62310 + }, + { + "epoch": 1.8258404746209624, + "grad_norm": 9.46225357055664, + "learning_rate": 3.5107310488421008e-06, + "loss": 3.0839, + "step": 62320 + }, + { + "epoch": 1.8261334505236944, + "grad_norm": 8.808910369873047, + "learning_rate": 3.509221417511202e-06, + "loss": 3.0532, + "step": 62330 + }, + { + "epoch": 1.8264264264264263, + "grad_norm": 8.640020370483398, + "learning_rate": 3.507711935323858e-06, + "loss": 3.0519, + "step": 62340 + }, + { + "epoch": 1.8267194023291584, + "grad_norm": 8.255330085754395, + "learning_rate": 3.5062026024310795e-06, + "loss": 3.0546, + "step": 62350 + }, + { + "epoch": 1.8270123782318906, + "grad_norm": 9.092462539672852, + "learning_rate": 3.5046934189838696e-06, + "loss": 3.0601, + "step": 62360 + }, + { + "epoch": 1.8273053541346225, + "grad_norm": 8.686080932617188, + "learning_rate": 3.5031843851332105e-06, + "loss": 3.0615, + "step": 62370 + }, + { + "epoch": 1.8275983300373544, + "grad_norm": 9.474212646484375, + "learning_rate": 3.501675501030075e-06, + "loss": 3.0553, + "step": 62380 + }, + { + 
"epoch": 1.8278913059400863, + "grad_norm": 9.054327011108398, + "learning_rate": 3.500166766825415e-06, + "loss": 3.0758, + "step": 62390 + }, + { + "epoch": 1.8281842818428184, + "grad_norm": 8.82336711883545, + "learning_rate": 3.4986581826701714e-06, + "loss": 3.0777, + "step": 62400 + }, + { + "epoch": 1.8284772577455506, + "grad_norm": 8.75238037109375, + "learning_rate": 3.4971497487152693e-06, + "loss": 3.044, + "step": 62410 + }, + { + "epoch": 1.8287702336482825, + "grad_norm": 9.196759223937988, + "learning_rate": 3.495641465111621e-06, + "loss": 3.0628, + "step": 62420 + }, + { + "epoch": 1.8290632095510144, + "grad_norm": 9.145197868347168, + "learning_rate": 3.4941333320101173e-06, + "loss": 3.0817, + "step": 62430 + }, + { + "epoch": 1.8293561854537463, + "grad_norm": 8.40756607055664, + "learning_rate": 3.49262534956164e-06, + "loss": 3.067, + "step": 62440 + }, + { + "epoch": 1.8296491613564785, + "grad_norm": 9.602737426757812, + "learning_rate": 3.491117517917053e-06, + "loss": 3.0576, + "step": 62450 + }, + { + "epoch": 1.8299421372592104, + "grad_norm": 9.570377349853516, + "learning_rate": 3.4896098372272086e-06, + "loss": 3.0735, + "step": 62460 + }, + { + "epoch": 1.8300593276203032, + "eval_bleu": 0.35007232636298785, + "eval_cap_loss": 0.9100363254547119, + "eval_con_loss": 1.1527525186538696, + "eval_loss": 3.215541362762451, + "step": 62464 + }, + { + "epoch": 1.8300593276203032, + "eval_bleu": 0.35007232636298785, + "eval_cap_loss": 0.9100363254547119, + "eval_con_loss": 1.1527525186538696, + "eval_loss": 3.215541362762451, + "eval_runtime": 57.6698, + "eval_samples_per_second": 346.802, + "eval_steps_per_second": 0.347, + "step": 62464 + }, + { + "epoch": 1.8302351131619425, + "grad_norm": 8.819340705871582, + "learning_rate": 3.4881023076429367e-06, + "loss": 3.0661, + "step": 62470 + }, + { + "epoch": 1.8305280890646745, + "grad_norm": 8.96890926361084, + "learning_rate": 3.48659492931506e-06, + "loss": 3.0683, + "step": 62480 + }, + 
{ + "epoch": 1.8308210649674064, + "grad_norm": 9.000947952270508, + "learning_rate": 3.485087702394382e-06, + "loss": 3.0567, + "step": 62490 + }, + { + "epoch": 1.8311140408701383, + "grad_norm": 9.274025917053223, + "learning_rate": 3.483580627031693e-06, + "loss": 3.077, + "step": 62500 + }, + { + "epoch": 1.8314070167728704, + "grad_norm": 8.722979545593262, + "learning_rate": 3.482073703377763e-06, + "loss": 3.0878, + "step": 62510 + }, + { + "epoch": 1.8316999926756026, + "grad_norm": 8.88839340209961, + "learning_rate": 3.480566931583355e-06, + "loss": 3.0689, + "step": 62520 + }, + { + "epoch": 1.8319929685783345, + "grad_norm": 8.941320419311523, + "learning_rate": 3.4790603117992107e-06, + "loss": 3.0387, + "step": 62530 + }, + { + "epoch": 1.8322859444810664, + "grad_norm": 9.190714836120605, + "learning_rate": 3.4775538441760577e-06, + "loss": 3.0503, + "step": 62540 + }, + { + "epoch": 1.8325789203837983, + "grad_norm": 8.188568115234375, + "learning_rate": 3.476047528864612e-06, + "loss": 3.0576, + "step": 62550 + }, + { + "epoch": 1.8328718962865305, + "grad_norm": 8.862749099731445, + "learning_rate": 3.474541366015569e-06, + "loss": 3.0606, + "step": 62560 + }, + { + "epoch": 1.8331648721892624, + "grad_norm": 9.384869575500488, + "learning_rate": 3.4730353557796126e-06, + "loss": 3.036, + "step": 62570 + }, + { + "epoch": 1.8334578480919945, + "grad_norm": 9.489714622497559, + "learning_rate": 3.4715294983074084e-06, + "loss": 3.0698, + "step": 62580 + }, + { + "epoch": 1.8337508239947264, + "grad_norm": 8.970657348632812, + "learning_rate": 3.470023793749612e-06, + "loss": 3.0641, + "step": 62590 + }, + { + "epoch": 1.8340437998974584, + "grad_norm": 9.038802146911621, + "learning_rate": 3.4685182422568553e-06, + "loss": 3.0573, + "step": 62600 + }, + { + "epoch": 1.8343367758001903, + "grad_norm": 9.184016227722168, + "learning_rate": 3.4670128439797644e-06, + "loss": 3.0527, + "step": 62610 + }, + { + "epoch": 1.8346297517029224, + 
"grad_norm": 9.576803207397461, + "learning_rate": 3.4655075990689424e-06, + "loss": 3.0408, + "step": 62620 + }, + { + "epoch": 1.8349227276056546, + "grad_norm": 9.03641128540039, + "learning_rate": 3.464002507674983e-06, + "loss": 3.0654, + "step": 62630 + }, + { + "epoch": 1.8352157035083865, + "grad_norm": 9.166613578796387, + "learning_rate": 3.4624975699484582e-06, + "loss": 3.0513, + "step": 62640 + }, + { + "epoch": 1.8355086794111184, + "grad_norm": 9.012676239013672, + "learning_rate": 3.4609927860399307e-06, + "loss": 3.0691, + "step": 62650 + }, + { + "epoch": 1.8358016553138503, + "grad_norm": 9.371786117553711, + "learning_rate": 3.459488156099944e-06, + "loss": 3.0574, + "step": 62660 + }, + { + "epoch": 1.8360946312165825, + "grad_norm": 9.37832260131836, + "learning_rate": 3.457983680279029e-06, + "loss": 3.0609, + "step": 62670 + }, + { + "epoch": 1.8363876071193146, + "grad_norm": 8.731684684753418, + "learning_rate": 3.456479358727696e-06, + "loss": 3.0625, + "step": 62680 + }, + { + "epoch": 1.8366805830220465, + "grad_norm": 8.435905456542969, + "learning_rate": 3.454975191596447e-06, + "loss": 3.0269, + "step": 62690 + }, + { + "epoch": 1.8369735589247784, + "grad_norm": 9.058753967285156, + "learning_rate": 3.453621573331868e-06, + "loss": 3.0705, + "step": 62700 + }, + { + "epoch": 1.8372665348275103, + "grad_norm": 9.397599220275879, + "learning_rate": 3.4521177000133456e-06, + "loss": 3.0595, + "step": 62710 + }, + { + "epoch": 1.8375595107302425, + "grad_norm": 9.201003074645996, + "learning_rate": 3.450613981551261e-06, + "loss": 3.0475, + "step": 62720 + }, + { + "epoch": 1.8378524866329744, + "grad_norm": 9.376046180725098, + "learning_rate": 3.449110418096058e-06, + "loss": 3.0754, + "step": 62730 + }, + { + "epoch": 1.8381454625357065, + "grad_norm": 9.007882118225098, + "learning_rate": 3.447607009798154e-06, + "loss": 3.0528, + "step": 62740 + }, + { + "epoch": 1.8384384384384385, + "grad_norm": 8.690180778503418, + 
"learning_rate": 3.4461037568079614e-06, + "loss": 3.0421, + "step": 62750 + }, + { + "epoch": 1.8387314143411704, + "grad_norm": 9.27067756652832, + "learning_rate": 3.4446006592758667e-06, + "loss": 3.058, + "step": 62760 + }, + { + "epoch": 1.8390243902439023, + "grad_norm": 8.994020462036133, + "learning_rate": 3.4430977173522494e-06, + "loss": 3.0575, + "step": 62770 + }, + { + "epoch": 1.8393173661466344, + "grad_norm": 8.845556259155273, + "learning_rate": 3.4415949311874676e-06, + "loss": 3.0528, + "step": 62780 + }, + { + "epoch": 1.8396103420493666, + "grad_norm": 8.726908683776855, + "learning_rate": 3.44009230093187e-06, + "loss": 3.074, + "step": 62790 + }, + { + "epoch": 1.8399033179520985, + "grad_norm": 8.771220207214355, + "learning_rate": 3.43858982673578e-06, + "loss": 3.0687, + "step": 62800 + }, + { + "epoch": 1.8401962938548304, + "grad_norm": 8.93181037902832, + "learning_rate": 3.437087508749516e-06, + "loss": 3.0641, + "step": 62810 + }, + { + "epoch": 1.8404892697575623, + "grad_norm": 9.479713439941406, + "learning_rate": 3.4355853471233725e-06, + "loss": 3.0533, + "step": 62820 + }, + { + "epoch": 1.8407822456602945, + "grad_norm": 9.083850860595703, + "learning_rate": 3.434083342007636e-06, + "loss": 3.0749, + "step": 62830 + }, + { + "epoch": 1.8410752215630264, + "grad_norm": 8.894367218017578, + "learning_rate": 3.432581493552569e-06, + "loss": 3.0489, + "step": 62840 + }, + { + "epoch": 1.8413681974657585, + "grad_norm": 8.764352798461914, + "learning_rate": 3.431079801908426e-06, + "loss": 3.059, + "step": 62850 + }, + { + "epoch": 1.8416611733684904, + "grad_norm": 8.54228687286377, + "learning_rate": 3.4295782672254385e-06, + "loss": 3.059, + "step": 62860 + }, + { + "epoch": 1.8419541492712224, + "grad_norm": 8.721017837524414, + "learning_rate": 3.428076889653829e-06, + "loss": 3.0512, + "step": 62870 + }, + { + "epoch": 1.8422471251739543, + "grad_norm": 9.147909164428711, + "learning_rate": 3.4265756693438027e-06, + "loss": 
3.0644, + "step": 62880 + }, + { + "epoch": 1.8425401010766864, + "grad_norm": 9.12767219543457, + "learning_rate": 3.425074606445544e-06, + "loss": 3.0555, + "step": 62890 + }, + { + "epoch": 1.8428330769794186, + "grad_norm": 8.773905754089355, + "learning_rate": 3.4235737011092286e-06, + "loss": 3.0678, + "step": 62900 + }, + { + "epoch": 1.8431260528821505, + "grad_norm": 8.525740623474121, + "learning_rate": 3.4220729534850105e-06, + "loss": 3.0383, + "step": 62910 + }, + { + "epoch": 1.8434190287848824, + "grad_norm": 8.626358985900879, + "learning_rate": 3.420572363723034e-06, + "loss": 3.0364, + "step": 62920 + }, + { + "epoch": 1.8437120046876143, + "grad_norm": 8.953078269958496, + "learning_rate": 3.419071931973419e-06, + "loss": 3.0475, + "step": 62930 + }, + { + "epoch": 1.8440049805903465, + "grad_norm": 8.528388977050781, + "learning_rate": 3.41757165838628e-06, + "loss": 3.0547, + "step": 62940 + }, + { + "epoch": 1.8442979564930786, + "grad_norm": 9.512333869934082, + "learning_rate": 3.416071543111707e-06, + "loss": 3.0727, + "step": 62950 + }, + { + "epoch": 1.8445909323958105, + "grad_norm": 8.954931259155273, + "learning_rate": 3.414571586299781e-06, + "loss": 3.0461, + "step": 62960 + }, + { + "epoch": 1.8448839082985424, + "grad_norm": 8.272700309753418, + "learning_rate": 3.41307178810056e-06, + "loss": 3.0512, + "step": 62970 + }, + { + "epoch": 1.8450596938401818, + "eval_bleu": 0.35080206070186376, + "eval_cap_loss": 0.9088912010192871, + "eval_con_loss": 1.150328516960144, + "eval_loss": 3.209548234939575, + "step": 62976 + }, + { + "epoch": 1.8450596938401818, + "eval_bleu": 0.35080206070186376, + "eval_cap_loss": 0.9088912010192871, + "eval_con_loss": 1.150328516960144, + "eval_loss": 3.209548234939575, + "eval_runtime": 62.2975, + "eval_samples_per_second": 321.04, + "eval_steps_per_second": 0.321, + "step": 62976 + }, + { + "epoch": 1.8451768842012743, + "grad_norm": 8.577282905578613, + "learning_rate": 3.4115721486640923e-06, + 
"loss": 3.0316, + "step": 62980 + }, + { + "epoch": 1.8454698601040065, + "grad_norm": 8.722970008850098, + "learning_rate": 3.410072668140407e-06, + "loss": 3.0767, + "step": 62990 + }, + { + "epoch": 1.8457628360067384, + "grad_norm": 8.925727844238281, + "learning_rate": 3.4085733466795203e-06, + "loss": 3.0503, + "step": 63000 + }, + { + "epoch": 1.8460558119094705, + "grad_norm": 9.497374534606934, + "learning_rate": 3.407074184431426e-06, + "loss": 3.0492, + "step": 63010 + }, + { + "epoch": 1.8463487878122025, + "grad_norm": 9.190235137939453, + "learning_rate": 3.4055751815461102e-06, + "loss": 3.0458, + "step": 63020 + }, + { + "epoch": 1.8466417637149344, + "grad_norm": 9.189004898071289, + "learning_rate": 3.4040763381735377e-06, + "loss": 3.0606, + "step": 63030 + }, + { + "epoch": 1.8469347396176663, + "grad_norm": 9.321066856384277, + "learning_rate": 3.4025776544636606e-06, + "loss": 3.087, + "step": 63040 + }, + { + "epoch": 1.8472277155203984, + "grad_norm": 9.634866714477539, + "learning_rate": 3.4010791305664103e-06, + "loss": 3.062, + "step": 63050 + }, + { + "epoch": 1.8475206914231306, + "grad_norm": 8.445170402526855, + "learning_rate": 3.399580766631708e-06, + "loss": 3.0634, + "step": 63060 + }, + { + "epoch": 1.8478136673258625, + "grad_norm": 8.823590278625488, + "learning_rate": 3.3980825628094537e-06, + "loss": 3.0671, + "step": 63070 + }, + { + "epoch": 1.8481066432285944, + "grad_norm": 9.173791885375977, + "learning_rate": 3.3965845192495373e-06, + "loss": 3.0449, + "step": 63080 + }, + { + "epoch": 1.8483996191313263, + "grad_norm": 9.692294120788574, + "learning_rate": 3.3950866361018253e-06, + "loss": 3.0572, + "step": 63090 + }, + { + "epoch": 1.8486925950340585, + "grad_norm": 8.679134368896484, + "learning_rate": 3.393588913516175e-06, + "loss": 3.0705, + "step": 63100 + }, + { + "epoch": 1.8489855709367906, + "grad_norm": 10.004046440124512, + "learning_rate": 3.3920913516424213e-06, + "loss": 3.0578, + "step": 63110 + }, + { 
+ "epoch": 1.8492785468395225, + "grad_norm": 9.140739440917969, + "learning_rate": 3.3905939506303908e-06, + "loss": 3.0403, + "step": 63120 + }, + { + "epoch": 1.8495715227422544, + "grad_norm": 9.589123725891113, + "learning_rate": 3.3890967106298855e-06, + "loss": 3.0632, + "step": 63130 + }, + { + "epoch": 1.8498644986449864, + "grad_norm": 8.937101364135742, + "learning_rate": 3.387599631790699e-06, + "loss": 3.0437, + "step": 63140 + }, + { + "epoch": 1.8501574745477183, + "grad_norm": 8.846139907836914, + "learning_rate": 3.3861027142626014e-06, + "loss": 3.033, + "step": 63150 + }, + { + "epoch": 1.8504504504504504, + "grad_norm": 8.77951431274414, + "learning_rate": 3.3846059581953537e-06, + "loss": 3.0425, + "step": 63160 + }, + { + "epoch": 1.8507434263531826, + "grad_norm": 9.42020034790039, + "learning_rate": 3.3831093637386954e-06, + "loss": 3.0712, + "step": 63170 + }, + { + "epoch": 1.8510364022559145, + "grad_norm": 8.402292251586914, + "learning_rate": 3.381612931042353e-06, + "loss": 3.0452, + "step": 63180 + }, + { + "epoch": 1.8513293781586464, + "grad_norm": 8.875062942504883, + "learning_rate": 3.3801166602560343e-06, + "loss": 3.0704, + "step": 63190 + }, + { + "epoch": 1.8516223540613783, + "grad_norm": 8.504937171936035, + "learning_rate": 3.3786205515294344e-06, + "loss": 3.0464, + "step": 63200 + }, + { + "epoch": 1.8519153299641105, + "grad_norm": 8.690715789794922, + "learning_rate": 3.3771246050122286e-06, + "loss": 3.0488, + "step": 63210 + }, + { + "epoch": 1.8522083058668426, + "grad_norm": 8.752854347229004, + "learning_rate": 3.3756288208540767e-06, + "loss": 3.068, + "step": 63220 + }, + { + "epoch": 1.8525012817695745, + "grad_norm": 9.250186920166016, + "learning_rate": 3.3741331992046256e-06, + "loss": 3.0691, + "step": 63230 + }, + { + "epoch": 1.8527942576723064, + "grad_norm": 9.274179458618164, + "learning_rate": 3.3726377402135012e-06, + "loss": 3.068, + "step": 63240 + }, + { + "epoch": 1.8530872335750384, + 
"grad_norm": 9.15030574798584, + "learning_rate": 3.3711424440303174e-06, + "loss": 3.0523, + "step": 63250 + }, + { + "epoch": 1.8533802094777705, + "grad_norm": 9.263562202453613, + "learning_rate": 3.3696473108046675e-06, + "loss": 3.0509, + "step": 63260 + }, + { + "epoch": 1.8536731853805024, + "grad_norm": 8.546683311462402, + "learning_rate": 3.3681523406861315e-06, + "loss": 3.0629, + "step": 63270 + }, + { + "epoch": 1.8539661612832345, + "grad_norm": 8.540765762329102, + "learning_rate": 3.3666575338242722e-06, + "loss": 3.0665, + "step": 63280 + }, + { + "epoch": 1.8542591371859665, + "grad_norm": 9.448084831237793, + "learning_rate": 3.365162890368638e-06, + "loss": 3.069, + "step": 63290 + }, + { + "epoch": 1.8545521130886984, + "grad_norm": 9.275680541992188, + "learning_rate": 3.3636684104687557e-06, + "loss": 3.0417, + "step": 63300 + }, + { + "epoch": 1.8548450889914303, + "grad_norm": 9.548877716064453, + "learning_rate": 3.362174094274141e-06, + "loss": 3.0662, + "step": 63310 + }, + { + "epoch": 1.8551380648941624, + "grad_norm": 8.977487564086914, + "learning_rate": 3.3606799419342916e-06, + "loss": 3.0462, + "step": 63320 + }, + { + "epoch": 1.8554310407968946, + "grad_norm": 9.132976531982422, + "learning_rate": 3.3591859535986894e-06, + "loss": 3.0694, + "step": 63330 + }, + { + "epoch": 1.8557240166996265, + "grad_norm": 8.267518997192383, + "learning_rate": 3.3576921294167963e-06, + "loss": 3.0386, + "step": 63340 + }, + { + "epoch": 1.8560169926023584, + "grad_norm": 9.296791076660156, + "learning_rate": 3.356198469538063e-06, + "loss": 3.0373, + "step": 63350 + }, + { + "epoch": 1.8563099685050903, + "grad_norm": 9.322319030761719, + "learning_rate": 3.3547049741119196e-06, + "loss": 3.0718, + "step": 63360 + }, + { + "epoch": 1.8566029444078225, + "grad_norm": 8.69913101196289, + "learning_rate": 3.3532116432877836e-06, + "loss": 3.0658, + "step": 63370 + }, + { + "epoch": 1.8568959203105546, + "grad_norm": 9.141465187072754, + 
"learning_rate": 3.351718477215052e-06, + "loss": 3.0716, + "step": 63380 + }, + { + "epoch": 1.8571888962132865, + "grad_norm": 9.011565208435059, + "learning_rate": 3.3502254760431074e-06, + "loss": 3.0644, + "step": 63390 + }, + { + "epoch": 1.8574818721160185, + "grad_norm": 9.248270034790039, + "learning_rate": 3.348732639921316e-06, + "loss": 3.0777, + "step": 63400 + }, + { + "epoch": 1.8577748480187504, + "grad_norm": 9.125016212463379, + "learning_rate": 3.3472399689990297e-06, + "loss": 3.0638, + "step": 63410 + }, + { + "epoch": 1.8580678239214825, + "grad_norm": 9.299680709838867, + "learning_rate": 3.345747463425576e-06, + "loss": 3.0595, + "step": 63420 + }, + { + "epoch": 1.8583607998242144, + "grad_norm": 8.509663581848145, + "learning_rate": 3.3442551233502763e-06, + "loss": 3.0253, + "step": 63430 + }, + { + "epoch": 1.8586537757269466, + "grad_norm": 9.175427436828613, + "learning_rate": 3.3427629489224268e-06, + "loss": 3.0547, + "step": 63440 + }, + { + "epoch": 1.8589467516296785, + "grad_norm": 8.798880577087402, + "learning_rate": 3.3412709402913134e-06, + "loss": 3.0642, + "step": 63450 + }, + { + "epoch": 1.8592397275324104, + "grad_norm": 9.159967422485352, + "learning_rate": 3.3397790976062004e-06, + "loss": 3.063, + "step": 63460 + }, + { + "epoch": 1.8595327034351423, + "grad_norm": 9.141916275024414, + "learning_rate": 3.33828742101634e-06, + "loss": 3.0604, + "step": 63470 + }, + { + "epoch": 1.8598256793378745, + "grad_norm": 9.742531776428223, + "learning_rate": 3.3367959106709633e-06, + "loss": 3.0462, + "step": 63480 + }, + { + "epoch": 1.86006006006006, + "eval_bleu": 0.3505541658066704, + "eval_cap_loss": 0.9087993502616882, + "eval_con_loss": 1.1492462158203125, + "eval_loss": 3.207291603088379, + "step": 63488 + }, + { + "epoch": 1.86006006006006, + "eval_bleu": 0.3505541658066704, + "eval_cap_loss": 0.9087993502616882, + "eval_con_loss": 1.1492462158203125, + "eval_loss": 3.207291603088379, + "eval_runtime": 55.2989, + 
"eval_samples_per_second": 361.671, + "eval_steps_per_second": 0.362, + "step": 63488 + }, + { + "epoch": 1.8601186552406066, + "grad_norm": 9.33288288116455, + "learning_rate": 3.3353045667192905e-06, + "loss": 3.034, + "step": 63490 + }, + { + "epoch": 1.8604116311433385, + "grad_norm": 9.218969345092773, + "learning_rate": 3.3338133893105175e-06, + "loss": 3.0562, + "step": 63500 + }, + { + "epoch": 1.8607046070460704, + "grad_norm": 8.880365371704102, + "learning_rate": 3.332322378593831e-06, + "loss": 3.0658, + "step": 63510 + }, + { + "epoch": 1.8609975829488024, + "grad_norm": 8.974156379699707, + "learning_rate": 3.3308315347183947e-06, + "loss": 3.0696, + "step": 63520 + }, + { + "epoch": 1.8612905588515345, + "grad_norm": 9.151590347290039, + "learning_rate": 3.329340857833363e-06, + "loss": 3.0417, + "step": 63530 + }, + { + "epoch": 1.8615835347542664, + "grad_norm": 9.153843879699707, + "learning_rate": 3.3278503480878642e-06, + "loss": 3.0623, + "step": 63540 + }, + { + "epoch": 1.8618765106569986, + "grad_norm": 9.475127220153809, + "learning_rate": 3.3263600056310198e-06, + "loss": 3.0459, + "step": 63550 + }, + { + "epoch": 1.8621694865597305, + "grad_norm": 9.47914981842041, + "learning_rate": 3.3248698306119256e-06, + "loss": 3.0391, + "step": 63560 + }, + { + "epoch": 1.8624624624624624, + "grad_norm": 8.609469413757324, + "learning_rate": 3.323379823179669e-06, + "loss": 3.0459, + "step": 63570 + }, + { + "epoch": 1.8627554383651943, + "grad_norm": 10.25971794128418, + "learning_rate": 3.321889983483311e-06, + "loss": 3.0665, + "step": 63580 + }, + { + "epoch": 1.8630484142679264, + "grad_norm": 9.426351547241211, + "learning_rate": 3.320400311671905e-06, + "loss": 3.0348, + "step": 63590 + }, + { + "epoch": 1.8633413901706586, + "grad_norm": 9.66862678527832, + "learning_rate": 3.318910807894484e-06, + "loss": 3.0511, + "step": 63600 + }, + { + "epoch": 1.8636343660733905, + "grad_norm": 9.090420722961426, + "learning_rate": 
3.317421472300063e-06, + "loss": 3.0762, + "step": 63610 + }, + { + "epoch": 1.8639273419761224, + "grad_norm": 8.604307174682617, + "learning_rate": 3.3159323050376422e-06, + "loss": 3.057, + "step": 63620 + }, + { + "epoch": 1.8642203178788543, + "grad_norm": 8.992502212524414, + "learning_rate": 3.3144433062562022e-06, + "loss": 3.0563, + "step": 63630 + }, + { + "epoch": 1.8645132937815865, + "grad_norm": 8.977513313293457, + "learning_rate": 3.3129544761047093e-06, + "loss": 3.0483, + "step": 63640 + }, + { + "epoch": 1.8648062696843186, + "grad_norm": 9.204197883605957, + "learning_rate": 3.311465814732111e-06, + "loss": 3.0634, + "step": 63650 + }, + { + "epoch": 1.8650992455870505, + "grad_norm": 9.01103687286377, + "learning_rate": 3.3099773222873434e-06, + "loss": 3.0599, + "step": 63660 + }, + { + "epoch": 1.8653922214897825, + "grad_norm": 9.362427711486816, + "learning_rate": 3.3084889989193148e-06, + "loss": 3.0617, + "step": 63670 + }, + { + "epoch": 1.8656851973925144, + "grad_norm": 9.509366035461426, + "learning_rate": 3.307000844776928e-06, + "loss": 3.0649, + "step": 63680 + }, + { + "epoch": 1.8659781732952465, + "grad_norm": 9.30508804321289, + "learning_rate": 3.305512860009061e-06, + "loss": 3.0541, + "step": 63690 + }, + { + "epoch": 1.8662711491979784, + "grad_norm": 9.453890800476074, + "learning_rate": 3.3040250447645826e-06, + "loss": 3.051, + "step": 63700 + }, + { + "epoch": 1.8665641251007106, + "grad_norm": 9.261798858642578, + "learning_rate": 3.3025373991923336e-06, + "loss": 3.0583, + "step": 63710 + }, + { + "epoch": 1.8668571010034425, + "grad_norm": 8.753434181213379, + "learning_rate": 3.3010499234411485e-06, + "loss": 3.0736, + "step": 63720 + }, + { + "epoch": 1.8671500769061744, + "grad_norm": 8.875327110290527, + "learning_rate": 3.2995626176598378e-06, + "loss": 3.0574, + "step": 63730 + }, + { + "epoch": 1.8674430528089063, + "grad_norm": 9.232061386108398, + "learning_rate": 3.2980754819972006e-06, + "loss": 3.0695, + 
"step": 63740 + }, + { + "epoch": 1.8677360287116385, + "grad_norm": 9.805264472961426, + "learning_rate": 3.2965885166020128e-06, + "loss": 3.0534, + "step": 63750 + }, + { + "epoch": 1.8680290046143706, + "grad_norm": 8.821125030517578, + "learning_rate": 3.2951017216230387e-06, + "loss": 3.0445, + "step": 63760 + }, + { + "epoch": 1.8683219805171025, + "grad_norm": 8.966998100280762, + "learning_rate": 3.293615097209021e-06, + "loss": 3.0617, + "step": 63770 + }, + { + "epoch": 1.8686149564198344, + "grad_norm": 8.587024688720703, + "learning_rate": 3.292128643508692e-06, + "loss": 3.0461, + "step": 63780 + }, + { + "epoch": 1.8689079323225664, + "grad_norm": 8.848190307617188, + "learning_rate": 3.2906423606707576e-06, + "loss": 3.0452, + "step": 63790 + }, + { + "epoch": 1.8692009082252985, + "grad_norm": 8.99872875213623, + "learning_rate": 3.2891562488439145e-06, + "loss": 3.0369, + "step": 63800 + }, + { + "epoch": 1.8694938841280304, + "grad_norm": 8.687289237976074, + "learning_rate": 3.2876703081768378e-06, + "loss": 3.0394, + "step": 63810 + }, + { + "epoch": 1.8697868600307626, + "grad_norm": 9.96061897277832, + "learning_rate": 3.28618453881819e-06, + "loss": 3.0216, + "step": 63820 + }, + { + "epoch": 1.8700798359334945, + "grad_norm": 9.332474708557129, + "learning_rate": 3.284698940916609e-06, + "loss": 3.0761, + "step": 63830 + }, + { + "epoch": 1.8703728118362264, + "grad_norm": 9.147355079650879, + "learning_rate": 3.283213514620724e-06, + "loss": 3.04, + "step": 63840 + }, + { + "epoch": 1.8706657877389583, + "grad_norm": 9.691062927246094, + "learning_rate": 3.28172826007914e-06, + "loss": 3.0625, + "step": 63850 + }, + { + "epoch": 1.8709587636416904, + "grad_norm": 8.609454154968262, + "learning_rate": 3.2802431774404514e-06, + "loss": 3.0203, + "step": 63860 + }, + { + "epoch": 1.8712517395444226, + "grad_norm": 9.63193416595459, + "learning_rate": 3.278758266853228e-06, + "loss": 3.0698, + "step": 63870 + }, + { + "epoch": 
1.8715447154471545, + "grad_norm": 9.26760196685791, + "learning_rate": 3.2772735284660295e-06, + "loss": 3.0459, + "step": 63880 + }, + { + "epoch": 1.8718376913498864, + "grad_norm": 8.209235191345215, + "learning_rate": 3.2757889624273925e-06, + "loss": 3.0376, + "step": 63890 + }, + { + "epoch": 1.8721306672526183, + "grad_norm": 8.85163402557373, + "learning_rate": 3.2743045688858425e-06, + "loss": 3.0571, + "step": 63900 + }, + { + "epoch": 1.8724236431553505, + "grad_norm": 9.262887954711914, + "learning_rate": 3.27282034798988e-06, + "loss": 3.0675, + "step": 63910 + }, + { + "epoch": 1.8727166190580826, + "grad_norm": 8.875672340393066, + "learning_rate": 3.271336299887996e-06, + "loss": 3.0632, + "step": 63920 + }, + { + "epoch": 1.8730095949608145, + "grad_norm": 9.07973861694336, + "learning_rate": 3.269852424728659e-06, + "loss": 3.075, + "step": 63930 + }, + { + "epoch": 1.8733025708635465, + "grad_norm": 9.108926773071289, + "learning_rate": 3.2683687226603212e-06, + "loss": 3.0483, + "step": 63940 + }, + { + "epoch": 1.8735955467662784, + "grad_norm": 8.812748908996582, + "learning_rate": 3.2668851938314217e-06, + "loss": 3.0511, + "step": 63950 + }, + { + "epoch": 1.8738885226690105, + "grad_norm": 9.392999649047852, + "learning_rate": 3.2654018383903736e-06, + "loss": 3.0532, + "step": 63960 + }, + { + "epoch": 1.8741814985717424, + "grad_norm": 9.116098403930664, + "learning_rate": 3.263918656485583e-06, + "loss": 3.0419, + "step": 63970 + }, + { + "epoch": 1.8744744744744746, + "grad_norm": 8.710110664367676, + "learning_rate": 3.2624356482654297e-06, + "loss": 3.05, + "step": 63980 + }, + { + "epoch": 1.8747674503772065, + "grad_norm": 8.927573204040527, + "learning_rate": 3.2609528138782826e-06, + "loss": 3.0545, + "step": 63990 + }, + { + "epoch": 1.8750604262799384, + "grad_norm": 8.790080070495605, + "learning_rate": 3.259470153472486e-06, + "loss": 3.054, + "step": 64000 + }, + { + "epoch": 1.8750604262799384, + "eval_bleu": 
0.3514506099028949, + "eval_cap_loss": 0.9080103635787964, + "eval_con_loss": 1.1488447189331055, + "eval_loss": 3.205699920654297, + "step": 64000 + }, + { + "epoch": 1.8750604262799384, + "eval_bleu": 0.3514506099028949, + "eval_cap_loss": 0.9080103635787964, + "eval_con_loss": 1.1488447189331055, + "eval_loss": 3.205699920654297, + "eval_runtime": 56.456, + "eval_samples_per_second": 354.258, + "eval_steps_per_second": 0.354, + "step": 64000 + }, + { + "epoch": 1.8753534021826703, + "grad_norm": 9.052239418029785, + "learning_rate": 3.2579876671963776e-06, + "loss": 3.0345, + "step": 64010 + }, + { + "epoch": 1.8756463780854025, + "grad_norm": 9.206005096435547, + "learning_rate": 3.256505355198266e-06, + "loss": 3.0513, + "step": 64020 + }, + { + "epoch": 1.8759393539881346, + "grad_norm": 9.046277046203613, + "learning_rate": 3.2550232176264516e-06, + "loss": 3.0435, + "step": 64030 + }, + { + "epoch": 1.8762323298908665, + "grad_norm": 8.466553688049316, + "learning_rate": 3.2535412546292093e-06, + "loss": 3.0453, + "step": 64040 + }, + { + "epoch": 1.8765253057935984, + "grad_norm": 8.99744701385498, + "learning_rate": 3.2520594663548055e-06, + "loss": 3.0664, + "step": 64050 + }, + { + "epoch": 1.8768182816963304, + "grad_norm": 9.42381477355957, + "learning_rate": 3.250577852951481e-06, + "loss": 3.051, + "step": 64060 + }, + { + "epoch": 1.8771112575990625, + "grad_norm": 8.540202140808105, + "learning_rate": 3.249096414567464e-06, + "loss": 3.0515, + "step": 64070 + }, + { + "epoch": 1.8774042335017946, + "grad_norm": 9.313702583312988, + "learning_rate": 3.24761515135096e-06, + "loss": 3.0533, + "step": 64080 + }, + { + "epoch": 1.8776972094045266, + "grad_norm": 9.834268569946289, + "learning_rate": 3.2461340634501677e-06, + "loss": 3.0544, + "step": 64090 + }, + { + "epoch": 1.8779901853072585, + "grad_norm": 10.035770416259766, + "learning_rate": 3.244653151013254e-06, + "loss": 3.065, + "step": 64100 + }, + { + "epoch": 1.8782831612099904, + 
"grad_norm": 9.166455268859863, + "learning_rate": 3.243172414188381e-06, + "loss": 3.05, + "step": 64110 + }, + { + "epoch": 1.8785761371127225, + "grad_norm": 9.148099899291992, + "learning_rate": 3.241691853123682e-06, + "loss": 3.0421, + "step": 64120 + }, + { + "epoch": 1.8788691130154545, + "grad_norm": 8.577225685119629, + "learning_rate": 3.240211467967284e-06, + "loss": 3.0377, + "step": 64130 + }, + { + "epoch": 1.8791620889181866, + "grad_norm": 9.769481658935547, + "learning_rate": 3.2387312588672865e-06, + "loss": 3.0772, + "step": 64140 + }, + { + "epoch": 1.8794550648209185, + "grad_norm": 9.711018562316895, + "learning_rate": 3.237251225971779e-06, + "loss": 3.0427, + "step": 64150 + }, + { + "epoch": 1.8797480407236504, + "grad_norm": 8.983983993530273, + "learning_rate": 3.2357713694288255e-06, + "loss": 3.0616, + "step": 64160 + }, + { + "epoch": 1.8800410166263823, + "grad_norm": 8.819991111755371, + "learning_rate": 3.234291689386483e-06, + "loss": 3.068, + "step": 64170 + }, + { + "epoch": 1.8803339925291145, + "grad_norm": 9.10708999633789, + "learning_rate": 3.2328121859927788e-06, + "loss": 3.0331, + "step": 64180 + }, + { + "epoch": 1.8806269684318466, + "grad_norm": 9.290393829345703, + "learning_rate": 3.231332859395734e-06, + "loss": 3.0735, + "step": 64190 + }, + { + "epoch": 1.8809199443345785, + "grad_norm": 9.166808128356934, + "learning_rate": 3.229853709743339e-06, + "loss": 3.0334, + "step": 64200 + }, + { + "epoch": 1.8812129202373105, + "grad_norm": 9.39996337890625, + "learning_rate": 3.2283747371835826e-06, + "loss": 3.056, + "step": 64210 + }, + { + "epoch": 1.8815058961400424, + "grad_norm": 8.673916816711426, + "learning_rate": 3.226895941864421e-06, + "loss": 3.0634, + "step": 64220 + }, + { + "epoch": 1.8817988720427745, + "grad_norm": 9.071918487548828, + "learning_rate": 3.225417323933802e-06, + "loss": 3.0624, + "step": 64230 + }, + { + "epoch": 1.8820918479455064, + "grad_norm": 9.078619956970215, + "learning_rate": 
3.2239388835396492e-06, + "loss": 3.0271, + "step": 64240 + }, + { + "epoch": 1.8823848238482386, + "grad_norm": 8.806164741516113, + "learning_rate": 3.222460620829877e-06, + "loss": 3.0578, + "step": 64250 + }, + { + "epoch": 1.8826777997509705, + "grad_norm": 9.465846061706543, + "learning_rate": 3.2209825359523717e-06, + "loss": 3.0557, + "step": 64260 + }, + { + "epoch": 1.8829707756537024, + "grad_norm": 9.396425247192383, + "learning_rate": 3.2195046290550114e-06, + "loss": 3.0644, + "step": 64270 + }, + { + "epoch": 1.8832637515564343, + "grad_norm": 9.063594818115234, + "learning_rate": 3.2180269002856467e-06, + "loss": 3.0474, + "step": 64280 + }, + { + "epoch": 1.8835567274591665, + "grad_norm": 9.057476043701172, + "learning_rate": 3.2165493497921217e-06, + "loss": 3.0448, + "step": 64290 + }, + { + "epoch": 1.8838497033618986, + "grad_norm": 8.975831985473633, + "learning_rate": 3.2150719777222504e-06, + "loss": 3.0329, + "step": 64300 + }, + { + "epoch": 1.8841426792646305, + "grad_norm": 8.694581031799316, + "learning_rate": 3.213594784223839e-06, + "loss": 3.0394, + "step": 64310 + }, + { + "epoch": 1.8844356551673624, + "grad_norm": 8.96037483215332, + "learning_rate": 3.212117769444672e-06, + "loss": 3.0372, + "step": 64320 + }, + { + "epoch": 1.8847286310700944, + "grad_norm": 8.587617874145508, + "learning_rate": 3.210640933532514e-06, + "loss": 3.0428, + "step": 64330 + }, + { + "epoch": 1.8850216069728265, + "grad_norm": 8.802291870117188, + "learning_rate": 3.2091642766351164e-06, + "loss": 3.053, + "step": 64340 + }, + { + "epoch": 1.8853145828755586, + "grad_norm": 9.002342224121094, + "learning_rate": 3.2076877989002065e-06, + "loss": 3.0503, + "step": 64350 + }, + { + "epoch": 1.8856075587782906, + "grad_norm": 8.854019165039062, + "learning_rate": 3.2062115004755003e-06, + "loss": 3.0475, + "step": 64360 + }, + { + "epoch": 1.8859005346810225, + "grad_norm": 9.710904121398926, + "learning_rate": 3.2047353815086894e-06, + "loss": 3.0653, 
+ "step": 64370 + }, + { + "epoch": 1.8861935105837544, + "grad_norm": 9.01966381072998, + "learning_rate": 3.203259442147455e-06, + "loss": 3.0309, + "step": 64380 + }, + { + "epoch": 1.8864864864864865, + "grad_norm": 8.94744873046875, + "learning_rate": 3.2017836825394523e-06, + "loss": 3.0434, + "step": 64390 + }, + { + "epoch": 1.8867794623892185, + "grad_norm": 9.738163948059082, + "learning_rate": 3.2003081028323243e-06, + "loss": 3.0548, + "step": 64400 + }, + { + "epoch": 1.8870724382919506, + "grad_norm": 9.035741806030273, + "learning_rate": 3.198832703173692e-06, + "loss": 3.045, + "step": 64410 + }, + { + "epoch": 1.8873654141946825, + "grad_norm": 8.910844802856445, + "learning_rate": 3.1973574837111643e-06, + "loss": 3.0475, + "step": 64420 + }, + { + "epoch": 1.8876583900974144, + "grad_norm": 9.29116439819336, + "learning_rate": 3.195882444592323e-06, + "loss": 3.0586, + "step": 64430 + }, + { + "epoch": 1.8879513660001463, + "grad_norm": 9.059956550598145, + "learning_rate": 3.1944075859647415e-06, + "loss": 3.0431, + "step": 64440 + }, + { + "epoch": 1.8882443419028785, + "grad_norm": 8.910731315612793, + "learning_rate": 3.1929329079759675e-06, + "loss": 3.0426, + "step": 64450 + }, + { + "epoch": 1.8885373178056106, + "grad_norm": 8.982834815979004, + "learning_rate": 3.1914584107735367e-06, + "loss": 3.0373, + "step": 64460 + }, + { + "epoch": 1.8888302937083425, + "grad_norm": 8.74328899383545, + "learning_rate": 3.189984094504961e-06, + "loss": 3.0361, + "step": 64470 + }, + { + "epoch": 1.8891232696110745, + "grad_norm": 9.460076332092285, + "learning_rate": 3.188509959317738e-06, + "loss": 3.0425, + "step": 64480 + }, + { + "epoch": 1.8894162455138064, + "grad_norm": 8.989214897155762, + "learning_rate": 3.1870360053593463e-06, + "loss": 3.0663, + "step": 64490 + }, + { + "epoch": 1.8897092214165385, + "grad_norm": 9.403727531433105, + "learning_rate": 3.1855622327772483e-06, + "loss": 3.0238, + "step": 64500 + }, + { + "epoch": 
1.8900021973192704, + "grad_norm": 9.682820320129395, + "learning_rate": 3.1840886417188823e-06, + "loss": 3.0622, + "step": 64510 + }, + { + "epoch": 1.890060792499817, + "eval_bleu": 0.3511256855386195, + "eval_cap_loss": 0.9072508811950684, + "eval_con_loss": 1.146315574645996, + "eval_loss": 3.1998820304870605, + "step": 64512 + }, + { + "epoch": 1.890060792499817, + "eval_bleu": 0.3511256855386195, + "eval_cap_loss": 0.9072508811950684, + "eval_con_loss": 1.146315574645996, + "eval_loss": 3.1998820304870605, + "eval_runtime": 58.2132, + "eval_samples_per_second": 343.565, + "eval_steps_per_second": 0.344, + "step": 64512 + }, + { + "epoch": 1.8902951732220026, + "grad_norm": 9.475645065307617, + "learning_rate": 3.182615232331675e-06, + "loss": 3.0569, + "step": 64520 + }, + { + "epoch": 1.8905881491247345, + "grad_norm": 9.763463973999023, + "learning_rate": 3.1811420047630304e-06, + "loss": 3.0387, + "step": 64530 + }, + { + "epoch": 1.8908811250274664, + "grad_norm": 8.98813533782959, + "learning_rate": 3.179668959160339e-06, + "loss": 3.0479, + "step": 64540 + }, + { + "epoch": 1.8911741009301983, + "grad_norm": 8.624966621398926, + "learning_rate": 3.178196095670967e-06, + "loss": 3.0453, + "step": 64550 + }, + { + "epoch": 1.8914670768329305, + "grad_norm": 8.887187957763672, + "learning_rate": 3.1767234144422685e-06, + "loss": 3.0565, + "step": 64560 + }, + { + "epoch": 1.8917600527356626, + "grad_norm": 9.030359268188477, + "learning_rate": 3.1752509156215738e-06, + "loss": 3.0635, + "step": 64570 + }, + { + "epoch": 1.8920530286383945, + "grad_norm": 8.742783546447754, + "learning_rate": 3.1737785993562015e-06, + "loss": 3.0412, + "step": 64580 + }, + { + "epoch": 1.8923460045411264, + "grad_norm": 8.991765975952148, + "learning_rate": 3.1723064657934434e-06, + "loss": 3.0441, + "step": 64590 + }, + { + "epoch": 1.8926389804438584, + "grad_norm": 8.956560134887695, + "learning_rate": 3.170834515080581e-06, + "loss": 3.0329, + "step": 64600 + }, + { + 
"epoch": 1.8929319563465905, + "grad_norm": 8.777953147888184, + "learning_rate": 3.1693627473648723e-06, + "loss": 3.0423, + "step": 64610 + }, + { + "epoch": 1.8932249322493226, + "grad_norm": 9.252917289733887, + "learning_rate": 3.167891162793563e-06, + "loss": 3.0568, + "step": 64620 + }, + { + "epoch": 1.8935179081520546, + "grad_norm": 8.840283393859863, + "learning_rate": 3.16641976151387e-06, + "loss": 3.0567, + "step": 64630 + }, + { + "epoch": 1.8938108840547865, + "grad_norm": 9.552207946777344, + "learning_rate": 3.1649485436730032e-06, + "loss": 3.0616, + "step": 64640 + }, + { + "epoch": 1.8941038599575184, + "grad_norm": 9.40411376953125, + "learning_rate": 3.163477509418146e-06, + "loss": 3.043, + "step": 64650 + }, + { + "epoch": 1.8943968358602505, + "grad_norm": 8.858837127685547, + "learning_rate": 3.1620066588964694e-06, + "loss": 3.0497, + "step": 64660 + }, + { + "epoch": 1.8946898117629825, + "grad_norm": 9.253886222839355, + "learning_rate": 3.160535992255123e-06, + "loss": 3.0277, + "step": 64670 + }, + { + "epoch": 1.8949827876657146, + "grad_norm": 9.21107292175293, + "learning_rate": 3.159065509641236e-06, + "loss": 3.0531, + "step": 64680 + }, + { + "epoch": 1.8952757635684465, + "grad_norm": 9.21335506439209, + "learning_rate": 3.1575952112019235e-06, + "loss": 3.0641, + "step": 64690 + }, + { + "epoch": 1.8955687394711784, + "grad_norm": 8.849775314331055, + "learning_rate": 3.1562721001973766e-06, + "loss": 3.0384, + "step": 64700 + }, + { + "epoch": 1.8958617153739104, + "grad_norm": 9.769028663635254, + "learning_rate": 3.1548021520949867e-06, + "loss": 3.0448, + "step": 64710 + }, + { + "epoch": 1.8961546912766425, + "grad_norm": 8.406717300415039, + "learning_rate": 3.153332388593692e-06, + "loss": 3.0413, + "step": 64720 + }, + { + "epoch": 1.8964476671793746, + "grad_norm": 9.063920021057129, + "learning_rate": 3.151862809840536e-06, + "loss": 3.0429, + "step": 64730 + }, + { + "epoch": 1.8967406430821065, + "grad_norm": 
9.391977310180664, + "learning_rate": 3.150393415982541e-06, + "loss": 3.0554, + "step": 64740 + }, + { + "epoch": 1.8970336189848385, + "grad_norm": 9.523113250732422, + "learning_rate": 3.1489242071667137e-06, + "loss": 3.0489, + "step": 64750 + }, + { + "epoch": 1.8973265948875704, + "grad_norm": 9.03508186340332, + "learning_rate": 3.147455183540035e-06, + "loss": 3.0583, + "step": 64760 + }, + { + "epoch": 1.8976195707903025, + "grad_norm": 9.092524528503418, + "learning_rate": 3.1459863452494753e-06, + "loss": 3.046, + "step": 64770 + }, + { + "epoch": 1.8979125466930347, + "grad_norm": 8.254154205322266, + "learning_rate": 3.1445176924419817e-06, + "loss": 3.0315, + "step": 64780 + }, + { + "epoch": 1.8982055225957666, + "grad_norm": 9.212616920471191, + "learning_rate": 3.143049225264486e-06, + "loss": 3.0348, + "step": 64790 + }, + { + "epoch": 1.8984984984984985, + "grad_norm": 9.416128158569336, + "learning_rate": 3.141580943863897e-06, + "loss": 3.0554, + "step": 64800 + }, + { + "epoch": 1.8987914744012304, + "grad_norm": 9.119231224060059, + "learning_rate": 3.14011284838711e-06, + "loss": 3.04, + "step": 64810 + }, + { + "epoch": 1.8990844503039623, + "grad_norm": 9.288166999816895, + "learning_rate": 3.1386449389809968e-06, + "loss": 3.0435, + "step": 64820 + }, + { + "epoch": 1.8993774262066945, + "grad_norm": 9.45418930053711, + "learning_rate": 3.137177215792417e-06, + "loss": 3.0684, + "step": 64830 + }, + { + "epoch": 1.8996704021094266, + "grad_norm": 9.390993118286133, + "learning_rate": 3.1357096789682028e-06, + "loss": 3.0379, + "step": 64840 + }, + { + "epoch": 1.8999633780121585, + "grad_norm": 9.091148376464844, + "learning_rate": 3.1342423286551756e-06, + "loss": 3.0409, + "step": 64850 + }, + { + "epoch": 1.9002563539148905, + "grad_norm": 8.175374984741211, + "learning_rate": 3.1327751650001333e-06, + "loss": 3.0557, + "step": 64860 + }, + { + "epoch": 1.9005493298176224, + "grad_norm": 9.472800254821777, + "learning_rate": 
3.1313081881498597e-06, + "loss": 3.026, + "step": 64870 + }, + { + "epoch": 1.9008423057203545, + "grad_norm": 8.906452178955078, + "learning_rate": 3.129841398251113e-06, + "loss": 3.0517, + "step": 64880 + }, + { + "epoch": 1.9011352816230866, + "grad_norm": 8.785847663879395, + "learning_rate": 3.128374795450641e-06, + "loss": 3.0376, + "step": 64890 + }, + { + "epoch": 1.9014282575258186, + "grad_norm": 8.945577621459961, + "learning_rate": 3.1269083798951657e-06, + "loss": 3.0532, + "step": 64900 + }, + { + "epoch": 1.9017212334285505, + "grad_norm": 8.821545600891113, + "learning_rate": 3.125442151731396e-06, + "loss": 3.0322, + "step": 64910 + }, + { + "epoch": 1.9020142093312824, + "grad_norm": 9.821475982666016, + "learning_rate": 3.123976111106015e-06, + "loss": 3.0326, + "step": 64920 + }, + { + "epoch": 1.9023071852340145, + "grad_norm": 9.881537437438965, + "learning_rate": 3.1225102581656965e-06, + "loss": 3.0571, + "step": 64930 + }, + { + "epoch": 1.9026001611367465, + "grad_norm": 9.212247848510742, + "learning_rate": 3.1210445930570855e-06, + "loss": 3.0521, + "step": 64940 + }, + { + "epoch": 1.9028931370394786, + "grad_norm": 9.345014572143555, + "learning_rate": 3.119579115926818e-06, + "loss": 3.0346, + "step": 64950 + }, + { + "epoch": 1.9031861129422105, + "grad_norm": 9.29224681854248, + "learning_rate": 3.1181138269215018e-06, + "loss": 3.045, + "step": 64960 + }, + { + "epoch": 1.9034790888449424, + "grad_norm": 9.046417236328125, + "learning_rate": 3.1166487261877333e-06, + "loss": 3.031, + "step": 64970 + }, + { + "epoch": 1.9037720647476744, + "grad_norm": 9.50043773651123, + "learning_rate": 3.115183813872085e-06, + "loss": 3.0512, + "step": 64980 + }, + { + "epoch": 1.9040650406504065, + "grad_norm": 8.894587516784668, + "learning_rate": 3.1137190901211146e-06, + "loss": 3.0376, + "step": 64990 + }, + { + "epoch": 1.9043580165531386, + "grad_norm": 8.902680397033691, + "learning_rate": 3.11225455508136e-06, + "loss": 3.0422, + 
"step": 65000 + }, + { + "epoch": 1.9046509924558706, + "grad_norm": 8.65910816192627, + "learning_rate": 3.110790208899335e-06, + "loss": 3.0457, + "step": 65010 + }, + { + "epoch": 1.9049439683586025, + "grad_norm": 8.834700584411621, + "learning_rate": 3.109326051721543e-06, + "loss": 3.0434, + "step": 65020 + }, + { + "epoch": 1.9050611587196953, + "eval_bleu": 0.35126113856559993, + "eval_cap_loss": 0.906408429145813, + "eval_con_loss": 1.1442698240280151, + "eval_loss": 3.194948196411133, + "step": 65024 + }, + { + "epoch": 1.9050611587196953, + "eval_bleu": 0.35126113856559993, + "eval_cap_loss": 0.906408429145813, + "eval_con_loss": 1.1442698240280151, + "eval_loss": 3.194948196411133, + "eval_runtime": 55.4658, + "eval_samples_per_second": 360.583, + "eval_steps_per_second": 0.361, + "step": 65024 + }, + { + "epoch": 1.9052369442613344, + "grad_norm": 8.98983097076416, + "learning_rate": 3.1078620836944617e-06, + "loss": 3.0496, + "step": 65030 + }, + { + "epoch": 1.9055299201640665, + "grad_norm": 9.145203590393066, + "learning_rate": 3.1063983049645548e-06, + "loss": 3.0385, + "step": 65040 + }, + { + "epoch": 1.9058228960667987, + "grad_norm": 9.288213729858398, + "learning_rate": 3.1049347156782607e-06, + "loss": 3.0363, + "step": 65050 + }, + { + "epoch": 1.9061158719695306, + "grad_norm": 10.174243927001953, + "learning_rate": 3.1034713159820075e-06, + "loss": 3.0476, + "step": 65060 + }, + { + "epoch": 1.9064088478722625, + "grad_norm": 9.213035583496094, + "learning_rate": 3.1020081060221953e-06, + "loss": 3.0456, + "step": 65070 + }, + { + "epoch": 1.9067018237749944, + "grad_norm": 9.34978199005127, + "learning_rate": 3.1005450859452136e-06, + "loss": 3.0288, + "step": 65080 + }, + { + "epoch": 1.9069947996777266, + "grad_norm": 8.427800178527832, + "learning_rate": 3.0990822558974253e-06, + "loss": 3.0376, + "step": 65090 + }, + { + "epoch": 1.9072877755804585, + "grad_norm": 9.294174194335938, + "learning_rate": 3.0976196160251817e-06, + 
"loss": 3.0272, + "step": 65100 + }, + { + "epoch": 1.9075807514831906, + "grad_norm": 9.236980438232422, + "learning_rate": 3.0961571664748064e-06, + "loss": 3.0622, + "step": 65110 + }, + { + "epoch": 1.9078737273859225, + "grad_norm": 8.776501655578613, + "learning_rate": 3.0946949073926142e-06, + "loss": 3.0375, + "step": 65120 + }, + { + "epoch": 1.9081667032886545, + "grad_norm": 9.051634788513184, + "learning_rate": 3.0932328389248907e-06, + "loss": 3.0465, + "step": 65130 + }, + { + "epoch": 1.9084596791913864, + "grad_norm": 9.199605941772461, + "learning_rate": 3.0917709612179104e-06, + "loss": 3.0435, + "step": 65140 + }, + { + "epoch": 1.9087526550941185, + "grad_norm": 9.78489875793457, + "learning_rate": 3.090309274417923e-06, + "loss": 3.0493, + "step": 65150 + }, + { + "epoch": 1.9090456309968507, + "grad_norm": 9.026740074157715, + "learning_rate": 3.0888477786711646e-06, + "loss": 3.0525, + "step": 65160 + }, + { + "epoch": 1.9093386068995826, + "grad_norm": 8.6939697265625, + "learning_rate": 3.0873864741238457e-06, + "loss": 3.0538, + "step": 65170 + }, + { + "epoch": 1.9096315828023145, + "grad_norm": 9.0361967086792, + "learning_rate": 3.085925360922164e-06, + "loss": 3.0336, + "step": 65180 + }, + { + "epoch": 1.9099245587050464, + "grad_norm": 8.68632984161377, + "learning_rate": 3.0844644392122937e-06, + "loss": 3.0117, + "step": 65190 + }, + { + "epoch": 1.9102175346077785, + "grad_norm": 8.683320999145508, + "learning_rate": 3.083003709140393e-06, + "loss": 3.0529, + "step": 65200 + }, + { + "epoch": 1.9105105105105105, + "grad_norm": 8.887330055236816, + "learning_rate": 3.081543170852596e-06, + "loss": 3.0221, + "step": 65210 + }, + { + "epoch": 1.9108034864132426, + "grad_norm": 9.927905082702637, + "learning_rate": 3.080082824495024e-06, + "loss": 3.0603, + "step": 65220 + }, + { + "epoch": 1.9110964623159745, + "grad_norm": 9.879054069519043, + "learning_rate": 3.0786226702137745e-06, + "loss": 3.0489, + "step": 65230 + }, + { + 
"epoch": 1.9113894382187064, + "grad_norm": 8.781588554382324, + "learning_rate": 3.0771627081549293e-06, + "loss": 3.0413, + "step": 65240 + }, + { + "epoch": 1.9116824141214384, + "grad_norm": 9.227595329284668, + "learning_rate": 3.0757029384645454e-06, + "loss": 3.0536, + "step": 65250 + }, + { + "epoch": 1.9119753900241705, + "grad_norm": 8.981466293334961, + "learning_rate": 3.074243361288667e-06, + "loss": 3.0369, + "step": 65260 + }, + { + "epoch": 1.9122683659269026, + "grad_norm": 9.687784194946289, + "learning_rate": 3.0727839767733143e-06, + "loss": 3.0172, + "step": 65270 + }, + { + "epoch": 1.9125613418296346, + "grad_norm": 9.658628463745117, + "learning_rate": 3.071324785064493e-06, + "loss": 3.0414, + "step": 65280 + }, + { + "epoch": 1.9128543177323665, + "grad_norm": 9.035808563232422, + "learning_rate": 3.069865786308182e-06, + "loss": 3.0391, + "step": 65290 + }, + { + "epoch": 1.9131472936350984, + "grad_norm": 8.469500541687012, + "learning_rate": 3.068406980650349e-06, + "loss": 3.0283, + "step": 65300 + }, + { + "epoch": 1.9134402695378305, + "grad_norm": 9.644173622131348, + "learning_rate": 3.066948368236937e-06, + "loss": 3.0277, + "step": 65310 + }, + { + "epoch": 1.9137332454405627, + "grad_norm": 9.366390228271484, + "learning_rate": 3.065489949213875e-06, + "loss": 3.058, + "step": 65320 + }, + { + "epoch": 1.9140262213432946, + "grad_norm": 9.370108604431152, + "learning_rate": 3.0640317237270645e-06, + "loss": 3.0421, + "step": 65330 + }, + { + "epoch": 1.9143191972460265, + "grad_norm": 9.063094139099121, + "learning_rate": 3.0625736919223936e-06, + "loss": 3.0668, + "step": 65340 + }, + { + "epoch": 1.9146121731487584, + "grad_norm": 8.766315460205078, + "learning_rate": 3.061115853945732e-06, + "loss": 3.0322, + "step": 65350 + }, + { + "epoch": 1.9149051490514906, + "grad_norm": 9.465474128723145, + "learning_rate": 3.059658209942925e-06, + "loss": 3.0316, + "step": 65360 + }, + { + "epoch": 1.9151981249542225, + "grad_norm": 
8.992582321166992, + "learning_rate": 3.058200760059804e-06, + "loss": 3.0239, + "step": 65370 + }, + { + "epoch": 1.9154911008569546, + "grad_norm": 8.988423347473145, + "learning_rate": 3.0567435044421755e-06, + "loss": 3.0486, + "step": 65380 + }, + { + "epoch": 1.9157840767596865, + "grad_norm": 9.242990493774414, + "learning_rate": 3.0552864432358308e-06, + "loss": 3.0564, + "step": 65390 + }, + { + "epoch": 1.9160770526624185, + "grad_norm": 9.345465660095215, + "learning_rate": 3.053829576586539e-06, + "loss": 3.05, + "step": 65400 + }, + { + "epoch": 1.9163700285651504, + "grad_norm": 9.046442985534668, + "learning_rate": 3.0523729046400543e-06, + "loss": 3.0365, + "step": 65410 + }, + { + "epoch": 1.9166630044678825, + "grad_norm": 9.331452369689941, + "learning_rate": 3.0509164275421026e-06, + "loss": 3.0257, + "step": 65420 + }, + { + "epoch": 1.9169559803706147, + "grad_norm": 9.003050804138184, + "learning_rate": 3.049460145438401e-06, + "loss": 3.0312, + "step": 65430 + }, + { + "epoch": 1.9172489562733466, + "grad_norm": 8.347201347351074, + "learning_rate": 3.0480040584746374e-06, + "loss": 3.0597, + "step": 65440 + }, + { + "epoch": 1.9175419321760785, + "grad_norm": 8.897717475891113, + "learning_rate": 3.04654816679649e-06, + "loss": 3.0446, + "step": 65450 + }, + { + "epoch": 1.9178349080788104, + "grad_norm": 9.442028999328613, + "learning_rate": 3.0450924705496065e-06, + "loss": 3.0503, + "step": 65460 + }, + { + "epoch": 1.9181278839815425, + "grad_norm": 8.629730224609375, + "learning_rate": 3.043636969879625e-06, + "loss": 3.0457, + "step": 65470 + }, + { + "epoch": 1.9184208598842745, + "grad_norm": 9.130809783935547, + "learning_rate": 3.042181664932156e-06, + "loss": 3.0539, + "step": 65480 + }, + { + "epoch": 1.9187138357870066, + "grad_norm": 9.213091850280762, + "learning_rate": 3.040726555852799e-06, + "loss": 3.0512, + "step": 65490 + }, + { + "epoch": 1.9190068116897385, + "grad_norm": 9.273293495178223, + "learning_rate": 
3.0392716427871233e-06, + "loss": 3.0524, + "step": 65500 + }, + { + "epoch": 1.9192997875924704, + "grad_norm": 9.089561462402344, + "learning_rate": 3.0378169258806888e-06, + "loss": 3.0526, + "step": 65510 + }, + { + "epoch": 1.9195927634952024, + "grad_norm": 9.046930313110352, + "learning_rate": 3.036362405279028e-06, + "loss": 3.0428, + "step": 65520 + }, + { + "epoch": 1.9198857393979345, + "grad_norm": 9.603256225585938, + "learning_rate": 3.034908081127661e-06, + "loss": 3.0573, + "step": 65530 + }, + { + "epoch": 1.9200615249395736, + "eval_bleu": 0.3514652345322154, + "eval_cap_loss": 0.9066113233566284, + "eval_con_loss": 1.1436901092529297, + "eval_loss": 3.1939916610717773, + "step": 65536 + }, + { + "epoch": 1.9200615249395736, + "eval_bleu": 0.3514652345322154, + "eval_cap_loss": 0.9066113233566284, + "eval_con_loss": 1.1436901092529297, + "eval_loss": 3.1939916610717773, + "eval_runtime": 55.7722, + "eval_samples_per_second": 358.601, + "eval_steps_per_second": 0.359, + "step": 65536 + }, + { + "epoch": 1.9201787153006666, + "grad_norm": 8.683460235595703, + "learning_rate": 3.0334539535720797e-06, + "loss": 3.0439, + "step": 65540 + }, + { + "epoch": 1.9204716912033986, + "grad_norm": 9.210546493530273, + "learning_rate": 3.0320000227577652e-06, + "loss": 3.0488, + "step": 65550 + }, + { + "epoch": 1.9207646671061305, + "grad_norm": 9.153868675231934, + "learning_rate": 3.0305462888301705e-06, + "loss": 3.0382, + "step": 65560 + }, + { + "epoch": 1.9210576430088624, + "grad_norm": 8.465953826904297, + "learning_rate": 3.029092751934738e-06, + "loss": 3.0353, + "step": 65570 + }, + { + "epoch": 1.9213506189115945, + "grad_norm": 9.574050903320312, + "learning_rate": 3.0276394122168805e-06, + "loss": 3.0375, + "step": 65580 + }, + { + "epoch": 1.9216435948143267, + "grad_norm": 9.393808364868164, + "learning_rate": 3.026186269821999e-06, + "loss": 3.0311, + "step": 65590 + }, + { + "epoch": 1.9219365707170586, + "grad_norm": 9.99512004852295, + 
"learning_rate": 3.0247333248954703e-06, + "loss": 3.0503, + "step": 65600 + }, + { + "epoch": 1.9222295466197905, + "grad_norm": 9.6173095703125, + "learning_rate": 3.023280577582655e-06, + "loss": 3.0394, + "step": 65610 + }, + { + "epoch": 1.9225225225225224, + "grad_norm": 8.592347145080566, + "learning_rate": 3.0218280280288894e-06, + "loss": 3.0334, + "step": 65620 + }, + { + "epoch": 1.9228154984252546, + "grad_norm": 8.860210418701172, + "learning_rate": 3.0203756763794934e-06, + "loss": 3.0218, + "step": 65630 + }, + { + "epoch": 1.9231084743279865, + "grad_norm": 8.848950386047363, + "learning_rate": 3.0189235227797654e-06, + "loss": 3.0291, + "step": 65640 + }, + { + "epoch": 1.9234014502307186, + "grad_norm": 8.74700927734375, + "learning_rate": 3.017471567374988e-06, + "loss": 3.0148, + "step": 65650 + }, + { + "epoch": 1.9236944261334505, + "grad_norm": 9.20439624786377, + "learning_rate": 3.0160198103104165e-06, + "loss": 3.0421, + "step": 65660 + }, + { + "epoch": 1.9239874020361825, + "grad_norm": 9.314653396606445, + "learning_rate": 3.0145682517312936e-06, + "loss": 3.0451, + "step": 65670 + }, + { + "epoch": 1.9242803779389144, + "grad_norm": 8.881900787353516, + "learning_rate": 3.0131168917828363e-06, + "loss": 3.0468, + "step": 65680 + }, + { + "epoch": 1.9245733538416465, + "grad_norm": 9.25633430480957, + "learning_rate": 3.0116657306102486e-06, + "loss": 3.0471, + "step": 65690 + }, + { + "epoch": 1.9248663297443787, + "grad_norm": 8.671381950378418, + "learning_rate": 3.0103598556282754e-06, + "loss": 3.0525, + "step": 65700 + }, + { + "epoch": 1.9251593056471106, + "grad_norm": 8.812321662902832, + "learning_rate": 3.0089090725297878e-06, + "loss": 3.0333, + "step": 65710 + }, + { + "epoch": 1.9254522815498425, + "grad_norm": 9.268426895141602, + "learning_rate": 3.0074584886281365e-06, + "loss": 3.0309, + "step": 65720 + }, + { + "epoch": 1.9257452574525744, + "grad_norm": 9.372303009033203, + "learning_rate": 3.006008104068441e-06, + 
"loss": 3.0457, + "step": 65730 + }, + { + "epoch": 1.9260382333553066, + "grad_norm": 9.064726829528809, + "learning_rate": 3.004557918995805e-06, + "loss": 3.0372, + "step": 65740 + }, + { + "epoch": 1.9263312092580387, + "grad_norm": 9.10348129272461, + "learning_rate": 3.0031079335553097e-06, + "loss": 3.037, + "step": 65750 + }, + { + "epoch": 1.9266241851607706, + "grad_norm": 8.989701271057129, + "learning_rate": 3.0016581478920205e-06, + "loss": 3.0556, + "step": 65760 + }, + { + "epoch": 1.9269171610635025, + "grad_norm": 9.398385047912598, + "learning_rate": 3.0002085621509764e-06, + "loss": 3.0596, + "step": 65770 + }, + { + "epoch": 1.9272101369662344, + "grad_norm": 9.0578031539917, + "learning_rate": 2.9987591764772015e-06, + "loss": 3.0436, + "step": 65780 + }, + { + "epoch": 1.9275031128689664, + "grad_norm": 8.901575088500977, + "learning_rate": 2.9973099910156976e-06, + "loss": 3.0335, + "step": 65790 + }, + { + "epoch": 1.9277960887716985, + "grad_norm": 9.675191879272461, + "learning_rate": 2.995861005911449e-06, + "loss": 3.0301, + "step": 65800 + }, + { + "epoch": 1.9280890646744306, + "grad_norm": 9.521591186523438, + "learning_rate": 2.994412221309415e-06, + "loss": 3.0283, + "step": 65810 + }, + { + "epoch": 1.9283820405771626, + "grad_norm": 9.069169998168945, + "learning_rate": 2.9929636373545414e-06, + "loss": 3.023, + "step": 65820 + }, + { + "epoch": 1.9286750164798945, + "grad_norm": 9.38128662109375, + "learning_rate": 2.991515254191746e-06, + "loss": 3.0364, + "step": 65830 + }, + { + "epoch": 1.9289679923826264, + "grad_norm": 8.843396186828613, + "learning_rate": 2.9900670719659365e-06, + "loss": 3.0254, + "step": 65840 + }, + { + "epoch": 1.9292609682853585, + "grad_norm": 9.182538032531738, + "learning_rate": 2.9886190908219892e-06, + "loss": 3.0626, + "step": 65850 + }, + { + "epoch": 1.9295539441880907, + "grad_norm": 8.484257698059082, + "learning_rate": 2.98717131090477e-06, + "loss": 3.0207, + "step": 65860 + }, + { + 
"epoch": 1.9298469200908226, + "grad_norm": 9.39767074584961, + "learning_rate": 2.985723732359119e-06, + "loss": 3.0548, + "step": 65870 + }, + { + "epoch": 1.9301398959935545, + "grad_norm": 9.160656929016113, + "learning_rate": 2.9842763553298595e-06, + "loss": 3.0446, + "step": 65880 + }, + { + "epoch": 1.9304328718962864, + "grad_norm": 8.957691192626953, + "learning_rate": 2.9828291799617904e-06, + "loss": 3.0401, + "step": 65890 + }, + { + "epoch": 1.9307258477990186, + "grad_norm": 8.730045318603516, + "learning_rate": 2.9813822063996945e-06, + "loss": 3.0411, + "step": 65900 + }, + { + "epoch": 1.9310188237017505, + "grad_norm": 9.462676048278809, + "learning_rate": 2.9799354347883323e-06, + "loss": 3.0483, + "step": 65910 + }, + { + "epoch": 1.9313117996044826, + "grad_norm": 9.608495712280273, + "learning_rate": 2.9784888652724474e-06, + "loss": 3.0456, + "step": 65920 + }, + { + "epoch": 1.9316047755072145, + "grad_norm": 9.172243118286133, + "learning_rate": 2.977042497996756e-06, + "loss": 3.0397, + "step": 65930 + }, + { + "epoch": 1.9318977514099465, + "grad_norm": 9.050337791442871, + "learning_rate": 2.975596333105962e-06, + "loss": 3.0348, + "step": 65940 + }, + { + "epoch": 1.9321907273126784, + "grad_norm": 8.730801582336426, + "learning_rate": 2.974150370744744e-06, + "loss": 3.0266, + "step": 65950 + }, + { + "epoch": 1.9324837032154105, + "grad_norm": 9.464729309082031, + "learning_rate": 2.972704611057764e-06, + "loss": 3.0416, + "step": 65960 + }, + { + "epoch": 1.9327766791181427, + "grad_norm": 8.627631187438965, + "learning_rate": 2.971259054189658e-06, + "loss": 3.0379, + "step": 65970 + }, + { + "epoch": 1.9330696550208746, + "grad_norm": 9.268043518066406, + "learning_rate": 2.969813700285049e-06, + "loss": 3.0463, + "step": 65980 + }, + { + "epoch": 1.9333626309236065, + "grad_norm": 8.787014961242676, + "learning_rate": 2.968368549488533e-06, + "loss": 3.0393, + "step": 65990 + }, + { + "epoch": 1.9336556068263384, + "grad_norm": 
9.146746635437012, + "learning_rate": 2.9669236019446928e-06, + "loss": 3.0373, + "step": 66000 + }, + { + "epoch": 1.9339485827290706, + "grad_norm": 9.312825202941895, + "learning_rate": 2.965478857798082e-06, + "loss": 3.0502, + "step": 66010 + }, + { + "epoch": 1.9342415586318027, + "grad_norm": 8.491748809814453, + "learning_rate": 2.9640343171932427e-06, + "loss": 3.0469, + "step": 66020 + }, + { + "epoch": 1.9345345345345346, + "grad_norm": 8.350600242614746, + "learning_rate": 2.9625899802746893e-06, + "loss": 3.0183, + "step": 66030 + }, + { + "epoch": 1.9348275104372665, + "grad_norm": 8.919795036315918, + "learning_rate": 2.9611458471869214e-06, + "loss": 3.0308, + "step": 66040 + }, + { + "epoch": 1.9350618911594522, + "eval_bleu": 0.35173825116890634, + "eval_cap_loss": 0.9054580926895142, + "eval_con_loss": 1.1428548097610474, + "eval_loss": 3.1911675930023193, + "step": 66048 + }, + { + "epoch": 1.9350618911594522, + "eval_bleu": 0.35173825116890634, + "eval_cap_loss": 0.9054580926895142, + "eval_con_loss": 1.1428548097610474, + "eval_loss": 3.1911675930023193, + "eval_runtime": 54.7834, + "eval_samples_per_second": 365.074, + "eval_steps_per_second": 0.365, + "step": 66048 + }, + { + "epoch": 1.9351204863399984, + "grad_norm": 9.175612449645996, + "learning_rate": 2.9597019180744167e-06, + "loss": 3.0462, + "step": 66050 + }, + { + "epoch": 1.9354134622427306, + "grad_norm": 9.239049911499023, + "learning_rate": 2.958258193081629e-06, + "loss": 3.0203, + "step": 66060 + }, + { + "epoch": 1.9357064381454625, + "grad_norm": 8.383676528930664, + "learning_rate": 2.956814672352997e-06, + "loss": 3.0532, + "step": 66070 + }, + { + "epoch": 1.9359994140481946, + "grad_norm": 9.27977180480957, + "learning_rate": 2.9553713560329335e-06, + "loss": 3.0364, + "step": 66080 + }, + { + "epoch": 1.9362923899509266, + "grad_norm": 9.280186653137207, + "learning_rate": 2.953928244265838e-06, + "loss": 3.0336, + "step": 66090 + }, + { + "epoch": 1.9365853658536585, 
+ "grad_norm": 9.041882514953613, + "learning_rate": 2.9524853371960803e-06, + "loss": 3.0406, + "step": 66100 + }, + { + "epoch": 1.9368783417563904, + "grad_norm": 9.205631256103516, + "learning_rate": 2.951042634968019e-06, + "loss": 3.0373, + "step": 66110 + }, + { + "epoch": 1.9371713176591225, + "grad_norm": 8.733664512634277, + "learning_rate": 2.9496001377259844e-06, + "loss": 3.0419, + "step": 66120 + }, + { + "epoch": 1.9374642935618547, + "grad_norm": 9.686022758483887, + "learning_rate": 2.9481578456142936e-06, + "loss": 3.0289, + "step": 66130 + }, + { + "epoch": 1.9377572694645866, + "grad_norm": 9.294120788574219, + "learning_rate": 2.9467157587772348e-06, + "loss": 3.0335, + "step": 66140 + }, + { + "epoch": 1.9380502453673185, + "grad_norm": 9.469676971435547, + "learning_rate": 2.945273877359084e-06, + "loss": 3.0229, + "step": 66150 + }, + { + "epoch": 1.9383432212700504, + "grad_norm": 9.331703186035156, + "learning_rate": 2.94383220150409e-06, + "loss": 3.0113, + "step": 66160 + }, + { + "epoch": 1.9386361971727826, + "grad_norm": 9.380171775817871, + "learning_rate": 2.942390731356488e-06, + "loss": 3.0325, + "step": 66170 + }, + { + "epoch": 1.9389291730755145, + "grad_norm": 8.544177055358887, + "learning_rate": 2.9409494670604837e-06, + "loss": 3.0484, + "step": 66180 + }, + { + "epoch": 1.9392221489782466, + "grad_norm": 8.93599796295166, + "learning_rate": 2.9395084087602705e-06, + "loss": 3.0187, + "step": 66190 + }, + { + "epoch": 1.9395151248809785, + "grad_norm": 8.834903717041016, + "learning_rate": 2.9380675566000148e-06, + "loss": 3.0356, + "step": 66200 + }, + { + "epoch": 1.9398081007837105, + "grad_norm": 8.916290283203125, + "learning_rate": 2.9366269107238697e-06, + "loss": 3.0348, + "step": 66210 + }, + { + "epoch": 1.9401010766864424, + "grad_norm": 9.39677619934082, + "learning_rate": 2.9351864712759587e-06, + "loss": 3.0303, + "step": 66220 + }, + { + "epoch": 1.9403940525891745, + "grad_norm": 8.985897064208984, + 
"learning_rate": 2.9337462384003922e-06, + "loss": 3.0381, + "step": 66230 + }, + { + "epoch": 1.9406870284919067, + "grad_norm": 9.646970748901367, + "learning_rate": 2.9323062122412544e-06, + "loss": 3.0498, + "step": 66240 + }, + { + "epoch": 1.9409800043946386, + "grad_norm": 9.736445426940918, + "learning_rate": 2.930866392942615e-06, + "loss": 3.0301, + "step": 66250 + }, + { + "epoch": 1.9412729802973705, + "grad_norm": 8.975503921508789, + "learning_rate": 2.9294267806485155e-06, + "loss": 3.0422, + "step": 66260 + }, + { + "epoch": 1.9415659562001024, + "grad_norm": 9.237060546875, + "learning_rate": 2.927987375502983e-06, + "loss": 3.0285, + "step": 66270 + }, + { + "epoch": 1.9418589321028346, + "grad_norm": 9.083706855773926, + "learning_rate": 2.92654817765002e-06, + "loss": 3.033, + "step": 66280 + }, + { + "epoch": 1.9421519080055667, + "grad_norm": 8.771180152893066, + "learning_rate": 2.9251091872336123e-06, + "loss": 3.0266, + "step": 66290 + }, + { + "epoch": 1.9424448839082986, + "grad_norm": 8.727778434753418, + "learning_rate": 2.9236704043977182e-06, + "loss": 3.0421, + "step": 66300 + }, + { + "epoch": 1.9427378598110305, + "grad_norm": 8.82863998413086, + "learning_rate": 2.9222318292862826e-06, + "loss": 3.0455, + "step": 66310 + }, + { + "epoch": 1.9430308357137624, + "grad_norm": 9.190202713012695, + "learning_rate": 2.9207934620432256e-06, + "loss": 3.0513, + "step": 66320 + }, + { + "epoch": 1.9433238116164946, + "grad_norm": 9.164046287536621, + "learning_rate": 2.919355302812449e-06, + "loss": 3.046, + "step": 66330 + }, + { + "epoch": 1.9436167875192265, + "grad_norm": 9.741769790649414, + "learning_rate": 2.9179173517378263e-06, + "loss": 3.038, + "step": 66340 + }, + { + "epoch": 1.9439097634219586, + "grad_norm": 8.482494354248047, + "learning_rate": 2.9164796089632235e-06, + "loss": 3.0299, + "step": 66350 + }, + { + "epoch": 1.9442027393246906, + "grad_norm": 8.67581558227539, + "learning_rate": 2.9150420746324727e-06, + 
"loss": 3.0273, + "step": 66360 + }, + { + "epoch": 1.9444957152274225, + "grad_norm": 8.946026802062988, + "learning_rate": 2.913604748889395e-06, + "loss": 3.0256, + "step": 66370 + }, + { + "epoch": 1.9447886911301544, + "grad_norm": 9.0700101852417, + "learning_rate": 2.9121676318777813e-06, + "loss": 3.034, + "step": 66380 + }, + { + "epoch": 1.9450816670328865, + "grad_norm": 9.288740158081055, + "learning_rate": 2.91073072374141e-06, + "loss": 3.0347, + "step": 66390 + }, + { + "epoch": 1.9453746429356187, + "grad_norm": 9.065045356750488, + "learning_rate": 2.9092940246240364e-06, + "loss": 3.0152, + "step": 66400 + }, + { + "epoch": 1.9456676188383506, + "grad_norm": 9.09171199798584, + "learning_rate": 2.90785753466939e-06, + "loss": 3.0409, + "step": 66410 + }, + { + "epoch": 1.9459605947410825, + "grad_norm": 8.562604904174805, + "learning_rate": 2.9064212540211843e-06, + "loss": 3.0288, + "step": 66420 + }, + { + "epoch": 1.9462535706438144, + "grad_norm": 8.652181625366211, + "learning_rate": 2.9049851828231123e-06, + "loss": 3.0304, + "step": 66430 + }, + { + "epoch": 1.9465465465465466, + "grad_norm": 8.932780265808105, + "learning_rate": 2.9035493212188453e-06, + "loss": 3.0652, + "step": 66440 + }, + { + "epoch": 1.9468395224492785, + "grad_norm": 9.407997131347656, + "learning_rate": 2.9021136693520286e-06, + "loss": 3.0382, + "step": 66450 + }, + { + "epoch": 1.9471324983520106, + "grad_norm": 9.88050651550293, + "learning_rate": 2.9006782273662947e-06, + "loss": 3.0504, + "step": 66460 + }, + { + "epoch": 1.9474254742547426, + "grad_norm": 8.594477653503418, + "learning_rate": 2.899242995405248e-06, + "loss": 3.0167, + "step": 66470 + }, + { + "epoch": 1.9477184501574745, + "grad_norm": 9.37751579284668, + "learning_rate": 2.8978079736124777e-06, + "loss": 3.0242, + "step": 66480 + }, + { + "epoch": 1.9480114260602064, + "grad_norm": 8.8877534866333, + "learning_rate": 2.896373162131546e-06, + "loss": 3.037, + "step": 66490 + }, + { + "epoch": 
1.9483044019629385, + "grad_norm": 9.212482452392578, + "learning_rate": 2.894938561105999e-06, + "loss": 3.0327, + "step": 66500 + }, + { + "epoch": 1.9485973778656707, + "grad_norm": 9.170950889587402, + "learning_rate": 2.89350417067936e-06, + "loss": 3.0349, + "step": 66510 + }, + { + "epoch": 1.9488903537684026, + "grad_norm": 9.436854362487793, + "learning_rate": 2.8920699909951333e-06, + "loss": 3.0244, + "step": 66520 + }, + { + "epoch": 1.9491833296711345, + "grad_norm": 9.01541805267334, + "learning_rate": 2.8906360221967966e-06, + "loss": 3.0319, + "step": 66530 + }, + { + "epoch": 1.9494763055738664, + "grad_norm": 8.649113655090332, + "learning_rate": 2.889202264427813e-06, + "loss": 3.0357, + "step": 66540 + }, + { + "epoch": 1.9497692814765986, + "grad_norm": 9.667495727539062, + "learning_rate": 2.887768717831618e-06, + "loss": 3.0299, + "step": 66550 + }, + { + "epoch": 1.9500622573793307, + "grad_norm": 9.366146087646484, + "learning_rate": 2.8863353825516328e-06, + "loss": 3.0415, + "step": 66560 + }, + { + "epoch": 1.9500622573793307, + "eval_bleu": 0.35190133599016404, + "eval_cap_loss": 0.9048666954040527, + "eval_con_loss": 1.141799807548523, + "eval_loss": 3.1884660720825195, + "step": 66560 + }, + { + "epoch": 1.9500622573793307, + "eval_bleu": 0.35190133599016404, + "eval_cap_loss": 0.9048666954040527, + "eval_con_loss": 1.141799807548523, + "eval_loss": 3.1884660720825195, + "eval_runtime": 60.09, + "eval_samples_per_second": 332.834, + "eval_steps_per_second": 0.333, + "step": 66560 + }, + { + "epoch": 1.9503552332820626, + "grad_norm": 9.37522029876709, + "learning_rate": 2.884902258731251e-06, + "loss": 3.0474, + "step": 66570 + }, + { + "epoch": 1.9506482091847945, + "grad_norm": 8.937308311462402, + "learning_rate": 2.8834693465138504e-06, + "loss": 3.0181, + "step": 66580 + }, + { + "epoch": 1.9509411850875265, + "grad_norm": 8.932894706726074, + "learning_rate": 2.882036646042784e-06, + "loss": 3.0478, + "step": 66590 + }, + { + 
"epoch": 1.9512341609902586, + "grad_norm": 8.425172805786133, + "learning_rate": 2.8806041574613875e-06, + "loss": 3.0239, + "step": 66600 + }, + { + "epoch": 1.9515271368929905, + "grad_norm": 9.463360786437988, + "learning_rate": 2.8791718809129698e-06, + "loss": 3.0375, + "step": 66610 + }, + { + "epoch": 1.9518201127957227, + "grad_norm": 9.346872329711914, + "learning_rate": 2.877739816540824e-06, + "loss": 3.0346, + "step": 66620 + }, + { + "epoch": 1.9521130886984546, + "grad_norm": 9.55285930633545, + "learning_rate": 2.876307964488218e-06, + "loss": 3.033, + "step": 66630 + }, + { + "epoch": 1.9524060646011865, + "grad_norm": 8.89330005645752, + "learning_rate": 2.874876324898402e-06, + "loss": 3.0462, + "step": 66640 + }, + { + "epoch": 1.9526990405039184, + "grad_norm": 8.763958930969238, + "learning_rate": 2.8734448979145995e-06, + "loss": 3.0156, + "step": 66650 + }, + { + "epoch": 1.9529920164066505, + "grad_norm": 9.422234535217285, + "learning_rate": 2.8720136836800188e-06, + "loss": 3.0532, + "step": 66660 + }, + { + "epoch": 1.9532849923093827, + "grad_norm": 8.711450576782227, + "learning_rate": 2.8705826823378448e-06, + "loss": 3.0674, + "step": 66670 + }, + { + "epoch": 1.9535779682121146, + "grad_norm": 8.802693367004395, + "learning_rate": 2.8691518940312413e-06, + "loss": 3.0483, + "step": 66680 + }, + { + "epoch": 1.9538709441148465, + "grad_norm": 8.968478202819824, + "learning_rate": 2.8677213189033472e-06, + "loss": 3.0394, + "step": 66690 + }, + { + "epoch": 1.9541639200175784, + "grad_norm": 9.067439079284668, + "learning_rate": 2.8662909570972865e-06, + "loss": 3.0362, + "step": 66700 + }, + { + "epoch": 1.9544568959203106, + "grad_norm": 9.328032493591309, + "learning_rate": 2.864860808756154e-06, + "loss": 3.0297, + "step": 66710 + }, + { + "epoch": 1.9547498718230427, + "grad_norm": 8.9055757522583, + "learning_rate": 2.8634308740230333e-06, + "loss": 3.033, + "step": 66720 + }, + { + "epoch": 1.9550428477257746, + "grad_norm": 
9.359533309936523, + "learning_rate": 2.8620011530409754e-06, + "loss": 3.0342, + "step": 66730 + }, + { + "epoch": 1.9553358236285066, + "grad_norm": 9.58213996887207, + "learning_rate": 2.860571645953018e-06, + "loss": 3.0364, + "step": 66740 + }, + { + "epoch": 1.9556287995312385, + "grad_norm": 8.808300018310547, + "learning_rate": 2.8591423529021746e-06, + "loss": 3.0219, + "step": 66750 + }, + { + "epoch": 1.9559217754339706, + "grad_norm": 8.830524444580078, + "learning_rate": 2.8577132740314375e-06, + "loss": 3.0108, + "step": 66760 + }, + { + "epoch": 1.9562147513367025, + "grad_norm": 9.364973068237305, + "learning_rate": 2.8562844094837792e-06, + "loss": 3.0347, + "step": 66770 + }, + { + "epoch": 1.9565077272394347, + "grad_norm": 8.951459884643555, + "learning_rate": 2.8548557594021464e-06, + "loss": 3.0421, + "step": 66780 + }, + { + "epoch": 1.9568007031421666, + "grad_norm": 8.922880172729492, + "learning_rate": 2.8534273239294698e-06, + "loss": 3.0231, + "step": 66790 + }, + { + "epoch": 1.9570936790448985, + "grad_norm": 9.2860107421875, + "learning_rate": 2.8519991032086515e-06, + "loss": 3.0287, + "step": 66800 + }, + { + "epoch": 1.9573866549476304, + "grad_norm": 9.000066757202148, + "learning_rate": 2.850571097382582e-06, + "loss": 3.0476, + "step": 66810 + }, + { + "epoch": 1.9576796308503626, + "grad_norm": 9.669137001037598, + "learning_rate": 2.84914330659412e-06, + "loss": 3.0361, + "step": 66820 + }, + { + "epoch": 1.9579726067530947, + "grad_norm": 9.405601501464844, + "learning_rate": 2.8477157309861094e-06, + "loss": 3.0393, + "step": 66830 + }, + { + "epoch": 1.9582655826558266, + "grad_norm": 8.675071716308594, + "learning_rate": 2.846288370701371e-06, + "loss": 3.0325, + "step": 66840 + }, + { + "epoch": 1.9585585585585585, + "grad_norm": 9.302145004272461, + "learning_rate": 2.8448612258827057e-06, + "loss": 3.0599, + "step": 66850 + }, + { + "epoch": 1.9588515344612905, + "grad_norm": 9.349014282226562, + "learning_rate": 
2.843434296672886e-06, + "loss": 3.0274, + "step": 66860 + }, + { + "epoch": 1.9591445103640226, + "grad_norm": 9.549907684326172, + "learning_rate": 2.842007583214673e-06, + "loss": 3.0468, + "step": 66870 + }, + { + "epoch": 1.9594374862667545, + "grad_norm": 9.364129066467285, + "learning_rate": 2.8405810856507964e-06, + "loss": 3.0496, + "step": 66880 + }, + { + "epoch": 1.9597304621694867, + "grad_norm": 8.985529899597168, + "learning_rate": 2.8391548041239732e-06, + "loss": 3.0622, + "step": 66890 + }, + { + "epoch": 1.9600234380722186, + "grad_norm": 9.247299194335938, + "learning_rate": 2.837728738776889e-06, + "loss": 3.0192, + "step": 66900 + }, + { + "epoch": 1.9603164139749505, + "grad_norm": 8.843280792236328, + "learning_rate": 2.836302889752217e-06, + "loss": 3.0507, + "step": 66910 + }, + { + "epoch": 1.9606093898776824, + "grad_norm": 9.31928825378418, + "learning_rate": 2.834877257192604e-06, + "loss": 3.0199, + "step": 66920 + }, + { + "epoch": 1.9609023657804145, + "grad_norm": 9.281152725219727, + "learning_rate": 2.8334518412406782e-06, + "loss": 3.0149, + "step": 66930 + }, + { + "epoch": 1.9611953416831467, + "grad_norm": 8.877639770507812, + "learning_rate": 2.8320266420390398e-06, + "loss": 3.033, + "step": 66940 + }, + { + "epoch": 1.9614883175858786, + "grad_norm": 9.343581199645996, + "learning_rate": 2.8306016597302756e-06, + "loss": 3.0208, + "step": 66950 + }, + { + "epoch": 1.9617812934886105, + "grad_norm": 8.557137489318848, + "learning_rate": 2.829176894456943e-06, + "loss": 3.028, + "step": 66960 + }, + { + "epoch": 1.9620742693913424, + "grad_norm": 9.427024841308594, + "learning_rate": 2.8277523463615852e-06, + "loss": 3.0425, + "step": 66970 + }, + { + "epoch": 1.9623672452940746, + "grad_norm": 9.338077545166016, + "learning_rate": 2.826328015586715e-06, + "loss": 3.0298, + "step": 66980 + }, + { + "epoch": 1.9626602211968067, + "grad_norm": 8.854384422302246, + "learning_rate": 2.8249039022748315e-06, + "loss": 3.0426, + 
"step": 66990 + }, + { + "epoch": 1.9629531970995386, + "grad_norm": 9.437331199645996, + "learning_rate": 2.8234800065684076e-06, + "loss": 3.032, + "step": 67000 + }, + { + "epoch": 1.9632461730022706, + "grad_norm": 9.509106636047363, + "learning_rate": 2.822056328609899e-06, + "loss": 3.0371, + "step": 67010 + }, + { + "epoch": 1.9635391489050025, + "grad_norm": 9.269217491149902, + "learning_rate": 2.8206328685417305e-06, + "loss": 3.0374, + "step": 67020 + }, + { + "epoch": 1.9638321248077346, + "grad_norm": 9.399959564208984, + "learning_rate": 2.819209626506316e-06, + "loss": 3.049, + "step": 67030 + }, + { + "epoch": 1.9641251007104665, + "grad_norm": 9.373745918273926, + "learning_rate": 2.8177866026460386e-06, + "loss": 3.0253, + "step": 67040 + }, + { + "epoch": 1.9644180766131987, + "grad_norm": 9.309881210327148, + "learning_rate": 2.816363797103267e-06, + "loss": 3.0324, + "step": 67050 + }, + { + "epoch": 1.9647110525159306, + "grad_norm": 8.870946884155273, + "learning_rate": 2.81494121002034e-06, + "loss": 3.0468, + "step": 67060 + }, + { + "epoch": 1.9650040284186625, + "grad_norm": 8.937124252319336, + "learning_rate": 2.813518841539582e-06, + "loss": 3.0322, + "step": 67070 + }, + { + "epoch": 1.9650626235992088, + "eval_bleu": 0.3520663230039905, + "eval_cap_loss": 0.9046527147293091, + "eval_con_loss": 1.1386828422546387, + "eval_loss": 3.182018280029297, + "step": 67072 + }, + { + "epoch": 1.9650626235992088, + "eval_bleu": 0.3520663230039905, + "eval_cap_loss": 0.9046527147293091, + "eval_con_loss": 1.1386828422546387, + "eval_loss": 3.182018280029297, + "eval_runtime": 67.2504, + "eval_samples_per_second": 297.396, + "eval_steps_per_second": 0.297, + "step": 67072 + }, + { + "epoch": 1.9652970043213944, + "grad_norm": 9.301347732543945, + "learning_rate": 2.8120966918032914e-06, + "loss": 3.0105, + "step": 67080 + }, + { + "epoch": 1.9655899802241266, + "grad_norm": 9.34712028503418, + "learning_rate": 2.810674760953749e-06, + "loss": 
3.0237, + "step": 67090 + }, + { + "epoch": 1.9658829561268587, + "grad_norm": 9.239845275878906, + "learning_rate": 2.8092530491332054e-06, + "loss": 3.0405, + "step": 67100 + }, + { + "epoch": 1.9661759320295906, + "grad_norm": 9.441009521484375, + "learning_rate": 2.807831556483897e-06, + "loss": 3.038, + "step": 67110 + }, + { + "epoch": 1.9664689079323225, + "grad_norm": 9.127325057983398, + "learning_rate": 2.8064102831480377e-06, + "loss": 3.0287, + "step": 67120 + }, + { + "epoch": 1.9667618838350545, + "grad_norm": 9.318177223205566, + "learning_rate": 2.8049892292678136e-06, + "loss": 3.0368, + "step": 67130 + }, + { + "epoch": 1.9670548597377866, + "grad_norm": 8.645757675170898, + "learning_rate": 2.8035683949853963e-06, + "loss": 3.0085, + "step": 67140 + }, + { + "epoch": 1.9673478356405185, + "grad_norm": 9.191816329956055, + "learning_rate": 2.8021477804429287e-06, + "loss": 3.0284, + "step": 67150 + }, + { + "epoch": 1.9676408115432507, + "grad_norm": 8.48883056640625, + "learning_rate": 2.8007273857825355e-06, + "loss": 3.025, + "step": 67160 + }, + { + "epoch": 1.9679337874459826, + "grad_norm": 9.238638877868652, + "learning_rate": 2.7993072111463203e-06, + "loss": 3.0103, + "step": 67170 + }, + { + "epoch": 1.9682267633487145, + "grad_norm": 8.907201766967773, + "learning_rate": 2.7978872566763637e-06, + "loss": 3.0341, + "step": 67180 + }, + { + "epoch": 1.9685197392514464, + "grad_norm": 9.542062759399414, + "learning_rate": 2.7964675225147207e-06, + "loss": 3.0325, + "step": 67190 + }, + { + "epoch": 1.9688127151541785, + "grad_norm": 9.068896293640137, + "learning_rate": 2.7950480088034302e-06, + "loss": 3.035, + "step": 67200 + }, + { + "epoch": 1.9691056910569107, + "grad_norm": 9.398818016052246, + "learning_rate": 2.793628715684504e-06, + "loss": 3.0355, + "step": 67210 + }, + { + "epoch": 1.9693986669596426, + "grad_norm": 9.091464042663574, + "learning_rate": 2.792209643299937e-06, + "loss": 3.0517, + "step": 67220 + }, + { + "epoch": 
1.9696916428623745, + "grad_norm": 9.259780883789062, + "learning_rate": 2.790790791791694e-06, + "loss": 3.0257, + "step": 67230 + }, + { + "epoch": 1.9699846187651064, + "grad_norm": 9.180634498596191, + "learning_rate": 2.7893721613017255e-06, + "loss": 3.0302, + "step": 67240 + }, + { + "epoch": 1.9702775946678386, + "grad_norm": 9.716475486755371, + "learning_rate": 2.7879537519719566e-06, + "loss": 3.0176, + "step": 67250 + }, + { + "epoch": 1.9705705705705707, + "grad_norm": 9.433886528015137, + "learning_rate": 2.7865355639442946e-06, + "loss": 3.025, + "step": 67260 + }, + { + "epoch": 1.9708635464733026, + "grad_norm": 9.227235794067383, + "learning_rate": 2.785117597360615e-06, + "loss": 3.0221, + "step": 67270 + }, + { + "epoch": 1.9711565223760346, + "grad_norm": 9.015236854553223, + "learning_rate": 2.7836998523627804e-06, + "loss": 3.0407, + "step": 67280 + }, + { + "epoch": 1.9714494982787665, + "grad_norm": 8.84760856628418, + "learning_rate": 2.7822823290926254e-06, + "loss": 3.0342, + "step": 67290 + }, + { + "epoch": 1.9717424741814986, + "grad_norm": 9.143855094909668, + "learning_rate": 2.780865027691968e-06, + "loss": 3.0225, + "step": 67300 + }, + { + "epoch": 1.9720354500842305, + "grad_norm": 9.686656951904297, + "learning_rate": 2.7794479483025973e-06, + "loss": 3.0307, + "step": 67310 + }, + { + "epoch": 1.9723284259869627, + "grad_norm": 9.232287406921387, + "learning_rate": 2.7780310910662843e-06, + "loss": 3.0385, + "step": 67320 + }, + { + "epoch": 1.9726214018896946, + "grad_norm": 9.451346397399902, + "learning_rate": 2.7766144561247787e-06, + "loss": 3.0423, + "step": 67330 + }, + { + "epoch": 1.9729143777924265, + "grad_norm": 9.385723114013672, + "learning_rate": 2.775198043619808e-06, + "loss": 3.054, + "step": 67340 + }, + { + "epoch": 1.9732073536951584, + "grad_norm": 9.397104263305664, + "learning_rate": 2.7737818536930717e-06, + "loss": 3.047, + "step": 67350 + }, + { + "epoch": 1.9735003295978906, + "grad_norm": 
9.269303321838379, + "learning_rate": 2.772365886486256e-06, + "loss": 3.0533, + "step": 67360 + }, + { + "epoch": 1.9737933055006227, + "grad_norm": 8.975492477416992, + "learning_rate": 2.7709501421410136e-06, + "loss": 3.0431, + "step": 67370 + }, + { + "epoch": 1.9740862814033546, + "grad_norm": 9.350919723510742, + "learning_rate": 2.769534620798988e-06, + "loss": 3.0284, + "step": 67380 + }, + { + "epoch": 1.9743792573060865, + "grad_norm": 9.141169548034668, + "learning_rate": 2.7681193226017888e-06, + "loss": 3.0288, + "step": 67390 + }, + { + "epoch": 1.9746722332088185, + "grad_norm": 9.148425102233887, + "learning_rate": 2.7667042476910094e-06, + "loss": 3.0468, + "step": 67400 + }, + { + "epoch": 1.9749652091115506, + "grad_norm": 9.131755828857422, + "learning_rate": 2.7652893962082207e-06, + "loss": 3.0179, + "step": 67410 + }, + { + "epoch": 1.9752581850142827, + "grad_norm": 9.583839416503906, + "learning_rate": 2.763874768294972e-06, + "loss": 3.0357, + "step": 67420 + }, + { + "epoch": 1.9755511609170147, + "grad_norm": 8.995011329650879, + "learning_rate": 2.762460364092784e-06, + "loss": 3.0245, + "step": 67430 + }, + { + "epoch": 1.9758441368197466, + "grad_norm": 9.35151195526123, + "learning_rate": 2.7610461837431637e-06, + "loss": 3.0223, + "step": 67440 + }, + { + "epoch": 1.9761371127224785, + "grad_norm": 9.006526947021484, + "learning_rate": 2.7596322273875874e-06, + "loss": 3.0463, + "step": 67450 + }, + { + "epoch": 1.9764300886252104, + "grad_norm": 9.133018493652344, + "learning_rate": 2.7582184951675156e-06, + "loss": 3.011, + "step": 67460 + }, + { + "epoch": 1.9767230645279426, + "grad_norm": 8.392627716064453, + "learning_rate": 2.756804987224385e-06, + "loss": 3.0303, + "step": 67470 + }, + { + "epoch": 1.9770160404306747, + "grad_norm": 8.855671882629395, + "learning_rate": 2.7553917036996047e-06, + "loss": 3.0215, + "step": 67480 + }, + { + "epoch": 1.9773090163334066, + "grad_norm": 9.34375, + "learning_rate": 
2.7539786447345684e-06, + "loss": 3.0319, + "step": 67490 + }, + { + "epoch": 1.9776019922361385, + "grad_norm": 8.845474243164062, + "learning_rate": 2.7525658104706432e-06, + "loss": 3.039, + "step": 67500 + }, + { + "epoch": 1.9778949681388704, + "grad_norm": 9.183257102966309, + "learning_rate": 2.7511532010491766e-06, + "loss": 3.0165, + "step": 67510 + }, + { + "epoch": 1.9781879440416026, + "grad_norm": 8.87154769897461, + "learning_rate": 2.7497408166114887e-06, + "loss": 3.0312, + "step": 67520 + }, + { + "epoch": 1.9784809199443347, + "grad_norm": 9.609557151794434, + "learning_rate": 2.7483286572988833e-06, + "loss": 3.0363, + "step": 67530 + }, + { + "epoch": 1.9787738958470666, + "grad_norm": 9.323368072509766, + "learning_rate": 2.7469167232526357e-06, + "loss": 3.038, + "step": 67540 + }, + { + "epoch": 1.9790668717497986, + "grad_norm": 9.702801704406738, + "learning_rate": 2.7455050146140044e-06, + "loss": 3.0497, + "step": 67550 + }, + { + "epoch": 1.9793598476525305, + "grad_norm": 8.982555389404297, + "learning_rate": 2.744093531524219e-06, + "loss": 3.0474, + "step": 67560 + }, + { + "epoch": 1.9796528235552626, + "grad_norm": 8.678030014038086, + "learning_rate": 2.7426822741244917e-06, + "loss": 3.0289, + "step": 67570 + }, + { + "epoch": 1.9799457994579945, + "grad_norm": 9.315796852111816, + "learning_rate": 2.741271242556012e-06, + "loss": 3.051, + "step": 67580 + }, + { + "epoch": 1.9800629898190873, + "eval_bleu": 0.3524518559601092, + "eval_cap_loss": 0.9040206670761108, + "eval_con_loss": 1.1382763385772705, + "eval_loss": 3.1805732250213623, + "step": 67584 + }, + { + "epoch": 1.9800629898190873, + "eval_bleu": 0.3524518559601092, + "eval_cap_loss": 0.9040206670761108, + "eval_con_loss": 1.1382763385772705, + "eval_loss": 3.1805732250213623, + "eval_runtime": 59.0629, + "eval_samples_per_second": 338.622, + "eval_steps_per_second": 0.339, + "step": 67584 + }, + { + "epoch": 1.9802387753607267, + "grad_norm": 9.290489196777344, + 
"learning_rate": 2.739860436959945e-06, + "loss": 3.0227, + "step": 67590 + }, + { + "epoch": 1.9805317512634586, + "grad_norm": 9.499844551086426, + "learning_rate": 2.73844985747743e-06, + "loss": 3.043, + "step": 67600 + }, + { + "epoch": 1.9808247271661905, + "grad_norm": 8.241901397705078, + "learning_rate": 2.7370395042495913e-06, + "loss": 3.0218, + "step": 67610 + }, + { + "epoch": 1.9811177030689224, + "grad_norm": 9.641922950744629, + "learning_rate": 2.7356293774175223e-06, + "loss": 3.0509, + "step": 67620 + }, + { + "epoch": 1.9814106789716546, + "grad_norm": 8.291583061218262, + "learning_rate": 2.734219477122303e-06, + "loss": 3.0234, + "step": 67630 + }, + { + "epoch": 1.9817036548743867, + "grad_norm": 9.025955200195312, + "learning_rate": 2.73280980350498e-06, + "loss": 3.0189, + "step": 67640 + }, + { + "epoch": 1.9819966307771186, + "grad_norm": 9.33190631866455, + "learning_rate": 2.7314003567065862e-06, + "loss": 3.0438, + "step": 67650 + }, + { + "epoch": 1.9822896066798505, + "grad_norm": 9.21124267578125, + "learning_rate": 2.7299911368681263e-06, + "loss": 3.0376, + "step": 67660 + }, + { + "epoch": 1.9825825825825825, + "grad_norm": 9.90676498413086, + "learning_rate": 2.728582144130589e-06, + "loss": 3.0394, + "step": 67670 + }, + { + "epoch": 1.9828755584853146, + "grad_norm": 9.209893226623535, + "learning_rate": 2.7271733786349297e-06, + "loss": 3.0164, + "step": 67680 + }, + { + "epoch": 1.9831685343880467, + "grad_norm": 9.338244438171387, + "learning_rate": 2.7257648405220914e-06, + "loss": 3.0339, + "step": 67690 + }, + { + "epoch": 1.9834615102907787, + "grad_norm": 9.53454303741455, + "learning_rate": 2.7243565299329868e-06, + "loss": 3.0412, + "step": 67700 + }, + { + "epoch": 1.9837544861935106, + "grad_norm": 8.507733345031738, + "learning_rate": 2.7229484470085117e-06, + "loss": 3.0488, + "step": 67710 + }, + { + "epoch": 1.9840474620962425, + "grad_norm": 9.114501953125, + "learning_rate": 2.7215405918895333e-06, + "loss": 
3.0267, + "step": 67720 + }, + { + "epoch": 1.9843404379989746, + "grad_norm": 9.675797462463379, + "learning_rate": 2.7201329647169017e-06, + "loss": 3.0052, + "step": 67730 + }, + { + "epoch": 1.9846334139017066, + "grad_norm": 9.569245338439941, + "learning_rate": 2.7187255656314404e-06, + "loss": 3.0318, + "step": 67740 + }, + { + "epoch": 1.9849263898044387, + "grad_norm": 8.867124557495117, + "learning_rate": 2.717318394773954e-06, + "loss": 3.045, + "step": 67750 + }, + { + "epoch": 1.9852193657071706, + "grad_norm": 8.65703296661377, + "learning_rate": 2.7159114522852177e-06, + "loss": 3.0133, + "step": 67760 + }, + { + "epoch": 1.9855123416099025, + "grad_norm": 8.86561393737793, + "learning_rate": 2.7145047383059915e-06, + "loss": 3.0044, + "step": 67770 + }, + { + "epoch": 1.9858053175126344, + "grad_norm": 8.754826545715332, + "learning_rate": 2.7130982529770046e-06, + "loss": 3.0204, + "step": 67780 + }, + { + "epoch": 1.9860982934153666, + "grad_norm": 9.416047096252441, + "learning_rate": 2.7116919964389723e-06, + "loss": 3.0369, + "step": 67790 + }, + { + "epoch": 1.9863912693180987, + "grad_norm": 9.881521224975586, + "learning_rate": 2.710285968832577e-06, + "loss": 3.0539, + "step": 67800 + }, + { + "epoch": 1.9866842452208306, + "grad_norm": 9.645012855529785, + "learning_rate": 2.7088801702984867e-06, + "loss": 3.0371, + "step": 67810 + }, + { + "epoch": 1.9869772211235626, + "grad_norm": 9.2636137008667, + "learning_rate": 2.7074746009773423e-06, + "loss": 3.0366, + "step": 67820 + }, + { + "epoch": 1.9872701970262945, + "grad_norm": 9.369877815246582, + "learning_rate": 2.7060692610097633e-06, + "loss": 3.032, + "step": 67830 + }, + { + "epoch": 1.9875631729290266, + "grad_norm": 9.876287460327148, + "learning_rate": 2.704664150536347e-06, + "loss": 3.0405, + "step": 67840 + }, + { + "epoch": 1.9878561488317585, + "grad_norm": 9.27919864654541, + "learning_rate": 2.703259269697663e-06, + "loss": 3.0151, + "step": 67850 + }, + { + "epoch": 
1.9881491247344907, + "grad_norm": 8.743006706237793, + "learning_rate": 2.701854618634264e-06, + "loss": 3.0279, + "step": 67860 + }, + { + "epoch": 1.9884421006372226, + "grad_norm": 8.876596450805664, + "learning_rate": 2.7004501974866754e-06, + "loss": 3.0219, + "step": 67870 + }, + { + "epoch": 1.9887350765399545, + "grad_norm": 8.683591842651367, + "learning_rate": 2.6990460063954027e-06, + "loss": 3.039, + "step": 67880 + }, + { + "epoch": 1.9890280524426864, + "grad_norm": 9.062911033630371, + "learning_rate": 2.6976420455009244e-06, + "loss": 3.021, + "step": 67890 + }, + { + "epoch": 1.9893210283454186, + "grad_norm": 8.855585098266602, + "learning_rate": 2.6962383149437006e-06, + "loss": 3.0227, + "step": 67900 + }, + { + "epoch": 1.9896140042481507, + "grad_norm": 9.55474853515625, + "learning_rate": 2.6948348148641645e-06, + "loss": 3.0211, + "step": 67910 + }, + { + "epoch": 1.9899069801508826, + "grad_norm": 9.165003776550293, + "learning_rate": 2.6934315454027323e-06, + "loss": 3.0425, + "step": 67920 + }, + { + "epoch": 1.9901999560536145, + "grad_norm": 9.102903366088867, + "learning_rate": 2.6920285066997865e-06, + "loss": 3.0274, + "step": 67930 + }, + { + "epoch": 1.9904929319563465, + "grad_norm": 9.58520221710205, + "learning_rate": 2.690625698895698e-06, + "loss": 3.0316, + "step": 67940 + }, + { + "epoch": 1.9907859078590786, + "grad_norm": 9.411687850952148, + "learning_rate": 2.6892231221308063e-06, + "loss": 3.024, + "step": 67950 + }, + { + "epoch": 1.9910788837618107, + "grad_norm": 9.192612648010254, + "learning_rate": 2.6878207765454328e-06, + "loss": 3.031, + "step": 67960 + }, + { + "epoch": 1.9913718596645427, + "grad_norm": 9.499330520629883, + "learning_rate": 2.6864186622798715e-06, + "loss": 3.034, + "step": 67970 + }, + { + "epoch": 1.9916648355672746, + "grad_norm": 9.252355575561523, + "learning_rate": 2.6850167794743966e-06, + "loss": 3.02, + "step": 67980 + }, + { + "epoch": 1.9919578114700065, + "grad_norm": 
9.276089668273926, + "learning_rate": 2.683615128269259e-06, + "loss": 3.0139, + "step": 67990 + }, + { + "epoch": 1.9922507873727386, + "grad_norm": 9.491936683654785, + "learning_rate": 2.682213708804687e-06, + "loss": 3.0225, + "step": 68000 + }, + { + "epoch": 1.9925437632754706, + "grad_norm": 8.999177932739258, + "learning_rate": 2.68081252122088e-06, + "loss": 3.0269, + "step": 68010 + }, + { + "epoch": 1.9928367391782027, + "grad_norm": 9.705119132995605, + "learning_rate": 2.679411565658023e-06, + "loss": 3.0317, + "step": 68020 + }, + { + "epoch": 1.9931297150809346, + "grad_norm": 9.000835418701172, + "learning_rate": 2.67801084225627e-06, + "loss": 3.0404, + "step": 68030 + }, + { + "epoch": 1.9934226909836665, + "grad_norm": 8.877927780151367, + "learning_rate": 2.6766103511557582e-06, + "loss": 3.0088, + "step": 68040 + }, + { + "epoch": 1.9937156668863985, + "grad_norm": 9.468542098999023, + "learning_rate": 2.675210092496594e-06, + "loss": 3.0362, + "step": 68050 + }, + { + "epoch": 1.9940086427891306, + "grad_norm": 9.26716423034668, + "learning_rate": 2.6738100664188684e-06, + "loss": 3.0251, + "step": 68060 + }, + { + "epoch": 1.9943016186918627, + "grad_norm": 9.673930168151855, + "learning_rate": 2.6724102730626444e-06, + "loss": 3.035, + "step": 68070 + }, + { + "epoch": 1.9945945945945946, + "grad_norm": 9.470138549804688, + "learning_rate": 2.671010712567965e-06, + "loss": 3.029, + "step": 68080 + }, + { + "epoch": 1.9948875704973266, + "grad_norm": 8.794164657592773, + "learning_rate": 2.669611385074845e-06, + "loss": 3.0266, + "step": 68090 + }, + { + "epoch": 1.995063356038966, + "eval_bleu": 0.3522180245602251, + "eval_cap_loss": 0.9042752385139465, + "eval_con_loss": 1.1378846168518066, + "eval_loss": 3.180044651031494, + "step": 68096 + }, + { + "epoch": 1.995063356038966, + "eval_bleu": 0.3522180245602251, + "eval_cap_loss": 0.9042752385139465, + "eval_con_loss": 1.1378846168518066, + "eval_loss": 3.180044651031494, + "eval_runtime": 
56.8304, + "eval_samples_per_second": 351.924, + "eval_steps_per_second": 0.352, + "step": 68096 + }, + { + "epoch": 1.9951805464000585, + "grad_norm": 9.589741706848145, + "learning_rate": 2.6682122907232823e-06, + "loss": 3.0324, + "step": 68100 + }, + { + "epoch": 1.9954735223027906, + "grad_norm": 8.946523666381836, + "learning_rate": 2.666813429653243e-06, + "loss": 3.0411, + "step": 68110 + }, + { + "epoch": 1.9957664982055225, + "grad_norm": 9.7788724899292, + "learning_rate": 2.665414802004681e-06, + "loss": 3.0239, + "step": 68120 + }, + { + "epoch": 1.9960594741082547, + "grad_norm": 9.50627326965332, + "learning_rate": 2.6640164079175147e-06, + "loss": 3.0431, + "step": 68130 + }, + { + "epoch": 1.9963524500109866, + "grad_norm": 8.872432708740234, + "learning_rate": 2.6626182475316474e-06, + "loss": 3.0125, + "step": 68140 + }, + { + "epoch": 1.9966454259137185, + "grad_norm": 9.107104301452637, + "learning_rate": 2.661220320986958e-06, + "loss": 3.0475, + "step": 68150 + }, + { + "epoch": 1.9969384018164504, + "grad_norm": 9.079438209533691, + "learning_rate": 2.659822628423302e-06, + "loss": 3.0399, + "step": 68160 + }, + { + "epoch": 1.9972313777191826, + "grad_norm": 8.815004348754883, + "learning_rate": 2.6584251699805062e-06, + "loss": 3.0225, + "step": 68170 + }, + { + "epoch": 1.9975243536219147, + "grad_norm": 8.51163387298584, + "learning_rate": 2.6570279457983796e-06, + "loss": 3.0399, + "step": 68180 + }, + { + "epoch": 1.9978173295246466, + "grad_norm": 8.645805358886719, + "learning_rate": 2.655630956016709e-06, + "loss": 3.023, + "step": 68190 + }, + { + "epoch": 1.9981103054273786, + "grad_norm": 10.152469635009766, + "learning_rate": 2.654234200775251e-06, + "loss": 3.04, + "step": 68200 + }, + { + "epoch": 1.9984032813301105, + "grad_norm": 8.6574068069458, + "learning_rate": 2.652837680213745e-06, + "loss": 3.0238, + "step": 68210 + }, + { + "epoch": 1.9986962572328426, + "grad_norm": 9.384949684143066, + "learning_rate": 
2.651441394471901e-06, + "loss": 3.0251, + "step": 68220 + }, + { + "epoch": 1.9989892331355748, + "grad_norm": 8.712075233459473, + "learning_rate": 2.6500453436894157e-06, + "loss": 3.0172, + "step": 68230 + }, + { + "epoch": 1.9992822090383067, + "grad_norm": 9.08700180053711, + "learning_rate": 2.64864952800595e-06, + "loss": 3.0279, + "step": 68240 + }, + { + "epoch": 1.9995751849410386, + "grad_norm": 9.885160446166992, + "learning_rate": 2.6472539475611504e-06, + "loss": 3.0281, + "step": 68250 + }, + { + "epoch": 1.9998681608437705, + "grad_norm": 9.400811195373535, + "learning_rate": 2.6458586024946333e-06, + "loss": 3.0351, + "step": 68260 + }, + { + "epoch": 2.0001611367465024, + "grad_norm": 21.97415542602539, + "learning_rate": 2.6444634929459985e-06, + "loss": 3.0209, + "step": 68270 + }, + { + "epoch": 2.000454112649235, + "grad_norm": 19.539268493652344, + "learning_rate": 2.643068619054814e-06, + "loss": 3.047, + "step": 68280 + }, + { + "epoch": 2.0007470885519667, + "grad_norm": 20.613628387451172, + "learning_rate": 2.641673980960631e-06, + "loss": 3.0281, + "step": 68290 + }, + { + "epoch": 2.0010400644546986, + "grad_norm": 16.367429733276367, + "learning_rate": 2.6402795788029756e-06, + "loss": 3.022, + "step": 68300 + }, + { + "epoch": 2.0013330403574305, + "grad_norm": 19.26883888244629, + "learning_rate": 2.6388854127213492e-06, + "loss": 3.0384, + "step": 68310 + }, + { + "epoch": 2.0016260162601625, + "grad_norm": 17.32339096069336, + "learning_rate": 2.6374914828552284e-06, + "loss": 3.0351, + "step": 68320 + }, + { + "epoch": 2.001918992162895, + "grad_norm": 21.651466369628906, + "learning_rate": 2.63609778934407e-06, + "loss": 3.0368, + "step": 68330 + }, + { + "epoch": 2.0022119680656267, + "grad_norm": 21.361257553100586, + "learning_rate": 2.6347043323273016e-06, + "loss": 3.0393, + "step": 68340 + }, + { + "epoch": 2.0025049439683587, + "grad_norm": 19.534400939941406, + "learning_rate": 2.6333111119443333e-06, + "loss": 3.0297, 
+ "step": 68350 + }, + { + "epoch": 2.0027979198710906, + "grad_norm": 17.751047134399414, + "learning_rate": 2.6319181283345453e-06, + "loss": 3.0225, + "step": 68360 + }, + { + "epoch": 2.0030908957738225, + "grad_norm": 18.56531524658203, + "learning_rate": 2.6305253816372992e-06, + "loss": 3.0378, + "step": 68370 + }, + { + "epoch": 2.0033838716765544, + "grad_norm": 17.267501831054688, + "learning_rate": 2.6291328719919314e-06, + "loss": 3.0278, + "step": 68380 + }, + { + "epoch": 2.0036768475792868, + "grad_norm": 19.436355590820312, + "learning_rate": 2.6277405995377556e-06, + "loss": 3.0313, + "step": 68390 + }, + { + "epoch": 2.0039698234820187, + "grad_norm": 18.444290161132812, + "learning_rate": 2.626348564414057e-06, + "loss": 3.0443, + "step": 68400 + }, + { + "epoch": 2.0042627993847506, + "grad_norm": 19.8458194732666, + "learning_rate": 2.624956766760104e-06, + "loss": 3.0495, + "step": 68410 + }, + { + "epoch": 2.0045557752874825, + "grad_norm": 20.572118759155273, + "learning_rate": 2.6235652067151334e-06, + "loss": 3.043, + "step": 68420 + }, + { + "epoch": 2.0048487511902144, + "grad_norm": 20.16639518737793, + "learning_rate": 2.6221738844183676e-06, + "loss": 3.0534, + "step": 68430 + }, + { + "epoch": 2.005141727092947, + "grad_norm": 19.5582218170166, + "learning_rate": 2.6207828000089953e-06, + "loss": 3.0329, + "step": 68440 + }, + { + "epoch": 2.0054347029956787, + "grad_norm": 16.396753311157227, + "learning_rate": 2.619391953626188e-06, + "loss": 3.0508, + "step": 68450 + }, + { + "epoch": 2.0057276788984106, + "grad_norm": 18.24407958984375, + "learning_rate": 2.6180013454090926e-06, + "loss": 3.0286, + "step": 68460 + }, + { + "epoch": 2.0060206548011426, + "grad_norm": 19.441747665405273, + "learning_rate": 2.616610975496833e-06, + "loss": 3.0403, + "step": 68470 + }, + { + "epoch": 2.0063136307038745, + "grad_norm": 18.484691619873047, + "learning_rate": 2.615220844028502e-06, + "loss": 3.03, + "step": 68480 + }, + { + "epoch": 
2.0066066066066064, + "grad_norm": 19.895212173461914, + "learning_rate": 2.61383095114318e-06, + "loss": 3.0291, + "step": 68490 + }, + { + "epoch": 2.0068995825093388, + "grad_norm": 19.209259033203125, + "learning_rate": 2.612441296979912e-06, + "loss": 3.0356, + "step": 68500 + }, + { + "epoch": 2.0071925584120707, + "grad_norm": 21.27365493774414, + "learning_rate": 2.61105188167773e-06, + "loss": 3.0295, + "step": 68510 + }, + { + "epoch": 2.0074855343148026, + "grad_norm": 20.174484252929688, + "learning_rate": 2.609662705375632e-06, + "loss": 3.0575, + "step": 68520 + }, + { + "epoch": 2.0077785102175345, + "grad_norm": 17.452518463134766, + "learning_rate": 2.608273768212599e-06, + "loss": 3.0433, + "step": 68530 + }, + { + "epoch": 2.0080714861202664, + "grad_norm": 19.206024169921875, + "learning_rate": 2.6068850703275856e-06, + "loss": 3.059, + "step": 68540 + }, + { + "epoch": 2.008364462022999, + "grad_norm": 21.62403678894043, + "learning_rate": 2.6054966118595233e-06, + "loss": 3.0367, + "step": 68550 + }, + { + "epoch": 2.0086574379257307, + "grad_norm": 18.543842315673828, + "learning_rate": 2.604108392947321e-06, + "loss": 3.0648, + "step": 68560 + }, + { + "epoch": 2.0089504138284626, + "grad_norm": 17.644031524658203, + "learning_rate": 2.602720413729857e-06, + "loss": 3.0537, + "step": 68570 + }, + { + "epoch": 2.0092433897311945, + "grad_norm": 20.958675384521484, + "learning_rate": 2.6013326743459953e-06, + "loss": 3.0403, + "step": 68580 + }, + { + "epoch": 2.0095363656339265, + "grad_norm": 19.34272575378418, + "learning_rate": 2.599945174934567e-06, + "loss": 3.0363, + "step": 68590 + }, + { + "epoch": 2.009829341536659, + "grad_norm": 16.481943130493164, + "learning_rate": 2.5985579156343865e-06, + "loss": 3.0521, + "step": 68600 + }, + { + "epoch": 2.010063722258844, + "eval_bleu": 0.35143254328441753, + "eval_cap_loss": 0.9061561822891235, + "eval_con_loss": 1.1468616724014282, + "eval_loss": 3.1998796463012695, + "step": 68608 + }, + 
{ + "epoch": 2.010063722258844, + "eval_bleu": 0.35143254328441753, + "eval_cap_loss": 0.9061561822891235, + "eval_con_loss": 1.1468616724014282, + "eval_loss": 3.1998796463012695, + "eval_runtime": 52.443, + "eval_samples_per_second": 381.367, + "eval_steps_per_second": 0.381, + "step": 68608 + }, + { + "epoch": 2.0101223174393907, + "grad_norm": 17.53737449645996, + "learning_rate": 2.5971708965842378e-06, + "loss": 3.0463, + "step": 68610 + }, + { + "epoch": 2.0104152933421227, + "grad_norm": 18.26576805114746, + "learning_rate": 2.595784117922885e-06, + "loss": 3.0592, + "step": 68620 + }, + { + "epoch": 2.0107082692448546, + "grad_norm": 16.96661949157715, + "learning_rate": 2.5943975797890674e-06, + "loss": 3.0406, + "step": 68630 + }, + { + "epoch": 2.0110012451475865, + "grad_norm": 16.0164852142334, + "learning_rate": 2.5930112823215024e-06, + "loss": 3.0443, + "step": 68640 + }, + { + "epoch": 2.0112942210503184, + "grad_norm": 21.607879638671875, + "learning_rate": 2.5916252256588762e-06, + "loss": 3.0524, + "step": 68650 + }, + { + "epoch": 2.0115871969530508, + "grad_norm": 16.885866165161133, + "learning_rate": 2.5902394099398596e-06, + "loss": 3.0338, + "step": 68660 + }, + { + "epoch": 2.0118801728557827, + "grad_norm": 20.523466110229492, + "learning_rate": 2.5888538353030924e-06, + "loss": 3.0573, + "step": 68670 + }, + { + "epoch": 2.0121731487585146, + "grad_norm": 17.599876403808594, + "learning_rate": 2.5874685018871948e-06, + "loss": 3.0298, + "step": 68680 + }, + { + "epoch": 2.0124661246612465, + "grad_norm": 18.915815353393555, + "learning_rate": 2.5860834098307593e-06, + "loss": 3.0386, + "step": 68690 + }, + { + "epoch": 2.0127591005639784, + "grad_norm": 17.676372528076172, + "learning_rate": 2.584698559272357e-06, + "loss": 3.0398, + "step": 68700 + }, + { + "epoch": 2.013052076466711, + "grad_norm": 19.05833625793457, + "learning_rate": 2.5833139503505343e-06, + "loss": 3.0269, + "step": 68710 + }, + { + "epoch": 2.0133450523694427, + 
"grad_norm": 18.37013053894043, + "learning_rate": 2.5819295832038157e-06, + "loss": 3.0414, + "step": 68720 + }, + { + "epoch": 2.0136380282721746, + "grad_norm": 22.270973205566406, + "learning_rate": 2.580545457970694e-06, + "loss": 3.0223, + "step": 68730 + }, + { + "epoch": 2.0139310041749066, + "grad_norm": 18.93882942199707, + "learning_rate": 2.579161574789647e-06, + "loss": 3.0383, + "step": 68740 + }, + { + "epoch": 2.0142239800776385, + "grad_norm": 19.15229606628418, + "learning_rate": 2.5777779337991205e-06, + "loss": 3.027, + "step": 68750 + }, + { + "epoch": 2.0145169559803704, + "grad_norm": 21.84851837158203, + "learning_rate": 2.576394535137543e-06, + "loss": 3.044, + "step": 68760 + }, + { + "epoch": 2.0148099318831028, + "grad_norm": 20.408123016357422, + "learning_rate": 2.575011378943311e-06, + "loss": 3.0465, + "step": 68770 + }, + { + "epoch": 2.0151029077858347, + "grad_norm": 18.805606842041016, + "learning_rate": 2.573628465354804e-06, + "loss": 3.0377, + "step": 68780 + }, + { + "epoch": 2.0153958836885666, + "grad_norm": 19.349403381347656, + "learning_rate": 2.572245794510374e-06, + "loss": 3.0446, + "step": 68790 + }, + { + "epoch": 2.0156888595912985, + "grad_norm": 18.94744873046875, + "learning_rate": 2.5708633665483506e-06, + "loss": 3.0408, + "step": 68800 + }, + { + "epoch": 2.0159818354940304, + "grad_norm": 17.023523330688477, + "learning_rate": 2.569481181607034e-06, + "loss": 3.0127, + "step": 68810 + }, + { + "epoch": 2.016274811396763, + "grad_norm": 19.896116256713867, + "learning_rate": 2.568099239824707e-06, + "loss": 3.0459, + "step": 68820 + }, + { + "epoch": 2.0165677872994947, + "grad_norm": 22.629533767700195, + "learning_rate": 2.5667175413396216e-06, + "loss": 3.063, + "step": 68830 + }, + { + "epoch": 2.0168607632022266, + "grad_norm": 19.312856674194336, + "learning_rate": 2.565474220836437e-06, + "loss": 3.0601, + "step": 68840 + }, + { + "epoch": 2.0171537391049585, + "grad_norm": 17.539335250854492, + 
"learning_rate": 2.564092984996919e-06, + "loss": 3.0245, + "step": 68850 + }, + { + "epoch": 2.0174467150076905, + "grad_norm": 20.251741409301758, + "learning_rate": 2.5627119928554477e-06, + "loss": 3.0549, + "step": 68860 + }, + { + "epoch": 2.017739690910423, + "grad_norm": 17.738910675048828, + "learning_rate": 2.56133124455018e-06, + "loss": 3.029, + "step": 68870 + }, + { + "epoch": 2.0180326668131547, + "grad_norm": 18.91864585876465, + "learning_rate": 2.5599507402192524e-06, + "loss": 3.0271, + "step": 68880 + }, + { + "epoch": 2.0183256427158867, + "grad_norm": 18.59192657470703, + "learning_rate": 2.5585704800007797e-06, + "loss": 3.0334, + "step": 68890 + }, + { + "epoch": 2.0186186186186186, + "grad_norm": 24.062253952026367, + "learning_rate": 2.5571904640328437e-06, + "loss": 3.0359, + "step": 68900 + }, + { + "epoch": 2.0189115945213505, + "grad_norm": 19.85233497619629, + "learning_rate": 2.55581069245351e-06, + "loss": 3.0525, + "step": 68910 + }, + { + "epoch": 2.0192045704240824, + "grad_norm": 17.902494430541992, + "learning_rate": 2.5544311654008142e-06, + "loss": 3.0341, + "step": 68920 + }, + { + "epoch": 2.0194975463268148, + "grad_norm": 19.438961029052734, + "learning_rate": 2.553051883012774e-06, + "loss": 3.0375, + "step": 68930 + }, + { + "epoch": 2.0197905222295467, + "grad_norm": 14.174229621887207, + "learning_rate": 2.551672845427373e-06, + "loss": 3.0406, + "step": 68940 + }, + { + "epoch": 2.0200834981322786, + "grad_norm": 15.412330627441406, + "learning_rate": 2.5502940527825813e-06, + "loss": 3.0308, + "step": 68950 + }, + { + "epoch": 2.0203764740350105, + "grad_norm": 19.006017684936523, + "learning_rate": 2.548915505216333e-06, + "loss": 3.0457, + "step": 68960 + }, + { + "epoch": 2.0206694499377424, + "grad_norm": 19.029335021972656, + "learning_rate": 2.5475372028665497e-06, + "loss": 3.0102, + "step": 68970 + }, + { + "epoch": 2.020962425840475, + "grad_norm": 17.471281051635742, + "learning_rate": 
2.5461591458711164e-06, + "loss": 3.0325, + "step": 68980 + }, + { + "epoch": 2.0212554017432067, + "grad_norm": 19.39676284790039, + "learning_rate": 2.544781334367903e-06, + "loss": 3.0486, + "step": 68990 + }, + { + "epoch": 2.0215483776459386, + "grad_norm": 15.514202117919922, + "learning_rate": 2.543403768494751e-06, + "loss": 3.0249, + "step": 69000 + }, + { + "epoch": 2.0218413535486706, + "grad_norm": 20.024728775024414, + "learning_rate": 2.5420264483894787e-06, + "loss": 3.0406, + "step": 69010 + }, + { + "epoch": 2.0221343294514025, + "grad_norm": 16.82016372680664, + "learning_rate": 2.5406493741898764e-06, + "loss": 3.0389, + "step": 69020 + }, + { + "epoch": 2.022427305354135, + "grad_norm": 20.355703353881836, + "learning_rate": 2.539272546033714e-06, + "loss": 3.0266, + "step": 69030 + }, + { + "epoch": 2.0227202812568668, + "grad_norm": 17.734310150146484, + "learning_rate": 2.5378959640587337e-06, + "loss": 3.0426, + "step": 69040 + }, + { + "epoch": 2.0230132571595987, + "grad_norm": 20.305631637573242, + "learning_rate": 2.536519628402657e-06, + "loss": 3.0508, + "step": 69050 + }, + { + "epoch": 2.0233062330623306, + "grad_norm": 20.177413940429688, + "learning_rate": 2.5351435392031733e-06, + "loss": 3.0257, + "step": 69060 + }, + { + "epoch": 2.0235992089650625, + "grad_norm": 20.66153907775879, + "learning_rate": 2.5337676965979556e-06, + "loss": 3.0578, + "step": 69070 + }, + { + "epoch": 2.0238921848677944, + "grad_norm": 20.51680564880371, + "learning_rate": 2.5323921007246484e-06, + "loss": 3.0467, + "step": 69080 + }, + { + "epoch": 2.024185160770527, + "grad_norm": 17.998754501342773, + "learning_rate": 2.5310167517208728e-06, + "loss": 3.0729, + "step": 69090 + }, + { + "epoch": 2.0244781366732587, + "grad_norm": 20.090106964111328, + "learning_rate": 2.529641649724222e-06, + "loss": 3.0477, + "step": 69100 + }, + { + "epoch": 2.0247711125759906, + "grad_norm": 21.37087059020996, + "learning_rate": 2.52826679487227e-06, + "loss": 
3.0552, + "step": 69110 + }, + { + "epoch": 2.0250640884787225, + "grad_norm": 17.64130210876465, + "learning_rate": 2.5268921873025585e-06, + "loss": 3.0405, + "step": 69120 + }, + { + "epoch": 2.0250640884787225, + "eval_bleu": 0.3514919011662507, + "eval_cap_loss": 0.9063278436660767, + "eval_con_loss": 1.1469923257827759, + "eval_loss": 3.200312376022339, + "step": 69120 + }, + { + "epoch": 2.0250640884787225, + "eval_bleu": 0.3514919011662507, + "eval_cap_loss": 0.9063278436660767, + "eval_con_loss": 1.1469923257827759, + "eval_loss": 3.200312376022339, + "eval_runtime": 53.066, + "eval_samples_per_second": 376.889, + "eval_steps_per_second": 0.377, + "step": 69120 + }, + { + "epoch": 2.0253570643814545, + "grad_norm": 20.854116439819336, + "learning_rate": 2.525517827152614e-06, + "loss": 3.0624, + "step": 69130 + }, + { + "epoch": 2.025650040284187, + "grad_norm": 17.250722885131836, + "learning_rate": 2.524143714559927e-06, + "loss": 3.0513, + "step": 69140 + }, + { + "epoch": 2.0259430161869187, + "grad_norm": 20.176321029663086, + "learning_rate": 2.5227698496619736e-06, + "loss": 3.0478, + "step": 69150 + }, + { + "epoch": 2.0262359920896507, + "grad_norm": 18.098466873168945, + "learning_rate": 2.5213962325961994e-06, + "loss": 3.0437, + "step": 69160 + }, + { + "epoch": 2.0265289679923826, + "grad_norm": 19.372262954711914, + "learning_rate": 2.520022863500029e-06, + "loss": 3.0558, + "step": 69170 + }, + { + "epoch": 2.0268219438951145, + "grad_norm": 20.488157272338867, + "learning_rate": 2.5186497425108563e-06, + "loss": 3.0342, + "step": 69180 + }, + { + "epoch": 2.0271149197978464, + "grad_norm": 18.11314582824707, + "learning_rate": 2.5172768697660577e-06, + "loss": 3.044, + "step": 69190 + }, + { + "epoch": 2.0274078957005788, + "grad_norm": 18.785306930541992, + "learning_rate": 2.515904245402976e-06, + "loss": 3.0366, + "step": 69200 + }, + { + "epoch": 2.0277008716033107, + "grad_norm": 17.73948860168457, + "learning_rate": 
2.5145318695589394e-06, + "loss": 3.0325, + "step": 69210 + }, + { + "epoch": 2.0279938475060426, + "grad_norm": 19.879627227783203, + "learning_rate": 2.5131597423712416e-06, + "loss": 3.0426, + "step": 69220 + }, + { + "epoch": 2.0282868234087745, + "grad_norm": 22.48439598083496, + "learning_rate": 2.5117878639771575e-06, + "loss": 3.0572, + "step": 69230 + }, + { + "epoch": 2.0285797993115064, + "grad_norm": 23.747764587402344, + "learning_rate": 2.5104162345139355e-06, + "loss": 3.0598, + "step": 69240 + }, + { + "epoch": 2.028872775214239, + "grad_norm": 16.89480209350586, + "learning_rate": 2.509044854118799e-06, + "loss": 3.0578, + "step": 69250 + }, + { + "epoch": 2.0291657511169707, + "grad_norm": 18.235689163208008, + "learning_rate": 2.507673722928948e-06, + "loss": 3.0536, + "step": 69260 + }, + { + "epoch": 2.0294587270197026, + "grad_norm": 19.908382415771484, + "learning_rate": 2.5063028410815537e-06, + "loss": 3.0513, + "step": 69270 + }, + { + "epoch": 2.0297517029224346, + "grad_norm": 15.726092338562012, + "learning_rate": 2.5049322087137674e-06, + "loss": 3.0423, + "step": 69280 + }, + { + "epoch": 2.0300446788251665, + "grad_norm": 17.399988174438477, + "learning_rate": 2.503561825962708e-06, + "loss": 3.0406, + "step": 69290 + }, + { + "epoch": 2.030337654727899, + "grad_norm": 19.024887084960938, + "learning_rate": 2.5021916929654794e-06, + "loss": 3.0569, + "step": 69300 + }, + { + "epoch": 2.0306306306306308, + "grad_norm": 20.851146697998047, + "learning_rate": 2.5008218098591507e-06, + "loss": 3.0638, + "step": 69310 + }, + { + "epoch": 2.0309236065333627, + "grad_norm": 17.850969314575195, + "learning_rate": 2.4994521767807735e-06, + "loss": 3.0405, + "step": 69320 + }, + { + "epoch": 2.0312165824360946, + "grad_norm": 16.89542579650879, + "learning_rate": 2.4980827938673706e-06, + "loss": 3.0374, + "step": 69330 + }, + { + "epoch": 2.0315095583388265, + "grad_norm": 19.91242027282715, + "learning_rate": 2.496713661255942e-06, + "loss": 
3.0494, + "step": 69340 + }, + { + "epoch": 2.0318025342415584, + "grad_norm": 18.392263412475586, + "learning_rate": 2.4953447790834594e-06, + "loss": 3.0423, + "step": 69350 + }, + { + "epoch": 2.032095510144291, + "grad_norm": 23.31220054626465, + "learning_rate": 2.4939761474868734e-06, + "loss": 3.0489, + "step": 69360 + }, + { + "epoch": 2.0323884860470227, + "grad_norm": 20.985740661621094, + "learning_rate": 2.492607766603104e-06, + "loss": 3.0207, + "step": 69370 + }, + { + "epoch": 2.0326814619497546, + "grad_norm": 18.784894943237305, + "learning_rate": 2.491239636569054e-06, + "loss": 3.0307, + "step": 69380 + }, + { + "epoch": 2.0329744378524865, + "grad_norm": 19.418012619018555, + "learning_rate": 2.489871757521592e-06, + "loss": 3.0247, + "step": 69390 + }, + { + "epoch": 2.0332674137552185, + "grad_norm": 18.996936798095703, + "learning_rate": 2.4885041295975686e-06, + "loss": 3.0555, + "step": 69400 + }, + { + "epoch": 2.033560389657951, + "grad_norm": 24.122798919677734, + "learning_rate": 2.487136752933807e-06, + "loss": 3.0459, + "step": 69410 + }, + { + "epoch": 2.0338533655606827, + "grad_norm": 17.848724365234375, + "learning_rate": 2.4857696276671067e-06, + "loss": 3.0371, + "step": 69420 + }, + { + "epoch": 2.0341463414634147, + "grad_norm": 18.226173400878906, + "learning_rate": 2.4844027539342375e-06, + "loss": 3.0795, + "step": 69430 + }, + { + "epoch": 2.0344393173661466, + "grad_norm": 19.64680290222168, + "learning_rate": 2.4830361318719493e-06, + "loss": 3.0464, + "step": 69440 + }, + { + "epoch": 2.0347322932688785, + "grad_norm": 17.588972091674805, + "learning_rate": 2.481669761616962e-06, + "loss": 3.0465, + "step": 69450 + }, + { + "epoch": 2.0350252691716104, + "grad_norm": 19.34797477722168, + "learning_rate": 2.4803036433059762e-06, + "loss": 3.0461, + "step": 69460 + }, + { + "epoch": 2.035318245074343, + "grad_norm": 17.398523330688477, + "learning_rate": 2.47893777707566e-06, + "loss": 3.0486, + "step": 69470 + }, + { + 
"epoch": 2.0356112209770747, + "grad_norm": 15.990434646606445, + "learning_rate": 2.4775721630626624e-06, + "loss": 3.0296, + "step": 69480 + }, + { + "epoch": 2.0359041968798066, + "grad_norm": 21.966978073120117, + "learning_rate": 2.476206801403605e-06, + "loss": 3.0455, + "step": 69490 + }, + { + "epoch": 2.0361971727825385, + "grad_norm": 17.553443908691406, + "learning_rate": 2.4748416922350856e-06, + "loss": 3.0306, + "step": 69500 + }, + { + "epoch": 2.0364901486852705, + "grad_norm": 20.1547794342041, + "learning_rate": 2.4734768356936718e-06, + "loss": 3.0492, + "step": 69510 + }, + { + "epoch": 2.036783124588003, + "grad_norm": 19.707653045654297, + "learning_rate": 2.472112231915913e-06, + "loss": 3.0298, + "step": 69520 + }, + { + "epoch": 2.0370761004907347, + "grad_norm": 16.455535888671875, + "learning_rate": 2.4707478810383264e-06, + "loss": 3.0513, + "step": 69530 + }, + { + "epoch": 2.0373690763934666, + "grad_norm": 21.47121810913086, + "learning_rate": 2.4693837831974105e-06, + "loss": 3.0496, + "step": 69540 + }, + { + "epoch": 2.0376620522961986, + "grad_norm": 20.570653915405273, + "learning_rate": 2.468019938529631e-06, + "loss": 3.0294, + "step": 69550 + }, + { + "epoch": 2.0379550281989305, + "grad_norm": 21.794761657714844, + "learning_rate": 2.4666563471714354e-06, + "loss": 3.0249, + "step": 69560 + }, + { + "epoch": 2.038248004101663, + "grad_norm": 19.18895149230957, + "learning_rate": 2.4652930092592424e-06, + "loss": 3.0545, + "step": 69570 + }, + { + "epoch": 2.0385409800043948, + "grad_norm": 19.022785186767578, + "learning_rate": 2.4639299249294462e-06, + "loss": 3.0408, + "step": 69580 + }, + { + "epoch": 2.0388339559071267, + "grad_norm": 18.615571975708008, + "learning_rate": 2.4625670943184164e-06, + "loss": 3.0216, + "step": 69590 + }, + { + "epoch": 2.0391269318098586, + "grad_norm": 20.420799255371094, + "learning_rate": 2.4612045175624934e-06, + "loss": 3.0545, + "step": 69600 + }, + { + "epoch": 2.0394199077125905, + 
"grad_norm": 19.506664276123047, + "learning_rate": 2.459842194797998e-06, + "loss": 3.0465, + "step": 69610 + }, + { + "epoch": 2.0397128836153224, + "grad_norm": 18.516645431518555, + "learning_rate": 2.458480126161219e-06, + "loss": 3.0309, + "step": 69620 + }, + { + "epoch": 2.040005859518055, + "grad_norm": 16.952781677246094, + "learning_rate": 2.457118311788427e-06, + "loss": 3.0547, + "step": 69630 + }, + { + "epoch": 2.040064454698601, + "eval_bleu": 0.35074680649673373, + "eval_cap_loss": 0.9066649675369263, + "eval_con_loss": 1.1474876403808594, + "eval_loss": 3.2016403675079346, + "step": 69632 + }, + { + "epoch": 2.040064454698601, + "eval_bleu": 0.35074680649673373, + "eval_cap_loss": 0.9066649675369263, + "eval_con_loss": 1.1474876403808594, + "eval_loss": 3.2016403675079346, + "eval_runtime": 52.0742, + "eval_samples_per_second": 384.067, + "eval_steps_per_second": 0.384, + "step": 69632 + }, + { + "epoch": 2.0402988354207867, + "grad_norm": 18.327285766601562, + "learning_rate": 2.4557567518158585e-06, + "loss": 3.0299, + "step": 69640 + }, + { + "epoch": 2.0405918113235186, + "grad_norm": 20.87584114074707, + "learning_rate": 2.4543954463797337e-06, + "loss": 3.0418, + "step": 69650 + }, + { + "epoch": 2.0408847872262506, + "grad_norm": 18.98100471496582, + "learning_rate": 2.453034395616242e-06, + "loss": 3.0354, + "step": 69660 + }, + { + "epoch": 2.0411777631289825, + "grad_norm": 17.8748836517334, + "learning_rate": 2.4516735996615495e-06, + "loss": 3.067, + "step": 69670 + }, + { + "epoch": 2.041470739031715, + "grad_norm": 19.793615341186523, + "learning_rate": 2.4503130586517933e-06, + "loss": 3.0532, + "step": 69680 + }, + { + "epoch": 2.0417637149344467, + "grad_norm": 14.602084159851074, + "learning_rate": 2.4489527727230904e-06, + "loss": 3.0267, + "step": 69690 + }, + { + "epoch": 2.0420566908371787, + "grad_norm": 22.18909454345703, + "learning_rate": 2.447592742011526e-06, + "loss": 3.0294, + "step": 69700 + }, + { + "epoch": 
2.0423496667399106, + "grad_norm": 20.4666690826416, + "learning_rate": 2.4462329666531665e-06, + "loss": 3.0369, + "step": 69710 + }, + { + "epoch": 2.0426426426426425, + "grad_norm": 18.146455764770508, + "learning_rate": 2.4448734467840464e-06, + "loss": 3.0506, + "step": 69720 + }, + { + "epoch": 2.042935618545375, + "grad_norm": 17.279251098632812, + "learning_rate": 2.4435141825401783e-06, + "loss": 3.0567, + "step": 69730 + }, + { + "epoch": 2.043228594448107, + "grad_norm": 19.399600982666016, + "learning_rate": 2.4421551740575494e-06, + "loss": 3.0431, + "step": 69740 + }, + { + "epoch": 2.0435215703508387, + "grad_norm": 19.124723434448242, + "learning_rate": 2.440796421472122e-06, + "loss": 3.0475, + "step": 69750 + }, + { + "epoch": 2.0438145462535706, + "grad_norm": 17.782379150390625, + "learning_rate": 2.4394379249198276e-06, + "loss": 3.0241, + "step": 69760 + }, + { + "epoch": 2.0441075221563025, + "grad_norm": 16.970298767089844, + "learning_rate": 2.438079684536579e-06, + "loss": 3.0421, + "step": 69770 + }, + { + "epoch": 2.0444004980590345, + "grad_norm": 20.149944305419922, + "learning_rate": 2.4367217004582574e-06, + "loss": 3.041, + "step": 69780 + }, + { + "epoch": 2.044693473961767, + "grad_norm": 17.69672393798828, + "learning_rate": 2.4353639728207234e-06, + "loss": 3.0471, + "step": 69790 + }, + { + "epoch": 2.0449864498644987, + "grad_norm": 15.485025405883789, + "learning_rate": 2.434006501759807e-06, + "loss": 3.0266, + "step": 69800 + }, + { + "epoch": 2.0452794257672307, + "grad_norm": 19.828744888305664, + "learning_rate": 2.4326492874113166e-06, + "loss": 3.0475, + "step": 69810 + }, + { + "epoch": 2.0455724016699626, + "grad_norm": 18.573143005371094, + "learning_rate": 2.431292329911033e-06, + "loss": 3.052, + "step": 69820 + }, + { + "epoch": 2.0458653775726945, + "grad_norm": 18.613916397094727, + "learning_rate": 2.4299356293947146e-06, + "loss": 3.0391, + "step": 69830 + }, + { + "epoch": 2.046158353475427, + "grad_norm": 
17.79893684387207, + "learning_rate": 2.4285791859980866e-06, + "loss": 3.0589, + "step": 69840 + }, + { + "epoch": 2.0464513293781588, + "grad_norm": 19.14837074279785, + "learning_rate": 2.4272229998568576e-06, + "loss": 3.0617, + "step": 69850 + }, + { + "epoch": 2.0467443052808907, + "grad_norm": 17.579025268554688, + "learning_rate": 2.4258670711067016e-06, + "loss": 3.0475, + "step": 69860 + }, + { + "epoch": 2.0470372811836226, + "grad_norm": 19.409832000732422, + "learning_rate": 2.4245113998832747e-06, + "loss": 3.0531, + "step": 69870 + }, + { + "epoch": 2.0473302570863545, + "grad_norm": 20.956851959228516, + "learning_rate": 2.4231559863222013e-06, + "loss": 3.0554, + "step": 69880 + }, + { + "epoch": 2.0476232329890864, + "grad_norm": 19.20332145690918, + "learning_rate": 2.4218008305590833e-06, + "loss": 3.0338, + "step": 69890 + }, + { + "epoch": 2.047916208891819, + "grad_norm": 18.855636596679688, + "learning_rate": 2.420445932729496e-06, + "loss": 3.0713, + "step": 69900 + }, + { + "epoch": 2.0482091847945507, + "grad_norm": 15.397140502929688, + "learning_rate": 2.4190912929689913e-06, + "loss": 3.0371, + "step": 69910 + }, + { + "epoch": 2.0485021606972826, + "grad_norm": 19.749271392822266, + "learning_rate": 2.417736911413089e-06, + "loss": 3.0168, + "step": 69920 + }, + { + "epoch": 2.0487951366000146, + "grad_norm": 19.531381607055664, + "learning_rate": 2.41638278819729e-06, + "loss": 3.0215, + "step": 69930 + }, + { + "epoch": 2.0490881125027465, + "grad_norm": 19.337148666381836, + "learning_rate": 2.415028923457064e-06, + "loss": 3.0257, + "step": 69940 + }, + { + "epoch": 2.049381088405479, + "grad_norm": 20.824954986572266, + "learning_rate": 2.4136753173278572e-06, + "loss": 3.0367, + "step": 69950 + }, + { + "epoch": 2.0496740643082108, + "grad_norm": 21.050283432006836, + "learning_rate": 2.412321969945093e-06, + "loss": 3.0417, + "step": 69960 + }, + { + "epoch": 2.0499670402109427, + "grad_norm": 19.011476516723633, + 
"learning_rate": 2.4109688814441616e-06, + "loss": 3.0438, + "step": 69970 + }, + { + "epoch": 2.0502600161136746, + "grad_norm": 17.309345245361328, + "learning_rate": 2.4096160519604334e-06, + "loss": 3.0454, + "step": 69980 + }, + { + "epoch": 2.0505529920164065, + "grad_norm": 19.45785903930664, + "learning_rate": 2.408263481629251e-06, + "loss": 3.0373, + "step": 69990 + }, + { + "epoch": 2.050845967919139, + "grad_norm": 18.959259033203125, + "learning_rate": 2.406911170585933e-06, + "loss": 3.0247, + "step": 70000 + }, + { + "epoch": 2.051138943821871, + "grad_norm": 16.898588180541992, + "learning_rate": 2.4055591189657662e-06, + "loss": 3.0725, + "step": 70010 + }, + { + "epoch": 2.0514319197246027, + "grad_norm": 21.439420700073242, + "learning_rate": 2.404207326904019e-06, + "loss": 3.025, + "step": 70020 + }, + { + "epoch": 2.0517248956273346, + "grad_norm": 21.778806686401367, + "learning_rate": 2.4028557945359265e-06, + "loss": 3.0486, + "step": 70030 + }, + { + "epoch": 2.0520178715300665, + "grad_norm": 16.392005920410156, + "learning_rate": 2.4015045219967058e-06, + "loss": 3.0524, + "step": 70040 + }, + { + "epoch": 2.0523108474327985, + "grad_norm": 20.849258422851562, + "learning_rate": 2.400153509421537e-06, + "loss": 3.0327, + "step": 70050 + }, + { + "epoch": 2.052603823335531, + "grad_norm": 14.25741958618164, + "learning_rate": 2.3988027569455895e-06, + "loss": 3.0287, + "step": 70060 + }, + { + "epoch": 2.0528967992382627, + "grad_norm": 17.52887725830078, + "learning_rate": 2.3974522647039915e-06, + "loss": 3.0465, + "step": 70070 + }, + { + "epoch": 2.0531897751409947, + "grad_norm": 19.545621871948242, + "learning_rate": 2.3961020328318562e-06, + "loss": 3.0392, + "step": 70080 + }, + { + "epoch": 2.0534827510437266, + "grad_norm": 18.68536376953125, + "learning_rate": 2.3947520614642617e-06, + "loss": 3.0485, + "step": 70090 + }, + { + "epoch": 2.0537757269464585, + "grad_norm": 16.481355667114258, + "learning_rate": 
2.3934023507362695e-06, + "loss": 3.0374, + "step": 70100 + }, + { + "epoch": 2.054068702849191, + "grad_norm": 20.82215118408203, + "learning_rate": 2.392052900782906e-06, + "loss": 3.037, + "step": 70110 + }, + { + "epoch": 2.0543616787519228, + "grad_norm": 22.60198974609375, + "learning_rate": 2.3907037117391784e-06, + "loss": 3.0327, + "step": 70120 + }, + { + "epoch": 2.0546546546546547, + "grad_norm": 17.965295791625977, + "learning_rate": 2.3893547837400607e-06, + "loss": 3.0523, + "step": 70130 + }, + { + "epoch": 2.0549476305573866, + "grad_norm": 19.240304946899414, + "learning_rate": 2.388006116920512e-06, + "loss": 3.0525, + "step": 70140 + }, + { + "epoch": 2.0550648209184796, + "eval_bleu": 0.35121966282480693, + "eval_cap_loss": 0.9067361354827881, + "eval_con_loss": 1.1478495597839355, + "eval_loss": 3.2024354934692383, + "step": 70144 + }, + { + "epoch": 2.0550648209184796, + "eval_bleu": 0.35121966282480693, + "eval_cap_loss": 0.9067361354827881, + "eval_con_loss": 1.1478495597839355, + "eval_loss": 3.2024354934692383, + "eval_runtime": 52.2859, + "eval_samples_per_second": 382.512, + "eval_steps_per_second": 0.383, + "step": 70144 + }, + { + "epoch": 2.0552406064601185, + "grad_norm": 20.995820999145508, + "learning_rate": 2.386657711415453e-06, + "loss": 3.0517, + "step": 70150 + }, + { + "epoch": 2.0555335823628504, + "grad_norm": 15.926921844482422, + "learning_rate": 2.385309567359788e-06, + "loss": 3.0375, + "step": 70160 + }, + { + "epoch": 2.055826558265583, + "grad_norm": 19.695161819458008, + "learning_rate": 2.3839616848883862e-06, + "loss": 3.028, + "step": 70170 + }, + { + "epoch": 2.0561195341683147, + "grad_norm": 16.935272216796875, + "learning_rate": 2.3826140641360988e-06, + "loss": 3.0364, + "step": 70180 + }, + { + "epoch": 2.0564125100710466, + "grad_norm": 17.354026794433594, + "learning_rate": 2.3812667052377446e-06, + "loss": 3.0401, + "step": 70190 + }, + { + "epoch": 2.0567054859737786, + "grad_norm": 17.102203369140625, 
+ "learning_rate": 2.3799196083281218e-06, + "loss": 3.009, + "step": 70200 + }, + { + "epoch": 2.0569984618765105, + "grad_norm": 11.708443641662598, + "learning_rate": 2.3785727735419937e-06, + "loss": 3.0604, + "step": 70210 + }, + { + "epoch": 2.057291437779243, + "grad_norm": 18.604337692260742, + "learning_rate": 2.3772262010141115e-06, + "loss": 3.0534, + "step": 70220 + }, + { + "epoch": 2.0575844136819748, + "grad_norm": 17.290813446044922, + "learning_rate": 2.375879890879185e-06, + "loss": 3.0459, + "step": 70230 + }, + { + "epoch": 2.0578773895847067, + "grad_norm": 22.03449821472168, + "learning_rate": 2.3745338432719102e-06, + "loss": 3.0406, + "step": 70240 + }, + { + "epoch": 2.0581703654874386, + "grad_norm": 15.831110000610352, + "learning_rate": 2.373188058326945e-06, + "loss": 3.0484, + "step": 70250 + }, + { + "epoch": 2.0584633413901705, + "grad_norm": 19.055490493774414, + "learning_rate": 2.371842536178932e-06, + "loss": 3.0461, + "step": 70260 + }, + { + "epoch": 2.058756317292903, + "grad_norm": 16.87665367126465, + "learning_rate": 2.3704972769624796e-06, + "loss": 3.0218, + "step": 70270 + }, + { + "epoch": 2.059049293195635, + "grad_norm": 17.99141502380371, + "learning_rate": 2.3691522808121757e-06, + "loss": 3.0691, + "step": 70280 + }, + { + "epoch": 2.0593422690983667, + "grad_norm": 17.138042449951172, + "learning_rate": 2.3678075478625747e-06, + "loss": 3.0419, + "step": 70290 + }, + { + "epoch": 2.0596352450010986, + "grad_norm": 17.971101760864258, + "learning_rate": 2.366463078248214e-06, + "loss": 3.0235, + "step": 70300 + }, + { + "epoch": 2.0599282209038305, + "grad_norm": 18.859615325927734, + "learning_rate": 2.3651188721035996e-06, + "loss": 3.0323, + "step": 70310 + }, + { + "epoch": 2.0602211968065625, + "grad_norm": 19.0852108001709, + "learning_rate": 2.3637749295632084e-06, + "loss": 3.0325, + "step": 70320 + }, + { + "epoch": 2.060514172709295, + "grad_norm": 19.588085174560547, + "learning_rate": 
2.3624312507614967e-06, + "loss": 3.0382, + "step": 70330 + }, + { + "epoch": 2.0608071486120267, + "grad_norm": 17.558834075927734, + "learning_rate": 2.3610878358328885e-06, + "loss": 3.0517, + "step": 70340 + }, + { + "epoch": 2.0611001245147587, + "grad_norm": 18.210065841674805, + "learning_rate": 2.359744684911788e-06, + "loss": 3.0508, + "step": 70350 + }, + { + "epoch": 2.0613931004174906, + "grad_norm": 16.535232543945312, + "learning_rate": 2.3584017981325663e-06, + "loss": 3.034, + "step": 70360 + }, + { + "epoch": 2.0616860763202225, + "grad_norm": 18.662078857421875, + "learning_rate": 2.3570591756295717e-06, + "loss": 3.0314, + "step": 70370 + }, + { + "epoch": 2.061979052222955, + "grad_norm": 17.78031349182129, + "learning_rate": 2.3557168175371274e-06, + "loss": 3.045, + "step": 70380 + }, + { + "epoch": 2.0622720281256868, + "grad_norm": 18.946720123291016, + "learning_rate": 2.3543747239895295e-06, + "loss": 3.0433, + "step": 70390 + }, + { + "epoch": 2.0625650040284187, + "grad_norm": 16.625205993652344, + "learning_rate": 2.353032895121042e-06, + "loss": 3.0318, + "step": 70400 + }, + { + "epoch": 2.0628579799311506, + "grad_norm": 17.900888442993164, + "learning_rate": 2.3516913310659117e-06, + "loss": 3.0596, + "step": 70410 + }, + { + "epoch": 2.0631509558338825, + "grad_norm": 14.190695762634277, + "learning_rate": 2.3503500319583493e-06, + "loss": 3.0389, + "step": 70420 + }, + { + "epoch": 2.0634439317366144, + "grad_norm": 18.03022003173828, + "learning_rate": 2.349008997932549e-06, + "loss": 3.0371, + "step": 70430 + }, + { + "epoch": 2.063736907639347, + "grad_norm": 21.143802642822266, + "learning_rate": 2.347668229122669e-06, + "loss": 3.0337, + "step": 70440 + }, + { + "epoch": 2.0640298835420787, + "grad_norm": 15.145450592041016, + "learning_rate": 2.346327725662846e-06, + "loss": 3.0037, + "step": 70450 + }, + { + "epoch": 2.0643228594448106, + "grad_norm": 22.32362174987793, + "learning_rate": 2.3449874876871907e-06, + "loss": 
3.0164, + "step": 70460 + }, + { + "epoch": 2.0646158353475426, + "grad_norm": 18.17943000793457, + "learning_rate": 2.3436475153297876e-06, + "loss": 3.0451, + "step": 70470 + }, + { + "epoch": 2.0649088112502745, + "grad_norm": 18.441991806030273, + "learning_rate": 2.342307808724689e-06, + "loss": 3.0379, + "step": 70480 + }, + { + "epoch": 2.065201787153007, + "grad_norm": 17.580656051635742, + "learning_rate": 2.3409683680059285e-06, + "loss": 3.0301, + "step": 70490 + }, + { + "epoch": 2.0654947630557388, + "grad_norm": 17.929161071777344, + "learning_rate": 2.339629193307506e-06, + "loss": 3.0192, + "step": 70500 + }, + { + "epoch": 2.0657877389584707, + "grad_norm": 19.914480209350586, + "learning_rate": 2.3382902847634003e-06, + "loss": 3.0408, + "step": 70510 + }, + { + "epoch": 2.0660807148612026, + "grad_norm": 18.50761604309082, + "learning_rate": 2.336951642507559e-06, + "loss": 3.0377, + "step": 70520 + }, + { + "epoch": 2.0663736907639345, + "grad_norm": 21.824596405029297, + "learning_rate": 2.3356132666739064e-06, + "loss": 3.0305, + "step": 70530 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 18.278059005737305, + "learning_rate": 2.3342751573963384e-06, + "loss": 3.0369, + "step": 70540 + }, + { + "epoch": 2.066959642569399, + "grad_norm": 17.52166175842285, + "learning_rate": 2.3329373148087286e-06, + "loss": 3.0249, + "step": 70550 + }, + { + "epoch": 2.0672526184721307, + "grad_norm": 16.435331344604492, + "learning_rate": 2.3315997390449146e-06, + "loss": 3.0267, + "step": 70560 + }, + { + "epoch": 2.0675455943748626, + "grad_norm": 18.663982391357422, + "learning_rate": 2.3302624302387183e-06, + "loss": 3.0554, + "step": 70570 + }, + { + "epoch": 2.0678385702775945, + "grad_norm": 19.037988662719727, + "learning_rate": 2.3289253885239242e-06, + "loss": 3.0458, + "step": 70580 + }, + { + "epoch": 2.0681315461803265, + "grad_norm": 17.957748413085938, + "learning_rate": 2.3275886140343005e-06, + "loss": 3.0421, + "step": 70590 + }, + { 
+ "epoch": 2.068424522083059, + "grad_norm": 17.184415817260742, + "learning_rate": 2.3262521069035794e-06, + "loss": 3.042, + "step": 70600 + }, + { + "epoch": 2.0687174979857907, + "grad_norm": 16.145418167114258, + "learning_rate": 2.3249158672654716e-06, + "loss": 3.0268, + "step": 70610 + }, + { + "epoch": 2.0690104738885227, + "grad_norm": 18.476898193359375, + "learning_rate": 2.323579895253661e-06, + "loss": 3.031, + "step": 70620 + }, + { + "epoch": 2.0693034497912546, + "grad_norm": 20.666343688964844, + "learning_rate": 2.3222441910018045e-06, + "loss": 3.0469, + "step": 70630 + }, + { + "epoch": 2.0695964256939865, + "grad_norm": 19.33371353149414, + "learning_rate": 2.320908754643529e-06, + "loss": 3.0566, + "step": 70640 + }, + { + "epoch": 2.069889401596719, + "grad_norm": 18.51830291748047, + "learning_rate": 2.319573586312439e-06, + "loss": 3.036, + "step": 70650 + }, + { + "epoch": 2.0700651871383577, + "eval_bleu": 0.35118581647300084, + "eval_cap_loss": 0.9064056873321533, + "eval_con_loss": 1.1466312408447266, + "eval_loss": 3.1996684074401855, + "step": 70656 + }, + { + "epoch": 2.0700651871383577, + "eval_bleu": 0.35118581647300084, + "eval_cap_loss": 0.9064056873321533, + "eval_con_loss": 1.1466312408447266, + "eval_loss": 3.1996684074401855, + "eval_runtime": 52.3247, + "eval_samples_per_second": 382.228, + "eval_steps_per_second": 0.382, + "step": 70656 + }, + { + "epoch": 2.0701823774994508, + "grad_norm": 23.09701156616211, + "learning_rate": 2.318238686142108e-06, + "loss": 3.0434, + "step": 70660 + }, + { + "epoch": 2.0704753534021827, + "grad_norm": 21.590343475341797, + "learning_rate": 2.3169040542660864e-06, + "loss": 3.0346, + "step": 70670 + }, + { + "epoch": 2.0707683293049146, + "grad_norm": 19.041475296020508, + "learning_rate": 2.3155696908178974e-06, + "loss": 3.0423, + "step": 70680 + }, + { + "epoch": 2.0710613052076465, + "grad_norm": 18.350196838378906, + "learning_rate": 2.3142355959310326e-06, + "loss": 3.0463, + 
"step": 70690 + }, + { + "epoch": 2.071354281110379, + "grad_norm": 13.936199188232422, + "learning_rate": 2.312901769738963e-06, + "loss": 3.0241, + "step": 70700 + }, + { + "epoch": 2.071647257013111, + "grad_norm": 19.412479400634766, + "learning_rate": 2.3115682123751288e-06, + "loss": 3.037, + "step": 70710 + }, + { + "epoch": 2.0719402329158427, + "grad_norm": 20.25098419189453, + "learning_rate": 2.3102349239729466e-06, + "loss": 3.0564, + "step": 70720 + }, + { + "epoch": 2.0722332088185746, + "grad_norm": 18.23261260986328, + "learning_rate": 2.308901904665801e-06, + "loss": 3.0583, + "step": 70730 + }, + { + "epoch": 2.0725261847213066, + "grad_norm": 18.528444290161133, + "learning_rate": 2.307569154587056e-06, + "loss": 3.0434, + "step": 70740 + }, + { + "epoch": 2.0728191606240385, + "grad_norm": 19.02518081665039, + "learning_rate": 2.3062366738700413e-06, + "loss": 3.0346, + "step": 70750 + }, + { + "epoch": 2.073112136526771, + "grad_norm": 19.589197158813477, + "learning_rate": 2.3049044626480683e-06, + "loss": 3.0361, + "step": 70760 + }, + { + "epoch": 2.0734051124295028, + "grad_norm": 17.553430557250977, + "learning_rate": 2.3035725210544117e-06, + "loss": 3.0474, + "step": 70770 + }, + { + "epoch": 2.0736980883322347, + "grad_norm": 20.19639778137207, + "learning_rate": 2.3022408492223275e-06, + "loss": 3.0642, + "step": 70780 + }, + { + "epoch": 2.0739910642349666, + "grad_norm": 20.051069259643555, + "learning_rate": 2.3009094472850413e-06, + "loss": 3.0525, + "step": 70790 + }, + { + "epoch": 2.0742840401376985, + "grad_norm": 18.04062843322754, + "learning_rate": 2.299578315375753e-06, + "loss": 3.0471, + "step": 70800 + }, + { + "epoch": 2.074577016040431, + "grad_norm": 18.72857666015625, + "learning_rate": 2.298247453627632e-06, + "loss": 3.039, + "step": 70810 + }, + { + "epoch": 2.074869991943163, + "grad_norm": 17.168323516845703, + "learning_rate": 2.296916862173825e-06, + "loss": 3.0502, + "step": 70820 + }, + { + "epoch": 
2.0751629678458947, + "grad_norm": 19.309791564941406, + "learning_rate": 2.295586541147449e-06, + "loss": 3.0224, + "step": 70830 + }, + { + "epoch": 2.0754559437486266, + "grad_norm": 20.233604431152344, + "learning_rate": 2.2942564906815953e-06, + "loss": 3.0439, + "step": 70840 + }, + { + "epoch": 2.0757489196513585, + "grad_norm": 16.298240661621094, + "learning_rate": 2.2929267109093257e-06, + "loss": 3.0329, + "step": 70850 + }, + { + "epoch": 2.0760418955540905, + "grad_norm": 17.930849075317383, + "learning_rate": 2.291597201963678e-06, + "loss": 3.0378, + "step": 70860 + }, + { + "epoch": 2.076334871456823, + "grad_norm": 18.048751831054688, + "learning_rate": 2.2902679639776614e-06, + "loss": 3.0555, + "step": 70870 + }, + { + "epoch": 2.0766278473595547, + "grad_norm": 19.15348243713379, + "learning_rate": 2.2889389970842613e-06, + "loss": 3.0367, + "step": 70880 + }, + { + "epoch": 2.0769208232622867, + "grad_norm": 17.615570068359375, + "learning_rate": 2.2876103014164284e-06, + "loss": 3.0536, + "step": 70890 + }, + { + "epoch": 2.0772137991650186, + "grad_norm": 16.6790714263916, + "learning_rate": 2.2862818771070944e-06, + "loss": 3.0431, + "step": 70900 + }, + { + "epoch": 2.0775067750677505, + "grad_norm": 22.040260314941406, + "learning_rate": 2.284953724289157e-06, + "loss": 3.0508, + "step": 70910 + }, + { + "epoch": 2.077799750970483, + "grad_norm": 20.716726303100586, + "learning_rate": 2.283625843095493e-06, + "loss": 3.0131, + "step": 70920 + }, + { + "epoch": 2.078092726873215, + "grad_norm": 18.908584594726562, + "learning_rate": 2.2822982336589467e-06, + "loss": 3.0451, + "step": 70930 + }, + { + "epoch": 2.0783857027759467, + "grad_norm": 19.051673889160156, + "learning_rate": 2.2809708961123383e-06, + "loss": 3.0339, + "step": 70940 + }, + { + "epoch": 2.0786786786786786, + "grad_norm": 19.402400970458984, + "learning_rate": 2.2796438305884597e-06, + "loss": 3.0434, + "step": 70950 + }, + { + "epoch": 2.0789716545814105, + 
"grad_norm": 15.122520446777344, + "learning_rate": 2.278317037220079e-06, + "loss": 3.0538, + "step": 70960 + }, + { + "epoch": 2.0792646304841425, + "grad_norm": 21.941017150878906, + "learning_rate": 2.2769905161399297e-06, + "loss": 3.0406, + "step": 70970 + }, + { + "epoch": 2.079557606386875, + "grad_norm": 18.110191345214844, + "learning_rate": 2.275664267480726e-06, + "loss": 3.036, + "step": 70980 + }, + { + "epoch": 2.0798505822896067, + "grad_norm": 19.61331558227539, + "learning_rate": 2.274338291375147e-06, + "loss": 3.0529, + "step": 70990 + }, + { + "epoch": 2.0801435581923386, + "grad_norm": 19.81867218017578, + "learning_rate": 2.273012587955854e-06, + "loss": 3.0354, + "step": 71000 + }, + { + "epoch": 2.0804365340950706, + "grad_norm": 19.873924255371094, + "learning_rate": 2.2716871573554698e-06, + "loss": 3.0217, + "step": 71010 + }, + { + "epoch": 2.0807295099978025, + "grad_norm": 17.484956741333008, + "learning_rate": 2.2703619997066e-06, + "loss": 3.0151, + "step": 71020 + }, + { + "epoch": 2.081022485900535, + "grad_norm": 21.19591522216797, + "learning_rate": 2.2690371151418173e-06, + "loss": 3.0547, + "step": 71030 + }, + { + "epoch": 2.0813154618032668, + "grad_norm": 18.821624755859375, + "learning_rate": 2.267712503793669e-06, + "loss": 3.0375, + "step": 71040 + }, + { + "epoch": 2.0816084377059987, + "grad_norm": 20.91379165649414, + "learning_rate": 2.2663881657946768e-06, + "loss": 3.0356, + "step": 71050 + }, + { + "epoch": 2.0819014136087306, + "grad_norm": 20.75700569152832, + "learning_rate": 2.265064101277328e-06, + "loss": 3.0346, + "step": 71060 + }, + { + "epoch": 2.0821943895114625, + "grad_norm": 20.01905059814453, + "learning_rate": 2.263740310374093e-06, + "loss": 3.0471, + "step": 71070 + }, + { + "epoch": 2.082487365414195, + "grad_norm": 21.798274993896484, + "learning_rate": 2.2624167932174037e-06, + "loss": 3.0573, + "step": 71080 + }, + { + "epoch": 2.082780341316927, + "grad_norm": 17.093460083007812, + 
"learning_rate": 2.2610935499396747e-06, + "loss": 3.0512, + "step": 71090 + }, + { + "epoch": 2.0830733172196587, + "grad_norm": 22.084558486938477, + "learning_rate": 2.259770580673285e-06, + "loss": 3.0285, + "step": 71100 + }, + { + "epoch": 2.0833662931223906, + "grad_norm": 16.023401260375977, + "learning_rate": 2.258447885550591e-06, + "loss": 3.0376, + "step": 71110 + }, + { + "epoch": 2.0836592690251226, + "grad_norm": 20.030506134033203, + "learning_rate": 2.2571254647039216e-06, + "loss": 3.0487, + "step": 71120 + }, + { + "epoch": 2.0839522449278545, + "grad_norm": 15.589778900146484, + "learning_rate": 2.2558033182655775e-06, + "loss": 3.0316, + "step": 71130 + }, + { + "epoch": 2.084245220830587, + "grad_norm": 17.013887405395508, + "learning_rate": 2.2544814463678296e-06, + "loss": 3.045, + "step": 71140 + }, + { + "epoch": 2.0845381967333187, + "grad_norm": 18.225473403930664, + "learning_rate": 2.2531598491429256e-06, + "loss": 3.0321, + "step": 71150 + }, + { + "epoch": 2.0848311726360507, + "grad_norm": 21.77761459350586, + "learning_rate": 2.2518385267230815e-06, + "loss": 3.0335, + "step": 71160 + }, + { + "epoch": 2.0850655533582363, + "eval_bleu": 0.3511411485015297, + "eval_cap_loss": 0.9062362313270569, + "eval_con_loss": 1.1459743976593018, + "eval_loss": 3.1981852054595947, + "step": 71168 + }, + { + "epoch": 2.0850655533582363, + "eval_bleu": 0.3511411485015297, + "eval_cap_loss": 0.9062362313270569, + "eval_con_loss": 1.1459743976593018, + "eval_loss": 3.1981852054595947, + "eval_runtime": 53.6031, + "eval_samples_per_second": 373.113, + "eval_steps_per_second": 0.373, + "step": 71168 + }, + { + "epoch": 2.0851241485387826, + "grad_norm": 17.46792984008789, + "learning_rate": 2.2505174792404895e-06, + "loss": 3.0172, + "step": 71170 + }, + { + "epoch": 2.0854171244415145, + "grad_norm": 19.44733238220215, + "learning_rate": 2.2491967068273095e-06, + "loss": 3.0335, + "step": 71180 + }, + { + "epoch": 2.085710100344247, + "grad_norm": 
18.71560287475586, + "learning_rate": 2.2478762096156796e-06, + "loss": 3.0456, + "step": 71190 + }, + { + "epoch": 2.086003076246979, + "grad_norm": 18.538909912109375, + "learning_rate": 2.2465559877377067e-06, + "loss": 3.0592, + "step": 71200 + }, + { + "epoch": 2.0862960521497107, + "grad_norm": 19.599376678466797, + "learning_rate": 2.2452360413254736e-06, + "loss": 3.0437, + "step": 71210 + }, + { + "epoch": 2.0865890280524426, + "grad_norm": 19.71991539001465, + "learning_rate": 2.2439163705110295e-06, + "loss": 3.0518, + "step": 71220 + }, + { + "epoch": 2.0868820039551745, + "grad_norm": 17.40052604675293, + "learning_rate": 2.2425969754264032e-06, + "loss": 3.027, + "step": 71230 + }, + { + "epoch": 2.087174979857907, + "grad_norm": 22.52243995666504, + "learning_rate": 2.241277856203588e-06, + "loss": 3.0367, + "step": 71240 + }, + { + "epoch": 2.087467955760639, + "grad_norm": 14.980255126953125, + "learning_rate": 2.2399590129745585e-06, + "loss": 3.0357, + "step": 71250 + }, + { + "epoch": 2.0877609316633707, + "grad_norm": 19.412067413330078, + "learning_rate": 2.2386404458712535e-06, + "loss": 3.0524, + "step": 71260 + }, + { + "epoch": 2.0880539075661027, + "grad_norm": 20.54974365234375, + "learning_rate": 2.2373221550255886e-06, + "loss": 3.0231, + "step": 71270 + }, + { + "epoch": 2.0883468834688346, + "grad_norm": 17.15669059753418, + "learning_rate": 2.2360041405694523e-06, + "loss": 3.0341, + "step": 71280 + }, + { + "epoch": 2.0886398593715665, + "grad_norm": 19.198747634887695, + "learning_rate": 2.2346864026347046e-06, + "loss": 3.0389, + "step": 71290 + }, + { + "epoch": 2.088932835274299, + "grad_norm": 18.20522689819336, + "learning_rate": 2.233368941353175e-06, + "loss": 3.0451, + "step": 71300 + }, + { + "epoch": 2.0892258111770308, + "grad_norm": 19.839500427246094, + "learning_rate": 2.2320517568566706e-06, + "loss": 3.0225, + "step": 71310 + }, + { + "epoch": 2.0895187870797627, + "grad_norm": 15.822179794311523, + 
"learning_rate": 2.2307348492769647e-06, + "loss": 3.06, + "step": 71320 + }, + { + "epoch": 2.0898117629824946, + "grad_norm": 20.031965255737305, + "learning_rate": 2.229418218745809e-06, + "loss": 3.0674, + "step": 71330 + }, + { + "epoch": 2.0901047388852265, + "grad_norm": 19.904741287231445, + "learning_rate": 2.228101865394922e-06, + "loss": 3.0429, + "step": 71340 + }, + { + "epoch": 2.090397714787959, + "grad_norm": 16.9857120513916, + "learning_rate": 2.226785789355998e-06, + "loss": 3.0607, + "step": 71350 + }, + { + "epoch": 2.090690690690691, + "grad_norm": 18.113370895385742, + "learning_rate": 2.2254699907607024e-06, + "loss": 3.0502, + "step": 71360 + }, + { + "epoch": 2.0909836665934227, + "grad_norm": 19.176158905029297, + "learning_rate": 2.224154469740676e-06, + "loss": 3.0352, + "step": 71370 + }, + { + "epoch": 2.0912766424961546, + "grad_norm": 16.581222534179688, + "learning_rate": 2.2228392264275245e-06, + "loss": 3.0481, + "step": 71380 + }, + { + "epoch": 2.0915696183988866, + "grad_norm": 20.21709632873535, + "learning_rate": 2.2215242609528313e-06, + "loss": 3.043, + "step": 71390 + }, + { + "epoch": 2.0918625943016185, + "grad_norm": 17.09323501586914, + "learning_rate": 2.2202095734481544e-06, + "loss": 3.0223, + "step": 71400 + }, + { + "epoch": 2.092155570204351, + "grad_norm": 19.040082931518555, + "learning_rate": 2.218895164045015e-06, + "loss": 3.0351, + "step": 71410 + }, + { + "epoch": 2.0924485461070828, + "grad_norm": 17.51456642150879, + "learning_rate": 2.2175810328749177e-06, + "loss": 3.0492, + "step": 71420 + }, + { + "epoch": 2.0927415220098147, + "grad_norm": 22.859066009521484, + "learning_rate": 2.216267180069328e-06, + "loss": 3.0342, + "step": 71430 + }, + { + "epoch": 2.0930344979125466, + "grad_norm": 18.324865341186523, + "learning_rate": 2.2149536057596917e-06, + "loss": 3.064, + "step": 71440 + }, + { + "epoch": 2.0933274738152785, + "grad_norm": 20.463214874267578, + "learning_rate": 2.213640310077424e-06, + 
"loss": 3.0407, + "step": 71450 + }, + { + "epoch": 2.093620449718011, + "grad_norm": 16.368511199951172, + "learning_rate": 2.212327293153914e-06, + "loss": 3.0619, + "step": 71460 + }, + { + "epoch": 2.093913425620743, + "grad_norm": 21.58026123046875, + "learning_rate": 2.211014555120518e-06, + "loss": 3.0417, + "step": 71470 + }, + { + "epoch": 2.0942064015234747, + "grad_norm": 17.148008346557617, + "learning_rate": 2.2097020961085708e-06, + "loss": 3.079, + "step": 71480 + }, + { + "epoch": 2.0944993774262066, + "grad_norm": 21.24039077758789, + "learning_rate": 2.2083899162493734e-06, + "loss": 3.0554, + "step": 71490 + }, + { + "epoch": 2.0947923533289385, + "grad_norm": 22.062746047973633, + "learning_rate": 2.2070780156742043e-06, + "loss": 3.0439, + "step": 71500 + }, + { + "epoch": 2.095085329231671, + "grad_norm": 20.434133529663086, + "learning_rate": 2.2057663945143082e-06, + "loss": 3.046, + "step": 71510 + }, + { + "epoch": 2.095378305134403, + "grad_norm": 20.01483917236328, + "learning_rate": 2.204455052900906e-06, + "loss": 3.0412, + "step": 71520 + }, + { + "epoch": 2.0956712810371347, + "grad_norm": 21.558841705322266, + "learning_rate": 2.2031439909651913e-06, + "loss": 3.0587, + "step": 71530 + }, + { + "epoch": 2.0959642569398667, + "grad_norm": 18.90597152709961, + "learning_rate": 2.201833208838329e-06, + "loss": 3.0451, + "step": 71540 + }, + { + "epoch": 2.0962572328425986, + "grad_norm": 20.613662719726562, + "learning_rate": 2.2005227066514504e-06, + "loss": 3.0571, + "step": 71550 + }, + { + "epoch": 2.0965502087453305, + "grad_norm": 18.528121948242188, + "learning_rate": 2.1992124845356687e-06, + "loss": 3.0174, + "step": 71560 + }, + { + "epoch": 2.096843184648063, + "grad_norm": 17.773704528808594, + "learning_rate": 2.1979025426220596e-06, + "loss": 3.0278, + "step": 71570 + }, + { + "epoch": 2.0971361605507948, + "grad_norm": 21.007339477539062, + "learning_rate": 2.196592881041679e-06, + "loss": 3.0546, + "step": 71580 + }, + 
{ + "epoch": 2.0974291364535267, + "grad_norm": 21.81416130065918, + "learning_rate": 2.195283499925546e-06, + "loss": 3.0452, + "step": 71590 + }, + { + "epoch": 2.0977221123562586, + "grad_norm": 17.19646644592285, + "learning_rate": 2.19397439940466e-06, + "loss": 3.034, + "step": 71600 + }, + { + "epoch": 2.0980150882589905, + "grad_norm": 18.07573890686035, + "learning_rate": 2.1926655796099873e-06, + "loss": 3.0448, + "step": 71610 + }, + { + "epoch": 2.098308064161723, + "grad_norm": 23.923606872558594, + "learning_rate": 2.1913570406724695e-06, + "loss": 3.0383, + "step": 71620 + }, + { + "epoch": 2.098601040064455, + "grad_norm": 22.257246017456055, + "learning_rate": 2.190048782723015e-06, + "loss": 3.0389, + "step": 71630 + }, + { + "epoch": 2.0988940159671867, + "grad_norm": 15.952543258666992, + "learning_rate": 2.188740805892511e-06, + "loss": 3.0513, + "step": 71640 + }, + { + "epoch": 2.0991869918699186, + "grad_norm": 20.883255004882812, + "learning_rate": 2.1874331103118086e-06, + "loss": 3.0324, + "step": 71650 + }, + { + "epoch": 2.0994799677726506, + "grad_norm": 23.477012634277344, + "learning_rate": 2.186125696111739e-06, + "loss": 3.0611, + "step": 71660 + }, + { + "epoch": 2.099772943675383, + "grad_norm": 16.099937438964844, + "learning_rate": 2.184818563423098e-06, + "loss": 3.0455, + "step": 71670 + }, + { + "epoch": 2.100065919578115, + "grad_norm": 18.552703857421875, + "learning_rate": 2.183511712376657e-06, + "loss": 3.0502, + "step": 71680 + }, + { + "epoch": 2.100065919578115, + "eval_bleu": 0.35103668241499797, + "eval_cap_loss": 0.906186044216156, + "eval_con_loss": 1.1466370820999146, + "eval_loss": 3.199460029602051, + "step": 71680 + }, + { + "epoch": 2.100065919578115, + "eval_bleu": 0.35103668241499797, + "eval_cap_loss": 0.906186044216156, + "eval_con_loss": 1.1466370820999146, + "eval_loss": 3.199460029602051, + "eval_runtime": 56.1216, + "eval_samples_per_second": 356.369, + "eval_steps_per_second": 0.356, + "step": 71680 
+ }, + { + "epoch": 2.1003588954808468, + "grad_norm": 18.459728240966797, + "learning_rate": 2.18220514310316e-06, + "loss": 3.0643, + "step": 71690 + }, + { + "epoch": 2.1006518713835787, + "grad_norm": 21.639970779418945, + "learning_rate": 2.1808988557333225e-06, + "loss": 3.0305, + "step": 71700 + }, + { + "epoch": 2.1009448472863106, + "grad_norm": 20.860864639282227, + "learning_rate": 2.179592850397828e-06, + "loss": 3.0655, + "step": 71710 + }, + { + "epoch": 2.1012378231890425, + "grad_norm": 21.0828914642334, + "learning_rate": 2.1782871272273377e-06, + "loss": 3.0358, + "step": 71720 + }, + { + "epoch": 2.101530799091775, + "grad_norm": 15.550856590270996, + "learning_rate": 2.176981686352478e-06, + "loss": 3.0403, + "step": 71730 + }, + { + "epoch": 2.101823774994507, + "grad_norm": 14.10881519317627, + "learning_rate": 2.1756765279038518e-06, + "loss": 3.0488, + "step": 71740 + }, + { + "epoch": 2.1021167508972387, + "grad_norm": 17.487319946289062, + "learning_rate": 2.1743716520120346e-06, + "loss": 3.0405, + "step": 71750 + }, + { + "epoch": 2.1024097267999706, + "grad_norm": 18.94965934753418, + "learning_rate": 2.173067058807568e-06, + "loss": 3.0375, + "step": 71760 + }, + { + "epoch": 2.1027027027027025, + "grad_norm": 19.278650283813477, + "learning_rate": 2.171762748420971e-06, + "loss": 3.0255, + "step": 71770 + }, + { + "epoch": 2.102995678605435, + "grad_norm": 18.378498077392578, + "learning_rate": 2.170458720982731e-06, + "loss": 3.0351, + "step": 71780 + }, + { + "epoch": 2.103288654508167, + "grad_norm": 18.776304244995117, + "learning_rate": 2.1691549766233113e-06, + "loss": 3.0635, + "step": 71790 + }, + { + "epoch": 2.1035816304108987, + "grad_norm": 18.390193939208984, + "learning_rate": 2.167851515473139e-06, + "loss": 3.0549, + "step": 71800 + }, + { + "epoch": 2.1038746063136307, + "grad_norm": 18.137319564819336, + "learning_rate": 2.166548337662622e-06, + "loss": 3.053, + "step": 71810 + }, + { + "epoch": 2.1041675822163626, + 
"grad_norm": 17.829511642456055, + "learning_rate": 2.1652454433221315e-06, + "loss": 3.0363, + "step": 71820 + }, + { + "epoch": 2.1044605581190945, + "grad_norm": 20.765697479248047, + "learning_rate": 2.163942832582019e-06, + "loss": 3.0458, + "step": 71830 + }, + { + "epoch": 2.104753534021827, + "grad_norm": 17.133033752441406, + "learning_rate": 2.1626405055725974e-06, + "loss": 3.0435, + "step": 71840 + }, + { + "epoch": 2.1050465099245588, + "grad_norm": 15.211833000183105, + "learning_rate": 2.1613384624241606e-06, + "loss": 3.0418, + "step": 71850 + }, + { + "epoch": 2.1053394858272907, + "grad_norm": 17.457408905029297, + "learning_rate": 2.160036703266969e-06, + "loss": 3.0486, + "step": 71860 + }, + { + "epoch": 2.1056324617300226, + "grad_norm": 17.866607666015625, + "learning_rate": 2.1587352282312583e-06, + "loss": 3.034, + "step": 71870 + }, + { + "epoch": 2.1059254376327545, + "grad_norm": 18.364849090576172, + "learning_rate": 2.1574340374472296e-06, + "loss": 3.0198, + "step": 71880 + }, + { + "epoch": 2.106218413535487, + "grad_norm": 16.819347381591797, + "learning_rate": 2.156133131045063e-06, + "loss": 3.0387, + "step": 71890 + }, + { + "epoch": 2.106511389438219, + "grad_norm": 18.47951889038086, + "learning_rate": 2.1548325091549026e-06, + "loss": 3.0347, + "step": 71900 + }, + { + "epoch": 2.1068043653409507, + "grad_norm": 16.488149642944336, + "learning_rate": 2.1535321719068713e-06, + "loss": 3.0359, + "step": 71910 + }, + { + "epoch": 2.1070973412436826, + "grad_norm": 22.85731315612793, + "learning_rate": 2.1522321194310577e-06, + "loss": 3.037, + "step": 71920 + }, + { + "epoch": 2.1073903171464146, + "grad_norm": 15.057500839233398, + "learning_rate": 2.150932351857525e-06, + "loss": 3.047, + "step": 71930 + }, + { + "epoch": 2.107683293049147, + "grad_norm": 17.504262924194336, + "learning_rate": 2.149632869316307e-06, + "loss": 3.0489, + "step": 71940 + }, + { + "epoch": 2.107976268951879, + "grad_norm": 19.200790405273438, + 
"learning_rate": 2.148333671937412e-06, + "loss": 3.0516, + "step": 71950 + }, + { + "epoch": 2.1082692448546108, + "grad_norm": 21.483121871948242, + "learning_rate": 2.1470347598508128e-06, + "loss": 3.0525, + "step": 71960 + }, + { + "epoch": 2.1085622207573427, + "grad_norm": 20.835763931274414, + "learning_rate": 2.1457361331864618e-06, + "loss": 3.031, + "step": 71970 + }, + { + "epoch": 2.1088551966600746, + "grad_norm": 18.470266342163086, + "learning_rate": 2.1444377920742747e-06, + "loss": 3.0327, + "step": 71980 + }, + { + "epoch": 2.1091481725628065, + "grad_norm": 18.218544006347656, + "learning_rate": 2.1431397366441474e-06, + "loss": 3.0335, + "step": 71990 + }, + { + "epoch": 2.109441148465539, + "grad_norm": 17.21381950378418, + "learning_rate": 2.141841967025938e-06, + "loss": 3.0396, + "step": 72000 + }, + { + "epoch": 2.109734124368271, + "grad_norm": 18.5155086517334, + "learning_rate": 2.140544483349484e-06, + "loss": 3.0311, + "step": 72010 + }, + { + "epoch": 2.1100271002710027, + "grad_norm": 19.18182373046875, + "learning_rate": 2.139247285744589e-06, + "loss": 3.0521, + "step": 72020 + }, + { + "epoch": 2.1103200761737346, + "grad_norm": 18.368188858032227, + "learning_rate": 2.1379503743410323e-06, + "loss": 3.0495, + "step": 72030 + }, + { + "epoch": 2.1106130520764665, + "grad_norm": 18.926023483276367, + "learning_rate": 2.136653749268559e-06, + "loss": 3.0574, + "step": 72040 + }, + { + "epoch": 2.110906027979199, + "grad_norm": 21.672285079956055, + "learning_rate": 2.135357410656893e-06, + "loss": 3.0359, + "step": 72050 + }, + { + "epoch": 2.111199003881931, + "grad_norm": 19.050064086914062, + "learning_rate": 2.13406135863572e-06, + "loss": 3.0313, + "step": 72060 + }, + { + "epoch": 2.1114919797846627, + "grad_norm": 19.57837677001953, + "learning_rate": 2.1327655933347074e-06, + "loss": 3.0417, + "step": 72070 + }, + { + "epoch": 2.1117849556873947, + "grad_norm": 21.55324935913086, + "learning_rate": 2.1314701148834842e-06, + 
"loss": 3.0433, + "step": 72080 + }, + { + "epoch": 2.1120779315901266, + "grad_norm": 18.427602767944336, + "learning_rate": 2.130174923411658e-06, + "loss": 3.0413, + "step": 72090 + }, + { + "epoch": 2.112370907492859, + "grad_norm": 18.874258041381836, + "learning_rate": 2.1288800190488045e-06, + "loss": 3.041, + "step": 72100 + }, + { + "epoch": 2.112663883395591, + "grad_norm": 19.39142608642578, + "learning_rate": 2.1275854019244717e-06, + "loss": 3.0533, + "step": 72110 + }, + { + "epoch": 2.1129568592983228, + "grad_norm": 18.40638542175293, + "learning_rate": 2.1262910721681793e-06, + "loss": 3.0664, + "step": 72120 + }, + { + "epoch": 2.1132498352010547, + "grad_norm": 18.247047424316406, + "learning_rate": 2.124997029909414e-06, + "loss": 3.0488, + "step": 72130 + }, + { + "epoch": 2.1135428111037866, + "grad_norm": 19.980667114257812, + "learning_rate": 2.1237032752776415e-06, + "loss": 3.0708, + "step": 72140 + }, + { + "epoch": 2.1138357870065185, + "grad_norm": 18.19919204711914, + "learning_rate": 2.1224098084022896e-06, + "loss": 3.0263, + "step": 72150 + }, + { + "epoch": 2.114128762909251, + "grad_norm": 17.631696701049805, + "learning_rate": 2.121116629412766e-06, + "loss": 3.0361, + "step": 72160 + }, + { + "epoch": 2.114421738811983, + "grad_norm": 19.952991485595703, + "learning_rate": 2.1198237384384424e-06, + "loss": 3.0433, + "step": 72170 + }, + { + "epoch": 2.1147147147147147, + "grad_norm": 16.186418533325195, + "learning_rate": 2.1185311356086657e-06, + "loss": 3.0392, + "step": 72180 + }, + { + "epoch": 2.1150076906174466, + "grad_norm": 18.478538513183594, + "learning_rate": 2.1172388210527544e-06, + "loss": 3.0291, + "step": 72190 + }, + { + "epoch": 2.115066285797993, + "eval_bleu": 0.35115044250358535, + "eval_cap_loss": 0.9062657356262207, + "eval_con_loss": 1.1441712379455566, + "eval_loss": 3.194607973098755, + "step": 72192 + }, + { + "epoch": 2.115066285797993, + "eval_bleu": 0.35115044250358535, + "eval_cap_loss": 
0.9062657356262207, + "eval_con_loss": 1.1441712379455566, + "eval_loss": 3.194607973098755, + "eval_runtime": 52.2156, + "eval_samples_per_second": 383.028, + "eval_steps_per_second": 0.383, + "step": 72192 + }, + { + "epoch": 2.1153006665201786, + "grad_norm": 18.405317306518555, + "learning_rate": 2.115946794899997e-06, + "loss": 3.0278, + "step": 72200 + }, + { + "epoch": 2.115593642422911, + "grad_norm": 16.819852828979492, + "learning_rate": 2.1146550572796515e-06, + "loss": 3.0594, + "step": 72210 + }, + { + "epoch": 2.115886618325643, + "grad_norm": 21.64528465270996, + "learning_rate": 2.1133636083209506e-06, + "loss": 3.033, + "step": 72220 + }, + { + "epoch": 2.1161795942283748, + "grad_norm": 19.025327682495117, + "learning_rate": 2.1120724481530937e-06, + "loss": 3.0466, + "step": 72230 + }, + { + "epoch": 2.1164725701311067, + "grad_norm": 16.82233238220215, + "learning_rate": 2.1107815769052557e-06, + "loss": 3.0166, + "step": 72240 + }, + { + "epoch": 2.1167655460338386, + "grad_norm": 19.921419143676758, + "learning_rate": 2.1094909947065784e-06, + "loss": 3.0459, + "step": 72250 + }, + { + "epoch": 2.1170585219365705, + "grad_norm": 18.748693466186523, + "learning_rate": 2.1082007016861783e-06, + "loss": 3.0412, + "step": 72260 + }, + { + "epoch": 2.117351497839303, + "grad_norm": 18.0955867767334, + "learning_rate": 2.106910697973142e-06, + "loss": 3.0445, + "step": 72270 + }, + { + "epoch": 2.117644473742035, + "grad_norm": 14.096735000610352, + "learning_rate": 2.1056209836965273e-06, + "loss": 3.0203, + "step": 72280 + }, + { + "epoch": 2.1179374496447667, + "grad_norm": 18.189815521240234, + "learning_rate": 2.10433155898536e-06, + "loss": 3.0516, + "step": 72290 + }, + { + "epoch": 2.1182304255474986, + "grad_norm": 16.9205322265625, + "learning_rate": 2.1030424239686426e-06, + "loss": 3.0348, + "step": 72300 + }, + { + "epoch": 2.1185234014502305, + "grad_norm": 18.571565628051758, + "learning_rate": 2.1017535787753417e-06, + "loss": 
3.0377, + "step": 72310 + }, + { + "epoch": 2.118816377352963, + "grad_norm": 17.416698455810547, + "learning_rate": 2.1004650235344027e-06, + "loss": 3.0612, + "step": 72320 + }, + { + "epoch": 2.119109353255695, + "grad_norm": 18.689884185791016, + "learning_rate": 2.0991767583747336e-06, + "loss": 3.0502, + "step": 72330 + }, + { + "epoch": 2.1194023291584267, + "grad_norm": 17.777997970581055, + "learning_rate": 2.0978887834252198e-06, + "loss": 3.0217, + "step": 72340 + }, + { + "epoch": 2.1196953050611587, + "grad_norm": 21.69280433654785, + "learning_rate": 2.0966010988147156e-06, + "loss": 3.0285, + "step": 72350 + }, + { + "epoch": 2.1199882809638906, + "grad_norm": 19.808019638061523, + "learning_rate": 2.095313704672048e-06, + "loss": 3.0406, + "step": 72360 + }, + { + "epoch": 2.1202812568666225, + "grad_norm": 22.34588623046875, + "learning_rate": 2.0940266011260097e-06, + "loss": 3.0401, + "step": 72370 + }, + { + "epoch": 2.120574232769355, + "grad_norm": 20.50459861755371, + "learning_rate": 2.092739788305371e-06, + "loss": 3.0639, + "step": 72380 + }, + { + "epoch": 2.120867208672087, + "grad_norm": 16.333696365356445, + "learning_rate": 2.0914532663388664e-06, + "loss": 3.0348, + "step": 72390 + }, + { + "epoch": 2.1211601845748187, + "grad_norm": 19.630935668945312, + "learning_rate": 2.0901670353552082e-06, + "loss": 3.0471, + "step": 72400 + }, + { + "epoch": 2.1214531604775506, + "grad_norm": 19.705486297607422, + "learning_rate": 2.0888810954830736e-06, + "loss": 3.0569, + "step": 72410 + }, + { + "epoch": 2.1217461363802825, + "grad_norm": 19.31632423400879, + "learning_rate": 2.087595446851114e-06, + "loss": 3.0533, + "step": 72420 + }, + { + "epoch": 2.122039112283015, + "grad_norm": 19.338817596435547, + "learning_rate": 2.0863100895879513e-06, + "loss": 3.0308, + "step": 72430 + }, + { + "epoch": 2.122332088185747, + "grad_norm": 17.209190368652344, + "learning_rate": 2.08502502382218e-06, + "loss": 3.0615, + "step": 72440 + }, + { + 
"epoch": 2.1226250640884787, + "grad_norm": 18.614660263061523, + "learning_rate": 2.083740249682359e-06, + "loss": 3.0439, + "step": 72450 + }, + { + "epoch": 2.1229180399912106, + "grad_norm": 20.03639030456543, + "learning_rate": 2.082455767297024e-06, + "loss": 3.0428, + "step": 72460 + }, + { + "epoch": 2.1232110158939426, + "grad_norm": 22.693016052246094, + "learning_rate": 2.0811715767946824e-06, + "loss": 3.0234, + "step": 72470 + }, + { + "epoch": 2.123503991796675, + "grad_norm": 16.922225952148438, + "learning_rate": 2.079887678303806e-06, + "loss": 3.0471, + "step": 72480 + }, + { + "epoch": 2.123796967699407, + "grad_norm": 17.027084350585938, + "learning_rate": 2.0786040719528442e-06, + "loss": 3.0256, + "step": 72490 + }, + { + "epoch": 2.1240899436021388, + "grad_norm": 18.937162399291992, + "learning_rate": 2.0773207578702122e-06, + "loss": 3.04, + "step": 72500 + }, + { + "epoch": 2.1243829195048707, + "grad_norm": 17.234264373779297, + "learning_rate": 2.076037736184298e-06, + "loss": 3.0247, + "step": 72510 + }, + { + "epoch": 2.1246758954076026, + "grad_norm": 19.201679229736328, + "learning_rate": 2.074755007023461e-06, + "loss": 3.0297, + "step": 72520 + }, + { + "epoch": 2.1249688713103345, + "grad_norm": 18.313995361328125, + "learning_rate": 2.073472570516033e-06, + "loss": 3.021, + "step": 72530 + }, + { + "epoch": 2.125261847213067, + "grad_norm": 19.28325080871582, + "learning_rate": 2.0721904267903097e-06, + "loss": 3.0417, + "step": 72540 + }, + { + "epoch": 2.125554823115799, + "grad_norm": 17.50749969482422, + "learning_rate": 2.0709085759745654e-06, + "loss": 3.0343, + "step": 72550 + }, + { + "epoch": 2.1258477990185307, + "grad_norm": 18.8962459564209, + "learning_rate": 2.06962701819704e-06, + "loss": 3.0528, + "step": 72560 + }, + { + "epoch": 2.1261407749212626, + "grad_norm": 19.229476928710938, + "learning_rate": 2.0683457535859465e-06, + "loss": 3.0213, + "step": 72570 + }, + { + "epoch": 2.1264337508239946, + "grad_norm": 
19.232088088989258, + "learning_rate": 2.067064782269466e-06, + "loss": 3.041, + "step": 72580 + }, + { + "epoch": 2.126726726726727, + "grad_norm": 18.596994400024414, + "learning_rate": 2.0657841043757537e-06, + "loss": 3.0228, + "step": 72590 + }, + { + "epoch": 2.127019702629459, + "grad_norm": 18.490169525146484, + "learning_rate": 2.0645037200329333e-06, + "loss": 3.0458, + "step": 72600 + }, + { + "epoch": 2.1273126785321907, + "grad_norm": 20.135982513427734, + "learning_rate": 2.0632236293691023e-06, + "loss": 3.024, + "step": 72610 + }, + { + "epoch": 2.1276056544349227, + "grad_norm": 22.49176597595215, + "learning_rate": 2.061943832512322e-06, + "loss": 3.0404, + "step": 72620 + }, + { + "epoch": 2.1278986303376546, + "grad_norm": 20.494596481323242, + "learning_rate": 2.060664329590632e-06, + "loss": 3.0494, + "step": 72630 + }, + { + "epoch": 2.128191606240387, + "grad_norm": 18.976945877075195, + "learning_rate": 2.0593851207320355e-06, + "loss": 3.049, + "step": 72640 + }, + { + "epoch": 2.128484582143119, + "grad_norm": 15.921378135681152, + "learning_rate": 2.0581062060645135e-06, + "loss": 3.0571, + "step": 72650 + }, + { + "epoch": 2.128777558045851, + "grad_norm": 20.89639663696289, + "learning_rate": 2.0568275857160096e-06, + "loss": 3.0367, + "step": 72660 + }, + { + "epoch": 2.1290705339485827, + "grad_norm": 17.285594940185547, + "learning_rate": 2.0555492598144456e-06, + "loss": 3.0376, + "step": 72670 + }, + { + "epoch": 2.1293635098513146, + "grad_norm": 23.735065460205078, + "learning_rate": 2.0542712284877086e-06, + "loss": 3.0329, + "step": 72680 + }, + { + "epoch": 2.1296564857540465, + "grad_norm": 16.826732635498047, + "learning_rate": 2.0529934918636605e-06, + "loss": 3.0532, + "step": 72690 + }, + { + "epoch": 2.129949461656779, + "grad_norm": 15.89677906036377, + "learning_rate": 2.051716050070128e-06, + "loss": 3.0294, + "step": 72700 + }, + { + "epoch": 2.1300666520178715, + "eval_bleu": 0.35140664416677087, + "eval_cap_loss": 
0.9060213565826416, + "eval_con_loss": 1.143951654434204, + "eval_loss": 3.19392466545105, + "step": 72704 + }, + { + "epoch": 2.1300666520178715, + "eval_bleu": 0.35140664416677087, + "eval_cap_loss": 0.9060213565826416, + "eval_con_loss": 1.143951654434204, + "eval_loss": 3.19392466545105, + "eval_runtime": 53.3549, + "eval_samples_per_second": 374.848, + "eval_steps_per_second": 0.375, + "step": 72704 + }, + { + "epoch": 2.130242437559511, + "grad_norm": 17.541120529174805, + "learning_rate": 2.050438903234915e-06, + "loss": 3.0621, + "step": 72710 + }, + { + "epoch": 2.1305354134622427, + "grad_norm": 17.335308074951172, + "learning_rate": 2.0491620514857885e-06, + "loss": 3.0311, + "step": 72720 + }, + { + "epoch": 2.1308283893649747, + "grad_norm": 17.935890197753906, + "learning_rate": 2.047885494950494e-06, + "loss": 3.0465, + "step": 72730 + }, + { + "epoch": 2.1311213652677066, + "grad_norm": 21.98090171813965, + "learning_rate": 2.0466092337567396e-06, + "loss": 3.0303, + "step": 72740 + }, + { + "epoch": 2.131414341170439, + "grad_norm": 17.462078094482422, + "learning_rate": 2.04533326803221e-06, + "loss": 3.0363, + "step": 72750 + }, + { + "epoch": 2.131707317073171, + "grad_norm": 17.196340560913086, + "learning_rate": 2.044057597904557e-06, + "loss": 3.0218, + "step": 72760 + }, + { + "epoch": 2.1320002929759028, + "grad_norm": 20.670106887817383, + "learning_rate": 2.0427822235014055e-06, + "loss": 3.0363, + "step": 72770 + }, + { + "epoch": 2.1322932688786347, + "grad_norm": 19.10226058959961, + "learning_rate": 2.041507144950347e-06, + "loss": 3.027, + "step": 72780 + }, + { + "epoch": 2.1325862447813666, + "grad_norm": 15.26544189453125, + "learning_rate": 2.0402323623789484e-06, + "loss": 3.0307, + "step": 72790 + }, + { + "epoch": 2.1328792206840985, + "grad_norm": 16.73879623413086, + "learning_rate": 2.0389578759147397e-06, + "loss": 3.0302, + "step": 72800 + }, + { + "epoch": 2.133172196586831, + "grad_norm": 20.782032012939453, + 
"learning_rate": 2.0376836856852306e-06, + "loss": 3.0358, + "step": 72810 + }, + { + "epoch": 2.133465172489563, + "grad_norm": 19.086210250854492, + "learning_rate": 2.036409791817892e-06, + "loss": 3.0182, + "step": 72820 + }, + { + "epoch": 2.1337581483922947, + "grad_norm": 21.206710815429688, + "learning_rate": 2.0351361944401717e-06, + "loss": 3.0306, + "step": 72830 + }, + { + "epoch": 2.1340511242950266, + "grad_norm": 14.376442909240723, + "learning_rate": 2.033990210404157e-06, + "loss": 3.0297, + "step": 72840 + }, + { + "epoch": 2.1343441001977586, + "grad_norm": 16.774028778076172, + "learning_rate": 2.032717176707717e-06, + "loss": 3.0206, + "step": 72850 + }, + { + "epoch": 2.134637076100491, + "grad_norm": 16.3071231842041, + "learning_rate": 2.031444439870319e-06, + "loss": 3.0321, + "step": 72860 + }, + { + "epoch": 2.134930052003223, + "grad_norm": 21.627525329589844, + "learning_rate": 2.0301720000192926e-06, + "loss": 3.049, + "step": 72870 + }, + { + "epoch": 2.1352230279059548, + "grad_norm": 19.305688858032227, + "learning_rate": 2.02889985728194e-06, + "loss": 3.0236, + "step": 72880 + }, + { + "epoch": 2.1355160038086867, + "grad_norm": 17.551179885864258, + "learning_rate": 2.027628011785528e-06, + "loss": 3.0447, + "step": 72890 + }, + { + "epoch": 2.1358089797114186, + "grad_norm": 18.277219772338867, + "learning_rate": 2.0263564636573013e-06, + "loss": 3.0347, + "step": 72900 + }, + { + "epoch": 2.1361019556141505, + "grad_norm": 21.580028533935547, + "learning_rate": 2.0250852130244656e-06, + "loss": 3.0453, + "step": 72910 + }, + { + "epoch": 2.136394931516883, + "grad_norm": 19.027563095092773, + "learning_rate": 2.0238142600142073e-06, + "loss": 3.0287, + "step": 72920 + }, + { + "epoch": 2.136687907419615, + "grad_norm": 15.883684158325195, + "learning_rate": 2.0225436047536733e-06, + "loss": 3.0152, + "step": 72930 + }, + { + "epoch": 2.1369808833223467, + "grad_norm": 21.054845809936523, + "learning_rate": 
2.021273247369987e-06, + "loss": 3.0383, + "step": 72940 + }, + { + "epoch": 2.1372738592250786, + "grad_norm": 18.615530014038086, + "learning_rate": 2.02000318799024e-06, + "loss": 3.0337, + "step": 72950 + }, + { + "epoch": 2.1375668351278105, + "grad_norm": 18.48637580871582, + "learning_rate": 2.0187334267414958e-06, + "loss": 3.0523, + "step": 72960 + }, + { + "epoch": 2.137859811030543, + "grad_norm": 18.868209838867188, + "learning_rate": 2.017463963750784e-06, + "loss": 3.0223, + "step": 72970 + }, + { + "epoch": 2.138152786933275, + "grad_norm": 20.5822696685791, + "learning_rate": 2.016194799145109e-06, + "loss": 3.0286, + "step": 72980 + }, + { + "epoch": 2.1384457628360067, + "grad_norm": 17.65095329284668, + "learning_rate": 2.014925933051441e-06, + "loss": 3.0437, + "step": 72990 + }, + { + "epoch": 2.1387387387387387, + "grad_norm": 18.161802291870117, + "learning_rate": 2.0136573655967247e-06, + "loss": 3.0362, + "step": 73000 + }, + { + "epoch": 2.1390317146414706, + "grad_norm": 18.24907684326172, + "learning_rate": 2.0123890969078696e-06, + "loss": 3.0307, + "step": 73010 + }, + { + "epoch": 2.139324690544203, + "grad_norm": 17.07781410217285, + "learning_rate": 2.0111211271117607e-06, + "loss": 3.0309, + "step": 73020 + }, + { + "epoch": 2.139617666446935, + "grad_norm": 19.606969833374023, + "learning_rate": 2.0098534563352507e-06, + "loss": 3.038, + "step": 73030 + }, + { + "epoch": 2.1399106423496668, + "grad_norm": 21.17438507080078, + "learning_rate": 2.0085860847051647e-06, + "loss": 3.0289, + "step": 73040 + }, + { + "epoch": 2.1402036182523987, + "grad_norm": 22.42432975769043, + "learning_rate": 2.0073190123482912e-06, + "loss": 3.0539, + "step": 73050 + }, + { + "epoch": 2.1404965941551306, + "grad_norm": 20.64442253112793, + "learning_rate": 2.006052239391397e-06, + "loss": 3.0514, + "step": 73060 + }, + { + "epoch": 2.140789570057863, + "grad_norm": 22.559890747070312, + "learning_rate": 2.0047857659612123e-06, + "loss": 3.034, + 
"step": 73070 + }, + { + "epoch": 2.141082545960595, + "grad_norm": 16.878631591796875, + "learning_rate": 2.0035195921844434e-06, + "loss": 3.0425, + "step": 73080 + }, + { + "epoch": 2.141375521863327, + "grad_norm": 18.098108291625977, + "learning_rate": 2.0022537181877604e-06, + "loss": 3.0441, + "step": 73090 + }, + { + "epoch": 2.1416684977660587, + "grad_norm": 18.57897186279297, + "learning_rate": 2.000988144097808e-06, + "loss": 3.0348, + "step": 73100 + }, + { + "epoch": 2.1419614736687906, + "grad_norm": 17.27414894104004, + "learning_rate": 1.9997228700411987e-06, + "loss": 3.0173, + "step": 73110 + }, + { + "epoch": 2.1422544495715226, + "grad_norm": 17.846515655517578, + "learning_rate": 1.9984578961445184e-06, + "loss": 3.0217, + "step": 73120 + }, + { + "epoch": 2.142547425474255, + "grad_norm": 19.190650939941406, + "learning_rate": 1.997193222534316e-06, + "loss": 3.0177, + "step": 73130 + }, + { + "epoch": 2.142840401376987, + "grad_norm": 16.979650497436523, + "learning_rate": 1.9959288493371163e-06, + "loss": 3.0308, + "step": 73140 + }, + { + "epoch": 2.1431333772797188, + "grad_norm": 16.995574951171875, + "learning_rate": 1.994664776679415e-06, + "loss": 3.0383, + "step": 73150 + }, + { + "epoch": 2.1434263531824507, + "grad_norm": 19.663867950439453, + "learning_rate": 1.993401004687671e-06, + "loss": 3.0328, + "step": 73160 + }, + { + "epoch": 2.1437193290851826, + "grad_norm": 16.54515838623047, + "learning_rate": 1.9921375334883205e-06, + "loss": 3.0473, + "step": 73170 + }, + { + "epoch": 2.144012304987915, + "grad_norm": 20.964826583862305, + "learning_rate": 1.9908743632077633e-06, + "loss": 3.0361, + "step": 73180 + }, + { + "epoch": 2.144305280890647, + "grad_norm": 16.16773223876953, + "learning_rate": 1.989611493972373e-06, + "loss": 3.0464, + "step": 73190 + }, + { + "epoch": 2.144598256793379, + "grad_norm": 17.36885643005371, + "learning_rate": 1.9883489259084924e-06, + "loss": 3.0164, + "step": 73200 + }, + { + "epoch": 
2.1448912326961107, + "grad_norm": 16.46112060546875, + "learning_rate": 1.987086659142437e-06, + "loss": 3.0199, + "step": 73210 + }, + { + "epoch": 2.14506701823775, + "eval_bleu": 0.35175796429341427, + "eval_cap_loss": 0.9056313037872314, + "eval_con_loss": 1.14286208152771, + "eval_loss": 3.1913554668426514, + "step": 73216 + }, + { + "epoch": 2.14506701823775, + "eval_bleu": 0.35175796429341427, + "eval_cap_loss": 0.9056313037872314, + "eval_con_loss": 1.14286208152771, + "eval_loss": 3.1913554668426514, + "eval_runtime": 56.1796, + "eval_samples_per_second": 356.001, + "eval_steps_per_second": 0.356, + "step": 73216 + }, + { + "epoch": 2.1451842085988426, + "grad_norm": 18.795228958129883, + "learning_rate": 1.9858246938004845e-06, + "loss": 3.0443, + "step": 73220 + }, + { + "epoch": 2.1454771845015745, + "grad_norm": 19.214292526245117, + "learning_rate": 1.9845630300088896e-06, + "loss": 3.0378, + "step": 73230 + }, + { + "epoch": 2.145770160404307, + "grad_norm": 17.98617935180664, + "learning_rate": 1.9833016678938725e-06, + "loss": 3.0157, + "step": 73240 + }, + { + "epoch": 2.146063136307039, + "grad_norm": 15.915963172912598, + "learning_rate": 1.9820406075816286e-06, + "loss": 3.0274, + "step": 73250 + }, + { + "epoch": 2.1463561122097707, + "grad_norm": 19.316749572753906, + "learning_rate": 1.9807798491983144e-06, + "loss": 3.0349, + "step": 73260 + }, + { + "epoch": 2.1466490881125027, + "grad_norm": 18.545089721679688, + "learning_rate": 1.9795193928700632e-06, + "loss": 3.0496, + "step": 73270 + }, + { + "epoch": 2.1469420640152346, + "grad_norm": 17.757522583007812, + "learning_rate": 1.978259238722978e-06, + "loss": 3.0313, + "step": 73280 + }, + { + "epoch": 2.147235039917967, + "grad_norm": 17.71137809753418, + "learning_rate": 1.97699938688313e-06, + "loss": 3.0356, + "step": 73290 + }, + { + "epoch": 2.147528015820699, + "grad_norm": 17.901260375976562, + "learning_rate": 1.9757398374765564e-06, + "loss": 3.0523, + "step": 73300 + }, + { 
+ "epoch": 2.1478209917234308, + "grad_norm": 17.770021438598633, + "learning_rate": 1.974480590629272e-06, + "loss": 3.0325, + "step": 73310 + }, + { + "epoch": 2.1481139676261627, + "grad_norm": 17.01178741455078, + "learning_rate": 1.9732216464672526e-06, + "loss": 3.0378, + "step": 73320 + }, + { + "epoch": 2.1484069435288946, + "grad_norm": 19.390296936035156, + "learning_rate": 1.9719630051164517e-06, + "loss": 3.0411, + "step": 73330 + }, + { + "epoch": 2.1486999194316265, + "grad_norm": 21.609098434448242, + "learning_rate": 1.9707046667027864e-06, + "loss": 3.0293, + "step": 73340 + }, + { + "epoch": 2.148992895334359, + "grad_norm": 19.664997100830078, + "learning_rate": 1.9694466313521463e-06, + "loss": 3.0344, + "step": 73350 + }, + { + "epoch": 2.149285871237091, + "grad_norm": 17.850923538208008, + "learning_rate": 1.968188899190392e-06, + "loss": 3.0263, + "step": 73360 + }, + { + "epoch": 2.1495788471398227, + "grad_norm": 19.020954132080078, + "learning_rate": 1.9669314703433518e-06, + "loss": 3.0208, + "step": 73370 + }, + { + "epoch": 2.1498718230425546, + "grad_norm": 15.053718566894531, + "learning_rate": 1.9656743449368224e-06, + "loss": 3.0444, + "step": 73380 + }, + { + "epoch": 2.1501647989452866, + "grad_norm": 14.043724060058594, + "learning_rate": 1.9644175230965744e-06, + "loss": 3.0218, + "step": 73390 + }, + { + "epoch": 2.150457774848019, + "grad_norm": 18.38821792602539, + "learning_rate": 1.963161004948342e-06, + "loss": 3.0299, + "step": 73400 + }, + { + "epoch": 2.150750750750751, + "grad_norm": 15.196855545043945, + "learning_rate": 1.9619047906178356e-06, + "loss": 3.0455, + "step": 73410 + }, + { + "epoch": 2.1510437266534828, + "grad_norm": 17.208974838256836, + "learning_rate": 1.960648880230729e-06, + "loss": 3.0324, + "step": 73420 + }, + { + "epoch": 2.1513367025562147, + "grad_norm": 15.237541198730469, + "learning_rate": 1.95939327391267e-06, + "loss": 3.0498, + "step": 73430 + }, + { + "epoch": 2.1516296784589466, + 
"grad_norm": 14.305912971496582, + "learning_rate": 1.9581379717892748e-06, + "loss": 3.0374, + "step": 73440 + }, + { + "epoch": 2.151922654361679, + "grad_norm": 22.913408279418945, + "learning_rate": 1.95688297398613e-06, + "loss": 3.045, + "step": 73450 + }, + { + "epoch": 2.152215630264411, + "grad_norm": 18.27918815612793, + "learning_rate": 1.9556282806287878e-06, + "loss": 3.0505, + "step": 73460 + }, + { + "epoch": 2.152508606167143, + "grad_norm": 16.891420364379883, + "learning_rate": 1.9543738918427767e-06, + "loss": 3.0402, + "step": 73470 + }, + { + "epoch": 2.1528015820698747, + "grad_norm": 22.238977432250977, + "learning_rate": 1.953119807753587e-06, + "loss": 3.0448, + "step": 73480 + }, + { + "epoch": 2.1530945579726066, + "grad_norm": 19.6129093170166, + "learning_rate": 1.951866028486685e-06, + "loss": 3.0524, + "step": 73490 + }, + { + "epoch": 2.153387533875339, + "grad_norm": 17.245540618896484, + "learning_rate": 1.950612554167502e-06, + "loss": 3.005, + "step": 73500 + }, + { + "epoch": 2.153680509778071, + "grad_norm": 19.044063568115234, + "learning_rate": 1.949359384921441e-06, + "loss": 3.055, + "step": 73510 + }, + { + "epoch": 2.153973485680803, + "grad_norm": 15.259649276733398, + "learning_rate": 1.9481065208738745e-06, + "loss": 3.0463, + "step": 73520 + }, + { + "epoch": 2.1542664615835347, + "grad_norm": 16.336179733276367, + "learning_rate": 1.9468539621501447e-06, + "loss": 3.0153, + "step": 73530 + }, + { + "epoch": 2.1545594374862667, + "grad_norm": 13.19817066192627, + "learning_rate": 1.9456017088755643e-06, + "loss": 3.006, + "step": 73540 + }, + { + "epoch": 2.1548524133889986, + "grad_norm": 18.218259811401367, + "learning_rate": 1.9443497611754097e-06, + "loss": 3.0419, + "step": 73550 + }, + { + "epoch": 2.155145389291731, + "grad_norm": 16.96271324157715, + "learning_rate": 1.943098119174935e-06, + "loss": 3.0342, + "step": 73560 + }, + { + "epoch": 2.155438365194463, + "grad_norm": 19.75491714477539, + 
"learning_rate": 1.941846782999355e-06, + "loss": 3.0271, + "step": 73570 + }, + { + "epoch": 2.1557313410971948, + "grad_norm": 19.547822952270508, + "learning_rate": 1.9405957527738632e-06, + "loss": 3.0414, + "step": 73580 + }, + { + "epoch": 2.1560243169999267, + "grad_norm": 16.205448150634766, + "learning_rate": 1.9393450286236137e-06, + "loss": 3.0457, + "step": 73590 + }, + { + "epoch": 2.1563172929026586, + "grad_norm": 18.859359741210938, + "learning_rate": 1.9380946106737358e-06, + "loss": 3.032, + "step": 73600 + }, + { + "epoch": 2.156610268805391, + "grad_norm": 21.970449447631836, + "learning_rate": 1.936844499049327e-06, + "loss": 3.0338, + "step": 73610 + }, + { + "epoch": 2.156903244708123, + "grad_norm": 17.329429626464844, + "learning_rate": 1.9355946938754537e-06, + "loss": 3.0383, + "step": 73620 + }, + { + "epoch": 2.157196220610855, + "grad_norm": 20.167436599731445, + "learning_rate": 1.9343451952771504e-06, + "loss": 3.0254, + "step": 73630 + }, + { + "epoch": 2.1574891965135867, + "grad_norm": 17.559350967407227, + "learning_rate": 1.9330960033794227e-06, + "loss": 3.0359, + "step": 73640 + }, + { + "epoch": 2.1577821724163186, + "grad_norm": 17.997451782226562, + "learning_rate": 1.9318471183072433e-06, + "loss": 3.0297, + "step": 73650 + }, + { + "epoch": 2.1580751483190506, + "grad_norm": 17.963455200195312, + "learning_rate": 1.930598540185559e-06, + "loss": 3.0123, + "step": 73660 + }, + { + "epoch": 2.158368124221783, + "grad_norm": 21.52311897277832, + "learning_rate": 1.929350269139279e-06, + "loss": 3.0433, + "step": 73670 + }, + { + "epoch": 2.158661100124515, + "grad_norm": 17.084444046020508, + "learning_rate": 1.928102305293287e-06, + "loss": 3.0328, + "step": 73680 + }, + { + "epoch": 2.1589540760272468, + "grad_norm": 18.12763786315918, + "learning_rate": 1.9268546487724343e-06, + "loss": 3.0385, + "step": 73690 + }, + { + "epoch": 2.1592470519299787, + "grad_norm": 16.335159301757812, + "learning_rate": 
1.925607299701543e-06, + "loss": 3.033, + "step": 73700 + }, + { + "epoch": 2.1595400278327106, + "grad_norm": 15.43361759185791, + "learning_rate": 1.9243602582054006e-06, + "loss": 3.0588, + "step": 73710 + }, + { + "epoch": 2.159833003735443, + "grad_norm": 18.674449920654297, + "learning_rate": 1.9231135244087686e-06, + "loss": 3.039, + "step": 73720 + }, + { + "epoch": 2.160067384457628, + "eval_bleu": 0.3515872564871613, + "eval_cap_loss": 0.9054363965988159, + "eval_con_loss": 1.1427479982376099, + "eval_loss": 3.190932273864746, + "step": 73728 + }, + { + "epoch": 2.160067384457628, + "eval_bleu": 0.3515872564871613, + "eval_cap_loss": 0.9054363965988159, + "eval_con_loss": 1.1427479982376099, + "eval_loss": 3.190932273864746, + "eval_runtime": 53.6347, + "eval_samples_per_second": 372.893, + "eval_steps_per_second": 0.373, + "step": 73728 + }, + { + "epoch": 2.160125979638175, + "grad_norm": 18.8185977935791, + "learning_rate": 1.921867098436372e-06, + "loss": 3.0364, + "step": 73730 + }, + { + "epoch": 2.160418955540907, + "grad_norm": 21.87688446044922, + "learning_rate": 1.920620980412912e-06, + "loss": 3.039, + "step": 73740 + }, + { + "epoch": 2.1607119314436387, + "grad_norm": 17.65203857421875, + "learning_rate": 1.919375170463052e-06, + "loss": 3.0261, + "step": 73750 + }, + { + "epoch": 2.1610049073463706, + "grad_norm": 20.38711166381836, + "learning_rate": 1.9181296687114293e-06, + "loss": 3.0449, + "step": 73760 + }, + { + "epoch": 2.1612978832491025, + "grad_norm": 15.740436553955078, + "learning_rate": 1.9168844752826493e-06, + "loss": 3.0404, + "step": 73770 + }, + { + "epoch": 2.161590859151835, + "grad_norm": 19.960113525390625, + "learning_rate": 1.9156395903012877e-06, + "loss": 3.0146, + "step": 73780 + }, + { + "epoch": 2.161883835054567, + "grad_norm": 19.071178436279297, + "learning_rate": 1.9143950138918845e-06, + "loss": 3.0373, + "step": 73790 + }, + { + "epoch": 2.1621768109572987, + "grad_norm": 20.97027587890625, + 
"learning_rate": 1.9131507461789556e-06, + "loss": 3.0232, + "step": 73800 + }, + { + "epoch": 2.1624697868600307, + "grad_norm": 19.57915687561035, + "learning_rate": 1.911906787286979e-06, + "loss": 3.0201, + "step": 73810 + }, + { + "epoch": 2.1627627627627626, + "grad_norm": 21.05506134033203, + "learning_rate": 1.9106631373404096e-06, + "loss": 3.038, + "step": 73820 + }, + { + "epoch": 2.163055738665495, + "grad_norm": 22.28468894958496, + "learning_rate": 1.909419796463663e-06, + "loss": 3.0364, + "step": 73830 + }, + { + "epoch": 2.163348714568227, + "grad_norm": 19.49498748779297, + "learning_rate": 1.9081767647811293e-06, + "loss": 3.0079, + "step": 73840 + }, + { + "epoch": 2.163641690470959, + "grad_norm": 20.062902450561523, + "learning_rate": 1.906934042417168e-06, + "loss": 3.0437, + "step": 73850 + }, + { + "epoch": 2.1639346663736907, + "grad_norm": 17.257598876953125, + "learning_rate": 1.9056916294961047e-06, + "loss": 3.0248, + "step": 73860 + }, + { + "epoch": 2.1642276422764226, + "grad_norm": 18.56773567199707, + "learning_rate": 1.904449526142238e-06, + "loss": 3.0493, + "step": 73870 + }, + { + "epoch": 2.1645206181791545, + "grad_norm": 18.629995346069336, + "learning_rate": 1.9032077324798286e-06, + "loss": 3.0292, + "step": 73880 + }, + { + "epoch": 2.164813594081887, + "grad_norm": 20.558774948120117, + "learning_rate": 1.9019662486331153e-06, + "loss": 3.0316, + "step": 73890 + }, + { + "epoch": 2.165106569984619, + "grad_norm": 18.30131721496582, + "learning_rate": 1.9007250747262968e-06, + "loss": 3.0319, + "step": 73900 + }, + { + "epoch": 2.1653995458873507, + "grad_norm": 20.238649368286133, + "learning_rate": 1.8994842108835493e-06, + "loss": 3.0333, + "step": 73910 + }, + { + "epoch": 2.1656925217900826, + "grad_norm": 17.592008590698242, + "learning_rate": 1.8982436572290097e-06, + "loss": 3.0437, + "step": 73920 + }, + { + "epoch": 2.1659854976928146, + "grad_norm": 20.01427459716797, + "learning_rate": 1.89700341388679e-06, + 
"loss": 3.0411, + "step": 73930 + }, + { + "epoch": 2.166278473595547, + "grad_norm": 21.796871185302734, + "learning_rate": 1.8957634809809694e-06, + "loss": 3.0234, + "step": 73940 + }, + { + "epoch": 2.166571449498279, + "grad_norm": 15.771742820739746, + "learning_rate": 1.894523858635598e-06, + "loss": 3.0253, + "step": 73950 + }, + { + "epoch": 2.1668644254010108, + "grad_norm": 17.33553695678711, + "learning_rate": 1.893284546974688e-06, + "loss": 3.0287, + "step": 73960 + }, + { + "epoch": 2.1671574013037427, + "grad_norm": 21.912260055541992, + "learning_rate": 1.8920455461222293e-06, + "loss": 3.0599, + "step": 73970 + }, + { + "epoch": 2.1674503772064746, + "grad_norm": 17.509180068969727, + "learning_rate": 1.890806856202173e-06, + "loss": 3.018, + "step": 73980 + }, + { + "epoch": 2.167743353109207, + "grad_norm": 20.016490936279297, + "learning_rate": 1.889568477338447e-06, + "loss": 3.0313, + "step": 73990 + }, + { + "epoch": 2.168036329011939, + "grad_norm": 17.27622413635254, + "learning_rate": 1.8883304096549387e-06, + "loss": 3.0261, + "step": 74000 + }, + { + "epoch": 2.168329304914671, + "grad_norm": 18.687114715576172, + "learning_rate": 1.8870926532755123e-06, + "loss": 3.0338, + "step": 74010 + }, + { + "epoch": 2.1686222808174027, + "grad_norm": 19.44053840637207, + "learning_rate": 1.885978938801366e-06, + "loss": 3.0382, + "step": 74020 + }, + { + "epoch": 2.1689152567201346, + "grad_norm": 15.843368530273438, + "learning_rate": 1.884741774240823e-06, + "loss": 3.0299, + "step": 74030 + }, + { + "epoch": 2.169208232622867, + "grad_norm": 17.841779708862305, + "learning_rate": 1.8835049213433814e-06, + "loss": 3.0273, + "step": 74040 + }, + { + "epoch": 2.169501208525599, + "grad_norm": 17.611061096191406, + "learning_rate": 1.882268380232783e-06, + "loss": 3.0157, + "step": 74050 + }, + { + "epoch": 2.169794184428331, + "grad_norm": 19.795249938964844, + "learning_rate": 1.8810321510327363e-06, + "loss": 3.0319, + "step": 74060 + }, + { + 
"epoch": 2.1700871603310627, + "grad_norm": 13.69568157196045, + "learning_rate": 1.8797962338669202e-06, + "loss": 3.0292, + "step": 74070 + }, + { + "epoch": 2.1703801362337947, + "grad_norm": 15.147345542907715, + "learning_rate": 1.8785606288589774e-06, + "loss": 3.0391, + "step": 74080 + }, + { + "epoch": 2.1706731121365266, + "grad_norm": 24.325393676757812, + "learning_rate": 1.8773253361325272e-06, + "loss": 3.0272, + "step": 74090 + }, + { + "epoch": 2.170966088039259, + "grad_norm": 20.453603744506836, + "learning_rate": 1.8760903558111488e-06, + "loss": 3.042, + "step": 74100 + }, + { + "epoch": 2.171259063941991, + "grad_norm": 13.097229957580566, + "learning_rate": 1.8748556880183988e-06, + "loss": 3.0299, + "step": 74110 + }, + { + "epoch": 2.171552039844723, + "grad_norm": 19.694721221923828, + "learning_rate": 1.8736213328777947e-06, + "loss": 3.0294, + "step": 74120 + }, + { + "epoch": 2.1718450157474547, + "grad_norm": 16.26905632019043, + "learning_rate": 1.8723872905128275e-06, + "loss": 3.0346, + "step": 74130 + }, + { + "epoch": 2.1721379916501866, + "grad_norm": 19.066871643066406, + "learning_rate": 1.8711535610469567e-06, + "loss": 3.0359, + "step": 74140 + }, + { + "epoch": 2.172430967552919, + "grad_norm": 20.31555938720703, + "learning_rate": 1.8699201446036115e-06, + "loss": 3.0332, + "step": 74150 + }, + { + "epoch": 2.172723943455651, + "grad_norm": 19.33944320678711, + "learning_rate": 1.868687041306183e-06, + "loss": 3.0395, + "step": 74160 + }, + { + "epoch": 2.173016919358383, + "grad_norm": 19.464740753173828, + "learning_rate": 1.8674542512780408e-06, + "loss": 3.0136, + "step": 74170 + }, + { + "epoch": 2.1733098952611147, + "grad_norm": 15.857271194458008, + "learning_rate": 1.8662217746425143e-06, + "loss": 3.0219, + "step": 74180 + }, + { + "epoch": 2.1736028711638467, + "grad_norm": 20.36992835998535, + "learning_rate": 1.8649896115229072e-06, + "loss": 3.0137, + "step": 74190 + }, + { + "epoch": 2.1738958470665786, + 
"grad_norm": 18.59386444091797, + "learning_rate": 1.8637577620424918e-06, + "loss": 3.0257, + "step": 74200 + }, + { + "epoch": 2.174188822969311, + "grad_norm": 20.9173641204834, + "learning_rate": 1.8625262263245026e-06, + "loss": 3.0383, + "step": 74210 + }, + { + "epoch": 2.174481798872043, + "grad_norm": 20.97679328918457, + "learning_rate": 1.8612950044921534e-06, + "loss": 3.0264, + "step": 74220 + }, + { + "epoch": 2.1747747747747748, + "grad_norm": 18.91117286682129, + "learning_rate": 1.8600640966686156e-06, + "loss": 3.0204, + "step": 74230 + }, + { + "epoch": 2.1750677506775067, + "grad_norm": 16.016469955444336, + "learning_rate": 1.8588335029770388e-06, + "loss": 3.0137, + "step": 74240 + }, + { + "epoch": 2.1750677506775067, + "eval_bleu": 0.3515348097079284, + "eval_cap_loss": 0.9052395820617676, + "eval_con_loss": 1.142512321472168, + "eval_loss": 3.1902642250061035, + "step": 74240 + }, + { + "epoch": 2.1750677506775067, + "eval_bleu": 0.3515348097079284, + "eval_cap_loss": 0.9052395820617676, + "eval_con_loss": 1.142512321472168, + "eval_loss": 3.1902642250061035, + "eval_runtime": 52.514, + "eval_samples_per_second": 380.851, + "eval_steps_per_second": 0.381, + "step": 74240 + }, + { + "epoch": 2.1753607265802386, + "grad_norm": 18.81361961364746, + "learning_rate": 1.8576032235405323e-06, + "loss": 3.0386, + "step": 74250 + }, + { + "epoch": 2.175653702482971, + "grad_norm": 19.73063850402832, + "learning_rate": 1.8563732584821815e-06, + "loss": 3.0219, + "step": 74260 + }, + { + "epoch": 2.175946678385703, + "grad_norm": 17.181306838989258, + "learning_rate": 1.855143607925034e-06, + "loss": 3.0315, + "step": 74270 + }, + { + "epoch": 2.176239654288435, + "grad_norm": 19.489564895629883, + "learning_rate": 1.8539142719921127e-06, + "loss": 3.0362, + "step": 74280 + }, + { + "epoch": 2.1765326301911667, + "grad_norm": 16.18267250061035, + "learning_rate": 1.8526852508063996e-06, + "loss": 3.009, + "step": 74290 + }, + { + "epoch": 
2.1768256060938986, + "grad_norm": 15.161147117614746, + "learning_rate": 1.8514565444908582e-06, + "loss": 3.0416, + "step": 74300 + }, + { + "epoch": 2.1771185819966306, + "grad_norm": 18.642807006835938, + "learning_rate": 1.8502281531684086e-06, + "loss": 3.0065, + "step": 74310 + }, + { + "epoch": 2.177411557899363, + "grad_norm": 19.09383773803711, + "learning_rate": 1.8490000769619466e-06, + "loss": 3.0226, + "step": 74320 + }, + { + "epoch": 2.177704533802095, + "grad_norm": 19.755849838256836, + "learning_rate": 1.847772315994331e-06, + "loss": 3.0193, + "step": 74330 + }, + { + "epoch": 2.1779975097048268, + "grad_norm": 18.95479393005371, + "learning_rate": 1.8465448703883959e-06, + "loss": 3.0259, + "step": 74340 + }, + { + "epoch": 2.1782904856075587, + "grad_norm": 19.942747116088867, + "learning_rate": 1.8453177402669354e-06, + "loss": 3.026, + "step": 74350 + }, + { + "epoch": 2.1785834615102906, + "grad_norm": 16.385515213012695, + "learning_rate": 1.844090925752719e-06, + "loss": 3.0503, + "step": 74360 + }, + { + "epoch": 2.178876437413023, + "grad_norm": 17.951704025268555, + "learning_rate": 1.8428644269684825e-06, + "loss": 3.0363, + "step": 74370 + }, + { + "epoch": 2.179169413315755, + "grad_norm": 13.80080509185791, + "learning_rate": 1.8416382440369307e-06, + "loss": 3.0288, + "step": 74380 + }, + { + "epoch": 2.179462389218487, + "grad_norm": 21.853158950805664, + "learning_rate": 1.8404123770807331e-06, + "loss": 3.0331, + "step": 74390 + }, + { + "epoch": 2.1797553651212187, + "grad_norm": 17.970975875854492, + "learning_rate": 1.8391868262225337e-06, + "loss": 3.0416, + "step": 74400 + }, + { + "epoch": 2.1800483410239506, + "grad_norm": 20.501480102539062, + "learning_rate": 1.837961591584938e-06, + "loss": 3.0373, + "step": 74410 + }, + { + "epoch": 2.180341316926683, + "grad_norm": 17.547260284423828, + "learning_rate": 1.8367366732905273e-06, + "loss": 3.0364, + "step": 74420 + }, + { + "epoch": 2.180634292829415, + "grad_norm": 
14.293238639831543, + "learning_rate": 1.8355120714618435e-06, + "loss": 3.0388, + "step": 74430 + }, + { + "epoch": 2.180927268732147, + "grad_norm": 16.6423282623291, + "learning_rate": 1.8342877862214027e-06, + "loss": 3.0225, + "step": 74440 + }, + { + "epoch": 2.1812202446348787, + "grad_norm": 17.82136344909668, + "learning_rate": 1.8330638176916875e-06, + "loss": 3.0159, + "step": 74450 + }, + { + "epoch": 2.1815132205376107, + "grad_norm": 16.878313064575195, + "learning_rate": 1.83184016599515e-06, + "loss": 3.0555, + "step": 74460 + }, + { + "epoch": 2.181806196440343, + "grad_norm": 16.870168685913086, + "learning_rate": 1.8306168312542067e-06, + "loss": 3.0367, + "step": 74470 + }, + { + "epoch": 2.182099172343075, + "grad_norm": 18.611055374145508, + "learning_rate": 1.8293938135912475e-06, + "loss": 3.0433, + "step": 74480 + }, + { + "epoch": 2.182392148245807, + "grad_norm": 20.257720947265625, + "learning_rate": 1.828171113128625e-06, + "loss": 3.0436, + "step": 74490 + }, + { + "epoch": 2.1826851241485388, + "grad_norm": 21.539627075195312, + "learning_rate": 1.8269487299886663e-06, + "loss": 3.057, + "step": 74500 + }, + { + "epoch": 2.1829781000512707, + "grad_norm": 20.476505279541016, + "learning_rate": 1.8257266642936605e-06, + "loss": 3.0345, + "step": 74510 + }, + { + "epoch": 2.1832710759540026, + "grad_norm": 16.042980194091797, + "learning_rate": 1.8245049161658701e-06, + "loss": 3.036, + "step": 74520 + }, + { + "epoch": 2.183564051856735, + "grad_norm": 19.264307022094727, + "learning_rate": 1.8232834857275228e-06, + "loss": 3.0518, + "step": 74530 + }, + { + "epoch": 2.183857027759467, + "grad_norm": 17.20970916748047, + "learning_rate": 1.8220623731008175e-06, + "loss": 3.0525, + "step": 74540 + }, + { + "epoch": 2.184150003662199, + "grad_norm": 15.830159187316895, + "learning_rate": 1.8208415784079165e-06, + "loss": 3.0379, + "step": 74550 + }, + { + "epoch": 2.1844429795649307, + "grad_norm": 17.162677764892578, + "learning_rate": 
1.8196211017709536e-06, + "loss": 3.0368, + "step": 74560 + }, + { + "epoch": 2.1847359554676626, + "grad_norm": 16.768644332885742, + "learning_rate": 1.8184009433120331e-06, + "loss": 3.0327, + "step": 74570 + }, + { + "epoch": 2.185028931370395, + "grad_norm": 17.520305633544922, + "learning_rate": 1.8171811031532205e-06, + "loss": 3.0343, + "step": 74580 + }, + { + "epoch": 2.185321907273127, + "grad_norm": 18.782867431640625, + "learning_rate": 1.8159615814165576e-06, + "loss": 3.037, + "step": 74590 + }, + { + "epoch": 2.185614883175859, + "grad_norm": 15.131007194519043, + "learning_rate": 1.8147423782240465e-06, + "loss": 3.0156, + "step": 74600 + }, + { + "epoch": 2.1859078590785908, + "grad_norm": 20.645036697387695, + "learning_rate": 1.8135234936976626e-06, + "loss": 3.0144, + "step": 74610 + }, + { + "epoch": 2.1862008349813227, + "grad_norm": 18.192167282104492, + "learning_rate": 1.8123049279593484e-06, + "loss": 3.0306, + "step": 74620 + }, + { + "epoch": 2.1864938108840546, + "grad_norm": 19.609527587890625, + "learning_rate": 1.8110866811310163e-06, + "loss": 3.0147, + "step": 74630 + }, + { + "epoch": 2.186786786786787, + "grad_norm": 16.95496940612793, + "learning_rate": 1.8098687533345404e-06, + "loss": 3.04, + "step": 74640 + }, + { + "epoch": 2.187079762689519, + "grad_norm": 19.748218536376953, + "learning_rate": 1.8086511446917715e-06, + "loss": 3.0279, + "step": 74650 + }, + { + "epoch": 2.187372738592251, + "grad_norm": 18.072206497192383, + "learning_rate": 1.8074338553245203e-06, + "loss": 3.0365, + "step": 74660 + }, + { + "epoch": 2.1876657144949827, + "grad_norm": 20.72575569152832, + "learning_rate": 1.8062168853545726e-06, + "loss": 3.033, + "step": 74670 + }, + { + "epoch": 2.1879586903977146, + "grad_norm": 19.425493240356445, + "learning_rate": 1.8050002349036754e-06, + "loss": 3.0174, + "step": 74680 + }, + { + "epoch": 2.188251666300447, + "grad_norm": 17.258012771606445, + "learning_rate": 1.8037839040935495e-06, + "loss": 
3.0573, + "step": 74690 + }, + { + "epoch": 2.188544642203179, + "grad_norm": 18.09671401977539, + "learning_rate": 1.8025678930458817e-06, + "loss": 3.0232, + "step": 74700 + }, + { + "epoch": 2.188837618105911, + "grad_norm": 19.815099716186523, + "learning_rate": 1.8013522018823281e-06, + "loss": 3.0248, + "step": 74710 + }, + { + "epoch": 2.1891305940086427, + "grad_norm": 17.401151657104492, + "learning_rate": 1.8001368307245083e-06, + "loss": 3.0588, + "step": 74720 + }, + { + "epoch": 2.1894235699113747, + "grad_norm": 18.453184127807617, + "learning_rate": 1.7989217796940156e-06, + "loss": 3.0271, + "step": 74730 + }, + { + "epoch": 2.1897165458141066, + "grad_norm": 19.51264762878418, + "learning_rate": 1.7977070489124066e-06, + "loss": 3.0379, + "step": 74740 + }, + { + "epoch": 2.190009521716839, + "grad_norm": 19.907365798950195, + "learning_rate": 1.79649263850121e-06, + "loss": 3.0372, + "step": 74750 + }, + { + "epoch": 2.1900681168973852, + "eval_bleu": 0.3516086952891044, + "eval_cap_loss": 0.9048551321029663, + "eval_con_loss": 1.1415146589279175, + "eval_loss": 3.187884569168091, + "step": 74752 + }, + { + "epoch": 2.1900681168973852, + "eval_bleu": 0.3516086952891044, + "eval_cap_loss": 0.9048551321029663, + "eval_con_loss": 1.1415146589279175, + "eval_loss": 3.187884569168091, + "eval_runtime": 53.1788, + "eval_samples_per_second": 376.089, + "eval_steps_per_second": 0.376, + "step": 74752 + }, + { + "epoch": 2.190302497619571, + "grad_norm": 19.797950744628906, + "learning_rate": 1.7952785485819175e-06, + "loss": 3.0438, + "step": 74760 + }, + { + "epoch": 2.1905954735223028, + "grad_norm": 21.730083465576172, + "learning_rate": 1.794064779275993e-06, + "loss": 3.0035, + "step": 74770 + }, + { + "epoch": 2.1908884494250347, + "grad_norm": 17.406675338745117, + "learning_rate": 1.7928513307048679e-06, + "loss": 3.0324, + "step": 74780 + }, + { + "epoch": 2.1911814253277666, + "grad_norm": 20.454423904418945, + "learning_rate": 
1.791638202989941e-06, + "loss": 3.0214, + "step": 74790 + }, + { + "epoch": 2.191474401230499, + "grad_norm": 16.642173767089844, + "learning_rate": 1.7904253962525753e-06, + "loss": 3.0424, + "step": 74800 + }, + { + "epoch": 2.191767377133231, + "grad_norm": 17.986543655395508, + "learning_rate": 1.7892129106141092e-06, + "loss": 3.0477, + "step": 74810 + }, + { + "epoch": 2.192060353035963, + "grad_norm": 16.94471549987793, + "learning_rate": 1.7880007461958404e-06, + "loss": 3.0362, + "step": 74820 + }, + { + "epoch": 2.1923533289386947, + "grad_norm": 19.49074363708496, + "learning_rate": 1.7867889031190427e-06, + "loss": 3.0357, + "step": 74830 + }, + { + "epoch": 2.1926463048414266, + "grad_norm": 13.221144676208496, + "learning_rate": 1.7855773815049503e-06, + "loss": 3.0491, + "step": 74840 + }, + { + "epoch": 2.1929392807441586, + "grad_norm": 18.431013107299805, + "learning_rate": 1.7843661814747703e-06, + "loss": 3.0448, + "step": 74850 + }, + { + "epoch": 2.193232256646891, + "grad_norm": 16.213090896606445, + "learning_rate": 1.7831553031496757e-06, + "loss": 3.0316, + "step": 74860 + }, + { + "epoch": 2.193525232549623, + "grad_norm": 19.561452865600586, + "learning_rate": 1.7819447466508105e-06, + "loss": 3.0388, + "step": 74870 + }, + { + "epoch": 2.1938182084523548, + "grad_norm": 20.383026123046875, + "learning_rate": 1.7807345120992792e-06, + "loss": 3.0376, + "step": 74880 + }, + { + "epoch": 2.1941111843550867, + "grad_norm": 18.00314712524414, + "learning_rate": 1.7795245996161625e-06, + "loss": 3.0388, + "step": 74890 + }, + { + "epoch": 2.1944041602578186, + "grad_norm": 16.008630752563477, + "learning_rate": 1.7783150093225016e-06, + "loss": 3.0388, + "step": 74900 + }, + { + "epoch": 2.194697136160551, + "grad_norm": 18.85450553894043, + "learning_rate": 1.7771057413393107e-06, + "loss": 3.0398, + "step": 74910 + }, + { + "epoch": 2.194990112063283, + "grad_norm": 17.297616958618164, + "learning_rate": 1.7758967957875705e-06, + "loss": 
3.0335, + "step": 74920 + }, + { + "epoch": 2.195283087966015, + "grad_norm": 17.932308197021484, + "learning_rate": 1.774688172788227e-06, + "loss": 3.0254, + "step": 74930 + }, + { + "epoch": 2.1955760638687467, + "grad_norm": 16.48090171813965, + "learning_rate": 1.773479872462196e-06, + "loss": 3.0344, + "step": 74940 + }, + { + "epoch": 2.1958690397714786, + "grad_norm": 18.79505157470703, + "learning_rate": 1.7722718949303618e-06, + "loss": 3.0426, + "step": 74950 + }, + { + "epoch": 2.196162015674211, + "grad_norm": 18.9886474609375, + "learning_rate": 1.7710642403135768e-06, + "loss": 3.0243, + "step": 74960 + }, + { + "epoch": 2.196454991576943, + "grad_norm": 20.255687713623047, + "learning_rate": 1.7698569087326568e-06, + "loss": 3.0508, + "step": 74970 + }, + { + "epoch": 2.196747967479675, + "grad_norm": 17.97933006286621, + "learning_rate": 1.7686499003083901e-06, + "loss": 3.0192, + "step": 74980 + }, + { + "epoch": 2.1970409433824067, + "grad_norm": 22.184799194335938, + "learning_rate": 1.7674432151615283e-06, + "loss": 3.0443, + "step": 74990 + }, + { + "epoch": 2.1973339192851387, + "grad_norm": 18.063676834106445, + "learning_rate": 1.7662368534127972e-06, + "loss": 3.0121, + "step": 75000 + }, + { + "epoch": 2.197626895187871, + "grad_norm": 24.733121871948242, + "learning_rate": 1.765030815182881e-06, + "loss": 3.055, + "step": 75010 + }, + { + "epoch": 2.197919871090603, + "grad_norm": 20.513322830200195, + "learning_rate": 1.76382510059244e-06, + "loss": 3.0133, + "step": 75020 + }, + { + "epoch": 2.198212846993335, + "grad_norm": 16.684051513671875, + "learning_rate": 1.7626197097620979e-06, + "loss": 3.0134, + "step": 75030 + }, + { + "epoch": 2.1985058228960668, + "grad_norm": 17.211627960205078, + "learning_rate": 1.761414642812449e-06, + "loss": 3.0181, + "step": 75040 + }, + { + "epoch": 2.1987987987987987, + "grad_norm": 20.550046920776367, + "learning_rate": 1.7602098998640493e-06, + "loss": 3.0317, + "step": 75050 + }, + { + 
"epoch": 2.1990917747015306, + "grad_norm": 18.288623809814453, + "learning_rate": 1.7590054810374296e-06, + "loss": 3.0399, + "step": 75060 + }, + { + "epoch": 2.199384750604263, + "grad_norm": 18.233675003051758, + "learning_rate": 1.7578013864530818e-06, + "loss": 3.0391, + "step": 75070 + }, + { + "epoch": 2.199677726506995, + "grad_norm": 20.603288650512695, + "learning_rate": 1.756597616231472e-06, + "loss": 3.0257, + "step": 75080 + }, + { + "epoch": 2.199970702409727, + "grad_norm": 20.062339782714844, + "learning_rate": 1.755394170493026e-06, + "loss": 3.034, + "step": 75090 + }, + { + "epoch": 2.2002636783124587, + "grad_norm": 20.707233428955078, + "learning_rate": 1.7541910493581443e-06, + "loss": 3.0074, + "step": 75100 + }, + { + "epoch": 2.2005566542151906, + "grad_norm": 15.020148277282715, + "learning_rate": 1.752988252947191e-06, + "loss": 3.0412, + "step": 75110 + }, + { + "epoch": 2.200849630117923, + "grad_norm": 16.984378814697266, + "learning_rate": 1.7517857813805006e-06, + "loss": 3.0392, + "step": 75120 + }, + { + "epoch": 2.201142606020655, + "grad_norm": 21.473907470703125, + "learning_rate": 1.7505836347783705e-06, + "loss": 3.0192, + "step": 75130 + }, + { + "epoch": 2.201435581923387, + "grad_norm": 19.694005966186523, + "learning_rate": 1.7493818132610714e-06, + "loss": 3.0362, + "step": 75140 + }, + { + "epoch": 2.2017285578261188, + "grad_norm": 22.701194763183594, + "learning_rate": 1.7481803169488342e-06, + "loss": 3.0392, + "step": 75150 + }, + { + "epoch": 2.2020215337288507, + "grad_norm": 18.897174835205078, + "learning_rate": 1.7469791459618663e-06, + "loss": 3.0306, + "step": 75160 + }, + { + "epoch": 2.2023145096315826, + "grad_norm": 16.704360961914062, + "learning_rate": 1.7457783004203333e-06, + "loss": 3.0422, + "step": 75170 + }, + { + "epoch": 2.202607485534315, + "grad_norm": 19.633588790893555, + "learning_rate": 1.7445777804443747e-06, + "loss": 3.0426, + "step": 75180 + }, + { + "epoch": 2.202900461437047, + 
"grad_norm": 17.353776931762695, + "learning_rate": 1.7433775861540957e-06, + "loss": 3.0354, + "step": 75190 + }, + { + "epoch": 2.203193437339779, + "grad_norm": 19.17076301574707, + "learning_rate": 1.7421777176695698e-06, + "loss": 3.041, + "step": 75200 + }, + { + "epoch": 2.2034864132425107, + "grad_norm": 18.03443145751953, + "learning_rate": 1.740978175110834e-06, + "loss": 3.0321, + "step": 75210 + }, + { + "epoch": 2.2037793891452426, + "grad_norm": 21.21408462524414, + "learning_rate": 1.739778958597898e-06, + "loss": 3.0083, + "step": 75220 + }, + { + "epoch": 2.204072365047975, + "grad_norm": 19.443313598632812, + "learning_rate": 1.738580068250733e-06, + "loss": 3.0244, + "step": 75230 + }, + { + "epoch": 2.204365340950707, + "grad_norm": 20.616315841674805, + "learning_rate": 1.7373815041892849e-06, + "loss": 3.0423, + "step": 75240 + }, + { + "epoch": 2.204658316853439, + "grad_norm": 18.26750946044922, + "learning_rate": 1.7361832665334589e-06, + "loss": 3.0513, + "step": 75250 + }, + { + "epoch": 2.2049512927561707, + "grad_norm": 20.335006713867188, + "learning_rate": 1.7349853554031337e-06, + "loss": 3.0318, + "step": 75260 + }, + { + "epoch": 2.2050684831172638, + "eval_bleu": 0.35161832851237745, + "eval_cap_loss": 0.9047744274139404, + "eval_con_loss": 1.1420047283172607, + "eval_loss": 3.188784122467041, + "step": 75264 + }, + { + "epoch": 2.2050684831172638, + "eval_bleu": 0.35161832851237745, + "eval_cap_loss": 0.9047744274139404, + "eval_con_loss": 1.1420047283172607, + "eval_loss": 3.188784122467041, + "eval_runtime": 53.5019, + "eval_samples_per_second": 373.819, + "eval_steps_per_second": 0.374, + "step": 75264 + }, + { + "epoch": 2.2052442686589027, + "grad_norm": 18.450191497802734, + "learning_rate": 1.7337877709181527e-06, + "loss": 3.0276, + "step": 75270 + }, + { + "epoch": 2.2055372445616346, + "grad_norm": 16.907583236694336, + "learning_rate": 1.7325905131983278e-06, + "loss": 3.0337, + "step": 75280 + }, + { + "epoch": 
2.205830220464367, + "grad_norm": 18.078569412231445, + "learning_rate": 1.7313935823634386e-06, + "loss": 3.0268, + "step": 75290 + }, + { + "epoch": 2.206123196367099, + "grad_norm": 18.430246353149414, + "learning_rate": 1.7301969785332284e-06, + "loss": 3.0366, + "step": 75300 + }, + { + "epoch": 2.2064161722698308, + "grad_norm": 18.684329986572266, + "learning_rate": 1.7290007018274125e-06, + "loss": 3.0283, + "step": 75310 + }, + { + "epoch": 2.2067091481725627, + "grad_norm": 18.531335830688477, + "learning_rate": 1.7278047523656689e-06, + "loss": 3.0354, + "step": 75320 + }, + { + "epoch": 2.2070021240752946, + "grad_norm": 15.808465957641602, + "learning_rate": 1.726609130267648e-06, + "loss": 3.034, + "step": 75330 + }, + { + "epoch": 2.207295099978027, + "grad_norm": 15.65496826171875, + "learning_rate": 1.7254138356529615e-06, + "loss": 3.0244, + "step": 75340 + }, + { + "epoch": 2.207588075880759, + "grad_norm": 15.826281547546387, + "learning_rate": 1.7242188686411938e-06, + "loss": 3.0157, + "step": 75350 + }, + { + "epoch": 2.207881051783491, + "grad_norm": 22.437345504760742, + "learning_rate": 1.7230242293518935e-06, + "loss": 3.0133, + "step": 75360 + }, + { + "epoch": 2.2081740276862227, + "grad_norm": 20.607288360595703, + "learning_rate": 1.7218299179045789e-06, + "loss": 3.0534, + "step": 75370 + }, + { + "epoch": 2.2084670035889546, + "grad_norm": 21.31264305114746, + "learning_rate": 1.7206359344187307e-06, + "loss": 3.044, + "step": 75380 + }, + { + "epoch": 2.208759979491687, + "grad_norm": 16.800025939941406, + "learning_rate": 1.7194422790138028e-06, + "loss": 3.0232, + "step": 75390 + }, + { + "epoch": 2.209052955394419, + "grad_norm": 16.446500778198242, + "learning_rate": 1.7182489518092105e-06, + "loss": 3.0256, + "step": 75400 + }, + { + "epoch": 2.209345931297151, + "grad_norm": 18.610008239746094, + "learning_rate": 1.7170559529243424e-06, + "loss": 3.0682, + "step": 75410 + }, + { + "epoch": 2.2096389071998828, + "grad_norm": 
18.63874053955078, + "learning_rate": 1.7158632824785475e-06, + "loss": 3.0166, + "step": 75420 + }, + { + "epoch": 2.2099318831026147, + "grad_norm": 15.18290901184082, + "learning_rate": 1.7146709405911472e-06, + "loss": 3.0185, + "step": 75430 + }, + { + "epoch": 2.210224859005347, + "grad_norm": 21.500118255615234, + "learning_rate": 1.713478927381428e-06, + "loss": 3.043, + "step": 75440 + }, + { + "epoch": 2.210517834908079, + "grad_norm": 20.12386703491211, + "learning_rate": 1.7122872429686455e-06, + "loss": 3.0251, + "step": 75450 + }, + { + "epoch": 2.210810810810811, + "grad_norm": 19.21940803527832, + "learning_rate": 1.7110958874720174e-06, + "loss": 3.0236, + "step": 75460 + }, + { + "epoch": 2.211103786713543, + "grad_norm": 19.801387786865234, + "learning_rate": 1.709904861010735e-06, + "loss": 3.0321, + "step": 75470 + }, + { + "epoch": 2.2113967626162747, + "grad_norm": 20.063180923461914, + "learning_rate": 1.70871416370395e-06, + "loss": 3.0372, + "step": 75480 + }, + { + "epoch": 2.2116897385190066, + "grad_norm": 17.5378475189209, + "learning_rate": 1.7075237956707885e-06, + "loss": 3.033, + "step": 75490 + }, + { + "epoch": 2.211982714421739, + "grad_norm": 15.085609436035156, + "learning_rate": 1.7063337570303357e-06, + "loss": 3.0339, + "step": 75500 + }, + { + "epoch": 2.212275690324471, + "grad_norm": 18.484636306762695, + "learning_rate": 1.7051440479016496e-06, + "loss": 3.0228, + "step": 75510 + }, + { + "epoch": 2.212568666227203, + "grad_norm": 18.137073516845703, + "learning_rate": 1.703954668403754e-06, + "loss": 3.0308, + "step": 75520 + }, + { + "epoch": 2.2128616421299347, + "grad_norm": 18.60008430480957, + "learning_rate": 1.7027656186556408e-06, + "loss": 3.018, + "step": 75530 + }, + { + "epoch": 2.2131546180326667, + "grad_norm": 23.079627990722656, + "learning_rate": 1.7015768987762638e-06, + "loss": 3.0424, + "step": 75540 + }, + { + "epoch": 2.213447593935399, + "grad_norm": 15.58521556854248, + "learning_rate": 
1.7003885088845513e-06, + "loss": 3.0195, + "step": 75550 + }, + { + "epoch": 2.213740569838131, + "grad_norm": 16.49451446533203, + "learning_rate": 1.6992004490993902e-06, + "loss": 3.0228, + "step": 75560 + }, + { + "epoch": 2.214033545740863, + "grad_norm": 19.42581558227539, + "learning_rate": 1.6980127195396434e-06, + "loss": 3.0374, + "step": 75570 + }, + { + "epoch": 2.214326521643595, + "grad_norm": 18.11069679260254, + "learning_rate": 1.696825320324132e-06, + "loss": 3.0201, + "step": 75580 + }, + { + "epoch": 2.2146194975463267, + "grad_norm": 23.244388580322266, + "learning_rate": 1.6956382515716501e-06, + "loss": 3.0281, + "step": 75590 + }, + { + "epoch": 2.2149124734490586, + "grad_norm": 21.300609588623047, + "learning_rate": 1.6944515134009577e-06, + "loss": 3.0485, + "step": 75600 + }, + { + "epoch": 2.215205449351791, + "grad_norm": 17.268569946289062, + "learning_rate": 1.6932651059307813e-06, + "loss": 3.0473, + "step": 75610 + }, + { + "epoch": 2.215498425254523, + "grad_norm": 16.532196044921875, + "learning_rate": 1.6920790292798117e-06, + "loss": 3.0313, + "step": 75620 + }, + { + "epoch": 2.215791401157255, + "grad_norm": 17.748462677001953, + "learning_rate": 1.6908932835667096e-06, + "loss": 3.0134, + "step": 75630 + }, + { + "epoch": 2.2160843770599867, + "grad_norm": 20.77571678161621, + "learning_rate": 1.6897078689101037e-06, + "loss": 3.0439, + "step": 75640 + }, + { + "epoch": 2.2163773529627186, + "grad_norm": 18.527494430541992, + "learning_rate": 1.688522785428585e-06, + "loss": 3.0316, + "step": 75650 + }, + { + "epoch": 2.216670328865451, + "grad_norm": 19.687366485595703, + "learning_rate": 1.687338033240717e-06, + "loss": 3.0099, + "step": 75660 + }, + { + "epoch": 2.216963304768183, + "grad_norm": 18.413427352905273, + "learning_rate": 1.6861536124650235e-06, + "loss": 3.0194, + "step": 75670 + }, + { + "epoch": 2.217256280670915, + "grad_norm": 22.133333206176758, + "learning_rate": 1.6849695232200008e-06, + "loss": 
3.0442, + "step": 75680 + }, + { + "epoch": 2.2175492565736468, + "grad_norm": 19.64278221130371, + "learning_rate": 1.6837857656241098e-06, + "loss": 3.024, + "step": 75690 + }, + { + "epoch": 2.2178422324763787, + "grad_norm": 19.884239196777344, + "learning_rate": 1.6826023397957802e-06, + "loss": 3.0175, + "step": 75700 + }, + { + "epoch": 2.2181352083791106, + "grad_norm": 17.235292434692383, + "learning_rate": 1.6814192458534034e-06, + "loss": 3.0143, + "step": 75710 + }, + { + "epoch": 2.218428184281843, + "grad_norm": 17.956424713134766, + "learning_rate": 1.680236483915344e-06, + "loss": 3.0241, + "step": 75720 + }, + { + "epoch": 2.218721160184575, + "grad_norm": 16.668304443359375, + "learning_rate": 1.6790540540999278e-06, + "loss": 3.0357, + "step": 75730 + }, + { + "epoch": 2.219014136087307, + "grad_norm": 20.230993270874023, + "learning_rate": 1.677871956525452e-06, + "loss": 3.0443, + "step": 75740 + }, + { + "epoch": 2.2193071119900387, + "grad_norm": 19.766313552856445, + "learning_rate": 1.6766901913101763e-06, + "loss": 3.02, + "step": 75750 + }, + { + "epoch": 2.2196000878927706, + "grad_norm": 17.116100311279297, + "learning_rate": 1.6755087585723307e-06, + "loss": 3.0421, + "step": 75760 + }, + { + "epoch": 2.219893063795503, + "grad_norm": 18.927818298339844, + "learning_rate": 1.6743276584301093e-06, + "loss": 3.0383, + "step": 75770 + }, + { + "epoch": 2.220068849337142, + "eval_bleu": 0.35181724232496375, + "eval_cap_loss": 0.9044945240020752, + "eval_con_loss": 1.1414762735366821, + "eval_loss": 3.1874470710754395, + "step": 75776 + }, + { + "epoch": 2.220068849337142, + "eval_bleu": 0.35181724232496375, + "eval_cap_loss": 0.9044945240020752, + "eval_con_loss": 1.1414762735366821, + "eval_loss": 3.1874470710754395, + "eval_runtime": 53.4818, + "eval_samples_per_second": 373.959, + "eval_steps_per_second": 0.374, + "step": 75776 + }, + { + "epoch": 2.220186039698235, + "grad_norm": 18.542829513549805, + "learning_rate": 
1.6731468910016774e-06, + "loss": 3.0349, + "step": 75780 + }, + { + "epoch": 2.220479015600967, + "grad_norm": 15.360909461975098, + "learning_rate": 1.6719664564051601e-06, + "loss": 3.023, + "step": 75790 + }, + { + "epoch": 2.2207719915036988, + "grad_norm": 17.510360717773438, + "learning_rate": 1.6707863547586557e-06, + "loss": 3.0451, + "step": 75800 + }, + { + "epoch": 2.2210649674064307, + "grad_norm": 18.16405487060547, + "learning_rate": 1.669606586180223e-06, + "loss": 3.0295, + "step": 75810 + }, + { + "epoch": 2.2213579433091626, + "grad_norm": 19.582237243652344, + "learning_rate": 1.6684271507878946e-06, + "loss": 3.0268, + "step": 75820 + }, + { + "epoch": 2.221650919211895, + "grad_norm": 18.010047912597656, + "learning_rate": 1.6672480486996629e-06, + "loss": 3.0288, + "step": 75830 + }, + { + "epoch": 2.221943895114627, + "grad_norm": 18.3834285736084, + "learning_rate": 1.666069280033491e-06, + "loss": 3.0264, + "step": 75840 + }, + { + "epoch": 2.222236871017359, + "grad_norm": 19.271228790283203, + "learning_rate": 1.6648908449073082e-06, + "loss": 3.042, + "step": 75850 + }, + { + "epoch": 2.2225298469200907, + "grad_norm": 18.62398338317871, + "learning_rate": 1.6637127434390115e-06, + "loss": 3.0131, + "step": 75860 + }, + { + "epoch": 2.2228228228228226, + "grad_norm": 18.234786987304688, + "learning_rate": 1.6625349757464597e-06, + "loss": 3.0101, + "step": 75870 + }, + { + "epoch": 2.223115798725555, + "grad_norm": 16.29979705810547, + "learning_rate": 1.6613575419474848e-06, + "loss": 3.0284, + "step": 75880 + }, + { + "epoch": 2.223408774628287, + "grad_norm": 15.074068069458008, + "learning_rate": 1.6601804421598787e-06, + "loss": 3.019, + "step": 75890 + }, + { + "epoch": 2.223701750531019, + "grad_norm": 16.701200485229492, + "learning_rate": 1.6590036765014068e-06, + "loss": 3.0224, + "step": 75900 + }, + { + "epoch": 2.2239947264337507, + "grad_norm": 17.323238372802734, + "learning_rate": 1.6578272450897932e-06, + "loss": 
3.0267, + "step": 75910 + }, + { + "epoch": 2.2242877023364827, + "grad_norm": 17.727083206176758, + "learning_rate": 1.6566511480427355e-06, + "loss": 3.0178, + "step": 75920 + }, + { + "epoch": 2.224580678239215, + "grad_norm": 18.842735290527344, + "learning_rate": 1.6554753854778954e-06, + "loss": 3.0175, + "step": 75930 + }, + { + "epoch": 2.224873654141947, + "grad_norm": 18.924686431884766, + "learning_rate": 1.654299957512902e-06, + "loss": 3.035, + "step": 75940 + }, + { + "epoch": 2.225166630044679, + "grad_norm": 15.542263984680176, + "learning_rate": 1.653124864265347e-06, + "loss": 3.0139, + "step": 75950 + }, + { + "epoch": 2.2254596059474108, + "grad_norm": 15.98270320892334, + "learning_rate": 1.6519501058527949e-06, + "loss": 3.0212, + "step": 75960 + }, + { + "epoch": 2.2257525818501427, + "grad_norm": 15.732995986938477, + "learning_rate": 1.6507756823927696e-06, + "loss": 3.0342, + "step": 75970 + }, + { + "epoch": 2.226045557752875, + "grad_norm": 18.637765884399414, + "learning_rate": 1.6496015940027665e-06, + "loss": 3.0318, + "step": 75980 + }, + { + "epoch": 2.226338533655607, + "grad_norm": 17.184309005737305, + "learning_rate": 1.6484278408002485e-06, + "loss": 3.0531, + "step": 75990 + }, + { + "epoch": 2.226631509558339, + "grad_norm": 19.033130645751953, + "learning_rate": 1.6472544229026393e-06, + "loss": 3.0388, + "step": 76000 + }, + { + "epoch": 2.226924485461071, + "grad_norm": 19.008031845092773, + "learning_rate": 1.6460813404273341e-06, + "loss": 3.0171, + "step": 76010 + }, + { + "epoch": 2.2272174613638027, + "grad_norm": 16.58753204345703, + "learning_rate": 1.6449085934916925e-06, + "loss": 3.0494, + "step": 76020 + }, + { + "epoch": 2.2275104372665346, + "grad_norm": 18.467315673828125, + "learning_rate": 1.6437361822130432e-06, + "loss": 3.0341, + "step": 76030 + }, + { + "epoch": 2.227803413169267, + "grad_norm": 17.81159210205078, + "learning_rate": 1.6425641067086745e-06, + "loss": 3.0344, + "step": 76040 + }, + { + 
"epoch": 2.228096389071999, + "grad_norm": 21.744001388549805, + "learning_rate": 1.6413923670958498e-06, + "loss": 3.0247, + "step": 76050 + }, + { + "epoch": 2.228389364974731, + "grad_norm": 20.92670249938965, + "learning_rate": 1.6402209634917915e-06, + "loss": 3.0261, + "step": 76060 + }, + { + "epoch": 2.2286823408774628, + "grad_norm": 17.23270606994629, + "learning_rate": 1.639049896013694e-06, + "loss": 3.0318, + "step": 76070 + }, + { + "epoch": 2.2289753167801947, + "grad_norm": 18.92718505859375, + "learning_rate": 1.6378791647787134e-06, + "loss": 3.0103, + "step": 76080 + }, + { + "epoch": 2.229268292682927, + "grad_norm": 16.520549774169922, + "learning_rate": 1.6367087699039752e-06, + "loss": 3.0176, + "step": 76090 + }, + { + "epoch": 2.229561268585659, + "grad_norm": 17.115894317626953, + "learning_rate": 1.6355387115065712e-06, + "loss": 3.0405, + "step": 76100 + }, + { + "epoch": 2.229854244488391, + "grad_norm": 18.78145408630371, + "learning_rate": 1.6343689897035598e-06, + "loss": 3.0427, + "step": 76110 + }, + { + "epoch": 2.230147220391123, + "grad_norm": 14.733383178710938, + "learning_rate": 1.633199604611962e-06, + "loss": 3.0369, + "step": 76120 + }, + { + "epoch": 2.2304401962938547, + "grad_norm": 19.392385482788086, + "learning_rate": 1.6320305563487704e-06, + "loss": 3.0162, + "step": 76130 + }, + { + "epoch": 2.2307331721965866, + "grad_norm": 18.194597244262695, + "learning_rate": 1.6308618450309388e-06, + "loss": 3.0301, + "step": 76140 + }, + { + "epoch": 2.231026148099319, + "grad_norm": 20.19285011291504, + "learning_rate": 1.629693470775393e-06, + "loss": 3.0292, + "step": 76150 + }, + { + "epoch": 2.231319124002051, + "grad_norm": 22.10480499267578, + "learning_rate": 1.628525433699018e-06, + "loss": 3.039, + "step": 76160 + }, + { + "epoch": 2.231612099904783, + "grad_norm": 15.695321083068848, + "learning_rate": 1.627357733918672e-06, + "loss": 3.0495, + "step": 76170 + }, + { + "epoch": 2.2319050758075147, + "grad_norm": 
18.8200626373291, + "learning_rate": 1.6261903715511746e-06, + "loss": 3.0304, + "step": 76180 + }, + { + "epoch": 2.2321980517102467, + "grad_norm": 18.730884552001953, + "learning_rate": 1.6250233467133164e-06, + "loss": 3.0151, + "step": 76190 + }, + { + "epoch": 2.232491027612979, + "grad_norm": 17.71366310119629, + "learning_rate": 1.6238566595218475e-06, + "loss": 3.0181, + "step": 76200 + }, + { + "epoch": 2.232784003515711, + "grad_norm": 17.584821701049805, + "learning_rate": 1.6226903100934916e-06, + "loss": 3.0256, + "step": 76210 + }, + { + "epoch": 2.233076979418443, + "grad_norm": 16.28295135498047, + "learning_rate": 1.621524298544932e-06, + "loss": 3.029, + "step": 76220 + }, + { + "epoch": 2.2333699553211748, + "grad_norm": 21.865568161010742, + "learning_rate": 1.6203586249928239e-06, + "loss": 3.041, + "step": 76230 + }, + { + "epoch": 2.2336629312239067, + "grad_norm": 17.798784255981445, + "learning_rate": 1.6191932895537827e-06, + "loss": 3.0456, + "step": 76240 + }, + { + "epoch": 2.2339559071266386, + "grad_norm": 17.8358154296875, + "learning_rate": 1.6180282923443953e-06, + "loss": 3.0219, + "step": 76250 + }, + { + "epoch": 2.234248883029371, + "grad_norm": 18.327861785888672, + "learning_rate": 1.6168636334812126e-06, + "loss": 3.0314, + "step": 76260 + }, + { + "epoch": 2.234541858932103, + "grad_norm": 19.585058212280273, + "learning_rate": 1.6156993130807542e-06, + "loss": 3.0107, + "step": 76270 + }, + { + "epoch": 2.234834834834835, + "grad_norm": 18.326231002807617, + "learning_rate": 1.614535331259499e-06, + "loss": 3.0154, + "step": 76280 + }, + { + "epoch": 2.2350692155570204, + "eval_bleu": 0.3516893372112485, + "eval_cap_loss": 0.9043487906455994, + "eval_con_loss": 1.1412771940231323, + "eval_loss": 3.1869029998779297, + "step": 76288 + }, + { + "epoch": 2.2350692155570204, + "eval_bleu": 0.3516893372112485, + "eval_cap_loss": 0.9043487906455994, + "eval_con_loss": 1.1412771940231323, + "eval_loss": 3.1869029998779297, + 
"eval_runtime": 56.6376, + "eval_samples_per_second": 353.122, + "eval_steps_per_second": 0.353, + "step": 76288 + }, + { + "epoch": 2.2351278107375667, + "grad_norm": 18.592073440551758, + "learning_rate": 1.6133716881339008e-06, + "loss": 3.0422, + "step": 76290 + }, + { + "epoch": 2.2354207866402986, + "grad_norm": 16.577173233032227, + "learning_rate": 1.6122083838203707e-06, + "loss": 3.0291, + "step": 76300 + }, + { + "epoch": 2.235713762543031, + "grad_norm": 16.994670867919922, + "learning_rate": 1.6110454184352948e-06, + "loss": 3.036, + "step": 76310 + }, + { + "epoch": 2.236006738445763, + "grad_norm": 19.911697387695312, + "learning_rate": 1.6098827920950166e-06, + "loss": 3.027, + "step": 76320 + }, + { + "epoch": 2.236299714348495, + "grad_norm": 19.351043701171875, + "learning_rate": 1.6087205049158527e-06, + "loss": 3.0271, + "step": 76330 + }, + { + "epoch": 2.2365926902512268, + "grad_norm": 19.900033950805664, + "learning_rate": 1.6075585570140823e-06, + "loss": 3.0546, + "step": 76340 + }, + { + "epoch": 2.2368856661539587, + "grad_norm": 17.805938720703125, + "learning_rate": 1.6063969485059516e-06, + "loss": 3.0137, + "step": 76350 + }, + { + "epoch": 2.237178642056691, + "grad_norm": 18.00385856628418, + "learning_rate": 1.6052356795076745e-06, + "loss": 3.0154, + "step": 76360 + }, + { + "epoch": 2.237471617959423, + "grad_norm": 15.768245697021484, + "learning_rate": 1.6040747501354254e-06, + "loss": 3.0242, + "step": 76370 + }, + { + "epoch": 2.237764593862155, + "grad_norm": 16.3023681640625, + "learning_rate": 1.6029141605053522e-06, + "loss": 3.0379, + "step": 76380 + }, + { + "epoch": 2.238057569764887, + "grad_norm": 16.080116271972656, + "learning_rate": 1.6017539107335612e-06, + "loss": 3.029, + "step": 76390 + }, + { + "epoch": 2.2383505456676187, + "grad_norm": 22.6314640045166, + "learning_rate": 1.6005940009361315e-06, + "loss": 3.0328, + "step": 76400 + }, + { + "epoch": 2.238643521570351, + "grad_norm": 20.80614471435547, + 
"learning_rate": 1.5994344312291027e-06, + "loss": 3.0118, + "step": 76410 + }, + { + "epoch": 2.238936497473083, + "grad_norm": 16.97856903076172, + "learning_rate": 1.598275201728484e-06, + "loss": 3.0238, + "step": 76420 + }, + { + "epoch": 2.239229473375815, + "grad_norm": 18.189476013183594, + "learning_rate": 1.5971163125502504e-06, + "loss": 3.0121, + "step": 76430 + }, + { + "epoch": 2.239522449278547, + "grad_norm": 17.875566482543945, + "learning_rate": 1.5959577638103419e-06, + "loss": 3.0192, + "step": 76440 + }, + { + "epoch": 2.2398154251812787, + "grad_norm": 17.947580337524414, + "learning_rate": 1.5947995556246626e-06, + "loss": 3.0264, + "step": 76450 + }, + { + "epoch": 2.2401084010840107, + "grad_norm": 18.68323516845703, + "learning_rate": 1.5936416881090865e-06, + "loss": 3.0331, + "step": 76460 + }, + { + "epoch": 2.240401376986743, + "grad_norm": 17.714319229125977, + "learning_rate": 1.592484161379449e-06, + "loss": 3.0141, + "step": 76470 + }, + { + "epoch": 2.240694352889475, + "grad_norm": 16.75088882446289, + "learning_rate": 1.5913269755515565e-06, + "loss": 3.027, + "step": 76480 + }, + { + "epoch": 2.240987328792207, + "grad_norm": 17.233911514282227, + "learning_rate": 1.5901701307411755e-06, + "loss": 3.026, + "step": 76490 + }, + { + "epoch": 2.2412803046949388, + "grad_norm": 18.116832733154297, + "learning_rate": 1.5890136270640432e-06, + "loss": 3.024, + "step": 76500 + }, + { + "epoch": 2.2415732805976707, + "grad_norm": 18.890810012817383, + "learning_rate": 1.5878574646358608e-06, + "loss": 3.045, + "step": 76510 + }, + { + "epoch": 2.241866256500403, + "grad_norm": 20.971538543701172, + "learning_rate": 1.586701643572297e-06, + "loss": 3.0503, + "step": 76520 + }, + { + "epoch": 2.242159232403135, + "grad_norm": 15.869677543640137, + "learning_rate": 1.5855461639889819e-06, + "loss": 3.0126, + "step": 76530 + }, + { + "epoch": 2.242452208305867, + "grad_norm": 16.86819076538086, + "learning_rate": 1.5843910260015171e-06, + 
"loss": 3.0114, + "step": 76540 + }, + { + "epoch": 2.242745184208599, + "grad_norm": 17.026920318603516, + "learning_rate": 1.583236229725465e-06, + "loss": 3.0358, + "step": 76550 + }, + { + "epoch": 2.2430381601113307, + "grad_norm": 14.403315544128418, + "learning_rate": 1.5820817752763584e-06, + "loss": 3.0086, + "step": 76560 + }, + { + "epoch": 2.2433311360140626, + "grad_norm": 15.146397590637207, + "learning_rate": 1.5809276627696907e-06, + "loss": 3.0229, + "step": 76570 + }, + { + "epoch": 2.243624111916795, + "grad_norm": 15.604092597961426, + "learning_rate": 1.579773892320926e-06, + "loss": 3.0339, + "step": 76580 + }, + { + "epoch": 2.243917087819527, + "grad_norm": 20.51361083984375, + "learning_rate": 1.5786204640454917e-06, + "loss": 3.0291, + "step": 76590 + }, + { + "epoch": 2.244210063722259, + "grad_norm": 22.472219467163086, + "learning_rate": 1.577467378058783e-06, + "loss": 3.0255, + "step": 76600 + }, + { + "epoch": 2.2445030396249908, + "grad_norm": 19.421483993530273, + "learning_rate": 1.5763146344761566e-06, + "loss": 3.0386, + "step": 76610 + }, + { + "epoch": 2.2447960155277227, + "grad_norm": 18.146926879882812, + "learning_rate": 1.5751622334129407e-06, + "loss": 2.9969, + "step": 76620 + }, + { + "epoch": 2.245088991430455, + "grad_norm": 14.387704849243164, + "learning_rate": 1.5740101749844223e-06, + "loss": 3.0379, + "step": 76630 + }, + { + "epoch": 2.245381967333187, + "grad_norm": 17.028478622436523, + "learning_rate": 1.572858459305862e-06, + "loss": 3.0166, + "step": 76640 + }, + { + "epoch": 2.245674943235919, + "grad_norm": 19.08685302734375, + "learning_rate": 1.5717070864924794e-06, + "loss": 3.0416, + "step": 76650 + }, + { + "epoch": 2.245967919138651, + "grad_norm": 16.639257431030273, + "learning_rate": 1.5705560566594629e-06, + "loss": 3.0445, + "step": 76660 + }, + { + "epoch": 2.2462608950413827, + "grad_norm": 16.932048797607422, + "learning_rate": 1.5694053699219664e-06, + "loss": 3.0194, + "step": 76670 + }, 
+ { + "epoch": 2.2465538709441146, + "grad_norm": 15.751989364624023, + "learning_rate": 1.5682550263951119e-06, + "loss": 3.0189, + "step": 76680 + }, + { + "epoch": 2.246846846846847, + "grad_norm": 15.505216598510742, + "learning_rate": 1.56710502619398e-06, + "loss": 3.0367, + "step": 76690 + }, + { + "epoch": 2.247139822749579, + "grad_norm": 17.772321701049805, + "learning_rate": 1.5659553694336238e-06, + "loss": 3.0254, + "step": 76700 + }, + { + "epoch": 2.247432798652311, + "grad_norm": 19.47917366027832, + "learning_rate": 1.5648060562290612e-06, + "loss": 3.0177, + "step": 76710 + }, + { + "epoch": 2.2477257745550427, + "grad_norm": 21.541915893554688, + "learning_rate": 1.5636570866952704e-06, + "loss": 3.0157, + "step": 76720 + }, + { + "epoch": 2.2480187504577747, + "grad_norm": 17.836915969848633, + "learning_rate": 1.562508460947203e-06, + "loss": 3.0024, + "step": 76730 + }, + { + "epoch": 2.248311726360507, + "grad_norm": 15.339987754821777, + "learning_rate": 1.5613601790997678e-06, + "loss": 3.0056, + "step": 76740 + }, + { + "epoch": 2.248604702263239, + "grad_norm": 20.148441314697266, + "learning_rate": 1.5602122412678466e-06, + "loss": 3.0318, + "step": 76750 + }, + { + "epoch": 2.248897678165971, + "grad_norm": 16.714279174804688, + "learning_rate": 1.559064647566283e-06, + "loss": 3.0457, + "step": 76760 + }, + { + "epoch": 2.2491906540687028, + "grad_norm": 20.71636390686035, + "learning_rate": 1.5579173981098883e-06, + "loss": 3.0359, + "step": 76770 + }, + { + "epoch": 2.2494836299714347, + "grad_norm": 18.3326358795166, + "learning_rate": 1.5567704930134353e-06, + "loss": 3.0371, + "step": 76780 + }, + { + "epoch": 2.249776605874167, + "grad_norm": 17.396089553833008, + "learning_rate": 1.5556239323916682e-06, + "loss": 3.0108, + "step": 76790 + }, + { + "epoch": 2.250069581776899, + "grad_norm": 19.845277786254883, + "learning_rate": 1.5544777163592906e-06, + "loss": 3.0319, + "step": 76800 + }, + { + "epoch": 2.250069581776899, + 
"eval_bleu": 0.3518268961803909, + "eval_cap_loss": 0.9043251276016235, + "eval_con_loss": 1.1401088237762451, + "eval_loss": 3.1845428943634033, + "step": 76800 + }, + { + "epoch": 2.250069581776899, + "eval_bleu": 0.3518268961803909, + "eval_cap_loss": 0.9043251276016235, + "eval_con_loss": 1.1401088237762451, + "eval_loss": 3.1845428943634033, + "eval_runtime": 54.6193, + "eval_samples_per_second": 366.171, + "eval_steps_per_second": 0.366, + "step": 76800 + }, + { + "epoch": 2.250362557679631, + "grad_norm": 15.251419067382812, + "learning_rate": 1.553331845030977e-06, + "loss": 3.0298, + "step": 76810 + }, + { + "epoch": 2.250655533582363, + "grad_norm": 14.351486206054688, + "learning_rate": 1.5521863185213626e-06, + "loss": 3.0094, + "step": 76820 + }, + { + "epoch": 2.2509485094850947, + "grad_norm": 16.92524528503418, + "learning_rate": 1.5510411369450512e-06, + "loss": 3.0244, + "step": 76830 + }, + { + "epoch": 2.251241485387827, + "grad_norm": 19.013320922851562, + "learning_rate": 1.5498963004166128e-06, + "loss": 3.033, + "step": 76840 + }, + { + "epoch": 2.251534461290559, + "grad_norm": 18.26290512084961, + "learning_rate": 1.5487518090505822e-06, + "loss": 3.0372, + "step": 76850 + }, + { + "epoch": 2.251827437193291, + "grad_norm": 21.960241317749023, + "learning_rate": 1.5476076629614562e-06, + "loss": 3.0099, + "step": 76860 + }, + { + "epoch": 2.252120413096023, + "grad_norm": 17.956653594970703, + "learning_rate": 1.5464638622637025e-06, + "loss": 3.0177, + "step": 76870 + }, + { + "epoch": 2.2524133889987548, + "grad_norm": 17.894309997558594, + "learning_rate": 1.5453204070717482e-06, + "loss": 3.0132, + "step": 76880 + }, + { + "epoch": 2.2527063649014867, + "grad_norm": 23.972061157226562, + "learning_rate": 1.5441772974999935e-06, + "loss": 3.0613, + "step": 76890 + }, + { + "epoch": 2.252999340804219, + "grad_norm": 19.216657638549805, + "learning_rate": 1.5430345336627956e-06, + "loss": 3.0323, + "step": 76900 + }, + { + "epoch": 
2.253292316706951, + "grad_norm": 19.314851760864258, + "learning_rate": 1.5418921156744831e-06, + "loss": 3.0423, + "step": 76910 + }, + { + "epoch": 2.253585292609683, + "grad_norm": 21.021976470947266, + "learning_rate": 1.5407500436493482e-06, + "loss": 3.0255, + "step": 76920 + }, + { + "epoch": 2.253878268512415, + "grad_norm": 22.531478881835938, + "learning_rate": 1.5396083177016502e-06, + "loss": 3.0382, + "step": 76930 + }, + { + "epoch": 2.2541712444151467, + "grad_norm": 18.561338424682617, + "learning_rate": 1.5384669379456085e-06, + "loss": 3.0069, + "step": 76940 + }, + { + "epoch": 2.254464220317879, + "grad_norm": 16.912109375, + "learning_rate": 1.5373259044954148e-06, + "loss": 3.0308, + "step": 76950 + }, + { + "epoch": 2.254757196220611, + "grad_norm": 21.921232223510742, + "learning_rate": 1.5361852174652191e-06, + "loss": 3.0308, + "step": 76960 + }, + { + "epoch": 2.255050172123343, + "grad_norm": 20.30146598815918, + "learning_rate": 1.5350448769691445e-06, + "loss": 3.0326, + "step": 76970 + }, + { + "epoch": 2.255343148026075, + "grad_norm": 19.735153198242188, + "learning_rate": 1.533904883121271e-06, + "loss": 3.0379, + "step": 76980 + }, + { + "epoch": 2.2556361239288067, + "grad_norm": 22.317272186279297, + "learning_rate": 1.5327652360356504e-06, + "loss": 3.0168, + "step": 76990 + }, + { + "epoch": 2.2559290998315387, + "grad_norm": 15.44192123413086, + "learning_rate": 1.5316259358262974e-06, + "loss": 3.0176, + "step": 77000 + }, + { + "epoch": 2.256222075734271, + "grad_norm": 20.64763832092285, + "learning_rate": 1.5304869826071939e-06, + "loss": 3.0403, + "step": 77010 + }, + { + "epoch": 2.256515051637003, + "grad_norm": 20.95937156677246, + "learning_rate": 1.5293483764922823e-06, + "loss": 3.0138, + "step": 77020 + }, + { + "epoch": 2.256808027539735, + "grad_norm": 20.156980514526367, + "learning_rate": 1.5282101175954773e-06, + "loss": 3.0356, + "step": 77030 + }, + { + "epoch": 2.257101003442467, + "grad_norm": 
17.631553649902344, + "learning_rate": 1.5270722060306502e-06, + "loss": 3.0181, + "step": 77040 + }, + { + "epoch": 2.2573939793451987, + "grad_norm": 15.52866268157959, + "learning_rate": 1.5259346419116466e-06, + "loss": 3.0157, + "step": 77050 + }, + { + "epoch": 2.257686955247931, + "grad_norm": 17.1400089263916, + "learning_rate": 1.524797425352269e-06, + "loss": 3.0287, + "step": 77060 + }, + { + "epoch": 2.257979931150663, + "grad_norm": 18.619556427001953, + "learning_rate": 1.523660556466292e-06, + "loss": 3.0178, + "step": 77070 + }, + { + "epoch": 2.258272907053395, + "grad_norm": 18.804296493530273, + "learning_rate": 1.5225240353674519e-06, + "loss": 3.0328, + "step": 77080 + }, + { + "epoch": 2.258565882956127, + "grad_norm": 16.522729873657227, + "learning_rate": 1.5213878621694506e-06, + "loss": 3.0269, + "step": 77090 + }, + { + "epoch": 2.2588588588588587, + "grad_norm": 19.74977684020996, + "learning_rate": 1.5202520369859575e-06, + "loss": 3.0437, + "step": 77100 + }, + { + "epoch": 2.2591518347615906, + "grad_norm": 19.608068466186523, + "learning_rate": 1.5191165599306019e-06, + "loss": 3.0587, + "step": 77110 + }, + { + "epoch": 2.259444810664323, + "grad_norm": 18.789649963378906, + "learning_rate": 1.5179814311169843e-06, + "loss": 3.02, + "step": 77120 + }, + { + "epoch": 2.259737786567055, + "grad_norm": 20.031042098999023, + "learning_rate": 1.5168466506586654e-06, + "loss": 3.0319, + "step": 77130 + }, + { + "epoch": 2.260030762469787, + "grad_norm": 19.049335479736328, + "learning_rate": 1.515712218669176e-06, + "loss": 3.0141, + "step": 77140 + }, + { + "epoch": 2.2603237383725188, + "grad_norm": 20.823488235473633, + "learning_rate": 1.5145781352620054e-06, + "loss": 3.0396, + "step": 77150 + }, + { + "epoch": 2.2606167142752507, + "grad_norm": 14.977864265441895, + "learning_rate": 1.5134444005506143e-06, + "loss": 3.0322, + "step": 77160 + }, + { + "epoch": 2.260909690177983, + "grad_norm": 21.740671157836914, + "learning_rate": 
1.5123110146484254e-06, + "loss": 3.0331, + "step": 77170 + }, + { + "epoch": 2.261202666080715, + "grad_norm": 18.60765266418457, + "learning_rate": 1.5111779776688302e-06, + "loss": 3.016, + "step": 77180 + }, + { + "epoch": 2.261495641983447, + "grad_norm": 19.972270965576172, + "learning_rate": 1.5100452897251777e-06, + "loss": 3.0419, + "step": 77190 + }, + { + "epoch": 2.261788617886179, + "grad_norm": 21.80608558654785, + "learning_rate": 1.5089129509307904e-06, + "loss": 3.0407, + "step": 77200 + }, + { + "epoch": 2.2620815937889107, + "grad_norm": 18.946260452270508, + "learning_rate": 1.5077809613989492e-06, + "loss": 2.9993, + "step": 77210 + }, + { + "epoch": 2.2623745696916426, + "grad_norm": 16.779714584350586, + "learning_rate": 1.5066493212429057e-06, + "loss": 3.0354, + "step": 77220 + }, + { + "epoch": 2.262667545594375, + "grad_norm": 16.027549743652344, + "learning_rate": 1.5055180305758704e-06, + "loss": 3.0211, + "step": 77230 + }, + { + "epoch": 2.262960521497107, + "grad_norm": 17.95000457763672, + "learning_rate": 1.504387089511024e-06, + "loss": 3.041, + "step": 77240 + }, + { + "epoch": 2.263253497399839, + "grad_norm": 16.64657974243164, + "learning_rate": 1.5032564981615106e-06, + "loss": 3.0373, + "step": 77250 + }, + { + "epoch": 2.2635464733025707, + "grad_norm": 22.150222778320312, + "learning_rate": 1.502126256640441e-06, + "loss": 3.0232, + "step": 77260 + }, + { + "epoch": 2.263839449205303, + "grad_norm": 18.154733657836914, + "learning_rate": 1.5009963650608855e-06, + "loss": 3.0306, + "step": 77270 + }, + { + "epoch": 2.264132425108035, + "grad_norm": 22.902061462402344, + "learning_rate": 1.4998668235358865e-06, + "loss": 3.0416, + "step": 77280 + }, + { + "epoch": 2.264425401010767, + "grad_norm": 16.74949073791504, + "learning_rate": 1.498737632178444e-06, + "loss": 3.025, + "step": 77290 + }, + { + "epoch": 2.264718376913499, + "grad_norm": 22.807254791259766, + "learning_rate": 1.4976087911015314e-06, + "loss": 3.0263, + 
"step": 77300 + }, + { + "epoch": 2.265011352816231, + "grad_norm": 16.960729598999023, + "learning_rate": 1.4964803004180784e-06, + "loss": 2.9953, + "step": 77310 + }, + { + "epoch": 2.265069947996777, + "eval_bleu": 0.3518088973579334, + "eval_cap_loss": 0.9036446809768677, + "eval_con_loss": 1.1392993927001953, + "eval_loss": 3.182243824005127, + "step": 77312 + }, + { + "epoch": 2.265069947996777, + "eval_bleu": 0.3518088973579334, + "eval_cap_loss": 0.9036446809768677, + "eval_con_loss": 1.1392993927001953, + "eval_loss": 3.182243824005127, + "eval_runtime": 53.4855, + "eval_samples_per_second": 373.933, + "eval_steps_per_second": 0.374, + "step": 77312 + }, + { + "epoch": 2.2653043287189627, + "grad_norm": 19.429166793823242, + "learning_rate": 1.495352160240986e-06, + "loss": 3.0245, + "step": 77320 + }, + { + "epoch": 2.2655973046216946, + "grad_norm": 18.242807388305664, + "learning_rate": 1.4942243706831172e-06, + "loss": 3.0324, + "step": 77330 + }, + { + "epoch": 2.265890280524427, + "grad_norm": 23.18109703063965, + "learning_rate": 1.493096931857303e-06, + "loss": 3.0204, + "step": 77340 + }, + { + "epoch": 2.266183256427159, + "grad_norm": 14.954708099365234, + "learning_rate": 1.4919698438763331e-06, + "loss": 3.0079, + "step": 77350 + }, + { + "epoch": 2.266476232329891, + "grad_norm": 21.61728286743164, + "learning_rate": 1.4908431068529694e-06, + "loss": 3.0263, + "step": 77360 + }, + { + "epoch": 2.2667692082326227, + "grad_norm": 20.640472412109375, + "learning_rate": 1.4897167208999325e-06, + "loss": 3.0222, + "step": 77370 + }, + { + "epoch": 2.267062184135355, + "grad_norm": 17.417043685913086, + "learning_rate": 1.4885906861299132e-06, + "loss": 2.9998, + "step": 77380 + }, + { + "epoch": 2.267355160038087, + "grad_norm": 17.408485412597656, + "learning_rate": 1.487465002655562e-06, + "loss": 3.0558, + "step": 77390 + }, + { + "epoch": 2.267648135940819, + "grad_norm": 20.382959365844727, + "learning_rate": 1.4863396705894978e-06, + 
"loss": 3.0311, + "step": 77400 + }, + { + "epoch": 2.267941111843551, + "grad_norm": 14.379940032958984, + "learning_rate": 1.4852146900443044e-06, + "loss": 3.0136, + "step": 77410 + }, + { + "epoch": 2.2682340877462828, + "grad_norm": 18.58685874938965, + "learning_rate": 1.4840900611325282e-06, + "loss": 3.0253, + "step": 77420 + }, + { + "epoch": 2.2685270636490147, + "grad_norm": 18.199209213256836, + "learning_rate": 1.4829657839666832e-06, + "loss": 3.0336, + "step": 77430 + }, + { + "epoch": 2.268820039551747, + "grad_norm": 19.228609085083008, + "learning_rate": 1.4818418586592448e-06, + "loss": 3.037, + "step": 77440 + }, + { + "epoch": 2.269113015454479, + "grad_norm": 17.887712478637695, + "learning_rate": 1.4807182853226575e-06, + "loss": 3.0396, + "step": 77450 + }, + { + "epoch": 2.269405991357211, + "grad_norm": 16.429594039916992, + "learning_rate": 1.4795950640693253e-06, + "loss": 3.0238, + "step": 77460 + }, + { + "epoch": 2.269698967259943, + "grad_norm": 18.158790588378906, + "learning_rate": 1.478472195011622e-06, + "loss": 3.0126, + "step": 77470 + }, + { + "epoch": 2.2699919431626747, + "grad_norm": 19.13398551940918, + "learning_rate": 1.4773496782618802e-06, + "loss": 3.0196, + "step": 77480 + }, + { + "epoch": 2.270284919065407, + "grad_norm": 17.55855941772461, + "learning_rate": 1.4762275139324072e-06, + "loss": 3.0202, + "step": 77490 + }, + { + "epoch": 2.270577894968139, + "grad_norm": 16.59376335144043, + "learning_rate": 1.4751057021354637e-06, + "loss": 3.0157, + "step": 77500 + }, + { + "epoch": 2.270870870870871, + "grad_norm": 18.99484634399414, + "learning_rate": 1.473984242983284e-06, + "loss": 3.0259, + "step": 77510 + }, + { + "epoch": 2.271163846773603, + "grad_norm": 17.030765533447266, + "learning_rate": 1.4728631365880603e-06, + "loss": 3.0154, + "step": 77520 + }, + { + "epoch": 2.2714568226763348, + "grad_norm": 20.502300262451172, + "learning_rate": 1.4717423830619553e-06, + "loss": 3.0211, + "step": 77530 + }, + { 
+ "epoch": 2.2717497985790667, + "grad_norm": 15.392756462097168, + "learning_rate": 1.4706219825170908e-06, + "loss": 3.0012, + "step": 77540 + }, + { + "epoch": 2.272042774481799, + "grad_norm": 16.57636833190918, + "learning_rate": 1.4695019350655593e-06, + "loss": 3.0135, + "step": 77550 + }, + { + "epoch": 2.272335750384531, + "grad_norm": 21.5090389251709, + "learning_rate": 1.468382240819411e-06, + "loss": 3.005, + "step": 77560 + }, + { + "epoch": 2.272628726287263, + "grad_norm": 18.976268768310547, + "learning_rate": 1.4672628998906697e-06, + "loss": 3.0433, + "step": 77570 + }, + { + "epoch": 2.272921702189995, + "grad_norm": 20.142595291137695, + "learning_rate": 1.4661439123913147e-06, + "loss": 3.0295, + "step": 77580 + }, + { + "epoch": 2.2732146780927267, + "grad_norm": 16.887361526489258, + "learning_rate": 1.4650252784332975e-06, + "loss": 3.0119, + "step": 77590 + }, + { + "epoch": 2.273507653995459, + "grad_norm": 19.2895450592041, + "learning_rate": 1.463906998128527e-06, + "loss": 3.0101, + "step": 77600 + }, + { + "epoch": 2.273800629898191, + "grad_norm": 17.879812240600586, + "learning_rate": 1.4627890715888836e-06, + "loss": 3.0385, + "step": 77610 + }, + { + "epoch": 2.274093605800923, + "grad_norm": 21.373687744140625, + "learning_rate": 1.4616714989262077e-06, + "loss": 3.0448, + "step": 77620 + }, + { + "epoch": 2.274386581703655, + "grad_norm": 16.668487548828125, + "learning_rate": 1.4605542802523066e-06, + "loss": 3.0061, + "step": 77630 + }, + { + "epoch": 2.2746795576063867, + "grad_norm": 19.071426391601562, + "learning_rate": 1.4594374156789482e-06, + "loss": 3.012, + "step": 77640 + }, + { + "epoch": 2.2749725335091187, + "grad_norm": 21.656150817871094, + "learning_rate": 1.4583209053178743e-06, + "loss": 3.0199, + "step": 77650 + }, + { + "epoch": 2.275265509411851, + "grad_norm": 17.494352340698242, + "learning_rate": 1.45720474928078e-06, + "loss": 3.0093, + "step": 77660 + }, + { + "epoch": 2.275558485314583, + 
"grad_norm": 23.81630516052246, + "learning_rate": 1.4560889476793333e-06, + "loss": 3.0246, + "step": 77670 + }, + { + "epoch": 2.275851461217315, + "grad_norm": 20.85586929321289, + "learning_rate": 1.4549735006251603e-06, + "loss": 3.0401, + "step": 77680 + }, + { + "epoch": 2.2761444371200468, + "grad_norm": 17.16034507751465, + "learning_rate": 1.4538584082298586e-06, + "loss": 3.0064, + "step": 77690 + }, + { + "epoch": 2.2764374130227787, + "grad_norm": 16.05467414855957, + "learning_rate": 1.452743670604983e-06, + "loss": 3.0262, + "step": 77700 + }, + { + "epoch": 2.276730388925511, + "grad_norm": 21.81360626220703, + "learning_rate": 1.4516292878620598e-06, + "loss": 3.0338, + "step": 77710 + }, + { + "epoch": 2.277023364828243, + "grad_norm": 17.346084594726562, + "learning_rate": 1.4505152601125711e-06, + "loss": 3.0339, + "step": 77720 + }, + { + "epoch": 2.277316340730975, + "grad_norm": 18.699922561645508, + "learning_rate": 1.449401587467975e-06, + "loss": 3.0235, + "step": 77730 + }, + { + "epoch": 2.277609316633707, + "grad_norm": 17.740373611450195, + "learning_rate": 1.4482882700396843e-06, + "loss": 3.0318, + "step": 77740 + }, + { + "epoch": 2.2779022925364387, + "grad_norm": 15.657788276672363, + "learning_rate": 1.4471753079390815e-06, + "loss": 3.0165, + "step": 77750 + }, + { + "epoch": 2.2781952684391706, + "grad_norm": 20.87066650390625, + "learning_rate": 1.4460627012775101e-06, + "loss": 3.0222, + "step": 77760 + }, + { + "epoch": 2.278488244341903, + "grad_norm": 21.382617950439453, + "learning_rate": 1.444950450166282e-06, + "loss": 3.0274, + "step": 77770 + }, + { + "epoch": 2.278781220244635, + "grad_norm": 18.51226043701172, + "learning_rate": 1.4438385547166677e-06, + "loss": 3.0372, + "step": 77780 + }, + { + "epoch": 2.279074196147367, + "grad_norm": 16.757688522338867, + "learning_rate": 1.4427270150399086e-06, + "loss": 3.0251, + "step": 77790 + }, + { + "epoch": 2.2793671720500988, + "grad_norm": 18.853797912597656, + 
"learning_rate": 1.4416158312472072e-06, + "loss": 3.0371, + "step": 77800 + }, + { + "epoch": 2.279660147952831, + "grad_norm": 20.142290115356445, + "learning_rate": 1.440505003449731e-06, + "loss": 3.0262, + "step": 77810 + }, + { + "epoch": 2.279953123855563, + "grad_norm": 19.69032096862793, + "learning_rate": 1.4393945317586121e-06, + "loss": 3.0407, + "step": 77820 + }, + { + "epoch": 2.2800703142166556, + "eval_bleu": 0.351972458297661, + "eval_cap_loss": 0.9036924242973328, + "eval_con_loss": 1.1389646530151367, + "eval_loss": 3.181621789932251, + "step": 77824 + }, + { + "epoch": 2.2800703142166556, + "eval_bleu": 0.351972458297661, + "eval_cap_loss": 0.9036924242973328, + "eval_con_loss": 1.1389646530151367, + "eval_loss": 3.181621789932251, + "eval_runtime": 52.8629, + "eval_samples_per_second": 378.337, + "eval_steps_per_second": 0.378, + "step": 77824 + }, + { + "epoch": 2.280246099758295, + "grad_norm": 15.047937393188477, + "learning_rate": 1.4382844162849451e-06, + "loss": 3.0213, + "step": 77830 + }, + { + "epoch": 2.280539075661027, + "grad_norm": 15.660181999206543, + "learning_rate": 1.4371746571397927e-06, + "loss": 3.0207, + "step": 77840 + }, + { + "epoch": 2.280832051563759, + "grad_norm": 17.577985763549805, + "learning_rate": 1.4360652544341762e-06, + "loss": 3.0099, + "step": 77850 + }, + { + "epoch": 2.2811250274664907, + "grad_norm": 16.863075256347656, + "learning_rate": 1.434956208279089e-06, + "loss": 3.0184, + "step": 77860 + }, + { + "epoch": 2.281418003369223, + "grad_norm": 18.72022819519043, + "learning_rate": 1.4338475187854811e-06, + "loss": 3.0257, + "step": 77870 + }, + { + "epoch": 2.281710979271955, + "grad_norm": 16.941312789916992, + "learning_rate": 1.4327391860642715e-06, + "loss": 3.0178, + "step": 77880 + }, + { + "epoch": 2.282003955174687, + "grad_norm": 20.674152374267578, + "learning_rate": 1.4316312102263424e-06, + "loss": 3.0162, + "step": 77890 + }, + { + "epoch": 2.282296931077419, + "grad_norm": 
18.626760482788086, + "learning_rate": 1.4305235913825421e-06, + "loss": 3.0056, + "step": 77900 + }, + { + "epoch": 2.2825899069801507, + "grad_norm": 17.59998321533203, + "learning_rate": 1.4294163296436776e-06, + "loss": 3.0136, + "step": 77910 + }, + { + "epoch": 2.282882882882883, + "grad_norm": 18.8997802734375, + "learning_rate": 1.4283094251205276e-06, + "loss": 3.0134, + "step": 77920 + }, + { + "epoch": 2.283175858785615, + "grad_norm": 18.634017944335938, + "learning_rate": 1.4272028779238277e-06, + "loss": 3.0138, + "step": 77930 + }, + { + "epoch": 2.283468834688347, + "grad_norm": 18.01626205444336, + "learning_rate": 1.426096688164285e-06, + "loss": 3.035, + "step": 77940 + }, + { + "epoch": 2.283761810591079, + "grad_norm": 18.55169105529785, + "learning_rate": 1.4249908559525638e-06, + "loss": 3.0282, + "step": 77950 + }, + { + "epoch": 2.2840547864938108, + "grad_norm": 19.197418212890625, + "learning_rate": 1.4238853813992975e-06, + "loss": 3.0229, + "step": 77960 + }, + { + "epoch": 2.2843477623965427, + "grad_norm": 21.898460388183594, + "learning_rate": 1.4227802646150824e-06, + "loss": 3.017, + "step": 77970 + }, + { + "epoch": 2.284640738299275, + "grad_norm": 17.391000747680664, + "learning_rate": 1.4216755057104803e-06, + "loss": 3.0232, + "step": 77980 + }, + { + "epoch": 2.284933714202007, + "grad_norm": 16.6404972076416, + "learning_rate": 1.4205711047960136e-06, + "loss": 3.0245, + "step": 77990 + }, + { + "epoch": 2.285226690104739, + "grad_norm": 18.85016441345215, + "learning_rate": 1.4194670619821726e-06, + "loss": 3.0113, + "step": 78000 + }, + { + "epoch": 2.285519666007471, + "grad_norm": 17.82332420349121, + "learning_rate": 1.4183633773794082e-06, + "loss": 3.019, + "step": 78010 + }, + { + "epoch": 2.2858126419102027, + "grad_norm": NaN, + "learning_rate": 1.4173703675986534e-06, + "loss": 3.0212, + "step": 78020 + }, + { + "epoch": 2.286105617812935, + "grad_norm": 17.89361000061035, + "learning_rate": 
1.4162673639011065e-06, + "loss": 3.0095, + "step": 78030 + }, + { + "epoch": 2.286398593715667, + "grad_norm": 22.428482055664062, + "learning_rate": 1.4151647187347496e-06, + "loss": 3.0155, + "step": 78040 + }, + { + "epoch": 2.286691569618399, + "grad_norm": 21.722705841064453, + "learning_rate": 1.4140624322098933e-06, + "loss": 3.0522, + "step": 78050 + }, + { + "epoch": 2.286984545521131, + "grad_norm": 19.398542404174805, + "learning_rate": 1.4129605044368172e-06, + "loss": 3.0027, + "step": 78060 + }, + { + "epoch": 2.2872775214238628, + "grad_norm": 14.63221263885498, + "learning_rate": 1.4118589355257601e-06, + "loss": 3.0314, + "step": 78070 + }, + { + "epoch": 2.2875704973265947, + "grad_norm": 19.70339584350586, + "learning_rate": 1.4107577255869288e-06, + "loss": 3.0221, + "step": 78080 + }, + { + "epoch": 2.287863473229327, + "grad_norm": 16.71702766418457, + "learning_rate": 1.409656874730493e-06, + "loss": 3.0043, + "step": 78090 + }, + { + "epoch": 2.288156449132059, + "grad_norm": 17.364234924316406, + "learning_rate": 1.4085563830665855e-06, + "loss": 3.0299, + "step": 78100 + }, + { + "epoch": 2.288449425034791, + "grad_norm": 15.937032699584961, + "learning_rate": 1.407456250705307e-06, + "loss": 3.0313, + "step": 78110 + }, + { + "epoch": 2.288742400937523, + "grad_norm": 15.961935997009277, + "learning_rate": 1.4063564777567147e-06, + "loss": 3.0477, + "step": 78120 + }, + { + "epoch": 2.2890353768402547, + "grad_norm": 15.910840034484863, + "learning_rate": 1.4052570643308377e-06, + "loss": 3.0158, + "step": 78130 + }, + { + "epoch": 2.289328352742987, + "grad_norm": 17.28996467590332, + "learning_rate": 1.4041580105376635e-06, + "loss": 3.0313, + "step": 78140 + }, + { + "epoch": 2.289621328645719, + "grad_norm": 15.462076187133789, + "learning_rate": 1.403059316487148e-06, + "loss": 3.0298, + "step": 78150 + }, + { + "epoch": 2.289914304548451, + "grad_norm": 13.399137496948242, + "learning_rate": 1.4019609822892067e-06, + "loss": 
3.0219, + "step": 78160 + }, + { + "epoch": 2.290207280451183, + "grad_norm": 22.395566940307617, + "learning_rate": 1.4008630080537227e-06, + "loss": 3.0117, + "step": 78170 + }, + { + "epoch": 2.2905002563539147, + "grad_norm": 16.43474769592285, + "learning_rate": 1.399765393890542e-06, + "loss": 3.0093, + "step": 78180 + }, + { + "epoch": 2.2907932322566467, + "grad_norm": 15.102253913879395, + "learning_rate": 1.3986681399094754e-06, + "loss": 3.0259, + "step": 78190 + }, + { + "epoch": 2.291086208159379, + "grad_norm": 17.234535217285156, + "learning_rate": 1.3975712462202938e-06, + "loss": 3.0153, + "step": 78200 + }, + { + "epoch": 2.291379184062111, + "grad_norm": 21.655826568603516, + "learning_rate": 1.3964747129327388e-06, + "loss": 3.0181, + "step": 78210 + }, + { + "epoch": 2.291672159964843, + "grad_norm": 16.909746170043945, + "learning_rate": 1.3953785401565079e-06, + "loss": 3.0079, + "step": 78220 + }, + { + "epoch": 2.2919651358675748, + "grad_norm": 17.94141387939453, + "learning_rate": 1.3942827280012694e-06, + "loss": 3.0185, + "step": 78230 + }, + { + "epoch": 2.292258111770307, + "grad_norm": 21.103891372680664, + "learning_rate": 1.3931872765766514e-06, + "loss": 3.0288, + "step": 78240 + }, + { + "epoch": 2.292551087673039, + "grad_norm": 18.628398895263672, + "learning_rate": 1.3920921859922475e-06, + "loss": 3.0264, + "step": 78250 + }, + { + "epoch": 2.292844063575771, + "grad_norm": 21.423566818237305, + "learning_rate": 1.3909974563576152e-06, + "loss": 3.0096, + "step": 78260 + }, + { + "epoch": 2.293137039478503, + "grad_norm": 17.124195098876953, + "learning_rate": 1.3899030877822778e-06, + "loss": 3.0055, + "step": 78270 + }, + { + "epoch": 2.293430015381235, + "grad_norm": 17.869991302490234, + "learning_rate": 1.3888090803757166e-06, + "loss": 3.009, + "step": 78280 + }, + { + "epoch": 2.2937229912839667, + "grad_norm": 19.87061309814453, + "learning_rate": 1.3877154342473842e-06, + "loss": 3.0516, + "step": 78290 + }, + { + 
"epoch": 2.2940159671866986, + "grad_norm": 19.186752319335938, + "learning_rate": 1.38662214950669e-06, + "loss": 3.0208, + "step": 78300 + }, + { + "epoch": 2.294308943089431, + "grad_norm": 16.700651168823242, + "learning_rate": 1.3855292262630137e-06, + "loss": 3.0241, + "step": 78310 + }, + { + "epoch": 2.294601918992163, + "grad_norm": 17.21974754333496, + "learning_rate": 1.3844366646256935e-06, + "loss": 3.0089, + "step": 78320 + }, + { + "epoch": 2.294894894894895, + "grad_norm": 16.40537452697754, + "learning_rate": 1.383344464704034e-06, + "loss": 2.9885, + "step": 78330 + }, + { + "epoch": 2.295070680436534, + "eval_bleu": 0.3523267007949002, + "eval_cap_loss": 0.9034057855606079, + "eval_con_loss": 1.13706374168396, + "eval_loss": 3.1775331497192383, + "step": 78336 + }, + { + "epoch": 2.295070680436534, + "eval_bleu": 0.3523267007949002, + "eval_cap_loss": 0.9034057855606079, + "eval_con_loss": 1.13706374168396, + "eval_loss": 3.1775331497192383, + "eval_runtime": 53.4892, + "eval_samples_per_second": 373.908, + "eval_steps_per_second": 0.374, + "step": 78336 + }, + { + "epoch": 2.2951878707976268, + "grad_norm": 15.236075401306152, + "learning_rate": 1.3822526266073044e-06, + "loss": 3.0253, + "step": 78340 + }, + { + "epoch": 2.295480846700359, + "grad_norm": 22.241050720214844, + "learning_rate": 1.3811611504447369e-06, + "loss": 3.017, + "step": 78350 + }, + { + "epoch": 2.295773822603091, + "grad_norm": 16.537883758544922, + "learning_rate": 1.3800700363255258e-06, + "loss": 3.0338, + "step": 78360 + }, + { + "epoch": 2.296066798505823, + "grad_norm": 14.506319999694824, + "learning_rate": 1.3789792843588322e-06, + "loss": 3.0121, + "step": 78370 + }, + { + "epoch": 2.296359774408555, + "grad_norm": 17.575153350830078, + "learning_rate": 1.3778888946537772e-06, + "loss": 3.0271, + "step": 78380 + }, + { + "epoch": 2.296652750311287, + "grad_norm": 14.815241813659668, + "learning_rate": 1.3767988673194505e-06, + "loss": 3.0343, + "step": 78390 + 
}, + { + "epoch": 2.2969457262140187, + "grad_norm": 16.357059478759766, + "learning_rate": 1.3757092024649e-06, + "loss": 3.023, + "step": 78400 + }, + { + "epoch": 2.297238702116751, + "grad_norm": 20.426437377929688, + "learning_rate": 1.3746199001991412e-06, + "loss": 3.0081, + "step": 78410 + }, + { + "epoch": 2.297531678019483, + "grad_norm": 19.60353660583496, + "learning_rate": 1.3735309606311532e-06, + "loss": 3.0005, + "step": 78420 + }, + { + "epoch": 2.297824653922215, + "grad_norm": 15.338605880737305, + "learning_rate": 1.3724423838698786e-06, + "loss": 3.0351, + "step": 78430 + }, + { + "epoch": 2.298117629824947, + "grad_norm": 19.816064834594727, + "learning_rate": 1.3713541700242206e-06, + "loss": 3.0271, + "step": 78440 + }, + { + "epoch": 2.2984106057276787, + "grad_norm": 19.409372329711914, + "learning_rate": 1.3702663192030513e-06, + "loss": 3.0374, + "step": 78450 + }, + { + "epoch": 2.298703581630411, + "grad_norm": 17.568851470947266, + "learning_rate": 1.369178831515201e-06, + "loss": 3.0136, + "step": 78460 + }, + { + "epoch": 2.298996557533143, + "grad_norm": 17.530681610107422, + "learning_rate": 1.3680917070694677e-06, + "loss": 3.0275, + "step": 78470 + }, + { + "epoch": 2.299289533435875, + "grad_norm": 18.737285614013672, + "learning_rate": 1.367004945974613e-06, + "loss": 3.0117, + "step": 78480 + }, + { + "epoch": 2.299582509338607, + "grad_norm": 18.510772705078125, + "learning_rate": 1.3659185483393583e-06, + "loss": 3.0345, + "step": 78490 + }, + { + "epoch": 2.299875485241339, + "grad_norm": 15.64285659790039, + "learning_rate": 1.3648325142723923e-06, + "loss": 3.0186, + "step": 78500 + }, + { + "epoch": 2.3001684611440707, + "grad_norm": 21.362112045288086, + "learning_rate": 1.363746843882367e-06, + "loss": 2.9961, + "step": 78510 + }, + { + "epoch": 2.300461437046803, + "grad_norm": 15.648037910461426, + "learning_rate": 1.3626615372778978e-06, + "loss": 3.012, + "step": 78520 + }, + { + "epoch": 2.300754412949535, + 
"grad_norm": 19.148759841918945, + "learning_rate": 1.3615765945675612e-06, + "loss": 3.015, + "step": 78530 + }, + { + "epoch": 2.301047388852267, + "grad_norm": 21.785621643066406, + "learning_rate": 1.3604920158599016e-06, + "loss": 3.0115, + "step": 78540 + }, + { + "epoch": 2.301340364754999, + "grad_norm": 16.986738204956055, + "learning_rate": 1.3594078012634216e-06, + "loss": 3.0209, + "step": 78550 + }, + { + "epoch": 2.3016333406577307, + "grad_norm": 17.448444366455078, + "learning_rate": 1.3583239508865937e-06, + "loss": 3.0215, + "step": 78560 + }, + { + "epoch": 2.301926316560463, + "grad_norm": 17.06800651550293, + "learning_rate": 1.3572404648378479e-06, + "loss": 3.0443, + "step": 78570 + }, + { + "epoch": 2.302219292463195, + "grad_norm": 19.376585006713867, + "learning_rate": 1.356157343225582e-06, + "loss": 3.0223, + "step": 78580 + }, + { + "epoch": 2.302512268365927, + "grad_norm": 16.723060607910156, + "learning_rate": 1.3550745861581555e-06, + "loss": 3.036, + "step": 78590 + }, + { + "epoch": 2.302805244268659, + "grad_norm": 13.62204360961914, + "learning_rate": 1.3539921937438938e-06, + "loss": 3.013, + "step": 78600 + }, + { + "epoch": 2.3030982201713908, + "grad_norm": 18.2899227142334, + "learning_rate": 1.3529101660910804e-06, + "loss": 3.0139, + "step": 78610 + }, + { + "epoch": 2.3033911960741227, + "grad_norm": 17.191776275634766, + "learning_rate": 1.3518285033079688e-06, + "loss": 3.0225, + "step": 78620 + }, + { + "epoch": 2.303684171976855, + "grad_norm": 18.04080581665039, + "learning_rate": 1.3507472055027698e-06, + "loss": 3.0203, + "step": 78630 + }, + { + "epoch": 2.303977147879587, + "grad_norm": 17.14531135559082, + "learning_rate": 1.3496662727836645e-06, + "loss": 3.0156, + "step": 78640 + }, + { + "epoch": 2.304270123782319, + "grad_norm": 19.669809341430664, + "learning_rate": 1.3485857052587908e-06, + "loss": 3.0043, + "step": 78650 + }, + { + "epoch": 2.304563099685051, + "grad_norm": 18.508604049682617, + 
"learning_rate": 1.3475055030362538e-06, + "loss": 3.0207, + "step": 78660 + }, + { + "epoch": 2.3048560755877827, + "grad_norm": 18.645524978637695, + "learning_rate": 1.3464256662241216e-06, + "loss": 3.0128, + "step": 78670 + }, + { + "epoch": 2.305149051490515, + "grad_norm": 16.632707595825195, + "learning_rate": 1.3453461949304275e-06, + "loss": 3.041, + "step": 78680 + }, + { + "epoch": 2.305442027393247, + "grad_norm": 21.39092445373535, + "learning_rate": 1.344267089263162e-06, + "loss": 3.014, + "step": 78690 + }, + { + "epoch": 2.305735003295979, + "grad_norm": 17.9338321685791, + "learning_rate": 1.3431883493302877e-06, + "loss": 3.0047, + "step": 78700 + }, + { + "epoch": 2.306027979198711, + "grad_norm": 18.643566131591797, + "learning_rate": 1.342109975239722e-06, + "loss": 3.0338, + "step": 78710 + }, + { + "epoch": 2.3063209551014427, + "grad_norm": 19.37554359436035, + "learning_rate": 1.3410319670993538e-06, + "loss": 2.9951, + "step": 78720 + }, + { + "epoch": 2.3066139310041747, + "grad_norm": 19.044605255126953, + "learning_rate": 1.3399543250170278e-06, + "loss": 3.0489, + "step": 78730 + }, + { + "epoch": 2.306906906906907, + "grad_norm": 15.664886474609375, + "learning_rate": 1.3388770491005564e-06, + "loss": 3.0212, + "step": 78740 + }, + { + "epoch": 2.307199882809639, + "grad_norm": 15.08438491821289, + "learning_rate": 1.3378001394577166e-06, + "loss": 3.0154, + "step": 78750 + }, + { + "epoch": 2.307492858712371, + "grad_norm": 16.014551162719727, + "learning_rate": 1.3367235961962466e-06, + "loss": 3.0134, + "step": 78760 + }, + { + "epoch": 2.307785834615103, + "grad_norm": 20.505582809448242, + "learning_rate": 1.3356474194238461e-06, + "loss": 3.0453, + "step": 78770 + }, + { + "epoch": 2.308078810517835, + "grad_norm": 20.653682708740234, + "learning_rate": 1.3345716092481831e-06, + "loss": 3.0213, + "step": 78780 + }, + { + "epoch": 2.308371786420567, + "grad_norm": 17.71834373474121, + "learning_rate": 1.3334961657768825e-06, + 
"loss": 3.0305, + "step": 78790 + }, + { + "epoch": 2.308664762323299, + "grad_norm": 18.739974975585938, + "learning_rate": 1.3324210891175393e-06, + "loss": 3.0165, + "step": 78800 + }, + { + "epoch": 2.308957738226031, + "grad_norm": 19.92327880859375, + "learning_rate": 1.331346379377706e-06, + "loss": 3.0436, + "step": 78810 + }, + { + "epoch": 2.309250714128763, + "grad_norm": 17.577274322509766, + "learning_rate": 1.330272036664902e-06, + "loss": 3.045, + "step": 78820 + }, + { + "epoch": 2.3095436900314947, + "grad_norm": 20.193359375, + "learning_rate": 1.3291980610866089e-06, + "loss": 3.0327, + "step": 78830 + }, + { + "epoch": 2.309836665934227, + "grad_norm": 23.38938331604004, + "learning_rate": 1.3281244527502717e-06, + "loss": 3.0272, + "step": 78840 + }, + { + "epoch": 2.3100710466564127, + "eval_bleu": 0.35220097173011655, + "eval_cap_loss": 0.9031902551651001, + "eval_con_loss": 1.1356689929962158, + "eval_loss": 3.174528121948242, + "step": 78848 + }, + { + "epoch": 2.3100710466564127, + "eval_bleu": 0.35220097173011655, + "eval_cap_loss": 0.9031902551651001, + "eval_con_loss": 1.1356689929962158, + "eval_loss": 3.174528121948242, + "eval_runtime": 61.7374, + "eval_samples_per_second": 323.953, + "eval_steps_per_second": 0.324, + "step": 78848 + }, + { + "epoch": 2.310129641836959, + "grad_norm": 18.902286529541016, + "learning_rate": 1.3270512117633e-06, + "loss": 3.0431, + "step": 78850 + }, + { + "epoch": 2.310422617739691, + "grad_norm": 17.506399154663086, + "learning_rate": 1.325978338233062e-06, + "loss": 3.0026, + "step": 78860 + }, + { + "epoch": 2.310715593642423, + "grad_norm": 17.40419578552246, + "learning_rate": 1.3249058322668956e-06, + "loss": 3.0102, + "step": 78870 + }, + { + "epoch": 2.3110085695451548, + "grad_norm": 19.949228286743164, + "learning_rate": 1.3238336939720957e-06, + "loss": 3.019, + "step": 78880 + }, + { + "epoch": 2.311301545447887, + "grad_norm": 17.381784439086914, + "learning_rate": 1.3227619234559264e-06, 
+ "loss": 3.006, + "step": 78890 + }, + { + "epoch": 2.311594521350619, + "grad_norm": 17.27145004272461, + "learning_rate": 1.3216905208256086e-06, + "loss": 3.0211, + "step": 78900 + }, + { + "epoch": 2.311887497253351, + "grad_norm": 17.46925163269043, + "learning_rate": 1.320619486188332e-06, + "loss": 3.0394, + "step": 78910 + }, + { + "epoch": 2.312180473156083, + "grad_norm": 18.949058532714844, + "learning_rate": 1.3195488196512463e-06, + "loss": 3.0232, + "step": 78920 + }, + { + "epoch": 2.312473449058815, + "grad_norm": 20.050613403320312, + "learning_rate": 1.3184785213214673e-06, + "loss": 3.0428, + "step": 78930 + }, + { + "epoch": 2.3127664249615467, + "grad_norm": 20.60785675048828, + "learning_rate": 1.3174085913060692e-06, + "loss": 3.0153, + "step": 78940 + }, + { + "epoch": 2.313059400864279, + "grad_norm": 16.011510848999023, + "learning_rate": 1.316339029712095e-06, + "loss": 3.0226, + "step": 78950 + }, + { + "epoch": 2.313352376767011, + "grad_norm": 16.11046028137207, + "learning_rate": 1.3152698366465449e-06, + "loss": 3.0307, + "step": 78960 + }, + { + "epoch": 2.313645352669743, + "grad_norm": 20.242341995239258, + "learning_rate": 1.3142010122163885e-06, + "loss": 3.02, + "step": 78970 + }, + { + "epoch": 2.313938328572475, + "grad_norm": 15.747589111328125, + "learning_rate": 1.3131325565285519e-06, + "loss": 3.0298, + "step": 78980 + }, + { + "epoch": 2.3142313044752068, + "grad_norm": 17.97988510131836, + "learning_rate": 1.3120644696899298e-06, + "loss": 3.001, + "step": 78990 + }, + { + "epoch": 2.314524280377939, + "grad_norm": 17.443191528320312, + "learning_rate": 1.310996751807378e-06, + "loss": 3.012, + "step": 79000 + }, + { + "epoch": 2.314817256280671, + "grad_norm": 18.645498275756836, + "learning_rate": 1.3099294029877163e-06, + "loss": 3.0262, + "step": 79010 + }, + { + "epoch": 2.315110232183403, + "grad_norm": 19.596017837524414, + "learning_rate": 1.308862423337724e-06, + "loss": 3.0089, + "step": 79020 + }, + { + 
"epoch": 2.315403208086135, + "grad_norm": 20.496536254882812, + "learning_rate": 1.3077958129641489e-06, + "loss": 3.0442, + "step": 79030 + }, + { + "epoch": 2.315696183988867, + "grad_norm": 18.677640914916992, + "learning_rate": 1.306729571973696e-06, + "loss": 3.0203, + "step": 79040 + }, + { + "epoch": 2.3159891598915987, + "grad_norm": 20.57805633544922, + "learning_rate": 1.3056637004730394e-06, + "loss": 3.0121, + "step": 79050 + }, + { + "epoch": 2.316282135794331, + "grad_norm": 15.81386947631836, + "learning_rate": 1.3045981985688106e-06, + "loss": 3.0225, + "step": 79060 + }, + { + "epoch": 2.316575111697063, + "grad_norm": 19.159229278564453, + "learning_rate": 1.3035330663676083e-06, + "loss": 3.0233, + "step": 79070 + }, + { + "epoch": 2.316868087599795, + "grad_norm": 20.50929069519043, + "learning_rate": 1.3024683039759917e-06, + "loss": 3.0326, + "step": 79080 + }, + { + "epoch": 2.317161063502527, + "grad_norm": 19.50806999206543, + "learning_rate": 1.3014039115004862e-06, + "loss": 3.0385, + "step": 79090 + }, + { + "epoch": 2.3174540394052587, + "grad_norm": 17.64889907836914, + "learning_rate": 1.300339889047575e-06, + "loss": 3.0125, + "step": 79100 + }, + { + "epoch": 2.317747015307991, + "grad_norm": 19.377967834472656, + "learning_rate": 1.29927623672371e-06, + "loss": 3.028, + "step": 79110 + }, + { + "epoch": 2.318039991210723, + "grad_norm": 18.11537742614746, + "learning_rate": 1.2982129546353e-06, + "loss": 3.037, + "step": 79120 + }, + { + "epoch": 2.318332967113455, + "grad_norm": 18.175554275512695, + "learning_rate": 1.297150042888724e-06, + "loss": 3.0053, + "step": 79130 + }, + { + "epoch": 2.318625943016187, + "grad_norm": 14.595681190490723, + "learning_rate": 1.2960875015903164e-06, + "loss": 3.0242, + "step": 79140 + }, + { + "epoch": 2.3189189189189188, + "grad_norm": 13.935908317565918, + "learning_rate": 1.2950253308463795e-06, + "loss": 3.0181, + "step": 79150 + }, + { + "epoch": 2.3192118948216507, + "grad_norm": 
19.573461532592773, + "learning_rate": 1.2939635307631776e-06, + "loss": 3.0199, + "step": 79160 + }, + { + "epoch": 2.319504870724383, + "grad_norm": 20.703306198120117, + "learning_rate": 1.2929021014469385e-06, + "loss": 3.0004, + "step": 79170 + }, + { + "epoch": 2.319797846627115, + "grad_norm": 20.733694076538086, + "learning_rate": 1.2918410430038491e-06, + "loss": 3.0211, + "step": 79180 + }, + { + "epoch": 2.320090822529847, + "grad_norm": 18.177263259887695, + "learning_rate": 1.290780355540064e-06, + "loss": 3.0457, + "step": 79190 + }, + { + "epoch": 2.320383798432579, + "grad_norm": 20.30535888671875, + "learning_rate": 1.2897200391617e-06, + "loss": 3.0399, + "step": 79200 + }, + { + "epoch": 2.320676774335311, + "grad_norm": 21.03881072998047, + "learning_rate": 1.2886600939748318e-06, + "loss": 3.0106, + "step": 79210 + }, + { + "epoch": 2.320969750238043, + "grad_norm": 18.32990837097168, + "learning_rate": 1.2876005200855035e-06, + "loss": 3.0363, + "step": 79220 + }, + { + "epoch": 2.321262726140775, + "grad_norm": 17.436975479125977, + "learning_rate": 1.2865413175997177e-06, + "loss": 3.0254, + "step": 79230 + }, + { + "epoch": 2.321555702043507, + "grad_norm": 21.482410430908203, + "learning_rate": 1.2854824866234417e-06, + "loss": 3.0429, + "step": 79240 + }, + { + "epoch": 2.321848677946239, + "grad_norm": 16.690683364868164, + "learning_rate": 1.2844240272626052e-06, + "loss": 3.0243, + "step": 79250 + }, + { + "epoch": 2.3221416538489708, + "grad_norm": 19.16546630859375, + "learning_rate": 1.2833659396231025e-06, + "loss": 3.0063, + "step": 79260 + }, + { + "epoch": 2.3224346297517027, + "grad_norm": 21.65381622314453, + "learning_rate": 1.282308223810786e-06, + "loss": 2.9981, + "step": 79270 + }, + { + "epoch": 2.322727605654435, + "grad_norm": 18.652124404907227, + "learning_rate": 1.2812508799314772e-06, + "loss": 3.0078, + "step": 79280 + }, + { + "epoch": 2.323020581557167, + "grad_norm": 16.226064682006836, + "learning_rate": 
1.280193908090953e-06, + "loss": 3.0089, + "step": 79290 + }, + { + "epoch": 2.323313557459899, + "grad_norm": 16.841493606567383, + "learning_rate": 1.2791373083949621e-06, + "loss": 3.0432, + "step": 79300 + }, + { + "epoch": 2.323606533362631, + "grad_norm": 20.40160369873047, + "learning_rate": 1.2780810809492062e-06, + "loss": 3.0162, + "step": 79310 + }, + { + "epoch": 2.323899509265363, + "grad_norm": 20.411157608032227, + "learning_rate": 1.2770252258593568e-06, + "loss": 3.0258, + "step": 79320 + }, + { + "epoch": 2.324192485168095, + "grad_norm": 19.32838249206543, + "learning_rate": 1.2759697432310458e-06, + "loss": 3.0373, + "step": 79330 + }, + { + "epoch": 2.324485461070827, + "grad_norm": 17.632036209106445, + "learning_rate": 1.2749146331698698e-06, + "loss": 3.0306, + "step": 79340 + }, + { + "epoch": 2.324778436973559, + "grad_norm": 16.400657653808594, + "learning_rate": 1.2738598957813831e-06, + "loss": 3.017, + "step": 79350 + }, + { + "epoch": 2.325071412876291, + "grad_norm": 16.678136825561523, + "learning_rate": 1.2728055311711085e-06, + "loss": 3.0398, + "step": 79360 + }, + { + "epoch": 2.325071412876291, + "eval_bleu": 0.3525017502447413, + "eval_cap_loss": 0.9029959440231323, + "eval_con_loss": 1.1366126537322998, + "eval_loss": 3.1762213706970215, + "step": 79360 + }, + { + "epoch": 2.325071412876291, + "eval_bleu": 0.3525017502447413, + "eval_cap_loss": 0.9029959440231323, + "eval_con_loss": 1.1366126537322998, + "eval_loss": 3.1762213706970215, + "eval_runtime": 52.883, + "eval_samples_per_second": 378.194, + "eval_steps_per_second": 0.378, + "step": 79360 + }, + { + "epoch": 2.3253643887790227, + "grad_norm": 17.292314529418945, + "learning_rate": 1.2717515394445268e-06, + "loss": 3.0079, + "step": 79370 + }, + { + "epoch": 2.325657364681755, + "grad_norm": 18.8040771484375, + "learning_rate": 1.2706979207070858e-06, + "loss": 3.0002, + "step": 79380 + }, + { + "epoch": 2.325950340584487, + "grad_norm": 18.08357810974121, + 
"learning_rate": 1.2696446750641912e-06, + "loss": 3.0277, + "step": 79390 + }, + { + "epoch": 2.326243316487219, + "grad_norm": 20.102066040039062, + "learning_rate": 1.2685918026212158e-06, + "loss": 2.9976, + "step": 79400 + }, + { + "epoch": 2.326536292389951, + "grad_norm": 18.28993034362793, + "learning_rate": 1.2675393034834926e-06, + "loss": 3.0117, + "step": 79410 + }, + { + "epoch": 2.3268292682926828, + "grad_norm": 15.882536888122559, + "learning_rate": 1.2664871777563191e-06, + "loss": 3.034, + "step": 79420 + }, + { + "epoch": 2.327122244195415, + "grad_norm": 18.75213623046875, + "learning_rate": 1.2654354255449525e-06, + "loss": 3.0015, + "step": 79430 + }, + { + "epoch": 2.327415220098147, + "grad_norm": 20.17206382751465, + "learning_rate": 1.2643840469546166e-06, + "loss": 3.0233, + "step": 79440 + }, + { + "epoch": 2.327708196000879, + "grad_norm": 16.231332778930664, + "learning_rate": 1.2633330420904926e-06, + "loss": 3.0078, + "step": 79450 + }, + { + "epoch": 2.328001171903611, + "grad_norm": 23.35965919494629, + "learning_rate": 1.2622824110577304e-06, + "loss": 3.0089, + "step": 79460 + }, + { + "epoch": 2.328294147806343, + "grad_norm": 19.778432846069336, + "learning_rate": 1.261232153961436e-06, + "loss": 3.0221, + "step": 79470 + }, + { + "epoch": 2.3285871237090747, + "grad_norm": 19.090063095092773, + "learning_rate": 1.2601822709066836e-06, + "loss": 3.0309, + "step": 79480 + }, + { + "epoch": 2.328880099611807, + "grad_norm": 20.553926467895508, + "learning_rate": 1.2591327619985073e-06, + "loss": 3.0131, + "step": 79490 + }, + { + "epoch": 2.329173075514539, + "grad_norm": 20.19426155090332, + "learning_rate": 1.2580836273419056e-06, + "loss": 3.0095, + "step": 79500 + }, + { + "epoch": 2.329466051417271, + "grad_norm": 16.684335708618164, + "learning_rate": 1.257034867041836e-06, + "loss": 3.0248, + "step": 79510 + }, + { + "epoch": 2.329759027320003, + "grad_norm": 22.971799850463867, + "learning_rate": 1.2559864812032224e-06, + 
"loss": 3.0405, + "step": 79520 + }, + { + "epoch": 2.3300520032227348, + "grad_norm": 17.31688690185547, + "learning_rate": 1.254938469930948e-06, + "loss": 3.0186, + "step": 79530 + }, + { + "epoch": 2.330344979125467, + "grad_norm": 21.186948776245117, + "learning_rate": 1.253890833329861e-06, + "loss": 3.0322, + "step": 79540 + }, + { + "epoch": 2.330637955028199, + "grad_norm": 20.53605842590332, + "learning_rate": 1.2528435715047726e-06, + "loss": 3.0362, + "step": 79550 + }, + { + "epoch": 2.330930930930931, + "grad_norm": 22.189708709716797, + "learning_rate": 1.2517966845604506e-06, + "loss": 3.0288, + "step": 79560 + }, + { + "epoch": 2.331223906833663, + "grad_norm": 17.658037185668945, + "learning_rate": 1.2507501726016363e-06, + "loss": 3.0166, + "step": 79570 + }, + { + "epoch": 2.331516882736395, + "grad_norm": 19.9129581451416, + "learning_rate": 1.249704035733022e-06, + "loss": 3.006, + "step": 79580 + }, + { + "epoch": 2.3318098586391267, + "grad_norm": 17.741666793823242, + "learning_rate": 1.2486582740592705e-06, + "loss": 3.0084, + "step": 79590 + }, + { + "epoch": 2.332102834541859, + "grad_norm": 18.831947326660156, + "learning_rate": 1.2476128876850012e-06, + "loss": 3.0108, + "step": 79600 + }, + { + "epoch": 2.332395810444591, + "grad_norm": 15.4467134475708, + "learning_rate": 1.2465678767148026e-06, + "loss": 3.0052, + "step": 79610 + }, + { + "epoch": 2.332688786347323, + "grad_norm": 19.867464065551758, + "learning_rate": 1.2455232412532176e-06, + "loss": 3.0241, + "step": 79620 + }, + { + "epoch": 2.332981762250055, + "grad_norm": 15.425268173217773, + "learning_rate": 1.2444789814047587e-06, + "loss": 3.0296, + "step": 79630 + }, + { + "epoch": 2.3332747381527867, + "grad_norm": 16.073396682739258, + "learning_rate": 1.2434350972738946e-06, + "loss": 3.0189, + "step": 79640 + }, + { + "epoch": 2.333567714055519, + "grad_norm": 19.32401466369629, + "learning_rate": 1.242391588965065e-06, + "loss": 3.0192, + "step": 79650 + }, + { + 
"epoch": 2.333860689958251, + "grad_norm": 16.856264114379883, + "learning_rate": 1.2413484565826622e-06, + "loss": 3.0313, + "step": 79660 + }, + { + "epoch": 2.334153665860983, + "grad_norm": 18.708539962768555, + "learning_rate": 1.2403057002310487e-06, + "loss": 3.0236, + "step": 79670 + }, + { + "epoch": 2.334446641763715, + "grad_norm": 18.851455688476562, + "learning_rate": 1.2392633200145426e-06, + "loss": 3.0352, + "step": 79680 + }, + { + "epoch": 2.3347396176664468, + "grad_norm": 18.563335418701172, + "learning_rate": 1.2382213160374313e-06, + "loss": 3.0029, + "step": 79690 + }, + { + "epoch": 2.3350325935691787, + "grad_norm": 17.853620529174805, + "learning_rate": 1.2371796884039578e-06, + "loss": 3.0229, + "step": 79700 + }, + { + "epoch": 2.335325569471911, + "grad_norm": 15.811232566833496, + "learning_rate": 1.2361384372183343e-06, + "loss": 3.0112, + "step": 79710 + }, + { + "epoch": 2.335618545374643, + "grad_norm": 20.657875061035156, + "learning_rate": 1.2350975625847266e-06, + "loss": 2.9965, + "step": 79720 + }, + { + "epoch": 2.335911521277375, + "grad_norm": 19.990169525146484, + "learning_rate": 1.2340570646072746e-06, + "loss": 3.0443, + "step": 79730 + }, + { + "epoch": 2.336204497180107, + "grad_norm": 19.252290725708008, + "learning_rate": 1.233016943390069e-06, + "loss": 3.0306, + "step": 79740 + }, + { + "epoch": 2.336497473082839, + "grad_norm": 23.124372482299805, + "learning_rate": 1.231977199037171e-06, + "loss": 3.0315, + "step": 79750 + }, + { + "epoch": 2.336790448985571, + "grad_norm": 16.251325607299805, + "learning_rate": 1.2309378316525976e-06, + "loss": 3.0149, + "step": 79760 + }, + { + "epoch": 2.337083424888303, + "grad_norm": 19.917783737182617, + "learning_rate": 1.229898841340335e-06, + "loss": 3.0182, + "step": 79770 + }, + { + "epoch": 2.337376400791035, + "grad_norm": 16.601177215576172, + "learning_rate": 1.2288602282043244e-06, + "loss": 3.0328, + "step": 79780 + }, + { + "epoch": 2.337669376693767, + 
"grad_norm": 20.136714935302734, + "learning_rate": 1.2278219923484747e-06, + "loss": 3.0233, + "step": 79790 + }, + { + "epoch": 2.3379623525964988, + "grad_norm": 19.8515625, + "learning_rate": 1.2267841338766552e-06, + "loss": 3.0294, + "step": 79800 + }, + { + "epoch": 2.338255328499231, + "grad_norm": 18.19293785095215, + "learning_rate": 1.2257466528926982e-06, + "loss": 3.025, + "step": 79810 + }, + { + "epoch": 2.338548304401963, + "grad_norm": 15.067960739135742, + "learning_rate": 1.2247095495003958e-06, + "loss": 3.014, + "step": 79820 + }, + { + "epoch": 2.338841280304695, + "grad_norm": 19.201425552368164, + "learning_rate": 1.223672823803506e-06, + "loss": 3.0388, + "step": 79830 + }, + { + "epoch": 2.339134256207427, + "grad_norm": 16.61728858947754, + "learning_rate": 1.2226364759057453e-06, + "loss": 3.0101, + "step": 79840 + }, + { + "epoch": 2.339427232110159, + "grad_norm": 16.98577880859375, + "learning_rate": 1.2216005059107956e-06, + "loss": 3.005, + "step": 79850 + }, + { + "epoch": 2.339720208012891, + "grad_norm": 15.817618370056152, + "learning_rate": 1.2205649139222974e-06, + "loss": 3.0175, + "step": 79860 + }, + { + "epoch": 2.340013183915623, + "grad_norm": 16.218236923217773, + "learning_rate": 1.2195297000438571e-06, + "loss": 3.0228, + "step": 79870 + }, + { + "epoch": 2.3400717790961694, + "eval_bleu": 0.3525632218099351, + "eval_cap_loss": 0.9029735326766968, + "eval_con_loss": 1.1369636058807373, + "eval_loss": 3.176900863647461, + "step": 79872 + }, + { + "epoch": 2.3400717790961694, + "eval_bleu": 0.3525632218099351, + "eval_cap_loss": 0.9029735326766968, + "eval_con_loss": 1.1369636058807373, + "eval_loss": 3.176900863647461, + "eval_runtime": 54.8329, + "eval_samples_per_second": 364.745, + "eval_steps_per_second": 0.365, + "step": 79872 + }, + { + "epoch": 2.340306159818355, + "grad_norm": 19.608192443847656, + "learning_rate": 1.2184948643790418e-06, + "loss": 3.0212, + "step": 79880 + }, + { + "epoch": 2.340599135721087, 
+ "grad_norm": 17.99622917175293, + "learning_rate": 1.2174604070313811e-06, + "loss": 3.0344, + "step": 79890 + }, + { + "epoch": 2.340892111623819, + "grad_norm": 18.254901885986328, + "learning_rate": 1.2164263281043647e-06, + "loss": 3.0141, + "step": 79900 + }, + { + "epoch": 2.3411850875265507, + "grad_norm": 18.543954849243164, + "learning_rate": 1.2153926277014471e-06, + "loss": 3.0213, + "step": 79910 + }, + { + "epoch": 2.341478063429283, + "grad_norm": 19.323179244995117, + "learning_rate": 1.2143593059260455e-06, + "loss": 3.0273, + "step": 79920 + }, + { + "epoch": 2.341771039332015, + "grad_norm": 16.960832595825195, + "learning_rate": 1.2133263628815339e-06, + "loss": 3.0322, + "step": 79930 + }, + { + "epoch": 2.342064015234747, + "grad_norm": 19.757768630981445, + "learning_rate": 1.2122937986712558e-06, + "loss": 3.0206, + "step": 79940 + }, + { + "epoch": 2.342356991137479, + "grad_norm": 19.652372360229492, + "learning_rate": 1.21126161339851e-06, + "loss": 3.0109, + "step": 79950 + }, + { + "epoch": 2.342649967040211, + "grad_norm": 19.041397094726562, + "learning_rate": 1.210229807166562e-06, + "loss": 3.01, + "step": 79960 + }, + { + "epoch": 2.342942942942943, + "grad_norm": 17.880617141723633, + "learning_rate": 1.2091983800786377e-06, + "loss": 3.0576, + "step": 79970 + }, + { + "epoch": 2.343235918845675, + "grad_norm": 16.834999084472656, + "learning_rate": 1.2081673322379267e-06, + "loss": 3.0223, + "step": 79980 + }, + { + "epoch": 2.343528894748407, + "grad_norm": 20.210771560668945, + "learning_rate": 1.2071366637475768e-06, + "loss": 3.0297, + "step": 79990 + }, + { + "epoch": 2.343821870651139, + "grad_norm": 20.539371490478516, + "learning_rate": 1.2061063747107026e-06, + "loss": 3.0162, + "step": 80000 + }, + { + "epoch": 2.344114846553871, + "grad_norm": 21.96381950378418, + "learning_rate": 1.2050764652303764e-06, + "loss": 3.0145, + "step": 80010 + }, + { + "epoch": 2.3444078224566027, + "grad_norm": 15.88278579711914, + 
"learning_rate": 1.2040469354096363e-06, + "loss": 3.0247, + "step": 80020 + }, + { + "epoch": 2.344700798359335, + "grad_norm": 19.449384689331055, + "learning_rate": 1.2031206832650423e-06, + "loss": 3.0241, + "step": 80030 + }, + { + "epoch": 2.344993774262067, + "grad_norm": 19.50215721130371, + "learning_rate": 1.2020918750812433e-06, + "loss": 3.0448, + "step": 80040 + }, + { + "epoch": 2.345286750164799, + "grad_norm": 19.869569778442383, + "learning_rate": 1.2010634468556183e-06, + "loss": 3.0207, + "step": 80050 + }, + { + "epoch": 2.345579726067531, + "grad_norm": 19.365352630615234, + "learning_rate": 1.2000353986910584e-06, + "loss": 3.0269, + "step": 80060 + }, + { + "epoch": 2.3458727019702628, + "grad_norm": 18.307857513427734, + "learning_rate": 1.1990077306904112e-06, + "loss": 3.0171, + "step": 80070 + }, + { + "epoch": 2.346165677872995, + "grad_norm": 19.34336280822754, + "learning_rate": 1.1979804429564901e-06, + "loss": 3.0149, + "step": 80080 + }, + { + "epoch": 2.346458653775727, + "grad_norm": 17.214181900024414, + "learning_rate": 1.1969535355920687e-06, + "loss": 2.988, + "step": 80090 + }, + { + "epoch": 2.346751629678459, + "grad_norm": 17.646984100341797, + "learning_rate": 1.1959270086998853e-06, + "loss": 3.0241, + "step": 80100 + }, + { + "epoch": 2.347044605581191, + "grad_norm": 17.779647827148438, + "learning_rate": 1.1949008623826342e-06, + "loss": 3.0184, + "step": 80110 + }, + { + "epoch": 2.347337581483923, + "grad_norm": 19.148296356201172, + "learning_rate": 1.1938750967429785e-06, + "loss": 3.0254, + "step": 80120 + }, + { + "epoch": 2.3476305573866547, + "grad_norm": 17.876693725585938, + "learning_rate": 1.1928497118835363e-06, + "loss": 3.0204, + "step": 80130 + }, + { + "epoch": 2.347923533289387, + "grad_norm": 19.066499710083008, + "learning_rate": 1.1918247079068956e-06, + "loss": 3.0179, + "step": 80140 + }, + { + "epoch": 2.348216509192119, + "grad_norm": 22.441417694091797, + "learning_rate": 
1.1908000849155981e-06, + "loss": 3.0083, + "step": 80150 + }, + { + "epoch": 2.348509485094851, + "grad_norm": 19.214557647705078, + "learning_rate": 1.1897758430121532e-06, + "loss": 3.0069, + "step": 80160 + }, + { + "epoch": 2.348802460997583, + "grad_norm": 14.903095245361328, + "learning_rate": 1.1887519822990296e-06, + "loss": 3.0284, + "step": 80170 + }, + { + "epoch": 2.349095436900315, + "grad_norm": 20.075578689575195, + "learning_rate": 1.18772850287866e-06, + "loss": 3.0006, + "step": 80180 + }, + { + "epoch": 2.349388412803047, + "grad_norm": 19.871923446655273, + "learning_rate": 1.186705404853435e-06, + "loss": 3.0113, + "step": 80190 + }, + { + "epoch": 2.349681388705779, + "grad_norm": 21.34770393371582, + "learning_rate": 1.1856826883257121e-06, + "loss": 3.0254, + "step": 80200 + }, + { + "epoch": 2.349974364608511, + "grad_norm": 13.53134822845459, + "learning_rate": 1.1846603533978058e-06, + "loss": 3.0053, + "step": 80210 + }, + { + "epoch": 2.350267340511243, + "grad_norm": 18.574989318847656, + "learning_rate": 1.1836384001719963e-06, + "loss": 3.0216, + "step": 80220 + }, + { + "epoch": 2.350560316413975, + "grad_norm": 19.351795196533203, + "learning_rate": 1.1826168287505218e-06, + "loss": 3.014, + "step": 80230 + }, + { + "epoch": 2.350853292316707, + "grad_norm": 18.358945846557617, + "learning_rate": 1.181595639235586e-06, + "loss": 3.0271, + "step": 80240 + }, + { + "epoch": 2.351146268219439, + "grad_norm": 18.072011947631836, + "learning_rate": 1.1805748317293526e-06, + "loss": 3.0004, + "step": 80250 + }, + { + "epoch": 2.351439244122171, + "grad_norm": 14.727603912353516, + "learning_rate": 1.1795544063339475e-06, + "loss": 3.0157, + "step": 80260 + }, + { + "epoch": 2.351732220024903, + "grad_norm": 18.61092185974121, + "learning_rate": 1.1785343631514595e-06, + "loss": 3.0202, + "step": 80270 + }, + { + "epoch": 2.352025195927635, + "grad_norm": 18.409812927246094, + "learning_rate": 1.177514702283935e-06, + "loss": 3.016, + 
"step": 80280 + }, + { + "epoch": 2.352318171830367, + "grad_norm": 16.806591033935547, + "learning_rate": 1.1764954238333882e-06, + "loss": 3.0119, + "step": 80290 + }, + { + "epoch": 2.352611147733099, + "grad_norm": 17.854190826416016, + "learning_rate": 1.1754765279017882e-06, + "loss": 3.0283, + "step": 80300 + }, + { + "epoch": 2.352904123635831, + "grad_norm": 16.546377182006836, + "learning_rate": 1.1744580145910732e-06, + "loss": 3.031, + "step": 80310 + }, + { + "epoch": 2.353197099538563, + "grad_norm": 17.889907836914062, + "learning_rate": 1.1734398840031363e-06, + "loss": 3.0008, + "step": 80320 + }, + { + "epoch": 2.353490075441295, + "grad_norm": 20.78096580505371, + "learning_rate": 1.172422136239837e-06, + "loss": 3.0378, + "step": 80330 + }, + { + "epoch": 2.3537830513440268, + "grad_norm": 20.761425018310547, + "learning_rate": 1.171404771402994e-06, + "loss": 3.0123, + "step": 80340 + }, + { + "epoch": 2.354076027246759, + "grad_norm": 22.124027252197266, + "learning_rate": 1.1703877895943915e-06, + "loss": 3.0185, + "step": 80350 + }, + { + "epoch": 2.354369003149491, + "grad_norm": 16.78768539428711, + "learning_rate": 1.1693711909157685e-06, + "loss": 3.0036, + "step": 80360 + }, + { + "epoch": 2.354661979052223, + "grad_norm": 17.57758903503418, + "learning_rate": 1.1683549754688333e-06, + "loss": 3.0215, + "step": 80370 + }, + { + "epoch": 2.354954954954955, + "grad_norm": 17.463802337646484, + "learning_rate": 1.1673391433552488e-06, + "loss": 3.0052, + "step": 80380 + }, + { + "epoch": 2.355072145316048, + "eval_bleu": 0.35209847953173035, + "eval_cap_loss": 0.9027011394500732, + "eval_con_loss": 1.1355726718902588, + "eval_loss": 3.17384672164917, + "step": 80384 + }, + { + "epoch": 2.355072145316048, + "eval_bleu": 0.35209847953173035, + "eval_cap_loss": 0.9027011394500732, + "eval_con_loss": 1.1355726718902588, + "eval_loss": 3.17384672164917, + "eval_runtime": 53.5628, + "eval_samples_per_second": 373.393, + "eval_steps_per_second": 
0.373, + "step": 80384 + }, + { + "epoch": 2.355247930857687, + "grad_norm": 20.40073013305664, + "learning_rate": 1.166323694676646e-06, + "loss": 3.0096, + "step": 80390 + }, + { + "epoch": 2.355540906760419, + "grad_norm": 19.021774291992188, + "learning_rate": 1.1653086295346116e-06, + "loss": 3.0362, + "step": 80400 + }, + { + "epoch": 2.355833882663151, + "grad_norm": 19.13472557067871, + "learning_rate": 1.1642939480306987e-06, + "loss": 3.0082, + "step": 80410 + }, + { + "epoch": 2.356126858565883, + "grad_norm": 17.042226791381836, + "learning_rate": 1.1632796502664196e-06, + "loss": 3.0197, + "step": 80420 + }, + { + "epoch": 2.356419834468615, + "grad_norm": 20.650367736816406, + "learning_rate": 1.1622657363432504e-06, + "loss": 3.0248, + "step": 80430 + }, + { + "epoch": 2.356712810371347, + "grad_norm": 16.96282958984375, + "learning_rate": 1.1612522063626242e-06, + "loss": 3.012, + "step": 80440 + }, + { + "epoch": 2.3570057862740788, + "grad_norm": 17.829315185546875, + "learning_rate": 1.1602390604259423e-06, + "loss": 3.0046, + "step": 80450 + }, + { + "epoch": 2.357298762176811, + "grad_norm": 17.691267013549805, + "learning_rate": 1.1592262986345598e-06, + "loss": 3.0173, + "step": 80460 + }, + { + "epoch": 2.357591738079543, + "grad_norm": 18.58101463317871, + "learning_rate": 1.1582139210898013e-06, + "loss": 3.0164, + "step": 80470 + }, + { + "epoch": 2.357884713982275, + "grad_norm": 16.504167556762695, + "learning_rate": 1.1572019278929457e-06, + "loss": 3.0185, + "step": 80480 + }, + { + "epoch": 2.358177689885007, + "grad_norm": 18.77106475830078, + "learning_rate": 1.156190319145239e-06, + "loss": 3.026, + "step": 80490 + }, + { + "epoch": 2.358470665787739, + "grad_norm": 18.347951889038086, + "learning_rate": 1.1551790949478863e-06, + "loss": 3.0217, + "step": 80500 + }, + { + "epoch": 2.358763641690471, + "grad_norm": 17.775732040405273, + "learning_rate": 1.154168255402056e-06, + "loss": 2.9849, + "step": 80510 + }, + { + "epoch": 
2.359056617593203, + "grad_norm": 20.01736068725586, + "learning_rate": 1.153157800608874e-06, + "loss": 3.0298, + "step": 80520 + }, + { + "epoch": 2.359349593495935, + "grad_norm": 21.6260929107666, + "learning_rate": 1.1521477306694329e-06, + "loss": 3.0197, + "step": 80530 + }, + { + "epoch": 2.359642569398667, + "grad_norm": 17.15774154663086, + "learning_rate": 1.151138045684781e-06, + "loss": 3.0314, + "step": 80540 + }, + { + "epoch": 2.359935545301399, + "grad_norm": 20.355852127075195, + "learning_rate": 1.1501287457559345e-06, + "loss": 3.0147, + "step": 80550 + }, + { + "epoch": 2.3602285212041307, + "grad_norm": 19.31388282775879, + "learning_rate": 1.149119830983866e-06, + "loss": 3.0258, + "step": 80560 + }, + { + "epoch": 2.360521497106863, + "grad_norm": 17.63256072998047, + "learning_rate": 1.148111301469511e-06, + "loss": 3.003, + "step": 80570 + }, + { + "epoch": 2.360814473009595, + "grad_norm": 16.665985107421875, + "learning_rate": 1.147103157313768e-06, + "loss": 3.0238, + "step": 80580 + }, + { + "epoch": 2.361107448912327, + "grad_norm": 18.8463191986084, + "learning_rate": 1.1460953986174954e-06, + "loss": 3.0401, + "step": 80590 + }, + { + "epoch": 2.361400424815059, + "grad_norm": 19.633081436157227, + "learning_rate": 1.1450880254815155e-06, + "loss": 3.021, + "step": 80600 + }, + { + "epoch": 2.3616934007177908, + "grad_norm": 18.020414352416992, + "learning_rate": 1.1440810380066069e-06, + "loss": 3.0404, + "step": 80610 + }, + { + "epoch": 2.361986376620523, + "grad_norm": 16.822725296020508, + "learning_rate": 1.1430744362935153e-06, + "loss": 3.0247, + "step": 80620 + }, + { + "epoch": 2.362279352523255, + "grad_norm": 21.904109954833984, + "learning_rate": 1.142068220442943e-06, + "loss": 3.0098, + "step": 80630 + }, + { + "epoch": 2.362572328425987, + "grad_norm": 18.866167068481445, + "learning_rate": 1.1410623905555578e-06, + "loss": 3.026, + "step": 80640 + }, + { + "epoch": 2.362865304328719, + "grad_norm": 
14.695634841918945, + "learning_rate": 1.140056946731985e-06, + "loss": 3.0246, + "step": 80650 + }, + { + "epoch": 2.363158280231451, + "grad_norm": 21.20557403564453, + "learning_rate": 1.1390518890728142e-06, + "loss": 3.0212, + "step": 80660 + }, + { + "epoch": 2.3634512561341827, + "grad_norm": 16.726106643676758, + "learning_rate": 1.1380472176785962e-06, + "loss": 2.9882, + "step": 80670 + }, + { + "epoch": 2.363744232036915, + "grad_norm": 19.782413482666016, + "learning_rate": 1.1370429326498428e-06, + "loss": 3.026, + "step": 80680 + }, + { + "epoch": 2.364037207939647, + "grad_norm": 17.169614791870117, + "learning_rate": 1.136039034087024e-06, + "loss": 3.0027, + "step": 80690 + }, + { + "epoch": 2.364330183842379, + "grad_norm": 16.933589935302734, + "learning_rate": 1.135035522090578e-06, + "loss": 3.0186, + "step": 80700 + }, + { + "epoch": 2.364623159745111, + "grad_norm": 19.484098434448242, + "learning_rate": 1.1340323967608963e-06, + "loss": 3.0137, + "step": 80710 + }, + { + "epoch": 2.364916135647843, + "grad_norm": 17.987594604492188, + "learning_rate": 1.1330296581983385e-06, + "loss": 2.9922, + "step": 80720 + }, + { + "epoch": 2.365209111550575, + "grad_norm": 20.570667266845703, + "learning_rate": 1.1320273065032207e-06, + "loss": 3.0146, + "step": 80730 + }, + { + "epoch": 2.365502087453307, + "grad_norm": 19.188350677490234, + "learning_rate": 1.1310253417758222e-06, + "loss": 3.0158, + "step": 80740 + }, + { + "epoch": 2.365795063356039, + "grad_norm": 18.28660774230957, + "learning_rate": 1.1300237641163847e-06, + "loss": 3.0175, + "step": 80750 + }, + { + "epoch": 2.366088039258771, + "grad_norm": 17.662071228027344, + "learning_rate": 1.129022573625111e-06, + "loss": 3.0112, + "step": 80760 + }, + { + "epoch": 2.366381015161503, + "grad_norm": 16.022615432739258, + "learning_rate": 1.1280217704021617e-06, + "loss": 3.0228, + "step": 80770 + }, + { + "epoch": 2.366673991064235, + "grad_norm": 19.057796478271484, + "learning_rate": 
1.1270213545476643e-06, + "loss": 3.0255, + "step": 80780 + }, + { + "epoch": 2.366966966966967, + "grad_norm": 17.486080169677734, + "learning_rate": 1.1260213261617015e-06, + "loss": 3.0322, + "step": 80790 + }, + { + "epoch": 2.367259942869699, + "grad_norm": 21.924266815185547, + "learning_rate": 1.125021685344323e-06, + "loss": 3.0271, + "step": 80800 + }, + { + "epoch": 2.367552918772431, + "grad_norm": 16.751169204711914, + "learning_rate": 1.1240224321955333e-06, + "loss": 3.0341, + "step": 80810 + }, + { + "epoch": 2.367845894675163, + "grad_norm": 20.268293380737305, + "learning_rate": 1.1230235668153044e-06, + "loss": 3.0115, + "step": 80820 + }, + { + "epoch": 2.368138870577895, + "grad_norm": 18.965776443481445, + "learning_rate": 1.122025089303566e-06, + "loss": 3.0286, + "step": 80830 + }, + { + "epoch": 2.368431846480627, + "grad_norm": 18.770008087158203, + "learning_rate": 1.1210269997602118e-06, + "loss": 3.0113, + "step": 80840 + }, + { + "epoch": 2.368724822383359, + "grad_norm": 19.93950080871582, + "learning_rate": 1.1200292982850919e-06, + "loss": 3.0144, + "step": 80850 + }, + { + "epoch": 2.369017798286091, + "grad_norm": 13.85154914855957, + "learning_rate": 1.119031984978023e-06, + "loss": 3.0173, + "step": 80860 + }, + { + "epoch": 2.369310774188823, + "grad_norm": 21.89968490600586, + "learning_rate": 1.1180350599387774e-06, + "loss": 3.0463, + "step": 80870 + }, + { + "epoch": 2.3696037500915548, + "grad_norm": 19.208293914794922, + "learning_rate": 1.1170385232670944e-06, + "loss": 2.9907, + "step": 80880 + }, + { + "epoch": 2.369896725994287, + "grad_norm": 18.557464599609375, + "learning_rate": 1.1160423750626693e-06, + "loss": 3.0161, + "step": 80890 + }, + { + "epoch": 2.370072511535926, + "eval_bleu": 0.35253302253992747, + "eval_cap_loss": 0.9027010202407837, + "eval_con_loss": 1.133971929550171, + "eval_loss": 3.170644760131836, + "step": 80896 + }, + { + "epoch": 2.370072511535926, + "eval_bleu": 0.35253302253992747, + 
"eval_cap_loss": 0.9027010202407837, + "eval_con_loss": 1.133971929550171, + "eval_loss": 3.170644760131836, + "eval_runtime": 55.7937, + "eval_samples_per_second": 358.464, + "eval_steps_per_second": 0.358, + "step": 80896 + }, + { + "epoch": 2.370189701897019, + "grad_norm": 17.03923797607422, + "learning_rate": 1.1150466154251615e-06, + "loss": 3.0325, + "step": 80900 + }, + { + "epoch": 2.370482677799751, + "grad_norm": 18.379531860351562, + "learning_rate": 1.1140512444541911e-06, + "loss": 3.017, + "step": 80910 + }, + { + "epoch": 2.370775653702483, + "grad_norm": 15.909089088439941, + "learning_rate": 1.1130562622493408e-06, + "loss": 3.0169, + "step": 80920 + }, + { + "epoch": 2.371068629605215, + "grad_norm": 19.56803321838379, + "learning_rate": 1.1120616689101498e-06, + "loss": 3.0213, + "step": 80930 + }, + { + "epoch": 2.371361605507947, + "grad_norm": 17.116703033447266, + "learning_rate": 1.1110674645361218e-06, + "loss": 3.0041, + "step": 80940 + }, + { + "epoch": 2.371654581410679, + "grad_norm": 16.568897247314453, + "learning_rate": 1.1100736492267238e-06, + "loss": 3.0211, + "step": 80950 + }, + { + "epoch": 2.371947557313411, + "grad_norm": 17.92496109008789, + "learning_rate": 1.1090802230813775e-06, + "loss": 3.012, + "step": 80960 + }, + { + "epoch": 2.372240533216143, + "grad_norm": 17.42203712463379, + "learning_rate": 1.1080871861994724e-06, + "loss": 3.0157, + "step": 80970 + }, + { + "epoch": 2.372533509118875, + "grad_norm": 15.778738021850586, + "learning_rate": 1.1070945386803528e-06, + "loss": 3.0088, + "step": 80980 + }, + { + "epoch": 2.3728264850216068, + "grad_norm": 15.311943054199219, + "learning_rate": 1.1061022806233285e-06, + "loss": 3.0065, + "step": 80990 + }, + { + "epoch": 2.373119460924339, + "grad_norm": 17.722288131713867, + "learning_rate": 1.1051104121276696e-06, + "loss": 3.0151, + "step": 81000 + }, + { + "epoch": 2.373412436827071, + "grad_norm": 18.0969181060791, + "learning_rate": 1.1041189332926079e-06, + 
"loss": 3.0329, + "step": 81010 + }, + { + "epoch": 2.373705412729803, + "grad_norm": 20.388652801513672, + "learning_rate": 1.1031278442173316e-06, + "loss": 3.0327, + "step": 81020 + }, + { + "epoch": 2.373998388632535, + "grad_norm": 14.052337646484375, + "learning_rate": 1.1021371450009971e-06, + "loss": 3.0391, + "step": 81030 + }, + { + "epoch": 2.374291364535267, + "grad_norm": 17.773818969726562, + "learning_rate": 1.1011468357427142e-06, + "loss": 3.0256, + "step": 81040 + }, + { + "epoch": 2.374584340437999, + "grad_norm": 20.96186065673828, + "learning_rate": 1.100156916541561e-06, + "loss": 3.0228, + "step": 81050 + }, + { + "epoch": 2.374877316340731, + "grad_norm": 20.59732437133789, + "learning_rate": 1.0991673874965696e-06, + "loss": 3.0147, + "step": 81060 + }, + { + "epoch": 2.375170292243463, + "grad_norm": 17.576139450073242, + "learning_rate": 1.0981782487067382e-06, + "loss": 3.0154, + "step": 81070 + }, + { + "epoch": 2.375463268146195, + "grad_norm": 18.108707427978516, + "learning_rate": 1.0971895002710248e-06, + "loss": 3.02, + "step": 81080 + }, + { + "epoch": 2.375756244048927, + "grad_norm": 20.31915283203125, + "learning_rate": 1.0962011422883483e-06, + "loss": 3.0182, + "step": 81090 + }, + { + "epoch": 2.3760492199516587, + "grad_norm": 22.320947647094727, + "learning_rate": 1.0952131748575855e-06, + "loss": 3.0239, + "step": 81100 + }, + { + "epoch": 2.376342195854391, + "grad_norm": 21.631778717041016, + "learning_rate": 1.0942255980775801e-06, + "loss": 3.0055, + "step": 81110 + }, + { + "epoch": 2.376635171757123, + "grad_norm": 18.15153694152832, + "learning_rate": 1.0932384120471302e-06, + "loss": 3.0149, + "step": 81120 + }, + { + "epoch": 2.376928147659855, + "grad_norm": 20.821651458740234, + "learning_rate": 1.092251616865e-06, + "loss": 3.0292, + "step": 81130 + }, + { + "epoch": 2.377221123562587, + "grad_norm": 20.673236846923828, + "learning_rate": 1.0912652126299105e-06, + "loss": 3.0095, + "step": 81140 + }, + { + 
"epoch": 2.377514099465319, + "grad_norm": 16.757953643798828, + "learning_rate": 1.0902791994405465e-06, + "loss": 3.0205, + "step": 81150 + }, + { + "epoch": 2.377807075368051, + "grad_norm": 14.904943466186523, + "learning_rate": 1.0892935773955532e-06, + "loss": 3.0324, + "step": 81160 + }, + { + "epoch": 2.378100051270783, + "grad_norm": 19.13127899169922, + "learning_rate": 1.0883083465935374e-06, + "loss": 3.0119, + "step": 81170 + }, + { + "epoch": 2.378393027173515, + "grad_norm": 17.564682006835938, + "learning_rate": 1.087323507133063e-06, + "loss": 3.0048, + "step": 81180 + }, + { + "epoch": 2.378686003076247, + "grad_norm": 14.843195915222168, + "learning_rate": 1.0863390591126598e-06, + "loss": 2.9996, + "step": 81190 + }, + { + "epoch": 2.378978978978979, + "grad_norm": 21.370779037475586, + "learning_rate": 1.0853550026308134e-06, + "loss": 3.0325, + "step": 81200 + }, + { + "epoch": 2.379271954881711, + "grad_norm": 18.810976028442383, + "learning_rate": 1.0843713377859761e-06, + "loss": 3.0172, + "step": 81210 + }, + { + "epoch": 2.379564930784443, + "grad_norm": 18.599151611328125, + "learning_rate": 1.0833880646765538e-06, + "loss": 3.0485, + "step": 81220 + }, + { + "epoch": 2.379857906687175, + "grad_norm": 19.323326110839844, + "learning_rate": 1.0824051834009198e-06, + "loss": 3.039, + "step": 81230 + }, + { + "epoch": 2.380150882589907, + "grad_norm": 16.94253158569336, + "learning_rate": 1.0814226940574041e-06, + "loss": 3.0088, + "step": 81240 + }, + { + "epoch": 2.380443858492639, + "grad_norm": 14.551440238952637, + "learning_rate": 1.0804405967443016e-06, + "loss": 3.0132, + "step": 81250 + }, + { + "epoch": 2.380736834395371, + "grad_norm": 20.04059600830078, + "learning_rate": 1.079458891559862e-06, + "loss": 3.0109, + "step": 81260 + }, + { + "epoch": 2.381029810298103, + "grad_norm": 18.40492820739746, + "learning_rate": 1.0784775786023015e-06, + "loss": 3.0154, + "step": 81270 + }, + { + "epoch": 2.381322786200835, + "grad_norm": 
21.819087982177734, + "learning_rate": 1.0774966579697931e-06, + "loss": 3.0165, + "step": 81280 + }, + { + "epoch": 2.381615762103567, + "grad_norm": 20.58901596069336, + "learning_rate": 1.0765161297604737e-06, + "loss": 3.0365, + "step": 81290 + }, + { + "epoch": 2.381908738006299, + "grad_norm": 19.594932556152344, + "learning_rate": 1.0755359940724364e-06, + "loss": 3.0076, + "step": 81300 + }, + { + "epoch": 2.382201713909031, + "grad_norm": 22.372533798217773, + "learning_rate": 1.0745562510037406e-06, + "loss": 3.019, + "step": 81310 + }, + { + "epoch": 2.382494689811763, + "grad_norm": 19.85383415222168, + "learning_rate": 1.0735769006524028e-06, + "loss": 3.0165, + "step": 81320 + }, + { + "epoch": 2.382787665714495, + "grad_norm": 19.859359741210938, + "learning_rate": 1.0725979431164007e-06, + "loss": 3.0339, + "step": 81330 + }, + { + "epoch": 2.383080641617227, + "grad_norm": 19.091556549072266, + "learning_rate": 1.0716193784936762e-06, + "loss": 3.0242, + "step": 81340 + }, + { + "epoch": 2.383373617519959, + "grad_norm": 20.16823387145996, + "learning_rate": 1.0706412068821248e-06, + "loss": 3.0087, + "step": 81350 + }, + { + "epoch": 2.383666593422691, + "grad_norm": 18.859968185424805, + "learning_rate": 1.0696634283796103e-06, + "loss": 3.0287, + "step": 81360 + }, + { + "epoch": 2.383959569325423, + "grad_norm": 16.854190826416016, + "learning_rate": 1.0686860430839502e-06, + "loss": 3.0168, + "step": 81370 + }, + { + "epoch": 2.384252545228155, + "grad_norm": 21.937480926513672, + "learning_rate": 1.067709051092929e-06, + "loss": 3.0499, + "step": 81380 + }, + { + "epoch": 2.384545521130887, + "grad_norm": 15.72452449798584, + "learning_rate": 1.0667324525042866e-06, + "loss": 3.0112, + "step": 81390 + }, + { + "epoch": 2.384838497033619, + "grad_norm": 16.991506576538086, + "learning_rate": 1.0657562474157274e-06, + "loss": 3.012, + "step": 81400 + }, + { + "epoch": 2.3850728777558046, + "eval_bleu": 0.3523936434898368, + "eval_cap_loss": 
0.902219295501709, + "eval_con_loss": 1.1337206363677979, + "eval_loss": 3.1696603298187256, + "step": 81408 + }, + { + "epoch": 2.3850728777558046, + "eval_bleu": 0.3523936434898368, + "eval_cap_loss": 0.902219295501709, + "eval_con_loss": 1.1337206363677979, + "eval_loss": 3.1696603298187256, + "eval_runtime": 54.2136, + "eval_samples_per_second": 368.911, + "eval_steps_per_second": 0.369, + "step": 81408 + }, + { + "epoch": 2.385131472936351, + "grad_norm": 18.135379791259766, + "learning_rate": 1.0647804359249143e-06, + "loss": 2.9978, + "step": 81410 + }, + { + "epoch": 2.385424448839083, + "grad_norm": 17.416980743408203, + "learning_rate": 1.0638050181294729e-06, + "loss": 3.0368, + "step": 81420 + }, + { + "epoch": 2.385717424741815, + "grad_norm": 16.07200050354004, + "learning_rate": 1.0628299941269855e-06, + "loss": 3.0098, + "step": 81430 + }, + { + "epoch": 2.386010400644547, + "grad_norm": 17.547773361206055, + "learning_rate": 1.0618553640150003e-06, + "loss": 3.0166, + "step": 81440 + }, + { + "epoch": 2.386303376547279, + "grad_norm": 20.963153839111328, + "learning_rate": 1.0608811278910203e-06, + "loss": 3.0083, + "step": 81450 + }, + { + "epoch": 2.386596352450011, + "grad_norm": 16.761442184448242, + "learning_rate": 1.0599072858525144e-06, + "loss": 2.9997, + "step": 81460 + }, + { + "epoch": 2.386889328352743, + "grad_norm": 19.213407516479492, + "learning_rate": 1.0589338379969078e-06, + "loss": 3.0167, + "step": 81470 + }, + { + "epoch": 2.387182304255475, + "grad_norm": 18.991687774658203, + "learning_rate": 1.0579607844215894e-06, + "loss": 3.0253, + "step": 81480 + }, + { + "epoch": 2.387475280158207, + "grad_norm": 19.325037002563477, + "learning_rate": 1.0569881252239066e-06, + "loss": 3.0052, + "step": 81490 + }, + { + "epoch": 2.387768256060939, + "grad_norm": 18.318397521972656, + "learning_rate": 1.0560158605011706e-06, + "loss": 3.0232, + "step": 81500 + }, + { + "epoch": 2.388061231963671, + "grad_norm": 18.152074813842773, + 
"learning_rate": 1.0550439903506477e-06, + "loss": 3.0374, + "step": 81510 + }, + { + "epoch": 2.388354207866403, + "grad_norm": 17.622434616088867, + "learning_rate": 1.0540725148695696e-06, + "loss": 2.9998, + "step": 81520 + }, + { + "epoch": 2.3886471837691348, + "grad_norm": 20.331371307373047, + "learning_rate": 1.0531014341551254e-06, + "loss": 3.0035, + "step": 81530 + }, + { + "epoch": 2.388940159671867, + "grad_norm": 15.96308422088623, + "learning_rate": 1.0521307483044675e-06, + "loss": 3.0139, + "step": 81540 + }, + { + "epoch": 2.389233135574599, + "grad_norm": 23.022350311279297, + "learning_rate": 1.051160457414705e-06, + "loss": 3.0059, + "step": 81550 + }, + { + "epoch": 2.389526111477331, + "grad_norm": 18.529033660888672, + "learning_rate": 1.050190561582911e-06, + "loss": 3.0025, + "step": 81560 + }, + { + "epoch": 2.389819087380063, + "grad_norm": 16.576698303222656, + "learning_rate": 1.0492210609061182e-06, + "loss": 3.0174, + "step": 81570 + }, + { + "epoch": 2.390112063282795, + "grad_norm": 20.76590347290039, + "learning_rate": 1.0482519554813202e-06, + "loss": 3.0157, + "step": 81580 + }, + { + "epoch": 2.390405039185527, + "grad_norm": 16.859878540039062, + "learning_rate": 1.0472832454054681e-06, + "loss": 3.0118, + "step": 81590 + }, + { + "epoch": 2.390698015088259, + "grad_norm": 20.94404411315918, + "learning_rate": 1.0463149307754777e-06, + "loss": 3.0075, + "step": 81600 + }, + { + "epoch": 2.390990990990991, + "grad_norm": 19.383365631103516, + "learning_rate": 1.0453470116882208e-06, + "loss": 2.9971, + "step": 81610 + }, + { + "epoch": 2.391283966893723, + "grad_norm": 18.468957901000977, + "learning_rate": 1.0443794882405345e-06, + "loss": 3.0214, + "step": 81620 + }, + { + "epoch": 2.391576942796455, + "grad_norm": 18.127761840820312, + "learning_rate": 1.0434123605292097e-06, + "loss": 3.0158, + "step": 81630 + }, + { + "epoch": 2.3918699186991867, + "grad_norm": 16.69106101989746, + "learning_rate": 1.0424456286510076e-06, 
+ "loss": 3.0236, + "step": 81640 + }, + { + "epoch": 2.392162894601919, + "grad_norm": 17.218460083007812, + "learning_rate": 1.041479292702639e-06, + "loss": 3.0096, + "step": 81650 + }, + { + "epoch": 2.392455870504651, + "grad_norm": 17.484966278076172, + "learning_rate": 1.0405133527807827e-06, + "loss": 3.0095, + "step": 81660 + }, + { + "epoch": 2.392748846407383, + "grad_norm": 16.41023063659668, + "learning_rate": 1.039547808982076e-06, + "loss": 3.028, + "step": 81670 + }, + { + "epoch": 2.393041822310115, + "grad_norm": 16.97001075744629, + "learning_rate": 1.0385826614031131e-06, + "loss": 3.0193, + "step": 81680 + }, + { + "epoch": 2.3933347982128472, + "grad_norm": 20.459571838378906, + "learning_rate": 1.0376179101404533e-06, + "loss": 3.0068, + "step": 81690 + }, + { + "epoch": 2.393627774115579, + "grad_norm": 17.57421112060547, + "learning_rate": 1.0366535552906127e-06, + "loss": 3.0051, + "step": 81700 + }, + { + "epoch": 2.393920750018311, + "grad_norm": 19.286529541015625, + "learning_rate": 1.0356895969500701e-06, + "loss": 3.0276, + "step": 81710 + }, + { + "epoch": 2.394213725921043, + "grad_norm": 19.97380828857422, + "learning_rate": 1.0347260352152644e-06, + "loss": 3.0114, + "step": 81720 + }, + { + "epoch": 2.394506701823775, + "grad_norm": 18.23011016845703, + "learning_rate": 1.0337628701825946e-06, + "loss": 2.9953, + "step": 81730 + }, + { + "epoch": 2.394799677726507, + "grad_norm": 18.150957107543945, + "learning_rate": 1.0328001019484174e-06, + "loss": 3.0115, + "step": 81740 + }, + { + "epoch": 2.395092653629239, + "grad_norm": 19.79232406616211, + "learning_rate": 1.0318377306090549e-06, + "loss": 3.0123, + "step": 81750 + }, + { + "epoch": 2.395385629531971, + "grad_norm": 20.27651023864746, + "learning_rate": 1.0308757562607835e-06, + "loss": 3.0064, + "step": 81760 + }, + { + "epoch": 2.395678605434703, + "grad_norm": 17.783966064453125, + "learning_rate": 1.0299141789998462e-06, + "loss": 3.0104, + "step": 81770 + }, + { + 
"epoch": 2.395971581337435, + "grad_norm": 16.439998626708984, + "learning_rate": 1.02895299892244e-06, + "loss": 3.0104, + "step": 81780 + }, + { + "epoch": 2.396264557240167, + "grad_norm": 20.662919998168945, + "learning_rate": 1.0279922161247274e-06, + "loss": 3.02, + "step": 81790 + }, + { + "epoch": 2.396557533142899, + "grad_norm": 15.160411834716797, + "learning_rate": 1.0270318307028283e-06, + "loss": 3.011, + "step": 81800 + }, + { + "epoch": 2.396850509045631, + "grad_norm": 19.912418365478516, + "learning_rate": 1.0260718427528248e-06, + "loss": 3.0056, + "step": 81810 + }, + { + "epoch": 2.397143484948363, + "grad_norm": 18.4680118560791, + "learning_rate": 1.0251122523707558e-06, + "loss": 3.013, + "step": 81820 + }, + { + "epoch": 2.397436460851095, + "grad_norm": 18.05204200744629, + "learning_rate": 1.024153059652625e-06, + "loss": 3.0177, + "step": 81830 + }, + { + "epoch": 2.397729436753827, + "grad_norm": 15.991755485534668, + "learning_rate": 1.023194264694391e-06, + "loss": 3.0372, + "step": 81840 + }, + { + "epoch": 2.398022412656559, + "grad_norm": 21.18789291381836, + "learning_rate": 1.0222358675919792e-06, + "loss": 3.016, + "step": 81850 + }, + { + "epoch": 2.398315388559291, + "grad_norm": 18.867706298828125, + "learning_rate": 1.0212778684412678e-06, + "loss": 3.0151, + "step": 81860 + }, + { + "epoch": 2.398608364462023, + "grad_norm": 19.112699508666992, + "learning_rate": 1.0203202673381008e-06, + "loss": 3.0065, + "step": 81870 + }, + { + "epoch": 2.398901340364755, + "grad_norm": 17.683256149291992, + "learning_rate": 1.0193630643782805e-06, + "loss": 3.0244, + "step": 81880 + }, + { + "epoch": 2.399194316267487, + "grad_norm": 18.473445892333984, + "learning_rate": 1.0184062596575705e-06, + "loss": 3.0137, + "step": 81890 + }, + { + "epoch": 2.399487292170219, + "grad_norm": 20.91373634338379, + "learning_rate": 1.0174498532716908e-06, + "loss": 3.0176, + "step": 81900 + }, + { + "epoch": 2.399780268072951, + "grad_norm": 
19.200084686279297, + "learning_rate": 1.0164938453163276e-06, + "loss": 3.0328, + "step": 81910 + }, + { + "epoch": 2.400073243975683, + "grad_norm": 19.804365158081055, + "learning_rate": 1.0155382358871202e-06, + "loss": 3.0196, + "step": 81920 + }, + { + "epoch": 2.400073243975683, + "eval_bleu": 0.3526817647736235, + "eval_cap_loss": 0.9020564556121826, + "eval_con_loss": 1.1332266330718994, + "eval_loss": 3.1685099601745605, + "step": 81920 + }, + { + "epoch": 2.400073243975683, + "eval_bleu": 0.3526817647736235, + "eval_cap_loss": 0.9020564556121826, + "eval_con_loss": 1.1332266330718994, + "eval_loss": 3.1685099601745605, + "eval_runtime": 53.2153, + "eval_samples_per_second": 375.832, + "eval_steps_per_second": 0.376, + "step": 81920 + }, + { + "epoch": 2.400366219878415, + "grad_norm": 20.106185913085938, + "learning_rate": 1.0145830250796746e-06, + "loss": 3.0354, + "step": 81930 + }, + { + "epoch": 2.400659195781147, + "grad_norm": 13.060861587524414, + "learning_rate": 1.0136282129895509e-06, + "loss": 3.0036, + "step": 81940 + }, + { + "epoch": 2.400952171683879, + "grad_norm": 18.15827178955078, + "learning_rate": 1.0126737997122742e-06, + "loss": 3.0275, + "step": 81950 + }, + { + "epoch": 2.401245147586611, + "grad_norm": 16.3809871673584, + "learning_rate": 1.0117197853433286e-06, + "loss": 2.9987, + "step": 81960 + }, + { + "epoch": 2.401538123489343, + "grad_norm": 18.129283905029297, + "learning_rate": 1.0107661699781574e-06, + "loss": 3.0148, + "step": 81970 + }, + { + "epoch": 2.401831099392075, + "grad_norm": 19.147817611694336, + "learning_rate": 1.0098129537121626e-06, + "loss": 3.0124, + "step": 81980 + }, + { + "epoch": 2.402124075294807, + "grad_norm": 19.38336753845215, + "learning_rate": 1.0088601366407103e-06, + "loss": 3.007, + "step": 81990 + }, + { + "epoch": 2.402417051197539, + "grad_norm": 18.212892532348633, + "learning_rate": 1.0079077188591207e-06, + "loss": 3.0106, + "step": 82000 + }, + { + "epoch": 2.402710027100271, + 
"grad_norm": 18.152904510498047, + "learning_rate": 1.0069557004626817e-06, + "loss": 3.0185, + "step": 82010 + }, + { + "epoch": 2.403003003003003, + "grad_norm": 18.32472038269043, + "learning_rate": 1.0060040815466332e-06, + "loss": 3.0054, + "step": 82020 + }, + { + "epoch": 2.403295978905735, + "grad_norm": 19.887929916381836, + "learning_rate": 1.005147966156611e-06, + "loss": 3.0164, + "step": 82030 + }, + { + "epoch": 2.403588954808467, + "grad_norm": 15.131692886352539, + "learning_rate": 1.004197106515563e-06, + "loss": 3.0206, + "step": 82040 + }, + { + "epoch": 2.403881930711199, + "grad_norm": 18.312620162963867, + "learning_rate": 1.0032466466308867e-06, + "loss": 3.0264, + "step": 82050 + }, + { + "epoch": 2.404174906613931, + "grad_norm": 18.21465492248535, + "learning_rate": 1.0022965865976724e-06, + "loss": 3.014, + "step": 82060 + }, + { + "epoch": 2.4044678825166628, + "grad_norm": 17.01605796813965, + "learning_rate": 1.001346926510966e-06, + "loss": 3.011, + "step": 82070 + }, + { + "epoch": 2.404760858419395, + "grad_norm": 20.53750991821289, + "learning_rate": 1.000397666465776e-06, + "loss": 3.0187, + "step": 82080 + }, + { + "epoch": 2.405053834322127, + "grad_norm": 14.887331008911133, + "learning_rate": 9.994488065570713e-07, + "loss": 3.0159, + "step": 82090 + }, + { + "epoch": 2.405346810224859, + "grad_norm": 17.542953491210938, + "learning_rate": 9.9850034687978e-07, + "loss": 3.0304, + "step": 82100 + }, + { + "epoch": 2.405639786127591, + "grad_norm": 21.345996856689453, + "learning_rate": 9.975522875287879e-07, + "loss": 3.0136, + "step": 82110 + }, + { + "epoch": 2.4059327620303232, + "grad_norm": 22.192874908447266, + "learning_rate": 9.966046285989444e-07, + "loss": 3.0317, + "step": 82120 + }, + { + "epoch": 2.406225737933055, + "grad_norm": 18.021577835083008, + "learning_rate": 9.956573701850557e-07, + "loss": 3.0235, + "step": 82130 + }, + { + "epoch": 2.406518713835787, + "grad_norm": 19.066987991333008, + "learning_rate": 
9.947105123818912e-07, + "loss": 3.0121, + "step": 82140 + }, + { + "epoch": 2.406811689738519, + "grad_norm": 15.186768531799316, + "learning_rate": 9.937640552841764e-07, + "loss": 3.0053, + "step": 82150 + }, + { + "epoch": 2.407104665641251, + "grad_norm": 20.314790725708008, + "learning_rate": 9.928179989865993e-07, + "loss": 3.0301, + "step": 82160 + }, + { + "epoch": 2.407397641543983, + "grad_norm": 13.269241333007812, + "learning_rate": 9.918723435838079e-07, + "loss": 3.015, + "step": 82170 + }, + { + "epoch": 2.407690617446715, + "grad_norm": 20.792699813842773, + "learning_rate": 9.909270891704103e-07, + "loss": 3.0069, + "step": 82180 + }, + { + "epoch": 2.407983593349447, + "grad_norm": 20.601659774780273, + "learning_rate": 9.89982235840971e-07, + "loss": 3.0369, + "step": 82190 + }, + { + "epoch": 2.408276569252179, + "grad_norm": 18.782846450805664, + "learning_rate": 9.8903778369002e-07, + "loss": 2.9874, + "step": 82200 + }, + { + "epoch": 2.408569545154911, + "grad_norm": 17.334081649780273, + "learning_rate": 9.88093732812041e-07, + "loss": 2.9999, + "step": 82210 + }, + { + "epoch": 2.408862521057643, + "grad_norm": 20.390920639038086, + "learning_rate": 9.871500833014835e-07, + "loss": 3.0153, + "step": 82220 + }, + { + "epoch": 2.4091554969603752, + "grad_norm": 18.4550724029541, + "learning_rate": 9.862068352527516e-07, + "loss": 3.0033, + "step": 82230 + }, + { + "epoch": 2.409448472863107, + "grad_norm": 19.864168167114258, + "learning_rate": 9.852639887602128e-07, + "loss": 3.016, + "step": 82240 + }, + { + "epoch": 2.409741448765839, + "grad_norm": 18.798301696777344, + "learning_rate": 9.84321543918193e-07, + "loss": 3.0022, + "step": 82250 + }, + { + "epoch": 2.410034424668571, + "grad_norm": 19.26661491394043, + "learning_rate": 9.8337950082098e-07, + "loss": 3.0282, + "step": 82260 + }, + { + "epoch": 2.410327400571303, + "grad_norm": 17.392818450927734, + "learning_rate": 9.82437859562817e-07, + "loss": 3.002, + "step": 82270 + }, 
+ { + "epoch": 2.410620376474035, + "grad_norm": 15.826455116271973, + "learning_rate": 9.814966202379117e-07, + "loss": 3.023, + "step": 82280 + }, + { + "epoch": 2.410913352376767, + "grad_norm": 17.869140625, + "learning_rate": 9.805557829404277e-07, + "loss": 3.01, + "step": 82290 + }, + { + "epoch": 2.411206328279499, + "grad_norm": 17.6129093170166, + "learning_rate": 9.796153477644921e-07, + "loss": 3.0042, + "step": 82300 + }, + { + "epoch": 2.411499304182231, + "grad_norm": 18.803768157958984, + "learning_rate": 9.786753148041871e-07, + "loss": 3.0076, + "step": 82310 + }, + { + "epoch": 2.411792280084963, + "grad_norm": 18.347166061401367, + "learning_rate": 9.777356841535601e-07, + "loss": 3.007, + "step": 82320 + }, + { + "epoch": 2.412085255987695, + "grad_norm": 20.88671112060547, + "learning_rate": 9.76796455906614e-07, + "loss": 3.0243, + "step": 82330 + }, + { + "epoch": 2.412378231890427, + "grad_norm": 17.91054344177246, + "learning_rate": 9.758576301573153e-07, + "loss": 2.9904, + "step": 82340 + }, + { + "epoch": 2.412671207793159, + "grad_norm": 19.026763916015625, + "learning_rate": 9.74919206999585e-07, + "loss": 3.0077, + "step": 82350 + }, + { + "epoch": 2.412964183695891, + "grad_norm": 15.33513069152832, + "learning_rate": 9.739811865273075e-07, + "loss": 3.0196, + "step": 82360 + }, + { + "epoch": 2.413257159598623, + "grad_norm": 19.422061920166016, + "learning_rate": 9.730435688343286e-07, + "loss": 3.0125, + "step": 82370 + }, + { + "epoch": 2.413550135501355, + "grad_norm": 16.95096206665039, + "learning_rate": 9.721063540144476e-07, + "loss": 2.9891, + "step": 82380 + }, + { + "epoch": 2.413843111404087, + "grad_norm": 13.41400146484375, + "learning_rate": 9.711695421614309e-07, + "loss": 3.0147, + "step": 82390 + }, + { + "epoch": 2.414136087306819, + "grad_norm": 16.130104064941406, + "learning_rate": 9.702331333689985e-07, + "loss": 3.009, + "step": 82400 + }, + { + "epoch": 2.414429063209551, + "grad_norm": 19.47713279724121, + 
"learning_rate": 9.692971277308328e-07, + "loss": 3.0199, + "step": 82410 + }, + { + "epoch": 2.414722039112283, + "grad_norm": 16.82526206970215, + "learning_rate": 9.683615253405759e-07, + "loss": 3.0143, + "step": 82420 + }, + { + "epoch": 2.415015015015015, + "grad_norm": 16.064279556274414, + "learning_rate": 9.67426326291831e-07, + "loss": 3.0141, + "step": 82430 + }, + { + "epoch": 2.415073610195561, + "eval_bleu": 0.35296756535278034, + "eval_cap_loss": 0.9016990065574646, + "eval_con_loss": 1.1336700916290283, + "eval_loss": 3.169039487838745, + "step": 82432 + }, + { + "epoch": 2.415073610195561, + "eval_bleu": 0.35296756535278034, + "eval_cap_loss": 0.9016990065574646, + "eval_con_loss": 1.1336700916290283, + "eval_loss": 3.169039487838745, + "eval_runtime": 53.0927, + "eval_samples_per_second": 376.699, + "eval_steps_per_second": 0.377, + "step": 82432 + }, + { + "epoch": 2.415307990917747, + "grad_norm": 14.886573791503906, + "learning_rate": 9.664915306781563e-07, + "loss": 3.0016, + "step": 82440 + }, + { + "epoch": 2.415600966820479, + "grad_norm": 13.703285217285156, + "learning_rate": 9.655571385930756e-07, + "loss": 3.0206, + "step": 82450 + }, + { + "epoch": 2.415893942723211, + "grad_norm": 18.833072662353516, + "learning_rate": 9.646231501300663e-07, + "loss": 3.019, + "step": 82460 + }, + { + "epoch": 2.416186918625943, + "grad_norm": 19.485029220581055, + "learning_rate": 9.636895653825706e-07, + "loss": 3.002, + "step": 82470 + }, + { + "epoch": 2.416479894528675, + "grad_norm": 18.499221801757812, + "learning_rate": 9.627563844439864e-07, + "loss": 3.0162, + "step": 82480 + }, + { + "epoch": 2.416772870431407, + "grad_norm": 17.757404327392578, + "learning_rate": 9.618236074076736e-07, + "loss": 3.022, + "step": 82490 + }, + { + "epoch": 2.417065846334139, + "grad_norm": 17.46926498413086, + "learning_rate": 9.608912343669513e-07, + "loss": 3.0053, + "step": 82500 + }, + { + "epoch": 2.417358822236871, + "grad_norm": 17.47694969177246, + 
"learning_rate": 9.599592654150991e-07, + "loss": 3.0297, + "step": 82510 + }, + { + "epoch": 2.417651798139603, + "grad_norm": 23.145130157470703, + "learning_rate": 9.590277006453524e-07, + "loss": 3.0039, + "step": 82520 + }, + { + "epoch": 2.417944774042335, + "grad_norm": 20.948320388793945, + "learning_rate": 9.580965401509113e-07, + "loss": 3.0375, + "step": 82530 + }, + { + "epoch": 2.418237749945067, + "grad_norm": 19.77547836303711, + "learning_rate": 9.5716578402493e-07, + "loss": 3.0212, + "step": 82540 + }, + { + "epoch": 2.4185307258477993, + "grad_norm": 16.927881240844727, + "learning_rate": 9.562354323605284e-07, + "loss": 3.0108, + "step": 82550 + }, + { + "epoch": 2.418823701750531, + "grad_norm": 19.441198348999023, + "learning_rate": 9.553054852507792e-07, + "loss": 3.0146, + "step": 82560 + }, + { + "epoch": 2.419116677653263, + "grad_norm": 15.965827941894531, + "learning_rate": 9.543759427887201e-07, + "loss": 2.994, + "step": 82570 + }, + { + "epoch": 2.419409653555995, + "grad_norm": 19.475650787353516, + "learning_rate": 9.534468050673457e-07, + "loss": 3.0039, + "step": 82580 + }, + { + "epoch": 2.419702629458727, + "grad_norm": 20.70660400390625, + "learning_rate": 9.525180721796129e-07, + "loss": 3.0238, + "step": 82590 + }, + { + "epoch": 2.419995605361459, + "grad_norm": 25.14592742919922, + "learning_rate": 9.515897442184324e-07, + "loss": 3.0098, + "step": 82600 + }, + { + "epoch": 2.4202885812641908, + "grad_norm": 16.491785049438477, + "learning_rate": 9.50661821276681e-07, + "loss": 3.0175, + "step": 82610 + }, + { + "epoch": 2.420581557166923, + "grad_norm": 21.48167610168457, + "learning_rate": 9.497343034471896e-07, + "loss": 3.0216, + "step": 82620 + }, + { + "epoch": 2.420874533069655, + "grad_norm": 16.721799850463867, + "learning_rate": 9.488071908227525e-07, + "loss": 2.9951, + "step": 82630 + }, + { + "epoch": 2.421167508972387, + "grad_norm": 17.96596908569336, + "learning_rate": 9.478804834961203e-07, + "loss": 
3.0057, + "step": 82640 + }, + { + "epoch": 2.421460484875119, + "grad_norm": 17.016128540039062, + "learning_rate": 9.469541815600047e-07, + "loss": 3.0179, + "step": 82650 + }, + { + "epoch": 2.4217534607778513, + "grad_norm": 19.137365341186523, + "learning_rate": 9.460282851070779e-07, + "loss": 3.0143, + "step": 82660 + }, + { + "epoch": 2.422046436680583, + "grad_norm": 20.449220657348633, + "learning_rate": 9.451027942299706e-07, + "loss": 3.0335, + "step": 82670 + }, + { + "epoch": 2.422339412583315, + "grad_norm": 16.213808059692383, + "learning_rate": 9.441777090212706e-07, + "loss": 2.9976, + "step": 82680 + }, + { + "epoch": 2.422632388486047, + "grad_norm": 20.397724151611328, + "learning_rate": 9.432530295735298e-07, + "loss": 3.0252, + "step": 82690 + }, + { + "epoch": 2.422925364388779, + "grad_norm": 18.46696662902832, + "learning_rate": 9.423287559792543e-07, + "loss": 3.036, + "step": 82700 + }, + { + "epoch": 2.423218340291511, + "grad_norm": 18.000953674316406, + "learning_rate": 9.414048883309135e-07, + "loss": 2.984, + "step": 82710 + }, + { + "epoch": 2.423511316194243, + "grad_norm": 16.398033142089844, + "learning_rate": 9.404814267209361e-07, + "loss": 3.0152, + "step": 82720 + }, + { + "epoch": 2.423804292096975, + "grad_norm": 18.665403366088867, + "learning_rate": 9.395583712417061e-07, + "loss": 3.0079, + "step": 82730 + }, + { + "epoch": 2.424097267999707, + "grad_norm": 19.82404136657715, + "learning_rate": 9.386357219855718e-07, + "loss": 3.0114, + "step": 82740 + }, + { + "epoch": 2.424390243902439, + "grad_norm": 21.929380416870117, + "learning_rate": 9.377134790448378e-07, + "loss": 2.9937, + "step": 82750 + }, + { + "epoch": 2.424683219805171, + "grad_norm": 20.51353645324707, + "learning_rate": 9.367916425117712e-07, + "loss": 3.0218, + "step": 82760 + }, + { + "epoch": 2.4249761957079032, + "grad_norm": 15.550773620605469, + "learning_rate": 9.358702124785928e-07, + "loss": 3.0353, + "step": 82770 + }, + { + "epoch": 
2.425269171610635, + "grad_norm": 18.91178321838379, + "learning_rate": 9.349491890374895e-07, + "loss": 3.0257, + "step": 82780 + }, + { + "epoch": 2.425562147513367, + "grad_norm": 20.0042781829834, + "learning_rate": 9.340285722806014e-07, + "loss": 3.005, + "step": 82790 + }, + { + "epoch": 2.425855123416099, + "grad_norm": 20.93453598022461, + "learning_rate": 9.331083623000336e-07, + "loss": 3.024, + "step": 82800 + }, + { + "epoch": 2.426148099318831, + "grad_norm": 20.676944732666016, + "learning_rate": 9.321885591878443e-07, + "loss": 3.0233, + "step": 82810 + }, + { + "epoch": 2.426441075221563, + "grad_norm": 19.750972747802734, + "learning_rate": 9.312691630360565e-07, + "loss": 3.0176, + "step": 82820 + }, + { + "epoch": 2.426734051124295, + "grad_norm": 20.15202522277832, + "learning_rate": 9.3035017393665e-07, + "loss": 3.0034, + "step": 82830 + }, + { + "epoch": 2.427027027027027, + "grad_norm": 15.56653118133545, + "learning_rate": 9.294315919815655e-07, + "loss": 3.0141, + "step": 82840 + }, + { + "epoch": 2.427320002929759, + "grad_norm": 20.87416648864746, + "learning_rate": 9.285134172626992e-07, + "loss": 3.0006, + "step": 82850 + }, + { + "epoch": 2.427612978832491, + "grad_norm": 18.9753475189209, + "learning_rate": 9.275956498719107e-07, + "loss": 2.9982, + "step": 82860 + }, + { + "epoch": 2.427905954735223, + "grad_norm": 20.48468017578125, + "learning_rate": 9.266782899010162e-07, + "loss": 2.9978, + "step": 82870 + }, + { + "epoch": 2.428198930637955, + "grad_norm": 17.99952507019043, + "learning_rate": 9.25761337441794e-07, + "loss": 3.0277, + "step": 82880 + }, + { + "epoch": 2.428491906540687, + "grad_norm": 17.163759231567383, + "learning_rate": 9.248447925859766e-07, + "loss": 2.9918, + "step": 82890 + }, + { + "epoch": 2.428784882443419, + "grad_norm": 20.270084381103516, + "learning_rate": 9.239286554252608e-07, + "loss": 3.0128, + "step": 82900 + }, + { + "epoch": 2.429077858346151, + "grad_norm": 17.132434844970703, + 
"learning_rate": 9.230129260513004e-07, + "loss": 3.0297, + "step": 82910 + }, + { + "epoch": 2.429370834248883, + "grad_norm": 13.483399391174316, + "learning_rate": 9.220976045557101e-07, + "loss": 3.0097, + "step": 82920 + }, + { + "epoch": 2.429663810151615, + "grad_norm": 22.061294555664062, + "learning_rate": 9.211826910300598e-07, + "loss": 3.0293, + "step": 82930 + }, + { + "epoch": 2.429956786054347, + "grad_norm": 20.03185272216797, + "learning_rate": 9.202681855658841e-07, + "loss": 3.0124, + "step": 82940 + }, + { + "epoch": 2.4300739764154398, + "eval_bleu": 0.35287184074736, + "eval_cap_loss": 0.9017432928085327, + "eval_con_loss": 1.1328880786895752, + "eval_loss": 3.1675195693969727, + "step": 82944 + }, + { + "epoch": 2.4300739764154398, + "eval_bleu": 0.35287184074736, + "eval_cap_loss": 0.9017432928085327, + "eval_con_loss": 1.1328880786895752, + "eval_loss": 3.1675195693969727, + "eval_runtime": 52.8529, + "eval_samples_per_second": 378.409, + "eval_steps_per_second": 0.378, + "step": 82944 + }, + { + "epoch": 2.430249761957079, + "grad_norm": 17.182165145874023, + "learning_rate": 9.193540882546703e-07, + "loss": 3.0202, + "step": 82950 + }, + { + "epoch": 2.430542737859811, + "grad_norm": 18.001293182373047, + "learning_rate": 9.184403991878721e-07, + "loss": 3.0039, + "step": 82960 + }, + { + "epoch": 2.430835713762543, + "grad_norm": 16.134174346923828, + "learning_rate": 9.175271184568957e-07, + "loss": 3.0045, + "step": 82970 + }, + { + "epoch": 2.431128689665275, + "grad_norm": 15.993735313415527, + "learning_rate": 9.166142461531103e-07, + "loss": 3.0308, + "step": 82980 + }, + { + "epoch": 2.431421665568007, + "grad_norm": 16.0303897857666, + "learning_rate": 9.157017823678438e-07, + "loss": 2.9965, + "step": 82990 + }, + { + "epoch": 2.431714641470739, + "grad_norm": 17.186691284179688, + "learning_rate": 9.147897271923839e-07, + "loss": 3.002, + "step": 83000 + }, + { + "epoch": 2.432007617373471, + "grad_norm": 14.138461112976074, + 
"learning_rate": 9.138780807179737e-07, + "loss": 3.0163, + "step": 83010 + }, + { + "epoch": 2.432300593276203, + "grad_norm": 21.078752517700195, + "learning_rate": 9.129668430358207e-07, + "loss": 3.0205, + "step": 83020 + }, + { + "epoch": 2.432593569178935, + "grad_norm": 14.419515609741211, + "learning_rate": 9.120560142370866e-07, + "loss": 3.0152, + "step": 83030 + }, + { + "epoch": 2.432886545081667, + "grad_norm": 14.80251693725586, + "learning_rate": 9.111455944128961e-07, + "loss": 3.0329, + "step": 83040 + }, + { + "epoch": 2.433179520984399, + "grad_norm": 18.019699096679688, + "learning_rate": 9.102355836543298e-07, + "loss": 3.011, + "step": 83050 + }, + { + "epoch": 2.433472496887131, + "grad_norm": 18.167821884155273, + "learning_rate": 9.093259820524292e-07, + "loss": 3.0282, + "step": 83060 + }, + { + "epoch": 2.433765472789863, + "grad_norm": 18.33371925354004, + "learning_rate": 9.084167896981949e-07, + "loss": 3.0286, + "step": 83070 + }, + { + "epoch": 2.434058448692595, + "grad_norm": 19.6553897857666, + "learning_rate": 9.075080066825864e-07, + "loss": 3.0335, + "step": 83080 + }, + { + "epoch": 2.4343514245953273, + "grad_norm": 16.28204917907715, + "learning_rate": 9.065996330965232e-07, + "loss": 3.0111, + "step": 83090 + }, + { + "epoch": 2.434644400498059, + "grad_norm": 17.788259506225586, + "learning_rate": 9.056916690308803e-07, + "loss": 3.0041, + "step": 83100 + }, + { + "epoch": 2.434937376400791, + "grad_norm": 17.598833084106445, + "learning_rate": 9.047841145764963e-07, + "loss": 3.0111, + "step": 83110 + }, + { + "epoch": 2.435230352303523, + "grad_norm": 18.356489181518555, + "learning_rate": 9.038769698241645e-07, + "loss": 2.9938, + "step": 83120 + }, + { + "epoch": 2.435523328206255, + "grad_norm": 21.301910400390625, + "learning_rate": 9.029702348646413e-07, + "loss": 2.9908, + "step": 83130 + }, + { + "epoch": 2.435816304108987, + "grad_norm": 19.07978057861328, + "learning_rate": 9.02063909788638e-07, + "loss": 
3.0322, + "step": 83140 + }, + { + "epoch": 2.4361092800117192, + "grad_norm": 19.597517013549805, + "learning_rate": 9.011579946868288e-07, + "loss": 3.0254, + "step": 83150 + }, + { + "epoch": 2.436402255914451, + "grad_norm": 18.6624698638916, + "learning_rate": 9.002524896498438e-07, + "loss": 3.0285, + "step": 83160 + }, + { + "epoch": 2.436695231817183, + "grad_norm": 21.353496551513672, + "learning_rate": 8.993473947682762e-07, + "loss": 3.0321, + "step": 83170 + }, + { + "epoch": 2.436988207719915, + "grad_norm": 20.785797119140625, + "learning_rate": 8.98442710132672e-07, + "loss": 3.0268, + "step": 83180 + }, + { + "epoch": 2.437281183622647, + "grad_norm": 19.956775665283203, + "learning_rate": 8.97538435833542e-07, + "loss": 3.0191, + "step": 83190 + }, + { + "epoch": 2.4375741595253793, + "grad_norm": 17.31841278076172, + "learning_rate": 8.966345719613511e-07, + "loss": 3.0209, + "step": 83200 + }, + { + "epoch": 2.437867135428111, + "grad_norm": 16.20709228515625, + "learning_rate": 8.957311186065287e-07, + "loss": 3.0015, + "step": 83210 + }, + { + "epoch": 2.438160111330843, + "grad_norm": 18.78475570678711, + "learning_rate": 8.948280758594563e-07, + "loss": 3.0071, + "step": 83220 + }, + { + "epoch": 2.438453087233575, + "grad_norm": 25.527135848999023, + "learning_rate": 8.9392544381048e-07, + "loss": 3.0049, + "step": 83230 + }, + { + "epoch": 2.438746063136307, + "grad_norm": 17.75792121887207, + "learning_rate": 8.930232225499025e-07, + "loss": 3.0007, + "step": 83240 + }, + { + "epoch": 2.439039039039039, + "grad_norm": 20.494579315185547, + "learning_rate": 8.921214121679866e-07, + "loss": 2.9868, + "step": 83250 + }, + { + "epoch": 2.439332014941771, + "grad_norm": 15.693263053894043, + "learning_rate": 8.912200127549514e-07, + "loss": 3.0231, + "step": 83260 + }, + { + "epoch": 2.439624990844503, + "grad_norm": 16.402172088623047, + "learning_rate": 8.903190244009785e-07, + "loss": 3.0041, + "step": 83270 + }, + { + "epoch": 
2.439917966747235, + "grad_norm": 19.05427360534668, + "learning_rate": 8.894184471962042e-07, + "loss": 3.0079, + "step": 83280 + }, + { + "epoch": 2.440210942649967, + "grad_norm": 17.206388473510742, + "learning_rate": 8.88518281230728e-07, + "loss": 3.0202, + "step": 83290 + }, + { + "epoch": 2.440503918552699, + "grad_norm": 19.21988296508789, + "learning_rate": 8.876185265946042e-07, + "loss": 3.0068, + "step": 83300 + }, + { + "epoch": 2.4407968944554312, + "grad_norm": 16.625123977661133, + "learning_rate": 8.867191833778482e-07, + "loss": 3.0186, + "step": 83310 + }, + { + "epoch": 2.441089870358163, + "grad_norm": 14.052858352661133, + "learning_rate": 8.858202516704356e-07, + "loss": 3.0229, + "step": 83320 + }, + { + "epoch": 2.441382846260895, + "grad_norm": 15.494481086730957, + "learning_rate": 8.849217315622987e-07, + "loss": 3.0138, + "step": 83330 + }, + { + "epoch": 2.441675822163627, + "grad_norm": 18.564128875732422, + "learning_rate": 8.84023623143328e-07, + "loss": 3.0115, + "step": 83340 + }, + { + "epoch": 2.441968798066359, + "grad_norm": 16.026914596557617, + "learning_rate": 8.831259265033754e-07, + "loss": 3.0088, + "step": 83350 + }, + { + "epoch": 2.442261773969091, + "grad_norm": 19.845579147338867, + "learning_rate": 8.822286417322478e-07, + "loss": 3.0368, + "step": 83360 + }, + { + "epoch": 2.442554749871823, + "grad_norm": 21.962751388549805, + "learning_rate": 8.813317689197165e-07, + "loss": 3.0249, + "step": 83370 + }, + { + "epoch": 2.442847725774555, + "grad_norm": 22.287147521972656, + "learning_rate": 8.80435308155505e-07, + "loss": 3.0103, + "step": 83380 + }, + { + "epoch": 2.443140701677287, + "grad_norm": 14.53480339050293, + "learning_rate": 8.795392595293006e-07, + "loss": 3.0213, + "step": 83390 + }, + { + "epoch": 2.443433677580019, + "grad_norm": 18.38701629638672, + "learning_rate": 8.786436231307472e-07, + "loss": 3.01, + "step": 83400 + }, + { + "epoch": 2.443726653482751, + "grad_norm": 21.007980346679688, + 
"learning_rate": 8.777483990494496e-07, + "loss": 3.0002, + "step": 83410 + }, + { + "epoch": 2.4440196293854832, + "grad_norm": 18.443628311157227, + "learning_rate": 8.768535873749668e-07, + "loss": 3.0158, + "step": 83420 + }, + { + "epoch": 2.444312605288215, + "grad_norm": 16.057453155517578, + "learning_rate": 8.759591881968204e-07, + "loss": 3.0283, + "step": 83430 + }, + { + "epoch": 2.444605581190947, + "grad_norm": 18.836284637451172, + "learning_rate": 8.750652016044919e-07, + "loss": 3.0178, + "step": 83440 + }, + { + "epoch": 2.444898557093679, + "grad_norm": 18.791799545288086, + "learning_rate": 8.741716276874163e-07, + "loss": 3.0071, + "step": 83450 + }, + { + "epoch": 2.4450743426353183, + "eval_bleu": 0.3527535914125331, + "eval_cap_loss": 0.9014357328414917, + "eval_con_loss": 1.1312601566314697, + "eval_loss": 3.1639561653137207, + "step": 83456 + }, + { + "epoch": 2.4450743426353183, + "eval_bleu": 0.3527535914125331, + "eval_cap_loss": 0.9014357328414917, + "eval_con_loss": 1.1312601566314697, + "eval_loss": 3.1639561653137207, + "eval_runtime": 60.1934, + "eval_samples_per_second": 332.262, + "eval_steps_per_second": 0.332, + "step": 83456 + }, + { + "epoch": 2.445191532996411, + "grad_norm": 14.541261672973633, + "learning_rate": 8.732784665349925e-07, + "loss": 3.0151, + "step": 83460 + }, + { + "epoch": 2.445484508899143, + "grad_norm": 19.855798721313477, + "learning_rate": 8.72385718236573e-07, + "loss": 3.0075, + "step": 83470 + }, + { + "epoch": 2.445777484801875, + "grad_norm": 18.02438735961914, + "learning_rate": 8.714933828814764e-07, + "loss": 3.0142, + "step": 83480 + }, + { + "epoch": 2.446070460704607, + "grad_norm": 19.947067260742188, + "learning_rate": 8.706014605589718e-07, + "loss": 3.0108, + "step": 83490 + }, + { + "epoch": 2.446363436607339, + "grad_norm": 20.330310821533203, + "learning_rate": 8.697099513582941e-07, + "loss": 3.0144, + "step": 83500 + }, + { + "epoch": 2.446656412510071, + "grad_norm": 
19.7384090423584, + "learning_rate": 8.6881885536863e-07, + "loss": 3.0198, + "step": 83510 + }, + { + "epoch": 2.4469493884128033, + "grad_norm": 21.206523895263672, + "learning_rate": 8.679281726791311e-07, + "loss": 3.0129, + "step": 83520 + }, + { + "epoch": 2.447242364315535, + "grad_norm": 16.90038299560547, + "learning_rate": 8.670379033789023e-07, + "loss": 3.0306, + "step": 83530 + }, + { + "epoch": 2.447535340218267, + "grad_norm": 22.42609405517578, + "learning_rate": 8.661480475570128e-07, + "loss": 3.0184, + "step": 83540 + }, + { + "epoch": 2.447828316120999, + "grad_norm": 15.338504791259766, + "learning_rate": 8.652586053024836e-07, + "loss": 3.0217, + "step": 83550 + }, + { + "epoch": 2.448121292023731, + "grad_norm": 20.927593231201172, + "learning_rate": 8.643695767043025e-07, + "loss": 3.0076, + "step": 83560 + }, + { + "epoch": 2.448414267926463, + "grad_norm": 15.323219299316406, + "learning_rate": 8.634809618514078e-07, + "loss": 3.0319, + "step": 83570 + }, + { + "epoch": 2.448707243829195, + "grad_norm": 16.931716918945312, + "learning_rate": 8.625927608327039e-07, + "loss": 3.0185, + "step": 83580 + }, + { + "epoch": 2.449000219731927, + "grad_norm": 23.71915054321289, + "learning_rate": 8.617049737370459e-07, + "loss": 3.027, + "step": 83590 + }, + { + "epoch": 2.449293195634659, + "grad_norm": 18.721084594726562, + "learning_rate": 8.608176006532554e-07, + "loss": 3.0116, + "step": 83600 + }, + { + "epoch": 2.449586171537391, + "grad_norm": 15.927114486694336, + "learning_rate": 8.599306416701053e-07, + "loss": 3.007, + "step": 83610 + }, + { + "epoch": 2.449879147440123, + "grad_norm": 20.25729751586914, + "learning_rate": 8.590440968763336e-07, + "loss": 2.9986, + "step": 83620 + }, + { + "epoch": 2.4501721233428553, + "grad_norm": 17.949949264526367, + "learning_rate": 8.581579663606304e-07, + "loss": 3.0081, + "step": 83630 + }, + { + "epoch": 2.450465099245587, + "grad_norm": 20.751211166381836, + "learning_rate": 
8.572722502116521e-07, + "loss": 3.0059, + "step": 83640 + }, + { + "epoch": 2.450758075148319, + "grad_norm": 19.550371170043945, + "learning_rate": 8.56386948518006e-07, + "loss": 3.0109, + "step": 83650 + }, + { + "epoch": 2.451051051051051, + "grad_norm": 16.51458740234375, + "learning_rate": 8.555020613682641e-07, + "loss": 3.0173, + "step": 83660 + }, + { + "epoch": 2.451344026953783, + "grad_norm": 17.491031646728516, + "learning_rate": 8.546175888509506e-07, + "loss": 3.0034, + "step": 83670 + }, + { + "epoch": 2.451637002856515, + "grad_norm": 17.05636215209961, + "learning_rate": 8.537335310545552e-07, + "loss": 3.002, + "step": 83680 + }, + { + "epoch": 2.4519299787592472, + "grad_norm": 18.250497817993164, + "learning_rate": 8.528498880675201e-07, + "loss": 3.0195, + "step": 83690 + }, + { + "epoch": 2.452222954661979, + "grad_norm": 20.859691619873047, + "learning_rate": 8.519666599782506e-07, + "loss": 3.0224, + "step": 83700 + }, + { + "epoch": 2.452515930564711, + "grad_norm": 16.386579513549805, + "learning_rate": 8.510838468751048e-07, + "loss": 3.0135, + "step": 83710 + }, + { + "epoch": 2.452808906467443, + "grad_norm": 22.059080123901367, + "learning_rate": 8.502014488464077e-07, + "loss": 3.0242, + "step": 83720 + }, + { + "epoch": 2.453101882370175, + "grad_norm": 20.622079849243164, + "learning_rate": 8.493194659804349e-07, + "loss": 3.0058, + "step": 83730 + }, + { + "epoch": 2.4533948582729073, + "grad_norm": 18.418834686279297, + "learning_rate": 8.484378983654251e-07, + "loss": 3.0124, + "step": 83740 + }, + { + "epoch": 2.453687834175639, + "grad_norm": 16.22602081298828, + "learning_rate": 8.475567460895723e-07, + "loss": 3.0265, + "step": 83750 + }, + { + "epoch": 2.453980810078371, + "grad_norm": 19.670806884765625, + "learning_rate": 8.466760092410325e-07, + "loss": 3.0196, + "step": 83760 + }, + { + "epoch": 2.454273785981103, + "grad_norm": 16.948331832885742, + "learning_rate": 8.45795687907916e-07, + "loss": 3.0082, + "step": 
83770 + }, + { + "epoch": 2.454566761883835, + "grad_norm": 21.258983612060547, + "learning_rate": 8.44915782178295e-07, + "loss": 3.0137, + "step": 83780 + }, + { + "epoch": 2.454859737786567, + "grad_norm": 20.612703323364258, + "learning_rate": 8.44036292140199e-07, + "loss": 3.0128, + "step": 83790 + }, + { + "epoch": 2.455152713689299, + "grad_norm": 19.742460250854492, + "learning_rate": 8.431572178816149e-07, + "loss": 3.0172, + "step": 83800 + }, + { + "epoch": 2.455445689592031, + "grad_norm": 18.063840866088867, + "learning_rate": 8.422785594904908e-07, + "loss": 3.0119, + "step": 83810 + }, + { + "epoch": 2.455738665494763, + "grad_norm": 17.600914001464844, + "learning_rate": 8.414003170547286e-07, + "loss": 3.0086, + "step": 83820 + }, + { + "epoch": 2.456031641397495, + "grad_norm": 18.4548282623291, + "learning_rate": 8.405224906621939e-07, + "loss": 3.0245, + "step": 83830 + }, + { + "epoch": 2.456324617300227, + "grad_norm": 20.896455764770508, + "learning_rate": 8.396450804007056e-07, + "loss": 2.9907, + "step": 83840 + }, + { + "epoch": 2.4566175932029592, + "grad_norm": 16.01974868774414, + "learning_rate": 8.387680863580456e-07, + "loss": 3.0018, + "step": 83850 + }, + { + "epoch": 2.456910569105691, + "grad_norm": 17.8347225189209, + "learning_rate": 8.378915086219497e-07, + "loss": 3.017, + "step": 83860 + }, + { + "epoch": 2.457203545008423, + "grad_norm": 19.49582862854004, + "learning_rate": 8.370153472801151e-07, + "loss": 3.0184, + "step": 83870 + }, + { + "epoch": 2.457496520911155, + "grad_norm": 18.851245880126953, + "learning_rate": 8.361396024201973e-07, + "loss": 2.9984, + "step": 83880 + }, + { + "epoch": 2.457789496813887, + "grad_norm": 22.321584701538086, + "learning_rate": 8.3526427412981e-07, + "loss": 3.0264, + "step": 83890 + }, + { + "epoch": 2.458082472716619, + "grad_norm": 18.23086929321289, + "learning_rate": 8.343893624965221e-07, + "loss": 3.0253, + "step": 83900 + }, + { + "epoch": 2.458375448619351, + "grad_norm": 
16.59410858154297, + "learning_rate": 8.335148676078664e-07, + "loss": 3.001, + "step": 83910 + }, + { + "epoch": 2.458668424522083, + "grad_norm": 16.96788787841797, + "learning_rate": 8.326407895513277e-07, + "loss": 3.0144, + "step": 83920 + }, + { + "epoch": 2.458961400424815, + "grad_norm": 16.996257781982422, + "learning_rate": 8.317671284143553e-07, + "loss": 2.9995, + "step": 83930 + }, + { + "epoch": 2.459254376327547, + "grad_norm": 19.664264678955078, + "learning_rate": 8.308938842843512e-07, + "loss": 3.0119, + "step": 83940 + }, + { + "epoch": 2.459547352230279, + "grad_norm": 20.65154457092285, + "learning_rate": 8.300210572486789e-07, + "loss": 3.0075, + "step": 83950 + }, + { + "epoch": 2.4598403281330112, + "grad_norm": 15.99083137512207, + "learning_rate": 8.291486473946608e-07, + "loss": 3.0223, + "step": 83960 + }, + { + "epoch": 2.4600747088551964, + "eval_bleu": 0.35286820781701916, + "eval_cap_loss": 0.9015955924987793, + "eval_con_loss": 1.1316320896148682, + "eval_loss": 3.1648597717285156, + "step": 83968 + }, + { + "epoch": 2.4600747088551964, + "eval_bleu": 0.35286820781701916, + "eval_cap_loss": 0.9015955924987793, + "eval_con_loss": 1.1316320896148682, + "eval_loss": 3.1648597717285156, + "eval_runtime": 54.1393, + "eval_samples_per_second": 369.417, + "eval_steps_per_second": 0.369, + "step": 83968 + }, + { + "epoch": 2.460133304035743, + "grad_norm": 18.342769622802734, + "learning_rate": 8.282766548095772e-07, + "loss": 3.0268, + "step": 83970 + }, + { + "epoch": 2.460426279938475, + "grad_norm": 17.66189193725586, + "learning_rate": 8.274050795806626e-07, + "loss": 3.0188, + "step": 83980 + }, + { + "epoch": 2.460719255841207, + "grad_norm": 20.888219833374023, + "learning_rate": 8.265339217951157e-07, + "loss": 3.0378, + "step": 83990 + }, + { + "epoch": 2.461012231743939, + "grad_norm": 13.702651023864746, + "learning_rate": 8.256631815400889e-07, + "loss": 3.0195, + "step": 84000 + }, + { + "epoch": 2.461305207646671, + 
"grad_norm": 14.855647087097168, + "learning_rate": 8.24792858902696e-07, + "loss": 3.0187, + "step": 84010 + }, + { + "epoch": 2.461598183549403, + "grad_norm": 20.648693084716797, + "learning_rate": 8.239229539700061e-07, + "loss": 3.0259, + "step": 84020 + }, + { + "epoch": 2.461891159452135, + "grad_norm": 16.022443771362305, + "learning_rate": 8.231403967400376e-07, + "loss": 3.018, + "step": 84030 + }, + { + "epoch": 2.462184135354867, + "grad_norm": 21.492725372314453, + "learning_rate": 8.222712856860149e-07, + "loss": 3.0012, + "step": 84040 + }, + { + "epoch": 2.462477111257599, + "grad_norm": 19.554595947265625, + "learning_rate": 8.214025925889652e-07, + "loss": 3.0306, + "step": 84050 + }, + { + "epoch": 2.4627700871603313, + "grad_norm": 18.865936279296875, + "learning_rate": 8.205343175357944e-07, + "loss": 3.0105, + "step": 84060 + }, + { + "epoch": 2.463063063063063, + "grad_norm": 17.491409301757812, + "learning_rate": 8.196664606133692e-07, + "loss": 3.0036, + "step": 84070 + }, + { + "epoch": 2.463356038965795, + "grad_norm": 24.22266387939453, + "learning_rate": 8.187990219085135e-07, + "loss": 3.0257, + "step": 84080 + }, + { + "epoch": 2.463649014868527, + "grad_norm": 18.355051040649414, + "learning_rate": 8.179320015080105e-07, + "loss": 3.021, + "step": 84090 + }, + { + "epoch": 2.463941990771259, + "grad_norm": 17.933307647705078, + "learning_rate": 8.170653994985984e-07, + "loss": 3.0039, + "step": 84100 + }, + { + "epoch": 2.464234966673991, + "grad_norm": 18.241893768310547, + "learning_rate": 8.161992159669769e-07, + "loss": 3.0149, + "step": 84110 + }, + { + "epoch": 2.4645279425767233, + "grad_norm": 21.563127517700195, + "learning_rate": 8.153334509998029e-07, + "loss": 3.0381, + "step": 84120 + }, + { + "epoch": 2.464820918479455, + "grad_norm": 19.513423919677734, + "learning_rate": 8.144681046836894e-07, + "loss": 3.0215, + "step": 84130 + }, + { + "epoch": 2.465113894382187, + "grad_norm": 17.007362365722656, + "learning_rate": 
8.13603177105211e-07, + "loss": 2.9995, + "step": 84140 + }, + { + "epoch": 2.465406870284919, + "grad_norm": 18.39387321472168, + "learning_rate": 8.127386683508965e-07, + "loss": 3.0141, + "step": 84150 + }, + { + "epoch": 2.465699846187651, + "grad_norm": 19.21137809753418, + "learning_rate": 8.118745785072363e-07, + "loss": 3.0162, + "step": 84160 + }, + { + "epoch": 2.4659928220903833, + "grad_norm": 16.507823944091797, + "learning_rate": 8.11010907660677e-07, + "loss": 2.9885, + "step": 84170 + }, + { + "epoch": 2.466285797993115, + "grad_norm": 17.12708282470703, + "learning_rate": 8.101476558976256e-07, + "loss": 3.029, + "step": 84180 + }, + { + "epoch": 2.466578773895847, + "grad_norm": 18.69634246826172, + "learning_rate": 8.09284823304442e-07, + "loss": 3.0265, + "step": 84190 + }, + { + "epoch": 2.466871749798579, + "grad_norm": 17.41305160522461, + "learning_rate": 8.084224099674509e-07, + "loss": 3.0108, + "step": 84200 + }, + { + "epoch": 2.467164725701311, + "grad_norm": 16.031982421875, + "learning_rate": 8.075604159729283e-07, + "loss": 3.0011, + "step": 84210 + }, + { + "epoch": 2.467457701604043, + "grad_norm": 17.805038452148438, + "learning_rate": 8.066988414071142e-07, + "loss": 2.9939, + "step": 84220 + }, + { + "epoch": 2.4677506775067752, + "grad_norm": 19.793771743774414, + "learning_rate": 8.058376863562023e-07, + "loss": 3.0222, + "step": 84230 + }, + { + "epoch": 2.468043653409507, + "grad_norm": 18.088714599609375, + "learning_rate": 8.049769509063465e-07, + "loss": 3.0006, + "step": 84240 + }, + { + "epoch": 2.468336629312239, + "grad_norm": 18.205236434936523, + "learning_rate": 8.041166351436585e-07, + "loss": 2.9899, + "step": 84250 + }, + { + "epoch": 2.468629605214971, + "grad_norm": 15.66354751586914, + "learning_rate": 8.03256739154209e-07, + "loss": 3.0169, + "step": 84260 + }, + { + "epoch": 2.468922581117703, + "grad_norm": 17.91958236694336, + "learning_rate": 8.023972630240234e-07, + "loss": 3.0156, + "step": 84270 + }, 
+ { + "epoch": 2.4692155570204353, + "grad_norm": 15.26448917388916, + "learning_rate": 8.015382068390892e-07, + "loss": 3.0095, + "step": 84280 + }, + { + "epoch": 2.469508532923167, + "grad_norm": 15.661421775817871, + "learning_rate": 8.006795706853476e-07, + "loss": 3.0033, + "step": 84290 + }, + { + "epoch": 2.469801508825899, + "grad_norm": 19.22427749633789, + "learning_rate": 7.998213546487021e-07, + "loss": 3.0092, + "step": 84300 + }, + { + "epoch": 2.470094484728631, + "grad_norm": 19.764366149902344, + "learning_rate": 7.989635588150108e-07, + "loss": 3.0185, + "step": 84310 + }, + { + "epoch": 2.470387460631363, + "grad_norm": 18.879268646240234, + "learning_rate": 7.98106183270091e-07, + "loss": 3.0195, + "step": 84320 + }, + { + "epoch": 2.470680436534095, + "grad_norm": 16.133127212524414, + "learning_rate": 7.972492280997185e-07, + "loss": 3.0062, + "step": 84330 + }, + { + "epoch": 2.470973412436827, + "grad_norm": 18.25442123413086, + "learning_rate": 7.963926933896282e-07, + "loss": 3.0082, + "step": 84340 + }, + { + "epoch": 2.471266388339559, + "grad_norm": 19.082414627075195, + "learning_rate": 7.955365792255082e-07, + "loss": 3.0179, + "step": 84350 + }, + { + "epoch": 2.471559364242291, + "grad_norm": 14.984838485717773, + "learning_rate": 7.946808856930105e-07, + "loss": 3.0042, + "step": 84360 + }, + { + "epoch": 2.471852340145023, + "grad_norm": 16.813016891479492, + "learning_rate": 7.9382561287774e-07, + "loss": 3.0125, + "step": 84370 + }, + { + "epoch": 2.472145316047755, + "grad_norm": 14.633925437927246, + "learning_rate": 7.929707608652631e-07, + "loss": 3.034, + "step": 84380 + }, + { + "epoch": 2.4724382919504873, + "grad_norm": 20.36953353881836, + "learning_rate": 7.921163297411017e-07, + "loss": 2.9991, + "step": 84390 + }, + { + "epoch": 2.472731267853219, + "grad_norm": 17.236799240112305, + "learning_rate": 7.912623195907365e-07, + "loss": 3.0152, + "step": 84400 + }, + { + "epoch": 2.473024243755951, + "grad_norm": 
15.871689796447754, + "learning_rate": 7.904087304996066e-07, + "loss": 3.0067, + "step": 84410 + }, + { + "epoch": 2.473317219658683, + "grad_norm": 18.293716430664062, + "learning_rate": 7.895555625531104e-07, + "loss": 3.035, + "step": 84420 + }, + { + "epoch": 2.473610195561415, + "grad_norm": 14.80221939086914, + "learning_rate": 7.887028158365989e-07, + "loss": 3.0083, + "step": 84430 + }, + { + "epoch": 2.473903171464147, + "grad_norm": 16.562664031982422, + "learning_rate": 7.878504904353868e-07, + "loss": 3.0038, + "step": 84440 + }, + { + "epoch": 2.474196147366879, + "grad_norm": 17.488473892211914, + "learning_rate": 7.869985864347424e-07, + "loss": 3.0057, + "step": 84450 + }, + { + "epoch": 2.474489123269611, + "grad_norm": 19.165063858032227, + "learning_rate": 7.861471039198953e-07, + "loss": 3.0014, + "step": 84460 + }, + { + "epoch": 2.474782099172343, + "grad_norm": 19.83101463317871, + "learning_rate": 7.852960429760298e-07, + "loss": 3.0129, + "step": 84470 + }, + { + "epoch": 2.475075075075075, + "grad_norm": 17.54348373413086, + "learning_rate": 7.844454036882904e-07, + "loss": 3.0098, + "step": 84480 + }, + { + "epoch": 2.475075075075075, + "eval_bleu": 0.3532575330431452, + "eval_cap_loss": 0.9013182520866394, + "eval_con_loss": 1.1312960386276245, + "eval_loss": 3.163910150527954, + "step": 84480 + }, + { + "epoch": 2.475075075075075, + "eval_bleu": 0.3532575330431452, + "eval_cap_loss": 0.9013182520866394, + "eval_con_loss": 1.1312960386276245, + "eval_loss": 3.163910150527954, + "eval_runtime": 55.3253, + "eval_samples_per_second": 361.498, + "eval_steps_per_second": 0.361, + "step": 84480 + }, + { + "epoch": 2.4753680509778073, + "grad_norm": 21.436439514160156, + "learning_rate": 7.835951861417779e-07, + "loss": 2.9863, + "step": 84490 + }, + { + "epoch": 2.4756610268805392, + "grad_norm": 15.825520515441895, + "learning_rate": 7.82745390421552e-07, + "loss": 3.0296, + "step": 84500 + }, + { + "epoch": 2.475954002783271, + "grad_norm": 
16.832958221435547, + "learning_rate": 7.81896016612631e-07, + "loss": 3.0065, + "step": 84510 + }, + { + "epoch": 2.476246978686003, + "grad_norm": 16.541561126708984, + "learning_rate": 7.810470647999868e-07, + "loss": 3.0002, + "step": 84520 + }, + { + "epoch": 2.476539954588735, + "grad_norm": 17.446048736572266, + "learning_rate": 7.801985350685554e-07, + "loss": 3.0083, + "step": 84530 + }, + { + "epoch": 2.476832930491467, + "grad_norm": 16.96702766418457, + "learning_rate": 7.793504275032237e-07, + "loss": 3.012, + "step": 84540 + }, + { + "epoch": 2.477125906394199, + "grad_norm": 16.894115447998047, + "learning_rate": 7.785027421888431e-07, + "loss": 3.0086, + "step": 84550 + }, + { + "epoch": 2.477418882296931, + "grad_norm": 20.104162216186523, + "learning_rate": 7.776554792102165e-07, + "loss": 3.0167, + "step": 84560 + }, + { + "epoch": 2.477711858199663, + "grad_norm": 17.778392791748047, + "learning_rate": 7.768086386521084e-07, + "loss": 3.0283, + "step": 84570 + }, + { + "epoch": 2.478004834102395, + "grad_norm": 17.8503360748291, + "learning_rate": 7.759622205992412e-07, + "loss": 3.0035, + "step": 84580 + }, + { + "epoch": 2.478297810005127, + "grad_norm": 17.776933670043945, + "learning_rate": 7.751162251362943e-07, + "loss": 3.0196, + "step": 84590 + }, + { + "epoch": 2.4785907859078593, + "grad_norm": 17.575748443603516, + "learning_rate": 7.742706523479027e-07, + "loss": 3.0036, + "step": 84600 + }, + { + "epoch": 2.4788837618105912, + "grad_norm": 17.00566291809082, + "learning_rate": 7.734255023186632e-07, + "loss": 3.0121, + "step": 84610 + }, + { + "epoch": 2.479176737713323, + "grad_norm": 21.600645065307617, + "learning_rate": 7.725807751331255e-07, + "loss": 3.011, + "step": 84620 + }, + { + "epoch": 2.479469713616055, + "grad_norm": 13.673910140991211, + "learning_rate": 7.717364708758024e-07, + "loss": 3.0062, + "step": 84630 + }, + { + "epoch": 2.479762689518787, + "grad_norm": 16.821884155273438, + "learning_rate": 
7.708925896311581e-07, + "loss": 2.9949, + "step": 84640 + }, + { + "epoch": 2.480055665421519, + "grad_norm": 20.13197135925293, + "learning_rate": 7.7004913148362e-07, + "loss": 3.0183, + "step": 84650 + }, + { + "epoch": 2.4803486413242513, + "grad_norm": 18.529037475585938, + "learning_rate": 7.692060965175713e-07, + "loss": 2.9945, + "step": 84660 + }, + { + "epoch": 2.480641617226983, + "grad_norm": 17.487855911254883, + "learning_rate": 7.683634848173527e-07, + "loss": 3.0148, + "step": 84670 + }, + { + "epoch": 2.480934593129715, + "grad_norm": 21.015972137451172, + "learning_rate": 7.675212964672613e-07, + "loss": 3.0128, + "step": 84680 + }, + { + "epoch": 2.481227569032447, + "grad_norm": 21.054119110107422, + "learning_rate": 7.666795315515546e-07, + "loss": 3.0324, + "step": 84690 + }, + { + "epoch": 2.481520544935179, + "grad_norm": 15.093094825744629, + "learning_rate": 7.658381901544443e-07, + "loss": 3.0324, + "step": 84700 + }, + { + "epoch": 2.4818135208379113, + "grad_norm": 16.781898498535156, + "learning_rate": 7.64997272360104e-07, + "loss": 3.0008, + "step": 84710 + }, + { + "epoch": 2.482106496740643, + "grad_norm": 20.80315399169922, + "learning_rate": 7.641567782526599e-07, + "loss": 3.027, + "step": 84720 + }, + { + "epoch": 2.482399472643375, + "grad_norm": 18.57159996032715, + "learning_rate": 7.633167079161996e-07, + "loss": 3.0211, + "step": 84730 + }, + { + "epoch": 2.482692448546107, + "grad_norm": 18.223495483398438, + "learning_rate": 7.624770614347676e-07, + "loss": 3.0147, + "step": 84740 + }, + { + "epoch": 2.482985424448839, + "grad_norm": 16.226634979248047, + "learning_rate": 7.616378388923662e-07, + "loss": 2.993, + "step": 84750 + }, + { + "epoch": 2.483278400351571, + "grad_norm": 20.42734146118164, + "learning_rate": 7.607990403729526e-07, + "loss": 3.0116, + "step": 84760 + }, + { + "epoch": 2.4835713762543032, + "grad_norm": 16.694780349731445, + "learning_rate": 7.599606659604459e-07, + "loss": 3.0, + "step": 84770 + 
}, + { + "epoch": 2.483864352157035, + "grad_norm": 22.591901779174805, + "learning_rate": 7.591227157387176e-07, + "loss": 3.0151, + "step": 84780 + }, + { + "epoch": 2.484157328059767, + "grad_norm": 14.684466361999512, + "learning_rate": 7.582851897916033e-07, + "loss": 2.993, + "step": 84790 + }, + { + "epoch": 2.484450303962499, + "grad_norm": 22.285686492919922, + "learning_rate": 7.574480882028884e-07, + "loss": 3.0094, + "step": 84800 + }, + { + "epoch": 2.484743279865231, + "grad_norm": 16.320194244384766, + "learning_rate": 7.566114110563227e-07, + "loss": 3.0341, + "step": 84810 + }, + { + "epoch": 2.4850362557679633, + "grad_norm": 17.13379669189453, + "learning_rate": 7.557751584356104e-07, + "loss": 3.0169, + "step": 84820 + }, + { + "epoch": 2.485329231670695, + "grad_norm": 17.514923095703125, + "learning_rate": 7.54939330424413e-07, + "loss": 2.9918, + "step": 84830 + }, + { + "epoch": 2.485622207573427, + "grad_norm": 16.75035285949707, + "learning_rate": 7.541039271063516e-07, + "loss": 2.9977, + "step": 84840 + }, + { + "epoch": 2.485915183476159, + "grad_norm": 18.528018951416016, + "learning_rate": 7.532689485650013e-07, + "loss": 2.9907, + "step": 84850 + }, + { + "epoch": 2.486208159378891, + "grad_norm": 20.004671096801758, + "learning_rate": 7.524343948838986e-07, + "loss": 2.9973, + "step": 84860 + }, + { + "epoch": 2.486501135281623, + "grad_norm": 19.892301559448242, + "learning_rate": 7.516002661465338e-07, + "loss": 3.0179, + "step": 84870 + }, + { + "epoch": 2.4867941111843552, + "grad_norm": 18.792911529541016, + "learning_rate": 7.507665624363586e-07, + "loss": 2.9926, + "step": 84880 + }, + { + "epoch": 2.487087087087087, + "grad_norm": 17.343759536743164, + "learning_rate": 7.499332838367779e-07, + "loss": 2.997, + "step": 84890 + }, + { + "epoch": 2.487380062989819, + "grad_norm": 18.66901397705078, + "learning_rate": 7.491004304311572e-07, + "loss": 3.0362, + "step": 84900 + }, + { + "epoch": 2.487673038892551, + "grad_norm": 
18.556804656982422, + "learning_rate": 7.482680023028188e-07, + "loss": 3.0067, + "step": 84910 + }, + { + "epoch": 2.487966014795283, + "grad_norm": 17.678071975708008, + "learning_rate": 7.474359995350434e-07, + "loss": 3.0065, + "step": 84920 + }, + { + "epoch": 2.4882589906980153, + "grad_norm": 17.604690551757812, + "learning_rate": 7.466044222110652e-07, + "loss": 3.0069, + "step": 84930 + }, + { + "epoch": 2.488551966600747, + "grad_norm": 19.507062911987305, + "learning_rate": 7.457732704140813e-07, + "loss": 2.9893, + "step": 84940 + }, + { + "epoch": 2.488844942503479, + "grad_norm": 17.215259552001953, + "learning_rate": 7.449425442272412e-07, + "loss": 3.0075, + "step": 84950 + }, + { + "epoch": 2.489137918406211, + "grad_norm": 20.0311222076416, + "learning_rate": 7.441122437336562e-07, + "loss": 3.0096, + "step": 84960 + }, + { + "epoch": 2.489430894308943, + "grad_norm": 17.01163101196289, + "learning_rate": 7.43282369016391e-07, + "loss": 2.9975, + "step": 84970 + }, + { + "epoch": 2.489723870211675, + "grad_norm": 15.88427448272705, + "learning_rate": 7.424529201584701e-07, + "loss": 3.0062, + "step": 84980 + }, + { + "epoch": 2.490016846114407, + "grad_norm": 21.715383529663086, + "learning_rate": 7.416238972428758e-07, + "loss": 3.0441, + "step": 84990 + }, + { + "epoch": 2.4900754412949535, + "eval_bleu": 0.35303677901695213, + "eval_cap_loss": 0.9010302424430847, + "eval_con_loss": 1.1301785707473755, + "eval_loss": 3.1613874435424805, + "step": 84992 + }, + { + "epoch": 2.4900754412949535, + "eval_bleu": 0.35303677901695213, + "eval_cap_loss": 0.9010302424430847, + "eval_con_loss": 1.1301785707473755, + "eval_loss": 3.1613874435424805, + "eval_runtime": 53.2203, + "eval_samples_per_second": 375.797, + "eval_steps_per_second": 0.376, + "step": 84992 + }, + { + "epoch": 2.490309822017139, + "grad_norm": 19.503873825073242, + "learning_rate": 7.407953003525476e-07, + "loss": 3.0232, + "step": 85000 + }, + { + "epoch": 2.490602797919871, + 
"grad_norm": 14.45980453491211, + "learning_rate": 7.399671295703792e-07, + "loss": 3.0057, + "step": 85010 + }, + { + "epoch": 2.490895773822603, + "grad_norm": 20.314958572387695, + "learning_rate": 7.391393849792267e-07, + "loss": 3.0011, + "step": 85020 + }, + { + "epoch": 2.4911887497253353, + "grad_norm": 18.349464416503906, + "learning_rate": 7.383120666618987e-07, + "loss": 3.0272, + "step": 85030 + }, + { + "epoch": 2.4914817256280672, + "grad_norm": 19.565568923950195, + "learning_rate": 7.374851747011663e-07, + "loss": 3.0279, + "step": 85040 + }, + { + "epoch": 2.491774701530799, + "grad_norm": 20.99665641784668, + "learning_rate": 7.366587091797523e-07, + "loss": 3.003, + "step": 85050 + }, + { + "epoch": 2.492067677433531, + "grad_norm": 17.576154708862305, + "learning_rate": 7.358326701803403e-07, + "loss": 3.0079, + "step": 85060 + }, + { + "epoch": 2.492360653336263, + "grad_norm": 16.232744216918945, + "learning_rate": 7.350070577855716e-07, + "loss": 3.0044, + "step": 85070 + }, + { + "epoch": 2.492653629238995, + "grad_norm": 19.550662994384766, + "learning_rate": 7.341818720780447e-07, + "loss": 3.0013, + "step": 85080 + }, + { + "epoch": 2.4929466051417273, + "grad_norm": 20.111478805541992, + "learning_rate": 7.333571131403122e-07, + "loss": 3.0106, + "step": 85090 + }, + { + "epoch": 2.493239581044459, + "grad_norm": 20.317392349243164, + "learning_rate": 7.325327810548882e-07, + "loss": 3.0117, + "step": 85100 + }, + { + "epoch": 2.493532556947191, + "grad_norm": 16.46137237548828, + "learning_rate": 7.317088759042406e-07, + "loss": 3.0216, + "step": 85110 + }, + { + "epoch": 2.493825532849923, + "grad_norm": 17.45306396484375, + "learning_rate": 7.30885397770798e-07, + "loss": 2.9735, + "step": 85120 + }, + { + "epoch": 2.494118508752655, + "grad_norm": 21.610021591186523, + "learning_rate": 7.300623467369428e-07, + "loss": 3.0276, + "step": 85130 + }, + { + "epoch": 2.4944114846553873, + "grad_norm": 19.040822982788086, + "learning_rate": 
7.292397228850168e-07, + "loss": 3.0115, + "step": 85140 + }, + { + "epoch": 2.4947044605581192, + "grad_norm": 20.31777000427246, + "learning_rate": 7.284175262973187e-07, + "loss": 3.0091, + "step": 85150 + }, + { + "epoch": 2.494997436460851, + "grad_norm": 18.39513397216797, + "learning_rate": 7.275957570561065e-07, + "loss": 2.9758, + "step": 85160 + }, + { + "epoch": 2.495290412363583, + "grad_norm": 19.44288444519043, + "learning_rate": 7.267744152435896e-07, + "loss": 2.9967, + "step": 85170 + }, + { + "epoch": 2.495583388266315, + "grad_norm": 21.988636016845703, + "learning_rate": 7.259535009419421e-07, + "loss": 2.982, + "step": 85180 + }, + { + "epoch": 2.495876364169047, + "grad_norm": 22.422969818115234, + "learning_rate": 7.25133014233288e-07, + "loss": 3.0132, + "step": 85190 + }, + { + "epoch": 2.4961693400717793, + "grad_norm": 13.500802040100098, + "learning_rate": 7.243129551997141e-07, + "loss": 2.9952, + "step": 85200 + }, + { + "epoch": 2.496462315974511, + "grad_norm": 20.03580665588379, + "learning_rate": 7.234933239232639e-07, + "loss": 3.0207, + "step": 85210 + }, + { + "epoch": 2.496755291877243, + "grad_norm": 19.077415466308594, + "learning_rate": 7.226741204859333e-07, + "loss": 3.0039, + "step": 85220 + }, + { + "epoch": 2.497048267779975, + "grad_norm": 19.478622436523438, + "learning_rate": 7.218553449696808e-07, + "loss": 3.0054, + "step": 85230 + }, + { + "epoch": 2.497341243682707, + "grad_norm": 16.443941116333008, + "learning_rate": 7.210369974564191e-07, + "loss": 3.0243, + "step": 85240 + }, + { + "epoch": 2.4976342195854393, + "grad_norm": 19.165122985839844, + "learning_rate": 7.202190780280211e-07, + "loss": 3.0109, + "step": 85250 + }, + { + "epoch": 2.497927195488171, + "grad_norm": 14.990290641784668, + "learning_rate": 7.194015867663124e-07, + "loss": 3.02, + "step": 85260 + }, + { + "epoch": 2.498220171390903, + "grad_norm": 17.602834701538086, + "learning_rate": 7.185845237530797e-07, + "loss": 2.9844, + "step": 
85270 + }, + { + "epoch": 2.498513147293635, + "grad_norm": 17.203716278076172, + "learning_rate": 7.17767889070064e-07, + "loss": 2.9968, + "step": 85280 + }, + { + "epoch": 2.498806123196367, + "grad_norm": 16.301538467407227, + "learning_rate": 7.169516827989659e-07, + "loss": 3.035, + "step": 85290 + }, + { + "epoch": 2.499099099099099, + "grad_norm": 17.6550235748291, + "learning_rate": 7.161359050214405e-07, + "loss": 3.0015, + "step": 85300 + }, + { + "epoch": 2.4993920750018312, + "grad_norm": 19.05323600769043, + "learning_rate": 7.153205558191029e-07, + "loss": 3.0101, + "step": 85310 + }, + { + "epoch": 2.499685050904563, + "grad_norm": 19.104780197143555, + "learning_rate": 7.145056352735236e-07, + "loss": 3.0039, + "step": 85320 + }, + { + "epoch": 2.499978026807295, + "grad_norm": 21.18861961364746, + "learning_rate": 7.136911434662314e-07, + "loss": 2.9927, + "step": 85330 + }, + { + "epoch": 2.500271002710027, + "grad_norm": 17.80889129638672, + "learning_rate": 7.128770804787089e-07, + "loss": 3.0163, + "step": 85340 + }, + { + "epoch": 2.5005639786127594, + "grad_norm": 14.734169006347656, + "learning_rate": 7.120634463924015e-07, + "loss": 3.0156, + "step": 85350 + }, + { + "epoch": 2.5008569545154913, + "grad_norm": 14.229897499084473, + "learning_rate": 7.112502412887057e-07, + "loss": 3.0106, + "step": 85360 + }, + { + "epoch": 2.501149930418223, + "grad_norm": 20.14482307434082, + "learning_rate": 7.104374652489798e-07, + "loss": 3.0205, + "step": 85370 + }, + { + "epoch": 2.501442906320955, + "grad_norm": 17.258895874023438, + "learning_rate": 7.096251183545355e-07, + "loss": 2.999, + "step": 85380 + }, + { + "epoch": 2.501735882223687, + "grad_norm": 19.99054527282715, + "learning_rate": 7.088132006866444e-07, + "loss": 3.0207, + "step": 85390 + }, + { + "epoch": 2.502028858126419, + "grad_norm": 15.208708763122559, + "learning_rate": 7.080017123265337e-07, + "loss": 3.0009, + "step": 85400 + }, + { + "epoch": 2.502321834029151, + 
"grad_norm": 23.011699676513672, + "learning_rate": 7.07190653355389e-07, + "loss": 3.0133, + "step": 85410 + }, + { + "epoch": 2.5026148099318832, + "grad_norm": 20.177927017211914, + "learning_rate": 7.063800238543506e-07, + "loss": 3.0348, + "step": 85420 + }, + { + "epoch": 2.502907785834615, + "grad_norm": 16.902067184448242, + "learning_rate": 7.055698239045178e-07, + "loss": 2.9985, + "step": 85430 + }, + { + "epoch": 2.503200761737347, + "grad_norm": 20.288673400878906, + "learning_rate": 7.047600535869459e-07, + "loss": 3.0204, + "step": 85440 + }, + { + "epoch": 2.503493737640079, + "grad_norm": 20.444490432739258, + "learning_rate": 7.039507129826484e-07, + "loss": 2.9904, + "step": 85450 + }, + { + "epoch": 2.5037867135428113, + "grad_norm": 19.503978729248047, + "learning_rate": 7.031418021725933e-07, + "loss": 3.0017, + "step": 85460 + }, + { + "epoch": 2.5040796894455433, + "grad_norm": 16.01935577392578, + "learning_rate": 7.023333212377087e-07, + "loss": 3.0023, + "step": 85470 + }, + { + "epoch": 2.504372665348275, + "grad_norm": 16.312366485595703, + "learning_rate": 7.015252702588782e-07, + "loss": 3.0015, + "step": 85480 + }, + { + "epoch": 2.504665641251007, + "grad_norm": 20.49848175048828, + "learning_rate": 7.007176493169437e-07, + "loss": 3.0166, + "step": 85490 + }, + { + "epoch": 2.504958617153739, + "grad_norm": 19.448575973510742, + "learning_rate": 6.999104584927002e-07, + "loss": 3.0227, + "step": 85500 + }, + { + "epoch": 2.505075807514832, + "eval_bleu": 0.3532959686932189, + "eval_cap_loss": 0.9008821249008179, + "eval_con_loss": 1.129814863204956, + "eval_loss": 3.1605117321014404, + "step": 85504 + }, + { + "epoch": 2.505075807514832, + "eval_bleu": 0.3532959686932189, + "eval_cap_loss": 0.9008821249008179, + "eval_con_loss": 1.129814863204956, + "eval_loss": 3.1605117321014404, + "eval_runtime": 53.329, + "eval_samples_per_second": 375.03, + "eval_steps_per_second": 0.375, + "step": 85504 + }, + { + "epoch": 2.505251593056471, 
+ "grad_norm": 18.25177574157715, + "learning_rate": 6.991036978669053e-07, + "loss": 3.0146, + "step": 85510 + }, + { + "epoch": 2.505544568959203, + "grad_norm": 17.764753341674805, + "learning_rate": 6.982973675202676e-07, + "loss": 3.0085, + "step": 85520 + }, + { + "epoch": 2.505837544861935, + "grad_norm": 18.97653579711914, + "learning_rate": 6.974914675334582e-07, + "loss": 3.0098, + "step": 85530 + }, + { + "epoch": 2.506130520764667, + "grad_norm": 17.253259658813477, + "learning_rate": 6.966859979871005e-07, + "loss": 3.0028, + "step": 85540 + }, + { + "epoch": 2.506423496667399, + "grad_norm": 17.210521697998047, + "learning_rate": 6.95880958961777e-07, + "loss": 3.0112, + "step": 85550 + }, + { + "epoch": 2.506716472570131, + "grad_norm": 18.647876739501953, + "learning_rate": 6.950763505380303e-07, + "loss": 3.032, + "step": 85560 + }, + { + "epoch": 2.5070094484728633, + "grad_norm": 20.11094093322754, + "learning_rate": 6.942721727963531e-07, + "loss": 3.0018, + "step": 85570 + }, + { + "epoch": 2.5073024243755953, + "grad_norm": 18.80077362060547, + "learning_rate": 6.934684258172014e-07, + "loss": 3.032, + "step": 85580 + }, + { + "epoch": 2.507595400278327, + "grad_norm": 21.0344181060791, + "learning_rate": 6.926651096809827e-07, + "loss": 3.0139, + "step": 85590 + }, + { + "epoch": 2.507888376181059, + "grad_norm": 16.169574737548828, + "learning_rate": 6.918622244680667e-07, + "loss": 3.0091, + "step": 85600 + }, + { + "epoch": 2.508181352083791, + "grad_norm": 19.146440505981445, + "learning_rate": 6.910597702587746e-07, + "loss": 2.9986, + "step": 85610 + }, + { + "epoch": 2.508474327986523, + "grad_norm": 19.657989501953125, + "learning_rate": 6.902577471333893e-07, + "loss": 2.9988, + "step": 85620 + }, + { + "epoch": 2.508767303889255, + "grad_norm": 21.633657455444336, + "learning_rate": 6.894561551721452e-07, + "loss": 3.0165, + "step": 85630 + }, + { + "epoch": 2.509060279791987, + "grad_norm": 18.518444061279297, + "learning_rate": 
6.886549944552417e-07, + "loss": 3.0176, + "step": 85640 + }, + { + "epoch": 2.509353255694719, + "grad_norm": 17.72414207458496, + "learning_rate": 6.878542650628267e-07, + "loss": 2.9963, + "step": 85650 + }, + { + "epoch": 2.509646231597451, + "grad_norm": 17.76559066772461, + "learning_rate": 6.870539670750104e-07, + "loss": 3.0051, + "step": 85660 + }, + { + "epoch": 2.509939207500183, + "grad_norm": 18.30486488342285, + "learning_rate": 6.862541005718554e-07, + "loss": 3.0276, + "step": 85670 + }, + { + "epoch": 2.5102321834029153, + "grad_norm": 19.45911407470703, + "learning_rate": 6.854546656333866e-07, + "loss": 3.0224, + "step": 85680 + }, + { + "epoch": 2.5105251593056472, + "grad_norm": 18.802244186401367, + "learning_rate": 6.846556623395795e-07, + "loss": 3.0257, + "step": 85690 + }, + { + "epoch": 2.510818135208379, + "grad_norm": 17.54297637939453, + "learning_rate": 6.838570907703729e-07, + "loss": 3.03, + "step": 85700 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 18.43825912475586, + "learning_rate": 6.830589510056551e-07, + "loss": 2.9857, + "step": 85710 + }, + { + "epoch": 2.511404087013843, + "grad_norm": 13.514822006225586, + "learning_rate": 6.822612431252807e-07, + "loss": 3.0219, + "step": 85720 + }, + { + "epoch": 2.511697062916575, + "grad_norm": 18.195898056030273, + "learning_rate": 6.814639672090506e-07, + "loss": 2.9958, + "step": 85730 + }, + { + "epoch": 2.5119900388193073, + "grad_norm": 16.77031898498535, + "learning_rate": 6.806671233367312e-07, + "loss": 3.031, + "step": 85740 + }, + { + "epoch": 2.512283014722039, + "grad_norm": 18.377988815307617, + "learning_rate": 6.798707115880393e-07, + "loss": 3.0172, + "step": 85750 + }, + { + "epoch": 2.512575990624771, + "grad_norm": 18.54923439025879, + "learning_rate": 6.790747320426527e-07, + "loss": 3.002, + "step": 85760 + }, + { + "epoch": 2.512868966527503, + "grad_norm": 17.192943572998047, + "learning_rate": 6.782791847802033e-07, + "loss": 3.0148, + "step": 85770 + 
}, + { + "epoch": 2.5131619424302354, + "grad_norm": 17.049251556396484, + "learning_rate": 6.774840698802831e-07, + "loss": 3.0014, + "step": 85780 + }, + { + "epoch": 2.5134549183329673, + "grad_norm": 18.14607048034668, + "learning_rate": 6.766893874224346e-07, + "loss": 3.0144, + "step": 85790 + }, + { + "epoch": 2.513747894235699, + "grad_norm": 18.55379295349121, + "learning_rate": 6.758951374861656e-07, + "loss": 3.0041, + "step": 85800 + }, + { + "epoch": 2.514040870138431, + "grad_norm": 17.535724639892578, + "learning_rate": 6.751013201509333e-07, + "loss": 3.0296, + "step": 85810 + }, + { + "epoch": 2.514333846041163, + "grad_norm": 23.37264633178711, + "learning_rate": 6.743079354961557e-07, + "loss": 3.0121, + "step": 85820 + }, + { + "epoch": 2.514626821943895, + "grad_norm": 17.31527328491211, + "learning_rate": 6.735149836012051e-07, + "loss": 3.0155, + "step": 85830 + }, + { + "epoch": 2.514919797846627, + "grad_norm": 19.001907348632812, + "learning_rate": 6.727224645454133e-07, + "loss": 3.0262, + "step": 85840 + }, + { + "epoch": 2.5152127737493593, + "grad_norm": 19.481807708740234, + "learning_rate": 6.719303784080644e-07, + "loss": 2.9893, + "step": 85850 + }, + { + "epoch": 2.515505749652091, + "grad_norm": 16.271350860595703, + "learning_rate": 6.711387252684054e-07, + "loss": 3.0037, + "step": 85860 + }, + { + "epoch": 2.515798725554823, + "grad_norm": 20.782045364379883, + "learning_rate": 6.703475052056319e-07, + "loss": 2.9929, + "step": 85870 + }, + { + "epoch": 2.516091701457555, + "grad_norm": 19.12059783935547, + "learning_rate": 6.69556718298906e-07, + "loss": 3.009, + "step": 85880 + }, + { + "epoch": 2.5163846773602874, + "grad_norm": 19.708202362060547, + "learning_rate": 6.687663646273379e-07, + "loss": 3.0085, + "step": 85890 + }, + { + "epoch": 2.5166776532630193, + "grad_norm": 21.097524642944336, + "learning_rate": 6.679764442699987e-07, + "loss": 3.0123, + "step": 85900 + }, + { + "epoch": 2.516970629165751, + "grad_norm": 
20.99749183654785, + "learning_rate": 6.671869573059159e-07, + "loss": 3.0067, + "step": 85910 + }, + { + "epoch": 2.517263605068483, + "grad_norm": 17.86103630065918, + "learning_rate": 6.663979038140716e-07, + "loss": 2.9784, + "step": 85920 + }, + { + "epoch": 2.517556580971215, + "grad_norm": 19.50977325439453, + "learning_rate": 6.656092838734074e-07, + "loss": 3.0037, + "step": 85930 + }, + { + "epoch": 2.517849556873947, + "grad_norm": 19.079174041748047, + "learning_rate": 6.648210975628188e-07, + "loss": 2.9963, + "step": 85940 + }, + { + "epoch": 2.518142532776679, + "grad_norm": 18.18479347229004, + "learning_rate": 6.640333449611592e-07, + "loss": 3.0013, + "step": 85950 + }, + { + "epoch": 2.5184355086794112, + "grad_norm": 13.419715881347656, + "learning_rate": 6.632460261472389e-07, + "loss": 3.0191, + "step": 85960 + }, + { + "epoch": 2.518728484582143, + "grad_norm": 17.76589584350586, + "learning_rate": 6.62459141199826e-07, + "loss": 3.0124, + "step": 85970 + }, + { + "epoch": 2.519021460484875, + "grad_norm": 18.231338500976562, + "learning_rate": 6.616726901976411e-07, + "loss": 3.0317, + "step": 85980 + }, + { + "epoch": 2.519314436387607, + "grad_norm": 21.053077697753906, + "learning_rate": 6.608866732193664e-07, + "loss": 2.9805, + "step": 85990 + }, + { + "epoch": 2.5196074122903394, + "grad_norm": 18.348785400390625, + "learning_rate": 6.601010903436355e-07, + "loss": 3.0099, + "step": 86000 + }, + { + "epoch": 2.5199003881930713, + "grad_norm": 18.384868621826172, + "learning_rate": 6.593159416490441e-07, + "loss": 3.0043, + "step": 86010 + }, + { + "epoch": 2.52007617373471, + "eval_bleu": 0.3534050120266615, + "eval_cap_loss": 0.9008654952049255, + "eval_con_loss": 1.1297690868377686, + "eval_loss": 3.1604037284851074, + "step": 86016 + }, + { + "epoch": 2.52007617373471, + "eval_bleu": 0.3534050120266615, + "eval_cap_loss": 0.9008654952049255, + "eval_con_loss": 1.1297690868377686, + "eval_loss": 3.1604037284851074, + "eval_runtime": 
52.8856, + "eval_samples_per_second": 378.175, + "eval_steps_per_second": 0.378, + "step": 86016 + }, + { + "epoch": 2.520193364095803, + "grad_norm": 18.899906158447266, + "learning_rate": 6.585312272141392e-07, + "loss": 2.9967, + "step": 86020 + }, + { + "epoch": 2.520486339998535, + "grad_norm": 18.40736961364746, + "learning_rate": 6.578253555796443e-07, + "loss": 2.9979, + "step": 86030 + }, + { + "epoch": 2.520779315901267, + "grad_norm": 16.399816513061523, + "learning_rate": 6.570414664543939e-07, + "loss": 2.9983, + "step": 86040 + }, + { + "epoch": 2.521072291803999, + "grad_norm": 15.588159561157227, + "learning_rate": 6.562580118163803e-07, + "loss": 3.0113, + "step": 86050 + }, + { + "epoch": 2.521365267706731, + "grad_norm": 17.521591186523438, + "learning_rate": 6.554749917439806e-07, + "loss": 2.9907, + "step": 86060 + }, + { + "epoch": 2.5216582436094632, + "grad_norm": 14.876383781433105, + "learning_rate": 6.546924063155336e-07, + "loss": 3.0223, + "step": 86070 + }, + { + "epoch": 2.521951219512195, + "grad_norm": 16.654525756835938, + "learning_rate": 6.539102556093308e-07, + "loss": 3.009, + "step": 86080 + }, + { + "epoch": 2.522244195414927, + "grad_norm": 17.35709571838379, + "learning_rate": 6.531285397036235e-07, + "loss": 3.0199, + "step": 86090 + }, + { + "epoch": 2.522537171317659, + "grad_norm": 19.60348129272461, + "learning_rate": 6.523472586766161e-07, + "loss": 3.015, + "step": 86100 + }, + { + "epoch": 2.5228301472203913, + "grad_norm": 19.45755386352539, + "learning_rate": 6.515664126064725e-07, + "loss": 3.0228, + "step": 86110 + }, + { + "epoch": 2.5231231231231233, + "grad_norm": 16.082107543945312, + "learning_rate": 6.507860015713097e-07, + "loss": 3.0197, + "step": 86120 + }, + { + "epoch": 2.523416099025855, + "grad_norm": 19.187990188598633, + "learning_rate": 6.500060256492058e-07, + "loss": 3.0154, + "step": 86130 + }, + { + "epoch": 2.523709074928587, + "grad_norm": 18.414337158203125, + "learning_rate": 
6.492264849181901e-07, + "loss": 3.0162, + "step": 86140 + }, + { + "epoch": 2.524002050831319, + "grad_norm": 16.668455123901367, + "learning_rate": 6.484473794562523e-07, + "loss": 3.016, + "step": 86150 + }, + { + "epoch": 2.524295026734051, + "grad_norm": 20.33777618408203, + "learning_rate": 6.476687093413375e-07, + "loss": 3.0028, + "step": 86160 + }, + { + "epoch": 2.5245880026367833, + "grad_norm": 19.090192794799805, + "learning_rate": 6.468904746513477e-07, + "loss": 3.0118, + "step": 86170 + }, + { + "epoch": 2.524880978539515, + "grad_norm": 14.876152992248535, + "learning_rate": 6.461126754641389e-07, + "loss": 3.0022, + "step": 86180 + }, + { + "epoch": 2.525173954442247, + "grad_norm": 17.020986557006836, + "learning_rate": 6.453353118575273e-07, + "loss": 3.0055, + "step": 86190 + }, + { + "epoch": 2.525466930344979, + "grad_norm": 20.991445541381836, + "learning_rate": 6.445583839092817e-07, + "loss": 3.009, + "step": 86200 + }, + { + "epoch": 2.525759906247711, + "grad_norm": 17.447317123413086, + "learning_rate": 6.4378189169713e-07, + "loss": 3.0017, + "step": 86210 + }, + { + "epoch": 2.5260528821504433, + "grad_norm": 18.08540916442871, + "learning_rate": 6.43005835298755e-07, + "loss": 2.9983, + "step": 86220 + }, + { + "epoch": 2.5263458580531752, + "grad_norm": 18.26626205444336, + "learning_rate": 6.42230214791797e-07, + "loss": 3.0167, + "step": 86230 + }, + { + "epoch": 2.526638833955907, + "grad_norm": 21.622207641601562, + "learning_rate": 6.414550302538524e-07, + "loss": 3.014, + "step": 86240 + }, + { + "epoch": 2.526931809858639, + "grad_norm": 15.63527774810791, + "learning_rate": 6.40680281762473e-07, + "loss": 2.9973, + "step": 86250 + }, + { + "epoch": 2.527224785761371, + "grad_norm": 18.38599395751953, + "learning_rate": 6.399059693951704e-07, + "loss": 3.006, + "step": 86260 + }, + { + "epoch": 2.527517761664103, + "grad_norm": 18.235105514526367, + "learning_rate": 6.391320932294065e-07, + "loss": 2.9952, + "step": 86270 + 
}, + { + "epoch": 2.5278107375668353, + "grad_norm": 17.936641693115234, + "learning_rate": 6.383586533426051e-07, + "loss": 3.0102, + "step": 86280 + }, + { + "epoch": 2.528103713469567, + "grad_norm": 15.544479370117188, + "learning_rate": 6.375856498121435e-07, + "loss": 2.9942, + "step": 86290 + }, + { + "epoch": 2.528396689372299, + "grad_norm": 20.501453399658203, + "learning_rate": 6.368130827153568e-07, + "loss": 3.0028, + "step": 86300 + }, + { + "epoch": 2.528689665275031, + "grad_norm": 16.35834312438965, + "learning_rate": 6.360409521295347e-07, + "loss": 3.0017, + "step": 86310 + }, + { + "epoch": 2.5289826411777634, + "grad_norm": 17.726930618286133, + "learning_rate": 6.352692581319242e-07, + "loss": 3.0144, + "step": 86320 + }, + { + "epoch": 2.5292756170804953, + "grad_norm": 16.19867515563965, + "learning_rate": 6.344980007997298e-07, + "loss": 3.0055, + "step": 86330 + }, + { + "epoch": 2.5295685929832272, + "grad_norm": 17.924867630004883, + "learning_rate": 6.337271802101119e-07, + "loss": 2.995, + "step": 86340 + }, + { + "epoch": 2.529861568885959, + "grad_norm": 15.651344299316406, + "learning_rate": 6.32956796440184e-07, + "loss": 3.0024, + "step": 86350 + }, + { + "epoch": 2.530154544788691, + "grad_norm": 19.937410354614258, + "learning_rate": 6.321868495670213e-07, + "loss": 3.0076, + "step": 86360 + }, + { + "epoch": 2.530447520691423, + "grad_norm": 18.550886154174805, + "learning_rate": 6.314173396676493e-07, + "loss": 3.0211, + "step": 86370 + }, + { + "epoch": 2.530740496594155, + "grad_norm": 18.702125549316406, + "learning_rate": 6.306482668190561e-07, + "loss": 3.0044, + "step": 86380 + }, + { + "epoch": 2.5310334724968873, + "grad_norm": 12.177067756652832, + "learning_rate": 6.298796310981803e-07, + "loss": 2.9864, + "step": 86390 + }, + { + "epoch": 2.531326448399619, + "grad_norm": 19.582748413085938, + "learning_rate": 6.291114325819203e-07, + "loss": 2.9956, + "step": 86400 + }, + { + "epoch": 2.531619424302351, + 
"grad_norm": 14.984067916870117, + "learning_rate": 6.283436713471297e-07, + "loss": 3.0231, + "step": 86410 + }, + { + "epoch": 2.531912400205083, + "grad_norm": 17.480417251586914, + "learning_rate": 6.275763474706198e-07, + "loss": 3.0184, + "step": 86420 + }, + { + "epoch": 2.5322053761078154, + "grad_norm": 16.124954223632812, + "learning_rate": 6.268094610291548e-07, + "loss": 3.0125, + "step": 86430 + }, + { + "epoch": 2.5324983520105473, + "grad_norm": 17.422473907470703, + "learning_rate": 6.260430120994587e-07, + "loss": 2.991, + "step": 86440 + }, + { + "epoch": 2.532791327913279, + "grad_norm": 16.827219009399414, + "learning_rate": 6.252770007582087e-07, + "loss": 2.9972, + "step": 86450 + }, + { + "epoch": 2.533084303816011, + "grad_norm": 16.922569274902344, + "learning_rate": 6.245114270820412e-07, + "loss": 3.0071, + "step": 86460 + }, + { + "epoch": 2.533377279718743, + "grad_norm": 18.0770206451416, + "learning_rate": 6.237462911475456e-07, + "loss": 3.0045, + "step": 86470 + }, + { + "epoch": 2.533670255621475, + "grad_norm": 20.486726760864258, + "learning_rate": 6.2298159303127e-07, + "loss": 2.9912, + "step": 86480 + }, + { + "epoch": 2.533963231524207, + "grad_norm": 21.69667625427246, + "learning_rate": 6.222173328097181e-07, + "loss": 3.0211, + "step": 86490 + }, + { + "epoch": 2.5342562074269392, + "grad_norm": 21.851579666137695, + "learning_rate": 6.214535105593505e-07, + "loss": 3.0104, + "step": 86500 + }, + { + "epoch": 2.534549183329671, + "grad_norm": 22.962636947631836, + "learning_rate": 6.206901263565812e-07, + "loss": 2.9939, + "step": 86510 + }, + { + "epoch": 2.534842159232403, + "grad_norm": 19.442575454711914, + "learning_rate": 6.199271802777846e-07, + "loss": 3.0145, + "step": 86520 + }, + { + "epoch": 2.5350765399545887, + "eval_bleu": 0.35347069259372266, + "eval_cap_loss": 0.9006245136260986, + "eval_con_loss": 1.129434585571289, + "eval_loss": 3.1594934463500977, + "step": 86528 + }, + { + "epoch": 2.5350765399545887, 
+ "eval_bleu": 0.35347069259372266, + "eval_cap_loss": 0.9006245136260986, + "eval_con_loss": 1.129434585571289, + "eval_loss": 3.1594934463500977, + "eval_runtime": 54.7482, + "eval_samples_per_second": 365.309, + "eval_steps_per_second": 0.365, + "step": 86528 + }, + { + "epoch": 2.535135135135135, + "grad_norm": 21.813358306884766, + "learning_rate": 6.191646723992861e-07, + "loss": 3.0098, + "step": 86530 + }, + { + "epoch": 2.5354281110378674, + "grad_norm": 19.217350006103516, + "learning_rate": 6.184026027973728e-07, + "loss": 3.0186, + "step": 86540 + }, + { + "epoch": 2.5357210869405993, + "grad_norm": 19.352174758911133, + "learning_rate": 6.176409715482828e-07, + "loss": 3.0147, + "step": 86550 + }, + { + "epoch": 2.536014062843331, + "grad_norm": 15.945828437805176, + "learning_rate": 6.168797787282138e-07, + "loss": 3.0207, + "step": 86560 + }, + { + "epoch": 2.536307038746063, + "grad_norm": 19.2208309173584, + "learning_rate": 6.161190244133186e-07, + "loss": 3.0285, + "step": 86570 + }, + { + "epoch": 2.536600014648795, + "grad_norm": 18.736003875732422, + "learning_rate": 6.15358708679708e-07, + "loss": 3.0038, + "step": 86580 + }, + { + "epoch": 2.536892990551527, + "grad_norm": 16.73753547668457, + "learning_rate": 6.145988316034441e-07, + "loss": 2.9917, + "step": 86590 + }, + { + "epoch": 2.537185966454259, + "grad_norm": 18.515626907348633, + "learning_rate": 6.13839393260549e-07, + "loss": 3.0136, + "step": 86600 + }, + { + "epoch": 2.5374789423569912, + "grad_norm": 19.875171661376953, + "learning_rate": 6.13080393727002e-07, + "loss": 3.0081, + "step": 86610 + }, + { + "epoch": 2.537771918259723, + "grad_norm": 19.21499252319336, + "learning_rate": 6.123218330787334e-07, + "loss": 3.0192, + "step": 86620 + }, + { + "epoch": 2.538064894162455, + "grad_norm": 19.072229385375977, + "learning_rate": 6.115637113916356e-07, + "loss": 3.0189, + "step": 86630 + }, + { + "epoch": 2.538357870065187, + "grad_norm": 19.506671905517578, + 
"learning_rate": 6.108060287415518e-07, + "loss": 3.0197, + "step": 86640 + }, + { + "epoch": 2.5386508459679193, + "grad_norm": 15.389975547790527, + "learning_rate": 6.100487852042847e-07, + "loss": 3.0104, + "step": 86650 + }, + { + "epoch": 2.5389438218706513, + "grad_norm": 17.271730422973633, + "learning_rate": 6.092919808555914e-07, + "loss": 3.0087, + "step": 86660 + }, + { + "epoch": 2.539236797773383, + "grad_norm": 16.98444175720215, + "learning_rate": 6.085356157711875e-07, + "loss": 3.0083, + "step": 86670 + }, + { + "epoch": 2.539529773676115, + "grad_norm": 15.897809028625488, + "learning_rate": 6.077796900267408e-07, + "loss": 3.005, + "step": 86680 + }, + { + "epoch": 2.539822749578847, + "grad_norm": 18.30794906616211, + "learning_rate": 6.070242036978785e-07, + "loss": 3.0334, + "step": 86690 + }, + { + "epoch": 2.540115725481579, + "grad_norm": 17.225875854492188, + "learning_rate": 6.062691568601814e-07, + "loss": 3.0156, + "step": 86700 + }, + { + "epoch": 2.5404087013843113, + "grad_norm": 17.69049835205078, + "learning_rate": 6.05514549589189e-07, + "loss": 3.004, + "step": 86710 + }, + { + "epoch": 2.540701677287043, + "grad_norm": 17.980615615844727, + "learning_rate": 6.047603819603931e-07, + "loss": 3.007, + "step": 86720 + }, + { + "epoch": 2.540994653189775, + "grad_norm": 16.401500701904297, + "learning_rate": 6.040066540492451e-07, + "loss": 3.0302, + "step": 86730 + }, + { + "epoch": 2.541287629092507, + "grad_norm": 17.265226364135742, + "learning_rate": 6.032533659311507e-07, + "loss": 3.0234, + "step": 86740 + }, + { + "epoch": 2.5415806049952394, + "grad_norm": 16.30717658996582, + "learning_rate": 6.02500517681473e-07, + "loss": 2.9971, + "step": 86750 + }, + { + "epoch": 2.5418735808979713, + "grad_norm": 21.720474243164062, + "learning_rate": 6.017481093755284e-07, + "loss": 3.0197, + "step": 86760 + }, + { + "epoch": 2.5421665568007032, + "grad_norm": 17.080598831176758, + "learning_rate": 6.009961410885923e-07, + "loss": 
3.0252, + "step": 86770 + }, + { + "epoch": 2.542459532703435, + "grad_norm": 18.03256607055664, + "learning_rate": 6.002446128958927e-07, + "loss": 2.9958, + "step": 86780 + }, + { + "epoch": 2.542752508606167, + "grad_norm": 20.34314727783203, + "learning_rate": 5.994935248726181e-07, + "loss": 2.9891, + "step": 86790 + }, + { + "epoch": 2.543045484508899, + "grad_norm": 19.04084014892578, + "learning_rate": 5.987428770939075e-07, + "loss": 3.0004, + "step": 86800 + }, + { + "epoch": 2.543338460411631, + "grad_norm": 19.6439151763916, + "learning_rate": 5.97992669634861e-07, + "loss": 2.9978, + "step": 86810 + }, + { + "epoch": 2.5436314363143633, + "grad_norm": 21.862640380859375, + "learning_rate": 5.972429025705312e-07, + "loss": 3.0054, + "step": 86820 + }, + { + "epoch": 2.543924412217095, + "grad_norm": 21.5358943939209, + "learning_rate": 5.964935759759294e-07, + "loss": 3.0157, + "step": 86830 + }, + { + "epoch": 2.544217388119827, + "grad_norm": 19.041671752929688, + "learning_rate": 5.957446899260189e-07, + "loss": 3.0177, + "step": 86840 + }, + { + "epoch": 2.544510364022559, + "grad_norm": 17.426652908325195, + "learning_rate": 5.949962444957242e-07, + "loss": 3.017, + "step": 86850 + }, + { + "epoch": 2.5448033399252914, + "grad_norm": 17.22276496887207, + "learning_rate": 5.942482397599198e-07, + "loss": 2.9936, + "step": 86860 + }, + { + "epoch": 2.5450963158280233, + "grad_norm": 18.450637817382812, + "learning_rate": 5.935006757934419e-07, + "loss": 2.9999, + "step": 86870 + }, + { + "epoch": 2.5453892917307552, + "grad_norm": 13.976837158203125, + "learning_rate": 5.927535526710776e-07, + "loss": 3.0055, + "step": 86880 + }, + { + "epoch": 2.545682267633487, + "grad_norm": 21.133644104003906, + "learning_rate": 5.920068704675724e-07, + "loss": 3.0105, + "step": 86890 + }, + { + "epoch": 2.545975243536219, + "grad_norm": 21.117904663085938, + "learning_rate": 5.912606292576284e-07, + "loss": 2.9931, + "step": 86900 + }, + { + "epoch": 
2.546268219438951, + "grad_norm": 19.416879653930664, + "learning_rate": 5.905148291159035e-07, + "loss": 2.9929, + "step": 86910 + }, + { + "epoch": 2.546561195341683, + "grad_norm": 20.525976181030273, + "learning_rate": 5.897694701170087e-07, + "loss": 3.0037, + "step": 86920 + }, + { + "epoch": 2.5468541712444153, + "grad_norm": 17.757322311401367, + "learning_rate": 5.890245523355137e-07, + "loss": 3.0142, + "step": 86930 + }, + { + "epoch": 2.547147147147147, + "grad_norm": 19.7572078704834, + "learning_rate": 5.882800758459423e-07, + "loss": 3.015, + "step": 86940 + }, + { + "epoch": 2.547440123049879, + "grad_norm": 17.025732040405273, + "learning_rate": 5.875360407227754e-07, + "loss": 3.0095, + "step": 86950 + }, + { + "epoch": 2.547733098952611, + "grad_norm": 17.708866119384766, + "learning_rate": 5.867924470404501e-07, + "loss": 2.9993, + "step": 86960 + }, + { + "epoch": 2.5480260748553434, + "grad_norm": 19.985048294067383, + "learning_rate": 5.86049294873357e-07, + "loss": 3.0119, + "step": 86970 + }, + { + "epoch": 2.5483190507580753, + "grad_norm": 18.505447387695312, + "learning_rate": 5.853065842958445e-07, + "loss": 3.0096, + "step": 86980 + }, + { + "epoch": 2.548612026660807, + "grad_norm": 16.291067123413086, + "learning_rate": 5.845643153822173e-07, + "loss": 2.9947, + "step": 86990 + }, + { + "epoch": 2.548905002563539, + "grad_norm": 19.931598663330078, + "learning_rate": 5.838224882067351e-07, + "loss": 2.9917, + "step": 87000 + }, + { + "epoch": 2.549197978466271, + "grad_norm": 24.252531051635742, + "learning_rate": 5.830811028436117e-07, + "loss": 3.0104, + "step": 87010 + }, + { + "epoch": 2.549490954369003, + "grad_norm": 17.80961036682129, + "learning_rate": 5.823401593670209e-07, + "loss": 2.9947, + "step": 87020 + }, + { + "epoch": 2.549783930271735, + "grad_norm": 15.64791488647461, + "learning_rate": 5.815996578510863e-07, + "loss": 3.0174, + "step": 87030 + }, + { + "epoch": 2.5500769061744673, + "grad_norm": 
17.995651245117188, + "learning_rate": 5.808595983698939e-07, + "loss": 3.0062, + "step": 87040 + }, + { + "epoch": 2.5500769061744673, + "eval_bleu": 0.353549966666215, + "eval_cap_loss": 0.9004603624343872, + "eval_con_loss": 1.1295024156570435, + "eval_loss": 3.1594650745391846, + "step": 87040 + }, + { + "epoch": 2.5500769061744673, + "eval_bleu": 0.353549966666215, + "eval_cap_loss": 0.9004603624343872, + "eval_con_loss": 1.1295024156570435, + "eval_loss": 3.1594650745391846, + "eval_runtime": 54.3565, + "eval_samples_per_second": 367.941, + "eval_steps_per_second": 0.368, + "step": 87040 + }, + { + "epoch": 2.550369882077199, + "grad_norm": 13.49088191986084, + "learning_rate": 5.801199809974795e-07, + "loss": 3.0191, + "step": 87050 + }, + { + "epoch": 2.550662857979931, + "grad_norm": 16.700044631958008, + "learning_rate": 5.793808058078388e-07, + "loss": 2.9906, + "step": 87060 + }, + { + "epoch": 2.550955833882663, + "grad_norm": 17.2686710357666, + "learning_rate": 5.78642072874922e-07, + "loss": 3.0035, + "step": 87070 + }, + { + "epoch": 2.5512488097853954, + "grad_norm": 16.481868743896484, + "learning_rate": 5.779037822726357e-07, + "loss": 2.9937, + "step": 87080 + }, + { + "epoch": 2.5515417856881273, + "grad_norm": 15.035865783691406, + "learning_rate": 5.771659340748397e-07, + "loss": 3.0002, + "step": 87090 + }, + { + "epoch": 2.551834761590859, + "grad_norm": 22.42452049255371, + "learning_rate": 5.764285283553528e-07, + "loss": 2.9845, + "step": 87100 + }, + { + "epoch": 2.552127737493591, + "grad_norm": 20.742107391357422, + "learning_rate": 5.756915651879469e-07, + "loss": 3.0087, + "step": 87110 + }, + { + "epoch": 2.552420713396323, + "grad_norm": 17.981250762939453, + "learning_rate": 5.749550446463525e-07, + "loss": 2.9997, + "step": 87120 + }, + { + "epoch": 2.552713689299055, + "grad_norm": 18.337379455566406, + "learning_rate": 5.74218966804252e-07, + "loss": 2.9873, + "step": 87130 + }, + { + "epoch": 2.5530066652017873, + 
"grad_norm": 19.762954711914062, + "learning_rate": 5.734833317352861e-07, + "loss": 2.9901, + "step": 87140 + }, + { + "epoch": 2.5532996411045192, + "grad_norm": 20.511062622070312, + "learning_rate": 5.727481395130513e-07, + "loss": 3.0104, + "step": 87150 + }, + { + "epoch": 2.553592617007251, + "grad_norm": 21.13945960998535, + "learning_rate": 5.720133902111008e-07, + "loss": 3.0164, + "step": 87160 + }, + { + "epoch": 2.553885592909983, + "grad_norm": 20.483720779418945, + "learning_rate": 5.712790839029386e-07, + "loss": 3.0173, + "step": 87170 + }, + { + "epoch": 2.554178568812715, + "grad_norm": 20.48524284362793, + "learning_rate": 5.70545220662031e-07, + "loss": 3.0005, + "step": 87180 + }, + { + "epoch": 2.5544715447154474, + "grad_norm": 20.822601318359375, + "learning_rate": 5.698118005617931e-07, + "loss": 2.9845, + "step": 87190 + }, + { + "epoch": 2.5547645206181793, + "grad_norm": 20.08998680114746, + "learning_rate": 5.690788236756028e-07, + "loss": 2.9859, + "step": 87200 + }, + { + "epoch": 2.555057496520911, + "grad_norm": 22.04472541809082, + "learning_rate": 5.683462900767873e-07, + "loss": 3.029, + "step": 87210 + }, + { + "epoch": 2.555350472423643, + "grad_norm": 24.662607192993164, + "learning_rate": 5.676141998386337e-07, + "loss": 2.9855, + "step": 87220 + }, + { + "epoch": 2.555643448326375, + "grad_norm": 17.64457130432129, + "learning_rate": 5.668825530343824e-07, + "loss": 3.0024, + "step": 87230 + }, + { + "epoch": 2.555936424229107, + "grad_norm": 21.537452697753906, + "learning_rate": 5.661513497372323e-07, + "loss": 3.0137, + "step": 87240 + }, + { + "epoch": 2.5562294001318393, + "grad_norm": 23.648805618286133, + "learning_rate": 5.65420590020333e-07, + "loss": 3.0086, + "step": 87250 + }, + { + "epoch": 2.556522376034571, + "grad_norm": 20.97948455810547, + "learning_rate": 5.646902739567955e-07, + "loss": 2.995, + "step": 87260 + }, + { + "epoch": 2.556815351937303, + "grad_norm": 19.427898406982422, + "learning_rate": 
5.639604016196814e-07, + "loss": 3.0055, + "step": 87270 + }, + { + "epoch": 2.557108327840035, + "grad_norm": 20.0692138671875, + "learning_rate": 5.632309730820124e-07, + "loss": 3.0166, + "step": 87280 + }, + { + "epoch": 2.5574013037427674, + "grad_norm": 15.361127853393555, + "learning_rate": 5.625019884167604e-07, + "loss": 3.0133, + "step": 87290 + }, + { + "epoch": 2.5576942796454993, + "grad_norm": 15.902318954467773, + "learning_rate": 5.617734476968578e-07, + "loss": 3.0026, + "step": 87300 + }, + { + "epoch": 2.5579872555482313, + "grad_norm": 17.2949275970459, + "learning_rate": 5.610453509951907e-07, + "loss": 3.0052, + "step": 87310 + }, + { + "epoch": 2.558280231450963, + "grad_norm": 16.17302131652832, + "learning_rate": 5.603176983846009e-07, + "loss": 2.9848, + "step": 87320 + }, + { + "epoch": 2.558573207353695, + "grad_norm": 19.1361026763916, + "learning_rate": 5.595904899378868e-07, + "loss": 3.007, + "step": 87330 + }, + { + "epoch": 2.558866183256427, + "grad_norm": 17.501277923583984, + "learning_rate": 5.588637257277991e-07, + "loss": 2.9986, + "step": 87340 + }, + { + "epoch": 2.559159159159159, + "grad_norm": 18.633203506469727, + "learning_rate": 5.581374058270483e-07, + "loss": 3.0013, + "step": 87350 + }, + { + "epoch": 2.5594521350618913, + "grad_norm": 23.049753189086914, + "learning_rate": 5.574115303082961e-07, + "loss": 3.0001, + "step": 87360 + }, + { + "epoch": 2.559745110964623, + "grad_norm": 16.35970687866211, + "learning_rate": 5.566860992441642e-07, + "loss": 3.0077, + "step": 87370 + }, + { + "epoch": 2.560038086867355, + "grad_norm": 18.18650245666504, + "learning_rate": 5.559611127072262e-07, + "loss": 3.0099, + "step": 87380 + }, + { + "epoch": 2.560331062770087, + "grad_norm": 20.71776580810547, + "learning_rate": 5.552365707700131e-07, + "loss": 3.0086, + "step": 87390 + }, + { + "epoch": 2.5606240386728194, + "grad_norm": 17.872573852539062, + "learning_rate": 5.545124735050112e-07, + "loss": 3.0053, + "step": 
87400 + }, + { + "epoch": 2.5609170145755513, + "grad_norm": 16.148075103759766, + "learning_rate": 5.537888209846631e-07, + "loss": 3.0253, + "step": 87410 + }, + { + "epoch": 2.5612099904782832, + "grad_norm": 19.452617645263672, + "learning_rate": 5.530656132813645e-07, + "loss": 3.0049, + "step": 87420 + }, + { + "epoch": 2.561502966381015, + "grad_norm": 14.919790267944336, + "learning_rate": 5.523428504674688e-07, + "loss": 3.0018, + "step": 87430 + }, + { + "epoch": 2.561795942283747, + "grad_norm": 18.562326431274414, + "learning_rate": 5.516205326152834e-07, + "loss": 3.0167, + "step": 87440 + }, + { + "epoch": 2.562088918186479, + "grad_norm": 19.837541580200195, + "learning_rate": 5.508986597970734e-07, + "loss": 3.0108, + "step": 87450 + }, + { + "epoch": 2.562381894089211, + "grad_norm": 17.91940689086914, + "learning_rate": 5.501772320850552e-07, + "loss": 2.983, + "step": 87460 + }, + { + "epoch": 2.5626748699919433, + "grad_norm": 22.167842864990234, + "learning_rate": 5.494562495514055e-07, + "loss": 3.0088, + "step": 87470 + }, + { + "epoch": 2.562967845894675, + "grad_norm": 16.4490909576416, + "learning_rate": 5.48735712268254e-07, + "loss": 2.9905, + "step": 87480 + }, + { + "epoch": 2.563260821797407, + "grad_norm": 16.09779930114746, + "learning_rate": 5.480156203076869e-07, + "loss": 2.9982, + "step": 87490 + }, + { + "epoch": 2.563553797700139, + "grad_norm": 23.214405059814453, + "learning_rate": 5.472959737417433e-07, + "loss": 3.0022, + "step": 87500 + }, + { + "epoch": 2.5638467736028714, + "grad_norm": 20.94660758972168, + "learning_rate": 5.465767726424214e-07, + "loss": 2.9934, + "step": 87510 + }, + { + "epoch": 2.5641397495056033, + "grad_norm": 18.108434677124023, + "learning_rate": 5.458580170816713e-07, + "loss": 2.9829, + "step": 87520 + }, + { + "epoch": 2.5644327254083352, + "grad_norm": 21.017492294311523, + "learning_rate": 5.451397071314018e-07, + "loss": 2.9946, + "step": 87530 + }, + { + "epoch": 2.564725701311067, + 
"grad_norm": 17.739301681518555, + "learning_rate": 5.44421842863474e-07, + "loss": 3.0073, + "step": 87540 + }, + { + "epoch": 2.565018677213799, + "grad_norm": 20.21487808227539, + "learning_rate": 5.437044243497069e-07, + "loss": 3.0008, + "step": 87550 + }, + { + "epoch": 2.565077272394346, + "eval_bleu": 0.35344327197254155, + "eval_cap_loss": 0.9003407955169678, + "eval_con_loss": 1.1281267404556274, + "eval_loss": 3.1565942764282227, + "step": 87552 + }, + { + "epoch": 2.565077272394346, + "eval_bleu": 0.35344327197254155, + "eval_cap_loss": 0.9003407955169678, + "eval_con_loss": 1.1281267404556274, + "eval_loss": 3.1565942764282227, + "eval_runtime": 53.685, + "eval_samples_per_second": 372.544, + "eval_steps_per_second": 0.373, + "step": 87552 + }, + { + "epoch": 2.565311653116531, + "grad_norm": 21.124574661254883, + "learning_rate": 5.429874516618738e-07, + "loss": 3.0008, + "step": 87560 + }, + { + "epoch": 2.565604629019263, + "grad_norm": 17.54202651977539, + "learning_rate": 5.422709248717045e-07, + "loss": 3.0094, + "step": 87570 + }, + { + "epoch": 2.5658976049219953, + "grad_norm": 19.358041763305664, + "learning_rate": 5.415548440508816e-07, + "loss": 3.0215, + "step": 87580 + }, + { + "epoch": 2.566190580824727, + "grad_norm": 19.34484100341797, + "learning_rate": 5.408392092710463e-07, + "loss": 3.0143, + "step": 87590 + }, + { + "epoch": 2.566483556727459, + "grad_norm": 16.45000648498535, + "learning_rate": 5.401240206037917e-07, + "loss": 2.9837, + "step": 87600 + }, + { + "epoch": 2.566776532630191, + "grad_norm": 14.144309043884277, + "learning_rate": 5.394092781206706e-07, + "loss": 2.9872, + "step": 87610 + }, + { + "epoch": 2.5670695085329234, + "grad_norm": 21.72707748413086, + "learning_rate": 5.386949818931858e-07, + "loss": 3.0117, + "step": 87620 + }, + { + "epoch": 2.5673624844356553, + "grad_norm": 18.90494728088379, + "learning_rate": 5.379811319928008e-07, + "loss": 3.0279, + "step": 87630 + }, + { + "epoch": 2.567655460338387, 
+ "grad_norm": 17.71776580810547, + "learning_rate": 5.372677284909311e-07, + "loss": 3.011, + "step": 87640 + }, + { + "epoch": 2.567948436241119, + "grad_norm": 20.236011505126953, + "learning_rate": 5.365547714589492e-07, + "loss": 3.016, + "step": 87650 + }, + { + "epoch": 2.568241412143851, + "grad_norm": 14.811037063598633, + "learning_rate": 5.358422609681812e-07, + "loss": 3.003, + "step": 87660 + }, + { + "epoch": 2.568534388046583, + "grad_norm": 19.300413131713867, + "learning_rate": 5.351301970899104e-07, + "loss": 3.0161, + "step": 87670 + }, + { + "epoch": 2.5688273639493153, + "grad_norm": 17.17278480529785, + "learning_rate": 5.344185798953744e-07, + "loss": 3.0053, + "step": 87680 + }, + { + "epoch": 2.5691203398520472, + "grad_norm": 20.23556137084961, + "learning_rate": 5.337074094557654e-07, + "loss": 3.0232, + "step": 87690 + }, + { + "epoch": 2.569413315754779, + "grad_norm": 17.721155166625977, + "learning_rate": 5.329966858422326e-07, + "loss": 3.0077, + "step": 87700 + }, + { + "epoch": 2.569706291657511, + "grad_norm": 19.990657806396484, + "learning_rate": 5.322864091258795e-07, + "loss": 2.9911, + "step": 87710 + }, + { + "epoch": 2.5699992675602434, + "grad_norm": 18.48470687866211, + "learning_rate": 5.315765793777661e-07, + "loss": 3.0235, + "step": 87720 + }, + { + "epoch": 2.5702922434629754, + "grad_norm": 20.84996223449707, + "learning_rate": 5.308671966689044e-07, + "loss": 3.017, + "step": 87730 + }, + { + "epoch": 2.5705852193657073, + "grad_norm": 21.065353393554688, + "learning_rate": 5.301582610702666e-07, + "loss": 3.0127, + "step": 87740 + }, + { + "epoch": 2.570878195268439, + "grad_norm": 18.29056167602539, + "learning_rate": 5.294497726527754e-07, + "loss": 3.0109, + "step": 87750 + }, + { + "epoch": 2.571171171171171, + "grad_norm": 18.93345069885254, + "learning_rate": 5.287417314873122e-07, + "loss": 3.0199, + "step": 87760 + }, + { + "epoch": 2.571464147073903, + "grad_norm": 19.13198471069336, + "learning_rate": 
5.280341376447107e-07, + "loss": 3.0097, + "step": 87770 + }, + { + "epoch": 2.571757122976635, + "grad_norm": 14.681391716003418, + "learning_rate": 5.273269911957623e-07, + "loss": 2.9767, + "step": 87780 + }, + { + "epoch": 2.5720500988793673, + "grad_norm": 16.906078338623047, + "learning_rate": 5.266202922112129e-07, + "loss": 2.9996, + "step": 87790 + }, + { + "epoch": 2.5723430747820992, + "grad_norm": 19.880828857421875, + "learning_rate": 5.259140407617652e-07, + "loss": 2.9843, + "step": 87800 + }, + { + "epoch": 2.572636050684831, + "grad_norm": 17.420848846435547, + "learning_rate": 5.252082369180722e-07, + "loss": 3.0167, + "step": 87810 + }, + { + "epoch": 2.572929026587563, + "grad_norm": 18.005096435546875, + "learning_rate": 5.245028807507485e-07, + "loss": 3.0252, + "step": 87820 + }, + { + "epoch": 2.5732220024902954, + "grad_norm": 18.975370407104492, + "learning_rate": 5.237979723303582e-07, + "loss": 2.9975, + "step": 87830 + }, + { + "epoch": 2.5735149783930273, + "grad_norm": 18.802949905395508, + "learning_rate": 5.230935117274255e-07, + "loss": 3.0111, + "step": 87840 + }, + { + "epoch": 2.5738079542957593, + "grad_norm": 16.990772247314453, + "learning_rate": 5.223894990124251e-07, + "loss": 2.9931, + "step": 87850 + }, + { + "epoch": 2.574100930198491, + "grad_norm": 18.111482620239258, + "learning_rate": 5.216859342557912e-07, + "loss": 2.9815, + "step": 87860 + }, + { + "epoch": 2.574393906101223, + "grad_norm": 18.711151123046875, + "learning_rate": 5.209828175279097e-07, + "loss": 3.0063, + "step": 87870 + }, + { + "epoch": 2.574686882003955, + "grad_norm": 17.77798843383789, + "learning_rate": 5.20280148899126e-07, + "loss": 2.9827, + "step": 87880 + }, + { + "epoch": 2.574979857906687, + "grad_norm": 24.24541664123535, + "learning_rate": 5.195779284397346e-07, + "loss": 3.0066, + "step": 87890 + }, + { + "epoch": 2.5752728338094193, + "grad_norm": 18.334712982177734, + "learning_rate": 5.188761562199912e-07, + "loss": 3.0132, + 
"step": 87900 + }, + { + "epoch": 2.575565809712151, + "grad_norm": 17.725173950195312, + "learning_rate": 5.181748323101021e-07, + "loss": 3.0072, + "step": 87910 + }, + { + "epoch": 2.575858785614883, + "grad_norm": 18.492582321166992, + "learning_rate": 5.174739567802317e-07, + "loss": 3.0134, + "step": 87920 + }, + { + "epoch": 2.576151761517615, + "grad_norm": 17.30645179748535, + "learning_rate": 5.167735297004978e-07, + "loss": 3.0129, + "step": 87930 + }, + { + "epoch": 2.5764447374203474, + "grad_norm": 21.606830596923828, + "learning_rate": 5.160735511409737e-07, + "loss": 2.9827, + "step": 87940 + }, + { + "epoch": 2.5767377133230793, + "grad_norm": 21.667526245117188, + "learning_rate": 5.153740211716885e-07, + "loss": 3.0017, + "step": 87950 + }, + { + "epoch": 2.5770306892258112, + "grad_norm": 20.391132354736328, + "learning_rate": 5.146749398626272e-07, + "loss": 3.0066, + "step": 87960 + }, + { + "epoch": 2.577323665128543, + "grad_norm": 17.661569595336914, + "learning_rate": 5.139763072837267e-07, + "loss": 3.0161, + "step": 87970 + }, + { + "epoch": 2.577616641031275, + "grad_norm": 19.150829315185547, + "learning_rate": 5.132781235048829e-07, + "loss": 2.9907, + "step": 87980 + }, + { + "epoch": 2.577909616934007, + "grad_norm": 15.117281913757324, + "learning_rate": 5.125803885959429e-07, + "loss": 2.9929, + "step": 87990 + }, + { + "epoch": 2.578202592836739, + "grad_norm": 19.391925811767578, + "learning_rate": 5.118831026267129e-07, + "loss": 3.0239, + "step": 88000 + }, + { + "epoch": 2.5784955687394713, + "grad_norm": 19.314727783203125, + "learning_rate": 5.111862656669508e-07, + "loss": 3.0014, + "step": 88010 + }, + { + "epoch": 2.578788544642203, + "grad_norm": 19.341964721679688, + "learning_rate": 5.104898777863715e-07, + "loss": 2.9942, + "step": 88020 + }, + { + "epoch": 2.579081520544935, + "grad_norm": 17.665691375732422, + "learning_rate": 5.098635127141349e-07, + "loss": 3.0007, + "step": 88030 + }, + { + "epoch": 
2.579374496447667, + "grad_norm": 17.629079818725586, + "learning_rate": 5.091679782759057e-07, + "loss": 3.0125, + "step": 88040 + }, + { + "epoch": 2.5796674723503994, + "grad_norm": 18.9583740234375, + "learning_rate": 5.084728931187776e-07, + "loss": 2.9991, + "step": 88050 + }, + { + "epoch": 2.5799604482531313, + "grad_norm": 19.346466064453125, + "learning_rate": 5.077782573122886e-07, + "loss": 2.9847, + "step": 88060 + }, + { + "epoch": 2.580077638614224, + "eval_bleu": 0.35355181768959104, + "eval_cap_loss": 0.9001879692077637, + "eval_con_loss": 1.1277165412902832, + "eval_loss": 3.15562105178833, + "step": 88064 + }, + { + "epoch": 2.580077638614224, + "eval_bleu": 0.35355181768959104, + "eval_cap_loss": 0.9001879692077637, + "eval_con_loss": 1.1277165412902832, + "eval_loss": 3.15562105178833, + "eval_runtime": 53.5085, + "eval_samples_per_second": 373.772, + "eval_steps_per_second": 0.374, + "step": 88064 + }, + { + "epoch": 2.5802534241558632, + "grad_norm": 20.05380630493164, + "learning_rate": 5.070840709259333e-07, + "loss": 2.9991, + "step": 88070 + }, + { + "epoch": 2.580546400058595, + "grad_norm": 20.370311737060547, + "learning_rate": 5.063903340291609e-07, + "loss": 2.9896, + "step": 88080 + }, + { + "epoch": 2.580839375961327, + "grad_norm": 19.947721481323242, + "learning_rate": 5.056970466913775e-07, + "loss": 2.9865, + "step": 88090 + }, + { + "epoch": 2.581132351864059, + "grad_norm": 16.295616149902344, + "learning_rate": 5.05004208981939e-07, + "loss": 3.0093, + "step": 88100 + }, + { + "epoch": 2.5814253277667913, + "grad_norm": 19.472667694091797, + "learning_rate": 5.04311820970163e-07, + "loss": 2.9967, + "step": 88110 + }, + { + "epoch": 2.5817183036695233, + "grad_norm": 21.294475555419922, + "learning_rate": 5.036198827253164e-07, + "loss": 3.0289, + "step": 88120 + }, + { + "epoch": 2.582011279572255, + "grad_norm": 14.85816764831543, + "learning_rate": 5.029283943166252e-07, + "loss": 2.9996, + "step": 88130 + }, + { + 
"epoch": 2.582304255474987, + "grad_norm": 22.680639266967773, + "learning_rate": 5.022373558132671e-07, + "loss": 3.0112, + "step": 88140 + }, + { + "epoch": 2.5825972313777195, + "grad_norm": 15.32004451751709, + "learning_rate": 5.015467672843771e-07, + "loss": 3.0, + "step": 88150 + }, + { + "epoch": 2.5828902072804514, + "grad_norm": 18.000776290893555, + "learning_rate": 5.008566287990447e-07, + "loss": 3.0171, + "step": 88160 + }, + { + "epoch": 2.5831831831831833, + "grad_norm": 15.838814735412598, + "learning_rate": 5.001669404263149e-07, + "loss": 3.0031, + "step": 88170 + }, + { + "epoch": 2.583476159085915, + "grad_norm": 19.573862075805664, + "learning_rate": 4.994777022351854e-07, + "loss": 3.0039, + "step": 88180 + }, + { + "epoch": 2.583769134988647, + "grad_norm": 21.097631454467773, + "learning_rate": 4.987889142946117e-07, + "loss": 2.9987, + "step": 88190 + }, + { + "epoch": 2.584062110891379, + "grad_norm": 17.09856605529785, + "learning_rate": 4.98100576673502e-07, + "loss": 3.0276, + "step": 88200 + }, + { + "epoch": 2.584355086794111, + "grad_norm": 15.501601219177246, + "learning_rate": 4.974126894407211e-07, + "loss": 3.0057, + "step": 88210 + }, + { + "epoch": 2.5846480626968433, + "grad_norm": 16.74028205871582, + "learning_rate": 4.967252526650873e-07, + "loss": 2.9913, + "step": 88220 + }, + { + "epoch": 2.5849410385995752, + "grad_norm": 20.93977928161621, + "learning_rate": 4.960382664153746e-07, + "loss": 2.9952, + "step": 88230 + }, + { + "epoch": 2.585234014502307, + "grad_norm": 19.49881935119629, + "learning_rate": 4.953517307603123e-07, + "loss": 3.029, + "step": 88240 + }, + { + "epoch": 2.585526990405039, + "grad_norm": 19.5816593170166, + "learning_rate": 4.94665645768585e-07, + "loss": 3.0047, + "step": 88250 + }, + { + "epoch": 2.5858199663077714, + "grad_norm": 21.65655517578125, + "learning_rate": 4.939800115088295e-07, + "loss": 3.0197, + "step": 88260 + }, + { + "epoch": 2.5861129422105034, + "grad_norm": 
18.251977920532227, + "learning_rate": 4.932948280496419e-07, + "loss": 3.008, + "step": 88270 + }, + { + "epoch": 2.5864059181132353, + "grad_norm": 18.067602157592773, + "learning_rate": 4.92610095459568e-07, + "loss": 2.9924, + "step": 88280 + }, + { + "epoch": 2.586698894015967, + "grad_norm": 21.621198654174805, + "learning_rate": 4.919258138071142e-07, + "loss": 3.0099, + "step": 88290 + }, + { + "epoch": 2.586991869918699, + "grad_norm": 22.503829956054688, + "learning_rate": 4.912419831607357e-07, + "loss": 2.9864, + "step": 88300 + }, + { + "epoch": 2.587284845821431, + "grad_norm": 18.839378356933594, + "learning_rate": 4.905586035888477e-07, + "loss": 2.9835, + "step": 88310 + }, + { + "epoch": 2.587577821724163, + "grad_norm": 15.169340133666992, + "learning_rate": 4.898756751598177e-07, + "loss": 2.9963, + "step": 88320 + }, + { + "epoch": 2.5878707976268953, + "grad_norm": 19.769756317138672, + "learning_rate": 4.891931979419695e-07, + "loss": 3.0193, + "step": 88330 + }, + { + "epoch": 2.5881637735296272, + "grad_norm": 17.38176918029785, + "learning_rate": 4.88511172003579e-07, + "loss": 3.0067, + "step": 88340 + }, + { + "epoch": 2.588456749432359, + "grad_norm": 18.407472610473633, + "learning_rate": 4.878295974128805e-07, + "loss": 2.9921, + "step": 88350 + }, + { + "epoch": 2.588749725335091, + "grad_norm": 23.072006225585938, + "learning_rate": 4.871484742380616e-07, + "loss": 3.0136, + "step": 88360 + }, + { + "epoch": 2.5890427012378234, + "grad_norm": 19.28253173828125, + "learning_rate": 4.864678025472635e-07, + "loss": 3.0072, + "step": 88370 + }, + { + "epoch": 2.5893356771405553, + "grad_norm": 16.872455596923828, + "learning_rate": 4.857875824085845e-07, + "loss": 2.9937, + "step": 88380 + }, + { + "epoch": 2.5896286530432873, + "grad_norm": 20.856908798217773, + "learning_rate": 4.85107813890075e-07, + "loss": 2.9985, + "step": 88390 + }, + { + "epoch": 2.589921628946019, + "grad_norm": 23.2618350982666, + "learning_rate": 
4.844284970597429e-07, + "loss": 3.0161, + "step": 88400 + }, + { + "epoch": 2.590214604848751, + "grad_norm": 18.177597045898438, + "learning_rate": 4.837496319855495e-07, + "loss": 3.0081, + "step": 88410 + }, + { + "epoch": 2.590507580751483, + "grad_norm": 17.14712905883789, + "learning_rate": 4.830712187354125e-07, + "loss": 3.0236, + "step": 88420 + }, + { + "epoch": 2.590800556654215, + "grad_norm": 18.780942916870117, + "learning_rate": 4.823932573772011e-07, + "loss": 3.0195, + "step": 88430 + }, + { + "epoch": 2.5910935325569473, + "grad_norm": 19.085542678833008, + "learning_rate": 4.817157479787432e-07, + "loss": 2.9983, + "step": 88440 + }, + { + "epoch": 2.591386508459679, + "grad_norm": 16.362903594970703, + "learning_rate": 4.810386906078179e-07, + "loss": 2.9927, + "step": 88450 + }, + { + "epoch": 2.591679484362411, + "grad_norm": 18.28078842163086, + "learning_rate": 4.803620853321623e-07, + "loss": 2.989, + "step": 88460 + }, + { + "epoch": 2.591972460265143, + "grad_norm": 21.253620147705078, + "learning_rate": 4.796859322194647e-07, + "loss": 2.982, + "step": 88470 + }, + { + "epoch": 2.5922654361678754, + "grad_norm": 17.66602897644043, + "learning_rate": 4.790102313373718e-07, + "loss": 2.9963, + "step": 88480 + }, + { + "epoch": 2.5925584120706073, + "grad_norm": 18.646560668945312, + "learning_rate": 4.783349827534833e-07, + "loss": 2.9931, + "step": 88490 + }, + { + "epoch": 2.5928513879733392, + "grad_norm": 18.995494842529297, + "learning_rate": 4.776601865353541e-07, + "loss": 2.9922, + "step": 88500 + }, + { + "epoch": 2.593144363876071, + "grad_norm": 20.540477752685547, + "learning_rate": 4.769858427504925e-07, + "loss": 3.007, + "step": 88510 + }, + { + "epoch": 2.593437339778803, + "grad_norm": 15.495771408081055, + "learning_rate": 4.763119514663644e-07, + "loss": 3.0008, + "step": 88520 + }, + { + "epoch": 2.593730315681535, + "grad_norm": 22.693519592285156, + "learning_rate": 4.7563851275038595e-07, + "loss": 3.0001, + "step": 
88530 + }, + { + "epoch": 2.594023291584267, + "grad_norm": 17.961742401123047, + "learning_rate": 4.749655266699338e-07, + "loss": 2.9903, + "step": 88540 + }, + { + "epoch": 2.5943162674869993, + "grad_norm": 21.635818481445312, + "learning_rate": 4.7429299329233293e-07, + "loss": 3.0178, + "step": 88550 + }, + { + "epoch": 2.594609243389731, + "grad_norm": 18.326007843017578, + "learning_rate": 4.7362091268486844e-07, + "loss": 2.9824, + "step": 88560 + }, + { + "epoch": 2.594902219292463, + "grad_norm": 19.4867000579834, + "learning_rate": 4.729492849147771e-07, + "loss": 2.9846, + "step": 88570 + }, + { + "epoch": 2.5950780048341024, + "eval_bleu": 0.3536014593817595, + "eval_cap_loss": 0.9001264572143555, + "eval_con_loss": 1.1281452178955078, + "eval_loss": 3.156416654586792, + "step": 88576 + }, + { + "epoch": 2.5950780048341024, + "eval_bleu": 0.3536014593817595, + "eval_cap_loss": 0.9001264572143555, + "eval_con_loss": 1.1281452178955078, + "eval_loss": 3.156416654586792, + "eval_runtime": 55.3138, + "eval_samples_per_second": 361.573, + "eval_steps_per_second": 0.362, + "step": 88576 + }, + { + "epoch": 2.595195195195195, + "grad_norm": 19.127716064453125, + "learning_rate": 4.7227811004925297e-07, + "loss": 3.0022, + "step": 88580 + }, + { + "epoch": 2.5954881710979274, + "grad_norm": 15.662275314331055, + "learning_rate": 4.7160738815544116e-07, + "loss": 3.0109, + "step": 88590 + }, + { + "epoch": 2.5957811470006593, + "grad_norm": 15.703532218933105, + "learning_rate": 4.7093711930044474e-07, + "loss": 3.0081, + "step": 88600 + }, + { + "epoch": 2.5960741229033912, + "grad_norm": 15.097633361816406, + "learning_rate": 4.7026730355131887e-07, + "loss": 3.0063, + "step": 88610 + }, + { + "epoch": 2.596367098806123, + "grad_norm": 18.712020874023438, + "learning_rate": 4.695979409750762e-07, + "loss": 2.9926, + "step": 88620 + }, + { + "epoch": 2.596660074708855, + "grad_norm": 22.7526798248291, + "learning_rate": 4.689290316386802e-07, + "loss": 
3.0045, + "step": 88630 + }, + { + "epoch": 2.596953050611587, + "grad_norm": 22.42127799987793, + "learning_rate": 4.68260575609053e-07, + "loss": 3.027, + "step": 88640 + }, + { + "epoch": 2.5972460265143194, + "grad_norm": 17.24005889892578, + "learning_rate": 4.675925729530695e-07, + "loss": 3.0151, + "step": 88650 + }, + { + "epoch": 2.5975390024170513, + "grad_norm": 17.49968910217285, + "learning_rate": 4.6692502373755946e-07, + "loss": 3.0136, + "step": 88660 + }, + { + "epoch": 2.597831978319783, + "grad_norm": 17.13347625732422, + "learning_rate": 4.662579280293067e-07, + "loss": 3.0253, + "step": 88670 + }, + { + "epoch": 2.598124954222515, + "grad_norm": 17.476896286010742, + "learning_rate": 4.655912858950512e-07, + "loss": 2.9946, + "step": 88680 + }, + { + "epoch": 2.5984179301252475, + "grad_norm": 14.961960792541504, + "learning_rate": 4.6492509740148517e-07, + "loss": 3.0089, + "step": 88690 + }, + { + "epoch": 2.5987109060279794, + "grad_norm": 17.385393142700195, + "learning_rate": 4.6425936261525807e-07, + "loss": 3.0096, + "step": 88700 + }, + { + "epoch": 2.5990038819307113, + "grad_norm": 19.57839012145996, + "learning_rate": 4.63594081602971e-07, + "loss": 3.0241, + "step": 88710 + }, + { + "epoch": 2.599296857833443, + "grad_norm": 21.229772567749023, + "learning_rate": 4.629292544311831e-07, + "loss": 3.0103, + "step": 88720 + }, + { + "epoch": 2.599589833736175, + "grad_norm": 18.258960723876953, + "learning_rate": 4.622648811664049e-07, + "loss": 2.9993, + "step": 88730 + }, + { + "epoch": 2.599882809638907, + "grad_norm": 15.962053298950195, + "learning_rate": 4.616009618751044e-07, + "loss": 3.0164, + "step": 88740 + }, + { + "epoch": 2.600175785541639, + "grad_norm": 17.185178756713867, + "learning_rate": 4.6093749662370347e-07, + "loss": 3.0215, + "step": 88750 + }, + { + "epoch": 2.6004687614443713, + "grad_norm": 18.400117874145508, + "learning_rate": 4.602744854785757e-07, + "loss": 2.989, + "step": 88760 + }, + { + "epoch": 
2.6007617373471033, + "grad_norm": 19.198251724243164, + "learning_rate": 4.59611928506053e-07, + "loss": 3.0191, + "step": 88770 + }, + { + "epoch": 2.601054713249835, + "grad_norm": 24.12956428527832, + "learning_rate": 4.589498257724184e-07, + "loss": 2.9984, + "step": 88780 + }, + { + "epoch": 2.601347689152567, + "grad_norm": 16.220457077026367, + "learning_rate": 4.582881773439141e-07, + "loss": 3.0205, + "step": 88790 + }, + { + "epoch": 2.6016406650552995, + "grad_norm": 19.99817657470703, + "learning_rate": 4.5762698328673136e-07, + "loss": 3.0099, + "step": 88800 + }, + { + "epoch": 2.6019336409580314, + "grad_norm": 16.88727378845215, + "learning_rate": 4.5696624366702026e-07, + "loss": 3.0148, + "step": 88810 + }, + { + "epoch": 2.6022266168607633, + "grad_norm": 19.323501586914062, + "learning_rate": 4.563059585508833e-07, + "loss": 3.0126, + "step": 88820 + }, + { + "epoch": 2.602519592763495, + "grad_norm": 19.28718376159668, + "learning_rate": 4.5564612800437944e-07, + "loss": 3.0163, + "step": 88830 + }, + { + "epoch": 2.602812568666227, + "grad_norm": 15.417278289794922, + "learning_rate": 4.5498675209351863e-07, + "loss": 2.9918, + "step": 88840 + }, + { + "epoch": 2.603105544568959, + "grad_norm": 15.56338119506836, + "learning_rate": 4.5432783088426966e-07, + "loss": 3.0237, + "step": 88850 + }, + { + "epoch": 2.603398520471691, + "grad_norm": 14.64168930053711, + "learning_rate": 4.5366936444255214e-07, + "loss": 3.0038, + "step": 88860 + }, + { + "epoch": 2.6036914963744233, + "grad_norm": 21.051151275634766, + "learning_rate": 4.530113528342428e-07, + "loss": 2.9992, + "step": 88870 + }, + { + "epoch": 2.6039844722771552, + "grad_norm": 21.61078643798828, + "learning_rate": 4.5235379612517003e-07, + "loss": 3.0114, + "step": 88880 + }, + { + "epoch": 2.604277448179887, + "grad_norm": 19.008718490600586, + "learning_rate": 4.5169669438112017e-07, + "loss": 2.9937, + "step": 88890 + }, + { + "epoch": 2.604570424082619, + "grad_norm": 
19.9532527923584, + "learning_rate": 4.510400476678317e-07, + "loss": 3.0134, + "step": 88900 + }, + { + "epoch": 2.6048633999853514, + "grad_norm": 22.636804580688477, + "learning_rate": 4.503838560509993e-07, + "loss": 3.0108, + "step": 88910 + }, + { + "epoch": 2.6051563758880834, + "grad_norm": 18.834857940673828, + "learning_rate": 4.4972811959626937e-07, + "loss": 3.0148, + "step": 88920 + }, + { + "epoch": 2.6054493517908153, + "grad_norm": 19.245630264282227, + "learning_rate": 4.490728383692461e-07, + "loss": 3.0101, + "step": 88930 + }, + { + "epoch": 2.605742327693547, + "grad_norm": 20.70455551147461, + "learning_rate": 4.4841801243548476e-07, + "loss": 3.002, + "step": 88940 + }, + { + "epoch": 2.606035303596279, + "grad_norm": 23.81230354309082, + "learning_rate": 4.477636418604986e-07, + "loss": 2.997, + "step": 88950 + }, + { + "epoch": 2.606328279499011, + "grad_norm": 20.132402420043945, + "learning_rate": 4.471097267097513e-07, + "loss": 2.9897, + "step": 88960 + }, + { + "epoch": 2.606621255401743, + "grad_norm": 17.699588775634766, + "learning_rate": 4.4645626704866495e-07, + "loss": 3.0151, + "step": 88970 + }, + { + "epoch": 2.6069142313044753, + "grad_norm": 19.945659637451172, + "learning_rate": 4.458032629426134e-07, + "loss": 3.0083, + "step": 88980 + }, + { + "epoch": 2.6072072072072072, + "grad_norm": 20.40546989440918, + "learning_rate": 4.4515071445692725e-07, + "loss": 3.004, + "step": 88990 + }, + { + "epoch": 2.607500183109939, + "grad_norm": 18.15569496154785, + "learning_rate": 4.444986216568886e-07, + "loss": 3.0144, + "step": 89000 + }, + { + "epoch": 2.607793159012671, + "grad_norm": 15.399401664733887, + "learning_rate": 4.4384698460773646e-07, + "loss": 3.0052, + "step": 89010 + }, + { + "epoch": 2.6080861349154034, + "grad_norm": 19.40213966369629, + "learning_rate": 4.4319580337466196e-07, + "loss": 3.0151, + "step": 89020 + }, + { + "epoch": 2.6083791108181353, + "grad_norm": 20.92839813232422, + "learning_rate": 
4.425450780228141e-07, + "loss": 3.0072, + "step": 89030 + }, + { + "epoch": 2.6086720867208673, + "grad_norm": 19.45376968383789, + "learning_rate": 4.4189480861729137e-07, + "loss": 3.0119, + "step": 89040 + }, + { + "epoch": 2.608965062623599, + "grad_norm": 18.254098892211914, + "learning_rate": 4.412449952231507e-07, + "loss": 3.0015, + "step": 89050 + }, + { + "epoch": 2.609258038526331, + "grad_norm": 18.731426239013672, + "learning_rate": 4.4059563790540216e-07, + "loss": 3.0083, + "step": 89060 + }, + { + "epoch": 2.609551014429063, + "grad_norm": 18.054662704467773, + "learning_rate": 4.3994673672901e-07, + "loss": 3.0162, + "step": 89070 + }, + { + "epoch": 2.6098439903317954, + "grad_norm": 19.62407684326172, + "learning_rate": 4.3929829175889395e-07, + "loss": 2.9979, + "step": 89080 + }, + { + "epoch": 2.610078371053981, + "eval_bleu": 0.35363302799540003, + "eval_cap_loss": 0.8999084234237671, + "eval_con_loss": 1.127392053604126, + "eval_loss": 3.1546926498413086, + "step": 89088 + }, + { + "epoch": 2.610078371053981, + "eval_bleu": 0.35363302799540003, + "eval_cap_loss": 0.8999084234237671, + "eval_con_loss": 1.127392053604126, + "eval_loss": 3.1546926498413086, + "eval_runtime": 56.3375, + "eval_samples_per_second": 355.003, + "eval_steps_per_second": 0.355, + "step": 89088 + }, + { + "epoch": 2.6101369662345273, + "grad_norm": 18.0157470703125, + "learning_rate": 4.3865030305992537e-07, + "loss": 3.0253, + "step": 89090 + }, + { + "epoch": 2.610429942137259, + "grad_norm": 18.90717124938965, + "learning_rate": 4.3800277069693307e-07, + "loss": 3.0291, + "step": 89100 + }, + { + "epoch": 2.610722918039991, + "grad_norm": 20.57257843017578, + "learning_rate": 4.373556947346974e-07, + "loss": 3.0051, + "step": 89110 + }, + { + "epoch": 2.6110158939427235, + "grad_norm": 16.95785903930664, + "learning_rate": 4.3670907523795604e-07, + "loss": 3.0241, + "step": 89120 + }, + { + "epoch": 2.6113088698454554, + "grad_norm": 14.488547325134277, + 
"learning_rate": 4.3606291227139775e-07, + "loss": 3.0166, + "step": 89130 + }, + { + "epoch": 2.6116018457481873, + "grad_norm": 16.305810928344727, + "learning_rate": 4.354172058996681e-07, + "loss": 3.0131, + "step": 89140 + }, + { + "epoch": 2.6118948216509192, + "grad_norm": 14.316901206970215, + "learning_rate": 4.3477195618736646e-07, + "loss": 2.9981, + "step": 89150 + }, + { + "epoch": 2.612187797553651, + "grad_norm": 20.262439727783203, + "learning_rate": 4.3412716319904625e-07, + "loss": 2.9929, + "step": 89160 + }, + { + "epoch": 2.612480773456383, + "grad_norm": 19.8267822265625, + "learning_rate": 4.334828269992147e-07, + "loss": 3.0231, + "step": 89170 + }, + { + "epoch": 2.612773749359115, + "grad_norm": 21.01027488708496, + "learning_rate": 4.328389476523348e-07, + "loss": 3.0058, + "step": 89180 + }, + { + "epoch": 2.6130667252618474, + "grad_norm": 21.911853790283203, + "learning_rate": 4.321955252228216e-07, + "loss": 3.0184, + "step": 89190 + }, + { + "epoch": 2.6133597011645793, + "grad_norm": 19.59243392944336, + "learning_rate": 4.3155255977504694e-07, + "loss": 3.0061, + "step": 89200 + }, + { + "epoch": 2.613652677067311, + "grad_norm": 17.52650260925293, + "learning_rate": 4.3091005137333386e-07, + "loss": 2.993, + "step": 89210 + }, + { + "epoch": 2.613945652970043, + "grad_norm": 15.501494407653809, + "learning_rate": 4.3026800008196256e-07, + "loss": 3.0077, + "step": 89220 + }, + { + "epoch": 2.6142386288727755, + "grad_norm": 15.604806900024414, + "learning_rate": 4.2962640596516725e-07, + "loss": 3.0171, + "step": 89230 + }, + { + "epoch": 2.6145316047755074, + "grad_norm": 19.46306800842285, + "learning_rate": 4.289852690871349e-07, + "loss": 3.0176, + "step": 89240 + }, + { + "epoch": 2.6148245806782393, + "grad_norm": 15.463471412658691, + "learning_rate": 4.2834458951200755e-07, + "loss": 3.0037, + "step": 89250 + }, + { + "epoch": 2.6151175565809712, + "grad_norm": 18.718223571777344, + "learning_rate": 4.2770436730388166e-07, 
+ "loss": 3.0114, + "step": 89260 + }, + { + "epoch": 2.615410532483703, + "grad_norm": 17.51587677001953, + "learning_rate": 4.270646025268066e-07, + "loss": 3.0031, + "step": 89270 + }, + { + "epoch": 2.615703508386435, + "grad_norm": 21.32733726501465, + "learning_rate": 4.2642529524478836e-07, + "loss": 3.0013, + "step": 89280 + }, + { + "epoch": 2.615996484289167, + "grad_norm": 19.973400115966797, + "learning_rate": 4.257864455217842e-07, + "loss": 2.992, + "step": 89290 + }, + { + "epoch": 2.6162894601918993, + "grad_norm": 17.655868530273438, + "learning_rate": 4.2514805342170895e-07, + "loss": 3.031, + "step": 89300 + }, + { + "epoch": 2.6165824360946313, + "grad_norm": 19.01152801513672, + "learning_rate": 4.2451011900842887e-07, + "loss": 3.0089, + "step": 89310 + }, + { + "epoch": 2.616875411997363, + "grad_norm": 21.058618545532227, + "learning_rate": 4.2387264234576674e-07, + "loss": 2.9968, + "step": 89320 + }, + { + "epoch": 2.617168387900095, + "grad_norm": 19.28712272644043, + "learning_rate": 4.2323562349749715e-07, + "loss": 3.0093, + "step": 89330 + }, + { + "epoch": 2.6174613638028275, + "grad_norm": 17.519683837890625, + "learning_rate": 4.225990625273507e-07, + "loss": 2.9985, + "step": 89340 + }, + { + "epoch": 2.6177543397055594, + "grad_norm": 16.550952911376953, + "learning_rate": 4.2196295949901044e-07, + "loss": 2.9999, + "step": 89350 + }, + { + "epoch": 2.6180473156082913, + "grad_norm": 17.656967163085938, + "learning_rate": 4.213273144761165e-07, + "loss": 3.0047, + "step": 89360 + }, + { + "epoch": 2.618340291511023, + "grad_norm": 20.781082153320312, + "learning_rate": 4.2069212752225974e-07, + "loss": 2.9974, + "step": 89370 + }, + { + "epoch": 2.618633267413755, + "grad_norm": 19.571353912353516, + "learning_rate": 4.200573987009876e-07, + "loss": 3.0057, + "step": 89380 + }, + { + "epoch": 2.618926243316487, + "grad_norm": 17.438705444335938, + "learning_rate": 4.1942312807580045e-07, + "loss": 2.9817, + "step": 89390 + }, + { 
+ "epoch": 2.619219219219219, + "grad_norm": 15.265673637390137, + "learning_rate": 4.187893157101547e-07, + "loss": 3.0124, + "step": 89400 + }, + { + "epoch": 2.6195121951219513, + "grad_norm": 19.91465950012207, + "learning_rate": 4.18155961667458e-07, + "loss": 3.0035, + "step": 89410 + }, + { + "epoch": 2.6198051710246832, + "grad_norm": 21.803226470947266, + "learning_rate": 4.1752306601107464e-07, + "loss": 3.01, + "step": 89420 + }, + { + "epoch": 2.620098146927415, + "grad_norm": 19.18092155456543, + "learning_rate": 4.1689062880432065e-07, + "loss": 3.0079, + "step": 89430 + }, + { + "epoch": 2.620391122830147, + "grad_norm": 19.425355911254883, + "learning_rate": 4.1625865011046874e-07, + "loss": 2.9961, + "step": 89440 + }, + { + "epoch": 2.6206840987328794, + "grad_norm": 21.19694709777832, + "learning_rate": 4.156271299927456e-07, + "loss": 2.9968, + "step": 89450 + }, + { + "epoch": 2.6209770746356114, + "grad_norm": 17.126985549926758, + "learning_rate": 4.1499606851432897e-07, + "loss": 3.0111, + "step": 89460 + }, + { + "epoch": 2.6212700505383433, + "grad_norm": 20.781476974487305, + "learning_rate": 4.1436546573835336e-07, + "loss": 3.0045, + "step": 89470 + }, + { + "epoch": 2.621563026441075, + "grad_norm": 22.702621459960938, + "learning_rate": 4.1373532172790773e-07, + "loss": 3.0256, + "step": 89480 + }, + { + "epoch": 2.621856002343807, + "grad_norm": 18.822816848754883, + "learning_rate": 4.131056365460345e-07, + "loss": 3.0034, + "step": 89490 + }, + { + "epoch": 2.622148978246539, + "grad_norm": 16.36473274230957, + "learning_rate": 4.1247641025572825e-07, + "loss": 3.0059, + "step": 89500 + }, + { + "epoch": 2.622441954149271, + "grad_norm": 21.27811050415039, + "learning_rate": 4.118476429199414e-07, + "loss": 3.0132, + "step": 89510 + }, + { + "epoch": 2.6227349300520033, + "grad_norm": 19.565004348754883, + "learning_rate": 4.1121933460157637e-07, + "loss": 3.0155, + "step": 89520 + }, + { + "epoch": 2.6230279059547352, + 
"grad_norm": 19.76236915588379, + "learning_rate": 4.1059148536349346e-07, + "loss": 2.9822, + "step": 89530 + }, + { + "epoch": 2.623320881857467, + "grad_norm": 17.794300079345703, + "learning_rate": 4.0996409526850243e-07, + "loss": 2.9923, + "step": 89540 + }, + { + "epoch": 2.623613857760199, + "grad_norm": 16.577716827392578, + "learning_rate": 4.093371643793742e-07, + "loss": 2.9943, + "step": 89550 + }, + { + "epoch": 2.6239068336629314, + "grad_norm": 18.02314567565918, + "learning_rate": 4.087106927588258e-07, + "loss": 2.9914, + "step": 89560 + }, + { + "epoch": 2.6241998095656633, + "grad_norm": 22.486570358276367, + "learning_rate": 4.080846804695343e-07, + "loss": 3.0074, + "step": 89570 + }, + { + "epoch": 2.6244927854683953, + "grad_norm": 22.448156356811523, + "learning_rate": 4.0745912757412744e-07, + "loss": 2.9917, + "step": 89580 + }, + { + "epoch": 2.624785761371127, + "grad_norm": 14.671769142150879, + "learning_rate": 4.068340341351884e-07, + "loss": 3.0005, + "step": 89590 + }, + { + "epoch": 2.625078737273859, + "grad_norm": 18.204753875732422, + "learning_rate": 4.0620940021525334e-07, + "loss": 3.0124, + "step": 89600 + }, + { + "epoch": 2.625078737273859, + "eval_bleu": 0.3536267153836456, + "eval_cap_loss": 0.8998857140541077, + "eval_con_loss": 1.1269196271896362, + "eval_loss": 3.1537251472473145, + "step": 89600 + }, + { + "epoch": 2.625078737273859, + "eval_bleu": 0.3536267153836456, + "eval_cap_loss": 0.8998857140541077, + "eval_con_loss": 1.1269196271896362, + "eval_loss": 3.1537251472473145, + "eval_runtime": 54.103, + "eval_samples_per_second": 369.665, + "eval_steps_per_second": 0.37, + "step": 89600 + }, + { + "epoch": 2.625371713176591, + "grad_norm": 21.01189613342285, + "learning_rate": 4.055852258768145e-07, + "loss": 3.0028, + "step": 89610 + }, + { + "epoch": 2.6256646890793234, + "grad_norm": 16.937068939208984, + "learning_rate": 4.0496151118231407e-07, + "loss": 2.9995, + "step": 89620 + }, + { + "epoch": 
2.6259576649820553, + "grad_norm": 15.165190696716309, + "learning_rate": 4.043382561941556e-07, + "loss": 2.9907, + "step": 89630 + }, + { + "epoch": 2.626250640884787, + "grad_norm": 18.24514389038086, + "learning_rate": 4.037154609746879e-07, + "loss": 3.0188, + "step": 89640 + }, + { + "epoch": 2.626543616787519, + "grad_norm": 19.964902877807617, + "learning_rate": 4.030931255862203e-07, + "loss": 3.0056, + "step": 89650 + }, + { + "epoch": 2.6268365926902515, + "grad_norm": 18.534738540649414, + "learning_rate": 4.0247125009101275e-07, + "loss": 2.992, + "step": 89660 + }, + { + "epoch": 2.6271295685929834, + "grad_norm": 19.20092010498047, + "learning_rate": 4.018498345512806e-07, + "loss": 2.9868, + "step": 89670 + }, + { + "epoch": 2.6274225444957153, + "grad_norm": 17.0986328125, + "learning_rate": 4.0122887902919237e-07, + "loss": 3.0049, + "step": 89680 + }, + { + "epoch": 2.6277155203984472, + "grad_norm": 15.976539611816406, + "learning_rate": 4.0060838358687127e-07, + "loss": 3.0131, + "step": 89690 + }, + { + "epoch": 2.628008496301179, + "grad_norm": 16.957500457763672, + "learning_rate": 3.9998834828639313e-07, + "loss": 2.9961, + "step": 89700 + }, + { + "epoch": 2.628301472203911, + "grad_norm": 16.24892234802246, + "learning_rate": 3.993687731897911e-07, + "loss": 3.0112, + "step": 89710 + }, + { + "epoch": 2.628594448106643, + "grad_norm": 20.726173400878906, + "learning_rate": 3.9874965835904834e-07, + "loss": 3.0011, + "step": 89720 + }, + { + "epoch": 2.6288874240093754, + "grad_norm": 19.203439712524414, + "learning_rate": 3.9813100385610426e-07, + "loss": 3.0141, + "step": 89730 + }, + { + "epoch": 2.6291803999121073, + "grad_norm": 23.68788719177246, + "learning_rate": 3.97512809742851e-07, + "loss": 3.0096, + "step": 89740 + }, + { + "epoch": 2.629473375814839, + "grad_norm": 14.327543258666992, + "learning_rate": 3.9689507608113574e-07, + "loss": 3.0011, + "step": 89750 + }, + { + "epoch": 2.629766351717571, + "grad_norm": 
19.376659393310547, + "learning_rate": 3.962778029327585e-07, + "loss": 2.9964, + "step": 89760 + }, + { + "epoch": 2.6300593276203035, + "grad_norm": 19.017894744873047, + "learning_rate": 3.9566099035947493e-07, + "loss": 2.9968, + "step": 89770 + }, + { + "epoch": 2.6303523035230354, + "grad_norm": 17.099164962768555, + "learning_rate": 3.950446384229906e-07, + "loss": 2.9993, + "step": 89780 + }, + { + "epoch": 2.6306452794257673, + "grad_norm": 19.995208740234375, + "learning_rate": 3.9442874718497116e-07, + "loss": 3.0042, + "step": 89790 + }, + { + "epoch": 2.6309382553284992, + "grad_norm": 16.80486297607422, + "learning_rate": 3.938133167070324e-07, + "loss": 2.9932, + "step": 89800 + }, + { + "epoch": 2.631231231231231, + "grad_norm": 15.622331619262695, + "learning_rate": 3.9319834705074277e-07, + "loss": 3.0124, + "step": 89810 + }, + { + "epoch": 2.631524207133963, + "grad_norm": 19.665882110595703, + "learning_rate": 3.9258383827762816e-07, + "loss": 3.0052, + "step": 89820 + }, + { + "epoch": 2.631817183036695, + "grad_norm": 19.569265365600586, + "learning_rate": 3.9196979044916486e-07, + "loss": 3.0098, + "step": 89830 + }, + { + "epoch": 2.6321101589394273, + "grad_norm": 17.169551849365234, + "learning_rate": 3.9135620362678593e-07, + "loss": 3.0078, + "step": 89840 + }, + { + "epoch": 2.6324031348421593, + "grad_norm": 15.36681842803955, + "learning_rate": 3.9074307787187627e-07, + "loss": 3.0065, + "step": 89850 + }, + { + "epoch": 2.632696110744891, + "grad_norm": 19.348169326782227, + "learning_rate": 3.9013041324577617e-07, + "loss": 3.0089, + "step": 89860 + }, + { + "epoch": 2.632989086647623, + "grad_norm": 21.352333068847656, + "learning_rate": 3.895182098097783e-07, + "loss": 2.9884, + "step": 89870 + }, + { + "epoch": 2.6332820625503555, + "grad_norm": 17.691789627075195, + "learning_rate": 3.889064676251314e-07, + "loss": 2.9943, + "step": 89880 + }, + { + "epoch": 2.6335750384530874, + "grad_norm": 16.741893768310547, + 
"learning_rate": 3.8829518675303543e-07, + "loss": 3.0058, + "step": 89890 + }, + { + "epoch": 2.6338680143558193, + "grad_norm": 14.911194801330566, + "learning_rate": 3.8768436725464595e-07, + "loss": 2.9914, + "step": 89900 + }, + { + "epoch": 2.634160990258551, + "grad_norm": 19.09469223022461, + "learning_rate": 3.8707400919107075e-07, + "loss": 3.0022, + "step": 89910 + }, + { + "epoch": 2.634453966161283, + "grad_norm": 20.105079650878906, + "learning_rate": 3.864641126233748e-07, + "loss": 3.0077, + "step": 89920 + }, + { + "epoch": 2.634746942064015, + "grad_norm": 15.957499504089355, + "learning_rate": 3.858546776125716e-07, + "loss": 2.987, + "step": 89930 + }, + { + "epoch": 2.635039917966747, + "grad_norm": 17.489099502563477, + "learning_rate": 3.852457042196334e-07, + "loss": 3.0121, + "step": 89940 + }, + { + "epoch": 2.6353328938694793, + "grad_norm": 18.059206008911133, + "learning_rate": 3.8463719250548424e-07, + "loss": 3.007, + "step": 89950 + }, + { + "epoch": 2.6356258697722112, + "grad_norm": 17.725431442260742, + "learning_rate": 3.840291425310028e-07, + "loss": 3.0047, + "step": 89960 + }, + { + "epoch": 2.635918845674943, + "grad_norm": 15.418511390686035, + "learning_rate": 3.834215543570191e-07, + "loss": 3.0257, + "step": 89970 + }, + { + "epoch": 2.636211821577675, + "grad_norm": 19.353591918945312, + "learning_rate": 3.828144280443208e-07, + "loss": 2.9999, + "step": 89980 + }, + { + "epoch": 2.6365047974804074, + "grad_norm": 19.636371612548828, + "learning_rate": 3.8220776365364475e-07, + "loss": 2.9996, + "step": 89990 + }, + { + "epoch": 2.6367977733831394, + "grad_norm": 19.85502052307129, + "learning_rate": 3.816015612456869e-07, + "loss": 2.9912, + "step": 90000 + }, + { + "epoch": 2.6370907492858713, + "grad_norm": 17.741464614868164, + "learning_rate": 3.809958208810916e-07, + "loss": 3.0043, + "step": 90010 + }, + { + "epoch": 2.637383725188603, + "grad_norm": 17.60081672668457, + "learning_rate": 3.8039054262046126e-07, + 
"loss": 3.0132, + "step": 90020 + }, + { + "epoch": 2.637676701091335, + "grad_norm": 16.19495964050293, + "learning_rate": 3.79846187334833e-07, + "loss": 3.0034, + "step": 90030 + }, + { + "epoch": 2.637969676994067, + "grad_norm": 18.61940574645996, + "learning_rate": 3.7924178723852514e-07, + "loss": 3.0047, + "step": 90040 + }, + { + "epoch": 2.6382626528967994, + "grad_norm": 18.37839126586914, + "learning_rate": 3.7863784942166135e-07, + "loss": 2.9979, + "step": 90050 + }, + { + "epoch": 2.6385556287995313, + "grad_norm": 21.378908157348633, + "learning_rate": 3.780343739446635e-07, + "loss": 3.0134, + "step": 90060 + }, + { + "epoch": 2.6388486047022632, + "grad_norm": 19.96013069152832, + "learning_rate": 3.7743136086790534e-07, + "loss": 3.0009, + "step": 90070 + }, + { + "epoch": 2.639141580604995, + "grad_norm": 20.279150009155273, + "learning_rate": 3.7682881025171537e-07, + "loss": 3.0164, + "step": 90080 + }, + { + "epoch": 2.6394345565077275, + "grad_norm": 17.263656616210938, + "learning_rate": 3.7622672215637424e-07, + "loss": 3.0026, + "step": 90090 + }, + { + "epoch": 2.6397275324104594, + "grad_norm": 18.974510192871094, + "learning_rate": 3.756250966421182e-07, + "loss": 3.0112, + "step": 90100 + }, + { + "epoch": 2.6400205083131913, + "grad_norm": 19.83928108215332, + "learning_rate": 3.750239337691347e-07, + "loss": 3.0001, + "step": 90110 + }, + { + "epoch": 2.6400791034937376, + "eval_bleu": 0.35368965809709235, + "eval_cap_loss": 0.899956464767456, + "eval_con_loss": 1.1271082162857056, + "eval_loss": 3.154172897338867, + "step": 90112 + }, + { + "epoch": 2.6400791034937376, + "eval_bleu": 0.35368965809709235, + "eval_cap_loss": 0.899956464767456, + "eval_con_loss": 1.1271082162857056, + "eval_loss": 3.154172897338867, + "eval_runtime": 52.2629, + "eval_samples_per_second": 382.68, + "eval_steps_per_second": 0.383, + "step": 90112 + }, + { + "epoch": 2.6403134842159233, + "grad_norm": 15.964645385742188, + "learning_rate": 
3.7442323359756784e-07, + "loss": 2.9775, + "step": 90120 + }, + { + "epoch": 2.640606460118655, + "grad_norm": 18.689817428588867, + "learning_rate": 3.738229961875145e-07, + "loss": 3.0123, + "step": 90130 + }, + { + "epoch": 2.640899436021387, + "grad_norm": 17.530622482299805, + "learning_rate": 3.732232215990234e-07, + "loss": 2.9917, + "step": 90140 + }, + { + "epoch": 2.641192411924119, + "grad_norm": 19.80915641784668, + "learning_rate": 3.726239098920997e-07, + "loss": 3.0149, + "step": 90150 + }, + { + "epoch": 2.6414853878268514, + "grad_norm": 16.61175537109375, + "learning_rate": 3.720250611267001e-07, + "loss": 3.0167, + "step": 90160 + }, + { + "epoch": 2.6417783637295833, + "grad_norm": 16.934879302978516, + "learning_rate": 3.7142667536273714e-07, + "loss": 2.9904, + "step": 90170 + }, + { + "epoch": 2.642071339632315, + "grad_norm": 15.893780708312988, + "learning_rate": 3.70828752660074e-07, + "loss": 3.0326, + "step": 90180 + }, + { + "epoch": 2.642364315535047, + "grad_norm": 26.742753982543945, + "learning_rate": 3.702312930785312e-07, + "loss": 2.9985, + "step": 90190 + }, + { + "epoch": 2.6426572914377795, + "grad_norm": 14.690593719482422, + "learning_rate": 3.6963429667787933e-07, + "loss": 2.9961, + "step": 90200 + }, + { + "epoch": 2.6429502673405114, + "grad_norm": 12.799416542053223, + "learning_rate": 3.6903776351784605e-07, + "loss": 3.0346, + "step": 90210 + }, + { + "epoch": 2.6432432432432433, + "grad_norm": 18.79480743408203, + "learning_rate": 3.684416936581087e-07, + "loss": 2.9816, + "step": 90220 + }, + { + "epoch": 2.6435362191459753, + "grad_norm": 18.046911239624023, + "learning_rate": 3.678460871583023e-07, + "loss": 3.0124, + "step": 90230 + }, + { + "epoch": 2.643829195048707, + "grad_norm": 19.367687225341797, + "learning_rate": 3.672509440780125e-07, + "loss": 2.9951, + "step": 90240 + }, + { + "epoch": 2.644122170951439, + "grad_norm": 18.230064392089844, + "learning_rate": 3.666562644767824e-07, + "loss": 3.0048, + 
"step": 90250 + }, + { + "epoch": 2.644415146854171, + "grad_norm": 15.820456504821777, + "learning_rate": 3.6606204841410255e-07, + "loss": 2.9992, + "step": 90260 + }, + { + "epoch": 2.6447081227569034, + "grad_norm": 17.35654067993164, + "learning_rate": 3.654682959494238e-07, + "loss": 2.9885, + "step": 90270 + }, + { + "epoch": 2.6450010986596353, + "grad_norm": 19.46766471862793, + "learning_rate": 3.6487500714214586e-07, + "loss": 3.0139, + "step": 90280 + }, + { + "epoch": 2.645294074562367, + "grad_norm": 17.202800750732422, + "learning_rate": 3.642821820516246e-07, + "loss": 3.0029, + "step": 90290 + }, + { + "epoch": 2.645587050465099, + "grad_norm": 19.267581939697266, + "learning_rate": 3.636898207371675e-07, + "loss": 3.0178, + "step": 90300 + }, + { + "epoch": 2.6458800263678315, + "grad_norm": 23.005075454711914, + "learning_rate": 3.6309792325803726e-07, + "loss": 3.0069, + "step": 90310 + }, + { + "epoch": 2.6461730022705634, + "grad_norm": 16.447465896606445, + "learning_rate": 3.625064896734504e-07, + "loss": 2.983, + "step": 90320 + }, + { + "epoch": 2.6464659781732953, + "grad_norm": 18.67270851135254, + "learning_rate": 3.619155200425767e-07, + "loss": 2.9861, + "step": 90330 + }, + { + "epoch": 2.6467589540760272, + "grad_norm": 19.536394119262695, + "learning_rate": 3.613250144245373e-07, + "loss": 2.9981, + "step": 90340 + }, + { + "epoch": 2.647051929978759, + "grad_norm": 19.346385955810547, + "learning_rate": 3.60734972878411e-07, + "loss": 3.0026, + "step": 90350 + }, + { + "epoch": 2.647344905881491, + "grad_norm": 22.274885177612305, + "learning_rate": 3.6014539546322615e-07, + "loss": 3.0053, + "step": 90360 + }, + { + "epoch": 2.647637881784223, + "grad_norm": 17.996044158935547, + "learning_rate": 3.595562822379678e-07, + "loss": 2.9868, + "step": 90370 + }, + { + "epoch": 2.6479308576869554, + "grad_norm": 15.886268615722656, + "learning_rate": 3.589676332615716e-07, + "loss": 2.9867, + "step": 90380 + }, + { + "epoch": 
2.6482238335896873, + "grad_norm": 16.662761688232422, + "learning_rate": 3.583794485929293e-07, + "loss": 3.0203, + "step": 90390 + }, + { + "epoch": 2.648516809492419, + "grad_norm": 14.952370643615723, + "learning_rate": 3.5779172829088604e-07, + "loss": 3.0008, + "step": 90400 + }, + { + "epoch": 2.648809785395151, + "grad_norm": 21.892356872558594, + "learning_rate": 3.5720447241423926e-07, + "loss": 2.9906, + "step": 90410 + }, + { + "epoch": 2.6491027612978835, + "grad_norm": 20.777183532714844, + "learning_rate": 3.5661768102173977e-07, + "loss": 2.995, + "step": 90420 + }, + { + "epoch": 2.6493957372006154, + "grad_norm": 22.616607666015625, + "learning_rate": 3.560313541720939e-07, + "loss": 3.0019, + "step": 90430 + }, + { + "epoch": 2.6496887131033473, + "grad_norm": 17.352678298950195, + "learning_rate": 3.5544549192395807e-07, + "loss": 3.0117, + "step": 90440 + }, + { + "epoch": 2.649981689006079, + "grad_norm": 20.86542510986328, + "learning_rate": 3.5486009433594646e-07, + "loss": 3.0233, + "step": 90450 + }, + { + "epoch": 2.650274664908811, + "grad_norm": 21.576032638549805, + "learning_rate": 3.5427516146662345e-07, + "loss": 2.979, + "step": 90460 + }, + { + "epoch": 2.650567640811543, + "grad_norm": 17.672304153442383, + "learning_rate": 3.536906933745082e-07, + "loss": 2.9906, + "step": 90470 + }, + { + "epoch": 2.650860616714275, + "grad_norm": 18.183298110961914, + "learning_rate": 3.53106690118073e-07, + "loss": 2.9982, + "step": 90480 + }, + { + "epoch": 2.6511535926170073, + "grad_norm": 17.500349044799805, + "learning_rate": 3.525231517557448e-07, + "loss": 2.9938, + "step": 90490 + }, + { + "epoch": 2.6514465685197393, + "grad_norm": 18.29456901550293, + "learning_rate": 3.519400783459037e-07, + "loss": 2.9862, + "step": 90500 + }, + { + "epoch": 2.651739544422471, + "grad_norm": 19.364036560058594, + "learning_rate": 3.513574699468808e-07, + "loss": 2.9986, + "step": 90510 + }, + { + "epoch": 2.652032520325203, + "grad_norm": 
19.713422775268555, + "learning_rate": 3.5077532661696446e-07, + "loss": 3.0058, + "step": 90520 + }, + { + "epoch": 2.6523254962279355, + "grad_norm": 18.04722785949707, + "learning_rate": 3.5019364841439254e-07, + "loss": 2.9886, + "step": 90530 + }, + { + "epoch": 2.6526184721306674, + "grad_norm": 17.729433059692383, + "learning_rate": 3.496124353973607e-07, + "loss": 3.002, + "step": 90540 + }, + { + "epoch": 2.6529114480333993, + "grad_norm": 18.367124557495117, + "learning_rate": 3.490316876240146e-07, + "loss": 2.994, + "step": 90550 + }, + { + "epoch": 2.653204423936131, + "grad_norm": 20.03067398071289, + "learning_rate": 3.484514051524546e-07, + "loss": 2.9963, + "step": 90560 + }, + { + "epoch": 2.653497399838863, + "grad_norm": 21.187976837158203, + "learning_rate": 3.478715880407346e-07, + "loss": 3.0283, + "step": 90570 + }, + { + "epoch": 2.653790375741595, + "grad_norm": 20.099302291870117, + "learning_rate": 3.4729223634686284e-07, + "loss": 3.005, + "step": 90580 + }, + { + "epoch": 2.6540833516443274, + "grad_norm": 19.548124313354492, + "learning_rate": 3.467133501287989e-07, + "loss": 3.0029, + "step": 90590 + }, + { + "epoch": 2.6543763275470593, + "grad_norm": 23.426301956176758, + "learning_rate": 3.461349294444577e-07, + "loss": 3.0088, + "step": 90600 + }, + { + "epoch": 2.6546693034497912, + "grad_norm": 21.98386573791504, + "learning_rate": 3.4555697435170567e-07, + "loss": 2.9946, + "step": 90610 + }, + { + "epoch": 2.654962279352523, + "grad_norm": 18.91567611694336, + "learning_rate": 3.4497948490836543e-07, + "loss": 3.0234, + "step": 90620 + }, + { + "epoch": 2.655079469713616, + "eval_bleu": 0.3538147310601265, + "eval_cap_loss": 0.8998149037361145, + "eval_con_loss": 1.126878023147583, + "eval_loss": 3.153571128845215, + "step": 90624 + }, + { + "epoch": 2.655079469713616, + "eval_bleu": 0.3538147310601265, + "eval_cap_loss": 0.8998149037361145, + "eval_con_loss": 1.126878023147583, + "eval_loss": 3.153571128845215, + 
"eval_runtime": 54.6304, + "eval_samples_per_second": 366.096, + "eval_steps_per_second": 0.366, + "step": 90624 + }, + { + "epoch": 2.6552552552552555, + "grad_norm": 17.394596099853516, + "learning_rate": 3.4440246117220965e-07, + "loss": 3.0047, + "step": 90630 + }, + { + "epoch": 2.6555482311579874, + "grad_norm": 20.540571212768555, + "learning_rate": 3.4382590320096666e-07, + "loss": 3.0042, + "step": 90640 + }, + { + "epoch": 2.6558412070607194, + "grad_norm": 19.23984718322754, + "learning_rate": 3.43249811052318e-07, + "loss": 2.9894, + "step": 90650 + }, + { + "epoch": 2.6561341829634513, + "grad_norm": 18.769906997680664, + "learning_rate": 3.426741847838988e-07, + "loss": 2.9971, + "step": 90660 + }, + { + "epoch": 2.656427158866183, + "grad_norm": 16.962139129638672, + "learning_rate": 3.420990244532962e-07, + "loss": 2.9864, + "step": 90670 + }, + { + "epoch": 2.656720134768915, + "grad_norm": 21.75751495361328, + "learning_rate": 3.415243301180521e-07, + "loss": 2.9999, + "step": 90680 + }, + { + "epoch": 2.657013110671647, + "grad_norm": 18.630510330200195, + "learning_rate": 3.4095010183566035e-07, + "loss": 3.033, + "step": 90690 + }, + { + "epoch": 2.6573060865743794, + "grad_norm": 19.269968032836914, + "learning_rate": 3.4037633966357063e-07, + "loss": 2.997, + "step": 90700 + }, + { + "epoch": 2.6575990624771113, + "grad_norm": 14.809626579284668, + "learning_rate": 3.3980304365918257e-07, + "loss": 3.0166, + "step": 90710 + }, + { + "epoch": 2.6578920383798432, + "grad_norm": 14.531834602355957, + "learning_rate": 3.39230213879852e-07, + "loss": 3.0064, + "step": 90720 + }, + { + "epoch": 2.658185014282575, + "grad_norm": 18.551021575927734, + "learning_rate": 3.3865785038288745e-07, + "loss": 2.9993, + "step": 90730 + }, + { + "epoch": 2.6584779901853075, + "grad_norm": 16.673490524291992, + "learning_rate": 3.380859532255504e-07, + "loss": 3.0104, + "step": 90740 + }, + { + "epoch": 2.6587709660880394, + "grad_norm": 20.550525665283203, + 
"learning_rate": 3.37514522465055e-07, + "loss": 3.0, + "step": 90750 + }, + { + "epoch": 2.6590639419907713, + "grad_norm": 15.619542121887207, + "learning_rate": 3.3694355815857117e-07, + "loss": 2.9812, + "step": 90760 + }, + { + "epoch": 2.6593569178935033, + "grad_norm": 16.540109634399414, + "learning_rate": 3.3637306036321813e-07, + "loss": 2.9977, + "step": 90770 + }, + { + "epoch": 2.659649893796235, + "grad_norm": 15.197124481201172, + "learning_rate": 3.358030291360731e-07, + "loss": 3.0032, + "step": 90780 + }, + { + "epoch": 2.659942869698967, + "grad_norm": 17.50227928161621, + "learning_rate": 3.352334645341626e-07, + "loss": 3.0337, + "step": 90790 + }, + { + "epoch": 2.660235845601699, + "grad_norm": 19.527002334594727, + "learning_rate": 3.346643666144683e-07, + "loss": 3.0066, + "step": 90800 + }, + { + "epoch": 2.6605288215044314, + "grad_norm": 16.12047576904297, + "learning_rate": 3.340957354339264e-07, + "loss": 2.9938, + "step": 90810 + }, + { + "epoch": 2.6608217974071633, + "grad_norm": 19.525224685668945, + "learning_rate": 3.3352757104942456e-07, + "loss": 3.0041, + "step": 90820 + }, + { + "epoch": 2.661114773309895, + "grad_norm": 21.971410751342773, + "learning_rate": 3.32959873517803e-07, + "loss": 3.0205, + "step": 90830 + }, + { + "epoch": 2.661407749212627, + "grad_norm": 16.072463989257812, + "learning_rate": 3.323926428958579e-07, + "loss": 2.9923, + "step": 90840 + }, + { + "epoch": 2.6617007251153595, + "grad_norm": 17.502023696899414, + "learning_rate": 3.318258792403378e-07, + "loss": 2.9821, + "step": 90850 + }, + { + "epoch": 2.6619937010180914, + "grad_norm": 15.96132755279541, + "learning_rate": 3.312595826079423e-07, + "loss": 2.9793, + "step": 90860 + }, + { + "epoch": 2.6622866769208233, + "grad_norm": 18.232593536376953, + "learning_rate": 3.3069375305532725e-07, + "loss": 3.0059, + "step": 90870 + }, + { + "epoch": 2.6625796528235552, + "grad_norm": 17.61224937438965, + "learning_rate": 3.3012839063910006e-07, + 
"loss": 3.0017, + "step": 90880 + }, + { + "epoch": 2.662872628726287, + "grad_norm": 19.253252029418945, + "learning_rate": 3.295634954158217e-07, + "loss": 3.0106, + "step": 90890 + }, + { + "epoch": 2.663165604629019, + "grad_norm": 18.11317253112793, + "learning_rate": 3.2899906744200625e-07, + "loss": 3.0095, + "step": 90900 + }, + { + "epoch": 2.663458580531751, + "grad_norm": 23.948049545288086, + "learning_rate": 3.284351067741237e-07, + "loss": 3.0095, + "step": 90910 + }, + { + "epoch": 2.6637515564344834, + "grad_norm": 17.69609260559082, + "learning_rate": 3.278716134685922e-07, + "loss": 3.0103, + "step": 90920 + }, + { + "epoch": 2.6640445323372153, + "grad_norm": 18.472827911376953, + "learning_rate": 3.2730858758178717e-07, + "loss": 2.9895, + "step": 90930 + }, + { + "epoch": 2.664337508239947, + "grad_norm": 26.42156982421875, + "learning_rate": 3.2674602917003586e-07, + "loss": 2.994, + "step": 90940 + }, + { + "epoch": 2.664630484142679, + "grad_norm": 18.671674728393555, + "learning_rate": 3.2618393828961927e-07, + "loss": 2.9978, + "step": 90950 + }, + { + "epoch": 2.6649234600454115, + "grad_norm": 17.2265625, + "learning_rate": 3.256223149967702e-07, + "loss": 3.0033, + "step": 90960 + }, + { + "epoch": 2.6652164359481434, + "grad_norm": 14.123478889465332, + "learning_rate": 3.25061159347676e-07, + "loss": 2.9885, + "step": 90970 + }, + { + "epoch": 2.6655094118508753, + "grad_norm": 18.225933074951172, + "learning_rate": 3.2450047139847785e-07, + "loss": 3.0005, + "step": 90980 + }, + { + "epoch": 2.6658023877536072, + "grad_norm": 13.759406089782715, + "learning_rate": 3.2394025120526916e-07, + "loss": 3.0015, + "step": 90990 + }, + { + "epoch": 2.666095363656339, + "grad_norm": 14.325350761413574, + "learning_rate": 3.233804988240957e-07, + "loss": 3.0015, + "step": 91000 + }, + { + "epoch": 2.666388339559071, + "grad_norm": 16.640625, + "learning_rate": 3.228212143109588e-07, + "loss": 2.9799, + "step": 91010 + }, + { + "epoch": 
2.6666813154618034, + "grad_norm": 20.555566787719727, + "learning_rate": 3.222623977218092e-07, + "loss": 3.0089, + "step": 91020 + }, + { + "epoch": 2.6669742913645353, + "grad_norm": 17.72934341430664, + "learning_rate": 3.2170404911255617e-07, + "loss": 2.992, + "step": 91030 + }, + { + "epoch": 2.6672672672672673, + "grad_norm": 17.990816116333008, + "learning_rate": 3.2114616853905654e-07, + "loss": 3.0255, + "step": 91040 + }, + { + "epoch": 2.667560243169999, + "grad_norm": 17.984689712524414, + "learning_rate": 3.2058875605712414e-07, + "loss": 3.0129, + "step": 91050 + }, + { + "epoch": 2.6678532190727315, + "grad_norm": 18.024316787719727, + "learning_rate": 3.2003181172252487e-07, + "loss": 3.0085, + "step": 91060 + }, + { + "epoch": 2.6681461949754635, + "grad_norm": 17.851940155029297, + "learning_rate": 3.19475335590978e-07, + "loss": 2.976, + "step": 91070 + }, + { + "epoch": 2.6684391708781954, + "grad_norm": 17.524354934692383, + "learning_rate": 3.189193277181546e-07, + "loss": 3.004, + "step": 91080 + }, + { + "epoch": 2.6687321467809273, + "grad_norm": 19.313669204711914, + "learning_rate": 3.183637881596807e-07, + "loss": 3.0094, + "step": 91090 + }, + { + "epoch": 2.669025122683659, + "grad_norm": 16.854692459106445, + "learning_rate": 3.1780871697113467e-07, + "loss": 3.0158, + "step": 91100 + }, + { + "epoch": 2.669318098586391, + "grad_norm": 22.679426193237305, + "learning_rate": 3.1725411420804877e-07, + "loss": 3.0054, + "step": 91110 + }, + { + "epoch": 2.669611074489123, + "grad_norm": 19.045808792114258, + "learning_rate": 3.1669997992590574e-07, + "loss": 3.0113, + "step": 91120 + }, + { + "epoch": 2.6699040503918554, + "grad_norm": 17.60618782043457, + "learning_rate": 3.1614631418014464e-07, + "loss": 3.0003, + "step": 91130 + }, + { + "epoch": 2.6700798359334943, + "eval_bleu": 0.35382154507387836, + "eval_cap_loss": 0.8996003866195679, + "eval_con_loss": 1.1259232759475708, + "eval_loss": 3.151447057723999, + "step": 91136 + }, 
+ { + "epoch": 2.6700798359334943, + "eval_bleu": 0.35382154507387836, + "eval_cap_loss": 0.8996003866195679, + "eval_con_loss": 1.1259232759475708, + "eval_loss": 3.151447057723999, + "eval_runtime": 55.2539, + "eval_samples_per_second": 361.965, + "eval_steps_per_second": 0.362, + "step": 91136 + }, + { + "epoch": 2.6701970262945873, + "grad_norm": 19.16583824157715, + "learning_rate": 3.155931170261567e-07, + "loss": 3.0136, + "step": 91140 + }, + { + "epoch": 2.6704900021973192, + "grad_norm": 13.534708023071289, + "learning_rate": 3.1504038851928654e-07, + "loss": 2.9865, + "step": 91150 + }, + { + "epoch": 2.670782978100051, + "grad_norm": 16.246009826660156, + "learning_rate": 3.1448812871482994e-07, + "loss": 3.0024, + "step": 91160 + }, + { + "epoch": 2.6710759540027835, + "grad_norm": 16.51812171936035, + "learning_rate": 3.1393633766803823e-07, + "loss": 2.9909, + "step": 91170 + }, + { + "epoch": 2.6713689299055154, + "grad_norm": 15.926804542541504, + "learning_rate": 3.133850154341139e-07, + "loss": 2.9941, + "step": 91180 + }, + { + "epoch": 2.6716619058082474, + "grad_norm": 20.038421630859375, + "learning_rate": 3.1283416206821404e-07, + "loss": 3.0148, + "step": 91190 + }, + { + "epoch": 2.6719548817109793, + "grad_norm": 19.585968017578125, + "learning_rate": 3.1228377762544895e-07, + "loss": 2.9976, + "step": 91200 + }, + { + "epoch": 2.672247857613711, + "grad_norm": 17.52625846862793, + "learning_rate": 3.1173386216088007e-07, + "loss": 3.0201, + "step": 91210 + }, + { + "epoch": 2.672540833516443, + "grad_norm": 18.832918167114258, + "learning_rate": 3.111844157295235e-07, + "loss": 2.9872, + "step": 91220 + }, + { + "epoch": 2.672833809419175, + "grad_norm": 17.410524368286133, + "learning_rate": 3.1063543838634856e-07, + "loss": 3.0254, + "step": 91230 + }, + { + "epoch": 2.6731267853219074, + "grad_norm": 20.83738899230957, + "learning_rate": 3.10086930186278e-07, + "loss": 2.9976, + "step": 91240 + }, + { + "epoch": 2.6734197612246393, + 
"grad_norm": 19.289798736572266, + "learning_rate": 3.0953889118418457e-07, + "loss": 3.014, + "step": 91250 + }, + { + "epoch": 2.6737127371273712, + "grad_norm": 18.576831817626953, + "learning_rate": 3.089913214348983e-07, + "loss": 2.9884, + "step": 91260 + }, + { + "epoch": 2.674005713030103, + "grad_norm": 18.996313095092773, + "learning_rate": 3.084442209931987e-07, + "loss": 2.993, + "step": 91270 + }, + { + "epoch": 2.6742986889328355, + "grad_norm": 16.6117000579834, + "learning_rate": 3.07897589913822e-07, + "loss": 2.986, + "step": 91280 + }, + { + "epoch": 2.6745916648355674, + "grad_norm": 18.783252716064453, + "learning_rate": 3.073514282514528e-07, + "loss": 3.0199, + "step": 91290 + }, + { + "epoch": 2.6748846407382993, + "grad_norm": 17.67466926574707, + "learning_rate": 3.068057360607329e-07, + "loss": 2.9794, + "step": 91300 + }, + { + "epoch": 2.6751776166410313, + "grad_norm": 17.688539505004883, + "learning_rate": 3.062605133962554e-07, + "loss": 2.9993, + "step": 91310 + }, + { + "epoch": 2.675470592543763, + "grad_norm": 14.650411605834961, + "learning_rate": 3.057157603125671e-07, + "loss": 3.017, + "step": 91320 + }, + { + "epoch": 2.675763568446495, + "grad_norm": 15.176665306091309, + "learning_rate": 3.0517147686416613e-07, + "loss": 2.9934, + "step": 91330 + }, + { + "epoch": 2.676056544349227, + "grad_norm": 19.211259841918945, + "learning_rate": 3.046276631055062e-07, + "loss": 2.9859, + "step": 91340 + }, + { + "epoch": 2.6763495202519594, + "grad_norm": 18.594738006591797, + "learning_rate": 3.040843190909909e-07, + "loss": 3.0034, + "step": 91350 + }, + { + "epoch": 2.6766424961546913, + "grad_norm": 17.50558090209961, + "learning_rate": 3.035414448749807e-07, + "loss": 2.9858, + "step": 91360 + }, + { + "epoch": 2.676935472057423, + "grad_norm": 13.835028648376465, + "learning_rate": 3.02999040511785e-07, + "loss": 2.9929, + "step": 91370 + }, + { + "epoch": 2.677228447960155, + "grad_norm": 17.16483497619629, + "learning_rate": 
3.024571060556686e-07, + "loss": 3.0089, + "step": 91380 + }, + { + "epoch": 2.6775214238628875, + "grad_norm": 21.755035400390625, + "learning_rate": 3.0191564156084927e-07, + "loss": 2.9947, + "step": 91390 + }, + { + "epoch": 2.6778143997656194, + "grad_norm": 16.236190795898438, + "learning_rate": 3.013746470814982e-07, + "loss": 2.9888, + "step": 91400 + }, + { + "epoch": 2.6781073756683513, + "grad_norm": 17.57683563232422, + "learning_rate": 3.0083412267173704e-07, + "loss": 3.0042, + "step": 91410 + }, + { + "epoch": 2.6784003515710832, + "grad_norm": 17.094406127929688, + "learning_rate": 3.0029406838564313e-07, + "loss": 2.986, + "step": 91420 + }, + { + "epoch": 2.678693327473815, + "grad_norm": 15.907098770141602, + "learning_rate": 2.997544842772449e-07, + "loss": 2.9951, + "step": 91430 + }, + { + "epoch": 2.678986303376547, + "grad_norm": 19.648048400878906, + "learning_rate": 2.992153704005252e-07, + "loss": 3.0085, + "step": 91440 + }, + { + "epoch": 2.679279279279279, + "grad_norm": 15.697529792785645, + "learning_rate": 2.986767268094182e-07, + "loss": 2.9973, + "step": 91450 + }, + { + "epoch": 2.6795722551820114, + "grad_norm": 15.788487434387207, + "learning_rate": 2.9813855355781295e-07, + "loss": 3.0003, + "step": 91460 + }, + { + "epoch": 2.6798652310847433, + "grad_norm": 19.50789451599121, + "learning_rate": 2.976008506995498e-07, + "loss": 2.988, + "step": 91470 + }, + { + "epoch": 2.680158206987475, + "grad_norm": 21.105653762817383, + "learning_rate": 2.9706361828842455e-07, + "loss": 3.0047, + "step": 91480 + }, + { + "epoch": 2.680451182890207, + "grad_norm": 15.925585746765137, + "learning_rate": 2.9652685637818147e-07, + "loss": 3.0189, + "step": 91490 + }, + { + "epoch": 2.6807441587929395, + "grad_norm": 17.348188400268555, + "learning_rate": 2.9599056502252253e-07, + "loss": 2.9998, + "step": 91500 + }, + { + "epoch": 2.6810371346956714, + "grad_norm": 18.838031768798828, + "learning_rate": 2.954547442750988e-07, + "loss": 
3.0199, + "step": 91510 + }, + { + "epoch": 2.6813301105984033, + "grad_norm": 20.643739700317383, + "learning_rate": 2.9491939418951787e-07, + "loss": 2.9993, + "step": 91520 + }, + { + "epoch": 2.6816230865011352, + "grad_norm": 14.710201263427734, + "learning_rate": 2.943845148193364e-07, + "loss": 2.9862, + "step": 91530 + }, + { + "epoch": 2.681916062403867, + "grad_norm": 19.21002960205078, + "learning_rate": 2.9385010621806653e-07, + "loss": 3.0002, + "step": 91540 + }, + { + "epoch": 2.682209038306599, + "grad_norm": 17.733068466186523, + "learning_rate": 2.933161684391733e-07, + "loss": 2.9992, + "step": 91550 + }, + { + "epoch": 2.6825020142093314, + "grad_norm": 16.363893508911133, + "learning_rate": 2.9278270153607345e-07, + "loss": 2.9896, + "step": 91560 + }, + { + "epoch": 2.6827949901120633, + "grad_norm": 20.523635864257812, + "learning_rate": 2.922497055621376e-07, + "loss": 3.0065, + "step": 91570 + }, + { + "epoch": 2.6830879660147953, + "grad_norm": 15.44674015045166, + "learning_rate": 2.9171718057068867e-07, + "loss": 2.9914, + "step": 91580 + }, + { + "epoch": 2.683380941917527, + "grad_norm": 19.05780792236328, + "learning_rate": 2.911851266150023e-07, + "loss": 2.9961, + "step": 91590 + }, + { + "epoch": 2.6836739178202595, + "grad_norm": 21.84246826171875, + "learning_rate": 2.9065354374830767e-07, + "loss": 3.0101, + "step": 91600 + }, + { + "epoch": 2.6839668937229915, + "grad_norm": 16.877891540527344, + "learning_rate": 2.9012243202378663e-07, + "loss": 2.9865, + "step": 91610 + }, + { + "epoch": 2.6842598696257234, + "grad_norm": 21.052228927612305, + "learning_rate": 2.895917914945723e-07, + "loss": 3.0027, + "step": 91620 + }, + { + "epoch": 2.6845528455284553, + "grad_norm": 18.282567977905273, + "learning_rate": 2.890616222137549e-07, + "loss": 2.978, + "step": 91630 + }, + { + "epoch": 2.684845821431187, + "grad_norm": 17.9250431060791, + "learning_rate": 2.8853192423437205e-07, + "loss": 3.0049, + "step": 91640 + }, + { + 
"epoch": 2.685080202153373, + "eval_bleu": 0.3538124133934715, + "eval_cap_loss": 0.8995830416679382, + "eval_con_loss": 1.1261568069458008, + "eval_loss": 3.1518964767456055, + "step": 91648 + }, + { + "epoch": 2.685080202153373, + "eval_bleu": 0.3538124133934715, + "eval_cap_loss": 0.8995830416679382, + "eval_con_loss": 1.1261568069458008, + "eval_loss": 3.1518964767456055, + "eval_runtime": 54.4787, + "eval_samples_per_second": 367.116, + "eval_steps_per_second": 0.367, + "step": 91648 + }, + { + "epoch": 2.685138797333919, + "grad_norm": 18.464323043823242, + "learning_rate": 2.880026976094191e-07, + "loss": 2.9978, + "step": 91650 + }, + { + "epoch": 2.685431773236651, + "grad_norm": 18.25983238220215, + "learning_rate": 2.874739423918399e-07, + "loss": 2.9933, + "step": 91660 + }, + { + "epoch": 2.6857247491393834, + "grad_norm": 18.7008113861084, + "learning_rate": 2.869456586345354e-07, + "loss": 2.9896, + "step": 91670 + }, + { + "epoch": 2.6860177250421153, + "grad_norm": 18.943038940429688, + "learning_rate": 2.86417846390355e-07, + "loss": 2.993, + "step": 91680 + }, + { + "epoch": 2.6863107009448473, + "grad_norm": 20.855125427246094, + "learning_rate": 2.858905057121053e-07, + "loss": 3.0006, + "step": 91690 + }, + { + "epoch": 2.686603676847579, + "grad_norm": 18.294506072998047, + "learning_rate": 2.853636366525414e-07, + "loss": 2.9725, + "step": 91700 + }, + { + "epoch": 2.6868966527503115, + "grad_norm": 17.421756744384766, + "learning_rate": 2.8483723926437613e-07, + "loss": 2.9905, + "step": 91710 + }, + { + "epoch": 2.6871896286530434, + "grad_norm": 19.847293853759766, + "learning_rate": 2.843113136002695e-07, + "loss": 2.9891, + "step": 91720 + }, + { + "epoch": 2.6874826045557754, + "grad_norm": 17.910188674926758, + "learning_rate": 2.837858597128401e-07, + "loss": 2.9875, + "step": 91730 + }, + { + "epoch": 2.6877755804585073, + "grad_norm": 18.5717716217041, + "learning_rate": 2.8326087765465415e-07, + "loss": 3.0002, + "step": 91740 + 
}, + { + "epoch": 2.688068556361239, + "grad_norm": 14.794572830200195, + "learning_rate": 2.827363674782352e-07, + "loss": 2.9629, + "step": 91750 + }, + { + "epoch": 2.688361532263971, + "grad_norm": 21.061859130859375, + "learning_rate": 2.822123292360546e-07, + "loss": 3.0091, + "step": 91760 + }, + { + "epoch": 2.688654508166703, + "grad_norm": 13.693034172058105, + "learning_rate": 2.8168876298054205e-07, + "loss": 2.9861, + "step": 91770 + }, + { + "epoch": 2.6889474840694354, + "grad_norm": 18.442058563232422, + "learning_rate": 2.8116566876407404e-07, + "loss": 3.0003, + "step": 91780 + }, + { + "epoch": 2.6892404599721673, + "grad_norm": 19.312597274780273, + "learning_rate": 2.80643046638987e-07, + "loss": 3.0211, + "step": 91790 + }, + { + "epoch": 2.6895334358748992, + "grad_norm": 19.914913177490234, + "learning_rate": 2.80120896657563e-07, + "loss": 2.9952, + "step": 91800 + }, + { + "epoch": 2.689826411777631, + "grad_norm": 21.923139572143555, + "learning_rate": 2.79599218872042e-07, + "loss": 3.0062, + "step": 91810 + }, + { + "epoch": 2.6901193876803635, + "grad_norm": 17.236116409301758, + "learning_rate": 2.7907801333461325e-07, + "loss": 3.0057, + "step": 91820 + }, + { + "epoch": 2.6904123635830954, + "grad_norm": 19.89923858642578, + "learning_rate": 2.7855728009742177e-07, + "loss": 3.0107, + "step": 91830 + }, + { + "epoch": 2.6907053394858274, + "grad_norm": 16.514537811279297, + "learning_rate": 2.7803701921256255e-07, + "loss": 2.9949, + "step": 91840 + }, + { + "epoch": 2.6909983153885593, + "grad_norm": 17.99540901184082, + "learning_rate": 2.7751723073208557e-07, + "loss": 2.9993, + "step": 91850 + }, + { + "epoch": 2.691291291291291, + "grad_norm": 18.76699447631836, + "learning_rate": 2.7699791470799096e-07, + "loss": 2.9964, + "step": 91860 + }, + { + "epoch": 2.691584267194023, + "grad_norm": 16.179101943969727, + "learning_rate": 2.7647907119223547e-07, + "loss": 2.9775, + "step": 91870 + }, + { + "epoch": 2.691877243096755, + 
"grad_norm": 20.819377899169922, + "learning_rate": 2.7596070023672426e-07, + "loss": 3.0032, + "step": 91880 + }, + { + "epoch": 2.6921702189994874, + "grad_norm": 18.591018676757812, + "learning_rate": 2.7544280189331973e-07, + "loss": 3.0176, + "step": 91890 + }, + { + "epoch": 2.6924631949022193, + "grad_norm": 18.354719161987305, + "learning_rate": 2.749253762138315e-07, + "loss": 2.99, + "step": 91900 + }, + { + "epoch": 2.692756170804951, + "grad_norm": 16.978010177612305, + "learning_rate": 2.7440842325002716e-07, + "loss": 3.0064, + "step": 91910 + }, + { + "epoch": 2.693049146707683, + "grad_norm": 17.700029373168945, + "learning_rate": 2.7389194305362463e-07, + "loss": 3.0, + "step": 91920 + }, + { + "epoch": 2.6933421226104155, + "grad_norm": 19.927244186401367, + "learning_rate": 2.733759356762933e-07, + "loss": 2.9958, + "step": 91930 + }, + { + "epoch": 2.6936350985131474, + "grad_norm": 18.808271408081055, + "learning_rate": 2.728604011696573e-07, + "loss": 3.0029, + "step": 91940 + }, + { + "epoch": 2.6939280744158793, + "grad_norm": 18.752511978149414, + "learning_rate": 2.723453395852932e-07, + "loss": 3.0155, + "step": 91950 + }, + { + "epoch": 2.6942210503186113, + "grad_norm": 18.968481063842773, + "learning_rate": 2.7183075097473035e-07, + "loss": 3.0066, + "step": 91960 + }, + { + "epoch": 2.694514026221343, + "grad_norm": 17.49675178527832, + "learning_rate": 2.7131663538944917e-07, + "loss": 3.0102, + "step": 91970 + }, + { + "epoch": 2.694807002124075, + "grad_norm": 20.353368759155273, + "learning_rate": 2.7080299288088474e-07, + "loss": 2.9928, + "step": 91980 + }, + { + "epoch": 2.6950999780268075, + "grad_norm": 19.28335189819336, + "learning_rate": 2.702898235004225e-07, + "loss": 2.9903, + "step": 91990 + }, + { + "epoch": 2.6953929539295394, + "grad_norm": 20.70633316040039, + "learning_rate": 2.697771272994043e-07, + "loss": 2.9881, + "step": 92000 + }, + { + "epoch": 2.6956859298322713, + "grad_norm": 16.849040985107422, + 
"learning_rate": 2.692649043291201e-07, + "loss": 3.0158, + "step": 92010 + }, + { + "epoch": 2.695978905735003, + "grad_norm": 17.41381072998047, + "learning_rate": 2.6875315464081566e-07, + "loss": 3.0137, + "step": 92020 + }, + { + "epoch": 2.6962718816377356, + "grad_norm": 18.012540817260742, + "learning_rate": 2.682929846197507e-07, + "loss": 3.0071, + "step": 92030 + }, + { + "epoch": 2.6965648575404675, + "grad_norm": 19.815603256225586, + "learning_rate": 2.6778213430821876e-07, + "loss": 3.0159, + "step": 92040 + }, + { + "epoch": 2.6968578334431994, + "grad_norm": 20.214826583862305, + "learning_rate": 2.6727175742700837e-07, + "loss": 3.0188, + "step": 92050 + }, + { + "epoch": 2.6971508093459313, + "grad_norm": 16.162506103515625, + "learning_rate": 2.667618540271799e-07, + "loss": 2.9936, + "step": 92060 + }, + { + "epoch": 2.6974437852486632, + "grad_norm": 19.391969680786133, + "learning_rate": 2.6625242415974697e-07, + "loss": 2.9751, + "step": 92070 + }, + { + "epoch": 2.697736761151395, + "grad_norm": 19.32337188720703, + "learning_rate": 2.657434678756754e-07, + "loss": 2.9819, + "step": 92080 + }, + { + "epoch": 2.698029737054127, + "grad_norm": 16.69041633605957, + "learning_rate": 2.6523498522588175e-07, + "loss": 2.9861, + "step": 92090 + }, + { + "epoch": 2.6983227129568594, + "grad_norm": 18.805402755737305, + "learning_rate": 2.647269762612381e-07, + "loss": 3.0066, + "step": 92100 + }, + { + "epoch": 2.6986156888595914, + "grad_norm": 18.121646881103516, + "learning_rate": 2.6421944103256657e-07, + "loss": 3.0079, + "step": 92110 + }, + { + "epoch": 2.6989086647623233, + "grad_norm": 19.936782836914062, + "learning_rate": 2.637123795906438e-07, + "loss": 3.0047, + "step": 92120 + }, + { + "epoch": 2.699201640665055, + "grad_norm": 16.026084899902344, + "learning_rate": 2.6320579198619746e-07, + "loss": 3.0061, + "step": 92130 + }, + { + "epoch": 2.6994946165677876, + "grad_norm": 18.061132431030273, + "learning_rate": 
2.6269967826990926e-07, + "loss": 3.0074, + "step": 92140 + }, + { + "epoch": 2.6997875924705195, + "grad_norm": 20.149234771728516, + "learning_rate": 2.6219403849241263e-07, + "loss": 2.9959, + "step": 92150 + }, + { + "epoch": 2.7000805683732514, + "grad_norm": 20.526351928710938, + "learning_rate": 2.6168887270429487e-07, + "loss": 3.003, + "step": 92160 + }, + { + "epoch": 2.7000805683732514, + "eval_bleu": 0.3538952305886356, + "eval_cap_loss": 0.8994816541671753, + "eval_con_loss": 1.1258118152618408, + "eval_loss": 3.1511054039001465, + "step": 92160 + }, + { + "epoch": 2.7000805683732514, + "eval_bleu": 0.3538952305886356, + "eval_cap_loss": 0.8994816541671753, + "eval_con_loss": 1.1258118152618408, + "eval_loss": 3.1511054039001465, + "eval_runtime": 56.881, + "eval_samples_per_second": 351.611, + "eval_steps_per_second": 0.352, + "step": 92160 + }, + { + "epoch": 2.7003735442759833, + "grad_norm": 20.111791610717773, + "learning_rate": 2.6118418095609335e-07, + "loss": 3.0246, + "step": 92170 + }, + { + "epoch": 2.7006665201787152, + "grad_norm": 18.083498001098633, + "learning_rate": 2.606799632983009e-07, + "loss": 2.9861, + "step": 92180 + }, + { + "epoch": 2.700959496081447, + "grad_norm": 18.675798416137695, + "learning_rate": 2.6017621978135956e-07, + "loss": 2.9813, + "step": 92190 + }, + { + "epoch": 2.701252471984179, + "grad_norm": 16.595258712768555, + "learning_rate": 2.596729504556683e-07, + "loss": 3.0159, + "step": 92200 + }, + { + "epoch": 2.7015454478869114, + "grad_norm": 16.809730529785156, + "learning_rate": 2.5917015537157365e-07, + "loss": 2.9883, + "step": 92210 + }, + { + "epoch": 2.7018384237896433, + "grad_norm": 20.852066040039062, + "learning_rate": 2.586678345793786e-07, + "loss": 2.9938, + "step": 92220 + }, + { + "epoch": 2.7021313996923753, + "grad_norm": 13.463897705078125, + "learning_rate": 2.58165988129338e-07, + "loss": 2.9637, + "step": 92230 + }, + { + "epoch": 2.702424375595107, + "grad_norm": 17.758575439453125, + 
"learning_rate": 2.5766461607165725e-07, + "loss": 2.9965, + "step": 92240 + }, + { + "epoch": 2.7027173514978395, + "grad_norm": 21.96714973449707, + "learning_rate": 2.5716371845649734e-07, + "loss": 2.9956, + "step": 92250 + }, + { + "epoch": 2.7030103274005715, + "grad_norm": 17.727697372436523, + "learning_rate": 2.5666329533396816e-07, + "loss": 3.0001, + "step": 92260 + }, + { + "epoch": 2.7033033033033034, + "grad_norm": 16.897197723388672, + "learning_rate": 2.5616334675413636e-07, + "loss": 2.995, + "step": 92270 + }, + { + "epoch": 2.7035962792060353, + "grad_norm": 16.451751708984375, + "learning_rate": 2.556638727670163e-07, + "loss": 3.0078, + "step": 92280 + }, + { + "epoch": 2.703889255108767, + "grad_norm": 19.101428985595703, + "learning_rate": 2.5516487342257924e-07, + "loss": 3.0024, + "step": 92290 + }, + { + "epoch": 2.704182231011499, + "grad_norm": 20.77566146850586, + "learning_rate": 2.546663487707457e-07, + "loss": 2.9928, + "step": 92300 + }, + { + "epoch": 2.704475206914231, + "grad_norm": 17.0070743560791, + "learning_rate": 2.541682988613914e-07, + "loss": 2.9766, + "step": 92310 + }, + { + "epoch": 2.7047681828169634, + "grad_norm": 17.95570182800293, + "learning_rate": 2.5367072374434253e-07, + "loss": 3.0079, + "step": 92320 + }, + { + "epoch": 2.7050611587196953, + "grad_norm": 17.252283096313477, + "learning_rate": 2.5317362346937933e-07, + "loss": 3.0221, + "step": 92330 + }, + { + "epoch": 2.7053541346224272, + "grad_norm": 19.574676513671875, + "learning_rate": 2.52676998086232e-07, + "loss": 2.9931, + "step": 92340 + }, + { + "epoch": 2.705647110525159, + "grad_norm": 13.565011024475098, + "learning_rate": 2.521808476445875e-07, + "loss": 2.984, + "step": 92350 + }, + { + "epoch": 2.7059400864278915, + "grad_norm": 17.45563507080078, + "learning_rate": 2.516851721940805e-07, + "loss": 2.9708, + "step": 92360 + }, + { + "epoch": 2.7062330623306234, + "grad_norm": 21.343223571777344, + "learning_rate": 2.511899717843014e-07, + 
"loss": 3.0116, + "step": 92370 + }, + { + "epoch": 2.7065260382333554, + "grad_norm": 16.08833122253418, + "learning_rate": 2.506952464647916e-07, + "loss": 2.9798, + "step": 92380 + }, + { + "epoch": 2.7068190141360873, + "grad_norm": 18.319507598876953, + "learning_rate": 2.5020099628504603e-07, + "loss": 2.9949, + "step": 92390 + }, + { + "epoch": 2.707111990038819, + "grad_norm": 17.507970809936523, + "learning_rate": 2.4970722129451066e-07, + "loss": 2.9896, + "step": 92400 + }, + { + "epoch": 2.707404965941551, + "grad_norm": 20.805543899536133, + "learning_rate": 2.492139215425865e-07, + "loss": 3.0088, + "step": 92410 + }, + { + "epoch": 2.7076979418442835, + "grad_norm": 18.008438110351562, + "learning_rate": 2.48721097078623e-07, + "loss": 3.0064, + "step": 92420 + }, + { + "epoch": 2.7079909177470154, + "grad_norm": 20.70323371887207, + "learning_rate": 2.4822874795192575e-07, + "loss": 2.9731, + "step": 92430 + }, + { + "epoch": 2.7082838936497473, + "grad_norm": 20.671762466430664, + "learning_rate": 2.4773687421175083e-07, + "loss": 3.004, + "step": 92440 + }, + { + "epoch": 2.7085768695524792, + "grad_norm": 20.263134002685547, + "learning_rate": 2.4724547590730786e-07, + "loss": 2.9846, + "step": 92450 + }, + { + "epoch": 2.708869845455211, + "grad_norm": 19.34324073791504, + "learning_rate": 2.4675455308775744e-07, + "loss": 2.9838, + "step": 92460 + }, + { + "epoch": 2.7091628213579435, + "grad_norm": 16.014394760131836, + "learning_rate": 2.462641058022136e-07, + "loss": 3.0053, + "step": 92470 + }, + { + "epoch": 2.7094557972606754, + "grad_norm": 15.821072578430176, + "learning_rate": 2.457741340997433e-07, + "loss": 3.0013, + "step": 92480 + }, + { + "epoch": 2.7097487731634073, + "grad_norm": 19.300065994262695, + "learning_rate": 2.4528463802936555e-07, + "loss": 2.9882, + "step": 92490 + }, + { + "epoch": 2.7100417490661393, + "grad_norm": 20.06265640258789, + "learning_rate": 2.4479561764005013e-07, + "loss": 2.9887, + "step": 92500 + }, 
+ { + "epoch": 2.710334724968871, + "grad_norm": 15.636382102966309, + "learning_rate": 2.4430707298072287e-07, + "loss": 3.0069, + "step": 92510 + }, + { + "epoch": 2.710627700871603, + "grad_norm": 18.79867935180664, + "learning_rate": 2.4381900410025695e-07, + "loss": 2.9923, + "step": 92520 + }, + { + "epoch": 2.7109206767743355, + "grad_norm": 18.25360679626465, + "learning_rate": 2.4333141104748327e-07, + "loss": 2.9965, + "step": 92530 + }, + { + "epoch": 2.7112136526770674, + "grad_norm": 21.867401123046875, + "learning_rate": 2.4284429387118115e-07, + "loss": 3.008, + "step": 92540 + }, + { + "epoch": 2.7115066285797993, + "grad_norm": 15.630112648010254, + "learning_rate": 2.423576526200838e-07, + "loss": 3.0062, + "step": 92550 + }, + { + "epoch": 2.711799604482531, + "grad_norm": 18.499387741088867, + "learning_rate": 2.418714873428774e-07, + "loss": 3.0189, + "step": 92560 + }, + { + "epoch": 2.7120925803852636, + "grad_norm": 20.68560218811035, + "learning_rate": 2.413857980882001e-07, + "loss": 3.013, + "step": 92570 + }, + { + "epoch": 2.7123855562879955, + "grad_norm": 19.204166412353516, + "learning_rate": 2.4090058490464087e-07, + "loss": 3.0121, + "step": 92580 + }, + { + "epoch": 2.7126785321907274, + "grad_norm": 19.52005386352539, + "learning_rate": 2.404158478407437e-07, + "loss": 3.0217, + "step": 92590 + }, + { + "epoch": 2.7129715080934593, + "grad_norm": 19.76167869567871, + "learning_rate": 2.3993158694500415e-07, + "loss": 3.0027, + "step": 92600 + }, + { + "epoch": 2.7132644839961912, + "grad_norm": 18.324954986572266, + "learning_rate": 2.3944780226586804e-07, + "loss": 2.9814, + "step": 92610 + }, + { + "epoch": 2.713557459898923, + "grad_norm": 18.45107650756836, + "learning_rate": 2.389644938517366e-07, + "loss": 3.0101, + "step": 92620 + }, + { + "epoch": 2.713850435801655, + "grad_norm": 19.497230529785156, + "learning_rate": 2.3848166175096056e-07, + "loss": 3.0239, + "step": 92630 + }, + { + "epoch": 2.7141434117043874, + 
"grad_norm": 22.922861099243164, + "learning_rate": 2.3799930601184518e-07, + "loss": 2.9898, + "step": 92640 + }, + { + "epoch": 2.7144363876071194, + "grad_norm": 17.371023178100586, + "learning_rate": 2.3751742668264689e-07, + "loss": 2.9848, + "step": 92650 + }, + { + "epoch": 2.7147293635098513, + "grad_norm": 17.28805160522461, + "learning_rate": 2.3703602381157653e-07, + "loss": 2.9987, + "step": 92660 + }, + { + "epoch": 2.715022339412583, + "grad_norm": 20.347667694091797, + "learning_rate": 2.3655509744679339e-07, + "loss": 3.0165, + "step": 92670 + }, + { + "epoch": 2.7150809345931295, + "eval_bleu": 0.35363084215369933, + "eval_cap_loss": 0.8994849920272827, + "eval_con_loss": 1.125730276107788, + "eval_loss": 3.1509456634521484, + "step": 92672 + }, + { + "epoch": 2.7150809345931295, + "eval_bleu": 0.35363084215369933, + "eval_cap_loss": 0.8994849920272827, + "eval_con_loss": 1.125730276107788, + "eval_loss": 3.1509456634521484, + "eval_runtime": 54.8415, + "eval_samples_per_second": 364.687, + "eval_steps_per_second": 0.365, + "step": 92672 + }, + { + "epoch": 2.7153153153153156, + "grad_norm": 21.772058486938477, + "learning_rate": 2.3607464763641285e-07, + "loss": 3.0049, + "step": 92680 + }, + { + "epoch": 2.7156082912180475, + "grad_norm": 18.872068405151367, + "learning_rate": 2.3559467442849927e-07, + "loss": 3.015, + "step": 92690 + }, + { + "epoch": 2.7159012671207794, + "grad_norm": 16.012163162231445, + "learning_rate": 2.3511517787107363e-07, + "loss": 2.9936, + "step": 92700 + }, + { + "epoch": 2.7161942430235113, + "grad_norm": 17.5259952545166, + "learning_rate": 2.3463615801210426e-07, + "loss": 2.9928, + "step": 92710 + }, + { + "epoch": 2.7164872189262432, + "grad_norm": 21.93014144897461, + "learning_rate": 2.341576148995156e-07, + "loss": 2.9903, + "step": 92720 + }, + { + "epoch": 2.716780194828975, + "grad_norm": 17.346588134765625, + "learning_rate": 2.336795485811827e-07, + "loss": 2.9993, + "step": 92730 + }, + { + "epoch": 
2.717073170731707, + "grad_norm": 19.023311614990234, + "learning_rate": 2.3320195910493392e-07, + "loss": 2.9967, + "step": 92740 + }, + { + "epoch": 2.7173661466344394, + "grad_norm": 18.709468841552734, + "learning_rate": 2.3272484651854776e-07, + "loss": 2.9933, + "step": 92750 + }, + { + "epoch": 2.7176591225371713, + "grad_norm": 16.609909057617188, + "learning_rate": 2.3224821086975879e-07, + "loss": 3.0009, + "step": 92760 + }, + { + "epoch": 2.7179520984399033, + "grad_norm": 17.563926696777344, + "learning_rate": 2.3177205220624887e-07, + "loss": 2.9801, + "step": 92770 + }, + { + "epoch": 2.718245074342635, + "grad_norm": 18.317283630371094, + "learning_rate": 2.3129637057565712e-07, + "loss": 2.9917, + "step": 92780 + }, + { + "epoch": 2.7185380502453675, + "grad_norm": 19.657649993896484, + "learning_rate": 2.3082116602557157e-07, + "loss": 3.0005, + "step": 92790 + }, + { + "epoch": 2.7188310261480995, + "grad_norm": 17.13313865661621, + "learning_rate": 2.3034643860353312e-07, + "loss": 2.9863, + "step": 92800 + }, + { + "epoch": 2.7191240020508314, + "grad_norm": 17.847501754760742, + "learning_rate": 2.298721883570365e-07, + "loss": 2.9849, + "step": 92810 + }, + { + "epoch": 2.7194169779535633, + "grad_norm": 17.688201904296875, + "learning_rate": 2.293984153335277e-07, + "loss": 2.9872, + "step": 92820 + }, + { + "epoch": 2.719709953856295, + "grad_norm": 21.970260620117188, + "learning_rate": 2.2892511958040376e-07, + "loss": 3.0059, + "step": 92830 + }, + { + "epoch": 2.720002929759027, + "grad_norm": 17.700321197509766, + "learning_rate": 2.2845230114501627e-07, + "loss": 2.9939, + "step": 92840 + }, + { + "epoch": 2.720295905661759, + "grad_norm": 18.797441482543945, + "learning_rate": 2.2797996007466682e-07, + "loss": 3.0017, + "step": 92850 + }, + { + "epoch": 2.7205888815644914, + "grad_norm": 16.8956356048584, + "learning_rate": 2.275080964166121e-07, + "loss": 3.0046, + "step": 92860 + }, + { + "epoch": 2.7208818574672233, + "grad_norm": 
16.151657104492188, + "learning_rate": 2.2703671021805707e-07, + "loss": 3.0027, + "step": 92870 + }, + { + "epoch": 2.7211748333699552, + "grad_norm": 21.10832977294922, + "learning_rate": 2.2656580152616237e-07, + "loss": 2.9983, + "step": 92880 + }, + { + "epoch": 2.721467809272687, + "grad_norm": 18.16106605529785, + "learning_rate": 2.260953703880392e-07, + "loss": 2.9938, + "step": 92890 + }, + { + "epoch": 2.7217607851754195, + "grad_norm": 16.232248306274414, + "learning_rate": 2.2562541685075212e-07, + "loss": 2.989, + "step": 92900 + }, + { + "epoch": 2.7220537610781514, + "grad_norm": 17.972307205200195, + "learning_rate": 2.2515594096131633e-07, + "loss": 3.0011, + "step": 92910 + }, + { + "epoch": 2.7223467369808834, + "grad_norm": 17.003219604492188, + "learning_rate": 2.2468694276670088e-07, + "loss": 3.0142, + "step": 92920 + }, + { + "epoch": 2.7226397128836153, + "grad_norm": 17.66781997680664, + "learning_rate": 2.2421842231382552e-07, + "loss": 3.0024, + "step": 92930 + }, + { + "epoch": 2.722932688786347, + "grad_norm": 16.97702980041504, + "learning_rate": 2.2375037964956326e-07, + "loss": 2.9933, + "step": 92940 + }, + { + "epoch": 2.723225664689079, + "grad_norm": 19.881608963012695, + "learning_rate": 2.232828148207389e-07, + "loss": 2.9858, + "step": 92950 + }, + { + "epoch": 2.7235186405918115, + "grad_norm": 15.875232696533203, + "learning_rate": 2.2281572787412942e-07, + "loss": 2.9889, + "step": 92960 + }, + { + "epoch": 2.7238116164945434, + "grad_norm": 18.50567626953125, + "learning_rate": 2.2234911885646472e-07, + "loss": 3.0154, + "step": 92970 + }, + { + "epoch": 2.7241045923972753, + "grad_norm": 19.46870231628418, + "learning_rate": 2.2188298781442573e-07, + "loss": 2.9921, + "step": 92980 + }, + { + "epoch": 2.7243975683000072, + "grad_norm": 19.538528442382812, + "learning_rate": 2.2141733479464634e-07, + "loss": 3.0306, + "step": 92990 + }, + { + "epoch": 2.7246905442027396, + "grad_norm": 19.412118911743164, + 
"learning_rate": 2.20952159843712e-07, + "loss": 3.0198, + "step": 93000 + }, + { + "epoch": 2.7249835201054715, + "grad_norm": 22.49936866760254, + "learning_rate": 2.204874630081616e-07, + "loss": 3.0041, + "step": 93010 + }, + { + "epoch": 2.7252764960082034, + "grad_norm": 21.111120223999023, + "learning_rate": 2.200232443344835e-07, + "loss": 3.011, + "step": 93020 + }, + { + "epoch": 2.7255694719109353, + "grad_norm": 20.844045639038086, + "learning_rate": 2.1955950386912228e-07, + "loss": 3.0058, + "step": 93030 + }, + { + "epoch": 2.7258624478136673, + "grad_norm": 16.330141067504883, + "learning_rate": 2.1909624165847075e-07, + "loss": 2.9976, + "step": 93040 + }, + { + "epoch": 2.726155423716399, + "grad_norm": 18.72747802734375, + "learning_rate": 2.1863345774887635e-07, + "loss": 2.9999, + "step": 93050 + }, + { + "epoch": 2.726448399619131, + "grad_norm": 14.460762023925781, + "learning_rate": 2.1817115218663698e-07, + "loss": 2.983, + "step": 93060 + }, + { + "epoch": 2.7267413755218635, + "grad_norm": 15.162249565124512, + "learning_rate": 2.1770932501800514e-07, + "loss": 3.0238, + "step": 93070 + }, + { + "epoch": 2.7270343514245954, + "grad_norm": 20.005111694335938, + "learning_rate": 2.1724797628918214e-07, + "loss": 2.9901, + "step": 93080 + }, + { + "epoch": 2.7273273273273273, + "grad_norm": 19.22265625, + "learning_rate": 2.1678710604632503e-07, + "loss": 2.9941, + "step": 93090 + }, + { + "epoch": 2.727620303230059, + "grad_norm": 17.197946548461914, + "learning_rate": 2.163267143355391e-07, + "loss": 2.9977, + "step": 93100 + }, + { + "epoch": 2.7279132791327916, + "grad_norm": 18.641347885131836, + "learning_rate": 2.1586680120288583e-07, + "loss": 2.9889, + "step": 93110 + }, + { + "epoch": 2.7282062550355235, + "grad_norm": 19.785219192504883, + "learning_rate": 2.1540736669437513e-07, + "loss": 2.9989, + "step": 93120 + }, + { + "epoch": 2.7284992309382554, + "grad_norm": 16.013261795043945, + "learning_rate": 2.149484108559713e-07, + 
"loss": 2.9855, + "step": 93130 + }, + { + "epoch": 2.7287922068409873, + "grad_norm": 16.498104095458984, + "learning_rate": 2.1448993373358983e-07, + "loss": 2.985, + "step": 93140 + }, + { + "epoch": 2.7290851827437193, + "grad_norm": 15.866523742675781, + "learning_rate": 2.140319353731002e-07, + "loss": 3.022, + "step": 93150 + }, + { + "epoch": 2.729378158646451, + "grad_norm": 21.860084533691406, + "learning_rate": 2.1357441582032068e-07, + "loss": 2.9994, + "step": 93160 + }, + { + "epoch": 2.729671134549183, + "grad_norm": 16.064908981323242, + "learning_rate": 2.1311737512102415e-07, + "loss": 2.9975, + "step": 93170 + }, + { + "epoch": 2.7299641104519154, + "grad_norm": 16.722518920898438, + "learning_rate": 2.1266081332093402e-07, + "loss": 3.0207, + "step": 93180 + }, + { + "epoch": 2.730081300813008, + "eval_bleu": 0.3537770471707242, + "eval_cap_loss": 0.8994135856628418, + "eval_con_loss": 1.1254702806472778, + "eval_loss": 3.1503543853759766, + "step": 93184 + }, + { + "epoch": 2.730081300813008, + "eval_bleu": 0.3537770471707242, + "eval_cap_loss": 0.8994135856628418, + "eval_con_loss": 1.1254702806472778, + "eval_loss": 3.1503543853759766, + "eval_runtime": 54.1941, + "eval_samples_per_second": 369.044, + "eval_steps_per_second": 0.369, + "step": 93184 + }, + { + "epoch": 2.7302570863546474, + "grad_norm": 18.19828987121582, + "learning_rate": 2.1220473046572821e-07, + "loss": 2.9889, + "step": 93190 + }, + { + "epoch": 2.7305500622573793, + "grad_norm": 17.88983917236328, + "learning_rate": 2.1174912660103296e-07, + "loss": 2.9878, + "step": 93200 + }, + { + "epoch": 2.730843038160111, + "grad_norm": 20.692440032958984, + "learning_rate": 2.112940017724302e-07, + "loss": 2.9909, + "step": 93210 + }, + { + "epoch": 2.7311360140628436, + "grad_norm": 15.684040069580078, + "learning_rate": 2.1083935602545235e-07, + "loss": 2.9868, + "step": 93220 + }, + { + "epoch": 2.7314289899655755, + "grad_norm": 19.811725616455078, + "learning_rate": 
2.1038518940558416e-07, + "loss": 2.9802, + "step": 93230 + }, + { + "epoch": 2.7317219658683074, + "grad_norm": 21.470056533813477, + "learning_rate": 2.099315019582615e-07, + "loss": 3.0137, + "step": 93240 + }, + { + "epoch": 2.7320149417710393, + "grad_norm": 17.521690368652344, + "learning_rate": 2.0947829372887418e-07, + "loss": 2.987, + "step": 93250 + }, + { + "epoch": 2.7323079176737712, + "grad_norm": 18.648096084594727, + "learning_rate": 2.0902556476276147e-07, + "loss": 2.9961, + "step": 93260 + }, + { + "epoch": 2.732600893576503, + "grad_norm": 19.1317081451416, + "learning_rate": 2.0857331510521828e-07, + "loss": 2.9983, + "step": 93270 + }, + { + "epoch": 2.732893869479235, + "grad_norm": 20.68265724182129, + "learning_rate": 2.0812154480148727e-07, + "loss": 2.9875, + "step": 93280 + }, + { + "epoch": 2.7331868453819674, + "grad_norm": 16.56145477294922, + "learning_rate": 2.076702538967662e-07, + "loss": 3.0099, + "step": 93290 + }, + { + "epoch": 2.7334798212846994, + "grad_norm": 17.431032180786133, + "learning_rate": 2.0721944243620507e-07, + "loss": 3.0145, + "step": 93300 + }, + { + "epoch": 2.7337727971874313, + "grad_norm": 20.01137924194336, + "learning_rate": 2.0676911046490334e-07, + "loss": 2.9877, + "step": 93310 + }, + { + "epoch": 2.734065773090163, + "grad_norm": 15.525433540344238, + "learning_rate": 2.0631925802791608e-07, + "loss": 3.0186, + "step": 93320 + }, + { + "epoch": 2.7343587489928955, + "grad_norm": 19.030771255493164, + "learning_rate": 2.058698851702462e-07, + "loss": 3.0013, + "step": 93330 + }, + { + "epoch": 2.7346517248956275, + "grad_norm": 21.78780746459961, + "learning_rate": 2.054209919368527e-07, + "loss": 2.9953, + "step": 93340 + }, + { + "epoch": 2.7349447007983594, + "grad_norm": 17.761404037475586, + "learning_rate": 2.049725783726425e-07, + "loss": 3.0028, + "step": 93350 + }, + { + "epoch": 2.7352376767010913, + "grad_norm": 17.309810638427734, + "learning_rate": 2.045246445224791e-07, + "loss": 
3.0009, + "step": 93360 + }, + { + "epoch": 2.735530652603823, + "grad_norm": 19.896108627319336, + "learning_rate": 2.0407719043117334e-07, + "loss": 2.9891, + "step": 93370 + }, + { + "epoch": 2.735823628506555, + "grad_norm": 18.91130256652832, + "learning_rate": 2.0363021614349166e-07, + "loss": 2.9938, + "step": 93380 + }, + { + "epoch": 2.7361166044092875, + "grad_norm": 18.046846389770508, + "learning_rate": 2.03183721704151e-07, + "loss": 2.9702, + "step": 93390 + }, + { + "epoch": 2.7364095803120194, + "grad_norm": 18.081436157226562, + "learning_rate": 2.027377071578207e-07, + "loss": 3.0142, + "step": 93400 + }, + { + "epoch": 2.7367025562147513, + "grad_norm": 18.618520736694336, + "learning_rate": 2.022921725491217e-07, + "loss": 3.0153, + "step": 93410 + }, + { + "epoch": 2.7369955321174833, + "grad_norm": 18.81499481201172, + "learning_rate": 2.0184711792262724e-07, + "loss": 2.9985, + "step": 93420 + }, + { + "epoch": 2.737288508020215, + "grad_norm": 19.753904342651367, + "learning_rate": 2.0140254332286114e-07, + "loss": 3.0137, + "step": 93430 + }, + { + "epoch": 2.7375814839229475, + "grad_norm": 19.276344299316406, + "learning_rate": 2.0095844879430282e-07, + "loss": 3.0151, + "step": 93440 + }, + { + "epoch": 2.7378744598256795, + "grad_norm": 17.28042221069336, + "learning_rate": 2.0051483438137897e-07, + "loss": 2.9879, + "step": 93450 + }, + { + "epoch": 2.7381674357284114, + "grad_norm": 18.276023864746094, + "learning_rate": 2.0007170012847133e-07, + "loss": 2.9967, + "step": 93460 + }, + { + "epoch": 2.7384604116311433, + "grad_norm": 19.404949188232422, + "learning_rate": 1.996290460799133e-07, + "loss": 2.995, + "step": 93470 + }, + { + "epoch": 2.738753387533875, + "grad_norm": 18.831802368164062, + "learning_rate": 1.9918687227999056e-07, + "loss": 2.967, + "step": 93480 + }, + { + "epoch": 2.739046363436607, + "grad_norm": 15.046919822692871, + "learning_rate": 1.9874517877293774e-07, + "loss": 3.0224, + "step": 93490 + }, + { + 
"epoch": 2.7393393393393395, + "grad_norm": 18.07546615600586, + "learning_rate": 1.9830396560294617e-07, + "loss": 2.9915, + "step": 93500 + }, + { + "epoch": 2.7396323152420714, + "grad_norm": 16.150588989257812, + "learning_rate": 1.9786323281415443e-07, + "loss": 2.9699, + "step": 93510 + }, + { + "epoch": 2.7399252911448033, + "grad_norm": 18.033864974975586, + "learning_rate": 1.9742298045065667e-07, + "loss": 2.9952, + "step": 93520 + }, + { + "epoch": 2.7402182670475352, + "grad_norm": 18.144939422607422, + "learning_rate": 1.9698320855649654e-07, + "loss": 2.985, + "step": 93530 + }, + { + "epoch": 2.7405112429502676, + "grad_norm": 19.5450382232666, + "learning_rate": 1.9654391717567112e-07, + "loss": 2.9973, + "step": 93540 + }, + { + "epoch": 2.7408042188529995, + "grad_norm": 19.429431915283203, + "learning_rate": 1.9610510635212855e-07, + "loss": 2.9932, + "step": 93550 + }, + { + "epoch": 2.7410971947557314, + "grad_norm": 15.200150489807129, + "learning_rate": 1.9566677612977035e-07, + "loss": 2.9745, + "step": 93560 + }, + { + "epoch": 2.7413901706584634, + "grad_norm": 19.140241622924805, + "learning_rate": 1.9522892655244762e-07, + "loss": 2.9885, + "step": 93570 + }, + { + "epoch": 2.7416831465611953, + "grad_norm": 18.971651077270508, + "learning_rate": 1.947915576639653e-07, + "loss": 3.0159, + "step": 93580 + }, + { + "epoch": 2.741976122463927, + "grad_norm": 19.663448333740234, + "learning_rate": 1.943546695080789e-07, + "loss": 3.01, + "step": 93590 + }, + { + "epoch": 2.742269098366659, + "grad_norm": 18.74468421936035, + "learning_rate": 1.9391826212849797e-07, + "loss": 2.9973, + "step": 93600 + }, + { + "epoch": 2.7425620742693915, + "grad_norm": 17.867572784423828, + "learning_rate": 1.934823355688803e-07, + "loss": 2.9965, + "step": 93610 + }, + { + "epoch": 2.7428550501721234, + "grad_norm": 18.43490982055664, + "learning_rate": 1.9304688987283883e-07, + "loss": 2.9915, + "step": 93620 + }, + { + "epoch": 2.7431480260748553, + 
"grad_norm": 19.233814239501953, + "learning_rate": 1.9261192508393755e-07, + "loss": 3.001, + "step": 93630 + }, + { + "epoch": 2.7434410019775872, + "grad_norm": 19.251441955566406, + "learning_rate": 1.9217744124569226e-07, + "loss": 3.0092, + "step": 93640 + }, + { + "epoch": 2.7437339778803196, + "grad_norm": 22.15867805480957, + "learning_rate": 1.9174343840156985e-07, + "loss": 2.9884, + "step": 93650 + }, + { + "epoch": 2.7440269537830515, + "grad_norm": 20.97469139099121, + "learning_rate": 1.9130991659498998e-07, + "loss": 2.9837, + "step": 93660 + }, + { + "epoch": 2.7443199296857834, + "grad_norm": 16.00536346435547, + "learning_rate": 1.9087687586932412e-07, + "loss": 2.9861, + "step": 93670 + }, + { + "epoch": 2.7446129055885153, + "grad_norm": 20.76568031311035, + "learning_rate": 1.9044431626789483e-07, + "loss": 2.9757, + "step": 93680 + }, + { + "epoch": 2.7449058814912473, + "grad_norm": 16.284730911254883, + "learning_rate": 1.900122378339786e-07, + "loss": 2.9869, + "step": 93690 + }, + { + "epoch": 2.7450816670328866, + "eval_bleu": 0.35394435787906614, + "eval_cap_loss": 0.899275541305542, + "eval_con_loss": 1.125493049621582, + "eval_loss": 3.150261640548706, + "step": 93696 + }, + { + "epoch": 2.7450816670328866, + "eval_bleu": 0.35394435787906614, + "eval_cap_loss": 0.899275541305542, + "eval_con_loss": 1.125493049621582, + "eval_loss": 3.150261640548706, + "eval_runtime": 60.7229, + "eval_samples_per_second": 329.365, + "eval_steps_per_second": 0.329, + "step": 93696 + }, + { + "epoch": 2.745198857393979, + "grad_norm": 17.928905487060547, + "learning_rate": 1.8958064061079972e-07, + "loss": 2.995, + "step": 93700 + }, + { + "epoch": 2.745491833296711, + "grad_norm": 18.37485694885254, + "learning_rate": 1.891495246415398e-07, + "loss": 2.9959, + "step": 93710 + }, + { + "epoch": 2.7457848091994435, + "grad_norm": 24.60184669494629, + "learning_rate": 1.8871888996932762e-07, + "loss": 2.9993, + "step": 93720 + }, + { + "epoch": 
2.7460777851021754, + "grad_norm": 19.856760025024414, + "learning_rate": 1.8828873663724657e-07, + "loss": 2.9966, + "step": 93730 + }, + { + "epoch": 2.7463707610049073, + "grad_norm": 15.183549880981445, + "learning_rate": 1.8785906468832992e-07, + "loss": 3.0062, + "step": 93740 + }, + { + "epoch": 2.746663736907639, + "grad_norm": 20.588956832885742, + "learning_rate": 1.8742987416556503e-07, + "loss": 2.9933, + "step": 93750 + }, + { + "epoch": 2.7469567128103716, + "grad_norm": 15.442214965820312, + "learning_rate": 1.870011651118886e-07, + "loss": 2.9933, + "step": 93760 + }, + { + "epoch": 2.7472496887131035, + "grad_norm": 20.72815704345703, + "learning_rate": 1.865729375701919e-07, + "loss": 2.982, + "step": 93770 + }, + { + "epoch": 2.7475426646158354, + "grad_norm": 23.29121971130371, + "learning_rate": 1.8614519158331457e-07, + "loss": 3.016, + "step": 93780 + }, + { + "epoch": 2.7478356405185673, + "grad_norm": 22.307126998901367, + "learning_rate": 1.857179271940518e-07, + "loss": 3.0077, + "step": 93790 + }, + { + "epoch": 2.7481286164212992, + "grad_norm": 18.747852325439453, + "learning_rate": 1.8529114444514827e-07, + "loss": 3.0087, + "step": 93800 + }, + { + "epoch": 2.748421592324031, + "grad_norm": 17.37303352355957, + "learning_rate": 1.8486484337930156e-07, + "loss": 3.0056, + "step": 93810 + }, + { + "epoch": 2.748714568226763, + "grad_norm": 19.397287368774414, + "learning_rate": 1.8443902403915914e-07, + "loss": 2.9977, + "step": 93820 + }, + { + "epoch": 2.7490075441294954, + "grad_norm": 17.83242416381836, + "learning_rate": 1.8401368646732364e-07, + "loss": 2.986, + "step": 93830 + }, + { + "epoch": 2.7493005200322274, + "grad_norm": 16.285266876220703, + "learning_rate": 1.8358883070634547e-07, + "loss": 3.0042, + "step": 93840 + }, + { + "epoch": 2.7495934959349593, + "grad_norm": 23.475662231445312, + "learning_rate": 1.8316445679873007e-07, + "loss": 2.9767, + "step": 93850 + }, + { + "epoch": 2.749886471837691, + "grad_norm": 
17.895605087280273, + "learning_rate": 1.8274056478693402e-07, + "loss": 2.9868, + "step": 93860 + }, + { + "epoch": 2.7501794477404236, + "grad_norm": 17.687002182006836, + "learning_rate": 1.8231715471336454e-07, + "loss": 2.9943, + "step": 93870 + }, + { + "epoch": 2.7504724236431555, + "grad_norm": 19.831735610961914, + "learning_rate": 1.8189422662038104e-07, + "loss": 3.0157, + "step": 93880 + }, + { + "epoch": 2.7507653995458874, + "grad_norm": 18.906068801879883, + "learning_rate": 1.814717805502958e-07, + "loss": 2.9936, + "step": 93890 + }, + { + "epoch": 2.7510583754486193, + "grad_norm": 19.516918182373047, + "learning_rate": 1.8104981654537113e-07, + "loss": 2.9822, + "step": 93900 + }, + { + "epoch": 2.7513513513513512, + "grad_norm": 16.621328353881836, + "learning_rate": 1.8062833464782325e-07, + "loss": 2.9837, + "step": 93910 + }, + { + "epoch": 2.751644327254083, + "grad_norm": 19.690616607666016, + "learning_rate": 1.8020733489981735e-07, + "loss": 3.0083, + "step": 93920 + }, + { + "epoch": 2.7519373031568155, + "grad_norm": 24.890893936157227, + "learning_rate": 1.797868173434725e-07, + "loss": 3.0065, + "step": 93930 + }, + { + "epoch": 2.7522302790595474, + "grad_norm": 20.94524574279785, + "learning_rate": 1.7936678202085945e-07, + "loss": 2.9826, + "step": 93940 + }, + { + "epoch": 2.7525232549622793, + "grad_norm": 16.873456954956055, + "learning_rate": 1.7894722897400075e-07, + "loss": 2.9714, + "step": 93950 + }, + { + "epoch": 2.7528162308650113, + "grad_norm": 19.18708038330078, + "learning_rate": 1.785281582448689e-07, + "loss": 3.013, + "step": 93960 + }, + { + "epoch": 2.7531092067677436, + "grad_norm": 16.64519691467285, + "learning_rate": 1.7810956987539042e-07, + "loss": 2.9857, + "step": 93970 + }, + { + "epoch": 2.7534021826704755, + "grad_norm": 18.011564254760742, + "learning_rate": 1.7769146390744173e-07, + "loss": 3.0162, + "step": 93980 + }, + { + "epoch": 2.7536951585732075, + "grad_norm": 18.151926040649414, + 
"learning_rate": 1.7727384038285278e-07, + "loss": 3.0054, + "step": 93990 + }, + { + "epoch": 2.7539881344759394, + "grad_norm": 19.48988151550293, + "learning_rate": 1.7685669934340288e-07, + "loss": 3.0143, + "step": 94000 + }, + { + "epoch": 2.7542811103786713, + "grad_norm": 18.656524658203125, + "learning_rate": 1.764400408308259e-07, + "loss": 2.9889, + "step": 94010 + }, + { + "epoch": 2.754574086281403, + "grad_norm": 21.163625717163086, + "learning_rate": 1.7602386488680567e-07, + "loss": 3.0113, + "step": 94020 + }, + { + "epoch": 2.754867062184135, + "grad_norm": 22.49643898010254, + "learning_rate": 1.75649719167717e-07, + "loss": 3.0066, + "step": 94030 + }, + { + "epoch": 2.7551600380868675, + "grad_norm": 15.406214714050293, + "learning_rate": 1.752344602186218e-07, + "loss": 3.0058, + "step": 94040 + }, + { + "epoch": 2.7554530139895994, + "grad_norm": 16.366498947143555, + "learning_rate": 1.7481968395869464e-07, + "loss": 3.0035, + "step": 94050 + }, + { + "epoch": 2.7557459898923313, + "grad_norm": 17.064363479614258, + "learning_rate": 1.7440539042943071e-07, + "loss": 2.9829, + "step": 94060 + }, + { + "epoch": 2.7560389657950632, + "grad_norm": 18.09139060974121, + "learning_rate": 1.7399157967227798e-07, + "loss": 2.9967, + "step": 94070 + }, + { + "epoch": 2.7563319416977956, + "grad_norm": 16.028884887695312, + "learning_rate": 1.7357825172863717e-07, + "loss": 3.0155, + "step": 94080 + }, + { + "epoch": 2.7566249176005275, + "grad_norm": 18.65414047241211, + "learning_rate": 1.7316540663985747e-07, + "loss": 2.9921, + "step": 94090 + }, + { + "epoch": 2.7569178935032594, + "grad_norm": 16.709434509277344, + "learning_rate": 1.7275304444724251e-07, + "loss": 3.006, + "step": 94100 + }, + { + "epoch": 2.7572108694059914, + "grad_norm": 15.918365478515625, + "learning_rate": 1.723411651920459e-07, + "loss": 3.0085, + "step": 94110 + }, + { + "epoch": 2.7575038453087233, + "grad_norm": 20.010177612304688, + "learning_rate": 
1.7192976891547476e-07, + "loss": 2.9908, + "step": 94120 + }, + { + "epoch": 2.757796821211455, + "grad_norm": 18.381290435791016, + "learning_rate": 1.7151885565868553e-07, + "loss": 2.9862, + "step": 94130 + }, + { + "epoch": 2.758089797114187, + "grad_norm": 18.152597427368164, + "learning_rate": 1.711084254627887e-07, + "loss": 2.9998, + "step": 94140 + }, + { + "epoch": 2.7583827730169195, + "grad_norm": 20.93446922302246, + "learning_rate": 1.7069847836884478e-07, + "loss": 3.0067, + "step": 94150 + }, + { + "epoch": 2.7586757489196514, + "grad_norm": 20.431137084960938, + "learning_rate": 1.702890144178676e-07, + "loss": 3.0148, + "step": 94160 + }, + { + "epoch": 2.7589687248223833, + "grad_norm": 16.44817352294922, + "learning_rate": 1.6988003365081996e-07, + "loss": 2.9933, + "step": 94170 + }, + { + "epoch": 2.7592617007251152, + "grad_norm": 20.8680362701416, + "learning_rate": 1.6947153610861965e-07, + "loss": 2.9883, + "step": 94180 + }, + { + "epoch": 2.7595546766278476, + "grad_norm": 19.652482986450195, + "learning_rate": 1.6906352183213292e-07, + "loss": 3.012, + "step": 94190 + }, + { + "epoch": 2.7598476525305795, + "grad_norm": 18.466630935668945, + "learning_rate": 1.6865599086218098e-07, + "loss": 2.9869, + "step": 94200 + }, + { + "epoch": 2.7600820332527647, + "eval_bleu": 0.3538759213260902, + "eval_cap_loss": 0.899131715297699, + "eval_con_loss": 1.1252284049987793, + "eval_loss": 3.1495883464813232, + "step": 94208 + }, + { + "epoch": 2.7600820332527647, + "eval_bleu": 0.3538759213260902, + "eval_cap_loss": 0.899131715297699, + "eval_con_loss": 1.1252284049987793, + "eval_loss": 3.1495883464813232, + "eval_runtime": 53.0025, + "eval_samples_per_second": 377.341, + "eval_steps_per_second": 0.377, + "step": 94208 + }, + { + "epoch": 2.7601406284333114, + "grad_norm": 18.174882888793945, + "learning_rate": 1.6824894323953233e-07, + "loss": 3.0108, + "step": 94210 + }, + { + "epoch": 2.7604336043360433, + "grad_norm": 16.9205265045166, + 
"learning_rate": 1.6784237900491163e-07, + "loss": 3.0093, + "step": 94220 + }, + { + "epoch": 2.7607265802387753, + "grad_norm": 20.906614303588867, + "learning_rate": 1.67436298198993e-07, + "loss": 3.0127, + "step": 94230 + }, + { + "epoch": 2.761019556141507, + "grad_norm": 18.126544952392578, + "learning_rate": 1.6703070086240226e-07, + "loss": 3.0238, + "step": 94240 + }, + { + "epoch": 2.761312532044239, + "grad_norm": 18.22772789001465, + "learning_rate": 1.666255870357164e-07, + "loss": 3.0024, + "step": 94250 + }, + { + "epoch": 2.7616055079469715, + "grad_norm": 20.077816009521484, + "learning_rate": 1.6622095675946527e-07, + "loss": 2.9909, + "step": 94260 + }, + { + "epoch": 2.7618984838497034, + "grad_norm": 19.802785873413086, + "learning_rate": 1.6581681007412976e-07, + "loss": 3.0182, + "step": 94270 + }, + { + "epoch": 2.7621914597524353, + "grad_norm": 19.938634872436523, + "learning_rate": 1.65413147020142e-07, + "loss": 2.9982, + "step": 94280 + }, + { + "epoch": 2.762484435655167, + "grad_norm": 19.46788215637207, + "learning_rate": 1.6500996763788636e-07, + "loss": 2.9733, + "step": 94290 + }, + { + "epoch": 2.7627774115578996, + "grad_norm": 21.185672760009766, + "learning_rate": 1.6460727196769776e-07, + "loss": 2.995, + "step": 94300 + }, + { + "epoch": 2.7630703874606315, + "grad_norm": 15.676284790039062, + "learning_rate": 1.6420506004986458e-07, + "loss": 2.9998, + "step": 94310 + }, + { + "epoch": 2.7633633633633634, + "grad_norm": 17.659481048583984, + "learning_rate": 1.6380333192462628e-07, + "loss": 3.0071, + "step": 94320 + }, + { + "epoch": 2.7636563392660953, + "grad_norm": 21.96422004699707, + "learning_rate": 1.6340208763217126e-07, + "loss": 2.9966, + "step": 94330 + }, + { + "epoch": 2.7639493151688272, + "grad_norm": 18.64176368713379, + "learning_rate": 1.630013272126435e-07, + "loss": 3.0005, + "step": 94340 + }, + { + "epoch": 2.764242291071559, + "grad_norm": 21.69329071044922, + "learning_rate": 
1.6260105070613596e-07, + "loss": 2.9932, + "step": 94350 + }, + { + "epoch": 2.7645352669742915, + "grad_norm": 18.578468322753906, + "learning_rate": 1.6220125815269327e-07, + "loss": 3.01, + "step": 94360 + }, + { + "epoch": 2.7648282428770234, + "grad_norm": 20.181705474853516, + "learning_rate": 1.61801949592314e-07, + "loss": 3.004, + "step": 94370 + }, + { + "epoch": 2.7651212187797554, + "grad_norm": 19.343854904174805, + "learning_rate": 1.6140312506494504e-07, + "loss": 2.9955, + "step": 94380 + }, + { + "epoch": 2.7654141946824873, + "grad_norm": 18.057483673095703, + "learning_rate": 1.6100478461048729e-07, + "loss": 3.0166, + "step": 94390 + }, + { + "epoch": 2.7657071705852196, + "grad_norm": 16.35106658935547, + "learning_rate": 1.6060692826879165e-07, + "loss": 2.9851, + "step": 94400 + }, + { + "epoch": 2.7660001464879516, + "grad_norm": 18.8363037109375, + "learning_rate": 1.6020955607966237e-07, + "loss": 2.9928, + "step": 94410 + }, + { + "epoch": 2.7662931223906835, + "grad_norm": 18.57612419128418, + "learning_rate": 1.5981266808285268e-07, + "loss": 3.0001, + "step": 94420 + }, + { + "epoch": 2.7665860982934154, + "grad_norm": 17.66020965576172, + "learning_rate": 1.5941626431807078e-07, + "loss": 2.9998, + "step": 94430 + }, + { + "epoch": 2.7668790741961473, + "grad_norm": 18.130107879638672, + "learning_rate": 1.590203448249722e-07, + "loss": 2.9837, + "step": 94440 + }, + { + "epoch": 2.7671720500988792, + "grad_norm": 19.48138427734375, + "learning_rate": 1.5862490964316857e-07, + "loss": 2.9981, + "step": 94450 + }, + { + "epoch": 2.767465026001611, + "grad_norm": 17.758668899536133, + "learning_rate": 1.5822995881221935e-07, + "loss": 2.999, + "step": 94460 + }, + { + "epoch": 2.7677580019043435, + "grad_norm": 21.26416015625, + "learning_rate": 1.5783549237163742e-07, + "loss": 2.9962, + "step": 94470 + }, + { + "epoch": 2.7680509778070754, + "grad_norm": 19.569971084594727, + "learning_rate": 1.5744151036088674e-07, + "loss": 2.9874, 
+ "step": 94480 + }, + { + "epoch": 2.7683439537098073, + "grad_norm": 19.37531280517578, + "learning_rate": 1.5704801281938408e-07, + "loss": 2.9838, + "step": 94490 + }, + { + "epoch": 2.7686369296125393, + "grad_norm": 21.636648178100586, + "learning_rate": 1.5665499978649412e-07, + "loss": 3.0219, + "step": 94500 + }, + { + "epoch": 2.7689299055152716, + "grad_norm": 18.984363555908203, + "learning_rate": 1.5626247130153816e-07, + "loss": 3.0042, + "step": 94510 + }, + { + "epoch": 2.7692228814180035, + "grad_norm": 16.76352310180664, + "learning_rate": 1.5587042740378422e-07, + "loss": 3.0109, + "step": 94520 + }, + { + "epoch": 2.7695158573207355, + "grad_norm": 17.340036392211914, + "learning_rate": 1.554788681324554e-07, + "loss": 3.0051, + "step": 94530 + }, + { + "epoch": 2.7698088332234674, + "grad_norm": 19.84832191467285, + "learning_rate": 1.5508779352672366e-07, + "loss": 2.9859, + "step": 94540 + }, + { + "epoch": 2.7701018091261993, + "grad_norm": 15.387134552001953, + "learning_rate": 1.5469720362571382e-07, + "loss": 2.9807, + "step": 94550 + }, + { + "epoch": 2.770394785028931, + "grad_norm": 14.005971908569336, + "learning_rate": 1.54307098468503e-07, + "loss": 3.0072, + "step": 94560 + }, + { + "epoch": 2.770687760931663, + "grad_norm": 18.217477798461914, + "learning_rate": 1.539174780941194e-07, + "loss": 3.0051, + "step": 94570 + }, + { + "epoch": 2.7709807368343955, + "grad_norm": 18.93244171142578, + "learning_rate": 1.5352834254154013e-07, + "loss": 2.995, + "step": 94580 + }, + { + "epoch": 2.7712737127371274, + "grad_norm": 19.415103912353516, + "learning_rate": 1.53139691849698e-07, + "loss": 2.9931, + "step": 94590 + }, + { + "epoch": 2.7715666886398593, + "grad_norm": 18.89051055908203, + "learning_rate": 1.5275152605747356e-07, + "loss": 3.0219, + "step": 94600 + }, + { + "epoch": 2.7718596645425913, + "grad_norm": 16.341289520263672, + "learning_rate": 1.5236384520370128e-07, + "loss": 3.0037, + "step": 94610 + }, + { + "epoch": 
2.7721526404453236, + "grad_norm": 18.172176361083984, + "learning_rate": 1.5197664932716627e-07, + "loss": 2.9852, + "step": 94620 + }, + { + "epoch": 2.7724456163480555, + "grad_norm": 16.882965087890625, + "learning_rate": 1.5158993846660476e-07, + "loss": 2.9947, + "step": 94630 + }, + { + "epoch": 2.7727385922507874, + "grad_norm": 18.11585235595703, + "learning_rate": 1.5120371266070576e-07, + "loss": 2.9931, + "step": 94640 + }, + { + "epoch": 2.7730315681535194, + "grad_norm": 17.88173484802246, + "learning_rate": 1.50817971948109e-07, + "loss": 3.0066, + "step": 94650 + }, + { + "epoch": 2.7733245440562513, + "grad_norm": 19.83518409729004, + "learning_rate": 1.5043271636740354e-07, + "loss": 2.9842, + "step": 94660 + }, + { + "epoch": 2.773617519958983, + "grad_norm": 17.105690002441406, + "learning_rate": 1.500479459571347e-07, + "loss": 3.0035, + "step": 94670 + }, + { + "epoch": 2.773910495861715, + "grad_norm": 19.075368881225586, + "learning_rate": 1.4966366075579442e-07, + "loss": 3.0047, + "step": 94680 + }, + { + "epoch": 2.7742034717644475, + "grad_norm": 18.82093620300293, + "learning_rate": 1.492798608018292e-07, + "loss": 3.0128, + "step": 94690 + }, + { + "epoch": 2.7744964476671794, + "grad_norm": 19.33072280883789, + "learning_rate": 1.4889654613363502e-07, + "loss": 3.0068, + "step": 94700 + }, + { + "epoch": 2.7747894235699113, + "grad_norm": 15.06930923461914, + "learning_rate": 1.485137167895606e-07, + "loss": 2.9896, + "step": 94710 + }, + { + "epoch": 2.7750823994726432, + "grad_norm": 21.71002960205078, + "learning_rate": 1.481313728079059e-07, + "loss": 2.9831, + "step": 94720 + }, + { + "epoch": 2.7750823994726432, + "eval_bleu": 0.3538667774275185, + "eval_cap_loss": 0.8991467952728271, + "eval_con_loss": 1.124716877937317, + "eval_loss": 3.148580551147461, + "step": 94720 + }, + { + "epoch": 2.7750823994726432, + "eval_bleu": 0.3538667774275185, + "eval_cap_loss": 0.8991467952728271, + "eval_con_loss": 1.124716877937317, + 
"eval_loss": 3.148580551147461, + "eval_runtime": 54.7509, + "eval_samples_per_second": 365.291, + "eval_steps_per_second": 0.365, + "step": 94720 + }, + { + "epoch": 2.7753753753753756, + "grad_norm": 19.20245361328125, + "learning_rate": 1.4774951422692253e-07, + "loss": 3.0178, + "step": 94730 + }, + { + "epoch": 2.7756683512781075, + "grad_norm": 21.003137588500977, + "learning_rate": 1.4736814108481323e-07, + "loss": 2.9727, + "step": 94740 + }, + { + "epoch": 2.7759613271808394, + "grad_norm": 15.702285766601562, + "learning_rate": 1.469872534197314e-07, + "loss": 2.9955, + "step": 94750 + }, + { + "epoch": 2.7762543030835714, + "grad_norm": 19.405363082885742, + "learning_rate": 1.4660685126978325e-07, + "loss": 2.9969, + "step": 94760 + }, + { + "epoch": 2.7765472789863033, + "grad_norm": 16.87163543701172, + "learning_rate": 1.4622693467302496e-07, + "loss": 3.0261, + "step": 94770 + }, + { + "epoch": 2.776840254889035, + "grad_norm": 18.70584487915039, + "learning_rate": 1.4584750366746558e-07, + "loss": 3.0008, + "step": 94780 + }, + { + "epoch": 2.777133230791767, + "grad_norm": 19.801794052124023, + "learning_rate": 1.4546855829106477e-07, + "loss": 2.9976, + "step": 94790 + }, + { + "epoch": 2.7774262066944995, + "grad_norm": 16.3852481842041, + "learning_rate": 1.4509009858173328e-07, + "loss": 2.9802, + "step": 94800 + }, + { + "epoch": 2.7777191825972314, + "grad_norm": 15.087662696838379, + "learning_rate": 1.4471212457733475e-07, + "loss": 3.0028, + "step": 94810 + }, + { + "epoch": 2.7780121584999633, + "grad_norm": 17.220355987548828, + "learning_rate": 1.4433463631568278e-07, + "loss": 2.9991, + "step": 94820 + }, + { + "epoch": 2.778305134402695, + "grad_norm": 13.342890739440918, + "learning_rate": 1.4395763383454275e-07, + "loss": 3.0014, + "step": 94830 + }, + { + "epoch": 2.7785981103054276, + "grad_norm": 18.582021713256836, + "learning_rate": 1.435811171716317e-07, + "loss": 3.0128, + "step": 94840 + }, + { + "epoch": 
2.7788910862081595, + "grad_norm": 16.76411247253418, + "learning_rate": 1.4320508636461728e-07, + "loss": 2.975, + "step": 94850 + }, + { + "epoch": 2.7791840621108914, + "grad_norm": 18.145545959472656, + "learning_rate": 1.4282954145111995e-07, + "loss": 2.9954, + "step": 94860 + }, + { + "epoch": 2.7794770380136233, + "grad_norm": 19.053621292114258, + "learning_rate": 1.424544824687102e-07, + "loss": 3.0051, + "step": 94870 + }, + { + "epoch": 2.7797700139163553, + "grad_norm": 15.441676139831543, + "learning_rate": 1.4207990945491024e-07, + "loss": 2.9853, + "step": 94880 + }, + { + "epoch": 2.780062989819087, + "grad_norm": 21.787321090698242, + "learning_rate": 1.4170582244719454e-07, + "loss": 2.9906, + "step": 94890 + }, + { + "epoch": 2.7803559657218195, + "grad_norm": 14.825602531433105, + "learning_rate": 1.4133222148298874e-07, + "loss": 3.0108, + "step": 94900 + }, + { + "epoch": 2.7806489416245515, + "grad_norm": 17.438980102539062, + "learning_rate": 1.4095910659966793e-07, + "loss": 3.0017, + "step": 94910 + }, + { + "epoch": 2.7809419175272834, + "grad_norm": 15.754881858825684, + "learning_rate": 1.4058647783456115e-07, + "loss": 3.0147, + "step": 94920 + }, + { + "epoch": 2.7812348934300153, + "grad_norm": 14.177969932556152, + "learning_rate": 1.4021433522494687e-07, + "loss": 2.992, + "step": 94930 + }, + { + "epoch": 2.7815278693327476, + "grad_norm": 20.15513038635254, + "learning_rate": 1.3984267880805703e-07, + "loss": 2.9888, + "step": 94940 + }, + { + "epoch": 2.7818208452354796, + "grad_norm": 19.707725524902344, + "learning_rate": 1.3947150862107239e-07, + "loss": 2.9998, + "step": 94950 + }, + { + "epoch": 2.7821138211382115, + "grad_norm": 16.119211196899414, + "learning_rate": 1.3910082470112662e-07, + "loss": 2.9968, + "step": 94960 + }, + { + "epoch": 2.7824067970409434, + "grad_norm": 18.496713638305664, + "learning_rate": 1.387306270853045e-07, + "loss": 2.9939, + "step": 94970 + }, + { + "epoch": 2.7826997729436753, + 
"grad_norm": 15.894161224365234, + "learning_rate": 1.3836091581064305e-07, + "loss": 3.0057, + "step": 94980 + }, + { + "epoch": 2.7829927488464072, + "grad_norm": 18.72605323791504, + "learning_rate": 1.3799169091412767e-07, + "loss": 2.9991, + "step": 94990 + }, + { + "epoch": 2.783285724749139, + "grad_norm": 17.854412078857422, + "learning_rate": 1.3762295243269942e-07, + "loss": 2.9815, + "step": 95000 + }, + { + "epoch": 2.7835787006518715, + "grad_norm": 13.977839469909668, + "learning_rate": 1.3725470040324652e-07, + "loss": 2.9971, + "step": 95010 + }, + { + "epoch": 2.7838716765546034, + "grad_norm": 19.98535919189453, + "learning_rate": 1.3688693486261173e-07, + "loss": 3.0082, + "step": 95020 + }, + { + "epoch": 2.7841646524573354, + "grad_norm": 22.660966873168945, + "learning_rate": 1.3651965584758675e-07, + "loss": 3.0117, + "step": 95030 + }, + { + "epoch": 2.7844576283600673, + "grad_norm": 19.27387237548828, + "learning_rate": 1.3615286339491606e-07, + "loss": 2.9988, + "step": 95040 + }, + { + "epoch": 2.7847506042627996, + "grad_norm": 21.192230224609375, + "learning_rate": 1.3578655754129532e-07, + "loss": 2.9842, + "step": 95050 + }, + { + "epoch": 2.7850435801655316, + "grad_norm": 18.98067283630371, + "learning_rate": 1.3542073832337188e-07, + "loss": 3.0193, + "step": 95060 + }, + { + "epoch": 2.7853365560682635, + "grad_norm": 15.241750717163086, + "learning_rate": 1.35055405777742e-07, + "loss": 2.9971, + "step": 95070 + }, + { + "epoch": 2.7856295319709954, + "grad_norm": 16.996444702148438, + "learning_rate": 1.3469055994095647e-07, + "loss": 3.004, + "step": 95080 + }, + { + "epoch": 2.7859225078737273, + "grad_norm": 19.844572067260742, + "learning_rate": 1.3432620084951554e-07, + "loss": 2.9948, + "step": 95090 + }, + { + "epoch": 2.7862154837764592, + "grad_norm": 18.9677734375, + "learning_rate": 1.3396232853987113e-07, + "loss": 2.9845, + "step": 95100 + }, + { + "epoch": 2.786508459679191, + "grad_norm": 16.627117156982422, + 
"learning_rate": 1.3359894304842692e-07, + "loss": 2.9974, + "step": 95110 + }, + { + "epoch": 2.7868014355819235, + "grad_norm": 14.583757400512695, + "learning_rate": 1.3323604441153603e-07, + "loss": 2.9881, + "step": 95120 + }, + { + "epoch": 2.7870944114846554, + "grad_norm": 18.018470764160156, + "learning_rate": 1.3287363266550613e-07, + "loss": 2.9937, + "step": 95130 + }, + { + "epoch": 2.7873873873873873, + "grad_norm": 20.387643814086914, + "learning_rate": 1.3251170784659316e-07, + "loss": 3.0162, + "step": 95140 + }, + { + "epoch": 2.7876803632901193, + "grad_norm": 21.422069549560547, + "learning_rate": 1.3215026999100655e-07, + "loss": 3.0161, + "step": 95150 + }, + { + "epoch": 2.7879733391928516, + "grad_norm": 20.213485717773438, + "learning_rate": 1.3178931913490512e-07, + "loss": 2.993, + "step": 95160 + }, + { + "epoch": 2.7882663150955835, + "grad_norm": 18.660573959350586, + "learning_rate": 1.3142885531440053e-07, + "loss": 3.0016, + "step": 95170 + }, + { + "epoch": 2.7885592909983155, + "grad_norm": 16.2274169921875, + "learning_rate": 1.310688785655545e-07, + "loss": 3.0056, + "step": 95180 + }, + { + "epoch": 2.7888522669010474, + "grad_norm": 18.94505500793457, + "learning_rate": 1.3070938892438102e-07, + "loss": 3.0292, + "step": 95190 + }, + { + "epoch": 2.7891452428037793, + "grad_norm": 20.036352157592773, + "learning_rate": 1.3035038642684462e-07, + "loss": 2.9916, + "step": 95200 + }, + { + "epoch": 2.789438218706511, + "grad_norm": 22.57684898376465, + "learning_rate": 1.2999187110886102e-07, + "loss": 3.0133, + "step": 95210 + }, + { + "epoch": 2.789731194609243, + "grad_norm": 18.5534610748291, + "learning_rate": 1.2963384300629766e-07, + "loss": 3.0072, + "step": 95220 + }, + { + "epoch": 2.7900241705119755, + "grad_norm": 17.976146697998047, + "learning_rate": 1.2927630215497422e-07, + "loss": 2.9863, + "step": 95230 + }, + { + "epoch": 2.790082765692522, + "eval_bleu": 0.35389339202439474, + "eval_cap_loss": 
0.8991236686706543, + "eval_con_loss": 1.1248091459274292, + "eval_loss": 3.1487419605255127, + "step": 95232 + }, + { + "epoch": 2.790082765692522, + "eval_bleu": 0.35389339202439474, + "eval_cap_loss": 0.8991236686706543, + "eval_con_loss": 1.1248091459274292, + "eval_loss": 3.1487419605255127, + "eval_runtime": 53.9381, + "eval_samples_per_second": 370.795, + "eval_steps_per_second": 0.371, + "step": 95232 + }, + { + "epoch": 2.7903171464147074, + "grad_norm": 15.107407569885254, + "learning_rate": 1.289192485906593e-07, + "loss": 3.0019, + "step": 95240 + }, + { + "epoch": 2.7906101223174393, + "grad_norm": 22.343544006347656, + "learning_rate": 1.2856268234907487e-07, + "loss": 3.012, + "step": 95250 + }, + { + "epoch": 2.7909030982201712, + "grad_norm": 18.843843460083008, + "learning_rate": 1.2820660346589241e-07, + "loss": 3.019, + "step": 95260 + }, + { + "epoch": 2.7911960741229036, + "grad_norm": 17.794170379638672, + "learning_rate": 1.2785101197673623e-07, + "loss": 2.993, + "step": 95270 + }, + { + "epoch": 2.7914890500256355, + "grad_norm": 14.4146089553833, + "learning_rate": 1.274959079171806e-07, + "loss": 2.9974, + "step": 95280 + }, + { + "epoch": 2.7917820259283674, + "grad_norm": 20.039047241210938, + "learning_rate": 1.2714129132275154e-07, + "loss": 3.0047, + "step": 95290 + }, + { + "epoch": 2.7920750018310994, + "grad_norm": 18.047739028930664, + "learning_rate": 1.2678716222892684e-07, + "loss": 3.0189, + "step": 95300 + }, + { + "epoch": 2.7923679777338313, + "grad_norm": 21.238664627075195, + "learning_rate": 1.2643352067113534e-07, + "loss": 2.9871, + "step": 95310 + }, + { + "epoch": 2.792660953636563, + "grad_norm": 22.547861099243164, + "learning_rate": 1.2608036668475542e-07, + "loss": 2.9897, + "step": 95320 + }, + { + "epoch": 2.7929539295392956, + "grad_norm": 18.804349899291992, + "learning_rate": 1.2572770030511937e-07, + "loss": 2.9877, + "step": 95330 + }, + { + "epoch": 2.7932469054420275, + "grad_norm": 16.44514274597168, 
+ "learning_rate": 1.2537552156750843e-07, + "loss": 3.0088, + "step": 95340 + }, + { + "epoch": 2.7935398813447594, + "grad_norm": 20.36206817626953, + "learning_rate": 1.250238305071566e-07, + "loss": 3.0017, + "step": 95350 + }, + { + "epoch": 2.7938328572474913, + "grad_norm": 17.836509704589844, + "learning_rate": 1.2467262715924798e-07, + "loss": 2.9948, + "step": 95360 + }, + { + "epoch": 2.7941258331502237, + "grad_norm": 19.71042251586914, + "learning_rate": 1.2432191155891894e-07, + "loss": 2.9875, + "step": 95370 + }, + { + "epoch": 2.7944188090529556, + "grad_norm": 21.473485946655273, + "learning_rate": 1.2397168374125635e-07, + "loss": 3.0036, + "step": 95380 + }, + { + "epoch": 2.7947117849556875, + "grad_norm": 17.24872398376465, + "learning_rate": 1.2362194374129833e-07, + "loss": 2.9917, + "step": 95390 + }, + { + "epoch": 2.7950047608584194, + "grad_norm": 14.355525970458984, + "learning_rate": 1.232726915940341e-07, + "loss": 2.9698, + "step": 95400 + }, + { + "epoch": 2.7952977367611513, + "grad_norm": 18.004150390625, + "learning_rate": 1.229239273344046e-07, + "loss": 2.9819, + "step": 95410 + }, + { + "epoch": 2.7955907126638833, + "grad_norm": 19.489667892456055, + "learning_rate": 1.225756509973014e-07, + "loss": 2.9827, + "step": 95420 + }, + { + "epoch": 2.795883688566615, + "grad_norm": 20.406286239624023, + "learning_rate": 1.2222786261756715e-07, + "loss": 3.0067, + "step": 95430 + }, + { + "epoch": 2.7961766644693475, + "grad_norm": 17.465024948120117, + "learning_rate": 1.2188056222999732e-07, + "loss": 2.9791, + "step": 95440 + }, + { + "epoch": 2.7964696403720795, + "grad_norm": 21.994871139526367, + "learning_rate": 1.215337498693353e-07, + "loss": 3.0058, + "step": 95450 + }, + { + "epoch": 2.7967626162748114, + "grad_norm": 20.720182418823242, + "learning_rate": 1.2118742557027885e-07, + "loss": 3.0146, + "step": 95460 + }, + { + "epoch": 2.7970555921775433, + "grad_norm": 19.079906463623047, + "learning_rate": 
1.208415893674758e-07, + "loss": 3.0133, + "step": 95470 + }, + { + "epoch": 2.7973485680802757, + "grad_norm": 18.210599899291992, + "learning_rate": 1.2049624129552519e-07, + "loss": 3.0028, + "step": 95480 + }, + { + "epoch": 2.7976415439830076, + "grad_norm": 19.09597396850586, + "learning_rate": 1.2015138138897543e-07, + "loss": 2.9994, + "step": 95490 + }, + { + "epoch": 2.7979345198857395, + "grad_norm": 18.01551055908203, + "learning_rate": 1.1980700968233005e-07, + "loss": 2.9883, + "step": 95500 + }, + { + "epoch": 2.7982274957884714, + "grad_norm": 18.4329891204834, + "learning_rate": 1.194631262100393e-07, + "loss": 3.0024, + "step": 95510 + }, + { + "epoch": 2.7985204716912033, + "grad_norm": 22.89197540283203, + "learning_rate": 1.1911973100650786e-07, + "loss": 3.0196, + "step": 95520 + }, + { + "epoch": 2.7988134475939352, + "grad_norm": 16.032922744750977, + "learning_rate": 1.1877682410608937e-07, + "loss": 2.9985, + "step": 95530 + }, + { + "epoch": 2.799106423496667, + "grad_norm": 17.019323348999023, + "learning_rate": 1.1843440554309082e-07, + "loss": 2.9915, + "step": 95540 + }, + { + "epoch": 2.7993993993993995, + "grad_norm": 12.443193435668945, + "learning_rate": 1.180924753517687e-07, + "loss": 3.0012, + "step": 95550 + }, + { + "epoch": 2.7996923753021314, + "grad_norm": 19.02220916748047, + "learning_rate": 1.1775103356633123e-07, + "loss": 2.9971, + "step": 95560 + }, + { + "epoch": 2.7999853512048634, + "grad_norm": 19.007736206054688, + "learning_rate": 1.1741008022093714e-07, + "loss": 3.0249, + "step": 95570 + }, + { + "epoch": 2.8002783271075953, + "grad_norm": 17.169254302978516, + "learning_rate": 1.1706961534969752e-07, + "loss": 2.9934, + "step": 95580 + }, + { + "epoch": 2.8005713030103276, + "grad_norm": 19.51718521118164, + "learning_rate": 1.1672963898667289e-07, + "loss": 3.0109, + "step": 95590 + }, + { + "epoch": 2.8008642789130596, + "grad_norm": 16.29317283630371, + "learning_rate": 1.1639015116587715e-07, + "loss": 
2.9888, + "step": 95600 + }, + { + "epoch": 2.8011572548157915, + "grad_norm": 18.563552856445312, + "learning_rate": 1.1605115192127203e-07, + "loss": 2.9783, + "step": 95610 + }, + { + "epoch": 2.8014502307185234, + "grad_norm": 15.26429271697998, + "learning_rate": 1.1571264128677484e-07, + "loss": 2.9994, + "step": 95620 + }, + { + "epoch": 2.8017432066212553, + "grad_norm": 18.552711486816406, + "learning_rate": 1.1537461929625015e-07, + "loss": 2.9844, + "step": 95630 + }, + { + "epoch": 2.8020361825239872, + "grad_norm": 17.107446670532227, + "learning_rate": 1.1503708598351537e-07, + "loss": 3.0161, + "step": 95640 + }, + { + "epoch": 2.802329158426719, + "grad_norm": 17.57796287536621, + "learning_rate": 1.1470004138233847e-07, + "loss": 3.0201, + "step": 95650 + }, + { + "epoch": 2.8026221343294515, + "grad_norm": 21.707672119140625, + "learning_rate": 1.1436348552643917e-07, + "loss": 2.9768, + "step": 95660 + }, + { + "epoch": 2.8029151102321834, + "grad_norm": 20.74460792541504, + "learning_rate": 1.1402741844948772e-07, + "loss": 2.984, + "step": 95670 + }, + { + "epoch": 2.8032080861349153, + "grad_norm": 17.98557472229004, + "learning_rate": 1.1369184018510559e-07, + "loss": 2.9894, + "step": 95680 + }, + { + "epoch": 2.8035010620376473, + "grad_norm": 19.990379333496094, + "learning_rate": 1.1335675076686425e-07, + "loss": 2.9833, + "step": 95690 + }, + { + "epoch": 2.8037940379403796, + "grad_norm": 19.296037673950195, + "learning_rate": 1.1302215022828967e-07, + "loss": 2.988, + "step": 95700 + }, + { + "epoch": 2.8040870138431115, + "grad_norm": 14.898625373840332, + "learning_rate": 1.1268803860285504e-07, + "loss": 2.979, + "step": 95710 + }, + { + "epoch": 2.8043799897458435, + "grad_norm": 18.723962783813477, + "learning_rate": 1.1235441592398754e-07, + "loss": 2.9889, + "step": 95720 + }, + { + "epoch": 2.8046729656485754, + "grad_norm": 16.79287338256836, + "learning_rate": 1.1202128222506214e-07, + "loss": 2.9796, + "step": 95730 + }, + { 
+ "epoch": 2.8049659415513073, + "grad_norm": 17.031816482543945, + "learning_rate": 1.116886375394094e-07, + "loss": 2.996, + "step": 95740 + }, + { + "epoch": 2.8050831319124003, + "eval_bleu": 0.35387853934230595, + "eval_cap_loss": 0.899121880531311, + "eval_con_loss": 1.1245965957641602, + "eval_loss": 3.148314952850342, + "step": 95744 + }, + { + "epoch": 2.8050831319124003, + "eval_bleu": 0.35387853934230595, + "eval_cap_loss": 0.899121880531311, + "eval_con_loss": 1.1245965957641602, + "eval_loss": 3.148314952850342, + "eval_runtime": 56.7306, + "eval_samples_per_second": 352.543, + "eval_steps_per_second": 0.353, + "step": 95744 + }, + { + "epoch": 2.805258917454039, + "grad_norm": 16.008310317993164, + "learning_rate": 1.1135648190030602e-07, + "loss": 3.0123, + "step": 95750 + }, + { + "epoch": 2.805551893356771, + "grad_norm": 15.504179954528809, + "learning_rate": 1.1102481534098375e-07, + "loss": 2.9889, + "step": 95760 + }, + { + "epoch": 2.8058448692595035, + "grad_norm": 17.794281005859375, + "learning_rate": 1.1069363789462273e-07, + "loss": 2.9971, + "step": 95770 + }, + { + "epoch": 2.8061378451622354, + "grad_norm": 20.814865112304688, + "learning_rate": 1.10362949594357e-07, + "loss": 3.0091, + "step": 95780 + }, + { + "epoch": 2.8064308210649673, + "grad_norm": 18.0063419342041, + "learning_rate": 1.1003275047326789e-07, + "loss": 2.9978, + "step": 95790 + }, + { + "epoch": 2.8067237969676992, + "grad_norm": 19.270023345947266, + "learning_rate": 1.0970304056439118e-07, + "loss": 2.994, + "step": 95800 + }, + { + "epoch": 2.8070167728704316, + "grad_norm": 19.231403350830078, + "learning_rate": 1.0937381990071272e-07, + "loss": 3.013, + "step": 95810 + }, + { + "epoch": 2.8073097487731635, + "grad_norm": 18.0327205657959, + "learning_rate": 1.0904508851516837e-07, + "loss": 2.9773, + "step": 95820 + }, + { + "epoch": 2.8076027246758954, + "grad_norm": 18.09341812133789, + "learning_rate": 1.0871684644064573e-07, + "loss": 3.0151, + "step": 
95830 + }, + { + "epoch": 2.8078957005786274, + "grad_norm": 20.15986442565918, + "learning_rate": 1.0838909370998352e-07, + "loss": 2.9835, + "step": 95840 + }, + { + "epoch": 2.8081886764813593, + "grad_norm": 15.824950218200684, + "learning_rate": 1.0806183035597162e-07, + "loss": 3.0085, + "step": 95850 + }, + { + "epoch": 2.808481652384091, + "grad_norm": 18.033084869384766, + "learning_rate": 1.0773505641135051e-07, + "loss": 2.997, + "step": 95860 + }, + { + "epoch": 2.8087746282868236, + "grad_norm": 19.809789657592773, + "learning_rate": 1.0740877190881238e-07, + "loss": 2.9807, + "step": 95870 + }, + { + "epoch": 2.8090676041895555, + "grad_norm": 23.112247467041016, + "learning_rate": 1.07082976881e-07, + "loss": 2.9998, + "step": 95880 + }, + { + "epoch": 2.8093605800922874, + "grad_norm": 18.15815544128418, + "learning_rate": 1.067576713605073e-07, + "loss": 3.014, + "step": 95890 + }, + { + "epoch": 2.8096535559950193, + "grad_norm": 16.964174270629883, + "learning_rate": 1.0643285537987824e-07, + "loss": 2.9931, + "step": 95900 + }, + { + "epoch": 2.8099465318977517, + "grad_norm": 24.000967025756836, + "learning_rate": 1.0610852897161017e-07, + "loss": 2.9941, + "step": 95910 + }, + { + "epoch": 2.8102395078004836, + "grad_norm": 16.9497013092041, + "learning_rate": 1.057846921681488e-07, + "loss": 2.9979, + "step": 95920 + }, + { + "epoch": 2.8105324837032155, + "grad_norm": 19.299442291259766, + "learning_rate": 1.0546134500189264e-07, + "loss": 2.979, + "step": 95930 + }, + { + "epoch": 2.8108254596059474, + "grad_norm": 18.451303482055664, + "learning_rate": 1.051384875051903e-07, + "loss": 3.0083, + "step": 95940 + }, + { + "epoch": 2.8111184355086793, + "grad_norm": 16.269861221313477, + "learning_rate": 1.0481611971034311e-07, + "loss": 2.9828, + "step": 95950 + }, + { + "epoch": 2.8114114114114113, + "grad_norm": 16.77858543395996, + "learning_rate": 1.0449424164960031e-07, + "loss": 3.0057, + "step": 95960 + }, + { + "epoch": 
2.811704387314143, + "grad_norm": 18.32619285583496, + "learning_rate": 1.0417285335516503e-07, + "loss": 2.9831, + "step": 95970 + }, + { + "epoch": 2.8119973632168755, + "grad_norm": 16.6810245513916, + "learning_rate": 1.0385195485918931e-07, + "loss": 2.9956, + "step": 95980 + }, + { + "epoch": 2.8122903391196075, + "grad_norm": 17.379676818847656, + "learning_rate": 1.0353154619377803e-07, + "loss": 2.9999, + "step": 95990 + }, + { + "epoch": 2.8125833150223394, + "grad_norm": 14.880467414855957, + "learning_rate": 1.0321162739098556e-07, + "loss": 3.0009, + "step": 96000 + }, + { + "epoch": 2.8128762909250713, + "grad_norm": 22.14545249938965, + "learning_rate": 1.0289219848281795e-07, + "loss": 2.9748, + "step": 96010 + }, + { + "epoch": 2.8131692668278037, + "grad_norm": 13.71142292022705, + "learning_rate": 1.0257325950123242e-07, + "loss": 2.9975, + "step": 96020 + }, + { + "epoch": 2.8134622427305356, + "grad_norm": 20.101539611816406, + "learning_rate": 1.0228663333140654e-07, + "loss": 2.9961, + "step": 96030 + }, + { + "epoch": 2.8137552186332675, + "grad_norm": 18.17886734008789, + "learning_rate": 1.0196862529819274e-07, + "loss": 3.0161, + "step": 96040 + }, + { + "epoch": 2.8140481945359994, + "grad_norm": 18.418088912963867, + "learning_rate": 1.0165110728395878e-07, + "loss": 2.9807, + "step": 96050 + }, + { + "epoch": 2.8143411704387313, + "grad_norm": 17.441696166992188, + "learning_rate": 1.0133407932047034e-07, + "loss": 2.9985, + "step": 96060 + }, + { + "epoch": 2.8146341463414632, + "grad_norm": 19.568811416625977, + "learning_rate": 1.0101754143944542e-07, + "loss": 2.9837, + "step": 96070 + }, + { + "epoch": 2.814927122244195, + "grad_norm": 22.937700271606445, + "learning_rate": 1.0070149367255034e-07, + "loss": 2.9813, + "step": 96080 + }, + { + "epoch": 2.8152200981469275, + "grad_norm": 21.41871452331543, + "learning_rate": 1.0038593605140479e-07, + "loss": 3.0068, + "step": 96090 + }, + { + "epoch": 2.8155130740496594, + 
"grad_norm": 17.364538192749023, + "learning_rate": 1.0007086860757742e-07, + "loss": 3.0041, + "step": 96100 + }, + { + "epoch": 2.8158060499523914, + "grad_norm": 17.845205307006836, + "learning_rate": 9.975629137258968e-08, + "loss": 3.0134, + "step": 96110 + }, + { + "epoch": 2.8160990258551233, + "grad_norm": 21.86840057373047, + "learning_rate": 9.944220437791197e-08, + "loss": 2.9886, + "step": 96120 + }, + { + "epoch": 2.8163920017578556, + "grad_norm": 20.42848777770996, + "learning_rate": 9.912860765496856e-08, + "loss": 3.0063, + "step": 96130 + }, + { + "epoch": 2.8166849776605876, + "grad_norm": 17.111907958984375, + "learning_rate": 9.881550123513161e-08, + "loss": 3.0025, + "step": 96140 + }, + { + "epoch": 2.8169779535633195, + "grad_norm": 18.420312881469727, + "learning_rate": 9.850288514972606e-08, + "loss": 2.9775, + "step": 96150 + }, + { + "epoch": 2.8172709294660514, + "grad_norm": 18.84491539001465, + "learning_rate": 9.819075943002743e-08, + "loss": 2.9884, + "step": 96160 + }, + { + "epoch": 2.8175639053687833, + "grad_norm": 19.01876449584961, + "learning_rate": 9.787912410726186e-08, + "loss": 2.9747, + "step": 96170 + }, + { + "epoch": 2.8178568812715152, + "grad_norm": 17.657346725463867, + "learning_rate": 9.756797921260664e-08, + "loss": 3.0047, + "step": 96180 + }, + { + "epoch": 2.818149857174247, + "grad_norm": 18.56519889831543, + "learning_rate": 9.725732477719019e-08, + "loss": 3.0081, + "step": 96190 + }, + { + "epoch": 2.8184428330769795, + "grad_norm": 20.010488510131836, + "learning_rate": 9.694716083209155e-08, + "loss": 3.0041, + "step": 96200 + }, + { + "epoch": 2.8187358089797114, + "grad_norm": 23.2574405670166, + "learning_rate": 9.663748740833977e-08, + "loss": 3.0039, + "step": 96210 + }, + { + "epoch": 2.8190287848824434, + "grad_norm": 18.70757293701172, + "learning_rate": 9.632830453691733e-08, + "loss": 2.9826, + "step": 96220 + }, + { + "epoch": 2.8193217607851753, + "grad_norm": 18.26752471923828, + 
"learning_rate": 9.6019612248755e-08, + "loss": 2.9929, + "step": 96230 + }, + { + "epoch": 2.8196147366879076, + "grad_norm": 22.154916763305664, + "learning_rate": 9.571141057473698e-08, + "loss": 2.9876, + "step": 96240 + }, + { + "epoch": 2.8199077125906395, + "grad_norm": 19.027196884155273, + "learning_rate": 9.540369954569583e-08, + "loss": 3.0055, + "step": 96250 + }, + { + "epoch": 2.820083498132279, + "eval_bleu": 0.35388861158848134, + "eval_cap_loss": 0.8990879058837891, + "eval_con_loss": 1.1248058080673218, + "eval_loss": 3.1486995220184326, + "step": 96256 + }, + { + "epoch": 2.820083498132279, + "eval_bleu": 0.35388861158848134, + "eval_cap_loss": 0.8990879058837891, + "eval_con_loss": 1.1248058080673218, + "eval_loss": 3.1486995220184326, + "eval_runtime": 53.8284, + "eval_samples_per_second": 371.551, + "eval_steps_per_second": 0.372, + "step": 96256 + }, + { + "epoch": 2.8202006884933715, + "grad_norm": 16.618881225585938, + "learning_rate": 9.509647919241694e-08, + "loss": 2.9911, + "step": 96260 + }, + { + "epoch": 2.8204936643961034, + "grad_norm": 18.94037628173828, + "learning_rate": 9.478974954563458e-08, + "loss": 3.0044, + "step": 96270 + }, + { + "epoch": 2.8207866402988353, + "grad_norm": 20.75229263305664, + "learning_rate": 9.4483510636037e-08, + "loss": 2.9946, + "step": 96280 + }, + { + "epoch": 2.821079616201567, + "grad_norm": 21.128515243530273, + "learning_rate": 9.417776249426025e-08, + "loss": 2.9791, + "step": 96290 + }, + { + "epoch": 2.8213725921042996, + "grad_norm": 16.22479820251465, + "learning_rate": 9.387250515089319e-08, + "loss": 3.0151, + "step": 96300 + }, + { + "epoch": 2.8216655680070315, + "grad_norm": 16.86743927001953, + "learning_rate": 9.356773863647528e-08, + "loss": 2.9896, + "step": 96310 + }, + { + "epoch": 2.8219585439097634, + "grad_norm": 17.818178176879883, + "learning_rate": 9.326346298149603e-08, + "loss": 2.9986, + "step": 96320 + }, + { + "epoch": 2.8222515198124953, + "grad_norm": 
17.692665100097656, + "learning_rate": 9.295967821639662e-08, + "loss": 3.0001, + "step": 96330 + }, + { + "epoch": 2.8225444957152277, + "grad_norm": 19.68508529663086, + "learning_rate": 9.265638437156888e-08, + "loss": 3.0031, + "step": 96340 + }, + { + "epoch": 2.8228374716179596, + "grad_norm": 17.2794132232666, + "learning_rate": 9.235358147735574e-08, + "loss": 3.009, + "step": 96350 + }, + { + "epoch": 2.8231304475206915, + "grad_norm": 19.41590118408203, + "learning_rate": 9.205126956405075e-08, + "loss": 3.0043, + "step": 96360 + }, + { + "epoch": 2.8234234234234235, + "grad_norm": 22.619462966918945, + "learning_rate": 9.174944866189806e-08, + "loss": 3.0018, + "step": 96370 + }, + { + "epoch": 2.8237163993261554, + "grad_norm": 15.230626106262207, + "learning_rate": 9.14481188010935e-08, + "loss": 3.0122, + "step": 96380 + }, + { + "epoch": 2.8240093752288873, + "grad_norm": 16.64894676208496, + "learning_rate": 9.114728001178297e-08, + "loss": 2.9891, + "step": 96390 + }, + { + "epoch": 2.824302351131619, + "grad_norm": 19.14910125732422, + "learning_rate": 9.084693232406461e-08, + "loss": 2.9881, + "step": 96400 + }, + { + "epoch": 2.8245953270343516, + "grad_norm": 19.068246841430664, + "learning_rate": 9.054707576798494e-08, + "loss": 2.9984, + "step": 96410 + }, + { + "epoch": 2.8248883029370835, + "grad_norm": 17.609750747680664, + "learning_rate": 9.024771037354441e-08, + "loss": 2.9922, + "step": 96420 + }, + { + "epoch": 2.8251812788398154, + "grad_norm": 15.244477272033691, + "learning_rate": 8.994883617069073e-08, + "loss": 2.9897, + "step": 96430 + }, + { + "epoch": 2.8254742547425473, + "grad_norm": 17.966197967529297, + "learning_rate": 8.965045318932663e-08, + "loss": 2.982, + "step": 96440 + }, + { + "epoch": 2.8257672306452797, + "grad_norm": 22.23626708984375, + "learning_rate": 8.935256145930216e-08, + "loss": 2.999, + "step": 96450 + }, + { + "epoch": 2.8260602065480116, + "grad_norm": 18.98790740966797, + "learning_rate": 
8.905516101041955e-08, + "loss": 2.9868, + "step": 96460 + }, + { + "epoch": 2.8263531824507435, + "grad_norm": 19.018238067626953, + "learning_rate": 8.875825187243281e-08, + "loss": 3.0014, + "step": 96470 + }, + { + "epoch": 2.8266461583534754, + "grad_norm": 19.042871475219727, + "learning_rate": 8.846183407504594e-08, + "loss": 3.0037, + "step": 96480 + }, + { + "epoch": 2.8269391342562074, + "grad_norm": 18.912349700927734, + "learning_rate": 8.816590764791356e-08, + "loss": 2.9975, + "step": 96490 + }, + { + "epoch": 2.8272321101589393, + "grad_norm": 17.87407875061035, + "learning_rate": 8.787047262064031e-08, + "loss": 2.9879, + "step": 96500 + }, + { + "epoch": 2.827525086061671, + "grad_norm": 16.442264556884766, + "learning_rate": 8.757552902278421e-08, + "loss": 2.993, + "step": 96510 + }, + { + "epoch": 2.8278180619644036, + "grad_norm": 21.24555206298828, + "learning_rate": 8.728107688385224e-08, + "loss": 3.0014, + "step": 96520 + }, + { + "epoch": 2.8281110378671355, + "grad_norm": 17.63591957092285, + "learning_rate": 8.698711623330192e-08, + "loss": 2.998, + "step": 96530 + }, + { + "epoch": 2.8284040137698674, + "grad_norm": 15.18093490600586, + "learning_rate": 8.669364710054306e-08, + "loss": 2.9991, + "step": 96540 + }, + { + "epoch": 2.8286969896725993, + "grad_norm": 22.3137149810791, + "learning_rate": 8.640066951493442e-08, + "loss": 2.987, + "step": 96550 + }, + { + "epoch": 2.8289899655753317, + "grad_norm": 19.491954803466797, + "learning_rate": 8.610818350578809e-08, + "loss": 2.9928, + "step": 96560 + }, + { + "epoch": 2.8292829414780636, + "grad_norm": 17.837207794189453, + "learning_rate": 8.581618910236511e-08, + "loss": 3.0046, + "step": 96570 + }, + { + "epoch": 2.8295759173807955, + "grad_norm": 18.70038604736328, + "learning_rate": 8.552468633387656e-08, + "loss": 3.0118, + "step": 96580 + }, + { + "epoch": 2.8298688932835274, + "grad_norm": 15.130945205688477, + "learning_rate": 8.523367522948745e-08, + "loss": 3.003, + 
"step": 96590 + }, + { + "epoch": 2.8301618691862593, + "grad_norm": 18.281587600708008, + "learning_rate": 8.494315581831059e-08, + "loss": 2.9877, + "step": 96600 + }, + { + "epoch": 2.8304548450889913, + "grad_norm": 17.801538467407227, + "learning_rate": 8.465312812941107e-08, + "loss": 3.0099, + "step": 96610 + }, + { + "epoch": 2.830747820991723, + "grad_norm": 20.672260284423828, + "learning_rate": 8.436359219180401e-08, + "loss": 3.0124, + "step": 96620 + }, + { + "epoch": 2.8310407968944555, + "grad_norm": 19.622957229614258, + "learning_rate": 8.407454803445625e-08, + "loss": 2.9854, + "step": 96630 + }, + { + "epoch": 2.8313337727971875, + "grad_norm": 18.432167053222656, + "learning_rate": 8.378599568628465e-08, + "loss": 2.997, + "step": 96640 + }, + { + "epoch": 2.8316267486999194, + "grad_norm": 21.087480545043945, + "learning_rate": 8.349793517615723e-08, + "loss": 3.0263, + "step": 96650 + }, + { + "epoch": 2.8319197246026513, + "grad_norm": 18.11351776123047, + "learning_rate": 8.321036653289316e-08, + "loss": 3.0011, + "step": 96660 + }, + { + "epoch": 2.8322127005053837, + "grad_norm": 17.599586486816406, + "learning_rate": 8.29232897852611e-08, + "loss": 3.0037, + "step": 96670 + }, + { + "epoch": 2.8325056764081156, + "grad_norm": 18.77383041381836, + "learning_rate": 8.26367049619825e-08, + "loss": 3.0062, + "step": 96680 + }, + { + "epoch": 2.8327986523108475, + "grad_norm": 18.405412673950195, + "learning_rate": 8.235061209172724e-08, + "loss": 3.0104, + "step": 96690 + }, + { + "epoch": 2.8330916282135794, + "grad_norm": 15.817266464233398, + "learning_rate": 8.206501120311795e-08, + "loss": 3.0126, + "step": 96700 + }, + { + "epoch": 2.8333846041163113, + "grad_norm": 14.865458488464355, + "learning_rate": 8.177990232472732e-08, + "loss": 3.0186, + "step": 96710 + }, + { + "epoch": 2.8336775800190432, + "grad_norm": 22.124481201171875, + "learning_rate": 8.149528548507867e-08, + "loss": 2.9845, + "step": 96720 + }, + { + "epoch": 
2.833970555921775, + "grad_norm": 17.681941986083984, + "learning_rate": 8.121116071264646e-08, + "loss": 3.0098, + "step": 96730 + }, + { + "epoch": 2.8342635318245075, + "grad_norm": 20.557212829589844, + "learning_rate": 8.092752803585513e-08, + "loss": 3.0133, + "step": 96740 + }, + { + "epoch": 2.8345565077272394, + "grad_norm": 20.10780906677246, + "learning_rate": 8.06443874830809e-08, + "loss": 3.0034, + "step": 96750 + }, + { + "epoch": 2.8348494836299714, + "grad_norm": 17.87681007385254, + "learning_rate": 8.036173908264999e-08, + "loss": 3.0129, + "step": 96760 + }, + { + "epoch": 2.835083864352157, + "eval_bleu": 0.35390934782604966, + "eval_cap_loss": 0.8989832401275635, + "eval_con_loss": 1.1245039701461792, + "eval_loss": 3.1479909420013428, + "step": 96768 + }, + { + "epoch": 2.835083864352157, + "eval_bleu": 0.35390934782604966, + "eval_cap_loss": 0.8989832401275635, + "eval_con_loss": 1.1245039701461792, + "eval_loss": 3.1479909420013428, + "eval_runtime": 57.4327, + "eval_samples_per_second": 348.233, + "eval_steps_per_second": 0.348, + "step": 96768 + }, + { + "epoch": 2.8351424595327033, + "grad_norm": 18.067014694213867, + "learning_rate": 8.007958286284035e-08, + "loss": 2.9826, + "step": 96770 + }, + { + "epoch": 2.8354354354354356, + "grad_norm": 16.85359764099121, + "learning_rate": 7.979791885187881e-08, + "loss": 2.9863, + "step": 96780 + }, + { + "epoch": 2.8357284113381676, + "grad_norm": 14.72361946105957, + "learning_rate": 7.95167470779451e-08, + "loss": 3.0133, + "step": 96790 + }, + { + "epoch": 2.8360213872408995, + "grad_norm": 17.46025848388672, + "learning_rate": 7.923606756916836e-08, + "loss": 2.9979, + "step": 96800 + }, + { + "epoch": 2.8363143631436314, + "grad_norm": 15.676977157592773, + "learning_rate": 7.895588035362999e-08, + "loss": 3.0029, + "step": 96810 + }, + { + "epoch": 2.8366073390463633, + "grad_norm": 18.643529891967773, + "learning_rate": 7.867618545935984e-08, + "loss": 2.9977, + "step": 96820 + }, + { + 
"epoch": 2.8369003149490952, + "grad_norm": 23.607952117919922, + "learning_rate": 7.83969829143405e-08, + "loss": 2.9987, + "step": 96830 + }, + { + "epoch": 2.8371932908518276, + "grad_norm": 16.732269287109375, + "learning_rate": 7.811827274650352e-08, + "loss": 2.9996, + "step": 96840 + }, + { + "epoch": 2.8374862667545595, + "grad_norm": 18.32098388671875, + "learning_rate": 7.784005498373271e-08, + "loss": 3.0094, + "step": 96850 + }, + { + "epoch": 2.8377792426572914, + "grad_norm": 18.498531341552734, + "learning_rate": 7.756232965386301e-08, + "loss": 2.9956, + "step": 96860 + }, + { + "epoch": 2.8380722185600233, + "grad_norm": 18.388771057128906, + "learning_rate": 7.728509678467777e-08, + "loss": 2.9859, + "step": 96870 + }, + { + "epoch": 2.8383651944627557, + "grad_norm": 18.611364364624023, + "learning_rate": 7.700835640391313e-08, + "loss": 2.9959, + "step": 96880 + }, + { + "epoch": 2.8386581703654876, + "grad_norm": 18.147571563720703, + "learning_rate": 7.673210853925528e-08, + "loss": 3.0049, + "step": 96890 + }, + { + "epoch": 2.8389511462682195, + "grad_norm": 18.154943466186523, + "learning_rate": 7.645635321834211e-08, + "loss": 3.0, + "step": 96900 + }, + { + "epoch": 2.8392441221709515, + "grad_norm": 16.0788631439209, + "learning_rate": 7.618109046875932e-08, + "loss": 3.0185, + "step": 96910 + }, + { + "epoch": 2.8395370980736834, + "grad_norm": 18.74028968811035, + "learning_rate": 7.590632031804712e-08, + "loss": 2.9956, + "step": 96920 + }, + { + "epoch": 2.8398300739764153, + "grad_norm": 19.71653938293457, + "learning_rate": 7.563204279369351e-08, + "loss": 3.0184, + "step": 96930 + }, + { + "epoch": 2.840123049879147, + "grad_norm": 19.711214065551758, + "learning_rate": 7.535825792313933e-08, + "loss": 2.9849, + "step": 96940 + }, + { + "epoch": 2.8404160257818796, + "grad_norm": 19.994888305664062, + "learning_rate": 7.508496573377377e-08, + "loss": 2.9899, + "step": 96950 + }, + { + "epoch": 2.8407090016846115, + "grad_norm": 
20.891551971435547, + "learning_rate": 7.48121662529394e-08, + "loss": 3.0173, + "step": 96960 + }, + { + "epoch": 2.8410019775873434, + "grad_norm": 14.017349243164062, + "learning_rate": 7.45398595079272e-08, + "loss": 2.9773, + "step": 96970 + }, + { + "epoch": 2.8412949534900753, + "grad_norm": 18.136030197143555, + "learning_rate": 7.426804552598088e-08, + "loss": 2.9937, + "step": 96980 + }, + { + "epoch": 2.8415879293928077, + "grad_norm": 18.82659912109375, + "learning_rate": 7.399672433429262e-08, + "loss": 3.0042, + "step": 96990 + }, + { + "epoch": 2.8418809052955396, + "grad_norm": 15.22811508178711, + "learning_rate": 7.37258959600079e-08, + "loss": 2.9916, + "step": 97000 + }, + { + "epoch": 2.8421738811982715, + "grad_norm": 17.45804786682129, + "learning_rate": 7.345556043022006e-08, + "loss": 2.9921, + "step": 97010 + }, + { + "epoch": 2.8424668571010034, + "grad_norm": 18.01175308227539, + "learning_rate": 7.318571777197525e-08, + "loss": 2.9881, + "step": 97020 + }, + { + "epoch": 2.8427598330037354, + "grad_norm": 15.47770881652832, + "learning_rate": 7.291636801226964e-08, + "loss": 2.9872, + "step": 97030 + }, + { + "epoch": 2.8430528089064673, + "grad_norm": 16.637371063232422, + "learning_rate": 7.264751117805002e-08, + "loss": 3.0155, + "step": 97040 + }, + { + "epoch": 2.843345784809199, + "grad_norm": 14.628586769104004, + "learning_rate": 7.237914729621375e-08, + "loss": 2.9873, + "step": 97050 + }, + { + "epoch": 2.8436387607119316, + "grad_norm": 18.28457260131836, + "learning_rate": 7.211127639360994e-08, + "loss": 2.9887, + "step": 97060 + }, + { + "epoch": 2.8439317366146635, + "grad_norm": 20.90717887878418, + "learning_rate": 7.184389849703599e-08, + "loss": 2.9798, + "step": 97070 + }, + { + "epoch": 2.8442247125173954, + "grad_norm": 21.444046020507812, + "learning_rate": 7.157701363324276e-08, + "loss": 3.0052, + "step": 97080 + }, + { + "epoch": 2.8445176884201273, + "grad_norm": 21.161426544189453, + "learning_rate": 
7.131062182892946e-08, + "loss": 2.9875, + "step": 97090 + }, + { + "epoch": 2.8448106643228597, + "grad_norm": 19.737743377685547, + "learning_rate": 7.104472311074806e-08, + "loss": 3.012, + "step": 97100 + }, + { + "epoch": 2.8451036402255916, + "grad_norm": 17.332386016845703, + "learning_rate": 7.077931750529954e-08, + "loss": 2.9869, + "step": 97110 + }, + { + "epoch": 2.8453966161283235, + "grad_norm": 21.258100509643555, + "learning_rate": 7.051440503913598e-08, + "loss": 2.9959, + "step": 97120 + }, + { + "epoch": 2.8456895920310554, + "grad_norm": 19.084827423095703, + "learning_rate": 7.024998573876063e-08, + "loss": 3.0026, + "step": 97130 + }, + { + "epoch": 2.8459825679337873, + "grad_norm": 18.354110717773438, + "learning_rate": 6.998605963062732e-08, + "loss": 3.0076, + "step": 97140 + }, + { + "epoch": 2.8462755438365193, + "grad_norm": 20.976165771484375, + "learning_rate": 6.972262674113939e-08, + "loss": 3.005, + "step": 97150 + }, + { + "epoch": 2.846568519739251, + "grad_norm": 17.116762161254883, + "learning_rate": 6.945968709665296e-08, + "loss": 3.0121, + "step": 97160 + }, + { + "epoch": 2.8468614956419835, + "grad_norm": 16.65795135498047, + "learning_rate": 6.919724072347256e-08, + "loss": 2.9903, + "step": 97170 + }, + { + "epoch": 2.8471544715447155, + "grad_norm": 20.00611114501953, + "learning_rate": 6.893528764785496e-08, + "loss": 3.003, + "step": 97180 + }, + { + "epoch": 2.8474474474474474, + "grad_norm": 19.077770233154297, + "learning_rate": 6.867382789600641e-08, + "loss": 2.9855, + "step": 97190 + }, + { + "epoch": 2.8477404233501793, + "grad_norm": 17.637582778930664, + "learning_rate": 6.841286149408488e-08, + "loss": 3.0142, + "step": 97200 + }, + { + "epoch": 2.8480333992529117, + "grad_norm": 20.094467163085938, + "learning_rate": 6.815238846819838e-08, + "loss": 3.0194, + "step": 97210 + }, + { + "epoch": 2.8483263751556436, + "grad_norm": 15.195802688598633, + "learning_rate": 6.789240884440607e-08, + "loss": 3.0115, + 
"step": 97220 + }, + { + "epoch": 2.8486193510583755, + "grad_norm": 15.854584693908691, + "learning_rate": 6.763292264871713e-08, + "loss": 2.9901, + "step": 97230 + }, + { + "epoch": 2.8489123269611074, + "grad_norm": 19.35702896118164, + "learning_rate": 6.737392990709135e-08, + "loss": 2.9978, + "step": 97240 + }, + { + "epoch": 2.8492053028638393, + "grad_norm": 17.146669387817383, + "learning_rate": 6.711543064544024e-08, + "loss": 2.989, + "step": 97250 + }, + { + "epoch": 2.8494982787665712, + "grad_norm": 18.077468872070312, + "learning_rate": 6.685742488962366e-08, + "loss": 3.0005, + "step": 97260 + }, + { + "epoch": 2.8497912546693036, + "grad_norm": 19.59822654724121, + "learning_rate": 6.659991266545485e-08, + "loss": 2.9955, + "step": 97270 + }, + { + "epoch": 2.8500842305720355, + "grad_norm": 15.775928497314453, + "learning_rate": 6.634289399869542e-08, + "loss": 3.0093, + "step": 97280 + }, + { + "epoch": 2.8500842305720355, + "eval_bleu": 0.35400484745450517, + "eval_cap_loss": 0.8990264534950256, + "eval_con_loss": 1.124382734298706, + "eval_loss": 3.147792339324951, + "step": 97280 + }, + { + "epoch": 2.8500842305720355, + "eval_bleu": 0.35400484745450517, + "eval_cap_loss": 0.8990264534950256, + "eval_con_loss": 1.124382734298706, + "eval_loss": 3.147792339324951, + "eval_runtime": 56.2034, + "eval_samples_per_second": 355.85, + "eval_steps_per_second": 0.356, + "step": 97280 + }, + { + "epoch": 2.8503772064747674, + "grad_norm": 16.463924407958984, + "learning_rate": 6.608636891505982e-08, + "loss": 2.9858, + "step": 97290 + }, + { + "epoch": 2.8506701823774994, + "grad_norm": 15.66458797454834, + "learning_rate": 6.583033744021028e-08, + "loss": 2.9803, + "step": 97300 + }, + { + "epoch": 2.8509631582802317, + "grad_norm": 20.377405166625977, + "learning_rate": 6.557479959976243e-08, + "loss": 3.0109, + "step": 97310 + }, + { + "epoch": 2.8512561341829636, + "grad_norm": 15.68867015838623, + "learning_rate": 6.531975541928082e-08, + "loss": 
2.9877, + "step": 97320 + }, + { + "epoch": 2.8515491100856956, + "grad_norm": 16.69121551513672, + "learning_rate": 6.50652049242817e-08, + "loss": 3.0102, + "step": 97330 + }, + { + "epoch": 2.8518420859884275, + "grad_norm": 23.56432342529297, + "learning_rate": 6.481114814023027e-08, + "loss": 2.9997, + "step": 97340 + }, + { + "epoch": 2.8521350618911594, + "grad_norm": 20.736772537231445, + "learning_rate": 6.455758509254451e-08, + "loss": 3.03, + "step": 97350 + }, + { + "epoch": 2.8524280377938913, + "grad_norm": 20.44320297241211, + "learning_rate": 6.430451580659137e-08, + "loss": 3.0026, + "step": 97360 + }, + { + "epoch": 2.8527210136966232, + "grad_norm": 17.74087142944336, + "learning_rate": 6.405194030768835e-08, + "loss": 2.9992, + "step": 97370 + }, + { + "epoch": 2.8530139895993556, + "grad_norm": 17.600942611694336, + "learning_rate": 6.379985862110471e-08, + "loss": 3.0025, + "step": 97380 + }, + { + "epoch": 2.8533069655020875, + "grad_norm": 17.929332733154297, + "learning_rate": 6.354827077206082e-08, + "loss": 2.9857, + "step": 97390 + }, + { + "epoch": 2.8535999414048194, + "grad_norm": 14.740304946899414, + "learning_rate": 6.329717678572434e-08, + "loss": 3.0016, + "step": 97400 + }, + { + "epoch": 2.8538929173075513, + "grad_norm": 20.554941177368164, + "learning_rate": 6.304657668721737e-08, + "loss": 3.01, + "step": 97410 + }, + { + "epoch": 2.8541858932102837, + "grad_norm": 17.5187931060791, + "learning_rate": 6.279647050161042e-08, + "loss": 2.9919, + "step": 97420 + }, + { + "epoch": 2.8544788691130156, + "grad_norm": 19.352754592895508, + "learning_rate": 6.254685825392515e-08, + "loss": 3.0222, + "step": 97430 + }, + { + "epoch": 2.8547718450157475, + "grad_norm": 22.618656158447266, + "learning_rate": 6.229773996913379e-08, + "loss": 2.9971, + "step": 97440 + }, + { + "epoch": 2.8550648209184795, + "grad_norm": 20.02587890625, + "learning_rate": 6.20491156721592e-08, + "loss": 2.9862, + "step": 97450 + }, + { + "epoch": 
2.8553577968212114, + "grad_norm": 18.70216941833496, + "learning_rate": 6.180098538787426e-08, + "loss": 2.9824, + "step": 97460 + }, + { + "epoch": 2.8556507727239433, + "grad_norm": 19.194833755493164, + "learning_rate": 6.155334914110411e-08, + "loss": 3.0188, + "step": 97470 + }, + { + "epoch": 2.855943748626675, + "grad_norm": 19.946834564208984, + "learning_rate": 6.130620695662225e-08, + "loss": 2.9931, + "step": 97480 + }, + { + "epoch": 2.8562367245294076, + "grad_norm": 20.497146606445312, + "learning_rate": 6.105955885915448e-08, + "loss": 2.997, + "step": 97490 + }, + { + "epoch": 2.8565297004321395, + "grad_norm": 19.771488189697266, + "learning_rate": 6.08134048733755e-08, + "loss": 3.0036, + "step": 97500 + }, + { + "epoch": 2.8568226763348714, + "grad_norm": 17.744043350219727, + "learning_rate": 6.056774502391283e-08, + "loss": 3.0254, + "step": 97510 + }, + { + "epoch": 2.8571156522376033, + "grad_norm": 19.00957679748535, + "learning_rate": 6.03225793353418e-08, + "loss": 2.9922, + "step": 97520 + }, + { + "epoch": 2.8574086281403357, + "grad_norm": 20.74540901184082, + "learning_rate": 6.007790783219114e-08, + "loss": 3.0006, + "step": 97530 + }, + { + "epoch": 2.8577016040430676, + "grad_norm": 18.901531219482422, + "learning_rate": 5.983373053893793e-08, + "loss": 3.0101, + "step": 97540 + }, + { + "epoch": 2.8579945799457995, + "grad_norm": 15.716984748840332, + "learning_rate": 5.959004748001096e-08, + "loss": 2.9968, + "step": 97550 + }, + { + "epoch": 2.8582875558485314, + "grad_norm": 19.68179702758789, + "learning_rate": 5.9346858679789623e-08, + "loss": 2.9861, + "step": 97560 + }, + { + "epoch": 2.8585805317512634, + "grad_norm": 17.55038070678711, + "learning_rate": 5.910416416260278e-08, + "loss": 3.0019, + "step": 97570 + }, + { + "epoch": 2.8588735076539953, + "grad_norm": 19.75676918029785, + "learning_rate": 5.886196395273158e-08, + "loss": 2.9985, + "step": 97580 + }, + { + "epoch": 2.859166483556727, + "grad_norm": 
16.177873611450195, + "learning_rate": 5.8620258074406075e-08, + "loss": 3.0014, + "step": 97590 + }, + { + "epoch": 2.8594594594594596, + "grad_norm": 17.270448684692383, + "learning_rate": 5.8379046551807486e-08, + "loss": 3.0098, + "step": 97600 + }, + { + "epoch": 2.8597524353621915, + "grad_norm": 18.01717185974121, + "learning_rate": 5.8138329409067054e-08, + "loss": 2.9808, + "step": 97610 + }, + { + "epoch": 2.8600454112649234, + "grad_norm": 15.365814208984375, + "learning_rate": 5.7898106670268296e-08, + "loss": 3.0084, + "step": 97620 + }, + { + "epoch": 2.8603383871676553, + "grad_norm": 18.603015899658203, + "learning_rate": 5.7658378359443104e-08, + "loss": 2.9841, + "step": 97630 + }, + { + "epoch": 2.8606313630703877, + "grad_norm": 19.094331741333008, + "learning_rate": 5.741914450057562e-08, + "loss": 2.9987, + "step": 97640 + }, + { + "epoch": 2.8609243389731196, + "grad_norm": 20.268007278442383, + "learning_rate": 5.7180405117598924e-08, + "loss": 2.9957, + "step": 97650 + }, + { + "epoch": 2.8612173148758515, + "grad_norm": 20.882713317871094, + "learning_rate": 5.694216023439836e-08, + "loss": 2.9975, + "step": 97660 + }, + { + "epoch": 2.8615102907785834, + "grad_norm": 21.55385971069336, + "learning_rate": 5.670440987480819e-08, + "loss": 2.9893, + "step": 97670 + }, + { + "epoch": 2.8618032666813153, + "grad_norm": 16.602869033813477, + "learning_rate": 5.646715406261494e-08, + "loss": 2.9831, + "step": 97680 + }, + { + "epoch": 2.8620962425840473, + "grad_norm": 19.41594123840332, + "learning_rate": 5.623039282155296e-08, + "loss": 2.9937, + "step": 97690 + }, + { + "epoch": 2.862389218486779, + "grad_norm": 19.693973541259766, + "learning_rate": 5.599412617531053e-08, + "loss": 2.9889, + "step": 97700 + }, + { + "epoch": 2.8626821943895115, + "grad_norm": 17.124711990356445, + "learning_rate": 5.575835414752318e-08, + "loss": 3.0021, + "step": 97710 + }, + { + "epoch": 2.8629751702922435, + "grad_norm": 19.20081329345703, + 
"learning_rate": 5.552307676178037e-08, + "loss": 2.9853, + "step": 97720 + }, + { + "epoch": 2.8632681461949754, + "grad_norm": 18.684688568115234, + "learning_rate": 5.528829404161828e-08, + "loss": 2.9679, + "step": 97730 + }, + { + "epoch": 2.8635611220977073, + "grad_norm": 19.360200881958008, + "learning_rate": 5.505400601052702e-08, + "loss": 2.9854, + "step": 97740 + }, + { + "epoch": 2.8638540980004397, + "grad_norm": 19.758943557739258, + "learning_rate": 5.482021269194504e-08, + "loss": 2.9984, + "step": 97750 + }, + { + "epoch": 2.8641470739031716, + "grad_norm": 14.3231782913208, + "learning_rate": 5.458691410926198e-08, + "loss": 2.9875, + "step": 97760 + }, + { + "epoch": 2.8644400498059035, + "grad_norm": 16.3438663482666, + "learning_rate": 5.43541102858175e-08, + "loss": 2.9906, + "step": 97770 + }, + { + "epoch": 2.8647330257086354, + "grad_norm": 19.639482498168945, + "learning_rate": 5.412180124490296e-08, + "loss": 2.9898, + "step": 97780 + }, + { + "epoch": 2.8650260016113673, + "grad_norm": 23.70155143737793, + "learning_rate": 5.388998700975978e-08, + "loss": 2.9958, + "step": 97790 + }, + { + "epoch": 2.865084596791914, + "eval_bleu": 0.3540571385659056, + "eval_cap_loss": 0.8989731669425964, + "eval_con_loss": 1.1243669986724854, + "eval_loss": 3.147706985473633, + "step": 97792 + }, + { + "epoch": 2.865084596791914, + "eval_bleu": 0.3540571385659056, + "eval_cap_loss": 0.8989731669425964, + "eval_con_loss": 1.1243669986724854, + "eval_loss": 3.147706985473633, + "eval_runtime": 53.9525, + "eval_samples_per_second": 370.697, + "eval_steps_per_second": 0.371, + "step": 97792 + }, + { + "epoch": 2.8653189775140993, + "grad_norm": 17.991727828979492, + "learning_rate": 5.365866760357885e-08, + "loss": 2.9769, + "step": 97800 + }, + { + "epoch": 2.8656119534168316, + "grad_norm": 19.765987396240234, + "learning_rate": 5.342784304950277e-08, + "loss": 2.9991, + "step": 97810 + }, + { + "epoch": 2.8659049293195635, + "grad_norm": 
16.88435935974121, + "learning_rate": 5.319751337062362e-08, + "loss": 3.0126, + "step": 97820 + }, + { + "epoch": 2.8661979052222955, + "grad_norm": 17.314102172851562, + "learning_rate": 5.2967678589985194e-08, + "loss": 3.0179, + "step": 97830 + }, + { + "epoch": 2.8664908811250274, + "grad_norm": 20.68282127380371, + "learning_rate": 5.273833873058076e-08, + "loss": 3.0186, + "step": 97840 + }, + { + "epoch": 2.8667838570277597, + "grad_norm": 15.705927848815918, + "learning_rate": 5.25094938153542e-08, + "loss": 3.0012, + "step": 97850 + }, + { + "epoch": 2.8670768329304916, + "grad_norm": 17.9012451171875, + "learning_rate": 5.228114386720051e-08, + "loss": 2.9873, + "step": 97860 + }, + { + "epoch": 2.8673698088332236, + "grad_norm": 17.18877410888672, + "learning_rate": 5.205328890896422e-08, + "loss": 2.9924, + "step": 97870 + }, + { + "epoch": 2.8676627847359555, + "grad_norm": 19.39225196838379, + "learning_rate": 5.1825928963442075e-08, + "loss": 2.9826, + "step": 97880 + }, + { + "epoch": 2.8679557606386874, + "grad_norm": 17.986011505126953, + "learning_rate": 5.159906405337811e-08, + "loss": 2.9956, + "step": 97890 + }, + { + "epoch": 2.8682487365414193, + "grad_norm": 17.831649780273438, + "learning_rate": 5.137269420147084e-08, + "loss": 3.0126, + "step": 97900 + }, + { + "epoch": 2.8685417124441512, + "grad_norm": 18.19670867919922, + "learning_rate": 5.114681943036603e-08, + "loss": 2.9779, + "step": 97910 + }, + { + "epoch": 2.8688346883468836, + "grad_norm": 19.316091537475586, + "learning_rate": 5.0921439762661174e-08, + "loss": 2.9952, + "step": 97920 + }, + { + "epoch": 2.8691276642496155, + "grad_norm": 15.981245994567871, + "learning_rate": 5.0696555220904334e-08, + "loss": 2.9944, + "step": 97930 + }, + { + "epoch": 2.8694206401523474, + "grad_norm": 21.64008903503418, + "learning_rate": 5.0472165827594176e-08, + "loss": 3.0013, + "step": 97940 + }, + { + "epoch": 2.8697136160550794, + "grad_norm": 16.32243537902832, + "learning_rate": 
5.024827160517942e-08, + "loss": 2.9935, + "step": 97950 + }, + { + "epoch": 2.8700065919578117, + "grad_norm": 18.042842864990234, + "learning_rate": 5.0024872576058814e-08, + "loss": 2.9989, + "step": 97960 + }, + { + "epoch": 2.8702995678605436, + "grad_norm": 16.78386878967285, + "learning_rate": 4.980196876258281e-08, + "loss": 2.988, + "step": 97970 + }, + { + "epoch": 2.8705925437632756, + "grad_norm": 18.567983627319336, + "learning_rate": 4.957956018705135e-08, + "loss": 2.9925, + "step": 97980 + }, + { + "epoch": 2.8708855196660075, + "grad_norm": 22.337862014770508, + "learning_rate": 4.935764687171496e-08, + "loss": 2.9746, + "step": 97990 + }, + { + "epoch": 2.8711784955687394, + "grad_norm": 18.361953735351562, + "learning_rate": 4.913622883877478e-08, + "loss": 2.997, + "step": 98000 + }, + { + "epoch": 2.8714714714714713, + "grad_norm": 17.168296813964844, + "learning_rate": 4.891530611038253e-08, + "loss": 2.9892, + "step": 98010 + }, + { + "epoch": 2.871764447374203, + "grad_norm": 19.817018508911133, + "learning_rate": 4.869487870863998e-08, + "loss": 2.9962, + "step": 98020 + }, + { + "epoch": 2.8720574232769356, + "grad_norm": NaN, + "learning_rate": 4.8496917569584854e-08, + "loss": 3.0046, + "step": 98030 + }, + { + "epoch": 2.8723503991796675, + "grad_norm": 18.277721405029297, + "learning_rate": 4.827743134919072e-08, + "loss": 3.0152, + "step": 98040 + }, + { + "epoch": 2.8726433750823994, + "grad_norm": 16.491058349609375, + "learning_rate": 4.8080317309059376e-08, + "loss": 2.999, + "step": 98050 + }, + { + "epoch": 2.8729363509851313, + "grad_norm": 20.85120391845703, + "learning_rate": 4.786177234928246e-08, + "loss": 3.001, + "step": 98060 + }, + { + "epoch": 2.8732293268878637, + "grad_norm": 18.322256088256836, + "learning_rate": 4.764372282155594e-08, + "loss": 2.9922, + "step": 98070 + }, + { + "epoch": 2.8735223027905956, + "grad_norm": 16.35382080078125, + "learning_rate": 4.74261687476929e-08, + "loss": 3.0112, + "step": 98080 
+ }, + { + "epoch": 2.8738152786933275, + "grad_norm": 19.68415641784668, + "learning_rate": 4.720911014945928e-08, + "loss": 3.0273, + "step": 98090 + }, + { + "epoch": 2.8741082545960595, + "grad_norm": 17.15452003479004, + "learning_rate": 4.699254704857048e-08, + "loss": 2.9917, + "step": 98100 + }, + { + "epoch": 2.8744012304987914, + "grad_norm": 15.801593780517578, + "learning_rate": 4.67764794666925e-08, + "loss": 3.0136, + "step": 98110 + }, + { + "epoch": 2.8746942064015233, + "grad_norm": 16.094722747802734, + "learning_rate": 4.6560907425440836e-08, + "loss": 3.0013, + "step": 98120 + }, + { + "epoch": 2.874987182304255, + "grad_norm": 16.288406372070312, + "learning_rate": 4.6345830946382675e-08, + "loss": 2.9823, + "step": 98130 + }, + { + "epoch": 2.8752801582069876, + "grad_norm": 17.24344825744629, + "learning_rate": 4.613125005103525e-08, + "loss": 3.0199, + "step": 98140 + }, + { + "epoch": 2.8755731341097195, + "grad_norm": 14.579266548156738, + "learning_rate": 4.591716476086583e-08, + "loss": 2.9735, + "step": 98150 + }, + { + "epoch": 2.8758661100124514, + "grad_norm": 24.279434204101562, + "learning_rate": 4.5703575097292286e-08, + "loss": 3.0016, + "step": 98160 + }, + { + "epoch": 2.8761590859151833, + "grad_norm": 17.21149444580078, + "learning_rate": 4.549048108168363e-08, + "loss": 2.9964, + "step": 98170 + }, + { + "epoch": 2.8764520618179157, + "grad_norm": 14.665658950805664, + "learning_rate": 4.5277882735357826e-08, + "loss": 2.9985, + "step": 98180 + }, + { + "epoch": 2.8767450377206476, + "grad_norm": 14.697004318237305, + "learning_rate": 4.506578007958506e-08, + "loss": 2.9958, + "step": 98190 + }, + { + "epoch": 2.8770380136233795, + "grad_norm": 20.008338928222656, + "learning_rate": 4.485417313558393e-08, + "loss": 2.9954, + "step": 98200 + }, + { + "epoch": 2.8773309895261114, + "grad_norm": 15.516068458557129, + "learning_rate": 4.4643061924524724e-08, + "loss": 2.9951, + "step": 98210 + }, + { + "epoch": 
2.8776239654288434, + "grad_norm": 16.103729248046875, + "learning_rate": 4.443244646752831e-08, + "loss": 3.0019, + "step": 98220 + }, + { + "epoch": 2.8779169413315753, + "grad_norm": 16.636472702026367, + "learning_rate": 4.4222326785665624e-08, + "loss": 2.9898, + "step": 98230 + }, + { + "epoch": 2.8782099172343076, + "grad_norm": 19.128816604614258, + "learning_rate": 4.401270289995707e-08, + "loss": 2.9989, + "step": 98240 + }, + { + "epoch": 2.8785028931370396, + "grad_norm": 20.39919662475586, + "learning_rate": 4.3803574831374765e-08, + "loss": 3.0117, + "step": 98250 + }, + { + "epoch": 2.8787958690397715, + "grad_norm": 17.425785064697266, + "learning_rate": 4.359494260084085e-08, + "loss": 2.9664, + "step": 98260 + }, + { + "epoch": 2.8790888449425034, + "grad_norm": 15.323901176452637, + "learning_rate": 4.3386806229226976e-08, + "loss": 2.9826, + "step": 98270 + }, + { + "epoch": 2.8793818208452358, + "grad_norm": 19.200014114379883, + "learning_rate": 4.317916573735759e-08, + "loss": 3.0084, + "step": 98280 + }, + { + "epoch": 2.8796747967479677, + "grad_norm": 17.27436637878418, + "learning_rate": 4.297202114600385e-08, + "loss": 2.9784, + "step": 98290 + }, + { + "epoch": 2.8799677726506996, + "grad_norm": 16.978206634521484, + "learning_rate": 4.276537247589141e-08, + "loss": 2.9764, + "step": 98300 + }, + { + "epoch": 2.880084963011792, + "eval_bleu": 0.3540364144349827, + "eval_cap_loss": 0.8989909291267395, + "eval_con_loss": 1.1243305206298828, + "eval_loss": 3.1476516723632812, + "step": 98304 + }, + { + "epoch": 2.880084963011792, + "eval_bleu": 0.3540364144349827, + "eval_cap_loss": 0.8989909291267395, + "eval_con_loss": 1.1243305206298828, + "eval_loss": 3.1476516723632812, + "eval_runtime": 58.0182, + "eval_samples_per_second": 344.72, + "eval_steps_per_second": 0.345, + "step": 98304 + }, + { + "epoch": 2.8802607485534315, + "grad_norm": 20.14606285095215, + "learning_rate": 4.2559219747692617e-08, + "loss": 2.998, + "step": 98310 + }, 
+ { + "epoch": 2.8805537244561634, + "grad_norm": 20.478404998779297, + "learning_rate": 4.235356298203208e-08, + "loss": 2.985, + "step": 98320 + }, + { + "epoch": 2.8808467003588953, + "grad_norm": 14.232062339782715, + "learning_rate": 4.214840219948502e-08, + "loss": 3.0116, + "step": 98330 + }, + { + "epoch": 2.8811396762616273, + "grad_norm": 20.217466354370117, + "learning_rate": 4.194373742057667e-08, + "loss": 2.9991, + "step": 98340 + }, + { + "epoch": 2.8814326521643596, + "grad_norm": 21.776973724365234, + "learning_rate": 4.1739568665782326e-08, + "loss": 2.9813, + "step": 98350 + }, + { + "epoch": 2.8817256280670915, + "grad_norm": 19.5780086517334, + "learning_rate": 4.153589595552732e-08, + "loss": 2.9843, + "step": 98360 + }, + { + "epoch": 2.8820186039698235, + "grad_norm": 15.975736618041992, + "learning_rate": 4.1332719310188677e-08, + "loss": 3.0214, + "step": 98370 + }, + { + "epoch": 2.8823115798725554, + "grad_norm": 15.613947868347168, + "learning_rate": 4.1130038750092916e-08, + "loss": 2.9868, + "step": 98380 + }, + { + "epoch": 2.8826045557752877, + "grad_norm": 21.69607925415039, + "learning_rate": 4.09278542955166e-08, + "loss": 2.9863, + "step": 98390 + }, + { + "epoch": 2.8828975316780197, + "grad_norm": 19.769258499145508, + "learning_rate": 4.0726165966687435e-08, + "loss": 3.0031, + "step": 98400 + }, + { + "epoch": 2.8831905075807516, + "grad_norm": 17.414209365844727, + "learning_rate": 4.052497378378206e-08, + "loss": 3.0007, + "step": 98410 + }, + { + "epoch": 2.8834834834834835, + "grad_norm": 18.74073600769043, + "learning_rate": 4.032427776693049e-08, + "loss": 2.9887, + "step": 98420 + }, + { + "epoch": 2.8837764593862154, + "grad_norm": 15.99059009552002, + "learning_rate": 4.012407793620998e-08, + "loss": 2.9935, + "step": 98430 + }, + { + "epoch": 2.8840694352889473, + "grad_norm": 16.6143798828125, + "learning_rate": 3.992437431164953e-08, + "loss": 2.9873, + "step": 98440 + }, + { + "epoch": 2.8843624111916792, + 
"grad_norm": 17.247888565063477, + "learning_rate": 3.972516691322814e-08, + "loss": 2.9842, + "step": 98450 + }, + { + "epoch": 2.8846553870944116, + "grad_norm": 19.888790130615234, + "learning_rate": 3.9526455760875994e-08, + "loss": 3.002, + "step": 98460 + }, + { + "epoch": 2.8849483629971435, + "grad_norm": 16.9798583984375, + "learning_rate": 3.9328240874471624e-08, + "loss": 2.9848, + "step": 98470 + }, + { + "epoch": 2.8852413388998754, + "grad_norm": 18.847742080688477, + "learning_rate": 3.913052227384695e-08, + "loss": 3.005, + "step": 98480 + }, + { + "epoch": 2.8855343148026074, + "grad_norm": 19.841650009155273, + "learning_rate": 3.893329997878115e-08, + "loss": 3.0048, + "step": 98490 + }, + { + "epoch": 2.8858272907053397, + "grad_norm": 18.724016189575195, + "learning_rate": 3.8736574009005656e-08, + "loss": 3.0236, + "step": 98500 + }, + { + "epoch": 2.8861202666080716, + "grad_norm": 18.28474998474121, + "learning_rate": 3.854034438420195e-08, + "loss": 2.9793, + "step": 98510 + }, + { + "epoch": 2.8864132425108036, + "grad_norm": 22.6310977935791, + "learning_rate": 3.834461112400156e-08, + "loss": 3.0136, + "step": 98520 + }, + { + "epoch": 2.8867062184135355, + "grad_norm": 18.5135555267334, + "learning_rate": 3.8149374247986034e-08, + "loss": 2.9956, + "step": 98530 + }, + { + "epoch": 2.8869991943162674, + "grad_norm": 16.521270751953125, + "learning_rate": 3.795463377568864e-08, + "loss": 2.988, + "step": 98540 + }, + { + "epoch": 2.8872921702189993, + "grad_norm": 16.435237884521484, + "learning_rate": 3.7760389726590464e-08, + "loss": 2.9858, + "step": 98550 + }, + { + "epoch": 2.8875851461217312, + "grad_norm": 16.286832809448242, + "learning_rate": 3.7566642120125954e-08, + "loss": 3.0027, + "step": 98560 + }, + { + "epoch": 2.8878781220244636, + "grad_norm": 23.01050567626953, + "learning_rate": 3.7373390975677385e-08, + "loss": 3.0186, + "step": 98570 + }, + { + "epoch": 2.8881710979271955, + "grad_norm": 19.42784309387207, + 
"learning_rate": 3.718063631257873e-08, + "loss": 2.9984, + "step": 98580 + }, + { + "epoch": 2.8884640738299274, + "grad_norm": 19.559654235839844, + "learning_rate": 3.698837815011458e-08, + "loss": 2.995, + "step": 98590 + }, + { + "epoch": 2.8887570497326593, + "grad_norm": 18.007465362548828, + "learning_rate": 3.679661650751787e-08, + "loss": 3.0018, + "step": 98600 + }, + { + "epoch": 2.8890500256353917, + "grad_norm": 19.135461807250977, + "learning_rate": 3.660535140397492e-08, + "loss": 2.9974, + "step": 98610 + }, + { + "epoch": 2.8893430015381236, + "grad_norm": 18.6368465423584, + "learning_rate": 3.641458285861876e-08, + "loss": 2.995, + "step": 98620 + }, + { + "epoch": 2.8896359774408555, + "grad_norm": 19.740339279174805, + "learning_rate": 3.6224310890536354e-08, + "loss": 2.9826, + "step": 98630 + }, + { + "epoch": 2.8899289533435875, + "grad_norm": 15.260982513427734, + "learning_rate": 3.603453551876246e-08, + "loss": 3.0005, + "step": 98640 + }, + { + "epoch": 2.8902219292463194, + "grad_norm": 20.782100677490234, + "learning_rate": 3.584525676228301e-08, + "loss": 2.9713, + "step": 98650 + }, + { + "epoch": 2.8905149051490513, + "grad_norm": 20.111238479614258, + "learning_rate": 3.565647464003397e-08, + "loss": 2.9973, + "step": 98660 + }, + { + "epoch": 2.8908078810517837, + "grad_norm": 20.65951156616211, + "learning_rate": 3.5468189170902444e-08, + "loss": 2.9964, + "step": 98670 + }, + { + "epoch": 2.8911008569545156, + "grad_norm": 18.52412986755371, + "learning_rate": 3.528040037372504e-08, + "loss": 2.9994, + "step": 98680 + }, + { + "epoch": 2.8913938328572475, + "grad_norm": 18.7800350189209, + "learning_rate": 3.509310826728951e-08, + "loss": 2.9885, + "step": 98690 + }, + { + "epoch": 2.8916868087599794, + "grad_norm": 21.014488220214844, + "learning_rate": 3.4906312870331973e-08, + "loss": 2.9835, + "step": 98700 + }, + { + "epoch": 2.8919797846627113, + "grad_norm": 19.634599685668945, + "learning_rate": 3.4720014201541365e-08, 
+ "loss": 2.9972, + "step": 98710 + }, + { + "epoch": 2.8922727605654437, + "grad_norm": 22.23586654663086, + "learning_rate": 3.4534212279555004e-08, + "loss": 3.0099, + "step": 98720 + }, + { + "epoch": 2.8925657364681756, + "grad_norm": 17.26802635192871, + "learning_rate": 3.434890712296135e-08, + "loss": 2.993, + "step": 98730 + }, + { + "epoch": 2.8928587123709075, + "grad_norm": 18.359235763549805, + "learning_rate": 3.416409875029947e-08, + "loss": 2.9886, + "step": 98740 + }, + { + "epoch": 2.8931516882736394, + "grad_norm": 21.527976989746094, + "learning_rate": 3.397978718005901e-08, + "loss": 2.9797, + "step": 98750 + }, + { + "epoch": 2.8934446641763714, + "grad_norm": 17.96360969543457, + "learning_rate": 3.379597243067745e-08, + "loss": 2.9986, + "step": 98760 + }, + { + "epoch": 2.8937376400791033, + "grad_norm": 20.27254867553711, + "learning_rate": 3.361265452054618e-08, + "loss": 3.0494, + "step": 98770 + }, + { + "epoch": 2.8940306159818356, + "grad_norm": 16.948753356933594, + "learning_rate": 3.342983346800388e-08, + "loss": 3.0091, + "step": 98780 + }, + { + "epoch": 2.8943235918845676, + "grad_norm": 20.19217872619629, + "learning_rate": 3.3247509291340906e-08, + "loss": 3.0031, + "step": 98790 + }, + { + "epoch": 2.8946165677872995, + "grad_norm": 19.88099479675293, + "learning_rate": 3.306568200879823e-08, + "loss": 2.9863, + "step": 98800 + }, + { + "epoch": 2.8949095436900314, + "grad_norm": 18.353181838989258, + "learning_rate": 3.288435163856574e-08, + "loss": 3.0017, + "step": 98810 + }, + { + "epoch": 2.8950853292316707, + "eval_bleu": 0.35405216667993644, + "eval_cap_loss": 0.8989263772964478, + "eval_con_loss": 1.1242631673812866, + "eval_loss": 3.1474528312683105, + "step": 98816 + }, + { + "epoch": 2.8950853292316707, + "eval_bleu": 0.35405216667993644, + "eval_cap_loss": 0.8989263772964478, + "eval_con_loss": 1.1242631673812866, + "eval_loss": 3.1474528312683105, + "eval_runtime": 53.9153, + "eval_samples_per_second": 370.952, + 
"eval_steps_per_second": 0.371, + "step": 98816 + }, + { + "epoch": 2.8952025195927638, + "grad_norm": 18.348102569580078, + "learning_rate": 3.270351819878503e-08, + "loss": 3.0074, + "step": 98820 + }, + { + "epoch": 2.8954954954954957, + "grad_norm": 15.20551872253418, + "learning_rate": 3.2523181707548314e-08, + "loss": 2.9814, + "step": 98830 + }, + { + "epoch": 2.8957884713982276, + "grad_norm": 19.15839385986328, + "learning_rate": 3.234334218289503e-08, + "loss": 3.0017, + "step": 98840 + }, + { + "epoch": 2.8960814473009595, + "grad_norm": 19.273317337036133, + "learning_rate": 3.216399964281913e-08, + "loss": 2.9826, + "step": 98850 + }, + { + "epoch": 2.8963744232036914, + "grad_norm": 15.921628952026367, + "learning_rate": 3.198515410526182e-08, + "loss": 2.9956, + "step": 98860 + }, + { + "epoch": 2.8966673991064233, + "grad_norm": 20.25644874572754, + "learning_rate": 3.180680558811544e-08, + "loss": 3.0017, + "step": 98870 + }, + { + "epoch": 2.8969603750091553, + "grad_norm": 16.243873596191406, + "learning_rate": 3.16289541092224e-08, + "loss": 3.0064, + "step": 98880 + }, + { + "epoch": 2.8972533509118876, + "grad_norm": 19.092138290405273, + "learning_rate": 3.145159968637679e-08, + "loss": 2.9803, + "step": 98890 + }, + { + "epoch": 2.8975463268146195, + "grad_norm": 16.100549697875977, + "learning_rate": 3.127474233732053e-08, + "loss": 2.9954, + "step": 98900 + }, + { + "epoch": 2.8978393027173515, + "grad_norm": 17.965721130371094, + "learning_rate": 3.1098382079748914e-08, + "loss": 3.0078, + "step": 98910 + }, + { + "epoch": 2.8981322786200834, + "grad_norm": 22.609554290771484, + "learning_rate": 3.0922518931303934e-08, + "loss": 3.0028, + "step": 98920 + }, + { + "epoch": 2.8984252545228157, + "grad_norm": 16.64712905883789, + "learning_rate": 3.0747152909580414e-08, + "loss": 2.9701, + "step": 98930 + }, + { + "epoch": 2.8987182304255477, + "grad_norm": 16.8970947265625, + "learning_rate": 3.0572284032123204e-08, + "loss": 2.993, + 
"step": 98940 + }, + { + "epoch": 2.8990112063282796, + "grad_norm": 19.046749114990234, + "learning_rate": 3.039791231642608e-08, + "loss": 3.0057, + "step": 98950 + }, + { + "epoch": 2.8993041822310115, + "grad_norm": 17.197528839111328, + "learning_rate": 3.022403777993399e-08, + "loss": 2.9837, + "step": 98960 + }, + { + "epoch": 2.8995971581337434, + "grad_norm": 14.872748374938965, + "learning_rate": 3.005066044004246e-08, + "loss": 3.0164, + "step": 98970 + }, + { + "epoch": 2.8998901340364753, + "grad_norm": 18.24330711364746, + "learning_rate": 2.9877780314096493e-08, + "loss": 2.9784, + "step": 98980 + }, + { + "epoch": 2.9001831099392072, + "grad_norm": 18.491775512695312, + "learning_rate": 2.9705397419391712e-08, + "loss": 2.989, + "step": 98990 + }, + { + "epoch": 2.9004760858419396, + "grad_norm": 15.922914505004883, + "learning_rate": 2.9533511773174872e-08, + "loss": 2.9886, + "step": 99000 + }, + { + "epoch": 2.9007690617446715, + "grad_norm": 17.540449142456055, + "learning_rate": 2.9362123392640552e-08, + "loss": 2.9942, + "step": 99010 + }, + { + "epoch": 2.9010620376474034, + "grad_norm": 19.650434494018555, + "learning_rate": 2.91912322949367e-08, + "loss": 2.9959, + "step": 99020 + }, + { + "epoch": 2.9013550135501354, + "grad_norm": 19.379600524902344, + "learning_rate": 2.9020838497159087e-08, + "loss": 3.002, + "step": 99030 + }, + { + "epoch": 2.9016479894528677, + "grad_norm": 20.185550689697266, + "learning_rate": 2.8850942016354634e-08, + "loss": 3.0156, + "step": 99040 + }, + { + "epoch": 2.9019409653555996, + "grad_norm": 17.378347396850586, + "learning_rate": 2.868154286952085e-08, + "loss": 2.9701, + "step": 99050 + }, + { + "epoch": 2.9022339412583316, + "grad_norm": 21.993505477905273, + "learning_rate": 2.8512641073604185e-08, + "loss": 2.9772, + "step": 99060 + }, + { + "epoch": 2.9025269171610635, + "grad_norm": 19.58789825439453, + "learning_rate": 2.8344236645503343e-08, + "loss": 2.9892, + "step": 99070 + }, + { + "epoch": 
2.9028198930637954, + "grad_norm": 16.797733306884766, + "learning_rate": 2.8176329602065956e-08, + "loss": 2.9955, + "step": 99080 + }, + { + "epoch": 2.9031128689665273, + "grad_norm": 20.504514694213867, + "learning_rate": 2.8008919960090253e-08, + "loss": 2.9824, + "step": 99090 + }, + { + "epoch": 2.9034058448692592, + "grad_norm": 20.143842697143555, + "learning_rate": 2.7842007736323952e-08, + "loss": 2.9804, + "step": 99100 + }, + { + "epoch": 2.9036988207719916, + "grad_norm": 18.00935173034668, + "learning_rate": 2.7675592947465913e-08, + "loss": 3.002, + "step": 99110 + }, + { + "epoch": 2.9039917966747235, + "grad_norm": 16.051116943359375, + "learning_rate": 2.7509675610165043e-08, + "loss": 2.9879, + "step": 99120 + }, + { + "epoch": 2.9042847725774554, + "grad_norm": 18.59381866455078, + "learning_rate": 2.7344255741020287e-08, + "loss": 3.0011, + "step": 99130 + }, + { + "epoch": 2.9045777484801873, + "grad_norm": 19.296709060668945, + "learning_rate": 2.7179333356581182e-08, + "loss": 2.9979, + "step": 99140 + }, + { + "epoch": 2.9048707243829197, + "grad_norm": 13.99242877960205, + "learning_rate": 2.7014908473346758e-08, + "loss": 3.002, + "step": 99150 + }, + { + "epoch": 2.9051637002856516, + "grad_norm": 16.819482803344727, + "learning_rate": 2.6850981107767182e-08, + "loss": 3.003, + "step": 99160 + }, + { + "epoch": 2.9054566761883835, + "grad_norm": 15.82397174835205, + "learning_rate": 2.6687551276242117e-08, + "loss": 3.0151, + "step": 99170 + }, + { + "epoch": 2.9057496520911155, + "grad_norm": 18.7031307220459, + "learning_rate": 2.6524618995122376e-08, + "loss": 2.9893, + "step": 99180 + }, + { + "epoch": 2.9060426279938474, + "grad_norm": 20.754331588745117, + "learning_rate": 2.6362184280707692e-08, + "loss": 2.9851, + "step": 99190 + }, + { + "epoch": 2.9063356038965793, + "grad_norm": 14.987410545349121, + "learning_rate": 2.6200247149248958e-08, + "loss": 2.9956, + "step": 99200 + }, + { + "epoch": 2.9066285797993117, + 
"grad_norm": 20.96665382385254, + "learning_rate": 2.603880761694655e-08, + "loss": 3.0077, + "step": 99210 + }, + { + "epoch": 2.9069215557020436, + "grad_norm": 18.049962997436523, + "learning_rate": 2.5877865699951988e-08, + "loss": 2.9893, + "step": 99220 + }, + { + "epoch": 2.9072145316047755, + "grad_norm": 22.3404598236084, + "learning_rate": 2.571742141436684e-08, + "loss": 2.9846, + "step": 99230 + }, + { + "epoch": 2.9075075075075074, + "grad_norm": 16.095468521118164, + "learning_rate": 2.5557474776242153e-08, + "loss": 3.0184, + "step": 99240 + }, + { + "epoch": 2.90780048341024, + "grad_norm": 23.997106552124023, + "learning_rate": 2.5398025801579573e-08, + "loss": 2.9928, + "step": 99250 + }, + { + "epoch": 2.9080934593129717, + "grad_norm": 21.274311065673828, + "learning_rate": 2.523907450633134e-08, + "loss": 2.9876, + "step": 99260 + }, + { + "epoch": 2.9083864352157036, + "grad_norm": 17.442150115966797, + "learning_rate": 2.5080620906399176e-08, + "loss": 2.9976, + "step": 99270 + }, + { + "epoch": 2.9086794111184355, + "grad_norm": 17.715682983398438, + "learning_rate": 2.492266501763596e-08, + "loss": 3.0004, + "step": 99280 + }, + { + "epoch": 2.9089723870211674, + "grad_norm": 18.467927932739258, + "learning_rate": 2.476520685584405e-08, + "loss": 3.0065, + "step": 99290 + }, + { + "epoch": 2.9092653629238994, + "grad_norm": 15.904788970947266, + "learning_rate": 2.4608246436776394e-08, + "loss": 3.0086, + "step": 99300 + }, + { + "epoch": 2.9095583388266313, + "grad_norm": 19.918684005737305, + "learning_rate": 2.4451783776134887e-08, + "loss": 3.0057, + "step": 99310 + }, + { + "epoch": 2.9098513147293636, + "grad_norm": 19.99102210998535, + "learning_rate": 2.4295818889574217e-08, + "loss": 2.9946, + "step": 99320 + }, + { + "epoch": 2.9100856954515493, + "eval_bleu": 0.35407365265873714, + "eval_cap_loss": 0.8989189267158508, + "eval_con_loss": 1.1242027282714844, + "eval_loss": 3.147324323654175, + "step": 99328 + }, + { + "epoch": 
2.9100856954515493, + "eval_bleu": 0.35407365265873714, + "eval_cap_loss": 0.8989189267158508, + "eval_con_loss": 1.1242027282714844, + "eval_loss": 3.147324323654175, + "eval_runtime": 53.7079, + "eval_samples_per_second": 372.385, + "eval_steps_per_second": 0.372, + "step": 99328 + }, + { + "epoch": 2.9101442906320956, + "grad_norm": 19.866477966308594, + "learning_rate": 2.4140351792696914e-08, + "loss": 3.0025, + "step": 99330 + }, + { + "epoch": 2.9104372665348275, + "grad_norm": 19.19602394104004, + "learning_rate": 2.398538250105664e-08, + "loss": 2.9858, + "step": 99340 + }, + { + "epoch": 2.9107302424375594, + "grad_norm": 17.9207706451416, + "learning_rate": 2.383091103015711e-08, + "loss": 2.9941, + "step": 99350 + }, + { + "epoch": 2.9110232183402918, + "grad_norm": 18.017990112304688, + "learning_rate": 2.3676937395452072e-08, + "loss": 3.0065, + "step": 99360 + }, + { + "epoch": 2.9113161942430237, + "grad_norm": 20.918970108032227, + "learning_rate": 2.352346161234642e-08, + "loss": 3.0028, + "step": 99370 + }, + { + "epoch": 2.9116091701457556, + "grad_norm": 22.56100082397461, + "learning_rate": 2.3370483696193434e-08, + "loss": 2.9978, + "step": 99380 + }, + { + "epoch": 2.9119021460484875, + "grad_norm": 16.840280532836914, + "learning_rate": 2.321800366229865e-08, + "loss": 2.9842, + "step": 99390 + }, + { + "epoch": 2.9121951219512194, + "grad_norm": 17.516679763793945, + "learning_rate": 2.306602152591597e-08, + "loss": 2.9829, + "step": 99400 + }, + { + "epoch": 2.9124880978539514, + "grad_norm": 19.506610870361328, + "learning_rate": 2.2914537302251017e-08, + "loss": 2.9891, + "step": 99410 + }, + { + "epoch": 2.9127810737566833, + "grad_norm": 17.1470947265625, + "learning_rate": 2.276355100645833e-08, + "loss": 2.9856, + "step": 99420 + }, + { + "epoch": 2.9130740496594156, + "grad_norm": 17.073196411132812, + "learning_rate": 2.26130626536436e-08, + "loss": 2.9854, + "step": 99430 + }, + { + "epoch": 2.9133670255621475, + "grad_norm": 
18.872404098510742, + "learning_rate": 2.2463072258862017e-08, + "loss": 3.0076, + "step": 99440 + }, + { + "epoch": 2.9136600014648795, + "grad_norm": 17.517738342285156, + "learning_rate": 2.2313579837119904e-08, + "loss": 3.0242, + "step": 99450 + }, + { + "epoch": 2.9139529773676114, + "grad_norm": 20.281902313232422, + "learning_rate": 2.2164585403371963e-08, + "loss": 2.9993, + "step": 99460 + }, + { + "epoch": 2.9142459532703437, + "grad_norm": 20.014102935791016, + "learning_rate": 2.201608897252461e-08, + "loss": 3.004, + "step": 99470 + }, + { + "epoch": 2.9145389291730757, + "grad_norm": 22.415428161621094, + "learning_rate": 2.1868090559434285e-08, + "loss": 2.9911, + "step": 99480 + }, + { + "epoch": 2.9148319050758076, + "grad_norm": 19.78404998779297, + "learning_rate": 2.1720590178907485e-08, + "loss": 3.0137, + "step": 99490 + }, + { + "epoch": 2.9151248809785395, + "grad_norm": 18.887359619140625, + "learning_rate": 2.157358784570074e-08, + "loss": 3.0004, + "step": 99500 + }, + { + "epoch": 2.9154178568812714, + "grad_norm": 17.717283248901367, + "learning_rate": 2.142708357452006e-08, + "loss": 2.9678, + "step": 99510 + }, + { + "epoch": 2.9157108327840033, + "grad_norm": 19.135765075683594, + "learning_rate": 2.1281077380023163e-08, + "loss": 3.0146, + "step": 99520 + }, + { + "epoch": 2.9160038086867353, + "grad_norm": 21.762006759643555, + "learning_rate": 2.1135569276816703e-08, + "loss": 3.0023, + "step": 99530 + }, + { + "epoch": 2.9162967845894676, + "grad_norm": 19.968204498291016, + "learning_rate": 2.0990559279457922e-08, + "loss": 2.9872, + "step": 99540 + }, + { + "epoch": 2.9165897604921995, + "grad_norm": 17.34891128540039, + "learning_rate": 2.0846047402453552e-08, + "loss": 2.9994, + "step": 99550 + }, + { + "epoch": 2.9168827363949315, + "grad_norm": 17.355024337768555, + "learning_rate": 2.0702033660262576e-08, + "loss": 3.0098, + "step": 99560 + }, + { + "epoch": 2.9171757122976634, + "grad_norm": 18.707401275634766, + 
"learning_rate": 2.0558518067291812e-08, + "loss": 2.9977, + "step": 99570 + }, + { + "epoch": 2.9174686882003957, + "grad_norm": 22.559045791625977, + "learning_rate": 2.041550063789921e-08, + "loss": 3.0071, + "step": 99580 + }, + { + "epoch": 2.9177616641031277, + "grad_norm": 19.023983001708984, + "learning_rate": 2.0272981386393332e-08, + "loss": 3.0151, + "step": 99590 + }, + { + "epoch": 2.9180546400058596, + "grad_norm": 20.711275100708008, + "learning_rate": 2.01309603270311e-08, + "loss": 2.9962, + "step": 99600 + }, + { + "epoch": 2.9183476159085915, + "grad_norm": 18.69031524658203, + "learning_rate": 1.9989437474022823e-08, + "loss": 3.0005, + "step": 99610 + }, + { + "epoch": 2.9186405918113234, + "grad_norm": 20.62566375732422, + "learning_rate": 1.9848412841524946e-08, + "loss": 3.0156, + "step": 99620 + }, + { + "epoch": 2.9189335677140553, + "grad_norm": 21.116546630859375, + "learning_rate": 1.9707886443647294e-08, + "loss": 2.9943, + "step": 99630 + }, + { + "epoch": 2.9192265436167877, + "grad_norm": 21.108325958251953, + "learning_rate": 1.956785829444918e-08, + "loss": 2.9931, + "step": 99640 + }, + { + "epoch": 2.9195195195195196, + "grad_norm": 18.6483211517334, + "learning_rate": 1.942832840793829e-08, + "loss": 3.0017, + "step": 99650 + }, + { + "epoch": 2.9198124954222515, + "grad_norm": 14.885245323181152, + "learning_rate": 1.9289296798074565e-08, + "loss": 2.9826, + "step": 99660 + }, + { + "epoch": 2.9201054713249834, + "grad_norm": 20.95960807800293, + "learning_rate": 1.915076347876743e-08, + "loss": 2.9875, + "step": 99670 + }, + { + "epoch": 2.9203984472277154, + "grad_norm": 17.606266021728516, + "learning_rate": 1.9012728463875806e-08, + "loss": 3.0073, + "step": 99680 + }, + { + "epoch": 2.9206914231304477, + "grad_norm": 18.107572555541992, + "learning_rate": 1.8875191767209757e-08, + "loss": 3.0098, + "step": 99690 + }, + { + "epoch": 2.9209843990331796, + "grad_norm": 19.203453063964844, + "learning_rate": 
1.8738153402528825e-08, + "loss": 3.0114, + "step": 99700 + }, + { + "epoch": 2.9212773749359116, + "grad_norm": 16.857772827148438, + "learning_rate": 1.860161338354205e-08, + "loss": 2.9738, + "step": 99710 + }, + { + "epoch": 2.9215703508386435, + "grad_norm": 15.007349014282227, + "learning_rate": 1.8465571723910723e-08, + "loss": 2.9943, + "step": 99720 + }, + { + "epoch": 2.9218633267413754, + "grad_norm": 19.047636032104492, + "learning_rate": 1.8330028437244517e-08, + "loss": 3.0174, + "step": 99730 + }, + { + "epoch": 2.9221563026441073, + "grad_norm": 22.793432235717773, + "learning_rate": 1.819498353710425e-08, + "loss": 2.9836, + "step": 99740 + }, + { + "epoch": 2.9224492785468397, + "grad_norm": 16.157155990600586, + "learning_rate": 1.8060437036999112e-08, + "loss": 2.981, + "step": 99750 + }, + { + "epoch": 2.9227422544495716, + "grad_norm": 15.788615226745605, + "learning_rate": 1.792638895039056e-08, + "loss": 2.9963, + "step": 99760 + }, + { + "epoch": 2.9230352303523035, + "grad_norm": 18.001434326171875, + "learning_rate": 1.7792839290689536e-08, + "loss": 3.0063, + "step": 99770 + }, + { + "epoch": 2.9233282062550354, + "grad_norm": 18.09105682373047, + "learning_rate": 1.7659788071256457e-08, + "loss": 3.0058, + "step": 99780 + }, + { + "epoch": 2.923621182157768, + "grad_norm": 18.92778205871582, + "learning_rate": 1.752723530540179e-08, + "loss": 2.9915, + "step": 99790 + }, + { + "epoch": 2.9239141580604997, + "grad_norm": 16.601648330688477, + "learning_rate": 1.7395181006387706e-08, + "loss": 2.9987, + "step": 99800 + }, + { + "epoch": 2.9242071339632316, + "grad_norm": 20.907379150390625, + "learning_rate": 1.7263625187424748e-08, + "loss": 2.995, + "step": 99810 + }, + { + "epoch": 2.9245001098659635, + "grad_norm": 18.141490936279297, + "learning_rate": 1.713256786167461e-08, + "loss": 3.001, + "step": 99820 + }, + { + "epoch": 2.9247930857686955, + "grad_norm": 18.96015167236328, + "learning_rate": 1.7002009042249023e-08, + "loss": 
2.9987, + "step": 99830 + }, + { + "epoch": 2.9250860616714274, + "grad_norm": 18.02805519104004, + "learning_rate": 1.687194874220921e-08, + "loss": 3.0032, + "step": 99840 + }, + { + "epoch": 2.9250860616714274, + "eval_bleu": 0.3540873422393005, + "eval_cap_loss": 0.898938775062561, + "eval_con_loss": 1.124133586883545, + "eval_loss": 3.1472063064575195, + "step": 99840 + }, + { + "epoch": 2.9250860616714274, + "eval_bleu": 0.3540873422393005, + "eval_cap_loss": 0.898938775062561, + "eval_con_loss": 1.124133586883545, + "eval_loss": 3.1472063064575195, + "eval_runtime": 54.4101, + "eval_samples_per_second": 367.579, + "eval_steps_per_second": 0.368, + "step": 99840 + }, + { + "epoch": 2.9253790375741593, + "grad_norm": 16.83776092529297, + "learning_rate": 1.6742386974566426e-08, + "loss": 2.9907, + "step": 99850 + }, + { + "epoch": 2.9256720134768917, + "grad_norm": 23.338119506835938, + "learning_rate": 1.6613323752283638e-08, + "loss": 2.9848, + "step": 99860 + }, + { + "epoch": 2.9259649893796236, + "grad_norm": 18.306217193603516, + "learning_rate": 1.648475908827274e-08, + "loss": 2.9754, + "step": 99870 + }, + { + "epoch": 2.9262579652823555, + "grad_norm": 19.821746826171875, + "learning_rate": 1.6356692995395106e-08, + "loss": 2.9999, + "step": 99880 + }, + { + "epoch": 2.9265509411850874, + "grad_norm": 21.954103469848633, + "learning_rate": 1.622912548646327e-08, + "loss": 2.9943, + "step": 99890 + }, + { + "epoch": 2.9268439170878198, + "grad_norm": 19.722145080566406, + "learning_rate": 1.6102056574239798e-08, + "loss": 2.9834, + "step": 99900 + }, + { + "epoch": 2.9271368929905517, + "grad_norm": 16.892044067382812, + "learning_rate": 1.59754862714373e-08, + "loss": 2.9823, + "step": 99910 + }, + { + "epoch": 2.9274298688932836, + "grad_norm": 17.881473541259766, + "learning_rate": 1.584941459071787e-08, + "loss": 3.0054, + "step": 99920 + }, + { + "epoch": 2.9277228447960155, + "grad_norm": 17.060300827026367, + "learning_rate": 
1.5723841544694752e-08, + "loss": 3.0067, + "step": 99930 + }, + { + "epoch": 2.9280158206987474, + "grad_norm": 19.58568572998047, + "learning_rate": 1.5598767145930672e-08, + "loss": 2.9995, + "step": 99940 + }, + { + "epoch": 2.9283087966014794, + "grad_norm": 19.948165893554688, + "learning_rate": 1.54741914069384e-08, + "loss": 3.0097, + "step": 99950 + }, + { + "epoch": 2.9286017725042113, + "grad_norm": 18.956899642944336, + "learning_rate": 1.5350114340181298e-08, + "loss": 2.983, + "step": 99960 + }, + { + "epoch": 2.9288947484069436, + "grad_norm": 18.03178596496582, + "learning_rate": 1.522653595807222e-08, + "loss": 3.0197, + "step": 99970 + }, + { + "epoch": 2.9291877243096756, + "grad_norm": 16.424991607666016, + "learning_rate": 1.5103456272974604e-08, + "loss": 2.9801, + "step": 99980 + }, + { + "epoch": 2.9294807002124075, + "grad_norm": 16.53423500061035, + "learning_rate": 1.4980875297201382e-08, + "loss": 3.0071, + "step": 99990 + }, + { + "epoch": 2.9297736761151394, + "grad_norm": 19.115575790405273, + "learning_rate": 1.4858793043016629e-08, + "loss": 2.9823, + "step": 100000 + }, + { + "epoch": 2.9300666520178718, + "grad_norm": 22.08986473083496, + "learning_rate": 1.4737209522634466e-08, + "loss": 3.0156, + "step": 100010 + }, + { + "epoch": 2.9303596279206037, + "grad_norm": 14.550226211547852, + "learning_rate": 1.4616124748217387e-08, + "loss": 2.9893, + "step": 100020 + }, + { + "epoch": 2.9306526038233356, + "grad_norm": 15.80354118347168, + "learning_rate": 1.4495538731879589e-08, + "loss": 2.9802, + "step": 100030 + }, + { + "epoch": 2.9309455797260675, + "grad_norm": 15.734529495239258, + "learning_rate": 1.4375451485685866e-08, + "loss": 2.998, + "step": 100040 + }, + { + "epoch": 2.9312385556287994, + "grad_norm": 20.21775245666504, + "learning_rate": 1.425586302164883e-08, + "loss": 2.9827, + "step": 100050 + }, + { + "epoch": 2.9315315315315313, + "grad_norm": 17.494258880615234, + "learning_rate": 1.4136773351733358e-08, + 
"loss": 3.0097, + "step": 100060 + }, + { + "epoch": 2.9318245074342633, + "grad_norm": 14.292271614074707, + "learning_rate": 1.4018182487853804e-08, + "loss": 2.989, + "step": 100070 + }, + { + "epoch": 2.9321174833369956, + "grad_norm": 19.113313674926758, + "learning_rate": 1.3900090441874015e-08, + "loss": 3.0188, + "step": 100080 + }, + { + "epoch": 2.9324104592397275, + "grad_norm": 20.625579833984375, + "learning_rate": 1.3782497225608981e-08, + "loss": 3.0059, + "step": 100090 + }, + { + "epoch": 2.9327034351424595, + "grad_norm": 18.660696029663086, + "learning_rate": 1.3665402850823184e-08, + "loss": 2.9919, + "step": 100100 + }, + { + "epoch": 2.9329964110451914, + "grad_norm": 17.322982788085938, + "learning_rate": 1.3548807329230585e-08, + "loss": 2.9875, + "step": 100110 + }, + { + "epoch": 2.9332893869479237, + "grad_norm": 15.569415092468262, + "learning_rate": 1.343271067249574e-08, + "loss": 3.0001, + "step": 100120 + }, + { + "epoch": 2.9335823628506557, + "grad_norm": 18.442588806152344, + "learning_rate": 1.3317112892234363e-08, + "loss": 2.9911, + "step": 100130 + }, + { + "epoch": 2.9338753387533876, + "grad_norm": 13.33384895324707, + "learning_rate": 1.3202014000010533e-08, + "loss": 2.9973, + "step": 100140 + }, + { + "epoch": 2.9341683146561195, + "grad_norm": 20.620223999023438, + "learning_rate": 1.3087414007340038e-08, + "loss": 3.0081, + "step": 100150 + }, + { + "epoch": 2.9344612905588514, + "grad_norm": 15.845526695251465, + "learning_rate": 1.2973312925687042e-08, + "loss": 3.0059, + "step": 100160 + }, + { + "epoch": 2.9347542664615833, + "grad_norm": 17.710237503051758, + "learning_rate": 1.2859710766467414e-08, + "loss": 2.9963, + "step": 100170 + }, + { + "epoch": 2.9350472423643157, + "grad_norm": 20.665233612060547, + "learning_rate": 1.2746607541045952e-08, + "loss": 3.0108, + "step": 100180 + }, + { + "epoch": 2.9353402182670476, + "grad_norm": 18.802597045898438, + "learning_rate": 1.2634003260738048e-08, + "loss": 
2.9805, + "step": 100190 + }, + { + "epoch": 2.9356331941697795, + "grad_norm": 18.745609283447266, + "learning_rate": 1.252189793680858e-08, + "loss": 2.9902, + "step": 100200 + }, + { + "epoch": 2.9359261700725114, + "grad_norm": 16.42022705078125, + "learning_rate": 1.2410291580474132e-08, + "loss": 3.0135, + "step": 100210 + }, + { + "epoch": 2.936219145975244, + "grad_norm": 20.969688415527344, + "learning_rate": 1.2299184202899107e-08, + "loss": 2.9769, + "step": 100220 + }, + { + "epoch": 2.9365121218779757, + "grad_norm": 19.392671585083008, + "learning_rate": 1.2188575815200165e-08, + "loss": 2.9861, + "step": 100230 + }, + { + "epoch": 2.9368050977807076, + "grad_norm": 20.235607147216797, + "learning_rate": 1.2078466428442348e-08, + "loss": 2.9908, + "step": 100240 + }, + { + "epoch": 2.9370980736834396, + "grad_norm": 15.889041900634766, + "learning_rate": 1.1968856053641842e-08, + "loss": 3.0161, + "step": 100250 + }, + { + "epoch": 2.9373910495861715, + "grad_norm": 18.53316879272461, + "learning_rate": 1.185974470176432e-08, + "loss": 2.9967, + "step": 100260 + }, + { + "epoch": 2.9376840254889034, + "grad_norm": 18.689638137817383, + "learning_rate": 1.1751132383725494e-08, + "loss": 3.0098, + "step": 100270 + }, + { + "epoch": 2.9379770013916353, + "grad_norm": 15.085429191589355, + "learning_rate": 1.1643019110391673e-08, + "loss": 2.9932, + "step": 100280 + }, + { + "epoch": 2.9382699772943677, + "grad_norm": 17.740407943725586, + "learning_rate": 1.153540489257865e-08, + "loss": 3.0217, + "step": 100290 + }, + { + "epoch": 2.9385629531970996, + "grad_norm": 19.20929718017578, + "learning_rate": 1.1428289741053367e-08, + "loss": 3.0024, + "step": 100300 + }, + { + "epoch": 2.9388559290998315, + "grad_norm": 15.125190734863281, + "learning_rate": 1.1321673666531141e-08, + "loss": 3.0138, + "step": 100310 + }, + { + "epoch": 2.9391489050025634, + "grad_norm": 19.995807647705078, + "learning_rate": 1.121555667967844e-08, + "loss": 2.9681, + "step": 
100320 + }, + { + "epoch": 2.939441880905296, + "grad_norm": 20.60546112060547, + "learning_rate": 1.1109938791112328e-08, + "loss": 3.0042, + "step": 100330 + }, + { + "epoch": 2.9397348568080277, + "grad_norm": 16.790668487548828, + "learning_rate": 1.1004820011398243e-08, + "loss": 2.982, + "step": 100340 + }, + { + "epoch": 2.9400278327107596, + "grad_norm": 19.500728607177734, + "learning_rate": 1.090020035105388e-08, + "loss": 2.9933, + "step": 100350 + }, + { + "epoch": 2.940086427891306, + "eval_bleu": 0.3541613811249636, + "eval_cap_loss": 0.8989034295082092, + "eval_con_loss": 1.1241393089294434, + "eval_loss": 3.147181987762451, + "step": 100352 + }, + { + "epoch": 2.940086427891306, + "eval_bleu": 0.3541613811249636, + "eval_cap_loss": 0.8989034295082092, + "eval_con_loss": 1.1241393089294434, + "eval_loss": 3.147181987762451, + "eval_runtime": 54.0986, + "eval_samples_per_second": 369.696, + "eval_steps_per_second": 0.37, + "step": 100352 + }, + { + "epoch": 2.9403208086134915, + "grad_norm": 19.601337432861328, + "learning_rate": 1.0796079820544758e-08, + "loss": 3.0052, + "step": 100360 + }, + { + "epoch": 2.9406137845162235, + "grad_norm": 13.58393669128418, + "learning_rate": 1.0692458430287545e-08, + "loss": 3.0, + "step": 100370 + }, + { + "epoch": 2.9409067604189554, + "grad_norm": 23.908281326293945, + "learning_rate": 1.0589336190650058e-08, + "loss": 3.0098, + "step": 100380 + }, + { + "epoch": 2.9411997363216873, + "grad_norm": 17.436479568481445, + "learning_rate": 1.0486713111947932e-08, + "loss": 3.0102, + "step": 100390 + }, + { + "epoch": 2.9414927122244197, + "grad_norm": 14.664161682128906, + "learning_rate": 1.0384589204448514e-08, + "loss": 2.9933, + "step": 100400 + }, + { + "epoch": 2.9417856881271516, + "grad_norm": 19.72046661376953, + "learning_rate": 1.0282964478368074e-08, + "loss": 3.0162, + "step": 100410 + }, + { + "epoch": 2.9420786640298835, + "grad_norm": 17.816251754760742, + "learning_rate": 1.0181838943874589e-08, + 
"loss": 3.0104, + "step": 100420 + }, + { + "epoch": 2.9423716399326154, + "grad_norm": 17.95802116394043, + "learning_rate": 1.0081212611084411e-08, + "loss": 2.9945, + "step": 100430 + }, + { + "epoch": 2.9426646158353478, + "grad_norm": 18.190181732177734, + "learning_rate": 9.981085490065046e-09, + "loss": 2.9935, + "step": 100440 + }, + { + "epoch": 2.9429575917380797, + "grad_norm": 16.684532165527344, + "learning_rate": 9.881457590832921e-09, + "loss": 2.989, + "step": 100450 + }, + { + "epoch": 2.9432505676408116, + "grad_norm": 19.31289291381836, + "learning_rate": 9.782328923355622e-09, + "loss": 2.9963, + "step": 100460 + }, + { + "epoch": 2.9435435435435435, + "grad_norm": 18.480064392089844, + "learning_rate": 9.683699497550215e-09, + "loss": 2.9987, + "step": 100470 + }, + { + "epoch": 2.9438365194462754, + "grad_norm": 16.35353660583496, + "learning_rate": 9.585569323284915e-09, + "loss": 2.9994, + "step": 100480 + }, + { + "epoch": 2.9441294953490074, + "grad_norm": 16.622766494750977, + "learning_rate": 9.487938410375208e-09, + "loss": 3.0026, + "step": 100490 + }, + { + "epoch": 2.9444224712517393, + "grad_norm": 18.563316345214844, + "learning_rate": 9.390806768590499e-09, + "loss": 3.0077, + "step": 100500 + }, + { + "epoch": 2.9447154471544716, + "grad_norm": 20.503379821777344, + "learning_rate": 9.294174407646906e-09, + "loss": 2.9922, + "step": 100510 + }, + { + "epoch": 2.9450084230572036, + "grad_norm": 18.108415603637695, + "learning_rate": 9.198041337212249e-09, + "loss": 2.9905, + "step": 100520 + }, + { + "epoch": 2.9453013989599355, + "grad_norm": 18.572202682495117, + "learning_rate": 9.102407566904391e-09, + "loss": 2.9975, + "step": 100530 + }, + { + "epoch": 2.9455943748626674, + "grad_norm": 19.643678665161133, + "learning_rate": 9.007273106290126e-09, + "loss": 2.9986, + "step": 100540 + }, + { + "epoch": 2.9458873507653998, + "grad_norm": 21.769054412841797, + "learning_rate": 8.912637964887949e-09, + "loss": 2.9992, + "step": 
100550 + }, + { + "epoch": 2.9461803266681317, + "grad_norm": 20.796350479125977, + "learning_rate": 8.818502152165286e-09, + "loss": 3.0146, + "step": 100560 + }, + { + "epoch": 2.9464733025708636, + "grad_norm": 16.079030990600586, + "learning_rate": 8.724865677540163e-09, + "loss": 3.0122, + "step": 100570 + }, + { + "epoch": 2.9467662784735955, + "grad_norm": 19.90447998046875, + "learning_rate": 8.631728550379525e-09, + "loss": 3.009, + "step": 100580 + }, + { + "epoch": 2.9470592543763274, + "grad_norm": 17.10182762145996, + "learning_rate": 8.53909078000148e-09, + "loss": 3.0157, + "step": 100590 + }, + { + "epoch": 2.9473522302790593, + "grad_norm": 19.386566162109375, + "learning_rate": 8.446952375674722e-09, + "loss": 3.0062, + "step": 100600 + }, + { + "epoch": 2.9476452061817917, + "grad_norm": 18.954240798950195, + "learning_rate": 8.355313346616323e-09, + "loss": 3.0075, + "step": 100610 + }, + { + "epoch": 2.9479381820845236, + "grad_norm": 16.76070785522461, + "learning_rate": 8.26417370199395e-09, + "loss": 3.0024, + "step": 100620 + }, + { + "epoch": 2.9482311579872555, + "grad_norm": 19.736473083496094, + "learning_rate": 8.17353345092642e-09, + "loss": 2.9875, + "step": 100630 + }, + { + "epoch": 2.9485241338899875, + "grad_norm": 19.156593322753906, + "learning_rate": 8.083392602481477e-09, + "loss": 2.9874, + "step": 100640 + }, + { + "epoch": 2.94881710979272, + "grad_norm": 17.507600784301758, + "learning_rate": 7.99375116567691e-09, + "loss": 3.004, + "step": 100650 + }, + { + "epoch": 2.9491100856954517, + "grad_norm": 16.694406509399414, + "learning_rate": 7.904609149481102e-09, + "loss": 2.9844, + "step": 100660 + }, + { + "epoch": 2.9494030615981837, + "grad_norm": 15.255763053894043, + "learning_rate": 7.815966562811916e-09, + "loss": 2.9977, + "step": 100670 + }, + { + "epoch": 2.9496960375009156, + "grad_norm": 18.29314613342285, + "learning_rate": 7.727823414537816e-09, + "loss": 2.9855, + "step": 100680 + }, + { + "epoch": 
2.9499890134036475, + "grad_norm": 18.99582290649414, + "learning_rate": 7.640179713476192e-09, + "loss": 3.0105, + "step": 100690 + }, + { + "epoch": 2.9502819893063794, + "grad_norm": 20.769533157348633, + "learning_rate": 7.553035468396697e-09, + "loss": 3.011, + "step": 100700 + }, + { + "epoch": 2.9505749652091113, + "grad_norm": 22.044504165649414, + "learning_rate": 7.466390688016245e-09, + "loss": 3.0133, + "step": 100710 + }, + { + "epoch": 2.9508679411118437, + "grad_norm": 20.07642936706543, + "learning_rate": 7.380245381004014e-09, + "loss": 2.9678, + "step": 100720 + }, + { + "epoch": 2.9511609170145756, + "grad_norm": 15.431648254394531, + "learning_rate": 7.294599555978665e-09, + "loss": 2.9877, + "step": 100730 + }, + { + "epoch": 2.9514538929173075, + "grad_norm": 19.70823860168457, + "learning_rate": 7.209453221507234e-09, + "loss": 2.9961, + "step": 100740 + }, + { + "epoch": 2.9517468688200394, + "grad_norm": 15.697593688964844, + "learning_rate": 7.1248063861090174e-09, + "loss": 2.9759, + "step": 100750 + }, + { + "epoch": 2.952039844722772, + "grad_norm": 21.695350646972656, + "learning_rate": 7.040659058251686e-09, + "loss": 3.0122, + "step": 100760 + }, + { + "epoch": 2.9523328206255037, + "grad_norm": 20.53473472595215, + "learning_rate": 6.957011246354617e-09, + "loss": 3.0084, + "step": 100770 + }, + { + "epoch": 2.9526257965282356, + "grad_norm": 19.708282470703125, + "learning_rate": 6.873862958786115e-09, + "loss": 3.0021, + "step": 100780 + }, + { + "epoch": 2.9529187724309676, + "grad_norm": 15.513784408569336, + "learning_rate": 6.7912142038639714e-09, + "loss": 3.0159, + "step": 100790 + }, + { + "epoch": 2.9532117483336995, + "grad_norm": 15.47984504699707, + "learning_rate": 6.709064989857128e-09, + "loss": 2.9976, + "step": 100800 + }, + { + "epoch": 2.9535047242364314, + "grad_norm": 17.274179458618164, + "learning_rate": 6.627415324984565e-09, + "loss": 2.9935, + "step": 100810 + }, + { + "epoch": 2.9537977001391633, + 
"grad_norm": 20.37606430053711, + "learning_rate": 6.546265217414194e-09, + "loss": 2.9874, + "step": 100820 + }, + { + "epoch": 2.9540906760418957, + "grad_norm": 19.496885299682617, + "learning_rate": 6.465614675265075e-09, + "loss": 3.0048, + "step": 100830 + }, + { + "epoch": 2.9543836519446276, + "grad_norm": 18.682222366333008, + "learning_rate": 6.3854637066052e-09, + "loss": 3.0087, + "step": 100840 + }, + { + "epoch": 2.9546766278473595, + "grad_norm": 18.72085952758789, + "learning_rate": 6.305812319453708e-09, + "loss": 2.9965, + "step": 100850 + }, + { + "epoch": 2.9549696037500914, + "grad_norm": 16.725961685180664, + "learning_rate": 6.2266605217792265e-09, + "loss": 3.0018, + "step": 100860 + }, + { + "epoch": 2.9550867941111845, + "eval_bleu": 0.3541371556945125, + "eval_cap_loss": 0.8989046812057495, + "eval_con_loss": 1.1241124868392944, + "eval_loss": 3.147129535675049, + "step": 100864 + }, + { + "epoch": 2.9550867941111845, + "eval_bleu": 0.3541371556945125, + "eval_cap_loss": 0.8989046812057495, + "eval_con_loss": 1.1241124868392944, + "eval_loss": 3.147129535675049, + "eval_runtime": 54.365, + "eval_samples_per_second": 367.884, + "eval_steps_per_second": 0.368, + "step": 100864 + }, + { + "epoch": 2.955262579652824, + "grad_norm": 15.167901992797852, + "learning_rate": 6.14800832150042e-09, + "loss": 3.0029, + "step": 100870 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 15.07583236694336, + "learning_rate": 6.069855726485441e-09, + "loss": 3.0081, + "step": 100880 + }, + { + "epoch": 2.9558485314582876, + "grad_norm": 16.031675338745117, + "learning_rate": 5.992202744553588e-09, + "loss": 3.0119, + "step": 100890 + }, + { + "epoch": 2.9561415073610195, + "grad_norm": 18.935447692871094, + "learning_rate": 5.915049383473648e-09, + "loss": 2.988, + "step": 100900 + }, + { + "epoch": 2.9564344832637515, + "grad_norm": 17.118610382080078, + "learning_rate": 5.838395650963891e-09, + "loss": 2.9662, + "step": 100910 + }, + { + "epoch": 
2.9567274591664834, + "grad_norm": 20.53005027770996, + "learning_rate": 5.762241554693737e-09, + "loss": 2.9955, + "step": 100920 + }, + { + "epoch": 2.9570204350692153, + "grad_norm": 13.718560218811035, + "learning_rate": 5.686587102280983e-09, + "loss": 3.0138, + "step": 100930 + }, + { + "epoch": 2.9573134109719477, + "grad_norm": 18.583900451660156, + "learning_rate": 5.611432301295128e-09, + "loss": 3.0158, + "step": 100940 + }, + { + "epoch": 2.9576063868746796, + "grad_norm": 18.734827041625977, + "learning_rate": 5.536777159254603e-09, + "loss": 3.0006, + "step": 100950 + }, + { + "epoch": 2.9578993627774115, + "grad_norm": 22.752941131591797, + "learning_rate": 5.462621683628433e-09, + "loss": 3.0054, + "step": 100960 + }, + { + "epoch": 2.9581923386801434, + "grad_norm": 21.960460662841797, + "learning_rate": 5.388965881835684e-09, + "loss": 3.0034, + "step": 100970 + }, + { + "epoch": 2.958485314582876, + "grad_norm": 21.17070198059082, + "learning_rate": 5.315809761244906e-09, + "loss": 3.0015, + "step": 100980 + }, + { + "epoch": 2.9587782904856077, + "grad_norm": 19.20633888244629, + "learning_rate": 5.243153329174689e-09, + "loss": 2.9893, + "step": 100990 + }, + { + "epoch": 2.9590712663883396, + "grad_norm": 16.309650421142578, + "learning_rate": 5.170996592894218e-09, + "loss": 2.9914, + "step": 101000 + }, + { + "epoch": 2.9593642422910715, + "grad_norm": 15.681644439697266, + "learning_rate": 5.0993395596221625e-09, + "loss": 2.9894, + "step": 101010 + }, + { + "epoch": 2.9596572181938035, + "grad_norm": 18.406734466552734, + "learning_rate": 5.028182236527235e-09, + "loss": 3.0015, + "step": 101020 + }, + { + "epoch": 2.9599501940965354, + "grad_norm": 18.946090698242188, + "learning_rate": 4.9575246307287385e-09, + "loss": 2.9965, + "step": 101030 + }, + { + "epoch": 2.9602431699992673, + "grad_norm": 19.989774703979492, + "learning_rate": 4.8873667492954634e-09, + "loss": 2.9691, + "step": 101040 + }, + { + "epoch": 2.9605361459019996, + 
"grad_norm": 16.931114196777344, + "learning_rate": 4.817708599245685e-09, + "loss": 2.983, + "step": 101050 + }, + { + "epoch": 2.9608291218047316, + "grad_norm": 20.09424591064453, + "learning_rate": 4.748550187549383e-09, + "loss": 3.0041, + "step": 101060 + }, + { + "epoch": 2.9611220977074635, + "grad_norm": 17.113542556762695, + "learning_rate": 4.679891521124358e-09, + "loss": 3.0043, + "step": 101070 + }, + { + "epoch": 2.9614150736101954, + "grad_norm": 17.863740921020508, + "learning_rate": 4.611732606840114e-09, + "loss": 2.9925, + "step": 101080 + }, + { + "epoch": 2.9617080495129278, + "grad_norm": 19.90141487121582, + "learning_rate": 4.54407345151564e-09, + "loss": 3.0173, + "step": 101090 + }, + { + "epoch": 2.9620010254156597, + "grad_norm": 17.401046752929688, + "learning_rate": 4.476914061919413e-09, + "loss": 2.9908, + "step": 101100 + }, + { + "epoch": 2.9622940013183916, + "grad_norm": 15.456336975097656, + "learning_rate": 4.410254444770501e-09, + "loss": 2.9911, + "step": 101110 + }, + { + "epoch": 2.9625869772211235, + "grad_norm": 18.61873435974121, + "learning_rate": 4.344094606737459e-09, + "loss": 2.9918, + "step": 101120 + }, + { + "epoch": 2.9628799531238554, + "grad_norm": 18.328868865966797, + "learning_rate": 4.278434554439992e-09, + "loss": 2.992, + "step": 101130 + }, + { + "epoch": 2.9631729290265874, + "grad_norm": 18.42508316040039, + "learning_rate": 4.213274294446179e-09, + "loss": 2.9912, + "step": 101140 + }, + { + "epoch": 2.9634659049293197, + "grad_norm": 19.169475555419922, + "learning_rate": 4.148613833275805e-09, + "loss": 3.0008, + "step": 101150 + }, + { + "epoch": 2.9637588808320516, + "grad_norm": 14.731956481933594, + "learning_rate": 4.08445317739703e-09, + "loss": 2.9953, + "step": 101160 + }, + { + "epoch": 2.9640518567347836, + "grad_norm": 20.120874404907227, + "learning_rate": 4.020792333228607e-09, + "loss": 2.9962, + "step": 101170 + }, + { + "epoch": 2.9643448326375155, + "grad_norm": 
16.190658569335938, + "learning_rate": 3.957631307139886e-09, + "loss": 2.9935, + "step": 101180 + }, + { + "epoch": 2.964637808540248, + "grad_norm": 22.62802505493164, + "learning_rate": 3.894970105450257e-09, + "loss": 3.0109, + "step": 101190 + }, + { + "epoch": 2.9649307844429797, + "grad_norm": 19.47657012939453, + "learning_rate": 3.832808734427484e-09, + "loss": 2.9988, + "step": 101200 + }, + { + "epoch": 2.9652237603457117, + "grad_norm": 19.499755859375, + "learning_rate": 3.771147200291036e-09, + "loss": 2.9895, + "step": 101210 + }, + { + "epoch": 2.9655167362484436, + "grad_norm": 16.591413497924805, + "learning_rate": 3.709985509209313e-09, + "loss": 2.9884, + "step": 101220 + }, + { + "epoch": 2.9658097121511755, + "grad_norm": 19.328144073486328, + "learning_rate": 3.649323667301863e-09, + "loss": 3.0135, + "step": 101230 + }, + { + "epoch": 2.9661026880539074, + "grad_norm": 20.27332305908203, + "learning_rate": 3.5891616806377204e-09, + "loss": 3.0052, + "step": 101240 + }, + { + "epoch": 2.9663956639566393, + "grad_norm": 19.032045364379883, + "learning_rate": 3.52949955523485e-09, + "loss": 2.9903, + "step": 101250 + }, + { + "epoch": 2.9666886398593717, + "grad_norm": 16.966442108154297, + "learning_rate": 3.470337297062365e-09, + "loss": 2.9961, + "step": 101260 + }, + { + "epoch": 2.9669816157621036, + "grad_norm": 17.886812210083008, + "learning_rate": 3.41167491203942e-09, + "loss": 2.9644, + "step": 101270 + }, + { + "epoch": 2.9672745916648355, + "grad_norm": 13.540022850036621, + "learning_rate": 3.3535124060346534e-09, + "loss": 3.0209, + "step": 101280 + }, + { + "epoch": 2.9675675675675675, + "grad_norm": 15.344192504882812, + "learning_rate": 3.2958497848667446e-09, + "loss": 3.0049, + "step": 101290 + }, + { + "epoch": 2.9678605434703, + "grad_norm": 19.738954544067383, + "learning_rate": 3.2386870543044126e-09, + "loss": 3.0002, + "step": 101300 + }, + { + "epoch": 2.9681535193730317, + "grad_norm": 20.661998748779297, + 
"learning_rate": 3.1820242200669705e-09, + "loss": 2.9777, + "step": 101310 + }, + { + "epoch": 2.9684464952757637, + "grad_norm": 17.910404205322266, + "learning_rate": 3.1258612878232175e-09, + "loss": 2.9951, + "step": 101320 + }, + { + "epoch": 2.9687394711784956, + "grad_norm": 18.69469451904297, + "learning_rate": 3.0701982631914375e-09, + "loss": 3.0019, + "step": 101330 + }, + { + "epoch": 2.9690324470812275, + "grad_norm": 17.63420295715332, + "learning_rate": 3.0150351517405084e-09, + "loss": 2.9874, + "step": 101340 + }, + { + "epoch": 2.9693254229839594, + "grad_norm": 18.998573303222656, + "learning_rate": 2.9603719589887947e-09, + "loss": 2.9876, + "step": 101350 + }, + { + "epoch": 2.9696183988866913, + "grad_norm": 18.99207878112793, + "learning_rate": 2.9062086904058096e-09, + "loss": 2.9942, + "step": 101360 + }, + { + "epoch": 2.9699113747894237, + "grad_norm": 18.05365753173828, + "learning_rate": 2.8525453514099966e-09, + "loss": 3.0041, + "step": 101370 + }, + { + "epoch": 2.9700871603310626, + "eval_bleu": 0.35409850720736413, + "eval_cap_loss": 0.8988959789276123, + "eval_con_loss": 1.1241047382354736, + "eval_loss": 3.1471054553985596, + "step": 101376 + }, + { + "epoch": 2.9700871603310626, + "eval_bleu": 0.35409850720736413, + "eval_cap_loss": 0.8988959789276123, + "eval_con_loss": 1.1241047382354736, + "eval_loss": 3.1471054553985596, + "eval_runtime": 55.0502, + "eval_samples_per_second": 363.305, + "eval_steps_per_second": 0.363, + "step": 101376 + } + ], + "logging_steps": 10, + "max_steps": 102396, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 512, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +} diff --git 
a/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/training_args.bin b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..85852d7c64705ecb7af5ec274fa47262172cc29d --- /dev/null +++ b/checkpoints/checkpoint-101376/home/c_hunba/c_huncap_scratch/checkpoints/checkpoint-101376/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c310801d133c0a877798c4d244b1e6f64930af3bf8ef18bd678c3a9981bbd23 +size 5240