{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13157894736842105, "eval_steps": 2000, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.578947368421052e-05, "grad_norm": 992.0, "learning_rate": 1e-05, "loss": 37.1063, "loss/crossentropy": 15.088774585723877, "loss/hidden": 19.0875, "loss/incoh": 0.0, "loss/logits": 17.867034912109375, "loss/reg": 0.0, "step": 10 }, { "epoch": 0.00013157894736842105, "grad_norm": 408.0, "grad_norm_var": 138977.05, "learning_rate": 2e-05, "loss": 34.8921, "loss/crossentropy": 14.647695541381836, "loss/hidden": 18.8125, "loss/incoh": 0.0, "loss/logits": 15.686719226837159, "loss/reg": 0.0, "step": 20 }, { "epoch": 0.00019736842105263157, "grad_norm": 296.0, "grad_norm_var": 17795.066666666666, "learning_rate": 3e-05, "loss": 32.2418, "loss/crossentropy": 14.60380687713623, "loss/hidden": 18.6375, "loss/incoh": 0.0, "loss/logits": 13.480152130126953, "loss/reg": 0.0, "step": 30 }, { "epoch": 0.0002631578947368421, "grad_norm": 26.125, "grad_norm_var": 30349.038525390624, "learning_rate": 4e-05, "loss": 28.5558, "loss/crossentropy": 15.434869766235352, "loss/hidden": 18.525, "loss/incoh": 0.0, "loss/logits": 10.245227527618407, "loss/reg": 0.0, "step": 40 }, { "epoch": 0.0003289473684210526, "grad_norm": 51.25, "grad_norm_var": 160.51717122395834, "learning_rate": 5e-05, "loss": 27.8535, "loss/crossentropy": 13.159739780426026, "loss/hidden": 18.4, "loss/incoh": 0.0, "loss/logits": 10.088242149353027, "loss/reg": 0.0, "step": 50 }, { "epoch": 0.00039473684210526315, "grad_norm": 22.125, "grad_norm_var": 1.3315524858929522e+17, "learning_rate": 6e-05, "loss": 26.8127, "loss/crossentropy": 10.868320941925049, "loss/hidden": 18.1375, "loss/incoh": 0.0, "loss/logits": 8.643436527252197, "loss/reg": 0.0, "step": 60 }, { "epoch": 0.0004605263157894737, "grad_norm": 25.625, "grad_norm_var": 1.3315524982997056e+17, "learning_rate": 7e-05, "loss": 25.6909, "loss/crossentropy": 10.59145736694336, "loss/hidden": 17.65, "loss/incoh": 0.0, "loss/logits": 8.497289371490478, "loss/reg": 0.0, "step": 70 }, { "epoch": 0.0005263157894736842, "grad_norm": 32.75, "grad_norm_var": 206.156103515625, "learning_rate": 8e-05, "loss": 24.5919, "loss/crossentropy": 10.117278575897217, "loss/hidden": 17.025, "loss/incoh": 0.0, "loss/logits": 8.660355854034425, "loss/reg": 0.0, "step": 80 }, { "epoch": 0.0005921052631578948, "grad_norm": 128.0, "grad_norm_var": 1129.743212890625, "learning_rate": 9e-05, "loss": 23.5218, "loss/crossentropy": 9.528321361541748, "loss/hidden": 16.3125, "loss/incoh": 0.0, "loss/logits": 6.695920991897583, "loss/reg": 0.0, "step": 90 }, { "epoch": 0.0006578947368421052, "grad_norm": 68.5, "grad_norm_var": 439.56223958333334, "learning_rate": 0.0001, "loss": 23.2768, "loss/crossentropy": 10.33118553161621, "loss/hidden": 16.5125, "loss/incoh": 0.0, "loss/logits": 7.274227619171143, "loss/reg": 0.0, "step": 100 }, { "epoch": 0.0007236842105263158, "grad_norm": 73.5, "grad_norm_var": 2741.439518229167, "learning_rate": 0.0001, "loss": 22.6559, "loss/crossentropy": 9.676900672912598, "loss/hidden": 16.21875, "loss/incoh": 0.0, "loss/logits": 7.22898006439209, "loss/reg": 0.0, "step": 110 }, { "epoch": 0.0007894736842105263, "grad_norm": 115.5, "grad_norm_var": 1451.31875, "learning_rate": 0.0001, "loss": 22.7316, "loss/crossentropy": 9.472201251983643, "loss/hidden": 16.03125, "loss/incoh": 0.0, "loss/logits": 6.231936502456665, "loss/reg": 0.0, "step": 120 }, { "epoch": 0.0008552631578947369, "grad_norm": 32.25, "grad_norm_var": 939.03515625, "learning_rate": 0.0001, "loss": 22.1075, "loss/crossentropy": 9.909808540344239, "loss/hidden": 16.075, "loss/incoh": 0.0, "loss/logits": 6.337067890167236, "loss/reg": 0.0, "step": 130 }, { "epoch": 0.0009210526315789473, "grad_norm": 37.75, "grad_norm_var": 222.684375, "learning_rate": 0.0001, "loss": 22.103, "loss/crossentropy": 9.516400051116943, "loss/hidden": 15.8875, "loss/incoh": 0.0, "loss/logits": 5.677393054962158, "loss/reg": 0.0, "step": 140 }, { "epoch": 0.000986842105263158, "grad_norm": 46.25, "grad_norm_var": 1221.1551432291667, "learning_rate": 0.0001, "loss": 21.5532, "loss/crossentropy": 9.398476314544677, "loss/hidden": 15.70625, "loss/incoh": 0.0, "loss/logits": 6.764282178878784, "loss/reg": 0.0, "step": 150 }, { "epoch": 0.0010526315789473684, "grad_norm": 42.0, "grad_norm_var": 416.9809895833333, "learning_rate": 0.0001, "loss": 20.8381, "loss/crossentropy": 9.098556232452392, "loss/hidden": 15.225, "loss/incoh": 0.0, "loss/logits": 5.8288336277008055, "loss/reg": 0.0, "step": 160 }, { "epoch": 0.0011184210526315789, "grad_norm": 33.25, "grad_norm_var": 334.8979166666667, "learning_rate": 0.0001, "loss": 18.9261, "loss/crossentropy": 7.879999303817749, "loss/hidden": 13.65, "loss/incoh": 0.0, "loss/logits": 5.353159737586975, "loss/reg": 0.0, "step": 170 }, { "epoch": 0.0011842105263157896, "grad_norm": 14.375, "grad_norm_var": 250.13333333333333, "learning_rate": 0.0001, "loss": 16.5004, "loss/crossentropy": 6.681199312210083, "loss/hidden": 12.0125, "loss/incoh": 0.0, "loss/logits": 4.6115447044372555, "loss/reg": 0.0, "step": 180 }, { "epoch": 0.00125, "grad_norm": 12.5625, "grad_norm_var": 119.9546875, "learning_rate": 0.0001, "loss": 14.1282, "loss/crossentropy": 5.785468435287475, "loss/hidden": 10.725, "loss/incoh": 0.0, "loss/logits": 3.489908790588379, "loss/reg": 0.0, "step": 190 }, { "epoch": 0.0013157894736842105, "grad_norm": 12.625, "grad_norm_var": 32.467431640625, "learning_rate": 0.0001, "loss": 12.8216, "loss/crossentropy": 4.923622274398804, "loss/hidden": 9.675, "loss/incoh": 0.0, "loss/logits": 3.1496715903282166, "loss/reg": 0.0, "step": 200 }, { "epoch": 0.001381578947368421, "grad_norm": 26.375, "grad_norm_var": 22.6759765625, "learning_rate": 0.0001, "loss": 11.5516, "loss/crossentropy": 4.429650473594665, "loss/hidden": 8.875, "loss/incoh": 0.0, "loss/logits": 2.247162342071533, "loss/reg": 0.0, "step": 210 }, { "epoch": 0.0014473684210526317, "grad_norm": 35.5, "grad_norm_var": 45.67233072916667, "learning_rate": 0.0001, "loss": 10.495, "loss/crossentropy": 4.112493515014648, "loss/hidden": 8.346875, "loss/incoh": 0.0, "loss/logits": 2.2232163310050965, "loss/reg": 0.0, "step": 220 }, { "epoch": 0.0015131578947368421, "grad_norm": 37.5, "grad_norm_var": 58.01770833333333, "learning_rate": 0.0001, "loss": 9.9703, "loss/crossentropy": 4.019938945770264, "loss/hidden": 8.015625, "loss/incoh": 0.0, "loss/logits": 2.0913679361343385, "loss/reg": 0.0, "step": 230 }, { "epoch": 0.0015789473684210526, "grad_norm": 36.25, "grad_norm_var": 53.95390625, "learning_rate": 0.0001, "loss": 9.5394, "loss/crossentropy": 3.9986461877822874, "loss/hidden": 7.490625, "loss/incoh": 0.0, "loss/logits": 1.801698899269104, "loss/reg": 0.0, "step": 240 }, { "epoch": 0.001644736842105263, "grad_norm": 34.0, "grad_norm_var": 913.9712890625, "learning_rate": 0.0001, "loss": 9.1248, "loss/crossentropy": 3.7174268484115602, "loss/hidden": 7.721875, "loss/incoh": 0.0, "loss/logits": 2.0452203273773195, "loss/reg": 0.0, "step": 250 }, { "epoch": 0.0017105263157894738, "grad_norm": 30.625, "grad_norm_var": 901.7176432291667, "learning_rate": 0.0001, "loss": 8.7525, "loss/crossentropy": 3.6645556688308716, "loss/hidden": 7.36875, "loss/incoh": 0.0, "loss/logits": 1.4742069363594055, "loss/reg": 0.0, "step": 260 }, { "epoch": 0.0017763157894736842, "grad_norm": 40.0, "grad_norm_var": 110.65826822916667, "learning_rate": 0.0001, "loss": 8.8113, "loss/crossentropy": 3.116227722167969, "loss/hidden": 7.31875, "loss/incoh": 0.0, "loss/logits": 1.6629727721214294, "loss/reg": 0.0, "step": 270 }, { "epoch": 0.0018421052631578947, "grad_norm": 34.75, "grad_norm_var": 122.43170572916667, "learning_rate": 0.0001, "loss": 8.5312, "loss/crossentropy": 3.413420820236206, "loss/hidden": 7.38125, "loss/incoh": 0.0, "loss/logits": 1.3304585099220276, "loss/reg": 0.0, "step": 280 }, { "epoch": 0.0019078947368421052, "grad_norm": 30.25, "grad_norm_var": 50.46640625, "learning_rate": 0.0001, "loss": 8.1172, "loss/crossentropy": 3.313588786125183, "loss/hidden": 6.721875, "loss/incoh": 0.0, "loss/logits": 1.1558095216751099, "loss/reg": 0.0, "step": 290 }, { "epoch": 0.001973684210526316, "grad_norm": 33.5, "grad_norm_var": 66.95618489583333, "learning_rate": 0.0001, "loss": 8.4831, "loss/crossentropy": 3.286371445655823, "loss/hidden": 6.971875, "loss/incoh": 0.0, "loss/logits": 1.4257299542427062, "loss/reg": 0.0, "step": 300 }, { "epoch": 0.0020394736842105263, "grad_norm": 37.0, "grad_norm_var": 39.40930989583333, "learning_rate": 0.0001, "loss": 8.1428, "loss/crossentropy": 3.1924397230148314, "loss/hidden": 6.7375, "loss/incoh": 0.0, "loss/logits": 1.129437392950058, "loss/reg": 0.0, "step": 310 }, { "epoch": 0.002105263157894737, "grad_norm": 35.75, "grad_norm_var": 60.73125, "learning_rate": 0.0001, "loss": 8.1236, "loss/crossentropy": 3.217240035533905, "loss/hidden": 7.0375, "loss/incoh": 0.0, "loss/logits": 1.216874635219574, "loss/reg": 0.0, "step": 320 }, { "epoch": 0.0021710526315789473, "grad_norm": 32.75, "grad_norm_var": 21.7181640625, "learning_rate": 0.0001, "loss": 8.0115, "loss/crossentropy": 3.2230591058731077, "loss/hidden": 6.665625, "loss/incoh": 0.0, "loss/logits": 1.165043205022812, "loss/reg": 0.0, "step": 330 }, { "epoch": 0.0022368421052631577, "grad_norm": 38.75, "grad_norm_var": 247.53932291666666, "learning_rate": 0.0001, "loss": 7.8717, "loss/crossentropy": 3.491655874252319, "loss/hidden": 6.628125, "loss/incoh": 0.0, "loss/logits": 1.1553439140319823, "loss/reg": 0.0, "step": 340 }, { "epoch": 0.002302631578947368, "grad_norm": 29.0, "grad_norm_var": 243.79576822916667, "learning_rate": 0.0001, "loss": 7.8354, "loss/crossentropy": 3.3709447622299193, "loss/hidden": 6.646875, "loss/incoh": 0.0, "loss/logits": 1.1052397668361664, "loss/reg": 0.0, "step": 350 }, { "epoch": 0.002368421052631579, "grad_norm": 28.75, "grad_norm_var": 40.61640625, "learning_rate": 0.0001, "loss": 7.5894, "loss/crossentropy": 3.0430339336395265, "loss/hidden": 6.778125, "loss/incoh": 0.0, "loss/logits": 1.152853137254715, "loss/reg": 0.0, "step": 360 }, { "epoch": 0.0024342105263157896, "grad_norm": 50.0, "grad_norm_var": 58.61640625, "learning_rate": 0.0001, "loss": 7.7607, "loss/crossentropy": 3.461497259140015, "loss/hidden": 6.640625, "loss/incoh": 0.0, "loss/logits": 1.2907899796962738, "loss/reg": 0.0, "step": 370 }, { "epoch": 0.0025, "grad_norm": 23.875, "grad_norm_var": 50.36145833333333, "learning_rate": 0.0001, "loss": 7.5895, "loss/crossentropy": 2.8183989763259887, "loss/hidden": 6.36875, "loss/incoh": 0.0, "loss/logits": 0.9597792446613311, "loss/reg": 0.0, "step": 380 }, { "epoch": 0.0025657894736842105, "grad_norm": 22.75, "grad_norm_var": 18.348958333333332, "learning_rate": 0.0001, "loss": 7.4024, "loss/crossentropy": 3.0434406876564024, "loss/hidden": 6.425, "loss/incoh": 0.0, "loss/logits": 1.1219703614711762, "loss/reg": 0.0, "step": 390 }, { "epoch": 0.002631578947368421, "grad_norm": 18.75, "grad_norm_var": 75.46451822916667, "learning_rate": 0.0001, "loss": 7.4663, "loss/crossentropy": 2.9813458204269407, "loss/hidden": 6.334375, "loss/incoh": 0.0, "loss/logits": 1.0157755613327026, "loss/reg": 0.0, "step": 400 }, { "epoch": 0.0026973684210526315, "grad_norm": 20.25, "grad_norm_var": 14.660416666666666, "learning_rate": 0.0001, "loss": 7.4425, "loss/crossentropy": 3.030743360519409, "loss/hidden": 6.23125, "loss/incoh": 0.0, "loss/logits": 1.0403401851654053, "loss/reg": 0.0, "step": 410 }, { "epoch": 0.002763157894736842, "grad_norm": 19.75, "grad_norm_var": 6.712239583333333, "learning_rate": 0.0001, "loss": 7.1935, "loss/crossentropy": 3.044888973236084, "loss/hidden": 6.1125, "loss/incoh": 0.0, "loss/logits": 0.920581477880478, "loss/reg": 0.0, "step": 420 }, { "epoch": 0.002828947368421053, "grad_norm": 18.625, "grad_norm_var": 5.847330729166667, "learning_rate": 0.0001, "loss": 7.0053, "loss/crossentropy": 3.2355963468551634, "loss/hidden": 6.015625, "loss/incoh": 0.0, "loss/logits": 0.9773828387260437, "loss/reg": 0.0, "step": 430 }, { "epoch": 0.0028947368421052633, "grad_norm": 21.875, "grad_norm_var": 9.627067057291667, "learning_rate": 0.0001, "loss": 6.9973, "loss/crossentropy": 3.3775979042053224, "loss/hidden": 6.071875, "loss/incoh": 0.0, "loss/logits": 1.0445533573627472, "loss/reg": 0.0, "step": 440 }, { "epoch": 0.0029605263157894738, "grad_norm": 19.25, "grad_norm_var": 10.170947265625, "learning_rate": 0.0001, "loss": 6.9686, "loss/crossentropy": 3.0577521562576293, "loss/hidden": 5.803125, "loss/incoh": 0.0, "loss/logits": 0.8946591019630432, "loss/reg": 0.0, "step": 450 }, { "epoch": 0.0030263157894736843, "grad_norm": 19.75, "grad_norm_var": 5.329166666666667, "learning_rate": 0.0001, "loss": 6.8021, "loss/crossentropy": 3.2570735692977903, "loss/hidden": 5.678125, "loss/incoh": 0.0, "loss/logits": 0.860103166103363, "loss/reg": 0.0, "step": 460 }, { "epoch": 0.0030921052631578947, "grad_norm": 11.5, "grad_norm_var": 5.882535807291666, "learning_rate": 0.0001, "loss": 6.6494, "loss/crossentropy": 3.045071005821228, "loss/hidden": 5.759375, "loss/incoh": 0.0, "loss/logits": 0.890488612651825, "loss/reg": 0.0, "step": 470 }, { "epoch": 0.003157894736842105, "grad_norm": 12.0, "grad_norm_var": 3.778369140625, "learning_rate": 0.0001, "loss": 6.6399, "loss/crossentropy": 2.955122375488281, "loss/hidden": 5.559375, "loss/incoh": 0.0, "loss/logits": 0.7988932132720947, "loss/reg": 0.0, "step": 480 }, { "epoch": 0.0032236842105263157, "grad_norm": 14.625, "grad_norm_var": 3.397509765625, "learning_rate": 0.0001, "loss": 6.6006, "loss/crossentropy": 2.8290895342826845, "loss/hidden": 5.73125, "loss/incoh": 0.0, "loss/logits": 0.8468542337417603, "loss/reg": 0.0, "step": 490 }, { "epoch": 0.003289473684210526, "grad_norm": 11.5, "grad_norm_var": 4.100113932291666, "learning_rate": 0.0001, "loss": 6.4823, "loss/crossentropy": 2.7418078184127808, "loss/hidden": 5.665625, "loss/incoh": 0.0, "loss/logits": 0.7723756909370423, "loss/reg": 0.0, "step": 500 }, { "epoch": 0.003355263157894737, "grad_norm": 11.5625, "grad_norm_var": 2.79375, "learning_rate": 0.0001, "loss": 6.4511, "loss/crossentropy": 3.1031686782836916, "loss/hidden": 5.721875, "loss/incoh": 0.0, "loss/logits": 0.9047020822763443, "loss/reg": 0.0, "step": 510 }, { "epoch": 0.0034210526315789475, "grad_norm": 10.625, "grad_norm_var": 0.8618326822916667, "learning_rate": 0.0001, "loss": 6.3114, "loss/crossentropy": 2.7031071186065674, "loss/hidden": 5.3875, "loss/incoh": 0.0, "loss/logits": 0.7112044870853425, "loss/reg": 0.0, "step": 520 }, { "epoch": 0.003486842105263158, "grad_norm": 9.625, "grad_norm_var": 94.71066080729166, "learning_rate": 0.0001, "loss": 6.2538, "loss/crossentropy": 2.8632609844207764, "loss/hidden": 5.784375, "loss/incoh": 0.0, "loss/logits": 0.8032145172357559, "loss/reg": 0.0, "step": 530 }, { "epoch": 0.0035526315789473684, "grad_norm": 12.0, "grad_norm_var": 2.7383748372395833, "learning_rate": 0.0001, "loss": 6.1042, "loss/crossentropy": 3.037100100517273, "loss/hidden": 5.25625, "loss/incoh": 0.0, "loss/logits": 0.7572773277759552, "loss/reg": 0.0, "step": 540 }, { "epoch": 0.003618421052631579, "grad_norm": 10.9375, "grad_norm_var": 2.5140584309895835, "learning_rate": 0.0001, "loss": 6.1353, "loss/crossentropy": 2.979613184928894, "loss/hidden": 5.196875, "loss/incoh": 0.0, "loss/logits": 0.7719000339508056, "loss/reg": 0.0, "step": 550 }, { "epoch": 0.0036842105263157894, "grad_norm": 8.0625, "grad_norm_var": 1.6379557291666667, "learning_rate": 0.0001, "loss": 6.0637, "loss/crossentropy": 2.687799036502838, "loss/hidden": 5.51875, "loss/incoh": 0.0, "loss/logits": 0.9166353821754456, "loss/reg": 0.0, "step": 560 }, { "epoch": 0.00375, "grad_norm": 9.9375, "grad_norm_var": 18.912353515625, "learning_rate": 0.0001, "loss": 6.0506, "loss/crossentropy": 2.935390818119049, "loss/hidden": 5.165625, "loss/incoh": 0.0, "loss/logits": 0.716228786110878, "loss/reg": 0.0, "step": 570 }, { "epoch": 0.0038157894736842103, "grad_norm": 7.25, "grad_norm_var": 18.753641764322918, "learning_rate": 0.0001, "loss": 5.9637, "loss/crossentropy": 2.7780107259750366, "loss/hidden": 5.190625, "loss/incoh": 0.0, "loss/logits": 0.7067849993705749, "loss/reg": 0.0, "step": 580 }, { "epoch": 0.0038815789473684212, "grad_norm": 6.84375, "grad_norm_var": 2.39361572265625, "learning_rate": 0.0001, "loss": 5.9361, "loss/crossentropy": 3.0060938119888307, "loss/hidden": 5.225, "loss/incoh": 0.0, "loss/logits": 0.7271955192089081, "loss/reg": 0.0, "step": 590 }, { "epoch": 0.003947368421052632, "grad_norm": 7.6875, "grad_norm_var": 0.75357666015625, "learning_rate": 0.0001, "loss": 5.7669, "loss/crossentropy": 2.691058301925659, "loss/hidden": 4.825, "loss/incoh": 0.0, "loss/logits": 0.6389567136764527, "loss/reg": 0.0, "step": 600 }, { "epoch": 0.004013157894736842, "grad_norm": 6.78125, "grad_norm_var": 2.53160400390625, "learning_rate": 0.0001, "loss": 5.7022, "loss/crossentropy": 2.7504406690597536, "loss/hidden": 5.121875, "loss/incoh": 0.0, "loss/logits": 0.8024603247642517, "loss/reg": 0.0, "step": 610 }, { "epoch": 0.004078947368421053, "grad_norm": 6.875, "grad_norm_var": 2.7155558268229165, "learning_rate": 0.0001, "loss": 5.6145, "loss/crossentropy": 2.9313748240470887, "loss/hidden": 5.00625, "loss/incoh": 0.0, "loss/logits": 0.7257195949554444, "loss/reg": 0.0, "step": 620 }, { "epoch": 0.0041447368421052636, "grad_norm": 5.8125, "grad_norm_var": 0.3809733072916667, "learning_rate": 0.0001, "loss": 5.6209, "loss/crossentropy": 2.8686537384986877, "loss/hidden": 5.05625, "loss/incoh": 0.0, "loss/logits": 0.7894174456596375, "loss/reg": 0.0, "step": 630 }, { "epoch": 0.004210526315789474, "grad_norm": 10.5, "grad_norm_var": 1.7327962239583334, "learning_rate": 0.0001, "loss": 5.6566, "loss/crossentropy": 2.910102880001068, "loss/hidden": 4.95, "loss/incoh": 0.0, "loss/logits": 0.7542287766933441, "loss/reg": 0.0, "step": 640 }, { "epoch": 0.0042763157894736845, "grad_norm": 6.0, "grad_norm_var": 1.90406494140625, "learning_rate": 0.0001, "loss": 5.6548, "loss/crossentropy": 3.0497835516929626, "loss/hidden": 5.075, "loss/incoh": 0.0, "loss/logits": 0.6385725855827331, "loss/reg": 0.0, "step": 650 }, { "epoch": 0.0043421052631578945, "grad_norm": 5.90625, "grad_norm_var": 6.811181640625, "learning_rate": 0.0001, "loss": 5.4435, "loss/crossentropy": 2.839945673942566, "loss/hidden": 4.621875, "loss/incoh": 0.0, "loss/logits": 0.608047366142273, "loss/reg": 0.0, "step": 660 }, { "epoch": 0.0044078947368421054, "grad_norm": 8.5, "grad_norm_var": 1.419775390625, "learning_rate": 0.0001, "loss": 5.4264, "loss/crossentropy": 2.6664235949516297, "loss/hidden": 4.71875, "loss/incoh": 0.0, "loss/logits": 0.5786069691181183, "loss/reg": 0.0, "step": 670 }, { "epoch": 0.0044736842105263155, "grad_norm": 5.25, "grad_norm_var": 1.222509765625, "learning_rate": 0.0001, "loss": 5.4175, "loss/crossentropy": 2.7476831912994384, "loss/hidden": 4.646875, "loss/incoh": 0.0, "loss/logits": 0.6524303257465363, "loss/reg": 0.0, "step": 680 }, { "epoch": 0.004539473684210526, "grad_norm": 5.15625, "grad_norm_var": 0.8695271809895834, "learning_rate": 0.0001, "loss": 5.2974, "loss/crossentropy": 2.718129062652588, "loss/hidden": 4.69375, "loss/incoh": 0.0, "loss/logits": 0.679440614581108, "loss/reg": 0.0, "step": 690 }, { "epoch": 0.004605263157894736, "grad_norm": 5.875, "grad_norm_var": 1.3825358072916667, "learning_rate": 0.0001, "loss": 5.3809, "loss/crossentropy": 2.896076512336731, "loss/hidden": 4.503125, "loss/incoh": 0.0, "loss/logits": 0.6036079049110412, "loss/reg": 0.0, "step": 700 }, { "epoch": 0.004671052631578947, "grad_norm": 5.875, "grad_norm_var": 0.94888916015625, "learning_rate": 0.0001, "loss": 5.2593, "loss/crossentropy": 2.765268421173096, "loss/hidden": 4.803125, "loss/incoh": 0.0, "loss/logits": 0.7337387800216675, "loss/reg": 0.0, "step": 710 }, { "epoch": 0.004736842105263158, "grad_norm": 4.75, "grad_norm_var": 1.0287760416666667, "learning_rate": 0.0001, "loss": 5.1503, "loss/crossentropy": 2.6812595248222353, "loss/hidden": 4.75625, "loss/incoh": 0.0, "loss/logits": 0.6710720509290695, "loss/reg": 0.0, "step": 720 }, { "epoch": 0.004802631578947368, "grad_norm": 6.0625, "grad_norm_var": 86.4677734375, "learning_rate": 0.0001, "loss": 5.3144, "loss/crossentropy": 2.7298573732376097, "loss/hidden": 4.484375, "loss/incoh": 0.0, "loss/logits": 0.6176893144845963, "loss/reg": 0.0, "step": 730 }, { "epoch": 0.004868421052631579, "grad_norm": 4.875, "grad_norm_var": 85.56676025390625, "learning_rate": 0.0001, "loss": 5.131, "loss/crossentropy": 2.823095703125, "loss/hidden": 4.571875, "loss/incoh": 0.0, "loss/logits": 0.6797973781824111, "loss/reg": 0.0, "step": 740 }, { "epoch": 0.004934210526315789, "grad_norm": 4.96875, "grad_norm_var": 4.8291015625, "learning_rate": 0.0001, "loss": 5.1654, "loss/crossentropy": 2.901310992240906, "loss/hidden": 4.61875, "loss/incoh": 0.0, "loss/logits": 0.737342044711113, "loss/reg": 0.0, "step": 750 }, { "epoch": 0.005, "grad_norm": 9.375, "grad_norm_var": 3.796728515625, "learning_rate": 0.0001, "loss": 5.0448, "loss/crossentropy": 2.5148804664611815, "loss/hidden": 4.48125, "loss/incoh": 0.0, "loss/logits": 0.5650010257959366, "loss/reg": 0.0, "step": 760 }, { "epoch": 0.00506578947368421, "grad_norm": 4.90625, "grad_norm_var": 18.020947265625, "learning_rate": 0.0001, "loss": 5.0271, "loss/crossentropy": 2.6732282817363737, "loss/hidden": 4.48125, "loss/incoh": 0.0, "loss/logits": 0.5763083070516586, "loss/reg": 0.0, "step": 770 }, { "epoch": 0.005131578947368421, "grad_norm": 4.71875, "grad_norm_var": 17.617118326822915, "learning_rate": 0.0001, "loss": 5.0057, "loss/crossentropy": 2.927682900428772, "loss/hidden": 4.578125, "loss/incoh": 0.0, "loss/logits": 0.680091741681099, "loss/reg": 0.0, "step": 780 }, { "epoch": 0.005197368421052632, "grad_norm": 4.53125, "grad_norm_var": 1.5018880208333334, "learning_rate": 0.0001, "loss": 5.0918, "loss/crossentropy": 2.7974375009536745, "loss/hidden": 4.75, "loss/incoh": 0.0, "loss/logits": 0.709694892168045, "loss/reg": 0.0, "step": 790 }, { "epoch": 0.005263157894736842, "grad_norm": 4.46875, "grad_norm_var": 20.911812337239585, "learning_rate": 0.0001, "loss": 4.9367, "loss/crossentropy": 2.875410461425781, "loss/hidden": 4.578125, "loss/incoh": 0.0, "loss/logits": 0.8118050575256348, "loss/reg": 0.0, "step": 800 }, { "epoch": 0.005328947368421053, "grad_norm": 5.375, "grad_norm_var": 2.1762980143229167, "learning_rate": 0.0001, "loss": 4.8483, "loss/crossentropy": 2.665932035446167, "loss/hidden": 4.3140625, "loss/incoh": 0.0, "loss/logits": 0.5815477341413497, "loss/reg": 0.0, "step": 810 }, { "epoch": 0.005394736842105263, "grad_norm": 4.96875, "grad_norm_var": 0.6493235270182292, "learning_rate": 0.0001, "loss": 4.9314, "loss/crossentropy": 2.5820897936820986, "loss/hidden": 4.15, "loss/incoh": 0.0, "loss/logits": 0.4888360023498535, "loss/reg": 0.0, "step": 820 }, { "epoch": 0.005460526315789474, "grad_norm": 3.890625, "grad_norm_var": 0.2476470947265625, "learning_rate": 0.0001, "loss": 4.7859, "loss/crossentropy": 2.647566497325897, "loss/hidden": 4.3484375, "loss/incoh": 0.0, "loss/logits": 0.5885868102312088, "loss/reg": 0.0, "step": 830 }, { "epoch": 0.005526315789473684, "grad_norm": 4.9375, "grad_norm_var": 0.29011128743489584, "learning_rate": 0.0001, "loss": 4.7829, "loss/crossentropy": 2.583667039871216, "loss/hidden": 4.2828125, "loss/incoh": 0.0, "loss/logits": 0.4921345829963684, "loss/reg": 0.0, "step": 840 }, { "epoch": 0.005592105263157895, "grad_norm": 4.6875, "grad_norm_var": 0.397900390625, "learning_rate": 0.0001, "loss": 4.8821, "loss/crossentropy": 2.798052990436554, "loss/hidden": 4.2625, "loss/incoh": 0.0, "loss/logits": 0.7617209196090698, "loss/reg": 0.0, "step": 850 }, { "epoch": 0.005657894736842106, "grad_norm": 4.21875, "grad_norm_var": 0.29501546223958336, "learning_rate": 0.0001, "loss": 4.9069, "loss/crossentropy": 2.8075502276420594, "loss/hidden": 4.1328125, "loss/incoh": 0.0, "loss/logits": 0.5111032873392105, "loss/reg": 0.0, "step": 860 }, { "epoch": 0.005723684210526316, "grad_norm": 4.21875, "grad_norm_var": 0.24931233723958332, "learning_rate": 0.0001, "loss": 4.8049, "loss/crossentropy": 2.400660240650177, "loss/hidden": 4.3859375, "loss/incoh": 0.0, "loss/logits": 0.6145752131938934, "loss/reg": 0.0, "step": 870 }, { "epoch": 0.005789473684210527, "grad_norm": 4.75, "grad_norm_var": 1.2445271809895833, "learning_rate": 0.0001, "loss": 4.7333, "loss/crossentropy": 2.631825530529022, "loss/hidden": 4.3640625, "loss/incoh": 0.0, "loss/logits": 0.5737248510122299, "loss/reg": 0.0, "step": 880 }, { "epoch": 0.005855263157894737, "grad_norm": 4.90625, "grad_norm_var": 1.9954742431640624, "learning_rate": 0.0001, "loss": 4.6882, "loss/crossentropy": 2.9550926446914674, "loss/hidden": 4.1046875, "loss/incoh": 0.0, "loss/logits": 0.5351347416639328, "loss/reg": 0.0, "step": 890 }, { "epoch": 0.0059210526315789476, "grad_norm": 4.15625, "grad_norm_var": 0.8051991780598958, "learning_rate": 0.0001, "loss": 4.7267, "loss/crossentropy": 2.322158467769623, "loss/hidden": 4.2359375, "loss/incoh": 0.0, "loss/logits": 0.4901482403278351, "loss/reg": 0.0, "step": 900 }, { "epoch": 0.005986842105263158, "grad_norm": 4.59375, "grad_norm_var": 0.5987589518229167, "learning_rate": 0.0001, "loss": 4.7552, "loss/crossentropy": 2.163175904750824, "loss/hidden": 4.475, "loss/incoh": 0.0, "loss/logits": 0.5075026541948319, "loss/reg": 0.0, "step": 910 }, { "epoch": 0.0060526315789473685, "grad_norm": 4.4375, "grad_norm_var": 13.234305826822917, "learning_rate": 0.0001, "loss": 4.8996, "loss/crossentropy": 2.5448178887367248, "loss/hidden": 4.078125, "loss/incoh": 0.0, "loss/logits": 0.5253018319606781, "loss/reg": 0.0, "step": 920 }, { "epoch": 0.0061184210526315785, "grad_norm": 4.09375, "grad_norm_var": 4.241829427083333, "learning_rate": 0.0001, "loss": 4.5704, "loss/crossentropy": 2.551811099052429, "loss/hidden": 4.0546875, "loss/incoh": 0.0, "loss/logits": 0.48250589668750765, "loss/reg": 0.0, "step": 930 }, { "epoch": 0.0061842105263157894, "grad_norm": 4.71875, "grad_norm_var": 0.13075764973958334, "learning_rate": 0.0001, "loss": 4.6287, "loss/crossentropy": 2.8518677711486817, "loss/hidden": 4.134375, "loss/incoh": 0.0, "loss/logits": 0.5851545244455337, "loss/reg": 0.0, "step": 940 }, { "epoch": 0.00625, "grad_norm": 5.65625, "grad_norm_var": 1.0635701497395833, "learning_rate": 0.0001, "loss": 4.6855, "loss/crossentropy": 2.8685452222824095, "loss/hidden": 3.9453125, "loss/incoh": 0.0, "loss/logits": 0.5299171417951584, "loss/reg": 0.0, "step": 950 }, { "epoch": 0.00631578947368421, "grad_norm": 4.4375, "grad_norm_var": 0.2560506184895833, "learning_rate": 0.0001, "loss": 4.6193, "loss/crossentropy": 2.73275808095932, "loss/hidden": 4.121875, "loss/incoh": 0.0, "loss/logits": 0.4992497324943542, "loss/reg": 0.0, "step": 960 }, { "epoch": 0.006381578947368421, "grad_norm": 4.0625, "grad_norm_var": 0.3540598551432292, "learning_rate": 0.0001, "loss": 4.6436, "loss/crossentropy": 2.6122008085250856, "loss/hidden": 4.084375, "loss/incoh": 0.0, "loss/logits": 0.5424144893884659, "loss/reg": 0.0, "step": 970 }, { "epoch": 0.006447368421052631, "grad_norm": 4.0, "grad_norm_var": 1.2561260533134024e+17, "learning_rate": 0.0001, "loss": 4.7384, "loss/crossentropy": 2.747082471847534, "loss/hidden": 3.975, "loss/incoh": 0.0, "loss/logits": 0.4949415147304535, "loss/reg": 0.0, "step": 980 }, { "epoch": 0.006513157894736842, "grad_norm": 4.6875, "grad_norm_var": 15.184098307291666, "learning_rate": 0.0001, "loss": 4.6154, "loss/crossentropy": 2.6343679666519164, "loss/hidden": 4.009375, "loss/incoh": 0.0, "loss/logits": 0.46223918795585633, "loss/reg": 0.0, "step": 990 }, { "epoch": 0.006578947368421052, "grad_norm": 4.84375, "grad_norm_var": 3.58121337890625, "learning_rate": 0.0001, "loss": 4.6456, "loss/crossentropy": 2.618730914592743, "loss/hidden": 4.1390625, "loss/incoh": 0.0, "loss/logits": 0.568024319410324, "loss/reg": 0.0, "step": 1000 }, { "epoch": 0.006644736842105263, "grad_norm": 10.1875, "grad_norm_var": 2.6229563395182294, "learning_rate": 0.0001, "loss": 4.5241, "loss/crossentropy": 2.7510436296463014, "loss/hidden": 3.9421875, "loss/incoh": 0.0, "loss/logits": 0.4975563734769821, "loss/reg": 0.0, "step": 1010 }, { "epoch": 0.006710526315789474, "grad_norm": 4.59375, "grad_norm_var": 9.821955362955729, "learning_rate": 0.0001, "loss": 4.5805, "loss/crossentropy": 2.773310422897339, "loss/hidden": 3.94375, "loss/incoh": 0.0, "loss/logits": 0.5206439226865769, "loss/reg": 0.0, "step": 1020 }, { "epoch": 0.006776315789473684, "grad_norm": 4.34375, "grad_norm_var": 8.482259114583334, "learning_rate": 0.0001, "loss": 4.4588, "loss/crossentropy": 2.6046599745750427, "loss/hidden": 3.88125, "loss/incoh": 0.0, "loss/logits": 0.44563083052635194, "loss/reg": 0.0, "step": 1030 }, { "epoch": 0.006842105263157895, "grad_norm": 4.0, "grad_norm_var": 5.523110961914062, "learning_rate": 0.0001, "loss": 4.6377, "loss/crossentropy": 2.6867428183555604, "loss/hidden": 4.0921875, "loss/incoh": 0.0, "loss/logits": 0.4769616901874542, "loss/reg": 0.0, "step": 1040 }, { "epoch": 0.006907894736842105, "grad_norm": 3.890625, "grad_norm_var": 9.615762329101562, "learning_rate": 0.0001, "loss": 4.5576, "loss/crossentropy": 2.6374024391174316, "loss/hidden": 3.915625, "loss/incoh": 0.0, "loss/logits": 0.5235758543014526, "loss/reg": 0.0, "step": 1050 }, { "epoch": 0.006973684210526316, "grad_norm": 3.796875, "grad_norm_var": 4.420035807291667, "learning_rate": 0.0001, "loss": 4.5061, "loss/crossentropy": 2.409494662284851, "loss/hidden": 4.0203125, "loss/incoh": 0.0, "loss/logits": 0.5403494209051132, "loss/reg": 0.0, "step": 1060 }, { "epoch": 0.007039473684210526, "grad_norm": 4.25, "grad_norm_var": 12.098844401041667, "learning_rate": 0.0001, "loss": 4.7608, "loss/crossentropy": 2.6496052145957947, "loss/hidden": 3.878125, "loss/incoh": 0.0, "loss/logits": 0.48798912912607195, "loss/reg": 0.0, "step": 1070 }, { "epoch": 0.007105263157894737, "grad_norm": 3.875, "grad_norm_var": 9.539867146809895, "learning_rate": 0.0001, "loss": 4.3815, "loss/crossentropy": 2.757003378868103, "loss/hidden": 4.11875, "loss/incoh": 0.0, "loss/logits": 0.4590116262435913, "loss/reg": 0.0, "step": 1080 }, { "epoch": 0.007171052631578947, "grad_norm": 8.5625, "grad_norm_var": 1.5923166910807292, "learning_rate": 0.0001, "loss": 4.44, "loss/crossentropy": 2.550069880485535, "loss/hidden": 4.071875, "loss/incoh": 0.0, "loss/logits": 0.5490242928266525, "loss/reg": 0.0, "step": 1090 }, { "epoch": 0.007236842105263158, "grad_norm": 3.96875, "grad_norm_var": 1.6922597249348958, "learning_rate": 0.0001, "loss": 4.4725, "loss/crossentropy": 2.606226110458374, "loss/hidden": 4.0875, "loss/incoh": 0.0, "loss/logits": 0.5686961978673934, "loss/reg": 0.0, "step": 1100 }, { "epoch": 0.007302631578947369, "grad_norm": 4.90625, "grad_norm_var": 0.9616282145182292, "learning_rate": 0.0001, "loss": 4.4416, "loss/crossentropy": 2.742388653755188, "loss/hidden": 3.8171875, "loss/incoh": 0.0, "loss/logits": 0.491252401471138, "loss/reg": 0.0, "step": 1110 }, { "epoch": 0.007368421052631579, "grad_norm": 4.375, "grad_norm_var": 0.8876302083333333, "learning_rate": 0.0001, "loss": 4.4119, "loss/crossentropy": 2.858240842819214, "loss/hidden": 3.796875, "loss/incoh": 0.0, "loss/logits": 0.4695854902267456, "loss/reg": 0.0, "step": 1120 }, { "epoch": 0.00743421052631579, "grad_norm": 3.453125, "grad_norm_var": 0.8083485921223958, "learning_rate": 0.0001, "loss": 4.3524, "loss/crossentropy": 2.7758461236953735, "loss/hidden": 3.90625, "loss/incoh": 0.0, "loss/logits": 0.5061279594898224, "loss/reg": 0.0, "step": 1130 }, { "epoch": 0.0075, "grad_norm": 3.78125, "grad_norm_var": 0.8257802327473959, "learning_rate": 0.0001, "loss": 4.296, "loss/crossentropy": 2.849539041519165, "loss/hidden": 3.7125, "loss/incoh": 0.0, "loss/logits": 0.45034482181072233, "loss/reg": 0.0, "step": 1140 }, { "epoch": 0.007565789473684211, "grad_norm": 4.0625, "grad_norm_var": 0.22603759765625, "learning_rate": 0.0001, "loss": 4.259, "loss/crossentropy": 2.6673625230789186, "loss/hidden": 3.903125, "loss/incoh": 0.0, "loss/logits": 0.45979970395565034, "loss/reg": 0.0, "step": 1150 }, { "epoch": 0.007631578947368421, "grad_norm": 3.984375, "grad_norm_var": 0.2490631103515625, "learning_rate": 0.0001, "loss": 4.2786, "loss/crossentropy": 2.6413369178771973, "loss/hidden": 3.7734375, "loss/incoh": 0.0, "loss/logits": 0.45444311797618864, "loss/reg": 0.0, "step": 1160 }, { "epoch": 0.007697368421052632, "grad_norm": 6.1875, "grad_norm_var": 0.7844960530598958, "learning_rate": 0.0001, "loss": 4.2937, "loss/crossentropy": 2.54203085899353, "loss/hidden": 3.6671875, "loss/incoh": 0.0, "loss/logits": 0.40531369894742963, "loss/reg": 0.0, "step": 1170 }, { "epoch": 0.0077631578947368425, "grad_norm": 3.65625, "grad_norm_var": 0.444384765625, "learning_rate": 0.0001, "loss": 4.3125, "loss/crossentropy": 2.772641682624817, "loss/hidden": 3.8296875, "loss/incoh": 0.0, "loss/logits": 0.4890771210193634, "loss/reg": 0.0, "step": 1180 }, { "epoch": 0.007828947368421053, "grad_norm": 4.15625, "grad_norm_var": 0.26546122233072916, "learning_rate": 0.0001, "loss": 4.3654, "loss/crossentropy": 2.661901044845581, "loss/hidden": 3.7015625, "loss/incoh": 0.0, "loss/logits": 0.439369834959507, "loss/reg": 0.0, "step": 1190 }, { "epoch": 0.007894736842105263, "grad_norm": 3.75, "grad_norm_var": 0.25321858723958335, "learning_rate": 0.0001, "loss": 4.2648, "loss/crossentropy": 2.3847479939460756, "loss/hidden": 4.028125, "loss/incoh": 0.0, "loss/logits": 0.4770447015762329, "loss/reg": 0.0, "step": 1200 }, { "epoch": 0.007960526315789473, "grad_norm": 3.921875, "grad_norm_var": 0.2809529622395833, "learning_rate": 0.0001, "loss": 4.3473, "loss/crossentropy": 2.244320285320282, "loss/hidden": 3.8109375, "loss/incoh": 0.0, "loss/logits": 0.40470985919237135, "loss/reg": 0.0, "step": 1210 }, { "epoch": 0.008026315789473683, "grad_norm": 4.28125, "grad_norm_var": 0.5173004150390625, "learning_rate": 0.0001, "loss": 4.3494, "loss/crossentropy": 2.5515334010124207, "loss/hidden": 4.109375, "loss/incoh": 0.0, "loss/logits": 0.5172385692596435, "loss/reg": 0.0, "step": 1220 }, { "epoch": 0.008092105263157895, "grad_norm": 3.5, "grad_norm_var": 0.2546539306640625, "learning_rate": 0.0001, "loss": 4.2293, "loss/crossentropy": 2.5860470652580263, "loss/hidden": 3.7171875, "loss/incoh": 0.0, "loss/logits": 0.4412991553544998, "loss/reg": 0.0, "step": 1230 }, { "epoch": 0.008157894736842105, "grad_norm": 3.1875, "grad_norm_var": 1.7592185950961664e+17, "learning_rate": 0.0001, "loss": 4.4752, "loss/crossentropy": 2.823768949508667, "loss/hidden": 3.703125, "loss/incoh": 0.0, "loss/logits": 0.5228973954916001, "loss/reg": 0.0, "step": 1240 }, { "epoch": 0.008223684210526315, "grad_norm": 3.21875, "grad_norm_var": 1.966552734375, "learning_rate": 0.0001, "loss": 4.2687, "loss/crossentropy": 2.523781180381775, "loss/hidden": 3.825, "loss/incoh": 0.0, "loss/logits": 0.4939111739397049, "loss/reg": 0.0, "step": 1250 }, { "epoch": 0.008289473684210527, "grad_norm": 2.796875, "grad_norm_var": 1.2621897379557292, "learning_rate": 0.0001, "loss": 4.1499, "loss/crossentropy": 2.5173804640769957, "loss/hidden": 3.7328125, "loss/incoh": 0.0, "loss/logits": 0.44251940250396726, "loss/reg": 0.0, "step": 1260 }, { "epoch": 0.008355263157894737, "grad_norm": 4.15625, "grad_norm_var": 3.665185546875, "learning_rate": 0.0001, "loss": 4.3378, "loss/crossentropy": 2.551619827747345, "loss/hidden": 3.9625, "loss/incoh": 0.0, "loss/logits": 0.5328098922967911, "loss/reg": 0.0, "step": 1270 }, { "epoch": 0.008421052631578947, "grad_norm": 3.53125, "grad_norm_var": 3.1853352864583333, "learning_rate": 0.0001, "loss": 4.2329, "loss/crossentropy": 2.3984143674373626, "loss/hidden": 3.9703125, "loss/incoh": 0.0, "loss/logits": 0.43639505505561826, "loss/reg": 0.0, "step": 1280 }, { "epoch": 0.008486842105263157, "grad_norm": 4.21875, "grad_norm_var": 0.21245015462239583, "learning_rate": 0.0001, "loss": 4.3604, "loss/crossentropy": 2.8736027479171753, "loss/hidden": 3.7875, "loss/incoh": 0.0, "loss/logits": 0.48700871765613557, "loss/reg": 0.0, "step": 1290 }, { "epoch": 0.008552631578947369, "grad_norm": 4.125, "grad_norm_var": 0.24678446451822916, "learning_rate": 0.0001, "loss": 4.1666, "loss/crossentropy": 2.714110541343689, "loss/hidden": 3.75625, "loss/incoh": 0.0, "loss/logits": 0.47638387978076935, "loss/reg": 0.0, "step": 1300 }, { "epoch": 0.008618421052631579, "grad_norm": 5.03125, "grad_norm_var": 0.36387430826822914, "learning_rate": 0.0001, "loss": 4.2879, "loss/crossentropy": 2.6876192927360534, "loss/hidden": 3.7, "loss/incoh": 0.0, "loss/logits": 0.49314437210559847, "loss/reg": 0.0, "step": 1310 }, { "epoch": 0.008684210526315789, "grad_norm": 3.84375, "grad_norm_var": 0.28189188639322915, "learning_rate": 0.0001, "loss": 4.1505, "loss/crossentropy": 2.4753618359565737, "loss/hidden": 3.684375, "loss/incoh": 0.0, "loss/logits": 0.40235219299793246, "loss/reg": 0.0, "step": 1320 }, { "epoch": 0.00875, "grad_norm": 3.1875, "grad_norm_var": 2.41715087890625, "learning_rate": 0.0001, "loss": 4.2095, "loss/crossentropy": 2.515524423122406, "loss/hidden": 3.996875, "loss/incoh": 0.0, "loss/logits": 0.5540182292461395, "loss/reg": 0.0, "step": 1330 }, { "epoch": 0.008815789473684211, "grad_norm": 3.8125, "grad_norm_var": 2.259993489583333, "learning_rate": 0.0001, "loss": 4.1889, "loss/crossentropy": 2.346592426300049, "loss/hidden": 3.7703125, "loss/incoh": 0.0, "loss/logits": 0.4610589429736137, "loss/reg": 0.0, "step": 1340 }, { "epoch": 0.008881578947368421, "grad_norm": 4.0625, "grad_norm_var": 0.13492431640625, "learning_rate": 0.0001, "loss": 4.1587, "loss/crossentropy": 2.600476896762848, "loss/hidden": 3.746875, "loss/incoh": 0.0, "loss/logits": 0.4497509777545929, "loss/reg": 0.0, "step": 1350 }, { "epoch": 0.008947368421052631, "grad_norm": 3.390625, "grad_norm_var": 0.7162017822265625, "learning_rate": 0.0001, "loss": 4.0563, "loss/crossentropy": 2.6253500103950502, "loss/hidden": 3.5359375, "loss/incoh": 0.0, "loss/logits": 0.4007237285375595, "loss/reg": 0.0, "step": 1360 }, { "epoch": 0.009013157894736843, "grad_norm": 3.828125, "grad_norm_var": 0.7642812093098958, "learning_rate": 0.0001, "loss": 4.1053, "loss/crossentropy": 2.4306472063064577, "loss/hidden": 3.596875, "loss/incoh": 0.0, "loss/logits": 0.42645111978054046, "loss/reg": 0.0, "step": 1370 }, { "epoch": 0.009078947368421053, "grad_norm": 5.65625, "grad_norm_var": 0.3096995035807292, "learning_rate": 0.0001, "loss": 4.2093, "loss/crossentropy": 2.4861367106437684, "loss/hidden": 3.9359375, "loss/incoh": 0.0, "loss/logits": 0.46326183080673217, "loss/reg": 0.0, "step": 1380 }, { "epoch": 0.009144736842105263, "grad_norm": 3.265625, "grad_norm_var": 0.36946207682291665, "learning_rate": 0.0001, "loss": 4.0653, "loss/crossentropy": 2.608224070072174, "loss/hidden": 3.5140625, "loss/incoh": 0.0, "loss/logits": 0.3894644558429718, "loss/reg": 0.0, "step": 1390 }, { "epoch": 0.009210526315789473, "grad_norm": 3.21875, "grad_norm_var": 0.30060221354166666, "learning_rate": 0.0001, "loss": 4.1547, "loss/crossentropy": 2.337048816680908, "loss/hidden": 3.6046875, "loss/incoh": 0.0, "loss/logits": 0.39037723541259767, "loss/reg": 0.0, "step": 1400 }, { "epoch": 0.009276315789473685, "grad_norm": 3.625, "grad_norm_var": 0.14612630208333333, "learning_rate": 0.0001, "loss": 4.076, "loss/crossentropy": 2.726799726486206, "loss/hidden": 3.5515625, "loss/incoh": 0.0, "loss/logits": 0.4537044405937195, "loss/reg": 0.0, "step": 1410 }, { "epoch": 0.009342105263157895, "grad_norm": 3.875, "grad_norm_var": 1.9072662353515626, "learning_rate": 0.0001, "loss": 4.2387, "loss/crossentropy": 2.5365243434906004, "loss/hidden": 3.503125, "loss/incoh": 0.0, "loss/logits": 0.37743023335933684, "loss/reg": 0.0, "step": 1420 }, { "epoch": 0.009407894736842105, "grad_norm": 4.1875, "grad_norm_var": 0.15405171712239582, "learning_rate": 0.0001, "loss": 4.0913, "loss/crossentropy": 2.6032602190971375, "loss/hidden": 3.6015625, "loss/incoh": 0.0, "loss/logits": 0.4031028777360916, "loss/reg": 0.0, "step": 1430 }, { "epoch": 0.009473684210526316, "grad_norm": 3.453125, "grad_norm_var": 0.35299072265625, "learning_rate": 0.0001, "loss": 4.0676, "loss/crossentropy": 2.279827582836151, "loss/hidden": 3.671875, "loss/incoh": 0.0, "loss/logits": 0.38540517538785934, "loss/reg": 0.0, "step": 1440 }, { "epoch": 0.009539473684210526, "grad_norm": 3.703125, "grad_norm_var": 0.44612223307291665, "learning_rate": 0.0001, "loss": 4.0791, "loss/crossentropy": 2.4795989274978636, "loss/hidden": 3.7, "loss/incoh": 0.0, "loss/logits": 0.40130155086517333, "loss/reg": 0.0, "step": 1450 }, { "epoch": 0.009605263157894737, "grad_norm": 3.515625, "grad_norm_var": 0.43585611979166666, "learning_rate": 0.0001, "loss": 4.0499, "loss/crossentropy": 2.337530755996704, "loss/hidden": 3.4703125, "loss/incoh": 0.0, "loss/logits": 0.3699365258216858, "loss/reg": 0.0, "step": 1460 }, { "epoch": 0.009671052631578947, "grad_norm": 4.375, "grad_norm_var": 2.1455393473307294, "learning_rate": 0.0001, "loss": 4.0616, "loss/crossentropy": 2.246569663286209, "loss/hidden": 3.54375, "loss/incoh": 0.0, "loss/logits": 0.37747917622327803, "loss/reg": 0.0, "step": 1470 }, { "epoch": 0.009736842105263158, "grad_norm": 4.34375, "grad_norm_var": 1.3469017374654464e+17, "learning_rate": 0.0001, "loss": 4.1919, "loss/crossentropy": 2.465463387966156, "loss/hidden": 3.5796875, "loss/incoh": 0.0, "loss/logits": 0.409694692492485, "loss/reg": 0.0, "step": 1480 }, { "epoch": 0.009802631578947368, "grad_norm": 3.8125, "grad_norm_var": 2.647081560180774e+17, "learning_rate": 0.0001, "loss": 4.2391, "loss/crossentropy": 2.614275646209717, "loss/hidden": 3.86875, "loss/incoh": 0.0, "loss/logits": 0.4961456567049026, "loss/reg": 0.0, "step": 1490 }, { "epoch": 0.009868421052631578, "grad_norm": 3.59375, "grad_norm_var": 3.232743326822917, "learning_rate": 0.0001, "loss": 4.2432, "loss/crossentropy": 2.9305615186691285, "loss/hidden": 3.8921875, "loss/incoh": 0.0, "loss/logits": 0.6574043720960617, "loss/reg": 0.0, "step": 1500 }, { "epoch": 0.00993421052631579, "grad_norm": 3.34375, "grad_norm_var": 3.299430338541667, "learning_rate": 0.0001, "loss": 4.066, "loss/crossentropy": 2.3778577923774717, "loss/hidden": 3.6515625, "loss/incoh": 0.0, "loss/logits": 0.3975479930639267, "loss/reg": 0.0, "step": 1510 }, { "epoch": 0.01, "grad_norm": 3.515625, "grad_norm_var": 18.948729451497396, "learning_rate": 0.0001, "loss": 4.1724, "loss/crossentropy": 2.527243709564209, "loss/hidden": 3.596875, "loss/incoh": 0.0, "loss/logits": 0.41472980976104734, "loss/reg": 0.0, "step": 1520 }, { "epoch": 0.01006578947368421, "grad_norm": 3.03125, "grad_norm_var": 18.10924072265625, "learning_rate": 0.0001, "loss": 4.0302, "loss/crossentropy": 2.7474317073822023, "loss/hidden": 3.690625, "loss/incoh": 0.0, "loss/logits": 0.47478381991386415, "loss/reg": 0.0, "step": 1530 }, { "epoch": 0.01013157894736842, "grad_norm": 4.4375, "grad_norm_var": 0.45779520670572915, "learning_rate": 0.0001, "loss": 4.0726, "loss/crossentropy": 2.4822750091552734, "loss/hidden": 3.48125, "loss/incoh": 0.0, "loss/logits": 0.39128718376159666, "loss/reg": 0.0, "step": 1540 }, { "epoch": 0.010197368421052632, "grad_norm": 3.5625, "grad_norm_var": 0.21116129557291666, "learning_rate": 0.0001, "loss": 4.1353, "loss/crossentropy": 2.480714201927185, "loss/hidden": 3.9203125, "loss/incoh": 0.0, "loss/logits": 0.4798941880464554, "loss/reg": 0.0, "step": 1550 }, { "epoch": 0.010263157894736842, "grad_norm": 3.78125, "grad_norm_var": 0.21678059895833332, "learning_rate": 0.0001, "loss": 3.993, "loss/crossentropy": 2.536018407344818, "loss/hidden": 3.53125, "loss/incoh": 0.0, "loss/logits": 0.3731235474348068, "loss/reg": 0.0, "step": 1560 }, { "epoch": 0.010328947368421052, "grad_norm": 3.953125, "grad_norm_var": 0.17219645182291668, "learning_rate": 0.0001, "loss": 4.0545, "loss/crossentropy": 2.4370301008224486, "loss/hidden": 3.5640625, "loss/incoh": 0.0, "loss/logits": 0.4435511589050293, "loss/reg": 0.0, "step": 1570 }, { "epoch": 0.010394736842105264, "grad_norm": 3.140625, "grad_norm_var": 0.5761027018229167, "learning_rate": 0.0001, "loss": 4.0907, "loss/crossentropy": 2.576425087451935, "loss/hidden": 3.6484375, "loss/incoh": 0.0, "loss/logits": 0.43103125393390657, "loss/reg": 0.0, "step": 1580 }, { "epoch": 0.010460526315789474, "grad_norm": 3.34375, "grad_norm_var": 0.15403544108072917, "learning_rate": 0.0001, "loss": 3.8713, "loss/crossentropy": 2.2120620369911195, "loss/hidden": 3.796875, "loss/incoh": 0.0, "loss/logits": 0.43894679844379425, "loss/reg": 0.0, "step": 1590 }, { "epoch": 0.010526315789473684, "grad_norm": 2.890625, "grad_norm_var": 0.09127197265625, "learning_rate": 0.0001, "loss": 3.9719, "loss/crossentropy": 2.5636333346366884, "loss/hidden": 3.575, "loss/incoh": 0.0, "loss/logits": 0.4157550185918808, "loss/reg": 0.0, "step": 1600 }, { "epoch": 0.010592105263157894, "grad_norm": 3.5, "grad_norm_var": 2.413444010416667, "learning_rate": 0.0001, "loss": 4.153, "loss/crossentropy": 2.6213369131088258, "loss/hidden": 3.6171875, "loss/incoh": 0.0, "loss/logits": 0.45062357783317564, "loss/reg": 0.0, "step": 1610 }, { "epoch": 0.010657894736842106, "grad_norm": 9.5, "grad_norm_var": 4.2955881754557295, "learning_rate": 0.0001, "loss": 4.0295, "loss/crossentropy": 2.626167094707489, "loss/hidden": 3.5765625, "loss/incoh": 0.0, "loss/logits": 0.44043630063533784, "loss/reg": 0.0, "step": 1620 }, { "epoch": 0.010723684210526316, "grad_norm": 3.6875, "grad_norm_var": 2.3217437744140623, "learning_rate": 0.0001, "loss": 3.937, "loss/crossentropy": 2.6269264578819276, "loss/hidden": 3.45625, "loss/incoh": 0.0, "loss/logits": 0.48427494168281554, "loss/reg": 0.0, "step": 1630 }, { "epoch": 0.010789473684210526, "grad_norm": 3.078125, "grad_norm_var": 0.12195638020833334, "learning_rate": 0.0001, "loss": 3.9365, "loss/crossentropy": 2.6117714166641237, "loss/hidden": 3.4359375, "loss/incoh": 0.0, "loss/logits": 0.4069202274084091, "loss/reg": 0.0, "step": 1640 }, { "epoch": 0.010855263157894738, "grad_norm": 3.3125, "grad_norm_var": 0.25017903645833334, "learning_rate": 0.0001, "loss": 3.9318, "loss/crossentropy": 2.6508745312690736, "loss/hidden": 3.53125, "loss/incoh": 0.0, "loss/logits": 0.4166009187698364, "loss/reg": 0.0, "step": 1650 }, { "epoch": 0.010921052631578948, "grad_norm": 4.21875, "grad_norm_var": 0.1591217041015625, "learning_rate": 0.0001, "loss": 3.8964, "loss/crossentropy": 2.4683817744255068, "loss/hidden": 3.4421875, "loss/incoh": 0.0, "loss/logits": 0.37454236298799515, "loss/reg": 0.0, "step": 1660 }, { "epoch": 0.010986842105263158, "grad_norm": 3.28125, "grad_norm_var": 0.12337239583333333, "learning_rate": 0.0001, "loss": 4.0102, "loss/crossentropy": 2.4436564683914184, "loss/hidden": 3.3390625, "loss/incoh": 0.0, "loss/logits": 0.36218023002147676, "loss/reg": 0.0, "step": 1670 }, { "epoch": 0.011052631578947368, "grad_norm": 3.15625, "grad_norm_var": 0.08958231608072917, "learning_rate": 0.0001, "loss": 3.9784, "loss/crossentropy": 2.559529435634613, "loss/hidden": 3.703125, "loss/incoh": 0.0, "loss/logits": 0.46595812439918516, "loss/reg": 0.0, "step": 1680 }, { "epoch": 0.01111842105263158, "grad_norm": 2.953125, "grad_norm_var": 0.11448160807291667, "learning_rate": 0.0001, "loss": 3.9419, "loss/crossentropy": 2.4433623433113096, "loss/hidden": 3.453125, "loss/incoh": 0.0, "loss/logits": 0.41454428136348725, "loss/reg": 0.0, "step": 1690 }, { "epoch": 0.01118421052631579, "grad_norm": 4.0625, "grad_norm_var": 0.11336161295572916, "learning_rate": 0.0001, "loss": 3.8878, "loss/crossentropy": 2.561389684677124, "loss/hidden": 3.4640625, "loss/incoh": 0.0, "loss/logits": 0.3884895950555801, "loss/reg": 0.0, "step": 1700 }, { "epoch": 0.01125, "grad_norm": 3.296875, "grad_norm_var": 2.4417928059895835, "learning_rate": 0.0001, "loss": 3.9996, "loss/crossentropy": 2.709191393852234, "loss/hidden": 3.5515625, "loss/incoh": 0.0, "loss/logits": 0.3691831022500992, "loss/reg": 0.0, "step": 1710 }, { "epoch": 0.011315789473684211, "grad_norm": 3.3125, "grad_norm_var": 0.5698232014973958, "learning_rate": 0.0001, "loss": 4.0535, "loss/crossentropy": 2.4276717066764832, "loss/hidden": 3.7375, "loss/incoh": 0.0, "loss/logits": 0.4981458902359009, "loss/reg": 0.0, "step": 1720 }, { "epoch": 0.011381578947368421, "grad_norm": 3.71875, "grad_norm_var": 0.4803059895833333, "learning_rate": 0.0001, "loss": 4.0546, "loss/crossentropy": 2.672085237503052, "loss/hidden": 3.465625, "loss/incoh": 0.0, "loss/logits": 0.41347330510616304, "loss/reg": 0.0, "step": 1730 }, { "epoch": 0.011447368421052631, "grad_norm": 6.59375, "grad_norm_var": 0.7734700520833333, "learning_rate": 0.0001, "loss": 4.0697, "loss/crossentropy": 2.625990152359009, "loss/hidden": 3.60625, "loss/incoh": 0.0, "loss/logits": 0.45684492886066436, "loss/reg": 0.0, "step": 1740 }, { "epoch": 0.011513157894736841, "grad_norm": 2.8125, "grad_norm_var": 2.146930948893229, "learning_rate": 0.0001, "loss": 4.1129, "loss/crossentropy": 2.682978630065918, "loss/hidden": 3.5703125, "loss/incoh": 0.0, "loss/logits": 0.4306509166955948, "loss/reg": 0.0, "step": 1750 }, { "epoch": 0.011578947368421053, "grad_norm": 3.203125, "grad_norm_var": 0.1972076416015625, "learning_rate": 0.0001, "loss": 3.8797, "loss/crossentropy": 2.7138221740722654, "loss/hidden": 3.60625, "loss/incoh": 0.0, "loss/logits": 0.46029032766819, "loss/reg": 0.0, "step": 1760 }, { "epoch": 0.011644736842105263, "grad_norm": 3.03125, "grad_norm_var": 15.614777628580729, "learning_rate": 0.0001, "loss": 4.0409, "loss/crossentropy": 2.1841426372528074, "loss/hidden": 3.3828125, "loss/incoh": 0.0, "loss/logits": 0.35761781185865404, "loss/reg": 0.0, "step": 1770 }, { "epoch": 0.011710526315789473, "grad_norm": 3.953125, "grad_norm_var": 0.32083231608072915, "learning_rate": 0.0001, "loss": 4.0523, "loss/crossentropy": 2.404168051481247, "loss/hidden": 3.325, "loss/incoh": 0.0, "loss/logits": 0.34855909645557404, "loss/reg": 0.0, "step": 1780 }, { "epoch": 0.011776315789473683, "grad_norm": 3.5, "grad_norm_var": 3.124772135416667, "learning_rate": 0.0001, "loss": 3.9895, "loss/crossentropy": 2.370392310619354, "loss/hidden": 3.6515625, "loss/incoh": 0.0, "loss/logits": 0.4228974744677544, "loss/reg": 0.0, "step": 1790 }, { "epoch": 0.011842105263157895, "grad_norm": 6.125, "grad_norm_var": 3.445572916666667, "learning_rate": 0.0001, "loss": 4.0349, "loss/crossentropy": 2.6190654158592226, "loss/hidden": 3.4578125, "loss/incoh": 0.0, "loss/logits": 0.406630203127861, "loss/reg": 0.0, "step": 1800 }, { "epoch": 0.011907894736842105, "grad_norm": 3.28125, "grad_norm_var": 0.575048828125, "learning_rate": 0.0001, "loss": 3.8828, "loss/crossentropy": 2.505708968639374, "loss/hidden": 3.4203125, "loss/incoh": 0.0, "loss/logits": 0.3773295432329178, "loss/reg": 0.0, "step": 1810 }, { "epoch": 0.011973684210526315, "grad_norm": 3.734375, "grad_norm_var": 14.053641764322917, "learning_rate": 0.0001, "loss": 3.9952, "loss/crossentropy": 2.8797521352767945, "loss/hidden": 3.6734375, "loss/incoh": 0.0, "loss/logits": 0.7419060736894607, "loss/reg": 0.0, "step": 1820 }, { "epoch": 0.012039473684210527, "grad_norm": 3.96875, "grad_norm_var": 2.006696573893229, "learning_rate": 0.0001, "loss": 3.87, "loss/crossentropy": 2.792651188373566, "loss/hidden": 3.6546875, "loss/incoh": 0.0, "loss/logits": 0.7776896879076958, "loss/reg": 0.0, "step": 1830 }, { "epoch": 0.012105263157894737, "grad_norm": 4.09375, "grad_norm_var": 3.2296132405598956, "learning_rate": 0.0001, "loss": 3.8688, "loss/crossentropy": 2.4713597655296327, "loss/hidden": 3.4484375, "loss/incoh": 0.0, "loss/logits": 0.3962660849094391, "loss/reg": 0.0, "step": 1840 }, { "epoch": 0.012171052631578947, "grad_norm": 3.296875, "grad_norm_var": 0.20377197265625, "learning_rate": 0.0001, "loss": 3.9721, "loss/crossentropy": 2.225203812122345, "loss/hidden": 3.4734375, "loss/incoh": 0.0, "loss/logits": 0.3860040009021759, "loss/reg": 0.0, "step": 1850 }, { "epoch": 0.012236842105263157, "grad_norm": 4.375, "grad_norm_var": 14.807112630208334, "learning_rate": 0.0001, "loss": 4.0342, "loss/crossentropy": 2.405521821975708, "loss/hidden": 3.428125, "loss/incoh": 0.0, "loss/logits": 0.40949456989765165, "loss/reg": 0.0, "step": 1860 }, { "epoch": 0.012302631578947369, "grad_norm": 4.53125, "grad_norm_var": 6.056012980143229, "learning_rate": 0.0001, "loss": 3.9802, "loss/crossentropy": 2.3927958846092223, "loss/hidden": 3.5265625, "loss/incoh": 0.0, "loss/logits": 0.3984386846423149, "loss/reg": 0.0, "step": 1870 }, { "epoch": 0.012368421052631579, "grad_norm": 2.734375, "grad_norm_var": 0.6073893229166667, "learning_rate": 0.0001, "loss": 3.9074, "loss/crossentropy": 2.6031975388526916, "loss/hidden": 3.38125, "loss/incoh": 0.0, "loss/logits": 0.4128950208425522, "loss/reg": 0.0, "step": 1880 }, { "epoch": 0.012434210526315789, "grad_norm": 3.65625, "grad_norm_var": 0.3526845296223958, "learning_rate": 0.0001, "loss": 3.9143, "loss/crossentropy": 2.7405603647232057, "loss/hidden": 3.25625, "loss/incoh": 0.0, "loss/logits": 0.37195596396923064, "loss/reg": 0.0, "step": 1890 }, { "epoch": 0.0125, "grad_norm": 3.3125, "grad_norm_var": 130.27611389160157, "learning_rate": 0.0001, "loss": 3.986, "loss/crossentropy": 2.6554584980010985, "loss/hidden": 3.5, "loss/incoh": 0.0, "loss/logits": 0.3823524177074432, "loss/reg": 0.0, "step": 1900 }, { "epoch": 0.01256578947368421, "grad_norm": 3.6875, "grad_norm_var": 130.43673400878907, "learning_rate": 0.0001, "loss": 3.8682, "loss/crossentropy": 2.6816349744796755, "loss/hidden": 3.421875, "loss/incoh": 0.0, "loss/logits": 0.43074882328510283, "loss/reg": 0.0, "step": 1910 }, { "epoch": 0.01263157894736842, "grad_norm": 3.0, "grad_norm_var": 10.65947265625, "learning_rate": 0.0001, "loss": 4.0152, "loss/crossentropy": 2.1248778343200683, "loss/hidden": 3.6953125, "loss/incoh": 0.0, "loss/logits": 0.4099419146776199, "loss/reg": 0.0, "step": 1920 }, { "epoch": 0.01269736842105263, "grad_norm": 2.828125, "grad_norm_var": 0.41845296223958334, "learning_rate": 0.0001, "loss": 3.9099, "loss/crossentropy": 2.485488569736481, "loss/hidden": 3.3921875, "loss/incoh": 0.0, "loss/logits": 0.3766200736165047, "loss/reg": 0.0, "step": 1930 }, { "epoch": 0.012763157894736843, "grad_norm": 4.375, "grad_norm_var": 2.3640777587890627, "learning_rate": 0.0001, "loss": 4.0886, "loss/crossentropy": 2.896865522861481, "loss/hidden": 4.4, "loss/incoh": 0.0, "loss/logits": 0.6787679702043533, "loss/reg": 0.0, "step": 1940 }, { "epoch": 0.012828947368421053, "grad_norm": 3.34375, "grad_norm_var": 34.95194905598958, "learning_rate": 0.0001, "loss": 4.0285, "loss/crossentropy": 2.348608684539795, "loss/hidden": 3.6171875, "loss/incoh": 0.0, "loss/logits": 0.4279856622219086, "loss/reg": 0.0, "step": 1950 }, { "epoch": 0.012894736842105263, "grad_norm": 3.65625, "grad_norm_var": 1.5377278645833334, "learning_rate": 0.0001, "loss": 3.9097, "loss/crossentropy": 2.3533430814743044, "loss/hidden": 3.48125, "loss/incoh": 0.0, "loss/logits": 0.37981766164302827, "loss/reg": 0.0, "step": 1960 }, { "epoch": 0.012960526315789474, "grad_norm": 4.15625, "grad_norm_var": 0.23593648274739584, "learning_rate": 0.0001, "loss": 3.9802, "loss/crossentropy": 2.54624525308609, "loss/hidden": 3.790625, "loss/incoh": 0.0, "loss/logits": 0.4486588716506958, "loss/reg": 0.0, "step": 1970 }, { "epoch": 0.013026315789473684, "grad_norm": 2.859375, "grad_norm_var": 0.24025777180989583, "learning_rate": 0.0001, "loss": 3.8942, "loss/crossentropy": 2.5797088146209717, "loss/hidden": 3.7875, "loss/incoh": 0.0, "loss/logits": 0.4970128297805786, "loss/reg": 0.0, "step": 1980 }, { "epoch": 0.013092105263157894, "grad_norm": 3.9375, "grad_norm_var": 0.47609049479166665, "learning_rate": 0.0001, "loss": 3.934, "loss/crossentropy": 2.555588161945343, "loss/hidden": 3.38125, "loss/incoh": 0.0, "loss/logits": 0.38218972086906433, "loss/reg": 0.0, "step": 1990 }, { "epoch": 0.013157894736842105, "grad_norm": 4.03125, "grad_norm_var": 1.0361073811848958, "learning_rate": 0.0001, "loss": 3.8759, "loss/crossentropy": 2.164474868774414, "loss/hidden": 3.7125, "loss/incoh": 0.0, "loss/logits": 0.3994155451655388, "loss/reg": 0.0, "step": 2000 }, { "epoch": 0.013223684210526316, "grad_norm": 3.453125, "grad_norm_var": 0.8633778889973959, "learning_rate": 0.0001, "loss": 3.8317, "loss/crossentropy": 2.558875060081482, "loss/hidden": 3.471875, "loss/incoh": 0.0, "loss/logits": 0.3639406472444534, "loss/reg": 0.0, "step": 2010 }, { "epoch": 0.013289473684210526, "grad_norm": 3.75, "grad_norm_var": 1.5050740559895834, "learning_rate": 0.0001, "loss": 3.9716, "loss/crossentropy": 2.3385006546974183, "loss/hidden": 3.4359375, "loss/incoh": 0.0, "loss/logits": 0.3938352942466736, "loss/reg": 0.0, "step": 2020 }, { "epoch": 0.013355263157894736, "grad_norm": 3.15625, "grad_norm_var": 0.9781158447265625, "learning_rate": 0.0001, "loss": 3.9531, "loss/crossentropy": 2.4627522945404055, "loss/hidden": 3.5578125, "loss/incoh": 0.0, "loss/logits": 0.495425808429718, "loss/reg": 0.0, "step": 2030 }, { "epoch": 0.013421052631578948, "grad_norm": 2.75, "grad_norm_var": 1.949779256184896, "learning_rate": 0.0001, "loss": 3.9979, "loss/crossentropy": 2.11747065782547, "loss/hidden": 3.59375, "loss/incoh": 0.0, "loss/logits": 0.36948435604572294, "loss/reg": 0.0, "step": 2040 }, { "epoch": 0.013486842105263158, "grad_norm": 3.0625, "grad_norm_var": 1.5490549723307292, "learning_rate": 0.0001, "loss": 3.8381, "loss/crossentropy": 2.473706376552582, "loss/hidden": 3.4265625, "loss/incoh": 0.0, "loss/logits": 0.3867632657289505, "loss/reg": 0.0, "step": 2050 }, { "epoch": 0.013552631578947368, "grad_norm": 3.125, "grad_norm_var": 2.1093470786676653e+17, "learning_rate": 0.0001, "loss": 4.0319, "loss/crossentropy": 2.4225202679634092, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.3675911784172058, "loss/reg": 0.0, "step": 2060 }, { "epoch": 0.013618421052631578, "grad_norm": 5.625, "grad_norm_var": 76.3380849202474, "learning_rate": 0.0001, "loss": 4.0746, "loss/crossentropy": 2.48265061378479, "loss/hidden": 3.7265625, "loss/incoh": 0.0, "loss/logits": 0.4639868468046188, "loss/reg": 0.0, "step": 2070 }, { "epoch": 0.01368421052631579, "grad_norm": 4.5, "grad_norm_var": 0.7881011962890625, "learning_rate": 0.0001, "loss": 4.038, "loss/crossentropy": 2.7775272965431212, "loss/hidden": 4.0609375, "loss/incoh": 0.0, "loss/logits": 0.4948854446411133, "loss/reg": 0.0, "step": 2080 }, { "epoch": 0.01375, "grad_norm": 3.296875, "grad_norm_var": 0.8111612955729167, "learning_rate": 0.0001, "loss": 3.8968, "loss/crossentropy": 2.4613396763801574, "loss/hidden": 3.3484375, "loss/incoh": 0.0, "loss/logits": 0.3684074327349663, "loss/reg": 0.0, "step": 2090 }, { "epoch": 0.01381578947368421, "grad_norm": 3.484375, "grad_norm_var": 1.490550740559896, "learning_rate": 0.0001, "loss": 3.867, "loss/crossentropy": 2.5947747588157655, "loss/hidden": 3.4765625, "loss/incoh": 0.0, "loss/logits": 0.3717468947172165, "loss/reg": 0.0, "step": 2100 }, { "epoch": 0.013881578947368422, "grad_norm": 4.75, "grad_norm_var": 1.7372385660807292, "learning_rate": 0.0001, "loss": 3.8731, "loss/crossentropy": 2.466842460632324, "loss/hidden": 3.2890625, "loss/incoh": 0.0, "loss/logits": 0.3417574405670166, "loss/reg": 0.0, "step": 2110 }, { "epoch": 0.013947368421052632, "grad_norm": 4.21875, "grad_norm_var": 1.8861002604166666, "learning_rate": 0.0001, "loss": 3.823, "loss/crossentropy": 2.3006282687187194, "loss/hidden": 3.390625, "loss/incoh": 0.0, "loss/logits": 0.36191926896572113, "loss/reg": 0.0, "step": 2120 }, { "epoch": 0.014013157894736842, "grad_norm": 4.21875, "grad_norm_var": 0.9100901285807291, "learning_rate": 0.0001, "loss": 3.8933, "loss/crossentropy": 2.6159239768981934, "loss/hidden": 3.5109375, "loss/incoh": 0.0, "loss/logits": 0.46396631598472593, "loss/reg": 0.0, "step": 2130 }, { "epoch": 0.014078947368421052, "grad_norm": 3.734375, "grad_norm_var": 0.9789388020833333, "learning_rate": 0.0001, "loss": 3.8761, "loss/crossentropy": 2.6355370759963987, "loss/hidden": 3.3421875, "loss/incoh": 0.0, "loss/logits": 0.361694809794426, "loss/reg": 0.0, "step": 2140 }, { "epoch": 0.014144736842105264, "grad_norm": 2.875, "grad_norm_var": 0.30113525390625, "learning_rate": 0.0001, "loss": 3.8617, "loss/crossentropy": 2.6357606053352356, "loss/hidden": 3.3515625, "loss/incoh": 0.0, "loss/logits": 0.3612044155597687, "loss/reg": 0.0, "step": 2150 }, { "epoch": 0.014210526315789474, "grad_norm": 2.921875, "grad_norm_var": 0.23430582682291667, "learning_rate": 0.0001, "loss": 3.8903, "loss/crossentropy": 2.551873171329498, "loss/hidden": 3.44375, "loss/incoh": 0.0, "loss/logits": 0.41060586273670197, "loss/reg": 0.0, "step": 2160 }, { "epoch": 0.014276315789473684, "grad_norm": 3.5625, "grad_norm_var": 0.48121744791666665, "learning_rate": 0.0001, "loss": 3.9663, "loss/crossentropy": 2.495719885826111, "loss/hidden": 3.690625, "loss/incoh": 0.0, "loss/logits": 0.4500987708568573, "loss/reg": 0.0, "step": 2170 }, { "epoch": 0.014342105263157894, "grad_norm": 2.8125, "grad_norm_var": 0.17538655598958333, "learning_rate": 0.0001, "loss": 3.8053, "loss/crossentropy": 2.520111393928528, "loss/hidden": 3.4765625, "loss/incoh": 0.0, "loss/logits": 0.44193484634160995, "loss/reg": 0.0, "step": 2180 }, { "epoch": 0.014407894736842106, "grad_norm": 3.328125, "grad_norm_var": 31.391422526041666, "learning_rate": 0.0001, "loss": 4.0394, "loss/crossentropy": 2.5524103164672853, "loss/hidden": 3.4203125, "loss/incoh": 0.0, "loss/logits": 0.38946655094623567, "loss/reg": 0.0, "step": 2190 }, { "epoch": 0.014473684210526316, "grad_norm": 8.6875, "grad_norm_var": 2.0572499593098956, "learning_rate": 0.0001, "loss": 3.9758, "loss/crossentropy": 2.2560265123844148, "loss/hidden": 3.59375, "loss/incoh": 0.0, "loss/logits": 0.4082709074020386, "loss/reg": 0.0, "step": 2200 }, { "epoch": 0.014539473684210526, "grad_norm": 3.25, "grad_norm_var": 3.566722615559896, "learning_rate": 0.0001, "loss": 3.8174, "loss/crossentropy": 2.2830474019050597, "loss/hidden": 3.3375, "loss/incoh": 0.0, "loss/logits": 0.3462225392460823, "loss/reg": 0.0, "step": 2210 }, { "epoch": 0.014605263157894737, "grad_norm": 3.109375, "grad_norm_var": 0.155615234375, "learning_rate": 0.0001, "loss": 3.8468, "loss/crossentropy": 2.3341428637504578, "loss/hidden": 3.4375, "loss/incoh": 0.0, "loss/logits": 0.4024402230978012, "loss/reg": 0.0, "step": 2220 }, { "epoch": 0.014671052631578948, "grad_norm": 2.984375, "grad_norm_var": 5.723542277018229, "learning_rate": 0.0001, "loss": 3.9699, "loss/crossentropy": 2.201635646820068, "loss/hidden": 3.2703125, "loss/incoh": 0.0, "loss/logits": 0.3397625252604485, "loss/reg": 0.0, "step": 2230 }, { "epoch": 0.014736842105263158, "grad_norm": 2.796875, "grad_norm_var": 47.54188537597656, "learning_rate": 0.0001, "loss": 3.9281, "loss/crossentropy": 2.5724541902542115, "loss/hidden": 3.2890625, "loss/incoh": 0.0, "loss/logits": 0.3728118479251862, "loss/reg": 0.0, "step": 2240 }, { "epoch": 0.014802631578947368, "grad_norm": 2.984375, "grad_norm_var": 51.28417867024739, "learning_rate": 0.0001, "loss": 3.8237, "loss/crossentropy": 2.5087037920951842, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.3448286011815071, "loss/reg": 0.0, "step": 2250 }, { "epoch": 0.01486842105263158, "grad_norm": 3.171875, "grad_norm_var": 0.05579427083333333, "learning_rate": 0.0001, "loss": 3.8448, "loss/crossentropy": 2.4813582420349123, "loss/hidden": 3.409375, "loss/incoh": 0.0, "loss/logits": 0.40958506166934966, "loss/reg": 0.0, "step": 2260 }, { "epoch": 0.01493421052631579, "grad_norm": 2.96875, "grad_norm_var": 0.16342671712239584, "learning_rate": 0.0001, "loss": 3.753, "loss/crossentropy": 2.523963761329651, "loss/hidden": 3.6640625, "loss/incoh": 0.0, "loss/logits": 0.38928901553153994, "loss/reg": 0.0, "step": 2270 }, { "epoch": 0.015, "grad_norm": 3.0625, "grad_norm_var": 0.7111317952473958, "learning_rate": 0.0001, "loss": 3.8078, "loss/crossentropy": 2.6787729024887086, "loss/hidden": 3.71875, "loss/incoh": 0.0, "loss/logits": 0.4244162023067474, "loss/reg": 0.0, "step": 2280 }, { "epoch": 0.015065789473684211, "grad_norm": 3.09375, "grad_norm_var": 0.10339253743489583, "learning_rate": 0.0001, "loss": 3.7562, "loss/crossentropy": 2.191652774810791, "loss/hidden": 3.4140625, "loss/incoh": 0.0, "loss/logits": 0.3300579100847244, "loss/reg": 0.0, "step": 2290 }, { "epoch": 0.015131578947368421, "grad_norm": 2.890625, "grad_norm_var": 0.05054423014322917, "learning_rate": 0.0001, "loss": 3.73, "loss/crossentropy": 2.4444664478302003, "loss/hidden": 3.4265625, "loss/incoh": 0.0, "loss/logits": 0.4327535033226013, "loss/reg": 0.0, "step": 2300 }, { "epoch": 0.015197368421052631, "grad_norm": 3.046875, "grad_norm_var": 0.15128580729166666, "learning_rate": 0.0001, "loss": 3.7723, "loss/crossentropy": 2.2329365968704225, "loss/hidden": 3.51875, "loss/incoh": 0.0, "loss/logits": 0.3683215394616127, "loss/reg": 0.0, "step": 2310 }, { "epoch": 0.015263157894736841, "grad_norm": 3.03125, "grad_norm_var": 0.3289713541666667, "learning_rate": 0.0001, "loss": 3.7699, "loss/crossentropy": 2.5342983961105348, "loss/hidden": 3.2921875, "loss/incoh": 0.0, "loss/logits": 0.35688025653362276, "loss/reg": 0.0, "step": 2320 }, { "epoch": 0.015328947368421053, "grad_norm": 3.15625, "grad_norm_var": 0.5997060139973959, "learning_rate": 0.0001, "loss": 3.8494, "loss/crossentropy": 2.4934002995491027, "loss/hidden": 3.1546875, "loss/incoh": 0.0, "loss/logits": 0.3495032548904419, "loss/reg": 0.0, "step": 2330 }, { "epoch": 0.015394736842105263, "grad_norm": 4.28125, "grad_norm_var": 1.570759073893229, "learning_rate": 0.0001, "loss": 3.9628, "loss/crossentropy": 2.2064894437789917, "loss/hidden": 3.478125, "loss/incoh": 0.0, "loss/logits": 0.34214983880519867, "loss/reg": 0.0, "step": 2340 }, { "epoch": 0.015460526315789473, "grad_norm": 3.5, "grad_norm_var": 1.9128865559895833, "learning_rate": 0.0001, "loss": 3.9538, "loss/crossentropy": 2.5408032178878783, "loss/hidden": 3.515625, "loss/incoh": 0.0, "loss/logits": 0.4111128658056259, "loss/reg": 0.0, "step": 2350 }, { "epoch": 0.015526315789473685, "grad_norm": 3.765625, "grad_norm_var": 0.39661051432291666, "learning_rate": 0.0001, "loss": 3.8897, "loss/crossentropy": 2.4922020554542543, "loss/hidden": 3.3953125, "loss/incoh": 0.0, "loss/logits": 0.36137166023254397, "loss/reg": 0.0, "step": 2360 }, { "epoch": 0.015592105263157895, "grad_norm": 4.09375, "grad_norm_var": 0.21868082682291667, "learning_rate": 0.0001, "loss": 3.8461, "loss/crossentropy": 2.427833843231201, "loss/hidden": 3.5921875, "loss/incoh": 0.0, "loss/logits": 0.36205882132053374, "loss/reg": 0.0, "step": 2370 }, { "epoch": 0.015657894736842107, "grad_norm": 3.171875, "grad_norm_var": 0.29641520182291664, "learning_rate": 0.0001, "loss": 3.767, "loss/crossentropy": 2.3795222878456115, "loss/hidden": 3.5140625, "loss/incoh": 0.0, "loss/logits": 0.40104621052742007, "loss/reg": 0.0, "step": 2380 }, { "epoch": 0.015723684210526317, "grad_norm": 3.171875, "grad_norm_var": 0.4603017171223958, "learning_rate": 0.0001, "loss": 3.8162, "loss/crossentropy": 2.3675019264221193, "loss/hidden": 3.4, "loss/incoh": 0.0, "loss/logits": 0.3826363369822502, "loss/reg": 0.0, "step": 2390 }, { "epoch": 0.015789473684210527, "grad_norm": 3.59375, "grad_norm_var": 0.12642313639322916, "learning_rate": 0.0001, "loss": 3.8017, "loss/crossentropy": 2.56625235080719, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.3704580098390579, "loss/reg": 0.0, "step": 2400 }, { "epoch": 0.015855263157894737, "grad_norm": 2.8125, "grad_norm_var": 0.22603759765625, "learning_rate": 0.0001, "loss": 3.8356, "loss/crossentropy": 2.3642316341400145, "loss/hidden": 3.640625, "loss/incoh": 0.0, "loss/logits": 0.4971610188484192, "loss/reg": 0.0, "step": 2410 }, { "epoch": 0.015921052631578947, "grad_norm": 3.359375, "grad_norm_var": 2.7929354478016266e+17, "learning_rate": 0.0001, "loss": 3.9524, "loss/crossentropy": 2.5260600686073302, "loss/hidden": 3.4734375, "loss/incoh": 0.0, "loss/logits": 0.34504298865795135, "loss/reg": 0.0, "step": 2420 }, { "epoch": 0.015986842105263157, "grad_norm": 3.140625, "grad_norm_var": 2.792935447600693e+17, "learning_rate": 0.0001, "loss": 3.7506, "loss/crossentropy": 2.6639176845550536, "loss/hidden": 3.184375, "loss/incoh": 0.0, "loss/logits": 0.3401388913393021, "loss/reg": 0.0, "step": 2430 }, { "epoch": 0.016052631578947367, "grad_norm": 2.890625, "grad_norm_var": 0.0854644775390625, "learning_rate": 0.0001, "loss": 3.7687, "loss/crossentropy": 2.3805726766586304, "loss/hidden": 3.1671875, "loss/incoh": 0.0, "loss/logits": 0.3257554292678833, "loss/reg": 0.0, "step": 2440 }, { "epoch": 0.01611842105263158, "grad_norm": 8.125, "grad_norm_var": 1.995349713361273e+17, "learning_rate": 0.0001, "loss": 4.0533, "loss/crossentropy": 2.370633268356323, "loss/hidden": 3.4765625, "loss/incoh": 0.0, "loss/logits": 0.3756751254200935, "loss/reg": 0.0, "step": 2450 }, { "epoch": 0.01618421052631579, "grad_norm": 2.75, "grad_norm_var": 1.995349712714498e+17, "learning_rate": 0.0001, "loss": 3.7822, "loss/crossentropy": 2.5394015312194824, "loss/hidden": 3.3125, "loss/incoh": 0.0, "loss/logits": 0.34547194838523865, "loss/reg": 0.0, "step": 2460 }, { "epoch": 0.01625, "grad_norm": 3.15625, "grad_norm_var": 0.0999664306640625, "learning_rate": 0.0001, "loss": 3.6838, "loss/crossentropy": 2.478586256504059, "loss/hidden": 3.4109375, "loss/incoh": 0.0, "loss/logits": 0.4159191906452179, "loss/reg": 0.0, "step": 2470 }, { "epoch": 0.01631578947368421, "grad_norm": 2.703125, "grad_norm_var": 0.48103739420572916, "learning_rate": 0.0001, "loss": 3.7274, "loss/crossentropy": 2.3859502553939818, "loss/hidden": 3.4234375, "loss/incoh": 0.0, "loss/logits": 0.36551299393177034, "loss/reg": 0.0, "step": 2480 }, { "epoch": 0.01638157894736842, "grad_norm": 3.171875, "grad_norm_var": 0.33056640625, "learning_rate": 0.0001, "loss": 3.8413, "loss/crossentropy": 2.47422776222229, "loss/hidden": 3.2703125, "loss/incoh": 0.0, "loss/logits": 0.34280748963356017, "loss/reg": 0.0, "step": 2490 }, { "epoch": 0.01644736842105263, "grad_norm": 3.03125, "grad_norm_var": 0.30504150390625, "learning_rate": 0.0001, "loss": 3.7262, "loss/crossentropy": 2.288319444656372, "loss/hidden": 3.2109375, "loss/incoh": 0.0, "loss/logits": 0.31078503280878067, "loss/reg": 0.0, "step": 2500 }, { "epoch": 0.01651315789473684, "grad_norm": 3.125, "grad_norm_var": 0.2894683837890625, "learning_rate": 0.0001, "loss": 3.8194, "loss/crossentropy": 2.2243799686431887, "loss/hidden": 3.434375, "loss/incoh": 0.0, "loss/logits": 0.33974049538373946, "loss/reg": 0.0, "step": 2510 }, { "epoch": 0.016578947368421054, "grad_norm": 3.046875, "grad_norm_var": 0.3177235921223958, "learning_rate": 0.0001, "loss": 3.6925, "loss/crossentropy": 2.63401620388031, "loss/hidden": 3.39375, "loss/incoh": 0.0, "loss/logits": 0.39840718507766726, "loss/reg": 0.0, "step": 2520 }, { "epoch": 0.016644736842105264, "grad_norm": 3.046875, "grad_norm_var": 0.2181549072265625, "learning_rate": 0.0001, "loss": 3.7573, "loss/crossentropy": 2.4925423860549927, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.3407833933830261, "loss/reg": 0.0, "step": 2530 }, { "epoch": 0.016710526315789474, "grad_norm": 3.328125, "grad_norm_var": 0.19260152180989584, "learning_rate": 0.0001, "loss": 3.7514, "loss/crossentropy": 2.3376643657684326, "loss/hidden": 3.4609375, "loss/incoh": 0.0, "loss/logits": 0.4351751744747162, "loss/reg": 0.0, "step": 2540 }, { "epoch": 0.016776315789473684, "grad_norm": 2.78125, "grad_norm_var": 0.12961324055989584, "learning_rate": 0.0001, "loss": 3.7217, "loss/crossentropy": 2.279516875743866, "loss/hidden": 3.58125, "loss/incoh": 0.0, "loss/logits": 0.37157190442085264, "loss/reg": 0.0, "step": 2550 }, { "epoch": 0.016842105263157894, "grad_norm": 2.921875, "grad_norm_var": 0.0912017822265625, "learning_rate": 0.0001, "loss": 3.6839, "loss/crossentropy": 2.2843039661645888, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.33208019435405733, "loss/reg": 0.0, "step": 2560 }, { "epoch": 0.016907894736842104, "grad_norm": 3.28125, "grad_norm_var": 0.32079671223958334, "learning_rate": 0.0001, "loss": 3.7381, "loss/crossentropy": 2.2308380246162414, "loss/hidden": 3.29375, "loss/incoh": 0.0, "loss/logits": 0.30514844954013826, "loss/reg": 0.0, "step": 2570 }, { "epoch": 0.016973684210526314, "grad_norm": 2.921875, "grad_norm_var": 0.6193522135416667, "learning_rate": 0.0001, "loss": 3.7209, "loss/crossentropy": 2.500720489025116, "loss/hidden": 3.2859375, "loss/incoh": 0.0, "loss/logits": 0.38955146372318267, "loss/reg": 0.0, "step": 2580 }, { "epoch": 0.017039473684210528, "grad_norm": 3.0, "grad_norm_var": 0.09539388020833334, "learning_rate": 0.0001, "loss": 3.6933, "loss/crossentropy": 2.397514319419861, "loss/hidden": 3.4796875, "loss/incoh": 0.0, "loss/logits": 0.426153627038002, "loss/reg": 0.0, "step": 2590 }, { "epoch": 0.017105263157894738, "grad_norm": 2.984375, "grad_norm_var": 0.44207356770833334, "learning_rate": 0.0001, "loss": 3.7927, "loss/crossentropy": 2.4746341586112974, "loss/hidden": 3.4515625, "loss/incoh": 0.0, "loss/logits": 0.3806774616241455, "loss/reg": 0.0, "step": 2600 }, { "epoch": 0.017171052631578948, "grad_norm": 3.109375, "grad_norm_var": 0.10927632649739584, "learning_rate": 0.0001, "loss": 3.7507, "loss/crossentropy": 2.6908259630203246, "loss/hidden": 3.459375, "loss/incoh": 0.0, "loss/logits": 0.47532927691936494, "loss/reg": 0.0, "step": 2610 }, { "epoch": 0.017236842105263158, "grad_norm": 2.75, "grad_norm_var": 1.8751780192057292, "learning_rate": 0.0001, "loss": 3.8498, "loss/crossentropy": 2.3184617161750793, "loss/hidden": 3.3125, "loss/incoh": 0.0, "loss/logits": 0.3832893192768097, "loss/reg": 0.0, "step": 2620 }, { "epoch": 0.017302631578947368, "grad_norm": 3.734375, "grad_norm_var": 1.95693359375, "learning_rate": 0.0001, "loss": 3.8697, "loss/crossentropy": 2.386726236343384, "loss/hidden": 3.38125, "loss/incoh": 0.0, "loss/logits": 0.4112528935074806, "loss/reg": 0.0, "step": 2630 }, { "epoch": 0.017368421052631578, "grad_norm": 3.09375, "grad_norm_var": 236.78067118326823, "learning_rate": 0.0001, "loss": 3.843, "loss/crossentropy": 2.6268559217453005, "loss/hidden": 3.525, "loss/incoh": 0.0, "loss/logits": 0.5286044746637344, "loss/reg": 0.0, "step": 2640 }, { "epoch": 0.017434210526315788, "grad_norm": 3.15625, "grad_norm_var": 238.80460510253906, "learning_rate": 0.0001, "loss": 3.7934, "loss/crossentropy": 2.621377694606781, "loss/hidden": 3.346875, "loss/incoh": 0.0, "loss/logits": 0.39670759439468384, "loss/reg": 0.0, "step": 2650 }, { "epoch": 0.0175, "grad_norm": 2.921875, "grad_norm_var": 0.4898274739583333, "learning_rate": 0.0001, "loss": 3.8686, "loss/crossentropy": 2.3771218061447144, "loss/hidden": 3.5328125, "loss/incoh": 0.0, "loss/logits": 0.39255764335393906, "loss/reg": 0.0, "step": 2660 }, { "epoch": 0.01756578947368421, "grad_norm": 2.96875, "grad_norm_var": 0.9812001546223958, "learning_rate": 0.0001, "loss": 3.6636, "loss/crossentropy": 2.2852025091648103, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.3551890656352043, "loss/reg": 0.0, "step": 2670 }, { "epoch": 0.017631578947368422, "grad_norm": 3.203125, "grad_norm_var": 0.38752848307291665, "learning_rate": 0.0001, "loss": 3.7338, "loss/crossentropy": 2.280995038151741, "loss/hidden": 3.271875, "loss/incoh": 0.0, "loss/logits": 0.34781029969453814, "loss/reg": 0.0, "step": 2680 }, { "epoch": 0.017697368421052632, "grad_norm": 2.953125, "grad_norm_var": 0.5138661702473958, "learning_rate": 0.0001, "loss": 3.7751, "loss/crossentropy": 2.5928542375564576, "loss/hidden": 3.2203125, "loss/incoh": 0.0, "loss/logits": 0.3643882930278778, "loss/reg": 0.0, "step": 2690 }, { "epoch": 0.017763157894736842, "grad_norm": 3.015625, "grad_norm_var": 2.5011057535807293, "learning_rate": 0.0001, "loss": 3.7281, "loss/crossentropy": 2.71818265914917, "loss/hidden": 3.3203125, "loss/incoh": 0.0, "loss/logits": 0.41874536871910095, "loss/reg": 0.0, "step": 2700 }, { "epoch": 0.017828947368421052, "grad_norm": 3.0625, "grad_norm_var": 0.2831939697265625, "learning_rate": 0.0001, "loss": 3.7723, "loss/crossentropy": 2.3557824969291685, "loss/hidden": 3.303125, "loss/incoh": 0.0, "loss/logits": 0.36439308822155, "loss/reg": 0.0, "step": 2710 }, { "epoch": 0.017894736842105262, "grad_norm": 3.390625, "grad_norm_var": 1.9459625244140626, "learning_rate": 0.0001, "loss": 3.854, "loss/crossentropy": 2.3793618083000183, "loss/hidden": 3.371875, "loss/incoh": 0.0, "loss/logits": 0.36435145139694214, "loss/reg": 0.0, "step": 2720 }, { "epoch": 0.017960526315789475, "grad_norm": 2.5, "grad_norm_var": 1.9445271809895834, "learning_rate": 0.0001, "loss": 3.7925, "loss/crossentropy": 2.364825797080994, "loss/hidden": 3.2453125, "loss/incoh": 0.0, "loss/logits": 0.3577578902244568, "loss/reg": 0.0, "step": 2730 }, { "epoch": 0.018026315789473685, "grad_norm": 2.9375, "grad_norm_var": 0.18502197265625, "learning_rate": 0.0001, "loss": 3.747, "loss/crossentropy": 2.3110872566699983, "loss/hidden": 3.4078125, "loss/incoh": 0.0, "loss/logits": 0.3662073493003845, "loss/reg": 0.0, "step": 2740 }, { "epoch": 0.018092105263157895, "grad_norm": 2.90625, "grad_norm_var": 0.06238606770833333, "learning_rate": 0.0001, "loss": 3.736, "loss/crossentropy": 2.4977880120277405, "loss/hidden": 3.3578125, "loss/incoh": 0.0, "loss/logits": 0.40198240578174593, "loss/reg": 0.0, "step": 2750 }, { "epoch": 0.018157894736842106, "grad_norm": 2.65625, "grad_norm_var": 1.3276357014973958, "learning_rate": 0.0001, "loss": 3.7594, "loss/crossentropy": 2.3260527729988096, "loss/hidden": 3.39375, "loss/incoh": 0.0, "loss/logits": 0.3752635881304741, "loss/reg": 0.0, "step": 2760 }, { "epoch": 0.018223684210526316, "grad_norm": 3.0, "grad_norm_var": 0.06177978515625, "learning_rate": 0.0001, "loss": 3.7632, "loss/crossentropy": 2.6722695350646974, "loss/hidden": 3.34375, "loss/incoh": 0.0, "loss/logits": 0.36579819619655607, "loss/reg": 0.0, "step": 2770 }, { "epoch": 0.018289473684210526, "grad_norm": 3.78125, "grad_norm_var": 0.07119038899739584, "learning_rate": 0.0001, "loss": 3.7821, "loss/crossentropy": 2.712409424781799, "loss/hidden": 3.59375, "loss/incoh": 0.0, "loss/logits": 0.43594706654548643, "loss/reg": 0.0, "step": 2780 }, { "epoch": 0.018355263157894736, "grad_norm": 3.21875, "grad_norm_var": 2.3453409830729166, "learning_rate": 0.0001, "loss": 3.7973, "loss/crossentropy": 2.547107517719269, "loss/hidden": 3.39375, "loss/incoh": 0.0, "loss/logits": 0.4092650800943375, "loss/reg": 0.0, "step": 2790 }, { "epoch": 0.018421052631578946, "grad_norm": 2.796875, "grad_norm_var": 4.506314086914062, "learning_rate": 0.0001, "loss": 3.6334, "loss/crossentropy": 2.5399341940879823, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.4391636699438095, "loss/reg": 0.0, "step": 2800 }, { "epoch": 0.01848684210526316, "grad_norm": 4.84375, "grad_norm_var": 2.5375152587890626, "learning_rate": 0.0001, "loss": 3.7292, "loss/crossentropy": 2.5819294929504393, "loss/hidden": 3.365625, "loss/incoh": 0.0, "loss/logits": 0.33986122310161593, "loss/reg": 0.0, "step": 2810 }, { "epoch": 0.01855263157894737, "grad_norm": 4.03125, "grad_norm_var": 0.3421295166015625, "learning_rate": 0.0001, "loss": 3.7003, "loss/crossentropy": 2.4064539194107057, "loss/hidden": 3.334375, "loss/incoh": 0.0, "loss/logits": 0.3262439340353012, "loss/reg": 0.0, "step": 2820 }, { "epoch": 0.01861842105263158, "grad_norm": 2.828125, "grad_norm_var": 0.1756988525390625, "learning_rate": 0.0001, "loss": 3.804, "loss/crossentropy": 2.6151478767395018, "loss/hidden": 3.6390625, "loss/incoh": 0.0, "loss/logits": 0.4677980303764343, "loss/reg": 0.0, "step": 2830 }, { "epoch": 0.01868421052631579, "grad_norm": 3.03125, "grad_norm_var": 0.10458577473958333, "learning_rate": 0.0001, "loss": 3.7672, "loss/crossentropy": 2.5210029244422913, "loss/hidden": 3.3953125, "loss/incoh": 0.0, "loss/logits": 0.40769249498844146, "loss/reg": 0.0, "step": 2840 }, { "epoch": 0.01875, "grad_norm": 3.28125, "grad_norm_var": 0.1360504150390625, "learning_rate": 0.0001, "loss": 3.7829, "loss/crossentropy": 2.3390053629875185, "loss/hidden": 3.4421875, "loss/incoh": 0.0, "loss/logits": 0.3656760662794113, "loss/reg": 0.0, "step": 2850 }, { "epoch": 0.01881578947368421, "grad_norm": 2.609375, "grad_norm_var": 0.21334635416666667, "learning_rate": 0.0001, "loss": 3.7445, "loss/crossentropy": 2.044054812192917, "loss/hidden": 3.55625, "loss/incoh": 0.0, "loss/logits": 0.45996856689453125, "loss/reg": 0.0, "step": 2860 }, { "epoch": 0.01888157894736842, "grad_norm": 2.921875, "grad_norm_var": 0.35147196451822915, "learning_rate": 0.0001, "loss": 3.793, "loss/crossentropy": 2.2911840200424196, "loss/hidden": 3.5703125, "loss/incoh": 0.0, "loss/logits": 0.3373717874288559, "loss/reg": 0.0, "step": 2870 }, { "epoch": 0.018947368421052633, "grad_norm": 2.515625, "grad_norm_var": 0.3585845947265625, "learning_rate": 0.0001, "loss": 3.7057, "loss/crossentropy": 2.3755346536636353, "loss/hidden": 3.3609375, "loss/incoh": 0.0, "loss/logits": 0.36185318529605864, "loss/reg": 0.0, "step": 2880 }, { "epoch": 0.019013157894736843, "grad_norm": 2.859375, "grad_norm_var": 0.0810211181640625, "learning_rate": 0.0001, "loss": 3.7152, "loss/crossentropy": 2.1650418758392336, "loss/hidden": 3.5109375, "loss/incoh": 0.0, "loss/logits": 0.36209405958652496, "loss/reg": 0.0, "step": 2890 }, { "epoch": 0.019078947368421053, "grad_norm": 3.171875, "grad_norm_var": 1.6020182291666667, "learning_rate": 0.0001, "loss": 3.8985, "loss/crossentropy": 2.345675766468048, "loss/hidden": 3.9203125, "loss/incoh": 0.0, "loss/logits": 0.41470250189304353, "loss/reg": 0.0, "step": 2900 }, { "epoch": 0.019144736842105263, "grad_norm": 3.359375, "grad_norm_var": 1.289264933268229, "learning_rate": 0.0001, "loss": 3.8444, "loss/crossentropy": 2.3634544610977173, "loss/hidden": 3.2984375, "loss/incoh": 0.0, "loss/logits": 0.3347387105226517, "loss/reg": 0.0, "step": 2910 }, { "epoch": 0.019210526315789473, "grad_norm": 2.890625, "grad_norm_var": 0.19099019368489584, "learning_rate": 0.0001, "loss": 3.7385, "loss/crossentropy": 2.3202159285545347, "loss/hidden": 3.3265625, "loss/incoh": 0.0, "loss/logits": 0.30690879598259924, "loss/reg": 0.0, "step": 2920 }, { "epoch": 0.019276315789473683, "grad_norm": 3.484375, "grad_norm_var": 0.4906209309895833, "learning_rate": 0.0001, "loss": 3.6643, "loss/crossentropy": 2.2004665434360504, "loss/hidden": 3.484375, "loss/incoh": 0.0, "loss/logits": 0.3603193074464798, "loss/reg": 0.0, "step": 2930 }, { "epoch": 0.019342105263157893, "grad_norm": 3.421875, "grad_norm_var": 0.13365478515625, "learning_rate": 0.0001, "loss": 3.6656, "loss/crossentropy": 2.5021592140197755, "loss/hidden": 3.4109375, "loss/incoh": 0.0, "loss/logits": 0.35134916603565214, "loss/reg": 0.0, "step": 2940 }, { "epoch": 0.019407894736842107, "grad_norm": 2.765625, "grad_norm_var": 1.0520497639973958, "learning_rate": 0.0001, "loss": 3.7813, "loss/crossentropy": 2.4475439548492433, "loss/hidden": 3.44375, "loss/incoh": 0.0, "loss/logits": 0.4413463234901428, "loss/reg": 0.0, "step": 2950 }, { "epoch": 0.019473684210526317, "grad_norm": 2.90625, "grad_norm_var": 1.761279296875, "learning_rate": 0.0001, "loss": 3.7476, "loss/crossentropy": 2.57927063703537, "loss/hidden": 3.4234375, "loss/incoh": 0.0, "loss/logits": 0.4805259481072426, "loss/reg": 0.0, "step": 2960 }, { "epoch": 0.019539473684210527, "grad_norm": 3.734375, "grad_norm_var": 0.2982737223307292, "learning_rate": 0.0001, "loss": 3.7277, "loss/crossentropy": 2.0291129291057586, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.285884577780962, "loss/reg": 0.0, "step": 2970 }, { "epoch": 0.019605263157894737, "grad_norm": 3.03125, "grad_norm_var": 0.6626942952473959, "learning_rate": 0.0001, "loss": 3.8596, "loss/crossentropy": 2.112092435359955, "loss/hidden": 3.4453125, "loss/incoh": 0.0, "loss/logits": 0.36193700730800626, "loss/reg": 0.0, "step": 2980 }, { "epoch": 0.019671052631578947, "grad_norm": 3.359375, "grad_norm_var": 0.16502176920572917, "learning_rate": 0.0001, "loss": 3.7666, "loss/crossentropy": 2.2168065547943114, "loss/hidden": 3.3734375, "loss/incoh": 0.0, "loss/logits": 0.3593168243765831, "loss/reg": 0.0, "step": 2990 }, { "epoch": 0.019736842105263157, "grad_norm": 2.921875, "grad_norm_var": 0.45120035807291664, "learning_rate": 0.0001, "loss": 3.7107, "loss/crossentropy": 2.350427895784378, "loss/hidden": 3.3109375, "loss/incoh": 0.0, "loss/logits": 0.37478172183036806, "loss/reg": 0.0, "step": 3000 }, { "epoch": 0.019802631578947367, "grad_norm": 2.71875, "grad_norm_var": 0.15097249348958333, "learning_rate": 0.0001, "loss": 3.7559, "loss/crossentropy": 2.6041540622711183, "loss/hidden": 3.3421875, "loss/incoh": 0.0, "loss/logits": 0.366491025686264, "loss/reg": 0.0, "step": 3010 }, { "epoch": 0.01986842105263158, "grad_norm": 2.953125, "grad_norm_var": 0.1023834228515625, "learning_rate": 0.0001, "loss": 3.7322, "loss/crossentropy": 2.2936912298202516, "loss/hidden": 3.4234375, "loss/incoh": 0.0, "loss/logits": 0.3990582287311554, "loss/reg": 0.0, "step": 3020 }, { "epoch": 0.01993421052631579, "grad_norm": 2.9375, "grad_norm_var": 0.07664388020833333, "learning_rate": 0.0001, "loss": 3.7405, "loss/crossentropy": 2.1873862028121946, "loss/hidden": 3.4703125, "loss/incoh": 0.0, "loss/logits": 0.4286995857954025, "loss/reg": 0.0, "step": 3030 }, { "epoch": 0.02, "grad_norm": 2.53125, "grad_norm_var": 0.3075480143229167, "learning_rate": 0.0001, "loss": 3.7681, "loss/crossentropy": 2.0712085247039793, "loss/hidden": 3.096875, "loss/incoh": 0.0, "loss/logits": 0.2994038611650467, "loss/reg": 0.0, "step": 3040 }, { "epoch": 0.02006578947368421, "grad_norm": 2.734375, "grad_norm_var": 0.6994049072265625, "learning_rate": 0.0001, "loss": 3.7519, "loss/crossentropy": 2.155745780467987, "loss/hidden": 3.2890625, "loss/incoh": 0.0, "loss/logits": 0.3106741845607758, "loss/reg": 0.0, "step": 3050 }, { "epoch": 0.02013157894736842, "grad_norm": 3.53125, "grad_norm_var": 0.177978515625, "learning_rate": 0.0001, "loss": 3.7618, "loss/crossentropy": 2.1236796349287035, "loss/hidden": 3.2046875, "loss/incoh": 0.0, "loss/logits": 0.3017591178417206, "loss/reg": 0.0, "step": 3060 }, { "epoch": 0.02019736842105263, "grad_norm": 3.265625, "grad_norm_var": 0.16617431640625, "learning_rate": 0.0001, "loss": 3.7225, "loss/crossentropy": 2.4667672872543336, "loss/hidden": 3.3328125, "loss/incoh": 0.0, "loss/logits": 0.3858541399240494, "loss/reg": 0.0, "step": 3070 }, { "epoch": 0.02026315789473684, "grad_norm": 2.703125, "grad_norm_var": 2.2265110394707968e+17, "learning_rate": 0.0001, "loss": 3.8077, "loss/crossentropy": 2.53361736536026, "loss/hidden": 3.121875, "loss/incoh": 0.0, "loss/logits": 0.3336446687579155, "loss/reg": 0.0, "step": 3080 }, { "epoch": 0.020328947368421054, "grad_norm": 2.34375, "grad_norm_var": 2.2265110387924992e+17, "learning_rate": 0.0001, "loss": 3.7067, "loss/crossentropy": 2.3802372574806214, "loss/hidden": 3.2609375, "loss/incoh": 0.0, "loss/logits": 0.3561277031898499, "loss/reg": 0.0, "step": 3090 }, { "epoch": 0.020394736842105264, "grad_norm": 4.53125, "grad_norm_var": 1.25572509765625, "learning_rate": 0.0001, "loss": 3.8816, "loss/crossentropy": 2.8750504910945893, "loss/hidden": 3.728125, "loss/incoh": 0.0, "loss/logits": 0.37489808425307275, "loss/reg": 0.0, "step": 3100 }, { "epoch": 0.020460526315789474, "grad_norm": 2.640625, "grad_norm_var": 0.77646484375, "learning_rate": 0.0001, "loss": 3.6678, "loss/crossentropy": 2.258682942390442, "loss/hidden": 3.4890625, "loss/incoh": 0.0, "loss/logits": 0.3497451141476631, "loss/reg": 0.0, "step": 3110 }, { "epoch": 0.020526315789473684, "grad_norm": 2.8125, "grad_norm_var": 0.047412109375, "learning_rate": 0.0001, "loss": 3.6585, "loss/crossentropy": 2.3022143959999086, "loss/hidden": 3.375, "loss/incoh": 0.0, "loss/logits": 0.37089207768440247, "loss/reg": 0.0, "step": 3120 }, { "epoch": 0.020592105263157894, "grad_norm": 2.921875, "grad_norm_var": 1.5189036051432292, "learning_rate": 0.0001, "loss": 3.7862, "loss/crossentropy": 2.6240602493286134, "loss/hidden": 3.35625, "loss/incoh": 0.0, "loss/logits": 0.407352888584137, "loss/reg": 0.0, "step": 3130 }, { "epoch": 0.020657894736842104, "grad_norm": 3.71875, "grad_norm_var": 1.5723052978515626, "learning_rate": 0.0001, "loss": 3.7355, "loss/crossentropy": 2.290934902429581, "loss/hidden": 3.3125, "loss/incoh": 0.0, "loss/logits": 0.3808047503232956, "loss/reg": 0.0, "step": 3140 }, { "epoch": 0.020723684210526314, "grad_norm": 2.84375, "grad_norm_var": 0.5337961832682292, "learning_rate": 0.0001, "loss": 3.7778, "loss/crossentropy": 2.485193204879761, "loss/hidden": 3.515625, "loss/incoh": 0.0, "loss/logits": 0.4543539136648178, "loss/reg": 0.0, "step": 3150 }, { "epoch": 0.020789473684210528, "grad_norm": 2.8125, "grad_norm_var": 0.3465728759765625, "learning_rate": 0.0001, "loss": 3.7506, "loss/crossentropy": 2.557511067390442, "loss/hidden": 3.3953125, "loss/incoh": 0.0, "loss/logits": 0.4272035837173462, "loss/reg": 0.0, "step": 3160 }, { "epoch": 0.020855263157894738, "grad_norm": 3.0625, "grad_norm_var": 0.20754801432291667, "learning_rate": 0.0001, "loss": 3.6316, "loss/crossentropy": 2.310171937942505, "loss/hidden": 3.184375, "loss/incoh": 0.0, "loss/logits": 0.32435240745544436, "loss/reg": 0.0, "step": 3170 }, { "epoch": 0.020921052631578948, "grad_norm": 2.578125, "grad_norm_var": 0.833984375, "learning_rate": 0.0001, "loss": 3.7324, "loss/crossentropy": 2.5518528699874876, "loss/hidden": 3.3765625, "loss/incoh": 0.0, "loss/logits": 0.3816035658121109, "loss/reg": 0.0, "step": 3180 }, { "epoch": 0.020986842105263158, "grad_norm": 2.984375, "grad_norm_var": 2.3063795635792774e+17, "learning_rate": 0.0001, "loss": 3.8813, "loss/crossentropy": 2.3198139667510986, "loss/hidden": 3.38125, "loss/incoh": 0.0, "loss/logits": 0.3700142025947571, "loss/reg": 0.0, "step": 3190 }, { "epoch": 0.021052631578947368, "grad_norm": 2.78125, "grad_norm_var": 0.10243733723958333, "learning_rate": 0.0001, "loss": 3.6063, "loss/crossentropy": 2.545992207527161, "loss/hidden": 3.2875, "loss/incoh": 0.0, "loss/logits": 0.3494896024465561, "loss/reg": 0.0, "step": 3200 }, { "epoch": 0.021118421052631578, "grad_norm": 2.8125, "grad_norm_var": 0.06311442057291666, "learning_rate": 0.0001, "loss": 3.6901, "loss/crossentropy": 2.330699014663696, "loss/hidden": 3.3625, "loss/incoh": 0.0, "loss/logits": 0.34815158843994143, "loss/reg": 0.0, "step": 3210 }, { "epoch": 0.021184210526315788, "grad_norm": 3.25, "grad_norm_var": 0.3830718994140625, "learning_rate": 0.0001, "loss": 3.7327, "loss/crossentropy": 2.489699113368988, "loss/hidden": 3.325, "loss/incoh": 0.0, "loss/logits": 0.337864650785923, "loss/reg": 0.0, "step": 3220 }, { "epoch": 0.02125, "grad_norm": 2.828125, "grad_norm_var": 0.38974609375, "learning_rate": 0.0001, "loss": 3.6871, "loss/crossentropy": 2.3864477396011354, "loss/hidden": 3.3859375, "loss/incoh": 0.0, "loss/logits": 0.4103096604347229, "loss/reg": 0.0, "step": 3230 }, { "epoch": 0.02131578947368421, "grad_norm": 2.703125, "grad_norm_var": 0.05084228515625, "learning_rate": 0.0001, "loss": 3.6458, "loss/crossentropy": 2.4074989527463915, "loss/hidden": 3.375, "loss/incoh": 0.0, "loss/logits": 0.35124915838241577, "loss/reg": 0.0, "step": 3240 }, { "epoch": 0.02138157894736842, "grad_norm": 3.453125, "grad_norm_var": 0.08498942057291667, "learning_rate": 0.0001, "loss": 3.7083, "loss/crossentropy": 2.524831974506378, "loss/hidden": 3.2515625, "loss/incoh": 0.0, "loss/logits": 0.368264502286911, "loss/reg": 0.0, "step": 3250 }, { "epoch": 0.02144736842105263, "grad_norm": 2.78125, "grad_norm_var": 0.42135009765625, "learning_rate": 0.0001, "loss": 3.7602, "loss/crossentropy": 2.166905391216278, "loss/hidden": 3.371875, "loss/incoh": 0.0, "loss/logits": 0.3093524396419525, "loss/reg": 0.0, "step": 3260 }, { "epoch": 0.02151315789473684, "grad_norm": 2.546875, "grad_norm_var": 0.4861724853515625, "learning_rate": 0.0001, "loss": 3.6738, "loss/crossentropy": 2.2662750601768495, "loss/hidden": 3.1578125, "loss/incoh": 0.0, "loss/logits": 0.318782140314579, "loss/reg": 0.0, "step": 3270 }, { "epoch": 0.02157894736842105, "grad_norm": 2.875, "grad_norm_var": 0.13782145182291666, "learning_rate": 0.0001, "loss": 3.7461, "loss/crossentropy": 2.0462193369865416, "loss/hidden": 3.2859375, "loss/incoh": 0.0, "loss/logits": 0.30782590508461, "loss/reg": 0.0, "step": 3280 }, { "epoch": 0.021644736842105262, "grad_norm": 2.875, "grad_norm_var": 0.11330973307291667, "learning_rate": 0.0001, "loss": 3.7008, "loss/crossentropy": 2.5261476397514344, "loss/hidden": 3.2734375, "loss/incoh": 0.0, "loss/logits": 0.3591727793216705, "loss/reg": 0.0, "step": 3290 }, { "epoch": 0.021710526315789475, "grad_norm": 3.0, "grad_norm_var": 0.1515533447265625, "learning_rate": 0.0001, "loss": 3.751, "loss/crossentropy": 2.569035267829895, "loss/hidden": 3.3140625, "loss/incoh": 0.0, "loss/logits": 0.3920204371213913, "loss/reg": 0.0, "step": 3300 }, { "epoch": 0.021776315789473685, "grad_norm": 2.96875, "grad_norm_var": 0.09117431640625, "learning_rate": 0.0001, "loss": 3.6385, "loss/crossentropy": 2.5647154331207274, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.34364444613456724, "loss/reg": 0.0, "step": 3310 }, { "epoch": 0.021842105263157895, "grad_norm": 2.5625, "grad_norm_var": 1.6411692301432292, "learning_rate": 0.0001, "loss": 3.6335, "loss/crossentropy": 2.427185571193695, "loss/hidden": 3.30625, "loss/incoh": 0.0, "loss/logits": 0.3427447766065598, "loss/reg": 0.0, "step": 3320 }, { "epoch": 0.021907894736842105, "grad_norm": 2.390625, "grad_norm_var": 0.11515299479166667, "learning_rate": 0.0001, "loss": 3.6807, "loss/crossentropy": 2.1253209471702577, "loss/hidden": 3.428125, "loss/incoh": 0.0, "loss/logits": 0.36423676908016206, "loss/reg": 0.0, "step": 3330 }, { "epoch": 0.021973684210526315, "grad_norm": 2.78125, "grad_norm_var": 0.046956380208333336, "learning_rate": 0.0001, "loss": 3.5711, "loss/crossentropy": 2.3783676266670226, "loss/hidden": 3.1703125, "loss/incoh": 0.0, "loss/logits": 0.3118838146328926, "loss/reg": 0.0, "step": 3340 }, { "epoch": 0.022039473684210525, "grad_norm": 2.953125, "grad_norm_var": 0.060791015625, "learning_rate": 0.0001, "loss": 3.6252, "loss/crossentropy": 2.350738251209259, "loss/hidden": 3.1625, "loss/incoh": 0.0, "loss/logits": 0.2957428440451622, "loss/reg": 0.0, "step": 3350 }, { "epoch": 0.022105263157894735, "grad_norm": 2.4375, "grad_norm_var": 0.09226888020833333, "learning_rate": 0.0001, "loss": 3.7233, "loss/crossentropy": 2.5446563720703126, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.3123622477054596, "loss/reg": 0.0, "step": 3360 }, { "epoch": 0.02217105263157895, "grad_norm": 2.828125, "grad_norm_var": 0.3047108968098958, "learning_rate": 0.0001, "loss": 3.6972, "loss/crossentropy": 2.33814697265625, "loss/hidden": 3.20625, "loss/incoh": 0.0, "loss/logits": 0.30704180896282196, "loss/reg": 0.0, "step": 3370 }, { "epoch": 0.02223684210526316, "grad_norm": 3.03125, "grad_norm_var": 0.17353413899739584, "learning_rate": 0.0001, "loss": 3.6799, "loss/crossentropy": 2.2355513691902162, "loss/hidden": 3.390625, "loss/incoh": 0.0, "loss/logits": 0.3377602517604828, "loss/reg": 0.0, "step": 3380 }, { "epoch": 0.02230263157894737, "grad_norm": 2.609375, "grad_norm_var": 6.300797526041666, "learning_rate": 0.0001, "loss": 3.7636, "loss/crossentropy": 2.3003466069698333, "loss/hidden": 3.396875, "loss/incoh": 0.0, "loss/logits": 0.3391520828008652, "loss/reg": 0.0, "step": 3390 }, { "epoch": 0.02236842105263158, "grad_norm": 3.34375, "grad_norm_var": 0.0972564697265625, "learning_rate": 0.0001, "loss": 3.658, "loss/crossentropy": 2.3254055261611937, "loss/hidden": 3.246875, "loss/incoh": 0.0, "loss/logits": 0.3125807404518127, "loss/reg": 0.0, "step": 3400 }, { "epoch": 0.02243421052631579, "grad_norm": 2.59375, "grad_norm_var": 15.84599609375, "learning_rate": 0.0001, "loss": 3.7692, "loss/crossentropy": 2.753233790397644, "loss/hidden": 3.1546875, "loss/incoh": 0.0, "loss/logits": 0.3326481133699417, "loss/reg": 0.0, "step": 3410 }, { "epoch": 0.0225, "grad_norm": 4.875, "grad_norm_var": 1.10299072265625, "learning_rate": 0.0001, "loss": 3.7531, "loss/crossentropy": 2.282338631153107, "loss/hidden": 3.3328125, "loss/incoh": 0.0, "loss/logits": 0.3607694834470749, "loss/reg": 0.0, "step": 3420 }, { "epoch": 0.02256578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.626123046875, "learning_rate": 0.0001, "loss": 3.8275, "loss/crossentropy": 2.5421807527542115, "loss/hidden": 3.425, "loss/incoh": 0.0, "loss/logits": 0.5308041572570801, "loss/reg": 0.0, "step": 3430 }, { "epoch": 0.022631578947368423, "grad_norm": 2.78125, "grad_norm_var": 0.40051676432291666, "learning_rate": 0.0001, "loss": 3.7523, "loss/crossentropy": 2.541818845272064, "loss/hidden": 3.3625, "loss/incoh": 0.0, "loss/logits": 0.359403657913208, "loss/reg": 0.0, "step": 3440 }, { "epoch": 0.022697368421052633, "grad_norm": 2.90625, "grad_norm_var": 0.35347391764322916, "learning_rate": 0.0001, "loss": 3.6627, "loss/crossentropy": 2.5443089246749877, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.32425140738487246, "loss/reg": 0.0, "step": 3450 }, { "epoch": 0.022763157894736843, "grad_norm": 3.296875, "grad_norm_var": 0.09103190104166667, "learning_rate": 0.0001, "loss": 3.6541, "loss/crossentropy": 2.4523619592189787, "loss/hidden": 3.4453125, "loss/incoh": 0.0, "loss/logits": 0.3741248741745949, "loss/reg": 0.0, "step": 3460 }, { "epoch": 0.022828947368421053, "grad_norm": 5.375, "grad_norm_var": 1.3390533447265625, "learning_rate": 0.0001, "loss": 3.7552, "loss/crossentropy": 2.282164466381073, "loss/hidden": 3.3984375, "loss/incoh": 0.0, "loss/logits": 0.32355323880910875, "loss/reg": 0.0, "step": 3470 }, { "epoch": 0.022894736842105263, "grad_norm": 2.859375, "grad_norm_var": 0.5425608317057292, "learning_rate": 0.0001, "loss": 3.6438, "loss/crossentropy": 2.584494400024414, "loss/hidden": 3.325, "loss/incoh": 0.0, "loss/logits": 0.3407262712717056, "loss/reg": 0.0, "step": 3480 }, { "epoch": 0.022960526315789473, "grad_norm": 2.796875, "grad_norm_var": 11.746613566080729, "learning_rate": 0.0001, "loss": 3.8186, "loss/crossentropy": 2.781242084503174, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.516629433631897, "loss/reg": 0.0, "step": 3490 }, { "epoch": 0.023026315789473683, "grad_norm": 2.96875, "grad_norm_var": 0.38925374348958336, "learning_rate": 0.0001, "loss": 3.6824, "loss/crossentropy": 2.6085395932197573, "loss/hidden": 3.55625, "loss/incoh": 0.0, "loss/logits": 0.40641255080699923, "loss/reg": 0.0, "step": 3500 }, { "epoch": 0.023092105263157896, "grad_norm": 2.375, "grad_norm_var": 0.0653228759765625, "learning_rate": 0.0001, "loss": 3.6125, "loss/crossentropy": 2.143614149093628, "loss/hidden": 3.2953125, "loss/incoh": 0.0, "loss/logits": 0.32446493208408356, "loss/reg": 0.0, "step": 3510 }, { "epoch": 0.023157894736842106, "grad_norm": 3.78125, "grad_norm_var": 0.24351806640625, "learning_rate": 0.0001, "loss": 3.7227, "loss/crossentropy": 2.4581753849983214, "loss/hidden": 3.5296875, "loss/incoh": 0.0, "loss/logits": 0.4273629605770111, "loss/reg": 0.0, "step": 3520 }, { "epoch": 0.023223684210526317, "grad_norm": 2.546875, "grad_norm_var": 0.3322987874348958, "learning_rate": 0.0001, "loss": 3.6619, "loss/crossentropy": 2.452810299396515, "loss/hidden": 3.41875, "loss/incoh": 0.0, "loss/logits": 0.364976304769516, "loss/reg": 0.0, "step": 3530 }, { "epoch": 0.023289473684210527, "grad_norm": 2.546875, "grad_norm_var": 0.050780232747395834, "learning_rate": 0.0001, "loss": 3.5957, "loss/crossentropy": 2.6891199111938477, "loss/hidden": 3.0734375, "loss/incoh": 0.0, "loss/logits": 0.3124883592128754, "loss/reg": 0.0, "step": 3540 }, { "epoch": 0.023355263157894737, "grad_norm": 2.671875, "grad_norm_var": 0.08567301432291667, "learning_rate": 0.0001, "loss": 3.7773, "loss/crossentropy": 2.565224659442902, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.30210830420255663, "loss/reg": 0.0, "step": 3550 }, { "epoch": 0.023421052631578947, "grad_norm": 3.921875, "grad_norm_var": 0.20045166015625, "learning_rate": 0.0001, "loss": 3.6777, "loss/crossentropy": 2.3339913129806518, "loss/hidden": 3.7484375, "loss/incoh": 0.0, "loss/logits": 0.3782587692141533, "loss/reg": 0.0, "step": 3560 }, { "epoch": 0.023486842105263157, "grad_norm": 6.375, "grad_norm_var": 7.792964680989583, "learning_rate": 0.0001, "loss": 3.9088, "loss/crossentropy": 2.305986249446869, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.3260203331708908, "loss/reg": 0.0, "step": 3570 }, { "epoch": 0.023552631578947367, "grad_norm": 2.5, "grad_norm_var": 1.226398722330729, "learning_rate": 0.0001, "loss": 3.5858, "loss/crossentropy": 2.5068121433258055, "loss/hidden": 3.29375, "loss/incoh": 0.0, "loss/logits": 0.38687763810157777, "loss/reg": 0.0, "step": 3580 }, { "epoch": 0.02361842105263158, "grad_norm": 2.6875, "grad_norm_var": 0.08307291666666666, "learning_rate": 0.0001, "loss": 3.5749, "loss/crossentropy": 2.440618324279785, "loss/hidden": 3.246875, "loss/incoh": 0.0, "loss/logits": 0.3219692587852478, "loss/reg": 0.0, "step": 3590 }, { "epoch": 0.02368421052631579, "grad_norm": 2.921875, "grad_norm_var": 0.2329742431640625, "learning_rate": 0.0001, "loss": 3.7091, "loss/crossentropy": 2.3230647802352906, "loss/hidden": 3.4234375, "loss/incoh": 0.0, "loss/logits": 0.3276051238179207, "loss/reg": 0.0, "step": 3600 }, { "epoch": 0.02375, "grad_norm": 3.125, "grad_norm_var": 0.06494038899739583, "learning_rate": 0.0001, "loss": 3.6957, "loss/crossentropy": 2.549039614200592, "loss/hidden": 3.3265625, "loss/incoh": 0.0, "loss/logits": 0.4018037021160126, "loss/reg": 0.0, "step": 3610 }, { "epoch": 0.02381578947368421, "grad_norm": 2.546875, "grad_norm_var": 0.05314839680989583, "learning_rate": 0.0001, "loss": 3.6018, "loss/crossentropy": 2.5681329488754274, "loss/hidden": 3.3046875, "loss/incoh": 0.0, "loss/logits": 0.3874122858047485, "loss/reg": 0.0, "step": 3620 }, { "epoch": 0.02388157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.30606180826822915, "learning_rate": 0.0001, "loss": 3.6534, "loss/crossentropy": 2.224066364765167, "loss/hidden": 3.41875, "loss/incoh": 0.0, "loss/logits": 0.3868778973817825, "loss/reg": 0.0, "step": 3630 }, { "epoch": 0.02394736842105263, "grad_norm": 2.5, "grad_norm_var": 0.44580078125, "learning_rate": 0.0001, "loss": 3.7632, "loss/crossentropy": 2.313184142112732, "loss/hidden": 3.29375, "loss/incoh": 0.0, "loss/logits": 0.35929109454154967, "loss/reg": 0.0, "step": 3640 }, { "epoch": 0.02401315789473684, "grad_norm": 4.65625, "grad_norm_var": 0.43798421223958334, "learning_rate": 0.0001, "loss": 3.6503, "loss/crossentropy": 2.483446490764618, "loss/hidden": 3.2515625, "loss/incoh": 0.0, "loss/logits": 0.324999064207077, "loss/reg": 0.0, "step": 3650 }, { "epoch": 0.024078947368421054, "grad_norm": 2.71875, "grad_norm_var": 0.271875, "learning_rate": 0.0001, "loss": 3.7069, "loss/crossentropy": 2.2979444444179533, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.3168840616941452, "loss/reg": 0.0, "step": 3660 }, { "epoch": 0.024144736842105264, "grad_norm": 2.65625, "grad_norm_var": 0.0767242431640625, "learning_rate": 0.0001, "loss": 3.6808, "loss/crossentropy": 2.4076380014419554, "loss/hidden": 3.55625, "loss/incoh": 0.0, "loss/logits": 0.3966252237558365, "loss/reg": 0.0, "step": 3670 }, { "epoch": 0.024210526315789474, "grad_norm": 3.109375, "grad_norm_var": 0.06747639973958333, "learning_rate": 0.0001, "loss": 3.7852, "loss/crossentropy": 2.6107199430465697, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.3948290854692459, "loss/reg": 0.0, "step": 3680 }, { "epoch": 0.024276315789473684, "grad_norm": 3.765625, "grad_norm_var": 0.13434244791666666, "learning_rate": 0.0001, "loss": 3.6048, "loss/crossentropy": 2.5476237654685976, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.352567557990551, "loss/reg": 0.0, "step": 3690 }, { "epoch": 0.024342105263157894, "grad_norm": 2.34375, "grad_norm_var": 0.6126139322916667, "learning_rate": 0.0001, "loss": 3.7034, "loss/crossentropy": 2.435033369064331, "loss/hidden": 3.275, "loss/incoh": 0.0, "loss/logits": 0.37708690464496614, "loss/reg": 0.0, "step": 3700 }, { "epoch": 0.024407894736842104, "grad_norm": 3.03125, "grad_norm_var": 0.7496002197265625, "learning_rate": 0.0001, "loss": 3.6696, "loss/crossentropy": 2.550716495513916, "loss/hidden": 3.3328125, "loss/incoh": 0.0, "loss/logits": 0.38517349362373354, "loss/reg": 0.0, "step": 3710 }, { "epoch": 0.024473684210526314, "grad_norm": 2.296875, "grad_norm_var": 0.5516886393229167, "learning_rate": 0.0001, "loss": 3.5623, "loss/crossentropy": 2.3806477397680283, "loss/hidden": 3.153125, "loss/incoh": 0.0, "loss/logits": 0.3013214536011219, "loss/reg": 0.0, "step": 3720 }, { "epoch": 0.024539473684210528, "grad_norm": 3.1875, "grad_norm_var": 0.4890207926432292, "learning_rate": 0.0001, "loss": 3.6412, "loss/crossentropy": 2.3859506011009217, "loss/hidden": 3.428125, "loss/incoh": 0.0, "loss/logits": 0.49241943359375, "loss/reg": 0.0, "step": 3730 }, { "epoch": 0.024605263157894738, "grad_norm": 2.6875, "grad_norm_var": 0.2377838134765625, "learning_rate": 0.0001, "loss": 3.5895, "loss/crossentropy": 2.141975212097168, "loss/hidden": 3.2140625, "loss/incoh": 0.0, "loss/logits": 0.3277399495244026, "loss/reg": 0.0, "step": 3740 }, { "epoch": 0.024671052631578948, "grad_norm": 3.140625, "grad_norm_var": 0.12506510416666666, "learning_rate": 0.0001, "loss": 3.6233, "loss/crossentropy": 2.4696611404418944, "loss/hidden": 3.3265625, "loss/incoh": 0.0, "loss/logits": 0.3837138593196869, "loss/reg": 0.0, "step": 3750 }, { "epoch": 0.024736842105263158, "grad_norm": 2.34375, "grad_norm_var": 2.6720540364583334, "learning_rate": 0.0001, "loss": 3.7719, "loss/crossentropy": 2.494647514820099, "loss/hidden": 3.275, "loss/incoh": 0.0, "loss/logits": 0.37584047913551333, "loss/reg": 0.0, "step": 3760 }, { "epoch": 0.024802631578947368, "grad_norm": 2.53125, "grad_norm_var": 0.63804931640625, "learning_rate": 0.0001, "loss": 3.676, "loss/crossentropy": 2.5242549180984497, "loss/hidden": 3.2328125, "loss/incoh": 0.0, "loss/logits": 0.39244888722896576, "loss/reg": 0.0, "step": 3770 }, { "epoch": 0.024868421052631578, "grad_norm": 3.03125, "grad_norm_var": 0.3232167561848958, "learning_rate": 0.0001, "loss": 3.5206, "loss/crossentropy": 2.2167584180831907, "loss/hidden": 3.109375, "loss/incoh": 0.0, "loss/logits": 0.31152922809123995, "loss/reg": 0.0, "step": 3780 }, { "epoch": 0.024934210526315788, "grad_norm": 2.640625, "grad_norm_var": 0.8850331624348958, "learning_rate": 0.0001, "loss": 3.6831, "loss/crossentropy": 2.406609225273132, "loss/hidden": 3.2734375, "loss/incoh": 0.0, "loss/logits": 0.3529895097017288, "loss/reg": 0.0, "step": 3790 }, { "epoch": 0.025, "grad_norm": 2.390625, "grad_norm_var": 0.8739491780598958, "learning_rate": 0.0001, "loss": 3.6024, "loss/crossentropy": 2.270749258995056, "loss/hidden": 3.3390625, "loss/incoh": 0.0, "loss/logits": 0.32701381742954255, "loss/reg": 0.0, "step": 3800 }, { "epoch": 0.02506578947368421, "grad_norm": 2.625, "grad_norm_var": 0.06500651041666666, "learning_rate": 0.0001, "loss": 3.6037, "loss/crossentropy": 2.3936703205108643, "loss/hidden": 3.3078125, "loss/incoh": 0.0, "loss/logits": 0.4090299874544144, "loss/reg": 0.0, "step": 3810 }, { "epoch": 0.02513157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.3386301676432292, "learning_rate": 0.0001, "loss": 3.8203, "loss/crossentropy": 2.057803177833557, "loss/hidden": 3.390625, "loss/incoh": 0.0, "loss/logits": 0.3018879994750023, "loss/reg": 0.0, "step": 3820 }, { "epoch": 0.02519736842105263, "grad_norm": 10.375, "grad_norm_var": 3.7443756103515624, "learning_rate": 0.0001, "loss": 3.5987, "loss/crossentropy": 2.558328187465668, "loss/hidden": 3.3671875, "loss/incoh": 0.0, "loss/logits": 0.40375421941280365, "loss/reg": 0.0, "step": 3830 }, { "epoch": 0.02526315789473684, "grad_norm": 2.59375, "grad_norm_var": 4.824331665039063, "learning_rate": 0.0001, "loss": 3.6109, "loss/crossentropy": 2.4123119592666624, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3224829614162445, "loss/reg": 0.0, "step": 3840 }, { "epoch": 0.02532894736842105, "grad_norm": 2.921875, "grad_norm_var": 1.4220611572265625, "learning_rate": 0.0001, "loss": 3.6503, "loss/crossentropy": 2.4460156679153444, "loss/hidden": 3.3140625, "loss/incoh": 0.0, "loss/logits": 0.37439659237861633, "loss/reg": 0.0, "step": 3850 }, { "epoch": 0.02539473684210526, "grad_norm": 2.828125, "grad_norm_var": 0.24488525390625, "learning_rate": 0.0001, "loss": 3.6799, "loss/crossentropy": 2.471416544914246, "loss/hidden": 3.265625, "loss/incoh": 0.0, "loss/logits": 0.3233081191778183, "loss/reg": 0.0, "step": 3860 }, { "epoch": 0.025460526315789475, "grad_norm": 4.6875, "grad_norm_var": 2.005939737955729, "learning_rate": 0.0001, "loss": 3.7099, "loss/crossentropy": 2.46109699010849, "loss/hidden": 3.178125, "loss/incoh": 0.0, "loss/logits": 0.31833461821079256, "loss/reg": 0.0, "step": 3870 }, { "epoch": 0.025526315789473685, "grad_norm": 2.328125, "grad_norm_var": 0.4576568603515625, "learning_rate": 0.0001, "loss": 3.62, "loss/crossentropy": 2.4385437607765197, "loss/hidden": 3.1953125, "loss/incoh": 0.0, "loss/logits": 0.3444881528615952, "loss/reg": 0.0, "step": 3880 }, { "epoch": 0.025592105263157895, "grad_norm": 2.625, "grad_norm_var": 0.07125244140625, "learning_rate": 0.0001, "loss": 3.5814, "loss/crossentropy": 2.464188981056213, "loss/hidden": 3.2640625, "loss/incoh": 0.0, "loss/logits": 0.4162255361676216, "loss/reg": 0.0, "step": 3890 }, { "epoch": 0.025657894736842105, "grad_norm": 3.203125, "grad_norm_var": 0.13810221354166666, "learning_rate": 0.0001, "loss": 3.5963, "loss/crossentropy": 2.5776121497154234, "loss/hidden": 3.1296875, "loss/incoh": 0.0, "loss/logits": 0.351967790722847, "loss/reg": 0.0, "step": 3900 }, { "epoch": 0.025723684210526315, "grad_norm": 2.203125, "grad_norm_var": 0.13177083333333334, "learning_rate": 0.0001, "loss": 3.6137, "loss/crossentropy": 2.4709773540496824, "loss/hidden": 3.28125, "loss/incoh": 0.0, "loss/logits": 0.3811213612556458, "loss/reg": 0.0, "step": 3910 }, { "epoch": 0.025789473684210525, "grad_norm": 2.671875, "grad_norm_var": 0.047591145833333334, "learning_rate": 0.0001, "loss": 3.5243, "loss/crossentropy": 2.3289324045181274, "loss/hidden": 3.240625, "loss/incoh": 0.0, "loss/logits": 0.32472735941410064, "loss/reg": 0.0, "step": 3920 }, { "epoch": 0.025855263157894735, "grad_norm": 3.359375, "grad_norm_var": 0.07144266764322917, "learning_rate": 0.0001, "loss": 3.5618, "loss/crossentropy": 2.151239442825317, "loss/hidden": 3.15, "loss/incoh": 0.0, "loss/logits": 0.3790448889136314, "loss/reg": 0.0, "step": 3930 }, { "epoch": 0.02592105263157895, "grad_norm": 2.78125, "grad_norm_var": 0.19239908854166668, "learning_rate": 0.0001, "loss": 3.5689, "loss/crossentropy": 2.281427323818207, "loss/hidden": 3.321875, "loss/incoh": 0.0, "loss/logits": 0.32773717790842055, "loss/reg": 0.0, "step": 3940 }, { "epoch": 0.02598684210526316, "grad_norm": 2.765625, "grad_norm_var": 0.07558186848958333, "learning_rate": 0.0001, "loss": 3.5047, "loss/crossentropy": 2.2366716623306275, "loss/hidden": 3.36875, "loss/incoh": 0.0, "loss/logits": 0.3462549954652786, "loss/reg": 0.0, "step": 3950 }, { "epoch": 0.02605263157894737, "grad_norm": 2.46875, "grad_norm_var": 0.0950347900390625, "learning_rate": 0.0001, "loss": 3.6604, "loss/crossentropy": 2.6565569043159485, "loss/hidden": 3.3375, "loss/incoh": 0.0, "loss/logits": 0.38759642243385317, "loss/reg": 0.0, "step": 3960 }, { "epoch": 0.02611842105263158, "grad_norm": 2.328125, "grad_norm_var": 0.04934895833333333, "learning_rate": 0.0001, "loss": 3.4984, "loss/crossentropy": 2.3093223094940187, "loss/hidden": 3.2484375, "loss/incoh": 0.0, "loss/logits": 0.3547346442937851, "loss/reg": 0.0, "step": 3970 }, { "epoch": 0.02618421052631579, "grad_norm": 2.640625, "grad_norm_var": 0.6375284830729167, "learning_rate": 0.0001, "loss": 3.6552, "loss/crossentropy": 2.5669564962387086, "loss/hidden": 3.640625, "loss/incoh": 0.0, "loss/logits": 0.3741306886076927, "loss/reg": 0.0, "step": 3980 }, { "epoch": 0.02625, "grad_norm": 3.328125, "grad_norm_var": 0.1037994384765625, "learning_rate": 0.0001, "loss": 3.707, "loss/crossentropy": 2.255212366580963, "loss/hidden": 3.4078125, "loss/incoh": 0.0, "loss/logits": 0.36638626754283904, "loss/reg": 0.0, "step": 3990 }, { "epoch": 0.02631578947368421, "grad_norm": 3.484375, "grad_norm_var": 0.1186431884765625, "learning_rate": 0.0001, "loss": 3.5907, "loss/crossentropy": 2.5717132806777956, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.3674279361963272, "loss/reg": 0.0, "step": 4000 }, { "epoch": 0.026381578947368423, "grad_norm": 3.09375, "grad_norm_var": 0.09750874837239583, "learning_rate": 0.0001, "loss": 3.5937, "loss/crossentropy": 2.494677722454071, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.34122500121593474, "loss/reg": 0.0, "step": 4010 }, { "epoch": 0.026447368421052633, "grad_norm": 2.9375, "grad_norm_var": 0.07803446451822917, "learning_rate": 0.0001, "loss": 3.4996, "loss/crossentropy": 2.4037094593048094, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.34369616210460663, "loss/reg": 0.0, "step": 4020 }, { "epoch": 0.026513157894736843, "grad_norm": 2.875, "grad_norm_var": 0.031087239583333332, "learning_rate": 0.0001, "loss": 3.5883, "loss/crossentropy": 2.5161670804023744, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.3320692449808121, "loss/reg": 0.0, "step": 4030 }, { "epoch": 0.026578947368421053, "grad_norm": 2.46875, "grad_norm_var": 0.27988993326822914, "learning_rate": 0.0001, "loss": 3.5987, "loss/crossentropy": 2.489400029182434, "loss/hidden": 3.2828125, "loss/incoh": 0.0, "loss/logits": 0.37186973094940184, "loss/reg": 0.0, "step": 4040 }, { "epoch": 0.026644736842105263, "grad_norm": 2.453125, "grad_norm_var": 0.2902740478515625, "learning_rate": 0.0001, "loss": 3.6423, "loss/crossentropy": 2.1810465335845945, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.30852093994617463, "loss/reg": 0.0, "step": 4050 }, { "epoch": 0.026710526315789473, "grad_norm": 3.203125, "grad_norm_var": 0.2831776936848958, "learning_rate": 0.0001, "loss": 3.5696, "loss/crossentropy": 2.5600404262542726, "loss/hidden": 3.2265625, "loss/incoh": 0.0, "loss/logits": 0.38900414407253264, "loss/reg": 0.0, "step": 4060 }, { "epoch": 0.026776315789473683, "grad_norm": 4.125, "grad_norm_var": 0.25816141764322914, "learning_rate": 0.0001, "loss": 3.6429, "loss/crossentropy": 2.4915748476982116, "loss/hidden": 3.290625, "loss/incoh": 0.0, "loss/logits": 0.35222682952880857, "loss/reg": 0.0, "step": 4070 }, { "epoch": 0.026842105263157896, "grad_norm": 3.875, "grad_norm_var": 2.97427978515625, "learning_rate": 0.0001, "loss": 3.6704, "loss/crossentropy": 2.131973624229431, "loss/hidden": 3.36875, "loss/incoh": 0.0, "loss/logits": 0.32549644112586973, "loss/reg": 0.0, "step": 4080 }, { "epoch": 0.026907894736842106, "grad_norm": 2.484375, "grad_norm_var": 0.25222981770833336, "learning_rate": 0.0001, "loss": 3.591, "loss/crossentropy": 2.196081441640854, "loss/hidden": 3.240625, "loss/incoh": 0.0, "loss/logits": 0.35593045353889463, "loss/reg": 0.0, "step": 4090 }, { "epoch": 0.026973684210526316, "grad_norm": 3.03125, "grad_norm_var": 0.20009765625, "learning_rate": 0.0001, "loss": 3.5044, "loss/crossentropy": 2.3047094464302065, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.2999614104628563, "loss/reg": 0.0, "step": 4100 }, { "epoch": 0.027039473684210526, "grad_norm": 2.53125, "grad_norm_var": 0.5296051025390625, "learning_rate": 0.0001, "loss": 3.6015, "loss/crossentropy": 2.4926783800125123, "loss/hidden": 3.2578125, "loss/incoh": 0.0, "loss/logits": 0.342082779109478, "loss/reg": 0.0, "step": 4110 }, { "epoch": 0.027105263157894736, "grad_norm": 2.609375, "grad_norm_var": 0.05623270670572917, "learning_rate": 0.0001, "loss": 3.5623, "loss/crossentropy": 2.6063008666038514, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.3086448922753334, "loss/reg": 0.0, "step": 4120 }, { "epoch": 0.027171052631578946, "grad_norm": 2.671875, "grad_norm_var": 0.103515625, "learning_rate": 0.0001, "loss": 3.5628, "loss/crossentropy": 2.516204798221588, "loss/hidden": 3.2609375, "loss/incoh": 0.0, "loss/logits": 0.36054509580135347, "loss/reg": 0.0, "step": 4130 }, { "epoch": 0.027236842105263157, "grad_norm": 2.421875, "grad_norm_var": 0.09970601399739583, "learning_rate": 0.0001, "loss": 3.5328, "loss/crossentropy": 2.5081961393356322, "loss/hidden": 3.2828125, "loss/incoh": 0.0, "loss/logits": 0.3743078649044037, "loss/reg": 0.0, "step": 4140 }, { "epoch": 0.02730263157894737, "grad_norm": 2.4375, "grad_norm_var": 0.0626373291015625, "learning_rate": 0.0001, "loss": 3.5896, "loss/crossentropy": 2.386087703704834, "loss/hidden": 3.225, "loss/incoh": 0.0, "loss/logits": 0.33356338143348696, "loss/reg": 0.0, "step": 4150 }, { "epoch": 0.02736842105263158, "grad_norm": 3.03125, "grad_norm_var": 0.12830301920572917, "learning_rate": 0.0001, "loss": 3.6293, "loss/crossentropy": 2.2993146777153015, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.28893803358078, "loss/reg": 0.0, "step": 4160 }, { "epoch": 0.02743421052631579, "grad_norm": 2.4375, "grad_norm_var": 0.29797770182291666, "learning_rate": 0.0001, "loss": 3.5201, "loss/crossentropy": 2.4818823099136353, "loss/hidden": 3.309375, "loss/incoh": 0.0, "loss/logits": 0.3620707929134369, "loss/reg": 0.0, "step": 4170 }, { "epoch": 0.0275, "grad_norm": 2.28125, "grad_norm_var": 0.2361328125, "learning_rate": 0.0001, "loss": 3.533, "loss/crossentropy": 2.4130859971046448, "loss/hidden": 3.10625, "loss/incoh": 0.0, "loss/logits": 0.3289807617664337, "loss/reg": 0.0, "step": 4180 }, { "epoch": 0.02756578947368421, "grad_norm": 2.765625, "grad_norm_var": 0.06122945149739583, "learning_rate": 0.0001, "loss": 3.4961, "loss/crossentropy": 2.3559444665908815, "loss/hidden": 3.1140625, "loss/incoh": 0.0, "loss/logits": 0.3270682215690613, "loss/reg": 0.0, "step": 4190 }, { "epoch": 0.02763157894736842, "grad_norm": 2.609375, "grad_norm_var": 0.3246378580729167, "learning_rate": 0.0001, "loss": 3.7095, "loss/crossentropy": 2.370071732997894, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.36643182039260863, "loss/reg": 0.0, "step": 4200 }, { "epoch": 0.02769736842105263, "grad_norm": 8.375, "grad_norm_var": 2.272069295247396, "learning_rate": 0.0001, "loss": 3.5404, "loss/crossentropy": 2.4906920313835146, "loss/hidden": 3.2359375, "loss/incoh": 0.0, "loss/logits": 0.3274578660726547, "loss/reg": 0.0, "step": 4210 }, { "epoch": 0.027763157894736844, "grad_norm": 2.40625, "grad_norm_var": 2.19049072265625, "learning_rate": 0.0001, "loss": 3.6801, "loss/crossentropy": 2.66324725151062, "loss/hidden": 3.5046875, "loss/incoh": 0.0, "loss/logits": 0.38620950281620026, "loss/reg": 0.0, "step": 4220 }, { "epoch": 0.027828947368421054, "grad_norm": 2.796875, "grad_norm_var": 0.1225494384765625, "learning_rate": 0.0001, "loss": 3.6293, "loss/crossentropy": 2.3201419711112976, "loss/hidden": 3.284375, "loss/incoh": 0.0, "loss/logits": 0.33118238747119905, "loss/reg": 0.0, "step": 4230 }, { "epoch": 0.027894736842105264, "grad_norm": 2.765625, "grad_norm_var": 0.062108357747395836, "learning_rate": 0.0001, "loss": 3.5523, "loss/crossentropy": 2.2000674962997437, "loss/hidden": 3.1953125, "loss/incoh": 0.0, "loss/logits": 0.33759562224149703, "loss/reg": 0.0, "step": 4240 }, { "epoch": 0.027960526315789474, "grad_norm": 2.890625, "grad_norm_var": 0.10436197916666666, "learning_rate": 0.0001, "loss": 3.6554, "loss/crossentropy": 2.4001947045326233, "loss/hidden": 3.3578125, "loss/incoh": 0.0, "loss/logits": 0.34554801881313324, "loss/reg": 0.0, "step": 4250 }, { "epoch": 0.028026315789473684, "grad_norm": 2.90625, "grad_norm_var": 0.09031473795572917, "learning_rate": 0.0001, "loss": 3.5922, "loss/crossentropy": 2.5996686697006224, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.34134136140346527, "loss/reg": 0.0, "step": 4260 }, { "epoch": 0.028092105263157894, "grad_norm": 2.59375, "grad_norm_var": 0.07454020182291667, "learning_rate": 0.0001, "loss": 3.5895, "loss/crossentropy": 2.1346129894256594, "loss/hidden": 3.1671875, "loss/incoh": 0.0, "loss/logits": 0.2764407262206078, "loss/reg": 0.0, "step": 4270 }, { "epoch": 0.028157894736842104, "grad_norm": 2.671875, "grad_norm_var": 0.07390950520833334, "learning_rate": 0.0001, "loss": 3.6094, "loss/crossentropy": 2.539172089099884, "loss/hidden": 3.3578125, "loss/incoh": 0.0, "loss/logits": 0.3945852980017662, "loss/reg": 0.0, "step": 4280 }, { "epoch": 0.028223684210526317, "grad_norm": 2.796875, "grad_norm_var": 0.14036051432291666, "learning_rate": 0.0001, "loss": 3.6505, "loss/crossentropy": 2.4181793212890623, "loss/hidden": 3.484375, "loss/incoh": 0.0, "loss/logits": 0.45307959616184235, "loss/reg": 0.0, "step": 4290 }, { "epoch": 0.028289473684210528, "grad_norm": 2.40625, "grad_norm_var": 0.14678446451822916, "learning_rate": 0.0001, "loss": 3.5539, "loss/crossentropy": 2.3630972266197205, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.28445124477148054, "loss/reg": 0.0, "step": 4300 }, { "epoch": 0.028355263157894738, "grad_norm": 2.6875, "grad_norm_var": 0.3001627604166667, "learning_rate": 0.0001, "loss": 3.5098, "loss/crossentropy": 2.4898954033851624, "loss/hidden": 3.109375, "loss/incoh": 0.0, "loss/logits": 0.320017996430397, "loss/reg": 0.0, "step": 4310 }, { "epoch": 0.028421052631578948, "grad_norm": 2.4375, "grad_norm_var": 0.13015034993489583, "learning_rate": 0.0001, "loss": 3.5755, "loss/crossentropy": 2.2955414295196532, "loss/hidden": 3.434375, "loss/incoh": 0.0, "loss/logits": 0.41727418303489683, "loss/reg": 0.0, "step": 4320 }, { "epoch": 0.028486842105263158, "grad_norm": 3.015625, "grad_norm_var": 0.29983723958333336, "learning_rate": 0.0001, "loss": 3.7538, "loss/crossentropy": 2.4246325135231017, "loss/hidden": 3.3546875, "loss/incoh": 0.0, "loss/logits": 0.34737818390131, "loss/reg": 0.0, "step": 4330 }, { "epoch": 0.028552631578947368, "grad_norm": 5.3125, "grad_norm_var": 0.5324544270833333, "learning_rate": 0.0001, "loss": 3.6927, "loss/crossentropy": 2.393894040584564, "loss/hidden": 3.321875, "loss/incoh": 0.0, "loss/logits": 0.37734392285346985, "loss/reg": 0.0, "step": 4340 }, { "epoch": 0.028618421052631578, "grad_norm": 2.78125, "grad_norm_var": 0.5618316650390625, "learning_rate": 0.0001, "loss": 3.6563, "loss/crossentropy": 2.5759302139282227, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.32841147780418395, "loss/reg": 0.0, "step": 4350 }, { "epoch": 0.028684210526315788, "grad_norm": 2.875, "grad_norm_var": 0.06923726399739584, "learning_rate": 0.0001, "loss": 3.5818, "loss/crossentropy": 2.664570915699005, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.31884663701057436, "loss/reg": 0.0, "step": 4360 }, { "epoch": 0.02875, "grad_norm": 3.203125, "grad_norm_var": 0.219873046875, "learning_rate": 0.0001, "loss": 3.6444, "loss/crossentropy": 2.3797228574752807, "loss/hidden": 3.5796875, "loss/incoh": 0.0, "loss/logits": 0.4052841871976852, "loss/reg": 0.0, "step": 4370 }, { "epoch": 0.02881578947368421, "grad_norm": 3.0, "grad_norm_var": 0.5880849202473958, "learning_rate": 0.0001, "loss": 3.5881, "loss/crossentropy": 2.5849244236946105, "loss/hidden": 3.5046875, "loss/incoh": 0.0, "loss/logits": 0.529149529337883, "loss/reg": 0.0, "step": 4380 }, { "epoch": 0.02888157894736842, "grad_norm": 2.421875, "grad_norm_var": 0.31160380045572916, "learning_rate": 0.0001, "loss": 3.4738, "loss/crossentropy": 2.5852147936820984, "loss/hidden": 3.1875, "loss/incoh": 0.0, "loss/logits": 0.3336706295609474, "loss/reg": 0.0, "step": 4390 }, { "epoch": 0.02894736842105263, "grad_norm": 2.703125, "grad_norm_var": 0.05146077473958333, "learning_rate": 0.0001, "loss": 3.6199, "loss/crossentropy": 2.412027895450592, "loss/hidden": 3.375, "loss/incoh": 0.0, "loss/logits": 0.4120332598686218, "loss/reg": 0.0, "step": 4400 }, { "epoch": 0.02901315789473684, "grad_norm": 2.59375, "grad_norm_var": 0.138232421875, "learning_rate": 0.0001, "loss": 3.4929, "loss/crossentropy": 2.270553803443909, "loss/hidden": 3.2546875, "loss/incoh": 0.0, "loss/logits": 0.34523763358592985, "loss/reg": 0.0, "step": 4410 }, { "epoch": 0.02907894736842105, "grad_norm": 2.359375, "grad_norm_var": 0.1102691650390625, "learning_rate": 0.0001, "loss": 3.5778, "loss/crossentropy": 2.361116898059845, "loss/hidden": 3.246875, "loss/incoh": 0.0, "loss/logits": 0.34047031700611113, "loss/reg": 0.0, "step": 4420 }, { "epoch": 0.02914473684210526, "grad_norm": 6.375, "grad_norm_var": 1.0507120768229166, "learning_rate": 0.0001, "loss": 3.6517, "loss/crossentropy": 2.547470712661743, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.298332154750824, "loss/reg": 0.0, "step": 4430 }, { "epoch": 0.029210526315789475, "grad_norm": 3.796875, "grad_norm_var": 1.0804972330729166, "learning_rate": 0.0001, "loss": 3.7389, "loss/crossentropy": 2.7002538442611694, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3686490625143051, "loss/reg": 0.0, "step": 4440 }, { "epoch": 0.029276315789473685, "grad_norm": 2.3125, "grad_norm_var": 0.432177734375, "learning_rate": 0.0001, "loss": 3.5541, "loss/crossentropy": 2.421727478504181, "loss/hidden": 3.18125, "loss/incoh": 0.0, "loss/logits": 0.30692713260650634, "loss/reg": 0.0, "step": 4450 }, { "epoch": 0.029342105263157895, "grad_norm": 2.59375, "grad_norm_var": 0.09047749837239584, "learning_rate": 0.0001, "loss": 3.6132, "loss/crossentropy": 2.659491038322449, "loss/hidden": 3.2140625, "loss/incoh": 0.0, "loss/logits": 0.3632184773683548, "loss/reg": 0.0, "step": 4460 }, { "epoch": 0.029407894736842105, "grad_norm": 2.453125, "grad_norm_var": 0.06815999348958333, "learning_rate": 0.0001, "loss": 3.4561, "loss/crossentropy": 2.5192266911268235, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.2842270269989967, "loss/reg": 0.0, "step": 4470 }, { "epoch": 0.029473684210526315, "grad_norm": 3.109375, "grad_norm_var": 0.1687408447265625, "learning_rate": 0.0001, "loss": 3.5582, "loss/crossentropy": 2.2021409273147583, "loss/hidden": 3.1484375, "loss/incoh": 0.0, "loss/logits": 0.31400761008262634, "loss/reg": 0.0, "step": 4480 }, { "epoch": 0.029539473684210525, "grad_norm": 2.59375, "grad_norm_var": 0.5006174723307292, "learning_rate": 0.0001, "loss": 3.5183, "loss/crossentropy": 2.4442033648490904, "loss/hidden": 3.2640625, "loss/incoh": 0.0, "loss/logits": 0.3307736128568649, "loss/reg": 0.0, "step": 4490 }, { "epoch": 0.029605263157894735, "grad_norm": 2.734375, "grad_norm_var": 0.40276692708333334, "learning_rate": 0.0001, "loss": 3.4888, "loss/crossentropy": 2.373897171020508, "loss/hidden": 3.2234375, "loss/incoh": 0.0, "loss/logits": 0.31207202970981596, "loss/reg": 0.0, "step": 4500 }, { "epoch": 0.02967105263157895, "grad_norm": 2.78125, "grad_norm_var": 0.20685933430989584, "learning_rate": 0.0001, "loss": 3.5971, "loss/crossentropy": 2.325096046924591, "loss/hidden": 3.40625, "loss/incoh": 0.0, "loss/logits": 0.4359890788793564, "loss/reg": 0.0, "step": 4510 }, { "epoch": 0.02973684210526316, "grad_norm": 3.078125, "grad_norm_var": 0.2956451416015625, "learning_rate": 0.0001, "loss": 3.5023, "loss/crossentropy": 2.1537609457969666, "loss/hidden": 3.253125, "loss/incoh": 0.0, "loss/logits": 0.3213866874575615, "loss/reg": 0.0, "step": 4520 }, { "epoch": 0.02980263157894737, "grad_norm": 2.3125, "grad_norm_var": 0.09353841145833333, "learning_rate": 0.0001, "loss": 3.4494, "loss/crossentropy": 2.4694852471351623, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.3156100481748581, "loss/reg": 0.0, "step": 4530 }, { "epoch": 0.02986842105263158, "grad_norm": 2.78125, "grad_norm_var": 0.17408447265625, "learning_rate": 0.0001, "loss": 3.6124, "loss/crossentropy": 2.438071775436401, "loss/hidden": 3.3203125, "loss/incoh": 0.0, "loss/logits": 0.40948416888713834, "loss/reg": 0.0, "step": 4540 }, { "epoch": 0.02993421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.6079010009765625, "learning_rate": 0.0001, "loss": 3.6751, "loss/crossentropy": 2.525905132293701, "loss/hidden": 3.3546875, "loss/incoh": 0.0, "loss/logits": 0.4153590425848961, "loss/reg": 0.0, "step": 4550 }, { "epoch": 0.03, "grad_norm": 3.203125, "grad_norm_var": 0.6879191080729167, "learning_rate": 0.0001, "loss": 3.5335, "loss/crossentropy": 2.421697771549225, "loss/hidden": 3.025, "loss/incoh": 0.0, "loss/logits": 0.3177122876048088, "loss/reg": 0.0, "step": 4560 }, { "epoch": 0.03006578947368421, "grad_norm": 2.328125, "grad_norm_var": 0.5993123372395833, "learning_rate": 0.0001, "loss": 3.5742, "loss/crossentropy": 2.3068729996681214, "loss/hidden": 3.05, "loss/incoh": 0.0, "loss/logits": 0.29430699050426484, "loss/reg": 0.0, "step": 4570 }, { "epoch": 0.030131578947368422, "grad_norm": 4.21875, "grad_norm_var": 0.5433339436848958, "learning_rate": 0.0001, "loss": 3.6381, "loss/crossentropy": 2.3981791496276856, "loss/hidden": 3.2359375, "loss/incoh": 0.0, "loss/logits": 0.3355312556028366, "loss/reg": 0.0, "step": 4580 }, { "epoch": 0.030197368421052632, "grad_norm": 3.25, "grad_norm_var": 0.9806630452473958, "learning_rate": 0.0001, "loss": 3.6522, "loss/crossentropy": 2.309436595439911, "loss/hidden": 3.4890625, "loss/incoh": 0.0, "loss/logits": 0.350050950050354, "loss/reg": 0.0, "step": 4590 }, { "epoch": 0.030263157894736843, "grad_norm": 2.75, "grad_norm_var": 0.9801910400390625, "learning_rate": 0.0001, "loss": 3.5648, "loss/crossentropy": 2.561086916923523, "loss/hidden": 3.1203125, "loss/incoh": 0.0, "loss/logits": 0.3277510732412338, "loss/reg": 0.0, "step": 4600 }, { "epoch": 0.030328947368421053, "grad_norm": 3.015625, "grad_norm_var": 0.20364481608072918, "learning_rate": 0.0001, "loss": 3.5518, "loss/crossentropy": 2.6774720311164857, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.31699982583522796, "loss/reg": 0.0, "step": 4610 }, { "epoch": 0.030394736842105263, "grad_norm": 2.765625, "grad_norm_var": 0.9744425455729167, "learning_rate": 0.0001, "loss": 3.5924, "loss/crossentropy": 2.623241698741913, "loss/hidden": 3.228125, "loss/incoh": 0.0, "loss/logits": 0.38138356506824495, "loss/reg": 0.0, "step": 4620 }, { "epoch": 0.030460526315789473, "grad_norm": 2.5625, "grad_norm_var": 0.9736073811848959, "learning_rate": 0.0001, "loss": 3.4682, "loss/crossentropy": 2.4855759739875793, "loss/hidden": 3.2671875, "loss/incoh": 0.0, "loss/logits": 0.3377710849046707, "loss/reg": 0.0, "step": 4630 }, { "epoch": 0.030526315789473683, "grad_norm": 2.78125, "grad_norm_var": 0.6709706624348958, "learning_rate": 0.0001, "loss": 3.4714, "loss/crossentropy": 2.476264202594757, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.3642516300082207, "loss/reg": 0.0, "step": 4640 }, { "epoch": 0.030592105263157896, "grad_norm": 3.0, "grad_norm_var": 0.6482899983723959, "learning_rate": 0.0001, "loss": 3.4865, "loss/crossentropy": 2.3190789937973024, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.32634713053703307, "loss/reg": 0.0, "step": 4650 }, { "epoch": 0.030657894736842106, "grad_norm": 2.921875, "grad_norm_var": 0.12795817057291667, "learning_rate": 0.0001, "loss": 3.5003, "loss/crossentropy": 2.4210057139396666, "loss/hidden": 3.078125, "loss/incoh": 0.0, "loss/logits": 0.2987945884466171, "loss/reg": 0.0, "step": 4660 }, { "epoch": 0.030723684210526316, "grad_norm": 3.0625, "grad_norm_var": 0.08050130208333334, "learning_rate": 0.0001, "loss": 3.4858, "loss/crossentropy": 2.041215348243713, "loss/hidden": 3.1953125, "loss/incoh": 0.0, "loss/logits": 0.2908504828810692, "loss/reg": 0.0, "step": 4670 }, { "epoch": 0.030789473684210526, "grad_norm": 3.0625, "grad_norm_var": 0.4143300374348958, "learning_rate": 0.0001, "loss": 3.6937, "loss/crossentropy": 2.4882567286491395, "loss/hidden": 3.3125, "loss/incoh": 0.0, "loss/logits": 0.3625297635793686, "loss/reg": 0.0, "step": 4680 }, { "epoch": 0.030855263157894736, "grad_norm": 3.328125, "grad_norm_var": 0.15123291015625, "learning_rate": 0.0001, "loss": 3.5943, "loss/crossentropy": 2.3254539489746096, "loss/hidden": 3.2, "loss/incoh": 0.0, "loss/logits": 0.308133128285408, "loss/reg": 0.0, "step": 4690 }, { "epoch": 0.030921052631578946, "grad_norm": 2.46875, "grad_norm_var": 0.19954325358072916, "learning_rate": 0.0001, "loss": 3.5843, "loss/crossentropy": 1.918275660276413, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.36902148872613905, "loss/reg": 0.0, "step": 4700 }, { "epoch": 0.030986842105263156, "grad_norm": 2.84375, "grad_norm_var": 0.04302978515625, "learning_rate": 0.0001, "loss": 3.5845, "loss/crossentropy": 2.573339414596558, "loss/hidden": 3.1625, "loss/incoh": 0.0, "loss/logits": 0.36042743623256684, "loss/reg": 0.0, "step": 4710 }, { "epoch": 0.03105263157894737, "grad_norm": 2.359375, "grad_norm_var": 0.0475982666015625, "learning_rate": 0.0001, "loss": 3.5453, "loss/crossentropy": 2.313191366195679, "loss/hidden": 3.2, "loss/incoh": 0.0, "loss/logits": 0.3170273721218109, "loss/reg": 0.0, "step": 4720 }, { "epoch": 0.03111842105263158, "grad_norm": 2.4375, "grad_norm_var": 0.0694244384765625, "learning_rate": 0.0001, "loss": 3.4641, "loss/crossentropy": 2.5151350021362306, "loss/hidden": 3.071875, "loss/incoh": 0.0, "loss/logits": 0.3049825429916382, "loss/reg": 0.0, "step": 4730 }, { "epoch": 0.03118421052631579, "grad_norm": 4.25, "grad_norm_var": 1.9115549723307292, "learning_rate": 0.0001, "loss": 3.6088, "loss/crossentropy": 2.278876805305481, "loss/hidden": 3.3796875, "loss/incoh": 0.0, "loss/logits": 0.38837724179029465, "loss/reg": 0.0, "step": 4740 }, { "epoch": 0.03125, "grad_norm": 2.234375, "grad_norm_var": 2.200390625, "learning_rate": 0.0001, "loss": 3.5348, "loss/crossentropy": 2.2090991735458374, "loss/hidden": 3.44375, "loss/incoh": 0.0, "loss/logits": 0.3878710061311722, "loss/reg": 0.0, "step": 4750 }, { "epoch": 0.031315789473684214, "grad_norm": 2.25, "grad_norm_var": 0.6573404947916667, "learning_rate": 0.0001, "loss": 3.5378, "loss/crossentropy": 2.2805041670799255, "loss/hidden": 3.3015625, "loss/incoh": 0.0, "loss/logits": 0.31536445766687393, "loss/reg": 0.0, "step": 4760 }, { "epoch": 0.03138157894736842, "grad_norm": 2.65625, "grad_norm_var": 0.38818257649739585, "learning_rate": 0.0001, "loss": 3.6082, "loss/crossentropy": 2.6178433656692506, "loss/hidden": 3.2828125, "loss/incoh": 0.0, "loss/logits": 0.3645938545465469, "loss/reg": 0.0, "step": 4770 }, { "epoch": 0.031447368421052634, "grad_norm": 2.671875, "grad_norm_var": 0.07810770670572917, "learning_rate": 0.0001, "loss": 3.4424, "loss/crossentropy": 2.5498223304748535, "loss/hidden": 3.2625, "loss/incoh": 0.0, "loss/logits": 0.3333883464336395, "loss/reg": 0.0, "step": 4780 }, { "epoch": 0.03151315789473684, "grad_norm": 2.328125, "grad_norm_var": 0.11669820149739583, "learning_rate": 0.0001, "loss": 3.6145, "loss/crossentropy": 2.639970850944519, "loss/hidden": 3.278125, "loss/incoh": 0.0, "loss/logits": 0.37047617733478544, "loss/reg": 0.0, "step": 4790 }, { "epoch": 0.031578947368421054, "grad_norm": 2.4375, "grad_norm_var": 0.0907867431640625, "learning_rate": 0.0001, "loss": 3.5869, "loss/crossentropy": 2.458595323562622, "loss/hidden": 3.60625, "loss/incoh": 0.0, "loss/logits": 0.4149588346481323, "loss/reg": 0.0, "step": 4800 }, { "epoch": 0.03164473684210526, "grad_norm": 2.453125, "grad_norm_var": 0.0759429931640625, "learning_rate": 0.0001, "loss": 3.4775, "loss/crossentropy": 2.01421400308609, "loss/hidden": 3.2671875, "loss/incoh": 0.0, "loss/logits": 0.29176320880651474, "loss/reg": 0.0, "step": 4810 }, { "epoch": 0.031710526315789474, "grad_norm": 2.96875, "grad_norm_var": 0.1147857666015625, "learning_rate": 0.0001, "loss": 3.5268, "loss/crossentropy": 2.5456383228302, "loss/hidden": 3.3484375, "loss/incoh": 0.0, "loss/logits": 0.3915561467409134, "loss/reg": 0.0, "step": 4820 }, { "epoch": 0.03177631578947369, "grad_norm": 2.421875, "grad_norm_var": 0.0890625, "learning_rate": 0.0001, "loss": 3.5371, "loss/crossentropy": 2.1736844003200533, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.28229199200868604, "loss/reg": 0.0, "step": 4830 }, { "epoch": 0.031842105263157894, "grad_norm": 2.21875, "grad_norm_var": 0.08162434895833333, "learning_rate": 0.0001, "loss": 3.4898, "loss/crossentropy": 2.397980511188507, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.28644354790449145, "loss/reg": 0.0, "step": 4840 }, { "epoch": 0.03190789473684211, "grad_norm": 2.578125, "grad_norm_var": 1.33443603515625, "learning_rate": 0.0001, "loss": 3.5788, "loss/crossentropy": 2.434200632572174, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.37485773116350174, "loss/reg": 0.0, "step": 4850 }, { "epoch": 0.031973684210526314, "grad_norm": 2.640625, "grad_norm_var": 1.3780181884765625, "learning_rate": 0.0001, "loss": 3.5008, "loss/crossentropy": 2.642025816440582, "loss/hidden": 3.1375, "loss/incoh": 0.0, "loss/logits": 0.34916335344314575, "loss/reg": 0.0, "step": 4860 }, { "epoch": 0.03203947368421053, "grad_norm": 2.578125, "grad_norm_var": 0.04523111979166667, "learning_rate": 0.0001, "loss": 3.4784, "loss/crossentropy": 2.2365106463432314, "loss/hidden": 3.4328125, "loss/incoh": 0.0, "loss/logits": 0.45036998838186265, "loss/reg": 0.0, "step": 4870 }, { "epoch": 0.032105263157894734, "grad_norm": 2.25, "grad_norm_var": 0.04436442057291667, "learning_rate": 0.0001, "loss": 3.5047, "loss/crossentropy": 2.250430929660797, "loss/hidden": 3.2859375, "loss/incoh": 0.0, "loss/logits": 0.3373000741004944, "loss/reg": 0.0, "step": 4880 }, { "epoch": 0.03217105263157895, "grad_norm": 2.71875, "grad_norm_var": 0.07979227701822916, "learning_rate": 0.0001, "loss": 3.4476, "loss/crossentropy": 2.688676381111145, "loss/hidden": 3.184375, "loss/incoh": 0.0, "loss/logits": 0.32961316406726837, "loss/reg": 0.0, "step": 4890 }, { "epoch": 0.03223684210526316, "grad_norm": 2.625, "grad_norm_var": 0.0539215087890625, "learning_rate": 0.0001, "loss": 3.5264, "loss/crossentropy": 2.5281107783317567, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.36207843720912936, "loss/reg": 0.0, "step": 4900 }, { "epoch": 0.03230263157894737, "grad_norm": 2.515625, "grad_norm_var": 0.11297200520833334, "learning_rate": 0.0001, "loss": 3.4276, "loss/crossentropy": 2.3438509345054626, "loss/hidden": 3.2671875, "loss/incoh": 0.0, "loss/logits": 0.3646134212613106, "loss/reg": 0.0, "step": 4910 }, { "epoch": 0.03236842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.06402587890625, "learning_rate": 0.0001, "loss": 3.479, "loss/crossentropy": 2.331527066230774, "loss/hidden": 3.48125, "loss/incoh": 0.0, "loss/logits": 0.407479552924633, "loss/reg": 0.0, "step": 4920 }, { "epoch": 0.03243421052631579, "grad_norm": 2.375, "grad_norm_var": 0.21721598307291667, "learning_rate": 0.0001, "loss": 3.6251, "loss/crossentropy": 2.3732258677482605, "loss/hidden": 3.1875, "loss/incoh": 0.0, "loss/logits": 0.32182002663612364, "loss/reg": 0.0, "step": 4930 }, { "epoch": 0.0325, "grad_norm": 4.59375, "grad_norm_var": 1.3630167643229167, "learning_rate": 0.0001, "loss": 3.6469, "loss/crossentropy": 1.9915230482816697, "loss/hidden": 3.3953125, "loss/incoh": 0.0, "loss/logits": 0.35692891776561736, "loss/reg": 0.0, "step": 4940 }, { "epoch": 0.03256578947368421, "grad_norm": 2.40625, "grad_norm_var": 0.2885894775390625, "learning_rate": 0.0001, "loss": 3.5077, "loss/crossentropy": 1.9868581891059875, "loss/hidden": 3.215625, "loss/incoh": 0.0, "loss/logits": 0.30731415897607806, "loss/reg": 0.0, "step": 4950 }, { "epoch": 0.03263157894736842, "grad_norm": 2.765625, "grad_norm_var": 0.21422119140625, "learning_rate": 0.0001, "loss": 3.5426, "loss/crossentropy": 2.3579143285751343, "loss/hidden": 3.3421875, "loss/incoh": 0.0, "loss/logits": 0.3354805111885071, "loss/reg": 0.0, "step": 4960 }, { "epoch": 0.032697368421052635, "grad_norm": 2.625, "grad_norm_var": 4.145735677083334, "learning_rate": 0.0001, "loss": 3.5276, "loss/crossentropy": 2.2787723779678344, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.2516452088952065, "loss/reg": 0.0, "step": 4970 }, { "epoch": 0.03276315789473684, "grad_norm": 2.375, "grad_norm_var": 0.09233296712239583, "learning_rate": 0.0001, "loss": 3.4912, "loss/crossentropy": 2.2899803042411806, "loss/hidden": 3.175, "loss/incoh": 0.0, "loss/logits": 0.3140288829803467, "loss/reg": 0.0, "step": 4980 }, { "epoch": 0.032828947368421055, "grad_norm": 2.53125, "grad_norm_var": 0.07078450520833333, "learning_rate": 0.0001, "loss": 3.4308, "loss/crossentropy": 2.4203084468841554, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.2926447048783302, "loss/reg": 0.0, "step": 4990 }, { "epoch": 0.03289473684210526, "grad_norm": 2.375, "grad_norm_var": 0.1664215087890625, "learning_rate": 0.0001, "loss": 3.5233, "loss/crossentropy": 2.4435134291648866, "loss/hidden": 3.3640625, "loss/incoh": 0.0, "loss/logits": 0.35478622317314146, "loss/reg": 0.0, "step": 5000 }, { "epoch": 0.032960526315789475, "grad_norm": 3.296875, "grad_norm_var": 0.17375386555989583, "learning_rate": 0.0001, "loss": 3.529, "loss/crossentropy": 2.3886643409729005, "loss/hidden": 3.196875, "loss/incoh": 0.0, "loss/logits": 0.36350963413715365, "loss/reg": 0.0, "step": 5010 }, { "epoch": 0.03302631578947368, "grad_norm": 2.390625, "grad_norm_var": 0.13043619791666666, "learning_rate": 0.0001, "loss": 3.5608, "loss/crossentropy": 2.5570758461952208, "loss/hidden": 3.4828125, "loss/incoh": 0.0, "loss/logits": 0.343785697221756, "loss/reg": 0.0, "step": 5020 }, { "epoch": 0.033092105263157895, "grad_norm": 2.921875, "grad_norm_var": 0.19719645182291667, "learning_rate": 0.0001, "loss": 3.5903, "loss/crossentropy": 2.3763694763183594, "loss/hidden": 3.25625, "loss/incoh": 0.0, "loss/logits": 0.32882467210292815, "loss/reg": 0.0, "step": 5030 }, { "epoch": 0.03315789473684211, "grad_norm": 2.484375, "grad_norm_var": 0.21155497233072917, "learning_rate": 0.0001, "loss": 3.5454, "loss/crossentropy": 2.5775513648986816, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.3086023017764091, "loss/reg": 0.0, "step": 5040 }, { "epoch": 0.033223684210526315, "grad_norm": 2.703125, "grad_norm_var": 0.08268229166666667, "learning_rate": 0.0001, "loss": 3.5005, "loss/crossentropy": 2.257374918460846, "loss/hidden": 3.1609375, "loss/incoh": 0.0, "loss/logits": 0.3117083102464676, "loss/reg": 0.0, "step": 5050 }, { "epoch": 0.03328947368421053, "grad_norm": 2.46875, "grad_norm_var": 0.21896158854166667, "learning_rate": 0.0001, "loss": 3.4097, "loss/crossentropy": 2.437604343891144, "loss/hidden": 3.1953125, "loss/incoh": 0.0, "loss/logits": 0.33744728565216064, "loss/reg": 0.0, "step": 5060 }, { "epoch": 0.033355263157894735, "grad_norm": 2.421875, "grad_norm_var": 0.19402669270833334, "learning_rate": 0.0001, "loss": 3.5413, "loss/crossentropy": 2.1056251645088198, "loss/hidden": 3.3140625, "loss/incoh": 0.0, "loss/logits": 0.35452440977096555, "loss/reg": 0.0, "step": 5070 }, { "epoch": 0.03342105263157895, "grad_norm": 2.984375, "grad_norm_var": 0.041829427083333336, "learning_rate": 0.0001, "loss": 3.5305, "loss/crossentropy": 2.5163299083709716, "loss/hidden": 3.296875, "loss/incoh": 0.0, "loss/logits": 0.35606471002101897, "loss/reg": 0.0, "step": 5080 }, { "epoch": 0.033486842105263155, "grad_norm": 2.328125, "grad_norm_var": 0.08046773274739584, "learning_rate": 0.0001, "loss": 3.4405, "loss/crossentropy": 2.37408185005188, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.3234561800956726, "loss/reg": 0.0, "step": 5090 }, { "epoch": 0.03355263157894737, "grad_norm": 2.921875, "grad_norm_var": 0.17653706868489583, "learning_rate": 0.0001, "loss": 3.4303, "loss/crossentropy": 2.3730108022689818, "loss/hidden": 3.1359375, "loss/incoh": 0.0, "loss/logits": 0.2919617787003517, "loss/reg": 0.0, "step": 5100 }, { "epoch": 0.03361842105263158, "grad_norm": 2.5625, "grad_norm_var": 0.47526753743489586, "learning_rate": 0.0001, "loss": 3.5312, "loss/crossentropy": 2.4720141887664795, "loss/hidden": 3.246875, "loss/incoh": 0.0, "loss/logits": 0.36764703392982484, "loss/reg": 0.0, "step": 5110 }, { "epoch": 0.03368421052631579, "grad_norm": 2.6875, "grad_norm_var": 0.4923787434895833, "learning_rate": 0.0001, "loss": 3.5491, "loss/crossentropy": 2.449038088321686, "loss/hidden": 3.3171875, "loss/incoh": 0.0, "loss/logits": 0.4042062431573868, "loss/reg": 0.0, "step": 5120 }, { "epoch": 0.03375, "grad_norm": 2.703125, "grad_norm_var": 0.1626861572265625, "learning_rate": 0.0001, "loss": 3.5298, "loss/crossentropy": 2.5271955728530884, "loss/hidden": 3.2515625, "loss/incoh": 0.0, "loss/logits": 0.3406914800405502, "loss/reg": 0.0, "step": 5130 }, { "epoch": 0.03381578947368421, "grad_norm": 2.65625, "grad_norm_var": 0.037018839518229166, "learning_rate": 0.0001, "loss": 3.549, "loss/crossentropy": 2.6082807898521425, "loss/hidden": 3.321875, "loss/incoh": 0.0, "loss/logits": 0.33228414356708524, "loss/reg": 0.0, "step": 5140 }, { "epoch": 0.03388157894736842, "grad_norm": 2.375, "grad_norm_var": 0.08620503743489584, "learning_rate": 0.0001, "loss": 3.505, "loss/crossentropy": 2.0589061468839644, "loss/hidden": 3.4890625, "loss/incoh": 0.0, "loss/logits": 0.30756633579730985, "loss/reg": 0.0, "step": 5150 }, { "epoch": 0.03394736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.3478342692057292, "learning_rate": 0.0001, "loss": 3.5598, "loss/crossentropy": 2.0876080930233, "loss/hidden": 3.39375, "loss/incoh": 0.0, "loss/logits": 0.3331515982747078, "loss/reg": 0.0, "step": 5160 }, { "epoch": 0.03401315789473684, "grad_norm": 2.265625, "grad_norm_var": 0.3102773030598958, "learning_rate": 0.0001, "loss": 3.5298, "loss/crossentropy": 2.123845911026001, "loss/hidden": 3.2953125, "loss/incoh": 0.0, "loss/logits": 0.27716329991817473, "loss/reg": 0.0, "step": 5170 }, { "epoch": 0.034078947368421056, "grad_norm": 3.484375, "grad_norm_var": 0.3398590087890625, "learning_rate": 0.0001, "loss": 3.546, "loss/crossentropy": 2.5750380873680117, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.3079290196299553, "loss/reg": 0.0, "step": 5180 }, { "epoch": 0.03414473684210526, "grad_norm": 3.015625, "grad_norm_var": 0.3519195556640625, "learning_rate": 0.0001, "loss": 3.4686, "loss/crossentropy": 2.1761133074760437, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.28422126173973083, "loss/reg": 0.0, "step": 5190 }, { "epoch": 0.034210526315789476, "grad_norm": 3.21875, "grad_norm_var": 0.09970296223958333, "learning_rate": 0.0001, "loss": 3.5823, "loss/crossentropy": 2.224585694074631, "loss/hidden": 3.2109375, "loss/incoh": 0.0, "loss/logits": 0.32807315289974215, "loss/reg": 0.0, "step": 5200 }, { "epoch": 0.03427631578947368, "grad_norm": 2.96875, "grad_norm_var": 0.34479878743489584, "learning_rate": 0.0001, "loss": 3.5417, "loss/crossentropy": 2.217251694202423, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.2984598934650421, "loss/reg": 0.0, "step": 5210 }, { "epoch": 0.034342105263157896, "grad_norm": 2.6875, "grad_norm_var": 0.28804931640625, "learning_rate": 0.0001, "loss": 3.4788, "loss/crossentropy": 2.344852977991104, "loss/hidden": 3.1265625, "loss/incoh": 0.0, "loss/logits": 0.2878506749868393, "loss/reg": 0.0, "step": 5220 }, { "epoch": 0.0344078947368421, "grad_norm": 2.46875, "grad_norm_var": 0.17006734212239583, "learning_rate": 0.0001, "loss": 3.4649, "loss/crossentropy": 2.5100401520729063, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.3491516515612602, "loss/reg": 0.0, "step": 5230 }, { "epoch": 0.034473684210526316, "grad_norm": 2.5625, "grad_norm_var": 1.1076649983723958, "learning_rate": 0.0001, "loss": 3.503, "loss/crossentropy": 2.565778684616089, "loss/hidden": 3.146875, "loss/incoh": 0.0, "loss/logits": 0.35944747030735014, "loss/reg": 0.0, "step": 5240 }, { "epoch": 0.03453947368421053, "grad_norm": 2.3125, "grad_norm_var": 1.1434315999348958, "learning_rate": 0.0001, "loss": 3.4668, "loss/crossentropy": 2.5031490683555604, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.2864773109555244, "loss/reg": 0.0, "step": 5250 }, { "epoch": 0.034605263157894736, "grad_norm": 2.90625, "grad_norm_var": 0.5837849934895833, "learning_rate": 0.0001, "loss": 3.4822, "loss/crossentropy": 2.3741963386535643, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.3231631726026535, "loss/reg": 0.0, "step": 5260 }, { "epoch": 0.03467105263157895, "grad_norm": 2.546875, "grad_norm_var": 0.62431640625, "learning_rate": 0.0001, "loss": 3.4414, "loss/crossentropy": 2.3789267897605897, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3201348423957825, "loss/reg": 0.0, "step": 5270 }, { "epoch": 0.034736842105263156, "grad_norm": 5.25, "grad_norm_var": 0.5048573811848959, "learning_rate": 0.0001, "loss": 3.412, "loss/crossentropy": 2.47695529460907, "loss/hidden": 3.171875, "loss/incoh": 0.0, "loss/logits": 0.31060084253549575, "loss/reg": 0.0, "step": 5280 }, { "epoch": 0.03480263157894737, "grad_norm": 2.625, "grad_norm_var": 0.54195556640625, "learning_rate": 0.0001, "loss": 3.5371, "loss/crossentropy": 2.4316977143287657, "loss/hidden": 3.0703125, "loss/incoh": 0.0, "loss/logits": 0.30330550074577334, "loss/reg": 0.0, "step": 5290 }, { "epoch": 0.034868421052631576, "grad_norm": 2.625, "grad_norm_var": 0.1771484375, "learning_rate": 0.0001, "loss": 3.534, "loss/crossentropy": 2.058604693412781, "loss/hidden": 3.16875, "loss/incoh": 0.0, "loss/logits": 0.2998970851302147, "loss/reg": 0.0, "step": 5300 }, { "epoch": 0.03493421052631579, "grad_norm": 2.375, "grad_norm_var": 0.21590169270833334, "learning_rate": 0.0001, "loss": 3.5029, "loss/crossentropy": 2.1623964309692383, "loss/hidden": 3.1453125, "loss/incoh": 0.0, "loss/logits": 0.2633717767894268, "loss/reg": 0.0, "step": 5310 }, { "epoch": 0.035, "grad_norm": 2.90625, "grad_norm_var": 0.12704671223958333, "learning_rate": 0.0001, "loss": 3.4862, "loss/crossentropy": 2.5717769265174866, "loss/hidden": 3.1671875, "loss/incoh": 0.0, "loss/logits": 0.30162925869226453, "loss/reg": 0.0, "step": 5320 }, { "epoch": 0.03506578947368421, "grad_norm": 2.734375, "grad_norm_var": 1.4618398030598958, "learning_rate": 0.0001, "loss": 3.5429, "loss/crossentropy": 2.462851893901825, "loss/hidden": 3.328125, "loss/incoh": 0.0, "loss/logits": 0.3766929477453232, "loss/reg": 0.0, "step": 5330 }, { "epoch": 0.03513157894736842, "grad_norm": 2.390625, "grad_norm_var": 1.7596181233723958, "learning_rate": 0.0001, "loss": 3.5089, "loss/crossentropy": 2.44319885969162, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.30458342134952543, "loss/reg": 0.0, "step": 5340 }, { "epoch": 0.03519736842105263, "grad_norm": 2.875, "grad_norm_var": 0.4787668863932292, "learning_rate": 0.0001, "loss": 3.6272, "loss/crossentropy": 2.594151020050049, "loss/hidden": 3.2296875, "loss/incoh": 0.0, "loss/logits": 0.3736713409423828, "loss/reg": 0.0, "step": 5350 }, { "epoch": 0.035263157894736843, "grad_norm": 2.484375, "grad_norm_var": 0.4522939046223958, "learning_rate": 0.0001, "loss": 3.5325, "loss/crossentropy": 2.0771877110004424, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.3338633939623833, "loss/reg": 0.0, "step": 5360 }, { "epoch": 0.03532894736842105, "grad_norm": 2.390625, "grad_norm_var": 0.0387847900390625, "learning_rate": 0.0001, "loss": 3.4757, "loss/crossentropy": 2.5547770977020265, "loss/hidden": 3.25, "loss/incoh": 0.0, "loss/logits": 0.36584808975458144, "loss/reg": 0.0, "step": 5370 }, { "epoch": 0.035394736842105264, "grad_norm": 2.34375, "grad_norm_var": 11.889094034830729, "learning_rate": 0.0001, "loss": 3.6332, "loss/crossentropy": 2.2124004304409026, "loss/hidden": 3.1265625, "loss/incoh": 0.0, "loss/logits": 0.29226877391338346, "loss/reg": 0.0, "step": 5380 }, { "epoch": 0.03546052631578948, "grad_norm": 3.15625, "grad_norm_var": 6.872362263997396, "learning_rate": 0.0001, "loss": 3.5496, "loss/crossentropy": 2.2622018218040467, "loss/hidden": 3.259375, "loss/incoh": 0.0, "loss/logits": 0.40298803299665453, "loss/reg": 0.0, "step": 5390 }, { "epoch": 0.035526315789473684, "grad_norm": 2.25, "grad_norm_var": 0.0641998291015625, "learning_rate": 0.0001, "loss": 3.4621, "loss/crossentropy": 2.374979627132416, "loss/hidden": 3.2109375, "loss/incoh": 0.0, "loss/logits": 0.37631402611732484, "loss/reg": 0.0, "step": 5400 }, { "epoch": 0.0355921052631579, "grad_norm": 2.625, "grad_norm_var": 0.028059895833333334, "learning_rate": 0.0001, "loss": 3.4667, "loss/crossentropy": 2.4809056520462036, "loss/hidden": 3.1078125, "loss/incoh": 0.0, "loss/logits": 0.32154888212680816, "loss/reg": 0.0, "step": 5410 }, { "epoch": 0.035657894736842104, "grad_norm": 2.5625, "grad_norm_var": 0.19182840983072916, "learning_rate": 0.0001, "loss": 3.528, "loss/crossentropy": 2.3937729835510253, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.29925636053085325, "loss/reg": 0.0, "step": 5420 }, { "epoch": 0.03572368421052632, "grad_norm": 2.734375, "grad_norm_var": 0.04440104166666667, "learning_rate": 0.0001, "loss": 3.4433, "loss/crossentropy": 2.604015350341797, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.28630980402231215, "loss/reg": 0.0, "step": 5430 }, { "epoch": 0.035789473684210524, "grad_norm": 2.34375, "grad_norm_var": 0.057454427083333336, "learning_rate": 0.0001, "loss": 3.3938, "loss/crossentropy": 2.3647801518440246, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.2772139713168144, "loss/reg": 0.0, "step": 5440 }, { "epoch": 0.03585526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.06382548014322917, "learning_rate": 0.0001, "loss": 3.4874, "loss/crossentropy": 2.4227387428283693, "loss/hidden": 3.3890625, "loss/incoh": 0.0, "loss/logits": 0.3498344630002975, "loss/reg": 0.0, "step": 5450 }, { "epoch": 0.03592105263157895, "grad_norm": 2.5625, "grad_norm_var": 0.5879221598307292, "learning_rate": 0.0001, "loss": 3.5541, "loss/crossentropy": 2.024171155691147, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.25232082083821294, "loss/reg": 0.0, "step": 5460 }, { "epoch": 0.03598684210526316, "grad_norm": 2.78125, "grad_norm_var": 1.2988596598307292, "learning_rate": 0.0001, "loss": 3.4549, "loss/crossentropy": 2.5548394203186033, "loss/hidden": 3.0234375, "loss/incoh": 0.0, "loss/logits": 0.29853117763996123, "loss/reg": 0.0, "step": 5470 }, { "epoch": 0.03605263157894737, "grad_norm": 3.234375, "grad_norm_var": 1.458153279622396, "learning_rate": 0.0001, "loss": 3.5047, "loss/crossentropy": 2.520615005493164, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.31394066512584684, "loss/reg": 0.0, "step": 5480 }, { "epoch": 0.03611842105263158, "grad_norm": 2.96875, "grad_norm_var": 0.6611328125, "learning_rate": 0.0001, "loss": 3.4932, "loss/crossentropy": 2.447161090373993, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.3091880366206169, "loss/reg": 0.0, "step": 5490 }, { "epoch": 0.03618421052631579, "grad_norm": 2.671875, "grad_norm_var": 0.05611572265625, "learning_rate": 0.0001, "loss": 3.4718, "loss/crossentropy": 2.354436981678009, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.30545540153980255, "loss/reg": 0.0, "step": 5500 }, { "epoch": 0.03625, "grad_norm": 2.828125, "grad_norm_var": 0.24016011555989583, "learning_rate": 0.0001, "loss": 3.5554, "loss/crossentropy": 2.350696861743927, "loss/hidden": 3.1578125, "loss/incoh": 0.0, "loss/logits": 0.27360412031412124, "loss/reg": 0.0, "step": 5510 }, { "epoch": 0.03631578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.6210245768229167, "learning_rate": 0.0001, "loss": 3.513, "loss/crossentropy": 2.368817460536957, "loss/hidden": 3.1796875, "loss/incoh": 0.0, "loss/logits": 0.31491883993148806, "loss/reg": 0.0, "step": 5520 }, { "epoch": 0.036381578947368425, "grad_norm": 2.640625, "grad_norm_var": 0.16715087890625, "learning_rate": 0.0001, "loss": 3.4444, "loss/crossentropy": 2.3894132494926454, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.2710603341460228, "loss/reg": 0.0, "step": 5530 }, { "epoch": 0.03644736842105263, "grad_norm": 2.484375, "grad_norm_var": 2.993724568684896, "learning_rate": 0.0001, "loss": 3.5105, "loss/crossentropy": 2.4798691868782043, "loss/hidden": 3.25, "loss/incoh": 0.0, "loss/logits": 0.422188438475132, "loss/reg": 0.0, "step": 5540 }, { "epoch": 0.036513157894736845, "grad_norm": 3.6875, "grad_norm_var": 2.864090983072917, "learning_rate": 0.0001, "loss": 3.6266, "loss/crossentropy": 2.499036192893982, "loss/hidden": 3.21875, "loss/incoh": 0.0, "loss/logits": 0.3576551049947739, "loss/reg": 0.0, "step": 5550 }, { "epoch": 0.03657894736842105, "grad_norm": 2.59375, "grad_norm_var": 0.2598052978515625, "learning_rate": 0.0001, "loss": 3.5659, "loss/crossentropy": 2.4270546317100523, "loss/hidden": 3.23125, "loss/incoh": 0.0, "loss/logits": 0.4616221562027931, "loss/reg": 0.0, "step": 5560 }, { "epoch": 0.036644736842105265, "grad_norm": 3.03125, "grad_norm_var": 0.39485270182291665, "learning_rate": 0.0001, "loss": 3.5978, "loss/crossentropy": 2.427480709552765, "loss/hidden": 3.7875, "loss/incoh": 0.0, "loss/logits": 0.38075721710920335, "loss/reg": 0.0, "step": 5570 }, { "epoch": 0.03671052631578947, "grad_norm": 2.390625, "grad_norm_var": 1.5193318684895833, "learning_rate": 0.0001, "loss": 3.5541, "loss/crossentropy": 2.2717662811279298, "loss/hidden": 3.2359375, "loss/incoh": 0.0, "loss/logits": 0.2794697627425194, "loss/reg": 0.0, "step": 5580 }, { "epoch": 0.036776315789473685, "grad_norm": 2.59375, "grad_norm_var": 1.5052734375, "learning_rate": 0.0001, "loss": 3.5513, "loss/crossentropy": 2.549258255958557, "loss/hidden": 3.1265625, "loss/incoh": 0.0, "loss/logits": 0.3551526039838791, "loss/reg": 0.0, "step": 5590 }, { "epoch": 0.03684210526315789, "grad_norm": 2.78125, "grad_norm_var": 0.12431538899739583, "learning_rate": 0.0001, "loss": 3.5166, "loss/crossentropy": 2.42179411649704, "loss/hidden": 3.265625, "loss/incoh": 0.0, "loss/logits": 0.349351167678833, "loss/reg": 0.0, "step": 5600 }, { "epoch": 0.036907894736842105, "grad_norm": 2.40625, "grad_norm_var": 0.09789937337239583, "learning_rate": 0.0001, "loss": 3.4519, "loss/crossentropy": 2.275598430633545, "loss/hidden": 3.2015625, "loss/incoh": 0.0, "loss/logits": 0.32426146864891053, "loss/reg": 0.0, "step": 5610 }, { "epoch": 0.03697368421052632, "grad_norm": 2.84375, "grad_norm_var": 0.10132548014322916, "learning_rate": 0.0001, "loss": 3.4632, "loss/crossentropy": 2.4317312955856325, "loss/hidden": 3.3171875, "loss/incoh": 0.0, "loss/logits": 0.3550658613443375, "loss/reg": 0.0, "step": 5620 }, { "epoch": 0.037039473684210525, "grad_norm": 25.0, "grad_norm_var": 167.39566650390626, "learning_rate": 0.0001, "loss": 3.6136, "loss/crossentropy": 2.5963298320770263, "loss/hidden": 3.1546875, "loss/incoh": 0.0, "loss/logits": 0.34572866559028625, "loss/reg": 0.0, "step": 5630 }, { "epoch": 0.03710526315789474, "grad_norm": 3.0, "grad_norm_var": 167.50836486816405, "learning_rate": 0.0001, "loss": 3.5122, "loss/crossentropy": 2.370168614387512, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.316168874502182, "loss/reg": 0.0, "step": 5640 }, { "epoch": 0.037171052631578945, "grad_norm": 2.921875, "grad_norm_var": 0.05074462890625, "learning_rate": 0.0001, "loss": 3.4512, "loss/crossentropy": 2.1566815614700316, "loss/hidden": 3.109375, "loss/incoh": 0.0, "loss/logits": 0.30936725735664367, "loss/reg": 0.0, "step": 5650 }, { "epoch": 0.03723684210526316, "grad_norm": 3.546875, "grad_norm_var": 0.1086822509765625, "learning_rate": 0.0001, "loss": 3.4308, "loss/crossentropy": 2.315059244632721, "loss/hidden": 3.1609375, "loss/incoh": 0.0, "loss/logits": 0.3344813346862793, "loss/reg": 0.0, "step": 5660 }, { "epoch": 0.037302631578947365, "grad_norm": 2.984375, "grad_norm_var": 0.22506510416666667, "learning_rate": 0.0001, "loss": 3.5705, "loss/crossentropy": 2.063437449932098, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.2783856257796288, "loss/reg": 0.0, "step": 5670 }, { "epoch": 0.03736842105263158, "grad_norm": 2.75, "grad_norm_var": 0.48176981608072916, "learning_rate": 0.0001, "loss": 3.5285, "loss/crossentropy": 2.564136099815369, "loss/hidden": 3.2453125, "loss/incoh": 0.0, "loss/logits": 0.3260859474539757, "loss/reg": 0.0, "step": 5680 }, { "epoch": 0.03743421052631579, "grad_norm": 2.78125, "grad_norm_var": 1.2526519775390625, "learning_rate": 0.0001, "loss": 3.581, "loss/crossentropy": 2.4384737968444825, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.2700443536043167, "loss/reg": 0.0, "step": 5690 }, { "epoch": 0.0375, "grad_norm": 2.53125, "grad_norm_var": 1.029296875, "learning_rate": 0.0001, "loss": 3.5309, "loss/crossentropy": 2.656829285621643, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.34218672215938567, "loss/reg": 0.0, "step": 5700 }, { "epoch": 0.03756578947368421, "grad_norm": 2.375, "grad_norm_var": 5.287238566080729, "learning_rate": 0.0001, "loss": 3.6012, "loss/crossentropy": 2.3537548005580904, "loss/hidden": 3.1875, "loss/incoh": 0.0, "loss/logits": 0.33707170784473417, "loss/reg": 0.0, "step": 5710 }, { "epoch": 0.03763157894736842, "grad_norm": 2.359375, "grad_norm_var": 0.6506795247395833, "learning_rate": 0.0001, "loss": 3.4987, "loss/crossentropy": 2.217307722568512, "loss/hidden": 3.31875, "loss/incoh": 0.0, "loss/logits": 0.36700052917003634, "loss/reg": 0.0, "step": 5720 }, { "epoch": 0.03769736842105263, "grad_norm": 2.796875, "grad_norm_var": 235.47136942545572, "learning_rate": 0.0001, "loss": 3.4822, "loss/crossentropy": 2.39666086435318, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.322082930803299, "loss/reg": 0.0, "step": 5730 }, { "epoch": 0.03776315789473684, "grad_norm": 2.65625, "grad_norm_var": 48.15474853515625, "learning_rate": 0.0001, "loss": 3.5338, "loss/crossentropy": 2.5715784192085267, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.31636003255844114, "loss/reg": 0.0, "step": 5740 }, { "epoch": 0.03782894736842105, "grad_norm": 2.375, "grad_norm_var": 48.25117899576823, "learning_rate": 0.0001, "loss": 3.345, "loss/crossentropy": 2.1092816948890687, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.27086046040058137, "loss/reg": 0.0, "step": 5750 }, { "epoch": 0.037894736842105266, "grad_norm": 3.34375, "grad_norm_var": 0.1339508056640625, "learning_rate": 0.0001, "loss": 3.4478, "loss/crossentropy": 2.20155810713768, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.3250092178583145, "loss/reg": 0.0, "step": 5760 }, { "epoch": 0.03796052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.5598592122395833, "learning_rate": 0.0001, "loss": 3.4268, "loss/crossentropy": 2.2270141005516053, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.2702811732888222, "loss/reg": 0.0, "step": 5770 }, { "epoch": 0.038026315789473686, "grad_norm": 2.53125, "grad_norm_var": 0.1083892822265625, "learning_rate": 0.0001, "loss": 3.4578, "loss/crossentropy": 2.4509124517440797, "loss/hidden": 3.1109375, "loss/incoh": 0.0, "loss/logits": 0.3049029678106308, "loss/reg": 0.0, "step": 5780 }, { "epoch": 0.03809210526315789, "grad_norm": 3.21875, "grad_norm_var": 0.20041402180989584, "learning_rate": 0.0001, "loss": 3.4516, "loss/crossentropy": 2.1973756074905397, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.2764800027012825, "loss/reg": 0.0, "step": 5790 }, { "epoch": 0.038157894736842106, "grad_norm": 2.3125, "grad_norm_var": 0.17437235514322916, "learning_rate": 0.0001, "loss": 3.3789, "loss/crossentropy": 2.4859437584877013, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.28125611394643785, "loss/reg": 0.0, "step": 5800 }, { "epoch": 0.03822368421052631, "grad_norm": 2.484375, "grad_norm_var": 1.9865234375, "learning_rate": 0.0001, "loss": 3.4479, "loss/crossentropy": 2.6306315779685976, "loss/hidden": 3.440625, "loss/incoh": 0.0, "loss/logits": 0.2930992156267166, "loss/reg": 0.0, "step": 5810 }, { "epoch": 0.038289473684210526, "grad_norm": 2.40625, "grad_norm_var": 1.902880859375, "learning_rate": 0.0001, "loss": 3.482, "loss/crossentropy": 2.4275772333145142, "loss/hidden": 3.0953125, "loss/incoh": 0.0, "loss/logits": 0.32571674734354017, "loss/reg": 0.0, "step": 5820 }, { "epoch": 0.03835526315789474, "grad_norm": 2.625, "grad_norm_var": 0.040087890625, "learning_rate": 0.0001, "loss": 3.3832, "loss/crossentropy": 2.348308402299881, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.325964193046093, "loss/reg": 0.0, "step": 5830 }, { "epoch": 0.038421052631578946, "grad_norm": 3.796875, "grad_norm_var": 0.8002115885416666, "learning_rate": 0.0001, "loss": 3.5519, "loss/crossentropy": 2.1252057909965516, "loss/hidden": 3.2984375, "loss/incoh": 0.0, "loss/logits": 0.3046886622905731, "loss/reg": 0.0, "step": 5840 }, { "epoch": 0.03848684210526316, "grad_norm": 2.625, "grad_norm_var": 0.2637685139973958, "learning_rate": 0.0001, "loss": 3.4841, "loss/crossentropy": 2.5330613613128663, "loss/hidden": 3.025, "loss/incoh": 0.0, "loss/logits": 0.31516623198986055, "loss/reg": 0.0, "step": 5850 }, { "epoch": 0.038552631578947366, "grad_norm": 2.3125, "grad_norm_var": 0.24798075358072916, "learning_rate": 0.0001, "loss": 3.4625, "loss/crossentropy": 2.324964237213135, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.29473926275968554, "loss/reg": 0.0, "step": 5860 }, { "epoch": 0.03861842105263158, "grad_norm": 2.609375, "grad_norm_var": 0.12890218098958334, "learning_rate": 0.0001, "loss": 3.3932, "loss/crossentropy": 2.436046540737152, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.28259717375040055, "loss/reg": 0.0, "step": 5870 }, { "epoch": 0.038684210526315786, "grad_norm": 2.59375, "grad_norm_var": 0.09658101399739584, "learning_rate": 0.0001, "loss": 3.4335, "loss/crossentropy": 2.3942569494247437, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.29623564779758454, "loss/reg": 0.0, "step": 5880 }, { "epoch": 0.03875, "grad_norm": 2.421875, "grad_norm_var": 0.0283355712890625, "learning_rate": 0.0001, "loss": 3.4179, "loss/crossentropy": 2.675841474533081, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2852372720837593, "loss/reg": 0.0, "step": 5890 }, { "epoch": 0.03881578947368421, "grad_norm": 2.609375, "grad_norm_var": 0.25607096354166664, "learning_rate": 0.0001, "loss": 3.4508, "loss/crossentropy": 2.516406524181366, "loss/hidden": 3.35, "loss/incoh": 0.0, "loss/logits": 0.35599096268415453, "loss/reg": 0.0, "step": 5900 }, { "epoch": 0.03888157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.19685872395833334, "learning_rate": 0.0001, "loss": 3.3701, "loss/crossentropy": 2.118768775463104, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.3433256149291992, "loss/reg": 0.0, "step": 5910 }, { "epoch": 0.03894736842105263, "grad_norm": 2.75, "grad_norm_var": 0.10144856770833334, "learning_rate": 0.0001, "loss": 3.4698, "loss/crossentropy": 2.4102617263793946, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.2903832048177719, "loss/reg": 0.0, "step": 5920 }, { "epoch": 0.03901315789473684, "grad_norm": 3.25, "grad_norm_var": 0.05016276041666667, "learning_rate": 0.0001, "loss": 3.3435, "loss/crossentropy": 2.4927979469299317, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.2977334216237068, "loss/reg": 0.0, "step": 5930 }, { "epoch": 0.03907894736842105, "grad_norm": 2.25, "grad_norm_var": 0.2647043863932292, "learning_rate": 0.0001, "loss": 3.4123, "loss/crossentropy": 2.163569325208664, "loss/hidden": 3.1515625, "loss/incoh": 0.0, "loss/logits": 0.27049526423215864, "loss/reg": 0.0, "step": 5940 }, { "epoch": 0.03914473684210526, "grad_norm": 2.765625, "grad_norm_var": 0.1790679931640625, "learning_rate": 0.0001, "loss": 3.4968, "loss/crossentropy": 2.314089775085449, "loss/hidden": 3.290625, "loss/incoh": 0.0, "loss/logits": 0.2804916575551033, "loss/reg": 0.0, "step": 5950 }, { "epoch": 0.03921052631578947, "grad_norm": 2.578125, "grad_norm_var": 0.10930989583333334, "learning_rate": 0.0001, "loss": 3.4204, "loss/crossentropy": 2.5095210552215574, "loss/hidden": 3.1890625, "loss/incoh": 0.0, "loss/logits": 0.34905528128147123, "loss/reg": 0.0, "step": 5960 }, { "epoch": 0.03927631578947369, "grad_norm": 2.5, "grad_norm_var": 0.024348958333333334, "learning_rate": 0.0001, "loss": 3.3368, "loss/crossentropy": 2.3903687596321106, "loss/hidden": 3.228125, "loss/incoh": 0.0, "loss/logits": 0.36534676551818845, "loss/reg": 0.0, "step": 5970 }, { "epoch": 0.039342105263157894, "grad_norm": 2.703125, "grad_norm_var": 0.04106343587239583, "learning_rate": 0.0001, "loss": 3.3975, "loss/crossentropy": 2.485898661613464, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.28061621338129045, "loss/reg": 0.0, "step": 5980 }, { "epoch": 0.03940789473684211, "grad_norm": 2.578125, "grad_norm_var": 0.0488677978515625, "learning_rate": 0.0001, "loss": 3.3803, "loss/crossentropy": 2.4124781847000123, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.30112328827381135, "loss/reg": 0.0, "step": 5990 }, { "epoch": 0.039473684210526314, "grad_norm": 2.734375, "grad_norm_var": 0.44810791015625, "learning_rate": 0.0001, "loss": 3.472, "loss/crossentropy": 2.4459670901298525, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.2799991726875305, "loss/reg": 0.0, "step": 6000 }, { "epoch": 0.03953947368421053, "grad_norm": 2.5625, "grad_norm_var": 0.1685546875, "learning_rate": 0.0001, "loss": 3.4322, "loss/crossentropy": 2.641672468185425, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.31368287801742556, "loss/reg": 0.0, "step": 6010 }, { "epoch": 0.039605263157894734, "grad_norm": 2.546875, "grad_norm_var": 0.03518473307291667, "learning_rate": 0.0001, "loss": 3.4325, "loss/crossentropy": 2.2493654131889342, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.29540172666311265, "loss/reg": 0.0, "step": 6020 }, { "epoch": 0.03967105263157895, "grad_norm": 2.25, "grad_norm_var": 0.04810282389322917, "learning_rate": 0.0001, "loss": 3.408, "loss/crossentropy": 2.5626556158065794, "loss/hidden": 3.178125, "loss/incoh": 0.0, "loss/logits": 0.3445401757955551, "loss/reg": 0.0, "step": 6030 }, { "epoch": 0.03973684210526316, "grad_norm": 3.375, "grad_norm_var": 0.11850484212239583, "learning_rate": 0.0001, "loss": 3.3697, "loss/crossentropy": 2.2249147415161135, "loss/hidden": 3.165625, "loss/incoh": 0.0, "loss/logits": 0.3320572040975094, "loss/reg": 0.0, "step": 6040 }, { "epoch": 0.03980263157894737, "grad_norm": 2.34375, "grad_norm_var": 0.1265045166015625, "learning_rate": 0.0001, "loss": 3.3726, "loss/crossentropy": 2.479216980934143, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.3085744693875313, "loss/reg": 0.0, "step": 6050 }, { "epoch": 0.03986842105263158, "grad_norm": 2.265625, "grad_norm_var": 0.2034088134765625, "learning_rate": 0.0001, "loss": 3.4346, "loss/crossentropy": 2.3974440932273864, "loss/hidden": 3.259375, "loss/incoh": 0.0, "loss/logits": 0.4024490460753441, "loss/reg": 0.0, "step": 6060 }, { "epoch": 0.03993421052631579, "grad_norm": 2.640625, "grad_norm_var": 0.19241434733072918, "learning_rate": 0.0001, "loss": 3.3911, "loss/crossentropy": 2.3311298370361326, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.2968046382069588, "loss/reg": 0.0, "step": 6070 }, { "epoch": 0.04, "grad_norm": 2.734375, "grad_norm_var": 0.060445149739583336, "learning_rate": 0.0001, "loss": 3.5175, "loss/crossentropy": 2.6493954181671144, "loss/hidden": 3.2828125, "loss/incoh": 0.0, "loss/logits": 0.3133848324418068, "loss/reg": 0.0, "step": 6080 }, { "epoch": 0.04006578947368421, "grad_norm": 2.34375, "grad_norm_var": 0.2754140218098958, "learning_rate": 0.0001, "loss": 3.3932, "loss/crossentropy": 2.4315222024917604, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.3378627926111221, "loss/reg": 0.0, "step": 6090 }, { "epoch": 0.04013157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.04810282389322917, "learning_rate": 0.0001, "loss": 3.3698, "loss/crossentropy": 2.421731984615326, "loss/hidden": 3.315625, "loss/incoh": 0.0, "loss/logits": 0.3809585988521576, "loss/reg": 0.0, "step": 6100 }, { "epoch": 0.040197368421052634, "grad_norm": 2.328125, "grad_norm_var": 0.0509429931640625, "learning_rate": 0.0001, "loss": 3.3839, "loss/crossentropy": 2.2816696763038635, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.3188880756497383, "loss/reg": 0.0, "step": 6110 }, { "epoch": 0.04026315789473684, "grad_norm": 2.421875, "grad_norm_var": 0.26266276041666664, "learning_rate": 0.0001, "loss": 3.4852, "loss/crossentropy": 2.4251498103141786, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.3131637305021286, "loss/reg": 0.0, "step": 6120 }, { "epoch": 0.040328947368421054, "grad_norm": 2.046875, "grad_norm_var": 0.3377919514973958, "learning_rate": 0.0001, "loss": 3.4191, "loss/crossentropy": 2.321718716621399, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.28195892125368116, "loss/reg": 0.0, "step": 6130 }, { "epoch": 0.04039473684210526, "grad_norm": 2.515625, "grad_norm_var": 0.0685943603515625, "learning_rate": 0.0001, "loss": 3.4412, "loss/crossentropy": 2.658420753479004, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.3355125278234482, "loss/reg": 0.0, "step": 6140 }, { "epoch": 0.040460526315789475, "grad_norm": 2.1875, "grad_norm_var": 0.059305826822916664, "learning_rate": 0.0001, "loss": 3.3292, "loss/crossentropy": 2.5203867316246034, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.3060264021158218, "loss/reg": 0.0, "step": 6150 }, { "epoch": 0.04052631578947368, "grad_norm": 3.078125, "grad_norm_var": 0.052302042643229164, "learning_rate": 0.0001, "loss": 3.3844, "loss/crossentropy": 2.53420695066452, "loss/hidden": 3.178125, "loss/incoh": 0.0, "loss/logits": 0.3124456375837326, "loss/reg": 0.0, "step": 6160 }, { "epoch": 0.040592105263157895, "grad_norm": 2.375, "grad_norm_var": 0.049779256184895836, "learning_rate": 0.0001, "loss": 3.3634, "loss/crossentropy": 2.5132151365280153, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.35312571823596955, "loss/reg": 0.0, "step": 6170 }, { "epoch": 0.04065789473684211, "grad_norm": 2.46875, "grad_norm_var": 0.16035054524739584, "learning_rate": 0.0001, "loss": 3.4426, "loss/crossentropy": 2.3004459500312806, "loss/hidden": 3.1703125, "loss/incoh": 0.0, "loss/logits": 0.28007449954748154, "loss/reg": 0.0, "step": 6180 }, { "epoch": 0.040723684210526315, "grad_norm": 2.515625, "grad_norm_var": 0.1358306884765625, "learning_rate": 0.0001, "loss": 3.3068, "loss/crossentropy": 2.373980039358139, "loss/hidden": 3.015625, "loss/incoh": 0.0, "loss/logits": 0.28091391175985336, "loss/reg": 0.0, "step": 6190 }, { "epoch": 0.04078947368421053, "grad_norm": 2.84375, "grad_norm_var": 0.08801981608072916, "learning_rate": 0.0001, "loss": 3.4006, "loss/crossentropy": 2.2023098945617674, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.2726595625281334, "loss/reg": 0.0, "step": 6200 }, { "epoch": 0.040855263157894735, "grad_norm": 2.984375, "grad_norm_var": 3.6811187744140623, "learning_rate": 0.0001, "loss": 3.6073, "loss/crossentropy": 2.382032370567322, "loss/hidden": 3.6921875, "loss/incoh": 0.0, "loss/logits": 0.4323269993066788, "loss/reg": 0.0, "step": 6210 }, { "epoch": 0.04092105263157895, "grad_norm": 2.296875, "grad_norm_var": 3.804295857747396, "learning_rate": 0.0001, "loss": 3.3627, "loss/crossentropy": 2.569228994846344, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.31191317439079286, "loss/reg": 0.0, "step": 6220 }, { "epoch": 0.040986842105263155, "grad_norm": 2.828125, "grad_norm_var": 0.08497721354166667, "learning_rate": 0.0001, "loss": 3.3414, "loss/crossentropy": 2.5153043985366823, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.30689269602298735, "loss/reg": 0.0, "step": 6230 }, { "epoch": 0.04105263157894737, "grad_norm": 2.375, "grad_norm_var": 1.2814849853515624, "learning_rate": 0.0001, "loss": 3.4386, "loss/crossentropy": 2.283157765865326, "loss/hidden": 3.1203125, "loss/incoh": 0.0, "loss/logits": 0.3153227433562279, "loss/reg": 0.0, "step": 6240 }, { "epoch": 0.04111842105263158, "grad_norm": 2.234375, "grad_norm_var": 5.054263305664063, "learning_rate": 0.0001, "loss": 3.4379, "loss/crossentropy": 2.585819673538208, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.2913993000984192, "loss/reg": 0.0, "step": 6250 }, { "epoch": 0.04118421052631579, "grad_norm": 2.5, "grad_norm_var": 0.0501373291015625, "learning_rate": 0.0001, "loss": 3.3357, "loss/crossentropy": 2.453801620006561, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.3249350532889366, "loss/reg": 0.0, "step": 6260 }, { "epoch": 0.04125, "grad_norm": 2.5, "grad_norm_var": 0.0405181884765625, "learning_rate": 0.0001, "loss": 3.3525, "loss/crossentropy": 2.4949014663696287, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.2902356445789337, "loss/reg": 0.0, "step": 6270 }, { "epoch": 0.04131578947368421, "grad_norm": 2.78125, "grad_norm_var": 1.6241770426432292, "learning_rate": 0.0001, "loss": 3.4444, "loss/crossentropy": 2.029393529891968, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.30831936225295065, "loss/reg": 0.0, "step": 6280 }, { "epoch": 0.04138157894736842, "grad_norm": 2.53125, "grad_norm_var": 0.0864654541015625, "learning_rate": 0.0001, "loss": 3.3944, "loss/crossentropy": 3.0075352430343627, "loss/hidden": 3.2390625, "loss/incoh": 0.0, "loss/logits": 0.4476942718029022, "loss/reg": 0.0, "step": 6290 }, { "epoch": 0.04144736842105263, "grad_norm": 2.515625, "grad_norm_var": 0.15415751139322917, "learning_rate": 0.0001, "loss": 3.4091, "loss/crossentropy": 2.36070739030838, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.26602610796689985, "loss/reg": 0.0, "step": 6300 }, { "epoch": 0.04151315789473684, "grad_norm": 3.0, "grad_norm_var": 0.18596903483072916, "learning_rate": 0.0001, "loss": 3.4135, "loss/crossentropy": 2.186020624637604, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.30242343842983244, "loss/reg": 0.0, "step": 6310 }, { "epoch": 0.041578947368421056, "grad_norm": 3.34375, "grad_norm_var": 0.06327718098958333, "learning_rate": 0.0001, "loss": 3.44, "loss/crossentropy": 2.3751362919807435, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.29995152205228803, "loss/reg": 0.0, "step": 6320 }, { "epoch": 0.04164473684210526, "grad_norm": 7.9375, "grad_norm_var": 3.442041015625, "learning_rate": 0.0001, "loss": 3.4985, "loss/crossentropy": 2.2889585196971893, "loss/hidden": 3.14375, "loss/incoh": 0.0, "loss/logits": 0.5501813948154449, "loss/reg": 0.0, "step": 6330 }, { "epoch": 0.041710526315789476, "grad_norm": 2.875, "grad_norm_var": 3.8211822509765625, "learning_rate": 0.0001, "loss": 3.4688, "loss/crossentropy": 2.378446078300476, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.30127355754375457, "loss/reg": 0.0, "step": 6340 }, { "epoch": 0.04177631578947368, "grad_norm": 3.109375, "grad_norm_var": 1.1990793863932292, "learning_rate": 0.0001, "loss": 3.3813, "loss/crossentropy": 2.444005084037781, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.3470224469900131, "loss/reg": 0.0, "step": 6350 }, { "epoch": 0.041842105263157896, "grad_norm": 2.234375, "grad_norm_var": 0.7403065999348958, "learning_rate": 0.0001, "loss": 3.4, "loss/crossentropy": 2.026668357849121, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.2548553004860878, "loss/reg": 0.0, "step": 6360 }, { "epoch": 0.0419078947368421, "grad_norm": 2.921875, "grad_norm_var": 0.3556925455729167, "learning_rate": 0.0001, "loss": 3.4591, "loss/crossentropy": 2.331345629692078, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.26911164075136185, "loss/reg": 0.0, "step": 6370 }, { "epoch": 0.041973684210526316, "grad_norm": 2.484375, "grad_norm_var": 0.27215169270833334, "learning_rate": 0.0001, "loss": 3.4011, "loss/crossentropy": 2.2243176221847536, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.30012439042329786, "loss/reg": 0.0, "step": 6380 }, { "epoch": 0.04203947368421053, "grad_norm": 2.625, "grad_norm_var": 0.0267974853515625, "learning_rate": 0.0001, "loss": 3.3476, "loss/crossentropy": 2.284479832649231, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.3452886208891869, "loss/reg": 0.0, "step": 6390 }, { "epoch": 0.042105263157894736, "grad_norm": 2.859375, "grad_norm_var": 0.18044331868489583, "learning_rate": 0.0001, "loss": 3.4206, "loss/crossentropy": 2.2036523103713987, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.2783824667334557, "loss/reg": 0.0, "step": 6400 }, { "epoch": 0.04217105263157895, "grad_norm": 2.34375, "grad_norm_var": 4.91226806640625, "learning_rate": 0.0001, "loss": 3.4892, "loss/crossentropy": 2.2305456399917603, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.29731594026088715, "loss/reg": 0.0, "step": 6410 }, { "epoch": 0.042236842105263156, "grad_norm": 2.15625, "grad_norm_var": 0.8588216145833333, "learning_rate": 0.0001, "loss": 3.3759, "loss/crossentropy": 2.186525213718414, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.31351439356803895, "loss/reg": 0.0, "step": 6420 }, { "epoch": 0.04230263157894737, "grad_norm": 2.421875, "grad_norm_var": 1.168097941080729, "learning_rate": 0.0001, "loss": 3.4452, "loss/crossentropy": 2.3028628826141357, "loss/hidden": 3.29375, "loss/incoh": 0.0, "loss/logits": 0.40216329991817473, "loss/reg": 0.0, "step": 6430 }, { "epoch": 0.042368421052631576, "grad_norm": 2.296875, "grad_norm_var": 0.0708160400390625, "learning_rate": 0.0001, "loss": 3.362, "loss/crossentropy": 2.5469772100448607, "loss/hidden": 3.10625, "loss/incoh": 0.0, "loss/logits": 0.3354289785027504, "loss/reg": 0.0, "step": 6440 }, { "epoch": 0.04243421052631579, "grad_norm": 3.4375, "grad_norm_var": 0.09075419108072917, "learning_rate": 0.0001, "loss": 3.366, "loss/crossentropy": 2.3079045534133913, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.2784201934933662, "loss/reg": 0.0, "step": 6450 }, { "epoch": 0.0425, "grad_norm": 2.5625, "grad_norm_var": 0.10932515462239584, "learning_rate": 0.0001, "loss": 3.4156, "loss/crossentropy": 2.325330352783203, "loss/hidden": 3.1828125, "loss/incoh": 0.0, "loss/logits": 0.3098024681210518, "loss/reg": 0.0, "step": 6460 }, { "epoch": 0.04256578947368421, "grad_norm": 2.890625, "grad_norm_var": 0.07423502604166667, "learning_rate": 0.0001, "loss": 3.3605, "loss/crossentropy": 2.4809486865997314, "loss/hidden": 3.146875, "loss/incoh": 0.0, "loss/logits": 0.336503566801548, "loss/reg": 0.0, "step": 6470 }, { "epoch": 0.04263157894736842, "grad_norm": 2.5, "grad_norm_var": 0.06294657389322916, "learning_rate": 0.0001, "loss": 3.4219, "loss/crossentropy": 2.357036566734314, "loss/hidden": 3.16875, "loss/incoh": 0.0, "loss/logits": 0.31517077386379244, "loss/reg": 0.0, "step": 6480 }, { "epoch": 0.04269736842105263, "grad_norm": 2.96875, "grad_norm_var": 0.044188435872395834, "learning_rate": 0.0001, "loss": 3.2698, "loss/crossentropy": 2.3011206150054933, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.3023343622684479, "loss/reg": 0.0, "step": 6490 }, { "epoch": 0.04276315789473684, "grad_norm": 3.75, "grad_norm_var": 0.191259765625, "learning_rate": 0.0001, "loss": 3.3991, "loss/crossentropy": 2.135625755786896, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.30503824055194856, "loss/reg": 0.0, "step": 6500 }, { "epoch": 0.04282894736842105, "grad_norm": 2.796875, "grad_norm_var": 0.16857096354166667, "learning_rate": 0.0001, "loss": 3.317, "loss/crossentropy": 2.2963179469108583, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.30365400537848475, "loss/reg": 0.0, "step": 6510 }, { "epoch": 0.04289473684210526, "grad_norm": 2.578125, "grad_norm_var": 0.0795562744140625, "learning_rate": 0.0001, "loss": 3.3769, "loss/crossentropy": 2.463003098964691, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2864454731345177, "loss/reg": 0.0, "step": 6520 }, { "epoch": 0.04296052631578948, "grad_norm": 3.28125, "grad_norm_var": 0.1319244384765625, "learning_rate": 0.0001, "loss": 3.4418, "loss/crossentropy": 2.4336194515228273, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.2907303601503372, "loss/reg": 0.0, "step": 6530 }, { "epoch": 0.04302631578947368, "grad_norm": 2.78125, "grad_norm_var": 0.18455403645833332, "learning_rate": 0.0001, "loss": 3.5029, "loss/crossentropy": 2.4530020356178284, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.3084172964096069, "loss/reg": 0.0, "step": 6540 }, { "epoch": 0.0430921052631579, "grad_norm": 2.625, "grad_norm_var": 0.0909332275390625, "learning_rate": 0.0001, "loss": 3.4113, "loss/crossentropy": 2.28972727060318, "loss/hidden": 3.09375, "loss/incoh": 0.0, "loss/logits": 0.29103828966617584, "loss/reg": 0.0, "step": 6550 }, { "epoch": 0.0431578947368421, "grad_norm": 3.15625, "grad_norm_var": 0.09442952473958334, "learning_rate": 0.0001, "loss": 3.3842, "loss/crossentropy": 2.5410515666007996, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.262499050796032, "loss/reg": 0.0, "step": 6560 }, { "epoch": 0.04322368421052632, "grad_norm": 2.625, "grad_norm_var": 0.5502105712890625, "learning_rate": 0.0001, "loss": 3.3708, "loss/crossentropy": 2.460482358932495, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.2910577103495598, "loss/reg": 0.0, "step": 6570 }, { "epoch": 0.043289473684210523, "grad_norm": 2.484375, "grad_norm_var": 0.5198527018229167, "learning_rate": 0.0001, "loss": 3.2716, "loss/crossentropy": 2.264538216590881, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.26389954835176466, "loss/reg": 0.0, "step": 6580 }, { "epoch": 0.04335526315789474, "grad_norm": 2.609375, "grad_norm_var": 0.58785400390625, "learning_rate": 0.0001, "loss": 3.4431, "loss/crossentropy": 2.705476760864258, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.3710309460759163, "loss/reg": 0.0, "step": 6590 }, { "epoch": 0.04342105263157895, "grad_norm": 2.71875, "grad_norm_var": 0.7066721598307292, "learning_rate": 0.0001, "loss": 3.3844, "loss/crossentropy": 2.3109049081802366, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.3492768794298172, "loss/reg": 0.0, "step": 6600 }, { "epoch": 0.04348684210526316, "grad_norm": 2.40625, "grad_norm_var": 2.2473958333333335, "learning_rate": 0.0001, "loss": 3.5366, "loss/crossentropy": 2.110491228103638, "loss/hidden": 3.2296875, "loss/incoh": 0.0, "loss/logits": 0.3138969212770462, "loss/reg": 0.0, "step": 6610 }, { "epoch": 0.04355263157894737, "grad_norm": 2.265625, "grad_norm_var": 1.2563222249348958, "learning_rate": 0.0001, "loss": 3.3583, "loss/crossentropy": 2.5283448338508605, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.2928563803434372, "loss/reg": 0.0, "step": 6620 }, { "epoch": 0.04361842105263158, "grad_norm": 2.359375, "grad_norm_var": 1.2412261962890625, "learning_rate": 0.0001, "loss": 3.3885, "loss/crossentropy": 2.579999303817749, "loss/hidden": 3.4296875, "loss/incoh": 0.0, "loss/logits": 0.4414908319711685, "loss/reg": 0.0, "step": 6630 }, { "epoch": 0.04368421052631579, "grad_norm": 2.546875, "grad_norm_var": 0.11516825358072917, "learning_rate": 0.0001, "loss": 3.3022, "loss/crossentropy": 2.413753032684326, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.28862773478031156, "loss/reg": 0.0, "step": 6640 }, { "epoch": 0.04375, "grad_norm": 2.828125, "grad_norm_var": 0.28503316243489585, "learning_rate": 0.0001, "loss": 3.4009, "loss/crossentropy": 2.4203495264053343, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2926980495452881, "loss/reg": 0.0, "step": 6650 }, { "epoch": 0.04381578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.07642822265625, "learning_rate": 0.0001, "loss": 3.4092, "loss/crossentropy": 2.3588085770606995, "loss/hidden": 3.2, "loss/incoh": 0.0, "loss/logits": 0.32405087500810625, "loss/reg": 0.0, "step": 6660 }, { "epoch": 0.043881578947368424, "grad_norm": 3.53125, "grad_norm_var": 0.45767822265625, "learning_rate": 0.0001, "loss": 3.4143, "loss/crossentropy": 2.512442636489868, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.2854305922985077, "loss/reg": 0.0, "step": 6670 }, { "epoch": 0.04394736842105263, "grad_norm": 2.328125, "grad_norm_var": 0.3211873372395833, "learning_rate": 0.0001, "loss": 3.3297, "loss/crossentropy": 2.5028780698776245, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.3471944749355316, "loss/reg": 0.0, "step": 6680 }, { "epoch": 0.044013157894736844, "grad_norm": 4.0625, "grad_norm_var": 0.8257720947265625, "learning_rate": 0.0001, "loss": 3.4063, "loss/crossentropy": 2.2031276702880858, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.25473351776599884, "loss/reg": 0.0, "step": 6690 }, { "epoch": 0.04407894736842105, "grad_norm": 2.265625, "grad_norm_var": 0.22024637858072918, "learning_rate": 0.0001, "loss": 3.4035, "loss/crossentropy": 2.2040748953819276, "loss/hidden": 3.434375, "loss/incoh": 0.0, "loss/logits": 0.3361481264233589, "loss/reg": 0.0, "step": 6700 }, { "epoch": 0.044144736842105264, "grad_norm": 2.15625, "grad_norm_var": 0.1261383056640625, "learning_rate": 0.0001, "loss": 3.4225, "loss/crossentropy": 2.390383541584015, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2891513243317604, "loss/reg": 0.0, "step": 6710 }, { "epoch": 0.04421052631578947, "grad_norm": 2.65625, "grad_norm_var": 0.0734375, "learning_rate": 0.0001, "loss": 3.4363, "loss/crossentropy": 2.392831575870514, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.3119692116975784, "loss/reg": 0.0, "step": 6720 }, { "epoch": 0.044276315789473684, "grad_norm": 2.453125, "grad_norm_var": 0.12464192708333334, "learning_rate": 0.0001, "loss": 3.3959, "loss/crossentropy": 2.426222395896912, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.2563284829258919, "loss/reg": 0.0, "step": 6730 }, { "epoch": 0.0443421052631579, "grad_norm": 2.890625, "grad_norm_var": 3.1175201416015623, "learning_rate": 0.0001, "loss": 3.4427, "loss/crossentropy": 2.408197546005249, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.3056713670492172, "loss/reg": 0.0, "step": 6740 }, { "epoch": 0.044407894736842105, "grad_norm": 2.359375, "grad_norm_var": 2.6096099853515624, "learning_rate": 0.0001, "loss": 3.3365, "loss/crossentropy": 2.5353691220283507, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.30413212031126025, "loss/reg": 0.0, "step": 6750 }, { "epoch": 0.04447368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.233740234375, "learning_rate": 0.0001, "loss": 3.371, "loss/crossentropy": 2.495334494113922, "loss/hidden": 3.0640625, "loss/incoh": 0.0, "loss/logits": 0.3251196876168251, "loss/reg": 0.0, "step": 6760 }, { "epoch": 0.044539473684210525, "grad_norm": 2.375, "grad_norm_var": 0.11097005208333334, "learning_rate": 0.0001, "loss": 3.3306, "loss/crossentropy": 2.3767608165740968, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.27886694818735125, "loss/reg": 0.0, "step": 6770 }, { "epoch": 0.04460526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.17870992024739582, "learning_rate": 0.0001, "loss": 3.4315, "loss/crossentropy": 2.5281002640724184, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.31765572130680086, "loss/reg": 0.0, "step": 6780 }, { "epoch": 0.044671052631578945, "grad_norm": 3.046875, "grad_norm_var": 3.0139973958333335, "learning_rate": 0.0001, "loss": 3.4717, "loss/crossentropy": 2.362587594985962, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.3029619336128235, "loss/reg": 0.0, "step": 6790 }, { "epoch": 0.04473684210526316, "grad_norm": 2.6875, "grad_norm_var": 7.6724192301432295, "learning_rate": 0.0001, "loss": 3.4302, "loss/crossentropy": 2.4665863275527955, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.2934702351689339, "loss/reg": 0.0, "step": 6800 }, { "epoch": 0.04480263157894737, "grad_norm": 2.453125, "grad_norm_var": 7.714127604166666, "learning_rate": 0.0001, "loss": 3.4034, "loss/crossentropy": 2.1714313626289368, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.3513699471950531, "loss/reg": 0.0, "step": 6810 }, { "epoch": 0.04486842105263158, "grad_norm": 3.953125, "grad_norm_var": 0.17561848958333334, "learning_rate": 0.0001, "loss": 3.3193, "loss/crossentropy": 2.145359480381012, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.299596332013607, "loss/reg": 0.0, "step": 6820 }, { "epoch": 0.04493421052631579, "grad_norm": 2.328125, "grad_norm_var": 0.20790913899739583, "learning_rate": 0.0001, "loss": 3.343, "loss/crossentropy": 2.7453124046325685, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.36230285465717316, "loss/reg": 0.0, "step": 6830 }, { "epoch": 0.045, "grad_norm": 2.6875, "grad_norm_var": 2.236214192708333, "learning_rate": 0.0001, "loss": 3.4372, "loss/crossentropy": 2.256898009777069, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.2946140691637993, "loss/reg": 0.0, "step": 6840 }, { "epoch": 0.04506578947368421, "grad_norm": 2.375, "grad_norm_var": 2.391950480143229, "learning_rate": 0.0001, "loss": 3.3701, "loss/crossentropy": 2.6210601568222045, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.3517232984304428, "loss/reg": 0.0, "step": 6850 }, { "epoch": 0.04513157894736842, "grad_norm": 2.4375, "grad_norm_var": 1.015998331705729, "learning_rate": 0.0001, "loss": 3.3998, "loss/crossentropy": 2.387173318862915, "loss/hidden": 3.3296875, "loss/incoh": 0.0, "loss/logits": 0.3826398134231567, "loss/reg": 0.0, "step": 6860 }, { "epoch": 0.04519736842105263, "grad_norm": 2.84375, "grad_norm_var": 0.44384663899739585, "learning_rate": 0.0001, "loss": 3.3858, "loss/crossentropy": 2.4001118540763855, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.27424332648515704, "loss/reg": 0.0, "step": 6870 }, { "epoch": 0.045263157894736845, "grad_norm": 2.671875, "grad_norm_var": 0.06037495930989583, "learning_rate": 0.0001, "loss": 3.3752, "loss/crossentropy": 2.1074828147888183, "loss/hidden": 3.0234375, "loss/incoh": 0.0, "loss/logits": 0.2777694225311279, "loss/reg": 0.0, "step": 6880 }, { "epoch": 0.04532894736842105, "grad_norm": 2.515625, "grad_norm_var": 0.1735504150390625, "learning_rate": 0.0001, "loss": 3.3792, "loss/crossentropy": 2.2711395502090452, "loss/hidden": 3.1296875, "loss/incoh": 0.0, "loss/logits": 0.3001497104763985, "loss/reg": 0.0, "step": 6890 }, { "epoch": 0.045394736842105265, "grad_norm": 3.0, "grad_norm_var": 1.8758951822916667, "learning_rate": 0.0001, "loss": 3.4311, "loss/crossentropy": 2.2329100012779235, "loss/hidden": 3.3421875, "loss/incoh": 0.0, "loss/logits": 0.3293987289071083, "loss/reg": 0.0, "step": 6900 }, { "epoch": 0.04546052631578947, "grad_norm": 2.703125, "grad_norm_var": 1.793781534830729, "learning_rate": 0.0001, "loss": 3.2852, "loss/crossentropy": 2.3576371729373933, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.27053930461406706, "loss/reg": 0.0, "step": 6910 }, { "epoch": 0.045526315789473686, "grad_norm": 2.484375, "grad_norm_var": 1.989207967122396, "learning_rate": 0.0001, "loss": 3.4411, "loss/crossentropy": 2.532450318336487, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.2638275146484375, "loss/reg": 0.0, "step": 6920 }, { "epoch": 0.04559210526315789, "grad_norm": 2.546875, "grad_norm_var": 2.184137980143229, "learning_rate": 0.0001, "loss": 3.4069, "loss/crossentropy": 2.4933431267738344, "loss/hidden": 3.2640625, "loss/incoh": 0.0, "loss/logits": 0.32281421422958373, "loss/reg": 0.0, "step": 6930 }, { "epoch": 0.045657894736842106, "grad_norm": 2.421875, "grad_norm_var": 0.05158589680989583, "learning_rate": 0.0001, "loss": 3.284, "loss/crossentropy": 2.2752655148506165, "loss/hidden": 2.9734375, "loss/incoh": 0.0, "loss/logits": 0.2645736649632454, "loss/reg": 0.0, "step": 6940 }, { "epoch": 0.04572368421052632, "grad_norm": 2.40625, "grad_norm_var": 0.0625152587890625, "learning_rate": 0.0001, "loss": 3.3456, "loss/crossentropy": 2.290147030353546, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.27036611288785933, "loss/reg": 0.0, "step": 6950 }, { "epoch": 0.045789473684210526, "grad_norm": 2.53125, "grad_norm_var": 6.027144368489584, "learning_rate": 0.0001, "loss": 3.4116, "loss/crossentropy": 2.359883761405945, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.29680062681436536, "loss/reg": 0.0, "step": 6960 }, { "epoch": 0.04585526315789474, "grad_norm": 2.984375, "grad_norm_var": 0.5505208333333333, "learning_rate": 0.0001, "loss": 3.334, "loss/crossentropy": 2.4557218074798586, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.2946802690625191, "loss/reg": 0.0, "step": 6970 }, { "epoch": 0.045921052631578946, "grad_norm": 2.5, "grad_norm_var": 0.50865478515625, "learning_rate": 0.0001, "loss": 3.3166, "loss/crossentropy": 2.2482771933078767, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.25328404903411866, "loss/reg": 0.0, "step": 6980 }, { "epoch": 0.04598684210526316, "grad_norm": 2.203125, "grad_norm_var": 0.045685831705729166, "learning_rate": 0.0001, "loss": 3.4432, "loss/crossentropy": 2.3823115646839144, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.29391862004995345, "loss/reg": 0.0, "step": 6990 }, { "epoch": 0.046052631578947366, "grad_norm": 2.453125, "grad_norm_var": 0.04589436848958333, "learning_rate": 0.0001, "loss": 3.4052, "loss/crossentropy": 2.67444326877594, "loss/hidden": 3.3140625, "loss/incoh": 0.0, "loss/logits": 0.336136220395565, "loss/reg": 0.0, "step": 7000 }, { "epoch": 0.04611842105263158, "grad_norm": 2.484375, "grad_norm_var": 0.03218994140625, "learning_rate": 0.0001, "loss": 3.3293, "loss/crossentropy": 2.4088356614112856, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.26314016729593276, "loss/reg": 0.0, "step": 7010 }, { "epoch": 0.04618421052631579, "grad_norm": 2.484375, "grad_norm_var": 0.044266764322916666, "learning_rate": 0.0001, "loss": 3.3201, "loss/crossentropy": 2.210337924957275, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.28177270889282224, "loss/reg": 0.0, "step": 7020 }, { "epoch": 0.04625, "grad_norm": 2.328125, "grad_norm_var": 1.1310780843098958, "learning_rate": 0.0001, "loss": 3.4065, "loss/crossentropy": 2.3901759028434753, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3315755516290665, "loss/reg": 0.0, "step": 7030 }, { "epoch": 0.04631578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.27060139973958336, "learning_rate": 0.0001, "loss": 3.4377, "loss/crossentropy": 2.3308543801307677, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.27955446392297745, "loss/reg": 0.0, "step": 7040 }, { "epoch": 0.04638157894736842, "grad_norm": 3.953125, "grad_norm_var": 0.25728759765625, "learning_rate": 0.0001, "loss": 3.3076, "loss/crossentropy": 2.3288169384002684, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.25437864363193513, "loss/reg": 0.0, "step": 7050 }, { "epoch": 0.04644736842105263, "grad_norm": 3.046875, "grad_norm_var": 0.20709228515625, "learning_rate": 0.0001, "loss": 3.439, "loss/crossentropy": 2.033195120096207, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.2639622241258621, "loss/reg": 0.0, "step": 7060 }, { "epoch": 0.04651315789473684, "grad_norm": 2.390625, "grad_norm_var": 0.84127197265625, "learning_rate": 0.0001, "loss": 3.3119, "loss/crossentropy": 2.366466200351715, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.2680037707090378, "loss/reg": 0.0, "step": 7070 }, { "epoch": 0.04657894736842105, "grad_norm": 11.9375, "grad_norm_var": 5.57564697265625, "learning_rate": 0.0001, "loss": 3.4068, "loss/crossentropy": 2.3484351873397826, "loss/hidden": 3.3828125, "loss/incoh": 0.0, "loss/logits": 0.3572035223245621, "loss/reg": 0.0, "step": 7080 }, { "epoch": 0.04664473684210527, "grad_norm": 2.515625, "grad_norm_var": 5.670992024739584, "learning_rate": 0.0001, "loss": 3.5356, "loss/crossentropy": 2.491948664188385, "loss/hidden": 3.1484375, "loss/incoh": 0.0, "loss/logits": 0.33623201847076417, "loss/reg": 0.0, "step": 7090 }, { "epoch": 0.04671052631578947, "grad_norm": 7.90625, "grad_norm_var": 1.8282297770182292, "learning_rate": 0.0001, "loss": 3.4057, "loss/crossentropy": 2.3702210783958435, "loss/hidden": 3.203125, "loss/incoh": 0.0, "loss/logits": 0.3451476514339447, "loss/reg": 0.0, "step": 7100 }, { "epoch": 0.04677631578947369, "grad_norm": 2.5, "grad_norm_var": 1.74947509765625, "learning_rate": 0.0001, "loss": 3.4173, "loss/crossentropy": 2.372307813167572, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.30449149161577227, "loss/reg": 0.0, "step": 7110 }, { "epoch": 0.04684210526315789, "grad_norm": 2.484375, "grad_norm_var": 0.1269683837890625, "learning_rate": 0.0001, "loss": 3.3266, "loss/crossentropy": 2.4220902919769287, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.29380183219909667, "loss/reg": 0.0, "step": 7120 }, { "epoch": 0.04690789473684211, "grad_norm": 2.375, "grad_norm_var": 5.4926503499348955, "learning_rate": 0.0001, "loss": 3.4299, "loss/crossentropy": 2.333963227272034, "loss/hidden": 3.0890625, "loss/incoh": 0.0, "loss/logits": 0.32264130711555483, "loss/reg": 0.0, "step": 7130 }, { "epoch": 0.04697368421052631, "grad_norm": 2.453125, "grad_norm_var": 8.575126139322917, "learning_rate": 0.0001, "loss": 3.5129, "loss/crossentropy": 2.447948896884918, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.34722310602664946, "loss/reg": 0.0, "step": 7140 }, { "epoch": 0.04703947368421053, "grad_norm": 2.171875, "grad_norm_var": 4.295670572916666, "learning_rate": 0.0001, "loss": 3.4954, "loss/crossentropy": 2.4875372767448427, "loss/hidden": 3.1765625, "loss/incoh": 0.0, "loss/logits": 0.38640123009681704, "loss/reg": 0.0, "step": 7150 }, { "epoch": 0.04710526315789473, "grad_norm": 2.53125, "grad_norm_var": 0.09901936848958333, "learning_rate": 0.0001, "loss": 3.3592, "loss/crossentropy": 2.4286764740943907, "loss/hidden": 3.1515625, "loss/incoh": 0.0, "loss/logits": 0.3441846176981926, "loss/reg": 0.0, "step": 7160 }, { "epoch": 0.04717105263157895, "grad_norm": 2.46875, "grad_norm_var": 0.09706624348958333, "learning_rate": 0.0001, "loss": 3.4219, "loss/crossentropy": 2.197064208984375, "loss/hidden": 3.215625, "loss/incoh": 0.0, "loss/logits": 0.30653059035539626, "loss/reg": 0.0, "step": 7170 }, { "epoch": 0.04723684210526316, "grad_norm": 2.421875, "grad_norm_var": 0.07274983723958334, "learning_rate": 0.0001, "loss": 3.4754, "loss/crossentropy": 2.592367339134216, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.30859925150871276, "loss/reg": 0.0, "step": 7180 }, { "epoch": 0.04730263157894737, "grad_norm": 2.671875, "grad_norm_var": 0.04854227701822917, "learning_rate": 0.0001, "loss": 3.335, "loss/crossentropy": 2.4139750719070436, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.26624749004840853, "loss/reg": 0.0, "step": 7190 }, { "epoch": 0.04736842105263158, "grad_norm": 2.34375, "grad_norm_var": 0.061579386393229164, "learning_rate": 0.0001, "loss": 3.4049, "loss/crossentropy": 1.9312179803848266, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.23240152448415757, "loss/reg": 0.0, "step": 7200 }, { "epoch": 0.04743421052631579, "grad_norm": 3.140625, "grad_norm_var": 2.7487790626051414e+17, "learning_rate": 0.0001, "loss": 3.5157, "loss/crossentropy": 2.192197346687317, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.28352061808109286, "loss/reg": 0.0, "step": 7210 }, { "epoch": 0.0475, "grad_norm": 2.828125, "grad_norm_var": 2.7487790627662506e+17, "learning_rate": 0.0001, "loss": 3.3283, "loss/crossentropy": 2.2706116318702696, "loss/hidden": 3.0734375, "loss/incoh": 0.0, "loss/logits": 0.29702268838882445, "loss/reg": 0.0, "step": 7220 }, { "epoch": 0.04756578947368421, "grad_norm": 2.765625, "grad_norm_var": 0.05640869140625, "learning_rate": 0.0001, "loss": 3.2627, "loss/crossentropy": 2.354196774959564, "loss/hidden": 3.0625, "loss/incoh": 0.0, "loss/logits": 0.2917496845126152, "loss/reg": 0.0, "step": 7230 }, { "epoch": 0.04763157894736842, "grad_norm": 2.4375, "grad_norm_var": 0.048779296875, "learning_rate": 0.0001, "loss": 3.3302, "loss/crossentropy": 2.4912230253219603, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.3378173440694809, "loss/reg": 0.0, "step": 7240 }, { "epoch": 0.047697368421052634, "grad_norm": 2.359375, "grad_norm_var": 0.0933990478515625, "learning_rate": 0.0001, "loss": 3.348, "loss/crossentropy": 2.541353499889374, "loss/hidden": 3.0359375, "loss/incoh": 0.0, "loss/logits": 0.3061401903629303, "loss/reg": 0.0, "step": 7250 }, { "epoch": 0.04776315789473684, "grad_norm": 4.15625, "grad_norm_var": 0.45806884765625, "learning_rate": 0.0001, "loss": 3.5007, "loss/crossentropy": 2.4307178616523744, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3470552325248718, "loss/reg": 0.0, "step": 7260 }, { "epoch": 0.047828947368421054, "grad_norm": 2.421875, "grad_norm_var": 0.4556955973307292, "learning_rate": 0.0001, "loss": 3.3409, "loss/crossentropy": 2.257239353656769, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.29585833847522736, "loss/reg": 0.0, "step": 7270 }, { "epoch": 0.04789473684210526, "grad_norm": 2.546875, "grad_norm_var": 0.3855445861816406, "learning_rate": 0.0001, "loss": 3.2964, "loss/crossentropy": 2.1451990723609926, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.25767376720905305, "loss/reg": 0.0, "step": 7280 }, { "epoch": 0.047960526315789474, "grad_norm": 2.484375, "grad_norm_var": 0.26484349568684895, "learning_rate": 0.0001, "loss": 3.3283, "loss/crossentropy": 2.2842231035232543, "loss/hidden": 3.1171875, "loss/incoh": 0.0, "loss/logits": 0.33771214783191683, "loss/reg": 0.0, "step": 7290 }, { "epoch": 0.04802631578947368, "grad_norm": 2.578125, "grad_norm_var": 0.13931884765625, "learning_rate": 0.0001, "loss": 3.2818, "loss/crossentropy": 2.1290991365909577, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.2695446103811264, "loss/reg": 0.0, "step": 7300 }, { "epoch": 0.048092105263157894, "grad_norm": 2.53125, "grad_norm_var": 0.0911773681640625, "learning_rate": 0.0001, "loss": 3.3833, "loss/crossentropy": 2.149968445301056, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.24086009413003923, "loss/reg": 0.0, "step": 7310 }, { "epoch": 0.04815789473684211, "grad_norm": 2.65625, "grad_norm_var": 0.07766011555989584, "learning_rate": 0.0001, "loss": 3.3503, "loss/crossentropy": 2.519470489025116, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.25514234602451324, "loss/reg": 0.0, "step": 7320 }, { "epoch": 0.048223684210526314, "grad_norm": 2.796875, "grad_norm_var": 0.025614420572916668, "learning_rate": 0.0001, "loss": 3.3595, "loss/crossentropy": 2.5252918124198915, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.281466007232666, "loss/reg": 0.0, "step": 7330 }, { "epoch": 0.04828947368421053, "grad_norm": 2.078125, "grad_norm_var": 0.37810872395833334, "learning_rate": 0.0001, "loss": 3.3791, "loss/crossentropy": 2.0419042229652407, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.2731199100613594, "loss/reg": 0.0, "step": 7340 }, { "epoch": 0.048355263157894735, "grad_norm": 3.421875, "grad_norm_var": 0.19280192057291667, "learning_rate": 0.0001, "loss": 3.3562, "loss/crossentropy": 2.1462841510772703, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.23967409282922744, "loss/reg": 0.0, "step": 7350 }, { "epoch": 0.04842105263157895, "grad_norm": 2.75, "grad_norm_var": 0.13990478515625, "learning_rate": 0.0001, "loss": 3.2953, "loss/crossentropy": 2.4041597843170166, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.26162110567092894, "loss/reg": 0.0, "step": 7360 }, { "epoch": 0.048486842105263155, "grad_norm": 2.375, "grad_norm_var": 0.0317047119140625, "learning_rate": 0.0001, "loss": 3.2731, "loss/crossentropy": 2.61968252658844, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.28658540844917296, "loss/reg": 0.0, "step": 7370 }, { "epoch": 0.04855263157894737, "grad_norm": 2.34375, "grad_norm_var": 0.0795562744140625, "learning_rate": 0.0001, "loss": 3.3537, "loss/crossentropy": 2.330329430103302, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.311858968436718, "loss/reg": 0.0, "step": 7380 }, { "epoch": 0.04861842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.06910400390625, "learning_rate": 0.0001, "loss": 3.3762, "loss/crossentropy": 2.331431567668915, "loss/hidden": 3.14375, "loss/incoh": 0.0, "loss/logits": 0.3637159377336502, "loss/reg": 0.0, "step": 7390 }, { "epoch": 0.04868421052631579, "grad_norm": 2.703125, "grad_norm_var": 0.07679036458333334, "learning_rate": 0.0001, "loss": 3.3474, "loss/crossentropy": 2.334563136100769, "loss/hidden": 3.1203125, "loss/incoh": 0.0, "loss/logits": 0.3102908283472061, "loss/reg": 0.0, "step": 7400 }, { "epoch": 0.04875, "grad_norm": 2.21875, "grad_norm_var": 0.041727701822916664, "learning_rate": 0.0001, "loss": 3.2994, "loss/crossentropy": 2.4575919032096865, "loss/hidden": 3.1453125, "loss/incoh": 0.0, "loss/logits": 0.30418373495340345, "loss/reg": 0.0, "step": 7410 }, { "epoch": 0.04881578947368421, "grad_norm": 2.5625, "grad_norm_var": 0.95484619140625, "learning_rate": 0.0001, "loss": 3.3136, "loss/crossentropy": 2.2615838646888733, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.27002790868282317, "loss/reg": 0.0, "step": 7420 }, { "epoch": 0.04888157894736842, "grad_norm": 2.546875, "grad_norm_var": 0.05439046223958333, "learning_rate": 0.0001, "loss": 3.3084, "loss/crossentropy": 2.2472564220428466, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.28446437120437623, "loss/reg": 0.0, "step": 7430 }, { "epoch": 0.04894736842105263, "grad_norm": 2.40625, "grad_norm_var": 0.08689676920572917, "learning_rate": 0.0001, "loss": 3.3891, "loss/crossentropy": 2.5821902751922607, "loss/hidden": 3.21875, "loss/incoh": 0.0, "loss/logits": 0.29424687922000886, "loss/reg": 0.0, "step": 7440 }, { "epoch": 0.04901315789473684, "grad_norm": 2.703125, "grad_norm_var": 0.09962565104166667, "learning_rate": 0.0001, "loss": 3.4691, "loss/crossentropy": 2.547790551185608, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.3217499524354935, "loss/reg": 0.0, "step": 7450 }, { "epoch": 0.049078947368421055, "grad_norm": 3.03125, "grad_norm_var": 0.18788655598958334, "learning_rate": 0.0001, "loss": 3.3014, "loss/crossentropy": 2.025312936306, "loss/hidden": 3.0359375, "loss/incoh": 0.0, "loss/logits": 0.29601537734270095, "loss/reg": 0.0, "step": 7460 }, { "epoch": 0.04914473684210526, "grad_norm": 2.359375, "grad_norm_var": 0.15539957682291666, "learning_rate": 0.0001, "loss": 3.3516, "loss/crossentropy": 2.1228564500808718, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2723393976688385, "loss/reg": 0.0, "step": 7470 }, { "epoch": 0.049210526315789475, "grad_norm": 2.546875, "grad_norm_var": 0.04488016764322917, "learning_rate": 0.0001, "loss": 3.4105, "loss/crossentropy": 2.221798670291901, "loss/hidden": 3.303125, "loss/incoh": 0.0, "loss/logits": 0.3143211781978607, "loss/reg": 0.0, "step": 7480 }, { "epoch": 0.04927631578947368, "grad_norm": 2.53125, "grad_norm_var": 0.028938802083333333, "learning_rate": 0.0001, "loss": 3.4092, "loss/crossentropy": 2.4991084337234497, "loss/hidden": 3.24375, "loss/incoh": 0.0, "loss/logits": 0.3354015931487083, "loss/reg": 0.0, "step": 7490 }, { "epoch": 0.049342105263157895, "grad_norm": 2.359375, "grad_norm_var": 0.04719136555989583, "learning_rate": 0.0001, "loss": 3.2643, "loss/crossentropy": 2.3538936495780947, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.25752398669719695, "loss/reg": 0.0, "step": 7500 }, { "epoch": 0.0494078947368421, "grad_norm": 2.859375, "grad_norm_var": 0.5662272135416667, "learning_rate": 0.0001, "loss": 3.3662, "loss/crossentropy": 2.2441036820411684, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.26160079389810564, "loss/reg": 0.0, "step": 7510 }, { "epoch": 0.049473684210526316, "grad_norm": 2.28125, "grad_norm_var": 0.5289703369140625, "learning_rate": 0.0001, "loss": 3.3149, "loss/crossentropy": 2.544923734664917, "loss/hidden": 3.015625, "loss/incoh": 0.0, "loss/logits": 0.2898894131183624, "loss/reg": 0.0, "step": 7520 }, { "epoch": 0.04953947368421053, "grad_norm": 2.34375, "grad_norm_var": 0.0388092041015625, "learning_rate": 0.0001, "loss": 3.3611, "loss/crossentropy": 2.5786613702774046, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.2799048855900764, "loss/reg": 0.0, "step": 7530 }, { "epoch": 0.049605263157894736, "grad_norm": 4.625, "grad_norm_var": 0.3494293212890625, "learning_rate": 0.0001, "loss": 3.3299, "loss/crossentropy": 2.6309832334518433, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.3181111514568329, "loss/reg": 0.0, "step": 7540 }, { "epoch": 0.04967105263157895, "grad_norm": 2.984375, "grad_norm_var": 0.32229715983072915, "learning_rate": 0.0001, "loss": 3.381, "loss/crossentropy": 2.395383334159851, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.29645184725522994, "loss/reg": 0.0, "step": 7550 }, { "epoch": 0.049736842105263156, "grad_norm": 2.609375, "grad_norm_var": 0.06101786295572917, "learning_rate": 0.0001, "loss": 3.3076, "loss/crossentropy": 2.2212601780891417, "loss/hidden": 3.0890625, "loss/incoh": 0.0, "loss/logits": 0.2814889296889305, "loss/reg": 0.0, "step": 7560 }, { "epoch": 0.04980263157894737, "grad_norm": 2.171875, "grad_norm_var": 0.5580800374348959, "learning_rate": 0.0001, "loss": 3.3547, "loss/crossentropy": 2.340949076414108, "loss/hidden": 3.2046875, "loss/incoh": 0.0, "loss/logits": 0.3048363208770752, "loss/reg": 0.0, "step": 7570 }, { "epoch": 0.049868421052631576, "grad_norm": 2.140625, "grad_norm_var": 0.5829498291015625, "learning_rate": 0.0001, "loss": 3.2873, "loss/crossentropy": 2.434855592250824, "loss/hidden": 3.1296875, "loss/incoh": 0.0, "loss/logits": 0.33333509862422944, "loss/reg": 0.0, "step": 7580 }, { "epoch": 0.04993421052631579, "grad_norm": 2.625, "grad_norm_var": 0.07898661295572916, "learning_rate": 0.0001, "loss": 3.3901, "loss/crossentropy": 2.2928370952606203, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.30124022662639616, "loss/reg": 0.0, "step": 7590 }, { "epoch": 0.05, "grad_norm": 2.453125, "grad_norm_var": 0.04219462076822917, "learning_rate": 0.0001, "loss": 3.3048, "loss/crossentropy": 2.0018354773521425, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.25127379447221754, "loss/reg": 0.0, "step": 7600 }, { "epoch": 0.05006578947368421, "grad_norm": 2.46875, "grad_norm_var": 0.02652587890625, "learning_rate": 0.0001, "loss": 3.3285, "loss/crossentropy": 2.2610169410705567, "loss/hidden": 3.0640625, "loss/incoh": 0.0, "loss/logits": 0.27540155351161955, "loss/reg": 0.0, "step": 7610 }, { "epoch": 0.05013157894736842, "grad_norm": 2.421875, "grad_norm_var": 0.0210357666015625, "learning_rate": 0.0001, "loss": 3.2714, "loss/crossentropy": 2.2387811303138734, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.27260326892137526, "loss/reg": 0.0, "step": 7620 }, { "epoch": 0.05019736842105263, "grad_norm": 2.265625, "grad_norm_var": 0.25825093587239584, "learning_rate": 0.0001, "loss": 3.4396, "loss/crossentropy": 2.5094308257102966, "loss/hidden": 3.146875, "loss/incoh": 0.0, "loss/logits": 0.3727137431502342, "loss/reg": 0.0, "step": 7630 }, { "epoch": 0.05026315789473684, "grad_norm": 3.203125, "grad_norm_var": 0.42695210774739584, "learning_rate": 0.0001, "loss": 3.4951, "loss/crossentropy": 2.181921923160553, "loss/hidden": 3.10625, "loss/incoh": 0.0, "loss/logits": 0.29040979146957396, "loss/reg": 0.0, "step": 7640 }, { "epoch": 0.05032894736842105, "grad_norm": 3.5625, "grad_norm_var": 0.47198893229166666, "learning_rate": 0.0001, "loss": 3.3254, "loss/crossentropy": 2.4378631830215456, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.32593746185302735, "loss/reg": 0.0, "step": 7650 }, { "epoch": 0.05039473684210526, "grad_norm": 2.234375, "grad_norm_var": 0.12525126139322917, "learning_rate": 0.0001, "loss": 3.2625, "loss/crossentropy": 2.318000388145447, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.267861607670784, "loss/reg": 0.0, "step": 7660 }, { "epoch": 0.050460526315789477, "grad_norm": 2.28125, "grad_norm_var": 0.09370015462239584, "learning_rate": 0.0001, "loss": 3.4075, "loss/crossentropy": 2.300287425518036, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.272597573697567, "loss/reg": 0.0, "step": 7670 }, { "epoch": 0.05052631578947368, "grad_norm": 2.0625, "grad_norm_var": 0.09342041015625, "learning_rate": 0.0001, "loss": 3.2909, "loss/crossentropy": 2.5380281090736387, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.2639786213636398, "loss/reg": 0.0, "step": 7680 }, { "epoch": 0.0505921052631579, "grad_norm": 2.234375, "grad_norm_var": 0.165625, "learning_rate": 0.0001, "loss": 3.2847, "loss/crossentropy": 2.0959218978881835, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2350485235452652, "loss/reg": 0.0, "step": 7690 }, { "epoch": 0.0506578947368421, "grad_norm": 2.6875, "grad_norm_var": 0.17685546875, "learning_rate": 0.0001, "loss": 3.3983, "loss/crossentropy": 2.323216736316681, "loss/hidden": 3.3, "loss/incoh": 0.0, "loss/logits": 0.3249870762228966, "loss/reg": 0.0, "step": 7700 }, { "epoch": 0.05072368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.0990234375, "learning_rate": 0.0001, "loss": 3.3125, "loss/crossentropy": 2.3656753659248353, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.3182197526097298, "loss/reg": 0.0, "step": 7710 }, { "epoch": 0.05078947368421052, "grad_norm": 2.8125, "grad_norm_var": 0.19866129557291667, "learning_rate": 0.0001, "loss": 3.4009, "loss/crossentropy": 2.5273406386375425, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.3028098613023758, "loss/reg": 0.0, "step": 7720 }, { "epoch": 0.05085526315789474, "grad_norm": 3.234375, "grad_norm_var": 0.20156148274739583, "learning_rate": 0.0001, "loss": 3.4345, "loss/crossentropy": 2.391481709480286, "loss/hidden": 3.175, "loss/incoh": 0.0, "loss/logits": 0.3816041976213455, "loss/reg": 0.0, "step": 7730 }, { "epoch": 0.05092105263157895, "grad_norm": 2.984375, "grad_norm_var": 0.08870035807291667, "learning_rate": 0.0001, "loss": 3.3338, "loss/crossentropy": 2.4912365436553956, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2999194458127022, "loss/reg": 0.0, "step": 7740 }, { "epoch": 0.05098684210526316, "grad_norm": 2.46875, "grad_norm_var": 0.32692057291666665, "learning_rate": 0.0001, "loss": 3.3967, "loss/crossentropy": 2.3413360595703123, "loss/hidden": 3.225, "loss/incoh": 0.0, "loss/logits": 0.2852958709001541, "loss/reg": 0.0, "step": 7750 }, { "epoch": 0.05105263157894737, "grad_norm": 2.625, "grad_norm_var": 0.32123921712239584, "learning_rate": 0.0001, "loss": 3.3178, "loss/crossentropy": 2.6519938707351685, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.30396163165569307, "loss/reg": 0.0, "step": 7760 }, { "epoch": 0.05111842105263158, "grad_norm": 2.546875, "grad_norm_var": 0.05095113118489583, "learning_rate": 0.0001, "loss": 3.3751, "loss/crossentropy": 2.4069360971450804, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2661174476146698, "loss/reg": 0.0, "step": 7770 }, { "epoch": 0.05118421052631579, "grad_norm": 2.6875, "grad_norm_var": 0.12613016764322918, "learning_rate": 0.0001, "loss": 3.3737, "loss/crossentropy": 2.3529985427856444, "loss/hidden": 3.08125, "loss/incoh": 0.0, "loss/logits": 0.2910769283771515, "loss/reg": 0.0, "step": 7780 }, { "epoch": 0.05125, "grad_norm": 2.25, "grad_norm_var": 0.03486226399739583, "learning_rate": 0.0001, "loss": 3.3208, "loss/crossentropy": 2.1253769397735596, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.2470591977238655, "loss/reg": 0.0, "step": 7790 }, { "epoch": 0.05131578947368421, "grad_norm": 2.96875, "grad_norm_var": 0.5931925455729167, "learning_rate": 0.0001, "loss": 3.4704, "loss/crossentropy": 2.2813072860240937, "loss/hidden": 3.3578125, "loss/incoh": 0.0, "loss/logits": 0.3708792179822922, "loss/reg": 0.0, "step": 7800 }, { "epoch": 0.051381578947368424, "grad_norm": 2.15625, "grad_norm_var": 0.6090159098307292, "learning_rate": 0.0001, "loss": 3.3966, "loss/crossentropy": 2.071128582954407, "loss/hidden": 3.2015625, "loss/incoh": 0.0, "loss/logits": 0.332095867395401, "loss/reg": 0.0, "step": 7810 }, { "epoch": 0.05144736842105263, "grad_norm": 2.671875, "grad_norm_var": 0.19628499348958334, "learning_rate": 0.0001, "loss": 3.3989, "loss/crossentropy": 2.368660008907318, "loss/hidden": 3.15625, "loss/incoh": 0.0, "loss/logits": 0.28022406101226804, "loss/reg": 0.0, "step": 7820 }, { "epoch": 0.051513157894736844, "grad_norm": 2.40625, "grad_norm_var": 2.9096995035807294, "learning_rate": 0.0001, "loss": 3.3211, "loss/crossentropy": 2.1510692477226256, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.2576268881559372, "loss/reg": 0.0, "step": 7830 }, { "epoch": 0.05157894736842105, "grad_norm": 2.390625, "grad_norm_var": 2.7149729410807293, "learning_rate": 0.0001, "loss": 3.4215, "loss/crossentropy": 2.4896273493766783, "loss/hidden": 3.4703125, "loss/incoh": 0.0, "loss/logits": 0.3669875577092171, "loss/reg": 0.0, "step": 7840 }, { "epoch": 0.051644736842105264, "grad_norm": 6.125, "grad_norm_var": 0.9022450764973958, "learning_rate": 0.0001, "loss": 3.4161, "loss/crossentropy": 2.0616636157035826, "loss/hidden": 3.053125, "loss/incoh": 0.0, "loss/logits": 0.2621200427412987, "loss/reg": 0.0, "step": 7850 }, { "epoch": 0.05171052631578947, "grad_norm": 2.421875, "grad_norm_var": 1.457957967122396, "learning_rate": 0.0001, "loss": 3.3632, "loss/crossentropy": 2.236839824914932, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.28002799302339554, "loss/reg": 0.0, "step": 7860 }, { "epoch": 0.051776315789473684, "grad_norm": 2.828125, "grad_norm_var": 7.502855428059896, "learning_rate": 0.0001, "loss": 3.3749, "loss/crossentropy": 2.4405242681503294, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2591508060693741, "loss/reg": 0.0, "step": 7870 }, { "epoch": 0.0518421052631579, "grad_norm": 2.34375, "grad_norm_var": 3.8793904622395834, "learning_rate": 0.0001, "loss": 3.3791, "loss/crossentropy": 2.5814385414123535, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.31668367683887483, "loss/reg": 0.0, "step": 7880 }, { "epoch": 0.051907894736842104, "grad_norm": 2.46875, "grad_norm_var": 3.8120432535807294, "learning_rate": 0.0001, "loss": 3.4478, "loss/crossentropy": 2.185117280483246, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.34759806394577025, "loss/reg": 0.0, "step": 7890 }, { "epoch": 0.05197368421052632, "grad_norm": 2.609375, "grad_norm_var": 0.09302469889322916, "learning_rate": 0.0001, "loss": 3.3764, "loss/crossentropy": 2.249006187915802, "loss/hidden": 3.253125, "loss/incoh": 0.0, "loss/logits": 0.28100676983594897, "loss/reg": 0.0, "step": 7900 }, { "epoch": 0.052039473684210524, "grad_norm": 2.25, "grad_norm_var": 0.08615620930989583, "learning_rate": 0.0001, "loss": 3.261, "loss/crossentropy": 2.297496807575226, "loss/hidden": 3.0953125, "loss/incoh": 0.0, "loss/logits": 0.30758740454912187, "loss/reg": 0.0, "step": 7910 }, { "epoch": 0.05210526315789474, "grad_norm": 3.46875, "grad_norm_var": 0.08404947916666666, "learning_rate": 0.0001, "loss": 3.3132, "loss/crossentropy": 2.411357748508453, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.29194266349077225, "loss/reg": 0.0, "step": 7920 }, { "epoch": 0.052171052631578944, "grad_norm": 2.90625, "grad_norm_var": 0.27255859375, "learning_rate": 0.0001, "loss": 3.3767, "loss/crossentropy": 2.6010719895362855, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.28802800476551055, "loss/reg": 0.0, "step": 7930 }, { "epoch": 0.05223684210526316, "grad_norm": 2.296875, "grad_norm_var": 0.0465484619140625, "learning_rate": 0.0001, "loss": 3.3193, "loss/crossentropy": 2.6153851985931396, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.2830176457762718, "loss/reg": 0.0, "step": 7940 }, { "epoch": 0.05230263157894737, "grad_norm": 2.609375, "grad_norm_var": 0.0275054931640625, "learning_rate": 0.0001, "loss": 3.2535, "loss/crossentropy": 2.2821019262075426, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.28540263772010804, "loss/reg": 0.0, "step": 7950 }, { "epoch": 0.05236842105263158, "grad_norm": 2.125, "grad_norm_var": 0.16284077962239582, "learning_rate": 0.0001, "loss": 3.2614, "loss/crossentropy": 2.1502284169197083, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.305269892513752, "loss/reg": 0.0, "step": 7960 }, { "epoch": 0.05243421052631579, "grad_norm": 2.453125, "grad_norm_var": 0.1658843994140625, "learning_rate": 0.0001, "loss": 3.3904, "loss/crossentropy": 2.4389883518218993, "loss/hidden": 3.1078125, "loss/incoh": 0.0, "loss/logits": 0.3048334762454033, "loss/reg": 0.0, "step": 7970 }, { "epoch": 0.0525, "grad_norm": 2.765625, "grad_norm_var": 0.07778218587239584, "learning_rate": 0.0001, "loss": 3.4152, "loss/crossentropy": 2.3947508692741395, "loss/hidden": 3.15625, "loss/incoh": 0.0, "loss/logits": 0.3298331335186958, "loss/reg": 0.0, "step": 7980 }, { "epoch": 0.05256578947368421, "grad_norm": 2.78125, "grad_norm_var": 0.08662109375, "learning_rate": 0.0001, "loss": 3.3709, "loss/crossentropy": 2.2743655920028685, "loss/hidden": 3.0734375, "loss/incoh": 0.0, "loss/logits": 0.3255280390381813, "loss/reg": 0.0, "step": 7990 }, { "epoch": 0.05263157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.036881510416666666, "learning_rate": 0.0001, "loss": 3.251, "loss/crossentropy": 2.4580028295516967, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.29261877238750456, "loss/reg": 0.0, "step": 8000 }, { "epoch": 0.05269736842105263, "grad_norm": 3.28125, "grad_norm_var": 0.23829752604166668, "learning_rate": 0.0001, "loss": 3.3925, "loss/crossentropy": 2.569744038581848, "loss/hidden": 3.3046875, "loss/incoh": 0.0, "loss/logits": 0.4232485115528107, "loss/reg": 0.0, "step": 8010 }, { "epoch": 0.052763157894736845, "grad_norm": 2.5625, "grad_norm_var": 0.09871317545572916, "learning_rate": 0.0001, "loss": 3.33, "loss/crossentropy": 2.381676936149597, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.2768414840102196, "loss/reg": 0.0, "step": 8020 }, { "epoch": 0.05282894736842105, "grad_norm": 2.625, "grad_norm_var": 0.07251688639322916, "learning_rate": 0.0001, "loss": 3.3659, "loss/crossentropy": 2.529127871990204, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.3230110973119736, "loss/reg": 0.0, "step": 8030 }, { "epoch": 0.052894736842105265, "grad_norm": 2.375, "grad_norm_var": 0.03585611979166667, "learning_rate": 0.0001, "loss": 3.241, "loss/crossentropy": 2.4573261976242065, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.26054717898368834, "loss/reg": 0.0, "step": 8040 }, { "epoch": 0.05296052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.028319295247395834, "learning_rate": 0.0001, "loss": 3.3654, "loss/crossentropy": 2.3388527154922487, "loss/hidden": 3.03125, "loss/incoh": 0.0, "loss/logits": 0.3071700781583786, "loss/reg": 0.0, "step": 8050 }, { "epoch": 0.053026315789473685, "grad_norm": 3.015625, "grad_norm_var": 0.0393218994140625, "learning_rate": 0.0001, "loss": 3.3602, "loss/crossentropy": 2.4984395027160646, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.2504978612065315, "loss/reg": 0.0, "step": 8060 }, { "epoch": 0.05309210526315789, "grad_norm": 2.390625, "grad_norm_var": 2.4539998372395835, "learning_rate": 0.0001, "loss": 3.4446, "loss/crossentropy": 2.3318467855453493, "loss/hidden": 3.25625, "loss/incoh": 0.0, "loss/logits": 0.38549562990665437, "loss/reg": 0.0, "step": 8070 }, { "epoch": 0.053157894736842105, "grad_norm": 2.921875, "grad_norm_var": 2.5296946207682294, "learning_rate": 0.0001, "loss": 3.3183, "loss/crossentropy": 2.3632636427879334, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.28499974459409716, "loss/reg": 0.0, "step": 8080 }, { "epoch": 0.05322368421052632, "grad_norm": 2.515625, "grad_norm_var": 0.2263824462890625, "learning_rate": 0.0001, "loss": 3.3982, "loss/crossentropy": 2.5644856214523317, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.32271777242422106, "loss/reg": 0.0, "step": 8090 }, { "epoch": 0.053289473684210525, "grad_norm": 2.109375, "grad_norm_var": 0.20636393229166666, "learning_rate": 0.0001, "loss": 3.3351, "loss/crossentropy": 2.195914793014526, "loss/hidden": 3.2546875, "loss/incoh": 0.0, "loss/logits": 0.32049285918474196, "loss/reg": 0.0, "step": 8100 }, { "epoch": 0.05335526315789474, "grad_norm": 2.328125, "grad_norm_var": 0.03369852701822917, "learning_rate": 0.0001, "loss": 3.3031, "loss/crossentropy": 2.6255326747894285, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.27993575036525725, "loss/reg": 0.0, "step": 8110 }, { "epoch": 0.053421052631578946, "grad_norm": 2.484375, "grad_norm_var": 0.07769775390625, "learning_rate": 0.0001, "loss": 3.3496, "loss/crossentropy": 2.430241084098816, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.31150197684764863, "loss/reg": 0.0, "step": 8120 }, { "epoch": 0.05348684210526316, "grad_norm": 2.3125, "grad_norm_var": 0.0697174072265625, "learning_rate": 0.0001, "loss": 3.3655, "loss/crossentropy": 2.4684382557868956, "loss/hidden": 2.9921875, "loss/incoh": 0.0, "loss/logits": 0.2934414252638817, "loss/reg": 0.0, "step": 8130 }, { "epoch": 0.053552631578947366, "grad_norm": 2.921875, "grad_norm_var": 0.09716389973958334, "learning_rate": 0.0001, "loss": 3.3756, "loss/crossentropy": 2.679113733768463, "loss/hidden": 3.3, "loss/incoh": 0.0, "loss/logits": 0.35277644991874696, "loss/reg": 0.0, "step": 8140 }, { "epoch": 0.05361842105263158, "grad_norm": 4.625, "grad_norm_var": 0.3069986979166667, "learning_rate": 0.0001, "loss": 3.3979, "loss/crossentropy": 2.3737685680389404, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.4402651429176331, "loss/reg": 0.0, "step": 8150 }, { "epoch": 0.05368421052631579, "grad_norm": 2.53125, "grad_norm_var": 0.4447255452473958, "learning_rate": 0.0001, "loss": 3.3395, "loss/crossentropy": 2.556090760231018, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2898376002907753, "loss/reg": 0.0, "step": 8160 }, { "epoch": 0.05375, "grad_norm": 2.484375, "grad_norm_var": 0.4153717041015625, "learning_rate": 0.0001, "loss": 3.4755, "loss/crossentropy": 2.3199184775352477, "loss/hidden": 3.2375, "loss/incoh": 0.0, "loss/logits": 0.3026577115058899, "loss/reg": 0.0, "step": 8170 }, { "epoch": 0.05381578947368421, "grad_norm": 2.765625, "grad_norm_var": 0.11787821451822916, "learning_rate": 0.0001, "loss": 3.3616, "loss/crossentropy": 2.566536474227905, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.3062845066189766, "loss/reg": 0.0, "step": 8180 }, { "epoch": 0.05388157894736842, "grad_norm": 2.25, "grad_norm_var": 0.0469146728515625, "learning_rate": 0.0001, "loss": 3.3019, "loss/crossentropy": 2.434731423854828, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.27008683085441587, "loss/reg": 0.0, "step": 8190 }, { "epoch": 0.05394736842105263, "grad_norm": 2.375, "grad_norm_var": 0.2584299723307292, "learning_rate": 0.0001, "loss": 3.359, "loss/crossentropy": 2.2641067147254943, "loss/hidden": 3.0171875, "loss/incoh": 0.0, "loss/logits": 0.27225579768419267, "loss/reg": 0.0, "step": 8200 }, { "epoch": 0.05401315789473684, "grad_norm": 2.25, "grad_norm_var": 0.27990620930989585, "learning_rate": 0.0001, "loss": 3.3185, "loss/crossentropy": 2.410356640815735, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.24587155133485794, "loss/reg": 0.0, "step": 8210 }, { "epoch": 0.05407894736842105, "grad_norm": 2.484375, "grad_norm_var": 0.03493550618489583, "learning_rate": 0.0001, "loss": 3.407, "loss/crossentropy": 2.3557684421539307, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.27189340591430666, "loss/reg": 0.0, "step": 8220 }, { "epoch": 0.054144736842105266, "grad_norm": 2.578125, "grad_norm_var": 0.031615193684895834, "learning_rate": 0.0001, "loss": 3.3054, "loss/crossentropy": 2.453370213508606, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.3121939614415169, "loss/reg": 0.0, "step": 8230 }, { "epoch": 0.05421052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.06060791015625, "learning_rate": 0.0001, "loss": 3.3556, "loss/crossentropy": 2.3469805240631105, "loss/hidden": 3.115625, "loss/incoh": 0.0, "loss/logits": 0.35667684078216555, "loss/reg": 0.0, "step": 8240 }, { "epoch": 0.054276315789473686, "grad_norm": 2.484375, "grad_norm_var": 0.10061442057291667, "learning_rate": 0.0001, "loss": 3.3466, "loss/crossentropy": 2.5584194660186768, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.31178881525993346, "loss/reg": 0.0, "step": 8250 }, { "epoch": 0.05434210526315789, "grad_norm": 2.296875, "grad_norm_var": 0.06204020182291667, "learning_rate": 0.0001, "loss": 3.291, "loss/crossentropy": 2.372837942838669, "loss/hidden": 3.071875, "loss/incoh": 0.0, "loss/logits": 0.27887275665998457, "loss/reg": 0.0, "step": 8260 }, { "epoch": 0.054407894736842106, "grad_norm": 2.3125, "grad_norm_var": 0.05986328125, "learning_rate": 0.0001, "loss": 3.3687, "loss/crossentropy": 2.2389731287956236, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.29073659181594846, "loss/reg": 0.0, "step": 8270 }, { "epoch": 0.05447368421052631, "grad_norm": 2.34375, "grad_norm_var": 0.0772369384765625, "learning_rate": 0.0001, "loss": 3.3364, "loss/crossentropy": 2.130947244167328, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.3286518737673759, "loss/reg": 0.0, "step": 8280 }, { "epoch": 0.05453947368421053, "grad_norm": 2.796875, "grad_norm_var": 0.3402740478515625, "learning_rate": 0.0001, "loss": 3.4311, "loss/crossentropy": 2.5036970019340514, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.3155761957168579, "loss/reg": 0.0, "step": 8290 }, { "epoch": 0.05460526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.07206624348958333, "learning_rate": 0.0001, "loss": 3.3068, "loss/crossentropy": 2.15233553647995, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.2912480518221855, "loss/reg": 0.0, "step": 8300 }, { "epoch": 0.05467105263157895, "grad_norm": 2.765625, "grad_norm_var": 0.043290201822916666, "learning_rate": 0.0001, "loss": 3.3888, "loss/crossentropy": 2.3679205000400545, "loss/hidden": 3.2984375, "loss/incoh": 0.0, "loss/logits": 0.33265506476163864, "loss/reg": 0.0, "step": 8310 }, { "epoch": 0.05473684210526316, "grad_norm": 4.78125, "grad_norm_var": 0.4022745768229167, "learning_rate": 0.0001, "loss": 3.4588, "loss/crossentropy": 2.3284069895744324, "loss/hidden": 3.15625, "loss/incoh": 0.0, "loss/logits": 0.28553168624639513, "loss/reg": 0.0, "step": 8320 }, { "epoch": 0.05480263157894737, "grad_norm": 2.78125, "grad_norm_var": 0.5325480143229167, "learning_rate": 0.0001, "loss": 3.3491, "loss/crossentropy": 2.602140688896179, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.30461025387048724, "loss/reg": 0.0, "step": 8330 }, { "epoch": 0.05486842105263158, "grad_norm": 2.375, "grad_norm_var": 0.21806233723958332, "learning_rate": 0.0001, "loss": 3.2902, "loss/crossentropy": 2.3135936856269836, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.24052460640668868, "loss/reg": 0.0, "step": 8340 }, { "epoch": 0.05493421052631579, "grad_norm": 2.859375, "grad_norm_var": 0.10383707682291667, "learning_rate": 0.0001, "loss": 3.4435, "loss/crossentropy": 2.133499014377594, "loss/hidden": 3.14375, "loss/incoh": 0.0, "loss/logits": 0.3066937685012817, "loss/reg": 0.0, "step": 8350 }, { "epoch": 0.055, "grad_norm": 3.15625, "grad_norm_var": 1.1286936442057292, "learning_rate": 0.0001, "loss": 3.3443, "loss/crossentropy": 2.5188000440597533, "loss/hidden": 3.340625, "loss/incoh": 0.0, "loss/logits": 0.3301475077867508, "loss/reg": 0.0, "step": 8360 }, { "epoch": 0.055065789473684214, "grad_norm": 2.4375, "grad_norm_var": 1.1146230061848958, "learning_rate": 0.0001, "loss": 3.2979, "loss/crossentropy": 2.5192033290863036, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.29569360315799714, "loss/reg": 0.0, "step": 8370 }, { "epoch": 0.05513157894736842, "grad_norm": 4.28125, "grad_norm_var": 0.2727691650390625, "learning_rate": 0.0001, "loss": 3.312, "loss/crossentropy": 2.5402653098106383, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2737806707620621, "loss/reg": 0.0, "step": 8380 }, { "epoch": 0.055197368421052634, "grad_norm": 2.234375, "grad_norm_var": 0.9796702067057291, "learning_rate": 0.0001, "loss": 3.3479, "loss/crossentropy": 2.264913785457611, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.28024169653654096, "loss/reg": 0.0, "step": 8390 }, { "epoch": 0.05526315789473684, "grad_norm": 2.3125, "grad_norm_var": 0.14442952473958334, "learning_rate": 0.0001, "loss": 3.2227, "loss/crossentropy": 2.3250611424446106, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.29773685038089753, "loss/reg": 0.0, "step": 8400 }, { "epoch": 0.055328947368421054, "grad_norm": 3.453125, "grad_norm_var": 0.1592681884765625, "learning_rate": 0.0001, "loss": 3.3882, "loss/crossentropy": 2.73007869720459, "loss/hidden": 3.3, "loss/incoh": 0.0, "loss/logits": 0.361674590408802, "loss/reg": 0.0, "step": 8410 }, { "epoch": 0.05539473684210526, "grad_norm": 2.5, "grad_norm_var": 0.12535807291666667, "learning_rate": 0.0001, "loss": 3.3358, "loss/crossentropy": 2.533698391914368, "loss/hidden": 3.0359375, "loss/incoh": 0.0, "loss/logits": 0.32232231795787813, "loss/reg": 0.0, "step": 8420 }, { "epoch": 0.055460526315789474, "grad_norm": 2.640625, "grad_norm_var": 413.9550415039063, "learning_rate": 0.0001, "loss": 3.4177, "loss/crossentropy": 2.4008097648620605, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.31593555510044097, "loss/reg": 0.0, "step": 8430 }, { "epoch": 0.05552631578947369, "grad_norm": 3.234375, "grad_norm_var": 413.1215077718099, "learning_rate": 0.0001, "loss": 3.3581, "loss/crossentropy": 2.4226332664489747, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.25367428809404374, "loss/reg": 0.0, "step": 8440 }, { "epoch": 0.055592105263157894, "grad_norm": 2.484375, "grad_norm_var": 0.14670817057291666, "learning_rate": 0.0001, "loss": 3.3078, "loss/crossentropy": 2.630770039558411, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.3664619579911232, "loss/reg": 0.0, "step": 8450 }, { "epoch": 0.05565789473684211, "grad_norm": 6.75, "grad_norm_var": 2.27388916015625, "learning_rate": 0.0001, "loss": 3.503, "loss/crossentropy": 2.376649534702301, "loss/hidden": 3.1640625, "loss/incoh": 0.0, "loss/logits": 0.3476260006427765, "loss/reg": 0.0, "step": 8460 }, { "epoch": 0.055723684210526314, "grad_norm": 2.640625, "grad_norm_var": 2.541617838541667, "learning_rate": 0.0001, "loss": 3.4543, "loss/crossentropy": 2.638149607181549, "loss/hidden": 3.34375, "loss/incoh": 0.0, "loss/logits": 0.34109789580106736, "loss/reg": 0.0, "step": 8470 }, { "epoch": 0.05578947368421053, "grad_norm": 2.6875, "grad_norm_var": 0.4665679931640625, "learning_rate": 0.0001, "loss": 3.5013, "loss/crossentropy": 2.40644109249115, "loss/hidden": 3.1515625, "loss/incoh": 0.0, "loss/logits": 0.3092473894357681, "loss/reg": 0.0, "step": 8480 }, { "epoch": 0.055855263157894734, "grad_norm": 2.8125, "grad_norm_var": 1.1175282796223958, "learning_rate": 0.0001, "loss": 3.293, "loss/crossentropy": 2.159450513124466, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.27864499390125275, "loss/reg": 0.0, "step": 8490 }, { "epoch": 0.05592105263157895, "grad_norm": 4.90625, "grad_norm_var": 0.49722900390625, "learning_rate": 0.0001, "loss": 3.3045, "loss/crossentropy": 2.355340528488159, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.2977081388235092, "loss/reg": 0.0, "step": 8500 }, { "epoch": 0.05598684210526316, "grad_norm": 2.46875, "grad_norm_var": 0.4117502848307292, "learning_rate": 0.0001, "loss": 3.3273, "loss/crossentropy": 2.284228873252869, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.27203311026096344, "loss/reg": 0.0, "step": 8510 }, { "epoch": 0.05605263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.15953776041666667, "learning_rate": 0.0001, "loss": 3.3038, "loss/crossentropy": 2.6644370317459107, "loss/hidden": 3.03125, "loss/incoh": 0.0, "loss/logits": 0.2977398321032524, "loss/reg": 0.0, "step": 8520 }, { "epoch": 0.05611842105263158, "grad_norm": 2.65625, "grad_norm_var": 0.07891337076822917, "learning_rate": 0.0001, "loss": 3.3066, "loss/crossentropy": 2.480556678771973, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.3119770348072052, "loss/reg": 0.0, "step": 8530 }, { "epoch": 0.05618421052631579, "grad_norm": 2.53125, "grad_norm_var": 0.2862630208333333, "learning_rate": 0.0001, "loss": 3.3849, "loss/crossentropy": 2.3611693739891053, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.27296979874372485, "loss/reg": 0.0, "step": 8540 }, { "epoch": 0.05625, "grad_norm": 2.453125, "grad_norm_var": 0.41441141764322914, "learning_rate": 0.0001, "loss": 3.3496, "loss/crossentropy": 2.3446286380290986, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.28327286094427107, "loss/reg": 0.0, "step": 8550 }, { "epoch": 0.05631578947368421, "grad_norm": 2.625, "grad_norm_var": 6.07724609375, "learning_rate": 0.0001, "loss": 3.4711, "loss/crossentropy": 1.7737745344638824, "loss/hidden": 3.171875, "loss/incoh": 0.0, "loss/logits": 0.27109832018613816, "loss/reg": 0.0, "step": 8560 }, { "epoch": 0.05638157894736842, "grad_norm": 2.875, "grad_norm_var": 5.751741536458334, "learning_rate": 0.0001, "loss": 3.3501, "loss/crossentropy": 2.3749794125556947, "loss/hidden": 3.0578125, "loss/incoh": 0.0, "loss/logits": 0.2803412050008774, "loss/reg": 0.0, "step": 8570 }, { "epoch": 0.056447368421052635, "grad_norm": 2.1875, "grad_norm_var": 0.09153645833333333, "learning_rate": 0.0001, "loss": 3.3793, "loss/crossentropy": 2.348732423782349, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2703657791018486, "loss/reg": 0.0, "step": 8580 }, { "epoch": 0.05651315789473684, "grad_norm": 2.984375, "grad_norm_var": 0.14538472493489582, "learning_rate": 0.0001, "loss": 3.3984, "loss/crossentropy": 2.2422220349311828, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.2956344410777092, "loss/reg": 0.0, "step": 8590 }, { "epoch": 0.056578947368421055, "grad_norm": 2.46875, "grad_norm_var": 0.07770894368489584, "learning_rate": 0.0001, "loss": 3.3787, "loss/crossentropy": 2.3457955360412597, "loss/hidden": 3.2359375, "loss/incoh": 0.0, "loss/logits": 0.3119618773460388, "loss/reg": 0.0, "step": 8600 }, { "epoch": 0.05664473684210526, "grad_norm": 3.40625, "grad_norm_var": 1.7676717122395833, "learning_rate": 0.0001, "loss": 3.4129, "loss/crossentropy": 2.3159496188163757, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.3751305788755417, "loss/reg": 0.0, "step": 8610 }, { "epoch": 0.056710526315789475, "grad_norm": 2.96875, "grad_norm_var": 1.6932902018229166, "learning_rate": 0.0001, "loss": 3.4391, "loss/crossentropy": 2.6694117546081544, "loss/hidden": 3.053125, "loss/incoh": 0.0, "loss/logits": 0.29238851368427277, "loss/reg": 0.0, "step": 8620 }, { "epoch": 0.05677631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.11296284993489583, "learning_rate": 0.0001, "loss": 3.3208, "loss/crossentropy": 2.1584444522857664, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.2903019651770592, "loss/reg": 0.0, "step": 8630 }, { "epoch": 0.056842105263157895, "grad_norm": 2.078125, "grad_norm_var": 0.12463785807291666, "learning_rate": 0.0001, "loss": 3.3435, "loss/crossentropy": 2.2420501947402953, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.27747504264116285, "loss/reg": 0.0, "step": 8640 }, { "epoch": 0.05690789473684211, "grad_norm": 2.484375, "grad_norm_var": 0.16119384765625, "learning_rate": 0.0001, "loss": 3.226, "loss/crossentropy": 2.378132700920105, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.27239564061164856, "loss/reg": 0.0, "step": 8650 }, { "epoch": 0.056973684210526315, "grad_norm": 2.796875, "grad_norm_var": 0.06622721354166666, "learning_rate": 0.0001, "loss": 3.3322, "loss/crossentropy": 2.1032593488693236, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.2557524010539055, "loss/reg": 0.0, "step": 8660 }, { "epoch": 0.05703947368421053, "grad_norm": 2.640625, "grad_norm_var": 1.2355143229166667, "learning_rate": 0.0001, "loss": 3.3289, "loss/crossentropy": 2.41143513917923, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.27892655730247495, "loss/reg": 0.0, "step": 8670 }, { "epoch": 0.057105263157894735, "grad_norm": 2.40625, "grad_norm_var": 0.021117146809895834, "learning_rate": 0.0001, "loss": 3.348, "loss/crossentropy": 2.443818140029907, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.3347576320171356, "loss/reg": 0.0, "step": 8680 }, { "epoch": 0.05717105263157895, "grad_norm": 2.453125, "grad_norm_var": 9.24689275122101e+16, "learning_rate": 0.0001, "loss": 3.3877, "loss/crossentropy": 2.181317722797394, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.30654960721731184, "loss/reg": 0.0, "step": 8690 }, { "epoch": 0.057236842105263155, "grad_norm": 2.203125, "grad_norm_var": 0.108984375, "learning_rate": 0.0001, "loss": 3.308, "loss/crossentropy": 2.2454151153564452, "loss/hidden": 3.240625, "loss/incoh": 0.0, "loss/logits": 0.3219583719968796, "loss/reg": 0.0, "step": 8700 }, { "epoch": 0.05730263157894737, "grad_norm": 2.40625, "grad_norm_var": 0.44682515462239586, "learning_rate": 0.0001, "loss": 3.2909, "loss/crossentropy": 2.226694929599762, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.27956253886222837, "loss/reg": 0.0, "step": 8710 }, { "epoch": 0.057368421052631575, "grad_norm": 2.171875, "grad_norm_var": 0.46708577473958335, "learning_rate": 0.0001, "loss": 3.3322, "loss/crossentropy": 2.329686003923416, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.27385311424732206, "loss/reg": 0.0, "step": 8720 }, { "epoch": 0.05743421052631579, "grad_norm": 2.703125, "grad_norm_var": 0.09485575358072916, "learning_rate": 0.0001, "loss": 3.3243, "loss/crossentropy": 2.262730371952057, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.26381057798862456, "loss/reg": 0.0, "step": 8730 }, { "epoch": 0.0575, "grad_norm": 2.453125, "grad_norm_var": 0.07937825520833333, "learning_rate": 0.0001, "loss": 3.3226, "loss/crossentropy": 2.43484423160553, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.2970830351114273, "loss/reg": 0.0, "step": 8740 }, { "epoch": 0.05756578947368421, "grad_norm": 4.0625, "grad_norm_var": 0.23977864583333333, "learning_rate": 0.0001, "loss": 3.3517, "loss/crossentropy": 2.2560600876808166, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.2825756400823593, "loss/reg": 0.0, "step": 8750 }, { "epoch": 0.05763157894736842, "grad_norm": 2.359375, "grad_norm_var": 4.011067708333333, "learning_rate": 0.0001, "loss": 3.4317, "loss/crossentropy": 2.1193652033805845, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.2872270569205284, "loss/reg": 0.0, "step": 8760 }, { "epoch": 0.05769736842105263, "grad_norm": 2.34375, "grad_norm_var": 0.98638916015625, "learning_rate": 0.0001, "loss": 3.2657, "loss/crossentropy": 2.2515121579170225, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2510280326008797, "loss/reg": 0.0, "step": 8770 }, { "epoch": 0.05776315789473684, "grad_norm": 2.90625, "grad_norm_var": 1.39010009765625, "learning_rate": 0.0001, "loss": 3.3069, "loss/crossentropy": 2.45401873588562, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.3077335625886917, "loss/reg": 0.0, "step": 8780 }, { "epoch": 0.05782894736842105, "grad_norm": 2.671875, "grad_norm_var": 1.4149373372395833, "learning_rate": 0.0001, "loss": 3.3864, "loss/crossentropy": 2.3450352430343626, "loss/hidden": 3.421875, "loss/incoh": 0.0, "loss/logits": 0.3485978364944458, "loss/reg": 0.0, "step": 8790 }, { "epoch": 0.05789473684210526, "grad_norm": 2.453125, "grad_norm_var": 0.23061421712239583, "learning_rate": 0.0001, "loss": 3.261, "loss/crossentropy": 2.4229711413383486, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.3234298795461655, "loss/reg": 0.0, "step": 8800 }, { "epoch": 0.057960526315789476, "grad_norm": 2.703125, "grad_norm_var": 4.344071451822916, "learning_rate": 0.0001, "loss": 3.2778, "loss/crossentropy": 2.202963078022003, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2559796661138535, "loss/reg": 0.0, "step": 8810 }, { "epoch": 0.05802631578947368, "grad_norm": 2.640625, "grad_norm_var": 4.178641764322917, "learning_rate": 0.0001, "loss": 3.3628, "loss/crossentropy": 2.2936235070228577, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.32534674406051634, "loss/reg": 0.0, "step": 8820 }, { "epoch": 0.058092105263157896, "grad_norm": 2.046875, "grad_norm_var": 0.17635091145833334, "learning_rate": 0.0001, "loss": 3.3078, "loss/crossentropy": 2.535597097873688, "loss/hidden": 3.1109375, "loss/incoh": 0.0, "loss/logits": 0.30167998671531676, "loss/reg": 0.0, "step": 8830 }, { "epoch": 0.0581578947368421, "grad_norm": 2.828125, "grad_norm_var": 0.10144856770833334, "learning_rate": 0.0001, "loss": 3.3119, "loss/crossentropy": 2.652754557132721, "loss/hidden": 3.309375, "loss/incoh": 0.0, "loss/logits": 0.37436943501234055, "loss/reg": 0.0, "step": 8840 }, { "epoch": 0.058223684210526316, "grad_norm": 2.15625, "grad_norm_var": 0.79921875, "learning_rate": 0.0001, "loss": 3.3639, "loss/crossentropy": 2.4041113376617433, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.331952853500843, "loss/reg": 0.0, "step": 8850 }, { "epoch": 0.05828947368421052, "grad_norm": 2.671875, "grad_norm_var": 0.8512603759765625, "learning_rate": 0.0001, "loss": 3.2301, "loss/crossentropy": 2.7010722875595095, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.2689524292945862, "loss/reg": 0.0, "step": 8860 }, { "epoch": 0.058355263157894736, "grad_norm": 2.484375, "grad_norm_var": 0.22805887858072918, "learning_rate": 0.0001, "loss": 3.301, "loss/crossentropy": 2.3007858633995055, "loss/hidden": 3.2953125, "loss/incoh": 0.0, "loss/logits": 0.38490410447120665, "loss/reg": 0.0, "step": 8870 }, { "epoch": 0.05842105263157895, "grad_norm": 4.375, "grad_norm_var": 0.2703928629557292, "learning_rate": 0.0001, "loss": 3.3077, "loss/crossentropy": 2.562314450740814, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.29355555921792986, "loss/reg": 0.0, "step": 8880 }, { "epoch": 0.058486842105263157, "grad_norm": 4.46875, "grad_norm_var": 0.5262522379557292, "learning_rate": 0.0001, "loss": 3.2847, "loss/crossentropy": 2.329223835468292, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.2784240961074829, "loss/reg": 0.0, "step": 8890 }, { "epoch": 0.05855263157894737, "grad_norm": 2.078125, "grad_norm_var": 0.32392476399739584, "learning_rate": 0.0001, "loss": 3.2975, "loss/crossentropy": 2.371010947227478, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.3012363612651825, "loss/reg": 0.0, "step": 8900 }, { "epoch": 0.05861842105263158, "grad_norm": 2.90625, "grad_norm_var": 0.05488993326822917, "learning_rate": 0.0001, "loss": 3.3558, "loss/crossentropy": 2.4505101799964906, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.28698593378067017, "loss/reg": 0.0, "step": 8910 }, { "epoch": 0.05868421052631579, "grad_norm": 2.890625, "grad_norm_var": 1.541039021809896, "learning_rate": 0.0001, "loss": 3.4362, "loss/crossentropy": 2.569356393814087, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.3198714107275009, "loss/reg": 0.0, "step": 8920 }, { "epoch": 0.05875, "grad_norm": 2.34375, "grad_norm_var": 0.24500325520833333, "learning_rate": 0.0001, "loss": 3.3947, "loss/crossentropy": 2.106434017419815, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.2644048437476158, "loss/reg": 0.0, "step": 8930 }, { "epoch": 0.05881578947368421, "grad_norm": 2.03125, "grad_norm_var": 0.21669514973958334, "learning_rate": 0.0001, "loss": 3.2788, "loss/crossentropy": 2.517351245880127, "loss/hidden": 2.9921875, "loss/incoh": 0.0, "loss/logits": 0.27984755039215087, "loss/reg": 0.0, "step": 8940 }, { "epoch": 0.058881578947368424, "grad_norm": 2.328125, "grad_norm_var": 0.06357421875, "learning_rate": 0.0001, "loss": 3.3106, "loss/crossentropy": 2.3258296728134153, "loss/hidden": 3.165625, "loss/incoh": 0.0, "loss/logits": 0.32734392732381823, "loss/reg": 0.0, "step": 8950 }, { "epoch": 0.05894736842105263, "grad_norm": 2.53125, "grad_norm_var": 0.7411610921223958, "learning_rate": 0.0001, "loss": 3.4426, "loss/crossentropy": 2.4165605783462523, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.3073220491409302, "loss/reg": 0.0, "step": 8960 }, { "epoch": 0.059013157894736844, "grad_norm": 2.21875, "grad_norm_var": 0.44649149576822916, "learning_rate": 0.0001, "loss": 3.2202, "loss/crossentropy": 2.4866004467010496, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.30207364708185197, "loss/reg": 0.0, "step": 8970 }, { "epoch": 0.05907894736842105, "grad_norm": 2.921875, "grad_norm_var": 0.0566070556640625, "learning_rate": 0.0001, "loss": 3.2957, "loss/crossentropy": 2.4917370676994324, "loss/hidden": 3.078125, "loss/incoh": 0.0, "loss/logits": 0.3163344025611877, "loss/reg": 0.0, "step": 8980 }, { "epoch": 0.059144736842105264, "grad_norm": 2.4375, "grad_norm_var": 0.0467193603515625, "learning_rate": 0.0001, "loss": 3.337, "loss/crossentropy": 2.529834246635437, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.2896230161190033, "loss/reg": 0.0, "step": 8990 }, { "epoch": 0.05921052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 3.2664, "loss/crossentropy": 2.548252558708191, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.3721586674451828, "loss/reg": 0.0, "step": 9000 }, { "epoch": 0.059276315789473684, "grad_norm": 2.375, "grad_norm_var": 0.010172526041666666, "learning_rate": 0.0001, "loss": 3.3136, "loss/crossentropy": 2.628942942619324, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.29181756228208544, "loss/reg": 0.0, "step": 9010 }, { "epoch": 0.0593421052631579, "grad_norm": 2.484375, "grad_norm_var": 0.039937337239583336, "learning_rate": 0.0001, "loss": 3.3276, "loss/crossentropy": 2.35152667760849, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.2875461965799332, "loss/reg": 0.0, "step": 9020 }, { "epoch": 0.059407894736842104, "grad_norm": 2.625, "grad_norm_var": 0.0728668212890625, "learning_rate": 0.0001, "loss": 3.3454, "loss/crossentropy": 2.3906983017921446, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2821692392230034, "loss/reg": 0.0, "step": 9030 }, { "epoch": 0.05947368421052632, "grad_norm": 2.5625, "grad_norm_var": 0.0730133056640625, "learning_rate": 0.0001, "loss": 3.3058, "loss/crossentropy": 2.4100876331329344, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2886385262012482, "loss/reg": 0.0, "step": 9040 }, { "epoch": 0.059539473684210524, "grad_norm": 2.5, "grad_norm_var": 0.4940592447916667, "learning_rate": 0.0001, "loss": 3.4398, "loss/crossentropy": 2.464267885684967, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.31053231209516524, "loss/reg": 0.0, "step": 9050 }, { "epoch": 0.05960526315789474, "grad_norm": 2.015625, "grad_norm_var": 0.08632405598958333, "learning_rate": 0.0001, "loss": 3.3778, "loss/crossentropy": 2.402372860908508, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.35591588020324705, "loss/reg": 0.0, "step": 9060 }, { "epoch": 0.059671052631578944, "grad_norm": 3.296875, "grad_norm_var": 0.12502848307291667, "learning_rate": 0.0001, "loss": 3.3312, "loss/crossentropy": 2.3369374930858613, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.2521222412586212, "loss/reg": 0.0, "step": 9070 }, { "epoch": 0.05973684210526316, "grad_norm": 3.15625, "grad_norm_var": 0.11041259765625, "learning_rate": 0.0001, "loss": 3.2977, "loss/crossentropy": 2.2894081354141234, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.37748522460460665, "loss/reg": 0.0, "step": 9080 }, { "epoch": 0.05980263157894737, "grad_norm": 2.671875, "grad_norm_var": 2.4495359778774493e+17, "learning_rate": 0.0001, "loss": 3.4555, "loss/crossentropy": 2.248319935798645, "loss/hidden": 3.0234375, "loss/incoh": 0.0, "loss/logits": 0.27714093402028084, "loss/reg": 0.0, "step": 9090 }, { "epoch": 0.05986842105263158, "grad_norm": 2.609375, "grad_norm_var": 2.449535978075936e+17, "learning_rate": 0.0001, "loss": 3.2988, "loss/crossentropy": 2.304659366607666, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.2838084354996681, "loss/reg": 0.0, "step": 9100 }, { "epoch": 0.05993421052631579, "grad_norm": 2.796875, "grad_norm_var": 0.5627838134765625, "learning_rate": 0.0001, "loss": 3.2864, "loss/crossentropy": 2.5072904348373415, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.30276854485273363, "loss/reg": 0.0, "step": 9110 }, { "epoch": 0.06, "grad_norm": 2.328125, "grad_norm_var": 0.5690582275390625, "learning_rate": 0.0001, "loss": 3.2102, "loss/crossentropy": 2.2461979389190674, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.2540374085307121, "loss/reg": 0.0, "step": 9120 }, { "epoch": 0.06006578947368421, "grad_norm": 2.21875, "grad_norm_var": 0.11433817545572916, "learning_rate": 0.0001, "loss": 3.2304, "loss/crossentropy": 2.3598265290260314, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.28421255201101303, "loss/reg": 0.0, "step": 9130 }, { "epoch": 0.06013157894736842, "grad_norm": 2.578125, "grad_norm_var": 0.05432535807291667, "learning_rate": 0.0001, "loss": 3.2493, "loss/crossentropy": 2.3720561623573304, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.31470999121665955, "loss/reg": 0.0, "step": 9140 }, { "epoch": 0.06019736842105263, "grad_norm": 2.453125, "grad_norm_var": 0.08810933430989583, "learning_rate": 0.0001, "loss": 3.3173, "loss/crossentropy": 2.2540945589542387, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.24728769958019256, "loss/reg": 0.0, "step": 9150 }, { "epoch": 0.060263157894736845, "grad_norm": 3.109375, "grad_norm_var": 0.07870992024739583, "learning_rate": 0.0001, "loss": 3.295, "loss/crossentropy": 2.2703887224197388, "loss/hidden": 3.20625, "loss/incoh": 0.0, "loss/logits": 0.27733070850372316, "loss/reg": 0.0, "step": 9160 }, { "epoch": 0.06032894736842105, "grad_norm": 2.203125, "grad_norm_var": 0.20073954264322916, "learning_rate": 0.0001, "loss": 3.2337, "loss/crossentropy": 2.5513323664665224, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.27180006355047226, "loss/reg": 0.0, "step": 9170 }, { "epoch": 0.060394736842105265, "grad_norm": 2.234375, "grad_norm_var": 0.6225545247395833, "learning_rate": 0.0001, "loss": 3.4001, "loss/crossentropy": 2.1638057589530946, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2586470812559128, "loss/reg": 0.0, "step": 9180 }, { "epoch": 0.06046052631578947, "grad_norm": 2.140625, "grad_norm_var": 0.37139867146809896, "learning_rate": 0.0001, "loss": 3.2305, "loss/crossentropy": 2.4451101064682006, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.3055331766605377, "loss/reg": 0.0, "step": 9190 }, { "epoch": 0.060526315789473685, "grad_norm": 2.171875, "grad_norm_var": 0.32624282836914065, "learning_rate": 0.0001, "loss": 3.2254, "loss/crossentropy": 2.561469316482544, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.2592929035425186, "loss/reg": 0.0, "step": 9200 }, { "epoch": 0.06059210526315789, "grad_norm": 2.90625, "grad_norm_var": 1.0789459228515625, "learning_rate": 0.0001, "loss": 3.3551, "loss/crossentropy": 2.1200980842113495, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.2634043380618095, "loss/reg": 0.0, "step": 9210 }, { "epoch": 0.060657894736842105, "grad_norm": 2.0, "grad_norm_var": 0.54371337890625, "learning_rate": 0.0001, "loss": 3.3742, "loss/crossentropy": 2.4880159854888917, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.29063448309898376, "loss/reg": 0.0, "step": 9220 }, { "epoch": 0.06072368421052632, "grad_norm": 2.953125, "grad_norm_var": 0.24761962890625, "learning_rate": 0.0001, "loss": 3.3781, "loss/crossentropy": 2.558793306350708, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.3282888367772102, "loss/reg": 0.0, "step": 9230 }, { "epoch": 0.060789473684210525, "grad_norm": 4.75, "grad_norm_var": 0.35944010416666666, "learning_rate": 0.0001, "loss": 3.2908, "loss/crossentropy": 2.3837397813797, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.29975869655609133, "loss/reg": 0.0, "step": 9240 }, { "epoch": 0.06085526315789474, "grad_norm": 2.828125, "grad_norm_var": 0.41629130045572915, "learning_rate": 0.0001, "loss": 3.3291, "loss/crossentropy": 2.395397412776947, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.3036083221435547, "loss/reg": 0.0, "step": 9250 }, { "epoch": 0.060921052631578945, "grad_norm": 2.5, "grad_norm_var": 0.09286702473958333, "learning_rate": 0.0001, "loss": 3.2991, "loss/crossentropy": 2.350081342458725, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.31170106381177903, "loss/reg": 0.0, "step": 9260 }, { "epoch": 0.06098684210526316, "grad_norm": 3.125, "grad_norm_var": 0.09381510416666666, "learning_rate": 0.0001, "loss": 3.2234, "loss/crossentropy": 2.023473250865936, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2770851716399193, "loss/reg": 0.0, "step": 9270 }, { "epoch": 0.061052631578947365, "grad_norm": 2.265625, "grad_norm_var": 0.09631245930989583, "learning_rate": 0.0001, "loss": 3.2279, "loss/crossentropy": 2.362475335597992, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.24814699292182923, "loss/reg": 0.0, "step": 9280 }, { "epoch": 0.06111842105263158, "grad_norm": 3.1875, "grad_norm_var": 0.49576416015625, "learning_rate": 0.0001, "loss": 3.4035, "loss/crossentropy": 2.5755128145217894, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.35746499747037885, "loss/reg": 0.0, "step": 9290 }, { "epoch": 0.06118421052631579, "grad_norm": 2.953125, "grad_norm_var": 0.09010009765625, "learning_rate": 0.0001, "loss": 3.2592, "loss/crossentropy": 2.2949343085289002, "loss/hidden": 3.08125, "loss/incoh": 0.0, "loss/logits": 0.2996257975697517, "loss/reg": 0.0, "step": 9300 }, { "epoch": 0.06125, "grad_norm": 2.421875, "grad_norm_var": 0.3297190348307292, "learning_rate": 0.0001, "loss": 3.3329, "loss/crossentropy": 2.4506813049316407, "loss/hidden": 3.09375, "loss/incoh": 0.0, "loss/logits": 0.3265557274222374, "loss/reg": 0.0, "step": 9310 }, { "epoch": 0.06131578947368421, "grad_norm": 2.453125, "grad_norm_var": 0.3309529622395833, "learning_rate": 0.0001, "loss": 3.2163, "loss/crossentropy": 2.3078281760215758, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.2651080995798111, "loss/reg": 0.0, "step": 9320 }, { "epoch": 0.06138157894736842, "grad_norm": 2.28125, "grad_norm_var": 0.26513671875, "learning_rate": 0.0001, "loss": 3.2706, "loss/crossentropy": 2.1937548160552978, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.2700933560729027, "loss/reg": 0.0, "step": 9330 }, { "epoch": 0.06144736842105263, "grad_norm": 2.078125, "grad_norm_var": 0.31164957682291666, "learning_rate": 0.0001, "loss": 3.4252, "loss/crossentropy": 2.112582105398178, "loss/hidden": 3.0984375, "loss/incoh": 0.0, "loss/logits": 0.26334969997406005, "loss/reg": 0.0, "step": 9340 }, { "epoch": 0.06151315789473684, "grad_norm": 2.328125, "grad_norm_var": 0.33046875, "learning_rate": 0.0001, "loss": 3.3277, "loss/crossentropy": 2.4319912672042845, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.2982776537537575, "loss/reg": 0.0, "step": 9350 }, { "epoch": 0.06157894736842105, "grad_norm": 2.171875, "grad_norm_var": 0.48318684895833336, "learning_rate": 0.0001, "loss": 3.28, "loss/crossentropy": 2.3863817691802978, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.2783980667591095, "loss/reg": 0.0, "step": 9360 }, { "epoch": 0.061644736842105266, "grad_norm": 2.59375, "grad_norm_var": 0.4833730061848958, "learning_rate": 0.0001, "loss": 3.329, "loss/crossentropy": 2.5484490633010863, "loss/hidden": 3.03125, "loss/incoh": 0.0, "loss/logits": 0.3031032904982567, "loss/reg": 0.0, "step": 9370 }, { "epoch": 0.06171052631578947, "grad_norm": 2.203125, "grad_norm_var": 0.08167317708333334, "learning_rate": 0.0001, "loss": 3.2587, "loss/crossentropy": 2.2674924612045286, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.3650638833642006, "loss/reg": 0.0, "step": 9380 }, { "epoch": 0.061776315789473686, "grad_norm": 2.5625, "grad_norm_var": 0.22155659993489582, "learning_rate": 0.0001, "loss": 3.3241, "loss/crossentropy": 2.3348896741867065, "loss/hidden": 3.1203125, "loss/incoh": 0.0, "loss/logits": 0.283182792365551, "loss/reg": 0.0, "step": 9390 }, { "epoch": 0.06184210526315789, "grad_norm": 3.15625, "grad_norm_var": 1.1829427083333333, "learning_rate": 0.0001, "loss": 3.3629, "loss/crossentropy": 2.464947986602783, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.28233895897865297, "loss/reg": 0.0, "step": 9400 }, { "epoch": 0.061907894736842106, "grad_norm": 2.296875, "grad_norm_var": 1.1680623372395833, "learning_rate": 0.0001, "loss": 3.3087, "loss/crossentropy": 2.179221343994141, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.3310952290892601, "loss/reg": 0.0, "step": 9410 }, { "epoch": 0.06197368421052631, "grad_norm": 2.21875, "grad_norm_var": 2.0582183837890624, "learning_rate": 0.0001, "loss": 3.4293, "loss/crossentropy": 2.413185155391693, "loss/hidden": 3.18125, "loss/incoh": 0.0, "loss/logits": 0.3003757044672966, "loss/reg": 0.0, "step": 9420 }, { "epoch": 0.062039473684210526, "grad_norm": 2.421875, "grad_norm_var": 0.3502349853515625, "learning_rate": 0.0001, "loss": 3.1251, "loss/crossentropy": 2.3518580555915833, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2455562546849251, "loss/reg": 0.0, "step": 9430 }, { "epoch": 0.06210526315789474, "grad_norm": 2.578125, "grad_norm_var": 0.028425089518229165, "learning_rate": 0.0001, "loss": 3.3233, "loss/crossentropy": 2.375118088722229, "loss/hidden": 3.153125, "loss/incoh": 0.0, "loss/logits": 0.31473297625780106, "loss/reg": 0.0, "step": 9440 }, { "epoch": 0.062171052631578946, "grad_norm": 2.46875, "grad_norm_var": 0.0333984375, "learning_rate": 0.0001, "loss": 3.2654, "loss/crossentropy": 2.3164775133132935, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.2699767738580704, "loss/reg": 0.0, "step": 9450 }, { "epoch": 0.06223684210526316, "grad_norm": 2.46875, "grad_norm_var": 0.0964263916015625, "learning_rate": 0.0001, "loss": 3.3552, "loss/crossentropy": 2.233529049158096, "loss/hidden": 3.21875, "loss/incoh": 0.0, "loss/logits": 0.3262931898236275, "loss/reg": 0.0, "step": 9460 }, { "epoch": 0.062302631578947366, "grad_norm": 2.1875, "grad_norm_var": 24.440249633789062, "learning_rate": 0.0001, "loss": 3.2359, "loss/crossentropy": 2.3248747825622558, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.24342142790555954, "loss/reg": 0.0, "step": 9470 }, { "epoch": 0.06236842105263158, "grad_norm": 2.5625, "grad_norm_var": 0.06562398274739584, "learning_rate": 0.0001, "loss": 3.3237, "loss/crossentropy": 2.588094711303711, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.3172804355621338, "loss/reg": 0.0, "step": 9480 }, { "epoch": 0.062434210526315786, "grad_norm": 2.375, "grad_norm_var": 0.21744384765625, "learning_rate": 0.0001, "loss": 3.3475, "loss/crossentropy": 2.390676462650299, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.27351657301187515, "loss/reg": 0.0, "step": 9490 }, { "epoch": 0.0625, "grad_norm": 2.703125, "grad_norm_var": 0.0399078369140625, "learning_rate": 0.0001, "loss": 3.2819, "loss/crossentropy": 2.3253311276435853, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.265305657684803, "loss/reg": 0.0, "step": 9500 }, { "epoch": 0.06256578947368421, "grad_norm": 2.25, "grad_norm_var": 0.08692118326822916, "learning_rate": 0.0001, "loss": 3.2535, "loss/crossentropy": 2.3912763714790346, "loss/hidden": 2.9734375, "loss/incoh": 0.0, "loss/logits": 0.2748603358864784, "loss/reg": 0.0, "step": 9510 }, { "epoch": 0.06263157894736843, "grad_norm": 2.265625, "grad_norm_var": 0.11210098266601562, "learning_rate": 0.0001, "loss": 3.2609, "loss/crossentropy": 2.3854790568351745, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.2736155718564987, "loss/reg": 0.0, "step": 9520 }, { "epoch": 0.06269736842105263, "grad_norm": 2.9375, "grad_norm_var": 0.13178888956705728, "learning_rate": 0.0001, "loss": 3.2434, "loss/crossentropy": 2.4524547696113586, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.256032706797123, "loss/reg": 0.0, "step": 9530 }, { "epoch": 0.06276315789473684, "grad_norm": 2.234375, "grad_norm_var": 0.08687744140625, "learning_rate": 0.0001, "loss": 3.3256, "loss/crossentropy": 2.563627076148987, "loss/hidden": 3.3046875, "loss/incoh": 0.0, "loss/logits": 0.3692674309015274, "loss/reg": 0.0, "step": 9540 }, { "epoch": 0.06282894736842105, "grad_norm": 2.4375, "grad_norm_var": 0.13664957682291667, "learning_rate": 0.0001, "loss": 3.3197, "loss/crossentropy": 2.3016056180000306, "loss/hidden": 3.025, "loss/incoh": 0.0, "loss/logits": 0.2520491242408752, "loss/reg": 0.0, "step": 9550 }, { "epoch": 0.06289473684210527, "grad_norm": 2.828125, "grad_norm_var": 0.4147857666015625, "learning_rate": 0.0001, "loss": 3.511, "loss/crossentropy": 2.1577144265174866, "loss/hidden": 3.1046875, "loss/incoh": 0.0, "loss/logits": 0.2929541230201721, "loss/reg": 0.0, "step": 9560 }, { "epoch": 0.06296052631578947, "grad_norm": 2.34375, "grad_norm_var": 0.18341471354166666, "learning_rate": 0.0001, "loss": 3.2549, "loss/crossentropy": 2.31901068687439, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.27463280111551286, "loss/reg": 0.0, "step": 9570 }, { "epoch": 0.06302631578947368, "grad_norm": 2.53125, "grad_norm_var": 0.4622884114583333, "learning_rate": 0.0001, "loss": 3.3477, "loss/crossentropy": 2.6131432056427, "loss/hidden": 3.1140625, "loss/incoh": 0.0, "loss/logits": 0.38214774429798126, "loss/reg": 0.0, "step": 9580 }, { "epoch": 0.0630921052631579, "grad_norm": 2.484375, "grad_norm_var": 0.13547337849934896, "learning_rate": 0.0001, "loss": 3.2885, "loss/crossentropy": 2.451016199588776, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.2813758164644241, "loss/reg": 0.0, "step": 9590 }, { "epoch": 0.06315789473684211, "grad_norm": 2.1875, "grad_norm_var": 0.2512003580729167, "learning_rate": 0.0001, "loss": 3.3345, "loss/crossentropy": 2.1964026927947997, "loss/hidden": 3.2765625, "loss/incoh": 0.0, "loss/logits": 0.31094489246606827, "loss/reg": 0.0, "step": 9600 }, { "epoch": 0.06322368421052632, "grad_norm": 2.25, "grad_norm_var": 0.28735249837239585, "learning_rate": 0.0001, "loss": 3.2956, "loss/crossentropy": 2.3820174098014832, "loss/hidden": 3.1375, "loss/incoh": 0.0, "loss/logits": 0.30036603659391403, "loss/reg": 0.0, "step": 9610 }, { "epoch": 0.06328947368421052, "grad_norm": 2.609375, "grad_norm_var": 0.9716102600097656, "learning_rate": 0.0001, "loss": 3.2363, "loss/crossentropy": 2.4219281673431396, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2639324277639389, "loss/reg": 0.0, "step": 9620 }, { "epoch": 0.06335526315789473, "grad_norm": 2.8125, "grad_norm_var": 0.8159016927083333, "learning_rate": 0.0001, "loss": 3.331, "loss/crossentropy": 1.9574783891439438, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.2543580986559391, "loss/reg": 0.0, "step": 9630 }, { "epoch": 0.06342105263157895, "grad_norm": 2.6875, "grad_norm_var": 0.06944961547851562, "learning_rate": 0.0001, "loss": 3.2008, "loss/crossentropy": 2.210974097251892, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.2627203479409218, "loss/reg": 0.0, "step": 9640 }, { "epoch": 0.06348684210526316, "grad_norm": 2.9375, "grad_norm_var": 0.2388336181640625, "learning_rate": 0.0001, "loss": 3.3255, "loss/crossentropy": 2.225372338294983, "loss/hidden": 3.0984375, "loss/incoh": 0.0, "loss/logits": 0.28057117611169813, "loss/reg": 0.0, "step": 9650 }, { "epoch": 0.06355263157894737, "grad_norm": 2.0, "grad_norm_var": 0.3254954020182292, "learning_rate": 0.0001, "loss": 3.3021, "loss/crossentropy": 2.190217161178589, "loss/hidden": 3.1734375, "loss/incoh": 0.0, "loss/logits": 0.31579277813434603, "loss/reg": 0.0, "step": 9660 }, { "epoch": 0.06361842105263157, "grad_norm": 2.078125, "grad_norm_var": 0.5466105143229166, "learning_rate": 0.0001, "loss": 3.3119, "loss/crossentropy": 2.427161252498627, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.307109659910202, "loss/reg": 0.0, "step": 9670 }, { "epoch": 0.06368421052631579, "grad_norm": 2.171875, "grad_norm_var": 0.48213602701822916, "learning_rate": 0.0001, "loss": 3.2915, "loss/crossentropy": 2.2590057969093325, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.2674726366996765, "loss/reg": 0.0, "step": 9680 }, { "epoch": 0.06375, "grad_norm": 2.53125, "grad_norm_var": 0.2637858072916667, "learning_rate": 0.0001, "loss": 3.2742, "loss/crossentropy": 2.3426333904266357, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2729641154408455, "loss/reg": 0.0, "step": 9690 }, { "epoch": 0.06381578947368421, "grad_norm": 2.453125, "grad_norm_var": 0.3402252197265625, "learning_rate": 0.0001, "loss": 3.2983, "loss/crossentropy": 2.3633025169372557, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.2770617350935936, "loss/reg": 0.0, "step": 9700 }, { "epoch": 0.06388157894736841, "grad_norm": 3.25, "grad_norm_var": 0.23658447265625, "learning_rate": 0.0001, "loss": 3.2621, "loss/crossentropy": 2.363292157649994, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.2777522191405296, "loss/reg": 0.0, "step": 9710 }, { "epoch": 0.06394736842105263, "grad_norm": 3.5625, "grad_norm_var": 0.45250244140625, "learning_rate": 0.0001, "loss": 3.3017, "loss/crossentropy": 2.4210567593574526, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2831977978348732, "loss/reg": 0.0, "step": 9720 }, { "epoch": 0.06401315789473684, "grad_norm": 2.453125, "grad_norm_var": 0.2709147135416667, "learning_rate": 0.0001, "loss": 3.234, "loss/crossentropy": 2.3109630227088926, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.31227574199438096, "loss/reg": 0.0, "step": 9730 }, { "epoch": 0.06407894736842105, "grad_norm": 2.15625, "grad_norm_var": 0.38984273274739584, "learning_rate": 0.0001, "loss": 3.2414, "loss/crossentropy": 2.225367599725723, "loss/hidden": 3.334375, "loss/incoh": 0.0, "loss/logits": 0.24756639897823335, "loss/reg": 0.0, "step": 9740 }, { "epoch": 0.06414473684210527, "grad_norm": 3.65625, "grad_norm_var": 0.40685933430989585, "learning_rate": 0.0001, "loss": 3.2959, "loss/crossentropy": 2.633290505409241, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.27978702783584597, "loss/reg": 0.0, "step": 9750 }, { "epoch": 0.06421052631578947, "grad_norm": 2.3125, "grad_norm_var": 1.353466796875, "learning_rate": 0.0001, "loss": 3.2732, "loss/crossentropy": 2.377350616455078, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.27294492572546003, "loss/reg": 0.0, "step": 9760 }, { "epoch": 0.06427631578947368, "grad_norm": 2.375, "grad_norm_var": 1.3017242431640625, "learning_rate": 0.0001, "loss": 3.3766, "loss/crossentropy": 2.230624866485596, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.2353967770934105, "loss/reg": 0.0, "step": 9770 }, { "epoch": 0.0643421052631579, "grad_norm": 3.046875, "grad_norm_var": 30.074632771809895, "learning_rate": 0.0001, "loss": 3.2769, "loss/crossentropy": 2.250266909599304, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.2896205812692642, "loss/reg": 0.0, "step": 9780 }, { "epoch": 0.06440789473684211, "grad_norm": 2.578125, "grad_norm_var": 30.214286295572915, "learning_rate": 0.0001, "loss": 3.2666, "loss/crossentropy": 2.495633435249329, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2458545297384262, "loss/reg": 0.0, "step": 9790 }, { "epoch": 0.06447368421052632, "grad_norm": 1.96875, "grad_norm_var": 0.09082743326822916, "learning_rate": 0.0001, "loss": 3.2638, "loss/crossentropy": 2.140651452541351, "loss/hidden": 3.0890625, "loss/incoh": 0.0, "loss/logits": 0.27739599645137786, "loss/reg": 0.0, "step": 9800 }, { "epoch": 0.06453947368421052, "grad_norm": 2.765625, "grad_norm_var": 2.439875284830729, "learning_rate": 0.0001, "loss": 3.2928, "loss/crossentropy": 2.273459422588348, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.26713447719812394, "loss/reg": 0.0, "step": 9810 }, { "epoch": 0.06460526315789474, "grad_norm": 2.640625, "grad_norm_var": 2.7100901285807293, "learning_rate": 0.0001, "loss": 3.2839, "loss/crossentropy": 2.4476850271224975, "loss/hidden": 3.3984375, "loss/incoh": 0.0, "loss/logits": 0.35955790579319, "loss/reg": 0.0, "step": 9820 }, { "epoch": 0.06467105263157895, "grad_norm": 2.5, "grad_norm_var": 0.46149800618489584, "learning_rate": 0.0001, "loss": 3.306, "loss/crossentropy": 2.3544110536575316, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.31140194088220596, "loss/reg": 0.0, "step": 9830 }, { "epoch": 0.06473684210526316, "grad_norm": 2.4375, "grad_norm_var": 0.15746968587239582, "learning_rate": 0.0001, "loss": 3.2209, "loss/crossentropy": 2.2321391999721527, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2574504017829895, "loss/reg": 0.0, "step": 9840 }, { "epoch": 0.06480263157894736, "grad_norm": 2.171875, "grad_norm_var": 0.10312093098958333, "learning_rate": 0.0001, "loss": 3.2513, "loss/crossentropy": 2.311957097053528, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.2625372394919395, "loss/reg": 0.0, "step": 9850 }, { "epoch": 0.06486842105263158, "grad_norm": 2.421875, "grad_norm_var": 0.0565826416015625, "learning_rate": 0.0001, "loss": 3.3056, "loss/crossentropy": 2.331065666675568, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.277868440747261, "loss/reg": 0.0, "step": 9860 }, { "epoch": 0.06493421052631579, "grad_norm": 2.46875, "grad_norm_var": 0.18406575520833332, "learning_rate": 0.0001, "loss": 3.3417, "loss/crossentropy": 2.314790654182434, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.33730033934116366, "loss/reg": 0.0, "step": 9870 }, { "epoch": 0.065, "grad_norm": 2.359375, "grad_norm_var": 0.08243815104166667, "learning_rate": 0.0001, "loss": 3.3522, "loss/crossentropy": 2.489400041103363, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.28822820335626603, "loss/reg": 0.0, "step": 9880 }, { "epoch": 0.06506578947368422, "grad_norm": 2.171875, "grad_norm_var": 0.0967437744140625, "learning_rate": 0.0001, "loss": 3.2368, "loss/crossentropy": 2.3749926924705504, "loss/hidden": 3.175, "loss/incoh": 0.0, "loss/logits": 0.3226087599992752, "loss/reg": 0.0, "step": 9890 }, { "epoch": 0.06513157894736842, "grad_norm": 2.265625, "grad_norm_var": 0.11569722493489583, "learning_rate": 0.0001, "loss": 3.2881, "loss/crossentropy": 2.190938687324524, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.30352693498134614, "loss/reg": 0.0, "step": 9900 }, { "epoch": 0.06519736842105263, "grad_norm": 2.28125, "grad_norm_var": 0.09851888020833334, "learning_rate": 0.0001, "loss": 3.2849, "loss/crossentropy": 2.412072277069092, "loss/hidden": 3.0234375, "loss/incoh": 0.0, "loss/logits": 0.32761459052562714, "loss/reg": 0.0, "step": 9910 }, { "epoch": 0.06526315789473684, "grad_norm": 2.34375, "grad_norm_var": 0.49250895182291665, "learning_rate": 0.0001, "loss": 3.2984, "loss/crossentropy": 2.2049274504184724, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.2710462361574173, "loss/reg": 0.0, "step": 9920 }, { "epoch": 0.06532894736842106, "grad_norm": 2.296875, "grad_norm_var": 0.2756581624348958, "learning_rate": 0.0001, "loss": 3.3729, "loss/crossentropy": 2.0697293996810915, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.32963491380214693, "loss/reg": 0.0, "step": 9930 }, { "epoch": 0.06539473684210527, "grad_norm": 2.546875, "grad_norm_var": 0.18677978515625, "learning_rate": 0.0001, "loss": 3.2972, "loss/crossentropy": 2.3622434020042418, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.31804386228322984, "loss/reg": 0.0, "step": 9940 }, { "epoch": 0.06546052631578947, "grad_norm": 2.71875, "grad_norm_var": 0.07353413899739583, "learning_rate": 0.0001, "loss": 3.3347, "loss/crossentropy": 2.4146655321121218, "loss/hidden": 3.309375, "loss/incoh": 0.0, "loss/logits": 0.36195366978645327, "loss/reg": 0.0, "step": 9950 }, { "epoch": 0.06552631578947368, "grad_norm": 2.65625, "grad_norm_var": 0.031086222330729166, "learning_rate": 0.0001, "loss": 3.3045, "loss/crossentropy": 2.499011588096619, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.27686055898666384, "loss/reg": 0.0, "step": 9960 }, { "epoch": 0.0655921052631579, "grad_norm": 2.359375, "grad_norm_var": 0.36774800618489584, "learning_rate": 0.0001, "loss": 3.3505, "loss/crossentropy": 2.2883435606956484, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.43403479307889936, "loss/reg": 0.0, "step": 9970 }, { "epoch": 0.06565789473684211, "grad_norm": 2.96875, "grad_norm_var": 0.39895426432291664, "learning_rate": 0.0001, "loss": 3.3004, "loss/crossentropy": 2.328636658191681, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.30042982697486875, "loss/reg": 0.0, "step": 9980 }, { "epoch": 0.06572368421052631, "grad_norm": 2.734375, "grad_norm_var": 0.070458984375, "learning_rate": 0.0001, "loss": 3.3201, "loss/crossentropy": 2.1099472880363463, "loss/hidden": 3.2375, "loss/incoh": 0.0, "loss/logits": 0.2899234861135483, "loss/reg": 0.0, "step": 9990 }, { "epoch": 0.06578947368421052, "grad_norm": 2.40625, "grad_norm_var": 0.06652018229166666, "learning_rate": 0.0001, "loss": 3.3435, "loss/crossentropy": 2.2847598433494567, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.3000261425971985, "loss/reg": 0.0, "step": 10000 }, { "epoch": 0.06585526315789474, "grad_norm": 2.421875, "grad_norm_var": 0.08883463541666667, "learning_rate": 0.0001, "loss": 3.2377, "loss/crossentropy": 2.4516909599304197, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.3408001005649567, "loss/reg": 0.0, "step": 10010 }, { "epoch": 0.06592105263157895, "grad_norm": 2.40625, "grad_norm_var": 4.412495930989583, "learning_rate": 0.0001, "loss": 3.417, "loss/crossentropy": 2.3393358111381533, "loss/hidden": 3.153125, "loss/incoh": 0.0, "loss/logits": 0.3251391679048538, "loss/reg": 0.0, "step": 10020 }, { "epoch": 0.06598684210526316, "grad_norm": 2.921875, "grad_norm_var": 4.318382771809896, "learning_rate": 0.0001, "loss": 3.318, "loss/crossentropy": 2.4476661682128906, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.25891698747873304, "loss/reg": 0.0, "step": 10030 }, { "epoch": 0.06605263157894736, "grad_norm": 2.265625, "grad_norm_var": 2.029248046875, "learning_rate": 0.0001, "loss": 3.3175, "loss/crossentropy": 2.5090669870376585, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.3005325973033905, "loss/reg": 0.0, "step": 10040 }, { "epoch": 0.06611842105263158, "grad_norm": 2.515625, "grad_norm_var": 18.96970926920573, "learning_rate": 0.0001, "loss": 3.3768, "loss/crossentropy": 2.4451366662979126, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2800415620207787, "loss/reg": 0.0, "step": 10050 }, { "epoch": 0.06618421052631579, "grad_norm": 2.9375, "grad_norm_var": 0.11100260416666667, "learning_rate": 0.0001, "loss": 3.3131, "loss/crossentropy": 2.3123559236526487, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.32703131139278413, "loss/reg": 0.0, "step": 10060 }, { "epoch": 0.06625, "grad_norm": 2.3125, "grad_norm_var": 0.055985514322916666, "learning_rate": 0.0001, "loss": 3.2837, "loss/crossentropy": 2.20616455078125, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.3035066843032837, "loss/reg": 0.0, "step": 10070 }, { "epoch": 0.06631578947368422, "grad_norm": 2.3125, "grad_norm_var": 0.025560506184895835, "learning_rate": 0.0001, "loss": 3.202, "loss/crossentropy": 2.3889974474906923, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.25791111290454866, "loss/reg": 0.0, "step": 10080 }, { "epoch": 0.06638157894736842, "grad_norm": 3.078125, "grad_norm_var": 0.12398681640625, "learning_rate": 0.0001, "loss": 3.2662, "loss/crossentropy": 2.0885241270065307, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.26190474927425383, "loss/reg": 0.0, "step": 10090 }, { "epoch": 0.06644736842105263, "grad_norm": 2.296875, "grad_norm_var": 0.10334879557291667, "learning_rate": 0.0001, "loss": 3.2587, "loss/crossentropy": 2.3084590315818785, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.292548905313015, "loss/reg": 0.0, "step": 10100 }, { "epoch": 0.06651315789473684, "grad_norm": 2.21875, "grad_norm_var": 0.11416727701822917, "learning_rate": 0.0001, "loss": 3.214, "loss/crossentropy": 2.519231605529785, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.2500781774520874, "loss/reg": 0.0, "step": 10110 }, { "epoch": 0.06657894736842106, "grad_norm": 2.609375, "grad_norm_var": 0.10711263020833334, "learning_rate": 0.0001, "loss": 3.3349, "loss/crossentropy": 2.3334843158721923, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.2946275919675827, "loss/reg": 0.0, "step": 10120 }, { "epoch": 0.06664473684210526, "grad_norm": 2.75, "grad_norm_var": 0.3732086181640625, "learning_rate": 0.0001, "loss": 3.3638, "loss/crossentropy": 2.4801114797592163, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.29529736936092377, "loss/reg": 0.0, "step": 10130 }, { "epoch": 0.06671052631578947, "grad_norm": 2.5625, "grad_norm_var": 0.4193511962890625, "learning_rate": 0.0001, "loss": 3.3195, "loss/crossentropy": 2.3818048357963564, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.299163943529129, "loss/reg": 0.0, "step": 10140 }, { "epoch": 0.06677631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.07637430826822916, "learning_rate": 0.0001, "loss": 3.2907, "loss/crossentropy": 2.3112216353416444, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.24870002567768096, "loss/reg": 0.0, "step": 10150 }, { "epoch": 0.0668421052631579, "grad_norm": 3.859375, "grad_norm_var": 0.16756083170572916, "learning_rate": 0.0001, "loss": 3.369, "loss/crossentropy": 2.1794182300567626, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.3170015588402748, "loss/reg": 0.0, "step": 10160 }, { "epoch": 0.06690789473684211, "grad_norm": 2.34375, "grad_norm_var": 0.17148030598958333, "learning_rate": 0.0001, "loss": 3.3343, "loss/crossentropy": 2.125279116630554, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.27327116429805753, "loss/reg": 0.0, "step": 10170 }, { "epoch": 0.06697368421052631, "grad_norm": 2.03125, "grad_norm_var": 1.8383626302083333, "learning_rate": 0.0001, "loss": 3.3548, "loss/crossentropy": 2.4289526462554933, "loss/hidden": 3.065625, "loss/incoh": 0.0, "loss/logits": 0.28756752908229827, "loss/reg": 0.0, "step": 10180 }, { "epoch": 0.06703947368421052, "grad_norm": 2.25, "grad_norm_var": 0.4429677327473958, "learning_rate": 0.0001, "loss": 3.3431, "loss/crossentropy": 2.297343075275421, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2415475845336914, "loss/reg": 0.0, "step": 10190 }, { "epoch": 0.06710526315789474, "grad_norm": 2.1875, "grad_norm_var": 0.53648681640625, "learning_rate": 0.0001, "loss": 3.2412, "loss/crossentropy": 2.291072869300842, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.3272585093975067, "loss/reg": 0.0, "step": 10200 }, { "epoch": 0.06717105263157895, "grad_norm": 2.53125, "grad_norm_var": 0.03841044108072917, "learning_rate": 0.0001, "loss": 3.2881, "loss/crossentropy": 2.3315325021743774, "loss/hidden": 3.1046875, "loss/incoh": 0.0, "loss/logits": 0.2916677713394165, "loss/reg": 0.0, "step": 10210 }, { "epoch": 0.06723684210526316, "grad_norm": 2.28125, "grad_norm_var": 0.05279541015625, "learning_rate": 0.0001, "loss": 3.2758, "loss/crossentropy": 2.496378016471863, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.2993095234036446, "loss/reg": 0.0, "step": 10220 }, { "epoch": 0.06730263157894736, "grad_norm": 2.390625, "grad_norm_var": 0.030101521809895834, "learning_rate": 0.0001, "loss": 3.184, "loss/crossentropy": 1.954445093870163, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.23346130400896073, "loss/reg": 0.0, "step": 10230 }, { "epoch": 0.06736842105263158, "grad_norm": 3.6875, "grad_norm_var": 0.2775675455729167, "learning_rate": 0.0001, "loss": 3.2837, "loss/crossentropy": 2.4431410312652586, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.3258019149303436, "loss/reg": 0.0, "step": 10240 }, { "epoch": 0.06743421052631579, "grad_norm": 2.125, "grad_norm_var": 0.33483784993489585, "learning_rate": 0.0001, "loss": 3.2903, "loss/crossentropy": 2.0363747119903564, "loss/hidden": 3.4796875, "loss/incoh": 0.0, "loss/logits": 0.3713301241397858, "loss/reg": 0.0, "step": 10250 }, { "epoch": 0.0675, "grad_norm": 2.4375, "grad_norm_var": 0.3346028645833333, "learning_rate": 0.0001, "loss": 3.2875, "loss/crossentropy": 2.1991047143936155, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.3233490988612175, "loss/reg": 0.0, "step": 10260 }, { "epoch": 0.0675657894736842, "grad_norm": 3.890625, "grad_norm_var": 0.1937652587890625, "learning_rate": 0.0001, "loss": 3.2766, "loss/crossentropy": 2.334232974052429, "loss/hidden": 3.15625, "loss/incoh": 0.0, "loss/logits": 0.3195400908589363, "loss/reg": 0.0, "step": 10270 }, { "epoch": 0.06763157894736842, "grad_norm": 2.25, "grad_norm_var": 0.7010162353515625, "learning_rate": 0.0001, "loss": 3.224, "loss/crossentropy": 2.242165985703468, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.27829615995287893, "loss/reg": 0.0, "step": 10280 }, { "epoch": 0.06769736842105263, "grad_norm": 2.078125, "grad_norm_var": 0.1137115478515625, "learning_rate": 0.0001, "loss": 3.2283, "loss/crossentropy": 2.4408915996551515, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.2770886033773422, "loss/reg": 0.0, "step": 10290 }, { "epoch": 0.06776315789473684, "grad_norm": 2.75, "grad_norm_var": 0.28396708170572915, "learning_rate": 0.0001, "loss": 3.2788, "loss/crossentropy": 2.2959680914878846, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.2606198683381081, "loss/reg": 0.0, "step": 10300 }, { "epoch": 0.06782894736842106, "grad_norm": 2.859375, "grad_norm_var": 0.0829498291015625, "learning_rate": 0.0001, "loss": 3.3487, "loss/crossentropy": 2.430151104927063, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.25218638926744463, "loss/reg": 0.0, "step": 10310 }, { "epoch": 0.06789473684210526, "grad_norm": 3.203125, "grad_norm_var": 0.3111979166666667, "learning_rate": 0.0001, "loss": 3.3676, "loss/crossentropy": 2.1297863006591795, "loss/hidden": 3.415625, "loss/incoh": 0.0, "loss/logits": 0.4299448400735855, "loss/reg": 0.0, "step": 10320 }, { "epoch": 0.06796052631578947, "grad_norm": 3.0625, "grad_norm_var": 0.34820556640625, "learning_rate": 0.0001, "loss": 3.3013, "loss/crossentropy": 2.2383357286453247, "loss/hidden": 2.990625, "loss/incoh": 0.0, "loss/logits": 0.31647931337356566, "loss/reg": 0.0, "step": 10330 }, { "epoch": 0.06802631578947368, "grad_norm": 2.40625, "grad_norm_var": 0.0734283447265625, "learning_rate": 0.0001, "loss": 3.2653, "loss/crossentropy": 2.3983714103698732, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.29214349389076233, "loss/reg": 0.0, "step": 10340 }, { "epoch": 0.0680921052631579, "grad_norm": 2.3125, "grad_norm_var": 0.0652984619140625, "learning_rate": 0.0001, "loss": 3.2579, "loss/crossentropy": 2.5644800424575807, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.3019869774580002, "loss/reg": 0.0, "step": 10350 }, { "epoch": 0.06815789473684211, "grad_norm": 2.53125, "grad_norm_var": 3.350255903165907e+17, "learning_rate": 0.0001, "loss": 3.3114, "loss/crossentropy": 2.6736939191818236, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2768147349357605, "loss/reg": 0.0, "step": 10360 }, { "epoch": 0.06822368421052631, "grad_norm": 4.65625, "grad_norm_var": 3.350255902575034e+17, "learning_rate": 0.0001, "loss": 3.3131, "loss/crossentropy": 2.2038461327552796, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.31388128101825713, "loss/reg": 0.0, "step": 10370 }, { "epoch": 0.06828947368421052, "grad_norm": 3.78125, "grad_norm_var": 0.5317220052083333, "learning_rate": 0.0001, "loss": 3.3243, "loss/crossentropy": 2.3583734750747682, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.26419220119714737, "loss/reg": 0.0, "step": 10380 }, { "epoch": 0.06835526315789474, "grad_norm": 2.640625, "grad_norm_var": 0.2758127848307292, "learning_rate": 0.0001, "loss": 3.287, "loss/crossentropy": 2.2863503098487854, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.23900726288557053, "loss/reg": 0.0, "step": 10390 }, { "epoch": 0.06842105263157895, "grad_norm": 2.3125, "grad_norm_var": 0.04855143229166667, "learning_rate": 0.0001, "loss": 3.2604, "loss/crossentropy": 2.3840556263923647, "loss/hidden": 3.14375, "loss/incoh": 0.0, "loss/logits": 0.30461184978485106, "loss/reg": 0.0, "step": 10400 }, { "epoch": 0.06848684210526315, "grad_norm": 2.953125, "grad_norm_var": 0.15156962076822916, "learning_rate": 0.0001, "loss": 3.3062, "loss/crossentropy": 2.088348960876465, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2698833361268044, "loss/reg": 0.0, "step": 10410 }, { "epoch": 0.06855263157894737, "grad_norm": 2.875, "grad_norm_var": 0.5885894775390625, "learning_rate": 0.0001, "loss": 3.3402, "loss/crossentropy": 2.551842737197876, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.313777893781662, "loss/reg": 0.0, "step": 10420 }, { "epoch": 0.06861842105263158, "grad_norm": 2.46875, "grad_norm_var": 0.9581939697265625, "learning_rate": 0.0001, "loss": 3.3486, "loss/crossentropy": 2.148633885383606, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.29070123881101606, "loss/reg": 0.0, "step": 10430 }, { "epoch": 0.06868421052631579, "grad_norm": 2.59375, "grad_norm_var": 0.2256988525390625, "learning_rate": 0.0001, "loss": 3.3073, "loss/crossentropy": 2.0771638333797453, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.24762868583202363, "loss/reg": 0.0, "step": 10440 }, { "epoch": 0.06875, "grad_norm": 1.9453125, "grad_norm_var": 0.30576553344726565, "learning_rate": 0.0001, "loss": 3.3608, "loss/crossentropy": 1.9622422456741333, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.2823460906744003, "loss/reg": 0.0, "step": 10450 }, { "epoch": 0.0688157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.24808731079101562, "learning_rate": 0.0001, "loss": 3.2114, "loss/crossentropy": 2.4866459488868715, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.25186999291181567, "loss/reg": 0.0, "step": 10460 }, { "epoch": 0.06888157894736842, "grad_norm": 2.59375, "grad_norm_var": 3.232233683268229, "learning_rate": 0.0001, "loss": 3.2672, "loss/crossentropy": 2.4218720883131026, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.34285663813352585, "loss/reg": 0.0, "step": 10470 }, { "epoch": 0.06894736842105263, "grad_norm": 3.0, "grad_norm_var": 0.12942301432291667, "learning_rate": 0.0001, "loss": 3.2774, "loss/crossentropy": 2.2948715806007387, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.22767567485570908, "loss/reg": 0.0, "step": 10480 }, { "epoch": 0.06901315789473685, "grad_norm": 2.40625, "grad_norm_var": 0.09120686848958333, "learning_rate": 0.0001, "loss": 3.2795, "loss/crossentropy": 2.4549028277397156, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.2874615803360939, "loss/reg": 0.0, "step": 10490 }, { "epoch": 0.06907894736842106, "grad_norm": 2.265625, "grad_norm_var": 0.14446512858072916, "learning_rate": 0.0001, "loss": 3.2562, "loss/crossentropy": 2.2758328318595886, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.24516315162181854, "loss/reg": 0.0, "step": 10500 }, { "epoch": 0.06914473684210526, "grad_norm": 2.328125, "grad_norm_var": 0.167626953125, "learning_rate": 0.0001, "loss": 3.3066, "loss/crossentropy": 2.422711133956909, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.24841044396162032, "loss/reg": 0.0, "step": 10510 }, { "epoch": 0.06921052631578947, "grad_norm": 2.953125, "grad_norm_var": 0.0686187744140625, "learning_rate": 0.0001, "loss": 3.2834, "loss/crossentropy": 2.210281264781952, "loss/hidden": 3.3, "loss/incoh": 0.0, "loss/logits": 0.31483527421951296, "loss/reg": 0.0, "step": 10520 }, { "epoch": 0.06927631578947369, "grad_norm": 2.390625, "grad_norm_var": 0.1522857666015625, "learning_rate": 0.0001, "loss": 3.2678, "loss/crossentropy": 2.3035526394844057, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.26298564970493316, "loss/reg": 0.0, "step": 10530 }, { "epoch": 0.0693421052631579, "grad_norm": 2.140625, "grad_norm_var": 1.5601959228515625, "learning_rate": 0.0001, "loss": 3.2926, "loss/crossentropy": 2.5075936675071717, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.27058843374252317, "loss/reg": 0.0, "step": 10540 }, { "epoch": 0.0694078947368421, "grad_norm": 2.171875, "grad_norm_var": 1.5597330729166667, "learning_rate": 0.0001, "loss": 3.2643, "loss/crossentropy": 2.448484253883362, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.3164616659283638, "loss/reg": 0.0, "step": 10550 }, { "epoch": 0.06947368421052631, "grad_norm": 2.15625, "grad_norm_var": 0.3221638997395833, "learning_rate": 0.0001, "loss": 3.2548, "loss/crossentropy": 2.3997972130775453, "loss/hidden": 3.4125, "loss/incoh": 0.0, "loss/logits": 0.3671171858906746, "loss/reg": 0.0, "step": 10560 }, { "epoch": 0.06953947368421053, "grad_norm": 2.46875, "grad_norm_var": 0.33424072265625, "learning_rate": 0.0001, "loss": 3.3035, "loss/crossentropy": 2.3439933180809023, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.24507830888032914, "loss/reg": 0.0, "step": 10570 }, { "epoch": 0.06960526315789474, "grad_norm": 2.5625, "grad_norm_var": 0.15203450520833334, "learning_rate": 0.0001, "loss": 3.3203, "loss/crossentropy": 2.20079083442688, "loss/hidden": 3.203125, "loss/incoh": 0.0, "loss/logits": 0.3732150986790657, "loss/reg": 0.0, "step": 10580 }, { "epoch": 0.06967105263157895, "grad_norm": 2.359375, "grad_norm_var": 0.23407796223958333, "learning_rate": 0.0001, "loss": 3.3251, "loss/crossentropy": 2.353682446479797, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.2952089346945286, "loss/reg": 0.0, "step": 10590 }, { "epoch": 0.06973684210526315, "grad_norm": 2.453125, "grad_norm_var": 0.22082926432291666, "learning_rate": 0.0001, "loss": 3.2719, "loss/crossentropy": 2.3233517169952393, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.290130452811718, "loss/reg": 0.0, "step": 10600 }, { "epoch": 0.06980263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.2807362874348958, "learning_rate": 0.0001, "loss": 3.2558, "loss/crossentropy": 2.466183233261108, "loss/hidden": 3.096875, "loss/incoh": 0.0, "loss/logits": 0.33012075573205946, "loss/reg": 0.0, "step": 10610 }, { "epoch": 0.06986842105263158, "grad_norm": 2.609375, "grad_norm_var": 0.08990478515625, "learning_rate": 0.0001, "loss": 3.3285, "loss/crossentropy": 2.3989938259124757, "loss/hidden": 3.184375, "loss/incoh": 0.0, "loss/logits": 0.37008936554193494, "loss/reg": 0.0, "step": 10620 }, { "epoch": 0.0699342105263158, "grad_norm": 3.09375, "grad_norm_var": 0.05269266764322917, "learning_rate": 0.0001, "loss": 3.2761, "loss/crossentropy": 2.4531158804893494, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.27111856192350386, "loss/reg": 0.0, "step": 10630 }, { "epoch": 0.07, "grad_norm": 2.203125, "grad_norm_var": 0.0650787353515625, "learning_rate": 0.0001, "loss": 3.22, "loss/crossentropy": 2.44055380821228, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.25391832590103147, "loss/reg": 0.0, "step": 10640 }, { "epoch": 0.0700657894736842, "grad_norm": 2.484375, "grad_norm_var": 0.0595367431640625, "learning_rate": 0.0001, "loss": 3.226, "loss/crossentropy": 2.4790910482406616, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.2732370227575302, "loss/reg": 0.0, "step": 10650 }, { "epoch": 0.07013157894736842, "grad_norm": 2248146944.0, "grad_norm_var": 3.158852919285514e+17, "learning_rate": 0.0001, "loss": 3.3733, "loss/crossentropy": 2.1218923926353455, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.23474968373775482, "loss/reg": 0.0, "step": 10660 }, { "epoch": 0.07019736842105263, "grad_norm": 2.40625, "grad_norm_var": 3.158852918981078e+17, "learning_rate": 0.0001, "loss": 3.2364, "loss/crossentropy": 2.3490695118904115, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.3323436751961708, "loss/reg": 0.0, "step": 10670 }, { "epoch": 0.07026315789473685, "grad_norm": 3.1875, "grad_norm_var": 0.07629801432291666, "learning_rate": 0.0001, "loss": 3.1831, "loss/crossentropy": 2.3577569365501403, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.26663027703762054, "loss/reg": 0.0, "step": 10680 }, { "epoch": 0.07032894736842105, "grad_norm": 2.703125, "grad_norm_var": 0.0871978759765625, "learning_rate": 0.0001, "loss": 3.2111, "loss/crossentropy": 2.2620568752288817, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.3401878133416176, "loss/reg": 0.0, "step": 10690 }, { "epoch": 0.07039473684210526, "grad_norm": 2.484375, "grad_norm_var": 0.03775634765625, "learning_rate": 0.0001, "loss": 3.252, "loss/crossentropy": 2.223601281642914, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.26251988410949706, "loss/reg": 0.0, "step": 10700 }, { "epoch": 0.07046052631578947, "grad_norm": 2.53125, "grad_norm_var": 0.020702107747395834, "learning_rate": 0.0001, "loss": 3.1904, "loss/crossentropy": 2.2720033645629885, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.2705120757222176, "loss/reg": 0.0, "step": 10710 }, { "epoch": 0.07052631578947369, "grad_norm": 2.203125, "grad_norm_var": 0.36824442545572916, "learning_rate": 0.0001, "loss": 3.2981, "loss/crossentropy": 2.3247862100601195, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.25748861730098727, "loss/reg": 0.0, "step": 10720 }, { "epoch": 0.0705921052631579, "grad_norm": 2.421875, "grad_norm_var": 0.27925796508789064, "learning_rate": 0.0001, "loss": 3.1731, "loss/crossentropy": 1.9388428241014481, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.21091363951563835, "loss/reg": 0.0, "step": 10730 }, { "epoch": 0.0706578947368421, "grad_norm": 2.390625, "grad_norm_var": 0.10903294881184895, "learning_rate": 0.0001, "loss": 3.2623, "loss/crossentropy": 2.338471806049347, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.2822930008172989, "loss/reg": 0.0, "step": 10740 }, { "epoch": 0.07072368421052631, "grad_norm": 1.96875, "grad_norm_var": 0.11550191243489584, "learning_rate": 0.0001, "loss": 3.2306, "loss/crossentropy": 2.275705647468567, "loss/hidden": 3.1640625, "loss/incoh": 0.0, "loss/logits": 0.30476620346307753, "loss/reg": 0.0, "step": 10750 }, { "epoch": 0.07078947368421053, "grad_norm": 2.171875, "grad_norm_var": 1.7166033426920573, "learning_rate": 0.0001, "loss": 3.1802, "loss/crossentropy": 2.375147843360901, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.3041233107447624, "loss/reg": 0.0, "step": 10760 }, { "epoch": 0.07085526315789474, "grad_norm": 2.75, "grad_norm_var": 1.698127237955729, "learning_rate": 0.0001, "loss": 3.308, "loss/crossentropy": 2.3641104817390444, "loss/hidden": 2.9734375, "loss/incoh": 0.0, "loss/logits": 0.2578106954693794, "loss/reg": 0.0, "step": 10770 }, { "epoch": 0.07092105263157895, "grad_norm": 2.390625, "grad_norm_var": 0.33463134765625, "learning_rate": 0.0001, "loss": 3.2267, "loss/crossentropy": 2.3689509868621825, "loss/hidden": 3.053125, "loss/incoh": 0.0, "loss/logits": 0.3170511037111282, "loss/reg": 0.0, "step": 10780 }, { "epoch": 0.07098684210526315, "grad_norm": 2.140625, "grad_norm_var": 0.08876851399739584, "learning_rate": 0.0001, "loss": 3.2416, "loss/crossentropy": 2.0522693753242494, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.25120987445116044, "loss/reg": 0.0, "step": 10790 }, { "epoch": 0.07105263157894737, "grad_norm": 2.734375, "grad_norm_var": 0.1547027587890625, "learning_rate": 0.0001, "loss": 3.2808, "loss/crossentropy": 2.193123185634613, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.30211101323366163, "loss/reg": 0.0, "step": 10800 }, { "epoch": 0.07111842105263158, "grad_norm": 2.09375, "grad_norm_var": 0.11612955729166667, "learning_rate": 0.0001, "loss": 3.1868, "loss/crossentropy": 2.6722516775131226, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.26585151851177213, "loss/reg": 0.0, "step": 10810 }, { "epoch": 0.0711842105263158, "grad_norm": 2.265625, "grad_norm_var": 0.3016998291015625, "learning_rate": 0.0001, "loss": 3.1683, "loss/crossentropy": 2.480617439746857, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.2681833073496819, "loss/reg": 0.0, "step": 10820 }, { "epoch": 0.07125, "grad_norm": 3.8125, "grad_norm_var": 0.1811431884765625, "learning_rate": 0.0001, "loss": 3.2239, "loss/crossentropy": 2.2105371236801146, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.27378717362880706, "loss/reg": 0.0, "step": 10830 }, { "epoch": 0.07131578947368421, "grad_norm": 2.203125, "grad_norm_var": 0.17531636555989583, "learning_rate": 0.0001, "loss": 3.1981, "loss/crossentropy": 2.201746928691864, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.3672773316502571, "loss/reg": 0.0, "step": 10840 }, { "epoch": 0.07138157894736842, "grad_norm": 4.28125, "grad_norm_var": 0.34869791666666666, "learning_rate": 0.0001, "loss": 3.2799, "loss/crossentropy": 2.322747588157654, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.262615929543972, "loss/reg": 0.0, "step": 10850 }, { "epoch": 0.07144736842105263, "grad_norm": 2.796875, "grad_norm_var": 0.7465810139973958, "learning_rate": 0.0001, "loss": 3.2065, "loss/crossentropy": 1.9148864209651948, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.2080842524766922, "loss/reg": 0.0, "step": 10860 }, { "epoch": 0.07151315789473685, "grad_norm": 2.53125, "grad_norm_var": 0.053873697916666664, "learning_rate": 0.0001, "loss": 3.2002, "loss/crossentropy": 2.314997375011444, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.2829948261380196, "loss/reg": 0.0, "step": 10870 }, { "epoch": 0.07157894736842105, "grad_norm": 2.25, "grad_norm_var": 0.08782145182291666, "learning_rate": 0.0001, "loss": 3.2222, "loss/crossentropy": 2.4381492733955383, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2620039567351341, "loss/reg": 0.0, "step": 10880 }, { "epoch": 0.07164473684210526, "grad_norm": 2.984375, "grad_norm_var": 0.10087788899739583, "learning_rate": 0.0001, "loss": 3.2769, "loss/crossentropy": 2.478635573387146, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.30705118626356126, "loss/reg": 0.0, "step": 10890 }, { "epoch": 0.07171052631578947, "grad_norm": 2.515625, "grad_norm_var": 0.12694066365559895, "learning_rate": 0.0001, "loss": 3.1767, "loss/crossentropy": 2.145402270555496, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.26391510516405103, "loss/reg": 0.0, "step": 10900 }, { "epoch": 0.07177631578947369, "grad_norm": 2.34375, "grad_norm_var": 0.13940404256184896, "learning_rate": 0.0001, "loss": 3.2976, "loss/crossentropy": 2.1119534373283386, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.24341508150100707, "loss/reg": 0.0, "step": 10910 }, { "epoch": 0.0718421052631579, "grad_norm": 2.375, "grad_norm_var": 0.193408203125, "learning_rate": 0.0001, "loss": 3.2699, "loss/crossentropy": 2.2503302097320557, "loss/hidden": 3.196875, "loss/incoh": 0.0, "loss/logits": 0.2697600871324539, "loss/reg": 0.0, "step": 10920 }, { "epoch": 0.0719078947368421, "grad_norm": 4.9375, "grad_norm_var": 1.1786092122395833, "learning_rate": 0.0001, "loss": 3.3635, "loss/crossentropy": 2.2035016298294066, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.23886747062206268, "loss/reg": 0.0, "step": 10930 }, { "epoch": 0.07197368421052631, "grad_norm": 2.5, "grad_norm_var": 1.1909088134765624, "learning_rate": 0.0001, "loss": 3.2664, "loss/crossentropy": 2.438611125946045, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.2649322673678398, "loss/reg": 0.0, "step": 10940 }, { "epoch": 0.07203947368421053, "grad_norm": 2.71875, "grad_norm_var": 0.1803375244140625, "learning_rate": 0.0001, "loss": 3.2795, "loss/crossentropy": 2.300194537639618, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.2784364491701126, "loss/reg": 0.0, "step": 10950 }, { "epoch": 0.07210526315789474, "grad_norm": 2.328125, "grad_norm_var": 0.13961181640625, "learning_rate": 0.0001, "loss": 3.196, "loss/crossentropy": 2.297783041000366, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.23439482748508453, "loss/reg": 0.0, "step": 10960 }, { "epoch": 0.07217105263157894, "grad_norm": 2.5, "grad_norm_var": 0.03483784993489583, "learning_rate": 0.0001, "loss": 3.2816, "loss/crossentropy": 2.2468614101409914, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.26336451917886733, "loss/reg": 0.0, "step": 10970 }, { "epoch": 0.07223684210526315, "grad_norm": 2.515625, "grad_norm_var": 0.027684529622395832, "learning_rate": 0.0001, "loss": 3.2387, "loss/crossentropy": 2.5107889652252195, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2557321682572365, "loss/reg": 0.0, "step": 10980 }, { "epoch": 0.07230263157894737, "grad_norm": 2.5, "grad_norm_var": 0.05127665201822917, "learning_rate": 0.0001, "loss": 3.2995, "loss/crossentropy": 2.221148931980133, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.2837982401251793, "loss/reg": 0.0, "step": 10990 }, { "epoch": 0.07236842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.08108317057291667, "learning_rate": 0.0001, "loss": 3.289, "loss/crossentropy": 2.515943694114685, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.269567608833313, "loss/reg": 0.0, "step": 11000 }, { "epoch": 0.0724342105263158, "grad_norm": 2.328125, "grad_norm_var": 0.10301106770833333, "learning_rate": 0.0001, "loss": 3.2421, "loss/crossentropy": 2.22921404838562, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24364713877439498, "loss/reg": 0.0, "step": 11010 }, { "epoch": 0.0725, "grad_norm": 2.40625, "grad_norm_var": 0.15533447265625, "learning_rate": 0.0001, "loss": 3.2872, "loss/crossentropy": 2.5498072266578675, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.2900205120444298, "loss/reg": 0.0, "step": 11020 }, { "epoch": 0.07256578947368421, "grad_norm": 2.828125, "grad_norm_var": 0.19057515462239583, "learning_rate": 0.0001, "loss": 3.2853, "loss/crossentropy": 2.607512426376343, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.3007731422781944, "loss/reg": 0.0, "step": 11030 }, { "epoch": 0.07263157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.13447240193684895, "learning_rate": 0.0001, "loss": 3.1687, "loss/crossentropy": 1.9578089714050293, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23623087108135224, "loss/reg": 0.0, "step": 11040 }, { "epoch": 0.07269736842105264, "grad_norm": 2.453125, "grad_norm_var": 0.2533192952473958, "learning_rate": 0.0001, "loss": 3.2864, "loss/crossentropy": 2.473111295700073, "loss/hidden": 3.334375, "loss/incoh": 0.0, "loss/logits": 0.31404276490211486, "loss/reg": 0.0, "step": 11050 }, { "epoch": 0.07276315789473685, "grad_norm": 2.625, "grad_norm_var": 0.1464019775390625, "learning_rate": 0.0001, "loss": 3.2587, "loss/crossentropy": 2.1964801430702208, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.26576130986213686, "loss/reg": 0.0, "step": 11060 }, { "epoch": 0.07282894736842105, "grad_norm": 2.921875, "grad_norm_var": 0.12649637858072918, "learning_rate": 0.0001, "loss": 3.2663, "loss/crossentropy": 2.2337945103645325, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2691087871789932, "loss/reg": 0.0, "step": 11070 }, { "epoch": 0.07289473684210526, "grad_norm": 2.125, "grad_norm_var": 0.08884175618489583, "learning_rate": 0.0001, "loss": 3.3071, "loss/crossentropy": 2.5003953099250795, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.3058940455317497, "loss/reg": 0.0, "step": 11080 }, { "epoch": 0.07296052631578948, "grad_norm": 2.46875, "grad_norm_var": 0.14674479166666668, "learning_rate": 0.0001, "loss": 3.2275, "loss/crossentropy": 2.3431849002838137, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2692634254693985, "loss/reg": 0.0, "step": 11090 }, { "epoch": 0.07302631578947369, "grad_norm": 2.609375, "grad_norm_var": 0.08789774576822916, "learning_rate": 0.0001, "loss": 3.2168, "loss/crossentropy": 2.2301442503929136, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.24614981114864348, "loss/reg": 0.0, "step": 11100 }, { "epoch": 0.07309210526315789, "grad_norm": 2.78125, "grad_norm_var": 0.09053446451822916, "learning_rate": 0.0001, "loss": 3.3336, "loss/crossentropy": 2.624728870391846, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.30191341042518616, "loss/reg": 0.0, "step": 11110 }, { "epoch": 0.0731578947368421, "grad_norm": 2.65625, "grad_norm_var": 0.23191731770833332, "learning_rate": 0.0001, "loss": 3.3217, "loss/crossentropy": 2.204717183113098, "loss/hidden": 3.4859375, "loss/incoh": 0.0, "loss/logits": 0.5070606812834739, "loss/reg": 0.0, "step": 11120 }, { "epoch": 0.07322368421052632, "grad_norm": 2.640625, "grad_norm_var": 0.2526519775390625, "learning_rate": 0.0001, "loss": 3.345, "loss/crossentropy": 2.558793139457703, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.42952366173267365, "loss/reg": 0.0, "step": 11130 }, { "epoch": 0.07328947368421053, "grad_norm": 2.53125, "grad_norm_var": 0.0856109619140625, "learning_rate": 0.0001, "loss": 3.3754, "loss/crossentropy": 2.227391791343689, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.27742871195077895, "loss/reg": 0.0, "step": 11140 }, { "epoch": 0.07335526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.07666015625, "learning_rate": 0.0001, "loss": 3.2194, "loss/crossentropy": 2.32956976890564, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.2826590985059738, "loss/reg": 0.0, "step": 11150 }, { "epoch": 0.07342105263157894, "grad_norm": 8.625, "grad_norm_var": 2.3614542643229166, "learning_rate": 0.0001, "loss": 3.2431, "loss/crossentropy": 2.0696181058883667, "loss/hidden": 3.3640625, "loss/incoh": 0.0, "loss/logits": 0.2718909472227097, "loss/reg": 0.0, "step": 11160 }, { "epoch": 0.07348684210526316, "grad_norm": 2.234375, "grad_norm_var": 2.3623697916666666, "learning_rate": 0.0001, "loss": 3.2983, "loss/crossentropy": 2.5612054228782655, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.26728835999965667, "loss/reg": 0.0, "step": 11170 }, { "epoch": 0.07355263157894737, "grad_norm": 2.171875, "grad_norm_var": 0.11398111979166667, "learning_rate": 0.0001, "loss": 3.1779, "loss/crossentropy": 2.5427613735198973, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.24258261919021606, "loss/reg": 0.0, "step": 11180 }, { "epoch": 0.07361842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.07968648274739583, "learning_rate": 0.0001, "loss": 3.1565, "loss/crossentropy": 2.3772116184234617, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.24017660170793534, "loss/reg": 0.0, "step": 11190 }, { "epoch": 0.07368421052631578, "grad_norm": 2.9375, "grad_norm_var": 0.058652496337890624, "learning_rate": 0.0001, "loss": 3.2383, "loss/crossentropy": 2.284471166133881, "loss/hidden": 3.0625, "loss/incoh": 0.0, "loss/logits": 0.31355464905500413, "loss/reg": 0.0, "step": 11200 }, { "epoch": 0.07375, "grad_norm": 2.5625, "grad_norm_var": 0.04164937337239583, "learning_rate": 0.0001, "loss": 3.2288, "loss/crossentropy": 2.1727640271186828, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.24059856683015823, "loss/reg": 0.0, "step": 11210 }, { "epoch": 0.07381578947368421, "grad_norm": 2.53125, "grad_norm_var": 0.015363566080729167, "learning_rate": 0.0001, "loss": 3.2857, "loss/crossentropy": 2.377478325366974, "loss/hidden": 3.0984375, "loss/incoh": 0.0, "loss/logits": 0.2916824325919151, "loss/reg": 0.0, "step": 11220 }, { "epoch": 0.07388157894736842, "grad_norm": 3.140625, "grad_norm_var": 0.04091695149739583, "learning_rate": 0.0001, "loss": 3.2848, "loss/crossentropy": 2.372981941699982, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.3395596519112587, "loss/reg": 0.0, "step": 11230 }, { "epoch": 0.07394736842105264, "grad_norm": 2.59375, "grad_norm_var": 1.1325103759765625, "learning_rate": 0.0001, "loss": 3.2954, "loss/crossentropy": 2.571339511871338, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.3173587560653687, "loss/reg": 0.0, "step": 11240 }, { "epoch": 0.07401315789473684, "grad_norm": 2.828125, "grad_norm_var": 0.11298421223958334, "learning_rate": 0.0001, "loss": 3.213, "loss/crossentropy": 2.093873751163483, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.23975073918700218, "loss/reg": 0.0, "step": 11250 }, { "epoch": 0.07407894736842105, "grad_norm": 2.421875, "grad_norm_var": 0.13909403483072916, "learning_rate": 0.0001, "loss": 3.25, "loss/crossentropy": 2.55220787525177, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.29909972846508026, "loss/reg": 0.0, "step": 11260 }, { "epoch": 0.07414473684210526, "grad_norm": 2.734375, "grad_norm_var": 0.11393229166666667, "learning_rate": 0.0001, "loss": 3.2424, "loss/crossentropy": 2.18590772151947, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.2539385199546814, "loss/reg": 0.0, "step": 11270 }, { "epoch": 0.07421052631578948, "grad_norm": 3.078125, "grad_norm_var": 0.07273661295572917, "learning_rate": 0.0001, "loss": 3.2354, "loss/crossentropy": 2.4804351210594175, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2747698500752449, "loss/reg": 0.0, "step": 11280 }, { "epoch": 0.07427631578947369, "grad_norm": 2.140625, "grad_norm_var": 0.17068684895833333, "learning_rate": 0.0001, "loss": 3.2114, "loss/crossentropy": 2.4408376574516297, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.3068179443478584, "loss/reg": 0.0, "step": 11290 }, { "epoch": 0.07434210526315789, "grad_norm": 2.5625, "grad_norm_var": 0.27847900390625, "learning_rate": 0.0001, "loss": 3.3225, "loss/crossentropy": 2.284189748764038, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.2627576723694801, "loss/reg": 0.0, "step": 11300 }, { "epoch": 0.0744078947368421, "grad_norm": 2.546875, "grad_norm_var": 0.22431233723958333, "learning_rate": 0.0001, "loss": 3.2794, "loss/crossentropy": 2.348969095945358, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.22996244430541993, "loss/reg": 0.0, "step": 11310 }, { "epoch": 0.07447368421052632, "grad_norm": 2.890625, "grad_norm_var": 0.8210245768229166, "learning_rate": 0.0001, "loss": 3.2548, "loss/crossentropy": 2.455811655521393, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2752078205347061, "loss/reg": 0.0, "step": 11320 }, { "epoch": 0.07453947368421053, "grad_norm": 4.75, "grad_norm_var": 0.3614095052083333, "learning_rate": 0.0001, "loss": 3.2713, "loss/crossentropy": 2.4881197214126587, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.33918842375278474, "loss/reg": 0.0, "step": 11330 }, { "epoch": 0.07460526315789473, "grad_norm": 2.59375, "grad_norm_var": 0.4044596354166667, "learning_rate": 0.0001, "loss": 3.334, "loss/crossentropy": 2.4584757328033446, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.3295674562454224, "loss/reg": 0.0, "step": 11340 }, { "epoch": 0.07467105263157894, "grad_norm": 2.703125, "grad_norm_var": 0.9745920817057292, "learning_rate": 0.0001, "loss": 3.2347, "loss/crossentropy": 2.413297247886658, "loss/hidden": 3.0890625, "loss/incoh": 0.0, "loss/logits": 0.2875028237700462, "loss/reg": 0.0, "step": 11350 }, { "epoch": 0.07473684210526316, "grad_norm": 2.609375, "grad_norm_var": 0.0769439697265625, "learning_rate": 0.0001, "loss": 3.2495, "loss/crossentropy": 2.297819769382477, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.28927008211612704, "loss/reg": 0.0, "step": 11360 }, { "epoch": 0.07480263157894737, "grad_norm": 2.171875, "grad_norm_var": 0.054182942708333334, "learning_rate": 0.0001, "loss": 3.1982, "loss/crossentropy": 2.2709707379341126, "loss/hidden": 3.03125, "loss/incoh": 0.0, "loss/logits": 0.29085248410701753, "loss/reg": 0.0, "step": 11370 }, { "epoch": 0.07486842105263158, "grad_norm": 2.609375, "grad_norm_var": 0.315283203125, "learning_rate": 0.0001, "loss": 3.2877, "loss/crossentropy": 2.5602762937545775, "loss/hidden": 3.1828125, "loss/incoh": 0.0, "loss/logits": 0.3155085578560829, "loss/reg": 0.0, "step": 11380 }, { "epoch": 0.07493421052631578, "grad_norm": 8.375, "grad_norm_var": 2.533186848958333, "learning_rate": 0.0001, "loss": 3.2192, "loss/crossentropy": 2.231611895561218, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.27611204236745834, "loss/reg": 0.0, "step": 11390 }, { "epoch": 0.075, "grad_norm": 2.109375, "grad_norm_var": 2.313280232747396, "learning_rate": 0.0001, "loss": 3.3273, "loss/crossentropy": 2.2164941787719727, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2650335058569908, "loss/reg": 0.0, "step": 11400 }, { "epoch": 0.07506578947368421, "grad_norm": 2.265625, "grad_norm_var": 0.3658528645833333, "learning_rate": 0.0001, "loss": 3.2587, "loss/crossentropy": 2.2440002799034118, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.3373932957649231, "loss/reg": 0.0, "step": 11410 }, { "epoch": 0.07513157894736842, "grad_norm": 2.390625, "grad_norm_var": 0.06290690104166667, "learning_rate": 0.0001, "loss": 3.1439, "loss/crossentropy": 2.405060076713562, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.26099992990493776, "loss/reg": 0.0, "step": 11420 }, { "epoch": 0.07519736842105264, "grad_norm": 2.40625, "grad_norm_var": 0.04853108723958333, "learning_rate": 0.0001, "loss": 3.1454, "loss/crossentropy": 2.5482593178749084, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.2424457401037216, "loss/reg": 0.0, "step": 11430 }, { "epoch": 0.07526315789473684, "grad_norm": 2.296875, "grad_norm_var": 0.5184244791666667, "learning_rate": 0.0001, "loss": 3.2189, "loss/crossentropy": 2.1846509099006655, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.29890656769275664, "loss/reg": 0.0, "step": 11440 }, { "epoch": 0.07532894736842105, "grad_norm": 2.46875, "grad_norm_var": 0.03676656087239583, "learning_rate": 0.0001, "loss": 3.2528, "loss/crossentropy": 2.3350785970687866, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.26758097261190417, "loss/reg": 0.0, "step": 11450 }, { "epoch": 0.07539473684210526, "grad_norm": 2.40625, "grad_norm_var": 56.07302958170573, "learning_rate": 0.0001, "loss": 3.3264, "loss/crossentropy": 2.357296335697174, "loss/hidden": 3.0171875, "loss/incoh": 0.0, "loss/logits": 0.32786626666784285, "loss/reg": 0.0, "step": 11460 }, { "epoch": 0.07546052631578948, "grad_norm": 2.71875, "grad_norm_var": 0.05142822265625, "learning_rate": 0.0001, "loss": 3.1956, "loss/crossentropy": 2.29429577589035, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.22037848085165024, "loss/reg": 0.0, "step": 11470 }, { "epoch": 0.07552631578947368, "grad_norm": 2.84375, "grad_norm_var": 0.09621988932291667, "learning_rate": 0.0001, "loss": 3.2053, "loss/crossentropy": 2.450187027454376, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.27029853165149687, "loss/reg": 0.0, "step": 11480 }, { "epoch": 0.07559210526315789, "grad_norm": 2.421875, "grad_norm_var": 0.06201171875, "learning_rate": 0.0001, "loss": 3.2252, "loss/crossentropy": 1.9360981225967406, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.23251112401485444, "loss/reg": 0.0, "step": 11490 }, { "epoch": 0.0756578947368421, "grad_norm": 2.375, "grad_norm_var": 0.07566731770833333, "learning_rate": 0.0001, "loss": 3.2003, "loss/crossentropy": 2.21165417432785, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.2515560120344162, "loss/reg": 0.0, "step": 11500 }, { "epoch": 0.07572368421052632, "grad_norm": 2.546875, "grad_norm_var": 0.07636311848958334, "learning_rate": 0.0001, "loss": 3.1758, "loss/crossentropy": 2.5108731746673585, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.31811543107032775, "loss/reg": 0.0, "step": 11510 }, { "epoch": 0.07578947368421053, "grad_norm": 2.21875, "grad_norm_var": 0.17082926432291667, "learning_rate": 0.0001, "loss": 3.2811, "loss/crossentropy": 2.1942604899406435, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.25081480443477633, "loss/reg": 0.0, "step": 11520 }, { "epoch": 0.07585526315789473, "grad_norm": 2.859375, "grad_norm_var": 0.07385660807291666, "learning_rate": 0.0001, "loss": 3.239, "loss/crossentropy": 2.0190611362457274, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.28278502970933916, "loss/reg": 0.0, "step": 11530 }, { "epoch": 0.07592105263157894, "grad_norm": 2.296875, "grad_norm_var": 0.045166015625, "learning_rate": 0.0001, "loss": 3.2825, "loss/crossentropy": 2.3928887605667115, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.31386475563049315, "loss/reg": 0.0, "step": 11540 }, { "epoch": 0.07598684210526316, "grad_norm": 2.34375, "grad_norm_var": 0.05419514973958333, "learning_rate": 0.0001, "loss": 3.236, "loss/crossentropy": 2.3381729245185854, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.2617738708853722, "loss/reg": 0.0, "step": 11550 }, { "epoch": 0.07605263157894737, "grad_norm": 3.421875, "grad_norm_var": 0.10877278645833334, "learning_rate": 0.0001, "loss": 3.1435, "loss/crossentropy": 2.1223382353782654, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.2554148808121681, "loss/reg": 0.0, "step": 11560 }, { "epoch": 0.07611842105263159, "grad_norm": 2.75, "grad_norm_var": 0.13093973795572916, "learning_rate": 0.0001, "loss": 3.2511, "loss/crossentropy": 2.4102694511413576, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.2813078135251999, "loss/reg": 0.0, "step": 11570 }, { "epoch": 0.07618421052631578, "grad_norm": 2.296875, "grad_norm_var": 0.12099507649739584, "learning_rate": 0.0001, "loss": 3.2712, "loss/crossentropy": 2.2883424520492555, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.27575887441635133, "loss/reg": 0.0, "step": 11580 }, { "epoch": 0.07625, "grad_norm": 2.09375, "grad_norm_var": 0.15095926920572916, "learning_rate": 0.0001, "loss": 3.2856, "loss/crossentropy": 2.4679728865623476, "loss/hidden": 2.990625, "loss/incoh": 0.0, "loss/logits": 0.3342022061347961, "loss/reg": 0.0, "step": 11590 }, { "epoch": 0.07631578947368421, "grad_norm": 2.703125, "grad_norm_var": 0.1062896728515625, "learning_rate": 0.0001, "loss": 3.2302, "loss/crossentropy": 2.35604043006897, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.30481296926736834, "loss/reg": 0.0, "step": 11600 }, { "epoch": 0.07638157894736843, "grad_norm": 2.4375, "grad_norm_var": 0.17685445149739584, "learning_rate": 0.0001, "loss": 3.3621, "loss/crossentropy": 2.302362835407257, "loss/hidden": 3.153125, "loss/incoh": 0.0, "loss/logits": 0.29829359203577044, "loss/reg": 0.0, "step": 11610 }, { "epoch": 0.07644736842105262, "grad_norm": 2.515625, "grad_norm_var": 2.837443679954338e+17, "learning_rate": 0.0001, "loss": 3.365, "loss/crossentropy": 2.3786125659942625, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.2504860758781433, "loss/reg": 0.0, "step": 11620 }, { "epoch": 0.07651315789473684, "grad_norm": 2.59375, "grad_norm_var": 2.8374436804315274e+17, "learning_rate": 0.0001, "loss": 3.285, "loss/crossentropy": 1.7914829134941102, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.26675148904323576, "loss/reg": 0.0, "step": 11630 }, { "epoch": 0.07657894736842105, "grad_norm": 2.40625, "grad_norm_var": 0.047028605143229166, "learning_rate": 0.0001, "loss": 3.2398, "loss/crossentropy": 2.3633928418159487, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.2506480649113655, "loss/reg": 0.0, "step": 11640 }, { "epoch": 0.07664473684210527, "grad_norm": 2.1875, "grad_norm_var": 0.12795308430989583, "learning_rate": 0.0001, "loss": 3.1729, "loss/crossentropy": 2.3739442467689513, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.24874649345874786, "loss/reg": 0.0, "step": 11650 }, { "epoch": 0.07671052631578948, "grad_norm": 3.140625, "grad_norm_var": 0.057938639322916666, "learning_rate": 0.0001, "loss": 3.19, "loss/crossentropy": 1.946711039543152, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.26388829201459885, "loss/reg": 0.0, "step": 11660 }, { "epoch": 0.07677631578947368, "grad_norm": 2.28125, "grad_norm_var": 0.16907145182291666, "learning_rate": 0.0001, "loss": 3.2141, "loss/crossentropy": 2.5971063375473022, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.29624524116516116, "loss/reg": 0.0, "step": 11670 }, { "epoch": 0.07684210526315789, "grad_norm": 2.796875, "grad_norm_var": 0.20071512858072918, "learning_rate": 0.0001, "loss": 3.2566, "loss/crossentropy": 2.5601096868515016, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.272810535132885, "loss/reg": 0.0, "step": 11680 }, { "epoch": 0.0769078947368421, "grad_norm": 2.375, "grad_norm_var": 0.14016927083333333, "learning_rate": 0.0001, "loss": 3.1653, "loss/crossentropy": 2.4755476355552672, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.27636886537075045, "loss/reg": 0.0, "step": 11690 }, { "epoch": 0.07697368421052632, "grad_norm": 2.484375, "grad_norm_var": 0.641162109375, "learning_rate": 0.0001, "loss": 3.1798, "loss/crossentropy": 2.558279812335968, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.24465988874435424, "loss/reg": 0.0, "step": 11700 }, { "epoch": 0.07703947368421053, "grad_norm": 2.25, "grad_norm_var": 0.0510162353515625, "learning_rate": 0.0001, "loss": 3.1825, "loss/crossentropy": 2.437064230442047, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.25304732471704483, "loss/reg": 0.0, "step": 11710 }, { "epoch": 0.07710526315789473, "grad_norm": 2.578125, "grad_norm_var": 0.012482706705729167, "learning_rate": 0.0001, "loss": 3.1497, "loss/crossentropy": 2.3207133412361145, "loss/hidden": 3.0640625, "loss/incoh": 0.0, "loss/logits": 0.26052851378917696, "loss/reg": 0.0, "step": 11720 }, { "epoch": 0.07717105263157895, "grad_norm": 2.375, "grad_norm_var": 0.03972066243489583, "learning_rate": 0.0001, "loss": 3.2354, "loss/crossentropy": 2.210064744949341, "loss/hidden": 3.1765625, "loss/incoh": 0.0, "loss/logits": 0.321417099237442, "loss/reg": 0.0, "step": 11730 }, { "epoch": 0.07723684210526316, "grad_norm": 2.234375, "grad_norm_var": 0.11972249348958333, "learning_rate": 0.0001, "loss": 3.2828, "loss/crossentropy": 2.2914742827415466, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.30591298937797545, "loss/reg": 0.0, "step": 11740 }, { "epoch": 0.07730263157894737, "grad_norm": 2.875, "grad_norm_var": 0.48640034993489584, "learning_rate": 0.0001, "loss": 3.2259, "loss/crossentropy": 2.2794241905212402, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.3180560126900673, "loss/reg": 0.0, "step": 11750 }, { "epoch": 0.07736842105263157, "grad_norm": 3.15625, "grad_norm_var": 0.4784576416015625, "learning_rate": 0.0001, "loss": 3.1508, "loss/crossentropy": 2.237711024284363, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.22340844720602035, "loss/reg": 0.0, "step": 11760 }, { "epoch": 0.07743421052631579, "grad_norm": 2.34375, "grad_norm_var": 0.2001129150390625, "learning_rate": 0.0001, "loss": 3.2992, "loss/crossentropy": 2.4300220131874086, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.279655484855175, "loss/reg": 0.0, "step": 11770 }, { "epoch": 0.0775, "grad_norm": 2.4375, "grad_norm_var": 0.0681549072265625, "learning_rate": 0.0001, "loss": 3.1433, "loss/crossentropy": 2.1450002193450928, "loss/hidden": 2.9484375, "loss/incoh": 0.0, "loss/logits": 0.26157657504081727, "loss/reg": 0.0, "step": 11780 }, { "epoch": 0.07756578947368421, "grad_norm": 2.671875, "grad_norm_var": 0.038492838541666664, "learning_rate": 0.0001, "loss": 3.2329, "loss/crossentropy": 2.273455095291138, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2625778928399086, "loss/reg": 0.0, "step": 11790 }, { "epoch": 0.07763157894736843, "grad_norm": 2.34375, "grad_norm_var": 0.06272379557291667, "learning_rate": 0.0001, "loss": 3.2736, "loss/crossentropy": 2.2713128685951234, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.2852136388421059, "loss/reg": 0.0, "step": 11800 }, { "epoch": 0.07769736842105263, "grad_norm": 2.546875, "grad_norm_var": 0.14368082682291666, "learning_rate": 0.0001, "loss": 3.2063, "loss/crossentropy": 2.276504385471344, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.2478427141904831, "loss/reg": 0.0, "step": 11810 }, { "epoch": 0.07776315789473684, "grad_norm": 2.3125, "grad_norm_var": 0.14405008951822917, "learning_rate": 0.0001, "loss": 3.2627, "loss/crossentropy": 2.3879762291908264, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.26758207380771637, "loss/reg": 0.0, "step": 11820 }, { "epoch": 0.07782894736842105, "grad_norm": 2.21875, "grad_norm_var": 16.03980712890625, "learning_rate": 0.0001, "loss": 3.1689, "loss/crossentropy": 2.238702917098999, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.25569620728492737, "loss/reg": 0.0, "step": 11830 }, { "epoch": 0.07789473684210527, "grad_norm": 2.09375, "grad_norm_var": 16.03136774698893, "learning_rate": 0.0001, "loss": 3.1737, "loss/crossentropy": 2.11713285446167, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.2322181984782219, "loss/reg": 0.0, "step": 11840 }, { "epoch": 0.07796052631578948, "grad_norm": 3.53125, "grad_norm_var": 0.14957249959309896, "learning_rate": 0.0001, "loss": 3.2023, "loss/crossentropy": 2.279906690120697, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.26055515706539156, "loss/reg": 0.0, "step": 11850 }, { "epoch": 0.07802631578947368, "grad_norm": 2.453125, "grad_norm_var": 0.1443756103515625, "learning_rate": 0.0001, "loss": 3.1657, "loss/crossentropy": 2.338582932949066, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.23672561049461366, "loss/reg": 0.0, "step": 11860 }, { "epoch": 0.0780921052631579, "grad_norm": 2.171875, "grad_norm_var": 0.08467508951822916, "learning_rate": 0.0001, "loss": 3.1991, "loss/crossentropy": 2.3666534066200255, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.27455085664987566, "loss/reg": 0.0, "step": 11870 }, { "epoch": 0.0781578947368421, "grad_norm": 2.359375, "grad_norm_var": 0.08329671223958333, "learning_rate": 0.0001, "loss": 3.1862, "loss/crossentropy": 2.232216811180115, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.29752269983291624, "loss/reg": 0.0, "step": 11880 }, { "epoch": 0.07822368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.013792928059895833, "learning_rate": 0.0001, "loss": 3.2052, "loss/crossentropy": 2.5155674695968626, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2883127599954605, "loss/reg": 0.0, "step": 11890 }, { "epoch": 0.07828947368421052, "grad_norm": 2.109375, "grad_norm_var": 0.05334879557291667, "learning_rate": 0.0001, "loss": 3.1696, "loss/crossentropy": 2.3109512329101562, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2651562377810478, "loss/reg": 0.0, "step": 11900 }, { "epoch": 0.07835526315789473, "grad_norm": 2.578125, "grad_norm_var": 0.1116363525390625, "learning_rate": 0.0001, "loss": 3.2305, "loss/crossentropy": 2.4196372270584106, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.24704778790473939, "loss/reg": 0.0, "step": 11910 }, { "epoch": 0.07842105263157895, "grad_norm": 2.25, "grad_norm_var": 0.10711161295572917, "learning_rate": 0.0001, "loss": 3.2486, "loss/crossentropy": 2.359645998477936, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.28654517233371735, "loss/reg": 0.0, "step": 11920 }, { "epoch": 0.07848684210526316, "grad_norm": 2.75, "grad_norm_var": 0.06504618326822917, "learning_rate": 0.0001, "loss": 3.2312, "loss/crossentropy": 2.364006555080414, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.24593105614185334, "loss/reg": 0.0, "step": 11930 }, { "epoch": 0.07855263157894737, "grad_norm": 2.390625, "grad_norm_var": 0.11108296712239583, "learning_rate": 0.0001, "loss": 3.1318, "loss/crossentropy": 2.2041036009788515, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.2498387575149536, "loss/reg": 0.0, "step": 11940 }, { "epoch": 0.07861842105263157, "grad_norm": 3.015625, "grad_norm_var": 0.06920166015625, "learning_rate": 0.0001, "loss": 3.1955, "loss/crossentropy": 2.2502759456634522, "loss/hidden": 3.053125, "loss/incoh": 0.0, "loss/logits": 0.27016896903514864, "loss/reg": 0.0, "step": 11950 }, { "epoch": 0.07868421052631579, "grad_norm": 3.125, "grad_norm_var": 0.09563700358072917, "learning_rate": 0.0001, "loss": 3.2128, "loss/crossentropy": 2.1190481543540955, "loss/hidden": 3.1734375, "loss/incoh": 0.0, "loss/logits": 0.288416750729084, "loss/reg": 0.0, "step": 11960 }, { "epoch": 0.07875, "grad_norm": 2.203125, "grad_norm_var": 0.15563863118489582, "learning_rate": 0.0001, "loss": 3.2263, "loss/crossentropy": 1.9541548937559128, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.271015003323555, "loss/reg": 0.0, "step": 11970 }, { "epoch": 0.07881578947368421, "grad_norm": 3.015625, "grad_norm_var": 0.09900614420572916, "learning_rate": 0.0001, "loss": 3.0889, "loss/crossentropy": 2.3341493129730226, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.2499636933207512, "loss/reg": 0.0, "step": 11980 }, { "epoch": 0.07888157894736843, "grad_norm": 2.46875, "grad_norm_var": 0.07329813639322917, "learning_rate": 0.0001, "loss": 3.1932, "loss/crossentropy": 2.388286221027374, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.24724568724632262, "loss/reg": 0.0, "step": 11990 }, { "epoch": 0.07894736842105263, "grad_norm": 2.390625, "grad_norm_var": 0.05429280598958333, "learning_rate": 0.0001, "loss": 3.1876, "loss/crossentropy": 2.351203644275665, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.26266307979822157, "loss/reg": 0.0, "step": 12000 }, { "epoch": 0.07901315789473684, "grad_norm": 2.234375, "grad_norm_var": 0.04205322265625, "learning_rate": 0.0001, "loss": 3.1379, "loss/crossentropy": 2.3741995811462404, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.24828371405601501, "loss/reg": 0.0, "step": 12010 }, { "epoch": 0.07907894736842105, "grad_norm": 2.640625, "grad_norm_var": 0.13201395670572916, "learning_rate": 0.0001, "loss": 3.2741, "loss/crossentropy": 2.037536895275116, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.24230389446020126, "loss/reg": 0.0, "step": 12020 }, { "epoch": 0.07914473684210527, "grad_norm": 2.40625, "grad_norm_var": 0.04157613118489583, "learning_rate": 0.0001, "loss": 3.1818, "loss/crossentropy": 2.2516727566719057, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2339026600122452, "loss/reg": 0.0, "step": 12030 }, { "epoch": 0.07921052631578947, "grad_norm": 2.3125, "grad_norm_var": 26.919873046875, "learning_rate": 0.0001, "loss": 3.2787, "loss/crossentropy": 2.3474916219711304, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.2710026606917381, "loss/reg": 0.0, "step": 12040 }, { "epoch": 0.07927631578947368, "grad_norm": 2.453125, "grad_norm_var": 26.796516927083335, "learning_rate": 0.0001, "loss": 3.254, "loss/crossentropy": 2.153431460261345, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.23484777919948102, "loss/reg": 0.0, "step": 12050 }, { "epoch": 0.0793421052631579, "grad_norm": 2.71875, "grad_norm_var": 0.0520416259765625, "learning_rate": 0.0001, "loss": 3.2866, "loss/crossentropy": 2.137704038619995, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.2761687204241753, "loss/reg": 0.0, "step": 12060 }, { "epoch": 0.07940789473684211, "grad_norm": 2.4375, "grad_norm_var": 0.05562515258789062, "learning_rate": 0.0001, "loss": 3.1658, "loss/crossentropy": 2.2096517443656922, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.22131142765283585, "loss/reg": 0.0, "step": 12070 }, { "epoch": 0.07947368421052632, "grad_norm": 2.578125, "grad_norm_var": 0.033300526936848956, "learning_rate": 0.0001, "loss": 3.1878, "loss/crossentropy": 2.486378014087677, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.288157556951046, "loss/reg": 0.0, "step": 12080 }, { "epoch": 0.07953947368421052, "grad_norm": 2.359375, "grad_norm_var": 0.18038736979166667, "learning_rate": 0.0001, "loss": 3.3441, "loss/crossentropy": 2.1570433020591735, "loss/hidden": 3.3609375, "loss/incoh": 0.0, "loss/logits": 0.33738467693328855, "loss/reg": 0.0, "step": 12090 }, { "epoch": 0.07960526315789473, "grad_norm": 2.234375, "grad_norm_var": 0.14531962076822916, "learning_rate": 0.0001, "loss": 3.2092, "loss/crossentropy": 2.11829297542572, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.29526630192995074, "loss/reg": 0.0, "step": 12100 }, { "epoch": 0.07967105263157895, "grad_norm": 2.328125, "grad_norm_var": 0.11277669270833333, "learning_rate": 0.0001, "loss": 3.2437, "loss/crossentropy": 2.151422083377838, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.31264509409666064, "loss/reg": 0.0, "step": 12110 }, { "epoch": 0.07973684210526316, "grad_norm": 3.046875, "grad_norm_var": 0.09020182291666666, "learning_rate": 0.0001, "loss": 3.1802, "loss/crossentropy": 2.190220355987549, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.26489182114601134, "loss/reg": 0.0, "step": 12120 }, { "epoch": 0.07980263157894738, "grad_norm": 2.359375, "grad_norm_var": 0.10075581868489583, "learning_rate": 0.0001, "loss": 3.2733, "loss/crossentropy": 2.4329964399337767, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.29891559183597566, "loss/reg": 0.0, "step": 12130 }, { "epoch": 0.07986842105263157, "grad_norm": 2.640625, "grad_norm_var": 0.08171284993489583, "learning_rate": 0.0001, "loss": 3.2143, "loss/crossentropy": 2.3487884759902955, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.2699664428830147, "loss/reg": 0.0, "step": 12140 }, { "epoch": 0.07993421052631579, "grad_norm": 2.1875, "grad_norm_var": 0.11846415201822917, "learning_rate": 0.0001, "loss": 3.1594, "loss/crossentropy": 2.3031589150428773, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.2605236619710922, "loss/reg": 0.0, "step": 12150 }, { "epoch": 0.08, "grad_norm": 3.015625, "grad_norm_var": 0.17751363118489583, "learning_rate": 0.0001, "loss": 3.3283, "loss/crossentropy": 2.4235578894615175, "loss/hidden": 3.265625, "loss/incoh": 0.0, "loss/logits": 0.40606142282485963, "loss/reg": 0.0, "step": 12160 }, { "epoch": 0.08006578947368422, "grad_norm": 2.46875, "grad_norm_var": 0.18465067545572916, "learning_rate": 0.0001, "loss": 3.2069, "loss/crossentropy": 2.3678341031074526, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.26937836706638335, "loss/reg": 0.0, "step": 12170 }, { "epoch": 0.08013157894736841, "grad_norm": 2.4375, "grad_norm_var": 0.11225484212239584, "learning_rate": 0.0001, "loss": 3.1606, "loss/crossentropy": 2.606902313232422, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2720926284790039, "loss/reg": 0.0, "step": 12180 }, { "epoch": 0.08019736842105263, "grad_norm": 2.140625, "grad_norm_var": 0.03882548014322917, "learning_rate": 0.0001, "loss": 3.2091, "loss/crossentropy": 2.389057195186615, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.2834290415048599, "loss/reg": 0.0, "step": 12190 }, { "epoch": 0.08026315789473684, "grad_norm": 3.328125, "grad_norm_var": 0.082470703125, "learning_rate": 0.0001, "loss": 3.1846, "loss/crossentropy": 2.1885082483291627, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.250583179295063, "loss/reg": 0.0, "step": 12200 }, { "epoch": 0.08032894736842106, "grad_norm": 2.328125, "grad_norm_var": 0.14348551432291667, "learning_rate": 0.0001, "loss": 3.2234, "loss/crossentropy": 2.2288808941841127, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.2857954427599907, "loss/reg": 0.0, "step": 12210 }, { "epoch": 0.08039473684210527, "grad_norm": 2.578125, "grad_norm_var": 0.05944010416666667, "learning_rate": 0.0001, "loss": 3.1535, "loss/crossentropy": 2.3943295001983644, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.27064385265111923, "loss/reg": 0.0, "step": 12220 }, { "epoch": 0.08046052631578947, "grad_norm": 2.921875, "grad_norm_var": 0.051488240559895836, "learning_rate": 0.0001, "loss": 3.2165, "loss/crossentropy": 2.36237952709198, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.29628041982650755, "loss/reg": 0.0, "step": 12230 }, { "epoch": 0.08052631578947368, "grad_norm": 2.390625, "grad_norm_var": 0.06207682291666667, "learning_rate": 0.0001, "loss": 3.2112, "loss/crossentropy": 2.26582453250885, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.3621983379125595, "loss/reg": 0.0, "step": 12240 }, { "epoch": 0.0805921052631579, "grad_norm": 2.234375, "grad_norm_var": 0.08144124348958333, "learning_rate": 0.0001, "loss": 3.2711, "loss/crossentropy": 2.4002971291542052, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.29988196343183515, "loss/reg": 0.0, "step": 12250 }, { "epoch": 0.08065789473684211, "grad_norm": 2.78125, "grad_norm_var": 3.1181549072265624, "learning_rate": 0.0001, "loss": 3.2486, "loss/crossentropy": 2.26135613322258, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.24128984957933425, "loss/reg": 0.0, "step": 12260 }, { "epoch": 0.08072368421052632, "grad_norm": 2.34375, "grad_norm_var": 4.51636962890625, "learning_rate": 0.0001, "loss": 3.5227, "loss/crossentropy": 2.2277899503707888, "loss/hidden": 3.453125, "loss/incoh": 0.0, "loss/logits": 0.35484138429164885, "loss/reg": 0.0, "step": 12270 }, { "epoch": 0.08078947368421052, "grad_norm": 2.703125, "grad_norm_var": 2.06083984375, "learning_rate": 0.0001, "loss": 3.2936, "loss/crossentropy": 2.313612127304077, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2666293315589428, "loss/reg": 0.0, "step": 12280 }, { "epoch": 0.08085526315789474, "grad_norm": 4.625, "grad_norm_var": 6.363402303059896, "learning_rate": 0.0001, "loss": 3.2228, "loss/crossentropy": 2.3592599511146544, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.25869676619768145, "loss/reg": 0.0, "step": 12290 }, { "epoch": 0.08092105263157895, "grad_norm": 2.421875, "grad_norm_var": 0.3772532145182292, "learning_rate": 0.0001, "loss": 3.2186, "loss/crossentropy": 2.50057338476181, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.2711464300751686, "loss/reg": 0.0, "step": 12300 }, { "epoch": 0.08098684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.07467447916666667, "learning_rate": 0.0001, "loss": 3.2317, "loss/crossentropy": 2.381076216697693, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.24998296648263932, "loss/reg": 0.0, "step": 12310 }, { "epoch": 0.08105263157894736, "grad_norm": 2.53125, "grad_norm_var": 1.58385009765625, "learning_rate": 0.0001, "loss": 3.3666, "loss/crossentropy": 2.2987404227256776, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.24597887545824051, "loss/reg": 0.0, "step": 12320 }, { "epoch": 0.08111842105263158, "grad_norm": 2.421875, "grad_norm_var": 3.158852918199495e+17, "learning_rate": 0.0001, "loss": 3.3716, "loss/crossentropy": 2.415051448345184, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.29122400283813477, "loss/reg": 0.0, "step": 12330 }, { "epoch": 0.08118421052631579, "grad_norm": 2.734375, "grad_norm_var": 3.158852918454168e+17, "learning_rate": 0.0001, "loss": 3.2774, "loss/crossentropy": 2.082320672273636, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2748811081051826, "loss/reg": 0.0, "step": 12340 }, { "epoch": 0.08125, "grad_norm": 3.46875, "grad_norm_var": 0.26638997395833336, "learning_rate": 0.0001, "loss": 3.2315, "loss/crossentropy": 2.7044607162475587, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.32181375473737717, "loss/reg": 0.0, "step": 12350 }, { "epoch": 0.08131578947368422, "grad_norm": 2.765625, "grad_norm_var": 0.22693684895833333, "learning_rate": 0.0001, "loss": 3.2351, "loss/crossentropy": 2.7221840620040894, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.26635742783546446, "loss/reg": 0.0, "step": 12360 }, { "epoch": 0.08138157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.26437174479166664, "learning_rate": 0.0001, "loss": 3.2068, "loss/crossentropy": 2.3512622594833372, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2477712720632553, "loss/reg": 0.0, "step": 12370 }, { "epoch": 0.08144736842105263, "grad_norm": 2.125, "grad_norm_var": 0.3268707275390625, "learning_rate": 0.0001, "loss": 3.1974, "loss/crossentropy": 2.210025131702423, "loss/hidden": 3.05625, "loss/incoh": 0.0, "loss/logits": 0.36077398508787156, "loss/reg": 0.0, "step": 12380 }, { "epoch": 0.08151315789473684, "grad_norm": 2.546875, "grad_norm_var": 0.5682525634765625, "learning_rate": 0.0001, "loss": 3.2079, "loss/crossentropy": 2.0402897000312805, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.26192123591899874, "loss/reg": 0.0, "step": 12390 }, { "epoch": 0.08157894736842106, "grad_norm": 2.65625, "grad_norm_var": 0.3712565104166667, "learning_rate": 0.0001, "loss": 3.2107, "loss/crossentropy": 1.9560218453407288, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.21844983994960784, "loss/reg": 0.0, "step": 12400 }, { "epoch": 0.08164473684210527, "grad_norm": 3.171875, "grad_norm_var": 0.4505859375, "learning_rate": 0.0001, "loss": 3.2759, "loss/crossentropy": 2.3604748249053955, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.2524840489029884, "loss/reg": 0.0, "step": 12410 }, { "epoch": 0.08171052631578947, "grad_norm": 2.484375, "grad_norm_var": 0.24068603515625, "learning_rate": 0.0001, "loss": 3.2728, "loss/crossentropy": 2.517817199230194, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.29347013384103776, "loss/reg": 0.0, "step": 12420 }, { "epoch": 0.08177631578947368, "grad_norm": 2.40625, "grad_norm_var": 0.12271728515625, "learning_rate": 0.0001, "loss": 3.2551, "loss/crossentropy": 2.4665472149848937, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2709146931767464, "loss/reg": 0.0, "step": 12430 }, { "epoch": 0.0818421052631579, "grad_norm": 2.265625, "grad_norm_var": 0.2295074462890625, "learning_rate": 0.0001, "loss": 3.2219, "loss/crossentropy": 2.40498046875, "loss/hidden": 3.1, "loss/incoh": 0.0, "loss/logits": 0.32999152690172195, "loss/reg": 0.0, "step": 12440 }, { "epoch": 0.08190789473684211, "grad_norm": 2.421875, "grad_norm_var": 0.1619049072265625, "learning_rate": 0.0001, "loss": 3.0867, "loss/crossentropy": 2.3188422203063963, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.29690912514925005, "loss/reg": 0.0, "step": 12450 }, { "epoch": 0.08197368421052631, "grad_norm": 2.09375, "grad_norm_var": 0.07049051920572917, "learning_rate": 0.0001, "loss": 3.2682, "loss/crossentropy": 2.588274967670441, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.39090928733348845, "loss/reg": 0.0, "step": 12460 }, { "epoch": 0.08203947368421052, "grad_norm": 3.921875, "grad_norm_var": 0.2250261942545573, "learning_rate": 0.0001, "loss": 3.2503, "loss/crossentropy": 2.6322230458259583, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.2629552409052849, "loss/reg": 0.0, "step": 12470 }, { "epoch": 0.08210526315789474, "grad_norm": 2.203125, "grad_norm_var": 0.18522109985351562, "learning_rate": 0.0001, "loss": 3.1313, "loss/crossentropy": 2.1257114171981812, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23226768374443055, "loss/reg": 0.0, "step": 12480 }, { "epoch": 0.08217105263157895, "grad_norm": 2.25, "grad_norm_var": 0.16383056640625, "learning_rate": 0.0001, "loss": 3.2008, "loss/crossentropy": 1.9993813276290893, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.21686773076653482, "loss/reg": 0.0, "step": 12490 }, { "epoch": 0.08223684210526316, "grad_norm": 2.234375, "grad_norm_var": 0.12244364420572916, "learning_rate": 0.0001, "loss": 3.1643, "loss/crossentropy": 2.4217318654060365, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.2706719309091568, "loss/reg": 0.0, "step": 12500 }, { "epoch": 0.08230263157894736, "grad_norm": 3.03125, "grad_norm_var": 0.9795888264973959, "learning_rate": 0.0001, "loss": 3.2763, "loss/crossentropy": 2.131495940685272, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.24229931831359863, "loss/reg": 0.0, "step": 12510 }, { "epoch": 0.08236842105263158, "grad_norm": 2.171875, "grad_norm_var": 1.021240234375, "learning_rate": 0.0001, "loss": 3.1894, "loss/crossentropy": 2.2545747995376586, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.26802987307310105, "loss/reg": 0.0, "step": 12520 }, { "epoch": 0.08243421052631579, "grad_norm": 2.234375, "grad_norm_var": 2.0329969940865024e+17, "learning_rate": 0.0001, "loss": 3.364, "loss/crossentropy": 2.37786750793457, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.27177259773015977, "loss/reg": 0.0, "step": 12530 }, { "epoch": 0.0825, "grad_norm": 2.46875, "grad_norm_var": 0.0586822509765625, "learning_rate": 0.0001, "loss": 3.1425, "loss/crossentropy": 2.3742210388183596, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.25972897857427596, "loss/reg": 0.0, "step": 12540 }, { "epoch": 0.08256578947368422, "grad_norm": 2.21875, "grad_norm_var": 0.16419270833333333, "learning_rate": 0.0001, "loss": 3.1808, "loss/crossentropy": 2.2249507308006287, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.260022896528244, "loss/reg": 0.0, "step": 12550 }, { "epoch": 0.08263157894736842, "grad_norm": 3.765625, "grad_norm_var": 0.1635406494140625, "learning_rate": 0.0001, "loss": 3.2244, "loss/crossentropy": 2.417391860485077, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.24499612003564836, "loss/reg": 0.0, "step": 12560 }, { "epoch": 0.08269736842105263, "grad_norm": 2.328125, "grad_norm_var": 0.24940999348958334, "learning_rate": 0.0001, "loss": 3.259, "loss/crossentropy": 2.008757221698761, "loss/hidden": 3.371875, "loss/incoh": 0.0, "loss/logits": 0.28915109634399416, "loss/reg": 0.0, "step": 12570 }, { "epoch": 0.08276315789473684, "grad_norm": 2.4375, "grad_norm_var": 0.030516560872395834, "learning_rate": 0.0001, "loss": 3.0918, "loss/crossentropy": 2.3331239223480225, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.27426364421844485, "loss/reg": 0.0, "step": 12580 }, { "epoch": 0.08282894736842106, "grad_norm": 2.3125, "grad_norm_var": 0.4984527587890625, "learning_rate": 0.0001, "loss": 3.2675, "loss/crossentropy": 2.3463852405548096, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.2746781826019287, "loss/reg": 0.0, "step": 12590 }, { "epoch": 0.08289473684210526, "grad_norm": 2.09375, "grad_norm_var": 0.026822916666666665, "learning_rate": 0.0001, "loss": 3.0942, "loss/crossentropy": 2.1946144729852675, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.273574560880661, "loss/reg": 0.0, "step": 12600 }, { "epoch": 0.08296052631578947, "grad_norm": 2.75, "grad_norm_var": 0.399658203125, "learning_rate": 0.0001, "loss": 3.3113, "loss/crossentropy": 2.292886030673981, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.3399674043059349, "loss/reg": 0.0, "step": 12610 }, { "epoch": 0.08302631578947368, "grad_norm": 2.703125, "grad_norm_var": 0.306982421875, "learning_rate": 0.0001, "loss": 3.3316, "loss/crossentropy": 2.2384460091590883, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2688921958208084, "loss/reg": 0.0, "step": 12620 }, { "epoch": 0.0830921052631579, "grad_norm": 2.296875, "grad_norm_var": 0.04449462890625, "learning_rate": 0.0001, "loss": 3.1944, "loss/crossentropy": 2.2317716479301453, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.23216352015733718, "loss/reg": 0.0, "step": 12630 }, { "epoch": 0.08315789473684211, "grad_norm": 2.203125, "grad_norm_var": 0.05810139973958333, "learning_rate": 0.0001, "loss": 3.1734, "loss/crossentropy": 2.513770651817322, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.26424300968647, "loss/reg": 0.0, "step": 12640 }, { "epoch": 0.08322368421052631, "grad_norm": 2.765625, "grad_norm_var": 0.060302734375, "learning_rate": 0.0001, "loss": 3.212, "loss/crossentropy": 2.3643002271652223, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.2974396377801895, "loss/reg": 0.0, "step": 12650 }, { "epoch": 0.08328947368421052, "grad_norm": 2.765625, "grad_norm_var": 0.11692301432291667, "learning_rate": 0.0001, "loss": 3.2013, "loss/crossentropy": 2.200558376312256, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.26023727655410767, "loss/reg": 0.0, "step": 12660 }, { "epoch": 0.08335526315789474, "grad_norm": 2.34375, "grad_norm_var": 0.0976470947265625, "learning_rate": 0.0001, "loss": 3.2425, "loss/crossentropy": 2.3456878662109375, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.30842293947935107, "loss/reg": 0.0, "step": 12670 }, { "epoch": 0.08342105263157895, "grad_norm": 2.4375, "grad_norm_var": 0.05621744791666667, "learning_rate": 0.0001, "loss": 3.1823, "loss/crossentropy": 2.2649134039878844, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.24366314560174943, "loss/reg": 0.0, "step": 12680 }, { "epoch": 0.08348684210526315, "grad_norm": 2.328125, "grad_norm_var": 0.059891764322916666, "learning_rate": 0.0001, "loss": 3.1859, "loss/crossentropy": 2.0741775274276733, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.24329135864973067, "loss/reg": 0.0, "step": 12690 }, { "epoch": 0.08355263157894736, "grad_norm": 2.40625, "grad_norm_var": 0.13880208333333333, "learning_rate": 0.0001, "loss": 3.1551, "loss/crossentropy": 2.4719223856925963, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.24188547879457473, "loss/reg": 0.0, "step": 12700 }, { "epoch": 0.08361842105263158, "grad_norm": 2.8125, "grad_norm_var": 0.32080078125, "learning_rate": 0.0001, "loss": 3.3068, "loss/crossentropy": 2.279301416873932, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.2998178914189339, "loss/reg": 0.0, "step": 12710 }, { "epoch": 0.08368421052631579, "grad_norm": 2.65625, "grad_norm_var": 0.341064453125, "learning_rate": 0.0001, "loss": 3.2089, "loss/crossentropy": 2.4134485125541687, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.28444976508617403, "loss/reg": 0.0, "step": 12720 }, { "epoch": 0.08375, "grad_norm": 2.40625, "grad_norm_var": 0.13637593587239583, "learning_rate": 0.0001, "loss": 3.2345, "loss/crossentropy": 2.4403869032859804, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.26378336995840074, "loss/reg": 0.0, "step": 12730 }, { "epoch": 0.0838157894736842, "grad_norm": 2.34375, "grad_norm_var": 0.0974273681640625, "learning_rate": 0.0001, "loss": 3.1501, "loss/crossentropy": 2.1977667093276976, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.23563524186611176, "loss/reg": 0.0, "step": 12740 }, { "epoch": 0.08388157894736842, "grad_norm": 3.203125, "grad_norm_var": 0.12566731770833334, "learning_rate": 0.0001, "loss": 3.2516, "loss/crossentropy": 2.206720507144928, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.2731472015380859, "loss/reg": 0.0, "step": 12750 }, { "epoch": 0.08394736842105263, "grad_norm": 3.859375, "grad_norm_var": 0.21768290201822918, "learning_rate": 0.0001, "loss": 3.2035, "loss/crossentropy": 2.3951833486557006, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.26070789247751236, "loss/reg": 0.0, "step": 12760 }, { "epoch": 0.08401315789473685, "grad_norm": 2.140625, "grad_norm_var": 0.36861572265625, "learning_rate": 0.0001, "loss": 3.3054, "loss/crossentropy": 2.393438732624054, "loss/hidden": 3.2328125, "loss/incoh": 0.0, "loss/logits": 0.3618380635976791, "loss/reg": 0.0, "step": 12770 }, { "epoch": 0.08407894736842106, "grad_norm": 2.28125, "grad_norm_var": 0.24163004557291667, "learning_rate": 0.0001, "loss": 3.2972, "loss/crossentropy": 2.510524129867554, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.26494802087545394, "loss/reg": 0.0, "step": 12780 }, { "epoch": 0.08414473684210526, "grad_norm": 2.640625, "grad_norm_var": 0.07789713541666667, "learning_rate": 0.0001, "loss": 3.2375, "loss/crossentropy": 2.326508915424347, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.25067940801382066, "loss/reg": 0.0, "step": 12790 }, { "epoch": 0.08421052631578947, "grad_norm": 2.625, "grad_norm_var": 0.0654693603515625, "learning_rate": 0.0001, "loss": 3.1664, "loss/crossentropy": 2.2524615049362184, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.23585905730724335, "loss/reg": 0.0, "step": 12800 }, { "epoch": 0.08427631578947369, "grad_norm": 2.296875, "grad_norm_var": 0.05461324055989583, "learning_rate": 0.0001, "loss": 3.2086, "loss/crossentropy": 2.586345672607422, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.2775671869516373, "loss/reg": 0.0, "step": 12810 }, { "epoch": 0.0843421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.157568359375, "learning_rate": 0.0001, "loss": 3.2467, "loss/crossentropy": 2.4124717354774474, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.27734949439764023, "loss/reg": 0.0, "step": 12820 }, { "epoch": 0.0844078947368421, "grad_norm": 2.34375, "grad_norm_var": 0.151953125, "learning_rate": 0.0001, "loss": 3.1838, "loss/crossentropy": 2.217060422897339, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.31069841980934143, "loss/reg": 0.0, "step": 12830 }, { "epoch": 0.08447368421052631, "grad_norm": 2.46875, "grad_norm_var": 0.15916341145833332, "learning_rate": 0.0001, "loss": 3.251, "loss/crossentropy": 2.2915706515312193, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.28919376283884046, "loss/reg": 0.0, "step": 12840 }, { "epoch": 0.08453947368421053, "grad_norm": 2.8125, "grad_norm_var": 0.05465494791666667, "learning_rate": 0.0001, "loss": 3.1509, "loss/crossentropy": 2.3436198830604553, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2566779345273972, "loss/reg": 0.0, "step": 12850 }, { "epoch": 0.08460526315789474, "grad_norm": 2.796875, "grad_norm_var": 0.12815653483072917, "learning_rate": 0.0001, "loss": 3.1553, "loss/crossentropy": 2.6007506489753722, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.27069382518529894, "loss/reg": 0.0, "step": 12860 }, { "epoch": 0.08467105263157895, "grad_norm": 3.125, "grad_norm_var": 0.09325764973958334, "learning_rate": 0.0001, "loss": 3.1848, "loss/crossentropy": 2.3489827513694763, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.2542911395430565, "loss/reg": 0.0, "step": 12870 }, { "epoch": 0.08473684210526315, "grad_norm": 2.359375, "grad_norm_var": 0.2053375244140625, "learning_rate": 0.0001, "loss": 3.2528, "loss/crossentropy": 2.3320749402046204, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.25249498784542085, "loss/reg": 0.0, "step": 12880 }, { "epoch": 0.08480263157894737, "grad_norm": 2.375, "grad_norm_var": 0.22139383951822916, "learning_rate": 0.0001, "loss": 3.2744, "loss/crossentropy": 2.5565970659255983, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.30419613122940065, "loss/reg": 0.0, "step": 12890 }, { "epoch": 0.08486842105263158, "grad_norm": 2.390625, "grad_norm_var": 0.12555338541666666, "learning_rate": 0.0001, "loss": 3.2548, "loss/crossentropy": 2.382706320285797, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.28827311396598815, "loss/reg": 0.0, "step": 12900 }, { "epoch": 0.08493421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.031119791666666667, "learning_rate": 0.0001, "loss": 3.1646, "loss/crossentropy": 2.209449625015259, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.27490794360637666, "loss/reg": 0.0, "step": 12910 }, { "epoch": 0.085, "grad_norm": 2.46875, "grad_norm_var": 0.075537109375, "learning_rate": 0.0001, "loss": 3.1151, "loss/crossentropy": 2.42991498708725, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.24879284277558328, "loss/reg": 0.0, "step": 12920 }, { "epoch": 0.0850657894736842, "grad_norm": 2.21875, "grad_norm_var": 0.06396865844726562, "learning_rate": 0.0001, "loss": 3.1176, "loss/crossentropy": 2.124844658374786, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.26795649230480195, "loss/reg": 0.0, "step": 12930 }, { "epoch": 0.08513157894736842, "grad_norm": 3.046875, "grad_norm_var": 0.0884844462076823, "learning_rate": 0.0001, "loss": 3.1416, "loss/crossentropy": 2.390605056285858, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2603973612189293, "loss/reg": 0.0, "step": 12940 }, { "epoch": 0.08519736842105263, "grad_norm": 2.15625, "grad_norm_var": 0.11295572916666667, "learning_rate": 0.0001, "loss": 3.1506, "loss/crossentropy": 2.3307228684425354, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.23968299478292465, "loss/reg": 0.0, "step": 12950 }, { "epoch": 0.08526315789473685, "grad_norm": 2.375, "grad_norm_var": 0.07505594889322917, "learning_rate": 0.0001, "loss": 3.1409, "loss/crossentropy": 2.32118815779686, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.27437605410814286, "loss/reg": 0.0, "step": 12960 }, { "epoch": 0.08532894736842105, "grad_norm": 2.8125, "grad_norm_var": 0.05011393229166667, "learning_rate": 0.0001, "loss": 3.1943, "loss/crossentropy": 2.1953859329223633, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.2964278385043144, "loss/reg": 0.0, "step": 12970 }, { "epoch": 0.08539473684210526, "grad_norm": 2.265625, "grad_norm_var": 0.044733683268229164, "learning_rate": 0.0001, "loss": 3.1596, "loss/crossentropy": 2.48675742149353, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.2866469621658325, "loss/reg": 0.0, "step": 12980 }, { "epoch": 0.08546052631578947, "grad_norm": 2.890625, "grad_norm_var": 0.04439188639322917, "learning_rate": 0.0001, "loss": 3.1955, "loss/crossentropy": 2.3276284098625184, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.254511134326458, "loss/reg": 0.0, "step": 12990 }, { "epoch": 0.08552631578947369, "grad_norm": 2.125, "grad_norm_var": 0.18092041015625, "learning_rate": 0.0001, "loss": 3.3308, "loss/crossentropy": 2.436275231838226, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.27356108725070954, "loss/reg": 0.0, "step": 13000 }, { "epoch": 0.0855921052631579, "grad_norm": 2.25, "grad_norm_var": 0.17511393229166666, "learning_rate": 0.0001, "loss": 3.1302, "loss/crossentropy": 2.2856626510620117, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.24442512094974517, "loss/reg": 0.0, "step": 13010 }, { "epoch": 0.0856578947368421, "grad_norm": 2.53125, "grad_norm_var": 0.10161844889322917, "learning_rate": 0.0001, "loss": 3.1923, "loss/crossentropy": 2.4494728326797484, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.2575364217162132, "loss/reg": 0.0, "step": 13020 }, { "epoch": 0.08572368421052631, "grad_norm": 2.34375, "grad_norm_var": 0.09909566243489583, "learning_rate": 0.0001, "loss": 3.1732, "loss/crossentropy": 2.4833264112472535, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2727135464549065, "loss/reg": 0.0, "step": 13030 }, { "epoch": 0.08578947368421053, "grad_norm": 3.203125, "grad_norm_var": 0.12711588541666666, "learning_rate": 0.0001, "loss": 3.165, "loss/crossentropy": 2.284275805950165, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.29309146106243134, "loss/reg": 0.0, "step": 13040 }, { "epoch": 0.08585526315789474, "grad_norm": 2.28125, "grad_norm_var": 0.18906148274739584, "learning_rate": 0.0001, "loss": 3.1988, "loss/crossentropy": 2.1231042385101317, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.246621835231781, "loss/reg": 0.0, "step": 13050 }, { "epoch": 0.08592105263157895, "grad_norm": 2.234375, "grad_norm_var": 0.14317118326822917, "learning_rate": 0.0001, "loss": 3.2204, "loss/crossentropy": 2.115370142459869, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.22535212635993956, "loss/reg": 0.0, "step": 13060 }, { "epoch": 0.08598684210526315, "grad_norm": 2.234375, "grad_norm_var": 0.17832743326822917, "learning_rate": 0.0001, "loss": 3.1988, "loss/crossentropy": 2.271882343292236, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.2518279105424881, "loss/reg": 0.0, "step": 13070 }, { "epoch": 0.08605263157894737, "grad_norm": 1.90625, "grad_norm_var": 0.1100250244140625, "learning_rate": 0.0001, "loss": 3.2672, "loss/crossentropy": 2.211618059873581, "loss/hidden": 3.015625, "loss/incoh": 0.0, "loss/logits": 0.2709794193506241, "loss/reg": 0.0, "step": 13080 }, { "epoch": 0.08611842105263158, "grad_norm": 2.15625, "grad_norm_var": 0.06676432291666666, "learning_rate": 0.0001, "loss": 3.2393, "loss/crossentropy": 2.4678762197494506, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.2634593158960342, "loss/reg": 0.0, "step": 13090 }, { "epoch": 0.0861842105263158, "grad_norm": 2.484375, "grad_norm_var": 0.030402628580729167, "learning_rate": 0.0001, "loss": 3.19, "loss/crossentropy": 2.5426036715507507, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.26041875034570694, "loss/reg": 0.0, "step": 13100 }, { "epoch": 0.08625, "grad_norm": 2.34375, "grad_norm_var": 0.028034464518229166, "learning_rate": 0.0001, "loss": 3.1556, "loss/crossentropy": 2.1748136937618257, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.3230336934328079, "loss/reg": 0.0, "step": 13110 }, { "epoch": 0.0863157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.3787750244140625, "learning_rate": 0.0001, "loss": 3.2653, "loss/crossentropy": 2.352058470249176, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.3100366100668907, "loss/reg": 0.0, "step": 13120 }, { "epoch": 0.08638157894736842, "grad_norm": 2.65625, "grad_norm_var": 0.12065327962239583, "learning_rate": 0.0001, "loss": 3.2897, "loss/crossentropy": 2.2920926332473757, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.24493622779846191, "loss/reg": 0.0, "step": 13130 }, { "epoch": 0.08644736842105263, "grad_norm": 3.140625, "grad_norm_var": 0.17266337076822916, "learning_rate": 0.0001, "loss": 3.2541, "loss/crossentropy": 2.253483748435974, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.24110897034406661, "loss/reg": 0.0, "step": 13140 }, { "epoch": 0.08651315789473685, "grad_norm": 3.140625, "grad_norm_var": 0.15883687337239583, "learning_rate": 0.0001, "loss": 3.2161, "loss/crossentropy": 2.490678381919861, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.2905942976474762, "loss/reg": 0.0, "step": 13150 }, { "epoch": 0.08657894736842105, "grad_norm": 2.671875, "grad_norm_var": 0.124365234375, "learning_rate": 0.0001, "loss": 3.2037, "loss/crossentropy": 2.607399010658264, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.3426252081990242, "loss/reg": 0.0, "step": 13160 }, { "epoch": 0.08664473684210526, "grad_norm": 2.578125, "grad_norm_var": 0.20832697550455728, "learning_rate": 0.0001, "loss": 3.3389, "loss/crossentropy": 2.5389102935791015, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.27587536424398423, "loss/reg": 0.0, "step": 13170 }, { "epoch": 0.08671052631578947, "grad_norm": 2.171875, "grad_norm_var": 3.749950368844328e+17, "learning_rate": 0.0001, "loss": 3.3435, "loss/crossentropy": 2.2043145060539246, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.27380194514989853, "loss/reg": 0.0, "step": 13180 }, { "epoch": 0.08677631578947369, "grad_norm": 3.203125, "grad_norm_var": 0.10250651041666667, "learning_rate": 0.0001, "loss": 3.2179, "loss/crossentropy": 2.5562551975250245, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.35649020969867706, "loss/reg": 0.0, "step": 13190 }, { "epoch": 0.0868421052631579, "grad_norm": 2.046875, "grad_norm_var": 0.11367899576822917, "learning_rate": 0.0001, "loss": 3.1639, "loss/crossentropy": 2.2194557189941406, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.2623781323432922, "loss/reg": 0.0, "step": 13200 }, { "epoch": 0.0869078947368421, "grad_norm": 2.59375, "grad_norm_var": 0.09927978515625, "learning_rate": 0.0001, "loss": 3.203, "loss/crossentropy": 2.3938650250434876, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.22924545854330064, "loss/reg": 0.0, "step": 13210 }, { "epoch": 0.08697368421052631, "grad_norm": 2.328125, "grad_norm_var": 0.07858072916666667, "learning_rate": 0.0001, "loss": 3.1885, "loss/crossentropy": 2.659112477302551, "loss/hidden": 3.2625, "loss/incoh": 0.0, "loss/logits": 0.3096113160252571, "loss/reg": 0.0, "step": 13220 }, { "epoch": 0.08703947368421053, "grad_norm": 2.140625, "grad_norm_var": 0.08105061848958334, "learning_rate": 0.0001, "loss": 3.1608, "loss/crossentropy": 2.2919702410697935, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.28502654284238815, "loss/reg": 0.0, "step": 13230 }, { "epoch": 0.08710526315789474, "grad_norm": 2.421875, "grad_norm_var": 0.2513631184895833, "learning_rate": 0.0001, "loss": 3.1995, "loss/crossentropy": 2.206997013092041, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.20357893258333207, "loss/reg": 0.0, "step": 13240 }, { "epoch": 0.08717105263157894, "grad_norm": 2.1875, "grad_norm_var": 0.06531575520833334, "learning_rate": 0.0001, "loss": 3.1267, "loss/crossentropy": 2.095241755247116, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.23652465790510177, "loss/reg": 0.0, "step": 13250 }, { "epoch": 0.08723684210526315, "grad_norm": 2.328125, "grad_norm_var": 0.110009765625, "learning_rate": 0.0001, "loss": 3.1799, "loss/crossentropy": 2.1254674077033995, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.22684407681226731, "loss/reg": 0.0, "step": 13260 }, { "epoch": 0.08730263157894737, "grad_norm": 2.5625, "grad_norm_var": 3.398986257643471e+17, "learning_rate": 0.0001, "loss": 3.358, "loss/crossentropy": 2.3295519828796385, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.30277568846940994, "loss/reg": 0.0, "step": 13270 }, { "epoch": 0.08736842105263158, "grad_norm": 2.359375, "grad_norm_var": 3.3989862579380115e+17, "learning_rate": 0.0001, "loss": 3.3036, "loss/crossentropy": 2.496494376659393, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2583273336291313, "loss/reg": 0.0, "step": 13280 }, { "epoch": 0.0874342105263158, "grad_norm": 2.640625, "grad_norm_var": 0.0663238525390625, "learning_rate": 0.0001, "loss": 3.2025, "loss/crossentropy": 2.332583689689636, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.24232984483242034, "loss/reg": 0.0, "step": 13290 }, { "epoch": 0.0875, "grad_norm": 2.25, "grad_norm_var": 0.43166910807291664, "learning_rate": 0.0001, "loss": 3.1339, "loss/crossentropy": 2.394269013404846, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.24607907682657243, "loss/reg": 0.0, "step": 13300 }, { "epoch": 0.08756578947368421, "grad_norm": 2.40625, "grad_norm_var": 0.112890625, "learning_rate": 0.0001, "loss": 3.1815, "loss/crossentropy": 2.2034417927265166, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.25049934834241866, "loss/reg": 0.0, "step": 13310 }, { "epoch": 0.08763157894736842, "grad_norm": 2.6875, "grad_norm_var": 0.0748443603515625, "learning_rate": 0.0001, "loss": 3.2491, "loss/crossentropy": 2.058911919593811, "loss/hidden": 3.1359375, "loss/incoh": 0.0, "loss/logits": 0.25788910537958143, "loss/reg": 0.0, "step": 13320 }, { "epoch": 0.08769736842105263, "grad_norm": 3.046875, "grad_norm_var": 0.22857666015625, "learning_rate": 0.0001, "loss": 3.2051, "loss/crossentropy": 2.3308457016944883, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.3006305813789368, "loss/reg": 0.0, "step": 13330 }, { "epoch": 0.08776315789473685, "grad_norm": 2.46875, "grad_norm_var": 0.266796875, "learning_rate": 0.0001, "loss": 3.1792, "loss/crossentropy": 2.392533528804779, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.25690300911664965, "loss/reg": 0.0, "step": 13340 }, { "epoch": 0.08782894736842105, "grad_norm": 2.015625, "grad_norm_var": 0.24251302083333334, "learning_rate": 0.0001, "loss": 3.1342, "loss/crossentropy": 2.303742027282715, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.24467559903860092, "loss/reg": 0.0, "step": 13350 }, { "epoch": 0.08789473684210526, "grad_norm": 3.21875, "grad_norm_var": 0.26910400390625, "learning_rate": 0.0001, "loss": 3.2614, "loss/crossentropy": 2.5700215101242065, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.346772675216198, "loss/reg": 0.0, "step": 13360 }, { "epoch": 0.08796052631578948, "grad_norm": 2.234375, "grad_norm_var": 8.30354715983073, "learning_rate": 0.0001, "loss": 3.1987, "loss/crossentropy": 2.3953630328178406, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2601369693875313, "loss/reg": 0.0, "step": 13370 }, { "epoch": 0.08802631578947369, "grad_norm": 3.671875, "grad_norm_var": 3.5722076416015627, "learning_rate": 0.0001, "loss": 3.2285, "loss/crossentropy": 2.540582847595215, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.274001345038414, "loss/reg": 0.0, "step": 13380 }, { "epoch": 0.08809210526315789, "grad_norm": 2.359375, "grad_norm_var": 0.2377593994140625, "learning_rate": 0.0001, "loss": 3.1787, "loss/crossentropy": 2.3221506476402283, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.2874672919511795, "loss/reg": 0.0, "step": 13390 }, { "epoch": 0.0881578947368421, "grad_norm": 2.15625, "grad_norm_var": 1.4219309488932292, "learning_rate": 0.0001, "loss": 3.1862, "loss/crossentropy": 2.255697971582413, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.25251765847206115, "loss/reg": 0.0, "step": 13400 }, { "epoch": 0.08822368421052632, "grad_norm": 2.28125, "grad_norm_var": 0.023307291666666667, "learning_rate": 0.0001, "loss": 3.1642, "loss/crossentropy": 2.376703941822052, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2795195817947388, "loss/reg": 0.0, "step": 13410 }, { "epoch": 0.08828947368421053, "grad_norm": 2.359375, "grad_norm_var": 0.32665608723958334, "learning_rate": 0.0001, "loss": 3.1802, "loss/crossentropy": 2.390151119232178, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.255537736415863, "loss/reg": 0.0, "step": 13420 }, { "epoch": 0.08835526315789474, "grad_norm": 2.203125, "grad_norm_var": 0.33414306640625, "learning_rate": 0.0001, "loss": 3.1779, "loss/crossentropy": 2.2273300528526305, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2409852236509323, "loss/reg": 0.0, "step": 13430 }, { "epoch": 0.08842105263157894, "grad_norm": 25.75, "grad_norm_var": 34.02027587890625, "learning_rate": 0.0001, "loss": 3.1974, "loss/crossentropy": 2.281948208808899, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.2378145158290863, "loss/reg": 0.0, "step": 13440 }, { "epoch": 0.08848684210526316, "grad_norm": 2.75, "grad_norm_var": 34.235252888997394, "learning_rate": 0.0001, "loss": 3.3736, "loss/crossentropy": 2.267159104347229, "loss/hidden": 3.1296875, "loss/incoh": 0.0, "loss/logits": 0.2702139914035797, "loss/reg": 0.0, "step": 13450 }, { "epoch": 0.08855263157894737, "grad_norm": 2.703125, "grad_norm_var": 1.1851145426432292, "learning_rate": 0.0001, "loss": 3.16, "loss/crossentropy": 2.3185499548912047, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.2232184700667858, "loss/reg": 0.0, "step": 13460 }, { "epoch": 0.08861842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.07108968098958333, "learning_rate": 0.0001, "loss": 3.2029, "loss/crossentropy": 2.1752323627471926, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.3101599723100662, "loss/reg": 0.0, "step": 13470 }, { "epoch": 0.0886842105263158, "grad_norm": 2.21875, "grad_norm_var": 0.11365559895833334, "learning_rate": 0.0001, "loss": 3.2671, "loss/crossentropy": 2.255975532531738, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.30118285566568376, "loss/reg": 0.0, "step": 13480 }, { "epoch": 0.08875, "grad_norm": 2.4375, "grad_norm_var": 0.06363525390625, "learning_rate": 0.0001, "loss": 3.1907, "loss/crossentropy": 2.2357093393802643, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.23682421892881395, "loss/reg": 0.0, "step": 13490 }, { "epoch": 0.08881578947368421, "grad_norm": 2.75, "grad_norm_var": 0.05452372233072917, "learning_rate": 0.0001, "loss": 3.1246, "loss/crossentropy": 2.3712179183959963, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2336268275976181, "loss/reg": 0.0, "step": 13500 }, { "epoch": 0.08888157894736842, "grad_norm": 2.40625, "grad_norm_var": 0.13904520670572917, "learning_rate": 0.0001, "loss": 3.1879, "loss/crossentropy": 2.407870662212372, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.28650608360767366, "loss/reg": 0.0, "step": 13510 }, { "epoch": 0.08894736842105264, "grad_norm": 2.359375, "grad_norm_var": 0.009284464518229167, "learning_rate": 0.0001, "loss": 3.1881, "loss/crossentropy": 2.4734713077545165, "loss/hidden": 3.071875, "loss/incoh": 0.0, "loss/logits": 0.30961792171001434, "loss/reg": 0.0, "step": 13520 }, { "epoch": 0.08901315789473684, "grad_norm": 2.796875, "grad_norm_var": 0.03455301920572917, "learning_rate": 0.0001, "loss": 3.1521, "loss/crossentropy": 2.1405319690704347, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.25160129070281984, "loss/reg": 0.0, "step": 13530 }, { "epoch": 0.08907894736842105, "grad_norm": 2.40625, "grad_norm_var": 0.13059488932291666, "learning_rate": 0.0001, "loss": 3.2623, "loss/crossentropy": 2.5234264612197874, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.2405148908495903, "loss/reg": 0.0, "step": 13540 }, { "epoch": 0.08914473684210526, "grad_norm": 3.109375, "grad_norm_var": 0.14059244791666667, "learning_rate": 0.0001, "loss": 3.1637, "loss/crossentropy": 2.4861895561218263, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2647073075175285, "loss/reg": 0.0, "step": 13550 }, { "epoch": 0.08921052631578948, "grad_norm": 2.09375, "grad_norm_var": 0.08899332682291666, "learning_rate": 0.0001, "loss": 3.2354, "loss/crossentropy": 2.241242003440857, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.25270578265190125, "loss/reg": 0.0, "step": 13560 }, { "epoch": 0.08927631578947369, "grad_norm": 2.703125, "grad_norm_var": 0.15600484212239582, "learning_rate": 0.0001, "loss": 3.2692, "loss/crossentropy": 2.5087321639060973, "loss/hidden": 3.1078125, "loss/incoh": 0.0, "loss/logits": 0.2870207831263542, "loss/reg": 0.0, "step": 13570 }, { "epoch": 0.08934210526315789, "grad_norm": 1.9140625, "grad_norm_var": 0.09006322224934896, "learning_rate": 0.0001, "loss": 3.2167, "loss/crossentropy": 2.519565200805664, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2802609995007515, "loss/reg": 0.0, "step": 13580 }, { "epoch": 0.0894078947368421, "grad_norm": 2.859375, "grad_norm_var": 0.08097508748372396, "learning_rate": 0.0001, "loss": 3.2167, "loss/crossentropy": 2.6469456434249876, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.2738121926784515, "loss/reg": 0.0, "step": 13590 }, { "epoch": 0.08947368421052632, "grad_norm": 4.34375, "grad_norm_var": 0.28693033854166666, "learning_rate": 0.0001, "loss": 3.0727, "loss/crossentropy": 2.4280938267707826, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.24971676170825957, "loss/reg": 0.0, "step": 13600 }, { "epoch": 0.08953947368421053, "grad_norm": 2.171875, "grad_norm_var": 0.28564046223958334, "learning_rate": 0.0001, "loss": 3.2647, "loss/crossentropy": 2.331229364871979, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.25523588955402376, "loss/reg": 0.0, "step": 13610 }, { "epoch": 0.08960526315789474, "grad_norm": 2.03125, "grad_norm_var": 0.1018463134765625, "learning_rate": 0.0001, "loss": 3.141, "loss/crossentropy": 2.248876082897186, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.27915321439504626, "loss/reg": 0.0, "step": 13620 }, { "epoch": 0.08967105263157894, "grad_norm": 2.484375, "grad_norm_var": 0.05377604166666667, "learning_rate": 0.0001, "loss": 3.1282, "loss/crossentropy": 2.475773072242737, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.26457071453332903, "loss/reg": 0.0, "step": 13630 }, { "epoch": 0.08973684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.09758707682291666, "learning_rate": 0.0001, "loss": 3.1607, "loss/crossentropy": 2.3480275869369507, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.2902192771434784, "loss/reg": 0.0, "step": 13640 }, { "epoch": 0.08980263157894737, "grad_norm": 2.3125, "grad_norm_var": 0.1151519775390625, "learning_rate": 0.0001, "loss": 3.1651, "loss/crossentropy": 2.362102711200714, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.24841166138648987, "loss/reg": 0.0, "step": 13650 }, { "epoch": 0.08986842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.0971832275390625, "learning_rate": 0.0001, "loss": 3.1725, "loss/crossentropy": 2.134939956665039, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.2594160199165344, "loss/reg": 0.0, "step": 13660 }, { "epoch": 0.08993421052631578, "grad_norm": 2.296875, "grad_norm_var": 0.09054361979166667, "learning_rate": 0.0001, "loss": 3.0604, "loss/crossentropy": 2.22348096370697, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2797392845153809, "loss/reg": 0.0, "step": 13670 }, { "epoch": 0.09, "grad_norm": 2.65625, "grad_norm_var": 0.0264556884765625, "learning_rate": 0.0001, "loss": 3.165, "loss/crossentropy": 2.321420121192932, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.25273988842964173, "loss/reg": 0.0, "step": 13680 }, { "epoch": 0.09006578947368421, "grad_norm": 2.1875, "grad_norm_var": 0.0448394775390625, "learning_rate": 0.0001, "loss": 3.2251, "loss/crossentropy": 2.5713982224464416, "loss/hidden": 3.1890625, "loss/incoh": 0.0, "loss/logits": 0.3743001103401184, "loss/reg": 0.0, "step": 13690 }, { "epoch": 0.09013157894736842, "grad_norm": 2.34375, "grad_norm_var": 0.05821024576822917, "learning_rate": 0.0001, "loss": 3.1174, "loss/crossentropy": 2.433469843864441, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.29491184949874877, "loss/reg": 0.0, "step": 13700 }, { "epoch": 0.09019736842105264, "grad_norm": 2.328125, "grad_norm_var": 0.06334228515625, "learning_rate": 0.0001, "loss": 3.1422, "loss/crossentropy": 2.498783230781555, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2630519106984138, "loss/reg": 0.0, "step": 13710 }, { "epoch": 0.09026315789473684, "grad_norm": 2.484375, "grad_norm_var": 0.18053385416666667, "learning_rate": 0.0001, "loss": 3.2017, "loss/crossentropy": 2.295622777938843, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.25550642013549807, "loss/reg": 0.0, "step": 13720 }, { "epoch": 0.09032894736842105, "grad_norm": 2.921875, "grad_norm_var": 0.14754231770833334, "learning_rate": 0.0001, "loss": 3.2136, "loss/crossentropy": 2.4960160851478577, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.29100794196128843, "loss/reg": 0.0, "step": 13730 }, { "epoch": 0.09039473684210526, "grad_norm": 2.5, "grad_norm_var": 0.060888671875, "learning_rate": 0.0001, "loss": 3.182, "loss/crossentropy": 2.543604516983032, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.26428850889205935, "loss/reg": 0.0, "step": 13740 }, { "epoch": 0.09046052631578948, "grad_norm": 2.59375, "grad_norm_var": 0.1883697509765625, "learning_rate": 0.0001, "loss": 3.1144, "loss/crossentropy": 2.495186424255371, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.24936733841896058, "loss/reg": 0.0, "step": 13750 }, { "epoch": 0.09052631578947369, "grad_norm": 2.515625, "grad_norm_var": 0.17746480305989584, "learning_rate": 0.0001, "loss": 3.2332, "loss/crossentropy": 2.5168472051620485, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.3214556619524956, "loss/reg": 0.0, "step": 13760 }, { "epoch": 0.09059210526315789, "grad_norm": 2.53125, "grad_norm_var": 0.049853515625, "learning_rate": 0.0001, "loss": 3.1549, "loss/crossentropy": 2.2676196336746215, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.28792900443077085, "loss/reg": 0.0, "step": 13770 }, { "epoch": 0.0906578947368421, "grad_norm": 2.890625, "grad_norm_var": 0.50240478515625, "learning_rate": 0.0001, "loss": 3.1942, "loss/crossentropy": 2.3297463774681093, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.20883744955062866, "loss/reg": 0.0, "step": 13780 }, { "epoch": 0.09072368421052632, "grad_norm": 2.546875, "grad_norm_var": 0.4667154947916667, "learning_rate": 0.0001, "loss": 3.2349, "loss/crossentropy": 2.1975256204605103, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.2709361925721169, "loss/reg": 0.0, "step": 13790 }, { "epoch": 0.09078947368421053, "grad_norm": 2.84375, "grad_norm_var": 0.03693745930989583, "learning_rate": 0.0001, "loss": 3.1167, "loss/crossentropy": 2.3970743119716644, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.2433250866830349, "loss/reg": 0.0, "step": 13800 }, { "epoch": 0.09085526315789473, "grad_norm": 2.609375, "grad_norm_var": 0.7566965738932292, "learning_rate": 0.0001, "loss": 3.2493, "loss/crossentropy": 2.269898569583893, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.23498842120170593, "loss/reg": 0.0, "step": 13810 }, { "epoch": 0.09092105263157894, "grad_norm": 2.265625, "grad_norm_var": 0.050389607747395836, "learning_rate": 0.0001, "loss": 3.1946, "loss/crossentropy": 2.5624868392944338, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.36430184692144396, "loss/reg": 0.0, "step": 13820 }, { "epoch": 0.09098684210526316, "grad_norm": 2.640625, "grad_norm_var": 0.06546122233072917, "learning_rate": 0.0001, "loss": 3.225, "loss/crossentropy": 2.2082170367240908, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2326130375266075, "loss/reg": 0.0, "step": 13830 }, { "epoch": 0.09105263157894737, "grad_norm": 3.046875, "grad_norm_var": 0.1621002197265625, "learning_rate": 0.0001, "loss": 3.225, "loss/crossentropy": 2.2408367514610292, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.25092404931783674, "loss/reg": 0.0, "step": 13840 }, { "epoch": 0.09111842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.18394775390625, "learning_rate": 0.0001, "loss": 3.1397, "loss/crossentropy": 2.092372101545334, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.25034971833229064, "loss/reg": 0.0, "step": 13850 }, { "epoch": 0.09118421052631578, "grad_norm": 2.421875, "grad_norm_var": 0.0753313700358073, "learning_rate": 0.0001, "loss": 3.1936, "loss/crossentropy": 2.2277904510498048, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.28555874079465865, "loss/reg": 0.0, "step": 13860 }, { "epoch": 0.09125, "grad_norm": 2.203125, "grad_norm_var": 0.06499608357747395, "learning_rate": 0.0001, "loss": 3.1857, "loss/crossentropy": 2.0974882781505584, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.22869385927915573, "loss/reg": 0.0, "step": 13870 }, { "epoch": 0.09131578947368421, "grad_norm": 3.1875, "grad_norm_var": 0.30895894368489585, "learning_rate": 0.0001, "loss": 3.1808, "loss/crossentropy": 2.2242319107055666, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.24477246254682541, "loss/reg": 0.0, "step": 13880 }, { "epoch": 0.09138157894736842, "grad_norm": 2.546875, "grad_norm_var": 0.30500895182291665, "learning_rate": 0.0001, "loss": 3.1587, "loss/crossentropy": 2.2787875294685365, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.23667097985744476, "loss/reg": 0.0, "step": 13890 }, { "epoch": 0.09144736842105264, "grad_norm": 2.109375, "grad_norm_var": 0.04114176432291667, "learning_rate": 0.0001, "loss": 3.1791, "loss/crossentropy": 2.5015464782714845, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.26709298938512804, "loss/reg": 0.0, "step": 13900 }, { "epoch": 0.09151315789473684, "grad_norm": 1.9765625, "grad_norm_var": 0.38769505818684896, "learning_rate": 0.0001, "loss": 3.1858, "loss/crossentropy": 2.367018985748291, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.2575811371207237, "loss/reg": 0.0, "step": 13910 }, { "epoch": 0.09157894736842105, "grad_norm": 2.8125, "grad_norm_var": 0.12981338500976564, "learning_rate": 0.0001, "loss": 3.2014, "loss/crossentropy": 2.3600114941596986, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2798996135592461, "loss/reg": 0.0, "step": 13920 }, { "epoch": 0.09164473684210526, "grad_norm": 2.40625, "grad_norm_var": 0.5571940104166667, "learning_rate": 0.0001, "loss": 3.1415, "loss/crossentropy": 2.07312273979187, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.27816066443920134, "loss/reg": 0.0, "step": 13930 }, { "epoch": 0.09171052631578948, "grad_norm": 2.234375, "grad_norm_var": 0.16337483723958332, "learning_rate": 0.0001, "loss": 3.1846, "loss/crossentropy": 2.3842572927474976, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.3044495239853859, "loss/reg": 0.0, "step": 13940 }, { "epoch": 0.09177631578947368, "grad_norm": 2.28125, "grad_norm_var": 0.011617024739583334, "learning_rate": 0.0001, "loss": 3.2638, "loss/crossentropy": 2.258384811878204, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.24848029464483262, "loss/reg": 0.0, "step": 13950 }, { "epoch": 0.09184210526315789, "grad_norm": 2.625, "grad_norm_var": 0.08046875, "learning_rate": 0.0001, "loss": 3.2277, "loss/crossentropy": 2.1830771923065186, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.282887265086174, "loss/reg": 0.0, "step": 13960 }, { "epoch": 0.0919078947368421, "grad_norm": 2.34375, "grad_norm_var": 0.69127197265625, "learning_rate": 0.0001, "loss": 3.2795, "loss/crossentropy": 2.5693406105041503, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2618736609816551, "loss/reg": 0.0, "step": 13970 }, { "epoch": 0.09197368421052632, "grad_norm": 2.265625, "grad_norm_var": 1.2373443603515626, "learning_rate": 0.0001, "loss": 3.163, "loss/crossentropy": 2.496062994003296, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.27756498008966446, "loss/reg": 0.0, "step": 13980 }, { "epoch": 0.09203947368421053, "grad_norm": 2.375, "grad_norm_var": 1.256787109375, "learning_rate": 0.0001, "loss": 3.2974, "loss/crossentropy": 2.233779698610306, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.22414856255054474, "loss/reg": 0.0, "step": 13990 }, { "epoch": 0.09210526315789473, "grad_norm": 2.3125, "grad_norm_var": 0.04104817708333333, "learning_rate": 0.0001, "loss": 3.1465, "loss/crossentropy": 2.3837480187416076, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.29815254360437393, "loss/reg": 0.0, "step": 14000 }, { "epoch": 0.09217105263157895, "grad_norm": 2.296875, "grad_norm_var": 0.03504130045572917, "learning_rate": 0.0001, "loss": 3.1933, "loss/crossentropy": 2.4089162349700928, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.28769134283065795, "loss/reg": 0.0, "step": 14010 }, { "epoch": 0.09223684210526316, "grad_norm": 2.234375, "grad_norm_var": 0.099560546875, "learning_rate": 0.0001, "loss": 3.2431, "loss/crossentropy": 2.387049177289009, "loss/hidden": 3.078125, "loss/incoh": 0.0, "loss/logits": 0.31867421939969065, "loss/reg": 0.0, "step": 14020 }, { "epoch": 0.09230263157894737, "grad_norm": 2.203125, "grad_norm_var": 0.12417704264322917, "learning_rate": 0.0001, "loss": 3.0959, "loss/crossentropy": 2.2009261429309843, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2346379891037941, "loss/reg": 0.0, "step": 14030 }, { "epoch": 0.09236842105263159, "grad_norm": 2.484375, "grad_norm_var": 0.0412750244140625, "learning_rate": 0.0001, "loss": 3.1679, "loss/crossentropy": 2.379330587387085, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.23624493330717086, "loss/reg": 0.0, "step": 14040 }, { "epoch": 0.09243421052631579, "grad_norm": 2.515625, "grad_norm_var": 0.25758056640625, "learning_rate": 0.0001, "loss": 3.1901, "loss/crossentropy": 2.2017428398132326, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2819879144430161, "loss/reg": 0.0, "step": 14050 }, { "epoch": 0.0925, "grad_norm": 2.59375, "grad_norm_var": 1.2106597900390625, "learning_rate": 0.0001, "loss": 3.1627, "loss/crossentropy": 2.053563690185547, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.24388336688280104, "loss/reg": 0.0, "step": 14060 }, { "epoch": 0.09256578947368421, "grad_norm": 1.953125, "grad_norm_var": 0.0501617431640625, "learning_rate": 0.0001, "loss": 3.1122, "loss/crossentropy": 2.5748242855072023, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2592714115977287, "loss/reg": 0.0, "step": 14070 }, { "epoch": 0.09263157894736843, "grad_norm": 2.546875, "grad_norm_var": 0.115869140625, "learning_rate": 0.0001, "loss": 3.2403, "loss/crossentropy": 2.1434300899505616, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.23879239857196807, "loss/reg": 0.0, "step": 14080 }, { "epoch": 0.09269736842105263, "grad_norm": 2.53125, "grad_norm_var": 0.08590087890625, "learning_rate": 0.0001, "loss": 3.187, "loss/crossentropy": 2.078695094585419, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.29750160723924635, "loss/reg": 0.0, "step": 14090 }, { "epoch": 0.09276315789473684, "grad_norm": 3.078125, "grad_norm_var": 0.06355692545572916, "learning_rate": 0.0001, "loss": 3.1426, "loss/crossentropy": 2.4077930808067323, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2784098371863365, "loss/reg": 0.0, "step": 14100 }, { "epoch": 0.09282894736842105, "grad_norm": 2.421875, "grad_norm_var": 0.28290913899739584, "learning_rate": 0.0001, "loss": 3.1999, "loss/crossentropy": 2.551260459423065, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.30844325572252274, "loss/reg": 0.0, "step": 14110 }, { "epoch": 0.09289473684210527, "grad_norm": 2.890625, "grad_norm_var": 0.28843994140625, "learning_rate": 0.0001, "loss": 3.1903, "loss/crossentropy": 2.057539927959442, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.21393984705209732, "loss/reg": 0.0, "step": 14120 }, { "epoch": 0.09296052631578948, "grad_norm": 2.328125, "grad_norm_var": 0.16035054524739584, "learning_rate": 0.0001, "loss": 3.2036, "loss/crossentropy": 2.2166428923606873, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.28170192539691924, "loss/reg": 0.0, "step": 14130 }, { "epoch": 0.09302631578947368, "grad_norm": 2.328125, "grad_norm_var": 0.04431864420572917, "learning_rate": 0.0001, "loss": 3.1706, "loss/crossentropy": 2.5224907636642455, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2656204476952553, "loss/reg": 0.0, "step": 14140 }, { "epoch": 0.09309210526315789, "grad_norm": 2.28125, "grad_norm_var": 0.13594462076822916, "learning_rate": 0.0001, "loss": 3.2365, "loss/crossentropy": 2.365175998210907, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.28030987083911896, "loss/reg": 0.0, "step": 14150 }, { "epoch": 0.0931578947368421, "grad_norm": 2.375, "grad_norm_var": 3.4480684566712595e+17, "learning_rate": 0.0001, "loss": 3.3698, "loss/crossentropy": 2.4446977496147158, "loss/hidden": 4.159375, "loss/incoh": 0.0, "loss/logits": 0.35163595527410507, "loss/reg": 0.0, "step": 14160 }, { "epoch": 0.09322368421052632, "grad_norm": 2.46875, "grad_norm_var": 3.4480684570076774e+17, "learning_rate": 0.0001, "loss": 3.1773, "loss/crossentropy": 2.144097054004669, "loss/hidden": 3.0171875, "loss/incoh": 0.0, "loss/logits": 0.2777150124311447, "loss/reg": 0.0, "step": 14170 }, { "epoch": 0.09328947368421053, "grad_norm": 2.484375, "grad_norm_var": 0.1343902587890625, "learning_rate": 0.0001, "loss": 3.1593, "loss/crossentropy": 2.5162782430648805, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2642303377389908, "loss/reg": 0.0, "step": 14180 }, { "epoch": 0.09335526315789473, "grad_norm": 2.203125, "grad_norm_var": 0.169921875, "learning_rate": 0.0001, "loss": 3.1927, "loss/crossentropy": 2.1481271982192993, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.27440374791622163, "loss/reg": 0.0, "step": 14190 }, { "epoch": 0.09342105263157895, "grad_norm": 2.21875, "grad_norm_var": 0.188330078125, "learning_rate": 0.0001, "loss": 3.2049, "loss/crossentropy": 2.4672377467155457, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.25575320422649384, "loss/reg": 0.0, "step": 14200 }, { "epoch": 0.09348684210526316, "grad_norm": 2.421875, "grad_norm_var": 0.07030843098958334, "learning_rate": 0.0001, "loss": 3.1256, "loss/crossentropy": 2.4822991728782653, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.23255562111735345, "loss/reg": 0.0, "step": 14210 }, { "epoch": 0.09355263157894737, "grad_norm": 2.421875, "grad_norm_var": 0.06575698852539062, "learning_rate": 0.0001, "loss": 3.1061, "loss/crossentropy": 2.4455429315567017, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.2238618478178978, "loss/reg": 0.0, "step": 14220 }, { "epoch": 0.09361842105263157, "grad_norm": 2.453125, "grad_norm_var": 0.13621317545572917, "learning_rate": 0.0001, "loss": 3.1927, "loss/crossentropy": 2.3609827399253844, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.3211557373404503, "loss/reg": 0.0, "step": 14230 }, { "epoch": 0.09368421052631579, "grad_norm": 2.3125, "grad_norm_var": 0.1390777587890625, "learning_rate": 0.0001, "loss": 3.2222, "loss/crossentropy": 2.4929265141487122, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.256229493021965, "loss/reg": 0.0, "step": 14240 }, { "epoch": 0.09375, "grad_norm": 2.28125, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 3.0973, "loss/crossentropy": 2.396563506126404, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.3292677074670792, "loss/reg": 0.0, "step": 14250 }, { "epoch": 0.09381578947368421, "grad_norm": 3.046875, "grad_norm_var": 3.801495474913411e+17, "learning_rate": 0.0001, "loss": 3.3636, "loss/crossentropy": 2.1659668326377868, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.24123398140072821, "loss/reg": 0.0, "step": 14260 }, { "epoch": 0.09388157894736843, "grad_norm": 2.640625, "grad_norm_var": 3.801495474059215e+17, "learning_rate": 0.0001, "loss": 3.2259, "loss/crossentropy": 2.35439647436142, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.22794780433177947, "loss/reg": 0.0, "step": 14270 }, { "epoch": 0.09394736842105263, "grad_norm": 2.734375, "grad_norm_var": 0.42942708333333335, "learning_rate": 0.0001, "loss": 3.1665, "loss/crossentropy": 2.3001658797264097, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.26924325078725814, "loss/reg": 0.0, "step": 14280 }, { "epoch": 0.09401315789473684, "grad_norm": 2.34375, "grad_norm_var": 0.0408843994140625, "learning_rate": 0.0001, "loss": 3.0518, "loss/crossentropy": 2.3841129422187803, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.26079314202070236, "loss/reg": 0.0, "step": 14290 }, { "epoch": 0.09407894736842105, "grad_norm": 2.296875, "grad_norm_var": 0.03784891764322917, "learning_rate": 0.0001, "loss": 3.1904, "loss/crossentropy": 2.2710848689079284, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.23626017868518828, "loss/reg": 0.0, "step": 14300 }, { "epoch": 0.09414473684210527, "grad_norm": 2.921875, "grad_norm_var": 0.11845296223958333, "learning_rate": 0.0001, "loss": 3.2715, "loss/crossentropy": 2.4462394237518312, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.3083674684166908, "loss/reg": 0.0, "step": 14310 }, { "epoch": 0.09421052631578947, "grad_norm": 2.609375, "grad_norm_var": 0.115771484375, "learning_rate": 0.0001, "loss": 3.1137, "loss/crossentropy": 2.4102493643760683, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.3285558119416237, "loss/reg": 0.0, "step": 14320 }, { "epoch": 0.09427631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.0410552978515625, "learning_rate": 0.0001, "loss": 3.1163, "loss/crossentropy": 2.496846008300781, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.2681970477104187, "loss/reg": 0.0, "step": 14330 }, { "epoch": 0.0943421052631579, "grad_norm": 3.28125, "grad_norm_var": 0.3074615478515625, "learning_rate": 0.0001, "loss": 3.2179, "loss/crossentropy": 2.368880546092987, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.27040198296308515, "loss/reg": 0.0, "step": 14340 }, { "epoch": 0.09440789473684211, "grad_norm": 2.359375, "grad_norm_var": 0.087841796875, "learning_rate": 0.0001, "loss": 3.1677, "loss/crossentropy": 2.379721689224243, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.3262082427740097, "loss/reg": 0.0, "step": 14350 }, { "epoch": 0.09447368421052632, "grad_norm": 3.03125, "grad_norm_var": 0.06515299479166667, "learning_rate": 0.0001, "loss": 3.1541, "loss/crossentropy": 2.3577764987945558, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.28836628049612045, "loss/reg": 0.0, "step": 14360 }, { "epoch": 0.09453947368421052, "grad_norm": 2.40625, "grad_norm_var": 0.09937744140625, "learning_rate": 0.0001, "loss": 3.1429, "loss/crossentropy": 2.2929248332977297, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2473236471414566, "loss/reg": 0.0, "step": 14370 }, { "epoch": 0.09460526315789473, "grad_norm": 2.90625, "grad_norm_var": 0.056428019205729166, "learning_rate": 0.0001, "loss": 3.1331, "loss/crossentropy": 2.11824688911438, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.19864632040262223, "loss/reg": 0.0, "step": 14380 }, { "epoch": 0.09467105263157895, "grad_norm": 2.5625, "grad_norm_var": 0.2792154947916667, "learning_rate": 0.0001, "loss": 3.2558, "loss/crossentropy": 2.4552414536476137, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.26964131295681, "loss/reg": 0.0, "step": 14390 }, { "epoch": 0.09473684210526316, "grad_norm": 2.828125, "grad_norm_var": 0.1197418212890625, "learning_rate": 0.0001, "loss": 3.1627, "loss/crossentropy": 2.4108232736587523, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2856267586350441, "loss/reg": 0.0, "step": 14400 }, { "epoch": 0.09480263157894737, "grad_norm": 2.3125, "grad_norm_var": 0.09917704264322917, "learning_rate": 0.0001, "loss": 3.2683, "loss/crossentropy": 2.245331883430481, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.26227104514837263, "loss/reg": 0.0, "step": 14410 }, { "epoch": 0.09486842105263157, "grad_norm": 2.71875, "grad_norm_var": 0.47395426432291665, "learning_rate": 0.0001, "loss": 3.3242, "loss/crossentropy": 2.5019222021102907, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2487493023276329, "loss/reg": 0.0, "step": 14420 }, { "epoch": 0.09493421052631579, "grad_norm": 2.84375, "grad_norm_var": 0.8088053385416667, "learning_rate": 0.0001, "loss": 3.1846, "loss/crossentropy": 2.239874541759491, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.30340075492858887, "loss/reg": 0.0, "step": 14430 }, { "epoch": 0.095, "grad_norm": 3.203125, "grad_norm_var": 0.5774698893229167, "learning_rate": 0.0001, "loss": 3.2356, "loss/crossentropy": 2.212349569797516, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.25182478949427606, "loss/reg": 0.0, "step": 14440 }, { "epoch": 0.09506578947368421, "grad_norm": 2.890625, "grad_norm_var": 0.19168294270833333, "learning_rate": 0.0001, "loss": 3.2659, "loss/crossentropy": 2.337146294116974, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2963370710611343, "loss/reg": 0.0, "step": 14450 }, { "epoch": 0.09513157894736841, "grad_norm": 2.5625, "grad_norm_var": 0.10920308430989584, "learning_rate": 0.0001, "loss": 3.2442, "loss/crossentropy": 1.8299875736236573, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.21587850898504257, "loss/reg": 0.0, "step": 14460 }, { "epoch": 0.09519736842105263, "grad_norm": 2.4375, "grad_norm_var": 0.04309794108072917, "learning_rate": 0.0001, "loss": 3.1964, "loss/crossentropy": 2.558639335632324, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.25251512974500656, "loss/reg": 0.0, "step": 14470 }, { "epoch": 0.09526315789473684, "grad_norm": 2.4375, "grad_norm_var": 0.020686848958333334, "learning_rate": 0.0001, "loss": 3.1942, "loss/crossentropy": 2.514283466339111, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.2890095472335815, "loss/reg": 0.0, "step": 14480 }, { "epoch": 0.09532894736842105, "grad_norm": 2.140625, "grad_norm_var": 0.028123982747395835, "learning_rate": 0.0001, "loss": 3.1856, "loss/crossentropy": 2.2959898948669433, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.24410185664892198, "loss/reg": 0.0, "step": 14490 }, { "epoch": 0.09539473684210527, "grad_norm": 2.328125, "grad_norm_var": 0.03728841145833333, "learning_rate": 0.0001, "loss": 3.2528, "loss/crossentropy": 2.421563959121704, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.30335861891508104, "loss/reg": 0.0, "step": 14500 }, { "epoch": 0.09546052631578947, "grad_norm": 2.625, "grad_norm_var": 0.041825358072916666, "learning_rate": 0.0001, "loss": 3.1292, "loss/crossentropy": 2.3713988065719604, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2508593872189522, "loss/reg": 0.0, "step": 14510 }, { "epoch": 0.09552631578947368, "grad_norm": 2.015625, "grad_norm_var": 0.12280985514322916, "learning_rate": 0.0001, "loss": 3.1975, "loss/crossentropy": 2.3875895380973815, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.26996816471219065, "loss/reg": 0.0, "step": 14520 }, { "epoch": 0.0955921052631579, "grad_norm": 2.375, "grad_norm_var": 0.13038101196289062, "learning_rate": 0.0001, "loss": 3.1709, "loss/crossentropy": 2.3352912187576296, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.23363660275936127, "loss/reg": 0.0, "step": 14530 }, { "epoch": 0.09565789473684211, "grad_norm": 2.5625, "grad_norm_var": 0.11688206990559896, "learning_rate": 0.0001, "loss": 3.1364, "loss/crossentropy": 2.473167669773102, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.24549597799777984, "loss/reg": 0.0, "step": 14540 }, { "epoch": 0.09572368421052632, "grad_norm": 2.734375, "grad_norm_var": 0.060868326822916666, "learning_rate": 0.0001, "loss": 3.2086, "loss/crossentropy": 2.3421871423721314, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.2576961562037468, "loss/reg": 0.0, "step": 14550 }, { "epoch": 0.09578947368421052, "grad_norm": 2.59375, "grad_norm_var": 0.10832926432291666, "learning_rate": 0.0001, "loss": 3.1744, "loss/crossentropy": 2.1765161633491514, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2244855582714081, "loss/reg": 0.0, "step": 14560 }, { "epoch": 0.09585526315789474, "grad_norm": 2.265625, "grad_norm_var": 0.1071685791015625, "learning_rate": 0.0001, "loss": 3.1489, "loss/crossentropy": 2.507940888404846, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.2875757083296776, "loss/reg": 0.0, "step": 14570 }, { "epoch": 0.09592105263157895, "grad_norm": 2.140625, "grad_norm_var": 0.029523722330729165, "learning_rate": 0.0001, "loss": 3.1669, "loss/crossentropy": 2.2886616230010985, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.3050649344921112, "loss/reg": 0.0, "step": 14580 }, { "epoch": 0.09598684210526316, "grad_norm": 2.8125, "grad_norm_var": 0.03560282389322917, "learning_rate": 0.0001, "loss": 3.1454, "loss/crossentropy": 2.395359969139099, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.2635947346687317, "loss/reg": 0.0, "step": 14590 }, { "epoch": 0.09605263157894736, "grad_norm": 2.03125, "grad_norm_var": 0.23357747395833334, "learning_rate": 0.0001, "loss": 3.2176, "loss/crossentropy": 2.002879011631012, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.24021245390176774, "loss/reg": 0.0, "step": 14600 }, { "epoch": 0.09611842105263158, "grad_norm": 2.109375, "grad_norm_var": 0.216552734375, "learning_rate": 0.0001, "loss": 3.1289, "loss/crossentropy": 2.4440099954605103, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.3085349440574646, "loss/reg": 0.0, "step": 14610 }, { "epoch": 0.09618421052631579, "grad_norm": 2.21875, "grad_norm_var": 0.18892822265625, "learning_rate": 0.0001, "loss": 3.2541, "loss/crossentropy": 2.5584524154663084, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.33079054951667786, "loss/reg": 0.0, "step": 14620 }, { "epoch": 0.09625, "grad_norm": 2.546875, "grad_norm_var": 0.18036702473958333, "learning_rate": 0.0001, "loss": 3.1899, "loss/crossentropy": 2.386956262588501, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.2895423695445061, "loss/reg": 0.0, "step": 14630 }, { "epoch": 0.09631578947368422, "grad_norm": 2.390625, "grad_norm_var": 0.15852762858072916, "learning_rate": 0.0001, "loss": 3.1443, "loss/crossentropy": 2.0192247331142426, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.2274337187409401, "loss/reg": 0.0, "step": 14640 }, { "epoch": 0.09638157894736842, "grad_norm": 2.375, "grad_norm_var": 0.11549072265625, "learning_rate": 0.0001, "loss": 3.1243, "loss/crossentropy": 2.499123454093933, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.25674946457147596, "loss/reg": 0.0, "step": 14650 }, { "epoch": 0.09644736842105263, "grad_norm": 2130706432.0, "grad_norm_var": 2.83744368059244e+17, "learning_rate": 0.0001, "loss": 3.251, "loss/crossentropy": 2.1218835711479187, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.23685040771961213, "loss/reg": 0.0, "step": 14660 }, { "epoch": 0.09651315789473684, "grad_norm": 2.15625, "grad_norm_var": 2.8374436794771485e+17, "learning_rate": 0.0001, "loss": 3.2008, "loss/crossentropy": 2.125733083486557, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.3448520749807358, "loss/reg": 0.0, "step": 14670 }, { "epoch": 0.09657894736842106, "grad_norm": 2.40625, "grad_norm_var": 0.24964090983072917, "learning_rate": 0.0001, "loss": 3.1553, "loss/crossentropy": 2.604245328903198, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.23915667235851287, "loss/reg": 0.0, "step": 14680 }, { "epoch": 0.09664473684210527, "grad_norm": 2.40625, "grad_norm_var": 0.11900634765625, "learning_rate": 0.0001, "loss": 3.1023, "loss/crossentropy": 2.3761191368103027, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2546194761991501, "loss/reg": 0.0, "step": 14690 }, { "epoch": 0.09671052631578947, "grad_norm": 2.5625, "grad_norm_var": 0.05559488932291667, "learning_rate": 0.0001, "loss": 3.1544, "loss/crossentropy": 2.3285223722457884, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.24710593670606612, "loss/reg": 0.0, "step": 14700 }, { "epoch": 0.09677631578947368, "grad_norm": 2.4375, "grad_norm_var": 0.13647842407226562, "learning_rate": 0.0001, "loss": 3.1385, "loss/crossentropy": 2.364056706428528, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.22887694388628005, "loss/reg": 0.0, "step": 14710 }, { "epoch": 0.0968421052631579, "grad_norm": 2.140625, "grad_norm_var": 5.376778157552083, "learning_rate": 0.0001, "loss": 3.1773, "loss/crossentropy": 2.2852516174316406, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.24893905222415924, "loss/reg": 0.0, "step": 14720 }, { "epoch": 0.09690789473684211, "grad_norm": 2.828125, "grad_norm_var": 5.43084487915039, "learning_rate": 0.0001, "loss": 3.1089, "loss/crossentropy": 2.1680606245994567, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.23210474848747253, "loss/reg": 0.0, "step": 14730 }, { "epoch": 0.09697368421052631, "grad_norm": 1.984375, "grad_norm_var": 0.1650054931640625, "learning_rate": 0.0001, "loss": 3.1202, "loss/crossentropy": 2.1676085114479067, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.2939779102802277, "loss/reg": 0.0, "step": 14740 }, { "epoch": 0.09703947368421052, "grad_norm": 2.078125, "grad_norm_var": 0.17122294108072916, "learning_rate": 0.0001, "loss": 3.0635, "loss/crossentropy": 2.498277449607849, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.22846879661083222, "loss/reg": 0.0, "step": 14750 }, { "epoch": 0.09710526315789474, "grad_norm": 2.296875, "grad_norm_var": 0.11088765462239583, "learning_rate": 0.0001, "loss": 3.155, "loss/crossentropy": 2.2534152626991273, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.2535191968083382, "loss/reg": 0.0, "step": 14760 }, { "epoch": 0.09717105263157895, "grad_norm": 2.625, "grad_norm_var": 0.10987955729166667, "learning_rate": 0.0001, "loss": 3.2477, "loss/crossentropy": 2.085835373401642, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.25164034962654114, "loss/reg": 0.0, "step": 14770 }, { "epoch": 0.09723684210526316, "grad_norm": 2.578125, "grad_norm_var": 17.859382120768228, "learning_rate": 0.0001, "loss": 3.2842, "loss/crossentropy": 1.8799749910831451, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.24884901493787764, "loss/reg": 0.0, "step": 14780 }, { "epoch": 0.09730263157894736, "grad_norm": 2.078125, "grad_norm_var": 17.996207682291665, "learning_rate": 0.0001, "loss": 3.2386, "loss/crossentropy": 2.307372045516968, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.26386404484510423, "loss/reg": 0.0, "step": 14790 }, { "epoch": 0.09736842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.06170247395833333, "learning_rate": 0.0001, "loss": 3.1452, "loss/crossentropy": 2.454807901382446, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.3172868087887764, "loss/reg": 0.0, "step": 14800 }, { "epoch": 0.09743421052631579, "grad_norm": 2.5, "grad_norm_var": 0.05920817057291667, "learning_rate": 0.0001, "loss": 3.1114, "loss/crossentropy": 2.014724650979042, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.1908031925559044, "loss/reg": 0.0, "step": 14810 }, { "epoch": 0.0975, "grad_norm": 2.90625, "grad_norm_var": 0.060445149739583336, "learning_rate": 0.0001, "loss": 3.2493, "loss/crossentropy": 2.463605833053589, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.24406316578388215, "loss/reg": 0.0, "step": 14820 }, { "epoch": 0.09756578947368422, "grad_norm": 2.28125, "grad_norm_var": 0.29640299479166665, "learning_rate": 0.0001, "loss": 3.1614, "loss/crossentropy": 2.501069176197052, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2565233051776886, "loss/reg": 0.0, "step": 14830 }, { "epoch": 0.09763157894736842, "grad_norm": 2.09375, "grad_norm_var": 0.5406534830729167, "learning_rate": 0.0001, "loss": 3.1904, "loss/crossentropy": 2.462419664859772, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.22374519556760789, "loss/reg": 0.0, "step": 14840 }, { "epoch": 0.09769736842105263, "grad_norm": 2.296875, "grad_norm_var": 0.7624501546223958, "learning_rate": 0.0001, "loss": 3.2431, "loss/crossentropy": 2.226536822319031, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.26644248366355894, "loss/reg": 0.0, "step": 14850 }, { "epoch": 0.09776315789473684, "grad_norm": 2.328125, "grad_norm_var": 2.5783355712890623, "learning_rate": 0.0001, "loss": 3.3229, "loss/crossentropy": 2.3811925053596497, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.22725498378276826, "loss/reg": 0.0, "step": 14860 }, { "epoch": 0.09782894736842106, "grad_norm": 2.90625, "grad_norm_var": 2.3144205729166667, "learning_rate": 0.0001, "loss": 3.2061, "loss/crossentropy": 2.5685184717178347, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.2757527410984039, "loss/reg": 0.0, "step": 14870 }, { "epoch": 0.09789473684210526, "grad_norm": 2.515625, "grad_norm_var": 1.900218709309896, "learning_rate": 0.0001, "loss": 3.1588, "loss/crossentropy": 2.224149799346924, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.24766245782375335, "loss/reg": 0.0, "step": 14880 }, { "epoch": 0.09796052631578947, "grad_norm": 2.421875, "grad_norm_var": 0.13209228515625, "learning_rate": 0.0001, "loss": 3.1723, "loss/crossentropy": 2.2869945645332335, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.3204286351799965, "loss/reg": 0.0, "step": 14890 }, { "epoch": 0.09802631578947368, "grad_norm": 2.71875, "grad_norm_var": 0.036408487955729166, "learning_rate": 0.0001, "loss": 3.1244, "loss/crossentropy": 2.276153302192688, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.2348289594054222, "loss/reg": 0.0, "step": 14900 }, { "epoch": 0.0980921052631579, "grad_norm": 3.328125, "grad_norm_var": 0.0842926025390625, "learning_rate": 0.0001, "loss": 3.2024, "loss/crossentropy": 2.2277270436286924, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.2504657179117203, "loss/reg": 0.0, "step": 14910 }, { "epoch": 0.09815789473684211, "grad_norm": 2.25, "grad_norm_var": 0.15784098307291666, "learning_rate": 0.0001, "loss": 3.1643, "loss/crossentropy": 2.601578450202942, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.27564091980457306, "loss/reg": 0.0, "step": 14920 }, { "epoch": 0.09822368421052631, "grad_norm": 2.21875, "grad_norm_var": 0.12454325358072917, "learning_rate": 0.0001, "loss": 3.0857, "loss/crossentropy": 2.4973155736923216, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.2681492820382118, "loss/reg": 0.0, "step": 14930 }, { "epoch": 0.09828947368421052, "grad_norm": 2.703125, "grad_norm_var": 0.04514872233072917, "learning_rate": 0.0001, "loss": 3.1214, "loss/crossentropy": 2.526623797416687, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.26705596745014193, "loss/reg": 0.0, "step": 14940 }, { "epoch": 0.09835526315789474, "grad_norm": 2.21875, "grad_norm_var": 0.23338216145833332, "learning_rate": 0.0001, "loss": 3.1468, "loss/crossentropy": 2.273276376724243, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.28430237621068954, "loss/reg": 0.0, "step": 14950 }, { "epoch": 0.09842105263157895, "grad_norm": 2.1875, "grad_norm_var": 0.2802398681640625, "learning_rate": 0.0001, "loss": 3.2042, "loss/crossentropy": 2.4497693538665772, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.2505017280578613, "loss/reg": 0.0, "step": 14960 }, { "epoch": 0.09848684210526316, "grad_norm": 2.453125, "grad_norm_var": 0.15282796223958334, "learning_rate": 0.0001, "loss": 3.1746, "loss/crossentropy": 2.4736087679862977, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.24588750153779984, "loss/reg": 0.0, "step": 14970 }, { "epoch": 0.09855263157894736, "grad_norm": 2.203125, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 3.1539, "loss/crossentropy": 2.373614990711212, "loss/hidden": 3.0578125, "loss/incoh": 0.0, "loss/logits": 0.30631934851408005, "loss/reg": 0.0, "step": 14980 }, { "epoch": 0.09861842105263158, "grad_norm": 3.921875, "grad_norm_var": 0.1558990478515625, "learning_rate": 0.0001, "loss": 3.1982, "loss/crossentropy": 2.295030379295349, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.24311772882938384, "loss/reg": 0.0, "step": 14990 }, { "epoch": 0.09868421052631579, "grad_norm": 2.234375, "grad_norm_var": 0.3582967122395833, "learning_rate": 0.0001, "loss": 3.2225, "loss/crossentropy": 2.29546434879303, "loss/hidden": 3.05625, "loss/incoh": 0.0, "loss/logits": 0.45574710667133334, "loss/reg": 0.0, "step": 15000 }, { "epoch": 0.09875, "grad_norm": 2.796875, "grad_norm_var": 5.883512115478515, "learning_rate": 0.0001, "loss": 3.1782, "loss/crossentropy": 2.4186841249465942, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.3855607584118843, "loss/reg": 0.0, "step": 15010 }, { "epoch": 0.0988157894736842, "grad_norm": 2.578125, "grad_norm_var": 5.986443837483724, "learning_rate": 0.0001, "loss": 3.0661, "loss/crossentropy": 2.409875476360321, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.23775186911225318, "loss/reg": 0.0, "step": 15020 }, { "epoch": 0.09888157894736842, "grad_norm": 2.21875, "grad_norm_var": 0.055916086832682295, "learning_rate": 0.0001, "loss": 3.1756, "loss/crossentropy": 2.187470281124115, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.2504939392209053, "loss/reg": 0.0, "step": 15030 }, { "epoch": 0.09894736842105263, "grad_norm": 2.296875, "grad_norm_var": 0.058203125, "learning_rate": 0.0001, "loss": 3.2179, "loss/crossentropy": 2.634449529647827, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2524956986308098, "loss/reg": 0.0, "step": 15040 }, { "epoch": 0.09901315789473684, "grad_norm": 2.4375, "grad_norm_var": 0.09206441243489584, "learning_rate": 0.0001, "loss": 3.1804, "loss/crossentropy": 2.3493194222450255, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.24196315556764603, "loss/reg": 0.0, "step": 15050 }, { "epoch": 0.09907894736842106, "grad_norm": 2.328125, "grad_norm_var": 0.0587799072265625, "learning_rate": 0.0001, "loss": 3.1473, "loss/crossentropy": 2.5005852222442626, "loss/hidden": 3.015625, "loss/incoh": 0.0, "loss/logits": 0.2778876781463623, "loss/reg": 0.0, "step": 15060 }, { "epoch": 0.09914473684210526, "grad_norm": 2.890625, "grad_norm_var": 0.07665913899739583, "learning_rate": 0.0001, "loss": 3.2621, "loss/crossentropy": 2.30003308057785, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.28490780740976335, "loss/reg": 0.0, "step": 15070 }, { "epoch": 0.09921052631578947, "grad_norm": 2.78125, "grad_norm_var": 0.07154541015625, "learning_rate": 0.0001, "loss": 3.1472, "loss/crossentropy": 2.2739776968955994, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.2682348355650902, "loss/reg": 0.0, "step": 15080 }, { "epoch": 0.09927631578947368, "grad_norm": 2.484375, "grad_norm_var": 0.044169108072916664, "learning_rate": 0.0001, "loss": 3.1016, "loss/crossentropy": 2.2083258867263793, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23891242742538452, "loss/reg": 0.0, "step": 15090 }, { "epoch": 0.0993421052631579, "grad_norm": 2.34375, "grad_norm_var": 0.0837890625, "learning_rate": 0.0001, "loss": 3.1153, "loss/crossentropy": 2.4516371846199037, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.21450784876942636, "loss/reg": 0.0, "step": 15100 }, { "epoch": 0.09940789473684211, "grad_norm": 2.515625, "grad_norm_var": 0.14197489420572917, "learning_rate": 0.0001, "loss": 3.1489, "loss/crossentropy": 2.2019423693418503, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.278714843839407, "loss/reg": 0.0, "step": 15110 }, { "epoch": 0.09947368421052631, "grad_norm": 1.984375, "grad_norm_var": 0.06102676391601562, "learning_rate": 0.0001, "loss": 3.0573, "loss/crossentropy": 2.0795727133750916, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.24602137356996537, "loss/reg": 0.0, "step": 15120 }, { "epoch": 0.09953947368421052, "grad_norm": 2.484375, "grad_norm_var": 0.04859619140625, "learning_rate": 0.0001, "loss": 3.1684, "loss/crossentropy": 2.0709302008152006, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.27385044246912005, "loss/reg": 0.0, "step": 15130 }, { "epoch": 0.09960526315789474, "grad_norm": 3.359375, "grad_norm_var": 0.10572509765625, "learning_rate": 0.0001, "loss": 3.2319, "loss/crossentropy": 2.1219942808151244, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.31611712723970414, "loss/reg": 0.0, "step": 15140 }, { "epoch": 0.09967105263157895, "grad_norm": 2.265625, "grad_norm_var": 0.30730794270833334, "learning_rate": 0.0001, "loss": 3.3146, "loss/crossentropy": 2.36390962600708, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.2202586129307747, "loss/reg": 0.0, "step": 15150 }, { "epoch": 0.09973684210526315, "grad_norm": 3.015625, "grad_norm_var": 0.3680948893229167, "learning_rate": 0.0001, "loss": 3.1903, "loss/crossentropy": 2.592810094356537, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.31535505652427676, "loss/reg": 0.0, "step": 15160 }, { "epoch": 0.09980263157894737, "grad_norm": 2.171875, "grad_norm_var": 0.06611226399739584, "learning_rate": 0.0001, "loss": 3.1583, "loss/crossentropy": 2.4805197715759277, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.25672143548727033, "loss/reg": 0.0, "step": 15170 }, { "epoch": 0.09986842105263158, "grad_norm": 2.1875, "grad_norm_var": 0.026691691080729166, "learning_rate": 0.0001, "loss": 3.0654, "loss/crossentropy": 2.428369462490082, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2483712613582611, "loss/reg": 0.0, "step": 15180 }, { "epoch": 0.09993421052631579, "grad_norm": 2.203125, "grad_norm_var": 0.16633707682291668, "learning_rate": 0.0001, "loss": 3.2011, "loss/crossentropy": 1.8888699412345886, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.25502131283283236, "loss/reg": 0.0, "step": 15190 }, { "epoch": 0.1, "grad_norm": 2.515625, "grad_norm_var": 0.09675191243489584, "learning_rate": 0.0001, "loss": 3.1893, "loss/crossentropy": 2.1418832421302794, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.2491638869047165, "loss/reg": 0.0, "step": 15200 }, { "epoch": 0.1000657894736842, "grad_norm": 2.46875, "grad_norm_var": 0.08440755208333334, "learning_rate": 0.0001, "loss": 3.2207, "loss/crossentropy": 2.5305609703063965, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.28034003674983976, "loss/reg": 0.0, "step": 15210 }, { "epoch": 0.10013157894736842, "grad_norm": 2.703125, "grad_norm_var": 0.10357666015625, "learning_rate": 0.0001, "loss": 3.2279, "loss/crossentropy": 2.1161023378372192, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.2513925403356552, "loss/reg": 0.0, "step": 15220 }, { "epoch": 0.10019736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.2027008056640625, "learning_rate": 0.0001, "loss": 3.2361, "loss/crossentropy": 2.064685332775116, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.28131103515625, "loss/reg": 0.0, "step": 15230 }, { "epoch": 0.10026315789473685, "grad_norm": 2.28125, "grad_norm_var": 0.14849853515625, "learning_rate": 0.0001, "loss": 3.151, "loss/crossentropy": 2.4768277525901796, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.24746784269809724, "loss/reg": 0.0, "step": 15240 }, { "epoch": 0.10032894736842106, "grad_norm": 2.15625, "grad_norm_var": 0.07953999837239584, "learning_rate": 0.0001, "loss": 3.1794, "loss/crossentropy": 2.2046443462371825, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.26957911550998687, "loss/reg": 0.0, "step": 15250 }, { "epoch": 0.10039473684210526, "grad_norm": 2.484375, "grad_norm_var": 0.12280985514322916, "learning_rate": 0.0001, "loss": 3.0606, "loss/crossentropy": 2.418226730823517, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.21895384043455124, "loss/reg": 0.0, "step": 15260 }, { "epoch": 0.10046052631578947, "grad_norm": 2.46875, "grad_norm_var": 0.09387613932291666, "learning_rate": 0.0001, "loss": 3.1144, "loss/crossentropy": 2.1524213790893554, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.23847964853048326, "loss/reg": 0.0, "step": 15270 }, { "epoch": 0.10052631578947369, "grad_norm": 2.859375, "grad_norm_var": 0.12705459594726562, "learning_rate": 0.0001, "loss": 3.0626, "loss/crossentropy": 1.9326023817062379, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.24662483483552933, "loss/reg": 0.0, "step": 15280 }, { "epoch": 0.1005921052631579, "grad_norm": 2.640625, "grad_norm_var": 8.221414947509766, "learning_rate": 0.0001, "loss": 3.2248, "loss/crossentropy": 2.2926568508148195, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.26595802754163744, "loss/reg": 0.0, "step": 15290 }, { "epoch": 0.1006578947368421, "grad_norm": 2.71875, "grad_norm_var": 8.17893778483073, "learning_rate": 0.0001, "loss": 3.1429, "loss/crossentropy": 2.4002704977989198, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.2911446109414101, "loss/reg": 0.0, "step": 15300 }, { "epoch": 0.10072368421052631, "grad_norm": 2.1875, "grad_norm_var": 0.03648681640625, "learning_rate": 0.0001, "loss": 3.1424, "loss/crossentropy": 2.1603388369083403, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.27493633329868317, "loss/reg": 0.0, "step": 15310 }, { "epoch": 0.10078947368421053, "grad_norm": 3.96875, "grad_norm_var": 0.1912506103515625, "learning_rate": 0.0001, "loss": 3.133, "loss/crossentropy": 2.279471695423126, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.23778771311044694, "loss/reg": 0.0, "step": 15320 }, { "epoch": 0.10085526315789474, "grad_norm": 2.140625, "grad_norm_var": 0.2115875244140625, "learning_rate": 0.0001, "loss": 3.0811, "loss/crossentropy": 2.1736138820648194, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.19810649901628494, "loss/reg": 0.0, "step": 15330 }, { "epoch": 0.10092105263157895, "grad_norm": 2.453125, "grad_norm_var": 0.05100809733072917, "learning_rate": 0.0001, "loss": 3.1508, "loss/crossentropy": 2.2725695729255677, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.2860790088772774, "loss/reg": 0.0, "step": 15340 }, { "epoch": 0.10098684210526315, "grad_norm": 2.6875, "grad_norm_var": 2.056571451822917, "learning_rate": 0.0001, "loss": 3.2468, "loss/crossentropy": 2.215101981163025, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.27275211960077284, "loss/reg": 0.0, "step": 15350 }, { "epoch": 0.10105263157894737, "grad_norm": 2.15625, "grad_norm_var": 2.1172159830729167, "learning_rate": 0.0001, "loss": 3.1519, "loss/crossentropy": 2.416505420207977, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.26659835278987887, "loss/reg": 0.0, "step": 15360 }, { "epoch": 0.10111842105263158, "grad_norm": 2.234375, "grad_norm_var": 0.051708984375, "learning_rate": 0.0001, "loss": 3.0702, "loss/crossentropy": 2.3915260195732118, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.23197022825479507, "loss/reg": 0.0, "step": 15370 }, { "epoch": 0.1011842105263158, "grad_norm": 2.359375, "grad_norm_var": 0.03733622233072917, "learning_rate": 0.0001, "loss": 3.1794, "loss/crossentropy": 2.4574419140815733, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.34623306542634963, "loss/reg": 0.0, "step": 15380 }, { "epoch": 0.10125, "grad_norm": 2.3125, "grad_norm_var": 0.229052734375, "learning_rate": 0.0001, "loss": 3.1256, "loss/crossentropy": 2.1932177782058715, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.22062241584062575, "loss/reg": 0.0, "step": 15390 }, { "epoch": 0.1013157894736842, "grad_norm": 2.234375, "grad_norm_var": 0.08741861979166667, "learning_rate": 0.0001, "loss": 3.1753, "loss/crossentropy": 2.2610700845718386, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.22664773613214492, "loss/reg": 0.0, "step": 15400 }, { "epoch": 0.10138157894736842, "grad_norm": 2.546875, "grad_norm_var": 0.02672119140625, "learning_rate": 0.0001, "loss": 3.1537, "loss/crossentropy": 2.394396644830704, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.25834341049194337, "loss/reg": 0.0, "step": 15410 }, { "epoch": 0.10144736842105263, "grad_norm": 2.484375, "grad_norm_var": 0.034895833333333334, "learning_rate": 0.0001, "loss": 3.1834, "loss/crossentropy": 2.290709137916565, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.2504901379346848, "loss/reg": 0.0, "step": 15420 }, { "epoch": 0.10151315789473685, "grad_norm": 2.734375, "grad_norm_var": 5.647508748372396, "learning_rate": 0.0001, "loss": 3.1705, "loss/crossentropy": 2.597213554382324, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.2691803678870201, "loss/reg": 0.0, "step": 15430 }, { "epoch": 0.10157894736842105, "grad_norm": 2.125, "grad_norm_var": 0.07888997395833333, "learning_rate": 0.0001, "loss": 3.1069, "loss/crossentropy": 2.1887213230133056, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2691598206758499, "loss/reg": 0.0, "step": 15440 }, { "epoch": 0.10164473684210526, "grad_norm": 3.109375, "grad_norm_var": 0.0681793212890625, "learning_rate": 0.0001, "loss": 3.1639, "loss/crossentropy": 2.097235471010208, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2574038878083229, "loss/reg": 0.0, "step": 15450 }, { "epoch": 0.10171052631578947, "grad_norm": 2.125, "grad_norm_var": 0.0678863525390625, "learning_rate": 0.0001, "loss": 3.1301, "loss/crossentropy": 2.2443934082984924, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.24395571500062943, "loss/reg": 0.0, "step": 15460 }, { "epoch": 0.10177631578947369, "grad_norm": 2.265625, "grad_norm_var": 0.1054595947265625, "learning_rate": 0.0001, "loss": 3.2604, "loss/crossentropy": 2.429443156719208, "loss/hidden": 2.990625, "loss/incoh": 0.0, "loss/logits": 0.2746900498867035, "loss/reg": 0.0, "step": 15470 }, { "epoch": 0.1018421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.1326080322265625, "learning_rate": 0.0001, "loss": 3.221, "loss/crossentropy": 2.423279583454132, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.27492027878761294, "loss/reg": 0.0, "step": 15480 }, { "epoch": 0.1019078947368421, "grad_norm": 2.484375, "grad_norm_var": 0.1138092041015625, "learning_rate": 0.0001, "loss": 3.257, "loss/crossentropy": 2.648502016067505, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.2935266062617302, "loss/reg": 0.0, "step": 15490 }, { "epoch": 0.10197368421052631, "grad_norm": 2.40625, "grad_norm_var": 0.1727203369140625, "learning_rate": 0.0001, "loss": 3.1197, "loss/crossentropy": 2.485008704662323, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2535603493452072, "loss/reg": 0.0, "step": 15500 }, { "epoch": 0.10203947368421053, "grad_norm": 2.84375, "grad_norm_var": 0.18816731770833334, "learning_rate": 0.0001, "loss": 3.1804, "loss/crossentropy": 2.180854117870331, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.24670835435390473, "loss/reg": 0.0, "step": 15510 }, { "epoch": 0.10210526315789474, "grad_norm": 2.453125, "grad_norm_var": 0.07157796223958333, "learning_rate": 0.0001, "loss": 3.1547, "loss/crossentropy": 2.5997716546058656, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.2878054201602936, "loss/reg": 0.0, "step": 15520 }, { "epoch": 0.10217105263157895, "grad_norm": 2.5625, "grad_norm_var": 0.154443359375, "learning_rate": 0.0001, "loss": 3.1694, "loss/crossentropy": 2.3764194369316103, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.24656722396612168, "loss/reg": 0.0, "step": 15530 }, { "epoch": 0.10223684210526315, "grad_norm": 2.328125, "grad_norm_var": 0.25536702473958334, "learning_rate": 0.0001, "loss": 3.1474, "loss/crossentropy": 1.9735815048217773, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.24598835706710814, "loss/reg": 0.0, "step": 15540 }, { "epoch": 0.10230263157894737, "grad_norm": 2.8125, "grad_norm_var": 0.1596832275390625, "learning_rate": 0.0001, "loss": 3.1522, "loss/crossentropy": 2.525629758834839, "loss/hidden": 2.9921875, "loss/incoh": 0.0, "loss/logits": 0.33703051060438155, "loss/reg": 0.0, "step": 15550 }, { "epoch": 0.10236842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.1472808837890625, "learning_rate": 0.0001, "loss": 3.212, "loss/crossentropy": 2.03394900560379, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.24415955394506456, "loss/reg": 0.0, "step": 15560 }, { "epoch": 0.1024342105263158, "grad_norm": 2.3125, "grad_norm_var": 0.07866923014322917, "learning_rate": 0.0001, "loss": 3.122, "loss/crossentropy": 2.0943363308906555, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.23014722466468812, "loss/reg": 0.0, "step": 15570 }, { "epoch": 0.1025, "grad_norm": 2.359375, "grad_norm_var": 0.057389322916666666, "learning_rate": 0.0001, "loss": 3.1268, "loss/crossentropy": 2.033569025993347, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2175581559538841, "loss/reg": 0.0, "step": 15580 }, { "epoch": 0.10256578947368421, "grad_norm": 2.46875, "grad_norm_var": 0.08776041666666666, "learning_rate": 0.0001, "loss": 3.0706, "loss/crossentropy": 2.6018026471138, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2438505232334137, "loss/reg": 0.0, "step": 15590 }, { "epoch": 0.10263157894736842, "grad_norm": 2.078125, "grad_norm_var": 0.1537994384765625, "learning_rate": 0.0001, "loss": 3.1439, "loss/crossentropy": 2.196335256099701, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.2767083361744881, "loss/reg": 0.0, "step": 15600 }, { "epoch": 0.10269736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.07605692545572916, "learning_rate": 0.0001, "loss": 3.1523, "loss/crossentropy": 2.326652777194977, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2691790975630283, "loss/reg": 0.0, "step": 15610 }, { "epoch": 0.10276315789473685, "grad_norm": 2.21875, "grad_norm_var": 0.09851455688476562, "learning_rate": 0.0001, "loss": 3.1315, "loss/crossentropy": 2.219775491952896, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2539939820766449, "loss/reg": 0.0, "step": 15620 }, { "epoch": 0.10282894736842105, "grad_norm": 2.078125, "grad_norm_var": 0.14008560180664062, "learning_rate": 0.0001, "loss": 3.1501, "loss/crossentropy": 2.284542143344879, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.27733145356178285, "loss/reg": 0.0, "step": 15630 }, { "epoch": 0.10289473684210526, "grad_norm": 2.578125, "grad_norm_var": 0.10869115193684896, "learning_rate": 0.0001, "loss": 3.1621, "loss/crossentropy": 2.4228519797325134, "loss/hidden": 3.08125, "loss/incoh": 0.0, "loss/logits": 0.32454578429460523, "loss/reg": 0.0, "step": 15640 }, { "epoch": 0.10296052631578947, "grad_norm": 2.21875, "grad_norm_var": 0.11193211873372395, "learning_rate": 0.0001, "loss": 3.1631, "loss/crossentropy": 2.3004646062850953, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.2865068309009075, "loss/reg": 0.0, "step": 15650 }, { "epoch": 0.10302631578947369, "grad_norm": 2.203125, "grad_norm_var": 0.08788655598958334, "learning_rate": 0.0001, "loss": 3.2413, "loss/crossentropy": 2.3326223611831667, "loss/hidden": 3.0703125, "loss/incoh": 0.0, "loss/logits": 0.2786666050553322, "loss/reg": 0.0, "step": 15660 }, { "epoch": 0.1030921052631579, "grad_norm": 2.0625, "grad_norm_var": 0.05933329264322917, "learning_rate": 0.0001, "loss": 3.0799, "loss/crossentropy": 2.308107304573059, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.23495433181524278, "loss/reg": 0.0, "step": 15670 }, { "epoch": 0.1031578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.04383926391601563, "learning_rate": 0.0001, "loss": 3.0889, "loss/crossentropy": 2.3051168084144593, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.278233802318573, "loss/reg": 0.0, "step": 15680 }, { "epoch": 0.10322368421052631, "grad_norm": 2.234375, "grad_norm_var": 1.4827247619628907, "learning_rate": 0.0001, "loss": 3.1864, "loss/crossentropy": 2.2644376397132873, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.22414357215166092, "loss/reg": 0.0, "step": 15690 }, { "epoch": 0.10328947368421053, "grad_norm": 2.296875, "grad_norm_var": 0.962451171875, "learning_rate": 0.0001, "loss": 3.1546, "loss/crossentropy": 2.424470007419586, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.24368802309036255, "loss/reg": 0.0, "step": 15700 }, { "epoch": 0.10335526315789474, "grad_norm": 2.328125, "grad_norm_var": 0.07737223307291667, "learning_rate": 0.0001, "loss": 3.1624, "loss/crossentropy": 2.459938275814056, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23296410143375396, "loss/reg": 0.0, "step": 15710 }, { "epoch": 0.10342105263157894, "grad_norm": 2.59375, "grad_norm_var": 0.06443583170572917, "learning_rate": 0.0001, "loss": 3.1932, "loss/crossentropy": 2.2153470873832704, "loss/hidden": 3.146875, "loss/incoh": 0.0, "loss/logits": 0.33586266040802004, "loss/reg": 0.0, "step": 15720 }, { "epoch": 0.10348684210526315, "grad_norm": 3.53125, "grad_norm_var": 0.17069905598958332, "learning_rate": 0.0001, "loss": 3.257, "loss/crossentropy": 2.2473284363746644, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.24566184133291244, "loss/reg": 0.0, "step": 15730 }, { "epoch": 0.10355263157894737, "grad_norm": 2.78125, "grad_norm_var": 0.163916015625, "learning_rate": 0.0001, "loss": 3.0707, "loss/crossentropy": 2.620988368988037, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.29941551238298414, "loss/reg": 0.0, "step": 15740 }, { "epoch": 0.10361842105263158, "grad_norm": 2.703125, "grad_norm_var": 0.35409749348958336, "learning_rate": 0.0001, "loss": 3.2423, "loss/crossentropy": 2.3977234601974486, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.26520802527666093, "loss/reg": 0.0, "step": 15750 }, { "epoch": 0.1036842105263158, "grad_norm": 2.375, "grad_norm_var": 3.301877391050758e+17, "learning_rate": 0.0001, "loss": 3.2653, "loss/crossentropy": 2.3854068875312806, "loss/hidden": 3.8296875, "loss/incoh": 0.0, "loss/logits": 0.3163578942418098, "loss/reg": 0.0, "step": 15760 }, { "epoch": 0.10375, "grad_norm": 3.09375, "grad_norm_var": 3.301877391679248e+17, "learning_rate": 0.0001, "loss": 3.1843, "loss/crossentropy": 2.2795175433158876, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.26798346936702727, "loss/reg": 0.0, "step": 15770 }, { "epoch": 0.10381578947368421, "grad_norm": 2.578125, "grad_norm_var": 0.1086334228515625, "learning_rate": 0.0001, "loss": 3.1522, "loss/crossentropy": 2.125895881652832, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.23809853047132493, "loss/reg": 0.0, "step": 15780 }, { "epoch": 0.10388157894736842, "grad_norm": 2.21875, "grad_norm_var": 0.09806289672851562, "learning_rate": 0.0001, "loss": 3.121, "loss/crossentropy": 2.3405375838279725, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.23903141915798187, "loss/reg": 0.0, "step": 15790 }, { "epoch": 0.10394736842105264, "grad_norm": 2.1875, "grad_norm_var": 0.05409927368164062, "learning_rate": 0.0001, "loss": 3.1165, "loss/crossentropy": 2.371259605884552, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.24798257499933243, "loss/reg": 0.0, "step": 15800 }, { "epoch": 0.10401315789473685, "grad_norm": 2.625, "grad_norm_var": 0.046727498372395836, "learning_rate": 0.0001, "loss": 3.0808, "loss/crossentropy": 2.5483869433403017, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.2987742185592651, "loss/reg": 0.0, "step": 15810 }, { "epoch": 0.10407894736842105, "grad_norm": 2.8125, "grad_norm_var": 0.43404541015625, "learning_rate": 0.0001, "loss": 3.1913, "loss/crossentropy": 2.3843488097190857, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.41426307857036593, "loss/reg": 0.0, "step": 15820 }, { "epoch": 0.10414473684210526, "grad_norm": 2.1875, "grad_norm_var": 0.42040913899739585, "learning_rate": 0.0001, "loss": 3.1412, "loss/crossentropy": 2.451664757728577, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.2949830338358879, "loss/reg": 0.0, "step": 15830 }, { "epoch": 0.10421052631578948, "grad_norm": 2.046875, "grad_norm_var": 12064558770995.855, "learning_rate": 0.0001, "loss": 3.2876, "loss/crossentropy": 2.540658712387085, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.3069792494177818, "loss/reg": 0.0, "step": 15840 }, { "epoch": 0.10427631578947369, "grad_norm": 2.59375, "grad_norm_var": 0.26617431640625, "learning_rate": 0.0001, "loss": 3.1409, "loss/crossentropy": 2.3011590003967286, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23567215949296952, "loss/reg": 0.0, "step": 15850 }, { "epoch": 0.10434210526315789, "grad_norm": 2.21875, "grad_norm_var": 0.0716217041015625, "learning_rate": 0.0001, "loss": 3.2032, "loss/crossentropy": 2.624694299697876, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.2735205709934235, "loss/reg": 0.0, "step": 15860 }, { "epoch": 0.1044078947368421, "grad_norm": 2.671875, "grad_norm_var": 0.037018839518229166, "learning_rate": 0.0001, "loss": 3.1547, "loss/crossentropy": 2.3426929831504824, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.26750322580337527, "loss/reg": 0.0, "step": 15870 }, { "epoch": 0.10447368421052632, "grad_norm": 2.625, "grad_norm_var": 0.0340728759765625, "learning_rate": 0.0001, "loss": 3.1257, "loss/crossentropy": 2.0538764238357543, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.23220817893743514, "loss/reg": 0.0, "step": 15880 }, { "epoch": 0.10453947368421053, "grad_norm": 2.484375, "grad_norm_var": 0.10313084920247396, "learning_rate": 0.0001, "loss": 3.0791, "loss/crossentropy": 2.4493046522140505, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.264581099152565, "loss/reg": 0.0, "step": 15890 }, { "epoch": 0.10460526315789474, "grad_norm": 2.140625, "grad_norm_var": 0.08592910766601562, "learning_rate": 0.0001, "loss": 3.1153, "loss/crossentropy": 2.384776270389557, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.20801746100187302, "loss/reg": 0.0, "step": 15900 }, { "epoch": 0.10467105263157894, "grad_norm": 2.203125, "grad_norm_var": 0.03535054524739583, "learning_rate": 0.0001, "loss": 3.1818, "loss/crossentropy": 2.2780667304992677, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.33164310455322266, "loss/reg": 0.0, "step": 15910 }, { "epoch": 0.10473684210526316, "grad_norm": 2.3125, "grad_norm_var": 0.04937744140625, "learning_rate": 0.0001, "loss": 3.1317, "loss/crossentropy": 2.3224631786346435, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.26515267193317416, "loss/reg": 0.0, "step": 15920 }, { "epoch": 0.10480263157894737, "grad_norm": 3.375, "grad_norm_var": 0.09521077473958334, "learning_rate": 0.0001, "loss": 3.1393, "loss/crossentropy": 2.26437486410141, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.25974428951740264, "loss/reg": 0.0, "step": 15930 }, { "epoch": 0.10486842105263158, "grad_norm": 2.265625, "grad_norm_var": 0.10048828125, "learning_rate": 0.0001, "loss": 3.203, "loss/crossentropy": 2.216641104221344, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.3172867178916931, "loss/reg": 0.0, "step": 15940 }, { "epoch": 0.10493421052631578, "grad_norm": 2.15625, "grad_norm_var": 0.0614898681640625, "learning_rate": 0.0001, "loss": 3.0768, "loss/crossentropy": 2.6155009508132934, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2578451469540596, "loss/reg": 0.0, "step": 15950 }, { "epoch": 0.105, "grad_norm": 2.234375, "grad_norm_var": 0.060269927978515624, "learning_rate": 0.0001, "loss": 3.161, "loss/crossentropy": 2.5140405654907227, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2738805189728737, "loss/reg": 0.0, "step": 15960 }, { "epoch": 0.10506578947368421, "grad_norm": 2.59375, "grad_norm_var": 0.050176747639973956, "learning_rate": 0.0001, "loss": 3.0829, "loss/crossentropy": 2.133234918117523, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2213057592511177, "loss/reg": 0.0, "step": 15970 }, { "epoch": 0.10513157894736842, "grad_norm": 2.515625, "grad_norm_var": 0.18106180826822918, "learning_rate": 0.0001, "loss": 3.2666, "loss/crossentropy": 2.489140582084656, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.2818035438656807, "loss/reg": 0.0, "step": 15980 }, { "epoch": 0.10519736842105264, "grad_norm": 2.390625, "grad_norm_var": 0.6164947509765625, "learning_rate": 0.0001, "loss": 3.1722, "loss/crossentropy": 2.51891051530838, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2563257798552513, "loss/reg": 0.0, "step": 15990 }, { "epoch": 0.10526315789473684, "grad_norm": 2.75, "grad_norm_var": 0.8845987955729167, "learning_rate": 0.0001, "loss": 3.1763, "loss/crossentropy": 2.2177582025527953, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.2367611363530159, "loss/reg": 0.0, "step": 16000 }, { "epoch": 0.10532894736842105, "grad_norm": 2.5625, "grad_norm_var": 0.451318359375, "learning_rate": 0.0001, "loss": 3.1408, "loss/crossentropy": 2.2017379879951475, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.20062217488884926, "loss/reg": 0.0, "step": 16010 }, { "epoch": 0.10539473684210526, "grad_norm": 2.390625, "grad_norm_var": 0.050439453125, "learning_rate": 0.0001, "loss": 3.1466, "loss/crossentropy": 2.2292946934700013, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.2783455640077591, "loss/reg": 0.0, "step": 16020 }, { "epoch": 0.10546052631578948, "grad_norm": 1.984375, "grad_norm_var": 0.1126129150390625, "learning_rate": 0.0001, "loss": 3.1018, "loss/crossentropy": 2.530960404872894, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.23581028431653978, "loss/reg": 0.0, "step": 16030 }, { "epoch": 0.10552631578947369, "grad_norm": 2.109375, "grad_norm_var": 0.08625386555989584, "learning_rate": 0.0001, "loss": 3.1222, "loss/crossentropy": 2.432818961143494, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.23912513256072998, "loss/reg": 0.0, "step": 16040 }, { "epoch": 0.10559210526315789, "grad_norm": 2.234375, "grad_norm_var": 0.1727203369140625, "learning_rate": 0.0001, "loss": 3.1108, "loss/crossentropy": 2.481075167655945, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2601512670516968, "loss/reg": 0.0, "step": 16050 }, { "epoch": 0.1056578947368421, "grad_norm": 2.171875, "grad_norm_var": 0.22974853515625, "learning_rate": 0.0001, "loss": 3.1102, "loss/crossentropy": 2.347836995124817, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.2967532381415367, "loss/reg": 0.0, "step": 16060 }, { "epoch": 0.10572368421052632, "grad_norm": 2.421875, "grad_norm_var": 3.920637003580729, "learning_rate": 0.0001, "loss": 3.1789, "loss/crossentropy": 2.2461806178092956, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.2796615108847618, "loss/reg": 0.0, "step": 16070 }, { "epoch": 0.10578947368421053, "grad_norm": 3.234375, "grad_norm_var": 3.8184529622395833, "learning_rate": 0.0001, "loss": 3.2028, "loss/crossentropy": 2.2602067947387696, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.27418701648712157, "loss/reg": 0.0, "step": 16080 }, { "epoch": 0.10585526315789473, "grad_norm": 2.546875, "grad_norm_var": 0.0978912353515625, "learning_rate": 0.0001, "loss": 3.087, "loss/crossentropy": 2.4076030969619753, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.2339254654943943, "loss/reg": 0.0, "step": 16090 }, { "epoch": 0.10592105263157894, "grad_norm": 8.8125, "grad_norm_var": 2.634016927083333, "learning_rate": 0.0001, "loss": 3.2404, "loss/crossentropy": 1.9294680893421172, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.24796108528971672, "loss/reg": 0.0, "step": 16100 }, { "epoch": 0.10598684210526316, "grad_norm": 2.5, "grad_norm_var": 2.7831013997395835, "learning_rate": 0.0001, "loss": 3.1866, "loss/crossentropy": 1.8769205152988433, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.2611444815993309, "loss/reg": 0.0, "step": 16110 }, { "epoch": 0.10605263157894737, "grad_norm": 2.703125, "grad_norm_var": 0.76617431640625, "learning_rate": 0.0001, "loss": 3.2357, "loss/crossentropy": 2.2888787627220153, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.2839828670024872, "loss/reg": 0.0, "step": 16120 }, { "epoch": 0.10611842105263158, "grad_norm": 2.359375, "grad_norm_var": 0.04062093098958333, "learning_rate": 0.0001, "loss": 3.1608, "loss/crossentropy": 2.313342797756195, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2872613161802292, "loss/reg": 0.0, "step": 16130 }, { "epoch": 0.10618421052631578, "grad_norm": 2.28125, "grad_norm_var": 0.4191691080729167, "learning_rate": 0.0001, "loss": 3.1753, "loss/crossentropy": 2.6026643037796022, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.28212335854768755, "loss/reg": 0.0, "step": 16140 }, { "epoch": 0.10625, "grad_norm": 2.4375, "grad_norm_var": 0.41529541015625, "learning_rate": 0.0001, "loss": 3.1346, "loss/crossentropy": 2.556915271282196, "loss/hidden": 3.1078125, "loss/incoh": 0.0, "loss/logits": 0.2837150752544403, "loss/reg": 0.0, "step": 16150 }, { "epoch": 0.10631578947368421, "grad_norm": 2.75, "grad_norm_var": 0.053441365559895836, "learning_rate": 0.0001, "loss": 3.1471, "loss/crossentropy": 2.399764931201935, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.2460502192378044, "loss/reg": 0.0, "step": 16160 }, { "epoch": 0.10638157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.8765462239583334, "learning_rate": 0.0001, "loss": 3.2831, "loss/crossentropy": 2.2979734420776365, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.2876197725534439, "loss/reg": 0.0, "step": 16170 }, { "epoch": 0.10644736842105264, "grad_norm": 2.4375, "grad_norm_var": 0.9296834309895833, "learning_rate": 0.0001, "loss": 3.2187, "loss/crossentropy": 2.2085362553596495, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.24199980795383452, "loss/reg": 0.0, "step": 16180 }, { "epoch": 0.10651315789473684, "grad_norm": 2.234375, "grad_norm_var": 0.06541239420572917, "learning_rate": 0.0001, "loss": 3.1524, "loss/crossentropy": 2.4575427293777468, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2581774353981018, "loss/reg": 0.0, "step": 16190 }, { "epoch": 0.10657894736842105, "grad_norm": 3.34375, "grad_norm_var": 0.2840983072916667, "learning_rate": 0.0001, "loss": 3.2356, "loss/crossentropy": 2.444900369644165, "loss/hidden": 3.1625, "loss/incoh": 0.0, "loss/logits": 0.295256008207798, "loss/reg": 0.0, "step": 16200 }, { "epoch": 0.10664473684210526, "grad_norm": 2.15625, "grad_norm_var": 0.27533137003580727, "learning_rate": 0.0001, "loss": 3.0994, "loss/crossentropy": 2.5305007696151733, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.2888329938054085, "loss/reg": 0.0, "step": 16210 }, { "epoch": 0.10671052631578948, "grad_norm": 2.609375, "grad_norm_var": 0.6374224344889323, "learning_rate": 0.0001, "loss": 3.1837, "loss/crossentropy": 2.394989788532257, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.22691280096769334, "loss/reg": 0.0, "step": 16220 }, { "epoch": 0.10677631578947368, "grad_norm": 2.546875, "grad_norm_var": 0.6025950113932291, "learning_rate": 0.0001, "loss": 3.1035, "loss/crossentropy": 2.3761568784713747, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.25212556272745135, "loss/reg": 0.0, "step": 16230 }, { "epoch": 0.10684210526315789, "grad_norm": 2.390625, "grad_norm_var": 0.07968343098958333, "learning_rate": 0.0001, "loss": 3.1507, "loss/crossentropy": 2.1518751621246337, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.25603573620319364, "loss/reg": 0.0, "step": 16240 }, { "epoch": 0.1069078947368421, "grad_norm": 2.03125, "grad_norm_var": 0.0736968994140625, "learning_rate": 0.0001, "loss": 3.0942, "loss/crossentropy": 2.2170523405075073, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.2497616469860077, "loss/reg": 0.0, "step": 16250 }, { "epoch": 0.10697368421052632, "grad_norm": 2.671875, "grad_norm_var": 0.15055338541666666, "learning_rate": 0.0001, "loss": 3.18, "loss/crossentropy": 2.2126652002334595, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.2371530830860138, "loss/reg": 0.0, "step": 16260 }, { "epoch": 0.10703947368421053, "grad_norm": 2.359375, "grad_norm_var": 0.1051666259765625, "learning_rate": 0.0001, "loss": 3.1175, "loss/crossentropy": 2.5849027037620544, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.31244791448116305, "loss/reg": 0.0, "step": 16270 }, { "epoch": 0.10710526315789473, "grad_norm": 2.34375, "grad_norm_var": 0.05666910807291667, "learning_rate": 0.0001, "loss": 3.2064, "loss/crossentropy": 1.9444570660591125, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2180204689502716, "loss/reg": 0.0, "step": 16280 }, { "epoch": 0.10717105263157894, "grad_norm": 2.203125, "grad_norm_var": 0.08806050618489583, "learning_rate": 0.0001, "loss": 3.0722, "loss/crossentropy": 2.3104967713356017, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.2631675943732262, "loss/reg": 0.0, "step": 16290 }, { "epoch": 0.10723684210526316, "grad_norm": 2.28125, "grad_norm_var": 0.018146769205729166, "learning_rate": 0.0001, "loss": 3.0587, "loss/crossentropy": 2.1140322208404543, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2300342008471489, "loss/reg": 0.0, "step": 16300 }, { "epoch": 0.10730263157894737, "grad_norm": 2.40625, "grad_norm_var": 0.036774698893229166, "learning_rate": 0.0001, "loss": 3.2022, "loss/crossentropy": 2.5614047765731813, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2788904532790184, "loss/reg": 0.0, "step": 16310 }, { "epoch": 0.10736842105263159, "grad_norm": 2.734375, "grad_norm_var": 0.21725972493489584, "learning_rate": 0.0001, "loss": 3.1511, "loss/crossentropy": 1.8924081802368165, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2040714904665947, "loss/reg": 0.0, "step": 16320 }, { "epoch": 0.10743421052631578, "grad_norm": 2.234375, "grad_norm_var": 0.23220926920572918, "learning_rate": 0.0001, "loss": 3.1014, "loss/crossentropy": 2.342508816719055, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.27500451505184176, "loss/reg": 0.0, "step": 16330 }, { "epoch": 0.1075, "grad_norm": 2.21875, "grad_norm_var": 0.5662424723307292, "learning_rate": 0.0001, "loss": 3.2912, "loss/crossentropy": 2.3959147095680238, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2863708436489105, "loss/reg": 0.0, "step": 16340 }, { "epoch": 0.10756578947368421, "grad_norm": 2.078125, "grad_norm_var": 0.11278889973958334, "learning_rate": 0.0001, "loss": 3.1532, "loss/crossentropy": 2.166390228271484, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.23770884573459625, "loss/reg": 0.0, "step": 16350 }, { "epoch": 0.10763157894736843, "grad_norm": 2.265625, "grad_norm_var": 0.06780776977539063, "learning_rate": 0.0001, "loss": 3.0399, "loss/crossentropy": 2.3057939767837525, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.2797847852110863, "loss/reg": 0.0, "step": 16360 }, { "epoch": 0.10769736842105262, "grad_norm": 2.390625, "grad_norm_var": 0.038826243082682295, "learning_rate": 0.0001, "loss": 3.12, "loss/crossentropy": 2.265635335445404, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.23047662824392318, "loss/reg": 0.0, "step": 16370 }, { "epoch": 0.10776315789473684, "grad_norm": 2.296875, "grad_norm_var": 0.08754781087239584, "learning_rate": 0.0001, "loss": 3.0758, "loss/crossentropy": 2.6614980459213258, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.23254811465740205, "loss/reg": 0.0, "step": 16380 }, { "epoch": 0.10782894736842105, "grad_norm": 3.390625, "grad_norm_var": 0.14463882446289061, "learning_rate": 0.0001, "loss": 3.1805, "loss/crossentropy": 2.419999623298645, "loss/hidden": 3.0984375, "loss/incoh": 0.0, "loss/logits": 0.3057109400629997, "loss/reg": 0.0, "step": 16390 }, { "epoch": 0.10789473684210527, "grad_norm": 2.234375, "grad_norm_var": 0.12823893229166666, "learning_rate": 0.0001, "loss": 3.1467, "loss/crossentropy": 2.145538020133972, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.23782579749822616, "loss/reg": 0.0, "step": 16400 }, { "epoch": 0.10796052631578948, "grad_norm": 2.40625, "grad_norm_var": 0.07416890462239584, "learning_rate": 0.0001, "loss": 3.1471, "loss/crossentropy": 2.3262511491775513, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.23651919960975648, "loss/reg": 0.0, "step": 16410 }, { "epoch": 0.10802631578947368, "grad_norm": 2.453125, "grad_norm_var": 0.052245076497395834, "learning_rate": 0.0001, "loss": 3.1573, "loss/crossentropy": 2.446836495399475, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.22134366482496262, "loss/reg": 0.0, "step": 16420 }, { "epoch": 0.10809210526315789, "grad_norm": 2.90625, "grad_norm_var": 0.12043355305989584, "learning_rate": 0.0001, "loss": 3.2359, "loss/crossentropy": 2.3113945603370665, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2689176544547081, "loss/reg": 0.0, "step": 16430 }, { "epoch": 0.1081578947368421, "grad_norm": 2.078125, "grad_norm_var": 0.10204671223958334, "learning_rate": 0.0001, "loss": 3.1679, "loss/crossentropy": 2.422752869129181, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.306715852022171, "loss/reg": 0.0, "step": 16440 }, { "epoch": 0.10822368421052632, "grad_norm": 2.296875, "grad_norm_var": 0.06880594889322916, "learning_rate": 0.0001, "loss": 3.147, "loss/crossentropy": 2.048931634426117, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2303798720240593, "loss/reg": 0.0, "step": 16450 }, { "epoch": 0.10828947368421053, "grad_norm": 2.234375, "grad_norm_var": 0.15419514973958334, "learning_rate": 0.0001, "loss": 3.1877, "loss/crossentropy": 2.68409343957901, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.25028681606054304, "loss/reg": 0.0, "step": 16460 }, { "epoch": 0.10835526315789473, "grad_norm": 2.3125, "grad_norm_var": 0.09561258951822917, "learning_rate": 0.0001, "loss": 3.0881, "loss/crossentropy": 2.3087923645973207, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.2732239991426468, "loss/reg": 0.0, "step": 16470 }, { "epoch": 0.10842105263157895, "grad_norm": 2.546875, "grad_norm_var": 0.14474283854166667, "learning_rate": 0.0001, "loss": 3.1675, "loss/crossentropy": 2.3147490501403807, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.25520786345005037, "loss/reg": 0.0, "step": 16480 }, { "epoch": 0.10848684210526316, "grad_norm": 2.21875, "grad_norm_var": 0.12763671875, "learning_rate": 0.0001, "loss": 3.1948, "loss/crossentropy": 2.4046076416969298, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.24635857343673706, "loss/reg": 0.0, "step": 16490 }, { "epoch": 0.10855263157894737, "grad_norm": 3.234375, "grad_norm_var": 0.19675267537434896, "learning_rate": 0.0001, "loss": 3.134, "loss/crossentropy": 2.1358281135559083, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.20830129384994506, "loss/reg": 0.0, "step": 16500 }, { "epoch": 0.10861842105263157, "grad_norm": 2.25, "grad_norm_var": 0.20981216430664062, "learning_rate": 0.0001, "loss": 3.1161, "loss/crossentropy": 2.2100176930427553, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.22965313643217086, "loss/reg": 0.0, "step": 16510 }, { "epoch": 0.10868421052631579, "grad_norm": 2.171875, "grad_norm_var": 0.025813802083333334, "learning_rate": 0.0001, "loss": 3.1402, "loss/crossentropy": 2.1080865144729612, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.24372481554746628, "loss/reg": 0.0, "step": 16520 }, { "epoch": 0.10875, "grad_norm": 2.390625, "grad_norm_var": 0.03843994140625, "learning_rate": 0.0001, "loss": 3.112, "loss/crossentropy": 2.184442663192749, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.25090422928333284, "loss/reg": 0.0, "step": 16530 }, { "epoch": 0.10881578947368421, "grad_norm": 2.46875, "grad_norm_var": 0.03850682576497396, "learning_rate": 0.0001, "loss": 3.0917, "loss/crossentropy": 2.2426281213760375, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.26249563246965407, "loss/reg": 0.0, "step": 16540 }, { "epoch": 0.10888157894736843, "grad_norm": 2.34375, "grad_norm_var": 0.037082672119140625, "learning_rate": 0.0001, "loss": 3.1533, "loss/crossentropy": 2.4359527707099913, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.28122682869434357, "loss/reg": 0.0, "step": 16550 }, { "epoch": 0.10894736842105263, "grad_norm": 2.515625, "grad_norm_var": 0.06422526041666667, "learning_rate": 0.0001, "loss": 3.1086, "loss/crossentropy": 2.679885816574097, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.25820834636688234, "loss/reg": 0.0, "step": 16560 }, { "epoch": 0.10901315789473684, "grad_norm": 2.375, "grad_norm_var": 0.033543904622395836, "learning_rate": 0.0001, "loss": 3.1226, "loss/crossentropy": 2.230025511980057, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.24839217215776443, "loss/reg": 0.0, "step": 16570 }, { "epoch": 0.10907894736842105, "grad_norm": 2.171875, "grad_norm_var": 0.08507486979166666, "learning_rate": 0.0001, "loss": 3.158, "loss/crossentropy": 2.3745269417762755, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.22690069079399108, "loss/reg": 0.0, "step": 16580 }, { "epoch": 0.10914473684210527, "grad_norm": 2.03125, "grad_norm_var": 0.07500712076822917, "learning_rate": 0.0001, "loss": 3.0464, "loss/crossentropy": 2.3726358652114867, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.22285908311605454, "loss/reg": 0.0, "step": 16590 }, { "epoch": 0.10921052631578948, "grad_norm": 2.984375, "grad_norm_var": 0.0905426025390625, "learning_rate": 0.0001, "loss": 3.0802, "loss/crossentropy": 2.297460901737213, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.24248487651348113, "loss/reg": 0.0, "step": 16600 }, { "epoch": 0.10927631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.42377827962239584, "learning_rate": 0.0001, "loss": 3.1369, "loss/crossentropy": 2.1492306351661683, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.25582308024168016, "loss/reg": 0.0, "step": 16610 }, { "epoch": 0.1093421052631579, "grad_norm": 2.703125, "grad_norm_var": 0.38531494140625, "learning_rate": 0.0001, "loss": 3.1291, "loss/crossentropy": 2.1944116175174715, "loss/hidden": 3.025, "loss/incoh": 0.0, "loss/logits": 0.26006165742874143, "loss/reg": 0.0, "step": 16620 }, { "epoch": 0.1094078947368421, "grad_norm": 2.296875, "grad_norm_var": 0.031305948893229164, "learning_rate": 0.0001, "loss": 3.0966, "loss/crossentropy": 2.2580446600914, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.2571749180555344, "loss/reg": 0.0, "step": 16630 }, { "epoch": 0.10947368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.08385009765625, "learning_rate": 0.0001, "loss": 3.0758, "loss/crossentropy": 2.711108660697937, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.23514284193515778, "loss/reg": 0.0, "step": 16640 }, { "epoch": 0.10953947368421052, "grad_norm": 2.53125, "grad_norm_var": 0.39968973795572915, "learning_rate": 0.0001, "loss": 3.212, "loss/crossentropy": 2.318574833869934, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.3364941954612732, "loss/reg": 0.0, "step": 16650 }, { "epoch": 0.10960526315789473, "grad_norm": 2.90625, "grad_norm_var": 0.5502431233723958, "learning_rate": 0.0001, "loss": 3.1398, "loss/crossentropy": 2.5402591228485107, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2698328003287315, "loss/reg": 0.0, "step": 16660 }, { "epoch": 0.10967105263157895, "grad_norm": 2.578125, "grad_norm_var": 0.7628651936848958, "learning_rate": 0.0001, "loss": 3.2125, "loss/crossentropy": 2.4290089428424837, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.23459196090698242, "loss/reg": 0.0, "step": 16670 }, { "epoch": 0.10973684210526316, "grad_norm": 2.546875, "grad_norm_var": 0.5333984375, "learning_rate": 0.0001, "loss": 3.1568, "loss/crossentropy": 2.4715004682540895, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2527153715491295, "loss/reg": 0.0, "step": 16680 }, { "epoch": 0.10980263157894737, "grad_norm": 2.421875, "grad_norm_var": 0.01929931640625, "learning_rate": 0.0001, "loss": 3.1188, "loss/crossentropy": 2.565124809741974, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.2764893934130669, "loss/reg": 0.0, "step": 16690 }, { "epoch": 0.10986842105263157, "grad_norm": 2.609375, "grad_norm_var": 0.7143287658691406, "learning_rate": 0.0001, "loss": 3.1851, "loss/crossentropy": 2.1823901176452636, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.2804734021425247, "loss/reg": 0.0, "step": 16700 }, { "epoch": 0.10993421052631579, "grad_norm": 3.75, "grad_norm_var": 0.800158437093099, "learning_rate": 0.0001, "loss": 3.1599, "loss/crossentropy": 2.339951229095459, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.23388542532920836, "loss/reg": 0.0, "step": 16710 }, { "epoch": 0.11, "grad_norm": 3.0625, "grad_norm_var": 0.46857808430989584, "learning_rate": 0.0001, "loss": 3.168, "loss/crossentropy": 2.5235843658447266, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.25846762359142306, "loss/reg": 0.0, "step": 16720 }, { "epoch": 0.11006578947368421, "grad_norm": 2.34375, "grad_norm_var": 0.2997792561848958, "learning_rate": 0.0001, "loss": 3.199, "loss/crossentropy": 2.4483086824417115, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.24846359938383103, "loss/reg": 0.0, "step": 16730 }, { "epoch": 0.11013157894736843, "grad_norm": 2.703125, "grad_norm_var": 0.0768218994140625, "learning_rate": 0.0001, "loss": 3.1155, "loss/crossentropy": 2.318689703941345, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.25379282981157303, "loss/reg": 0.0, "step": 16740 }, { "epoch": 0.11019736842105263, "grad_norm": 2.328125, "grad_norm_var": 0.04757258097330729, "learning_rate": 0.0001, "loss": 3.0428, "loss/crossentropy": 2.3445772767066955, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.26050496101379395, "loss/reg": 0.0, "step": 16750 }, { "epoch": 0.11026315789473684, "grad_norm": 3.265625, "grad_norm_var": 0.09231669108072917, "learning_rate": 0.0001, "loss": 3.1426, "loss/crossentropy": 2.2978576898574827, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.23892502784729003, "loss/reg": 0.0, "step": 16760 }, { "epoch": 0.11032894736842105, "grad_norm": 2.265625, "grad_norm_var": 0.09435221354166666, "learning_rate": 0.0001, "loss": 3.0687, "loss/crossentropy": 2.15471470952034, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.22209101915359497, "loss/reg": 0.0, "step": 16770 }, { "epoch": 0.11039473684210527, "grad_norm": 2.625, "grad_norm_var": 0.084326171875, "learning_rate": 0.0001, "loss": 3.0934, "loss/crossentropy": 2.0944029092788696, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2732590340077877, "loss/reg": 0.0, "step": 16780 }, { "epoch": 0.11046052631578947, "grad_norm": 2.296875, "grad_norm_var": 0.07172749837239584, "learning_rate": 0.0001, "loss": 3.196, "loss/crossentropy": 2.4396159648895264, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.25805698037147523, "loss/reg": 0.0, "step": 16790 }, { "epoch": 0.11052631578947368, "grad_norm": 2.53125, "grad_norm_var": 0.04521382649739583, "learning_rate": 0.0001, "loss": 3.1492, "loss/crossentropy": 2.3519849300384523, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.24649366587400437, "loss/reg": 0.0, "step": 16800 }, { "epoch": 0.1105921052631579, "grad_norm": 2.90625, "grad_norm_var": 0.06435445149739584, "learning_rate": 0.0001, "loss": 3.1366, "loss/crossentropy": 2.2999491453170777, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2394917294383049, "loss/reg": 0.0, "step": 16810 }, { "epoch": 0.11065789473684211, "grad_norm": 2.3125, "grad_norm_var": 0.09391988118489583, "learning_rate": 0.0001, "loss": 3.1441, "loss/crossentropy": 2.4930548071861267, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.2843811124563217, "loss/reg": 0.0, "step": 16820 }, { "epoch": 0.11072368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.23037007649739583, "learning_rate": 0.0001, "loss": 3.1744, "loss/crossentropy": 2.327615487575531, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.25857883393764497, "loss/reg": 0.0, "step": 16830 }, { "epoch": 0.11078947368421052, "grad_norm": 2.046875, "grad_norm_var": 0.08505859375, "learning_rate": 0.0001, "loss": 3.1966, "loss/crossentropy": 2.0340147018432617, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.22139777690172197, "loss/reg": 0.0, "step": 16840 }, { "epoch": 0.11085526315789473, "grad_norm": 2.359375, "grad_norm_var": 0.04459228515625, "learning_rate": 0.0001, "loss": 3.0559, "loss/crossentropy": 2.327996277809143, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2415407806634903, "loss/reg": 0.0, "step": 16850 }, { "epoch": 0.11092105263157895, "grad_norm": 2.296875, "grad_norm_var": 0.15735677083333333, "learning_rate": 0.0001, "loss": 3.1824, "loss/crossentropy": 2.3824142098426817, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.28444231003522874, "loss/reg": 0.0, "step": 16860 }, { "epoch": 0.11098684210526316, "grad_norm": 2.453125, "grad_norm_var": 0.2694498697916667, "learning_rate": 0.0001, "loss": 3.1079, "loss/crossentropy": 2.4686101198196413, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.23947114795446395, "loss/reg": 0.0, "step": 16870 }, { "epoch": 0.11105263157894738, "grad_norm": 2.53125, "grad_norm_var": 0.2833811442057292, "learning_rate": 0.0001, "loss": 3.0544, "loss/crossentropy": 2.2384608387947083, "loss/hidden": 3.23125, "loss/incoh": 0.0, "loss/logits": 0.29755171537399294, "loss/reg": 0.0, "step": 16880 }, { "epoch": 0.11111842105263157, "grad_norm": 2.484375, "grad_norm_var": 0.11591389973958334, "learning_rate": 0.0001, "loss": 3.1316, "loss/crossentropy": 2.065184140205383, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.22740320414304732, "loss/reg": 0.0, "step": 16890 }, { "epoch": 0.11118421052631579, "grad_norm": 2.46875, "grad_norm_var": 0.14423421223958333, "learning_rate": 0.0001, "loss": 3.1867, "loss/crossentropy": 2.241389238834381, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2434727743268013, "loss/reg": 0.0, "step": 16900 }, { "epoch": 0.11125, "grad_norm": 2.390625, "grad_norm_var": 0.14661458333333333, "learning_rate": 0.0001, "loss": 3.1063, "loss/crossentropy": 2.1571604132652284, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.21824948787689208, "loss/reg": 0.0, "step": 16910 }, { "epoch": 0.11131578947368422, "grad_norm": 2.1875, "grad_norm_var": 0.9939605712890625, "learning_rate": 0.0001, "loss": 3.2191, "loss/crossentropy": 2.4117300748825072, "loss/hidden": 3.2203125, "loss/incoh": 0.0, "loss/logits": 0.33291524201631545, "loss/reg": 0.0, "step": 16920 }, { "epoch": 0.11138157894736841, "grad_norm": 2.46875, "grad_norm_var": 0.9513580322265625, "learning_rate": 0.0001, "loss": 3.2001, "loss/crossentropy": 2.490310883522034, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.2631781131029129, "loss/reg": 0.0, "step": 16930 }, { "epoch": 0.11144736842105263, "grad_norm": 2.421875, "grad_norm_var": 0.12056884765625, "learning_rate": 0.0001, "loss": 3.203, "loss/crossentropy": 2.3007746815681456, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.2449628993868828, "loss/reg": 0.0, "step": 16940 }, { "epoch": 0.11151315789473684, "grad_norm": 2.515625, "grad_norm_var": 0.11777242024739583, "learning_rate": 0.0001, "loss": 3.1245, "loss/crossentropy": 2.3470576763153077, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.24132359698414801, "loss/reg": 0.0, "step": 16950 }, { "epoch": 0.11157894736842106, "grad_norm": 2.265625, "grad_norm_var": 0.05214742024739583, "learning_rate": 0.0001, "loss": 3.115, "loss/crossentropy": 2.4354201793670653, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.22888378351926802, "loss/reg": 0.0, "step": 16960 }, { "epoch": 0.11164473684210527, "grad_norm": 2.3125, "grad_norm_var": 0.07139383951822917, "learning_rate": 0.0001, "loss": 3.1537, "loss/crossentropy": 2.1988754749298094, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.22949053347110748, "loss/reg": 0.0, "step": 16970 }, { "epoch": 0.11171052631578947, "grad_norm": 2.375, "grad_norm_var": 0.06669921875, "learning_rate": 0.0001, "loss": 3.113, "loss/crossentropy": 2.2436501502990724, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.26571269184350965, "loss/reg": 0.0, "step": 16980 }, { "epoch": 0.11177631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.1141998291015625, "learning_rate": 0.0001, "loss": 3.2373, "loss/crossentropy": 2.0406009197235107, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.2788648784160614, "loss/reg": 0.0, "step": 16990 }, { "epoch": 0.1118421052631579, "grad_norm": 3.5, "grad_norm_var": 0.13699544270833333, "learning_rate": 0.0001, "loss": 3.1371, "loss/crossentropy": 2.282533049583435, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.27586452960968016, "loss/reg": 0.0, "step": 17000 }, { "epoch": 0.11190789473684211, "grad_norm": 2.390625, "grad_norm_var": 0.13681233723958333, "learning_rate": 0.0001, "loss": 3.1898, "loss/crossentropy": 2.192341995239258, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.2728851273655891, "loss/reg": 0.0, "step": 17010 }, { "epoch": 0.11197368421052632, "grad_norm": 2.9375, "grad_norm_var": 0.08626302083333333, "learning_rate": 0.0001, "loss": 3.1208, "loss/crossentropy": 2.036851680278778, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.29492041319608686, "loss/reg": 0.0, "step": 17020 }, { "epoch": 0.11203947368421052, "grad_norm": 3.171875, "grad_norm_var": 0.38601888020833336, "learning_rate": 0.0001, "loss": 3.1351, "loss/crossentropy": 2.013105309009552, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.21977066546678542, "loss/reg": 0.0, "step": 17030 }, { "epoch": 0.11210526315789474, "grad_norm": 2.515625, "grad_norm_var": 0.515862782796224, "learning_rate": 0.0001, "loss": 3.0822, "loss/crossentropy": 2.2523082733154296, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.23816863894462587, "loss/reg": 0.0, "step": 17040 }, { "epoch": 0.11217105263157895, "grad_norm": 2.4375, "grad_norm_var": 0.13596598307291666, "learning_rate": 0.0001, "loss": 3.0894, "loss/crossentropy": 2.474255657196045, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.28221793919801713, "loss/reg": 0.0, "step": 17050 }, { "epoch": 0.11223684210526316, "grad_norm": 2.375, "grad_norm_var": 2.8374436804564963e+17, "learning_rate": 0.0001, "loss": 3.2437, "loss/crossentropy": 2.5942065715789795, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.26605169773101806, "loss/reg": 0.0, "step": 17060 }, { "epoch": 0.11230263157894736, "grad_norm": 1.984375, "grad_norm_var": 0.05279541015625, "learning_rate": 0.0001, "loss": 3.0525, "loss/crossentropy": 2.034271013736725, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.2154896892607212, "loss/reg": 0.0, "step": 17070 }, { "epoch": 0.11236842105263158, "grad_norm": 2.25, "grad_norm_var": 0.050902303059895834, "learning_rate": 0.0001, "loss": 3.1033, "loss/crossentropy": 2.229046678543091, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.22233688235282897, "loss/reg": 0.0, "step": 17080 }, { "epoch": 0.11243421052631579, "grad_norm": 2.234375, "grad_norm_var": 0.04163309733072917, "learning_rate": 0.0001, "loss": 3.1293, "loss/crossentropy": 2.4799925684928894, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.2588403090834618, "loss/reg": 0.0, "step": 17090 }, { "epoch": 0.1125, "grad_norm": 2.1875, "grad_norm_var": 0.1295074462890625, "learning_rate": 0.0001, "loss": 3.0953, "loss/crossentropy": 2.308819645643234, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.23669061064720154, "loss/reg": 0.0, "step": 17100 }, { "epoch": 0.11256578947368422, "grad_norm": 2.140625, "grad_norm_var": 0.7806142171223959, "learning_rate": 0.0001, "loss": 3.1353, "loss/crossentropy": 2.3123676419258117, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.29565141499042513, "loss/reg": 0.0, "step": 17110 }, { "epoch": 0.11263157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.08242085774739584, "learning_rate": 0.0001, "loss": 3.0662, "loss/crossentropy": 2.262893891334534, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.23198095709085464, "loss/reg": 0.0, "step": 17120 }, { "epoch": 0.11269736842105263, "grad_norm": 2.265625, "grad_norm_var": 0.1591949462890625, "learning_rate": 0.0001, "loss": 3.0855, "loss/crossentropy": 2.1816434502601623, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2444119155406952, "loss/reg": 0.0, "step": 17130 }, { "epoch": 0.11276315789473684, "grad_norm": 2.125, "grad_norm_var": 0.14752197265625, "learning_rate": 0.0001, "loss": 3.1414, "loss/crossentropy": 2.4847304582595826, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.26482110619544985, "loss/reg": 0.0, "step": 17140 }, { "epoch": 0.11282894736842106, "grad_norm": 2.109375, "grad_norm_var": 0.09546610514322916, "learning_rate": 0.0001, "loss": 3.0923, "loss/crossentropy": 2.5822364687919617, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.26294154226779937, "loss/reg": 0.0, "step": 17150 }, { "epoch": 0.11289473684210527, "grad_norm": 3.796875, "grad_norm_var": 0.19820963541666667, "learning_rate": 0.0001, "loss": 3.0656, "loss/crossentropy": 2.3146368622779847, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.23589982092380524, "loss/reg": 0.0, "step": 17160 }, { "epoch": 0.11296052631578947, "grad_norm": 2.0625, "grad_norm_var": 1.0957618713378907, "learning_rate": 0.0001, "loss": 3.1693, "loss/crossentropy": 2.355622184276581, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.2783960849046707, "loss/reg": 0.0, "step": 17170 }, { "epoch": 0.11302631578947368, "grad_norm": 2.515625, "grad_norm_var": 1.0233965555826823, "learning_rate": 0.0001, "loss": 3.1617, "loss/crossentropy": 2.3866767287254333, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.25886829346418383, "loss/reg": 0.0, "step": 17180 }, { "epoch": 0.1130921052631579, "grad_norm": 2.25, "grad_norm_var": 0.08580322265625, "learning_rate": 0.0001, "loss": 3.1277, "loss/crossentropy": 2.095993900299072, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.1956949472427368, "loss/reg": 0.0, "step": 17190 }, { "epoch": 0.11315789473684211, "grad_norm": 2.375, "grad_norm_var": 0.09143778483072916, "learning_rate": 0.0001, "loss": 3.0835, "loss/crossentropy": 2.3784751892089844, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.26899502128362657, "loss/reg": 0.0, "step": 17200 }, { "epoch": 0.11322368421052631, "grad_norm": 2.3125, "grad_norm_var": 0.027274576822916667, "learning_rate": 0.0001, "loss": 3.1617, "loss/crossentropy": 2.505306875705719, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.2929636001586914, "loss/reg": 0.0, "step": 17210 }, { "epoch": 0.11328947368421052, "grad_norm": 2.296875, "grad_norm_var": 0.06923421223958333, "learning_rate": 0.0001, "loss": 3.12, "loss/crossentropy": 2.1889270186424254, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.2669166073203087, "loss/reg": 0.0, "step": 17220 }, { "epoch": 0.11335526315789474, "grad_norm": 2.3125, "grad_norm_var": 0.17213312784830728, "learning_rate": 0.0001, "loss": 3.0681, "loss/crossentropy": 2.2528875708580016, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.23936019986867904, "loss/reg": 0.0, "step": 17230 }, { "epoch": 0.11342105263157895, "grad_norm": 2.546875, "grad_norm_var": 0.12957763671875, "learning_rate": 0.0001, "loss": 3.119, "loss/crossentropy": 2.2457367897033693, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.23305099308490754, "loss/reg": 0.0, "step": 17240 }, { "epoch": 0.11348684210526316, "grad_norm": 2.578125, "grad_norm_var": 0.07983779907226562, "learning_rate": 0.0001, "loss": 3.0851, "loss/crossentropy": 2.4546321392059327, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.24581009149551392, "loss/reg": 0.0, "step": 17250 }, { "epoch": 0.11355263157894736, "grad_norm": 4.1875, "grad_norm_var": 0.2813791910807292, "learning_rate": 0.0001, "loss": 3.171, "loss/crossentropy": 1.9349523544311524, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.31686680018901825, "loss/reg": 0.0, "step": 17260 }, { "epoch": 0.11361842105263158, "grad_norm": 2.59375, "grad_norm_var": 1.4602701822916666, "learning_rate": 0.0001, "loss": 3.1656, "loss/crossentropy": 2.2751689314842225, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.2878335312008858, "loss/reg": 0.0, "step": 17270 }, { "epoch": 0.11368421052631579, "grad_norm": 2.765625, "grad_norm_var": 0.5687662760416666, "learning_rate": 0.0001, "loss": 3.0772, "loss/crossentropy": 2.4036699175834655, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.24233294725418092, "loss/reg": 0.0, "step": 17280 }, { "epoch": 0.11375, "grad_norm": 2.96875, "grad_norm_var": 0.06691792805989584, "learning_rate": 0.0001, "loss": 3.1161, "loss/crossentropy": 2.4056329488754273, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.25776179879903793, "loss/reg": 0.0, "step": 17290 }, { "epoch": 0.11381578947368422, "grad_norm": 2.203125, "grad_norm_var": 0.12681884765625, "learning_rate": 0.0001, "loss": 3.2097, "loss/crossentropy": 2.4936662912368774, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.27261604368686676, "loss/reg": 0.0, "step": 17300 }, { "epoch": 0.11388157894736842, "grad_norm": 2.21875, "grad_norm_var": 0.08603108723958333, "learning_rate": 0.0001, "loss": 3.1596, "loss/crossentropy": 2.2334436774253845, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.27515108734369276, "loss/reg": 0.0, "step": 17310 }, { "epoch": 0.11394736842105263, "grad_norm": 2.390625, "grad_norm_var": 0.06290690104166667, "learning_rate": 0.0001, "loss": 3.1322, "loss/crossentropy": 2.3009248971939087, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2514318063855171, "loss/reg": 0.0, "step": 17320 }, { "epoch": 0.11401315789473684, "grad_norm": 2.953125, "grad_norm_var": 0.19062093098958333, "learning_rate": 0.0001, "loss": 3.2222, "loss/crossentropy": 2.2115599513053894, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.24124337881803512, "loss/reg": 0.0, "step": 17330 }, { "epoch": 0.11407894736842106, "grad_norm": 2.34375, "grad_norm_var": 0.4003245035807292, "learning_rate": 0.0001, "loss": 3.1404, "loss/crossentropy": 2.2533546447753907, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.25740948766469957, "loss/reg": 0.0, "step": 17340 }, { "epoch": 0.11414473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.06924540201822917, "learning_rate": 0.0001, "loss": 3.1509, "loss/crossentropy": 2.4188032507896424, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.27805479913949965, "loss/reg": 0.0, "step": 17350 }, { "epoch": 0.11421052631578947, "grad_norm": 2.96875, "grad_norm_var": 0.6562489827473958, "learning_rate": 0.0001, "loss": 3.2795, "loss/crossentropy": 2.1555517435073854, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.28488118648529054, "loss/reg": 0.0, "step": 17360 }, { "epoch": 0.11427631578947368, "grad_norm": 2.140625, "grad_norm_var": 0.62613525390625, "learning_rate": 0.0001, "loss": 3.1637, "loss/crossentropy": 2.351659083366394, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.2672264903783798, "loss/reg": 0.0, "step": 17370 }, { "epoch": 0.1143421052631579, "grad_norm": 2.09375, "grad_norm_var": 0.08678385416666666, "learning_rate": 0.0001, "loss": 3.1242, "loss/crossentropy": 2.2498091578483583, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24537185132503508, "loss/reg": 0.0, "step": 17380 }, { "epoch": 0.11440789473684211, "grad_norm": 2.53125, "grad_norm_var": 0.17229817708333334, "learning_rate": 0.0001, "loss": 3.1776, "loss/crossentropy": 2.401424062252045, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.3059739723801613, "loss/reg": 0.0, "step": 17390 }, { "epoch": 0.11447368421052631, "grad_norm": 2.125, "grad_norm_var": 0.1625885009765625, "learning_rate": 0.0001, "loss": 3.1477, "loss/crossentropy": 2.292530918121338, "loss/hidden": 3.09375, "loss/incoh": 0.0, "loss/logits": 0.2784269869327545, "loss/reg": 0.0, "step": 17400 }, { "epoch": 0.11453947368421052, "grad_norm": 2.375, "grad_norm_var": 0.09892171223958333, "learning_rate": 0.0001, "loss": 3.1288, "loss/crossentropy": 2.464983069896698, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.2796273499727249, "loss/reg": 0.0, "step": 17410 }, { "epoch": 0.11460526315789474, "grad_norm": 2.921875, "grad_norm_var": 0.09850972493489583, "learning_rate": 0.0001, "loss": 3.1289, "loss/crossentropy": 2.1951366662979126, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.21696581244468688, "loss/reg": 0.0, "step": 17420 }, { "epoch": 0.11467105263157895, "grad_norm": 2.53125, "grad_norm_var": 0.06843159993489584, "learning_rate": 0.0001, "loss": 3.1597, "loss/crossentropy": 2.3925849914550783, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2907762542366982, "loss/reg": 0.0, "step": 17430 }, { "epoch": 0.11473684210526315, "grad_norm": 2.21875, "grad_norm_var": 0.21046549479166668, "learning_rate": 0.0001, "loss": 3.1634, "loss/crossentropy": 2.464526927471161, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2937367483973503, "loss/reg": 0.0, "step": 17440 }, { "epoch": 0.11480263157894736, "grad_norm": 2.203125, "grad_norm_var": 0.047526041666666664, "learning_rate": 0.0001, "loss": 3.1144, "loss/crossentropy": 2.072342965006828, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.2236027292907238, "loss/reg": 0.0, "step": 17450 }, { "epoch": 0.11486842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.05799051920572917, "learning_rate": 0.0001, "loss": 3.0873, "loss/crossentropy": 2.240265655517578, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.23422597646713256, "loss/reg": 0.0, "step": 17460 }, { "epoch": 0.11493421052631579, "grad_norm": 2.078125, "grad_norm_var": 0.10058186848958334, "learning_rate": 0.0001, "loss": 3.127, "loss/crossentropy": 2.3168819308280946, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.2430237874388695, "loss/reg": 0.0, "step": 17470 }, { "epoch": 0.115, "grad_norm": 2.625, "grad_norm_var": 0.11015523274739583, "learning_rate": 0.0001, "loss": 3.0618, "loss/crossentropy": 2.2366016268730164, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.23987135738134385, "loss/reg": 0.0, "step": 17480 }, { "epoch": 0.1150657894736842, "grad_norm": 2.609375, "grad_norm_var": 0.09355061848958333, "learning_rate": 0.0001, "loss": 3.1875, "loss/crossentropy": 2.1879064559936525, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.24197361022233962, "loss/reg": 0.0, "step": 17490 }, { "epoch": 0.11513157894736842, "grad_norm": 2.421875, "grad_norm_var": 7.735667928059896, "learning_rate": 0.0001, "loss": 3.2427, "loss/crossentropy": 2.149078643321991, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.24705443456768988, "loss/reg": 0.0, "step": 17500 }, { "epoch": 0.11519736842105263, "grad_norm": 2.265625, "grad_norm_var": 7.899019368489584, "learning_rate": 0.0001, "loss": 3.1837, "loss/crossentropy": 2.3580456256866453, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.23221501410007478, "loss/reg": 0.0, "step": 17510 }, { "epoch": 0.11526315789473685, "grad_norm": 2.6875, "grad_norm_var": 0.2678456624348958, "learning_rate": 0.0001, "loss": 3.079, "loss/crossentropy": 2.414518404006958, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2647902578115463, "loss/reg": 0.0, "step": 17520 }, { "epoch": 0.11532894736842106, "grad_norm": 3.125, "grad_norm_var": 0.14198811848958334, "learning_rate": 0.0001, "loss": 3.1613, "loss/crossentropy": 2.416664445400238, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.22963229715824127, "loss/reg": 0.0, "step": 17530 }, { "epoch": 0.11539473684210526, "grad_norm": 2.53125, "grad_norm_var": 0.7926177978515625, "learning_rate": 0.0001, "loss": 3.1532, "loss/crossentropy": 2.1853476405143737, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2286560907959938, "loss/reg": 0.0, "step": 17540 }, { "epoch": 0.11546052631578947, "grad_norm": 2.859375, "grad_norm_var": 0.8031534830729167, "learning_rate": 0.0001, "loss": 3.1557, "loss/crossentropy": 2.6209848165512084, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.25361735969781873, "loss/reg": 0.0, "step": 17550 }, { "epoch": 0.11552631578947369, "grad_norm": 2.4375, "grad_norm_var": 0.09949544270833334, "learning_rate": 0.0001, "loss": 3.2229, "loss/crossentropy": 2.3588363409042357, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.30205955654382705, "loss/reg": 0.0, "step": 17560 }, { "epoch": 0.1155921052631579, "grad_norm": 1.9765625, "grad_norm_var": 0.07576471964518229, "learning_rate": 0.0001, "loss": 3.174, "loss/crossentropy": 2.1386860758066177, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.24600727967917918, "loss/reg": 0.0, "step": 17570 }, { "epoch": 0.1156578947368421, "grad_norm": 2.171875, "grad_norm_var": 0.07134577433268229, "learning_rate": 0.0001, "loss": 3.1143, "loss/crossentropy": 2.5142947912216185, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.2483747810125351, "loss/reg": 0.0, "step": 17580 }, { "epoch": 0.11572368421052631, "grad_norm": 2.46875, "grad_norm_var": 0.7249959309895834, "learning_rate": 0.0001, "loss": 3.1598, "loss/crossentropy": 2.339717888832092, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2511411294341087, "loss/reg": 0.0, "step": 17590 }, { "epoch": 0.11578947368421053, "grad_norm": 2.671875, "grad_norm_var": 0.9438435872395833, "learning_rate": 0.0001, "loss": 3.0902, "loss/crossentropy": 2.3273319840431212, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.321346378326416, "loss/reg": 0.0, "step": 17600 }, { "epoch": 0.11585526315789474, "grad_norm": 2.484375, "grad_norm_var": 0.93150634765625, "learning_rate": 0.0001, "loss": 3.1392, "loss/crossentropy": 2.0881393194198608, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2542056769132614, "loss/reg": 0.0, "step": 17610 }, { "epoch": 0.11592105263157895, "grad_norm": 2.25, "grad_norm_var": 0.05450846354166667, "learning_rate": 0.0001, "loss": 3.153, "loss/crossentropy": 2.200914776325226, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2596256859600544, "loss/reg": 0.0, "step": 17620 }, { "epoch": 0.11598684210526315, "grad_norm": 2.34375, "grad_norm_var": 0.06134440104166667, "learning_rate": 0.0001, "loss": 3.1138, "loss/crossentropy": 2.391866648197174, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.24801254719495774, "loss/reg": 0.0, "step": 17630 }, { "epoch": 0.11605263157894737, "grad_norm": 2.5625, "grad_norm_var": 0.19595947265625, "learning_rate": 0.0001, "loss": 3.1197, "loss/crossentropy": 2.3731101989746093, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.2734734550118446, "loss/reg": 0.0, "step": 17640 }, { "epoch": 0.11611842105263158, "grad_norm": 2.3125, "grad_norm_var": 0.06402587890625, "learning_rate": 0.0001, "loss": 3.186, "loss/crossentropy": 2.510706162452698, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.3243511900305748, "loss/reg": 0.0, "step": 17650 }, { "epoch": 0.11618421052631579, "grad_norm": 2.53125, "grad_norm_var": 0.08347066243489583, "learning_rate": 0.0001, "loss": 3.1433, "loss/crossentropy": 2.3694114327430724, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.23816563338041305, "loss/reg": 0.0, "step": 17660 }, { "epoch": 0.11625, "grad_norm": 2.515625, "grad_norm_var": 0.07176106770833333, "learning_rate": 0.0001, "loss": 3.117, "loss/crossentropy": 2.3822181940078737, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.2559090554714203, "loss/reg": 0.0, "step": 17670 }, { "epoch": 0.1163157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.06597391764322917, "learning_rate": 0.0001, "loss": 3.167, "loss/crossentropy": 2.236483609676361, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.24177847057580948, "loss/reg": 0.0, "step": 17680 }, { "epoch": 0.11638157894736842, "grad_norm": 2.140625, "grad_norm_var": 0.05454813639322917, "learning_rate": 0.0001, "loss": 3.1207, "loss/crossentropy": 2.386284852027893, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.23972099274396896, "loss/reg": 0.0, "step": 17690 }, { "epoch": 0.11644736842105263, "grad_norm": 2.546875, "grad_norm_var": 0.029325358072916665, "learning_rate": 0.0001, "loss": 3.1551, "loss/crossentropy": 2.3102688074111937, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.25639690458774567, "loss/reg": 0.0, "step": 17700 }, { "epoch": 0.11651315789473685, "grad_norm": 2.390625, "grad_norm_var": 0.1159576416015625, "learning_rate": 0.0001, "loss": 3.1227, "loss/crossentropy": 2.164554476737976, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2502566508948803, "loss/reg": 0.0, "step": 17710 }, { "epoch": 0.11657894736842105, "grad_norm": 2.453125, "grad_norm_var": 0.16982421875, "learning_rate": 0.0001, "loss": 3.2736, "loss/crossentropy": 2.166772598028183, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.24817814379930497, "loss/reg": 0.0, "step": 17720 }, { "epoch": 0.11664473684210526, "grad_norm": 2.75, "grad_norm_var": 0.13912353515625, "learning_rate": 0.0001, "loss": 3.1985, "loss/crossentropy": 2.1537662744522095, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.21837295591831207, "loss/reg": 0.0, "step": 17730 }, { "epoch": 0.11671052631578947, "grad_norm": 2.046875, "grad_norm_var": 0.09753392537434896, "learning_rate": 0.0001, "loss": 3.1066, "loss/crossentropy": 2.106878674030304, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.2489032343029976, "loss/reg": 0.0, "step": 17740 }, { "epoch": 0.11677631578947369, "grad_norm": 2.5625, "grad_norm_var": 0.08820699055989584, "learning_rate": 0.0001, "loss": 3.1647, "loss/crossentropy": 2.186918389797211, "loss/hidden": 3.1171875, "loss/incoh": 0.0, "loss/logits": 0.26816043853759763, "loss/reg": 0.0, "step": 17750 }, { "epoch": 0.1168421052631579, "grad_norm": 2.453125, "grad_norm_var": 0.14002176920572917, "learning_rate": 0.0001, "loss": 3.2041, "loss/crossentropy": 2.130704140663147, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.26593275666236876, "loss/reg": 0.0, "step": 17760 }, { "epoch": 0.1169078947368421, "grad_norm": 3.6875, "grad_norm_var": 0.16592508951822918, "learning_rate": 0.0001, "loss": 3.0748, "loss/crossentropy": 2.4294887661933897, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.26008370518684387, "loss/reg": 0.0, "step": 17770 }, { "epoch": 0.11697368421052631, "grad_norm": 2.109375, "grad_norm_var": 0.4354156494140625, "learning_rate": 0.0001, "loss": 3.2129, "loss/crossentropy": 2.0633517861366273, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.2134759709239006, "loss/reg": 0.0, "step": 17780 }, { "epoch": 0.11703947368421053, "grad_norm": 2.5625, "grad_norm_var": 0.114306640625, "learning_rate": 0.0001, "loss": 3.1969, "loss/crossentropy": 2.470662558078766, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.32178338468074796, "loss/reg": 0.0, "step": 17790 }, { "epoch": 0.11710526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.1042388916015625, "learning_rate": 0.0001, "loss": 3.132, "loss/crossentropy": 2.180730104446411, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2408156231045723, "loss/reg": 0.0, "step": 17800 }, { "epoch": 0.11717105263157895, "grad_norm": 2.5, "grad_norm_var": 0.08092041015625, "learning_rate": 0.0001, "loss": 3.1107, "loss/crossentropy": 2.3672548174858092, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.2295123293995857, "loss/reg": 0.0, "step": 17810 }, { "epoch": 0.11723684210526315, "grad_norm": 2.40625, "grad_norm_var": 0.029108683268229168, "learning_rate": 0.0001, "loss": 3.1063, "loss/crossentropy": 2.486844336986542, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2817386701703072, "loss/reg": 0.0, "step": 17820 }, { "epoch": 0.11730263157894737, "grad_norm": 2.609375, "grad_norm_var": 3.6479156901264755e+17, "learning_rate": 0.0001, "loss": 3.2511, "loss/crossentropy": 2.3458006739616395, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.25344357788562777, "loss/reg": 0.0, "step": 17830 }, { "epoch": 0.11736842105263158, "grad_norm": 2.75, "grad_norm_var": 1.1187174479166666, "learning_rate": 0.0001, "loss": 3.1952, "loss/crossentropy": 2.3346620917320253, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.24867950975894929, "loss/reg": 0.0, "step": 17840 }, { "epoch": 0.1174342105263158, "grad_norm": 2.3125, "grad_norm_var": 0.05538736979166667, "learning_rate": 0.0001, "loss": 3.1781, "loss/crossentropy": 2.1594719171524046, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.32041922956705093, "loss/reg": 0.0, "step": 17850 }, { "epoch": 0.1175, "grad_norm": 2.3125, "grad_norm_var": 0.13107096354166667, "learning_rate": 0.0001, "loss": 3.1675, "loss/crossentropy": 2.335154187679291, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.23140522688627244, "loss/reg": 0.0, "step": 17860 }, { "epoch": 0.1175657894736842, "grad_norm": 2.421875, "grad_norm_var": 0.061777496337890626, "learning_rate": 0.0001, "loss": 3.1431, "loss/crossentropy": 2.448777401447296, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.24006084948778153, "loss/reg": 0.0, "step": 17870 }, { "epoch": 0.11763157894736842, "grad_norm": 2.25, "grad_norm_var": 0.14251302083333334, "learning_rate": 0.0001, "loss": 3.2176, "loss/crossentropy": 2.2756917595863344, "loss/hidden": 3.0359375, "loss/incoh": 0.0, "loss/logits": 0.30769334733486176, "loss/reg": 0.0, "step": 17880 }, { "epoch": 0.11769736842105263, "grad_norm": 1.921875, "grad_norm_var": 0.14058837890625, "learning_rate": 0.0001, "loss": 3.1661, "loss/crossentropy": 2.3477345585823057, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.2401590347290039, "loss/reg": 0.0, "step": 17890 }, { "epoch": 0.11776315789473685, "grad_norm": 3.34375, "grad_norm_var": 0.14404271443684896, "learning_rate": 0.0001, "loss": 3.1339, "loss/crossentropy": 2.2531643748283385, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.27347354739904406, "loss/reg": 0.0, "step": 17900 }, { "epoch": 0.11782894736842105, "grad_norm": 2.59375, "grad_norm_var": 0.10000991821289062, "learning_rate": 0.0001, "loss": 3.1182, "loss/crossentropy": 2.2883235216140747, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2531774565577507, "loss/reg": 0.0, "step": 17910 }, { "epoch": 0.11789473684210526, "grad_norm": 2.328125, "grad_norm_var": 0.0377349853515625, "learning_rate": 0.0001, "loss": 3.1566, "loss/crossentropy": 2.2789531648159027, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.235878374427557, "loss/reg": 0.0, "step": 17920 }, { "epoch": 0.11796052631578947, "grad_norm": 2.25, "grad_norm_var": 0.06096598307291667, "learning_rate": 0.0001, "loss": 3.1068, "loss/crossentropy": 2.3758100509643554, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.22890822887420653, "loss/reg": 0.0, "step": 17930 }, { "epoch": 0.11802631578947369, "grad_norm": 2.578125, "grad_norm_var": 0.07280171712239583, "learning_rate": 0.0001, "loss": 3.1773, "loss/crossentropy": 2.2372307777404785, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.24057336896657944, "loss/reg": 0.0, "step": 17940 }, { "epoch": 0.1180921052631579, "grad_norm": 2.3125, "grad_norm_var": 0.12463785807291666, "learning_rate": 0.0001, "loss": 3.1969, "loss/crossentropy": 2.1488450884819033, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.25413042977452277, "loss/reg": 0.0, "step": 17950 }, { "epoch": 0.1181578947368421, "grad_norm": 2.265625, "grad_norm_var": 0.11864827473958334, "learning_rate": 0.0001, "loss": 3.2164, "loss/crossentropy": 2.5382091283798216, "loss/hidden": 3.13125, "loss/incoh": 0.0, "loss/logits": 0.26207938939332964, "loss/reg": 0.0, "step": 17960 }, { "epoch": 0.11822368421052631, "grad_norm": 2.0625, "grad_norm_var": 0.06974283854166667, "learning_rate": 0.0001, "loss": 3.1713, "loss/crossentropy": 2.3174261093139648, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.279338338971138, "loss/reg": 0.0, "step": 17970 }, { "epoch": 0.11828947368421053, "grad_norm": 2.6875, "grad_norm_var": 0.04395243326822917, "learning_rate": 0.0001, "loss": 3.1261, "loss/crossentropy": 2.182445216178894, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2768437474966049, "loss/reg": 0.0, "step": 17980 }, { "epoch": 0.11835526315789474, "grad_norm": 2.703125, "grad_norm_var": 0.14431050618489583, "learning_rate": 0.0001, "loss": 3.1287, "loss/crossentropy": 2.202815556526184, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.21700907945632936, "loss/reg": 0.0, "step": 17990 }, { "epoch": 0.11842105263157894, "grad_norm": 2.546875, "grad_norm_var": 2.2837799072265623, "learning_rate": 0.0001, "loss": 3.1737, "loss/crossentropy": 2.0051895678043365, "loss/hidden": 3.2046875, "loss/incoh": 0.0, "loss/logits": 0.35186032503843306, "loss/reg": 0.0, "step": 18000 }, { "epoch": 0.11848684210526315, "grad_norm": 2.4375, "grad_norm_var": 2.469374338785807, "learning_rate": 0.0001, "loss": 3.0848, "loss/crossentropy": 1.872368621826172, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.2913415163755417, "loss/reg": 0.0, "step": 18010 }, { "epoch": 0.11855263157894737, "grad_norm": 2.046875, "grad_norm_var": 0.1679278055826823, "learning_rate": 0.0001, "loss": 3.2063, "loss/crossentropy": 2.367658519744873, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.27947854697704316, "loss/reg": 0.0, "step": 18020 }, { "epoch": 0.11861842105263158, "grad_norm": 3.40625, "grad_norm_var": 0.15479227701822917, "learning_rate": 0.0001, "loss": 3.1836, "loss/crossentropy": 2.087648892402649, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.2204158440232277, "loss/reg": 0.0, "step": 18030 }, { "epoch": 0.1186842105263158, "grad_norm": 3.171875, "grad_norm_var": 0.18935546875, "learning_rate": 0.0001, "loss": 3.205, "loss/crossentropy": 2.1874751687049865, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2435563921928406, "loss/reg": 0.0, "step": 18040 }, { "epoch": 0.11875, "grad_norm": 2.234375, "grad_norm_var": 0.38063151041666665, "learning_rate": 0.0001, "loss": 3.1996, "loss/crossentropy": 2.1633397549390794, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.21968650221824645, "loss/reg": 0.0, "step": 18050 }, { "epoch": 0.11881578947368421, "grad_norm": 2.140625, "grad_norm_var": 0.5637278238932292, "learning_rate": 0.0001, "loss": 3.1736, "loss/crossentropy": 2.281619644165039, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.295219412446022, "loss/reg": 0.0, "step": 18060 }, { "epoch": 0.11888157894736842, "grad_norm": 2.828125, "grad_norm_var": 1.9192342122395833, "learning_rate": 0.0001, "loss": 3.2705, "loss/crossentropy": 2.21823273897171, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.23566214740276337, "loss/reg": 0.0, "step": 18070 }, { "epoch": 0.11894736842105263, "grad_norm": 2.515625, "grad_norm_var": 1.6334869384765625, "learning_rate": 0.0001, "loss": 3.1873, "loss/crossentropy": 2.2668980836868284, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2256957158446312, "loss/reg": 0.0, "step": 18080 }, { "epoch": 0.11901315789473685, "grad_norm": 2.046875, "grad_norm_var": 0.0713287353515625, "learning_rate": 0.0001, "loss": 3.0942, "loss/crossentropy": 2.218550479412079, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.3167494982481003, "loss/reg": 0.0, "step": 18090 }, { "epoch": 0.11907894736842105, "grad_norm": 2.375, "grad_norm_var": 0.05681050618489583, "learning_rate": 0.0001, "loss": 3.1816, "loss/crossentropy": 2.464331579208374, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.25844376236200334, "loss/reg": 0.0, "step": 18100 }, { "epoch": 0.11914473684210526, "grad_norm": 2.390625, "grad_norm_var": 0.07647196451822917, "learning_rate": 0.0001, "loss": 3.1727, "loss/crossentropy": 2.2838521599769592, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.2587725341320038, "loss/reg": 0.0, "step": 18110 }, { "epoch": 0.11921052631578948, "grad_norm": 2.1875, "grad_norm_var": 0.13923238118489584, "learning_rate": 0.0001, "loss": 3.2584, "loss/crossentropy": 2.3038102626800536, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.3060797408223152, "loss/reg": 0.0, "step": 18120 }, { "epoch": 0.11927631578947369, "grad_norm": 2.09375, "grad_norm_var": 0.11160481770833333, "learning_rate": 0.0001, "loss": 3.0877, "loss/crossentropy": 2.1922419667243958, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2569601759314537, "loss/reg": 0.0, "step": 18130 }, { "epoch": 0.11934210526315789, "grad_norm": 2.34375, "grad_norm_var": 0.18005269368489582, "learning_rate": 0.0001, "loss": 3.2654, "loss/crossentropy": 2.336120533943176, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.3214158996939659, "loss/reg": 0.0, "step": 18140 }, { "epoch": 0.1194078947368421, "grad_norm": 2.46875, "grad_norm_var": 0.211083984375, "learning_rate": 0.0001, "loss": 3.1801, "loss/crossentropy": 2.6258700489997864, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.23279473930597305, "loss/reg": 0.0, "step": 18150 }, { "epoch": 0.11947368421052632, "grad_norm": 2.46875, "grad_norm_var": 0.097412109375, "learning_rate": 0.0001, "loss": 3.0988, "loss/crossentropy": 2.378826451301575, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.24316011592745781, "loss/reg": 0.0, "step": 18160 }, { "epoch": 0.11953947368421053, "grad_norm": 2.953125, "grad_norm_var": 0.25974934895833335, "learning_rate": 0.0001, "loss": 3.1615, "loss/crossentropy": 2.3753814458847047, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2662284314632416, "loss/reg": 0.0, "step": 18170 }, { "epoch": 0.11960526315789474, "grad_norm": 2.28125, "grad_norm_var": 0.2468658447265625, "learning_rate": 0.0001, "loss": 3.2061, "loss/crossentropy": 2.2813811898231506, "loss/hidden": 3.0953125, "loss/incoh": 0.0, "loss/logits": 0.3734820380806923, "loss/reg": 0.0, "step": 18180 }, { "epoch": 0.11967105263157894, "grad_norm": 2.96875, "grad_norm_var": 0.12701416015625, "learning_rate": 0.0001, "loss": 3.16, "loss/crossentropy": 2.2644060015678407, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.22710230052471161, "loss/reg": 0.0, "step": 18190 }, { "epoch": 0.11973684210526316, "grad_norm": 2.015625, "grad_norm_var": 0.059342447916666666, "learning_rate": 0.0001, "loss": 3.0907, "loss/crossentropy": 2.2262014031410216, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.20997287034988404, "loss/reg": 0.0, "step": 18200 }, { "epoch": 0.11980263157894737, "grad_norm": 2.484375, "grad_norm_var": 0.04023335774739583, "learning_rate": 0.0001, "loss": 3.2103, "loss/crossentropy": 2.261046063899994, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.262168163061142, "loss/reg": 0.0, "step": 18210 }, { "epoch": 0.11986842105263158, "grad_norm": 2.4375, "grad_norm_var": 0.11513570149739584, "learning_rate": 0.0001, "loss": 3.194, "loss/crossentropy": 2.39845809340477, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.2080080732703209, "loss/reg": 0.0, "step": 18220 }, { "epoch": 0.1199342105263158, "grad_norm": 2.140625, "grad_norm_var": 0.12336324055989584, "learning_rate": 0.0001, "loss": 3.1321, "loss/crossentropy": 2.221972668170929, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.2630168259143829, "loss/reg": 0.0, "step": 18230 }, { "epoch": 0.12, "grad_norm": 2.46875, "grad_norm_var": 0.0994049072265625, "learning_rate": 0.0001, "loss": 3.192, "loss/crossentropy": 1.910440945625305, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.22247039675712585, "loss/reg": 0.0, "step": 18240 }, { "epoch": 0.12006578947368421, "grad_norm": 2.34375, "grad_norm_var": 0.38481343587239586, "learning_rate": 0.0001, "loss": 3.211, "loss/crossentropy": 2.3357484221458433, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.28341811895370483, "loss/reg": 0.0, "step": 18250 }, { "epoch": 0.12013157894736842, "grad_norm": 2.390625, "grad_norm_var": 0.1193023681640625, "learning_rate": 0.0001, "loss": 3.1537, "loss/crossentropy": 2.277235043048859, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2401238664984703, "loss/reg": 0.0, "step": 18260 }, { "epoch": 0.12019736842105264, "grad_norm": 2.5, "grad_norm_var": 0.0804595947265625, "learning_rate": 0.0001, "loss": 3.1693, "loss/crossentropy": 2.4965412855148315, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.23239507675170898, "loss/reg": 0.0, "step": 18270 }, { "epoch": 0.12026315789473684, "grad_norm": 2.265625, "grad_norm_var": 0.1340240478515625, "learning_rate": 0.0001, "loss": 3.0471, "loss/crossentropy": 2.187135934829712, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.22307218313217164, "loss/reg": 0.0, "step": 18280 }, { "epoch": 0.12032894736842105, "grad_norm": 2.03125, "grad_norm_var": 0.08740946451822916, "learning_rate": 0.0001, "loss": 3.0703, "loss/crossentropy": 2.144671416282654, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.23641343265771866, "loss/reg": 0.0, "step": 18290 }, { "epoch": 0.12039473684210526, "grad_norm": 2.484375, "grad_norm_var": 0.0469390869140625, "learning_rate": 0.0001, "loss": 3.0766, "loss/crossentropy": 2.3019802451133726, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.27486053854227066, "loss/reg": 0.0, "step": 18300 }, { "epoch": 0.12046052631578948, "grad_norm": 2.765625, "grad_norm_var": 0.056086222330729164, "learning_rate": 0.0001, "loss": 3.1479, "loss/crossentropy": 2.350484275817871, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.2915784493088722, "loss/reg": 0.0, "step": 18310 }, { "epoch": 0.12052631578947369, "grad_norm": 2.3125, "grad_norm_var": 0.0907867431640625, "learning_rate": 0.0001, "loss": 3.085, "loss/crossentropy": 2.519459903240204, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2799163952469826, "loss/reg": 0.0, "step": 18320 }, { "epoch": 0.12059210526315789, "grad_norm": 3.03125, "grad_norm_var": 0.07711588541666667, "learning_rate": 0.0001, "loss": 3.1352, "loss/crossentropy": 2.196255683898926, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.256213016808033, "loss/reg": 0.0, "step": 18330 }, { "epoch": 0.1206578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.111083984375, "learning_rate": 0.0001, "loss": 3.1252, "loss/crossentropy": 2.421483266353607, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.3222003743052483, "loss/reg": 0.0, "step": 18340 }, { "epoch": 0.12072368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.07574462890625, "learning_rate": 0.0001, "loss": 3.1338, "loss/crossentropy": 2.248370945453644, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.24929146319627762, "loss/reg": 0.0, "step": 18350 }, { "epoch": 0.12078947368421053, "grad_norm": 2.375, "grad_norm_var": 0.05138346354166667, "learning_rate": 0.0001, "loss": 3.1477, "loss/crossentropy": 2.3680251955986025, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.2555671989917755, "loss/reg": 0.0, "step": 18360 }, { "epoch": 0.12085526315789474, "grad_norm": 3.296875, "grad_norm_var": 0.06642964680989584, "learning_rate": 0.0001, "loss": 3.1165, "loss/crossentropy": 2.199974000453949, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.24803201854228973, "loss/reg": 0.0, "step": 18370 }, { "epoch": 0.12092105263157894, "grad_norm": 2.171875, "grad_norm_var": 0.11695556640625, "learning_rate": 0.0001, "loss": 3.1367, "loss/crossentropy": 2.5192595601081846, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.24734148681163787, "loss/reg": 0.0, "step": 18380 }, { "epoch": 0.12098684210526316, "grad_norm": 2.453125, "grad_norm_var": 0.05526936848958333, "learning_rate": 0.0001, "loss": 3.1724, "loss/crossentropy": 2.450891613960266, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.2809787794947624, "loss/reg": 0.0, "step": 18390 }, { "epoch": 0.12105263157894737, "grad_norm": 2.15625, "grad_norm_var": 0.0400787353515625, "learning_rate": 0.0001, "loss": 3.1368, "loss/crossentropy": 2.142946255207062, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.25829449892044065, "loss/reg": 0.0, "step": 18400 }, { "epoch": 0.12111842105263158, "grad_norm": 2.171875, "grad_norm_var": 0.12823893229166666, "learning_rate": 0.0001, "loss": 3.1313, "loss/crossentropy": 2.4548865795135497, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.2860646352171898, "loss/reg": 0.0, "step": 18410 }, { "epoch": 0.12118421052631578, "grad_norm": 2.46875, "grad_norm_var": 0.22473042805989582, "learning_rate": 0.0001, "loss": 3.0336, "loss/crossentropy": 2.4296185970306396, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24130426943302155, "loss/reg": 0.0, "step": 18420 }, { "epoch": 0.12125, "grad_norm": 2.546875, "grad_norm_var": 0.24602864583333334, "learning_rate": 0.0001, "loss": 3.0905, "loss/crossentropy": 2.2097928047180178, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.22747812122106553, "loss/reg": 0.0, "step": 18430 }, { "epoch": 0.12131578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.03878580729166667, "learning_rate": 0.0001, "loss": 3.1482, "loss/crossentropy": 2.266276228427887, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.3076672673225403, "loss/reg": 0.0, "step": 18440 }, { "epoch": 0.12138157894736842, "grad_norm": 2.046875, "grad_norm_var": 0.19593098958333333, "learning_rate": 0.0001, "loss": 3.1753, "loss/crossentropy": 2.3656784892082214, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.21082431972026824, "loss/reg": 0.0, "step": 18450 }, { "epoch": 0.12144736842105264, "grad_norm": 2.28125, "grad_norm_var": 0.13351236979166667, "learning_rate": 0.0001, "loss": 3.1809, "loss/crossentropy": 2.363306760787964, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.259796117246151, "loss/reg": 0.0, "step": 18460 }, { "epoch": 0.12151315789473684, "grad_norm": 2.921875, "grad_norm_var": 0.10623372395833333, "learning_rate": 0.0001, "loss": 3.0904, "loss/crossentropy": 2.2991411328315734, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.22767046988010406, "loss/reg": 0.0, "step": 18470 }, { "epoch": 0.12157894736842105, "grad_norm": 2.25, "grad_norm_var": 0.051595052083333336, "learning_rate": 0.0001, "loss": 3.0997, "loss/crossentropy": 2.479309868812561, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.2216094933450222, "loss/reg": 0.0, "step": 18480 }, { "epoch": 0.12164473684210526, "grad_norm": 2.171875, "grad_norm_var": 0.029279581705729165, "learning_rate": 0.0001, "loss": 3.0982, "loss/crossentropy": 2.224264907836914, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.2954641401767731, "loss/reg": 0.0, "step": 18490 }, { "epoch": 0.12171052631578948, "grad_norm": 2.484375, "grad_norm_var": 0.0418365478515625, "learning_rate": 0.0001, "loss": 3.1278, "loss/crossentropy": 2.1996599078178405, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.24900650084018708, "loss/reg": 0.0, "step": 18500 }, { "epoch": 0.12177631578947369, "grad_norm": 2.65625, "grad_norm_var": 6.255301920572917, "learning_rate": 0.0001, "loss": 3.1924, "loss/crossentropy": 2.388622558116913, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2473811611533165, "loss/reg": 0.0, "step": 18510 }, { "epoch": 0.12184210526315789, "grad_norm": 2.359375, "grad_norm_var": 0.07906901041666667, "learning_rate": 0.0001, "loss": 3.1227, "loss/crossentropy": 2.325872230529785, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.2705776423215866, "loss/reg": 0.0, "step": 18520 }, { "epoch": 0.1219078947368421, "grad_norm": 3.96875, "grad_norm_var": 0.2271197001139323, "learning_rate": 0.0001, "loss": 3.104, "loss/crossentropy": 2.4012768149375914, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.24956294745206833, "loss/reg": 0.0, "step": 18530 }, { "epoch": 0.12197368421052632, "grad_norm": 2.015625, "grad_norm_var": 0.2910316467285156, "learning_rate": 0.0001, "loss": 3.1206, "loss/crossentropy": 2.605260455608368, "loss/hidden": 3.33125, "loss/incoh": 0.0, "loss/logits": 0.2743007704615593, "loss/reg": 0.0, "step": 18540 }, { "epoch": 0.12203947368421053, "grad_norm": 2.1875, "grad_norm_var": 0.1703277587890625, "learning_rate": 0.0001, "loss": 3.1836, "loss/crossentropy": 1.9470459461212157, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.27280396595597267, "loss/reg": 0.0, "step": 18550 }, { "epoch": 0.12210526315789473, "grad_norm": 4.5625, "grad_norm_var": 0.52919921875, "learning_rate": 0.0001, "loss": 3.1155, "loss/crossentropy": 2.216350567340851, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.22622163146734237, "loss/reg": 0.0, "step": 18560 }, { "epoch": 0.12217105263157894, "grad_norm": 2.65625, "grad_norm_var": 0.4434733072916667, "learning_rate": 0.0001, "loss": 3.1029, "loss/crossentropy": 2.1628998279571534, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.28237638175487517, "loss/reg": 0.0, "step": 18570 }, { "epoch": 0.12223684210526316, "grad_norm": 2.890625, "grad_norm_var": 0.3641916910807292, "learning_rate": 0.0001, "loss": 3.1955, "loss/crossentropy": 2.486808693408966, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.2668070778250694, "loss/reg": 0.0, "step": 18580 }, { "epoch": 0.12230263157894737, "grad_norm": 3.03125, "grad_norm_var": 0.3584950764973958, "learning_rate": 0.0001, "loss": 3.1489, "loss/crossentropy": 2.3242167592048646, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2651012405753136, "loss/reg": 0.0, "step": 18590 }, { "epoch": 0.12236842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.05087890625, "learning_rate": 0.0001, "loss": 3.0364, "loss/crossentropy": 2.4761088371276854, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.24524887502193451, "loss/reg": 0.0, "step": 18600 }, { "epoch": 0.12243421052631578, "grad_norm": 2.59375, "grad_norm_var": 0.08817952473958333, "learning_rate": 0.0001, "loss": 3.1836, "loss/crossentropy": 2.012187111377716, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.2920076042413712, "loss/reg": 0.0, "step": 18610 }, { "epoch": 0.1225, "grad_norm": 2.421875, "grad_norm_var": 0.07818094889322917, "learning_rate": 0.0001, "loss": 3.1844, "loss/crossentropy": 2.3601612210273744, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.2700245052576065, "loss/reg": 0.0, "step": 18620 }, { "epoch": 0.12256578947368421, "grad_norm": 2.28125, "grad_norm_var": 0.6979482014973958, "learning_rate": 0.0001, "loss": 3.1953, "loss/crossentropy": 2.2805428981781004, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.21399059891700745, "loss/reg": 0.0, "step": 18630 }, { "epoch": 0.12263157894736842, "grad_norm": 2.578125, "grad_norm_var": 0.09913736979166667, "learning_rate": 0.0001, "loss": 3.2462, "loss/crossentropy": 2.3119712233543397, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.27173476070165636, "loss/reg": 0.0, "step": 18640 }, { "epoch": 0.12269736842105264, "grad_norm": 2.484375, "grad_norm_var": 0.12066650390625, "learning_rate": 0.0001, "loss": 3.1387, "loss/crossentropy": 2.1647680759429933, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.2955679178237915, "loss/reg": 0.0, "step": 18650 }, { "epoch": 0.12276315789473684, "grad_norm": 2.21875, "grad_norm_var": 0.0642242431640625, "learning_rate": 0.0001, "loss": 3.0672, "loss/crossentropy": 2.398548412322998, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.24050813913345337, "loss/reg": 0.0, "step": 18660 }, { "epoch": 0.12282894736842105, "grad_norm": 3.046875, "grad_norm_var": 0.43463134765625, "learning_rate": 0.0001, "loss": 3.1768, "loss/crossentropy": 2.007632791996002, "loss/hidden": 3.2265625, "loss/incoh": 0.0, "loss/logits": 0.28355503678321836, "loss/reg": 0.0, "step": 18670 }, { "epoch": 0.12289473684210526, "grad_norm": 2.21875, "grad_norm_var": 0.41833394368489585, "learning_rate": 0.0001, "loss": 3.1491, "loss/crossentropy": 2.4610511898994445, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.23162013590335845, "loss/reg": 0.0, "step": 18680 }, { "epoch": 0.12296052631578948, "grad_norm": 4.15625, "grad_norm_var": 0.29620335896809896, "learning_rate": 0.0001, "loss": 3.1762, "loss/crossentropy": 2.11824317574501, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2083466961979866, "loss/reg": 0.0, "step": 18690 }, { "epoch": 0.12302631578947368, "grad_norm": 2.203125, "grad_norm_var": 1.1225685119628905, "learning_rate": 0.0001, "loss": 3.1766, "loss/crossentropy": 2.236714720726013, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.23704309910535812, "loss/reg": 0.0, "step": 18700 }, { "epoch": 0.12309210526315789, "grad_norm": 2.109375, "grad_norm_var": 0.12454020182291667, "learning_rate": 0.0001, "loss": 3.0992, "loss/crossentropy": 2.2541938424110413, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.23853187412023544, "loss/reg": 0.0, "step": 18710 }, { "epoch": 0.1231578947368421, "grad_norm": 2.171875, "grad_norm_var": 0.070654296875, "learning_rate": 0.0001, "loss": 3.0403, "loss/crossentropy": 2.4512630701065063, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.24162033200263977, "loss/reg": 0.0, "step": 18720 }, { "epoch": 0.12322368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.03876546223958333, "learning_rate": 0.0001, "loss": 3.1522, "loss/crossentropy": 2.3199268102645876, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.23435179740190507, "loss/reg": 0.0, "step": 18730 }, { "epoch": 0.12328947368421053, "grad_norm": 2.421875, "grad_norm_var": 0.0454254150390625, "learning_rate": 0.0001, "loss": 3.019, "loss/crossentropy": 2.3065245509147645, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.22434473037719727, "loss/reg": 0.0, "step": 18740 }, { "epoch": 0.12335526315789473, "grad_norm": 2.515625, "grad_norm_var": 0.1038726806640625, "learning_rate": 0.0001, "loss": 3.145, "loss/crossentropy": 2.2620524525642396, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2564009681344032, "loss/reg": 0.0, "step": 18750 }, { "epoch": 0.12342105263157895, "grad_norm": 3.40625, "grad_norm_var": 0.1225982666015625, "learning_rate": 0.0001, "loss": 3.1243, "loss/crossentropy": 2.06068754196167, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.23558274507522584, "loss/reg": 0.0, "step": 18760 }, { "epoch": 0.12348684210526316, "grad_norm": 2.4375, "grad_norm_var": 0.28750712076822915, "learning_rate": 0.0001, "loss": 3.1153, "loss/crossentropy": 2.5775238513946532, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.2552958935499191, "loss/reg": 0.0, "step": 18770 }, { "epoch": 0.12355263157894737, "grad_norm": 2.484375, "grad_norm_var": 0.09669596354166667, "learning_rate": 0.0001, "loss": 3.1108, "loss/crossentropy": 2.100378167629242, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.23871416002511978, "loss/reg": 0.0, "step": 18780 }, { "epoch": 0.12361842105263159, "grad_norm": 5.5, "grad_norm_var": 0.6776112874348958, "learning_rate": 0.0001, "loss": 3.1095, "loss/crossentropy": 2.4402441143989564, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.23556852638721465, "loss/reg": 0.0, "step": 18790 }, { "epoch": 0.12368421052631579, "grad_norm": 2.28125, "grad_norm_var": 0.6309967041015625, "learning_rate": 0.0001, "loss": 3.1612, "loss/crossentropy": 2.3034533500671386, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.2525601238012314, "loss/reg": 0.0, "step": 18800 }, { "epoch": 0.12375, "grad_norm": 2.90625, "grad_norm_var": 0.06685282389322916, "learning_rate": 0.0001, "loss": 3.1566, "loss/crossentropy": 2.447878336906433, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.2531526446342468, "loss/reg": 0.0, "step": 18810 }, { "epoch": 0.12381578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.17978515625, "learning_rate": 0.0001, "loss": 3.1701, "loss/crossentropy": 2.297843897342682, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.2250390335917473, "loss/reg": 0.0, "step": 18820 }, { "epoch": 0.12388157894736843, "grad_norm": 2.4375, "grad_norm_var": 0.06181640625, "learning_rate": 0.0001, "loss": 3.1129, "loss/crossentropy": 2.1577285885810853, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2582122042775154, "loss/reg": 0.0, "step": 18830 }, { "epoch": 0.12394736842105263, "grad_norm": 2.859375, "grad_norm_var": 0.08957926432291667, "learning_rate": 0.0001, "loss": 3.1521, "loss/crossentropy": 2.5041862964630126, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.24232363551855088, "loss/reg": 0.0, "step": 18840 }, { "epoch": 0.12401315789473684, "grad_norm": 2.953125, "grad_norm_var": 0.10745035807291667, "learning_rate": 0.0001, "loss": 3.1007, "loss/crossentropy": 1.8846357107162475, "loss/hidden": 3.096875, "loss/incoh": 0.0, "loss/logits": 0.2533442348241806, "loss/reg": 0.0, "step": 18850 }, { "epoch": 0.12407894736842105, "grad_norm": 2.65625, "grad_norm_var": 0.11793212890625, "learning_rate": 0.0001, "loss": 3.0916, "loss/crossentropy": 2.5479671955108643, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.26022554039955137, "loss/reg": 0.0, "step": 18860 }, { "epoch": 0.12414473684210527, "grad_norm": 3.203125, "grad_norm_var": 0.4687652587890625, "learning_rate": 0.0001, "loss": 3.1202, "loss/crossentropy": 2.3512953519821167, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.2577581197023392, "loss/reg": 0.0, "step": 18870 }, { "epoch": 0.12421052631578948, "grad_norm": 2.265625, "grad_norm_var": 0.49575907389322915, "learning_rate": 0.0001, "loss": 3.192, "loss/crossentropy": 2.4191364645957947, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2868465960025787, "loss/reg": 0.0, "step": 18880 }, { "epoch": 0.12427631578947368, "grad_norm": 2.296875, "grad_norm_var": 0.20038655598958333, "learning_rate": 0.0001, "loss": 3.1072, "loss/crossentropy": 2.548805284500122, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.22707584351301194, "loss/reg": 0.0, "step": 18890 }, { "epoch": 0.12434210526315789, "grad_norm": 2.4375, "grad_norm_var": 0.16153055826822918, "learning_rate": 0.0001, "loss": 3.0653, "loss/crossentropy": 2.4287894129753114, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24110280126333236, "loss/reg": 0.0, "step": 18900 }, { "epoch": 0.1244078947368421, "grad_norm": 2.59375, "grad_norm_var": 0.06378580729166666, "learning_rate": 0.0001, "loss": 3.1229, "loss/crossentropy": 2.381858563423157, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.28408930599689486, "loss/reg": 0.0, "step": 18910 }, { "epoch": 0.12447368421052632, "grad_norm": 3.9375, "grad_norm_var": 0.33860575358072914, "learning_rate": 0.0001, "loss": 3.1459, "loss/crossentropy": 2.0421772241592406, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.24683285355567933, "loss/reg": 0.0, "step": 18920 }, { "epoch": 0.12453947368421053, "grad_norm": 2.390625, "grad_norm_var": 0.3823638916015625, "learning_rate": 0.0001, "loss": 3.1332, "loss/crossentropy": 2.528811717033386, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.26049076169729235, "loss/reg": 0.0, "step": 18930 }, { "epoch": 0.12460526315789473, "grad_norm": 2.3125, "grad_norm_var": 0.1140045166015625, "learning_rate": 0.0001, "loss": 3.2262, "loss/crossentropy": 2.391852283477783, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.3043837010860443, "loss/reg": 0.0, "step": 18940 }, { "epoch": 0.12467105263157895, "grad_norm": 2.03125, "grad_norm_var": 0.06843973795572916, "learning_rate": 0.0001, "loss": 3.149, "loss/crossentropy": 2.218567681312561, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.254660502076149, "loss/reg": 0.0, "step": 18950 }, { "epoch": 0.12473684210526316, "grad_norm": 2.375, "grad_norm_var": 0.0237701416015625, "learning_rate": 0.0001, "loss": 3.1052, "loss/crossentropy": 2.3154717803001406, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.2391164407134056, "loss/reg": 0.0, "step": 18960 }, { "epoch": 0.12480263157894737, "grad_norm": 2.34375, "grad_norm_var": 0.05070699055989583, "learning_rate": 0.0001, "loss": 3.0759, "loss/crossentropy": 2.2152688026428224, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.28472713232040403, "loss/reg": 0.0, "step": 18970 }, { "epoch": 0.12486842105263157, "grad_norm": 2.3125, "grad_norm_var": 0.08042704264322917, "learning_rate": 0.0001, "loss": 3.1049, "loss/crossentropy": 2.392467772960663, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.34464606642723083, "loss/reg": 0.0, "step": 18980 }, { "epoch": 0.12493421052631579, "grad_norm": 2.625, "grad_norm_var": 0.07502848307291667, "learning_rate": 0.0001, "loss": 3.1321, "loss/crossentropy": 2.4656677722930906, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.25502916276454923, "loss/reg": 0.0, "step": 18990 }, { "epoch": 0.125, "grad_norm": 2.421875, "grad_norm_var": 1.4638824462890625, "learning_rate": 0.0001, "loss": 3.2055, "loss/crossentropy": 2.27457115650177, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.21155266612768173, "loss/reg": 0.0, "step": 19000 }, { "epoch": 0.1250657894736842, "grad_norm": 2.15625, "grad_norm_var": 0.5252115885416667, "learning_rate": 0.0001, "loss": 3.0721, "loss/crossentropy": 2.232183575630188, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.23653523325920106, "loss/reg": 0.0, "step": 19010 }, { "epoch": 0.12513157894736843, "grad_norm": 2.546875, "grad_norm_var": 0.3392415364583333, "learning_rate": 0.0001, "loss": 3.1599, "loss/crossentropy": 2.134384286403656, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.23607225120067596, "loss/reg": 0.0, "step": 19020 }, { "epoch": 0.12519736842105264, "grad_norm": 2.4375, "grad_norm_var": 0.14305013020833332, "learning_rate": 0.0001, "loss": 3.1322, "loss/crossentropy": 2.406242322921753, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.2707079291343689, "loss/reg": 0.0, "step": 19030 }, { "epoch": 0.12526315789473685, "grad_norm": 2.234375, "grad_norm_var": 0.34036051432291664, "learning_rate": 0.0001, "loss": 3.2069, "loss/crossentropy": 2.4997928857803347, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.2432278200984001, "loss/reg": 0.0, "step": 19040 }, { "epoch": 0.12532894736842104, "grad_norm": 2.4375, "grad_norm_var": 0.12541910807291667, "learning_rate": 0.0001, "loss": 3.0995, "loss/crossentropy": 2.2165623545646667, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.24738555699586867, "loss/reg": 0.0, "step": 19050 }, { "epoch": 0.12539473684210525, "grad_norm": 2.8125, "grad_norm_var": 0.15855712890625, "learning_rate": 0.0001, "loss": 3.1697, "loss/crossentropy": 2.510625755786896, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.22799582332372664, "loss/reg": 0.0, "step": 19060 }, { "epoch": 0.12546052631578947, "grad_norm": 2.609375, "grad_norm_var": 0.09364827473958333, "learning_rate": 0.0001, "loss": 3.2057, "loss/crossentropy": 2.380051004886627, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.2702123373746872, "loss/reg": 0.0, "step": 19070 }, { "epoch": 0.12552631578947368, "grad_norm": 2.015625, "grad_norm_var": 0.07896728515625, "learning_rate": 0.0001, "loss": 3.0865, "loss/crossentropy": 2.2913742661476135, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.22653487473726272, "loss/reg": 0.0, "step": 19080 }, { "epoch": 0.1255921052631579, "grad_norm": 2.453125, "grad_norm_var": 0.1556549072265625, "learning_rate": 0.0001, "loss": 3.1844, "loss/crossentropy": 2.39451003074646, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.23494229055941104, "loss/reg": 0.0, "step": 19090 }, { "epoch": 0.1256578947368421, "grad_norm": 2.5, "grad_norm_var": 0.42444254557291666, "learning_rate": 0.0001, "loss": 3.1173, "loss/crossentropy": 2.2207372069358824, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2351771742105484, "loss/reg": 0.0, "step": 19100 }, { "epoch": 0.12572368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.405517578125, "learning_rate": 0.0001, "loss": 3.1686, "loss/crossentropy": 2.544073963165283, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.23578422516584396, "loss/reg": 0.0, "step": 19110 }, { "epoch": 0.12578947368421053, "grad_norm": 2.28125, "grad_norm_var": 0.022623697916666668, "learning_rate": 0.0001, "loss": 3.1575, "loss/crossentropy": 2.0650166511535644, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.2400606468319893, "loss/reg": 0.0, "step": 19120 }, { "epoch": 0.12585526315789475, "grad_norm": 2.078125, "grad_norm_var": 0.12542215983072916, "learning_rate": 0.0001, "loss": 3.1717, "loss/crossentropy": 2.1879891753196716, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.23869821876287461, "loss/reg": 0.0, "step": 19130 }, { "epoch": 0.12592105263157893, "grad_norm": 2.84375, "grad_norm_var": 2.683204189608586e+17, "learning_rate": 0.0001, "loss": 3.2235, "loss/crossentropy": 2.4875245571136473, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.3171342611312866, "loss/reg": 0.0, "step": 19140 }, { "epoch": 0.12598684210526315, "grad_norm": 2.109375, "grad_norm_var": 0.12364908854166666, "learning_rate": 0.0001, "loss": 3.1832, "loss/crossentropy": 2.199462330341339, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.2820486217737198, "loss/reg": 0.0, "step": 19150 }, { "epoch": 0.12605263157894736, "grad_norm": 3.53125, "grad_norm_var": 0.133642578125, "learning_rate": 0.0001, "loss": 3.2179, "loss/crossentropy": 2.433344876766205, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.3452912583947182, "loss/reg": 0.0, "step": 19160 }, { "epoch": 0.12611842105263157, "grad_norm": 2.328125, "grad_norm_var": 0.23087946573893228, "learning_rate": 0.0001, "loss": 3.073, "loss/crossentropy": 2.107306253910065, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.29124595075845716, "loss/reg": 0.0, "step": 19170 }, { "epoch": 0.1261842105263158, "grad_norm": 2.640625, "grad_norm_var": 2.206763811704668e+17, "learning_rate": 0.0001, "loss": 3.2818, "loss/crossentropy": 2.17460697889328, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.26940477788448336, "loss/reg": 0.0, "step": 19180 }, { "epoch": 0.12625, "grad_norm": 2.25, "grad_norm_var": 0.06741434733072917, "learning_rate": 0.0001, "loss": 3.1977, "loss/crossentropy": 2.249357485771179, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.29158340096473695, "loss/reg": 0.0, "step": 19190 }, { "epoch": 0.12631578947368421, "grad_norm": 2.28125, "grad_norm_var": 0.1665679931640625, "learning_rate": 0.0001, "loss": 3.0798, "loss/crossentropy": 2.39562349319458, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.24128414690494537, "loss/reg": 0.0, "step": 19200 }, { "epoch": 0.12638157894736843, "grad_norm": 2.515625, "grad_norm_var": 0.04352925618489583, "learning_rate": 0.0001, "loss": 3.0886, "loss/crossentropy": 2.4379887223243712, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2693786233663559, "loss/reg": 0.0, "step": 19210 }, { "epoch": 0.12644736842105264, "grad_norm": 2.609375, "grad_norm_var": 0.05164286295572917, "learning_rate": 0.0001, "loss": 3.052, "loss/crossentropy": 2.327665722370148, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.37138707339763644, "loss/reg": 0.0, "step": 19220 }, { "epoch": 0.12651315789473686, "grad_norm": 2.09375, "grad_norm_var": 1.5772623697916666, "learning_rate": 0.0001, "loss": 3.1007, "loss/crossentropy": 2.1090051174163817, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.23532682955265044, "loss/reg": 0.0, "step": 19230 }, { "epoch": 0.12657894736842104, "grad_norm": 2.5625, "grad_norm_var": 0.76259765625, "learning_rate": 0.0001, "loss": 3.087, "loss/crossentropy": 2.3096412897109984, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2415456846356392, "loss/reg": 0.0, "step": 19240 }, { "epoch": 0.12664473684210525, "grad_norm": 2.5, "grad_norm_var": 0.0485504150390625, "learning_rate": 0.0001, "loss": 3.0857, "loss/crossentropy": 2.252851128578186, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.21757592558860778, "loss/reg": 0.0, "step": 19250 }, { "epoch": 0.12671052631578947, "grad_norm": 2.359375, "grad_norm_var": 0.16155192057291667, "learning_rate": 0.0001, "loss": 3.1508, "loss/crossentropy": 2.19182807803154, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.23997026532888413, "loss/reg": 0.0, "step": 19260 }, { "epoch": 0.12677631578947368, "grad_norm": 2.4375, "grad_norm_var": 0.4118609110514323, "learning_rate": 0.0001, "loss": 3.1121, "loss/crossentropy": 2.4667071104049683, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.23552417606115342, "loss/reg": 0.0, "step": 19270 }, { "epoch": 0.1268421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.04609553019205729, "learning_rate": 0.0001, "loss": 3.1763, "loss/crossentropy": 2.1369692206382753, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.24942600578069687, "loss/reg": 0.0, "step": 19280 }, { "epoch": 0.1269078947368421, "grad_norm": 2.5, "grad_norm_var": 0.0526031494140625, "learning_rate": 0.0001, "loss": 3.1068, "loss/crossentropy": 2.11109459400177, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.25110483914613724, "loss/reg": 0.0, "step": 19290 }, { "epoch": 0.12697368421052632, "grad_norm": 2.578125, "grad_norm_var": 0.053343709309895834, "learning_rate": 0.0001, "loss": 3.1517, "loss/crossentropy": 2.380664014816284, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2508363798260689, "loss/reg": 0.0, "step": 19300 }, { "epoch": 0.12703947368421054, "grad_norm": 2.171875, "grad_norm_var": 0.034032185872395836, "learning_rate": 0.0001, "loss": 3.1303, "loss/crossentropy": 2.380405902862549, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2564548909664154, "loss/reg": 0.0, "step": 19310 }, { "epoch": 0.12710526315789475, "grad_norm": 2.375, "grad_norm_var": 0.03234049479166667, "learning_rate": 0.0001, "loss": 3.1223, "loss/crossentropy": 2.1872188091278075, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2912978962063789, "loss/reg": 0.0, "step": 19320 }, { "epoch": 0.12717105263157893, "grad_norm": 2.328125, "grad_norm_var": 0.07059504191080729, "learning_rate": 0.0001, "loss": 3.1221, "loss/crossentropy": 2.470194971561432, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2459412097930908, "loss/reg": 0.0, "step": 19330 }, { "epoch": 0.12723684210526315, "grad_norm": 2.484375, "grad_norm_var": 0.1386431376139323, "learning_rate": 0.0001, "loss": 3.1555, "loss/crossentropy": 2.3521093368530273, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2635854005813599, "loss/reg": 0.0, "step": 19340 }, { "epoch": 0.12730263157894736, "grad_norm": 2.4375, "grad_norm_var": 0.053857421875, "learning_rate": 0.0001, "loss": 3.0594, "loss/crossentropy": 2.099438285827637, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.22058220505714415, "loss/reg": 0.0, "step": 19350 }, { "epoch": 0.12736842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.10086161295572917, "learning_rate": 0.0001, "loss": 3.1877, "loss/crossentropy": 2.2927380204200745, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.25694535821676256, "loss/reg": 0.0, "step": 19360 }, { "epoch": 0.1274342105263158, "grad_norm": 1.9375, "grad_norm_var": 0.05416259765625, "learning_rate": 0.0001, "loss": 3.0826, "loss/crossentropy": 2.1295772194862366, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.254076661169529, "loss/reg": 0.0, "step": 19370 }, { "epoch": 0.1275, "grad_norm": 2.5625, "grad_norm_var": 0.24744466145833333, "learning_rate": 0.0001, "loss": 3.1415, "loss/crossentropy": 1.9256490916013718, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.24017905220389366, "loss/reg": 0.0, "step": 19380 }, { "epoch": 0.12756578947368422, "grad_norm": 2.421875, "grad_norm_var": 0.5484659830729167, "learning_rate": 0.0001, "loss": 3.1094, "loss/crossentropy": 2.4685181736946107, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.26331629455089567, "loss/reg": 0.0, "step": 19390 }, { "epoch": 0.12763157894736843, "grad_norm": 2.3125, "grad_norm_var": 0.39381103515625, "learning_rate": 0.0001, "loss": 3.1037, "loss/crossentropy": 2.2764881372451784, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.25390326231718063, "loss/reg": 0.0, "step": 19400 }, { "epoch": 0.12769736842105264, "grad_norm": 2.765625, "grad_norm_var": 0.0698150634765625, "learning_rate": 0.0001, "loss": 3.1262, "loss/crossentropy": 2.299801528453827, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2553006038069725, "loss/reg": 0.0, "step": 19410 }, { "epoch": 0.12776315789473683, "grad_norm": 2.453125, "grad_norm_var": 0.266357421875, "learning_rate": 0.0001, "loss": 3.1239, "loss/crossentropy": 2.0346228003501894, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.22659170776605606, "loss/reg": 0.0, "step": 19420 }, { "epoch": 0.12782894736842104, "grad_norm": 2.34375, "grad_norm_var": 0.37324930826822916, "learning_rate": 0.0001, "loss": 3.1527, "loss/crossentropy": 1.8951176881790162, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.23443188220262529, "loss/reg": 0.0, "step": 19430 }, { "epoch": 0.12789473684210526, "grad_norm": 2.71875, "grad_norm_var": 0.30020243326822915, "learning_rate": 0.0001, "loss": 3.1742, "loss/crossentropy": 2.0540109515190124, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.20190905332565307, "loss/reg": 0.0, "step": 19440 }, { "epoch": 0.12796052631578947, "grad_norm": 2.75, "grad_norm_var": 0.19728190104166668, "learning_rate": 0.0001, "loss": 3.1528, "loss/crossentropy": 2.3164134502410887, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.2684441477060318, "loss/reg": 0.0, "step": 19450 }, { "epoch": 0.12802631578947368, "grad_norm": 2.25, "grad_norm_var": 0.2419586181640625, "learning_rate": 0.0001, "loss": 3.1599, "loss/crossentropy": 2.380271017551422, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.2657022625207901, "loss/reg": 0.0, "step": 19460 }, { "epoch": 0.1280921052631579, "grad_norm": 2.4375, "grad_norm_var": 0.11366780598958333, "learning_rate": 0.0001, "loss": 3.1441, "loss/crossentropy": 2.4884063005447388, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.2683678835630417, "loss/reg": 0.0, "step": 19470 }, { "epoch": 0.1281578947368421, "grad_norm": 2038431744.0, "grad_norm_var": 2.597002478369833e+17, "learning_rate": 0.0001, "loss": 3.2142, "loss/crossentropy": 2.0524453282356263, "loss/hidden": 3.80625, "loss/incoh": 0.0, "loss/logits": 0.2409697949886322, "loss/reg": 0.0, "step": 19480 }, { "epoch": 0.12822368421052632, "grad_norm": 2.203125, "grad_norm_var": 2.597002478497235e+17, "learning_rate": 0.0001, "loss": 3.0528, "loss/crossentropy": 2.2681432604789733, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.24559762477874755, "loss/reg": 0.0, "step": 19490 }, { "epoch": 0.12828947368421054, "grad_norm": 2.296875, "grad_norm_var": 0.030345662434895834, "learning_rate": 0.0001, "loss": 3.0796, "loss/crossentropy": 2.362069141864777, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.2367064341902733, "loss/reg": 0.0, "step": 19500 }, { "epoch": 0.12835526315789475, "grad_norm": 2.75, "grad_norm_var": 0.059382120768229164, "learning_rate": 0.0001, "loss": 3.1465, "loss/crossentropy": 2.260143554210663, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.25476695597171783, "loss/reg": 0.0, "step": 19510 }, { "epoch": 0.12842105263157894, "grad_norm": 2.40625, "grad_norm_var": 0.18327534993489583, "learning_rate": 0.0001, "loss": 3.1443, "loss/crossentropy": 2.322359097003937, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.22897413671016692, "loss/reg": 0.0, "step": 19520 }, { "epoch": 0.12848684210526315, "grad_norm": 2.078125, "grad_norm_var": 0.4216379801432292, "learning_rate": 0.0001, "loss": 3.1736, "loss/crossentropy": 2.658802056312561, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.26290144920349123, "loss/reg": 0.0, "step": 19530 }, { "epoch": 0.12855263157894736, "grad_norm": 2.0625, "grad_norm_var": 0.1238677978515625, "learning_rate": 0.0001, "loss": 3.1818, "loss/crossentropy": 2.389211559295654, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.29232275635004046, "loss/reg": 0.0, "step": 19540 }, { "epoch": 0.12861842105263158, "grad_norm": 2.421875, "grad_norm_var": 0.049845123291015626, "learning_rate": 0.0001, "loss": 3.0708, "loss/crossentropy": 2.186306023597717, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.2743732765316963, "loss/reg": 0.0, "step": 19550 }, { "epoch": 0.1286842105263158, "grad_norm": 2.40625, "grad_norm_var": 0.024887847900390624, "learning_rate": 0.0001, "loss": 3.1515, "loss/crossentropy": 2.325215721130371, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.24441724121570588, "loss/reg": 0.0, "step": 19560 }, { "epoch": 0.12875, "grad_norm": 2.40625, "grad_norm_var": 0.18791910807291667, "learning_rate": 0.0001, "loss": 3.1163, "loss/crossentropy": 2.4399210453033446, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.23570073395967484, "loss/reg": 0.0, "step": 19570 }, { "epoch": 0.12881578947368422, "grad_norm": 2.34375, "grad_norm_var": 0.0600250244140625, "learning_rate": 0.0001, "loss": 3.0731, "loss/crossentropy": 2.250354325771332, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.25124100893735885, "loss/reg": 0.0, "step": 19580 }, { "epoch": 0.12888157894736843, "grad_norm": 2.34375, "grad_norm_var": 0.2105865478515625, "learning_rate": 0.0001, "loss": 3.1494, "loss/crossentropy": 2.3004735589027403, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.250587160885334, "loss/reg": 0.0, "step": 19590 }, { "epoch": 0.12894736842105264, "grad_norm": 2.125, "grad_norm_var": 0.23645833333333333, "learning_rate": 0.0001, "loss": 3.099, "loss/crossentropy": 2.3559111833572386, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.26171091198921204, "loss/reg": 0.0, "step": 19600 }, { "epoch": 0.12901315789473683, "grad_norm": 2.328125, "grad_norm_var": 0.028955078125, "learning_rate": 0.0001, "loss": 3.171, "loss/crossentropy": 2.2754984378814695, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.2695039168000221, "loss/reg": 0.0, "step": 19610 }, { "epoch": 0.12907894736842104, "grad_norm": 2.703125, "grad_norm_var": 0.12908426920572916, "learning_rate": 0.0001, "loss": 3.1609, "loss/crossentropy": 2.457037115097046, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.2523172840476036, "loss/reg": 0.0, "step": 19620 }, { "epoch": 0.12914473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.14287007649739583, "learning_rate": 0.0001, "loss": 3.0694, "loss/crossentropy": 2.33393777012825, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.23025162070989608, "loss/reg": 0.0, "step": 19630 }, { "epoch": 0.12921052631578947, "grad_norm": 2.125, "grad_norm_var": 0.029618326822916666, "learning_rate": 0.0001, "loss": 3.0684, "loss/crossentropy": 2.155827796459198, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.22635919600725174, "loss/reg": 0.0, "step": 19640 }, { "epoch": 0.12927631578947368, "grad_norm": 2.03125, "grad_norm_var": 0.31851806640625, "learning_rate": 0.0001, "loss": 3.112, "loss/crossentropy": 2.2350030899047852, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.2474326401948929, "loss/reg": 0.0, "step": 19650 }, { "epoch": 0.1293421052631579, "grad_norm": 2.453125, "grad_norm_var": 0.30481363932291666, "learning_rate": 0.0001, "loss": 3.0884, "loss/crossentropy": 2.1235520601272584, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.23340977281332015, "loss/reg": 0.0, "step": 19660 }, { "epoch": 0.1294078947368421, "grad_norm": 2.4375, "grad_norm_var": 0.058958943684895834, "learning_rate": 0.0001, "loss": 3.1092, "loss/crossentropy": 2.436754751205444, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.2396928071975708, "loss/reg": 0.0, "step": 19670 }, { "epoch": 0.12947368421052632, "grad_norm": 2.84375, "grad_norm_var": 0.2934804280598958, "learning_rate": 0.0001, "loss": 3.0531, "loss/crossentropy": 2.24703209400177, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.23143238127231597, "loss/reg": 0.0, "step": 19680 }, { "epoch": 0.12953947368421054, "grad_norm": 2.25, "grad_norm_var": 0.25420303344726564, "learning_rate": 0.0001, "loss": 3.0318, "loss/crossentropy": 2.536008381843567, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.22230196446180345, "loss/reg": 0.0, "step": 19690 }, { "epoch": 0.12960526315789472, "grad_norm": 2.21875, "grad_norm_var": 0.29778416951497394, "learning_rate": 0.0001, "loss": 3.0902, "loss/crossentropy": 2.1229737393558024, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.2103697349317372, "loss/reg": 0.0, "step": 19700 }, { "epoch": 0.12967105263157894, "grad_norm": 2.0625, "grad_norm_var": 0.09654032389322917, "learning_rate": 0.0001, "loss": 3.0303, "loss/crossentropy": 2.360739004611969, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.238627889752388, "loss/reg": 0.0, "step": 19710 }, { "epoch": 0.12973684210526315, "grad_norm": 2.234375, "grad_norm_var": 0.16044514973958332, "learning_rate": 0.0001, "loss": 3.082, "loss/crossentropy": 2.3643787026405336, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.2354188710451126, "loss/reg": 0.0, "step": 19720 }, { "epoch": 0.12980263157894736, "grad_norm": 2.375, "grad_norm_var": 0.23007405598958333, "learning_rate": 0.0001, "loss": 3.0884, "loss/crossentropy": 2.3766650319099427, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.26477697044610976, "loss/reg": 0.0, "step": 19730 }, { "epoch": 0.12986842105263158, "grad_norm": 2.21875, "grad_norm_var": 0.055946604410807295, "learning_rate": 0.0001, "loss": 3.0414, "loss/crossentropy": 2.6002285480499268, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.33312758058309555, "loss/reg": 0.0, "step": 19740 }, { "epoch": 0.1299342105263158, "grad_norm": 2.515625, "grad_norm_var": 0.13170547485351564, "learning_rate": 0.0001, "loss": 3.0758, "loss/crossentropy": 2.457702159881592, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.25578114166855814, "loss/reg": 0.0, "step": 19750 }, { "epoch": 0.13, "grad_norm": 2.65625, "grad_norm_var": 0.03232014973958333, "learning_rate": 0.0001, "loss": 3.0882, "loss/crossentropy": 2.261428934335709, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.2278106167912483, "loss/reg": 0.0, "step": 19760 }, { "epoch": 0.13006578947368422, "grad_norm": 2.421875, "grad_norm_var": 0.022248331705729166, "learning_rate": 0.0001, "loss": 3.0582, "loss/crossentropy": 2.060434710979462, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.24812956005334855, "loss/reg": 0.0, "step": 19770 }, { "epoch": 0.13013157894736843, "grad_norm": 2.40625, "grad_norm_var": 0.0256256103515625, "learning_rate": 0.0001, "loss": 3.08, "loss/crossentropy": 2.2472903966903686, "loss/hidden": 3.1, "loss/incoh": 0.0, "loss/logits": 0.34347400814294815, "loss/reg": 0.0, "step": 19780 }, { "epoch": 0.13019736842105264, "grad_norm": 2.265625, "grad_norm_var": 0.0520660400390625, "learning_rate": 0.0001, "loss": 3.1154, "loss/crossentropy": 2.3003188014030456, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2695119082927704, "loss/reg": 0.0, "step": 19790 }, { "epoch": 0.13026315789473683, "grad_norm": 2.578125, "grad_norm_var": 0.08396809895833333, "learning_rate": 0.0001, "loss": 3.0827, "loss/crossentropy": 2.2551465153694155, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.24707757085561752, "loss/reg": 0.0, "step": 19800 }, { "epoch": 0.13032894736842104, "grad_norm": 2.65625, "grad_norm_var": 0.15972900390625, "learning_rate": 0.0001, "loss": 3.151, "loss/crossentropy": 2.323357033729553, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.2660538278520107, "loss/reg": 0.0, "step": 19810 }, { "epoch": 0.13039473684210526, "grad_norm": 2.546875, "grad_norm_var": 0.15328776041666667, "learning_rate": 0.0001, "loss": 3.0792, "loss/crossentropy": 2.369428300857544, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2439693480730057, "loss/reg": 0.0, "step": 19820 }, { "epoch": 0.13046052631578947, "grad_norm": 2.328125, "grad_norm_var": 0.09102274576822916, "learning_rate": 0.0001, "loss": 3.1636, "loss/crossentropy": 2.2469497442245485, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.24479606077075006, "loss/reg": 0.0, "step": 19830 }, { "epoch": 0.13052631578947368, "grad_norm": 2.625, "grad_norm_var": 0.052302042643229164, "learning_rate": 0.0001, "loss": 3.1694, "loss/crossentropy": 2.4753658294677736, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.3045656159520149, "loss/reg": 0.0, "step": 19840 }, { "epoch": 0.1305921052631579, "grad_norm": 2.265625, "grad_norm_var": 0.0764068603515625, "learning_rate": 0.0001, "loss": 3.0959, "loss/crossentropy": 2.4656317472457885, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.258389513194561, "loss/reg": 0.0, "step": 19850 }, { "epoch": 0.1306578947368421, "grad_norm": 2.515625, "grad_norm_var": 0.06215718587239583, "learning_rate": 0.0001, "loss": 3.0406, "loss/crossentropy": 2.440659189224243, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.26817646920681, "loss/reg": 0.0, "step": 19860 }, { "epoch": 0.13072368421052633, "grad_norm": 2.296875, "grad_norm_var": 0.04420166015625, "learning_rate": 0.0001, "loss": 3.0968, "loss/crossentropy": 2.206334137916565, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.24976521283388137, "loss/reg": 0.0, "step": 19870 }, { "epoch": 0.13078947368421054, "grad_norm": 2.109375, "grad_norm_var": 0.08323567708333333, "learning_rate": 0.0001, "loss": 3.0685, "loss/crossentropy": 2.21451940536499, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.21932003498077393, "loss/reg": 0.0, "step": 19880 }, { "epoch": 0.13085526315789472, "grad_norm": 2.84375, "grad_norm_var": 0.17522786458333334, "learning_rate": 0.0001, "loss": 3.0628, "loss/crossentropy": 2.1315925240516664, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.23312209993600846, "loss/reg": 0.0, "step": 19890 }, { "epoch": 0.13092105263157894, "grad_norm": 6.28125, "grad_norm_var": 0.9668528238932291, "learning_rate": 0.0001, "loss": 3.1207, "loss/crossentropy": 2.5357746481895447, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2542704403400421, "loss/reg": 0.0, "step": 19900 }, { "epoch": 0.13098684210526315, "grad_norm": 2.25, "grad_norm_var": 0.9971913655598958, "learning_rate": 0.0001, "loss": 3.0247, "loss/crossentropy": 2.53050377368927, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2318735808134079, "loss/reg": 0.0, "step": 19910 }, { "epoch": 0.13105263157894737, "grad_norm": 2.78125, "grad_norm_var": 0.14268290201822917, "learning_rate": 0.0001, "loss": 3.1545, "loss/crossentropy": 2.475492477416992, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.29062798619270325, "loss/reg": 0.0, "step": 19920 }, { "epoch": 0.13111842105263158, "grad_norm": 2.390625, "grad_norm_var": 0.08727925618489583, "learning_rate": 0.0001, "loss": 3.1316, "loss/crossentropy": 2.4628885269165037, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.24764630049467087, "loss/reg": 0.0, "step": 19930 }, { "epoch": 0.1311842105263158, "grad_norm": 2.171875, "grad_norm_var": 0.07839736938476563, "learning_rate": 0.0001, "loss": 3.0597, "loss/crossentropy": 2.46280722618103, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2163504734635353, "loss/reg": 0.0, "step": 19940 }, { "epoch": 0.13125, "grad_norm": 2.34375, "grad_norm_var": 0.0990875244140625, "learning_rate": 0.0001, "loss": 3.1648, "loss/crossentropy": 2.4153407394886015, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.26917385756969453, "loss/reg": 0.0, "step": 19950 }, { "epoch": 0.13131578947368422, "grad_norm": 2.8125, "grad_norm_var": 0.18907877604166667, "learning_rate": 0.0001, "loss": 3.1173, "loss/crossentropy": 2.232962656021118, "loss/hidden": 3.0734375, "loss/incoh": 0.0, "loss/logits": 0.3076439991593361, "loss/reg": 0.0, "step": 19960 }, { "epoch": 0.13138157894736843, "grad_norm": 2.3125, "grad_norm_var": 0.13981119791666666, "learning_rate": 0.0001, "loss": 3.0478, "loss/crossentropy": 2.353410243988037, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.24693673849105835, "loss/reg": 0.0, "step": 19970 }, { "epoch": 0.13144736842105262, "grad_norm": 3.15625, "grad_norm_var": 0.1372711181640625, "learning_rate": 0.0001, "loss": 3.1452, "loss/crossentropy": 2.121303880214691, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.31650226265192033, "loss/reg": 0.0, "step": 19980 }, { "epoch": 0.13151315789473683, "grad_norm": 2.25, "grad_norm_var": 0.1741607666015625, "learning_rate": 0.0001, "loss": 3.1028, "loss/crossentropy": 2.43160719871521, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.2277207463979721, "loss/reg": 0.0, "step": 19990 }, { "epoch": 0.13157894736842105, "grad_norm": 2.140625, "grad_norm_var": 0.15798238118489583, "learning_rate": 0.0001, "loss": 3.0532, "loss/crossentropy": 2.2883418917655947, "loss/hidden": 2.990625, "loss/incoh": 0.0, "loss/logits": 0.2525527849793434, "loss/reg": 0.0, "step": 20000 } ], "logging_steps": 10, "max_steps": 152000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4287550160044032e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }