{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.39473684210526316, "eval_steps": 2000, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.578947368421052e-05, "grad_norm": 992.0, "learning_rate": 1e-05, "loss": 37.1063, "loss/crossentropy": 15.088774585723877, "loss/hidden": 19.0875, "loss/incoh": 0.0, "loss/logits": 17.867034912109375, "loss/reg": 0.0, "step": 10 }, { "epoch": 0.00013157894736842105, "grad_norm": 408.0, "grad_norm_var": 138977.05, "learning_rate": 2e-05, "loss": 34.8921, "loss/crossentropy": 14.647695541381836, "loss/hidden": 18.8125, "loss/incoh": 0.0, "loss/logits": 15.686719226837159, "loss/reg": 0.0, "step": 20 }, { "epoch": 0.00019736842105263157, "grad_norm": 296.0, "grad_norm_var": 17795.066666666666, "learning_rate": 3e-05, "loss": 32.2418, "loss/crossentropy": 14.60380687713623, "loss/hidden": 18.6375, "loss/incoh": 0.0, "loss/logits": 13.480152130126953, "loss/reg": 0.0, "step": 30 }, { "epoch": 0.0002631578947368421, "grad_norm": 26.125, "grad_norm_var": 30349.038525390624, "learning_rate": 4e-05, "loss": 28.5558, "loss/crossentropy": 15.434869766235352, "loss/hidden": 18.525, "loss/incoh": 0.0, "loss/logits": 10.245227527618407, "loss/reg": 0.0, "step": 40 }, { "epoch": 0.0003289473684210526, "grad_norm": 51.25, "grad_norm_var": 160.51717122395834, "learning_rate": 5e-05, "loss": 27.8535, "loss/crossentropy": 13.159739780426026, "loss/hidden": 18.4, "loss/incoh": 0.0, "loss/logits": 10.088242149353027, "loss/reg": 0.0, "step": 50 }, { "epoch": 0.00039473684210526315, "grad_norm": 22.125, "grad_norm_var": 1.3315524858929522e+17, "learning_rate": 6e-05, "loss": 26.8127, "loss/crossentropy": 10.868320941925049, "loss/hidden": 18.1375, "loss/incoh": 0.0, "loss/logits": 8.643436527252197, "loss/reg": 0.0, "step": 60 }, { "epoch": 0.0004605263157894737, "grad_norm": 25.625, "grad_norm_var": 1.3315524982997056e+17, "learning_rate": 7e-05, "loss": 25.6909, "loss/crossentropy": 10.59145736694336, "loss/hidden": 17.65, "loss/incoh": 0.0, "loss/logits": 8.497289371490478, "loss/reg": 0.0, "step": 70 }, { "epoch": 0.0005263157894736842, "grad_norm": 32.75, "grad_norm_var": 206.156103515625, "learning_rate": 8e-05, "loss": 24.5919, "loss/crossentropy": 10.117278575897217, "loss/hidden": 17.025, "loss/incoh": 0.0, "loss/logits": 8.660355854034425, "loss/reg": 0.0, "step": 80 }, { "epoch": 0.0005921052631578948, "grad_norm": 128.0, "grad_norm_var": 1129.743212890625, "learning_rate": 9e-05, "loss": 23.5218, "loss/crossentropy": 9.528321361541748, "loss/hidden": 16.3125, "loss/incoh": 0.0, "loss/logits": 6.695920991897583, "loss/reg": 0.0, "step": 90 }, { "epoch": 0.0006578947368421052, "grad_norm": 68.5, "grad_norm_var": 439.56223958333334, "learning_rate": 0.0001, "loss": 23.2768, "loss/crossentropy": 10.33118553161621, "loss/hidden": 16.5125, "loss/incoh": 0.0, "loss/logits": 7.274227619171143, "loss/reg": 0.0, "step": 100 }, { "epoch": 0.0007236842105263158, "grad_norm": 73.5, "grad_norm_var": 2741.439518229167, "learning_rate": 0.0001, "loss": 22.6559, "loss/crossentropy": 9.676900672912598, "loss/hidden": 16.21875, "loss/incoh": 0.0, "loss/logits": 7.22898006439209, "loss/reg": 0.0, "step": 110 }, { "epoch": 0.0007894736842105263, "grad_norm": 115.5, "grad_norm_var": 1451.31875, "learning_rate": 0.0001, "loss": 22.7316, "loss/crossentropy": 9.472201251983643, "loss/hidden": 16.03125, "loss/incoh": 0.0, "loss/logits": 6.231936502456665, "loss/reg": 0.0, "step": 120 }, { "epoch": 0.0008552631578947369, "grad_norm": 32.25, "grad_norm_var": 939.03515625, "learning_rate": 0.0001, "loss": 22.1075, "loss/crossentropy": 9.909808540344239, "loss/hidden": 16.075, "loss/incoh": 0.0, "loss/logits": 6.337067890167236, "loss/reg": 0.0, "step": 130 }, { "epoch": 0.0009210526315789473, "grad_norm": 37.75, "grad_norm_var": 222.684375, "learning_rate": 0.0001, "loss": 22.103, "loss/crossentropy": 9.516400051116943, "loss/hidden": 15.8875, "loss/incoh": 0.0, "loss/logits": 5.677393054962158, "loss/reg": 0.0, "step": 140 }, { "epoch": 0.000986842105263158, "grad_norm": 46.25, "grad_norm_var": 1221.1551432291667, "learning_rate": 0.0001, "loss": 21.5532, "loss/crossentropy": 9.398476314544677, "loss/hidden": 15.70625, "loss/incoh": 0.0, "loss/logits": 6.764282178878784, "loss/reg": 0.0, "step": 150 }, { "epoch": 0.0010526315789473684, "grad_norm": 42.0, "grad_norm_var": 416.9809895833333, "learning_rate": 0.0001, "loss": 20.8381, "loss/crossentropy": 9.098556232452392, "loss/hidden": 15.225, "loss/incoh": 0.0, "loss/logits": 5.8288336277008055, "loss/reg": 0.0, "step": 160 }, { "epoch": 0.0011184210526315789, "grad_norm": 33.25, "grad_norm_var": 334.8979166666667, "learning_rate": 0.0001, "loss": 18.9261, "loss/crossentropy": 7.879999303817749, "loss/hidden": 13.65, "loss/incoh": 0.0, "loss/logits": 5.353159737586975, "loss/reg": 0.0, "step": 170 }, { "epoch": 0.0011842105263157896, "grad_norm": 14.375, "grad_norm_var": 250.13333333333333, "learning_rate": 0.0001, "loss": 16.5004, "loss/crossentropy": 6.681199312210083, "loss/hidden": 12.0125, "loss/incoh": 0.0, "loss/logits": 4.6115447044372555, "loss/reg": 0.0, "step": 180 }, { "epoch": 0.00125, "grad_norm": 12.5625, "grad_norm_var": 119.9546875, "learning_rate": 0.0001, "loss": 14.1282, "loss/crossentropy": 5.785468435287475, "loss/hidden": 10.725, "loss/incoh": 0.0, "loss/logits": 3.489908790588379, "loss/reg": 0.0, "step": 190 }, { "epoch": 0.0013157894736842105, "grad_norm": 12.625, "grad_norm_var": 32.467431640625, "learning_rate": 0.0001, "loss": 12.8216, "loss/crossentropy": 4.923622274398804, "loss/hidden": 9.675, "loss/incoh": 0.0, "loss/logits": 3.1496715903282166, "loss/reg": 0.0, "step": 200 }, { "epoch": 0.001381578947368421, "grad_norm": 26.375, "grad_norm_var": 22.6759765625, "learning_rate": 0.0001, "loss": 11.5516, "loss/crossentropy": 4.429650473594665, "loss/hidden": 8.875, "loss/incoh": 0.0, "loss/logits": 2.247162342071533, "loss/reg": 0.0, "step": 210 }, { "epoch": 0.0014473684210526317, "grad_norm": 35.5, "grad_norm_var": 45.67233072916667, "learning_rate": 0.0001, "loss": 10.495, "loss/crossentropy": 4.112493515014648, "loss/hidden": 8.346875, "loss/incoh": 0.0, "loss/logits": 2.2232163310050965, "loss/reg": 0.0, "step": 220 }, { "epoch": 0.0015131578947368421, "grad_norm": 37.5, "grad_norm_var": 58.01770833333333, "learning_rate": 0.0001, "loss": 9.9703, "loss/crossentropy": 4.019938945770264, "loss/hidden": 8.015625, "loss/incoh": 0.0, "loss/logits": 2.0913679361343385, "loss/reg": 0.0, "step": 230 }, { "epoch": 0.0015789473684210526, "grad_norm": 36.25, "grad_norm_var": 53.95390625, "learning_rate": 0.0001, "loss": 9.5394, "loss/crossentropy": 3.9986461877822874, "loss/hidden": 7.490625, "loss/incoh": 0.0, "loss/logits": 1.801698899269104, "loss/reg": 0.0, "step": 240 }, { "epoch": 0.001644736842105263, "grad_norm": 34.0, "grad_norm_var": 913.9712890625, "learning_rate": 0.0001, "loss": 9.1248, "loss/crossentropy": 3.7174268484115602, "loss/hidden": 7.721875, "loss/incoh": 0.0, "loss/logits": 2.0452203273773195, "loss/reg": 0.0, "step": 250 }, { "epoch": 0.0017105263157894738, "grad_norm": 30.625, "grad_norm_var": 901.7176432291667, "learning_rate": 0.0001, "loss": 8.7525, "loss/crossentropy": 3.6645556688308716, "loss/hidden": 7.36875, "loss/incoh": 0.0, "loss/logits": 1.4742069363594055, "loss/reg": 0.0, "step": 260 }, { "epoch": 0.0017763157894736842, "grad_norm": 40.0, "grad_norm_var": 110.65826822916667, "learning_rate": 0.0001, "loss": 8.8113, "loss/crossentropy": 3.116227722167969, "loss/hidden": 7.31875, "loss/incoh": 0.0, "loss/logits": 1.6629727721214294, "loss/reg": 0.0, "step": 270 }, { "epoch": 0.0018421052631578947, "grad_norm": 34.75, "grad_norm_var": 122.43170572916667, "learning_rate": 0.0001, "loss": 8.5312, "loss/crossentropy": 3.413420820236206, "loss/hidden": 7.38125, "loss/incoh": 0.0, "loss/logits": 1.3304585099220276, "loss/reg": 0.0, "step": 280 }, { "epoch": 0.0019078947368421052, "grad_norm": 30.25, "grad_norm_var": 50.46640625, "learning_rate": 0.0001, "loss": 8.1172, "loss/crossentropy": 3.313588786125183, "loss/hidden": 6.721875, "loss/incoh": 0.0, "loss/logits": 1.1558095216751099, "loss/reg": 0.0, "step": 290 }, { "epoch": 0.001973684210526316, "grad_norm": 33.5, "grad_norm_var": 66.95618489583333, "learning_rate": 0.0001, "loss": 8.4831, "loss/crossentropy": 3.286371445655823, "loss/hidden": 6.971875, "loss/incoh": 0.0, "loss/logits": 1.4257299542427062, "loss/reg": 0.0, "step": 300 }, { "epoch": 0.0020394736842105263, "grad_norm": 37.0, "grad_norm_var": 39.40930989583333, "learning_rate": 0.0001, "loss": 8.1428, "loss/crossentropy": 3.1924397230148314, "loss/hidden": 6.7375, "loss/incoh": 0.0, "loss/logits": 1.129437392950058, "loss/reg": 0.0, "step": 310 }, { "epoch": 0.002105263157894737, "grad_norm": 35.75, "grad_norm_var": 60.73125, "learning_rate": 0.0001, "loss": 8.1236, "loss/crossentropy": 3.217240035533905, "loss/hidden": 7.0375, "loss/incoh": 0.0, "loss/logits": 1.216874635219574, "loss/reg": 0.0, "step": 320 }, { "epoch": 0.0021710526315789473, "grad_norm": 32.75, "grad_norm_var": 21.7181640625, "learning_rate": 0.0001, "loss": 8.0115, "loss/crossentropy": 3.2230591058731077, "loss/hidden": 6.665625, "loss/incoh": 0.0, "loss/logits": 1.165043205022812, "loss/reg": 0.0, "step": 330 }, { "epoch": 0.0022368421052631577, "grad_norm": 38.75, "grad_norm_var": 247.53932291666666, "learning_rate": 0.0001, "loss": 7.8717, "loss/crossentropy": 3.491655874252319, "loss/hidden": 6.628125, "loss/incoh": 0.0, "loss/logits": 1.1553439140319823, "loss/reg": 0.0, "step": 340 }, { "epoch": 0.002302631578947368, "grad_norm": 29.0, "grad_norm_var": 243.79576822916667, "learning_rate": 0.0001, "loss": 7.8354, "loss/crossentropy": 3.3709447622299193, "loss/hidden": 6.646875, "loss/incoh": 0.0, "loss/logits": 1.1052397668361664, "loss/reg": 0.0, "step": 350 }, { "epoch": 0.002368421052631579, "grad_norm": 28.75, "grad_norm_var": 40.61640625, "learning_rate": 0.0001, "loss": 7.5894, "loss/crossentropy": 3.0430339336395265, "loss/hidden": 6.778125, "loss/incoh": 0.0, "loss/logits": 1.152853137254715, "loss/reg": 0.0, "step": 360 }, { "epoch": 0.0024342105263157896, "grad_norm": 50.0, "grad_norm_var": 58.61640625, "learning_rate": 0.0001, "loss": 7.7607, "loss/crossentropy": 3.461497259140015, "loss/hidden": 6.640625, "loss/incoh": 0.0, "loss/logits": 1.2907899796962738, "loss/reg": 0.0, "step": 370 }, { "epoch": 0.0025, "grad_norm": 23.875, "grad_norm_var": 50.36145833333333, "learning_rate": 0.0001, "loss": 7.5895, "loss/crossentropy": 2.8183989763259887, "loss/hidden": 6.36875, "loss/incoh": 0.0, "loss/logits": 0.9597792446613311, "loss/reg": 0.0, "step": 380 }, { "epoch": 0.0025657894736842105, "grad_norm": 22.75, "grad_norm_var": 18.348958333333332, "learning_rate": 0.0001, "loss": 7.4024, "loss/crossentropy": 3.0434406876564024, "loss/hidden": 6.425, "loss/incoh": 0.0, "loss/logits": 1.1219703614711762, "loss/reg": 0.0, "step": 390 }, { "epoch": 0.002631578947368421, "grad_norm": 18.75, "grad_norm_var": 75.46451822916667, "learning_rate": 0.0001, "loss": 7.4663, "loss/crossentropy": 2.9813458204269407, "loss/hidden": 6.334375, "loss/incoh": 0.0, "loss/logits": 1.0157755613327026, "loss/reg": 0.0, "step": 400 }, { "epoch": 0.0026973684210526315, "grad_norm": 20.25, "grad_norm_var": 14.660416666666666, "learning_rate": 0.0001, "loss": 7.4425, "loss/crossentropy": 3.030743360519409, "loss/hidden": 6.23125, "loss/incoh": 0.0, "loss/logits": 1.0403401851654053, "loss/reg": 0.0, "step": 410 }, { "epoch": 0.002763157894736842, "grad_norm": 19.75, "grad_norm_var": 6.712239583333333, "learning_rate": 0.0001, "loss": 7.1935, "loss/crossentropy": 3.044888973236084, "loss/hidden": 6.1125, "loss/incoh": 0.0, "loss/logits": 0.920581477880478, "loss/reg": 0.0, "step": 420 }, { "epoch": 0.002828947368421053, "grad_norm": 18.625, "grad_norm_var": 5.847330729166667, "learning_rate": 0.0001, "loss": 7.0053, "loss/crossentropy": 3.2355963468551634, "loss/hidden": 6.015625, "loss/incoh": 0.0, "loss/logits": 0.9773828387260437, "loss/reg": 0.0, "step": 430 }, { "epoch": 0.0028947368421052633, "grad_norm": 21.875, "grad_norm_var": 9.627067057291667, "learning_rate": 0.0001, "loss": 6.9973, "loss/crossentropy": 3.3775979042053224, "loss/hidden": 6.071875, "loss/incoh": 0.0, "loss/logits": 1.0445533573627472, "loss/reg": 0.0, "step": 440 }, { "epoch": 0.0029605263157894738, "grad_norm": 19.25, "grad_norm_var": 10.170947265625, "learning_rate": 0.0001, "loss": 6.9686, "loss/crossentropy": 3.0577521562576293, "loss/hidden": 5.803125, "loss/incoh": 0.0, "loss/logits": 0.8946591019630432, "loss/reg": 0.0, "step": 450 }, { "epoch": 0.0030263157894736843, "grad_norm": 19.75, "grad_norm_var": 5.329166666666667, "learning_rate": 0.0001, "loss": 6.8021, "loss/crossentropy": 3.2570735692977903, "loss/hidden": 5.678125, "loss/incoh": 0.0, "loss/logits": 0.860103166103363, "loss/reg": 0.0, "step": 460 }, { "epoch": 0.0030921052631578947, "grad_norm": 11.5, "grad_norm_var": 5.882535807291666, "learning_rate": 0.0001, "loss": 6.6494, "loss/crossentropy": 3.045071005821228, "loss/hidden": 5.759375, "loss/incoh": 0.0, "loss/logits": 0.890488612651825, "loss/reg": 0.0, "step": 470 }, { "epoch": 0.003157894736842105, "grad_norm": 12.0, "grad_norm_var": 3.778369140625, "learning_rate": 0.0001, "loss": 6.6399, "loss/crossentropy": 2.955122375488281, "loss/hidden": 5.559375, "loss/incoh": 0.0, "loss/logits": 0.7988932132720947, "loss/reg": 0.0, "step": 480 }, { "epoch": 0.0032236842105263157, "grad_norm": 14.625, "grad_norm_var": 3.397509765625, "learning_rate": 0.0001, "loss": 6.6006, "loss/crossentropy": 2.8290895342826845, "loss/hidden": 5.73125, "loss/incoh": 0.0, "loss/logits": 0.8468542337417603, "loss/reg": 0.0, "step": 490 }, { "epoch": 0.003289473684210526, "grad_norm": 11.5, "grad_norm_var": 4.100113932291666, "learning_rate": 0.0001, "loss": 6.4823, "loss/crossentropy": 2.7418078184127808, "loss/hidden": 5.665625, "loss/incoh": 0.0, "loss/logits": 0.7723756909370423, "loss/reg": 0.0, "step": 500 }, { "epoch": 0.003355263157894737, "grad_norm": 11.5625, "grad_norm_var": 2.79375, "learning_rate": 0.0001, "loss": 6.4511, "loss/crossentropy": 3.1031686782836916, "loss/hidden": 5.721875, "loss/incoh": 0.0, "loss/logits": 0.9047020822763443, "loss/reg": 0.0, "step": 510 }, { "epoch": 0.0034210526315789475, "grad_norm": 10.625, "grad_norm_var": 0.8618326822916667, "learning_rate": 0.0001, "loss": 6.3114, "loss/crossentropy": 2.7031071186065674, "loss/hidden": 5.3875, "loss/incoh": 0.0, "loss/logits": 0.7112044870853425, "loss/reg": 0.0, "step": 520 }, { "epoch": 0.003486842105263158, "grad_norm": 9.625, "grad_norm_var": 94.71066080729166, "learning_rate": 0.0001, "loss": 6.2538, "loss/crossentropy": 2.8632609844207764, "loss/hidden": 5.784375, "loss/incoh": 0.0, "loss/logits": 0.8032145172357559, "loss/reg": 0.0, "step": 530 }, { "epoch": 0.0035526315789473684, "grad_norm": 12.0, "grad_norm_var": 2.7383748372395833, "learning_rate": 0.0001, "loss": 6.1042, "loss/crossentropy": 3.037100100517273, "loss/hidden": 5.25625, "loss/incoh": 0.0, "loss/logits": 0.7572773277759552, "loss/reg": 0.0, "step": 540 }, { "epoch": 0.003618421052631579, "grad_norm": 10.9375, "grad_norm_var": 2.5140584309895835, "learning_rate": 0.0001, "loss": 6.1353, "loss/crossentropy": 2.979613184928894, "loss/hidden": 5.196875, "loss/incoh": 0.0, "loss/logits": 0.7719000339508056, "loss/reg": 0.0, "step": 550 }, { "epoch": 0.0036842105263157894, "grad_norm": 8.0625, "grad_norm_var": 1.6379557291666667, "learning_rate": 0.0001, "loss": 6.0637, "loss/crossentropy": 2.687799036502838, "loss/hidden": 5.51875, "loss/incoh": 0.0, "loss/logits": 0.9166353821754456, "loss/reg": 0.0, "step": 560 }, { "epoch": 0.00375, "grad_norm": 9.9375, "grad_norm_var": 18.912353515625, "learning_rate": 0.0001, "loss": 6.0506, "loss/crossentropy": 2.935390818119049, "loss/hidden": 5.165625, "loss/incoh": 0.0, "loss/logits": 0.716228786110878, "loss/reg": 0.0, "step": 570 }, { "epoch": 0.0038157894736842103, "grad_norm": 7.25, "grad_norm_var": 18.753641764322918, "learning_rate": 0.0001, "loss": 5.9637, "loss/crossentropy": 2.7780107259750366, "loss/hidden": 5.190625, "loss/incoh": 0.0, "loss/logits": 0.7067849993705749, "loss/reg": 0.0, "step": 580 }, { "epoch": 0.0038815789473684212, "grad_norm": 6.84375, "grad_norm_var": 2.39361572265625, "learning_rate": 0.0001, "loss": 5.9361, "loss/crossentropy": 3.0060938119888307, "loss/hidden": 5.225, "loss/incoh": 0.0, "loss/logits": 0.7271955192089081, "loss/reg": 0.0, "step": 590 }, { "epoch": 0.003947368421052632, "grad_norm": 7.6875, "grad_norm_var": 0.75357666015625, "learning_rate": 0.0001, "loss": 5.7669, "loss/crossentropy": 2.691058301925659, "loss/hidden": 4.825, "loss/incoh": 0.0, "loss/logits": 0.6389567136764527, "loss/reg": 0.0, "step": 600 }, { "epoch": 0.004013157894736842, "grad_norm": 6.78125, "grad_norm_var": 2.53160400390625, "learning_rate": 0.0001, "loss": 5.7022, "loss/crossentropy": 2.7504406690597536, "loss/hidden": 5.121875, "loss/incoh": 0.0, "loss/logits": 0.8024603247642517, "loss/reg": 0.0, "step": 610 }, { "epoch": 0.004078947368421053, "grad_norm": 6.875, "grad_norm_var": 2.7155558268229165, "learning_rate": 0.0001, "loss": 5.6145, "loss/crossentropy": 2.9313748240470887, "loss/hidden": 5.00625, "loss/incoh": 0.0, "loss/logits": 0.7257195949554444, "loss/reg": 0.0, "step": 620 }, { "epoch": 0.0041447368421052636, "grad_norm": 5.8125, "grad_norm_var": 0.3809733072916667, "learning_rate": 0.0001, "loss": 5.6209, "loss/crossentropy": 2.8686537384986877, "loss/hidden": 5.05625, "loss/incoh": 0.0, "loss/logits": 0.7894174456596375, "loss/reg": 0.0, "step": 630 }, { "epoch": 0.004210526315789474, "grad_norm": 10.5, "grad_norm_var": 1.7327962239583334, "learning_rate": 0.0001, "loss": 5.6566, "loss/crossentropy": 2.910102880001068, "loss/hidden": 4.95, "loss/incoh": 0.0, "loss/logits": 0.7542287766933441, "loss/reg": 0.0, "step": 640 }, { "epoch": 0.0042763157894736845, "grad_norm": 6.0, "grad_norm_var": 1.90406494140625, "learning_rate": 0.0001, "loss": 5.6548, "loss/crossentropy": 3.0497835516929626, "loss/hidden": 5.075, "loss/incoh": 0.0, "loss/logits": 0.6385725855827331, "loss/reg": 0.0, "step": 650 }, { "epoch": 0.0043421052631578945, "grad_norm": 5.90625, "grad_norm_var": 6.811181640625, "learning_rate": 0.0001, "loss": 5.4435, "loss/crossentropy": 2.839945673942566, "loss/hidden": 4.621875, "loss/incoh": 0.0, "loss/logits": 0.608047366142273, "loss/reg": 0.0, "step": 660 }, { "epoch": 0.0044078947368421054, "grad_norm": 8.5, "grad_norm_var": 1.419775390625, "learning_rate": 0.0001, "loss": 5.4264, "loss/crossentropy": 2.6664235949516297, "loss/hidden": 4.71875, "loss/incoh": 0.0, "loss/logits": 0.5786069691181183, "loss/reg": 0.0, "step": 670 }, { "epoch": 0.0044736842105263155, "grad_norm": 5.25, "grad_norm_var": 1.222509765625, "learning_rate": 0.0001, "loss": 5.4175, "loss/crossentropy": 2.7476831912994384, "loss/hidden": 4.646875, "loss/incoh": 0.0, "loss/logits": 0.6524303257465363, "loss/reg": 0.0, "step": 680 }, { "epoch": 0.004539473684210526, "grad_norm": 5.15625, "grad_norm_var": 0.8695271809895834, "learning_rate": 0.0001, "loss": 5.2974, "loss/crossentropy": 2.718129062652588, "loss/hidden": 4.69375, "loss/incoh": 0.0, "loss/logits": 0.679440614581108, "loss/reg": 0.0, "step": 690 }, { "epoch": 0.004605263157894736, "grad_norm": 5.875, "grad_norm_var": 1.3825358072916667, "learning_rate": 0.0001, "loss": 5.3809, "loss/crossentropy": 2.896076512336731, "loss/hidden": 4.503125, "loss/incoh": 0.0, "loss/logits": 0.6036079049110412, "loss/reg": 0.0, "step": 700 }, { "epoch": 0.004671052631578947, "grad_norm": 5.875, "grad_norm_var": 0.94888916015625, "learning_rate": 0.0001, "loss": 5.2593, "loss/crossentropy": 2.765268421173096, "loss/hidden": 4.803125, "loss/incoh": 0.0, "loss/logits": 0.7337387800216675, "loss/reg": 0.0, "step": 710 }, { "epoch": 0.004736842105263158, "grad_norm": 4.75, "grad_norm_var": 1.0287760416666667, "learning_rate": 0.0001, "loss": 5.1503, "loss/crossentropy": 2.6812595248222353, "loss/hidden": 4.75625, "loss/incoh": 0.0, "loss/logits": 0.6710720509290695, "loss/reg": 0.0, "step": 720 }, { "epoch": 0.004802631578947368, "grad_norm": 6.0625, "grad_norm_var": 86.4677734375, "learning_rate": 0.0001, "loss": 5.3144, "loss/crossentropy": 2.7298573732376097, "loss/hidden": 4.484375, "loss/incoh": 0.0, "loss/logits": 0.6176893144845963, "loss/reg": 0.0, "step": 730 }, { "epoch": 0.004868421052631579, "grad_norm": 4.875, "grad_norm_var": 85.56676025390625, "learning_rate": 0.0001, "loss": 5.131, "loss/crossentropy": 2.823095703125, "loss/hidden": 4.571875, "loss/incoh": 0.0, "loss/logits": 0.6797973781824111, "loss/reg": 0.0, "step": 740 }, { "epoch": 0.004934210526315789, "grad_norm": 4.96875, "grad_norm_var": 4.8291015625, "learning_rate": 0.0001, "loss": 5.1654, "loss/crossentropy": 2.901310992240906, "loss/hidden": 4.61875, "loss/incoh": 0.0, "loss/logits": 0.737342044711113, "loss/reg": 0.0, "step": 750 }, { "epoch": 0.005, "grad_norm": 9.375, "grad_norm_var": 3.796728515625, "learning_rate": 0.0001, "loss": 5.0448, "loss/crossentropy": 2.5148804664611815, "loss/hidden": 4.48125, "loss/incoh": 0.0, "loss/logits": 0.5650010257959366, "loss/reg": 0.0, "step": 760 }, { "epoch": 0.00506578947368421, "grad_norm": 4.90625, "grad_norm_var": 18.020947265625, "learning_rate": 0.0001, "loss": 5.0271, "loss/crossentropy": 2.6732282817363737, "loss/hidden": 4.48125, "loss/incoh": 0.0, "loss/logits": 0.5763083070516586, "loss/reg": 0.0, "step": 770 }, { "epoch": 0.005131578947368421, "grad_norm": 4.71875, "grad_norm_var": 17.617118326822915, "learning_rate": 0.0001, "loss": 5.0057, "loss/crossentropy": 2.927682900428772, "loss/hidden": 4.578125, "loss/incoh": 0.0, "loss/logits": 0.680091741681099, "loss/reg": 0.0, "step": 780 }, { "epoch": 0.005197368421052632, "grad_norm": 4.53125, "grad_norm_var": 1.5018880208333334, "learning_rate": 0.0001, "loss": 5.0918, "loss/crossentropy": 2.7974375009536745, "loss/hidden": 4.75, "loss/incoh": 0.0, "loss/logits": 0.709694892168045, "loss/reg": 0.0, "step": 790 }, { "epoch": 0.005263157894736842, "grad_norm": 4.46875, "grad_norm_var": 20.911812337239585, "learning_rate": 0.0001, "loss": 4.9367, "loss/crossentropy": 2.875410461425781, "loss/hidden": 4.578125, "loss/incoh": 0.0, "loss/logits": 0.8118050575256348, "loss/reg": 0.0, "step": 800 }, { "epoch": 0.005328947368421053, "grad_norm": 5.375, "grad_norm_var": 2.1762980143229167, "learning_rate": 0.0001, "loss": 4.8483, "loss/crossentropy": 2.665932035446167, "loss/hidden": 4.3140625, "loss/incoh": 0.0, "loss/logits": 0.5815477341413497, "loss/reg": 0.0, "step": 810 }, { "epoch": 0.005394736842105263, "grad_norm": 4.96875, "grad_norm_var": 0.6493235270182292, "learning_rate": 0.0001, "loss": 4.9314, "loss/crossentropy": 2.5820897936820986, "loss/hidden": 4.15, "loss/incoh": 0.0, "loss/logits": 0.4888360023498535, "loss/reg": 0.0, "step": 820 }, { "epoch": 0.005460526315789474, "grad_norm": 3.890625, "grad_norm_var": 0.2476470947265625, "learning_rate": 0.0001, "loss": 4.7859, "loss/crossentropy": 2.647566497325897, "loss/hidden": 4.3484375, "loss/incoh": 0.0, "loss/logits": 0.5885868102312088, "loss/reg": 0.0, "step": 830 }, { "epoch": 0.005526315789473684, "grad_norm": 4.9375, "grad_norm_var": 0.29011128743489584, "learning_rate": 0.0001, "loss": 4.7829, "loss/crossentropy": 2.583667039871216, "loss/hidden": 4.2828125, "loss/incoh": 0.0, "loss/logits": 0.4921345829963684, "loss/reg": 0.0, "step": 840 }, { "epoch": 0.005592105263157895, "grad_norm": 4.6875, "grad_norm_var": 0.397900390625, "learning_rate": 0.0001, "loss": 4.8821, "loss/crossentropy": 2.798052990436554, "loss/hidden": 4.2625, "loss/incoh": 0.0, "loss/logits": 0.7617209196090698, "loss/reg": 0.0, "step": 850 }, { "epoch": 0.005657894736842106, "grad_norm": 4.21875, "grad_norm_var": 0.29501546223958336, "learning_rate": 0.0001, "loss": 4.9069, "loss/crossentropy": 2.8075502276420594, "loss/hidden": 4.1328125, "loss/incoh": 0.0, "loss/logits": 0.5111032873392105, "loss/reg": 0.0, "step": 860 }, { "epoch": 0.005723684210526316, "grad_norm": 4.21875, "grad_norm_var": 0.24931233723958332, "learning_rate": 0.0001, "loss": 4.8049, "loss/crossentropy": 2.400660240650177, "loss/hidden": 4.3859375, "loss/incoh": 0.0, "loss/logits": 0.6145752131938934, "loss/reg": 0.0, "step": 870 }, { "epoch": 0.005789473684210527, "grad_norm": 4.75, "grad_norm_var": 1.2445271809895833, "learning_rate": 0.0001, "loss": 4.7333, "loss/crossentropy": 2.631825530529022, "loss/hidden": 4.3640625, "loss/incoh": 0.0, "loss/logits": 0.5737248510122299, "loss/reg": 0.0, "step": 880 }, { "epoch": 0.005855263157894737, "grad_norm": 4.90625, "grad_norm_var": 1.9954742431640624, "learning_rate": 0.0001, "loss": 4.6882, "loss/crossentropy": 2.9550926446914674, "loss/hidden": 4.1046875, "loss/incoh": 0.0, "loss/logits": 0.5351347416639328, "loss/reg": 0.0, "step": 890 }, { "epoch": 0.0059210526315789476, "grad_norm": 4.15625, "grad_norm_var": 0.8051991780598958, "learning_rate": 0.0001, "loss": 4.7267, "loss/crossentropy": 2.322158467769623, "loss/hidden": 4.2359375, "loss/incoh": 0.0, "loss/logits": 0.4901482403278351, "loss/reg": 0.0, "step": 900 }, { "epoch": 0.005986842105263158, "grad_norm": 4.59375, "grad_norm_var": 0.5987589518229167, "learning_rate": 0.0001, "loss": 4.7552, "loss/crossentropy": 2.163175904750824, "loss/hidden": 4.475, "loss/incoh": 0.0, "loss/logits": 0.5075026541948319, "loss/reg": 0.0, "step": 910 }, { "epoch": 0.0060526315789473685, "grad_norm": 4.4375, "grad_norm_var": 13.234305826822917, "learning_rate": 0.0001, "loss": 4.8996, "loss/crossentropy": 2.5448178887367248, "loss/hidden": 4.078125, "loss/incoh": 0.0, "loss/logits": 0.5253018319606781, "loss/reg": 0.0, "step": 920 }, { "epoch": 0.0061184210526315785, "grad_norm": 4.09375, "grad_norm_var": 4.241829427083333, "learning_rate": 0.0001, "loss": 4.5704, "loss/crossentropy": 2.551811099052429, "loss/hidden": 4.0546875, "loss/incoh": 0.0, "loss/logits": 0.48250589668750765, "loss/reg": 0.0, "step": 930 }, { "epoch": 0.0061842105263157894, "grad_norm": 4.71875, "grad_norm_var": 0.13075764973958334, "learning_rate": 0.0001, "loss": 4.6287, "loss/crossentropy": 2.8518677711486817, "loss/hidden": 4.134375, "loss/incoh": 0.0, "loss/logits": 0.5851545244455337, "loss/reg": 0.0, "step": 940 }, { "epoch": 0.00625, "grad_norm": 5.65625, "grad_norm_var": 1.0635701497395833, "learning_rate": 0.0001, "loss": 4.6855, "loss/crossentropy": 2.8685452222824095, "loss/hidden": 3.9453125, "loss/incoh": 0.0, "loss/logits": 0.5299171417951584, "loss/reg": 0.0, "step": 950 }, { "epoch": 0.00631578947368421, "grad_norm": 4.4375, "grad_norm_var": 0.2560506184895833, "learning_rate": 0.0001, "loss": 4.6193, "loss/crossentropy": 2.73275808095932, "loss/hidden": 4.121875, "loss/incoh": 0.0, "loss/logits": 0.4992497324943542, "loss/reg": 0.0, "step": 960 }, { "epoch": 0.006381578947368421, "grad_norm": 4.0625, "grad_norm_var": 0.3540598551432292, "learning_rate": 0.0001, "loss": 4.6436, "loss/crossentropy": 2.6122008085250856, "loss/hidden": 4.084375, "loss/incoh": 0.0, "loss/logits": 0.5424144893884659, "loss/reg": 0.0, "step": 970 }, { "epoch": 0.006447368421052631, "grad_norm": 4.0, "grad_norm_var": 1.2561260533134024e+17, "learning_rate": 0.0001, "loss": 4.7384, "loss/crossentropy": 2.747082471847534, "loss/hidden": 3.975, "loss/incoh": 0.0, "loss/logits": 0.4949415147304535, "loss/reg": 0.0, "step": 980 }, { "epoch": 0.006513157894736842, "grad_norm": 4.6875, "grad_norm_var": 15.184098307291666, "learning_rate": 0.0001, "loss": 4.6154, "loss/crossentropy": 2.6343679666519164, "loss/hidden": 4.009375, "loss/incoh": 0.0, "loss/logits": 0.46223918795585633, "loss/reg": 0.0, "step": 990 }, { "epoch": 0.006578947368421052, "grad_norm": 4.84375, "grad_norm_var": 3.58121337890625, "learning_rate": 0.0001, "loss": 4.6456, "loss/crossentropy": 2.618730914592743, "loss/hidden": 4.1390625, "loss/incoh": 0.0, "loss/logits": 0.568024319410324, "loss/reg": 0.0, "step": 1000 }, { "epoch": 0.006644736842105263, "grad_norm": 10.1875, "grad_norm_var": 2.6229563395182294, "learning_rate": 0.0001, "loss": 4.5241, "loss/crossentropy": 2.7510436296463014, "loss/hidden": 3.9421875, "loss/incoh": 0.0, "loss/logits": 0.4975563734769821, "loss/reg": 0.0, "step": 1010 }, { "epoch": 0.006710526315789474, "grad_norm": 4.59375, "grad_norm_var": 9.821955362955729, "learning_rate": 0.0001, "loss": 4.5805, "loss/crossentropy": 2.773310422897339, "loss/hidden": 3.94375, "loss/incoh": 0.0, "loss/logits": 0.5206439226865769, "loss/reg": 0.0, "step": 1020 }, { "epoch": 0.006776315789473684, "grad_norm": 4.34375, "grad_norm_var": 8.482259114583334, "learning_rate": 0.0001, "loss": 4.4588, "loss/crossentropy": 2.6046599745750427, "loss/hidden": 3.88125, "loss/incoh": 0.0, "loss/logits": 0.44563083052635194, "loss/reg": 0.0, "step": 1030 }, { "epoch": 0.006842105263157895, "grad_norm": 4.0, "grad_norm_var": 5.523110961914062, "learning_rate": 0.0001, "loss": 4.6377, "loss/crossentropy": 2.6867428183555604, "loss/hidden": 4.0921875, "loss/incoh": 0.0, "loss/logits": 0.4769616901874542, "loss/reg": 0.0, "step": 1040 }, { "epoch": 0.006907894736842105, "grad_norm": 3.890625, "grad_norm_var": 9.615762329101562, "learning_rate": 0.0001, "loss": 4.5576, "loss/crossentropy": 2.6374024391174316, "loss/hidden": 3.915625, "loss/incoh": 0.0, "loss/logits": 0.5235758543014526, "loss/reg": 0.0, "step": 1050 }, { "epoch": 0.006973684210526316, "grad_norm": 3.796875, "grad_norm_var": 4.420035807291667, "learning_rate": 0.0001, "loss": 4.5061, "loss/crossentropy": 2.409494662284851, "loss/hidden": 4.0203125, "loss/incoh": 0.0, "loss/logits": 0.5403494209051132, "loss/reg": 0.0, "step": 1060 }, { "epoch": 0.007039473684210526, "grad_norm": 4.25, "grad_norm_var": 12.098844401041667, "learning_rate": 0.0001, "loss": 4.7608, "loss/crossentropy": 2.6496052145957947, "loss/hidden": 3.878125, "loss/incoh": 0.0, "loss/logits": 0.48798912912607195, "loss/reg": 0.0, "step": 1070 }, { "epoch": 0.007105263157894737, "grad_norm": 3.875, "grad_norm_var": 9.539867146809895, "learning_rate": 0.0001, "loss": 4.3815, "loss/crossentropy": 2.757003378868103, "loss/hidden": 4.11875, "loss/incoh": 0.0, "loss/logits": 0.4590116262435913, "loss/reg": 0.0, "step": 1080 }, { "epoch": 0.007171052631578947, "grad_norm": 8.5625, "grad_norm_var": 1.5923166910807292, "learning_rate": 0.0001, "loss": 4.44, "loss/crossentropy": 2.550069880485535, "loss/hidden": 4.071875, "loss/incoh": 0.0, "loss/logits": 0.5490242928266525, "loss/reg": 0.0, "step": 1090 }, { "epoch": 0.007236842105263158, "grad_norm": 3.96875, "grad_norm_var": 1.6922597249348958, "learning_rate": 0.0001, "loss": 4.4725, "loss/crossentropy": 2.606226110458374, "loss/hidden": 4.0875, "loss/incoh": 0.0, "loss/logits": 0.5686961978673934, "loss/reg": 0.0, "step": 1100 }, { "epoch": 0.007302631578947369, "grad_norm": 4.90625, "grad_norm_var": 0.9616282145182292, "learning_rate": 0.0001, "loss": 4.4416, "loss/crossentropy": 2.742388653755188, "loss/hidden": 3.8171875, "loss/incoh": 0.0, "loss/logits": 0.491252401471138, "loss/reg": 0.0, "step": 1110 }, { "epoch": 0.007368421052631579, "grad_norm": 4.375, "grad_norm_var": 0.8876302083333333, "learning_rate": 0.0001, "loss": 4.4119, "loss/crossentropy": 2.858240842819214, "loss/hidden": 3.796875, "loss/incoh": 0.0, "loss/logits": 0.4695854902267456, "loss/reg": 0.0, "step": 1120 }, { "epoch": 0.00743421052631579, "grad_norm": 3.453125, "grad_norm_var": 0.8083485921223958, "learning_rate": 0.0001, "loss": 4.3524, "loss/crossentropy": 2.7758461236953735, "loss/hidden": 3.90625, "loss/incoh": 0.0, "loss/logits": 0.5061279594898224, "loss/reg": 0.0, "step": 1130 }, { "epoch": 0.0075, "grad_norm": 3.78125, "grad_norm_var": 0.8257802327473959, "learning_rate": 0.0001, "loss": 4.296, "loss/crossentropy": 2.849539041519165, "loss/hidden": 3.7125, "loss/incoh": 0.0, "loss/logits": 0.45034482181072233, "loss/reg": 0.0, "step": 1140 }, { "epoch": 0.007565789473684211, "grad_norm": 4.0625, "grad_norm_var": 0.22603759765625, "learning_rate": 0.0001, "loss": 4.259, "loss/crossentropy": 2.6673625230789186, "loss/hidden": 3.903125, "loss/incoh": 0.0, "loss/logits": 0.45979970395565034, "loss/reg": 0.0, "step": 1150 }, { "epoch": 0.007631578947368421, "grad_norm": 3.984375, "grad_norm_var": 0.2490631103515625, "learning_rate": 0.0001, "loss": 4.2786, "loss/crossentropy": 2.6413369178771973, "loss/hidden": 3.7734375, "loss/incoh": 0.0, "loss/logits": 0.45444311797618864, "loss/reg": 0.0, "step": 1160 }, { "epoch": 0.007697368421052632, "grad_norm": 6.1875, "grad_norm_var": 0.7844960530598958, "learning_rate": 0.0001, "loss": 4.2937, "loss/crossentropy": 2.54203085899353, "loss/hidden": 3.6671875, "loss/incoh": 0.0, "loss/logits": 0.40531369894742963, "loss/reg": 0.0, "step": 1170 }, { "epoch": 0.0077631578947368425, "grad_norm": 3.65625, "grad_norm_var": 0.444384765625, "learning_rate": 0.0001, "loss": 4.3125, "loss/crossentropy": 2.772641682624817, "loss/hidden": 3.8296875, "loss/incoh": 0.0, "loss/logits": 0.4890771210193634, "loss/reg": 0.0, "step": 1180 }, { "epoch": 0.007828947368421053, "grad_norm": 4.15625, "grad_norm_var": 0.26546122233072916, "learning_rate": 0.0001, "loss": 4.3654, "loss/crossentropy": 2.661901044845581, "loss/hidden": 3.7015625, "loss/incoh": 0.0, "loss/logits": 0.439369834959507, "loss/reg": 0.0, "step": 1190 }, { "epoch": 0.007894736842105263, "grad_norm": 3.75, "grad_norm_var": 0.25321858723958335, "learning_rate": 0.0001, "loss": 4.2648, "loss/crossentropy": 2.3847479939460756, "loss/hidden": 4.028125, "loss/incoh": 0.0, "loss/logits": 0.4770447015762329, "loss/reg": 0.0, "step": 1200 }, { "epoch": 0.007960526315789473, "grad_norm": 3.921875, "grad_norm_var": 0.2809529622395833, "learning_rate": 0.0001, "loss": 4.3473, "loss/crossentropy": 2.244320285320282, "loss/hidden": 3.8109375, "loss/incoh": 0.0, "loss/logits": 0.40470985919237135, "loss/reg": 0.0, "step": 1210 }, { "epoch": 0.008026315789473683, "grad_norm": 4.28125, "grad_norm_var": 0.5173004150390625, "learning_rate": 0.0001, "loss": 4.3494, "loss/crossentropy": 2.5515334010124207, "loss/hidden": 4.109375, "loss/incoh": 0.0, "loss/logits": 0.5172385692596435, "loss/reg": 0.0, "step": 1220 }, { "epoch": 0.008092105263157895, "grad_norm": 3.5, "grad_norm_var": 0.2546539306640625, "learning_rate": 0.0001, "loss": 4.2293, "loss/crossentropy": 2.5860470652580263, "loss/hidden": 3.7171875, "loss/incoh": 0.0, "loss/logits": 0.4412991553544998, "loss/reg": 0.0, "step": 1230 }, { "epoch": 0.008157894736842105, "grad_norm": 3.1875, "grad_norm_var": 1.7592185950961664e+17, "learning_rate": 0.0001, "loss": 4.4752, "loss/crossentropy": 2.823768949508667, "loss/hidden": 3.703125, "loss/incoh": 0.0, "loss/logits": 0.5228973954916001, "loss/reg": 0.0, "step": 1240 }, { "epoch": 0.008223684210526315, "grad_norm": 3.21875, "grad_norm_var": 1.966552734375, "learning_rate": 0.0001, "loss": 4.2687, "loss/crossentropy": 2.523781180381775, "loss/hidden": 3.825, "loss/incoh": 0.0, "loss/logits": 0.4939111739397049, "loss/reg": 0.0, "step": 1250 }, { "epoch": 0.008289473684210527, "grad_norm": 2.796875, "grad_norm_var": 1.2621897379557292, "learning_rate": 0.0001, "loss": 4.1499, "loss/crossentropy": 2.5173804640769957, "loss/hidden": 3.7328125, "loss/incoh": 0.0, "loss/logits": 0.44251940250396726, "loss/reg": 0.0, "step": 1260 }, { "epoch": 0.008355263157894737, "grad_norm": 4.15625, "grad_norm_var": 3.665185546875, "learning_rate": 0.0001, "loss": 4.3378, "loss/crossentropy": 2.551619827747345, "loss/hidden": 3.9625, "loss/incoh": 0.0, "loss/logits": 0.5328098922967911, "loss/reg": 0.0, "step": 1270 }, { "epoch": 0.008421052631578947, "grad_norm": 3.53125, "grad_norm_var": 3.1853352864583333, "learning_rate": 0.0001, "loss": 4.2329, "loss/crossentropy": 2.3984143674373626, "loss/hidden": 3.9703125, "loss/incoh": 0.0, "loss/logits": 0.43639505505561826, "loss/reg": 0.0, "step": 1280 }, { "epoch": 0.008486842105263157, "grad_norm": 4.21875, "grad_norm_var": 0.21245015462239583, "learning_rate": 0.0001, "loss": 4.3604, "loss/crossentropy": 2.8736027479171753, "loss/hidden": 3.7875, "loss/incoh": 0.0, "loss/logits": 0.48700871765613557, "loss/reg": 0.0, "step": 1290 }, { "epoch": 0.008552631578947369, "grad_norm": 4.125, "grad_norm_var": 0.24678446451822916, "learning_rate": 0.0001, "loss": 4.1666, "loss/crossentropy": 2.714110541343689, "loss/hidden": 3.75625, "loss/incoh": 0.0, "loss/logits": 0.47638387978076935, "loss/reg": 0.0, "step": 1300 }, { "epoch": 0.008618421052631579, "grad_norm": 5.03125, "grad_norm_var": 0.36387430826822914, "learning_rate": 0.0001, "loss": 4.2879, "loss/crossentropy": 2.6876192927360534, "loss/hidden": 3.7, "loss/incoh": 0.0, "loss/logits": 0.49314437210559847, "loss/reg": 0.0, "step": 1310 }, { "epoch": 0.008684210526315789, "grad_norm": 3.84375, "grad_norm_var": 0.28189188639322915, "learning_rate": 0.0001, "loss": 4.1505, "loss/crossentropy": 2.4753618359565737, "loss/hidden": 3.684375, "loss/incoh": 0.0, "loss/logits": 0.40235219299793246, "loss/reg": 0.0, "step": 1320 }, { "epoch": 0.00875, "grad_norm": 3.1875, "grad_norm_var": 2.41715087890625, "learning_rate": 0.0001, "loss": 4.2095, "loss/crossentropy": 2.515524423122406, "loss/hidden": 3.996875, "loss/incoh": 0.0, "loss/logits": 0.5540182292461395, "loss/reg": 0.0, "step": 1330 }, { "epoch": 0.008815789473684211, "grad_norm": 3.8125, "grad_norm_var": 2.259993489583333, "learning_rate": 0.0001, "loss": 4.1889, "loss/crossentropy": 2.346592426300049, "loss/hidden": 3.7703125, "loss/incoh": 0.0, "loss/logits": 0.4610589429736137, "loss/reg": 0.0, "step": 1340 }, { "epoch": 0.008881578947368421, "grad_norm": 4.0625, "grad_norm_var": 0.13492431640625, "learning_rate": 0.0001, "loss": 4.1587, "loss/crossentropy": 2.600476896762848, "loss/hidden": 3.746875, "loss/incoh": 0.0, "loss/logits": 0.4497509777545929, "loss/reg": 0.0, "step": 1350 }, { "epoch": 0.008947368421052631, "grad_norm": 3.390625, "grad_norm_var": 0.7162017822265625, "learning_rate": 0.0001, "loss": 4.0563, "loss/crossentropy": 2.6253500103950502, "loss/hidden": 3.5359375, "loss/incoh": 0.0, "loss/logits": 0.4007237285375595, "loss/reg": 0.0, "step": 1360 }, { "epoch": 0.009013157894736843, "grad_norm": 3.828125, "grad_norm_var": 0.7642812093098958, "learning_rate": 0.0001, "loss": 4.1053, "loss/crossentropy": 2.4306472063064577, "loss/hidden": 3.596875, "loss/incoh": 0.0, "loss/logits": 0.42645111978054046, "loss/reg": 0.0, "step": 1370 }, { "epoch": 0.009078947368421053, "grad_norm": 5.65625, "grad_norm_var": 0.3096995035807292, "learning_rate": 0.0001, "loss": 4.2093, "loss/crossentropy": 2.4861367106437684, "loss/hidden": 3.9359375, "loss/incoh": 0.0, "loss/logits": 0.46326183080673217, "loss/reg": 0.0, "step": 1380 }, { "epoch": 0.009144736842105263, "grad_norm": 3.265625, "grad_norm_var": 0.36946207682291665, "learning_rate": 0.0001, "loss": 4.0653, "loss/crossentropy": 2.608224070072174, "loss/hidden": 3.5140625, "loss/incoh": 0.0, "loss/logits": 0.3894644558429718, "loss/reg": 0.0, "step": 1390 }, { "epoch": 0.009210526315789473, "grad_norm": 3.21875, "grad_norm_var": 0.30060221354166666, "learning_rate": 0.0001, "loss": 4.1547, "loss/crossentropy": 2.337048816680908, "loss/hidden": 3.6046875, "loss/incoh": 0.0, "loss/logits": 0.39037723541259767, "loss/reg": 0.0, "step": 1400 }, { "epoch": 0.009276315789473685, "grad_norm": 3.625, "grad_norm_var": 0.14612630208333333, "learning_rate": 0.0001, "loss": 4.076, "loss/crossentropy": 2.726799726486206, "loss/hidden": 3.5515625, "loss/incoh": 0.0, "loss/logits": 0.4537044405937195, "loss/reg": 0.0, "step": 1410 }, { "epoch": 0.009342105263157895, "grad_norm": 3.875, "grad_norm_var": 1.9072662353515626, "learning_rate": 0.0001, "loss": 4.2387, "loss/crossentropy": 2.5365243434906004, "loss/hidden": 3.503125, "loss/incoh": 0.0, "loss/logits": 0.37743023335933684, "loss/reg": 0.0, "step": 1420 }, { "epoch": 0.009407894736842105, "grad_norm": 4.1875, "grad_norm_var": 0.15405171712239582, "learning_rate": 0.0001, "loss": 4.0913, "loss/crossentropy": 2.6032602190971375, "loss/hidden": 3.6015625, "loss/incoh": 0.0, "loss/logits": 0.4031028777360916, "loss/reg": 0.0, "step": 1430 }, { "epoch": 0.009473684210526316, "grad_norm": 3.453125, "grad_norm_var": 0.35299072265625, "learning_rate": 0.0001, "loss": 4.0676, "loss/crossentropy": 2.279827582836151, "loss/hidden": 3.671875, "loss/incoh": 0.0, "loss/logits": 0.38540517538785934, "loss/reg": 0.0, "step": 1440 }, { "epoch": 0.009539473684210526, "grad_norm": 3.703125, "grad_norm_var": 0.44612223307291665, "learning_rate": 0.0001, "loss": 4.0791, "loss/crossentropy": 2.4795989274978636, "loss/hidden": 3.7, "loss/incoh": 0.0, "loss/logits": 0.40130155086517333, "loss/reg": 0.0, "step": 1450 }, { "epoch": 0.009605263157894737, "grad_norm": 3.515625, "grad_norm_var": 0.43585611979166666, "learning_rate": 0.0001, "loss": 4.0499, "loss/crossentropy": 2.337530755996704, "loss/hidden": 3.4703125, "loss/incoh": 0.0, "loss/logits": 0.3699365258216858, "loss/reg": 0.0, "step": 1460 }, { "epoch": 0.009671052631578947, "grad_norm": 4.375, "grad_norm_var": 2.1455393473307294, "learning_rate": 0.0001, "loss": 4.0616, "loss/crossentropy": 2.246569663286209, "loss/hidden": 3.54375, "loss/incoh": 0.0, "loss/logits": 0.37747917622327803, "loss/reg": 0.0, "step": 1470 }, { "epoch": 0.009736842105263158, "grad_norm": 4.34375, "grad_norm_var": 1.3469017374654464e+17, "learning_rate": 0.0001, "loss": 4.1919, "loss/crossentropy": 2.465463387966156, "loss/hidden": 3.5796875, "loss/incoh": 0.0, "loss/logits": 0.409694692492485, "loss/reg": 0.0, "step": 1480 }, { "epoch": 0.009802631578947368, "grad_norm": 3.8125, "grad_norm_var": 2.647081560180774e+17, "learning_rate": 0.0001, "loss": 4.2391, "loss/crossentropy": 2.614275646209717, "loss/hidden": 3.86875, "loss/incoh": 0.0, "loss/logits": 0.4961456567049026, "loss/reg": 0.0, "step": 1490 }, { "epoch": 0.009868421052631578, "grad_norm": 3.59375, "grad_norm_var": 3.232743326822917, "learning_rate": 0.0001, "loss": 4.2432, "loss/crossentropy": 2.9305615186691285, "loss/hidden": 3.8921875, "loss/incoh": 0.0, "loss/logits": 0.6574043720960617, "loss/reg": 0.0, "step": 1500 }, { "epoch": 0.00993421052631579, "grad_norm": 3.34375, "grad_norm_var": 3.299430338541667, "learning_rate": 0.0001, "loss": 4.066, "loss/crossentropy": 2.3778577923774717, "loss/hidden": 3.6515625, "loss/incoh": 0.0, "loss/logits": 0.3975479930639267, "loss/reg": 0.0, "step": 1510 }, { "epoch": 0.01, "grad_norm": 3.515625, "grad_norm_var": 18.948729451497396, "learning_rate": 0.0001, "loss": 4.1724, "loss/crossentropy": 2.527243709564209, "loss/hidden": 3.596875, "loss/incoh": 0.0, "loss/logits": 0.41472980976104734, "loss/reg": 0.0, "step": 1520 }, { "epoch": 0.01006578947368421, "grad_norm": 3.03125, "grad_norm_var": 18.10924072265625, "learning_rate": 0.0001, "loss": 4.0302, "loss/crossentropy": 2.7474317073822023, "loss/hidden": 3.690625, "loss/incoh": 0.0, "loss/logits": 0.47478381991386415, "loss/reg": 0.0, "step": 1530 }, { "epoch": 0.01013157894736842, "grad_norm": 4.4375, "grad_norm_var": 0.45779520670572915, "learning_rate": 0.0001, "loss": 4.0726, "loss/crossentropy": 2.4822750091552734, "loss/hidden": 3.48125, "loss/incoh": 0.0, "loss/logits": 0.39128718376159666, "loss/reg": 0.0, "step": 1540 }, { "epoch": 0.010197368421052632, "grad_norm": 3.5625, "grad_norm_var": 0.21116129557291666, "learning_rate": 0.0001, "loss": 4.1353, "loss/crossentropy": 2.480714201927185, "loss/hidden": 3.9203125, "loss/incoh": 0.0, "loss/logits": 0.4798941880464554, "loss/reg": 0.0, "step": 1550 }, { "epoch": 0.010263157894736842, "grad_norm": 3.78125, "grad_norm_var": 0.21678059895833332, "learning_rate": 0.0001, "loss": 3.993, "loss/crossentropy": 2.536018407344818, "loss/hidden": 3.53125, "loss/incoh": 0.0, "loss/logits": 0.3731235474348068, "loss/reg": 0.0, "step": 1560 }, { "epoch": 0.010328947368421052, "grad_norm": 3.953125, "grad_norm_var": 0.17219645182291668, "learning_rate": 0.0001, "loss": 4.0545, "loss/crossentropy": 2.4370301008224486, "loss/hidden": 3.5640625, "loss/incoh": 0.0, "loss/logits": 0.4435511589050293, "loss/reg": 0.0, "step": 1570 }, { "epoch": 0.010394736842105264, "grad_norm": 3.140625, "grad_norm_var": 0.5761027018229167, "learning_rate": 0.0001, "loss": 4.0907, "loss/crossentropy": 2.576425087451935, "loss/hidden": 3.6484375, "loss/incoh": 0.0, "loss/logits": 0.43103125393390657, "loss/reg": 0.0, "step": 1580 }, { "epoch": 0.010460526315789474, "grad_norm": 3.34375, "grad_norm_var": 0.15403544108072917, "learning_rate": 0.0001, "loss": 3.8713, "loss/crossentropy": 2.2120620369911195, "loss/hidden": 3.796875, "loss/incoh": 0.0, "loss/logits": 0.43894679844379425, "loss/reg": 0.0, "step": 1590 }, { "epoch": 0.010526315789473684, "grad_norm": 2.890625, "grad_norm_var": 0.09127197265625, "learning_rate": 0.0001, "loss": 3.9719, "loss/crossentropy": 2.5636333346366884, "loss/hidden": 3.575, "loss/incoh": 0.0, "loss/logits": 0.4157550185918808, "loss/reg": 0.0, "step": 1600 }, { "epoch": 0.010592105263157894, "grad_norm": 3.5, "grad_norm_var": 2.413444010416667, "learning_rate": 0.0001, "loss": 4.153, "loss/crossentropy": 2.6213369131088258, "loss/hidden": 3.6171875, "loss/incoh": 0.0, "loss/logits": 0.45062357783317564, "loss/reg": 0.0, "step": 1610 }, { "epoch": 0.010657894736842106, "grad_norm": 9.5, "grad_norm_var": 4.2955881754557295, "learning_rate": 0.0001, "loss": 4.0295, "loss/crossentropy": 2.626167094707489, "loss/hidden": 3.5765625, "loss/incoh": 0.0, "loss/logits": 0.44043630063533784, "loss/reg": 0.0, "step": 1620 }, { "epoch": 0.010723684210526316, "grad_norm": 3.6875, "grad_norm_var": 2.3217437744140623, "learning_rate": 0.0001, "loss": 3.937, "loss/crossentropy": 2.6269264578819276, "loss/hidden": 3.45625, "loss/incoh": 0.0, "loss/logits": 0.48427494168281554, "loss/reg": 0.0, "step": 1630 }, { "epoch": 0.010789473684210526, "grad_norm": 3.078125, "grad_norm_var": 0.12195638020833334, "learning_rate": 0.0001, "loss": 3.9365, "loss/crossentropy": 2.6117714166641237, "loss/hidden": 3.4359375, "loss/incoh": 0.0, "loss/logits": 0.4069202274084091, "loss/reg": 0.0, "step": 1640 }, { "epoch": 0.010855263157894738, "grad_norm": 3.3125, "grad_norm_var": 0.25017903645833334, "learning_rate": 0.0001, "loss": 3.9318, "loss/crossentropy": 2.6508745312690736, "loss/hidden": 3.53125, "loss/incoh": 0.0, "loss/logits": 0.4166009187698364, "loss/reg": 0.0, "step": 1650 }, { "epoch": 0.010921052631578948, "grad_norm": 4.21875, "grad_norm_var": 0.1591217041015625, "learning_rate": 0.0001, "loss": 3.8964, "loss/crossentropy": 2.4683817744255068, "loss/hidden": 3.4421875, "loss/incoh": 0.0, "loss/logits": 0.37454236298799515, "loss/reg": 0.0, "step": 1660 }, { "epoch": 0.010986842105263158, "grad_norm": 3.28125, "grad_norm_var": 0.12337239583333333, "learning_rate": 0.0001, "loss": 4.0102, "loss/crossentropy": 2.4436564683914184, "loss/hidden": 3.3390625, "loss/incoh": 0.0, "loss/logits": 0.36218023002147676, "loss/reg": 0.0, "step": 1670 }, { "epoch": 0.011052631578947368, "grad_norm": 3.15625, "grad_norm_var": 0.08958231608072917, "learning_rate": 0.0001, "loss": 3.9784, "loss/crossentropy": 2.559529435634613, "loss/hidden": 3.703125, "loss/incoh": 0.0, "loss/logits": 0.46595812439918516, "loss/reg": 0.0, "step": 1680 }, { "epoch": 0.01111842105263158, "grad_norm": 2.953125, "grad_norm_var": 0.11448160807291667, "learning_rate": 0.0001, "loss": 3.9419, "loss/crossentropy": 2.4433623433113096, "loss/hidden": 3.453125, "loss/incoh": 0.0, "loss/logits": 0.41454428136348725, "loss/reg": 0.0, "step": 1690 }, { "epoch": 0.01118421052631579, "grad_norm": 4.0625, "grad_norm_var": 0.11336161295572916, "learning_rate": 0.0001, "loss": 3.8878, "loss/crossentropy": 2.561389684677124, "loss/hidden": 3.4640625, "loss/incoh": 0.0, "loss/logits": 0.3884895950555801, "loss/reg": 0.0, "step": 1700 }, { "epoch": 0.01125, "grad_norm": 3.296875, "grad_norm_var": 2.4417928059895835, "learning_rate": 0.0001, "loss": 3.9996, "loss/crossentropy": 2.709191393852234, "loss/hidden": 3.5515625, "loss/incoh": 0.0, "loss/logits": 0.3691831022500992, "loss/reg": 0.0, "step": 1710 }, { "epoch": 0.011315789473684211, "grad_norm": 3.3125, "grad_norm_var": 0.5698232014973958, "learning_rate": 0.0001, "loss": 4.0535, "loss/crossentropy": 2.4276717066764832, "loss/hidden": 3.7375, "loss/incoh": 0.0, "loss/logits": 0.4981458902359009, "loss/reg": 0.0, "step": 1720 }, { "epoch": 0.011381578947368421, "grad_norm": 3.71875, "grad_norm_var": 0.4803059895833333, "learning_rate": 0.0001, "loss": 4.0546, "loss/crossentropy": 2.672085237503052, "loss/hidden": 3.465625, "loss/incoh": 0.0, "loss/logits": 0.41347330510616304, "loss/reg": 0.0, "step": 1730 }, { "epoch": 0.011447368421052631, "grad_norm": 6.59375, "grad_norm_var": 0.7734700520833333, "learning_rate": 0.0001, "loss": 4.0697, "loss/crossentropy": 2.625990152359009, "loss/hidden": 3.60625, "loss/incoh": 0.0, "loss/logits": 0.45684492886066436, "loss/reg": 0.0, "step": 1740 }, { "epoch": 0.011513157894736841, "grad_norm": 2.8125, "grad_norm_var": 2.146930948893229, "learning_rate": 0.0001, "loss": 4.1129, "loss/crossentropy": 2.682978630065918, "loss/hidden": 3.5703125, "loss/incoh": 0.0, "loss/logits": 0.4306509166955948, "loss/reg": 0.0, "step": 1750 }, { "epoch": 0.011578947368421053, "grad_norm": 3.203125, "grad_norm_var": 0.1972076416015625, "learning_rate": 0.0001, "loss": 3.8797, "loss/crossentropy": 2.7138221740722654, "loss/hidden": 3.60625, "loss/incoh": 0.0, "loss/logits": 0.46029032766819, "loss/reg": 0.0, "step": 1760 }, { "epoch": 0.011644736842105263, "grad_norm": 3.03125, "grad_norm_var": 15.614777628580729, "learning_rate": 0.0001, "loss": 4.0409, "loss/crossentropy": 2.1841426372528074, "loss/hidden": 3.3828125, "loss/incoh": 0.0, "loss/logits": 0.35761781185865404, "loss/reg": 0.0, "step": 1770 }, { "epoch": 0.011710526315789473, "grad_norm": 3.953125, "grad_norm_var": 0.32083231608072915, "learning_rate": 0.0001, "loss": 4.0523, "loss/crossentropy": 2.404168051481247, "loss/hidden": 3.325, "loss/incoh": 0.0, "loss/logits": 0.34855909645557404, "loss/reg": 0.0, "step": 1780 }, { "epoch": 0.011776315789473683, "grad_norm": 3.5, "grad_norm_var": 3.124772135416667, "learning_rate": 0.0001, "loss": 3.9895, "loss/crossentropy": 2.370392310619354, "loss/hidden": 3.6515625, "loss/incoh": 0.0, "loss/logits": 0.4228974744677544, "loss/reg": 0.0, "step": 1790 }, { "epoch": 0.011842105263157895, "grad_norm": 6.125, "grad_norm_var": 3.445572916666667, "learning_rate": 0.0001, "loss": 4.0349, "loss/crossentropy": 2.6190654158592226, "loss/hidden": 3.4578125, "loss/incoh": 0.0, "loss/logits": 0.406630203127861, "loss/reg": 0.0, "step": 1800 }, { "epoch": 0.011907894736842105, "grad_norm": 3.28125, "grad_norm_var": 0.575048828125, "learning_rate": 0.0001, "loss": 3.8828, "loss/crossentropy": 2.505708968639374, "loss/hidden": 3.4203125, "loss/incoh": 0.0, "loss/logits": 0.3773295432329178, "loss/reg": 0.0, "step": 1810 }, { "epoch": 0.011973684210526315, "grad_norm": 3.734375, "grad_norm_var": 14.053641764322917, "learning_rate": 0.0001, "loss": 3.9952, "loss/crossentropy": 2.8797521352767945, "loss/hidden": 3.6734375, "loss/incoh": 0.0, "loss/logits": 0.7419060736894607, "loss/reg": 0.0, "step": 1820 }, { "epoch": 0.012039473684210527, "grad_norm": 3.96875, "grad_norm_var": 2.006696573893229, "learning_rate": 0.0001, "loss": 3.87, "loss/crossentropy": 2.792651188373566, "loss/hidden": 3.6546875, "loss/incoh": 0.0, "loss/logits": 0.7776896879076958, "loss/reg": 0.0, "step": 1830 }, { "epoch": 0.012105263157894737, "grad_norm": 4.09375, "grad_norm_var": 3.2296132405598956, "learning_rate": 0.0001, "loss": 3.8688, "loss/crossentropy": 2.4713597655296327, "loss/hidden": 3.4484375, "loss/incoh": 0.0, "loss/logits": 0.3962660849094391, "loss/reg": 0.0, "step": 1840 }, { "epoch": 0.012171052631578947, "grad_norm": 3.296875, "grad_norm_var": 0.20377197265625, "learning_rate": 0.0001, "loss": 3.9721, "loss/crossentropy": 2.225203812122345, "loss/hidden": 3.4734375, "loss/incoh": 0.0, "loss/logits": 0.3860040009021759, "loss/reg": 0.0, "step": 1850 }, { "epoch": 0.012236842105263157, "grad_norm": 4.375, "grad_norm_var": 14.807112630208334, "learning_rate": 0.0001, "loss": 4.0342, "loss/crossentropy": 2.405521821975708, "loss/hidden": 3.428125, "loss/incoh": 0.0, "loss/logits": 0.40949456989765165, "loss/reg": 0.0, "step": 1860 }, { "epoch": 0.012302631578947369, "grad_norm": 4.53125, "grad_norm_var": 6.056012980143229, "learning_rate": 0.0001, "loss": 3.9802, "loss/crossentropy": 2.3927958846092223, "loss/hidden": 3.5265625, "loss/incoh": 0.0, "loss/logits": 0.3984386846423149, "loss/reg": 0.0, "step": 1870 }, { "epoch": 0.012368421052631579, "grad_norm": 2.734375, "grad_norm_var": 0.6073893229166667, "learning_rate": 0.0001, "loss": 3.9074, "loss/crossentropy": 2.6031975388526916, "loss/hidden": 3.38125, "loss/incoh": 0.0, "loss/logits": 0.4128950208425522, "loss/reg": 0.0, "step": 1880 }, { "epoch": 0.012434210526315789, "grad_norm": 3.65625, "grad_norm_var": 0.3526845296223958, "learning_rate": 0.0001, "loss": 3.9143, "loss/crossentropy": 2.7405603647232057, "loss/hidden": 3.25625, "loss/incoh": 0.0, "loss/logits": 0.37195596396923064, "loss/reg": 0.0, "step": 1890 }, { "epoch": 0.0125, "grad_norm": 3.3125, "grad_norm_var": 130.27611389160157, "learning_rate": 0.0001, "loss": 3.986, "loss/crossentropy": 2.6554584980010985, "loss/hidden": 3.5, "loss/incoh": 0.0, "loss/logits": 0.3823524177074432, "loss/reg": 0.0, "step": 1900 }, { "epoch": 0.01256578947368421, "grad_norm": 3.6875, "grad_norm_var": 130.43673400878907, "learning_rate": 0.0001, "loss": 3.8682, "loss/crossentropy": 2.6816349744796755, "loss/hidden": 3.421875, "loss/incoh": 0.0, "loss/logits": 0.43074882328510283, "loss/reg": 0.0, "step": 1910 }, { "epoch": 0.01263157894736842, "grad_norm": 3.0, "grad_norm_var": 10.65947265625, "learning_rate": 0.0001, "loss": 4.0152, "loss/crossentropy": 2.1248778343200683, "loss/hidden": 3.6953125, "loss/incoh": 0.0, "loss/logits": 0.4099419146776199, "loss/reg": 0.0, "step": 1920 }, { "epoch": 0.01269736842105263, "grad_norm": 2.828125, "grad_norm_var": 0.41845296223958334, "learning_rate": 0.0001, "loss": 3.9099, "loss/crossentropy": 2.485488569736481, "loss/hidden": 3.3921875, "loss/incoh": 0.0, "loss/logits": 0.3766200736165047, "loss/reg": 0.0, "step": 1930 }, { "epoch": 0.012763157894736843, "grad_norm": 4.375, "grad_norm_var": 2.3640777587890627, "learning_rate": 0.0001, "loss": 4.0886, "loss/crossentropy": 2.896865522861481, "loss/hidden": 4.4, "loss/incoh": 0.0, "loss/logits": 0.6787679702043533, "loss/reg": 0.0, "step": 1940 }, { "epoch": 0.012828947368421053, "grad_norm": 3.34375, "grad_norm_var": 34.95194905598958, "learning_rate": 0.0001, "loss": 4.0285, "loss/crossentropy": 2.348608684539795, "loss/hidden": 3.6171875, "loss/incoh": 0.0, "loss/logits": 0.4279856622219086, "loss/reg": 0.0, "step": 1950 }, { "epoch": 0.012894736842105263, "grad_norm": 3.65625, "grad_norm_var": 1.5377278645833334, "learning_rate": 0.0001, "loss": 3.9097, "loss/crossentropy": 2.3533430814743044, "loss/hidden": 3.48125, "loss/incoh": 0.0, "loss/logits": 0.37981766164302827, "loss/reg": 0.0, "step": 1960 }, { "epoch": 0.012960526315789474, "grad_norm": 4.15625, "grad_norm_var": 0.23593648274739584, "learning_rate": 0.0001, "loss": 3.9802, "loss/crossentropy": 2.54624525308609, "loss/hidden": 3.790625, "loss/incoh": 0.0, "loss/logits": 0.4486588716506958, "loss/reg": 0.0, "step": 1970 }, { "epoch": 0.013026315789473684, "grad_norm": 2.859375, "grad_norm_var": 0.24025777180989583, "learning_rate": 0.0001, "loss": 3.8942, "loss/crossentropy": 2.5797088146209717, "loss/hidden": 3.7875, "loss/incoh": 0.0, "loss/logits": 0.4970128297805786, "loss/reg": 0.0, "step": 1980 }, { "epoch": 0.013092105263157894, "grad_norm": 3.9375, "grad_norm_var": 0.47609049479166665, "learning_rate": 0.0001, "loss": 3.934, "loss/crossentropy": 2.555588161945343, "loss/hidden": 3.38125, "loss/incoh": 0.0, "loss/logits": 0.38218972086906433, "loss/reg": 0.0, "step": 1990 }, { "epoch": 0.013157894736842105, "grad_norm": 4.03125, "grad_norm_var": 1.0361073811848958, "learning_rate": 0.0001, "loss": 3.8759, "loss/crossentropy": 2.164474868774414, "loss/hidden": 3.7125, "loss/incoh": 0.0, "loss/logits": 0.3994155451655388, "loss/reg": 0.0, "step": 2000 }, { "epoch": 0.013223684210526316, "grad_norm": 3.453125, "grad_norm_var": 0.8633778889973959, "learning_rate": 0.0001, "loss": 3.8317, "loss/crossentropy": 2.558875060081482, "loss/hidden": 3.471875, "loss/incoh": 0.0, "loss/logits": 0.3639406472444534, "loss/reg": 0.0, "step": 2010 }, { "epoch": 0.013289473684210526, "grad_norm": 3.75, "grad_norm_var": 1.5050740559895834, "learning_rate": 0.0001, "loss": 3.9716, "loss/crossentropy": 2.3385006546974183, "loss/hidden": 3.4359375, "loss/incoh": 0.0, "loss/logits": 0.3938352942466736, "loss/reg": 0.0, "step": 2020 }, { "epoch": 0.013355263157894736, "grad_norm": 3.15625, "grad_norm_var": 0.9781158447265625, "learning_rate": 0.0001, "loss": 3.9531, "loss/crossentropy": 2.4627522945404055, "loss/hidden": 3.5578125, "loss/incoh": 0.0, "loss/logits": 0.495425808429718, "loss/reg": 0.0, "step": 2030 }, { "epoch": 0.013421052631578948, "grad_norm": 2.75, "grad_norm_var": 1.949779256184896, "learning_rate": 0.0001, "loss": 3.9979, "loss/crossentropy": 2.11747065782547, "loss/hidden": 3.59375, "loss/incoh": 0.0, "loss/logits": 0.36948435604572294, "loss/reg": 0.0, "step": 2040 }, { "epoch": 0.013486842105263158, "grad_norm": 3.0625, "grad_norm_var": 1.5490549723307292, "learning_rate": 0.0001, "loss": 3.8381, "loss/crossentropy": 2.473706376552582, "loss/hidden": 3.4265625, "loss/incoh": 0.0, "loss/logits": 0.3867632657289505, "loss/reg": 0.0, "step": 2050 }, { "epoch": 0.013552631578947368, "grad_norm": 3.125, "grad_norm_var": 2.1093470786676653e+17, "learning_rate": 0.0001, "loss": 4.0319, "loss/crossentropy": 2.4225202679634092, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.3675911784172058, "loss/reg": 0.0, "step": 2060 }, { "epoch": 0.013618421052631578, "grad_norm": 5.625, "grad_norm_var": 76.3380849202474, "learning_rate": 0.0001, "loss": 4.0746, "loss/crossentropy": 2.48265061378479, "loss/hidden": 3.7265625, "loss/incoh": 0.0, "loss/logits": 0.4639868468046188, "loss/reg": 0.0, "step": 2070 }, { "epoch": 0.01368421052631579, "grad_norm": 4.5, "grad_norm_var": 0.7881011962890625, "learning_rate": 0.0001, "loss": 4.038, "loss/crossentropy": 2.7775272965431212, "loss/hidden": 4.0609375, "loss/incoh": 0.0, "loss/logits": 0.4948854446411133, "loss/reg": 0.0, "step": 2080 }, { "epoch": 0.01375, "grad_norm": 3.296875, "grad_norm_var": 0.8111612955729167, "learning_rate": 0.0001, "loss": 3.8968, "loss/crossentropy": 2.4613396763801574, "loss/hidden": 3.3484375, "loss/incoh": 0.0, "loss/logits": 0.3684074327349663, "loss/reg": 0.0, "step": 2090 }, { "epoch": 0.01381578947368421, "grad_norm": 3.484375, "grad_norm_var": 1.490550740559896, "learning_rate": 0.0001, "loss": 3.867, "loss/crossentropy": 2.5947747588157655, "loss/hidden": 3.4765625, "loss/incoh": 0.0, "loss/logits": 0.3717468947172165, "loss/reg": 0.0, "step": 2100 }, { "epoch": 0.013881578947368422, "grad_norm": 4.75, "grad_norm_var": 1.7372385660807292, "learning_rate": 0.0001, "loss": 3.8731, "loss/crossentropy": 2.466842460632324, "loss/hidden": 3.2890625, "loss/incoh": 0.0, "loss/logits": 0.3417574405670166, "loss/reg": 0.0, "step": 2110 }, { "epoch": 0.013947368421052632, "grad_norm": 4.21875, "grad_norm_var": 1.8861002604166666, "learning_rate": 0.0001, "loss": 3.823, "loss/crossentropy": 2.3006282687187194, "loss/hidden": 3.390625, "loss/incoh": 0.0, "loss/logits": 0.36191926896572113, "loss/reg": 0.0, "step": 2120 }, { "epoch": 0.014013157894736842, "grad_norm": 4.21875, "grad_norm_var": 0.9100901285807291, "learning_rate": 0.0001, "loss": 3.8933, "loss/crossentropy": 2.6159239768981934, "loss/hidden": 3.5109375, "loss/incoh": 0.0, "loss/logits": 0.46396631598472593, "loss/reg": 0.0, "step": 2130 }, { "epoch": 0.014078947368421052, "grad_norm": 3.734375, "grad_norm_var": 0.9789388020833333, "learning_rate": 0.0001, "loss": 3.8761, "loss/crossentropy": 2.6355370759963987, "loss/hidden": 3.3421875, "loss/incoh": 0.0, "loss/logits": 0.361694809794426, "loss/reg": 0.0, "step": 2140 }, { "epoch": 0.014144736842105264, "grad_norm": 2.875, "grad_norm_var": 0.30113525390625, "learning_rate": 0.0001, "loss": 3.8617, "loss/crossentropy": 2.6357606053352356, "loss/hidden": 3.3515625, "loss/incoh": 0.0, "loss/logits": 0.3612044155597687, "loss/reg": 0.0, "step": 2150 }, { "epoch": 0.014210526315789474, "grad_norm": 2.921875, "grad_norm_var": 0.23430582682291667, "learning_rate": 0.0001, "loss": 3.8903, "loss/crossentropy": 2.551873171329498, "loss/hidden": 3.44375, "loss/incoh": 0.0, "loss/logits": 0.41060586273670197, "loss/reg": 0.0, "step": 2160 }, { "epoch": 0.014276315789473684, "grad_norm": 3.5625, "grad_norm_var": 0.48121744791666665, "learning_rate": 0.0001, "loss": 3.9663, "loss/crossentropy": 2.495719885826111, "loss/hidden": 3.690625, "loss/incoh": 0.0, "loss/logits": 0.4500987708568573, "loss/reg": 0.0, "step": 2170 }, { "epoch": 0.014342105263157894, "grad_norm": 2.8125, "grad_norm_var": 0.17538655598958333, "learning_rate": 0.0001, "loss": 3.8053, "loss/crossentropy": 2.520111393928528, "loss/hidden": 3.4765625, "loss/incoh": 0.0, "loss/logits": 0.44193484634160995, "loss/reg": 0.0, "step": 2180 }, { "epoch": 0.014407894736842106, "grad_norm": 3.328125, "grad_norm_var": 31.391422526041666, "learning_rate": 0.0001, "loss": 4.0394, "loss/crossentropy": 2.5524103164672853, "loss/hidden": 3.4203125, "loss/incoh": 0.0, "loss/logits": 0.38946655094623567, "loss/reg": 0.0, "step": 2190 }, { "epoch": 0.014473684210526316, "grad_norm": 8.6875, "grad_norm_var": 2.0572499593098956, "learning_rate": 0.0001, "loss": 3.9758, "loss/crossentropy": 2.2560265123844148, "loss/hidden": 3.59375, "loss/incoh": 0.0, "loss/logits": 0.4082709074020386, "loss/reg": 0.0, "step": 2200 }, { "epoch": 0.014539473684210526, "grad_norm": 3.25, "grad_norm_var": 3.566722615559896, "learning_rate": 0.0001, "loss": 3.8174, "loss/crossentropy": 2.2830474019050597, "loss/hidden": 3.3375, "loss/incoh": 0.0, "loss/logits": 0.3462225392460823, "loss/reg": 0.0, "step": 2210 }, { "epoch": 0.014605263157894737, "grad_norm": 3.109375, "grad_norm_var": 0.155615234375, "learning_rate": 0.0001, "loss": 3.8468, "loss/crossentropy": 2.3341428637504578, "loss/hidden": 3.4375, "loss/incoh": 0.0, "loss/logits": 0.4024402230978012, "loss/reg": 0.0, "step": 2220 }, { "epoch": 0.014671052631578948, "grad_norm": 2.984375, "grad_norm_var": 5.723542277018229, "learning_rate": 0.0001, "loss": 3.9699, "loss/crossentropy": 2.201635646820068, "loss/hidden": 3.2703125, "loss/incoh": 0.0, "loss/logits": 0.3397625252604485, "loss/reg": 0.0, "step": 2230 }, { "epoch": 0.014736842105263158, "grad_norm": 2.796875, "grad_norm_var": 47.54188537597656, "learning_rate": 0.0001, "loss": 3.9281, "loss/crossentropy": 2.5724541902542115, "loss/hidden": 3.2890625, "loss/incoh": 0.0, "loss/logits": 0.3728118479251862, "loss/reg": 0.0, "step": 2240 }, { "epoch": 0.014802631578947368, "grad_norm": 2.984375, "grad_norm_var": 51.28417867024739, "learning_rate": 0.0001, "loss": 3.8237, "loss/crossentropy": 2.5087037920951842, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.3448286011815071, "loss/reg": 0.0, "step": 2250 }, { "epoch": 0.01486842105263158, "grad_norm": 3.171875, "grad_norm_var": 0.05579427083333333, "learning_rate": 0.0001, "loss": 3.8448, "loss/crossentropy": 2.4813582420349123, "loss/hidden": 3.409375, "loss/incoh": 0.0, "loss/logits": 0.40958506166934966, "loss/reg": 0.0, "step": 2260 }, { "epoch": 0.01493421052631579, "grad_norm": 2.96875, "grad_norm_var": 0.16342671712239584, "learning_rate": 0.0001, "loss": 3.753, "loss/crossentropy": 2.523963761329651, "loss/hidden": 3.6640625, "loss/incoh": 0.0, "loss/logits": 0.38928901553153994, "loss/reg": 0.0, "step": 2270 }, { "epoch": 0.015, "grad_norm": 3.0625, "grad_norm_var": 0.7111317952473958, "learning_rate": 0.0001, "loss": 3.8078, "loss/crossentropy": 2.6787729024887086, "loss/hidden": 3.71875, "loss/incoh": 0.0, "loss/logits": 0.4244162023067474, "loss/reg": 0.0, "step": 2280 }, { "epoch": 0.015065789473684211, "grad_norm": 3.09375, "grad_norm_var": 0.10339253743489583, "learning_rate": 0.0001, "loss": 3.7562, "loss/crossentropy": 2.191652774810791, "loss/hidden": 3.4140625, "loss/incoh": 0.0, "loss/logits": 0.3300579100847244, "loss/reg": 0.0, "step": 2290 }, { "epoch": 0.015131578947368421, "grad_norm": 2.890625, "grad_norm_var": 0.05054423014322917, "learning_rate": 0.0001, "loss": 3.73, "loss/crossentropy": 2.4444664478302003, "loss/hidden": 3.4265625, "loss/incoh": 0.0, "loss/logits": 0.4327535033226013, "loss/reg": 0.0, "step": 2300 }, { "epoch": 0.015197368421052631, "grad_norm": 3.046875, "grad_norm_var": 0.15128580729166666, "learning_rate": 0.0001, "loss": 3.7723, "loss/crossentropy": 2.2329365968704225, "loss/hidden": 3.51875, "loss/incoh": 0.0, "loss/logits": 0.3683215394616127, "loss/reg": 0.0, "step": 2310 }, { "epoch": 0.015263157894736841, "grad_norm": 3.03125, "grad_norm_var": 0.3289713541666667, "learning_rate": 0.0001, "loss": 3.7699, "loss/crossentropy": 2.5342983961105348, "loss/hidden": 3.2921875, "loss/incoh": 0.0, "loss/logits": 0.35688025653362276, "loss/reg": 0.0, "step": 2320 }, { "epoch": 0.015328947368421053, "grad_norm": 3.15625, "grad_norm_var": 0.5997060139973959, "learning_rate": 0.0001, "loss": 3.8494, "loss/crossentropy": 2.4934002995491027, "loss/hidden": 3.1546875, "loss/incoh": 0.0, "loss/logits": 0.3495032548904419, "loss/reg": 0.0, "step": 2330 }, { "epoch": 0.015394736842105263, "grad_norm": 4.28125, "grad_norm_var": 1.570759073893229, "learning_rate": 0.0001, "loss": 3.9628, "loss/crossentropy": 2.2064894437789917, "loss/hidden": 3.478125, "loss/incoh": 0.0, "loss/logits": 0.34214983880519867, "loss/reg": 0.0, "step": 2340 }, { "epoch": 0.015460526315789473, "grad_norm": 3.5, "grad_norm_var": 1.9128865559895833, "learning_rate": 0.0001, "loss": 3.9538, "loss/crossentropy": 2.5408032178878783, "loss/hidden": 3.515625, "loss/incoh": 0.0, "loss/logits": 0.4111128658056259, "loss/reg": 0.0, "step": 2350 }, { "epoch": 0.015526315789473685, "grad_norm": 3.765625, "grad_norm_var": 0.39661051432291666, "learning_rate": 0.0001, "loss": 3.8897, "loss/crossentropy": 2.4922020554542543, "loss/hidden": 3.3953125, "loss/incoh": 0.0, "loss/logits": 0.36137166023254397, "loss/reg": 0.0, "step": 2360 }, { "epoch": 0.015592105263157895, "grad_norm": 4.09375, "grad_norm_var": 0.21868082682291667, "learning_rate": 0.0001, "loss": 3.8461, "loss/crossentropy": 2.427833843231201, "loss/hidden": 3.5921875, "loss/incoh": 0.0, "loss/logits": 0.36205882132053374, "loss/reg": 0.0, "step": 2370 }, { "epoch": 0.015657894736842107, "grad_norm": 3.171875, "grad_norm_var": 0.29641520182291664, "learning_rate": 0.0001, "loss": 3.767, "loss/crossentropy": 2.3795222878456115, "loss/hidden": 3.5140625, "loss/incoh": 0.0, "loss/logits": 0.40104621052742007, "loss/reg": 0.0, "step": 2380 }, { "epoch": 0.015723684210526317, "grad_norm": 3.171875, "grad_norm_var": 0.4603017171223958, "learning_rate": 0.0001, "loss": 3.8162, "loss/crossentropy": 2.3675019264221193, "loss/hidden": 3.4, "loss/incoh": 0.0, "loss/logits": 0.3826363369822502, "loss/reg": 0.0, "step": 2390 }, { "epoch": 0.015789473684210527, "grad_norm": 3.59375, "grad_norm_var": 0.12642313639322916, "learning_rate": 0.0001, "loss": 3.8017, "loss/crossentropy": 2.56625235080719, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.3704580098390579, "loss/reg": 0.0, "step": 2400 }, { "epoch": 0.015855263157894737, "grad_norm": 2.8125, "grad_norm_var": 0.22603759765625, "learning_rate": 0.0001, "loss": 3.8356, "loss/crossentropy": 2.3642316341400145, "loss/hidden": 3.640625, "loss/incoh": 0.0, "loss/logits": 0.4971610188484192, "loss/reg": 0.0, "step": 2410 }, { "epoch": 0.015921052631578947, "grad_norm": 3.359375, "grad_norm_var": 2.7929354478016266e+17, "learning_rate": 0.0001, "loss": 3.9524, "loss/crossentropy": 2.5260600686073302, "loss/hidden": 3.4734375, "loss/incoh": 0.0, "loss/logits": 0.34504298865795135, "loss/reg": 0.0, "step": 2420 }, { "epoch": 0.015986842105263157, "grad_norm": 3.140625, "grad_norm_var": 2.792935447600693e+17, "learning_rate": 0.0001, "loss": 3.7506, "loss/crossentropy": 2.6639176845550536, "loss/hidden": 3.184375, "loss/incoh": 0.0, "loss/logits": 0.3401388913393021, "loss/reg": 0.0, "step": 2430 }, { "epoch": 0.016052631578947367, "grad_norm": 2.890625, "grad_norm_var": 0.0854644775390625, "learning_rate": 0.0001, "loss": 3.7687, "loss/crossentropy": 2.3805726766586304, "loss/hidden": 3.1671875, "loss/incoh": 0.0, "loss/logits": 0.3257554292678833, "loss/reg": 0.0, "step": 2440 }, { "epoch": 0.01611842105263158, "grad_norm": 8.125, "grad_norm_var": 1.995349713361273e+17, "learning_rate": 0.0001, "loss": 4.0533, "loss/crossentropy": 2.370633268356323, "loss/hidden": 3.4765625, "loss/incoh": 0.0, "loss/logits": 0.3756751254200935, "loss/reg": 0.0, "step": 2450 }, { "epoch": 0.01618421052631579, "grad_norm": 2.75, "grad_norm_var": 1.995349712714498e+17, "learning_rate": 0.0001, "loss": 3.7822, "loss/crossentropy": 2.5394015312194824, "loss/hidden": 3.3125, "loss/incoh": 0.0, "loss/logits": 0.34547194838523865, "loss/reg": 0.0, "step": 2460 }, { "epoch": 0.01625, "grad_norm": 3.15625, "grad_norm_var": 0.0999664306640625, "learning_rate": 0.0001, "loss": 3.6838, "loss/crossentropy": 2.478586256504059, "loss/hidden": 3.4109375, "loss/incoh": 0.0, "loss/logits": 0.4159191906452179, "loss/reg": 0.0, "step": 2470 }, { "epoch": 0.01631578947368421, "grad_norm": 2.703125, "grad_norm_var": 0.48103739420572916, "learning_rate": 0.0001, "loss": 3.7274, "loss/crossentropy": 2.3859502553939818, "loss/hidden": 3.4234375, "loss/incoh": 0.0, "loss/logits": 0.36551299393177034, "loss/reg": 0.0, "step": 2480 }, { "epoch": 0.01638157894736842, "grad_norm": 3.171875, "grad_norm_var": 0.33056640625, "learning_rate": 0.0001, "loss": 3.8413, "loss/crossentropy": 2.47422776222229, "loss/hidden": 3.2703125, "loss/incoh": 0.0, "loss/logits": 0.34280748963356017, "loss/reg": 0.0, "step": 2490 }, { "epoch": 0.01644736842105263, "grad_norm": 3.03125, "grad_norm_var": 0.30504150390625, "learning_rate": 0.0001, "loss": 3.7262, "loss/crossentropy": 2.288319444656372, "loss/hidden": 3.2109375, "loss/incoh": 0.0, "loss/logits": 0.31078503280878067, "loss/reg": 0.0, "step": 2500 }, { "epoch": 0.01651315789473684, "grad_norm": 3.125, "grad_norm_var": 0.2894683837890625, "learning_rate": 0.0001, "loss": 3.8194, "loss/crossentropy": 2.2243799686431887, "loss/hidden": 3.434375, "loss/incoh": 0.0, "loss/logits": 0.33974049538373946, "loss/reg": 0.0, "step": 2510 }, { "epoch": 0.016578947368421054, "grad_norm": 3.046875, "grad_norm_var": 0.3177235921223958, "learning_rate": 0.0001, "loss": 3.6925, "loss/crossentropy": 2.63401620388031, "loss/hidden": 3.39375, "loss/incoh": 0.0, "loss/logits": 0.39840718507766726, "loss/reg": 0.0, "step": 2520 }, { "epoch": 0.016644736842105264, "grad_norm": 3.046875, "grad_norm_var": 0.2181549072265625, "learning_rate": 0.0001, "loss": 3.7573, "loss/crossentropy": 2.4925423860549927, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.3407833933830261, "loss/reg": 0.0, "step": 2530 }, { "epoch": 0.016710526315789474, "grad_norm": 3.328125, "grad_norm_var": 0.19260152180989584, "learning_rate": 0.0001, "loss": 3.7514, "loss/crossentropy": 2.3376643657684326, "loss/hidden": 3.4609375, "loss/incoh": 0.0, "loss/logits": 0.4351751744747162, "loss/reg": 0.0, "step": 2540 }, { "epoch": 0.016776315789473684, "grad_norm": 2.78125, "grad_norm_var": 0.12961324055989584, "learning_rate": 0.0001, "loss": 3.7217, "loss/crossentropy": 2.279516875743866, "loss/hidden": 3.58125, "loss/incoh": 0.0, "loss/logits": 0.37157190442085264, "loss/reg": 0.0, "step": 2550 }, { "epoch": 0.016842105263157894, "grad_norm": 2.921875, "grad_norm_var": 0.0912017822265625, "learning_rate": 0.0001, "loss": 3.6839, "loss/crossentropy": 2.2843039661645888, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.33208019435405733, "loss/reg": 0.0, "step": 2560 }, { "epoch": 0.016907894736842104, "grad_norm": 3.28125, "grad_norm_var": 0.32079671223958334, "learning_rate": 0.0001, "loss": 3.7381, "loss/crossentropy": 2.2308380246162414, "loss/hidden": 3.29375, "loss/incoh": 0.0, "loss/logits": 0.30514844954013826, "loss/reg": 0.0, "step": 2570 }, { "epoch": 0.016973684210526314, "grad_norm": 2.921875, "grad_norm_var": 0.6193522135416667, "learning_rate": 0.0001, "loss": 3.7209, "loss/crossentropy": 2.500720489025116, "loss/hidden": 3.2859375, "loss/incoh": 0.0, "loss/logits": 0.38955146372318267, "loss/reg": 0.0, "step": 2580 }, { "epoch": 0.017039473684210528, "grad_norm": 3.0, "grad_norm_var": 0.09539388020833334, "learning_rate": 0.0001, "loss": 3.6933, "loss/crossentropy": 2.397514319419861, "loss/hidden": 3.4796875, "loss/incoh": 0.0, "loss/logits": 0.426153627038002, "loss/reg": 0.0, "step": 2590 }, { "epoch": 0.017105263157894738, "grad_norm": 2.984375, "grad_norm_var": 0.44207356770833334, "learning_rate": 0.0001, "loss": 3.7927, "loss/crossentropy": 2.4746341586112974, "loss/hidden": 3.4515625, "loss/incoh": 0.0, "loss/logits": 0.3806774616241455, "loss/reg": 0.0, "step": 2600 }, { "epoch": 0.017171052631578948, "grad_norm": 3.109375, "grad_norm_var": 0.10927632649739584, "learning_rate": 0.0001, "loss": 3.7507, "loss/crossentropy": 2.6908259630203246, "loss/hidden": 3.459375, "loss/incoh": 0.0, "loss/logits": 0.47532927691936494, "loss/reg": 0.0, "step": 2610 }, { "epoch": 0.017236842105263158, "grad_norm": 2.75, "grad_norm_var": 1.8751780192057292, "learning_rate": 0.0001, "loss": 3.8498, "loss/crossentropy": 2.3184617161750793, "loss/hidden": 3.3125, "loss/incoh": 0.0, "loss/logits": 0.3832893192768097, "loss/reg": 0.0, "step": 2620 }, { "epoch": 0.017302631578947368, "grad_norm": 3.734375, "grad_norm_var": 1.95693359375, "learning_rate": 0.0001, "loss": 3.8697, "loss/crossentropy": 2.386726236343384, "loss/hidden": 3.38125, "loss/incoh": 0.0, "loss/logits": 0.4112528935074806, "loss/reg": 0.0, "step": 2630 }, { "epoch": 0.017368421052631578, "grad_norm": 3.09375, "grad_norm_var": 236.78067118326823, "learning_rate": 0.0001, "loss": 3.843, "loss/crossentropy": 2.6268559217453005, "loss/hidden": 3.525, "loss/incoh": 0.0, "loss/logits": 0.5286044746637344, "loss/reg": 0.0, "step": 2640 }, { "epoch": 0.017434210526315788, "grad_norm": 3.15625, "grad_norm_var": 238.80460510253906, "learning_rate": 0.0001, "loss": 3.7934, "loss/crossentropy": 2.621377694606781, "loss/hidden": 3.346875, "loss/incoh": 0.0, "loss/logits": 0.39670759439468384, "loss/reg": 0.0, "step": 2650 }, { "epoch": 0.0175, "grad_norm": 2.921875, "grad_norm_var": 0.4898274739583333, "learning_rate": 0.0001, "loss": 3.8686, "loss/crossentropy": 2.3771218061447144, "loss/hidden": 3.5328125, "loss/incoh": 0.0, "loss/logits": 0.39255764335393906, "loss/reg": 0.0, "step": 2660 }, { "epoch": 0.01756578947368421, "grad_norm": 2.96875, "grad_norm_var": 0.9812001546223958, "learning_rate": 0.0001, "loss": 3.6636, "loss/crossentropy": 2.2852025091648103, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.3551890656352043, "loss/reg": 0.0, "step": 2670 }, { "epoch": 0.017631578947368422, "grad_norm": 3.203125, "grad_norm_var": 0.38752848307291665, "learning_rate": 0.0001, "loss": 3.7338, "loss/crossentropy": 2.280995038151741, "loss/hidden": 3.271875, "loss/incoh": 0.0, "loss/logits": 0.34781029969453814, "loss/reg": 0.0, "step": 2680 }, { "epoch": 0.017697368421052632, "grad_norm": 2.953125, "grad_norm_var": 0.5138661702473958, "learning_rate": 0.0001, "loss": 3.7751, "loss/crossentropy": 2.5928542375564576, "loss/hidden": 3.2203125, "loss/incoh": 0.0, "loss/logits": 0.3643882930278778, "loss/reg": 0.0, "step": 2690 }, { "epoch": 0.017763157894736842, "grad_norm": 3.015625, "grad_norm_var": 2.5011057535807293, "learning_rate": 0.0001, "loss": 3.7281, "loss/crossentropy": 2.71818265914917, "loss/hidden": 3.3203125, "loss/incoh": 0.0, "loss/logits": 0.41874536871910095, "loss/reg": 0.0, "step": 2700 }, { "epoch": 0.017828947368421052, "grad_norm": 3.0625, "grad_norm_var": 0.2831939697265625, "learning_rate": 0.0001, "loss": 3.7723, "loss/crossentropy": 2.3557824969291685, "loss/hidden": 3.303125, "loss/incoh": 0.0, "loss/logits": 0.36439308822155, "loss/reg": 0.0, "step": 2710 }, { "epoch": 0.017894736842105262, "grad_norm": 3.390625, "grad_norm_var": 1.9459625244140626, "learning_rate": 0.0001, "loss": 3.854, "loss/crossentropy": 2.3793618083000183, "loss/hidden": 3.371875, "loss/incoh": 0.0, "loss/logits": 0.36435145139694214, "loss/reg": 0.0, "step": 2720 }, { "epoch": 0.017960526315789475, "grad_norm": 2.5, "grad_norm_var": 1.9445271809895834, "learning_rate": 0.0001, "loss": 3.7925, "loss/crossentropy": 2.364825797080994, "loss/hidden": 3.2453125, "loss/incoh": 0.0, "loss/logits": 0.3577578902244568, "loss/reg": 0.0, "step": 2730 }, { "epoch": 0.018026315789473685, "grad_norm": 2.9375, "grad_norm_var": 0.18502197265625, "learning_rate": 0.0001, "loss": 3.747, "loss/crossentropy": 2.3110872566699983, "loss/hidden": 3.4078125, "loss/incoh": 0.0, "loss/logits": 0.3662073493003845, "loss/reg": 0.0, "step": 2740 }, { "epoch": 0.018092105263157895, "grad_norm": 2.90625, "grad_norm_var": 0.06238606770833333, "learning_rate": 0.0001, "loss": 3.736, "loss/crossentropy": 2.4977880120277405, "loss/hidden": 3.3578125, "loss/incoh": 0.0, "loss/logits": 0.40198240578174593, "loss/reg": 0.0, "step": 2750 }, { "epoch": 0.018157894736842106, "grad_norm": 2.65625, "grad_norm_var": 1.3276357014973958, "learning_rate": 0.0001, "loss": 3.7594, "loss/crossentropy": 2.3260527729988096, "loss/hidden": 3.39375, "loss/incoh": 0.0, "loss/logits": 0.3752635881304741, "loss/reg": 0.0, "step": 2760 }, { "epoch": 0.018223684210526316, "grad_norm": 3.0, "grad_norm_var": 0.06177978515625, "learning_rate": 0.0001, "loss": 3.7632, "loss/crossentropy": 2.6722695350646974, "loss/hidden": 3.34375, "loss/incoh": 0.0, "loss/logits": 0.36579819619655607, "loss/reg": 0.0, "step": 2770 }, { "epoch": 0.018289473684210526, "grad_norm": 3.78125, "grad_norm_var": 0.07119038899739584, "learning_rate": 0.0001, "loss": 3.7821, "loss/crossentropy": 2.712409424781799, "loss/hidden": 3.59375, "loss/incoh": 0.0, "loss/logits": 0.43594706654548643, "loss/reg": 0.0, "step": 2780 }, { "epoch": 0.018355263157894736, "grad_norm": 3.21875, "grad_norm_var": 2.3453409830729166, "learning_rate": 0.0001, "loss": 3.7973, "loss/crossentropy": 2.547107517719269, "loss/hidden": 3.39375, "loss/incoh": 0.0, "loss/logits": 0.4092650800943375, "loss/reg": 0.0, "step": 2790 }, { "epoch": 0.018421052631578946, "grad_norm": 2.796875, "grad_norm_var": 4.506314086914062, "learning_rate": 0.0001, "loss": 3.6334, "loss/crossentropy": 2.5399341940879823, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.4391636699438095, "loss/reg": 0.0, "step": 2800 }, { "epoch": 0.01848684210526316, "grad_norm": 4.84375, "grad_norm_var": 2.5375152587890626, "learning_rate": 0.0001, "loss": 3.7292, "loss/crossentropy": 2.5819294929504393, "loss/hidden": 3.365625, "loss/incoh": 0.0, "loss/logits": 0.33986122310161593, "loss/reg": 0.0, "step": 2810 }, { "epoch": 0.01855263157894737, "grad_norm": 4.03125, "grad_norm_var": 0.3421295166015625, "learning_rate": 0.0001, "loss": 3.7003, "loss/crossentropy": 2.4064539194107057, "loss/hidden": 3.334375, "loss/incoh": 0.0, "loss/logits": 0.3262439340353012, "loss/reg": 0.0, "step": 2820 }, { "epoch": 0.01861842105263158, "grad_norm": 2.828125, "grad_norm_var": 0.1756988525390625, "learning_rate": 0.0001, "loss": 3.804, "loss/crossentropy": 2.6151478767395018, "loss/hidden": 3.6390625, "loss/incoh": 0.0, "loss/logits": 0.4677980303764343, "loss/reg": 0.0, "step": 2830 }, { "epoch": 0.01868421052631579, "grad_norm": 3.03125, "grad_norm_var": 0.10458577473958333, "learning_rate": 0.0001, "loss": 3.7672, "loss/crossentropy": 2.5210029244422913, "loss/hidden": 3.3953125, "loss/incoh": 0.0, "loss/logits": 0.40769249498844146, "loss/reg": 0.0, "step": 2840 }, { "epoch": 0.01875, "grad_norm": 3.28125, "grad_norm_var": 0.1360504150390625, "learning_rate": 0.0001, "loss": 3.7829, "loss/crossentropy": 2.3390053629875185, "loss/hidden": 3.4421875, "loss/incoh": 0.0, "loss/logits": 0.3656760662794113, "loss/reg": 0.0, "step": 2850 }, { "epoch": 0.01881578947368421, "grad_norm": 2.609375, "grad_norm_var": 0.21334635416666667, "learning_rate": 0.0001, "loss": 3.7445, "loss/crossentropy": 2.044054812192917, "loss/hidden": 3.55625, "loss/incoh": 0.0, "loss/logits": 0.45996856689453125, "loss/reg": 0.0, "step": 2860 }, { "epoch": 0.01888157894736842, "grad_norm": 2.921875, "grad_norm_var": 0.35147196451822915, "learning_rate": 0.0001, "loss": 3.793, "loss/crossentropy": 2.2911840200424196, "loss/hidden": 3.5703125, "loss/incoh": 0.0, "loss/logits": 0.3373717874288559, "loss/reg": 0.0, "step": 2870 }, { "epoch": 0.018947368421052633, "grad_norm": 2.515625, "grad_norm_var": 0.3585845947265625, "learning_rate": 0.0001, "loss": 3.7057, "loss/crossentropy": 2.3755346536636353, "loss/hidden": 3.3609375, "loss/incoh": 0.0, "loss/logits": 0.36185318529605864, "loss/reg": 0.0, "step": 2880 }, { "epoch": 0.019013157894736843, "grad_norm": 2.859375, "grad_norm_var": 0.0810211181640625, "learning_rate": 0.0001, "loss": 3.7152, "loss/crossentropy": 2.1650418758392336, "loss/hidden": 3.5109375, "loss/incoh": 0.0, "loss/logits": 0.36209405958652496, "loss/reg": 0.0, "step": 2890 }, { "epoch": 0.019078947368421053, "grad_norm": 3.171875, "grad_norm_var": 1.6020182291666667, "learning_rate": 0.0001, "loss": 3.8985, "loss/crossentropy": 2.345675766468048, "loss/hidden": 3.9203125, "loss/incoh": 0.0, "loss/logits": 0.41470250189304353, "loss/reg": 0.0, "step": 2900 }, { "epoch": 0.019144736842105263, "grad_norm": 3.359375, "grad_norm_var": 1.289264933268229, "learning_rate": 0.0001, "loss": 3.8444, "loss/crossentropy": 2.3634544610977173, "loss/hidden": 3.2984375, "loss/incoh": 0.0, "loss/logits": 0.3347387105226517, "loss/reg": 0.0, "step": 2910 }, { "epoch": 0.019210526315789473, "grad_norm": 2.890625, "grad_norm_var": 0.19099019368489584, "learning_rate": 0.0001, "loss": 3.7385, "loss/crossentropy": 2.3202159285545347, "loss/hidden": 3.3265625, "loss/incoh": 0.0, "loss/logits": 0.30690879598259924, "loss/reg": 0.0, "step": 2920 }, { "epoch": 0.019276315789473683, "grad_norm": 3.484375, "grad_norm_var": 0.4906209309895833, "learning_rate": 0.0001, "loss": 3.6643, "loss/crossentropy": 2.2004665434360504, "loss/hidden": 3.484375, "loss/incoh": 0.0, "loss/logits": 0.3603193074464798, "loss/reg": 0.0, "step": 2930 }, { "epoch": 0.019342105263157893, "grad_norm": 3.421875, "grad_norm_var": 0.13365478515625, "learning_rate": 0.0001, "loss": 3.6656, "loss/crossentropy": 2.5021592140197755, "loss/hidden": 3.4109375, "loss/incoh": 0.0, "loss/logits": 0.35134916603565214, "loss/reg": 0.0, "step": 2940 }, { "epoch": 0.019407894736842107, "grad_norm": 2.765625, "grad_norm_var": 1.0520497639973958, "learning_rate": 0.0001, "loss": 3.7813, "loss/crossentropy": 2.4475439548492433, "loss/hidden": 3.44375, "loss/incoh": 0.0, "loss/logits": 0.4413463234901428, "loss/reg": 0.0, "step": 2950 }, { "epoch": 0.019473684210526317, "grad_norm": 2.90625, "grad_norm_var": 1.761279296875, "learning_rate": 0.0001, "loss": 3.7476, "loss/crossentropy": 2.57927063703537, "loss/hidden": 3.4234375, "loss/incoh": 0.0, "loss/logits": 0.4805259481072426, "loss/reg": 0.0, "step": 2960 }, { "epoch": 0.019539473684210527, "grad_norm": 3.734375, "grad_norm_var": 0.2982737223307292, "learning_rate": 0.0001, "loss": 3.7277, "loss/crossentropy": 2.0291129291057586, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.285884577780962, "loss/reg": 0.0, "step": 2970 }, { "epoch": 0.019605263157894737, "grad_norm": 3.03125, "grad_norm_var": 0.6626942952473959, "learning_rate": 0.0001, "loss": 3.8596, "loss/crossentropy": 2.112092435359955, "loss/hidden": 3.4453125, "loss/incoh": 0.0, "loss/logits": 0.36193700730800626, "loss/reg": 0.0, "step": 2980 }, { "epoch": 0.019671052631578947, "grad_norm": 3.359375, "grad_norm_var": 0.16502176920572917, "learning_rate": 0.0001, "loss": 3.7666, "loss/crossentropy": 2.2168065547943114, "loss/hidden": 3.3734375, "loss/incoh": 0.0, "loss/logits": 0.3593168243765831, "loss/reg": 0.0, "step": 2990 }, { "epoch": 0.019736842105263157, "grad_norm": 2.921875, "grad_norm_var": 0.45120035807291664, "learning_rate": 0.0001, "loss": 3.7107, "loss/crossentropy": 2.350427895784378, "loss/hidden": 3.3109375, "loss/incoh": 0.0, "loss/logits": 0.37478172183036806, "loss/reg": 0.0, "step": 3000 }, { "epoch": 0.019802631578947367, "grad_norm": 2.71875, "grad_norm_var": 0.15097249348958333, "learning_rate": 0.0001, "loss": 3.7559, "loss/crossentropy": 2.6041540622711183, "loss/hidden": 3.3421875, "loss/incoh": 0.0, "loss/logits": 0.366491025686264, "loss/reg": 0.0, "step": 3010 }, { "epoch": 0.01986842105263158, "grad_norm": 2.953125, "grad_norm_var": 0.1023834228515625, "learning_rate": 0.0001, "loss": 3.7322, "loss/crossentropy": 2.2936912298202516, "loss/hidden": 3.4234375, "loss/incoh": 0.0, "loss/logits": 0.3990582287311554, "loss/reg": 0.0, "step": 3020 }, { "epoch": 0.01993421052631579, "grad_norm": 2.9375, "grad_norm_var": 0.07664388020833333, "learning_rate": 0.0001, "loss": 3.7405, "loss/crossentropy": 2.1873862028121946, "loss/hidden": 3.4703125, "loss/incoh": 0.0, "loss/logits": 0.4286995857954025, "loss/reg": 0.0, "step": 3030 }, { "epoch": 0.02, "grad_norm": 2.53125, "grad_norm_var": 0.3075480143229167, "learning_rate": 0.0001, "loss": 3.7681, "loss/crossentropy": 2.0712085247039793, "loss/hidden": 3.096875, "loss/incoh": 0.0, "loss/logits": 0.2994038611650467, "loss/reg": 0.0, "step": 3040 }, { "epoch": 0.02006578947368421, "grad_norm": 2.734375, "grad_norm_var": 0.6994049072265625, "learning_rate": 0.0001, "loss": 3.7519, "loss/crossentropy": 2.155745780467987, "loss/hidden": 3.2890625, "loss/incoh": 0.0, "loss/logits": 0.3106741845607758, "loss/reg": 0.0, "step": 3050 }, { "epoch": 0.02013157894736842, "grad_norm": 3.53125, "grad_norm_var": 0.177978515625, "learning_rate": 0.0001, "loss": 3.7618, "loss/crossentropy": 2.1236796349287035, "loss/hidden": 3.2046875, "loss/incoh": 0.0, "loss/logits": 0.3017591178417206, "loss/reg": 0.0, "step": 3060 }, { "epoch": 0.02019736842105263, "grad_norm": 3.265625, "grad_norm_var": 0.16617431640625, "learning_rate": 0.0001, "loss": 3.7225, "loss/crossentropy": 2.4667672872543336, "loss/hidden": 3.3328125, "loss/incoh": 0.0, "loss/logits": 0.3858541399240494, "loss/reg": 0.0, "step": 3070 }, { "epoch": 0.02026315789473684, "grad_norm": 2.703125, "grad_norm_var": 2.2265110394707968e+17, "learning_rate": 0.0001, "loss": 3.8077, "loss/crossentropy": 2.53361736536026, "loss/hidden": 3.121875, "loss/incoh": 0.0, "loss/logits": 0.3336446687579155, "loss/reg": 0.0, "step": 3080 }, { "epoch": 0.020328947368421054, "grad_norm": 2.34375, "grad_norm_var": 2.2265110387924992e+17, "learning_rate": 0.0001, "loss": 3.7067, "loss/crossentropy": 2.3802372574806214, "loss/hidden": 3.2609375, "loss/incoh": 0.0, "loss/logits": 0.3561277031898499, "loss/reg": 0.0, "step": 3090 }, { "epoch": 0.020394736842105264, "grad_norm": 4.53125, "grad_norm_var": 1.25572509765625, "learning_rate": 0.0001, "loss": 3.8816, "loss/crossentropy": 2.8750504910945893, "loss/hidden": 3.728125, "loss/incoh": 0.0, "loss/logits": 0.37489808425307275, "loss/reg": 0.0, "step": 3100 }, { "epoch": 0.020460526315789474, "grad_norm": 2.640625, "grad_norm_var": 0.77646484375, "learning_rate": 0.0001, "loss": 3.6678, "loss/crossentropy": 2.258682942390442, "loss/hidden": 3.4890625, "loss/incoh": 0.0, "loss/logits": 0.3497451141476631, "loss/reg": 0.0, "step": 3110 }, { "epoch": 0.020526315789473684, "grad_norm": 2.8125, "grad_norm_var": 0.047412109375, "learning_rate": 0.0001, "loss": 3.6585, "loss/crossentropy": 2.3022143959999086, "loss/hidden": 3.375, "loss/incoh": 0.0, "loss/logits": 0.37089207768440247, "loss/reg": 0.0, "step": 3120 }, { "epoch": 0.020592105263157894, "grad_norm": 2.921875, "grad_norm_var": 1.5189036051432292, "learning_rate": 0.0001, "loss": 3.7862, "loss/crossentropy": 2.6240602493286134, "loss/hidden": 3.35625, "loss/incoh": 0.0, "loss/logits": 0.407352888584137, "loss/reg": 0.0, "step": 3130 }, { "epoch": 0.020657894736842104, "grad_norm": 3.71875, "grad_norm_var": 1.5723052978515626, "learning_rate": 0.0001, "loss": 3.7355, "loss/crossentropy": 2.290934902429581, "loss/hidden": 3.3125, "loss/incoh": 0.0, "loss/logits": 0.3808047503232956, "loss/reg": 0.0, "step": 3140 }, { "epoch": 0.020723684210526314, "grad_norm": 2.84375, "grad_norm_var": 0.5337961832682292, "learning_rate": 0.0001, "loss": 3.7778, "loss/crossentropy": 2.485193204879761, "loss/hidden": 3.515625, "loss/incoh": 0.0, "loss/logits": 0.4543539136648178, "loss/reg": 0.0, "step": 3150 }, { "epoch": 0.020789473684210528, "grad_norm": 2.8125, "grad_norm_var": 0.3465728759765625, "learning_rate": 0.0001, "loss": 3.7506, "loss/crossentropy": 2.557511067390442, "loss/hidden": 3.3953125, "loss/incoh": 0.0, "loss/logits": 0.4272035837173462, "loss/reg": 0.0, "step": 3160 }, { "epoch": 0.020855263157894738, "grad_norm": 3.0625, "grad_norm_var": 0.20754801432291667, "learning_rate": 0.0001, "loss": 3.6316, "loss/crossentropy": 2.310171937942505, "loss/hidden": 3.184375, "loss/incoh": 0.0, "loss/logits": 0.32435240745544436, "loss/reg": 0.0, "step": 3170 }, { "epoch": 0.020921052631578948, "grad_norm": 2.578125, "grad_norm_var": 0.833984375, "learning_rate": 0.0001, "loss": 3.7324, "loss/crossentropy": 2.5518528699874876, "loss/hidden": 3.3765625, "loss/incoh": 0.0, "loss/logits": 0.3816035658121109, "loss/reg": 0.0, "step": 3180 }, { "epoch": 0.020986842105263158, "grad_norm": 2.984375, "grad_norm_var": 2.3063795635792774e+17, "learning_rate": 0.0001, "loss": 3.8813, "loss/crossentropy": 2.3198139667510986, "loss/hidden": 3.38125, "loss/incoh": 0.0, "loss/logits": 0.3700142025947571, "loss/reg": 0.0, "step": 3190 }, { "epoch": 0.021052631578947368, "grad_norm": 2.78125, "grad_norm_var": 0.10243733723958333, "learning_rate": 0.0001, "loss": 3.6063, "loss/crossentropy": 2.545992207527161, "loss/hidden": 3.2875, "loss/incoh": 0.0, "loss/logits": 0.3494896024465561, "loss/reg": 0.0, "step": 3200 }, { "epoch": 0.021118421052631578, "grad_norm": 2.8125, "grad_norm_var": 0.06311442057291666, "learning_rate": 0.0001, "loss": 3.6901, "loss/crossentropy": 2.330699014663696, "loss/hidden": 3.3625, "loss/incoh": 0.0, "loss/logits": 0.34815158843994143, "loss/reg": 0.0, "step": 3210 }, { "epoch": 0.021184210526315788, "grad_norm": 3.25, "grad_norm_var": 0.3830718994140625, "learning_rate": 0.0001, "loss": 3.7327, "loss/crossentropy": 2.489699113368988, "loss/hidden": 3.325, "loss/incoh": 0.0, "loss/logits": 0.337864650785923, "loss/reg": 0.0, "step": 3220 }, { "epoch": 0.02125, "grad_norm": 2.828125, "grad_norm_var": 0.38974609375, "learning_rate": 0.0001, "loss": 3.6871, "loss/crossentropy": 2.3864477396011354, "loss/hidden": 3.3859375, "loss/incoh": 0.0, "loss/logits": 0.4103096604347229, "loss/reg": 0.0, "step": 3230 }, { "epoch": 0.02131578947368421, "grad_norm": 2.703125, "grad_norm_var": 0.05084228515625, "learning_rate": 0.0001, "loss": 3.6458, "loss/crossentropy": 2.4074989527463915, "loss/hidden": 3.375, "loss/incoh": 0.0, "loss/logits": 0.35124915838241577, "loss/reg": 0.0, "step": 3240 }, { "epoch": 0.02138157894736842, "grad_norm": 3.453125, "grad_norm_var": 0.08498942057291667, "learning_rate": 0.0001, "loss": 3.7083, "loss/crossentropy": 2.524831974506378, "loss/hidden": 3.2515625, "loss/incoh": 0.0, "loss/logits": 0.368264502286911, "loss/reg": 0.0, "step": 3250 }, { "epoch": 0.02144736842105263, "grad_norm": 2.78125, "grad_norm_var": 0.42135009765625, "learning_rate": 0.0001, "loss": 3.7602, "loss/crossentropy": 2.166905391216278, "loss/hidden": 3.371875, "loss/incoh": 0.0, "loss/logits": 0.3093524396419525, "loss/reg": 0.0, "step": 3260 }, { "epoch": 0.02151315789473684, "grad_norm": 2.546875, "grad_norm_var": 0.4861724853515625, "learning_rate": 0.0001, "loss": 3.6738, "loss/crossentropy": 2.2662750601768495, "loss/hidden": 3.1578125, "loss/incoh": 0.0, "loss/logits": 0.318782140314579, "loss/reg": 0.0, "step": 3270 }, { "epoch": 0.02157894736842105, "grad_norm": 2.875, "grad_norm_var": 0.13782145182291666, "learning_rate": 0.0001, "loss": 3.7461, "loss/crossentropy": 2.0462193369865416, "loss/hidden": 3.2859375, "loss/incoh": 0.0, "loss/logits": 0.30782590508461, "loss/reg": 0.0, "step": 3280 }, { "epoch": 0.021644736842105262, "grad_norm": 2.875, "grad_norm_var": 0.11330973307291667, "learning_rate": 0.0001, "loss": 3.7008, "loss/crossentropy": 2.5261476397514344, "loss/hidden": 3.2734375, "loss/incoh": 0.0, "loss/logits": 0.3591727793216705, "loss/reg": 0.0, "step": 3290 }, { "epoch": 0.021710526315789475, "grad_norm": 3.0, "grad_norm_var": 0.1515533447265625, "learning_rate": 0.0001, "loss": 3.751, "loss/crossentropy": 2.569035267829895, "loss/hidden": 3.3140625, "loss/incoh": 0.0, "loss/logits": 0.3920204371213913, "loss/reg": 0.0, "step": 3300 }, { "epoch": 0.021776315789473685, "grad_norm": 2.96875, "grad_norm_var": 0.09117431640625, "learning_rate": 0.0001, "loss": 3.6385, "loss/crossentropy": 2.5647154331207274, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.34364444613456724, "loss/reg": 0.0, "step": 3310 }, { "epoch": 0.021842105263157895, "grad_norm": 2.5625, "grad_norm_var": 1.6411692301432292, "learning_rate": 0.0001, "loss": 3.6335, "loss/crossentropy": 2.427185571193695, "loss/hidden": 3.30625, "loss/incoh": 0.0, "loss/logits": 0.3427447766065598, "loss/reg": 0.0, "step": 3320 }, { "epoch": 0.021907894736842105, "grad_norm": 2.390625, "grad_norm_var": 0.11515299479166667, "learning_rate": 0.0001, "loss": 3.6807, "loss/crossentropy": 2.1253209471702577, "loss/hidden": 3.428125, "loss/incoh": 0.0, "loss/logits": 0.36423676908016206, "loss/reg": 0.0, "step": 3330 }, { "epoch": 0.021973684210526315, "grad_norm": 2.78125, "grad_norm_var": 0.046956380208333336, "learning_rate": 0.0001, "loss": 3.5711, "loss/crossentropy": 2.3783676266670226, "loss/hidden": 3.1703125, "loss/incoh": 0.0, "loss/logits": 0.3118838146328926, "loss/reg": 0.0, "step": 3340 }, { "epoch": 0.022039473684210525, "grad_norm": 2.953125, "grad_norm_var": 0.060791015625, "learning_rate": 0.0001, "loss": 3.6252, "loss/crossentropy": 2.350738251209259, "loss/hidden": 3.1625, "loss/incoh": 0.0, "loss/logits": 0.2957428440451622, "loss/reg": 0.0, "step": 3350 }, { "epoch": 0.022105263157894735, "grad_norm": 2.4375, "grad_norm_var": 0.09226888020833333, "learning_rate": 0.0001, "loss": 3.7233, "loss/crossentropy": 2.5446563720703126, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.3123622477054596, "loss/reg": 0.0, "step": 3360 }, { "epoch": 0.02217105263157895, "grad_norm": 2.828125, "grad_norm_var": 0.3047108968098958, "learning_rate": 0.0001, "loss": 3.6972, "loss/crossentropy": 2.33814697265625, "loss/hidden": 3.20625, "loss/incoh": 0.0, "loss/logits": 0.30704180896282196, "loss/reg": 0.0, "step": 3370 }, { "epoch": 0.02223684210526316, "grad_norm": 3.03125, "grad_norm_var": 0.17353413899739584, "learning_rate": 0.0001, "loss": 3.6799, "loss/crossentropy": 2.2355513691902162, "loss/hidden": 3.390625, "loss/incoh": 0.0, "loss/logits": 0.3377602517604828, "loss/reg": 0.0, "step": 3380 }, { "epoch": 0.02230263157894737, "grad_norm": 2.609375, "grad_norm_var": 6.300797526041666, "learning_rate": 0.0001, "loss": 3.7636, "loss/crossentropy": 2.3003466069698333, "loss/hidden": 3.396875, "loss/incoh": 0.0, "loss/logits": 0.3391520828008652, "loss/reg": 0.0, "step": 3390 }, { "epoch": 0.02236842105263158, "grad_norm": 3.34375, "grad_norm_var": 0.0972564697265625, "learning_rate": 0.0001, "loss": 3.658, "loss/crossentropy": 2.3254055261611937, "loss/hidden": 3.246875, "loss/incoh": 0.0, "loss/logits": 0.3125807404518127, "loss/reg": 0.0, "step": 3400 }, { "epoch": 0.02243421052631579, "grad_norm": 2.59375, "grad_norm_var": 15.84599609375, "learning_rate": 0.0001, "loss": 3.7692, "loss/crossentropy": 2.753233790397644, "loss/hidden": 3.1546875, "loss/incoh": 0.0, "loss/logits": 0.3326481133699417, "loss/reg": 0.0, "step": 3410 }, { "epoch": 0.0225, "grad_norm": 4.875, "grad_norm_var": 1.10299072265625, "learning_rate": 0.0001, "loss": 3.7531, "loss/crossentropy": 2.282338631153107, "loss/hidden": 3.3328125, "loss/incoh": 0.0, "loss/logits": 0.3607694834470749, "loss/reg": 0.0, "step": 3420 }, { "epoch": 0.02256578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.626123046875, "learning_rate": 0.0001, "loss": 3.8275, "loss/crossentropy": 2.5421807527542115, "loss/hidden": 3.425, "loss/incoh": 0.0, "loss/logits": 0.5308041572570801, "loss/reg": 0.0, "step": 3430 }, { "epoch": 0.022631578947368423, "grad_norm": 2.78125, "grad_norm_var": 0.40051676432291666, "learning_rate": 0.0001, "loss": 3.7523, "loss/crossentropy": 2.541818845272064, "loss/hidden": 3.3625, "loss/incoh": 0.0, "loss/logits": 0.359403657913208, "loss/reg": 0.0, "step": 3440 }, { "epoch": 0.022697368421052633, "grad_norm": 2.90625, "grad_norm_var": 0.35347391764322916, "learning_rate": 0.0001, "loss": 3.6627, "loss/crossentropy": 2.5443089246749877, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.32425140738487246, "loss/reg": 0.0, "step": 3450 }, { "epoch": 0.022763157894736843, "grad_norm": 3.296875, "grad_norm_var": 0.09103190104166667, "learning_rate": 0.0001, "loss": 3.6541, "loss/crossentropy": 2.4523619592189787, "loss/hidden": 3.4453125, "loss/incoh": 0.0, "loss/logits": 0.3741248741745949, "loss/reg": 0.0, "step": 3460 }, { "epoch": 0.022828947368421053, "grad_norm": 5.375, "grad_norm_var": 1.3390533447265625, "learning_rate": 0.0001, "loss": 3.7552, "loss/crossentropy": 2.282164466381073, "loss/hidden": 3.3984375, "loss/incoh": 0.0, "loss/logits": 0.32355323880910875, "loss/reg": 0.0, "step": 3470 }, { "epoch": 0.022894736842105263, "grad_norm": 2.859375, "grad_norm_var": 0.5425608317057292, "learning_rate": 0.0001, "loss": 3.6438, "loss/crossentropy": 2.584494400024414, "loss/hidden": 3.325, "loss/incoh": 0.0, "loss/logits": 0.3407262712717056, "loss/reg": 0.0, "step": 3480 }, { "epoch": 0.022960526315789473, "grad_norm": 2.796875, "grad_norm_var": 11.746613566080729, "learning_rate": 0.0001, "loss": 3.8186, "loss/crossentropy": 2.781242084503174, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.516629433631897, "loss/reg": 0.0, "step": 3490 }, { "epoch": 0.023026315789473683, "grad_norm": 2.96875, "grad_norm_var": 0.38925374348958336, "learning_rate": 0.0001, "loss": 3.6824, "loss/crossentropy": 2.6085395932197573, "loss/hidden": 3.55625, "loss/incoh": 0.0, "loss/logits": 0.40641255080699923, "loss/reg": 0.0, "step": 3500 }, { "epoch": 0.023092105263157896, "grad_norm": 2.375, "grad_norm_var": 0.0653228759765625, "learning_rate": 0.0001, "loss": 3.6125, "loss/crossentropy": 2.143614149093628, "loss/hidden": 3.2953125, "loss/incoh": 0.0, "loss/logits": 0.32446493208408356, "loss/reg": 0.0, "step": 3510 }, { "epoch": 0.023157894736842106, "grad_norm": 3.78125, "grad_norm_var": 0.24351806640625, "learning_rate": 0.0001, "loss": 3.7227, "loss/crossentropy": 2.4581753849983214, "loss/hidden": 3.5296875, "loss/incoh": 0.0, "loss/logits": 0.4273629605770111, "loss/reg": 0.0, "step": 3520 }, { "epoch": 0.023223684210526317, "grad_norm": 2.546875, "grad_norm_var": 0.3322987874348958, "learning_rate": 0.0001, "loss": 3.6619, "loss/crossentropy": 2.452810299396515, "loss/hidden": 3.41875, "loss/incoh": 0.0, "loss/logits": 0.364976304769516, "loss/reg": 0.0, "step": 3530 }, { "epoch": 0.023289473684210527, "grad_norm": 2.546875, "grad_norm_var": 0.050780232747395834, "learning_rate": 0.0001, "loss": 3.5957, "loss/crossentropy": 2.6891199111938477, "loss/hidden": 3.0734375, "loss/incoh": 0.0, "loss/logits": 0.3124883592128754, "loss/reg": 0.0, "step": 3540 }, { "epoch": 0.023355263157894737, "grad_norm": 2.671875, "grad_norm_var": 0.08567301432291667, "learning_rate": 0.0001, "loss": 3.7773, "loss/crossentropy": 2.565224659442902, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.30210830420255663, "loss/reg": 0.0, "step": 3550 }, { "epoch": 0.023421052631578947, "grad_norm": 3.921875, "grad_norm_var": 0.20045166015625, "learning_rate": 0.0001, "loss": 3.6777, "loss/crossentropy": 2.3339913129806518, "loss/hidden": 3.7484375, "loss/incoh": 0.0, "loss/logits": 0.3782587692141533, "loss/reg": 0.0, "step": 3560 }, { "epoch": 0.023486842105263157, "grad_norm": 6.375, "grad_norm_var": 7.792964680989583, "learning_rate": 0.0001, "loss": 3.9088, "loss/crossentropy": 2.305986249446869, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.3260203331708908, "loss/reg": 0.0, "step": 3570 }, { "epoch": 0.023552631578947367, "grad_norm": 2.5, "grad_norm_var": 1.226398722330729, "learning_rate": 0.0001, "loss": 3.5858, "loss/crossentropy": 2.5068121433258055, "loss/hidden": 3.29375, "loss/incoh": 0.0, "loss/logits": 0.38687763810157777, "loss/reg": 0.0, "step": 3580 }, { "epoch": 0.02361842105263158, "grad_norm": 2.6875, "grad_norm_var": 0.08307291666666666, "learning_rate": 0.0001, "loss": 3.5749, "loss/crossentropy": 2.440618324279785, "loss/hidden": 3.246875, "loss/incoh": 0.0, "loss/logits": 0.3219692587852478, "loss/reg": 0.0, "step": 3590 }, { "epoch": 0.02368421052631579, "grad_norm": 2.921875, "grad_norm_var": 0.2329742431640625, "learning_rate": 0.0001, "loss": 3.7091, "loss/crossentropy": 2.3230647802352906, "loss/hidden": 3.4234375, "loss/incoh": 0.0, "loss/logits": 0.3276051238179207, "loss/reg": 0.0, "step": 3600 }, { "epoch": 0.02375, "grad_norm": 3.125, "grad_norm_var": 0.06494038899739583, "learning_rate": 0.0001, "loss": 3.6957, "loss/crossentropy": 2.549039614200592, "loss/hidden": 3.3265625, "loss/incoh": 0.0, "loss/logits": 0.4018037021160126, "loss/reg": 0.0, "step": 3610 }, { "epoch": 0.02381578947368421, "grad_norm": 2.546875, "grad_norm_var": 0.05314839680989583, "learning_rate": 0.0001, "loss": 3.6018, "loss/crossentropy": 2.5681329488754274, "loss/hidden": 3.3046875, "loss/incoh": 0.0, "loss/logits": 0.3874122858047485, "loss/reg": 0.0, "step": 3620 }, { "epoch": 0.02388157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.30606180826822915, "learning_rate": 0.0001, "loss": 3.6534, "loss/crossentropy": 2.224066364765167, "loss/hidden": 3.41875, "loss/incoh": 0.0, "loss/logits": 0.3868778973817825, "loss/reg": 0.0, "step": 3630 }, { "epoch": 0.02394736842105263, "grad_norm": 2.5, "grad_norm_var": 0.44580078125, "learning_rate": 0.0001, "loss": 3.7632, "loss/crossentropy": 2.313184142112732, "loss/hidden": 3.29375, "loss/incoh": 0.0, "loss/logits": 0.35929109454154967, "loss/reg": 0.0, "step": 3640 }, { "epoch": 0.02401315789473684, "grad_norm": 4.65625, "grad_norm_var": 0.43798421223958334, "learning_rate": 0.0001, "loss": 3.6503, "loss/crossentropy": 2.483446490764618, "loss/hidden": 3.2515625, "loss/incoh": 0.0, "loss/logits": 0.324999064207077, "loss/reg": 0.0, "step": 3650 }, { "epoch": 0.024078947368421054, "grad_norm": 2.71875, "grad_norm_var": 0.271875, "learning_rate": 0.0001, "loss": 3.7069, "loss/crossentropy": 2.2979444444179533, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.3168840616941452, "loss/reg": 0.0, "step": 3660 }, { "epoch": 0.024144736842105264, "grad_norm": 2.65625, "grad_norm_var": 0.0767242431640625, "learning_rate": 0.0001, "loss": 3.6808, "loss/crossentropy": 2.4076380014419554, "loss/hidden": 3.55625, "loss/incoh": 0.0, "loss/logits": 0.3966252237558365, "loss/reg": 0.0, "step": 3670 }, { "epoch": 0.024210526315789474, "grad_norm": 3.109375, "grad_norm_var": 0.06747639973958333, "learning_rate": 0.0001, "loss": 3.7852, "loss/crossentropy": 2.6107199430465697, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.3948290854692459, "loss/reg": 0.0, "step": 3680 }, { "epoch": 0.024276315789473684, "grad_norm": 3.765625, "grad_norm_var": 0.13434244791666666, "learning_rate": 0.0001, "loss": 3.6048, "loss/crossentropy": 2.5476237654685976, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.352567557990551, "loss/reg": 0.0, "step": 3690 }, { "epoch": 0.024342105263157894, "grad_norm": 2.34375, "grad_norm_var": 0.6126139322916667, "learning_rate": 0.0001, "loss": 3.7034, "loss/crossentropy": 2.435033369064331, "loss/hidden": 3.275, "loss/incoh": 0.0, "loss/logits": 0.37708690464496614, "loss/reg": 0.0, "step": 3700 }, { "epoch": 0.024407894736842104, "grad_norm": 3.03125, "grad_norm_var": 0.7496002197265625, "learning_rate": 0.0001, "loss": 3.6696, "loss/crossentropy": 2.550716495513916, "loss/hidden": 3.3328125, "loss/incoh": 0.0, "loss/logits": 0.38517349362373354, "loss/reg": 0.0, "step": 3710 }, { "epoch": 0.024473684210526314, "grad_norm": 2.296875, "grad_norm_var": 0.5516886393229167, "learning_rate": 0.0001, "loss": 3.5623, "loss/crossentropy": 2.3806477397680283, "loss/hidden": 3.153125, "loss/incoh": 0.0, "loss/logits": 0.3013214536011219, "loss/reg": 0.0, "step": 3720 }, { "epoch": 0.024539473684210528, "grad_norm": 3.1875, "grad_norm_var": 0.4890207926432292, "learning_rate": 0.0001, "loss": 3.6412, "loss/crossentropy": 2.3859506011009217, "loss/hidden": 3.428125, "loss/incoh": 0.0, "loss/logits": 0.49241943359375, "loss/reg": 0.0, "step": 3730 }, { "epoch": 0.024605263157894738, "grad_norm": 2.6875, "grad_norm_var": 0.2377838134765625, "learning_rate": 0.0001, "loss": 3.5895, "loss/crossentropy": 2.141975212097168, "loss/hidden": 3.2140625, "loss/incoh": 0.0, "loss/logits": 0.3277399495244026, "loss/reg": 0.0, "step": 3740 }, { "epoch": 0.024671052631578948, "grad_norm": 3.140625, "grad_norm_var": 0.12506510416666666, "learning_rate": 0.0001, "loss": 3.6233, "loss/crossentropy": 2.4696611404418944, "loss/hidden": 3.3265625, "loss/incoh": 0.0, "loss/logits": 0.3837138593196869, "loss/reg": 0.0, "step": 3750 }, { "epoch": 0.024736842105263158, "grad_norm": 2.34375, "grad_norm_var": 2.6720540364583334, "learning_rate": 0.0001, "loss": 3.7719, "loss/crossentropy": 2.494647514820099, "loss/hidden": 3.275, "loss/incoh": 0.0, "loss/logits": 0.37584047913551333, "loss/reg": 0.0, "step": 3760 }, { "epoch": 0.024802631578947368, "grad_norm": 2.53125, "grad_norm_var": 0.63804931640625, "learning_rate": 0.0001, "loss": 3.676, "loss/crossentropy": 2.5242549180984497, "loss/hidden": 3.2328125, "loss/incoh": 0.0, "loss/logits": 0.39244888722896576, "loss/reg": 0.0, "step": 3770 }, { "epoch": 0.024868421052631578, "grad_norm": 3.03125, "grad_norm_var": 0.3232167561848958, "learning_rate": 0.0001, "loss": 3.5206, "loss/crossentropy": 2.2167584180831907, "loss/hidden": 3.109375, "loss/incoh": 0.0, "loss/logits": 0.31152922809123995, "loss/reg": 0.0, "step": 3780 }, { "epoch": 0.024934210526315788, "grad_norm": 2.640625, "grad_norm_var": 0.8850331624348958, "learning_rate": 0.0001, "loss": 3.6831, "loss/crossentropy": 2.406609225273132, "loss/hidden": 3.2734375, "loss/incoh": 0.0, "loss/logits": 0.3529895097017288, "loss/reg": 0.0, "step": 3790 }, { "epoch": 0.025, "grad_norm": 2.390625, "grad_norm_var": 0.8739491780598958, "learning_rate": 0.0001, "loss": 3.6024, "loss/crossentropy": 2.270749258995056, "loss/hidden": 3.3390625, "loss/incoh": 0.0, "loss/logits": 0.32701381742954255, "loss/reg": 0.0, "step": 3800 }, { "epoch": 0.02506578947368421, "grad_norm": 2.625, "grad_norm_var": 0.06500651041666666, "learning_rate": 0.0001, "loss": 3.6037, "loss/crossentropy": 2.3936703205108643, "loss/hidden": 3.3078125, "loss/incoh": 0.0, "loss/logits": 0.4090299874544144, "loss/reg": 0.0, "step": 3810 }, { "epoch": 0.02513157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.3386301676432292, "learning_rate": 0.0001, "loss": 3.8203, "loss/crossentropy": 2.057803177833557, "loss/hidden": 3.390625, "loss/incoh": 0.0, "loss/logits": 0.3018879994750023, "loss/reg": 0.0, "step": 3820 }, { "epoch": 0.02519736842105263, "grad_norm": 10.375, "grad_norm_var": 3.7443756103515624, "learning_rate": 0.0001, "loss": 3.5987, "loss/crossentropy": 2.558328187465668, "loss/hidden": 3.3671875, "loss/incoh": 0.0, "loss/logits": 0.40375421941280365, "loss/reg": 0.0, "step": 3830 }, { "epoch": 0.02526315789473684, "grad_norm": 2.59375, "grad_norm_var": 4.824331665039063, "learning_rate": 0.0001, "loss": 3.6109, "loss/crossentropy": 2.4123119592666624, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3224829614162445, "loss/reg": 0.0, "step": 3840 }, { "epoch": 0.02532894736842105, "grad_norm": 2.921875, "grad_norm_var": 1.4220611572265625, "learning_rate": 0.0001, "loss": 3.6503, "loss/crossentropy": 2.4460156679153444, "loss/hidden": 3.3140625, "loss/incoh": 0.0, "loss/logits": 0.37439659237861633, "loss/reg": 0.0, "step": 3850 }, { "epoch": 0.02539473684210526, "grad_norm": 2.828125, "grad_norm_var": 0.24488525390625, "learning_rate": 0.0001, "loss": 3.6799, "loss/crossentropy": 2.471416544914246, "loss/hidden": 3.265625, "loss/incoh": 0.0, "loss/logits": 0.3233081191778183, "loss/reg": 0.0, "step": 3860 }, { "epoch": 0.025460526315789475, "grad_norm": 4.6875, "grad_norm_var": 2.005939737955729, "learning_rate": 0.0001, "loss": 3.7099, "loss/crossentropy": 2.46109699010849, "loss/hidden": 3.178125, "loss/incoh": 0.0, "loss/logits": 0.31833461821079256, "loss/reg": 0.0, "step": 3870 }, { "epoch": 0.025526315789473685, "grad_norm": 2.328125, "grad_norm_var": 0.4576568603515625, "learning_rate": 0.0001, "loss": 3.62, "loss/crossentropy": 2.4385437607765197, "loss/hidden": 3.1953125, "loss/incoh": 0.0, "loss/logits": 0.3444881528615952, "loss/reg": 0.0, "step": 3880 }, { "epoch": 0.025592105263157895, "grad_norm": 2.625, "grad_norm_var": 0.07125244140625, "learning_rate": 0.0001, "loss": 3.5814, "loss/crossentropy": 2.464188981056213, "loss/hidden": 3.2640625, "loss/incoh": 0.0, "loss/logits": 0.4162255361676216, "loss/reg": 0.0, "step": 3890 }, { "epoch": 0.025657894736842105, "grad_norm": 3.203125, "grad_norm_var": 0.13810221354166666, "learning_rate": 0.0001, "loss": 3.5963, "loss/crossentropy": 2.5776121497154234, "loss/hidden": 3.1296875, "loss/incoh": 0.0, "loss/logits": 0.351967790722847, "loss/reg": 0.0, "step": 3900 }, { "epoch": 0.025723684210526315, "grad_norm": 2.203125, "grad_norm_var": 0.13177083333333334, "learning_rate": 0.0001, "loss": 3.6137, "loss/crossentropy": 2.4709773540496824, "loss/hidden": 3.28125, "loss/incoh": 0.0, "loss/logits": 0.3811213612556458, "loss/reg": 0.0, "step": 3910 }, { "epoch": 0.025789473684210525, "grad_norm": 2.671875, "grad_norm_var": 0.047591145833333334, "learning_rate": 0.0001, "loss": 3.5243, "loss/crossentropy": 2.3289324045181274, "loss/hidden": 3.240625, "loss/incoh": 0.0, "loss/logits": 0.32472735941410064, "loss/reg": 0.0, "step": 3920 }, { "epoch": 0.025855263157894735, "grad_norm": 3.359375, "grad_norm_var": 0.07144266764322917, "learning_rate": 0.0001, "loss": 3.5618, "loss/crossentropy": 2.151239442825317, "loss/hidden": 3.15, "loss/incoh": 0.0, "loss/logits": 0.3790448889136314, "loss/reg": 0.0, "step": 3930 }, { "epoch": 0.02592105263157895, "grad_norm": 2.78125, "grad_norm_var": 0.19239908854166668, "learning_rate": 0.0001, "loss": 3.5689, "loss/crossentropy": 2.281427323818207, "loss/hidden": 3.321875, "loss/incoh": 0.0, "loss/logits": 0.32773717790842055, "loss/reg": 0.0, "step": 3940 }, { "epoch": 0.02598684210526316, "grad_norm": 2.765625, "grad_norm_var": 0.07558186848958333, "learning_rate": 0.0001, "loss": 3.5047, "loss/crossentropy": 2.2366716623306275, "loss/hidden": 3.36875, "loss/incoh": 0.0, "loss/logits": 0.3462549954652786, "loss/reg": 0.0, "step": 3950 }, { "epoch": 0.02605263157894737, "grad_norm": 2.46875, "grad_norm_var": 0.0950347900390625, "learning_rate": 0.0001, "loss": 3.6604, "loss/crossentropy": 2.6565569043159485, "loss/hidden": 3.3375, "loss/incoh": 0.0, "loss/logits": 0.38759642243385317, "loss/reg": 0.0, "step": 3960 }, { "epoch": 0.02611842105263158, "grad_norm": 2.328125, "grad_norm_var": 0.04934895833333333, "learning_rate": 0.0001, "loss": 3.4984, "loss/crossentropy": 2.3093223094940187, "loss/hidden": 3.2484375, "loss/incoh": 0.0, "loss/logits": 0.3547346442937851, "loss/reg": 0.0, "step": 3970 }, { "epoch": 0.02618421052631579, "grad_norm": 2.640625, "grad_norm_var": 0.6375284830729167, "learning_rate": 0.0001, "loss": 3.6552, "loss/crossentropy": 2.5669564962387086, "loss/hidden": 3.640625, "loss/incoh": 0.0, "loss/logits": 0.3741306886076927, "loss/reg": 0.0, "step": 3980 }, { "epoch": 0.02625, "grad_norm": 3.328125, "grad_norm_var": 0.1037994384765625, "learning_rate": 0.0001, "loss": 3.707, "loss/crossentropy": 2.255212366580963, "loss/hidden": 3.4078125, "loss/incoh": 0.0, "loss/logits": 0.36638626754283904, "loss/reg": 0.0, "step": 3990 }, { "epoch": 0.02631578947368421, "grad_norm": 3.484375, "grad_norm_var": 0.1186431884765625, "learning_rate": 0.0001, "loss": 3.5907, "loss/crossentropy": 2.5717132806777956, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.3674279361963272, "loss/reg": 0.0, "step": 4000 }, { "epoch": 0.026381578947368423, "grad_norm": 3.09375, "grad_norm_var": 0.09750874837239583, "learning_rate": 0.0001, "loss": 3.5937, "loss/crossentropy": 2.494677722454071, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.34122500121593474, "loss/reg": 0.0, "step": 4010 }, { "epoch": 0.026447368421052633, "grad_norm": 2.9375, "grad_norm_var": 0.07803446451822917, "learning_rate": 0.0001, "loss": 3.4996, "loss/crossentropy": 2.4037094593048094, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.34369616210460663, "loss/reg": 0.0, "step": 4020 }, { "epoch": 0.026513157894736843, "grad_norm": 2.875, "grad_norm_var": 0.031087239583333332, "learning_rate": 0.0001, "loss": 3.5883, "loss/crossentropy": 2.5161670804023744, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.3320692449808121, "loss/reg": 0.0, "step": 4030 }, { "epoch": 0.026578947368421053, "grad_norm": 2.46875, "grad_norm_var": 0.27988993326822914, "learning_rate": 0.0001, "loss": 3.5987, "loss/crossentropy": 2.489400029182434, "loss/hidden": 3.2828125, "loss/incoh": 0.0, "loss/logits": 0.37186973094940184, "loss/reg": 0.0, "step": 4040 }, { "epoch": 0.026644736842105263, "grad_norm": 2.453125, "grad_norm_var": 0.2902740478515625, "learning_rate": 0.0001, "loss": 3.6423, "loss/crossentropy": 2.1810465335845945, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.30852093994617463, "loss/reg": 0.0, "step": 4050 }, { "epoch": 0.026710526315789473, "grad_norm": 3.203125, "grad_norm_var": 0.2831776936848958, "learning_rate": 0.0001, "loss": 3.5696, "loss/crossentropy": 2.5600404262542726, "loss/hidden": 3.2265625, "loss/incoh": 0.0, "loss/logits": 0.38900414407253264, "loss/reg": 0.0, "step": 4060 }, { "epoch": 0.026776315789473683, "grad_norm": 4.125, "grad_norm_var": 0.25816141764322914, "learning_rate": 0.0001, "loss": 3.6429, "loss/crossentropy": 2.4915748476982116, "loss/hidden": 3.290625, "loss/incoh": 0.0, "loss/logits": 0.35222682952880857, "loss/reg": 0.0, "step": 4070 }, { "epoch": 0.026842105263157896, "grad_norm": 3.875, "grad_norm_var": 2.97427978515625, "learning_rate": 0.0001, "loss": 3.6704, "loss/crossentropy": 2.131973624229431, "loss/hidden": 3.36875, "loss/incoh": 0.0, "loss/logits": 0.32549644112586973, "loss/reg": 0.0, "step": 4080 }, { "epoch": 0.026907894736842106, "grad_norm": 2.484375, "grad_norm_var": 0.25222981770833336, "learning_rate": 0.0001, "loss": 3.591, "loss/crossentropy": 2.196081441640854, "loss/hidden": 3.240625, "loss/incoh": 0.0, "loss/logits": 0.35593045353889463, "loss/reg": 0.0, "step": 4090 }, { "epoch": 0.026973684210526316, "grad_norm": 3.03125, "grad_norm_var": 0.20009765625, "learning_rate": 0.0001, "loss": 3.5044, "loss/crossentropy": 2.3047094464302065, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.2999614104628563, "loss/reg": 0.0, "step": 4100 }, { "epoch": 0.027039473684210526, "grad_norm": 2.53125, "grad_norm_var": 0.5296051025390625, "learning_rate": 0.0001, "loss": 3.6015, "loss/crossentropy": 2.4926783800125123, "loss/hidden": 3.2578125, "loss/incoh": 0.0, "loss/logits": 0.342082779109478, "loss/reg": 0.0, "step": 4110 }, { "epoch": 0.027105263157894736, "grad_norm": 2.609375, "grad_norm_var": 0.05623270670572917, "learning_rate": 0.0001, "loss": 3.5623, "loss/crossentropy": 2.6063008666038514, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.3086448922753334, "loss/reg": 0.0, "step": 4120 }, { "epoch": 0.027171052631578946, "grad_norm": 2.671875, "grad_norm_var": 0.103515625, "learning_rate": 0.0001, "loss": 3.5628, "loss/crossentropy": 2.516204798221588, "loss/hidden": 3.2609375, "loss/incoh": 0.0, "loss/logits": 0.36054509580135347, "loss/reg": 0.0, "step": 4130 }, { "epoch": 0.027236842105263157, "grad_norm": 2.421875, "grad_norm_var": 0.09970601399739583, "learning_rate": 0.0001, "loss": 3.5328, "loss/crossentropy": 2.5081961393356322, "loss/hidden": 3.2828125, "loss/incoh": 0.0, "loss/logits": 0.3743078649044037, "loss/reg": 0.0, "step": 4140 }, { "epoch": 0.02730263157894737, "grad_norm": 2.4375, "grad_norm_var": 0.0626373291015625, "learning_rate": 0.0001, "loss": 3.5896, "loss/crossentropy": 2.386087703704834, "loss/hidden": 3.225, "loss/incoh": 0.0, "loss/logits": 0.33356338143348696, "loss/reg": 0.0, "step": 4150 }, { "epoch": 0.02736842105263158, "grad_norm": 3.03125, "grad_norm_var": 0.12830301920572917, "learning_rate": 0.0001, "loss": 3.6293, "loss/crossentropy": 2.2993146777153015, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.28893803358078, "loss/reg": 0.0, "step": 4160 }, { "epoch": 0.02743421052631579, "grad_norm": 2.4375, "grad_norm_var": 0.29797770182291666, "learning_rate": 0.0001, "loss": 3.5201, "loss/crossentropy": 2.4818823099136353, "loss/hidden": 3.309375, "loss/incoh": 0.0, "loss/logits": 0.3620707929134369, "loss/reg": 0.0, "step": 4170 }, { "epoch": 0.0275, "grad_norm": 2.28125, "grad_norm_var": 0.2361328125, "learning_rate": 0.0001, "loss": 3.533, "loss/crossentropy": 2.4130859971046448, "loss/hidden": 3.10625, "loss/incoh": 0.0, "loss/logits": 0.3289807617664337, "loss/reg": 0.0, "step": 4180 }, { "epoch": 0.02756578947368421, "grad_norm": 2.765625, "grad_norm_var": 0.06122945149739583, "learning_rate": 0.0001, "loss": 3.4961, "loss/crossentropy": 2.3559444665908815, "loss/hidden": 3.1140625, "loss/incoh": 0.0, "loss/logits": 0.3270682215690613, "loss/reg": 0.0, "step": 4190 }, { "epoch": 0.02763157894736842, "grad_norm": 2.609375, "grad_norm_var": 0.3246378580729167, "learning_rate": 0.0001, "loss": 3.7095, "loss/crossentropy": 2.370071732997894, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.36643182039260863, "loss/reg": 0.0, "step": 4200 }, { "epoch": 0.02769736842105263, "grad_norm": 8.375, "grad_norm_var": 2.272069295247396, "learning_rate": 0.0001, "loss": 3.5404, "loss/crossentropy": 2.4906920313835146, "loss/hidden": 3.2359375, "loss/incoh": 0.0, "loss/logits": 0.3274578660726547, "loss/reg": 0.0, "step": 4210 }, { "epoch": 0.027763157894736844, "grad_norm": 2.40625, "grad_norm_var": 2.19049072265625, "learning_rate": 0.0001, "loss": 3.6801, "loss/crossentropy": 2.66324725151062, "loss/hidden": 3.5046875, "loss/incoh": 0.0, "loss/logits": 0.38620950281620026, "loss/reg": 0.0, "step": 4220 }, { "epoch": 0.027828947368421054, "grad_norm": 2.796875, "grad_norm_var": 0.1225494384765625, "learning_rate": 0.0001, "loss": 3.6293, "loss/crossentropy": 2.3201419711112976, "loss/hidden": 3.284375, "loss/incoh": 0.0, "loss/logits": 0.33118238747119905, "loss/reg": 0.0, "step": 4230 }, { "epoch": 0.027894736842105264, "grad_norm": 2.765625, "grad_norm_var": 0.062108357747395836, "learning_rate": 0.0001, "loss": 3.5523, "loss/crossentropy": 2.2000674962997437, "loss/hidden": 3.1953125, "loss/incoh": 0.0, "loss/logits": 0.33759562224149703, "loss/reg": 0.0, "step": 4240 }, { "epoch": 0.027960526315789474, "grad_norm": 2.890625, "grad_norm_var": 0.10436197916666666, "learning_rate": 0.0001, "loss": 3.6554, "loss/crossentropy": 2.4001947045326233, "loss/hidden": 3.3578125, "loss/incoh": 0.0, "loss/logits": 0.34554801881313324, "loss/reg": 0.0, "step": 4250 }, { "epoch": 0.028026315789473684, "grad_norm": 2.90625, "grad_norm_var": 0.09031473795572917, "learning_rate": 0.0001, "loss": 3.5922, "loss/crossentropy": 2.5996686697006224, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.34134136140346527, "loss/reg": 0.0, "step": 4260 }, { "epoch": 0.028092105263157894, "grad_norm": 2.59375, "grad_norm_var": 0.07454020182291667, "learning_rate": 0.0001, "loss": 3.5895, "loss/crossentropy": 2.1346129894256594, "loss/hidden": 3.1671875, "loss/incoh": 0.0, "loss/logits": 0.2764407262206078, "loss/reg": 0.0, "step": 4270 }, { "epoch": 0.028157894736842104, "grad_norm": 2.671875, "grad_norm_var": 0.07390950520833334, "learning_rate": 0.0001, "loss": 3.6094, "loss/crossentropy": 2.539172089099884, "loss/hidden": 3.3578125, "loss/incoh": 0.0, "loss/logits": 0.3945852980017662, "loss/reg": 0.0, "step": 4280 }, { "epoch": 0.028223684210526317, "grad_norm": 2.796875, "grad_norm_var": 0.14036051432291666, "learning_rate": 0.0001, "loss": 3.6505, "loss/crossentropy": 2.4181793212890623, "loss/hidden": 3.484375, "loss/incoh": 0.0, "loss/logits": 0.45307959616184235, "loss/reg": 0.0, "step": 4290 }, { "epoch": 0.028289473684210528, "grad_norm": 2.40625, "grad_norm_var": 0.14678446451822916, "learning_rate": 0.0001, "loss": 3.5539, "loss/crossentropy": 2.3630972266197205, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.28445124477148054, "loss/reg": 0.0, "step": 4300 }, { "epoch": 0.028355263157894738, "grad_norm": 2.6875, "grad_norm_var": 0.3001627604166667, "learning_rate": 0.0001, "loss": 3.5098, "loss/crossentropy": 2.4898954033851624, "loss/hidden": 3.109375, "loss/incoh": 0.0, "loss/logits": 0.320017996430397, "loss/reg": 0.0, "step": 4310 }, { "epoch": 0.028421052631578948, "grad_norm": 2.4375, "grad_norm_var": 0.13015034993489583, "learning_rate": 0.0001, "loss": 3.5755, "loss/crossentropy": 2.2955414295196532, "loss/hidden": 3.434375, "loss/incoh": 0.0, "loss/logits": 0.41727418303489683, "loss/reg": 0.0, "step": 4320 }, { "epoch": 0.028486842105263158, "grad_norm": 3.015625, "grad_norm_var": 0.29983723958333336, "learning_rate": 0.0001, "loss": 3.7538, "loss/crossentropy": 2.4246325135231017, "loss/hidden": 3.3546875, "loss/incoh": 0.0, "loss/logits": 0.34737818390131, "loss/reg": 0.0, "step": 4330 }, { "epoch": 0.028552631578947368, "grad_norm": 5.3125, "grad_norm_var": 0.5324544270833333, "learning_rate": 0.0001, "loss": 3.6927, "loss/crossentropy": 2.393894040584564, "loss/hidden": 3.321875, "loss/incoh": 0.0, "loss/logits": 0.37734392285346985, "loss/reg": 0.0, "step": 4340 }, { "epoch": 0.028618421052631578, "grad_norm": 2.78125, "grad_norm_var": 0.5618316650390625, "learning_rate": 0.0001, "loss": 3.6563, "loss/crossentropy": 2.5759302139282227, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.32841147780418395, "loss/reg": 0.0, "step": 4350 }, { "epoch": 0.028684210526315788, "grad_norm": 2.875, "grad_norm_var": 0.06923726399739584, "learning_rate": 0.0001, "loss": 3.5818, "loss/crossentropy": 2.664570915699005, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.31884663701057436, "loss/reg": 0.0, "step": 4360 }, { "epoch": 0.02875, "grad_norm": 3.203125, "grad_norm_var": 0.219873046875, "learning_rate": 0.0001, "loss": 3.6444, "loss/crossentropy": 2.3797228574752807, "loss/hidden": 3.5796875, "loss/incoh": 0.0, "loss/logits": 0.4052841871976852, "loss/reg": 0.0, "step": 4370 }, { "epoch": 0.02881578947368421, "grad_norm": 3.0, "grad_norm_var": 0.5880849202473958, "learning_rate": 0.0001, "loss": 3.5881, "loss/crossentropy": 2.5849244236946105, "loss/hidden": 3.5046875, "loss/incoh": 0.0, "loss/logits": 0.529149529337883, "loss/reg": 0.0, "step": 4380 }, { "epoch": 0.02888157894736842, "grad_norm": 2.421875, "grad_norm_var": 0.31160380045572916, "learning_rate": 0.0001, "loss": 3.4738, "loss/crossentropy": 2.5852147936820984, "loss/hidden": 3.1875, "loss/incoh": 0.0, "loss/logits": 0.3336706295609474, "loss/reg": 0.0, "step": 4390 }, { "epoch": 0.02894736842105263, "grad_norm": 2.703125, "grad_norm_var": 0.05146077473958333, "learning_rate": 0.0001, "loss": 3.6199, "loss/crossentropy": 2.412027895450592, "loss/hidden": 3.375, "loss/incoh": 0.0, "loss/logits": 0.4120332598686218, "loss/reg": 0.0, "step": 4400 }, { "epoch": 0.02901315789473684, "grad_norm": 2.59375, "grad_norm_var": 0.138232421875, "learning_rate": 0.0001, "loss": 3.4929, "loss/crossentropy": 2.270553803443909, "loss/hidden": 3.2546875, "loss/incoh": 0.0, "loss/logits": 0.34523763358592985, "loss/reg": 0.0, "step": 4410 }, { "epoch": 0.02907894736842105, "grad_norm": 2.359375, "grad_norm_var": 0.1102691650390625, "learning_rate": 0.0001, "loss": 3.5778, "loss/crossentropy": 2.361116898059845, "loss/hidden": 3.246875, "loss/incoh": 0.0, "loss/logits": 0.34047031700611113, "loss/reg": 0.0, "step": 4420 }, { "epoch": 0.02914473684210526, "grad_norm": 6.375, "grad_norm_var": 1.0507120768229166, "learning_rate": 0.0001, "loss": 3.6517, "loss/crossentropy": 2.547470712661743, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.298332154750824, "loss/reg": 0.0, "step": 4430 }, { "epoch": 0.029210526315789475, "grad_norm": 3.796875, "grad_norm_var": 1.0804972330729166, "learning_rate": 0.0001, "loss": 3.7389, "loss/crossentropy": 2.7002538442611694, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3686490625143051, "loss/reg": 0.0, "step": 4440 }, { "epoch": 0.029276315789473685, "grad_norm": 2.3125, "grad_norm_var": 0.432177734375, "learning_rate": 0.0001, "loss": 3.5541, "loss/crossentropy": 2.421727478504181, "loss/hidden": 3.18125, "loss/incoh": 0.0, "loss/logits": 0.30692713260650634, "loss/reg": 0.0, "step": 4450 }, { "epoch": 0.029342105263157895, "grad_norm": 2.59375, "grad_norm_var": 0.09047749837239584, "learning_rate": 0.0001, "loss": 3.6132, "loss/crossentropy": 2.659491038322449, "loss/hidden": 3.2140625, "loss/incoh": 0.0, "loss/logits": 0.3632184773683548, "loss/reg": 0.0, "step": 4460 }, { "epoch": 0.029407894736842105, "grad_norm": 2.453125, "grad_norm_var": 0.06815999348958333, "learning_rate": 0.0001, "loss": 3.4561, "loss/crossentropy": 2.5192266911268235, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.2842270269989967, "loss/reg": 0.0, "step": 4470 }, { "epoch": 0.029473684210526315, "grad_norm": 3.109375, "grad_norm_var": 0.1687408447265625, "learning_rate": 0.0001, "loss": 3.5582, "loss/crossentropy": 2.2021409273147583, "loss/hidden": 3.1484375, "loss/incoh": 0.0, "loss/logits": 0.31400761008262634, "loss/reg": 0.0, "step": 4480 }, { "epoch": 0.029539473684210525, "grad_norm": 2.59375, "grad_norm_var": 0.5006174723307292, "learning_rate": 0.0001, "loss": 3.5183, "loss/crossentropy": 2.4442033648490904, "loss/hidden": 3.2640625, "loss/incoh": 0.0, "loss/logits": 0.3307736128568649, "loss/reg": 0.0, "step": 4490 }, { "epoch": 0.029605263157894735, "grad_norm": 2.734375, "grad_norm_var": 0.40276692708333334, "learning_rate": 0.0001, "loss": 3.4888, "loss/crossentropy": 2.373897171020508, "loss/hidden": 3.2234375, "loss/incoh": 0.0, "loss/logits": 0.31207202970981596, "loss/reg": 0.0, "step": 4500 }, { "epoch": 0.02967105263157895, "grad_norm": 2.78125, "grad_norm_var": 0.20685933430989584, "learning_rate": 0.0001, "loss": 3.5971, "loss/crossentropy": 2.325096046924591, "loss/hidden": 3.40625, "loss/incoh": 0.0, "loss/logits": 0.4359890788793564, "loss/reg": 0.0, "step": 4510 }, { "epoch": 0.02973684210526316, "grad_norm": 3.078125, "grad_norm_var": 0.2956451416015625, "learning_rate": 0.0001, "loss": 3.5023, "loss/crossentropy": 2.1537609457969666, "loss/hidden": 3.253125, "loss/incoh": 0.0, "loss/logits": 0.3213866874575615, "loss/reg": 0.0, "step": 4520 }, { "epoch": 0.02980263157894737, "grad_norm": 2.3125, "grad_norm_var": 0.09353841145833333, "learning_rate": 0.0001, "loss": 3.4494, "loss/crossentropy": 2.4694852471351623, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.3156100481748581, "loss/reg": 0.0, "step": 4530 }, { "epoch": 0.02986842105263158, "grad_norm": 2.78125, "grad_norm_var": 0.17408447265625, "learning_rate": 0.0001, "loss": 3.6124, "loss/crossentropy": 2.438071775436401, "loss/hidden": 3.3203125, "loss/incoh": 0.0, "loss/logits": 0.40948416888713834, "loss/reg": 0.0, "step": 4540 }, { "epoch": 0.02993421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.6079010009765625, "learning_rate": 0.0001, "loss": 3.6751, "loss/crossentropy": 2.525905132293701, "loss/hidden": 3.3546875, "loss/incoh": 0.0, "loss/logits": 0.4153590425848961, "loss/reg": 0.0, "step": 4550 }, { "epoch": 0.03, "grad_norm": 3.203125, "grad_norm_var": 0.6879191080729167, "learning_rate": 0.0001, "loss": 3.5335, "loss/crossentropy": 2.421697771549225, "loss/hidden": 3.025, "loss/incoh": 0.0, "loss/logits": 0.3177122876048088, "loss/reg": 0.0, "step": 4560 }, { "epoch": 0.03006578947368421, "grad_norm": 2.328125, "grad_norm_var": 0.5993123372395833, "learning_rate": 0.0001, "loss": 3.5742, "loss/crossentropy": 2.3068729996681214, "loss/hidden": 3.05, "loss/incoh": 0.0, "loss/logits": 0.29430699050426484, "loss/reg": 0.0, "step": 4570 }, { "epoch": 0.030131578947368422, "grad_norm": 4.21875, "grad_norm_var": 0.5433339436848958, "learning_rate": 0.0001, "loss": 3.6381, "loss/crossentropy": 2.3981791496276856, "loss/hidden": 3.2359375, "loss/incoh": 0.0, "loss/logits": 0.3355312556028366, "loss/reg": 0.0, "step": 4580 }, { "epoch": 0.030197368421052632, "grad_norm": 3.25, "grad_norm_var": 0.9806630452473958, "learning_rate": 0.0001, "loss": 3.6522, "loss/crossentropy": 2.309436595439911, "loss/hidden": 3.4890625, "loss/incoh": 0.0, "loss/logits": 0.350050950050354, "loss/reg": 0.0, "step": 4590 }, { "epoch": 0.030263157894736843, "grad_norm": 2.75, "grad_norm_var": 0.9801910400390625, "learning_rate": 0.0001, "loss": 3.5648, "loss/crossentropy": 2.561086916923523, "loss/hidden": 3.1203125, "loss/incoh": 0.0, "loss/logits": 0.3277510732412338, "loss/reg": 0.0, "step": 4600 }, { "epoch": 0.030328947368421053, "grad_norm": 3.015625, "grad_norm_var": 0.20364481608072918, "learning_rate": 0.0001, "loss": 3.5518, "loss/crossentropy": 2.6774720311164857, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.31699982583522796, "loss/reg": 0.0, "step": 4610 }, { "epoch": 0.030394736842105263, "grad_norm": 2.765625, "grad_norm_var": 0.9744425455729167, "learning_rate": 0.0001, "loss": 3.5924, "loss/crossentropy": 2.623241698741913, "loss/hidden": 3.228125, "loss/incoh": 0.0, "loss/logits": 0.38138356506824495, "loss/reg": 0.0, "step": 4620 }, { "epoch": 0.030460526315789473, "grad_norm": 2.5625, "grad_norm_var": 0.9736073811848959, "learning_rate": 0.0001, "loss": 3.4682, "loss/crossentropy": 2.4855759739875793, "loss/hidden": 3.2671875, "loss/incoh": 0.0, "loss/logits": 0.3377710849046707, "loss/reg": 0.0, "step": 4630 }, { "epoch": 0.030526315789473683, "grad_norm": 2.78125, "grad_norm_var": 0.6709706624348958, "learning_rate": 0.0001, "loss": 3.4714, "loss/crossentropy": 2.476264202594757, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.3642516300082207, "loss/reg": 0.0, "step": 4640 }, { "epoch": 0.030592105263157896, "grad_norm": 3.0, "grad_norm_var": 0.6482899983723959, "learning_rate": 0.0001, "loss": 3.4865, "loss/crossentropy": 2.3190789937973024, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.32634713053703307, "loss/reg": 0.0, "step": 4650 }, { "epoch": 0.030657894736842106, "grad_norm": 2.921875, "grad_norm_var": 0.12795817057291667, "learning_rate": 0.0001, "loss": 3.5003, "loss/crossentropy": 2.4210057139396666, "loss/hidden": 3.078125, "loss/incoh": 0.0, "loss/logits": 0.2987945884466171, "loss/reg": 0.0, "step": 4660 }, { "epoch": 0.030723684210526316, "grad_norm": 3.0625, "grad_norm_var": 0.08050130208333334, "learning_rate": 0.0001, "loss": 3.4858, "loss/crossentropy": 2.041215348243713, "loss/hidden": 3.1953125, "loss/incoh": 0.0, "loss/logits": 0.2908504828810692, "loss/reg": 0.0, "step": 4670 }, { "epoch": 0.030789473684210526, "grad_norm": 3.0625, "grad_norm_var": 0.4143300374348958, "learning_rate": 0.0001, "loss": 3.6937, "loss/crossentropy": 2.4882567286491395, "loss/hidden": 3.3125, "loss/incoh": 0.0, "loss/logits": 0.3625297635793686, "loss/reg": 0.0, "step": 4680 }, { "epoch": 0.030855263157894736, "grad_norm": 3.328125, "grad_norm_var": 0.15123291015625, "learning_rate": 0.0001, "loss": 3.5943, "loss/crossentropy": 2.3254539489746096, "loss/hidden": 3.2, "loss/incoh": 0.0, "loss/logits": 0.308133128285408, "loss/reg": 0.0, "step": 4690 }, { "epoch": 0.030921052631578946, "grad_norm": 2.46875, "grad_norm_var": 0.19954325358072916, "learning_rate": 0.0001, "loss": 3.5843, "loss/crossentropy": 1.918275660276413, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.36902148872613905, "loss/reg": 0.0, "step": 4700 }, { "epoch": 0.030986842105263156, "grad_norm": 2.84375, "grad_norm_var": 0.04302978515625, "learning_rate": 0.0001, "loss": 3.5845, "loss/crossentropy": 2.573339414596558, "loss/hidden": 3.1625, "loss/incoh": 0.0, "loss/logits": 0.36042743623256684, "loss/reg": 0.0, "step": 4710 }, { "epoch": 0.03105263157894737, "grad_norm": 2.359375, "grad_norm_var": 0.0475982666015625, "learning_rate": 0.0001, "loss": 3.5453, "loss/crossentropy": 2.313191366195679, "loss/hidden": 3.2, "loss/incoh": 0.0, "loss/logits": 0.3170273721218109, "loss/reg": 0.0, "step": 4720 }, { "epoch": 0.03111842105263158, "grad_norm": 2.4375, "grad_norm_var": 0.0694244384765625, "learning_rate": 0.0001, "loss": 3.4641, "loss/crossentropy": 2.5151350021362306, "loss/hidden": 3.071875, "loss/incoh": 0.0, "loss/logits": 0.3049825429916382, "loss/reg": 0.0, "step": 4730 }, { "epoch": 0.03118421052631579, "grad_norm": 4.25, "grad_norm_var": 1.9115549723307292, "learning_rate": 0.0001, "loss": 3.6088, "loss/crossentropy": 2.278876805305481, "loss/hidden": 3.3796875, "loss/incoh": 0.0, "loss/logits": 0.38837724179029465, "loss/reg": 0.0, "step": 4740 }, { "epoch": 0.03125, "grad_norm": 2.234375, "grad_norm_var": 2.200390625, "learning_rate": 0.0001, "loss": 3.5348, "loss/crossentropy": 2.2090991735458374, "loss/hidden": 3.44375, "loss/incoh": 0.0, "loss/logits": 0.3878710061311722, "loss/reg": 0.0, "step": 4750 }, { "epoch": 0.031315789473684214, "grad_norm": 2.25, "grad_norm_var": 0.6573404947916667, "learning_rate": 0.0001, "loss": 3.5378, "loss/crossentropy": 2.2805041670799255, "loss/hidden": 3.3015625, "loss/incoh": 0.0, "loss/logits": 0.31536445766687393, "loss/reg": 0.0, "step": 4760 }, { "epoch": 0.03138157894736842, "grad_norm": 2.65625, "grad_norm_var": 0.38818257649739585, "learning_rate": 0.0001, "loss": 3.6082, "loss/crossentropy": 2.6178433656692506, "loss/hidden": 3.2828125, "loss/incoh": 0.0, "loss/logits": 0.3645938545465469, "loss/reg": 0.0, "step": 4770 }, { "epoch": 0.031447368421052634, "grad_norm": 2.671875, "grad_norm_var": 0.07810770670572917, "learning_rate": 0.0001, "loss": 3.4424, "loss/crossentropy": 2.5498223304748535, "loss/hidden": 3.2625, "loss/incoh": 0.0, "loss/logits": 0.3333883464336395, "loss/reg": 0.0, "step": 4780 }, { "epoch": 0.03151315789473684, "grad_norm": 2.328125, "grad_norm_var": 0.11669820149739583, "learning_rate": 0.0001, "loss": 3.6145, "loss/crossentropy": 2.639970850944519, "loss/hidden": 3.278125, "loss/incoh": 0.0, "loss/logits": 0.37047617733478544, "loss/reg": 0.0, "step": 4790 }, { "epoch": 0.031578947368421054, "grad_norm": 2.4375, "grad_norm_var": 0.0907867431640625, "learning_rate": 0.0001, "loss": 3.5869, "loss/crossentropy": 2.458595323562622, "loss/hidden": 3.60625, "loss/incoh": 0.0, "loss/logits": 0.4149588346481323, "loss/reg": 0.0, "step": 4800 }, { "epoch": 0.03164473684210526, "grad_norm": 2.453125, "grad_norm_var": 0.0759429931640625, "learning_rate": 0.0001, "loss": 3.4775, "loss/crossentropy": 2.01421400308609, "loss/hidden": 3.2671875, "loss/incoh": 0.0, "loss/logits": 0.29176320880651474, "loss/reg": 0.0, "step": 4810 }, { "epoch": 0.031710526315789474, "grad_norm": 2.96875, "grad_norm_var": 0.1147857666015625, "learning_rate": 0.0001, "loss": 3.5268, "loss/crossentropy": 2.5456383228302, "loss/hidden": 3.3484375, "loss/incoh": 0.0, "loss/logits": 0.3915561467409134, "loss/reg": 0.0, "step": 4820 }, { "epoch": 0.03177631578947369, "grad_norm": 2.421875, "grad_norm_var": 0.0890625, "learning_rate": 0.0001, "loss": 3.5371, "loss/crossentropy": 2.1736844003200533, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.28229199200868604, "loss/reg": 0.0, "step": 4830 }, { "epoch": 0.031842105263157894, "grad_norm": 2.21875, "grad_norm_var": 0.08162434895833333, "learning_rate": 0.0001, "loss": 3.4898, "loss/crossentropy": 2.397980511188507, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.28644354790449145, "loss/reg": 0.0, "step": 4840 }, { "epoch": 0.03190789473684211, "grad_norm": 2.578125, "grad_norm_var": 1.33443603515625, "learning_rate": 0.0001, "loss": 3.5788, "loss/crossentropy": 2.434200632572174, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.37485773116350174, "loss/reg": 0.0, "step": 4850 }, { "epoch": 0.031973684210526314, "grad_norm": 2.640625, "grad_norm_var": 1.3780181884765625, "learning_rate": 0.0001, "loss": 3.5008, "loss/crossentropy": 2.642025816440582, "loss/hidden": 3.1375, "loss/incoh": 0.0, "loss/logits": 0.34916335344314575, "loss/reg": 0.0, "step": 4860 }, { "epoch": 0.03203947368421053, "grad_norm": 2.578125, "grad_norm_var": 0.04523111979166667, "learning_rate": 0.0001, "loss": 3.4784, "loss/crossentropy": 2.2365106463432314, "loss/hidden": 3.4328125, "loss/incoh": 0.0, "loss/logits": 0.45036998838186265, "loss/reg": 0.0, "step": 4870 }, { "epoch": 0.032105263157894734, "grad_norm": 2.25, "grad_norm_var": 0.04436442057291667, "learning_rate": 0.0001, "loss": 3.5047, "loss/crossentropy": 2.250430929660797, "loss/hidden": 3.2859375, "loss/incoh": 0.0, "loss/logits": 0.3373000741004944, "loss/reg": 0.0, "step": 4880 }, { "epoch": 0.03217105263157895, "grad_norm": 2.71875, "grad_norm_var": 0.07979227701822916, "learning_rate": 0.0001, "loss": 3.4476, "loss/crossentropy": 2.688676381111145, "loss/hidden": 3.184375, "loss/incoh": 0.0, "loss/logits": 0.32961316406726837, "loss/reg": 0.0, "step": 4890 }, { "epoch": 0.03223684210526316, "grad_norm": 2.625, "grad_norm_var": 0.0539215087890625, "learning_rate": 0.0001, "loss": 3.5264, "loss/crossentropy": 2.5281107783317567, "loss/hidden": 3.2125, "loss/incoh": 0.0, "loss/logits": 0.36207843720912936, "loss/reg": 0.0, "step": 4900 }, { "epoch": 0.03230263157894737, "grad_norm": 2.515625, "grad_norm_var": 0.11297200520833334, "learning_rate": 0.0001, "loss": 3.4276, "loss/crossentropy": 2.3438509345054626, "loss/hidden": 3.2671875, "loss/incoh": 0.0, "loss/logits": 0.3646134212613106, "loss/reg": 0.0, "step": 4910 }, { "epoch": 0.03236842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.06402587890625, "learning_rate": 0.0001, "loss": 3.479, "loss/crossentropy": 2.331527066230774, "loss/hidden": 3.48125, "loss/incoh": 0.0, "loss/logits": 0.407479552924633, "loss/reg": 0.0, "step": 4920 }, { "epoch": 0.03243421052631579, "grad_norm": 2.375, "grad_norm_var": 0.21721598307291667, "learning_rate": 0.0001, "loss": 3.6251, "loss/crossentropy": 2.3732258677482605, "loss/hidden": 3.1875, "loss/incoh": 0.0, "loss/logits": 0.32182002663612364, "loss/reg": 0.0, "step": 4930 }, { "epoch": 0.0325, "grad_norm": 4.59375, "grad_norm_var": 1.3630167643229167, "learning_rate": 0.0001, "loss": 3.6469, "loss/crossentropy": 1.9915230482816697, "loss/hidden": 3.3953125, "loss/incoh": 0.0, "loss/logits": 0.35692891776561736, "loss/reg": 0.0, "step": 4940 }, { "epoch": 0.03256578947368421, "grad_norm": 2.40625, "grad_norm_var": 0.2885894775390625, "learning_rate": 0.0001, "loss": 3.5077, "loss/crossentropy": 1.9868581891059875, "loss/hidden": 3.215625, "loss/incoh": 0.0, "loss/logits": 0.30731415897607806, "loss/reg": 0.0, "step": 4950 }, { "epoch": 0.03263157894736842, "grad_norm": 2.765625, "grad_norm_var": 0.21422119140625, "learning_rate": 0.0001, "loss": 3.5426, "loss/crossentropy": 2.3579143285751343, "loss/hidden": 3.3421875, "loss/incoh": 0.0, "loss/logits": 0.3354805111885071, "loss/reg": 0.0, "step": 4960 }, { "epoch": 0.032697368421052635, "grad_norm": 2.625, "grad_norm_var": 4.145735677083334, "learning_rate": 0.0001, "loss": 3.5276, "loss/crossentropy": 2.2787723779678344, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.2516452088952065, "loss/reg": 0.0, "step": 4970 }, { "epoch": 0.03276315789473684, "grad_norm": 2.375, "grad_norm_var": 0.09233296712239583, "learning_rate": 0.0001, "loss": 3.4912, "loss/crossentropy": 2.2899803042411806, "loss/hidden": 3.175, "loss/incoh": 0.0, "loss/logits": 0.3140288829803467, "loss/reg": 0.0, "step": 4980 }, { "epoch": 0.032828947368421055, "grad_norm": 2.53125, "grad_norm_var": 0.07078450520833333, "learning_rate": 0.0001, "loss": 3.4308, "loss/crossentropy": 2.4203084468841554, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.2926447048783302, "loss/reg": 0.0, "step": 4990 }, { "epoch": 0.03289473684210526, "grad_norm": 2.375, "grad_norm_var": 0.1664215087890625, "learning_rate": 0.0001, "loss": 3.5233, "loss/crossentropy": 2.4435134291648866, "loss/hidden": 3.3640625, "loss/incoh": 0.0, "loss/logits": 0.35478622317314146, "loss/reg": 0.0, "step": 5000 }, { "epoch": 0.032960526315789475, "grad_norm": 3.296875, "grad_norm_var": 0.17375386555989583, "learning_rate": 0.0001, "loss": 3.529, "loss/crossentropy": 2.3886643409729005, "loss/hidden": 3.196875, "loss/incoh": 0.0, "loss/logits": 0.36350963413715365, "loss/reg": 0.0, "step": 5010 }, { "epoch": 0.03302631578947368, "grad_norm": 2.390625, "grad_norm_var": 0.13043619791666666, "learning_rate": 0.0001, "loss": 3.5608, "loss/crossentropy": 2.5570758461952208, "loss/hidden": 3.4828125, "loss/incoh": 0.0, "loss/logits": 0.343785697221756, "loss/reg": 0.0, "step": 5020 }, { "epoch": 0.033092105263157895, "grad_norm": 2.921875, "grad_norm_var": 0.19719645182291667, "learning_rate": 0.0001, "loss": 3.5903, "loss/crossentropy": 2.3763694763183594, "loss/hidden": 3.25625, "loss/incoh": 0.0, "loss/logits": 0.32882467210292815, "loss/reg": 0.0, "step": 5030 }, { "epoch": 0.03315789473684211, "grad_norm": 2.484375, "grad_norm_var": 0.21155497233072917, "learning_rate": 0.0001, "loss": 3.5454, "loss/crossentropy": 2.5775513648986816, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.3086023017764091, "loss/reg": 0.0, "step": 5040 }, { "epoch": 0.033223684210526315, "grad_norm": 2.703125, "grad_norm_var": 0.08268229166666667, "learning_rate": 0.0001, "loss": 3.5005, "loss/crossentropy": 2.257374918460846, "loss/hidden": 3.1609375, "loss/incoh": 0.0, "loss/logits": 0.3117083102464676, "loss/reg": 0.0, "step": 5050 }, { "epoch": 0.03328947368421053, "grad_norm": 2.46875, "grad_norm_var": 0.21896158854166667, "learning_rate": 0.0001, "loss": 3.4097, "loss/crossentropy": 2.437604343891144, "loss/hidden": 3.1953125, "loss/incoh": 0.0, "loss/logits": 0.33744728565216064, "loss/reg": 0.0, "step": 5060 }, { "epoch": 0.033355263157894735, "grad_norm": 2.421875, "grad_norm_var": 0.19402669270833334, "learning_rate": 0.0001, "loss": 3.5413, "loss/crossentropy": 2.1056251645088198, "loss/hidden": 3.3140625, "loss/incoh": 0.0, "loss/logits": 0.35452440977096555, "loss/reg": 0.0, "step": 5070 }, { "epoch": 0.03342105263157895, "grad_norm": 2.984375, "grad_norm_var": 0.041829427083333336, "learning_rate": 0.0001, "loss": 3.5305, "loss/crossentropy": 2.5163299083709716, "loss/hidden": 3.296875, "loss/incoh": 0.0, "loss/logits": 0.35606471002101897, "loss/reg": 0.0, "step": 5080 }, { "epoch": 0.033486842105263155, "grad_norm": 2.328125, "grad_norm_var": 0.08046773274739584, "learning_rate": 0.0001, "loss": 3.4405, "loss/crossentropy": 2.37408185005188, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.3234561800956726, "loss/reg": 0.0, "step": 5090 }, { "epoch": 0.03355263157894737, "grad_norm": 2.921875, "grad_norm_var": 0.17653706868489583, "learning_rate": 0.0001, "loss": 3.4303, "loss/crossentropy": 2.3730108022689818, "loss/hidden": 3.1359375, "loss/incoh": 0.0, "loss/logits": 0.2919617787003517, "loss/reg": 0.0, "step": 5100 }, { "epoch": 0.03361842105263158, "grad_norm": 2.5625, "grad_norm_var": 0.47526753743489586, "learning_rate": 0.0001, "loss": 3.5312, "loss/crossentropy": 2.4720141887664795, "loss/hidden": 3.246875, "loss/incoh": 0.0, "loss/logits": 0.36764703392982484, "loss/reg": 0.0, "step": 5110 }, { "epoch": 0.03368421052631579, "grad_norm": 2.6875, "grad_norm_var": 0.4923787434895833, "learning_rate": 0.0001, "loss": 3.5491, "loss/crossentropy": 2.449038088321686, "loss/hidden": 3.3171875, "loss/incoh": 0.0, "loss/logits": 0.4042062431573868, "loss/reg": 0.0, "step": 5120 }, { "epoch": 0.03375, "grad_norm": 2.703125, "grad_norm_var": 0.1626861572265625, "learning_rate": 0.0001, "loss": 3.5298, "loss/crossentropy": 2.5271955728530884, "loss/hidden": 3.2515625, "loss/incoh": 0.0, "loss/logits": 0.3406914800405502, "loss/reg": 0.0, "step": 5130 }, { "epoch": 0.03381578947368421, "grad_norm": 2.65625, "grad_norm_var": 0.037018839518229166, "learning_rate": 0.0001, "loss": 3.549, "loss/crossentropy": 2.6082807898521425, "loss/hidden": 3.321875, "loss/incoh": 0.0, "loss/logits": 0.33228414356708524, "loss/reg": 0.0, "step": 5140 }, { "epoch": 0.03388157894736842, "grad_norm": 2.375, "grad_norm_var": 0.08620503743489584, "learning_rate": 0.0001, "loss": 3.505, "loss/crossentropy": 2.0589061468839644, "loss/hidden": 3.4890625, "loss/incoh": 0.0, "loss/logits": 0.30756633579730985, "loss/reg": 0.0, "step": 5150 }, { "epoch": 0.03394736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.3478342692057292, "learning_rate": 0.0001, "loss": 3.5598, "loss/crossentropy": 2.0876080930233, "loss/hidden": 3.39375, "loss/incoh": 0.0, "loss/logits": 0.3331515982747078, "loss/reg": 0.0, "step": 5160 }, { "epoch": 0.03401315789473684, "grad_norm": 2.265625, "grad_norm_var": 0.3102773030598958, "learning_rate": 0.0001, "loss": 3.5298, "loss/crossentropy": 2.123845911026001, "loss/hidden": 3.2953125, "loss/incoh": 0.0, "loss/logits": 0.27716329991817473, "loss/reg": 0.0, "step": 5170 }, { "epoch": 0.034078947368421056, "grad_norm": 3.484375, "grad_norm_var": 0.3398590087890625, "learning_rate": 0.0001, "loss": 3.546, "loss/crossentropy": 2.5750380873680117, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.3079290196299553, "loss/reg": 0.0, "step": 5180 }, { "epoch": 0.03414473684210526, "grad_norm": 3.015625, "grad_norm_var": 0.3519195556640625, "learning_rate": 0.0001, "loss": 3.4686, "loss/crossentropy": 2.1761133074760437, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.28422126173973083, "loss/reg": 0.0, "step": 5190 }, { "epoch": 0.034210526315789476, "grad_norm": 3.21875, "grad_norm_var": 0.09970296223958333, "learning_rate": 0.0001, "loss": 3.5823, "loss/crossentropy": 2.224585694074631, "loss/hidden": 3.2109375, "loss/incoh": 0.0, "loss/logits": 0.32807315289974215, "loss/reg": 0.0, "step": 5200 }, { "epoch": 0.03427631578947368, "grad_norm": 2.96875, "grad_norm_var": 0.34479878743489584, "learning_rate": 0.0001, "loss": 3.5417, "loss/crossentropy": 2.217251694202423, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.2984598934650421, "loss/reg": 0.0, "step": 5210 }, { "epoch": 0.034342105263157896, "grad_norm": 2.6875, "grad_norm_var": 0.28804931640625, "learning_rate": 0.0001, "loss": 3.4788, "loss/crossentropy": 2.344852977991104, "loss/hidden": 3.1265625, "loss/incoh": 0.0, "loss/logits": 0.2878506749868393, "loss/reg": 0.0, "step": 5220 }, { "epoch": 0.0344078947368421, "grad_norm": 2.46875, "grad_norm_var": 0.17006734212239583, "learning_rate": 0.0001, "loss": 3.4649, "loss/crossentropy": 2.5100401520729063, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.3491516515612602, "loss/reg": 0.0, "step": 5230 }, { "epoch": 0.034473684210526316, "grad_norm": 2.5625, "grad_norm_var": 1.1076649983723958, "learning_rate": 0.0001, "loss": 3.503, "loss/crossentropy": 2.565778684616089, "loss/hidden": 3.146875, "loss/incoh": 0.0, "loss/logits": 0.35944747030735014, "loss/reg": 0.0, "step": 5240 }, { "epoch": 0.03453947368421053, "grad_norm": 2.3125, "grad_norm_var": 1.1434315999348958, "learning_rate": 0.0001, "loss": 3.4668, "loss/crossentropy": 2.5031490683555604, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.2864773109555244, "loss/reg": 0.0, "step": 5250 }, { "epoch": 0.034605263157894736, "grad_norm": 2.90625, "grad_norm_var": 0.5837849934895833, "learning_rate": 0.0001, "loss": 3.4822, "loss/crossentropy": 2.3741963386535643, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.3231631726026535, "loss/reg": 0.0, "step": 5260 }, { "epoch": 0.03467105263157895, "grad_norm": 2.546875, "grad_norm_var": 0.62431640625, "learning_rate": 0.0001, "loss": 3.4414, "loss/crossentropy": 2.3789267897605897, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3201348423957825, "loss/reg": 0.0, "step": 5270 }, { "epoch": 0.034736842105263156, "grad_norm": 5.25, "grad_norm_var": 0.5048573811848959, "learning_rate": 0.0001, "loss": 3.412, "loss/crossentropy": 2.47695529460907, "loss/hidden": 3.171875, "loss/incoh": 0.0, "loss/logits": 0.31060084253549575, "loss/reg": 0.0, "step": 5280 }, { "epoch": 0.03480263157894737, "grad_norm": 2.625, "grad_norm_var": 0.54195556640625, "learning_rate": 0.0001, "loss": 3.5371, "loss/crossentropy": 2.4316977143287657, "loss/hidden": 3.0703125, "loss/incoh": 0.0, "loss/logits": 0.30330550074577334, "loss/reg": 0.0, "step": 5290 }, { "epoch": 0.034868421052631576, "grad_norm": 2.625, "grad_norm_var": 0.1771484375, "learning_rate": 0.0001, "loss": 3.534, "loss/crossentropy": 2.058604693412781, "loss/hidden": 3.16875, "loss/incoh": 0.0, "loss/logits": 0.2998970851302147, "loss/reg": 0.0, "step": 5300 }, { "epoch": 0.03493421052631579, "grad_norm": 2.375, "grad_norm_var": 0.21590169270833334, "learning_rate": 0.0001, "loss": 3.5029, "loss/crossentropy": 2.1623964309692383, "loss/hidden": 3.1453125, "loss/incoh": 0.0, "loss/logits": 0.2633717767894268, "loss/reg": 0.0, "step": 5310 }, { "epoch": 0.035, "grad_norm": 2.90625, "grad_norm_var": 0.12704671223958333, "learning_rate": 0.0001, "loss": 3.4862, "loss/crossentropy": 2.5717769265174866, "loss/hidden": 3.1671875, "loss/incoh": 0.0, "loss/logits": 0.30162925869226453, "loss/reg": 0.0, "step": 5320 }, { "epoch": 0.03506578947368421, "grad_norm": 2.734375, "grad_norm_var": 1.4618398030598958, "learning_rate": 0.0001, "loss": 3.5429, "loss/crossentropy": 2.462851893901825, "loss/hidden": 3.328125, "loss/incoh": 0.0, "loss/logits": 0.3766929477453232, "loss/reg": 0.0, "step": 5330 }, { "epoch": 0.03513157894736842, "grad_norm": 2.390625, "grad_norm_var": 1.7596181233723958, "learning_rate": 0.0001, "loss": 3.5089, "loss/crossentropy": 2.44319885969162, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.30458342134952543, "loss/reg": 0.0, "step": 5340 }, { "epoch": 0.03519736842105263, "grad_norm": 2.875, "grad_norm_var": 0.4787668863932292, "learning_rate": 0.0001, "loss": 3.6272, "loss/crossentropy": 2.594151020050049, "loss/hidden": 3.2296875, "loss/incoh": 0.0, "loss/logits": 0.3736713409423828, "loss/reg": 0.0, "step": 5350 }, { "epoch": 0.035263157894736843, "grad_norm": 2.484375, "grad_norm_var": 0.4522939046223958, "learning_rate": 0.0001, "loss": 3.5325, "loss/crossentropy": 2.0771877110004424, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.3338633939623833, "loss/reg": 0.0, "step": 5360 }, { "epoch": 0.03532894736842105, "grad_norm": 2.390625, "grad_norm_var": 0.0387847900390625, "learning_rate": 0.0001, "loss": 3.4757, "loss/crossentropy": 2.5547770977020265, "loss/hidden": 3.25, "loss/incoh": 0.0, "loss/logits": 0.36584808975458144, "loss/reg": 0.0, "step": 5370 }, { "epoch": 0.035394736842105264, "grad_norm": 2.34375, "grad_norm_var": 11.889094034830729, "learning_rate": 0.0001, "loss": 3.6332, "loss/crossentropy": 2.2124004304409026, "loss/hidden": 3.1265625, "loss/incoh": 0.0, "loss/logits": 0.29226877391338346, "loss/reg": 0.0, "step": 5380 }, { "epoch": 0.03546052631578948, "grad_norm": 3.15625, "grad_norm_var": 6.872362263997396, "learning_rate": 0.0001, "loss": 3.5496, "loss/crossentropy": 2.2622018218040467, "loss/hidden": 3.259375, "loss/incoh": 0.0, "loss/logits": 0.40298803299665453, "loss/reg": 0.0, "step": 5390 }, { "epoch": 0.035526315789473684, "grad_norm": 2.25, "grad_norm_var": 0.0641998291015625, "learning_rate": 0.0001, "loss": 3.4621, "loss/crossentropy": 2.374979627132416, "loss/hidden": 3.2109375, "loss/incoh": 0.0, "loss/logits": 0.37631402611732484, "loss/reg": 0.0, "step": 5400 }, { "epoch": 0.0355921052631579, "grad_norm": 2.625, "grad_norm_var": 0.028059895833333334, "learning_rate": 0.0001, "loss": 3.4667, "loss/crossentropy": 2.4809056520462036, "loss/hidden": 3.1078125, "loss/incoh": 0.0, "loss/logits": 0.32154888212680816, "loss/reg": 0.0, "step": 5410 }, { "epoch": 0.035657894736842104, "grad_norm": 2.5625, "grad_norm_var": 0.19182840983072916, "learning_rate": 0.0001, "loss": 3.528, "loss/crossentropy": 2.3937729835510253, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.29925636053085325, "loss/reg": 0.0, "step": 5420 }, { "epoch": 0.03572368421052632, "grad_norm": 2.734375, "grad_norm_var": 0.04440104166666667, "learning_rate": 0.0001, "loss": 3.4433, "loss/crossentropy": 2.604015350341797, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.28630980402231215, "loss/reg": 0.0, "step": 5430 }, { "epoch": 0.035789473684210524, "grad_norm": 2.34375, "grad_norm_var": 0.057454427083333336, "learning_rate": 0.0001, "loss": 3.3938, "loss/crossentropy": 2.3647801518440246, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.2772139713168144, "loss/reg": 0.0, "step": 5440 }, { "epoch": 0.03585526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.06382548014322917, "learning_rate": 0.0001, "loss": 3.4874, "loss/crossentropy": 2.4227387428283693, "loss/hidden": 3.3890625, "loss/incoh": 0.0, "loss/logits": 0.3498344630002975, "loss/reg": 0.0, "step": 5450 }, { "epoch": 0.03592105263157895, "grad_norm": 2.5625, "grad_norm_var": 0.5879221598307292, "learning_rate": 0.0001, "loss": 3.5541, "loss/crossentropy": 2.024171155691147, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.25232082083821294, "loss/reg": 0.0, "step": 5460 }, { "epoch": 0.03598684210526316, "grad_norm": 2.78125, "grad_norm_var": 1.2988596598307292, "learning_rate": 0.0001, "loss": 3.4549, "loss/crossentropy": 2.5548394203186033, "loss/hidden": 3.0234375, "loss/incoh": 0.0, "loss/logits": 0.29853117763996123, "loss/reg": 0.0, "step": 5470 }, { "epoch": 0.03605263157894737, "grad_norm": 3.234375, "grad_norm_var": 1.458153279622396, "learning_rate": 0.0001, "loss": 3.5047, "loss/crossentropy": 2.520615005493164, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.31394066512584684, "loss/reg": 0.0, "step": 5480 }, { "epoch": 0.03611842105263158, "grad_norm": 2.96875, "grad_norm_var": 0.6611328125, "learning_rate": 0.0001, "loss": 3.4932, "loss/crossentropy": 2.447161090373993, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.3091880366206169, "loss/reg": 0.0, "step": 5490 }, { "epoch": 0.03618421052631579, "grad_norm": 2.671875, "grad_norm_var": 0.05611572265625, "learning_rate": 0.0001, "loss": 3.4718, "loss/crossentropy": 2.354436981678009, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.30545540153980255, "loss/reg": 0.0, "step": 5500 }, { "epoch": 0.03625, "grad_norm": 2.828125, "grad_norm_var": 0.24016011555989583, "learning_rate": 0.0001, "loss": 3.5554, "loss/crossentropy": 2.350696861743927, "loss/hidden": 3.1578125, "loss/incoh": 0.0, "loss/logits": 0.27360412031412124, "loss/reg": 0.0, "step": 5510 }, { "epoch": 0.03631578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.6210245768229167, "learning_rate": 0.0001, "loss": 3.513, "loss/crossentropy": 2.368817460536957, "loss/hidden": 3.1796875, "loss/incoh": 0.0, "loss/logits": 0.31491883993148806, "loss/reg": 0.0, "step": 5520 }, { "epoch": 0.036381578947368425, "grad_norm": 2.640625, "grad_norm_var": 0.16715087890625, "learning_rate": 0.0001, "loss": 3.4444, "loss/crossentropy": 2.3894132494926454, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.2710603341460228, "loss/reg": 0.0, "step": 5530 }, { "epoch": 0.03644736842105263, "grad_norm": 2.484375, "grad_norm_var": 2.993724568684896, "learning_rate": 0.0001, "loss": 3.5105, "loss/crossentropy": 2.4798691868782043, "loss/hidden": 3.25, "loss/incoh": 0.0, "loss/logits": 0.422188438475132, "loss/reg": 0.0, "step": 5540 }, { "epoch": 0.036513157894736845, "grad_norm": 3.6875, "grad_norm_var": 2.864090983072917, "learning_rate": 0.0001, "loss": 3.6266, "loss/crossentropy": 2.499036192893982, "loss/hidden": 3.21875, "loss/incoh": 0.0, "loss/logits": 0.3576551049947739, "loss/reg": 0.0, "step": 5550 }, { "epoch": 0.03657894736842105, "grad_norm": 2.59375, "grad_norm_var": 0.2598052978515625, "learning_rate": 0.0001, "loss": 3.5659, "loss/crossentropy": 2.4270546317100523, "loss/hidden": 3.23125, "loss/incoh": 0.0, "loss/logits": 0.4616221562027931, "loss/reg": 0.0, "step": 5560 }, { "epoch": 0.036644736842105265, "grad_norm": 3.03125, "grad_norm_var": 0.39485270182291665, "learning_rate": 0.0001, "loss": 3.5978, "loss/crossentropy": 2.427480709552765, "loss/hidden": 3.7875, "loss/incoh": 0.0, "loss/logits": 0.38075721710920335, "loss/reg": 0.0, "step": 5570 }, { "epoch": 0.03671052631578947, "grad_norm": 2.390625, "grad_norm_var": 1.5193318684895833, "learning_rate": 0.0001, "loss": 3.5541, "loss/crossentropy": 2.2717662811279298, "loss/hidden": 3.2359375, "loss/incoh": 0.0, "loss/logits": 0.2794697627425194, "loss/reg": 0.0, "step": 5580 }, { "epoch": 0.036776315789473685, "grad_norm": 2.59375, "grad_norm_var": 1.5052734375, "learning_rate": 0.0001, "loss": 3.5513, "loss/crossentropy": 2.549258255958557, "loss/hidden": 3.1265625, "loss/incoh": 0.0, "loss/logits": 0.3551526039838791, "loss/reg": 0.0, "step": 5590 }, { "epoch": 0.03684210526315789, "grad_norm": 2.78125, "grad_norm_var": 0.12431538899739583, "learning_rate": 0.0001, "loss": 3.5166, "loss/crossentropy": 2.42179411649704, "loss/hidden": 3.265625, "loss/incoh": 0.0, "loss/logits": 0.349351167678833, "loss/reg": 0.0, "step": 5600 }, { "epoch": 0.036907894736842105, "grad_norm": 2.40625, "grad_norm_var": 0.09789937337239583, "learning_rate": 0.0001, "loss": 3.4519, "loss/crossentropy": 2.275598430633545, "loss/hidden": 3.2015625, "loss/incoh": 0.0, "loss/logits": 0.32426146864891053, "loss/reg": 0.0, "step": 5610 }, { "epoch": 0.03697368421052632, "grad_norm": 2.84375, "grad_norm_var": 0.10132548014322916, "learning_rate": 0.0001, "loss": 3.4632, "loss/crossentropy": 2.4317312955856325, "loss/hidden": 3.3171875, "loss/incoh": 0.0, "loss/logits": 0.3550658613443375, "loss/reg": 0.0, "step": 5620 }, { "epoch": 0.037039473684210525, "grad_norm": 25.0, "grad_norm_var": 167.39566650390626, "learning_rate": 0.0001, "loss": 3.6136, "loss/crossentropy": 2.5963298320770263, "loss/hidden": 3.1546875, "loss/incoh": 0.0, "loss/logits": 0.34572866559028625, "loss/reg": 0.0, "step": 5630 }, { "epoch": 0.03710526315789474, "grad_norm": 3.0, "grad_norm_var": 167.50836486816405, "learning_rate": 0.0001, "loss": 3.5122, "loss/crossentropy": 2.370168614387512, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.316168874502182, "loss/reg": 0.0, "step": 5640 }, { "epoch": 0.037171052631578945, "grad_norm": 2.921875, "grad_norm_var": 0.05074462890625, "learning_rate": 0.0001, "loss": 3.4512, "loss/crossentropy": 2.1566815614700316, "loss/hidden": 3.109375, "loss/incoh": 0.0, "loss/logits": 0.30936725735664367, "loss/reg": 0.0, "step": 5650 }, { "epoch": 0.03723684210526316, "grad_norm": 3.546875, "grad_norm_var": 0.1086822509765625, "learning_rate": 0.0001, "loss": 3.4308, "loss/crossentropy": 2.315059244632721, "loss/hidden": 3.1609375, "loss/incoh": 0.0, "loss/logits": 0.3344813346862793, "loss/reg": 0.0, "step": 5660 }, { "epoch": 0.037302631578947365, "grad_norm": 2.984375, "grad_norm_var": 0.22506510416666667, "learning_rate": 0.0001, "loss": 3.5705, "loss/crossentropy": 2.063437449932098, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.2783856257796288, "loss/reg": 0.0, "step": 5670 }, { "epoch": 0.03736842105263158, "grad_norm": 2.75, "grad_norm_var": 0.48176981608072916, "learning_rate": 0.0001, "loss": 3.5285, "loss/crossentropy": 2.564136099815369, "loss/hidden": 3.2453125, "loss/incoh": 0.0, "loss/logits": 0.3260859474539757, "loss/reg": 0.0, "step": 5680 }, { "epoch": 0.03743421052631579, "grad_norm": 2.78125, "grad_norm_var": 1.2526519775390625, "learning_rate": 0.0001, "loss": 3.581, "loss/crossentropy": 2.4384737968444825, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.2700443536043167, "loss/reg": 0.0, "step": 5690 }, { "epoch": 0.0375, "grad_norm": 2.53125, "grad_norm_var": 1.029296875, "learning_rate": 0.0001, "loss": 3.5309, "loss/crossentropy": 2.656829285621643, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.34218672215938567, "loss/reg": 0.0, "step": 5700 }, { "epoch": 0.03756578947368421, "grad_norm": 2.375, "grad_norm_var": 5.287238566080729, "learning_rate": 0.0001, "loss": 3.6012, "loss/crossentropy": 2.3537548005580904, "loss/hidden": 3.1875, "loss/incoh": 0.0, "loss/logits": 0.33707170784473417, "loss/reg": 0.0, "step": 5710 }, { "epoch": 0.03763157894736842, "grad_norm": 2.359375, "grad_norm_var": 0.6506795247395833, "learning_rate": 0.0001, "loss": 3.4987, "loss/crossentropy": 2.217307722568512, "loss/hidden": 3.31875, "loss/incoh": 0.0, "loss/logits": 0.36700052917003634, "loss/reg": 0.0, "step": 5720 }, { "epoch": 0.03769736842105263, "grad_norm": 2.796875, "grad_norm_var": 235.47136942545572, "learning_rate": 0.0001, "loss": 3.4822, "loss/crossentropy": 2.39666086435318, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.322082930803299, "loss/reg": 0.0, "step": 5730 }, { "epoch": 0.03776315789473684, "grad_norm": 2.65625, "grad_norm_var": 48.15474853515625, "learning_rate": 0.0001, "loss": 3.5338, "loss/crossentropy": 2.5715784192085267, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.31636003255844114, "loss/reg": 0.0, "step": 5740 }, { "epoch": 0.03782894736842105, "grad_norm": 2.375, "grad_norm_var": 48.25117899576823, "learning_rate": 0.0001, "loss": 3.345, "loss/crossentropy": 2.1092816948890687, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.27086046040058137, "loss/reg": 0.0, "step": 5750 }, { "epoch": 0.037894736842105266, "grad_norm": 3.34375, "grad_norm_var": 0.1339508056640625, "learning_rate": 0.0001, "loss": 3.4478, "loss/crossentropy": 2.20155810713768, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.3250092178583145, "loss/reg": 0.0, "step": 5760 }, { "epoch": 0.03796052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.5598592122395833, "learning_rate": 0.0001, "loss": 3.4268, "loss/crossentropy": 2.2270141005516053, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.2702811732888222, "loss/reg": 0.0, "step": 5770 }, { "epoch": 0.038026315789473686, "grad_norm": 2.53125, "grad_norm_var": 0.1083892822265625, "learning_rate": 0.0001, "loss": 3.4578, "loss/crossentropy": 2.4509124517440797, "loss/hidden": 3.1109375, "loss/incoh": 0.0, "loss/logits": 0.3049029678106308, "loss/reg": 0.0, "step": 5780 }, { "epoch": 0.03809210526315789, "grad_norm": 3.21875, "grad_norm_var": 0.20041402180989584, "learning_rate": 0.0001, "loss": 3.4516, "loss/crossentropy": 2.1973756074905397, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.2764800027012825, "loss/reg": 0.0, "step": 5790 }, { "epoch": 0.038157894736842106, "grad_norm": 2.3125, "grad_norm_var": 0.17437235514322916, "learning_rate": 0.0001, "loss": 3.3789, "loss/crossentropy": 2.4859437584877013, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.28125611394643785, "loss/reg": 0.0, "step": 5800 }, { "epoch": 0.03822368421052631, "grad_norm": 2.484375, "grad_norm_var": 1.9865234375, "learning_rate": 0.0001, "loss": 3.4479, "loss/crossentropy": 2.6306315779685976, "loss/hidden": 3.440625, "loss/incoh": 0.0, "loss/logits": 0.2930992156267166, "loss/reg": 0.0, "step": 5810 }, { "epoch": 0.038289473684210526, "grad_norm": 2.40625, "grad_norm_var": 1.902880859375, "learning_rate": 0.0001, "loss": 3.482, "loss/crossentropy": 2.4275772333145142, "loss/hidden": 3.0953125, "loss/incoh": 0.0, "loss/logits": 0.32571674734354017, "loss/reg": 0.0, "step": 5820 }, { "epoch": 0.03835526315789474, "grad_norm": 2.625, "grad_norm_var": 0.040087890625, "learning_rate": 0.0001, "loss": 3.3832, "loss/crossentropy": 2.348308402299881, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.325964193046093, "loss/reg": 0.0, "step": 5830 }, { "epoch": 0.038421052631578946, "grad_norm": 3.796875, "grad_norm_var": 0.8002115885416666, "learning_rate": 0.0001, "loss": 3.5519, "loss/crossentropy": 2.1252057909965516, "loss/hidden": 3.2984375, "loss/incoh": 0.0, "loss/logits": 0.3046886622905731, "loss/reg": 0.0, "step": 5840 }, { "epoch": 0.03848684210526316, "grad_norm": 2.625, "grad_norm_var": 0.2637685139973958, "learning_rate": 0.0001, "loss": 3.4841, "loss/crossentropy": 2.5330613613128663, "loss/hidden": 3.025, "loss/incoh": 0.0, "loss/logits": 0.31516623198986055, "loss/reg": 0.0, "step": 5850 }, { "epoch": 0.038552631578947366, "grad_norm": 2.3125, "grad_norm_var": 0.24798075358072916, "learning_rate": 0.0001, "loss": 3.4625, "loss/crossentropy": 2.324964237213135, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.29473926275968554, "loss/reg": 0.0, "step": 5860 }, { "epoch": 0.03861842105263158, "grad_norm": 2.609375, "grad_norm_var": 0.12890218098958334, "learning_rate": 0.0001, "loss": 3.3932, "loss/crossentropy": 2.436046540737152, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.28259717375040055, "loss/reg": 0.0, "step": 5870 }, { "epoch": 0.038684210526315786, "grad_norm": 2.59375, "grad_norm_var": 0.09658101399739584, "learning_rate": 0.0001, "loss": 3.4335, "loss/crossentropy": 2.3942569494247437, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.29623564779758454, "loss/reg": 0.0, "step": 5880 }, { "epoch": 0.03875, "grad_norm": 2.421875, "grad_norm_var": 0.0283355712890625, "learning_rate": 0.0001, "loss": 3.4179, "loss/crossentropy": 2.675841474533081, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2852372720837593, "loss/reg": 0.0, "step": 5890 }, { "epoch": 0.03881578947368421, "grad_norm": 2.609375, "grad_norm_var": 0.25607096354166664, "learning_rate": 0.0001, "loss": 3.4508, "loss/crossentropy": 2.516406524181366, "loss/hidden": 3.35, "loss/incoh": 0.0, "loss/logits": 0.35599096268415453, "loss/reg": 0.0, "step": 5900 }, { "epoch": 0.03888157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.19685872395833334, "learning_rate": 0.0001, "loss": 3.3701, "loss/crossentropy": 2.118768775463104, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.3433256149291992, "loss/reg": 0.0, "step": 5910 }, { "epoch": 0.03894736842105263, "grad_norm": 2.75, "grad_norm_var": 0.10144856770833334, "learning_rate": 0.0001, "loss": 3.4698, "loss/crossentropy": 2.4102617263793946, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.2903832048177719, "loss/reg": 0.0, "step": 5920 }, { "epoch": 0.03901315789473684, "grad_norm": 3.25, "grad_norm_var": 0.05016276041666667, "learning_rate": 0.0001, "loss": 3.3435, "loss/crossentropy": 2.4927979469299317, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.2977334216237068, "loss/reg": 0.0, "step": 5930 }, { "epoch": 0.03907894736842105, "grad_norm": 2.25, "grad_norm_var": 0.2647043863932292, "learning_rate": 0.0001, "loss": 3.4123, "loss/crossentropy": 2.163569325208664, "loss/hidden": 3.1515625, "loss/incoh": 0.0, "loss/logits": 0.27049526423215864, "loss/reg": 0.0, "step": 5940 }, { "epoch": 0.03914473684210526, "grad_norm": 2.765625, "grad_norm_var": 0.1790679931640625, "learning_rate": 0.0001, "loss": 3.4968, "loss/crossentropy": 2.314089775085449, "loss/hidden": 3.290625, "loss/incoh": 0.0, "loss/logits": 0.2804916575551033, "loss/reg": 0.0, "step": 5950 }, { "epoch": 0.03921052631578947, "grad_norm": 2.578125, "grad_norm_var": 0.10930989583333334, "learning_rate": 0.0001, "loss": 3.4204, "loss/crossentropy": 2.5095210552215574, "loss/hidden": 3.1890625, "loss/incoh": 0.0, "loss/logits": 0.34905528128147123, "loss/reg": 0.0, "step": 5960 }, { "epoch": 0.03927631578947369, "grad_norm": 2.5, "grad_norm_var": 0.024348958333333334, "learning_rate": 0.0001, "loss": 3.3368, "loss/crossentropy": 2.3903687596321106, "loss/hidden": 3.228125, "loss/incoh": 0.0, "loss/logits": 0.36534676551818845, "loss/reg": 0.0, "step": 5970 }, { "epoch": 0.039342105263157894, "grad_norm": 2.703125, "grad_norm_var": 0.04106343587239583, "learning_rate": 0.0001, "loss": 3.3975, "loss/crossentropy": 2.485898661613464, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.28061621338129045, "loss/reg": 0.0, "step": 5980 }, { "epoch": 0.03940789473684211, "grad_norm": 2.578125, "grad_norm_var": 0.0488677978515625, "learning_rate": 0.0001, "loss": 3.3803, "loss/crossentropy": 2.4124781847000123, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.30112328827381135, "loss/reg": 0.0, "step": 5990 }, { "epoch": 0.039473684210526314, "grad_norm": 2.734375, "grad_norm_var": 0.44810791015625, "learning_rate": 0.0001, "loss": 3.472, "loss/crossentropy": 2.4459670901298525, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.2799991726875305, "loss/reg": 0.0, "step": 6000 }, { "epoch": 0.03953947368421053, "grad_norm": 2.5625, "grad_norm_var": 0.1685546875, "learning_rate": 0.0001, "loss": 3.4322, "loss/crossentropy": 2.641672468185425, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.31368287801742556, "loss/reg": 0.0, "step": 6010 }, { "epoch": 0.039605263157894734, "grad_norm": 2.546875, "grad_norm_var": 0.03518473307291667, "learning_rate": 0.0001, "loss": 3.4325, "loss/crossentropy": 2.2493654131889342, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.29540172666311265, "loss/reg": 0.0, "step": 6020 }, { "epoch": 0.03967105263157895, "grad_norm": 2.25, "grad_norm_var": 0.04810282389322917, "learning_rate": 0.0001, "loss": 3.408, "loss/crossentropy": 2.5626556158065794, "loss/hidden": 3.178125, "loss/incoh": 0.0, "loss/logits": 0.3445401757955551, "loss/reg": 0.0, "step": 6030 }, { "epoch": 0.03973684210526316, "grad_norm": 3.375, "grad_norm_var": 0.11850484212239583, "learning_rate": 0.0001, "loss": 3.3697, "loss/crossentropy": 2.2249147415161135, "loss/hidden": 3.165625, "loss/incoh": 0.0, "loss/logits": 0.3320572040975094, "loss/reg": 0.0, "step": 6040 }, { "epoch": 0.03980263157894737, "grad_norm": 2.34375, "grad_norm_var": 0.1265045166015625, "learning_rate": 0.0001, "loss": 3.3726, "loss/crossentropy": 2.479216980934143, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.3085744693875313, "loss/reg": 0.0, "step": 6050 }, { "epoch": 0.03986842105263158, "grad_norm": 2.265625, "grad_norm_var": 0.2034088134765625, "learning_rate": 0.0001, "loss": 3.4346, "loss/crossentropy": 2.3974440932273864, "loss/hidden": 3.259375, "loss/incoh": 0.0, "loss/logits": 0.4024490460753441, "loss/reg": 0.0, "step": 6060 }, { "epoch": 0.03993421052631579, "grad_norm": 2.640625, "grad_norm_var": 0.19241434733072918, "learning_rate": 0.0001, "loss": 3.3911, "loss/crossentropy": 2.3311298370361326, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.2968046382069588, "loss/reg": 0.0, "step": 6070 }, { "epoch": 0.04, "grad_norm": 2.734375, "grad_norm_var": 0.060445149739583336, "learning_rate": 0.0001, "loss": 3.5175, "loss/crossentropy": 2.6493954181671144, "loss/hidden": 3.2828125, "loss/incoh": 0.0, "loss/logits": 0.3133848324418068, "loss/reg": 0.0, "step": 6080 }, { "epoch": 0.04006578947368421, "grad_norm": 2.34375, "grad_norm_var": 0.2754140218098958, "learning_rate": 0.0001, "loss": 3.3932, "loss/crossentropy": 2.4315222024917604, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.3378627926111221, "loss/reg": 0.0, "step": 6090 }, { "epoch": 0.04013157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.04810282389322917, "learning_rate": 0.0001, "loss": 3.3698, "loss/crossentropy": 2.421731984615326, "loss/hidden": 3.315625, "loss/incoh": 0.0, "loss/logits": 0.3809585988521576, "loss/reg": 0.0, "step": 6100 }, { "epoch": 0.040197368421052634, "grad_norm": 2.328125, "grad_norm_var": 0.0509429931640625, "learning_rate": 0.0001, "loss": 3.3839, "loss/crossentropy": 2.2816696763038635, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.3188880756497383, "loss/reg": 0.0, "step": 6110 }, { "epoch": 0.04026315789473684, "grad_norm": 2.421875, "grad_norm_var": 0.26266276041666664, "learning_rate": 0.0001, "loss": 3.4852, "loss/crossentropy": 2.4251498103141786, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.3131637305021286, "loss/reg": 0.0, "step": 6120 }, { "epoch": 0.040328947368421054, "grad_norm": 2.046875, "grad_norm_var": 0.3377919514973958, "learning_rate": 0.0001, "loss": 3.4191, "loss/crossentropy": 2.321718716621399, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.28195892125368116, "loss/reg": 0.0, "step": 6130 }, { "epoch": 0.04039473684210526, "grad_norm": 2.515625, "grad_norm_var": 0.0685943603515625, "learning_rate": 0.0001, "loss": 3.4412, "loss/crossentropy": 2.658420753479004, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.3355125278234482, "loss/reg": 0.0, "step": 6140 }, { "epoch": 0.040460526315789475, "grad_norm": 2.1875, "grad_norm_var": 0.059305826822916664, "learning_rate": 0.0001, "loss": 3.3292, "loss/crossentropy": 2.5203867316246034, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.3060264021158218, "loss/reg": 0.0, "step": 6150 }, { "epoch": 0.04052631578947368, "grad_norm": 3.078125, "grad_norm_var": 0.052302042643229164, "learning_rate": 0.0001, "loss": 3.3844, "loss/crossentropy": 2.53420695066452, "loss/hidden": 3.178125, "loss/incoh": 0.0, "loss/logits": 0.3124456375837326, "loss/reg": 0.0, "step": 6160 }, { "epoch": 0.040592105263157895, "grad_norm": 2.375, "grad_norm_var": 0.049779256184895836, "learning_rate": 0.0001, "loss": 3.3634, "loss/crossentropy": 2.5132151365280153, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.35312571823596955, "loss/reg": 0.0, "step": 6170 }, { "epoch": 0.04065789473684211, "grad_norm": 2.46875, "grad_norm_var": 0.16035054524739584, "learning_rate": 0.0001, "loss": 3.4426, "loss/crossentropy": 2.3004459500312806, "loss/hidden": 3.1703125, "loss/incoh": 0.0, "loss/logits": 0.28007449954748154, "loss/reg": 0.0, "step": 6180 }, { "epoch": 0.040723684210526315, "grad_norm": 2.515625, "grad_norm_var": 0.1358306884765625, "learning_rate": 0.0001, "loss": 3.3068, "loss/crossentropy": 2.373980039358139, "loss/hidden": 3.015625, "loss/incoh": 0.0, "loss/logits": 0.28091391175985336, "loss/reg": 0.0, "step": 6190 }, { "epoch": 0.04078947368421053, "grad_norm": 2.84375, "grad_norm_var": 0.08801981608072916, "learning_rate": 0.0001, "loss": 3.4006, "loss/crossentropy": 2.2023098945617674, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.2726595625281334, "loss/reg": 0.0, "step": 6200 }, { "epoch": 0.040855263157894735, "grad_norm": 2.984375, "grad_norm_var": 3.6811187744140623, "learning_rate": 0.0001, "loss": 3.6073, "loss/crossentropy": 2.382032370567322, "loss/hidden": 3.6921875, "loss/incoh": 0.0, "loss/logits": 0.4323269993066788, "loss/reg": 0.0, "step": 6210 }, { "epoch": 0.04092105263157895, "grad_norm": 2.296875, "grad_norm_var": 3.804295857747396, "learning_rate": 0.0001, "loss": 3.3627, "loss/crossentropy": 2.569228994846344, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.31191317439079286, "loss/reg": 0.0, "step": 6220 }, { "epoch": 0.040986842105263155, "grad_norm": 2.828125, "grad_norm_var": 0.08497721354166667, "learning_rate": 0.0001, "loss": 3.3414, "loss/crossentropy": 2.5153043985366823, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.30689269602298735, "loss/reg": 0.0, "step": 6230 }, { "epoch": 0.04105263157894737, "grad_norm": 2.375, "grad_norm_var": 1.2814849853515624, "learning_rate": 0.0001, "loss": 3.4386, "loss/crossentropy": 2.283157765865326, "loss/hidden": 3.1203125, "loss/incoh": 0.0, "loss/logits": 0.3153227433562279, "loss/reg": 0.0, "step": 6240 }, { "epoch": 0.04111842105263158, "grad_norm": 2.234375, "grad_norm_var": 5.054263305664063, "learning_rate": 0.0001, "loss": 3.4379, "loss/crossentropy": 2.585819673538208, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.2913993000984192, "loss/reg": 0.0, "step": 6250 }, { "epoch": 0.04118421052631579, "grad_norm": 2.5, "grad_norm_var": 0.0501373291015625, "learning_rate": 0.0001, "loss": 3.3357, "loss/crossentropy": 2.453801620006561, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.3249350532889366, "loss/reg": 0.0, "step": 6260 }, { "epoch": 0.04125, "grad_norm": 2.5, "grad_norm_var": 0.0405181884765625, "learning_rate": 0.0001, "loss": 3.3525, "loss/crossentropy": 2.4949014663696287, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.2902356445789337, "loss/reg": 0.0, "step": 6270 }, { "epoch": 0.04131578947368421, "grad_norm": 2.78125, "grad_norm_var": 1.6241770426432292, "learning_rate": 0.0001, "loss": 3.4444, "loss/crossentropy": 2.029393529891968, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.30831936225295065, "loss/reg": 0.0, "step": 6280 }, { "epoch": 0.04138157894736842, "grad_norm": 2.53125, "grad_norm_var": 0.0864654541015625, "learning_rate": 0.0001, "loss": 3.3944, "loss/crossentropy": 3.0075352430343627, "loss/hidden": 3.2390625, "loss/incoh": 0.0, "loss/logits": 0.4476942718029022, "loss/reg": 0.0, "step": 6290 }, { "epoch": 0.04144736842105263, "grad_norm": 2.515625, "grad_norm_var": 0.15415751139322917, "learning_rate": 0.0001, "loss": 3.4091, "loss/crossentropy": 2.36070739030838, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.26602610796689985, "loss/reg": 0.0, "step": 6300 }, { "epoch": 0.04151315789473684, "grad_norm": 3.0, "grad_norm_var": 0.18596903483072916, "learning_rate": 0.0001, "loss": 3.4135, "loss/crossentropy": 2.186020624637604, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.30242343842983244, "loss/reg": 0.0, "step": 6310 }, { "epoch": 0.041578947368421056, "grad_norm": 3.34375, "grad_norm_var": 0.06327718098958333, "learning_rate": 0.0001, "loss": 3.44, "loss/crossentropy": 2.3751362919807435, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.29995152205228803, "loss/reg": 0.0, "step": 6320 }, { "epoch": 0.04164473684210526, "grad_norm": 7.9375, "grad_norm_var": 3.442041015625, "learning_rate": 0.0001, "loss": 3.4985, "loss/crossentropy": 2.2889585196971893, "loss/hidden": 3.14375, "loss/incoh": 0.0, "loss/logits": 0.5501813948154449, "loss/reg": 0.0, "step": 6330 }, { "epoch": 0.041710526315789476, "grad_norm": 2.875, "grad_norm_var": 3.8211822509765625, "learning_rate": 0.0001, "loss": 3.4688, "loss/crossentropy": 2.378446078300476, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.30127355754375457, "loss/reg": 0.0, "step": 6340 }, { "epoch": 0.04177631578947368, "grad_norm": 3.109375, "grad_norm_var": 1.1990793863932292, "learning_rate": 0.0001, "loss": 3.3813, "loss/crossentropy": 2.444005084037781, "loss/hidden": 3.209375, "loss/incoh": 0.0, "loss/logits": 0.3470224469900131, "loss/reg": 0.0, "step": 6350 }, { "epoch": 0.041842105263157896, "grad_norm": 2.234375, "grad_norm_var": 0.7403065999348958, "learning_rate": 0.0001, "loss": 3.4, "loss/crossentropy": 2.026668357849121, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.2548553004860878, "loss/reg": 0.0, "step": 6360 }, { "epoch": 0.0419078947368421, "grad_norm": 2.921875, "grad_norm_var": 0.3556925455729167, "learning_rate": 0.0001, "loss": 3.4591, "loss/crossentropy": 2.331345629692078, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.26911164075136185, "loss/reg": 0.0, "step": 6370 }, { "epoch": 0.041973684210526316, "grad_norm": 2.484375, "grad_norm_var": 0.27215169270833334, "learning_rate": 0.0001, "loss": 3.4011, "loss/crossentropy": 2.2243176221847536, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.30012439042329786, "loss/reg": 0.0, "step": 6380 }, { "epoch": 0.04203947368421053, "grad_norm": 2.625, "grad_norm_var": 0.0267974853515625, "learning_rate": 0.0001, "loss": 3.3476, "loss/crossentropy": 2.284479832649231, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.3452886208891869, "loss/reg": 0.0, "step": 6390 }, { "epoch": 0.042105263157894736, "grad_norm": 2.859375, "grad_norm_var": 0.18044331868489583, "learning_rate": 0.0001, "loss": 3.4206, "loss/crossentropy": 2.2036523103713987, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.2783824667334557, "loss/reg": 0.0, "step": 6400 }, { "epoch": 0.04217105263157895, "grad_norm": 2.34375, "grad_norm_var": 4.91226806640625, "learning_rate": 0.0001, "loss": 3.4892, "loss/crossentropy": 2.2305456399917603, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.29731594026088715, "loss/reg": 0.0, "step": 6410 }, { "epoch": 0.042236842105263156, "grad_norm": 2.15625, "grad_norm_var": 0.8588216145833333, "learning_rate": 0.0001, "loss": 3.3759, "loss/crossentropy": 2.186525213718414, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.31351439356803895, "loss/reg": 0.0, "step": 6420 }, { "epoch": 0.04230263157894737, "grad_norm": 2.421875, "grad_norm_var": 1.168097941080729, "learning_rate": 0.0001, "loss": 3.4452, "loss/crossentropy": 2.3028628826141357, "loss/hidden": 3.29375, "loss/incoh": 0.0, "loss/logits": 0.40216329991817473, "loss/reg": 0.0, "step": 6430 }, { "epoch": 0.042368421052631576, "grad_norm": 2.296875, "grad_norm_var": 0.0708160400390625, "learning_rate": 0.0001, "loss": 3.362, "loss/crossentropy": 2.5469772100448607, "loss/hidden": 3.10625, "loss/incoh": 0.0, "loss/logits": 0.3354289785027504, "loss/reg": 0.0, "step": 6440 }, { "epoch": 0.04243421052631579, "grad_norm": 3.4375, "grad_norm_var": 0.09075419108072917, "learning_rate": 0.0001, "loss": 3.366, "loss/crossentropy": 2.3079045534133913, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.2784201934933662, "loss/reg": 0.0, "step": 6450 }, { "epoch": 0.0425, "grad_norm": 2.5625, "grad_norm_var": 0.10932515462239584, "learning_rate": 0.0001, "loss": 3.4156, "loss/crossentropy": 2.325330352783203, "loss/hidden": 3.1828125, "loss/incoh": 0.0, "loss/logits": 0.3098024681210518, "loss/reg": 0.0, "step": 6460 }, { "epoch": 0.04256578947368421, "grad_norm": 2.890625, "grad_norm_var": 0.07423502604166667, "learning_rate": 0.0001, "loss": 3.3605, "loss/crossentropy": 2.4809486865997314, "loss/hidden": 3.146875, "loss/incoh": 0.0, "loss/logits": 0.336503566801548, "loss/reg": 0.0, "step": 6470 }, { "epoch": 0.04263157894736842, "grad_norm": 2.5, "grad_norm_var": 0.06294657389322916, "learning_rate": 0.0001, "loss": 3.4219, "loss/crossentropy": 2.357036566734314, "loss/hidden": 3.16875, "loss/incoh": 0.0, "loss/logits": 0.31517077386379244, "loss/reg": 0.0, "step": 6480 }, { "epoch": 0.04269736842105263, "grad_norm": 2.96875, "grad_norm_var": 0.044188435872395834, "learning_rate": 0.0001, "loss": 3.2698, "loss/crossentropy": 2.3011206150054933, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.3023343622684479, "loss/reg": 0.0, "step": 6490 }, { "epoch": 0.04276315789473684, "grad_norm": 3.75, "grad_norm_var": 0.191259765625, "learning_rate": 0.0001, "loss": 3.3991, "loss/crossentropy": 2.135625755786896, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.30503824055194856, "loss/reg": 0.0, "step": 6500 }, { "epoch": 0.04282894736842105, "grad_norm": 2.796875, "grad_norm_var": 0.16857096354166667, "learning_rate": 0.0001, "loss": 3.317, "loss/crossentropy": 2.2963179469108583, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.30365400537848475, "loss/reg": 0.0, "step": 6510 }, { "epoch": 0.04289473684210526, "grad_norm": 2.578125, "grad_norm_var": 0.0795562744140625, "learning_rate": 0.0001, "loss": 3.3769, "loss/crossentropy": 2.463003098964691, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2864454731345177, "loss/reg": 0.0, "step": 6520 }, { "epoch": 0.04296052631578948, "grad_norm": 3.28125, "grad_norm_var": 0.1319244384765625, "learning_rate": 0.0001, "loss": 3.4418, "loss/crossentropy": 2.4336194515228273, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.2907303601503372, "loss/reg": 0.0, "step": 6530 }, { "epoch": 0.04302631578947368, "grad_norm": 2.78125, "grad_norm_var": 0.18455403645833332, "learning_rate": 0.0001, "loss": 3.5029, "loss/crossentropy": 2.4530020356178284, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.3084172964096069, "loss/reg": 0.0, "step": 6540 }, { "epoch": 0.0430921052631579, "grad_norm": 2.625, "grad_norm_var": 0.0909332275390625, "learning_rate": 0.0001, "loss": 3.4113, "loss/crossentropy": 2.28972727060318, "loss/hidden": 3.09375, "loss/incoh": 0.0, "loss/logits": 0.29103828966617584, "loss/reg": 0.0, "step": 6550 }, { "epoch": 0.0431578947368421, "grad_norm": 3.15625, "grad_norm_var": 0.09442952473958334, "learning_rate": 0.0001, "loss": 3.3842, "loss/crossentropy": 2.5410515666007996, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.262499050796032, "loss/reg": 0.0, "step": 6560 }, { "epoch": 0.04322368421052632, "grad_norm": 2.625, "grad_norm_var": 0.5502105712890625, "learning_rate": 0.0001, "loss": 3.3708, "loss/crossentropy": 2.460482358932495, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.2910577103495598, "loss/reg": 0.0, "step": 6570 }, { "epoch": 0.043289473684210523, "grad_norm": 2.484375, "grad_norm_var": 0.5198527018229167, "learning_rate": 0.0001, "loss": 3.2716, "loss/crossentropy": 2.264538216590881, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.26389954835176466, "loss/reg": 0.0, "step": 6580 }, { "epoch": 0.04335526315789474, "grad_norm": 2.609375, "grad_norm_var": 0.58785400390625, "learning_rate": 0.0001, "loss": 3.4431, "loss/crossentropy": 2.705476760864258, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.3710309460759163, "loss/reg": 0.0, "step": 6590 }, { "epoch": 0.04342105263157895, "grad_norm": 2.71875, "grad_norm_var": 0.7066721598307292, "learning_rate": 0.0001, "loss": 3.3844, "loss/crossentropy": 2.3109049081802366, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.3492768794298172, "loss/reg": 0.0, "step": 6600 }, { "epoch": 0.04348684210526316, "grad_norm": 2.40625, "grad_norm_var": 2.2473958333333335, "learning_rate": 0.0001, "loss": 3.5366, "loss/crossentropy": 2.110491228103638, "loss/hidden": 3.2296875, "loss/incoh": 0.0, "loss/logits": 0.3138969212770462, "loss/reg": 0.0, "step": 6610 }, { "epoch": 0.04355263157894737, "grad_norm": 2.265625, "grad_norm_var": 1.2563222249348958, "learning_rate": 0.0001, "loss": 3.3583, "loss/crossentropy": 2.5283448338508605, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.2928563803434372, "loss/reg": 0.0, "step": 6620 }, { "epoch": 0.04361842105263158, "grad_norm": 2.359375, "grad_norm_var": 1.2412261962890625, "learning_rate": 0.0001, "loss": 3.3885, "loss/crossentropy": 2.579999303817749, "loss/hidden": 3.4296875, "loss/incoh": 0.0, "loss/logits": 0.4414908319711685, "loss/reg": 0.0, "step": 6630 }, { "epoch": 0.04368421052631579, "grad_norm": 2.546875, "grad_norm_var": 0.11516825358072917, "learning_rate": 0.0001, "loss": 3.3022, "loss/crossentropy": 2.413753032684326, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.28862773478031156, "loss/reg": 0.0, "step": 6640 }, { "epoch": 0.04375, "grad_norm": 2.828125, "grad_norm_var": 0.28503316243489585, "learning_rate": 0.0001, "loss": 3.4009, "loss/crossentropy": 2.4203495264053343, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2926980495452881, "loss/reg": 0.0, "step": 6650 }, { "epoch": 0.04381578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.07642822265625, "learning_rate": 0.0001, "loss": 3.4092, "loss/crossentropy": 2.3588085770606995, "loss/hidden": 3.2, "loss/incoh": 0.0, "loss/logits": 0.32405087500810625, "loss/reg": 0.0, "step": 6660 }, { "epoch": 0.043881578947368424, "grad_norm": 3.53125, "grad_norm_var": 0.45767822265625, "learning_rate": 0.0001, "loss": 3.4143, "loss/crossentropy": 2.512442636489868, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.2854305922985077, "loss/reg": 0.0, "step": 6670 }, { "epoch": 0.04394736842105263, "grad_norm": 2.328125, "grad_norm_var": 0.3211873372395833, "learning_rate": 0.0001, "loss": 3.3297, "loss/crossentropy": 2.5028780698776245, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.3471944749355316, "loss/reg": 0.0, "step": 6680 }, { "epoch": 0.044013157894736844, "grad_norm": 4.0625, "grad_norm_var": 0.8257720947265625, "learning_rate": 0.0001, "loss": 3.4063, "loss/crossentropy": 2.2031276702880858, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.25473351776599884, "loss/reg": 0.0, "step": 6690 }, { "epoch": 0.04407894736842105, "grad_norm": 2.265625, "grad_norm_var": 0.22024637858072918, "learning_rate": 0.0001, "loss": 3.4035, "loss/crossentropy": 2.2040748953819276, "loss/hidden": 3.434375, "loss/incoh": 0.0, "loss/logits": 0.3361481264233589, "loss/reg": 0.0, "step": 6700 }, { "epoch": 0.044144736842105264, "grad_norm": 2.15625, "grad_norm_var": 0.1261383056640625, "learning_rate": 0.0001, "loss": 3.4225, "loss/crossentropy": 2.390383541584015, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2891513243317604, "loss/reg": 0.0, "step": 6710 }, { "epoch": 0.04421052631578947, "grad_norm": 2.65625, "grad_norm_var": 0.0734375, "learning_rate": 0.0001, "loss": 3.4363, "loss/crossentropy": 2.392831575870514, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.3119692116975784, "loss/reg": 0.0, "step": 6720 }, { "epoch": 0.044276315789473684, "grad_norm": 2.453125, "grad_norm_var": 0.12464192708333334, "learning_rate": 0.0001, "loss": 3.3959, "loss/crossentropy": 2.426222395896912, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.2563284829258919, "loss/reg": 0.0, "step": 6730 }, { "epoch": 0.0443421052631579, "grad_norm": 2.890625, "grad_norm_var": 3.1175201416015623, "learning_rate": 0.0001, "loss": 3.4427, "loss/crossentropy": 2.408197546005249, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.3056713670492172, "loss/reg": 0.0, "step": 6740 }, { "epoch": 0.044407894736842105, "grad_norm": 2.359375, "grad_norm_var": 2.6096099853515624, "learning_rate": 0.0001, "loss": 3.3365, "loss/crossentropy": 2.5353691220283507, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.30413212031126025, "loss/reg": 0.0, "step": 6750 }, { "epoch": 0.04447368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.233740234375, "learning_rate": 0.0001, "loss": 3.371, "loss/crossentropy": 2.495334494113922, "loss/hidden": 3.0640625, "loss/incoh": 0.0, "loss/logits": 0.3251196876168251, "loss/reg": 0.0, "step": 6760 }, { "epoch": 0.044539473684210525, "grad_norm": 2.375, "grad_norm_var": 0.11097005208333334, "learning_rate": 0.0001, "loss": 3.3306, "loss/crossentropy": 2.3767608165740968, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.27886694818735125, "loss/reg": 0.0, "step": 6770 }, { "epoch": 0.04460526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.17870992024739582, "learning_rate": 0.0001, "loss": 3.4315, "loss/crossentropy": 2.5281002640724184, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.31765572130680086, "loss/reg": 0.0, "step": 6780 }, { "epoch": 0.044671052631578945, "grad_norm": 3.046875, "grad_norm_var": 3.0139973958333335, "learning_rate": 0.0001, "loss": 3.4717, "loss/crossentropy": 2.362587594985962, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.3029619336128235, "loss/reg": 0.0, "step": 6790 }, { "epoch": 0.04473684210526316, "grad_norm": 2.6875, "grad_norm_var": 7.6724192301432295, "learning_rate": 0.0001, "loss": 3.4302, "loss/crossentropy": 2.4665863275527955, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.2934702351689339, "loss/reg": 0.0, "step": 6800 }, { "epoch": 0.04480263157894737, "grad_norm": 2.453125, "grad_norm_var": 7.714127604166666, "learning_rate": 0.0001, "loss": 3.4034, "loss/crossentropy": 2.1714313626289368, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.3513699471950531, "loss/reg": 0.0, "step": 6810 }, { "epoch": 0.04486842105263158, "grad_norm": 3.953125, "grad_norm_var": 0.17561848958333334, "learning_rate": 0.0001, "loss": 3.3193, "loss/crossentropy": 2.145359480381012, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.299596332013607, "loss/reg": 0.0, "step": 6820 }, { "epoch": 0.04493421052631579, "grad_norm": 2.328125, "grad_norm_var": 0.20790913899739583, "learning_rate": 0.0001, "loss": 3.343, "loss/crossentropy": 2.7453124046325685, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.36230285465717316, "loss/reg": 0.0, "step": 6830 }, { "epoch": 0.045, "grad_norm": 2.6875, "grad_norm_var": 2.236214192708333, "learning_rate": 0.0001, "loss": 3.4372, "loss/crossentropy": 2.256898009777069, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.2946140691637993, "loss/reg": 0.0, "step": 6840 }, { "epoch": 0.04506578947368421, "grad_norm": 2.375, "grad_norm_var": 2.391950480143229, "learning_rate": 0.0001, "loss": 3.3701, "loss/crossentropy": 2.6210601568222045, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.3517232984304428, "loss/reg": 0.0, "step": 6850 }, { "epoch": 0.04513157894736842, "grad_norm": 2.4375, "grad_norm_var": 1.015998331705729, "learning_rate": 0.0001, "loss": 3.3998, "loss/crossentropy": 2.387173318862915, "loss/hidden": 3.3296875, "loss/incoh": 0.0, "loss/logits": 0.3826398134231567, "loss/reg": 0.0, "step": 6860 }, { "epoch": 0.04519736842105263, "grad_norm": 2.84375, "grad_norm_var": 0.44384663899739585, "learning_rate": 0.0001, "loss": 3.3858, "loss/crossentropy": 2.4001118540763855, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.27424332648515704, "loss/reg": 0.0, "step": 6870 }, { "epoch": 0.045263157894736845, "grad_norm": 2.671875, "grad_norm_var": 0.06037495930989583, "learning_rate": 0.0001, "loss": 3.3752, "loss/crossentropy": 2.1074828147888183, "loss/hidden": 3.0234375, "loss/incoh": 0.0, "loss/logits": 0.2777694225311279, "loss/reg": 0.0, "step": 6880 }, { "epoch": 0.04532894736842105, "grad_norm": 2.515625, "grad_norm_var": 0.1735504150390625, "learning_rate": 0.0001, "loss": 3.3792, "loss/crossentropy": 2.2711395502090452, "loss/hidden": 3.1296875, "loss/incoh": 0.0, "loss/logits": 0.3001497104763985, "loss/reg": 0.0, "step": 6890 }, { "epoch": 0.045394736842105265, "grad_norm": 3.0, "grad_norm_var": 1.8758951822916667, "learning_rate": 0.0001, "loss": 3.4311, "loss/crossentropy": 2.2329100012779235, "loss/hidden": 3.3421875, "loss/incoh": 0.0, "loss/logits": 0.3293987289071083, "loss/reg": 0.0, "step": 6900 }, { "epoch": 0.04546052631578947, "grad_norm": 2.703125, "grad_norm_var": 1.793781534830729, "learning_rate": 0.0001, "loss": 3.2852, "loss/crossentropy": 2.3576371729373933, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.27053930461406706, "loss/reg": 0.0, "step": 6910 }, { "epoch": 0.045526315789473686, "grad_norm": 2.484375, "grad_norm_var": 1.989207967122396, "learning_rate": 0.0001, "loss": 3.4411, "loss/crossentropy": 2.532450318336487, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.2638275146484375, "loss/reg": 0.0, "step": 6920 }, { "epoch": 0.04559210526315789, "grad_norm": 2.546875, "grad_norm_var": 2.184137980143229, "learning_rate": 0.0001, "loss": 3.4069, "loss/crossentropy": 2.4933431267738344, "loss/hidden": 3.2640625, "loss/incoh": 0.0, "loss/logits": 0.32281421422958373, "loss/reg": 0.0, "step": 6930 }, { "epoch": 0.045657894736842106, "grad_norm": 2.421875, "grad_norm_var": 0.05158589680989583, "learning_rate": 0.0001, "loss": 3.284, "loss/crossentropy": 2.2752655148506165, "loss/hidden": 2.9734375, "loss/incoh": 0.0, "loss/logits": 0.2645736649632454, "loss/reg": 0.0, "step": 6940 }, { "epoch": 0.04572368421052632, "grad_norm": 2.40625, "grad_norm_var": 0.0625152587890625, "learning_rate": 0.0001, "loss": 3.3456, "loss/crossentropy": 2.290147030353546, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.27036611288785933, "loss/reg": 0.0, "step": 6950 }, { "epoch": 0.045789473684210526, "grad_norm": 2.53125, "grad_norm_var": 6.027144368489584, "learning_rate": 0.0001, "loss": 3.4116, "loss/crossentropy": 2.359883761405945, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.29680062681436536, "loss/reg": 0.0, "step": 6960 }, { "epoch": 0.04585526315789474, "grad_norm": 2.984375, "grad_norm_var": 0.5505208333333333, "learning_rate": 0.0001, "loss": 3.334, "loss/crossentropy": 2.4557218074798586, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.2946802690625191, "loss/reg": 0.0, "step": 6970 }, { "epoch": 0.045921052631578946, "grad_norm": 2.5, "grad_norm_var": 0.50865478515625, "learning_rate": 0.0001, "loss": 3.3166, "loss/crossentropy": 2.2482771933078767, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.25328404903411866, "loss/reg": 0.0, "step": 6980 }, { "epoch": 0.04598684210526316, "grad_norm": 2.203125, "grad_norm_var": 0.045685831705729166, "learning_rate": 0.0001, "loss": 3.4432, "loss/crossentropy": 2.3823115646839144, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.29391862004995345, "loss/reg": 0.0, "step": 6990 }, { "epoch": 0.046052631578947366, "grad_norm": 2.453125, "grad_norm_var": 0.04589436848958333, "learning_rate": 0.0001, "loss": 3.4052, "loss/crossentropy": 2.67444326877594, "loss/hidden": 3.3140625, "loss/incoh": 0.0, "loss/logits": 0.336136220395565, "loss/reg": 0.0, "step": 7000 }, { "epoch": 0.04611842105263158, "grad_norm": 2.484375, "grad_norm_var": 0.03218994140625, "learning_rate": 0.0001, "loss": 3.3293, "loss/crossentropy": 2.4088356614112856, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.26314016729593276, "loss/reg": 0.0, "step": 7010 }, { "epoch": 0.04618421052631579, "grad_norm": 2.484375, "grad_norm_var": 0.044266764322916666, "learning_rate": 0.0001, "loss": 3.3201, "loss/crossentropy": 2.210337924957275, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.28177270889282224, "loss/reg": 0.0, "step": 7020 }, { "epoch": 0.04625, "grad_norm": 2.328125, "grad_norm_var": 1.1310780843098958, "learning_rate": 0.0001, "loss": 3.4065, "loss/crossentropy": 2.3901759028434753, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3315755516290665, "loss/reg": 0.0, "step": 7030 }, { "epoch": 0.04631578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.27060139973958336, "learning_rate": 0.0001, "loss": 3.4377, "loss/crossentropy": 2.3308543801307677, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.27955446392297745, "loss/reg": 0.0, "step": 7040 }, { "epoch": 0.04638157894736842, "grad_norm": 3.953125, "grad_norm_var": 0.25728759765625, "learning_rate": 0.0001, "loss": 3.3076, "loss/crossentropy": 2.3288169384002684, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.25437864363193513, "loss/reg": 0.0, "step": 7050 }, { "epoch": 0.04644736842105263, "grad_norm": 3.046875, "grad_norm_var": 0.20709228515625, "learning_rate": 0.0001, "loss": 3.439, "loss/crossentropy": 2.033195120096207, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.2639622241258621, "loss/reg": 0.0, "step": 7060 }, { "epoch": 0.04651315789473684, "grad_norm": 2.390625, "grad_norm_var": 0.84127197265625, "learning_rate": 0.0001, "loss": 3.3119, "loss/crossentropy": 2.366466200351715, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.2680037707090378, "loss/reg": 0.0, "step": 7070 }, { "epoch": 0.04657894736842105, "grad_norm": 11.9375, "grad_norm_var": 5.57564697265625, "learning_rate": 0.0001, "loss": 3.4068, "loss/crossentropy": 2.3484351873397826, "loss/hidden": 3.3828125, "loss/incoh": 0.0, "loss/logits": 0.3572035223245621, "loss/reg": 0.0, "step": 7080 }, { "epoch": 0.04664473684210527, "grad_norm": 2.515625, "grad_norm_var": 5.670992024739584, "learning_rate": 0.0001, "loss": 3.5356, "loss/crossentropy": 2.491948664188385, "loss/hidden": 3.1484375, "loss/incoh": 0.0, "loss/logits": 0.33623201847076417, "loss/reg": 0.0, "step": 7090 }, { "epoch": 0.04671052631578947, "grad_norm": 7.90625, "grad_norm_var": 1.8282297770182292, "learning_rate": 0.0001, "loss": 3.4057, "loss/crossentropy": 2.3702210783958435, "loss/hidden": 3.203125, "loss/incoh": 0.0, "loss/logits": 0.3451476514339447, "loss/reg": 0.0, "step": 7100 }, { "epoch": 0.04677631578947369, "grad_norm": 2.5, "grad_norm_var": 1.74947509765625, "learning_rate": 0.0001, "loss": 3.4173, "loss/crossentropy": 2.372307813167572, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.30449149161577227, "loss/reg": 0.0, "step": 7110 }, { "epoch": 0.04684210526315789, "grad_norm": 2.484375, "grad_norm_var": 0.1269683837890625, "learning_rate": 0.0001, "loss": 3.3266, "loss/crossentropy": 2.4220902919769287, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.29380183219909667, "loss/reg": 0.0, "step": 7120 }, { "epoch": 0.04690789473684211, "grad_norm": 2.375, "grad_norm_var": 5.4926503499348955, "learning_rate": 0.0001, "loss": 3.4299, "loss/crossentropy": 2.333963227272034, "loss/hidden": 3.0890625, "loss/incoh": 0.0, "loss/logits": 0.32264130711555483, "loss/reg": 0.0, "step": 7130 }, { "epoch": 0.04697368421052631, "grad_norm": 2.453125, "grad_norm_var": 8.575126139322917, "learning_rate": 0.0001, "loss": 3.5129, "loss/crossentropy": 2.447948896884918, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.34722310602664946, "loss/reg": 0.0, "step": 7140 }, { "epoch": 0.04703947368421053, "grad_norm": 2.171875, "grad_norm_var": 4.295670572916666, "learning_rate": 0.0001, "loss": 3.4954, "loss/crossentropy": 2.4875372767448427, "loss/hidden": 3.1765625, "loss/incoh": 0.0, "loss/logits": 0.38640123009681704, "loss/reg": 0.0, "step": 7150 }, { "epoch": 0.04710526315789473, "grad_norm": 2.53125, "grad_norm_var": 0.09901936848958333, "learning_rate": 0.0001, "loss": 3.3592, "loss/crossentropy": 2.4286764740943907, "loss/hidden": 3.1515625, "loss/incoh": 0.0, "loss/logits": 0.3441846176981926, "loss/reg": 0.0, "step": 7160 }, { "epoch": 0.04717105263157895, "grad_norm": 2.46875, "grad_norm_var": 0.09706624348958333, "learning_rate": 0.0001, "loss": 3.4219, "loss/crossentropy": 2.197064208984375, "loss/hidden": 3.215625, "loss/incoh": 0.0, "loss/logits": 0.30653059035539626, "loss/reg": 0.0, "step": 7170 }, { "epoch": 0.04723684210526316, "grad_norm": 2.421875, "grad_norm_var": 0.07274983723958334, "learning_rate": 0.0001, "loss": 3.4754, "loss/crossentropy": 2.592367339134216, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.30859925150871276, "loss/reg": 0.0, "step": 7180 }, { "epoch": 0.04730263157894737, "grad_norm": 2.671875, "grad_norm_var": 0.04854227701822917, "learning_rate": 0.0001, "loss": 3.335, "loss/crossentropy": 2.4139750719070436, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.26624749004840853, "loss/reg": 0.0, "step": 7190 }, { "epoch": 0.04736842105263158, "grad_norm": 2.34375, "grad_norm_var": 0.061579386393229164, "learning_rate": 0.0001, "loss": 3.4049, "loss/crossentropy": 1.9312179803848266, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.23240152448415757, "loss/reg": 0.0, "step": 7200 }, { "epoch": 0.04743421052631579, "grad_norm": 3.140625, "grad_norm_var": 2.7487790626051414e+17, "learning_rate": 0.0001, "loss": 3.5157, "loss/crossentropy": 2.192197346687317, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.28352061808109286, "loss/reg": 0.0, "step": 7210 }, { "epoch": 0.0475, "grad_norm": 2.828125, "grad_norm_var": 2.7487790627662506e+17, "learning_rate": 0.0001, "loss": 3.3283, "loss/crossentropy": 2.2706116318702696, "loss/hidden": 3.0734375, "loss/incoh": 0.0, "loss/logits": 0.29702268838882445, "loss/reg": 0.0, "step": 7220 }, { "epoch": 0.04756578947368421, "grad_norm": 2.765625, "grad_norm_var": 0.05640869140625, "learning_rate": 0.0001, "loss": 3.2627, "loss/crossentropy": 2.354196774959564, "loss/hidden": 3.0625, "loss/incoh": 0.0, "loss/logits": 0.2917496845126152, "loss/reg": 0.0, "step": 7230 }, { "epoch": 0.04763157894736842, "grad_norm": 2.4375, "grad_norm_var": 0.048779296875, "learning_rate": 0.0001, "loss": 3.3302, "loss/crossentropy": 2.4912230253219603, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.3378173440694809, "loss/reg": 0.0, "step": 7240 }, { "epoch": 0.047697368421052634, "grad_norm": 2.359375, "grad_norm_var": 0.0933990478515625, "learning_rate": 0.0001, "loss": 3.348, "loss/crossentropy": 2.541353499889374, "loss/hidden": 3.0359375, "loss/incoh": 0.0, "loss/logits": 0.3061401903629303, "loss/reg": 0.0, "step": 7250 }, { "epoch": 0.04776315789473684, "grad_norm": 4.15625, "grad_norm_var": 0.45806884765625, "learning_rate": 0.0001, "loss": 3.5007, "loss/crossentropy": 2.4307178616523744, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.3470552325248718, "loss/reg": 0.0, "step": 7260 }, { "epoch": 0.047828947368421054, "grad_norm": 2.421875, "grad_norm_var": 0.4556955973307292, "learning_rate": 0.0001, "loss": 3.3409, "loss/crossentropy": 2.257239353656769, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.29585833847522736, "loss/reg": 0.0, "step": 7270 }, { "epoch": 0.04789473684210526, "grad_norm": 2.546875, "grad_norm_var": 0.3855445861816406, "learning_rate": 0.0001, "loss": 3.2964, "loss/crossentropy": 2.1451990723609926, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.25767376720905305, "loss/reg": 0.0, "step": 7280 }, { "epoch": 0.047960526315789474, "grad_norm": 2.484375, "grad_norm_var": 0.26484349568684895, "learning_rate": 0.0001, "loss": 3.3283, "loss/crossentropy": 2.2842231035232543, "loss/hidden": 3.1171875, "loss/incoh": 0.0, "loss/logits": 0.33771214783191683, "loss/reg": 0.0, "step": 7290 }, { "epoch": 0.04802631578947368, "grad_norm": 2.578125, "grad_norm_var": 0.13931884765625, "learning_rate": 0.0001, "loss": 3.2818, "loss/crossentropy": 2.1290991365909577, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.2695446103811264, "loss/reg": 0.0, "step": 7300 }, { "epoch": 0.048092105263157894, "grad_norm": 2.53125, "grad_norm_var": 0.0911773681640625, "learning_rate": 0.0001, "loss": 3.3833, "loss/crossentropy": 2.149968445301056, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.24086009413003923, "loss/reg": 0.0, "step": 7310 }, { "epoch": 0.04815789473684211, "grad_norm": 2.65625, "grad_norm_var": 0.07766011555989584, "learning_rate": 0.0001, "loss": 3.3503, "loss/crossentropy": 2.519470489025116, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.25514234602451324, "loss/reg": 0.0, "step": 7320 }, { "epoch": 0.048223684210526314, "grad_norm": 2.796875, "grad_norm_var": 0.025614420572916668, "learning_rate": 0.0001, "loss": 3.3595, "loss/crossentropy": 2.5252918124198915, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.281466007232666, "loss/reg": 0.0, "step": 7330 }, { "epoch": 0.04828947368421053, "grad_norm": 2.078125, "grad_norm_var": 0.37810872395833334, "learning_rate": 0.0001, "loss": 3.3791, "loss/crossentropy": 2.0419042229652407, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.2731199100613594, "loss/reg": 0.0, "step": 7340 }, { "epoch": 0.048355263157894735, "grad_norm": 3.421875, "grad_norm_var": 0.19280192057291667, "learning_rate": 0.0001, "loss": 3.3562, "loss/crossentropy": 2.1462841510772703, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.23967409282922744, "loss/reg": 0.0, "step": 7350 }, { "epoch": 0.04842105263157895, "grad_norm": 2.75, "grad_norm_var": 0.13990478515625, "learning_rate": 0.0001, "loss": 3.2953, "loss/crossentropy": 2.4041597843170166, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.26162110567092894, "loss/reg": 0.0, "step": 7360 }, { "epoch": 0.048486842105263155, "grad_norm": 2.375, "grad_norm_var": 0.0317047119140625, "learning_rate": 0.0001, "loss": 3.2731, "loss/crossentropy": 2.61968252658844, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.28658540844917296, "loss/reg": 0.0, "step": 7370 }, { "epoch": 0.04855263157894737, "grad_norm": 2.34375, "grad_norm_var": 0.0795562744140625, "learning_rate": 0.0001, "loss": 3.3537, "loss/crossentropy": 2.330329430103302, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.311858968436718, "loss/reg": 0.0, "step": 7380 }, { "epoch": 0.04861842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.06910400390625, "learning_rate": 0.0001, "loss": 3.3762, "loss/crossentropy": 2.331431567668915, "loss/hidden": 3.14375, "loss/incoh": 0.0, "loss/logits": 0.3637159377336502, "loss/reg": 0.0, "step": 7390 }, { "epoch": 0.04868421052631579, "grad_norm": 2.703125, "grad_norm_var": 0.07679036458333334, "learning_rate": 0.0001, "loss": 3.3474, "loss/crossentropy": 2.334563136100769, "loss/hidden": 3.1203125, "loss/incoh": 0.0, "loss/logits": 0.3102908283472061, "loss/reg": 0.0, "step": 7400 }, { "epoch": 0.04875, "grad_norm": 2.21875, "grad_norm_var": 0.041727701822916664, "learning_rate": 0.0001, "loss": 3.2994, "loss/crossentropy": 2.4575919032096865, "loss/hidden": 3.1453125, "loss/incoh": 0.0, "loss/logits": 0.30418373495340345, "loss/reg": 0.0, "step": 7410 }, { "epoch": 0.04881578947368421, "grad_norm": 2.5625, "grad_norm_var": 0.95484619140625, "learning_rate": 0.0001, "loss": 3.3136, "loss/crossentropy": 2.2615838646888733, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.27002790868282317, "loss/reg": 0.0, "step": 7420 }, { "epoch": 0.04888157894736842, "grad_norm": 2.546875, "grad_norm_var": 0.05439046223958333, "learning_rate": 0.0001, "loss": 3.3084, "loss/crossentropy": 2.2472564220428466, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.28446437120437623, "loss/reg": 0.0, "step": 7430 }, { "epoch": 0.04894736842105263, "grad_norm": 2.40625, "grad_norm_var": 0.08689676920572917, "learning_rate": 0.0001, "loss": 3.3891, "loss/crossentropy": 2.5821902751922607, "loss/hidden": 3.21875, "loss/incoh": 0.0, "loss/logits": 0.29424687922000886, "loss/reg": 0.0, "step": 7440 }, { "epoch": 0.04901315789473684, "grad_norm": 2.703125, "grad_norm_var": 0.09962565104166667, "learning_rate": 0.0001, "loss": 3.4691, "loss/crossentropy": 2.547790551185608, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.3217499524354935, "loss/reg": 0.0, "step": 7450 }, { "epoch": 0.049078947368421055, "grad_norm": 3.03125, "grad_norm_var": 0.18788655598958334, "learning_rate": 0.0001, "loss": 3.3014, "loss/crossentropy": 2.025312936306, "loss/hidden": 3.0359375, "loss/incoh": 0.0, "loss/logits": 0.29601537734270095, "loss/reg": 0.0, "step": 7460 }, { "epoch": 0.04914473684210526, "grad_norm": 2.359375, "grad_norm_var": 0.15539957682291666, "learning_rate": 0.0001, "loss": 3.3516, "loss/crossentropy": 2.1228564500808718, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2723393976688385, "loss/reg": 0.0, "step": 7470 }, { "epoch": 0.049210526315789475, "grad_norm": 2.546875, "grad_norm_var": 0.04488016764322917, "learning_rate": 0.0001, "loss": 3.4105, "loss/crossentropy": 2.221798670291901, "loss/hidden": 3.303125, "loss/incoh": 0.0, "loss/logits": 0.3143211781978607, "loss/reg": 0.0, "step": 7480 }, { "epoch": 0.04927631578947368, "grad_norm": 2.53125, "grad_norm_var": 0.028938802083333333, "learning_rate": 0.0001, "loss": 3.4092, "loss/crossentropy": 2.4991084337234497, "loss/hidden": 3.24375, "loss/incoh": 0.0, "loss/logits": 0.3354015931487083, "loss/reg": 0.0, "step": 7490 }, { "epoch": 0.049342105263157895, "grad_norm": 2.359375, "grad_norm_var": 0.04719136555989583, "learning_rate": 0.0001, "loss": 3.2643, "loss/crossentropy": 2.3538936495780947, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.25752398669719695, "loss/reg": 0.0, "step": 7500 }, { "epoch": 0.0494078947368421, "grad_norm": 2.859375, "grad_norm_var": 0.5662272135416667, "learning_rate": 0.0001, "loss": 3.3662, "loss/crossentropy": 2.2441036820411684, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.26160079389810564, "loss/reg": 0.0, "step": 7510 }, { "epoch": 0.049473684210526316, "grad_norm": 2.28125, "grad_norm_var": 0.5289703369140625, "learning_rate": 0.0001, "loss": 3.3149, "loss/crossentropy": 2.544923734664917, "loss/hidden": 3.015625, "loss/incoh": 0.0, "loss/logits": 0.2898894131183624, "loss/reg": 0.0, "step": 7520 }, { "epoch": 0.04953947368421053, "grad_norm": 2.34375, "grad_norm_var": 0.0388092041015625, "learning_rate": 0.0001, "loss": 3.3611, "loss/crossentropy": 2.5786613702774046, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.2799048855900764, "loss/reg": 0.0, "step": 7530 }, { "epoch": 0.049605263157894736, "grad_norm": 4.625, "grad_norm_var": 0.3494293212890625, "learning_rate": 0.0001, "loss": 3.3299, "loss/crossentropy": 2.6309832334518433, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.3181111514568329, "loss/reg": 0.0, "step": 7540 }, { "epoch": 0.04967105263157895, "grad_norm": 2.984375, "grad_norm_var": 0.32229715983072915, "learning_rate": 0.0001, "loss": 3.381, "loss/crossentropy": 2.395383334159851, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.29645184725522994, "loss/reg": 0.0, "step": 7550 }, { "epoch": 0.049736842105263156, "grad_norm": 2.609375, "grad_norm_var": 0.06101786295572917, "learning_rate": 0.0001, "loss": 3.3076, "loss/crossentropy": 2.2212601780891417, "loss/hidden": 3.0890625, "loss/incoh": 0.0, "loss/logits": 0.2814889296889305, "loss/reg": 0.0, "step": 7560 }, { "epoch": 0.04980263157894737, "grad_norm": 2.171875, "grad_norm_var": 0.5580800374348959, "learning_rate": 0.0001, "loss": 3.3547, "loss/crossentropy": 2.340949076414108, "loss/hidden": 3.2046875, "loss/incoh": 0.0, "loss/logits": 0.3048363208770752, "loss/reg": 0.0, "step": 7570 }, { "epoch": 0.049868421052631576, "grad_norm": 2.140625, "grad_norm_var": 0.5829498291015625, "learning_rate": 0.0001, "loss": 3.2873, "loss/crossentropy": 2.434855592250824, "loss/hidden": 3.1296875, "loss/incoh": 0.0, "loss/logits": 0.33333509862422944, "loss/reg": 0.0, "step": 7580 }, { "epoch": 0.04993421052631579, "grad_norm": 2.625, "grad_norm_var": 0.07898661295572916, "learning_rate": 0.0001, "loss": 3.3901, "loss/crossentropy": 2.2928370952606203, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.30124022662639616, "loss/reg": 0.0, "step": 7590 }, { "epoch": 0.05, "grad_norm": 2.453125, "grad_norm_var": 0.04219462076822917, "learning_rate": 0.0001, "loss": 3.3048, "loss/crossentropy": 2.0018354773521425, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.25127379447221754, "loss/reg": 0.0, "step": 7600 }, { "epoch": 0.05006578947368421, "grad_norm": 2.46875, "grad_norm_var": 0.02652587890625, "learning_rate": 0.0001, "loss": 3.3285, "loss/crossentropy": 2.2610169410705567, "loss/hidden": 3.0640625, "loss/incoh": 0.0, "loss/logits": 0.27540155351161955, "loss/reg": 0.0, "step": 7610 }, { "epoch": 0.05013157894736842, "grad_norm": 2.421875, "grad_norm_var": 0.0210357666015625, "learning_rate": 0.0001, "loss": 3.2714, "loss/crossentropy": 2.2387811303138734, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.27260326892137526, "loss/reg": 0.0, "step": 7620 }, { "epoch": 0.05019736842105263, "grad_norm": 2.265625, "grad_norm_var": 0.25825093587239584, "learning_rate": 0.0001, "loss": 3.4396, "loss/crossentropy": 2.5094308257102966, "loss/hidden": 3.146875, "loss/incoh": 0.0, "loss/logits": 0.3727137431502342, "loss/reg": 0.0, "step": 7630 }, { "epoch": 0.05026315789473684, "grad_norm": 3.203125, "grad_norm_var": 0.42695210774739584, "learning_rate": 0.0001, "loss": 3.4951, "loss/crossentropy": 2.181921923160553, "loss/hidden": 3.10625, "loss/incoh": 0.0, "loss/logits": 0.29040979146957396, "loss/reg": 0.0, "step": 7640 }, { "epoch": 0.05032894736842105, "grad_norm": 3.5625, "grad_norm_var": 0.47198893229166666, "learning_rate": 0.0001, "loss": 3.3254, "loss/crossentropy": 2.4378631830215456, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.32593746185302735, "loss/reg": 0.0, "step": 7650 }, { "epoch": 0.05039473684210526, "grad_norm": 2.234375, "grad_norm_var": 0.12525126139322917, "learning_rate": 0.0001, "loss": 3.2625, "loss/crossentropy": 2.318000388145447, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.267861607670784, "loss/reg": 0.0, "step": 7660 }, { "epoch": 0.050460526315789477, "grad_norm": 2.28125, "grad_norm_var": 0.09370015462239584, "learning_rate": 0.0001, "loss": 3.4075, "loss/crossentropy": 2.300287425518036, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.272597573697567, "loss/reg": 0.0, "step": 7670 }, { "epoch": 0.05052631578947368, "grad_norm": 2.0625, "grad_norm_var": 0.09342041015625, "learning_rate": 0.0001, "loss": 3.2909, "loss/crossentropy": 2.5380281090736387, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.2639786213636398, "loss/reg": 0.0, "step": 7680 }, { "epoch": 0.0505921052631579, "grad_norm": 2.234375, "grad_norm_var": 0.165625, "learning_rate": 0.0001, "loss": 3.2847, "loss/crossentropy": 2.0959218978881835, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2350485235452652, "loss/reg": 0.0, "step": 7690 }, { "epoch": 0.0506578947368421, "grad_norm": 2.6875, "grad_norm_var": 0.17685546875, "learning_rate": 0.0001, "loss": 3.3983, "loss/crossentropy": 2.323216736316681, "loss/hidden": 3.3, "loss/incoh": 0.0, "loss/logits": 0.3249870762228966, "loss/reg": 0.0, "step": 7700 }, { "epoch": 0.05072368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.0990234375, "learning_rate": 0.0001, "loss": 3.3125, "loss/crossentropy": 2.3656753659248353, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.3182197526097298, "loss/reg": 0.0, "step": 7710 }, { "epoch": 0.05078947368421052, "grad_norm": 2.8125, "grad_norm_var": 0.19866129557291667, "learning_rate": 0.0001, "loss": 3.4009, "loss/crossentropy": 2.5273406386375425, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.3028098613023758, "loss/reg": 0.0, "step": 7720 }, { "epoch": 0.05085526315789474, "grad_norm": 3.234375, "grad_norm_var": 0.20156148274739583, "learning_rate": 0.0001, "loss": 3.4345, "loss/crossentropy": 2.391481709480286, "loss/hidden": 3.175, "loss/incoh": 0.0, "loss/logits": 0.3816041976213455, "loss/reg": 0.0, "step": 7730 }, { "epoch": 0.05092105263157895, "grad_norm": 2.984375, "grad_norm_var": 0.08870035807291667, "learning_rate": 0.0001, "loss": 3.3338, "loss/crossentropy": 2.4912365436553956, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2999194458127022, "loss/reg": 0.0, "step": 7740 }, { "epoch": 0.05098684210526316, "grad_norm": 2.46875, "grad_norm_var": 0.32692057291666665, "learning_rate": 0.0001, "loss": 3.3967, "loss/crossentropy": 2.3413360595703123, "loss/hidden": 3.225, "loss/incoh": 0.0, "loss/logits": 0.2852958709001541, "loss/reg": 0.0, "step": 7750 }, { "epoch": 0.05105263157894737, "grad_norm": 2.625, "grad_norm_var": 0.32123921712239584, "learning_rate": 0.0001, "loss": 3.3178, "loss/crossentropy": 2.6519938707351685, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.30396163165569307, "loss/reg": 0.0, "step": 7760 }, { "epoch": 0.05111842105263158, "grad_norm": 2.546875, "grad_norm_var": 0.05095113118489583, "learning_rate": 0.0001, "loss": 3.3751, "loss/crossentropy": 2.4069360971450804, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2661174476146698, "loss/reg": 0.0, "step": 7770 }, { "epoch": 0.05118421052631579, "grad_norm": 2.6875, "grad_norm_var": 0.12613016764322918, "learning_rate": 0.0001, "loss": 3.3737, "loss/crossentropy": 2.3529985427856444, "loss/hidden": 3.08125, "loss/incoh": 0.0, "loss/logits": 0.2910769283771515, "loss/reg": 0.0, "step": 7780 }, { "epoch": 0.05125, "grad_norm": 2.25, "grad_norm_var": 0.03486226399739583, "learning_rate": 0.0001, "loss": 3.3208, "loss/crossentropy": 2.1253769397735596, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.2470591977238655, "loss/reg": 0.0, "step": 7790 }, { "epoch": 0.05131578947368421, "grad_norm": 2.96875, "grad_norm_var": 0.5931925455729167, "learning_rate": 0.0001, "loss": 3.4704, "loss/crossentropy": 2.2813072860240937, "loss/hidden": 3.3578125, "loss/incoh": 0.0, "loss/logits": 0.3708792179822922, "loss/reg": 0.0, "step": 7800 }, { "epoch": 0.051381578947368424, "grad_norm": 2.15625, "grad_norm_var": 0.6090159098307292, "learning_rate": 0.0001, "loss": 3.3966, "loss/crossentropy": 2.071128582954407, "loss/hidden": 3.2015625, "loss/incoh": 0.0, "loss/logits": 0.332095867395401, "loss/reg": 0.0, "step": 7810 }, { "epoch": 0.05144736842105263, "grad_norm": 2.671875, "grad_norm_var": 0.19628499348958334, "learning_rate": 0.0001, "loss": 3.3989, "loss/crossentropy": 2.368660008907318, "loss/hidden": 3.15625, "loss/incoh": 0.0, "loss/logits": 0.28022406101226804, "loss/reg": 0.0, "step": 7820 }, { "epoch": 0.051513157894736844, "grad_norm": 2.40625, "grad_norm_var": 2.9096995035807294, "learning_rate": 0.0001, "loss": 3.3211, "loss/crossentropy": 2.1510692477226256, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.2576268881559372, "loss/reg": 0.0, "step": 7830 }, { "epoch": 0.05157894736842105, "grad_norm": 2.390625, "grad_norm_var": 2.7149729410807293, "learning_rate": 0.0001, "loss": 3.4215, "loss/crossentropy": 2.4896273493766783, "loss/hidden": 3.4703125, "loss/incoh": 0.0, "loss/logits": 0.3669875577092171, "loss/reg": 0.0, "step": 7840 }, { "epoch": 0.051644736842105264, "grad_norm": 6.125, "grad_norm_var": 0.9022450764973958, "learning_rate": 0.0001, "loss": 3.4161, "loss/crossentropy": 2.0616636157035826, "loss/hidden": 3.053125, "loss/incoh": 0.0, "loss/logits": 0.2621200427412987, "loss/reg": 0.0, "step": 7850 }, { "epoch": 0.05171052631578947, "grad_norm": 2.421875, "grad_norm_var": 1.457957967122396, "learning_rate": 0.0001, "loss": 3.3632, "loss/crossentropy": 2.236839824914932, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.28002799302339554, "loss/reg": 0.0, "step": 7860 }, { "epoch": 0.051776315789473684, "grad_norm": 2.828125, "grad_norm_var": 7.502855428059896, "learning_rate": 0.0001, "loss": 3.3749, "loss/crossentropy": 2.4405242681503294, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2591508060693741, "loss/reg": 0.0, "step": 7870 }, { "epoch": 0.0518421052631579, "grad_norm": 2.34375, "grad_norm_var": 3.8793904622395834, "learning_rate": 0.0001, "loss": 3.3791, "loss/crossentropy": 2.5814385414123535, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.31668367683887483, "loss/reg": 0.0, "step": 7880 }, { "epoch": 0.051907894736842104, "grad_norm": 2.46875, "grad_norm_var": 3.8120432535807294, "learning_rate": 0.0001, "loss": 3.4478, "loss/crossentropy": 2.185117280483246, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.34759806394577025, "loss/reg": 0.0, "step": 7890 }, { "epoch": 0.05197368421052632, "grad_norm": 2.609375, "grad_norm_var": 0.09302469889322916, "learning_rate": 0.0001, "loss": 3.3764, "loss/crossentropy": 2.249006187915802, "loss/hidden": 3.253125, "loss/incoh": 0.0, "loss/logits": 0.28100676983594897, "loss/reg": 0.0, "step": 7900 }, { "epoch": 0.052039473684210524, "grad_norm": 2.25, "grad_norm_var": 0.08615620930989583, "learning_rate": 0.0001, "loss": 3.261, "loss/crossentropy": 2.297496807575226, "loss/hidden": 3.0953125, "loss/incoh": 0.0, "loss/logits": 0.30758740454912187, "loss/reg": 0.0, "step": 7910 }, { "epoch": 0.05210526315789474, "grad_norm": 3.46875, "grad_norm_var": 0.08404947916666666, "learning_rate": 0.0001, "loss": 3.3132, "loss/crossentropy": 2.411357748508453, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.29194266349077225, "loss/reg": 0.0, "step": 7920 }, { "epoch": 0.052171052631578944, "grad_norm": 2.90625, "grad_norm_var": 0.27255859375, "learning_rate": 0.0001, "loss": 3.3767, "loss/crossentropy": 2.6010719895362855, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.28802800476551055, "loss/reg": 0.0, "step": 7930 }, { "epoch": 0.05223684210526316, "grad_norm": 2.296875, "grad_norm_var": 0.0465484619140625, "learning_rate": 0.0001, "loss": 3.3193, "loss/crossentropy": 2.6153851985931396, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.2830176457762718, "loss/reg": 0.0, "step": 7940 }, { "epoch": 0.05230263157894737, "grad_norm": 2.609375, "grad_norm_var": 0.0275054931640625, "learning_rate": 0.0001, "loss": 3.2535, "loss/crossentropy": 2.2821019262075426, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.28540263772010804, "loss/reg": 0.0, "step": 7950 }, { "epoch": 0.05236842105263158, "grad_norm": 2.125, "grad_norm_var": 0.16284077962239582, "learning_rate": 0.0001, "loss": 3.2614, "loss/crossentropy": 2.1502284169197083, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.305269892513752, "loss/reg": 0.0, "step": 7960 }, { "epoch": 0.05243421052631579, "grad_norm": 2.453125, "grad_norm_var": 0.1658843994140625, "learning_rate": 0.0001, "loss": 3.3904, "loss/crossentropy": 2.4389883518218993, "loss/hidden": 3.1078125, "loss/incoh": 0.0, "loss/logits": 0.3048334762454033, "loss/reg": 0.0, "step": 7970 }, { "epoch": 0.0525, "grad_norm": 2.765625, "grad_norm_var": 0.07778218587239584, "learning_rate": 0.0001, "loss": 3.4152, "loss/crossentropy": 2.3947508692741395, "loss/hidden": 3.15625, "loss/incoh": 0.0, "loss/logits": 0.3298331335186958, "loss/reg": 0.0, "step": 7980 }, { "epoch": 0.05256578947368421, "grad_norm": 2.78125, "grad_norm_var": 0.08662109375, "learning_rate": 0.0001, "loss": 3.3709, "loss/crossentropy": 2.2743655920028685, "loss/hidden": 3.0734375, "loss/incoh": 0.0, "loss/logits": 0.3255280390381813, "loss/reg": 0.0, "step": 7990 }, { "epoch": 0.05263157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.036881510416666666, "learning_rate": 0.0001, "loss": 3.251, "loss/crossentropy": 2.4580028295516967, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.29261877238750456, "loss/reg": 0.0, "step": 8000 }, { "epoch": 0.05269736842105263, "grad_norm": 3.28125, "grad_norm_var": 0.23829752604166668, "learning_rate": 0.0001, "loss": 3.3925, "loss/crossentropy": 2.569744038581848, "loss/hidden": 3.3046875, "loss/incoh": 0.0, "loss/logits": 0.4232485115528107, "loss/reg": 0.0, "step": 8010 }, { "epoch": 0.052763157894736845, "grad_norm": 2.5625, "grad_norm_var": 0.09871317545572916, "learning_rate": 0.0001, "loss": 3.33, "loss/crossentropy": 2.381676936149597, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.2768414840102196, "loss/reg": 0.0, "step": 8020 }, { "epoch": 0.05282894736842105, "grad_norm": 2.625, "grad_norm_var": 0.07251688639322916, "learning_rate": 0.0001, "loss": 3.3659, "loss/crossentropy": 2.529127871990204, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.3230110973119736, "loss/reg": 0.0, "step": 8030 }, { "epoch": 0.052894736842105265, "grad_norm": 2.375, "grad_norm_var": 0.03585611979166667, "learning_rate": 0.0001, "loss": 3.241, "loss/crossentropy": 2.4573261976242065, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.26054717898368834, "loss/reg": 0.0, "step": 8040 }, { "epoch": 0.05296052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.028319295247395834, "learning_rate": 0.0001, "loss": 3.3654, "loss/crossentropy": 2.3388527154922487, "loss/hidden": 3.03125, "loss/incoh": 0.0, "loss/logits": 0.3071700781583786, "loss/reg": 0.0, "step": 8050 }, { "epoch": 0.053026315789473685, "grad_norm": 3.015625, "grad_norm_var": 0.0393218994140625, "learning_rate": 0.0001, "loss": 3.3602, "loss/crossentropy": 2.4984395027160646, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.2504978612065315, "loss/reg": 0.0, "step": 8060 }, { "epoch": 0.05309210526315789, "grad_norm": 2.390625, "grad_norm_var": 2.4539998372395835, "learning_rate": 0.0001, "loss": 3.4446, "loss/crossentropy": 2.3318467855453493, "loss/hidden": 3.25625, "loss/incoh": 0.0, "loss/logits": 0.38549562990665437, "loss/reg": 0.0, "step": 8070 }, { "epoch": 0.053157894736842105, "grad_norm": 2.921875, "grad_norm_var": 2.5296946207682294, "learning_rate": 0.0001, "loss": 3.3183, "loss/crossentropy": 2.3632636427879334, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.28499974459409716, "loss/reg": 0.0, "step": 8080 }, { "epoch": 0.05322368421052632, "grad_norm": 2.515625, "grad_norm_var": 0.2263824462890625, "learning_rate": 0.0001, "loss": 3.3982, "loss/crossentropy": 2.5644856214523317, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.32271777242422106, "loss/reg": 0.0, "step": 8090 }, { "epoch": 0.053289473684210525, "grad_norm": 2.109375, "grad_norm_var": 0.20636393229166666, "learning_rate": 0.0001, "loss": 3.3351, "loss/crossentropy": 2.195914793014526, "loss/hidden": 3.2546875, "loss/incoh": 0.0, "loss/logits": 0.32049285918474196, "loss/reg": 0.0, "step": 8100 }, { "epoch": 0.05335526315789474, "grad_norm": 2.328125, "grad_norm_var": 0.03369852701822917, "learning_rate": 0.0001, "loss": 3.3031, "loss/crossentropy": 2.6255326747894285, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.27993575036525725, "loss/reg": 0.0, "step": 8110 }, { "epoch": 0.053421052631578946, "grad_norm": 2.484375, "grad_norm_var": 0.07769775390625, "learning_rate": 0.0001, "loss": 3.3496, "loss/crossentropy": 2.430241084098816, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.31150197684764863, "loss/reg": 0.0, "step": 8120 }, { "epoch": 0.05348684210526316, "grad_norm": 2.3125, "grad_norm_var": 0.0697174072265625, "learning_rate": 0.0001, "loss": 3.3655, "loss/crossentropy": 2.4684382557868956, "loss/hidden": 2.9921875, "loss/incoh": 0.0, "loss/logits": 0.2934414252638817, "loss/reg": 0.0, "step": 8130 }, { "epoch": 0.053552631578947366, "grad_norm": 2.921875, "grad_norm_var": 0.09716389973958334, "learning_rate": 0.0001, "loss": 3.3756, "loss/crossentropy": 2.679113733768463, "loss/hidden": 3.3, "loss/incoh": 0.0, "loss/logits": 0.35277644991874696, "loss/reg": 0.0, "step": 8140 }, { "epoch": 0.05361842105263158, "grad_norm": 4.625, "grad_norm_var": 0.3069986979166667, "learning_rate": 0.0001, "loss": 3.3979, "loss/crossentropy": 2.3737685680389404, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.4402651429176331, "loss/reg": 0.0, "step": 8150 }, { "epoch": 0.05368421052631579, "grad_norm": 2.53125, "grad_norm_var": 0.4447255452473958, "learning_rate": 0.0001, "loss": 3.3395, "loss/crossentropy": 2.556090760231018, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2898376002907753, "loss/reg": 0.0, "step": 8160 }, { "epoch": 0.05375, "grad_norm": 2.484375, "grad_norm_var": 0.4153717041015625, "learning_rate": 0.0001, "loss": 3.4755, "loss/crossentropy": 2.3199184775352477, "loss/hidden": 3.2375, "loss/incoh": 0.0, "loss/logits": 0.3026577115058899, "loss/reg": 0.0, "step": 8170 }, { "epoch": 0.05381578947368421, "grad_norm": 2.765625, "grad_norm_var": 0.11787821451822916, "learning_rate": 0.0001, "loss": 3.3616, "loss/crossentropy": 2.566536474227905, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.3062845066189766, "loss/reg": 0.0, "step": 8180 }, { "epoch": 0.05388157894736842, "grad_norm": 2.25, "grad_norm_var": 0.0469146728515625, "learning_rate": 0.0001, "loss": 3.3019, "loss/crossentropy": 2.434731423854828, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.27008683085441587, "loss/reg": 0.0, "step": 8190 }, { "epoch": 0.05394736842105263, "grad_norm": 2.375, "grad_norm_var": 0.2584299723307292, "learning_rate": 0.0001, "loss": 3.359, "loss/crossentropy": 2.2641067147254943, "loss/hidden": 3.0171875, "loss/incoh": 0.0, "loss/logits": 0.27225579768419267, "loss/reg": 0.0, "step": 8200 }, { "epoch": 0.05401315789473684, "grad_norm": 2.25, "grad_norm_var": 0.27990620930989585, "learning_rate": 0.0001, "loss": 3.3185, "loss/crossentropy": 2.410356640815735, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.24587155133485794, "loss/reg": 0.0, "step": 8210 }, { "epoch": 0.05407894736842105, "grad_norm": 2.484375, "grad_norm_var": 0.03493550618489583, "learning_rate": 0.0001, "loss": 3.407, "loss/crossentropy": 2.3557684421539307, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.27189340591430666, "loss/reg": 0.0, "step": 8220 }, { "epoch": 0.054144736842105266, "grad_norm": 2.578125, "grad_norm_var": 0.031615193684895834, "learning_rate": 0.0001, "loss": 3.3054, "loss/crossentropy": 2.453370213508606, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.3121939614415169, "loss/reg": 0.0, "step": 8230 }, { "epoch": 0.05421052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.06060791015625, "learning_rate": 0.0001, "loss": 3.3556, "loss/crossentropy": 2.3469805240631105, "loss/hidden": 3.115625, "loss/incoh": 0.0, "loss/logits": 0.35667684078216555, "loss/reg": 0.0, "step": 8240 }, { "epoch": 0.054276315789473686, "grad_norm": 2.484375, "grad_norm_var": 0.10061442057291667, "learning_rate": 0.0001, "loss": 3.3466, "loss/crossentropy": 2.5584194660186768, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.31178881525993346, "loss/reg": 0.0, "step": 8250 }, { "epoch": 0.05434210526315789, "grad_norm": 2.296875, "grad_norm_var": 0.06204020182291667, "learning_rate": 0.0001, "loss": 3.291, "loss/crossentropy": 2.372837942838669, "loss/hidden": 3.071875, "loss/incoh": 0.0, "loss/logits": 0.27887275665998457, "loss/reg": 0.0, "step": 8260 }, { "epoch": 0.054407894736842106, "grad_norm": 2.3125, "grad_norm_var": 0.05986328125, "learning_rate": 0.0001, "loss": 3.3687, "loss/crossentropy": 2.2389731287956236, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.29073659181594846, "loss/reg": 0.0, "step": 8270 }, { "epoch": 0.05447368421052631, "grad_norm": 2.34375, "grad_norm_var": 0.0772369384765625, "learning_rate": 0.0001, "loss": 3.3364, "loss/crossentropy": 2.130947244167328, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.3286518737673759, "loss/reg": 0.0, "step": 8280 }, { "epoch": 0.05453947368421053, "grad_norm": 2.796875, "grad_norm_var": 0.3402740478515625, "learning_rate": 0.0001, "loss": 3.4311, "loss/crossentropy": 2.5036970019340514, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.3155761957168579, "loss/reg": 0.0, "step": 8290 }, { "epoch": 0.05460526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.07206624348958333, "learning_rate": 0.0001, "loss": 3.3068, "loss/crossentropy": 2.15233553647995, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.2912480518221855, "loss/reg": 0.0, "step": 8300 }, { "epoch": 0.05467105263157895, "grad_norm": 2.765625, "grad_norm_var": 0.043290201822916666, "learning_rate": 0.0001, "loss": 3.3888, "loss/crossentropy": 2.3679205000400545, "loss/hidden": 3.2984375, "loss/incoh": 0.0, "loss/logits": 0.33265506476163864, "loss/reg": 0.0, "step": 8310 }, { "epoch": 0.05473684210526316, "grad_norm": 4.78125, "grad_norm_var": 0.4022745768229167, "learning_rate": 0.0001, "loss": 3.4588, "loss/crossentropy": 2.3284069895744324, "loss/hidden": 3.15625, "loss/incoh": 0.0, "loss/logits": 0.28553168624639513, "loss/reg": 0.0, "step": 8320 }, { "epoch": 0.05480263157894737, "grad_norm": 2.78125, "grad_norm_var": 0.5325480143229167, "learning_rate": 0.0001, "loss": 3.3491, "loss/crossentropy": 2.602140688896179, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.30461025387048724, "loss/reg": 0.0, "step": 8330 }, { "epoch": 0.05486842105263158, "grad_norm": 2.375, "grad_norm_var": 0.21806233723958332, "learning_rate": 0.0001, "loss": 3.2902, "loss/crossentropy": 2.3135936856269836, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.24052460640668868, "loss/reg": 0.0, "step": 8340 }, { "epoch": 0.05493421052631579, "grad_norm": 2.859375, "grad_norm_var": 0.10383707682291667, "learning_rate": 0.0001, "loss": 3.4435, "loss/crossentropy": 2.133499014377594, "loss/hidden": 3.14375, "loss/incoh": 0.0, "loss/logits": 0.3066937685012817, "loss/reg": 0.0, "step": 8350 }, { "epoch": 0.055, "grad_norm": 3.15625, "grad_norm_var": 1.1286936442057292, "learning_rate": 0.0001, "loss": 3.3443, "loss/crossentropy": 2.5188000440597533, "loss/hidden": 3.340625, "loss/incoh": 0.0, "loss/logits": 0.3301475077867508, "loss/reg": 0.0, "step": 8360 }, { "epoch": 0.055065789473684214, "grad_norm": 2.4375, "grad_norm_var": 1.1146230061848958, "learning_rate": 0.0001, "loss": 3.2979, "loss/crossentropy": 2.5192033290863036, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.29569360315799714, "loss/reg": 0.0, "step": 8370 }, { "epoch": 0.05513157894736842, "grad_norm": 4.28125, "grad_norm_var": 0.2727691650390625, "learning_rate": 0.0001, "loss": 3.312, "loss/crossentropy": 2.5402653098106383, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2737806707620621, "loss/reg": 0.0, "step": 8380 }, { "epoch": 0.055197368421052634, "grad_norm": 2.234375, "grad_norm_var": 0.9796702067057291, "learning_rate": 0.0001, "loss": 3.3479, "loss/crossentropy": 2.264913785457611, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.28024169653654096, "loss/reg": 0.0, "step": 8390 }, { "epoch": 0.05526315789473684, "grad_norm": 2.3125, "grad_norm_var": 0.14442952473958334, "learning_rate": 0.0001, "loss": 3.2227, "loss/crossentropy": 2.3250611424446106, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.29773685038089753, "loss/reg": 0.0, "step": 8400 }, { "epoch": 0.055328947368421054, "grad_norm": 3.453125, "grad_norm_var": 0.1592681884765625, "learning_rate": 0.0001, "loss": 3.3882, "loss/crossentropy": 2.73007869720459, "loss/hidden": 3.3, "loss/incoh": 0.0, "loss/logits": 0.361674590408802, "loss/reg": 0.0, "step": 8410 }, { "epoch": 0.05539473684210526, "grad_norm": 2.5, "grad_norm_var": 0.12535807291666667, "learning_rate": 0.0001, "loss": 3.3358, "loss/crossentropy": 2.533698391914368, "loss/hidden": 3.0359375, "loss/incoh": 0.0, "loss/logits": 0.32232231795787813, "loss/reg": 0.0, "step": 8420 }, { "epoch": 0.055460526315789474, "grad_norm": 2.640625, "grad_norm_var": 413.9550415039063, "learning_rate": 0.0001, "loss": 3.4177, "loss/crossentropy": 2.4008097648620605, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.31593555510044097, "loss/reg": 0.0, "step": 8430 }, { "epoch": 0.05552631578947369, "grad_norm": 3.234375, "grad_norm_var": 413.1215077718099, "learning_rate": 0.0001, "loss": 3.3581, "loss/crossentropy": 2.4226332664489747, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.25367428809404374, "loss/reg": 0.0, "step": 8440 }, { "epoch": 0.055592105263157894, "grad_norm": 2.484375, "grad_norm_var": 0.14670817057291666, "learning_rate": 0.0001, "loss": 3.3078, "loss/crossentropy": 2.630770039558411, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.3664619579911232, "loss/reg": 0.0, "step": 8450 }, { "epoch": 0.05565789473684211, "grad_norm": 6.75, "grad_norm_var": 2.27388916015625, "learning_rate": 0.0001, "loss": 3.503, "loss/crossentropy": 2.376649534702301, "loss/hidden": 3.1640625, "loss/incoh": 0.0, "loss/logits": 0.3476260006427765, "loss/reg": 0.0, "step": 8460 }, { "epoch": 0.055723684210526314, "grad_norm": 2.640625, "grad_norm_var": 2.541617838541667, "learning_rate": 0.0001, "loss": 3.4543, "loss/crossentropy": 2.638149607181549, "loss/hidden": 3.34375, "loss/incoh": 0.0, "loss/logits": 0.34109789580106736, "loss/reg": 0.0, "step": 8470 }, { "epoch": 0.05578947368421053, "grad_norm": 2.6875, "grad_norm_var": 0.4665679931640625, "learning_rate": 0.0001, "loss": 3.5013, "loss/crossentropy": 2.40644109249115, "loss/hidden": 3.1515625, "loss/incoh": 0.0, "loss/logits": 0.3092473894357681, "loss/reg": 0.0, "step": 8480 }, { "epoch": 0.055855263157894734, "grad_norm": 2.8125, "grad_norm_var": 1.1175282796223958, "learning_rate": 0.0001, "loss": 3.293, "loss/crossentropy": 2.159450513124466, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.27864499390125275, "loss/reg": 0.0, "step": 8490 }, { "epoch": 0.05592105263157895, "grad_norm": 4.90625, "grad_norm_var": 0.49722900390625, "learning_rate": 0.0001, "loss": 3.3045, "loss/crossentropy": 2.355340528488159, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.2977081388235092, "loss/reg": 0.0, "step": 8500 }, { "epoch": 0.05598684210526316, "grad_norm": 2.46875, "grad_norm_var": 0.4117502848307292, "learning_rate": 0.0001, "loss": 3.3273, "loss/crossentropy": 2.284228873252869, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.27203311026096344, "loss/reg": 0.0, "step": 8510 }, { "epoch": 0.05605263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.15953776041666667, "learning_rate": 0.0001, "loss": 3.3038, "loss/crossentropy": 2.6644370317459107, "loss/hidden": 3.03125, "loss/incoh": 0.0, "loss/logits": 0.2977398321032524, "loss/reg": 0.0, "step": 8520 }, { "epoch": 0.05611842105263158, "grad_norm": 2.65625, "grad_norm_var": 0.07891337076822917, "learning_rate": 0.0001, "loss": 3.3066, "loss/crossentropy": 2.480556678771973, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.3119770348072052, "loss/reg": 0.0, "step": 8530 }, { "epoch": 0.05618421052631579, "grad_norm": 2.53125, "grad_norm_var": 0.2862630208333333, "learning_rate": 0.0001, "loss": 3.3849, "loss/crossentropy": 2.3611693739891053, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.27296979874372485, "loss/reg": 0.0, "step": 8540 }, { "epoch": 0.05625, "grad_norm": 2.453125, "grad_norm_var": 0.41441141764322914, "learning_rate": 0.0001, "loss": 3.3496, "loss/crossentropy": 2.3446286380290986, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.28327286094427107, "loss/reg": 0.0, "step": 8550 }, { "epoch": 0.05631578947368421, "grad_norm": 2.625, "grad_norm_var": 6.07724609375, "learning_rate": 0.0001, "loss": 3.4711, "loss/crossentropy": 1.7737745344638824, "loss/hidden": 3.171875, "loss/incoh": 0.0, "loss/logits": 0.27109832018613816, "loss/reg": 0.0, "step": 8560 }, { "epoch": 0.05638157894736842, "grad_norm": 2.875, "grad_norm_var": 5.751741536458334, "learning_rate": 0.0001, "loss": 3.3501, "loss/crossentropy": 2.3749794125556947, "loss/hidden": 3.0578125, "loss/incoh": 0.0, "loss/logits": 0.2803412050008774, "loss/reg": 0.0, "step": 8570 }, { "epoch": 0.056447368421052635, "grad_norm": 2.1875, "grad_norm_var": 0.09153645833333333, "learning_rate": 0.0001, "loss": 3.3793, "loss/crossentropy": 2.348732423782349, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2703657791018486, "loss/reg": 0.0, "step": 8580 }, { "epoch": 0.05651315789473684, "grad_norm": 2.984375, "grad_norm_var": 0.14538472493489582, "learning_rate": 0.0001, "loss": 3.3984, "loss/crossentropy": 2.2422220349311828, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.2956344410777092, "loss/reg": 0.0, "step": 8590 }, { "epoch": 0.056578947368421055, "grad_norm": 2.46875, "grad_norm_var": 0.07770894368489584, "learning_rate": 0.0001, "loss": 3.3787, "loss/crossentropy": 2.3457955360412597, "loss/hidden": 3.2359375, "loss/incoh": 0.0, "loss/logits": 0.3119618773460388, "loss/reg": 0.0, "step": 8600 }, { "epoch": 0.05664473684210526, "grad_norm": 3.40625, "grad_norm_var": 1.7676717122395833, "learning_rate": 0.0001, "loss": 3.4129, "loss/crossentropy": 2.3159496188163757, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.3751305788755417, "loss/reg": 0.0, "step": 8610 }, { "epoch": 0.056710526315789475, "grad_norm": 2.96875, "grad_norm_var": 1.6932902018229166, "learning_rate": 0.0001, "loss": 3.4391, "loss/crossentropy": 2.6694117546081544, "loss/hidden": 3.053125, "loss/incoh": 0.0, "loss/logits": 0.29238851368427277, "loss/reg": 0.0, "step": 8620 }, { "epoch": 0.05677631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.11296284993489583, "learning_rate": 0.0001, "loss": 3.3208, "loss/crossentropy": 2.1584444522857664, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.2903019651770592, "loss/reg": 0.0, "step": 8630 }, { "epoch": 0.056842105263157895, "grad_norm": 2.078125, "grad_norm_var": 0.12463785807291666, "learning_rate": 0.0001, "loss": 3.3435, "loss/crossentropy": 2.2420501947402953, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.27747504264116285, "loss/reg": 0.0, "step": 8640 }, { "epoch": 0.05690789473684211, "grad_norm": 2.484375, "grad_norm_var": 0.16119384765625, "learning_rate": 0.0001, "loss": 3.226, "loss/crossentropy": 2.378132700920105, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.27239564061164856, "loss/reg": 0.0, "step": 8650 }, { "epoch": 0.056973684210526315, "grad_norm": 2.796875, "grad_norm_var": 0.06622721354166666, "learning_rate": 0.0001, "loss": 3.3322, "loss/crossentropy": 2.1032593488693236, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.2557524010539055, "loss/reg": 0.0, "step": 8660 }, { "epoch": 0.05703947368421053, "grad_norm": 2.640625, "grad_norm_var": 1.2355143229166667, "learning_rate": 0.0001, "loss": 3.3289, "loss/crossentropy": 2.41143513917923, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.27892655730247495, "loss/reg": 0.0, "step": 8670 }, { "epoch": 0.057105263157894735, "grad_norm": 2.40625, "grad_norm_var": 0.021117146809895834, "learning_rate": 0.0001, "loss": 3.348, "loss/crossentropy": 2.443818140029907, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.3347576320171356, "loss/reg": 0.0, "step": 8680 }, { "epoch": 0.05717105263157895, "grad_norm": 2.453125, "grad_norm_var": 9.24689275122101e+16, "learning_rate": 0.0001, "loss": 3.3877, "loss/crossentropy": 2.181317722797394, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.30654960721731184, "loss/reg": 0.0, "step": 8690 }, { "epoch": 0.057236842105263155, "grad_norm": 2.203125, "grad_norm_var": 0.108984375, "learning_rate": 0.0001, "loss": 3.308, "loss/crossentropy": 2.2454151153564452, "loss/hidden": 3.240625, "loss/incoh": 0.0, "loss/logits": 0.3219583719968796, "loss/reg": 0.0, "step": 8700 }, { "epoch": 0.05730263157894737, "grad_norm": 2.40625, "grad_norm_var": 0.44682515462239586, "learning_rate": 0.0001, "loss": 3.2909, "loss/crossentropy": 2.226694929599762, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.27956253886222837, "loss/reg": 0.0, "step": 8710 }, { "epoch": 0.057368421052631575, "grad_norm": 2.171875, "grad_norm_var": 0.46708577473958335, "learning_rate": 0.0001, "loss": 3.3322, "loss/crossentropy": 2.329686003923416, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.27385311424732206, "loss/reg": 0.0, "step": 8720 }, { "epoch": 0.05743421052631579, "grad_norm": 2.703125, "grad_norm_var": 0.09485575358072916, "learning_rate": 0.0001, "loss": 3.3243, "loss/crossentropy": 2.262730371952057, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.26381057798862456, "loss/reg": 0.0, "step": 8730 }, { "epoch": 0.0575, "grad_norm": 2.453125, "grad_norm_var": 0.07937825520833333, "learning_rate": 0.0001, "loss": 3.3226, "loss/crossentropy": 2.43484423160553, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.2970830351114273, "loss/reg": 0.0, "step": 8740 }, { "epoch": 0.05756578947368421, "grad_norm": 4.0625, "grad_norm_var": 0.23977864583333333, "learning_rate": 0.0001, "loss": 3.3517, "loss/crossentropy": 2.2560600876808166, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.2825756400823593, "loss/reg": 0.0, "step": 8750 }, { "epoch": 0.05763157894736842, "grad_norm": 2.359375, "grad_norm_var": 4.011067708333333, "learning_rate": 0.0001, "loss": 3.4317, "loss/crossentropy": 2.1193652033805845, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.2872270569205284, "loss/reg": 0.0, "step": 8760 }, { "epoch": 0.05769736842105263, "grad_norm": 2.34375, "grad_norm_var": 0.98638916015625, "learning_rate": 0.0001, "loss": 3.2657, "loss/crossentropy": 2.2515121579170225, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2510280326008797, "loss/reg": 0.0, "step": 8770 }, { "epoch": 0.05776315789473684, "grad_norm": 2.90625, "grad_norm_var": 1.39010009765625, "learning_rate": 0.0001, "loss": 3.3069, "loss/crossentropy": 2.45401873588562, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.3077335625886917, "loss/reg": 0.0, "step": 8780 }, { "epoch": 0.05782894736842105, "grad_norm": 2.671875, "grad_norm_var": 1.4149373372395833, "learning_rate": 0.0001, "loss": 3.3864, "loss/crossentropy": 2.3450352430343626, "loss/hidden": 3.421875, "loss/incoh": 0.0, "loss/logits": 0.3485978364944458, "loss/reg": 0.0, "step": 8790 }, { "epoch": 0.05789473684210526, "grad_norm": 2.453125, "grad_norm_var": 0.23061421712239583, "learning_rate": 0.0001, "loss": 3.261, "loss/crossentropy": 2.4229711413383486, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.3234298795461655, "loss/reg": 0.0, "step": 8800 }, { "epoch": 0.057960526315789476, "grad_norm": 2.703125, "grad_norm_var": 4.344071451822916, "learning_rate": 0.0001, "loss": 3.2778, "loss/crossentropy": 2.202963078022003, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2559796661138535, "loss/reg": 0.0, "step": 8810 }, { "epoch": 0.05802631578947368, "grad_norm": 2.640625, "grad_norm_var": 4.178641764322917, "learning_rate": 0.0001, "loss": 3.3628, "loss/crossentropy": 2.2936235070228577, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.32534674406051634, "loss/reg": 0.0, "step": 8820 }, { "epoch": 0.058092105263157896, "grad_norm": 2.046875, "grad_norm_var": 0.17635091145833334, "learning_rate": 0.0001, "loss": 3.3078, "loss/crossentropy": 2.535597097873688, "loss/hidden": 3.1109375, "loss/incoh": 0.0, "loss/logits": 0.30167998671531676, "loss/reg": 0.0, "step": 8830 }, { "epoch": 0.0581578947368421, "grad_norm": 2.828125, "grad_norm_var": 0.10144856770833334, "learning_rate": 0.0001, "loss": 3.3119, "loss/crossentropy": 2.652754557132721, "loss/hidden": 3.309375, "loss/incoh": 0.0, "loss/logits": 0.37436943501234055, "loss/reg": 0.0, "step": 8840 }, { "epoch": 0.058223684210526316, "grad_norm": 2.15625, "grad_norm_var": 0.79921875, "learning_rate": 0.0001, "loss": 3.3639, "loss/crossentropy": 2.4041113376617433, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.331952853500843, "loss/reg": 0.0, "step": 8850 }, { "epoch": 0.05828947368421052, "grad_norm": 2.671875, "grad_norm_var": 0.8512603759765625, "learning_rate": 0.0001, "loss": 3.2301, "loss/crossentropy": 2.7010722875595095, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.2689524292945862, "loss/reg": 0.0, "step": 8860 }, { "epoch": 0.058355263157894736, "grad_norm": 2.484375, "grad_norm_var": 0.22805887858072918, "learning_rate": 0.0001, "loss": 3.301, "loss/crossentropy": 2.3007858633995055, "loss/hidden": 3.2953125, "loss/incoh": 0.0, "loss/logits": 0.38490410447120665, "loss/reg": 0.0, "step": 8870 }, { "epoch": 0.05842105263157895, "grad_norm": 4.375, "grad_norm_var": 0.2703928629557292, "learning_rate": 0.0001, "loss": 3.3077, "loss/crossentropy": 2.562314450740814, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.29355555921792986, "loss/reg": 0.0, "step": 8880 }, { "epoch": 0.058486842105263157, "grad_norm": 4.46875, "grad_norm_var": 0.5262522379557292, "learning_rate": 0.0001, "loss": 3.2847, "loss/crossentropy": 2.329223835468292, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.2784240961074829, "loss/reg": 0.0, "step": 8890 }, { "epoch": 0.05855263157894737, "grad_norm": 2.078125, "grad_norm_var": 0.32392476399739584, "learning_rate": 0.0001, "loss": 3.2975, "loss/crossentropy": 2.371010947227478, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.3012363612651825, "loss/reg": 0.0, "step": 8900 }, { "epoch": 0.05861842105263158, "grad_norm": 2.90625, "grad_norm_var": 0.05488993326822917, "learning_rate": 0.0001, "loss": 3.3558, "loss/crossentropy": 2.4505101799964906, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.28698593378067017, "loss/reg": 0.0, "step": 8910 }, { "epoch": 0.05868421052631579, "grad_norm": 2.890625, "grad_norm_var": 1.541039021809896, "learning_rate": 0.0001, "loss": 3.4362, "loss/crossentropy": 2.569356393814087, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.3198714107275009, "loss/reg": 0.0, "step": 8920 }, { "epoch": 0.05875, "grad_norm": 2.34375, "grad_norm_var": 0.24500325520833333, "learning_rate": 0.0001, "loss": 3.3947, "loss/crossentropy": 2.106434017419815, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.2644048437476158, "loss/reg": 0.0, "step": 8930 }, { "epoch": 0.05881578947368421, "grad_norm": 2.03125, "grad_norm_var": 0.21669514973958334, "learning_rate": 0.0001, "loss": 3.2788, "loss/crossentropy": 2.517351245880127, "loss/hidden": 2.9921875, "loss/incoh": 0.0, "loss/logits": 0.27984755039215087, "loss/reg": 0.0, "step": 8940 }, { "epoch": 0.058881578947368424, "grad_norm": 2.328125, "grad_norm_var": 0.06357421875, "learning_rate": 0.0001, "loss": 3.3106, "loss/crossentropy": 2.3258296728134153, "loss/hidden": 3.165625, "loss/incoh": 0.0, "loss/logits": 0.32734392732381823, "loss/reg": 0.0, "step": 8950 }, { "epoch": 0.05894736842105263, "grad_norm": 2.53125, "grad_norm_var": 0.7411610921223958, "learning_rate": 0.0001, "loss": 3.4426, "loss/crossentropy": 2.4165605783462523, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.3073220491409302, "loss/reg": 0.0, "step": 8960 }, { "epoch": 0.059013157894736844, "grad_norm": 2.21875, "grad_norm_var": 0.44649149576822916, "learning_rate": 0.0001, "loss": 3.2202, "loss/crossentropy": 2.4866004467010496, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.30207364708185197, "loss/reg": 0.0, "step": 8970 }, { "epoch": 0.05907894736842105, "grad_norm": 2.921875, "grad_norm_var": 0.0566070556640625, "learning_rate": 0.0001, "loss": 3.2957, "loss/crossentropy": 2.4917370676994324, "loss/hidden": 3.078125, "loss/incoh": 0.0, "loss/logits": 0.3163344025611877, "loss/reg": 0.0, "step": 8980 }, { "epoch": 0.059144736842105264, "grad_norm": 2.4375, "grad_norm_var": 0.0467193603515625, "learning_rate": 0.0001, "loss": 3.337, "loss/crossentropy": 2.529834246635437, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.2896230161190033, "loss/reg": 0.0, "step": 8990 }, { "epoch": 0.05921052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.024637858072916668, "learning_rate": 0.0001, "loss": 3.2664, "loss/crossentropy": 2.548252558708191, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.3721586674451828, "loss/reg": 0.0, "step": 9000 }, { "epoch": 0.059276315789473684, "grad_norm": 2.375, "grad_norm_var": 0.010172526041666666, "learning_rate": 0.0001, "loss": 3.3136, "loss/crossentropy": 2.628942942619324, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.29181756228208544, "loss/reg": 0.0, "step": 9010 }, { "epoch": 0.0593421052631579, "grad_norm": 2.484375, "grad_norm_var": 0.039937337239583336, "learning_rate": 0.0001, "loss": 3.3276, "loss/crossentropy": 2.35152667760849, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.2875461965799332, "loss/reg": 0.0, "step": 9020 }, { "epoch": 0.059407894736842104, "grad_norm": 2.625, "grad_norm_var": 0.0728668212890625, "learning_rate": 0.0001, "loss": 3.3454, "loss/crossentropy": 2.3906983017921446, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2821692392230034, "loss/reg": 0.0, "step": 9030 }, { "epoch": 0.05947368421052632, "grad_norm": 2.5625, "grad_norm_var": 0.0730133056640625, "learning_rate": 0.0001, "loss": 3.3058, "loss/crossentropy": 2.4100876331329344, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2886385262012482, "loss/reg": 0.0, "step": 9040 }, { "epoch": 0.059539473684210524, "grad_norm": 2.5, "grad_norm_var": 0.4940592447916667, "learning_rate": 0.0001, "loss": 3.4398, "loss/crossentropy": 2.464267885684967, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.31053231209516524, "loss/reg": 0.0, "step": 9050 }, { "epoch": 0.05960526315789474, "grad_norm": 2.015625, "grad_norm_var": 0.08632405598958333, "learning_rate": 0.0001, "loss": 3.3778, "loss/crossentropy": 2.402372860908508, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.35591588020324705, "loss/reg": 0.0, "step": 9060 }, { "epoch": 0.059671052631578944, "grad_norm": 3.296875, "grad_norm_var": 0.12502848307291667, "learning_rate": 0.0001, "loss": 3.3312, "loss/crossentropy": 2.3369374930858613, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.2521222412586212, "loss/reg": 0.0, "step": 9070 }, { "epoch": 0.05973684210526316, "grad_norm": 3.15625, "grad_norm_var": 0.11041259765625, "learning_rate": 0.0001, "loss": 3.2977, "loss/crossentropy": 2.2894081354141234, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.37748522460460665, "loss/reg": 0.0, "step": 9080 }, { "epoch": 0.05980263157894737, "grad_norm": 2.671875, "grad_norm_var": 2.4495359778774493e+17, "learning_rate": 0.0001, "loss": 3.4555, "loss/crossentropy": 2.248319935798645, "loss/hidden": 3.0234375, "loss/incoh": 0.0, "loss/logits": 0.27714093402028084, "loss/reg": 0.0, "step": 9090 }, { "epoch": 0.05986842105263158, "grad_norm": 2.609375, "grad_norm_var": 2.449535978075936e+17, "learning_rate": 0.0001, "loss": 3.2988, "loss/crossentropy": 2.304659366607666, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.2838084354996681, "loss/reg": 0.0, "step": 9100 }, { "epoch": 0.05993421052631579, "grad_norm": 2.796875, "grad_norm_var": 0.5627838134765625, "learning_rate": 0.0001, "loss": 3.2864, "loss/crossentropy": 2.5072904348373415, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.30276854485273363, "loss/reg": 0.0, "step": 9110 }, { "epoch": 0.06, "grad_norm": 2.328125, "grad_norm_var": 0.5690582275390625, "learning_rate": 0.0001, "loss": 3.2102, "loss/crossentropy": 2.2461979389190674, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.2540374085307121, "loss/reg": 0.0, "step": 9120 }, { "epoch": 0.06006578947368421, "grad_norm": 2.21875, "grad_norm_var": 0.11433817545572916, "learning_rate": 0.0001, "loss": 3.2304, "loss/crossentropy": 2.3598265290260314, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.28421255201101303, "loss/reg": 0.0, "step": 9130 }, { "epoch": 0.06013157894736842, "grad_norm": 2.578125, "grad_norm_var": 0.05432535807291667, "learning_rate": 0.0001, "loss": 3.2493, "loss/crossentropy": 2.3720561623573304, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.31470999121665955, "loss/reg": 0.0, "step": 9140 }, { "epoch": 0.06019736842105263, "grad_norm": 2.453125, "grad_norm_var": 0.08810933430989583, "learning_rate": 0.0001, "loss": 3.3173, "loss/crossentropy": 2.2540945589542387, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.24728769958019256, "loss/reg": 0.0, "step": 9150 }, { "epoch": 0.060263157894736845, "grad_norm": 3.109375, "grad_norm_var": 0.07870992024739583, "learning_rate": 0.0001, "loss": 3.295, "loss/crossentropy": 2.2703887224197388, "loss/hidden": 3.20625, "loss/incoh": 0.0, "loss/logits": 0.27733070850372316, "loss/reg": 0.0, "step": 9160 }, { "epoch": 0.06032894736842105, "grad_norm": 2.203125, "grad_norm_var": 0.20073954264322916, "learning_rate": 0.0001, "loss": 3.2337, "loss/crossentropy": 2.5513323664665224, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.27180006355047226, "loss/reg": 0.0, "step": 9170 }, { "epoch": 0.060394736842105265, "grad_norm": 2.234375, "grad_norm_var": 0.6225545247395833, "learning_rate": 0.0001, "loss": 3.4001, "loss/crossentropy": 2.1638057589530946, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2586470812559128, "loss/reg": 0.0, "step": 9180 }, { "epoch": 0.06046052631578947, "grad_norm": 2.140625, "grad_norm_var": 0.37139867146809896, "learning_rate": 0.0001, "loss": 3.2305, "loss/crossentropy": 2.4451101064682006, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.3055331766605377, "loss/reg": 0.0, "step": 9190 }, { "epoch": 0.060526315789473685, "grad_norm": 2.171875, "grad_norm_var": 0.32624282836914065, "learning_rate": 0.0001, "loss": 3.2254, "loss/crossentropy": 2.561469316482544, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.2592929035425186, "loss/reg": 0.0, "step": 9200 }, { "epoch": 0.06059210526315789, "grad_norm": 2.90625, "grad_norm_var": 1.0789459228515625, "learning_rate": 0.0001, "loss": 3.3551, "loss/crossentropy": 2.1200980842113495, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.2634043380618095, "loss/reg": 0.0, "step": 9210 }, { "epoch": 0.060657894736842105, "grad_norm": 2.0, "grad_norm_var": 0.54371337890625, "learning_rate": 0.0001, "loss": 3.3742, "loss/crossentropy": 2.4880159854888917, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.29063448309898376, "loss/reg": 0.0, "step": 9220 }, { "epoch": 0.06072368421052632, "grad_norm": 2.953125, "grad_norm_var": 0.24761962890625, "learning_rate": 0.0001, "loss": 3.3781, "loss/crossentropy": 2.558793306350708, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.3282888367772102, "loss/reg": 0.0, "step": 9230 }, { "epoch": 0.060789473684210525, "grad_norm": 4.75, "grad_norm_var": 0.35944010416666666, "learning_rate": 0.0001, "loss": 3.2908, "loss/crossentropy": 2.3837397813797, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.29975869655609133, "loss/reg": 0.0, "step": 9240 }, { "epoch": 0.06085526315789474, "grad_norm": 2.828125, "grad_norm_var": 0.41629130045572915, "learning_rate": 0.0001, "loss": 3.3291, "loss/crossentropy": 2.395397412776947, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.3036083221435547, "loss/reg": 0.0, "step": 9250 }, { "epoch": 0.060921052631578945, "grad_norm": 2.5, "grad_norm_var": 0.09286702473958333, "learning_rate": 0.0001, "loss": 3.2991, "loss/crossentropy": 2.350081342458725, "loss/hidden": 3.234375, "loss/incoh": 0.0, "loss/logits": 0.31170106381177903, "loss/reg": 0.0, "step": 9260 }, { "epoch": 0.06098684210526316, "grad_norm": 3.125, "grad_norm_var": 0.09381510416666666, "learning_rate": 0.0001, "loss": 3.2234, "loss/crossentropy": 2.023473250865936, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2770851716399193, "loss/reg": 0.0, "step": 9270 }, { "epoch": 0.061052631578947365, "grad_norm": 2.265625, "grad_norm_var": 0.09631245930989583, "learning_rate": 0.0001, "loss": 3.2279, "loss/crossentropy": 2.362475335597992, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.24814699292182923, "loss/reg": 0.0, "step": 9280 }, { "epoch": 0.06111842105263158, "grad_norm": 3.1875, "grad_norm_var": 0.49576416015625, "learning_rate": 0.0001, "loss": 3.4035, "loss/crossentropy": 2.5755128145217894, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.35746499747037885, "loss/reg": 0.0, "step": 9290 }, { "epoch": 0.06118421052631579, "grad_norm": 2.953125, "grad_norm_var": 0.09010009765625, "learning_rate": 0.0001, "loss": 3.2592, "loss/crossentropy": 2.2949343085289002, "loss/hidden": 3.08125, "loss/incoh": 0.0, "loss/logits": 0.2996257975697517, "loss/reg": 0.0, "step": 9300 }, { "epoch": 0.06125, "grad_norm": 2.421875, "grad_norm_var": 0.3297190348307292, "learning_rate": 0.0001, "loss": 3.3329, "loss/crossentropy": 2.4506813049316407, "loss/hidden": 3.09375, "loss/incoh": 0.0, "loss/logits": 0.3265557274222374, "loss/reg": 0.0, "step": 9310 }, { "epoch": 0.06131578947368421, "grad_norm": 2.453125, "grad_norm_var": 0.3309529622395833, "learning_rate": 0.0001, "loss": 3.2163, "loss/crossentropy": 2.3078281760215758, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.2651080995798111, "loss/reg": 0.0, "step": 9320 }, { "epoch": 0.06138157894736842, "grad_norm": 2.28125, "grad_norm_var": 0.26513671875, "learning_rate": 0.0001, "loss": 3.2706, "loss/crossentropy": 2.1937548160552978, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.2700933560729027, "loss/reg": 0.0, "step": 9330 }, { "epoch": 0.06144736842105263, "grad_norm": 2.078125, "grad_norm_var": 0.31164957682291666, "learning_rate": 0.0001, "loss": 3.4252, "loss/crossentropy": 2.112582105398178, "loss/hidden": 3.0984375, "loss/incoh": 0.0, "loss/logits": 0.26334969997406005, "loss/reg": 0.0, "step": 9340 }, { "epoch": 0.06151315789473684, "grad_norm": 2.328125, "grad_norm_var": 0.33046875, "learning_rate": 0.0001, "loss": 3.3277, "loss/crossentropy": 2.4319912672042845, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.2982776537537575, "loss/reg": 0.0, "step": 9350 }, { "epoch": 0.06157894736842105, "grad_norm": 2.171875, "grad_norm_var": 0.48318684895833336, "learning_rate": 0.0001, "loss": 3.28, "loss/crossentropy": 2.3863817691802978, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.2783980667591095, "loss/reg": 0.0, "step": 9360 }, { "epoch": 0.061644736842105266, "grad_norm": 2.59375, "grad_norm_var": 0.4833730061848958, "learning_rate": 0.0001, "loss": 3.329, "loss/crossentropy": 2.5484490633010863, "loss/hidden": 3.03125, "loss/incoh": 0.0, "loss/logits": 0.3031032904982567, "loss/reg": 0.0, "step": 9370 }, { "epoch": 0.06171052631578947, "grad_norm": 2.203125, "grad_norm_var": 0.08167317708333334, "learning_rate": 0.0001, "loss": 3.2587, "loss/crossentropy": 2.2674924612045286, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.3650638833642006, "loss/reg": 0.0, "step": 9380 }, { "epoch": 0.061776315789473686, "grad_norm": 2.5625, "grad_norm_var": 0.22155659993489582, "learning_rate": 0.0001, "loss": 3.3241, "loss/crossentropy": 2.3348896741867065, "loss/hidden": 3.1203125, "loss/incoh": 0.0, "loss/logits": 0.283182792365551, "loss/reg": 0.0, "step": 9390 }, { "epoch": 0.06184210526315789, "grad_norm": 3.15625, "grad_norm_var": 1.1829427083333333, "learning_rate": 0.0001, "loss": 3.3629, "loss/crossentropy": 2.464947986602783, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.28233895897865297, "loss/reg": 0.0, "step": 9400 }, { "epoch": 0.061907894736842106, "grad_norm": 2.296875, "grad_norm_var": 1.1680623372395833, "learning_rate": 0.0001, "loss": 3.3087, "loss/crossentropy": 2.179221343994141, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.3310952290892601, "loss/reg": 0.0, "step": 9410 }, { "epoch": 0.06197368421052631, "grad_norm": 2.21875, "grad_norm_var": 2.0582183837890624, "learning_rate": 0.0001, "loss": 3.4293, "loss/crossentropy": 2.413185155391693, "loss/hidden": 3.18125, "loss/incoh": 0.0, "loss/logits": 0.3003757044672966, "loss/reg": 0.0, "step": 9420 }, { "epoch": 0.062039473684210526, "grad_norm": 2.421875, "grad_norm_var": 0.3502349853515625, "learning_rate": 0.0001, "loss": 3.1251, "loss/crossentropy": 2.3518580555915833, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2455562546849251, "loss/reg": 0.0, "step": 9430 }, { "epoch": 0.06210526315789474, "grad_norm": 2.578125, "grad_norm_var": 0.028425089518229165, "learning_rate": 0.0001, "loss": 3.3233, "loss/crossentropy": 2.375118088722229, "loss/hidden": 3.153125, "loss/incoh": 0.0, "loss/logits": 0.31473297625780106, "loss/reg": 0.0, "step": 9440 }, { "epoch": 0.062171052631578946, "grad_norm": 2.46875, "grad_norm_var": 0.0333984375, "learning_rate": 0.0001, "loss": 3.2654, "loss/crossentropy": 2.3164775133132935, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.2699767738580704, "loss/reg": 0.0, "step": 9450 }, { "epoch": 0.06223684210526316, "grad_norm": 2.46875, "grad_norm_var": 0.0964263916015625, "learning_rate": 0.0001, "loss": 3.3552, "loss/crossentropy": 2.233529049158096, "loss/hidden": 3.21875, "loss/incoh": 0.0, "loss/logits": 0.3262931898236275, "loss/reg": 0.0, "step": 9460 }, { "epoch": 0.062302631578947366, "grad_norm": 2.1875, "grad_norm_var": 24.440249633789062, "learning_rate": 0.0001, "loss": 3.2359, "loss/crossentropy": 2.3248747825622558, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.24342142790555954, "loss/reg": 0.0, "step": 9470 }, { "epoch": 0.06236842105263158, "grad_norm": 2.5625, "grad_norm_var": 0.06562398274739584, "learning_rate": 0.0001, "loss": 3.3237, "loss/crossentropy": 2.588094711303711, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.3172804355621338, "loss/reg": 0.0, "step": 9480 }, { "epoch": 0.062434210526315786, "grad_norm": 2.375, "grad_norm_var": 0.21744384765625, "learning_rate": 0.0001, "loss": 3.3475, "loss/crossentropy": 2.390676462650299, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.27351657301187515, "loss/reg": 0.0, "step": 9490 }, { "epoch": 0.0625, "grad_norm": 2.703125, "grad_norm_var": 0.0399078369140625, "learning_rate": 0.0001, "loss": 3.2819, "loss/crossentropy": 2.3253311276435853, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.265305657684803, "loss/reg": 0.0, "step": 9500 }, { "epoch": 0.06256578947368421, "grad_norm": 2.25, "grad_norm_var": 0.08692118326822916, "learning_rate": 0.0001, "loss": 3.2535, "loss/crossentropy": 2.3912763714790346, "loss/hidden": 2.9734375, "loss/incoh": 0.0, "loss/logits": 0.2748603358864784, "loss/reg": 0.0, "step": 9510 }, { "epoch": 0.06263157894736843, "grad_norm": 2.265625, "grad_norm_var": 0.11210098266601562, "learning_rate": 0.0001, "loss": 3.2609, "loss/crossentropy": 2.3854790568351745, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.2736155718564987, "loss/reg": 0.0, "step": 9520 }, { "epoch": 0.06269736842105263, "grad_norm": 2.9375, "grad_norm_var": 0.13178888956705728, "learning_rate": 0.0001, "loss": 3.2434, "loss/crossentropy": 2.4524547696113586, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.256032706797123, "loss/reg": 0.0, "step": 9530 }, { "epoch": 0.06276315789473684, "grad_norm": 2.234375, "grad_norm_var": 0.08687744140625, "learning_rate": 0.0001, "loss": 3.3256, "loss/crossentropy": 2.563627076148987, "loss/hidden": 3.3046875, "loss/incoh": 0.0, "loss/logits": 0.3692674309015274, "loss/reg": 0.0, "step": 9540 }, { "epoch": 0.06282894736842105, "grad_norm": 2.4375, "grad_norm_var": 0.13664957682291667, "learning_rate": 0.0001, "loss": 3.3197, "loss/crossentropy": 2.3016056180000306, "loss/hidden": 3.025, "loss/incoh": 0.0, "loss/logits": 0.2520491242408752, "loss/reg": 0.0, "step": 9550 }, { "epoch": 0.06289473684210527, "grad_norm": 2.828125, "grad_norm_var": 0.4147857666015625, "learning_rate": 0.0001, "loss": 3.511, "loss/crossentropy": 2.1577144265174866, "loss/hidden": 3.1046875, "loss/incoh": 0.0, "loss/logits": 0.2929541230201721, "loss/reg": 0.0, "step": 9560 }, { "epoch": 0.06296052631578947, "grad_norm": 2.34375, "grad_norm_var": 0.18341471354166666, "learning_rate": 0.0001, "loss": 3.2549, "loss/crossentropy": 2.31901068687439, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.27463280111551286, "loss/reg": 0.0, "step": 9570 }, { "epoch": 0.06302631578947368, "grad_norm": 2.53125, "grad_norm_var": 0.4622884114583333, "learning_rate": 0.0001, "loss": 3.3477, "loss/crossentropy": 2.6131432056427, "loss/hidden": 3.1140625, "loss/incoh": 0.0, "loss/logits": 0.38214774429798126, "loss/reg": 0.0, "step": 9580 }, { "epoch": 0.0630921052631579, "grad_norm": 2.484375, "grad_norm_var": 0.13547337849934896, "learning_rate": 0.0001, "loss": 3.2885, "loss/crossentropy": 2.451016199588776, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.2813758164644241, "loss/reg": 0.0, "step": 9590 }, { "epoch": 0.06315789473684211, "grad_norm": 2.1875, "grad_norm_var": 0.2512003580729167, "learning_rate": 0.0001, "loss": 3.3345, "loss/crossentropy": 2.1964026927947997, "loss/hidden": 3.2765625, "loss/incoh": 0.0, "loss/logits": 0.31094489246606827, "loss/reg": 0.0, "step": 9600 }, { "epoch": 0.06322368421052632, "grad_norm": 2.25, "grad_norm_var": 0.28735249837239585, "learning_rate": 0.0001, "loss": 3.2956, "loss/crossentropy": 2.3820174098014832, "loss/hidden": 3.1375, "loss/incoh": 0.0, "loss/logits": 0.30036603659391403, "loss/reg": 0.0, "step": 9610 }, { "epoch": 0.06328947368421052, "grad_norm": 2.609375, "grad_norm_var": 0.9716102600097656, "learning_rate": 0.0001, "loss": 3.2363, "loss/crossentropy": 2.4219281673431396, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2639324277639389, "loss/reg": 0.0, "step": 9620 }, { "epoch": 0.06335526315789473, "grad_norm": 2.8125, "grad_norm_var": 0.8159016927083333, "learning_rate": 0.0001, "loss": 3.331, "loss/crossentropy": 1.9574783891439438, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.2543580986559391, "loss/reg": 0.0, "step": 9630 }, { "epoch": 0.06342105263157895, "grad_norm": 2.6875, "grad_norm_var": 0.06944961547851562, "learning_rate": 0.0001, "loss": 3.2008, "loss/crossentropy": 2.210974097251892, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.2627203479409218, "loss/reg": 0.0, "step": 9640 }, { "epoch": 0.06348684210526316, "grad_norm": 2.9375, "grad_norm_var": 0.2388336181640625, "learning_rate": 0.0001, "loss": 3.3255, "loss/crossentropy": 2.225372338294983, "loss/hidden": 3.0984375, "loss/incoh": 0.0, "loss/logits": 0.28057117611169813, "loss/reg": 0.0, "step": 9650 }, { "epoch": 0.06355263157894737, "grad_norm": 2.0, "grad_norm_var": 0.3254954020182292, "learning_rate": 0.0001, "loss": 3.3021, "loss/crossentropy": 2.190217161178589, "loss/hidden": 3.1734375, "loss/incoh": 0.0, "loss/logits": 0.31579277813434603, "loss/reg": 0.0, "step": 9660 }, { "epoch": 0.06361842105263157, "grad_norm": 2.078125, "grad_norm_var": 0.5466105143229166, "learning_rate": 0.0001, "loss": 3.3119, "loss/crossentropy": 2.427161252498627, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.307109659910202, "loss/reg": 0.0, "step": 9670 }, { "epoch": 0.06368421052631579, "grad_norm": 2.171875, "grad_norm_var": 0.48213602701822916, "learning_rate": 0.0001, "loss": 3.2915, "loss/crossentropy": 2.2590057969093325, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.2674726366996765, "loss/reg": 0.0, "step": 9680 }, { "epoch": 0.06375, "grad_norm": 2.53125, "grad_norm_var": 0.2637858072916667, "learning_rate": 0.0001, "loss": 3.2742, "loss/crossentropy": 2.3426333904266357, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2729641154408455, "loss/reg": 0.0, "step": 9690 }, { "epoch": 0.06381578947368421, "grad_norm": 2.453125, "grad_norm_var": 0.3402252197265625, "learning_rate": 0.0001, "loss": 3.2983, "loss/crossentropy": 2.3633025169372557, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.2770617350935936, "loss/reg": 0.0, "step": 9700 }, { "epoch": 0.06388157894736841, "grad_norm": 3.25, "grad_norm_var": 0.23658447265625, "learning_rate": 0.0001, "loss": 3.2621, "loss/crossentropy": 2.363292157649994, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.2777522191405296, "loss/reg": 0.0, "step": 9710 }, { "epoch": 0.06394736842105263, "grad_norm": 3.5625, "grad_norm_var": 0.45250244140625, "learning_rate": 0.0001, "loss": 3.3017, "loss/crossentropy": 2.4210567593574526, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2831977978348732, "loss/reg": 0.0, "step": 9720 }, { "epoch": 0.06401315789473684, "grad_norm": 2.453125, "grad_norm_var": 0.2709147135416667, "learning_rate": 0.0001, "loss": 3.234, "loss/crossentropy": 2.3109630227088926, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.31227574199438096, "loss/reg": 0.0, "step": 9730 }, { "epoch": 0.06407894736842105, "grad_norm": 2.15625, "grad_norm_var": 0.38984273274739584, "learning_rate": 0.0001, "loss": 3.2414, "loss/crossentropy": 2.225367599725723, "loss/hidden": 3.334375, "loss/incoh": 0.0, "loss/logits": 0.24756639897823335, "loss/reg": 0.0, "step": 9740 }, { "epoch": 0.06414473684210527, "grad_norm": 3.65625, "grad_norm_var": 0.40685933430989585, "learning_rate": 0.0001, "loss": 3.2959, "loss/crossentropy": 2.633290505409241, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.27978702783584597, "loss/reg": 0.0, "step": 9750 }, { "epoch": 0.06421052631578947, "grad_norm": 2.3125, "grad_norm_var": 1.353466796875, "learning_rate": 0.0001, "loss": 3.2732, "loss/crossentropy": 2.377350616455078, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.27294492572546003, "loss/reg": 0.0, "step": 9760 }, { "epoch": 0.06427631578947368, "grad_norm": 2.375, "grad_norm_var": 1.3017242431640625, "learning_rate": 0.0001, "loss": 3.3766, "loss/crossentropy": 2.230624866485596, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.2353967770934105, "loss/reg": 0.0, "step": 9770 }, { "epoch": 0.0643421052631579, "grad_norm": 3.046875, "grad_norm_var": 30.074632771809895, "learning_rate": 0.0001, "loss": 3.2769, "loss/crossentropy": 2.250266909599304, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.2896205812692642, "loss/reg": 0.0, "step": 9780 }, { "epoch": 0.06440789473684211, "grad_norm": 2.578125, "grad_norm_var": 30.214286295572915, "learning_rate": 0.0001, "loss": 3.2666, "loss/crossentropy": 2.495633435249329, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2458545297384262, "loss/reg": 0.0, "step": 9790 }, { "epoch": 0.06447368421052632, "grad_norm": 1.96875, "grad_norm_var": 0.09082743326822916, "learning_rate": 0.0001, "loss": 3.2638, "loss/crossentropy": 2.140651452541351, "loss/hidden": 3.0890625, "loss/incoh": 0.0, "loss/logits": 0.27739599645137786, "loss/reg": 0.0, "step": 9800 }, { "epoch": 0.06453947368421052, "grad_norm": 2.765625, "grad_norm_var": 2.439875284830729, "learning_rate": 0.0001, "loss": 3.2928, "loss/crossentropy": 2.273459422588348, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.26713447719812394, "loss/reg": 0.0, "step": 9810 }, { "epoch": 0.06460526315789474, "grad_norm": 2.640625, "grad_norm_var": 2.7100901285807293, "learning_rate": 0.0001, "loss": 3.2839, "loss/crossentropy": 2.4476850271224975, "loss/hidden": 3.3984375, "loss/incoh": 0.0, "loss/logits": 0.35955790579319, "loss/reg": 0.0, "step": 9820 }, { "epoch": 0.06467105263157895, "grad_norm": 2.5, "grad_norm_var": 0.46149800618489584, "learning_rate": 0.0001, "loss": 3.306, "loss/crossentropy": 2.3544110536575316, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.31140194088220596, "loss/reg": 0.0, "step": 9830 }, { "epoch": 0.06473684210526316, "grad_norm": 2.4375, "grad_norm_var": 0.15746968587239582, "learning_rate": 0.0001, "loss": 3.2209, "loss/crossentropy": 2.2321391999721527, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2574504017829895, "loss/reg": 0.0, "step": 9840 }, { "epoch": 0.06480263157894736, "grad_norm": 2.171875, "grad_norm_var": 0.10312093098958333, "learning_rate": 0.0001, "loss": 3.2513, "loss/crossentropy": 2.311957097053528, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.2625372394919395, "loss/reg": 0.0, "step": 9850 }, { "epoch": 0.06486842105263158, "grad_norm": 2.421875, "grad_norm_var": 0.0565826416015625, "learning_rate": 0.0001, "loss": 3.3056, "loss/crossentropy": 2.331065666675568, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.277868440747261, "loss/reg": 0.0, "step": 9860 }, { "epoch": 0.06493421052631579, "grad_norm": 2.46875, "grad_norm_var": 0.18406575520833332, "learning_rate": 0.0001, "loss": 3.3417, "loss/crossentropy": 2.314790654182434, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.33730033934116366, "loss/reg": 0.0, "step": 9870 }, { "epoch": 0.065, "grad_norm": 2.359375, "grad_norm_var": 0.08243815104166667, "learning_rate": 0.0001, "loss": 3.3522, "loss/crossentropy": 2.489400041103363, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.28822820335626603, "loss/reg": 0.0, "step": 9880 }, { "epoch": 0.06506578947368422, "grad_norm": 2.171875, "grad_norm_var": 0.0967437744140625, "learning_rate": 0.0001, "loss": 3.2368, "loss/crossentropy": 2.3749926924705504, "loss/hidden": 3.175, "loss/incoh": 0.0, "loss/logits": 0.3226087599992752, "loss/reg": 0.0, "step": 9890 }, { "epoch": 0.06513157894736842, "grad_norm": 2.265625, "grad_norm_var": 0.11569722493489583, "learning_rate": 0.0001, "loss": 3.2881, "loss/crossentropy": 2.190938687324524, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.30352693498134614, "loss/reg": 0.0, "step": 9900 }, { "epoch": 0.06519736842105263, "grad_norm": 2.28125, "grad_norm_var": 0.09851888020833334, "learning_rate": 0.0001, "loss": 3.2849, "loss/crossentropy": 2.412072277069092, "loss/hidden": 3.0234375, "loss/incoh": 0.0, "loss/logits": 0.32761459052562714, "loss/reg": 0.0, "step": 9910 }, { "epoch": 0.06526315789473684, "grad_norm": 2.34375, "grad_norm_var": 0.49250895182291665, "learning_rate": 0.0001, "loss": 3.2984, "loss/crossentropy": 2.2049274504184724, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.2710462361574173, "loss/reg": 0.0, "step": 9920 }, { "epoch": 0.06532894736842106, "grad_norm": 2.296875, "grad_norm_var": 0.2756581624348958, "learning_rate": 0.0001, "loss": 3.3729, "loss/crossentropy": 2.0697293996810915, "loss/hidden": 3.3359375, "loss/incoh": 0.0, "loss/logits": 0.32963491380214693, "loss/reg": 0.0, "step": 9930 }, { "epoch": 0.06539473684210527, "grad_norm": 2.546875, "grad_norm_var": 0.18677978515625, "learning_rate": 0.0001, "loss": 3.2972, "loss/crossentropy": 2.3622434020042418, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.31804386228322984, "loss/reg": 0.0, "step": 9940 }, { "epoch": 0.06546052631578947, "grad_norm": 2.71875, "grad_norm_var": 0.07353413899739583, "learning_rate": 0.0001, "loss": 3.3347, "loss/crossentropy": 2.4146655321121218, "loss/hidden": 3.309375, "loss/incoh": 0.0, "loss/logits": 0.36195366978645327, "loss/reg": 0.0, "step": 9950 }, { "epoch": 0.06552631578947368, "grad_norm": 2.65625, "grad_norm_var": 0.031086222330729166, "learning_rate": 0.0001, "loss": 3.3045, "loss/crossentropy": 2.499011588096619, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.27686055898666384, "loss/reg": 0.0, "step": 9960 }, { "epoch": 0.0655921052631579, "grad_norm": 2.359375, "grad_norm_var": 0.36774800618489584, "learning_rate": 0.0001, "loss": 3.3505, "loss/crossentropy": 2.2883435606956484, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.43403479307889936, "loss/reg": 0.0, "step": 9970 }, { "epoch": 0.06565789473684211, "grad_norm": 2.96875, "grad_norm_var": 0.39895426432291664, "learning_rate": 0.0001, "loss": 3.3004, "loss/crossentropy": 2.328636658191681, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.30042982697486875, "loss/reg": 0.0, "step": 9980 }, { "epoch": 0.06572368421052631, "grad_norm": 2.734375, "grad_norm_var": 0.070458984375, "learning_rate": 0.0001, "loss": 3.3201, "loss/crossentropy": 2.1099472880363463, "loss/hidden": 3.2375, "loss/incoh": 0.0, "loss/logits": 0.2899234861135483, "loss/reg": 0.0, "step": 9990 }, { "epoch": 0.06578947368421052, "grad_norm": 2.40625, "grad_norm_var": 0.06652018229166666, "learning_rate": 0.0001, "loss": 3.3435, "loss/crossentropy": 2.2847598433494567, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.3000261425971985, "loss/reg": 0.0, "step": 10000 }, { "epoch": 0.06585526315789474, "grad_norm": 2.421875, "grad_norm_var": 0.08883463541666667, "learning_rate": 0.0001, "loss": 3.2377, "loss/crossentropy": 2.4516909599304197, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.3408001005649567, "loss/reg": 0.0, "step": 10010 }, { "epoch": 0.06592105263157895, "grad_norm": 2.40625, "grad_norm_var": 4.412495930989583, "learning_rate": 0.0001, "loss": 3.417, "loss/crossentropy": 2.3393358111381533, "loss/hidden": 3.153125, "loss/incoh": 0.0, "loss/logits": 0.3251391679048538, "loss/reg": 0.0, "step": 10020 }, { "epoch": 0.06598684210526316, "grad_norm": 2.921875, "grad_norm_var": 4.318382771809896, "learning_rate": 0.0001, "loss": 3.318, "loss/crossentropy": 2.4476661682128906, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.25891698747873304, "loss/reg": 0.0, "step": 10030 }, { "epoch": 0.06605263157894736, "grad_norm": 2.265625, "grad_norm_var": 2.029248046875, "learning_rate": 0.0001, "loss": 3.3175, "loss/crossentropy": 2.5090669870376585, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.3005325973033905, "loss/reg": 0.0, "step": 10040 }, { "epoch": 0.06611842105263158, "grad_norm": 2.515625, "grad_norm_var": 18.96970926920573, "learning_rate": 0.0001, "loss": 3.3768, "loss/crossentropy": 2.4451366662979126, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2800415620207787, "loss/reg": 0.0, "step": 10050 }, { "epoch": 0.06618421052631579, "grad_norm": 2.9375, "grad_norm_var": 0.11100260416666667, "learning_rate": 0.0001, "loss": 3.3131, "loss/crossentropy": 2.3123559236526487, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.32703131139278413, "loss/reg": 0.0, "step": 10060 }, { "epoch": 0.06625, "grad_norm": 2.3125, "grad_norm_var": 0.055985514322916666, "learning_rate": 0.0001, "loss": 3.2837, "loss/crossentropy": 2.20616455078125, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.3035066843032837, "loss/reg": 0.0, "step": 10070 }, { "epoch": 0.06631578947368422, "grad_norm": 2.3125, "grad_norm_var": 0.025560506184895835, "learning_rate": 0.0001, "loss": 3.202, "loss/crossentropy": 2.3889974474906923, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.25791111290454866, "loss/reg": 0.0, "step": 10080 }, { "epoch": 0.06638157894736842, "grad_norm": 3.078125, "grad_norm_var": 0.12398681640625, "learning_rate": 0.0001, "loss": 3.2662, "loss/crossentropy": 2.0885241270065307, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.26190474927425383, "loss/reg": 0.0, "step": 10090 }, { "epoch": 0.06644736842105263, "grad_norm": 2.296875, "grad_norm_var": 0.10334879557291667, "learning_rate": 0.0001, "loss": 3.2587, "loss/crossentropy": 2.3084590315818785, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.292548905313015, "loss/reg": 0.0, "step": 10100 }, { "epoch": 0.06651315789473684, "grad_norm": 2.21875, "grad_norm_var": 0.11416727701822917, "learning_rate": 0.0001, "loss": 3.214, "loss/crossentropy": 2.519231605529785, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.2500781774520874, "loss/reg": 0.0, "step": 10110 }, { "epoch": 0.06657894736842106, "grad_norm": 2.609375, "grad_norm_var": 0.10711263020833334, "learning_rate": 0.0001, "loss": 3.3349, "loss/crossentropy": 2.3334843158721923, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.2946275919675827, "loss/reg": 0.0, "step": 10120 }, { "epoch": 0.06664473684210526, "grad_norm": 2.75, "grad_norm_var": 0.3732086181640625, "learning_rate": 0.0001, "loss": 3.3638, "loss/crossentropy": 2.4801114797592163, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.29529736936092377, "loss/reg": 0.0, "step": 10130 }, { "epoch": 0.06671052631578947, "grad_norm": 2.5625, "grad_norm_var": 0.4193511962890625, "learning_rate": 0.0001, "loss": 3.3195, "loss/crossentropy": 2.3818048357963564, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.299163943529129, "loss/reg": 0.0, "step": 10140 }, { "epoch": 0.06677631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.07637430826822916, "learning_rate": 0.0001, "loss": 3.2907, "loss/crossentropy": 2.3112216353416444, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.24870002567768096, "loss/reg": 0.0, "step": 10150 }, { "epoch": 0.0668421052631579, "grad_norm": 3.859375, "grad_norm_var": 0.16756083170572916, "learning_rate": 0.0001, "loss": 3.369, "loss/crossentropy": 2.1794182300567626, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.3170015588402748, "loss/reg": 0.0, "step": 10160 }, { "epoch": 0.06690789473684211, "grad_norm": 2.34375, "grad_norm_var": 0.17148030598958333, "learning_rate": 0.0001, "loss": 3.3343, "loss/crossentropy": 2.125279116630554, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.27327116429805753, "loss/reg": 0.0, "step": 10170 }, { "epoch": 0.06697368421052631, "grad_norm": 2.03125, "grad_norm_var": 1.8383626302083333, "learning_rate": 0.0001, "loss": 3.3548, "loss/crossentropy": 2.4289526462554933, "loss/hidden": 3.065625, "loss/incoh": 0.0, "loss/logits": 0.28756752908229827, "loss/reg": 0.0, "step": 10180 }, { "epoch": 0.06703947368421052, "grad_norm": 2.25, "grad_norm_var": 0.4429677327473958, "learning_rate": 0.0001, "loss": 3.3431, "loss/crossentropy": 2.297343075275421, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2415475845336914, "loss/reg": 0.0, "step": 10190 }, { "epoch": 0.06710526315789474, "grad_norm": 2.1875, "grad_norm_var": 0.53648681640625, "learning_rate": 0.0001, "loss": 3.2412, "loss/crossentropy": 2.291072869300842, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.3272585093975067, "loss/reg": 0.0, "step": 10200 }, { "epoch": 0.06717105263157895, "grad_norm": 2.53125, "grad_norm_var": 0.03841044108072917, "learning_rate": 0.0001, "loss": 3.2881, "loss/crossentropy": 2.3315325021743774, "loss/hidden": 3.1046875, "loss/incoh": 0.0, "loss/logits": 0.2916677713394165, "loss/reg": 0.0, "step": 10210 }, { "epoch": 0.06723684210526316, "grad_norm": 2.28125, "grad_norm_var": 0.05279541015625, "learning_rate": 0.0001, "loss": 3.2758, "loss/crossentropy": 2.496378016471863, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.2993095234036446, "loss/reg": 0.0, "step": 10220 }, { "epoch": 0.06730263157894736, "grad_norm": 2.390625, "grad_norm_var": 0.030101521809895834, "learning_rate": 0.0001, "loss": 3.184, "loss/crossentropy": 1.954445093870163, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.23346130400896073, "loss/reg": 0.0, "step": 10230 }, { "epoch": 0.06736842105263158, "grad_norm": 3.6875, "grad_norm_var": 0.2775675455729167, "learning_rate": 0.0001, "loss": 3.2837, "loss/crossentropy": 2.4431410312652586, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.3258019149303436, "loss/reg": 0.0, "step": 10240 }, { "epoch": 0.06743421052631579, "grad_norm": 2.125, "grad_norm_var": 0.33483784993489585, "learning_rate": 0.0001, "loss": 3.2903, "loss/crossentropy": 2.0363747119903564, "loss/hidden": 3.4796875, "loss/incoh": 0.0, "loss/logits": 0.3713301241397858, "loss/reg": 0.0, "step": 10250 }, { "epoch": 0.0675, "grad_norm": 2.4375, "grad_norm_var": 0.3346028645833333, "learning_rate": 0.0001, "loss": 3.2875, "loss/crossentropy": 2.1991047143936155, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.3233490988612175, "loss/reg": 0.0, "step": 10260 }, { "epoch": 0.0675657894736842, "grad_norm": 3.890625, "grad_norm_var": 0.1937652587890625, "learning_rate": 0.0001, "loss": 3.2766, "loss/crossentropy": 2.334232974052429, "loss/hidden": 3.15625, "loss/incoh": 0.0, "loss/logits": 0.3195400908589363, "loss/reg": 0.0, "step": 10270 }, { "epoch": 0.06763157894736842, "grad_norm": 2.25, "grad_norm_var": 0.7010162353515625, "learning_rate": 0.0001, "loss": 3.224, "loss/crossentropy": 2.242165985703468, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.27829615995287893, "loss/reg": 0.0, "step": 10280 }, { "epoch": 0.06769736842105263, "grad_norm": 2.078125, "grad_norm_var": 0.1137115478515625, "learning_rate": 0.0001, "loss": 3.2283, "loss/crossentropy": 2.4408915996551515, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.2770886033773422, "loss/reg": 0.0, "step": 10290 }, { "epoch": 0.06776315789473684, "grad_norm": 2.75, "grad_norm_var": 0.28396708170572915, "learning_rate": 0.0001, "loss": 3.2788, "loss/crossentropy": 2.2959680914878846, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.2606198683381081, "loss/reg": 0.0, "step": 10300 }, { "epoch": 0.06782894736842106, "grad_norm": 2.859375, "grad_norm_var": 0.0829498291015625, "learning_rate": 0.0001, "loss": 3.3487, "loss/crossentropy": 2.430151104927063, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.25218638926744463, "loss/reg": 0.0, "step": 10310 }, { "epoch": 0.06789473684210526, "grad_norm": 3.203125, "grad_norm_var": 0.3111979166666667, "learning_rate": 0.0001, "loss": 3.3676, "loss/crossentropy": 2.1297863006591795, "loss/hidden": 3.415625, "loss/incoh": 0.0, "loss/logits": 0.4299448400735855, "loss/reg": 0.0, "step": 10320 }, { "epoch": 0.06796052631578947, "grad_norm": 3.0625, "grad_norm_var": 0.34820556640625, "learning_rate": 0.0001, "loss": 3.3013, "loss/crossentropy": 2.2383357286453247, "loss/hidden": 2.990625, "loss/incoh": 0.0, "loss/logits": 0.31647931337356566, "loss/reg": 0.0, "step": 10330 }, { "epoch": 0.06802631578947368, "grad_norm": 2.40625, "grad_norm_var": 0.0734283447265625, "learning_rate": 0.0001, "loss": 3.2653, "loss/crossentropy": 2.3983714103698732, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.29214349389076233, "loss/reg": 0.0, "step": 10340 }, { "epoch": 0.0680921052631579, "grad_norm": 2.3125, "grad_norm_var": 0.0652984619140625, "learning_rate": 0.0001, "loss": 3.2579, "loss/crossentropy": 2.5644800424575807, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.3019869774580002, "loss/reg": 0.0, "step": 10350 }, { "epoch": 0.06815789473684211, "grad_norm": 2.53125, "grad_norm_var": 3.350255903165907e+17, "learning_rate": 0.0001, "loss": 3.3114, "loss/crossentropy": 2.6736939191818236, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2768147349357605, "loss/reg": 0.0, "step": 10360 }, { "epoch": 0.06822368421052631, "grad_norm": 4.65625, "grad_norm_var": 3.350255902575034e+17, "learning_rate": 0.0001, "loss": 3.3131, "loss/crossentropy": 2.2038461327552796, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.31388128101825713, "loss/reg": 0.0, "step": 10370 }, { "epoch": 0.06828947368421052, "grad_norm": 3.78125, "grad_norm_var": 0.5317220052083333, "learning_rate": 0.0001, "loss": 3.3243, "loss/crossentropy": 2.3583734750747682, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.26419220119714737, "loss/reg": 0.0, "step": 10380 }, { "epoch": 0.06835526315789474, "grad_norm": 2.640625, "grad_norm_var": 0.2758127848307292, "learning_rate": 0.0001, "loss": 3.287, "loss/crossentropy": 2.2863503098487854, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.23900726288557053, "loss/reg": 0.0, "step": 10390 }, { "epoch": 0.06842105263157895, "grad_norm": 2.3125, "grad_norm_var": 0.04855143229166667, "learning_rate": 0.0001, "loss": 3.2604, "loss/crossentropy": 2.3840556263923647, "loss/hidden": 3.14375, "loss/incoh": 0.0, "loss/logits": 0.30461184978485106, "loss/reg": 0.0, "step": 10400 }, { "epoch": 0.06848684210526315, "grad_norm": 2.953125, "grad_norm_var": 0.15156962076822916, "learning_rate": 0.0001, "loss": 3.3062, "loss/crossentropy": 2.088348960876465, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2698833361268044, "loss/reg": 0.0, "step": 10410 }, { "epoch": 0.06855263157894737, "grad_norm": 2.875, "grad_norm_var": 0.5885894775390625, "learning_rate": 0.0001, "loss": 3.3402, "loss/crossentropy": 2.551842737197876, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.313777893781662, "loss/reg": 0.0, "step": 10420 }, { "epoch": 0.06861842105263158, "grad_norm": 2.46875, "grad_norm_var": 0.9581939697265625, "learning_rate": 0.0001, "loss": 3.3486, "loss/crossentropy": 2.148633885383606, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.29070123881101606, "loss/reg": 0.0, "step": 10430 }, { "epoch": 0.06868421052631579, "grad_norm": 2.59375, "grad_norm_var": 0.2256988525390625, "learning_rate": 0.0001, "loss": 3.3073, "loss/crossentropy": 2.0771638333797453, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.24762868583202363, "loss/reg": 0.0, "step": 10440 }, { "epoch": 0.06875, "grad_norm": 1.9453125, "grad_norm_var": 0.30576553344726565, "learning_rate": 0.0001, "loss": 3.3608, "loss/crossentropy": 1.9622422456741333, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.2823460906744003, "loss/reg": 0.0, "step": 10450 }, { "epoch": 0.0688157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.24808731079101562, "learning_rate": 0.0001, "loss": 3.2114, "loss/crossentropy": 2.4866459488868715, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.25186999291181567, "loss/reg": 0.0, "step": 10460 }, { "epoch": 0.06888157894736842, "grad_norm": 2.59375, "grad_norm_var": 3.232233683268229, "learning_rate": 0.0001, "loss": 3.2672, "loss/crossentropy": 2.4218720883131026, "loss/hidden": 3.2421875, "loss/incoh": 0.0, "loss/logits": 0.34285663813352585, "loss/reg": 0.0, "step": 10470 }, { "epoch": 0.06894736842105263, "grad_norm": 3.0, "grad_norm_var": 0.12942301432291667, "learning_rate": 0.0001, "loss": 3.2774, "loss/crossentropy": 2.2948715806007387, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.22767567485570908, "loss/reg": 0.0, "step": 10480 }, { "epoch": 0.06901315789473685, "grad_norm": 2.40625, "grad_norm_var": 0.09120686848958333, "learning_rate": 0.0001, "loss": 3.2795, "loss/crossentropy": 2.4549028277397156, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.2874615803360939, "loss/reg": 0.0, "step": 10490 }, { "epoch": 0.06907894736842106, "grad_norm": 2.265625, "grad_norm_var": 0.14446512858072916, "learning_rate": 0.0001, "loss": 3.2562, "loss/crossentropy": 2.2758328318595886, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.24516315162181854, "loss/reg": 0.0, "step": 10500 }, { "epoch": 0.06914473684210526, "grad_norm": 2.328125, "grad_norm_var": 0.167626953125, "learning_rate": 0.0001, "loss": 3.3066, "loss/crossentropy": 2.422711133956909, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.24841044396162032, "loss/reg": 0.0, "step": 10510 }, { "epoch": 0.06921052631578947, "grad_norm": 2.953125, "grad_norm_var": 0.0686187744140625, "learning_rate": 0.0001, "loss": 3.2834, "loss/crossentropy": 2.210281264781952, "loss/hidden": 3.3, "loss/incoh": 0.0, "loss/logits": 0.31483527421951296, "loss/reg": 0.0, "step": 10520 }, { "epoch": 0.06927631578947369, "grad_norm": 2.390625, "grad_norm_var": 0.1522857666015625, "learning_rate": 0.0001, "loss": 3.2678, "loss/crossentropy": 2.3035526394844057, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.26298564970493316, "loss/reg": 0.0, "step": 10530 }, { "epoch": 0.0693421052631579, "grad_norm": 2.140625, "grad_norm_var": 1.5601959228515625, "learning_rate": 0.0001, "loss": 3.2926, "loss/crossentropy": 2.5075936675071717, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.27058843374252317, "loss/reg": 0.0, "step": 10540 }, { "epoch": 0.0694078947368421, "grad_norm": 2.171875, "grad_norm_var": 1.5597330729166667, "learning_rate": 0.0001, "loss": 3.2643, "loss/crossentropy": 2.448484253883362, "loss/hidden": 3.1421875, "loss/incoh": 0.0, "loss/logits": 0.3164616659283638, "loss/reg": 0.0, "step": 10550 }, { "epoch": 0.06947368421052631, "grad_norm": 2.15625, "grad_norm_var": 0.3221638997395833, "learning_rate": 0.0001, "loss": 3.2548, "loss/crossentropy": 2.3997972130775453, "loss/hidden": 3.4125, "loss/incoh": 0.0, "loss/logits": 0.3671171858906746, "loss/reg": 0.0, "step": 10560 }, { "epoch": 0.06953947368421053, "grad_norm": 2.46875, "grad_norm_var": 0.33424072265625, "learning_rate": 0.0001, "loss": 3.3035, "loss/crossentropy": 2.3439933180809023, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.24507830888032914, "loss/reg": 0.0, "step": 10570 }, { "epoch": 0.06960526315789474, "grad_norm": 2.5625, "grad_norm_var": 0.15203450520833334, "learning_rate": 0.0001, "loss": 3.3203, "loss/crossentropy": 2.20079083442688, "loss/hidden": 3.203125, "loss/incoh": 0.0, "loss/logits": 0.3732150986790657, "loss/reg": 0.0, "step": 10580 }, { "epoch": 0.06967105263157895, "grad_norm": 2.359375, "grad_norm_var": 0.23407796223958333, "learning_rate": 0.0001, "loss": 3.3251, "loss/crossentropy": 2.353682446479797, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.2952089346945286, "loss/reg": 0.0, "step": 10590 }, { "epoch": 0.06973684210526315, "grad_norm": 2.453125, "grad_norm_var": 0.22082926432291666, "learning_rate": 0.0001, "loss": 3.2719, "loss/crossentropy": 2.3233517169952393, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.290130452811718, "loss/reg": 0.0, "step": 10600 }, { "epoch": 0.06980263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.2807362874348958, "learning_rate": 0.0001, "loss": 3.2558, "loss/crossentropy": 2.466183233261108, "loss/hidden": 3.096875, "loss/incoh": 0.0, "loss/logits": 0.33012075573205946, "loss/reg": 0.0, "step": 10610 }, { "epoch": 0.06986842105263158, "grad_norm": 2.609375, "grad_norm_var": 0.08990478515625, "learning_rate": 0.0001, "loss": 3.3285, "loss/crossentropy": 2.3989938259124757, "loss/hidden": 3.184375, "loss/incoh": 0.0, "loss/logits": 0.37008936554193494, "loss/reg": 0.0, "step": 10620 }, { "epoch": 0.0699342105263158, "grad_norm": 3.09375, "grad_norm_var": 0.05269266764322917, "learning_rate": 0.0001, "loss": 3.2761, "loss/crossentropy": 2.4531158804893494, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.27111856192350386, "loss/reg": 0.0, "step": 10630 }, { "epoch": 0.07, "grad_norm": 2.203125, "grad_norm_var": 0.0650787353515625, "learning_rate": 0.0001, "loss": 3.22, "loss/crossentropy": 2.44055380821228, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.25391832590103147, "loss/reg": 0.0, "step": 10640 }, { "epoch": 0.0700657894736842, "grad_norm": 2.484375, "grad_norm_var": 0.0595367431640625, "learning_rate": 0.0001, "loss": 3.226, "loss/crossentropy": 2.4790910482406616, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.2732370227575302, "loss/reg": 0.0, "step": 10650 }, { "epoch": 0.07013157894736842, "grad_norm": 2248146944.0, "grad_norm_var": 3.158852919285514e+17, "learning_rate": 0.0001, "loss": 3.3733, "loss/crossentropy": 2.1218923926353455, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.23474968373775482, "loss/reg": 0.0, "step": 10660 }, { "epoch": 0.07019736842105263, "grad_norm": 2.40625, "grad_norm_var": 3.158852918981078e+17, "learning_rate": 0.0001, "loss": 3.2364, "loss/crossentropy": 2.3490695118904115, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.3323436751961708, "loss/reg": 0.0, "step": 10670 }, { "epoch": 0.07026315789473685, "grad_norm": 3.1875, "grad_norm_var": 0.07629801432291666, "learning_rate": 0.0001, "loss": 3.1831, "loss/crossentropy": 2.3577569365501403, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.26663027703762054, "loss/reg": 0.0, "step": 10680 }, { "epoch": 0.07032894736842105, "grad_norm": 2.703125, "grad_norm_var": 0.0871978759765625, "learning_rate": 0.0001, "loss": 3.2111, "loss/crossentropy": 2.2620568752288817, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.3401878133416176, "loss/reg": 0.0, "step": 10690 }, { "epoch": 0.07039473684210526, "grad_norm": 2.484375, "grad_norm_var": 0.03775634765625, "learning_rate": 0.0001, "loss": 3.252, "loss/crossentropy": 2.223601281642914, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.26251988410949706, "loss/reg": 0.0, "step": 10700 }, { "epoch": 0.07046052631578947, "grad_norm": 2.53125, "grad_norm_var": 0.020702107747395834, "learning_rate": 0.0001, "loss": 3.1904, "loss/crossentropy": 2.2720033645629885, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.2705120757222176, "loss/reg": 0.0, "step": 10710 }, { "epoch": 0.07052631578947369, "grad_norm": 2.203125, "grad_norm_var": 0.36824442545572916, "learning_rate": 0.0001, "loss": 3.2981, "loss/crossentropy": 2.3247862100601195, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.25748861730098727, "loss/reg": 0.0, "step": 10720 }, { "epoch": 0.0705921052631579, "grad_norm": 2.421875, "grad_norm_var": 0.27925796508789064, "learning_rate": 0.0001, "loss": 3.1731, "loss/crossentropy": 1.9388428241014481, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.21091363951563835, "loss/reg": 0.0, "step": 10730 }, { "epoch": 0.0706578947368421, "grad_norm": 2.390625, "grad_norm_var": 0.10903294881184895, "learning_rate": 0.0001, "loss": 3.2623, "loss/crossentropy": 2.338471806049347, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.2822930008172989, "loss/reg": 0.0, "step": 10740 }, { "epoch": 0.07072368421052631, "grad_norm": 1.96875, "grad_norm_var": 0.11550191243489584, "learning_rate": 0.0001, "loss": 3.2306, "loss/crossentropy": 2.275705647468567, "loss/hidden": 3.1640625, "loss/incoh": 0.0, "loss/logits": 0.30476620346307753, "loss/reg": 0.0, "step": 10750 }, { "epoch": 0.07078947368421053, "grad_norm": 2.171875, "grad_norm_var": 1.7166033426920573, "learning_rate": 0.0001, "loss": 3.1802, "loss/crossentropy": 2.375147843360901, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.3041233107447624, "loss/reg": 0.0, "step": 10760 }, { "epoch": 0.07085526315789474, "grad_norm": 2.75, "grad_norm_var": 1.698127237955729, "learning_rate": 0.0001, "loss": 3.308, "loss/crossentropy": 2.3641104817390444, "loss/hidden": 2.9734375, "loss/incoh": 0.0, "loss/logits": 0.2578106954693794, "loss/reg": 0.0, "step": 10770 }, { "epoch": 0.07092105263157895, "grad_norm": 2.390625, "grad_norm_var": 0.33463134765625, "learning_rate": 0.0001, "loss": 3.2267, "loss/crossentropy": 2.3689509868621825, "loss/hidden": 3.053125, "loss/incoh": 0.0, "loss/logits": 0.3170511037111282, "loss/reg": 0.0, "step": 10780 }, { "epoch": 0.07098684210526315, "grad_norm": 2.140625, "grad_norm_var": 0.08876851399739584, "learning_rate": 0.0001, "loss": 3.2416, "loss/crossentropy": 2.0522693753242494, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.25120987445116044, "loss/reg": 0.0, "step": 10790 }, { "epoch": 0.07105263157894737, "grad_norm": 2.734375, "grad_norm_var": 0.1547027587890625, "learning_rate": 0.0001, "loss": 3.2808, "loss/crossentropy": 2.193123185634613, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.30211101323366163, "loss/reg": 0.0, "step": 10800 }, { "epoch": 0.07111842105263158, "grad_norm": 2.09375, "grad_norm_var": 0.11612955729166667, "learning_rate": 0.0001, "loss": 3.1868, "loss/crossentropy": 2.6722516775131226, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.26585151851177213, "loss/reg": 0.0, "step": 10810 }, { "epoch": 0.0711842105263158, "grad_norm": 2.265625, "grad_norm_var": 0.3016998291015625, "learning_rate": 0.0001, "loss": 3.1683, "loss/crossentropy": 2.480617439746857, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.2681833073496819, "loss/reg": 0.0, "step": 10820 }, { "epoch": 0.07125, "grad_norm": 3.8125, "grad_norm_var": 0.1811431884765625, "learning_rate": 0.0001, "loss": 3.2239, "loss/crossentropy": 2.2105371236801146, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.27378717362880706, "loss/reg": 0.0, "step": 10830 }, { "epoch": 0.07131578947368421, "grad_norm": 2.203125, "grad_norm_var": 0.17531636555989583, "learning_rate": 0.0001, "loss": 3.1981, "loss/crossentropy": 2.201746928691864, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.3672773316502571, "loss/reg": 0.0, "step": 10840 }, { "epoch": 0.07138157894736842, "grad_norm": 4.28125, "grad_norm_var": 0.34869791666666666, "learning_rate": 0.0001, "loss": 3.2799, "loss/crossentropy": 2.322747588157654, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.262615929543972, "loss/reg": 0.0, "step": 10850 }, { "epoch": 0.07144736842105263, "grad_norm": 2.796875, "grad_norm_var": 0.7465810139973958, "learning_rate": 0.0001, "loss": 3.2065, "loss/crossentropy": 1.9148864209651948, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.2080842524766922, "loss/reg": 0.0, "step": 10860 }, { "epoch": 0.07151315789473685, "grad_norm": 2.53125, "grad_norm_var": 0.053873697916666664, "learning_rate": 0.0001, "loss": 3.2002, "loss/crossentropy": 2.314997375011444, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.2829948261380196, "loss/reg": 0.0, "step": 10870 }, { "epoch": 0.07157894736842105, "grad_norm": 2.25, "grad_norm_var": 0.08782145182291666, "learning_rate": 0.0001, "loss": 3.2222, "loss/crossentropy": 2.4381492733955383, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2620039567351341, "loss/reg": 0.0, "step": 10880 }, { "epoch": 0.07164473684210526, "grad_norm": 2.984375, "grad_norm_var": 0.10087788899739583, "learning_rate": 0.0001, "loss": 3.2769, "loss/crossentropy": 2.478635573387146, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.30705118626356126, "loss/reg": 0.0, "step": 10890 }, { "epoch": 0.07171052631578947, "grad_norm": 2.515625, "grad_norm_var": 0.12694066365559895, "learning_rate": 0.0001, "loss": 3.1767, "loss/crossentropy": 2.145402270555496, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.26391510516405103, "loss/reg": 0.0, "step": 10900 }, { "epoch": 0.07177631578947369, "grad_norm": 2.34375, "grad_norm_var": 0.13940404256184896, "learning_rate": 0.0001, "loss": 3.2976, "loss/crossentropy": 2.1119534373283386, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.24341508150100707, "loss/reg": 0.0, "step": 10910 }, { "epoch": 0.0718421052631579, "grad_norm": 2.375, "grad_norm_var": 0.193408203125, "learning_rate": 0.0001, "loss": 3.2699, "loss/crossentropy": 2.2503302097320557, "loss/hidden": 3.196875, "loss/incoh": 0.0, "loss/logits": 0.2697600871324539, "loss/reg": 0.0, "step": 10920 }, { "epoch": 0.0719078947368421, "grad_norm": 4.9375, "grad_norm_var": 1.1786092122395833, "learning_rate": 0.0001, "loss": 3.3635, "loss/crossentropy": 2.2035016298294066, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.23886747062206268, "loss/reg": 0.0, "step": 10930 }, { "epoch": 0.07197368421052631, "grad_norm": 2.5, "grad_norm_var": 1.1909088134765624, "learning_rate": 0.0001, "loss": 3.2664, "loss/crossentropy": 2.438611125946045, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.2649322673678398, "loss/reg": 0.0, "step": 10940 }, { "epoch": 0.07203947368421053, "grad_norm": 2.71875, "grad_norm_var": 0.1803375244140625, "learning_rate": 0.0001, "loss": 3.2795, "loss/crossentropy": 2.300194537639618, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.2784364491701126, "loss/reg": 0.0, "step": 10950 }, { "epoch": 0.07210526315789474, "grad_norm": 2.328125, "grad_norm_var": 0.13961181640625, "learning_rate": 0.0001, "loss": 3.196, "loss/crossentropy": 2.297783041000366, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.23439482748508453, "loss/reg": 0.0, "step": 10960 }, { "epoch": 0.07217105263157894, "grad_norm": 2.5, "grad_norm_var": 0.03483784993489583, "learning_rate": 0.0001, "loss": 3.2816, "loss/crossentropy": 2.2468614101409914, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.26336451917886733, "loss/reg": 0.0, "step": 10970 }, { "epoch": 0.07223684210526315, "grad_norm": 2.515625, "grad_norm_var": 0.027684529622395832, "learning_rate": 0.0001, "loss": 3.2387, "loss/crossentropy": 2.5107889652252195, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2557321682572365, "loss/reg": 0.0, "step": 10980 }, { "epoch": 0.07230263157894737, "grad_norm": 2.5, "grad_norm_var": 0.05127665201822917, "learning_rate": 0.0001, "loss": 3.2995, "loss/crossentropy": 2.221148931980133, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.2837982401251793, "loss/reg": 0.0, "step": 10990 }, { "epoch": 0.07236842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.08108317057291667, "learning_rate": 0.0001, "loss": 3.289, "loss/crossentropy": 2.515943694114685, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.269567608833313, "loss/reg": 0.0, "step": 11000 }, { "epoch": 0.0724342105263158, "grad_norm": 2.328125, "grad_norm_var": 0.10301106770833333, "learning_rate": 0.0001, "loss": 3.2421, "loss/crossentropy": 2.22921404838562, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24364713877439498, "loss/reg": 0.0, "step": 11010 }, { "epoch": 0.0725, "grad_norm": 2.40625, "grad_norm_var": 0.15533447265625, "learning_rate": 0.0001, "loss": 3.2872, "loss/crossentropy": 2.5498072266578675, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.2900205120444298, "loss/reg": 0.0, "step": 11020 }, { "epoch": 0.07256578947368421, "grad_norm": 2.828125, "grad_norm_var": 0.19057515462239583, "learning_rate": 0.0001, "loss": 3.2853, "loss/crossentropy": 2.607512426376343, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.3007731422781944, "loss/reg": 0.0, "step": 11030 }, { "epoch": 0.07263157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.13447240193684895, "learning_rate": 0.0001, "loss": 3.1687, "loss/crossentropy": 1.9578089714050293, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23623087108135224, "loss/reg": 0.0, "step": 11040 }, { "epoch": 0.07269736842105264, "grad_norm": 2.453125, "grad_norm_var": 0.2533192952473958, "learning_rate": 0.0001, "loss": 3.2864, "loss/crossentropy": 2.473111295700073, "loss/hidden": 3.334375, "loss/incoh": 0.0, "loss/logits": 0.31404276490211486, "loss/reg": 0.0, "step": 11050 }, { "epoch": 0.07276315789473685, "grad_norm": 2.625, "grad_norm_var": 0.1464019775390625, "learning_rate": 0.0001, "loss": 3.2587, "loss/crossentropy": 2.1964801430702208, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.26576130986213686, "loss/reg": 0.0, "step": 11060 }, { "epoch": 0.07282894736842105, "grad_norm": 2.921875, "grad_norm_var": 0.12649637858072918, "learning_rate": 0.0001, "loss": 3.2663, "loss/crossentropy": 2.2337945103645325, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2691087871789932, "loss/reg": 0.0, "step": 11070 }, { "epoch": 0.07289473684210526, "grad_norm": 2.125, "grad_norm_var": 0.08884175618489583, "learning_rate": 0.0001, "loss": 3.3071, "loss/crossentropy": 2.5003953099250795, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.3058940455317497, "loss/reg": 0.0, "step": 11080 }, { "epoch": 0.07296052631578948, "grad_norm": 2.46875, "grad_norm_var": 0.14674479166666668, "learning_rate": 0.0001, "loss": 3.2275, "loss/crossentropy": 2.3431849002838137, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2692634254693985, "loss/reg": 0.0, "step": 11090 }, { "epoch": 0.07302631578947369, "grad_norm": 2.609375, "grad_norm_var": 0.08789774576822916, "learning_rate": 0.0001, "loss": 3.2168, "loss/crossentropy": 2.2301442503929136, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.24614981114864348, "loss/reg": 0.0, "step": 11100 }, { "epoch": 0.07309210526315789, "grad_norm": 2.78125, "grad_norm_var": 0.09053446451822916, "learning_rate": 0.0001, "loss": 3.3336, "loss/crossentropy": 2.624728870391846, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.30191341042518616, "loss/reg": 0.0, "step": 11110 }, { "epoch": 0.0731578947368421, "grad_norm": 2.65625, "grad_norm_var": 0.23191731770833332, "learning_rate": 0.0001, "loss": 3.3217, "loss/crossentropy": 2.204717183113098, "loss/hidden": 3.4859375, "loss/incoh": 0.0, "loss/logits": 0.5070606812834739, "loss/reg": 0.0, "step": 11120 }, { "epoch": 0.07322368421052632, "grad_norm": 2.640625, "grad_norm_var": 0.2526519775390625, "learning_rate": 0.0001, "loss": 3.345, "loss/crossentropy": 2.558793139457703, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.42952366173267365, "loss/reg": 0.0, "step": 11130 }, { "epoch": 0.07328947368421053, "grad_norm": 2.53125, "grad_norm_var": 0.0856109619140625, "learning_rate": 0.0001, "loss": 3.3754, "loss/crossentropy": 2.227391791343689, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.27742871195077895, "loss/reg": 0.0, "step": 11140 }, { "epoch": 0.07335526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.07666015625, "learning_rate": 0.0001, "loss": 3.2194, "loss/crossentropy": 2.32956976890564, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.2826590985059738, "loss/reg": 0.0, "step": 11150 }, { "epoch": 0.07342105263157894, "grad_norm": 8.625, "grad_norm_var": 2.3614542643229166, "learning_rate": 0.0001, "loss": 3.2431, "loss/crossentropy": 2.0696181058883667, "loss/hidden": 3.3640625, "loss/incoh": 0.0, "loss/logits": 0.2718909472227097, "loss/reg": 0.0, "step": 11160 }, { "epoch": 0.07348684210526316, "grad_norm": 2.234375, "grad_norm_var": 2.3623697916666666, "learning_rate": 0.0001, "loss": 3.2983, "loss/crossentropy": 2.5612054228782655, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.26728835999965667, "loss/reg": 0.0, "step": 11170 }, { "epoch": 0.07355263157894737, "grad_norm": 2.171875, "grad_norm_var": 0.11398111979166667, "learning_rate": 0.0001, "loss": 3.1779, "loss/crossentropy": 2.5427613735198973, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.24258261919021606, "loss/reg": 0.0, "step": 11180 }, { "epoch": 0.07361842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.07968648274739583, "learning_rate": 0.0001, "loss": 3.1565, "loss/crossentropy": 2.3772116184234617, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.24017660170793534, "loss/reg": 0.0, "step": 11190 }, { "epoch": 0.07368421052631578, "grad_norm": 2.9375, "grad_norm_var": 0.058652496337890624, "learning_rate": 0.0001, "loss": 3.2383, "loss/crossentropy": 2.284471166133881, "loss/hidden": 3.0625, "loss/incoh": 0.0, "loss/logits": 0.31355464905500413, "loss/reg": 0.0, "step": 11200 }, { "epoch": 0.07375, "grad_norm": 2.5625, "grad_norm_var": 0.04164937337239583, "learning_rate": 0.0001, "loss": 3.2288, "loss/crossentropy": 2.1727640271186828, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.24059856683015823, "loss/reg": 0.0, "step": 11210 }, { "epoch": 0.07381578947368421, "grad_norm": 2.53125, "grad_norm_var": 0.015363566080729167, "learning_rate": 0.0001, "loss": 3.2857, "loss/crossentropy": 2.377478325366974, "loss/hidden": 3.0984375, "loss/incoh": 0.0, "loss/logits": 0.2916824325919151, "loss/reg": 0.0, "step": 11220 }, { "epoch": 0.07388157894736842, "grad_norm": 3.140625, "grad_norm_var": 0.04091695149739583, "learning_rate": 0.0001, "loss": 3.2848, "loss/crossentropy": 2.372981941699982, "loss/hidden": 3.140625, "loss/incoh": 0.0, "loss/logits": 0.3395596519112587, "loss/reg": 0.0, "step": 11230 }, { "epoch": 0.07394736842105264, "grad_norm": 2.59375, "grad_norm_var": 1.1325103759765625, "learning_rate": 0.0001, "loss": 3.2954, "loss/crossentropy": 2.571339511871338, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.3173587560653687, "loss/reg": 0.0, "step": 11240 }, { "epoch": 0.07401315789473684, "grad_norm": 2.828125, "grad_norm_var": 0.11298421223958334, "learning_rate": 0.0001, "loss": 3.213, "loss/crossentropy": 2.093873751163483, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.23975073918700218, "loss/reg": 0.0, "step": 11250 }, { "epoch": 0.07407894736842105, "grad_norm": 2.421875, "grad_norm_var": 0.13909403483072916, "learning_rate": 0.0001, "loss": 3.25, "loss/crossentropy": 2.55220787525177, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.29909972846508026, "loss/reg": 0.0, "step": 11260 }, { "epoch": 0.07414473684210526, "grad_norm": 2.734375, "grad_norm_var": 0.11393229166666667, "learning_rate": 0.0001, "loss": 3.2424, "loss/crossentropy": 2.18590772151947, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.2539385199546814, "loss/reg": 0.0, "step": 11270 }, { "epoch": 0.07421052631578948, "grad_norm": 3.078125, "grad_norm_var": 0.07273661295572917, "learning_rate": 0.0001, "loss": 3.2354, "loss/crossentropy": 2.4804351210594175, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2747698500752449, "loss/reg": 0.0, "step": 11280 }, { "epoch": 0.07427631578947369, "grad_norm": 2.140625, "grad_norm_var": 0.17068684895833333, "learning_rate": 0.0001, "loss": 3.2114, "loss/crossentropy": 2.4408376574516297, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.3068179443478584, "loss/reg": 0.0, "step": 11290 }, { "epoch": 0.07434210526315789, "grad_norm": 2.5625, "grad_norm_var": 0.27847900390625, "learning_rate": 0.0001, "loss": 3.3225, "loss/crossentropy": 2.284189748764038, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.2627576723694801, "loss/reg": 0.0, "step": 11300 }, { "epoch": 0.0744078947368421, "grad_norm": 2.546875, "grad_norm_var": 0.22431233723958333, "learning_rate": 0.0001, "loss": 3.2794, "loss/crossentropy": 2.348969095945358, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.22996244430541993, "loss/reg": 0.0, "step": 11310 }, { "epoch": 0.07447368421052632, "grad_norm": 2.890625, "grad_norm_var": 0.8210245768229166, "learning_rate": 0.0001, "loss": 3.2548, "loss/crossentropy": 2.455811655521393, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2752078205347061, "loss/reg": 0.0, "step": 11320 }, { "epoch": 0.07453947368421053, "grad_norm": 4.75, "grad_norm_var": 0.3614095052083333, "learning_rate": 0.0001, "loss": 3.2713, "loss/crossentropy": 2.4881197214126587, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.33918842375278474, "loss/reg": 0.0, "step": 11330 }, { "epoch": 0.07460526315789473, "grad_norm": 2.59375, "grad_norm_var": 0.4044596354166667, "learning_rate": 0.0001, "loss": 3.334, "loss/crossentropy": 2.4584757328033446, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.3295674562454224, "loss/reg": 0.0, "step": 11340 }, { "epoch": 0.07467105263157894, "grad_norm": 2.703125, "grad_norm_var": 0.9745920817057292, "learning_rate": 0.0001, "loss": 3.2347, "loss/crossentropy": 2.413297247886658, "loss/hidden": 3.0890625, "loss/incoh": 0.0, "loss/logits": 0.2875028237700462, "loss/reg": 0.0, "step": 11350 }, { "epoch": 0.07473684210526316, "grad_norm": 2.609375, "grad_norm_var": 0.0769439697265625, "learning_rate": 0.0001, "loss": 3.2495, "loss/crossentropy": 2.297819769382477, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.28927008211612704, "loss/reg": 0.0, "step": 11360 }, { "epoch": 0.07480263157894737, "grad_norm": 2.171875, "grad_norm_var": 0.054182942708333334, "learning_rate": 0.0001, "loss": 3.1982, "loss/crossentropy": 2.2709707379341126, "loss/hidden": 3.03125, "loss/incoh": 0.0, "loss/logits": 0.29085248410701753, "loss/reg": 0.0, "step": 11370 }, { "epoch": 0.07486842105263158, "grad_norm": 2.609375, "grad_norm_var": 0.315283203125, "learning_rate": 0.0001, "loss": 3.2877, "loss/crossentropy": 2.5602762937545775, "loss/hidden": 3.1828125, "loss/incoh": 0.0, "loss/logits": 0.3155085578560829, "loss/reg": 0.0, "step": 11380 }, { "epoch": 0.07493421052631578, "grad_norm": 8.375, "grad_norm_var": 2.533186848958333, "learning_rate": 0.0001, "loss": 3.2192, "loss/crossentropy": 2.231611895561218, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.27611204236745834, "loss/reg": 0.0, "step": 11390 }, { "epoch": 0.075, "grad_norm": 2.109375, "grad_norm_var": 2.313280232747396, "learning_rate": 0.0001, "loss": 3.3273, "loss/crossentropy": 2.2164941787719727, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2650335058569908, "loss/reg": 0.0, "step": 11400 }, { "epoch": 0.07506578947368421, "grad_norm": 2.265625, "grad_norm_var": 0.3658528645833333, "learning_rate": 0.0001, "loss": 3.2587, "loss/crossentropy": 2.2440002799034118, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.3373932957649231, "loss/reg": 0.0, "step": 11410 }, { "epoch": 0.07513157894736842, "grad_norm": 2.390625, "grad_norm_var": 0.06290690104166667, "learning_rate": 0.0001, "loss": 3.1439, "loss/crossentropy": 2.405060076713562, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.26099992990493776, "loss/reg": 0.0, "step": 11420 }, { "epoch": 0.07519736842105264, "grad_norm": 2.40625, "grad_norm_var": 0.04853108723958333, "learning_rate": 0.0001, "loss": 3.1454, "loss/crossentropy": 2.5482593178749084, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.2424457401037216, "loss/reg": 0.0, "step": 11430 }, { "epoch": 0.07526315789473684, "grad_norm": 2.296875, "grad_norm_var": 0.5184244791666667, "learning_rate": 0.0001, "loss": 3.2189, "loss/crossentropy": 2.1846509099006655, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.29890656769275664, "loss/reg": 0.0, "step": 11440 }, { "epoch": 0.07532894736842105, "grad_norm": 2.46875, "grad_norm_var": 0.03676656087239583, "learning_rate": 0.0001, "loss": 3.2528, "loss/crossentropy": 2.3350785970687866, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.26758097261190417, "loss/reg": 0.0, "step": 11450 }, { "epoch": 0.07539473684210526, "grad_norm": 2.40625, "grad_norm_var": 56.07302958170573, "learning_rate": 0.0001, "loss": 3.3264, "loss/crossentropy": 2.357296335697174, "loss/hidden": 3.0171875, "loss/incoh": 0.0, "loss/logits": 0.32786626666784285, "loss/reg": 0.0, "step": 11460 }, { "epoch": 0.07546052631578948, "grad_norm": 2.71875, "grad_norm_var": 0.05142822265625, "learning_rate": 0.0001, "loss": 3.1956, "loss/crossentropy": 2.29429577589035, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.22037848085165024, "loss/reg": 0.0, "step": 11470 }, { "epoch": 0.07552631578947368, "grad_norm": 2.84375, "grad_norm_var": 0.09621988932291667, "learning_rate": 0.0001, "loss": 3.2053, "loss/crossentropy": 2.450187027454376, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.27029853165149687, "loss/reg": 0.0, "step": 11480 }, { "epoch": 0.07559210526315789, "grad_norm": 2.421875, "grad_norm_var": 0.06201171875, "learning_rate": 0.0001, "loss": 3.2252, "loss/crossentropy": 1.9360981225967406, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.23251112401485444, "loss/reg": 0.0, "step": 11490 }, { "epoch": 0.0756578947368421, "grad_norm": 2.375, "grad_norm_var": 0.07566731770833333, "learning_rate": 0.0001, "loss": 3.2003, "loss/crossentropy": 2.21165417432785, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.2515560120344162, "loss/reg": 0.0, "step": 11500 }, { "epoch": 0.07572368421052632, "grad_norm": 2.546875, "grad_norm_var": 0.07636311848958334, "learning_rate": 0.0001, "loss": 3.1758, "loss/crossentropy": 2.5108731746673585, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.31811543107032775, "loss/reg": 0.0, "step": 11510 }, { "epoch": 0.07578947368421053, "grad_norm": 2.21875, "grad_norm_var": 0.17082926432291667, "learning_rate": 0.0001, "loss": 3.2811, "loss/crossentropy": 2.1942604899406435, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.25081480443477633, "loss/reg": 0.0, "step": 11520 }, { "epoch": 0.07585526315789473, "grad_norm": 2.859375, "grad_norm_var": 0.07385660807291666, "learning_rate": 0.0001, "loss": 3.239, "loss/crossentropy": 2.0190611362457274, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.28278502970933916, "loss/reg": 0.0, "step": 11530 }, { "epoch": 0.07592105263157894, "grad_norm": 2.296875, "grad_norm_var": 0.045166015625, "learning_rate": 0.0001, "loss": 3.2825, "loss/crossentropy": 2.3928887605667115, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.31386475563049315, "loss/reg": 0.0, "step": 11540 }, { "epoch": 0.07598684210526316, "grad_norm": 2.34375, "grad_norm_var": 0.05419514973958333, "learning_rate": 0.0001, "loss": 3.236, "loss/crossentropy": 2.3381729245185854, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.2617738708853722, "loss/reg": 0.0, "step": 11550 }, { "epoch": 0.07605263157894737, "grad_norm": 3.421875, "grad_norm_var": 0.10877278645833334, "learning_rate": 0.0001, "loss": 3.1435, "loss/crossentropy": 2.1223382353782654, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.2554148808121681, "loss/reg": 0.0, "step": 11560 }, { "epoch": 0.07611842105263159, "grad_norm": 2.75, "grad_norm_var": 0.13093973795572916, "learning_rate": 0.0001, "loss": 3.2511, "loss/crossentropy": 2.4102694511413576, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.2813078135251999, "loss/reg": 0.0, "step": 11570 }, { "epoch": 0.07618421052631578, "grad_norm": 2.296875, "grad_norm_var": 0.12099507649739584, "learning_rate": 0.0001, "loss": 3.2712, "loss/crossentropy": 2.2883424520492555, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.27575887441635133, "loss/reg": 0.0, "step": 11580 }, { "epoch": 0.07625, "grad_norm": 2.09375, "grad_norm_var": 0.15095926920572916, "learning_rate": 0.0001, "loss": 3.2856, "loss/crossentropy": 2.4679728865623476, "loss/hidden": 2.990625, "loss/incoh": 0.0, "loss/logits": 0.3342022061347961, "loss/reg": 0.0, "step": 11590 }, { "epoch": 0.07631578947368421, "grad_norm": 2.703125, "grad_norm_var": 0.1062896728515625, "learning_rate": 0.0001, "loss": 3.2302, "loss/crossentropy": 2.35604043006897, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.30481296926736834, "loss/reg": 0.0, "step": 11600 }, { "epoch": 0.07638157894736843, "grad_norm": 2.4375, "grad_norm_var": 0.17685445149739584, "learning_rate": 0.0001, "loss": 3.3621, "loss/crossentropy": 2.302362835407257, "loss/hidden": 3.153125, "loss/incoh": 0.0, "loss/logits": 0.29829359203577044, "loss/reg": 0.0, "step": 11610 }, { "epoch": 0.07644736842105262, "grad_norm": 2.515625, "grad_norm_var": 2.837443679954338e+17, "learning_rate": 0.0001, "loss": 3.365, "loss/crossentropy": 2.3786125659942625, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.2504860758781433, "loss/reg": 0.0, "step": 11620 }, { "epoch": 0.07651315789473684, "grad_norm": 2.59375, "grad_norm_var": 2.8374436804315274e+17, "learning_rate": 0.0001, "loss": 3.285, "loss/crossentropy": 1.7914829134941102, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.26675148904323576, "loss/reg": 0.0, "step": 11630 }, { "epoch": 0.07657894736842105, "grad_norm": 2.40625, "grad_norm_var": 0.047028605143229166, "learning_rate": 0.0001, "loss": 3.2398, "loss/crossentropy": 2.3633928418159487, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.2506480649113655, "loss/reg": 0.0, "step": 11640 }, { "epoch": 0.07664473684210527, "grad_norm": 2.1875, "grad_norm_var": 0.12795308430989583, "learning_rate": 0.0001, "loss": 3.1729, "loss/crossentropy": 2.3739442467689513, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.24874649345874786, "loss/reg": 0.0, "step": 11650 }, { "epoch": 0.07671052631578948, "grad_norm": 3.140625, "grad_norm_var": 0.057938639322916666, "learning_rate": 0.0001, "loss": 3.19, "loss/crossentropy": 1.946711039543152, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.26388829201459885, "loss/reg": 0.0, "step": 11660 }, { "epoch": 0.07677631578947368, "grad_norm": 2.28125, "grad_norm_var": 0.16907145182291666, "learning_rate": 0.0001, "loss": 3.2141, "loss/crossentropy": 2.5971063375473022, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.29624524116516116, "loss/reg": 0.0, "step": 11670 }, { "epoch": 0.07684210526315789, "grad_norm": 2.796875, "grad_norm_var": 0.20071512858072918, "learning_rate": 0.0001, "loss": 3.2566, "loss/crossentropy": 2.5601096868515016, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.272810535132885, "loss/reg": 0.0, "step": 11680 }, { "epoch": 0.0769078947368421, "grad_norm": 2.375, "grad_norm_var": 0.14016927083333333, "learning_rate": 0.0001, "loss": 3.1653, "loss/crossentropy": 2.4755476355552672, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.27636886537075045, "loss/reg": 0.0, "step": 11690 }, { "epoch": 0.07697368421052632, "grad_norm": 2.484375, "grad_norm_var": 0.641162109375, "learning_rate": 0.0001, "loss": 3.1798, "loss/crossentropy": 2.558279812335968, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.24465988874435424, "loss/reg": 0.0, "step": 11700 }, { "epoch": 0.07703947368421053, "grad_norm": 2.25, "grad_norm_var": 0.0510162353515625, "learning_rate": 0.0001, "loss": 3.1825, "loss/crossentropy": 2.437064230442047, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.25304732471704483, "loss/reg": 0.0, "step": 11710 }, { "epoch": 0.07710526315789473, "grad_norm": 2.578125, "grad_norm_var": 0.012482706705729167, "learning_rate": 0.0001, "loss": 3.1497, "loss/crossentropy": 2.3207133412361145, "loss/hidden": 3.0640625, "loss/incoh": 0.0, "loss/logits": 0.26052851378917696, "loss/reg": 0.0, "step": 11720 }, { "epoch": 0.07717105263157895, "grad_norm": 2.375, "grad_norm_var": 0.03972066243489583, "learning_rate": 0.0001, "loss": 3.2354, "loss/crossentropy": 2.210064744949341, "loss/hidden": 3.1765625, "loss/incoh": 0.0, "loss/logits": 0.321417099237442, "loss/reg": 0.0, "step": 11730 }, { "epoch": 0.07723684210526316, "grad_norm": 2.234375, "grad_norm_var": 0.11972249348958333, "learning_rate": 0.0001, "loss": 3.2828, "loss/crossentropy": 2.2914742827415466, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.30591298937797545, "loss/reg": 0.0, "step": 11740 }, { "epoch": 0.07730263157894737, "grad_norm": 2.875, "grad_norm_var": 0.48640034993489584, "learning_rate": 0.0001, "loss": 3.2259, "loss/crossentropy": 2.2794241905212402, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.3180560126900673, "loss/reg": 0.0, "step": 11750 }, { "epoch": 0.07736842105263157, "grad_norm": 3.15625, "grad_norm_var": 0.4784576416015625, "learning_rate": 0.0001, "loss": 3.1508, "loss/crossentropy": 2.237711024284363, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.22340844720602035, "loss/reg": 0.0, "step": 11760 }, { "epoch": 0.07743421052631579, "grad_norm": 2.34375, "grad_norm_var": 0.2001129150390625, "learning_rate": 0.0001, "loss": 3.2992, "loss/crossentropy": 2.4300220131874086, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.279655484855175, "loss/reg": 0.0, "step": 11770 }, { "epoch": 0.0775, "grad_norm": 2.4375, "grad_norm_var": 0.0681549072265625, "learning_rate": 0.0001, "loss": 3.1433, "loss/crossentropy": 2.1450002193450928, "loss/hidden": 2.9484375, "loss/incoh": 0.0, "loss/logits": 0.26157657504081727, "loss/reg": 0.0, "step": 11780 }, { "epoch": 0.07756578947368421, "grad_norm": 2.671875, "grad_norm_var": 0.038492838541666664, "learning_rate": 0.0001, "loss": 3.2329, "loss/crossentropy": 2.273455095291138, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2625778928399086, "loss/reg": 0.0, "step": 11790 }, { "epoch": 0.07763157894736843, "grad_norm": 2.34375, "grad_norm_var": 0.06272379557291667, "learning_rate": 0.0001, "loss": 3.2736, "loss/crossentropy": 2.2713128685951234, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.2852136388421059, "loss/reg": 0.0, "step": 11800 }, { "epoch": 0.07769736842105263, "grad_norm": 2.546875, "grad_norm_var": 0.14368082682291666, "learning_rate": 0.0001, "loss": 3.2063, "loss/crossentropy": 2.276504385471344, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.2478427141904831, "loss/reg": 0.0, "step": 11810 }, { "epoch": 0.07776315789473684, "grad_norm": 2.3125, "grad_norm_var": 0.14405008951822917, "learning_rate": 0.0001, "loss": 3.2627, "loss/crossentropy": 2.3879762291908264, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.26758207380771637, "loss/reg": 0.0, "step": 11820 }, { "epoch": 0.07782894736842105, "grad_norm": 2.21875, "grad_norm_var": 16.03980712890625, "learning_rate": 0.0001, "loss": 3.1689, "loss/crossentropy": 2.238702917098999, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.25569620728492737, "loss/reg": 0.0, "step": 11830 }, { "epoch": 0.07789473684210527, "grad_norm": 2.09375, "grad_norm_var": 16.03136774698893, "learning_rate": 0.0001, "loss": 3.1737, "loss/crossentropy": 2.11713285446167, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.2322181984782219, "loss/reg": 0.0, "step": 11840 }, { "epoch": 0.07796052631578948, "grad_norm": 3.53125, "grad_norm_var": 0.14957249959309896, "learning_rate": 0.0001, "loss": 3.2023, "loss/crossentropy": 2.279906690120697, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.26055515706539156, "loss/reg": 0.0, "step": 11850 }, { "epoch": 0.07802631578947368, "grad_norm": 2.453125, "grad_norm_var": 0.1443756103515625, "learning_rate": 0.0001, "loss": 3.1657, "loss/crossentropy": 2.338582932949066, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.23672561049461366, "loss/reg": 0.0, "step": 11860 }, { "epoch": 0.0780921052631579, "grad_norm": 2.171875, "grad_norm_var": 0.08467508951822916, "learning_rate": 0.0001, "loss": 3.1991, "loss/crossentropy": 2.3666534066200255, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.27455085664987566, "loss/reg": 0.0, "step": 11870 }, { "epoch": 0.0781578947368421, "grad_norm": 2.359375, "grad_norm_var": 0.08329671223958333, "learning_rate": 0.0001, "loss": 3.1862, "loss/crossentropy": 2.232216811180115, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.29752269983291624, "loss/reg": 0.0, "step": 11880 }, { "epoch": 0.07822368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.013792928059895833, "learning_rate": 0.0001, "loss": 3.2052, "loss/crossentropy": 2.5155674695968626, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2883127599954605, "loss/reg": 0.0, "step": 11890 }, { "epoch": 0.07828947368421052, "grad_norm": 2.109375, "grad_norm_var": 0.05334879557291667, "learning_rate": 0.0001, "loss": 3.1696, "loss/crossentropy": 2.3109512329101562, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2651562377810478, "loss/reg": 0.0, "step": 11900 }, { "epoch": 0.07835526315789473, "grad_norm": 2.578125, "grad_norm_var": 0.1116363525390625, "learning_rate": 0.0001, "loss": 3.2305, "loss/crossentropy": 2.4196372270584106, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.24704778790473939, "loss/reg": 0.0, "step": 11910 }, { "epoch": 0.07842105263157895, "grad_norm": 2.25, "grad_norm_var": 0.10711161295572917, "learning_rate": 0.0001, "loss": 3.2486, "loss/crossentropy": 2.359645998477936, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.28654517233371735, "loss/reg": 0.0, "step": 11920 }, { "epoch": 0.07848684210526316, "grad_norm": 2.75, "grad_norm_var": 0.06504618326822917, "learning_rate": 0.0001, "loss": 3.2312, "loss/crossentropy": 2.364006555080414, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.24593105614185334, "loss/reg": 0.0, "step": 11930 }, { "epoch": 0.07855263157894737, "grad_norm": 2.390625, "grad_norm_var": 0.11108296712239583, "learning_rate": 0.0001, "loss": 3.1318, "loss/crossentropy": 2.2041036009788515, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.2498387575149536, "loss/reg": 0.0, "step": 11940 }, { "epoch": 0.07861842105263157, "grad_norm": 3.015625, "grad_norm_var": 0.06920166015625, "learning_rate": 0.0001, "loss": 3.1955, "loss/crossentropy": 2.2502759456634522, "loss/hidden": 3.053125, "loss/incoh": 0.0, "loss/logits": 0.27016896903514864, "loss/reg": 0.0, "step": 11950 }, { "epoch": 0.07868421052631579, "grad_norm": 3.125, "grad_norm_var": 0.09563700358072917, "learning_rate": 0.0001, "loss": 3.2128, "loss/crossentropy": 2.1190481543540955, "loss/hidden": 3.1734375, "loss/incoh": 0.0, "loss/logits": 0.288416750729084, "loss/reg": 0.0, "step": 11960 }, { "epoch": 0.07875, "grad_norm": 2.203125, "grad_norm_var": 0.15563863118489582, "learning_rate": 0.0001, "loss": 3.2263, "loss/crossentropy": 1.9541548937559128, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.271015003323555, "loss/reg": 0.0, "step": 11970 }, { "epoch": 0.07881578947368421, "grad_norm": 3.015625, "grad_norm_var": 0.09900614420572916, "learning_rate": 0.0001, "loss": 3.0889, "loss/crossentropy": 2.3341493129730226, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.2499636933207512, "loss/reg": 0.0, "step": 11980 }, { "epoch": 0.07888157894736843, "grad_norm": 2.46875, "grad_norm_var": 0.07329813639322917, "learning_rate": 0.0001, "loss": 3.1932, "loss/crossentropy": 2.388286221027374, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.24724568724632262, "loss/reg": 0.0, "step": 11990 }, { "epoch": 0.07894736842105263, "grad_norm": 2.390625, "grad_norm_var": 0.05429280598958333, "learning_rate": 0.0001, "loss": 3.1876, "loss/crossentropy": 2.351203644275665, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.26266307979822157, "loss/reg": 0.0, "step": 12000 }, { "epoch": 0.07901315789473684, "grad_norm": 2.234375, "grad_norm_var": 0.04205322265625, "learning_rate": 0.0001, "loss": 3.1379, "loss/crossentropy": 2.3741995811462404, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.24828371405601501, "loss/reg": 0.0, "step": 12010 }, { "epoch": 0.07907894736842105, "grad_norm": 2.640625, "grad_norm_var": 0.13201395670572916, "learning_rate": 0.0001, "loss": 3.2741, "loss/crossentropy": 2.037536895275116, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.24230389446020126, "loss/reg": 0.0, "step": 12020 }, { "epoch": 0.07914473684210527, "grad_norm": 2.40625, "grad_norm_var": 0.04157613118489583, "learning_rate": 0.0001, "loss": 3.1818, "loss/crossentropy": 2.2516727566719057, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2339026600122452, "loss/reg": 0.0, "step": 12030 }, { "epoch": 0.07921052631578947, "grad_norm": 2.3125, "grad_norm_var": 26.919873046875, "learning_rate": 0.0001, "loss": 3.2787, "loss/crossentropy": 2.3474916219711304, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.2710026606917381, "loss/reg": 0.0, "step": 12040 }, { "epoch": 0.07927631578947368, "grad_norm": 2.453125, "grad_norm_var": 26.796516927083335, "learning_rate": 0.0001, "loss": 3.254, "loss/crossentropy": 2.153431460261345, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.23484777919948102, "loss/reg": 0.0, "step": 12050 }, { "epoch": 0.0793421052631579, "grad_norm": 2.71875, "grad_norm_var": 0.0520416259765625, "learning_rate": 0.0001, "loss": 3.2866, "loss/crossentropy": 2.137704038619995, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.2761687204241753, "loss/reg": 0.0, "step": 12060 }, { "epoch": 0.07940789473684211, "grad_norm": 2.4375, "grad_norm_var": 0.05562515258789062, "learning_rate": 0.0001, "loss": 3.1658, "loss/crossentropy": 2.2096517443656922, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.22131142765283585, "loss/reg": 0.0, "step": 12070 }, { "epoch": 0.07947368421052632, "grad_norm": 2.578125, "grad_norm_var": 0.033300526936848956, "learning_rate": 0.0001, "loss": 3.1878, "loss/crossentropy": 2.486378014087677, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.288157556951046, "loss/reg": 0.0, "step": 12080 }, { "epoch": 0.07953947368421052, "grad_norm": 2.359375, "grad_norm_var": 0.18038736979166667, "learning_rate": 0.0001, "loss": 3.3441, "loss/crossentropy": 2.1570433020591735, "loss/hidden": 3.3609375, "loss/incoh": 0.0, "loss/logits": 0.33738467693328855, "loss/reg": 0.0, "step": 12090 }, { "epoch": 0.07960526315789473, "grad_norm": 2.234375, "grad_norm_var": 0.14531962076822916, "learning_rate": 0.0001, "loss": 3.2092, "loss/crossentropy": 2.11829297542572, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.29526630192995074, "loss/reg": 0.0, "step": 12100 }, { "epoch": 0.07967105263157895, "grad_norm": 2.328125, "grad_norm_var": 0.11277669270833333, "learning_rate": 0.0001, "loss": 3.2437, "loss/crossentropy": 2.151422083377838, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.31264509409666064, "loss/reg": 0.0, "step": 12110 }, { "epoch": 0.07973684210526316, "grad_norm": 3.046875, "grad_norm_var": 0.09020182291666666, "learning_rate": 0.0001, "loss": 3.1802, "loss/crossentropy": 2.190220355987549, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.26489182114601134, "loss/reg": 0.0, "step": 12120 }, { "epoch": 0.07980263157894738, "grad_norm": 2.359375, "grad_norm_var": 0.10075581868489583, "learning_rate": 0.0001, "loss": 3.2733, "loss/crossentropy": 2.4329964399337767, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.29891559183597566, "loss/reg": 0.0, "step": 12130 }, { "epoch": 0.07986842105263157, "grad_norm": 2.640625, "grad_norm_var": 0.08171284993489583, "learning_rate": 0.0001, "loss": 3.2143, "loss/crossentropy": 2.3487884759902955, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.2699664428830147, "loss/reg": 0.0, "step": 12140 }, { "epoch": 0.07993421052631579, "grad_norm": 2.1875, "grad_norm_var": 0.11846415201822917, "learning_rate": 0.0001, "loss": 3.1594, "loss/crossentropy": 2.3031589150428773, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.2605236619710922, "loss/reg": 0.0, "step": 12150 }, { "epoch": 0.08, "grad_norm": 3.015625, "grad_norm_var": 0.17751363118489583, "learning_rate": 0.0001, "loss": 3.3283, "loss/crossentropy": 2.4235578894615175, "loss/hidden": 3.265625, "loss/incoh": 0.0, "loss/logits": 0.40606142282485963, "loss/reg": 0.0, "step": 12160 }, { "epoch": 0.08006578947368422, "grad_norm": 2.46875, "grad_norm_var": 0.18465067545572916, "learning_rate": 0.0001, "loss": 3.2069, "loss/crossentropy": 2.3678341031074526, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.26937836706638335, "loss/reg": 0.0, "step": 12170 }, { "epoch": 0.08013157894736841, "grad_norm": 2.4375, "grad_norm_var": 0.11225484212239584, "learning_rate": 0.0001, "loss": 3.1606, "loss/crossentropy": 2.606902313232422, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2720926284790039, "loss/reg": 0.0, "step": 12180 }, { "epoch": 0.08019736842105263, "grad_norm": 2.140625, "grad_norm_var": 0.03882548014322917, "learning_rate": 0.0001, "loss": 3.2091, "loss/crossentropy": 2.389057195186615, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.2834290415048599, "loss/reg": 0.0, "step": 12190 }, { "epoch": 0.08026315789473684, "grad_norm": 3.328125, "grad_norm_var": 0.082470703125, "learning_rate": 0.0001, "loss": 3.1846, "loss/crossentropy": 2.1885082483291627, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.250583179295063, "loss/reg": 0.0, "step": 12200 }, { "epoch": 0.08032894736842106, "grad_norm": 2.328125, "grad_norm_var": 0.14348551432291667, "learning_rate": 0.0001, "loss": 3.2234, "loss/crossentropy": 2.2288808941841127, "loss/hidden": 3.159375, "loss/incoh": 0.0, "loss/logits": 0.2857954427599907, "loss/reg": 0.0, "step": 12210 }, { "epoch": 0.08039473684210527, "grad_norm": 2.578125, "grad_norm_var": 0.05944010416666667, "learning_rate": 0.0001, "loss": 3.1535, "loss/crossentropy": 2.3943295001983644, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.27064385265111923, "loss/reg": 0.0, "step": 12220 }, { "epoch": 0.08046052631578947, "grad_norm": 2.921875, "grad_norm_var": 0.051488240559895836, "learning_rate": 0.0001, "loss": 3.2165, "loss/crossentropy": 2.36237952709198, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.29628041982650755, "loss/reg": 0.0, "step": 12230 }, { "epoch": 0.08052631578947368, "grad_norm": 2.390625, "grad_norm_var": 0.06207682291666667, "learning_rate": 0.0001, "loss": 3.2112, "loss/crossentropy": 2.26582453250885, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.3621983379125595, "loss/reg": 0.0, "step": 12240 }, { "epoch": 0.0805921052631579, "grad_norm": 2.234375, "grad_norm_var": 0.08144124348958333, "learning_rate": 0.0001, "loss": 3.2711, "loss/crossentropy": 2.4002971291542052, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.29988196343183515, "loss/reg": 0.0, "step": 12250 }, { "epoch": 0.08065789473684211, "grad_norm": 2.78125, "grad_norm_var": 3.1181549072265624, "learning_rate": 0.0001, "loss": 3.2486, "loss/crossentropy": 2.26135613322258, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.24128984957933425, "loss/reg": 0.0, "step": 12260 }, { "epoch": 0.08072368421052632, "grad_norm": 2.34375, "grad_norm_var": 4.51636962890625, "learning_rate": 0.0001, "loss": 3.5227, "loss/crossentropy": 2.2277899503707888, "loss/hidden": 3.453125, "loss/incoh": 0.0, "loss/logits": 0.35484138429164885, "loss/reg": 0.0, "step": 12270 }, { "epoch": 0.08078947368421052, "grad_norm": 2.703125, "grad_norm_var": 2.06083984375, "learning_rate": 0.0001, "loss": 3.2936, "loss/crossentropy": 2.313612127304077, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2666293315589428, "loss/reg": 0.0, "step": 12280 }, { "epoch": 0.08085526315789474, "grad_norm": 4.625, "grad_norm_var": 6.363402303059896, "learning_rate": 0.0001, "loss": 3.2228, "loss/crossentropy": 2.3592599511146544, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.25869676619768145, "loss/reg": 0.0, "step": 12290 }, { "epoch": 0.08092105263157895, "grad_norm": 2.421875, "grad_norm_var": 0.3772532145182292, "learning_rate": 0.0001, "loss": 3.2186, "loss/crossentropy": 2.50057338476181, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.2711464300751686, "loss/reg": 0.0, "step": 12300 }, { "epoch": 0.08098684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.07467447916666667, "learning_rate": 0.0001, "loss": 3.2317, "loss/crossentropy": 2.381076216697693, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.24998296648263932, "loss/reg": 0.0, "step": 12310 }, { "epoch": 0.08105263157894736, "grad_norm": 2.53125, "grad_norm_var": 1.58385009765625, "learning_rate": 0.0001, "loss": 3.3666, "loss/crossentropy": 2.2987404227256776, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.24597887545824051, "loss/reg": 0.0, "step": 12320 }, { "epoch": 0.08111842105263158, "grad_norm": 2.421875, "grad_norm_var": 3.158852918199495e+17, "learning_rate": 0.0001, "loss": 3.3716, "loss/crossentropy": 2.415051448345184, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.29122400283813477, "loss/reg": 0.0, "step": 12330 }, { "epoch": 0.08118421052631579, "grad_norm": 2.734375, "grad_norm_var": 3.158852918454168e+17, "learning_rate": 0.0001, "loss": 3.2774, "loss/crossentropy": 2.082320672273636, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2748811081051826, "loss/reg": 0.0, "step": 12340 }, { "epoch": 0.08125, "grad_norm": 3.46875, "grad_norm_var": 0.26638997395833336, "learning_rate": 0.0001, "loss": 3.2315, "loss/crossentropy": 2.7044607162475587, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.32181375473737717, "loss/reg": 0.0, "step": 12350 }, { "epoch": 0.08131578947368422, "grad_norm": 2.765625, "grad_norm_var": 0.22693684895833333, "learning_rate": 0.0001, "loss": 3.2351, "loss/crossentropy": 2.7221840620040894, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.26635742783546446, "loss/reg": 0.0, "step": 12360 }, { "epoch": 0.08138157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.26437174479166664, "learning_rate": 0.0001, "loss": 3.2068, "loss/crossentropy": 2.3512622594833372, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2477712720632553, "loss/reg": 0.0, "step": 12370 }, { "epoch": 0.08144736842105263, "grad_norm": 2.125, "grad_norm_var": 0.3268707275390625, "learning_rate": 0.0001, "loss": 3.1974, "loss/crossentropy": 2.210025131702423, "loss/hidden": 3.05625, "loss/incoh": 0.0, "loss/logits": 0.36077398508787156, "loss/reg": 0.0, "step": 12380 }, { "epoch": 0.08151315789473684, "grad_norm": 2.546875, "grad_norm_var": 0.5682525634765625, "learning_rate": 0.0001, "loss": 3.2079, "loss/crossentropy": 2.0402897000312805, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.26192123591899874, "loss/reg": 0.0, "step": 12390 }, { "epoch": 0.08157894736842106, "grad_norm": 2.65625, "grad_norm_var": 0.3712565104166667, "learning_rate": 0.0001, "loss": 3.2107, "loss/crossentropy": 1.9560218453407288, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.21844983994960784, "loss/reg": 0.0, "step": 12400 }, { "epoch": 0.08164473684210527, "grad_norm": 3.171875, "grad_norm_var": 0.4505859375, "learning_rate": 0.0001, "loss": 3.2759, "loss/crossentropy": 2.3604748249053955, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.2524840489029884, "loss/reg": 0.0, "step": 12410 }, { "epoch": 0.08171052631578947, "grad_norm": 2.484375, "grad_norm_var": 0.24068603515625, "learning_rate": 0.0001, "loss": 3.2728, "loss/crossentropy": 2.517817199230194, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.29347013384103776, "loss/reg": 0.0, "step": 12420 }, { "epoch": 0.08177631578947368, "grad_norm": 2.40625, "grad_norm_var": 0.12271728515625, "learning_rate": 0.0001, "loss": 3.2551, "loss/crossentropy": 2.4665472149848937, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2709146931767464, "loss/reg": 0.0, "step": 12430 }, { "epoch": 0.0818421052631579, "grad_norm": 2.265625, "grad_norm_var": 0.2295074462890625, "learning_rate": 0.0001, "loss": 3.2219, "loss/crossentropy": 2.40498046875, "loss/hidden": 3.1, "loss/incoh": 0.0, "loss/logits": 0.32999152690172195, "loss/reg": 0.0, "step": 12440 }, { "epoch": 0.08190789473684211, "grad_norm": 2.421875, "grad_norm_var": 0.1619049072265625, "learning_rate": 0.0001, "loss": 3.0867, "loss/crossentropy": 2.3188422203063963, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.29690912514925005, "loss/reg": 0.0, "step": 12450 }, { "epoch": 0.08197368421052631, "grad_norm": 2.09375, "grad_norm_var": 0.07049051920572917, "learning_rate": 0.0001, "loss": 3.2682, "loss/crossentropy": 2.588274967670441, "loss/hidden": 3.4046875, "loss/incoh": 0.0, "loss/logits": 0.39090928733348845, "loss/reg": 0.0, "step": 12460 }, { "epoch": 0.08203947368421052, "grad_norm": 3.921875, "grad_norm_var": 0.2250261942545573, "learning_rate": 0.0001, "loss": 3.2503, "loss/crossentropy": 2.6322230458259583, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.2629552409052849, "loss/reg": 0.0, "step": 12470 }, { "epoch": 0.08210526315789474, "grad_norm": 2.203125, "grad_norm_var": 0.18522109985351562, "learning_rate": 0.0001, "loss": 3.1313, "loss/crossentropy": 2.1257114171981812, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23226768374443055, "loss/reg": 0.0, "step": 12480 }, { "epoch": 0.08217105263157895, "grad_norm": 2.25, "grad_norm_var": 0.16383056640625, "learning_rate": 0.0001, "loss": 3.2008, "loss/crossentropy": 1.9993813276290893, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.21686773076653482, "loss/reg": 0.0, "step": 12490 }, { "epoch": 0.08223684210526316, "grad_norm": 2.234375, "grad_norm_var": 0.12244364420572916, "learning_rate": 0.0001, "loss": 3.1643, "loss/crossentropy": 2.4217318654060365, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.2706719309091568, "loss/reg": 0.0, "step": 12500 }, { "epoch": 0.08230263157894736, "grad_norm": 3.03125, "grad_norm_var": 0.9795888264973959, "learning_rate": 0.0001, "loss": 3.2763, "loss/crossentropy": 2.131495940685272, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.24229931831359863, "loss/reg": 0.0, "step": 12510 }, { "epoch": 0.08236842105263158, "grad_norm": 2.171875, "grad_norm_var": 1.021240234375, "learning_rate": 0.0001, "loss": 3.1894, "loss/crossentropy": 2.2545747995376586, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.26802987307310105, "loss/reg": 0.0, "step": 12520 }, { "epoch": 0.08243421052631579, "grad_norm": 2.234375, "grad_norm_var": 2.0329969940865024e+17, "learning_rate": 0.0001, "loss": 3.364, "loss/crossentropy": 2.37786750793457, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.27177259773015977, "loss/reg": 0.0, "step": 12530 }, { "epoch": 0.0825, "grad_norm": 2.46875, "grad_norm_var": 0.0586822509765625, "learning_rate": 0.0001, "loss": 3.1425, "loss/crossentropy": 2.3742210388183596, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.25972897857427596, "loss/reg": 0.0, "step": 12540 }, { "epoch": 0.08256578947368422, "grad_norm": 2.21875, "grad_norm_var": 0.16419270833333333, "learning_rate": 0.0001, "loss": 3.1808, "loss/crossentropy": 2.2249507308006287, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.260022896528244, "loss/reg": 0.0, "step": 12550 }, { "epoch": 0.08263157894736842, "grad_norm": 3.765625, "grad_norm_var": 0.1635406494140625, "learning_rate": 0.0001, "loss": 3.2244, "loss/crossentropy": 2.417391860485077, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.24499612003564836, "loss/reg": 0.0, "step": 12560 }, { "epoch": 0.08269736842105263, "grad_norm": 2.328125, "grad_norm_var": 0.24940999348958334, "learning_rate": 0.0001, "loss": 3.259, "loss/crossentropy": 2.008757221698761, "loss/hidden": 3.371875, "loss/incoh": 0.0, "loss/logits": 0.28915109634399416, "loss/reg": 0.0, "step": 12570 }, { "epoch": 0.08276315789473684, "grad_norm": 2.4375, "grad_norm_var": 0.030516560872395834, "learning_rate": 0.0001, "loss": 3.0918, "loss/crossentropy": 2.3331239223480225, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.27426364421844485, "loss/reg": 0.0, "step": 12580 }, { "epoch": 0.08282894736842106, "grad_norm": 2.3125, "grad_norm_var": 0.4984527587890625, "learning_rate": 0.0001, "loss": 3.2675, "loss/crossentropy": 2.3463852405548096, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.2746781826019287, "loss/reg": 0.0, "step": 12590 }, { "epoch": 0.08289473684210526, "grad_norm": 2.09375, "grad_norm_var": 0.026822916666666665, "learning_rate": 0.0001, "loss": 3.0942, "loss/crossentropy": 2.1946144729852675, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.273574560880661, "loss/reg": 0.0, "step": 12600 }, { "epoch": 0.08296052631578947, "grad_norm": 2.75, "grad_norm_var": 0.399658203125, "learning_rate": 0.0001, "loss": 3.3113, "loss/crossentropy": 2.292886030673981, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.3399674043059349, "loss/reg": 0.0, "step": 12610 }, { "epoch": 0.08302631578947368, "grad_norm": 2.703125, "grad_norm_var": 0.306982421875, "learning_rate": 0.0001, "loss": 3.3316, "loss/crossentropy": 2.2384460091590883, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2688921958208084, "loss/reg": 0.0, "step": 12620 }, { "epoch": 0.0830921052631579, "grad_norm": 2.296875, "grad_norm_var": 0.04449462890625, "learning_rate": 0.0001, "loss": 3.1944, "loss/crossentropy": 2.2317716479301453, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.23216352015733718, "loss/reg": 0.0, "step": 12630 }, { "epoch": 0.08315789473684211, "grad_norm": 2.203125, "grad_norm_var": 0.05810139973958333, "learning_rate": 0.0001, "loss": 3.1734, "loss/crossentropy": 2.513770651817322, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.26424300968647, "loss/reg": 0.0, "step": 12640 }, { "epoch": 0.08322368421052631, "grad_norm": 2.765625, "grad_norm_var": 0.060302734375, "learning_rate": 0.0001, "loss": 3.212, "loss/crossentropy": 2.3643002271652223, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.2974396377801895, "loss/reg": 0.0, "step": 12650 }, { "epoch": 0.08328947368421052, "grad_norm": 2.765625, "grad_norm_var": 0.11692301432291667, "learning_rate": 0.0001, "loss": 3.2013, "loss/crossentropy": 2.200558376312256, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.26023727655410767, "loss/reg": 0.0, "step": 12660 }, { "epoch": 0.08335526315789474, "grad_norm": 2.34375, "grad_norm_var": 0.0976470947265625, "learning_rate": 0.0001, "loss": 3.2425, "loss/crossentropy": 2.3456878662109375, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.30842293947935107, "loss/reg": 0.0, "step": 12670 }, { "epoch": 0.08342105263157895, "grad_norm": 2.4375, "grad_norm_var": 0.05621744791666667, "learning_rate": 0.0001, "loss": 3.1823, "loss/crossentropy": 2.2649134039878844, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.24366314560174943, "loss/reg": 0.0, "step": 12680 }, { "epoch": 0.08348684210526315, "grad_norm": 2.328125, "grad_norm_var": 0.059891764322916666, "learning_rate": 0.0001, "loss": 3.1859, "loss/crossentropy": 2.0741775274276733, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.24329135864973067, "loss/reg": 0.0, "step": 12690 }, { "epoch": 0.08355263157894736, "grad_norm": 2.40625, "grad_norm_var": 0.13880208333333333, "learning_rate": 0.0001, "loss": 3.1551, "loss/crossentropy": 2.4719223856925963, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.24188547879457473, "loss/reg": 0.0, "step": 12700 }, { "epoch": 0.08361842105263158, "grad_norm": 2.8125, "grad_norm_var": 0.32080078125, "learning_rate": 0.0001, "loss": 3.3068, "loss/crossentropy": 2.279301416873932, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.2998178914189339, "loss/reg": 0.0, "step": 12710 }, { "epoch": 0.08368421052631579, "grad_norm": 2.65625, "grad_norm_var": 0.341064453125, "learning_rate": 0.0001, "loss": 3.2089, "loss/crossentropy": 2.4134485125541687, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.28444976508617403, "loss/reg": 0.0, "step": 12720 }, { "epoch": 0.08375, "grad_norm": 2.40625, "grad_norm_var": 0.13637593587239583, "learning_rate": 0.0001, "loss": 3.2345, "loss/crossentropy": 2.4403869032859804, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.26378336995840074, "loss/reg": 0.0, "step": 12730 }, { "epoch": 0.0838157894736842, "grad_norm": 2.34375, "grad_norm_var": 0.0974273681640625, "learning_rate": 0.0001, "loss": 3.1501, "loss/crossentropy": 2.1977667093276976, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.23563524186611176, "loss/reg": 0.0, "step": 12740 }, { "epoch": 0.08388157894736842, "grad_norm": 3.203125, "grad_norm_var": 0.12566731770833334, "learning_rate": 0.0001, "loss": 3.2516, "loss/crossentropy": 2.206720507144928, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.2731472015380859, "loss/reg": 0.0, "step": 12750 }, { "epoch": 0.08394736842105263, "grad_norm": 3.859375, "grad_norm_var": 0.21768290201822918, "learning_rate": 0.0001, "loss": 3.2035, "loss/crossentropy": 2.3951833486557006, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.26070789247751236, "loss/reg": 0.0, "step": 12760 }, { "epoch": 0.08401315789473685, "grad_norm": 2.140625, "grad_norm_var": 0.36861572265625, "learning_rate": 0.0001, "loss": 3.3054, "loss/crossentropy": 2.393438732624054, "loss/hidden": 3.2328125, "loss/incoh": 0.0, "loss/logits": 0.3618380635976791, "loss/reg": 0.0, "step": 12770 }, { "epoch": 0.08407894736842106, "grad_norm": 2.28125, "grad_norm_var": 0.24163004557291667, "learning_rate": 0.0001, "loss": 3.2972, "loss/crossentropy": 2.510524129867554, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.26494802087545394, "loss/reg": 0.0, "step": 12780 }, { "epoch": 0.08414473684210526, "grad_norm": 2.640625, "grad_norm_var": 0.07789713541666667, "learning_rate": 0.0001, "loss": 3.2375, "loss/crossentropy": 2.326508915424347, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.25067940801382066, "loss/reg": 0.0, "step": 12790 }, { "epoch": 0.08421052631578947, "grad_norm": 2.625, "grad_norm_var": 0.0654693603515625, "learning_rate": 0.0001, "loss": 3.1664, "loss/crossentropy": 2.2524615049362184, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.23585905730724335, "loss/reg": 0.0, "step": 12800 }, { "epoch": 0.08427631578947369, "grad_norm": 2.296875, "grad_norm_var": 0.05461324055989583, "learning_rate": 0.0001, "loss": 3.2086, "loss/crossentropy": 2.586345672607422, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.2775671869516373, "loss/reg": 0.0, "step": 12810 }, { "epoch": 0.0843421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.157568359375, "learning_rate": 0.0001, "loss": 3.2467, "loss/crossentropy": 2.4124717354774474, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.27734949439764023, "loss/reg": 0.0, "step": 12820 }, { "epoch": 0.0844078947368421, "grad_norm": 2.34375, "grad_norm_var": 0.151953125, "learning_rate": 0.0001, "loss": 3.1838, "loss/crossentropy": 2.217060422897339, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.31069841980934143, "loss/reg": 0.0, "step": 12830 }, { "epoch": 0.08447368421052631, "grad_norm": 2.46875, "grad_norm_var": 0.15916341145833332, "learning_rate": 0.0001, "loss": 3.251, "loss/crossentropy": 2.2915706515312193, "loss/hidden": 3.128125, "loss/incoh": 0.0, "loss/logits": 0.28919376283884046, "loss/reg": 0.0, "step": 12840 }, { "epoch": 0.08453947368421053, "grad_norm": 2.8125, "grad_norm_var": 0.05465494791666667, "learning_rate": 0.0001, "loss": 3.1509, "loss/crossentropy": 2.3436198830604553, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2566779345273972, "loss/reg": 0.0, "step": 12850 }, { "epoch": 0.08460526315789474, "grad_norm": 2.796875, "grad_norm_var": 0.12815653483072917, "learning_rate": 0.0001, "loss": 3.1553, "loss/crossentropy": 2.6007506489753722, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.27069382518529894, "loss/reg": 0.0, "step": 12860 }, { "epoch": 0.08467105263157895, "grad_norm": 3.125, "grad_norm_var": 0.09325764973958334, "learning_rate": 0.0001, "loss": 3.1848, "loss/crossentropy": 2.3489827513694763, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.2542911395430565, "loss/reg": 0.0, "step": 12870 }, { "epoch": 0.08473684210526315, "grad_norm": 2.359375, "grad_norm_var": 0.2053375244140625, "learning_rate": 0.0001, "loss": 3.2528, "loss/crossentropy": 2.3320749402046204, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.25249498784542085, "loss/reg": 0.0, "step": 12880 }, { "epoch": 0.08480263157894737, "grad_norm": 2.375, "grad_norm_var": 0.22139383951822916, "learning_rate": 0.0001, "loss": 3.2744, "loss/crossentropy": 2.5565970659255983, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.30419613122940065, "loss/reg": 0.0, "step": 12890 }, { "epoch": 0.08486842105263158, "grad_norm": 2.390625, "grad_norm_var": 0.12555338541666666, "learning_rate": 0.0001, "loss": 3.2548, "loss/crossentropy": 2.382706320285797, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.28827311396598815, "loss/reg": 0.0, "step": 12900 }, { "epoch": 0.08493421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.031119791666666667, "learning_rate": 0.0001, "loss": 3.1646, "loss/crossentropy": 2.209449625015259, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.27490794360637666, "loss/reg": 0.0, "step": 12910 }, { "epoch": 0.085, "grad_norm": 2.46875, "grad_norm_var": 0.075537109375, "learning_rate": 0.0001, "loss": 3.1151, "loss/crossentropy": 2.42991498708725, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.24879284277558328, "loss/reg": 0.0, "step": 12920 }, { "epoch": 0.0850657894736842, "grad_norm": 2.21875, "grad_norm_var": 0.06396865844726562, "learning_rate": 0.0001, "loss": 3.1176, "loss/crossentropy": 2.124844658374786, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.26795649230480195, "loss/reg": 0.0, "step": 12930 }, { "epoch": 0.08513157894736842, "grad_norm": 3.046875, "grad_norm_var": 0.0884844462076823, "learning_rate": 0.0001, "loss": 3.1416, "loss/crossentropy": 2.390605056285858, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2603973612189293, "loss/reg": 0.0, "step": 12940 }, { "epoch": 0.08519736842105263, "grad_norm": 2.15625, "grad_norm_var": 0.11295572916666667, "learning_rate": 0.0001, "loss": 3.1506, "loss/crossentropy": 2.3307228684425354, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.23968299478292465, "loss/reg": 0.0, "step": 12950 }, { "epoch": 0.08526315789473685, "grad_norm": 2.375, "grad_norm_var": 0.07505594889322917, "learning_rate": 0.0001, "loss": 3.1409, "loss/crossentropy": 2.32118815779686, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.27437605410814286, "loss/reg": 0.0, "step": 12960 }, { "epoch": 0.08532894736842105, "grad_norm": 2.8125, "grad_norm_var": 0.05011393229166667, "learning_rate": 0.0001, "loss": 3.1943, "loss/crossentropy": 2.1953859329223633, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.2964278385043144, "loss/reg": 0.0, "step": 12970 }, { "epoch": 0.08539473684210526, "grad_norm": 2.265625, "grad_norm_var": 0.044733683268229164, "learning_rate": 0.0001, "loss": 3.1596, "loss/crossentropy": 2.48675742149353, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.2866469621658325, "loss/reg": 0.0, "step": 12980 }, { "epoch": 0.08546052631578947, "grad_norm": 2.890625, "grad_norm_var": 0.04439188639322917, "learning_rate": 0.0001, "loss": 3.1955, "loss/crossentropy": 2.3276284098625184, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.254511134326458, "loss/reg": 0.0, "step": 12990 }, { "epoch": 0.08552631578947369, "grad_norm": 2.125, "grad_norm_var": 0.18092041015625, "learning_rate": 0.0001, "loss": 3.3308, "loss/crossentropy": 2.436275231838226, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.27356108725070954, "loss/reg": 0.0, "step": 13000 }, { "epoch": 0.0855921052631579, "grad_norm": 2.25, "grad_norm_var": 0.17511393229166666, "learning_rate": 0.0001, "loss": 3.1302, "loss/crossentropy": 2.2856626510620117, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.24442512094974517, "loss/reg": 0.0, "step": 13010 }, { "epoch": 0.0856578947368421, "grad_norm": 2.53125, "grad_norm_var": 0.10161844889322917, "learning_rate": 0.0001, "loss": 3.1923, "loss/crossentropy": 2.4494728326797484, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.2575364217162132, "loss/reg": 0.0, "step": 13020 }, { "epoch": 0.08572368421052631, "grad_norm": 2.34375, "grad_norm_var": 0.09909566243489583, "learning_rate": 0.0001, "loss": 3.1732, "loss/crossentropy": 2.4833264112472535, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2727135464549065, "loss/reg": 0.0, "step": 13030 }, { "epoch": 0.08578947368421053, "grad_norm": 3.203125, "grad_norm_var": 0.12711588541666666, "learning_rate": 0.0001, "loss": 3.165, "loss/crossentropy": 2.284275805950165, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.29309146106243134, "loss/reg": 0.0, "step": 13040 }, { "epoch": 0.08585526315789474, "grad_norm": 2.28125, "grad_norm_var": 0.18906148274739584, "learning_rate": 0.0001, "loss": 3.1988, "loss/crossentropy": 2.1231042385101317, "loss/hidden": 3.0375, "loss/incoh": 0.0, "loss/logits": 0.246621835231781, "loss/reg": 0.0, "step": 13050 }, { "epoch": 0.08592105263157895, "grad_norm": 2.234375, "grad_norm_var": 0.14317118326822917, "learning_rate": 0.0001, "loss": 3.2204, "loss/crossentropy": 2.115370142459869, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.22535212635993956, "loss/reg": 0.0, "step": 13060 }, { "epoch": 0.08598684210526315, "grad_norm": 2.234375, "grad_norm_var": 0.17832743326822917, "learning_rate": 0.0001, "loss": 3.1988, "loss/crossentropy": 2.271882343292236, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.2518279105424881, "loss/reg": 0.0, "step": 13070 }, { "epoch": 0.08605263157894737, "grad_norm": 1.90625, "grad_norm_var": 0.1100250244140625, "learning_rate": 0.0001, "loss": 3.2672, "loss/crossentropy": 2.211618059873581, "loss/hidden": 3.015625, "loss/incoh": 0.0, "loss/logits": 0.2709794193506241, "loss/reg": 0.0, "step": 13080 }, { "epoch": 0.08611842105263158, "grad_norm": 2.15625, "grad_norm_var": 0.06676432291666666, "learning_rate": 0.0001, "loss": 3.2393, "loss/crossentropy": 2.4678762197494506, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.2634593158960342, "loss/reg": 0.0, "step": 13090 }, { "epoch": 0.0861842105263158, "grad_norm": 2.484375, "grad_norm_var": 0.030402628580729167, "learning_rate": 0.0001, "loss": 3.19, "loss/crossentropy": 2.5426036715507507, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.26041875034570694, "loss/reg": 0.0, "step": 13100 }, { "epoch": 0.08625, "grad_norm": 2.34375, "grad_norm_var": 0.028034464518229166, "learning_rate": 0.0001, "loss": 3.1556, "loss/crossentropy": 2.1748136937618257, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.3230336934328079, "loss/reg": 0.0, "step": 13110 }, { "epoch": 0.0863157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.3787750244140625, "learning_rate": 0.0001, "loss": 3.2653, "loss/crossentropy": 2.352058470249176, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.3100366100668907, "loss/reg": 0.0, "step": 13120 }, { "epoch": 0.08638157894736842, "grad_norm": 2.65625, "grad_norm_var": 0.12065327962239583, "learning_rate": 0.0001, "loss": 3.2897, "loss/crossentropy": 2.2920926332473757, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.24493622779846191, "loss/reg": 0.0, "step": 13130 }, { "epoch": 0.08644736842105263, "grad_norm": 3.140625, "grad_norm_var": 0.17266337076822916, "learning_rate": 0.0001, "loss": 3.2541, "loss/crossentropy": 2.253483748435974, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.24110897034406661, "loss/reg": 0.0, "step": 13140 }, { "epoch": 0.08651315789473685, "grad_norm": 3.140625, "grad_norm_var": 0.15883687337239583, "learning_rate": 0.0001, "loss": 3.2161, "loss/crossentropy": 2.490678381919861, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.2905942976474762, "loss/reg": 0.0, "step": 13150 }, { "epoch": 0.08657894736842105, "grad_norm": 2.671875, "grad_norm_var": 0.124365234375, "learning_rate": 0.0001, "loss": 3.2037, "loss/crossentropy": 2.607399010658264, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.3426252081990242, "loss/reg": 0.0, "step": 13160 }, { "epoch": 0.08664473684210526, "grad_norm": 2.578125, "grad_norm_var": 0.20832697550455728, "learning_rate": 0.0001, "loss": 3.3389, "loss/crossentropy": 2.5389102935791015, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.27587536424398423, "loss/reg": 0.0, "step": 13170 }, { "epoch": 0.08671052631578947, "grad_norm": 2.171875, "grad_norm_var": 3.749950368844328e+17, "learning_rate": 0.0001, "loss": 3.3435, "loss/crossentropy": 2.2043145060539246, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.27380194514989853, "loss/reg": 0.0, "step": 13180 }, { "epoch": 0.08677631578947369, "grad_norm": 3.203125, "grad_norm_var": 0.10250651041666667, "learning_rate": 0.0001, "loss": 3.2179, "loss/crossentropy": 2.5562551975250245, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.35649020969867706, "loss/reg": 0.0, "step": 13190 }, { "epoch": 0.0868421052631579, "grad_norm": 2.046875, "grad_norm_var": 0.11367899576822917, "learning_rate": 0.0001, "loss": 3.1639, "loss/crossentropy": 2.2194557189941406, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.2623781323432922, "loss/reg": 0.0, "step": 13200 }, { "epoch": 0.0869078947368421, "grad_norm": 2.59375, "grad_norm_var": 0.09927978515625, "learning_rate": 0.0001, "loss": 3.203, "loss/crossentropy": 2.3938650250434876, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.22924545854330064, "loss/reg": 0.0, "step": 13210 }, { "epoch": 0.08697368421052631, "grad_norm": 2.328125, "grad_norm_var": 0.07858072916666667, "learning_rate": 0.0001, "loss": 3.1885, "loss/crossentropy": 2.659112477302551, "loss/hidden": 3.2625, "loss/incoh": 0.0, "loss/logits": 0.3096113160252571, "loss/reg": 0.0, "step": 13220 }, { "epoch": 0.08703947368421053, "grad_norm": 2.140625, "grad_norm_var": 0.08105061848958334, "learning_rate": 0.0001, "loss": 3.1608, "loss/crossentropy": 2.2919702410697935, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.28502654284238815, "loss/reg": 0.0, "step": 13230 }, { "epoch": 0.08710526315789474, "grad_norm": 2.421875, "grad_norm_var": 0.2513631184895833, "learning_rate": 0.0001, "loss": 3.1995, "loss/crossentropy": 2.206997013092041, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.20357893258333207, "loss/reg": 0.0, "step": 13240 }, { "epoch": 0.08717105263157894, "grad_norm": 2.1875, "grad_norm_var": 0.06531575520833334, "learning_rate": 0.0001, "loss": 3.1267, "loss/crossentropy": 2.095241755247116, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.23652465790510177, "loss/reg": 0.0, "step": 13250 }, { "epoch": 0.08723684210526315, "grad_norm": 2.328125, "grad_norm_var": 0.110009765625, "learning_rate": 0.0001, "loss": 3.1799, "loss/crossentropy": 2.1254674077033995, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.22684407681226731, "loss/reg": 0.0, "step": 13260 }, { "epoch": 0.08730263157894737, "grad_norm": 2.5625, "grad_norm_var": 3.398986257643471e+17, "learning_rate": 0.0001, "loss": 3.358, "loss/crossentropy": 2.3295519828796385, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.30277568846940994, "loss/reg": 0.0, "step": 13270 }, { "epoch": 0.08736842105263158, "grad_norm": 2.359375, "grad_norm_var": 3.3989862579380115e+17, "learning_rate": 0.0001, "loss": 3.3036, "loss/crossentropy": 2.496494376659393, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2583273336291313, "loss/reg": 0.0, "step": 13280 }, { "epoch": 0.0874342105263158, "grad_norm": 2.640625, "grad_norm_var": 0.0663238525390625, "learning_rate": 0.0001, "loss": 3.2025, "loss/crossentropy": 2.332583689689636, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.24232984483242034, "loss/reg": 0.0, "step": 13290 }, { "epoch": 0.0875, "grad_norm": 2.25, "grad_norm_var": 0.43166910807291664, "learning_rate": 0.0001, "loss": 3.1339, "loss/crossentropy": 2.394269013404846, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.24607907682657243, "loss/reg": 0.0, "step": 13300 }, { "epoch": 0.08756578947368421, "grad_norm": 2.40625, "grad_norm_var": 0.112890625, "learning_rate": 0.0001, "loss": 3.1815, "loss/crossentropy": 2.2034417927265166, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.25049934834241866, "loss/reg": 0.0, "step": 13310 }, { "epoch": 0.08763157894736842, "grad_norm": 2.6875, "grad_norm_var": 0.0748443603515625, "learning_rate": 0.0001, "loss": 3.2491, "loss/crossentropy": 2.058911919593811, "loss/hidden": 3.1359375, "loss/incoh": 0.0, "loss/logits": 0.25788910537958143, "loss/reg": 0.0, "step": 13320 }, { "epoch": 0.08769736842105263, "grad_norm": 3.046875, "grad_norm_var": 0.22857666015625, "learning_rate": 0.0001, "loss": 3.2051, "loss/crossentropy": 2.3308457016944883, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.3006305813789368, "loss/reg": 0.0, "step": 13330 }, { "epoch": 0.08776315789473685, "grad_norm": 2.46875, "grad_norm_var": 0.266796875, "learning_rate": 0.0001, "loss": 3.1792, "loss/crossentropy": 2.392533528804779, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.25690300911664965, "loss/reg": 0.0, "step": 13340 }, { "epoch": 0.08782894736842105, "grad_norm": 2.015625, "grad_norm_var": 0.24251302083333334, "learning_rate": 0.0001, "loss": 3.1342, "loss/crossentropy": 2.303742027282715, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.24467559903860092, "loss/reg": 0.0, "step": 13350 }, { "epoch": 0.08789473684210526, "grad_norm": 3.21875, "grad_norm_var": 0.26910400390625, "learning_rate": 0.0001, "loss": 3.2614, "loss/crossentropy": 2.5700215101242065, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.346772675216198, "loss/reg": 0.0, "step": 13360 }, { "epoch": 0.08796052631578948, "grad_norm": 2.234375, "grad_norm_var": 8.30354715983073, "learning_rate": 0.0001, "loss": 3.1987, "loss/crossentropy": 2.3953630328178406, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2601369693875313, "loss/reg": 0.0, "step": 13370 }, { "epoch": 0.08802631578947369, "grad_norm": 3.671875, "grad_norm_var": 3.5722076416015627, "learning_rate": 0.0001, "loss": 3.2285, "loss/crossentropy": 2.540582847595215, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.274001345038414, "loss/reg": 0.0, "step": 13380 }, { "epoch": 0.08809210526315789, "grad_norm": 2.359375, "grad_norm_var": 0.2377593994140625, "learning_rate": 0.0001, "loss": 3.1787, "loss/crossentropy": 2.3221506476402283, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.2874672919511795, "loss/reg": 0.0, "step": 13390 }, { "epoch": 0.0881578947368421, "grad_norm": 2.15625, "grad_norm_var": 1.4219309488932292, "learning_rate": 0.0001, "loss": 3.1862, "loss/crossentropy": 2.255697971582413, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.25251765847206115, "loss/reg": 0.0, "step": 13400 }, { "epoch": 0.08822368421052632, "grad_norm": 2.28125, "grad_norm_var": 0.023307291666666667, "learning_rate": 0.0001, "loss": 3.1642, "loss/crossentropy": 2.376703941822052, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2795195817947388, "loss/reg": 0.0, "step": 13410 }, { "epoch": 0.08828947368421053, "grad_norm": 2.359375, "grad_norm_var": 0.32665608723958334, "learning_rate": 0.0001, "loss": 3.1802, "loss/crossentropy": 2.390151119232178, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.255537736415863, "loss/reg": 0.0, "step": 13420 }, { "epoch": 0.08835526315789474, "grad_norm": 2.203125, "grad_norm_var": 0.33414306640625, "learning_rate": 0.0001, "loss": 3.1779, "loss/crossentropy": 2.2273300528526305, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2409852236509323, "loss/reg": 0.0, "step": 13430 }, { "epoch": 0.08842105263157894, "grad_norm": 25.75, "grad_norm_var": 34.02027587890625, "learning_rate": 0.0001, "loss": 3.1974, "loss/crossentropy": 2.281948208808899, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.2378145158290863, "loss/reg": 0.0, "step": 13440 }, { "epoch": 0.08848684210526316, "grad_norm": 2.75, "grad_norm_var": 34.235252888997394, "learning_rate": 0.0001, "loss": 3.3736, "loss/crossentropy": 2.267159104347229, "loss/hidden": 3.1296875, "loss/incoh": 0.0, "loss/logits": 0.2702139914035797, "loss/reg": 0.0, "step": 13450 }, { "epoch": 0.08855263157894737, "grad_norm": 2.703125, "grad_norm_var": 1.1851145426432292, "learning_rate": 0.0001, "loss": 3.16, "loss/crossentropy": 2.3185499548912047, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.2232184700667858, "loss/reg": 0.0, "step": 13460 }, { "epoch": 0.08861842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.07108968098958333, "learning_rate": 0.0001, "loss": 3.2029, "loss/crossentropy": 2.1752323627471926, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.3101599723100662, "loss/reg": 0.0, "step": 13470 }, { "epoch": 0.0886842105263158, "grad_norm": 2.21875, "grad_norm_var": 0.11365559895833334, "learning_rate": 0.0001, "loss": 3.2671, "loss/crossentropy": 2.255975532531738, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.30118285566568376, "loss/reg": 0.0, "step": 13480 }, { "epoch": 0.08875, "grad_norm": 2.4375, "grad_norm_var": 0.06363525390625, "learning_rate": 0.0001, "loss": 3.1907, "loss/crossentropy": 2.2357093393802643, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.23682421892881395, "loss/reg": 0.0, "step": 13490 }, { "epoch": 0.08881578947368421, "grad_norm": 2.75, "grad_norm_var": 0.05452372233072917, "learning_rate": 0.0001, "loss": 3.1246, "loss/crossentropy": 2.3712179183959963, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2336268275976181, "loss/reg": 0.0, "step": 13500 }, { "epoch": 0.08888157894736842, "grad_norm": 2.40625, "grad_norm_var": 0.13904520670572917, "learning_rate": 0.0001, "loss": 3.1879, "loss/crossentropy": 2.407870662212372, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.28650608360767366, "loss/reg": 0.0, "step": 13510 }, { "epoch": 0.08894736842105264, "grad_norm": 2.359375, "grad_norm_var": 0.009284464518229167, "learning_rate": 0.0001, "loss": 3.1881, "loss/crossentropy": 2.4734713077545165, "loss/hidden": 3.071875, "loss/incoh": 0.0, "loss/logits": 0.30961792171001434, "loss/reg": 0.0, "step": 13520 }, { "epoch": 0.08901315789473684, "grad_norm": 2.796875, "grad_norm_var": 0.03455301920572917, "learning_rate": 0.0001, "loss": 3.1521, "loss/crossentropy": 2.1405319690704347, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.25160129070281984, "loss/reg": 0.0, "step": 13530 }, { "epoch": 0.08907894736842105, "grad_norm": 2.40625, "grad_norm_var": 0.13059488932291666, "learning_rate": 0.0001, "loss": 3.2623, "loss/crossentropy": 2.5234264612197874, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.2405148908495903, "loss/reg": 0.0, "step": 13540 }, { "epoch": 0.08914473684210526, "grad_norm": 3.109375, "grad_norm_var": 0.14059244791666667, "learning_rate": 0.0001, "loss": 3.1637, "loss/crossentropy": 2.4861895561218263, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2647073075175285, "loss/reg": 0.0, "step": 13550 }, { "epoch": 0.08921052631578948, "grad_norm": 2.09375, "grad_norm_var": 0.08899332682291666, "learning_rate": 0.0001, "loss": 3.2354, "loss/crossentropy": 2.241242003440857, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.25270578265190125, "loss/reg": 0.0, "step": 13560 }, { "epoch": 0.08927631578947369, "grad_norm": 2.703125, "grad_norm_var": 0.15600484212239582, "learning_rate": 0.0001, "loss": 3.2692, "loss/crossentropy": 2.5087321639060973, "loss/hidden": 3.1078125, "loss/incoh": 0.0, "loss/logits": 0.2870207831263542, "loss/reg": 0.0, "step": 13570 }, { "epoch": 0.08934210526315789, "grad_norm": 1.9140625, "grad_norm_var": 0.09006322224934896, "learning_rate": 0.0001, "loss": 3.2167, "loss/crossentropy": 2.519565200805664, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2802609995007515, "loss/reg": 0.0, "step": 13580 }, { "epoch": 0.0894078947368421, "grad_norm": 2.859375, "grad_norm_var": 0.08097508748372396, "learning_rate": 0.0001, "loss": 3.2167, "loss/crossentropy": 2.6469456434249876, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.2738121926784515, "loss/reg": 0.0, "step": 13590 }, { "epoch": 0.08947368421052632, "grad_norm": 4.34375, "grad_norm_var": 0.28693033854166666, "learning_rate": 0.0001, "loss": 3.0727, "loss/crossentropy": 2.4280938267707826, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.24971676170825957, "loss/reg": 0.0, "step": 13600 }, { "epoch": 0.08953947368421053, "grad_norm": 2.171875, "grad_norm_var": 0.28564046223958334, "learning_rate": 0.0001, "loss": 3.2647, "loss/crossentropy": 2.331229364871979, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.25523588955402376, "loss/reg": 0.0, "step": 13610 }, { "epoch": 0.08960526315789474, "grad_norm": 2.03125, "grad_norm_var": 0.1018463134765625, "learning_rate": 0.0001, "loss": 3.141, "loss/crossentropy": 2.248876082897186, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.27915321439504626, "loss/reg": 0.0, "step": 13620 }, { "epoch": 0.08967105263157894, "grad_norm": 2.484375, "grad_norm_var": 0.05377604166666667, "learning_rate": 0.0001, "loss": 3.1282, "loss/crossentropy": 2.475773072242737, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.26457071453332903, "loss/reg": 0.0, "step": 13630 }, { "epoch": 0.08973684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.09758707682291666, "learning_rate": 0.0001, "loss": 3.1607, "loss/crossentropy": 2.3480275869369507, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.2902192771434784, "loss/reg": 0.0, "step": 13640 }, { "epoch": 0.08980263157894737, "grad_norm": 2.3125, "grad_norm_var": 0.1151519775390625, "learning_rate": 0.0001, "loss": 3.1651, "loss/crossentropy": 2.362102711200714, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.24841166138648987, "loss/reg": 0.0, "step": 13650 }, { "epoch": 0.08986842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.0971832275390625, "learning_rate": 0.0001, "loss": 3.1725, "loss/crossentropy": 2.134939956665039, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.2594160199165344, "loss/reg": 0.0, "step": 13660 }, { "epoch": 0.08993421052631578, "grad_norm": 2.296875, "grad_norm_var": 0.09054361979166667, "learning_rate": 0.0001, "loss": 3.0604, "loss/crossentropy": 2.22348096370697, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2797392845153809, "loss/reg": 0.0, "step": 13670 }, { "epoch": 0.09, "grad_norm": 2.65625, "grad_norm_var": 0.0264556884765625, "learning_rate": 0.0001, "loss": 3.165, "loss/crossentropy": 2.321420121192932, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.25273988842964173, "loss/reg": 0.0, "step": 13680 }, { "epoch": 0.09006578947368421, "grad_norm": 2.1875, "grad_norm_var": 0.0448394775390625, "learning_rate": 0.0001, "loss": 3.2251, "loss/crossentropy": 2.5713982224464416, "loss/hidden": 3.1890625, "loss/incoh": 0.0, "loss/logits": 0.3743001103401184, "loss/reg": 0.0, "step": 13690 }, { "epoch": 0.09013157894736842, "grad_norm": 2.34375, "grad_norm_var": 0.05821024576822917, "learning_rate": 0.0001, "loss": 3.1174, "loss/crossentropy": 2.433469843864441, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.29491184949874877, "loss/reg": 0.0, "step": 13700 }, { "epoch": 0.09019736842105264, "grad_norm": 2.328125, "grad_norm_var": 0.06334228515625, "learning_rate": 0.0001, "loss": 3.1422, "loss/crossentropy": 2.498783230781555, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2630519106984138, "loss/reg": 0.0, "step": 13710 }, { "epoch": 0.09026315789473684, "grad_norm": 2.484375, "grad_norm_var": 0.18053385416666667, "learning_rate": 0.0001, "loss": 3.2017, "loss/crossentropy": 2.295622777938843, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.25550642013549807, "loss/reg": 0.0, "step": 13720 }, { "epoch": 0.09032894736842105, "grad_norm": 2.921875, "grad_norm_var": 0.14754231770833334, "learning_rate": 0.0001, "loss": 3.2136, "loss/crossentropy": 2.4960160851478577, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.29100794196128843, "loss/reg": 0.0, "step": 13730 }, { "epoch": 0.09039473684210526, "grad_norm": 2.5, "grad_norm_var": 0.060888671875, "learning_rate": 0.0001, "loss": 3.182, "loss/crossentropy": 2.543604516983032, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.26428850889205935, "loss/reg": 0.0, "step": 13740 }, { "epoch": 0.09046052631578948, "grad_norm": 2.59375, "grad_norm_var": 0.1883697509765625, "learning_rate": 0.0001, "loss": 3.1144, "loss/crossentropy": 2.495186424255371, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.24936733841896058, "loss/reg": 0.0, "step": 13750 }, { "epoch": 0.09052631578947369, "grad_norm": 2.515625, "grad_norm_var": 0.17746480305989584, "learning_rate": 0.0001, "loss": 3.2332, "loss/crossentropy": 2.5168472051620485, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.3214556619524956, "loss/reg": 0.0, "step": 13760 }, { "epoch": 0.09059210526315789, "grad_norm": 2.53125, "grad_norm_var": 0.049853515625, "learning_rate": 0.0001, "loss": 3.1549, "loss/crossentropy": 2.2676196336746215, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.28792900443077085, "loss/reg": 0.0, "step": 13770 }, { "epoch": 0.0906578947368421, "grad_norm": 2.890625, "grad_norm_var": 0.50240478515625, "learning_rate": 0.0001, "loss": 3.1942, "loss/crossentropy": 2.3297463774681093, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.20883744955062866, "loss/reg": 0.0, "step": 13780 }, { "epoch": 0.09072368421052632, "grad_norm": 2.546875, "grad_norm_var": 0.4667154947916667, "learning_rate": 0.0001, "loss": 3.2349, "loss/crossentropy": 2.1975256204605103, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.2709361925721169, "loss/reg": 0.0, "step": 13790 }, { "epoch": 0.09078947368421053, "grad_norm": 2.84375, "grad_norm_var": 0.03693745930989583, "learning_rate": 0.0001, "loss": 3.1167, "loss/crossentropy": 2.3970743119716644, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.2433250866830349, "loss/reg": 0.0, "step": 13800 }, { "epoch": 0.09085526315789473, "grad_norm": 2.609375, "grad_norm_var": 0.7566965738932292, "learning_rate": 0.0001, "loss": 3.2493, "loss/crossentropy": 2.269898569583893, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.23498842120170593, "loss/reg": 0.0, "step": 13810 }, { "epoch": 0.09092105263157894, "grad_norm": 2.265625, "grad_norm_var": 0.050389607747395836, "learning_rate": 0.0001, "loss": 3.1946, "loss/crossentropy": 2.5624868392944338, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.36430184692144396, "loss/reg": 0.0, "step": 13820 }, { "epoch": 0.09098684210526316, "grad_norm": 2.640625, "grad_norm_var": 0.06546122233072917, "learning_rate": 0.0001, "loss": 3.225, "loss/crossentropy": 2.2082170367240908, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2326130375266075, "loss/reg": 0.0, "step": 13830 }, { "epoch": 0.09105263157894737, "grad_norm": 3.046875, "grad_norm_var": 0.1621002197265625, "learning_rate": 0.0001, "loss": 3.225, "loss/crossentropy": 2.2408367514610292, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.25092404931783674, "loss/reg": 0.0, "step": 13840 }, { "epoch": 0.09111842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.18394775390625, "learning_rate": 0.0001, "loss": 3.1397, "loss/crossentropy": 2.092372101545334, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.25034971833229064, "loss/reg": 0.0, "step": 13850 }, { "epoch": 0.09118421052631578, "grad_norm": 2.421875, "grad_norm_var": 0.0753313700358073, "learning_rate": 0.0001, "loss": 3.1936, "loss/crossentropy": 2.2277904510498048, "loss/hidden": 3.075, "loss/incoh": 0.0, "loss/logits": 0.28555874079465865, "loss/reg": 0.0, "step": 13860 }, { "epoch": 0.09125, "grad_norm": 2.203125, "grad_norm_var": 0.06499608357747395, "learning_rate": 0.0001, "loss": 3.1857, "loss/crossentropy": 2.0974882781505584, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.22869385927915573, "loss/reg": 0.0, "step": 13870 }, { "epoch": 0.09131578947368421, "grad_norm": 3.1875, "grad_norm_var": 0.30895894368489585, "learning_rate": 0.0001, "loss": 3.1808, "loss/crossentropy": 2.2242319107055666, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.24477246254682541, "loss/reg": 0.0, "step": 13880 }, { "epoch": 0.09138157894736842, "grad_norm": 2.546875, "grad_norm_var": 0.30500895182291665, "learning_rate": 0.0001, "loss": 3.1587, "loss/crossentropy": 2.2787875294685365, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.23667097985744476, "loss/reg": 0.0, "step": 13890 }, { "epoch": 0.09144736842105264, "grad_norm": 2.109375, "grad_norm_var": 0.04114176432291667, "learning_rate": 0.0001, "loss": 3.1791, "loss/crossentropy": 2.5015464782714845, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.26709298938512804, "loss/reg": 0.0, "step": 13900 }, { "epoch": 0.09151315789473684, "grad_norm": 1.9765625, "grad_norm_var": 0.38769505818684896, "learning_rate": 0.0001, "loss": 3.1858, "loss/crossentropy": 2.367018985748291, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.2575811371207237, "loss/reg": 0.0, "step": 13910 }, { "epoch": 0.09157894736842105, "grad_norm": 2.8125, "grad_norm_var": 0.12981338500976564, "learning_rate": 0.0001, "loss": 3.2014, "loss/crossentropy": 2.3600114941596986, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2798996135592461, "loss/reg": 0.0, "step": 13920 }, { "epoch": 0.09164473684210526, "grad_norm": 2.40625, "grad_norm_var": 0.5571940104166667, "learning_rate": 0.0001, "loss": 3.1415, "loss/crossentropy": 2.07312273979187, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.27816066443920134, "loss/reg": 0.0, "step": 13930 }, { "epoch": 0.09171052631578948, "grad_norm": 2.234375, "grad_norm_var": 0.16337483723958332, "learning_rate": 0.0001, "loss": 3.1846, "loss/crossentropy": 2.3842572927474976, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.3044495239853859, "loss/reg": 0.0, "step": 13940 }, { "epoch": 0.09177631578947368, "grad_norm": 2.28125, "grad_norm_var": 0.011617024739583334, "learning_rate": 0.0001, "loss": 3.2638, "loss/crossentropy": 2.258384811878204, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.24848029464483262, "loss/reg": 0.0, "step": 13950 }, { "epoch": 0.09184210526315789, "grad_norm": 2.625, "grad_norm_var": 0.08046875, "learning_rate": 0.0001, "loss": 3.2277, "loss/crossentropy": 2.1830771923065186, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.282887265086174, "loss/reg": 0.0, "step": 13960 }, { "epoch": 0.0919078947368421, "grad_norm": 2.34375, "grad_norm_var": 0.69127197265625, "learning_rate": 0.0001, "loss": 3.2795, "loss/crossentropy": 2.5693406105041503, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2618736609816551, "loss/reg": 0.0, "step": 13970 }, { "epoch": 0.09197368421052632, "grad_norm": 2.265625, "grad_norm_var": 1.2373443603515626, "learning_rate": 0.0001, "loss": 3.163, "loss/crossentropy": 2.496062994003296, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.27756498008966446, "loss/reg": 0.0, "step": 13980 }, { "epoch": 0.09203947368421053, "grad_norm": 2.375, "grad_norm_var": 1.256787109375, "learning_rate": 0.0001, "loss": 3.2974, "loss/crossentropy": 2.233779698610306, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.22414856255054474, "loss/reg": 0.0, "step": 13990 }, { "epoch": 0.09210526315789473, "grad_norm": 2.3125, "grad_norm_var": 0.04104817708333333, "learning_rate": 0.0001, "loss": 3.1465, "loss/crossentropy": 2.3837480187416076, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.29815254360437393, "loss/reg": 0.0, "step": 14000 }, { "epoch": 0.09217105263157895, "grad_norm": 2.296875, "grad_norm_var": 0.03504130045572917, "learning_rate": 0.0001, "loss": 3.1933, "loss/crossentropy": 2.4089162349700928, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.28769134283065795, "loss/reg": 0.0, "step": 14010 }, { "epoch": 0.09223684210526316, "grad_norm": 2.234375, "grad_norm_var": 0.099560546875, "learning_rate": 0.0001, "loss": 3.2431, "loss/crossentropy": 2.387049177289009, "loss/hidden": 3.078125, "loss/incoh": 0.0, "loss/logits": 0.31867421939969065, "loss/reg": 0.0, "step": 14020 }, { "epoch": 0.09230263157894737, "grad_norm": 2.203125, "grad_norm_var": 0.12417704264322917, "learning_rate": 0.0001, "loss": 3.0959, "loss/crossentropy": 2.2009261429309843, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2346379891037941, "loss/reg": 0.0, "step": 14030 }, { "epoch": 0.09236842105263159, "grad_norm": 2.484375, "grad_norm_var": 0.0412750244140625, "learning_rate": 0.0001, "loss": 3.1679, "loss/crossentropy": 2.379330587387085, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.23624493330717086, "loss/reg": 0.0, "step": 14040 }, { "epoch": 0.09243421052631579, "grad_norm": 2.515625, "grad_norm_var": 0.25758056640625, "learning_rate": 0.0001, "loss": 3.1901, "loss/crossentropy": 2.2017428398132326, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2819879144430161, "loss/reg": 0.0, "step": 14050 }, { "epoch": 0.0925, "grad_norm": 2.59375, "grad_norm_var": 1.2106597900390625, "learning_rate": 0.0001, "loss": 3.1627, "loss/crossentropy": 2.053563690185547, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.24388336688280104, "loss/reg": 0.0, "step": 14060 }, { "epoch": 0.09256578947368421, "grad_norm": 1.953125, "grad_norm_var": 0.0501617431640625, "learning_rate": 0.0001, "loss": 3.1122, "loss/crossentropy": 2.5748242855072023, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2592714115977287, "loss/reg": 0.0, "step": 14070 }, { "epoch": 0.09263157894736843, "grad_norm": 2.546875, "grad_norm_var": 0.115869140625, "learning_rate": 0.0001, "loss": 3.2403, "loss/crossentropy": 2.1434300899505616, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.23879239857196807, "loss/reg": 0.0, "step": 14080 }, { "epoch": 0.09269736842105263, "grad_norm": 2.53125, "grad_norm_var": 0.08590087890625, "learning_rate": 0.0001, "loss": 3.187, "loss/crossentropy": 2.078695094585419, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.29750160723924635, "loss/reg": 0.0, "step": 14090 }, { "epoch": 0.09276315789473684, "grad_norm": 3.078125, "grad_norm_var": 0.06355692545572916, "learning_rate": 0.0001, "loss": 3.1426, "loss/crossentropy": 2.4077930808067323, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2784098371863365, "loss/reg": 0.0, "step": 14100 }, { "epoch": 0.09282894736842105, "grad_norm": 2.421875, "grad_norm_var": 0.28290913899739584, "learning_rate": 0.0001, "loss": 3.1999, "loss/crossentropy": 2.551260459423065, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.30844325572252274, "loss/reg": 0.0, "step": 14110 }, { "epoch": 0.09289473684210527, "grad_norm": 2.890625, "grad_norm_var": 0.28843994140625, "learning_rate": 0.0001, "loss": 3.1903, "loss/crossentropy": 2.057539927959442, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.21393984705209732, "loss/reg": 0.0, "step": 14120 }, { "epoch": 0.09296052631578948, "grad_norm": 2.328125, "grad_norm_var": 0.16035054524739584, "learning_rate": 0.0001, "loss": 3.2036, "loss/crossentropy": 2.2166428923606873, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.28170192539691924, "loss/reg": 0.0, "step": 14130 }, { "epoch": 0.09302631578947368, "grad_norm": 2.328125, "grad_norm_var": 0.04431864420572917, "learning_rate": 0.0001, "loss": 3.1706, "loss/crossentropy": 2.5224907636642455, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2656204476952553, "loss/reg": 0.0, "step": 14140 }, { "epoch": 0.09309210526315789, "grad_norm": 2.28125, "grad_norm_var": 0.13594462076822916, "learning_rate": 0.0001, "loss": 3.2365, "loss/crossentropy": 2.365175998210907, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.28030987083911896, "loss/reg": 0.0, "step": 14150 }, { "epoch": 0.0931578947368421, "grad_norm": 2.375, "grad_norm_var": 3.4480684566712595e+17, "learning_rate": 0.0001, "loss": 3.3698, "loss/crossentropy": 2.4446977496147158, "loss/hidden": 4.159375, "loss/incoh": 0.0, "loss/logits": 0.35163595527410507, "loss/reg": 0.0, "step": 14160 }, { "epoch": 0.09322368421052632, "grad_norm": 2.46875, "grad_norm_var": 3.4480684570076774e+17, "learning_rate": 0.0001, "loss": 3.1773, "loss/crossentropy": 2.144097054004669, "loss/hidden": 3.0171875, "loss/incoh": 0.0, "loss/logits": 0.2777150124311447, "loss/reg": 0.0, "step": 14170 }, { "epoch": 0.09328947368421053, "grad_norm": 2.484375, "grad_norm_var": 0.1343902587890625, "learning_rate": 0.0001, "loss": 3.1593, "loss/crossentropy": 2.5162782430648805, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2642303377389908, "loss/reg": 0.0, "step": 14180 }, { "epoch": 0.09335526315789473, "grad_norm": 2.203125, "grad_norm_var": 0.169921875, "learning_rate": 0.0001, "loss": 3.1927, "loss/crossentropy": 2.1481271982192993, "loss/hidden": 3.034375, "loss/incoh": 0.0, "loss/logits": 0.27440374791622163, "loss/reg": 0.0, "step": 14190 }, { "epoch": 0.09342105263157895, "grad_norm": 2.21875, "grad_norm_var": 0.188330078125, "learning_rate": 0.0001, "loss": 3.2049, "loss/crossentropy": 2.4672377467155457, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.25575320422649384, "loss/reg": 0.0, "step": 14200 }, { "epoch": 0.09348684210526316, "grad_norm": 2.421875, "grad_norm_var": 0.07030843098958334, "learning_rate": 0.0001, "loss": 3.1256, "loss/crossentropy": 2.4822991728782653, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.23255562111735345, "loss/reg": 0.0, "step": 14210 }, { "epoch": 0.09355263157894737, "grad_norm": 2.421875, "grad_norm_var": 0.06575698852539062, "learning_rate": 0.0001, "loss": 3.1061, "loss/crossentropy": 2.4455429315567017, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.2238618478178978, "loss/reg": 0.0, "step": 14220 }, { "epoch": 0.09361842105263157, "grad_norm": 2.453125, "grad_norm_var": 0.13621317545572917, "learning_rate": 0.0001, "loss": 3.1927, "loss/crossentropy": 2.3609827399253844, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.3211557373404503, "loss/reg": 0.0, "step": 14230 }, { "epoch": 0.09368421052631579, "grad_norm": 2.3125, "grad_norm_var": 0.1390777587890625, "learning_rate": 0.0001, "loss": 3.2222, "loss/crossentropy": 2.4929265141487122, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.256229493021965, "loss/reg": 0.0, "step": 14240 }, { "epoch": 0.09375, "grad_norm": 2.28125, "grad_norm_var": 0.014774576822916666, "learning_rate": 0.0001, "loss": 3.0973, "loss/crossentropy": 2.396563506126404, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.3292677074670792, "loss/reg": 0.0, "step": 14250 }, { "epoch": 0.09381578947368421, "grad_norm": 3.046875, "grad_norm_var": 3.801495474913411e+17, "learning_rate": 0.0001, "loss": 3.3636, "loss/crossentropy": 2.1659668326377868, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.24123398140072821, "loss/reg": 0.0, "step": 14260 }, { "epoch": 0.09388157894736843, "grad_norm": 2.640625, "grad_norm_var": 3.801495474059215e+17, "learning_rate": 0.0001, "loss": 3.2259, "loss/crossentropy": 2.35439647436142, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.22794780433177947, "loss/reg": 0.0, "step": 14270 }, { "epoch": 0.09394736842105263, "grad_norm": 2.734375, "grad_norm_var": 0.42942708333333335, "learning_rate": 0.0001, "loss": 3.1665, "loss/crossentropy": 2.3001658797264097, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.26924325078725814, "loss/reg": 0.0, "step": 14280 }, { "epoch": 0.09401315789473684, "grad_norm": 2.34375, "grad_norm_var": 0.0408843994140625, "learning_rate": 0.0001, "loss": 3.0518, "loss/crossentropy": 2.3841129422187803, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.26079314202070236, "loss/reg": 0.0, "step": 14290 }, { "epoch": 0.09407894736842105, "grad_norm": 2.296875, "grad_norm_var": 0.03784891764322917, "learning_rate": 0.0001, "loss": 3.1904, "loss/crossentropy": 2.2710848689079284, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.23626017868518828, "loss/reg": 0.0, "step": 14300 }, { "epoch": 0.09414473684210527, "grad_norm": 2.921875, "grad_norm_var": 0.11845296223958333, "learning_rate": 0.0001, "loss": 3.2715, "loss/crossentropy": 2.4462394237518312, "loss/hidden": 3.0515625, "loss/incoh": 0.0, "loss/logits": 0.3083674684166908, "loss/reg": 0.0, "step": 14310 }, { "epoch": 0.09421052631578947, "grad_norm": 2.609375, "grad_norm_var": 0.115771484375, "learning_rate": 0.0001, "loss": 3.1137, "loss/crossentropy": 2.4102493643760683, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.3285558119416237, "loss/reg": 0.0, "step": 14320 }, { "epoch": 0.09427631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.0410552978515625, "learning_rate": 0.0001, "loss": 3.1163, "loss/crossentropy": 2.496846008300781, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.2681970477104187, "loss/reg": 0.0, "step": 14330 }, { "epoch": 0.0943421052631579, "grad_norm": 3.28125, "grad_norm_var": 0.3074615478515625, "learning_rate": 0.0001, "loss": 3.2179, "loss/crossentropy": 2.368880546092987, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.27040198296308515, "loss/reg": 0.0, "step": 14340 }, { "epoch": 0.09440789473684211, "grad_norm": 2.359375, "grad_norm_var": 0.087841796875, "learning_rate": 0.0001, "loss": 3.1677, "loss/crossentropy": 2.379721689224243, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.3262082427740097, "loss/reg": 0.0, "step": 14350 }, { "epoch": 0.09447368421052632, "grad_norm": 3.03125, "grad_norm_var": 0.06515299479166667, "learning_rate": 0.0001, "loss": 3.1541, "loss/crossentropy": 2.3577764987945558, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.28836628049612045, "loss/reg": 0.0, "step": 14360 }, { "epoch": 0.09453947368421052, "grad_norm": 2.40625, "grad_norm_var": 0.09937744140625, "learning_rate": 0.0001, "loss": 3.1429, "loss/crossentropy": 2.2929248332977297, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2473236471414566, "loss/reg": 0.0, "step": 14370 }, { "epoch": 0.09460526315789473, "grad_norm": 2.90625, "grad_norm_var": 0.056428019205729166, "learning_rate": 0.0001, "loss": 3.1331, "loss/crossentropy": 2.11824688911438, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.19864632040262223, "loss/reg": 0.0, "step": 14380 }, { "epoch": 0.09467105263157895, "grad_norm": 2.5625, "grad_norm_var": 0.2792154947916667, "learning_rate": 0.0001, "loss": 3.2558, "loss/crossentropy": 2.4552414536476137, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.26964131295681, "loss/reg": 0.0, "step": 14390 }, { "epoch": 0.09473684210526316, "grad_norm": 2.828125, "grad_norm_var": 0.1197418212890625, "learning_rate": 0.0001, "loss": 3.1627, "loss/crossentropy": 2.4108232736587523, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2856267586350441, "loss/reg": 0.0, "step": 14400 }, { "epoch": 0.09480263157894737, "grad_norm": 2.3125, "grad_norm_var": 0.09917704264322917, "learning_rate": 0.0001, "loss": 3.2683, "loss/crossentropy": 2.245331883430481, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.26227104514837263, "loss/reg": 0.0, "step": 14410 }, { "epoch": 0.09486842105263157, "grad_norm": 2.71875, "grad_norm_var": 0.47395426432291665, "learning_rate": 0.0001, "loss": 3.3242, "loss/crossentropy": 2.5019222021102907, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2487493023276329, "loss/reg": 0.0, "step": 14420 }, { "epoch": 0.09493421052631579, "grad_norm": 2.84375, "grad_norm_var": 0.8088053385416667, "learning_rate": 0.0001, "loss": 3.1846, "loss/crossentropy": 2.239874541759491, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.30340075492858887, "loss/reg": 0.0, "step": 14430 }, { "epoch": 0.095, "grad_norm": 3.203125, "grad_norm_var": 0.5774698893229167, "learning_rate": 0.0001, "loss": 3.2356, "loss/crossentropy": 2.212349569797516, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.25182478949427606, "loss/reg": 0.0, "step": 14440 }, { "epoch": 0.09506578947368421, "grad_norm": 2.890625, "grad_norm_var": 0.19168294270833333, "learning_rate": 0.0001, "loss": 3.2659, "loss/crossentropy": 2.337146294116974, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2963370710611343, "loss/reg": 0.0, "step": 14450 }, { "epoch": 0.09513157894736841, "grad_norm": 2.5625, "grad_norm_var": 0.10920308430989584, "learning_rate": 0.0001, "loss": 3.2442, "loss/crossentropy": 1.8299875736236573, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.21587850898504257, "loss/reg": 0.0, "step": 14460 }, { "epoch": 0.09519736842105263, "grad_norm": 2.4375, "grad_norm_var": 0.04309794108072917, "learning_rate": 0.0001, "loss": 3.1964, "loss/crossentropy": 2.558639335632324, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.25251512974500656, "loss/reg": 0.0, "step": 14470 }, { "epoch": 0.09526315789473684, "grad_norm": 2.4375, "grad_norm_var": 0.020686848958333334, "learning_rate": 0.0001, "loss": 3.1942, "loss/crossentropy": 2.514283466339111, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.2890095472335815, "loss/reg": 0.0, "step": 14480 }, { "epoch": 0.09532894736842105, "grad_norm": 2.140625, "grad_norm_var": 0.028123982747395835, "learning_rate": 0.0001, "loss": 3.1856, "loss/crossentropy": 2.2959898948669433, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.24410185664892198, "loss/reg": 0.0, "step": 14490 }, { "epoch": 0.09539473684210527, "grad_norm": 2.328125, "grad_norm_var": 0.03728841145833333, "learning_rate": 0.0001, "loss": 3.2528, "loss/crossentropy": 2.421563959121704, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.30335861891508104, "loss/reg": 0.0, "step": 14500 }, { "epoch": 0.09546052631578947, "grad_norm": 2.625, "grad_norm_var": 0.041825358072916666, "learning_rate": 0.0001, "loss": 3.1292, "loss/crossentropy": 2.3713988065719604, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2508593872189522, "loss/reg": 0.0, "step": 14510 }, { "epoch": 0.09552631578947368, "grad_norm": 2.015625, "grad_norm_var": 0.12280985514322916, "learning_rate": 0.0001, "loss": 3.1975, "loss/crossentropy": 2.3875895380973815, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.26996816471219065, "loss/reg": 0.0, "step": 14520 }, { "epoch": 0.0955921052631579, "grad_norm": 2.375, "grad_norm_var": 0.13038101196289062, "learning_rate": 0.0001, "loss": 3.1709, "loss/crossentropy": 2.3352912187576296, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.23363660275936127, "loss/reg": 0.0, "step": 14530 }, { "epoch": 0.09565789473684211, "grad_norm": 2.5625, "grad_norm_var": 0.11688206990559896, "learning_rate": 0.0001, "loss": 3.1364, "loss/crossentropy": 2.473167669773102, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.24549597799777984, "loss/reg": 0.0, "step": 14540 }, { "epoch": 0.09572368421052632, "grad_norm": 2.734375, "grad_norm_var": 0.060868326822916666, "learning_rate": 0.0001, "loss": 3.2086, "loss/crossentropy": 2.3421871423721314, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.2576961562037468, "loss/reg": 0.0, "step": 14550 }, { "epoch": 0.09578947368421052, "grad_norm": 2.59375, "grad_norm_var": 0.10832926432291666, "learning_rate": 0.0001, "loss": 3.1744, "loss/crossentropy": 2.1765161633491514, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2244855582714081, "loss/reg": 0.0, "step": 14560 }, { "epoch": 0.09585526315789474, "grad_norm": 2.265625, "grad_norm_var": 0.1071685791015625, "learning_rate": 0.0001, "loss": 3.1489, "loss/crossentropy": 2.507940888404846, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.2875757083296776, "loss/reg": 0.0, "step": 14570 }, { "epoch": 0.09592105263157895, "grad_norm": 2.140625, "grad_norm_var": 0.029523722330729165, "learning_rate": 0.0001, "loss": 3.1669, "loss/crossentropy": 2.2886616230010985, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.3050649344921112, "loss/reg": 0.0, "step": 14580 }, { "epoch": 0.09598684210526316, "grad_norm": 2.8125, "grad_norm_var": 0.03560282389322917, "learning_rate": 0.0001, "loss": 3.1454, "loss/crossentropy": 2.395359969139099, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.2635947346687317, "loss/reg": 0.0, "step": 14590 }, { "epoch": 0.09605263157894736, "grad_norm": 2.03125, "grad_norm_var": 0.23357747395833334, "learning_rate": 0.0001, "loss": 3.2176, "loss/crossentropy": 2.002879011631012, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.24021245390176774, "loss/reg": 0.0, "step": 14600 }, { "epoch": 0.09611842105263158, "grad_norm": 2.109375, "grad_norm_var": 0.216552734375, "learning_rate": 0.0001, "loss": 3.1289, "loss/crossentropy": 2.4440099954605103, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.3085349440574646, "loss/reg": 0.0, "step": 14610 }, { "epoch": 0.09618421052631579, "grad_norm": 2.21875, "grad_norm_var": 0.18892822265625, "learning_rate": 0.0001, "loss": 3.2541, "loss/crossentropy": 2.5584524154663084, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.33079054951667786, "loss/reg": 0.0, "step": 14620 }, { "epoch": 0.09625, "grad_norm": 2.546875, "grad_norm_var": 0.18036702473958333, "learning_rate": 0.0001, "loss": 3.1899, "loss/crossentropy": 2.386956262588501, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.2895423695445061, "loss/reg": 0.0, "step": 14630 }, { "epoch": 0.09631578947368422, "grad_norm": 2.390625, "grad_norm_var": 0.15852762858072916, "learning_rate": 0.0001, "loss": 3.1443, "loss/crossentropy": 2.0192247331142426, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.2274337187409401, "loss/reg": 0.0, "step": 14640 }, { "epoch": 0.09638157894736842, "grad_norm": 2.375, "grad_norm_var": 0.11549072265625, "learning_rate": 0.0001, "loss": 3.1243, "loss/crossentropy": 2.499123454093933, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.25674946457147596, "loss/reg": 0.0, "step": 14650 }, { "epoch": 0.09644736842105263, "grad_norm": 2130706432.0, "grad_norm_var": 2.83744368059244e+17, "learning_rate": 0.0001, "loss": 3.251, "loss/crossentropy": 2.1218835711479187, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.23685040771961213, "loss/reg": 0.0, "step": 14660 }, { "epoch": 0.09651315789473684, "grad_norm": 2.15625, "grad_norm_var": 2.8374436794771485e+17, "learning_rate": 0.0001, "loss": 3.2008, "loss/crossentropy": 2.125733083486557, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.3448520749807358, "loss/reg": 0.0, "step": 14670 }, { "epoch": 0.09657894736842106, "grad_norm": 2.40625, "grad_norm_var": 0.24964090983072917, "learning_rate": 0.0001, "loss": 3.1553, "loss/crossentropy": 2.604245328903198, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.23915667235851287, "loss/reg": 0.0, "step": 14680 }, { "epoch": 0.09664473684210527, "grad_norm": 2.40625, "grad_norm_var": 0.11900634765625, "learning_rate": 0.0001, "loss": 3.1023, "loss/crossentropy": 2.3761191368103027, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2546194761991501, "loss/reg": 0.0, "step": 14690 }, { "epoch": 0.09671052631578947, "grad_norm": 2.5625, "grad_norm_var": 0.05559488932291667, "learning_rate": 0.0001, "loss": 3.1544, "loss/crossentropy": 2.3285223722457884, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.24710593670606612, "loss/reg": 0.0, "step": 14700 }, { "epoch": 0.09677631578947368, "grad_norm": 2.4375, "grad_norm_var": 0.13647842407226562, "learning_rate": 0.0001, "loss": 3.1385, "loss/crossentropy": 2.364056706428528, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.22887694388628005, "loss/reg": 0.0, "step": 14710 }, { "epoch": 0.0968421052631579, "grad_norm": 2.140625, "grad_norm_var": 5.376778157552083, "learning_rate": 0.0001, "loss": 3.1773, "loss/crossentropy": 2.2852516174316406, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.24893905222415924, "loss/reg": 0.0, "step": 14720 }, { "epoch": 0.09690789473684211, "grad_norm": 2.828125, "grad_norm_var": 5.43084487915039, "learning_rate": 0.0001, "loss": 3.1089, "loss/crossentropy": 2.1680606245994567, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.23210474848747253, "loss/reg": 0.0, "step": 14730 }, { "epoch": 0.09697368421052631, "grad_norm": 1.984375, "grad_norm_var": 0.1650054931640625, "learning_rate": 0.0001, "loss": 3.1202, "loss/crossentropy": 2.1676085114479067, "loss/hidden": 3.103125, "loss/incoh": 0.0, "loss/logits": 0.2939779102802277, "loss/reg": 0.0, "step": 14740 }, { "epoch": 0.09703947368421052, "grad_norm": 2.078125, "grad_norm_var": 0.17122294108072916, "learning_rate": 0.0001, "loss": 3.0635, "loss/crossentropy": 2.498277449607849, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.22846879661083222, "loss/reg": 0.0, "step": 14750 }, { "epoch": 0.09710526315789474, "grad_norm": 2.296875, "grad_norm_var": 0.11088765462239583, "learning_rate": 0.0001, "loss": 3.155, "loss/crossentropy": 2.2534152626991273, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.2535191968083382, "loss/reg": 0.0, "step": 14760 }, { "epoch": 0.09717105263157895, "grad_norm": 2.625, "grad_norm_var": 0.10987955729166667, "learning_rate": 0.0001, "loss": 3.2477, "loss/crossentropy": 2.085835373401642, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.25164034962654114, "loss/reg": 0.0, "step": 14770 }, { "epoch": 0.09723684210526316, "grad_norm": 2.578125, "grad_norm_var": 17.859382120768228, "learning_rate": 0.0001, "loss": 3.2842, "loss/crossentropy": 1.8799749910831451, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.24884901493787764, "loss/reg": 0.0, "step": 14780 }, { "epoch": 0.09730263157894736, "grad_norm": 2.078125, "grad_norm_var": 17.996207682291665, "learning_rate": 0.0001, "loss": 3.2386, "loss/crossentropy": 2.307372045516968, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.26386404484510423, "loss/reg": 0.0, "step": 14790 }, { "epoch": 0.09736842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.06170247395833333, "learning_rate": 0.0001, "loss": 3.1452, "loss/crossentropy": 2.454807901382446, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.3172868087887764, "loss/reg": 0.0, "step": 14800 }, { "epoch": 0.09743421052631579, "grad_norm": 2.5, "grad_norm_var": 0.05920817057291667, "learning_rate": 0.0001, "loss": 3.1114, "loss/crossentropy": 2.014724650979042, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.1908031925559044, "loss/reg": 0.0, "step": 14810 }, { "epoch": 0.0975, "grad_norm": 2.90625, "grad_norm_var": 0.060445149739583336, "learning_rate": 0.0001, "loss": 3.2493, "loss/crossentropy": 2.463605833053589, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.24406316578388215, "loss/reg": 0.0, "step": 14820 }, { "epoch": 0.09756578947368422, "grad_norm": 2.28125, "grad_norm_var": 0.29640299479166665, "learning_rate": 0.0001, "loss": 3.1614, "loss/crossentropy": 2.501069176197052, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2565233051776886, "loss/reg": 0.0, "step": 14830 }, { "epoch": 0.09763157894736842, "grad_norm": 2.09375, "grad_norm_var": 0.5406534830729167, "learning_rate": 0.0001, "loss": 3.1904, "loss/crossentropy": 2.462419664859772, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.22374519556760789, "loss/reg": 0.0, "step": 14840 }, { "epoch": 0.09769736842105263, "grad_norm": 2.296875, "grad_norm_var": 0.7624501546223958, "learning_rate": 0.0001, "loss": 3.2431, "loss/crossentropy": 2.226536822319031, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.26644248366355894, "loss/reg": 0.0, "step": 14850 }, { "epoch": 0.09776315789473684, "grad_norm": 2.328125, "grad_norm_var": 2.5783355712890623, "learning_rate": 0.0001, "loss": 3.3229, "loss/crossentropy": 2.3811925053596497, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.22725498378276826, "loss/reg": 0.0, "step": 14860 }, { "epoch": 0.09782894736842106, "grad_norm": 2.90625, "grad_norm_var": 2.3144205729166667, "learning_rate": 0.0001, "loss": 3.2061, "loss/crossentropy": 2.5685184717178347, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.2757527410984039, "loss/reg": 0.0, "step": 14870 }, { "epoch": 0.09789473684210526, "grad_norm": 2.515625, "grad_norm_var": 1.900218709309896, "learning_rate": 0.0001, "loss": 3.1588, "loss/crossentropy": 2.224149799346924, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.24766245782375335, "loss/reg": 0.0, "step": 14880 }, { "epoch": 0.09796052631578947, "grad_norm": 2.421875, "grad_norm_var": 0.13209228515625, "learning_rate": 0.0001, "loss": 3.1723, "loss/crossentropy": 2.2869945645332335, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.3204286351799965, "loss/reg": 0.0, "step": 14890 }, { "epoch": 0.09802631578947368, "grad_norm": 2.71875, "grad_norm_var": 0.036408487955729166, "learning_rate": 0.0001, "loss": 3.1244, "loss/crossentropy": 2.276153302192688, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.2348289594054222, "loss/reg": 0.0, "step": 14900 }, { "epoch": 0.0980921052631579, "grad_norm": 3.328125, "grad_norm_var": 0.0842926025390625, "learning_rate": 0.0001, "loss": 3.2024, "loss/crossentropy": 2.2277270436286924, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.2504657179117203, "loss/reg": 0.0, "step": 14910 }, { "epoch": 0.09815789473684211, "grad_norm": 2.25, "grad_norm_var": 0.15784098307291666, "learning_rate": 0.0001, "loss": 3.1643, "loss/crossentropy": 2.601578450202942, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.27564091980457306, "loss/reg": 0.0, "step": 14920 }, { "epoch": 0.09822368421052631, "grad_norm": 2.21875, "grad_norm_var": 0.12454325358072917, "learning_rate": 0.0001, "loss": 3.0857, "loss/crossentropy": 2.4973155736923216, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.2681492820382118, "loss/reg": 0.0, "step": 14930 }, { "epoch": 0.09828947368421052, "grad_norm": 2.703125, "grad_norm_var": 0.04514872233072917, "learning_rate": 0.0001, "loss": 3.1214, "loss/crossentropy": 2.526623797416687, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.26705596745014193, "loss/reg": 0.0, "step": 14940 }, { "epoch": 0.09835526315789474, "grad_norm": 2.21875, "grad_norm_var": 0.23338216145833332, "learning_rate": 0.0001, "loss": 3.1468, "loss/crossentropy": 2.273276376724243, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.28430237621068954, "loss/reg": 0.0, "step": 14950 }, { "epoch": 0.09842105263157895, "grad_norm": 2.1875, "grad_norm_var": 0.2802398681640625, "learning_rate": 0.0001, "loss": 3.2042, "loss/crossentropy": 2.4497693538665772, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.2505017280578613, "loss/reg": 0.0, "step": 14960 }, { "epoch": 0.09848684210526316, "grad_norm": 2.453125, "grad_norm_var": 0.15282796223958334, "learning_rate": 0.0001, "loss": 3.1746, "loss/crossentropy": 2.4736087679862977, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.24588750153779984, "loss/reg": 0.0, "step": 14970 }, { "epoch": 0.09855263157894736, "grad_norm": 2.203125, "grad_norm_var": 0.11015218098958333, "learning_rate": 0.0001, "loss": 3.1539, "loss/crossentropy": 2.373614990711212, "loss/hidden": 3.0578125, "loss/incoh": 0.0, "loss/logits": 0.30631934851408005, "loss/reg": 0.0, "step": 14980 }, { "epoch": 0.09861842105263158, "grad_norm": 3.921875, "grad_norm_var": 0.1558990478515625, "learning_rate": 0.0001, "loss": 3.1982, "loss/crossentropy": 2.295030379295349, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.24311772882938384, "loss/reg": 0.0, "step": 14990 }, { "epoch": 0.09868421052631579, "grad_norm": 2.234375, "grad_norm_var": 0.3582967122395833, "learning_rate": 0.0001, "loss": 3.2225, "loss/crossentropy": 2.29546434879303, "loss/hidden": 3.05625, "loss/incoh": 0.0, "loss/logits": 0.45574710667133334, "loss/reg": 0.0, "step": 15000 }, { "epoch": 0.09875, "grad_norm": 2.796875, "grad_norm_var": 5.883512115478515, "learning_rate": 0.0001, "loss": 3.1782, "loss/crossentropy": 2.4186841249465942, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.3855607584118843, "loss/reg": 0.0, "step": 15010 }, { "epoch": 0.0988157894736842, "grad_norm": 2.578125, "grad_norm_var": 5.986443837483724, "learning_rate": 0.0001, "loss": 3.0661, "loss/crossentropy": 2.409875476360321, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.23775186911225318, "loss/reg": 0.0, "step": 15020 }, { "epoch": 0.09888157894736842, "grad_norm": 2.21875, "grad_norm_var": 0.055916086832682295, "learning_rate": 0.0001, "loss": 3.1756, "loss/crossentropy": 2.187470281124115, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.2504939392209053, "loss/reg": 0.0, "step": 15030 }, { "epoch": 0.09894736842105263, "grad_norm": 2.296875, "grad_norm_var": 0.058203125, "learning_rate": 0.0001, "loss": 3.2179, "loss/crossentropy": 2.634449529647827, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2524956986308098, "loss/reg": 0.0, "step": 15040 }, { "epoch": 0.09901315789473684, "grad_norm": 2.4375, "grad_norm_var": 0.09206441243489584, "learning_rate": 0.0001, "loss": 3.1804, "loss/crossentropy": 2.3493194222450255, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.24196315556764603, "loss/reg": 0.0, "step": 15050 }, { "epoch": 0.09907894736842106, "grad_norm": 2.328125, "grad_norm_var": 0.0587799072265625, "learning_rate": 0.0001, "loss": 3.1473, "loss/crossentropy": 2.5005852222442626, "loss/hidden": 3.015625, "loss/incoh": 0.0, "loss/logits": 0.2778876781463623, "loss/reg": 0.0, "step": 15060 }, { "epoch": 0.09914473684210526, "grad_norm": 2.890625, "grad_norm_var": 0.07665913899739583, "learning_rate": 0.0001, "loss": 3.2621, "loss/crossentropy": 2.30003308057785, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.28490780740976335, "loss/reg": 0.0, "step": 15070 }, { "epoch": 0.09921052631578947, "grad_norm": 2.78125, "grad_norm_var": 0.07154541015625, "learning_rate": 0.0001, "loss": 3.1472, "loss/crossentropy": 2.2739776968955994, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.2682348355650902, "loss/reg": 0.0, "step": 15080 }, { "epoch": 0.09927631578947368, "grad_norm": 2.484375, "grad_norm_var": 0.044169108072916664, "learning_rate": 0.0001, "loss": 3.1016, "loss/crossentropy": 2.2083258867263793, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23891242742538452, "loss/reg": 0.0, "step": 15090 }, { "epoch": 0.0993421052631579, "grad_norm": 2.34375, "grad_norm_var": 0.0837890625, "learning_rate": 0.0001, "loss": 3.1153, "loss/crossentropy": 2.4516371846199037, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.21450784876942636, "loss/reg": 0.0, "step": 15100 }, { "epoch": 0.09940789473684211, "grad_norm": 2.515625, "grad_norm_var": 0.14197489420572917, "learning_rate": 0.0001, "loss": 3.1489, "loss/crossentropy": 2.2019423693418503, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.278714843839407, "loss/reg": 0.0, "step": 15110 }, { "epoch": 0.09947368421052631, "grad_norm": 1.984375, "grad_norm_var": 0.06102676391601562, "learning_rate": 0.0001, "loss": 3.0573, "loss/crossentropy": 2.0795727133750916, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.24602137356996537, "loss/reg": 0.0, "step": 15120 }, { "epoch": 0.09953947368421052, "grad_norm": 2.484375, "grad_norm_var": 0.04859619140625, "learning_rate": 0.0001, "loss": 3.1684, "loss/crossentropy": 2.0709302008152006, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.27385044246912005, "loss/reg": 0.0, "step": 15130 }, { "epoch": 0.09960526315789474, "grad_norm": 3.359375, "grad_norm_var": 0.10572509765625, "learning_rate": 0.0001, "loss": 3.2319, "loss/crossentropy": 2.1219942808151244, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.31611712723970414, "loss/reg": 0.0, "step": 15140 }, { "epoch": 0.09967105263157895, "grad_norm": 2.265625, "grad_norm_var": 0.30730794270833334, "learning_rate": 0.0001, "loss": 3.3146, "loss/crossentropy": 2.36390962600708, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.2202586129307747, "loss/reg": 0.0, "step": 15150 }, { "epoch": 0.09973684210526315, "grad_norm": 3.015625, "grad_norm_var": 0.3680948893229167, "learning_rate": 0.0001, "loss": 3.1903, "loss/crossentropy": 2.592810094356537, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.31535505652427676, "loss/reg": 0.0, "step": 15160 }, { "epoch": 0.09980263157894737, "grad_norm": 2.171875, "grad_norm_var": 0.06611226399739584, "learning_rate": 0.0001, "loss": 3.1583, "loss/crossentropy": 2.4805197715759277, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.25672143548727033, "loss/reg": 0.0, "step": 15170 }, { "epoch": 0.09986842105263158, "grad_norm": 2.1875, "grad_norm_var": 0.026691691080729166, "learning_rate": 0.0001, "loss": 3.0654, "loss/crossentropy": 2.428369462490082, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2483712613582611, "loss/reg": 0.0, "step": 15180 }, { "epoch": 0.09993421052631579, "grad_norm": 2.203125, "grad_norm_var": 0.16633707682291668, "learning_rate": 0.0001, "loss": 3.2011, "loss/crossentropy": 1.8888699412345886, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.25502131283283236, "loss/reg": 0.0, "step": 15190 }, { "epoch": 0.1, "grad_norm": 2.515625, "grad_norm_var": 0.09675191243489584, "learning_rate": 0.0001, "loss": 3.1893, "loss/crossentropy": 2.1418832421302794, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.2491638869047165, "loss/reg": 0.0, "step": 15200 }, { "epoch": 0.1000657894736842, "grad_norm": 2.46875, "grad_norm_var": 0.08440755208333334, "learning_rate": 0.0001, "loss": 3.2207, "loss/crossentropy": 2.5305609703063965, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.28034003674983976, "loss/reg": 0.0, "step": 15210 }, { "epoch": 0.10013157894736842, "grad_norm": 2.703125, "grad_norm_var": 0.10357666015625, "learning_rate": 0.0001, "loss": 3.2279, "loss/crossentropy": 2.1161023378372192, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.2513925403356552, "loss/reg": 0.0, "step": 15220 }, { "epoch": 0.10019736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.2027008056640625, "learning_rate": 0.0001, "loss": 3.2361, "loss/crossentropy": 2.064685332775116, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.28131103515625, "loss/reg": 0.0, "step": 15230 }, { "epoch": 0.10026315789473685, "grad_norm": 2.28125, "grad_norm_var": 0.14849853515625, "learning_rate": 0.0001, "loss": 3.151, "loss/crossentropy": 2.4768277525901796, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.24746784269809724, "loss/reg": 0.0, "step": 15240 }, { "epoch": 0.10032894736842106, "grad_norm": 2.15625, "grad_norm_var": 0.07953999837239584, "learning_rate": 0.0001, "loss": 3.1794, "loss/crossentropy": 2.2046443462371825, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.26957911550998687, "loss/reg": 0.0, "step": 15250 }, { "epoch": 0.10039473684210526, "grad_norm": 2.484375, "grad_norm_var": 0.12280985514322916, "learning_rate": 0.0001, "loss": 3.0606, "loss/crossentropy": 2.418226730823517, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.21895384043455124, "loss/reg": 0.0, "step": 15260 }, { "epoch": 0.10046052631578947, "grad_norm": 2.46875, "grad_norm_var": 0.09387613932291666, "learning_rate": 0.0001, "loss": 3.1144, "loss/crossentropy": 2.1524213790893554, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.23847964853048326, "loss/reg": 0.0, "step": 15270 }, { "epoch": 0.10052631578947369, "grad_norm": 2.859375, "grad_norm_var": 0.12705459594726562, "learning_rate": 0.0001, "loss": 3.0626, "loss/crossentropy": 1.9326023817062379, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.24662483483552933, "loss/reg": 0.0, "step": 15280 }, { "epoch": 0.1005921052631579, "grad_norm": 2.640625, "grad_norm_var": 8.221414947509766, "learning_rate": 0.0001, "loss": 3.2248, "loss/crossentropy": 2.2926568508148195, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.26595802754163744, "loss/reg": 0.0, "step": 15290 }, { "epoch": 0.1006578947368421, "grad_norm": 2.71875, "grad_norm_var": 8.17893778483073, "learning_rate": 0.0001, "loss": 3.1429, "loss/crossentropy": 2.4002704977989198, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.2911446109414101, "loss/reg": 0.0, "step": 15300 }, { "epoch": 0.10072368421052631, "grad_norm": 2.1875, "grad_norm_var": 0.03648681640625, "learning_rate": 0.0001, "loss": 3.1424, "loss/crossentropy": 2.1603388369083403, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.27493633329868317, "loss/reg": 0.0, "step": 15310 }, { "epoch": 0.10078947368421053, "grad_norm": 3.96875, "grad_norm_var": 0.1912506103515625, "learning_rate": 0.0001, "loss": 3.133, "loss/crossentropy": 2.279471695423126, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.23778771311044694, "loss/reg": 0.0, "step": 15320 }, { "epoch": 0.10085526315789474, "grad_norm": 2.140625, "grad_norm_var": 0.2115875244140625, "learning_rate": 0.0001, "loss": 3.0811, "loss/crossentropy": 2.1736138820648194, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.19810649901628494, "loss/reg": 0.0, "step": 15330 }, { "epoch": 0.10092105263157895, "grad_norm": 2.453125, "grad_norm_var": 0.05100809733072917, "learning_rate": 0.0001, "loss": 3.1508, "loss/crossentropy": 2.2725695729255677, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.2860790088772774, "loss/reg": 0.0, "step": 15340 }, { "epoch": 0.10098684210526315, "grad_norm": 2.6875, "grad_norm_var": 2.056571451822917, "learning_rate": 0.0001, "loss": 3.2468, "loss/crossentropy": 2.215101981163025, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.27275211960077284, "loss/reg": 0.0, "step": 15350 }, { "epoch": 0.10105263157894737, "grad_norm": 2.15625, "grad_norm_var": 2.1172159830729167, "learning_rate": 0.0001, "loss": 3.1519, "loss/crossentropy": 2.416505420207977, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.26659835278987887, "loss/reg": 0.0, "step": 15360 }, { "epoch": 0.10111842105263158, "grad_norm": 2.234375, "grad_norm_var": 0.051708984375, "learning_rate": 0.0001, "loss": 3.0702, "loss/crossentropy": 2.3915260195732118, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.23197022825479507, "loss/reg": 0.0, "step": 15370 }, { "epoch": 0.1011842105263158, "grad_norm": 2.359375, "grad_norm_var": 0.03733622233072917, "learning_rate": 0.0001, "loss": 3.1794, "loss/crossentropy": 2.4574419140815733, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.34623306542634963, "loss/reg": 0.0, "step": 15380 }, { "epoch": 0.10125, "grad_norm": 2.3125, "grad_norm_var": 0.229052734375, "learning_rate": 0.0001, "loss": 3.1256, "loss/crossentropy": 2.1932177782058715, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.22062241584062575, "loss/reg": 0.0, "step": 15390 }, { "epoch": 0.1013157894736842, "grad_norm": 2.234375, "grad_norm_var": 0.08741861979166667, "learning_rate": 0.0001, "loss": 3.1753, "loss/crossentropy": 2.2610700845718386, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.22664773613214492, "loss/reg": 0.0, "step": 15400 }, { "epoch": 0.10138157894736842, "grad_norm": 2.546875, "grad_norm_var": 0.02672119140625, "learning_rate": 0.0001, "loss": 3.1537, "loss/crossentropy": 2.394396644830704, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.25834341049194337, "loss/reg": 0.0, "step": 15410 }, { "epoch": 0.10144736842105263, "grad_norm": 2.484375, "grad_norm_var": 0.034895833333333334, "learning_rate": 0.0001, "loss": 3.1834, "loss/crossentropy": 2.290709137916565, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.2504901379346848, "loss/reg": 0.0, "step": 15420 }, { "epoch": 0.10151315789473685, "grad_norm": 2.734375, "grad_norm_var": 5.647508748372396, "learning_rate": 0.0001, "loss": 3.1705, "loss/crossentropy": 2.597213554382324, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.2691803678870201, "loss/reg": 0.0, "step": 15430 }, { "epoch": 0.10157894736842105, "grad_norm": 2.125, "grad_norm_var": 0.07888997395833333, "learning_rate": 0.0001, "loss": 3.1069, "loss/crossentropy": 2.1887213230133056, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2691598206758499, "loss/reg": 0.0, "step": 15440 }, { "epoch": 0.10164473684210526, "grad_norm": 3.109375, "grad_norm_var": 0.0681793212890625, "learning_rate": 0.0001, "loss": 3.1639, "loss/crossentropy": 2.097235471010208, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2574038878083229, "loss/reg": 0.0, "step": 15450 }, { "epoch": 0.10171052631578947, "grad_norm": 2.125, "grad_norm_var": 0.0678863525390625, "learning_rate": 0.0001, "loss": 3.1301, "loss/crossentropy": 2.2443934082984924, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.24395571500062943, "loss/reg": 0.0, "step": 15460 }, { "epoch": 0.10177631578947369, "grad_norm": 2.265625, "grad_norm_var": 0.1054595947265625, "learning_rate": 0.0001, "loss": 3.2604, "loss/crossentropy": 2.429443156719208, "loss/hidden": 2.990625, "loss/incoh": 0.0, "loss/logits": 0.2746900498867035, "loss/reg": 0.0, "step": 15470 }, { "epoch": 0.1018421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.1326080322265625, "learning_rate": 0.0001, "loss": 3.221, "loss/crossentropy": 2.423279583454132, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.27492027878761294, "loss/reg": 0.0, "step": 15480 }, { "epoch": 0.1019078947368421, "grad_norm": 2.484375, "grad_norm_var": 0.1138092041015625, "learning_rate": 0.0001, "loss": 3.257, "loss/crossentropy": 2.648502016067505, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.2935266062617302, "loss/reg": 0.0, "step": 15490 }, { "epoch": 0.10197368421052631, "grad_norm": 2.40625, "grad_norm_var": 0.1727203369140625, "learning_rate": 0.0001, "loss": 3.1197, "loss/crossentropy": 2.485008704662323, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2535603493452072, "loss/reg": 0.0, "step": 15500 }, { "epoch": 0.10203947368421053, "grad_norm": 2.84375, "grad_norm_var": 0.18816731770833334, "learning_rate": 0.0001, "loss": 3.1804, "loss/crossentropy": 2.180854117870331, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.24670835435390473, "loss/reg": 0.0, "step": 15510 }, { "epoch": 0.10210526315789474, "grad_norm": 2.453125, "grad_norm_var": 0.07157796223958333, "learning_rate": 0.0001, "loss": 3.1547, "loss/crossentropy": 2.5997716546058656, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.2878054201602936, "loss/reg": 0.0, "step": 15520 }, { "epoch": 0.10217105263157895, "grad_norm": 2.5625, "grad_norm_var": 0.154443359375, "learning_rate": 0.0001, "loss": 3.1694, "loss/crossentropy": 2.3764194369316103, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.24656722396612168, "loss/reg": 0.0, "step": 15530 }, { "epoch": 0.10223684210526315, "grad_norm": 2.328125, "grad_norm_var": 0.25536702473958334, "learning_rate": 0.0001, "loss": 3.1474, "loss/crossentropy": 1.9735815048217773, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.24598835706710814, "loss/reg": 0.0, "step": 15540 }, { "epoch": 0.10230263157894737, "grad_norm": 2.8125, "grad_norm_var": 0.1596832275390625, "learning_rate": 0.0001, "loss": 3.1522, "loss/crossentropy": 2.525629758834839, "loss/hidden": 2.9921875, "loss/incoh": 0.0, "loss/logits": 0.33703051060438155, "loss/reg": 0.0, "step": 15550 }, { "epoch": 0.10236842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.1472808837890625, "learning_rate": 0.0001, "loss": 3.212, "loss/crossentropy": 2.03394900560379, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.24415955394506456, "loss/reg": 0.0, "step": 15560 }, { "epoch": 0.1024342105263158, "grad_norm": 2.3125, "grad_norm_var": 0.07866923014322917, "learning_rate": 0.0001, "loss": 3.122, "loss/crossentropy": 2.0943363308906555, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.23014722466468812, "loss/reg": 0.0, "step": 15570 }, { "epoch": 0.1025, "grad_norm": 2.359375, "grad_norm_var": 0.057389322916666666, "learning_rate": 0.0001, "loss": 3.1268, "loss/crossentropy": 2.033569025993347, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2175581559538841, "loss/reg": 0.0, "step": 15580 }, { "epoch": 0.10256578947368421, "grad_norm": 2.46875, "grad_norm_var": 0.08776041666666666, "learning_rate": 0.0001, "loss": 3.0706, "loss/crossentropy": 2.6018026471138, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2438505232334137, "loss/reg": 0.0, "step": 15590 }, { "epoch": 0.10263157894736842, "grad_norm": 2.078125, "grad_norm_var": 0.1537994384765625, "learning_rate": 0.0001, "loss": 3.1439, "loss/crossentropy": 2.196335256099701, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.2767083361744881, "loss/reg": 0.0, "step": 15600 }, { "epoch": 0.10269736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.07605692545572916, "learning_rate": 0.0001, "loss": 3.1523, "loss/crossentropy": 2.326652777194977, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2691790975630283, "loss/reg": 0.0, "step": 15610 }, { "epoch": 0.10276315789473685, "grad_norm": 2.21875, "grad_norm_var": 0.09851455688476562, "learning_rate": 0.0001, "loss": 3.1315, "loss/crossentropy": 2.219775491952896, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2539939820766449, "loss/reg": 0.0, "step": 15620 }, { "epoch": 0.10282894736842105, "grad_norm": 2.078125, "grad_norm_var": 0.14008560180664062, "learning_rate": 0.0001, "loss": 3.1501, "loss/crossentropy": 2.284542143344879, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.27733145356178285, "loss/reg": 0.0, "step": 15630 }, { "epoch": 0.10289473684210526, "grad_norm": 2.578125, "grad_norm_var": 0.10869115193684896, "learning_rate": 0.0001, "loss": 3.1621, "loss/crossentropy": 2.4228519797325134, "loss/hidden": 3.08125, "loss/incoh": 0.0, "loss/logits": 0.32454578429460523, "loss/reg": 0.0, "step": 15640 }, { "epoch": 0.10296052631578947, "grad_norm": 2.21875, "grad_norm_var": 0.11193211873372395, "learning_rate": 0.0001, "loss": 3.1631, "loss/crossentropy": 2.3004646062850953, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.2865068309009075, "loss/reg": 0.0, "step": 15650 }, { "epoch": 0.10302631578947369, "grad_norm": 2.203125, "grad_norm_var": 0.08788655598958334, "learning_rate": 0.0001, "loss": 3.2413, "loss/crossentropy": 2.3326223611831667, "loss/hidden": 3.0703125, "loss/incoh": 0.0, "loss/logits": 0.2786666050553322, "loss/reg": 0.0, "step": 15660 }, { "epoch": 0.1030921052631579, "grad_norm": 2.0625, "grad_norm_var": 0.05933329264322917, "learning_rate": 0.0001, "loss": 3.0799, "loss/crossentropy": 2.308107304573059, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.23495433181524278, "loss/reg": 0.0, "step": 15670 }, { "epoch": 0.1031578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.04383926391601563, "learning_rate": 0.0001, "loss": 3.0889, "loss/crossentropy": 2.3051168084144593, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.278233802318573, "loss/reg": 0.0, "step": 15680 }, { "epoch": 0.10322368421052631, "grad_norm": 2.234375, "grad_norm_var": 1.4827247619628907, "learning_rate": 0.0001, "loss": 3.1864, "loss/crossentropy": 2.2644376397132873, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.22414357215166092, "loss/reg": 0.0, "step": 15690 }, { "epoch": 0.10328947368421053, "grad_norm": 2.296875, "grad_norm_var": 0.962451171875, "learning_rate": 0.0001, "loss": 3.1546, "loss/crossentropy": 2.424470007419586, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.24368802309036255, "loss/reg": 0.0, "step": 15700 }, { "epoch": 0.10335526315789474, "grad_norm": 2.328125, "grad_norm_var": 0.07737223307291667, "learning_rate": 0.0001, "loss": 3.1624, "loss/crossentropy": 2.459938275814056, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23296410143375396, "loss/reg": 0.0, "step": 15710 }, { "epoch": 0.10342105263157894, "grad_norm": 2.59375, "grad_norm_var": 0.06443583170572917, "learning_rate": 0.0001, "loss": 3.1932, "loss/crossentropy": 2.2153470873832704, "loss/hidden": 3.146875, "loss/incoh": 0.0, "loss/logits": 0.33586266040802004, "loss/reg": 0.0, "step": 15720 }, { "epoch": 0.10348684210526315, "grad_norm": 3.53125, "grad_norm_var": 0.17069905598958332, "learning_rate": 0.0001, "loss": 3.257, "loss/crossentropy": 2.2473284363746644, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.24566184133291244, "loss/reg": 0.0, "step": 15730 }, { "epoch": 0.10355263157894737, "grad_norm": 2.78125, "grad_norm_var": 0.163916015625, "learning_rate": 0.0001, "loss": 3.0707, "loss/crossentropy": 2.620988368988037, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.29941551238298414, "loss/reg": 0.0, "step": 15740 }, { "epoch": 0.10361842105263158, "grad_norm": 2.703125, "grad_norm_var": 0.35409749348958336, "learning_rate": 0.0001, "loss": 3.2423, "loss/crossentropy": 2.3977234601974486, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.26520802527666093, "loss/reg": 0.0, "step": 15750 }, { "epoch": 0.1036842105263158, "grad_norm": 2.375, "grad_norm_var": 3.301877391050758e+17, "learning_rate": 0.0001, "loss": 3.2653, "loss/crossentropy": 2.3854068875312806, "loss/hidden": 3.8296875, "loss/incoh": 0.0, "loss/logits": 0.3163578942418098, "loss/reg": 0.0, "step": 15760 }, { "epoch": 0.10375, "grad_norm": 3.09375, "grad_norm_var": 3.301877391679248e+17, "learning_rate": 0.0001, "loss": 3.1843, "loss/crossentropy": 2.2795175433158876, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.26798346936702727, "loss/reg": 0.0, "step": 15770 }, { "epoch": 0.10381578947368421, "grad_norm": 2.578125, "grad_norm_var": 0.1086334228515625, "learning_rate": 0.0001, "loss": 3.1522, "loss/crossentropy": 2.125895881652832, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.23809853047132493, "loss/reg": 0.0, "step": 15780 }, { "epoch": 0.10388157894736842, "grad_norm": 2.21875, "grad_norm_var": 0.09806289672851562, "learning_rate": 0.0001, "loss": 3.121, "loss/crossentropy": 2.3405375838279725, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.23903141915798187, "loss/reg": 0.0, "step": 15790 }, { "epoch": 0.10394736842105264, "grad_norm": 2.1875, "grad_norm_var": 0.05409927368164062, "learning_rate": 0.0001, "loss": 3.1165, "loss/crossentropy": 2.371259605884552, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.24798257499933243, "loss/reg": 0.0, "step": 15800 }, { "epoch": 0.10401315789473685, "grad_norm": 2.625, "grad_norm_var": 0.046727498372395836, "learning_rate": 0.0001, "loss": 3.0808, "loss/crossentropy": 2.5483869433403017, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.2987742185592651, "loss/reg": 0.0, "step": 15810 }, { "epoch": 0.10407894736842105, "grad_norm": 2.8125, "grad_norm_var": 0.43404541015625, "learning_rate": 0.0001, "loss": 3.1913, "loss/crossentropy": 2.3843488097190857, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.41426307857036593, "loss/reg": 0.0, "step": 15820 }, { "epoch": 0.10414473684210526, "grad_norm": 2.1875, "grad_norm_var": 0.42040913899739585, "learning_rate": 0.0001, "loss": 3.1412, "loss/crossentropy": 2.451664757728577, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.2949830338358879, "loss/reg": 0.0, "step": 15830 }, { "epoch": 0.10421052631578948, "grad_norm": 2.046875, "grad_norm_var": 12064558770995.855, "learning_rate": 0.0001, "loss": 3.2876, "loss/crossentropy": 2.540658712387085, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.3069792494177818, "loss/reg": 0.0, "step": 15840 }, { "epoch": 0.10427631578947369, "grad_norm": 2.59375, "grad_norm_var": 0.26617431640625, "learning_rate": 0.0001, "loss": 3.1409, "loss/crossentropy": 2.3011590003967286, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23567215949296952, "loss/reg": 0.0, "step": 15850 }, { "epoch": 0.10434210526315789, "grad_norm": 2.21875, "grad_norm_var": 0.0716217041015625, "learning_rate": 0.0001, "loss": 3.2032, "loss/crossentropy": 2.624694299697876, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.2735205709934235, "loss/reg": 0.0, "step": 15860 }, { "epoch": 0.1044078947368421, "grad_norm": 2.671875, "grad_norm_var": 0.037018839518229166, "learning_rate": 0.0001, "loss": 3.1547, "loss/crossentropy": 2.3426929831504824, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.26750322580337527, "loss/reg": 0.0, "step": 15870 }, { "epoch": 0.10447368421052632, "grad_norm": 2.625, "grad_norm_var": 0.0340728759765625, "learning_rate": 0.0001, "loss": 3.1257, "loss/crossentropy": 2.0538764238357543, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.23220817893743514, "loss/reg": 0.0, "step": 15880 }, { "epoch": 0.10453947368421053, "grad_norm": 2.484375, "grad_norm_var": 0.10313084920247396, "learning_rate": 0.0001, "loss": 3.0791, "loss/crossentropy": 2.4493046522140505, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.264581099152565, "loss/reg": 0.0, "step": 15890 }, { "epoch": 0.10460526315789474, "grad_norm": 2.140625, "grad_norm_var": 0.08592910766601562, "learning_rate": 0.0001, "loss": 3.1153, "loss/crossentropy": 2.384776270389557, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.20801746100187302, "loss/reg": 0.0, "step": 15900 }, { "epoch": 0.10467105263157894, "grad_norm": 2.203125, "grad_norm_var": 0.03535054524739583, "learning_rate": 0.0001, "loss": 3.1818, "loss/crossentropy": 2.2780667304992677, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.33164310455322266, "loss/reg": 0.0, "step": 15910 }, { "epoch": 0.10473684210526316, "grad_norm": 2.3125, "grad_norm_var": 0.04937744140625, "learning_rate": 0.0001, "loss": 3.1317, "loss/crossentropy": 2.3224631786346435, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.26515267193317416, "loss/reg": 0.0, "step": 15920 }, { "epoch": 0.10480263157894737, "grad_norm": 3.375, "grad_norm_var": 0.09521077473958334, "learning_rate": 0.0001, "loss": 3.1393, "loss/crossentropy": 2.26437486410141, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.25974428951740264, "loss/reg": 0.0, "step": 15930 }, { "epoch": 0.10486842105263158, "grad_norm": 2.265625, "grad_norm_var": 0.10048828125, "learning_rate": 0.0001, "loss": 3.203, "loss/crossentropy": 2.216641104221344, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.3172867178916931, "loss/reg": 0.0, "step": 15940 }, { "epoch": 0.10493421052631578, "grad_norm": 2.15625, "grad_norm_var": 0.0614898681640625, "learning_rate": 0.0001, "loss": 3.0768, "loss/crossentropy": 2.6155009508132934, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2578451469540596, "loss/reg": 0.0, "step": 15950 }, { "epoch": 0.105, "grad_norm": 2.234375, "grad_norm_var": 0.060269927978515624, "learning_rate": 0.0001, "loss": 3.161, "loss/crossentropy": 2.5140405654907227, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2738805189728737, "loss/reg": 0.0, "step": 15960 }, { "epoch": 0.10506578947368421, "grad_norm": 2.59375, "grad_norm_var": 0.050176747639973956, "learning_rate": 0.0001, "loss": 3.0829, "loss/crossentropy": 2.133234918117523, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2213057592511177, "loss/reg": 0.0, "step": 15970 }, { "epoch": 0.10513157894736842, "grad_norm": 2.515625, "grad_norm_var": 0.18106180826822918, "learning_rate": 0.0001, "loss": 3.2666, "loss/crossentropy": 2.489140582084656, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.2818035438656807, "loss/reg": 0.0, "step": 15980 }, { "epoch": 0.10519736842105264, "grad_norm": 2.390625, "grad_norm_var": 0.6164947509765625, "learning_rate": 0.0001, "loss": 3.1722, "loss/crossentropy": 2.51891051530838, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2563257798552513, "loss/reg": 0.0, "step": 15990 }, { "epoch": 0.10526315789473684, "grad_norm": 2.75, "grad_norm_var": 0.8845987955729167, "learning_rate": 0.0001, "loss": 3.1763, "loss/crossentropy": 2.2177582025527953, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.2367611363530159, "loss/reg": 0.0, "step": 16000 }, { "epoch": 0.10532894736842105, "grad_norm": 2.5625, "grad_norm_var": 0.451318359375, "learning_rate": 0.0001, "loss": 3.1408, "loss/crossentropy": 2.2017379879951475, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.20062217488884926, "loss/reg": 0.0, "step": 16010 }, { "epoch": 0.10539473684210526, "grad_norm": 2.390625, "grad_norm_var": 0.050439453125, "learning_rate": 0.0001, "loss": 3.1466, "loss/crossentropy": 2.2292946934700013, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.2783455640077591, "loss/reg": 0.0, "step": 16020 }, { "epoch": 0.10546052631578948, "grad_norm": 1.984375, "grad_norm_var": 0.1126129150390625, "learning_rate": 0.0001, "loss": 3.1018, "loss/crossentropy": 2.530960404872894, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.23581028431653978, "loss/reg": 0.0, "step": 16030 }, { "epoch": 0.10552631578947369, "grad_norm": 2.109375, "grad_norm_var": 0.08625386555989584, "learning_rate": 0.0001, "loss": 3.1222, "loss/crossentropy": 2.432818961143494, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.23912513256072998, "loss/reg": 0.0, "step": 16040 }, { "epoch": 0.10559210526315789, "grad_norm": 2.234375, "grad_norm_var": 0.1727203369140625, "learning_rate": 0.0001, "loss": 3.1108, "loss/crossentropy": 2.481075167655945, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2601512670516968, "loss/reg": 0.0, "step": 16050 }, { "epoch": 0.1056578947368421, "grad_norm": 2.171875, "grad_norm_var": 0.22974853515625, "learning_rate": 0.0001, "loss": 3.1102, "loss/crossentropy": 2.347836995124817, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.2967532381415367, "loss/reg": 0.0, "step": 16060 }, { "epoch": 0.10572368421052632, "grad_norm": 2.421875, "grad_norm_var": 3.920637003580729, "learning_rate": 0.0001, "loss": 3.1789, "loss/crossentropy": 2.2461806178092956, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.2796615108847618, "loss/reg": 0.0, "step": 16070 }, { "epoch": 0.10578947368421053, "grad_norm": 3.234375, "grad_norm_var": 3.8184529622395833, "learning_rate": 0.0001, "loss": 3.2028, "loss/crossentropy": 2.2602067947387696, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.27418701648712157, "loss/reg": 0.0, "step": 16080 }, { "epoch": 0.10585526315789473, "grad_norm": 2.546875, "grad_norm_var": 0.0978912353515625, "learning_rate": 0.0001, "loss": 3.087, "loss/crossentropy": 2.4076030969619753, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.2339254654943943, "loss/reg": 0.0, "step": 16090 }, { "epoch": 0.10592105263157894, "grad_norm": 8.8125, "grad_norm_var": 2.634016927083333, "learning_rate": 0.0001, "loss": 3.2404, "loss/crossentropy": 1.9294680893421172, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.24796108528971672, "loss/reg": 0.0, "step": 16100 }, { "epoch": 0.10598684210526316, "grad_norm": 2.5, "grad_norm_var": 2.7831013997395835, "learning_rate": 0.0001, "loss": 3.1866, "loss/crossentropy": 1.8769205152988433, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.2611444815993309, "loss/reg": 0.0, "step": 16110 }, { "epoch": 0.10605263157894737, "grad_norm": 2.703125, "grad_norm_var": 0.76617431640625, "learning_rate": 0.0001, "loss": 3.2357, "loss/crossentropy": 2.2888787627220153, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.2839828670024872, "loss/reg": 0.0, "step": 16120 }, { "epoch": 0.10611842105263158, "grad_norm": 2.359375, "grad_norm_var": 0.04062093098958333, "learning_rate": 0.0001, "loss": 3.1608, "loss/crossentropy": 2.313342797756195, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2872613161802292, "loss/reg": 0.0, "step": 16130 }, { "epoch": 0.10618421052631578, "grad_norm": 2.28125, "grad_norm_var": 0.4191691080729167, "learning_rate": 0.0001, "loss": 3.1753, "loss/crossentropy": 2.6026643037796022, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.28212335854768755, "loss/reg": 0.0, "step": 16140 }, { "epoch": 0.10625, "grad_norm": 2.4375, "grad_norm_var": 0.41529541015625, "learning_rate": 0.0001, "loss": 3.1346, "loss/crossentropy": 2.556915271282196, "loss/hidden": 3.1078125, "loss/incoh": 0.0, "loss/logits": 0.2837150752544403, "loss/reg": 0.0, "step": 16150 }, { "epoch": 0.10631578947368421, "grad_norm": 2.75, "grad_norm_var": 0.053441365559895836, "learning_rate": 0.0001, "loss": 3.1471, "loss/crossentropy": 2.399764931201935, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.2460502192378044, "loss/reg": 0.0, "step": 16160 }, { "epoch": 0.10638157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.8765462239583334, "learning_rate": 0.0001, "loss": 3.2831, "loss/crossentropy": 2.2979734420776365, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.2876197725534439, "loss/reg": 0.0, "step": 16170 }, { "epoch": 0.10644736842105264, "grad_norm": 2.4375, "grad_norm_var": 0.9296834309895833, "learning_rate": 0.0001, "loss": 3.2187, "loss/crossentropy": 2.2085362553596495, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.24199980795383452, "loss/reg": 0.0, "step": 16180 }, { "epoch": 0.10651315789473684, "grad_norm": 2.234375, "grad_norm_var": 0.06541239420572917, "learning_rate": 0.0001, "loss": 3.1524, "loss/crossentropy": 2.4575427293777468, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2581774353981018, "loss/reg": 0.0, "step": 16190 }, { "epoch": 0.10657894736842105, "grad_norm": 3.34375, "grad_norm_var": 0.2840983072916667, "learning_rate": 0.0001, "loss": 3.2356, "loss/crossentropy": 2.444900369644165, "loss/hidden": 3.1625, "loss/incoh": 0.0, "loss/logits": 0.295256008207798, "loss/reg": 0.0, "step": 16200 }, { "epoch": 0.10664473684210526, "grad_norm": 2.15625, "grad_norm_var": 0.27533137003580727, "learning_rate": 0.0001, "loss": 3.0994, "loss/crossentropy": 2.5305007696151733, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.2888329938054085, "loss/reg": 0.0, "step": 16210 }, { "epoch": 0.10671052631578948, "grad_norm": 2.609375, "grad_norm_var": 0.6374224344889323, "learning_rate": 0.0001, "loss": 3.1837, "loss/crossentropy": 2.394989788532257, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.22691280096769334, "loss/reg": 0.0, "step": 16220 }, { "epoch": 0.10677631578947368, "grad_norm": 2.546875, "grad_norm_var": 0.6025950113932291, "learning_rate": 0.0001, "loss": 3.1035, "loss/crossentropy": 2.3761568784713747, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.25212556272745135, "loss/reg": 0.0, "step": 16230 }, { "epoch": 0.10684210526315789, "grad_norm": 2.390625, "grad_norm_var": 0.07968343098958333, "learning_rate": 0.0001, "loss": 3.1507, "loss/crossentropy": 2.1518751621246337, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.25603573620319364, "loss/reg": 0.0, "step": 16240 }, { "epoch": 0.1069078947368421, "grad_norm": 2.03125, "grad_norm_var": 0.0736968994140625, "learning_rate": 0.0001, "loss": 3.0942, "loss/crossentropy": 2.2170523405075073, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.2497616469860077, "loss/reg": 0.0, "step": 16250 }, { "epoch": 0.10697368421052632, "grad_norm": 2.671875, "grad_norm_var": 0.15055338541666666, "learning_rate": 0.0001, "loss": 3.18, "loss/crossentropy": 2.2126652002334595, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.2371530830860138, "loss/reg": 0.0, "step": 16260 }, { "epoch": 0.10703947368421053, "grad_norm": 2.359375, "grad_norm_var": 0.1051666259765625, "learning_rate": 0.0001, "loss": 3.1175, "loss/crossentropy": 2.5849027037620544, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.31244791448116305, "loss/reg": 0.0, "step": 16270 }, { "epoch": 0.10710526315789473, "grad_norm": 2.34375, "grad_norm_var": 0.05666910807291667, "learning_rate": 0.0001, "loss": 3.2064, "loss/crossentropy": 1.9444570660591125, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2180204689502716, "loss/reg": 0.0, "step": 16280 }, { "epoch": 0.10717105263157894, "grad_norm": 2.203125, "grad_norm_var": 0.08806050618489583, "learning_rate": 0.0001, "loss": 3.0722, "loss/crossentropy": 2.3104967713356017, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.2631675943732262, "loss/reg": 0.0, "step": 16290 }, { "epoch": 0.10723684210526316, "grad_norm": 2.28125, "grad_norm_var": 0.018146769205729166, "learning_rate": 0.0001, "loss": 3.0587, "loss/crossentropy": 2.1140322208404543, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2300342008471489, "loss/reg": 0.0, "step": 16300 }, { "epoch": 0.10730263157894737, "grad_norm": 2.40625, "grad_norm_var": 0.036774698893229166, "learning_rate": 0.0001, "loss": 3.2022, "loss/crossentropy": 2.5614047765731813, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2788904532790184, "loss/reg": 0.0, "step": 16310 }, { "epoch": 0.10736842105263159, "grad_norm": 2.734375, "grad_norm_var": 0.21725972493489584, "learning_rate": 0.0001, "loss": 3.1511, "loss/crossentropy": 1.8924081802368165, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2040714904665947, "loss/reg": 0.0, "step": 16320 }, { "epoch": 0.10743421052631578, "grad_norm": 2.234375, "grad_norm_var": 0.23220926920572918, "learning_rate": 0.0001, "loss": 3.1014, "loss/crossentropy": 2.342508816719055, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.27500451505184176, "loss/reg": 0.0, "step": 16330 }, { "epoch": 0.1075, "grad_norm": 2.21875, "grad_norm_var": 0.5662424723307292, "learning_rate": 0.0001, "loss": 3.2912, "loss/crossentropy": 2.3959147095680238, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2863708436489105, "loss/reg": 0.0, "step": 16340 }, { "epoch": 0.10756578947368421, "grad_norm": 2.078125, "grad_norm_var": 0.11278889973958334, "learning_rate": 0.0001, "loss": 3.1532, "loss/crossentropy": 2.166390228271484, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.23770884573459625, "loss/reg": 0.0, "step": 16350 }, { "epoch": 0.10763157894736843, "grad_norm": 2.265625, "grad_norm_var": 0.06780776977539063, "learning_rate": 0.0001, "loss": 3.0399, "loss/crossentropy": 2.3057939767837525, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.2797847852110863, "loss/reg": 0.0, "step": 16360 }, { "epoch": 0.10769736842105262, "grad_norm": 2.390625, "grad_norm_var": 0.038826243082682295, "learning_rate": 0.0001, "loss": 3.12, "loss/crossentropy": 2.265635335445404, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.23047662824392318, "loss/reg": 0.0, "step": 16370 }, { "epoch": 0.10776315789473684, "grad_norm": 2.296875, "grad_norm_var": 0.08754781087239584, "learning_rate": 0.0001, "loss": 3.0758, "loss/crossentropy": 2.6614980459213258, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.23254811465740205, "loss/reg": 0.0, "step": 16380 }, { "epoch": 0.10782894736842105, "grad_norm": 3.390625, "grad_norm_var": 0.14463882446289061, "learning_rate": 0.0001, "loss": 3.1805, "loss/crossentropy": 2.419999623298645, "loss/hidden": 3.0984375, "loss/incoh": 0.0, "loss/logits": 0.3057109400629997, "loss/reg": 0.0, "step": 16390 }, { "epoch": 0.10789473684210527, "grad_norm": 2.234375, "grad_norm_var": 0.12823893229166666, "learning_rate": 0.0001, "loss": 3.1467, "loss/crossentropy": 2.145538020133972, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.23782579749822616, "loss/reg": 0.0, "step": 16400 }, { "epoch": 0.10796052631578948, "grad_norm": 2.40625, "grad_norm_var": 0.07416890462239584, "learning_rate": 0.0001, "loss": 3.1471, "loss/crossentropy": 2.3262511491775513, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.23651919960975648, "loss/reg": 0.0, "step": 16410 }, { "epoch": 0.10802631578947368, "grad_norm": 2.453125, "grad_norm_var": 0.052245076497395834, "learning_rate": 0.0001, "loss": 3.1573, "loss/crossentropy": 2.446836495399475, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.22134366482496262, "loss/reg": 0.0, "step": 16420 }, { "epoch": 0.10809210526315789, "grad_norm": 2.90625, "grad_norm_var": 0.12043355305989584, "learning_rate": 0.0001, "loss": 3.2359, "loss/crossentropy": 2.3113945603370665, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2689176544547081, "loss/reg": 0.0, "step": 16430 }, { "epoch": 0.1081578947368421, "grad_norm": 2.078125, "grad_norm_var": 0.10204671223958334, "learning_rate": 0.0001, "loss": 3.1679, "loss/crossentropy": 2.422752869129181, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.306715852022171, "loss/reg": 0.0, "step": 16440 }, { "epoch": 0.10822368421052632, "grad_norm": 2.296875, "grad_norm_var": 0.06880594889322916, "learning_rate": 0.0001, "loss": 3.147, "loss/crossentropy": 2.048931634426117, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2303798720240593, "loss/reg": 0.0, "step": 16450 }, { "epoch": 0.10828947368421053, "grad_norm": 2.234375, "grad_norm_var": 0.15419514973958334, "learning_rate": 0.0001, "loss": 3.1877, "loss/crossentropy": 2.68409343957901, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.25028681606054304, "loss/reg": 0.0, "step": 16460 }, { "epoch": 0.10835526315789473, "grad_norm": 2.3125, "grad_norm_var": 0.09561258951822917, "learning_rate": 0.0001, "loss": 3.0881, "loss/crossentropy": 2.3087923645973207, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.2732239991426468, "loss/reg": 0.0, "step": 16470 }, { "epoch": 0.10842105263157895, "grad_norm": 2.546875, "grad_norm_var": 0.14474283854166667, "learning_rate": 0.0001, "loss": 3.1675, "loss/crossentropy": 2.3147490501403807, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.25520786345005037, "loss/reg": 0.0, "step": 16480 }, { "epoch": 0.10848684210526316, "grad_norm": 2.21875, "grad_norm_var": 0.12763671875, "learning_rate": 0.0001, "loss": 3.1948, "loss/crossentropy": 2.4046076416969298, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.24635857343673706, "loss/reg": 0.0, "step": 16490 }, { "epoch": 0.10855263157894737, "grad_norm": 3.234375, "grad_norm_var": 0.19675267537434896, "learning_rate": 0.0001, "loss": 3.134, "loss/crossentropy": 2.1358281135559083, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.20830129384994506, "loss/reg": 0.0, "step": 16500 }, { "epoch": 0.10861842105263157, "grad_norm": 2.25, "grad_norm_var": 0.20981216430664062, "learning_rate": 0.0001, "loss": 3.1161, "loss/crossentropy": 2.2100176930427553, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.22965313643217086, "loss/reg": 0.0, "step": 16510 }, { "epoch": 0.10868421052631579, "grad_norm": 2.171875, "grad_norm_var": 0.025813802083333334, "learning_rate": 0.0001, "loss": 3.1402, "loss/crossentropy": 2.1080865144729612, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.24372481554746628, "loss/reg": 0.0, "step": 16520 }, { "epoch": 0.10875, "grad_norm": 2.390625, "grad_norm_var": 0.03843994140625, "learning_rate": 0.0001, "loss": 3.112, "loss/crossentropy": 2.184442663192749, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.25090422928333284, "loss/reg": 0.0, "step": 16530 }, { "epoch": 0.10881578947368421, "grad_norm": 2.46875, "grad_norm_var": 0.03850682576497396, "learning_rate": 0.0001, "loss": 3.0917, "loss/crossentropy": 2.2426281213760375, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.26249563246965407, "loss/reg": 0.0, "step": 16540 }, { "epoch": 0.10888157894736843, "grad_norm": 2.34375, "grad_norm_var": 0.037082672119140625, "learning_rate": 0.0001, "loss": 3.1533, "loss/crossentropy": 2.4359527707099913, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.28122682869434357, "loss/reg": 0.0, "step": 16550 }, { "epoch": 0.10894736842105263, "grad_norm": 2.515625, "grad_norm_var": 0.06422526041666667, "learning_rate": 0.0001, "loss": 3.1086, "loss/crossentropy": 2.679885816574097, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.25820834636688234, "loss/reg": 0.0, "step": 16560 }, { "epoch": 0.10901315789473684, "grad_norm": 2.375, "grad_norm_var": 0.033543904622395836, "learning_rate": 0.0001, "loss": 3.1226, "loss/crossentropy": 2.230025511980057, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.24839217215776443, "loss/reg": 0.0, "step": 16570 }, { "epoch": 0.10907894736842105, "grad_norm": 2.171875, "grad_norm_var": 0.08507486979166666, "learning_rate": 0.0001, "loss": 3.158, "loss/crossentropy": 2.3745269417762755, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.22690069079399108, "loss/reg": 0.0, "step": 16580 }, { "epoch": 0.10914473684210527, "grad_norm": 2.03125, "grad_norm_var": 0.07500712076822917, "learning_rate": 0.0001, "loss": 3.0464, "loss/crossentropy": 2.3726358652114867, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.22285908311605454, "loss/reg": 0.0, "step": 16590 }, { "epoch": 0.10921052631578948, "grad_norm": 2.984375, "grad_norm_var": 0.0905426025390625, "learning_rate": 0.0001, "loss": 3.0802, "loss/crossentropy": 2.297460901737213, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.24248487651348113, "loss/reg": 0.0, "step": 16600 }, { "epoch": 0.10927631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.42377827962239584, "learning_rate": 0.0001, "loss": 3.1369, "loss/crossentropy": 2.1492306351661683, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.25582308024168016, "loss/reg": 0.0, "step": 16610 }, { "epoch": 0.1093421052631579, "grad_norm": 2.703125, "grad_norm_var": 0.38531494140625, "learning_rate": 0.0001, "loss": 3.1291, "loss/crossentropy": 2.1944116175174715, "loss/hidden": 3.025, "loss/incoh": 0.0, "loss/logits": 0.26006165742874143, "loss/reg": 0.0, "step": 16620 }, { "epoch": 0.1094078947368421, "grad_norm": 2.296875, "grad_norm_var": 0.031305948893229164, "learning_rate": 0.0001, "loss": 3.0966, "loss/crossentropy": 2.2580446600914, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.2571749180555344, "loss/reg": 0.0, "step": 16630 }, { "epoch": 0.10947368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.08385009765625, "learning_rate": 0.0001, "loss": 3.0758, "loss/crossentropy": 2.711108660697937, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.23514284193515778, "loss/reg": 0.0, "step": 16640 }, { "epoch": 0.10953947368421052, "grad_norm": 2.53125, "grad_norm_var": 0.39968973795572915, "learning_rate": 0.0001, "loss": 3.212, "loss/crossentropy": 2.318574833869934, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.3364941954612732, "loss/reg": 0.0, "step": 16650 }, { "epoch": 0.10960526315789473, "grad_norm": 2.90625, "grad_norm_var": 0.5502431233723958, "learning_rate": 0.0001, "loss": 3.1398, "loss/crossentropy": 2.5402591228485107, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2698328003287315, "loss/reg": 0.0, "step": 16660 }, { "epoch": 0.10967105263157895, "grad_norm": 2.578125, "grad_norm_var": 0.7628651936848958, "learning_rate": 0.0001, "loss": 3.2125, "loss/crossentropy": 2.4290089428424837, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.23459196090698242, "loss/reg": 0.0, "step": 16670 }, { "epoch": 0.10973684210526316, "grad_norm": 2.546875, "grad_norm_var": 0.5333984375, "learning_rate": 0.0001, "loss": 3.1568, "loss/crossentropy": 2.4715004682540895, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2527153715491295, "loss/reg": 0.0, "step": 16680 }, { "epoch": 0.10980263157894737, "grad_norm": 2.421875, "grad_norm_var": 0.01929931640625, "learning_rate": 0.0001, "loss": 3.1188, "loss/crossentropy": 2.565124809741974, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.2764893934130669, "loss/reg": 0.0, "step": 16690 }, { "epoch": 0.10986842105263157, "grad_norm": 2.609375, "grad_norm_var": 0.7143287658691406, "learning_rate": 0.0001, "loss": 3.1851, "loss/crossentropy": 2.1823901176452636, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.2804734021425247, "loss/reg": 0.0, "step": 16700 }, { "epoch": 0.10993421052631579, "grad_norm": 3.75, "grad_norm_var": 0.800158437093099, "learning_rate": 0.0001, "loss": 3.1599, "loss/crossentropy": 2.339951229095459, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.23388542532920836, "loss/reg": 0.0, "step": 16710 }, { "epoch": 0.11, "grad_norm": 3.0625, "grad_norm_var": 0.46857808430989584, "learning_rate": 0.0001, "loss": 3.168, "loss/crossentropy": 2.5235843658447266, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.25846762359142306, "loss/reg": 0.0, "step": 16720 }, { "epoch": 0.11006578947368421, "grad_norm": 2.34375, "grad_norm_var": 0.2997792561848958, "learning_rate": 0.0001, "loss": 3.199, "loss/crossentropy": 2.4483086824417115, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.24846359938383103, "loss/reg": 0.0, "step": 16730 }, { "epoch": 0.11013157894736843, "grad_norm": 2.703125, "grad_norm_var": 0.0768218994140625, "learning_rate": 0.0001, "loss": 3.1155, "loss/crossentropy": 2.318689703941345, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.25379282981157303, "loss/reg": 0.0, "step": 16740 }, { "epoch": 0.11019736842105263, "grad_norm": 2.328125, "grad_norm_var": 0.04757258097330729, "learning_rate": 0.0001, "loss": 3.0428, "loss/crossentropy": 2.3445772767066955, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.26050496101379395, "loss/reg": 0.0, "step": 16750 }, { "epoch": 0.11026315789473684, "grad_norm": 3.265625, "grad_norm_var": 0.09231669108072917, "learning_rate": 0.0001, "loss": 3.1426, "loss/crossentropy": 2.2978576898574827, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.23892502784729003, "loss/reg": 0.0, "step": 16760 }, { "epoch": 0.11032894736842105, "grad_norm": 2.265625, "grad_norm_var": 0.09435221354166666, "learning_rate": 0.0001, "loss": 3.0687, "loss/crossentropy": 2.15471470952034, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.22209101915359497, "loss/reg": 0.0, "step": 16770 }, { "epoch": 0.11039473684210527, "grad_norm": 2.625, "grad_norm_var": 0.084326171875, "learning_rate": 0.0001, "loss": 3.0934, "loss/crossentropy": 2.0944029092788696, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2732590340077877, "loss/reg": 0.0, "step": 16780 }, { "epoch": 0.11046052631578947, "grad_norm": 2.296875, "grad_norm_var": 0.07172749837239584, "learning_rate": 0.0001, "loss": 3.196, "loss/crossentropy": 2.4396159648895264, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.25805698037147523, "loss/reg": 0.0, "step": 16790 }, { "epoch": 0.11052631578947368, "grad_norm": 2.53125, "grad_norm_var": 0.04521382649739583, "learning_rate": 0.0001, "loss": 3.1492, "loss/crossentropy": 2.3519849300384523, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.24649366587400437, "loss/reg": 0.0, "step": 16800 }, { "epoch": 0.1105921052631579, "grad_norm": 2.90625, "grad_norm_var": 0.06435445149739584, "learning_rate": 0.0001, "loss": 3.1366, "loss/crossentropy": 2.2999491453170777, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2394917294383049, "loss/reg": 0.0, "step": 16810 }, { "epoch": 0.11065789473684211, "grad_norm": 2.3125, "grad_norm_var": 0.09391988118489583, "learning_rate": 0.0001, "loss": 3.1441, "loss/crossentropy": 2.4930548071861267, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.2843811124563217, "loss/reg": 0.0, "step": 16820 }, { "epoch": 0.11072368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.23037007649739583, "learning_rate": 0.0001, "loss": 3.1744, "loss/crossentropy": 2.327615487575531, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.25857883393764497, "loss/reg": 0.0, "step": 16830 }, { "epoch": 0.11078947368421052, "grad_norm": 2.046875, "grad_norm_var": 0.08505859375, "learning_rate": 0.0001, "loss": 3.1966, "loss/crossentropy": 2.0340147018432617, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.22139777690172197, "loss/reg": 0.0, "step": 16840 }, { "epoch": 0.11085526315789473, "grad_norm": 2.359375, "grad_norm_var": 0.04459228515625, "learning_rate": 0.0001, "loss": 3.0559, "loss/crossentropy": 2.327996277809143, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2415407806634903, "loss/reg": 0.0, "step": 16850 }, { "epoch": 0.11092105263157895, "grad_norm": 2.296875, "grad_norm_var": 0.15735677083333333, "learning_rate": 0.0001, "loss": 3.1824, "loss/crossentropy": 2.3824142098426817, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.28444231003522874, "loss/reg": 0.0, "step": 16860 }, { "epoch": 0.11098684210526316, "grad_norm": 2.453125, "grad_norm_var": 0.2694498697916667, "learning_rate": 0.0001, "loss": 3.1079, "loss/crossentropy": 2.4686101198196413, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.23947114795446395, "loss/reg": 0.0, "step": 16870 }, { "epoch": 0.11105263157894738, "grad_norm": 2.53125, "grad_norm_var": 0.2833811442057292, "learning_rate": 0.0001, "loss": 3.0544, "loss/crossentropy": 2.2384608387947083, "loss/hidden": 3.23125, "loss/incoh": 0.0, "loss/logits": 0.29755171537399294, "loss/reg": 0.0, "step": 16880 }, { "epoch": 0.11111842105263157, "grad_norm": 2.484375, "grad_norm_var": 0.11591389973958334, "learning_rate": 0.0001, "loss": 3.1316, "loss/crossentropy": 2.065184140205383, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.22740320414304732, "loss/reg": 0.0, "step": 16890 }, { "epoch": 0.11118421052631579, "grad_norm": 2.46875, "grad_norm_var": 0.14423421223958333, "learning_rate": 0.0001, "loss": 3.1867, "loss/crossentropy": 2.241389238834381, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2434727743268013, "loss/reg": 0.0, "step": 16900 }, { "epoch": 0.11125, "grad_norm": 2.390625, "grad_norm_var": 0.14661458333333333, "learning_rate": 0.0001, "loss": 3.1063, "loss/crossentropy": 2.1571604132652284, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.21824948787689208, "loss/reg": 0.0, "step": 16910 }, { "epoch": 0.11131578947368422, "grad_norm": 2.1875, "grad_norm_var": 0.9939605712890625, "learning_rate": 0.0001, "loss": 3.2191, "loss/crossentropy": 2.4117300748825072, "loss/hidden": 3.2203125, "loss/incoh": 0.0, "loss/logits": 0.33291524201631545, "loss/reg": 0.0, "step": 16920 }, { "epoch": 0.11138157894736841, "grad_norm": 2.46875, "grad_norm_var": 0.9513580322265625, "learning_rate": 0.0001, "loss": 3.2001, "loss/crossentropy": 2.490310883522034, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.2631781131029129, "loss/reg": 0.0, "step": 16930 }, { "epoch": 0.11144736842105263, "grad_norm": 2.421875, "grad_norm_var": 0.12056884765625, "learning_rate": 0.0001, "loss": 3.203, "loss/crossentropy": 2.3007746815681456, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.2449628993868828, "loss/reg": 0.0, "step": 16940 }, { "epoch": 0.11151315789473684, "grad_norm": 2.515625, "grad_norm_var": 0.11777242024739583, "learning_rate": 0.0001, "loss": 3.1245, "loss/crossentropy": 2.3470576763153077, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.24132359698414801, "loss/reg": 0.0, "step": 16950 }, { "epoch": 0.11157894736842106, "grad_norm": 2.265625, "grad_norm_var": 0.05214742024739583, "learning_rate": 0.0001, "loss": 3.115, "loss/crossentropy": 2.4354201793670653, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.22888378351926802, "loss/reg": 0.0, "step": 16960 }, { "epoch": 0.11164473684210527, "grad_norm": 2.3125, "grad_norm_var": 0.07139383951822917, "learning_rate": 0.0001, "loss": 3.1537, "loss/crossentropy": 2.1988754749298094, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.22949053347110748, "loss/reg": 0.0, "step": 16970 }, { "epoch": 0.11171052631578947, "grad_norm": 2.375, "grad_norm_var": 0.06669921875, "learning_rate": 0.0001, "loss": 3.113, "loss/crossentropy": 2.2436501502990724, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.26571269184350965, "loss/reg": 0.0, "step": 16980 }, { "epoch": 0.11177631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.1141998291015625, "learning_rate": 0.0001, "loss": 3.2373, "loss/crossentropy": 2.0406009197235107, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.2788648784160614, "loss/reg": 0.0, "step": 16990 }, { "epoch": 0.1118421052631579, "grad_norm": 3.5, "grad_norm_var": 0.13699544270833333, "learning_rate": 0.0001, "loss": 3.1371, "loss/crossentropy": 2.282533049583435, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.27586452960968016, "loss/reg": 0.0, "step": 17000 }, { "epoch": 0.11190789473684211, "grad_norm": 2.390625, "grad_norm_var": 0.13681233723958333, "learning_rate": 0.0001, "loss": 3.1898, "loss/crossentropy": 2.192341995239258, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.2728851273655891, "loss/reg": 0.0, "step": 17010 }, { "epoch": 0.11197368421052632, "grad_norm": 2.9375, "grad_norm_var": 0.08626302083333333, "learning_rate": 0.0001, "loss": 3.1208, "loss/crossentropy": 2.036851680278778, "loss/hidden": 3.090625, "loss/incoh": 0.0, "loss/logits": 0.29492041319608686, "loss/reg": 0.0, "step": 17020 }, { "epoch": 0.11203947368421052, "grad_norm": 3.171875, "grad_norm_var": 0.38601888020833336, "learning_rate": 0.0001, "loss": 3.1351, "loss/crossentropy": 2.013105309009552, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.21977066546678542, "loss/reg": 0.0, "step": 17030 }, { "epoch": 0.11210526315789474, "grad_norm": 2.515625, "grad_norm_var": 0.515862782796224, "learning_rate": 0.0001, "loss": 3.0822, "loss/crossentropy": 2.2523082733154296, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.23816863894462587, "loss/reg": 0.0, "step": 17040 }, { "epoch": 0.11217105263157895, "grad_norm": 2.4375, "grad_norm_var": 0.13596598307291666, "learning_rate": 0.0001, "loss": 3.0894, "loss/crossentropy": 2.474255657196045, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.28221793919801713, "loss/reg": 0.0, "step": 17050 }, { "epoch": 0.11223684210526316, "grad_norm": 2.375, "grad_norm_var": 2.8374436804564963e+17, "learning_rate": 0.0001, "loss": 3.2437, "loss/crossentropy": 2.5942065715789795, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.26605169773101806, "loss/reg": 0.0, "step": 17060 }, { "epoch": 0.11230263157894736, "grad_norm": 1.984375, "grad_norm_var": 0.05279541015625, "learning_rate": 0.0001, "loss": 3.0525, "loss/crossentropy": 2.034271013736725, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.2154896892607212, "loss/reg": 0.0, "step": 17070 }, { "epoch": 0.11236842105263158, "grad_norm": 2.25, "grad_norm_var": 0.050902303059895834, "learning_rate": 0.0001, "loss": 3.1033, "loss/crossentropy": 2.229046678543091, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.22233688235282897, "loss/reg": 0.0, "step": 17080 }, { "epoch": 0.11243421052631579, "grad_norm": 2.234375, "grad_norm_var": 0.04163309733072917, "learning_rate": 0.0001, "loss": 3.1293, "loss/crossentropy": 2.4799925684928894, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.2588403090834618, "loss/reg": 0.0, "step": 17090 }, { "epoch": 0.1125, "grad_norm": 2.1875, "grad_norm_var": 0.1295074462890625, "learning_rate": 0.0001, "loss": 3.0953, "loss/crossentropy": 2.308819645643234, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.23669061064720154, "loss/reg": 0.0, "step": 17100 }, { "epoch": 0.11256578947368422, "grad_norm": 2.140625, "grad_norm_var": 0.7806142171223959, "learning_rate": 0.0001, "loss": 3.1353, "loss/crossentropy": 2.3123676419258117, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.29565141499042513, "loss/reg": 0.0, "step": 17110 }, { "epoch": 0.11263157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.08242085774739584, "learning_rate": 0.0001, "loss": 3.0662, "loss/crossentropy": 2.262893891334534, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.23198095709085464, "loss/reg": 0.0, "step": 17120 }, { "epoch": 0.11269736842105263, "grad_norm": 2.265625, "grad_norm_var": 0.1591949462890625, "learning_rate": 0.0001, "loss": 3.0855, "loss/crossentropy": 2.1816434502601623, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.2444119155406952, "loss/reg": 0.0, "step": 17130 }, { "epoch": 0.11276315789473684, "grad_norm": 2.125, "grad_norm_var": 0.14752197265625, "learning_rate": 0.0001, "loss": 3.1414, "loss/crossentropy": 2.4847304582595826, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.26482110619544985, "loss/reg": 0.0, "step": 17140 }, { "epoch": 0.11282894736842106, "grad_norm": 2.109375, "grad_norm_var": 0.09546610514322916, "learning_rate": 0.0001, "loss": 3.0923, "loss/crossentropy": 2.5822364687919617, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.26294154226779937, "loss/reg": 0.0, "step": 17150 }, { "epoch": 0.11289473684210527, "grad_norm": 3.796875, "grad_norm_var": 0.19820963541666667, "learning_rate": 0.0001, "loss": 3.0656, "loss/crossentropy": 2.3146368622779847, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.23589982092380524, "loss/reg": 0.0, "step": 17160 }, { "epoch": 0.11296052631578947, "grad_norm": 2.0625, "grad_norm_var": 1.0957618713378907, "learning_rate": 0.0001, "loss": 3.1693, "loss/crossentropy": 2.355622184276581, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.2783960849046707, "loss/reg": 0.0, "step": 17170 }, { "epoch": 0.11302631578947368, "grad_norm": 2.515625, "grad_norm_var": 1.0233965555826823, "learning_rate": 0.0001, "loss": 3.1617, "loss/crossentropy": 2.3866767287254333, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.25886829346418383, "loss/reg": 0.0, "step": 17180 }, { "epoch": 0.1130921052631579, "grad_norm": 2.25, "grad_norm_var": 0.08580322265625, "learning_rate": 0.0001, "loss": 3.1277, "loss/crossentropy": 2.095993900299072, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.1956949472427368, "loss/reg": 0.0, "step": 17190 }, { "epoch": 0.11315789473684211, "grad_norm": 2.375, "grad_norm_var": 0.09143778483072916, "learning_rate": 0.0001, "loss": 3.0835, "loss/crossentropy": 2.3784751892089844, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.26899502128362657, "loss/reg": 0.0, "step": 17200 }, { "epoch": 0.11322368421052631, "grad_norm": 2.3125, "grad_norm_var": 0.027274576822916667, "learning_rate": 0.0001, "loss": 3.1617, "loss/crossentropy": 2.505306875705719, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.2929636001586914, "loss/reg": 0.0, "step": 17210 }, { "epoch": 0.11328947368421052, "grad_norm": 2.296875, "grad_norm_var": 0.06923421223958333, "learning_rate": 0.0001, "loss": 3.12, "loss/crossentropy": 2.1889270186424254, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.2669166073203087, "loss/reg": 0.0, "step": 17220 }, { "epoch": 0.11335526315789474, "grad_norm": 2.3125, "grad_norm_var": 0.17213312784830728, "learning_rate": 0.0001, "loss": 3.0681, "loss/crossentropy": 2.2528875708580016, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.23936019986867904, "loss/reg": 0.0, "step": 17230 }, { "epoch": 0.11342105263157895, "grad_norm": 2.546875, "grad_norm_var": 0.12957763671875, "learning_rate": 0.0001, "loss": 3.119, "loss/crossentropy": 2.2457367897033693, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.23305099308490754, "loss/reg": 0.0, "step": 17240 }, { "epoch": 0.11348684210526316, "grad_norm": 2.578125, "grad_norm_var": 0.07983779907226562, "learning_rate": 0.0001, "loss": 3.0851, "loss/crossentropy": 2.4546321392059327, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.24581009149551392, "loss/reg": 0.0, "step": 17250 }, { "epoch": 0.11355263157894736, "grad_norm": 4.1875, "grad_norm_var": 0.2813791910807292, "learning_rate": 0.0001, "loss": 3.171, "loss/crossentropy": 1.9349523544311524, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.31686680018901825, "loss/reg": 0.0, "step": 17260 }, { "epoch": 0.11361842105263158, "grad_norm": 2.59375, "grad_norm_var": 1.4602701822916666, "learning_rate": 0.0001, "loss": 3.1656, "loss/crossentropy": 2.2751689314842225, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.2878335312008858, "loss/reg": 0.0, "step": 17270 }, { "epoch": 0.11368421052631579, "grad_norm": 2.765625, "grad_norm_var": 0.5687662760416666, "learning_rate": 0.0001, "loss": 3.0772, "loss/crossentropy": 2.4036699175834655, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.24233294725418092, "loss/reg": 0.0, "step": 17280 }, { "epoch": 0.11375, "grad_norm": 2.96875, "grad_norm_var": 0.06691792805989584, "learning_rate": 0.0001, "loss": 3.1161, "loss/crossentropy": 2.4056329488754273, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.25776179879903793, "loss/reg": 0.0, "step": 17290 }, { "epoch": 0.11381578947368422, "grad_norm": 2.203125, "grad_norm_var": 0.12681884765625, "learning_rate": 0.0001, "loss": 3.2097, "loss/crossentropy": 2.4936662912368774, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.27261604368686676, "loss/reg": 0.0, "step": 17300 }, { "epoch": 0.11388157894736842, "grad_norm": 2.21875, "grad_norm_var": 0.08603108723958333, "learning_rate": 0.0001, "loss": 3.1596, "loss/crossentropy": 2.2334436774253845, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.27515108734369276, "loss/reg": 0.0, "step": 17310 }, { "epoch": 0.11394736842105263, "grad_norm": 2.390625, "grad_norm_var": 0.06290690104166667, "learning_rate": 0.0001, "loss": 3.1322, "loss/crossentropy": 2.3009248971939087, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2514318063855171, "loss/reg": 0.0, "step": 17320 }, { "epoch": 0.11401315789473684, "grad_norm": 2.953125, "grad_norm_var": 0.19062093098958333, "learning_rate": 0.0001, "loss": 3.2222, "loss/crossentropy": 2.2115599513053894, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.24124337881803512, "loss/reg": 0.0, "step": 17330 }, { "epoch": 0.11407894736842106, "grad_norm": 2.34375, "grad_norm_var": 0.4003245035807292, "learning_rate": 0.0001, "loss": 3.1404, "loss/crossentropy": 2.2533546447753907, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.25740948766469957, "loss/reg": 0.0, "step": 17340 }, { "epoch": 0.11414473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.06924540201822917, "learning_rate": 0.0001, "loss": 3.1509, "loss/crossentropy": 2.4188032507896424, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.27805479913949965, "loss/reg": 0.0, "step": 17350 }, { "epoch": 0.11421052631578947, "grad_norm": 2.96875, "grad_norm_var": 0.6562489827473958, "learning_rate": 0.0001, "loss": 3.2795, "loss/crossentropy": 2.1555517435073854, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.28488118648529054, "loss/reg": 0.0, "step": 17360 }, { "epoch": 0.11427631578947368, "grad_norm": 2.140625, "grad_norm_var": 0.62613525390625, "learning_rate": 0.0001, "loss": 3.1637, "loss/crossentropy": 2.351659083366394, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.2672264903783798, "loss/reg": 0.0, "step": 17370 }, { "epoch": 0.1143421052631579, "grad_norm": 2.09375, "grad_norm_var": 0.08678385416666666, "learning_rate": 0.0001, "loss": 3.1242, "loss/crossentropy": 2.2498091578483583, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24537185132503508, "loss/reg": 0.0, "step": 17380 }, { "epoch": 0.11440789473684211, "grad_norm": 2.53125, "grad_norm_var": 0.17229817708333334, "learning_rate": 0.0001, "loss": 3.1776, "loss/crossentropy": 2.401424062252045, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.3059739723801613, "loss/reg": 0.0, "step": 17390 }, { "epoch": 0.11447368421052631, "grad_norm": 2.125, "grad_norm_var": 0.1625885009765625, "learning_rate": 0.0001, "loss": 3.1477, "loss/crossentropy": 2.292530918121338, "loss/hidden": 3.09375, "loss/incoh": 0.0, "loss/logits": 0.2784269869327545, "loss/reg": 0.0, "step": 17400 }, { "epoch": 0.11453947368421052, "grad_norm": 2.375, "grad_norm_var": 0.09892171223958333, "learning_rate": 0.0001, "loss": 3.1288, "loss/crossentropy": 2.464983069896698, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.2796273499727249, "loss/reg": 0.0, "step": 17410 }, { "epoch": 0.11460526315789474, "grad_norm": 2.921875, "grad_norm_var": 0.09850972493489583, "learning_rate": 0.0001, "loss": 3.1289, "loss/crossentropy": 2.1951366662979126, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.21696581244468688, "loss/reg": 0.0, "step": 17420 }, { "epoch": 0.11467105263157895, "grad_norm": 2.53125, "grad_norm_var": 0.06843159993489584, "learning_rate": 0.0001, "loss": 3.1597, "loss/crossentropy": 2.3925849914550783, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2907762542366982, "loss/reg": 0.0, "step": 17430 }, { "epoch": 0.11473684210526315, "grad_norm": 2.21875, "grad_norm_var": 0.21046549479166668, "learning_rate": 0.0001, "loss": 3.1634, "loss/crossentropy": 2.464526927471161, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2937367483973503, "loss/reg": 0.0, "step": 17440 }, { "epoch": 0.11480263157894736, "grad_norm": 2.203125, "grad_norm_var": 0.047526041666666664, "learning_rate": 0.0001, "loss": 3.1144, "loss/crossentropy": 2.072342965006828, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.2236027292907238, "loss/reg": 0.0, "step": 17450 }, { "epoch": 0.11486842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.05799051920572917, "learning_rate": 0.0001, "loss": 3.0873, "loss/crossentropy": 2.240265655517578, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.23422597646713256, "loss/reg": 0.0, "step": 17460 }, { "epoch": 0.11493421052631579, "grad_norm": 2.078125, "grad_norm_var": 0.10058186848958334, "learning_rate": 0.0001, "loss": 3.127, "loss/crossentropy": 2.3168819308280946, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.2430237874388695, "loss/reg": 0.0, "step": 17470 }, { "epoch": 0.115, "grad_norm": 2.625, "grad_norm_var": 0.11015523274739583, "learning_rate": 0.0001, "loss": 3.0618, "loss/crossentropy": 2.2366016268730164, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.23987135738134385, "loss/reg": 0.0, "step": 17480 }, { "epoch": 0.1150657894736842, "grad_norm": 2.609375, "grad_norm_var": 0.09355061848958333, "learning_rate": 0.0001, "loss": 3.1875, "loss/crossentropy": 2.1879064559936525, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.24197361022233962, "loss/reg": 0.0, "step": 17490 }, { "epoch": 0.11513157894736842, "grad_norm": 2.421875, "grad_norm_var": 7.735667928059896, "learning_rate": 0.0001, "loss": 3.2427, "loss/crossentropy": 2.149078643321991, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.24705443456768988, "loss/reg": 0.0, "step": 17500 }, { "epoch": 0.11519736842105263, "grad_norm": 2.265625, "grad_norm_var": 7.899019368489584, "learning_rate": 0.0001, "loss": 3.1837, "loss/crossentropy": 2.3580456256866453, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.23221501410007478, "loss/reg": 0.0, "step": 17510 }, { "epoch": 0.11526315789473685, "grad_norm": 2.6875, "grad_norm_var": 0.2678456624348958, "learning_rate": 0.0001, "loss": 3.079, "loss/crossentropy": 2.414518404006958, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2647902578115463, "loss/reg": 0.0, "step": 17520 }, { "epoch": 0.11532894736842106, "grad_norm": 3.125, "grad_norm_var": 0.14198811848958334, "learning_rate": 0.0001, "loss": 3.1613, "loss/crossentropy": 2.416664445400238, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.22963229715824127, "loss/reg": 0.0, "step": 17530 }, { "epoch": 0.11539473684210526, "grad_norm": 2.53125, "grad_norm_var": 0.7926177978515625, "learning_rate": 0.0001, "loss": 3.1532, "loss/crossentropy": 2.1853476405143737, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2286560907959938, "loss/reg": 0.0, "step": 17540 }, { "epoch": 0.11546052631578947, "grad_norm": 2.859375, "grad_norm_var": 0.8031534830729167, "learning_rate": 0.0001, "loss": 3.1557, "loss/crossentropy": 2.6209848165512084, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.25361735969781873, "loss/reg": 0.0, "step": 17550 }, { "epoch": 0.11552631578947369, "grad_norm": 2.4375, "grad_norm_var": 0.09949544270833334, "learning_rate": 0.0001, "loss": 3.2229, "loss/crossentropy": 2.3588363409042357, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.30205955654382705, "loss/reg": 0.0, "step": 17560 }, { "epoch": 0.1155921052631579, "grad_norm": 1.9765625, "grad_norm_var": 0.07576471964518229, "learning_rate": 0.0001, "loss": 3.174, "loss/crossentropy": 2.1386860758066177, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.24600727967917918, "loss/reg": 0.0, "step": 17570 }, { "epoch": 0.1156578947368421, "grad_norm": 2.171875, "grad_norm_var": 0.07134577433268229, "learning_rate": 0.0001, "loss": 3.1143, "loss/crossentropy": 2.5142947912216185, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.2483747810125351, "loss/reg": 0.0, "step": 17580 }, { "epoch": 0.11572368421052631, "grad_norm": 2.46875, "grad_norm_var": 0.7249959309895834, "learning_rate": 0.0001, "loss": 3.1598, "loss/crossentropy": 2.339717888832092, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2511411294341087, "loss/reg": 0.0, "step": 17590 }, { "epoch": 0.11578947368421053, "grad_norm": 2.671875, "grad_norm_var": 0.9438435872395833, "learning_rate": 0.0001, "loss": 3.0902, "loss/crossentropy": 2.3273319840431212, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.321346378326416, "loss/reg": 0.0, "step": 17600 }, { "epoch": 0.11585526315789474, "grad_norm": 2.484375, "grad_norm_var": 0.93150634765625, "learning_rate": 0.0001, "loss": 3.1392, "loss/crossentropy": 2.0881393194198608, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2542056769132614, "loss/reg": 0.0, "step": 17610 }, { "epoch": 0.11592105263157895, "grad_norm": 2.25, "grad_norm_var": 0.05450846354166667, "learning_rate": 0.0001, "loss": 3.153, "loss/crossentropy": 2.200914776325226, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2596256859600544, "loss/reg": 0.0, "step": 17620 }, { "epoch": 0.11598684210526315, "grad_norm": 2.34375, "grad_norm_var": 0.06134440104166667, "learning_rate": 0.0001, "loss": 3.1138, "loss/crossentropy": 2.391866648197174, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.24801254719495774, "loss/reg": 0.0, "step": 17630 }, { "epoch": 0.11605263157894737, "grad_norm": 2.5625, "grad_norm_var": 0.19595947265625, "learning_rate": 0.0001, "loss": 3.1197, "loss/crossentropy": 2.3731101989746093, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.2734734550118446, "loss/reg": 0.0, "step": 17640 }, { "epoch": 0.11611842105263158, "grad_norm": 2.3125, "grad_norm_var": 0.06402587890625, "learning_rate": 0.0001, "loss": 3.186, "loss/crossentropy": 2.510706162452698, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.3243511900305748, "loss/reg": 0.0, "step": 17650 }, { "epoch": 0.11618421052631579, "grad_norm": 2.53125, "grad_norm_var": 0.08347066243489583, "learning_rate": 0.0001, "loss": 3.1433, "loss/crossentropy": 2.3694114327430724, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.23816563338041305, "loss/reg": 0.0, "step": 17660 }, { "epoch": 0.11625, "grad_norm": 2.515625, "grad_norm_var": 0.07176106770833333, "learning_rate": 0.0001, "loss": 3.117, "loss/crossentropy": 2.3822181940078737, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.2559090554714203, "loss/reg": 0.0, "step": 17670 }, { "epoch": 0.1163157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.06597391764322917, "learning_rate": 0.0001, "loss": 3.167, "loss/crossentropy": 2.236483609676361, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.24177847057580948, "loss/reg": 0.0, "step": 17680 }, { "epoch": 0.11638157894736842, "grad_norm": 2.140625, "grad_norm_var": 0.05454813639322917, "learning_rate": 0.0001, "loss": 3.1207, "loss/crossentropy": 2.386284852027893, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.23972099274396896, "loss/reg": 0.0, "step": 17690 }, { "epoch": 0.11644736842105263, "grad_norm": 2.546875, "grad_norm_var": 0.029325358072916665, "learning_rate": 0.0001, "loss": 3.1551, "loss/crossentropy": 2.3102688074111937, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.25639690458774567, "loss/reg": 0.0, "step": 17700 }, { "epoch": 0.11651315789473685, "grad_norm": 2.390625, "grad_norm_var": 0.1159576416015625, "learning_rate": 0.0001, "loss": 3.1227, "loss/crossentropy": 2.164554476737976, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2502566508948803, "loss/reg": 0.0, "step": 17710 }, { "epoch": 0.11657894736842105, "grad_norm": 2.453125, "grad_norm_var": 0.16982421875, "learning_rate": 0.0001, "loss": 3.2736, "loss/crossentropy": 2.166772598028183, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.24817814379930497, "loss/reg": 0.0, "step": 17720 }, { "epoch": 0.11664473684210526, "grad_norm": 2.75, "grad_norm_var": 0.13912353515625, "learning_rate": 0.0001, "loss": 3.1985, "loss/crossentropy": 2.1537662744522095, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.21837295591831207, "loss/reg": 0.0, "step": 17730 }, { "epoch": 0.11671052631578947, "grad_norm": 2.046875, "grad_norm_var": 0.09753392537434896, "learning_rate": 0.0001, "loss": 3.1066, "loss/crossentropy": 2.106878674030304, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.2489032343029976, "loss/reg": 0.0, "step": 17740 }, { "epoch": 0.11677631578947369, "grad_norm": 2.5625, "grad_norm_var": 0.08820699055989584, "learning_rate": 0.0001, "loss": 3.1647, "loss/crossentropy": 2.186918389797211, "loss/hidden": 3.1171875, "loss/incoh": 0.0, "loss/logits": 0.26816043853759763, "loss/reg": 0.0, "step": 17750 }, { "epoch": 0.1168421052631579, "grad_norm": 2.453125, "grad_norm_var": 0.14002176920572917, "learning_rate": 0.0001, "loss": 3.2041, "loss/crossentropy": 2.130704140663147, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.26593275666236876, "loss/reg": 0.0, "step": 17760 }, { "epoch": 0.1169078947368421, "grad_norm": 3.6875, "grad_norm_var": 0.16592508951822918, "learning_rate": 0.0001, "loss": 3.0748, "loss/crossentropy": 2.4294887661933897, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.26008370518684387, "loss/reg": 0.0, "step": 17770 }, { "epoch": 0.11697368421052631, "grad_norm": 2.109375, "grad_norm_var": 0.4354156494140625, "learning_rate": 0.0001, "loss": 3.2129, "loss/crossentropy": 2.0633517861366273, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.2134759709239006, "loss/reg": 0.0, "step": 17780 }, { "epoch": 0.11703947368421053, "grad_norm": 2.5625, "grad_norm_var": 0.114306640625, "learning_rate": 0.0001, "loss": 3.1969, "loss/crossentropy": 2.470662558078766, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.32178338468074796, "loss/reg": 0.0, "step": 17790 }, { "epoch": 0.11710526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.1042388916015625, "learning_rate": 0.0001, "loss": 3.132, "loss/crossentropy": 2.180730104446411, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2408156231045723, "loss/reg": 0.0, "step": 17800 }, { "epoch": 0.11717105263157895, "grad_norm": 2.5, "grad_norm_var": 0.08092041015625, "learning_rate": 0.0001, "loss": 3.1107, "loss/crossentropy": 2.3672548174858092, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.2295123293995857, "loss/reg": 0.0, "step": 17810 }, { "epoch": 0.11723684210526315, "grad_norm": 2.40625, "grad_norm_var": 0.029108683268229168, "learning_rate": 0.0001, "loss": 3.1063, "loss/crossentropy": 2.486844336986542, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2817386701703072, "loss/reg": 0.0, "step": 17820 }, { "epoch": 0.11730263157894737, "grad_norm": 2.609375, "grad_norm_var": 3.6479156901264755e+17, "learning_rate": 0.0001, "loss": 3.2511, "loss/crossentropy": 2.3458006739616395, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.25344357788562777, "loss/reg": 0.0, "step": 17830 }, { "epoch": 0.11736842105263158, "grad_norm": 2.75, "grad_norm_var": 1.1187174479166666, "learning_rate": 0.0001, "loss": 3.1952, "loss/crossentropy": 2.3346620917320253, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.24867950975894929, "loss/reg": 0.0, "step": 17840 }, { "epoch": 0.1174342105263158, "grad_norm": 2.3125, "grad_norm_var": 0.05538736979166667, "learning_rate": 0.0001, "loss": 3.1781, "loss/crossentropy": 2.1594719171524046, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.32041922956705093, "loss/reg": 0.0, "step": 17850 }, { "epoch": 0.1175, "grad_norm": 2.3125, "grad_norm_var": 0.13107096354166667, "learning_rate": 0.0001, "loss": 3.1675, "loss/crossentropy": 2.335154187679291, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.23140522688627244, "loss/reg": 0.0, "step": 17860 }, { "epoch": 0.1175657894736842, "grad_norm": 2.421875, "grad_norm_var": 0.061777496337890626, "learning_rate": 0.0001, "loss": 3.1431, "loss/crossentropy": 2.448777401447296, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.24006084948778153, "loss/reg": 0.0, "step": 17870 }, { "epoch": 0.11763157894736842, "grad_norm": 2.25, "grad_norm_var": 0.14251302083333334, "learning_rate": 0.0001, "loss": 3.2176, "loss/crossentropy": 2.2756917595863344, "loss/hidden": 3.0359375, "loss/incoh": 0.0, "loss/logits": 0.30769334733486176, "loss/reg": 0.0, "step": 17880 }, { "epoch": 0.11769736842105263, "grad_norm": 1.921875, "grad_norm_var": 0.14058837890625, "learning_rate": 0.0001, "loss": 3.1661, "loss/crossentropy": 2.3477345585823057, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.2401590347290039, "loss/reg": 0.0, "step": 17890 }, { "epoch": 0.11776315789473685, "grad_norm": 3.34375, "grad_norm_var": 0.14404271443684896, "learning_rate": 0.0001, "loss": 3.1339, "loss/crossentropy": 2.2531643748283385, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.27347354739904406, "loss/reg": 0.0, "step": 17900 }, { "epoch": 0.11782894736842105, "grad_norm": 2.59375, "grad_norm_var": 0.10000991821289062, "learning_rate": 0.0001, "loss": 3.1182, "loss/crossentropy": 2.2883235216140747, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2531774565577507, "loss/reg": 0.0, "step": 17910 }, { "epoch": 0.11789473684210526, "grad_norm": 2.328125, "grad_norm_var": 0.0377349853515625, "learning_rate": 0.0001, "loss": 3.1566, "loss/crossentropy": 2.2789531648159027, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.235878374427557, "loss/reg": 0.0, "step": 17920 }, { "epoch": 0.11796052631578947, "grad_norm": 2.25, "grad_norm_var": 0.06096598307291667, "learning_rate": 0.0001, "loss": 3.1068, "loss/crossentropy": 2.3758100509643554, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.22890822887420653, "loss/reg": 0.0, "step": 17930 }, { "epoch": 0.11802631578947369, "grad_norm": 2.578125, "grad_norm_var": 0.07280171712239583, "learning_rate": 0.0001, "loss": 3.1773, "loss/crossentropy": 2.2372307777404785, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.24057336896657944, "loss/reg": 0.0, "step": 17940 }, { "epoch": 0.1180921052631579, "grad_norm": 2.3125, "grad_norm_var": 0.12463785807291666, "learning_rate": 0.0001, "loss": 3.1969, "loss/crossentropy": 2.1488450884819033, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.25413042977452277, "loss/reg": 0.0, "step": 17950 }, { "epoch": 0.1181578947368421, "grad_norm": 2.265625, "grad_norm_var": 0.11864827473958334, "learning_rate": 0.0001, "loss": 3.2164, "loss/crossentropy": 2.5382091283798216, "loss/hidden": 3.13125, "loss/incoh": 0.0, "loss/logits": 0.26207938939332964, "loss/reg": 0.0, "step": 17960 }, { "epoch": 0.11822368421052631, "grad_norm": 2.0625, "grad_norm_var": 0.06974283854166667, "learning_rate": 0.0001, "loss": 3.1713, "loss/crossentropy": 2.3174261093139648, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.279338338971138, "loss/reg": 0.0, "step": 17970 }, { "epoch": 0.11828947368421053, "grad_norm": 2.6875, "grad_norm_var": 0.04395243326822917, "learning_rate": 0.0001, "loss": 3.1261, "loss/crossentropy": 2.182445216178894, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2768437474966049, "loss/reg": 0.0, "step": 17980 }, { "epoch": 0.11835526315789474, "grad_norm": 2.703125, "grad_norm_var": 0.14431050618489583, "learning_rate": 0.0001, "loss": 3.1287, "loss/crossentropy": 2.202815556526184, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.21700907945632936, "loss/reg": 0.0, "step": 17990 }, { "epoch": 0.11842105263157894, "grad_norm": 2.546875, "grad_norm_var": 2.2837799072265623, "learning_rate": 0.0001, "loss": 3.1737, "loss/crossentropy": 2.0051895678043365, "loss/hidden": 3.2046875, "loss/incoh": 0.0, "loss/logits": 0.35186032503843306, "loss/reg": 0.0, "step": 18000 }, { "epoch": 0.11848684210526315, "grad_norm": 2.4375, "grad_norm_var": 2.469374338785807, "learning_rate": 0.0001, "loss": 3.0848, "loss/crossentropy": 1.872368621826172, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.2913415163755417, "loss/reg": 0.0, "step": 18010 }, { "epoch": 0.11855263157894737, "grad_norm": 2.046875, "grad_norm_var": 0.1679278055826823, "learning_rate": 0.0001, "loss": 3.2063, "loss/crossentropy": 2.367658519744873, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.27947854697704316, "loss/reg": 0.0, "step": 18020 }, { "epoch": 0.11861842105263158, "grad_norm": 3.40625, "grad_norm_var": 0.15479227701822917, "learning_rate": 0.0001, "loss": 3.1836, "loss/crossentropy": 2.087648892402649, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.2204158440232277, "loss/reg": 0.0, "step": 18030 }, { "epoch": 0.1186842105263158, "grad_norm": 3.171875, "grad_norm_var": 0.18935546875, "learning_rate": 0.0001, "loss": 3.205, "loss/crossentropy": 2.1874751687049865, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2435563921928406, "loss/reg": 0.0, "step": 18040 }, { "epoch": 0.11875, "grad_norm": 2.234375, "grad_norm_var": 0.38063151041666665, "learning_rate": 0.0001, "loss": 3.1996, "loss/crossentropy": 2.1633397549390794, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.21968650221824645, "loss/reg": 0.0, "step": 18050 }, { "epoch": 0.11881578947368421, "grad_norm": 2.140625, "grad_norm_var": 0.5637278238932292, "learning_rate": 0.0001, "loss": 3.1736, "loss/crossentropy": 2.281619644165039, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.295219412446022, "loss/reg": 0.0, "step": 18060 }, { "epoch": 0.11888157894736842, "grad_norm": 2.828125, "grad_norm_var": 1.9192342122395833, "learning_rate": 0.0001, "loss": 3.2705, "loss/crossentropy": 2.21823273897171, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.23566214740276337, "loss/reg": 0.0, "step": 18070 }, { "epoch": 0.11894736842105263, "grad_norm": 2.515625, "grad_norm_var": 1.6334869384765625, "learning_rate": 0.0001, "loss": 3.1873, "loss/crossentropy": 2.2668980836868284, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2256957158446312, "loss/reg": 0.0, "step": 18080 }, { "epoch": 0.11901315789473685, "grad_norm": 2.046875, "grad_norm_var": 0.0713287353515625, "learning_rate": 0.0001, "loss": 3.0942, "loss/crossentropy": 2.218550479412079, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.3167494982481003, "loss/reg": 0.0, "step": 18090 }, { "epoch": 0.11907894736842105, "grad_norm": 2.375, "grad_norm_var": 0.05681050618489583, "learning_rate": 0.0001, "loss": 3.1816, "loss/crossentropy": 2.464331579208374, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.25844376236200334, "loss/reg": 0.0, "step": 18100 }, { "epoch": 0.11914473684210526, "grad_norm": 2.390625, "grad_norm_var": 0.07647196451822917, "learning_rate": 0.0001, "loss": 3.1727, "loss/crossentropy": 2.2838521599769592, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.2587725341320038, "loss/reg": 0.0, "step": 18110 }, { "epoch": 0.11921052631578948, "grad_norm": 2.1875, "grad_norm_var": 0.13923238118489584, "learning_rate": 0.0001, "loss": 3.2584, "loss/crossentropy": 2.3038102626800536, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.3060797408223152, "loss/reg": 0.0, "step": 18120 }, { "epoch": 0.11927631578947369, "grad_norm": 2.09375, "grad_norm_var": 0.11160481770833333, "learning_rate": 0.0001, "loss": 3.0877, "loss/crossentropy": 2.1922419667243958, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2569601759314537, "loss/reg": 0.0, "step": 18130 }, { "epoch": 0.11934210526315789, "grad_norm": 2.34375, "grad_norm_var": 0.18005269368489582, "learning_rate": 0.0001, "loss": 3.2654, "loss/crossentropy": 2.336120533943176, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.3214158996939659, "loss/reg": 0.0, "step": 18140 }, { "epoch": 0.1194078947368421, "grad_norm": 2.46875, "grad_norm_var": 0.211083984375, "learning_rate": 0.0001, "loss": 3.1801, "loss/crossentropy": 2.6258700489997864, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.23279473930597305, "loss/reg": 0.0, "step": 18150 }, { "epoch": 0.11947368421052632, "grad_norm": 2.46875, "grad_norm_var": 0.097412109375, "learning_rate": 0.0001, "loss": 3.0988, "loss/crossentropy": 2.378826451301575, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.24316011592745781, "loss/reg": 0.0, "step": 18160 }, { "epoch": 0.11953947368421053, "grad_norm": 2.953125, "grad_norm_var": 0.25974934895833335, "learning_rate": 0.0001, "loss": 3.1615, "loss/crossentropy": 2.3753814458847047, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2662284314632416, "loss/reg": 0.0, "step": 18170 }, { "epoch": 0.11960526315789474, "grad_norm": 2.28125, "grad_norm_var": 0.2468658447265625, "learning_rate": 0.0001, "loss": 3.2061, "loss/crossentropy": 2.2813811898231506, "loss/hidden": 3.0953125, "loss/incoh": 0.0, "loss/logits": 0.3734820380806923, "loss/reg": 0.0, "step": 18180 }, { "epoch": 0.11967105263157894, "grad_norm": 2.96875, "grad_norm_var": 0.12701416015625, "learning_rate": 0.0001, "loss": 3.16, "loss/crossentropy": 2.2644060015678407, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.22710230052471161, "loss/reg": 0.0, "step": 18190 }, { "epoch": 0.11973684210526316, "grad_norm": 2.015625, "grad_norm_var": 0.059342447916666666, "learning_rate": 0.0001, "loss": 3.0907, "loss/crossentropy": 2.2262014031410216, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.20997287034988404, "loss/reg": 0.0, "step": 18200 }, { "epoch": 0.11980263157894737, "grad_norm": 2.484375, "grad_norm_var": 0.04023335774739583, "learning_rate": 0.0001, "loss": 3.2103, "loss/crossentropy": 2.261046063899994, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.262168163061142, "loss/reg": 0.0, "step": 18210 }, { "epoch": 0.11986842105263158, "grad_norm": 2.4375, "grad_norm_var": 0.11513570149739584, "learning_rate": 0.0001, "loss": 3.194, "loss/crossentropy": 2.39845809340477, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.2080080732703209, "loss/reg": 0.0, "step": 18220 }, { "epoch": 0.1199342105263158, "grad_norm": 2.140625, "grad_norm_var": 0.12336324055989584, "learning_rate": 0.0001, "loss": 3.1321, "loss/crossentropy": 2.221972668170929, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.2630168259143829, "loss/reg": 0.0, "step": 18230 }, { "epoch": 0.12, "grad_norm": 2.46875, "grad_norm_var": 0.0994049072265625, "learning_rate": 0.0001, "loss": 3.192, "loss/crossentropy": 1.910440945625305, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.22247039675712585, "loss/reg": 0.0, "step": 18240 }, { "epoch": 0.12006578947368421, "grad_norm": 2.34375, "grad_norm_var": 0.38481343587239586, "learning_rate": 0.0001, "loss": 3.211, "loss/crossentropy": 2.3357484221458433, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.28341811895370483, "loss/reg": 0.0, "step": 18250 }, { "epoch": 0.12013157894736842, "grad_norm": 2.390625, "grad_norm_var": 0.1193023681640625, "learning_rate": 0.0001, "loss": 3.1537, "loss/crossentropy": 2.277235043048859, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2401238664984703, "loss/reg": 0.0, "step": 18260 }, { "epoch": 0.12019736842105264, "grad_norm": 2.5, "grad_norm_var": 0.0804595947265625, "learning_rate": 0.0001, "loss": 3.1693, "loss/crossentropy": 2.4965412855148315, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.23239507675170898, "loss/reg": 0.0, "step": 18270 }, { "epoch": 0.12026315789473684, "grad_norm": 2.265625, "grad_norm_var": 0.1340240478515625, "learning_rate": 0.0001, "loss": 3.0471, "loss/crossentropy": 2.187135934829712, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.22307218313217164, "loss/reg": 0.0, "step": 18280 }, { "epoch": 0.12032894736842105, "grad_norm": 2.03125, "grad_norm_var": 0.08740946451822916, "learning_rate": 0.0001, "loss": 3.0703, "loss/crossentropy": 2.144671416282654, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.23641343265771866, "loss/reg": 0.0, "step": 18290 }, { "epoch": 0.12039473684210526, "grad_norm": 2.484375, "grad_norm_var": 0.0469390869140625, "learning_rate": 0.0001, "loss": 3.0766, "loss/crossentropy": 2.3019802451133726, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.27486053854227066, "loss/reg": 0.0, "step": 18300 }, { "epoch": 0.12046052631578948, "grad_norm": 2.765625, "grad_norm_var": 0.056086222330729164, "learning_rate": 0.0001, "loss": 3.1479, "loss/crossentropy": 2.350484275817871, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.2915784493088722, "loss/reg": 0.0, "step": 18310 }, { "epoch": 0.12052631578947369, "grad_norm": 2.3125, "grad_norm_var": 0.0907867431640625, "learning_rate": 0.0001, "loss": 3.085, "loss/crossentropy": 2.519459903240204, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2799163952469826, "loss/reg": 0.0, "step": 18320 }, { "epoch": 0.12059210526315789, "grad_norm": 3.03125, "grad_norm_var": 0.07711588541666667, "learning_rate": 0.0001, "loss": 3.1352, "loss/crossentropy": 2.196255683898926, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.256213016808033, "loss/reg": 0.0, "step": 18330 }, { "epoch": 0.1206578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.111083984375, "learning_rate": 0.0001, "loss": 3.1252, "loss/crossentropy": 2.421483266353607, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.3222003743052483, "loss/reg": 0.0, "step": 18340 }, { "epoch": 0.12072368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.07574462890625, "learning_rate": 0.0001, "loss": 3.1338, "loss/crossentropy": 2.248370945453644, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.24929146319627762, "loss/reg": 0.0, "step": 18350 }, { "epoch": 0.12078947368421053, "grad_norm": 2.375, "grad_norm_var": 0.05138346354166667, "learning_rate": 0.0001, "loss": 3.1477, "loss/crossentropy": 2.3680251955986025, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.2555671989917755, "loss/reg": 0.0, "step": 18360 }, { "epoch": 0.12085526315789474, "grad_norm": 3.296875, "grad_norm_var": 0.06642964680989584, "learning_rate": 0.0001, "loss": 3.1165, "loss/crossentropy": 2.199974000453949, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.24803201854228973, "loss/reg": 0.0, "step": 18370 }, { "epoch": 0.12092105263157894, "grad_norm": 2.171875, "grad_norm_var": 0.11695556640625, "learning_rate": 0.0001, "loss": 3.1367, "loss/crossentropy": 2.5192595601081846, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.24734148681163787, "loss/reg": 0.0, "step": 18380 }, { "epoch": 0.12098684210526316, "grad_norm": 2.453125, "grad_norm_var": 0.05526936848958333, "learning_rate": 0.0001, "loss": 3.1724, "loss/crossentropy": 2.450891613960266, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.2809787794947624, "loss/reg": 0.0, "step": 18390 }, { "epoch": 0.12105263157894737, "grad_norm": 2.15625, "grad_norm_var": 0.0400787353515625, "learning_rate": 0.0001, "loss": 3.1368, "loss/crossentropy": 2.142946255207062, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.25829449892044065, "loss/reg": 0.0, "step": 18400 }, { "epoch": 0.12111842105263158, "grad_norm": 2.171875, "grad_norm_var": 0.12823893229166666, "learning_rate": 0.0001, "loss": 3.1313, "loss/crossentropy": 2.4548865795135497, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.2860646352171898, "loss/reg": 0.0, "step": 18410 }, { "epoch": 0.12118421052631578, "grad_norm": 2.46875, "grad_norm_var": 0.22473042805989582, "learning_rate": 0.0001, "loss": 3.0336, "loss/crossentropy": 2.4296185970306396, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24130426943302155, "loss/reg": 0.0, "step": 18420 }, { "epoch": 0.12125, "grad_norm": 2.546875, "grad_norm_var": 0.24602864583333334, "learning_rate": 0.0001, "loss": 3.0905, "loss/crossentropy": 2.2097928047180178, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.22747812122106553, "loss/reg": 0.0, "step": 18430 }, { "epoch": 0.12131578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.03878580729166667, "learning_rate": 0.0001, "loss": 3.1482, "loss/crossentropy": 2.266276228427887, "loss/hidden": 3.1015625, "loss/incoh": 0.0, "loss/logits": 0.3076672673225403, "loss/reg": 0.0, "step": 18440 }, { "epoch": 0.12138157894736842, "grad_norm": 2.046875, "grad_norm_var": 0.19593098958333333, "learning_rate": 0.0001, "loss": 3.1753, "loss/crossentropy": 2.3656784892082214, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.21082431972026824, "loss/reg": 0.0, "step": 18450 }, { "epoch": 0.12144736842105264, "grad_norm": 2.28125, "grad_norm_var": 0.13351236979166667, "learning_rate": 0.0001, "loss": 3.1809, "loss/crossentropy": 2.363306760787964, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.259796117246151, "loss/reg": 0.0, "step": 18460 }, { "epoch": 0.12151315789473684, "grad_norm": 2.921875, "grad_norm_var": 0.10623372395833333, "learning_rate": 0.0001, "loss": 3.0904, "loss/crossentropy": 2.2991411328315734, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.22767046988010406, "loss/reg": 0.0, "step": 18470 }, { "epoch": 0.12157894736842105, "grad_norm": 2.25, "grad_norm_var": 0.051595052083333336, "learning_rate": 0.0001, "loss": 3.0997, "loss/crossentropy": 2.479309868812561, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.2216094933450222, "loss/reg": 0.0, "step": 18480 }, { "epoch": 0.12164473684210526, "grad_norm": 2.171875, "grad_norm_var": 0.029279581705729165, "learning_rate": 0.0001, "loss": 3.0982, "loss/crossentropy": 2.224264907836914, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.2954641401767731, "loss/reg": 0.0, "step": 18490 }, { "epoch": 0.12171052631578948, "grad_norm": 2.484375, "grad_norm_var": 0.0418365478515625, "learning_rate": 0.0001, "loss": 3.1278, "loss/crossentropy": 2.1996599078178405, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.24900650084018708, "loss/reg": 0.0, "step": 18500 }, { "epoch": 0.12177631578947369, "grad_norm": 2.65625, "grad_norm_var": 6.255301920572917, "learning_rate": 0.0001, "loss": 3.1924, "loss/crossentropy": 2.388622558116913, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2473811611533165, "loss/reg": 0.0, "step": 18510 }, { "epoch": 0.12184210526315789, "grad_norm": 2.359375, "grad_norm_var": 0.07906901041666667, "learning_rate": 0.0001, "loss": 3.1227, "loss/crossentropy": 2.325872230529785, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.2705776423215866, "loss/reg": 0.0, "step": 18520 }, { "epoch": 0.1219078947368421, "grad_norm": 3.96875, "grad_norm_var": 0.2271197001139323, "learning_rate": 0.0001, "loss": 3.104, "loss/crossentropy": 2.4012768149375914, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.24956294745206833, "loss/reg": 0.0, "step": 18530 }, { "epoch": 0.12197368421052632, "grad_norm": 2.015625, "grad_norm_var": 0.2910316467285156, "learning_rate": 0.0001, "loss": 3.1206, "loss/crossentropy": 2.605260455608368, "loss/hidden": 3.33125, "loss/incoh": 0.0, "loss/logits": 0.2743007704615593, "loss/reg": 0.0, "step": 18540 }, { "epoch": 0.12203947368421053, "grad_norm": 2.1875, "grad_norm_var": 0.1703277587890625, "learning_rate": 0.0001, "loss": 3.1836, "loss/crossentropy": 1.9470459461212157, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.27280396595597267, "loss/reg": 0.0, "step": 18550 }, { "epoch": 0.12210526315789473, "grad_norm": 4.5625, "grad_norm_var": 0.52919921875, "learning_rate": 0.0001, "loss": 3.1155, "loss/crossentropy": 2.216350567340851, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.22622163146734237, "loss/reg": 0.0, "step": 18560 }, { "epoch": 0.12217105263157894, "grad_norm": 2.65625, "grad_norm_var": 0.4434733072916667, "learning_rate": 0.0001, "loss": 3.1029, "loss/crossentropy": 2.1628998279571534, "loss/hidden": 3.0609375, "loss/incoh": 0.0, "loss/logits": 0.28237638175487517, "loss/reg": 0.0, "step": 18570 }, { "epoch": 0.12223684210526316, "grad_norm": 2.890625, "grad_norm_var": 0.3641916910807292, "learning_rate": 0.0001, "loss": 3.1955, "loss/crossentropy": 2.486808693408966, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.2668070778250694, "loss/reg": 0.0, "step": 18580 }, { "epoch": 0.12230263157894737, "grad_norm": 3.03125, "grad_norm_var": 0.3584950764973958, "learning_rate": 0.0001, "loss": 3.1489, "loss/crossentropy": 2.3242167592048646, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2651012405753136, "loss/reg": 0.0, "step": 18590 }, { "epoch": 0.12236842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.05087890625, "learning_rate": 0.0001, "loss": 3.0364, "loss/crossentropy": 2.4761088371276854, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.24524887502193451, "loss/reg": 0.0, "step": 18600 }, { "epoch": 0.12243421052631578, "grad_norm": 2.59375, "grad_norm_var": 0.08817952473958333, "learning_rate": 0.0001, "loss": 3.1836, "loss/crossentropy": 2.012187111377716, "loss/hidden": 3.0765625, "loss/incoh": 0.0, "loss/logits": 0.2920076042413712, "loss/reg": 0.0, "step": 18610 }, { "epoch": 0.1225, "grad_norm": 2.421875, "grad_norm_var": 0.07818094889322917, "learning_rate": 0.0001, "loss": 3.1844, "loss/crossentropy": 2.3601612210273744, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.2700245052576065, "loss/reg": 0.0, "step": 18620 }, { "epoch": 0.12256578947368421, "grad_norm": 2.28125, "grad_norm_var": 0.6979482014973958, "learning_rate": 0.0001, "loss": 3.1953, "loss/crossentropy": 2.2805428981781004, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.21399059891700745, "loss/reg": 0.0, "step": 18630 }, { "epoch": 0.12263157894736842, "grad_norm": 2.578125, "grad_norm_var": 0.09913736979166667, "learning_rate": 0.0001, "loss": 3.2462, "loss/crossentropy": 2.3119712233543397, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.27173476070165636, "loss/reg": 0.0, "step": 18640 }, { "epoch": 0.12269736842105264, "grad_norm": 2.484375, "grad_norm_var": 0.12066650390625, "learning_rate": 0.0001, "loss": 3.1387, "loss/crossentropy": 2.1647680759429933, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.2955679178237915, "loss/reg": 0.0, "step": 18650 }, { "epoch": 0.12276315789473684, "grad_norm": 2.21875, "grad_norm_var": 0.0642242431640625, "learning_rate": 0.0001, "loss": 3.0672, "loss/crossentropy": 2.398548412322998, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.24050813913345337, "loss/reg": 0.0, "step": 18660 }, { "epoch": 0.12282894736842105, "grad_norm": 3.046875, "grad_norm_var": 0.43463134765625, "learning_rate": 0.0001, "loss": 3.1768, "loss/crossentropy": 2.007632791996002, "loss/hidden": 3.2265625, "loss/incoh": 0.0, "loss/logits": 0.28355503678321836, "loss/reg": 0.0, "step": 18670 }, { "epoch": 0.12289473684210526, "grad_norm": 2.21875, "grad_norm_var": 0.41833394368489585, "learning_rate": 0.0001, "loss": 3.1491, "loss/crossentropy": 2.4610511898994445, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.23162013590335845, "loss/reg": 0.0, "step": 18680 }, { "epoch": 0.12296052631578948, "grad_norm": 4.15625, "grad_norm_var": 0.29620335896809896, "learning_rate": 0.0001, "loss": 3.1762, "loss/crossentropy": 2.11824317574501, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2083466961979866, "loss/reg": 0.0, "step": 18690 }, { "epoch": 0.12302631578947368, "grad_norm": 2.203125, "grad_norm_var": 1.1225685119628905, "learning_rate": 0.0001, "loss": 3.1766, "loss/crossentropy": 2.236714720726013, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.23704309910535812, "loss/reg": 0.0, "step": 18700 }, { "epoch": 0.12309210526315789, "grad_norm": 2.109375, "grad_norm_var": 0.12454020182291667, "learning_rate": 0.0001, "loss": 3.0992, "loss/crossentropy": 2.2541938424110413, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.23853187412023544, "loss/reg": 0.0, "step": 18710 }, { "epoch": 0.1231578947368421, "grad_norm": 2.171875, "grad_norm_var": 0.070654296875, "learning_rate": 0.0001, "loss": 3.0403, "loss/crossentropy": 2.4512630701065063, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.24162033200263977, "loss/reg": 0.0, "step": 18720 }, { "epoch": 0.12322368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.03876546223958333, "learning_rate": 0.0001, "loss": 3.1522, "loss/crossentropy": 2.3199268102645876, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.23435179740190507, "loss/reg": 0.0, "step": 18730 }, { "epoch": 0.12328947368421053, "grad_norm": 2.421875, "grad_norm_var": 0.0454254150390625, "learning_rate": 0.0001, "loss": 3.019, "loss/crossentropy": 2.3065245509147645, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.22434473037719727, "loss/reg": 0.0, "step": 18740 }, { "epoch": 0.12335526315789473, "grad_norm": 2.515625, "grad_norm_var": 0.1038726806640625, "learning_rate": 0.0001, "loss": 3.145, "loss/crossentropy": 2.2620524525642396, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2564009681344032, "loss/reg": 0.0, "step": 18750 }, { "epoch": 0.12342105263157895, "grad_norm": 3.40625, "grad_norm_var": 0.1225982666015625, "learning_rate": 0.0001, "loss": 3.1243, "loss/crossentropy": 2.06068754196167, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.23558274507522584, "loss/reg": 0.0, "step": 18760 }, { "epoch": 0.12348684210526316, "grad_norm": 2.4375, "grad_norm_var": 0.28750712076822915, "learning_rate": 0.0001, "loss": 3.1153, "loss/crossentropy": 2.5775238513946532, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.2552958935499191, "loss/reg": 0.0, "step": 18770 }, { "epoch": 0.12355263157894737, "grad_norm": 2.484375, "grad_norm_var": 0.09669596354166667, "learning_rate": 0.0001, "loss": 3.1108, "loss/crossentropy": 2.100378167629242, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.23871416002511978, "loss/reg": 0.0, "step": 18780 }, { "epoch": 0.12361842105263159, "grad_norm": 5.5, "grad_norm_var": 0.6776112874348958, "learning_rate": 0.0001, "loss": 3.1095, "loss/crossentropy": 2.4402441143989564, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.23556852638721465, "loss/reg": 0.0, "step": 18790 }, { "epoch": 0.12368421052631579, "grad_norm": 2.28125, "grad_norm_var": 0.6309967041015625, "learning_rate": 0.0001, "loss": 3.1612, "loss/crossentropy": 2.3034533500671386, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.2525601238012314, "loss/reg": 0.0, "step": 18800 }, { "epoch": 0.12375, "grad_norm": 2.90625, "grad_norm_var": 0.06685282389322916, "learning_rate": 0.0001, "loss": 3.1566, "loss/crossentropy": 2.447878336906433, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.2531526446342468, "loss/reg": 0.0, "step": 18810 }, { "epoch": 0.12381578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.17978515625, "learning_rate": 0.0001, "loss": 3.1701, "loss/crossentropy": 2.297843897342682, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.2250390335917473, "loss/reg": 0.0, "step": 18820 }, { "epoch": 0.12388157894736843, "grad_norm": 2.4375, "grad_norm_var": 0.06181640625, "learning_rate": 0.0001, "loss": 3.1129, "loss/crossentropy": 2.1577285885810853, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2582122042775154, "loss/reg": 0.0, "step": 18830 }, { "epoch": 0.12394736842105263, "grad_norm": 2.859375, "grad_norm_var": 0.08957926432291667, "learning_rate": 0.0001, "loss": 3.1521, "loss/crossentropy": 2.5041862964630126, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.24232363551855088, "loss/reg": 0.0, "step": 18840 }, { "epoch": 0.12401315789473684, "grad_norm": 2.953125, "grad_norm_var": 0.10745035807291667, "learning_rate": 0.0001, "loss": 3.1007, "loss/crossentropy": 1.8846357107162475, "loss/hidden": 3.096875, "loss/incoh": 0.0, "loss/logits": 0.2533442348241806, "loss/reg": 0.0, "step": 18850 }, { "epoch": 0.12407894736842105, "grad_norm": 2.65625, "grad_norm_var": 0.11793212890625, "learning_rate": 0.0001, "loss": 3.0916, "loss/crossentropy": 2.5479671955108643, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.26022554039955137, "loss/reg": 0.0, "step": 18860 }, { "epoch": 0.12414473684210527, "grad_norm": 3.203125, "grad_norm_var": 0.4687652587890625, "learning_rate": 0.0001, "loss": 3.1202, "loss/crossentropy": 2.3512953519821167, "loss/hidden": 3.0296875, "loss/incoh": 0.0, "loss/logits": 0.2577581197023392, "loss/reg": 0.0, "step": 18870 }, { "epoch": 0.12421052631578948, "grad_norm": 2.265625, "grad_norm_var": 0.49575907389322915, "learning_rate": 0.0001, "loss": 3.192, "loss/crossentropy": 2.4191364645957947, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2868465960025787, "loss/reg": 0.0, "step": 18880 }, { "epoch": 0.12427631578947368, "grad_norm": 2.296875, "grad_norm_var": 0.20038655598958333, "learning_rate": 0.0001, "loss": 3.1072, "loss/crossentropy": 2.548805284500122, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.22707584351301194, "loss/reg": 0.0, "step": 18890 }, { "epoch": 0.12434210526315789, "grad_norm": 2.4375, "grad_norm_var": 0.16153055826822918, "learning_rate": 0.0001, "loss": 3.0653, "loss/crossentropy": 2.4287894129753114, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24110280126333236, "loss/reg": 0.0, "step": 18900 }, { "epoch": 0.1244078947368421, "grad_norm": 2.59375, "grad_norm_var": 0.06378580729166666, "learning_rate": 0.0001, "loss": 3.1229, "loss/crossentropy": 2.381858563423157, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.28408930599689486, "loss/reg": 0.0, "step": 18910 }, { "epoch": 0.12447368421052632, "grad_norm": 3.9375, "grad_norm_var": 0.33860575358072914, "learning_rate": 0.0001, "loss": 3.1459, "loss/crossentropy": 2.0421772241592406, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.24683285355567933, "loss/reg": 0.0, "step": 18920 }, { "epoch": 0.12453947368421053, "grad_norm": 2.390625, "grad_norm_var": 0.3823638916015625, "learning_rate": 0.0001, "loss": 3.1332, "loss/crossentropy": 2.528811717033386, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.26049076169729235, "loss/reg": 0.0, "step": 18930 }, { "epoch": 0.12460526315789473, "grad_norm": 2.3125, "grad_norm_var": 0.1140045166015625, "learning_rate": 0.0001, "loss": 3.2262, "loss/crossentropy": 2.391852283477783, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.3043837010860443, "loss/reg": 0.0, "step": 18940 }, { "epoch": 0.12467105263157895, "grad_norm": 2.03125, "grad_norm_var": 0.06843973795572916, "learning_rate": 0.0001, "loss": 3.149, "loss/crossentropy": 2.218567681312561, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.254660502076149, "loss/reg": 0.0, "step": 18950 }, { "epoch": 0.12473684210526316, "grad_norm": 2.375, "grad_norm_var": 0.0237701416015625, "learning_rate": 0.0001, "loss": 3.1052, "loss/crossentropy": 2.3154717803001406, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.2391164407134056, "loss/reg": 0.0, "step": 18960 }, { "epoch": 0.12480263157894737, "grad_norm": 2.34375, "grad_norm_var": 0.05070699055989583, "learning_rate": 0.0001, "loss": 3.0759, "loss/crossentropy": 2.2152688026428224, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.28472713232040403, "loss/reg": 0.0, "step": 18970 }, { "epoch": 0.12486842105263157, "grad_norm": 2.3125, "grad_norm_var": 0.08042704264322917, "learning_rate": 0.0001, "loss": 3.1049, "loss/crossentropy": 2.392467772960663, "loss/hidden": 3.1921875, "loss/incoh": 0.0, "loss/logits": 0.34464606642723083, "loss/reg": 0.0, "step": 18980 }, { "epoch": 0.12493421052631579, "grad_norm": 2.625, "grad_norm_var": 0.07502848307291667, "learning_rate": 0.0001, "loss": 3.1321, "loss/crossentropy": 2.4656677722930906, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.25502916276454923, "loss/reg": 0.0, "step": 18990 }, { "epoch": 0.125, "grad_norm": 2.421875, "grad_norm_var": 1.4638824462890625, "learning_rate": 0.0001, "loss": 3.2055, "loss/crossentropy": 2.27457115650177, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.21155266612768173, "loss/reg": 0.0, "step": 19000 }, { "epoch": 0.1250657894736842, "grad_norm": 2.15625, "grad_norm_var": 0.5252115885416667, "learning_rate": 0.0001, "loss": 3.0721, "loss/crossentropy": 2.232183575630188, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.23653523325920106, "loss/reg": 0.0, "step": 19010 }, { "epoch": 0.12513157894736843, "grad_norm": 2.546875, "grad_norm_var": 0.3392415364583333, "learning_rate": 0.0001, "loss": 3.1599, "loss/crossentropy": 2.134384286403656, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.23607225120067596, "loss/reg": 0.0, "step": 19020 }, { "epoch": 0.12519736842105264, "grad_norm": 2.4375, "grad_norm_var": 0.14305013020833332, "learning_rate": 0.0001, "loss": 3.1322, "loss/crossentropy": 2.406242322921753, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.2707079291343689, "loss/reg": 0.0, "step": 19030 }, { "epoch": 0.12526315789473685, "grad_norm": 2.234375, "grad_norm_var": 0.34036051432291664, "learning_rate": 0.0001, "loss": 3.2069, "loss/crossentropy": 2.4997928857803347, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.2432278200984001, "loss/reg": 0.0, "step": 19040 }, { "epoch": 0.12532894736842104, "grad_norm": 2.4375, "grad_norm_var": 0.12541910807291667, "learning_rate": 0.0001, "loss": 3.0995, "loss/crossentropy": 2.2165623545646667, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.24738555699586867, "loss/reg": 0.0, "step": 19050 }, { "epoch": 0.12539473684210525, "grad_norm": 2.8125, "grad_norm_var": 0.15855712890625, "learning_rate": 0.0001, "loss": 3.1697, "loss/crossentropy": 2.510625755786896, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.22799582332372664, "loss/reg": 0.0, "step": 19060 }, { "epoch": 0.12546052631578947, "grad_norm": 2.609375, "grad_norm_var": 0.09364827473958333, "learning_rate": 0.0001, "loss": 3.2057, "loss/crossentropy": 2.380051004886627, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.2702123373746872, "loss/reg": 0.0, "step": 19070 }, { "epoch": 0.12552631578947368, "grad_norm": 2.015625, "grad_norm_var": 0.07896728515625, "learning_rate": 0.0001, "loss": 3.0865, "loss/crossentropy": 2.2913742661476135, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.22653487473726272, "loss/reg": 0.0, "step": 19080 }, { "epoch": 0.1255921052631579, "grad_norm": 2.453125, "grad_norm_var": 0.1556549072265625, "learning_rate": 0.0001, "loss": 3.1844, "loss/crossentropy": 2.39451003074646, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.23494229055941104, "loss/reg": 0.0, "step": 19090 }, { "epoch": 0.1256578947368421, "grad_norm": 2.5, "grad_norm_var": 0.42444254557291666, "learning_rate": 0.0001, "loss": 3.1173, "loss/crossentropy": 2.2207372069358824, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2351771742105484, "loss/reg": 0.0, "step": 19100 }, { "epoch": 0.12572368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.405517578125, "learning_rate": 0.0001, "loss": 3.1686, "loss/crossentropy": 2.544073963165283, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.23578422516584396, "loss/reg": 0.0, "step": 19110 }, { "epoch": 0.12578947368421053, "grad_norm": 2.28125, "grad_norm_var": 0.022623697916666668, "learning_rate": 0.0001, "loss": 3.1575, "loss/crossentropy": 2.0650166511535644, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.2400606468319893, "loss/reg": 0.0, "step": 19120 }, { "epoch": 0.12585526315789475, "grad_norm": 2.078125, "grad_norm_var": 0.12542215983072916, "learning_rate": 0.0001, "loss": 3.1717, "loss/crossentropy": 2.1879891753196716, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.23869821876287461, "loss/reg": 0.0, "step": 19130 }, { "epoch": 0.12592105263157893, "grad_norm": 2.84375, "grad_norm_var": 2.683204189608586e+17, "learning_rate": 0.0001, "loss": 3.2235, "loss/crossentropy": 2.4875245571136473, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.3171342611312866, "loss/reg": 0.0, "step": 19140 }, { "epoch": 0.12598684210526315, "grad_norm": 2.109375, "grad_norm_var": 0.12364908854166666, "learning_rate": 0.0001, "loss": 3.1832, "loss/crossentropy": 2.199462330341339, "loss/hidden": 3.1125, "loss/incoh": 0.0, "loss/logits": 0.2820486217737198, "loss/reg": 0.0, "step": 19150 }, { "epoch": 0.12605263157894736, "grad_norm": 3.53125, "grad_norm_var": 0.133642578125, "learning_rate": 0.0001, "loss": 3.2179, "loss/crossentropy": 2.433344876766205, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.3452912583947182, "loss/reg": 0.0, "step": 19160 }, { "epoch": 0.12611842105263157, "grad_norm": 2.328125, "grad_norm_var": 0.23087946573893228, "learning_rate": 0.0001, "loss": 3.073, "loss/crossentropy": 2.107306253910065, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.29124595075845716, "loss/reg": 0.0, "step": 19170 }, { "epoch": 0.1261842105263158, "grad_norm": 2.640625, "grad_norm_var": 2.206763811704668e+17, "learning_rate": 0.0001, "loss": 3.2818, "loss/crossentropy": 2.17460697889328, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.26940477788448336, "loss/reg": 0.0, "step": 19180 }, { "epoch": 0.12625, "grad_norm": 2.25, "grad_norm_var": 0.06741434733072917, "learning_rate": 0.0001, "loss": 3.1977, "loss/crossentropy": 2.249357485771179, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.29158340096473695, "loss/reg": 0.0, "step": 19190 }, { "epoch": 0.12631578947368421, "grad_norm": 2.28125, "grad_norm_var": 0.1665679931640625, "learning_rate": 0.0001, "loss": 3.0798, "loss/crossentropy": 2.39562349319458, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.24128414690494537, "loss/reg": 0.0, "step": 19200 }, { "epoch": 0.12638157894736843, "grad_norm": 2.515625, "grad_norm_var": 0.04352925618489583, "learning_rate": 0.0001, "loss": 3.0886, "loss/crossentropy": 2.4379887223243712, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2693786233663559, "loss/reg": 0.0, "step": 19210 }, { "epoch": 0.12644736842105264, "grad_norm": 2.609375, "grad_norm_var": 0.05164286295572917, "learning_rate": 0.0001, "loss": 3.052, "loss/crossentropy": 2.327665722370148, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.37138707339763644, "loss/reg": 0.0, "step": 19220 }, { "epoch": 0.12651315789473686, "grad_norm": 2.09375, "grad_norm_var": 1.5772623697916666, "learning_rate": 0.0001, "loss": 3.1007, "loss/crossentropy": 2.1090051174163817, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.23532682955265044, "loss/reg": 0.0, "step": 19230 }, { "epoch": 0.12657894736842104, "grad_norm": 2.5625, "grad_norm_var": 0.76259765625, "learning_rate": 0.0001, "loss": 3.087, "loss/crossentropy": 2.3096412897109984, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2415456846356392, "loss/reg": 0.0, "step": 19240 }, { "epoch": 0.12664473684210525, "grad_norm": 2.5, "grad_norm_var": 0.0485504150390625, "learning_rate": 0.0001, "loss": 3.0857, "loss/crossentropy": 2.252851128578186, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.21757592558860778, "loss/reg": 0.0, "step": 19250 }, { "epoch": 0.12671052631578947, "grad_norm": 2.359375, "grad_norm_var": 0.16155192057291667, "learning_rate": 0.0001, "loss": 3.1508, "loss/crossentropy": 2.19182807803154, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.23997026532888413, "loss/reg": 0.0, "step": 19260 }, { "epoch": 0.12677631578947368, "grad_norm": 2.4375, "grad_norm_var": 0.4118609110514323, "learning_rate": 0.0001, "loss": 3.1121, "loss/crossentropy": 2.4667071104049683, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.23552417606115342, "loss/reg": 0.0, "step": 19270 }, { "epoch": 0.1268421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.04609553019205729, "learning_rate": 0.0001, "loss": 3.1763, "loss/crossentropy": 2.1369692206382753, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.24942600578069687, "loss/reg": 0.0, "step": 19280 }, { "epoch": 0.1269078947368421, "grad_norm": 2.5, "grad_norm_var": 0.0526031494140625, "learning_rate": 0.0001, "loss": 3.1068, "loss/crossentropy": 2.11109459400177, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.25110483914613724, "loss/reg": 0.0, "step": 19290 }, { "epoch": 0.12697368421052632, "grad_norm": 2.578125, "grad_norm_var": 0.053343709309895834, "learning_rate": 0.0001, "loss": 3.1517, "loss/crossentropy": 2.380664014816284, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2508363798260689, "loss/reg": 0.0, "step": 19300 }, { "epoch": 0.12703947368421054, "grad_norm": 2.171875, "grad_norm_var": 0.034032185872395836, "learning_rate": 0.0001, "loss": 3.1303, "loss/crossentropy": 2.380405902862549, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2564548909664154, "loss/reg": 0.0, "step": 19310 }, { "epoch": 0.12710526315789475, "grad_norm": 2.375, "grad_norm_var": 0.03234049479166667, "learning_rate": 0.0001, "loss": 3.1223, "loss/crossentropy": 2.1872188091278075, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2912978962063789, "loss/reg": 0.0, "step": 19320 }, { "epoch": 0.12717105263157893, "grad_norm": 2.328125, "grad_norm_var": 0.07059504191080729, "learning_rate": 0.0001, "loss": 3.1221, "loss/crossentropy": 2.470194971561432, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2459412097930908, "loss/reg": 0.0, "step": 19330 }, { "epoch": 0.12723684210526315, "grad_norm": 2.484375, "grad_norm_var": 0.1386431376139323, "learning_rate": 0.0001, "loss": 3.1555, "loss/crossentropy": 2.3521093368530273, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2635854005813599, "loss/reg": 0.0, "step": 19340 }, { "epoch": 0.12730263157894736, "grad_norm": 2.4375, "grad_norm_var": 0.053857421875, "learning_rate": 0.0001, "loss": 3.0594, "loss/crossentropy": 2.099438285827637, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.22058220505714415, "loss/reg": 0.0, "step": 19350 }, { "epoch": 0.12736842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.10086161295572917, "learning_rate": 0.0001, "loss": 3.1877, "loss/crossentropy": 2.2927380204200745, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.25694535821676256, "loss/reg": 0.0, "step": 19360 }, { "epoch": 0.1274342105263158, "grad_norm": 1.9375, "grad_norm_var": 0.05416259765625, "learning_rate": 0.0001, "loss": 3.0826, "loss/crossentropy": 2.1295772194862366, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.254076661169529, "loss/reg": 0.0, "step": 19370 }, { "epoch": 0.1275, "grad_norm": 2.5625, "grad_norm_var": 0.24744466145833333, "learning_rate": 0.0001, "loss": 3.1415, "loss/crossentropy": 1.9256490916013718, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.24017905220389366, "loss/reg": 0.0, "step": 19380 }, { "epoch": 0.12756578947368422, "grad_norm": 2.421875, "grad_norm_var": 0.5484659830729167, "learning_rate": 0.0001, "loss": 3.1094, "loss/crossentropy": 2.4685181736946107, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.26331629455089567, "loss/reg": 0.0, "step": 19390 }, { "epoch": 0.12763157894736843, "grad_norm": 2.3125, "grad_norm_var": 0.39381103515625, "learning_rate": 0.0001, "loss": 3.1037, "loss/crossentropy": 2.2764881372451784, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.25390326231718063, "loss/reg": 0.0, "step": 19400 }, { "epoch": 0.12769736842105264, "grad_norm": 2.765625, "grad_norm_var": 0.0698150634765625, "learning_rate": 0.0001, "loss": 3.1262, "loss/crossentropy": 2.299801528453827, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2553006038069725, "loss/reg": 0.0, "step": 19410 }, { "epoch": 0.12776315789473683, "grad_norm": 2.453125, "grad_norm_var": 0.266357421875, "learning_rate": 0.0001, "loss": 3.1239, "loss/crossentropy": 2.0346228003501894, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.22659170776605606, "loss/reg": 0.0, "step": 19420 }, { "epoch": 0.12782894736842104, "grad_norm": 2.34375, "grad_norm_var": 0.37324930826822916, "learning_rate": 0.0001, "loss": 3.1527, "loss/crossentropy": 1.8951176881790162, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.23443188220262529, "loss/reg": 0.0, "step": 19430 }, { "epoch": 0.12789473684210526, "grad_norm": 2.71875, "grad_norm_var": 0.30020243326822915, "learning_rate": 0.0001, "loss": 3.1742, "loss/crossentropy": 2.0540109515190124, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.20190905332565307, "loss/reg": 0.0, "step": 19440 }, { "epoch": 0.12796052631578947, "grad_norm": 2.75, "grad_norm_var": 0.19728190104166668, "learning_rate": 0.0001, "loss": 3.1528, "loss/crossentropy": 2.3164134502410887, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.2684441477060318, "loss/reg": 0.0, "step": 19450 }, { "epoch": 0.12802631578947368, "grad_norm": 2.25, "grad_norm_var": 0.2419586181640625, "learning_rate": 0.0001, "loss": 3.1599, "loss/crossentropy": 2.380271017551422, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.2657022625207901, "loss/reg": 0.0, "step": 19460 }, { "epoch": 0.1280921052631579, "grad_norm": 2.4375, "grad_norm_var": 0.11366780598958333, "learning_rate": 0.0001, "loss": 3.1441, "loss/crossentropy": 2.4884063005447388, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.2683678835630417, "loss/reg": 0.0, "step": 19470 }, { "epoch": 0.1281578947368421, "grad_norm": 2038431744.0, "grad_norm_var": 2.597002478369833e+17, "learning_rate": 0.0001, "loss": 3.2142, "loss/crossentropy": 2.0524453282356263, "loss/hidden": 3.80625, "loss/incoh": 0.0, "loss/logits": 0.2409697949886322, "loss/reg": 0.0, "step": 19480 }, { "epoch": 0.12822368421052632, "grad_norm": 2.203125, "grad_norm_var": 2.597002478497235e+17, "learning_rate": 0.0001, "loss": 3.0528, "loss/crossentropy": 2.2681432604789733, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.24559762477874755, "loss/reg": 0.0, "step": 19490 }, { "epoch": 0.12828947368421054, "grad_norm": 2.296875, "grad_norm_var": 0.030345662434895834, "learning_rate": 0.0001, "loss": 3.0796, "loss/crossentropy": 2.362069141864777, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.2367064341902733, "loss/reg": 0.0, "step": 19500 }, { "epoch": 0.12835526315789475, "grad_norm": 2.75, "grad_norm_var": 0.059382120768229164, "learning_rate": 0.0001, "loss": 3.1465, "loss/crossentropy": 2.260143554210663, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.25476695597171783, "loss/reg": 0.0, "step": 19510 }, { "epoch": 0.12842105263157894, "grad_norm": 2.40625, "grad_norm_var": 0.18327534993489583, "learning_rate": 0.0001, "loss": 3.1443, "loss/crossentropy": 2.322359097003937, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.22897413671016692, "loss/reg": 0.0, "step": 19520 }, { "epoch": 0.12848684210526315, "grad_norm": 2.078125, "grad_norm_var": 0.4216379801432292, "learning_rate": 0.0001, "loss": 3.1736, "loss/crossentropy": 2.658802056312561, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.26290144920349123, "loss/reg": 0.0, "step": 19530 }, { "epoch": 0.12855263157894736, "grad_norm": 2.0625, "grad_norm_var": 0.1238677978515625, "learning_rate": 0.0001, "loss": 3.1818, "loss/crossentropy": 2.389211559295654, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.29232275635004046, "loss/reg": 0.0, "step": 19540 }, { "epoch": 0.12861842105263158, "grad_norm": 2.421875, "grad_norm_var": 0.049845123291015626, "learning_rate": 0.0001, "loss": 3.0708, "loss/crossentropy": 2.186306023597717, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.2743732765316963, "loss/reg": 0.0, "step": 19550 }, { "epoch": 0.1286842105263158, "grad_norm": 2.40625, "grad_norm_var": 0.024887847900390624, "learning_rate": 0.0001, "loss": 3.1515, "loss/crossentropy": 2.325215721130371, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.24441724121570588, "loss/reg": 0.0, "step": 19560 }, { "epoch": 0.12875, "grad_norm": 2.40625, "grad_norm_var": 0.18791910807291667, "learning_rate": 0.0001, "loss": 3.1163, "loss/crossentropy": 2.4399210453033446, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.23570073395967484, "loss/reg": 0.0, "step": 19570 }, { "epoch": 0.12881578947368422, "grad_norm": 2.34375, "grad_norm_var": 0.0600250244140625, "learning_rate": 0.0001, "loss": 3.0731, "loss/crossentropy": 2.250354325771332, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.25124100893735885, "loss/reg": 0.0, "step": 19580 }, { "epoch": 0.12888157894736843, "grad_norm": 2.34375, "grad_norm_var": 0.2105865478515625, "learning_rate": 0.0001, "loss": 3.1494, "loss/crossentropy": 2.3004735589027403, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.250587160885334, "loss/reg": 0.0, "step": 19590 }, { "epoch": 0.12894736842105264, "grad_norm": 2.125, "grad_norm_var": 0.23645833333333333, "learning_rate": 0.0001, "loss": 3.099, "loss/crossentropy": 2.3559111833572386, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.26171091198921204, "loss/reg": 0.0, "step": 19600 }, { "epoch": 0.12901315789473683, "grad_norm": 2.328125, "grad_norm_var": 0.028955078125, "learning_rate": 0.0001, "loss": 3.171, "loss/crossentropy": 2.2754984378814695, "loss/hidden": 3.046875, "loss/incoh": 0.0, "loss/logits": 0.2695039168000221, "loss/reg": 0.0, "step": 19610 }, { "epoch": 0.12907894736842104, "grad_norm": 2.703125, "grad_norm_var": 0.12908426920572916, "learning_rate": 0.0001, "loss": 3.1609, "loss/crossentropy": 2.457037115097046, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.2523172840476036, "loss/reg": 0.0, "step": 19620 }, { "epoch": 0.12914473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.14287007649739583, "learning_rate": 0.0001, "loss": 3.0694, "loss/crossentropy": 2.33393777012825, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.23025162070989608, "loss/reg": 0.0, "step": 19630 }, { "epoch": 0.12921052631578947, "grad_norm": 2.125, "grad_norm_var": 0.029618326822916666, "learning_rate": 0.0001, "loss": 3.0684, "loss/crossentropy": 2.155827796459198, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.22635919600725174, "loss/reg": 0.0, "step": 19640 }, { "epoch": 0.12927631578947368, "grad_norm": 2.03125, "grad_norm_var": 0.31851806640625, "learning_rate": 0.0001, "loss": 3.112, "loss/crossentropy": 2.2350030899047852, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.2474326401948929, "loss/reg": 0.0, "step": 19650 }, { "epoch": 0.1293421052631579, "grad_norm": 2.453125, "grad_norm_var": 0.30481363932291666, "learning_rate": 0.0001, "loss": 3.0884, "loss/crossentropy": 2.1235520601272584, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.23340977281332015, "loss/reg": 0.0, "step": 19660 }, { "epoch": 0.1294078947368421, "grad_norm": 2.4375, "grad_norm_var": 0.058958943684895834, "learning_rate": 0.0001, "loss": 3.1092, "loss/crossentropy": 2.436754751205444, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.2396928071975708, "loss/reg": 0.0, "step": 19670 }, { "epoch": 0.12947368421052632, "grad_norm": 2.84375, "grad_norm_var": 0.2934804280598958, "learning_rate": 0.0001, "loss": 3.0531, "loss/crossentropy": 2.24703209400177, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.23143238127231597, "loss/reg": 0.0, "step": 19680 }, { "epoch": 0.12953947368421054, "grad_norm": 2.25, "grad_norm_var": 0.25420303344726564, "learning_rate": 0.0001, "loss": 3.0318, "loss/crossentropy": 2.536008381843567, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.22230196446180345, "loss/reg": 0.0, "step": 19690 }, { "epoch": 0.12960526315789472, "grad_norm": 2.21875, "grad_norm_var": 0.29778416951497394, "learning_rate": 0.0001, "loss": 3.0902, "loss/crossentropy": 2.1229737393558024, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.2103697349317372, "loss/reg": 0.0, "step": 19700 }, { "epoch": 0.12967105263157894, "grad_norm": 2.0625, "grad_norm_var": 0.09654032389322917, "learning_rate": 0.0001, "loss": 3.0303, "loss/crossentropy": 2.360739004611969, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.238627889752388, "loss/reg": 0.0, "step": 19710 }, { "epoch": 0.12973684210526315, "grad_norm": 2.234375, "grad_norm_var": 0.16044514973958332, "learning_rate": 0.0001, "loss": 3.082, "loss/crossentropy": 2.3643787026405336, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.2354188710451126, "loss/reg": 0.0, "step": 19720 }, { "epoch": 0.12980263157894736, "grad_norm": 2.375, "grad_norm_var": 0.23007405598958333, "learning_rate": 0.0001, "loss": 3.0884, "loss/crossentropy": 2.3766650319099427, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.26477697044610976, "loss/reg": 0.0, "step": 19730 }, { "epoch": 0.12986842105263158, "grad_norm": 2.21875, "grad_norm_var": 0.055946604410807295, "learning_rate": 0.0001, "loss": 3.0414, "loss/crossentropy": 2.6002285480499268, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.33312758058309555, "loss/reg": 0.0, "step": 19740 }, { "epoch": 0.1299342105263158, "grad_norm": 2.515625, "grad_norm_var": 0.13170547485351564, "learning_rate": 0.0001, "loss": 3.0758, "loss/crossentropy": 2.457702159881592, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.25578114166855814, "loss/reg": 0.0, "step": 19750 }, { "epoch": 0.13, "grad_norm": 2.65625, "grad_norm_var": 0.03232014973958333, "learning_rate": 0.0001, "loss": 3.0882, "loss/crossentropy": 2.261428934335709, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.2278106167912483, "loss/reg": 0.0, "step": 19760 }, { "epoch": 0.13006578947368422, "grad_norm": 2.421875, "grad_norm_var": 0.022248331705729166, "learning_rate": 0.0001, "loss": 3.0582, "loss/crossentropy": 2.060434710979462, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.24812956005334855, "loss/reg": 0.0, "step": 19770 }, { "epoch": 0.13013157894736843, "grad_norm": 2.40625, "grad_norm_var": 0.0256256103515625, "learning_rate": 0.0001, "loss": 3.08, "loss/crossentropy": 2.2472903966903686, "loss/hidden": 3.1, "loss/incoh": 0.0, "loss/logits": 0.34347400814294815, "loss/reg": 0.0, "step": 19780 }, { "epoch": 0.13019736842105264, "grad_norm": 2.265625, "grad_norm_var": 0.0520660400390625, "learning_rate": 0.0001, "loss": 3.1154, "loss/crossentropy": 2.3003188014030456, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2695119082927704, "loss/reg": 0.0, "step": 19790 }, { "epoch": 0.13026315789473683, "grad_norm": 2.578125, "grad_norm_var": 0.08396809895833333, "learning_rate": 0.0001, "loss": 3.0827, "loss/crossentropy": 2.2551465153694155, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.24707757085561752, "loss/reg": 0.0, "step": 19800 }, { "epoch": 0.13032894736842104, "grad_norm": 2.65625, "grad_norm_var": 0.15972900390625, "learning_rate": 0.0001, "loss": 3.151, "loss/crossentropy": 2.323357033729553, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.2660538278520107, "loss/reg": 0.0, "step": 19810 }, { "epoch": 0.13039473684210526, "grad_norm": 2.546875, "grad_norm_var": 0.15328776041666667, "learning_rate": 0.0001, "loss": 3.0792, "loss/crossentropy": 2.369428300857544, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2439693480730057, "loss/reg": 0.0, "step": 19820 }, { "epoch": 0.13046052631578947, "grad_norm": 2.328125, "grad_norm_var": 0.09102274576822916, "learning_rate": 0.0001, "loss": 3.1636, "loss/crossentropy": 2.2469497442245485, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.24479606077075006, "loss/reg": 0.0, "step": 19830 }, { "epoch": 0.13052631578947368, "grad_norm": 2.625, "grad_norm_var": 0.052302042643229164, "learning_rate": 0.0001, "loss": 3.1694, "loss/crossentropy": 2.4753658294677736, "loss/hidden": 3.0390625, "loss/incoh": 0.0, "loss/logits": 0.3045656159520149, "loss/reg": 0.0, "step": 19840 }, { "epoch": 0.1305921052631579, "grad_norm": 2.265625, "grad_norm_var": 0.0764068603515625, "learning_rate": 0.0001, "loss": 3.0959, "loss/crossentropy": 2.4656317472457885, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.258389513194561, "loss/reg": 0.0, "step": 19850 }, { "epoch": 0.1306578947368421, "grad_norm": 2.515625, "grad_norm_var": 0.06215718587239583, "learning_rate": 0.0001, "loss": 3.0406, "loss/crossentropy": 2.440659189224243, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.26817646920681, "loss/reg": 0.0, "step": 19860 }, { "epoch": 0.13072368421052633, "grad_norm": 2.296875, "grad_norm_var": 0.04420166015625, "learning_rate": 0.0001, "loss": 3.0968, "loss/crossentropy": 2.206334137916565, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.24976521283388137, "loss/reg": 0.0, "step": 19870 }, { "epoch": 0.13078947368421054, "grad_norm": 2.109375, "grad_norm_var": 0.08323567708333333, "learning_rate": 0.0001, "loss": 3.0685, "loss/crossentropy": 2.21451940536499, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.21932003498077393, "loss/reg": 0.0, "step": 19880 }, { "epoch": 0.13085526315789472, "grad_norm": 2.84375, "grad_norm_var": 0.17522786458333334, "learning_rate": 0.0001, "loss": 3.0628, "loss/crossentropy": 2.1315925240516664, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.23312209993600846, "loss/reg": 0.0, "step": 19890 }, { "epoch": 0.13092105263157894, "grad_norm": 6.28125, "grad_norm_var": 0.9668528238932291, "learning_rate": 0.0001, "loss": 3.1207, "loss/crossentropy": 2.5357746481895447, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2542704403400421, "loss/reg": 0.0, "step": 19900 }, { "epoch": 0.13098684210526315, "grad_norm": 2.25, "grad_norm_var": 0.9971913655598958, "learning_rate": 0.0001, "loss": 3.0247, "loss/crossentropy": 2.53050377368927, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2318735808134079, "loss/reg": 0.0, "step": 19910 }, { "epoch": 0.13105263157894737, "grad_norm": 2.78125, "grad_norm_var": 0.14268290201822917, "learning_rate": 0.0001, "loss": 3.1545, "loss/crossentropy": 2.475492477416992, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.29062798619270325, "loss/reg": 0.0, "step": 19920 }, { "epoch": 0.13111842105263158, "grad_norm": 2.390625, "grad_norm_var": 0.08727925618489583, "learning_rate": 0.0001, "loss": 3.1316, "loss/crossentropy": 2.4628885269165037, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.24764630049467087, "loss/reg": 0.0, "step": 19930 }, { "epoch": 0.1311842105263158, "grad_norm": 2.171875, "grad_norm_var": 0.07839736938476563, "learning_rate": 0.0001, "loss": 3.0597, "loss/crossentropy": 2.46280722618103, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2163504734635353, "loss/reg": 0.0, "step": 19940 }, { "epoch": 0.13125, "grad_norm": 2.34375, "grad_norm_var": 0.0990875244140625, "learning_rate": 0.0001, "loss": 3.1648, "loss/crossentropy": 2.4153407394886015, "loss/hidden": 3.0875, "loss/incoh": 0.0, "loss/logits": 0.26917385756969453, "loss/reg": 0.0, "step": 19950 }, { "epoch": 0.13131578947368422, "grad_norm": 2.8125, "grad_norm_var": 0.18907877604166667, "learning_rate": 0.0001, "loss": 3.1173, "loss/crossentropy": 2.232962656021118, "loss/hidden": 3.0734375, "loss/incoh": 0.0, "loss/logits": 0.3076439991593361, "loss/reg": 0.0, "step": 19960 }, { "epoch": 0.13138157894736843, "grad_norm": 2.3125, "grad_norm_var": 0.13981119791666666, "learning_rate": 0.0001, "loss": 3.0478, "loss/crossentropy": 2.353410243988037, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.24693673849105835, "loss/reg": 0.0, "step": 19970 }, { "epoch": 0.13144736842105262, "grad_norm": 3.15625, "grad_norm_var": 0.1372711181640625, "learning_rate": 0.0001, "loss": 3.1452, "loss/crossentropy": 2.121303880214691, "loss/hidden": 3.125, "loss/incoh": 0.0, "loss/logits": 0.31650226265192033, "loss/reg": 0.0, "step": 19980 }, { "epoch": 0.13151315789473683, "grad_norm": 2.25, "grad_norm_var": 0.1741607666015625, "learning_rate": 0.0001, "loss": 3.1028, "loss/crossentropy": 2.43160719871521, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.2277207463979721, "loss/reg": 0.0, "step": 19990 }, { "epoch": 0.13157894736842105, "grad_norm": 2.140625, "grad_norm_var": 0.15798238118489583, "learning_rate": 0.0001, "loss": 3.0532, "loss/crossentropy": 2.2883418917655947, "loss/hidden": 2.990625, "loss/incoh": 0.0, "loss/logits": 0.2525527849793434, "loss/reg": 0.0, "step": 20000 }, { "epoch": 0.13164473684210526, "grad_norm": 2.28125, "grad_norm_var": 0.10852457682291666, "learning_rate": 0.0001, "loss": 3.0348, "loss/crossentropy": 2.369931137561798, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.22242200672626494, "loss/reg": 0.0, "step": 20010 }, { "epoch": 0.13171052631578947, "grad_norm": 2.125, "grad_norm_var": 0.0374664306640625, "learning_rate": 0.0001, "loss": 3.0316, "loss/crossentropy": 2.1939921617507934, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.234733884036541, "loss/reg": 0.0, "step": 20020 }, { "epoch": 0.13177631578947369, "grad_norm": 2.515625, "grad_norm_var": 0.07238667805989583, "learning_rate": 0.0001, "loss": 3.1278, "loss/crossentropy": 2.4681461334228514, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.30574188083410264, "loss/reg": 0.0, "step": 20030 }, { "epoch": 0.1318421052631579, "grad_norm": 2.265625, "grad_norm_var": 0.07011311848958333, "learning_rate": 0.0001, "loss": 3.0111, "loss/crossentropy": 2.699459183216095, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.27421810775995253, "loss/reg": 0.0, "step": 20040 }, { "epoch": 0.1319078947368421, "grad_norm": 2.546875, "grad_norm_var": 41.75862528483073, "learning_rate": 0.0001, "loss": 3.202, "loss/crossentropy": 2.263700020313263, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.23062770962715148, "loss/reg": 0.0, "step": 20050 }, { "epoch": 0.13197368421052633, "grad_norm": 2.546875, "grad_norm_var": 0.08103841145833333, "learning_rate": 0.0001, "loss": 3.1056, "loss/crossentropy": 2.4495514154434206, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.2597358673810959, "loss/reg": 0.0, "step": 20060 }, { "epoch": 0.13203947368421054, "grad_norm": 2.21875, "grad_norm_var": 0.10988667805989584, "learning_rate": 0.0001, "loss": 3.1429, "loss/crossentropy": 2.5027857065200805, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.23284846246242524, "loss/reg": 0.0, "step": 20070 }, { "epoch": 0.13210526315789473, "grad_norm": 2.8125, "grad_norm_var": 0.4439442952473958, "learning_rate": 0.0001, "loss": 3.1051, "loss/crossentropy": 2.3987682700157165, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.2284790888428688, "loss/reg": 0.0, "step": 20080 }, { "epoch": 0.13217105263157894, "grad_norm": 4.9375, "grad_norm_var": 0.8553456624348958, "learning_rate": 0.0001, "loss": 3.1699, "loss/crossentropy": 2.415048587322235, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2737443670630455, "loss/reg": 0.0, "step": 20090 }, { "epoch": 0.13223684210526315, "grad_norm": 2.40625, "grad_norm_var": 0.5597005208333333, "learning_rate": 0.0001, "loss": 3.1123, "loss/crossentropy": 2.42084618806839, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.26620708405971527, "loss/reg": 0.0, "step": 20100 }, { "epoch": 0.13230263157894737, "grad_norm": 2.125, "grad_norm_var": 0.022086588541666667, "learning_rate": 0.0001, "loss": 3.0069, "loss/crossentropy": 2.4669047951698304, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.22731948494911194, "loss/reg": 0.0, "step": 20110 }, { "epoch": 0.13236842105263158, "grad_norm": 2.703125, "grad_norm_var": 0.6456451416015625, "learning_rate": 0.0001, "loss": 3.1966, "loss/crossentropy": 2.344004726409912, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.2402897983789444, "loss/reg": 0.0, "step": 20120 }, { "epoch": 0.1324342105263158, "grad_norm": 2.046875, "grad_norm_var": 1.062083943684896, "learning_rate": 0.0001, "loss": 3.1146, "loss/crossentropy": 2.375274932384491, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.22684457302093505, "loss/reg": 0.0, "step": 20130 }, { "epoch": 0.1325, "grad_norm": 2.265625, "grad_norm_var": 0.046076456705729164, "learning_rate": 0.0001, "loss": 3.0369, "loss/crossentropy": 2.366211712360382, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.24463532716035843, "loss/reg": 0.0, "step": 20140 }, { "epoch": 0.13256578947368422, "grad_norm": 1.9375, "grad_norm_var": 0.12783915201822918, "learning_rate": 0.0001, "loss": 3.1105, "loss/crossentropy": 2.384170186519623, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.28617204576730726, "loss/reg": 0.0, "step": 20150 }, { "epoch": 0.13263157894736843, "grad_norm": 2.171875, "grad_norm_var": 0.5388671875, "learning_rate": 0.0001, "loss": 3.13, "loss/crossentropy": 2.37620815038681, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.22599491178989412, "loss/reg": 0.0, "step": 20160 }, { "epoch": 0.13269736842105262, "grad_norm": 2.265625, "grad_norm_var": 0.5671875, "learning_rate": 0.0001, "loss": 3.0816, "loss/crossentropy": 2.3014034748077394, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.21943795531988144, "loss/reg": 0.0, "step": 20170 }, { "epoch": 0.13276315789473683, "grad_norm": 2.4375, "grad_norm_var": 0.11660868326822917, "learning_rate": 0.0001, "loss": 3.1416, "loss/crossentropy": 2.619466185569763, "loss/hidden": 3.190625, "loss/incoh": 0.0, "loss/logits": 0.3190195769071579, "loss/reg": 0.0, "step": 20180 }, { "epoch": 0.13282894736842105, "grad_norm": 3.296875, "grad_norm_var": 0.15591532389322918, "learning_rate": 0.0001, "loss": 3.128, "loss/crossentropy": 2.3887166023254394, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.27821466475725176, "loss/reg": 0.0, "step": 20190 }, { "epoch": 0.13289473684210526, "grad_norm": 2.1875, "grad_norm_var": 0.6188639322916667, "learning_rate": 0.0001, "loss": 3.0968, "loss/crossentropy": 2.49861718416214, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2538172617554665, "loss/reg": 0.0, "step": 20200 }, { "epoch": 0.13296052631578947, "grad_norm": 2.34375, "grad_norm_var": 0.20885416666666667, "learning_rate": 0.0001, "loss": 3.1671, "loss/crossentropy": 2.220921754837036, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.23103244453668595, "loss/reg": 0.0, "step": 20210 }, { "epoch": 0.1330263157894737, "grad_norm": 2.625, "grad_norm_var": 0.18879801432291668, "learning_rate": 0.0001, "loss": 3.1111, "loss/crossentropy": 2.333580756187439, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.2729429990053177, "loss/reg": 0.0, "step": 20220 }, { "epoch": 0.1330921052631579, "grad_norm": 2.171875, "grad_norm_var": 0.0948883056640625, "learning_rate": 0.0001, "loss": 3.0829, "loss/crossentropy": 2.3158997893333435, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.19884210526943208, "loss/reg": 0.0, "step": 20230 }, { "epoch": 0.13315789473684211, "grad_norm": 2.375, "grad_norm_var": 0.20250244140625, "learning_rate": 0.0001, "loss": 3.0896, "loss/crossentropy": 2.369309663772583, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.26042123287916186, "loss/reg": 0.0, "step": 20240 }, { "epoch": 0.13322368421052633, "grad_norm": 3.015625, "grad_norm_var": 0.17239176432291667, "learning_rate": 0.0001, "loss": 3.1863, "loss/crossentropy": 2.2114102065563204, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.21532374620437622, "loss/reg": 0.0, "step": 20250 }, { "epoch": 0.1332894736842105, "grad_norm": 2.328125, "grad_norm_var": 0.33000869750976564, "learning_rate": 0.0001, "loss": 3.0896, "loss/crossentropy": 2.4173696994781495, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.2683261051774025, "loss/reg": 0.0, "step": 20260 }, { "epoch": 0.13335526315789473, "grad_norm": 2.875, "grad_norm_var": 0.10729548136393229, "learning_rate": 0.0001, "loss": 3.1426, "loss/crossentropy": 2.3808977246284484, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.3049054339528084, "loss/reg": 0.0, "step": 20270 }, { "epoch": 0.13342105263157894, "grad_norm": 2.484375, "grad_norm_var": 0.1077789306640625, "learning_rate": 0.0001, "loss": 3.1685, "loss/crossentropy": 2.244146704673767, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2616411089897156, "loss/reg": 0.0, "step": 20280 }, { "epoch": 0.13348684210526315, "grad_norm": 2.15625, "grad_norm_var": 0.1076171875, "learning_rate": 0.0001, "loss": 3.1354, "loss/crossentropy": 2.145916444063187, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.21995762139558792, "loss/reg": 0.0, "step": 20290 }, { "epoch": 0.13355263157894737, "grad_norm": 2.34375, "grad_norm_var": 0.08604227701822917, "learning_rate": 0.0001, "loss": 3.0957, "loss/crossentropy": 2.5188711285591125, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.264768448472023, "loss/reg": 0.0, "step": 20300 }, { "epoch": 0.13361842105263158, "grad_norm": 2.5625, "grad_norm_var": 0.3717274983723958, "learning_rate": 0.0001, "loss": 3.0886, "loss/crossentropy": 2.392267715930939, "loss/hidden": 3.1953125, "loss/incoh": 0.0, "loss/logits": 0.3307821795344353, "loss/reg": 0.0, "step": 20310 }, { "epoch": 0.1336842105263158, "grad_norm": 2.46875, "grad_norm_var": 0.39180399576822916, "learning_rate": 0.0001, "loss": 3.1034, "loss/crossentropy": 2.2370328307151794, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.20846432447433472, "loss/reg": 0.0, "step": 20320 }, { "epoch": 0.13375, "grad_norm": 2.515625, "grad_norm_var": 0.136962890625, "learning_rate": 0.0001, "loss": 3.1123, "loss/crossentropy": 2.358556866645813, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.264502127468586, "loss/reg": 0.0, "step": 20330 }, { "epoch": 0.13381578947368422, "grad_norm": 2.25, "grad_norm_var": 0.04733072916666667, "learning_rate": 0.0001, "loss": 3.1698, "loss/crossentropy": 2.302602219581604, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2898910105228424, "loss/reg": 0.0, "step": 20340 }, { "epoch": 0.13388157894736843, "grad_norm": 2.75, "grad_norm_var": 0.07590230305989583, "learning_rate": 0.0001, "loss": 3.155, "loss/crossentropy": 2.2594828605651855, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.2617498949170113, "loss/reg": 0.0, "step": 20350 }, { "epoch": 0.13394736842105262, "grad_norm": 2.421875, "grad_norm_var": 0.1132476806640625, "learning_rate": 0.0001, "loss": 3.1443, "loss/crossentropy": 2.6042701482772825, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.2813595399260521, "loss/reg": 0.0, "step": 20360 }, { "epoch": 0.13401315789473683, "grad_norm": 2.265625, "grad_norm_var": 3.111881782679394e+17, "learning_rate": 0.0001, "loss": 3.1969, "loss/crossentropy": 2.596519351005554, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.24256999790668488, "loss/reg": 0.0, "step": 20370 }, { "epoch": 0.13407894736842105, "grad_norm": 2.515625, "grad_norm_var": 3.111881782603852e+17, "learning_rate": 0.0001, "loss": 3.1352, "loss/crossentropy": 2.1411483764648436, "loss/hidden": 3.09375, "loss/incoh": 0.0, "loss/logits": 0.2837516859173775, "loss/reg": 0.0, "step": 20380 }, { "epoch": 0.13414473684210526, "grad_norm": 7.5, "grad_norm_var": 1.6672810872395833, "learning_rate": 0.0001, "loss": 3.139, "loss/crossentropy": 2.224585199356079, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.24173519760370255, "loss/reg": 0.0, "step": 20390 }, { "epoch": 0.13421052631578947, "grad_norm": 2.03125, "grad_norm_var": 1.8035634358723958, "learning_rate": 0.0001, "loss": 3.0216, "loss/crossentropy": 2.153792452812195, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.19605738371610643, "loss/reg": 0.0, "step": 20400 }, { "epoch": 0.1342763157894737, "grad_norm": 2.703125, "grad_norm_var": 0.18290913899739583, "learning_rate": 0.0001, "loss": 3.1134, "loss/crossentropy": 2.3031864166259766, "loss/hidden": 3.05, "loss/incoh": 0.0, "loss/logits": 0.26905038952827454, "loss/reg": 0.0, "step": 20410 }, { "epoch": 0.1343421052631579, "grad_norm": 2.5625, "grad_norm_var": 0.04761454264322917, "learning_rate": 0.0001, "loss": 3.0931, "loss/crossentropy": 2.250312161445618, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2365841895341873, "loss/reg": 0.0, "step": 20420 }, { "epoch": 0.13440789473684212, "grad_norm": 2.15625, "grad_norm_var": 0.06604410807291666, "learning_rate": 0.0001, "loss": 3.0394, "loss/crossentropy": 2.1318182408809663, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.2652772217988968, "loss/reg": 0.0, "step": 20430 }, { "epoch": 0.13447368421052633, "grad_norm": 2.21875, "grad_norm_var": 0.054585774739583336, "learning_rate": 0.0001, "loss": 2.9721, "loss/crossentropy": 2.237068510055542, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.21526216119527816, "loss/reg": 0.0, "step": 20440 }, { "epoch": 0.13453947368421051, "grad_norm": 2.203125, "grad_norm_var": 0.05564676920572917, "learning_rate": 0.0001, "loss": 3.0973, "loss/crossentropy": 2.0180070281028746, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.22148093655705453, "loss/reg": 0.0, "step": 20450 }, { "epoch": 0.13460526315789473, "grad_norm": 2.296875, "grad_norm_var": 0.058821614583333334, "learning_rate": 0.0001, "loss": 3.176, "loss/crossentropy": 2.1725693702697755, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.26163683980703356, "loss/reg": 0.0, "step": 20460 }, { "epoch": 0.13467105263157894, "grad_norm": 2.5625, "grad_norm_var": 0.06783625284830729, "learning_rate": 0.0001, "loss": 3.0549, "loss/crossentropy": 1.9785831272602081, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.21865816414356232, "loss/reg": 0.0, "step": 20470 }, { "epoch": 0.13473684210526315, "grad_norm": 2.859375, "grad_norm_var": 0.12560806274414063, "learning_rate": 0.0001, "loss": 3.1254, "loss/crossentropy": 2.316719186306, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.23244183510541916, "loss/reg": 0.0, "step": 20480 }, { "epoch": 0.13480263157894737, "grad_norm": 2.328125, "grad_norm_var": 0.1337554931640625, "learning_rate": 0.0001, "loss": 3.0912, "loss/crossentropy": 2.2532522082328796, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.2548024535179138, "loss/reg": 0.0, "step": 20490 }, { "epoch": 0.13486842105263158, "grad_norm": 2.203125, "grad_norm_var": 0.05217692057291667, "learning_rate": 0.0001, "loss": 3.0455, "loss/crossentropy": 2.3155321717262267, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.26803631633520125, "loss/reg": 0.0, "step": 20500 }, { "epoch": 0.1349342105263158, "grad_norm": 2.296875, "grad_norm_var": 0.4710896809895833, "learning_rate": 0.0001, "loss": 3.1847, "loss/crossentropy": 2.4292261719703676, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.26074815839529036, "loss/reg": 0.0, "step": 20510 }, { "epoch": 0.135, "grad_norm": 2.28125, "grad_norm_var": 0.06902567545572917, "learning_rate": 0.0001, "loss": 3.1187, "loss/crossentropy": 2.311373507976532, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.2725979134440422, "loss/reg": 0.0, "step": 20520 }, { "epoch": 0.13506578947368422, "grad_norm": 2.453125, "grad_norm_var": 0.28644917805989584, "learning_rate": 0.0001, "loss": 3.0904, "loss/crossentropy": 1.8008847087621689, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2606606900691986, "loss/reg": 0.0, "step": 20530 }, { "epoch": 0.1351315789473684, "grad_norm": 2.125, "grad_norm_var": 0.06108296712239583, "learning_rate": 0.0001, "loss": 3.1547, "loss/crossentropy": 2.0716704607009886, "loss/hidden": 2.9484375, "loss/incoh": 0.0, "loss/logits": 0.21668420732021332, "loss/reg": 0.0, "step": 20540 }, { "epoch": 0.13519736842105262, "grad_norm": 2.84375, "grad_norm_var": 0.35935872395833335, "learning_rate": 0.0001, "loss": 3.1812, "loss/crossentropy": 2.343623089790344, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.25094970166683195, "loss/reg": 0.0, "step": 20550 }, { "epoch": 0.13526315789473684, "grad_norm": 2.609375, "grad_norm_var": 0.20611979166666666, "learning_rate": 0.0001, "loss": 3.1019, "loss/crossentropy": 2.1298214733600616, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.3111059933900833, "loss/reg": 0.0, "step": 20560 }, { "epoch": 0.13532894736842105, "grad_norm": 2.5, "grad_norm_var": 0.07794596354166666, "learning_rate": 0.0001, "loss": 3.0833, "loss/crossentropy": 2.1468781232833862, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.242007839679718, "loss/reg": 0.0, "step": 20570 }, { "epoch": 0.13539473684210526, "grad_norm": 2.4375, "grad_norm_var": 0.3861223856608073, "learning_rate": 0.0001, "loss": 3.056, "loss/crossentropy": 2.0755307257175444, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.21787920594215393, "loss/reg": 0.0, "step": 20580 }, { "epoch": 0.13546052631578948, "grad_norm": 2.09375, "grad_norm_var": 0.048713938395182295, "learning_rate": 0.0001, "loss": 3.0617, "loss/crossentropy": 2.167508864402771, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.2543604165315628, "loss/reg": 0.0, "step": 20590 }, { "epoch": 0.1355263157894737, "grad_norm": 4.40625, "grad_norm_var": 0.31427408854166666, "learning_rate": 0.0001, "loss": 3.0881, "loss/crossentropy": 2.1288257122039793, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.2275502547621727, "loss/reg": 0.0, "step": 20600 }, { "epoch": 0.1355921052631579, "grad_norm": 2.59375, "grad_norm_var": 0.3212257385253906, "learning_rate": 0.0001, "loss": 3.0587, "loss/crossentropy": 2.373918867111206, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.25912273228168486, "loss/reg": 0.0, "step": 20610 }, { "epoch": 0.13565789473684212, "grad_norm": 2.671875, "grad_norm_var": 0.05368245442708333, "learning_rate": 0.0001, "loss": 3.0912, "loss/crossentropy": 2.0399203658103944, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.23557177633047105, "loss/reg": 0.0, "step": 20620 }, { "epoch": 0.1357236842105263, "grad_norm": 2.703125, "grad_norm_var": 0.11100031534830729, "learning_rate": 0.0001, "loss": 3.1051, "loss/crossentropy": 2.5496551632881164, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.2645132452249527, "loss/reg": 0.0, "step": 20630 }, { "epoch": 0.13578947368421052, "grad_norm": 2.359375, "grad_norm_var": 0.3220855712890625, "learning_rate": 0.0001, "loss": 3.0665, "loss/crossentropy": 2.226586413383484, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.22172508686780928, "loss/reg": 0.0, "step": 20640 }, { "epoch": 0.13585526315789473, "grad_norm": 2.578125, "grad_norm_var": 0.2806925455729167, "learning_rate": 0.0001, "loss": 3.1914, "loss/crossentropy": 2.2754149079322814, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.27330992817878724, "loss/reg": 0.0, "step": 20650 }, { "epoch": 0.13592105263157894, "grad_norm": 2.34375, "grad_norm_var": 0.05778172810872396, "learning_rate": 0.0001, "loss": 3.1347, "loss/crossentropy": 2.1621429324150085, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2567806988954544, "loss/reg": 0.0, "step": 20660 }, { "epoch": 0.13598684210526316, "grad_norm": 2.375, "grad_norm_var": 0.2951637268066406, "learning_rate": 0.0001, "loss": 3.0649, "loss/crossentropy": 2.342224645614624, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.2490961804986, "loss/reg": 0.0, "step": 20670 }, { "epoch": 0.13605263157894737, "grad_norm": 2.109375, "grad_norm_var": 0.27790425618489584, "learning_rate": 0.0001, "loss": 3.1081, "loss/crossentropy": 2.522631120681763, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.28559967428445815, "loss/reg": 0.0, "step": 20680 }, { "epoch": 0.13611842105263158, "grad_norm": 2.03125, "grad_norm_var": 0.07203369140625, "learning_rate": 0.0001, "loss": 3.0501, "loss/crossentropy": 2.1872968673706055, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.21910873502492906, "loss/reg": 0.0, "step": 20690 }, { "epoch": 0.1361842105263158, "grad_norm": 2.40625, "grad_norm_var": 0.06353251139322917, "learning_rate": 0.0001, "loss": 3.0469, "loss/crossentropy": 2.0313059210777284, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.21065909266471863, "loss/reg": 0.0, "step": 20700 }, { "epoch": 0.13625, "grad_norm": 2.1875, "grad_norm_var": 0.12810872395833334, "learning_rate": 0.0001, "loss": 3.1641, "loss/crossentropy": 1.9590769171714784, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2205381214618683, "loss/reg": 0.0, "step": 20710 }, { "epoch": 0.13631578947368422, "grad_norm": 2.625, "grad_norm_var": 0.039526112874348956, "learning_rate": 0.0001, "loss": 3.0038, "loss/crossentropy": 2.181920811533928, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.20574837550520897, "loss/reg": 0.0, "step": 20720 }, { "epoch": 0.1363815789473684, "grad_norm": 2.1875, "grad_norm_var": 0.04170710245768229, "learning_rate": 0.0001, "loss": 3.0758, "loss/crossentropy": 2.5629841327667235, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.25942468345165254, "loss/reg": 0.0, "step": 20730 }, { "epoch": 0.13644736842105262, "grad_norm": 2.28125, "grad_norm_var": 0.04888916015625, "learning_rate": 0.0001, "loss": 3.1239, "loss/crossentropy": 2.5672937512397764, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.2589055135846138, "loss/reg": 0.0, "step": 20740 }, { "epoch": 0.13651315789473684, "grad_norm": 2.453125, "grad_norm_var": 0.051558430989583334, "learning_rate": 0.0001, "loss": 3.1031, "loss/crossentropy": 2.598326253890991, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.2481001317501068, "loss/reg": 0.0, "step": 20750 }, { "epoch": 0.13657894736842105, "grad_norm": 2.265625, "grad_norm_var": 0.08925374348958333, "learning_rate": 0.0001, "loss": 3.0563, "loss/crossentropy": 2.403074288368225, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.25444435328245163, "loss/reg": 0.0, "step": 20760 }, { "epoch": 0.13664473684210526, "grad_norm": 2.453125, "grad_norm_var": 0.33211161295572916, "learning_rate": 0.0001, "loss": 3.1043, "loss/crossentropy": 2.161120080947876, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.2631052315235138, "loss/reg": 0.0, "step": 20770 }, { "epoch": 0.13671052631578948, "grad_norm": 2.203125, "grad_norm_var": 0.2020416259765625, "learning_rate": 0.0001, "loss": 3.0514, "loss/crossentropy": 2.601893973350525, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23912052661180497, "loss/reg": 0.0, "step": 20780 }, { "epoch": 0.1367763157894737, "grad_norm": 2.28125, "grad_norm_var": 0.11573893229166667, "learning_rate": 0.0001, "loss": 3.0923, "loss/crossentropy": 2.3733190536499023, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.21903542578220367, "loss/reg": 0.0, "step": 20790 }, { "epoch": 0.1368421052631579, "grad_norm": 2.59375, "grad_norm_var": 0.08520406087239583, "learning_rate": 0.0001, "loss": 3.1174, "loss/crossentropy": 2.344668173789978, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2994943633675575, "loss/reg": 0.0, "step": 20800 }, { "epoch": 0.13690789473684212, "grad_norm": 2.421875, "grad_norm_var": 3.350255903434211e+17, "learning_rate": 0.0001, "loss": 3.2853, "loss/crossentropy": 2.1936369478702544, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2379646047949791, "loss/reg": 0.0, "step": 20810 }, { "epoch": 0.1369736842105263, "grad_norm": 2.125, "grad_norm_var": 0.025055948893229166, "learning_rate": 0.0001, "loss": 3.103, "loss/crossentropy": 2.2369776248931883, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2727003887295723, "loss/reg": 0.0, "step": 20820 }, { "epoch": 0.13703947368421052, "grad_norm": 2.5625, "grad_norm_var": 0.033788045247395836, "learning_rate": 0.0001, "loss": 3.0878, "loss/crossentropy": 2.2766017496585844, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.21165435910224914, "loss/reg": 0.0, "step": 20830 }, { "epoch": 0.13710526315789473, "grad_norm": 5.84375, "grad_norm_var": 0.75269775390625, "learning_rate": 0.0001, "loss": 3.1106, "loss/crossentropy": 2.123941707611084, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2290189027786255, "loss/reg": 0.0, "step": 20840 }, { "epoch": 0.13717105263157894, "grad_norm": 2.234375, "grad_norm_var": 0.7671132405598958, "learning_rate": 0.0001, "loss": 3.0839, "loss/crossentropy": 2.239221286773682, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.2762044548988342, "loss/reg": 0.0, "step": 20850 }, { "epoch": 0.13723684210526316, "grad_norm": 2.375, "grad_norm_var": 0.1312896728515625, "learning_rate": 0.0001, "loss": 3.1207, "loss/crossentropy": 2.4664002418518067, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.2304532825946808, "loss/reg": 0.0, "step": 20860 }, { "epoch": 0.13730263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.1725847880045573, "learning_rate": 0.0001, "loss": 3.1318, "loss/crossentropy": 2.3205449104309084, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.22169369757175444, "loss/reg": 0.0, "step": 20870 }, { "epoch": 0.13736842105263158, "grad_norm": 2.375, "grad_norm_var": 0.046533203125, "learning_rate": 0.0001, "loss": 3.1764, "loss/crossentropy": 2.2292275547981264, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.26578798294067385, "loss/reg": 0.0, "step": 20880 }, { "epoch": 0.1374342105263158, "grad_norm": 2.21875, "grad_norm_var": 0.013053385416666667, "learning_rate": 0.0001, "loss": 3.0906, "loss/crossentropy": 2.4730883955955507, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2600066691637039, "loss/reg": 0.0, "step": 20890 }, { "epoch": 0.1375, "grad_norm": 2.78125, "grad_norm_var": 0.03638916015625, "learning_rate": 0.0001, "loss": 3.1146, "loss/crossentropy": 2.389441525936127, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.2204804763197899, "loss/reg": 0.0, "step": 20900 }, { "epoch": 0.1375657894736842, "grad_norm": 2.015625, "grad_norm_var": 0.05347391764322917, "learning_rate": 0.0001, "loss": 3.1005, "loss/crossentropy": 2.182996892929077, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.23823060542345048, "loss/reg": 0.0, "step": 20910 }, { "epoch": 0.1376315789473684, "grad_norm": 2.328125, "grad_norm_var": 0.0274810791015625, "learning_rate": 0.0001, "loss": 3.1323, "loss/crossentropy": 2.3811920285224915, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.22427577376365662, "loss/reg": 0.0, "step": 20920 }, { "epoch": 0.13769736842105262, "grad_norm": 2.234375, "grad_norm_var": 0.0668609619140625, "learning_rate": 0.0001, "loss": 3.1591, "loss/crossentropy": 2.106644082069397, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.27698966562747956, "loss/reg": 0.0, "step": 20930 }, { "epoch": 0.13776315789473684, "grad_norm": 2.53125, "grad_norm_var": 0.07274983723958334, "learning_rate": 0.0001, "loss": 3.165, "loss/crossentropy": 2.3118181228637695, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.30020255893468856, "loss/reg": 0.0, "step": 20940 }, { "epoch": 0.13782894736842105, "grad_norm": 2.21875, "grad_norm_var": 0.025804646809895835, "learning_rate": 0.0001, "loss": 3.1041, "loss/crossentropy": 2.429260182380676, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.2653680741786957, "loss/reg": 0.0, "step": 20950 }, { "epoch": 0.13789473684210526, "grad_norm": 3.25, "grad_norm_var": 0.0960528055826823, "learning_rate": 0.0001, "loss": 3.0853, "loss/crossentropy": 1.888709381222725, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.21381159387528897, "loss/reg": 0.0, "step": 20960 }, { "epoch": 0.13796052631578948, "grad_norm": 2.140625, "grad_norm_var": 0.09798355102539062, "learning_rate": 0.0001, "loss": 3.1087, "loss/crossentropy": 2.210513544082642, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.24400804787874222, "loss/reg": 0.0, "step": 20970 }, { "epoch": 0.1380263157894737, "grad_norm": 2.0, "grad_norm_var": 0.1358062744140625, "learning_rate": 0.0001, "loss": 3.1309, "loss/crossentropy": 2.3269863963127135, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.25037053376436236, "loss/reg": 0.0, "step": 20980 }, { "epoch": 0.1380921052631579, "grad_norm": 2.03125, "grad_norm_var": 0.13828125, "learning_rate": 0.0001, "loss": 3.103, "loss/crossentropy": 2.4014554262161254, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.22934938669204713, "loss/reg": 0.0, "step": 20990 }, { "epoch": 0.13815789473684212, "grad_norm": 2.34375, "grad_norm_var": 0.0584381103515625, "learning_rate": 0.0001, "loss": 3.1088, "loss/crossentropy": 2.208330225944519, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.23984504938125611, "loss/reg": 0.0, "step": 21000 }, { "epoch": 0.1382236842105263, "grad_norm": 1.9296875, "grad_norm_var": 0.1774614969889323, "learning_rate": 0.0001, "loss": 3.0588, "loss/crossentropy": 2.496232438087463, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.29541969746351243, "loss/reg": 0.0, "step": 21010 }, { "epoch": 0.13828947368421052, "grad_norm": 2.46875, "grad_norm_var": 0.4096514383951823, "learning_rate": 0.0001, "loss": 3.1337, "loss/crossentropy": 2.167738914489746, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.23880672752857207, "loss/reg": 0.0, "step": 21020 }, { "epoch": 0.13835526315789473, "grad_norm": 2.34375, "grad_norm_var": 0.06155192057291667, "learning_rate": 0.0001, "loss": 3.1325, "loss/crossentropy": 2.1132714807987214, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.227035228908062, "loss/reg": 0.0, "step": 21030 }, { "epoch": 0.13842105263157894, "grad_norm": 2.3125, "grad_norm_var": 0.09390360514322917, "learning_rate": 0.0001, "loss": 3.1269, "loss/crossentropy": 2.467393732070923, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.24615048468112946, "loss/reg": 0.0, "step": 21040 }, { "epoch": 0.13848684210526316, "grad_norm": 2.171875, "grad_norm_var": 0.035090128580729164, "learning_rate": 0.0001, "loss": 3.1166, "loss/crossentropy": 2.2899758577346803, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.25930332988500593, "loss/reg": 0.0, "step": 21050 }, { "epoch": 0.13855263157894737, "grad_norm": 2.609375, "grad_norm_var": 0.08716812133789062, "learning_rate": 0.0001, "loss": 3.074, "loss/crossentropy": 2.3258708715438843, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.23002547025680542, "loss/reg": 0.0, "step": 21060 }, { "epoch": 0.13861842105263159, "grad_norm": 2.203125, "grad_norm_var": 0.10960667928059896, "learning_rate": 0.0001, "loss": 3.1011, "loss/crossentropy": 2.368458116054535, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.26667492985725405, "loss/reg": 0.0, "step": 21070 }, { "epoch": 0.1386842105263158, "grad_norm": 2.1875, "grad_norm_var": 0.053807576497395836, "learning_rate": 0.0001, "loss": 3.0992, "loss/crossentropy": 2.268073225021362, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.23710739463567734, "loss/reg": 0.0, "step": 21080 }, { "epoch": 0.13875, "grad_norm": 2.15625, "grad_norm_var": 0.7524088541666667, "learning_rate": 0.0001, "loss": 3.123, "loss/crossentropy": 2.116519570350647, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.2593864217400551, "loss/reg": 0.0, "step": 21090 }, { "epoch": 0.1388157894736842, "grad_norm": 2.125, "grad_norm_var": 0.10280329386393229, "learning_rate": 0.0001, "loss": 3.0788, "loss/crossentropy": 2.49674973487854, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.2637839734554291, "loss/reg": 0.0, "step": 21100 }, { "epoch": 0.1388815789473684, "grad_norm": 2.78125, "grad_norm_var": 0.06420059204101562, "learning_rate": 0.0001, "loss": 3.0901, "loss/crossentropy": 2.142958414554596, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.28233732134103773, "loss/reg": 0.0, "step": 21110 }, { "epoch": 0.13894736842105262, "grad_norm": 2.34375, "grad_norm_var": 0.09973551432291666, "learning_rate": 0.0001, "loss": 3.1008, "loss/crossentropy": 2.300196385383606, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.24371459782123567, "loss/reg": 0.0, "step": 21120 }, { "epoch": 0.13901315789473684, "grad_norm": 2.375, "grad_norm_var": 0.07563247680664062, "learning_rate": 0.0001, "loss": 3.0722, "loss/crossentropy": 2.424953353404999, "loss/hidden": 3.08125, "loss/incoh": 0.0, "loss/logits": 0.28959580361843107, "loss/reg": 0.0, "step": 21130 }, { "epoch": 0.13907894736842105, "grad_norm": 2.484375, "grad_norm_var": 0.05977554321289062, "learning_rate": 0.0001, "loss": 3.1188, "loss/crossentropy": 2.3048367381095884, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.26107140332460405, "loss/reg": 0.0, "step": 21140 }, { "epoch": 0.13914473684210527, "grad_norm": 2.40625, "grad_norm_var": 0.17164713541666668, "learning_rate": 0.0001, "loss": 3.1765, "loss/crossentropy": 2.376292657852173, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.24798475652933122, "loss/reg": 0.0, "step": 21150 }, { "epoch": 0.13921052631578948, "grad_norm": 2.34375, "grad_norm_var": 0.0533355712890625, "learning_rate": 0.0001, "loss": 3.0556, "loss/crossentropy": 2.620091509819031, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.2451164111495018, "loss/reg": 0.0, "step": 21160 }, { "epoch": 0.1392763157894737, "grad_norm": 2.328125, "grad_norm_var": 0.07333882649739583, "learning_rate": 0.0001, "loss": 3.1244, "loss/crossentropy": 2.2057873249053954, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.26459684371948244, "loss/reg": 0.0, "step": 21170 }, { "epoch": 0.1393421052631579, "grad_norm": 2.640625, "grad_norm_var": 0.024214680989583334, "learning_rate": 0.0001, "loss": 3.0668, "loss/crossentropy": 2.087160420417786, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.2084551602602005, "loss/reg": 0.0, "step": 21180 }, { "epoch": 0.1394078947368421, "grad_norm": 2.0625, "grad_norm_var": 0.10177408854166667, "learning_rate": 0.0001, "loss": 3.0369, "loss/crossentropy": 2.4722538352012635, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.28627206683158873, "loss/reg": 0.0, "step": 21190 }, { "epoch": 0.1394736842105263, "grad_norm": 2.4375, "grad_norm_var": 3.921613566080729, "learning_rate": 0.0001, "loss": 3.2147, "loss/crossentropy": 2.103289079666138, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2449532501399517, "loss/reg": 0.0, "step": 21200 }, { "epoch": 0.13953947368421052, "grad_norm": 2.71875, "grad_norm_var": 3.8165679931640626, "learning_rate": 0.0001, "loss": 3.123, "loss/crossentropy": 2.377441930770874, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.27957661896944047, "loss/reg": 0.0, "step": 21210 }, { "epoch": 0.13960526315789473, "grad_norm": 3.34375, "grad_norm_var": 0.18837788899739583, "learning_rate": 0.0001, "loss": 3.1171, "loss/crossentropy": 2.218759763240814, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.2158270835876465, "loss/reg": 0.0, "step": 21220 }, { "epoch": 0.13967105263157895, "grad_norm": 2.46875, "grad_norm_var": 14.1974609375, "learning_rate": 0.0001, "loss": 3.1327, "loss/crossentropy": 2.4489439368247985, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.23917225003242493, "loss/reg": 0.0, "step": 21230 }, { "epoch": 0.13973684210526316, "grad_norm": 3.140625, "grad_norm_var": 14.437064361572265, "learning_rate": 0.0001, "loss": 3.0414, "loss/crossentropy": 2.2649194478988646, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.21438237726688386, "loss/reg": 0.0, "step": 21240 }, { "epoch": 0.13980263157894737, "grad_norm": 2.046875, "grad_norm_var": 2.151969401041667, "learning_rate": 0.0001, "loss": 3.1748, "loss/crossentropy": 2.350678837299347, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.2586789205670357, "loss/reg": 0.0, "step": 21250 }, { "epoch": 0.1398684210526316, "grad_norm": 2.203125, "grad_norm_var": 0.07167867024739584, "learning_rate": 0.0001, "loss": 3.1436, "loss/crossentropy": 2.383778703212738, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.30858664512634276, "loss/reg": 0.0, "step": 21260 }, { "epoch": 0.1399342105263158, "grad_norm": 2.234375, "grad_norm_var": 0.04163004557291667, "learning_rate": 0.0001, "loss": 3.0829, "loss/crossentropy": 2.4344274759292603, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.24489458352327348, "loss/reg": 0.0, "step": 21270 }, { "epoch": 0.14, "grad_norm": 2.15625, "grad_norm_var": 0.04241129557291667, "learning_rate": 0.0001, "loss": 3.0751, "loss/crossentropy": 2.59662504196167, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.23801603466272353, "loss/reg": 0.0, "step": 21280 }, { "epoch": 0.1400657894736842, "grad_norm": 2.328125, "grad_norm_var": 1.7194010416666667, "learning_rate": 0.0001, "loss": 3.0867, "loss/crossentropy": 2.466573119163513, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.29062790870666505, "loss/reg": 0.0, "step": 21290 }, { "epoch": 0.1401315789473684, "grad_norm": 2.609375, "grad_norm_var": 0.19259440104166667, "learning_rate": 0.0001, "loss": 3.2042, "loss/crossentropy": 2.5092820644378664, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.3163674846291542, "loss/reg": 0.0, "step": 21300 }, { "epoch": 0.14019736842105263, "grad_norm": 2.046875, "grad_norm_var": 0.09862874348958334, "learning_rate": 0.0001, "loss": 3.0485, "loss/crossentropy": 2.1548155784606933, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.20280121639370918, "loss/reg": 0.0, "step": 21310 }, { "epoch": 0.14026315789473684, "grad_norm": 2.203125, "grad_norm_var": 0.25152587890625, "learning_rate": 0.0001, "loss": 3.0701, "loss/crossentropy": 2.1562862396240234, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.1931696727871895, "loss/reg": 0.0, "step": 21320 }, { "epoch": 0.14032894736842105, "grad_norm": 7.6875, "grad_norm_var": 1.9003896077473958, "learning_rate": 0.0001, "loss": 3.1242, "loss/crossentropy": 2.4868319749832155, "loss/hidden": 3.2, "loss/incoh": 0.0, "loss/logits": 0.263777095079422, "loss/reg": 0.0, "step": 21330 }, { "epoch": 0.14039473684210527, "grad_norm": 2.140625, "grad_norm_var": 2.187555948893229, "learning_rate": 0.0001, "loss": 3.0713, "loss/crossentropy": 2.4403932213783266, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.239518903195858, "loss/reg": 0.0, "step": 21340 }, { "epoch": 0.14046052631578948, "grad_norm": 2.9375, "grad_norm_var": 0.92603759765625, "learning_rate": 0.0001, "loss": 3.161, "loss/crossentropy": 2.637459659576416, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.2770415723323822, "loss/reg": 0.0, "step": 21350 }, { "epoch": 0.1405263157894737, "grad_norm": 2.1875, "grad_norm_var": 0.6560373942057292, "learning_rate": 0.0001, "loss": 3.1023, "loss/crossentropy": 2.4345417737960817, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.23416574895381928, "loss/reg": 0.0, "step": 21360 }, { "epoch": 0.1405921052631579, "grad_norm": 4.625, "grad_norm_var": 0.39641927083333334, "learning_rate": 0.0001, "loss": 3.0819, "loss/crossentropy": 2.3080653309822083, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.203302001953125, "loss/reg": 0.0, "step": 21370 }, { "epoch": 0.1406578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.3663736979166667, "learning_rate": 0.0001, "loss": 3.005, "loss/crossentropy": 2.312281048297882, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.22557048946619035, "loss/reg": 0.0, "step": 21380 }, { "epoch": 0.1407236842105263, "grad_norm": 2.203125, "grad_norm_var": 0.0878326416015625, "learning_rate": 0.0001, "loss": 3.0862, "loss/crossentropy": 2.725596809387207, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.26989305168390276, "loss/reg": 0.0, "step": 21390 }, { "epoch": 0.14078947368421052, "grad_norm": 2.59375, "grad_norm_var": 0.7812662760416667, "learning_rate": 0.0001, "loss": 3.1293, "loss/crossentropy": 2.4127392411231994, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.28430653512477877, "loss/reg": 0.0, "step": 21400 }, { "epoch": 0.14085526315789473, "grad_norm": 2.546875, "grad_norm_var": 0.76865234375, "learning_rate": 0.0001, "loss": 3.1054, "loss/crossentropy": 2.3319249004125595, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.22504624500870704, "loss/reg": 0.0, "step": 21410 }, { "epoch": 0.14092105263157895, "grad_norm": 2.09375, "grad_norm_var": 0.09716389973958334, "learning_rate": 0.0001, "loss": 3.0617, "loss/crossentropy": 2.6645477771759034, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.24796359091997147, "loss/reg": 0.0, "step": 21420 }, { "epoch": 0.14098684210526316, "grad_norm": 2.609375, "grad_norm_var": 4.503599619195098e+17, "learning_rate": 0.0001, "loss": 3.2735, "loss/crossentropy": 2.1648690342903136, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.23503894209861756, "loss/reg": 0.0, "step": 21430 }, { "epoch": 0.14105263157894737, "grad_norm": 2.078125, "grad_norm_var": 0.07038472493489584, "learning_rate": 0.0001, "loss": 3.0659, "loss/crossentropy": 2.477909338474274, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.2327214926481247, "loss/reg": 0.0, "step": 21440 }, { "epoch": 0.1411184210526316, "grad_norm": 2.234375, "grad_norm_var": 0.03515625, "learning_rate": 0.0001, "loss": 3.057, "loss/crossentropy": 2.354469347000122, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.23355985432863235, "loss/reg": 0.0, "step": 21450 }, { "epoch": 0.1411842105263158, "grad_norm": 3.34375, "grad_norm_var": 0.18144124348958332, "learning_rate": 0.0001, "loss": 3.1157, "loss/crossentropy": 2.2400832891464235, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.24985998570919038, "loss/reg": 0.0, "step": 21460 }, { "epoch": 0.14125, "grad_norm": 5.65625, "grad_norm_var": 0.7304921468098958, "learning_rate": 0.0001, "loss": 3.0675, "loss/crossentropy": 2.324231135845184, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.22793666571378707, "loss/reg": 0.0, "step": 21470 }, { "epoch": 0.1413157894736842, "grad_norm": 2.109375, "grad_norm_var": 0.7799641927083333, "learning_rate": 0.0001, "loss": 3.1565, "loss/crossentropy": 2.4094609022140503, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.21761243641376496, "loss/reg": 0.0, "step": 21480 }, { "epoch": 0.1413815789473684, "grad_norm": 2.765625, "grad_norm_var": 0.1587310791015625, "learning_rate": 0.0001, "loss": 3.0847, "loss/crossentropy": 1.9431841492652893, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.24099752232432364, "loss/reg": 0.0, "step": 21490 }, { "epoch": 0.14144736842105263, "grad_norm": 2.1875, "grad_norm_var": 0.0576324462890625, "learning_rate": 0.0001, "loss": 3.0518, "loss/crossentropy": 2.5469526767730715, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.24212357103824617, "loss/reg": 0.0, "step": 21500 }, { "epoch": 0.14151315789473684, "grad_norm": 2.6875, "grad_norm_var": 0.07248942057291667, "learning_rate": 0.0001, "loss": 3.1081, "loss/crossentropy": 2.303622233867645, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.23643973991274833, "loss/reg": 0.0, "step": 21510 }, { "epoch": 0.14157894736842105, "grad_norm": 2.140625, "grad_norm_var": 0.16367085774739584, "learning_rate": 0.0001, "loss": 3.0839, "loss/crossentropy": 2.178890359401703, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.23347833156585693, "loss/reg": 0.0, "step": 21520 }, { "epoch": 0.14164473684210527, "grad_norm": 2.4375, "grad_norm_var": 0.0575836181640625, "learning_rate": 0.0001, "loss": 3.1122, "loss/crossentropy": 2.4304057359695435, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.2439558282494545, "loss/reg": 0.0, "step": 21530 }, { "epoch": 0.14171052631578948, "grad_norm": 2.765625, "grad_norm_var": 0.1728668212890625, "learning_rate": 0.0001, "loss": 3.1416, "loss/crossentropy": 2.111624151468277, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2380165532231331, "loss/reg": 0.0, "step": 21540 }, { "epoch": 0.1417763157894737, "grad_norm": 2.6875, "grad_norm_var": 0.17907613118489582, "learning_rate": 0.0001, "loss": 3.0817, "loss/crossentropy": 2.2606616258621215, "loss/hidden": 3.0546875, "loss/incoh": 0.0, "loss/logits": 0.24319592714309693, "loss/reg": 0.0, "step": 21550 }, { "epoch": 0.1418421052631579, "grad_norm": 2.90625, "grad_norm_var": 0.065234375, "learning_rate": 0.0001, "loss": 3.0592, "loss/crossentropy": 2.2833418786525725, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.24779771864414216, "loss/reg": 0.0, "step": 21560 }, { "epoch": 0.1419078947368421, "grad_norm": 2.1875, "grad_norm_var": 0.19248758951822917, "learning_rate": 0.0001, "loss": 3.1323, "loss/crossentropy": 2.2865166664123535, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.2382546842098236, "loss/reg": 0.0, "step": 21570 }, { "epoch": 0.1419736842105263, "grad_norm": 2.34375, "grad_norm_var": 0.19429931640625, "learning_rate": 0.0001, "loss": 3.0883, "loss/crossentropy": 2.37155544757843, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.28304021507501603, "loss/reg": 0.0, "step": 21580 }, { "epoch": 0.14203947368421052, "grad_norm": 2.171875, "grad_norm_var": 0.06051432291666667, "learning_rate": 0.0001, "loss": 3.1666, "loss/crossentropy": 2.0952234268188477, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.28353812396526334, "loss/reg": 0.0, "step": 21590 }, { "epoch": 0.14210526315789473, "grad_norm": 2.203125, "grad_norm_var": 0.0432769775390625, "learning_rate": 0.0001, "loss": 3.1277, "loss/crossentropy": 2.6342113494873045, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.27442471832036974, "loss/reg": 0.0, "step": 21600 }, { "epoch": 0.14217105263157895, "grad_norm": 2.46875, "grad_norm_var": 0.052018229166666666, "learning_rate": 0.0001, "loss": 3.0889, "loss/crossentropy": 2.3795015037059786, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.24964673370122908, "loss/reg": 0.0, "step": 21610 }, { "epoch": 0.14223684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.15912984212239584, "learning_rate": 0.0001, "loss": 3.1592, "loss/crossentropy": 2.435245943069458, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.36159887462854384, "loss/reg": 0.0, "step": 21620 }, { "epoch": 0.14230263157894738, "grad_norm": 2.203125, "grad_norm_var": 0.19715067545572917, "learning_rate": 0.0001, "loss": 2.9695, "loss/crossentropy": 2.3731093287467955, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.24517869502305983, "loss/reg": 0.0, "step": 21630 }, { "epoch": 0.1423684210526316, "grad_norm": 2.375, "grad_norm_var": 0.029588826497395835, "learning_rate": 0.0001, "loss": 3.09, "loss/crossentropy": 2.1785697817802427, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.22660856544971467, "loss/reg": 0.0, "step": 21640 }, { "epoch": 0.1424342105263158, "grad_norm": 2.4375, "grad_norm_var": 0.0379302978515625, "learning_rate": 0.0001, "loss": 3.1062, "loss/crossentropy": 2.3706693768501284, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.27286539524793624, "loss/reg": 0.0, "step": 21650 }, { "epoch": 0.1425, "grad_norm": 2.40625, "grad_norm_var": 0.05085347493489583, "learning_rate": 0.0001, "loss": 3.1297, "loss/crossentropy": 2.173750901222229, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.2637738898396492, "loss/reg": 0.0, "step": 21660 }, { "epoch": 0.1425657894736842, "grad_norm": 3.046875, "grad_norm_var": 0.10258153279622396, "learning_rate": 0.0001, "loss": 3.0637, "loss/crossentropy": 2.133163595199585, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.19840268045663834, "loss/reg": 0.0, "step": 21670 }, { "epoch": 0.14263157894736841, "grad_norm": 2.390625, "grad_norm_var": 0.062646484375, "learning_rate": 0.0001, "loss": 3.0662, "loss/crossentropy": 2.4523648262023925, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.23104819357395173, "loss/reg": 0.0, "step": 21680 }, { "epoch": 0.14269736842105263, "grad_norm": 2.625, "grad_norm_var": 0.2750935872395833, "learning_rate": 0.0001, "loss": 3.1536, "loss/crossentropy": 2.250650954246521, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.24096653312444688, "loss/reg": 0.0, "step": 21690 }, { "epoch": 0.14276315789473684, "grad_norm": 2.5, "grad_norm_var": 0.26304931640625, "learning_rate": 0.0001, "loss": 3.0649, "loss/crossentropy": 2.30162969827652, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.24212537854909896, "loss/reg": 0.0, "step": 21700 }, { "epoch": 0.14282894736842106, "grad_norm": 2.296875, "grad_norm_var": 0.027521769205729168, "learning_rate": 0.0001, "loss": 3.1097, "loss/crossentropy": 2.1112605214118956, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.26635565906763076, "loss/reg": 0.0, "step": 21710 }, { "epoch": 0.14289473684210527, "grad_norm": 2.859375, "grad_norm_var": 0.0798492431640625, "learning_rate": 0.0001, "loss": 3.0973, "loss/crossentropy": 2.3435486197471618, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.2827204465866089, "loss/reg": 0.0, "step": 21720 }, { "epoch": 0.14296052631578948, "grad_norm": 2.078125, "grad_norm_var": 0.114111328125, "learning_rate": 0.0001, "loss": 3.1039, "loss/crossentropy": 2.4754523396492005, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.29244270324707033, "loss/reg": 0.0, "step": 21730 }, { "epoch": 0.1430263157894737, "grad_norm": 2.0625, "grad_norm_var": 0.18517252604166667, "learning_rate": 0.0001, "loss": 3.1588, "loss/crossentropy": 2.2757002115249634, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.27034522593021393, "loss/reg": 0.0, "step": 21740 }, { "epoch": 0.14309210526315788, "grad_norm": 2.203125, "grad_norm_var": 0.19296773274739584, "learning_rate": 0.0001, "loss": 3.112, "loss/crossentropy": 2.52050644159317, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.2818415179848671, "loss/reg": 0.0, "step": 21750 }, { "epoch": 0.1431578947368421, "grad_norm": 2.265625, "grad_norm_var": 0.1665679931640625, "learning_rate": 0.0001, "loss": 3.2325, "loss/crossentropy": 2.3630916833877564, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.2456425666809082, "loss/reg": 0.0, "step": 21760 }, { "epoch": 0.1432236842105263, "grad_norm": 2.984375, "grad_norm_var": 0.21158854166666666, "learning_rate": 0.0001, "loss": 3.0907, "loss/crossentropy": 2.1248918056488035, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.22751243263483048, "loss/reg": 0.0, "step": 21770 }, { "epoch": 0.14328947368421052, "grad_norm": 2.25, "grad_norm_var": 0.06939697265625, "learning_rate": 0.0001, "loss": 3.0952, "loss/crossentropy": 2.273802196979523, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.2570581123232841, "loss/reg": 0.0, "step": 21780 }, { "epoch": 0.14335526315789474, "grad_norm": 2.4375, "grad_norm_var": 0.025763956705729167, "learning_rate": 0.0001, "loss": 3.0584, "loss/crossentropy": 2.096750485897064, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.21872427612543105, "loss/reg": 0.0, "step": 21790 }, { "epoch": 0.14342105263157895, "grad_norm": 2.25, "grad_norm_var": 0.023412068684895832, "learning_rate": 0.0001, "loss": 3.0604, "loss/crossentropy": 2.228634536266327, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2626715019345284, "loss/reg": 0.0, "step": 21800 }, { "epoch": 0.14348684210526316, "grad_norm": 2.546875, "grad_norm_var": 0.28734512329101564, "learning_rate": 0.0001, "loss": 3.1324, "loss/crossentropy": 2.475996434688568, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.23401851058006287, "loss/reg": 0.0, "step": 21810 }, { "epoch": 0.14355263157894738, "grad_norm": 2.03125, "grad_norm_var": 0.39009984334309894, "learning_rate": 0.0001, "loss": 3.1624, "loss/crossentropy": 2.285959267616272, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.24858510196208955, "loss/reg": 0.0, "step": 21820 }, { "epoch": 0.1436184210526316, "grad_norm": 2.5625, "grad_norm_var": 0.18899739583333333, "learning_rate": 0.0001, "loss": 3.0702, "loss/crossentropy": 2.463927984237671, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2442393772304058, "loss/reg": 0.0, "step": 21830 }, { "epoch": 0.1436842105263158, "grad_norm": 2.4375, "grad_norm_var": 0.14226786295572916, "learning_rate": 0.0001, "loss": 3.0965, "loss/crossentropy": 2.277498161792755, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.2153932645916939, "loss/reg": 0.0, "step": 21840 }, { "epoch": 0.14375, "grad_norm": 2.4375, "grad_norm_var": 1.7546953837076822, "learning_rate": 0.0001, "loss": 3.108, "loss/crossentropy": 2.3049885034561157, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.26804344207048414, "loss/reg": 0.0, "step": 21850 }, { "epoch": 0.1438157894736842, "grad_norm": 2.90625, "grad_norm_var": 4.087398020426432, "learning_rate": 0.0001, "loss": 3.1586, "loss/crossentropy": 2.239436650276184, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.27900682389736176, "loss/reg": 0.0, "step": 21860 }, { "epoch": 0.14388157894736842, "grad_norm": 2.28125, "grad_norm_var": 0.09602762858072916, "learning_rate": 0.0001, "loss": 3.1657, "loss/crossentropy": 2.385130798816681, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.28002837896347044, "loss/reg": 0.0, "step": 21870 }, { "epoch": 0.14394736842105263, "grad_norm": 2.71875, "grad_norm_var": 0.12711588541666666, "learning_rate": 0.0001, "loss": 3.1437, "loss/crossentropy": 2.426273798942566, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.3090307667851448, "loss/reg": 0.0, "step": 21880 }, { "epoch": 0.14401315789473684, "grad_norm": 2.484375, "grad_norm_var": 0.09078776041666667, "learning_rate": 0.0001, "loss": 3.1105, "loss/crossentropy": 2.148479038476944, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.22178827822208405, "loss/reg": 0.0, "step": 21890 }, { "epoch": 0.14407894736842106, "grad_norm": 2.5, "grad_norm_var": 0.0625, "learning_rate": 0.0001, "loss": 3.1009, "loss/crossentropy": 2.2989478588104246, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.21844121366739272, "loss/reg": 0.0, "step": 21900 }, { "epoch": 0.14414473684210527, "grad_norm": 2.265625, "grad_norm_var": 0.10066630045572916, "learning_rate": 0.0001, "loss": 3.1368, "loss/crossentropy": 2.459014880657196, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2873328909277916, "loss/reg": 0.0, "step": 21910 }, { "epoch": 0.14421052631578948, "grad_norm": 2.296875, "grad_norm_var": 0.15559794108072916, "learning_rate": 0.0001, "loss": 3.048, "loss/crossentropy": 2.0745410680770875, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.24865385442972182, "loss/reg": 0.0, "step": 21920 }, { "epoch": 0.1442763157894737, "grad_norm": 2.75, "grad_norm_var": 0.1090240478515625, "learning_rate": 0.0001, "loss": 3.2472, "loss/crossentropy": 2.216059982776642, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.25595484375953675, "loss/reg": 0.0, "step": 21930 }, { "epoch": 0.14434210526315788, "grad_norm": 2.59375, "grad_norm_var": 0.06485773722330729, "learning_rate": 0.0001, "loss": 3.0388, "loss/crossentropy": 2.3124427676200865, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.2212652564048767, "loss/reg": 0.0, "step": 21940 }, { "epoch": 0.1444078947368421, "grad_norm": 2.953125, "grad_norm_var": 0.4722246805826823, "learning_rate": 0.0001, "loss": 3.2024, "loss/crossentropy": 2.1076952695846556, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.2454703077673912, "loss/reg": 0.0, "step": 21950 }, { "epoch": 0.1444736842105263, "grad_norm": 2.59375, "grad_norm_var": 0.6233306884765625, "learning_rate": 0.0001, "loss": 3.1576, "loss/crossentropy": 2.3738736391067503, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.22644958645105362, "loss/reg": 0.0, "step": 21960 }, { "epoch": 0.14453947368421052, "grad_norm": 2.4375, "grad_norm_var": 0.48958231608072916, "learning_rate": 0.0001, "loss": 3.1195, "loss/crossentropy": 2.3681930541992187, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.23997818529605866, "loss/reg": 0.0, "step": 21970 }, { "epoch": 0.14460526315789474, "grad_norm": 2.671875, "grad_norm_var": 0.06328125, "learning_rate": 0.0001, "loss": 3.086, "loss/crossentropy": 2.202842915058136, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.22619501650333404, "loss/reg": 0.0, "step": 21980 }, { "epoch": 0.14467105263157895, "grad_norm": 2.25, "grad_norm_var": 0.08901265462239584, "learning_rate": 0.0001, "loss": 3.0968, "loss/crossentropy": 2.498619997501373, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.2592793509364128, "loss/reg": 0.0, "step": 21990 }, { "epoch": 0.14473684210526316, "grad_norm": 2.75, "grad_norm_var": 0.07700907389322917, "learning_rate": 0.0001, "loss": 3.0646, "loss/crossentropy": 2.5274386525154116, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.2393401548266411, "loss/reg": 0.0, "step": 22000 }, { "epoch": 0.14480263157894738, "grad_norm": 2.609375, "grad_norm_var": 0.03200581868489583, "learning_rate": 0.0001, "loss": 3.1375, "loss/crossentropy": 2.331031596660614, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.22175868153572081, "loss/reg": 0.0, "step": 22010 }, { "epoch": 0.1448684210526316, "grad_norm": 2.53125, "grad_norm_var": 0.031473541259765626, "learning_rate": 0.0001, "loss": 3.1187, "loss/crossentropy": 2.1961219191551207, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.23576426208019258, "loss/reg": 0.0, "step": 22020 }, { "epoch": 0.14493421052631578, "grad_norm": 2.15625, "grad_norm_var": 0.04307835896809896, "learning_rate": 0.0001, "loss": 3.0722, "loss/crossentropy": 2.4104359984397887, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.25036042332649233, "loss/reg": 0.0, "step": 22030 }, { "epoch": 0.145, "grad_norm": 2.78125, "grad_norm_var": 0.0566558837890625, "learning_rate": 0.0001, "loss": 3.0535, "loss/crossentropy": 2.293129473924637, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.3266716688871384, "loss/reg": 0.0, "step": 22040 }, { "epoch": 0.1450657894736842, "grad_norm": 2.21875, "grad_norm_var": 0.1048828125, "learning_rate": 0.0001, "loss": 3.151, "loss/crossentropy": 2.1948832154273985, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2283545032143593, "loss/reg": 0.0, "step": 22050 }, { "epoch": 0.14513157894736842, "grad_norm": 2.15625, "grad_norm_var": 0.85728759765625, "learning_rate": 0.0001, "loss": 3.1379, "loss/crossentropy": 2.4909852266311647, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.26901226192712785, "loss/reg": 0.0, "step": 22060 }, { "epoch": 0.14519736842105263, "grad_norm": 2.390625, "grad_norm_var": 0.8632476806640625, "learning_rate": 0.0001, "loss": 3.0345, "loss/crossentropy": 2.4894071340560915, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.2518943950533867, "loss/reg": 0.0, "step": 22070 }, { "epoch": 0.14526315789473684, "grad_norm": 2.5625, "grad_norm_var": 0.035553995768229166, "learning_rate": 0.0001, "loss": 3.0777, "loss/crossentropy": 2.250881004333496, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.254881876707077, "loss/reg": 0.0, "step": 22080 }, { "epoch": 0.14532894736842106, "grad_norm": 2.140625, "grad_norm_var": 0.6524251302083334, "learning_rate": 0.0001, "loss": 3.0046, "loss/crossentropy": 2.4635193943977356, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.28406478762626647, "loss/reg": 0.0, "step": 22090 }, { "epoch": 0.14539473684210527, "grad_norm": 2.203125, "grad_norm_var": 0.2197906494140625, "learning_rate": 0.0001, "loss": 3.0607, "loss/crossentropy": 2.176581871509552, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2112376168370247, "loss/reg": 0.0, "step": 22100 }, { "epoch": 0.14546052631578948, "grad_norm": 2.421875, "grad_norm_var": 0.0716461181640625, "learning_rate": 0.0001, "loss": 3.09, "loss/crossentropy": 2.418153405189514, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.24471160471439363, "loss/reg": 0.0, "step": 22110 }, { "epoch": 0.1455263157894737, "grad_norm": 2.203125, "grad_norm_var": 0.024909464518229167, "learning_rate": 0.0001, "loss": 3.0661, "loss/crossentropy": 2.34368360042572, "loss/hidden": 2.9484375, "loss/incoh": 0.0, "loss/logits": 0.26933753192424775, "loss/reg": 0.0, "step": 22120 }, { "epoch": 0.14559210526315788, "grad_norm": 2.28125, "grad_norm_var": 0.0403228759765625, "learning_rate": 0.0001, "loss": 3.0961, "loss/crossentropy": 2.2858925461769104, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.23935395181179048, "loss/reg": 0.0, "step": 22130 }, { "epoch": 0.1456578947368421, "grad_norm": 2.140625, "grad_norm_var": 0.06553446451822917, "learning_rate": 0.0001, "loss": 3.1287, "loss/crossentropy": 2.476478910446167, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.2711992233991623, "loss/reg": 0.0, "step": 22140 }, { "epoch": 0.1457236842105263, "grad_norm": 2.90625, "grad_norm_var": 0.14517822265625, "learning_rate": 0.0001, "loss": 3.0753, "loss/crossentropy": 2.095052421092987, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.2310307502746582, "loss/reg": 0.0, "step": 22150 }, { "epoch": 0.14578947368421052, "grad_norm": 2.234375, "grad_norm_var": 0.13459243774414062, "learning_rate": 0.0001, "loss": 3.0609, "loss/crossentropy": 2.252713418006897, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.2497629001736641, "loss/reg": 0.0, "step": 22160 }, { "epoch": 0.14585526315789474, "grad_norm": 2.5625, "grad_norm_var": 0.06102701822916667, "learning_rate": 0.0001, "loss": 3.092, "loss/crossentropy": 2.2792391180992126, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.2850720778107643, "loss/reg": 0.0, "step": 22170 }, { "epoch": 0.14592105263157895, "grad_norm": 2.828125, "grad_norm_var": 0.2100738525390625, "learning_rate": 0.0001, "loss": 3.1172, "loss/crossentropy": 2.116334557533264, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.22928555756807328, "loss/reg": 0.0, "step": 22180 }, { "epoch": 0.14598684210526316, "grad_norm": 2.46875, "grad_norm_var": 0.12631734212239584, "learning_rate": 0.0001, "loss": 3.103, "loss/crossentropy": 2.2788902163505553, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.24738141447305678, "loss/reg": 0.0, "step": 22190 }, { "epoch": 0.14605263157894738, "grad_norm": 2.0625, "grad_norm_var": 0.05478108723958333, "learning_rate": 0.0001, "loss": 2.9932, "loss/crossentropy": 2.3781439542770384, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.2298112317919731, "loss/reg": 0.0, "step": 22200 }, { "epoch": 0.1461184210526316, "grad_norm": 2.328125, "grad_norm_var": 1.2361399332682292, "learning_rate": 0.0001, "loss": 3.1279, "loss/crossentropy": 2.367407274246216, "loss/hidden": 3.0234375, "loss/incoh": 0.0, "loss/logits": 0.25443960130214693, "loss/reg": 0.0, "step": 22210 }, { "epoch": 0.14618421052631578, "grad_norm": 2.484375, "grad_norm_var": 0.03775634765625, "learning_rate": 0.0001, "loss": 3.1253, "loss/crossentropy": 2.4262067079544067, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.23887630701065063, "loss/reg": 0.0, "step": 22220 }, { "epoch": 0.14625, "grad_norm": 2.359375, "grad_norm_var": 0.19719136555989583, "learning_rate": 0.0001, "loss": 3.1058, "loss/crossentropy": 2.547460687160492, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.25797063410282134, "loss/reg": 0.0, "step": 22230 }, { "epoch": 0.1463157894736842, "grad_norm": 2.421875, "grad_norm_var": 0.2289446512858073, "learning_rate": 0.0001, "loss": 3.1238, "loss/crossentropy": 2.1126400232315063, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.23666388988494874, "loss/reg": 0.0, "step": 22240 }, { "epoch": 0.14638157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.20608495076497396, "learning_rate": 0.0001, "loss": 3.0914, "loss/crossentropy": 2.5851184129714966, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.2918281376361847, "loss/reg": 0.0, "step": 22250 }, { "epoch": 0.14644736842105263, "grad_norm": 2.515625, "grad_norm_var": 0.5136329650878906, "learning_rate": 0.0001, "loss": 3.0971, "loss/crossentropy": 2.319287371635437, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.283975313603878, "loss/reg": 0.0, "step": 22260 }, { "epoch": 0.14651315789473685, "grad_norm": 2.625, "grad_norm_var": 9.41323216756185, "learning_rate": 0.0001, "loss": 3.2274, "loss/crossentropy": 2.350848126411438, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.3113620936870575, "loss/reg": 0.0, "step": 22270 }, { "epoch": 0.14657894736842106, "grad_norm": 3.71875, "grad_norm_var": 0.20803120930989583, "learning_rate": 0.0001, "loss": 3.1285, "loss/crossentropy": 2.175449311733246, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.235707426071167, "loss/reg": 0.0, "step": 22280 }, { "epoch": 0.14664473684210527, "grad_norm": 2.3125, "grad_norm_var": 0.37470601399739584, "learning_rate": 0.0001, "loss": 3.1206, "loss/crossentropy": 2.6359380006790163, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2809421643614769, "loss/reg": 0.0, "step": 22290 }, { "epoch": 0.14671052631578949, "grad_norm": 2.015625, "grad_norm_var": 0.33459879557291666, "learning_rate": 0.0001, "loss": 3.1002, "loss/crossentropy": 2.2076025128364565, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.22375066131353377, "loss/reg": 0.0, "step": 22300 }, { "epoch": 0.14677631578947367, "grad_norm": 4.34375, "grad_norm_var": 0.37032877604166664, "learning_rate": 0.0001, "loss": 3.117, "loss/crossentropy": 2.0601858735084533, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.3449657797813416, "loss/reg": 0.0, "step": 22310 }, { "epoch": 0.14684210526315788, "grad_norm": 2.40625, "grad_norm_var": 0.7130360921223958, "learning_rate": 0.0001, "loss": 3.0669, "loss/crossentropy": 2.2765709161758423, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.2070981428027153, "loss/reg": 0.0, "step": 22320 }, { "epoch": 0.1469078947368421, "grad_norm": 2.53125, "grad_norm_var": 0.5365193684895834, "learning_rate": 0.0001, "loss": 3.0347, "loss/crossentropy": 2.1606739163398743, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2066648006439209, "loss/reg": 0.0, "step": 22330 }, { "epoch": 0.1469736842105263, "grad_norm": 11.25, "grad_norm_var": 5.021190388997396, "learning_rate": 0.0001, "loss": 3.1108, "loss/crossentropy": 2.4670102834701537, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.20643151551485062, "loss/reg": 0.0, "step": 22340 }, { "epoch": 0.14703947368421053, "grad_norm": 2.609375, "grad_norm_var": 4.964111328125, "learning_rate": 0.0001, "loss": 3.0792, "loss/crossentropy": 2.1708640813827516, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.2493487134575844, "loss/reg": 0.0, "step": 22350 }, { "epoch": 0.14710526315789474, "grad_norm": 2.484375, "grad_norm_var": 0.2525299072265625, "learning_rate": 0.0001, "loss": 3.1371, "loss/crossentropy": 2.4388205766677857, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.25962347984313966, "loss/reg": 0.0, "step": 22360 }, { "epoch": 0.14717105263157895, "grad_norm": 2.0625, "grad_norm_var": 0.1518707275390625, "learning_rate": 0.0001, "loss": 3.0391, "loss/crossentropy": 2.295293319225311, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.2302995279431343, "loss/reg": 0.0, "step": 22370 }, { "epoch": 0.14723684210526317, "grad_norm": 3.609375, "grad_norm_var": 0.20093994140625, "learning_rate": 0.0001, "loss": 3.1142, "loss/crossentropy": 2.081736671924591, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.21390146017074585, "loss/reg": 0.0, "step": 22380 }, { "epoch": 0.14730263157894738, "grad_norm": 2.390625, "grad_norm_var": 0.1891021728515625, "learning_rate": 0.0001, "loss": 3.0243, "loss/crossentropy": 2.1469112396240235, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.24780639857053757, "loss/reg": 0.0, "step": 22390 }, { "epoch": 0.14736842105263157, "grad_norm": 3.5, "grad_norm_var": 0.41617431640625, "learning_rate": 0.0001, "loss": 3.1187, "loss/crossentropy": 2.219987118244171, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.2700192674994469, "loss/reg": 0.0, "step": 22400 }, { "epoch": 0.14743421052631578, "grad_norm": 2.46875, "grad_norm_var": 0.10216471354166666, "learning_rate": 0.0001, "loss": 3.1659, "loss/crossentropy": 2.387179672718048, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.25483787804841995, "loss/reg": 0.0, "step": 22410 }, { "epoch": 0.1475, "grad_norm": 3.21875, "grad_norm_var": 0.10955403645833334, "learning_rate": 0.0001, "loss": 3.1812, "loss/crossentropy": 2.4982651591300966, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.23052496314048768, "loss/reg": 0.0, "step": 22420 }, { "epoch": 0.1475657894736842, "grad_norm": 2.6875, "grad_norm_var": 0.0999664306640625, "learning_rate": 0.0001, "loss": 3.0656, "loss/crossentropy": 2.2789443135261536, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.23072773963212967, "loss/reg": 0.0, "step": 22430 }, { "epoch": 0.14763157894736842, "grad_norm": 2.53125, "grad_norm_var": 0.074658203125, "learning_rate": 0.0001, "loss": 3.0366, "loss/crossentropy": 2.384058713912964, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.2288333684206009, "loss/reg": 0.0, "step": 22440 }, { "epoch": 0.14769736842105263, "grad_norm": 2.234375, "grad_norm_var": 0.05872294108072917, "learning_rate": 0.0001, "loss": 3.1249, "loss/crossentropy": 2.1794327974319456, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.25599117279052735, "loss/reg": 0.0, "step": 22450 }, { "epoch": 0.14776315789473685, "grad_norm": 2.578125, "grad_norm_var": 0.20020243326822917, "learning_rate": 0.0001, "loss": 3.2428, "loss/crossentropy": 2.3936930537223815, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.33337023556232454, "loss/reg": 0.0, "step": 22460 }, { "epoch": 0.14782894736842106, "grad_norm": 2.515625, "grad_norm_var": 0.2228424072265625, "learning_rate": 0.0001, "loss": 3.0933, "loss/crossentropy": 2.290210509300232, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.22466516196727754, "loss/reg": 0.0, "step": 22470 }, { "epoch": 0.14789473684210527, "grad_norm": 2.375, "grad_norm_var": 0.07656962076822917, "learning_rate": 0.0001, "loss": 3.1112, "loss/crossentropy": 2.1793927550315857, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2438851624727249, "loss/reg": 0.0, "step": 22480 }, { "epoch": 0.1479605263157895, "grad_norm": 2.171875, "grad_norm_var": 0.15196024576822917, "learning_rate": 0.0001, "loss": 3.105, "loss/crossentropy": 2.3326605677604677, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.22931813299655915, "loss/reg": 0.0, "step": 22490 }, { "epoch": 0.14802631578947367, "grad_norm": 2.1875, "grad_norm_var": 0.15309244791666668, "learning_rate": 0.0001, "loss": 3.0875, "loss/crossentropy": 2.222768235206604, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.2537077710032463, "loss/reg": 0.0, "step": 22500 }, { "epoch": 0.14809210526315789, "grad_norm": 2.28125, "grad_norm_var": 0.0938385009765625, "learning_rate": 0.0001, "loss": 3.1419, "loss/crossentropy": 2.308623898029327, "loss/hidden": 3.0859375, "loss/incoh": 0.0, "loss/logits": 0.27548344135284425, "loss/reg": 0.0, "step": 22510 }, { "epoch": 0.1481578947368421, "grad_norm": 2.15625, "grad_norm_var": 0.07166239420572916, "learning_rate": 0.0001, "loss": 3.068, "loss/crossentropy": 2.261019694805145, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.23012840151786804, "loss/reg": 0.0, "step": 22520 }, { "epoch": 0.1482236842105263, "grad_norm": 2.390625, "grad_norm_var": 0.13349202473958333, "learning_rate": 0.0001, "loss": 3.1074, "loss/crossentropy": 2.265119397640228, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.2193696081638336, "loss/reg": 0.0, "step": 22530 }, { "epoch": 0.14828947368421053, "grad_norm": 2.578125, "grad_norm_var": 2.090127618074695e+17, "learning_rate": 0.0001, "loss": 3.3232, "loss/crossentropy": 2.414242672920227, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.24838838130235671, "loss/reg": 0.0, "step": 22540 }, { "epoch": 0.14835526315789474, "grad_norm": 2.25, "grad_norm_var": 0.0516265869140625, "learning_rate": 0.0001, "loss": 3.147, "loss/crossentropy": 2.4570279359817504, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.27158954441547395, "loss/reg": 0.0, "step": 22550 }, { "epoch": 0.14842105263157895, "grad_norm": 2.296875, "grad_norm_var": 0.0767229715983073, "learning_rate": 0.0001, "loss": 3.0495, "loss/crossentropy": 2.1562333941459655, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.252650773525238, "loss/reg": 0.0, "step": 22560 }, { "epoch": 0.14848684210526317, "grad_norm": 2.234375, "grad_norm_var": 0.2514625549316406, "learning_rate": 0.0001, "loss": 3.1717, "loss/crossentropy": 2.0726990699768066, "loss/hidden": 3.05, "loss/incoh": 0.0, "loss/logits": 0.2678316295146942, "loss/reg": 0.0, "step": 22570 }, { "epoch": 0.14855263157894738, "grad_norm": 2.125, "grad_norm_var": 0.21327718098958334, "learning_rate": 0.0001, "loss": 3.1387, "loss/crossentropy": 2.32381352186203, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.2564924195408821, "loss/reg": 0.0, "step": 22580 }, { "epoch": 0.14861842105263157, "grad_norm": 2.140625, "grad_norm_var": 0.08567301432291667, "learning_rate": 0.0001, "loss": 3.1625, "loss/crossentropy": 2.15835280418396, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.3166464313864708, "loss/reg": 0.0, "step": 22590 }, { "epoch": 0.14868421052631578, "grad_norm": 2.5, "grad_norm_var": 0.13315327962239584, "learning_rate": 0.0001, "loss": 3.0573, "loss/crossentropy": 2.5024718761444094, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2705509811639786, "loss/reg": 0.0, "step": 22600 }, { "epoch": 0.14875, "grad_norm": 2.203125, "grad_norm_var": 0.1475176493326823, "learning_rate": 0.0001, "loss": 3.0119, "loss/crossentropy": 2.2811665177345275, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.23891925811767578, "loss/reg": 0.0, "step": 22610 }, { "epoch": 0.1488157894736842, "grad_norm": 2.40625, "grad_norm_var": 31.435579172770183, "learning_rate": 0.0001, "loss": 3.0136, "loss/crossentropy": 2.1407122373580934, "loss/hidden": 2.9484375, "loss/incoh": 0.0, "loss/logits": 0.26466352492570877, "loss/reg": 0.0, "step": 22620 }, { "epoch": 0.14888157894736842, "grad_norm": 2.171875, "grad_norm_var": 0.10041402180989584, "learning_rate": 0.0001, "loss": 3.0561, "loss/crossentropy": 2.3511832118034364, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.26619747579097747, "loss/reg": 0.0, "step": 22630 }, { "epoch": 0.14894736842105263, "grad_norm": 2.609375, "grad_norm_var": 0.16308186848958334, "learning_rate": 0.0001, "loss": 3.1069, "loss/crossentropy": 2.1790146827697754, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2223970666527748, "loss/reg": 0.0, "step": 22640 }, { "epoch": 0.14901315789473685, "grad_norm": 2.265625, "grad_norm_var": 0.48355204264322915, "learning_rate": 0.0001, "loss": 3.1746, "loss/crossentropy": 2.2969871520996095, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.28595788925886156, "loss/reg": 0.0, "step": 22650 }, { "epoch": 0.14907894736842106, "grad_norm": 2.828125, "grad_norm_var": 0.38804931640625, "learning_rate": 0.0001, "loss": 3.1378, "loss/crossentropy": 2.484911561012268, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.28142704218626025, "loss/reg": 0.0, "step": 22660 }, { "epoch": 0.14914473684210527, "grad_norm": 3.359375, "grad_norm_var": 0.11812744140625, "learning_rate": 0.0001, "loss": 3.2449, "loss/crossentropy": 2.3331188917160035, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2567854106426239, "loss/reg": 0.0, "step": 22670 }, { "epoch": 0.14921052631578946, "grad_norm": 2.0, "grad_norm_var": 0.11311848958333333, "learning_rate": 0.0001, "loss": 3.0464, "loss/crossentropy": 2.5099231958389283, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.22425275444984435, "loss/reg": 0.0, "step": 22680 }, { "epoch": 0.14927631578947367, "grad_norm": 2.265625, "grad_norm_var": 0.033707682291666666, "learning_rate": 0.0001, "loss": 3.0841, "loss/crossentropy": 2.5707973539829254, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.27995080798864364, "loss/reg": 0.0, "step": 22690 }, { "epoch": 0.1493421052631579, "grad_norm": 2.375, "grad_norm_var": 0.03998921712239583, "learning_rate": 0.0001, "loss": 3.0877, "loss/crossentropy": 2.2857648015022276, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.2597381517291069, "loss/reg": 0.0, "step": 22700 }, { "epoch": 0.1494078947368421, "grad_norm": 2.421875, "grad_norm_var": 0.05201416015625, "learning_rate": 0.0001, "loss": 3.0551, "loss/crossentropy": 2.4230233311653135, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2558469220995903, "loss/reg": 0.0, "step": 22710 }, { "epoch": 0.14947368421052631, "grad_norm": 2.28125, "grad_norm_var": 0.08981831868489583, "learning_rate": 0.0001, "loss": 3.0629, "loss/crossentropy": 2.3983253479003905, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.25465885996818544, "loss/reg": 0.0, "step": 22720 }, { "epoch": 0.14953947368421053, "grad_norm": 2.34375, "grad_norm_var": 0.14431966145833333, "learning_rate": 0.0001, "loss": 3.1884, "loss/crossentropy": 2.227739405632019, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.27533823624253273, "loss/reg": 0.0, "step": 22730 }, { "epoch": 0.14960526315789474, "grad_norm": 2.5, "grad_norm_var": 0.09478759765625, "learning_rate": 0.0001, "loss": 3.044, "loss/crossentropy": 2.2618382215499877, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.22343547642230988, "loss/reg": 0.0, "step": 22740 }, { "epoch": 0.14967105263157895, "grad_norm": 2.390625, "grad_norm_var": 0.18606363932291667, "learning_rate": 0.0001, "loss": 3.145, "loss/crossentropy": 2.4264959335327148, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.26770799309015275, "loss/reg": 0.0, "step": 22750 }, { "epoch": 0.14973684210526317, "grad_norm": 2.5, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 3.2178, "loss/crossentropy": 2.603011679649353, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.28421192467212675, "loss/reg": 0.0, "step": 22760 }, { "epoch": 0.14980263157894738, "grad_norm": 2.296875, "grad_norm_var": 0.04109598795572917, "learning_rate": 0.0001, "loss": 3.0566, "loss/crossentropy": 2.239471447467804, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.2365947112441063, "loss/reg": 0.0, "step": 22770 }, { "epoch": 0.14986842105263157, "grad_norm": 2.203125, "grad_norm_var": 0.11721903483072917, "learning_rate": 0.0001, "loss": 3.0489, "loss/crossentropy": 2.264164900779724, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2334253668785095, "loss/reg": 0.0, "step": 22780 }, { "epoch": 0.14993421052631578, "grad_norm": 2.65625, "grad_norm_var": 0.1690093994140625, "learning_rate": 0.0001, "loss": 3.1266, "loss/crossentropy": 2.3352009534835814, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.23781208097934722, "loss/reg": 0.0, "step": 22790 }, { "epoch": 0.15, "grad_norm": 3.25, "grad_norm_var": 0.25569559733072916, "learning_rate": 0.0001, "loss": 3.1863, "loss/crossentropy": 2.260995364189148, "loss/hidden": 2.9921875, "loss/incoh": 0.0, "loss/logits": 0.28239033967256544, "loss/reg": 0.0, "step": 22800 }, { "epoch": 0.1500657894736842, "grad_norm": 2.203125, "grad_norm_var": 0.5896321614583333, "learning_rate": 0.0001, "loss": 3.1214, "loss/crossentropy": 2.487893545627594, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2422095239162445, "loss/reg": 0.0, "step": 22810 }, { "epoch": 0.15013157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.44045817057291664, "learning_rate": 0.0001, "loss": 3.0838, "loss/crossentropy": 2.484136939048767, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.22923200577497482, "loss/reg": 0.0, "step": 22820 }, { "epoch": 0.15019736842105263, "grad_norm": 2.890625, "grad_norm_var": 3.111881782327837e+17, "learning_rate": 0.0001, "loss": 3.2029, "loss/crossentropy": 2.462354898452759, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.23231214433908462, "loss/reg": 0.0, "step": 22830 }, { "epoch": 0.15026315789473685, "grad_norm": 2.25, "grad_norm_var": 3.1118817821970925e+17, "learning_rate": 0.0001, "loss": 3.1305, "loss/crossentropy": 2.1894614577293394, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.24518598318099977, "loss/reg": 0.0, "step": 22840 }, { "epoch": 0.15032894736842106, "grad_norm": 2.328125, "grad_norm_var": 0.23975321451822917, "learning_rate": 0.0001, "loss": 3.0371, "loss/crossentropy": 2.427228879928589, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.250908388197422, "loss/reg": 0.0, "step": 22850 }, { "epoch": 0.15039473684210528, "grad_norm": 7.125, "grad_norm_var": 1.4729563395182292, "learning_rate": 0.0001, "loss": 3.1606, "loss/crossentropy": 2.163099730014801, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.2531519740819931, "loss/reg": 0.0, "step": 22860 }, { "epoch": 0.15046052631578946, "grad_norm": 2.484375, "grad_norm_var": 1.7556060791015624, "learning_rate": 0.0001, "loss": 3.127, "loss/crossentropy": 2.4134068608284, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.2419949784874916, "loss/reg": 0.0, "step": 22870 }, { "epoch": 0.15052631578947367, "grad_norm": 2.234375, "grad_norm_var": 0.6191080729166667, "learning_rate": 0.0001, "loss": 3.0458, "loss/crossentropy": 2.2460831999778748, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.25571899116039276, "loss/reg": 0.0, "step": 22880 }, { "epoch": 0.1505921052631579, "grad_norm": 2.421875, "grad_norm_var": 0.1325592041015625, "learning_rate": 0.0001, "loss": 3.1666, "loss/crossentropy": 2.32741322517395, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.23611859530210494, "loss/reg": 0.0, "step": 22890 }, { "epoch": 0.1506578947368421, "grad_norm": 4.03125, "grad_norm_var": 0.24289957682291666, "learning_rate": 0.0001, "loss": 3.1655, "loss/crossentropy": 2.436409044265747, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.28332450836896894, "loss/reg": 0.0, "step": 22900 }, { "epoch": 0.15072368421052632, "grad_norm": 2.265625, "grad_norm_var": 0.23401590983072917, "learning_rate": 0.0001, "loss": 2.9932, "loss/crossentropy": 2.2307902753353117, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.23847298920154572, "loss/reg": 0.0, "step": 22910 }, { "epoch": 0.15078947368421053, "grad_norm": 2.5625, "grad_norm_var": 0.12184244791666667, "learning_rate": 0.0001, "loss": 3.1045, "loss/crossentropy": 2.2170926928520203, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.23038018941879274, "loss/reg": 0.0, "step": 22920 }, { "epoch": 0.15085526315789474, "grad_norm": 2.8125, "grad_norm_var": 0.09688695271809895, "learning_rate": 0.0001, "loss": 3.0845, "loss/crossentropy": 2.1675124526023866, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.20779158547520638, "loss/reg": 0.0, "step": 22930 }, { "epoch": 0.15092105263157896, "grad_norm": 2.4375, "grad_norm_var": 0.7031776428222656, "learning_rate": 0.0001, "loss": 3.131, "loss/crossentropy": 2.483162760734558, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2888242840766907, "loss/reg": 0.0, "step": 22940 }, { "epoch": 0.15098684210526317, "grad_norm": 2.21875, "grad_norm_var": 0.6901682535807292, "learning_rate": 0.0001, "loss": 3.1206, "loss/crossentropy": 1.9849469184875488, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.21038227528333664, "loss/reg": 0.0, "step": 22950 }, { "epoch": 0.15105263157894736, "grad_norm": 2.484375, "grad_norm_var": 0.08320210774739584, "learning_rate": 0.0001, "loss": 3.1315, "loss/crossentropy": 2.424567532539368, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.2791455164551735, "loss/reg": 0.0, "step": 22960 }, { "epoch": 0.15111842105263157, "grad_norm": 2.0, "grad_norm_var": 0.0634674072265625, "learning_rate": 0.0001, "loss": 3.054, "loss/crossentropy": 2.531536269187927, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.24857166707515715, "loss/reg": 0.0, "step": 22970 }, { "epoch": 0.15118421052631578, "grad_norm": 2.28125, "grad_norm_var": 0.06180013020833333, "learning_rate": 0.0001, "loss": 3.0542, "loss/crossentropy": 2.4494295597076414, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.24070152640342712, "loss/reg": 0.0, "step": 22980 }, { "epoch": 0.15125, "grad_norm": 2.078125, "grad_norm_var": 0.07551167805989584, "learning_rate": 0.0001, "loss": 3.0773, "loss/crossentropy": 2.418122172355652, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.24059886187314988, "loss/reg": 0.0, "step": 22990 }, { "epoch": 0.1513157894736842, "grad_norm": 2.328125, "grad_norm_var": 0.1075592041015625, "learning_rate": 0.0001, "loss": 3.1222, "loss/crossentropy": 2.184150278568268, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.26524696350097654, "loss/reg": 0.0, "step": 23000 }, { "epoch": 0.15138157894736842, "grad_norm": 2.546875, "grad_norm_var": 0.12967020670572918, "learning_rate": 0.0001, "loss": 3.043, "loss/crossentropy": 2.48363002538681, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.22335316240787506, "loss/reg": 0.0, "step": 23010 }, { "epoch": 0.15144736842105264, "grad_norm": 2.109375, "grad_norm_var": 0.1149810791015625, "learning_rate": 0.0001, "loss": 3.1275, "loss/crossentropy": 2.305170452594757, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.23285460025072097, "loss/reg": 0.0, "step": 23020 }, { "epoch": 0.15151315789473685, "grad_norm": 2.21875, "grad_norm_var": 0.06519775390625, "learning_rate": 0.0001, "loss": 3.0832, "loss/crossentropy": 2.5363330364227297, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.22520080506801604, "loss/reg": 0.0, "step": 23030 }, { "epoch": 0.15157894736842106, "grad_norm": 2.125, "grad_norm_var": 0.39241129557291665, "learning_rate": 0.0001, "loss": 3.113, "loss/crossentropy": 2.1367889642715454, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.29226300269365313, "loss/reg": 0.0, "step": 23040 }, { "epoch": 0.15164473684210528, "grad_norm": 2.171875, "grad_norm_var": 0.39695638020833335, "learning_rate": 0.0001, "loss": 3.1147, "loss/crossentropy": 2.4076698064804076, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.34615216627717016, "loss/reg": 0.0, "step": 23050 }, { "epoch": 0.15171052631578946, "grad_norm": 2.125, "grad_norm_var": 0.23273824055989584, "learning_rate": 0.0001, "loss": 3.0922, "loss/crossentropy": 2.2620026230812074, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.24059030711650847, "loss/reg": 0.0, "step": 23060 }, { "epoch": 0.15177631578947368, "grad_norm": 2.125, "grad_norm_var": 0.258984375, "learning_rate": 0.0001, "loss": 3.0849, "loss/crossentropy": 2.4243380606174467, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.28078764528036115, "loss/reg": 0.0, "step": 23070 }, { "epoch": 0.1518421052631579, "grad_norm": 2.796875, "grad_norm_var": 4.336297930295782e+17, "learning_rate": 0.0001, "loss": 3.1967, "loss/crossentropy": 2.3340250134468077, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2355951637029648, "loss/reg": 0.0, "step": 23080 }, { "epoch": 0.1519078947368421, "grad_norm": 2.265625, "grad_norm_var": 4.336297930357517e+17, "learning_rate": 0.0001, "loss": 3.031, "loss/crossentropy": 2.1574897170066833, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.21495652794837952, "loss/reg": 0.0, "step": 23090 }, { "epoch": 0.15197368421052632, "grad_norm": 2.109375, "grad_norm_var": 0.1123199462890625, "learning_rate": 0.0001, "loss": 3.1392, "loss/crossentropy": 2.405716967582703, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.27174981236457824, "loss/reg": 0.0, "step": 23100 }, { "epoch": 0.15203947368421053, "grad_norm": 2.359375, "grad_norm_var": 2.003859923676365e+16, "learning_rate": 0.0001, "loss": 3.2336, "loss/crossentropy": 2.339055967330933, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.28572248220443724, "loss/reg": 0.0, "step": 23110 }, { "epoch": 0.15210526315789474, "grad_norm": 2.125, "grad_norm_var": 0.21015218098958333, "learning_rate": 0.0001, "loss": 3.0005, "loss/crossentropy": 2.2984538078308105, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.21156432926654817, "loss/reg": 0.0, "step": 23120 }, { "epoch": 0.15217105263157896, "grad_norm": 2.140625, "grad_norm_var": 0.08379618326822917, "learning_rate": 0.0001, "loss": 3.116, "loss/crossentropy": 2.3342798829078673, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.22476595863699914, "loss/reg": 0.0, "step": 23130 }, { "epoch": 0.15223684210526317, "grad_norm": 2.5, "grad_norm_var": 0.0606842041015625, "learning_rate": 0.0001, "loss": 3.0265, "loss/crossentropy": 2.3210121750831605, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23322480022907258, "loss/reg": 0.0, "step": 23140 }, { "epoch": 0.15230263157894736, "grad_norm": 2.109375, "grad_norm_var": 0.23906962076822916, "learning_rate": 0.0001, "loss": 3.0813, "loss/crossentropy": 2.391731929779053, "loss/hidden": 3.05625, "loss/incoh": 0.0, "loss/logits": 0.3979014977812767, "loss/reg": 0.0, "step": 23150 }, { "epoch": 0.15236842105263157, "grad_norm": 2.609375, "grad_norm_var": 0.049214680989583336, "learning_rate": 0.0001, "loss": 3.0574, "loss/crossentropy": 2.492924761772156, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.23957584500312806, "loss/reg": 0.0, "step": 23160 }, { "epoch": 0.15243421052631578, "grad_norm": 1.9765625, "grad_norm_var": 0.07945938110351562, "learning_rate": 0.0001, "loss": 3.0317, "loss/crossentropy": 2.3342637419700623, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.30051169246435167, "loss/reg": 0.0, "step": 23170 }, { "epoch": 0.1525, "grad_norm": 2.78125, "grad_norm_var": 0.1088396708170573, "learning_rate": 0.0001, "loss": 3.1944, "loss/crossentropy": 2.306346225738525, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.246099853515625, "loss/reg": 0.0, "step": 23180 }, { "epoch": 0.1525657894736842, "grad_norm": 2.5625, "grad_norm_var": 0.17786839803059895, "learning_rate": 0.0001, "loss": 3.0341, "loss/crossentropy": 2.030124247074127, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.21610897928476333, "loss/reg": 0.0, "step": 23190 }, { "epoch": 0.15263157894736842, "grad_norm": 2.203125, "grad_norm_var": 0.12164688110351562, "learning_rate": 0.0001, "loss": 3.0381, "loss/crossentropy": 2.3866997003555297, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.25531851798295974, "loss/reg": 0.0, "step": 23200 }, { "epoch": 0.15269736842105264, "grad_norm": 2.140625, "grad_norm_var": 0.13189697265625, "learning_rate": 0.0001, "loss": 3.1059, "loss/crossentropy": 2.6614113092422484, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.26844717264175416, "loss/reg": 0.0, "step": 23210 }, { "epoch": 0.15276315789473685, "grad_norm": 2.328125, "grad_norm_var": 0.24109598795572917, "learning_rate": 0.0001, "loss": 3.0657, "loss/crossentropy": 2.384077048301697, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.2543630987405777, "loss/reg": 0.0, "step": 23220 }, { "epoch": 0.15282894736842106, "grad_norm": 2.21875, "grad_norm_var": 0.27134501139322914, "learning_rate": 0.0001, "loss": 3.0512, "loss/crossentropy": 2.4361581921577455, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.24168266355991364, "loss/reg": 0.0, "step": 23230 }, { "epoch": 0.15289473684210525, "grad_norm": 2.6875, "grad_norm_var": 0.06411031087239584, "learning_rate": 0.0001, "loss": 3.1163, "loss/crossentropy": 2.439437985420227, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.23927776366472245, "loss/reg": 0.0, "step": 23240 }, { "epoch": 0.15296052631578946, "grad_norm": 2.4375, "grad_norm_var": 0.20168355305989583, "learning_rate": 0.0001, "loss": 3.1405, "loss/crossentropy": 1.9147474735975265, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.27681511342525483, "loss/reg": 0.0, "step": 23250 }, { "epoch": 0.15302631578947368, "grad_norm": 2.4375, "grad_norm_var": 0.3036122639973958, "learning_rate": 0.0001, "loss": 3.1139, "loss/crossentropy": 1.9262932538986206, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.2078495942056179, "loss/reg": 0.0, "step": 23260 }, { "epoch": 0.1530921052631579, "grad_norm": 3.1875, "grad_norm_var": 0.26516520182291664, "learning_rate": 0.0001, "loss": 3.0706, "loss/crossentropy": 2.252467918395996, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.24831898286938667, "loss/reg": 0.0, "step": 23270 }, { "epoch": 0.1531578947368421, "grad_norm": 2.921875, "grad_norm_var": 0.12827860514322917, "learning_rate": 0.0001, "loss": 3.1631, "loss/crossentropy": 2.0835241615772246, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.2629551820456982, "loss/reg": 0.0, "step": 23280 }, { "epoch": 0.15322368421052632, "grad_norm": 4.0625, "grad_norm_var": 0.2850494384765625, "learning_rate": 0.0001, "loss": 3.0116, "loss/crossentropy": 2.533276152610779, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.25189711451530455, "loss/reg": 0.0, "step": 23290 }, { "epoch": 0.15328947368421053, "grad_norm": 1.984375, "grad_norm_var": 0.28486328125, "learning_rate": 0.0001, "loss": 3.0549, "loss/crossentropy": 2.2883153557777405, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.20354210436344147, "loss/reg": 0.0, "step": 23300 }, { "epoch": 0.15335526315789474, "grad_norm": 3.03125, "grad_norm_var": 0.1171295166015625, "learning_rate": 0.0001, "loss": 3.1273, "loss/crossentropy": 2.3346426010131838, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.32933240532875063, "loss/reg": 0.0, "step": 23310 }, { "epoch": 0.15342105263157896, "grad_norm": 2.65625, "grad_norm_var": 0.15787760416666666, "learning_rate": 0.0001, "loss": 3.141, "loss/crossentropy": 2.1821151852607725, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2542765408754349, "loss/reg": 0.0, "step": 23320 }, { "epoch": 0.15348684210526317, "grad_norm": 2.640625, "grad_norm_var": 0.1633453369140625, "learning_rate": 0.0001, "loss": 3.1175, "loss/crossentropy": 2.303962028026581, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.25389211922883986, "loss/reg": 0.0, "step": 23330 }, { "epoch": 0.15355263157894736, "grad_norm": 2.46875, "grad_norm_var": 6.715729777018229, "learning_rate": 0.0001, "loss": 3.1862, "loss/crossentropy": 2.387230467796326, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.26721241027116777, "loss/reg": 0.0, "step": 23340 }, { "epoch": 0.15361842105263157, "grad_norm": 2.3125, "grad_norm_var": 0.14257405598958334, "learning_rate": 0.0001, "loss": 3.1622, "loss/crossentropy": 2.4242820143699646, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.29456254839897156, "loss/reg": 0.0, "step": 23350 }, { "epoch": 0.15368421052631578, "grad_norm": 2.4375, "grad_norm_var": 0.18007405598958334, "learning_rate": 0.0001, "loss": 3.1082, "loss/crossentropy": 2.1694801568984987, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.3005404189229012, "loss/reg": 0.0, "step": 23360 }, { "epoch": 0.15375, "grad_norm": 2.4375, "grad_norm_var": 0.13083089192708333, "learning_rate": 0.0001, "loss": 3.0146, "loss/crossentropy": 2.5201270818710326, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.23150605708360672, "loss/reg": 0.0, "step": 23370 }, { "epoch": 0.1538157894736842, "grad_norm": 2.21875, "grad_norm_var": 0.1706621805826823, "learning_rate": 0.0001, "loss": 3.04, "loss/crossentropy": 2.4117377281188963, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.22549189925193786, "loss/reg": 0.0, "step": 23380 }, { "epoch": 0.15388157894736842, "grad_norm": 2.4375, "grad_norm_var": 0.049843088785807295, "learning_rate": 0.0001, "loss": 3.0355, "loss/crossentropy": 2.166217362880707, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2460268869996071, "loss/reg": 0.0, "step": 23390 }, { "epoch": 0.15394736842105264, "grad_norm": 2.3125, "grad_norm_var": 0.027437337239583335, "learning_rate": 0.0001, "loss": 3.0332, "loss/crossentropy": 2.2912797331809998, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.2184738412499428, "loss/reg": 0.0, "step": 23400 }, { "epoch": 0.15401315789473685, "grad_norm": 2.46875, "grad_norm_var": 0.02008056640625, "learning_rate": 0.0001, "loss": 3.049, "loss/crossentropy": 2.2143723726272584, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.23227078467607498, "loss/reg": 0.0, "step": 23410 }, { "epoch": 0.15407894736842107, "grad_norm": 2.46875, "grad_norm_var": 0.04762140909830729, "learning_rate": 0.0001, "loss": 2.9746, "loss/crossentropy": 2.1996548891067507, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.20859691947698594, "loss/reg": 0.0, "step": 23420 }, { "epoch": 0.15414473684210525, "grad_norm": 2.390625, "grad_norm_var": 0.08440348307291666, "learning_rate": 0.0001, "loss": 3.1332, "loss/crossentropy": 2.353720319271088, "loss/hidden": 3.0734375, "loss/incoh": 0.0, "loss/logits": 0.32487219721078875, "loss/reg": 0.0, "step": 23430 }, { "epoch": 0.15421052631578946, "grad_norm": 2.421875, "grad_norm_var": 0.04627278645833333, "learning_rate": 0.0001, "loss": 3.0352, "loss/crossentropy": 1.994243037700653, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2228219524025917, "loss/reg": 0.0, "step": 23440 }, { "epoch": 0.15427631578947368, "grad_norm": 2.515625, "grad_norm_var": 0.025699869791666666, "learning_rate": 0.0001, "loss": 3.0571, "loss/crossentropy": 2.368081831932068, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.24828383028507234, "loss/reg": 0.0, "step": 23450 }, { "epoch": 0.1543421052631579, "grad_norm": 2.734375, "grad_norm_var": 0.06443684895833333, "learning_rate": 0.0001, "loss": 3.0708, "loss/crossentropy": 2.1184537053108214, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.2568272680044174, "loss/reg": 0.0, "step": 23460 }, { "epoch": 0.1544078947368421, "grad_norm": 2.421875, "grad_norm_var": 0.13352762858072917, "learning_rate": 0.0001, "loss": 3.1323, "loss/crossentropy": 2.3327906489372254, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.2940856009721756, "loss/reg": 0.0, "step": 23470 }, { "epoch": 0.15447368421052632, "grad_norm": 2.4375, "grad_norm_var": 0.14630533854166666, "learning_rate": 0.0001, "loss": 3.0207, "loss/crossentropy": 2.5257789850234986, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.24059889316558838, "loss/reg": 0.0, "step": 23480 }, { "epoch": 0.15453947368421053, "grad_norm": 2.34375, "grad_norm_var": 0.059357706705729166, "learning_rate": 0.0001, "loss": 3.0888, "loss/crossentropy": 2.0001726031303404, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.2182185634970665, "loss/reg": 0.0, "step": 23490 }, { "epoch": 0.15460526315789475, "grad_norm": 2.203125, "grad_norm_var": 0.07535171508789062, "learning_rate": 0.0001, "loss": 3.0415, "loss/crossentropy": 2.4137069463729857, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.26817760467529295, "loss/reg": 0.0, "step": 23500 }, { "epoch": 0.15467105263157896, "grad_norm": 2.296875, "grad_norm_var": 0.09633967081705729, "learning_rate": 0.0001, "loss": 3.0427, "loss/crossentropy": 2.534043550491333, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.22827706933021547, "loss/reg": 0.0, "step": 23510 }, { "epoch": 0.15473684210526314, "grad_norm": 2.1875, "grad_norm_var": 0.0346832275390625, "learning_rate": 0.0001, "loss": 3.0819, "loss/crossentropy": 2.1481072187423704, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.22196744605898858, "loss/reg": 0.0, "step": 23520 }, { "epoch": 0.15480263157894736, "grad_norm": 2.734375, "grad_norm_var": 0.07886962890625, "learning_rate": 0.0001, "loss": 3.1167, "loss/crossentropy": 2.0396682798862455, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.22266072854399682, "loss/reg": 0.0, "step": 23530 }, { "epoch": 0.15486842105263157, "grad_norm": 2.0625, "grad_norm_var": 0.038182576497395836, "learning_rate": 0.0001, "loss": 3.075, "loss/crossentropy": 2.164088273048401, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2313580572605133, "loss/reg": 0.0, "step": 23540 }, { "epoch": 0.15493421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.2714515686035156, "learning_rate": 0.0001, "loss": 3.0593, "loss/crossentropy": 2.304581105709076, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.24230952113866805, "loss/reg": 0.0, "step": 23550 }, { "epoch": 0.155, "grad_norm": 2.421875, "grad_norm_var": 0.1161376953125, "learning_rate": 0.0001, "loss": 2.9886, "loss/crossentropy": 1.9074640274047852, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.1951706364750862, "loss/reg": 0.0, "step": 23560 }, { "epoch": 0.1550657894736842, "grad_norm": 2.515625, "grad_norm_var": 0.12790908813476562, "learning_rate": 0.0001, "loss": 3.0776, "loss/crossentropy": 2.3702269911766054, "loss/hidden": 3.0203125, "loss/incoh": 0.0, "loss/logits": 0.2716818228363991, "loss/reg": 0.0, "step": 23570 }, { "epoch": 0.15513157894736843, "grad_norm": 2.578125, "grad_norm_var": 0.1339019775390625, "learning_rate": 0.0001, "loss": 3.0873, "loss/crossentropy": 2.205389070510864, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2579692542552948, "loss/reg": 0.0, "step": 23580 }, { "epoch": 0.15519736842105264, "grad_norm": 2.296875, "grad_norm_var": 0.06905924479166667, "learning_rate": 0.0001, "loss": 3.088, "loss/crossentropy": 2.091149592399597, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.2086069330573082, "loss/reg": 0.0, "step": 23590 }, { "epoch": 0.15526315789473685, "grad_norm": 2.390625, "grad_norm_var": 0.056722005208333336, "learning_rate": 0.0001, "loss": 3.1228, "loss/crossentropy": 2.2192450404167174, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.28743477165699005, "loss/reg": 0.0, "step": 23600 }, { "epoch": 0.15532894736842107, "grad_norm": 2.328125, "grad_norm_var": 0.05885009765625, "learning_rate": 0.0001, "loss": 3.0878, "loss/crossentropy": 2.531976878643036, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2344336360692978, "loss/reg": 0.0, "step": 23610 }, { "epoch": 0.15539473684210525, "grad_norm": 2.984375, "grad_norm_var": 0.07700907389322917, "learning_rate": 0.0001, "loss": 3.0331, "loss/crossentropy": 2.2710886120796205, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.2305494710803032, "loss/reg": 0.0, "step": 23620 }, { "epoch": 0.15546052631578947, "grad_norm": 2.625, "grad_norm_var": 0.055964152018229164, "learning_rate": 0.0001, "loss": 3.1336, "loss/crossentropy": 2.3865880966186523, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.25577795803546904, "loss/reg": 0.0, "step": 23630 }, { "epoch": 0.15552631578947368, "grad_norm": 2.109375, "grad_norm_var": 0.05320536295572917, "learning_rate": 0.0001, "loss": 3.1079, "loss/crossentropy": 2.388904869556427, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.26212394386529925, "loss/reg": 0.0, "step": 23640 }, { "epoch": 0.1555921052631579, "grad_norm": 2.25, "grad_norm_var": 0.28601252237955727, "learning_rate": 0.0001, "loss": 3.0189, "loss/crossentropy": 2.3917470812797545, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.28409580439329146, "loss/reg": 0.0, "step": 23650 }, { "epoch": 0.1556578947368421, "grad_norm": 2.390625, "grad_norm_var": 0.05244038899739583, "learning_rate": 0.0001, "loss": 3.0892, "loss/crossentropy": 2.266455078125, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.23892848715186119, "loss/reg": 0.0, "step": 23660 }, { "epoch": 0.15572368421052632, "grad_norm": 2.53125, "grad_norm_var": 0.021256510416666666, "learning_rate": 0.0001, "loss": 3.1403, "loss/crossentropy": 2.3996535420417784, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.32787969559431074, "loss/reg": 0.0, "step": 23670 }, { "epoch": 0.15578947368421053, "grad_norm": 2.359375, "grad_norm_var": 0.03367411295572917, "learning_rate": 0.0001, "loss": 3.0311, "loss/crossentropy": 2.2268054604530336, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.25249754935503005, "loss/reg": 0.0, "step": 23680 }, { "epoch": 0.15585526315789475, "grad_norm": 2.328125, "grad_norm_var": 0.112744140625, "learning_rate": 0.0001, "loss": 3.1572, "loss/crossentropy": 2.4530532598495483, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.2573227033019066, "loss/reg": 0.0, "step": 23690 }, { "epoch": 0.15592105263157896, "grad_norm": 2.40625, "grad_norm_var": 0.2130767822265625, "learning_rate": 0.0001, "loss": 3.0932, "loss/crossentropy": 2.4343389749526976, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.26323390007019043, "loss/reg": 0.0, "step": 23700 }, { "epoch": 0.15598684210526315, "grad_norm": 2.421875, "grad_norm_var": 0.15120340983072916, "learning_rate": 0.0001, "loss": 3.1834, "loss/crossentropy": 2.1292245745658875, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.23169804438948632, "loss/reg": 0.0, "step": 23710 }, { "epoch": 0.15605263157894736, "grad_norm": 2.375, "grad_norm_var": 0.0550201416015625, "learning_rate": 0.0001, "loss": 3.0296, "loss/crossentropy": 2.4026230216026305, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.24430365711450577, "loss/reg": 0.0, "step": 23720 }, { "epoch": 0.15611842105263157, "grad_norm": 1.96875, "grad_norm_var": 0.07566731770833333, "learning_rate": 0.0001, "loss": 3.0245, "loss/crossentropy": 2.186066722869873, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.2246588721871376, "loss/reg": 0.0, "step": 23730 }, { "epoch": 0.1561842105263158, "grad_norm": 2.46875, "grad_norm_var": 0.05691731770833333, "learning_rate": 0.0001, "loss": 3.0638, "loss/crossentropy": 2.3755934476852416, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.23479508757591247, "loss/reg": 0.0, "step": 23740 }, { "epoch": 0.15625, "grad_norm": 2.640625, "grad_norm_var": 0.06516520182291667, "learning_rate": 0.0001, "loss": 3.1125, "loss/crossentropy": 2.1289533019065856, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.22816545218229295, "loss/reg": 0.0, "step": 23750 }, { "epoch": 0.1563157894736842, "grad_norm": 2.0625, "grad_norm_var": 0.15618489583333334, "learning_rate": 0.0001, "loss": 3.1231, "loss/crossentropy": 2.5586398363113405, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2851672574877739, "loss/reg": 0.0, "step": 23760 }, { "epoch": 0.15638157894736843, "grad_norm": 2.5, "grad_norm_var": 0.1464752197265625, "learning_rate": 0.0001, "loss": 3.063, "loss/crossentropy": 2.3234678387641905, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.2802169814705849, "loss/reg": 0.0, "step": 23770 }, { "epoch": 0.15644736842105264, "grad_norm": 2.5625, "grad_norm_var": 3.3502559026353274e+17, "learning_rate": 0.0001, "loss": 3.217, "loss/crossentropy": 2.1664799213409425, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.24028320759534835, "loss/reg": 0.0, "step": 23780 }, { "epoch": 0.15651315789473685, "grad_norm": 2.4375, "grad_norm_var": 3.35025590324881e+17, "learning_rate": 0.0001, "loss": 3.0729, "loss/crossentropy": 2.4900641202926637, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.22242969423532485, "loss/reg": 0.0, "step": 23790 }, { "epoch": 0.15657894736842104, "grad_norm": 3.125, "grad_norm_var": 0.42876561482747394, "learning_rate": 0.0001, "loss": 3.0548, "loss/crossentropy": 2.285599112510681, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.27018204629421233, "loss/reg": 0.0, "step": 23800 }, { "epoch": 0.15664473684210525, "grad_norm": 2.328125, "grad_norm_var": 0.41369527180989585, "learning_rate": 0.0001, "loss": 2.9886, "loss/crossentropy": 2.352921783924103, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.20102634727954866, "loss/reg": 0.0, "step": 23810 }, { "epoch": 0.15671052631578947, "grad_norm": 2.0625, "grad_norm_var": 0.11093343098958333, "learning_rate": 0.0001, "loss": 3.0036, "loss/crossentropy": 2.4093087553977965, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.24685168713331224, "loss/reg": 0.0, "step": 23820 }, { "epoch": 0.15677631578947368, "grad_norm": 2.140625, "grad_norm_var": 0.0461578369140625, "learning_rate": 0.0001, "loss": 3.0504, "loss/crossentropy": 2.2815535068511963, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.21896685659885406, "loss/reg": 0.0, "step": 23830 }, { "epoch": 0.1568421052631579, "grad_norm": 2.15625, "grad_norm_var": 0.07251561482747396, "learning_rate": 0.0001, "loss": 2.9885, "loss/crossentropy": 2.6134737491607667, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.22691430598497392, "loss/reg": 0.0, "step": 23840 }, { "epoch": 0.1569078947368421, "grad_norm": 4.53125, "grad_norm_var": 0.35949605305989585, "learning_rate": 0.0001, "loss": 3.0464, "loss/crossentropy": 2.220982587337494, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.22496577724814415, "loss/reg": 0.0, "step": 23850 }, { "epoch": 0.15697368421052632, "grad_norm": 2.1875, "grad_norm_var": 0.4110636393229167, "learning_rate": 0.0001, "loss": 3.0667, "loss/crossentropy": 2.291455662250519, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.2523698851466179, "loss/reg": 0.0, "step": 23860 }, { "epoch": 0.15703947368421053, "grad_norm": 2.46875, "grad_norm_var": 0.2355133056640625, "learning_rate": 0.0001, "loss": 3.0867, "loss/crossentropy": 2.4676188588142396, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.20559777021408082, "loss/reg": 0.0, "step": 23870 }, { "epoch": 0.15710526315789475, "grad_norm": 2.609375, "grad_norm_var": 0.09799702962239583, "learning_rate": 0.0001, "loss": 3.0123, "loss/crossentropy": 2.4020154595375063, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.24061929136514665, "loss/reg": 0.0, "step": 23880 }, { "epoch": 0.15717105263157893, "grad_norm": 2.578125, "grad_norm_var": 2.1426422119140627, "learning_rate": 0.0001, "loss": 3.02, "loss/crossentropy": 2.0822024583816527, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.23049053847789763, "loss/reg": 0.0, "step": 23890 }, { "epoch": 0.15723684210526315, "grad_norm": 2.1875, "grad_norm_var": 2.0829254150390626, "learning_rate": 0.0001, "loss": 3.0151, "loss/crossentropy": 2.210465407371521, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.24976756423711777, "loss/reg": 0.0, "step": 23900 }, { "epoch": 0.15730263157894736, "grad_norm": 2.546875, "grad_norm_var": 0.5316731770833333, "learning_rate": 0.0001, "loss": 3.0642, "loss/crossentropy": 2.1934141278266908, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.18757687732577324, "loss/reg": 0.0, "step": 23910 }, { "epoch": 0.15736842105263157, "grad_norm": 2.53125, "grad_norm_var": 0.5506998697916666, "learning_rate": 0.0001, "loss": 3.1533, "loss/crossentropy": 2.2942312955856323, "loss/hidden": 3.0890625, "loss/incoh": 0.0, "loss/logits": 0.25652203559875486, "loss/reg": 0.0, "step": 23920 }, { "epoch": 0.1574342105263158, "grad_norm": 2.875, "grad_norm_var": 0.38448893229166664, "learning_rate": 0.0001, "loss": 3.0496, "loss/crossentropy": 2.4290681004524233, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.22672214657068251, "loss/reg": 0.0, "step": 23930 }, { "epoch": 0.1575, "grad_norm": 2.796875, "grad_norm_var": 0.35227864583333335, "learning_rate": 0.0001, "loss": 3.0868, "loss/crossentropy": 2.1520013570785523, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2738748073577881, "loss/reg": 0.0, "step": 23940 }, { "epoch": 0.15756578947368421, "grad_norm": 2.359375, "grad_norm_var": 0.07700907389322917, "learning_rate": 0.0001, "loss": 3.1436, "loss/crossentropy": 2.5199429869651793, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2519889883697033, "loss/reg": 0.0, "step": 23950 }, { "epoch": 0.15763157894736843, "grad_norm": 2.171875, "grad_norm_var": 1.195580037434896, "learning_rate": 0.0001, "loss": 3.0924, "loss/crossentropy": 2.362662875652313, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.2809165805578232, "loss/reg": 0.0, "step": 23960 }, { "epoch": 0.15769736842105264, "grad_norm": 1.9921875, "grad_norm_var": 1.2223609924316405, "learning_rate": 0.0001, "loss": 3.1196, "loss/crossentropy": 2.360364031791687, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2587847799062729, "loss/reg": 0.0, "step": 23970 }, { "epoch": 0.15776315789473686, "grad_norm": 2.171875, "grad_norm_var": 0.028562164306640624, "learning_rate": 0.0001, "loss": 3.0014, "loss/crossentropy": 2.268549180030823, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2278840996325016, "loss/reg": 0.0, "step": 23980 }, { "epoch": 0.15782894736842104, "grad_norm": 2.390625, "grad_norm_var": 0.14148661295572917, "learning_rate": 0.0001, "loss": 3.0699, "loss/crossentropy": 2.0497791051864622, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.2620595782995224, "loss/reg": 0.0, "step": 23990 }, { "epoch": 0.15789473684210525, "grad_norm": 2.78125, "grad_norm_var": 2.927515672594874e+17, "learning_rate": 0.0001, "loss": 3.2727, "loss/crossentropy": 2.035860624909401, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.20649760514497756, "loss/reg": 0.0, "step": 24000 }, { "epoch": 0.15796052631578947, "grad_norm": 3.265625, "grad_norm_var": 2.927515672138351e+17, "learning_rate": 0.0001, "loss": 3.1828, "loss/crossentropy": 2.358655941486359, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.23838587403297423, "loss/reg": 0.0, "step": 24010 }, { "epoch": 0.15802631578947368, "grad_norm": 4.34375, "grad_norm_var": 0.4474029541015625, "learning_rate": 0.0001, "loss": 3.0939, "loss/crossentropy": 2.436113882064819, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.25293364077806474, "loss/reg": 0.0, "step": 24020 }, { "epoch": 0.1580921052631579, "grad_norm": 2.15625, "grad_norm_var": 0.346875, "learning_rate": 0.0001, "loss": 3.1031, "loss/crossentropy": 2.1950599789619445, "loss/hidden": 3.025, "loss/incoh": 0.0, "loss/logits": 0.29097774922847747, "loss/reg": 0.0, "step": 24030 }, { "epoch": 0.1581578947368421, "grad_norm": 3.5, "grad_norm_var": 0.84117431640625, "learning_rate": 0.0001, "loss": 3.138, "loss/crossentropy": 2.488498842716217, "loss/hidden": 2.9921875, "loss/incoh": 0.0, "loss/logits": 0.2546138882637024, "loss/reg": 0.0, "step": 24040 }, { "epoch": 0.15822368421052632, "grad_norm": 2.296875, "grad_norm_var": 0.20335286458333332, "learning_rate": 0.0001, "loss": 3.1208, "loss/crossentropy": 2.1634037137031554, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.23214713484048843, "loss/reg": 0.0, "step": 24050 }, { "epoch": 0.15828947368421054, "grad_norm": 2.375, "grad_norm_var": 0.036454264322916666, "learning_rate": 0.0001, "loss": 3.1708, "loss/crossentropy": 2.3869518637657166, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.2666103199124336, "loss/reg": 0.0, "step": 24060 }, { "epoch": 0.15835526315789475, "grad_norm": 2.234375, "grad_norm_var": 0.1547515869140625, "learning_rate": 0.0001, "loss": 3.0924, "loss/crossentropy": 2.2607127904891966, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.25745444297790526, "loss/reg": 0.0, "step": 24070 }, { "epoch": 0.15842105263157893, "grad_norm": 4.0, "grad_norm_var": 0.22906494140625, "learning_rate": 0.0001, "loss": 3.0239, "loss/crossentropy": 2.2656429171562196, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.2332296848297119, "loss/reg": 0.0, "step": 24080 }, { "epoch": 0.15848684210526315, "grad_norm": 2.078125, "grad_norm_var": 0.23699442545572916, "learning_rate": 0.0001, "loss": 3.0181, "loss/crossentropy": 2.2533790946006773, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.2367706872522831, "loss/reg": 0.0, "step": 24090 }, { "epoch": 0.15855263157894736, "grad_norm": 2.34375, "grad_norm_var": 0.06172587076822917, "learning_rate": 0.0001, "loss": 3.045, "loss/crossentropy": 2.3126723527908326, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.23582341223955156, "loss/reg": 0.0, "step": 24100 }, { "epoch": 0.15861842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.08645426432291667, "learning_rate": 0.0001, "loss": 3.0989, "loss/crossentropy": 2.284153974056244, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.3264383256435394, "loss/reg": 0.0, "step": 24110 }, { "epoch": 0.1586842105263158, "grad_norm": 3.671875, "grad_norm_var": 172.1703125, "learning_rate": 0.0001, "loss": 3.2515, "loss/crossentropy": 2.463814842700958, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.2696057379245758, "loss/reg": 0.0, "step": 24120 }, { "epoch": 0.15875, "grad_norm": 2.34375, "grad_norm_var": 0.19868062337239584, "learning_rate": 0.0001, "loss": 3.1115, "loss/crossentropy": 2.41875559091568, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.2305321291089058, "loss/reg": 0.0, "step": 24130 }, { "epoch": 0.15881578947368422, "grad_norm": 2.140625, "grad_norm_var": 0.5208513895670573, "learning_rate": 0.0001, "loss": 3.0293, "loss/crossentropy": 2.051196885108948, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.22749217078089715, "loss/reg": 0.0, "step": 24140 }, { "epoch": 0.15888157894736843, "grad_norm": 2298478592.0, "grad_norm_var": 3.301877386622895e+17, "learning_rate": 0.0001, "loss": 3.3081, "loss/crossentropy": 2.3360164284706117, "loss/hidden": 3.290625, "loss/incoh": 0.0, "loss/logits": 0.3266043797135353, "loss/reg": 0.0, "step": 24150 }, { "epoch": 0.15894736842105264, "grad_norm": 2.046875, "grad_norm_var": 3.3018773920608314e+17, "learning_rate": 0.0001, "loss": 3.0123, "loss/crossentropy": 2.5075667977333067, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.19614752382040024, "loss/reg": 0.0, "step": 24160 }, { "epoch": 0.15901315789473683, "grad_norm": 2.453125, "grad_norm_var": 0.06570002237955729, "learning_rate": 0.0001, "loss": 3.0729, "loss/crossentropy": 2.1672366857528687, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.2613453507423401, "loss/reg": 0.0, "step": 24170 }, { "epoch": 0.15907894736842104, "grad_norm": 2.109375, "grad_norm_var": 0.437475331624349, "learning_rate": 0.0001, "loss": 3.073, "loss/crossentropy": 2.1420519262552262, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.20748247653245927, "loss/reg": 0.0, "step": 24180 }, { "epoch": 0.15914473684210526, "grad_norm": 2.359375, "grad_norm_var": 0.5490468343098959, "learning_rate": 0.0001, "loss": 3.1018, "loss/crossentropy": 2.5135774850845336, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.2358848750591278, "loss/reg": 0.0, "step": 24190 }, { "epoch": 0.15921052631578947, "grad_norm": 2.15625, "grad_norm_var": 0.05199559529622396, "learning_rate": 0.0001, "loss": 3.0415, "loss/crossentropy": 2.392611360549927, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.29126308858394623, "loss/reg": 0.0, "step": 24200 }, { "epoch": 0.15927631578947368, "grad_norm": 2.03125, "grad_norm_var": 0.3502886454264323, "learning_rate": 0.0001, "loss": 3.0974, "loss/crossentropy": 2.2008650302886963, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2575715593993664, "loss/reg": 0.0, "step": 24210 }, { "epoch": 0.1593421052631579, "grad_norm": 2.625, "grad_norm_var": 0.2436920166015625, "learning_rate": 0.0001, "loss": 3.1615, "loss/crossentropy": 2.398923659324646, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.2272189199924469, "loss/reg": 0.0, "step": 24220 }, { "epoch": 0.1594078947368421, "grad_norm": 2.3125, "grad_norm_var": 0.2711252848307292, "learning_rate": 0.0001, "loss": 3.1147, "loss/crossentropy": 2.2159452080726623, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.24379997029900552, "loss/reg": 0.0, "step": 24230 }, { "epoch": 0.15947368421052632, "grad_norm": 2.40625, "grad_norm_var": 0.16237691243489583, "learning_rate": 0.0001, "loss": 3.032, "loss/crossentropy": 2.219667136669159, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.2103893890976906, "loss/reg": 0.0, "step": 24240 }, { "epoch": 0.15953947368421054, "grad_norm": 2.625, "grad_norm_var": 0.035920206705729166, "learning_rate": 0.0001, "loss": 3.1149, "loss/crossentropy": 2.0510872304439545, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2106800675392151, "loss/reg": 0.0, "step": 24250 }, { "epoch": 0.15960526315789475, "grad_norm": 2.078125, "grad_norm_var": 0.05742085774739583, "learning_rate": 0.0001, "loss": 2.9921, "loss/crossentropy": 2.202706849575043, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.200262913107872, "loss/reg": 0.0, "step": 24260 }, { "epoch": 0.15967105263157894, "grad_norm": 2.03125, "grad_norm_var": 0.47508138020833335, "learning_rate": 0.0001, "loss": 3.0355, "loss/crossentropy": 2.312657380104065, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.25428467951714995, "loss/reg": 0.0, "step": 24270 }, { "epoch": 0.15973684210526315, "grad_norm": 2.09375, "grad_norm_var": 0.11523030598958334, "learning_rate": 0.0001, "loss": 3.1753, "loss/crossentropy": 2.6283345460891723, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.2865023031830788, "loss/reg": 0.0, "step": 24280 }, { "epoch": 0.15980263157894736, "grad_norm": 2.640625, "grad_norm_var": 0.04806315104166667, "learning_rate": 0.0001, "loss": 3.0594, "loss/crossentropy": 2.4725473642349245, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.20711587965488434, "loss/reg": 0.0, "step": 24290 }, { "epoch": 0.15986842105263158, "grad_norm": 2.3125, "grad_norm_var": 2.7487790628399786e+17, "learning_rate": 0.0001, "loss": 3.265, "loss/crossentropy": 2.2610708713531493, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.27779700309038163, "loss/reg": 0.0, "step": 24300 }, { "epoch": 0.1599342105263158, "grad_norm": 1.828125, "grad_norm_var": 2.748779062992896e+17, "learning_rate": 0.0001, "loss": 3.0342, "loss/crossentropy": 2.4125430822372436, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.24020388573408127, "loss/reg": 0.0, "step": 24310 }, { "epoch": 0.16, "grad_norm": 2.4375, "grad_norm_var": 0.10875244140625, "learning_rate": 0.0001, "loss": 3.0387, "loss/crossentropy": 2.2118794441223146, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.23726068288087845, "loss/reg": 0.0, "step": 24320 }, { "epoch": 0.16006578947368422, "grad_norm": 2.109375, "grad_norm_var": 0.052897135416666664, "learning_rate": 0.0001, "loss": 3.0208, "loss/crossentropy": 2.4390548706054687, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.24900538772344588, "loss/reg": 0.0, "step": 24330 }, { "epoch": 0.16013157894736843, "grad_norm": 2.21875, "grad_norm_var": 0.5782511393229167, "learning_rate": 0.0001, "loss": 3.2442, "loss/crossentropy": 2.3729852437973022, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.23976370990276336, "loss/reg": 0.0, "step": 24340 }, { "epoch": 0.16019736842105264, "grad_norm": 2.328125, "grad_norm_var": 0.11790364583333333, "learning_rate": 0.0001, "loss": 3.0854, "loss/crossentropy": 2.4694011569023133, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.25265379548072814, "loss/reg": 0.0, "step": 24350 }, { "epoch": 0.16026315789473683, "grad_norm": 2.3125, "grad_norm_var": 0.0570465087890625, "learning_rate": 0.0001, "loss": 3.1662, "loss/crossentropy": 2.044074738025665, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.24245730862021447, "loss/reg": 0.0, "step": 24360 }, { "epoch": 0.16032894736842104, "grad_norm": 2.296875, "grad_norm_var": 0.0236724853515625, "learning_rate": 0.0001, "loss": 3.0431, "loss/crossentropy": 2.3142729878425596, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.22269122749567033, "loss/reg": 0.0, "step": 24370 }, { "epoch": 0.16039473684210526, "grad_norm": 2.0625, "grad_norm_var": 0.08518473307291667, "learning_rate": 0.0001, "loss": 3.1019, "loss/crossentropy": 2.0431338131427763, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.2301457904279232, "loss/reg": 0.0, "step": 24380 }, { "epoch": 0.16046052631578947, "grad_norm": 2.5, "grad_norm_var": 0.08948160807291666, "learning_rate": 0.0001, "loss": 2.9974, "loss/crossentropy": 2.215762954950333, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.21034832447767257, "loss/reg": 0.0, "step": 24390 }, { "epoch": 0.16052631578947368, "grad_norm": 2.390625, "grad_norm_var": 0.013671875, "learning_rate": 0.0001, "loss": 3.0257, "loss/crossentropy": 2.285198521614075, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.22049440741539, "loss/reg": 0.0, "step": 24400 }, { "epoch": 0.1605921052631579, "grad_norm": 2.296875, "grad_norm_var": 0.04234110514322917, "learning_rate": 0.0001, "loss": 3.0891, "loss/crossentropy": 2.509168195724487, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.28251273930072784, "loss/reg": 0.0, "step": 24410 }, { "epoch": 0.1606578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.33464253743489586, "learning_rate": 0.0001, "loss": 3.0359, "loss/crossentropy": 2.0764550805091857, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2346973106265068, "loss/reg": 0.0, "step": 24420 }, { "epoch": 0.16072368421052632, "grad_norm": 2.265625, "grad_norm_var": 0.04140625, "learning_rate": 0.0001, "loss": 3.0962, "loss/crossentropy": 2.4705166459083556, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.2445474848151207, "loss/reg": 0.0, "step": 24430 }, { "epoch": 0.16078947368421054, "grad_norm": 2.34375, "grad_norm_var": 0.3681060791015625, "learning_rate": 0.0001, "loss": 3.0701, "loss/crossentropy": 2.347282111644745, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.27417414337396623, "loss/reg": 0.0, "step": 24440 }, { "epoch": 0.16085526315789472, "grad_norm": 2.609375, "grad_norm_var": 0.0697662353515625, "learning_rate": 0.0001, "loss": 3.1662, "loss/crossentropy": 2.1282618045806885, "loss/hidden": 3.0671875, "loss/incoh": 0.0, "loss/logits": 0.32502798140048983, "loss/reg": 0.0, "step": 24450 }, { "epoch": 0.16092105263157894, "grad_norm": 2.546875, "grad_norm_var": 0.06052958170572917, "learning_rate": 0.0001, "loss": 3.0394, "loss/crossentropy": 2.5319641828536987, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.23511321395635604, "loss/reg": 0.0, "step": 24460 }, { "epoch": 0.16098684210526315, "grad_norm": 3.84375, "grad_norm_var": 0.18154067993164064, "learning_rate": 0.0001, "loss": 3.0401, "loss/crossentropy": 2.3912153840065002, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.2340209573507309, "loss/reg": 0.0, "step": 24470 }, { "epoch": 0.16105263157894736, "grad_norm": 2.234375, "grad_norm_var": 0.45718485514322915, "learning_rate": 0.0001, "loss": 3.0803, "loss/crossentropy": 2.4838199734687807, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.23594623059034348, "loss/reg": 0.0, "step": 24480 }, { "epoch": 0.16111842105263158, "grad_norm": 2.40625, "grad_norm_var": 0.12635269165039062, "learning_rate": 0.0001, "loss": 3.0486, "loss/crossentropy": 2.2548271775245667, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.22111207991838455, "loss/reg": 0.0, "step": 24490 }, { "epoch": 0.1611842105263158, "grad_norm": 1.953125, "grad_norm_var": 0.08367487589518229, "learning_rate": 0.0001, "loss": 3.0843, "loss/crossentropy": 2.364280033111572, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.2182385429739952, "loss/reg": 0.0, "step": 24500 }, { "epoch": 0.16125, "grad_norm": 2.28125, "grad_norm_var": 0.13824462890625, "learning_rate": 0.0001, "loss": 3.1481, "loss/crossentropy": 2.3593606472015383, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.24336900562047958, "loss/reg": 0.0, "step": 24510 }, { "epoch": 0.16131578947368422, "grad_norm": 2.15625, "grad_norm_var": 0.24010009765625, "learning_rate": 0.0001, "loss": 3.0484, "loss/crossentropy": 2.212929093837738, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.23406590819358825, "loss/reg": 0.0, "step": 24520 }, { "epoch": 0.16138157894736843, "grad_norm": 2.078125, "grad_norm_var": 0.240869140625, "learning_rate": 0.0001, "loss": 3.1152, "loss/crossentropy": 2.0746700882911684, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.20809975266456604, "loss/reg": 0.0, "step": 24530 }, { "epoch": 0.16144736842105264, "grad_norm": 2.40625, "grad_norm_var": 0.1971343994140625, "learning_rate": 0.0001, "loss": 3.0887, "loss/crossentropy": 2.5128459453582765, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.23968077450990677, "loss/reg": 0.0, "step": 24540 }, { "epoch": 0.16151315789473683, "grad_norm": 2.296875, "grad_norm_var": 0.10077718098958334, "learning_rate": 0.0001, "loss": 3.0776, "loss/crossentropy": 2.2077295780181885, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.24557382240891457, "loss/reg": 0.0, "step": 24550 }, { "epoch": 0.16157894736842104, "grad_norm": 2.15625, "grad_norm_var": 0.09933268229166667, "learning_rate": 0.0001, "loss": 3.0416, "loss/crossentropy": 2.3417739272117615, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.23852093145251274, "loss/reg": 0.0, "step": 24560 }, { "epoch": 0.16164473684210526, "grad_norm": 2.4375, "grad_norm_var": 0.16413472493489584, "learning_rate": 0.0001, "loss": 3.1237, "loss/crossentropy": 2.230830729007721, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.23145168870687485, "loss/reg": 0.0, "step": 24570 }, { "epoch": 0.16171052631578947, "grad_norm": 2.84375, "grad_norm_var": 0.11370035807291666, "learning_rate": 0.0001, "loss": 3.1152, "loss/crossentropy": 2.1511558890342712, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.24633188247680665, "loss/reg": 0.0, "step": 24580 }, { "epoch": 0.16177631578947368, "grad_norm": 2.15625, "grad_norm_var": 0.3695818583170573, "learning_rate": 0.0001, "loss": 3.0297, "loss/crossentropy": 2.3748416185379027, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.2599245056509972, "loss/reg": 0.0, "step": 24590 }, { "epoch": 0.1618421052631579, "grad_norm": 1.96875, "grad_norm_var": 0.11649576822916667, "learning_rate": 0.0001, "loss": 3.0334, "loss/crossentropy": 1.958118262887001, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.2173558861017227, "loss/reg": 0.0, "step": 24600 }, { "epoch": 0.1619078947368421, "grad_norm": 2.171875, "grad_norm_var": 0.09065729777018229, "learning_rate": 0.0001, "loss": 3.0806, "loss/crossentropy": 2.3804409861564637, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.28529137223958967, "loss/reg": 0.0, "step": 24610 }, { "epoch": 0.16197368421052633, "grad_norm": 2.40625, "grad_norm_var": 0.18765869140625, "learning_rate": 0.0001, "loss": 3.1415, "loss/crossentropy": 2.4188234567642213, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.2486872687935829, "loss/reg": 0.0, "step": 24620 }, { "epoch": 0.16203947368421054, "grad_norm": 2.46875, "grad_norm_var": 0.1975982666015625, "learning_rate": 0.0001, "loss": 3.0741, "loss/crossentropy": 2.2737186551094055, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.21121124178171158, "loss/reg": 0.0, "step": 24630 }, { "epoch": 0.16210526315789472, "grad_norm": 2.5, "grad_norm_var": 2.4922159830729167, "learning_rate": 0.0001, "loss": 3.0672, "loss/crossentropy": 1.935386747121811, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.21296948455274106, "loss/reg": 0.0, "step": 24640 }, { "epoch": 0.16217105263157894, "grad_norm": 2.25, "grad_norm_var": 0.053587849934895834, "learning_rate": 0.0001, "loss": 3.1024, "loss/crossentropy": 2.456861126422882, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.29705640524625776, "loss/reg": 0.0, "step": 24650 }, { "epoch": 0.16223684210526315, "grad_norm": 2.140625, "grad_norm_var": 0.2731679280598958, "learning_rate": 0.0001, "loss": 3.0683, "loss/crossentropy": 2.3239927887916565, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.24205588102340697, "loss/reg": 0.0, "step": 24660 }, { "epoch": 0.16230263157894737, "grad_norm": 2.15625, "grad_norm_var": 0.22311197916666667, "learning_rate": 0.0001, "loss": 3.0314, "loss/crossentropy": 2.3312941789627075, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.28574763536453246, "loss/reg": 0.0, "step": 24670 }, { "epoch": 0.16236842105263158, "grad_norm": 2.859375, "grad_norm_var": 0.24403889973958334, "learning_rate": 0.0001, "loss": 3.1553, "loss/crossentropy": 2.493465173244476, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.23360646292567253, "loss/reg": 0.0, "step": 24680 }, { "epoch": 0.1624342105263158, "grad_norm": 2.84375, "grad_norm_var": 0.16258138020833332, "learning_rate": 0.0001, "loss": 3.088, "loss/crossentropy": 2.3201822876930236, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.24165551662445067, "loss/reg": 0.0, "step": 24690 }, { "epoch": 0.1625, "grad_norm": 2.84375, "grad_norm_var": 0.18403218587239584, "learning_rate": 0.0001, "loss": 3.1727, "loss/crossentropy": 2.2391496330499647, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.25802505761384964, "loss/reg": 0.0, "step": 24700 }, { "epoch": 0.16256578947368422, "grad_norm": 2.40625, "grad_norm_var": 0.047663370768229164, "learning_rate": 0.0001, "loss": 3.0701, "loss/crossentropy": 2.1174232959747314, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.25788239687681197, "loss/reg": 0.0, "step": 24710 }, { "epoch": 0.16263157894736843, "grad_norm": 2.390625, "grad_norm_var": 0.03251520792643229, "learning_rate": 0.0001, "loss": 3.0028, "loss/crossentropy": 2.303908097743988, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.22646306753158568, "loss/reg": 0.0, "step": 24720 }, { "epoch": 0.16269736842105262, "grad_norm": 2.328125, "grad_norm_var": 0.09428609212239583, "learning_rate": 0.0001, "loss": 3.0573, "loss/crossentropy": 2.4429296493530273, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.24618532359600068, "loss/reg": 0.0, "step": 24730 }, { "epoch": 0.16276315789473683, "grad_norm": 2.28125, "grad_norm_var": 0.10568745930989583, "learning_rate": 0.0001, "loss": 3.0411, "loss/crossentropy": 2.245293366909027, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.3406794846057892, "loss/reg": 0.0, "step": 24740 }, { "epoch": 0.16282894736842105, "grad_norm": 2.453125, "grad_norm_var": 0.24233169555664064, "learning_rate": 0.0001, "loss": 3.0254, "loss/crossentropy": 2.1815924525260924, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.20982651114463807, "loss/reg": 0.0, "step": 24750 }, { "epoch": 0.16289473684210526, "grad_norm": 2.140625, "grad_norm_var": 0.051889801025390626, "learning_rate": 0.0001, "loss": 3.0337, "loss/crossentropy": 2.530372714996338, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.22411757558584214, "loss/reg": 0.0, "step": 24760 }, { "epoch": 0.16296052631578947, "grad_norm": 2.34375, "grad_norm_var": 0.025414021809895833, "learning_rate": 0.0001, "loss": 2.9668, "loss/crossentropy": 2.271071231365204, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.23753461390733718, "loss/reg": 0.0, "step": 24770 }, { "epoch": 0.16302631578947369, "grad_norm": 2.765625, "grad_norm_var": 0.0559722900390625, "learning_rate": 0.0001, "loss": 3.0646, "loss/crossentropy": 2.2592864990234376, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.21606265306472777, "loss/reg": 0.0, "step": 24780 }, { "epoch": 0.1630921052631579, "grad_norm": 1.9921875, "grad_norm_var": 0.05602595011393229, "learning_rate": 0.0001, "loss": 3.059, "loss/crossentropy": 2.021050810813904, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.2471683457493782, "loss/reg": 0.0, "step": 24790 }, { "epoch": 0.1631578947368421, "grad_norm": 2.34375, "grad_norm_var": 0.25566991170247394, "learning_rate": 0.0001, "loss": 3.0148, "loss/crossentropy": 2.4111449480056764, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.23949681222438812, "loss/reg": 0.0, "step": 24800 }, { "epoch": 0.16322368421052633, "grad_norm": 2.421875, "grad_norm_var": 0.21315816243489583, "learning_rate": 0.0001, "loss": 3.0918, "loss/crossentropy": 2.5244083642959594, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2454654648900032, "loss/reg": 0.0, "step": 24810 }, { "epoch": 0.16328947368421054, "grad_norm": 2.21875, "grad_norm_var": 0.09656575520833334, "learning_rate": 0.0001, "loss": 2.9972, "loss/crossentropy": 2.4637012243270875, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.2438522458076477, "loss/reg": 0.0, "step": 24820 }, { "epoch": 0.16335526315789473, "grad_norm": 2.1875, "grad_norm_var": 0.03803609212239583, "learning_rate": 0.0001, "loss": 3.0278, "loss/crossentropy": 2.2508904099464417, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.24632504507899283, "loss/reg": 0.0, "step": 24830 }, { "epoch": 0.16342105263157894, "grad_norm": 2.96875, "grad_norm_var": 0.0544921875, "learning_rate": 0.0001, "loss": 3.0273, "loss/crossentropy": 2.2554102897644044, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.2244628369808197, "loss/reg": 0.0, "step": 24840 }, { "epoch": 0.16348684210526315, "grad_norm": 2.15625, "grad_norm_var": 0.0915679931640625, "learning_rate": 0.0001, "loss": 2.9974, "loss/crossentropy": 2.4115317344665526, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.22097424566745758, "loss/reg": 0.0, "step": 24850 }, { "epoch": 0.16355263157894737, "grad_norm": 2.328125, "grad_norm_var": 0.0794342041015625, "learning_rate": 0.0001, "loss": 3.0875, "loss/crossentropy": 2.2210743844509127, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.2252424478530884, "loss/reg": 0.0, "step": 24860 }, { "epoch": 0.16361842105263158, "grad_norm": 2.984375, "grad_norm_var": 3.853392423706864e+17, "learning_rate": 0.0001, "loss": 3.1398, "loss/crossentropy": 2.5947876930236817, "loss/hidden": 3.771875, "loss/incoh": 0.0, "loss/logits": 0.33976440876722336, "loss/reg": 0.0, "step": 24870 }, { "epoch": 0.1636842105263158, "grad_norm": 2.171875, "grad_norm_var": 3.853392423689082e+17, "learning_rate": 0.0001, "loss": 3.0576, "loss/crossentropy": 2.177506458759308, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.254705648124218, "loss/reg": 0.0, "step": 24880 }, { "epoch": 0.16375, "grad_norm": 2.4375, "grad_norm_var": 0.08782730102539063, "learning_rate": 0.0001, "loss": 3.0306, "loss/crossentropy": 2.4746748208999634, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.2408411219716072, "loss/reg": 0.0, "step": 24890 }, { "epoch": 0.16381578947368422, "grad_norm": 2.1875, "grad_norm_var": 0.07857666015625, "learning_rate": 0.0001, "loss": 3.0083, "loss/crossentropy": 2.273878073692322, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.2608170732855797, "loss/reg": 0.0, "step": 24900 }, { "epoch": 0.16388157894736843, "grad_norm": 2.5, "grad_norm_var": 0.17737528483072917, "learning_rate": 0.0001, "loss": 3.1428, "loss/crossentropy": 2.240007960796356, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.22373474836349488, "loss/reg": 0.0, "step": 24910 }, { "epoch": 0.16394736842105262, "grad_norm": 2.25, "grad_norm_var": 0.10565999348958334, "learning_rate": 0.0001, "loss": 3.0246, "loss/crossentropy": 2.229018306732178, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.29760813266038894, "loss/reg": 0.0, "step": 24920 }, { "epoch": 0.16401315789473683, "grad_norm": 2.65625, "grad_norm_var": 0.3898834228515625, "learning_rate": 0.0001, "loss": 3.1527, "loss/crossentropy": 2.2012178540229796, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.28048097975552083, "loss/reg": 0.0, "step": 24930 }, { "epoch": 0.16407894736842105, "grad_norm": 2.765625, "grad_norm_var": 0.1896636962890625, "learning_rate": 0.0001, "loss": 3.1143, "loss/crossentropy": 2.396012032032013, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.31100255995988846, "loss/reg": 0.0, "step": 24940 }, { "epoch": 0.16414473684210526, "grad_norm": 2.109375, "grad_norm_var": 0.11585464477539062, "learning_rate": 0.0001, "loss": 2.9923, "loss/crossentropy": 2.35096116065979, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.1970454752445221, "loss/reg": 0.0, "step": 24950 }, { "epoch": 0.16421052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.048278554280598955, "learning_rate": 0.0001, "loss": 3.0105, "loss/crossentropy": 2.1574088990688325, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.20654048770666122, "loss/reg": 0.0, "step": 24960 }, { "epoch": 0.1642763157894737, "grad_norm": 2.171875, "grad_norm_var": 0.41565348307291666, "learning_rate": 0.0001, "loss": 3.1106, "loss/crossentropy": 2.087399756908417, "loss/hidden": 3.11875, "loss/incoh": 0.0, "loss/logits": 0.2551331013441086, "loss/reg": 0.0, "step": 24970 }, { "epoch": 0.1643421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.4007232666015625, "learning_rate": 0.0001, "loss": 3.1262, "loss/crossentropy": 2.104507529735565, "loss/hidden": 3.0828125, "loss/incoh": 0.0, "loss/logits": 0.3040700241923332, "loss/reg": 0.0, "step": 24980 }, { "epoch": 0.16440789473684211, "grad_norm": 2.46875, "grad_norm_var": 0.29108072916666666, "learning_rate": 0.0001, "loss": 3.1264, "loss/crossentropy": 2.393839418888092, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2309044197201729, "loss/reg": 0.0, "step": 24990 }, { "epoch": 0.16447368421052633, "grad_norm": 2.65625, "grad_norm_var": 0.34716695149739585, "learning_rate": 0.0001, "loss": 3.0873, "loss/crossentropy": 2.295112156867981, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2622707739472389, "loss/reg": 0.0, "step": 25000 }, { "epoch": 0.1645394736842105, "grad_norm": 2.9375, "grad_norm_var": 2.9363606770833335, "learning_rate": 0.0001, "loss": 3.1494, "loss/crossentropy": 2.0088412761688232, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2086470142006874, "loss/reg": 0.0, "step": 25010 }, { "epoch": 0.16460526315789473, "grad_norm": 2.46875, "grad_norm_var": 3.0145467122395835, "learning_rate": 0.0001, "loss": 3.0267, "loss/crossentropy": 2.230376589298248, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.25718885809183123, "loss/reg": 0.0, "step": 25020 }, { "epoch": 0.16467105263157894, "grad_norm": 2.25, "grad_norm_var": 0.16364644368489584, "learning_rate": 0.0001, "loss": 3.1098, "loss/crossentropy": 2.2357122182846068, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.20328374579548836, "loss/reg": 0.0, "step": 25030 }, { "epoch": 0.16473684210526315, "grad_norm": 2.15625, "grad_norm_var": 0.13963597615559895, "learning_rate": 0.0001, "loss": 3.0687, "loss/crossentropy": 2.3226524710655214, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.23719747960567475, "loss/reg": 0.0, "step": 25040 }, { "epoch": 0.16480263157894737, "grad_norm": 2298478592.0, "grad_norm_var": 3.301877391914183e+17, "learning_rate": 0.0001, "loss": 3.151, "loss/crossentropy": 2.2604412317276, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.2530272454023361, "loss/reg": 0.0, "step": 25050 }, { "epoch": 0.16486842105263158, "grad_norm": 2.203125, "grad_norm_var": 3.301877391448801e+17, "learning_rate": 0.0001, "loss": 3.0688, "loss/crossentropy": 2.374041938781738, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.23906515687704086, "loss/reg": 0.0, "step": 25060 }, { "epoch": 0.1649342105263158, "grad_norm": 2.078125, "grad_norm_var": 0.08870035807291667, "learning_rate": 0.0001, "loss": 3.0466, "loss/crossentropy": 2.2033262491226195, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.2643999807536602, "loss/reg": 0.0, "step": 25070 }, { "epoch": 0.165, "grad_norm": 2.328125, "grad_norm_var": 0.22410481770833332, "learning_rate": 0.0001, "loss": 3.0549, "loss/crossentropy": 2.268805372714996, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.22531848698854445, "loss/reg": 0.0, "step": 25080 }, { "epoch": 0.16506578947368422, "grad_norm": 2.171875, "grad_norm_var": 0.17864481608072916, "learning_rate": 0.0001, "loss": 3.05, "loss/crossentropy": 2.32329341173172, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.23195795714855194, "loss/reg": 0.0, "step": 25090 }, { "epoch": 0.16513157894736843, "grad_norm": 2.078125, "grad_norm_var": 0.07084859212239583, "learning_rate": 0.0001, "loss": 3.0737, "loss/crossentropy": 2.3633965373039247, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.24620314687490463, "loss/reg": 0.0, "step": 25100 }, { "epoch": 0.16519736842105262, "grad_norm": 2.296875, "grad_norm_var": 0.37231343587239585, "learning_rate": 0.0001, "loss": 3.0891, "loss/crossentropy": 2.4329662203788756, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23993222489953042, "loss/reg": 0.0, "step": 25110 }, { "epoch": 0.16526315789473683, "grad_norm": 2.359375, "grad_norm_var": 0.48580322265625, "learning_rate": 0.0001, "loss": 3.0705, "loss/crossentropy": 2.418682646751404, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.22389767169952393, "loss/reg": 0.0, "step": 25120 }, { "epoch": 0.16532894736842105, "grad_norm": 2.140625, "grad_norm_var": 0.08493550618489583, "learning_rate": 0.0001, "loss": 3.0513, "loss/crossentropy": 2.394873285293579, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.2614659383893013, "loss/reg": 0.0, "step": 25130 }, { "epoch": 0.16539473684210526, "grad_norm": 2.875, "grad_norm_var": 0.19609273274739583, "learning_rate": 0.0001, "loss": 3.0834, "loss/crossentropy": 2.1957722663879395, "loss/hidden": 2.9609375, "loss/incoh": 0.0, "loss/logits": 0.2501178741455078, "loss/reg": 0.0, "step": 25140 }, { "epoch": 0.16546052631578947, "grad_norm": 2.140625, "grad_norm_var": 0.17849019368489583, "learning_rate": 0.0001, "loss": 3.024, "loss/crossentropy": 2.389811897277832, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.2392006903886795, "loss/reg": 0.0, "step": 25150 }, { "epoch": 0.1655263157894737, "grad_norm": 2.53125, "grad_norm_var": 0.08551610310872396, "learning_rate": 0.0001, "loss": 3.0152, "loss/crossentropy": 2.328965938091278, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.2474894031882286, "loss/reg": 0.0, "step": 25160 }, { "epoch": 0.1655921052631579, "grad_norm": 2.296875, "grad_norm_var": 0.26106363932291665, "learning_rate": 0.0001, "loss": 3.092, "loss/crossentropy": 2.5120302557945253, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.25525424629449844, "loss/reg": 0.0, "step": 25170 }, { "epoch": 0.16565789473684212, "grad_norm": 2.296875, "grad_norm_var": 0.4998372395833333, "learning_rate": 0.0001, "loss": 3.1112, "loss/crossentropy": 2.37037136554718, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.22218506336212157, "loss/reg": 0.0, "step": 25180 }, { "epoch": 0.16572368421052633, "grad_norm": 4.6875, "grad_norm_var": 0.4266916910807292, "learning_rate": 0.0001, "loss": 3.1019, "loss/crossentropy": 2.3847400188446044, "loss/hidden": 3.021875, "loss/incoh": 0.0, "loss/logits": 0.2786879613995552, "loss/reg": 0.0, "step": 25190 }, { "epoch": 0.16578947368421051, "grad_norm": 2.53125, "grad_norm_var": 0.4002766927083333, "learning_rate": 0.0001, "loss": 3.0387, "loss/crossentropy": 2.2348836183547975, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.24782560914754867, "loss/reg": 0.0, "step": 25200 }, { "epoch": 0.16585526315789473, "grad_norm": 2.4375, "grad_norm_var": 0.21065165201822916, "learning_rate": 0.0001, "loss": 3.1195, "loss/crossentropy": 2.257748210430145, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.21636683791875838, "loss/reg": 0.0, "step": 25210 }, { "epoch": 0.16592105263157894, "grad_norm": 2.515625, "grad_norm_var": 0.05474853515625, "learning_rate": 0.0001, "loss": 3.099, "loss/crossentropy": 2.4620559096336363, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.21316515058279037, "loss/reg": 0.0, "step": 25220 }, { "epoch": 0.16598684210526315, "grad_norm": 2.34375, "grad_norm_var": 0.03912353515625, "learning_rate": 0.0001, "loss": 3.0173, "loss/crossentropy": 2.3012343645095825, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2387869328260422, "loss/reg": 0.0, "step": 25230 }, { "epoch": 0.16605263157894737, "grad_norm": 2.046875, "grad_norm_var": 0.06116434733072917, "learning_rate": 0.0001, "loss": 3.0454, "loss/crossentropy": 2.20231409072876, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.22614135295152665, "loss/reg": 0.0, "step": 25240 }, { "epoch": 0.16611842105263158, "grad_norm": 2.046875, "grad_norm_var": 0.05122782389322917, "learning_rate": 0.0001, "loss": 3.0264, "loss/crossentropy": 2.2305002450942992, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.22803680896759032, "loss/reg": 0.0, "step": 25250 }, { "epoch": 0.1661842105263158, "grad_norm": 2.0, "grad_norm_var": 0.05510660807291667, "learning_rate": 0.0001, "loss": 3.0045, "loss/crossentropy": 2.209262716770172, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.2319029837846756, "loss/reg": 0.0, "step": 25260 }, { "epoch": 0.16625, "grad_norm": 2.671875, "grad_norm_var": 0.0671051025390625, "learning_rate": 0.0001, "loss": 3.0665, "loss/crossentropy": 2.3621141076087953, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.24575576186180115, "loss/reg": 0.0, "step": 25270 }, { "epoch": 0.16631578947368422, "grad_norm": 2.359375, "grad_norm_var": 0.1259674072265625, "learning_rate": 0.0001, "loss": 3.0746, "loss/crossentropy": 2.297244334220886, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.21904474049806594, "loss/reg": 0.0, "step": 25280 }, { "epoch": 0.1663815789473684, "grad_norm": 2.4375, "grad_norm_var": 0.12108968098958334, "learning_rate": 0.0001, "loss": 3.0591, "loss/crossentropy": 2.167328989505768, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.254831300675869, "loss/reg": 0.0, "step": 25290 }, { "epoch": 0.16644736842105262, "grad_norm": 2.15625, "grad_norm_var": 0.030272420247395834, "learning_rate": 0.0001, "loss": 3.0706, "loss/crossentropy": 2.5636652946472167, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.26798896938562394, "loss/reg": 0.0, "step": 25300 }, { "epoch": 0.16651315789473684, "grad_norm": 2.46875, "grad_norm_var": 0.03521728515625, "learning_rate": 0.0001, "loss": 3.0397, "loss/crossentropy": 2.241175901889801, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.23810605853796005, "loss/reg": 0.0, "step": 25310 }, { "epoch": 0.16657894736842105, "grad_norm": 2.453125, "grad_norm_var": 0.1312652587890625, "learning_rate": 0.0001, "loss": 3.0874, "loss/crossentropy": 2.461783027648926, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.22298123091459274, "loss/reg": 0.0, "step": 25320 }, { "epoch": 0.16664473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.01871337890625, "learning_rate": 0.0001, "loss": 3.0339, "loss/crossentropy": 2.6030556201934814, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.2566943824291229, "loss/reg": 0.0, "step": 25330 }, { "epoch": 0.16671052631578948, "grad_norm": 5.375, "grad_norm_var": 0.6346506754557292, "learning_rate": 0.0001, "loss": 3.0403, "loss/crossentropy": 2.2260714411735534, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.21501264423131944, "loss/reg": 0.0, "step": 25340 }, { "epoch": 0.1667763157894737, "grad_norm": 2.28125, "grad_norm_var": 0.6130198160807292, "learning_rate": 0.0001, "loss": 3.1302, "loss/crossentropy": 2.2205970883369446, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2506598949432373, "loss/reg": 0.0, "step": 25350 }, { "epoch": 0.1668421052631579, "grad_norm": 2.34375, "grad_norm_var": 0.07870686848958333, "learning_rate": 0.0001, "loss": 3.0579, "loss/crossentropy": 2.3078066170215608, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.2546592831611633, "loss/reg": 0.0, "step": 25360 }, { "epoch": 0.16690789473684212, "grad_norm": 2.484375, "grad_norm_var": 0.7152821858723958, "learning_rate": 0.0001, "loss": 3.084, "loss/crossentropy": 2.103282463550568, "loss/hidden": 3.0796875, "loss/incoh": 0.0, "loss/logits": 0.2719234719872475, "loss/reg": 0.0, "step": 25370 }, { "epoch": 0.1669736842105263, "grad_norm": 1.8984375, "grad_norm_var": 1.6605242411295573, "learning_rate": 0.0001, "loss": 3.0337, "loss/crossentropy": 2.533907437324524, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.2172473669052124, "loss/reg": 0.0, "step": 25380 }, { "epoch": 0.16703947368421052, "grad_norm": 2.421875, "grad_norm_var": 0.841961415608724, "learning_rate": 0.0001, "loss": 3.1811, "loss/crossentropy": 2.349000704288483, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.2332908734679222, "loss/reg": 0.0, "step": 25390 }, { "epoch": 0.16710526315789473, "grad_norm": 2.265625, "grad_norm_var": 0.75146484375, "learning_rate": 0.0001, "loss": 3.0519, "loss/crossentropy": 2.3112148702144624, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.2460096523165703, "loss/reg": 0.0, "step": 25400 }, { "epoch": 0.16717105263157894, "grad_norm": 2.125, "grad_norm_var": 0.0905426025390625, "learning_rate": 0.0001, "loss": 3.0931, "loss/crossentropy": 2.208834648132324, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.21595955416560172, "loss/reg": 0.0, "step": 25410 }, { "epoch": 0.16723684210526316, "grad_norm": 2.21875, "grad_norm_var": 0.07610575358072917, "learning_rate": 0.0001, "loss": 3.0541, "loss/crossentropy": 2.2315654158592224, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23381351083517074, "loss/reg": 0.0, "step": 25420 }, { "epoch": 0.16730263157894737, "grad_norm": 2.484375, "grad_norm_var": 0.043302154541015624, "learning_rate": 0.0001, "loss": 3.0197, "loss/crossentropy": 2.3990158438682556, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.24045146703720094, "loss/reg": 0.0, "step": 25430 }, { "epoch": 0.16736842105263158, "grad_norm": 2.265625, "grad_norm_var": 0.0761138916015625, "learning_rate": 0.0001, "loss": 3.0792, "loss/crossentropy": 2.1985233664512633, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.2625061020255089, "loss/reg": 0.0, "step": 25440 }, { "epoch": 0.1674342105263158, "grad_norm": 2.328125, "grad_norm_var": 0.08713150024414062, "learning_rate": 0.0001, "loss": 3.0474, "loss/crossentropy": 2.2765584170818327, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2559843085706234, "loss/reg": 0.0, "step": 25450 }, { "epoch": 0.1675, "grad_norm": 2.25, "grad_norm_var": 0.06063003540039062, "learning_rate": 0.0001, "loss": 3.0819, "loss/crossentropy": 2.241065299510956, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.22642052918672562, "loss/reg": 0.0, "step": 25460 }, { "epoch": 0.16756578947368422, "grad_norm": 2.15625, "grad_norm_var": 0.5740468343098958, "learning_rate": 0.0001, "loss": 3.1251, "loss/crossentropy": 2.388518822193146, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.283678263425827, "loss/reg": 0.0, "step": 25470 }, { "epoch": 0.1676315789473684, "grad_norm": 2.390625, "grad_norm_var": 0.13244400024414063, "learning_rate": 0.0001, "loss": 3.09, "loss/crossentropy": 2.2575212955474853, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.21582936197519303, "loss/reg": 0.0, "step": 25480 }, { "epoch": 0.16769736842105262, "grad_norm": 2.46875, "grad_norm_var": 0.14921773274739583, "learning_rate": 0.0001, "loss": 3.0472, "loss/crossentropy": 2.378064513206482, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.22392996102571489, "loss/reg": 0.0, "step": 25490 }, { "epoch": 0.16776315789473684, "grad_norm": 2.109375, "grad_norm_var": 0.046083323160807294, "learning_rate": 0.0001, "loss": 3.1247, "loss/crossentropy": 2.354731285572052, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24077038019895552, "loss/reg": 0.0, "step": 25500 }, { "epoch": 0.16782894736842105, "grad_norm": 2.0625, "grad_norm_var": 0.024348958333333334, "learning_rate": 0.0001, "loss": 3.0441, "loss/crossentropy": 2.645008158683777, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.2468577191233635, "loss/reg": 0.0, "step": 25510 }, { "epoch": 0.16789473684210526, "grad_norm": 2.1875, "grad_norm_var": 0.10139567057291667, "learning_rate": 0.0001, "loss": 3.158, "loss/crossentropy": 2.4286780834197996, "loss/hidden": 3.015625, "loss/incoh": 0.0, "loss/logits": 0.33578050583601, "loss/reg": 0.0, "step": 25520 }, { "epoch": 0.16796052631578948, "grad_norm": 2.546875, "grad_norm_var": 0.03992513020833333, "learning_rate": 0.0001, "loss": 3.0624, "loss/crossentropy": 2.4203076124191285, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.27344991117715833, "loss/reg": 0.0, "step": 25530 }, { "epoch": 0.1680263157894737, "grad_norm": 2.546875, "grad_norm_var": 0.03359273274739583, "learning_rate": 0.0001, "loss": 3.111, "loss/crossentropy": 2.1932618856430053, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.27270109951496124, "loss/reg": 0.0, "step": 25540 }, { "epoch": 0.1680921052631579, "grad_norm": 2.359375, "grad_norm_var": 0.03035252888997396, "learning_rate": 0.0001, "loss": 3.0572, "loss/crossentropy": 2.1443075835704803, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.21711303144693375, "loss/reg": 0.0, "step": 25550 }, { "epoch": 0.16815789473684212, "grad_norm": 2.171875, "grad_norm_var": 0.06535822550455729, "learning_rate": 0.0001, "loss": 3.0248, "loss/crossentropy": 2.2873115301132203, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.26505507379770277, "loss/reg": 0.0, "step": 25560 }, { "epoch": 0.1682236842105263, "grad_norm": 2.171875, "grad_norm_var": 0.21750895182291666, "learning_rate": 0.0001, "loss": 3.245, "loss/crossentropy": 2.464116406440735, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.2560413718223572, "loss/reg": 0.0, "step": 25570 }, { "epoch": 0.16828947368421052, "grad_norm": 4.0625, "grad_norm_var": 0.2865193684895833, "learning_rate": 0.0001, "loss": 3.1803, "loss/crossentropy": 2.299769949913025, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2611501321196556, "loss/reg": 0.0, "step": 25580 }, { "epoch": 0.16835526315789473, "grad_norm": 2.21875, "grad_norm_var": 0.24952799479166668, "learning_rate": 0.0001, "loss": 3.1427, "loss/crossentropy": 2.0992822468280794, "loss/hidden": 3.109375, "loss/incoh": 0.0, "loss/logits": 0.2576856903731823, "loss/reg": 0.0, "step": 25590 }, { "epoch": 0.16842105263157894, "grad_norm": 2.328125, "grad_norm_var": 0.4341705322265625, "learning_rate": 0.0001, "loss": 3.101, "loss/crossentropy": 2.0800092458724975, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.20497069805860518, "loss/reg": 0.0, "step": 25600 }, { "epoch": 0.16848684210526316, "grad_norm": 2.625, "grad_norm_var": 0.4129231770833333, "learning_rate": 0.0001, "loss": 3.0582, "loss/crossentropy": 2.3167444467544556, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.2798536166548729, "loss/reg": 0.0, "step": 25610 }, { "epoch": 0.16855263157894737, "grad_norm": 3.59375, "grad_norm_var": 0.13404541015625, "learning_rate": 0.0001, "loss": 3.0518, "loss/crossentropy": 2.456227493286133, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.27998181581497195, "loss/reg": 0.0, "step": 25620 }, { "epoch": 0.16861842105263158, "grad_norm": 2.03125, "grad_norm_var": 0.15030899047851562, "learning_rate": 0.0001, "loss": 3.043, "loss/crossentropy": 2.208226388692856, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.24489136189222335, "loss/reg": 0.0, "step": 25630 }, { "epoch": 0.1686842105263158, "grad_norm": 2.171875, "grad_norm_var": 1.020721181233724, "learning_rate": 0.0001, "loss": 3.1397, "loss/crossentropy": 2.2634316802024843, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.27392966747283937, "loss/reg": 0.0, "step": 25640 }, { "epoch": 0.16875, "grad_norm": 2.40625, "grad_norm_var": 0.980279286702474, "learning_rate": 0.0001, "loss": 3.0574, "loss/crossentropy": 2.2914888978004457, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.23568858355283737, "loss/reg": 0.0, "step": 25650 }, { "epoch": 0.1688157894736842, "grad_norm": 2.65625, "grad_norm_var": 0.07039769490559895, "learning_rate": 0.0001, "loss": 3.139, "loss/crossentropy": 2.1703743815422056, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.25462436228990554, "loss/reg": 0.0, "step": 25660 }, { "epoch": 0.1688815789473684, "grad_norm": 2.546875, "grad_norm_var": 0.22234598795572916, "learning_rate": 0.0001, "loss": 3.0749, "loss/crossentropy": 2.115765154361725, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.2619611322879791, "loss/reg": 0.0, "step": 25670 }, { "epoch": 0.16894736842105262, "grad_norm": 2.3125, "grad_norm_var": 0.21928609212239583, "learning_rate": 0.0001, "loss": 3.1309, "loss/crossentropy": 2.355665123462677, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.25024047791957854, "loss/reg": 0.0, "step": 25680 }, { "epoch": 0.16901315789473684, "grad_norm": 2.40625, "grad_norm_var": 0.6276204427083333, "learning_rate": 0.0001, "loss": 3.2325, "loss/crossentropy": 2.3543809175491335, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.28042998909950256, "loss/reg": 0.0, "step": 25690 }, { "epoch": 0.16907894736842105, "grad_norm": 2.15625, "grad_norm_var": 0.55308837890625, "learning_rate": 0.0001, "loss": 3.1367, "loss/crossentropy": 2.0251551032066346, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.2671583190560341, "loss/reg": 0.0, "step": 25700 }, { "epoch": 0.16914473684210526, "grad_norm": 2.5625, "grad_norm_var": 0.8514638264973958, "learning_rate": 0.0001, "loss": 3.0816, "loss/crossentropy": 2.3947930097579957, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.23190638422966003, "loss/reg": 0.0, "step": 25710 }, { "epoch": 0.16921052631578948, "grad_norm": 2.21875, "grad_norm_var": 0.8416341145833334, "learning_rate": 0.0001, "loss": 3.0317, "loss/crossentropy": 2.3030895352363587, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.25654992610216143, "loss/reg": 0.0, "step": 25720 }, { "epoch": 0.1692763157894737, "grad_norm": 2.203125, "grad_norm_var": 0.1338043212890625, "learning_rate": 0.0001, "loss": 3.0553, "loss/crossentropy": 2.4580634593963624, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.29121011197566987, "loss/reg": 0.0, "step": 25730 }, { "epoch": 0.1693421052631579, "grad_norm": 2.578125, "grad_norm_var": 0.12617162068684895, "learning_rate": 0.0001, "loss": 3.0517, "loss/crossentropy": 2.2136544942855836, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.2460548087954521, "loss/reg": 0.0, "step": 25740 }, { "epoch": 0.16940789473684212, "grad_norm": 2.34375, "grad_norm_var": 0.018700154622395833, "learning_rate": 0.0001, "loss": 3.0731, "loss/crossentropy": 2.2090337753295897, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.26342562288045884, "loss/reg": 0.0, "step": 25750 }, { "epoch": 0.1694736842105263, "grad_norm": 2.703125, "grad_norm_var": 0.31311442057291666, "learning_rate": 0.0001, "loss": 3.1412, "loss/crossentropy": 2.2381184220314028, "loss/hidden": 3.1265625, "loss/incoh": 0.0, "loss/logits": 0.3542740896344185, "loss/reg": 0.0, "step": 25760 }, { "epoch": 0.16953947368421052, "grad_norm": 3.9375, "grad_norm_var": 0.39954020182291666, "learning_rate": 0.0001, "loss": 3.1884, "loss/crossentropy": 2.115560531616211, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.2626333341002464, "loss/reg": 0.0, "step": 25770 }, { "epoch": 0.16960526315789473, "grad_norm": 2.171875, "grad_norm_var": 0.21519775390625, "learning_rate": 0.0001, "loss": 3.0564, "loss/crossentropy": 1.9519969999790192, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.22164231091737746, "loss/reg": 0.0, "step": 25780 }, { "epoch": 0.16967105263157894, "grad_norm": 2.5625, "grad_norm_var": 0.08953450520833334, "learning_rate": 0.0001, "loss": 3.1503, "loss/crossentropy": 2.2421632409095764, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.23813997954130173, "loss/reg": 0.0, "step": 25790 }, { "epoch": 0.16973684210526316, "grad_norm": 2.015625, "grad_norm_var": 0.1667144775390625, "learning_rate": 0.0001, "loss": 3.0487, "loss/crossentropy": 1.9510473489761353, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.1950627200305462, "loss/reg": 0.0, "step": 25800 }, { "epoch": 0.16980263157894737, "grad_norm": 2.0625, "grad_norm_var": 0.15810139973958334, "learning_rate": 0.0001, "loss": 3.0855, "loss/crossentropy": 2.2411985039710998, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.25535731613636015, "loss/reg": 0.0, "step": 25810 }, { "epoch": 0.16986842105263159, "grad_norm": 2.03125, "grad_norm_var": 1.0436358133951822, "learning_rate": 0.0001, "loss": 3.115, "loss/crossentropy": 2.045396554470062, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.22041565626859666, "loss/reg": 0.0, "step": 25820 }, { "epoch": 0.1699342105263158, "grad_norm": 2.3125, "grad_norm_var": 0.18515599568684896, "learning_rate": 0.0001, "loss": 3.0967, "loss/crossentropy": 2.4469074010849, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.2558215782046318, "loss/reg": 0.0, "step": 25830 }, { "epoch": 0.17, "grad_norm": 2.109375, "grad_norm_var": 0.38408203125, "learning_rate": 0.0001, "loss": 3.1042, "loss/crossentropy": 2.038024789094925, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.28011882603168486, "loss/reg": 0.0, "step": 25840 }, { "epoch": 0.1700657894736842, "grad_norm": 2.21875, "grad_norm_var": 0.12269694010416667, "learning_rate": 0.0001, "loss": 3.0198, "loss/crossentropy": 2.246475076675415, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.21217122972011565, "loss/reg": 0.0, "step": 25850 }, { "epoch": 0.1701315789473684, "grad_norm": 2.140625, "grad_norm_var": 0.5065388997395833, "learning_rate": 0.0001, "loss": 3.0953, "loss/crossentropy": 2.2885293424129487, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.23523461371660231, "loss/reg": 0.0, "step": 25860 }, { "epoch": 0.17019736842105262, "grad_norm": 4.71875, "grad_norm_var": 0.8165323893229167, "learning_rate": 0.0001, "loss": 3.0392, "loss/crossentropy": 2.1574928402900695, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.25582125633955, "loss/reg": 0.0, "step": 25870 }, { "epoch": 0.17026315789473684, "grad_norm": 3.046875, "grad_norm_var": 0.57008056640625, "learning_rate": 0.0001, "loss": 3.1252, "loss/crossentropy": 2.563046908378601, "loss/hidden": 3.2078125, "loss/incoh": 0.0, "loss/logits": 0.2562621384859085, "loss/reg": 0.0, "step": 25880 }, { "epoch": 0.17032894736842105, "grad_norm": 2.578125, "grad_norm_var": 0.2592437744140625, "learning_rate": 0.0001, "loss": 3.054, "loss/crossentropy": 2.4634771227836607, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.25339619666337965, "loss/reg": 0.0, "step": 25890 }, { "epoch": 0.17039473684210527, "grad_norm": 2.234375, "grad_norm_var": 0.04925130208333333, "learning_rate": 0.0001, "loss": 3.0529, "loss/crossentropy": 2.5104726552963257, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.24876073151826858, "loss/reg": 0.0, "step": 25900 }, { "epoch": 0.17046052631578948, "grad_norm": 2.75, "grad_norm_var": 0.07171122233072917, "learning_rate": 0.0001, "loss": 3.0629, "loss/crossentropy": 2.2496933460235597, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.2183684565126896, "loss/reg": 0.0, "step": 25910 }, { "epoch": 0.1705263157894737, "grad_norm": 2.5, "grad_norm_var": 0.05900777180989583, "learning_rate": 0.0001, "loss": 3.0543, "loss/crossentropy": 2.3338397264480593, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.26187729388475417, "loss/reg": 0.0, "step": 25920 }, { "epoch": 0.1705921052631579, "grad_norm": 2.234375, "grad_norm_var": 0.018359375, "learning_rate": 0.0001, "loss": 3.0801, "loss/crossentropy": 2.499437928199768, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.25035695284605025, "loss/reg": 0.0, "step": 25930 }, { "epoch": 0.1706578947368421, "grad_norm": 2.234375, "grad_norm_var": 0.014264933268229167, "learning_rate": 0.0001, "loss": 3.0337, "loss/crossentropy": 2.217645859718323, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.21889251619577407, "loss/reg": 0.0, "step": 25940 }, { "epoch": 0.1707236842105263, "grad_norm": 2.140625, "grad_norm_var": 0.04052734375, "learning_rate": 0.0001, "loss": 2.9938, "loss/crossentropy": 2.371463644504547, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.2288275495171547, "loss/reg": 0.0, "step": 25950 }, { "epoch": 0.17078947368421052, "grad_norm": 2.609375, "grad_norm_var": 0.1755859375, "learning_rate": 0.0001, "loss": 3.0916, "loss/crossentropy": 2.0315194338560105, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.23134928867220877, "loss/reg": 0.0, "step": 25960 }, { "epoch": 0.17085526315789473, "grad_norm": 2.140625, "grad_norm_var": 0.18121744791666666, "learning_rate": 0.0001, "loss": 3.0837, "loss/crossentropy": 2.1368157267570496, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2233546957373619, "loss/reg": 0.0, "step": 25970 }, { "epoch": 0.17092105263157895, "grad_norm": 2.640625, "grad_norm_var": 0.027311197916666665, "learning_rate": 0.0001, "loss": 3.0984, "loss/crossentropy": 2.3421939969062806, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.28762088268995284, "loss/reg": 0.0, "step": 25980 }, { "epoch": 0.17098684210526316, "grad_norm": 2.21875, "grad_norm_var": 0.051656087239583336, "learning_rate": 0.0001, "loss": 3.0788, "loss/crossentropy": 2.002690005302429, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.24228747338056564, "loss/reg": 0.0, "step": 25990 }, { "epoch": 0.17105263157894737, "grad_norm": 2.4375, "grad_norm_var": 0.049397786458333336, "learning_rate": 0.0001, "loss": 3.0917, "loss/crossentropy": 2.6486097812652587, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.24952505975961686, "loss/reg": 0.0, "step": 26000 }, { "epoch": 0.1711184210526316, "grad_norm": 2.0, "grad_norm_var": 0.0459869384765625, "learning_rate": 0.0001, "loss": 3.0288, "loss/crossentropy": 2.516502094268799, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.24382732957601547, "loss/reg": 0.0, "step": 26010 }, { "epoch": 0.1711842105263158, "grad_norm": 2.34375, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 3.0415, "loss/crossentropy": 2.171498954296112, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.20375996455550194, "loss/reg": 0.0, "step": 26020 }, { "epoch": 0.17125, "grad_norm": 2.296875, "grad_norm_var": 0.0759600321451823, "learning_rate": 0.0001, "loss": 3.0905, "loss/crossentropy": 2.2978348255157472, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.2875170633196831, "loss/reg": 0.0, "step": 26030 }, { "epoch": 0.1713157894736842, "grad_norm": 3.546875, "grad_norm_var": 0.1603167215983073, "learning_rate": 0.0001, "loss": 3.0384, "loss/crossentropy": 2.462969756126404, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.2628189787268639, "loss/reg": 0.0, "step": 26040 }, { "epoch": 0.1713815789473684, "grad_norm": 2.46875, "grad_norm_var": 0.13243815104166667, "learning_rate": 0.0001, "loss": 3.0513, "loss/crossentropy": 2.537035143375397, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.24593261033296585, "loss/reg": 0.0, "step": 26050 }, { "epoch": 0.17144736842105263, "grad_norm": 2.453125, "grad_norm_var": 0.061102040608723956, "learning_rate": 0.0001, "loss": 3.0445, "loss/crossentropy": 2.2031071186065674, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.24524077475070954, "loss/reg": 0.0, "step": 26060 }, { "epoch": 0.17151315789473684, "grad_norm": 2.296875, "grad_norm_var": 0.009764607747395833, "learning_rate": 0.0001, "loss": 3.0725, "loss/crossentropy": 2.4979568481445313, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.2647134616971016, "loss/reg": 0.0, "step": 26070 }, { "epoch": 0.17157894736842105, "grad_norm": 2.53125, "grad_norm_var": 0.026024373372395833, "learning_rate": 0.0001, "loss": 2.99, "loss/crossentropy": 2.350401425361633, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.24495915323495865, "loss/reg": 0.0, "step": 26080 }, { "epoch": 0.17164473684210527, "grad_norm": 2.3125, "grad_norm_var": 0.028857421875, "learning_rate": 0.0001, "loss": 3.0236, "loss/crossentropy": 2.275194299221039, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.22880630493164061, "loss/reg": 0.0, "step": 26090 }, { "epoch": 0.17171052631578948, "grad_norm": 2.25, "grad_norm_var": 0.03967692057291667, "learning_rate": 0.0001, "loss": 3.0823, "loss/crossentropy": 2.000411808490753, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.2721859060227871, "loss/reg": 0.0, "step": 26100 }, { "epoch": 0.1717763157894737, "grad_norm": 2.46875, "grad_norm_var": 0.06353759765625, "learning_rate": 0.0001, "loss": 3.0778, "loss/crossentropy": 2.305191624164581, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.2284097969532013, "loss/reg": 0.0, "step": 26110 }, { "epoch": 0.1718421052631579, "grad_norm": 2.28125, "grad_norm_var": 0.34947509765625, "learning_rate": 0.0001, "loss": 3.0215, "loss/crossentropy": 2.3149521112442017, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.27900824695825577, "loss/reg": 0.0, "step": 26120 }, { "epoch": 0.1719078947368421, "grad_norm": 2.65625, "grad_norm_var": 0.39952977498372394, "learning_rate": 0.0001, "loss": 3.0197, "loss/crossentropy": 2.421990168094635, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.2581001713871956, "loss/reg": 0.0, "step": 26130 }, { "epoch": 0.1719736842105263, "grad_norm": 2.25, "grad_norm_var": 0.19842529296875, "learning_rate": 0.0001, "loss": 3.115, "loss/crossentropy": 2.473453497886658, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.25612604022026064, "loss/reg": 0.0, "step": 26140 }, { "epoch": 0.17203947368421052, "grad_norm": 2.015625, "grad_norm_var": 0.06412328084309896, "learning_rate": 0.0001, "loss": 3.1205, "loss/crossentropy": 2.395763027667999, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.25399915128946304, "loss/reg": 0.0, "step": 26150 }, { "epoch": 0.17210526315789473, "grad_norm": 3.21875, "grad_norm_var": 3.7263832092285156, "learning_rate": 0.0001, "loss": 3.1443, "loss/crossentropy": 2.3229804635047913, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.25968860387802123, "loss/reg": 0.0, "step": 26160 }, { "epoch": 0.17217105263157895, "grad_norm": 2.3125, "grad_norm_var": 3.5683990478515626, "learning_rate": 0.0001, "loss": 3.0833, "loss/crossentropy": 2.27206437587738, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.26493641585111616, "loss/reg": 0.0, "step": 26170 }, { "epoch": 0.17223684210526316, "grad_norm": 2.78125, "grad_norm_var": 0.140771484375, "learning_rate": 0.0001, "loss": 3.1236, "loss/crossentropy": 2.1949939370155334, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.24421997666358947, "loss/reg": 0.0, "step": 26180 }, { "epoch": 0.17230263157894737, "grad_norm": 2.359375, "grad_norm_var": 0.08112691243489584, "learning_rate": 0.0001, "loss": 3.0485, "loss/crossentropy": 2.292436492443085, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.2527725785970688, "loss/reg": 0.0, "step": 26190 }, { "epoch": 0.1723684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.08532613118489583, "learning_rate": 0.0001, "loss": 3.1316, "loss/crossentropy": 2.497014009952545, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.2447853922843933, "loss/reg": 0.0, "step": 26200 }, { "epoch": 0.1724342105263158, "grad_norm": 2.421875, "grad_norm_var": 0.142578125, "learning_rate": 0.0001, "loss": 3.1014, "loss/crossentropy": 2.22248170375824, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2591711387038231, "loss/reg": 0.0, "step": 26210 }, { "epoch": 0.1725, "grad_norm": 2.109375, "grad_norm_var": 0.12844009399414064, "learning_rate": 0.0001, "loss": 3.0061, "loss/crossentropy": 2.3277372121810913, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2391911566257477, "loss/reg": 0.0, "step": 26220 }, { "epoch": 0.1725657894736842, "grad_norm": 2.46875, "grad_norm_var": 0.03429361979166667, "learning_rate": 0.0001, "loss": 3.0468, "loss/crossentropy": 2.593550610542297, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.26268754005432127, "loss/reg": 0.0, "step": 26230 }, { "epoch": 0.1726315789473684, "grad_norm": 3.59375, "grad_norm_var": 0.13815689086914062, "learning_rate": 0.0001, "loss": 3.0785, "loss/crossentropy": 2.037731957435608, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.19823871441185476, "loss/reg": 0.0, "step": 26240 }, { "epoch": 0.17269736842105263, "grad_norm": 2.203125, "grad_norm_var": 0.1658404032389323, "learning_rate": 0.0001, "loss": 3.0753, "loss/crossentropy": 2.365402173995972, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.22439467608928682, "loss/reg": 0.0, "step": 26250 }, { "epoch": 0.17276315789473684, "grad_norm": 2.296875, "grad_norm_var": 0.42582906087239586, "learning_rate": 0.0001, "loss": 3.0247, "loss/crossentropy": 2.4660609483718874, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.2206026643514633, "loss/reg": 0.0, "step": 26260 }, { "epoch": 0.17282894736842105, "grad_norm": 2.265625, "grad_norm_var": 0.4584307352701823, "learning_rate": 0.0001, "loss": 2.9979, "loss/crossentropy": 2.3537610173225403, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.24690964370965957, "loss/reg": 0.0, "step": 26270 }, { "epoch": 0.17289473684210527, "grad_norm": 2.34375, "grad_norm_var": 0.1235015869140625, "learning_rate": 0.0001, "loss": 3.0554, "loss/crossentropy": 2.4086636185646055, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.22505185902118682, "loss/reg": 0.0, "step": 26280 }, { "epoch": 0.17296052631578948, "grad_norm": 2.296875, "grad_norm_var": 0.26606343587239584, "learning_rate": 0.0001, "loss": 3.0265, "loss/crossentropy": 2.120291304588318, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.20824237614870073, "loss/reg": 0.0, "step": 26290 }, { "epoch": 0.1730263157894737, "grad_norm": 2.296875, "grad_norm_var": 0.18574117024739584, "learning_rate": 0.0001, "loss": 2.9919, "loss/crossentropy": 2.1023782581090926, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.19826814979314805, "loss/reg": 0.0, "step": 26300 }, { "epoch": 0.1730921052631579, "grad_norm": 2.8125, "grad_norm_var": 0.24006754557291668, "learning_rate": 0.0001, "loss": 3.1028, "loss/crossentropy": 2.226356017589569, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.2514319851994514, "loss/reg": 0.0, "step": 26310 }, { "epoch": 0.1731578947368421, "grad_norm": 2.328125, "grad_norm_var": 0.234326171875, "learning_rate": 0.0001, "loss": 3.0524, "loss/crossentropy": 2.2389047503471375, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.23142587393522263, "loss/reg": 0.0, "step": 26320 }, { "epoch": 0.1732236842105263, "grad_norm": 2.203125, "grad_norm_var": 0.07327041625976563, "learning_rate": 0.0001, "loss": 3.054, "loss/crossentropy": 2.3862741947174073, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2310537427663803, "loss/reg": 0.0, "step": 26330 }, { "epoch": 0.17328947368421052, "grad_norm": 2.171875, "grad_norm_var": 0.07641499837239583, "learning_rate": 0.0001, "loss": 3.0653, "loss/crossentropy": 2.348588991165161, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.22305506616830825, "loss/reg": 0.0, "step": 26340 }, { "epoch": 0.17335526315789473, "grad_norm": 2.390625, "grad_norm_var": 0.05279541015625, "learning_rate": 0.0001, "loss": 3.0529, "loss/crossentropy": 2.036811423301697, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.23410281240940095, "loss/reg": 0.0, "step": 26350 }, { "epoch": 0.17342105263157895, "grad_norm": 2.359375, "grad_norm_var": 0.2839101155598958, "learning_rate": 0.0001, "loss": 3.1092, "loss/crossentropy": 2.1091545104980467, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.23594892621040345, "loss/reg": 0.0, "step": 26360 }, { "epoch": 0.17348684210526316, "grad_norm": 2.59375, "grad_norm_var": 0.061848958333333336, "learning_rate": 0.0001, "loss": 3.091, "loss/crossentropy": 2.290253794193268, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.29689010232686996, "loss/reg": 0.0, "step": 26370 }, { "epoch": 0.17355263157894738, "grad_norm": 2.890625, "grad_norm_var": 0.04799702962239583, "learning_rate": 0.0001, "loss": 3.0457, "loss/crossentropy": 2.193889284133911, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.22776648998260499, "loss/reg": 0.0, "step": 26380 }, { "epoch": 0.1736184210526316, "grad_norm": 2.125, "grad_norm_var": 0.08166910807291666, "learning_rate": 0.0001, "loss": 3.0939, "loss/crossentropy": 2.2826230049133303, "loss/hidden": 2.9390625, "loss/incoh": 0.0, "loss/logits": 0.2337636888027191, "loss/reg": 0.0, "step": 26390 }, { "epoch": 0.1736842105263158, "grad_norm": 2.390625, "grad_norm_var": 0.16920572916666668, "learning_rate": 0.0001, "loss": 3.0459, "loss/crossentropy": 2.233789896965027, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.20103665292263032, "loss/reg": 0.0, "step": 26400 }, { "epoch": 0.17375, "grad_norm": 2.125, "grad_norm_var": 0.17185440063476562, "learning_rate": 0.0001, "loss": 3.0538, "loss/crossentropy": 2.295468533039093, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.2207227498292923, "loss/reg": 0.0, "step": 26410 }, { "epoch": 0.1738157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.048618316650390625, "learning_rate": 0.0001, "loss": 3.0468, "loss/crossentropy": 2.405815064907074, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.22815033197402954, "loss/reg": 0.0, "step": 26420 }, { "epoch": 0.17388157894736841, "grad_norm": 2.53125, "grad_norm_var": 0.05583394368489583, "learning_rate": 0.0001, "loss": 3.1146, "loss/crossentropy": 2.2692611694335936, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.24037961512804032, "loss/reg": 0.0, "step": 26430 }, { "epoch": 0.17394736842105263, "grad_norm": 2.6875, "grad_norm_var": 0.1251617431640625, "learning_rate": 0.0001, "loss": 3.1101, "loss/crossentropy": 2.2592686772346497, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.24446354508399964, "loss/reg": 0.0, "step": 26440 }, { "epoch": 0.17401315789473684, "grad_norm": 2.484375, "grad_norm_var": 0.10937093098958334, "learning_rate": 0.0001, "loss": 3.0527, "loss/crossentropy": 2.291548252105713, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.25084199756383896, "loss/reg": 0.0, "step": 26450 }, { "epoch": 0.17407894736842106, "grad_norm": 2.984375, "grad_norm_var": 0.06529947916666666, "learning_rate": 0.0001, "loss": 3.1052, "loss/crossentropy": 2.37350834608078, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.2556142807006836, "loss/reg": 0.0, "step": 26460 }, { "epoch": 0.17414473684210527, "grad_norm": 2.375, "grad_norm_var": 0.07598037719726562, "learning_rate": 0.0001, "loss": 3.0113, "loss/crossentropy": 2.250531017780304, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2357124164700508, "loss/reg": 0.0, "step": 26470 }, { "epoch": 0.17421052631578948, "grad_norm": 2.625, "grad_norm_var": 0.17610677083333334, "learning_rate": 0.0001, "loss": 3.1412, "loss/crossentropy": 2.439283573627472, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2121993750333786, "loss/reg": 0.0, "step": 26480 }, { "epoch": 0.1742763157894737, "grad_norm": 2.71875, "grad_norm_var": 0.16172587076822917, "learning_rate": 0.0001, "loss": 3.0562, "loss/crossentropy": 2.3756911277771, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2490501657128334, "loss/reg": 0.0, "step": 26490 }, { "epoch": 0.17434210526315788, "grad_norm": 2.140625, "grad_norm_var": 0.050211588541666664, "learning_rate": 0.0001, "loss": 3.0538, "loss/crossentropy": 2.5375622749328612, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.24366173297166824, "loss/reg": 0.0, "step": 26500 }, { "epoch": 0.1744078947368421, "grad_norm": 2.328125, "grad_norm_var": 0.5032185872395833, "learning_rate": 0.0001, "loss": 3.0919, "loss/crossentropy": 2.3419446110725404, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.2749262437224388, "loss/reg": 0.0, "step": 26510 }, { "epoch": 0.1744736842105263, "grad_norm": 2.5625, "grad_norm_var": 0.03943583170572917, "learning_rate": 0.0001, "loss": 3.0832, "loss/crossentropy": 2.070459759235382, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2193553477525711, "loss/reg": 0.0, "step": 26520 }, { "epoch": 0.17453947368421052, "grad_norm": 2.421875, "grad_norm_var": 0.11748758951822917, "learning_rate": 0.0001, "loss": 3.0301, "loss/crossentropy": 2.2101051688194273, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2358426868915558, "loss/reg": 0.0, "step": 26530 }, { "epoch": 0.17460526315789474, "grad_norm": 2.5, "grad_norm_var": 0.11904195149739584, "learning_rate": 0.0001, "loss": 2.9898, "loss/crossentropy": 2.301384377479553, "loss/hidden": 2.9671875, "loss/incoh": 0.0, "loss/logits": 0.270483261346817, "loss/reg": 0.0, "step": 26540 }, { "epoch": 0.17467105263157895, "grad_norm": 2.296875, "grad_norm_var": 0.03364969889322917, "learning_rate": 0.0001, "loss": 2.9894, "loss/crossentropy": 2.233541655540466, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.23210279494524003, "loss/reg": 0.0, "step": 26550 }, { "epoch": 0.17473684210526316, "grad_norm": 4.3125, "grad_norm_var": 0.30920817057291666, "learning_rate": 0.0001, "loss": 3.0375, "loss/crossentropy": 2.4949169993400573, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.27407604902982713, "loss/reg": 0.0, "step": 26560 }, { "epoch": 0.17480263157894738, "grad_norm": 2.4375, "grad_norm_var": 0.4443511962890625, "learning_rate": 0.0001, "loss": 3.1064, "loss/crossentropy": 2.180380403995514, "loss/hidden": 3.06875, "loss/incoh": 0.0, "loss/logits": 0.2965380325913429, "loss/reg": 0.0, "step": 26570 }, { "epoch": 0.1748684210526316, "grad_norm": 2.421875, "grad_norm_var": 0.12886962890625, "learning_rate": 0.0001, "loss": 3.0636, "loss/crossentropy": 2.2048864006996154, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.24606209397315978, "loss/reg": 0.0, "step": 26580 }, { "epoch": 0.1749342105263158, "grad_norm": 2.3125, "grad_norm_var": 0.090576171875, "learning_rate": 0.0001, "loss": 3.0745, "loss/crossentropy": 2.0117301523685454, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.211642824113369, "loss/reg": 0.0, "step": 26590 }, { "epoch": 0.175, "grad_norm": 2.15625, "grad_norm_var": 0.9207834879557292, "learning_rate": 0.0001, "loss": 2.9913, "loss/crossentropy": 2.2407095432281494, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.25644198805093765, "loss/reg": 0.0, "step": 26600 }, { "epoch": 0.1750657894736842, "grad_norm": 2.03125, "grad_norm_var": 0.9292063395182292, "learning_rate": 0.0001, "loss": 3.1122, "loss/crossentropy": 2.510072946548462, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.24346872568130493, "loss/reg": 0.0, "step": 26610 }, { "epoch": 0.17513157894736842, "grad_norm": 2.140625, "grad_norm_var": 1.2361806233723958, "learning_rate": 0.0001, "loss": 3.1052, "loss/crossentropy": 2.0434396266937256, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.19786725342273712, "loss/reg": 0.0, "step": 26620 }, { "epoch": 0.17519736842105263, "grad_norm": 2.25, "grad_norm_var": 1.2163075764973958, "learning_rate": 0.0001, "loss": 3.1224, "loss/crossentropy": 2.2104805946350097, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.21630636900663375, "loss/reg": 0.0, "step": 26630 }, { "epoch": 0.17526315789473684, "grad_norm": 2.421875, "grad_norm_var": 0.03358739217122396, "learning_rate": 0.0001, "loss": 3.028, "loss/crossentropy": 2.178933525085449, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.20190188586711882, "loss/reg": 0.0, "step": 26640 }, { "epoch": 0.17532894736842106, "grad_norm": 2.265625, "grad_norm_var": 0.09688212076822916, "learning_rate": 0.0001, "loss": 3.0897, "loss/crossentropy": 2.2301127552986144, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.23655431792140008, "loss/reg": 0.0, "step": 26650 }, { "epoch": 0.17539473684210527, "grad_norm": 2.71875, "grad_norm_var": 0.10475972493489584, "learning_rate": 0.0001, "loss": 3.0935, "loss/crossentropy": 2.2094313383102415, "loss/hidden": 2.9734375, "loss/incoh": 0.0, "loss/logits": 0.27910117208957674, "loss/reg": 0.0, "step": 26660 }, { "epoch": 0.17546052631578948, "grad_norm": 2.28125, "grad_norm_var": 0.050446573893229166, "learning_rate": 0.0001, "loss": 3.0452, "loss/crossentropy": 2.491715204715729, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.24501692056655883, "loss/reg": 0.0, "step": 26670 }, { "epoch": 0.1755263157894737, "grad_norm": 2.296875, "grad_norm_var": 0.42122294108072916, "learning_rate": 0.0001, "loss": 3.1247, "loss/crossentropy": 2.431166636943817, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.24856783598661422, "loss/reg": 0.0, "step": 26680 }, { "epoch": 0.17559210526315788, "grad_norm": 2.28125, "grad_norm_var": 0.0764801025390625, "learning_rate": 0.0001, "loss": 3.1265, "loss/crossentropy": 2.2466578841209413, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.22143050357699395, "loss/reg": 0.0, "step": 26690 }, { "epoch": 0.1756578947368421, "grad_norm": 2.421875, "grad_norm_var": 0.06314697265625, "learning_rate": 0.0001, "loss": 3.0522, "loss/crossentropy": 1.8643316149711608, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.22551749348640443, "loss/reg": 0.0, "step": 26700 }, { "epoch": 0.1757236842105263, "grad_norm": 2.421875, "grad_norm_var": 0.12968648274739583, "learning_rate": 0.0001, "loss": 3.0386, "loss/crossentropy": 1.784457266330719, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.17761861719191074, "loss/reg": 0.0, "step": 26710 }, { "epoch": 0.17578947368421052, "grad_norm": 2.484375, "grad_norm_var": 0.13782450358072917, "learning_rate": 0.0001, "loss": 3.0727, "loss/crossentropy": 2.388060462474823, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.25690646171569825, "loss/reg": 0.0, "step": 26720 }, { "epoch": 0.17585526315789474, "grad_norm": 2.3125, "grad_norm_var": 0.5772450764973959, "learning_rate": 0.0001, "loss": 3.017, "loss/crossentropy": 2.293041491508484, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.2976214215159416, "loss/reg": 0.0, "step": 26730 }, { "epoch": 0.17592105263157895, "grad_norm": 2.203125, "grad_norm_var": 0.7439737955729167, "learning_rate": 0.0001, "loss": 3.1688, "loss/crossentropy": 2.346044087409973, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.24346352070569993, "loss/reg": 0.0, "step": 26740 }, { "epoch": 0.17598684210526316, "grad_norm": 2.171875, "grad_norm_var": 0.19332275390625, "learning_rate": 0.0001, "loss": 3.1074, "loss/crossentropy": 2.3460346817970277, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.22802013754844666, "loss/reg": 0.0, "step": 26750 }, { "epoch": 0.17605263157894738, "grad_norm": 2.28125, "grad_norm_var": 0.09132486979166667, "learning_rate": 0.0001, "loss": 3.0194, "loss/crossentropy": 2.361405539512634, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.2546524882316589, "loss/reg": 0.0, "step": 26760 }, { "epoch": 0.1761184210526316, "grad_norm": 2.21875, "grad_norm_var": 0.06636962890625, "learning_rate": 0.0001, "loss": 3.1365, "loss/crossentropy": 2.184977853298187, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.24620451256632805, "loss/reg": 0.0, "step": 26770 }, { "epoch": 0.17618421052631578, "grad_norm": 2.453125, "grad_norm_var": 0.04890034993489583, "learning_rate": 0.0001, "loss": 3.0491, "loss/crossentropy": 2.3393765091896057, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.21822300404310227, "loss/reg": 0.0, "step": 26780 }, { "epoch": 0.17625, "grad_norm": 2.234375, "grad_norm_var": 0.08136571248372396, "learning_rate": 0.0001, "loss": 3.0288, "loss/crossentropy": 2.1801982522010803, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2308456853032112, "loss/reg": 0.0, "step": 26790 }, { "epoch": 0.1763157894736842, "grad_norm": 2.046875, "grad_norm_var": 0.08675918579101563, "learning_rate": 0.0001, "loss": 3.0063, "loss/crossentropy": 2.3970799446105957, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.23122948557138442, "loss/reg": 0.0, "step": 26800 }, { "epoch": 0.17638157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.04929911295572917, "learning_rate": 0.0001, "loss": 3.0628, "loss/crossentropy": 2.3741040945053102, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.26243693232536314, "loss/reg": 0.0, "step": 26810 }, { "epoch": 0.17644736842105263, "grad_norm": 2.265625, "grad_norm_var": 0.07210184733072916, "learning_rate": 0.0001, "loss": 3.0187, "loss/crossentropy": 2.2319773197174073, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.201442664116621, "loss/reg": 0.0, "step": 26820 }, { "epoch": 0.17651315789473684, "grad_norm": 2.171875, "grad_norm_var": 0.20078023274739584, "learning_rate": 0.0001, "loss": 3.0845, "loss/crossentropy": 2.0902840554714204, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.23434632122516633, "loss/reg": 0.0, "step": 26830 }, { "epoch": 0.17657894736842106, "grad_norm": 2.078125, "grad_norm_var": 0.48001200358072915, "learning_rate": 0.0001, "loss": 3.0548, "loss/crossentropy": 1.8517399907112122, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.1863703802227974, "loss/reg": 0.0, "step": 26840 }, { "epoch": 0.17664473684210527, "grad_norm": 2.21875, "grad_norm_var": 0.4188313802083333, "learning_rate": 0.0001, "loss": 3.0383, "loss/crossentropy": 2.398709797859192, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.2097424551844597, "loss/reg": 0.0, "step": 26850 }, { "epoch": 0.17671052631578948, "grad_norm": 2.265625, "grad_norm_var": 0.14855855305989582, "learning_rate": 0.0001, "loss": 3.0439, "loss/crossentropy": 2.2976414561271667, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.2062876433134079, "loss/reg": 0.0, "step": 26860 }, { "epoch": 0.1767763157894737, "grad_norm": 2.828125, "grad_norm_var": 0.3957801818847656, "learning_rate": 0.0001, "loss": 3.1221, "loss/crossentropy": 2.2302059173583983, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.2583988204598427, "loss/reg": 0.0, "step": 26870 }, { "epoch": 0.17684210526315788, "grad_norm": 2.09375, "grad_norm_var": 0.34606831868489585, "learning_rate": 0.0001, "loss": 3.0334, "loss/crossentropy": 2.2456519961357118, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.26051086038351057, "loss/reg": 0.0, "step": 26880 }, { "epoch": 0.1769078947368421, "grad_norm": 2.765625, "grad_norm_var": 0.09192301432291666, "learning_rate": 0.0001, "loss": 3.0806, "loss/crossentropy": 2.25178804397583, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.2424270063638687, "loss/reg": 0.0, "step": 26890 }, { "epoch": 0.1769736842105263, "grad_norm": 2.1875, "grad_norm_var": 0.179736328125, "learning_rate": 0.0001, "loss": 3.0299, "loss/crossentropy": 2.2696429252624513, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.25682739466428756, "loss/reg": 0.0, "step": 26900 }, { "epoch": 0.17703947368421052, "grad_norm": 2.0625, "grad_norm_var": 0.44387613932291664, "learning_rate": 0.0001, "loss": 3.0266, "loss/crossentropy": 2.349669575691223, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.24149384647607802, "loss/reg": 0.0, "step": 26910 }, { "epoch": 0.17710526315789474, "grad_norm": 2.21875, "grad_norm_var": 0.34541727701822916, "learning_rate": 0.0001, "loss": 3.0779, "loss/crossentropy": 2.20119423866272, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.21436720862984657, "loss/reg": 0.0, "step": 26920 }, { "epoch": 0.17717105263157895, "grad_norm": 2.359375, "grad_norm_var": 0.05929361979166667, "learning_rate": 0.0001, "loss": 3.0616, "loss/crossentropy": 2.2472872853279116, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.22420328930020333, "loss/reg": 0.0, "step": 26930 }, { "epoch": 0.17723684210526316, "grad_norm": 2.734375, "grad_norm_var": 0.08504231770833333, "learning_rate": 0.0001, "loss": 3.1167, "loss/crossentropy": 2.308558535575867, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.2697313494980335, "loss/reg": 0.0, "step": 26940 }, { "epoch": 0.17730263157894738, "grad_norm": 2.375, "grad_norm_var": 0.0501617431640625, "learning_rate": 0.0001, "loss": 3.0375, "loss/crossentropy": 2.4767362475395203, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.22490220367908478, "loss/reg": 0.0, "step": 26950 }, { "epoch": 0.1773684210526316, "grad_norm": 4.25, "grad_norm_var": 0.29531962076822915, "learning_rate": 0.0001, "loss": 3.0694, "loss/crossentropy": 2.041945827007294, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.21851452216506004, "loss/reg": 0.0, "step": 26960 }, { "epoch": 0.17743421052631578, "grad_norm": 2.234375, "grad_norm_var": 0.2590728759765625, "learning_rate": 0.0001, "loss": 3.0748, "loss/crossentropy": 2.3148224115371705, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.24593613892793656, "loss/reg": 0.0, "step": 26970 }, { "epoch": 0.1775, "grad_norm": 2.375, "grad_norm_var": 0.09059956868489584, "learning_rate": 0.0001, "loss": 3.0916, "loss/crossentropy": 2.3537511348724367, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.26722831577062606, "loss/reg": 0.0, "step": 26980 }, { "epoch": 0.1775657894736842, "grad_norm": 2.640625, "grad_norm_var": 0.09703776041666666, "learning_rate": 0.0001, "loss": 3.0801, "loss/crossentropy": 2.3036428809165956, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.28045037388801575, "loss/reg": 0.0, "step": 26990 }, { "epoch": 0.17763157894736842, "grad_norm": 2.5, "grad_norm_var": 0.0448883056640625, "learning_rate": 0.0001, "loss": 3.1064, "loss/crossentropy": 2.210177004337311, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.20902514159679414, "loss/reg": 0.0, "step": 27000 }, { "epoch": 0.17769736842105263, "grad_norm": 2.640625, "grad_norm_var": 0.408251953125, "learning_rate": 0.0001, "loss": 3.1257, "loss/crossentropy": 2.354000687599182, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.2463086098432541, "loss/reg": 0.0, "step": 27010 }, { "epoch": 0.17776315789473685, "grad_norm": 2.125, "grad_norm_var": 0.07931086222330729, "learning_rate": 0.0001, "loss": 2.9938, "loss/crossentropy": 2.2460681438446044, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.2801851168274879, "loss/reg": 0.0, "step": 27020 }, { "epoch": 0.17782894736842106, "grad_norm": 1.984375, "grad_norm_var": 0.07854715983072917, "learning_rate": 0.0001, "loss": 3.0282, "loss/crossentropy": 2.1731944918632506, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.22212435267865657, "loss/reg": 0.0, "step": 27030 }, { "epoch": 0.17789473684210527, "grad_norm": 2.25, "grad_norm_var": 0.116162109375, "learning_rate": 0.0001, "loss": 3.0344, "loss/crossentropy": 2.441677284240723, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.25195191353559493, "loss/reg": 0.0, "step": 27040 }, { "epoch": 0.17796052631578949, "grad_norm": 2.375, "grad_norm_var": 0.15789388020833334, "learning_rate": 0.0001, "loss": 3.0824, "loss/crossentropy": 2.30471476316452, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.2780482307076454, "loss/reg": 0.0, "step": 27050 }, { "epoch": 0.17802631578947367, "grad_norm": 2.15625, "grad_norm_var": 0.13613179524739583, "learning_rate": 0.0001, "loss": 3.0864, "loss/crossentropy": 2.1705368638038633, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2402413785457611, "loss/reg": 0.0, "step": 27060 }, { "epoch": 0.17809210526315788, "grad_norm": 2.609375, "grad_norm_var": 0.09636128743489583, "learning_rate": 0.0001, "loss": 3.1186, "loss/crossentropy": 2.1933916807174683, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.22591635286808015, "loss/reg": 0.0, "step": 27070 }, { "epoch": 0.1781578947368421, "grad_norm": 2.296875, "grad_norm_var": 0.08137613932291667, "learning_rate": 0.0001, "loss": 3.0876, "loss/crossentropy": 2.2060605615377424, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.2600886657834053, "loss/reg": 0.0, "step": 27080 }, { "epoch": 0.1782236842105263, "grad_norm": 3.0, "grad_norm_var": 0.12360738118489584, "learning_rate": 0.0001, "loss": 3.0018, "loss/crossentropy": 2.3198684811592103, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.23098773807287215, "loss/reg": 0.0, "step": 27090 }, { "epoch": 0.17828947368421053, "grad_norm": 2.703125, "grad_norm_var": 0.10364583333333334, "learning_rate": 0.0001, "loss": 2.977, "loss/crossentropy": 2.3410441994667055, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.25723920315504073, "loss/reg": 0.0, "step": 27100 }, { "epoch": 0.17835526315789474, "grad_norm": 2.609375, "grad_norm_var": 0.08136393229166666, "learning_rate": 0.0001, "loss": 2.9812, "loss/crossentropy": 2.290076696872711, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.20929899364709853, "loss/reg": 0.0, "step": 27110 }, { "epoch": 0.17842105263157895, "grad_norm": 2.53125, "grad_norm_var": 0.061310831705729166, "learning_rate": 0.0001, "loss": 3.0998, "loss/crossentropy": 2.6890960454940798, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.25387548208236693, "loss/reg": 0.0, "step": 27120 }, { "epoch": 0.17848684210526317, "grad_norm": 2.3125, "grad_norm_var": 0.03313395182291667, "learning_rate": 0.0001, "loss": 3.0312, "loss/crossentropy": 2.4378479957580566, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.22957875877618789, "loss/reg": 0.0, "step": 27130 }, { "epoch": 0.17855263157894738, "grad_norm": 2.328125, "grad_norm_var": 0.11809794108072917, "learning_rate": 0.0001, "loss": 3.1739, "loss/crossentropy": 2.2034701079130175, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.2694556161761284, "loss/reg": 0.0, "step": 27140 }, { "epoch": 0.17861842105263157, "grad_norm": 2.015625, "grad_norm_var": 0.243505859375, "learning_rate": 0.0001, "loss": 3.0594, "loss/crossentropy": 2.371540868282318, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.23407552391290665, "loss/reg": 0.0, "step": 27150 }, { "epoch": 0.17868421052631578, "grad_norm": 2.21875, "grad_norm_var": 0.15914306640625, "learning_rate": 0.0001, "loss": 2.9942, "loss/crossentropy": 1.9778084814548493, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.19847352504730226, "loss/reg": 0.0, "step": 27160 }, { "epoch": 0.17875, "grad_norm": 2.71875, "grad_norm_var": 0.05803120930989583, "learning_rate": 0.0001, "loss": 3.0767, "loss/crossentropy": 2.3956029176712037, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.21606186181306838, "loss/reg": 0.0, "step": 27170 }, { "epoch": 0.1788157894736842, "grad_norm": 2.703125, "grad_norm_var": 0.07607014973958333, "learning_rate": 0.0001, "loss": 3.0248, "loss/crossentropy": 2.2445591568946837, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.23037664219737053, "loss/reg": 0.0, "step": 27180 }, { "epoch": 0.17888157894736842, "grad_norm": 2.453125, "grad_norm_var": 0.08280843098958333, "learning_rate": 0.0001, "loss": 3.1128, "loss/crossentropy": 1.8874721288681031, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.2291864424943924, "loss/reg": 0.0, "step": 27190 }, { "epoch": 0.17894736842105263, "grad_norm": 2.34375, "grad_norm_var": 0.08245035807291666, "learning_rate": 0.0001, "loss": 3.0599, "loss/crossentropy": 2.321315813064575, "loss/hidden": 3.0171875, "loss/incoh": 0.0, "loss/logits": 0.28183609843254087, "loss/reg": 0.0, "step": 27200 }, { "epoch": 0.17901315789473685, "grad_norm": 2.40625, "grad_norm_var": 0.04579976399739583, "learning_rate": 0.0001, "loss": 3.1486, "loss/crossentropy": 2.1769141793251037, "loss/hidden": 3.1140625, "loss/incoh": 0.0, "loss/logits": 0.3312571823596954, "loss/reg": 0.0, "step": 27210 }, { "epoch": 0.17907894736842106, "grad_norm": 2.28125, "grad_norm_var": 0.0505035400390625, "learning_rate": 0.0001, "loss": 3.0333, "loss/crossentropy": 2.421796774864197, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.2053638830780983, "loss/reg": 0.0, "step": 27220 }, { "epoch": 0.17914473684210527, "grad_norm": 2.34375, "grad_norm_var": 0.13408915201822916, "learning_rate": 0.0001, "loss": 3.0911, "loss/crossentropy": 2.539614748954773, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2433784618973732, "loss/reg": 0.0, "step": 27230 }, { "epoch": 0.1792105263157895, "grad_norm": 2.125, "grad_norm_var": 0.08008524576822916, "learning_rate": 0.0001, "loss": 3.0, "loss/crossentropy": 2.1362823486328124, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.22769216746091842, "loss/reg": 0.0, "step": 27240 }, { "epoch": 0.17927631578947367, "grad_norm": 2.40625, "grad_norm_var": 0.038374837239583334, "learning_rate": 0.0001, "loss": 2.9645, "loss/crossentropy": 2.341468358039856, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.21539552509784698, "loss/reg": 0.0, "step": 27250 }, { "epoch": 0.17934210526315789, "grad_norm": 2.234375, "grad_norm_var": 0.031174468994140624, "learning_rate": 0.0001, "loss": 3.0392, "loss/crossentropy": 2.2766498208045958, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.2082691341638565, "loss/reg": 0.0, "step": 27260 }, { "epoch": 0.1794078947368421, "grad_norm": 2.328125, "grad_norm_var": 0.16941909790039061, "learning_rate": 0.0001, "loss": 3.0554, "loss/crossentropy": 2.537719178199768, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.21607491597533227, "loss/reg": 0.0, "step": 27270 }, { "epoch": 0.1794736842105263, "grad_norm": 2.078125, "grad_norm_var": 0.23950093587239582, "learning_rate": 0.0001, "loss": 2.9943, "loss/crossentropy": 2.4620283365249636, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.23522212058305741, "loss/reg": 0.0, "step": 27280 }, { "epoch": 0.17953947368421053, "grad_norm": 2.328125, "grad_norm_var": 0.1655181884765625, "learning_rate": 0.0001, "loss": 2.9975, "loss/crossentropy": 2.2144778609275817, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.22236768230795861, "loss/reg": 0.0, "step": 27290 }, { "epoch": 0.17960526315789474, "grad_norm": 2.46875, "grad_norm_var": 0.4901529947916667, "learning_rate": 0.0001, "loss": 3.0091, "loss/crossentropy": 2.1528895676136015, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.2013701803982258, "loss/reg": 0.0, "step": 27300 }, { "epoch": 0.17967105263157895, "grad_norm": 2.21875, "grad_norm_var": 0.5030751546223958, "learning_rate": 0.0001, "loss": 2.932, "loss/crossentropy": 2.2105651617050173, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.2156506821513176, "loss/reg": 0.0, "step": 27310 }, { "epoch": 0.17973684210526317, "grad_norm": 2.75, "grad_norm_var": 0.1188385009765625, "learning_rate": 0.0001, "loss": 3.0315, "loss/crossentropy": 2.3311607003211976, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.22039272785186767, "loss/reg": 0.0, "step": 27320 }, { "epoch": 0.17980263157894738, "grad_norm": 2.5, "grad_norm_var": 0.10274149576822916, "learning_rate": 0.0001, "loss": 2.9934, "loss/crossentropy": 2.363705575466156, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.2591767907142639, "loss/reg": 0.0, "step": 27330 }, { "epoch": 0.17986842105263157, "grad_norm": 2.453125, "grad_norm_var": 0.22634175618489583, "learning_rate": 0.0001, "loss": 3.0232, "loss/crossentropy": 2.491378426551819, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.22361548170447348, "loss/reg": 0.0, "step": 27340 }, { "epoch": 0.17993421052631578, "grad_norm": 2.3125, "grad_norm_var": 0.28927408854166664, "learning_rate": 0.0001, "loss": 3.0354, "loss/crossentropy": 2.2730626463890076, "loss/hidden": 2.95, "loss/incoh": 0.0, "loss/logits": 0.2728175431489944, "loss/reg": 0.0, "step": 27350 }, { "epoch": 0.18, "grad_norm": 2.03125, "grad_norm_var": 0.2871734619140625, "learning_rate": 0.0001, "loss": 3.0815, "loss/crossentropy": 2.222132349014282, "loss/hidden": 3.0171875, "loss/incoh": 0.0, "loss/logits": 0.27575425505638124, "loss/reg": 0.0, "step": 27360 }, { "epoch": 0.1800657894736842, "grad_norm": 2.859375, "grad_norm_var": 0.18010660807291667, "learning_rate": 0.0001, "loss": 3.0514, "loss/crossentropy": 2.3982750535011292, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.24716890305280687, "loss/reg": 0.0, "step": 27370 }, { "epoch": 0.18013157894736842, "grad_norm": 3.625, "grad_norm_var": 0.20855712890625, "learning_rate": 0.0001, "loss": 3.0229, "loss/crossentropy": 2.3283283829689028, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.27690982520580293, "loss/reg": 0.0, "step": 27380 }, { "epoch": 0.18019736842105263, "grad_norm": 2.546875, "grad_norm_var": 0.14794514973958334, "learning_rate": 0.0001, "loss": 3.1136, "loss/crossentropy": 2.3320053696632383, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.22953213453292848, "loss/reg": 0.0, "step": 27390 }, { "epoch": 0.18026315789473685, "grad_norm": 2.78125, "grad_norm_var": 0.15938212076822916, "learning_rate": 0.0001, "loss": 3.0693, "loss/crossentropy": 2.5745165824890135, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.24661683887243271, "loss/reg": 0.0, "step": 27400 }, { "epoch": 0.18032894736842106, "grad_norm": 2.328125, "grad_norm_var": 0.08530171712239583, "learning_rate": 0.0001, "loss": 3.027, "loss/crossentropy": 2.291408562660217, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.2568038985133171, "loss/reg": 0.0, "step": 27410 }, { "epoch": 0.18039473684210527, "grad_norm": 3.0, "grad_norm_var": 0.8119293212890625, "learning_rate": 0.0001, "loss": 3.0758, "loss/crossentropy": 2.275824952125549, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.25695485174655913, "loss/reg": 0.0, "step": 27420 }, { "epoch": 0.18046052631578946, "grad_norm": 2.1875, "grad_norm_var": 0.054488118489583334, "learning_rate": 0.0001, "loss": 2.9932, "loss/crossentropy": 2.3114712476730346, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.23609411865472793, "loss/reg": 0.0, "step": 27430 }, { "epoch": 0.18052631578947367, "grad_norm": 2.5, "grad_norm_var": 0.05273030598958333, "learning_rate": 0.0001, "loss": 3.0682, "loss/crossentropy": 2.179815483093262, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.25313089042901993, "loss/reg": 0.0, "step": 27440 }, { "epoch": 0.1805921052631579, "grad_norm": 2.296875, "grad_norm_var": 0.06285807291666666, "learning_rate": 0.0001, "loss": 3.0519, "loss/crossentropy": 2.3046084105968476, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.27110700607299804, "loss/reg": 0.0, "step": 27450 }, { "epoch": 0.1806578947368421, "grad_norm": 2.15625, "grad_norm_var": 0.06948954264322917, "learning_rate": 0.0001, "loss": 3.0719, "loss/crossentropy": 2.2216897130012514, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.2158276081085205, "loss/reg": 0.0, "step": 27460 }, { "epoch": 0.18072368421052631, "grad_norm": 2.625, "grad_norm_var": 0.06531575520833334, "learning_rate": 0.0001, "loss": 3.0662, "loss/crossentropy": 2.31493022441864, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.2338400349020958, "loss/reg": 0.0, "step": 27470 }, { "epoch": 0.18078947368421053, "grad_norm": 2.78125, "grad_norm_var": 0.08967997233072916, "learning_rate": 0.0001, "loss": 3.1017, "loss/crossentropy": 2.292111110687256, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.25805368572473525, "loss/reg": 0.0, "step": 27480 }, { "epoch": 0.18085526315789474, "grad_norm": 3.78125, "grad_norm_var": 0.16054585774739583, "learning_rate": 0.0001, "loss": 3.0896, "loss/crossentropy": 2.531006705760956, "loss/hidden": 2.984375, "loss/incoh": 0.0, "loss/logits": 0.3327594205737114, "loss/reg": 0.0, "step": 27490 }, { "epoch": 0.18092105263157895, "grad_norm": 2.3125, "grad_norm_var": 0.1436431884765625, "learning_rate": 0.0001, "loss": 3.0498, "loss/crossentropy": 2.3085229277610777, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.23342303335666656, "loss/reg": 0.0, "step": 27500 }, { "epoch": 0.18098684210526317, "grad_norm": 2.828125, "grad_norm_var": 0.24217122395833332, "learning_rate": 0.0001, "loss": 3.0115, "loss/crossentropy": 2.476685440540314, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.2170659363269806, "loss/reg": 0.0, "step": 27510 }, { "epoch": 0.18105263157894738, "grad_norm": 2.546875, "grad_norm_var": 0.13345438639322918, "learning_rate": 0.0001, "loss": 3.0171, "loss/crossentropy": 2.280565071105957, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.20756303817033767, "loss/reg": 0.0, "step": 27520 }, { "epoch": 0.18111842105263157, "grad_norm": 2.5, "grad_norm_var": 0.047200520833333336, "learning_rate": 0.0001, "loss": 2.9912, "loss/crossentropy": 2.553924024105072, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.24719125479459764, "loss/reg": 0.0, "step": 27530 }, { "epoch": 0.18118421052631578, "grad_norm": 2.171875, "grad_norm_var": 0.06653238932291666, "learning_rate": 0.0001, "loss": 3.121, "loss/crossentropy": 2.174853026866913, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.31346396207809446, "loss/reg": 0.0, "step": 27540 }, { "epoch": 0.18125, "grad_norm": 2.828125, "grad_norm_var": 0.04431966145833333, "learning_rate": 0.0001, "loss": 3.0695, "loss/crossentropy": 2.329943561553955, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.22730601876974105, "loss/reg": 0.0, "step": 27550 }, { "epoch": 0.1813157894736842, "grad_norm": 2.65625, "grad_norm_var": 0.0733551025390625, "learning_rate": 0.0001, "loss": 3.0813, "loss/crossentropy": 2.2811534643173217, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.24358074963092805, "loss/reg": 0.0, "step": 27560 }, { "epoch": 0.18138157894736842, "grad_norm": 2.1875, "grad_norm_var": 0.05774739583333333, "learning_rate": 0.0001, "loss": 3.0169, "loss/crossentropy": 2.3220547437667847, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.24033707976341248, "loss/reg": 0.0, "step": 27570 }, { "epoch": 0.18144736842105263, "grad_norm": 2.5625, "grad_norm_var": 0.38362528483072916, "learning_rate": 0.0001, "loss": 3.0383, "loss/crossentropy": 2.4912595987319945, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.2603204816579819, "loss/reg": 0.0, "step": 27580 }, { "epoch": 0.18151315789473685, "grad_norm": 2.75, "grad_norm_var": 0.1091949462890625, "learning_rate": 0.0001, "loss": 3.0424, "loss/crossentropy": 2.4132495522499084, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.23840930834412574, "loss/reg": 0.0, "step": 27590 }, { "epoch": 0.18157894736842106, "grad_norm": 2.09375, "grad_norm_var": 0.224072265625, "learning_rate": 0.0001, "loss": 3.0529, "loss/crossentropy": 2.27939612865448, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.2168305702507496, "loss/reg": 0.0, "step": 27600 }, { "epoch": 0.18164473684210528, "grad_norm": 2.453125, "grad_norm_var": 0.354541015625, "learning_rate": 0.0001, "loss": 3.04, "loss/crossentropy": 2.5797234058380125, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.3075165659189224, "loss/reg": 0.0, "step": 27610 }, { "epoch": 0.18171052631578946, "grad_norm": 2.59375, "grad_norm_var": 0.28559544881184895, "learning_rate": 0.0001, "loss": 3.0175, "loss/crossentropy": 2.457747685909271, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.2326775386929512, "loss/reg": 0.0, "step": 27620 }, { "epoch": 0.18177631578947367, "grad_norm": 2.078125, "grad_norm_var": 0.2150957743326823, "learning_rate": 0.0001, "loss": 3.025, "loss/crossentropy": 2.2331402122974398, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.24202753305435182, "loss/reg": 0.0, "step": 27630 }, { "epoch": 0.1818421052631579, "grad_norm": 2.453125, "grad_norm_var": 0.074267578125, "learning_rate": 0.0001, "loss": 3.0184, "loss/crossentropy": 2.402931201457977, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.24029098004102706, "loss/reg": 0.0, "step": 27640 }, { "epoch": 0.1819078947368421, "grad_norm": 2.234375, "grad_norm_var": 0.12086181640625, "learning_rate": 0.0001, "loss": 3.0823, "loss/crossentropy": 2.1527361035346986, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.22135660648345948, "loss/reg": 0.0, "step": 27650 }, { "epoch": 0.18197368421052632, "grad_norm": 2.453125, "grad_norm_var": 0.13780008951822917, "learning_rate": 0.0001, "loss": 3.0202, "loss/crossentropy": 2.2100749254226684, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.2464701771736145, "loss/reg": 0.0, "step": 27660 }, { "epoch": 0.18203947368421053, "grad_norm": 2.28125, "grad_norm_var": 0.08621317545572917, "learning_rate": 0.0001, "loss": 3.0173, "loss/crossentropy": 2.400713062286377, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.2229515090584755, "loss/reg": 0.0, "step": 27670 }, { "epoch": 0.18210526315789474, "grad_norm": 2.03125, "grad_norm_var": 0.03254292805989583, "learning_rate": 0.0001, "loss": 2.9616, "loss/crossentropy": 2.053451955318451, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.2730902835726738, "loss/reg": 0.0, "step": 27680 }, { "epoch": 0.18217105263157896, "grad_norm": 2.53125, "grad_norm_var": 0.05116551717122396, "learning_rate": 0.0001, "loss": 2.9723, "loss/crossentropy": 2.321164917945862, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.2052845761179924, "loss/reg": 0.0, "step": 27690 }, { "epoch": 0.18223684210526317, "grad_norm": 2.640625, "grad_norm_var": 0.06518961588541666, "learning_rate": 0.0001, "loss": 3.0474, "loss/crossentropy": 2.1387670636177063, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.24108488559722902, "loss/reg": 0.0, "step": 27700 }, { "epoch": 0.18230263157894736, "grad_norm": 2.703125, "grad_norm_var": 4.39171331639651e+17, "learning_rate": 0.0001, "loss": 3.143, "loss/crossentropy": 2.4102607131004334, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.33623483330011367, "loss/reg": 0.0, "step": 27710 }, { "epoch": 0.18236842105263157, "grad_norm": 2.21875, "grad_norm_var": 4.391713315923646e+17, "learning_rate": 0.0001, "loss": 3.053, "loss/crossentropy": 2.215381217002869, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.21635367721319199, "loss/reg": 0.0, "step": 27720 }, { "epoch": 0.18243421052631578, "grad_norm": 2.09375, "grad_norm_var": 0.118603515625, "learning_rate": 0.0001, "loss": 2.9848, "loss/crossentropy": 2.456426572799683, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.22907530665397643, "loss/reg": 0.0, "step": 27730 }, { "epoch": 0.1825, "grad_norm": 2.140625, "grad_norm_var": 0.2470123291015625, "learning_rate": 0.0001, "loss": 2.9824, "loss/crossentropy": 2.0001117050647736, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.18826698660850524, "loss/reg": 0.0, "step": 27740 }, { "epoch": 0.1825657894736842, "grad_norm": 2.265625, "grad_norm_var": 0.21778971354166668, "learning_rate": 0.0001, "loss": 3.0528, "loss/crossentropy": 2.3025170087814333, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.22922251224517823, "loss/reg": 0.0, "step": 27750 }, { "epoch": 0.18263157894736842, "grad_norm": 2.46875, "grad_norm_var": 0.07261962890625, "learning_rate": 0.0001, "loss": 3.0966, "loss/crossentropy": 2.299086034297943, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.23557846546173095, "loss/reg": 0.0, "step": 27760 }, { "epoch": 0.18269736842105264, "grad_norm": 2.140625, "grad_norm_var": 0.27241109212239584, "learning_rate": 0.0001, "loss": 3.0505, "loss/crossentropy": 2.4849728107452393, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.22220651507377626, "loss/reg": 0.0, "step": 27770 }, { "epoch": 0.18276315789473685, "grad_norm": 2.3125, "grad_norm_var": 0.29171549479166664, "learning_rate": 0.0001, "loss": 2.9851, "loss/crossentropy": 2.3156984567642214, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.220241579413414, "loss/reg": 0.0, "step": 27780 }, { "epoch": 0.18282894736842106, "grad_norm": 2.1875, "grad_norm_var": 0.0520416259765625, "learning_rate": 0.0001, "loss": 3.0197, "loss/crossentropy": 2.4274224162101747, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.22342314720153808, "loss/reg": 0.0, "step": 27790 }, { "epoch": 0.18289473684210528, "grad_norm": 2.15625, "grad_norm_var": 0.0919342041015625, "learning_rate": 0.0001, "loss": 3.0726, "loss/crossentropy": 2.399091196060181, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.23315689116716384, "loss/reg": 0.0, "step": 27800 }, { "epoch": 0.18296052631578946, "grad_norm": 2.59375, "grad_norm_var": 0.1072265625, "learning_rate": 0.0001, "loss": 3.0667, "loss/crossentropy": 2.2596776604652407, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.25109176337718964, "loss/reg": 0.0, "step": 27810 }, { "epoch": 0.18302631578947368, "grad_norm": 2.6875, "grad_norm_var": 1.1102854410807292, "learning_rate": 0.0001, "loss": 3.1493, "loss/crossentropy": 2.2746080636978148, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2303971141576767, "loss/reg": 0.0, "step": 27820 }, { "epoch": 0.1830921052631579, "grad_norm": 2.25, "grad_norm_var": 1.1916015625, "learning_rate": 0.0001, "loss": 3.0322, "loss/crossentropy": 2.2199900269508364, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.24132085889577864, "loss/reg": 0.0, "step": 27830 }, { "epoch": 0.1831578947368421, "grad_norm": 2.21875, "grad_norm_var": 0.11803385416666666, "learning_rate": 0.0001, "loss": 2.9787, "loss/crossentropy": 2.261062300205231, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.23909828811883926, "loss/reg": 0.0, "step": 27840 }, { "epoch": 0.18322368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.09485270182291666, "learning_rate": 0.0001, "loss": 3.0545, "loss/crossentropy": 2.3090713739395143, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.20118657350540162, "loss/reg": 0.0, "step": 27850 }, { "epoch": 0.18328947368421053, "grad_norm": 2.390625, "grad_norm_var": 0.24943008422851562, "learning_rate": 0.0001, "loss": 3.0424, "loss/crossentropy": 2.3824569821357726, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.23975446224212646, "loss/reg": 0.0, "step": 27860 }, { "epoch": 0.18335526315789474, "grad_norm": 2.125, "grad_norm_var": 0.14694010416666667, "learning_rate": 0.0001, "loss": 2.9925, "loss/crossentropy": 2.227169597148895, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.22001251727342605, "loss/reg": 0.0, "step": 27870 }, { "epoch": 0.18342105263157896, "grad_norm": 2.28125, "grad_norm_var": 0.06550191243489584, "learning_rate": 0.0001, "loss": 3.0493, "loss/crossentropy": 2.5089489579200746, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.20695988237857818, "loss/reg": 0.0, "step": 27880 }, { "epoch": 0.18348684210526317, "grad_norm": 2.328125, "grad_norm_var": 0.0484527587890625, "learning_rate": 0.0001, "loss": 2.9952, "loss/crossentropy": 2.4806633472442625, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.27523170709609984, "loss/reg": 0.0, "step": 27890 }, { "epoch": 0.18355263157894736, "grad_norm": 4.3125, "grad_norm_var": 0.2769521077473958, "learning_rate": 0.0001, "loss": 3.0394, "loss/crossentropy": 2.239254057407379, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.2308262661099434, "loss/reg": 0.0, "step": 27900 }, { "epoch": 0.18361842105263157, "grad_norm": 2.625, "grad_norm_var": 0.29397761027018227, "learning_rate": 0.0001, "loss": 3.0224, "loss/crossentropy": 2.3714274525642396, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.26112145036458967, "loss/reg": 0.0, "step": 27910 }, { "epoch": 0.18368421052631578, "grad_norm": 2.3125, "grad_norm_var": 0.05971450805664062, "learning_rate": 0.0001, "loss": 2.9517, "loss/crossentropy": 2.0551604270935058, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.24491064846515656, "loss/reg": 0.0, "step": 27920 }, { "epoch": 0.18375, "grad_norm": 2.53125, "grad_norm_var": 0.03795547485351562, "learning_rate": 0.0001, "loss": 3.0193, "loss/crossentropy": 2.3411830067634583, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.23370584547519685, "loss/reg": 0.0, "step": 27930 }, { "epoch": 0.1838157894736842, "grad_norm": 2.40625, "grad_norm_var": 0.1084625244140625, "learning_rate": 0.0001, "loss": 3.0802, "loss/crossentropy": 2.78390554189682, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.26939452439546585, "loss/reg": 0.0, "step": 27940 }, { "epoch": 0.18388157894736842, "grad_norm": 2.359375, "grad_norm_var": 0.41838277180989586, "learning_rate": 0.0001, "loss": 3.1153, "loss/crossentropy": 2.2294405877590178, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.23037993758916855, "loss/reg": 0.0, "step": 27950 }, { "epoch": 0.18394736842105264, "grad_norm": 3.390625, "grad_norm_var": 0.37854410807291666, "learning_rate": 0.0001, "loss": 3.1324, "loss/crossentropy": 2.28962881565094, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.24506068229675293, "loss/reg": 0.0, "step": 27960 }, { "epoch": 0.18401315789473685, "grad_norm": 2.40625, "grad_norm_var": 0.1893999735514323, "learning_rate": 0.0001, "loss": 3.0278, "loss/crossentropy": 2.312084639072418, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2302939549088478, "loss/reg": 0.0, "step": 27970 }, { "epoch": 0.18407894736842106, "grad_norm": 2.359375, "grad_norm_var": 0.15084228515625, "learning_rate": 0.0001, "loss": 3.0267, "loss/crossentropy": 2.3484140396118165, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.21540230959653855, "loss/reg": 0.0, "step": 27980 }, { "epoch": 0.18414473684210525, "grad_norm": 2.421875, "grad_norm_var": 0.15601806640625, "learning_rate": 0.0001, "loss": 3.0073, "loss/crossentropy": 2.4529793858528137, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.23430580049753189, "loss/reg": 0.0, "step": 27990 }, { "epoch": 0.18421052631578946, "grad_norm": 2.421875, "grad_norm_var": 0.0614166259765625, "learning_rate": 0.0001, "loss": 3.0303, "loss/crossentropy": 2.351777696609497, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2307715743780136, "loss/reg": 0.0, "step": 28000 }, { "epoch": 0.18427631578947368, "grad_norm": 4.0625, "grad_norm_var": 0.21398824055989582, "learning_rate": 0.0001, "loss": 3.0205, "loss/crossentropy": 2.435424494743347, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2605126053094864, "loss/reg": 0.0, "step": 28010 }, { "epoch": 0.1843421052631579, "grad_norm": 2.5625, "grad_norm_var": 0.22447509765625, "learning_rate": 0.0001, "loss": 3.0798, "loss/crossentropy": 2.3683685779571535, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.22029414623975754, "loss/reg": 0.0, "step": 28020 }, { "epoch": 0.1844078947368421, "grad_norm": 2.34375, "grad_norm_var": 0.08081766764322916, "learning_rate": 0.0001, "loss": 3.0223, "loss/crossentropy": 2.5003761768341066, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.2650137528777122, "loss/reg": 0.0, "step": 28030 }, { "epoch": 0.18447368421052632, "grad_norm": 2.28125, "grad_norm_var": 0.07470601399739583, "learning_rate": 0.0001, "loss": 2.998, "loss/crossentropy": 2.2510382771492004, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.23913081735372543, "loss/reg": 0.0, "step": 28040 }, { "epoch": 0.18453947368421053, "grad_norm": 2.140625, "grad_norm_var": 0.12652587890625, "learning_rate": 0.0001, "loss": 3.0344, "loss/crossentropy": 2.494665837287903, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.24169176146388055, "loss/reg": 0.0, "step": 28050 }, { "epoch": 0.18460526315789474, "grad_norm": 2.0625, "grad_norm_var": 0.08466389973958334, "learning_rate": 0.0001, "loss": 3.104, "loss/crossentropy": 2.3110733151435854, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.22967937737703323, "loss/reg": 0.0, "step": 28060 }, { "epoch": 0.18467105263157896, "grad_norm": 2.578125, "grad_norm_var": 0.067431640625, "learning_rate": 0.0001, "loss": 3.0115, "loss/crossentropy": 2.4786781549453734, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.23725456148386, "loss/reg": 0.0, "step": 28070 }, { "epoch": 0.18473684210526317, "grad_norm": 2.828125, "grad_norm_var": 0.07773335774739583, "learning_rate": 0.0001, "loss": 3.0795, "loss/crossentropy": 2.5213263630867004, "loss/hidden": 3.0703125, "loss/incoh": 0.0, "loss/logits": 0.3203597366809845, "loss/reg": 0.0, "step": 28080 }, { "epoch": 0.18480263157894736, "grad_norm": 2.296875, "grad_norm_var": 0.06311747233072916, "learning_rate": 0.0001, "loss": 2.9924, "loss/crossentropy": 2.3318725705146788, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.21815032362937928, "loss/reg": 0.0, "step": 28090 }, { "epoch": 0.18486842105263157, "grad_norm": 2.40625, "grad_norm_var": 0.42880859375, "learning_rate": 0.0001, "loss": 3.0976, "loss/crossentropy": 2.257493245601654, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.22243027836084367, "loss/reg": 0.0, "step": 28100 }, { "epoch": 0.18493421052631578, "grad_norm": 2.5625, "grad_norm_var": 0.16021728515625, "learning_rate": 0.0001, "loss": 3.0607, "loss/crossentropy": 2.320458722114563, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.22777790427207947, "loss/reg": 0.0, "step": 28110 }, { "epoch": 0.185, "grad_norm": 2.1875, "grad_norm_var": 0.35485738118489585, "learning_rate": 0.0001, "loss": 3.0824, "loss/crossentropy": 2.5934891939163207, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.217761267721653, "loss/reg": 0.0, "step": 28120 }, { "epoch": 0.1850657894736842, "grad_norm": 2.15625, "grad_norm_var": 0.16167577107747397, "learning_rate": 0.0001, "loss": 3.0055, "loss/crossentropy": 2.2394705057144164, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.2515557982027531, "loss/reg": 0.0, "step": 28130 }, { "epoch": 0.18513157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.14989598592122397, "learning_rate": 0.0001, "loss": 3.0126, "loss/crossentropy": 2.3399840354919434, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.2646253302693367, "loss/reg": 0.0, "step": 28140 }, { "epoch": 0.18519736842105264, "grad_norm": 2.234375, "grad_norm_var": 0.14189453125, "learning_rate": 0.0001, "loss": 3.0785, "loss/crossentropy": 2.3720606327056886, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2922904878854752, "loss/reg": 0.0, "step": 28150 }, { "epoch": 0.18526315789473685, "grad_norm": 2.1875, "grad_norm_var": 0.052567545572916666, "learning_rate": 0.0001, "loss": 3.0941, "loss/crossentropy": 2.26825897693634, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.24575791507959366, "loss/reg": 0.0, "step": 28160 }, { "epoch": 0.18532894736842107, "grad_norm": 2.671875, "grad_norm_var": 0.0409820556640625, "learning_rate": 0.0001, "loss": 3.0183, "loss/crossentropy": 2.2412746131420134, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.22467602118849755, "loss/reg": 0.0, "step": 28170 }, { "epoch": 0.18539473684210525, "grad_norm": 2.40625, "grad_norm_var": 0.022264607747395835, "learning_rate": 0.0001, "loss": 3.1005, "loss/crossentropy": 2.465518927574158, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.25958890467882156, "loss/reg": 0.0, "step": 28180 }, { "epoch": 0.18546052631578946, "grad_norm": 2.5625, "grad_norm_var": 0.026130167643229167, "learning_rate": 0.0001, "loss": 3.0111, "loss/crossentropy": 2.4506253004074097, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.23345130831003189, "loss/reg": 0.0, "step": 28190 }, { "epoch": 0.18552631578947368, "grad_norm": 2.859375, "grad_norm_var": 0.2981516520182292, "learning_rate": 0.0001, "loss": 3.1404, "loss/crossentropy": 2.531968724727631, "loss/hidden": 3.134375, "loss/incoh": 0.0, "loss/logits": 0.3312064751982689, "loss/reg": 0.0, "step": 28200 }, { "epoch": 0.1855921052631579, "grad_norm": 2.4375, "grad_norm_var": 0.3452545166015625, "learning_rate": 0.0001, "loss": 3.0669, "loss/crossentropy": 2.1583576440811156, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.22225692719221116, "loss/reg": 0.0, "step": 28210 }, { "epoch": 0.1856578947368421, "grad_norm": 2.5, "grad_norm_var": 0.1086090087890625, "learning_rate": 0.0001, "loss": 3.0293, "loss/crossentropy": 2.0994983315467834, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.21283998042345048, "loss/reg": 0.0, "step": 28220 }, { "epoch": 0.18572368421052632, "grad_norm": 2.625, "grad_norm_var": 0.09930013020833334, "learning_rate": 0.0001, "loss": 3.0418, "loss/crossentropy": 2.208499777317047, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.21439559012651443, "loss/reg": 0.0, "step": 28230 }, { "epoch": 0.18578947368421053, "grad_norm": 3.609375, "grad_norm_var": 0.1129302978515625, "learning_rate": 0.0001, "loss": 3.1019, "loss/crossentropy": 2.507619249820709, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.2734301760792732, "loss/reg": 0.0, "step": 28240 }, { "epoch": 0.18585526315789475, "grad_norm": 2.375, "grad_norm_var": 0.09721577962239583, "learning_rate": 0.0001, "loss": 3.0004, "loss/crossentropy": 2.440661299228668, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.20598573684692384, "loss/reg": 0.0, "step": 28250 }, { "epoch": 0.18592105263157896, "grad_norm": 3.046875, "grad_norm_var": 0.13509012858072916, "learning_rate": 0.0001, "loss": 3.0432, "loss/crossentropy": 2.13590213060379, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.21278525814414023, "loss/reg": 0.0, "step": 28260 }, { "epoch": 0.18598684210526314, "grad_norm": 2.3125, "grad_norm_var": 0.10868733723958333, "learning_rate": 0.0001, "loss": 3.0082, "loss/crossentropy": 2.4866502404212953, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2882431522011757, "loss/reg": 0.0, "step": 28270 }, { "epoch": 0.18605263157894736, "grad_norm": 2.40625, "grad_norm_var": 0.03225504557291667, "learning_rate": 0.0001, "loss": 3.0845, "loss/crossentropy": 1.8784153163433075, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.21082937121391296, "loss/reg": 0.0, "step": 28280 }, { "epoch": 0.18611842105263157, "grad_norm": 2.390625, "grad_norm_var": 0.0173980712890625, "learning_rate": 0.0001, "loss": 3.0764, "loss/crossentropy": 2.357287549972534, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.20526091530919074, "loss/reg": 0.0, "step": 28290 }, { "epoch": 0.18618421052631579, "grad_norm": 2.109375, "grad_norm_var": 0.05768941243489583, "learning_rate": 0.0001, "loss": 3.0812, "loss/crossentropy": 1.8806862443685533, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.18540635257959365, "loss/reg": 0.0, "step": 28300 }, { "epoch": 0.18625, "grad_norm": 2.71875, "grad_norm_var": 0.0767730712890625, "learning_rate": 0.0001, "loss": 3.0752, "loss/crossentropy": 2.2633087158203127, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2433292269706726, "loss/reg": 0.0, "step": 28310 }, { "epoch": 0.1863157894736842, "grad_norm": 2.65625, "grad_norm_var": 0.05572509765625, "learning_rate": 0.0001, "loss": 2.9722, "loss/crossentropy": 2.328176748752594, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.226905982196331, "loss/reg": 0.0, "step": 28320 }, { "epoch": 0.18638157894736843, "grad_norm": 1.9765625, "grad_norm_var": 0.08190485636393229, "learning_rate": 0.0001, "loss": 3.0092, "loss/crossentropy": 2.532880795001984, "loss/hidden": 2.971875, "loss/incoh": 0.0, "loss/logits": 0.3012142822146416, "loss/reg": 0.0, "step": 28330 }, { "epoch": 0.18644736842105264, "grad_norm": 2.625, "grad_norm_var": 0.1636138916015625, "learning_rate": 0.0001, "loss": 3.0108, "loss/crossentropy": 2.823889398574829, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.24069934040308, "loss/reg": 0.0, "step": 28340 }, { "epoch": 0.18651315789473685, "grad_norm": 2.234375, "grad_norm_var": 0.10241597493489583, "learning_rate": 0.0001, "loss": 3.0001, "loss/crossentropy": 2.35920227766037, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.26395892798900605, "loss/reg": 0.0, "step": 28350 }, { "epoch": 0.18657894736842107, "grad_norm": 2.4375, "grad_norm_var": 0.08288472493489583, "learning_rate": 0.0001, "loss": 3.0675, "loss/crossentropy": 2.070932924747467, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.20600728541612626, "loss/reg": 0.0, "step": 28360 }, { "epoch": 0.18664473684210525, "grad_norm": 2.125, "grad_norm_var": 0.07918192545572916, "learning_rate": 0.0001, "loss": 3.017, "loss/crossentropy": 2.307896840572357, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.25419241189956665, "loss/reg": 0.0, "step": 28370 }, { "epoch": 0.18671052631578947, "grad_norm": 2.40625, "grad_norm_var": 0.05845947265625, "learning_rate": 0.0001, "loss": 3.0325, "loss/crossentropy": 2.482656693458557, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.24442674964666367, "loss/reg": 0.0, "step": 28380 }, { "epoch": 0.18677631578947368, "grad_norm": 2.140625, "grad_norm_var": 0.12398173014322916, "learning_rate": 0.0001, "loss": 3.1446, "loss/crossentropy": 2.082607638835907, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.2496044546365738, "loss/reg": 0.0, "step": 28390 }, { "epoch": 0.1868421052631579, "grad_norm": 2.09375, "grad_norm_var": 0.0723785400390625, "learning_rate": 0.0001, "loss": 3.071, "loss/crossentropy": 2.2511567950248716, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.26172232031822207, "loss/reg": 0.0, "step": 28400 }, { "epoch": 0.1869078947368421, "grad_norm": 3.09375, "grad_norm_var": 0.08268229166666667, "learning_rate": 0.0001, "loss": 3.1149, "loss/crossentropy": 2.275085437297821, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.23399723172187806, "loss/reg": 0.0, "step": 28410 }, { "epoch": 0.18697368421052632, "grad_norm": 2.4375, "grad_norm_var": 0.1056793212890625, "learning_rate": 0.0001, "loss": 3.0149, "loss/crossentropy": 2.443694305419922, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.26309441328048705, "loss/reg": 0.0, "step": 28420 }, { "epoch": 0.18703947368421053, "grad_norm": 2.578125, "grad_norm_var": 0.05745340983072917, "learning_rate": 0.0001, "loss": 3.0672, "loss/crossentropy": 2.1224380493164063, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.23011831045150757, "loss/reg": 0.0, "step": 28430 }, { "epoch": 0.18710526315789475, "grad_norm": 2.015625, "grad_norm_var": 0.1089752197265625, "learning_rate": 0.0001, "loss": 3.0275, "loss/crossentropy": 2.3803366303443907, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.24528718441724778, "loss/reg": 0.0, "step": 28440 }, { "epoch": 0.18717105263157896, "grad_norm": 2.015625, "grad_norm_var": 0.1678375244140625, "learning_rate": 0.0001, "loss": 3.0534, "loss/crossentropy": 2.254507315158844, "loss/hidden": 2.9875, "loss/incoh": 0.0, "loss/logits": 0.23182358890771865, "loss/reg": 0.0, "step": 28450 }, { "epoch": 0.18723684210526315, "grad_norm": 2.296875, "grad_norm_var": 0.0425201416015625, "learning_rate": 0.0001, "loss": 3.0644, "loss/crossentropy": 2.387832987308502, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.22640825510025026, "loss/reg": 0.0, "step": 28460 }, { "epoch": 0.18730263157894736, "grad_norm": 2.359375, "grad_norm_var": 0.03277587890625, "learning_rate": 0.0001, "loss": 3.0762, "loss/crossentropy": 2.4295106649398805, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.2517640799283981, "loss/reg": 0.0, "step": 28470 }, { "epoch": 0.18736842105263157, "grad_norm": 1.96875, "grad_norm_var": 0.15917561848958334, "learning_rate": 0.0001, "loss": 3.0049, "loss/crossentropy": 2.472483456134796, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.21084701418876647, "loss/reg": 0.0, "step": 28480 }, { "epoch": 0.1874342105263158, "grad_norm": 2.5, "grad_norm_var": 0.15578511555989583, "learning_rate": 0.0001, "loss": 3.1143, "loss/crossentropy": 1.9477856278419494, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.26639992743730545, "loss/reg": 0.0, "step": 28490 }, { "epoch": 0.1875, "grad_norm": 2.375, "grad_norm_var": 0.022737630208333335, "learning_rate": 0.0001, "loss": 3.0663, "loss/crossentropy": 2.3596234798431395, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.27315576672554015, "loss/reg": 0.0, "step": 28500 }, { "epoch": 0.1875657894736842, "grad_norm": 2.34375, "grad_norm_var": 0.013753255208333334, "learning_rate": 0.0001, "loss": 3.0261, "loss/crossentropy": 2.196711075305939, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.23315538316965104, "loss/reg": 0.0, "step": 28510 }, { "epoch": 0.18763157894736843, "grad_norm": 2.453125, "grad_norm_var": 0.05598119099934896, "learning_rate": 0.0001, "loss": 3.1013, "loss/crossentropy": 2.1932213962078095, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2575216740369797, "loss/reg": 0.0, "step": 28520 }, { "epoch": 0.18769736842105264, "grad_norm": 2.484375, "grad_norm_var": 0.0827166239420573, "learning_rate": 0.0001, "loss": 3.0802, "loss/crossentropy": 2.1390640258789064, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.21492594629526138, "loss/reg": 0.0, "step": 28530 }, { "epoch": 0.18776315789473685, "grad_norm": 2.5, "grad_norm_var": 0.38186848958333336, "learning_rate": 0.0001, "loss": 3.1353, "loss/crossentropy": 2.3883109092712402, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.24425326883792878, "loss/reg": 0.0, "step": 28540 }, { "epoch": 0.18782894736842104, "grad_norm": 2.0625, "grad_norm_var": 0.30573628743489584, "learning_rate": 0.0001, "loss": 3.0361, "loss/crossentropy": 2.115387570858002, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.287911082804203, "loss/reg": 0.0, "step": 28550 }, { "epoch": 0.18789473684210525, "grad_norm": 2.671875, "grad_norm_var": 0.07654622395833334, "learning_rate": 0.0001, "loss": 3.1181, "loss/crossentropy": 2.195990490913391, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.22947419285774232, "loss/reg": 0.0, "step": 28560 }, { "epoch": 0.18796052631578947, "grad_norm": 2.359375, "grad_norm_var": 0.07486572265625, "learning_rate": 0.0001, "loss": 3.0473, "loss/crossentropy": 2.4482214570045473, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.24045743197202682, "loss/reg": 0.0, "step": 28570 }, { "epoch": 0.18802631578947368, "grad_norm": 2.265625, "grad_norm_var": 0.03916600545247396, "learning_rate": 0.0001, "loss": 3.0881, "loss/crossentropy": 2.3246118783950807, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.3818572014570236, "loss/reg": 0.0, "step": 28580 }, { "epoch": 0.1880921052631579, "grad_norm": 2.5625, "grad_norm_var": 0.16263427734375, "learning_rate": 0.0001, "loss": 3.0568, "loss/crossentropy": 2.3318132519721986, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.25452076345682145, "loss/reg": 0.0, "step": 28590 }, { "epoch": 0.1881578947368421, "grad_norm": 2.484375, "grad_norm_var": 0.20039443969726561, "learning_rate": 0.0001, "loss": 3.0826, "loss/crossentropy": 2.0651711583137513, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.20308290272951127, "loss/reg": 0.0, "step": 28600 }, { "epoch": 0.18822368421052632, "grad_norm": 2.53125, "grad_norm_var": 0.16236750284830728, "learning_rate": 0.0001, "loss": 3.0752, "loss/crossentropy": 2.307542312145233, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.2218876764178276, "loss/reg": 0.0, "step": 28610 }, { "epoch": 0.18828947368421053, "grad_norm": 4.53125, "grad_norm_var": 0.3535011291503906, "learning_rate": 0.0001, "loss": 3.0657, "loss/crossentropy": 2.2695202469825744, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.24460041224956514, "loss/reg": 0.0, "step": 28620 }, { "epoch": 0.18835526315789475, "grad_norm": 2.15625, "grad_norm_var": 0.34776102701822914, "learning_rate": 0.0001, "loss": 3.0931, "loss/crossentropy": 2.409075605869293, "loss/hidden": 3.2140625, "loss/incoh": 0.0, "loss/logits": 0.3033790022134781, "loss/reg": 0.0, "step": 28630 }, { "epoch": 0.18842105263157893, "grad_norm": 2.21875, "grad_norm_var": 0.04940999348958333, "learning_rate": 0.0001, "loss": 3.0248, "loss/crossentropy": 2.3460227608680726, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.24943882077932358, "loss/reg": 0.0, "step": 28640 }, { "epoch": 0.18848684210526315, "grad_norm": 2.078125, "grad_norm_var": 0.18352762858072916, "learning_rate": 0.0001, "loss": 3.0946, "loss/crossentropy": 2.8061397314071654, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2542914628982544, "loss/reg": 0.0, "step": 28650 }, { "epoch": 0.18855263157894736, "grad_norm": 2.203125, "grad_norm_var": 0.05283203125, "learning_rate": 0.0001, "loss": 3.023, "loss/crossentropy": 2.0528534650802612, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.2569132924079895, "loss/reg": 0.0, "step": 28660 }, { "epoch": 0.18861842105263157, "grad_norm": 2.171875, "grad_norm_var": 0.026520792643229166, "learning_rate": 0.0001, "loss": 2.9614, "loss/crossentropy": 2.2640815138816834, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.25332382023334504, "loss/reg": 0.0, "step": 28670 }, { "epoch": 0.1886842105263158, "grad_norm": 1.953125, "grad_norm_var": 0.037451171875, "learning_rate": 0.0001, "loss": 3.0214, "loss/crossentropy": 2.0984261095523835, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.27365772873163224, "loss/reg": 0.0, "step": 28680 }, { "epoch": 0.18875, "grad_norm": 2.703125, "grad_norm_var": 0.07590738932291667, "learning_rate": 0.0001, "loss": 3.089, "loss/crossentropy": 2.2889229536056517, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2945134401321411, "loss/reg": 0.0, "step": 28690 }, { "epoch": 0.18881578947368421, "grad_norm": 2.234375, "grad_norm_var": 0.04885152180989583, "learning_rate": 0.0001, "loss": 3.0409, "loss/crossentropy": 2.164911460876465, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.22368502318859101, "loss/reg": 0.0, "step": 28700 }, { "epoch": 0.18888157894736843, "grad_norm": 2.109375, "grad_norm_var": 0.1095855712890625, "learning_rate": 0.0001, "loss": 3.003, "loss/crossentropy": 2.2275232434272767, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.21968200653791428, "loss/reg": 0.0, "step": 28710 }, { "epoch": 0.18894736842105264, "grad_norm": 2.5625, "grad_norm_var": 0.05891494750976563, "learning_rate": 0.0001, "loss": 2.985, "loss/crossentropy": 2.429938530921936, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.22446376383304595, "loss/reg": 0.0, "step": 28720 }, { "epoch": 0.18901315789473686, "grad_norm": 1.984375, "grad_norm_var": 0.13444722493489583, "learning_rate": 0.0001, "loss": 2.9926, "loss/crossentropy": 2.139791202545166, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2599222779273987, "loss/reg": 0.0, "step": 28730 }, { "epoch": 0.18907894736842104, "grad_norm": 2.8125, "grad_norm_var": 0.15026041666666667, "learning_rate": 0.0001, "loss": 2.9502, "loss/crossentropy": 2.5147767782211305, "loss/hidden": 2.609375, "loss/incoh": 0.0, "loss/logits": 0.20266549810767173, "loss/reg": 0.0, "step": 28740 }, { "epoch": 0.18914473684210525, "grad_norm": 3.578125, "grad_norm_var": 0.20552978515625, "learning_rate": 0.0001, "loss": 3.051, "loss/crossentropy": 2.4063800454139708, "loss/hidden": 3.08125, "loss/incoh": 0.0, "loss/logits": 0.3112260654568672, "loss/reg": 0.0, "step": 28750 }, { "epoch": 0.18921052631578947, "grad_norm": 2.453125, "grad_norm_var": 0.14830322265625, "learning_rate": 0.0001, "loss": 3.057, "loss/crossentropy": 2.2508232951164246, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.22451825439929962, "loss/reg": 0.0, "step": 28760 }, { "epoch": 0.18927631578947368, "grad_norm": 2.875, "grad_norm_var": 0.11416727701822917, "learning_rate": 0.0001, "loss": 3.0512, "loss/crossentropy": 2.149078315496445, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.21534078270196916, "loss/reg": 0.0, "step": 28770 }, { "epoch": 0.1893421052631579, "grad_norm": 2.265625, "grad_norm_var": 0.61968994140625, "learning_rate": 0.0001, "loss": 3.0034, "loss/crossentropy": 2.3696099877357484, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.2401156485080719, "loss/reg": 0.0, "step": 28780 }, { "epoch": 0.1894078947368421, "grad_norm": 2.203125, "grad_norm_var": 0.4911092122395833, "learning_rate": 0.0001, "loss": 3.0951, "loss/crossentropy": 2.0147340178489683, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.2364984579384327, "loss/reg": 0.0, "step": 28790 }, { "epoch": 0.18947368421052632, "grad_norm": 2.203125, "grad_norm_var": 0.06545817057291667, "learning_rate": 0.0001, "loss": 3.0985, "loss/crossentropy": 2.4977503657341003, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.25097098499536513, "loss/reg": 0.0, "step": 28800 }, { "epoch": 0.18953947368421054, "grad_norm": 1.96875, "grad_norm_var": 0.08662007649739584, "learning_rate": 0.0001, "loss": 3.0047, "loss/crossentropy": 2.1395092368125916, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.20759201645851136, "loss/reg": 0.0, "step": 28810 }, { "epoch": 0.18960526315789475, "grad_norm": 2.78125, "grad_norm_var": 0.09783426920572917, "learning_rate": 0.0001, "loss": 3.0329, "loss/crossentropy": 2.3047179579734802, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.22515993937849998, "loss/reg": 0.0, "step": 28820 }, { "epoch": 0.18967105263157893, "grad_norm": 2.609375, "grad_norm_var": 0.07600504557291667, "learning_rate": 0.0001, "loss": 3.0786, "loss/crossentropy": 2.4169238924980165, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.23725131005048752, "loss/reg": 0.0, "step": 28830 }, { "epoch": 0.18973684210526315, "grad_norm": 2.78125, "grad_norm_var": 0.09014460245768229, "learning_rate": 0.0001, "loss": 3.0935, "loss/crossentropy": 2.473812985420227, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.23891154676675797, "loss/reg": 0.0, "step": 28840 }, { "epoch": 0.18980263157894736, "grad_norm": 2.265625, "grad_norm_var": 0.3475870768229167, "learning_rate": 0.0001, "loss": 3.0779, "loss/crossentropy": 2.1757280230522156, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.2538435861468315, "loss/reg": 0.0, "step": 28850 }, { "epoch": 0.18986842105263158, "grad_norm": 2.4375, "grad_norm_var": 0.2500233968098958, "learning_rate": 0.0001, "loss": 3.0282, "loss/crossentropy": 2.321690630912781, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2390053778886795, "loss/reg": 0.0, "step": 28860 }, { "epoch": 0.1899342105263158, "grad_norm": 2.765625, "grad_norm_var": 0.12604878743489584, "learning_rate": 0.0001, "loss": 3.057, "loss/crossentropy": 2.0470433115959166, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.24679146856069564, "loss/reg": 0.0, "step": 28870 }, { "epoch": 0.19, "grad_norm": 2.921875, "grad_norm_var": 0.17348531087239583, "learning_rate": 0.0001, "loss": 3.1491, "loss/crossentropy": 2.022314542531967, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.23445970118045806, "loss/reg": 0.0, "step": 28880 }, { "epoch": 0.19006578947368422, "grad_norm": 2.515625, "grad_norm_var": 0.06368815104166667, "learning_rate": 0.0001, "loss": 3.0043, "loss/crossentropy": 2.0953749775886537, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23144432604312898, "loss/reg": 0.0, "step": 28890 }, { "epoch": 0.19013157894736843, "grad_norm": 2.0, "grad_norm_var": 0.04041239420572917, "learning_rate": 0.0001, "loss": 3.0328, "loss/crossentropy": 2.2653061032295225, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.23457860052585602, "loss/reg": 0.0, "step": 28900 }, { "epoch": 0.19019736842105264, "grad_norm": 2.359375, "grad_norm_var": 0.04980061848958333, "learning_rate": 0.0001, "loss": 3.0439, "loss/crossentropy": 2.222883141040802, "loss/hidden": 3.0453125, "loss/incoh": 0.0, "loss/logits": 0.2797882482409477, "loss/reg": 0.0, "step": 28910 }, { "epoch": 0.19026315789473683, "grad_norm": 2.296875, "grad_norm_var": 0.0666168212890625, "learning_rate": 0.0001, "loss": 3.0709, "loss/crossentropy": 2.3731595158576964, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.21689550429582596, "loss/reg": 0.0, "step": 28920 }, { "epoch": 0.19032894736842104, "grad_norm": 3.5, "grad_norm_var": 0.15308837890625, "learning_rate": 0.0001, "loss": 3.0906, "loss/crossentropy": 2.3481295704841614, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.20085408240556718, "loss/reg": 0.0, "step": 28930 }, { "epoch": 0.19039473684210526, "grad_norm": 2.59375, "grad_norm_var": 0.427294667561849, "learning_rate": 0.0001, "loss": 3.0422, "loss/crossentropy": 2.1098085761070253, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.20854987427592278, "loss/reg": 0.0, "step": 28940 }, { "epoch": 0.19046052631578947, "grad_norm": 2.609375, "grad_norm_var": 2.635267893473307, "learning_rate": 0.0001, "loss": 3.0915, "loss/crossentropy": 2.1917474389076235, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.20776809751987457, "loss/reg": 0.0, "step": 28950 }, { "epoch": 0.19052631578947368, "grad_norm": 2.125, "grad_norm_var": 0.062235514322916664, "learning_rate": 0.0001, "loss": 3.0466, "loss/crossentropy": 2.0871485114097594, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.21217636987566948, "loss/reg": 0.0, "step": 28960 }, { "epoch": 0.1905921052631579, "grad_norm": 2.1875, "grad_norm_var": 0.028055826822916668, "learning_rate": 0.0001, "loss": 3.0856, "loss/crossentropy": 2.1593764424324036, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.20974511429667472, "loss/reg": 0.0, "step": 28970 }, { "epoch": 0.1906578947368421, "grad_norm": 2.234375, "grad_norm_var": 0.09622395833333333, "learning_rate": 0.0001, "loss": 2.983, "loss/crossentropy": 2.2977413177490233, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.25048716068267823, "loss/reg": 0.0, "step": 28980 }, { "epoch": 0.19072368421052632, "grad_norm": 2.4375, "grad_norm_var": 0.11470947265625, "learning_rate": 0.0001, "loss": 3.0439, "loss/crossentropy": 2.235085117816925, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.2309124991297722, "loss/reg": 0.0, "step": 28990 }, { "epoch": 0.19078947368421054, "grad_norm": 2.28125, "grad_norm_var": 0.23596598307291666, "learning_rate": 0.0001, "loss": 3.0173, "loss/crossentropy": 2.5475085377693176, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.25446673631668093, "loss/reg": 0.0, "step": 29000 }, { "epoch": 0.19085526315789475, "grad_norm": 2.59375, "grad_norm_var": 0.18063151041666667, "learning_rate": 0.0001, "loss": 3.0599, "loss/crossentropy": 2.327496898174286, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2621834442019463, "loss/reg": 0.0, "step": 29010 }, { "epoch": 0.19092105263157894, "grad_norm": 2.546875, "grad_norm_var": 0.0766754150390625, "learning_rate": 0.0001, "loss": 3.0494, "loss/crossentropy": 2.0966503024101257, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.21290977895259858, "loss/reg": 0.0, "step": 29020 }, { "epoch": 0.19098684210526315, "grad_norm": 2.3125, "grad_norm_var": 0.0940338134765625, "learning_rate": 0.0001, "loss": 3.0697, "loss/crossentropy": 2.193461000919342, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.2083764299750328, "loss/reg": 0.0, "step": 29030 }, { "epoch": 0.19105263157894736, "grad_norm": 2.484375, "grad_norm_var": 0.20330785115559896, "learning_rate": 0.0001, "loss": 3.0073, "loss/crossentropy": 2.1486512422561646, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2170367144048214, "loss/reg": 0.0, "step": 29040 }, { "epoch": 0.19111842105263158, "grad_norm": 2.4375, "grad_norm_var": 0.05134989420572917, "learning_rate": 0.0001, "loss": 2.9996, "loss/crossentropy": 2.359115946292877, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.2235071614384651, "loss/reg": 0.0, "step": 29050 }, { "epoch": 0.1911842105263158, "grad_norm": 2.140625, "grad_norm_var": 0.05135498046875, "learning_rate": 0.0001, "loss": 3.0619, "loss/crossentropy": 2.4227041006088257, "loss/hidden": 2.978125, "loss/incoh": 0.0, "loss/logits": 0.316775880753994, "loss/reg": 0.0, "step": 29060 }, { "epoch": 0.19125, "grad_norm": 3.375, "grad_norm_var": 0.10562235514322917, "learning_rate": 0.0001, "loss": 3.0286, "loss/crossentropy": 2.532416009902954, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.2569827824831009, "loss/reg": 0.0, "step": 29070 }, { "epoch": 0.19131578947368422, "grad_norm": 2.21875, "grad_norm_var": 0.11773173014322917, "learning_rate": 0.0001, "loss": 3.0518, "loss/crossentropy": 2.35415917634964, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.23699928000569342, "loss/reg": 0.0, "step": 29080 }, { "epoch": 0.19138157894736843, "grad_norm": 2.234375, "grad_norm_var": 0.03673502604166667, "learning_rate": 0.0001, "loss": 3.0043, "loss/crossentropy": 2.044126057624817, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.23696889132261276, "loss/reg": 0.0, "step": 29090 }, { "epoch": 0.19144736842105264, "grad_norm": 2.3125, "grad_norm_var": 0.08551432291666666, "learning_rate": 0.0001, "loss": 3.1223, "loss/crossentropy": 2.4426622867584227, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.251794858276844, "loss/reg": 0.0, "step": 29100 }, { "epoch": 0.19151315789473683, "grad_norm": 1.9921875, "grad_norm_var": 0.2218523661295573, "learning_rate": 0.0001, "loss": 3.069, "loss/crossentropy": 2.5686187982559203, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.250452946126461, "loss/reg": 0.0, "step": 29110 }, { "epoch": 0.19157894736842104, "grad_norm": 2.328125, "grad_norm_var": 0.4294288635253906, "learning_rate": 0.0001, "loss": 3.0377, "loss/crossentropy": 2.2918658018112184, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.21649599373340606, "loss/reg": 0.0, "step": 29120 }, { "epoch": 0.19164473684210526, "grad_norm": 2.234375, "grad_norm_var": 0.028401692708333332, "learning_rate": 0.0001, "loss": 3.1086, "loss/crossentropy": 2.3098790526390074, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.2144581601023674, "loss/reg": 0.0, "step": 29130 }, { "epoch": 0.19171052631578947, "grad_norm": 2.171875, "grad_norm_var": 0.020048014322916665, "learning_rate": 0.0001, "loss": 3.0103, "loss/crossentropy": 2.470157337188721, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.2766988605260849, "loss/reg": 0.0, "step": 29140 }, { "epoch": 0.19177631578947368, "grad_norm": 2.296875, "grad_norm_var": 0.02623291015625, "learning_rate": 0.0001, "loss": 2.9856, "loss/crossentropy": 2.317296016216278, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2372336909174919, "loss/reg": 0.0, "step": 29150 }, { "epoch": 0.1918421052631579, "grad_norm": 2.5, "grad_norm_var": 0.0629547119140625, "learning_rate": 0.0001, "loss": 3.0697, "loss/crossentropy": 2.1767319798469544, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.2674897871911526, "loss/reg": 0.0, "step": 29160 }, { "epoch": 0.1919078947368421, "grad_norm": 2.46875, "grad_norm_var": 0.05247294108072917, "learning_rate": 0.0001, "loss": 2.9929, "loss/crossentropy": 2.2281210064888, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.24070803225040435, "loss/reg": 0.0, "step": 29170 }, { "epoch": 0.19197368421052632, "grad_norm": 2.484375, "grad_norm_var": 0.30436909993489586, "learning_rate": 0.0001, "loss": 3.0128, "loss/crossentropy": 2.2079304993152618, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.20276689082384108, "loss/reg": 0.0, "step": 29180 }, { "epoch": 0.19203947368421054, "grad_norm": 2.28125, "grad_norm_var": 0.331884765625, "learning_rate": 0.0001, "loss": 3.0248, "loss/crossentropy": 2.3135082483291627, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.2161063551902771, "loss/reg": 0.0, "step": 29190 }, { "epoch": 0.19210526315789472, "grad_norm": 2.28125, "grad_norm_var": 0.15064697265625, "learning_rate": 0.0001, "loss": 3.0372, "loss/crossentropy": 2.377930212020874, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.22175273448228836, "loss/reg": 0.0, "step": 29200 }, { "epoch": 0.19217105263157894, "grad_norm": 2.125, "grad_norm_var": 0.12327372233072917, "learning_rate": 0.0001, "loss": 3.1012, "loss/crossentropy": 2.4531391739845274, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.2535667777061462, "loss/reg": 0.0, "step": 29210 }, { "epoch": 0.19223684210526315, "grad_norm": 2.328125, "grad_norm_var": 0.07448628743489584, "learning_rate": 0.0001, "loss": 3.1458, "loss/crossentropy": 2.2929787397384644, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.25961560308933257, "loss/reg": 0.0, "step": 29220 }, { "epoch": 0.19230263157894736, "grad_norm": 2.4375, "grad_norm_var": 0.04540608723958333, "learning_rate": 0.0001, "loss": 3.0156, "loss/crossentropy": 2.265605902671814, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.27446902841329573, "loss/reg": 0.0, "step": 29230 }, { "epoch": 0.19236842105263158, "grad_norm": 2.328125, "grad_norm_var": 0.06565348307291667, "learning_rate": 0.0001, "loss": 3.1152, "loss/crossentropy": 2.376231300830841, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.2864221647381783, "loss/reg": 0.0, "step": 29240 }, { "epoch": 0.1924342105263158, "grad_norm": 2.28125, "grad_norm_var": 0.06280085245768229, "learning_rate": 0.0001, "loss": 3.07, "loss/crossentropy": 2.394097375869751, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2377302184700966, "loss/reg": 0.0, "step": 29250 }, { "epoch": 0.1925, "grad_norm": 3.40625, "grad_norm_var": 0.122607421875, "learning_rate": 0.0001, "loss": 3.1114, "loss/crossentropy": 2.5634241580963133, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.22698014378547668, "loss/reg": 0.0, "step": 29260 }, { "epoch": 0.19256578947368422, "grad_norm": 3.171875, "grad_norm_var": 0.11873372395833333, "learning_rate": 0.0001, "loss": 3.0884, "loss/crossentropy": 2.3539575576782226, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.2572378695011139, "loss/reg": 0.0, "step": 29270 }, { "epoch": 0.19263157894736843, "grad_norm": 2.640625, "grad_norm_var": 0.14241536458333334, "learning_rate": 0.0001, "loss": 3.0603, "loss/crossentropy": 2.3354499697685243, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2519584596157074, "loss/reg": 0.0, "step": 29280 }, { "epoch": 0.19269736842105264, "grad_norm": 3.5625, "grad_norm_var": 0.3069081624348958, "learning_rate": 0.0001, "loss": 3.0894, "loss/crossentropy": 2.0824127376079558, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.28106222823262217, "loss/reg": 0.0, "step": 29290 }, { "epoch": 0.19276315789473683, "grad_norm": 2.328125, "grad_norm_var": 0.1501617431640625, "learning_rate": 0.0001, "loss": 3.0121, "loss/crossentropy": 2.1776723742485045, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.20980774164199828, "loss/reg": 0.0, "step": 29300 }, { "epoch": 0.19282894736842104, "grad_norm": 2.796875, "grad_norm_var": 0.2937164306640625, "learning_rate": 0.0001, "loss": 3.1225, "loss/crossentropy": 2.157558262348175, "loss/hidden": 3.0953125, "loss/incoh": 0.0, "loss/logits": 0.2775505542755127, "loss/reg": 0.0, "step": 29310 }, { "epoch": 0.19289473684210526, "grad_norm": 3.734375, "grad_norm_var": 0.33560791015625, "learning_rate": 0.0001, "loss": 3.0748, "loss/crossentropy": 2.381651961803436, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.2246159717440605, "loss/reg": 0.0, "step": 29320 }, { "epoch": 0.19296052631578947, "grad_norm": 2.84375, "grad_norm_var": 2.1095540364583334, "learning_rate": 0.0001, "loss": 3.2928, "loss/crossentropy": 2.0944801807403564, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.23710142374038695, "loss/reg": 0.0, "step": 29330 }, { "epoch": 0.19302631578947368, "grad_norm": 1.9453125, "grad_norm_var": 0.2770851135253906, "learning_rate": 0.0001, "loss": 3.1, "loss/crossentropy": 2.4732042074203493, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.23788287490606308, "loss/reg": 0.0, "step": 29340 }, { "epoch": 0.1930921052631579, "grad_norm": 2.046875, "grad_norm_var": 0.35339330037434896, "learning_rate": 0.0001, "loss": 3.0744, "loss/crossentropy": 2.4158215165138244, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.23991909474134446, "loss/reg": 0.0, "step": 29350 }, { "epoch": 0.1931578947368421, "grad_norm": 4.03125, "grad_norm_var": 0.3200754801432292, "learning_rate": 0.0001, "loss": 3.1003, "loss/crossentropy": 2.448209798336029, "loss/hidden": 3.3078125, "loss/incoh": 0.0, "loss/logits": 0.3389866009354591, "loss/reg": 0.0, "step": 29360 }, { "epoch": 0.19322368421052633, "grad_norm": 2.796875, "grad_norm_var": 0.22730712890625, "learning_rate": 0.0001, "loss": 3.1074, "loss/crossentropy": 1.9778976082801818, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.2316925495862961, "loss/reg": 0.0, "step": 29370 }, { "epoch": 0.19328947368421054, "grad_norm": 2.390625, "grad_norm_var": 0.09205729166666667, "learning_rate": 0.0001, "loss": 3.1044, "loss/crossentropy": 2.344563841819763, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.2164514496922493, "loss/reg": 0.0, "step": 29380 }, { "epoch": 0.19335526315789472, "grad_norm": 2.3125, "grad_norm_var": 0.07641499837239583, "learning_rate": 0.0001, "loss": 3.1445, "loss/crossentropy": 2.186362612247467, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.21316134929656982, "loss/reg": 0.0, "step": 29390 }, { "epoch": 0.19342105263157894, "grad_norm": 2.265625, "grad_norm_var": 0.5996897379557292, "learning_rate": 0.0001, "loss": 3.11, "loss/crossentropy": 2.546473169326782, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.22721548825502397, "loss/reg": 0.0, "step": 29400 }, { "epoch": 0.19348684210526315, "grad_norm": 2.34375, "grad_norm_var": 0.6576881408691406, "learning_rate": 0.0001, "loss": 2.9865, "loss/crossentropy": 2.210515594482422, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.20338982120156288, "loss/reg": 0.0, "step": 29410 }, { "epoch": 0.19355263157894737, "grad_norm": 2.46875, "grad_norm_var": 0.06096572875976562, "learning_rate": 0.0001, "loss": 3.0997, "loss/crossentropy": 2.5028133630752563, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.3376652091741562, "loss/reg": 0.0, "step": 29420 }, { "epoch": 0.19361842105263158, "grad_norm": 2.265625, "grad_norm_var": 0.25368626912434894, "learning_rate": 0.0001, "loss": 3.0177, "loss/crossentropy": 2.2334850907325743, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.2385883465409279, "loss/reg": 0.0, "step": 29430 }, { "epoch": 0.1936842105263158, "grad_norm": 1.9765625, "grad_norm_var": 0.24181493123372397, "learning_rate": 0.0001, "loss": 3.0405, "loss/crossentropy": 1.898663866519928, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.20240384489297866, "loss/reg": 0.0, "step": 29440 }, { "epoch": 0.19375, "grad_norm": 2.515625, "grad_norm_var": 0.03530044555664062, "learning_rate": 0.0001, "loss": 3.0841, "loss/crossentropy": 2.3499651670455934, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.27134826183319094, "loss/reg": 0.0, "step": 29450 }, { "epoch": 0.19381578947368422, "grad_norm": 2.28125, "grad_norm_var": 0.02574462890625, "learning_rate": 0.0001, "loss": 3.0395, "loss/crossentropy": 2.3818777322769167, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.25071706622838974, "loss/reg": 0.0, "step": 29460 }, { "epoch": 0.19388157894736843, "grad_norm": 2.125, "grad_norm_var": 0.018602498372395835, "learning_rate": 0.0001, "loss": 3.0232, "loss/crossentropy": 2.1610496282577514, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2543530076742172, "loss/reg": 0.0, "step": 29470 }, { "epoch": 0.19394736842105262, "grad_norm": 2.828125, "grad_norm_var": 0.12858784993489583, "learning_rate": 0.0001, "loss": 3.0346, "loss/crossentropy": 2.3739108026027678, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.22633943632245063, "loss/reg": 0.0, "step": 29480 }, { "epoch": 0.19401315789473683, "grad_norm": 2.1875, "grad_norm_var": 0.116162109375, "learning_rate": 0.0001, "loss": 3.0797, "loss/crossentropy": 2.3364730775356293, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.2235547423362732, "loss/reg": 0.0, "step": 29490 }, { "epoch": 0.19407894736842105, "grad_norm": 2.5625, "grad_norm_var": 0.04695612589518229, "learning_rate": 0.0001, "loss": 3.0498, "loss/crossentropy": 2.2981515765190124, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.3249427303671837, "loss/reg": 0.0, "step": 29500 }, { "epoch": 0.19414473684210526, "grad_norm": 2.421875, "grad_norm_var": 0.06395848592122395, "learning_rate": 0.0001, "loss": 3.0062, "loss/crossentropy": 2.392876994609833, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.2336449593305588, "loss/reg": 0.0, "step": 29510 }, { "epoch": 0.19421052631578947, "grad_norm": 2.3125, "grad_norm_var": 0.07955322265625, "learning_rate": 0.0001, "loss": 3.0426, "loss/crossentropy": 2.3927242517471314, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.23498377203941345, "loss/reg": 0.0, "step": 29520 }, { "epoch": 0.19427631578947369, "grad_norm": 2.203125, "grad_norm_var": 0.030354817708333332, "learning_rate": 0.0001, "loss": 3.0498, "loss/crossentropy": 1.9698969006538392, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2563909277319908, "loss/reg": 0.0, "step": 29530 }, { "epoch": 0.1943421052631579, "grad_norm": 2.703125, "grad_norm_var": 0.08201395670572917, "learning_rate": 0.0001, "loss": 3.0928, "loss/crossentropy": 2.228722929954529, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.2367407873272896, "loss/reg": 0.0, "step": 29540 }, { "epoch": 0.1944078947368421, "grad_norm": 2.453125, "grad_norm_var": 0.067626953125, "learning_rate": 0.0001, "loss": 3.0422, "loss/crossentropy": 2.4088454604148866, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.24344853162765503, "loss/reg": 0.0, "step": 29550 }, { "epoch": 0.19447368421052633, "grad_norm": 2.3125, "grad_norm_var": 0.07860921223958334, "learning_rate": 0.0001, "loss": 3.06, "loss/crossentropy": 2.1968480169773104, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.22112407311797141, "loss/reg": 0.0, "step": 29560 }, { "epoch": 0.19453947368421054, "grad_norm": 2.296875, "grad_norm_var": 0.2692942301432292, "learning_rate": 0.0001, "loss": 3.0565, "loss/crossentropy": 2.3560967564582826, "loss/hidden": 3.040625, "loss/incoh": 0.0, "loss/logits": 0.2932572916150093, "loss/reg": 0.0, "step": 29570 }, { "epoch": 0.19460526315789473, "grad_norm": 2.109375, "grad_norm_var": 0.30523859659830727, "learning_rate": 0.0001, "loss": 3.0003, "loss/crossentropy": 2.0988304018974304, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.24731282144784927, "loss/reg": 0.0, "step": 29580 }, { "epoch": 0.19467105263157894, "grad_norm": 2.703125, "grad_norm_var": 0.055944569905598956, "learning_rate": 0.0001, "loss": 3.0501, "loss/crossentropy": 2.2525598287582396, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.20050241351127623, "loss/reg": 0.0, "step": 29590 }, { "epoch": 0.19473684210526315, "grad_norm": 2.421875, "grad_norm_var": 0.038557942708333334, "learning_rate": 0.0001, "loss": 3.0499, "loss/crossentropy": 2.117692744731903, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.20816649496555328, "loss/reg": 0.0, "step": 29600 }, { "epoch": 0.19480263157894737, "grad_norm": 2.140625, "grad_norm_var": 0.048628743489583334, "learning_rate": 0.0001, "loss": 3.0671, "loss/crossentropy": 2.2347262859344483, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.24547438323497772, "loss/reg": 0.0, "step": 29610 }, { "epoch": 0.19486842105263158, "grad_norm": 2.171875, "grad_norm_var": 0.1297027587890625, "learning_rate": 0.0001, "loss": 3.1136, "loss/crossentropy": 2.1894510865211485, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.26874606013298036, "loss/reg": 0.0, "step": 29620 }, { "epoch": 0.1949342105263158, "grad_norm": 2.796875, "grad_norm_var": 0.11825764973958333, "learning_rate": 0.0001, "loss": 3.1032, "loss/crossentropy": 2.314576745033264, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.2493584305047989, "loss/reg": 0.0, "step": 29630 }, { "epoch": 0.195, "grad_norm": 2.015625, "grad_norm_var": 0.18583577473958332, "learning_rate": 0.0001, "loss": 3.0667, "loss/crossentropy": 2.548297035694122, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.24562474042177201, "loss/reg": 0.0, "step": 29640 }, { "epoch": 0.19506578947368422, "grad_norm": 2.90625, "grad_norm_var": 0.21096903483072918, "learning_rate": 0.0001, "loss": 3.0832, "loss/crossentropy": 2.685001492500305, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.27619747072458267, "loss/reg": 0.0, "step": 29650 }, { "epoch": 0.19513157894736843, "grad_norm": 3.046875, "grad_norm_var": 0.10445556640625, "learning_rate": 0.0001, "loss": 3.0406, "loss/crossentropy": 2.471345865726471, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.233747598528862, "loss/reg": 0.0, "step": 29660 }, { "epoch": 0.19519736842105262, "grad_norm": 2.46875, "grad_norm_var": 0.1791412353515625, "learning_rate": 0.0001, "loss": 3.0572, "loss/crossentropy": 2.378851294517517, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.222721017152071, "loss/reg": 0.0, "step": 29670 }, { "epoch": 0.19526315789473683, "grad_norm": 3.328125, "grad_norm_var": 0.25057144165039064, "learning_rate": 0.0001, "loss": 3.027, "loss/crossentropy": 2.091909795999527, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.24436787366867066, "loss/reg": 0.0, "step": 29680 }, { "epoch": 0.19532894736842105, "grad_norm": 2.421875, "grad_norm_var": 0.22474339803059895, "learning_rate": 0.0001, "loss": 3.0429, "loss/crossentropy": 2.328868269920349, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.262276391685009, "loss/reg": 0.0, "step": 29690 }, { "epoch": 0.19539473684210526, "grad_norm": 2.28125, "grad_norm_var": 0.15204671223958333, "learning_rate": 0.0001, "loss": 3.0549, "loss/crossentropy": 2.296106255054474, "loss/hidden": 2.9734375, "loss/incoh": 0.0, "loss/logits": 0.2577256761491299, "loss/reg": 0.0, "step": 29700 }, { "epoch": 0.19546052631578947, "grad_norm": 2.5625, "grad_norm_var": 0.20373433430989582, "learning_rate": 0.0001, "loss": 3.0571, "loss/crossentropy": 2.495807719230652, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.2599473804235458, "loss/reg": 0.0, "step": 29710 }, { "epoch": 0.1955263157894737, "grad_norm": 2.1875, "grad_norm_var": 0.73043212890625, "learning_rate": 0.0001, "loss": 3.0451, "loss/crossentropy": 2.489153337478638, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.2395859479904175, "loss/reg": 0.0, "step": 29720 }, { "epoch": 0.1955921052631579, "grad_norm": 2.3125, "grad_norm_var": 0.09890034993489584, "learning_rate": 0.0001, "loss": 2.9846, "loss/crossentropy": 2.1876869797706604, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.20331611335277558, "loss/reg": 0.0, "step": 29730 }, { "epoch": 0.19565789473684211, "grad_norm": 2.78125, "grad_norm_var": 0.12862040201822916, "learning_rate": 0.0001, "loss": 3.0944, "loss/crossentropy": 2.548052740097046, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.23612861186265946, "loss/reg": 0.0, "step": 29740 }, { "epoch": 0.19572368421052633, "grad_norm": 2.1875, "grad_norm_var": 0.038802083333333334, "learning_rate": 0.0001, "loss": 3.0041, "loss/crossentropy": 2.450440764427185, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.20733811408281327, "loss/reg": 0.0, "step": 29750 }, { "epoch": 0.1957894736842105, "grad_norm": 2.671875, "grad_norm_var": 0.035497029622395836, "learning_rate": 0.0001, "loss": 3.0924, "loss/crossentropy": 2.022493052482605, "loss/hidden": 3.0265625, "loss/incoh": 0.0, "loss/logits": 0.26215304881334306, "loss/reg": 0.0, "step": 29760 }, { "epoch": 0.19585526315789473, "grad_norm": 6.1875, "grad_norm_var": 0.9495025634765625, "learning_rate": 0.0001, "loss": 3.0461, "loss/crossentropy": 2.2956669092178346, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.24869376718997954, "loss/reg": 0.0, "step": 29770 }, { "epoch": 0.19592105263157894, "grad_norm": 2.40625, "grad_norm_var": 1.1040974934895833, "learning_rate": 0.0001, "loss": 3.0038, "loss/crossentropy": 2.301337730884552, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.2174853652715683, "loss/reg": 0.0, "step": 29780 }, { "epoch": 0.19598684210526315, "grad_norm": 2.5625, "grad_norm_var": 0.26876627604166664, "learning_rate": 0.0001, "loss": 3.1862, "loss/crossentropy": 1.7189460813999176, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.21047319620847701, "loss/reg": 0.0, "step": 29790 }, { "epoch": 0.19605263157894737, "grad_norm": 2.3125, "grad_norm_var": 0.042464192708333334, "learning_rate": 0.0001, "loss": 3.0449, "loss/crossentropy": 2.3193385720252992, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.2548700511455536, "loss/reg": 0.0, "step": 29800 }, { "epoch": 0.19611842105263158, "grad_norm": 3.71875, "grad_norm_var": 0.27180887858072916, "learning_rate": 0.0001, "loss": 3.008, "loss/crossentropy": 2.3117380261421205, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.2046105980873108, "loss/reg": 0.0, "step": 29810 }, { "epoch": 0.1961842105263158, "grad_norm": 4.28125, "grad_norm_var": 0.516106923421224, "learning_rate": 0.0001, "loss": 3.0925, "loss/crossentropy": 2.3800249338150024, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.26439079344272615, "loss/reg": 0.0, "step": 29820 }, { "epoch": 0.19625, "grad_norm": 2.546875, "grad_norm_var": 0.3475870768229167, "learning_rate": 0.0001, "loss": 3.0886, "loss/crossentropy": 2.284978838264942, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.24723461624234916, "loss/reg": 0.0, "step": 29830 }, { "epoch": 0.19631578947368422, "grad_norm": 2.25, "grad_norm_var": 0.29708836873372396, "learning_rate": 0.0001, "loss": 3.0244, "loss/crossentropy": 2.231731951236725, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.3149738535284996, "loss/reg": 0.0, "step": 29840 }, { "epoch": 0.19638157894736843, "grad_norm": 2.296875, "grad_norm_var": 0.287847646077474, "learning_rate": 0.0001, "loss": 3.0728, "loss/crossentropy": 2.453169012069702, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.25355182588100433, "loss/reg": 0.0, "step": 29850 }, { "epoch": 0.19644736842105262, "grad_norm": 2.296875, "grad_norm_var": 0.2615386962890625, "learning_rate": 0.0001, "loss": 3.0352, "loss/crossentropy": 2.174148201942444, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.2032485894858837, "loss/reg": 0.0, "step": 29860 }, { "epoch": 0.19651315789473683, "grad_norm": 2.6875, "grad_norm_var": 0.25632705688476565, "learning_rate": 0.0001, "loss": 3.0585, "loss/crossentropy": 2.483177053928375, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.2651928335428238, "loss/reg": 0.0, "step": 29870 }, { "epoch": 0.19657894736842105, "grad_norm": 2.140625, "grad_norm_var": 0.12946370442708333, "learning_rate": 0.0001, "loss": 3.0076, "loss/crossentropy": 2.4216135859489443, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.2323567435145378, "loss/reg": 0.0, "step": 29880 }, { "epoch": 0.19664473684210526, "grad_norm": 2.1875, "grad_norm_var": 0.07423477172851563, "learning_rate": 0.0001, "loss": 2.9992, "loss/crossentropy": 2.43337767124176, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2374792516231537, "loss/reg": 0.0, "step": 29890 }, { "epoch": 0.19671052631578947, "grad_norm": 2.609375, "grad_norm_var": 0.09332275390625, "learning_rate": 0.0001, "loss": 3.1552, "loss/crossentropy": 2.066901612281799, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.2512317180633545, "loss/reg": 0.0, "step": 29900 }, { "epoch": 0.1967763157894737, "grad_norm": 2.265625, "grad_norm_var": 0.3540598551432292, "learning_rate": 0.0001, "loss": 3.0271, "loss/crossentropy": 2.4260810136795046, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.26536442786455156, "loss/reg": 0.0, "step": 29910 }, { "epoch": 0.1968421052631579, "grad_norm": 2.4375, "grad_norm_var": 0.10589090983072917, "learning_rate": 0.0001, "loss": 3.0213, "loss/crossentropy": 2.4159180164337157, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.2190088465809822, "loss/reg": 0.0, "step": 29920 }, { "epoch": 0.19690789473684212, "grad_norm": 2.703125, "grad_norm_var": 0.03914286295572917, "learning_rate": 0.0001, "loss": 3.0847, "loss/crossentropy": 2.061979150772095, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.20891676545143129, "loss/reg": 0.0, "step": 29930 }, { "epoch": 0.19697368421052633, "grad_norm": 3.625, "grad_norm_var": 0.14216206868489584, "learning_rate": 0.0001, "loss": 3.0243, "loss/crossentropy": 2.06211262345314, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.22453068271279336, "loss/reg": 0.0, "step": 29940 }, { "epoch": 0.19703947368421051, "grad_norm": 2.453125, "grad_norm_var": 0.13738505045572916, "learning_rate": 0.0001, "loss": 2.9814, "loss/crossentropy": 2.1103883236646652, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.1844348356127739, "loss/reg": 0.0, "step": 29950 }, { "epoch": 0.19710526315789473, "grad_norm": 2.1875, "grad_norm_var": 0.026537068684895835, "learning_rate": 0.0001, "loss": 3.0008, "loss/crossentropy": 2.553321826457977, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.21997541338205337, "loss/reg": 0.0, "step": 29960 }, { "epoch": 0.19717105263157894, "grad_norm": 2.109375, "grad_norm_var": 0.02939453125, "learning_rate": 0.0001, "loss": 3.0382, "loss/crossentropy": 2.4657467365264893, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.2679098337888718, "loss/reg": 0.0, "step": 29970 }, { "epoch": 0.19723684210526315, "grad_norm": 2.328125, "grad_norm_var": 0.32203369140625, "learning_rate": 0.0001, "loss": 3.083, "loss/crossentropy": 2.2925270318984987, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.2816790759563446, "loss/reg": 0.0, "step": 29980 }, { "epoch": 0.19730263157894737, "grad_norm": 2.828125, "grad_norm_var": 0.3014149983723958, "learning_rate": 0.0001, "loss": 3.0358, "loss/crossentropy": 2.652257299423218, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.23255081921815873, "loss/reg": 0.0, "step": 29990 }, { "epoch": 0.19736842105263158, "grad_norm": 2.171875, "grad_norm_var": 0.061799112955729166, "learning_rate": 0.0001, "loss": 2.9585, "loss/crossentropy": 2.3400336265563966, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2366102933883667, "loss/reg": 0.0, "step": 30000 }, { "epoch": 0.1974342105263158, "grad_norm": 2.578125, "grad_norm_var": 0.06998672485351562, "learning_rate": 0.0001, "loss": 3.0787, "loss/crossentropy": 2.2949488759040833, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.24081108272075652, "loss/reg": 0.0, "step": 30010 }, { "epoch": 0.1975, "grad_norm": 2.359375, "grad_norm_var": 0.41442769368489585, "learning_rate": 0.0001, "loss": 3.0288, "loss/crossentropy": 2.414297103881836, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.23484020233154296, "loss/reg": 0.0, "step": 30020 }, { "epoch": 0.19756578947368422, "grad_norm": 2.21875, "grad_norm_var": 0.18356704711914062, "learning_rate": 0.0001, "loss": 3.0326, "loss/crossentropy": 2.300756549835205, "loss/hidden": 2.5453125, "loss/incoh": 0.0, "loss/logits": 0.19037204384803771, "loss/reg": 0.0, "step": 30030 }, { "epoch": 0.1976315789473684, "grad_norm": 2.46875, "grad_norm_var": 0.07932942708333333, "learning_rate": 0.0001, "loss": 3.0541, "loss/crossentropy": 2.5434207677841187, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.2503844425082207, "loss/reg": 0.0, "step": 30040 }, { "epoch": 0.19769736842105262, "grad_norm": 2.453125, "grad_norm_var": 0.03378499348958333, "learning_rate": 0.0001, "loss": 3.0416, "loss/crossentropy": 2.435779368877411, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.22367140799760818, "loss/reg": 0.0, "step": 30050 }, { "epoch": 0.19776315789473684, "grad_norm": 2.40625, "grad_norm_var": 0.11531575520833333, "learning_rate": 0.0001, "loss": 3.0175, "loss/crossentropy": 2.614890933036804, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.2448316603899002, "loss/reg": 0.0, "step": 30060 }, { "epoch": 0.19782894736842105, "grad_norm": 2.1875, "grad_norm_var": 0.13720703125, "learning_rate": 0.0001, "loss": 3.0335, "loss/crossentropy": 2.0392467260360716, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.22009356170892716, "loss/reg": 0.0, "step": 30070 }, { "epoch": 0.19789473684210526, "grad_norm": 2.203125, "grad_norm_var": 0.028351847330729166, "learning_rate": 0.0001, "loss": 3.0004, "loss/crossentropy": 2.463162088394165, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.2216602995991707, "loss/reg": 0.0, "step": 30080 }, { "epoch": 0.19796052631578948, "grad_norm": 1.859375, "grad_norm_var": 0.05168863932291667, "learning_rate": 0.0001, "loss": 2.9309, "loss/crossentropy": 2.386566638946533, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.23080737441778182, "loss/reg": 0.0, "step": 30090 }, { "epoch": 0.1980263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.40987955729166664, "learning_rate": 0.0001, "loss": 2.9946, "loss/crossentropy": 2.5444513320922852, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.22146540880203247, "loss/reg": 0.0, "step": 30100 }, { "epoch": 0.1980921052631579, "grad_norm": 2.328125, "grad_norm_var": 0.45288263956705727, "learning_rate": 0.0001, "loss": 3.0193, "loss/crossentropy": 2.485434365272522, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.23654820621013642, "loss/reg": 0.0, "step": 30110 }, { "epoch": 0.19815789473684212, "grad_norm": 2.3125, "grad_norm_var": 0.20391820271809896, "learning_rate": 0.0001, "loss": 3.0528, "loss/crossentropy": 2.378788614273071, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2376804992556572, "loss/reg": 0.0, "step": 30120 }, { "epoch": 0.1982236842105263, "grad_norm": 2.25, "grad_norm_var": 0.10331624348958333, "learning_rate": 0.0001, "loss": 3.0188, "loss/crossentropy": 2.154638743400574, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.20261103957891463, "loss/reg": 0.0, "step": 30130 }, { "epoch": 0.19828947368421052, "grad_norm": 2.40625, "grad_norm_var": 0.08589655558268229, "learning_rate": 0.0001, "loss": 2.9379, "loss/crossentropy": 2.2816161513328552, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.19446674287319182, "loss/reg": 0.0, "step": 30140 }, { "epoch": 0.19835526315789473, "grad_norm": 3.03125, "grad_norm_var": 0.10558980305989583, "learning_rate": 0.0001, "loss": 3.0039, "loss/crossentropy": 2.333029532432556, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.21476658284664155, "loss/reg": 0.0, "step": 30150 }, { "epoch": 0.19842105263157894, "grad_norm": 2.390625, "grad_norm_var": 0.08321940104166667, "learning_rate": 0.0001, "loss": 2.9808, "loss/crossentropy": 2.155623471736908, "loss/hidden": 2.5859375, "loss/incoh": 0.0, "loss/logits": 0.18124678283929824, "loss/reg": 0.0, "step": 30160 }, { "epoch": 0.19848684210526316, "grad_norm": 1.9453125, "grad_norm_var": 0.13827311197916667, "learning_rate": 0.0001, "loss": 3.0116, "loss/crossentropy": 2.109735357761383, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.24903104603290557, "loss/reg": 0.0, "step": 30170 }, { "epoch": 0.19855263157894737, "grad_norm": 2.375, "grad_norm_var": 0.040135701497395836, "learning_rate": 0.0001, "loss": 3.0087, "loss/crossentropy": 2.290113925933838, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.21611811220645905, "loss/reg": 0.0, "step": 30180 }, { "epoch": 0.19861842105263158, "grad_norm": 2.390625, "grad_norm_var": 0.04804280598958333, "learning_rate": 0.0001, "loss": 2.995, "loss/crossentropy": 1.9855021834373474, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.1928846351802349, "loss/reg": 0.0, "step": 30190 }, { "epoch": 0.1986842105263158, "grad_norm": 2.265625, "grad_norm_var": 0.059366607666015626, "learning_rate": 0.0001, "loss": 3.0265, "loss/crossentropy": 2.213775265216827, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2045104242861271, "loss/reg": 0.0, "step": 30200 }, { "epoch": 0.19875, "grad_norm": 3.203125, "grad_norm_var": 0.10831883748372396, "learning_rate": 0.0001, "loss": 3.0971, "loss/crossentropy": 2.2141775846481324, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.2511264935135841, "loss/reg": 0.0, "step": 30210 }, { "epoch": 0.19881578947368422, "grad_norm": 2.265625, "grad_norm_var": 0.18625895182291666, "learning_rate": 0.0001, "loss": 3.1376, "loss/crossentropy": 2.201507192850113, "loss/hidden": 3.01875, "loss/incoh": 0.0, "loss/logits": 0.27276814319193365, "loss/reg": 0.0, "step": 30220 }, { "epoch": 0.1988815789473684, "grad_norm": 2.71875, "grad_norm_var": 0.1700347900390625, "learning_rate": 0.0001, "loss": 3.0325, "loss/crossentropy": 2.4042665481567385, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.22706928849220276, "loss/reg": 0.0, "step": 30230 }, { "epoch": 0.19894736842105262, "grad_norm": 2.671875, "grad_norm_var": 0.18758036295572916, "learning_rate": 0.0001, "loss": 3.049, "loss/crossentropy": 2.0507681727409364, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.22476800307631492, "loss/reg": 0.0, "step": 30240 }, { "epoch": 0.19901315789473684, "grad_norm": 2.5, "grad_norm_var": 0.07414957682291666, "learning_rate": 0.0001, "loss": 2.9881, "loss/crossentropy": 2.2702227234840393, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.2327321708202362, "loss/reg": 0.0, "step": 30250 }, { "epoch": 0.19907894736842105, "grad_norm": 3.40625, "grad_norm_var": 0.29836832682291664, "learning_rate": 0.0001, "loss": 3.1337, "loss/crossentropy": 2.3777806520462037, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2500802963972092, "loss/reg": 0.0, "step": 30260 }, { "epoch": 0.19914473684210526, "grad_norm": 2.4375, "grad_norm_var": 0.27034403483072916, "learning_rate": 0.0001, "loss": 3.0123, "loss/crossentropy": 2.431059980392456, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.2794636771082878, "loss/reg": 0.0, "step": 30270 }, { "epoch": 0.19921052631578948, "grad_norm": 2.515625, "grad_norm_var": 0.022591145833333333, "learning_rate": 0.0001, "loss": 3.0564, "loss/crossentropy": 2.491440224647522, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.27763516157865525, "loss/reg": 0.0, "step": 30280 }, { "epoch": 0.1992763157894737, "grad_norm": 2.640625, "grad_norm_var": 0.24989827473958334, "learning_rate": 0.0001, "loss": 3.0591, "loss/crossentropy": 2.443405735492706, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.2293305829167366, "loss/reg": 0.0, "step": 30290 }, { "epoch": 0.1993421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.044205729166666666, "learning_rate": 0.0001, "loss": 2.9622, "loss/crossentropy": 2.2449229061603546, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.24314892143011094, "loss/reg": 0.0, "step": 30300 }, { "epoch": 0.19940789473684212, "grad_norm": 2.453125, "grad_norm_var": 0.03411458333333333, "learning_rate": 0.0001, "loss": 3.038, "loss/crossentropy": 2.5837315797805784, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.25667293965816496, "loss/reg": 0.0, "step": 30310 }, { "epoch": 0.1994736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.03482666015625, "learning_rate": 0.0001, "loss": 3.03, "loss/crossentropy": 2.4695034623146057, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.24289859235286712, "loss/reg": 0.0, "step": 30320 }, { "epoch": 0.19953947368421052, "grad_norm": 2.75, "grad_norm_var": 0.08709309895833334, "learning_rate": 0.0001, "loss": 2.9926, "loss/crossentropy": 2.263628613948822, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.25994745194911956, "loss/reg": 0.0, "step": 30330 }, { "epoch": 0.19960526315789473, "grad_norm": 2.40625, "grad_norm_var": 0.12483317057291667, "learning_rate": 0.0001, "loss": 3.0437, "loss/crossentropy": 2.263992178440094, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.22506246864795684, "loss/reg": 0.0, "step": 30340 }, { "epoch": 0.19967105263157894, "grad_norm": 2.0, "grad_norm_var": 0.09973042805989583, "learning_rate": 0.0001, "loss": 3.0212, "loss/crossentropy": 2.4082891941070557, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.24035572707653047, "loss/reg": 0.0, "step": 30350 }, { "epoch": 0.19973684210526316, "grad_norm": 2.125, "grad_norm_var": 0.04556884765625, "learning_rate": 0.0001, "loss": 3.0589, "loss/crossentropy": 2.2902461647987367, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.20010970458388327, "loss/reg": 0.0, "step": 30360 }, { "epoch": 0.19980263157894737, "grad_norm": 2.234375, "grad_norm_var": 0.047098795572916664, "learning_rate": 0.0001, "loss": 3.0519, "loss/crossentropy": 2.260884428024292, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.23876612037420272, "loss/reg": 0.0, "step": 30370 }, { "epoch": 0.19986842105263158, "grad_norm": 2.203125, "grad_norm_var": 0.17197265625, "learning_rate": 0.0001, "loss": 3.0596, "loss/crossentropy": 2.293976974487305, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.24686284512281417, "loss/reg": 0.0, "step": 30380 }, { "epoch": 0.1999342105263158, "grad_norm": 1.96875, "grad_norm_var": 0.4962565104166667, "learning_rate": 0.0001, "loss": 3.0143, "loss/crossentropy": 2.0594027996063233, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.2534930631518364, "loss/reg": 0.0, "step": 30390 }, { "epoch": 0.2, "grad_norm": 2.140625, "grad_norm_var": 0.5117421468098958, "learning_rate": 0.0001, "loss": 3.0243, "loss/crossentropy": 2.2039811968803407, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2378777429461479, "loss/reg": 0.0, "step": 30400 }, { "epoch": 0.2000657894736842, "grad_norm": 2.34375, "grad_norm_var": 0.06318257649739584, "learning_rate": 0.0001, "loss": 3.0318, "loss/crossentropy": 2.306100535392761, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.25150387436151506, "loss/reg": 0.0, "step": 30410 }, { "epoch": 0.2001315789473684, "grad_norm": 2.171875, "grad_norm_var": 0.048249308268229166, "learning_rate": 0.0001, "loss": 3.0054, "loss/crossentropy": 2.154050374031067, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.23610089272260665, "loss/reg": 0.0, "step": 30420 }, { "epoch": 0.20019736842105262, "grad_norm": 2.09375, "grad_norm_var": 0.1066314697265625, "learning_rate": 0.0001, "loss": 3.0415, "loss/crossentropy": 2.3580272018909456, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.21100984290242195, "loss/reg": 0.0, "step": 30430 }, { "epoch": 0.20026315789473684, "grad_norm": 2.40625, "grad_norm_var": 0.10287984212239583, "learning_rate": 0.0001, "loss": 3.021, "loss/crossentropy": 2.3812718391418457, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.2449337661266327, "loss/reg": 0.0, "step": 30440 }, { "epoch": 0.20032894736842105, "grad_norm": 2.421875, "grad_norm_var": 0.05493876139322917, "learning_rate": 0.0001, "loss": 2.9937, "loss/crossentropy": 2.406654155254364, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.28666631430387496, "loss/reg": 0.0, "step": 30450 }, { "epoch": 0.20039473684210526, "grad_norm": 2.140625, "grad_norm_var": 0.0645904541015625, "learning_rate": 0.0001, "loss": 3.1228, "loss/crossentropy": 2.3998040676116945, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2500763192772865, "loss/reg": 0.0, "step": 30460 }, { "epoch": 0.20046052631578948, "grad_norm": 2.234375, "grad_norm_var": 0.1094146728515625, "learning_rate": 0.0001, "loss": 2.9741, "loss/crossentropy": 2.1604729771614073, "loss/hidden": 2.590625, "loss/incoh": 0.0, "loss/logits": 0.19615808725357056, "loss/reg": 0.0, "step": 30470 }, { "epoch": 0.2005263157894737, "grad_norm": 2.359375, "grad_norm_var": 0.1017974853515625, "learning_rate": 0.0001, "loss": 3.0351, "loss/crossentropy": 2.3099865555763244, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.2315102458000183, "loss/reg": 0.0, "step": 30480 }, { "epoch": 0.2005921052631579, "grad_norm": 2.4375, "grad_norm_var": 0.41324462890625, "learning_rate": 0.0001, "loss": 3.0974, "loss/crossentropy": 2.3289413452148438, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2756644278764725, "loss/reg": 0.0, "step": 30490 }, { "epoch": 0.20065789473684212, "grad_norm": 2.390625, "grad_norm_var": 1.2159505208333334, "learning_rate": 0.0001, "loss": 2.9922, "loss/crossentropy": 2.2126728296279907, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.22418673560023308, "loss/reg": 0.0, "step": 30500 }, { "epoch": 0.2007236842105263, "grad_norm": 2.515625, "grad_norm_var": 0.9500343322753906, "learning_rate": 0.0001, "loss": 2.9726, "loss/crossentropy": 2.3763804376125335, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.22033569663763047, "loss/reg": 0.0, "step": 30510 }, { "epoch": 0.20078947368421052, "grad_norm": 2.09375, "grad_norm_var": 0.14224853515625, "learning_rate": 0.0001, "loss": 3.0223, "loss/crossentropy": 2.400152790546417, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.21929119899868965, "loss/reg": 0.0, "step": 30520 }, { "epoch": 0.20085526315789473, "grad_norm": 2.09375, "grad_norm_var": 103.514990234375, "learning_rate": 0.0001, "loss": 3.0504, "loss/crossentropy": 2.1772961974143983, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.23286587446928025, "loss/reg": 0.0, "step": 30530 }, { "epoch": 0.20092105263157894, "grad_norm": 2.53125, "grad_norm_var": 0.11510391235351562, "learning_rate": 0.0001, "loss": 3.0667, "loss/crossentropy": 2.2763838887214662, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.22003234028816224, "loss/reg": 0.0, "step": 30540 }, { "epoch": 0.20098684210526316, "grad_norm": 2.203125, "grad_norm_var": 0.08960367838541666, "learning_rate": 0.0001, "loss": 2.9552, "loss/crossentropy": 2.418367159366608, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2519043207168579, "loss/reg": 0.0, "step": 30550 }, { "epoch": 0.20105263157894737, "grad_norm": 2.484375, "grad_norm_var": 0.07086588541666666, "learning_rate": 0.0001, "loss": 3.0293, "loss/crossentropy": 2.4948137521743776, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.24468859434127807, "loss/reg": 0.0, "step": 30560 }, { "epoch": 0.20111842105263159, "grad_norm": 4.21875, "grad_norm_var": 0.2363677978515625, "learning_rate": 0.0001, "loss": 3.0781, "loss/crossentropy": 2.697233200073242, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.2648474559187889, "loss/reg": 0.0, "step": 30570 }, { "epoch": 0.2011842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.26945699055989586, "learning_rate": 0.0001, "loss": 2.9756, "loss/crossentropy": 2.3618945240974427, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.2500054851174355, "loss/reg": 0.0, "step": 30580 }, { "epoch": 0.20125, "grad_norm": 2.78125, "grad_norm_var": 0.09013264973958333, "learning_rate": 0.0001, "loss": 3.0325, "loss/crossentropy": 2.1298818975687026, "loss/hidden": 2.5125, "loss/incoh": 0.0, "loss/logits": 0.1715396959334612, "loss/reg": 0.0, "step": 30590 }, { "epoch": 0.2013157894736842, "grad_norm": 2.296875, "grad_norm_var": 0.09729715983072916, "learning_rate": 0.0001, "loss": 3.0119, "loss/crossentropy": 2.204944038391113, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.21538405269384384, "loss/reg": 0.0, "step": 30600 }, { "epoch": 0.2013815789473684, "grad_norm": 2.296875, "grad_norm_var": 0.03004150390625, "learning_rate": 0.0001, "loss": 2.9775, "loss/crossentropy": 2.4482433080673216, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.22656193226575852, "loss/reg": 0.0, "step": 30610 }, { "epoch": 0.20144736842105262, "grad_norm": 2.234375, "grad_norm_var": 0.03572769165039062, "learning_rate": 0.0001, "loss": 3.0182, "loss/crossentropy": 2.0842928767204283, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.2059495523571968, "loss/reg": 0.0, "step": 30620 }, { "epoch": 0.20151315789473684, "grad_norm": 2.3125, "grad_norm_var": 0.1525286356608073, "learning_rate": 0.0001, "loss": 3.0799, "loss/crossentropy": 2.3205880761146545, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.22585390508174896, "loss/reg": 0.0, "step": 30630 }, { "epoch": 0.20157894736842105, "grad_norm": 2.453125, "grad_norm_var": 0.08941650390625, "learning_rate": 0.0001, "loss": 3.0334, "loss/crossentropy": 2.287918508052826, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.23107146471738815, "loss/reg": 0.0, "step": 30640 }, { "epoch": 0.20164473684210527, "grad_norm": 2.109375, "grad_norm_var": 0.0238433837890625, "learning_rate": 0.0001, "loss": 2.9747, "loss/crossentropy": 2.4189619421958923, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.2165643572807312, "loss/reg": 0.0, "step": 30650 }, { "epoch": 0.20171052631578948, "grad_norm": 2.375, "grad_norm_var": 0.07734349568684896, "learning_rate": 0.0001, "loss": 2.9809, "loss/crossentropy": 2.484665501117706, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.25413639694452284, "loss/reg": 0.0, "step": 30660 }, { "epoch": 0.2017763157894737, "grad_norm": 2.328125, "grad_norm_var": 0.13760477701822918, "learning_rate": 0.0001, "loss": 3.0598, "loss/crossentropy": 1.993758463859558, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.23019451200962066, "loss/reg": 0.0, "step": 30670 }, { "epoch": 0.2018421052631579, "grad_norm": 2.25, "grad_norm_var": 0.16373697916666666, "learning_rate": 0.0001, "loss": 3.0066, "loss/crossentropy": 2.508138656616211, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.29257247895002364, "loss/reg": 0.0, "step": 30680 }, { "epoch": 0.2019078947368421, "grad_norm": 2.765625, "grad_norm_var": 0.16415786743164062, "learning_rate": 0.0001, "loss": 3.0694, "loss/crossentropy": 2.178850865364075, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.19227326661348343, "loss/reg": 0.0, "step": 30690 }, { "epoch": 0.2019736842105263, "grad_norm": 2.015625, "grad_norm_var": 0.12350234985351563, "learning_rate": 0.0001, "loss": 3.0484, "loss/crossentropy": 2.395913541316986, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.23005425035953522, "loss/reg": 0.0, "step": 30700 }, { "epoch": 0.20203947368421052, "grad_norm": 2.15625, "grad_norm_var": 0.07646077473958333, "learning_rate": 0.0001, "loss": 2.9845, "loss/crossentropy": 2.0765319585800173, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.21056767106056212, "loss/reg": 0.0, "step": 30710 }, { "epoch": 0.20210526315789473, "grad_norm": 2.34375, "grad_norm_var": 0.14062093098958334, "learning_rate": 0.0001, "loss": 3.0582, "loss/crossentropy": 2.1704251885414125, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.217359322309494, "loss/reg": 0.0, "step": 30720 }, { "epoch": 0.20217105263157895, "grad_norm": 2.390625, "grad_norm_var": 0.1090484619140625, "learning_rate": 0.0001, "loss": 3.0285, "loss/crossentropy": 2.1888681292533874, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.23789106756448747, "loss/reg": 0.0, "step": 30730 }, { "epoch": 0.20223684210526316, "grad_norm": 1.9921875, "grad_norm_var": 0.2569536844889323, "learning_rate": 0.0001, "loss": 3.0386, "loss/crossentropy": 2.161084806919098, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.23299970030784606, "loss/reg": 0.0, "step": 30740 }, { "epoch": 0.20230263157894737, "grad_norm": 2.046875, "grad_norm_var": 0.14261245727539062, "learning_rate": 0.0001, "loss": 2.9869, "loss/crossentropy": 2.4508134722709656, "loss/hidden": 2.6078125, "loss/incoh": 0.0, "loss/logits": 0.20573023706674576, "loss/reg": 0.0, "step": 30750 }, { "epoch": 0.2023684210526316, "grad_norm": 2.734375, "grad_norm_var": 0.08206761678059896, "learning_rate": 0.0001, "loss": 3.1039, "loss/crossentropy": 2.5278658986091616, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.2716485261917114, "loss/reg": 0.0, "step": 30760 }, { "epoch": 0.2024342105263158, "grad_norm": 2.3125, "grad_norm_var": 0.14897435506184895, "learning_rate": 0.0001, "loss": 3.0202, "loss/crossentropy": 2.2195838689804077, "loss/hidden": 2.9484375, "loss/incoh": 0.0, "loss/logits": 0.29928734600543977, "loss/reg": 0.0, "step": 30770 }, { "epoch": 0.2025, "grad_norm": 2.265625, "grad_norm_var": 0.3571248372395833, "learning_rate": 0.0001, "loss": 2.9956, "loss/crossentropy": 1.9182006061077117, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.2259790524840355, "loss/reg": 0.0, "step": 30780 }, { "epoch": 0.2025657894736842, "grad_norm": 2.359375, "grad_norm_var": 0.3158365885416667, "learning_rate": 0.0001, "loss": 3.043, "loss/crossentropy": 2.2211042761802675, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.23300761282444, "loss/reg": 0.0, "step": 30790 }, { "epoch": 0.2026315789473684, "grad_norm": 2.546875, "grad_norm_var": 0.1349609375, "learning_rate": 0.0001, "loss": 3.046, "loss/crossentropy": 2.2689759850502016, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.22895248532295226, "loss/reg": 0.0, "step": 30800 }, { "epoch": 0.20269736842105263, "grad_norm": 2.484375, "grad_norm_var": 0.03440653483072917, "learning_rate": 0.0001, "loss": 2.9736, "loss/crossentropy": 2.2549991250038146, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.24080493301153183, "loss/reg": 0.0, "step": 30810 }, { "epoch": 0.20276315789473684, "grad_norm": 2.90625, "grad_norm_var": 1.0274088541666666, "learning_rate": 0.0001, "loss": 3.0971, "loss/crossentropy": 2.3225981116294863, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.24680722504854202, "loss/reg": 0.0, "step": 30820 }, { "epoch": 0.20282894736842105, "grad_norm": 2.21875, "grad_norm_var": 1.2762102762858072, "learning_rate": 0.0001, "loss": 3.0638, "loss/crossentropy": 2.355189287662506, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.23843924850225448, "loss/reg": 0.0, "step": 30830 }, { "epoch": 0.20289473684210527, "grad_norm": 2.671875, "grad_norm_var": 2.2499407450358073, "learning_rate": 0.0001, "loss": 3.2035, "loss/crossentropy": 2.2234743475914, "loss/hidden": 2.934375, "loss/incoh": 0.0, "loss/logits": 0.2502759709954262, "loss/reg": 0.0, "step": 30840 }, { "epoch": 0.20296052631578948, "grad_norm": 2.0625, "grad_norm_var": 2.0923665364583335, "learning_rate": 0.0001, "loss": 3.0222, "loss/crossentropy": 2.535164365172386, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.21734880432486534, "loss/reg": 0.0, "step": 30850 }, { "epoch": 0.2030263157894737, "grad_norm": 2.265625, "grad_norm_var": 0.03389383951822917, "learning_rate": 0.0001, "loss": 2.9959, "loss/crossentropy": 2.4880054354667664, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.2341625601053238, "loss/reg": 0.0, "step": 30860 }, { "epoch": 0.2030921052631579, "grad_norm": 2.140625, "grad_norm_var": 0.0282135009765625, "learning_rate": 0.0001, "loss": 3.0451, "loss/crossentropy": 2.277066648006439, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.2593472898006439, "loss/reg": 0.0, "step": 30870 }, { "epoch": 0.2031578947368421, "grad_norm": 2.140625, "grad_norm_var": 0.03958333333333333, "learning_rate": 0.0001, "loss": 3.0158, "loss/crossentropy": 2.1649017184972763, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.2148670382797718, "loss/reg": 0.0, "step": 30880 }, { "epoch": 0.2032236842105263, "grad_norm": 2.1875, "grad_norm_var": 0.37971903483072916, "learning_rate": 0.0001, "loss": 2.9899, "loss/crossentropy": 2.3688385248184205, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2734518602490425, "loss/reg": 0.0, "step": 30890 }, { "epoch": 0.20328947368421052, "grad_norm": 2.203125, "grad_norm_var": 0.3890452067057292, "learning_rate": 0.0001, "loss": 3.0088, "loss/crossentropy": 2.501011300086975, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.2248419776558876, "loss/reg": 0.0, "step": 30900 }, { "epoch": 0.20335526315789473, "grad_norm": 2.546875, "grad_norm_var": 0.0226470947265625, "learning_rate": 0.0001, "loss": 3.0529, "loss/crossentropy": 2.4394679188728334, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.23242839723825454, "loss/reg": 0.0, "step": 30910 }, { "epoch": 0.20342105263157895, "grad_norm": 1.9296875, "grad_norm_var": 0.06285374959309896, "learning_rate": 0.0001, "loss": 2.9565, "loss/crossentropy": 2.1788210391998293, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.21422138065099716, "loss/reg": 0.0, "step": 30920 }, { "epoch": 0.20348684210526316, "grad_norm": 3.09375, "grad_norm_var": 0.09987157185872396, "learning_rate": 0.0001, "loss": 2.948, "loss/crossentropy": 2.4877074480056764, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.22587471902370454, "loss/reg": 0.0, "step": 30930 }, { "epoch": 0.20355263157894737, "grad_norm": 2.890625, "grad_norm_var": 0.2575154622395833, "learning_rate": 0.0001, "loss": 3.0854, "loss/crossentropy": 2.3099985003471373, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.23227183222770692, "loss/reg": 0.0, "step": 30940 }, { "epoch": 0.2036184210526316, "grad_norm": 2.125, "grad_norm_var": 0.18185221354166667, "learning_rate": 0.0001, "loss": 3.1055, "loss/crossentropy": 2.228105306625366, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.24457025676965713, "loss/reg": 0.0, "step": 30950 }, { "epoch": 0.2036842105263158, "grad_norm": 2.40625, "grad_norm_var": 0.08284098307291667, "learning_rate": 0.0001, "loss": 3.131, "loss/crossentropy": 2.395846438407898, "loss/hidden": 3.04375, "loss/incoh": 0.0, "loss/logits": 0.2948496550321579, "loss/reg": 0.0, "step": 30960 }, { "epoch": 0.20375, "grad_norm": 2.328125, "grad_norm_var": 0.07164306640625, "learning_rate": 0.0001, "loss": 2.9739, "loss/crossentropy": 2.221297824382782, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.21414069309830666, "loss/reg": 0.0, "step": 30970 }, { "epoch": 0.2038157894736842, "grad_norm": 2.234375, "grad_norm_var": 0.13599853515625, "learning_rate": 0.0001, "loss": 3.1104, "loss/crossentropy": 2.293365275859833, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.2863286077976227, "loss/reg": 0.0, "step": 30980 }, { "epoch": 0.2038815789473684, "grad_norm": 2.265625, "grad_norm_var": 0.12737630208333334, "learning_rate": 0.0001, "loss": 2.9774, "loss/crossentropy": 2.293133610486984, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.22788952738046647, "loss/reg": 0.0, "step": 30990 }, { "epoch": 0.20394736842105263, "grad_norm": 2.34375, "grad_norm_var": 0.01656494140625, "learning_rate": 0.0001, "loss": 2.9724, "loss/crossentropy": 2.4067237973213196, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.220338836312294, "loss/reg": 0.0, "step": 31000 }, { "epoch": 0.20401315789473684, "grad_norm": 2.1875, "grad_norm_var": 0.984301503499349, "learning_rate": 0.0001, "loss": 3.0445, "loss/crossentropy": 2.3435217142105103, "loss/hidden": 2.9984375, "loss/incoh": 0.0, "loss/logits": 0.28915616124868393, "loss/reg": 0.0, "step": 31010 }, { "epoch": 0.20407894736842105, "grad_norm": 3.0, "grad_norm_var": 0.10872573852539062, "learning_rate": 0.0001, "loss": 3.133, "loss/crossentropy": 2.3546693086624146, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.2506974846124649, "loss/reg": 0.0, "step": 31020 }, { "epoch": 0.20414473684210527, "grad_norm": 2.59375, "grad_norm_var": 0.33428929646809896, "learning_rate": 0.0001, "loss": 3.0211, "loss/crossentropy": 2.2451566100120544, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.23826195299625397, "loss/reg": 0.0, "step": 31030 }, { "epoch": 0.20421052631578948, "grad_norm": 2.140625, "grad_norm_var": 0.31133626302083334, "learning_rate": 0.0001, "loss": 3.0125, "loss/crossentropy": 2.142116904258728, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.19715375155210496, "loss/reg": 0.0, "step": 31040 }, { "epoch": 0.2042763157894737, "grad_norm": 2.375, "grad_norm_var": 0.1289629618326823, "learning_rate": 0.0001, "loss": 3.0955, "loss/crossentropy": 2.210948419570923, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.2645356684923172, "loss/reg": 0.0, "step": 31050 }, { "epoch": 0.2043421052631579, "grad_norm": 2.140625, "grad_norm_var": 0.1048004150390625, "learning_rate": 0.0001, "loss": 3.1043, "loss/crossentropy": 2.0977415561676027, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.2552319660782814, "loss/reg": 0.0, "step": 31060 }, { "epoch": 0.2044078947368421, "grad_norm": 2.296875, "grad_norm_var": 0.08701070149739583, "learning_rate": 0.0001, "loss": 3.1067, "loss/crossentropy": 2.264439880847931, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2655411049723625, "loss/reg": 0.0, "step": 31070 }, { "epoch": 0.2044736842105263, "grad_norm": 2.25, "grad_norm_var": 0.06838785807291667, "learning_rate": 0.0001, "loss": 3.0627, "loss/crossentropy": 2.0332413136959078, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.21882982477545737, "loss/reg": 0.0, "step": 31080 }, { "epoch": 0.20453947368421052, "grad_norm": 2.484375, "grad_norm_var": 0.10865478515625, "learning_rate": 0.0001, "loss": 3.0466, "loss/crossentropy": 2.2589586079120636, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.2315130352973938, "loss/reg": 0.0, "step": 31090 }, { "epoch": 0.20460526315789473, "grad_norm": 2.375, "grad_norm_var": 0.37800191243489584, "learning_rate": 0.0001, "loss": 3.0036, "loss/crossentropy": 2.3440281629562376, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.2972772717475891, "loss/reg": 0.0, "step": 31100 }, { "epoch": 0.20467105263157895, "grad_norm": 2.359375, "grad_norm_var": 0.29983723958333336, "learning_rate": 0.0001, "loss": 2.954, "loss/crossentropy": 2.47297340631485, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.24424962252378463, "loss/reg": 0.0, "step": 31110 }, { "epoch": 0.20473684210526316, "grad_norm": 2.140625, "grad_norm_var": 0.2591634114583333, "learning_rate": 0.0001, "loss": 3.1546, "loss/crossentropy": 2.436366784572601, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.3093844935297966, "loss/reg": 0.0, "step": 31120 }, { "epoch": 0.20480263157894738, "grad_norm": 2.453125, "grad_norm_var": 0.2005859375, "learning_rate": 0.0001, "loss": 3.0745, "loss/crossentropy": 2.140506219863892, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.27134309709072113, "loss/reg": 0.0, "step": 31130 }, { "epoch": 0.2048684210526316, "grad_norm": 2.125, "grad_norm_var": 0.032713826497395834, "learning_rate": 0.0001, "loss": 3.0449, "loss/crossentropy": 2.5245323538780213, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.2610665872693062, "loss/reg": 0.0, "step": 31140 }, { "epoch": 0.2049342105263158, "grad_norm": 2.0625, "grad_norm_var": 0.11193745930989583, "learning_rate": 0.0001, "loss": 3.0709, "loss/crossentropy": 2.5337566137313843, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.2527821347117424, "loss/reg": 0.0, "step": 31150 }, { "epoch": 0.205, "grad_norm": 3.953125, "grad_norm_var": 0.3135943094889323, "learning_rate": 0.0001, "loss": 3.1177, "loss/crossentropy": 2.2265261888504027, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.2549376994371414, "loss/reg": 0.0, "step": 31160 }, { "epoch": 0.2050657894736842, "grad_norm": 2.484375, "grad_norm_var": 0.3199534098307292, "learning_rate": 0.0001, "loss": 3.0723, "loss/crossentropy": 2.47224338054657, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.2135899156332016, "loss/reg": 0.0, "step": 31170 }, { "epoch": 0.20513157894736841, "grad_norm": 2.078125, "grad_norm_var": 0.08787333170572917, "learning_rate": 0.0001, "loss": 3.006, "loss/crossentropy": 1.9317480087280274, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.1986733317375183, "loss/reg": 0.0, "step": 31180 }, { "epoch": 0.20519736842105263, "grad_norm": 2.546875, "grad_norm_var": 0.06467692057291667, "learning_rate": 0.0001, "loss": 3.0381, "loss/crossentropy": 2.470042097568512, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.2246125504374504, "loss/reg": 0.0, "step": 31190 }, { "epoch": 0.20526315789473684, "grad_norm": 2.578125, "grad_norm_var": 0.05276692708333333, "learning_rate": 0.0001, "loss": 2.9766, "loss/crossentropy": 2.350657284259796, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.22050067782402039, "loss/reg": 0.0, "step": 31200 }, { "epoch": 0.20532894736842106, "grad_norm": 2.125, "grad_norm_var": 0.0873931884765625, "learning_rate": 0.0001, "loss": 3.0405, "loss/crossentropy": 2.3005827188491823, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.2127441346645355, "loss/reg": 0.0, "step": 31210 }, { "epoch": 0.20539473684210527, "grad_norm": 2.78125, "grad_norm_var": 5.270894368489583, "learning_rate": 0.0001, "loss": 3.1111, "loss/crossentropy": 2.4935895919799806, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2504632011055946, "loss/reg": 0.0, "step": 31220 }, { "epoch": 0.20546052631578948, "grad_norm": 2.75, "grad_norm_var": 0.06770426432291667, "learning_rate": 0.0001, "loss": 2.9834, "loss/crossentropy": 2.355222475528717, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.2353107064962387, "loss/reg": 0.0, "step": 31230 }, { "epoch": 0.2055263157894737, "grad_norm": 2.078125, "grad_norm_var": 0.209130859375, "learning_rate": 0.0001, "loss": 3.0459, "loss/crossentropy": 2.504118573665619, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.22558754682540894, "loss/reg": 0.0, "step": 31240 }, { "epoch": 0.20559210526315788, "grad_norm": 2.296875, "grad_norm_var": 0.34091389973958336, "learning_rate": 0.0001, "loss": 3.0119, "loss/crossentropy": 2.1399470806121825, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2328704759478569, "loss/reg": 0.0, "step": 31250 }, { "epoch": 0.2056578947368421, "grad_norm": 2.921875, "grad_norm_var": 0.24337565104166667, "learning_rate": 0.0001, "loss": 3.0416, "loss/crossentropy": 2.251539409160614, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.3552345484495163, "loss/reg": 0.0, "step": 31260 }, { "epoch": 0.2057236842105263, "grad_norm": 2.15625, "grad_norm_var": 0.28566080729166665, "learning_rate": 0.0001, "loss": 3.0568, "loss/crossentropy": 2.310221529006958, "loss/hidden": 2.940625, "loss/incoh": 0.0, "loss/logits": 0.26540524512529373, "loss/reg": 0.0, "step": 31270 }, { "epoch": 0.20578947368421052, "grad_norm": 2.65625, "grad_norm_var": 0.18033447265625, "learning_rate": 0.0001, "loss": 3.0158, "loss/crossentropy": 2.489114725589752, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.24016214907169342, "loss/reg": 0.0, "step": 31280 }, { "epoch": 0.20585526315789474, "grad_norm": 2.09375, "grad_norm_var": 0.09659830729166667, "learning_rate": 0.0001, "loss": 3.0496, "loss/crossentropy": 1.9956150293350219, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.19284655302762985, "loss/reg": 0.0, "step": 31290 }, { "epoch": 0.20592105263157895, "grad_norm": 2.109375, "grad_norm_var": 0.0728179931640625, "learning_rate": 0.0001, "loss": 3.0451, "loss/crossentropy": 2.383978569507599, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.21748270690441132, "loss/reg": 0.0, "step": 31300 }, { "epoch": 0.20598684210526316, "grad_norm": 2.3125, "grad_norm_var": 0.05699462890625, "learning_rate": 0.0001, "loss": 3.031, "loss/crossentropy": 2.064331567287445, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.21027912348508834, "loss/reg": 0.0, "step": 31310 }, { "epoch": 0.20605263157894738, "grad_norm": 2.28125, "grad_norm_var": 0.27751363118489586, "learning_rate": 0.0001, "loss": 3.0386, "loss/crossentropy": 2.5503376483917237, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2552351266145706, "loss/reg": 0.0, "step": 31320 }, { "epoch": 0.2061184210526316, "grad_norm": 2.109375, "grad_norm_var": 0.15871480305989583, "learning_rate": 0.0001, "loss": 3.0885, "loss/crossentropy": 2.408406615257263, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.22667487859725952, "loss/reg": 0.0, "step": 31330 }, { "epoch": 0.2061842105263158, "grad_norm": 2.546875, "grad_norm_var": 0.11311009724934896, "learning_rate": 0.0001, "loss": 3.0571, "loss/crossentropy": 2.199143981933594, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.22705173045396804, "loss/reg": 0.0, "step": 31340 }, { "epoch": 0.20625, "grad_norm": 2.4375, "grad_norm_var": 0.11831868489583333, "learning_rate": 0.0001, "loss": 3.0539, "loss/crossentropy": 1.9996395468711854, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.22363494634628295, "loss/reg": 0.0, "step": 31350 }, { "epoch": 0.2063157894736842, "grad_norm": 2.53125, "grad_norm_var": 0.0868364969889323, "learning_rate": 0.0001, "loss": 3.053, "loss/crossentropy": 2.067913568019867, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2136005848646164, "loss/reg": 0.0, "step": 31360 }, { "epoch": 0.20638157894736842, "grad_norm": 2.0, "grad_norm_var": 0.086376953125, "learning_rate": 0.0001, "loss": 3.0064, "loss/crossentropy": 2.300830841064453, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.2269425220787525, "loss/reg": 0.0, "step": 31370 }, { "epoch": 0.20644736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.05623753865559896, "learning_rate": 0.0001, "loss": 3.0327, "loss/crossentropy": 2.283673417568207, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.22951046377420425, "loss/reg": 0.0, "step": 31380 }, { "epoch": 0.20651315789473684, "grad_norm": 2.5, "grad_norm_var": 0.15467020670572917, "learning_rate": 0.0001, "loss": 3.0314, "loss/crossentropy": 2.2721763372421266, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.2015401691198349, "loss/reg": 0.0, "step": 31390 }, { "epoch": 0.20657894736842106, "grad_norm": 2.25, "grad_norm_var": 0.17532958984375, "learning_rate": 0.0001, "loss": 3.0562, "loss/crossentropy": 2.3926843643188476, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.22229372709989548, "loss/reg": 0.0, "step": 31400 }, { "epoch": 0.20664473684210527, "grad_norm": 2.421875, "grad_norm_var": 0.16061986287434896, "learning_rate": 0.0001, "loss": 3.0372, "loss/crossentropy": 2.2162522673606873, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.21594754755496978, "loss/reg": 0.0, "step": 31410 }, { "epoch": 0.20671052631578948, "grad_norm": 2.171875, "grad_norm_var": 0.19730606079101562, "learning_rate": 0.0001, "loss": 2.9993, "loss/crossentropy": 2.2487839818000794, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.23367298394441605, "loss/reg": 0.0, "step": 31420 }, { "epoch": 0.2067763157894737, "grad_norm": 2.140625, "grad_norm_var": 0.22303059895833333, "learning_rate": 0.0001, "loss": 3.03, "loss/crossentropy": 2.304809939861298, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.21387835443019867, "loss/reg": 0.0, "step": 31430 }, { "epoch": 0.20684210526315788, "grad_norm": 2.421875, "grad_norm_var": 0.18980204264322917, "learning_rate": 0.0001, "loss": 3.0591, "loss/crossentropy": 2.428087902069092, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.2132945567369461, "loss/reg": 0.0, "step": 31440 }, { "epoch": 0.2069078947368421, "grad_norm": 2.4375, "grad_norm_var": 0.11033426920572917, "learning_rate": 0.0001, "loss": 3.0247, "loss/crossentropy": 2.3415855526924134, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2590311750769615, "loss/reg": 0.0, "step": 31450 }, { "epoch": 0.2069736842105263, "grad_norm": 2.171875, "grad_norm_var": 2.33717041015625, "learning_rate": 0.0001, "loss": 3.0733, "loss/crossentropy": 2.0509816646575927, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.22177468091249466, "loss/reg": 0.0, "step": 31460 }, { "epoch": 0.20703947368421052, "grad_norm": 2.125, "grad_norm_var": 0.06643473307291667, "learning_rate": 0.0001, "loss": 3.01, "loss/crossentropy": 2.0959688067436217, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2369927003979683, "loss/reg": 0.0, "step": 31470 }, { "epoch": 0.20710526315789474, "grad_norm": 2.234375, "grad_norm_var": 0.07058690388997396, "learning_rate": 0.0001, "loss": 3.0335, "loss/crossentropy": 2.2887695908546446, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.21632779389619827, "loss/reg": 0.0, "step": 31480 }, { "epoch": 0.20717105263157895, "grad_norm": 2.171875, "grad_norm_var": 0.0905914306640625, "learning_rate": 0.0001, "loss": 3.022, "loss/crossentropy": 2.251658821105957, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.27912095934152603, "loss/reg": 0.0, "step": 31490 }, { "epoch": 0.20723684210526316, "grad_norm": 2.234375, "grad_norm_var": 0.13886617024739584, "learning_rate": 0.0001, "loss": 3.0013, "loss/crossentropy": 2.200524830818176, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.20394676253199578, "loss/reg": 0.0, "step": 31500 }, { "epoch": 0.20730263157894738, "grad_norm": 2.328125, "grad_norm_var": 0.11672770182291667, "learning_rate": 0.0001, "loss": 3.1006, "loss/crossentropy": 2.410679817199707, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.2168733671307564, "loss/reg": 0.0, "step": 31510 }, { "epoch": 0.2073684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.16353759765625, "learning_rate": 0.0001, "loss": 2.9942, "loss/crossentropy": 2.1653492391109466, "loss/hidden": 2.5796875, "loss/incoh": 0.0, "loss/logits": 0.18678945749998094, "loss/reg": 0.0, "step": 31520 }, { "epoch": 0.20743421052631578, "grad_norm": 2.546875, "grad_norm_var": 0.22099202473958332, "learning_rate": 0.0001, "loss": 3.0174, "loss/crossentropy": 2.4728673577308653, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.24382270276546478, "loss/reg": 0.0, "step": 31530 }, { "epoch": 0.2075, "grad_norm": 2.921875, "grad_norm_var": 0.20640869140625, "learning_rate": 0.0001, "loss": 3.065, "loss/crossentropy": 2.3791961908340453, "loss/hidden": 3.0109375, "loss/incoh": 0.0, "loss/logits": 0.29753602892160413, "loss/reg": 0.0, "step": 31540 }, { "epoch": 0.2075657894736842, "grad_norm": 2.125, "grad_norm_var": 0.07801005045572916, "learning_rate": 0.0001, "loss": 2.9835, "loss/crossentropy": 2.1349298536777495, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.2179326094686985, "loss/reg": 0.0, "step": 31550 }, { "epoch": 0.20763157894736842, "grad_norm": 1.9921875, "grad_norm_var": 0.3508989969889323, "learning_rate": 0.0001, "loss": 3.0481, "loss/crossentropy": 2.128028416633606, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.24575520306825638, "loss/reg": 0.0, "step": 31560 }, { "epoch": 0.20769736842105263, "grad_norm": 2.140625, "grad_norm_var": 0.36219253540039065, "learning_rate": 0.0001, "loss": 2.992, "loss/crossentropy": 2.366794526576996, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.2530692145228386, "loss/reg": 0.0, "step": 31570 }, { "epoch": 0.20776315789473684, "grad_norm": 2.125, "grad_norm_var": 0.18565165201822917, "learning_rate": 0.0001, "loss": 2.9888, "loss/crossentropy": 2.5053478240966798, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.24881082475185395, "loss/reg": 0.0, "step": 31580 }, { "epoch": 0.20782894736842106, "grad_norm": 2.484375, "grad_norm_var": 0.22190348307291666, "learning_rate": 0.0001, "loss": 3.0479, "loss/crossentropy": 2.115998589992523, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2497893139719963, "loss/reg": 0.0, "step": 31590 }, { "epoch": 0.20789473684210527, "grad_norm": 2.34375, "grad_norm_var": 0.0385406494140625, "learning_rate": 0.0001, "loss": 2.9967, "loss/crossentropy": 2.2767433404922484, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.22527251839637757, "loss/reg": 0.0, "step": 31600 }, { "epoch": 0.20796052631578948, "grad_norm": 2.140625, "grad_norm_var": 0.11295166015625, "learning_rate": 0.0001, "loss": 3.0751, "loss/crossentropy": 2.271492937207222, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.2081324838101864, "loss/reg": 0.0, "step": 31610 }, { "epoch": 0.2080263157894737, "grad_norm": 2.1875, "grad_norm_var": 0.1297271728515625, "learning_rate": 0.0001, "loss": 3.0283, "loss/crossentropy": 2.107327163219452, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.23592574074864386, "loss/reg": 0.0, "step": 31620 }, { "epoch": 0.20809210526315788, "grad_norm": 2.078125, "grad_norm_var": 0.06132583618164063, "learning_rate": 0.0001, "loss": 3.0553, "loss/crossentropy": 2.4657732486724853, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.2265238046646118, "loss/reg": 0.0, "step": 31630 }, { "epoch": 0.2081578947368421, "grad_norm": 2.59375, "grad_norm_var": 0.5120076497395833, "learning_rate": 0.0001, "loss": 3.0358, "loss/crossentropy": 2.4768943071365355, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2473340943455696, "loss/reg": 0.0, "step": 31640 }, { "epoch": 0.2082236842105263, "grad_norm": 2.734375, "grad_norm_var": 0.35252176920572914, "learning_rate": 0.0001, "loss": 3.0949, "loss/crossentropy": 2.5071337461471557, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.23168605715036392, "loss/reg": 0.0, "step": 31650 }, { "epoch": 0.20828947368421052, "grad_norm": 2.484375, "grad_norm_var": 0.19873758951822917, "learning_rate": 0.0001, "loss": 3.0033, "loss/crossentropy": 2.3884094834327696, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.2127884179353714, "loss/reg": 0.0, "step": 31660 }, { "epoch": 0.20835526315789474, "grad_norm": 3.421875, "grad_norm_var": 0.33858133951822916, "learning_rate": 0.0001, "loss": 2.9746, "loss/crossentropy": 2.172113299369812, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.20361884087324142, "loss/reg": 0.0, "step": 31670 }, { "epoch": 0.20842105263157895, "grad_norm": 2.28125, "grad_norm_var": 0.36571858723958334, "learning_rate": 0.0001, "loss": 2.9778, "loss/crossentropy": 2.3501265048980713, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.21666382998228073, "loss/reg": 0.0, "step": 31680 }, { "epoch": 0.20848684210526316, "grad_norm": 2.546875, "grad_norm_var": 0.12886962890625, "learning_rate": 0.0001, "loss": 3.1071, "loss/crossentropy": 2.8127560138702394, "loss/hidden": 3.221875, "loss/incoh": 0.0, "loss/logits": 0.25830034017562864, "loss/reg": 0.0, "step": 31690 }, { "epoch": 0.20855263157894738, "grad_norm": 2.34375, "grad_norm_var": 0.04257405598958333, "learning_rate": 0.0001, "loss": 3.0355, "loss/crossentropy": 2.0618197679519654, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.22268864512443542, "loss/reg": 0.0, "step": 31700 }, { "epoch": 0.2086184210526316, "grad_norm": 2.03125, "grad_norm_var": 0.10972391764322917, "learning_rate": 0.0001, "loss": 2.9789, "loss/crossentropy": 2.4682215332984923, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.27270212322473525, "loss/reg": 0.0, "step": 31710 }, { "epoch": 0.20868421052631578, "grad_norm": 2.5, "grad_norm_var": 0.108251953125, "learning_rate": 0.0001, "loss": 3.0187, "loss/crossentropy": 2.3017919063568115, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.2289327010512352, "loss/reg": 0.0, "step": 31720 }, { "epoch": 0.20875, "grad_norm": 2.3125, "grad_norm_var": 0.08994140625, "learning_rate": 0.0001, "loss": 3.0376, "loss/crossentropy": 2.2758594751358032, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.23465103954076766, "loss/reg": 0.0, "step": 31730 }, { "epoch": 0.2088157894736842, "grad_norm": 2.15625, "grad_norm_var": 0.0693511962890625, "learning_rate": 0.0001, "loss": 2.9794, "loss/crossentropy": 2.3359742760658264, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.25490144789218905, "loss/reg": 0.0, "step": 31740 }, { "epoch": 0.20888157894736842, "grad_norm": 2.375, "grad_norm_var": 0.035399373372395834, "learning_rate": 0.0001, "loss": 3.0153, "loss/crossentropy": 2.2797624588012697, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.1947791814804077, "loss/reg": 0.0, "step": 31750 }, { "epoch": 0.20894736842105263, "grad_norm": 2.25, "grad_norm_var": 0.048460896809895834, "learning_rate": 0.0001, "loss": 3.0293, "loss/crossentropy": 2.221743679046631, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2385573446750641, "loss/reg": 0.0, "step": 31760 }, { "epoch": 0.20901315789473685, "grad_norm": 2.4375, "grad_norm_var": 0.0992828369140625, "learning_rate": 0.0001, "loss": 3.0352, "loss/crossentropy": 2.3273940563201903, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.22714275866746902, "loss/reg": 0.0, "step": 31770 }, { "epoch": 0.20907894736842106, "grad_norm": 2.484375, "grad_norm_var": 0.05660400390625, "learning_rate": 0.0001, "loss": 2.9783, "loss/crossentropy": 2.5335972189903258, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.22051512748003005, "loss/reg": 0.0, "step": 31780 }, { "epoch": 0.20914473684210527, "grad_norm": 2.28125, "grad_norm_var": 0.0770172119140625, "learning_rate": 0.0001, "loss": 3.0626, "loss/crossentropy": 2.1290286660194395, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.24563827514648437, "loss/reg": 0.0, "step": 31790 }, { "epoch": 0.20921052631578949, "grad_norm": 2.875, "grad_norm_var": 0.12021484375, "learning_rate": 0.0001, "loss": 3.1026, "loss/crossentropy": 2.277496612071991, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.23920933827757834, "loss/reg": 0.0, "step": 31800 }, { "epoch": 0.20927631578947367, "grad_norm": 2.25, "grad_norm_var": 0.07742513020833333, "learning_rate": 0.0001, "loss": 3.0043, "loss/crossentropy": 2.3269221425056457, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.2029653638601303, "loss/reg": 0.0, "step": 31810 }, { "epoch": 0.20934210526315788, "grad_norm": 2.296875, "grad_norm_var": 0.02303034464518229, "learning_rate": 0.0001, "loss": 3.0615, "loss/crossentropy": 2.352055883407593, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.2182576224207878, "loss/reg": 0.0, "step": 31820 }, { "epoch": 0.2094078947368421, "grad_norm": 2.21875, "grad_norm_var": 0.048500315348307295, "learning_rate": 0.0001, "loss": 3.0806, "loss/crossentropy": 2.143119287490845, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.24638065993785857, "loss/reg": 0.0, "step": 31830 }, { "epoch": 0.2094736842105263, "grad_norm": 2.171875, "grad_norm_var": 0.13288472493489584, "learning_rate": 0.0001, "loss": 3.1094, "loss/crossentropy": 2.344961977005005, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.24402716457843782, "loss/reg": 0.0, "step": 31840 }, { "epoch": 0.20953947368421053, "grad_norm": 2.5625, "grad_norm_var": 0.39104817708333334, "learning_rate": 0.0001, "loss": 3.0091, "loss/crossentropy": 2.3289579272270204, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.21923962533473967, "loss/reg": 0.0, "step": 31850 }, { "epoch": 0.20960526315789474, "grad_norm": 2.25, "grad_norm_var": 0.35844624837239586, "learning_rate": 0.0001, "loss": 3.1472, "loss/crossentropy": 2.2578481793403626, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.2393498733639717, "loss/reg": 0.0, "step": 31860 }, { "epoch": 0.20967105263157895, "grad_norm": 2.6875, "grad_norm_var": 0.103369140625, "learning_rate": 0.0001, "loss": 3.0043, "loss/crossentropy": 2.1479632019996644, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.21754586473107337, "loss/reg": 0.0, "step": 31870 }, { "epoch": 0.20973684210526317, "grad_norm": 2.265625, "grad_norm_var": 0.05641988118489583, "learning_rate": 0.0001, "loss": 3.0913, "loss/crossentropy": 2.31242595911026, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.3139140591025352, "loss/reg": 0.0, "step": 31880 }, { "epoch": 0.20980263157894738, "grad_norm": 2.4375, "grad_norm_var": 0.14700113932291667, "learning_rate": 0.0001, "loss": 3.0276, "loss/crossentropy": 2.421666181087494, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.21645313799381255, "loss/reg": 0.0, "step": 31890 }, { "epoch": 0.20986842105263157, "grad_norm": 2.15625, "grad_norm_var": 0.6790028889973958, "learning_rate": 0.0001, "loss": 3.0781, "loss/crossentropy": 2.2751805901527407, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.22757814675569535, "loss/reg": 0.0, "step": 31900 }, { "epoch": 0.20993421052631578, "grad_norm": 2.109375, "grad_norm_var": 0.14388020833333334, "learning_rate": 0.0001, "loss": 2.978, "loss/crossentropy": 2.3446357250213623, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.21789460927248, "loss/reg": 0.0, "step": 31910 }, { "epoch": 0.21, "grad_norm": 2.40625, "grad_norm_var": 0.1076324462890625, "learning_rate": 0.0001, "loss": 3.0473, "loss/crossentropy": 2.3983862519264223, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.21307818740606307, "loss/reg": 0.0, "step": 31920 }, { "epoch": 0.2100657894736842, "grad_norm": 2.59375, "grad_norm_var": 0.03589579264322917, "learning_rate": 0.0001, "loss": 3.0171, "loss/crossentropy": 2.37126282453537, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.205704665184021, "loss/reg": 0.0, "step": 31930 }, { "epoch": 0.21013157894736842, "grad_norm": 2.25, "grad_norm_var": 0.08915608723958333, "learning_rate": 0.0001, "loss": 3.0071, "loss/crossentropy": 2.2473104357719422, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.2207282856106758, "loss/reg": 0.0, "step": 31940 }, { "epoch": 0.21019736842105263, "grad_norm": 2.296875, "grad_norm_var": 0.03990478515625, "learning_rate": 0.0001, "loss": 3.0541, "loss/crossentropy": 2.4488544821739198, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.26014447808265684, "loss/reg": 0.0, "step": 31950 }, { "epoch": 0.21026315789473685, "grad_norm": 2.25, "grad_norm_var": 0.0229156494140625, "learning_rate": 0.0001, "loss": 3.0762, "loss/crossentropy": 1.9240918695926665, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.19550290815532206, "loss/reg": 0.0, "step": 31960 }, { "epoch": 0.21032894736842106, "grad_norm": 2.296875, "grad_norm_var": 0.03746337890625, "learning_rate": 0.0001, "loss": 3.0402, "loss/crossentropy": 2.1934541881084444, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.19934962689876556, "loss/reg": 0.0, "step": 31970 }, { "epoch": 0.21039473684210527, "grad_norm": 2.203125, "grad_norm_var": 0.03241780598958333, "learning_rate": 0.0001, "loss": 2.9459, "loss/crossentropy": 2.071032130718231, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.20514383763074875, "loss/reg": 0.0, "step": 31980 }, { "epoch": 0.2104605263157895, "grad_norm": 2.484375, "grad_norm_var": 0.02534764607747396, "learning_rate": 0.0001, "loss": 3.0513, "loss/crossentropy": 2.274570310115814, "loss/hidden": 2.9515625, "loss/incoh": 0.0, "loss/logits": 0.3155985027551651, "loss/reg": 0.0, "step": 31990 }, { "epoch": 0.21052631578947367, "grad_norm": 2.8125, "grad_norm_var": 3.301877391722643e+17, "learning_rate": 0.0001, "loss": 3.1093, "loss/crossentropy": 2.0131736040115356, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.2141942039132118, "loss/reg": 0.0, "step": 32000 }, { "epoch": 0.21059210526315789, "grad_norm": 3.078125, "grad_norm_var": 0.2028717041015625, "learning_rate": 0.0001, "loss": 2.9957, "loss/crossentropy": 2.213227319717407, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.22059416845440866, "loss/reg": 0.0, "step": 32010 }, { "epoch": 0.2106578947368421, "grad_norm": 2.53125, "grad_norm_var": 0.09852676391601563, "learning_rate": 0.0001, "loss": 2.9318, "loss/crossentropy": 2.2327592849731444, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.19538245126605033, "loss/reg": 0.0, "step": 32020 }, { "epoch": 0.2107236842105263, "grad_norm": 2.125, "grad_norm_var": 0.10897191365559895, "learning_rate": 0.0001, "loss": 2.9561, "loss/crossentropy": 2.415834832191467, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.2341999500989914, "loss/reg": 0.0, "step": 32030 }, { "epoch": 0.21078947368421053, "grad_norm": 2.328125, "grad_norm_var": 0.0959185282389323, "learning_rate": 0.0001, "loss": 2.9979, "loss/crossentropy": 2.4787596464157104, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.23873308449983596, "loss/reg": 0.0, "step": 32040 }, { "epoch": 0.21085526315789474, "grad_norm": 2.359375, "grad_norm_var": 0.12280985514322916, "learning_rate": 0.0001, "loss": 2.9578, "loss/crossentropy": 2.3683923482894897, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.22164048850536347, "loss/reg": 0.0, "step": 32050 }, { "epoch": 0.21092105263157895, "grad_norm": 4.40625, "grad_norm_var": 0.3617828369140625, "learning_rate": 0.0001, "loss": 3.0122, "loss/crossentropy": 2.336903250217438, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.2218891516327858, "loss/reg": 0.0, "step": 32060 }, { "epoch": 0.21098684210526317, "grad_norm": 2.046875, "grad_norm_var": 0.31454671223958336, "learning_rate": 0.0001, "loss": 2.9132, "loss/crossentropy": 2.3459483861923216, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.20081116408109664, "loss/reg": 0.0, "step": 32070 }, { "epoch": 0.21105263157894738, "grad_norm": 2.171875, "grad_norm_var": 0.04011942545572917, "learning_rate": 0.0001, "loss": 2.9616, "loss/crossentropy": 2.1744574666023255, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.21101347357034683, "loss/reg": 0.0, "step": 32080 }, { "epoch": 0.21111842105263157, "grad_norm": 3.921875, "grad_norm_var": 0.22661107381184895, "learning_rate": 0.0001, "loss": 2.9912, "loss/crossentropy": 2.1881649017333986, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2135306864976883, "loss/reg": 0.0, "step": 32090 }, { "epoch": 0.21118421052631578, "grad_norm": 2.3125, "grad_norm_var": 0.20420506795247395, "learning_rate": 0.0001, "loss": 3.0331, "loss/crossentropy": 2.0982267916202546, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.20456733107566832, "loss/reg": 0.0, "step": 32100 }, { "epoch": 0.21125, "grad_norm": 2.15625, "grad_norm_var": 0.12456766764322917, "learning_rate": 0.0001, "loss": 2.9625, "loss/crossentropy": 2.291160595417023, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2443986713886261, "loss/reg": 0.0, "step": 32110 }, { "epoch": 0.2113157894736842, "grad_norm": 2.125, "grad_norm_var": 0.3302968343098958, "learning_rate": 0.0001, "loss": 3.0737, "loss/crossentropy": 2.3988569140434266, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.22473453879356384, "loss/reg": 0.0, "step": 32120 }, { "epoch": 0.21138157894736842, "grad_norm": 2.671875, "grad_norm_var": 1.0375, "learning_rate": 0.0001, "loss": 2.9681, "loss/crossentropy": 2.157117784023285, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.25956482589244845, "loss/reg": 0.0, "step": 32130 }, { "epoch": 0.21144736842105263, "grad_norm": 2.234375, "grad_norm_var": 0.967632802327474, "learning_rate": 0.0001, "loss": 3.0129, "loss/crossentropy": 2.007625603675842, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.21298689395189285, "loss/reg": 0.0, "step": 32140 }, { "epoch": 0.21151315789473685, "grad_norm": 2.6875, "grad_norm_var": 0.09070536295572916, "learning_rate": 0.0001, "loss": 2.9862, "loss/crossentropy": 2.002223217487335, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.18768413811922074, "loss/reg": 0.0, "step": 32150 }, { "epoch": 0.21157894736842106, "grad_norm": 2.40625, "grad_norm_var": 0.04879557291666667, "learning_rate": 0.0001, "loss": 3.0679, "loss/crossentropy": 2.470254373550415, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2670274436473846, "loss/reg": 0.0, "step": 32160 }, { "epoch": 0.21164473684210527, "grad_norm": 2.484375, "grad_norm_var": 0.04566650390625, "learning_rate": 0.0001, "loss": 2.9691, "loss/crossentropy": 2.489493703842163, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2385897770524025, "loss/reg": 0.0, "step": 32170 }, { "epoch": 0.21171052631578946, "grad_norm": 2.625, "grad_norm_var": 0.051512654622395834, "learning_rate": 0.0001, "loss": 3.079, "loss/crossentropy": 2.4059490084648134, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2276516616344452, "loss/reg": 0.0, "step": 32180 }, { "epoch": 0.21177631578947367, "grad_norm": 2.15625, "grad_norm_var": 0.18515625, "learning_rate": 0.0001, "loss": 2.9959, "loss/crossentropy": 2.1902307987213137, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.22064511626958846, "loss/reg": 0.0, "step": 32190 }, { "epoch": 0.2118421052631579, "grad_norm": 3.09375, "grad_norm_var": 0.21318359375, "learning_rate": 0.0001, "loss": 3.0886, "loss/crossentropy": 2.2975401639938355, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.21864379346370696, "loss/reg": 0.0, "step": 32200 }, { "epoch": 0.2119078947368421, "grad_norm": 3.65625, "grad_norm_var": 0.3596425374348958, "learning_rate": 0.0001, "loss": 3.0936, "loss/crossentropy": 2.3152296900749207, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.21215650141239167, "loss/reg": 0.0, "step": 32210 }, { "epoch": 0.21197368421052631, "grad_norm": 2.171875, "grad_norm_var": 0.4088053385416667, "learning_rate": 0.0001, "loss": 3.0212, "loss/crossentropy": 2.345796763896942, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.22906456291675567, "loss/reg": 0.0, "step": 32220 }, { "epoch": 0.21203947368421053, "grad_norm": 2.5, "grad_norm_var": 0.36056315104166664, "learning_rate": 0.0001, "loss": 2.9913, "loss/crossentropy": 2.2359737753868103, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.21989362984895705, "loss/reg": 0.0, "step": 32230 }, { "epoch": 0.21210526315789474, "grad_norm": 2.390625, "grad_norm_var": 0.1499908447265625, "learning_rate": 0.0001, "loss": 2.9931, "loss/crossentropy": 2.3540435433387756, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.20313581377267836, "loss/reg": 0.0, "step": 32240 }, { "epoch": 0.21217105263157895, "grad_norm": 3.171875, "grad_norm_var": 0.08063151041666666, "learning_rate": 0.0001, "loss": 2.966, "loss/crossentropy": 2.3999782681465147, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.20714087784290314, "loss/reg": 0.0, "step": 32250 }, { "epoch": 0.21223684210526317, "grad_norm": 2.25, "grad_norm_var": 0.08381245930989584, "learning_rate": 0.0001, "loss": 3.0748, "loss/crossentropy": 2.40082049369812, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.24139860570430755, "loss/reg": 0.0, "step": 32260 }, { "epoch": 0.21230263157894738, "grad_norm": 2.40625, "grad_norm_var": 0.0259185791015625, "learning_rate": 0.0001, "loss": 2.9642, "loss/crossentropy": 2.5221059322357178, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.24260111004114152, "loss/reg": 0.0, "step": 32270 }, { "epoch": 0.21236842105263157, "grad_norm": 3.421875, "grad_norm_var": 5.511455956857455e+17, "learning_rate": 0.0001, "loss": 3.1235, "loss/crossentropy": 2.582364892959595, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.2309617057442665, "loss/reg": 0.0, "step": 32280 }, { "epoch": 0.21243421052631578, "grad_norm": 2.578125, "grad_norm_var": 8.515180979484448e+17, "learning_rate": 0.0001, "loss": 3.1265, "loss/crossentropy": 1.9406600832939147, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.25122445076704025, "loss/reg": 0.0, "step": 32290 }, { "epoch": 0.2125, "grad_norm": 2.5, "grad_norm_var": 0.11577860514322917, "learning_rate": 0.0001, "loss": 3.0392, "loss/crossentropy": 1.9736381709575652, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.24801484644412994, "loss/reg": 0.0, "step": 32300 }, { "epoch": 0.2125657894736842, "grad_norm": 2.28125, "grad_norm_var": 0.11686909993489583, "learning_rate": 0.0001, "loss": 3.0, "loss/crossentropy": 2.57318320274353, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.25067331790924074, "loss/reg": 0.0, "step": 32310 }, { "epoch": 0.21263157894736842, "grad_norm": 2.234375, "grad_norm_var": 0.11437352498372395, "learning_rate": 0.0001, "loss": 2.9663, "loss/crossentropy": 2.3444815397262575, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.23782878071069719, "loss/reg": 0.0, "step": 32320 }, { "epoch": 0.21269736842105263, "grad_norm": 2.359375, "grad_norm_var": 0.05796483357747396, "learning_rate": 0.0001, "loss": 2.9547, "loss/crossentropy": 2.2045453786849976, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.2417543426156044, "loss/reg": 0.0, "step": 32330 }, { "epoch": 0.21276315789473685, "grad_norm": 2.171875, "grad_norm_var": 0.045344034830729164, "learning_rate": 0.0001, "loss": 3.0757, "loss/crossentropy": 2.349012243747711, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.2658751055598259, "loss/reg": 0.0, "step": 32340 }, { "epoch": 0.21282894736842106, "grad_norm": 2.25, "grad_norm_var": 0.04952799479166667, "learning_rate": 0.0001, "loss": 2.9861, "loss/crossentropy": 2.4626625537872315, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.25371641367673875, "loss/reg": 0.0, "step": 32350 }, { "epoch": 0.21289473684210528, "grad_norm": 2.40625, "grad_norm_var": 0.06630223592122396, "learning_rate": 0.0001, "loss": 2.9574, "loss/crossentropy": 2.186726263165474, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.20951557606458665, "loss/reg": 0.0, "step": 32360 }, { "epoch": 0.21296052631578946, "grad_norm": 2.71875, "grad_norm_var": 0.10197728474934896, "learning_rate": 0.0001, "loss": 2.965, "loss/crossentropy": 2.340285396575928, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.23474745452404022, "loss/reg": 0.0, "step": 32370 }, { "epoch": 0.21302631578947367, "grad_norm": 2.6875, "grad_norm_var": 0.157373046875, "learning_rate": 0.0001, "loss": 3.0962, "loss/crossentropy": 2.47646986246109, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.26605418920516966, "loss/reg": 0.0, "step": 32380 }, { "epoch": 0.2130921052631579, "grad_norm": 2.15625, "grad_norm_var": 0.10019429524739583, "learning_rate": 0.0001, "loss": 2.9578, "loss/crossentropy": 1.7385891020298003, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.17920974977314472, "loss/reg": 0.0, "step": 32390 }, { "epoch": 0.2131578947368421, "grad_norm": 2.171875, "grad_norm_var": 0.14401041666666667, "learning_rate": 0.0001, "loss": 2.9707, "loss/crossentropy": 2.182602137327194, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.2166632428765297, "loss/reg": 0.0, "step": 32400 }, { "epoch": 0.21322368421052632, "grad_norm": 2.375, "grad_norm_var": 0.15817057291666667, "learning_rate": 0.0001, "loss": 3.0277, "loss/crossentropy": 2.3320940494537354, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.263668055832386, "loss/reg": 0.0, "step": 32410 }, { "epoch": 0.21328947368421053, "grad_norm": 2.859375, "grad_norm_var": 0.156201171875, "learning_rate": 0.0001, "loss": 3.0197, "loss/crossentropy": 2.338312292098999, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.22362788915634155, "loss/reg": 0.0, "step": 32420 }, { "epoch": 0.21335526315789474, "grad_norm": 2.15625, "grad_norm_var": 0.376025390625, "learning_rate": 0.0001, "loss": 3.0193, "loss/crossentropy": 2.3314969301223756, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.2156234934926033, "loss/reg": 0.0, "step": 32430 }, { "epoch": 0.21342105263157896, "grad_norm": 2.296875, "grad_norm_var": 0.6317545572916666, "learning_rate": 0.0001, "loss": 2.97, "loss/crossentropy": 2.3391250133514405, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.21457959488034248, "loss/reg": 0.0, "step": 32440 }, { "epoch": 0.21348684210526317, "grad_norm": 2.5625, "grad_norm_var": 0.3609375, "learning_rate": 0.0001, "loss": 3.0389, "loss/crossentropy": 2.267109489440918, "loss/hidden": 3.1234375, "loss/incoh": 0.0, "loss/logits": 0.3404072761535645, "loss/reg": 0.0, "step": 32450 }, { "epoch": 0.21355263157894736, "grad_norm": 2.390625, "grad_norm_var": 0.06378580729166666, "learning_rate": 0.0001, "loss": 2.9758, "loss/crossentropy": 2.3441552996635435, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.24242025315761567, "loss/reg": 0.0, "step": 32460 }, { "epoch": 0.21361842105263157, "grad_norm": 2.46875, "grad_norm_var": 0.0823883056640625, "learning_rate": 0.0001, "loss": 3.0308, "loss/crossentropy": 2.2937331914901735, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.22603273689746856, "loss/reg": 0.0, "step": 32470 }, { "epoch": 0.21368421052631578, "grad_norm": 2.921875, "grad_norm_var": 0.14341532389322917, "learning_rate": 0.0001, "loss": 3.0095, "loss/crossentropy": 2.071279937028885, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.1986673153936863, "loss/reg": 0.0, "step": 32480 }, { "epoch": 0.21375, "grad_norm": 2.828125, "grad_norm_var": 0.11785481770833334, "learning_rate": 0.0001, "loss": 3.0166, "loss/crossentropy": 2.0449776887893676, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.2660417690873146, "loss/reg": 0.0, "step": 32490 }, { "epoch": 0.2138157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.07841389973958333, "learning_rate": 0.0001, "loss": 3.0209, "loss/crossentropy": 2.22779695391655, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2454911906272173, "loss/reg": 0.0, "step": 32500 }, { "epoch": 0.21388157894736842, "grad_norm": 2.28125, "grad_norm_var": 0.22735087076822916, "learning_rate": 0.0001, "loss": 3.0449, "loss/crossentropy": 2.1109942197799683, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.25361132323741914, "loss/reg": 0.0, "step": 32510 }, { "epoch": 0.21394736842105264, "grad_norm": 2.109375, "grad_norm_var": 0.21687825520833334, "learning_rate": 0.0001, "loss": 2.9946, "loss/crossentropy": 2.594680833816528, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.24347000420093537, "loss/reg": 0.0, "step": 32520 }, { "epoch": 0.21401315789473685, "grad_norm": 2.25, "grad_norm_var": 0.027469889322916666, "learning_rate": 0.0001, "loss": 2.935, "loss/crossentropy": 2.4400468468666077, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.2244595929980278, "loss/reg": 0.0, "step": 32530 }, { "epoch": 0.21407894736842106, "grad_norm": 2.34375, "grad_norm_var": 0.02568359375, "learning_rate": 0.0001, "loss": 2.9661, "loss/crossentropy": 2.336955714225769, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.1992204010486603, "loss/reg": 0.0, "step": 32540 }, { "epoch": 0.21414473684210528, "grad_norm": 2.109375, "grad_norm_var": 0.051488240559895836, "learning_rate": 0.0001, "loss": 3.0259, "loss/crossentropy": 2.2532342314720153, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.26036129891872406, "loss/reg": 0.0, "step": 32550 }, { "epoch": 0.21421052631578946, "grad_norm": 1.9765625, "grad_norm_var": 0.03230158487955729, "learning_rate": 0.0001, "loss": 2.9443, "loss/crossentropy": 2.112717604637146, "loss/hidden": 2.5703125, "loss/incoh": 0.0, "loss/logits": 0.17500633224844933, "loss/reg": 0.0, "step": 32560 }, { "epoch": 0.21427631578947368, "grad_norm": 2.40625, "grad_norm_var": 0.03321711222330729, "learning_rate": 0.0001, "loss": 3.0203, "loss/crossentropy": 2.288592982292175, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.22093368023633958, "loss/reg": 0.0, "step": 32570 }, { "epoch": 0.2143421052631579, "grad_norm": 2.390625, "grad_norm_var": 0.32156575520833336, "learning_rate": 0.0001, "loss": 2.9199, "loss/crossentropy": 2.1705434560775756, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.20051559060811996, "loss/reg": 0.0, "step": 32580 }, { "epoch": 0.2144078947368421, "grad_norm": 2.359375, "grad_norm_var": 0.06936442057291667, "learning_rate": 0.0001, "loss": 3.0023, "loss/crossentropy": 2.0764291286468506, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.20810510888695716, "loss/reg": 0.0, "step": 32590 }, { "epoch": 0.21447368421052632, "grad_norm": 2.03125, "grad_norm_var": 0.09537760416666667, "learning_rate": 0.0001, "loss": 2.9267, "loss/crossentropy": 2.377918064594269, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.23365774154663085, "loss/reg": 0.0, "step": 32600 }, { "epoch": 0.21453947368421053, "grad_norm": 2.671875, "grad_norm_var": 0.13495686848958333, "learning_rate": 0.0001, "loss": 3.0409, "loss/crossentropy": 2.418037164211273, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.2420778289437294, "loss/reg": 0.0, "step": 32610 }, { "epoch": 0.21460526315789474, "grad_norm": 2.875, "grad_norm_var": 0.37043863932291665, "learning_rate": 0.0001, "loss": 2.9946, "loss/crossentropy": 2.272017073631287, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.213658407330513, "loss/reg": 0.0, "step": 32620 }, { "epoch": 0.21467105263157896, "grad_norm": 2.53125, "grad_norm_var": 0.40995686848958335, "learning_rate": 0.0001, "loss": 2.9827, "loss/crossentropy": 2.6424944162368775, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.2500556200742722, "loss/reg": 0.0, "step": 32630 }, { "epoch": 0.21473684210526317, "grad_norm": 2.234375, "grad_norm_var": 6.259407552083333, "learning_rate": 0.0001, "loss": 3.0884, "loss/crossentropy": 2.5622973680496215, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.23999259918928145, "loss/reg": 0.0, "step": 32640 }, { "epoch": 0.21480263157894736, "grad_norm": 6.65625, "grad_norm_var": 1.3544270833333334, "learning_rate": 0.0001, "loss": 3.0648, "loss/crossentropy": 2.322819399833679, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.288960862159729, "loss/reg": 0.0, "step": 32650 }, { "epoch": 0.21486842105263157, "grad_norm": 2.296875, "grad_norm_var": 1.3127237955729167, "learning_rate": 0.0001, "loss": 3.0818, "loss/crossentropy": 2.179678440093994, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.31069366484880445, "loss/reg": 0.0, "step": 32660 }, { "epoch": 0.21493421052631578, "grad_norm": 2.640625, "grad_norm_var": 0.10913798014322916, "learning_rate": 0.0001, "loss": 3.0115, "loss/crossentropy": 2.2929873704910277, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.24380201399326323, "loss/reg": 0.0, "step": 32670 }, { "epoch": 0.215, "grad_norm": 2.5, "grad_norm_var": 0.05563151041666667, "learning_rate": 0.0001, "loss": 3.017, "loss/crossentropy": 1.9826122522354126, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.18868125975131989, "loss/reg": 0.0, "step": 32680 }, { "epoch": 0.2150657894736842, "grad_norm": 2.21875, "grad_norm_var": 0.08579813639322917, "learning_rate": 0.0001, "loss": 2.9559, "loss/crossentropy": 2.4682058572769163, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.22441071420907974, "loss/reg": 0.0, "step": 32690 }, { "epoch": 0.21513157894736842, "grad_norm": 2.265625, "grad_norm_var": 0.07283503214518229, "learning_rate": 0.0001, "loss": 2.9565, "loss/crossentropy": 2.255161941051483, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.2831642434000969, "loss/reg": 0.0, "step": 32700 }, { "epoch": 0.21519736842105264, "grad_norm": 2.25, "grad_norm_var": 0.1724273681640625, "learning_rate": 0.0001, "loss": 3.0657, "loss/crossentropy": 2.233939862251282, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.21798643767833709, "loss/reg": 0.0, "step": 32710 }, { "epoch": 0.21526315789473685, "grad_norm": 3.1875, "grad_norm_var": 1.3363352457682292, "learning_rate": 0.0001, "loss": 3.0325, "loss/crossentropy": 2.5652535796165465, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2587194755673409, "loss/reg": 0.0, "step": 32720 }, { "epoch": 0.21532894736842106, "grad_norm": 2.484375, "grad_norm_var": 0.130419921875, "learning_rate": 0.0001, "loss": 3.0003, "loss/crossentropy": 2.1636557817459106, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.20836876183748246, "loss/reg": 0.0, "step": 32730 }, { "epoch": 0.21539473684210525, "grad_norm": 2.046875, "grad_norm_var": 0.035302734375, "learning_rate": 0.0001, "loss": 2.9321, "loss/crossentropy": 2.4362986207008364, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.24660978615283966, "loss/reg": 0.0, "step": 32740 }, { "epoch": 0.21546052631578946, "grad_norm": 2.5, "grad_norm_var": 0.035105133056640626, "learning_rate": 0.0001, "loss": 2.9716, "loss/crossentropy": 2.255325746536255, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.24538252800703048, "loss/reg": 0.0, "step": 32750 }, { "epoch": 0.21552631578947368, "grad_norm": 2.515625, "grad_norm_var": 0.09314676920572916, "learning_rate": 0.0001, "loss": 3.0161, "loss/crossentropy": 2.329508912563324, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.21971380114555358, "loss/reg": 0.0, "step": 32760 }, { "epoch": 0.2155921052631579, "grad_norm": 2.625, "grad_norm_var": 0.20466206868489584, "learning_rate": 0.0001, "loss": 2.963, "loss/crossentropy": 2.262472677230835, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.20472683310508727, "loss/reg": 0.0, "step": 32770 }, { "epoch": 0.2156578947368421, "grad_norm": 2.34375, "grad_norm_var": 0.25344009399414064, "learning_rate": 0.0001, "loss": 3.0138, "loss/crossentropy": 2.3205514192581176, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.23903629779815674, "loss/reg": 0.0, "step": 32780 }, { "epoch": 0.21572368421052632, "grad_norm": 3.09375, "grad_norm_var": 2.3024370829264322, "learning_rate": 0.0001, "loss": 3.0763, "loss/crossentropy": 2.315890896320343, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.24931414425373077, "loss/reg": 0.0, "step": 32790 }, { "epoch": 0.21578947368421053, "grad_norm": 2.53125, "grad_norm_var": 0.34871317545572916, "learning_rate": 0.0001, "loss": 3.0259, "loss/crossentropy": 2.2797498226165773, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.20434827357530594, "loss/reg": 0.0, "step": 32800 }, { "epoch": 0.21585526315789474, "grad_norm": 2.203125, "grad_norm_var": 0.021117146809895834, "learning_rate": 0.0001, "loss": 2.9999, "loss/crossentropy": 2.095989489555359, "loss/hidden": 3.10625, "loss/incoh": 0.0, "loss/logits": 0.2900803714990616, "loss/reg": 0.0, "step": 32810 }, { "epoch": 0.21592105263157896, "grad_norm": 2.453125, "grad_norm_var": 0.06117121378580729, "learning_rate": 0.0001, "loss": 3.03, "loss/crossentropy": 2.325262129306793, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.28796064853668213, "loss/reg": 0.0, "step": 32820 }, { "epoch": 0.21598684210526317, "grad_norm": 4.96875, "grad_norm_var": 0.48582356770833335, "learning_rate": 0.0001, "loss": 2.9917, "loss/crossentropy": 2.521669340133667, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.2139764204621315, "loss/reg": 0.0, "step": 32830 }, { "epoch": 0.21605263157894736, "grad_norm": 2.296875, "grad_norm_var": 0.48948567708333335, "learning_rate": 0.0001, "loss": 2.9862, "loss/crossentropy": 2.479197156429291, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.22702316492795943, "loss/reg": 0.0, "step": 32840 }, { "epoch": 0.21611842105263157, "grad_norm": 2.71875, "grad_norm_var": 0.04804280598958333, "learning_rate": 0.0001, "loss": 2.9695, "loss/crossentropy": 1.8314170956611633, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.19034175872802733, "loss/reg": 0.0, "step": 32850 }, { "epoch": 0.21618421052631578, "grad_norm": 3.15625, "grad_norm_var": 0.12819798787434897, "learning_rate": 0.0001, "loss": 2.9729, "loss/crossentropy": 2.3301822662353517, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.23585740774869918, "loss/reg": 0.0, "step": 32860 }, { "epoch": 0.21625, "grad_norm": 2.53125, "grad_norm_var": 3.20617589974401e+17, "learning_rate": 0.0001, "loss": 3.0646, "loss/crossentropy": 2.3723347544670106, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.22372013479471206, "loss/reg": 0.0, "step": 32870 }, { "epoch": 0.2163157894736842, "grad_norm": 2.484375, "grad_norm_var": 3.2061758995523174e+17, "learning_rate": 0.0001, "loss": 3.0732, "loss/crossentropy": 2.337008368968964, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.24527821093797683, "loss/reg": 0.0, "step": 32880 }, { "epoch": 0.21638157894736842, "grad_norm": 2.5625, "grad_norm_var": 0.18332926432291666, "learning_rate": 0.0001, "loss": 3.029, "loss/crossentropy": 2.056360971927643, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.20862108021974562, "loss/reg": 0.0, "step": 32890 }, { "epoch": 0.21644736842105264, "grad_norm": 2.359375, "grad_norm_var": 0.2578277587890625, "learning_rate": 0.0001, "loss": 3.0287, "loss/crossentropy": 2.425034213066101, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.22822874337434768, "loss/reg": 0.0, "step": 32900 }, { "epoch": 0.21651315789473685, "grad_norm": 2.734375, "grad_norm_var": 0.1828125, "learning_rate": 0.0001, "loss": 3.046, "loss/crossentropy": 2.0453652262687685, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.2606233850121498, "loss/reg": 0.0, "step": 32910 }, { "epoch": 0.21657894736842107, "grad_norm": 2.453125, "grad_norm_var": 0.1352203369140625, "learning_rate": 0.0001, "loss": 2.992, "loss/crossentropy": 2.245816957950592, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.2316955327987671, "loss/reg": 0.0, "step": 32920 }, { "epoch": 0.21664473684210525, "grad_norm": 2.359375, "grad_norm_var": 0.11937255859375, "learning_rate": 0.0001, "loss": 2.8992, "loss/crossentropy": 2.3973164916038514, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.21730958074331283, "loss/reg": 0.0, "step": 32930 }, { "epoch": 0.21671052631578946, "grad_norm": 2.265625, "grad_norm_var": 0.16926167805989584, "learning_rate": 0.0001, "loss": 3.076, "loss/crossentropy": 2.3131038188934325, "loss/hidden": 3.0328125, "loss/incoh": 0.0, "loss/logits": 0.2888943269848824, "loss/reg": 0.0, "step": 32940 }, { "epoch": 0.21677631578947368, "grad_norm": 2.640625, "grad_norm_var": 0.1410552978515625, "learning_rate": 0.0001, "loss": 3.0965, "loss/crossentropy": 2.441923999786377, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.2226712241768837, "loss/reg": 0.0, "step": 32950 }, { "epoch": 0.2168421052631579, "grad_norm": 5.6875, "grad_norm_var": 0.6874338785807291, "learning_rate": 0.0001, "loss": 3.0441, "loss/crossentropy": 2.0053435921669007, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.20919274985790254, "loss/reg": 0.0, "step": 32960 }, { "epoch": 0.2169078947368421, "grad_norm": 2.03125, "grad_norm_var": 0.7307291666666667, "learning_rate": 0.0001, "loss": 3.0422, "loss/crossentropy": 2.2273124933242796, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2289419651031494, "loss/reg": 0.0, "step": 32970 }, { "epoch": 0.21697368421052632, "grad_norm": 2.328125, "grad_norm_var": 0.0592681884765625, "learning_rate": 0.0001, "loss": 2.972, "loss/crossentropy": 2.315923011302948, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.19858411103487014, "loss/reg": 0.0, "step": 32980 }, { "epoch": 0.21703947368421053, "grad_norm": 2.625, "grad_norm_var": 0.06795145670572916, "learning_rate": 0.0001, "loss": 3.1097, "loss/crossentropy": 2.0356208801269533, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.18648817762732506, "loss/reg": 0.0, "step": 32990 }, { "epoch": 0.21710526315789475, "grad_norm": 2.25, "grad_norm_var": 0.087548828125, "learning_rate": 0.0001, "loss": 3.1037, "loss/crossentropy": 2.4323093771934508, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.2574485644698143, "loss/reg": 0.0, "step": 33000 }, { "epoch": 0.21717105263157896, "grad_norm": 2.375, "grad_norm_var": 0.08557535807291666, "learning_rate": 0.0001, "loss": 2.9955, "loss/crossentropy": 2.2170889139175416, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.23443114012479782, "loss/reg": 0.0, "step": 33010 }, { "epoch": 0.21723684210526314, "grad_norm": 2.828125, "grad_norm_var": 0.0577056884765625, "learning_rate": 0.0001, "loss": 3.0261, "loss/crossentropy": 2.2761878967285156, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.23024188429117204, "loss/reg": 0.0, "step": 33020 }, { "epoch": 0.21730263157894736, "grad_norm": 2.5, "grad_norm_var": 0.041844685872395836, "learning_rate": 0.0001, "loss": 3.0025, "loss/crossentropy": 2.281536602973938, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2611322790384293, "loss/reg": 0.0, "step": 33030 }, { "epoch": 0.21736842105263157, "grad_norm": 2.34375, "grad_norm_var": 2.2070302327473956, "learning_rate": 0.0001, "loss": 3.0541, "loss/crossentropy": 2.285531198978424, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.27087974548339844, "loss/reg": 0.0, "step": 33040 }, { "epoch": 0.21743421052631579, "grad_norm": 2.1875, "grad_norm_var": 2.3124013264973957, "learning_rate": 0.0001, "loss": 3.0046, "loss/crossentropy": 2.1499021500349045, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.2288394898176193, "loss/reg": 0.0, "step": 33050 }, { "epoch": 0.2175, "grad_norm": 2.375, "grad_norm_var": 0.185888671875, "learning_rate": 0.0001, "loss": 3.01, "loss/crossentropy": 2.2262683510780334, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.21322729140520097, "loss/reg": 0.0, "step": 33060 }, { "epoch": 0.2175657894736842, "grad_norm": 2.078125, "grad_norm_var": 0.19086278279622396, "learning_rate": 0.0001, "loss": 2.9547, "loss/crossentropy": 2.512454855442047, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.22262426763772963, "loss/reg": 0.0, "step": 33070 }, { "epoch": 0.21763157894736843, "grad_norm": 2.296875, "grad_norm_var": 0.06533584594726563, "learning_rate": 0.0001, "loss": 3.0191, "loss/crossentropy": 2.4302139401435854, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.21147249341011048, "loss/reg": 0.0, "step": 33080 }, { "epoch": 0.21769736842105264, "grad_norm": 2.265625, "grad_norm_var": 0.05804036458333333, "learning_rate": 0.0001, "loss": 2.9211, "loss/crossentropy": 2.288132381439209, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.20329886302351952, "loss/reg": 0.0, "step": 33090 }, { "epoch": 0.21776315789473685, "grad_norm": 2.09375, "grad_norm_var": 0.07825419108072916, "learning_rate": 0.0001, "loss": 2.9798, "loss/crossentropy": 2.4942794919013975, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.22770892977714538, "loss/reg": 0.0, "step": 33100 }, { "epoch": 0.21782894736842107, "grad_norm": 2.078125, "grad_norm_var": 0.46223551432291665, "learning_rate": 0.0001, "loss": 2.998, "loss/crossentropy": 2.496278202533722, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.23407908529043198, "loss/reg": 0.0, "step": 33110 }, { "epoch": 0.21789473684210525, "grad_norm": 2.546875, "grad_norm_var": 0.1009674072265625, "learning_rate": 0.0001, "loss": 3.0352, "loss/crossentropy": 2.0836499094963075, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.22313288152217864, "loss/reg": 0.0, "step": 33120 }, { "epoch": 0.21796052631578947, "grad_norm": 2.1875, "grad_norm_var": 0.0770660400390625, "learning_rate": 0.0001, "loss": 2.982, "loss/crossentropy": 2.413359189033508, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.23794451206922532, "loss/reg": 0.0, "step": 33130 }, { "epoch": 0.21802631578947368, "grad_norm": 2.4375, "grad_norm_var": 0.17366536458333334, "learning_rate": 0.0001, "loss": 2.9588, "loss/crossentropy": 2.390314495563507, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.20836979001760483, "loss/reg": 0.0, "step": 33140 }, { "epoch": 0.2180921052631579, "grad_norm": 2.40625, "grad_norm_var": 0.1214263916015625, "learning_rate": 0.0001, "loss": 3.0312, "loss/crossentropy": 2.1217519342899323, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.22108341604471207, "loss/reg": 0.0, "step": 33150 }, { "epoch": 0.2181578947368421, "grad_norm": 2.359375, "grad_norm_var": 0.023981730143229168, "learning_rate": 0.0001, "loss": 2.9862, "loss/crossentropy": 2.3113851308822633, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.27378889471292495, "loss/reg": 0.0, "step": 33160 }, { "epoch": 0.21822368421052632, "grad_norm": 2.921875, "grad_norm_var": 0.07545166015625, "learning_rate": 0.0001, "loss": 3.0085, "loss/crossentropy": 2.366367816925049, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.20626646727323533, "loss/reg": 0.0, "step": 33170 }, { "epoch": 0.21828947368421053, "grad_norm": 2.734375, "grad_norm_var": 0.08708394368489583, "learning_rate": 0.0001, "loss": 2.9855, "loss/crossentropy": 2.2448646426200867, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2878586873412132, "loss/reg": 0.0, "step": 33180 }, { "epoch": 0.21835526315789475, "grad_norm": 2.15625, "grad_norm_var": 0.0867828369140625, "learning_rate": 0.0001, "loss": 2.9798, "loss/crossentropy": 2.2996390104293822, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.22233397513628006, "loss/reg": 0.0, "step": 33190 }, { "epoch": 0.21842105263157896, "grad_norm": 2.6875, "grad_norm_var": 0.12769266764322917, "learning_rate": 0.0001, "loss": 3.0705, "loss/crossentropy": 2.22303249835968, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.32309592962265016, "loss/reg": 0.0, "step": 33200 }, { "epoch": 0.21848684210526315, "grad_norm": 2.28125, "grad_norm_var": 0.07171122233072917, "learning_rate": 0.0001, "loss": 3.0575, "loss/crossentropy": 2.455464780330658, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2447066068649292, "loss/reg": 0.0, "step": 33210 }, { "epoch": 0.21855263157894736, "grad_norm": 2.421875, "grad_norm_var": 0.13964436848958334, "learning_rate": 0.0001, "loss": 3.0108, "loss/crossentropy": 2.4379093527793883, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.20427114516496658, "loss/reg": 0.0, "step": 33220 }, { "epoch": 0.21861842105263157, "grad_norm": 2.53125, "grad_norm_var": 0.14078776041666666, "learning_rate": 0.0001, "loss": 3.0186, "loss/crossentropy": 2.3393636345863342, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.24139494746923446, "loss/reg": 0.0, "step": 33230 }, { "epoch": 0.2186842105263158, "grad_norm": 2.203125, "grad_norm_var": 0.017975870768229166, "learning_rate": 0.0001, "loss": 3.0254, "loss/crossentropy": 2.313693457841873, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.25786808133125305, "loss/reg": 0.0, "step": 33240 }, { "epoch": 0.21875, "grad_norm": 2.28125, "grad_norm_var": 0.048111979166666666, "learning_rate": 0.0001, "loss": 2.9975, "loss/crossentropy": 2.3757463097572327, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.21639765352010726, "loss/reg": 0.0, "step": 33250 }, { "epoch": 0.2188157894736842, "grad_norm": 2.359375, "grad_norm_var": 0.07258707682291667, "learning_rate": 0.0001, "loss": 2.9429, "loss/crossentropy": 2.540412497520447, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.1985073819756508, "loss/reg": 0.0, "step": 33260 }, { "epoch": 0.21888157894736843, "grad_norm": 2.53125, "grad_norm_var": 0.03837788899739583, "learning_rate": 0.0001, "loss": 3.0685, "loss/crossentropy": 2.373577296733856, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.2263040378689766, "loss/reg": 0.0, "step": 33270 }, { "epoch": 0.21894736842105264, "grad_norm": 2.390625, "grad_norm_var": 0.029686482747395833, "learning_rate": 0.0001, "loss": 2.9608, "loss/crossentropy": 2.2283737421035767, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23245082497596742, "loss/reg": 0.0, "step": 33280 }, { "epoch": 0.21901315789473685, "grad_norm": 2.328125, "grad_norm_var": 0.09920145670572916, "learning_rate": 0.0001, "loss": 3.0048, "loss/crossentropy": 2.2985653638839723, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.27912328094244004, "loss/reg": 0.0, "step": 33290 }, { "epoch": 0.21907894736842104, "grad_norm": 2.109375, "grad_norm_var": 0.10875651041666666, "learning_rate": 0.0001, "loss": 3.005, "loss/crossentropy": 2.342892110347748, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.2504061296582222, "loss/reg": 0.0, "step": 33300 }, { "epoch": 0.21914473684210525, "grad_norm": 2.546875, "grad_norm_var": 0.04578348795572917, "learning_rate": 0.0001, "loss": 3.0108, "loss/crossentropy": 2.337607777118683, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.20644718632102013, "loss/reg": 0.0, "step": 33310 }, { "epoch": 0.21921052631578947, "grad_norm": 2.140625, "grad_norm_var": 0.08603108723958333, "learning_rate": 0.0001, "loss": 2.9696, "loss/crossentropy": 2.080331861972809, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.27878897339105607, "loss/reg": 0.0, "step": 33320 }, { "epoch": 0.21927631578947368, "grad_norm": 1.96875, "grad_norm_var": 0.17981363932291666, "learning_rate": 0.0001, "loss": 3.0076, "loss/crossentropy": 2.3555988073349, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.24901487827301025, "loss/reg": 0.0, "step": 33330 }, { "epoch": 0.2193421052631579, "grad_norm": 2.359375, "grad_norm_var": 0.17130533854166666, "learning_rate": 0.0001, "loss": 3.0368, "loss/crossentropy": 2.223631227016449, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.19968705028295516, "loss/reg": 0.0, "step": 33340 }, { "epoch": 0.2194078947368421, "grad_norm": 2.15625, "grad_norm_var": 0.22228190104166667, "learning_rate": 0.0001, "loss": 3.0191, "loss/crossentropy": 2.108024871349335, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.20175025016069412, "loss/reg": 0.0, "step": 33350 }, { "epoch": 0.21947368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.2382720947265625, "learning_rate": 0.0001, "loss": 3.0196, "loss/crossentropy": 2.2824084401130675, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.20648818761110305, "loss/reg": 0.0, "step": 33360 }, { "epoch": 0.21953947368421053, "grad_norm": 2.234375, "grad_norm_var": 0.06386311848958333, "learning_rate": 0.0001, "loss": 2.983, "loss/crossentropy": 2.193799388408661, "loss/hidden": 2.6, "loss/incoh": 0.0, "loss/logits": 0.20398449748754502, "loss/reg": 0.0, "step": 33370 }, { "epoch": 0.21960526315789475, "grad_norm": 2.84375, "grad_norm_var": 0.21061375935872395, "learning_rate": 0.0001, "loss": 3.0511, "loss/crossentropy": 2.4952160239219667, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.21543749868869783, "loss/reg": 0.0, "step": 33380 }, { "epoch": 0.21967105263157893, "grad_norm": 2.515625, "grad_norm_var": 0.6498410542805989, "learning_rate": 0.0001, "loss": 3.0987, "loss/crossentropy": 2.4116694569587707, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2724195793271065, "loss/reg": 0.0, "step": 33390 }, { "epoch": 0.21973684210526315, "grad_norm": 2.0625, "grad_norm_var": 0.23567072550455728, "learning_rate": 0.0001, "loss": 2.9416, "loss/crossentropy": 2.597557008266449, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.2337816461920738, "loss/reg": 0.0, "step": 33400 }, { "epoch": 0.21980263157894736, "grad_norm": 2.109375, "grad_norm_var": 0.03090387980143229, "learning_rate": 0.0001, "loss": 2.9881, "loss/crossentropy": 2.446978306770325, "loss/hidden": 2.609375, "loss/incoh": 0.0, "loss/logits": 0.2176991730928421, "loss/reg": 0.0, "step": 33410 }, { "epoch": 0.21986842105263157, "grad_norm": 2.140625, "grad_norm_var": 0.0204742431640625, "learning_rate": 0.0001, "loss": 3.0409, "loss/crossentropy": 2.0214170694351195, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.23912209570407866, "loss/reg": 0.0, "step": 33420 }, { "epoch": 0.2199342105263158, "grad_norm": 2.234375, "grad_norm_var": 0.01568603515625, "learning_rate": 0.0001, "loss": 3.042, "loss/crossentropy": 2.4408559799194336, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.23146803677082062, "loss/reg": 0.0, "step": 33430 }, { "epoch": 0.22, "grad_norm": 2.71875, "grad_norm_var": 0.05896809895833333, "learning_rate": 0.0001, "loss": 3.0667, "loss/crossentropy": 2.699678826332092, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.250095309317112, "loss/reg": 0.0, "step": 33440 }, { "epoch": 0.22006578947368421, "grad_norm": 2.453125, "grad_norm_var": 0.14205322265625, "learning_rate": 0.0001, "loss": 3.0448, "loss/crossentropy": 2.2925549387931823, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.24470344185829163, "loss/reg": 0.0, "step": 33450 }, { "epoch": 0.22013157894736843, "grad_norm": 2.25, "grad_norm_var": 3.588060506184896, "learning_rate": 0.0001, "loss": 3.1012, "loss/crossentropy": 2.3695211172103883, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.24156490415334703, "loss/reg": 0.0, "step": 33460 }, { "epoch": 0.22019736842105264, "grad_norm": 2382364672.0, "grad_norm_var": 3.5472883849629094e+17, "learning_rate": 0.0001, "loss": 3.1654, "loss/crossentropy": 2.488659930229187, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.21741390377283096, "loss/reg": 0.0, "step": 33470 }, { "epoch": 0.22026315789473686, "grad_norm": 2.15625, "grad_norm_var": 3.547288386504622e+17, "learning_rate": 0.0001, "loss": 3.0377, "loss/crossentropy": 2.2884308457374574, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.231295208632946, "loss/reg": 0.0, "step": 33480 }, { "epoch": 0.22032894736842104, "grad_norm": 2.421875, "grad_norm_var": 0.049658203125, "learning_rate": 0.0001, "loss": 3.06, "loss/crossentropy": 2.163191545009613, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.22781991362571716, "loss/reg": 0.0, "step": 33490 }, { "epoch": 0.22039473684210525, "grad_norm": 2.328125, "grad_norm_var": 0.08287353515625, "learning_rate": 0.0001, "loss": 2.9897, "loss/crossentropy": 2.2403807282447814, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.23052917122840882, "loss/reg": 0.0, "step": 33500 }, { "epoch": 0.22046052631578947, "grad_norm": 2.328125, "grad_norm_var": 0.4391029357910156, "learning_rate": 0.0001, "loss": 2.9359, "loss/crossentropy": 2.215052628517151, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.17493754550814627, "loss/reg": 0.0, "step": 33510 }, { "epoch": 0.22052631578947368, "grad_norm": 2.03125, "grad_norm_var": 1.1144650777180989, "learning_rate": 0.0001, "loss": 3.0608, "loss/crossentropy": 2.0380016922950746, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.2704147264361382, "loss/reg": 0.0, "step": 33520 }, { "epoch": 0.2205921052631579, "grad_norm": 2.109375, "grad_norm_var": 1.154686482747396, "learning_rate": 0.0001, "loss": 3.0527, "loss/crossentropy": 2.5095550060272216, "loss/hidden": 2.9453125, "loss/incoh": 0.0, "loss/logits": 0.2915314584970474, "loss/reg": 0.0, "step": 33530 }, { "epoch": 0.2206578947368421, "grad_norm": 2.21875, "grad_norm_var": 0.4706939697265625, "learning_rate": 0.0001, "loss": 2.9389, "loss/crossentropy": 2.175918400287628, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.19029978811740875, "loss/reg": 0.0, "step": 33540 }, { "epoch": 0.22072368421052632, "grad_norm": 2.296875, "grad_norm_var": 0.13570556640625, "learning_rate": 0.0001, "loss": 3.1049, "loss/crossentropy": 2.1491673469543455, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.28144592940807345, "loss/reg": 0.0, "step": 33550 }, { "epoch": 0.22078947368421054, "grad_norm": 2.265625, "grad_norm_var": 0.20730692545572918, "learning_rate": 0.0001, "loss": 3.0324, "loss/crossentropy": 2.10293396115303, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.20049710869789122, "loss/reg": 0.0, "step": 33560 }, { "epoch": 0.22085526315789475, "grad_norm": 2.40625, "grad_norm_var": 0.20046284993489583, "learning_rate": 0.0001, "loss": 3.082, "loss/crossentropy": 2.4584061741828918, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.235392826795578, "loss/reg": 0.0, "step": 33570 }, { "epoch": 0.22092105263157893, "grad_norm": 2.265625, "grad_norm_var": 0.0829742431640625, "learning_rate": 0.0001, "loss": 3.0276, "loss/crossentropy": 2.107621490955353, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.2272868797183037, "loss/reg": 0.0, "step": 33580 }, { "epoch": 0.22098684210526315, "grad_norm": 2.484375, "grad_norm_var": 0.25580952962239584, "learning_rate": 0.0001, "loss": 3.0614, "loss/crossentropy": 2.120447838306427, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.19922292605042458, "loss/reg": 0.0, "step": 33590 }, { "epoch": 0.22105263157894736, "grad_norm": 2.078125, "grad_norm_var": 0.027180989583333332, "learning_rate": 0.0001, "loss": 2.9857, "loss/crossentropy": 2.192478084564209, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2926119461655617, "loss/reg": 0.0, "step": 33600 }, { "epoch": 0.22111842105263158, "grad_norm": 2.09375, "grad_norm_var": 0.07001851399739584, "learning_rate": 0.0001, "loss": 3.0663, "loss/crossentropy": 2.4031628251075743, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.21948793083429335, "loss/reg": 0.0, "step": 33610 }, { "epoch": 0.2211842105263158, "grad_norm": 3.671875, "grad_norm_var": 0.183837890625, "learning_rate": 0.0001, "loss": 3.0731, "loss/crossentropy": 2.2898489594459535, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2655272573232651, "loss/reg": 0.0, "step": 33620 }, { "epoch": 0.22125, "grad_norm": 2.25, "grad_norm_var": 1.4440388997395834, "learning_rate": 0.0001, "loss": 3.0888, "loss/crossentropy": 2.334256184101105, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.20996386408805848, "loss/reg": 0.0, "step": 33630 }, { "epoch": 0.22131578947368422, "grad_norm": 2.375, "grad_norm_var": 0.04494527180989583, "learning_rate": 0.0001, "loss": 3.0546, "loss/crossentropy": 2.3563809156417848, "loss/hidden": 3.0484375, "loss/incoh": 0.0, "loss/logits": 0.29970877766609194, "loss/reg": 0.0, "step": 33640 }, { "epoch": 0.22138157894736843, "grad_norm": 2.28125, "grad_norm_var": 0.040283203125, "learning_rate": 0.0001, "loss": 3.0316, "loss/crossentropy": 2.469383955001831, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.22515997290611267, "loss/reg": 0.0, "step": 33650 }, { "epoch": 0.22144736842105264, "grad_norm": 2.21875, "grad_norm_var": 0.0654449462890625, "learning_rate": 0.0001, "loss": 3.1037, "loss/crossentropy": 2.4161542892456054, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.22671497017145156, "loss/reg": 0.0, "step": 33660 }, { "epoch": 0.22151315789473683, "grad_norm": 3.953125, "grad_norm_var": 0.27838134765625, "learning_rate": 0.0001, "loss": 3.0341, "loss/crossentropy": 2.415839672088623, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2382939413189888, "loss/reg": 0.0, "step": 33670 }, { "epoch": 0.22157894736842104, "grad_norm": 2.203125, "grad_norm_var": 0.9553260803222656, "learning_rate": 0.0001, "loss": 3.0584, "loss/crossentropy": 2.353093123435974, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.276655937731266, "loss/reg": 0.0, "step": 33680 }, { "epoch": 0.22164473684210526, "grad_norm": 2.421875, "grad_norm_var": 0.8398048400878906, "learning_rate": 0.0001, "loss": 3.0327, "loss/crossentropy": 2.2812642812728883, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.20394409000873565, "loss/reg": 0.0, "step": 33690 }, { "epoch": 0.22171052631578947, "grad_norm": 1.859375, "grad_norm_var": 0.10286051432291667, "learning_rate": 0.0001, "loss": 3.0121, "loss/crossentropy": 2.2089013338088987, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.2560999572277069, "loss/reg": 0.0, "step": 33700 }, { "epoch": 0.22177631578947368, "grad_norm": 2.453125, "grad_norm_var": 0.224267578125, "learning_rate": 0.0001, "loss": 2.9909, "loss/crossentropy": 2.579922378063202, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.2395647093653679, "loss/reg": 0.0, "step": 33710 }, { "epoch": 0.2218421052631579, "grad_norm": 2.28125, "grad_norm_var": 0.2459307352701823, "learning_rate": 0.0001, "loss": 2.9947, "loss/crossentropy": 2.269551432132721, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.22979858070611953, "loss/reg": 0.0, "step": 33720 }, { "epoch": 0.2219078947368421, "grad_norm": 2.0, "grad_norm_var": 0.0810455322265625, "learning_rate": 0.0001, "loss": 3.0223, "loss/crossentropy": 2.5746334075927733, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.237695574760437, "loss/reg": 0.0, "step": 33730 }, { "epoch": 0.22197368421052632, "grad_norm": 2.5, "grad_norm_var": 0.0932281494140625, "learning_rate": 0.0001, "loss": 3.0095, "loss/crossentropy": 2.111787271499634, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.21114924997091294, "loss/reg": 0.0, "step": 33740 }, { "epoch": 0.22203947368421054, "grad_norm": 2.390625, "grad_norm_var": 0.1626129150390625, "learning_rate": 0.0001, "loss": 3.0218, "loss/crossentropy": 2.392720115184784, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.19515267461538316, "loss/reg": 0.0, "step": 33750 }, { "epoch": 0.22210526315789475, "grad_norm": 2.09375, "grad_norm_var": 0.049347941080729166, "learning_rate": 0.0001, "loss": 3.0357, "loss/crossentropy": 2.4739153146743775, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.3125626042485237, "loss/reg": 0.0, "step": 33760 }, { "epoch": 0.22217105263157894, "grad_norm": 1.9453125, "grad_norm_var": 0.1904070536295573, "learning_rate": 0.0001, "loss": 2.974, "loss/crossentropy": 2.290574276447296, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.23448731899261474, "loss/reg": 0.0, "step": 33770 }, { "epoch": 0.22223684210526315, "grad_norm": 2.125, "grad_norm_var": 0.13733495076497396, "learning_rate": 0.0001, "loss": 2.9673, "loss/crossentropy": 2.317961239814758, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.21905806362628938, "loss/reg": 0.0, "step": 33780 }, { "epoch": 0.22230263157894736, "grad_norm": 2.390625, "grad_norm_var": 0.16253255208333334, "learning_rate": 0.0001, "loss": 3.0608, "loss/crossentropy": 2.170513927936554, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.20969858914613723, "loss/reg": 0.0, "step": 33790 }, { "epoch": 0.22236842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.20142822265625, "learning_rate": 0.0001, "loss": 3.0502, "loss/crossentropy": 2.3547741293907167, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.21735511124134063, "loss/reg": 0.0, "step": 33800 }, { "epoch": 0.2224342105263158, "grad_norm": 2.53125, "grad_norm_var": 0.62681884765625, "learning_rate": 0.0001, "loss": 2.9739, "loss/crossentropy": 2.3395539283752442, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.20967817306518555, "loss/reg": 0.0, "step": 33810 }, { "epoch": 0.2225, "grad_norm": 2.09375, "grad_norm_var": 0.0564117431640625, "learning_rate": 0.0001, "loss": 2.9871, "loss/crossentropy": 2.294986367225647, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.2147073432803154, "loss/reg": 0.0, "step": 33820 }, { "epoch": 0.22256578947368422, "grad_norm": 2.09375, "grad_norm_var": 0.07506103515625, "learning_rate": 0.0001, "loss": 2.9679, "loss/crossentropy": 2.257847762107849, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.30166392475366594, "loss/reg": 0.0, "step": 33830 }, { "epoch": 0.22263157894736843, "grad_norm": 3.40625, "grad_norm_var": 0.17619527180989583, "learning_rate": 0.0001, "loss": 2.9993, "loss/crossentropy": 2.2673079133033753, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.21112384274601936, "loss/reg": 0.0, "step": 33840 }, { "epoch": 0.22269736842105264, "grad_norm": 5.0, "grad_norm_var": 0.5702311197916666, "learning_rate": 0.0001, "loss": 3.0953, "loss/crossentropy": 2.632491409778595, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.23320973068475723, "loss/reg": 0.0, "step": 33850 }, { "epoch": 0.22276315789473683, "grad_norm": 2.375, "grad_norm_var": 0.7403310139973959, "learning_rate": 0.0001, "loss": 3.0625, "loss/crossentropy": 2.331642270088196, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.23803979456424712, "loss/reg": 0.0, "step": 33860 }, { "epoch": 0.22282894736842104, "grad_norm": 2.21875, "grad_norm_var": 0.041890462239583336, "learning_rate": 0.0001, "loss": 3.034, "loss/crossentropy": 2.2647625207901, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.20834072977304458, "loss/reg": 0.0, "step": 33870 }, { "epoch": 0.22289473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.0492095947265625, "learning_rate": 0.0001, "loss": 3.0249, "loss/crossentropy": 2.259378707408905, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.21118512228131295, "loss/reg": 0.0, "step": 33880 }, { "epoch": 0.22296052631578947, "grad_norm": 2.875, "grad_norm_var": 0.10131810506184896, "learning_rate": 0.0001, "loss": 3.0156, "loss/crossentropy": 2.31290363073349, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.2051762267947197, "loss/reg": 0.0, "step": 33890 }, { "epoch": 0.22302631578947368, "grad_norm": 2.15625, "grad_norm_var": 0.11342544555664062, "learning_rate": 0.0001, "loss": 3.0365, "loss/crossentropy": 1.8784321069717407, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.21909460723400115, "loss/reg": 0.0, "step": 33900 }, { "epoch": 0.2230921052631579, "grad_norm": 2.34375, "grad_norm_var": 0.05671284993489583, "learning_rate": 0.0001, "loss": 3.016, "loss/crossentropy": 2.3022819638252257, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.22228346467018129, "loss/reg": 0.0, "step": 33910 }, { "epoch": 0.2231578947368421, "grad_norm": 2.296875, "grad_norm_var": 0.23657938639322917, "learning_rate": 0.0001, "loss": 3.0624, "loss/crossentropy": 2.335461437702179, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.23024386763572693, "loss/reg": 0.0, "step": 33920 }, { "epoch": 0.22322368421052632, "grad_norm": 2.40625, "grad_norm_var": 0.18298238118489582, "learning_rate": 0.0001, "loss": 3.1342, "loss/crossentropy": 2.149783802032471, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.24487016648054122, "loss/reg": 0.0, "step": 33930 }, { "epoch": 0.22328947368421054, "grad_norm": 2.625, "grad_norm_var": 0.07650731404622396, "learning_rate": 0.0001, "loss": 3.0341, "loss/crossentropy": 2.248632514476776, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.22311466187238693, "loss/reg": 0.0, "step": 33940 }, { "epoch": 0.22335526315789472, "grad_norm": 2.140625, "grad_norm_var": 0.0645416259765625, "learning_rate": 0.0001, "loss": 3.1104, "loss/crossentropy": 2.367274534702301, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.2507220461964607, "loss/reg": 0.0, "step": 33950 }, { "epoch": 0.22342105263157894, "grad_norm": 2.90625, "grad_norm_var": 0.05906473795572917, "learning_rate": 0.0001, "loss": 3.0713, "loss/crossentropy": 2.3802406549453736, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.23559605777263642, "loss/reg": 0.0, "step": 33960 }, { "epoch": 0.22348684210526315, "grad_norm": 2.5625, "grad_norm_var": 0.045979817708333336, "learning_rate": 0.0001, "loss": 3.0398, "loss/crossentropy": 2.283581781387329, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.22127386182546616, "loss/reg": 0.0, "step": 33970 }, { "epoch": 0.22355263157894736, "grad_norm": 2.34375, "grad_norm_var": 0.0328277587890625, "learning_rate": 0.0001, "loss": 3.0036, "loss/crossentropy": 2.2841804146766664, "loss/hidden": 2.9328125, "loss/incoh": 0.0, "loss/logits": 0.29590369313955306, "loss/reg": 0.0, "step": 33980 }, { "epoch": 0.22361842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.0925201416015625, "learning_rate": 0.0001, "loss": 3.0846, "loss/crossentropy": 2.1158400774002075, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.21308634355664252, "loss/reg": 0.0, "step": 33990 }, { "epoch": 0.2236842105263158, "grad_norm": 2.203125, "grad_norm_var": 0.1506744384765625, "learning_rate": 0.0001, "loss": 3.081, "loss/crossentropy": 2.3101623356342316, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.22568342536687852, "loss/reg": 0.0, "step": 34000 }, { "epoch": 0.22375, "grad_norm": 3.28125, "grad_norm_var": 0.21297200520833334, "learning_rate": 0.0001, "loss": 3.0695, "loss/crossentropy": 2.493696868419647, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.32419712096452713, "loss/reg": 0.0, "step": 34010 }, { "epoch": 0.22381578947368422, "grad_norm": 2.453125, "grad_norm_var": 0.470361328125, "learning_rate": 0.0001, "loss": 3.0831, "loss/crossentropy": 2.181097960472107, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.21221623122692107, "loss/reg": 0.0, "step": 34020 }, { "epoch": 0.22388157894736843, "grad_norm": 2.28125, "grad_norm_var": 0.4900716145833333, "learning_rate": 0.0001, "loss": 2.9904, "loss/crossentropy": 2.2486413717269897, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.2214418590068817, "loss/reg": 0.0, "step": 34030 }, { "epoch": 0.22394736842105264, "grad_norm": 2.4375, "grad_norm_var": 0.026363118489583334, "learning_rate": 0.0001, "loss": 3.0049, "loss/crossentropy": 2.058054494857788, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.20110160410404204, "loss/reg": 0.0, "step": 34040 }, { "epoch": 0.22401315789473683, "grad_norm": 2.109375, "grad_norm_var": 0.8867828369140625, "learning_rate": 0.0001, "loss": 3.0557, "loss/crossentropy": 2.183249998092651, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.3117926768958569, "loss/reg": 0.0, "step": 34050 }, { "epoch": 0.22407894736842104, "grad_norm": 2.625, "grad_norm_var": 0.73414306640625, "learning_rate": 0.0001, "loss": 3.0466, "loss/crossentropy": 2.2317178010940553, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.23523171246051788, "loss/reg": 0.0, "step": 34060 }, { "epoch": 0.22414473684210526, "grad_norm": 2.078125, "grad_norm_var": 0.11988525390625, "learning_rate": 0.0001, "loss": 3.0361, "loss/crossentropy": 2.1958236932754516, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.2175719380378723, "loss/reg": 0.0, "step": 34070 }, { "epoch": 0.22421052631578947, "grad_norm": 2.578125, "grad_norm_var": 0.08212483723958333, "learning_rate": 0.0001, "loss": 3.0053, "loss/crossentropy": 2.3172508239746095, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.21442833691835403, "loss/reg": 0.0, "step": 34080 }, { "epoch": 0.22427631578947368, "grad_norm": 2.03125, "grad_norm_var": 0.13967666625976563, "learning_rate": 0.0001, "loss": 2.9913, "loss/crossentropy": 2.244894874095917, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.19675317481160165, "loss/reg": 0.0, "step": 34090 }, { "epoch": 0.2243421052631579, "grad_norm": 2.375, "grad_norm_var": 0.16440200805664062, "learning_rate": 0.0001, "loss": 3.046, "loss/crossentropy": 2.256273639202118, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.22018496468663215, "loss/reg": 0.0, "step": 34100 }, { "epoch": 0.2244078947368421, "grad_norm": 2.046875, "grad_norm_var": 0.15823160807291667, "learning_rate": 0.0001, "loss": 2.9706, "loss/crossentropy": 2.450822639465332, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.31932070925831796, "loss/reg": 0.0, "step": 34110 }, { "epoch": 0.22447368421052633, "grad_norm": 2.484375, "grad_norm_var": 0.212939453125, "learning_rate": 0.0001, "loss": 3.0805, "loss/crossentropy": 2.363582265377045, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.24354881346225737, "loss/reg": 0.0, "step": 34120 }, { "epoch": 0.22453947368421054, "grad_norm": 2.125, "grad_norm_var": 0.07653706868489583, "learning_rate": 0.0001, "loss": 3.0054, "loss/crossentropy": 2.4329180479049684, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.2980032041668892, "loss/reg": 0.0, "step": 34130 }, { "epoch": 0.22460526315789472, "grad_norm": 2.640625, "grad_norm_var": 0.14047012329101563, "learning_rate": 0.0001, "loss": 3.0115, "loss/crossentropy": 2.339194095134735, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.22101039290428162, "loss/reg": 0.0, "step": 34140 }, { "epoch": 0.22467105263157894, "grad_norm": 2.296875, "grad_norm_var": 0.14289957682291668, "learning_rate": 0.0001, "loss": 2.9833, "loss/crossentropy": 2.1932947993278504, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.2560951545834541, "loss/reg": 0.0, "step": 34150 }, { "epoch": 0.22473684210526315, "grad_norm": 2.0, "grad_norm_var": 0.11972249348958333, "learning_rate": 0.0001, "loss": 3.0913, "loss/crossentropy": 2.341690111160278, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.21807242110371589, "loss/reg": 0.0, "step": 34160 }, { "epoch": 0.22480263157894737, "grad_norm": 2.625, "grad_norm_var": 0.039937337239583336, "learning_rate": 0.0001, "loss": 3.0886, "loss/crossentropy": 1.9180511385202408, "loss/hidden": 2.9625, "loss/incoh": 0.0, "loss/logits": 0.2749033223837614, "loss/reg": 0.0, "step": 34170 }, { "epoch": 0.22486842105263158, "grad_norm": 2.140625, "grad_norm_var": 0.0544586181640625, "learning_rate": 0.0001, "loss": 3.0813, "loss/crossentropy": 2.0566360354423523, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.22760727405548095, "loss/reg": 0.0, "step": 34180 }, { "epoch": 0.2249342105263158, "grad_norm": 2.25, "grad_norm_var": 0.0463043212890625, "learning_rate": 0.0001, "loss": 3.0512, "loss/crossentropy": 2.291218078136444, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.23884439915418626, "loss/reg": 0.0, "step": 34190 }, { "epoch": 0.225, "grad_norm": 2.171875, "grad_norm_var": 0.04059956868489583, "learning_rate": 0.0001, "loss": 3.0016, "loss/crossentropy": 2.409184718132019, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.22745371460914612, "loss/reg": 0.0, "step": 34200 }, { "epoch": 0.22506578947368422, "grad_norm": 2.40625, "grad_norm_var": 0.028944651285807293, "learning_rate": 0.0001, "loss": 3.0011, "loss/crossentropy": 2.541308951377869, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.19628863781690598, "loss/reg": 0.0, "step": 34210 }, { "epoch": 0.22513157894736843, "grad_norm": 2.796875, "grad_norm_var": 0.080078125, "learning_rate": 0.0001, "loss": 3.0556, "loss/crossentropy": 1.890171855688095, "loss/hidden": 2.975, "loss/incoh": 0.0, "loss/logits": 0.23669045567512512, "loss/reg": 0.0, "step": 34220 }, { "epoch": 0.22519736842105262, "grad_norm": 2.28125, "grad_norm_var": 0.031754557291666666, "learning_rate": 0.0001, "loss": 3.0229, "loss/crossentropy": 1.9730781435966491, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2429889589548111, "loss/reg": 0.0, "step": 34230 }, { "epoch": 0.22526315789473683, "grad_norm": 2.234375, "grad_norm_var": 0.12756754557291666, "learning_rate": 0.0001, "loss": 3.0908, "loss/crossentropy": 2.430511474609375, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.26906117498874665, "loss/reg": 0.0, "step": 34240 }, { "epoch": 0.22532894736842105, "grad_norm": 3.5625, "grad_norm_var": 0.36015625, "learning_rate": 0.0001, "loss": 3.1105, "loss/crossentropy": 2.387774920463562, "loss/hidden": 2.9265625, "loss/incoh": 0.0, "loss/logits": 0.23492622524499893, "loss/reg": 0.0, "step": 34250 }, { "epoch": 0.22539473684210526, "grad_norm": 2.546875, "grad_norm_var": 0.5035804748535156, "learning_rate": 0.0001, "loss": 2.9848, "loss/crossentropy": 2.3150025963783265, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.22484272122383117, "loss/reg": 0.0, "step": 34260 }, { "epoch": 0.22546052631578947, "grad_norm": 2.53125, "grad_norm_var": 0.47118504842122394, "learning_rate": 0.0001, "loss": 2.9996, "loss/crossentropy": 2.1695404648780823, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.20138196349143983, "loss/reg": 0.0, "step": 34270 }, { "epoch": 0.22552631578947369, "grad_norm": 2.1875, "grad_norm_var": 0.055063629150390626, "learning_rate": 0.0001, "loss": 2.9955, "loss/crossentropy": 2.348333418369293, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.2416067436337471, "loss/reg": 0.0, "step": 34280 }, { "epoch": 0.2255921052631579, "grad_norm": 2.484375, "grad_norm_var": 0.1240631103515625, "learning_rate": 0.0001, "loss": 3.051, "loss/crossentropy": 2.1614134192466734, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.2166384145617485, "loss/reg": 0.0, "step": 34290 }, { "epoch": 0.2256578947368421, "grad_norm": 2.125, "grad_norm_var": 0.18293355305989584, "learning_rate": 0.0001, "loss": 2.9882, "loss/crossentropy": 2.1018754601478578, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.19822993129491806, "loss/reg": 0.0, "step": 34300 }, { "epoch": 0.22572368421052633, "grad_norm": 2.515625, "grad_norm_var": 0.10800374348958333, "learning_rate": 0.0001, "loss": 3.1428, "loss/crossentropy": 2.4500314474105833, "loss/hidden": 2.990625, "loss/incoh": 0.0, "loss/logits": 0.3025324195623398, "loss/reg": 0.0, "step": 34310 }, { "epoch": 0.22578947368421054, "grad_norm": 2.078125, "grad_norm_var": 0.0825592041015625, "learning_rate": 0.0001, "loss": 3.0246, "loss/crossentropy": 2.1041884064674377, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.22658382654190062, "loss/reg": 0.0, "step": 34320 }, { "epoch": 0.22585526315789473, "grad_norm": 2.40625, "grad_norm_var": 0.07895406087239583, "learning_rate": 0.0001, "loss": 3.0125, "loss/crossentropy": 2.397817623615265, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.28272619247436526, "loss/reg": 0.0, "step": 34330 }, { "epoch": 0.22592105263157894, "grad_norm": 1.953125, "grad_norm_var": 0.08463134765625, "learning_rate": 0.0001, "loss": 3.0419, "loss/crossentropy": 2.0744609951972963, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.27017629742622373, "loss/reg": 0.0, "step": 34340 }, { "epoch": 0.22598684210526315, "grad_norm": 3.171875, "grad_norm_var": 0.10567118326822916, "learning_rate": 0.0001, "loss": 3.054, "loss/crossentropy": 2.5243443489074706, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.22290815711021422, "loss/reg": 0.0, "step": 34350 }, { "epoch": 0.22605263157894737, "grad_norm": 2.671875, "grad_norm_var": 0.20874735514322917, "learning_rate": 0.0001, "loss": 3.0311, "loss/crossentropy": 2.291029953956604, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.23554186373949051, "loss/reg": 0.0, "step": 34360 }, { "epoch": 0.22611842105263158, "grad_norm": 2.65625, "grad_norm_var": 0.17711588541666667, "learning_rate": 0.0001, "loss": 3.0285, "loss/crossentropy": 2.4715088486671446, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2226739391684532, "loss/reg": 0.0, "step": 34370 }, { "epoch": 0.2261842105263158, "grad_norm": 2.765625, "grad_norm_var": 0.07121988932291666, "learning_rate": 0.0001, "loss": 2.9697, "loss/crossentropy": 2.226884996891022, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.19653952047228812, "loss/reg": 0.0, "step": 34380 }, { "epoch": 0.22625, "grad_norm": 2.140625, "grad_norm_var": 0.11801656087239583, "learning_rate": 0.0001, "loss": 3.0042, "loss/crossentropy": 2.6288553953170775, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.22481338530778885, "loss/reg": 0.0, "step": 34390 }, { "epoch": 0.22631578947368422, "grad_norm": 2.234375, "grad_norm_var": 0.056029256184895834, "learning_rate": 0.0001, "loss": 2.9881, "loss/crossentropy": 2.086442506313324, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.19545092582702636, "loss/reg": 0.0, "step": 34400 }, { "epoch": 0.22638157894736843, "grad_norm": 2.34375, "grad_norm_var": 0.07093098958333334, "learning_rate": 0.0001, "loss": 2.9785, "loss/crossentropy": 2.1755173921585085, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.2160940498113632, "loss/reg": 0.0, "step": 34410 }, { "epoch": 0.22644736842105262, "grad_norm": 2.78125, "grad_norm_var": 0.5419667561848959, "learning_rate": 0.0001, "loss": 3.148, "loss/crossentropy": 2.306460678577423, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.23442021161317825, "loss/reg": 0.0, "step": 34420 }, { "epoch": 0.22651315789473683, "grad_norm": 2.359375, "grad_norm_var": 0.509814453125, "learning_rate": 0.0001, "loss": 3.144, "loss/crossentropy": 2.4273258805274964, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.2834622025489807, "loss/reg": 0.0, "step": 34430 }, { "epoch": 0.22657894736842105, "grad_norm": 2.046875, "grad_norm_var": 0.20230712890625, "learning_rate": 0.0001, "loss": 3.0404, "loss/crossentropy": 2.382106697559357, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.2566205784678459, "loss/reg": 0.0, "step": 34440 }, { "epoch": 0.22664473684210526, "grad_norm": 2.734375, "grad_norm_var": 1.1934967041015625, "learning_rate": 0.0001, "loss": 3.1251, "loss/crossentropy": 2.4071131587028503, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.25932526737451556, "loss/reg": 0.0, "step": 34450 }, { "epoch": 0.22671052631578947, "grad_norm": 3.65625, "grad_norm_var": 0.17838109334309896, "learning_rate": 0.0001, "loss": 3.0335, "loss/crossentropy": 2.3701361656188964, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.2191499412059784, "loss/reg": 0.0, "step": 34460 }, { "epoch": 0.2267763157894737, "grad_norm": 2.3125, "grad_norm_var": 0.1755633036295573, "learning_rate": 0.0001, "loss": 3.1017, "loss/crossentropy": 1.7690342009067535, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.19500833228230477, "loss/reg": 0.0, "step": 34470 }, { "epoch": 0.2268421052631579, "grad_norm": 2.265625, "grad_norm_var": 0.36656494140625, "learning_rate": 0.0001, "loss": 3.014, "loss/crossentropy": 2.1445329546928407, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.192388217151165, "loss/reg": 0.0, "step": 34480 }, { "epoch": 0.22690789473684211, "grad_norm": 3.65625, "grad_norm_var": 0.5748331705729167, "learning_rate": 0.0001, "loss": 3.156, "loss/crossentropy": 2.152642047405243, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.23683252632617952, "loss/reg": 0.0, "step": 34490 }, { "epoch": 0.22697368421052633, "grad_norm": 2.46875, "grad_norm_var": 0.1991119384765625, "learning_rate": 0.0001, "loss": 3.1698, "loss/crossentropy": 2.343445897102356, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.25687034577131274, "loss/reg": 0.0, "step": 34500 }, { "epoch": 0.2270394736842105, "grad_norm": 2.109375, "grad_norm_var": 0.1173828125, "learning_rate": 0.0001, "loss": 3.0052, "loss/crossentropy": 2.352896881103516, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.27792870104312895, "loss/reg": 0.0, "step": 34510 }, { "epoch": 0.22710526315789473, "grad_norm": 2.328125, "grad_norm_var": 0.05995992024739583, "learning_rate": 0.0001, "loss": 3.0362, "loss/crossentropy": 2.2525784373283386, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.25623630434274675, "loss/reg": 0.0, "step": 34520 }, { "epoch": 0.22717105263157894, "grad_norm": 2.1875, "grad_norm_var": 2.48424072265625, "learning_rate": 0.0001, "loss": 3.097, "loss/crossentropy": 2.3944430470466616, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.23495300412178038, "loss/reg": 0.0, "step": 34530 }, { "epoch": 0.22723684210526315, "grad_norm": 2.75, "grad_norm_var": 2.325169881184896, "learning_rate": 0.0001, "loss": 3.0441, "loss/crossentropy": 2.1533145189285277, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.20518074482679366, "loss/reg": 0.0, "step": 34540 }, { "epoch": 0.22730263157894737, "grad_norm": 3.453125, "grad_norm_var": 0.6099609375, "learning_rate": 0.0001, "loss": 3.1133, "loss/crossentropy": 2.298123669624329, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.23540952503681184, "loss/reg": 0.0, "step": 34550 }, { "epoch": 0.22736842105263158, "grad_norm": 2.625, "grad_norm_var": 0.169775390625, "learning_rate": 0.0001, "loss": 3.0075, "loss/crossentropy": 2.333153510093689, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.23416389673948287, "loss/reg": 0.0, "step": 34560 }, { "epoch": 0.2274342105263158, "grad_norm": 2.125, "grad_norm_var": 0.14983317057291667, "learning_rate": 0.0001, "loss": 3.0272, "loss/crossentropy": 2.455455815792084, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.26056560724973676, "loss/reg": 0.0, "step": 34570 }, { "epoch": 0.2275, "grad_norm": 2.359375, "grad_norm_var": 0.17576065063476562, "learning_rate": 0.0001, "loss": 2.9814, "loss/crossentropy": 1.9633507668972014, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.19498348981142044, "loss/reg": 0.0, "step": 34580 }, { "epoch": 0.22756578947368422, "grad_norm": 2.1875, "grad_norm_var": 0.1496246337890625, "learning_rate": 0.0001, "loss": 3.0304, "loss/crossentropy": 1.8702698707580567, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.2022824764251709, "loss/reg": 0.0, "step": 34590 }, { "epoch": 0.22763157894736843, "grad_norm": 2.46875, "grad_norm_var": 0.1550445556640625, "learning_rate": 0.0001, "loss": 3.027, "loss/crossentropy": 2.588277578353882, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.28010246604681016, "loss/reg": 0.0, "step": 34600 }, { "epoch": 0.22769736842105262, "grad_norm": 2.46875, "grad_norm_var": 0.09648335774739583, "learning_rate": 0.0001, "loss": 3.0036, "loss/crossentropy": 2.3437662720680237, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.21182923167943954, "loss/reg": 0.0, "step": 34610 }, { "epoch": 0.22776315789473683, "grad_norm": 2.65625, "grad_norm_var": 0.08466389973958334, "learning_rate": 0.0001, "loss": 3.0695, "loss/crossentropy": 2.3003593921661376, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.2246791511774063, "loss/reg": 0.0, "step": 34620 }, { "epoch": 0.22782894736842105, "grad_norm": 2.625, "grad_norm_var": 2.332779947916667, "learning_rate": 0.0001, "loss": 2.9537, "loss/crossentropy": 2.2143739581108095, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.2086007311940193, "loss/reg": 0.0, "step": 34630 }, { "epoch": 0.22789473684210526, "grad_norm": 2.328125, "grad_norm_var": 2.5328409830729166, "learning_rate": 0.0001, "loss": 3.0153, "loss/crossentropy": 2.1308518052101135, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.2038565307855606, "loss/reg": 0.0, "step": 34640 }, { "epoch": 0.22796052631578947, "grad_norm": 2.40625, "grad_norm_var": 0.36980361938476564, "learning_rate": 0.0001, "loss": 2.9625, "loss/crossentropy": 2.088499677181244, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.268018501996994, "loss/reg": 0.0, "step": 34650 }, { "epoch": 0.2280263157894737, "grad_norm": 1962934272.0, "grad_norm_var": 2.408194341792645e+17, "learning_rate": 0.0001, "loss": 3.0604, "loss/crossentropy": 2.413490664958954, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.21956574469804763, "loss/reg": 0.0, "step": 34660 }, { "epoch": 0.2280921052631579, "grad_norm": 2.484375, "grad_norm_var": 2.4081943414143712e+17, "learning_rate": 0.0001, "loss": 3.0371, "loss/crossentropy": 2.1357826590538025, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.19804717004299163, "loss/reg": 0.0, "step": 34670 }, { "epoch": 0.22815789473684212, "grad_norm": 2.203125, "grad_norm_var": 0.16970113118489583, "learning_rate": 0.0001, "loss": 2.9031, "loss/crossentropy": 2.4601009845733643, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.23078858107328415, "loss/reg": 0.0, "step": 34680 }, { "epoch": 0.22822368421052633, "grad_norm": 2.28125, "grad_norm_var": 0.0817535400390625, "learning_rate": 0.0001, "loss": 3.0007, "loss/crossentropy": 2.170434999465942, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.23267455995082856, "loss/reg": 0.0, "step": 34690 }, { "epoch": 0.22828947368421051, "grad_norm": 2.796875, "grad_norm_var": 0.08743489583333333, "learning_rate": 0.0001, "loss": 2.9957, "loss/crossentropy": 2.378873956203461, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.25174411833286287, "loss/reg": 0.0, "step": 34700 }, { "epoch": 0.22835526315789473, "grad_norm": 2.625, "grad_norm_var": 0.0942400614420573, "learning_rate": 0.0001, "loss": 2.9195, "loss/crossentropy": 2.3702409982681276, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.2114430248737335, "loss/reg": 0.0, "step": 34710 }, { "epoch": 0.22842105263157894, "grad_norm": 2.5625, "grad_norm_var": 0.1205230712890625, "learning_rate": 0.0001, "loss": 2.9774, "loss/crossentropy": 2.1158406376838683, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2316122904419899, "loss/reg": 0.0, "step": 34720 }, { "epoch": 0.22848684210526315, "grad_norm": 2.75, "grad_norm_var": 0.04747899373372396, "learning_rate": 0.0001, "loss": 2.9868, "loss/crossentropy": 2.2178874969482423, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.22333518415689468, "loss/reg": 0.0, "step": 34730 }, { "epoch": 0.22855263157894737, "grad_norm": 2.34375, "grad_norm_var": 0.08137105305989584, "learning_rate": 0.0001, "loss": 3.0257, "loss/crossentropy": 2.364703667163849, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.22582925111055374, "loss/reg": 0.0, "step": 34740 }, { "epoch": 0.22861842105263158, "grad_norm": 2.03125, "grad_norm_var": 0.0731842041015625, "learning_rate": 0.0001, "loss": 2.9844, "loss/crossentropy": 2.06840842962265, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.24444130361080169, "loss/reg": 0.0, "step": 34750 }, { "epoch": 0.2286842105263158, "grad_norm": 2.34375, "grad_norm_var": 0.06175308227539063, "learning_rate": 0.0001, "loss": 3.0122, "loss/crossentropy": 2.5480314254760743, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.3109170734882355, "loss/reg": 0.0, "step": 34760 }, { "epoch": 0.22875, "grad_norm": 2.296875, "grad_norm_var": 0.06448567708333333, "learning_rate": 0.0001, "loss": 2.9767, "loss/crossentropy": 2.4194095849990847, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.23701667934656143, "loss/reg": 0.0, "step": 34770 }, { "epoch": 0.22881578947368422, "grad_norm": 2.296875, "grad_norm_var": 0.13319676717122395, "learning_rate": 0.0001, "loss": 3.0117, "loss/crossentropy": 2.330536985397339, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.1930417075753212, "loss/reg": 0.0, "step": 34780 }, { "epoch": 0.2288815789473684, "grad_norm": 2.140625, "grad_norm_var": 0.13925755818684896, "learning_rate": 0.0001, "loss": 3.0049, "loss/crossentropy": 2.055137485265732, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.1990390993654728, "loss/reg": 0.0, "step": 34790 }, { "epoch": 0.22894736842105262, "grad_norm": 2.375, "grad_norm_var": 0.0531402587890625, "learning_rate": 0.0001, "loss": 2.9451, "loss/crossentropy": 2.3400762557983397, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.2514717549085617, "loss/reg": 0.0, "step": 34800 }, { "epoch": 0.22901315789473684, "grad_norm": 2.25, "grad_norm_var": 0.14586588541666667, "learning_rate": 0.0001, "loss": 3.0989, "loss/crossentropy": 2.3216898441314697, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.20793744474649428, "loss/reg": 0.0, "step": 34810 }, { "epoch": 0.22907894736842105, "grad_norm": 2.28125, "grad_norm_var": 0.23554280598958333, "learning_rate": 0.0001, "loss": 3.1053, "loss/crossentropy": 2.368983769416809, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23178687542676926, "loss/reg": 0.0, "step": 34820 }, { "epoch": 0.22914473684210526, "grad_norm": 2.140625, "grad_norm_var": 0.09830322265625, "learning_rate": 0.0001, "loss": 2.9238, "loss/crossentropy": 2.1155460715293883, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.21411084979772568, "loss/reg": 0.0, "step": 34830 }, { "epoch": 0.22921052631578948, "grad_norm": 2.796875, "grad_norm_var": 0.6712799072265625, "learning_rate": 0.0001, "loss": 3.0934, "loss/crossentropy": 1.9747644543647767, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.22649588584899902, "loss/reg": 0.0, "step": 34840 }, { "epoch": 0.2292763157894737, "grad_norm": 2.125, "grad_norm_var": 0.6342081705729167, "learning_rate": 0.0001, "loss": 2.968, "loss/crossentropy": 2.282280433177948, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.24287160784006118, "loss/reg": 0.0, "step": 34850 }, { "epoch": 0.2293421052631579, "grad_norm": 2.453125, "grad_norm_var": 0.676806640625, "learning_rate": 0.0001, "loss": 3.0144, "loss/crossentropy": 2.423893141746521, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.27746856659650804, "loss/reg": 0.0, "step": 34860 }, { "epoch": 0.22940789473684212, "grad_norm": 2.109375, "grad_norm_var": 0.15266927083333334, "learning_rate": 0.0001, "loss": 2.9868, "loss/crossentropy": 2.2340946078300474, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.23156266957521437, "loss/reg": 0.0, "step": 34870 }, { "epoch": 0.2294736842105263, "grad_norm": 2.015625, "grad_norm_var": 0.12720947265625, "learning_rate": 0.0001, "loss": 2.9166, "loss/crossentropy": 2.3554929852485658, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.2240227773785591, "loss/reg": 0.0, "step": 34880 }, { "epoch": 0.22953947368421052, "grad_norm": 2.015625, "grad_norm_var": 0.1996978759765625, "learning_rate": 0.0001, "loss": 2.9409, "loss/crossentropy": 2.1706709206104278, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.1888253793120384, "loss/reg": 0.0, "step": 34890 }, { "epoch": 0.22960526315789473, "grad_norm": 2.1875, "grad_norm_var": 0.26612955729166665, "learning_rate": 0.0001, "loss": 3.0791, "loss/crossentropy": 2.2094788432121275, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.21013466268777847, "loss/reg": 0.0, "step": 34900 }, { "epoch": 0.22967105263157894, "grad_norm": 2.359375, "grad_norm_var": 0.3094716389973958, "learning_rate": 0.0001, "loss": 3.0647, "loss/crossentropy": 2.008031153678894, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.2073305867612362, "loss/reg": 0.0, "step": 34910 }, { "epoch": 0.22973684210526316, "grad_norm": 2.234375, "grad_norm_var": 0.16913960774739584, "learning_rate": 0.0001, "loss": 2.9933, "loss/crossentropy": 2.3257227003574372, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.2534154921770096, "loss/reg": 0.0, "step": 34920 }, { "epoch": 0.22980263157894737, "grad_norm": 2.609375, "grad_norm_var": 0.09401041666666667, "learning_rate": 0.0001, "loss": 3.078, "loss/crossentropy": 2.299355685710907, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.4137351721525192, "loss/reg": 0.0, "step": 34930 }, { "epoch": 0.22986842105263158, "grad_norm": 2.484375, "grad_norm_var": 0.059163411458333336, "learning_rate": 0.0001, "loss": 3.066, "loss/crossentropy": 2.063184082508087, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.2067345403134823, "loss/reg": 0.0, "step": 34940 }, { "epoch": 0.2299342105263158, "grad_norm": 2.34375, "grad_norm_var": 0.0319000244140625, "learning_rate": 0.0001, "loss": 3.0384, "loss/crossentropy": 2.3379054069519043, "loss/hidden": 2.625, "loss/incoh": 0.0, "loss/logits": 0.21520548760890962, "loss/reg": 0.0, "step": 34950 }, { "epoch": 0.23, "grad_norm": 2.1875, "grad_norm_var": 0.04029541015625, "learning_rate": 0.0001, "loss": 3.0224, "loss/crossentropy": 2.2715999722480773, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.23961079716682435, "loss/reg": 0.0, "step": 34960 }, { "epoch": 0.23006578947368422, "grad_norm": 2.375, "grad_norm_var": 0.017545572916666665, "learning_rate": 0.0001, "loss": 2.9942, "loss/crossentropy": 2.547387886047363, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.2642354011535645, "loss/reg": 0.0, "step": 34970 }, { "epoch": 0.2301315789473684, "grad_norm": 2.703125, "grad_norm_var": 0.0195465087890625, "learning_rate": 0.0001, "loss": 2.9968, "loss/crossentropy": 2.3052905440330504, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.2478991135954857, "loss/reg": 0.0, "step": 34980 }, { "epoch": 0.23019736842105262, "grad_norm": 2.296875, "grad_norm_var": 0.03509114583333333, "learning_rate": 0.0001, "loss": 2.9466, "loss/crossentropy": 2.136173462867737, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.23240896165370942, "loss/reg": 0.0, "step": 34990 }, { "epoch": 0.23026315789473684, "grad_norm": 2.578125, "grad_norm_var": 0.04448140462239583, "learning_rate": 0.0001, "loss": 3.0289, "loss/crossentropy": 2.2616278886795045, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.2103984072804451, "loss/reg": 0.0, "step": 35000 }, { "epoch": 0.23032894736842105, "grad_norm": 2.21875, "grad_norm_var": 0.055464680989583334, "learning_rate": 0.0001, "loss": 3.0574, "loss/crossentropy": 2.234957480430603, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.22579004764556884, "loss/reg": 0.0, "step": 35010 }, { "epoch": 0.23039473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.0838531494140625, "learning_rate": 0.0001, "loss": 3.0444, "loss/crossentropy": 2.2260616183280946, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.2393326446413994, "loss/reg": 0.0, "step": 35020 }, { "epoch": 0.23046052631578948, "grad_norm": 2.4375, "grad_norm_var": 0.110595703125, "learning_rate": 0.0001, "loss": 3.0705, "loss/crossentropy": 2.2648324608802795, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.2452955961227417, "loss/reg": 0.0, "step": 35030 }, { "epoch": 0.2305263157894737, "grad_norm": 2.234375, "grad_norm_var": 0.05959879557291667, "learning_rate": 0.0001, "loss": 3.0123, "loss/crossentropy": 2.3035391211509704, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.20398545116186143, "loss/reg": 0.0, "step": 35040 }, { "epoch": 0.2305921052631579, "grad_norm": 2.34375, "grad_norm_var": 0.10580952962239583, "learning_rate": 0.0001, "loss": 3.0008, "loss/crossentropy": 2.467655122280121, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2141210839152336, "loss/reg": 0.0, "step": 35050 }, { "epoch": 0.23065789473684212, "grad_norm": 2.640625, "grad_norm_var": 0.12373046875, "learning_rate": 0.0001, "loss": 2.9259, "loss/crossentropy": 2.4413907408714293, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.21587167531251908, "loss/reg": 0.0, "step": 35060 }, { "epoch": 0.2307236842105263, "grad_norm": 2.4375, "grad_norm_var": 0.18179931640625, "learning_rate": 0.0001, "loss": 3.0733, "loss/crossentropy": 2.5797677636146545, "loss/hidden": 3.096875, "loss/incoh": 0.0, "loss/logits": 0.3117774799466133, "loss/reg": 0.0, "step": 35070 }, { "epoch": 0.23078947368421052, "grad_norm": 2.46875, "grad_norm_var": 8.836572265625, "learning_rate": 0.0001, "loss": 2.98, "loss/crossentropy": 2.0451542496681214, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.25981259644031524, "loss/reg": 0.0, "step": 35080 }, { "epoch": 0.23085526315789473, "grad_norm": 2.140625, "grad_norm_var": 8.937858072916667, "learning_rate": 0.0001, "loss": 2.9729, "loss/crossentropy": 2.4952213764190674, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.27273062616586685, "loss/reg": 0.0, "step": 35090 }, { "epoch": 0.23092105263157894, "grad_norm": 2.421875, "grad_norm_var": 0.039632161458333336, "learning_rate": 0.0001, "loss": 3.0086, "loss/crossentropy": 2.1235549569129946, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.2294046923518181, "loss/reg": 0.0, "step": 35100 }, { "epoch": 0.23098684210526316, "grad_norm": 2.15625, "grad_norm_var": 0.2903065999348958, "learning_rate": 0.0001, "loss": 3.004, "loss/crossentropy": 2.2860934376716613, "loss/hidden": 3.009375, "loss/incoh": 0.0, "loss/logits": 0.23025134950876236, "loss/reg": 0.0, "step": 35110 }, { "epoch": 0.23105263157894737, "grad_norm": 3.125, "grad_norm_var": 0.1121978759765625, "learning_rate": 0.0001, "loss": 2.9751, "loss/crossentropy": 2.2237872958183287, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.2481079339981079, "loss/reg": 0.0, "step": 35120 }, { "epoch": 0.23111842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.12190348307291667, "learning_rate": 0.0001, "loss": 3.0254, "loss/crossentropy": 2.367337203025818, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.24131873100996018, "loss/reg": 0.0, "step": 35130 }, { "epoch": 0.2311842105263158, "grad_norm": 2.5, "grad_norm_var": 0.061310831705729166, "learning_rate": 0.0001, "loss": 3.0044, "loss/crossentropy": 2.287555253505707, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.24582310616970063, "loss/reg": 0.0, "step": 35140 }, { "epoch": 0.23125, "grad_norm": 2.046875, "grad_norm_var": 0.0275543212890625, "learning_rate": 0.0001, "loss": 3.0166, "loss/crossentropy": 2.3102270364761353, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.22130452394485473, "loss/reg": 0.0, "step": 35150 }, { "epoch": 0.2313157894736842, "grad_norm": 2.359375, "grad_norm_var": 0.022557576497395832, "learning_rate": 0.0001, "loss": 3.0183, "loss/crossentropy": 2.3422921657562257, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.26338334679603576, "loss/reg": 0.0, "step": 35160 }, { "epoch": 0.2313815789473684, "grad_norm": 3.203125, "grad_norm_var": 0.0848297119140625, "learning_rate": 0.0001, "loss": 3.0288, "loss/crossentropy": 2.1819352507591248, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.21551729440689088, "loss/reg": 0.0, "step": 35170 }, { "epoch": 0.23144736842105262, "grad_norm": 2.296875, "grad_norm_var": 0.17290751139322916, "learning_rate": 0.0001, "loss": 3.0282, "loss/crossentropy": 2.512664186954498, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.20309768319129945, "loss/reg": 0.0, "step": 35180 }, { "epoch": 0.23151315789473684, "grad_norm": 2.453125, "grad_norm_var": 3.398986258287209e+17, "learning_rate": 0.0001, "loss": 3.1551, "loss/crossentropy": 2.3065868735313417, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.2547546997666359, "loss/reg": 0.0, "step": 35190 }, { "epoch": 0.23157894736842105, "grad_norm": 2.203125, "grad_norm_var": 0.2131744384765625, "learning_rate": 0.0001, "loss": 3.0611, "loss/crossentropy": 2.5265056133270263, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.21266352534294128, "loss/reg": 0.0, "step": 35200 }, { "epoch": 0.23164473684210526, "grad_norm": 2.921875, "grad_norm_var": 0.4802480061848958, "learning_rate": 0.0001, "loss": 3.1018, "loss/crossentropy": 2.378112018108368, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.2218308851122856, "loss/reg": 0.0, "step": 35210 }, { "epoch": 0.23171052631578948, "grad_norm": 2.171875, "grad_norm_var": 0.28329671223958336, "learning_rate": 0.0001, "loss": 2.9809, "loss/crossentropy": 2.3841970801353454, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.20064097195863723, "loss/reg": 0.0, "step": 35220 }, { "epoch": 0.2317763157894737, "grad_norm": 2.28125, "grad_norm_var": 0.13202718098958333, "learning_rate": 0.0001, "loss": 3.0415, "loss/crossentropy": 2.2034762263298036, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.26952334195375444, "loss/reg": 0.0, "step": 35230 }, { "epoch": 0.2318421052631579, "grad_norm": 3.484375, "grad_norm_var": 0.17197850545247395, "learning_rate": 0.0001, "loss": 2.9833, "loss/crossentropy": 2.333740735054016, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.2028803788125515, "loss/reg": 0.0, "step": 35240 }, { "epoch": 0.23190789473684212, "grad_norm": 2.25, "grad_norm_var": 0.12504247029622395, "learning_rate": 0.0001, "loss": 3.0428, "loss/crossentropy": 1.9891301035881042, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.33240934908390046, "loss/reg": 0.0, "step": 35250 }, { "epoch": 0.2319736842105263, "grad_norm": 2.125, "grad_norm_var": 0.0198394775390625, "learning_rate": 0.0001, "loss": 2.9679, "loss/crossentropy": 2.394805371761322, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.2079624727368355, "loss/reg": 0.0, "step": 35260 }, { "epoch": 0.23203947368421052, "grad_norm": 2.125, "grad_norm_var": 0.07119140625, "learning_rate": 0.0001, "loss": 2.9843, "loss/crossentropy": 2.466187059879303, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.21209322810173034, "loss/reg": 0.0, "step": 35270 }, { "epoch": 0.23210526315789473, "grad_norm": 2.796875, "grad_norm_var": 0.7116737365722656, "learning_rate": 0.0001, "loss": 2.9743, "loss/crossentropy": 2.2582387804985045, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2430204778909683, "loss/reg": 0.0, "step": 35280 }, { "epoch": 0.23217105263157894, "grad_norm": 3.28125, "grad_norm_var": 3.206175898909409e+17, "learning_rate": 0.0001, "loss": 3.1474, "loss/crossentropy": 2.2272575974464415, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.20790190547704696, "loss/reg": 0.0, "step": 35290 }, { "epoch": 0.23223684210526316, "grad_norm": 2.75, "grad_norm_var": 3.206175899148288e+17, "learning_rate": 0.0001, "loss": 3.0553, "loss/crossentropy": 2.3405486226081846, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.24497875720262527, "loss/reg": 0.0, "step": 35300 }, { "epoch": 0.23230263157894737, "grad_norm": 2.3125, "grad_norm_var": 0.12532145182291668, "learning_rate": 0.0001, "loss": 3.0759, "loss/crossentropy": 2.316471701860428, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.23606702983379363, "loss/reg": 0.0, "step": 35310 }, { "epoch": 0.23236842105263159, "grad_norm": 2.25, "grad_norm_var": 0.07993876139322917, "learning_rate": 0.0001, "loss": 3.0771, "loss/crossentropy": 2.068638467788696, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.2723642893135548, "loss/reg": 0.0, "step": 35320 }, { "epoch": 0.2324342105263158, "grad_norm": 2.375, "grad_norm_var": 0.1643463134765625, "learning_rate": 0.0001, "loss": 3.0607, "loss/crossentropy": 2.209253963828087, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2534912425093353, "loss/reg": 0.0, "step": 35330 }, { "epoch": 0.2325, "grad_norm": 2.3125, "grad_norm_var": 0.0254302978515625, "learning_rate": 0.0001, "loss": 3.0043, "loss/crossentropy": 2.3624029099941253, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.25295102000236513, "loss/reg": 0.0, "step": 35340 }, { "epoch": 0.2325657894736842, "grad_norm": 2.171875, "grad_norm_var": 0.04498291015625, "learning_rate": 0.0001, "loss": 3.0386, "loss/crossentropy": 2.3273918867111205, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.22172463536262513, "loss/reg": 0.0, "step": 35350 }, { "epoch": 0.2326315789473684, "grad_norm": 2.390625, "grad_norm_var": 0.055403645833333334, "learning_rate": 0.0001, "loss": 3.0584, "loss/crossentropy": 1.895402479171753, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.23743433207273484, "loss/reg": 0.0, "step": 35360 }, { "epoch": 0.23269736842105262, "grad_norm": 2.421875, "grad_norm_var": 0.056962076822916666, "learning_rate": 0.0001, "loss": 3.1011, "loss/crossentropy": 2.0022137641906737, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.21825481653213502, "loss/reg": 0.0, "step": 35370 }, { "epoch": 0.23276315789473684, "grad_norm": 2.40625, "grad_norm_var": 0.08046468098958333, "learning_rate": 0.0001, "loss": 3.0227, "loss/crossentropy": 1.9984350323677063, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.2356773540377617, "loss/reg": 0.0, "step": 35380 }, { "epoch": 0.23282894736842105, "grad_norm": 3.140625, "grad_norm_var": 0.13664957682291667, "learning_rate": 0.0001, "loss": 3.0069, "loss/crossentropy": 2.1517210602760315, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.22341941446065902, "loss/reg": 0.0, "step": 35390 }, { "epoch": 0.23289473684210527, "grad_norm": 2.40625, "grad_norm_var": 0.26809794108072915, "learning_rate": 0.0001, "loss": 3.0346, "loss/crossentropy": 2.380366015434265, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2661885678768158, "loss/reg": 0.0, "step": 35400 }, { "epoch": 0.23296052631578948, "grad_norm": 2.671875, "grad_norm_var": 0.036149088541666666, "learning_rate": 0.0001, "loss": 3.0274, "loss/crossentropy": 2.084783911705017, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.22119598612189292, "loss/reg": 0.0, "step": 35410 }, { "epoch": 0.2330263157894737, "grad_norm": 2.0625, "grad_norm_var": 0.21288960774739582, "learning_rate": 0.0001, "loss": 2.9574, "loss/crossentropy": 2.288650333881378, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.2464343011379242, "loss/reg": 0.0, "step": 35420 }, { "epoch": 0.2330921052631579, "grad_norm": 2.765625, "grad_norm_var": 0.22062174479166666, "learning_rate": 0.0001, "loss": 3.0144, "loss/crossentropy": 2.1648478746414184, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.2272023007273674, "loss/reg": 0.0, "step": 35430 }, { "epoch": 0.2331578947368421, "grad_norm": 2.546875, "grad_norm_var": 0.07825113932291666, "learning_rate": 0.0001, "loss": 2.9664, "loss/crossentropy": 2.4016910076141356, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.19540313482284546, "loss/reg": 0.0, "step": 35440 }, { "epoch": 0.2332236842105263, "grad_norm": 2.0625, "grad_norm_var": 0.12473526000976562, "learning_rate": 0.0001, "loss": 3.0054, "loss/crossentropy": 2.2149259626865385, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.211626535654068, "loss/reg": 0.0, "step": 35450 }, { "epoch": 0.23328947368421052, "grad_norm": 2.21875, "grad_norm_var": 0.2657244364420573, "learning_rate": 0.0001, "loss": 3.022, "loss/crossentropy": 2.0099823474884033, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.21212436258792877, "loss/reg": 0.0, "step": 35460 }, { "epoch": 0.23335526315789473, "grad_norm": 2.375, "grad_norm_var": 0.035560862223307295, "learning_rate": 0.0001, "loss": 2.959, "loss/crossentropy": 2.149253064393997, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.19234723448753357, "loss/reg": 0.0, "step": 35470 }, { "epoch": 0.23342105263157895, "grad_norm": 2.28125, "grad_norm_var": 0.196630859375, "learning_rate": 0.0001, "loss": 3.0157, "loss/crossentropy": 2.3657627940177917, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.223566497862339, "loss/reg": 0.0, "step": 35480 }, { "epoch": 0.23348684210526316, "grad_norm": 2.171875, "grad_norm_var": 0.024527994791666667, "learning_rate": 0.0001, "loss": 2.9839, "loss/crossentropy": 2.4007309794425966, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.19649036675691606, "loss/reg": 0.0, "step": 35490 }, { "epoch": 0.23355263157894737, "grad_norm": 3.046875, "grad_norm_var": 0.05445556640625, "learning_rate": 0.0001, "loss": 3.0323, "loss/crossentropy": 2.414763641357422, "loss/hidden": 2.9296875, "loss/incoh": 0.0, "loss/logits": 0.2565377399325371, "loss/reg": 0.0, "step": 35500 }, { "epoch": 0.2336184210526316, "grad_norm": 2.625, "grad_norm_var": 0.04409077962239583, "learning_rate": 0.0001, "loss": 2.9834, "loss/crossentropy": 2.272168481349945, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.2014573760330677, "loss/reg": 0.0, "step": 35510 }, { "epoch": 0.2336842105263158, "grad_norm": 2.234375, "grad_norm_var": 0.091162109375, "learning_rate": 0.0001, "loss": 3.0118, "loss/crossentropy": 2.4486998319625854, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.2566034272313118, "loss/reg": 0.0, "step": 35520 }, { "epoch": 0.23375, "grad_norm": 2.34375, "grad_norm_var": 0.11804097493489583, "learning_rate": 0.0001, "loss": 2.9697, "loss/crossentropy": 2.2795116662979127, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.20817176550626754, "loss/reg": 0.0, "step": 35530 }, { "epoch": 0.2338157894736842, "grad_norm": 2.40625, "grad_norm_var": 0.05882059733072917, "learning_rate": 0.0001, "loss": 2.9864, "loss/crossentropy": 2.1612624883651734, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.234655499458313, "loss/reg": 0.0, "step": 35540 }, { "epoch": 0.2338815789473684, "grad_norm": 2.15625, "grad_norm_var": 0.051545206705729166, "learning_rate": 0.0001, "loss": 2.9369, "loss/crossentropy": 2.342636692523956, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.2195874720811844, "loss/reg": 0.0, "step": 35550 }, { "epoch": 0.23394736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.2511678059895833, "learning_rate": 0.0001, "loss": 2.9652, "loss/crossentropy": 2.4172530651092528, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.22586058229207992, "loss/reg": 0.0, "step": 35560 }, { "epoch": 0.23401315789473684, "grad_norm": 2.265625, "grad_norm_var": 0.21130269368489582, "learning_rate": 0.0001, "loss": 2.9721, "loss/crossentropy": 2.19435909986496, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.22134042978286744, "loss/reg": 0.0, "step": 35570 }, { "epoch": 0.23407894736842105, "grad_norm": 2.1875, "grad_norm_var": 0.12469075520833334, "learning_rate": 0.0001, "loss": 3.0487, "loss/crossentropy": 2.491572880744934, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.2359505832195282, "loss/reg": 0.0, "step": 35580 }, { "epoch": 0.23414473684210527, "grad_norm": 2.546875, "grad_norm_var": 0.24637832641601562, "learning_rate": 0.0001, "loss": 3.0188, "loss/crossentropy": 2.0867708206176756, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.29949132055044175, "loss/reg": 0.0, "step": 35590 }, { "epoch": 0.23421052631578948, "grad_norm": 2.78125, "grad_norm_var": 0.09173075358072917, "learning_rate": 0.0001, "loss": 3.0162, "loss/crossentropy": 2.500631070137024, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.23038013875484467, "loss/reg": 0.0, "step": 35600 }, { "epoch": 0.2342763157894737, "grad_norm": 2.78125, "grad_norm_var": 0.09622294108072917, "learning_rate": 0.0001, "loss": 2.9738, "loss/crossentropy": 2.330187511444092, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.21412627547979354, "loss/reg": 0.0, "step": 35610 }, { "epoch": 0.2343421052631579, "grad_norm": 2.09375, "grad_norm_var": 0.06207275390625, "learning_rate": 0.0001, "loss": 2.9646, "loss/crossentropy": 2.0748894095420836, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.2262689083814621, "loss/reg": 0.0, "step": 35620 }, { "epoch": 0.2344078947368421, "grad_norm": 2.546875, "grad_norm_var": 0.0324859619140625, "learning_rate": 0.0001, "loss": 2.9761, "loss/crossentropy": 2.415894079208374, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.20092394277453424, "loss/reg": 0.0, "step": 35630 }, { "epoch": 0.2344736842105263, "grad_norm": 2.34375, "grad_norm_var": 0.1505859375, "learning_rate": 0.0001, "loss": 3.0498, "loss/crossentropy": 2.2875362753868105, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.24587944149971008, "loss/reg": 0.0, "step": 35640 }, { "epoch": 0.23453947368421052, "grad_norm": 2.515625, "grad_norm_var": 0.16611226399739584, "learning_rate": 0.0001, "loss": 2.9052, "loss/crossentropy": 2.4340951919555662, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.22377320230007172, "loss/reg": 0.0, "step": 35650 }, { "epoch": 0.23460526315789473, "grad_norm": 2.140625, "grad_norm_var": 0.06776936848958333, "learning_rate": 0.0001, "loss": 2.9088, "loss/crossentropy": 2.3177594542503357, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.2174980953335762, "loss/reg": 0.0, "step": 35660 }, { "epoch": 0.23467105263157895, "grad_norm": 2.1875, "grad_norm_var": 0.0744049072265625, "learning_rate": 0.0001, "loss": 3.0171, "loss/crossentropy": 2.2726083517074587, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.30478386878967284, "loss/reg": 0.0, "step": 35670 }, { "epoch": 0.23473684210526316, "grad_norm": 2.734375, "grad_norm_var": 0.0516754150390625, "learning_rate": 0.0001, "loss": 3.072, "loss/crossentropy": 2.2021562099456786, "loss/hidden": 3.0625, "loss/incoh": 0.0, "loss/logits": 0.27211851328611375, "loss/reg": 0.0, "step": 35680 }, { "epoch": 0.23480263157894737, "grad_norm": 2.515625, "grad_norm_var": 0.05318603515625, "learning_rate": 0.0001, "loss": 3.0061, "loss/crossentropy": 2.2337870597839355, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.23637954592704774, "loss/reg": 0.0, "step": 35690 }, { "epoch": 0.2348684210526316, "grad_norm": 2.546875, "grad_norm_var": 0.21002197265625, "learning_rate": 0.0001, "loss": 3.0582, "loss/crossentropy": 1.9850136280059814, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.2096769317984581, "loss/reg": 0.0, "step": 35700 }, { "epoch": 0.2349342105263158, "grad_norm": 6.125, "grad_norm_var": 1.515673828125, "learning_rate": 0.0001, "loss": 2.9576, "loss/crossentropy": 2.496402633190155, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.22533617317676544, "loss/reg": 0.0, "step": 35710 }, { "epoch": 0.235, "grad_norm": 2.359375, "grad_norm_var": 0.95963134765625, "learning_rate": 0.0001, "loss": 3.0797, "loss/crossentropy": 2.140752410888672, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.20643131881952287, "loss/reg": 0.0, "step": 35720 }, { "epoch": 0.2350657894736842, "grad_norm": 2.796875, "grad_norm_var": 0.19202067057291666, "learning_rate": 0.0001, "loss": 2.9408, "loss/crossentropy": 1.9449581146240233, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.1967112362384796, "loss/reg": 0.0, "step": 35730 }, { "epoch": 0.2351315789473684, "grad_norm": 2.34375, "grad_norm_var": 0.718365224202474, "learning_rate": 0.0001, "loss": 2.873, "loss/crossentropy": 2.059830403327942, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.1984706148505211, "loss/reg": 0.0, "step": 35740 }, { "epoch": 0.23519736842105263, "grad_norm": 1.90625, "grad_norm_var": 0.7783404032389323, "learning_rate": 0.0001, "loss": 2.9617, "loss/crossentropy": 2.5269897818565368, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.24567708522081375, "loss/reg": 0.0, "step": 35750 }, { "epoch": 0.23526315789473684, "grad_norm": 2.75, "grad_norm_var": 0.3910441080729167, "learning_rate": 0.0001, "loss": 3.049, "loss/crossentropy": 2.2286418437957765, "loss/hidden": 3.084375, "loss/incoh": 0.0, "loss/logits": 0.22506199181079864, "loss/reg": 0.0, "step": 35760 }, { "epoch": 0.23532894736842105, "grad_norm": 2.75, "grad_norm_var": 0.20345052083333334, "learning_rate": 0.0001, "loss": 3.0262, "loss/crossentropy": 2.321590745449066, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.22042199671268464, "loss/reg": 0.0, "step": 35770 }, { "epoch": 0.23539473684210527, "grad_norm": 2.328125, "grad_norm_var": 0.6254981994628906, "learning_rate": 0.0001, "loss": 2.9981, "loss/crossentropy": 2.3971486926078795, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.21692513525485993, "loss/reg": 0.0, "step": 35780 }, { "epoch": 0.23546052631578948, "grad_norm": 2.078125, "grad_norm_var": 0.2902809143066406, "learning_rate": 0.0001, "loss": 2.931, "loss/crossentropy": 2.3023766756057737, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.2162985995411873, "loss/reg": 0.0, "step": 35790 }, { "epoch": 0.2355263157894737, "grad_norm": 2.421875, "grad_norm_var": 0.16299540201822918, "learning_rate": 0.0001, "loss": 3.0115, "loss/crossentropy": 2.355253207683563, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.24053574204444886, "loss/reg": 0.0, "step": 35800 }, { "epoch": 0.2355921052631579, "grad_norm": 2.15625, "grad_norm_var": 0.10937398274739583, "learning_rate": 0.0001, "loss": 2.9729, "loss/crossentropy": 2.493846869468689, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.2182188354432583, "loss/reg": 0.0, "step": 35810 }, { "epoch": 0.2356578947368421, "grad_norm": 2.296875, "grad_norm_var": 0.46917317708333334, "learning_rate": 0.0001, "loss": 3.0177, "loss/crossentropy": 2.3108774185180665, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.17969555854797364, "loss/reg": 0.0, "step": 35820 }, { "epoch": 0.2357236842105263, "grad_norm": 2.21875, "grad_norm_var": 0.29129231770833336, "learning_rate": 0.0001, "loss": 3.0199, "loss/crossentropy": 2.1752161145210267, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.25736078470945356, "loss/reg": 0.0, "step": 35830 }, { "epoch": 0.23578947368421052, "grad_norm": 2.390625, "grad_norm_var": 0.15127665201822918, "learning_rate": 0.0001, "loss": 3.0057, "loss/crossentropy": 2.21168737411499, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.24892309606075286, "loss/reg": 0.0, "step": 35840 }, { "epoch": 0.23585526315789473, "grad_norm": 2.328125, "grad_norm_var": 0.38827718098958336, "learning_rate": 0.0001, "loss": 2.9855, "loss/crossentropy": 2.4601115822792052, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.23503982722759248, "loss/reg": 0.0, "step": 35850 }, { "epoch": 0.23592105263157895, "grad_norm": 2.375, "grad_norm_var": 0.0817779541015625, "learning_rate": 0.0001, "loss": 2.9943, "loss/crossentropy": 2.4704690098762514, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2563426047563553, "loss/reg": 0.0, "step": 35860 }, { "epoch": 0.23598684210526316, "grad_norm": 2.078125, "grad_norm_var": 0.16731541951497395, "learning_rate": 0.0001, "loss": 3.0406, "loss/crossentropy": 2.6593142986297607, "loss/hidden": 3.028125, "loss/incoh": 0.0, "loss/logits": 0.2891248628497124, "loss/reg": 0.0, "step": 35870 }, { "epoch": 0.23605263157894738, "grad_norm": 2.234375, "grad_norm_var": 0.13696263631184896, "learning_rate": 0.0001, "loss": 3.0187, "loss/crossentropy": 2.4739042282104493, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.23506251722574234, "loss/reg": 0.0, "step": 35880 }, { "epoch": 0.2361184210526316, "grad_norm": 2.296875, "grad_norm_var": 0.159375, "learning_rate": 0.0001, "loss": 3.0207, "loss/crossentropy": 2.2195907831192017, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.20974744409322738, "loss/reg": 0.0, "step": 35890 }, { "epoch": 0.2361842105263158, "grad_norm": 2.21875, "grad_norm_var": 0.2710039774576823, "learning_rate": 0.0001, "loss": 2.9166, "loss/crossentropy": 2.1691187381744386, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.20902955271303653, "loss/reg": 0.0, "step": 35900 }, { "epoch": 0.23625, "grad_norm": 2.484375, "grad_norm_var": 0.021848297119140624, "learning_rate": 0.0001, "loss": 2.9421, "loss/crossentropy": 2.380651593208313, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.21168311238288878, "loss/reg": 0.0, "step": 35910 }, { "epoch": 0.2363157894736842, "grad_norm": 2.390625, "grad_norm_var": 0.0631256103515625, "learning_rate": 0.0001, "loss": 2.9784, "loss/crossentropy": 2.194112575054169, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2107112467288971, "loss/reg": 0.0, "step": 35920 }, { "epoch": 0.23638157894736841, "grad_norm": 2.296875, "grad_norm_var": 0.19700419108072917, "learning_rate": 0.0001, "loss": 3.0817, "loss/crossentropy": 2.289298951625824, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.1993821457028389, "loss/reg": 0.0, "step": 35930 }, { "epoch": 0.23644736842105263, "grad_norm": 2.828125, "grad_norm_var": 0.07625325520833333, "learning_rate": 0.0001, "loss": 2.9474, "loss/crossentropy": 2.4226235032081602, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.2387748032808304, "loss/reg": 0.0, "step": 35940 }, { "epoch": 0.23651315789473684, "grad_norm": 2.390625, "grad_norm_var": 0.04461263020833333, "learning_rate": 0.0001, "loss": 2.9628, "loss/crossentropy": 2.380166971683502, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.2143217995762825, "loss/reg": 0.0, "step": 35950 }, { "epoch": 0.23657894736842106, "grad_norm": 2.3125, "grad_norm_var": 0.1749908447265625, "learning_rate": 0.0001, "loss": 2.9418, "loss/crossentropy": 2.286650228500366, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.21095365434885024, "loss/reg": 0.0, "step": 35960 }, { "epoch": 0.23664473684210527, "grad_norm": 2.3125, "grad_norm_var": 0.11900634765625, "learning_rate": 0.0001, "loss": 3.0537, "loss/crossentropy": 2.2950440287590026, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.2576613754034042, "loss/reg": 0.0, "step": 35970 }, { "epoch": 0.23671052631578948, "grad_norm": 2.28125, "grad_norm_var": 0.12654520670572916, "learning_rate": 0.0001, "loss": 2.9324, "loss/crossentropy": 2.236435151100159, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.20467466115951538, "loss/reg": 0.0, "step": 35980 }, { "epoch": 0.2367763157894737, "grad_norm": 2.40625, "grad_norm_var": 0.12952473958333333, "learning_rate": 0.0001, "loss": 2.9877, "loss/crossentropy": 2.503733921051025, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.21396732181310654, "loss/reg": 0.0, "step": 35990 }, { "epoch": 0.23684210526315788, "grad_norm": 2.109375, "grad_norm_var": 0.10692952473958334, "learning_rate": 0.0001, "loss": 2.9584, "loss/crossentropy": 2.493756449222565, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.23922124207019807, "loss/reg": 0.0, "step": 36000 }, { "epoch": 0.2369078947368421, "grad_norm": 2.28125, "grad_norm_var": 0.92545166015625, "learning_rate": 0.0001, "loss": 3.0174, "loss/crossentropy": 1.8596992015838623, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.19636918008327484, "loss/reg": 0.0, "step": 36010 }, { "epoch": 0.2369736842105263, "grad_norm": 2.28125, "grad_norm_var": 0.9360260009765625, "learning_rate": 0.0001, "loss": 2.9994, "loss/crossentropy": 2.239300674200058, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.21985187232494355, "loss/reg": 0.0, "step": 36020 }, { "epoch": 0.23703947368421052, "grad_norm": 4.625, "grad_norm_var": 2.2611793518066405, "learning_rate": 0.0001, "loss": 2.987, "loss/crossentropy": 2.22670122385025, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.22871969491243363, "loss/reg": 0.0, "step": 36030 }, { "epoch": 0.23710526315789474, "grad_norm": 3.0, "grad_norm_var": 0.4241167704264323, "learning_rate": 0.0001, "loss": 3.0196, "loss/crossentropy": 2.0241116285324097, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.24767006561160088, "loss/reg": 0.0, "step": 36040 }, { "epoch": 0.23717105263157895, "grad_norm": 2.09375, "grad_norm_var": 0.30030924479166665, "learning_rate": 0.0001, "loss": 2.9484, "loss/crossentropy": 2.086419093608856, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.1900019347667694, "loss/reg": 0.0, "step": 36050 }, { "epoch": 0.23723684210526316, "grad_norm": 2.25, "grad_norm_var": 0.1086822509765625, "learning_rate": 0.0001, "loss": 2.9919, "loss/crossentropy": 2.1598266899585723, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.26148284450173376, "loss/reg": 0.0, "step": 36060 }, { "epoch": 0.23730263157894738, "grad_norm": 3.203125, "grad_norm_var": 0.10439453125, "learning_rate": 0.0001, "loss": 3.0653, "loss/crossentropy": 2.1726396083831787, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.2597889766097069, "loss/reg": 0.0, "step": 36070 }, { "epoch": 0.2373684210526316, "grad_norm": 2.140625, "grad_norm_var": 0.10669657389322916, "learning_rate": 0.0001, "loss": 2.9768, "loss/crossentropy": 2.1650060296058653, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.22962034344673157, "loss/reg": 0.0, "step": 36080 }, { "epoch": 0.2374342105263158, "grad_norm": 2.40625, "grad_norm_var": 0.04478759765625, "learning_rate": 0.0001, "loss": 2.9233, "loss/crossentropy": 2.289682912826538, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.2165387198328972, "loss/reg": 0.0, "step": 36090 }, { "epoch": 0.2375, "grad_norm": 1.9921875, "grad_norm_var": 0.09746068318684896, "learning_rate": 0.0001, "loss": 3.0195, "loss/crossentropy": 2.332055389881134, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.21832392662763594, "loss/reg": 0.0, "step": 36100 }, { "epoch": 0.2375657894736842, "grad_norm": 2.75, "grad_norm_var": 0.07844619750976563, "learning_rate": 0.0001, "loss": 3.0759, "loss/crossentropy": 2.5219852209091185, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.27026659697294236, "loss/reg": 0.0, "step": 36110 }, { "epoch": 0.23763157894736842, "grad_norm": 2.390625, "grad_norm_var": 0.07219950358072917, "learning_rate": 0.0001, "loss": 2.9762, "loss/crossentropy": 2.2172631919384003, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.22206666469573974, "loss/reg": 0.0, "step": 36120 }, { "epoch": 0.23769736842105263, "grad_norm": 2.0625, "grad_norm_var": 0.11806640625, "learning_rate": 0.0001, "loss": 2.9813, "loss/crossentropy": 2.235606110095978, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.23971866220235824, "loss/reg": 0.0, "step": 36130 }, { "epoch": 0.23776315789473684, "grad_norm": 2.53125, "grad_norm_var": 1.190087890625, "learning_rate": 0.0001, "loss": 3.0329, "loss/crossentropy": 2.2865034997463227, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.32822101563215256, "loss/reg": 0.0, "step": 36140 }, { "epoch": 0.23782894736842106, "grad_norm": 2.625, "grad_norm_var": 1.3279296875, "learning_rate": 0.0001, "loss": 3.0021, "loss/crossentropy": 2.2038097441196443, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.21367209404706955, "loss/reg": 0.0, "step": 36150 }, { "epoch": 0.23789473684210527, "grad_norm": 2.265625, "grad_norm_var": 0.05592041015625, "learning_rate": 0.0001, "loss": 3.0254, "loss/crossentropy": 2.2345643639564514, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.27571815252304077, "loss/reg": 0.0, "step": 36160 }, { "epoch": 0.23796052631578948, "grad_norm": 2.65625, "grad_norm_var": 0.11005757649739584, "learning_rate": 0.0001, "loss": 3.039, "loss/crossentropy": 2.1099373579025267, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.21341636329889296, "loss/reg": 0.0, "step": 36170 }, { "epoch": 0.2380263157894737, "grad_norm": 2.125, "grad_norm_var": 0.114306640625, "learning_rate": 0.0001, "loss": 2.9229, "loss/crossentropy": 2.1778077363967894, "loss/hidden": 2.5546875, "loss/incoh": 0.0, "loss/logits": 0.18448112457990645, "loss/reg": 0.0, "step": 36180 }, { "epoch": 0.23809210526315788, "grad_norm": 2.375, "grad_norm_var": 0.042867024739583336, "learning_rate": 0.0001, "loss": 2.9876, "loss/crossentropy": 2.5516823649406435, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.25054328292608263, "loss/reg": 0.0, "step": 36190 }, { "epoch": 0.2381578947368421, "grad_norm": 2.390625, "grad_norm_var": 0.07666727701822916, "learning_rate": 0.0001, "loss": 2.9734, "loss/crossentropy": 2.257242572307587, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.2207578793168068, "loss/reg": 0.0, "step": 36200 }, { "epoch": 0.2382236842105263, "grad_norm": 2.078125, "grad_norm_var": 0.07727864583333334, "learning_rate": 0.0001, "loss": 2.999, "loss/crossentropy": 2.093051493167877, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.2044597864151001, "loss/reg": 0.0, "step": 36210 }, { "epoch": 0.23828947368421052, "grad_norm": 2.0625, "grad_norm_var": 0.11516825358072917, "learning_rate": 0.0001, "loss": 2.9287, "loss/crossentropy": 2.1220389723777773, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.19910518899559976, "loss/reg": 0.0, "step": 36220 }, { "epoch": 0.23835526315789474, "grad_norm": 2.65625, "grad_norm_var": 0.2674763997395833, "learning_rate": 0.0001, "loss": 3.0577, "loss/crossentropy": 2.1114853382110597, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.2416158512234688, "loss/reg": 0.0, "step": 36230 }, { "epoch": 0.23842105263157895, "grad_norm": 2.671875, "grad_norm_var": 0.05162353515625, "learning_rate": 0.0001, "loss": 2.9816, "loss/crossentropy": 2.332682567834854, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.2057093556970358, "loss/reg": 0.0, "step": 36240 }, { "epoch": 0.23848684210526316, "grad_norm": 2.15625, "grad_norm_var": 0.04888916015625, "learning_rate": 0.0001, "loss": 2.9878, "loss/crossentropy": 2.26646374464035, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.22881892770528794, "loss/reg": 0.0, "step": 36250 }, { "epoch": 0.23855263157894738, "grad_norm": 2.109375, "grad_norm_var": 0.0685455322265625, "learning_rate": 0.0001, "loss": 3.041, "loss/crossentropy": 2.0819905757904054, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.2033102184534073, "loss/reg": 0.0, "step": 36260 }, { "epoch": 0.2386184210526316, "grad_norm": 3.453125, "grad_norm_var": 2.275194295247396, "learning_rate": 0.0001, "loss": 3.0941, "loss/crossentropy": 2.1767791748046874, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.24875595569610595, "loss/reg": 0.0, "step": 36270 }, { "epoch": 0.23868421052631578, "grad_norm": 2.28125, "grad_norm_var": 3.0041412353515624, "learning_rate": 0.0001, "loss": 3.0775, "loss/crossentropy": 2.389254295825958, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.20708534121513367, "loss/reg": 0.0, "step": 36280 }, { "epoch": 0.23875, "grad_norm": 2.3125, "grad_norm_var": 1.19429931640625, "learning_rate": 0.0001, "loss": 2.9765, "loss/crossentropy": 2.2551061868667603, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.23167352080345155, "loss/reg": 0.0, "step": 36290 }, { "epoch": 0.2388157894736842, "grad_norm": 2.578125, "grad_norm_var": 0.4574127197265625, "learning_rate": 0.0001, "loss": 3.0732, "loss/crossentropy": 2.3170044660568236, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.22510029524564742, "loss/reg": 0.0, "step": 36300 }, { "epoch": 0.23888157894736842, "grad_norm": 2.421875, "grad_norm_var": 0.6108904520670573, "learning_rate": 0.0001, "loss": 3.0454, "loss/crossentropy": 1.9875805854797364, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.21809106022119523, "loss/reg": 0.0, "step": 36310 }, { "epoch": 0.23894736842105263, "grad_norm": 2.40625, "grad_norm_var": 0.7914265950520833, "learning_rate": 0.0001, "loss": 2.9925, "loss/crossentropy": 2.3829528450965882, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2381367027759552, "loss/reg": 0.0, "step": 36320 }, { "epoch": 0.23901315789473684, "grad_norm": 2.234375, "grad_norm_var": 0.3254778544108073, "learning_rate": 0.0001, "loss": 3.0144, "loss/crossentropy": 2.196928286552429, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.24151963144540786, "loss/reg": 0.0, "step": 36330 }, { "epoch": 0.23907894736842106, "grad_norm": 2.375, "grad_norm_var": 0.0215240478515625, "learning_rate": 0.0001, "loss": 2.9815, "loss/crossentropy": 2.2826131939888, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.23180599957704545, "loss/reg": 0.0, "step": 36340 }, { "epoch": 0.23914473684210527, "grad_norm": 2.671875, "grad_norm_var": 0.7601847330729167, "learning_rate": 0.0001, "loss": 3.025, "loss/crossentropy": 2.4469813466072083, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.21003625690937042, "loss/reg": 0.0, "step": 36350 }, { "epoch": 0.23921052631578948, "grad_norm": 2.3125, "grad_norm_var": 0.04129231770833333, "learning_rate": 0.0001, "loss": 2.9786, "loss/crossentropy": 2.5087629079818727, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.22830777615308762, "loss/reg": 0.0, "step": 36360 }, { "epoch": 0.2392763157894737, "grad_norm": 2.40625, "grad_norm_var": 0.05718994140625, "learning_rate": 0.0001, "loss": 3.0514, "loss/crossentropy": 2.0158798813819887, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.22552752420306205, "loss/reg": 0.0, "step": 36370 }, { "epoch": 0.23934210526315788, "grad_norm": 2.25, "grad_norm_var": 0.056298828125, "learning_rate": 0.0001, "loss": 3.0251, "loss/crossentropy": 1.9761714577674865, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.23326522260904312, "loss/reg": 0.0, "step": 36380 }, { "epoch": 0.2394078947368421, "grad_norm": 2.640625, "grad_norm_var": 0.19085286458333334, "learning_rate": 0.0001, "loss": 3.0543, "loss/crossentropy": 2.1724894046783447, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.2123780742287636, "loss/reg": 0.0, "step": 36390 }, { "epoch": 0.2394736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.1413726806640625, "learning_rate": 0.0001, "loss": 2.9757, "loss/crossentropy": 2.1725098967552183, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.2148168534040451, "loss/reg": 0.0, "step": 36400 }, { "epoch": 0.23953947368421052, "grad_norm": 2.640625, "grad_norm_var": 4.847835032145182, "learning_rate": 0.0001, "loss": 3.046, "loss/crossentropy": 2.4329644799232484, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.26010534912347794, "loss/reg": 0.0, "step": 36410 }, { "epoch": 0.23960526315789474, "grad_norm": 2.453125, "grad_norm_var": 4.855410766601563, "learning_rate": 0.0001, "loss": 2.9563, "loss/crossentropy": 2.215455192327499, "loss/hidden": 2.909375, "loss/incoh": 0.0, "loss/logits": 0.23449745997786522, "loss/reg": 0.0, "step": 36420 }, { "epoch": 0.23967105263157895, "grad_norm": 2.4375, "grad_norm_var": 0.0796539306640625, "learning_rate": 0.0001, "loss": 3.015, "loss/crossentropy": 2.411003601551056, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.27160040885210035, "loss/reg": 0.0, "step": 36430 }, { "epoch": 0.23973684210526316, "grad_norm": 2.328125, "grad_norm_var": 0.0956207275390625, "learning_rate": 0.0001, "loss": 2.9926, "loss/crossentropy": 2.542527449131012, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.23901409804821014, "loss/reg": 0.0, "step": 36440 }, { "epoch": 0.23980263157894738, "grad_norm": 2.421875, "grad_norm_var": 0.1344146728515625, "learning_rate": 0.0001, "loss": 3.0325, "loss/crossentropy": 2.28466220498085, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.21214617267251015, "loss/reg": 0.0, "step": 36450 }, { "epoch": 0.2398684210526316, "grad_norm": 2.15625, "grad_norm_var": 0.07780659993489583, "learning_rate": 0.0001, "loss": 3.0155, "loss/crossentropy": 2.521458077430725, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.24949344843626023, "loss/reg": 0.0, "step": 36460 }, { "epoch": 0.23993421052631578, "grad_norm": 3.171875, "grad_norm_var": 0.10063374837239583, "learning_rate": 0.0001, "loss": 3.0598, "loss/crossentropy": 2.11941602230072, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.265853063762188, "loss/reg": 0.0, "step": 36470 }, { "epoch": 0.24, "grad_norm": 2.078125, "grad_norm_var": 0.15134175618489584, "learning_rate": 0.0001, "loss": 3.1036, "loss/crossentropy": 2.159947466850281, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.23642594143748283, "loss/reg": 0.0, "step": 36480 }, { "epoch": 0.2400657894736842, "grad_norm": 2.34375, "grad_norm_var": 0.1106109619140625, "learning_rate": 0.0001, "loss": 2.9888, "loss/crossentropy": 2.2564534187316894, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.24869659841060637, "loss/reg": 0.0, "step": 36490 }, { "epoch": 0.24013157894736842, "grad_norm": 2.546875, "grad_norm_var": 0.04983723958333333, "learning_rate": 0.0001, "loss": 3.107, "loss/crossentropy": 2.3479647517204283, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.2286713719367981, "loss/reg": 0.0, "step": 36500 }, { "epoch": 0.24019736842105263, "grad_norm": 2.5, "grad_norm_var": 0.07700093587239583, "learning_rate": 0.0001, "loss": 3.0664, "loss/crossentropy": 2.4374531388282774, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.23715483397245407, "loss/reg": 0.0, "step": 36510 }, { "epoch": 0.24026315789473685, "grad_norm": 3.171875, "grad_norm_var": 0.28349202473958335, "learning_rate": 0.0001, "loss": 3.0541, "loss/crossentropy": 1.9271621584892273, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.2382538564503193, "loss/reg": 0.0, "step": 36520 }, { "epoch": 0.24032894736842106, "grad_norm": 2.1875, "grad_norm_var": 0.26936747233072916, "learning_rate": 0.0001, "loss": 2.9974, "loss/crossentropy": 2.3039157390594482, "loss/hidden": 2.5375, "loss/incoh": 0.0, "loss/logits": 0.18885463178157808, "loss/reg": 0.0, "step": 36530 }, { "epoch": 0.24039473684210527, "grad_norm": 2.359375, "grad_norm_var": 0.04563700358072917, "learning_rate": 0.0001, "loss": 2.9944, "loss/crossentropy": 2.1725877285003663, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.22165239751338958, "loss/reg": 0.0, "step": 36540 }, { "epoch": 0.24046052631578949, "grad_norm": 2.46875, "grad_norm_var": 0.026154581705729166, "learning_rate": 0.0001, "loss": 3.011, "loss/crossentropy": 2.393988037109375, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.25498510152101517, "loss/reg": 0.0, "step": 36550 }, { "epoch": 0.24052631578947367, "grad_norm": 5.125, "grad_norm_var": 0.5747548421223958, "learning_rate": 0.0001, "loss": 3.0477, "loss/crossentropy": 2.3300102829933165, "loss/hidden": 2.921875, "loss/incoh": 0.0, "loss/logits": 0.2910366252064705, "loss/reg": 0.0, "step": 36560 }, { "epoch": 0.24059210526315788, "grad_norm": 2.390625, "grad_norm_var": 0.541943359375, "learning_rate": 0.0001, "loss": 3.0418, "loss/crossentropy": 2.2268447399139406, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.236025869846344, "loss/reg": 0.0, "step": 36570 }, { "epoch": 0.2406578947368421, "grad_norm": 2.46875, "grad_norm_var": 0.11101786295572917, "learning_rate": 0.0001, "loss": 3.0594, "loss/crossentropy": 2.4716415405273438, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2721579551696777, "loss/reg": 0.0, "step": 36580 }, { "epoch": 0.2407236842105263, "grad_norm": 2.609375, "grad_norm_var": 0.09099934895833334, "learning_rate": 0.0001, "loss": 2.9772, "loss/crossentropy": 2.2797286033630373, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.24081944078207015, "loss/reg": 0.0, "step": 36590 }, { "epoch": 0.24078947368421053, "grad_norm": 2.1875, "grad_norm_var": 0.04234110514322917, "learning_rate": 0.0001, "loss": 2.9358, "loss/crossentropy": 2.2048792719841, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2422194480895996, "loss/reg": 0.0, "step": 36600 }, { "epoch": 0.24085526315789474, "grad_norm": 2.34375, "grad_norm_var": 0.061766560872395834, "learning_rate": 0.0001, "loss": 3.0394, "loss/crossentropy": 2.4691983580589296, "loss/hidden": 2.903125, "loss/incoh": 0.0, "loss/logits": 0.2871833577752113, "loss/reg": 0.0, "step": 36610 }, { "epoch": 0.24092105263157895, "grad_norm": 2.1875, "grad_norm_var": 0.07330322265625, "learning_rate": 0.0001, "loss": 3.0204, "loss/crossentropy": 2.3721364736557007, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.24769402742385865, "loss/reg": 0.0, "step": 36620 }, { "epoch": 0.24098684210526317, "grad_norm": 2.40625, "grad_norm_var": 0.08896077473958333, "learning_rate": 0.0001, "loss": 3.0345, "loss/crossentropy": 2.6213492870330812, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.22394069284200668, "loss/reg": 0.0, "step": 36630 }, { "epoch": 0.24105263157894738, "grad_norm": 2.609375, "grad_norm_var": 0.0711822509765625, "learning_rate": 0.0001, "loss": 2.9613, "loss/crossentropy": 2.4529595017433166, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.2483013778924942, "loss/reg": 0.0, "step": 36640 }, { "epoch": 0.24111842105263157, "grad_norm": 2.34375, "grad_norm_var": 0.024247233072916666, "learning_rate": 0.0001, "loss": 3.0826, "loss/crossentropy": 2.5188237547874452, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.25313877910375593, "loss/reg": 0.0, "step": 36650 }, { "epoch": 0.24118421052631578, "grad_norm": 1.96875, "grad_norm_var": 0.41513264973958336, "learning_rate": 0.0001, "loss": 3.0485, "loss/crossentropy": 2.358896279335022, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.2380443513393402, "loss/reg": 0.0, "step": 36660 }, { "epoch": 0.24125, "grad_norm": 2088763392.0, "grad_norm_var": 4.570212428561056e+17, "learning_rate": 0.0001, "loss": 3.3122, "loss/crossentropy": 2.4535842657089235, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23562741428613662, "loss/reg": 0.0, "step": 36670 }, { "epoch": 0.2413157894736842, "grad_norm": 2.453125, "grad_norm_var": 4.5702124295693926e+17, "learning_rate": 0.0001, "loss": 3.0286, "loss/crossentropy": 2.1914322853088377, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.25221034586429597, "loss/reg": 0.0, "step": 36680 }, { "epoch": 0.24138157894736842, "grad_norm": 2.578125, "grad_norm_var": 0.031927235921223956, "learning_rate": 0.0001, "loss": 3.0104, "loss/crossentropy": 2.4763585209846495, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.2514804720878601, "loss/reg": 0.0, "step": 36690 }, { "epoch": 0.24144736842105263, "grad_norm": 3.1875, "grad_norm_var": 0.08718236287434895, "learning_rate": 0.0001, "loss": 3.159, "loss/crossentropy": 2.0223356008529665, "loss/hidden": 3.146875, "loss/incoh": 0.0, "loss/logits": 0.26726412028074265, "loss/reg": 0.0, "step": 36700 }, { "epoch": 0.24151315789473685, "grad_norm": 2.328125, "grad_norm_var": 0.056004842122395836, "learning_rate": 0.0001, "loss": 3.0129, "loss/crossentropy": 2.2822505116462706, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.21236415654420854, "loss/reg": 0.0, "step": 36710 }, { "epoch": 0.24157894736842106, "grad_norm": 2.1875, "grad_norm_var": 0.049605305989583334, "learning_rate": 0.0001, "loss": 2.9728, "loss/crossentropy": 2.152122360467911, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.22159827202558519, "loss/reg": 0.0, "step": 36720 }, { "epoch": 0.24164473684210527, "grad_norm": 2.390625, "grad_norm_var": 0.2835845947265625, "learning_rate": 0.0001, "loss": 3.1062, "loss/crossentropy": 2.415015733242035, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.2637294501066208, "loss/reg": 0.0, "step": 36730 }, { "epoch": 0.2417105263157895, "grad_norm": 2.359375, "grad_norm_var": 0.24265950520833332, "learning_rate": 0.0001, "loss": 3.0508, "loss/crossentropy": 2.143397808074951, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.26159815937280656, "loss/reg": 0.0, "step": 36740 }, { "epoch": 0.24177631578947367, "grad_norm": 2.875, "grad_norm_var": 0.0584381103515625, "learning_rate": 0.0001, "loss": 3.1012, "loss/crossentropy": 2.3698354959487915, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.26768290251493454, "loss/reg": 0.0, "step": 36750 }, { "epoch": 0.24184210526315789, "grad_norm": 1.9375, "grad_norm_var": 0.9022532145182292, "learning_rate": 0.0001, "loss": 2.9442, "loss/crossentropy": 2.1970738768577576, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2042816124856472, "loss/reg": 0.0, "step": 36760 }, { "epoch": 0.2419078947368421, "grad_norm": 2.15625, "grad_norm_var": 0.9153279622395833, "learning_rate": 0.0001, "loss": 3.0157, "loss/crossentropy": 2.345754420757294, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.23231206387281417, "loss/reg": 0.0, "step": 36770 }, { "epoch": 0.2419736842105263, "grad_norm": 2.25, "grad_norm_var": 0.08898111979166666, "learning_rate": 0.0001, "loss": 3.0409, "loss/crossentropy": 2.3259128451347353, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.23587240129709244, "loss/reg": 0.0, "step": 36780 }, { "epoch": 0.24203947368421053, "grad_norm": 2.5625, "grad_norm_var": 0.07978108723958334, "learning_rate": 0.0001, "loss": 3.0165, "loss/crossentropy": 2.5162180185317995, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.2471260607242584, "loss/reg": 0.0, "step": 36790 }, { "epoch": 0.24210526315789474, "grad_norm": 2.234375, "grad_norm_var": 0.03634440104166667, "learning_rate": 0.0001, "loss": 3.0087, "loss/crossentropy": 2.3737236499786376, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.2186498686671257, "loss/reg": 0.0, "step": 36800 }, { "epoch": 0.24217105263157895, "grad_norm": 2.09375, "grad_norm_var": 0.049605305989583334, "learning_rate": 0.0001, "loss": 3.0755, "loss/crossentropy": 2.353866970539093, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.22997004687786102, "loss/reg": 0.0, "step": 36810 }, { "epoch": 0.24223684210526317, "grad_norm": 2.234375, "grad_norm_var": 0.11220296223958333, "learning_rate": 0.0001, "loss": 3.0278, "loss/crossentropy": 2.421842908859253, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.21242005676031112, "loss/reg": 0.0, "step": 36820 }, { "epoch": 0.24230263157894738, "grad_norm": 2.203125, "grad_norm_var": 0.16858622233072917, "learning_rate": 0.0001, "loss": 3.1032, "loss/crossentropy": 2.3163668155670165, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.24295762777328492, "loss/reg": 0.0, "step": 36830 }, { "epoch": 0.24236842105263157, "grad_norm": 2.984375, "grad_norm_var": 0.0773150126139323, "learning_rate": 0.0001, "loss": 2.9972, "loss/crossentropy": 2.162323606014252, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.2663910940289497, "loss/reg": 0.0, "step": 36840 }, { "epoch": 0.24243421052631578, "grad_norm": 2.171875, "grad_norm_var": 0.21162923177083334, "learning_rate": 0.0001, "loss": 2.962, "loss/crossentropy": 2.159565594792366, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.19483270347118378, "loss/reg": 0.0, "step": 36850 }, { "epoch": 0.2425, "grad_norm": 2.28125, "grad_norm_var": 0.3277259826660156, "learning_rate": 0.0001, "loss": 3.0165, "loss/crossentropy": 2.5494640946388243, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.21899326890707016, "loss/reg": 0.0, "step": 36860 }, { "epoch": 0.2425657894736842, "grad_norm": 2.296875, "grad_norm_var": 0.02713190714518229, "learning_rate": 0.0001, "loss": 2.9519, "loss/crossentropy": 2.3883594393730165, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.21375252604484557, "loss/reg": 0.0, "step": 36870 }, { "epoch": 0.24263157894736842, "grad_norm": 1.9296875, "grad_norm_var": 0.03628107706705729, "learning_rate": 0.0001, "loss": 2.9473, "loss/crossentropy": 2.1211004137992857, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2303142726421356, "loss/reg": 0.0, "step": 36880 }, { "epoch": 0.24269736842105263, "grad_norm": 2.6875, "grad_norm_var": 0.053929646809895836, "learning_rate": 0.0001, "loss": 2.9946, "loss/crossentropy": 2.2483023762702943, "loss/hidden": 2.965625, "loss/incoh": 0.0, "loss/logits": 0.2498287908732891, "loss/reg": 0.0, "step": 36890 }, { "epoch": 0.24276315789473685, "grad_norm": 2.390625, "grad_norm_var": 0.026569620768229166, "learning_rate": 0.0001, "loss": 3.0207, "loss/crossentropy": 2.4569414138793944, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.22985844165086747, "loss/reg": 0.0, "step": 36900 }, { "epoch": 0.24282894736842106, "grad_norm": 2.953125, "grad_norm_var": 0.06067708333333333, "learning_rate": 0.0001, "loss": 3.0104, "loss/crossentropy": 2.438031816482544, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.207527032494545, "loss/reg": 0.0, "step": 36910 }, { "epoch": 0.24289473684210527, "grad_norm": 2.28125, "grad_norm_var": 0.08677469889322917, "learning_rate": 0.0001, "loss": 2.9992, "loss/crossentropy": 2.4970327615737915, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.24948519468307495, "loss/reg": 0.0, "step": 36920 }, { "epoch": 0.24296052631578946, "grad_norm": 2.5, "grad_norm_var": 0.07526041666666666, "learning_rate": 0.0001, "loss": 3.0188, "loss/crossentropy": 2.4824806571006777, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.26679628640413283, "loss/reg": 0.0, "step": 36930 }, { "epoch": 0.24302631578947367, "grad_norm": 3.125, "grad_norm_var": 0.15319595336914063, "learning_rate": 0.0001, "loss": 3.0338, "loss/crossentropy": 2.588907778263092, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2500623852014542, "loss/reg": 0.0, "step": 36940 }, { "epoch": 0.2430921052631579, "grad_norm": 2.5, "grad_norm_var": 0.105810546875, "learning_rate": 0.0001, "loss": 3.0026, "loss/crossentropy": 2.233619010448456, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.23702021837234497, "loss/reg": 0.0, "step": 36950 }, { "epoch": 0.2431578947368421, "grad_norm": 2.046875, "grad_norm_var": 0.032210286458333334, "learning_rate": 0.0001, "loss": 2.9011, "loss/crossentropy": 2.494797945022583, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.2184271454811096, "loss/reg": 0.0, "step": 36960 }, { "epoch": 0.24322368421052631, "grad_norm": 2.3125, "grad_norm_var": 0.026590983072916668, "learning_rate": 0.0001, "loss": 3.048, "loss/crossentropy": 2.323407733440399, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.2522672712802887, "loss/reg": 0.0, "step": 36970 }, { "epoch": 0.24328947368421053, "grad_norm": 2.84375, "grad_norm_var": 0.04624735514322917, "learning_rate": 0.0001, "loss": 3.0306, "loss/crossentropy": 2.248076152801514, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.22199858874082565, "loss/reg": 0.0, "step": 36980 }, { "epoch": 0.24335526315789474, "grad_norm": 2.21875, "grad_norm_var": 0.38259989420572915, "learning_rate": 0.0001, "loss": 2.9916, "loss/crossentropy": 2.251713329553604, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.21513912975788116, "loss/reg": 0.0, "step": 36990 }, { "epoch": 0.24342105263157895, "grad_norm": 2.390625, "grad_norm_var": 0.384716796875, "learning_rate": 0.0001, "loss": 3.0287, "loss/crossentropy": 2.0817331850528715, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.21513576656579972, "loss/reg": 0.0, "step": 37000 }, { "epoch": 0.24348684210526317, "grad_norm": 2.546875, "grad_norm_var": 0.07632420857747396, "learning_rate": 0.0001, "loss": 3.0023, "loss/crossentropy": 2.351958858966827, "loss/hidden": 3.0078125, "loss/incoh": 0.0, "loss/logits": 0.27439084053039553, "loss/reg": 0.0, "step": 37010 }, { "epoch": 0.24355263157894738, "grad_norm": 2.0625, "grad_norm_var": 0.12300186157226563, "learning_rate": 0.0001, "loss": 3.0848, "loss/crossentropy": 2.2964416265487673, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.28409299105405805, "loss/reg": 0.0, "step": 37020 }, { "epoch": 0.24361842105263157, "grad_norm": 2.6875, "grad_norm_var": 0.18337173461914064, "learning_rate": 0.0001, "loss": 2.9231, "loss/crossentropy": 2.2850769579410555, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.20450889468193054, "loss/reg": 0.0, "step": 37030 }, { "epoch": 0.24368421052631578, "grad_norm": 2.3125, "grad_norm_var": 0.18819071451822916, "learning_rate": 0.0001, "loss": 3.0757, "loss/crossentropy": 2.157426190376282, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.23339182287454605, "loss/reg": 0.0, "step": 37040 }, { "epoch": 0.24375, "grad_norm": 2.796875, "grad_norm_var": 0.0508209228515625, "learning_rate": 0.0001, "loss": 3.0667, "loss/crossentropy": 2.3142905950546266, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.27201273292303085, "loss/reg": 0.0, "step": 37050 }, { "epoch": 0.2438157894736842, "grad_norm": 2.078125, "grad_norm_var": 0.054230753580729166, "learning_rate": 0.0001, "loss": 2.9379, "loss/crossentropy": 2.182029736042023, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.2124377816915512, "loss/reg": 0.0, "step": 37060 }, { "epoch": 0.24388157894736842, "grad_norm": 1.96875, "grad_norm_var": 0.05328776041666667, "learning_rate": 0.0001, "loss": 2.9728, "loss/crossentropy": 2.47544287443161, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.20328531116247178, "loss/reg": 0.0, "step": 37070 }, { "epoch": 0.24394736842105263, "grad_norm": 2.28125, "grad_norm_var": 0.11983617146809895, "learning_rate": 0.0001, "loss": 3.038, "loss/crossentropy": 2.315932643413544, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.2507309168577194, "loss/reg": 0.0, "step": 37080 }, { "epoch": 0.24401315789473685, "grad_norm": 3.125, "grad_norm_var": 0.5823707580566406, "learning_rate": 0.0001, "loss": 3.0428, "loss/crossentropy": 2.0966883838176726, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.24450836032629014, "loss/reg": 0.0, "step": 37090 }, { "epoch": 0.24407894736842106, "grad_norm": 2.3125, "grad_norm_var": 0.562109375, "learning_rate": 0.0001, "loss": 3.0281, "loss/crossentropy": 2.111643207073212, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.22489524409174919, "loss/reg": 0.0, "step": 37100 }, { "epoch": 0.24414473684210528, "grad_norm": 2.65625, "grad_norm_var": 0.048238118489583336, "learning_rate": 0.0001, "loss": 2.9667, "loss/crossentropy": 2.2703915894031526, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.20935535281896592, "loss/reg": 0.0, "step": 37110 }, { "epoch": 0.24421052631578946, "grad_norm": 2.234375, "grad_norm_var": 0.12493082682291666, "learning_rate": 0.0001, "loss": 3.0306, "loss/crossentropy": 2.4712467312812807, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.21050616949796677, "loss/reg": 0.0, "step": 37120 }, { "epoch": 0.24427631578947367, "grad_norm": 2.078125, "grad_norm_var": 0.22398173014322917, "learning_rate": 0.0001, "loss": 2.9499, "loss/crossentropy": 2.1810320615768433, "loss/hidden": 2.6078125, "loss/incoh": 0.0, "loss/logits": 0.1909588247537613, "loss/reg": 0.0, "step": 37130 }, { "epoch": 0.2443421052631579, "grad_norm": 2.1875, "grad_norm_var": 0.18337300618489583, "learning_rate": 0.0001, "loss": 3.0502, "loss/crossentropy": 2.259117543697357, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.2917395427823067, "loss/reg": 0.0, "step": 37140 }, { "epoch": 0.2444078947368421, "grad_norm": 2.328125, "grad_norm_var": 0.04478251139322917, "learning_rate": 0.0001, "loss": 3.003, "loss/crossentropy": 2.3330495953559875, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.25376534312963483, "loss/reg": 0.0, "step": 37150 }, { "epoch": 0.24447368421052632, "grad_norm": 2.125, "grad_norm_var": 0.025809733072916667, "learning_rate": 0.0001, "loss": 2.9673, "loss/crossentropy": 2.2595450043678285, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.24186031073331832, "loss/reg": 0.0, "step": 37160 }, { "epoch": 0.24453947368421053, "grad_norm": 2.3125, "grad_norm_var": 0.040257771809895836, "learning_rate": 0.0001, "loss": 3.0137, "loss/crossentropy": 2.3244964241981507, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.22835270911455155, "loss/reg": 0.0, "step": 37170 }, { "epoch": 0.24460526315789474, "grad_norm": 2.328125, "grad_norm_var": 0.18911844889322918, "learning_rate": 0.0001, "loss": 3.095, "loss/crossentropy": 2.1679759979248048, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.22593683898448944, "loss/reg": 0.0, "step": 37180 }, { "epoch": 0.24467105263157896, "grad_norm": 2.390625, "grad_norm_var": 0.0819488525390625, "learning_rate": 0.0001, "loss": 3.0209, "loss/crossentropy": 2.3522397756576536, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.22514844089746475, "loss/reg": 0.0, "step": 37190 }, { "epoch": 0.24473684210526317, "grad_norm": 3.171875, "grad_norm_var": 0.14850972493489584, "learning_rate": 0.0001, "loss": 3.0028, "loss/crossentropy": 2.1798877120018005, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.26098706275224687, "loss/reg": 0.0, "step": 37200 }, { "epoch": 0.24480263157894736, "grad_norm": 3.4375, "grad_norm_var": 0.16476949055989584, "learning_rate": 0.0001, "loss": 3.0878, "loss/crossentropy": 2.3389397978782656, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.23207588642835617, "loss/reg": 0.0, "step": 37210 }, { "epoch": 0.24486842105263157, "grad_norm": 1.96875, "grad_norm_var": 0.13612874348958334, "learning_rate": 0.0001, "loss": 3.0629, "loss/crossentropy": 2.29145188331604, "loss/hidden": 2.9859375, "loss/incoh": 0.0, "loss/logits": 0.23295368999242783, "loss/reg": 0.0, "step": 37220 }, { "epoch": 0.24493421052631578, "grad_norm": 2.578125, "grad_norm_var": 0.0704742431640625, "learning_rate": 0.0001, "loss": 3.0207, "loss/crossentropy": 2.263294315338135, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.21528264433145522, "loss/reg": 0.0, "step": 37230 }, { "epoch": 0.245, "grad_norm": 2.765625, "grad_norm_var": 0.04729410807291667, "learning_rate": 0.0001, "loss": 2.9616, "loss/crossentropy": 2.4668697357177733, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2833425417542458, "loss/reg": 0.0, "step": 37240 }, { "epoch": 0.2450657894736842, "grad_norm": 2.390625, "grad_norm_var": 0.1173736572265625, "learning_rate": 0.0001, "loss": 3.0141, "loss/crossentropy": 2.310600745677948, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.2221137210726738, "loss/reg": 0.0, "step": 37250 }, { "epoch": 0.24513157894736842, "grad_norm": 3.359375, "grad_norm_var": 0.15969136555989583, "learning_rate": 0.0001, "loss": 3.0175, "loss/crossentropy": 2.113830578327179, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.2413550451397896, "loss/reg": 0.0, "step": 37260 }, { "epoch": 0.24519736842105264, "grad_norm": 2.625, "grad_norm_var": 0.12274983723958334, "learning_rate": 0.0001, "loss": 3.0058, "loss/crossentropy": 1.9976770758628846, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.19502876847982406, "loss/reg": 0.0, "step": 37270 }, { "epoch": 0.24526315789473685, "grad_norm": 2.40625, "grad_norm_var": 0.11809488932291666, "learning_rate": 0.0001, "loss": 2.9752, "loss/crossentropy": 2.1309264838695525, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.20758389085531234, "loss/reg": 0.0, "step": 37280 }, { "epoch": 0.24532894736842106, "grad_norm": 2.21875, "grad_norm_var": 0.12764383951822916, "learning_rate": 0.0001, "loss": 3.044, "loss/crossentropy": 2.142688637971878, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.2955352425575256, "loss/reg": 0.0, "step": 37290 }, { "epoch": 0.24539473684210528, "grad_norm": 2.640625, "grad_norm_var": 0.0466949462890625, "learning_rate": 0.0001, "loss": 2.9294, "loss/crossentropy": 2.255223333835602, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.22907140627503395, "loss/reg": 0.0, "step": 37300 }, { "epoch": 0.24546052631578946, "grad_norm": 2.125, "grad_norm_var": 0.085693359375, "learning_rate": 0.0001, "loss": 2.9874, "loss/crossentropy": 2.121692883968353, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.195634426176548, "loss/reg": 0.0, "step": 37310 }, { "epoch": 0.24552631578947368, "grad_norm": 2.703125, "grad_norm_var": 0.12754694620768228, "learning_rate": 0.0001, "loss": 3.0103, "loss/crossentropy": 2.2369035363197325, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.2212720662355423, "loss/reg": 0.0, "step": 37320 }, { "epoch": 0.2455921052631579, "grad_norm": 2.953125, "grad_norm_var": 0.14957046508789062, "learning_rate": 0.0001, "loss": 2.9211, "loss/crossentropy": 1.6413456916809082, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.18164349719882011, "loss/reg": 0.0, "step": 37330 }, { "epoch": 0.2456578947368421, "grad_norm": 2.484375, "grad_norm_var": 0.17405497233072917, "learning_rate": 0.0001, "loss": 3.0623, "loss/crossentropy": 2.128761112689972, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.20798836946487426, "loss/reg": 0.0, "step": 37340 }, { "epoch": 0.24572368421052632, "grad_norm": 2.46875, "grad_norm_var": 0.05827534993489583, "learning_rate": 0.0001, "loss": 2.9564, "loss/crossentropy": 2.271031451225281, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.21863894239068032, "loss/reg": 0.0, "step": 37350 }, { "epoch": 0.24578947368421053, "grad_norm": 2.265625, "grad_norm_var": 0.10803629557291666, "learning_rate": 0.0001, "loss": 3.0487, "loss/crossentropy": 2.5324214935302733, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2654300585389137, "loss/reg": 0.0, "step": 37360 }, { "epoch": 0.24585526315789474, "grad_norm": 2.046875, "grad_norm_var": 0.0805816650390625, "learning_rate": 0.0001, "loss": 2.9872, "loss/crossentropy": 2.476090502738953, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.22690275460481643, "loss/reg": 0.0, "step": 37370 }, { "epoch": 0.24592105263157896, "grad_norm": 2.109375, "grad_norm_var": 0.0459136962890625, "learning_rate": 0.0001, "loss": 2.9355, "loss/crossentropy": 2.158538544178009, "loss/hidden": 3.05625, "loss/incoh": 0.0, "loss/logits": 0.2366640105843544, "loss/reg": 0.0, "step": 37380 }, { "epoch": 0.24598684210526317, "grad_norm": 1.8828125, "grad_norm_var": 0.048860422770182294, "learning_rate": 0.0001, "loss": 3.0021, "loss/crossentropy": 2.2990810751914976, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.20744879692792892, "loss/reg": 0.0, "step": 37390 }, { "epoch": 0.24605263157894736, "grad_norm": 2.40625, "grad_norm_var": 0.052247873942057294, "learning_rate": 0.0001, "loss": 2.9696, "loss/crossentropy": 2.2528524160385133, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.22408121079206467, "loss/reg": 0.0, "step": 37400 }, { "epoch": 0.24611842105263157, "grad_norm": 2.140625, "grad_norm_var": 0.046122233072916664, "learning_rate": 0.0001, "loss": 2.991, "loss/crossentropy": 2.4535847425460817, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.27996278256177903, "loss/reg": 0.0, "step": 37410 }, { "epoch": 0.24618421052631578, "grad_norm": 2.03125, "grad_norm_var": 0.50777587890625, "learning_rate": 0.0001, "loss": 2.9405, "loss/crossentropy": 2.2263678312301636, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.21364209055900574, "loss/reg": 0.0, "step": 37420 }, { "epoch": 0.24625, "grad_norm": 2.03125, "grad_norm_var": 0.0452301025390625, "learning_rate": 0.0001, "loss": 2.966, "loss/crossentropy": 2.3186843156814576, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.251382140815258, "loss/reg": 0.0, "step": 37430 }, { "epoch": 0.2463157894736842, "grad_norm": 2.203125, "grad_norm_var": 0.035674794514973955, "learning_rate": 0.0001, "loss": 2.9334, "loss/crossentropy": 2.3233260989189146, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.26353937536478045, "loss/reg": 0.0, "step": 37440 }, { "epoch": 0.24638157894736842, "grad_norm": 2.03125, "grad_norm_var": 0.09699071248372396, "learning_rate": 0.0001, "loss": 3.0099, "loss/crossentropy": 2.1750458002090456, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.22462447360157967, "loss/reg": 0.0, "step": 37450 }, { "epoch": 0.24644736842105264, "grad_norm": 2.390625, "grad_norm_var": 0.09807942708333334, "learning_rate": 0.0001, "loss": 2.9668, "loss/crossentropy": 2.59984233379364, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.25129624158144, "loss/reg": 0.0, "step": 37460 }, { "epoch": 0.24651315789473685, "grad_norm": 2.109375, "grad_norm_var": 0.04807840983072917, "learning_rate": 0.0001, "loss": 2.9464, "loss/crossentropy": 2.4232494115829466, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.22763155549764633, "loss/reg": 0.0, "step": 37470 }, { "epoch": 0.24657894736842106, "grad_norm": 2.421875, "grad_norm_var": 0.2599283854166667, "learning_rate": 0.0001, "loss": 3.0313, "loss/crossentropy": 2.314221677184105, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.19144997373223305, "loss/reg": 0.0, "step": 37480 }, { "epoch": 0.24664473684210525, "grad_norm": 2.484375, "grad_norm_var": 0.3412068684895833, "learning_rate": 0.0001, "loss": 2.98, "loss/crossentropy": 2.4459351778030394, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24818085730075837, "loss/reg": 0.0, "step": 37490 }, { "epoch": 0.24671052631578946, "grad_norm": 2.234375, "grad_norm_var": 0.31691080729166665, "learning_rate": 0.0001, "loss": 3.0414, "loss/crossentropy": 2.1927252769470216, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.2274562269449234, "loss/reg": 0.0, "step": 37500 }, { "epoch": 0.24677631578947368, "grad_norm": 2.3125, "grad_norm_var": 0.2977935791015625, "learning_rate": 0.0001, "loss": 3.0205, "loss/crossentropy": 2.0236656427383424, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.23172616437077523, "loss/reg": 0.0, "step": 37510 }, { "epoch": 0.2468421052631579, "grad_norm": 2.171875, "grad_norm_var": 0.050593058268229164, "learning_rate": 0.0001, "loss": 2.9751, "loss/crossentropy": 2.0957834839820864, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.23045892268419266, "loss/reg": 0.0, "step": 37520 }, { "epoch": 0.2469078947368421, "grad_norm": 2.140625, "grad_norm_var": 0.09323628743489583, "learning_rate": 0.0001, "loss": 2.9756, "loss/crossentropy": 2.189413595199585, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.22707376182079314, "loss/reg": 0.0, "step": 37530 }, { "epoch": 0.24697368421052632, "grad_norm": 2.0, "grad_norm_var": 0.056183878580729166, "learning_rate": 0.0001, "loss": 2.9589, "loss/crossentropy": 2.733688974380493, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.2146020546555519, "loss/reg": 0.0, "step": 37540 }, { "epoch": 0.24703947368421053, "grad_norm": 2.390625, "grad_norm_var": 0.04794921875, "learning_rate": 0.0001, "loss": 3.0378, "loss/crossentropy": 2.206034767627716, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.24584971666336058, "loss/reg": 0.0, "step": 37550 }, { "epoch": 0.24710526315789474, "grad_norm": 2.25, "grad_norm_var": 0.058288319905598955, "learning_rate": 0.0001, "loss": 2.9179, "loss/crossentropy": 1.987370991706848, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.194708900898695, "loss/reg": 0.0, "step": 37560 }, { "epoch": 0.24717105263157896, "grad_norm": 2.21875, "grad_norm_var": 0.03406956990559896, "learning_rate": 0.0001, "loss": 2.9143, "loss/crossentropy": 2.281442165374756, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.19732007533311843, "loss/reg": 0.0, "step": 37570 }, { "epoch": 0.24723684210526317, "grad_norm": 2.0625, "grad_norm_var": 0.0267730712890625, "learning_rate": 0.0001, "loss": 2.9615, "loss/crossentropy": 2.3091693341732027, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.22810056209564208, "loss/reg": 0.0, "step": 37580 }, { "epoch": 0.24730263157894736, "grad_norm": 2.578125, "grad_norm_var": 0.0711334228515625, "learning_rate": 0.0001, "loss": 3.0095, "loss/crossentropy": 2.276491713523865, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.2274973288178444, "loss/reg": 0.0, "step": 37590 }, { "epoch": 0.24736842105263157, "grad_norm": 2.4375, "grad_norm_var": 0.052994791666666666, "learning_rate": 0.0001, "loss": 3.0237, "loss/crossentropy": 2.096492087841034, "loss/hidden": 2.9484375, "loss/incoh": 0.0, "loss/logits": 0.22436774969100953, "loss/reg": 0.0, "step": 37600 }, { "epoch": 0.24743421052631578, "grad_norm": 2.21875, "grad_norm_var": 0.023664347330729165, "learning_rate": 0.0001, "loss": 2.9825, "loss/crossentropy": 2.1003798633813857, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.21021526902914048, "loss/reg": 0.0, "step": 37610 }, { "epoch": 0.2475, "grad_norm": 2.21875, "grad_norm_var": 0.12235921223958333, "learning_rate": 0.0001, "loss": 2.9975, "loss/crossentropy": 2.338260293006897, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.2373722493648529, "loss/reg": 0.0, "step": 37620 }, { "epoch": 0.2475657894736842, "grad_norm": 2.75, "grad_norm_var": 0.1563738505045573, "learning_rate": 0.0001, "loss": 2.9835, "loss/crossentropy": 2.4219950675964355, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.210829646140337, "loss/reg": 0.0, "step": 37630 }, { "epoch": 0.24763157894736842, "grad_norm": 2.859375, "grad_norm_var": 3.647915690527556e+17, "learning_rate": 0.0001, "loss": 3.1081, "loss/crossentropy": 2.1713775038719176, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.20183515548706055, "loss/reg": 0.0, "step": 37640 }, { "epoch": 0.24769736842105264, "grad_norm": 2.328125, "grad_norm_var": 3.6479156911331085e+17, "learning_rate": 0.0001, "loss": 2.9457, "loss/crossentropy": 2.509985637664795, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.23514220416545867, "loss/reg": 0.0, "step": 37650 }, { "epoch": 0.24776315789473685, "grad_norm": 4.125, "grad_norm_var": 0.30654703776041664, "learning_rate": 0.0001, "loss": 2.934, "loss/crossentropy": 2.2876363396644592, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.20297650545835494, "loss/reg": 0.0, "step": 37660 }, { "epoch": 0.24782894736842107, "grad_norm": 2.328125, "grad_norm_var": 0.27844416300455727, "learning_rate": 0.0001, "loss": 2.9907, "loss/crossentropy": 2.4115204930305483, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.2483261451125145, "loss/reg": 0.0, "step": 37670 }, { "epoch": 0.24789473684210525, "grad_norm": 2.375, "grad_norm_var": 0.05608317057291667, "learning_rate": 0.0001, "loss": 2.9908, "loss/crossentropy": 2.3460915803909304, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.24570741802453994, "loss/reg": 0.0, "step": 37680 }, { "epoch": 0.24796052631578946, "grad_norm": 2.21875, "grad_norm_var": 0.1174468994140625, "learning_rate": 0.0001, "loss": 2.9498, "loss/crossentropy": 2.2238924860954286, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.21785593777894974, "loss/reg": 0.0, "step": 37690 }, { "epoch": 0.24802631578947368, "grad_norm": 2.265625, "grad_norm_var": 0.18594462076822918, "learning_rate": 0.0001, "loss": 2.9304, "loss/crossentropy": 2.2358608484268188, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.20752606838941573, "loss/reg": 0.0, "step": 37700 }, { "epoch": 0.2480921052631579, "grad_norm": 2.015625, "grad_norm_var": 6.753564453125, "learning_rate": 0.0001, "loss": 3.002, "loss/crossentropy": 2.481326103210449, "loss/hidden": 2.96875, "loss/incoh": 0.0, "loss/logits": 0.34728084355592725, "loss/reg": 0.0, "step": 37710 }, { "epoch": 0.2481578947368421, "grad_norm": 2.34375, "grad_norm_var": 6.80933837890625, "learning_rate": 0.0001, "loss": 2.9933, "loss/crossentropy": 2.11043701171875, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.231504824757576, "loss/reg": 0.0, "step": 37720 }, { "epoch": 0.24822368421052632, "grad_norm": 2.171875, "grad_norm_var": 0.0999176025390625, "learning_rate": 0.0001, "loss": 2.998, "loss/crossentropy": 2.3754311203956604, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.2291927695274353, "loss/reg": 0.0, "step": 37730 }, { "epoch": 0.24828947368421053, "grad_norm": 2.390625, "grad_norm_var": 0.044677734375, "learning_rate": 0.0001, "loss": 2.9851, "loss/crossentropy": 2.2369036078453064, "loss/hidden": 2.5625, "loss/incoh": 0.0, "loss/logits": 0.19738300889730453, "loss/reg": 0.0, "step": 37740 }, { "epoch": 0.24835526315789475, "grad_norm": 2.6875, "grad_norm_var": 0.265234375, "learning_rate": 0.0001, "loss": 3.0716, "loss/crossentropy": 2.236533558368683, "loss/hidden": 3.1171875, "loss/incoh": 0.0, "loss/logits": 0.2648838981986046, "loss/reg": 0.0, "step": 37750 }, { "epoch": 0.24842105263157896, "grad_norm": 2.125, "grad_norm_var": 0.28010660807291665, "learning_rate": 0.0001, "loss": 3.0245, "loss/crossentropy": 2.3721311211586, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.27348231226205827, "loss/reg": 0.0, "step": 37760 }, { "epoch": 0.24848684210526314, "grad_norm": 2.265625, "grad_norm_var": 0.13202718098958333, "learning_rate": 0.0001, "loss": 3.0549, "loss/crossentropy": 2.2753346085548403, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.2692110911011696, "loss/reg": 0.0, "step": 37770 }, { "epoch": 0.24855263157894736, "grad_norm": 2.359375, "grad_norm_var": 0.10732014973958333, "learning_rate": 0.0001, "loss": 3.022, "loss/crossentropy": 2.2350252270698547, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.19948195964097976, "loss/reg": 0.0, "step": 37780 }, { "epoch": 0.24861842105263157, "grad_norm": 3.375, "grad_norm_var": 0.13839518229166667, "learning_rate": 0.0001, "loss": 2.9917, "loss/crossentropy": 2.175236439704895, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.21911766976118088, "loss/reg": 0.0, "step": 37790 }, { "epoch": 0.24868421052631579, "grad_norm": 2.046875, "grad_norm_var": 0.10273030598958334, "learning_rate": 0.0001, "loss": 2.9491, "loss/crossentropy": 2.3891021251678466, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2441677287220955, "loss/reg": 0.0, "step": 37800 }, { "epoch": 0.24875, "grad_norm": 2.296875, "grad_norm_var": 0.11543782552083333, "learning_rate": 0.0001, "loss": 2.9554, "loss/crossentropy": 2.1731114864349363, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.24577540904283524, "loss/reg": 0.0, "step": 37810 }, { "epoch": 0.2488157894736842, "grad_norm": 2.46875, "grad_norm_var": 0.13479588826497396, "learning_rate": 0.0001, "loss": 3.0308, "loss/crossentropy": 2.459308052062988, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.25517907589673994, "loss/reg": 0.0, "step": 37820 }, { "epoch": 0.24888157894736843, "grad_norm": 2.828125, "grad_norm_var": 0.06494140625, "learning_rate": 0.0001, "loss": 3.0135, "loss/crossentropy": 2.296464502811432, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.21491459012031555, "loss/reg": 0.0, "step": 37830 }, { "epoch": 0.24894736842105264, "grad_norm": 2.21875, "grad_norm_var": 0.64693603515625, "learning_rate": 0.0001, "loss": 3.0707, "loss/crossentropy": 2.414829707145691, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.2108585089445114, "loss/reg": 0.0, "step": 37840 }, { "epoch": 0.24901315789473685, "grad_norm": 2.25, "grad_norm_var": 0.5141021728515625, "learning_rate": 0.0001, "loss": 2.9619, "loss/crossentropy": 1.9343510389328002, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.22914507612586021, "loss/reg": 0.0, "step": 37850 }, { "epoch": 0.24907894736842107, "grad_norm": 2.53125, "grad_norm_var": 0.1226226806640625, "learning_rate": 0.0001, "loss": 2.967, "loss/crossentropy": 2.5030432462692263, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.23077891543507575, "loss/reg": 0.0, "step": 37860 }, { "epoch": 0.24914473684210525, "grad_norm": 2.3125, "grad_norm_var": 0.07437235514322917, "learning_rate": 0.0001, "loss": 3.0293, "loss/crossentropy": 2.3556608080863954, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.2477458581328392, "loss/reg": 0.0, "step": 37870 }, { "epoch": 0.24921052631578947, "grad_norm": 2.3125, "grad_norm_var": 0.05681050618489583, "learning_rate": 0.0001, "loss": 2.9916, "loss/crossentropy": 2.2093961358070375, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.19773360043764115, "loss/reg": 0.0, "step": 37880 }, { "epoch": 0.24927631578947368, "grad_norm": 2.28125, "grad_norm_var": 0.032624308268229166, "learning_rate": 0.0001, "loss": 2.9833, "loss/crossentropy": 2.070064514875412, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.20810991451144217, "loss/reg": 0.0, "step": 37890 }, { "epoch": 0.2493421052631579, "grad_norm": 2.4375, "grad_norm_var": 0.3275227864583333, "learning_rate": 0.0001, "loss": 3.087, "loss/crossentropy": 2.4348979711532595, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.1990129753947258, "loss/reg": 0.0, "step": 37900 }, { "epoch": 0.2494078947368421, "grad_norm": 2.109375, "grad_norm_var": 0.1884661356608073, "learning_rate": 0.0001, "loss": 2.9224, "loss/crossentropy": 2.313380515575409, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.20934066772460938, "loss/reg": 0.0, "step": 37910 }, { "epoch": 0.24947368421052632, "grad_norm": 2.109375, "grad_norm_var": 0.2013567606608073, "learning_rate": 0.0001, "loss": 2.9925, "loss/crossentropy": 2.2767374873161317, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.21055591180920602, "loss/reg": 0.0, "step": 37920 }, { "epoch": 0.24953947368421053, "grad_norm": 2.921875, "grad_norm_var": 0.29850972493489586, "learning_rate": 0.0001, "loss": 2.9993, "loss/crossentropy": 2.361280381679535, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.2328987866640091, "loss/reg": 0.0, "step": 37930 }, { "epoch": 0.24960526315789475, "grad_norm": 2.59375, "grad_norm_var": 0.2740234375, "learning_rate": 0.0001, "loss": 2.9261, "loss/crossentropy": 2.1607108235359194, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.21131338626146318, "loss/reg": 0.0, "step": 37940 }, { "epoch": 0.24967105263157896, "grad_norm": 2.578125, "grad_norm_var": 0.15468343098958334, "learning_rate": 0.0001, "loss": 2.9839, "loss/crossentropy": 2.2607654333114624, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.18967671394348146, "loss/reg": 0.0, "step": 37950 }, { "epoch": 0.24973684210526315, "grad_norm": 2.578125, "grad_norm_var": 0.1874956766764323, "learning_rate": 0.0001, "loss": 3.0293, "loss/crossentropy": 2.483866810798645, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.22582052797079086, "loss/reg": 0.0, "step": 37960 }, { "epoch": 0.24980263157894736, "grad_norm": 2.40625, "grad_norm_var": 0.21528294881184895, "learning_rate": 0.0001, "loss": 3.017, "loss/crossentropy": 2.568509268760681, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.25563120394945144, "loss/reg": 0.0, "step": 37970 }, { "epoch": 0.24986842105263157, "grad_norm": 2.046875, "grad_norm_var": 0.21802469889322917, "learning_rate": 0.0001, "loss": 3.002, "loss/crossentropy": 2.394374597072601, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.2696389377117157, "loss/reg": 0.0, "step": 37980 }, { "epoch": 0.2499342105263158, "grad_norm": 2.296875, "grad_norm_var": 0.41775716145833336, "learning_rate": 0.0001, "loss": 3.0951, "loss/crossentropy": 2.279202771186829, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.22951147556304932, "loss/reg": 0.0, "step": 37990 }, { "epoch": 0.25, "grad_norm": 2.5, "grad_norm_var": 0.25235087076822915, "learning_rate": 0.0001, "loss": 2.9227, "loss/crossentropy": 2.0242787480354307, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.20535071119666098, "loss/reg": 0.0, "step": 38000 }, { "epoch": 0.2500657894736842, "grad_norm": 2.640625, "grad_norm_var": 0.08347880045572917, "learning_rate": 0.0001, "loss": 2.9955, "loss/crossentropy": 2.109722208976746, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.20487915128469467, "loss/reg": 0.0, "step": 38010 }, { "epoch": 0.2501315789473684, "grad_norm": 2.328125, "grad_norm_var": 6.417805380649222e+17, "learning_rate": 0.0001, "loss": 3.1302, "loss/crossentropy": 2.31266930103302, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.22627270370721816, "loss/reg": 0.0, "step": 38020 }, { "epoch": 0.2501973684210526, "grad_norm": 2.84375, "grad_norm_var": 0.0590240478515625, "learning_rate": 0.0001, "loss": 2.9995, "loss/crossentropy": 2.147220182418823, "loss/hidden": 2.6078125, "loss/incoh": 0.0, "loss/logits": 0.18802944347262382, "loss/reg": 0.0, "step": 38030 }, { "epoch": 0.25026315789473685, "grad_norm": 2.421875, "grad_norm_var": 0.18092041015625, "learning_rate": 0.0001, "loss": 3.014, "loss/crossentropy": 2.1841834664344786, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.23548600152134896, "loss/reg": 0.0, "step": 38040 }, { "epoch": 0.25032894736842104, "grad_norm": 2.5, "grad_norm_var": 0.17408854166666668, "learning_rate": 0.0001, "loss": 3.0632, "loss/crossentropy": 2.1282356858253477, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.2339039534330368, "loss/reg": 0.0, "step": 38050 }, { "epoch": 0.2503947368421053, "grad_norm": 2.53125, "grad_norm_var": 0.15420633951822918, "learning_rate": 0.0001, "loss": 2.9998, "loss/crossentropy": 2.2876673698425294, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.20793070793151855, "loss/reg": 0.0, "step": 38060 }, { "epoch": 0.25046052631578947, "grad_norm": 2.765625, "grad_norm_var": 0.17851460774739583, "learning_rate": 0.0001, "loss": 3.0085, "loss/crossentropy": 2.327640974521637, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.26281672716140747, "loss/reg": 0.0, "step": 38070 }, { "epoch": 0.2505263157894737, "grad_norm": 2.703125, "grad_norm_var": 0.6514231363932291, "learning_rate": 0.0001, "loss": 3.0383, "loss/crossentropy": 1.913532590866089, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.20542303174734117, "loss/reg": 0.0, "step": 38080 }, { "epoch": 0.2505921052631579, "grad_norm": 1.984375, "grad_norm_var": 0.7180580139160156, "learning_rate": 0.0001, "loss": 2.9836, "loss/crossentropy": 2.27760968208313, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.23442062735557556, "loss/reg": 0.0, "step": 38090 }, { "epoch": 0.2506578947368421, "grad_norm": 4.125, "grad_norm_var": 0.5421974182128906, "learning_rate": 0.0001, "loss": 2.9747, "loss/crossentropy": 2.13877215385437, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.19861070141196252, "loss/reg": 0.0, "step": 38100 }, { "epoch": 0.2507236842105263, "grad_norm": 3.59375, "grad_norm_var": 0.27887369791666666, "learning_rate": 0.0001, "loss": 3.1014, "loss/crossentropy": 2.2125035911798476, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.20661384277045727, "loss/reg": 0.0, "step": 38110 }, { "epoch": 0.2507894736842105, "grad_norm": 2.25, "grad_norm_var": 0.3249908447265625, "learning_rate": 0.0001, "loss": 3.0608, "loss/crossentropy": 2.3948875069618225, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.21930506974458694, "loss/reg": 0.0, "step": 38120 }, { "epoch": 0.25085526315789475, "grad_norm": 2.53125, "grad_norm_var": 0.28649063110351564, "learning_rate": 0.0001, "loss": 2.9885, "loss/crossentropy": 2.292639744281769, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2267945870757103, "loss/reg": 0.0, "step": 38130 }, { "epoch": 0.25092105263157893, "grad_norm": 2.59375, "grad_norm_var": 0.05728759765625, "learning_rate": 0.0001, "loss": 2.9936, "loss/crossentropy": 2.097894775867462, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.20571077913045882, "loss/reg": 0.0, "step": 38140 }, { "epoch": 0.2509868421052632, "grad_norm": 3.375, "grad_norm_var": 3.1128214518229167, "learning_rate": 0.0001, "loss": 2.9914, "loss/crossentropy": 2.272007715702057, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.21056424751877784, "loss/reg": 0.0, "step": 38150 }, { "epoch": 0.25105263157894736, "grad_norm": 2.421875, "grad_norm_var": 0.16922098795572918, "learning_rate": 0.0001, "loss": 2.9551, "loss/crossentropy": 2.2579661190509794, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.26000856757164004, "loss/reg": 0.0, "step": 38160 }, { "epoch": 0.2511184210526316, "grad_norm": 2.15625, "grad_norm_var": 0.1134674072265625, "learning_rate": 0.0001, "loss": 2.9975, "loss/crossentropy": 2.0489343285560606, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.17201969623565674, "loss/reg": 0.0, "step": 38170 }, { "epoch": 0.2511842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.030980428059895832, "learning_rate": 0.0001, "loss": 2.9523, "loss/crossentropy": 2.252878558635712, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.23066311478614807, "loss/reg": 0.0, "step": 38180 }, { "epoch": 0.25125, "grad_norm": 2.140625, "grad_norm_var": 0.0248931884765625, "learning_rate": 0.0001, "loss": 2.9488, "loss/crossentropy": 2.3150659799575806, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.21407101899385453, "loss/reg": 0.0, "step": 38190 }, { "epoch": 0.2513157894736842, "grad_norm": 2.640625, "grad_norm_var": 2.1658424377441405, "learning_rate": 0.0001, "loss": 2.9789, "loss/crossentropy": 2.4279683232307434, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.2308952897787094, "loss/reg": 0.0, "step": 38200 }, { "epoch": 0.2513815789473684, "grad_norm": 3.75, "grad_norm_var": 1.915710194905599, "learning_rate": 0.0001, "loss": 2.9791, "loss/crossentropy": 2.454149055480957, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.27904040217399595, "loss/reg": 0.0, "step": 38210 }, { "epoch": 0.25144736842105264, "grad_norm": 2.28125, "grad_norm_var": 0.16384175618489583, "learning_rate": 0.0001, "loss": 3.025, "loss/crossentropy": 2.4395872354507446, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.24970148950815202, "loss/reg": 0.0, "step": 38220 }, { "epoch": 0.2515131578947368, "grad_norm": 3.1875, "grad_norm_var": 0.13736572265625, "learning_rate": 0.0001, "loss": 2.9912, "loss/crossentropy": 2.4788294553756716, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.21640673130750657, "loss/reg": 0.0, "step": 38230 }, { "epoch": 0.25157894736842107, "grad_norm": 2.359375, "grad_norm_var": 0.1446929931640625, "learning_rate": 0.0001, "loss": 2.978, "loss/crossentropy": 2.15856648683548, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.22779612690210344, "loss/reg": 0.0, "step": 38240 }, { "epoch": 0.25164473684210525, "grad_norm": 2.484375, "grad_norm_var": 0.028629557291666666, "learning_rate": 0.0001, "loss": 3.0221, "loss/crossentropy": 2.5230648517608643, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.22997782975435258, "loss/reg": 0.0, "step": 38250 }, { "epoch": 0.2517105263157895, "grad_norm": 2.21875, "grad_norm_var": 0.03369852701822917, "learning_rate": 0.0001, "loss": 2.9928, "loss/crossentropy": 2.3802834033966063, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.25779378563165667, "loss/reg": 0.0, "step": 38260 }, { "epoch": 0.2517763157894737, "grad_norm": 2.359375, "grad_norm_var": 0.18492838541666667, "learning_rate": 0.0001, "loss": 2.9498, "loss/crossentropy": 2.31593804359436, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.245054192841053, "loss/reg": 0.0, "step": 38270 }, { "epoch": 0.25184210526315787, "grad_norm": 3.0625, "grad_norm_var": 0.1485015869140625, "learning_rate": 0.0001, "loss": 3.0313, "loss/crossentropy": 2.241119909286499, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.19746284037828446, "loss/reg": 0.0, "step": 38280 }, { "epoch": 0.2519078947368421, "grad_norm": 2.53125, "grad_norm_var": 0.10631510416666666, "learning_rate": 0.0001, "loss": 3.0635, "loss/crossentropy": 2.0983599185943604, "loss/hidden": 2.99375, "loss/incoh": 0.0, "loss/logits": 0.25142730176448824, "loss/reg": 0.0, "step": 38290 }, { "epoch": 0.2519736842105263, "grad_norm": 3.28125, "grad_norm_var": 0.11451416015625, "learning_rate": 0.0001, "loss": 3.046, "loss/crossentropy": 2.36417875289917, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.24252529442310333, "loss/reg": 0.0, "step": 38300 }, { "epoch": 0.25203947368421054, "grad_norm": 3.046875, "grad_norm_var": 0.11712239583333334, "learning_rate": 0.0001, "loss": 3.1178, "loss/crossentropy": 2.3146223425865173, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.26736019253730775, "loss/reg": 0.0, "step": 38310 }, { "epoch": 0.2521052631578947, "grad_norm": 2.609375, "grad_norm_var": 0.7847076416015625, "learning_rate": 0.0001, "loss": 3.0818, "loss/crossentropy": 2.2367316365242003, "loss/hidden": 2.5859375, "loss/incoh": 0.0, "loss/logits": 0.19362774714827538, "loss/reg": 0.0, "step": 38320 }, { "epoch": 0.25217105263157896, "grad_norm": 2.359375, "grad_norm_var": 0.3490386962890625, "learning_rate": 0.0001, "loss": 2.9877, "loss/crossentropy": 2.212986183166504, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.21934635937213898, "loss/reg": 0.0, "step": 38330 }, { "epoch": 0.25223684210526315, "grad_norm": 2.71875, "grad_norm_var": 0.0618560791015625, "learning_rate": 0.0001, "loss": 3.0083, "loss/crossentropy": 2.424324858188629, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.2144932597875595, "loss/reg": 0.0, "step": 38340 }, { "epoch": 0.2523026315789474, "grad_norm": 2.21875, "grad_norm_var": 0.05483296712239583, "learning_rate": 0.0001, "loss": 2.9994, "loss/crossentropy": 2.3819736361503603, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.282964862883091, "loss/reg": 0.0, "step": 38350 }, { "epoch": 0.2523684210526316, "grad_norm": 2.421875, "grad_norm_var": 6.966681585372321e+17, "learning_rate": 0.0001, "loss": 3.0863, "loss/crossentropy": 2.094909679889679, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.21131259053945542, "loss/reg": 0.0, "step": 38360 }, { "epoch": 0.2524342105263158, "grad_norm": 1.9765625, "grad_norm_var": 0.49862848917643227, "learning_rate": 0.0001, "loss": 2.9827, "loss/crossentropy": 2.215614175796509, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.2258283868432045, "loss/reg": 0.0, "step": 38370 }, { "epoch": 0.2525, "grad_norm": 2.328125, "grad_norm_var": 0.4853370666503906, "learning_rate": 0.0001, "loss": 2.9659, "loss/crossentropy": 2.0444746017456055, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.17757057920098304, "loss/reg": 0.0, "step": 38380 }, { "epoch": 0.2525657894736842, "grad_norm": 2.4375, "grad_norm_var": 0.09589436848958334, "learning_rate": 0.0001, "loss": 3.0212, "loss/crossentropy": 2.3424991607666015, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.20197789669036864, "loss/reg": 0.0, "step": 38390 }, { "epoch": 0.25263157894736843, "grad_norm": 2.34375, "grad_norm_var": 0.10813395182291667, "learning_rate": 0.0001, "loss": 2.9537, "loss/crossentropy": 2.090820240974426, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.18726751990616322, "loss/reg": 0.0, "step": 38400 }, { "epoch": 0.2526973684210526, "grad_norm": 2.5625, "grad_norm_var": 0.06373291015625, "learning_rate": 0.0001, "loss": 2.9561, "loss/crossentropy": 1.846102112531662, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.1744068369269371, "loss/reg": 0.0, "step": 38410 }, { "epoch": 0.25276315789473686, "grad_norm": 2.34375, "grad_norm_var": 0.10798924763997396, "learning_rate": 0.0001, "loss": 2.9147, "loss/crossentropy": 2.336298942565918, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.2202483594417572, "loss/reg": 0.0, "step": 38420 }, { "epoch": 0.25282894736842104, "grad_norm": 2.671875, "grad_norm_var": 0.21279195149739583, "learning_rate": 0.0001, "loss": 3.0227, "loss/crossentropy": 2.413504159450531, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.2817773073911667, "loss/reg": 0.0, "step": 38430 }, { "epoch": 0.2528947368421053, "grad_norm": 2.0625, "grad_norm_var": 0.4711985270182292, "learning_rate": 0.0001, "loss": 2.9499, "loss/crossentropy": 2.3754114389419554, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.20899627953767777, "loss/reg": 0.0, "step": 38440 }, { "epoch": 0.25296052631578947, "grad_norm": 2.5625, "grad_norm_var": 0.11868489583333333, "learning_rate": 0.0001, "loss": 3.0609, "loss/crossentropy": 2.1895971417427065, "loss/hidden": 2.9484375, "loss/incoh": 0.0, "loss/logits": 0.27013812959194183, "loss/reg": 0.0, "step": 38450 }, { "epoch": 0.2530263157894737, "grad_norm": 2.140625, "grad_norm_var": 0.07203776041666667, "learning_rate": 0.0001, "loss": 2.9742, "loss/crossentropy": 2.398327910900116, "loss/hidden": 2.534375, "loss/incoh": 0.0, "loss/logits": 0.20559832453727722, "loss/reg": 0.0, "step": 38460 }, { "epoch": 0.2530921052631579, "grad_norm": 2.3125, "grad_norm_var": 0.042479451497395834, "learning_rate": 0.0001, "loss": 3.0595, "loss/crossentropy": 2.1632952094078064, "loss/hidden": 3.0921875, "loss/incoh": 0.0, "loss/logits": 0.26632697582244874, "loss/reg": 0.0, "step": 38470 }, { "epoch": 0.2531578947368421, "grad_norm": 2.390625, "grad_norm_var": 0.0286285400390625, "learning_rate": 0.0001, "loss": 2.948, "loss/crossentropy": 2.3557206630706786, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.23956954926252366, "loss/reg": 0.0, "step": 38480 }, { "epoch": 0.2532236842105263, "grad_norm": 2.484375, "grad_norm_var": 0.06353759765625, "learning_rate": 0.0001, "loss": 3.0005, "loss/crossentropy": 2.1524956703186033, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.2026323951780796, "loss/reg": 0.0, "step": 38490 }, { "epoch": 0.2532894736842105, "grad_norm": 2.265625, "grad_norm_var": 0.04701309204101563, "learning_rate": 0.0001, "loss": 2.9439, "loss/crossentropy": 2.081376886367798, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.20958617925643921, "loss/reg": 0.0, "step": 38500 }, { "epoch": 0.25335526315789475, "grad_norm": 2.375, "grad_norm_var": 0.057708485921223955, "learning_rate": 0.0001, "loss": 3.008, "loss/crossentropy": 2.419314205646515, "loss/hidden": 2.9125, "loss/incoh": 0.0, "loss/logits": 0.28001276552677157, "loss/reg": 0.0, "step": 38510 }, { "epoch": 0.25342105263157894, "grad_norm": 2.140625, "grad_norm_var": 280.5541015625, "learning_rate": 0.0001, "loss": 3.124, "loss/crossentropy": 2.209311616420746, "loss/hidden": 3.334375, "loss/incoh": 0.0, "loss/logits": 0.4400923550128937, "loss/reg": 0.0, "step": 38520 }, { "epoch": 0.2534868421052632, "grad_norm": 2.34375, "grad_norm_var": 280.8059855143229, "learning_rate": 0.0001, "loss": 2.963, "loss/crossentropy": 2.386173665523529, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.20247507095336914, "loss/reg": 0.0, "step": 38530 }, { "epoch": 0.25355263157894736, "grad_norm": 2.375, "grad_norm_var": 1.2670237223307292, "learning_rate": 0.0001, "loss": 2.9903, "loss/crossentropy": 2.3542658567428587, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.229895980656147, "loss/reg": 0.0, "step": 38540 }, { "epoch": 0.2536184210526316, "grad_norm": 2.328125, "grad_norm_var": 0.12191340128580729, "learning_rate": 0.0001, "loss": 3.0164, "loss/crossentropy": 2.4656338930130004, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.2140819951891899, "loss/reg": 0.0, "step": 38550 }, { "epoch": 0.2536842105263158, "grad_norm": 2.546875, "grad_norm_var": 9.503270467122396, "learning_rate": 0.0001, "loss": 3.0177, "loss/crossentropy": 2.4247732520103455, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.26161234080791473, "loss/reg": 0.0, "step": 38560 }, { "epoch": 0.25375, "grad_norm": 2.1875, "grad_norm_var": 9.574365234375, "learning_rate": 0.0001, "loss": 3.0298, "loss/crossentropy": 2.2777958273887635, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.3407336473464966, "loss/reg": 0.0, "step": 38570 }, { "epoch": 0.2538157894736842, "grad_norm": 2.53125, "grad_norm_var": 0.3904693603515625, "learning_rate": 0.0001, "loss": 2.933, "loss/crossentropy": 2.220312762260437, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.2032586969435215, "loss/reg": 0.0, "step": 38580 }, { "epoch": 0.2538815789473684, "grad_norm": 2.78125, "grad_norm_var": 0.08896077473958333, "learning_rate": 0.0001, "loss": 2.9606, "loss/crossentropy": 2.1711514949798585, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.17734147608280182, "loss/reg": 0.0, "step": 38590 }, { "epoch": 0.25394736842105264, "grad_norm": 2.21875, "grad_norm_var": 0.20473531087239583, "learning_rate": 0.0001, "loss": 2.9321, "loss/crossentropy": 1.9470943689346314, "loss/hidden": 2.6, "loss/incoh": 0.0, "loss/logits": 0.19334470629692077, "loss/reg": 0.0, "step": 38600 }, { "epoch": 0.25401315789473683, "grad_norm": 3.03125, "grad_norm_var": 0.06678059895833334, "learning_rate": 0.0001, "loss": 2.9233, "loss/crossentropy": 2.273166114091873, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.20435749739408493, "loss/reg": 0.0, "step": 38610 }, { "epoch": 0.25407894736842107, "grad_norm": 2.15625, "grad_norm_var": 0.11378580729166667, "learning_rate": 0.0001, "loss": 2.9767, "loss/crossentropy": 2.571880316734314, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.21721504628658295, "loss/reg": 0.0, "step": 38620 }, { "epoch": 0.25414473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.08742574055989584, "learning_rate": 0.0001, "loss": 2.9226, "loss/crossentropy": 2.2264232873916625, "loss/hidden": 2.890625, "loss/incoh": 0.0, "loss/logits": 0.21014079004526137, "loss/reg": 0.0, "step": 38630 }, { "epoch": 0.2542105263157895, "grad_norm": 2.578125, "grad_norm_var": 0.05276285807291667, "learning_rate": 0.0001, "loss": 2.9634, "loss/crossentropy": 2.2364309787750245, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.20938622057437897, "loss/reg": 0.0, "step": 38640 }, { "epoch": 0.2542763157894737, "grad_norm": 2.359375, "grad_norm_var": 0.03668619791666667, "learning_rate": 0.0001, "loss": 3.0195, "loss/crossentropy": 2.135267126560211, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2628340318799019, "loss/reg": 0.0, "step": 38650 }, { "epoch": 0.25434210526315787, "grad_norm": 2.3125, "grad_norm_var": 1.3797353108723958, "learning_rate": 0.0001, "loss": 2.927, "loss/crossentropy": 2.403776562213898, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.2078718587756157, "loss/reg": 0.0, "step": 38660 }, { "epoch": 0.2544078947368421, "grad_norm": 2.265625, "grad_norm_var": 1.3981770833333333, "learning_rate": 0.0001, "loss": 2.9993, "loss/crossentropy": 2.358154129981995, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.2573642887175083, "loss/reg": 0.0, "step": 38670 }, { "epoch": 0.2544736842105263, "grad_norm": 2.046875, "grad_norm_var": 0.06347249348958334, "learning_rate": 0.0001, "loss": 2.9781, "loss/crossentropy": 2.3121828198432923, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.2003457449376583, "loss/reg": 0.0, "step": 38680 }, { "epoch": 0.25453947368421054, "grad_norm": 6.4375, "grad_norm_var": 1.1110694885253907, "learning_rate": 0.0001, "loss": 2.9505, "loss/crossentropy": 2.153343600034714, "loss/hidden": 2.9796875, "loss/incoh": 0.0, "loss/logits": 0.25137421637773516, "loss/reg": 0.0, "step": 38690 }, { "epoch": 0.2546052631578947, "grad_norm": 2.203125, "grad_norm_var": 1.0830393473307292, "learning_rate": 0.0001, "loss": 2.9718, "loss/crossentropy": 2.052643448114395, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.25727019011974334, "loss/reg": 0.0, "step": 38700 }, { "epoch": 0.25467105263157896, "grad_norm": 2.15625, "grad_norm_var": 0.0537506103515625, "learning_rate": 0.0001, "loss": 3.006, "loss/crossentropy": 2.152147728204727, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.23068247735500336, "loss/reg": 0.0, "step": 38710 }, { "epoch": 0.25473684210526315, "grad_norm": 2.25, "grad_norm_var": 0.0999420166015625, "learning_rate": 0.0001, "loss": 2.9765, "loss/crossentropy": 2.2647915482521057, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.24034264236688613, "loss/reg": 0.0, "step": 38720 }, { "epoch": 0.2548026315789474, "grad_norm": 2.890625, "grad_norm_var": 0.19918212890625, "learning_rate": 0.0001, "loss": 2.9698, "loss/crossentropy": 2.4958234548568727, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.23010408282279968, "loss/reg": 0.0, "step": 38730 }, { "epoch": 0.2548684210526316, "grad_norm": 2.421875, "grad_norm_var": 0.15207926432291666, "learning_rate": 0.0001, "loss": 2.9802, "loss/crossentropy": 2.2727454662323, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.20178039968013764, "loss/reg": 0.0, "step": 38740 }, { "epoch": 0.25493421052631576, "grad_norm": 2.234375, "grad_norm_var": 0.06728108723958333, "learning_rate": 0.0001, "loss": 3.036, "loss/crossentropy": 2.2367177844047545, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.24128163158893584, "loss/reg": 0.0, "step": 38750 }, { "epoch": 0.255, "grad_norm": 2.109375, "grad_norm_var": 0.06168212890625, "learning_rate": 0.0001, "loss": 2.9621, "loss/crossentropy": 2.34770849943161, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.21328310072422027, "loss/reg": 0.0, "step": 38760 }, { "epoch": 0.2550657894736842, "grad_norm": 2.203125, "grad_norm_var": 0.1226470947265625, "learning_rate": 0.0001, "loss": 2.9942, "loss/crossentropy": 2.306888747215271, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.2507788211107254, "loss/reg": 0.0, "step": 38770 }, { "epoch": 0.25513157894736843, "grad_norm": 2.359375, "grad_norm_var": 0.17222900390625, "learning_rate": 0.0001, "loss": 2.8878, "loss/crossentropy": 2.0050193548202513, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.18458792492747306, "loss/reg": 0.0, "step": 38780 }, { "epoch": 0.2551973684210526, "grad_norm": 2.125, "grad_norm_var": 0.0305816650390625, "learning_rate": 0.0001, "loss": 2.9807, "loss/crossentropy": 2.279655563831329, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.22222616970539094, "loss/reg": 0.0, "step": 38790 }, { "epoch": 0.25526315789473686, "grad_norm": 2.3125, "grad_norm_var": 0.045426432291666666, "learning_rate": 0.0001, "loss": 2.9266, "loss/crossentropy": 1.8874349355697633, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.1566801816225052, "loss/reg": 0.0, "step": 38800 }, { "epoch": 0.25532894736842104, "grad_norm": 2.140625, "grad_norm_var": 0.13137919108072918, "learning_rate": 0.0001, "loss": 2.9532, "loss/crossentropy": 2.0287832260131835, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.19014589190483094, "loss/reg": 0.0, "step": 38810 }, { "epoch": 0.2553947368421053, "grad_norm": 2.328125, "grad_norm_var": 0.16416727701822917, "learning_rate": 0.0001, "loss": 2.9798, "loss/crossentropy": 2.183877873420715, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.21431390047073365, "loss/reg": 0.0, "step": 38820 }, { "epoch": 0.25546052631578947, "grad_norm": 2.453125, "grad_norm_var": 0.24260965983072916, "learning_rate": 0.0001, "loss": 2.9393, "loss/crossentropy": 2.440583086013794, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.20157338082790374, "loss/reg": 0.0, "step": 38830 }, { "epoch": 0.25552631578947366, "grad_norm": 2.265625, "grad_norm_var": 0.1785797119140625, "learning_rate": 0.0001, "loss": 2.9731, "loss/crossentropy": 2.5752979278564454, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.24179309010505676, "loss/reg": 0.0, "step": 38840 }, { "epoch": 0.2555921052631579, "grad_norm": 2.640625, "grad_norm_var": 0.1367340087890625, "learning_rate": 0.0001, "loss": 2.9657, "loss/crossentropy": 2.35681791305542, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.24579865038394927, "loss/reg": 0.0, "step": 38850 }, { "epoch": 0.2556578947368421, "grad_norm": 2.234375, "grad_norm_var": 0.13662007649739583, "learning_rate": 0.0001, "loss": 2.939, "loss/crossentropy": 2.2475174188613893, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.21202882528305053, "loss/reg": 0.0, "step": 38860 }, { "epoch": 0.2557236842105263, "grad_norm": 2.234375, "grad_norm_var": 0.2050933837890625, "learning_rate": 0.0001, "loss": 2.9837, "loss/crossentropy": 2.3431418418884276, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.23483068943023683, "loss/reg": 0.0, "step": 38870 }, { "epoch": 0.2557894736842105, "grad_norm": 3.03125, "grad_norm_var": 2.675121053059896, "learning_rate": 0.0001, "loss": 2.9856, "loss/crossentropy": 2.483526587486267, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.2897988960146904, "loss/reg": 0.0, "step": 38880 }, { "epoch": 0.25585526315789475, "grad_norm": 2.21875, "grad_norm_var": 0.0619781494140625, "learning_rate": 0.0001, "loss": 2.9633, "loss/crossentropy": 2.1767768919467927, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.234356290102005, "loss/reg": 0.0, "step": 38890 }, { "epoch": 0.25592105263157894, "grad_norm": 2.96875, "grad_norm_var": 0.2576568603515625, "learning_rate": 0.0001, "loss": 2.9781, "loss/crossentropy": 2.4225967705249785, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.23021362945437432, "loss/reg": 0.0, "step": 38900 }, { "epoch": 0.2559868421052632, "grad_norm": 2.625, "grad_norm_var": 0.21616923014322917, "learning_rate": 0.0001, "loss": 2.9813, "loss/crossentropy": 2.215132641792297, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.23095793277025223, "loss/reg": 0.0, "step": 38910 }, { "epoch": 0.25605263157894737, "grad_norm": 2.015625, "grad_norm_var": 0.08826497395833334, "learning_rate": 0.0001, "loss": 2.9896, "loss/crossentropy": 2.1709822535514833, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.20210467725992204, "loss/reg": 0.0, "step": 38920 }, { "epoch": 0.2561184210526316, "grad_norm": 2.375, "grad_norm_var": 0.08967997233072916, "learning_rate": 0.0001, "loss": 2.9936, "loss/crossentropy": 2.343753528594971, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.22285044342279434, "loss/reg": 0.0, "step": 38930 }, { "epoch": 0.2561842105263158, "grad_norm": 2.421875, "grad_norm_var": 0.029637654622395832, "learning_rate": 0.0001, "loss": 2.918, "loss/crossentropy": 2.4347527027130127, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.2453876256942749, "loss/reg": 0.0, "step": 38940 }, { "epoch": 0.25625, "grad_norm": 2.265625, "grad_norm_var": 0.058405558268229164, "learning_rate": 0.0001, "loss": 2.9806, "loss/crossentropy": 2.24286550283432, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.24712017327547073, "loss/reg": 0.0, "step": 38950 }, { "epoch": 0.2563157894736842, "grad_norm": 2.390625, "grad_norm_var": 0.051756795247395834, "learning_rate": 0.0001, "loss": 2.9834, "loss/crossentropy": 2.170624256134033, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2218889057636261, "loss/reg": 0.0, "step": 38960 }, { "epoch": 0.2563815789473684, "grad_norm": 2.140625, "grad_norm_var": 0.0666015625, "learning_rate": 0.0001, "loss": 3.0572, "loss/crossentropy": 2.154159963130951, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.20279831290245057, "loss/reg": 0.0, "step": 38970 }, { "epoch": 0.25644736842105265, "grad_norm": 2.484375, "grad_norm_var": 0.08325093587239583, "learning_rate": 0.0001, "loss": 2.9366, "loss/crossentropy": 2.1254536151885985, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.24286270141601562, "loss/reg": 0.0, "step": 38980 }, { "epoch": 0.25651315789473683, "grad_norm": 2.828125, "grad_norm_var": 0.1099609375, "learning_rate": 0.0001, "loss": 2.9717, "loss/crossentropy": 2.1491544008255006, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.27067394405603407, "loss/reg": 0.0, "step": 38990 }, { "epoch": 0.2565789473684211, "grad_norm": 2.6875, "grad_norm_var": 0.6976226806640625, "learning_rate": 0.0001, "loss": 3.0136, "loss/crossentropy": 2.2009241580963135, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.19883604645729064, "loss/reg": 0.0, "step": 39000 }, { "epoch": 0.25664473684210526, "grad_norm": 2.828125, "grad_norm_var": 0.6546183268229167, "learning_rate": 0.0001, "loss": 2.9775, "loss/crossentropy": 2.086053115129471, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.26883399933576585, "loss/reg": 0.0, "step": 39010 }, { "epoch": 0.2567105263157895, "grad_norm": 3.71875, "grad_norm_var": 0.23528238932291667, "learning_rate": 0.0001, "loss": 2.9801, "loss/crossentropy": 2.121158719062805, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.22595785260200502, "loss/reg": 0.0, "step": 39020 }, { "epoch": 0.2567763157894737, "grad_norm": 2.203125, "grad_norm_var": 0.22114156087239584, "learning_rate": 0.0001, "loss": 2.8994, "loss/crossentropy": 2.2681718945503233, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.25748113095760344, "loss/reg": 0.0, "step": 39030 }, { "epoch": 0.25684210526315787, "grad_norm": 2.28125, "grad_norm_var": 0.08943583170572916, "learning_rate": 0.0001, "loss": 2.8768, "loss/crossentropy": 2.4575144767761232, "loss/hidden": 2.578125, "loss/incoh": 0.0, "loss/logits": 0.20152714401483535, "loss/reg": 0.0, "step": 39040 }, { "epoch": 0.2569078947368421, "grad_norm": 2.46875, "grad_norm_var": 0.0328765869140625, "learning_rate": 0.0001, "loss": 2.9306, "loss/crossentropy": 2.369240176677704, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.22774229645729066, "loss/reg": 0.0, "step": 39050 }, { "epoch": 0.2569736842105263, "grad_norm": 2.640625, "grad_norm_var": 3.1118817818339136e+17, "learning_rate": 0.0001, "loss": 3.0462, "loss/crossentropy": 2.1120986580848693, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2498351290822029, "loss/reg": 0.0, "step": 39060 }, { "epoch": 0.25703947368421054, "grad_norm": 2.3125, "grad_norm_var": 0.034886678059895836, "learning_rate": 0.0001, "loss": 2.8904, "loss/crossentropy": 2.1073123455047607, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.19584316536784172, "loss/reg": 0.0, "step": 39070 }, { "epoch": 0.2571052631578947, "grad_norm": 2.890625, "grad_norm_var": 0.07847900390625, "learning_rate": 0.0001, "loss": 2.9755, "loss/crossentropy": 2.3284847140312195, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.2493668183684349, "loss/reg": 0.0, "step": 39080 }, { "epoch": 0.25717105263157897, "grad_norm": 2.0625, "grad_norm_var": 0.12560221354166667, "learning_rate": 0.0001, "loss": 2.9003, "loss/crossentropy": 2.334082317352295, "loss/hidden": 2.546875, "loss/incoh": 0.0, "loss/logits": 0.1966678135097027, "loss/reg": 0.0, "step": 39090 }, { "epoch": 0.25723684210526315, "grad_norm": 2.453125, "grad_norm_var": 0.08367513020833334, "learning_rate": 0.0001, "loss": 2.9616, "loss/crossentropy": 2.6055662870407104, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.2218620851635933, "loss/reg": 0.0, "step": 39100 }, { "epoch": 0.2573026315789474, "grad_norm": 2.703125, "grad_norm_var": 0.22674051920572916, "learning_rate": 0.0001, "loss": 2.9551, "loss/crossentropy": 2.1810374021530152, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.20281792134046556, "loss/reg": 0.0, "step": 39110 }, { "epoch": 0.2573684210526316, "grad_norm": 2.328125, "grad_norm_var": 0.13672587076822917, "learning_rate": 0.0001, "loss": 2.9471, "loss/crossentropy": 2.150469708442688, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.21119417250156403, "loss/reg": 0.0, "step": 39120 }, { "epoch": 0.25743421052631577, "grad_norm": 2.15625, "grad_norm_var": 0.7312978108723959, "learning_rate": 0.0001, "loss": 2.9604, "loss/crossentropy": 1.9471443891525269, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.2092149168252945, "loss/reg": 0.0, "step": 39130 }, { "epoch": 0.2575, "grad_norm": 2.109375, "grad_norm_var": 0.7445330301920573, "learning_rate": 0.0001, "loss": 2.9095, "loss/crossentropy": 2.2832953572273254, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.21912914961576463, "loss/reg": 0.0, "step": 39140 }, { "epoch": 0.2575657894736842, "grad_norm": 3.15625, "grad_norm_var": 0.14458719889322916, "learning_rate": 0.0001, "loss": 2.9719, "loss/crossentropy": 2.3353841066360475, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.22544662952423095, "loss/reg": 0.0, "step": 39150 }, { "epoch": 0.25763157894736843, "grad_norm": 2.359375, "grad_norm_var": 0.3525136311848958, "learning_rate": 0.0001, "loss": 3.0065, "loss/crossentropy": 2.3361639380455017, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2251232832670212, "loss/reg": 0.0, "step": 39160 }, { "epoch": 0.2576973684210526, "grad_norm": 2.359375, "grad_norm_var": 0.295556640625, "learning_rate": 0.0001, "loss": 3.0384, "loss/crossentropy": 2.324616348743439, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.23191166073083877, "loss/reg": 0.0, "step": 39170 }, { "epoch": 0.25776315789473686, "grad_norm": 2.359375, "grad_norm_var": 0.06609700520833334, "learning_rate": 0.0001, "loss": 2.9448, "loss/crossentropy": 2.2094304859638214, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.24130020663142204, "loss/reg": 0.0, "step": 39180 }, { "epoch": 0.25782894736842105, "grad_norm": 2.03125, "grad_norm_var": 0.09587376912434896, "learning_rate": 0.0001, "loss": 2.9087, "loss/crossentropy": 2.379458463191986, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.2207609809935093, "loss/reg": 0.0, "step": 39190 }, { "epoch": 0.2578947368421053, "grad_norm": 2.8125, "grad_norm_var": 0.10582275390625, "learning_rate": 0.0001, "loss": 3.0122, "loss/crossentropy": 2.1732101857662203, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.20026833638548852, "loss/reg": 0.0, "step": 39200 }, { "epoch": 0.2579605263157895, "grad_norm": 2.21875, "grad_norm_var": 0.07842992146809896, "learning_rate": 0.0001, "loss": 2.9131, "loss/crossentropy": 1.8863205194473267, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.1805630184710026, "loss/reg": 0.0, "step": 39210 }, { "epoch": 0.25802631578947366, "grad_norm": 2.140625, "grad_norm_var": 0.16445719401041667, "learning_rate": 0.0001, "loss": 2.9865, "loss/crossentropy": 2.3708848357200623, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2134988009929657, "loss/reg": 0.0, "step": 39220 }, { "epoch": 0.2580921052631579, "grad_norm": 2.109375, "grad_norm_var": 0.043190256754557295, "learning_rate": 0.0001, "loss": 2.8704, "loss/crossentropy": 2.5470376133918764, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.21798246651887893, "loss/reg": 0.0, "step": 39230 }, { "epoch": 0.2581578947368421, "grad_norm": 3.75, "grad_norm_var": 0.15417378743489582, "learning_rate": 0.0001, "loss": 2.9229, "loss/crossentropy": 2.3233034729957582, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2058998629450798, "loss/reg": 0.0, "step": 39240 }, { "epoch": 0.2582236842105263, "grad_norm": 2.34375, "grad_norm_var": 0.23391087849934897, "learning_rate": 0.0001, "loss": 2.9943, "loss/crossentropy": 2.3962061643600463, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2253645345568657, "loss/reg": 0.0, "step": 39250 }, { "epoch": 0.2582894736842105, "grad_norm": 2.3125, "grad_norm_var": 0.0898577372233073, "learning_rate": 0.0001, "loss": 2.9846, "loss/crossentropy": 2.2847093820571898, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.20621878653764725, "loss/reg": 0.0, "step": 39260 }, { "epoch": 0.25835526315789475, "grad_norm": 2.859375, "grad_norm_var": 0.062235514322916664, "learning_rate": 0.0001, "loss": 3.0113, "loss/crossentropy": 2.1093714237213135, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.22439000010490417, "loss/reg": 0.0, "step": 39270 }, { "epoch": 0.25842105263157894, "grad_norm": 2.625, "grad_norm_var": 0.061091105143229164, "learning_rate": 0.0001, "loss": 2.9406, "loss/crossentropy": 2.3419513583183287, "loss/hidden": 2.625, "loss/incoh": 0.0, "loss/logits": 0.2029005065560341, "loss/reg": 0.0, "step": 39280 }, { "epoch": 0.2584868421052632, "grad_norm": 1.96875, "grad_norm_var": 0.0909332275390625, "learning_rate": 0.0001, "loss": 2.8985, "loss/crossentropy": 2.230232834815979, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.20411441028118132, "loss/reg": 0.0, "step": 39290 }, { "epoch": 0.25855263157894737, "grad_norm": 3.0625, "grad_norm_var": 0.1186676025390625, "learning_rate": 0.0001, "loss": 2.9583, "loss/crossentropy": 2.2822274684906008, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.21612002551555634, "loss/reg": 0.0, "step": 39300 }, { "epoch": 0.25861842105263155, "grad_norm": 3.640625, "grad_norm_var": 0.1718414306640625, "learning_rate": 0.0001, "loss": 3.0234, "loss/crossentropy": 2.483612024784088, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.22477086931467055, "loss/reg": 0.0, "step": 39310 }, { "epoch": 0.2586842105263158, "grad_norm": 2.578125, "grad_norm_var": 0.18178609212239583, "learning_rate": 0.0001, "loss": 2.9349, "loss/crossentropy": 2.3175944685935974, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.2550591230392456, "loss/reg": 0.0, "step": 39320 }, { "epoch": 0.25875, "grad_norm": 2.265625, "grad_norm_var": 0.08434956868489583, "learning_rate": 0.0001, "loss": 2.8974, "loss/crossentropy": 2.333976149559021, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.20737813785672188, "loss/reg": 0.0, "step": 39330 }, { "epoch": 0.2588157894736842, "grad_norm": 2.65625, "grad_norm_var": 0.0402984619140625, "learning_rate": 0.0001, "loss": 2.8992, "loss/crossentropy": 1.9081893861293793, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.19327801540493966, "loss/reg": 0.0, "step": 39340 }, { "epoch": 0.2588815789473684, "grad_norm": 2.34375, "grad_norm_var": 0.03901341756184896, "learning_rate": 0.0001, "loss": 2.8945, "loss/crossentropy": 2.112404853105545, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.1989109069108963, "loss/reg": 0.0, "step": 39350 }, { "epoch": 0.25894736842105265, "grad_norm": 2.25, "grad_norm_var": 0.02379124959309896, "learning_rate": 0.0001, "loss": 2.8712, "loss/crossentropy": 2.0947551012039183, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.20299985110759736, "loss/reg": 0.0, "step": 39360 }, { "epoch": 0.25901315789473683, "grad_norm": 2.703125, "grad_norm_var": 0.08593343098958334, "learning_rate": 0.0001, "loss": 2.9641, "loss/crossentropy": 2.160096913576126, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.23282611221075059, "loss/reg": 0.0, "step": 39370 }, { "epoch": 0.2590789473684211, "grad_norm": 2.46875, "grad_norm_var": 0.09907124837239584, "learning_rate": 0.0001, "loss": 3.0059, "loss/crossentropy": 2.1344396591186525, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.20642745792865752, "loss/reg": 0.0, "step": 39380 }, { "epoch": 0.25914473684210526, "grad_norm": 2.546875, "grad_norm_var": 0.0337066650390625, "learning_rate": 0.0001, "loss": 2.9501, "loss/crossentropy": 2.211651337146759, "loss/hidden": 2.575, "loss/incoh": 0.0, "loss/logits": 0.20566888749599457, "loss/reg": 0.0, "step": 39390 }, { "epoch": 0.25921052631578945, "grad_norm": 2.828125, "grad_norm_var": 0.3254191080729167, "learning_rate": 0.0001, "loss": 2.9949, "loss/crossentropy": 2.0947441935539244, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.22096366733312606, "loss/reg": 0.0, "step": 39400 }, { "epoch": 0.2592763157894737, "grad_norm": 1.9921875, "grad_norm_var": 0.19651667277018228, "learning_rate": 0.0001, "loss": 2.9392, "loss/crossentropy": 2.3617849826812742, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.25957188159227373, "loss/reg": 0.0, "step": 39410 }, { "epoch": 0.2593421052631579, "grad_norm": 2.328125, "grad_norm_var": 0.0928179423014323, "learning_rate": 0.0001, "loss": 2.9103, "loss/crossentropy": 2.373174512386322, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.2120439499616623, "loss/reg": 0.0, "step": 39420 }, { "epoch": 0.2594078947368421, "grad_norm": 2.484375, "grad_norm_var": 0.053587849934895834, "learning_rate": 0.0001, "loss": 2.9039, "loss/crossentropy": 2.2536337614059447, "loss/hidden": 2.6078125, "loss/incoh": 0.0, "loss/logits": 0.18805657625198363, "loss/reg": 0.0, "step": 39430 }, { "epoch": 0.2594736842105263, "grad_norm": 2.0, "grad_norm_var": 0.3204264322916667, "learning_rate": 0.0001, "loss": 3.0201, "loss/crossentropy": 2.052464544773102, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.2089213587343693, "loss/reg": 0.0, "step": 39440 }, { "epoch": 0.25953947368421054, "grad_norm": 3.078125, "grad_norm_var": 5.6367123205395795e+17, "learning_rate": 0.0001, "loss": 3.1226, "loss/crossentropy": 2.0904810786247254, "loss/hidden": 2.53125, "loss/incoh": 0.0, "loss/logits": 0.18436047285795212, "loss/reg": 0.0, "step": 39450 }, { "epoch": 0.25960526315789473, "grad_norm": 2.53125, "grad_norm_var": 5.636712321591454e+17, "learning_rate": 0.0001, "loss": 2.8972, "loss/crossentropy": 2.383215081691742, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.2062357634305954, "loss/reg": 0.0, "step": 39460 }, { "epoch": 0.25967105263157897, "grad_norm": 2.703125, "grad_norm_var": 0.2125139872233073, "learning_rate": 0.0001, "loss": 3.0526, "loss/crossentropy": 2.035197800397873, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.22759751938283443, "loss/reg": 0.0, "step": 39470 }, { "epoch": 0.25973684210526315, "grad_norm": 2.265625, "grad_norm_var": 0.1047503153483073, "learning_rate": 0.0001, "loss": 2.8596, "loss/crossentropy": 2.0758556723594666, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.18243583887815476, "loss/reg": 0.0, "step": 39480 }, { "epoch": 0.2598026315789474, "grad_norm": 2.046875, "grad_norm_var": 0.6220232645670573, "learning_rate": 0.0001, "loss": 2.9124, "loss/crossentropy": 2.337350380420685, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.2153509557247162, "loss/reg": 0.0, "step": 39490 }, { "epoch": 0.2598684210526316, "grad_norm": 2.28125, "grad_norm_var": 0.24768651326497396, "learning_rate": 0.0001, "loss": 2.899, "loss/crossentropy": 2.2104454159736635, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.23622069656848907, "loss/reg": 0.0, "step": 39500 }, { "epoch": 0.25993421052631577, "grad_norm": 2.25, "grad_norm_var": 0.06471354166666667, "learning_rate": 0.0001, "loss": 2.8843, "loss/crossentropy": 2.006416219472885, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.21008393466472625, "loss/reg": 0.0, "step": 39510 }, { "epoch": 0.26, "grad_norm": 2.15625, "grad_norm_var": 0.5674112955729167, "learning_rate": 0.0001, "loss": 3.0434, "loss/crossentropy": 2.2104514479637145, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.24286225736141204, "loss/reg": 0.0, "step": 39520 }, { "epoch": 0.2600657894736842, "grad_norm": 3.125, "grad_norm_var": 2.339940388997396, "learning_rate": 0.0001, "loss": 2.9646, "loss/crossentropy": 2.1471508383750915, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2512055471539497, "loss/reg": 0.0, "step": 39530 }, { "epoch": 0.26013157894736844, "grad_norm": 2.53125, "grad_norm_var": 1.9162261962890625, "learning_rate": 0.0001, "loss": 3.015, "loss/crossentropy": 2.252353233098984, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.22746440172195434, "loss/reg": 0.0, "step": 39540 }, { "epoch": 0.2601973684210526, "grad_norm": 2801795072.0, "grad_norm_var": 4.9062847572554765e+17, "learning_rate": 0.0001, "loss": 3.0698, "loss/crossentropy": 2.506903576850891, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.22553565353155136, "loss/reg": 0.0, "step": 39550 }, { "epoch": 0.26026315789473686, "grad_norm": 2.15625, "grad_norm_var": 4.9062847573576256e+17, "learning_rate": 0.0001, "loss": 2.9221, "loss/crossentropy": 2.4941019773483277, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.23029526472091674, "loss/reg": 0.0, "step": 39560 }, { "epoch": 0.26032894736842105, "grad_norm": 2.609375, "grad_norm_var": 0.12497456868489583, "learning_rate": 0.0001, "loss": 2.8649, "loss/crossentropy": 2.1986557483673095, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.18305001109838487, "loss/reg": 0.0, "step": 39570 }, { "epoch": 0.2603947368421053, "grad_norm": 2.453125, "grad_norm_var": 0.10641276041666667, "learning_rate": 0.0001, "loss": 2.917, "loss/crossentropy": 2.1990869998931886, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.1851473018527031, "loss/reg": 0.0, "step": 39580 }, { "epoch": 0.2604605263157895, "grad_norm": 2.375, "grad_norm_var": 1.3978749593098958, "learning_rate": 0.0001, "loss": 3.0133, "loss/crossentropy": 2.436345875263214, "loss/hidden": 2.9578125, "loss/incoh": 0.0, "loss/logits": 0.30376828759908675, "loss/reg": 0.0, "step": 39590 }, { "epoch": 0.26052631578947366, "grad_norm": 2.734375, "grad_norm_var": 0.1521148681640625, "learning_rate": 0.0001, "loss": 2.9345, "loss/crossentropy": 2.342786800861359, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.19189061373472213, "loss/reg": 0.0, "step": 39600 }, { "epoch": 0.2605921052631579, "grad_norm": 2.140625, "grad_norm_var": 0.15281575520833332, "learning_rate": 0.0001, "loss": 2.9767, "loss/crossentropy": 2.4071336150169373, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.22345878183841705, "loss/reg": 0.0, "step": 39610 }, { "epoch": 0.2606578947368421, "grad_norm": 1.9140625, "grad_norm_var": 0.280962880452474, "learning_rate": 0.0001, "loss": 2.9462, "loss/crossentropy": 2.1481564074754713, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.20991018489003183, "loss/reg": 0.0, "step": 39620 }, { "epoch": 0.26072368421052633, "grad_norm": 3.609375, "grad_norm_var": 0.6643674214680989, "learning_rate": 0.0001, "loss": 3.0336, "loss/crossentropy": 2.0887478232383727, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.22009581476449966, "loss/reg": 0.0, "step": 39630 }, { "epoch": 0.2607894736842105, "grad_norm": 2.21875, "grad_norm_var": 0.36109619140625, "learning_rate": 0.0001, "loss": 3.0258, "loss/crossentropy": 2.138615000247955, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.2106044813990593, "loss/reg": 0.0, "step": 39640 }, { "epoch": 0.26085526315789476, "grad_norm": 2.140625, "grad_norm_var": 0.3787750244140625, "learning_rate": 0.0001, "loss": 3.0633, "loss/crossentropy": 1.9567537367343903, "loss/hidden": 2.9484375, "loss/incoh": 0.0, "loss/logits": 0.2987550154328346, "loss/reg": 0.0, "step": 39650 }, { "epoch": 0.26092105263157894, "grad_norm": 2.1875, "grad_norm_var": 0.15900065104166666, "learning_rate": 0.0001, "loss": 2.9489, "loss/crossentropy": 2.524258053302765, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.20932213813066483, "loss/reg": 0.0, "step": 39660 }, { "epoch": 0.2609868421052632, "grad_norm": 3.140625, "grad_norm_var": 0.073583984375, "learning_rate": 0.0001, "loss": 2.9985, "loss/crossentropy": 2.552958643436432, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.21318122893571853, "loss/reg": 0.0, "step": 39670 }, { "epoch": 0.26105263157894737, "grad_norm": 2.359375, "grad_norm_var": 0.0707672119140625, "learning_rate": 0.0001, "loss": 2.9692, "loss/crossentropy": 2.2176095962524416, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.2208583801984787, "loss/reg": 0.0, "step": 39680 }, { "epoch": 0.26111842105263156, "grad_norm": 1.875, "grad_norm_var": 0.058869425455729166, "learning_rate": 0.0001, "loss": 2.9243, "loss/crossentropy": 2.2860087156295776, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.2486639067530632, "loss/reg": 0.0, "step": 39690 }, { "epoch": 0.2611842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.06942952473958333, "learning_rate": 0.0001, "loss": 2.9554, "loss/crossentropy": 2.3581568241119384, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.25539501309394835, "loss/reg": 0.0, "step": 39700 }, { "epoch": 0.26125, "grad_norm": 2.453125, "grad_norm_var": 1.3456217447916667, "learning_rate": 0.0001, "loss": 2.9286, "loss/crossentropy": 2.494968020915985, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.23517013043165208, "loss/reg": 0.0, "step": 39710 }, { "epoch": 0.2613157894736842, "grad_norm": 2.171875, "grad_norm_var": 1.3488566080729167, "learning_rate": 0.0001, "loss": 3.0312, "loss/crossentropy": 2.37692905664444, "loss/hidden": 2.9, "loss/incoh": 0.0, "loss/logits": 0.29073845595121384, "loss/reg": 0.0, "step": 39720 }, { "epoch": 0.2613815789473684, "grad_norm": 2.5, "grad_norm_var": 0.202294921875, "learning_rate": 0.0001, "loss": 3.0294, "loss/crossentropy": 2.281411385536194, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.22122962027788162, "loss/reg": 0.0, "step": 39730 }, { "epoch": 0.26144736842105265, "grad_norm": 2.015625, "grad_norm_var": 0.2295318603515625, "learning_rate": 0.0001, "loss": 2.9489, "loss/crossentropy": 2.2690483570098876, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.2626466929912567, "loss/reg": 0.0, "step": 39740 }, { "epoch": 0.26151315789473684, "grad_norm": 2.515625, "grad_norm_var": 0.16734619140625, "learning_rate": 0.0001, "loss": 3.0073, "loss/crossentropy": 2.394347441196442, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.2102888874709606, "loss/reg": 0.0, "step": 39750 }, { "epoch": 0.2615789473684211, "grad_norm": 2.125, "grad_norm_var": 0.2183990478515625, "learning_rate": 0.0001, "loss": 2.9586, "loss/crossentropy": 2.1268094956874846, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.20743194743990898, "loss/reg": 0.0, "step": 39760 }, { "epoch": 0.26164473684210526, "grad_norm": 2.640625, "grad_norm_var": 0.1597564697265625, "learning_rate": 0.0001, "loss": 2.9826, "loss/crossentropy": 2.3416704416275023, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.20019679591059686, "loss/reg": 0.0, "step": 39770 }, { "epoch": 0.26171052631578945, "grad_norm": 3.03125, "grad_norm_var": 0.08323160807291667, "learning_rate": 0.0001, "loss": 2.9672, "loss/crossentropy": 2.0245181202888487, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.21602422520518302, "loss/reg": 0.0, "step": 39780 }, { "epoch": 0.2617763157894737, "grad_norm": 2.515625, "grad_norm_var": 0.12693583170572917, "learning_rate": 0.0001, "loss": 3.0371, "loss/crossentropy": 2.2631885528564455, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.2282739758491516, "loss/reg": 0.0, "step": 39790 }, { "epoch": 0.2618421052631579, "grad_norm": 2.234375, "grad_norm_var": 0.0588287353515625, "learning_rate": 0.0001, "loss": 2.9485, "loss/crossentropy": 2.1909173846244814, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.22533988654613496, "loss/reg": 0.0, "step": 39800 }, { "epoch": 0.2619078947368421, "grad_norm": 1.9765625, "grad_norm_var": 0.07842178344726562, "learning_rate": 0.0001, "loss": 2.9508, "loss/crossentropy": 2.3490309596061705, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2864062897861004, "loss/reg": 0.0, "step": 39810 }, { "epoch": 0.2619736842105263, "grad_norm": 2.8125, "grad_norm_var": 0.08014500935872396, "learning_rate": 0.0001, "loss": 2.9507, "loss/crossentropy": 2.5319572567939757, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.2143532671034336, "loss/reg": 0.0, "step": 39820 }, { "epoch": 0.26203947368421054, "grad_norm": 1.96875, "grad_norm_var": 0.0728515625, "learning_rate": 0.0001, "loss": 2.9447, "loss/crossentropy": 2.350716245174408, "loss/hidden": 2.5578125, "loss/incoh": 0.0, "loss/logits": 0.21392515301704407, "loss/reg": 0.0, "step": 39830 }, { "epoch": 0.26210526315789473, "grad_norm": 2.3125, "grad_norm_var": 0.07763264973958334, "learning_rate": 0.0001, "loss": 2.9706, "loss/crossentropy": 2.2897894740104676, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.242967389523983, "loss/reg": 0.0, "step": 39840 }, { "epoch": 0.26217105263157897, "grad_norm": 2.34375, "grad_norm_var": 0.138525390625, "learning_rate": 0.0001, "loss": 3.0426, "loss/crossentropy": 2.309167265892029, "loss/hidden": 2.9640625, "loss/incoh": 0.0, "loss/logits": 0.2831707686185837, "loss/reg": 0.0, "step": 39850 }, { "epoch": 0.26223684210526316, "grad_norm": 2.453125, "grad_norm_var": 0.13242085774739584, "learning_rate": 0.0001, "loss": 3.0234, "loss/crossentropy": 2.3926889896392822, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.19550751000642777, "loss/reg": 0.0, "step": 39860 }, { "epoch": 0.26230263157894734, "grad_norm": 2.265625, "grad_norm_var": 0.028873697916666666, "learning_rate": 0.0001, "loss": 2.8699, "loss/crossentropy": 2.389265012741089, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.20654226988554, "loss/reg": 0.0, "step": 39870 }, { "epoch": 0.2623684210526316, "grad_norm": 2.3125, "grad_norm_var": 0.04534098307291667, "learning_rate": 0.0001, "loss": 2.9553, "loss/crossentropy": 2.052034729719162, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.20443758517503738, "loss/reg": 0.0, "step": 39880 }, { "epoch": 0.26243421052631577, "grad_norm": 2.25, "grad_norm_var": 0.06367899576822916, "learning_rate": 0.0001, "loss": 2.8898, "loss/crossentropy": 2.276283013820648, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.25378709435462954, "loss/reg": 0.0, "step": 39890 }, { "epoch": 0.2625, "grad_norm": 1.9609375, "grad_norm_var": 0.06096369425455729, "learning_rate": 0.0001, "loss": 2.9238, "loss/crossentropy": 2.3180172562599184, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.23101864904165267, "loss/reg": 0.0, "step": 39900 }, { "epoch": 0.2625657894736842, "grad_norm": 2.34375, "grad_norm_var": 0.1415911356608073, "learning_rate": 0.0001, "loss": 2.9487, "loss/crossentropy": 2.6349021911621096, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.27188998460769653, "loss/reg": 0.0, "step": 39910 }, { "epoch": 0.26263157894736844, "grad_norm": 2.3125, "grad_norm_var": 0.11770731608072917, "learning_rate": 0.0001, "loss": 2.8688, "loss/crossentropy": 2.276544678211212, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.20636183619499207, "loss/reg": 0.0, "step": 39920 }, { "epoch": 0.2626973684210526, "grad_norm": 2.1875, "grad_norm_var": 0.1878814697265625, "learning_rate": 0.0001, "loss": 3.023, "loss/crossentropy": 2.388458263874054, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.23089798912405968, "loss/reg": 0.0, "step": 39930 }, { "epoch": 0.26276315789473687, "grad_norm": 2.484375, "grad_norm_var": 0.14111226399739582, "learning_rate": 0.0001, "loss": 2.9919, "loss/crossentropy": 2.0341047286987304, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.21918695122003556, "loss/reg": 0.0, "step": 39940 }, { "epoch": 0.26282894736842105, "grad_norm": 3.1875, "grad_norm_var": 0.183544921875, "learning_rate": 0.0001, "loss": 3.0396, "loss/crossentropy": 2.3217403292655945, "loss/hidden": 2.95625, "loss/incoh": 0.0, "loss/logits": 0.26592089235782623, "loss/reg": 0.0, "step": 39950 }, { "epoch": 0.26289473684210524, "grad_norm": 2.140625, "grad_norm_var": 0.0977691650390625, "learning_rate": 0.0001, "loss": 2.9174, "loss/crossentropy": 2.55605833530426, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.2847170978784561, "loss/reg": 0.0, "step": 39960 }, { "epoch": 0.2629605263157895, "grad_norm": 2.5625, "grad_norm_var": 0.06572977701822917, "learning_rate": 0.0001, "loss": 2.9685, "loss/crossentropy": 1.9914783239364624, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.20305539071559905, "loss/reg": 0.0, "step": 39970 }, { "epoch": 0.26302631578947366, "grad_norm": 2.25, "grad_norm_var": 0.04635416666666667, "learning_rate": 0.0001, "loss": 2.9532, "loss/crossentropy": 2.478356397151947, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.2187245175242424, "loss/reg": 0.0, "step": 39980 }, { "epoch": 0.2630921052631579, "grad_norm": 2.3125, "grad_norm_var": 0.03802083333333333, "learning_rate": 0.0001, "loss": 2.9101, "loss/crossentropy": 2.4963066220283507, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.23317703753709793, "loss/reg": 0.0, "step": 39990 }, { "epoch": 0.2631578947368421, "grad_norm": 2.6875, "grad_norm_var": 0.10871480305989584, "learning_rate": 0.0001, "loss": 2.9937, "loss/crossentropy": 2.340533971786499, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.22518680542707442, "loss/reg": 0.0, "step": 40000 }, { "epoch": 0.26322368421052633, "grad_norm": 2.125, "grad_norm_var": 0.16312026977539062, "learning_rate": 0.0001, "loss": 2.9277, "loss/crossentropy": 2.350822412967682, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.21506368815898896, "loss/reg": 0.0, "step": 40010 }, { "epoch": 0.2632894736842105, "grad_norm": 2.421875, "grad_norm_var": 0.41668065388997394, "learning_rate": 0.0001, "loss": 3.0121, "loss/crossentropy": 2.249849808216095, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.23412341475486756, "loss/reg": 0.0, "step": 40020 }, { "epoch": 0.26335526315789476, "grad_norm": 2.09375, "grad_norm_var": 0.3884348551432292, "learning_rate": 0.0001, "loss": 2.983, "loss/crossentropy": 2.0109333872795103, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.1806010849773884, "loss/reg": 0.0, "step": 40030 }, { "epoch": 0.26342105263157894, "grad_norm": 2.40625, "grad_norm_var": 0.32890218098958335, "learning_rate": 0.0001, "loss": 2.9464, "loss/crossentropy": 1.981540322303772, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.23581981062889099, "loss/reg": 0.0, "step": 40040 }, { "epoch": 0.26348684210526313, "grad_norm": 2.359375, "grad_norm_var": 0.2914947509765625, "learning_rate": 0.0001, "loss": 2.9589, "loss/crossentropy": 2.1642192125320436, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.18157752826809884, "loss/reg": 0.0, "step": 40050 }, { "epoch": 0.26355263157894737, "grad_norm": 2.328125, "grad_norm_var": 0.33581441243489585, "learning_rate": 0.0001, "loss": 2.9202, "loss/crossentropy": 2.303639805316925, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.20302913039922715, "loss/reg": 0.0, "step": 40060 }, { "epoch": 0.26361842105263156, "grad_norm": 2.515625, "grad_norm_var": 0.34353841145833336, "learning_rate": 0.0001, "loss": 2.9513, "loss/crossentropy": 2.5005850553512574, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.20455448627471923, "loss/reg": 0.0, "step": 40070 }, { "epoch": 0.2636842105263158, "grad_norm": 2.375, "grad_norm_var": 0.12288309733072916, "learning_rate": 0.0001, "loss": 2.9947, "loss/crossentropy": 2.3203944087028505, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.20487026870250702, "loss/reg": 0.0, "step": 40080 }, { "epoch": 0.26375, "grad_norm": 2.4375, "grad_norm_var": 0.019074503580729166, "learning_rate": 0.0001, "loss": 2.9125, "loss/crossentropy": 1.9645262598991393, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.19660852625966072, "loss/reg": 0.0, "step": 40090 }, { "epoch": 0.2638157894736842, "grad_norm": 2.421875, "grad_norm_var": 0.2106842041015625, "learning_rate": 0.0001, "loss": 3.1066, "loss/crossentropy": 2.413771164417267, "loss/hidden": 3.1140625, "loss/incoh": 0.0, "loss/logits": 0.2495443567633629, "loss/reg": 0.0, "step": 40100 }, { "epoch": 0.2638815789473684, "grad_norm": 2.0, "grad_norm_var": 0.11607666015625, "learning_rate": 0.0001, "loss": 2.9617, "loss/crossentropy": 2.3317906260490417, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.22329760938882828, "loss/reg": 0.0, "step": 40110 }, { "epoch": 0.26394736842105265, "grad_norm": 2.234375, "grad_norm_var": 0.12685139973958334, "learning_rate": 0.0001, "loss": 2.9921, "loss/crossentropy": 2.3062703013420105, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.215774242579937, "loss/reg": 0.0, "step": 40120 }, { "epoch": 0.26401315789473684, "grad_norm": 2.203125, "grad_norm_var": 0.0352691650390625, "learning_rate": 0.0001, "loss": 2.9344, "loss/crossentropy": 2.034626638889313, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.19955597221851348, "loss/reg": 0.0, "step": 40130 }, { "epoch": 0.2640789473684211, "grad_norm": 2.28125, "grad_norm_var": 0.018317667643229167, "learning_rate": 0.0001, "loss": 2.89, "loss/crossentropy": 2.269369065761566, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.239308962225914, "loss/reg": 0.0, "step": 40140 }, { "epoch": 0.26414473684210527, "grad_norm": 2.640625, "grad_norm_var": 0.1079010009765625, "learning_rate": 0.0001, "loss": 3.0115, "loss/crossentropy": 2.4881121158599853, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.2211257502436638, "loss/reg": 0.0, "step": 40150 }, { "epoch": 0.26421052631578945, "grad_norm": 2.140625, "grad_norm_var": 0.12138264973958333, "learning_rate": 0.0001, "loss": 2.9284, "loss/crossentropy": 2.1556692004203795, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.19819557964801787, "loss/reg": 0.0, "step": 40160 }, { "epoch": 0.2642763157894737, "grad_norm": 2.6875, "grad_norm_var": 0.07958577473958334, "learning_rate": 0.0001, "loss": 2.9124, "loss/crossentropy": 2.3040011644363405, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.2509505271911621, "loss/reg": 0.0, "step": 40170 }, { "epoch": 0.2643421052631579, "grad_norm": 2.84375, "grad_norm_var": 0.06483968098958333, "learning_rate": 0.0001, "loss": 2.9537, "loss/crossentropy": 2.337058222293854, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.1980614498257637, "loss/reg": 0.0, "step": 40180 }, { "epoch": 0.2644078947368421, "grad_norm": 2.296875, "grad_norm_var": 0.04985326131184896, "learning_rate": 0.0001, "loss": 2.924, "loss/crossentropy": 2.4608359217643736, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.24846454709768295, "loss/reg": 0.0, "step": 40190 }, { "epoch": 0.2644736842105263, "grad_norm": 2.546875, "grad_norm_var": 0.042238108317057294, "learning_rate": 0.0001, "loss": 2.9214, "loss/crossentropy": 2.4272888004779816, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.23606432974338531, "loss/reg": 0.0, "step": 40200 }, { "epoch": 0.26453947368421055, "grad_norm": 2.140625, "grad_norm_var": 0.5006418863932292, "learning_rate": 0.0001, "loss": 3.0158, "loss/crossentropy": 2.244870090484619, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.24423955231904984, "loss/reg": 0.0, "step": 40210 }, { "epoch": 0.26460526315789473, "grad_norm": 3.078125, "grad_norm_var": 2.5122081410500963e+17, "learning_rate": 0.0001, "loss": 3.1304, "loss/crossentropy": 2.2134504556655883, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.27766663283109666, "loss/reg": 0.0, "step": 40220 }, { "epoch": 0.264671052631579, "grad_norm": 2.6875, "grad_norm_var": 2.5122081415565366e+17, "learning_rate": 0.0001, "loss": 2.8698, "loss/crossentropy": 2.6814637303352358, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.18843439370393752, "loss/reg": 0.0, "step": 40230 }, { "epoch": 0.26473684210526316, "grad_norm": 3.671875, "grad_norm_var": 0.25183003743489585, "learning_rate": 0.0001, "loss": 3.0176, "loss/crossentropy": 2.1788122773170473, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2455573059618473, "loss/reg": 0.0, "step": 40240 }, { "epoch": 0.26480263157894735, "grad_norm": 2.140625, "grad_norm_var": 0.296423085530599, "learning_rate": 0.0001, "loss": 2.9276, "loss/crossentropy": 2.112660551071167, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.22626257836818695, "loss/reg": 0.0, "step": 40250 }, { "epoch": 0.2648684210526316, "grad_norm": 2.421875, "grad_norm_var": 0.12968343098958332, "learning_rate": 0.0001, "loss": 2.9632, "loss/crossentropy": 2.1655852854251862, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.23819684684276582, "loss/reg": 0.0, "step": 40260 }, { "epoch": 0.26493421052631577, "grad_norm": 2.234375, "grad_norm_var": 0.07055562337239583, "learning_rate": 0.0001, "loss": 2.9289, "loss/crossentropy": 2.3028429448604584, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.22059553861618042, "loss/reg": 0.0, "step": 40270 }, { "epoch": 0.265, "grad_norm": 2.03125, "grad_norm_var": 0.14325764973958333, "learning_rate": 0.0001, "loss": 2.822, "loss/crossentropy": 2.0632973074913026, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.19669774621725084, "loss/reg": 0.0, "step": 40280 }, { "epoch": 0.2650657894736842, "grad_norm": 2.25, "grad_norm_var": 0.10683186848958333, "learning_rate": 0.0001, "loss": 2.8863, "loss/crossentropy": 2.056211602687836, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.18428613618016243, "loss/reg": 0.0, "step": 40290 }, { "epoch": 0.26513157894736844, "grad_norm": 2.125, "grad_norm_var": 0.0510894775390625, "learning_rate": 0.0001, "loss": 2.876, "loss/crossentropy": 2.7461586475372313, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.21496437042951583, "loss/reg": 0.0, "step": 40300 }, { "epoch": 0.2651973684210526, "grad_norm": 2.03125, "grad_norm_var": 0.03899332682291667, "learning_rate": 0.0001, "loss": 2.9584, "loss/crossentropy": 2.4342568159103393, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.2367442712187767, "loss/reg": 0.0, "step": 40310 }, { "epoch": 0.26526315789473687, "grad_norm": 2.625, "grad_norm_var": 0.05810139973958333, "learning_rate": 0.0001, "loss": 2.9497, "loss/crossentropy": 2.364444077014923, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.23197888135910033, "loss/reg": 0.0, "step": 40320 }, { "epoch": 0.26532894736842105, "grad_norm": 2.359375, "grad_norm_var": 0.053954060872395834, "learning_rate": 0.0001, "loss": 2.9413, "loss/crossentropy": 2.3223790407180784, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.2181138962507248, "loss/reg": 0.0, "step": 40330 }, { "epoch": 0.26539473684210524, "grad_norm": 2.671875, "grad_norm_var": 0.057027180989583336, "learning_rate": 0.0001, "loss": 2.9242, "loss/crossentropy": 2.3679453134536743, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.19444463551044464, "loss/reg": 0.0, "step": 40340 }, { "epoch": 0.2654605263157895, "grad_norm": 2.3125, "grad_norm_var": 0.032689412434895836, "learning_rate": 0.0001, "loss": 2.9138, "loss/crossentropy": 2.227433943748474, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.22437175512313842, "loss/reg": 0.0, "step": 40350 }, { "epoch": 0.26552631578947367, "grad_norm": 2.28125, "grad_norm_var": 0.026460774739583335, "learning_rate": 0.0001, "loss": 2.9298, "loss/crossentropy": 2.299468970298767, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.28592042773962023, "loss/reg": 0.0, "step": 40360 }, { "epoch": 0.2655921052631579, "grad_norm": 2.46875, "grad_norm_var": 0.0747467041015625, "learning_rate": 0.0001, "loss": 3.0098, "loss/crossentropy": 2.397823441028595, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.24638008177280427, "loss/reg": 0.0, "step": 40370 }, { "epoch": 0.2656578947368421, "grad_norm": 2.34375, "grad_norm_var": 0.100927734375, "learning_rate": 0.0001, "loss": 2.8985, "loss/crossentropy": 2.328071665763855, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.21325500309467316, "loss/reg": 0.0, "step": 40380 }, { "epoch": 0.26572368421052633, "grad_norm": 4.03125, "grad_norm_var": 0.20921122233072917, "learning_rate": 0.0001, "loss": 2.9711, "loss/crossentropy": 2.153274118900299, "loss/hidden": 2.93125, "loss/incoh": 0.0, "loss/logits": 0.2502943441271782, "loss/reg": 0.0, "step": 40390 }, { "epoch": 0.2657894736842105, "grad_norm": 2.265625, "grad_norm_var": 0.20421549479166667, "learning_rate": 0.0001, "loss": 2.9365, "loss/crossentropy": 2.2486332654953003, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2196296200156212, "loss/reg": 0.0, "step": 40400 }, { "epoch": 0.26585526315789476, "grad_norm": 2.5, "grad_norm_var": 0.042333984375, "learning_rate": 0.0001, "loss": 3.003, "loss/crossentropy": 2.5074538946151734, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2675335884094238, "loss/reg": 0.0, "step": 40410 }, { "epoch": 0.26592105263157895, "grad_norm": 2.765625, "grad_norm_var": 0.11083984375, "learning_rate": 0.0001, "loss": 2.9331, "loss/crossentropy": 2.0382320880889893, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.20733923465013504, "loss/reg": 0.0, "step": 40420 }, { "epoch": 0.26598684210526313, "grad_norm": 3.0, "grad_norm_var": 0.13010965983072917, "learning_rate": 0.0001, "loss": 2.9417, "loss/crossentropy": 2.348378622531891, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.2281837359070778, "loss/reg": 0.0, "step": 40430 }, { "epoch": 0.2660526315789474, "grad_norm": 2.640625, "grad_norm_var": 0.11596577962239583, "learning_rate": 0.0001, "loss": 2.967, "loss/crossentropy": 2.0158197045326234, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.18082055151462556, "loss/reg": 0.0, "step": 40440 }, { "epoch": 0.26611842105263156, "grad_norm": 2.3125, "grad_norm_var": 0.07532552083333334, "learning_rate": 0.0001, "loss": 2.8778, "loss/crossentropy": 2.191635239124298, "loss/hidden": 2.5546875, "loss/incoh": 0.0, "loss/logits": 0.17337005734443664, "loss/reg": 0.0, "step": 40450 }, { "epoch": 0.2661842105263158, "grad_norm": 3.109375, "grad_norm_var": 0.1762603759765625, "learning_rate": 0.0001, "loss": 2.9873, "loss/crossentropy": 2.3875040888786314, "loss/hidden": 2.5734375, "loss/incoh": 0.0, "loss/logits": 0.2097866028547287, "loss/reg": 0.0, "step": 40460 }, { "epoch": 0.26625, "grad_norm": 2.15625, "grad_norm_var": 0.2105133056640625, "learning_rate": 0.0001, "loss": 2.9557, "loss/crossentropy": 2.3220904231071473, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.208808983117342, "loss/reg": 0.0, "step": 40470 }, { "epoch": 0.26631578947368423, "grad_norm": 2.765625, "grad_norm_var": 0.3130442301432292, "learning_rate": 0.0001, "loss": 2.9918, "loss/crossentropy": 2.353991484642029, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.217716945707798, "loss/reg": 0.0, "step": 40480 }, { "epoch": 0.2663815789473684, "grad_norm": 2.71875, "grad_norm_var": 0.2750885009765625, "learning_rate": 0.0001, "loss": 2.9293, "loss/crossentropy": 2.01134432554245, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.22038960456848145, "loss/reg": 0.0, "step": 40490 }, { "epoch": 0.26644736842105265, "grad_norm": 2.171875, "grad_norm_var": 0.10012105305989584, "learning_rate": 0.0001, "loss": 2.914, "loss/crossentropy": 2.506447398662567, "loss/hidden": 2.5890625, "loss/incoh": 0.0, "loss/logits": 0.22289757877588273, "loss/reg": 0.0, "step": 40500 }, { "epoch": 0.26651315789473684, "grad_norm": 2.34375, "grad_norm_var": 0.08028335571289062, "learning_rate": 0.0001, "loss": 2.9048, "loss/crossentropy": 2.4329306721687316, "loss/hidden": 2.58125, "loss/incoh": 0.0, "loss/logits": 0.2071765571832657, "loss/reg": 0.0, "step": 40510 }, { "epoch": 0.266578947368421, "grad_norm": 2.5, "grad_norm_var": 0.24204813639322917, "learning_rate": 0.0001, "loss": 2.9498, "loss/crossentropy": 2.028571844100952, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.21152863651514053, "loss/reg": 0.0, "step": 40520 }, { "epoch": 0.26664473684210527, "grad_norm": 2.390625, "grad_norm_var": 0.228076171875, "learning_rate": 0.0001, "loss": 2.9253, "loss/crossentropy": 2.212118911743164, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.25934153944253924, "loss/reg": 0.0, "step": 40530 }, { "epoch": 0.26671052631578945, "grad_norm": 2.09375, "grad_norm_var": 5.0841417579530374e+17, "learning_rate": 0.0001, "loss": 3.1236, "loss/crossentropy": 2.37177095413208, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.3150399401783943, "loss/reg": 0.0, "step": 40540 }, { "epoch": 0.2667763157894737, "grad_norm": 2.34375, "grad_norm_var": 5.084141758719918e+17, "learning_rate": 0.0001, "loss": 2.8429, "loss/crossentropy": 2.155189025402069, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.18673713505268097, "loss/reg": 0.0, "step": 40550 }, { "epoch": 0.2668421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.05227457682291667, "learning_rate": 0.0001, "loss": 2.8727, "loss/crossentropy": 2.3162846088409426, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.22148830592632293, "loss/reg": 0.0, "step": 40560 }, { "epoch": 0.2669078947368421, "grad_norm": 2.96875, "grad_norm_var": 0.08015950520833333, "learning_rate": 0.0001, "loss": 2.9363, "loss/crossentropy": 2.414511263370514, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.23514206409454347, "loss/reg": 0.0, "step": 40570 }, { "epoch": 0.2669736842105263, "grad_norm": 3.046875, "grad_norm_var": 0.2453765869140625, "learning_rate": 0.0001, "loss": 3.0091, "loss/crossentropy": 2.2697977662086486, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.23296948075294494, "loss/reg": 0.0, "step": 40580 }, { "epoch": 0.26703947368421055, "grad_norm": 2.59375, "grad_norm_var": 0.2507120768229167, "learning_rate": 0.0001, "loss": 2.9473, "loss/crossentropy": 2.516921269893646, "loss/hidden": 2.571875, "loss/incoh": 0.0, "loss/logits": 0.19809878170490264, "loss/reg": 0.0, "step": 40590 }, { "epoch": 0.26710526315789473, "grad_norm": 2.375, "grad_norm_var": 0.08453776041666666, "learning_rate": 0.0001, "loss": 2.9537, "loss/crossentropy": 2.383486819267273, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.23753664493560792, "loss/reg": 0.0, "step": 40600 }, { "epoch": 0.2671710526315789, "grad_norm": 2.28125, "grad_norm_var": 0.1238433837890625, "learning_rate": 0.0001, "loss": 2.9164, "loss/crossentropy": 2.318392050266266, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.2057785451412201, "loss/reg": 0.0, "step": 40610 }, { "epoch": 0.26723684210526316, "grad_norm": 2.578125, "grad_norm_var": 0.0269927978515625, "learning_rate": 0.0001, "loss": 2.9287, "loss/crossentropy": 2.278804612159729, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.29213285595178606, "loss/reg": 0.0, "step": 40620 }, { "epoch": 0.26730263157894735, "grad_norm": 2.515625, "grad_norm_var": 0.03610026041666667, "learning_rate": 0.0001, "loss": 2.9505, "loss/crossentropy": 2.2450153946876528, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.32442753612995145, "loss/reg": 0.0, "step": 40630 }, { "epoch": 0.2673684210526316, "grad_norm": 2.328125, "grad_norm_var": 0.023241170247395835, "learning_rate": 0.0001, "loss": 2.8864, "loss/crossentropy": 2.1880478858947754, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.20965700894594191, "loss/reg": 0.0, "step": 40640 }, { "epoch": 0.2674342105263158, "grad_norm": 2.40625, "grad_norm_var": 0.044482421875, "learning_rate": 0.0001, "loss": 2.992, "loss/crossentropy": 2.3512598276138306, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.23798110634088515, "loss/reg": 0.0, "step": 40650 }, { "epoch": 0.2675, "grad_norm": 2.671875, "grad_norm_var": 0.13796284993489583, "learning_rate": 0.0001, "loss": 3.0364, "loss/crossentropy": 1.8830746769905091, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.1743113711476326, "loss/reg": 0.0, "step": 40660 }, { "epoch": 0.2675657894736842, "grad_norm": 2.234375, "grad_norm_var": 0.12076416015625, "learning_rate": 0.0001, "loss": 2.996, "loss/crossentropy": 2.290823459625244, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.21459258496761321, "loss/reg": 0.0, "step": 40670 }, { "epoch": 0.26763157894736844, "grad_norm": 2.5625, "grad_norm_var": 0.12794596354166668, "learning_rate": 0.0001, "loss": 2.9489, "loss/crossentropy": 2.2376190185546876, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.21406567543745042, "loss/reg": 0.0, "step": 40680 }, { "epoch": 0.26769736842105263, "grad_norm": 2.59375, "grad_norm_var": 0.08486328125, "learning_rate": 0.0001, "loss": 3.0225, "loss/crossentropy": 2.2353453993797303, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.2258991152048111, "loss/reg": 0.0, "step": 40690 }, { "epoch": 0.26776315789473687, "grad_norm": 2.625, "grad_norm_var": 0.03912353515625, "learning_rate": 0.0001, "loss": 2.9277, "loss/crossentropy": 2.409557521343231, "loss/hidden": 3.0, "loss/incoh": 0.0, "loss/logits": 0.22400896847248078, "loss/reg": 0.0, "step": 40700 }, { "epoch": 0.26782894736842106, "grad_norm": 2.046875, "grad_norm_var": 0.0279296875, "learning_rate": 0.0001, "loss": 2.919, "loss/crossentropy": 2.3743964195251466, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.2226839616894722, "loss/reg": 0.0, "step": 40710 }, { "epoch": 0.26789473684210524, "grad_norm": 2.0, "grad_norm_var": 0.0439361572265625, "learning_rate": 0.0001, "loss": 2.8685, "loss/crossentropy": 2.36965035200119, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.22511910051107406, "loss/reg": 0.0, "step": 40720 }, { "epoch": 0.2679605263157895, "grad_norm": 3.96875, "grad_norm_var": 0.22343343098958332, "learning_rate": 0.0001, "loss": 3.0209, "loss/crossentropy": 2.3880274653434754, "loss/hidden": 3.08125, "loss/incoh": 0.0, "loss/logits": 0.308881114423275, "loss/reg": 0.0, "step": 40730 }, { "epoch": 0.26802631578947367, "grad_norm": 2.109375, "grad_norm_var": 0.4930826822916667, "learning_rate": 0.0001, "loss": 2.9862, "loss/crossentropy": 2.565578031539917, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.20404565930366517, "loss/reg": 0.0, "step": 40740 }, { "epoch": 0.2680921052631579, "grad_norm": 2.234375, "grad_norm_var": 0.3862213134765625, "learning_rate": 0.0001, "loss": 2.9592, "loss/crossentropy": 2.4117432594299317, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.2140928290784359, "loss/reg": 0.0, "step": 40750 }, { "epoch": 0.2681578947368421, "grad_norm": 2.125, "grad_norm_var": 0.14703776041666666, "learning_rate": 0.0001, "loss": 2.9877, "loss/crossentropy": 2.48085355758667, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.22400564551353455, "loss/reg": 0.0, "step": 40760 }, { "epoch": 0.26822368421052634, "grad_norm": 2.71875, "grad_norm_var": 0.6468424479166667, "learning_rate": 0.0001, "loss": 2.9328, "loss/crossentropy": 2.3089797854423524, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.24400631934404374, "loss/reg": 0.0, "step": 40770 }, { "epoch": 0.2682894736842105, "grad_norm": 3.46875, "grad_norm_var": 0.20162124633789064, "learning_rate": 0.0001, "loss": 2.9293, "loss/crossentropy": 2.379192590713501, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.2283795580267906, "loss/reg": 0.0, "step": 40780 }, { "epoch": 0.26835526315789476, "grad_norm": 2.5625, "grad_norm_var": 0.19916763305664062, "learning_rate": 0.0001, "loss": 2.9355, "loss/crossentropy": 2.335457980632782, "loss/hidden": 3.0125, "loss/incoh": 0.0, "loss/logits": 0.22422940135002137, "loss/reg": 0.0, "step": 40790 }, { "epoch": 0.26842105263157895, "grad_norm": 3.03125, "grad_norm_var": 0.09563395182291666, "learning_rate": 0.0001, "loss": 3.0224, "loss/crossentropy": 2.1988858938217164, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.20501140356063843, "loss/reg": 0.0, "step": 40800 }, { "epoch": 0.26848684210526313, "grad_norm": 2.15625, "grad_norm_var": 1.3167551676432292, "learning_rate": 0.0001, "loss": 2.9471, "loss/crossentropy": 2.385665547847748, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.20895608514547348, "loss/reg": 0.0, "step": 40810 }, { "epoch": 0.2685526315789474, "grad_norm": 2.0, "grad_norm_var": 1.34332275390625, "learning_rate": 0.0001, "loss": 2.8408, "loss/crossentropy": 2.37712676525116, "loss/hidden": 2.5640625, "loss/incoh": 0.0, "loss/logits": 0.19455353766679764, "loss/reg": 0.0, "step": 40820 }, { "epoch": 0.26861842105263156, "grad_norm": 2.34375, "grad_norm_var": 0.06674702962239583, "learning_rate": 0.0001, "loss": 2.9378, "loss/crossentropy": 2.4048449754714967, "loss/hidden": 2.56875, "loss/incoh": 0.0, "loss/logits": 0.19571173936128616, "loss/reg": 0.0, "step": 40830 }, { "epoch": 0.2686842105263158, "grad_norm": 2.34375, "grad_norm_var": 0.049559529622395834, "learning_rate": 0.0001, "loss": 2.9492, "loss/crossentropy": 2.1990068197250365, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.2072514608502388, "loss/reg": 0.0, "step": 40840 }, { "epoch": 0.26875, "grad_norm": 3.046875, "grad_norm_var": 0.27886962890625, "learning_rate": 0.0001, "loss": 2.9642, "loss/crossentropy": 2.507074761390686, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.2254748597741127, "loss/reg": 0.0, "step": 40850 }, { "epoch": 0.26881578947368423, "grad_norm": 2.234375, "grad_norm_var": 0.35652567545572916, "learning_rate": 0.0001, "loss": 2.9497, "loss/crossentropy": 2.425125169754028, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.21821437627077103, "loss/reg": 0.0, "step": 40860 }, { "epoch": 0.2688815789473684, "grad_norm": 2.234375, "grad_norm_var": 0.22220052083333333, "learning_rate": 0.0001, "loss": 2.9509, "loss/crossentropy": 2.5203884124755858, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.2379711538553238, "loss/reg": 0.0, "step": 40870 }, { "epoch": 0.26894736842105266, "grad_norm": 2.78125, "grad_norm_var": 1.55201416015625, "learning_rate": 0.0001, "loss": 3.0554, "loss/crossentropy": 2.107471191883087, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.21151991412043572, "loss/reg": 0.0, "step": 40880 }, { "epoch": 0.26901315789473684, "grad_norm": 2.09375, "grad_norm_var": 1.5634674072265624, "learning_rate": 0.0001, "loss": 2.9029, "loss/crossentropy": 2.505987584590912, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.2121686741709709, "loss/reg": 0.0, "step": 40890 }, { "epoch": 0.26907894736842103, "grad_norm": 2.0, "grad_norm_var": 0.168017578125, "learning_rate": 0.0001, "loss": 3.0268, "loss/crossentropy": 2.22397176027298, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2620822206139565, "loss/reg": 0.0, "step": 40900 }, { "epoch": 0.26914473684210527, "grad_norm": 2.21875, "grad_norm_var": 0.12713114420572916, "learning_rate": 0.0001, "loss": 2.9578, "loss/crossentropy": 2.23619726896286, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.231631501019001, "loss/reg": 0.0, "step": 40910 }, { "epoch": 0.26921052631578946, "grad_norm": 2.34375, "grad_norm_var": 0.14329020182291666, "learning_rate": 0.0001, "loss": 2.9898, "loss/crossentropy": 2.1167128562927244, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.1906721480190754, "loss/reg": 0.0, "step": 40920 }, { "epoch": 0.2692763157894737, "grad_norm": 2.84375, "grad_norm_var": 0.11011962890625, "learning_rate": 0.0001, "loss": 2.9878, "loss/crossentropy": 2.421756899356842, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.21042222082614898, "loss/reg": 0.0, "step": 40930 }, { "epoch": 0.2693421052631579, "grad_norm": 2.546875, "grad_norm_var": 0.11083577473958334, "learning_rate": 0.0001, "loss": 2.9642, "loss/crossentropy": 2.419151175022125, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.27305717021226883, "loss/reg": 0.0, "step": 40940 }, { "epoch": 0.2694078947368421, "grad_norm": 2.28125, "grad_norm_var": 0.4359537760416667, "learning_rate": 0.0001, "loss": 2.9801, "loss/crossentropy": 2.2244869232177735, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.2133845791220665, "loss/reg": 0.0, "step": 40950 }, { "epoch": 0.2694736842105263, "grad_norm": 2.96875, "grad_norm_var": 0.11788736979166667, "learning_rate": 0.0001, "loss": 2.9162, "loss/crossentropy": 1.7888985753059388, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.1832651600241661, "loss/reg": 0.0, "step": 40960 }, { "epoch": 0.26953947368421055, "grad_norm": 2.390625, "grad_norm_var": 0.11350504557291667, "learning_rate": 0.0001, "loss": 2.9968, "loss/crossentropy": 2.229509103298187, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2455837309360504, "loss/reg": 0.0, "step": 40970 }, { "epoch": 0.26960526315789474, "grad_norm": 2.71875, "grad_norm_var": 0.09237874348958333, "learning_rate": 0.0001, "loss": 2.9161, "loss/crossentropy": 2.3413291692733766, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.23349182158708573, "loss/reg": 0.0, "step": 40980 }, { "epoch": 0.2696710526315789, "grad_norm": 3.0625, "grad_norm_var": 0.1189361572265625, "learning_rate": 0.0001, "loss": 2.933, "loss/crossentropy": 2.1656792044639586, "loss/hidden": 2.546875, "loss/incoh": 0.0, "loss/logits": 0.1818828582763672, "loss/reg": 0.0, "step": 40990 }, { "epoch": 0.26973684210526316, "grad_norm": 2.84375, "grad_norm_var": 1.1934733072916666, "learning_rate": 0.0001, "loss": 2.9504, "loss/crossentropy": 2.5566773414611816, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.2236784353852272, "loss/reg": 0.0, "step": 41000 }, { "epoch": 0.26980263157894735, "grad_norm": 2.65625, "grad_norm_var": 0.6011627197265625, "learning_rate": 0.0001, "loss": 2.9097, "loss/crossentropy": 2.409429597854614, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.2192213535308838, "loss/reg": 0.0, "step": 41010 }, { "epoch": 0.2698684210526316, "grad_norm": 2.453125, "grad_norm_var": 0.09265034993489583, "learning_rate": 0.0001, "loss": 2.9051, "loss/crossentropy": 2.3679562330245973, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.21384229212999345, "loss/reg": 0.0, "step": 41020 }, { "epoch": 0.2699342105263158, "grad_norm": 2.28125, "grad_norm_var": 0.11223551432291666, "learning_rate": 0.0001, "loss": 2.8788, "loss/crossentropy": 2.2784803986549376, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.20671008080244063, "loss/reg": 0.0, "step": 41030 }, { "epoch": 0.27, "grad_norm": 2.578125, "grad_norm_var": 0.06103108723958333, "learning_rate": 0.0001, "loss": 2.8818, "loss/crossentropy": 2.073737895488739, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.19423705488443374, "loss/reg": 0.0, "step": 41040 }, { "epoch": 0.2700657894736842, "grad_norm": 2.453125, "grad_norm_var": 0.0287017822265625, "learning_rate": 0.0001, "loss": 2.9054, "loss/crossentropy": 2.3877384781837465, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.2506763800978661, "loss/reg": 0.0, "step": 41050 }, { "epoch": 0.27013157894736844, "grad_norm": 2.390625, "grad_norm_var": 0.023714192708333335, "learning_rate": 0.0001, "loss": 2.9184, "loss/crossentropy": 2.3060161471366882, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.22390931099653244, "loss/reg": 0.0, "step": 41060 }, { "epoch": 0.27019736842105263, "grad_norm": 2.5, "grad_norm_var": 32.44180399576823, "learning_rate": 0.0001, "loss": 2.8509, "loss/crossentropy": 2.4271527886390687, "loss/hidden": 2.6171875, "loss/incoh": 0.0, "loss/logits": 0.20195934623479844, "loss/reg": 0.0, "step": 41070 }, { "epoch": 0.2702631578947368, "grad_norm": 2.6875, "grad_norm_var": 0.23207906087239583, "learning_rate": 0.0001, "loss": 2.9049, "loss/crossentropy": 2.0002416908740996, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.19162290468811988, "loss/reg": 0.0, "step": 41080 }, { "epoch": 0.27032894736842106, "grad_norm": 2.109375, "grad_norm_var": 0.180712890625, "learning_rate": 0.0001, "loss": 2.8292, "loss/crossentropy": 2.3094120621681213, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.23437123894691467, "loss/reg": 0.0, "step": 41090 }, { "epoch": 0.27039473684210524, "grad_norm": 2.671875, "grad_norm_var": 0.11016337076822917, "learning_rate": 0.0001, "loss": 2.9111, "loss/crossentropy": 2.247733747959137, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.21751733124256134, "loss/reg": 0.0, "step": 41100 }, { "epoch": 0.2704605263157895, "grad_norm": 2.078125, "grad_norm_var": 0.12379150390625, "learning_rate": 0.0001, "loss": 2.8917, "loss/crossentropy": 2.0373709321022035, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.18501856476068496, "loss/reg": 0.0, "step": 41110 }, { "epoch": 0.27052631578947367, "grad_norm": 2.890625, "grad_norm_var": 0.06482747395833334, "learning_rate": 0.0001, "loss": 2.8237, "loss/crossentropy": 2.184122359752655, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.19761546701192856, "loss/reg": 0.0, "step": 41120 }, { "epoch": 0.2705921052631579, "grad_norm": 2.484375, "grad_norm_var": 0.20320536295572916, "learning_rate": 0.0001, "loss": 2.8814, "loss/crossentropy": 2.1371851563453674, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.18069906309247016, "loss/reg": 0.0, "step": 41130 }, { "epoch": 0.2706578947368421, "grad_norm": 2.0625, "grad_norm_var": 0.17932840983072917, "learning_rate": 0.0001, "loss": 2.8801, "loss/crossentropy": 2.380477821826935, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.1982297345995903, "loss/reg": 0.0, "step": 41140 }, { "epoch": 0.27072368421052634, "grad_norm": 2.28125, "grad_norm_var": 0.07521158854166667, "learning_rate": 0.0001, "loss": 2.8606, "loss/crossentropy": 2.3723392605781557, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.21119217127561568, "loss/reg": 0.0, "step": 41150 }, { "epoch": 0.2707894736842105, "grad_norm": 3.3125, "grad_norm_var": 0.5795206705729167, "learning_rate": 0.0001, "loss": 2.9998, "loss/crossentropy": 2.1229013562202455, "loss/hidden": 2.5734375, "loss/incoh": 0.0, "loss/logits": 0.2063151404261589, "loss/reg": 0.0, "step": 41160 }, { "epoch": 0.2708552631578947, "grad_norm": 2.328125, "grad_norm_var": 0.2788238525390625, "learning_rate": 0.0001, "loss": 2.8526, "loss/crossentropy": 2.178604793548584, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.1873784750699997, "loss/reg": 0.0, "step": 41170 }, { "epoch": 0.27092105263157895, "grad_norm": 2.125, "grad_norm_var": 0.15403645833333332, "learning_rate": 0.0001, "loss": 2.8594, "loss/crossentropy": 2.2511882066726683, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.2070169784128666, "loss/reg": 0.0, "step": 41180 }, { "epoch": 0.27098684210526314, "grad_norm": 2.78125, "grad_norm_var": 0.12288411458333333, "learning_rate": 0.0001, "loss": 2.8826, "loss/crossentropy": 2.1874929666519165, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.20047285854816438, "loss/reg": 0.0, "step": 41190 }, { "epoch": 0.2710526315789474, "grad_norm": 2197815296.0, "grad_norm_var": 3.018995039238204e+17, "learning_rate": 0.0001, "loss": 3.1046, "loss/crossentropy": 2.145432484149933, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.21647956371307372, "loss/reg": 0.0, "step": 41200 }, { "epoch": 0.27111842105263156, "grad_norm": 2.5, "grad_norm_var": 3.018995040093864e+17, "learning_rate": 0.0001, "loss": 2.8322, "loss/crossentropy": 2.4048340201377867, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.2009089097380638, "loss/reg": 0.0, "step": 41210 }, { "epoch": 0.2711842105263158, "grad_norm": 2.6875, "grad_norm_var": 0.033121744791666664, "learning_rate": 0.0001, "loss": 2.8765, "loss/crossentropy": 1.878375419974327, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.23517074361443518, "loss/reg": 0.0, "step": 41220 }, { "epoch": 0.27125, "grad_norm": 2.84375, "grad_norm_var": 1.1466105143229166, "learning_rate": 0.0001, "loss": 2.9121, "loss/crossentropy": 2.480013167858124, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.23036694675683975, "loss/reg": 0.0, "step": 41230 }, { "epoch": 0.27131578947368423, "grad_norm": 2.640625, "grad_norm_var": 0.30252176920572915, "learning_rate": 0.0001, "loss": 2.9135, "loss/crossentropy": 2.232916295528412, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.19655855149030685, "loss/reg": 0.0, "step": 41240 }, { "epoch": 0.2713815789473684, "grad_norm": 2.796875, "grad_norm_var": 0.12746988932291667, "learning_rate": 0.0001, "loss": 2.956, "loss/crossentropy": 2.2043753027915955, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.1965479537844658, "loss/reg": 0.0, "step": 41250 }, { "epoch": 0.2714473684210526, "grad_norm": 2.15625, "grad_norm_var": 0.2058013916015625, "learning_rate": 0.0001, "loss": 2.8575, "loss/crossentropy": 2.406782853603363, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.20034269541501998, "loss/reg": 0.0, "step": 41260 }, { "epoch": 0.27151315789473685, "grad_norm": 2.1875, "grad_norm_var": 0.03121337890625, "learning_rate": 0.0001, "loss": 2.8778, "loss/crossentropy": 2.1481690287590025, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.19364384561777115, "loss/reg": 0.0, "step": 41270 }, { "epoch": 0.27157894736842103, "grad_norm": 2.609375, "grad_norm_var": 0.041258748372395834, "learning_rate": 0.0001, "loss": 2.9107, "loss/crossentropy": 2.4759095549583434, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.22636631429195403, "loss/reg": 0.0, "step": 41280 }, { "epoch": 0.27164473684210527, "grad_norm": 2.515625, "grad_norm_var": 0.0617584228515625, "learning_rate": 0.0001, "loss": 2.8954, "loss/crossentropy": 2.1929529905319214, "loss/hidden": 2.5734375, "loss/incoh": 0.0, "loss/logits": 0.19717235416173934, "loss/reg": 0.0, "step": 41290 }, { "epoch": 0.27171052631578946, "grad_norm": 2.328125, "grad_norm_var": 0.06896870930989583, "learning_rate": 0.0001, "loss": 2.9087, "loss/crossentropy": 2.3081828951835632, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.20817425847053528, "loss/reg": 0.0, "step": 41300 }, { "epoch": 0.2717763157894737, "grad_norm": 2.0625, "grad_norm_var": 0.10899149576822917, "learning_rate": 0.0001, "loss": 2.9758, "loss/crossentropy": 2.204688477516174, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.20805404484272003, "loss/reg": 0.0, "step": 41310 }, { "epoch": 0.2718421052631579, "grad_norm": 2.6875, "grad_norm_var": 0.1543853759765625, "learning_rate": 0.0001, "loss": 2.9371, "loss/crossentropy": 2.2314152359962462, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.20818763971328735, "loss/reg": 0.0, "step": 41320 }, { "epoch": 0.2719078947368421, "grad_norm": 2.6875, "grad_norm_var": 0.44286702473958334, "learning_rate": 0.0001, "loss": 2.9377, "loss/crossentropy": 2.1831210136413572, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.19648245871067047, "loss/reg": 0.0, "step": 41330 }, { "epoch": 0.2719736842105263, "grad_norm": 2.6875, "grad_norm_var": 0.4179026285807292, "learning_rate": 0.0001, "loss": 2.9205, "loss/crossentropy": 2.2966633677482604, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.19041728973388672, "loss/reg": 0.0, "step": 41340 }, { "epoch": 0.27203947368421055, "grad_norm": 2.640625, "grad_norm_var": 0.09402567545572917, "learning_rate": 0.0001, "loss": 2.8849, "loss/crossentropy": 2.185028040409088, "loss/hidden": 2.5890625, "loss/incoh": 0.0, "loss/logits": 0.18163363635540009, "loss/reg": 0.0, "step": 41350 }, { "epoch": 0.27210526315789474, "grad_norm": 2.4375, "grad_norm_var": 0.08332926432291667, "learning_rate": 0.0001, "loss": 2.8645, "loss/crossentropy": 2.4740115880966185, "loss/hidden": 2.590625, "loss/incoh": 0.0, "loss/logits": 0.21196720749139786, "loss/reg": 0.0, "step": 41360 }, { "epoch": 0.2721710526315789, "grad_norm": 2.390625, "grad_norm_var": 0.03287353515625, "learning_rate": 0.0001, "loss": 2.8955, "loss/crossentropy": 2.087357831001282, "loss/hidden": 2.5609375, "loss/incoh": 0.0, "loss/logits": 0.19198780804872512, "loss/reg": 0.0, "step": 41370 }, { "epoch": 0.27223684210526317, "grad_norm": 2.65625, "grad_norm_var": 0.05615946451822917, "learning_rate": 0.0001, "loss": 2.8919, "loss/crossentropy": 2.31329106092453, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.2663746111094952, "loss/reg": 0.0, "step": 41380 }, { "epoch": 0.27230263157894735, "grad_norm": 2.484375, "grad_norm_var": 3.3502559027076794e+17, "learning_rate": 0.0001, "loss": 3.0541, "loss/crossentropy": 1.9595822080969811, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.20186286009848117, "loss/reg": 0.0, "step": 41390 }, { "epoch": 0.2723684210526316, "grad_norm": 2.375, "grad_norm_var": 3.350255902780031e+17, "learning_rate": 0.0001, "loss": 2.8766, "loss/crossentropy": 2.352650213241577, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.20668166875839233, "loss/reg": 0.0, "step": 41400 }, { "epoch": 0.2724342105263158, "grad_norm": 2.625, "grad_norm_var": 0.14541727701822918, "learning_rate": 0.0001, "loss": 2.8439, "loss/crossentropy": 1.7044901728630066, "loss/hidden": 2.4921875, "loss/incoh": 0.0, "loss/logits": 0.14977376386523247, "loss/reg": 0.0, "step": 41410 }, { "epoch": 0.2725, "grad_norm": 2.5625, "grad_norm_var": 0.0694976806640625, "learning_rate": 0.0001, "loss": 2.8521, "loss/crossentropy": 2.1086192607879637, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.2395867094397545, "loss/reg": 0.0, "step": 41420 }, { "epoch": 0.2725657894736842, "grad_norm": 2.421875, "grad_norm_var": 0.03229878743489583, "learning_rate": 0.0001, "loss": 2.9257, "loss/crossentropy": 2.249106192588806, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.22158748805522918, "loss/reg": 0.0, "step": 41430 }, { "epoch": 0.27263157894736845, "grad_norm": 2.3125, "grad_norm_var": 0.019660441080729167, "learning_rate": 0.0001, "loss": 2.8336, "loss/crossentropy": 2.244483399391174, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.18955016806721686, "loss/reg": 0.0, "step": 41440 }, { "epoch": 0.27269736842105263, "grad_norm": 2.625, "grad_norm_var": 0.055720011393229164, "learning_rate": 0.0001, "loss": 2.9039, "loss/crossentropy": 2.172132980823517, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.23444610685110093, "loss/reg": 0.0, "step": 41450 }, { "epoch": 0.2727631578947368, "grad_norm": 3.0625, "grad_norm_var": 0.060693359375, "learning_rate": 0.0001, "loss": 2.8647, "loss/crossentropy": 2.5057065963745115, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.21957635134458542, "loss/reg": 0.0, "step": 41460 }, { "epoch": 0.27282894736842106, "grad_norm": 4.28125, "grad_norm_var": 0.2601064046223958, "learning_rate": 0.0001, "loss": 2.9177, "loss/crossentropy": 2.511070191860199, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.21949827820062637, "loss/reg": 0.0, "step": 41470 }, { "epoch": 0.27289473684210525, "grad_norm": 2.375, "grad_norm_var": 0.2640940348307292, "learning_rate": 0.0001, "loss": 2.9167, "loss/crossentropy": 2.1745970487594604, "loss/hidden": 2.5734375, "loss/incoh": 0.0, "loss/logits": 0.19311966225504876, "loss/reg": 0.0, "step": 41480 }, { "epoch": 0.2729605263157895, "grad_norm": 2.1875, "grad_norm_var": 0.05781148274739583, "learning_rate": 0.0001, "loss": 2.8882, "loss/crossentropy": 2.2907721221446993, "loss/hidden": 2.4390625, "loss/incoh": 0.0, "loss/logits": 0.16826940402388574, "loss/reg": 0.0, "step": 41490 }, { "epoch": 0.2730263157894737, "grad_norm": 2.421875, "grad_norm_var": 0.1280670166015625, "learning_rate": 0.0001, "loss": 2.9178, "loss/crossentropy": 2.403340423107147, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.22561162263154982, "loss/reg": 0.0, "step": 41500 }, { "epoch": 0.2730921052631579, "grad_norm": 2.734375, "grad_norm_var": 0.07307535807291667, "learning_rate": 0.0001, "loss": 2.8806, "loss/crossentropy": 2.1145091533660887, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.22574817836284639, "loss/reg": 0.0, "step": 41510 }, { "epoch": 0.2731578947368421, "grad_norm": 3.421875, "grad_norm_var": 0.14630533854166666, "learning_rate": 0.0001, "loss": 2.9351, "loss/crossentropy": 2.1064493060112, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.22888276129961013, "loss/reg": 0.0, "step": 41520 }, { "epoch": 0.27322368421052634, "grad_norm": 2.546875, "grad_norm_var": 0.24026590983072918, "learning_rate": 0.0001, "loss": 2.8871, "loss/crossentropy": 2.31835732460022, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.20172006785869598, "loss/reg": 0.0, "step": 41530 }, { "epoch": 0.2732894736842105, "grad_norm": 2.546875, "grad_norm_var": 0.11721598307291667, "learning_rate": 0.0001, "loss": 2.9508, "loss/crossentropy": 2.2194876432418824, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2564773142337799, "loss/reg": 0.0, "step": 41540 }, { "epoch": 0.2733552631578947, "grad_norm": 2.578125, "grad_norm_var": 0.40068257649739586, "learning_rate": 0.0001, "loss": 2.9282, "loss/crossentropy": 1.8796088099479675, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.21419723257422446, "loss/reg": 0.0, "step": 41550 }, { "epoch": 0.27342105263157895, "grad_norm": 2.15625, "grad_norm_var": 0.06572265625, "learning_rate": 0.0001, "loss": 2.8964, "loss/crossentropy": 2.37721506357193, "loss/hidden": 2.5375, "loss/incoh": 0.0, "loss/logits": 0.19073823913931848, "loss/reg": 0.0, "step": 41560 }, { "epoch": 0.27348684210526314, "grad_norm": 2.375, "grad_norm_var": 0.4864898681640625, "learning_rate": 0.0001, "loss": 2.8907, "loss/crossentropy": 2.51407128572464, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.20777955502271653, "loss/reg": 0.0, "step": 41570 }, { "epoch": 0.2735526315789474, "grad_norm": 2.796875, "grad_norm_var": 0.4667633056640625, "learning_rate": 0.0001, "loss": 2.8871, "loss/crossentropy": 2.4381505370140077, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.20822121053934098, "loss/reg": 0.0, "step": 41580 }, { "epoch": 0.27361842105263157, "grad_norm": 2.21875, "grad_norm_var": 0.17844950358072917, "learning_rate": 0.0001, "loss": 2.8661, "loss/crossentropy": 2.6211124539375303, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.2006961762905121, "loss/reg": 0.0, "step": 41590 }, { "epoch": 0.2736842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.04586181640625, "learning_rate": 0.0001, "loss": 2.9362, "loss/crossentropy": 2.2242876410484316, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.20689061284065247, "loss/reg": 0.0, "step": 41600 }, { "epoch": 0.27375, "grad_norm": 2.796875, "grad_norm_var": 0.028416951497395832, "learning_rate": 0.0001, "loss": 2.9104, "loss/crossentropy": 2.419981896877289, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.22146868705749512, "loss/reg": 0.0, "step": 41610 }, { "epoch": 0.27381578947368423, "grad_norm": 3.140625, "grad_norm_var": 0.09551493326822917, "learning_rate": 0.0001, "loss": 2.9211, "loss/crossentropy": 2.2197636127471925, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.1994896985590458, "loss/reg": 0.0, "step": 41620 }, { "epoch": 0.2738815789473684, "grad_norm": 2.734375, "grad_norm_var": 0.06243489583333333, "learning_rate": 0.0001, "loss": 2.8977, "loss/crossentropy": 2.0759979009628298, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.1956456944346428, "loss/reg": 0.0, "step": 41630 }, { "epoch": 0.2739473684210526, "grad_norm": 2.59375, "grad_norm_var": 0.06266276041666667, "learning_rate": 0.0001, "loss": 2.8816, "loss/crossentropy": 2.3616039574146273, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.19884779751300813, "loss/reg": 0.0, "step": 41640 }, { "epoch": 0.27401315789473685, "grad_norm": 2.5, "grad_norm_var": 0.044408162434895836, "learning_rate": 0.0001, "loss": 2.9378, "loss/crossentropy": 1.949123704433441, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.20976211726665497, "loss/reg": 0.0, "step": 41650 }, { "epoch": 0.27407894736842103, "grad_norm": 2.453125, "grad_norm_var": 0.053385416666666664, "learning_rate": 0.0001, "loss": 2.8998, "loss/crossentropy": 2.2032551646232603, "loss/hidden": 2.5671875, "loss/incoh": 0.0, "loss/logits": 0.19168981835246085, "loss/reg": 0.0, "step": 41660 }, { "epoch": 0.2741447368421053, "grad_norm": 2.453125, "grad_norm_var": 0.04306233723958333, "learning_rate": 0.0001, "loss": 2.9381, "loss/crossentropy": 2.3122259378433228, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.2329431250691414, "loss/reg": 0.0, "step": 41670 }, { "epoch": 0.27421052631578946, "grad_norm": 2.375, "grad_norm_var": 0.3040435791015625, "learning_rate": 0.0001, "loss": 2.8744, "loss/crossentropy": 2.2190945386886596, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.2630679361522198, "loss/reg": 0.0, "step": 41680 }, { "epoch": 0.2742763157894737, "grad_norm": 2.703125, "grad_norm_var": 0.32198893229166664, "learning_rate": 0.0001, "loss": 2.8806, "loss/crossentropy": 2.4296725749969483, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.23305364549160004, "loss/reg": 0.0, "step": 41690 }, { "epoch": 0.2743421052631579, "grad_norm": 2.5625, "grad_norm_var": 0.07125244140625, "learning_rate": 0.0001, "loss": 2.8362, "loss/crossentropy": 2.1196131706237793, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.16888414323329926, "loss/reg": 0.0, "step": 41700 }, { "epoch": 0.27440789473684213, "grad_norm": 2.390625, "grad_norm_var": 0.11493733723958334, "learning_rate": 0.0001, "loss": 2.8899, "loss/crossentropy": 2.096984314918518, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.217166056483984, "loss/reg": 0.0, "step": 41710 }, { "epoch": 0.2744736842105263, "grad_norm": 2.28125, "grad_norm_var": 0.13425191243489584, "learning_rate": 0.0001, "loss": 2.8421, "loss/crossentropy": 2.3359764575958253, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.23834931701421738, "loss/reg": 0.0, "step": 41720 }, { "epoch": 0.2745394736842105, "grad_norm": 2.546875, "grad_norm_var": 0.06536051432291666, "learning_rate": 0.0001, "loss": 2.9234, "loss/crossentropy": 2.2544549703598022, "loss/hidden": 2.5890625, "loss/incoh": 0.0, "loss/logits": 0.20072139501571656, "loss/reg": 0.0, "step": 41730 }, { "epoch": 0.27460526315789474, "grad_norm": 2.6875, "grad_norm_var": 0.03714192708333333, "learning_rate": 0.0001, "loss": 2.8596, "loss/crossentropy": 2.302277886867523, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.1977499932050705, "loss/reg": 0.0, "step": 41740 }, { "epoch": 0.2746710526315789, "grad_norm": 2.6875, "grad_norm_var": 0.031050618489583334, "learning_rate": 0.0001, "loss": 2.8737, "loss/crossentropy": 2.329440402984619, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.23690769970417022, "loss/reg": 0.0, "step": 41750 }, { "epoch": 0.27473684210526317, "grad_norm": 2.40625, "grad_norm_var": 0.18598531087239584, "learning_rate": 0.0001, "loss": 2.8811, "loss/crossentropy": 2.3601991534233093, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.2735078051686287, "loss/reg": 0.0, "step": 41760 }, { "epoch": 0.27480263157894735, "grad_norm": 2.53125, "grad_norm_var": 0.20947977701822917, "learning_rate": 0.0001, "loss": 2.8913, "loss/crossentropy": 2.033779376745224, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.18523527830839157, "loss/reg": 0.0, "step": 41770 }, { "epoch": 0.2748684210526316, "grad_norm": 2.28125, "grad_norm_var": 0.9051432291666667, "learning_rate": 0.0001, "loss": 2.8389, "loss/crossentropy": 2.392315423488617, "loss/hidden": 2.5484375, "loss/incoh": 0.0, "loss/logits": 0.2058789387345314, "loss/reg": 0.0, "step": 41780 }, { "epoch": 0.2749342105263158, "grad_norm": 2.140625, "grad_norm_var": 0.10076395670572917, "learning_rate": 0.0001, "loss": 2.9075, "loss/crossentropy": 2.353836953639984, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.2003943383693695, "loss/reg": 0.0, "step": 41790 }, { "epoch": 0.275, "grad_norm": 2.625, "grad_norm_var": 0.10082906087239583, "learning_rate": 0.0001, "loss": 2.8857, "loss/crossentropy": 2.6220825910568237, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.2236192360520363, "loss/reg": 0.0, "step": 41800 }, { "epoch": 0.2750657894736842, "grad_norm": 3.921875, "grad_norm_var": 0.16223856608072917, "learning_rate": 0.0001, "loss": 2.9166, "loss/crossentropy": 2.0004385352134704, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.16848198920488358, "loss/reg": 0.0, "step": 41810 }, { "epoch": 0.2751315789473684, "grad_norm": 2.671875, "grad_norm_var": 0.18361714680989583, "learning_rate": 0.0001, "loss": 2.8722, "loss/crossentropy": 2.240838098526001, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.23155076205730438, "loss/reg": 0.0, "step": 41820 }, { "epoch": 0.27519736842105263, "grad_norm": 2.515625, "grad_norm_var": 0.017658487955729166, "learning_rate": 0.0001, "loss": 2.8267, "loss/crossentropy": 2.5004455208778382, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.22757815271615983, "loss/reg": 0.0, "step": 41830 }, { "epoch": 0.2752631578947368, "grad_norm": 2.421875, "grad_norm_var": 0.0156158447265625, "learning_rate": 0.0001, "loss": 2.8437, "loss/crossentropy": 2.028176671266556, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.19746825397014617, "loss/reg": 0.0, "step": 41840 }, { "epoch": 0.27532894736842106, "grad_norm": 2.546875, "grad_norm_var": 0.029694620768229166, "learning_rate": 0.0001, "loss": 2.8658, "loss/crossentropy": 2.5022127270698546, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.21884308755397797, "loss/reg": 0.0, "step": 41850 }, { "epoch": 0.27539473684210525, "grad_norm": 2.296875, "grad_norm_var": 0.06630757649739584, "learning_rate": 0.0001, "loss": 2.8116, "loss/crossentropy": 2.4167380571365356, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.18526040762662888, "loss/reg": 0.0, "step": 41860 }, { "epoch": 0.2754605263157895, "grad_norm": 2.703125, "grad_norm_var": 0.06577860514322917, "learning_rate": 0.0001, "loss": 2.8687, "loss/crossentropy": 2.521064019203186, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.2554768264293671, "loss/reg": 0.0, "step": 41870 }, { "epoch": 0.2755263157894737, "grad_norm": 2.453125, "grad_norm_var": 0.043797810872395836, "learning_rate": 0.0001, "loss": 2.8525, "loss/crossentropy": 2.316687321662903, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.19146600365638733, "loss/reg": 0.0, "step": 41880 }, { "epoch": 0.2755921052631579, "grad_norm": 2.40625, "grad_norm_var": 0.07185643513997396, "learning_rate": 0.0001, "loss": 2.8009, "loss/crossentropy": 2.265317702293396, "loss/hidden": 2.525, "loss/incoh": 0.0, "loss/logits": 0.1787434309720993, "loss/reg": 0.0, "step": 41890 }, { "epoch": 0.2756578947368421, "grad_norm": 2.53125, "grad_norm_var": 0.17363255818684895, "learning_rate": 0.0001, "loss": 2.9225, "loss/crossentropy": 2.375841462612152, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.1972646877169609, "loss/reg": 0.0, "step": 41900 }, { "epoch": 0.27572368421052634, "grad_norm": 3.0625, "grad_norm_var": 0.15393473307291666, "learning_rate": 0.0001, "loss": 2.9105, "loss/crossentropy": 2.497803068161011, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.22261143401265143, "loss/reg": 0.0, "step": 41910 }, { "epoch": 0.27578947368421053, "grad_norm": 2.265625, "grad_norm_var": 0.0914947509765625, "learning_rate": 0.0001, "loss": 2.8619, "loss/crossentropy": 2.3061989665031435, "loss/hidden": 2.5484375, "loss/incoh": 0.0, "loss/logits": 0.1887695536017418, "loss/reg": 0.0, "step": 41920 }, { "epoch": 0.2758552631578947, "grad_norm": 2.59375, "grad_norm_var": 0.0603424072265625, "learning_rate": 0.0001, "loss": 2.8884, "loss/crossentropy": 2.3149193286895753, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.22339601963758468, "loss/reg": 0.0, "step": 41930 }, { "epoch": 0.27592105263157896, "grad_norm": 2.109375, "grad_norm_var": 0.1061431884765625, "learning_rate": 0.0001, "loss": 2.8216, "loss/crossentropy": 2.1692532539367675, "loss/hidden": 2.53125, "loss/incoh": 0.0, "loss/logits": 0.18136480674147606, "loss/reg": 0.0, "step": 41940 }, { "epoch": 0.27598684210526314, "grad_norm": 2.15625, "grad_norm_var": 0.6952107747395834, "learning_rate": 0.0001, "loss": 2.9038, "loss/crossentropy": 2.1056182980537415, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.1803471863269806, "loss/reg": 0.0, "step": 41950 }, { "epoch": 0.2760526315789474, "grad_norm": 2.234375, "grad_norm_var": 0.8908030192057291, "learning_rate": 0.0001, "loss": 2.9202, "loss/crossentropy": 2.007475769519806, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.20084686875343322, "loss/reg": 0.0, "step": 41960 }, { "epoch": 0.27611842105263157, "grad_norm": 2.5, "grad_norm_var": 0.26748758951822915, "learning_rate": 0.0001, "loss": 2.8984, "loss/crossentropy": 1.9994977772235871, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.21658042073249817, "loss/reg": 0.0, "step": 41970 }, { "epoch": 0.2761842105263158, "grad_norm": 2.828125, "grad_norm_var": 0.3663564046223958, "learning_rate": 0.0001, "loss": 2.9774, "loss/crossentropy": 2.444515585899353, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.2198921859264374, "loss/reg": 0.0, "step": 41980 }, { "epoch": 0.27625, "grad_norm": 2.234375, "grad_norm_var": 0.10633036295572916, "learning_rate": 0.0001, "loss": 2.9, "loss/crossentropy": 2.2061278581619264, "loss/hidden": 2.5171875, "loss/incoh": 0.0, "loss/logits": 0.20719580352306366, "loss/reg": 0.0, "step": 41990 }, { "epoch": 0.27631578947368424, "grad_norm": 2.546875, "grad_norm_var": 0.08648681640625, "learning_rate": 0.0001, "loss": 2.8332, "loss/crossentropy": 2.4916536927223207, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.22260852605104448, "loss/reg": 0.0, "step": 42000 }, { "epoch": 0.2763815789473684, "grad_norm": 2.765625, "grad_norm_var": 0.07151285807291667, "learning_rate": 0.0001, "loss": 2.8677, "loss/crossentropy": 2.561967873573303, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.21093451529741286, "loss/reg": 0.0, "step": 42010 }, { "epoch": 0.2764473684210526, "grad_norm": 2.5, "grad_norm_var": 0.05084228515625, "learning_rate": 0.0001, "loss": 2.8803, "loss/crossentropy": 2.261924123764038, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.23825129866600037, "loss/reg": 0.0, "step": 42020 }, { "epoch": 0.27651315789473685, "grad_norm": 2.4375, "grad_norm_var": 0.026460774739583335, "learning_rate": 0.0001, "loss": 2.8734, "loss/crossentropy": 2.2160896092653273, "loss/hidden": 2.496875, "loss/incoh": 0.0, "loss/logits": 0.1706544417887926, "loss/reg": 0.0, "step": 42030 }, { "epoch": 0.27657894736842104, "grad_norm": 2.28125, "grad_norm_var": 0.04378840128580729, "learning_rate": 0.0001, "loss": 2.8002, "loss/crossentropy": 2.0748197197914124, "loss/hidden": 2.5515625, "loss/incoh": 0.0, "loss/logits": 0.16419670060276986, "loss/reg": 0.0, "step": 42040 }, { "epoch": 0.2766447368421053, "grad_norm": 3.109375, "grad_norm_var": 0.05515848795572917, "learning_rate": 0.0001, "loss": 2.9576, "loss/crossentropy": 2.011886727809906, "loss/hidden": 2.5859375, "loss/incoh": 0.0, "loss/logits": 0.1762137532234192, "loss/reg": 0.0, "step": 42050 }, { "epoch": 0.27671052631578946, "grad_norm": 2.28125, "grad_norm_var": 0.05481669108072917, "learning_rate": 0.0001, "loss": 2.8109, "loss/crossentropy": 2.1301441192626953, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.20016891434788703, "loss/reg": 0.0, "step": 42060 }, { "epoch": 0.2767763157894737, "grad_norm": 2.359375, "grad_norm_var": 0.029715983072916667, "learning_rate": 0.0001, "loss": 2.8464, "loss/crossentropy": 2.1609872460365294, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.22251954674720764, "loss/reg": 0.0, "step": 42070 }, { "epoch": 0.2768421052631579, "grad_norm": 2.8125, "grad_norm_var": 0.05791015625, "learning_rate": 0.0001, "loss": 2.8611, "loss/crossentropy": 2.196348261833191, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.17615699172019958, "loss/reg": 0.0, "step": 42080 }, { "epoch": 0.27690789473684213, "grad_norm": 2.953125, "grad_norm_var": 0.0859039306640625, "learning_rate": 0.0001, "loss": 3.0034, "loss/crossentropy": 2.096973741054535, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.22597918659448624, "loss/reg": 0.0, "step": 42090 }, { "epoch": 0.2769736842105263, "grad_norm": 3.1875, "grad_norm_var": 0.15637613932291666, "learning_rate": 0.0001, "loss": 2.9585, "loss/crossentropy": 2.3460837483406065, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2634055718779564, "loss/reg": 0.0, "step": 42100 }, { "epoch": 0.2770394736842105, "grad_norm": 2.59375, "grad_norm_var": 0.17405192057291666, "learning_rate": 0.0001, "loss": 2.9321, "loss/crossentropy": 2.287611174583435, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.18909256979823114, "loss/reg": 0.0, "step": 42110 }, { "epoch": 0.27710526315789474, "grad_norm": 2.96875, "grad_norm_var": 0.10876363118489583, "learning_rate": 0.0001, "loss": 2.9565, "loss/crossentropy": 2.3075396180152894, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.24828632175922394, "loss/reg": 0.0, "step": 42120 }, { "epoch": 0.27717105263157893, "grad_norm": 2.59375, "grad_norm_var": 0.0587066650390625, "learning_rate": 0.0001, "loss": 2.9036, "loss/crossentropy": 2.2195401787757874, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.22550088316202163, "loss/reg": 0.0, "step": 42130 }, { "epoch": 0.27723684210526317, "grad_norm": 2.59375, "grad_norm_var": 0.05871988932291667, "learning_rate": 0.0001, "loss": 2.9698, "loss/crossentropy": 2.3867506980895996, "loss/hidden": 2.9546875, "loss/incoh": 0.0, "loss/logits": 0.2459012359380722, "loss/reg": 0.0, "step": 42140 }, { "epoch": 0.27730263157894736, "grad_norm": 2.390625, "grad_norm_var": 0.08957417805989583, "learning_rate": 0.0001, "loss": 2.8333, "loss/crossentropy": 2.2128496170043945, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.2157162606716156, "loss/reg": 0.0, "step": 42150 }, { "epoch": 0.2773684210526316, "grad_norm": 2.5, "grad_norm_var": 0.040160115559895834, "learning_rate": 0.0001, "loss": 2.9001, "loss/crossentropy": 2.2390334010124207, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.20353079438209534, "loss/reg": 0.0, "step": 42160 }, { "epoch": 0.2774342105263158, "grad_norm": 2.28125, "grad_norm_var": 0.030647786458333333, "learning_rate": 0.0001, "loss": 2.9274, "loss/crossentropy": 2.309735429286957, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.23406820893287658, "loss/reg": 0.0, "step": 42170 }, { "epoch": 0.2775, "grad_norm": 3.328125, "grad_norm_var": 0.07941792805989584, "learning_rate": 0.0001, "loss": 2.9907, "loss/crossentropy": 2.5579655170440674, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.22064082473516464, "loss/reg": 0.0, "step": 42180 }, { "epoch": 0.2775657894736842, "grad_norm": 2.328125, "grad_norm_var": 0.09910380045572917, "learning_rate": 0.0001, "loss": 3.0059, "loss/crossentropy": 2.247300398349762, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.24163080006837845, "loss/reg": 0.0, "step": 42190 }, { "epoch": 0.2776315789473684, "grad_norm": 2.921875, "grad_norm_var": 0.03762613932291667, "learning_rate": 0.0001, "loss": 2.9519, "loss/crossentropy": 2.397110617160797, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.22527115494012834, "loss/reg": 0.0, "step": 42200 }, { "epoch": 0.27769736842105264, "grad_norm": 2.453125, "grad_norm_var": 0.21934305826822917, "learning_rate": 0.0001, "loss": 3.0047, "loss/crossentropy": 2.563957917690277, "loss/hidden": 2.5609375, "loss/incoh": 0.0, "loss/logits": 0.20910734981298446, "loss/reg": 0.0, "step": 42210 }, { "epoch": 0.2777631578947368, "grad_norm": 2.40625, "grad_norm_var": 0.19838765462239583, "learning_rate": 0.0001, "loss": 2.9038, "loss/crossentropy": 2.5008722066879274, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2206524670124054, "loss/reg": 0.0, "step": 42220 }, { "epoch": 0.27782894736842106, "grad_norm": 2.421875, "grad_norm_var": 0.1513671875, "learning_rate": 0.0001, "loss": 2.9548, "loss/crossentropy": 2.2133307099342345, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.2283889204263687, "loss/reg": 0.0, "step": 42230 }, { "epoch": 0.27789473684210525, "grad_norm": 2.328125, "grad_norm_var": 0.2716949462890625, "learning_rate": 0.0001, "loss": 3.0247, "loss/crossentropy": 2.3474934935569762, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.2300518810749054, "loss/reg": 0.0, "step": 42240 }, { "epoch": 0.2779605263157895, "grad_norm": 2.515625, "grad_norm_var": 0.09308268229166666, "learning_rate": 0.0001, "loss": 2.9214, "loss/crossentropy": 2.3735881805419923, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.22015147358179094, "loss/reg": 0.0, "step": 42250 }, { "epoch": 0.2780263157894737, "grad_norm": 2.125, "grad_norm_var": 0.051774088541666666, "learning_rate": 0.0001, "loss": 2.8232, "loss/crossentropy": 2.43296275138855, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.20032749623060225, "loss/reg": 0.0, "step": 42260 }, { "epoch": 0.2780921052631579, "grad_norm": 2.421875, "grad_norm_var": 0.025, "learning_rate": 0.0001, "loss": 2.9476, "loss/crossentropy": 2.1745434403419495, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.25861982107162473, "loss/reg": 0.0, "step": 42270 }, { "epoch": 0.2781578947368421, "grad_norm": 2.265625, "grad_norm_var": 0.0298004150390625, "learning_rate": 0.0001, "loss": 2.9134, "loss/crossentropy": 2.2291437149047852, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.19973741993308067, "loss/reg": 0.0, "step": 42280 }, { "epoch": 0.2782236842105263, "grad_norm": 2.6875, "grad_norm_var": 0.039484659830729164, "learning_rate": 0.0001, "loss": 2.9144, "loss/crossentropy": 2.2053532004356384, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.24890440553426743, "loss/reg": 0.0, "step": 42290 }, { "epoch": 0.27828947368421053, "grad_norm": 2.515625, "grad_norm_var": 0.043024698893229164, "learning_rate": 0.0001, "loss": 2.9423, "loss/crossentropy": 2.318292462825775, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.25796281844377517, "loss/reg": 0.0, "step": 42300 }, { "epoch": 0.2783552631578947, "grad_norm": 3.09375, "grad_norm_var": 0.06493733723958334, "learning_rate": 0.0001, "loss": 2.9299, "loss/crossentropy": 2.234728491306305, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.19837756156921388, "loss/reg": 0.0, "step": 42310 }, { "epoch": 0.27842105263157896, "grad_norm": 3.453125, "grad_norm_var": 0.132275390625, "learning_rate": 0.0001, "loss": 3.0364, "loss/crossentropy": 2.18809494972229, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.21234770864248276, "loss/reg": 0.0, "step": 42320 }, { "epoch": 0.27848684210526314, "grad_norm": 2.453125, "grad_norm_var": 0.09457906087239583, "learning_rate": 0.0001, "loss": 3.0015, "loss/crossentropy": 2.2722102999687195, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.23758016675710678, "loss/reg": 0.0, "step": 42330 }, { "epoch": 0.2785526315789474, "grad_norm": 2.40625, "grad_norm_var": 0.07525126139322917, "learning_rate": 0.0001, "loss": 2.9252, "loss/crossentropy": 2.551211154460907, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2422679618000984, "loss/reg": 0.0, "step": 42340 }, { "epoch": 0.27861842105263157, "grad_norm": 2.4375, "grad_norm_var": 0.0372467041015625, "learning_rate": 0.0001, "loss": 2.9312, "loss/crossentropy": 2.187495565414429, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.19979869723320007, "loss/reg": 0.0, "step": 42350 }, { "epoch": 0.2786842105263158, "grad_norm": 2.328125, "grad_norm_var": 0.03004150390625, "learning_rate": 0.0001, "loss": 2.8615, "loss/crossentropy": 2.477112650871277, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.23886641710996628, "loss/reg": 0.0, "step": 42360 }, { "epoch": 0.27875, "grad_norm": 2.171875, "grad_norm_var": 0.05650634765625, "learning_rate": 0.0001, "loss": 2.8804, "loss/crossentropy": 2.206949603557587, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.19809990674257277, "loss/reg": 0.0, "step": 42370 }, { "epoch": 0.2788157894736842, "grad_norm": 2.640625, "grad_norm_var": 0.043257649739583334, "learning_rate": 0.0001, "loss": 2.9587, "loss/crossentropy": 2.3392465353012084, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.251371867954731, "loss/reg": 0.0, "step": 42380 }, { "epoch": 0.2788815789473684, "grad_norm": 2.140625, "grad_norm_var": 0.03759358723958333, "learning_rate": 0.0001, "loss": 2.8252, "loss/crossentropy": 2.169689583778381, "loss/hidden": 2.5984375, "loss/incoh": 0.0, "loss/logits": 0.1918659433722496, "loss/reg": 0.0, "step": 42390 }, { "epoch": 0.2789473684210526, "grad_norm": 2.140625, "grad_norm_var": 0.0246246337890625, "learning_rate": 0.0001, "loss": 2.822, "loss/crossentropy": 2.0233542740345003, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.21287423446774484, "loss/reg": 0.0, "step": 42400 }, { "epoch": 0.27901315789473685, "grad_norm": 2.96875, "grad_norm_var": 0.06875, "learning_rate": 0.0001, "loss": 2.8653, "loss/crossentropy": 2.348061430454254, "loss/hidden": 2.5171875, "loss/incoh": 0.0, "loss/logits": 0.18440440446138381, "loss/reg": 0.0, "step": 42410 }, { "epoch": 0.27907894736842104, "grad_norm": 2.4375, "grad_norm_var": 0.05303319295247396, "learning_rate": 0.0001, "loss": 2.8137, "loss/crossentropy": 2.1020639896392823, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.19108923822641372, "loss/reg": 0.0, "step": 42420 }, { "epoch": 0.2791447368421053, "grad_norm": 2.25, "grad_norm_var": 0.033945465087890626, "learning_rate": 0.0001, "loss": 2.8598, "loss/crossentropy": 2.3110833168029785, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.22894209772348403, "loss/reg": 0.0, "step": 42430 }, { "epoch": 0.27921052631578946, "grad_norm": 2.34375, "grad_norm_var": 0.025455729166666666, "learning_rate": 0.0001, "loss": 2.8924, "loss/crossentropy": 2.3408103704452516, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.1961030662059784, "loss/reg": 0.0, "step": 42440 }, { "epoch": 0.2792763157894737, "grad_norm": 2.421875, "grad_norm_var": 0.016820271809895832, "learning_rate": 0.0001, "loss": 2.8556, "loss/crossentropy": 2.2271142959594727, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.21622314378619195, "loss/reg": 0.0, "step": 42450 }, { "epoch": 0.2793421052631579, "grad_norm": 2.3125, "grad_norm_var": 0.038309733072916664, "learning_rate": 0.0001, "loss": 2.874, "loss/crossentropy": 2.1396164536476134, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2101667642593384, "loss/reg": 0.0, "step": 42460 }, { "epoch": 0.27940789473684213, "grad_norm": 2.359375, "grad_norm_var": 0.04208577473958333, "learning_rate": 0.0001, "loss": 2.8479, "loss/crossentropy": 2.2276328176259996, "loss/hidden": 2.4828125, "loss/incoh": 0.0, "loss/logits": 0.19731640629470348, "loss/reg": 0.0, "step": 42470 }, { "epoch": 0.2794736842105263, "grad_norm": 2.375, "grad_norm_var": 0.0394439697265625, "learning_rate": 0.0001, "loss": 2.9128, "loss/crossentropy": 2.303075134754181, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.21030478179454803, "loss/reg": 0.0, "step": 42480 }, { "epoch": 0.2795394736842105, "grad_norm": 2.0625, "grad_norm_var": 0.04985249837239583, "learning_rate": 0.0001, "loss": 2.8594, "loss/crossentropy": 1.9939319342374802, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.21429630406200886, "loss/reg": 0.0, "step": 42490 }, { "epoch": 0.27960526315789475, "grad_norm": 2.375, "grad_norm_var": 0.03638916015625, "learning_rate": 0.0001, "loss": 2.9099, "loss/crossentropy": 2.2645588636398317, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.20669519007205964, "loss/reg": 0.0, "step": 42500 }, { "epoch": 0.27967105263157893, "grad_norm": 2.921875, "grad_norm_var": 0.10486653645833334, "learning_rate": 0.0001, "loss": 2.9019, "loss/crossentropy": 2.55795441865921, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.2312750220298767, "loss/reg": 0.0, "step": 42510 }, { "epoch": 0.2797368421052632, "grad_norm": 2.390625, "grad_norm_var": 0.06526285807291667, "learning_rate": 0.0001, "loss": 2.9178, "loss/crossentropy": 2.375762069225311, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.21791439950466157, "loss/reg": 0.0, "step": 42520 }, { "epoch": 0.27980263157894736, "grad_norm": 2.84375, "grad_norm_var": 0.1442535400390625, "learning_rate": 0.0001, "loss": 2.9147, "loss/crossentropy": 2.2956878185272216, "loss/hidden": 3.0015625, "loss/incoh": 0.0, "loss/logits": 0.2245861381292343, "loss/reg": 0.0, "step": 42530 }, { "epoch": 0.2798684210526316, "grad_norm": 2.4375, "grad_norm_var": 178.290625, "learning_rate": 0.0001, "loss": 2.9325, "loss/crossentropy": 2.536232256889343, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.2203051507472992, "loss/reg": 0.0, "step": 42540 }, { "epoch": 0.2799342105263158, "grad_norm": 2.296875, "grad_norm_var": 0.09308980305989584, "learning_rate": 0.0001, "loss": 2.8431, "loss/crossentropy": 2.2231294393539427, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.20066248178482055, "loss/reg": 0.0, "step": 42550 }, { "epoch": 0.28, "grad_norm": 2.21875, "grad_norm_var": 0.4710845947265625, "learning_rate": 0.0001, "loss": 2.8859, "loss/crossentropy": 2.1808334231376647, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.19183403253555298, "loss/reg": 0.0, "step": 42560 }, { "epoch": 0.2800657894736842, "grad_norm": 2.859375, "grad_norm_var": 0.46190999348958334, "learning_rate": 0.0001, "loss": 2.8947, "loss/crossentropy": 2.3319015502929688, "loss/hidden": 2.5359375, "loss/incoh": 0.0, "loss/logits": 0.1891280397772789, "loss/reg": 0.0, "step": 42570 }, { "epoch": 0.2801315789473684, "grad_norm": 2.484375, "grad_norm_var": 0.1077056884765625, "learning_rate": 0.0001, "loss": 2.9142, "loss/crossentropy": 2.3004127979278564, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2539206549525261, "loss/reg": 0.0, "step": 42580 }, { "epoch": 0.28019736842105264, "grad_norm": 2.875, "grad_norm_var": 0.0446441650390625, "learning_rate": 0.0001, "loss": 2.8447, "loss/crossentropy": 2.384749412536621, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.2116282746195793, "loss/reg": 0.0, "step": 42590 }, { "epoch": 0.2802631578947368, "grad_norm": 2.265625, "grad_norm_var": 0.05185546875, "learning_rate": 0.0001, "loss": 2.9361, "loss/crossentropy": 2.199612820148468, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.20648394674062728, "loss/reg": 0.0, "step": 42600 }, { "epoch": 0.28032894736842107, "grad_norm": 2.40625, "grad_norm_var": 0.055859375, "learning_rate": 0.0001, "loss": 2.8846, "loss/crossentropy": 2.3382102131843565, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.20498812347650527, "loss/reg": 0.0, "step": 42610 }, { "epoch": 0.28039473684210525, "grad_norm": 2.359375, "grad_norm_var": 0.1548004150390625, "learning_rate": 0.0001, "loss": 2.8968, "loss/crossentropy": 2.3840024352073668, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.24561102092266082, "loss/reg": 0.0, "step": 42620 }, { "epoch": 0.2804605263157895, "grad_norm": 2.6875, "grad_norm_var": 3.301877391101635e+17, "learning_rate": 0.0001, "loss": 2.9974, "loss/crossentropy": 2.3609643936157227, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.2507575586438179, "loss/reg": 0.0, "step": 42630 }, { "epoch": 0.2805263157894737, "grad_norm": 2.9375, "grad_norm_var": 3.3018773913231034e+17, "learning_rate": 0.0001, "loss": 2.8753, "loss/crossentropy": 2.262173730134964, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.21605781465768814, "loss/reg": 0.0, "step": 42640 }, { "epoch": 0.2805921052631579, "grad_norm": 2.578125, "grad_norm_var": 0.17101236979166667, "learning_rate": 0.0001, "loss": 2.9367, "loss/crossentropy": 2.767684292793274, "loss/hidden": 2.5859375, "loss/incoh": 0.0, "loss/logits": 0.222770756483078, "loss/reg": 0.0, "step": 42650 }, { "epoch": 0.2806578947368421, "grad_norm": 2.46875, "grad_norm_var": 0.237548828125, "learning_rate": 0.0001, "loss": 2.9172, "loss/crossentropy": 1.8920587301254272, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.1819734364748001, "loss/reg": 0.0, "step": 42660 }, { "epoch": 0.2807236842105263, "grad_norm": 2.546875, "grad_norm_var": 0.43106180826822915, "learning_rate": 0.0001, "loss": 2.9085, "loss/crossentropy": 2.347963583469391, "loss/hidden": 2.5625, "loss/incoh": 0.0, "loss/logits": 0.20594773888587953, "loss/reg": 0.0, "step": 42670 }, { "epoch": 0.28078947368421053, "grad_norm": 2.390625, "grad_norm_var": 0.3912261962890625, "learning_rate": 0.0001, "loss": 2.9774, "loss/crossentropy": 2.3553409457206724, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.19900170117616653, "loss/reg": 0.0, "step": 42680 }, { "epoch": 0.2808552631578947, "grad_norm": 4.84375, "grad_norm_var": 0.48345947265625, "learning_rate": 0.0001, "loss": 2.8949, "loss/crossentropy": 2.2932916402816774, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.24585704803466796, "loss/reg": 0.0, "step": 42690 }, { "epoch": 0.28092105263157896, "grad_norm": 2.265625, "grad_norm_var": 0.46728515625, "learning_rate": 0.0001, "loss": 2.8888, "loss/crossentropy": 2.1760712534189226, "loss/hidden": 2.5796875, "loss/incoh": 0.0, "loss/logits": 0.1853573277592659, "loss/reg": 0.0, "step": 42700 }, { "epoch": 0.28098684210526315, "grad_norm": 2.265625, "grad_norm_var": 0.08305562337239583, "learning_rate": 0.0001, "loss": 2.8617, "loss/crossentropy": 2.107070708274841, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.18711100667715072, "loss/reg": 0.0, "step": 42710 }, { "epoch": 0.2810526315789474, "grad_norm": 2.5, "grad_norm_var": 0.11194559733072916, "learning_rate": 0.0001, "loss": 2.9793, "loss/crossentropy": 2.1783786177635194, "loss/hidden": 2.915625, "loss/incoh": 0.0, "loss/logits": 0.22375259548425674, "loss/reg": 0.0, "step": 42720 }, { "epoch": 0.2811184210526316, "grad_norm": 2.234375, "grad_norm_var": 0.07424214680989584, "learning_rate": 0.0001, "loss": 2.8845, "loss/crossentropy": 2.264366888999939, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.24374438896775247, "loss/reg": 0.0, "step": 42730 }, { "epoch": 0.2811842105263158, "grad_norm": 2.234375, "grad_norm_var": 0.07332356770833333, "learning_rate": 0.0001, "loss": 2.93, "loss/crossentropy": 2.186056840419769, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.20485055148601533, "loss/reg": 0.0, "step": 42740 }, { "epoch": 0.28125, "grad_norm": 2.515625, "grad_norm_var": 0.10054906209309895, "learning_rate": 0.0001, "loss": 2.8478, "loss/crossentropy": 2.382728338241577, "loss/hidden": 2.628125, "loss/incoh": 0.0, "loss/logits": 0.22291550785303116, "loss/reg": 0.0, "step": 42750 }, { "epoch": 0.2813157894736842, "grad_norm": 2.265625, "grad_norm_var": 0.046211496988932295, "learning_rate": 0.0001, "loss": 2.8337, "loss/crossentropy": 2.5008355140686036, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.21876171380281448, "loss/reg": 0.0, "step": 42760 }, { "epoch": 0.2813815789473684, "grad_norm": 2.203125, "grad_norm_var": 0.06311848958333334, "learning_rate": 0.0001, "loss": 2.8752, "loss/crossentropy": 2.2665163397789003, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.2148457556962967, "loss/reg": 0.0, "step": 42770 }, { "epoch": 0.2814473684210526, "grad_norm": 2.1875, "grad_norm_var": 0.0521881103515625, "learning_rate": 0.0001, "loss": 2.8309, "loss/crossentropy": 2.099846678972244, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.21828532814979554, "loss/reg": 0.0, "step": 42780 }, { "epoch": 0.28151315789473685, "grad_norm": 2.859375, "grad_norm_var": 0.09503580729166666, "learning_rate": 0.0001, "loss": 2.9937, "loss/crossentropy": 2.3042611122131347, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.19256508648395537, "loss/reg": 0.0, "step": 42790 }, { "epoch": 0.28157894736842104, "grad_norm": 2.546875, "grad_norm_var": 0.05154622395833333, "learning_rate": 0.0001, "loss": 2.8525, "loss/crossentropy": 2.5033236145973206, "loss/hidden": 2.5984375, "loss/incoh": 0.0, "loss/logits": 0.21664173752069474, "loss/reg": 0.0, "step": 42800 }, { "epoch": 0.2816447368421053, "grad_norm": 2.453125, "grad_norm_var": 0.03567301432291667, "learning_rate": 0.0001, "loss": 2.8655, "loss/crossentropy": 2.109819343686104, "loss/hidden": 2.446875, "loss/incoh": 0.0, "loss/logits": 0.15977648124098778, "loss/reg": 0.0, "step": 42810 }, { "epoch": 0.28171052631578947, "grad_norm": 2.25, "grad_norm_var": 0.0369049072265625, "learning_rate": 0.0001, "loss": 2.9318, "loss/crossentropy": 2.315469813346863, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.21322815269231796, "loss/reg": 0.0, "step": 42820 }, { "epoch": 0.2817763157894737, "grad_norm": 2.4375, "grad_norm_var": 0.0635894775390625, "learning_rate": 0.0001, "loss": 2.9006, "loss/crossentropy": 2.1585512161254883, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.18488840013742447, "loss/reg": 0.0, "step": 42830 }, { "epoch": 0.2818421052631579, "grad_norm": 2.171875, "grad_norm_var": 0.1818359375, "learning_rate": 0.0001, "loss": 2.9051, "loss/crossentropy": 2.371001732349396, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.21074966639280318, "loss/reg": 0.0, "step": 42840 }, { "epoch": 0.2819078947368421, "grad_norm": 2.265625, "grad_norm_var": 0.18161519368489584, "learning_rate": 0.0001, "loss": 2.9264, "loss/crossentropy": 2.3333333492279054, "loss/hidden": 2.5734375, "loss/incoh": 0.0, "loss/logits": 0.20477814972400665, "loss/reg": 0.0, "step": 42850 }, { "epoch": 0.2819736842105263, "grad_norm": 3.171875, "grad_norm_var": 0.12586161295572917, "learning_rate": 0.0001, "loss": 2.9204, "loss/crossentropy": 2.303652584552765, "loss/hidden": 2.546875, "loss/incoh": 0.0, "loss/logits": 0.18824008405208587, "loss/reg": 0.0, "step": 42860 }, { "epoch": 0.2820394736842105, "grad_norm": 2.421875, "grad_norm_var": 0.32629292805989585, "learning_rate": 0.0001, "loss": 2.9111, "loss/crossentropy": 2.368567180633545, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.22113584131002426, "loss/reg": 0.0, "step": 42870 }, { "epoch": 0.28210526315789475, "grad_norm": 2.265625, "grad_norm_var": 0.27291259765625, "learning_rate": 0.0001, "loss": 2.8863, "loss/crossentropy": 2.1704141736030578, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.20244133546948434, "loss/reg": 0.0, "step": 42880 }, { "epoch": 0.28217105263157893, "grad_norm": 2.515625, "grad_norm_var": 0.0517730712890625, "learning_rate": 0.0001, "loss": 2.9112, "loss/crossentropy": 2.173205888271332, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.31559294313192365, "loss/reg": 0.0, "step": 42890 }, { "epoch": 0.2822368421052632, "grad_norm": 2.3125, "grad_norm_var": 0.04108784993489583, "learning_rate": 0.0001, "loss": 2.9465, "loss/crossentropy": 2.102366679906845, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.20641533359885217, "loss/reg": 0.0, "step": 42900 }, { "epoch": 0.28230263157894736, "grad_norm": 2.734375, "grad_norm_var": 3.6987571085973914e+17, "learning_rate": 0.0001, "loss": 2.9686, "loss/crossentropy": 2.4069250583648683, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.21986496299505234, "loss/reg": 0.0, "step": 42910 }, { "epoch": 0.2823684210526316, "grad_norm": 2.40625, "grad_norm_var": 3.6987571080684064e+17, "learning_rate": 0.0001, "loss": 2.9632, "loss/crossentropy": 1.9471881210803985, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.20800219103693962, "loss/reg": 0.0, "step": 42920 }, { "epoch": 0.2824342105263158, "grad_norm": 2.5625, "grad_norm_var": 0.0381011962890625, "learning_rate": 0.0001, "loss": 2.8824, "loss/crossentropy": 2.2220829725265503, "loss/hidden": 2.5203125, "loss/incoh": 0.0, "loss/logits": 0.1915499299764633, "loss/reg": 0.0, "step": 42930 }, { "epoch": 0.2825, "grad_norm": 4.0625, "grad_norm_var": 0.33640950520833335, "learning_rate": 0.0001, "loss": 2.9647, "loss/crossentropy": 2.3369960486888885, "loss/hidden": 2.571875, "loss/incoh": 0.0, "loss/logits": 0.195915350317955, "loss/reg": 0.0, "step": 42940 }, { "epoch": 0.2825657894736842, "grad_norm": 2.59375, "grad_norm_var": 0.35084635416666665, "learning_rate": 0.0001, "loss": 2.9079, "loss/crossentropy": 2.422371518611908, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.2516687363386154, "loss/reg": 0.0, "step": 42950 }, { "epoch": 0.2826315789473684, "grad_norm": 2.9375, "grad_norm_var": 0.08162434895833333, "learning_rate": 0.0001, "loss": 2.8482, "loss/crossentropy": 2.1067722499370576, "loss/hidden": 2.4859375, "loss/incoh": 0.0, "loss/logits": 0.17220191434025764, "loss/reg": 0.0, "step": 42960 }, { "epoch": 0.28269736842105264, "grad_norm": 2.640625, "grad_norm_var": 0.062352498372395836, "learning_rate": 0.0001, "loss": 2.91, "loss/crossentropy": 1.81765655875206, "loss/hidden": 2.5, "loss/incoh": 0.0, "loss/logits": 0.17306745275855065, "loss/reg": 0.0, "step": 42970 }, { "epoch": 0.2827631578947368, "grad_norm": 2.1875, "grad_norm_var": 0.26802978515625, "learning_rate": 0.0001, "loss": 2.8942, "loss/crossentropy": 2.2257973313331605, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.18886819034814833, "loss/reg": 0.0, "step": 42980 }, { "epoch": 0.28282894736842107, "grad_norm": 2.421875, "grad_norm_var": 0.20359598795572917, "learning_rate": 0.0001, "loss": 2.8689, "loss/crossentropy": 2.213522565364838, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.21868394762277604, "loss/reg": 0.0, "step": 42990 }, { "epoch": 0.28289473684210525, "grad_norm": 2.3125, "grad_norm_var": 0.03430582682291667, "learning_rate": 0.0001, "loss": 2.905, "loss/crossentropy": 2.2623468041419983, "loss/hidden": 2.5265625, "loss/incoh": 0.0, "loss/logits": 0.1790510594844818, "loss/reg": 0.0, "step": 43000 }, { "epoch": 0.2829605263157895, "grad_norm": 2.65625, "grad_norm_var": 0.0690093994140625, "learning_rate": 0.0001, "loss": 2.8707, "loss/crossentropy": 2.0131213307380675, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.1806529849767685, "loss/reg": 0.0, "step": 43010 }, { "epoch": 0.2830263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.41450093587239584, "learning_rate": 0.0001, "loss": 2.8758, "loss/crossentropy": 2.107782691717148, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.1818483553826809, "loss/reg": 0.0, "step": 43020 }, { "epoch": 0.28309210526315787, "grad_norm": 2.1875, "grad_norm_var": 0.028449503580729167, "learning_rate": 0.0001, "loss": 2.834, "loss/crossentropy": 2.322493839263916, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.19568417072296143, "loss/reg": 0.0, "step": 43030 }, { "epoch": 0.2831578947368421, "grad_norm": 2.453125, "grad_norm_var": 0.5433095296223959, "learning_rate": 0.0001, "loss": 2.9294, "loss/crossentropy": 2.1520102262496947, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.25125996097922326, "loss/reg": 0.0, "step": 43040 }, { "epoch": 0.2832236842105263, "grad_norm": 2.546875, "grad_norm_var": 0.5350087483723959, "learning_rate": 0.0001, "loss": 2.9087, "loss/crossentropy": 2.02963308095932, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.1829964980483055, "loss/reg": 0.0, "step": 43050 }, { "epoch": 0.28328947368421054, "grad_norm": 2.15625, "grad_norm_var": 0.056883748372395834, "learning_rate": 0.0001, "loss": 2.8843, "loss/crossentropy": 2.196904015541077, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.24516793042421342, "loss/reg": 0.0, "step": 43060 }, { "epoch": 0.2833552631578947, "grad_norm": 2.21875, "grad_norm_var": 0.13963114420572917, "learning_rate": 0.0001, "loss": 2.9473, "loss/crossentropy": 2.405648821592331, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.20916384160518647, "loss/reg": 0.0, "step": 43070 }, { "epoch": 0.28342105263157896, "grad_norm": 2.3125, "grad_norm_var": 0.14690348307291667, "learning_rate": 0.0001, "loss": 2.9263, "loss/crossentropy": 2.0797518253326417, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.21191841810941697, "loss/reg": 0.0, "step": 43080 }, { "epoch": 0.28348684210526315, "grad_norm": 2.953125, "grad_norm_var": 0.060822550455729166, "learning_rate": 0.0001, "loss": 2.8689, "loss/crossentropy": 2.186980628967285, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.19851578921079635, "loss/reg": 0.0, "step": 43090 }, { "epoch": 0.2835526315789474, "grad_norm": 2.390625, "grad_norm_var": 0.1039873758951823, "learning_rate": 0.0001, "loss": 2.8485, "loss/crossentropy": 2.352395212650299, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.2057969883084297, "loss/reg": 0.0, "step": 43100 }, { "epoch": 0.2836184210526316, "grad_norm": 2.625, "grad_norm_var": 0.04119466145833333, "learning_rate": 0.0001, "loss": 2.8799, "loss/crossentropy": 2.2552805066108705, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.20091552436351776, "loss/reg": 0.0, "step": 43110 }, { "epoch": 0.2836842105263158, "grad_norm": 2.421875, "grad_norm_var": 0.054915364583333334, "learning_rate": 0.0001, "loss": 2.8535, "loss/crossentropy": 2.5067098140716553, "loss/hidden": 2.5328125, "loss/incoh": 0.0, "loss/logits": 0.2122147187590599, "loss/reg": 0.0, "step": 43120 }, { "epoch": 0.28375, "grad_norm": 2.09375, "grad_norm_var": 0.20985921223958334, "learning_rate": 0.0001, "loss": 2.9101, "loss/crossentropy": 2.370109164714813, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.18405528366565704, "loss/reg": 0.0, "step": 43130 }, { "epoch": 0.2838157894736842, "grad_norm": 3.171875, "grad_norm_var": 0.9889394124348958, "learning_rate": 0.0001, "loss": 2.8996, "loss/crossentropy": 2.004147744178772, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.20317752063274383, "loss/reg": 0.0, "step": 43140 }, { "epoch": 0.28388157894736843, "grad_norm": 2.359375, "grad_norm_var": 1.1228179931640625, "learning_rate": 0.0001, "loss": 2.8683, "loss/crossentropy": 2.4105277180671694, "loss/hidden": 2.896875, "loss/incoh": 0.0, "loss/logits": 0.28994072824716566, "loss/reg": 0.0, "step": 43150 }, { "epoch": 0.2839473684210526, "grad_norm": 2.5625, "grad_norm_var": 0.3641438802083333, "learning_rate": 0.0001, "loss": 2.8807, "loss/crossentropy": 2.1559565663337708, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.18878259956836702, "loss/reg": 0.0, "step": 43160 }, { "epoch": 0.28401315789473686, "grad_norm": 2.3125, "grad_norm_var": 0.1378570556640625, "learning_rate": 0.0001, "loss": 2.8886, "loss/crossentropy": 2.4954119086265565, "loss/hidden": 2.5375, "loss/incoh": 0.0, "loss/logits": 0.1976638063788414, "loss/reg": 0.0, "step": 43170 }, { "epoch": 0.28407894736842104, "grad_norm": 2.359375, "grad_norm_var": 0.15821024576822917, "learning_rate": 0.0001, "loss": 2.9884, "loss/crossentropy": 2.287571144104004, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.21388431638479233, "loss/reg": 0.0, "step": 43180 }, { "epoch": 0.2841447368421053, "grad_norm": 2.375, "grad_norm_var": 0.12967122395833333, "learning_rate": 0.0001, "loss": 2.8693, "loss/crossentropy": 2.394725167751312, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.21783926486968994, "loss/reg": 0.0, "step": 43190 }, { "epoch": 0.28421052631578947, "grad_norm": 2.21875, "grad_norm_var": 0.0968170166015625, "learning_rate": 0.0001, "loss": 2.8246, "loss/crossentropy": 2.3161295771598818, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.20319831594824792, "loss/reg": 0.0, "step": 43200 }, { "epoch": 0.2842763157894737, "grad_norm": 2.34375, "grad_norm_var": 0.08818257649739583, "learning_rate": 0.0001, "loss": 2.9036, "loss/crossentropy": 2.3764306902885437, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.2284521132707596, "loss/reg": 0.0, "step": 43210 }, { "epoch": 0.2843421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.048661295572916666, "learning_rate": 0.0001, "loss": 2.8945, "loss/crossentropy": 2.435572648048401, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.19654896706342698, "loss/reg": 0.0, "step": 43220 }, { "epoch": 0.2844078947368421, "grad_norm": 3.25, "grad_norm_var": 0.15965169270833332, "learning_rate": 0.0001, "loss": 2.9366, "loss/crossentropy": 2.206356239318848, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.23759893849492073, "loss/reg": 0.0, "step": 43230 }, { "epoch": 0.2844736842105263, "grad_norm": 2.5, "grad_norm_var": 0.07883707682291667, "learning_rate": 0.0001, "loss": 2.8649, "loss/crossentropy": 2.363961708545685, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.22791220843791962, "loss/reg": 0.0, "step": 43240 }, { "epoch": 0.2845394736842105, "grad_norm": 3.890625, "grad_norm_var": 0.20872294108072917, "learning_rate": 0.0001, "loss": 2.9983, "loss/crossentropy": 2.4680182695388795, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.2318154662847519, "loss/reg": 0.0, "step": 43250 }, { "epoch": 0.28460526315789475, "grad_norm": 2.875, "grad_norm_var": 0.18140869140625, "learning_rate": 0.0001, "loss": 2.8735, "loss/crossentropy": 2.1184932470321653, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.18925365805625916, "loss/reg": 0.0, "step": 43260 }, { "epoch": 0.28467105263157894, "grad_norm": 2.78125, "grad_norm_var": 1.9480631510416666, "learning_rate": 0.0001, "loss": 2.9947, "loss/crossentropy": 2.1837258338928223, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.32422677204012873, "loss/reg": 0.0, "step": 43270 }, { "epoch": 0.2847368421052632, "grad_norm": 2.609375, "grad_norm_var": 0.693505859375, "learning_rate": 0.0001, "loss": 2.9718, "loss/crossentropy": 2.240171182155609, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.23304136991500854, "loss/reg": 0.0, "step": 43280 }, { "epoch": 0.28480263157894736, "grad_norm": 2.296875, "grad_norm_var": 0.3250935872395833, "learning_rate": 0.0001, "loss": 2.9168, "loss/crossentropy": 2.2175217270851135, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.20192077308893203, "loss/reg": 0.0, "step": 43290 }, { "epoch": 0.2848684210526316, "grad_norm": 2.265625, "grad_norm_var": 0.0390625, "learning_rate": 0.0001, "loss": 2.9255, "loss/crossentropy": 2.3124610424041747, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.21189500540494918, "loss/reg": 0.0, "step": 43300 }, { "epoch": 0.2849342105263158, "grad_norm": 2.1875, "grad_norm_var": 0.13122456868489582, "learning_rate": 0.0001, "loss": 2.9175, "loss/crossentropy": 2.407304513454437, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.22599541991949082, "loss/reg": 0.0, "step": 43310 }, { "epoch": 0.285, "grad_norm": 2.5, "grad_norm_var": 0.036519368489583336, "learning_rate": 0.0001, "loss": 2.9041, "loss/crossentropy": 2.3194751501083375, "loss/hidden": 2.5390625, "loss/incoh": 0.0, "loss/logits": 0.1946787007153034, "loss/reg": 0.0, "step": 43320 }, { "epoch": 0.2850657894736842, "grad_norm": 2.375, "grad_norm_var": 0.201416015625, "learning_rate": 0.0001, "loss": 2.9433, "loss/crossentropy": 2.1137040853500366, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.21837345659732818, "loss/reg": 0.0, "step": 43330 }, { "epoch": 0.2851315789473684, "grad_norm": 2.4375, "grad_norm_var": 0.2180816650390625, "learning_rate": 0.0001, "loss": 2.9092, "loss/crossentropy": 2.2650736451148985, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.21844549477100372, "loss/reg": 0.0, "step": 43340 }, { "epoch": 0.28519736842105264, "grad_norm": 2.4375, "grad_norm_var": 0.0765533447265625, "learning_rate": 0.0001, "loss": 2.8859, "loss/crossentropy": 2.227513551712036, "loss/hidden": 2.503125, "loss/incoh": 0.0, "loss/logits": 0.18697744160890578, "loss/reg": 0.0, "step": 43350 }, { "epoch": 0.28526315789473683, "grad_norm": 2.09375, "grad_norm_var": 0.08496805826822916, "learning_rate": 0.0001, "loss": 2.9441, "loss/crossentropy": 2.4589428544044494, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.22191307842731475, "loss/reg": 0.0, "step": 43360 }, { "epoch": 0.28532894736842107, "grad_norm": 2.484375, "grad_norm_var": 0.13730061848958333, "learning_rate": 0.0001, "loss": 2.991, "loss/crossentropy": 2.2811187386512755, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.21838787496089934, "loss/reg": 0.0, "step": 43370 }, { "epoch": 0.28539473684210526, "grad_norm": 2.859375, "grad_norm_var": 0.1448883056640625, "learning_rate": 0.0001, "loss": 2.8698, "loss/crossentropy": 2.2639885544776917, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.2506809413433075, "loss/reg": 0.0, "step": 43380 }, { "epoch": 0.2854605263157895, "grad_norm": 2.28125, "grad_norm_var": 0.1380279541015625, "learning_rate": 0.0001, "loss": 2.833, "loss/crossentropy": 2.3065077543258665, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.20683425068855285, "loss/reg": 0.0, "step": 43390 }, { "epoch": 0.2855263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.07613525390625, "learning_rate": 0.0001, "loss": 2.8844, "loss/crossentropy": 2.1088157147169113, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.19653198271989822, "loss/reg": 0.0, "step": 43400 }, { "epoch": 0.28559210526315787, "grad_norm": 2.1875, "grad_norm_var": 0.0309967041015625, "learning_rate": 0.0001, "loss": 2.9009, "loss/crossentropy": 2.277215671539307, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.22881786823272704, "loss/reg": 0.0, "step": 43410 }, { "epoch": 0.2856578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.15431315104166668, "learning_rate": 0.0001, "loss": 2.8907, "loss/crossentropy": 2.291688120365143, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.1900433823466301, "loss/reg": 0.0, "step": 43420 }, { "epoch": 0.2857236842105263, "grad_norm": 2.5625, "grad_norm_var": 0.16030171712239583, "learning_rate": 0.0001, "loss": 2.876, "loss/crossentropy": 2.3951743960380556, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.24456514418125153, "loss/reg": 0.0, "step": 43430 }, { "epoch": 0.28578947368421054, "grad_norm": 2.34375, "grad_norm_var": 0.030296834309895833, "learning_rate": 0.0001, "loss": 2.8513, "loss/crossentropy": 2.493042540550232, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.20264011174440383, "loss/reg": 0.0, "step": 43440 }, { "epoch": 0.2858552631578947, "grad_norm": 2.296875, "grad_norm_var": 0.0193267822265625, "learning_rate": 0.0001, "loss": 2.9262, "loss/crossentropy": 2.2254455208778383, "loss/hidden": 2.609375, "loss/incoh": 0.0, "loss/logits": 0.20465197637677193, "loss/reg": 0.0, "step": 43450 }, { "epoch": 0.28592105263157896, "grad_norm": 2.515625, "grad_norm_var": 0.0371246337890625, "learning_rate": 0.0001, "loss": 2.9211, "loss/crossentropy": 2.0460331082344054, "loss/hidden": 2.8609375, "loss/incoh": 0.0, "loss/logits": 0.2760195881128311, "loss/reg": 0.0, "step": 43460 }, { "epoch": 0.28598684210526315, "grad_norm": 2.4375, "grad_norm_var": 0.06692301432291667, "learning_rate": 0.0001, "loss": 2.936, "loss/crossentropy": 2.226793646812439, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.22832536995410918, "loss/reg": 0.0, "step": 43470 }, { "epoch": 0.2860526315789474, "grad_norm": 2.484375, "grad_norm_var": 0.07673238118489584, "learning_rate": 0.0001, "loss": 2.8521, "loss/crossentropy": 2.2850674867630003, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.186783504486084, "loss/reg": 0.0, "step": 43480 }, { "epoch": 0.2861184210526316, "grad_norm": 2.375, "grad_norm_var": 0.06207275390625, "learning_rate": 0.0001, "loss": 2.9529, "loss/crossentropy": 2.2625157833099365, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.21787840723991395, "loss/reg": 0.0, "step": 43490 }, { "epoch": 0.28618421052631576, "grad_norm": 2.703125, "grad_norm_var": 0.10569661458333333, "learning_rate": 0.0001, "loss": 2.9641, "loss/crossentropy": 2.2594608426094056, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.217470745742321, "loss/reg": 0.0, "step": 43500 }, { "epoch": 0.28625, "grad_norm": 2.4375, "grad_norm_var": 0.18192952473958332, "learning_rate": 0.0001, "loss": 2.875, "loss/crossentropy": 2.381129193305969, "loss/hidden": 2.575, "loss/incoh": 0.0, "loss/logits": 0.18271760568022727, "loss/reg": 0.0, "step": 43510 }, { "epoch": 0.2863157894736842, "grad_norm": 2.546875, "grad_norm_var": 0.1366607666015625, "learning_rate": 0.0001, "loss": 2.9553, "loss/crossentropy": 2.099366343021393, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.21303561180830002, "loss/reg": 0.0, "step": 43520 }, { "epoch": 0.28638157894736843, "grad_norm": 2.390625, "grad_norm_var": 0.1060211181640625, "learning_rate": 0.0001, "loss": 2.8486, "loss/crossentropy": 2.2939124584197996, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.21459156423807144, "loss/reg": 0.0, "step": 43530 }, { "epoch": 0.2864473684210526, "grad_norm": 2.5625, "grad_norm_var": 0.9132965087890625, "learning_rate": 0.0001, "loss": 2.9422, "loss/crossentropy": 2.260778844356537, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.23691344261169434, "loss/reg": 0.0, "step": 43540 }, { "epoch": 0.28651315789473686, "grad_norm": 2.46875, "grad_norm_var": 0.8979807535807292, "learning_rate": 0.0001, "loss": 2.8914, "loss/crossentropy": 2.3666525959968565, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.26658767461776733, "loss/reg": 0.0, "step": 43550 }, { "epoch": 0.28657894736842104, "grad_norm": 2.5, "grad_norm_var": 0.08554280598958333, "learning_rate": 0.0001, "loss": 2.9762, "loss/crossentropy": 2.2393365621566774, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.24293814599514008, "loss/reg": 0.0, "step": 43560 }, { "epoch": 0.2866447368421053, "grad_norm": 2.390625, "grad_norm_var": 0.11634012858072916, "learning_rate": 0.0001, "loss": 2.9056, "loss/crossentropy": 2.40314177274704, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.20444777905941008, "loss/reg": 0.0, "step": 43570 }, { "epoch": 0.28671052631578947, "grad_norm": 2.71875, "grad_norm_var": 0.21525065104166666, "learning_rate": 0.0001, "loss": 2.9644, "loss/crossentropy": 2.187191128730774, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.2378062218427658, "loss/reg": 0.0, "step": 43580 }, { "epoch": 0.28677631578947366, "grad_norm": 2.609375, "grad_norm_var": 0.0376373291015625, "learning_rate": 0.0001, "loss": 2.8504, "loss/crossentropy": 2.2294183015823363, "loss/hidden": 2.5546875, "loss/incoh": 0.0, "loss/logits": 0.18793374449014663, "loss/reg": 0.0, "step": 43590 }, { "epoch": 0.2868421052631579, "grad_norm": 2.234375, "grad_norm_var": 0.10021870930989583, "learning_rate": 0.0001, "loss": 2.8934, "loss/crossentropy": 2.245951211452484, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.18825952857732772, "loss/reg": 0.0, "step": 43600 }, { "epoch": 0.2869078947368421, "grad_norm": 4.96875, "grad_norm_var": 0.4262603759765625, "learning_rate": 0.0001, "loss": 2.9725, "loss/crossentropy": 2.1506550788879393, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.24849173277616501, "loss/reg": 0.0, "step": 43610 }, { "epoch": 0.2869736842105263, "grad_norm": 2.59375, "grad_norm_var": 0.3882395426432292, "learning_rate": 0.0001, "loss": 2.956, "loss/crossentropy": 2.312927782535553, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.1921321392059326, "loss/reg": 0.0, "step": 43620 }, { "epoch": 0.2870394736842105, "grad_norm": 2.96875, "grad_norm_var": 0.05771484375, "learning_rate": 0.0001, "loss": 2.9528, "loss/crossentropy": 2.0427388191223144, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.20523644536733626, "loss/reg": 0.0, "step": 43630 }, { "epoch": 0.28710526315789475, "grad_norm": 2.015625, "grad_norm_var": 0.09976806640625, "learning_rate": 0.0001, "loss": 2.8574, "loss/crossentropy": 2.1490531086921694, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.17445433661341667, "loss/reg": 0.0, "step": 43640 }, { "epoch": 0.28717105263157894, "grad_norm": 2.359375, "grad_norm_var": 0.18307291666666667, "learning_rate": 0.0001, "loss": 2.9806, "loss/crossentropy": 2.5936270475387575, "loss/hidden": 2.89375, "loss/incoh": 0.0, "loss/logits": 0.2775991588830948, "loss/reg": 0.0, "step": 43650 }, { "epoch": 0.2872368421052632, "grad_norm": 2.453125, "grad_norm_var": 0.1656158447265625, "learning_rate": 0.0001, "loss": 2.927, "loss/crossentropy": 2.2534231781959533, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.24972749650478362, "loss/reg": 0.0, "step": 43660 }, { "epoch": 0.28730263157894737, "grad_norm": 2.03125, "grad_norm_var": 0.106103515625, "learning_rate": 0.0001, "loss": 2.9426, "loss/crossentropy": 2.353523164987564, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.21333006471395494, "loss/reg": 0.0, "step": 43670 }, { "epoch": 0.2873684210526316, "grad_norm": 2.5625, "grad_norm_var": 0.13334859212239583, "learning_rate": 0.0001, "loss": 2.9588, "loss/crossentropy": 2.426537847518921, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.25118381828069686, "loss/reg": 0.0, "step": 43680 }, { "epoch": 0.2874342105263158, "grad_norm": 2.3125, "grad_norm_var": 0.0178619384765625, "learning_rate": 0.0001, "loss": 2.8305, "loss/crossentropy": 2.1598091602325438, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.21664716601371764, "loss/reg": 0.0, "step": 43690 }, { "epoch": 0.2875, "grad_norm": 2.609375, "grad_norm_var": 0.024022420247395832, "learning_rate": 0.0001, "loss": 2.8952, "loss/crossentropy": 2.1410681873559954, "loss/hidden": 2.5671875, "loss/incoh": 0.0, "loss/logits": 0.1833462379872799, "loss/reg": 0.0, "step": 43700 }, { "epoch": 0.2875657894736842, "grad_norm": 2.5625, "grad_norm_var": 0.0342681884765625, "learning_rate": 0.0001, "loss": 2.866, "loss/crossentropy": 2.1174031019210817, "loss/hidden": 2.5703125, "loss/incoh": 0.0, "loss/logits": 0.1871117517352104, "loss/reg": 0.0, "step": 43710 }, { "epoch": 0.2876315789473684, "grad_norm": 2.578125, "grad_norm_var": 0.042699178059895836, "learning_rate": 0.0001, "loss": 2.8969, "loss/crossentropy": 2.275778961181641, "loss/hidden": 2.8625, "loss/incoh": 0.0, "loss/logits": 0.2656059965491295, "loss/reg": 0.0, "step": 43720 }, { "epoch": 0.28769736842105265, "grad_norm": 2.46875, "grad_norm_var": 0.027106730143229167, "learning_rate": 0.0001, "loss": 2.8843, "loss/crossentropy": 2.2238790929317473, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.23833621367812158, "loss/reg": 0.0, "step": 43730 }, { "epoch": 0.28776315789473683, "grad_norm": 2.40625, "grad_norm_var": 0.06194661458333333, "learning_rate": 0.0001, "loss": 2.8914, "loss/crossentropy": 2.3475514531135557, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.2025994226336479, "loss/reg": 0.0, "step": 43740 }, { "epoch": 0.2878289473684211, "grad_norm": 2.25, "grad_norm_var": 0.07217508951822917, "learning_rate": 0.0001, "loss": 2.8506, "loss/crossentropy": 2.0865403711795807, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.19129689037799835, "loss/reg": 0.0, "step": 43750 }, { "epoch": 0.28789473684210526, "grad_norm": 3.109375, "grad_norm_var": 0.2137115478515625, "learning_rate": 0.0001, "loss": 2.9077, "loss/crossentropy": 2.236695647239685, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.19605601131916045, "loss/reg": 0.0, "step": 43760 }, { "epoch": 0.2879605263157895, "grad_norm": 2.59375, "grad_norm_var": 0.20093994140625, "learning_rate": 0.0001, "loss": 2.8762, "loss/crossentropy": 2.325711560249329, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.2332364372909069, "loss/reg": 0.0, "step": 43770 }, { "epoch": 0.2880263157894737, "grad_norm": 2.109375, "grad_norm_var": 0.12214253743489584, "learning_rate": 0.0001, "loss": 2.8895, "loss/crossentropy": 2.2797622442245484, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.2048424243927002, "loss/reg": 0.0, "step": 43780 }, { "epoch": 0.28809210526315787, "grad_norm": 2.265625, "grad_norm_var": 0.12011311848958334, "learning_rate": 0.0001, "loss": 2.862, "loss/crossentropy": 2.193244618177414, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.2567788928747177, "loss/reg": 0.0, "step": 43790 }, { "epoch": 0.2881578947368421, "grad_norm": 2.265625, "grad_norm_var": 0.035933430989583334, "learning_rate": 0.0001, "loss": 2.8103, "loss/crossentropy": 2.2069244503974916, "loss/hidden": 2.5671875, "loss/incoh": 0.0, "loss/logits": 0.17846334353089333, "loss/reg": 0.0, "step": 43800 }, { "epoch": 0.2882236842105263, "grad_norm": 3.875, "grad_norm_var": 0.1794586181640625, "learning_rate": 0.0001, "loss": 2.854, "loss/crossentropy": 2.303934109210968, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.20203901827335358, "loss/reg": 0.0, "step": 43810 }, { "epoch": 0.28828947368421054, "grad_norm": 2.328125, "grad_norm_var": 0.17466532389322917, "learning_rate": 0.0001, "loss": 2.9517, "loss/crossentropy": 2.32292320728302, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.2085458070039749, "loss/reg": 0.0, "step": 43820 }, { "epoch": 0.2883552631578947, "grad_norm": 2.703125, "grad_norm_var": 0.0373443603515625, "learning_rate": 0.0001, "loss": 2.9042, "loss/crossentropy": 2.39540011882782, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.20445933938026428, "loss/reg": 0.0, "step": 43830 }, { "epoch": 0.28842105263157897, "grad_norm": 2.53125, "grad_norm_var": 0.05032145182291667, "learning_rate": 0.0001, "loss": 2.8811, "loss/crossentropy": 2.2399554371833803, "loss/hidden": 2.575, "loss/incoh": 0.0, "loss/logits": 0.20801245272159577, "loss/reg": 0.0, "step": 43840 }, { "epoch": 0.28848684210526315, "grad_norm": 2.484375, "grad_norm_var": 0.0369049072265625, "learning_rate": 0.0001, "loss": 2.8878, "loss/crossentropy": 2.277040088176727, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.19514358937740325, "loss/reg": 0.0, "step": 43850 }, { "epoch": 0.2885526315789474, "grad_norm": 2.796875, "grad_norm_var": 0.03703511555989583, "learning_rate": 0.0001, "loss": 2.948, "loss/crossentropy": 2.4415757775306703, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.24605545103549958, "loss/reg": 0.0, "step": 43860 }, { "epoch": 0.2886184210526316, "grad_norm": 2.5625, "grad_norm_var": 0.07614644368489583, "learning_rate": 0.0001, "loss": 2.9629, "loss/crossentropy": 2.106474182009697, "loss/hidden": 2.540625, "loss/incoh": 0.0, "loss/logits": 0.17892866916954517, "loss/reg": 0.0, "step": 43870 }, { "epoch": 0.28868421052631577, "grad_norm": 2.5625, "grad_norm_var": 0.0381744384765625, "learning_rate": 0.0001, "loss": 2.8951, "loss/crossentropy": 2.2741947531700135, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.21769139170646667, "loss/reg": 0.0, "step": 43880 }, { "epoch": 0.28875, "grad_norm": 2.203125, "grad_norm_var": 0.1417877197265625, "learning_rate": 0.0001, "loss": 2.962, "loss/crossentropy": 2.2644962430000306, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.2914708510041237, "loss/reg": 0.0, "step": 43890 }, { "epoch": 0.2888157894736842, "grad_norm": 2.484375, "grad_norm_var": 0.05133056640625, "learning_rate": 0.0001, "loss": 2.902, "loss/crossentropy": 2.0679893016815187, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.20461439192295075, "loss/reg": 0.0, "step": 43900 }, { "epoch": 0.28888157894736843, "grad_norm": 2.765625, "grad_norm_var": 0.063330078125, "learning_rate": 0.0001, "loss": 2.8894, "loss/crossentropy": 2.241969954967499, "loss/hidden": 2.5578125, "loss/incoh": 0.0, "loss/logits": 0.18241164088249207, "loss/reg": 0.0, "step": 43910 }, { "epoch": 0.2889473684210526, "grad_norm": 2.546875, "grad_norm_var": 0.024250284830729166, "learning_rate": 0.0001, "loss": 2.8613, "loss/crossentropy": 2.2248759984970095, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.19123225659132004, "loss/reg": 0.0, "step": 43920 }, { "epoch": 0.28901315789473686, "grad_norm": 2.78125, "grad_norm_var": 0.10695699055989584, "learning_rate": 0.0001, "loss": 2.9693, "loss/crossentropy": 2.2456079959869384, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.2504643514752388, "loss/reg": 0.0, "step": 43930 }, { "epoch": 0.28907894736842105, "grad_norm": 2.453125, "grad_norm_var": 0.029361979166666666, "learning_rate": 0.0001, "loss": 2.8948, "loss/crossentropy": 1.9807691693305969, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.19716739431023597, "loss/reg": 0.0, "step": 43940 }, { "epoch": 0.2891447368421053, "grad_norm": 2.328125, "grad_norm_var": 0.02373046875, "learning_rate": 0.0001, "loss": 2.9681, "loss/crossentropy": 2.4408227682113646, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.23912132233381272, "loss/reg": 0.0, "step": 43950 }, { "epoch": 0.2892105263157895, "grad_norm": 2.546875, "grad_norm_var": 0.03590087890625, "learning_rate": 0.0001, "loss": 2.9187, "loss/crossentropy": 2.2150730013847353, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.297735433280468, "loss/reg": 0.0, "step": 43960 }, { "epoch": 0.28927631578947366, "grad_norm": 2.5, "grad_norm_var": 0.061328125, "learning_rate": 0.0001, "loss": 2.9443, "loss/crossentropy": 2.221993064880371, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.19261694550514222, "loss/reg": 0.0, "step": 43970 }, { "epoch": 0.2893421052631579, "grad_norm": 2.78125, "grad_norm_var": 0.0856597900390625, "learning_rate": 0.0001, "loss": 2.923, "loss/crossentropy": 2.5590139389038087, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2841035008430481, "loss/reg": 0.0, "step": 43980 }, { "epoch": 0.2894078947368421, "grad_norm": 2.71875, "grad_norm_var": 0.0741363525390625, "learning_rate": 0.0001, "loss": 2.8746, "loss/crossentropy": 1.924873685836792, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.1860344648361206, "loss/reg": 0.0, "step": 43990 }, { "epoch": 0.2894736842105263, "grad_norm": 2.078125, "grad_norm_var": 0.05142822265625, "learning_rate": 0.0001, "loss": 2.8797, "loss/crossentropy": 2.270802712440491, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.28401806205511093, "loss/reg": 0.0, "step": 44000 }, { "epoch": 0.2895394736842105, "grad_norm": 2.421875, "grad_norm_var": 0.02955322265625, "learning_rate": 0.0001, "loss": 2.8847, "loss/crossentropy": 2.505043661594391, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.2222891017794609, "loss/reg": 0.0, "step": 44010 }, { "epoch": 0.28960526315789475, "grad_norm": 2.28125, "grad_norm_var": 0.030013020833333334, "learning_rate": 0.0001, "loss": 2.8794, "loss/crossentropy": 2.244774329662323, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.2427571013569832, "loss/reg": 0.0, "step": 44020 }, { "epoch": 0.28967105263157894, "grad_norm": 2.21875, "grad_norm_var": 0.0271636962890625, "learning_rate": 0.0001, "loss": 2.8317, "loss/crossentropy": 2.2546144366264342, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.23028647750616074, "loss/reg": 0.0, "step": 44030 }, { "epoch": 0.2897368421052632, "grad_norm": 3.140625, "grad_norm_var": 0.0588287353515625, "learning_rate": 0.0001, "loss": 2.9189, "loss/crossentropy": 2.353494131565094, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.19220632463693618, "loss/reg": 0.0, "step": 44040 }, { "epoch": 0.28980263157894737, "grad_norm": 2.546875, "grad_norm_var": 0.08026936848958334, "learning_rate": 0.0001, "loss": 2.8553, "loss/crossentropy": 2.562294936180115, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.20318009629845618, "loss/reg": 0.0, "step": 44050 }, { "epoch": 0.28986842105263155, "grad_norm": 2.203125, "grad_norm_var": 0.0839263916015625, "learning_rate": 0.0001, "loss": 2.8848, "loss/crossentropy": 2.2914687156677247, "loss/hidden": 2.5859375, "loss/incoh": 0.0, "loss/logits": 0.19924205392599106, "loss/reg": 0.0, "step": 44060 }, { "epoch": 0.2899342105263158, "grad_norm": 2.15625, "grad_norm_var": 0.0777740478515625, "learning_rate": 0.0001, "loss": 2.8031, "loss/crossentropy": 2.2433938503265383, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.23954728841781617, "loss/reg": 0.0, "step": 44070 }, { "epoch": 0.29, "grad_norm": 2.90625, "grad_norm_var": 0.08421223958333333, "learning_rate": 0.0001, "loss": 2.9111, "loss/crossentropy": 2.1184794545173644, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.20742128640413285, "loss/reg": 0.0, "step": 44080 }, { "epoch": 0.2900657894736842, "grad_norm": 2.390625, "grad_norm_var": 0.17042643229166668, "learning_rate": 0.0001, "loss": 2.8315, "loss/crossentropy": 2.4952237367630006, "loss/hidden": 2.5453125, "loss/incoh": 0.0, "loss/logits": 0.2358098953962326, "loss/reg": 0.0, "step": 44090 }, { "epoch": 0.2901315789473684, "grad_norm": 2.375, "grad_norm_var": 0.0303131103515625, "learning_rate": 0.0001, "loss": 2.8861, "loss/crossentropy": 2.2693952560424804, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.21331281661987306, "loss/reg": 0.0, "step": 44100 }, { "epoch": 0.29019736842105265, "grad_norm": 2.328125, "grad_norm_var": 0.052953084309895836, "learning_rate": 0.0001, "loss": 2.9031, "loss/crossentropy": 2.4467560291290282, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.2067430779337883, "loss/reg": 0.0, "step": 44110 }, { "epoch": 0.29026315789473683, "grad_norm": 2.203125, "grad_norm_var": 0.04892171223958333, "learning_rate": 0.0001, "loss": 2.8471, "loss/crossentropy": 2.0895180702209473, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.1865648239850998, "loss/reg": 0.0, "step": 44120 }, { "epoch": 0.2903289473684211, "grad_norm": 2.515625, "grad_norm_var": 0.045556640625, "learning_rate": 0.0001, "loss": 2.8598, "loss/crossentropy": 1.9967158436775208, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.21473891139030457, "loss/reg": 0.0, "step": 44130 }, { "epoch": 0.29039473684210526, "grad_norm": 2.53125, "grad_norm_var": 0.04460347493489583, "learning_rate": 0.0001, "loss": 2.8581, "loss/crossentropy": 2.0971244096755983, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.19329497888684272, "loss/reg": 0.0, "step": 44140 }, { "epoch": 0.29046052631578945, "grad_norm": 2.53125, "grad_norm_var": 0.04951883951822917, "learning_rate": 0.0001, "loss": 2.8796, "loss/crossentropy": 2.33571001291275, "loss/hidden": 2.5234375, "loss/incoh": 0.0, "loss/logits": 0.1916544705629349, "loss/reg": 0.0, "step": 44150 }, { "epoch": 0.2905263157894737, "grad_norm": 2.296875, "grad_norm_var": 0.07965087890625, "learning_rate": 0.0001, "loss": 2.9433, "loss/crossentropy": 2.199085795879364, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.23039796352386474, "loss/reg": 0.0, "step": 44160 }, { "epoch": 0.2905921052631579, "grad_norm": 2.796875, "grad_norm_var": 0.0462310791015625, "learning_rate": 0.0001, "loss": 2.8692, "loss/crossentropy": 1.9691104173660279, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.27670957446098327, "loss/reg": 0.0, "step": 44170 }, { "epoch": 0.2906578947368421, "grad_norm": 2.4375, "grad_norm_var": 0.0419830322265625, "learning_rate": 0.0001, "loss": 2.9071, "loss/crossentropy": 2.414106321334839, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.20228614285588264, "loss/reg": 0.0, "step": 44180 }, { "epoch": 0.2907236842105263, "grad_norm": 2.546875, "grad_norm_var": 0.051268513997395834, "learning_rate": 0.0001, "loss": 2.8711, "loss/crossentropy": 2.1124113619327547, "loss/hidden": 2.534375, "loss/incoh": 0.0, "loss/logits": 0.19734383448958398, "loss/reg": 0.0, "step": 44190 }, { "epoch": 0.29078947368421054, "grad_norm": 2.203125, "grad_norm_var": 0.53990478515625, "learning_rate": 0.0001, "loss": 2.8983, "loss/crossentropy": 2.6030133843421934, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.223942731320858, "loss/reg": 0.0, "step": 44200 }, { "epoch": 0.29085526315789473, "grad_norm": 2.25, "grad_norm_var": 0.5744293212890625, "learning_rate": 0.0001, "loss": 2.9142, "loss/crossentropy": 2.4371443510055544, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.21589200645685197, "loss/reg": 0.0, "step": 44210 }, { "epoch": 0.29092105263157897, "grad_norm": 2.4375, "grad_norm_var": 0.055826822916666664, "learning_rate": 0.0001, "loss": 2.8637, "loss/crossentropy": 2.410751664638519, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.21393486708402634, "loss/reg": 0.0, "step": 44220 }, { "epoch": 0.29098684210526315, "grad_norm": 2.546875, "grad_norm_var": 0.09482014973958333, "learning_rate": 0.0001, "loss": 2.9603, "loss/crossentropy": 2.324754476547241, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.24492140412330626, "loss/reg": 0.0, "step": 44230 }, { "epoch": 0.2910526315789474, "grad_norm": 2.515625, "grad_norm_var": 0.0699127197265625, "learning_rate": 0.0001, "loss": 2.9381, "loss/crossentropy": 2.441379189491272, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.25381250530481336, "loss/reg": 0.0, "step": 44240 }, { "epoch": 0.2911184210526316, "grad_norm": 2.546875, "grad_norm_var": 0.0202545166015625, "learning_rate": 0.0001, "loss": 2.8371, "loss/crossentropy": 1.93411483168602, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.18959135562181473, "loss/reg": 0.0, "step": 44250 }, { "epoch": 0.29118421052631577, "grad_norm": 2.375, "grad_norm_var": 0.019303385416666666, "learning_rate": 0.0001, "loss": 2.8869, "loss/crossentropy": 2.29964417219162, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.21715250611305237, "loss/reg": 0.0, "step": 44260 }, { "epoch": 0.29125, "grad_norm": 3.296875, "grad_norm_var": 0.06917215983072916, "learning_rate": 0.0001, "loss": 2.8812, "loss/crossentropy": 2.504518961906433, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.2181348979473114, "loss/reg": 0.0, "step": 44270 }, { "epoch": 0.2913157894736842, "grad_norm": 2.75, "grad_norm_var": 0.0833160400390625, "learning_rate": 0.0001, "loss": 2.9741, "loss/crossentropy": 2.0311013221740724, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.2127850979566574, "loss/reg": 0.0, "step": 44280 }, { "epoch": 0.29138157894736844, "grad_norm": 2.5625, "grad_norm_var": 0.2720865885416667, "learning_rate": 0.0001, "loss": 2.9591, "loss/crossentropy": 2.011301353573799, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.20991104319691659, "loss/reg": 0.0, "step": 44290 }, { "epoch": 0.2914473684210526, "grad_norm": 2.328125, "grad_norm_var": 0.23811442057291668, "learning_rate": 0.0001, "loss": 2.9188, "loss/crossentropy": 2.344858396053314, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.20629321187734603, "loss/reg": 0.0, "step": 44300 }, { "epoch": 0.29151315789473686, "grad_norm": 2.328125, "grad_norm_var": 0.23449605305989582, "learning_rate": 0.0001, "loss": 2.8888, "loss/crossentropy": 2.0987025618553163, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.2720726072788239, "loss/reg": 0.0, "step": 44310 }, { "epoch": 0.29157894736842105, "grad_norm": 2.921875, "grad_norm_var": 0.0304351806640625, "learning_rate": 0.0001, "loss": 2.942, "loss/crossentropy": 2.514825201034546, "loss/hidden": 2.85625, "loss/incoh": 0.0, "loss/logits": 0.28860004246234894, "loss/reg": 0.0, "step": 44320 }, { "epoch": 0.2916447368421053, "grad_norm": 2.125, "grad_norm_var": 0.04784749348958333, "learning_rate": 0.0001, "loss": 2.8938, "loss/crossentropy": 2.1344955801963805, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.20094945505261422, "loss/reg": 0.0, "step": 44330 }, { "epoch": 0.2917105263157895, "grad_norm": 2.4375, "grad_norm_var": 0.5031321207682292, "learning_rate": 0.0001, "loss": 2.8478, "loss/crossentropy": 2.3695374608039854, "loss/hidden": 2.5296875, "loss/incoh": 0.0, "loss/logits": 0.2018287569284439, "loss/reg": 0.0, "step": 44340 }, { "epoch": 0.29177631578947366, "grad_norm": 2.1875, "grad_norm_var": 0.5140777587890625, "learning_rate": 0.0001, "loss": 2.9199, "loss/crossentropy": 2.3624900221824645, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.20448887199163437, "loss/reg": 0.0, "step": 44350 }, { "epoch": 0.2918421052631579, "grad_norm": 2.4375, "grad_norm_var": 0.0560546875, "learning_rate": 0.0001, "loss": 2.9299, "loss/crossentropy": 2.1106066584587095, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.23115261644124985, "loss/reg": 0.0, "step": 44360 }, { "epoch": 0.2919078947368421, "grad_norm": 2.34375, "grad_norm_var": 0.07128499348958334, "learning_rate": 0.0001, "loss": 2.9052, "loss/crossentropy": 2.4425766229629517, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.2108648642897606, "loss/reg": 0.0, "step": 44370 }, { "epoch": 0.29197368421052633, "grad_norm": 2.265625, "grad_norm_var": 0.03076171875, "learning_rate": 0.0001, "loss": 2.8899, "loss/crossentropy": 2.2634895920753477, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.22594818025827407, "loss/reg": 0.0, "step": 44380 }, { "epoch": 0.2920394736842105, "grad_norm": 2.28125, "grad_norm_var": 0.0407379150390625, "learning_rate": 0.0001, "loss": 2.8771, "loss/crossentropy": 2.119717907905579, "loss/hidden": 2.853125, "loss/incoh": 0.0, "loss/logits": 0.22816947847604752, "loss/reg": 0.0, "step": 44390 }, { "epoch": 0.29210526315789476, "grad_norm": 2.453125, "grad_norm_var": 0.04156494140625, "learning_rate": 0.0001, "loss": 2.9005, "loss/crossentropy": 2.6144713282585146, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.2645583629608154, "loss/reg": 0.0, "step": 44400 }, { "epoch": 0.29217105263157894, "grad_norm": 2.265625, "grad_norm_var": 0.04833577473958333, "learning_rate": 0.0001, "loss": 2.9017, "loss/crossentropy": 2.2905097723007204, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.25144878029823303, "loss/reg": 0.0, "step": 44410 }, { "epoch": 0.2922368421052632, "grad_norm": 2.9375, "grad_norm_var": 0.10294596354166667, "learning_rate": 0.0001, "loss": 2.9136, "loss/crossentropy": 2.378885769844055, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.22462185621261596, "loss/reg": 0.0, "step": 44420 }, { "epoch": 0.29230263157894737, "grad_norm": 2.59375, "grad_norm_var": 0.2397613525390625, "learning_rate": 0.0001, "loss": 2.9157, "loss/crossentropy": 2.237425982952118, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.27028548568487165, "loss/reg": 0.0, "step": 44430 }, { "epoch": 0.29236842105263156, "grad_norm": 2.875, "grad_norm_var": 0.2059478759765625, "learning_rate": 0.0001, "loss": 2.9107, "loss/crossentropy": 2.2707148790359497, "loss/hidden": 2.609375, "loss/incoh": 0.0, "loss/logits": 0.2450641006231308, "loss/reg": 0.0, "step": 44440 }, { "epoch": 0.2924342105263158, "grad_norm": 2.203125, "grad_norm_var": 0.047652180989583334, "learning_rate": 0.0001, "loss": 2.9173, "loss/crossentropy": 2.169828236103058, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.18287437111139299, "loss/reg": 0.0, "step": 44450 }, { "epoch": 0.2925, "grad_norm": 2.46875, "grad_norm_var": 0.04554036458333333, "learning_rate": 0.0001, "loss": 2.9596, "loss/crossentropy": 1.7703922271728516, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.22647661939263344, "loss/reg": 0.0, "step": 44460 }, { "epoch": 0.2925657894736842, "grad_norm": 2.296875, "grad_norm_var": 0.05278218587239583, "learning_rate": 0.0001, "loss": 2.8576, "loss/crossentropy": 2.29345440864563, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.20200318098068237, "loss/reg": 0.0, "step": 44470 }, { "epoch": 0.2926315789473684, "grad_norm": 2.421875, "grad_norm_var": 0.045361328125, "learning_rate": 0.0001, "loss": 2.8876, "loss/crossentropy": 1.9010921955108642, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.19372418969869615, "loss/reg": 0.0, "step": 44480 }, { "epoch": 0.29269736842105265, "grad_norm": 2.21875, "grad_norm_var": 0.036742146809895834, "learning_rate": 0.0001, "loss": 2.9433, "loss/crossentropy": 2.5401357293128966, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.23779045343399047, "loss/reg": 0.0, "step": 44490 }, { "epoch": 0.29276315789473684, "grad_norm": 2.625, "grad_norm_var": 0.11043192545572916, "learning_rate": 0.0001, "loss": 2.9045, "loss/crossentropy": 2.456793177127838, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.20541274249553682, "loss/reg": 0.0, "step": 44500 }, { "epoch": 0.2928289473684211, "grad_norm": 2.46875, "grad_norm_var": 0.0562408447265625, "learning_rate": 0.0001, "loss": 2.9996, "loss/crossentropy": 2.3363917231559754, "loss/hidden": 2.5109375, "loss/incoh": 0.0, "loss/logits": 0.19056100845336915, "loss/reg": 0.0, "step": 44510 }, { "epoch": 0.29289473684210526, "grad_norm": 1.953125, "grad_norm_var": 0.04414774576822917, "learning_rate": 0.0001, "loss": 2.8734, "loss/crossentropy": 2.2348657965660097, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.197630175948143, "loss/reg": 0.0, "step": 44520 }, { "epoch": 0.29296052631578945, "grad_norm": 2.15625, "grad_norm_var": 0.04695002237955729, "learning_rate": 0.0001, "loss": 2.8307, "loss/crossentropy": 2.23298202753067, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.20767777115106584, "loss/reg": 0.0, "step": 44530 }, { "epoch": 0.2930263157894737, "grad_norm": 2.625, "grad_norm_var": 0.029890696207682293, "learning_rate": 0.0001, "loss": 2.8536, "loss/crossentropy": 2.300823438167572, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.202718086540699, "loss/reg": 0.0, "step": 44540 }, { "epoch": 0.2930921052631579, "grad_norm": 2.40625, "grad_norm_var": 0.08928120930989583, "learning_rate": 0.0001, "loss": 2.8749, "loss/crossentropy": 2.007755708694458, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.2166401542723179, "loss/reg": 0.0, "step": 44550 }, { "epoch": 0.2931578947368421, "grad_norm": 2.390625, "grad_norm_var": 0.07295913696289062, "learning_rate": 0.0001, "loss": 2.8757, "loss/crossentropy": 2.3889171957969664, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.20355958342552186, "loss/reg": 0.0, "step": 44560 }, { "epoch": 0.2932236842105263, "grad_norm": 2.453125, "grad_norm_var": 0.5293131510416667, "learning_rate": 0.0001, "loss": 2.8751, "loss/crossentropy": 2.59027259349823, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.26287306249141695, "loss/reg": 0.0, "step": 44570 }, { "epoch": 0.29328947368421054, "grad_norm": 2.578125, "grad_norm_var": 0.5596018473307292, "learning_rate": 0.0001, "loss": 2.8711, "loss/crossentropy": 2.390466403961182, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.21787833869457246, "loss/reg": 0.0, "step": 44580 }, { "epoch": 0.29335526315789473, "grad_norm": 2.3125, "grad_norm_var": 0.15518290201822918, "learning_rate": 0.0001, "loss": 2.8985, "loss/crossentropy": 2.3344136834144593, "loss/hidden": 2.8453125, "loss/incoh": 0.0, "loss/logits": 0.2769021064043045, "loss/reg": 0.0, "step": 44590 }, { "epoch": 0.29342105263157897, "grad_norm": 2.8125, "grad_norm_var": 0.0267578125, "learning_rate": 0.0001, "loss": 2.8864, "loss/crossentropy": 2.2177732944488526, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.20458547919988632, "loss/reg": 0.0, "step": 44600 }, { "epoch": 0.29348684210526316, "grad_norm": 2.28125, "grad_norm_var": 0.03173421223958333, "learning_rate": 0.0001, "loss": 2.8689, "loss/crossentropy": 2.2422897934913637, "loss/hidden": 2.5421875, "loss/incoh": 0.0, "loss/logits": 0.20795199647545815, "loss/reg": 0.0, "step": 44610 }, { "epoch": 0.29355263157894734, "grad_norm": 2.328125, "grad_norm_var": 0.0489898681640625, "learning_rate": 0.0001, "loss": 2.9578, "loss/crossentropy": 2.127839004993439, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.2462543874979019, "loss/reg": 0.0, "step": 44620 }, { "epoch": 0.2936184210526316, "grad_norm": 2399141888.0, "grad_norm_var": 3.597426116831521e+17, "learning_rate": 0.0001, "loss": 2.9766, "loss/crossentropy": 2.2330288529396056, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.20840450823307038, "loss/reg": 0.0, "step": 44630 }, { "epoch": 0.29368421052631577, "grad_norm": 2.46875, "grad_norm_var": 3.597426116337948e+17, "learning_rate": 0.0001, "loss": 3.0194, "loss/crossentropy": 2.2751620769500733, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.260174798220396, "loss/reg": 0.0, "step": 44640 }, { "epoch": 0.29375, "grad_norm": 2.4375, "grad_norm_var": 0.04641520182291667, "learning_rate": 0.0001, "loss": 2.9085, "loss/crossentropy": 2.3432190775871278, "loss/hidden": 2.5609375, "loss/incoh": 0.0, "loss/logits": 0.23176079839468003, "loss/reg": 0.0, "step": 44650 }, { "epoch": 0.2938157894736842, "grad_norm": 2.171875, "grad_norm_var": 0.028125, "learning_rate": 0.0001, "loss": 2.8382, "loss/crossentropy": 2.12693852186203, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.19466151297092438, "loss/reg": 0.0, "step": 44660 }, { "epoch": 0.29388157894736844, "grad_norm": 2.328125, "grad_norm_var": 0.09014383951822917, "learning_rate": 0.0001, "loss": 2.9214, "loss/crossentropy": 2.3793718457221984, "loss/hidden": 2.5546875, "loss/incoh": 0.0, "loss/logits": 0.2262612372636795, "loss/reg": 0.0, "step": 44670 }, { "epoch": 0.2939473684210526, "grad_norm": 2.6875, "grad_norm_var": 0.0902008056640625, "learning_rate": 0.0001, "loss": 2.8956, "loss/crossentropy": 2.3033525943756104, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.22931652516126633, "loss/reg": 0.0, "step": 44680 }, { "epoch": 0.29401315789473687, "grad_norm": 2.421875, "grad_norm_var": 0.10585835774739584, "learning_rate": 0.0001, "loss": 2.8955, "loss/crossentropy": 2.3368748664855956, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.2073671281337738, "loss/reg": 0.0, "step": 44690 }, { "epoch": 0.29407894736842105, "grad_norm": 2.53125, "grad_norm_var": 0.15390218098958333, "learning_rate": 0.0001, "loss": 2.863, "loss/crossentropy": 2.032200348377228, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.19209973216056825, "loss/reg": 0.0, "step": 44700 }, { "epoch": 0.29414473684210524, "grad_norm": 2.0625, "grad_norm_var": 4.616893297344577e+17, "learning_rate": 0.0001, "loss": 2.9583, "loss/crossentropy": 2.205097663402557, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.21937327682971955, "loss/reg": 0.0, "step": 44710 }, { "epoch": 0.2942105263157895, "grad_norm": 2.421875, "grad_norm_var": 4.6168932974012006e+17, "learning_rate": 0.0001, "loss": 2.8681, "loss/crossentropy": 2.4582468628883363, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.22413594126701356, "loss/reg": 0.0, "step": 44720 }, { "epoch": 0.29427631578947366, "grad_norm": 2.65625, "grad_norm_var": 0.0855133056640625, "learning_rate": 0.0001, "loss": 2.9675, "loss/crossentropy": 2.1345455288887023, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.23042114526033403, "loss/reg": 0.0, "step": 44730 }, { "epoch": 0.2943421052631579, "grad_norm": 2.40625, "grad_norm_var": 0.07298177083333333, "learning_rate": 0.0001, "loss": 2.8608, "loss/crossentropy": 2.05478475689888, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.21500623300671579, "loss/reg": 0.0, "step": 44740 }, { "epoch": 0.2944078947368421, "grad_norm": 3.234375, "grad_norm_var": 0.12776285807291668, "learning_rate": 0.0001, "loss": 2.9412, "loss/crossentropy": 2.135305869579315, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.2145572282373905, "loss/reg": 0.0, "step": 44750 }, { "epoch": 0.29447368421052633, "grad_norm": 2.234375, "grad_norm_var": 0.12942606608072918, "learning_rate": 0.0001, "loss": 2.9788, "loss/crossentropy": 2.2340178370475767, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.2626689672470093, "loss/reg": 0.0, "step": 44760 }, { "epoch": 0.2945394736842105, "grad_norm": 2.671875, "grad_norm_var": 0.13245340983072917, "learning_rate": 0.0001, "loss": 2.8741, "loss/crossentropy": 2.149627870321274, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.17575221583247186, "loss/reg": 0.0, "step": 44770 }, { "epoch": 0.29460526315789476, "grad_norm": 2.46875, "grad_norm_var": 0.025748697916666667, "learning_rate": 0.0001, "loss": 2.8851, "loss/crossentropy": 2.32595431804657, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.22441416531801223, "loss/reg": 0.0, "step": 44780 }, { "epoch": 0.29467105263157894, "grad_norm": 2.515625, "grad_norm_var": 0.0297760009765625, "learning_rate": 0.0001, "loss": 2.9476, "loss/crossentropy": 2.247841775417328, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.22422362267971038, "loss/reg": 0.0, "step": 44790 }, { "epoch": 0.29473684210526313, "grad_norm": 2.359375, "grad_norm_var": 0.04607747395833333, "learning_rate": 0.0001, "loss": 2.9456, "loss/crossentropy": 2.32385675907135, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.2649970054626465, "loss/reg": 0.0, "step": 44800 }, { "epoch": 0.29480263157894737, "grad_norm": 3.09375, "grad_norm_var": 0.099267578125, "learning_rate": 0.0001, "loss": 2.94, "loss/crossentropy": 2.107827401161194, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.2153822124004364, "loss/reg": 0.0, "step": 44810 }, { "epoch": 0.29486842105263156, "grad_norm": 2.34375, "grad_norm_var": 0.08561197916666667, "learning_rate": 0.0001, "loss": 2.8813, "loss/crossentropy": 2.189478170871735, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.19373691380023955, "loss/reg": 0.0, "step": 44820 }, { "epoch": 0.2949342105263158, "grad_norm": 2.859375, "grad_norm_var": 0.04933980305989583, "learning_rate": 0.0001, "loss": 2.9438, "loss/crossentropy": 2.0668108105659484, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.21971826180815696, "loss/reg": 0.0, "step": 44830 }, { "epoch": 0.295, "grad_norm": 2.203125, "grad_norm_var": 0.06331380208333333, "learning_rate": 0.0001, "loss": 2.8512, "loss/crossentropy": 2.159078085422516, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.16200024709105493, "loss/reg": 0.0, "step": 44840 }, { "epoch": 0.2950657894736842, "grad_norm": 2.546875, "grad_norm_var": 0.06672261555989584, "learning_rate": 0.0001, "loss": 2.8774, "loss/crossentropy": 2.419037628173828, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.2086916409432888, "loss/reg": 0.0, "step": 44850 }, { "epoch": 0.2951315789473684, "grad_norm": 2.265625, "grad_norm_var": 0.0802642822265625, "learning_rate": 0.0001, "loss": 2.8497, "loss/crossentropy": 2.1597745537757875, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.21028273850679396, "loss/reg": 0.0, "step": 44860 }, { "epoch": 0.29519736842105265, "grad_norm": 2.984375, "grad_norm_var": 2.575671952197026e+17, "learning_rate": 0.0001, "loss": 3.0289, "loss/crossentropy": 2.571130645275116, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.22348283976316452, "loss/reg": 0.0, "step": 44870 }, { "epoch": 0.29526315789473684, "grad_norm": 2.328125, "grad_norm_var": 2.575671952255178e+17, "learning_rate": 0.0001, "loss": 2.8083, "loss/crossentropy": 2.335205411911011, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.22102317959070206, "loss/reg": 0.0, "step": 44880 }, { "epoch": 0.2953289473684211, "grad_norm": 3.28125, "grad_norm_var": 0.09698893229166666, "learning_rate": 0.0001, "loss": 2.8903, "loss/crossentropy": 2.47419410943985, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.24135932624340056, "loss/reg": 0.0, "step": 44890 }, { "epoch": 0.29539473684210527, "grad_norm": 2.359375, "grad_norm_var": 0.17791239420572916, "learning_rate": 0.0001, "loss": 2.8542, "loss/crossentropy": 2.0170480608940125, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.20553454309701918, "loss/reg": 0.0, "step": 44900 }, { "epoch": 0.29546052631578945, "grad_norm": 3.03125, "grad_norm_var": 0.24402567545572917, "learning_rate": 0.0001, "loss": 2.9037, "loss/crossentropy": 2.4817562222480776, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.26282968968153, "loss/reg": 0.0, "step": 44910 }, { "epoch": 0.2955263157894737, "grad_norm": 2.703125, "grad_norm_var": 0.23791402180989582, "learning_rate": 0.0001, "loss": 2.8782, "loss/crossentropy": 2.170925426483154, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2398928925395012, "loss/reg": 0.0, "step": 44920 }, { "epoch": 0.2955921052631579, "grad_norm": 2.359375, "grad_norm_var": 0.18430074055989584, "learning_rate": 0.0001, "loss": 2.8822, "loss/crossentropy": 2.534380865097046, "loss/hidden": 2.5578125, "loss/incoh": 0.0, "loss/logits": 0.20652569085359573, "loss/reg": 0.0, "step": 44930 }, { "epoch": 0.2956578947368421, "grad_norm": 2.390625, "grad_norm_var": 0.28396809895833336, "learning_rate": 0.0001, "loss": 2.8979, "loss/crossentropy": 2.3678994297981264, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.21856625080108644, "loss/reg": 0.0, "step": 44940 }, { "epoch": 0.2957236842105263, "grad_norm": 2.46875, "grad_norm_var": 0.26363525390625, "learning_rate": 0.0001, "loss": 2.9045, "loss/crossentropy": 2.1923569798469544, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.2561697602272034, "loss/reg": 0.0, "step": 44950 }, { "epoch": 0.29578947368421055, "grad_norm": 2.671875, "grad_norm_var": 0.6543284098307292, "learning_rate": 0.0001, "loss": 2.9922, "loss/crossentropy": 2.362464094161987, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.2340029790997505, "loss/reg": 0.0, "step": 44960 }, { "epoch": 0.29585526315789473, "grad_norm": 2.703125, "grad_norm_var": 0.67841796875, "learning_rate": 0.0001, "loss": 2.834, "loss/crossentropy": 2.160146486759186, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.21961185038089753, "loss/reg": 0.0, "step": 44970 }, { "epoch": 0.295921052631579, "grad_norm": 3.015625, "grad_norm_var": 0.11281636555989584, "learning_rate": 0.0001, "loss": 2.9141, "loss/crossentropy": 2.209792697429657, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.20368139445781708, "loss/reg": 0.0, "step": 44980 }, { "epoch": 0.29598684210526316, "grad_norm": 2.671875, "grad_norm_var": 0.17455952962239582, "learning_rate": 0.0001, "loss": 2.8694, "loss/crossentropy": 2.2400214672088623, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.22141096740961075, "loss/reg": 0.0, "step": 44990 }, { "epoch": 0.29605263157894735, "grad_norm": 2.234375, "grad_norm_var": 0.11776936848958333, "learning_rate": 0.0001, "loss": 2.8526, "loss/crossentropy": 2.4232283234596252, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.21499115973711014, "loss/reg": 0.0, "step": 45000 }, { "epoch": 0.2961184210526316, "grad_norm": 2.46875, "grad_norm_var": 1.8845189439402147e+17, "learning_rate": 0.0001, "loss": 3.0046, "loss/crossentropy": 2.1754011154174804, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.18199468404054642, "loss/reg": 0.0, "step": 45010 }, { "epoch": 0.29618421052631577, "grad_norm": 2.109375, "grad_norm_var": 0.16470947265625, "learning_rate": 0.0001, "loss": 2.9359, "loss/crossentropy": 2.054402434825897, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.20195132344961167, "loss/reg": 0.0, "step": 45020 }, { "epoch": 0.29625, "grad_norm": 2.609375, "grad_norm_var": 0.20591532389322917, "learning_rate": 0.0001, "loss": 2.9542, "loss/crossentropy": 2.135529112815857, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.28090175688266755, "loss/reg": 0.0, "step": 45030 }, { "epoch": 0.2963157894736842, "grad_norm": 2.28125, "grad_norm_var": 0.3951568603515625, "learning_rate": 0.0001, "loss": 2.9499, "loss/crossentropy": 2.2031121969223024, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.22122678495943546, "loss/reg": 0.0, "step": 45040 }, { "epoch": 0.29638157894736844, "grad_norm": 2.296875, "grad_norm_var": 0.34488525390625, "learning_rate": 0.0001, "loss": 2.9565, "loss/crossentropy": 2.1922505140304565, "loss/hidden": 2.53125, "loss/incoh": 0.0, "loss/logits": 0.21252339631319045, "loss/reg": 0.0, "step": 45050 }, { "epoch": 0.2964473684210526, "grad_norm": 2.515625, "grad_norm_var": 0.03573811848958333, "learning_rate": 0.0001, "loss": 2.903, "loss/crossentropy": 2.1779013514518737, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.20681750625371934, "loss/reg": 0.0, "step": 45060 }, { "epoch": 0.29651315789473687, "grad_norm": 2.453125, "grad_norm_var": 0.05342508951822917, "learning_rate": 0.0001, "loss": 2.8965, "loss/crossentropy": 2.2254809021949766, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.21693927720189093, "loss/reg": 0.0, "step": 45070 }, { "epoch": 0.29657894736842105, "grad_norm": 3.0, "grad_norm_var": 0.05478515625, "learning_rate": 0.0001, "loss": 2.9132, "loss/crossentropy": 2.2376030564308165, "loss/hidden": 2.5375, "loss/incoh": 0.0, "loss/logits": 0.22201271131634712, "loss/reg": 0.0, "step": 45080 }, { "epoch": 0.29664473684210524, "grad_norm": 2.5625, "grad_norm_var": 0.031298828125, "learning_rate": 0.0001, "loss": 2.911, "loss/crossentropy": 2.2825773656368256, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.20657121613621712, "loss/reg": 0.0, "step": 45090 }, { "epoch": 0.2967105263157895, "grad_norm": 2.640625, "grad_norm_var": 0.034566243489583336, "learning_rate": 0.0001, "loss": 2.8718, "loss/crossentropy": 2.083207994699478, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.21633562073111534, "loss/reg": 0.0, "step": 45100 }, { "epoch": 0.29677631578947367, "grad_norm": 2.4375, "grad_norm_var": 0.070751953125, "learning_rate": 0.0001, "loss": 2.8913, "loss/crossentropy": 1.9589912176132203, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.2076563499867916, "loss/reg": 0.0, "step": 45110 }, { "epoch": 0.2968421052631579, "grad_norm": 2.390625, "grad_norm_var": 0.038913726806640625, "learning_rate": 0.0001, "loss": 2.8875, "loss/crossentropy": 2.4524953603744506, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.20787258446216583, "loss/reg": 0.0, "step": 45120 }, { "epoch": 0.2969078947368421, "grad_norm": 2.609375, "grad_norm_var": 0.10882161458333334, "learning_rate": 0.0001, "loss": 2.8385, "loss/crossentropy": 2.415916585922241, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.21832268685102463, "loss/reg": 0.0, "step": 45130 }, { "epoch": 0.29697368421052633, "grad_norm": 2.484375, "grad_norm_var": 0.21521708170572917, "learning_rate": 0.0001, "loss": 2.86, "loss/crossentropy": 1.998589563369751, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.22005839571356772, "loss/reg": 0.0, "step": 45140 }, { "epoch": 0.2970394736842105, "grad_norm": 2.21875, "grad_norm_var": 0.6879191080729167, "learning_rate": 0.0001, "loss": 2.8804, "loss/crossentropy": 2.1602801471948623, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.20606082901358605, "loss/reg": 0.0, "step": 45150 }, { "epoch": 0.29710526315789476, "grad_norm": 2.546875, "grad_norm_var": 0.3255686442057292, "learning_rate": 0.0001, "loss": 2.9208, "loss/crossentropy": 2.0087016046047212, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.21927152648568154, "loss/reg": 0.0, "step": 45160 }, { "epoch": 0.29717105263157895, "grad_norm": 2.5, "grad_norm_var": 0.0514068603515625, "learning_rate": 0.0001, "loss": 2.9091, "loss/crossentropy": 2.5549843549728393, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.2396952196955681, "loss/reg": 0.0, "step": 45170 }, { "epoch": 0.29723684210526313, "grad_norm": 2.453125, "grad_norm_var": 0.05968424479166667, "learning_rate": 0.0001, "loss": 2.834, "loss/crossentropy": 2.2432658553123472, "loss/hidden": 2.55625, "loss/incoh": 0.0, "loss/logits": 0.18803772255778312, "loss/reg": 0.0, "step": 45180 }, { "epoch": 0.2973026315789474, "grad_norm": 2.203125, "grad_norm_var": 0.47131754557291666, "learning_rate": 0.0001, "loss": 2.9434, "loss/crossentropy": 2.3188610672950745, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.24268923252820968, "loss/reg": 0.0, "step": 45190 }, { "epoch": 0.29736842105263156, "grad_norm": 2.4375, "grad_norm_var": 0.47000325520833336, "learning_rate": 0.0001, "loss": 2.8531, "loss/crossentropy": 2.2581568241119383, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.196596609801054, "loss/reg": 0.0, "step": 45200 }, { "epoch": 0.2974342105263158, "grad_norm": 2.359375, "grad_norm_var": 0.050126139322916666, "learning_rate": 0.0001, "loss": 2.909, "loss/crossentropy": 2.3120391011238097, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.22595764994621276, "loss/reg": 0.0, "step": 45210 }, { "epoch": 0.2975, "grad_norm": 2.96875, "grad_norm_var": 0.17593994140625, "learning_rate": 0.0001, "loss": 2.9312, "loss/crossentropy": 2.3330196261405947, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.2016739070415497, "loss/reg": 0.0, "step": 45220 }, { "epoch": 0.29756578947368423, "grad_norm": 2.3125, "grad_norm_var": 0.0477935791015625, "learning_rate": 0.0001, "loss": 2.9428, "loss/crossentropy": 1.9430033266544342, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.20442718118429185, "loss/reg": 0.0, "step": 45230 }, { "epoch": 0.2976315789473684, "grad_norm": 2.1875, "grad_norm_var": 0.04070638020833333, "learning_rate": 0.0001, "loss": 2.8501, "loss/crossentropy": 2.3361942529678346, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.22816102206707, "loss/reg": 0.0, "step": 45240 }, { "epoch": 0.29769736842105265, "grad_norm": 2.546875, "grad_norm_var": 0.17150065104166667, "learning_rate": 0.0001, "loss": 2.8146, "loss/crossentropy": 2.457094645500183, "loss/hidden": 2.490625, "loss/incoh": 0.0, "loss/logits": 0.18126515448093414, "loss/reg": 0.0, "step": 45250 }, { "epoch": 0.29776315789473684, "grad_norm": 2.21875, "grad_norm_var": 0.08371480305989583, "learning_rate": 0.0001, "loss": 2.91, "loss/crossentropy": 2.524383175373077, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.2097005546092987, "loss/reg": 0.0, "step": 45260 }, { "epoch": 0.297828947368421, "grad_norm": 4.1875, "grad_norm_var": 0.30914713541666666, "learning_rate": 0.0001, "loss": 2.8672, "loss/crossentropy": 2.3364664196968077, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.2110161989927292, "loss/reg": 0.0, "step": 45270 }, { "epoch": 0.29789473684210527, "grad_norm": 2.125, "grad_norm_var": 0.7207834879557292, "learning_rate": 0.0001, "loss": 2.8551, "loss/crossentropy": 2.0226253271102905, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.19008681252598764, "loss/reg": 0.0, "step": 45280 }, { "epoch": 0.29796052631578945, "grad_norm": 2.234375, "grad_norm_var": 0.6046834309895833, "learning_rate": 0.0001, "loss": 2.9202, "loss/crossentropy": 2.0710988521575926, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.20828001499176024, "loss/reg": 0.0, "step": 45290 }, { "epoch": 0.2980263157894737, "grad_norm": 2.34375, "grad_norm_var": 0.083544921875, "learning_rate": 0.0001, "loss": 2.9351, "loss/crossentropy": 2.360078001022339, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.2156745031476021, "loss/reg": 0.0, "step": 45300 }, { "epoch": 0.2980921052631579, "grad_norm": 2.421875, "grad_norm_var": 0.029585774739583334, "learning_rate": 0.0001, "loss": 2.8723, "loss/crossentropy": 2.3367308497428896, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.28912911415100095, "loss/reg": 0.0, "step": 45310 }, { "epoch": 0.2981578947368421, "grad_norm": 2.390625, "grad_norm_var": 0.05543212890625, "learning_rate": 0.0001, "loss": 2.9095, "loss/crossentropy": 2.316341680288315, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.21053346544504165, "loss/reg": 0.0, "step": 45320 }, { "epoch": 0.2982236842105263, "grad_norm": 2.140625, "grad_norm_var": 0.05133463541666667, "learning_rate": 0.0001, "loss": 2.8948, "loss/crossentropy": 2.070668321847916, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.23948114216327668, "loss/reg": 0.0, "step": 45330 }, { "epoch": 0.29828947368421055, "grad_norm": 2.34375, "grad_norm_var": 0.07511393229166667, "learning_rate": 0.0001, "loss": 2.8215, "loss/crossentropy": 2.2381530523300173, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.18948778808116912, "loss/reg": 0.0, "step": 45340 }, { "epoch": 0.29835526315789473, "grad_norm": 2.46875, "grad_norm_var": 0.14205322265625, "learning_rate": 0.0001, "loss": 2.9682, "loss/crossentropy": 2.2352765440940856, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.21023350208997726, "loss/reg": 0.0, "step": 45350 }, { "epoch": 0.2984210526315789, "grad_norm": 2.46875, "grad_norm_var": 0.150732421875, "learning_rate": 0.0001, "loss": 2.8791, "loss/crossentropy": 2.4872194051742555, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.21265227496623992, "loss/reg": 0.0, "step": 45360 }, { "epoch": 0.29848684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.040266927083333334, "learning_rate": 0.0001, "loss": 2.8536, "loss/crossentropy": 2.249758982658386, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.2461320787668228, "loss/reg": 0.0, "step": 45370 }, { "epoch": 0.29855263157894735, "grad_norm": 2.21875, "grad_norm_var": 0.06700846354166666, "learning_rate": 0.0001, "loss": 2.9077, "loss/crossentropy": 2.149129128456116, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.249987830221653, "loss/reg": 0.0, "step": 45380 }, { "epoch": 0.2986184210526316, "grad_norm": 2.9375, "grad_norm_var": 1.786937459309896, "learning_rate": 0.0001, "loss": 2.9641, "loss/crossentropy": 2.4855190992355345, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2801044538617134, "loss/reg": 0.0, "step": 45390 }, { "epoch": 0.2986842105263158, "grad_norm": 2.453125, "grad_norm_var": 0.03970947265625, "learning_rate": 0.0001, "loss": 2.917, "loss/crossentropy": 2.2864890813827516, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.21102356612682344, "loss/reg": 0.0, "step": 45400 }, { "epoch": 0.29875, "grad_norm": 2.40625, "grad_norm_var": 0.0933013916015625, "learning_rate": 0.0001, "loss": 2.9045, "loss/crossentropy": 2.5197973608970643, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.33027863800525664, "loss/reg": 0.0, "step": 45410 }, { "epoch": 0.2988157894736842, "grad_norm": 2.203125, "grad_norm_var": 0.09870503743489584, "learning_rate": 0.0001, "loss": 2.843, "loss/crossentropy": 2.2372475624084474, "loss/hidden": 2.56875, "loss/incoh": 0.0, "loss/logits": 0.1891390025615692, "loss/reg": 0.0, "step": 45420 }, { "epoch": 0.29888157894736844, "grad_norm": 2.40625, "grad_norm_var": 0.08911844889322916, "learning_rate": 0.0001, "loss": 2.9202, "loss/crossentropy": 2.1673420310020446, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.21711432337760925, "loss/reg": 0.0, "step": 45430 }, { "epoch": 0.29894736842105263, "grad_norm": 2.375, "grad_norm_var": 0.058024088541666664, "learning_rate": 0.0001, "loss": 2.8755, "loss/crossentropy": 2.3685627222061156, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.22005878537893295, "loss/reg": 0.0, "step": 45440 }, { "epoch": 0.29901315789473687, "grad_norm": 2.265625, "grad_norm_var": 0.02750244140625, "learning_rate": 0.0001, "loss": 2.94, "loss/crossentropy": 2.129115641117096, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.19235717505216599, "loss/reg": 0.0, "step": 45450 }, { "epoch": 0.29907894736842106, "grad_norm": 4.25, "grad_norm_var": 0.42069905598958335, "learning_rate": 0.0001, "loss": 3.0316, "loss/crossentropy": 1.904208129644394, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.22624684870243073, "loss/reg": 0.0, "step": 45460 }, { "epoch": 0.29914473684210524, "grad_norm": 2.578125, "grad_norm_var": 0.28338216145833334, "learning_rate": 0.0001, "loss": 2.9353, "loss/crossentropy": 2.0894439965486526, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.18237627409398555, "loss/reg": 0.0, "step": 45470 }, { "epoch": 0.2992105263157895, "grad_norm": 3.140625, "grad_norm_var": 0.21353734334309896, "learning_rate": 0.0001, "loss": 2.8302, "loss/crossentropy": 1.9808445692062377, "loss/hidden": 2.5703125, "loss/incoh": 0.0, "loss/logits": 0.16738578900694848, "loss/reg": 0.0, "step": 45480 }, { "epoch": 0.29927631578947367, "grad_norm": 3.0, "grad_norm_var": 0.19410171508789062, "learning_rate": 0.0001, "loss": 2.8752, "loss/crossentropy": 2.1755286514759065, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.22187821492552756, "loss/reg": 0.0, "step": 45490 }, { "epoch": 0.2993421052631579, "grad_norm": 2.734375, "grad_norm_var": 0.044896443684895836, "learning_rate": 0.0001, "loss": 2.8994, "loss/crossentropy": 2.0994277954101563, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.2246171072125435, "loss/reg": 0.0, "step": 45500 }, { "epoch": 0.2994078947368421, "grad_norm": 2.265625, "grad_norm_var": 0.034228515625, "learning_rate": 0.0001, "loss": 2.8323, "loss/crossentropy": 2.5412381410598757, "loss/hidden": 2.58125, "loss/incoh": 0.0, "loss/logits": 0.2155904546380043, "loss/reg": 0.0, "step": 45510 }, { "epoch": 0.29947368421052634, "grad_norm": 2.234375, "grad_norm_var": 0.0323150634765625, "learning_rate": 0.0001, "loss": 2.9104, "loss/crossentropy": 2.2692856311798097, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.19631940573453904, "loss/reg": 0.0, "step": 45520 }, { "epoch": 0.2995394736842105, "grad_norm": 2.265625, "grad_norm_var": 0.061783854166666666, "learning_rate": 0.0001, "loss": 2.9131, "loss/crossentropy": 2.5011532068252564, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.2306092858314514, "loss/reg": 0.0, "step": 45530 }, { "epoch": 0.29960526315789476, "grad_norm": 3.453125, "grad_norm_var": 0.1206939697265625, "learning_rate": 0.0001, "loss": 2.9129, "loss/crossentropy": 2.202244734764099, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.21703190952539445, "loss/reg": 0.0, "step": 45540 }, { "epoch": 0.29967105263157895, "grad_norm": 2.6875, "grad_norm_var": 0.09494527180989583, "learning_rate": 0.0001, "loss": 2.8532, "loss/crossentropy": 2.355343055725098, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.22889090329408646, "loss/reg": 0.0, "step": 45550 }, { "epoch": 0.29973684210526313, "grad_norm": 2.0625, "grad_norm_var": 0.0596343994140625, "learning_rate": 0.0001, "loss": 2.9211, "loss/crossentropy": 2.141188895702362, "loss/hidden": 2.84375, "loss/incoh": 0.0, "loss/logits": 0.23390487730503082, "loss/reg": 0.0, "step": 45560 }, { "epoch": 0.2998026315789474, "grad_norm": 2.484375, "grad_norm_var": 0.07488505045572917, "learning_rate": 0.0001, "loss": 2.8597, "loss/crossentropy": 2.3653998374938965, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.2095509447157383, "loss/reg": 0.0, "step": 45570 }, { "epoch": 0.29986842105263156, "grad_norm": 2.109375, "grad_norm_var": 0.06542561848958334, "learning_rate": 0.0001, "loss": 2.8589, "loss/crossentropy": 2.2982463479042052, "loss/hidden": 2.5140625, "loss/incoh": 0.0, "loss/logits": 0.18520271480083467, "loss/reg": 0.0, "step": 45580 }, { "epoch": 0.2999342105263158, "grad_norm": 2.6875, "grad_norm_var": 0.04472554524739583, "learning_rate": 0.0001, "loss": 2.9092, "loss/crossentropy": 2.070514017343521, "loss/hidden": 2.8234375, "loss/incoh": 0.0, "loss/logits": 0.20155109465122223, "loss/reg": 0.0, "step": 45590 }, { "epoch": 0.3, "grad_norm": 4.46875, "grad_norm_var": 0.27454020182291666, "learning_rate": 0.0001, "loss": 2.8916, "loss/crossentropy": 1.9415469765663147, "loss/hidden": 2.846875, "loss/incoh": 0.0, "loss/logits": 0.1953571006655693, "loss/reg": 0.0, "step": 45600 }, { "epoch": 0.30006578947368423, "grad_norm": 2.578125, "grad_norm_var": 0.2735677083333333, "learning_rate": 0.0001, "loss": 2.8231, "loss/crossentropy": 2.405189561843872, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2097194403409958, "loss/reg": 0.0, "step": 45610 }, { "epoch": 0.3001315789473684, "grad_norm": 2.34375, "grad_norm_var": 0.23993733723958333, "learning_rate": 0.0001, "loss": 2.8848, "loss/crossentropy": 2.254215121269226, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.20808875411748887, "loss/reg": 0.0, "step": 45620 }, { "epoch": 0.30019736842105266, "grad_norm": 2.53125, "grad_norm_var": 0.21018473307291666, "learning_rate": 0.0001, "loss": 2.9013, "loss/crossentropy": 2.253717315196991, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.21508381515741348, "loss/reg": 0.0, "step": 45630 }, { "epoch": 0.30026315789473684, "grad_norm": 2.15625, "grad_norm_var": 0.024702962239583334, "learning_rate": 0.0001, "loss": 2.911, "loss/crossentropy": 1.9200111985206605, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.18654979169368743, "loss/reg": 0.0, "step": 45640 }, { "epoch": 0.30032894736842103, "grad_norm": 2.34375, "grad_norm_var": 0.06530659993489583, "learning_rate": 0.0001, "loss": 2.899, "loss/crossentropy": 2.254462903738022, "loss/hidden": 2.5265625, "loss/incoh": 0.0, "loss/logits": 0.18003339171409607, "loss/reg": 0.0, "step": 45650 }, { "epoch": 0.30039473684210527, "grad_norm": 2.296875, "grad_norm_var": 0.10065104166666666, "learning_rate": 0.0001, "loss": 2.9076, "loss/crossentropy": 2.3432245433330534, "loss/hidden": 2.4953125, "loss/incoh": 0.0, "loss/logits": 0.17783247530460358, "loss/reg": 0.0, "step": 45660 }, { "epoch": 0.30046052631578946, "grad_norm": 2.328125, "grad_norm_var": 0.10013020833333333, "learning_rate": 0.0001, "loss": 2.8922, "loss/crossentropy": 2.0137857019901277, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.19445927888154985, "loss/reg": 0.0, "step": 45670 }, { "epoch": 0.3005263157894737, "grad_norm": 2.53125, "grad_norm_var": 0.5976470947265625, "learning_rate": 0.0001, "loss": 2.9129, "loss/crossentropy": 2.39476215839386, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.20432666391134263, "loss/reg": 0.0, "step": 45680 }, { "epoch": 0.3005921052631579, "grad_norm": 2.40625, "grad_norm_var": 0.03867899576822917, "learning_rate": 0.0001, "loss": 2.928, "loss/crossentropy": 2.0343277215957642, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.19227560311555864, "loss/reg": 0.0, "step": 45690 }, { "epoch": 0.3006578947368421, "grad_norm": 2.03125, "grad_norm_var": 0.0449371337890625, "learning_rate": 0.0001, "loss": 2.9358, "loss/crossentropy": 2.0982259273529054, "loss/hidden": 2.6078125, "loss/incoh": 0.0, "loss/logits": 0.19166601151227952, "loss/reg": 0.0, "step": 45700 }, { "epoch": 0.3007236842105263, "grad_norm": 2.78125, "grad_norm_var": 0.055501302083333336, "learning_rate": 0.0001, "loss": 2.8456, "loss/crossentropy": 2.427810883522034, "loss/hidden": 2.5984375, "loss/incoh": 0.0, "loss/logits": 0.20450241416692733, "loss/reg": 0.0, "step": 45710 }, { "epoch": 0.30078947368421055, "grad_norm": 2.25, "grad_norm_var": 0.05090738932291667, "learning_rate": 0.0001, "loss": 2.8764, "loss/crossentropy": 2.336765134334564, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.23735760748386384, "loss/reg": 0.0, "step": 45720 }, { "epoch": 0.30085526315789474, "grad_norm": 2.5625, "grad_norm_var": 0.1027496337890625, "learning_rate": 0.0001, "loss": 2.898, "loss/crossentropy": 2.366568052768707, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.22972290366888046, "loss/reg": 0.0, "step": 45730 }, { "epoch": 0.3009210526315789, "grad_norm": 2.3125, "grad_norm_var": 0.29002278645833335, "learning_rate": 0.0001, "loss": 2.9157, "loss/crossentropy": 2.1257889211177825, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.2420471802353859, "loss/reg": 0.0, "step": 45740 }, { "epoch": 0.30098684210526316, "grad_norm": 2.15625, "grad_norm_var": 0.19798177083333332, "learning_rate": 0.0001, "loss": 2.8785, "loss/crossentropy": 2.3287550687789915, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.21805956363677978, "loss/reg": 0.0, "step": 45750 }, { "epoch": 0.30105263157894735, "grad_norm": 2.734375, "grad_norm_var": 0.06585184733072917, "learning_rate": 0.0001, "loss": 2.9177, "loss/crossentropy": 2.276636278629303, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.19628972709178924, "loss/reg": 0.0, "step": 45760 }, { "epoch": 0.3011184210526316, "grad_norm": 2.328125, "grad_norm_var": 1.257030232747396, "learning_rate": 0.0001, "loss": 2.9358, "loss/crossentropy": 1.9050792157649994, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2279894657433033, "loss/reg": 0.0, "step": 45770 }, { "epoch": 0.3011842105263158, "grad_norm": 2.265625, "grad_norm_var": 1.8242838541666666, "learning_rate": 0.0001, "loss": 2.9258, "loss/crossentropy": 2.4727717638015747, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.20371496081352233, "loss/reg": 0.0, "step": 45780 }, { "epoch": 0.30125, "grad_norm": 2.5625, "grad_norm_var": 0.032445271809895836, "learning_rate": 0.0001, "loss": 2.9234, "loss/crossentropy": 2.4192578196525574, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.2362167790532112, "loss/reg": 0.0, "step": 45790 }, { "epoch": 0.3013157894736842, "grad_norm": 2.375, "grad_norm_var": 0.0233795166015625, "learning_rate": 0.0001, "loss": 2.8709, "loss/crossentropy": 2.1924211621284484, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.19910378456115724, "loss/reg": 0.0, "step": 45800 }, { "epoch": 0.30138157894736844, "grad_norm": 2.578125, "grad_norm_var": 0.09599202473958333, "learning_rate": 0.0001, "loss": 2.907, "loss/crossentropy": 2.167942667007446, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.23753527849912642, "loss/reg": 0.0, "step": 45810 }, { "epoch": 0.30144736842105263, "grad_norm": 2.40625, "grad_norm_var": 0.053034464518229164, "learning_rate": 0.0001, "loss": 2.8632, "loss/crossentropy": 2.296053612232208, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.19977862909436225, "loss/reg": 0.0, "step": 45820 }, { "epoch": 0.3015131578947368, "grad_norm": 2.375, "grad_norm_var": 0.017088826497395834, "learning_rate": 0.0001, "loss": 2.8862, "loss/crossentropy": 1.9618752896785736, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.20557804256677628, "loss/reg": 0.0, "step": 45830 }, { "epoch": 0.30157894736842106, "grad_norm": 2.78125, "grad_norm_var": 1.2352121988932292, "learning_rate": 0.0001, "loss": 2.8936, "loss/crossentropy": 2.1968328237533568, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.22243955582380295, "loss/reg": 0.0, "step": 45840 }, { "epoch": 0.30164473684210524, "grad_norm": 3.15625, "grad_norm_var": 0.10556538899739583, "learning_rate": 0.0001, "loss": 2.9937, "loss/crossentropy": 2.268926668167114, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.21252032667398452, "loss/reg": 0.0, "step": 45850 }, { "epoch": 0.3017105263157895, "grad_norm": 2.265625, "grad_norm_var": 0.1048736572265625, "learning_rate": 0.0001, "loss": 2.9017, "loss/crossentropy": 2.292650747299194, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.22599299550056456, "loss/reg": 0.0, "step": 45860 }, { "epoch": 0.30177631578947367, "grad_norm": 2.40625, "grad_norm_var": 0.09665425618489583, "learning_rate": 0.0001, "loss": 2.8809, "loss/crossentropy": 2.120444667339325, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.20320819616317748, "loss/reg": 0.0, "step": 45870 }, { "epoch": 0.3018421052631579, "grad_norm": 2.5, "grad_norm_var": 0.13325093587239584, "learning_rate": 0.0001, "loss": 2.8944, "loss/crossentropy": 2.2236520767211916, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.22441551238298416, "loss/reg": 0.0, "step": 45880 }, { "epoch": 0.3019078947368421, "grad_norm": 2.234375, "grad_norm_var": 0.11323140462239584, "learning_rate": 0.0001, "loss": 2.884, "loss/crossentropy": 2.465269994735718, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.22115966230630874, "loss/reg": 0.0, "step": 45890 }, { "epoch": 0.30197368421052634, "grad_norm": 2.125, "grad_norm_var": 0.05012919108072917, "learning_rate": 0.0001, "loss": 2.9405, "loss/crossentropy": 2.2171998232603074, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.22443641871213912, "loss/reg": 0.0, "step": 45900 }, { "epoch": 0.3020394736842105, "grad_norm": 2.59375, "grad_norm_var": 0.17444559733072917, "learning_rate": 0.0001, "loss": 2.8925, "loss/crossentropy": 2.421375799179077, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.2077776938676834, "loss/reg": 0.0, "step": 45910 }, { "epoch": 0.3021052631578947, "grad_norm": 2.828125, "grad_norm_var": 0.1510894775390625, "learning_rate": 0.0001, "loss": 2.9434, "loss/crossentropy": 2.1692538976669313, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.1836540386080742, "loss/reg": 0.0, "step": 45920 }, { "epoch": 0.30217105263157895, "grad_norm": 2.5, "grad_norm_var": 0.06027018229166667, "learning_rate": 0.0001, "loss": 2.9173, "loss/crossentropy": 2.070696848630905, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.19826132282614709, "loss/reg": 0.0, "step": 45930 }, { "epoch": 0.30223684210526314, "grad_norm": 2.421875, "grad_norm_var": 0.0334136962890625, "learning_rate": 0.0001, "loss": 2.8512, "loss/crossentropy": 2.5418182611465454, "loss/hidden": 2.5359375, "loss/incoh": 0.0, "loss/logits": 0.20328862816095353, "loss/reg": 0.0, "step": 45940 }, { "epoch": 0.3023026315789474, "grad_norm": 2.296875, "grad_norm_var": 0.02623291015625, "learning_rate": 0.0001, "loss": 2.8298, "loss/crossentropy": 2.361379289627075, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.20945519506931304, "loss/reg": 0.0, "step": 45950 }, { "epoch": 0.30236842105263156, "grad_norm": 3.09375, "grad_norm_var": 0.06640218098958334, "learning_rate": 0.0001, "loss": 2.8685, "loss/crossentropy": 1.9659605145454406, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.18451029285788537, "loss/reg": 0.0, "step": 45960 }, { "epoch": 0.3024342105263158, "grad_norm": 2.140625, "grad_norm_var": 0.05813395182291667, "learning_rate": 0.0001, "loss": 2.9249, "loss/crossentropy": 2.5293596982955933, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.22592951804399491, "loss/reg": 0.0, "step": 45970 }, { "epoch": 0.3025, "grad_norm": 2.484375, "grad_norm_var": 0.0653472900390625, "learning_rate": 0.0001, "loss": 2.9502, "loss/crossentropy": 2.3019928216934202, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.2079014778137207, "loss/reg": 0.0, "step": 45980 }, { "epoch": 0.30256578947368423, "grad_norm": 2.421875, "grad_norm_var": 0.06419169108072917, "learning_rate": 0.0001, "loss": 2.9539, "loss/crossentropy": 2.265769374370575, "loss/hidden": 2.546875, "loss/incoh": 0.0, "loss/logits": 0.18871892094612122, "loss/reg": 0.0, "step": 45990 }, { "epoch": 0.3026315789473684, "grad_norm": 5.78125, "grad_norm_var": 0.8137685139973958, "learning_rate": 0.0001, "loss": 2.8726, "loss/crossentropy": 2.310705029964447, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.19850367158651352, "loss/reg": 0.0, "step": 46000 }, { "epoch": 0.3026973684210526, "grad_norm": 2.578125, "grad_norm_var": 0.7478261311848958, "learning_rate": 0.0001, "loss": 2.933, "loss/crossentropy": 2.3275864243507387, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.22508644461631774, "loss/reg": 0.0, "step": 46010 }, { "epoch": 0.30276315789473685, "grad_norm": 2.96875, "grad_norm_var": 0.165625, "learning_rate": 0.0001, "loss": 2.9495, "loss/crossentropy": 2.1588327407836916, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.2147763192653656, "loss/reg": 0.0, "step": 46020 }, { "epoch": 0.30282894736842103, "grad_norm": 2.328125, "grad_norm_var": 0.14313151041666666, "learning_rate": 0.0001, "loss": 2.9383, "loss/crossentropy": 2.143091416358948, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.21448529958724977, "loss/reg": 0.0, "step": 46030 }, { "epoch": 0.30289473684210527, "grad_norm": 2.3125, "grad_norm_var": 0.05299072265625, "learning_rate": 0.0001, "loss": 2.8457, "loss/crossentropy": 2.0303418397903443, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.18702931851148605, "loss/reg": 0.0, "step": 46040 }, { "epoch": 0.30296052631578946, "grad_norm": 2.609375, "grad_norm_var": 0.053343709309895834, "learning_rate": 0.0001, "loss": 2.9373, "loss/crossentropy": 2.1964720129966735, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.20750815272331238, "loss/reg": 0.0, "step": 46050 }, { "epoch": 0.3030263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.058740234375, "learning_rate": 0.0001, "loss": 2.9041, "loss/crossentropy": 2.2683252096176147, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.20643253922462462, "loss/reg": 0.0, "step": 46060 }, { "epoch": 0.3030921052631579, "grad_norm": 2.5625, "grad_norm_var": 0.10847066243489584, "learning_rate": 0.0001, "loss": 2.9918, "loss/crossentropy": 2.301306390762329, "loss/hidden": 2.5546875, "loss/incoh": 0.0, "loss/logits": 0.1995823234319687, "loss/reg": 0.0, "step": 46070 }, { "epoch": 0.3031578947368421, "grad_norm": 2.140625, "grad_norm_var": 0.12829488118489582, "learning_rate": 0.0001, "loss": 2.8829, "loss/crossentropy": 2.2798464715480806, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.202628430724144, "loss/reg": 0.0, "step": 46080 }, { "epoch": 0.3032236842105263, "grad_norm": 2.53125, "grad_norm_var": 0.06545817057291667, "learning_rate": 0.0001, "loss": 2.9771, "loss/crossentropy": 2.10953232049942, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.21056633889675141, "loss/reg": 0.0, "step": 46090 }, { "epoch": 0.30328947368421055, "grad_norm": 3.8125, "grad_norm_var": 0.16931050618489582, "learning_rate": 0.0001, "loss": 2.9212, "loss/crossentropy": 2.2346729397773744, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.195981153100729, "loss/reg": 0.0, "step": 46100 }, { "epoch": 0.30335526315789474, "grad_norm": 2.84375, "grad_norm_var": 0.2163726806640625, "learning_rate": 0.0001, "loss": 3.0393, "loss/crossentropy": 2.5306612491607665, "loss/hidden": 2.590625, "loss/incoh": 0.0, "loss/logits": 0.2133290022611618, "loss/reg": 0.0, "step": 46110 }, { "epoch": 0.3034210526315789, "grad_norm": 2.3125, "grad_norm_var": 0.05232645670572917, "learning_rate": 0.0001, "loss": 2.964, "loss/crossentropy": 2.102805256843567, "loss/hidden": 2.8546875, "loss/incoh": 0.0, "loss/logits": 0.22952064722776414, "loss/reg": 0.0, "step": 46120 }, { "epoch": 0.30348684210526317, "grad_norm": 2.359375, "grad_norm_var": 0.2646230061848958, "learning_rate": 0.0001, "loss": 2.9978, "loss/crossentropy": 2.2487234115600585, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.21154432967305184, "loss/reg": 0.0, "step": 46130 }, { "epoch": 0.30355263157894735, "grad_norm": 2.421875, "grad_norm_var": 0.29417317708333335, "learning_rate": 0.0001, "loss": 2.8446, "loss/crossentropy": 2.474739682674408, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.20553253591060638, "loss/reg": 0.0, "step": 46140 }, { "epoch": 0.3036184210526316, "grad_norm": 2.34375, "grad_norm_var": 0.0238433837890625, "learning_rate": 0.0001, "loss": 2.9041, "loss/crossentropy": 2.2835312485694885, "loss/hidden": 2.5703125, "loss/incoh": 0.0, "loss/logits": 0.19158032536506653, "loss/reg": 0.0, "step": 46150 }, { "epoch": 0.3036842105263158, "grad_norm": 2.671875, "grad_norm_var": 0.028271484375, "learning_rate": 0.0001, "loss": 2.8748, "loss/crossentropy": 2.2986427545547485, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.20903730392456055, "loss/reg": 0.0, "step": 46160 }, { "epoch": 0.30375, "grad_norm": 2.28125, "grad_norm_var": 0.08157145182291667, "learning_rate": 0.0001, "loss": 2.8682, "loss/crossentropy": 2.431505000591278, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.23315258026123048, "loss/reg": 0.0, "step": 46170 }, { "epoch": 0.3038157894736842, "grad_norm": 2.796875, "grad_norm_var": 0.11365559895833334, "learning_rate": 0.0001, "loss": 2.8421, "loss/crossentropy": 2.3595207810401915, "loss/hidden": 2.55, "loss/incoh": 0.0, "loss/logits": 0.1808723673224449, "loss/reg": 0.0, "step": 46180 }, { "epoch": 0.30388157894736845, "grad_norm": 2.4375, "grad_norm_var": 0.05005594889322917, "learning_rate": 0.0001, "loss": 2.8879, "loss/crossentropy": 2.4395034074783326, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.20314342081546782, "loss/reg": 0.0, "step": 46190 }, { "epoch": 0.30394736842105263, "grad_norm": 2.15625, "grad_norm_var": 0.056441243489583334, "learning_rate": 0.0001, "loss": 2.8835, "loss/crossentropy": 2.09265775680542, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.23340400606393813, "loss/reg": 0.0, "step": 46200 }, { "epoch": 0.3040131578947368, "grad_norm": 2.234375, "grad_norm_var": 0.0838287353515625, "learning_rate": 0.0001, "loss": 2.8403, "loss/crossentropy": 2.4368330359458925, "loss/hidden": 2.5296875, "loss/incoh": 0.0, "loss/logits": 0.19347180128097535, "loss/reg": 0.0, "step": 46210 }, { "epoch": 0.30407894736842106, "grad_norm": 2.515625, "grad_norm_var": 0.06177469889322917, "learning_rate": 0.0001, "loss": 2.8812, "loss/crossentropy": 2.3351404428482057, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.19155988991260528, "loss/reg": 0.0, "step": 46220 }, { "epoch": 0.30414473684210525, "grad_norm": 2.546875, "grad_norm_var": 0.029059855143229167, "learning_rate": 0.0001, "loss": 2.9516, "loss/crossentropy": 2.325025570392609, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.18175926730036734, "loss/reg": 0.0, "step": 46230 }, { "epoch": 0.3042105263157895, "grad_norm": 2.359375, "grad_norm_var": 0.028694661458333333, "learning_rate": 0.0001, "loss": 2.9233, "loss/crossentropy": 2.2433771550655366, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.20588717013597488, "loss/reg": 0.0, "step": 46240 }, { "epoch": 0.3042763157894737, "grad_norm": 2.234375, "grad_norm_var": 2.032996994316643e+17, "learning_rate": 0.0001, "loss": 2.9668, "loss/crossentropy": 2.2682536482810973, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.21041297167539597, "loss/reg": 0.0, "step": 46250 }, { "epoch": 0.3043421052631579, "grad_norm": 2.359375, "grad_norm_var": 2.0329969939972643e+17, "learning_rate": 0.0001, "loss": 2.9263, "loss/crossentropy": 2.2238661527633665, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.2151746019721031, "loss/reg": 0.0, "step": 46260 }, { "epoch": 0.3044078947368421, "grad_norm": 2.21875, "grad_norm_var": 0.021968587239583334, "learning_rate": 0.0001, "loss": 2.8702, "loss/crossentropy": 2.4624280095100404, "loss/hidden": 2.55, "loss/incoh": 0.0, "loss/logits": 0.20537107512354852, "loss/reg": 0.0, "step": 46270 }, { "epoch": 0.30447368421052634, "grad_norm": 2.234375, "grad_norm_var": 0.02154541015625, "learning_rate": 0.0001, "loss": 2.9141, "loss/crossentropy": 1.9576608955860137, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.2020930230617523, "loss/reg": 0.0, "step": 46280 }, { "epoch": 0.3045394736842105, "grad_norm": 2.5, "grad_norm_var": 0.027311197916666665, "learning_rate": 0.0001, "loss": 2.9381, "loss/crossentropy": 2.382508409023285, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.2037816397845745, "loss/reg": 0.0, "step": 46290 }, { "epoch": 0.3046052631578947, "grad_norm": 2.296875, "grad_norm_var": 1.9411092122395834, "learning_rate": 0.0001, "loss": 2.9127, "loss/crossentropy": 2.3719544887542723, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.21640437170863153, "loss/reg": 0.0, "step": 46300 }, { "epoch": 0.30467105263157895, "grad_norm": 3.25, "grad_norm_var": 0.05152079264322917, "learning_rate": 0.0001, "loss": 2.937, "loss/crossentropy": 2.3593274116516114, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.22033072710037233, "loss/reg": 0.0, "step": 46310 }, { "epoch": 0.30473684210526314, "grad_norm": 2.484375, "grad_norm_var": 0.08999735514322917, "learning_rate": 0.0001, "loss": 2.8673, "loss/crossentropy": 2.343029201030731, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.22900201976299286, "loss/reg": 0.0, "step": 46320 }, { "epoch": 0.3048026315789474, "grad_norm": 2.890625, "grad_norm_var": 0.16262105305989583, "learning_rate": 0.0001, "loss": 2.9506, "loss/crossentropy": 2.3008418917655944, "loss/hidden": 2.9140625, "loss/incoh": 0.0, "loss/logits": 0.22308911085128785, "loss/reg": 0.0, "step": 46330 }, { "epoch": 0.30486842105263157, "grad_norm": 2.234375, "grad_norm_var": 0.051558430989583334, "learning_rate": 0.0001, "loss": 2.8807, "loss/crossentropy": 2.4655162572860716, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.2183946594595909, "loss/reg": 0.0, "step": 46340 }, { "epoch": 0.3049342105263158, "grad_norm": 2.328125, "grad_norm_var": 0.04927469889322917, "learning_rate": 0.0001, "loss": 2.8953, "loss/crossentropy": 2.1735773921012878, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.20741543918848038, "loss/reg": 0.0, "step": 46350 }, { "epoch": 0.305, "grad_norm": 2.296875, "grad_norm_var": 0.05756734212239583, "learning_rate": 0.0001, "loss": 2.8498, "loss/crossentropy": 2.2832759380340577, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.227975295484066, "loss/reg": 0.0, "step": 46360 }, { "epoch": 0.30506578947368423, "grad_norm": 2.703125, "grad_norm_var": 0.06142476399739583, "learning_rate": 0.0001, "loss": 2.9905, "loss/crossentropy": 2.429787003993988, "loss/hidden": 2.8765625, "loss/incoh": 0.0, "loss/logits": 0.2667254194617271, "loss/reg": 0.0, "step": 46370 }, { "epoch": 0.3051315789473684, "grad_norm": 2.421875, "grad_norm_var": 0.13557840983072916, "learning_rate": 0.0001, "loss": 2.9605, "loss/crossentropy": 2.2765231370925902, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.22259269952774047, "loss/reg": 0.0, "step": 46380 }, { "epoch": 0.3051973684210526, "grad_norm": 4.6875, "grad_norm_var": 0.38179931640625, "learning_rate": 0.0001, "loss": 3.0318, "loss/crossentropy": 2.2750582814216616, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.2187570735812187, "loss/reg": 0.0, "step": 46390 }, { "epoch": 0.30526315789473685, "grad_norm": 2.484375, "grad_norm_var": 0.383984375, "learning_rate": 0.0001, "loss": 2.9312, "loss/crossentropy": 2.2600544333457946, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.21048903465270996, "loss/reg": 0.0, "step": 46400 }, { "epoch": 0.30532894736842103, "grad_norm": 2.65625, "grad_norm_var": 0.44326171875, "learning_rate": 0.0001, "loss": 2.8769, "loss/crossentropy": 2.0841195225715636, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.20692167431116104, "loss/reg": 0.0, "step": 46410 }, { "epoch": 0.3053947368421053, "grad_norm": 2.546875, "grad_norm_var": 0.4472564697265625, "learning_rate": 0.0001, "loss": 2.8928, "loss/crossentropy": 2.335439169406891, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.24331560879945754, "loss/reg": 0.0, "step": 46420 }, { "epoch": 0.30546052631578946, "grad_norm": 2.5625, "grad_norm_var": 0.0326812744140625, "learning_rate": 0.0001, "loss": 2.8711, "loss/crossentropy": 2.480947661399841, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.22052866369485855, "loss/reg": 0.0, "step": 46430 }, { "epoch": 0.3055263157894737, "grad_norm": 2.265625, "grad_norm_var": 0.5632232666015625, "learning_rate": 0.0001, "loss": 2.9803, "loss/crossentropy": 2.35667564868927, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.2380623623728752, "loss/reg": 0.0, "step": 46440 }, { "epoch": 0.3055921052631579, "grad_norm": 2.46875, "grad_norm_var": 0.52841796875, "learning_rate": 0.0001, "loss": 2.9791, "loss/crossentropy": 2.4100435614585876, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.23432549238204955, "loss/reg": 0.0, "step": 46450 }, { "epoch": 0.30565789473684213, "grad_norm": 2.515625, "grad_norm_var": 0.0650054931640625, "learning_rate": 0.0001, "loss": 2.8776, "loss/crossentropy": 2.0402866125106813, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.2100946880877018, "loss/reg": 0.0, "step": 46460 }, { "epoch": 0.3057236842105263, "grad_norm": 3.140625, "grad_norm_var": 0.09850260416666666, "learning_rate": 0.0001, "loss": 2.9335, "loss/crossentropy": 2.3046294331550596, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.2110200807452202, "loss/reg": 0.0, "step": 46470 }, { "epoch": 0.3057894736842105, "grad_norm": 2.484375, "grad_norm_var": 0.1031402587890625, "learning_rate": 0.0001, "loss": 2.8657, "loss/crossentropy": 2.0314175844192506, "loss/hidden": 2.5390625, "loss/incoh": 0.0, "loss/logits": 0.17318851873278618, "loss/reg": 0.0, "step": 46480 }, { "epoch": 0.30585526315789474, "grad_norm": 2.390625, "grad_norm_var": 0.10927327473958333, "learning_rate": 0.0001, "loss": 2.9307, "loss/crossentropy": 2.458233743906021, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.24307574182748795, "loss/reg": 0.0, "step": 46490 }, { "epoch": 0.3059210526315789, "grad_norm": 2.453125, "grad_norm_var": 0.05523681640625, "learning_rate": 0.0001, "loss": 2.8663, "loss/crossentropy": 2.324156606197357, "loss/hidden": 2.5359375, "loss/incoh": 0.0, "loss/logits": 0.1886878177523613, "loss/reg": 0.0, "step": 46500 }, { "epoch": 0.30598684210526317, "grad_norm": 2.53125, "grad_norm_var": 0.08167215983072916, "learning_rate": 0.0001, "loss": 2.95, "loss/crossentropy": 2.0832616806030275, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.19053458645939828, "loss/reg": 0.0, "step": 46510 }, { "epoch": 0.30605263157894735, "grad_norm": 2.484375, "grad_norm_var": 0.0176666259765625, "learning_rate": 0.0001, "loss": 2.8792, "loss/crossentropy": 2.2077152729034424, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.22565555274486543, "loss/reg": 0.0, "step": 46520 }, { "epoch": 0.3061184210526316, "grad_norm": 2.265625, "grad_norm_var": 0.038981119791666664, "learning_rate": 0.0001, "loss": 2.8707, "loss/crossentropy": 2.272774338722229, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.2208704888820648, "loss/reg": 0.0, "step": 46530 }, { "epoch": 0.3061842105263158, "grad_norm": 2.03125, "grad_norm_var": 0.14322509765625, "learning_rate": 0.0001, "loss": 2.9102, "loss/crossentropy": 2.361661732196808, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.20811166018247604, "loss/reg": 0.0, "step": 46540 }, { "epoch": 0.30625, "grad_norm": 2.328125, "grad_norm_var": 0.04151102701822917, "learning_rate": 0.0001, "loss": 2.9011, "loss/crossentropy": 2.226359796524048, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.19523840695619582, "loss/reg": 0.0, "step": 46550 }, { "epoch": 0.3063157894736842, "grad_norm": 2.3125, "grad_norm_var": 0.08003641764322916, "learning_rate": 0.0001, "loss": 2.8885, "loss/crossentropy": 2.249724733829498, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.22478358298540116, "loss/reg": 0.0, "step": 46560 }, { "epoch": 0.3063815789473684, "grad_norm": 2.4375, "grad_norm_var": 0.06914774576822917, "learning_rate": 0.0001, "loss": 2.884, "loss/crossentropy": 2.3421544313430784, "loss/hidden": 2.5734375, "loss/incoh": 0.0, "loss/logits": 0.18777238726615905, "loss/reg": 0.0, "step": 46570 }, { "epoch": 0.30644736842105263, "grad_norm": 2.640625, "grad_norm_var": 0.024242146809895834, "learning_rate": 0.0001, "loss": 2.8934, "loss/crossentropy": 2.2724972873926164, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.21739839240908623, "loss/reg": 0.0, "step": 46580 }, { "epoch": 0.3065131578947368, "grad_norm": 2.4375, "grad_norm_var": 0.029573567708333335, "learning_rate": 0.0001, "loss": 2.9594, "loss/crossentropy": 2.350151038169861, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.2118473842740059, "loss/reg": 0.0, "step": 46590 }, { "epoch": 0.30657894736842106, "grad_norm": 2.46875, "grad_norm_var": 0.32193603515625, "learning_rate": 0.0001, "loss": 2.9174, "loss/crossentropy": 2.126652777194977, "loss/hidden": 2.94375, "loss/incoh": 0.0, "loss/logits": 0.24047405421733856, "loss/reg": 0.0, "step": 46600 }, { "epoch": 0.30664473684210525, "grad_norm": 2.421875, "grad_norm_var": 0.0703765869140625, "learning_rate": 0.0001, "loss": 2.8802, "loss/crossentropy": 2.180238723754883, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.2222717933356762, "loss/reg": 0.0, "step": 46610 }, { "epoch": 0.3067105263157895, "grad_norm": 2.28125, "grad_norm_var": 1.8672159830729167, "learning_rate": 0.0001, "loss": 2.9666, "loss/crossentropy": 2.2806544423103334, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.18192966133356095, "loss/reg": 0.0, "step": 46620 }, { "epoch": 0.3067763157894737, "grad_norm": 2.84375, "grad_norm_var": 0.03736572265625, "learning_rate": 0.0001, "loss": 2.9222, "loss/crossentropy": 2.3014461755752564, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.20887460857629775, "loss/reg": 0.0, "step": 46630 }, { "epoch": 0.3068421052631579, "grad_norm": 2.78125, "grad_norm_var": 0.09582926432291666, "learning_rate": 0.0001, "loss": 2.8794, "loss/crossentropy": 2.2777719259262086, "loss/hidden": 2.590625, "loss/incoh": 0.0, "loss/logits": 0.18657590746879577, "loss/reg": 0.0, "step": 46640 }, { "epoch": 0.3069078947368421, "grad_norm": 2.234375, "grad_norm_var": 0.074365234375, "learning_rate": 0.0001, "loss": 2.917, "loss/crossentropy": 2.3803712129592896, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.23606608361005782, "loss/reg": 0.0, "step": 46650 }, { "epoch": 0.30697368421052634, "grad_norm": 2.421875, "grad_norm_var": 0.04049072265625, "learning_rate": 0.0001, "loss": 2.9604, "loss/crossentropy": 2.2255991458892823, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.24595575854182244, "loss/reg": 0.0, "step": 46660 }, { "epoch": 0.30703947368421053, "grad_norm": 2.421875, "grad_norm_var": 0.0411041259765625, "learning_rate": 0.0001, "loss": 2.9216, "loss/crossentropy": 2.4009926080703736, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.23557150810956956, "loss/reg": 0.0, "step": 46670 }, { "epoch": 0.3071052631578947, "grad_norm": 2.765625, "grad_norm_var": 0.04820556640625, "learning_rate": 0.0001, "loss": 2.9309, "loss/crossentropy": 2.275268578529358, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.24864207059144974, "loss/reg": 0.0, "step": 46680 }, { "epoch": 0.30717105263157896, "grad_norm": 2.234375, "grad_norm_var": 0.050291951497395834, "learning_rate": 0.0001, "loss": 2.8874, "loss/crossentropy": 2.337228834629059, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2160692498087883, "loss/reg": 0.0, "step": 46690 }, { "epoch": 0.30723684210526314, "grad_norm": 2.59375, "grad_norm_var": 0.05328369140625, "learning_rate": 0.0001, "loss": 2.9293, "loss/crossentropy": 2.315832555294037, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.21215708255767823, "loss/reg": 0.0, "step": 46700 }, { "epoch": 0.3073026315789474, "grad_norm": 2.921875, "grad_norm_var": 0.21256103515625, "learning_rate": 0.0001, "loss": 3.0436, "loss/crossentropy": 2.4910483956336975, "loss/hidden": 3.1609375, "loss/incoh": 0.0, "loss/logits": 0.27946537286043166, "loss/reg": 0.0, "step": 46710 }, { "epoch": 0.30736842105263157, "grad_norm": 3.046875, "grad_norm_var": 0.21310221354166667, "learning_rate": 0.0001, "loss": 2.9395, "loss/crossentropy": 2.2839889526367188, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.24501137137413026, "loss/reg": 0.0, "step": 46720 }, { "epoch": 0.3074342105263158, "grad_norm": 2.453125, "grad_norm_var": 0.19905497233072916, "learning_rate": 0.0001, "loss": 2.9202, "loss/crossentropy": 1.9390459656715393, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.1986946605145931, "loss/reg": 0.0, "step": 46730 }, { "epoch": 0.3075, "grad_norm": 2.21875, "grad_norm_var": 0.24219462076822917, "learning_rate": 0.0001, "loss": 2.952, "loss/crossentropy": 2.19783194065094, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.2188527226448059, "loss/reg": 0.0, "step": 46740 }, { "epoch": 0.30756578947368424, "grad_norm": 2.21875, "grad_norm_var": 0.1031158447265625, "learning_rate": 0.0001, "loss": 2.8519, "loss/crossentropy": 2.189402091503143, "loss/hidden": 2.5125, "loss/incoh": 0.0, "loss/logits": 0.1844658687710762, "loss/reg": 0.0, "step": 46750 }, { "epoch": 0.3076315789473684, "grad_norm": 2.65625, "grad_norm_var": 2.0901276180532646e+17, "learning_rate": 0.0001, "loss": 3.0496, "loss/crossentropy": 2.1885711789131164, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.258860644698143, "loss/reg": 0.0, "step": 46760 }, { "epoch": 0.3076973684210526, "grad_norm": 2.765625, "grad_norm_var": 2.090127618008023e+17, "learning_rate": 0.0001, "loss": 2.8674, "loss/crossentropy": 2.0935221791267393, "loss/hidden": 2.553125, "loss/incoh": 0.0, "loss/logits": 0.1952270820736885, "loss/reg": 0.0, "step": 46770 }, { "epoch": 0.30776315789473685, "grad_norm": 2.28125, "grad_norm_var": 2.97307943494899e+17, "learning_rate": 0.0001, "loss": 3.0198, "loss/crossentropy": 2.5749504923820496, "loss/hidden": 2.571875, "loss/incoh": 0.0, "loss/logits": 0.20140731632709502, "loss/reg": 0.0, "step": 46780 }, { "epoch": 0.30782894736842104, "grad_norm": 2.484375, "grad_norm_var": 0.06008707682291667, "learning_rate": 0.0001, "loss": 2.8862, "loss/crossentropy": 2.4690249443054197, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.2241351678967476, "loss/reg": 0.0, "step": 46790 }, { "epoch": 0.3078947368421053, "grad_norm": 2.203125, "grad_norm_var": 0.03867899576822917, "learning_rate": 0.0001, "loss": 2.8681, "loss/crossentropy": 2.3744490504264832, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.20469798296689987, "loss/reg": 0.0, "step": 46800 }, { "epoch": 0.30796052631578946, "grad_norm": 2.484375, "grad_norm_var": 0.04309488932291667, "learning_rate": 0.0001, "loss": 2.9025, "loss/crossentropy": 2.3218981444835665, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.22615415453910828, "loss/reg": 0.0, "step": 46810 }, { "epoch": 0.3080263157894737, "grad_norm": 2.296875, "grad_norm_var": 0.045914713541666666, "learning_rate": 0.0001, "loss": 2.9282, "loss/crossentropy": 2.3578683853149416, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.2729095622897148, "loss/reg": 0.0, "step": 46820 }, { "epoch": 0.3080921052631579, "grad_norm": 3.953125, "grad_norm_var": 0.16900634765625, "learning_rate": 0.0001, "loss": 2.9316, "loss/crossentropy": 2.032913315296173, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.1857515200972557, "loss/reg": 0.0, "step": 46830 }, { "epoch": 0.30815789473684213, "grad_norm": 2.265625, "grad_norm_var": 0.21873372395833332, "learning_rate": 0.0001, "loss": 2.9062, "loss/crossentropy": 2.2378383755683897, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.18230140060186387, "loss/reg": 0.0, "step": 46840 }, { "epoch": 0.3082236842105263, "grad_norm": 2.5625, "grad_norm_var": 0.17040913899739582, "learning_rate": 0.0001, "loss": 2.8681, "loss/crossentropy": 2.1580735087394713, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.18702870905399321, "loss/reg": 0.0, "step": 46850 }, { "epoch": 0.3082894736842105, "grad_norm": 2.765625, "grad_norm_var": 0.14306640625, "learning_rate": 0.0001, "loss": 2.9176, "loss/crossentropy": 2.3329507946968078, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.2229396864771843, "loss/reg": 0.0, "step": 46860 }, { "epoch": 0.30835526315789474, "grad_norm": 2.4375, "grad_norm_var": 0.24783426920572918, "learning_rate": 0.0001, "loss": 3.0018, "loss/crossentropy": 2.3390440702438355, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.2002165734767914, "loss/reg": 0.0, "step": 46870 }, { "epoch": 0.30842105263157893, "grad_norm": 2.171875, "grad_norm_var": 0.2434722900390625, "learning_rate": 0.0001, "loss": 2.8389, "loss/crossentropy": 2.161488401889801, "loss/hidden": 2.571875, "loss/incoh": 0.0, "loss/logits": 0.20065706968307495, "loss/reg": 0.0, "step": 46880 }, { "epoch": 0.30848684210526317, "grad_norm": 2.140625, "grad_norm_var": 0.03192952473958333, "learning_rate": 0.0001, "loss": 2.8702, "loss/crossentropy": 2.403856432437897, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.224203859269619, "loss/reg": 0.0, "step": 46890 }, { "epoch": 0.30855263157894736, "grad_norm": 2.234375, "grad_norm_var": 0.30696207682291665, "learning_rate": 0.0001, "loss": 2.9276, "loss/crossentropy": 2.3433762073516844, "loss/hidden": 2.4953125, "loss/incoh": 0.0, "loss/logits": 0.19603877663612365, "loss/reg": 0.0, "step": 46900 }, { "epoch": 0.3086184210526316, "grad_norm": 2.21875, "grad_norm_var": 0.07763671875, "learning_rate": 0.0001, "loss": 2.9202, "loss/crossentropy": 2.3531020402908327, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.20727200508117677, "loss/reg": 0.0, "step": 46910 }, { "epoch": 0.3086842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.08056640625, "learning_rate": 0.0001, "loss": 2.882, "loss/crossentropy": 2.3594457119703294, "loss/hidden": 2.521875, "loss/incoh": 0.0, "loss/logits": 0.1882092721760273, "loss/reg": 0.0, "step": 46920 }, { "epoch": 0.30875, "grad_norm": 2.234375, "grad_norm_var": 0.04384765625, "learning_rate": 0.0001, "loss": 2.9607, "loss/crossentropy": 2.3665390610694885, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.21455881446599961, "loss/reg": 0.0, "step": 46930 }, { "epoch": 0.3088157894736842, "grad_norm": 2.28125, "grad_norm_var": 0.0638580322265625, "learning_rate": 0.0001, "loss": 2.9178, "loss/crossentropy": 2.4031014323234556, "loss/hidden": 2.521875, "loss/incoh": 0.0, "loss/logits": 0.20351839065551758, "loss/reg": 0.0, "step": 46940 }, { "epoch": 0.3088815789473684, "grad_norm": 2.625, "grad_norm_var": 0.0325347900390625, "learning_rate": 0.0001, "loss": 2.9461, "loss/crossentropy": 2.573493945598602, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.2336792156100273, "loss/reg": 0.0, "step": 46950 }, { "epoch": 0.30894736842105264, "grad_norm": 2.34375, "grad_norm_var": 0.052098592122395836, "learning_rate": 0.0001, "loss": 3.002, "loss/crossentropy": 2.095530104637146, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.20165919363498688, "loss/reg": 0.0, "step": 46960 }, { "epoch": 0.3090131578947368, "grad_norm": 2.65625, "grad_norm_var": 0.050455729166666664, "learning_rate": 0.0001, "loss": 3.0036, "loss/crossentropy": 2.3939878225326536, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.23971444219350815, "loss/reg": 0.0, "step": 46970 }, { "epoch": 0.30907894736842106, "grad_norm": 3.0625, "grad_norm_var": 0.96357421875, "learning_rate": 0.0001, "loss": 2.9578, "loss/crossentropy": 2.4210414409637453, "loss/hidden": 2.5890625, "loss/incoh": 0.0, "loss/logits": 0.19955383241176605, "loss/reg": 0.0, "step": 46980 }, { "epoch": 0.30914473684210525, "grad_norm": 2.609375, "grad_norm_var": 0.08678385416666666, "learning_rate": 0.0001, "loss": 2.9214, "loss/crossentropy": 2.177080976963043, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.1978432796895504, "loss/reg": 0.0, "step": 46990 }, { "epoch": 0.3092105263157895, "grad_norm": 2.328125, "grad_norm_var": 0.13810933430989583, "learning_rate": 0.0001, "loss": 2.8992, "loss/crossentropy": 2.193236434459686, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2046295240521431, "loss/reg": 0.0, "step": 47000 }, { "epoch": 0.3092763157894737, "grad_norm": 2.359375, "grad_norm_var": 0.12996317545572916, "learning_rate": 0.0001, "loss": 2.9634, "loss/crossentropy": 2.321349394321442, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.23815212845802308, "loss/reg": 0.0, "step": 47010 }, { "epoch": 0.3093421052631579, "grad_norm": 2.984375, "grad_norm_var": 0.0554840087890625, "learning_rate": 0.0001, "loss": 2.9045, "loss/crossentropy": 2.3896005034446715, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.2383191093802452, "loss/reg": 0.0, "step": 47020 }, { "epoch": 0.3094078947368421, "grad_norm": 2.375, "grad_norm_var": 0.2749908447265625, "learning_rate": 0.0001, "loss": 2.9892, "loss/crossentropy": 2.381329596042633, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.23298230171203613, "loss/reg": 0.0, "step": 47030 }, { "epoch": 0.3094736842105263, "grad_norm": 2.328125, "grad_norm_var": 0.29605712890625, "learning_rate": 0.0001, "loss": 2.88, "loss/crossentropy": 2.4349808216094972, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.21665673702955246, "loss/reg": 0.0, "step": 47040 }, { "epoch": 0.30953947368421053, "grad_norm": 2.546875, "grad_norm_var": 0.021613566080729167, "learning_rate": 0.0001, "loss": 2.8959, "loss/crossentropy": 2.0010550260543822, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.239803434163332, "loss/reg": 0.0, "step": 47050 }, { "epoch": 0.3096052631578947, "grad_norm": 2.3125, "grad_norm_var": 0.05473531087239583, "learning_rate": 0.0001, "loss": 2.9327, "loss/crossentropy": 2.388463830947876, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.264525243639946, "loss/reg": 0.0, "step": 47060 }, { "epoch": 0.30967105263157896, "grad_norm": 2.75, "grad_norm_var": 0.052506510416666666, "learning_rate": 0.0001, "loss": 2.9742, "loss/crossentropy": 2.016686964035034, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.22417988777160644, "loss/reg": 0.0, "step": 47070 }, { "epoch": 0.30973684210526314, "grad_norm": 2.234375, "grad_norm_var": 0.07810872395833333, "learning_rate": 0.0001, "loss": 2.8403, "loss/crossentropy": 2.3212387442588804, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.20094894617795944, "loss/reg": 0.0, "step": 47080 }, { "epoch": 0.3098026315789474, "grad_norm": 2.421875, "grad_norm_var": 1.3462066650390625, "learning_rate": 0.0001, "loss": 3.0032, "loss/crossentropy": 2.5010061025619508, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.24285152107477187, "loss/reg": 0.0, "step": 47090 }, { "epoch": 0.30986842105263157, "grad_norm": 2.640625, "grad_norm_var": 1.3594228108723958, "learning_rate": 0.0001, "loss": 2.8946, "loss/crossentropy": 2.1737305998802183, "loss/hidden": 2.625, "loss/incoh": 0.0, "loss/logits": 0.1882098987698555, "loss/reg": 0.0, "step": 47100 }, { "epoch": 0.3099342105263158, "grad_norm": 2.5625, "grad_norm_var": 0.05084635416666667, "learning_rate": 0.0001, "loss": 2.9422, "loss/crossentropy": 2.4177565038204194, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.2029744416475296, "loss/reg": 0.0, "step": 47110 }, { "epoch": 0.31, "grad_norm": 2.390625, "grad_norm_var": 0.04309488932291667, "learning_rate": 0.0001, "loss": 2.8972, "loss/crossentropy": 2.3867032647132875, "loss/hidden": 2.8796875, "loss/incoh": 0.0, "loss/logits": 0.26529945582151415, "loss/reg": 0.0, "step": 47120 }, { "epoch": 0.3100657894736842, "grad_norm": 2.015625, "grad_norm_var": 0.05185139973958333, "learning_rate": 0.0001, "loss": 2.8812, "loss/crossentropy": 2.224932336807251, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.21053457856178284, "loss/reg": 0.0, "step": 47130 }, { "epoch": 0.3101315789473684, "grad_norm": 2.546875, "grad_norm_var": 0.025470987955729166, "learning_rate": 0.0001, "loss": 2.955, "loss/crossentropy": 2.3104440689086916, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.23976083248853683, "loss/reg": 0.0, "step": 47140 }, { "epoch": 0.3101973684210526, "grad_norm": 2.46875, "grad_norm_var": 0.0161041259765625, "learning_rate": 0.0001, "loss": 2.8767, "loss/crossentropy": 2.20803416967392, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.2076631098985672, "loss/reg": 0.0, "step": 47150 }, { "epoch": 0.31026315789473685, "grad_norm": 2.09375, "grad_norm_var": 0.0688873291015625, "learning_rate": 0.0001, "loss": 2.9268, "loss/crossentropy": 2.171284222602844, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.2102009579539299, "loss/reg": 0.0, "step": 47160 }, { "epoch": 0.31032894736842104, "grad_norm": 2.25, "grad_norm_var": 0.062254842122395834, "learning_rate": 0.0001, "loss": 2.8293, "loss/crossentropy": 2.4027581930160524, "loss/hidden": 2.5859375, "loss/incoh": 0.0, "loss/logits": 0.203232841193676, "loss/reg": 0.0, "step": 47170 }, { "epoch": 0.3103947368421053, "grad_norm": 2.40625, "grad_norm_var": 0.07753499348958333, "learning_rate": 0.0001, "loss": 2.9945, "loss/crossentropy": 2.3069218039512633, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.2285212755203247, "loss/reg": 0.0, "step": 47180 }, { "epoch": 0.31046052631578946, "grad_norm": 2.375, "grad_norm_var": 0.04003499348958333, "learning_rate": 0.0001, "loss": 2.8566, "loss/crossentropy": 2.1290971398353578, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.2064266324043274, "loss/reg": 0.0, "step": 47190 }, { "epoch": 0.3105263157894737, "grad_norm": 2.359375, "grad_norm_var": 0.05724283854166667, "learning_rate": 0.0001, "loss": 2.864, "loss/crossentropy": 2.1773618817329408, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.2198542281985283, "loss/reg": 0.0, "step": 47200 }, { "epoch": 0.3105921052631579, "grad_norm": 2.5625, "grad_norm_var": 1.55546875, "learning_rate": 0.0001, "loss": 2.9448, "loss/crossentropy": 2.2356161952018736, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.22558419704437255, "loss/reg": 0.0, "step": 47210 }, { "epoch": 0.31065789473684213, "grad_norm": 2.265625, "grad_norm_var": 1.4870025634765625, "learning_rate": 0.0001, "loss": 2.9794, "loss/crossentropy": 2.4171629667282106, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.21991426944732667, "loss/reg": 0.0, "step": 47220 }, { "epoch": 0.3107236842105263, "grad_norm": 2.3125, "grad_norm_var": 0.07053629557291667, "learning_rate": 0.0001, "loss": 2.8347, "loss/crossentropy": 2.2315442681312563, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.2142227217555046, "loss/reg": 0.0, "step": 47230 }, { "epoch": 0.3107894736842105, "grad_norm": 2.546875, "grad_norm_var": 0.07381083170572916, "learning_rate": 0.0001, "loss": 2.892, "loss/crossentropy": 2.3194393634796144, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.21396755427122116, "loss/reg": 0.0, "step": 47240 }, { "epoch": 0.31085526315789475, "grad_norm": 2.296875, "grad_norm_var": 0.28253580729166666, "learning_rate": 0.0001, "loss": 2.9059, "loss/crossentropy": 2.4150348782539366, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.21279818564653397, "loss/reg": 0.0, "step": 47250 }, { "epoch": 0.31092105263157893, "grad_norm": 2.65625, "grad_norm_var": 0.19973551432291667, "learning_rate": 0.0001, "loss": 2.9792, "loss/crossentropy": 2.2982528805732727, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2317174881696701, "loss/reg": 0.0, "step": 47260 }, { "epoch": 0.3109868421052632, "grad_norm": 2.609375, "grad_norm_var": 0.29963785807291665, "learning_rate": 0.0001, "loss": 2.9112, "loss/crossentropy": 2.4336291551589966, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.23635929375886916, "loss/reg": 0.0, "step": 47270 }, { "epoch": 0.31105263157894736, "grad_norm": 2.265625, "grad_norm_var": 0.30977274576822916, "learning_rate": 0.0001, "loss": 2.8752, "loss/crossentropy": 2.2698886513710024, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.20392750948667526, "loss/reg": 0.0, "step": 47280 }, { "epoch": 0.3111184210526316, "grad_norm": 2.265625, "grad_norm_var": 0.03795572916666667, "learning_rate": 0.0001, "loss": 2.9047, "loss/crossentropy": 2.2182451486587524, "loss/hidden": 2.5984375, "loss/incoh": 0.0, "loss/logits": 0.21638350188732147, "loss/reg": 0.0, "step": 47290 }, { "epoch": 0.3111842105263158, "grad_norm": 2.53125, "grad_norm_var": 0.09938151041666667, "learning_rate": 0.0001, "loss": 2.8911, "loss/crossentropy": 2.2719017028808595, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.17746797800064087, "loss/reg": 0.0, "step": 47300 }, { "epoch": 0.31125, "grad_norm": 2.359375, "grad_norm_var": 0.07478739420572916, "learning_rate": 0.0001, "loss": 2.8981, "loss/crossentropy": 2.2905858039855955, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.188574481010437, "loss/reg": 0.0, "step": 47310 }, { "epoch": 0.3113157894736842, "grad_norm": 2.109375, "grad_norm_var": 0.08089192708333333, "learning_rate": 0.0001, "loss": 2.8542, "loss/crossentropy": 2.226369655132294, "loss/hidden": 2.5296875, "loss/incoh": 0.0, "loss/logits": 0.1978940784931183, "loss/reg": 0.0, "step": 47320 }, { "epoch": 0.3113815789473684, "grad_norm": 2.28125, "grad_norm_var": 0.04794921875, "learning_rate": 0.0001, "loss": 2.8483, "loss/crossentropy": 2.3317304372787477, "loss/hidden": 2.5859375, "loss/incoh": 0.0, "loss/logits": 0.21459078341722487, "loss/reg": 0.0, "step": 47330 }, { "epoch": 0.31144736842105264, "grad_norm": 2.359375, "grad_norm_var": 0.0774322509765625, "learning_rate": 0.0001, "loss": 2.9155, "loss/crossentropy": 2.1292049527168273, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23992046266794204, "loss/reg": 0.0, "step": 47340 }, { "epoch": 0.3115131578947368, "grad_norm": 2.296875, "grad_norm_var": 0.07056884765625, "learning_rate": 0.0001, "loss": 2.896, "loss/crossentropy": 2.2794556856155395, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.22360755279660224, "loss/reg": 0.0, "step": 47350 }, { "epoch": 0.31157894736842107, "grad_norm": 2.0625, "grad_norm_var": 0.04049072265625, "learning_rate": 0.0001, "loss": 2.9127, "loss/crossentropy": 2.27066285610199, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.257367941737175, "loss/reg": 0.0, "step": 47360 }, { "epoch": 0.31164473684210525, "grad_norm": 2.125, "grad_norm_var": 0.07480061848958333, "learning_rate": 0.0001, "loss": 2.9146, "loss/crossentropy": 2.4018382906913756, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.23269833326339723, "loss/reg": 0.0, "step": 47370 }, { "epoch": 0.3117105263157895, "grad_norm": 1.9140625, "grad_norm_var": 0.06328506469726562, "learning_rate": 0.0001, "loss": 2.9133, "loss/crossentropy": 2.436321532726288, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2123062089085579, "loss/reg": 0.0, "step": 47380 }, { "epoch": 0.3117763157894737, "grad_norm": 2.375, "grad_norm_var": 0.1536883036295573, "learning_rate": 0.0001, "loss": 2.8985, "loss/crossentropy": 2.4360718727111816, "loss/hidden": 2.503125, "loss/incoh": 0.0, "loss/logits": 0.18638769686222076, "loss/reg": 0.0, "step": 47390 }, { "epoch": 0.3118421052631579, "grad_norm": 2.546875, "grad_norm_var": 0.1322906494140625, "learning_rate": 0.0001, "loss": 2.9255, "loss/crossentropy": 2.2739851474761963, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.23892153799533844, "loss/reg": 0.0, "step": 47400 }, { "epoch": 0.3119078947368421, "grad_norm": 2.25, "grad_norm_var": 0.049559529622395834, "learning_rate": 0.0001, "loss": 2.867, "loss/crossentropy": 2.4996579766273497, "loss/hidden": 2.58125, "loss/incoh": 0.0, "loss/logits": 0.20919591784477234, "loss/reg": 0.0, "step": 47410 }, { "epoch": 0.3119736842105263, "grad_norm": 2.390625, "grad_norm_var": 0.11883138020833334, "learning_rate": 0.0001, "loss": 2.945, "loss/crossentropy": 2.145795100927353, "loss/hidden": 2.5421875, "loss/incoh": 0.0, "loss/logits": 0.18240004181861877, "loss/reg": 0.0, "step": 47420 }, { "epoch": 0.31203947368421053, "grad_norm": 2.09375, "grad_norm_var": 0.1162506103515625, "learning_rate": 0.0001, "loss": 2.8565, "loss/crossentropy": 2.1218804359436034, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.2017826572060585, "loss/reg": 0.0, "step": 47430 }, { "epoch": 0.3121052631578947, "grad_norm": 2.4375, "grad_norm_var": 0.0540435791015625, "learning_rate": 0.0001, "loss": 2.8863, "loss/crossentropy": 2.4842190861701967, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.22418318539857865, "loss/reg": 0.0, "step": 47440 }, { "epoch": 0.31217105263157896, "grad_norm": 2.328125, "grad_norm_var": 0.11988016764322916, "learning_rate": 0.0001, "loss": 2.924, "loss/crossentropy": 2.5431269645690917, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.22005292475223542, "loss/reg": 0.0, "step": 47450 }, { "epoch": 0.31223684210526315, "grad_norm": 2.671875, "grad_norm_var": 0.044990793863932295, "learning_rate": 0.0001, "loss": 2.8641, "loss/crossentropy": 2.237706160545349, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.19899242296814917, "loss/reg": 0.0, "step": 47460 }, { "epoch": 0.3123026315789474, "grad_norm": 2.046875, "grad_norm_var": 0.037261708577473955, "learning_rate": 0.0001, "loss": 2.8567, "loss/crossentropy": 2.3878122091293337, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.20960068255662917, "loss/reg": 0.0, "step": 47470 }, { "epoch": 0.3123684210526316, "grad_norm": 2.890625, "grad_norm_var": 0.10047098795572916, "learning_rate": 0.0001, "loss": 2.925, "loss/crossentropy": 2.5351375579833983, "loss/hidden": 2.5421875, "loss/incoh": 0.0, "loss/logits": 0.2197231486439705, "loss/reg": 0.0, "step": 47480 }, { "epoch": 0.3124342105263158, "grad_norm": 2.515625, "grad_norm_var": 0.4715166727701823, "learning_rate": 0.0001, "loss": 2.9223, "loss/crossentropy": 2.1534179925918577, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.2107102245092392, "loss/reg": 0.0, "step": 47490 }, { "epoch": 0.3125, "grad_norm": 2.546875, "grad_norm_var": 0.42719624837239584, "learning_rate": 0.0001, "loss": 2.9506, "loss/crossentropy": 2.3555686116218566, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.2777353286743164, "loss/reg": 0.0, "step": 47500 }, { "epoch": 0.3125657894736842, "grad_norm": 1.9453125, "grad_norm_var": 0.039469146728515626, "learning_rate": 0.0001, "loss": 2.8577, "loss/crossentropy": 2.3224485993385313, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.21028360202908516, "loss/reg": 0.0, "step": 47510 }, { "epoch": 0.3126315789473684, "grad_norm": 2.328125, "grad_norm_var": 0.048990631103515626, "learning_rate": 0.0001, "loss": 2.8893, "loss/crossentropy": 2.1786094903945923, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.22256029546260833, "loss/reg": 0.0, "step": 47520 }, { "epoch": 0.3126973684210526, "grad_norm": 2.34375, "grad_norm_var": 0.20402018229166666, "learning_rate": 0.0001, "loss": 2.9599, "loss/crossentropy": 2.239318060874939, "loss/hidden": 2.58125, "loss/incoh": 0.0, "loss/logits": 0.1920368880033493, "loss/reg": 0.0, "step": 47530 }, { "epoch": 0.31276315789473685, "grad_norm": 2.5, "grad_norm_var": 0.15624593098958334, "learning_rate": 0.0001, "loss": 2.9014, "loss/crossentropy": 2.3715943336486816, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.23497617095708848, "loss/reg": 0.0, "step": 47540 }, { "epoch": 0.31282894736842104, "grad_norm": 2.421875, "grad_norm_var": 0.13093159993489584, "learning_rate": 0.0001, "loss": 2.8851, "loss/crossentropy": 2.3275977253913878, "loss/hidden": 2.509375, "loss/incoh": 0.0, "loss/logits": 0.18069526553153992, "loss/reg": 0.0, "step": 47550 }, { "epoch": 0.3128947368421053, "grad_norm": 2.21875, "grad_norm_var": 0.10741780598958334, "learning_rate": 0.0001, "loss": 2.9183, "loss/crossentropy": 2.182415187358856, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.18630916029214858, "loss/reg": 0.0, "step": 47560 }, { "epoch": 0.31296052631578947, "grad_norm": 2.375, "grad_norm_var": 0.08325093587239583, "learning_rate": 0.0001, "loss": 2.8784, "loss/crossentropy": 2.087134909629822, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.22054121196269988, "loss/reg": 0.0, "step": 47570 }, { "epoch": 0.3130263157894737, "grad_norm": 2.171875, "grad_norm_var": 0.73795166015625, "learning_rate": 0.0001, "loss": 2.904, "loss/crossentropy": 2.541267967224121, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.25836118310689926, "loss/reg": 0.0, "step": 47580 }, { "epoch": 0.3130921052631579, "grad_norm": 2.34375, "grad_norm_var": 0.13954976399739583, "learning_rate": 0.0001, "loss": 2.8817, "loss/crossentropy": 2.0592684149742126, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.1761935718357563, "loss/reg": 0.0, "step": 47590 }, { "epoch": 0.3131578947368421, "grad_norm": 2.234375, "grad_norm_var": 0.03362223307291667, "learning_rate": 0.0001, "loss": 2.944, "loss/crossentropy": 2.1959443509578707, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.1908652253448963, "loss/reg": 0.0, "step": 47600 }, { "epoch": 0.3132236842105263, "grad_norm": 2.640625, "grad_norm_var": 0.1462799072265625, "learning_rate": 0.0001, "loss": 3.0463, "loss/crossentropy": 2.0683607935905455, "loss/hidden": 3.03125, "loss/incoh": 0.0, "loss/logits": 0.26247444152832033, "loss/reg": 0.0, "step": 47610 }, { "epoch": 0.3132894736842105, "grad_norm": 2.296875, "grad_norm_var": 0.06212565104166667, "learning_rate": 0.0001, "loss": 2.8854, "loss/crossentropy": 2.1339609384536744, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.29851278066635134, "loss/reg": 0.0, "step": 47620 }, { "epoch": 0.31335526315789475, "grad_norm": 2.359375, "grad_norm_var": 0.018115234375, "learning_rate": 0.0001, "loss": 2.8918, "loss/crossentropy": 2.3508539319038393, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.20801450163125992, "loss/reg": 0.0, "step": 47630 }, { "epoch": 0.31342105263157893, "grad_norm": 2.25, "grad_norm_var": 0.020246378580729165, "learning_rate": 0.0001, "loss": 2.8484, "loss/crossentropy": 2.287714219093323, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.22931959182024003, "loss/reg": 0.0, "step": 47640 }, { "epoch": 0.3134868421052632, "grad_norm": 2.0625, "grad_norm_var": 0.04388020833333333, "learning_rate": 0.0001, "loss": 2.8549, "loss/crossentropy": 2.3919747710227965, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.20896707028150557, "loss/reg": 0.0, "step": 47650 }, { "epoch": 0.31355263157894736, "grad_norm": 2.734375, "grad_norm_var": 0.07839253743489584, "learning_rate": 0.0001, "loss": 2.9728, "loss/crossentropy": 2.239887022972107, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.18265613168478012, "loss/reg": 0.0, "step": 47660 }, { "epoch": 0.3136184210526316, "grad_norm": 2.359375, "grad_norm_var": 0.08300679524739583, "learning_rate": 0.0001, "loss": 2.8803, "loss/crossentropy": 2.10336891412735, "loss/hidden": 2.5609375, "loss/incoh": 0.0, "loss/logits": 0.18336264789104462, "loss/reg": 0.0, "step": 47670 }, { "epoch": 0.3136842105263158, "grad_norm": 3.078125, "grad_norm_var": 0.07238667805989583, "learning_rate": 0.0001, "loss": 2.9177, "loss/crossentropy": 2.223607176542282, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.1979481853544712, "loss/reg": 0.0, "step": 47680 }, { "epoch": 0.31375, "grad_norm": 2.71875, "grad_norm_var": 0.06995035807291666, "learning_rate": 0.0001, "loss": 2.9713, "loss/crossentropy": 2.320642650127411, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.2153625890612602, "loss/reg": 0.0, "step": 47690 }, { "epoch": 0.3138157894736842, "grad_norm": 2.875, "grad_norm_var": 0.05663655598958333, "learning_rate": 0.0001, "loss": 2.9218, "loss/crossentropy": 2.330680477619171, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.3164501816034317, "loss/reg": 0.0, "step": 47700 }, { "epoch": 0.3138815789473684, "grad_norm": 2.265625, "grad_norm_var": 0.05095113118489583, "learning_rate": 0.0001, "loss": 2.85, "loss/crossentropy": 1.9173928856849671, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.19458426684141159, "loss/reg": 0.0, "step": 47710 }, { "epoch": 0.31394736842105264, "grad_norm": 2.84375, "grad_norm_var": 1.5052314135403178e+17, "learning_rate": 0.0001, "loss": 3.0506, "loss/crossentropy": 2.440174865722656, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.24388239085674285, "loss/reg": 0.0, "step": 47720 }, { "epoch": 0.3140131578947368, "grad_norm": 2.546875, "grad_norm_var": 1.5052314136494352e+17, "learning_rate": 0.0001, "loss": 2.9105, "loss/crossentropy": 2.3985848426818848, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.19902859181165694, "loss/reg": 0.0, "step": 47730 }, { "epoch": 0.31407894736842107, "grad_norm": 2.796875, "grad_norm_var": 0.059015909830729164, "learning_rate": 0.0001, "loss": 2.8874, "loss/crossentropy": 2.1807025194168093, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.22236142605543135, "loss/reg": 0.0, "step": 47740 }, { "epoch": 0.31414473684210525, "grad_norm": 2.359375, "grad_norm_var": 0.053938802083333334, "learning_rate": 0.0001, "loss": 2.8975, "loss/crossentropy": 2.318903088569641, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.21749626994132995, "loss/reg": 0.0, "step": 47750 }, { "epoch": 0.3142105263157895, "grad_norm": 2.4375, "grad_norm_var": 0.033951822916666666, "learning_rate": 0.0001, "loss": 2.905, "loss/crossentropy": 2.2025362849235535, "loss/hidden": 2.53125, "loss/incoh": 0.0, "loss/logits": 0.18802652433514594, "loss/reg": 0.0, "step": 47760 }, { "epoch": 0.3142763157894737, "grad_norm": 2.046875, "grad_norm_var": 0.06155192057291667, "learning_rate": 0.0001, "loss": 2.923, "loss/crossentropy": 2.11036012172699, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2140059530735016, "loss/reg": 0.0, "step": 47770 }, { "epoch": 0.31434210526315787, "grad_norm": 2.125, "grad_norm_var": 1.6159088134765625, "learning_rate": 0.0001, "loss": 2.895, "loss/crossentropy": 2.0403364419937136, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.20629735440015792, "loss/reg": 0.0, "step": 47780 }, { "epoch": 0.3144078947368421, "grad_norm": 2.515625, "grad_norm_var": 1.5314737955729167, "learning_rate": 0.0001, "loss": 2.9647, "loss/crossentropy": 2.4687071561813356, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.27800605446100235, "loss/reg": 0.0, "step": 47790 }, { "epoch": 0.3144736842105263, "grad_norm": 2.5625, "grad_norm_var": 0.15804036458333334, "learning_rate": 0.0001, "loss": 2.9574, "loss/crossentropy": 2.4449599504470827, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.22672125399112703, "loss/reg": 0.0, "step": 47800 }, { "epoch": 0.31453947368421054, "grad_norm": 2.375, "grad_norm_var": 0.061507161458333334, "learning_rate": 0.0001, "loss": 2.9461, "loss/crossentropy": 2.3883477687835692, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.208871478587389, "loss/reg": 0.0, "step": 47810 }, { "epoch": 0.3146052631578947, "grad_norm": 2.390625, "grad_norm_var": 0.07379150390625, "learning_rate": 0.0001, "loss": 2.9078, "loss/crossentropy": 2.221570980548859, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.2067367672920227, "loss/reg": 0.0, "step": 47820 }, { "epoch": 0.31467105263157896, "grad_norm": 2.21875, "grad_norm_var": 0.0278717041015625, "learning_rate": 0.0001, "loss": 2.9276, "loss/crossentropy": 2.0075812578201293, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.2325127363204956, "loss/reg": 0.0, "step": 47830 }, { "epoch": 0.31473684210526315, "grad_norm": 1853882368.0, "grad_norm_var": 2.1480498904752566e+17, "learning_rate": 0.0001, "loss": 3.1007, "loss/crossentropy": 2.3326231360435488, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.22243160158395767, "loss/reg": 0.0, "step": 47840 }, { "epoch": 0.3148026315789474, "grad_norm": 2.453125, "grad_norm_var": 2.1480498903714582e+17, "learning_rate": 0.0001, "loss": 2.8565, "loss/crossentropy": 2.230215311050415, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2210487097501755, "loss/reg": 0.0, "step": 47850 }, { "epoch": 0.3148684210526316, "grad_norm": 2.140625, "grad_norm_var": 0.0411041259765625, "learning_rate": 0.0001, "loss": 2.9034, "loss/crossentropy": 2.5245557546615602, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.22323217540979384, "loss/reg": 0.0, "step": 47860 }, { "epoch": 0.3149342105263158, "grad_norm": 2.796875, "grad_norm_var": 0.08408915201822917, "learning_rate": 0.0001, "loss": 2.8951, "loss/crossentropy": 2.595898985862732, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.1952609673142433, "loss/reg": 0.0, "step": 47870 }, { "epoch": 0.315, "grad_norm": 2.203125, "grad_norm_var": 0.0993670145670573, "learning_rate": 0.0001, "loss": 2.8313, "loss/crossentropy": 2.552921807765961, "loss/hidden": 2.4640625, "loss/incoh": 0.0, "loss/logits": 0.18224859684705735, "loss/reg": 0.0, "step": 47880 }, { "epoch": 0.3150657894736842, "grad_norm": 2.453125, "grad_norm_var": 0.06483535766601563, "learning_rate": 0.0001, "loss": 2.9292, "loss/crossentropy": 2.4762013435363768, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.22217516154050826, "loss/reg": 0.0, "step": 47890 }, { "epoch": 0.31513157894736843, "grad_norm": 2.390625, "grad_norm_var": 0.03127848307291667, "learning_rate": 0.0001, "loss": 2.8655, "loss/crossentropy": 2.0918250560760496, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.20359376221895217, "loss/reg": 0.0, "step": 47900 }, { "epoch": 0.3151973684210526, "grad_norm": 2.1875, "grad_norm_var": 0.13836263020833334, "learning_rate": 0.0001, "loss": 2.9297, "loss/crossentropy": 2.1880165934562683, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.2320842534303665, "loss/reg": 0.0, "step": 47910 }, { "epoch": 0.31526315789473686, "grad_norm": 2.15625, "grad_norm_var": 0.13775634765625, "learning_rate": 0.0001, "loss": 2.9266, "loss/crossentropy": 2.30414457321167, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.20234824791550637, "loss/reg": 0.0, "step": 47920 }, { "epoch": 0.31532894736842104, "grad_norm": 2.25, "grad_norm_var": 0.12180989583333333, "learning_rate": 0.0001, "loss": 2.8957, "loss/crossentropy": 2.4937914848327636, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.2237060159444809, "loss/reg": 0.0, "step": 47930 }, { "epoch": 0.3153947368421053, "grad_norm": 2.09375, "grad_norm_var": 0.1736328125, "learning_rate": 0.0001, "loss": 2.9359, "loss/crossentropy": 2.2661273002624513, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.2152247577905655, "loss/reg": 0.0, "step": 47940 }, { "epoch": 0.31546052631578947, "grad_norm": 2.75, "grad_norm_var": 0.08046468098958333, "learning_rate": 0.0001, "loss": 2.8849, "loss/crossentropy": 2.249963569641113, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.2286963388323784, "loss/reg": 0.0, "step": 47950 }, { "epoch": 0.3155263157894737, "grad_norm": 2.375, "grad_norm_var": 0.06696675618489584, "learning_rate": 0.0001, "loss": 2.8899, "loss/crossentropy": 2.345364308357239, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.22313790619373322, "loss/reg": 0.0, "step": 47960 }, { "epoch": 0.3155921052631579, "grad_norm": 2.140625, "grad_norm_var": 0.095361328125, "learning_rate": 0.0001, "loss": 2.9455, "loss/crossentropy": 2.273095965385437, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.22394748479127885, "loss/reg": 0.0, "step": 47970 }, { "epoch": 0.3156578947368421, "grad_norm": 2.359375, "grad_norm_var": 0.10379130045572917, "learning_rate": 0.0001, "loss": 2.8768, "loss/crossentropy": 2.4961345195770264, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.20829266160726548, "loss/reg": 0.0, "step": 47980 }, { "epoch": 0.3157236842105263, "grad_norm": 2.53125, "grad_norm_var": 0.1017242431640625, "learning_rate": 0.0001, "loss": 2.9214, "loss/crossentropy": 2.429774749279022, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2559376731514931, "loss/reg": 0.0, "step": 47990 }, { "epoch": 0.3157894736842105, "grad_norm": 2.28125, "grad_norm_var": 0.03125, "learning_rate": 0.0001, "loss": 2.9195, "loss/crossentropy": 2.493117320537567, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.2011844500899315, "loss/reg": 0.0, "step": 48000 }, { "epoch": 0.31585526315789475, "grad_norm": 2.453125, "grad_norm_var": 0.034789021809895834, "learning_rate": 0.0001, "loss": 2.844, "loss/crossentropy": 2.1538923740386964, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.2279973089694977, "loss/reg": 0.0, "step": 48010 }, { "epoch": 0.31592105263157894, "grad_norm": 2.6875, "grad_norm_var": 0.0507720947265625, "learning_rate": 0.0001, "loss": 2.9436, "loss/crossentropy": 2.3753435134887697, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.25982323437929156, "loss/reg": 0.0, "step": 48020 }, { "epoch": 0.3159868421052632, "grad_norm": 2.265625, "grad_norm_var": 0.06646728515625, "learning_rate": 0.0001, "loss": 2.9013, "loss/crossentropy": 2.32724791765213, "loss/hidden": 2.628125, "loss/incoh": 0.0, "loss/logits": 0.21036870926618575, "loss/reg": 0.0, "step": 48030 }, { "epoch": 0.31605263157894736, "grad_norm": 2.328125, "grad_norm_var": 0.08539937337239584, "learning_rate": 0.0001, "loss": 2.9884, "loss/crossentropy": 2.399867832660675, "loss/hidden": 2.5890625, "loss/incoh": 0.0, "loss/logits": 0.20067741870880126, "loss/reg": 0.0, "step": 48040 }, { "epoch": 0.3161184210526316, "grad_norm": 2.15625, "grad_norm_var": 0.09907938639322916, "learning_rate": 0.0001, "loss": 2.8446, "loss/crossentropy": 2.484140694141388, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.23947062641382216, "loss/reg": 0.0, "step": 48050 }, { "epoch": 0.3161842105263158, "grad_norm": 2.140625, "grad_norm_var": 0.034912109375, "learning_rate": 0.0001, "loss": 2.9458, "loss/crossentropy": 2.3192034482955934, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.21113595217466355, "loss/reg": 0.0, "step": 48060 }, { "epoch": 0.31625, "grad_norm": 2.546875, "grad_norm_var": 0.01607666015625, "learning_rate": 0.0001, "loss": 2.9295, "loss/crossentropy": 2.1492195785045625, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.18331362754106523, "loss/reg": 0.0, "step": 48070 }, { "epoch": 0.3163157894736842, "grad_norm": 2.28125, "grad_norm_var": 0.60830078125, "learning_rate": 0.0001, "loss": 2.8483, "loss/crossentropy": 2.349029487371445, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.2231395237147808, "loss/reg": 0.0, "step": 48080 }, { "epoch": 0.3163815789473684, "grad_norm": 2.515625, "grad_norm_var": 0.5968007405598958, "learning_rate": 0.0001, "loss": 2.8759, "loss/crossentropy": 2.1751973271369933, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.20116825103759767, "loss/reg": 0.0, "step": 48090 }, { "epoch": 0.31644736842105264, "grad_norm": 2.234375, "grad_norm_var": 0.25004781087239586, "learning_rate": 0.0001, "loss": 2.9444, "loss/crossentropy": 2.446299123764038, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.20448354184627532, "loss/reg": 0.0, "step": 48100 }, { "epoch": 0.31651315789473683, "grad_norm": 3.15625, "grad_norm_var": 0.08663736979166667, "learning_rate": 0.0001, "loss": 2.824, "loss/crossentropy": 1.8551177978515625, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.1948135480284691, "loss/reg": 0.0, "step": 48110 }, { "epoch": 0.31657894736842107, "grad_norm": 2.515625, "grad_norm_var": 0.10354410807291667, "learning_rate": 0.0001, "loss": 2.8311, "loss/crossentropy": 2.1509138345718384, "loss/hidden": 2.58125, "loss/incoh": 0.0, "loss/logits": 0.19649668633937836, "loss/reg": 0.0, "step": 48120 }, { "epoch": 0.31664473684210526, "grad_norm": 2.453125, "grad_norm_var": 0.08146870930989583, "learning_rate": 0.0001, "loss": 2.8297, "loss/crossentropy": 1.861658263206482, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.17768474370241166, "loss/reg": 0.0, "step": 48130 }, { "epoch": 0.3167105263157895, "grad_norm": 2.578125, "grad_norm_var": 0.09993082682291667, "learning_rate": 0.0001, "loss": 2.9458, "loss/crossentropy": 2.4552276849746706, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.2319492816925049, "loss/reg": 0.0, "step": 48140 }, { "epoch": 0.3167763157894737, "grad_norm": 2.140625, "grad_norm_var": 0.13113505045572918, "learning_rate": 0.0001, "loss": 2.862, "loss/crossentropy": 2.248338222503662, "loss/hidden": 2.553125, "loss/incoh": 0.0, "loss/logits": 0.1969129338860512, "loss/reg": 0.0, "step": 48150 }, { "epoch": 0.31684210526315787, "grad_norm": 2.34375, "grad_norm_var": 0.38157552083333335, "learning_rate": 0.0001, "loss": 2.8613, "loss/crossentropy": 2.260501515865326, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.18076354935765265, "loss/reg": 0.0, "step": 48160 }, { "epoch": 0.3169078947368421, "grad_norm": 2.40625, "grad_norm_var": 0.07310791015625, "learning_rate": 0.0001, "loss": 2.9252, "loss/crossentropy": 2.4308408975601195, "loss/hidden": 2.5515625, "loss/incoh": 0.0, "loss/logits": 0.20350244641304016, "loss/reg": 0.0, "step": 48170 }, { "epoch": 0.3169736842105263, "grad_norm": 2.390625, "grad_norm_var": 0.07569071451822916, "learning_rate": 0.0001, "loss": 2.9254, "loss/crossentropy": 2.3121266603469848, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.24460460841655732, "loss/reg": 0.0, "step": 48180 }, { "epoch": 0.31703947368421054, "grad_norm": 2.5, "grad_norm_var": 0.5751047770182292, "learning_rate": 0.0001, "loss": 2.8459, "loss/crossentropy": 2.006595250964165, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.2391217201948166, "loss/reg": 0.0, "step": 48190 }, { "epoch": 0.3171052631578947, "grad_norm": 2.625, "grad_norm_var": 0.7856435139973958, "learning_rate": 0.0001, "loss": 2.9197, "loss/crossentropy": 2.2920273542404175, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.300459124147892, "loss/reg": 0.0, "step": 48200 }, { "epoch": 0.31717105263157896, "grad_norm": 2.546875, "grad_norm_var": 0.17291259765625, "learning_rate": 0.0001, "loss": 2.9282, "loss/crossentropy": 2.2983280539512636, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.19814616739749907, "loss/reg": 0.0, "step": 48210 }, { "epoch": 0.31723684210526315, "grad_norm": 2.203125, "grad_norm_var": 0.1888580322265625, "learning_rate": 0.0001, "loss": 2.8479, "loss/crossentropy": 2.4685839414596558, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.19216738790273666, "loss/reg": 0.0, "step": 48220 }, { "epoch": 0.3173026315789474, "grad_norm": 2.296875, "grad_norm_var": 0.19517313639322917, "learning_rate": 0.0001, "loss": 2.8937, "loss/crossentropy": 2.178745114803314, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.2260315828025341, "loss/reg": 0.0, "step": 48230 }, { "epoch": 0.3173684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.041169230143229166, "learning_rate": 0.0001, "loss": 2.9063, "loss/crossentropy": 2.0943093478679655, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.21273743212223054, "loss/reg": 0.0, "step": 48240 }, { "epoch": 0.31743421052631576, "grad_norm": 2.140625, "grad_norm_var": 0.029296875, "learning_rate": 0.0001, "loss": 2.8852, "loss/crossentropy": 2.393284833431244, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.2024989292025566, "loss/reg": 0.0, "step": 48250 }, { "epoch": 0.3175, "grad_norm": 2.140625, "grad_norm_var": 0.03004150390625, "learning_rate": 0.0001, "loss": 2.8511, "loss/crossentropy": 2.1866501688957216, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.1884273424744606, "loss/reg": 0.0, "step": 48260 }, { "epoch": 0.3175657894736842, "grad_norm": 2.421875, "grad_norm_var": 0.04527079264322917, "learning_rate": 0.0001, "loss": 2.9111, "loss/crossentropy": 2.1250454902648928, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.24670460373163222, "loss/reg": 0.0, "step": 48270 }, { "epoch": 0.31763157894736843, "grad_norm": 2.671875, "grad_norm_var": 0.0383941650390625, "learning_rate": 0.0001, "loss": 2.9266, "loss/crossentropy": 2.172704815864563, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.22742634862661362, "loss/reg": 0.0, "step": 48280 }, { "epoch": 0.3176973684210526, "grad_norm": 2.4375, "grad_norm_var": 0.17049153645833334, "learning_rate": 0.0001, "loss": 2.8866, "loss/crossentropy": 2.493341588973999, "loss/hidden": 2.5578125, "loss/incoh": 0.0, "loss/logits": 0.2169245198369026, "loss/reg": 0.0, "step": 48290 }, { "epoch": 0.31776315789473686, "grad_norm": 2.171875, "grad_norm_var": 0.20450846354166666, "learning_rate": 0.0001, "loss": 2.8554, "loss/crossentropy": 2.4816686630249025, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.22373825311660767, "loss/reg": 0.0, "step": 48300 }, { "epoch": 0.31782894736842104, "grad_norm": 2.28125, "grad_norm_var": 0.03033447265625, "learning_rate": 0.0001, "loss": 2.8495, "loss/crossentropy": 2.1776862502098084, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.21240659952163696, "loss/reg": 0.0, "step": 48310 }, { "epoch": 0.3178947368421053, "grad_norm": 2.140625, "grad_norm_var": 0.03234049479166667, "learning_rate": 0.0001, "loss": 2.8774, "loss/crossentropy": 2.1492156267166136, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.19673264920711517, "loss/reg": 0.0, "step": 48320 }, { "epoch": 0.31796052631578947, "grad_norm": 2.28125, "grad_norm_var": 0.049609375, "learning_rate": 0.0001, "loss": 2.9294, "loss/crossentropy": 2.2977320909500123, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.19563792943954467, "loss/reg": 0.0, "step": 48330 }, { "epoch": 0.31802631578947366, "grad_norm": 2.65625, "grad_norm_var": 0.0579498291015625, "learning_rate": 0.0001, "loss": 2.9002, "loss/crossentropy": 2.281618130207062, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2275415539741516, "loss/reg": 0.0, "step": 48340 }, { "epoch": 0.3180921052631579, "grad_norm": 2.515625, "grad_norm_var": 0.1643218994140625, "learning_rate": 0.0001, "loss": 2.8894, "loss/crossentropy": 2.322252941131592, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.21429211795330047, "loss/reg": 0.0, "step": 48350 }, { "epoch": 0.3181578947368421, "grad_norm": 2.328125, "grad_norm_var": 0.15035400390625, "learning_rate": 0.0001, "loss": 2.8713, "loss/crossentropy": 2.1389993906021116, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.19286162182688713, "loss/reg": 0.0, "step": 48360 }, { "epoch": 0.3182236842105263, "grad_norm": 2.5, "grad_norm_var": 0.031591796875, "learning_rate": 0.0001, "loss": 2.9279, "loss/crossentropy": 2.1158339738845826, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.24611180424690246, "loss/reg": 0.0, "step": 48370 }, { "epoch": 0.3182894736842105, "grad_norm": 2.75, "grad_norm_var": 0.03711649576822917, "learning_rate": 0.0001, "loss": 2.9091, "loss/crossentropy": 2.2521745681762697, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.23796939700841904, "loss/reg": 0.0, "step": 48380 }, { "epoch": 0.31835526315789475, "grad_norm": 2.84375, "grad_norm_var": 0.08840738932291667, "learning_rate": 0.0001, "loss": 2.8577, "loss/crossentropy": 2.360404074192047, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.20013875067234038, "loss/reg": 0.0, "step": 48390 }, { "epoch": 0.31842105263157894, "grad_norm": 2.75, "grad_norm_var": 0.0508941650390625, "learning_rate": 0.0001, "loss": 2.8793, "loss/crossentropy": 2.352422773838043, "loss/hidden": 2.5359375, "loss/incoh": 0.0, "loss/logits": 0.20002578645944596, "loss/reg": 0.0, "step": 48400 }, { "epoch": 0.3184868421052632, "grad_norm": 2.875, "grad_norm_var": 0.0668365478515625, "learning_rate": 0.0001, "loss": 2.9198, "loss/crossentropy": 2.0061663508415224, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.18354280292987823, "loss/reg": 0.0, "step": 48410 }, { "epoch": 0.31855263157894737, "grad_norm": 2.625, "grad_norm_var": 0.0823394775390625, "learning_rate": 0.0001, "loss": 2.8305, "loss/crossentropy": 2.3757291793823243, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.2020177111029625, "loss/reg": 0.0, "step": 48420 }, { "epoch": 0.3186184210526316, "grad_norm": 2.5625, "grad_norm_var": 0.18567301432291666, "learning_rate": 0.0001, "loss": 2.9444, "loss/crossentropy": 2.3555168747901916, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.21999936550855637, "loss/reg": 0.0, "step": 48430 }, { "epoch": 0.3186842105263158, "grad_norm": 2.28125, "grad_norm_var": 0.2275787353515625, "learning_rate": 0.0001, "loss": 2.9646, "loss/crossentropy": 2.4941683292388914, "loss/hidden": 2.58125, "loss/incoh": 0.0, "loss/logits": 0.20963085144758226, "loss/reg": 0.0, "step": 48440 }, { "epoch": 0.31875, "grad_norm": 2.296875, "grad_norm_var": 0.17922770182291667, "learning_rate": 0.0001, "loss": 2.894, "loss/crossentropy": 2.315898859500885, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.21460773348808287, "loss/reg": 0.0, "step": 48450 }, { "epoch": 0.3188157894736842, "grad_norm": 2.25, "grad_norm_var": 0.16884663899739583, "learning_rate": 0.0001, "loss": 2.9352, "loss/crossentropy": 2.3479946732521055, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.20540411323308944, "loss/reg": 0.0, "step": 48460 }, { "epoch": 0.3188815789473684, "grad_norm": 2.484375, "grad_norm_var": 2.837443680575794e+17, "learning_rate": 0.0001, "loss": 2.9576, "loss/crossentropy": 2.165738654136658, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.20141223669052125, "loss/reg": 0.0, "step": 48470 }, { "epoch": 0.31894736842105265, "grad_norm": 2.40625, "grad_norm_var": 2.8374436802761632e+17, "learning_rate": 0.0001, "loss": 2.8612, "loss/crossentropy": 2.0683544993400576, "loss/hidden": 2.5640625, "loss/incoh": 0.0, "loss/logits": 0.1904444068670273, "loss/reg": 0.0, "step": 48480 }, { "epoch": 0.31901315789473683, "grad_norm": 2.546875, "grad_norm_var": 0.024149576822916668, "learning_rate": 0.0001, "loss": 2.9044, "loss/crossentropy": 2.149365448951721, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.20966950953006744, "loss/reg": 0.0, "step": 48490 }, { "epoch": 0.3190789473684211, "grad_norm": 2.65625, "grad_norm_var": 0.03964436848958333, "learning_rate": 0.0001, "loss": 2.8841, "loss/crossentropy": 2.0361252307891844, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.22005419433116913, "loss/reg": 0.0, "step": 48500 }, { "epoch": 0.31914473684210526, "grad_norm": 2.234375, "grad_norm_var": 0.09338785807291666, "learning_rate": 0.0001, "loss": 2.9186, "loss/crossentropy": 2.2767465472221375, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.19084959253668785, "loss/reg": 0.0, "step": 48510 }, { "epoch": 0.3192105263157895, "grad_norm": 2.609375, "grad_norm_var": 0.0809722900390625, "learning_rate": 0.0001, "loss": 2.9599, "loss/crossentropy": 2.2696720242500303, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.21025597751140596, "loss/reg": 0.0, "step": 48520 }, { "epoch": 0.3192763157894737, "grad_norm": 2.328125, "grad_norm_var": 0.046891276041666666, "learning_rate": 0.0001, "loss": 2.9138, "loss/crossentropy": 2.2747150301933288, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.1847570665180683, "loss/reg": 0.0, "step": 48530 }, { "epoch": 0.31934210526315787, "grad_norm": 2.34375, "grad_norm_var": 0.15712890625, "learning_rate": 0.0001, "loss": 2.8404, "loss/crossentropy": 2.243081784248352, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.22356289029121398, "loss/reg": 0.0, "step": 48540 }, { "epoch": 0.3194078947368421, "grad_norm": 2.53125, "grad_norm_var": 0.17273763020833333, "learning_rate": 0.0001, "loss": 2.957, "loss/crossentropy": 2.3117844104766845, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.21899890452623366, "loss/reg": 0.0, "step": 48550 }, { "epoch": 0.3194736842105263, "grad_norm": 2.515625, "grad_norm_var": 0.05608723958333333, "learning_rate": 0.0001, "loss": 2.8298, "loss/crossentropy": 2.1012299180030825, "loss/hidden": 2.55625, "loss/incoh": 0.0, "loss/logits": 0.18718306869268417, "loss/reg": 0.0, "step": 48560 }, { "epoch": 0.31953947368421054, "grad_norm": 3.015625, "grad_norm_var": 0.07720438639322917, "learning_rate": 0.0001, "loss": 2.9209, "loss/crossentropy": 2.466643476486206, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.3076913967728615, "loss/reg": 0.0, "step": 48570 }, { "epoch": 0.3196052631578947, "grad_norm": 2.640625, "grad_norm_var": 0.07153218587239583, "learning_rate": 0.0001, "loss": 2.9364, "loss/crossentropy": 2.32701975107193, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.23566516637802123, "loss/reg": 0.0, "step": 48580 }, { "epoch": 0.31967105263157897, "grad_norm": 2.75, "grad_norm_var": 2.302171834309896, "learning_rate": 0.0001, "loss": 2.9038, "loss/crossentropy": 2.4910380721092222, "loss/hidden": 2.8578125, "loss/incoh": 0.0, "loss/logits": 0.21845423579216003, "loss/reg": 0.0, "step": 48590 }, { "epoch": 0.31973684210526315, "grad_norm": 2.65625, "grad_norm_var": 2.3369293212890625, "learning_rate": 0.0001, "loss": 2.8744, "loss/crossentropy": 2.036991012096405, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.1922347828745842, "loss/reg": 0.0, "step": 48600 }, { "epoch": 0.3198026315789474, "grad_norm": 2.40625, "grad_norm_var": 0.07013931274414062, "learning_rate": 0.0001, "loss": 2.8794, "loss/crossentropy": 2.0422864139080046, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.20800687670707702, "loss/reg": 0.0, "step": 48610 }, { "epoch": 0.3198684210526316, "grad_norm": 2.484375, "grad_norm_var": 0.20564142862955728, "learning_rate": 0.0001, "loss": 2.865, "loss/crossentropy": 2.4818240284919737, "loss/hidden": 2.4953125, "loss/incoh": 0.0, "loss/logits": 0.1868782602250576, "loss/reg": 0.0, "step": 48620 }, { "epoch": 0.31993421052631577, "grad_norm": 2.25, "grad_norm_var": 0.178369140625, "learning_rate": 0.0001, "loss": 2.8385, "loss/crossentropy": 2.093449079990387, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.2353523850440979, "loss/reg": 0.0, "step": 48630 }, { "epoch": 0.32, "grad_norm": 2.578125, "grad_norm_var": 0.031981404622395834, "learning_rate": 0.0001, "loss": 2.8879, "loss/crossentropy": 2.454249179363251, "loss/hidden": 2.5484375, "loss/incoh": 0.0, "loss/logits": 0.19036460369825364, "loss/reg": 0.0, "step": 48640 }, { "epoch": 0.3200657894736842, "grad_norm": 2.015625, "grad_norm_var": 0.042867024739583336, "learning_rate": 0.0001, "loss": 2.8623, "loss/crossentropy": 2.297064745426178, "loss/hidden": 2.5484375, "loss/incoh": 0.0, "loss/logits": 0.19468972980976104, "loss/reg": 0.0, "step": 48650 }, { "epoch": 0.32013157894736843, "grad_norm": 2.359375, "grad_norm_var": 0.0404205322265625, "learning_rate": 0.0001, "loss": 2.8182, "loss/crossentropy": 2.056578814983368, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24695860147476195, "loss/reg": 0.0, "step": 48660 }, { "epoch": 0.3201973684210526, "grad_norm": 2.578125, "grad_norm_var": 0.13028971354166666, "learning_rate": 0.0001, "loss": 2.9125, "loss/crossentropy": 2.327669143676758, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.20416615009307862, "loss/reg": 0.0, "step": 48670 }, { "epoch": 0.32026315789473686, "grad_norm": 2.421875, "grad_norm_var": 0.10358072916666666, "learning_rate": 0.0001, "loss": 2.8794, "loss/crossentropy": 2.172209453582764, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.20705851912498474, "loss/reg": 0.0, "step": 48680 }, { "epoch": 0.32032894736842105, "grad_norm": 2.65625, "grad_norm_var": 0.04728902180989583, "learning_rate": 0.0001, "loss": 2.9495, "loss/crossentropy": 1.9710035145282745, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.19330802634358407, "loss/reg": 0.0, "step": 48690 }, { "epoch": 0.3203947368421053, "grad_norm": 2.765625, "grad_norm_var": 0.43522847493489586, "learning_rate": 0.0001, "loss": 2.923, "loss/crossentropy": 2.4312652468681337, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.2221812203526497, "loss/reg": 0.0, "step": 48700 }, { "epoch": 0.3204605263157895, "grad_norm": 2.0625, "grad_norm_var": 0.47359619140625, "learning_rate": 0.0001, "loss": 2.8472, "loss/crossentropy": 2.3014976620674132, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.22046991884708406, "loss/reg": 0.0, "step": 48710 }, { "epoch": 0.32052631578947366, "grad_norm": 3.109375, "grad_norm_var": 0.40172119140625, "learning_rate": 0.0001, "loss": 2.9263, "loss/crossentropy": 2.2770553469657897, "loss/hidden": 2.540625, "loss/incoh": 0.0, "loss/logits": 0.18486379757523536, "loss/reg": 0.0, "step": 48720 }, { "epoch": 0.3205921052631579, "grad_norm": 2.84375, "grad_norm_var": 0.3174224853515625, "learning_rate": 0.0001, "loss": 2.8826, "loss/crossentropy": 2.077821058034897, "loss/hidden": 2.9046875, "loss/incoh": 0.0, "loss/logits": 0.20089543014764785, "loss/reg": 0.0, "step": 48730 }, { "epoch": 0.3206578947368421, "grad_norm": 2.03125, "grad_norm_var": 0.07232666015625, "learning_rate": 0.0001, "loss": 2.8862, "loss/crossentropy": 2.1765020549297334, "loss/hidden": 2.5640625, "loss/incoh": 0.0, "loss/logits": 0.24207602739334105, "loss/reg": 0.0, "step": 48740 }, { "epoch": 0.3207236842105263, "grad_norm": 2.296875, "grad_norm_var": 0.05244140625, "learning_rate": 0.0001, "loss": 2.8765, "loss/crossentropy": 2.2446788787841796, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.2524278685450554, "loss/reg": 0.0, "step": 48750 }, { "epoch": 0.3207894736842105, "grad_norm": 2.703125, "grad_norm_var": 0.04015299479166667, "learning_rate": 0.0001, "loss": 2.973, "loss/crossentropy": 2.1072701811790466, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.21228813976049424, "loss/reg": 0.0, "step": 48760 }, { "epoch": 0.32085526315789475, "grad_norm": 2.5, "grad_norm_var": 0.034521484375, "learning_rate": 0.0001, "loss": 2.8863, "loss/crossentropy": 2.4306872606277468, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.21897053718566895, "loss/reg": 0.0, "step": 48770 }, { "epoch": 0.32092105263157894, "grad_norm": 2.65625, "grad_norm_var": 0.0386138916015625, "learning_rate": 0.0001, "loss": 2.8507, "loss/crossentropy": 2.120554503798485, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.22201697826385497, "loss/reg": 0.0, "step": 48780 }, { "epoch": 0.3209868421052632, "grad_norm": 2.625, "grad_norm_var": 0.12919820149739583, "learning_rate": 0.0001, "loss": 2.9236, "loss/crossentropy": 2.1547642946243286, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.2330174393951893, "loss/reg": 0.0, "step": 48790 }, { "epoch": 0.32105263157894737, "grad_norm": 2.609375, "grad_norm_var": 0.04812825520833333, "learning_rate": 0.0001, "loss": 2.9366, "loss/crossentropy": 2.1647364020347597, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.1908225104212761, "loss/reg": 0.0, "step": 48800 }, { "epoch": 0.32111842105263155, "grad_norm": 2.25, "grad_norm_var": 0.043456013997395834, "learning_rate": 0.0001, "loss": 2.8658, "loss/crossentropy": 2.461720180511475, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.21650804877281188, "loss/reg": 0.0, "step": 48810 }, { "epoch": 0.3211842105263158, "grad_norm": 2.171875, "grad_norm_var": 0.04706624348958333, "learning_rate": 0.0001, "loss": 2.8807, "loss/crossentropy": 2.2111263990402223, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.211372534930706, "loss/reg": 0.0, "step": 48820 }, { "epoch": 0.32125, "grad_norm": 2.96875, "grad_norm_var": 0.05627848307291667, "learning_rate": 0.0001, "loss": 2.8969, "loss/crossentropy": 2.057933932542801, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.2275244802236557, "loss/reg": 0.0, "step": 48830 }, { "epoch": 0.3213157894736842, "grad_norm": 2.46875, "grad_norm_var": 0.05849507649739583, "learning_rate": 0.0001, "loss": 2.9249, "loss/crossentropy": 2.237890601158142, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.224201962351799, "loss/reg": 0.0, "step": 48840 }, { "epoch": 0.3213815789473684, "grad_norm": 2.25, "grad_norm_var": 0.029320271809895833, "learning_rate": 0.0001, "loss": 2.9347, "loss/crossentropy": 2.167393219470978, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.1992882952094078, "loss/reg": 0.0, "step": 48850 }, { "epoch": 0.32144736842105265, "grad_norm": 4.90625, "grad_norm_var": 0.46609598795572915, "learning_rate": 0.0001, "loss": 2.9112, "loss/crossentropy": 2.2063543438911437, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.18422723188996315, "loss/reg": 0.0, "step": 48860 }, { "epoch": 0.32151315789473683, "grad_norm": 2.5625, "grad_norm_var": 0.45816141764322915, "learning_rate": 0.0001, "loss": 2.9566, "loss/crossentropy": 2.6454216599464417, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.21323546022176743, "loss/reg": 0.0, "step": 48870 }, { "epoch": 0.3215789473684211, "grad_norm": 2.75, "grad_norm_var": 0.05054423014322917, "learning_rate": 0.0001, "loss": 2.8502, "loss/crossentropy": 2.5088499069213865, "loss/hidden": 2.4859375, "loss/incoh": 0.0, "loss/logits": 0.1907164439558983, "loss/reg": 0.0, "step": 48880 }, { "epoch": 0.32164473684210526, "grad_norm": 2.53125, "grad_norm_var": 0.03463134765625, "learning_rate": 0.0001, "loss": 2.8564, "loss/crossentropy": 2.334225118160248, "loss/hidden": 2.540625, "loss/incoh": 0.0, "loss/logits": 0.19292876943945886, "loss/reg": 0.0, "step": 48890 }, { "epoch": 0.32171052631578945, "grad_norm": 2.671875, "grad_norm_var": 0.10932515462239584, "learning_rate": 0.0001, "loss": 2.9649, "loss/crossentropy": 2.3368978261947633, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.21074423044919968, "loss/reg": 0.0, "step": 48900 }, { "epoch": 0.3217763157894737, "grad_norm": 2.3125, "grad_norm_var": 0.04153645833333333, "learning_rate": 0.0001, "loss": 2.8729, "loss/crossentropy": 2.3171313762664796, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.20221460461616517, "loss/reg": 0.0, "step": 48910 }, { "epoch": 0.3218421052631579, "grad_norm": 2.875, "grad_norm_var": 0.04999898274739583, "learning_rate": 0.0001, "loss": 2.8629, "loss/crossentropy": 2.199036979675293, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.2102992132306099, "loss/reg": 0.0, "step": 48920 }, { "epoch": 0.3219078947368421, "grad_norm": 2.6875, "grad_norm_var": 0.05152587890625, "learning_rate": 0.0001, "loss": 2.9467, "loss/crossentropy": 2.3751417875289915, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.21881704181432723, "loss/reg": 0.0, "step": 48930 }, { "epoch": 0.3219736842105263, "grad_norm": 2.703125, "grad_norm_var": 6.70093994140625, "learning_rate": 0.0001, "loss": 2.886, "loss/crossentropy": 2.19942022562027, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.20192338526248932, "loss/reg": 0.0, "step": 48940 }, { "epoch": 0.32203947368421054, "grad_norm": 2.28125, "grad_norm_var": 6.779032389322917, "learning_rate": 0.0001, "loss": 2.8316, "loss/crossentropy": 2.5518424272537232, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.22403278350830078, "loss/reg": 0.0, "step": 48950 }, { "epoch": 0.32210526315789473, "grad_norm": 2.59375, "grad_norm_var": 0.020555623372395835, "learning_rate": 0.0001, "loss": 2.8693, "loss/crossentropy": 2.4508535981178285, "loss/hidden": 2.6171875, "loss/incoh": 0.0, "loss/logits": 0.2125440463423729, "loss/reg": 0.0, "step": 48960 }, { "epoch": 0.32217105263157897, "grad_norm": 2.734375, "grad_norm_var": 1.47286179141632e+17, "learning_rate": 0.0001, "loss": 2.996, "loss/crossentropy": 2.4686465859413147, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.20892000198364258, "loss/reg": 0.0, "step": 48970 }, { "epoch": 0.32223684210526315, "grad_norm": 2.53125, "grad_norm_var": 0.10898030598958333, "learning_rate": 0.0001, "loss": 3.0072, "loss/crossentropy": 2.340684413909912, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.2499086856842041, "loss/reg": 0.0, "step": 48980 }, { "epoch": 0.3223026315789474, "grad_norm": 2.5625, "grad_norm_var": 0.10900065104166666, "learning_rate": 0.0001, "loss": 2.8835, "loss/crossentropy": 2.325006532669067, "loss/hidden": 2.5453125, "loss/incoh": 0.0, "loss/logits": 0.1873799592256546, "loss/reg": 0.0, "step": 48990 }, { "epoch": 0.3223684210526316, "grad_norm": 2.15625, "grad_norm_var": 0.2942860921223958, "learning_rate": 0.0001, "loss": 2.8715, "loss/crossentropy": 2.3799846291542055, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.2293248325586319, "loss/reg": 0.0, "step": 49000 }, { "epoch": 0.32243421052631577, "grad_norm": 2.46875, "grad_norm_var": 0.8867472330729167, "learning_rate": 0.0001, "loss": 2.9172, "loss/crossentropy": 2.2918012857437136, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.22967704981565476, "loss/reg": 0.0, "step": 49010 }, { "epoch": 0.3225, "grad_norm": 2.390625, "grad_norm_var": 0.2672515869140625, "learning_rate": 0.0001, "loss": 2.8362, "loss/crossentropy": 2.341455674171448, "loss/hidden": 2.5328125, "loss/incoh": 0.0, "loss/logits": 0.195063978433609, "loss/reg": 0.0, "step": 49020 }, { "epoch": 0.3225657894736842, "grad_norm": 2.234375, "grad_norm_var": 0.021491495768229167, "learning_rate": 0.0001, "loss": 2.9, "loss/crossentropy": 2.2926043391227724, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.2086925357580185, "loss/reg": 0.0, "step": 49030 }, { "epoch": 0.32263157894736844, "grad_norm": 2.515625, "grad_norm_var": 0.057450358072916666, "learning_rate": 0.0001, "loss": 2.9404, "loss/crossentropy": 2.282513803243637, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.23624783307313918, "loss/reg": 0.0, "step": 49040 }, { "epoch": 0.3226973684210526, "grad_norm": 5.15625, "grad_norm_var": 0.45535380045572915, "learning_rate": 0.0001, "loss": 2.9075, "loss/crossentropy": 2.1169674158096314, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.19339826256036757, "loss/reg": 0.0, "step": 49050 }, { "epoch": 0.32276315789473686, "grad_norm": 2.625, "grad_norm_var": 0.5364908854166667, "learning_rate": 0.0001, "loss": 2.9216, "loss/crossentropy": 2.185878598690033, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.2089112728834152, "loss/reg": 0.0, "step": 49060 }, { "epoch": 0.32282894736842105, "grad_norm": 2.296875, "grad_norm_var": 0.13308817545572918, "learning_rate": 0.0001, "loss": 2.9196, "loss/crossentropy": 2.6764113187789915, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.2758709296584129, "loss/reg": 0.0, "step": 49070 }, { "epoch": 0.3228947368421053, "grad_norm": 2.296875, "grad_norm_var": 0.030427042643229166, "learning_rate": 0.0001, "loss": 2.8585, "loss/crossentropy": 2.5345089435577393, "loss/hidden": 2.553125, "loss/incoh": 0.0, "loss/logits": 0.21514544785022735, "loss/reg": 0.0, "step": 49080 }, { "epoch": 0.3229605263157895, "grad_norm": 2.34375, "grad_norm_var": 0.059403483072916666, "learning_rate": 0.0001, "loss": 2.8958, "loss/crossentropy": 2.4408546566963194, "loss/hidden": 2.625, "loss/incoh": 0.0, "loss/logits": 0.21567849069833755, "loss/reg": 0.0, "step": 49090 }, { "epoch": 0.32302631578947366, "grad_norm": 2.234375, "grad_norm_var": 0.03474019368489583, "learning_rate": 0.0001, "loss": 2.9017, "loss/crossentropy": 2.2046252250671388, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.19873327910900115, "loss/reg": 0.0, "step": 49100 }, { "epoch": 0.3230921052631579, "grad_norm": 2.21875, "grad_norm_var": 0.15855712890625, "learning_rate": 0.0001, "loss": 2.9202, "loss/crossentropy": 2.4964420795440674, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.20678060948848725, "loss/reg": 0.0, "step": 49110 }, { "epoch": 0.3231578947368421, "grad_norm": 2.515625, "grad_norm_var": 0.1696685791015625, "learning_rate": 0.0001, "loss": 2.9217, "loss/crossentropy": 2.241098368167877, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.2213309407234192, "loss/reg": 0.0, "step": 49120 }, { "epoch": 0.32322368421052633, "grad_norm": 2.59375, "grad_norm_var": 0.02330322265625, "learning_rate": 0.0001, "loss": 2.8954, "loss/crossentropy": 2.1962792754173277, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.2113658145070076, "loss/reg": 0.0, "step": 49130 }, { "epoch": 0.3232894736842105, "grad_norm": 2.40625, "grad_norm_var": 0.0814117431640625, "learning_rate": 0.0001, "loss": 2.943, "loss/crossentropy": 2.1587972164154055, "loss/hidden": 2.5890625, "loss/incoh": 0.0, "loss/logits": 0.19411415085196496, "loss/reg": 0.0, "step": 49140 }, { "epoch": 0.32335526315789476, "grad_norm": 2.21875, "grad_norm_var": 0.10986328125, "learning_rate": 0.0001, "loss": 2.978, "loss/crossentropy": 2.198576009273529, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.22099283784627916, "loss/reg": 0.0, "step": 49150 }, { "epoch": 0.32342105263157894, "grad_norm": 2.34375, "grad_norm_var": 0.06280008951822917, "learning_rate": 0.0001, "loss": 2.8873, "loss/crossentropy": 2.2491976261138915, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.2042415864765644, "loss/reg": 0.0, "step": 49160 }, { "epoch": 0.3234868421052632, "grad_norm": 2.359375, "grad_norm_var": 0.04739176432291667, "learning_rate": 0.0001, "loss": 2.9345, "loss/crossentropy": 2.1360318034887316, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.18910920433700085, "loss/reg": 0.0, "step": 49170 }, { "epoch": 0.32355263157894737, "grad_norm": 3.03125, "grad_norm_var": 0.055826822916666664, "learning_rate": 0.0001, "loss": 2.9356, "loss/crossentropy": 2.267302417755127, "loss/hidden": 2.6, "loss/incoh": 0.0, "loss/logits": 0.19285966604948043, "loss/reg": 0.0, "step": 49180 }, { "epoch": 0.32361842105263156, "grad_norm": 2.328125, "grad_norm_var": 0.08940327962239583, "learning_rate": 0.0001, "loss": 2.8517, "loss/crossentropy": 2.206797957420349, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.19456288665533067, "loss/reg": 0.0, "step": 49190 }, { "epoch": 0.3236842105263158, "grad_norm": 2.484375, "grad_norm_var": 0.03463134765625, "learning_rate": 0.0001, "loss": 2.8735, "loss/crossentropy": 2.099406361579895, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.1973948135972023, "loss/reg": 0.0, "step": 49200 }, { "epoch": 0.32375, "grad_norm": 2.421875, "grad_norm_var": 0.027437337239583335, "learning_rate": 0.0001, "loss": 2.8742, "loss/crossentropy": 2.4252512574195864, "loss/hidden": 2.9109375, "loss/incoh": 0.0, "loss/logits": 0.30805373936891556, "loss/reg": 0.0, "step": 49210 }, { "epoch": 0.3238157894736842, "grad_norm": 2.203125, "grad_norm_var": 0.03937886555989583, "learning_rate": 0.0001, "loss": 2.8958, "loss/crossentropy": 2.035316228866577, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.19419800043106078, "loss/reg": 0.0, "step": 49220 }, { "epoch": 0.3238815789473684, "grad_norm": 2.4375, "grad_norm_var": 0.0542877197265625, "learning_rate": 0.0001, "loss": 2.8579, "loss/crossentropy": 2.1851854801177977, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.21032845079898835, "loss/reg": 0.0, "step": 49230 }, { "epoch": 0.32394736842105265, "grad_norm": 2.21875, "grad_norm_var": 0.03379618326822917, "learning_rate": 0.0001, "loss": 2.8357, "loss/crossentropy": 2.189658224582672, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.21095257103443146, "loss/reg": 0.0, "step": 49240 }, { "epoch": 0.32401315789473684, "grad_norm": 2.671875, "grad_norm_var": 0.03795572916666667, "learning_rate": 0.0001, "loss": 2.8679, "loss/crossentropy": 2.1443102836608885, "loss/hidden": 2.578125, "loss/incoh": 0.0, "loss/logits": 0.1757427379488945, "loss/reg": 0.0, "step": 49250 }, { "epoch": 0.3240789473684211, "grad_norm": 2.421875, "grad_norm_var": 0.04201558430989583, "learning_rate": 0.0001, "loss": 2.9463, "loss/crossentropy": 2.365839719772339, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.21451021134853362, "loss/reg": 0.0, "step": 49260 }, { "epoch": 0.32414473684210526, "grad_norm": 2.34375, "grad_norm_var": 4.391713315775229e+17, "learning_rate": 0.0001, "loss": 3.0405, "loss/crossentropy": 2.2562620639801025, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.20587447360157968, "loss/reg": 0.0, "step": 49270 }, { "epoch": 0.32421052631578945, "grad_norm": 2.28125, "grad_norm_var": 0.18796971638997395, "learning_rate": 0.0001, "loss": 2.8895, "loss/crossentropy": 2.271971035003662, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.20573140382766725, "loss/reg": 0.0, "step": 49280 }, { "epoch": 0.3242763157894737, "grad_norm": 2.421875, "grad_norm_var": 0.09122289021809896, "learning_rate": 0.0001, "loss": 2.8573, "loss/crossentropy": 2.498966121673584, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.19266293495893477, "loss/reg": 0.0, "step": 49290 }, { "epoch": 0.3243421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.013427734375, "learning_rate": 0.0001, "loss": 2.846, "loss/crossentropy": 2.2498430967330934, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.22948729246854782, "loss/reg": 0.0, "step": 49300 }, { "epoch": 0.3244078947368421, "grad_norm": 2.4375, "grad_norm_var": 0.2159088134765625, "learning_rate": 0.0001, "loss": 3.0001, "loss/crossentropy": 2.1157378435134886, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.2695459216833115, "loss/reg": 0.0, "step": 49310 }, { "epoch": 0.3244736842105263, "grad_norm": 2.234375, "grad_norm_var": 0.20668843587239583, "learning_rate": 0.0001, "loss": 2.8942, "loss/crossentropy": 2.3125268816947937, "loss/hidden": 2.5578125, "loss/incoh": 0.0, "loss/logits": 0.19173220843076705, "loss/reg": 0.0, "step": 49320 }, { "epoch": 0.32453947368421054, "grad_norm": 2.34375, "grad_norm_var": 0.018903605143229165, "learning_rate": 0.0001, "loss": 2.9071, "loss/crossentropy": 2.390227258205414, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.20166983231902122, "loss/reg": 0.0, "step": 49330 }, { "epoch": 0.32460526315789473, "grad_norm": 2.53125, "grad_norm_var": 0.03866780598958333, "learning_rate": 0.0001, "loss": 2.865, "loss/crossentropy": 2.4144001603126526, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.19747094213962554, "loss/reg": 0.0, "step": 49340 }, { "epoch": 0.32467105263157897, "grad_norm": 2.65625, "grad_norm_var": 0.06953099568684896, "learning_rate": 0.0001, "loss": 2.8429, "loss/crossentropy": 2.3617787599563598, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.24705647975206374, "loss/reg": 0.0, "step": 49350 }, { "epoch": 0.32473684210526316, "grad_norm": 4.1875, "grad_norm_var": 0.2994544982910156, "learning_rate": 0.0001, "loss": 2.9343, "loss/crossentropy": 2.371607577800751, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.209511861205101, "loss/reg": 0.0, "step": 49360 }, { "epoch": 0.32480263157894734, "grad_norm": 2.359375, "grad_norm_var": 0.2603841145833333, "learning_rate": 0.0001, "loss": 2.928, "loss/crossentropy": 2.3683363676071165, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.22561810910701752, "loss/reg": 0.0, "step": 49370 }, { "epoch": 0.3248684210526316, "grad_norm": 2.03125, "grad_norm_var": 0.0712799072265625, "learning_rate": 0.0001, "loss": 2.8736, "loss/crossentropy": 2.1849867343902587, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.22061919867992402, "loss/reg": 0.0, "step": 49380 }, { "epoch": 0.32493421052631577, "grad_norm": 2.546875, "grad_norm_var": 0.1953277587890625, "learning_rate": 0.0001, "loss": 2.8811, "loss/crossentropy": 2.093055486679077, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.1939164362847805, "loss/reg": 0.0, "step": 49390 }, { "epoch": 0.325, "grad_norm": 2.484375, "grad_norm_var": 6.392072550455729, "learning_rate": 0.0001, "loss": 2.9249, "loss/crossentropy": 2.3600495576858522, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.21454770863056183, "loss/reg": 0.0, "step": 49400 }, { "epoch": 0.3250657894736842, "grad_norm": 2.59375, "grad_norm_var": 6.488114420572916, "learning_rate": 0.0001, "loss": 2.8981, "loss/crossentropy": 2.3398555159568786, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.24129000902175904, "loss/reg": 0.0, "step": 49410 }, { "epoch": 0.32513157894736844, "grad_norm": 2.515625, "grad_norm_var": 0.035546875, "learning_rate": 0.0001, "loss": 2.8698, "loss/crossentropy": 2.182040643692017, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.23874544501304626, "loss/reg": 0.0, "step": 49420 }, { "epoch": 0.3251973684210526, "grad_norm": 2.6875, "grad_norm_var": 0.03167317708333333, "learning_rate": 0.0001, "loss": 2.9013, "loss/crossentropy": 2.150018906593323, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.24385224282741547, "loss/reg": 0.0, "step": 49430 }, { "epoch": 0.32526315789473687, "grad_norm": 2.390625, "grad_norm_var": 0.04206441243489583, "learning_rate": 0.0001, "loss": 2.9573, "loss/crossentropy": 2.3754819869995116, "loss/hidden": 2.9359375, "loss/incoh": 0.0, "loss/logits": 0.2825229406356812, "loss/reg": 0.0, "step": 49440 }, { "epoch": 0.32532894736842105, "grad_norm": 2.4375, "grad_norm_var": 0.014289347330729167, "learning_rate": 0.0001, "loss": 2.9617, "loss/crossentropy": 2.2600276947021483, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.21928555965423585, "loss/reg": 0.0, "step": 49450 }, { "epoch": 0.32539473684210524, "grad_norm": 2.40625, "grad_norm_var": 0.038590494791666666, "learning_rate": 0.0001, "loss": 2.8822, "loss/crossentropy": 2.102971911430359, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.20292045176029205, "loss/reg": 0.0, "step": 49460 }, { "epoch": 0.3254605263157895, "grad_norm": 2.703125, "grad_norm_var": 0.04195556640625, "learning_rate": 0.0001, "loss": 2.9062, "loss/crossentropy": 2.239990198612213, "loss/hidden": 2.56875, "loss/incoh": 0.0, "loss/logits": 0.17609065100550653, "loss/reg": 0.0, "step": 49470 }, { "epoch": 0.32552631578947366, "grad_norm": 3.125, "grad_norm_var": 0.05256245930989583, "learning_rate": 0.0001, "loss": 2.9384, "loss/crossentropy": 2.243304353952408, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.20012817680835723, "loss/reg": 0.0, "step": 49480 }, { "epoch": 0.3255921052631579, "grad_norm": 2.71875, "grad_norm_var": 0.06834208170572917, "learning_rate": 0.0001, "loss": 2.9191, "loss/crossentropy": 2.4893649339675905, "loss/hidden": 2.55625, "loss/incoh": 0.0, "loss/logits": 0.21039317846298217, "loss/reg": 0.0, "step": 49490 }, { "epoch": 0.3256578947368421, "grad_norm": 2.046875, "grad_norm_var": 0.07620035807291667, "learning_rate": 0.0001, "loss": 2.962, "loss/crossentropy": 2.232936370372772, "loss/hidden": 2.9953125, "loss/incoh": 0.0, "loss/logits": 0.2765511214733124, "loss/reg": 0.0, "step": 49500 }, { "epoch": 0.32572368421052633, "grad_norm": 2.953125, "grad_norm_var": 0.10676676432291667, "learning_rate": 0.0001, "loss": 2.9228, "loss/crossentropy": 2.277019774913788, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.21332577019929885, "loss/reg": 0.0, "step": 49510 }, { "epoch": 0.3257894736842105, "grad_norm": 2.34375, "grad_norm_var": 0.12365620930989583, "learning_rate": 0.0001, "loss": 2.9182, "loss/crossentropy": 2.328976881504059, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.20161002427339553, "loss/reg": 0.0, "step": 49520 }, { "epoch": 0.32585526315789476, "grad_norm": 2.234375, "grad_norm_var": 0.08338114420572916, "learning_rate": 0.0001, "loss": 2.8782, "loss/crossentropy": 2.2401389479637146, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.21316224634647368, "loss/reg": 0.0, "step": 49530 }, { "epoch": 0.32592105263157894, "grad_norm": 3.171875, "grad_norm_var": 0.08940327962239583, "learning_rate": 0.0001, "loss": 2.9489, "loss/crossentropy": 2.0091909885406496, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2186304546892643, "loss/reg": 0.0, "step": 49540 }, { "epoch": 0.32598684210526313, "grad_norm": 2.359375, "grad_norm_var": 0.10214436848958333, "learning_rate": 0.0001, "loss": 2.8952, "loss/crossentropy": 2.144161415100098, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.19552899152040482, "loss/reg": 0.0, "step": 49550 }, { "epoch": 0.32605263157894737, "grad_norm": 2.15625, "grad_norm_var": 0.05217183430989583, "learning_rate": 0.0001, "loss": 2.8596, "loss/crossentropy": 2.322604811191559, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.22850967794656754, "loss/reg": 0.0, "step": 49560 }, { "epoch": 0.32611842105263156, "grad_norm": 2.625, "grad_norm_var": 0.06790364583333333, "learning_rate": 0.0001, "loss": 2.9136, "loss/crossentropy": 2.142782485485077, "loss/hidden": 2.5625, "loss/incoh": 0.0, "loss/logits": 0.19583540186285972, "loss/reg": 0.0, "step": 49570 }, { "epoch": 0.3261842105263158, "grad_norm": 2.640625, "grad_norm_var": 0.0571929931640625, "learning_rate": 0.0001, "loss": 2.8977, "loss/crossentropy": 2.0746835589408876, "loss/hidden": 2.5671875, "loss/incoh": 0.0, "loss/logits": 0.19169059172272682, "loss/reg": 0.0, "step": 49580 }, { "epoch": 0.32625, "grad_norm": 3.46875, "grad_norm_var": 0.18159077962239584, "learning_rate": 0.0001, "loss": 2.9084, "loss/crossentropy": 2.3189886927604677, "loss/hidden": 2.5703125, "loss/incoh": 0.0, "loss/logits": 0.189093741774559, "loss/reg": 0.0, "step": 49590 }, { "epoch": 0.3263157894736842, "grad_norm": 3.015625, "grad_norm_var": 0.22376200358072917, "learning_rate": 0.0001, "loss": 2.865, "loss/crossentropy": 2.5394527435302736, "loss/hidden": 2.5859375, "loss/incoh": 0.0, "loss/logits": 0.20924156308174133, "loss/reg": 0.0, "step": 49600 }, { "epoch": 0.3263815789473684, "grad_norm": 2.21875, "grad_norm_var": 0.1213043212890625, "learning_rate": 0.0001, "loss": 2.9465, "loss/crossentropy": 2.405171346664429, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2204499736428261, "loss/reg": 0.0, "step": 49610 }, { "epoch": 0.32644736842105265, "grad_norm": 2.171875, "grad_norm_var": 0.2647369384765625, "learning_rate": 0.0001, "loss": 2.9831, "loss/crossentropy": 2.254263687133789, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.2259682908654213, "loss/reg": 0.0, "step": 49620 }, { "epoch": 0.32651315789473684, "grad_norm": 2.203125, "grad_norm_var": 0.26790262858072916, "learning_rate": 0.0001, "loss": 2.8591, "loss/crossentropy": 2.198948323726654, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.18696713894605638, "loss/reg": 0.0, "step": 49630 }, { "epoch": 0.3265789473684211, "grad_norm": 2.34375, "grad_norm_var": 0.21193033854166668, "learning_rate": 0.0001, "loss": 2.8951, "loss/crossentropy": 2.3545778155326844, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.2147059202194214, "loss/reg": 0.0, "step": 49640 }, { "epoch": 0.32664473684210527, "grad_norm": 2.375, "grad_norm_var": 0.5582316080729167, "learning_rate": 0.0001, "loss": 2.8989, "loss/crossentropy": 1.928742492198944, "loss/hidden": 2.9765625, "loss/incoh": 0.0, "loss/logits": 0.25337462723255155, "loss/reg": 0.0, "step": 49650 }, { "epoch": 0.32671052631578945, "grad_norm": 2.609375, "grad_norm_var": 0.41634012858072916, "learning_rate": 0.0001, "loss": 2.8646, "loss/crossentropy": 2.3361602783203126, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.21233459711074829, "loss/reg": 0.0, "step": 49660 }, { "epoch": 0.3267763157894737, "grad_norm": 2.0625, "grad_norm_var": 0.04381103515625, "learning_rate": 0.0001, "loss": 2.9012, "loss/crossentropy": 2.187861943244934, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.20430142283439637, "loss/reg": 0.0, "step": 49670 }, { "epoch": 0.3268421052631579, "grad_norm": 2.265625, "grad_norm_var": 0.0581207275390625, "learning_rate": 0.0001, "loss": 2.8516, "loss/crossentropy": 2.407389521598816, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.2328763484954834, "loss/reg": 0.0, "step": 49680 }, { "epoch": 0.3269078947368421, "grad_norm": 2.296875, "grad_norm_var": 0.055964152018229164, "learning_rate": 0.0001, "loss": 2.8807, "loss/crossentropy": 2.4817373037338255, "loss/hidden": 2.5609375, "loss/incoh": 0.0, "loss/logits": 0.20578065514564514, "loss/reg": 0.0, "step": 49690 }, { "epoch": 0.3269736842105263, "grad_norm": 2.265625, "grad_norm_var": 3.206175899516928e+17, "learning_rate": 0.0001, "loss": 3.0359, "loss/crossentropy": 2.389392149448395, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.25747421085834504, "loss/reg": 0.0, "step": 49700 }, { "epoch": 0.32703947368421055, "grad_norm": 2.5625, "grad_norm_var": 3.206175899720417e+17, "learning_rate": 0.0001, "loss": 2.8342, "loss/crossentropy": 2.293433403968811, "loss/hidden": 2.7984375, "loss/incoh": 0.0, "loss/logits": 0.28496605157852173, "loss/reg": 0.0, "step": 49710 }, { "epoch": 0.32710526315789473, "grad_norm": 2.25, "grad_norm_var": 0.0517578125, "learning_rate": 0.0001, "loss": 2.9136, "loss/crossentropy": 2.3245654940605163, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.19901563674211503, "loss/reg": 0.0, "step": 49720 }, { "epoch": 0.327171052631579, "grad_norm": 2.40625, "grad_norm_var": 0.05077718098958333, "learning_rate": 0.0001, "loss": 2.853, "loss/crossentropy": 2.036464524269104, "loss/hidden": 2.571875, "loss/incoh": 0.0, "loss/logits": 0.19052523747086525, "loss/reg": 0.0, "step": 49730 }, { "epoch": 0.32723684210526316, "grad_norm": 2.984375, "grad_norm_var": 0.10963134765625, "learning_rate": 0.0001, "loss": 2.8814, "loss/crossentropy": 2.194696569442749, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.22804517149925232, "loss/reg": 0.0, "step": 49740 }, { "epoch": 0.32730263157894735, "grad_norm": 2.234375, "grad_norm_var": 3.111881782528311e+17, "learning_rate": 0.0001, "loss": 3.0078, "loss/crossentropy": 2.4293861746788026, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.21010107845067977, "loss/reg": 0.0, "step": 49750 }, { "epoch": 0.3273684210526316, "grad_norm": 2.453125, "grad_norm_var": 3.111881782632907e+17, "learning_rate": 0.0001, "loss": 2.8284, "loss/crossentropy": 2.396776723861694, "loss/hidden": 2.55625, "loss/incoh": 0.0, "loss/logits": 0.20671335160732268, "loss/reg": 0.0, "step": 49760 }, { "epoch": 0.32743421052631577, "grad_norm": 2.5, "grad_norm_var": 0.04614969889322917, "learning_rate": 0.0001, "loss": 2.8596, "loss/crossentropy": 2.0073238372802735, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.20146738439798356, "loss/reg": 0.0, "step": 49770 }, { "epoch": 0.3275, "grad_norm": 2.4375, "grad_norm_var": 0.04158426920572917, "learning_rate": 0.0001, "loss": 2.9302, "loss/crossentropy": 2.189040958881378, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.2335902512073517, "loss/reg": 0.0, "step": 49780 }, { "epoch": 0.3275657894736842, "grad_norm": 2.375, "grad_norm_var": 0.0267730712890625, "learning_rate": 0.0001, "loss": 2.8856, "loss/crossentropy": 2.4772990345954895, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.21396184861660003, "loss/reg": 0.0, "step": 49790 }, { "epoch": 0.32763157894736844, "grad_norm": 2.484375, "grad_norm_var": 0.042756144205729166, "learning_rate": 0.0001, "loss": 2.9214, "loss/crossentropy": 2.204012727737427, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.25400942414999006, "loss/reg": 0.0, "step": 49800 }, { "epoch": 0.3276973684210526, "grad_norm": 2.90625, "grad_norm_var": 0.046284993489583336, "learning_rate": 0.0001, "loss": 2.9154, "loss/crossentropy": 2.289921748638153, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.25173943638801577, "loss/reg": 0.0, "step": 49810 }, { "epoch": 0.32776315789473687, "grad_norm": 2.109375, "grad_norm_var": 0.4901031494140625, "learning_rate": 0.0001, "loss": 2.9097, "loss/crossentropy": 1.9598671555519105, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.19857908487319947, "loss/reg": 0.0, "step": 49820 }, { "epoch": 0.32782894736842105, "grad_norm": 2.40625, "grad_norm_var": 0.03339436848958333, "learning_rate": 0.0001, "loss": 2.8966, "loss/crossentropy": 2.357456934452057, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.1982443705201149, "loss/reg": 0.0, "step": 49830 }, { "epoch": 0.32789473684210524, "grad_norm": 2.265625, "grad_norm_var": 0.0430816650390625, "learning_rate": 0.0001, "loss": 2.9115, "loss/crossentropy": 2.3156696438789366, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.24953905418515204, "loss/reg": 0.0, "step": 49840 }, { "epoch": 0.3279605263157895, "grad_norm": 3.125, "grad_norm_var": 0.06883138020833333, "learning_rate": 0.0001, "loss": 2.9365, "loss/crossentropy": 2.427468740940094, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.20974498763680458, "loss/reg": 0.0, "step": 49850 }, { "epoch": 0.32802631578947367, "grad_norm": 2.1875, "grad_norm_var": 0.082568359375, "learning_rate": 0.0001, "loss": 2.9242, "loss/crossentropy": 2.456352782249451, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.24482279270887375, "loss/reg": 0.0, "step": 49860 }, { "epoch": 0.3280921052631579, "grad_norm": 2.421875, "grad_norm_var": 10.816502888997396, "learning_rate": 0.0001, "loss": 2.8694, "loss/crossentropy": 2.254105007648468, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.21307358592748643, "loss/reg": 0.0, "step": 49870 }, { "epoch": 0.3281578947368421, "grad_norm": 2.296875, "grad_norm_var": 17.999120076497395, "learning_rate": 0.0001, "loss": 2.9855, "loss/crossentropy": 2.236227023601532, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.24152030497789384, "loss/reg": 0.0, "step": 49880 }, { "epoch": 0.32822368421052633, "grad_norm": 2.359375, "grad_norm_var": 8.80474853515625, "learning_rate": 0.0001, "loss": 2.9092, "loss/crossentropy": 2.5275360584259032, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.24033236205577851, "loss/reg": 0.0, "step": 49890 }, { "epoch": 0.3282894736842105, "grad_norm": 2.609375, "grad_norm_var": 0.16717122395833334, "learning_rate": 0.0001, "loss": 2.9453, "loss/crossentropy": 2.042096012830734, "loss/hidden": 2.6171875, "loss/incoh": 0.0, "loss/logits": 0.2067515764385462, "loss/reg": 0.0, "step": 49900 }, { "epoch": 0.32835526315789476, "grad_norm": 2.5625, "grad_norm_var": 0.15269775390625, "learning_rate": 0.0001, "loss": 2.8965, "loss/crossentropy": 2.331100916862488, "loss/hidden": 2.5015625, "loss/incoh": 0.0, "loss/logits": 0.18624642193317414, "loss/reg": 0.0, "step": 49910 }, { "epoch": 0.32842105263157895, "grad_norm": 2.515625, "grad_norm_var": 0.10349019368489583, "learning_rate": 0.0001, "loss": 2.9184, "loss/crossentropy": 1.8542328655719758, "loss/hidden": 2.80859375, "loss/incoh": 0.0, "loss/logits": 0.25203222781419754, "loss/reg": 0.0, "step": 49920 }, { "epoch": 0.32848684210526313, "grad_norm": 2.65625, "grad_norm_var": 0.06513671875, "learning_rate": 0.0001, "loss": 2.8343, "loss/crossentropy": 2.3653822779655456, "loss/hidden": 2.5890625, "loss/incoh": 0.0, "loss/logits": 0.17638102620840074, "loss/reg": 0.0, "step": 49930 }, { "epoch": 0.3285526315789474, "grad_norm": 2.390625, "grad_norm_var": 0.06457926432291666, "learning_rate": 0.0001, "loss": 2.9031, "loss/crossentropy": 2.4460161209106444, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.2447662591934204, "loss/reg": 0.0, "step": 49940 }, { "epoch": 0.32861842105263156, "grad_norm": 2.375, "grad_norm_var": 0.06418863932291667, "learning_rate": 0.0001, "loss": 2.8743, "loss/crossentropy": 2.199061155319214, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.21010118275880812, "loss/reg": 0.0, "step": 49950 }, { "epoch": 0.3286842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.14882405598958334, "learning_rate": 0.0001, "loss": 2.8892, "loss/crossentropy": 2.0261480867862702, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2423352465033531, "loss/reg": 0.0, "step": 49960 }, { "epoch": 0.32875, "grad_norm": 2.125, "grad_norm_var": 0.05260009765625, "learning_rate": 0.0001, "loss": 2.8926, "loss/crossentropy": 2.2479127287864684, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.22456666678190232, "loss/reg": 0.0, "step": 49970 }, { "epoch": 0.32881578947368423, "grad_norm": 2.703125, "grad_norm_var": 0.22978413899739583, "learning_rate": 0.0001, "loss": 2.9402, "loss/crossentropy": 2.1669164448976517, "loss/hidden": 2.5203125, "loss/incoh": 0.0, "loss/logits": 0.19384834542870522, "loss/reg": 0.0, "step": 49980 }, { "epoch": 0.3288815789473684, "grad_norm": 2.390625, "grad_norm_var": 0.19810791015625, "learning_rate": 0.0001, "loss": 2.9459, "loss/crossentropy": 2.0145064294338226, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.18306682631373405, "loss/reg": 0.0, "step": 49990 }, { "epoch": 0.32894736842105265, "grad_norm": 2.265625, "grad_norm_var": 0.12587483723958334, "learning_rate": 0.0001, "loss": 2.982, "loss/crossentropy": 2.186770462989807, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.23250895887613296, "loss/reg": 0.0, "step": 50000 }, { "epoch": 0.32901315789473684, "grad_norm": 2.3125, "grad_norm_var": 0.15289306640625, "learning_rate": 0.0001, "loss": 2.8709, "loss/crossentropy": 2.608423948287964, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.20663814693689347, "loss/reg": 0.0, "step": 50010 }, { "epoch": 0.329078947368421, "grad_norm": 2.796875, "grad_norm_var": 0.0913970947265625, "learning_rate": 0.0001, "loss": 2.9419, "loss/crossentropy": 2.1840139865875243, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.22605900466442108, "loss/reg": 0.0, "step": 50020 }, { "epoch": 0.32914473684210527, "grad_norm": 2.421875, "grad_norm_var": 0.08843994140625, "learning_rate": 0.0001, "loss": 2.9153, "loss/crossentropy": 2.133372277021408, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.217558391392231, "loss/reg": 0.0, "step": 50030 }, { "epoch": 0.32921052631578945, "grad_norm": 3.09375, "grad_norm_var": 3.905641215863336e+17, "learning_rate": 0.0001, "loss": 3.0267, "loss/crossentropy": 2.3309960842132567, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.22189951092004775, "loss/reg": 0.0, "step": 50040 }, { "epoch": 0.3292763157894737, "grad_norm": 2.09375, "grad_norm_var": 3.9056412153653274e+17, "learning_rate": 0.0001, "loss": 2.8793, "loss/crossentropy": 2.136474812030792, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.22353410124778747, "loss/reg": 0.0, "step": 50050 }, { "epoch": 0.3293421052631579, "grad_norm": 2.65625, "grad_norm_var": 0.2767862955729167, "learning_rate": 0.0001, "loss": 2.9543, "loss/crossentropy": 2.1973337888717652, "loss/hidden": 2.88125, "loss/incoh": 0.0, "loss/logits": 0.2246320903301239, "loss/reg": 0.0, "step": 50060 }, { "epoch": 0.3294078947368421, "grad_norm": 2.1875, "grad_norm_var": 0.04363606770833333, "learning_rate": 0.0001, "loss": 2.8924, "loss/crossentropy": 2.235460567474365, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.19880189895629882, "loss/reg": 0.0, "step": 50070 }, { "epoch": 0.3294736842105263, "grad_norm": 2.25, "grad_norm_var": 0.3841145833333333, "learning_rate": 0.0001, "loss": 2.8434, "loss/crossentropy": 2.463154363632202, "loss/hidden": 2.5453125, "loss/incoh": 0.0, "loss/logits": 0.1982060581445694, "loss/reg": 0.0, "step": 50080 }, { "epoch": 0.32953947368421055, "grad_norm": 2.296875, "grad_norm_var": 0.3861968994140625, "learning_rate": 0.0001, "loss": 2.8655, "loss/crossentropy": 2.4530899047851564, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.207638917863369, "loss/reg": 0.0, "step": 50090 }, { "epoch": 0.32960526315789473, "grad_norm": 2.375, "grad_norm_var": 0.08765869140625, "learning_rate": 0.0001, "loss": 2.8412, "loss/crossentropy": 2.4556174516677856, "loss/hidden": 2.625, "loss/incoh": 0.0, "loss/logits": 0.22642846554517745, "loss/reg": 0.0, "step": 50100 }, { "epoch": 0.3296710526315789, "grad_norm": 2.34375, "grad_norm_var": 0.20978190104166666, "learning_rate": 0.0001, "loss": 2.9471, "loss/crossentropy": 2.359029984474182, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.20481059700250626, "loss/reg": 0.0, "step": 50110 }, { "epoch": 0.32973684210526316, "grad_norm": 2.3125, "grad_norm_var": 0.08172581990559896, "learning_rate": 0.0001, "loss": 2.9221, "loss/crossentropy": 2.13444447517395, "loss/hidden": 2.834375, "loss/incoh": 0.0, "loss/logits": 0.22104116082191466, "loss/reg": 0.0, "step": 50120 }, { "epoch": 0.32980263157894735, "grad_norm": 2.265625, "grad_norm_var": 0.05947443644205729, "learning_rate": 0.0001, "loss": 2.903, "loss/crossentropy": 2.385596251487732, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.24476318657398224, "loss/reg": 0.0, "step": 50130 }, { "epoch": 0.3298684210526316, "grad_norm": 2.828125, "grad_norm_var": 0.052000935872395834, "learning_rate": 0.0001, "loss": 2.8288, "loss/crossentropy": 2.506817579269409, "loss/hidden": 2.5421875, "loss/incoh": 0.0, "loss/logits": 0.20057181119918824, "loss/reg": 0.0, "step": 50140 }, { "epoch": 0.3299342105263158, "grad_norm": 2.609375, "grad_norm_var": 0.0572418212890625, "learning_rate": 0.0001, "loss": 2.8866, "loss/crossentropy": 2.536951684951782, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.23353694677352904, "loss/reg": 0.0, "step": 50150 }, { "epoch": 0.33, "grad_norm": 2.03125, "grad_norm_var": 0.1406402587890625, "learning_rate": 0.0001, "loss": 2.8853, "loss/crossentropy": 2.3347967505455016, "loss/hidden": 2.50625, "loss/incoh": 0.0, "loss/logits": 0.1855148121714592, "loss/reg": 0.0, "step": 50160 }, { "epoch": 0.3300657894736842, "grad_norm": 2.53125, "grad_norm_var": 0.19454930623372396, "learning_rate": 0.0001, "loss": 2.8825, "loss/crossentropy": 2.253298079967499, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.18906203284859657, "loss/reg": 0.0, "step": 50170 }, { "epoch": 0.33013157894736844, "grad_norm": 2.453125, "grad_norm_var": 0.1872576395670573, "learning_rate": 0.0001, "loss": 2.9053, "loss/crossentropy": 2.19364293217659, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.22882090657949447, "loss/reg": 0.0, "step": 50180 }, { "epoch": 0.33019736842105263, "grad_norm": 2.375, "grad_norm_var": 0.036351521809895836, "learning_rate": 0.0001, "loss": 2.9381, "loss/crossentropy": 2.4069783806800844, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.18913611769676208, "loss/reg": 0.0, "step": 50190 }, { "epoch": 0.33026315789473687, "grad_norm": 3.125, "grad_norm_var": 0.08808186848958334, "learning_rate": 0.0001, "loss": 2.9265, "loss/crossentropy": 2.4304837822914123, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.20381492525339126, "loss/reg": 0.0, "step": 50200 }, { "epoch": 0.33032894736842106, "grad_norm": 2.71875, "grad_norm_var": 0.06588541666666667, "learning_rate": 0.0001, "loss": 2.87, "loss/crossentropy": 2.39021555185318, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.20780349969863893, "loss/reg": 0.0, "step": 50210 }, { "epoch": 0.33039473684210524, "grad_norm": 2.484375, "grad_norm_var": 0.07418212890625, "learning_rate": 0.0001, "loss": 2.9514, "loss/crossentropy": 2.3013276934623716, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.22350992485880852, "loss/reg": 0.0, "step": 50220 }, { "epoch": 0.3304605263157895, "grad_norm": 2.203125, "grad_norm_var": 0.032698567708333334, "learning_rate": 0.0001, "loss": 2.9534, "loss/crossentropy": 2.1017030358314512, "loss/hidden": 2.8375, "loss/incoh": 0.0, "loss/logits": 0.2739591747522354, "loss/reg": 0.0, "step": 50230 }, { "epoch": 0.33052631578947367, "grad_norm": 2.0625, "grad_norm_var": 0.07257486979166666, "learning_rate": 0.0001, "loss": 2.9487, "loss/crossentropy": 2.2890019059181212, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.2194996580481529, "loss/reg": 0.0, "step": 50240 }, { "epoch": 0.3305921052631579, "grad_norm": 2.34375, "grad_norm_var": 0.15289306640625, "learning_rate": 0.0001, "loss": 2.8885, "loss/crossentropy": 2.2827965140342714, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.21171458661556244, "loss/reg": 0.0, "step": 50250 }, { "epoch": 0.3306578947368421, "grad_norm": 2.515625, "grad_norm_var": 0.24058837890625, "learning_rate": 0.0001, "loss": 2.8613, "loss/crossentropy": 2.1263321161270143, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.1856191024184227, "loss/reg": 0.0, "step": 50260 }, { "epoch": 0.33072368421052634, "grad_norm": 2.421875, "grad_norm_var": 0.151953125, "learning_rate": 0.0001, "loss": 2.8866, "loss/crossentropy": 1.929057303071022, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.18848896846175195, "loss/reg": 0.0, "step": 50270 }, { "epoch": 0.3307894736842105, "grad_norm": 2.765625, "grad_norm_var": 0.022847493489583332, "learning_rate": 0.0001, "loss": 2.859, "loss/crossentropy": 2.487294101715088, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.2100590080022812, "loss/reg": 0.0, "step": 50280 }, { "epoch": 0.33085526315789476, "grad_norm": 2.453125, "grad_norm_var": 0.06490478515625, "learning_rate": 0.0001, "loss": 2.8894, "loss/crossentropy": 1.8435019850730896, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.18557069301605225, "loss/reg": 0.0, "step": 50290 }, { "epoch": 0.33092105263157895, "grad_norm": 2.890625, "grad_norm_var": 0.09509175618489583, "learning_rate": 0.0001, "loss": 2.918, "loss/crossentropy": 2.373463535308838, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.27773230373859403, "loss/reg": 0.0, "step": 50300 }, { "epoch": 0.33098684210526313, "grad_norm": 2.53125, "grad_norm_var": 0.0619537353515625, "learning_rate": 0.0001, "loss": 2.8046, "loss/crossentropy": 2.378634786605835, "loss/hidden": 2.575, "loss/incoh": 0.0, "loss/logits": 0.20952736288309098, "loss/reg": 0.0, "step": 50310 }, { "epoch": 0.3310526315789474, "grad_norm": 2.078125, "grad_norm_var": 0.047684733072916666, "learning_rate": 0.0001, "loss": 2.8959, "loss/crossentropy": 2.1136455297470094, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.2044602021574974, "loss/reg": 0.0, "step": 50320 }, { "epoch": 0.33111842105263156, "grad_norm": 2.21875, "grad_norm_var": 0.0512603759765625, "learning_rate": 0.0001, "loss": 2.8343, "loss/crossentropy": 2.442075264453888, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.2013813614845276, "loss/reg": 0.0, "step": 50330 }, { "epoch": 0.3311842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.03511454264322917, "learning_rate": 0.0001, "loss": 2.8764, "loss/crossentropy": 2.0809543550014498, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.19942371621727945, "loss/reg": 0.0, "step": 50340 }, { "epoch": 0.33125, "grad_norm": 2.578125, "grad_norm_var": 0.25320638020833336, "learning_rate": 0.0001, "loss": 2.9236, "loss/crossentropy": 1.996916115283966, "loss/hidden": 2.859375, "loss/incoh": 0.0, "loss/logits": 0.22222127094864846, "loss/reg": 0.0, "step": 50350 }, { "epoch": 0.33131578947368423, "grad_norm": 2.171875, "grad_norm_var": 0.28105367024739586, "learning_rate": 0.0001, "loss": 2.8969, "loss/crossentropy": 2.313769519329071, "loss/hidden": 2.625, "loss/incoh": 0.0, "loss/logits": 0.21315369457006456, "loss/reg": 0.0, "step": 50360 }, { "epoch": 0.3313815789473684, "grad_norm": 2.265625, "grad_norm_var": 1.1902170817057292, "learning_rate": 0.0001, "loss": 2.9243, "loss/crossentropy": 2.376867949962616, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.18450321555137633, "loss/reg": 0.0, "step": 50370 }, { "epoch": 0.33144736842105266, "grad_norm": 2.328125, "grad_norm_var": 1.7361183166503906, "learning_rate": 0.0001, "loss": 2.8706, "loss/crossentropy": 2.052333354949951, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.1883837789297104, "loss/reg": 0.0, "step": 50380 }, { "epoch": 0.33151315789473684, "grad_norm": 2.28125, "grad_norm_var": 0.06515884399414062, "learning_rate": 0.0001, "loss": 2.8477, "loss/crossentropy": 2.0888612389564516, "loss/hidden": 2.5421875, "loss/incoh": 0.0, "loss/logits": 0.18040964677929877, "loss/reg": 0.0, "step": 50390 }, { "epoch": 0.33157894736842103, "grad_norm": 2.625, "grad_norm_var": 0.0737213134765625, "learning_rate": 0.0001, "loss": 2.864, "loss/crossentropy": 2.370367646217346, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.21432522237300872, "loss/reg": 0.0, "step": 50400 }, { "epoch": 0.33164473684210527, "grad_norm": 2.359375, "grad_norm_var": 0.06096598307291667, "learning_rate": 0.0001, "loss": 2.9266, "loss/crossentropy": 2.237786555290222, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.22062029093503951, "loss/reg": 0.0, "step": 50410 }, { "epoch": 0.33171052631578946, "grad_norm": 2.15625, "grad_norm_var": 0.07538655598958334, "learning_rate": 0.0001, "loss": 2.8914, "loss/crossentropy": 1.8082878708839416, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.21079254969954492, "loss/reg": 0.0, "step": 50420 }, { "epoch": 0.3317763157894737, "grad_norm": 2.734375, "grad_norm_var": 0.19761454264322917, "learning_rate": 0.0001, "loss": 2.8603, "loss/crossentropy": 2.0727750599384307, "loss/hidden": 2.76875, "loss/incoh": 0.0, "loss/logits": 0.21904964596033097, "loss/reg": 0.0, "step": 50430 }, { "epoch": 0.3318421052631579, "grad_norm": 3.171875, "grad_norm_var": 0.07109273274739583, "learning_rate": 0.0001, "loss": 2.9436, "loss/crossentropy": 2.278210985660553, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.20327704399824142, "loss/reg": 0.0, "step": 50440 }, { "epoch": 0.3319078947368421, "grad_norm": 2.484375, "grad_norm_var": 0.07994384765625, "learning_rate": 0.0001, "loss": 2.9487, "loss/crossentropy": 2.545610189437866, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.2313461720943451, "loss/reg": 0.0, "step": 50450 }, { "epoch": 0.3319736842105263, "grad_norm": 2.3125, "grad_norm_var": 0.07317606608072917, "learning_rate": 0.0001, "loss": 2.8192, "loss/crossentropy": 2.1331045269966125, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.19365556091070174, "loss/reg": 0.0, "step": 50460 }, { "epoch": 0.33203947368421055, "grad_norm": 2.328125, "grad_norm_var": 0.06905008951822916, "learning_rate": 0.0001, "loss": 2.931, "loss/crossentropy": 2.278953754901886, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.21813278496265412, "loss/reg": 0.0, "step": 50470 }, { "epoch": 0.33210526315789474, "grad_norm": 2.359375, "grad_norm_var": 0.0637603759765625, "learning_rate": 0.0001, "loss": 2.8225, "loss/crossentropy": 2.2639258742332458, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.2262943983078003, "loss/reg": 0.0, "step": 50480 }, { "epoch": 0.3321710526315789, "grad_norm": 2.625, "grad_norm_var": 0.037262980143229166, "learning_rate": 0.0001, "loss": 2.8661, "loss/crossentropy": 2.2626337349414825, "loss/hidden": 2.4828125, "loss/incoh": 0.0, "loss/logits": 0.1757798582315445, "loss/reg": 0.0, "step": 50490 }, { "epoch": 0.33223684210526316, "grad_norm": 2.390625, "grad_norm_var": 0.0932281494140625, "learning_rate": 0.0001, "loss": 2.8859, "loss/crossentropy": 2.037859523296356, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.20832669138908386, "loss/reg": 0.0, "step": 50500 }, { "epoch": 0.33230263157894735, "grad_norm": 2.359375, "grad_norm_var": 0.131201171875, "learning_rate": 0.0001, "loss": 2.9169, "loss/crossentropy": 2.245512640476227, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.19610281884670258, "loss/reg": 0.0, "step": 50510 }, { "epoch": 0.3323684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.052473958333333334, "learning_rate": 0.0001, "loss": 2.8659, "loss/crossentropy": 1.7806527376174928, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.20760951340198516, "loss/reg": 0.0, "step": 50520 }, { "epoch": 0.3324342105263158, "grad_norm": 2.375, "grad_norm_var": 0.02047119140625, "learning_rate": 0.0001, "loss": 2.9106, "loss/crossentropy": 2.2821572184562684, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.19843539893627166, "loss/reg": 0.0, "step": 50530 }, { "epoch": 0.3325, "grad_norm": 4.96875, "grad_norm_var": 0.42890625, "learning_rate": 0.0001, "loss": 2.9628, "loss/crossentropy": 2.0077955335378648, "loss/hidden": 2.628125, "loss/incoh": 0.0, "loss/logits": 0.1874359395354986, "loss/reg": 0.0, "step": 50540 }, { "epoch": 0.3325657894736842, "grad_norm": 2.296875, "grad_norm_var": 0.4422841389973958, "learning_rate": 0.0001, "loss": 2.8661, "loss/crossentropy": 2.367862570285797, "loss/hidden": 2.5640625, "loss/incoh": 0.0, "loss/logits": 0.1995317429304123, "loss/reg": 0.0, "step": 50550 }, { "epoch": 0.33263157894736844, "grad_norm": 2.453125, "grad_norm_var": 0.253173828125, "learning_rate": 0.0001, "loss": 2.9369, "loss/crossentropy": 2.2312045812606813, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.1858774021267891, "loss/reg": 0.0, "step": 50560 }, { "epoch": 0.33269736842105263, "grad_norm": 2.578125, "grad_norm_var": 0.22688700358072916, "learning_rate": 0.0001, "loss": 2.9248, "loss/crossentropy": 2.4077724695205687, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.20457258224487304, "loss/reg": 0.0, "step": 50570 }, { "epoch": 0.3327631578947368, "grad_norm": 2.25, "grad_norm_var": 0.03736572265625, "learning_rate": 0.0001, "loss": 2.9118, "loss/crossentropy": 2.4545961141586305, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.21606967300176622, "loss/reg": 0.0, "step": 50580 }, { "epoch": 0.33282894736842106, "grad_norm": 2.46875, "grad_norm_var": 0.06807352701822916, "learning_rate": 0.0001, "loss": 2.9377, "loss/crossentropy": 2.389681613445282, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.28717913031578063, "loss/reg": 0.0, "step": 50590 }, { "epoch": 0.33289473684210524, "grad_norm": 2.328125, "grad_norm_var": 0.17813212076822918, "learning_rate": 0.0001, "loss": 2.8652, "loss/crossentropy": 2.449984407424927, "loss/hidden": 2.5234375, "loss/incoh": 0.0, "loss/logits": 0.19355312883853912, "loss/reg": 0.0, "step": 50600 }, { "epoch": 0.3329605263157895, "grad_norm": 2.421875, "grad_norm_var": 0.14624735514322917, "learning_rate": 0.0001, "loss": 2.8165, "loss/crossentropy": 2.3701043009757994, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.213811431825161, "loss/reg": 0.0, "step": 50610 }, { "epoch": 0.33302631578947367, "grad_norm": 2.65625, "grad_norm_var": 0.19875386555989583, "learning_rate": 0.0001, "loss": 2.9309, "loss/crossentropy": 2.548377585411072, "loss/hidden": 2.5390625, "loss/incoh": 0.0, "loss/logits": 0.21067244112491607, "loss/reg": 0.0, "step": 50620 }, { "epoch": 0.3330921052631579, "grad_norm": 2.5, "grad_norm_var": 0.0710845947265625, "learning_rate": 0.0001, "loss": 2.9266, "loss/crossentropy": 2.325981104373932, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.20033977478742598, "loss/reg": 0.0, "step": 50630 }, { "epoch": 0.3331578947368421, "grad_norm": 2.90625, "grad_norm_var": 0.06009114583333333, "learning_rate": 0.0001, "loss": 2.939, "loss/crossentropy": 2.2107152104377747, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.21968744024634362, "loss/reg": 0.0, "step": 50640 }, { "epoch": 0.33322368421052634, "grad_norm": 2.15625, "grad_norm_var": 0.23088785807291667, "learning_rate": 0.0001, "loss": 2.8931, "loss/crossentropy": 2.5067268133163454, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.21632191836833953, "loss/reg": 0.0, "step": 50650 }, { "epoch": 0.3332894736842105, "grad_norm": 2.0625, "grad_norm_var": 0.22502848307291667, "learning_rate": 0.0001, "loss": 2.9251, "loss/crossentropy": 2.0246191143989565, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.21657928973436355, "loss/reg": 0.0, "step": 50660 }, { "epoch": 0.3333552631578947, "grad_norm": 2.25, "grad_norm_var": 0.07379150390625, "learning_rate": 0.0001, "loss": 2.8755, "loss/crossentropy": 2.5911722660064695, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.22023314386606216, "loss/reg": 0.0, "step": 50670 }, { "epoch": 0.33342105263157895, "grad_norm": 2.34375, "grad_norm_var": 0.1233306884765625, "learning_rate": 0.0001, "loss": 2.9218, "loss/crossentropy": 2.079791951179504, "loss/hidden": 2.996875, "loss/incoh": 0.0, "loss/logits": 0.21135509461164476, "loss/reg": 0.0, "step": 50680 }, { "epoch": 0.33348684210526314, "grad_norm": 2.640625, "grad_norm_var": 0.0994049072265625, "learning_rate": 0.0001, "loss": 2.9061, "loss/crossentropy": 2.2185009717941284, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.2151113323867321, "loss/reg": 0.0, "step": 50690 }, { "epoch": 0.3335526315789474, "grad_norm": 2.1875, "grad_norm_var": 0.042008463541666666, "learning_rate": 0.0001, "loss": 2.9014, "loss/crossentropy": 2.5826416015625, "loss/hidden": 2.5453125, "loss/incoh": 0.0, "loss/logits": 0.20103476494550704, "loss/reg": 0.0, "step": 50700 }, { "epoch": 0.33361842105263156, "grad_norm": 2.4375, "grad_norm_var": 0.07581278483072916, "learning_rate": 0.0001, "loss": 2.9147, "loss/crossentropy": 2.4500887274742125, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.21521444469690323, "loss/reg": 0.0, "step": 50710 }, { "epoch": 0.3336842105263158, "grad_norm": 2.6875, "grad_norm_var": 0.0680084228515625, "learning_rate": 0.0001, "loss": 2.886, "loss/crossentropy": 2.2272815346717834, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.22068194150924683, "loss/reg": 0.0, "step": 50720 }, { "epoch": 0.33375, "grad_norm": 2.40625, "grad_norm_var": 0.0424468994140625, "learning_rate": 0.0001, "loss": 2.8964, "loss/crossentropy": 2.0950161099433897, "loss/hidden": 2.8640625, "loss/incoh": 0.0, "loss/logits": 0.2555733427405357, "loss/reg": 0.0, "step": 50730 }, { "epoch": 0.33381578947368423, "grad_norm": 2.234375, "grad_norm_var": 0.08638916015625, "learning_rate": 0.0001, "loss": 2.9286, "loss/crossentropy": 2.0966663360595703, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.2145072713494301, "loss/reg": 0.0, "step": 50740 }, { "epoch": 0.3338815789473684, "grad_norm": 2.421875, "grad_norm_var": 0.0245025634765625, "learning_rate": 0.0001, "loss": 2.8752, "loss/crossentropy": 2.210767614841461, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.19852697625756263, "loss/reg": 0.0, "step": 50750 }, { "epoch": 0.3339473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.03713785807291667, "learning_rate": 0.0001, "loss": 2.9689, "loss/crossentropy": 2.11176974773407, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.25268652737140657, "loss/reg": 0.0, "step": 50760 }, { "epoch": 0.33401315789473685, "grad_norm": 2.515625, "grad_norm_var": 0.15764973958333334, "learning_rate": 0.0001, "loss": 2.9768, "loss/crossentropy": 2.2423816442489626, "loss/hidden": 2.90625, "loss/incoh": 0.0, "loss/logits": 0.2602867230772972, "loss/reg": 0.0, "step": 50770 }, { "epoch": 0.33407894736842103, "grad_norm": 2.234375, "grad_norm_var": 0.03665364583333333, "learning_rate": 0.0001, "loss": 2.8858, "loss/crossentropy": 2.187647223472595, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.20351214855909347, "loss/reg": 0.0, "step": 50780 }, { "epoch": 0.33414473684210527, "grad_norm": 2.171875, "grad_norm_var": 0.0225494384765625, "learning_rate": 0.0001, "loss": 2.8667, "loss/crossentropy": 2.3542147397994997, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.22136716544628143, "loss/reg": 0.0, "step": 50790 }, { "epoch": 0.33421052631578946, "grad_norm": 2.125, "grad_norm_var": 0.0398834228515625, "learning_rate": 0.0001, "loss": 2.8839, "loss/crossentropy": 2.117619347572327, "loss/hidden": 2.9375, "loss/incoh": 0.0, "loss/logits": 0.2707338482141495, "loss/reg": 0.0, "step": 50800 }, { "epoch": 0.3342763157894737, "grad_norm": 2.28125, "grad_norm_var": 0.043797810872395836, "learning_rate": 0.0001, "loss": 2.922, "loss/crossentropy": 2.2649453401565554, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.22664185166358947, "loss/reg": 0.0, "step": 50810 }, { "epoch": 0.3343421052631579, "grad_norm": 2.03125, "grad_norm_var": 0.06808268229166667, "learning_rate": 0.0001, "loss": 2.9956, "loss/crossentropy": 2.4377845764160155, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.24173070788383483, "loss/reg": 0.0, "step": 50820 }, { "epoch": 0.3344078947368421, "grad_norm": 2.3125, "grad_norm_var": 0.07766011555989584, "learning_rate": 0.0001, "loss": 2.9561, "loss/crossentropy": 2.1634218096733093, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.20271003395318984, "loss/reg": 0.0, "step": 50830 }, { "epoch": 0.3344736842105263, "grad_norm": 2.625, "grad_norm_var": 0.0377593994140625, "learning_rate": 0.0001, "loss": 2.922, "loss/crossentropy": 2.23525390625, "loss/hidden": 2.5484375, "loss/incoh": 0.0, "loss/logits": 0.19301858320832252, "loss/reg": 0.0, "step": 50840 }, { "epoch": 0.33453947368421055, "grad_norm": 2.234375, "grad_norm_var": 0.0619049072265625, "learning_rate": 0.0001, "loss": 2.9415, "loss/crossentropy": 2.333955454826355, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.22684998959302902, "loss/reg": 0.0, "step": 50850 }, { "epoch": 0.33460526315789474, "grad_norm": 2.265625, "grad_norm_var": 2.5122081411023066e+17, "learning_rate": 0.0001, "loss": 3.1717, "loss/crossentropy": 2.417708957195282, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.2552970707416534, "loss/reg": 0.0, "step": 50860 }, { "epoch": 0.3346710526315789, "grad_norm": 2.15625, "grad_norm_var": 0.04784749348958333, "learning_rate": 0.0001, "loss": 2.9138, "loss/crossentropy": 2.574558162689209, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.22551280856132508, "loss/reg": 0.0, "step": 50870 }, { "epoch": 0.33473684210526317, "grad_norm": 2.1875, "grad_norm_var": 0.050050608317057294, "learning_rate": 0.0001, "loss": 2.876, "loss/crossentropy": 2.1544400453567505, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.20682999789714812, "loss/reg": 0.0, "step": 50880 }, { "epoch": 0.33480263157894735, "grad_norm": 2.734375, "grad_norm_var": 0.047308095296223956, "learning_rate": 0.0001, "loss": 2.9572, "loss/crossentropy": 2.1814372539520264, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.2361245334148407, "loss/reg": 0.0, "step": 50890 }, { "epoch": 0.3348684210526316, "grad_norm": 2.125, "grad_norm_var": 0.056061808268229166, "learning_rate": 0.0001, "loss": 2.9148, "loss/crossentropy": 2.299385201931, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.22338827252388, "loss/reg": 0.0, "step": 50900 }, { "epoch": 0.3349342105263158, "grad_norm": 2.109375, "grad_norm_var": 0.0869781494140625, "learning_rate": 0.0001, "loss": 2.9312, "loss/crossentropy": 2.1938913822174073, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.19955417588353158, "loss/reg": 0.0, "step": 50910 }, { "epoch": 0.335, "grad_norm": 2.171875, "grad_norm_var": 0.05683492024739583, "learning_rate": 0.0001, "loss": 2.8518, "loss/crossentropy": 2.540507507324219, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.20163573324680328, "loss/reg": 0.0, "step": 50920 }, { "epoch": 0.3350657894736842, "grad_norm": 2.46875, "grad_norm_var": 0.056493123372395836, "learning_rate": 0.0001, "loss": 2.8929, "loss/crossentropy": 2.355184495449066, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.19840589091181754, "loss/reg": 0.0, "step": 50930 }, { "epoch": 0.33513157894736845, "grad_norm": 2.828125, "grad_norm_var": 0.4327545166015625, "learning_rate": 0.0001, "loss": 2.9971, "loss/crossentropy": 2.3064613580703734, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.19526328295469284, "loss/reg": 0.0, "step": 50940 }, { "epoch": 0.33519736842105263, "grad_norm": 2.375, "grad_norm_var": 0.469873046875, "learning_rate": 0.0001, "loss": 2.8762, "loss/crossentropy": 2.393480455875397, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.21682310551404954, "loss/reg": 0.0, "step": 50950 }, { "epoch": 0.3352631578947368, "grad_norm": 2.390625, "grad_norm_var": 0.027274576822916667, "learning_rate": 0.0001, "loss": 2.9084, "loss/crossentropy": 2.197929584980011, "loss/hidden": 2.5640625, "loss/incoh": 0.0, "loss/logits": 0.19201867580413817, "loss/reg": 0.0, "step": 50960 }, { "epoch": 0.33532894736842106, "grad_norm": 2.84375, "grad_norm_var": 0.24527079264322918, "learning_rate": 0.0001, "loss": 2.9294, "loss/crossentropy": 2.3099944829940795, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.26518677920103073, "loss/reg": 0.0, "step": 50970 }, { "epoch": 0.33539473684210525, "grad_norm": 2.203125, "grad_norm_var": 0.26024983723958334, "learning_rate": 0.0001, "loss": 2.9132, "loss/crossentropy": 2.4253310918807984, "loss/hidden": 2.625, "loss/incoh": 0.0, "loss/logits": 0.20256113409996032, "loss/reg": 0.0, "step": 50980 }, { "epoch": 0.3354605263157895, "grad_norm": 2.40625, "grad_norm_var": 0.18618876139322918, "learning_rate": 0.0001, "loss": 2.8864, "loss/crossentropy": 2.0144184470176696, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.20889573097229003, "loss/reg": 0.0, "step": 50990 }, { "epoch": 0.3355263157894737, "grad_norm": 2.390625, "grad_norm_var": 0.15875651041666666, "learning_rate": 0.0001, "loss": 2.9385, "loss/crossentropy": 2.3347760319709776, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.26305699795484544, "loss/reg": 0.0, "step": 51000 }, { "epoch": 0.3355921052631579, "grad_norm": 2.78125, "grad_norm_var": 0.150439453125, "learning_rate": 0.0001, "loss": 2.9547, "loss/crossentropy": 2.0401941418647764, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.23677497506141662, "loss/reg": 0.0, "step": 51010 }, { "epoch": 0.3356578947368421, "grad_norm": 2.234375, "grad_norm_var": 0.12185770670572917, "learning_rate": 0.0001, "loss": 2.8799, "loss/crossentropy": 2.1681514263153074, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.1978909820318222, "loss/reg": 0.0, "step": 51020 }, { "epoch": 0.33572368421052634, "grad_norm": 2.140625, "grad_norm_var": 0.03355712890625, "learning_rate": 0.0001, "loss": 2.8844, "loss/crossentropy": 2.4946326494216917, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.2138603374361992, "loss/reg": 0.0, "step": 51030 }, { "epoch": 0.3357894736842105, "grad_norm": 2.578125, "grad_norm_var": 0.028351847330729166, "learning_rate": 0.0001, "loss": 2.9442, "loss/crossentropy": 2.0505879521369934, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.1794215366244316, "loss/reg": 0.0, "step": 51040 }, { "epoch": 0.3358552631578947, "grad_norm": 2.203125, "grad_norm_var": 0.047638956705729166, "learning_rate": 0.0001, "loss": 3.0238, "loss/crossentropy": 2.245543509721756, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.19880647510290145, "loss/reg": 0.0, "step": 51050 }, { "epoch": 0.33592105263157895, "grad_norm": 2.6875, "grad_norm_var": 0.032470703125, "learning_rate": 0.0001, "loss": 2.9689, "loss/crossentropy": 2.393807661533356, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.24181752949953078, "loss/reg": 0.0, "step": 51060 }, { "epoch": 0.33598684210526314, "grad_norm": 3.046875, "grad_norm_var": 0.11060791015625, "learning_rate": 0.0001, "loss": 3.0186, "loss/crossentropy": 2.471091812849045, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.246793632209301, "loss/reg": 0.0, "step": 51070 }, { "epoch": 0.3360526315789474, "grad_norm": 2.21875, "grad_norm_var": 0.10116780598958333, "learning_rate": 0.0001, "loss": 2.9292, "loss/crossentropy": 2.157414972782135, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.19926275461912155, "loss/reg": 0.0, "step": 51080 }, { "epoch": 0.33611842105263157, "grad_norm": 2.71875, "grad_norm_var": 0.3255849202473958, "learning_rate": 0.0001, "loss": 3.018, "loss/crossentropy": 2.6597060918807984, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.24392834752798082, "loss/reg": 0.0, "step": 51090 }, { "epoch": 0.3361842105263158, "grad_norm": 2.390625, "grad_norm_var": 1.9766580179740045e+17, "learning_rate": 0.0001, "loss": 2.9883, "loss/crossentropy": 2.4657155752182005, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.21559127122163774, "loss/reg": 0.0, "step": 51100 }, { "epoch": 0.33625, "grad_norm": 2.75, "grad_norm_var": 0.06601155598958333, "learning_rate": 0.0001, "loss": 2.9366, "loss/crossentropy": 2.1518523573875425, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.22397182285785674, "loss/reg": 0.0, "step": 51110 }, { "epoch": 0.33631578947368423, "grad_norm": 2.4375, "grad_norm_var": 0.26298421223958335, "learning_rate": 0.0001, "loss": 2.971, "loss/crossentropy": 2.1960764288902284, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.21993028074502946, "loss/reg": 0.0, "step": 51120 }, { "epoch": 0.3363815789473684, "grad_norm": 2.390625, "grad_norm_var": 0.2503082275390625, "learning_rate": 0.0001, "loss": 2.8512, "loss/crossentropy": 2.548286294937134, "loss/hidden": 2.509375, "loss/incoh": 0.0, "loss/logits": 0.2046455278992653, "loss/reg": 0.0, "step": 51130 }, { "epoch": 0.3364473684210526, "grad_norm": 2.546875, "grad_norm_var": 0.039404296875, "learning_rate": 0.0001, "loss": 2.9214, "loss/crossentropy": 2.2616363525390626, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.19665150046348573, "loss/reg": 0.0, "step": 51140 }, { "epoch": 0.33651315789473685, "grad_norm": 2.453125, "grad_norm_var": 0.09569905598958334, "learning_rate": 0.0001, "loss": 2.9497, "loss/crossentropy": 2.461883544921875, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.21027203649282455, "loss/reg": 0.0, "step": 51150 }, { "epoch": 0.33657894736842103, "grad_norm": 2.4375, "grad_norm_var": 0.043680826822916664, "learning_rate": 0.0001, "loss": 2.8532, "loss/crossentropy": 2.3153322339057922, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2384658545255661, "loss/reg": 0.0, "step": 51160 }, { "epoch": 0.3366447368421053, "grad_norm": 2.625, "grad_norm_var": 0.022782389322916666, "learning_rate": 0.0001, "loss": 2.8936, "loss/crossentropy": 2.2255983471870424, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.208677077293396, "loss/reg": 0.0, "step": 51170 }, { "epoch": 0.33671052631578946, "grad_norm": 2.296875, "grad_norm_var": 0.031183878580729168, "learning_rate": 0.0001, "loss": 2.8935, "loss/crossentropy": 2.217636561393738, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.2151350975036621, "loss/reg": 0.0, "step": 51180 }, { "epoch": 0.3367763157894737, "grad_norm": 2.359375, "grad_norm_var": 0.0353424072265625, "learning_rate": 0.0001, "loss": 2.9034, "loss/crossentropy": 2.4696980714797974, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.20950591564178467, "loss/reg": 0.0, "step": 51190 }, { "epoch": 0.3368421052631579, "grad_norm": 2.75, "grad_norm_var": 0.03313395182291667, "learning_rate": 0.0001, "loss": 2.9255, "loss/crossentropy": 2.3735176801681517, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.20985998809337617, "loss/reg": 0.0, "step": 51200 }, { "epoch": 0.33690789473684213, "grad_norm": 3.109375, "grad_norm_var": 0.07780659993489583, "learning_rate": 0.0001, "loss": 2.9531, "loss/crossentropy": 2.3303568601608275, "loss/hidden": 2.9828125, "loss/incoh": 0.0, "loss/logits": 0.2706267848610878, "loss/reg": 0.0, "step": 51210 }, { "epoch": 0.3369736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.08244527180989583, "learning_rate": 0.0001, "loss": 2.919, "loss/crossentropy": 2.27491295337677, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.20597711056470872, "loss/reg": 0.0, "step": 51220 }, { "epoch": 0.3370394736842105, "grad_norm": 2.234375, "grad_norm_var": 0.0528472900390625, "learning_rate": 0.0001, "loss": 2.8773, "loss/crossentropy": 2.0537822604179383, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.2212661013007164, "loss/reg": 0.0, "step": 51230 }, { "epoch": 0.33710526315789474, "grad_norm": 2.546875, "grad_norm_var": 0.0514068603515625, "learning_rate": 0.0001, "loss": 2.9091, "loss/crossentropy": 2.093616771697998, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.24028838872909547, "loss/reg": 0.0, "step": 51240 }, { "epoch": 0.3371710526315789, "grad_norm": 2.296875, "grad_norm_var": 0.1221832275390625, "learning_rate": 0.0001, "loss": 3.0161, "loss/crossentropy": 2.2296194076538085, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.21063584536314012, "loss/reg": 0.0, "step": 51250 }, { "epoch": 0.33723684210526317, "grad_norm": 2.71875, "grad_norm_var": 0.13632710774739584, "learning_rate": 0.0001, "loss": 2.9558, "loss/crossentropy": 2.195955741405487, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.19952035173773766, "loss/reg": 0.0, "step": 51260 }, { "epoch": 0.33730263157894735, "grad_norm": 3.21875, "grad_norm_var": 0.1055328369140625, "learning_rate": 0.0001, "loss": 2.8963, "loss/crossentropy": 2.2839467763900756, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.25927660167217254, "loss/reg": 0.0, "step": 51270 }, { "epoch": 0.3373684210526316, "grad_norm": 2.328125, "grad_norm_var": 0.10461832682291666, "learning_rate": 0.0001, "loss": 2.9276, "loss/crossentropy": 2.175657331943512, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.22597195729613304, "loss/reg": 0.0, "step": 51280 }, { "epoch": 0.3374342105263158, "grad_norm": 2.234375, "grad_norm_var": 0.06855061848958334, "learning_rate": 0.0001, "loss": 2.8737, "loss/crossentropy": 2.4763439178466795, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.22902814149856568, "loss/reg": 0.0, "step": 51290 }, { "epoch": 0.3375, "grad_norm": 2.8125, "grad_norm_var": 0.06529032389322917, "learning_rate": 0.0001, "loss": 2.9892, "loss/crossentropy": 2.2901416301727293, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.21565229669213296, "loss/reg": 0.0, "step": 51300 }, { "epoch": 0.3375657894736842, "grad_norm": 2.171875, "grad_norm_var": 0.13690999348958333, "learning_rate": 0.0001, "loss": 2.955, "loss/crossentropy": 2.0497992098331452, "loss/hidden": 2.790625, "loss/incoh": 0.0, "loss/logits": 0.21756108552217485, "loss/reg": 0.0, "step": 51310 }, { "epoch": 0.3376315789473684, "grad_norm": 2.1875, "grad_norm_var": 0.12815348307291666, "learning_rate": 0.0001, "loss": 2.8985, "loss/crossentropy": 2.3337001323699953, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.21024569123983383, "loss/reg": 0.0, "step": 51320 }, { "epoch": 0.33769736842105263, "grad_norm": 2.515625, "grad_norm_var": 0.0409088134765625, "learning_rate": 0.0001, "loss": 2.9572, "loss/crossentropy": 2.3777615547180178, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.19942517206072807, "loss/reg": 0.0, "step": 51330 }, { "epoch": 0.3377631578947368, "grad_norm": 2.578125, "grad_norm_var": 0.06061197916666667, "learning_rate": 0.0001, "loss": 2.8986, "loss/crossentropy": 2.1074992537498476, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.22392439246177673, "loss/reg": 0.0, "step": 51340 }, { "epoch": 0.33782894736842106, "grad_norm": 2.65625, "grad_norm_var": 0.06864827473958333, "learning_rate": 0.0001, "loss": 2.942, "loss/crossentropy": 2.1408634305000307, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.22475532665848733, "loss/reg": 0.0, "step": 51350 }, { "epoch": 0.33789473684210525, "grad_norm": 2.75, "grad_norm_var": 0.07216796875, "learning_rate": 0.0001, "loss": 2.9538, "loss/crossentropy": 2.343469262123108, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.2040960043668747, "loss/reg": 0.0, "step": 51360 }, { "epoch": 0.3379605263157895, "grad_norm": 2.203125, "grad_norm_var": 0.037165323893229164, "learning_rate": 0.0001, "loss": 2.8995, "loss/crossentropy": 2.2144591093063353, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.21058980822563172, "loss/reg": 0.0, "step": 51370 }, { "epoch": 0.3380263157894737, "grad_norm": 2.515625, "grad_norm_var": 0.0152252197265625, "learning_rate": 0.0001, "loss": 2.9464, "loss/crossentropy": 2.195129668712616, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.27738767564296724, "loss/reg": 0.0, "step": 51380 }, { "epoch": 0.3380921052631579, "grad_norm": 2.46875, "grad_norm_var": 0.0571685791015625, "learning_rate": 0.0001, "loss": 2.9861, "loss/crossentropy": 2.282071602344513, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.19449208825826644, "loss/reg": 0.0, "step": 51390 }, { "epoch": 0.3381578947368421, "grad_norm": 2.734375, "grad_norm_var": 2.4081943415460003e+17, "learning_rate": 0.0001, "loss": 3.0227, "loss/crossentropy": 2.1599539875984193, "loss/hidden": 2.625, "loss/incoh": 0.0, "loss/logits": 0.1725857824087143, "loss/reg": 0.0, "step": 51400 }, { "epoch": 0.33822368421052634, "grad_norm": 2.3125, "grad_norm_var": 2.4081943416252333e+17, "learning_rate": 0.0001, "loss": 2.8904, "loss/crossentropy": 2.5691097378730774, "loss/hidden": 2.6078125, "loss/incoh": 0.0, "loss/logits": 0.1780735358595848, "loss/reg": 0.0, "step": 51410 }, { "epoch": 0.33828947368421053, "grad_norm": 2.28125, "grad_norm_var": 0.0538238525390625, "learning_rate": 0.0001, "loss": 2.8634, "loss/crossentropy": 2.211638700962067, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.20883913189172745, "loss/reg": 0.0, "step": 51420 }, { "epoch": 0.3383552631578947, "grad_norm": 2.5, "grad_norm_var": 2.3328033447265626, "learning_rate": 0.0001, "loss": 3.0002, "loss/crossentropy": 2.232033360004425, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.4572066754102707, "loss/reg": 0.0, "step": 51430 }, { "epoch": 0.33842105263157896, "grad_norm": 2.171875, "grad_norm_var": 2.63359375, "learning_rate": 0.0001, "loss": 2.9024, "loss/crossentropy": 2.3384958863258363, "loss/hidden": 2.5546875, "loss/incoh": 0.0, "loss/logits": 0.20987383425235748, "loss/reg": 0.0, "step": 51440 }, { "epoch": 0.33848684210526314, "grad_norm": 3.0, "grad_norm_var": 0.0443511962890625, "learning_rate": 0.0001, "loss": 2.8115, "loss/crossentropy": 2.125646489858627, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.20138383656740189, "loss/reg": 0.0, "step": 51450 }, { "epoch": 0.3385526315789474, "grad_norm": 2.34375, "grad_norm_var": 0.06591389973958334, "learning_rate": 0.0001, "loss": 2.9453, "loss/crossentropy": 2.04107426404953, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.25250485837459563, "loss/reg": 0.0, "step": 51460 }, { "epoch": 0.33861842105263157, "grad_norm": 2.0625, "grad_norm_var": 0.048737589518229166, "learning_rate": 0.0001, "loss": 2.8845, "loss/crossentropy": 2.231530177593231, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.23468084037303924, "loss/reg": 0.0, "step": 51470 }, { "epoch": 0.3386842105263158, "grad_norm": 2.390625, "grad_norm_var": 0.049860636393229164, "learning_rate": 0.0001, "loss": 2.8947, "loss/crossentropy": 2.17267849445343, "loss/hidden": 2.571875, "loss/incoh": 0.0, "loss/logits": 0.15510939210653304, "loss/reg": 0.0, "step": 51480 }, { "epoch": 0.33875, "grad_norm": 2.328125, "grad_norm_var": 0.09455973307291667, "learning_rate": 0.0001, "loss": 2.8992, "loss/crossentropy": 2.188182008266449, "loss/hidden": 2.471875, "loss/incoh": 0.0, "loss/logits": 0.17678166329860687, "loss/reg": 0.0, "step": 51490 }, { "epoch": 0.33881578947368424, "grad_norm": 2.421875, "grad_norm_var": 0.06751200358072916, "learning_rate": 0.0001, "loss": 2.9199, "loss/crossentropy": 2.4890690207481385, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.20976100265979766, "loss/reg": 0.0, "step": 51500 }, { "epoch": 0.3388815789473684, "grad_norm": 3.5625, "grad_norm_var": 3.6365234375, "learning_rate": 0.0001, "loss": 2.9087, "loss/crossentropy": 2.1425050377845762, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.18070595487952232, "loss/reg": 0.0, "step": 51510 }, { "epoch": 0.3389473684210526, "grad_norm": 2.34375, "grad_norm_var": 3.5402903238932293, "learning_rate": 0.0001, "loss": 2.9895, "loss/crossentropy": 2.3227904438972473, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.22007289677858352, "loss/reg": 0.0, "step": 51520 }, { "epoch": 0.33901315789473685, "grad_norm": 2.59375, "grad_norm_var": 0.0755767822265625, "learning_rate": 0.0001, "loss": 2.8863, "loss/crossentropy": 2.3235227942466734, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.21638802140951158, "loss/reg": 0.0, "step": 51530 }, { "epoch": 0.33907894736842104, "grad_norm": 2.765625, "grad_norm_var": 0.051981608072916664, "learning_rate": 0.0001, "loss": 2.9687, "loss/crossentropy": 2.391672122478485, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.22687717378139496, "loss/reg": 0.0, "step": 51540 }, { "epoch": 0.3391447368421053, "grad_norm": 2.5625, "grad_norm_var": 0.054117838541666664, "learning_rate": 0.0001, "loss": 2.8967, "loss/crossentropy": 2.2732358992099764, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.21500950455665588, "loss/reg": 0.0, "step": 51550 }, { "epoch": 0.33921052631578946, "grad_norm": 2.296875, "grad_norm_var": 0.03235575358072917, "learning_rate": 0.0001, "loss": 2.8639, "loss/crossentropy": 2.253628873825073, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.204642254114151, "loss/reg": 0.0, "step": 51560 }, { "epoch": 0.3392763157894737, "grad_norm": 2.4375, "grad_norm_var": 0.07819010416666666, "learning_rate": 0.0001, "loss": 2.8785, "loss/crossentropy": 2.2255041360855103, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.20591113567352295, "loss/reg": 0.0, "step": 51570 }, { "epoch": 0.3393421052631579, "grad_norm": 2.46875, "grad_norm_var": 0.09401041666666667, "learning_rate": 0.0001, "loss": 2.8771, "loss/crossentropy": 2.356953811645508, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.20334179252386092, "loss/reg": 0.0, "step": 51580 }, { "epoch": 0.33940789473684213, "grad_norm": 2.703125, "grad_norm_var": 0.051123046875, "learning_rate": 0.0001, "loss": 2.9225, "loss/crossentropy": 2.5231595396995545, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.19643236249685286, "loss/reg": 0.0, "step": 51590 }, { "epoch": 0.3394736842105263, "grad_norm": 2.75, "grad_norm_var": 0.0424713134765625, "learning_rate": 0.0001, "loss": 2.9341, "loss/crossentropy": 2.3847658157348635, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.22482667714357377, "loss/reg": 0.0, "step": 51600 }, { "epoch": 0.3395394736842105, "grad_norm": 2.625, "grad_norm_var": 0.0580230712890625, "learning_rate": 0.0001, "loss": 2.9179, "loss/crossentropy": 2.10107136964798, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.22811854034662246, "loss/reg": 0.0, "step": 51610 }, { "epoch": 0.33960526315789474, "grad_norm": 2.875, "grad_norm_var": 0.0918853759765625, "learning_rate": 0.0001, "loss": 2.9267, "loss/crossentropy": 2.2242676436901094, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.2098591223359108, "loss/reg": 0.0, "step": 51620 }, { "epoch": 0.33967105263157893, "grad_norm": 2.421875, "grad_norm_var": 0.0696685791015625, "learning_rate": 0.0001, "loss": 2.9631, "loss/crossentropy": 2.3255584239959717, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.24604955911636353, "loss/reg": 0.0, "step": 51630 }, { "epoch": 0.33973684210526317, "grad_norm": 2.375, "grad_norm_var": 0.03994038899739583, "learning_rate": 0.0001, "loss": 2.9443, "loss/crossentropy": 2.3222431540489197, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.19137527495622636, "loss/reg": 0.0, "step": 51640 }, { "epoch": 0.33980263157894736, "grad_norm": 2.21875, "grad_norm_var": 0.024169921875, "learning_rate": 0.0001, "loss": 2.9143, "loss/crossentropy": 2.258522403240204, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.2283541038632393, "loss/reg": 0.0, "step": 51650 }, { "epoch": 0.3398684210526316, "grad_norm": 2.5, "grad_norm_var": 0.09348958333333333, "learning_rate": 0.0001, "loss": 2.8987, "loss/crossentropy": 2.1713691473007204, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.2105034738779068, "loss/reg": 0.0, "step": 51660 }, { "epoch": 0.3399342105263158, "grad_norm": 2.125, "grad_norm_var": 0.036279296875, "learning_rate": 0.0001, "loss": 2.9425, "loss/crossentropy": 2.116175282001495, "loss/hidden": 2.7765625, "loss/incoh": 0.0, "loss/logits": 0.2105830803513527, "loss/reg": 0.0, "step": 51670 }, { "epoch": 0.34, "grad_norm": 2.390625, "grad_norm_var": 0.1028228759765625, "learning_rate": 0.0001, "loss": 2.8827, "loss/crossentropy": 2.402062952518463, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.22185142785310746, "loss/reg": 0.0, "step": 51680 }, { "epoch": 0.3400657894736842, "grad_norm": 2.359375, "grad_norm_var": 0.10289281209309896, "learning_rate": 0.0001, "loss": 2.8452, "loss/crossentropy": 2.3525772213935854, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.25423287600278854, "loss/reg": 0.0, "step": 51690 }, { "epoch": 0.3401315789473684, "grad_norm": 2.671875, "grad_norm_var": 0.03021214803059896, "learning_rate": 0.0001, "loss": 2.8705, "loss/crossentropy": 2.2071650981903077, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.22391030937433243, "loss/reg": 0.0, "step": 51700 }, { "epoch": 0.34019736842105264, "grad_norm": 2.546875, "grad_norm_var": 0.07631734212239584, "learning_rate": 0.0001, "loss": 2.9635, "loss/crossentropy": 2.5307309150695803, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.22949395477771758, "loss/reg": 0.0, "step": 51710 }, { "epoch": 0.3402631578947368, "grad_norm": 2.390625, "grad_norm_var": 0.0524810791015625, "learning_rate": 0.0001, "loss": 2.882, "loss/crossentropy": 2.3727113544940948, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.2060551702976227, "loss/reg": 0.0, "step": 51720 }, { "epoch": 0.34032894736842106, "grad_norm": 2.40625, "grad_norm_var": 0.07641499837239583, "learning_rate": 0.0001, "loss": 2.8903, "loss/crossentropy": 2.036094093322754, "loss/hidden": 2.55859375, "loss/incoh": 0.0, "loss/logits": 0.2005591869354248, "loss/reg": 0.0, "step": 51730 }, { "epoch": 0.34039473684210525, "grad_norm": 2.28125, "grad_norm_var": 0.0489410400390625, "learning_rate": 0.0001, "loss": 2.9155, "loss/crossentropy": 2.1675809741020204, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.2944232538342476, "loss/reg": 0.0, "step": 51740 }, { "epoch": 0.3404605263157895, "grad_norm": 2.265625, "grad_norm_var": 0.15915425618489584, "learning_rate": 0.0001, "loss": 2.8667, "loss/crossentropy": 2.5967798113822935, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.2140802413225174, "loss/reg": 0.0, "step": 51750 }, { "epoch": 0.3405263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.14199930826822918, "learning_rate": 0.0001, "loss": 2.8843, "loss/crossentropy": 2.306249403953552, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.22473669052124023, "loss/reg": 0.0, "step": 51760 }, { "epoch": 0.3405921052631579, "grad_norm": 2.375, "grad_norm_var": 0.0881500244140625, "learning_rate": 0.0001, "loss": 2.9709, "loss/crossentropy": 2.3759385347366333, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.20319178253412246, "loss/reg": 0.0, "step": 51770 }, { "epoch": 0.3406578947368421, "grad_norm": 2.3125, "grad_norm_var": 0.10449930826822916, "learning_rate": 0.0001, "loss": 2.8351, "loss/crossentropy": 2.2855198264122008, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.2129150539636612, "loss/reg": 0.0, "step": 51780 }, { "epoch": 0.3407236842105263, "grad_norm": 2.515625, "grad_norm_var": 0.07551244099934896, "learning_rate": 0.0001, "loss": 2.9777, "loss/crossentropy": 1.9084784746170045, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.17118511945009232, "loss/reg": 0.0, "step": 51790 }, { "epoch": 0.34078947368421053, "grad_norm": 2.453125, "grad_norm_var": 0.056298828125, "learning_rate": 0.0001, "loss": 2.9526, "loss/crossentropy": 2.333062982559204, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.23569816052913667, "loss/reg": 0.0, "step": 51800 }, { "epoch": 0.3408552631578947, "grad_norm": 2.40625, "grad_norm_var": 0.0355865478515625, "learning_rate": 0.0001, "loss": 2.9158, "loss/crossentropy": 2.0264336347579954, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.18544332757592202, "loss/reg": 0.0, "step": 51810 }, { "epoch": 0.34092105263157896, "grad_norm": 2.390625, "grad_norm_var": 0.03167317708333333, "learning_rate": 0.0001, "loss": 2.7956, "loss/crossentropy": 2.3214208841323853, "loss/hidden": 2.590625, "loss/incoh": 0.0, "loss/logits": 0.20100444331765174, "loss/reg": 0.0, "step": 51820 }, { "epoch": 0.34098684210526314, "grad_norm": 2.546875, "grad_norm_var": 0.042170206705729164, "learning_rate": 0.0001, "loss": 2.9294, "loss/crossentropy": 2.078841817378998, "loss/hidden": 2.9015625, "loss/incoh": 0.0, "loss/logits": 0.2494706243276596, "loss/reg": 0.0, "step": 51830 }, { "epoch": 0.3410526315789474, "grad_norm": 2.421875, "grad_norm_var": 0.052815755208333336, "learning_rate": 0.0001, "loss": 2.8844, "loss/crossentropy": 2.569971442222595, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.21882319897413255, "loss/reg": 0.0, "step": 51840 }, { "epoch": 0.34111842105263157, "grad_norm": 2.421875, "grad_norm_var": 0.035822550455729164, "learning_rate": 0.0001, "loss": 2.9477, "loss/crossentropy": 2.3724521517753603, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.20934848487377167, "loss/reg": 0.0, "step": 51850 }, { "epoch": 0.3411842105263158, "grad_norm": 2.484375, "grad_norm_var": 0.03232014973958333, "learning_rate": 0.0001, "loss": 2.9747, "loss/crossentropy": 2.32980762720108, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.2163873180747032, "loss/reg": 0.0, "step": 51860 }, { "epoch": 0.34125, "grad_norm": 2.640625, "grad_norm_var": 0.0388336181640625, "learning_rate": 0.0001, "loss": 2.9176, "loss/crossentropy": 2.373174750804901, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.20113704204559327, "loss/reg": 0.0, "step": 51870 }, { "epoch": 0.3413157894736842, "grad_norm": 3.90625, "grad_norm_var": 0.18828125, "learning_rate": 0.0001, "loss": 2.9075, "loss/crossentropy": 2.47163565158844, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.22505685091018676, "loss/reg": 0.0, "step": 51880 }, { "epoch": 0.3413815789473684, "grad_norm": 2.359375, "grad_norm_var": 0.22124735514322916, "learning_rate": 0.0001, "loss": 2.8741, "loss/crossentropy": 2.420213043689728, "loss/hidden": 2.5734375, "loss/incoh": 0.0, "loss/logits": 0.1947748377919197, "loss/reg": 0.0, "step": 51890 }, { "epoch": 0.3414473684210526, "grad_norm": 2.640625, "grad_norm_var": 0.059912109375, "learning_rate": 0.0001, "loss": 2.9375, "loss/crossentropy": 2.439142274856567, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.21283862367272377, "loss/reg": 0.0, "step": 51900 }, { "epoch": 0.34151315789473685, "grad_norm": 2.328125, "grad_norm_var": 0.05400288899739583, "learning_rate": 0.0001, "loss": 2.9337, "loss/crossentropy": 2.4820411920547487, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.2706652104854584, "loss/reg": 0.0, "step": 51910 }, { "epoch": 0.34157894736842104, "grad_norm": 2.15625, "grad_norm_var": 0.04120686848958333, "learning_rate": 0.0001, "loss": 2.8937, "loss/crossentropy": 2.127081960439682, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.21399440988898277, "loss/reg": 0.0, "step": 51920 }, { "epoch": 0.3416447368421053, "grad_norm": 2.359375, "grad_norm_var": 0.0621978759765625, "learning_rate": 0.0001, "loss": 2.8803, "loss/crossentropy": 2.1909793615341187, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.19982205852866172, "loss/reg": 0.0, "step": 51930 }, { "epoch": 0.34171052631578946, "grad_norm": 2.5625, "grad_norm_var": 0.06357014973958333, "learning_rate": 0.0001, "loss": 3.0056, "loss/crossentropy": 1.9428649455308915, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.18353333249688147, "loss/reg": 0.0, "step": 51940 }, { "epoch": 0.3417763157894737, "grad_norm": 3.0625, "grad_norm_var": 0.08336181640625, "learning_rate": 0.0001, "loss": 2.9489, "loss/crossentropy": 2.348272681236267, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.20626830756664277, "loss/reg": 0.0, "step": 51950 }, { "epoch": 0.3418421052631579, "grad_norm": 2.625, "grad_norm_var": 0.05829671223958333, "learning_rate": 0.0001, "loss": 2.9136, "loss/crossentropy": 2.470963728427887, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2712224945425987, "loss/reg": 0.0, "step": 51960 }, { "epoch": 0.34190789473684213, "grad_norm": 2.421875, "grad_norm_var": 0.07379557291666666, "learning_rate": 0.0001, "loss": 2.8957, "loss/crossentropy": 1.976397204399109, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.18823150619864465, "loss/reg": 0.0, "step": 51970 }, { "epoch": 0.3419736842105263, "grad_norm": 2.375, "grad_norm_var": 0.07942301432291667, "learning_rate": 0.0001, "loss": 2.9165, "loss/crossentropy": 2.1997026324272158, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.18681176304817199, "loss/reg": 0.0, "step": 51980 }, { "epoch": 0.3420394736842105, "grad_norm": 2.484375, "grad_norm_var": 0.0561676025390625, "learning_rate": 0.0001, "loss": 2.861, "loss/crossentropy": 2.218196964263916, "loss/hidden": 2.521875, "loss/incoh": 0.0, "loss/logits": 0.1685948759317398, "loss/reg": 0.0, "step": 51990 }, { "epoch": 0.34210526315789475, "grad_norm": 2.671875, "grad_norm_var": 0.048216756184895834, "learning_rate": 0.0001, "loss": 2.8868, "loss/crossentropy": 1.9513134121894837, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.2158975951373577, "loss/reg": 0.0, "step": 52000 }, { "epoch": 0.34217105263157893, "grad_norm": 2.765625, "grad_norm_var": 0.08417561848958334, "learning_rate": 0.0001, "loss": 2.9414, "loss/crossentropy": 2.1274460673332216, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.21164817214012147, "loss/reg": 0.0, "step": 52010 }, { "epoch": 0.3422368421052632, "grad_norm": 2.28125, "grad_norm_var": 0.09986572265625, "learning_rate": 0.0001, "loss": 2.8966, "loss/crossentropy": 2.4406869173049928, "loss/hidden": 2.609375, "loss/incoh": 0.0, "loss/logits": 0.20551624596118928, "loss/reg": 0.0, "step": 52020 }, { "epoch": 0.34230263157894736, "grad_norm": 2.109375, "grad_norm_var": 0.19917704264322916, "learning_rate": 0.0001, "loss": 2.8586, "loss/crossentropy": 2.2455283284187315, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.19949279874563217, "loss/reg": 0.0, "step": 52030 }, { "epoch": 0.3423684210526316, "grad_norm": 2.515625, "grad_norm_var": 3.67939453125, "learning_rate": 0.0001, "loss": 2.9152, "loss/crossentropy": 2.2811938285827638, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.21227557063102723, "loss/reg": 0.0, "step": 52040 }, { "epoch": 0.3424342105263158, "grad_norm": 2.609375, "grad_norm_var": 0.26988525390625, "learning_rate": 0.0001, "loss": 2.8406, "loss/crossentropy": 2.2678314685821532, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.22228483259677886, "loss/reg": 0.0, "step": 52050 }, { "epoch": 0.3425, "grad_norm": 2.5, "grad_norm_var": 0.0811431884765625, "learning_rate": 0.0001, "loss": 2.9018, "loss/crossentropy": 2.3194375872612, "loss/hidden": 2.6171875, "loss/incoh": 0.0, "loss/logits": 0.21775465756654738, "loss/reg": 0.0, "step": 52060 }, { "epoch": 0.3425657894736842, "grad_norm": 2.328125, "grad_norm_var": 0.13670145670572917, "learning_rate": 0.0001, "loss": 3.0076, "loss/crossentropy": 1.9399695813655853, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.1701349914073944, "loss/reg": 0.0, "step": 52070 }, { "epoch": 0.3426315789473684, "grad_norm": 2.015625, "grad_norm_var": 0.05289306640625, "learning_rate": 0.0001, "loss": 2.8221, "loss/crossentropy": 2.1561759293079374, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.1848122924566269, "loss/reg": 0.0, "step": 52080 }, { "epoch": 0.34269736842105264, "grad_norm": 2.28125, "grad_norm_var": 0.03004150390625, "learning_rate": 0.0001, "loss": 2.85, "loss/crossentropy": 2.2355022549629213, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.19540479630231858, "loss/reg": 0.0, "step": 52090 }, { "epoch": 0.3427631578947368, "grad_norm": 2.296875, "grad_norm_var": 0.0665435791015625, "learning_rate": 0.0001, "loss": 2.9684, "loss/crossentropy": 2.283226990699768, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2536857232451439, "loss/reg": 0.0, "step": 52100 }, { "epoch": 0.34282894736842107, "grad_norm": 2.453125, "grad_norm_var": 0.03892313639322917, "learning_rate": 0.0001, "loss": 2.9224, "loss/crossentropy": 2.3374486804008483, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.23258694559335708, "loss/reg": 0.0, "step": 52110 }, { "epoch": 0.34289473684210525, "grad_norm": 2.4375, "grad_norm_var": 0.0326080322265625, "learning_rate": 0.0001, "loss": 2.88, "loss/crossentropy": 2.5377514719963075, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.2620867036283016, "loss/reg": 0.0, "step": 52120 }, { "epoch": 0.3429605263157895, "grad_norm": 2.328125, "grad_norm_var": 0.047587076822916664, "learning_rate": 0.0001, "loss": 2.8835, "loss/crossentropy": 2.228368306159973, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.23643250465393068, "loss/reg": 0.0, "step": 52130 }, { "epoch": 0.3430263157894737, "grad_norm": 2.6875, "grad_norm_var": 0.08179931640625, "learning_rate": 0.0001, "loss": 2.9626, "loss/crossentropy": 2.2396329164505007, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.18757053315639496, "loss/reg": 0.0, "step": 52140 }, { "epoch": 0.3430921052631579, "grad_norm": 3.53125, "grad_norm_var": 0.14036051432291666, "learning_rate": 0.0001, "loss": 2.9285, "loss/crossentropy": 2.0495252430438997, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.1827125608921051, "loss/reg": 0.0, "step": 52150 }, { "epoch": 0.3431578947368421, "grad_norm": 2.15625, "grad_norm_var": 0.1458892822265625, "learning_rate": 0.0001, "loss": 2.8798, "loss/crossentropy": 2.4296726822853087, "loss/hidden": 2.546875, "loss/incoh": 0.0, "loss/logits": 0.18709824830293656, "loss/reg": 0.0, "step": 52160 }, { "epoch": 0.3432236842105263, "grad_norm": 2.40625, "grad_norm_var": 0.056136067708333334, "learning_rate": 0.0001, "loss": 2.8456, "loss/crossentropy": 2.1002680897712707, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.21592585295438765, "loss/reg": 0.0, "step": 52170 }, { "epoch": 0.34328947368421053, "grad_norm": 2.703125, "grad_norm_var": 0.06311442057291666, "learning_rate": 0.0001, "loss": 2.953, "loss/crossentropy": 1.8350662678480147, "loss/hidden": 2.9234375, "loss/incoh": 0.0, "loss/logits": 0.18298245184123516, "loss/reg": 0.0, "step": 52180 }, { "epoch": 0.3433552631578947, "grad_norm": 2.28125, "grad_norm_var": 0.05396219889322917, "learning_rate": 0.0001, "loss": 2.8949, "loss/crossentropy": 2.2670717239379883, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.20936698317527772, "loss/reg": 0.0, "step": 52190 }, { "epoch": 0.34342105263157896, "grad_norm": 2.625, "grad_norm_var": 0.03478190104166667, "learning_rate": 0.0001, "loss": 2.8341, "loss/crossentropy": 2.232769775390625, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.19983401149511337, "loss/reg": 0.0, "step": 52200 }, { "epoch": 0.34348684210526315, "grad_norm": 2.828125, "grad_norm_var": 0.3353800455729167, "learning_rate": 0.0001, "loss": 2.9703, "loss/crossentropy": 2.029405951499939, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.22377823442220687, "loss/reg": 0.0, "step": 52210 }, { "epoch": 0.3435526315789474, "grad_norm": 2.8125, "grad_norm_var": 0.24864908854166667, "learning_rate": 0.0001, "loss": 2.9027, "loss/crossentropy": 2.256174883246422, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.232363311201334, "loss/reg": 0.0, "step": 52220 }, { "epoch": 0.3436184210526316, "grad_norm": 2.46875, "grad_norm_var": 0.03448893229166667, "learning_rate": 0.0001, "loss": 2.8202, "loss/crossentropy": 1.867715910077095, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.17418045848608016, "loss/reg": 0.0, "step": 52230 }, { "epoch": 0.3436842105263158, "grad_norm": 2.921875, "grad_norm_var": 0.053319295247395836, "learning_rate": 0.0001, "loss": 2.9932, "loss/crossentropy": 2.1638339042663572, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.19540889114141463, "loss/reg": 0.0, "step": 52240 }, { "epoch": 0.34375, "grad_norm": 2.796875, "grad_norm_var": 0.0538238525390625, "learning_rate": 0.0001, "loss": 2.9389, "loss/crossentropy": 2.3159988164901733, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.2365440919995308, "loss/reg": 0.0, "step": 52250 }, { "epoch": 0.3438157894736842, "grad_norm": 2.203125, "grad_norm_var": 0.06023763020833333, "learning_rate": 0.0001, "loss": 2.9242, "loss/crossentropy": 2.434188163280487, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.20742014050483704, "loss/reg": 0.0, "step": 52260 }, { "epoch": 0.3438815789473684, "grad_norm": 2.359375, "grad_norm_var": 0.060384114583333336, "learning_rate": 0.0001, "loss": 2.9105, "loss/crossentropy": 2.133386346697807, "loss/hidden": 2.4515625, "loss/incoh": 0.0, "loss/logits": 0.16493220999836922, "loss/reg": 0.0, "step": 52270 }, { "epoch": 0.3439473684210526, "grad_norm": 2.65625, "grad_norm_var": 0.1170806884765625, "learning_rate": 0.0001, "loss": 3.0059, "loss/crossentropy": 2.2425517320632933, "loss/hidden": 2.7796875, "loss/incoh": 0.0, "loss/logits": 0.2439103677868843, "loss/reg": 0.0, "step": 52280 }, { "epoch": 0.34401315789473685, "grad_norm": 3.421875, "grad_norm_var": 0.16565755208333333, "learning_rate": 0.0001, "loss": 2.858, "loss/crossentropy": 2.4354204535484314, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.20877938270568847, "loss/reg": 0.0, "step": 52290 }, { "epoch": 0.34407894736842104, "grad_norm": 1.9921875, "grad_norm_var": 0.12317886352539062, "learning_rate": 0.0001, "loss": 2.8823, "loss/crossentropy": 2.260420361161232, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.2028535097837448, "loss/reg": 0.0, "step": 52300 }, { "epoch": 0.3441447368421053, "grad_norm": 2.328125, "grad_norm_var": 0.12390721638997396, "learning_rate": 0.0001, "loss": 2.8726, "loss/crossentropy": 2.144476592540741, "loss/hidden": 2.9203125, "loss/incoh": 0.0, "loss/logits": 0.23524800688028336, "loss/reg": 0.0, "step": 52310 }, { "epoch": 0.34421052631578947, "grad_norm": 2.265625, "grad_norm_var": 0.031086222330729166, "learning_rate": 0.0001, "loss": 2.8889, "loss/crossentropy": 2.4071126222610473, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.23318285197019578, "loss/reg": 0.0, "step": 52320 }, { "epoch": 0.3442763157894737, "grad_norm": 2.171875, "grad_norm_var": 0.052831776936848956, "learning_rate": 0.0001, "loss": 2.8869, "loss/crossentropy": 2.4069865703582765, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.19570475369691848, "loss/reg": 0.0, "step": 52330 }, { "epoch": 0.3443421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.3637123107910156, "learning_rate": 0.0001, "loss": 2.9654, "loss/crossentropy": 2.187102174758911, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.17945594415068628, "loss/reg": 0.0, "step": 52340 }, { "epoch": 0.3444078947368421, "grad_norm": 2.359375, "grad_norm_var": 0.035970052083333336, "learning_rate": 0.0001, "loss": 2.8687, "loss/crossentropy": 2.2893654823303224, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.21776844635605813, "loss/reg": 0.0, "step": 52350 }, { "epoch": 0.3444736842105263, "grad_norm": 2.34375, "grad_norm_var": 0.06990458170572916, "learning_rate": 0.0001, "loss": 2.9637, "loss/crossentropy": 2.1083612561225893, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.20914254188537598, "loss/reg": 0.0, "step": 52360 }, { "epoch": 0.3445394736842105, "grad_norm": 2.546875, "grad_norm_var": 0.07351786295572917, "learning_rate": 0.0001, "loss": 2.9073, "loss/crossentropy": 1.990160346031189, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.21899903267621995, "loss/reg": 0.0, "step": 52370 }, { "epoch": 0.34460526315789475, "grad_norm": 2.640625, "grad_norm_var": 0.1428375244140625, "learning_rate": 0.0001, "loss": 2.9109, "loss/crossentropy": 2.5973541378974914, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.21541214287281035, "loss/reg": 0.0, "step": 52380 }, { "epoch": 0.34467105263157893, "grad_norm": 2.25, "grad_norm_var": 0.20927327473958332, "learning_rate": 0.0001, "loss": 2.9169, "loss/crossentropy": 2.6102408170700073, "loss/hidden": 2.8703125, "loss/incoh": 0.0, "loss/logits": 0.25710607320070267, "loss/reg": 0.0, "step": 52390 }, { "epoch": 0.3447368421052632, "grad_norm": 2.421875, "grad_norm_var": 0.14179280598958333, "learning_rate": 0.0001, "loss": 2.8832, "loss/crossentropy": 2.0105576276779176, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.17460423409938813, "loss/reg": 0.0, "step": 52400 }, { "epoch": 0.34480263157894736, "grad_norm": 2.140625, "grad_norm_var": 0.5973470052083333, "learning_rate": 0.0001, "loss": 2.8851, "loss/crossentropy": 2.3661433458328247, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.2002450242638588, "loss/reg": 0.0, "step": 52410 }, { "epoch": 0.3448684210526316, "grad_norm": 2.390625, "grad_norm_var": 0.023509724934895834, "learning_rate": 0.0001, "loss": 2.888, "loss/crossentropy": 2.297610378265381, "loss/hidden": 2.5515625, "loss/incoh": 0.0, "loss/logits": 0.181213166564703, "loss/reg": 0.0, "step": 52420 }, { "epoch": 0.3449342105263158, "grad_norm": 2.234375, "grad_norm_var": 0.07720438639322917, "learning_rate": 0.0001, "loss": 2.9554, "loss/crossentropy": 2.3576013684272765, "loss/hidden": 2.5515625, "loss/incoh": 0.0, "loss/logits": 0.18937283605337143, "loss/reg": 0.0, "step": 52430 }, { "epoch": 0.345, "grad_norm": 2.234375, "grad_norm_var": 0.048713938395182295, "learning_rate": 0.0001, "loss": 2.8559, "loss/crossentropy": 2.5495412349700928, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.2298163965344429, "loss/reg": 0.0, "step": 52440 }, { "epoch": 0.3450657894736842, "grad_norm": 2.828125, "grad_norm_var": 0.05360285441080729, "learning_rate": 0.0001, "loss": 2.8932, "loss/crossentropy": 2.316998291015625, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.21764996498823166, "loss/reg": 0.0, "step": 52450 }, { "epoch": 0.3451315789473684, "grad_norm": 2.671875, "grad_norm_var": 0.11072158813476562, "learning_rate": 0.0001, "loss": 2.878, "loss/crossentropy": 2.2441266417503356, "loss/hidden": 2.546875, "loss/incoh": 0.0, "loss/logits": 0.17445197999477385, "loss/reg": 0.0, "step": 52460 }, { "epoch": 0.34519736842105264, "grad_norm": 2.515625, "grad_norm_var": 0.05642267862955729, "learning_rate": 0.0001, "loss": 2.8984, "loss/crossentropy": 2.212930905818939, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.2140599310398102, "loss/reg": 0.0, "step": 52470 }, { "epoch": 0.3452631578947368, "grad_norm": 2.6875, "grad_norm_var": 0.05476786295572917, "learning_rate": 0.0001, "loss": 2.8607, "loss/crossentropy": 2.427528750896454, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.2224172592163086, "loss/reg": 0.0, "step": 52480 }, { "epoch": 0.34532894736842107, "grad_norm": 2.09375, "grad_norm_var": 0.12746988932291667, "learning_rate": 0.0001, "loss": 2.8906, "loss/crossentropy": 2.569909119606018, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.24657644629478453, "loss/reg": 0.0, "step": 52490 }, { "epoch": 0.34539473684210525, "grad_norm": 2.28125, "grad_norm_var": 0.17516276041666667, "learning_rate": 0.0001, "loss": 2.8638, "loss/crossentropy": 2.2866649270057677, "loss/hidden": 2.5984375, "loss/incoh": 0.0, "loss/logits": 0.1972719192504883, "loss/reg": 0.0, "step": 52500 }, { "epoch": 0.3454605263157895, "grad_norm": 2.296875, "grad_norm_var": 0.08682225545247396, "learning_rate": 0.0001, "loss": 2.8351, "loss/crossentropy": 2.0271351039409637, "loss/hidden": 2.515625, "loss/incoh": 0.0, "loss/logits": 0.18033576905727386, "loss/reg": 0.0, "step": 52510 }, { "epoch": 0.3455263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.034468587239583334, "learning_rate": 0.0001, "loss": 2.9051, "loss/crossentropy": 2.189496612548828, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.2023579180240631, "loss/reg": 0.0, "step": 52520 }, { "epoch": 0.34559210526315787, "grad_norm": 2.6875, "grad_norm_var": 0.0360992431640625, "learning_rate": 0.0001, "loss": 2.8771, "loss/crossentropy": 2.2662399649620055, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.202101032435894, "loss/reg": 0.0, "step": 52530 }, { "epoch": 0.3456578947368421, "grad_norm": 2.625, "grad_norm_var": 0.05723368326822917, "learning_rate": 0.0001, "loss": 2.9213, "loss/crossentropy": 2.1396500945091246, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.19974327385425567, "loss/reg": 0.0, "step": 52540 }, { "epoch": 0.3457236842105263, "grad_norm": 2.6875, "grad_norm_var": 0.04128316243489583, "learning_rate": 0.0001, "loss": 2.9754, "loss/crossentropy": 2.360002267360687, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.22255755513906478, "loss/reg": 0.0, "step": 52550 }, { "epoch": 0.34578947368421054, "grad_norm": 2.78125, "grad_norm_var": 0.05250244140625, "learning_rate": 0.0001, "loss": 2.9789, "loss/crossentropy": 2.178372836112976, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.1923023357987404, "loss/reg": 0.0, "step": 52560 }, { "epoch": 0.3458552631578947, "grad_norm": 2.640625, "grad_norm_var": 0.06096598307291667, "learning_rate": 0.0001, "loss": 2.9142, "loss/crossentropy": 2.0009225964546205, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.2003360614180565, "loss/reg": 0.0, "step": 52570 }, { "epoch": 0.34592105263157896, "grad_norm": 2.28125, "grad_norm_var": 0.0634185791015625, "learning_rate": 0.0001, "loss": 2.8671, "loss/crossentropy": 2.4606681942939757, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2598592460155487, "loss/reg": 0.0, "step": 52580 }, { "epoch": 0.34598684210526315, "grad_norm": 2.171875, "grad_norm_var": 0.2886871337890625, "learning_rate": 0.0001, "loss": 2.9024, "loss/crossentropy": 1.9307557940483093, "loss/hidden": 2.925, "loss/incoh": 0.0, "loss/logits": 0.2296278715133667, "loss/reg": 0.0, "step": 52590 }, { "epoch": 0.3460526315789474, "grad_norm": 2.265625, "grad_norm_var": 0.2808258056640625, "learning_rate": 0.0001, "loss": 2.8601, "loss/crossentropy": 2.522324061393738, "loss/hidden": 2.5890625, "loss/incoh": 0.0, "loss/logits": 0.22633564472198486, "loss/reg": 0.0, "step": 52600 }, { "epoch": 0.3461184210526316, "grad_norm": 2.203125, "grad_norm_var": 0.23394266764322916, "learning_rate": 0.0001, "loss": 2.9409, "loss/crossentropy": 1.9623164892196656, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.17356592938303947, "loss/reg": 0.0, "step": 52610 }, { "epoch": 0.3461842105263158, "grad_norm": 2.171875, "grad_norm_var": 0.025422159830729166, "learning_rate": 0.0001, "loss": 2.8864, "loss/crossentropy": 2.2784194707870484, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.20694662481546403, "loss/reg": 0.0, "step": 52620 }, { "epoch": 0.34625, "grad_norm": 2.203125, "grad_norm_var": 0.3293121337890625, "learning_rate": 0.0001, "loss": 3.0971, "loss/crossentropy": 2.4445704340934755, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.24530962407588958, "loss/reg": 0.0, "step": 52630 }, { "epoch": 0.3463157894736842, "grad_norm": 2.203125, "grad_norm_var": 0.3467437744140625, "learning_rate": 0.0001, "loss": 2.9149, "loss/crossentropy": 2.3850828289985655, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.26596111953258517, "loss/reg": 0.0, "step": 52640 }, { "epoch": 0.34638157894736843, "grad_norm": 2.546875, "grad_norm_var": 0.09890848795572917, "learning_rate": 0.0001, "loss": 2.929, "loss/crossentropy": 2.423541986942291, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.19384607076644897, "loss/reg": 0.0, "step": 52650 }, { "epoch": 0.3464473684210526, "grad_norm": 2.46875, "grad_norm_var": 0.0463531494140625, "learning_rate": 0.0001, "loss": 2.8683, "loss/crossentropy": 2.1982513785362245, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.2405138686299324, "loss/reg": 0.0, "step": 52660 }, { "epoch": 0.34651315789473686, "grad_norm": 2.25, "grad_norm_var": 0.014110310872395834, "learning_rate": 0.0001, "loss": 2.8853, "loss/crossentropy": 2.3440744280815125, "loss/hidden": 2.515625, "loss/incoh": 0.0, "loss/logits": 0.18673576563596725, "loss/reg": 0.0, "step": 52670 }, { "epoch": 0.34657894736842104, "grad_norm": 2.203125, "grad_norm_var": 0.04527079264322917, "learning_rate": 0.0001, "loss": 2.9244, "loss/crossentropy": 2.3004306674003603, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.19433464109897614, "loss/reg": 0.0, "step": 52680 }, { "epoch": 0.3466447368421053, "grad_norm": 2.359375, "grad_norm_var": 0.044331868489583336, "learning_rate": 0.0001, "loss": 2.933, "loss/crossentropy": 2.117548030614853, "loss/hidden": 2.578125, "loss/incoh": 0.0, "loss/logits": 0.18942732810974122, "loss/reg": 0.0, "step": 52690 }, { "epoch": 0.34671052631578947, "grad_norm": 2.453125, "grad_norm_var": 0.02642822265625, "learning_rate": 0.0001, "loss": 2.9057, "loss/crossentropy": 2.4710735440254212, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.22848521023988724, "loss/reg": 0.0, "step": 52700 }, { "epoch": 0.3467763157894737, "grad_norm": 2.453125, "grad_norm_var": 0.0428863525390625, "learning_rate": 0.0001, "loss": 2.9336, "loss/crossentropy": 2.416713094711304, "loss/hidden": 2.628125, "loss/incoh": 0.0, "loss/logits": 0.22298106998205186, "loss/reg": 0.0, "step": 52710 }, { "epoch": 0.3468421052631579, "grad_norm": 2.609375, "grad_norm_var": 0.0701080322265625, "learning_rate": 0.0001, "loss": 2.9024, "loss/crossentropy": 2.273067903518677, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.20538954362273215, "loss/reg": 0.0, "step": 52720 }, { "epoch": 0.3469078947368421, "grad_norm": 2.515625, "grad_norm_var": 0.11046549479166666, "learning_rate": 0.0001, "loss": 2.9113, "loss/crossentropy": 2.313697862625122, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.20560828298330308, "loss/reg": 0.0, "step": 52730 }, { "epoch": 0.3469736842105263, "grad_norm": 2.421875, "grad_norm_var": 0.11502176920572917, "learning_rate": 0.0001, "loss": 3.0057, "loss/crossentropy": 2.470817005634308, "loss/hidden": 3.059375, "loss/incoh": 0.0, "loss/logits": 0.3368117868900299, "loss/reg": 0.0, "step": 52740 }, { "epoch": 0.3470394736842105, "grad_norm": 2.34375, "grad_norm_var": 0.13078384399414061, "learning_rate": 0.0001, "loss": 2.9351, "loss/crossentropy": 2.034555125236511, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.20478889867663383, "loss/reg": 0.0, "step": 52750 }, { "epoch": 0.34710526315789475, "grad_norm": 2.296875, "grad_norm_var": 0.11770426432291667, "learning_rate": 0.0001, "loss": 2.9626, "loss/crossentropy": 2.359877645969391, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.20988256633281707, "loss/reg": 0.0, "step": 52760 }, { "epoch": 0.34717105263157894, "grad_norm": 2.171875, "grad_norm_var": 0.02021484375, "learning_rate": 0.0001, "loss": 2.9151, "loss/crossentropy": 2.1640021204948425, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.24370682537555693, "loss/reg": 0.0, "step": 52770 }, { "epoch": 0.3472368421052632, "grad_norm": 2.46875, "grad_norm_var": 0.024051920572916666, "learning_rate": 0.0001, "loss": 2.8893, "loss/crossentropy": 2.3070614457130434, "loss/hidden": 2.5703125, "loss/incoh": 0.0, "loss/logits": 0.1925300642848015, "loss/reg": 0.0, "step": 52780 }, { "epoch": 0.34730263157894736, "grad_norm": 2.53125, "grad_norm_var": 0.015885416666666666, "learning_rate": 0.0001, "loss": 2.9423, "loss/crossentropy": 2.2626334190368653, "loss/hidden": 2.8859375, "loss/incoh": 0.0, "loss/logits": 0.2612622074782848, "loss/reg": 0.0, "step": 52790 }, { "epoch": 0.3473684210526316, "grad_norm": 2.546875, "grad_norm_var": 0.023470052083333335, "learning_rate": 0.0001, "loss": 2.9248, "loss/crossentropy": 2.127668726444244, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.1950176738202572, "loss/reg": 0.0, "step": 52800 }, { "epoch": 0.3474342105263158, "grad_norm": 3.390625, "grad_norm_var": 0.10909830729166667, "learning_rate": 0.0001, "loss": 2.9715, "loss/crossentropy": 2.659942126274109, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.2352418839931488, "loss/reg": 0.0, "step": 52810 }, { "epoch": 0.3475, "grad_norm": 2.140625, "grad_norm_var": 0.26939697265625, "learning_rate": 0.0001, "loss": 2.9045, "loss/crossentropy": 2.3262238264083863, "loss/hidden": 2.9171875, "loss/incoh": 0.0, "loss/logits": 0.29094011783599855, "loss/reg": 0.0, "step": 52820 }, { "epoch": 0.3475657894736842, "grad_norm": 2.296875, "grad_norm_var": 0.0359283447265625, "learning_rate": 0.0001, "loss": 2.921, "loss/crossentropy": 2.2226936340332033, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.2458650901913643, "loss/reg": 0.0, "step": 52830 }, { "epoch": 0.3476315789473684, "grad_norm": 2.1875, "grad_norm_var": 0.02490234375, "learning_rate": 0.0001, "loss": 2.8981, "loss/crossentropy": 2.0966040670871733, "loss/hidden": 2.5640625, "loss/incoh": 0.0, "loss/logits": 0.20251986682415007, "loss/reg": 0.0, "step": 52840 }, { "epoch": 0.34769736842105264, "grad_norm": 2.546875, "grad_norm_var": 0.20653889973958334, "learning_rate": 0.0001, "loss": 2.9262, "loss/crossentropy": 2.196694779396057, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.20171415954828262, "loss/reg": 0.0, "step": 52850 }, { "epoch": 0.34776315789473683, "grad_norm": 2.40625, "grad_norm_var": 0.18264567057291667, "learning_rate": 0.0001, "loss": 2.9324, "loss/crossentropy": 2.358584666252136, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.22144707068800926, "loss/reg": 0.0, "step": 52860 }, { "epoch": 0.34782894736842107, "grad_norm": 2.4375, "grad_norm_var": 0.04899800618489583, "learning_rate": 0.0001, "loss": 2.9673, "loss/crossentropy": 2.2547359704971313, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.23645546436309814, "loss/reg": 0.0, "step": 52870 }, { "epoch": 0.34789473684210526, "grad_norm": 3.015625, "grad_norm_var": 1.6511067708333333, "learning_rate": 0.0001, "loss": 2.896, "loss/crossentropy": 1.9594634771347046, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.20984201580286027, "loss/reg": 0.0, "step": 52880 }, { "epoch": 0.3479605263157895, "grad_norm": 2.1875, "grad_norm_var": 0.06841532389322917, "learning_rate": 0.0001, "loss": 3.003, "loss/crossentropy": 2.1890581130981444, "loss/hidden": 3.05625, "loss/incoh": 0.0, "loss/logits": 0.26319274455308916, "loss/reg": 0.0, "step": 52890 }, { "epoch": 0.3480263157894737, "grad_norm": 2.421875, "grad_norm_var": 0.0427642822265625, "learning_rate": 0.0001, "loss": 2.9732, "loss/crossentropy": 2.0610204577445983, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.2125498190522194, "loss/reg": 0.0, "step": 52900 }, { "epoch": 0.34809210526315787, "grad_norm": 2.25, "grad_norm_var": 0.026927693684895834, "learning_rate": 0.0001, "loss": 2.8925, "loss/crossentropy": 2.313750076293945, "loss/hidden": 2.6, "loss/incoh": 0.0, "loss/logits": 0.20246998742222785, "loss/reg": 0.0, "step": 52910 }, { "epoch": 0.3481578947368421, "grad_norm": 2.484375, "grad_norm_var": 1.1937408447265625, "learning_rate": 0.0001, "loss": 2.9255, "loss/crossentropy": 2.21316112279892, "loss/hidden": 2.5796875, "loss/incoh": 0.0, "loss/logits": 0.19719128012657167, "loss/reg": 0.0, "step": 52920 }, { "epoch": 0.3482236842105263, "grad_norm": 2.234375, "grad_norm_var": 1.1394490559895833, "learning_rate": 0.0001, "loss": 2.947, "loss/crossentropy": 2.291529965400696, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.1981390655040741, "loss/reg": 0.0, "step": 52930 }, { "epoch": 0.34828947368421054, "grad_norm": 1.9765625, "grad_norm_var": 0.06852188110351562, "learning_rate": 0.0001, "loss": 2.9408, "loss/crossentropy": 2.5819833517074584, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.2240348443388939, "loss/reg": 0.0, "step": 52940 }, { "epoch": 0.3483552631578947, "grad_norm": 2.328125, "grad_norm_var": 0.06681900024414063, "learning_rate": 0.0001, "loss": 2.9182, "loss/crossentropy": 2.1857009410858153, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.23530044704675673, "loss/reg": 0.0, "step": 52950 }, { "epoch": 0.34842105263157896, "grad_norm": 2.171875, "grad_norm_var": 0.03167317708333333, "learning_rate": 0.0001, "loss": 2.9057, "loss/crossentropy": 2.274757134914398, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.20489104390144347, "loss/reg": 0.0, "step": 52960 }, { "epoch": 0.34848684210526315, "grad_norm": 2.484375, "grad_norm_var": 0.04495035807291667, "learning_rate": 0.0001, "loss": 2.9553, "loss/crossentropy": 2.4982900261878966, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.20264890193939208, "loss/reg": 0.0, "step": 52970 }, { "epoch": 0.3485526315789474, "grad_norm": 2.671875, "grad_norm_var": 0.049560546875, "learning_rate": 0.0001, "loss": 2.8984, "loss/crossentropy": 2.253995645046234, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.18697913587093354, "loss/reg": 0.0, "step": 52980 }, { "epoch": 0.3486184210526316, "grad_norm": 3.46875, "grad_norm_var": 0.09401041666666667, "learning_rate": 0.0001, "loss": 2.9092, "loss/crossentropy": 2.363197720050812, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.20556249022483825, "loss/reg": 0.0, "step": 52990 }, { "epoch": 0.34868421052631576, "grad_norm": 3.3125, "grad_norm_var": 0.14605712890625, "learning_rate": 0.0001, "loss": 2.9517, "loss/crossentropy": 2.2076730370521545, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.19253380447626114, "loss/reg": 0.0, "step": 53000 }, { "epoch": 0.34875, "grad_norm": 2.046875, "grad_norm_var": 0.16629130045572918, "learning_rate": 0.0001, "loss": 2.9056, "loss/crossentropy": 2.1950744926929473, "loss/hidden": 2.8203125, "loss/incoh": 0.0, "loss/logits": 0.20323417633771895, "loss/reg": 0.0, "step": 53010 }, { "epoch": 0.3488157894736842, "grad_norm": 2.109375, "grad_norm_var": 0.15131607055664062, "learning_rate": 0.0001, "loss": 2.8845, "loss/crossentropy": 2.0936007738113402, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.2003942146897316, "loss/reg": 0.0, "step": 53020 }, { "epoch": 0.34888157894736843, "grad_norm": 5.46875, "grad_norm_var": 0.7161293029785156, "learning_rate": 0.0001, "loss": 2.9915, "loss/crossentropy": 2.171491777896881, "loss/hidden": 2.546875, "loss/incoh": 0.0, "loss/logits": 0.17315723448991777, "loss/reg": 0.0, "step": 53030 }, { "epoch": 0.3489473684210526, "grad_norm": 2.9375, "grad_norm_var": 0.6350331624348958, "learning_rate": 0.0001, "loss": 2.9089, "loss/crossentropy": 2.1484507620334625, "loss/hidden": 2.6078125, "loss/incoh": 0.0, "loss/logits": 0.19547340273857117, "loss/reg": 0.0, "step": 53040 }, { "epoch": 0.34901315789473686, "grad_norm": 2.21875, "grad_norm_var": 0.04049479166666667, "learning_rate": 0.0001, "loss": 2.9115, "loss/crossentropy": 2.4049624681472777, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.21436695456504823, "loss/reg": 0.0, "step": 53050 }, { "epoch": 0.34907894736842104, "grad_norm": 2.140625, "grad_norm_var": 0.03630269368489583, "learning_rate": 0.0001, "loss": 2.8063, "loss/crossentropy": 2.2809816241264342, "loss/hidden": 2.5, "loss/incoh": 0.0, "loss/logits": 0.18145203590393066, "loss/reg": 0.0, "step": 53060 }, { "epoch": 0.3491447368421053, "grad_norm": 2.171875, "grad_norm_var": 0.11983413696289062, "learning_rate": 0.0001, "loss": 2.8986, "loss/crossentropy": 2.0428597807884215, "loss/hidden": 2.8, "loss/incoh": 0.0, "loss/logits": 0.17800327241420746, "loss/reg": 0.0, "step": 53070 }, { "epoch": 0.34921052631578947, "grad_norm": 2.109375, "grad_norm_var": 0.08097305297851562, "learning_rate": 0.0001, "loss": 2.872, "loss/crossentropy": 2.2249770164489746, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.2294459342956543, "loss/reg": 0.0, "step": 53080 }, { "epoch": 0.34927631578947366, "grad_norm": 2.265625, "grad_norm_var": 0.030594889322916666, "learning_rate": 0.0001, "loss": 2.913, "loss/crossentropy": 2.2657626271247864, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.21968457400798796, "loss/reg": 0.0, "step": 53090 }, { "epoch": 0.3493421052631579, "grad_norm": 2.421875, "grad_norm_var": 0.043578084309895834, "learning_rate": 0.0001, "loss": 2.9392, "loss/crossentropy": 2.329882037639618, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.23364192992448807, "loss/reg": 0.0, "step": 53100 }, { "epoch": 0.3494078947368421, "grad_norm": 2.609375, "grad_norm_var": 0.015576171875, "learning_rate": 0.0001, "loss": 2.969, "loss/crossentropy": 2.6655625820159914, "loss/hidden": 2.578125, "loss/incoh": 0.0, "loss/logits": 0.21384950429201127, "loss/reg": 0.0, "step": 53110 }, { "epoch": 0.3494736842105263, "grad_norm": 2.5, "grad_norm_var": 0.02418212890625, "learning_rate": 0.0001, "loss": 2.9262, "loss/crossentropy": 2.165694499015808, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.19207849726080894, "loss/reg": 0.0, "step": 53120 }, { "epoch": 0.3495394736842105, "grad_norm": 1.8984375, "grad_norm_var": 0.046333567301432295, "learning_rate": 0.0001, "loss": 2.8556, "loss/crossentropy": 2.374967670440674, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.2113099917769432, "loss/reg": 0.0, "step": 53130 }, { "epoch": 0.34960526315789475, "grad_norm": 2.296875, "grad_norm_var": 0.10058364868164063, "learning_rate": 0.0001, "loss": 2.8513, "loss/crossentropy": 2.0823599696159363, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.2026284359395504, "loss/reg": 0.0, "step": 53140 }, { "epoch": 0.34967105263157894, "grad_norm": 2.359375, "grad_norm_var": 0.094873046875, "learning_rate": 0.0001, "loss": 2.9172, "loss/crossentropy": 2.236270558834076, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.1926701359450817, "loss/reg": 0.0, "step": 53150 }, { "epoch": 0.3497368421052632, "grad_norm": 2.21875, "grad_norm_var": 0.06276753743489584, "learning_rate": 0.0001, "loss": 2.9538, "loss/crossentropy": 2.425718867778778, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.22293473035097122, "loss/reg": 0.0, "step": 53160 }, { "epoch": 0.34980263157894737, "grad_norm": 2.53125, "grad_norm_var": 0.048981730143229166, "learning_rate": 0.0001, "loss": 2.8934, "loss/crossentropy": 2.41066859960556, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.22172476947307587, "loss/reg": 0.0, "step": 53170 }, { "epoch": 0.3498684210526316, "grad_norm": 2.578125, "grad_norm_var": 0.11552734375, "learning_rate": 0.0001, "loss": 2.9824, "loss/crossentropy": 1.9422068178653717, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.2106860101222992, "loss/reg": 0.0, "step": 53180 }, { "epoch": 0.3499342105263158, "grad_norm": 2.453125, "grad_norm_var": 0.12795817057291667, "learning_rate": 0.0001, "loss": 2.8718, "loss/crossentropy": 2.383823883533478, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.23299463838338852, "loss/reg": 0.0, "step": 53190 }, { "epoch": 0.35, "grad_norm": 2.359375, "grad_norm_var": 0.0331451416015625, "learning_rate": 0.0001, "loss": 2.8794, "loss/crossentropy": 2.267080211639404, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.23116162717342376, "loss/reg": 0.0, "step": 53200 }, { "epoch": 0.3500657894736842, "grad_norm": 2.3125, "grad_norm_var": 0.040913899739583336, "learning_rate": 0.0001, "loss": 2.8663, "loss/crossentropy": 2.2024850130081175, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.19921650141477584, "loss/reg": 0.0, "step": 53210 }, { "epoch": 0.3501315789473684, "grad_norm": 2.5, "grad_norm_var": 0.08872782389322917, "learning_rate": 0.0001, "loss": 2.9135, "loss/crossentropy": 2.251377832889557, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.2061144694685936, "loss/reg": 0.0, "step": 53220 }, { "epoch": 0.35019736842105265, "grad_norm": 2.109375, "grad_norm_var": 0.07911783854166667, "learning_rate": 0.0001, "loss": 2.8245, "loss/crossentropy": 2.210775279998779, "loss/hidden": 2.56875, "loss/incoh": 0.0, "loss/logits": 0.18275636583566665, "loss/reg": 0.0, "step": 53230 }, { "epoch": 0.35026315789473683, "grad_norm": 2.21875, "grad_norm_var": 0.061442057291666664, "learning_rate": 0.0001, "loss": 2.9252, "loss/crossentropy": 2.1525847792625425, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.2072019025683403, "loss/reg": 0.0, "step": 53240 }, { "epoch": 0.3503289473684211, "grad_norm": 2.46875, "grad_norm_var": 0.0503082275390625, "learning_rate": 0.0001, "loss": 2.9076, "loss/crossentropy": 2.455315959453583, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.22518844306468963, "loss/reg": 0.0, "step": 53250 }, { "epoch": 0.35039473684210526, "grad_norm": 2.625, "grad_norm_var": 0.13922526041666666, "learning_rate": 0.0001, "loss": 2.9284, "loss/crossentropy": 2.4326691150665285, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.2264757752418518, "loss/reg": 0.0, "step": 53260 }, { "epoch": 0.3504605263157895, "grad_norm": 2.453125, "grad_norm_var": 0.10966796875, "learning_rate": 0.0001, "loss": 2.9312, "loss/crossentropy": 2.5383957386016847, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.23442549407482147, "loss/reg": 0.0, "step": 53270 }, { "epoch": 0.3505263157894737, "grad_norm": 2.53125, "grad_norm_var": 0.08587239583333334, "learning_rate": 0.0001, "loss": 2.9152, "loss/crossentropy": 2.1879223942756654, "loss/hidden": 2.875, "loss/incoh": 0.0, "loss/logits": 0.2700294405221939, "loss/reg": 0.0, "step": 53280 }, { "epoch": 0.35059210526315787, "grad_norm": 2.1875, "grad_norm_var": 0.0985748291015625, "learning_rate": 0.0001, "loss": 2.8722, "loss/crossentropy": 2.2695354580879212, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.201190747320652, "loss/reg": 0.0, "step": 53290 }, { "epoch": 0.3506578947368421, "grad_norm": 2.328125, "grad_norm_var": 0.09804585774739584, "learning_rate": 0.0001, "loss": 2.9564, "loss/crossentropy": 2.409105050563812, "loss/hidden": 3.05, "loss/incoh": 0.0, "loss/logits": 0.22042183578014374, "loss/reg": 0.0, "step": 53300 }, { "epoch": 0.3507236842105263, "grad_norm": 2.859375, "grad_norm_var": 0.16172587076822917, "learning_rate": 0.0001, "loss": 2.9011, "loss/crossentropy": 2.0697962284088134, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.20846696048974991, "loss/reg": 0.0, "step": 53310 }, { "epoch": 0.35078947368421054, "grad_norm": 2.921875, "grad_norm_var": 0.22058817545572917, "learning_rate": 0.0001, "loss": 2.9233, "loss/crossentropy": 2.5253331542015074, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.2014812245965004, "loss/reg": 0.0, "step": 53320 }, { "epoch": 0.3508552631578947, "grad_norm": 2.546875, "grad_norm_var": 0.1439361572265625, "learning_rate": 0.0001, "loss": 2.9038, "loss/crossentropy": 2.3409982323646545, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.20108845084905624, "loss/reg": 0.0, "step": 53330 }, { "epoch": 0.35092105263157897, "grad_norm": 1895825408.0, "grad_norm_var": 2.2463462302055136e+17, "learning_rate": 0.0001, "loss": 3.0537, "loss/crossentropy": 2.154493510723114, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.23148069977760316, "loss/reg": 0.0, "step": 53340 }, { "epoch": 0.35098684210526315, "grad_norm": 2.765625, "grad_norm_var": 2.2463462299956893e+17, "learning_rate": 0.0001, "loss": 2.9617, "loss/crossentropy": 2.194254159927368, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.2319786585867405, "loss/reg": 0.0, "step": 53350 }, { "epoch": 0.3510526315789474, "grad_norm": 2.765625, "grad_norm_var": 0.040787760416666666, "learning_rate": 0.0001, "loss": 2.9344, "loss/crossentropy": 2.80464528799057, "loss/hidden": 2.8265625, "loss/incoh": 0.0, "loss/logits": 0.2226565033197403, "loss/reg": 0.0, "step": 53360 }, { "epoch": 0.3511184210526316, "grad_norm": 2.09375, "grad_norm_var": 0.040816243489583334, "learning_rate": 0.0001, "loss": 2.9097, "loss/crossentropy": 2.1148053646087646, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.20593088418245314, "loss/reg": 0.0, "step": 53370 }, { "epoch": 0.35118421052631577, "grad_norm": 2.09375, "grad_norm_var": 0.06224339803059896, "learning_rate": 0.0001, "loss": 2.9096, "loss/crossentropy": 2.4599186182022095, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.228793229162693, "loss/reg": 0.0, "step": 53380 }, { "epoch": 0.35125, "grad_norm": 2.265625, "grad_norm_var": 0.03495457967122396, "learning_rate": 0.0001, "loss": 2.8845, "loss/crossentropy": 2.3211817145347595, "loss/hidden": 2.8296875, "loss/incoh": 0.0, "loss/logits": 0.2604089707136154, "loss/reg": 0.0, "step": 53390 }, { "epoch": 0.3513157894736842, "grad_norm": 2.3125, "grad_norm_var": 0.025581868489583333, "learning_rate": 0.0001, "loss": 2.8878, "loss/crossentropy": 2.1246866464614866, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.21205822005867958, "loss/reg": 0.0, "step": 53400 }, { "epoch": 0.35138157894736843, "grad_norm": 2.484375, "grad_norm_var": 0.0955474853515625, "learning_rate": 0.0001, "loss": 2.8912, "loss/crossentropy": 2.3926185011863708, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.22541766241192818, "loss/reg": 0.0, "step": 53410 }, { "epoch": 0.3514473684210526, "grad_norm": 2.640625, "grad_norm_var": 0.27461649576822916, "learning_rate": 0.0001, "loss": 2.9729, "loss/crossentropy": 2.145272135734558, "loss/hidden": 2.8984375, "loss/incoh": 0.0, "loss/logits": 0.23754970729351044, "loss/reg": 0.0, "step": 53420 }, { "epoch": 0.35151315789473686, "grad_norm": 2.28125, "grad_norm_var": 0.23486226399739582, "learning_rate": 0.0001, "loss": 2.8578, "loss/crossentropy": 2.0977771043777467, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.19633346870541574, "loss/reg": 0.0, "step": 53430 }, { "epoch": 0.35157894736842105, "grad_norm": 2.59375, "grad_norm_var": 0.025788370768229166, "learning_rate": 0.0001, "loss": 2.9026, "loss/crossentropy": 2.1087430715560913, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.18483442813158035, "loss/reg": 0.0, "step": 53440 }, { "epoch": 0.3516447368421053, "grad_norm": 2.375, "grad_norm_var": 0.0561920166015625, "learning_rate": 0.0001, "loss": 2.9032, "loss/crossentropy": 2.0478337526321413, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.2215722106397152, "loss/reg": 0.0, "step": 53450 }, { "epoch": 0.3517105263157895, "grad_norm": 2.15625, "grad_norm_var": 0.092041015625, "learning_rate": 0.0001, "loss": 2.895, "loss/crossentropy": 2.0904667735099793, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.1862471416592598, "loss/reg": 0.0, "step": 53460 }, { "epoch": 0.35177631578947366, "grad_norm": 2.421875, "grad_norm_var": 0.03691304524739583, "learning_rate": 0.0001, "loss": 2.9141, "loss/crossentropy": 2.1221193432807923, "loss/hidden": 2.515625, "loss/incoh": 0.0, "loss/logits": 0.16524556577205657, "loss/reg": 0.0, "step": 53470 }, { "epoch": 0.3518421052631579, "grad_norm": 2.203125, "grad_norm_var": 0.059130859375, "learning_rate": 0.0001, "loss": 2.8869, "loss/crossentropy": 2.309184992313385, "loss/hidden": 2.5515625, "loss/incoh": 0.0, "loss/logits": 0.19241107851266862, "loss/reg": 0.0, "step": 53480 }, { "epoch": 0.3519078947368421, "grad_norm": 2.328125, "grad_norm_var": 0.0243072509765625, "learning_rate": 0.0001, "loss": 2.8936, "loss/crossentropy": 2.440667998790741, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.21108296141028404, "loss/reg": 0.0, "step": 53490 }, { "epoch": 0.3519736842105263, "grad_norm": 3.375, "grad_norm_var": 0.37263081868489584, "learning_rate": 0.0001, "loss": 2.9877, "loss/crossentropy": 2.1306526899337768, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.20857866555452348, "loss/reg": 0.0, "step": 53500 }, { "epoch": 0.3520394736842105, "grad_norm": 2.578125, "grad_norm_var": 0.14471028645833334, "learning_rate": 0.0001, "loss": 2.9742, "loss/crossentropy": 2.5325379848480223, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.25906131863594056, "loss/reg": 0.0, "step": 53510 }, { "epoch": 0.35210526315789475, "grad_norm": 2.3125, "grad_norm_var": 0.18970947265625, "learning_rate": 0.0001, "loss": 2.9709, "loss/crossentropy": 2.350108063220978, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.23208711445331573, "loss/reg": 0.0, "step": 53520 }, { "epoch": 0.35217105263157894, "grad_norm": 2.5, "grad_norm_var": 0.41080322265625, "learning_rate": 0.0001, "loss": 2.9166, "loss/crossentropy": 2.1673108339309692, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.18917050063610077, "loss/reg": 0.0, "step": 53530 }, { "epoch": 0.3522368421052632, "grad_norm": 2.25, "grad_norm_var": 0.31845296223958336, "learning_rate": 0.0001, "loss": 2.9447, "loss/crossentropy": 2.1697116017341616, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.188032928109169, "loss/reg": 0.0, "step": 53540 }, { "epoch": 0.35230263157894737, "grad_norm": 2.875, "grad_norm_var": 0.0990386962890625, "learning_rate": 0.0001, "loss": 2.9498, "loss/crossentropy": 2.1415011525154113, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.22257547676563263, "loss/reg": 0.0, "step": 53550 }, { "epoch": 0.35236842105263155, "grad_norm": 2.734375, "grad_norm_var": 0.03632710774739583, "learning_rate": 0.0001, "loss": 2.9427, "loss/crossentropy": 2.3115553855895996, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.23712053447961806, "loss/reg": 0.0, "step": 53560 }, { "epoch": 0.3524342105263158, "grad_norm": 2.421875, "grad_norm_var": 0.0578521728515625, "learning_rate": 0.0001, "loss": 2.9527, "loss/crossentropy": 1.938773113489151, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.22507890835404396, "loss/reg": 0.0, "step": 53570 }, { "epoch": 0.3525, "grad_norm": 2.34375, "grad_norm_var": 0.052155558268229166, "learning_rate": 0.0001, "loss": 2.9346, "loss/crossentropy": 2.0930636167526244, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.2004963830113411, "loss/reg": 0.0, "step": 53580 }, { "epoch": 0.3525657894736842, "grad_norm": 2.625, "grad_norm_var": 0.03687744140625, "learning_rate": 0.0001, "loss": 2.9347, "loss/crossentropy": 2.144264447689056, "loss/hidden": 2.5015625, "loss/incoh": 0.0, "loss/logits": 0.18451452851295472, "loss/reg": 0.0, "step": 53590 }, { "epoch": 0.3526315789473684, "grad_norm": 2.265625, "grad_norm_var": 0.2621256510416667, "learning_rate": 0.0001, "loss": 2.9528, "loss/crossentropy": 2.4426235437393187, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.20080262571573257, "loss/reg": 0.0, "step": 53600 }, { "epoch": 0.35269736842105265, "grad_norm": 2.328125, "grad_norm_var": 0.2556549072265625, "learning_rate": 0.0001, "loss": 2.8751, "loss/crossentropy": 2.267673724889755, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.2533807635307312, "loss/reg": 0.0, "step": 53610 }, { "epoch": 0.35276315789473683, "grad_norm": 2.1875, "grad_norm_var": 0.05120442708333333, "learning_rate": 0.0001, "loss": 2.85, "loss/crossentropy": 2.176706624031067, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.21297791302204133, "loss/reg": 0.0, "step": 53620 }, { "epoch": 0.3528289473684211, "grad_norm": 2.265625, "grad_norm_var": 0.1325347900390625, "learning_rate": 0.0001, "loss": 2.8287, "loss/crossentropy": 2.394326329231262, "loss/hidden": 2.56875, "loss/incoh": 0.0, "loss/logits": 0.22034791558980943, "loss/reg": 0.0, "step": 53630 }, { "epoch": 0.35289473684210526, "grad_norm": 2.578125, "grad_norm_var": 0.1140289306640625, "learning_rate": 0.0001, "loss": 2.8418, "loss/crossentropy": 2.2925031185150146, "loss/hidden": 2.4953125, "loss/incoh": 0.0, "loss/logits": 0.16971914768218993, "loss/reg": 0.0, "step": 53640 }, { "epoch": 0.35296052631578945, "grad_norm": 2.25, "grad_norm_var": 0.161865234375, "learning_rate": 0.0001, "loss": 2.929, "loss/crossentropy": 2.129422640800476, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2229134738445282, "loss/reg": 0.0, "step": 53650 }, { "epoch": 0.3530263157894737, "grad_norm": 2.890625, "grad_norm_var": 0.1974761962890625, "learning_rate": 0.0001, "loss": 2.9551, "loss/crossentropy": 1.9090525090694428, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.2006974697113037, "loss/reg": 0.0, "step": 53660 }, { "epoch": 0.3530921052631579, "grad_norm": 3.0, "grad_norm_var": 0.12976786295572917, "learning_rate": 0.0001, "loss": 2.908, "loss/crossentropy": 2.4558427572250365, "loss/hidden": 2.8734375, "loss/incoh": 0.0, "loss/logits": 0.22563381046056746, "loss/reg": 0.0, "step": 53670 }, { "epoch": 0.3531578947368421, "grad_norm": 2.296875, "grad_norm_var": 0.07714436848958334, "learning_rate": 0.0001, "loss": 2.904, "loss/crossentropy": 2.3546396851539613, "loss/hidden": 2.4890625, "loss/incoh": 0.0, "loss/logits": 0.18249461725354194, "loss/reg": 0.0, "step": 53680 }, { "epoch": 0.3532236842105263, "grad_norm": 2.34375, "grad_norm_var": 0.07180582682291667, "learning_rate": 0.0001, "loss": 2.8627, "loss/crossentropy": 2.3418611526489257, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.22591341882944108, "loss/reg": 0.0, "step": 53690 }, { "epoch": 0.35328947368421054, "grad_norm": 2.421875, "grad_norm_var": 0.13683268229166667, "learning_rate": 0.0001, "loss": 2.8599, "loss/crossentropy": 2.2972548246383666, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.2246960759162903, "loss/reg": 0.0, "step": 53700 }, { "epoch": 0.35335526315789473, "grad_norm": 2.28125, "grad_norm_var": 0.19558919270833333, "learning_rate": 0.0001, "loss": 2.9162, "loss/crossentropy": 2.2283814549446106, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.211201773583889, "loss/reg": 0.0, "step": 53710 }, { "epoch": 0.35342105263157897, "grad_norm": 2.359375, "grad_norm_var": 0.19407145182291666, "learning_rate": 0.0001, "loss": 2.8757, "loss/crossentropy": 2.337044906616211, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.21877549439668656, "loss/reg": 0.0, "step": 53720 }, { "epoch": 0.35348684210526315, "grad_norm": 2.3125, "grad_norm_var": 0.10864232381184896, "learning_rate": 0.0001, "loss": 2.851, "loss/crossentropy": 2.626676607131958, "loss/hidden": 2.603125, "loss/incoh": 0.0, "loss/logits": 0.20544309467077254, "loss/reg": 0.0, "step": 53730 }, { "epoch": 0.3535526315789474, "grad_norm": 3.015625, "grad_norm_var": 0.08515625, "learning_rate": 0.0001, "loss": 2.9063, "loss/crossentropy": 2.2013691425323487, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.21109165698289872, "loss/reg": 0.0, "step": 53740 }, { "epoch": 0.3536184210526316, "grad_norm": 2.609375, "grad_norm_var": 0.05159403483072917, "learning_rate": 0.0001, "loss": 2.969, "loss/crossentropy": 2.4265732765197754, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.21013133227825165, "loss/reg": 0.0, "step": 53750 }, { "epoch": 0.35368421052631577, "grad_norm": 2.53125, "grad_norm_var": 0.030257161458333334, "learning_rate": 0.0001, "loss": 2.9642, "loss/crossentropy": 2.449740409851074, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.24221272766590118, "loss/reg": 0.0, "step": 53760 }, { "epoch": 0.35375, "grad_norm": 2.359375, "grad_norm_var": 0.045731608072916666, "learning_rate": 0.0001, "loss": 2.8731, "loss/crossentropy": 2.262735891342163, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.21945056170225144, "loss/reg": 0.0, "step": 53770 }, { "epoch": 0.3538157894736842, "grad_norm": 2.59375, "grad_norm_var": 0.07299702962239583, "learning_rate": 0.0001, "loss": 2.8692, "loss/crossentropy": 2.1168339014053346, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2025334656238556, "loss/reg": 0.0, "step": 53780 }, { "epoch": 0.35388157894736844, "grad_norm": 2.671875, "grad_norm_var": 0.1554840087890625, "learning_rate": 0.0001, "loss": 2.9297, "loss/crossentropy": 2.3220163106918337, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.1911177471280098, "loss/reg": 0.0, "step": 53790 }, { "epoch": 0.3539473684210526, "grad_norm": 2.53125, "grad_norm_var": 0.16155192057291667, "learning_rate": 0.0001, "loss": 2.8724, "loss/crossentropy": 2.294942057132721, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.2255813866853714, "loss/reg": 0.0, "step": 53800 }, { "epoch": 0.35401315789473686, "grad_norm": 3.0, "grad_norm_var": 0.5092030843098958, "learning_rate": 0.0001, "loss": 2.989, "loss/crossentropy": 2.34811726808548, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.21754570603370665, "loss/reg": 0.0, "step": 53810 }, { "epoch": 0.35407894736842105, "grad_norm": 2.203125, "grad_norm_var": 0.5641171773274739, "learning_rate": 0.0001, "loss": 2.8268, "loss/crossentropy": 2.295544970035553, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.20054676085710527, "loss/reg": 0.0, "step": 53820 }, { "epoch": 0.3541447368421053, "grad_norm": 2.234375, "grad_norm_var": 0.11310933430989584, "learning_rate": 0.0001, "loss": 2.9152, "loss/crossentropy": 2.153967136144638, "loss/hidden": 2.4984375, "loss/incoh": 0.0, "loss/logits": 0.17787317112088202, "loss/reg": 0.0, "step": 53830 }, { "epoch": 0.3542105263157895, "grad_norm": 2.234375, "grad_norm_var": 0.04741109212239583, "learning_rate": 0.0001, "loss": 2.9033, "loss/crossentropy": 2.33048255443573, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.22133387476205826, "loss/reg": 0.0, "step": 53840 }, { "epoch": 0.35427631578947366, "grad_norm": 2.375, "grad_norm_var": 0.03138020833333333, "learning_rate": 0.0001, "loss": 2.8927, "loss/crossentropy": 2.191905605792999, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.2066198319196701, "loss/reg": 0.0, "step": 53850 }, { "epoch": 0.3543421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.08849995930989583, "learning_rate": 0.0001, "loss": 2.8767, "loss/crossentropy": 2.347062826156616, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.20969609469175338, "loss/reg": 0.0, "step": 53860 }, { "epoch": 0.3544078947368421, "grad_norm": 2.203125, "grad_norm_var": 0.06896158854166666, "learning_rate": 0.0001, "loss": 2.9134, "loss/crossentropy": 2.356901979446411, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.21779165267944336, "loss/reg": 0.0, "step": 53870 }, { "epoch": 0.35447368421052633, "grad_norm": 2.34375, "grad_norm_var": 0.07974344889322917, "learning_rate": 0.0001, "loss": 2.9101, "loss/crossentropy": 2.50042062997818, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.28171384483575823, "loss/reg": 0.0, "step": 53880 }, { "epoch": 0.3545394736842105, "grad_norm": 4.5, "grad_norm_var": 0.3167317708333333, "learning_rate": 0.0001, "loss": 2.8769, "loss/crossentropy": 2.4439536452293398, "loss/hidden": 2.8390625, "loss/incoh": 0.0, "loss/logits": 0.31821159422397616, "loss/reg": 0.0, "step": 53890 }, { "epoch": 0.35460526315789476, "grad_norm": 4.46875, "grad_norm_var": 0.5545806884765625, "learning_rate": 0.0001, "loss": 2.9373, "loss/crossentropy": 2.150843107700348, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.2274655967950821, "loss/reg": 0.0, "step": 53900 }, { "epoch": 0.35467105263157894, "grad_norm": 2.34375, "grad_norm_var": 3.8523834228515623, "learning_rate": 0.0001, "loss": 2.9249, "loss/crossentropy": 2.4116583943367003, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.20844768583774567, "loss/reg": 0.0, "step": 53910 }, { "epoch": 0.3547368421052632, "grad_norm": 2.625, "grad_norm_var": 3.7307362874348957, "learning_rate": 0.0001, "loss": 2.8942, "loss/crossentropy": 2.469593274593353, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.25590949058532714, "loss/reg": 0.0, "step": 53920 }, { "epoch": 0.35480263157894737, "grad_norm": 2.28125, "grad_norm_var": 0.0785797119140625, "learning_rate": 0.0001, "loss": 2.7905, "loss/crossentropy": 2.1048745036125185, "loss/hidden": 2.534375, "loss/incoh": 0.0, "loss/logits": 0.17983582019805908, "loss/reg": 0.0, "step": 53930 }, { "epoch": 0.35486842105263156, "grad_norm": 2.75, "grad_norm_var": 0.04422200520833333, "learning_rate": 0.0001, "loss": 2.9265, "loss/crossentropy": 2.384124386310577, "loss/hidden": 2.628125, "loss/incoh": 0.0, "loss/logits": 0.2250848740339279, "loss/reg": 0.0, "step": 53940 }, { "epoch": 0.3549342105263158, "grad_norm": 2.453125, "grad_norm_var": 0.029124959309895834, "learning_rate": 0.0001, "loss": 2.8882, "loss/crossentropy": 2.512051749229431, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.21904835253953933, "loss/reg": 0.0, "step": 53950 }, { "epoch": 0.355, "grad_norm": 2.515625, "grad_norm_var": 0.10362955729166666, "learning_rate": 0.0001, "loss": 2.8932, "loss/crossentropy": 2.2494447827339172, "loss/hidden": 2.575, "loss/incoh": 0.0, "loss/logits": 0.19756917357444764, "loss/reg": 0.0, "step": 53960 }, { "epoch": 0.3550657894736842, "grad_norm": 2.734375, "grad_norm_var": 0.12119038899739583, "learning_rate": 0.0001, "loss": 2.9758, "loss/crossentropy": 2.2532709777355193, "loss/hidden": 2.7265625, "loss/incoh": 0.0, "loss/logits": 0.20349378138780594, "loss/reg": 0.0, "step": 53970 }, { "epoch": 0.3551315789473684, "grad_norm": 2.234375, "grad_norm_var": 0.6217356363932292, "learning_rate": 0.0001, "loss": 2.9466, "loss/crossentropy": 2.211354374885559, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.20193285793066024, "loss/reg": 0.0, "step": 53980 }, { "epoch": 0.35519736842105265, "grad_norm": 2.46875, "grad_norm_var": 0.7066558837890625, "learning_rate": 0.0001, "loss": 2.8477, "loss/crossentropy": 2.4425131559371946, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.25011684000492096, "loss/reg": 0.0, "step": 53990 }, { "epoch": 0.35526315789473684, "grad_norm": 2.546875, "grad_norm_var": 0.4098917643229167, "learning_rate": 0.0001, "loss": 2.9026, "loss/crossentropy": 2.11671986579895, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.1958182781934738, "loss/reg": 0.0, "step": 54000 }, { "epoch": 0.3553289473684211, "grad_norm": 2.703125, "grad_norm_var": 0.3935455322265625, "learning_rate": 0.0001, "loss": 2.8633, "loss/crossentropy": 2.3988207578659058, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.2099326267838478, "loss/reg": 0.0, "step": 54010 }, { "epoch": 0.35539473684210526, "grad_norm": 2.203125, "grad_norm_var": 0.05585835774739583, "learning_rate": 0.0001, "loss": 2.8717, "loss/crossentropy": 2.222412371635437, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.2059466175734997, "loss/reg": 0.0, "step": 54020 }, { "epoch": 0.35546052631578945, "grad_norm": 2.796875, "grad_norm_var": 1.2498860677083334, "learning_rate": 0.0001, "loss": 2.9363, "loss/crossentropy": 2.4887668013572695, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.31383037865161895, "loss/reg": 0.0, "step": 54030 }, { "epoch": 0.3555263157894737, "grad_norm": 2.21875, "grad_norm_var": 1.2401601155598958, "learning_rate": 0.0001, "loss": 2.9412, "loss/crossentropy": 2.265219736099243, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.20582252889871597, "loss/reg": 0.0, "step": 54040 }, { "epoch": 0.3555921052631579, "grad_norm": 2.390625, "grad_norm_var": 0.0431304931640625, "learning_rate": 0.0001, "loss": 2.847, "loss/crossentropy": 2.32872793674469, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.24008683264255523, "loss/reg": 0.0, "step": 54050 }, { "epoch": 0.3556578947368421, "grad_norm": 2.71875, "grad_norm_var": 0.08193257649739584, "learning_rate": 0.0001, "loss": 2.9368, "loss/crossentropy": 2.3705220818519592, "loss/hidden": 2.98125, "loss/incoh": 0.0, "loss/logits": 0.26148772686719896, "loss/reg": 0.0, "step": 54060 }, { "epoch": 0.3557236842105263, "grad_norm": 2.75, "grad_norm_var": 0.06806538899739584, "learning_rate": 0.0001, "loss": 2.8955, "loss/crossentropy": 2.358398449420929, "loss/hidden": 2.5609375, "loss/incoh": 0.0, "loss/logits": 0.20554921478033067, "loss/reg": 0.0, "step": 54070 }, { "epoch": 0.35578947368421054, "grad_norm": 2.25, "grad_norm_var": 0.0540435791015625, "learning_rate": 0.0001, "loss": 2.8878, "loss/crossentropy": 2.4612541437149047, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.22401265949010848, "loss/reg": 0.0, "step": 54080 }, { "epoch": 0.35585526315789473, "grad_norm": 2.9375, "grad_norm_var": 0.069140625, "learning_rate": 0.0001, "loss": 2.9193, "loss/crossentropy": 2.444706678390503, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.2063468262553215, "loss/reg": 0.0, "step": 54090 }, { "epoch": 0.35592105263157897, "grad_norm": 2.625, "grad_norm_var": 0.05917561848958333, "learning_rate": 0.0001, "loss": 2.9155, "loss/crossentropy": 1.9089924573898316, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.20203033536672593, "loss/reg": 0.0, "step": 54100 }, { "epoch": 0.35598684210526316, "grad_norm": 2.375, "grad_norm_var": 0.08046773274739584, "learning_rate": 0.0001, "loss": 2.9789, "loss/crossentropy": 2.0448192596435546, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.2367345243692398, "loss/reg": 0.0, "step": 54110 }, { "epoch": 0.35605263157894734, "grad_norm": 2.390625, "grad_norm_var": 0.0923492431640625, "learning_rate": 0.0001, "loss": 2.8905, "loss/crossentropy": 2.1761757135391235, "loss/hidden": 2.51875, "loss/incoh": 0.0, "loss/logits": 0.1973909005522728, "loss/reg": 0.0, "step": 54120 }, { "epoch": 0.3561184210526316, "grad_norm": 2.34375, "grad_norm_var": 0.10938212076822916, "learning_rate": 0.0001, "loss": 2.892, "loss/crossentropy": 2.250565540790558, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.2235861450433731, "loss/reg": 0.0, "step": 54130 }, { "epoch": 0.35618421052631577, "grad_norm": 2.34375, "grad_norm_var": 0.03609619140625, "learning_rate": 0.0001, "loss": 2.9272, "loss/crossentropy": 2.51796293258667, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.21794483959674835, "loss/reg": 0.0, "step": 54140 }, { "epoch": 0.35625, "grad_norm": 2.859375, "grad_norm_var": 0.05335871378580729, "learning_rate": 0.0001, "loss": 2.908, "loss/crossentropy": 2.3223796844482423, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.2273564748466015, "loss/reg": 0.0, "step": 54150 }, { "epoch": 0.3563157894736842, "grad_norm": 2.625, "grad_norm_var": 0.06325581868489584, "learning_rate": 0.0001, "loss": 2.9015, "loss/crossentropy": 2.3547950744628907, "loss/hidden": 2.7140625, "loss/incoh": 0.0, "loss/logits": 0.22187849879264832, "loss/reg": 0.0, "step": 54160 }, { "epoch": 0.35638157894736844, "grad_norm": 2.390625, "grad_norm_var": 0.1731842041015625, "learning_rate": 0.0001, "loss": 2.9564, "loss/crossentropy": 2.327433001995087, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.24118574857711791, "loss/reg": 0.0, "step": 54170 }, { "epoch": 0.3564473684210526, "grad_norm": 2.59375, "grad_norm_var": 0.14440104166666667, "learning_rate": 0.0001, "loss": 2.9752, "loss/crossentropy": 2.270350229740143, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.2453131929039955, "loss/reg": 0.0, "step": 54180 }, { "epoch": 0.35651315789473687, "grad_norm": 1954545664.0, "grad_norm_var": 2.3876554645267088e+17, "learning_rate": 0.0001, "loss": 3.0088, "loss/crossentropy": 2.053894114494324, "loss/hidden": 2.5984375, "loss/incoh": 0.0, "loss/logits": 0.2103626012802124, "loss/reg": 0.0, "step": 54190 }, { "epoch": 0.35657894736842105, "grad_norm": 2.40625, "grad_norm_var": 2.3876554644198195e+17, "learning_rate": 0.0001, "loss": 2.8895, "loss/crossentropy": 2.359751486778259, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.22385697960853576, "loss/reg": 0.0, "step": 54200 }, { "epoch": 0.35664473684210524, "grad_norm": 2.578125, "grad_norm_var": 0.06558837890625, "learning_rate": 0.0001, "loss": 2.9354, "loss/crossentropy": 2.395543646812439, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.2394479528069496, "loss/reg": 0.0, "step": 54210 }, { "epoch": 0.3567105263157895, "grad_norm": 2.5, "grad_norm_var": 0.07021382649739584, "learning_rate": 0.0001, "loss": 2.9092, "loss/crossentropy": 2.257586693763733, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.19854284822940826, "loss/reg": 0.0, "step": 54220 }, { "epoch": 0.35677631578947366, "grad_norm": 2.671875, "grad_norm_var": 0.0597564697265625, "learning_rate": 0.0001, "loss": 2.8656, "loss/crossentropy": 2.4398115038871766, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.20818059891462326, "loss/reg": 0.0, "step": 54230 }, { "epoch": 0.3568421052631579, "grad_norm": 2.734375, "grad_norm_var": 0.0701324462890625, "learning_rate": 0.0001, "loss": 2.9969, "loss/crossentropy": 2.3631557703018187, "loss/hidden": 2.571875, "loss/incoh": 0.0, "loss/logits": 0.20158067122101783, "loss/reg": 0.0, "step": 54240 }, { "epoch": 0.3569078947368421, "grad_norm": 2.203125, "grad_norm_var": 0.03310546875, "learning_rate": 0.0001, "loss": 2.9046, "loss/crossentropy": 2.4030117750167848, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.19198712408542634, "loss/reg": 0.0, "step": 54250 }, { "epoch": 0.35697368421052633, "grad_norm": 2.453125, "grad_norm_var": 0.03218485514322917, "learning_rate": 0.0001, "loss": 2.92, "loss/crossentropy": 2.555415093898773, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.23902440965175628, "loss/reg": 0.0, "step": 54260 }, { "epoch": 0.3570394736842105, "grad_norm": 2.421875, "grad_norm_var": 0.019429524739583332, "learning_rate": 0.0001, "loss": 2.9892, "loss/crossentropy": 2.367581534385681, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.2215558558702469, "loss/reg": 0.0, "step": 54270 }, { "epoch": 0.35710526315789476, "grad_norm": 2.25, "grad_norm_var": 0.053544108072916666, "learning_rate": 0.0001, "loss": 2.8905, "loss/crossentropy": 2.128105938434601, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.20883714482188226, "loss/reg": 0.0, "step": 54280 }, { "epoch": 0.35717105263157894, "grad_norm": 2.234375, "grad_norm_var": 0.0713043212890625, "learning_rate": 0.0001, "loss": 2.9197, "loss/crossentropy": 2.4443586587905886, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.21806224435567856, "loss/reg": 0.0, "step": 54290 }, { "epoch": 0.35723684210526313, "grad_norm": 2.390625, "grad_norm_var": 0.06886393229166667, "learning_rate": 0.0001, "loss": 2.932, "loss/crossentropy": 2.443032479286194, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.2098913311958313, "loss/reg": 0.0, "step": 54300 }, { "epoch": 0.35730263157894737, "grad_norm": 2.640625, "grad_norm_var": 0.10343424479166667, "learning_rate": 0.0001, "loss": 2.9152, "loss/crossentropy": 2.102934178709984, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.20724054016172885, "loss/reg": 0.0, "step": 54310 }, { "epoch": 0.35736842105263156, "grad_norm": 2.890625, "grad_norm_var": 0.12205301920572917, "learning_rate": 0.0001, "loss": 2.8799, "loss/crossentropy": 2.46229373216629, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.19543251991271973, "loss/reg": 0.0, "step": 54320 }, { "epoch": 0.3574342105263158, "grad_norm": 2.28125, "grad_norm_var": 0.08321024576822916, "learning_rate": 0.0001, "loss": 2.9478, "loss/crossentropy": 1.9944883704185485, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.23129016309976577, "loss/reg": 0.0, "step": 54330 }, { "epoch": 0.3575, "grad_norm": 2.296875, "grad_norm_var": 0.07840169270833333, "learning_rate": 0.0001, "loss": 2.9461, "loss/crossentropy": 2.1792616486549377, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.22607929706573487, "loss/reg": 0.0, "step": 54340 }, { "epoch": 0.3575657894736842, "grad_norm": 2.15625, "grad_norm_var": 0.03369852701822917, "learning_rate": 0.0001, "loss": 2.9205, "loss/crossentropy": 2.004812294244766, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.217975914478302, "loss/reg": 0.0, "step": 54350 }, { "epoch": 0.3576315789473684, "grad_norm": 2.46875, "grad_norm_var": 0.030269368489583334, "learning_rate": 0.0001, "loss": 2.9669, "loss/crossentropy": 1.9730993628501892, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.225260441750288, "loss/reg": 0.0, "step": 54360 }, { "epoch": 0.35769736842105265, "grad_norm": 2.34375, "grad_norm_var": 0.041015625, "learning_rate": 0.0001, "loss": 2.8902, "loss/crossentropy": 2.5220611572265623, "loss/hidden": 2.5796875, "loss/incoh": 0.0, "loss/logits": 0.21170885115861893, "loss/reg": 0.0, "step": 54370 }, { "epoch": 0.35776315789473684, "grad_norm": 2.234375, "grad_norm_var": 0.15008036295572916, "learning_rate": 0.0001, "loss": 2.9585, "loss/crossentropy": 2.3945314407348635, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.24622462689876556, "loss/reg": 0.0, "step": 54380 }, { "epoch": 0.3578289473684211, "grad_norm": 2.40625, "grad_norm_var": 0.7082590738932292, "learning_rate": 0.0001, "loss": 2.9829, "loss/crossentropy": 2.2781185865402223, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.21747732758522034, "loss/reg": 0.0, "step": 54390 }, { "epoch": 0.35789473684210527, "grad_norm": 4.0625, "grad_norm_var": 0.7591623942057292, "learning_rate": 0.0001, "loss": 2.9481, "loss/crossentropy": 2.2859074592590334, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.29840278029441836, "loss/reg": 0.0, "step": 54400 }, { "epoch": 0.35796052631578945, "grad_norm": 3.234375, "grad_norm_var": 0.32136128743489584, "learning_rate": 0.0001, "loss": 3.0035, "loss/crossentropy": 2.2163491368293764, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.2404955729842186, "loss/reg": 0.0, "step": 54410 }, { "epoch": 0.3580263157894737, "grad_norm": 2.453125, "grad_norm_var": 0.10522359212239583, "learning_rate": 0.0001, "loss": 2.9343, "loss/crossentropy": 2.2161285519599914, "loss/hidden": 2.7296875, "loss/incoh": 0.0, "loss/logits": 0.21978897005319595, "loss/reg": 0.0, "step": 54420 }, { "epoch": 0.3580921052631579, "grad_norm": 2.625, "grad_norm_var": 0.7315755208333333, "learning_rate": 0.0001, "loss": 2.8935, "loss/crossentropy": 2.435475969314575, "loss/hidden": 2.55, "loss/incoh": 0.0, "loss/logits": 0.19421324282884597, "loss/reg": 0.0, "step": 54430 }, { "epoch": 0.3581578947368421, "grad_norm": 2.3125, "grad_norm_var": 0.03560791015625, "learning_rate": 0.0001, "loss": 2.8516, "loss/crossentropy": 2.322109746932983, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.21368842720985412, "loss/reg": 0.0, "step": 54440 }, { "epoch": 0.3582236842105263, "grad_norm": 2.546875, "grad_norm_var": 0.27691650390625, "learning_rate": 0.0001, "loss": 2.9551, "loss/crossentropy": 2.648325967788696, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.25250559151172636, "loss/reg": 0.0, "step": 54450 }, { "epoch": 0.35828947368421055, "grad_norm": 2.40625, "grad_norm_var": 0.2783437093098958, "learning_rate": 0.0001, "loss": 3.0052, "loss/crossentropy": 2.5408368825912477, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.21664342582225798, "loss/reg": 0.0, "step": 54460 }, { "epoch": 0.35835526315789473, "grad_norm": 2.25, "grad_norm_var": 0.06641337076822916, "learning_rate": 0.0001, "loss": 2.8561, "loss/crossentropy": 2.1981930494308473, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.20015666782855987, "loss/reg": 0.0, "step": 54470 }, { "epoch": 0.358421052631579, "grad_norm": 2.484375, "grad_norm_var": 0.013081868489583334, "learning_rate": 0.0001, "loss": 2.8476, "loss/crossentropy": 2.224550926685333, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.19693049490451814, "loss/reg": 0.0, "step": 54480 }, { "epoch": 0.35848684210526316, "grad_norm": 2.53125, "grad_norm_var": 0.07043863932291666, "learning_rate": 0.0001, "loss": 2.9282, "loss/crossentropy": 2.21962434053421, "loss/hidden": 2.85, "loss/incoh": 0.0, "loss/logits": 0.21834225952625275, "loss/reg": 0.0, "step": 54490 }, { "epoch": 0.35855263157894735, "grad_norm": 2.4375, "grad_norm_var": 0.19733861287434895, "learning_rate": 0.0001, "loss": 2.9081, "loss/crossentropy": 2.0452643394470216, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.17951916307210922, "loss/reg": 0.0, "step": 54500 }, { "epoch": 0.3586184210526316, "grad_norm": 2.375, "grad_norm_var": 0.05101496378580729, "learning_rate": 0.0001, "loss": 2.8834, "loss/crossentropy": 2.0895548701286315, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.21570448353886604, "loss/reg": 0.0, "step": 54510 }, { "epoch": 0.35868421052631577, "grad_norm": 2.453125, "grad_norm_var": 0.08339742024739584, "learning_rate": 0.0001, "loss": 2.9095, "loss/crossentropy": 2.34168518781662, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.19523481726646424, "loss/reg": 0.0, "step": 54520 }, { "epoch": 0.35875, "grad_norm": 2.109375, "grad_norm_var": 0.0701812744140625, "learning_rate": 0.0001, "loss": 2.8698, "loss/crossentropy": 2.3053163051605225, "loss/hidden": 2.8171875, "loss/incoh": 0.0, "loss/logits": 0.19365841709077358, "loss/reg": 0.0, "step": 54530 }, { "epoch": 0.3588157894736842, "grad_norm": 2.46875, "grad_norm_var": 0.06896870930989583, "learning_rate": 0.0001, "loss": 2.8738, "loss/crossentropy": 2.6046852350234984, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.2187461659312248, "loss/reg": 0.0, "step": 54540 }, { "epoch": 0.35888157894736844, "grad_norm": 2.484375, "grad_norm_var": 1.107860310872396, "learning_rate": 0.0001, "loss": 2.9331, "loss/crossentropy": 2.0925799012184143, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.18093864768743514, "loss/reg": 0.0, "step": 54550 }, { "epoch": 0.3589473684210526, "grad_norm": 2.78125, "grad_norm_var": 1.0596343994140625, "learning_rate": 0.0001, "loss": 2.9474, "loss/crossentropy": 2.456355333328247, "loss/hidden": 3.0421875, "loss/incoh": 0.0, "loss/logits": 0.2979159809648991, "loss/reg": 0.0, "step": 54560 }, { "epoch": 0.35901315789473687, "grad_norm": 2.265625, "grad_norm_var": 0.05868733723958333, "learning_rate": 0.0001, "loss": 2.9178, "loss/crossentropy": 2.247401404380798, "loss/hidden": 3.003125, "loss/incoh": 0.0, "loss/logits": 0.2491950884461403, "loss/reg": 0.0, "step": 54570 }, { "epoch": 0.35907894736842105, "grad_norm": 3.078125, "grad_norm_var": 0.08434244791666666, "learning_rate": 0.0001, "loss": 2.9327, "loss/crossentropy": 2.3208995938301085, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2826381787657738, "loss/reg": 0.0, "step": 54580 }, { "epoch": 0.35914473684210524, "grad_norm": 2.484375, "grad_norm_var": 0.05852457682291667, "learning_rate": 0.0001, "loss": 2.893, "loss/crossentropy": 2.2282600522041323, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.20587321519851684, "loss/reg": 0.0, "step": 54590 }, { "epoch": 0.3592105263157895, "grad_norm": 2.65625, "grad_norm_var": 0.03388570149739583, "learning_rate": 0.0001, "loss": 2.9057, "loss/crossentropy": 2.345319354534149, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.23392851352691652, "loss/reg": 0.0, "step": 54600 }, { "epoch": 0.35927631578947367, "grad_norm": 2.65625, "grad_norm_var": 0.04729410807291667, "learning_rate": 0.0001, "loss": 2.8814, "loss/crossentropy": 1.995515191555023, "loss/hidden": 2.5453125, "loss/incoh": 0.0, "loss/logits": 0.17632606998085976, "loss/reg": 0.0, "step": 54610 }, { "epoch": 0.3593421052631579, "grad_norm": 2.28125, "grad_norm_var": 2.0329969941827856e+17, "learning_rate": 0.0001, "loss": 3.0357, "loss/crossentropy": 2.364439904689789, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.2179832339286804, "loss/reg": 0.0, "step": 54620 }, { "epoch": 0.3594078947368421, "grad_norm": 2.5625, "grad_norm_var": 0.030952962239583333, "learning_rate": 0.0001, "loss": 2.9348, "loss/crossentropy": 2.4028625011444094, "loss/hidden": 2.871875, "loss/incoh": 0.0, "loss/logits": 0.22525394856929778, "loss/reg": 0.0, "step": 54630 }, { "epoch": 0.35947368421052633, "grad_norm": 2.59375, "grad_norm_var": 0.041337076822916666, "learning_rate": 0.0001, "loss": 2.9136, "loss/crossentropy": 2.140088367462158, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.23257567137479782, "loss/reg": 0.0, "step": 54640 }, { "epoch": 0.3595394736842105, "grad_norm": 2.53125, "grad_norm_var": 0.04248046875, "learning_rate": 0.0001, "loss": 2.948, "loss/crossentropy": 2.6963785886764526, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.24833889156579972, "loss/reg": 0.0, "step": 54650 }, { "epoch": 0.35960526315789476, "grad_norm": 2.375, "grad_norm_var": 0.04176025390625, "learning_rate": 0.0001, "loss": 2.8859, "loss/crossentropy": 2.0683693051338197, "loss/hidden": 2.7875, "loss/incoh": 0.0, "loss/logits": 0.2056108221411705, "loss/reg": 0.0, "step": 54660 }, { "epoch": 0.35967105263157895, "grad_norm": 2.234375, "grad_norm_var": 0.12267252604166666, "learning_rate": 0.0001, "loss": 2.9422, "loss/crossentropy": 2.217694354057312, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.20696992352604865, "loss/reg": 0.0, "step": 54670 }, { "epoch": 0.35973684210526313, "grad_norm": 2.296875, "grad_norm_var": 0.10498758951822916, "learning_rate": 0.0001, "loss": 2.8323, "loss/crossentropy": 2.308615338802338, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.2043146789073944, "loss/reg": 0.0, "step": 54680 }, { "epoch": 0.3598026315789474, "grad_norm": 2.234375, "grad_norm_var": 0.03183186848958333, "learning_rate": 0.0001, "loss": 2.862, "loss/crossentropy": 2.3850364685058594, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.20454232394695282, "loss/reg": 0.0, "step": 54690 }, { "epoch": 0.35986842105263156, "grad_norm": 2.234375, "grad_norm_var": 0.04218343098958333, "learning_rate": 0.0001, "loss": 2.9183, "loss/crossentropy": 2.445221424102783, "loss/hidden": 2.6828125, "loss/incoh": 0.0, "loss/logits": 0.21555831879377366, "loss/reg": 0.0, "step": 54700 }, { "epoch": 0.3599342105263158, "grad_norm": 2.40625, "grad_norm_var": 0.03937174479166667, "learning_rate": 0.0001, "loss": 2.8291, "loss/crossentropy": 2.540818250179291, "loss/hidden": 2.5078125, "loss/incoh": 0.0, "loss/logits": 0.21492148712277412, "loss/reg": 0.0, "step": 54710 }, { "epoch": 0.36, "grad_norm": 2.421875, "grad_norm_var": 0.022272745768229168, "learning_rate": 0.0001, "loss": 2.9178, "loss/crossentropy": 2.6022736072540282, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.27861695140600207, "loss/reg": 0.0, "step": 54720 }, { "epoch": 0.36006578947368423, "grad_norm": 2.984375, "grad_norm_var": 0.060456339518229166, "learning_rate": 0.0001, "loss": 2.8636, "loss/crossentropy": 2.3128707647323608, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.21024202704429626, "loss/reg": 0.0, "step": 54730 }, { "epoch": 0.3601315789473684, "grad_norm": 2.65625, "grad_norm_var": 0.19508463541666668, "learning_rate": 0.0001, "loss": 2.9128, "loss/crossentropy": 2.485660266876221, "loss/hidden": 2.690625, "loss/incoh": 0.0, "loss/logits": 0.2298157647252083, "loss/reg": 0.0, "step": 54740 }, { "epoch": 0.36019736842105265, "grad_norm": 3.09375, "grad_norm_var": 0.17768452962239584, "learning_rate": 0.0001, "loss": 2.9783, "loss/crossentropy": 2.280470073223114, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.20580463260412216, "loss/reg": 0.0, "step": 54750 }, { "epoch": 0.36026315789473684, "grad_norm": 2.546875, "grad_norm_var": 0.06383056640625, "learning_rate": 0.0001, "loss": 2.954, "loss/crossentropy": 2.474344313144684, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.24877213388681413, "loss/reg": 0.0, "step": 54760 }, { "epoch": 0.360328947368421, "grad_norm": 2.53125, "grad_norm_var": 0.022379557291666668, "learning_rate": 0.0001, "loss": 2.9963, "loss/crossentropy": 2.313224160671234, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.22623538076877595, "loss/reg": 0.0, "step": 54770 }, { "epoch": 0.36039473684210527, "grad_norm": 2.625, "grad_norm_var": 0.03616129557291667, "learning_rate": 0.0001, "loss": 2.9206, "loss/crossentropy": 2.3442553758621214, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.19730177745223046, "loss/reg": 0.0, "step": 54780 }, { "epoch": 0.36046052631578945, "grad_norm": 2.25, "grad_norm_var": 0.0352203369140625, "learning_rate": 0.0001, "loss": 2.9684, "loss/crossentropy": 2.5469708681106566, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.24271451830863952, "loss/reg": 0.0, "step": 54790 }, { "epoch": 0.3605263157894737, "grad_norm": 2.234375, "grad_norm_var": 0.027262369791666668, "learning_rate": 0.0001, "loss": 2.8957, "loss/crossentropy": 2.223972100019455, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.19848118126392364, "loss/reg": 0.0, "step": 54800 }, { "epoch": 0.3605921052631579, "grad_norm": 2.15625, "grad_norm_var": 0.06982320149739583, "learning_rate": 0.0001, "loss": 2.9388, "loss/crossentropy": 2.1773023545742034, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.22420533522963523, "loss/reg": 0.0, "step": 54810 }, { "epoch": 0.3606578947368421, "grad_norm": 2.640625, "grad_norm_var": 0.04977213541666667, "learning_rate": 0.0001, "loss": 2.9382, "loss/crossentropy": 2.365528738498688, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.24472457617521287, "loss/reg": 0.0, "step": 54820 }, { "epoch": 0.3607236842105263, "grad_norm": 2.46875, "grad_norm_var": 0.8344228108723958, "learning_rate": 0.0001, "loss": 2.955, "loss/crossentropy": 2.3240780234336853, "loss/hidden": 2.6078125, "loss/incoh": 0.0, "loss/logits": 0.2004014417529106, "loss/reg": 0.0, "step": 54830 }, { "epoch": 0.36078947368421055, "grad_norm": 2.234375, "grad_norm_var": 0.8707509358723958, "learning_rate": 0.0001, "loss": 2.7895, "loss/crossentropy": 2.423196053504944, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.20591089576482774, "loss/reg": 0.0, "step": 54840 }, { "epoch": 0.36085526315789473, "grad_norm": 2.15625, "grad_norm_var": 0.12384440104166666, "learning_rate": 0.0001, "loss": 2.92, "loss/crossentropy": 2.1994440078735353, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.1906447410583496, "loss/reg": 0.0, "step": 54850 }, { "epoch": 0.3609210526315789, "grad_norm": 2.390625, "grad_norm_var": 0.07048238118489583, "learning_rate": 0.0001, "loss": 2.9171, "loss/crossentropy": 2.0818288683891297, "loss/hidden": 2.8328125, "loss/incoh": 0.0, "loss/logits": 0.20400619953870774, "loss/reg": 0.0, "step": 54860 }, { "epoch": 0.36098684210526316, "grad_norm": 2.015625, "grad_norm_var": 0.09553120930989584, "learning_rate": 0.0001, "loss": 2.9318, "loss/crossentropy": 2.380126976966858, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.22580958753824235, "loss/reg": 0.0, "step": 54870 }, { "epoch": 0.36105263157894735, "grad_norm": 2.625, "grad_norm_var": 0.048828125, "learning_rate": 0.0001, "loss": 2.8443, "loss/crossentropy": 2.310909152030945, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.2569435343146324, "loss/reg": 0.0, "step": 54880 }, { "epoch": 0.3611184210526316, "grad_norm": 2.453125, "grad_norm_var": 0.02183837890625, "learning_rate": 0.0001, "loss": 2.9678, "loss/crossentropy": 2.120818313956261, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.2645658552646637, "loss/reg": 0.0, "step": 54890 }, { "epoch": 0.3611842105263158, "grad_norm": 2.3125, "grad_norm_var": 0.06648763020833333, "learning_rate": 0.0001, "loss": 2.9293, "loss/crossentropy": 2.4554855108261107, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.22399022430181503, "loss/reg": 0.0, "step": 54900 }, { "epoch": 0.36125, "grad_norm": 3.421875, "grad_norm_var": 0.6853424072265625, "learning_rate": 0.0001, "loss": 2.9392, "loss/crossentropy": 2.411912763118744, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.22950262278318406, "loss/reg": 0.0, "step": 54910 }, { "epoch": 0.3613157894736842, "grad_norm": 3.359375, "grad_norm_var": 0.5984700520833334, "learning_rate": 0.0001, "loss": 2.951, "loss/crossentropy": 2.369457709789276, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.2315858393907547, "loss/reg": 0.0, "step": 54920 }, { "epoch": 0.36138157894736844, "grad_norm": 3.09375, "grad_norm_var": 0.09597981770833333, "learning_rate": 0.0001, "loss": 2.9305, "loss/crossentropy": 2.09514479637146, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.19469663947820665, "loss/reg": 0.0, "step": 54930 }, { "epoch": 0.36144736842105263, "grad_norm": 2.421875, "grad_norm_var": 0.09164937337239583, "learning_rate": 0.0001, "loss": 2.9242, "loss/crossentropy": 2.307541882991791, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2687637731432915, "loss/reg": 0.0, "step": 54940 }, { "epoch": 0.36151315789473687, "grad_norm": 2.265625, "grad_norm_var": 0.07430013020833333, "learning_rate": 0.0001, "loss": 2.9468, "loss/crossentropy": 2.137566030025482, "loss/hidden": 2.5015625, "loss/incoh": 0.0, "loss/logits": 0.18972200900316238, "loss/reg": 0.0, "step": 54950 }, { "epoch": 0.36157894736842106, "grad_norm": 2.203125, "grad_norm_var": 0.05718994140625, "learning_rate": 0.0001, "loss": 2.948, "loss/crossentropy": 2.2150479674339296, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.21226576566696168, "loss/reg": 0.0, "step": 54960 }, { "epoch": 0.36164473684210524, "grad_norm": 2.90625, "grad_norm_var": 0.0607330322265625, "learning_rate": 0.0001, "loss": 2.9775, "loss/crossentropy": 2.359450840950012, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.30662850439548495, "loss/reg": 0.0, "step": 54970 }, { "epoch": 0.3617105263157895, "grad_norm": 4.15625, "grad_norm_var": 0.24986979166666667, "learning_rate": 0.0001, "loss": 2.8912, "loss/crossentropy": 2.445091760158539, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.21184108629822732, "loss/reg": 0.0, "step": 54980 }, { "epoch": 0.36177631578947367, "grad_norm": 2.71875, "grad_norm_var": 0.22683003743489583, "learning_rate": 0.0001, "loss": 2.901, "loss/crossentropy": 2.3065505266189574, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.23976122736930847, "loss/reg": 0.0, "step": 54990 }, { "epoch": 0.3618421052631579, "grad_norm": 2.390625, "grad_norm_var": 0.07372004191080729, "learning_rate": 0.0001, "loss": 2.8473, "loss/crossentropy": 2.4219440102577208, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.21846269965171813, "loss/reg": 0.0, "step": 55000 }, { "epoch": 0.3619078947368421, "grad_norm": 3.171875, "grad_norm_var": 1.1975440979003906, "learning_rate": 0.0001, "loss": 2.8917, "loss/crossentropy": 2.3994370698928833, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.20195741802453995, "loss/reg": 0.0, "step": 55010 }, { "epoch": 0.36197368421052634, "grad_norm": 2.734375, "grad_norm_var": 1.2027008056640625, "learning_rate": 0.0001, "loss": 2.8793, "loss/crossentropy": 2.311794662475586, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.27213078290224074, "loss/reg": 0.0, "step": 55020 }, { "epoch": 0.3620394736842105, "grad_norm": 2.203125, "grad_norm_var": 0.042378743489583336, "learning_rate": 0.0001, "loss": 2.8562, "loss/crossentropy": 2.1769510865211488, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.2048715904355049, "loss/reg": 0.0, "step": 55030 }, { "epoch": 0.36210526315789476, "grad_norm": 2.078125, "grad_norm_var": 0.06980692545572917, "learning_rate": 0.0001, "loss": 2.8807, "loss/crossentropy": 2.359910786151886, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.21098579168319703, "loss/reg": 0.0, "step": 55040 }, { "epoch": 0.36217105263157895, "grad_norm": 2.140625, "grad_norm_var": 0.07160542805989584, "learning_rate": 0.0001, "loss": 2.9082, "loss/crossentropy": 2.5008140325546266, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.21419156342744827, "loss/reg": 0.0, "step": 55050 }, { "epoch": 0.36223684210526313, "grad_norm": 2.21875, "grad_norm_var": 0.03577067057291667, "learning_rate": 0.0001, "loss": 2.89, "loss/crossentropy": 2.336580920219421, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.20757495909929274, "loss/reg": 0.0, "step": 55060 }, { "epoch": 0.3623026315789474, "grad_norm": 2.859375, "grad_norm_var": 0.40943603515625, "learning_rate": 0.0001, "loss": 2.9289, "loss/crossentropy": 1.8712775945663451, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.23173328787088393, "loss/reg": 0.0, "step": 55070 }, { "epoch": 0.36236842105263156, "grad_norm": 2.390625, "grad_norm_var": 0.38713277180989586, "learning_rate": 0.0001, "loss": 2.8909, "loss/crossentropy": 2.3050923466682436, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.2536117509007454, "loss/reg": 0.0, "step": 55080 }, { "epoch": 0.3624342105263158, "grad_norm": 3.09375, "grad_norm_var": 0.0809967041015625, "learning_rate": 0.0001, "loss": 2.8609, "loss/crossentropy": 2.4368178248405457, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.2158541977405548, "loss/reg": 0.0, "step": 55090 }, { "epoch": 0.3625, "grad_norm": 2.609375, "grad_norm_var": 0.087744140625, "learning_rate": 0.0001, "loss": 2.893, "loss/crossentropy": 2.378524124622345, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.2091189742088318, "loss/reg": 0.0, "step": 55100 }, { "epoch": 0.36256578947368423, "grad_norm": 2.5625, "grad_norm_var": 0.032938639322916664, "learning_rate": 0.0001, "loss": 2.8797, "loss/crossentropy": 2.4064526081085207, "loss/hidden": 2.5671875, "loss/incoh": 0.0, "loss/logits": 0.207526333630085, "loss/reg": 0.0, "step": 55110 }, { "epoch": 0.3626315789473684, "grad_norm": 2.578125, "grad_norm_var": 0.10364583333333334, "learning_rate": 0.0001, "loss": 2.9032, "loss/crossentropy": 2.2197712540626524, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.22423715740442277, "loss/reg": 0.0, "step": 55120 }, { "epoch": 0.36269736842105266, "grad_norm": 2.546875, "grad_norm_var": 0.15127665201822918, "learning_rate": 0.0001, "loss": 2.8684, "loss/crossentropy": 2.318847489356995, "loss/hidden": 2.6984375, "loss/incoh": 0.0, "loss/logits": 0.23123670369386673, "loss/reg": 0.0, "step": 55130 }, { "epoch": 0.36276315789473684, "grad_norm": 2.4375, "grad_norm_var": 0.2258697509765625, "learning_rate": 0.0001, "loss": 2.8845, "loss/crossentropy": 2.182840144634247, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.2504867151379585, "loss/reg": 0.0, "step": 55140 }, { "epoch": 0.36282894736842103, "grad_norm": 2.359375, "grad_norm_var": 0.1957672119140625, "learning_rate": 0.0001, "loss": 2.9345, "loss/crossentropy": 2.2322204232215883, "loss/hidden": 2.8140625, "loss/incoh": 0.0, "loss/logits": 0.2387073777616024, "loss/reg": 0.0, "step": 55150 }, { "epoch": 0.36289473684210527, "grad_norm": 2.609375, "grad_norm_var": 0.03127848307291667, "learning_rate": 0.0001, "loss": 2.8974, "loss/crossentropy": 2.1007212519645693, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.20121292397379875, "loss/reg": 0.0, "step": 55160 }, { "epoch": 0.36296052631578946, "grad_norm": 2.390625, "grad_norm_var": 0.14036051432291666, "learning_rate": 0.0001, "loss": 2.9081, "loss/crossentropy": 2.380767357349396, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.20617091357707978, "loss/reg": 0.0, "step": 55170 }, { "epoch": 0.3630263157894737, "grad_norm": 2.375, "grad_norm_var": 0.09531962076822917, "learning_rate": 0.0001, "loss": 2.9432, "loss/crossentropy": 2.1204582929611204, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.21525612026453017, "loss/reg": 0.0, "step": 55180 }, { "epoch": 0.3630921052631579, "grad_norm": 3.15625, "grad_norm_var": 0.1307037353515625, "learning_rate": 0.0001, "loss": 2.9691, "loss/crossentropy": 2.3426124691963195, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.23233982771635056, "loss/reg": 0.0, "step": 55190 }, { "epoch": 0.3631578947368421, "grad_norm": 2.5, "grad_norm_var": 0.12148335774739584, "learning_rate": 0.0001, "loss": 2.8851, "loss/crossentropy": 2.22878737449646, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.18628740906715394, "loss/reg": 0.0, "step": 55200 }, { "epoch": 0.3632236842105263, "grad_norm": 2.234375, "grad_norm_var": 0.1013671875, "learning_rate": 0.0001, "loss": 2.8738, "loss/crossentropy": 2.375605136156082, "loss/hidden": 2.5703125, "loss/incoh": 0.0, "loss/logits": 0.2060827136039734, "loss/reg": 0.0, "step": 55210 }, { "epoch": 0.36328947368421055, "grad_norm": 2.5, "grad_norm_var": 0.8687825520833333, "learning_rate": 0.0001, "loss": 2.9261, "loss/crossentropy": 2.173552322387695, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.21972828954458237, "loss/reg": 0.0, "step": 55220 }, { "epoch": 0.36335526315789474, "grad_norm": 2.3125, "grad_norm_var": 0.06487223307291666, "learning_rate": 0.0001, "loss": 2.8728, "loss/crossentropy": 2.126990908384323, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.17540952265262605, "loss/reg": 0.0, "step": 55230 }, { "epoch": 0.3634210526315789, "grad_norm": 2.65625, "grad_norm_var": 0.033446248372395834, "learning_rate": 0.0001, "loss": 2.8967, "loss/crossentropy": 2.2749139070510864, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.2083474151790142, "loss/reg": 0.0, "step": 55240 }, { "epoch": 0.36348684210526316, "grad_norm": 2.578125, "grad_norm_var": 0.19470926920572917, "learning_rate": 0.0001, "loss": 2.939, "loss/crossentropy": 2.239310896396637, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.1970990240573883, "loss/reg": 0.0, "step": 55250 }, { "epoch": 0.36355263157894735, "grad_norm": 2.578125, "grad_norm_var": 0.07245686848958334, "learning_rate": 0.0001, "loss": 2.9101, "loss/crossentropy": 2.013974192738533, "loss/hidden": 2.7046875, "loss/incoh": 0.0, "loss/logits": 0.2109121534973383, "loss/reg": 0.0, "step": 55260 }, { "epoch": 0.3636184210526316, "grad_norm": 3.15625, "grad_norm_var": 0.1068511962890625, "learning_rate": 0.0001, "loss": 2.8864, "loss/crossentropy": 2.1526177525520325, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.23348114043474197, "loss/reg": 0.0, "step": 55270 }, { "epoch": 0.3636842105263158, "grad_norm": 2.6875, "grad_norm_var": 0.0671051025390625, "learning_rate": 0.0001, "loss": 2.8724, "loss/crossentropy": 1.9860335350036622, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.17269461899995803, "loss/reg": 0.0, "step": 55280 }, { "epoch": 0.36375, "grad_norm": 2.671875, "grad_norm_var": 0.03427327473958333, "learning_rate": 0.0001, "loss": 2.8716, "loss/crossentropy": 2.505139982700348, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.20130931288003923, "loss/reg": 0.0, "step": 55290 }, { "epoch": 0.3638157894736842, "grad_norm": 2.078125, "grad_norm_var": 0.06571858723958333, "learning_rate": 0.0001, "loss": 2.8554, "loss/crossentropy": 2.2247818231582643, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.18375571295619011, "loss/reg": 0.0, "step": 55300 }, { "epoch": 0.36388157894736844, "grad_norm": 2.109375, "grad_norm_var": 0.03997395833333333, "learning_rate": 0.0001, "loss": 2.8363, "loss/crossentropy": 2.018349528312683, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.21983690708875656, "loss/reg": 0.0, "step": 55310 }, { "epoch": 0.36394736842105263, "grad_norm": 2.109375, "grad_norm_var": 0.06918843587239583, "learning_rate": 0.0001, "loss": 2.9032, "loss/crossentropy": 2.218311941623688, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.20166290998458863, "loss/reg": 0.0, "step": 55320 }, { "epoch": 0.3640131578947368, "grad_norm": 2.53125, "grad_norm_var": 0.053515625, "learning_rate": 0.0001, "loss": 2.8997, "loss/crossentropy": 2.3984437942504884, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.269355933368206, "loss/reg": 0.0, "step": 55330 }, { "epoch": 0.36407894736842106, "grad_norm": 2.140625, "grad_norm_var": 0.543115234375, "learning_rate": 0.0001, "loss": 2.8426, "loss/crossentropy": 1.9682234406471253, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.19391384199261666, "loss/reg": 0.0, "step": 55340 }, { "epoch": 0.36414473684210524, "grad_norm": 2.34375, "grad_norm_var": 0.5478474934895833, "learning_rate": 0.0001, "loss": 2.8837, "loss/crossentropy": 2.324315643310547, "loss/hidden": 2.74375, "loss/incoh": 0.0, "loss/logits": 0.24450234174728394, "loss/reg": 0.0, "step": 55350 }, { "epoch": 0.3642105263157895, "grad_norm": 2.46875, "grad_norm_var": 0.029878743489583335, "learning_rate": 0.0001, "loss": 2.9241, "loss/crossentropy": 2.0648854255676268, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.2341735541820526, "loss/reg": 0.0, "step": 55360 }, { "epoch": 0.36427631578947367, "grad_norm": 3.296875, "grad_norm_var": 0.0787506103515625, "learning_rate": 0.0001, "loss": 2.8575, "loss/crossentropy": 2.4560649514198305, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.23736364394426346, "loss/reg": 0.0, "step": 55370 }, { "epoch": 0.3643421052631579, "grad_norm": 3.734375, "grad_norm_var": 0.3169016520182292, "learning_rate": 0.0001, "loss": 2.8644, "loss/crossentropy": 2.1610469460487365, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.26959832310676574, "loss/reg": 0.0, "step": 55380 }, { "epoch": 0.3644078947368421, "grad_norm": 3.015625, "grad_norm_var": 0.7365875244140625, "learning_rate": 0.0001, "loss": 3.0608, "loss/crossentropy": 2.3364447712898255, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.22352717071771622, "loss/reg": 0.0, "step": 55390 }, { "epoch": 0.36447368421052634, "grad_norm": 2.4375, "grad_norm_var": 2.5544293868726845e+17, "learning_rate": 0.0001, "loss": 3.0223, "loss/crossentropy": 2.334025263786316, "loss/hidden": 2.5578125, "loss/incoh": 0.0, "loss/logits": 0.19940470084547995, "loss/reg": 0.0, "step": 55400 }, { "epoch": 0.3645394736842105, "grad_norm": 2.171875, "grad_norm_var": 2.55442938801513e+17, "learning_rate": 0.0001, "loss": 2.8633, "loss/crossentropy": 2.2595667243003845, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.22901192009449006, "loss/reg": 0.0, "step": 55410 }, { "epoch": 0.3646052631578947, "grad_norm": 2.65625, "grad_norm_var": 0.09550374348958333, "learning_rate": 0.0001, "loss": 2.8799, "loss/crossentropy": 2.4267357707023622, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.223286272585392, "loss/reg": 0.0, "step": 55420 }, { "epoch": 0.36467105263157895, "grad_norm": 2.203125, "grad_norm_var": 0.041559855143229164, "learning_rate": 0.0001, "loss": 2.915, "loss/crossentropy": 2.0136570632457733, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.18345557600259782, "loss/reg": 0.0, "step": 55430 }, { "epoch": 0.36473684210526314, "grad_norm": 2.359375, "grad_norm_var": 0.03616129557291667, "learning_rate": 0.0001, "loss": 2.8798, "loss/crossentropy": 2.1780881524085998, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.1953047089278698, "loss/reg": 0.0, "step": 55440 }, { "epoch": 0.3648026315789474, "grad_norm": 2.125, "grad_norm_var": 0.03242899576822917, "learning_rate": 0.0001, "loss": 2.8431, "loss/crossentropy": 2.0196262121200563, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.17043603211641312, "loss/reg": 0.0, "step": 55450 }, { "epoch": 0.36486842105263156, "grad_norm": 2.65625, "grad_norm_var": 0.05640360514322917, "learning_rate": 0.0001, "loss": 2.93, "loss/crossentropy": 2.1954982399940492, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.2590381532907486, "loss/reg": 0.0, "step": 55460 }, { "epoch": 0.3649342105263158, "grad_norm": 3.03125, "grad_norm_var": 0.08671773274739583, "learning_rate": 0.0001, "loss": 2.8999, "loss/crossentropy": 2.331378698348999, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.22333091497421265, "loss/reg": 0.0, "step": 55470 }, { "epoch": 0.365, "grad_norm": 2.15625, "grad_norm_var": 0.09843724568684896, "learning_rate": 0.0001, "loss": 2.8263, "loss/crossentropy": 2.254844236373901, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.20436550229787825, "loss/reg": 0.0, "step": 55480 }, { "epoch": 0.36506578947368423, "grad_norm": 2.234375, "grad_norm_var": 0.018977864583333334, "learning_rate": 0.0001, "loss": 2.8473, "loss/crossentropy": 2.2343080043792725, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.22345256060361862, "loss/reg": 0.0, "step": 55490 }, { "epoch": 0.3651315789473684, "grad_norm": 2.828125, "grad_norm_var": 0.03528544108072917, "learning_rate": 0.0001, "loss": 2.9747, "loss/crossentropy": 2.448220360279083, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.19834974110126496, "loss/reg": 0.0, "step": 55500 }, { "epoch": 0.3651973684210526, "grad_norm": 2.40625, "grad_norm_var": 0.04527587890625, "learning_rate": 0.0001, "loss": 2.9568, "loss/crossentropy": 2.4422495126724244, "loss/hidden": 2.578125, "loss/incoh": 0.0, "loss/logits": 0.2174215093255043, "loss/reg": 0.0, "step": 55510 }, { "epoch": 0.36526315789473685, "grad_norm": 2.546875, "grad_norm_var": 2.7708132757080966e+17, "learning_rate": 0.0001, "loss": 3.022, "loss/crossentropy": 2.2436477184295653, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.18428198769688606, "loss/reg": 0.0, "step": 55520 }, { "epoch": 0.36532894736842103, "grad_norm": 2.28125, "grad_norm_var": 2.7708132757876026e+17, "learning_rate": 0.0001, "loss": 2.8313, "loss/crossentropy": 2.3551650166511537, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.19888587146997452, "loss/reg": 0.0, "step": 55530 }, { "epoch": 0.36539473684210527, "grad_norm": 2.171875, "grad_norm_var": 0.041031901041666666, "learning_rate": 0.0001, "loss": 2.8825, "loss/crossentropy": 2.0016204953193664, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.20748164504766464, "loss/reg": 0.0, "step": 55540 }, { "epoch": 0.36546052631578946, "grad_norm": 2.40625, "grad_norm_var": 0.013765462239583333, "learning_rate": 0.0001, "loss": 2.9179, "loss/crossentropy": 2.3773271679878234, "loss/hidden": 2.9890625, "loss/incoh": 0.0, "loss/logits": 0.26501452922821045, "loss/reg": 0.0, "step": 55550 }, { "epoch": 0.3655263157894737, "grad_norm": 3.328125, "grad_norm_var": 2.1480498905911242e+17, "learning_rate": 0.0001, "loss": 3.0604, "loss/crossentropy": 2.2850507616996767, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.26486287713050843, "loss/reg": 0.0, "step": 55560 }, { "epoch": 0.3655921052631579, "grad_norm": 2.53125, "grad_norm_var": 2.1480498906104355e+17, "learning_rate": 0.0001, "loss": 2.8347, "loss/crossentropy": 2.175314021110535, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.19424628019332885, "loss/reg": 0.0, "step": 55570 }, { "epoch": 0.3656578947368421, "grad_norm": 2.109375, "grad_norm_var": 0.0807525634765625, "learning_rate": 0.0001, "loss": 2.9181, "loss/crossentropy": 2.3080334305763244, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.25163050144910815, "loss/reg": 0.0, "step": 55580 }, { "epoch": 0.3657236842105263, "grad_norm": 2.28125, "grad_norm_var": 0.06670633951822917, "learning_rate": 0.0001, "loss": 2.8901, "loss/crossentropy": 2.1379910469055177, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.22228961661458016, "loss/reg": 0.0, "step": 55590 }, { "epoch": 0.36578947368421055, "grad_norm": 2.6875, "grad_norm_var": 0.0214752197265625, "learning_rate": 0.0001, "loss": 2.8615, "loss/crossentropy": 2.3319657921791075, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.19814871698617936, "loss/reg": 0.0, "step": 55600 }, { "epoch": 0.36585526315789474, "grad_norm": 2.296875, "grad_norm_var": 0.04506734212239583, "learning_rate": 0.0001, "loss": 2.9245, "loss/crossentropy": 2.020485508441925, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.19996123462915422, "loss/reg": 0.0, "step": 55610 }, { "epoch": 0.3659210526315789, "grad_norm": 2.203125, "grad_norm_var": 0.04833577473958333, "learning_rate": 0.0001, "loss": 2.8741, "loss/crossentropy": 2.3666773557662966, "loss/hidden": 2.5578125, "loss/incoh": 0.0, "loss/logits": 0.18183380663394927, "loss/reg": 0.0, "step": 55620 }, { "epoch": 0.36598684210526317, "grad_norm": 2.109375, "grad_norm_var": 0.03404922485351562, "learning_rate": 0.0001, "loss": 2.8122, "loss/crossentropy": 2.2570203065872194, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.2110057607293129, "loss/reg": 0.0, "step": 55630 }, { "epoch": 0.36605263157894735, "grad_norm": 2.546875, "grad_norm_var": 0.11417617797851562, "learning_rate": 0.0001, "loss": 2.9211, "loss/crossentropy": 2.1231314063072206, "loss/hidden": 2.6859375, "loss/incoh": 0.0, "loss/logits": 0.19692023247480392, "loss/reg": 0.0, "step": 55640 }, { "epoch": 0.3661184210526316, "grad_norm": 2.359375, "grad_norm_var": 0.0758697509765625, "learning_rate": 0.0001, "loss": 2.9723, "loss/crossentropy": 2.17354953289032, "loss/hidden": 3.0046875, "loss/incoh": 0.0, "loss/logits": 0.2730823040008545, "loss/reg": 0.0, "step": 55650 }, { "epoch": 0.3661842105263158, "grad_norm": 3.0625, "grad_norm_var": 0.29600321451822914, "learning_rate": 0.0001, "loss": 3.0, "loss/crossentropy": 2.3858034133911135, "loss/hidden": 2.5625, "loss/incoh": 0.0, "loss/logits": 0.19508714079856873, "loss/reg": 0.0, "step": 55660 }, { "epoch": 0.36625, "grad_norm": 2.390625, "grad_norm_var": 0.34434305826822914, "learning_rate": 0.0001, "loss": 2.8176, "loss/crossentropy": 2.3246055483818053, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.20067979842424394, "loss/reg": 0.0, "step": 55670 }, { "epoch": 0.3663157894736842, "grad_norm": 2.234375, "grad_norm_var": 0.08699442545572916, "learning_rate": 0.0001, "loss": 2.8911, "loss/crossentropy": 2.3194999217987062, "loss/hidden": 2.5421875, "loss/incoh": 0.0, "loss/logits": 0.18766528666019439, "loss/reg": 0.0, "step": 55680 }, { "epoch": 0.36638157894736845, "grad_norm": 2.5, "grad_norm_var": 0.20039774576822916, "learning_rate": 0.0001, "loss": 2.8841, "loss/crossentropy": 2.378604805469513, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.2182111293077469, "loss/reg": 0.0, "step": 55690 }, { "epoch": 0.36644736842105263, "grad_norm": 2.296875, "grad_norm_var": 0.11477864583333333, "learning_rate": 0.0001, "loss": 2.8291, "loss/crossentropy": 2.444898080825806, "loss/hidden": 2.5421875, "loss/incoh": 0.0, "loss/logits": 0.20505266189575194, "loss/reg": 0.0, "step": 55700 }, { "epoch": 0.3665131578947368, "grad_norm": 2.75, "grad_norm_var": 0.1647125244140625, "learning_rate": 0.0001, "loss": 2.9405, "loss/crossentropy": 2.2699352383613585, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.2110310286283493, "loss/reg": 0.0, "step": 55710 }, { "epoch": 0.36657894736842106, "grad_norm": 2.265625, "grad_norm_var": 0.13624649047851561, "learning_rate": 0.0001, "loss": 2.85, "loss/crossentropy": 2.2756152868270876, "loss/hidden": 2.6171875, "loss/incoh": 0.0, "loss/logits": 0.21940236091613768, "loss/reg": 0.0, "step": 55720 }, { "epoch": 0.36664473684210525, "grad_norm": 2.5, "grad_norm_var": 0.1439470926920573, "learning_rate": 0.0001, "loss": 2.9558, "loss/crossentropy": 2.084648871421814, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.24087313562631607, "loss/reg": 0.0, "step": 55730 }, { "epoch": 0.3667105263157895, "grad_norm": 2.6875, "grad_norm_var": 0.1082183837890625, "learning_rate": 0.0001, "loss": 2.919, "loss/crossentropy": 2.2081058740615847, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.20752288699150084, "loss/reg": 0.0, "step": 55740 }, { "epoch": 0.3667763157894737, "grad_norm": 2.765625, "grad_norm_var": 0.07021484375, "learning_rate": 0.0001, "loss": 2.9072, "loss/crossentropy": 2.046071267127991, "loss/hidden": 2.7125, "loss/incoh": 0.0, "loss/logits": 0.21553733050823212, "loss/reg": 0.0, "step": 55750 }, { "epoch": 0.3668421052631579, "grad_norm": 2.875, "grad_norm_var": 0.11074117024739584, "learning_rate": 0.0001, "loss": 2.9575, "loss/crossentropy": 2.487001657485962, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.2093600869178772, "loss/reg": 0.0, "step": 55760 }, { "epoch": 0.3669078947368421, "grad_norm": 2.328125, "grad_norm_var": 0.06126200358072917, "learning_rate": 0.0001, "loss": 2.8344, "loss/crossentropy": 2.4812172293663024, "loss/hidden": 2.5375, "loss/incoh": 0.0, "loss/logits": 0.19371156692504882, "loss/reg": 0.0, "step": 55770 }, { "epoch": 0.36697368421052634, "grad_norm": 2.59375, "grad_norm_var": 0.0256500244140625, "learning_rate": 0.0001, "loss": 2.8839, "loss/crossentropy": 2.3962693095207213, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.20916363149881362, "loss/reg": 0.0, "step": 55780 }, { "epoch": 0.3670394736842105, "grad_norm": 2.578125, "grad_norm_var": 0.06220703125, "learning_rate": 0.0001, "loss": 2.8452, "loss/crossentropy": 2.143924903869629, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.19737162813544273, "loss/reg": 0.0, "step": 55790 }, { "epoch": 0.3671052631578947, "grad_norm": 2.234375, "grad_norm_var": 0.05098368326822917, "learning_rate": 0.0001, "loss": 2.8482, "loss/crossentropy": 2.0957778453826905, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.18370115458965303, "loss/reg": 0.0, "step": 55800 }, { "epoch": 0.36717105263157895, "grad_norm": 2.40625, "grad_norm_var": 0.0399566650390625, "learning_rate": 0.0001, "loss": 2.8905, "loss/crossentropy": 2.475993883609772, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.230997996032238, "loss/reg": 0.0, "step": 55810 }, { "epoch": 0.36723684210526314, "grad_norm": 2.421875, "grad_norm_var": 0.03612874348958333, "learning_rate": 0.0001, "loss": 2.9309, "loss/crossentropy": 2.073228323459625, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.17802632749080657, "loss/reg": 0.0, "step": 55820 }, { "epoch": 0.3673026315789474, "grad_norm": 2.203125, "grad_norm_var": 0.057861328125, "learning_rate": 0.0001, "loss": 2.9003, "loss/crossentropy": 2.2471689701080324, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.226143179833889, "loss/reg": 0.0, "step": 55830 }, { "epoch": 0.36736842105263157, "grad_norm": 2.140625, "grad_norm_var": 0.09578450520833333, "learning_rate": 0.0001, "loss": 2.9047, "loss/crossentropy": 2.496846580505371, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.21568236500024796, "loss/reg": 0.0, "step": 55840 }, { "epoch": 0.3674342105263158, "grad_norm": 2.671875, "grad_norm_var": 0.08674723307291667, "learning_rate": 0.0001, "loss": 2.8263, "loss/crossentropy": 2.3793046355247496, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.188921919465065, "loss/reg": 0.0, "step": 55850 }, { "epoch": 0.3675, "grad_norm": 4.59375, "grad_norm_var": 0.33552958170572916, "learning_rate": 0.0001, "loss": 2.8515, "loss/crossentropy": 2.127133071422577, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.20127083659172057, "loss/reg": 0.0, "step": 55860 }, { "epoch": 0.36756578947368423, "grad_norm": 2.25, "grad_norm_var": 0.40453999837239585, "learning_rate": 0.0001, "loss": 2.9088, "loss/crossentropy": 1.884271150827408, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.19949935451149942, "loss/reg": 0.0, "step": 55870 }, { "epoch": 0.3676315789473684, "grad_norm": 2.96875, "grad_norm_var": 0.08619791666666667, "learning_rate": 0.0001, "loss": 2.918, "loss/crossentropy": 2.3755285024642943, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.26497130542993547, "loss/reg": 0.0, "step": 55880 }, { "epoch": 0.3676973684210526, "grad_norm": 2.53125, "grad_norm_var": 0.05412495930989583, "learning_rate": 0.0001, "loss": 2.8465, "loss/crossentropy": 2.2545337438583375, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.18386100754141807, "loss/reg": 0.0, "step": 55890 }, { "epoch": 0.36776315789473685, "grad_norm": 2.1875, "grad_norm_var": 0.027555338541666665, "learning_rate": 0.0001, "loss": 2.8831, "loss/crossentropy": 2.244291937351227, "loss/hidden": 2.5640625, "loss/incoh": 0.0, "loss/logits": 0.19960221350193025, "loss/reg": 0.0, "step": 55900 }, { "epoch": 0.36782894736842103, "grad_norm": 2.265625, "grad_norm_var": 0.09318033854166667, "learning_rate": 0.0001, "loss": 2.8443, "loss/crossentropy": 1.9512614130973815, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.17020095884799957, "loss/reg": 0.0, "step": 55910 }, { "epoch": 0.3678947368421053, "grad_norm": 2.328125, "grad_norm_var": 0.1101226806640625, "learning_rate": 0.0001, "loss": 2.9105, "loss/crossentropy": 2.225708472728729, "loss/hidden": 2.709375, "loss/incoh": 0.0, "loss/logits": 0.21985666006803511, "loss/reg": 0.0, "step": 55920 }, { "epoch": 0.36796052631578946, "grad_norm": 2.296875, "grad_norm_var": 0.05573628743489583, "learning_rate": 0.0001, "loss": 2.9139, "loss/crossentropy": 2.4349183201789857, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.24956242889165878, "loss/reg": 0.0, "step": 55930 }, { "epoch": 0.3680263157894737, "grad_norm": 2.359375, "grad_norm_var": 0.0542633056640625, "learning_rate": 0.0001, "loss": 2.8401, "loss/crossentropy": 2.2548092007637024, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.2142183303833008, "loss/reg": 0.0, "step": 55940 }, { "epoch": 0.3680921052631579, "grad_norm": 2.453125, "grad_norm_var": 0.18992513020833332, "learning_rate": 0.0001, "loss": 2.979, "loss/crossentropy": 2.436408448219299, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.2978013798594475, "loss/reg": 0.0, "step": 55950 }, { "epoch": 0.36815789473684213, "grad_norm": 2.15625, "grad_norm_var": 0.19690348307291666, "learning_rate": 0.0001, "loss": 2.8596, "loss/crossentropy": 2.2584880113601686, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.2074647530913353, "loss/reg": 0.0, "step": 55960 }, { "epoch": 0.3682236842105263, "grad_norm": 2.390625, "grad_norm_var": 0.14939676920572917, "learning_rate": 0.0001, "loss": 2.9565, "loss/crossentropy": 2.065344452857971, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.2257506728172302, "loss/reg": 0.0, "step": 55970 }, { "epoch": 0.3682894736842105, "grad_norm": 2.578125, "grad_norm_var": 0.24550374348958334, "learning_rate": 0.0001, "loss": 2.898, "loss/crossentropy": 2.4591291189193725, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.21433401703834534, "loss/reg": 0.0, "step": 55980 }, { "epoch": 0.36835526315789474, "grad_norm": 2.421875, "grad_norm_var": 0.12206624348958334, "learning_rate": 0.0001, "loss": 2.849, "loss/crossentropy": 2.359066915512085, "loss/hidden": 2.553125, "loss/incoh": 0.0, "loss/logits": 0.19690488874912263, "loss/reg": 0.0, "step": 55990 }, { "epoch": 0.3684210526315789, "grad_norm": 2.5, "grad_norm_var": 0.02388916015625, "learning_rate": 0.0001, "loss": 2.8101, "loss/crossentropy": 2.2798381090164184, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.2134224593639374, "loss/reg": 0.0, "step": 56000 }, { "epoch": 0.36848684210526317, "grad_norm": 2.453125, "grad_norm_var": 0.63404541015625, "learning_rate": 0.0001, "loss": 2.8523, "loss/crossentropy": 2.5275405406951905, "loss/hidden": 2.503125, "loss/incoh": 0.0, "loss/logits": 0.18744342476129533, "loss/reg": 0.0, "step": 56010 }, { "epoch": 0.36855263157894735, "grad_norm": 2.640625, "grad_norm_var": 0.05663655598958333, "learning_rate": 0.0001, "loss": 2.8516, "loss/crossentropy": 2.2693510174751284, "loss/hidden": 2.746875, "loss/incoh": 0.0, "loss/logits": 0.2308935195207596, "loss/reg": 0.0, "step": 56020 }, { "epoch": 0.3686184210526316, "grad_norm": 2.296875, "grad_norm_var": 0.0590240478515625, "learning_rate": 0.0001, "loss": 2.8871, "loss/crossentropy": 1.9981503248214723, "loss/hidden": 2.503125, "loss/incoh": 0.0, "loss/logits": 0.17222280651330948, "loss/reg": 0.0, "step": 56030 }, { "epoch": 0.3686842105263158, "grad_norm": 3.3125, "grad_norm_var": 0.089990234375, "learning_rate": 0.0001, "loss": 2.9112, "loss/crossentropy": 1.935350275039673, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.20731825679540633, "loss/reg": 0.0, "step": 56040 }, { "epoch": 0.36875, "grad_norm": 2.46875, "grad_norm_var": 0.06711324055989583, "learning_rate": 0.0001, "loss": 2.8589, "loss/crossentropy": 2.5472738265991213, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.23654481619596482, "loss/reg": 0.0, "step": 56050 }, { "epoch": 0.3688157894736842, "grad_norm": 2.1875, "grad_norm_var": 0.0499664306640625, "learning_rate": 0.0001, "loss": 2.8786, "loss/crossentropy": 2.2211525797843934, "loss/hidden": 2.803125, "loss/incoh": 0.0, "loss/logits": 0.21442678272724153, "loss/reg": 0.0, "step": 56060 }, { "epoch": 0.3688815789473684, "grad_norm": 2.1875, "grad_norm_var": 0.025028483072916666, "learning_rate": 0.0001, "loss": 2.9263, "loss/crossentropy": 2.2272470712661745, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.20897928029298782, "loss/reg": 0.0, "step": 56070 }, { "epoch": 0.36894736842105263, "grad_norm": 2.4375, "grad_norm_var": 2.554429387775585e+17, "learning_rate": 0.0001, "loss": 3.014, "loss/crossentropy": 2.1030837893486023, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.21406531184911728, "loss/reg": 0.0, "step": 56080 }, { "epoch": 0.3690131578947368, "grad_norm": 2.296875, "grad_norm_var": 2.5544293879072032e+17, "learning_rate": 0.0001, "loss": 2.8319, "loss/crossentropy": 2.2578949213027952, "loss/hidden": 2.5359375, "loss/incoh": 0.0, "loss/logits": 0.20407818034291267, "loss/reg": 0.0, "step": 56090 }, { "epoch": 0.36907894736842106, "grad_norm": 2.484375, "grad_norm_var": 0.016144816080729166, "learning_rate": 0.0001, "loss": 2.8504, "loss/crossentropy": 2.104866015911102, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.19711328223347663, "loss/reg": 0.0, "step": 56100 }, { "epoch": 0.36914473684210525, "grad_norm": 2.515625, "grad_norm_var": 0.04446207682291667, "learning_rate": 0.0001, "loss": 2.9565, "loss/crossentropy": 2.3963308334350586, "loss/hidden": 2.8015625, "loss/incoh": 0.0, "loss/logits": 0.24214785248041154, "loss/reg": 0.0, "step": 56110 }, { "epoch": 0.3692105263157895, "grad_norm": 2.5625, "grad_norm_var": 0.027046712239583333, "learning_rate": 0.0001, "loss": 2.8905, "loss/crossentropy": 2.213157594203949, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.23338825702667237, "loss/reg": 0.0, "step": 56120 }, { "epoch": 0.3692763157894737, "grad_norm": 2.21875, "grad_norm_var": 0.08632710774739584, "learning_rate": 0.0001, "loss": 2.8847, "loss/crossentropy": 2.419931650161743, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.19743183255195618, "loss/reg": 0.0, "step": 56130 }, { "epoch": 0.3693421052631579, "grad_norm": 2.15625, "grad_norm_var": 0.10515034993489583, "learning_rate": 0.0001, "loss": 2.8416, "loss/crossentropy": 2.124236559867859, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.18200893253087996, "loss/reg": 0.0, "step": 56140 }, { "epoch": 0.3694078947368421, "grad_norm": 2.328125, "grad_norm_var": 0.0852691650390625, "learning_rate": 0.0001, "loss": 2.8387, "loss/crossentropy": 2.1836310505867003, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.1994611408561468, "loss/reg": 0.0, "step": 56150 }, { "epoch": 0.36947368421052634, "grad_norm": 2.65625, "grad_norm_var": 0.42205174763997394, "learning_rate": 0.0001, "loss": 2.8759, "loss/crossentropy": 2.202907645702362, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.2245052605867386, "loss/reg": 0.0, "step": 56160 }, { "epoch": 0.36953947368421053, "grad_norm": 2.984375, "grad_norm_var": 0.5005442301432291, "learning_rate": 0.0001, "loss": 2.9135, "loss/crossentropy": 2.503756415843964, "loss/hidden": 2.8421875, "loss/incoh": 0.0, "loss/logits": 0.24433262795209884, "loss/reg": 0.0, "step": 56170 }, { "epoch": 0.3696052631578947, "grad_norm": 2.34375, "grad_norm_var": 0.3016510009765625, "learning_rate": 0.0001, "loss": 2.8629, "loss/crossentropy": 2.2289433002471926, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.21338208988308907, "loss/reg": 0.0, "step": 56180 }, { "epoch": 0.36967105263157896, "grad_norm": 3.296875, "grad_norm_var": 0.11511128743489583, "learning_rate": 0.0001, "loss": 2.9274, "loss/crossentropy": 2.4364864826202393, "loss/hidden": 2.609375, "loss/incoh": 0.0, "loss/logits": 0.20529537200927733, "loss/reg": 0.0, "step": 56190 }, { "epoch": 0.36973684210526314, "grad_norm": 2.40625, "grad_norm_var": 0.2220703125, "learning_rate": 0.0001, "loss": 2.8077, "loss/crossentropy": 2.085753732919693, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.19000215604901313, "loss/reg": 0.0, "step": 56200 }, { "epoch": 0.3698026315789474, "grad_norm": 3.90625, "grad_norm_var": 0.263623046875, "learning_rate": 0.0001, "loss": 2.9222, "loss/crossentropy": 2.226399099826813, "loss/hidden": 2.928125, "loss/incoh": 0.0, "loss/logits": 0.23553618490695954, "loss/reg": 0.0, "step": 56210 }, { "epoch": 0.36986842105263157, "grad_norm": 2.34375, "grad_norm_var": 0.232763671875, "learning_rate": 0.0001, "loss": 2.8956, "loss/crossentropy": 2.0508364915847777, "loss/hidden": 2.5875, "loss/incoh": 0.0, "loss/logits": 0.20977792665362358, "loss/reg": 0.0, "step": 56220 }, { "epoch": 0.3699342105263158, "grad_norm": 2.484375, "grad_norm_var": 0.09469401041666667, "learning_rate": 0.0001, "loss": 2.8681, "loss/crossentropy": 2.504152810573578, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.2156804919242859, "loss/reg": 0.0, "step": 56230 }, { "epoch": 0.37, "grad_norm": 1.9453125, "grad_norm_var": 0.07011693318684896, "learning_rate": 0.0001, "loss": 2.892, "loss/crossentropy": 2.5594624280929565, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.2897753998637199, "loss/reg": 0.0, "step": 56240 }, { "epoch": 0.37006578947368424, "grad_norm": 2.578125, "grad_norm_var": 0.053529612223307294, "learning_rate": 0.0001, "loss": 2.8762, "loss/crossentropy": 2.160356116294861, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.2003995805978775, "loss/reg": 0.0, "step": 56250 }, { "epoch": 0.3701315789473684, "grad_norm": 2.6875, "grad_norm_var": 0.09534403483072916, "learning_rate": 0.0001, "loss": 2.8846, "loss/crossentropy": 2.4481847405433657, "loss/hidden": 2.6, "loss/incoh": 0.0, "loss/logits": 0.21351269409060478, "loss/reg": 0.0, "step": 56260 }, { "epoch": 0.3701973684210526, "grad_norm": 1.921875, "grad_norm_var": 0.11350911458333333, "learning_rate": 0.0001, "loss": 2.8027, "loss/crossentropy": 2.1167102456092834, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.21160330399870872, "loss/reg": 0.0, "step": 56270 }, { "epoch": 0.37026315789473685, "grad_norm": 2.390625, "grad_norm_var": 0.8657135009765625, "learning_rate": 0.0001, "loss": 2.8883, "loss/crossentropy": 2.3861053586006165, "loss/hidden": 2.5671875, "loss/incoh": 0.0, "loss/logits": 0.19094537645578386, "loss/reg": 0.0, "step": 56280 }, { "epoch": 0.37032894736842104, "grad_norm": 2.390625, "grad_norm_var": 0.7498006184895833, "learning_rate": 0.0001, "loss": 2.9279, "loss/crossentropy": 2.130525827407837, "loss/hidden": 2.609375, "loss/incoh": 0.0, "loss/logits": 0.20939616709947587, "loss/reg": 0.0, "step": 56290 }, { "epoch": 0.3703947368421053, "grad_norm": 2.09375, "grad_norm_var": 0.0849273681640625, "learning_rate": 0.0001, "loss": 2.8863, "loss/crossentropy": 2.3800536513328554, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2418698325753212, "loss/reg": 0.0, "step": 56300 }, { "epoch": 0.37046052631578946, "grad_norm": 2.453125, "grad_norm_var": 0.15541890462239583, "learning_rate": 0.0001, "loss": 2.9121, "loss/crossentropy": 2.30133570432663, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.22292252257466316, "loss/reg": 0.0, "step": 56310 }, { "epoch": 0.3705263157894737, "grad_norm": 3.359375, "grad_norm_var": 0.22711588541666666, "learning_rate": 0.0001, "loss": 2.8896, "loss/crossentropy": 2.518061971664429, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.21759434491395951, "loss/reg": 0.0, "step": 56320 }, { "epoch": 0.3705921052631579, "grad_norm": 2.625, "grad_norm_var": 0.47610270182291664, "learning_rate": 0.0001, "loss": 2.9005, "loss/crossentropy": 2.521447944641113, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.20121440291404724, "loss/reg": 0.0, "step": 56330 }, { "epoch": 0.37065789473684213, "grad_norm": 2.140625, "grad_norm_var": 0.04025065104166667, "learning_rate": 0.0001, "loss": 2.8534, "loss/crossentropy": 2.415323495864868, "loss/hidden": 2.6203125, "loss/incoh": 0.0, "loss/logits": 0.21512671411037446, "loss/reg": 0.0, "step": 56340 }, { "epoch": 0.3707236842105263, "grad_norm": 2.671875, "grad_norm_var": 0.22520243326822917, "learning_rate": 0.0001, "loss": 2.8157, "loss/crossentropy": 2.433765137195587, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.19766139909625052, "loss/reg": 0.0, "step": 56350 }, { "epoch": 0.3707894736842105, "grad_norm": 2.34375, "grad_norm_var": 0.2751953125, "learning_rate": 0.0001, "loss": 2.8304, "loss/crossentropy": 2.1715299487113953, "loss/hidden": 2.771875, "loss/incoh": 0.0, "loss/logits": 0.21264228820800782, "loss/reg": 0.0, "step": 56360 }, { "epoch": 0.37085526315789474, "grad_norm": 2.703125, "grad_norm_var": 0.10632222493489583, "learning_rate": 0.0001, "loss": 2.8743, "loss/crossentropy": 2.1166147589683533, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.18992169350385665, "loss/reg": 0.0, "step": 56370 }, { "epoch": 0.37092105263157893, "grad_norm": 2.015625, "grad_norm_var": 0.14247639973958334, "learning_rate": 0.0001, "loss": 2.8239, "loss/crossentropy": 1.8889483988285065, "loss/hidden": 2.546875, "loss/incoh": 0.0, "loss/logits": 0.18174538463354112, "loss/reg": 0.0, "step": 56380 }, { "epoch": 0.37098684210526317, "grad_norm": 2.59375, "grad_norm_var": 0.04908447265625, "learning_rate": 0.0001, "loss": 2.8156, "loss/crossentropy": 2.285055708885193, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.18968597277998925, "loss/reg": 0.0, "step": 56390 }, { "epoch": 0.37105263157894736, "grad_norm": 2.578125, "grad_norm_var": 0.036421712239583334, "learning_rate": 0.0001, "loss": 2.8622, "loss/crossentropy": 2.393838131427765, "loss/hidden": 2.4984375, "loss/incoh": 0.0, "loss/logits": 0.18481413125991822, "loss/reg": 0.0, "step": 56400 }, { "epoch": 0.3711184210526316, "grad_norm": 2.421875, "grad_norm_var": 0.0473052978515625, "learning_rate": 0.0001, "loss": 2.828, "loss/crossentropy": 2.1886092901229857, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.21773164272308348, "loss/reg": 0.0, "step": 56410 }, { "epoch": 0.3711842105263158, "grad_norm": 2.390625, "grad_norm_var": 0.14888916015625, "learning_rate": 0.0001, "loss": 2.88, "loss/crossentropy": 2.219276762008667, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.2751802295446396, "loss/reg": 0.0, "step": 56420 }, { "epoch": 0.37125, "grad_norm": 2.90625, "grad_norm_var": 0.13599344889322917, "learning_rate": 0.0001, "loss": 2.8573, "loss/crossentropy": 2.340095782279968, "loss/hidden": 2.5671875, "loss/incoh": 0.0, "loss/logits": 0.20732717737555503, "loss/reg": 0.0, "step": 56430 }, { "epoch": 0.3713157894736842, "grad_norm": 2.5, "grad_norm_var": 0.0435943603515625, "learning_rate": 0.0001, "loss": 2.8414, "loss/crossentropy": 2.1741693377494813, "loss/hidden": 2.5515625, "loss/incoh": 0.0, "loss/logits": 0.189312943816185, "loss/reg": 0.0, "step": 56440 }, { "epoch": 0.3713815789473684, "grad_norm": 2.21875, "grad_norm_var": 0.04761454264322917, "learning_rate": 0.0001, "loss": 2.8697, "loss/crossentropy": 1.8752413094043732, "loss/hidden": 2.815625, "loss/incoh": 0.0, "loss/logits": 0.2650171026587486, "loss/reg": 0.0, "step": 56450 }, { "epoch": 0.37144736842105264, "grad_norm": 2.3125, "grad_norm_var": 0.029911295572916666, "learning_rate": 0.0001, "loss": 2.8349, "loss/crossentropy": 2.239310359954834, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.1906931981444359, "loss/reg": 0.0, "step": 56460 }, { "epoch": 0.3715131578947368, "grad_norm": 2.375, "grad_norm_var": 0.032059733072916666, "learning_rate": 0.0001, "loss": 2.8999, "loss/crossentropy": 2.1321326434612273, "loss/hidden": 2.5734375, "loss/incoh": 0.0, "loss/logits": 0.18646096996963024, "loss/reg": 0.0, "step": 56470 }, { "epoch": 0.37157894736842106, "grad_norm": 2.375, "grad_norm_var": 11.748501586914063, "learning_rate": 0.0001, "loss": 2.974, "loss/crossentropy": 1.998432731628418, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.2337036356329918, "loss/reg": 0.0, "step": 56480 }, { "epoch": 0.37164473684210525, "grad_norm": 2.65625, "grad_norm_var": 12.70582275390625, "learning_rate": 0.0001, "loss": 2.8539, "loss/crossentropy": 2.360438823699951, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.23270124346017837, "loss/reg": 0.0, "step": 56490 }, { "epoch": 0.3717105263157895, "grad_norm": 2.40625, "grad_norm_var": 0.05370686848958333, "learning_rate": 0.0001, "loss": 2.8411, "loss/crossentropy": 2.4145756483078005, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.20814784467220307, "loss/reg": 0.0, "step": 56500 }, { "epoch": 0.3717763157894737, "grad_norm": 2.390625, "grad_norm_var": 0.038037109375, "learning_rate": 0.0001, "loss": 2.899, "loss/crossentropy": 2.4294058799743654, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.21973807960748673, "loss/reg": 0.0, "step": 56510 }, { "epoch": 0.3718421052631579, "grad_norm": 2.1875, "grad_norm_var": 0.019742838541666665, "learning_rate": 0.0001, "loss": 2.8565, "loss/crossentropy": 2.259233558177948, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.2094275511801243, "loss/reg": 0.0, "step": 56520 }, { "epoch": 0.3719078947368421, "grad_norm": 2.65625, "grad_norm_var": 0.04120992024739583, "learning_rate": 0.0001, "loss": 2.848, "loss/crossentropy": 2.248432421684265, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.1921083763241768, "loss/reg": 0.0, "step": 56530 }, { "epoch": 0.3719736842105263, "grad_norm": 2.640625, "grad_norm_var": 0.19274800618489582, "learning_rate": 0.0001, "loss": 2.9243, "loss/crossentropy": 2.3464464426040648, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.23956212699413298, "loss/reg": 0.0, "step": 56540 }, { "epoch": 0.37203947368421053, "grad_norm": 2.328125, "grad_norm_var": 0.047972615559895834, "learning_rate": 0.0001, "loss": 2.8198, "loss/crossentropy": 2.1600219368934632, "loss/hidden": 2.58125, "loss/incoh": 0.0, "loss/logits": 0.18259955048561097, "loss/reg": 0.0, "step": 56550 }, { "epoch": 0.3721052631578947, "grad_norm": 2.34375, "grad_norm_var": 0.15439351399739584, "learning_rate": 0.0001, "loss": 2.9134, "loss/crossentropy": 2.3626832604408263, "loss/hidden": 2.5453125, "loss/incoh": 0.0, "loss/logits": 0.20156004875898362, "loss/reg": 0.0, "step": 56560 }, { "epoch": 0.37217105263157896, "grad_norm": 2.640625, "grad_norm_var": 0.1386138916015625, "learning_rate": 0.0001, "loss": 2.8438, "loss/crossentropy": 2.286113452911377, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.20848258882761, "loss/reg": 0.0, "step": 56570 }, { "epoch": 0.37223684210526314, "grad_norm": 2.546875, "grad_norm_var": 0.09574356079101562, "learning_rate": 0.0001, "loss": 2.8856, "loss/crossentropy": 2.2616809129714968, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.20096972435712815, "loss/reg": 0.0, "step": 56580 }, { "epoch": 0.3723026315789474, "grad_norm": 2.609375, "grad_norm_var": 0.09086685180664063, "learning_rate": 0.0001, "loss": 2.8518, "loss/crossentropy": 2.3883530616760256, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.20202470570802689, "loss/reg": 0.0, "step": 56590 }, { "epoch": 0.37236842105263157, "grad_norm": 2.5, "grad_norm_var": 0.06587626139322916, "learning_rate": 0.0001, "loss": 2.8389, "loss/crossentropy": 2.1587519526481627, "loss/hidden": 2.5703125, "loss/incoh": 0.0, "loss/logits": 0.19255940914154052, "loss/reg": 0.0, "step": 56600 }, { "epoch": 0.3724342105263158, "grad_norm": 2.265625, "grad_norm_var": 0.09870503743489584, "learning_rate": 0.0001, "loss": 2.9084, "loss/crossentropy": 2.236282777786255, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.1922200232744217, "loss/reg": 0.0, "step": 56610 }, { "epoch": 0.3725, "grad_norm": 2.1875, "grad_norm_var": 0.21901041666666668, "learning_rate": 0.0001, "loss": 2.8572, "loss/crossentropy": 2.301276648044586, "loss/hidden": 2.475, "loss/incoh": 0.0, "loss/logits": 0.18458504751324653, "loss/reg": 0.0, "step": 56620 }, { "epoch": 0.3725657894736842, "grad_norm": 2.25, "grad_norm_var": 0.16633199055989584, "learning_rate": 0.0001, "loss": 2.9251, "loss/crossentropy": 2.188789498806, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.2017750173807144, "loss/reg": 0.0, "step": 56630 }, { "epoch": 0.3726315789473684, "grad_norm": 2.265625, "grad_norm_var": 1.2013671875, "learning_rate": 0.0001, "loss": 2.9358, "loss/crossentropy": 2.2693992137908934, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.20554796308279039, "loss/reg": 0.0, "step": 56640 }, { "epoch": 0.3726973684210526, "grad_norm": 2.71875, "grad_norm_var": 0.054352823893229166, "learning_rate": 0.0001, "loss": 2.9736, "loss/crossentropy": 2.0425461411476133, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.2311855934560299, "loss/reg": 0.0, "step": 56650 }, { "epoch": 0.37276315789473685, "grad_norm": 2.734375, "grad_norm_var": 0.11956278483072917, "learning_rate": 0.0001, "loss": 2.8943, "loss/crossentropy": 2.191503369808197, "loss/hidden": 2.7390625, "loss/incoh": 0.0, "loss/logits": 0.19015718400478362, "loss/reg": 0.0, "step": 56660 }, { "epoch": 0.37282894736842104, "grad_norm": 2.796875, "grad_norm_var": 0.05253804524739583, "learning_rate": 0.0001, "loss": 2.9146, "loss/crossentropy": 2.215645444393158, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.20529815554618835, "loss/reg": 0.0, "step": 56670 }, { "epoch": 0.3728947368421053, "grad_norm": 2.53125, "grad_norm_var": 0.0594635009765625, "learning_rate": 0.0001, "loss": 2.9074, "loss/crossentropy": 2.316050183773041, "loss/hidden": 2.7703125, "loss/incoh": 0.0, "loss/logits": 0.22227884978055953, "loss/reg": 0.0, "step": 56680 }, { "epoch": 0.37296052631578946, "grad_norm": 2.421875, "grad_norm_var": 0.051545206705729166, "learning_rate": 0.0001, "loss": 2.8205, "loss/crossentropy": 2.269527292251587, "loss/hidden": 2.534375, "loss/incoh": 0.0, "loss/logits": 0.19350815266370774, "loss/reg": 0.0, "step": 56690 }, { "epoch": 0.3730263157894737, "grad_norm": 2.390625, "grad_norm_var": 0.01597900390625, "learning_rate": 0.0001, "loss": 2.9038, "loss/crossentropy": 2.39568293094635, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.2342778816819191, "loss/reg": 0.0, "step": 56700 }, { "epoch": 0.3730921052631579, "grad_norm": 2.4375, "grad_norm_var": 0.025093587239583333, "learning_rate": 0.0001, "loss": 2.8863, "loss/crossentropy": 2.372193419933319, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.2434217870235443, "loss/reg": 0.0, "step": 56710 }, { "epoch": 0.37315789473684213, "grad_norm": 2.296875, "grad_norm_var": 0.1035797119140625, "learning_rate": 0.0001, "loss": 2.9278, "loss/crossentropy": 2.294344997406006, "loss/hidden": 2.9703125, "loss/incoh": 0.0, "loss/logits": 0.24027424603700637, "loss/reg": 0.0, "step": 56720 }, { "epoch": 0.3732236842105263, "grad_norm": 2.65625, "grad_norm_var": 0.0552642822265625, "learning_rate": 0.0001, "loss": 2.8974, "loss/crossentropy": 2.4375629901885985, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.2196996957063675, "loss/reg": 0.0, "step": 56730 }, { "epoch": 0.3732894736842105, "grad_norm": 2.15625, "grad_norm_var": 0.03609110514322917, "learning_rate": 0.0001, "loss": 2.8632, "loss/crossentropy": 2.327542555332184, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.19565554857254028, "loss/reg": 0.0, "step": 56740 }, { "epoch": 0.37335526315789475, "grad_norm": 2.375, "grad_norm_var": 0.0575347900390625, "learning_rate": 0.0001, "loss": 2.8994, "loss/crossentropy": 2.2621647357940673, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.19066521376371384, "loss/reg": 0.0, "step": 56750 }, { "epoch": 0.37342105263157893, "grad_norm": 2.28125, "grad_norm_var": 0.05546773274739583, "learning_rate": 0.0001, "loss": 2.8833, "loss/crossentropy": 2.390567684173584, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.2222500339150429, "loss/reg": 0.0, "step": 56760 }, { "epoch": 0.3734868421052632, "grad_norm": 2.5625, "grad_norm_var": 0.0617340087890625, "learning_rate": 0.0001, "loss": 2.8198, "loss/crossentropy": 2.280630886554718, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.22925344333052636, "loss/reg": 0.0, "step": 56770 }, { "epoch": 0.37355263157894736, "grad_norm": 2.5, "grad_norm_var": 0.05921122233072917, "learning_rate": 0.0001, "loss": 2.8581, "loss/crossentropy": 2.116881287097931, "loss/hidden": 2.86875, "loss/incoh": 0.0, "loss/logits": 0.20865375101566314, "loss/reg": 0.0, "step": 56780 }, { "epoch": 0.3736184210526316, "grad_norm": 2.328125, "grad_norm_var": 0.07974853515625, "learning_rate": 0.0001, "loss": 2.8946, "loss/crossentropy": 2.1903440594673156, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.20317945033311843, "loss/reg": 0.0, "step": 56790 }, { "epoch": 0.3736842105263158, "grad_norm": 2.578125, "grad_norm_var": 0.0582183837890625, "learning_rate": 0.0001, "loss": 2.9322, "loss/crossentropy": 2.1702419996261595, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.23167342841625213, "loss/reg": 0.0, "step": 56800 }, { "epoch": 0.37375, "grad_norm": 2.671875, "grad_norm_var": 0.06951395670572917, "learning_rate": 0.0001, "loss": 2.9226, "loss/crossentropy": 1.9715431571006774, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.19939200058579445, "loss/reg": 0.0, "step": 56810 }, { "epoch": 0.3738157894736842, "grad_norm": 2.375, "grad_norm_var": 0.15224202473958334, "learning_rate": 0.0001, "loss": 2.9183, "loss/crossentropy": 2.4031955718994142, "loss/hidden": 2.5984375, "loss/incoh": 0.0, "loss/logits": 0.1945131815969944, "loss/reg": 0.0, "step": 56820 }, { "epoch": 0.3738815789473684, "grad_norm": 2.28125, "grad_norm_var": 0.1311920166015625, "learning_rate": 0.0001, "loss": 2.8041, "loss/crossentropy": 2.098873472213745, "loss/hidden": 2.6046875, "loss/incoh": 0.0, "loss/logits": 0.18983764871954917, "loss/reg": 0.0, "step": 56830 }, { "epoch": 0.37394736842105264, "grad_norm": 2.40625, "grad_norm_var": 0.041731770833333334, "learning_rate": 0.0001, "loss": 2.8921, "loss/crossentropy": 2.295369052886963, "loss/hidden": 2.7625, "loss/incoh": 0.0, "loss/logits": 0.2172948494553566, "loss/reg": 0.0, "step": 56840 }, { "epoch": 0.3740131578947368, "grad_norm": 2.21875, "grad_norm_var": 0.03528544108072917, "learning_rate": 0.0001, "loss": 2.8304, "loss/crossentropy": 1.9679815530776978, "loss/hidden": 2.5640625, "loss/incoh": 0.0, "loss/logits": 0.159686116874218, "loss/reg": 0.0, "step": 56850 }, { "epoch": 0.37407894736842107, "grad_norm": 3.03125, "grad_norm_var": 0.0588531494140625, "learning_rate": 0.0001, "loss": 2.8491, "loss/crossentropy": 2.3509429335594176, "loss/hidden": 2.734375, "loss/incoh": 0.0, "loss/logits": 0.2139374390244484, "loss/reg": 0.0, "step": 56860 }, { "epoch": 0.37414473684210525, "grad_norm": 2.4375, "grad_norm_var": 0.06318359375, "learning_rate": 0.0001, "loss": 2.9699, "loss/crossentropy": 2.3131859242916106, "loss/hidden": 2.6640625, "loss/incoh": 0.0, "loss/logits": 0.21008679121732712, "loss/reg": 0.0, "step": 56870 }, { "epoch": 0.3742105263157895, "grad_norm": 2.203125, "grad_norm_var": 0.04301656087239583, "learning_rate": 0.0001, "loss": 2.8719, "loss/crossentropy": 1.9247263312339782, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.1960112929344177, "loss/reg": 0.0, "step": 56880 }, { "epoch": 0.3742763157894737, "grad_norm": 2.40625, "grad_norm_var": 0.04588114420572917, "learning_rate": 0.0001, "loss": 2.7763, "loss/crossentropy": 2.3352128505706786, "loss/hidden": 2.5, "loss/incoh": 0.0, "loss/logits": 0.18376404345035552, "loss/reg": 0.0, "step": 56890 }, { "epoch": 0.3743421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.05567118326822917, "learning_rate": 0.0001, "loss": 2.8617, "loss/crossentropy": 2.417633366584778, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.21791061758995056, "loss/reg": 0.0, "step": 56900 }, { "epoch": 0.3744078947368421, "grad_norm": 2.265625, "grad_norm_var": 0.48972981770833335, "learning_rate": 0.0001, "loss": 2.9731, "loss/crossentropy": 2.224759554862976, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.23550959825515747, "loss/reg": 0.0, "step": 56910 }, { "epoch": 0.3744736842105263, "grad_norm": 2.515625, "grad_norm_var": 0.05266520182291667, "learning_rate": 0.0001, "loss": 2.8176, "loss/crossentropy": 2.177126681804657, "loss/hidden": 2.646875, "loss/incoh": 0.0, "loss/logits": 0.18674521297216415, "loss/reg": 0.0, "step": 56920 }, { "epoch": 0.37453947368421053, "grad_norm": 2.78125, "grad_norm_var": 0.08121744791666667, "learning_rate": 0.0001, "loss": 2.9002, "loss/crossentropy": 2.5103124737739564, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.20279947817325591, "loss/reg": 0.0, "step": 56930 }, { "epoch": 0.3746052631578947, "grad_norm": 2.234375, "grad_norm_var": 0.1398101806640625, "learning_rate": 0.0001, "loss": 2.9003, "loss/crossentropy": 2.4166292428970335, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.21829064041376114, "loss/reg": 0.0, "step": 56940 }, { "epoch": 0.37467105263157896, "grad_norm": 2.109375, "grad_norm_var": 0.146875, "learning_rate": 0.0001, "loss": 2.8984, "loss/crossentropy": 2.413494849205017, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.21194676160812378, "loss/reg": 0.0, "step": 56950 }, { "epoch": 0.37473684210526315, "grad_norm": 2.609375, "grad_norm_var": 0.04224853515625, "learning_rate": 0.0001, "loss": 2.9648, "loss/crossentropy": 2.335786283016205, "loss/hidden": 2.8921875, "loss/incoh": 0.0, "loss/logits": 0.3038992017507553, "loss/reg": 0.0, "step": 56960 }, { "epoch": 0.3748026315789474, "grad_norm": 2.46875, "grad_norm_var": 0.10789388020833333, "learning_rate": 0.0001, "loss": 2.8552, "loss/crossentropy": 2.1228857159614565, "loss/hidden": 2.684375, "loss/incoh": 0.0, "loss/logits": 0.20292344838380813, "loss/reg": 0.0, "step": 56970 }, { "epoch": 0.3748684210526316, "grad_norm": 2.25, "grad_norm_var": 0.1605621337890625, "learning_rate": 0.0001, "loss": 2.9114, "loss/crossentropy": 2.419293451309204, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.2179285317659378, "loss/reg": 0.0, "step": 56980 }, { "epoch": 0.3749342105263158, "grad_norm": 2.609375, "grad_norm_var": 0.06253255208333333, "learning_rate": 0.0001, "loss": 2.8956, "loss/crossentropy": 2.474664807319641, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.23135377764701842, "loss/reg": 0.0, "step": 56990 }, { "epoch": 0.375, "grad_norm": 2.453125, "grad_norm_var": 0.03902587890625, "learning_rate": 0.0001, "loss": 2.8323, "loss/crossentropy": 2.3062317609786986, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.20599044114351273, "loss/reg": 0.0, "step": 57000 }, { "epoch": 0.3750657894736842, "grad_norm": 2.40625, "grad_norm_var": 0.029964192708333334, "learning_rate": 0.0001, "loss": 2.8818, "loss/crossentropy": 2.22975172996521, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.2104538708925247, "loss/reg": 0.0, "step": 57010 }, { "epoch": 0.3751315789473684, "grad_norm": 2.1875, "grad_norm_var": 0.0364410400390625, "learning_rate": 0.0001, "loss": 2.8603, "loss/crossentropy": 2.3365495681762694, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.1614990785717964, "loss/reg": 0.0, "step": 57020 }, { "epoch": 0.3751973684210526, "grad_norm": 2.0625, "grad_norm_var": 0.042723592122395834, "learning_rate": 0.0001, "loss": 2.8745, "loss/crossentropy": 2.2537218809127806, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.2062041662633419, "loss/reg": 0.0, "step": 57030 }, { "epoch": 0.37526315789473685, "grad_norm": 2.15625, "grad_norm_var": 0.44221089680989584, "learning_rate": 0.0001, "loss": 2.8918, "loss/crossentropy": 2.219492554664612, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.20168758630752565, "loss/reg": 0.0, "step": 57040 }, { "epoch": 0.37532894736842104, "grad_norm": 2.484375, "grad_norm_var": 0.4313629150390625, "learning_rate": 0.0001, "loss": 2.8841, "loss/crossentropy": 2.112499403953552, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.20912077128887177, "loss/reg": 0.0, "step": 57050 }, { "epoch": 0.3753947368421053, "grad_norm": 2.46875, "grad_norm_var": 0.03145523071289062, "learning_rate": 0.0001, "loss": 2.8444, "loss/crossentropy": 2.5125695943832396, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.20429453998804092, "loss/reg": 0.0, "step": 57060 }, { "epoch": 0.37546052631578947, "grad_norm": 3.578125, "grad_norm_var": 0.12615331013997397, "learning_rate": 0.0001, "loss": 2.9171, "loss/crossentropy": 2.1103898167610167, "loss/hidden": 2.753125, "loss/incoh": 0.0, "loss/logits": 0.25469849035143854, "loss/reg": 0.0, "step": 57070 }, { "epoch": 0.3755263157894737, "grad_norm": 2.453125, "grad_norm_var": 0.5926096598307292, "learning_rate": 0.0001, "loss": 2.8713, "loss/crossentropy": 2.270982003211975, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.20489346086978913, "loss/reg": 0.0, "step": 57080 }, { "epoch": 0.3755921052631579, "grad_norm": 2.296875, "grad_norm_var": 0.5684855143229167, "learning_rate": 0.0001, "loss": 2.8344, "loss/crossentropy": 2.5241758704185484, "loss/hidden": 2.56875, "loss/incoh": 0.0, "loss/logits": 0.20988865047693253, "loss/reg": 0.0, "step": 57090 }, { "epoch": 0.3756578947368421, "grad_norm": 2.5625, "grad_norm_var": 0.344287109375, "learning_rate": 0.0001, "loss": 2.9554, "loss/crossentropy": 2.1851680397987367, "loss/hidden": 3.0140625, "loss/incoh": 0.0, "loss/logits": 0.3750660873949528, "loss/reg": 0.0, "step": 57100 }, { "epoch": 0.3757236842105263, "grad_norm": 2.4375, "grad_norm_var": 0.32710673014322916, "learning_rate": 0.0001, "loss": 2.9302, "loss/crossentropy": 2.382238245010376, "loss/hidden": 2.5765625, "loss/incoh": 0.0, "loss/logits": 0.2048749327659607, "loss/reg": 0.0, "step": 57110 }, { "epoch": 0.3757894736842105, "grad_norm": 2.15625, "grad_norm_var": 0.041178385416666664, "learning_rate": 0.0001, "loss": 2.8564, "loss/crossentropy": 2.2744884967803953, "loss/hidden": 2.6546875, "loss/incoh": 0.0, "loss/logits": 0.2040935844182968, "loss/reg": 0.0, "step": 57120 }, { "epoch": 0.37585526315789475, "grad_norm": 2.5, "grad_norm_var": 1.3599599202473958, "learning_rate": 0.0001, "loss": 2.9432, "loss/crossentropy": 2.391104209423065, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2660202234983444, "loss/reg": 0.0, "step": 57130 }, { "epoch": 0.37592105263157893, "grad_norm": 2.078125, "grad_norm_var": 0.04212137858072917, "learning_rate": 0.0001, "loss": 2.8597, "loss/crossentropy": 2.391116988658905, "loss/hidden": 2.5453125, "loss/incoh": 0.0, "loss/logits": 0.187996444106102, "loss/reg": 0.0, "step": 57140 }, { "epoch": 0.3759868421052632, "grad_norm": 2.09375, "grad_norm_var": 0.036839803059895836, "learning_rate": 0.0001, "loss": 2.8444, "loss/crossentropy": 2.3998670101165773, "loss/hidden": 2.4796875, "loss/incoh": 0.0, "loss/logits": 0.1628851443529129, "loss/reg": 0.0, "step": 57150 }, { "epoch": 0.37605263157894736, "grad_norm": 2.578125, "grad_norm_var": 0.48054097493489584, "learning_rate": 0.0001, "loss": 2.9344, "loss/crossentropy": 1.9888491988182069, "loss/hidden": 2.8671875, "loss/incoh": 0.0, "loss/logits": 0.25944017320871354, "loss/reg": 0.0, "step": 57160 }, { "epoch": 0.3761184210526316, "grad_norm": 2.546875, "grad_norm_var": 0.4474843343098958, "learning_rate": 0.0001, "loss": 2.9104, "loss/crossentropy": 2.2360977828502655, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.22406233549118043, "loss/reg": 0.0, "step": 57170 }, { "epoch": 0.3761842105263158, "grad_norm": 2.5, "grad_norm_var": 0.028434244791666667, "learning_rate": 0.0001, "loss": 2.9297, "loss/crossentropy": 2.41176176071167, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.2403358370065689, "loss/reg": 0.0, "step": 57180 }, { "epoch": 0.37625, "grad_norm": 2.359375, "grad_norm_var": 0.015543619791666666, "learning_rate": 0.0001, "loss": 2.8922, "loss/crossentropy": 2.2443907022476197, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.19119202494621276, "loss/reg": 0.0, "step": 57190 }, { "epoch": 0.3763157894736842, "grad_norm": 2.359375, "grad_norm_var": 0.057845052083333334, "learning_rate": 0.0001, "loss": 2.8281, "loss/crossentropy": 2.235058605670929, "loss/hidden": 2.5546875, "loss/incoh": 0.0, "loss/logits": 0.18877127021551132, "loss/reg": 0.0, "step": 57200 }, { "epoch": 0.3763815789473684, "grad_norm": 2.3125, "grad_norm_var": 0.0247955322265625, "learning_rate": 0.0001, "loss": 2.8273, "loss/crossentropy": 2.318722116947174, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.2108631983399391, "loss/reg": 0.0, "step": 57210 }, { "epoch": 0.37644736842105264, "grad_norm": 2.75, "grad_norm_var": 0.0709381103515625, "learning_rate": 0.0001, "loss": 2.9291, "loss/crossentropy": 2.522398519515991, "loss/hidden": 2.51875, "loss/incoh": 0.0, "loss/logits": 0.20184292793273925, "loss/reg": 0.0, "step": 57220 }, { "epoch": 0.3765131578947368, "grad_norm": 4.4375, "grad_norm_var": 0.3211090087890625, "learning_rate": 0.0001, "loss": 2.851, "loss/crossentropy": 2.2108476877212526, "loss/hidden": 2.5578125, "loss/incoh": 0.0, "loss/logits": 0.17876118719577788, "loss/reg": 0.0, "step": 57230 }, { "epoch": 0.37657894736842107, "grad_norm": 2.390625, "grad_norm_var": 0.2965779622395833, "learning_rate": 0.0001, "loss": 2.9011, "loss/crossentropy": 2.283047842979431, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.1862645700573921, "loss/reg": 0.0, "step": 57240 }, { "epoch": 0.37664473684210525, "grad_norm": 2.34375, "grad_norm_var": 0.15746434529622397, "learning_rate": 0.0001, "loss": 2.9275, "loss/crossentropy": 2.386126530170441, "loss/hidden": 2.6234375, "loss/incoh": 0.0, "loss/logits": 0.20931785851716994, "loss/reg": 0.0, "step": 57250 }, { "epoch": 0.3767105263157895, "grad_norm": 2.21875, "grad_norm_var": 0.0717302958170573, "learning_rate": 0.0001, "loss": 2.8171, "loss/crossentropy": 2.2000638365745546, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.18800537139177323, "loss/reg": 0.0, "step": 57260 }, { "epoch": 0.3767763157894737, "grad_norm": 2.515625, "grad_norm_var": 2.9275156730119456e+17, "learning_rate": 0.0001, "loss": 3.039, "loss/crossentropy": 2.5164933800697327, "loss/hidden": 2.6328125, "loss/incoh": 0.0, "loss/logits": 0.21517115235328674, "loss/reg": 0.0, "step": 57270 }, { "epoch": 0.37684210526315787, "grad_norm": 2.4375, "grad_norm_var": 4.759178894490206e+17, "learning_rate": 0.0001, "loss": 2.9595, "loss/crossentropy": 2.3897667050361635, "loss/hidden": 2.571875, "loss/incoh": 0.0, "loss/logits": 0.19530030936002732, "loss/reg": 0.0, "step": 57280 }, { "epoch": 0.3769078947368421, "grad_norm": 2.015625, "grad_norm_var": 2.167533237083898e+17, "learning_rate": 0.0001, "loss": 2.7938, "loss/crossentropy": 2.441529381275177, "loss/hidden": 2.525, "loss/incoh": 0.0, "loss/logits": 0.18321114182472228, "loss/reg": 0.0, "step": 57290 }, { "epoch": 0.3769736842105263, "grad_norm": 2.0, "grad_norm_var": 0.05950113932291667, "learning_rate": 0.0001, "loss": 2.84, "loss/crossentropy": 2.3631966710090637, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.21391219049692153, "loss/reg": 0.0, "step": 57300 }, { "epoch": 0.37703947368421054, "grad_norm": 2.25, "grad_norm_var": 0.07882486979166667, "learning_rate": 0.0001, "loss": 2.8842, "loss/crossentropy": 2.1668064475059508, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.19785012155771256, "loss/reg": 0.0, "step": 57310 }, { "epoch": 0.3771052631578947, "grad_norm": 2.375, "grad_norm_var": 0.09732666015625, "learning_rate": 0.0001, "loss": 2.9187, "loss/crossentropy": 2.2404924869537353, "loss/hidden": 2.6171875, "loss/incoh": 0.0, "loss/logits": 0.18843243569135665, "loss/reg": 0.0, "step": 57320 }, { "epoch": 0.37717105263157896, "grad_norm": 2.1875, "grad_norm_var": 0.10520426432291667, "learning_rate": 0.0001, "loss": 2.8538, "loss/crossentropy": 2.079696607589722, "loss/hidden": 2.571875, "loss/incoh": 0.0, "loss/logits": 0.19012553617358208, "loss/reg": 0.0, "step": 57330 }, { "epoch": 0.37723684210526315, "grad_norm": 2.25, "grad_norm_var": 0.0299468994140625, "learning_rate": 0.0001, "loss": 2.799, "loss/crossentropy": 2.13475821018219, "loss/hidden": 2.4375, "loss/incoh": 0.0, "loss/logits": 0.17336471229791642, "loss/reg": 0.0, "step": 57340 }, { "epoch": 0.3773026315789474, "grad_norm": 2.828125, "grad_norm_var": 1.2083730061848958, "learning_rate": 0.0001, "loss": 3.0044, "loss/crossentropy": 2.080480378866196, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.2245394505560398, "loss/reg": 0.0, "step": 57350 }, { "epoch": 0.3773684210526316, "grad_norm": 2.421875, "grad_norm_var": 1.1842519124348958, "learning_rate": 0.0001, "loss": 2.946, "loss/crossentropy": 2.0799950659275055, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.21235487312078477, "loss/reg": 0.0, "step": 57360 }, { "epoch": 0.3774342105263158, "grad_norm": 2.34375, "grad_norm_var": 0.048388671875, "learning_rate": 0.0001, "loss": 2.8575, "loss/crossentropy": 2.303094971179962, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.2594939172267914, "loss/reg": 0.0, "step": 57370 }, { "epoch": 0.3775, "grad_norm": 2.46875, "grad_norm_var": 0.058756510416666664, "learning_rate": 0.0001, "loss": 2.9389, "loss/crossentropy": 2.3353827238082885, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.19574832245707513, "loss/reg": 0.0, "step": 57380 }, { "epoch": 0.3775657894736842, "grad_norm": 2.390625, "grad_norm_var": 0.09351298014322916, "learning_rate": 0.0001, "loss": 2.9101, "loss/crossentropy": 2.2315954208374023, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.20711566060781478, "loss/reg": 0.0, "step": 57390 }, { "epoch": 0.37763157894736843, "grad_norm": 2.65625, "grad_norm_var": 0.03577473958333333, "learning_rate": 0.0001, "loss": 2.8696, "loss/crossentropy": 2.3095576882362367, "loss/hidden": 2.628125, "loss/incoh": 0.0, "loss/logits": 0.19530697241425515, "loss/reg": 0.0, "step": 57400 }, { "epoch": 0.3776973684210526, "grad_norm": 2.5, "grad_norm_var": 0.07294514973958334, "learning_rate": 0.0001, "loss": 2.882, "loss/crossentropy": 1.9188507854938508, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.2042997807264328, "loss/reg": 0.0, "step": 57410 }, { "epoch": 0.37776315789473686, "grad_norm": 2.015625, "grad_norm_var": 0.04163004557291667, "learning_rate": 0.0001, "loss": 2.8747, "loss/crossentropy": 2.4532555460929872, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.18929362297058105, "loss/reg": 0.0, "step": 57420 }, { "epoch": 0.37782894736842104, "grad_norm": 2.34375, "grad_norm_var": 0.05136311848958333, "learning_rate": 0.0001, "loss": 2.8681, "loss/crossentropy": 2.4132068157196045, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.20341704934835433, "loss/reg": 0.0, "step": 57430 }, { "epoch": 0.3778947368421053, "grad_norm": 2.078125, "grad_norm_var": 0.025569661458333334, "learning_rate": 0.0001, "loss": 2.8572, "loss/crossentropy": 2.3047834038734436, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.18806078433990478, "loss/reg": 0.0, "step": 57440 }, { "epoch": 0.37796052631578947, "grad_norm": 2.15625, "grad_norm_var": 0.14114176432291667, "learning_rate": 0.0001, "loss": 2.9296, "loss/crossentropy": 2.1029324531555176, "loss/hidden": 2.65, "loss/incoh": 0.0, "loss/logits": 0.2052873395383358, "loss/reg": 0.0, "step": 57450 }, { "epoch": 0.3780263157894737, "grad_norm": 2.359375, "grad_norm_var": 0.4616607666015625, "learning_rate": 0.0001, "loss": 2.9058, "loss/crossentropy": 2.062540650367737, "loss/hidden": 2.578125, "loss/incoh": 0.0, "loss/logits": 0.177221193164587, "loss/reg": 0.0, "step": 57460 }, { "epoch": 0.3780921052631579, "grad_norm": 2.4375, "grad_norm_var": 0.4149403889973958, "learning_rate": 0.0001, "loss": 2.8936, "loss/crossentropy": 2.451702928543091, "loss/hidden": 2.6890625, "loss/incoh": 0.0, "loss/logits": 0.24022672772407533, "loss/reg": 0.0, "step": 57470 }, { "epoch": 0.3781578947368421, "grad_norm": 2.109375, "grad_norm_var": 0.0321685791015625, "learning_rate": 0.0001, "loss": 2.818, "loss/crossentropy": 2.071946942806244, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.21061123609542848, "loss/reg": 0.0, "step": 57480 }, { "epoch": 0.3782236842105263, "grad_norm": 2.25, "grad_norm_var": 0.1939849853515625, "learning_rate": 0.0001, "loss": 2.9539, "loss/crossentropy": 2.1918242454528807, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.2054192081093788, "loss/reg": 0.0, "step": 57490 }, { "epoch": 0.3782894736842105, "grad_norm": 4.09375, "grad_norm_var": 0.3421223958333333, "learning_rate": 0.0001, "loss": 2.8489, "loss/crossentropy": 2.056198441982269, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.20800079703330993, "loss/reg": 0.0, "step": 57500 }, { "epoch": 0.37835526315789475, "grad_norm": 2.625, "grad_norm_var": 0.2995887756347656, "learning_rate": 0.0001, "loss": 2.954, "loss/crossentropy": 2.3867305636405947, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.21539995074272156, "loss/reg": 0.0, "step": 57510 }, { "epoch": 0.37842105263157894, "grad_norm": 2.3125, "grad_norm_var": 0.13209635416666668, "learning_rate": 0.0001, "loss": 2.8844, "loss/crossentropy": 2.180548870563507, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.2677584797143936, "loss/reg": 0.0, "step": 57520 }, { "epoch": 0.3784868421052632, "grad_norm": 2.28125, "grad_norm_var": 0.04590555826822917, "learning_rate": 0.0001, "loss": 2.8649, "loss/crossentropy": 2.3179750084877013, "loss/hidden": 2.5390625, "loss/incoh": 0.0, "loss/logits": 0.18993980586528778, "loss/reg": 0.0, "step": 57530 }, { "epoch": 0.37855263157894736, "grad_norm": 2.5625, "grad_norm_var": 0.04830729166666667, "learning_rate": 0.0001, "loss": 2.8594, "loss/crossentropy": 2.32996461391449, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.20761759281158448, "loss/reg": 0.0, "step": 57540 }, { "epoch": 0.3786184210526316, "grad_norm": 2.59375, "grad_norm_var": 0.03619384765625, "learning_rate": 0.0001, "loss": 2.9444, "loss/crossentropy": 2.498534631729126, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.22811719924211502, "loss/reg": 0.0, "step": 57550 }, { "epoch": 0.3786842105263158, "grad_norm": 2.671875, "grad_norm_var": 0.08181050618489584, "learning_rate": 0.0001, "loss": 2.8665, "loss/crossentropy": 2.068584406375885, "loss/hidden": 2.8515625, "loss/incoh": 0.0, "loss/logits": 0.20263911336660384, "loss/reg": 0.0, "step": 57560 }, { "epoch": 0.37875, "grad_norm": 2.953125, "grad_norm_var": 0.11533915201822917, "learning_rate": 0.0001, "loss": 2.897, "loss/crossentropy": 1.9728823184967041, "loss/hidden": 2.540625, "loss/incoh": 0.0, "loss/logits": 0.1655775723978877, "loss/reg": 0.0, "step": 57570 }, { "epoch": 0.3788157894736842, "grad_norm": 2.1875, "grad_norm_var": 0.09462483723958333, "learning_rate": 0.0001, "loss": 2.8009, "loss/crossentropy": 1.9365624606609344, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.1690821133553982, "loss/reg": 0.0, "step": 57580 }, { "epoch": 0.3788815789473684, "grad_norm": 2.65625, "grad_norm_var": 0.034601847330729164, "learning_rate": 0.0001, "loss": 2.8581, "loss/crossentropy": 2.3033923745155334, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.20101662129163742, "loss/reg": 0.0, "step": 57590 }, { "epoch": 0.37894736842105264, "grad_norm": 2.578125, "grad_norm_var": 0.03264058430989583, "learning_rate": 0.0001, "loss": 2.9116, "loss/crossentropy": 2.4054296016693115, "loss/hidden": 2.49375, "loss/incoh": 0.0, "loss/logits": 0.19278606325387954, "loss/reg": 0.0, "step": 57600 }, { "epoch": 0.37901315789473683, "grad_norm": 2.359375, "grad_norm_var": 0.042740885416666666, "learning_rate": 0.0001, "loss": 2.8671, "loss/crossentropy": 2.4763006806373595, "loss/hidden": 2.9421875, "loss/incoh": 0.0, "loss/logits": 0.22158413529396057, "loss/reg": 0.0, "step": 57610 }, { "epoch": 0.37907894736842107, "grad_norm": 2.390625, "grad_norm_var": 0.11295166015625, "learning_rate": 0.0001, "loss": 2.8943, "loss/crossentropy": 2.4030768275260925, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.20983522832393647, "loss/reg": 0.0, "step": 57620 }, { "epoch": 0.37914473684210526, "grad_norm": 2.46875, "grad_norm_var": 0.09550374348958333, "learning_rate": 0.0001, "loss": 2.8622, "loss/crossentropy": 2.3537391662597655, "loss/hidden": 2.675, "loss/incoh": 0.0, "loss/logits": 0.2376330778002739, "loss/reg": 0.0, "step": 57630 }, { "epoch": 0.3792105263157895, "grad_norm": 2.46875, "grad_norm_var": 0.13189697265625, "learning_rate": 0.0001, "loss": 2.8731, "loss/crossentropy": 2.3712357878684998, "loss/hidden": 2.51875, "loss/incoh": 0.0, "loss/logits": 0.1842004433274269, "loss/reg": 0.0, "step": 57640 }, { "epoch": 0.3792763157894737, "grad_norm": 2.1875, "grad_norm_var": 0.07590738932291667, "learning_rate": 0.0001, "loss": 2.8505, "loss/crossentropy": 2.0784621477127074, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.18896175622940065, "loss/reg": 0.0, "step": 57650 }, { "epoch": 0.37934210526315787, "grad_norm": 2.34375, "grad_norm_var": 0.066357421875, "learning_rate": 0.0001, "loss": 2.8606, "loss/crossentropy": 2.0798816442489625, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.21382400915026664, "loss/reg": 0.0, "step": 57660 }, { "epoch": 0.3794078947368421, "grad_norm": 2.40625, "grad_norm_var": 0.046629842122395834, "learning_rate": 0.0001, "loss": 2.8963, "loss/crossentropy": 2.044407057762146, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.20169557482004166, "loss/reg": 0.0, "step": 57670 }, { "epoch": 0.3794736842105263, "grad_norm": 2.40625, "grad_norm_var": 0.019090779622395835, "learning_rate": 0.0001, "loss": 2.9133, "loss/crossentropy": 2.575877034664154, "loss/hidden": 2.575, "loss/incoh": 0.0, "loss/logits": 0.19569746255874634, "loss/reg": 0.0, "step": 57680 }, { "epoch": 0.37953947368421054, "grad_norm": 2.421875, "grad_norm_var": 0.02486572265625, "learning_rate": 0.0001, "loss": 2.8645, "loss/crossentropy": 2.404980421066284, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.22096336930990218, "loss/reg": 0.0, "step": 57690 }, { "epoch": 0.3796052631578947, "grad_norm": 2.28125, "grad_norm_var": 0.026838175455729165, "learning_rate": 0.0001, "loss": 2.8836, "loss/crossentropy": 2.317495030164719, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.17882687449455262, "loss/reg": 0.0, "step": 57700 }, { "epoch": 0.37967105263157896, "grad_norm": 2.875, "grad_norm_var": 0.06211649576822917, "learning_rate": 0.0001, "loss": 2.8999, "loss/crossentropy": 2.197268545627594, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.18828486725687982, "loss/reg": 0.0, "step": 57710 }, { "epoch": 0.37973684210526315, "grad_norm": 2.34375, "grad_norm_var": 0.21620686848958334, "learning_rate": 0.0001, "loss": 2.894, "loss/crossentropy": 2.267225503921509, "loss/hidden": 2.6296875, "loss/incoh": 0.0, "loss/logits": 0.2046874061226845, "loss/reg": 0.0, "step": 57720 }, { "epoch": 0.3798026315789474, "grad_norm": 2.328125, "grad_norm_var": 0.0230377197265625, "learning_rate": 0.0001, "loss": 2.8303, "loss/crossentropy": 2.546211838722229, "loss/hidden": 2.5625, "loss/incoh": 0.0, "loss/logits": 0.20918542593717576, "loss/reg": 0.0, "step": 57730 }, { "epoch": 0.3798684210526316, "grad_norm": 2.21875, "grad_norm_var": 0.04964090983072917, "learning_rate": 0.0001, "loss": 2.8625, "loss/crossentropy": 2.2063751935958864, "loss/hidden": 2.809375, "loss/incoh": 0.0, "loss/logits": 0.24073296040296555, "loss/reg": 0.0, "step": 57740 }, { "epoch": 0.37993421052631576, "grad_norm": 2.5625, "grad_norm_var": 0.12055562337239584, "learning_rate": 0.0001, "loss": 2.9325, "loss/crossentropy": 2.312474453449249, "loss/hidden": 2.7734375, "loss/incoh": 0.0, "loss/logits": 0.22882100194692612, "loss/reg": 0.0, "step": 57750 }, { "epoch": 0.38, "grad_norm": 2.375, "grad_norm_var": 0.07242431640625, "learning_rate": 0.0001, "loss": 2.8585, "loss/crossentropy": 2.1653325974941255, "loss/hidden": 2.484375, "loss/incoh": 0.0, "loss/logits": 0.18195100203156472, "loss/reg": 0.0, "step": 57760 }, { "epoch": 0.3800657894736842, "grad_norm": 2.234375, "grad_norm_var": 0.20406901041666667, "learning_rate": 0.0001, "loss": 2.9219, "loss/crossentropy": 2.032618153095245, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.1925370492041111, "loss/reg": 0.0, "step": 57770 }, { "epoch": 0.38013157894736843, "grad_norm": 2.859375, "grad_norm_var": 0.41979878743489585, "learning_rate": 0.0001, "loss": 2.8755, "loss/crossentropy": 2.3025426268577576, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.21944860816001893, "loss/reg": 0.0, "step": 57780 }, { "epoch": 0.3801973684210526, "grad_norm": 2.375, "grad_norm_var": 0.25970052083333334, "learning_rate": 0.0001, "loss": 2.8806, "loss/crossentropy": 2.076842927932739, "loss/hidden": 2.6, "loss/incoh": 0.0, "loss/logits": 0.18683969974517822, "loss/reg": 0.0, "step": 57790 }, { "epoch": 0.38026315789473686, "grad_norm": 2.578125, "grad_norm_var": 0.1130859375, "learning_rate": 0.0001, "loss": 2.895, "loss/crossentropy": 2.222626829147339, "loss/hidden": 2.83125, "loss/incoh": 0.0, "loss/logits": 0.2189730480313301, "loss/reg": 0.0, "step": 57800 }, { "epoch": 0.38032894736842104, "grad_norm": 2.3125, "grad_norm_var": 0.10714518229166667, "learning_rate": 0.0001, "loss": 2.8942, "loss/crossentropy": 2.3418797612190247, "loss/hidden": 2.59375, "loss/incoh": 0.0, "loss/logits": 0.20229362696409225, "loss/reg": 0.0, "step": 57810 }, { "epoch": 0.3803947368421053, "grad_norm": 2.390625, "grad_norm_var": 0.046773274739583336, "learning_rate": 0.0001, "loss": 2.851, "loss/crossentropy": 2.4133851170539855, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.21942623555660248, "loss/reg": 0.0, "step": 57820 }, { "epoch": 0.38046052631578947, "grad_norm": 2.5, "grad_norm_var": 0.0122467041015625, "learning_rate": 0.0001, "loss": 2.809, "loss/crossentropy": 2.042433965206146, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.19593103379011154, "loss/reg": 0.0, "step": 57830 }, { "epoch": 0.38052631578947366, "grad_norm": 2.875, "grad_norm_var": 0.050780232747395834, "learning_rate": 0.0001, "loss": 2.9052, "loss/crossentropy": 2.251876473426819, "loss/hidden": 2.634375, "loss/incoh": 0.0, "loss/logits": 0.22136868089437484, "loss/reg": 0.0, "step": 57840 }, { "epoch": 0.3805921052631579, "grad_norm": 2.6875, "grad_norm_var": 0.09091389973958333, "learning_rate": 0.0001, "loss": 2.8345, "loss/crossentropy": 2.5625521898269654, "loss/hidden": 2.64375, "loss/incoh": 0.0, "loss/logits": 0.222374027967453, "loss/reg": 0.0, "step": 57850 }, { "epoch": 0.3806578947368421, "grad_norm": 2.78125, "grad_norm_var": 0.060384114583333336, "learning_rate": 0.0001, "loss": 2.9788, "loss/crossentropy": 2.2164581179618836, "loss/hidden": 2.55625, "loss/incoh": 0.0, "loss/logits": 0.20478739440441132, "loss/reg": 0.0, "step": 57860 }, { "epoch": 0.3807236842105263, "grad_norm": 2.359375, "grad_norm_var": 0.0742095947265625, "learning_rate": 0.0001, "loss": 2.8895, "loss/crossentropy": 1.7111766412854195, "loss/hidden": 2.725, "loss/incoh": 0.0, "loss/logits": 0.16712991576641797, "loss/reg": 0.0, "step": 57870 }, { "epoch": 0.3807894736842105, "grad_norm": 2.53125, "grad_norm_var": 0.22838134765625, "learning_rate": 0.0001, "loss": 2.9518, "loss/crossentropy": 2.4427096605300904, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.23622211664915085, "loss/reg": 0.0, "step": 57880 }, { "epoch": 0.38085526315789475, "grad_norm": 2.46875, "grad_norm_var": 0.21974995930989583, "learning_rate": 0.0001, "loss": 2.9228, "loss/crossentropy": 2.3371235728263855, "loss/hidden": 2.821875, "loss/incoh": 0.0, "loss/logits": 0.26324093490839007, "loss/reg": 0.0, "step": 57890 }, { "epoch": 0.38092105263157894, "grad_norm": 2.84375, "grad_norm_var": 0.07156575520833333, "learning_rate": 0.0001, "loss": 2.8838, "loss/crossentropy": 2.1530874252319334, "loss/hidden": 2.784375, "loss/incoh": 0.0, "loss/logits": 0.22221154868602752, "loss/reg": 0.0, "step": 57900 }, { "epoch": 0.3809868421052632, "grad_norm": 2.71875, "grad_norm_var": 0.05943603515625, "learning_rate": 0.0001, "loss": 2.9334, "loss/crossentropy": 2.2443442702293397, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.19743588715791702, "loss/reg": 0.0, "step": 57910 }, { "epoch": 0.38105263157894737, "grad_norm": 2.09375, "grad_norm_var": 0.039281209309895836, "learning_rate": 0.0001, "loss": 2.8689, "loss/crossentropy": 1.9921360731124877, "loss/hidden": 2.79375, "loss/incoh": 0.0, "loss/logits": 0.19968579560518265, "loss/reg": 0.0, "step": 57920 }, { "epoch": 0.3811184210526316, "grad_norm": 2.890625, "grad_norm_var": 0.10187886555989584, "learning_rate": 0.0001, "loss": 2.8558, "loss/crossentropy": 1.9992644518613816, "loss/hidden": 2.5453125, "loss/incoh": 0.0, "loss/logits": 0.17999700643122196, "loss/reg": 0.0, "step": 57930 }, { "epoch": 0.3811842105263158, "grad_norm": 2.25, "grad_norm_var": 0.031183878580729168, "learning_rate": 0.0001, "loss": 2.845, "loss/crossentropy": 2.083136683702469, "loss/hidden": 2.5171875, "loss/incoh": 0.0, "loss/logits": 0.1843637578189373, "loss/reg": 0.0, "step": 57940 }, { "epoch": 0.38125, "grad_norm": 2.234375, "grad_norm_var": 0.04279683430989583, "learning_rate": 0.0001, "loss": 2.9058, "loss/crossentropy": 2.2104483246803284, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.23131919130682946, "loss/reg": 0.0, "step": 57950 }, { "epoch": 0.3813157894736842, "grad_norm": 2.5, "grad_norm_var": 0.75113525390625, "learning_rate": 0.0001, "loss": 2.8952, "loss/crossentropy": 2.5412496089935304, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.22530737221240998, "loss/reg": 0.0, "step": 57960 }, { "epoch": 0.3813815789473684, "grad_norm": 2.65625, "grad_norm_var": 0.0564117431640625, "learning_rate": 0.0001, "loss": 2.8875, "loss/crossentropy": 2.4057974338531496, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.21192230731248857, "loss/reg": 0.0, "step": 57970 }, { "epoch": 0.38144736842105265, "grad_norm": 2.9375, "grad_norm_var": 0.0709625244140625, "learning_rate": 0.0001, "loss": 2.8825, "loss/crossentropy": 2.024870753288269, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.20590716898441314, "loss/reg": 0.0, "step": 57980 }, { "epoch": 0.38151315789473683, "grad_norm": 2.21875, "grad_norm_var": 0.04708658854166667, "learning_rate": 0.0001, "loss": 2.859, "loss/crossentropy": 2.4650910019874575, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.2203577846288681, "loss/reg": 0.0, "step": 57990 }, { "epoch": 0.3815789473684211, "grad_norm": 2.15625, "grad_norm_var": 0.041825358072916666, "learning_rate": 0.0001, "loss": 2.8664, "loss/crossentropy": 2.4609553813934326, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.23237253725528717, "loss/reg": 0.0, "step": 58000 }, { "epoch": 0.38164473684210526, "grad_norm": 2.984375, "grad_norm_var": 0.090966796875, "learning_rate": 0.0001, "loss": 2.8958, "loss/crossentropy": 2.3002306699752806, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.1987905502319336, "loss/reg": 0.0, "step": 58010 }, { "epoch": 0.3817105263157895, "grad_norm": 3.75, "grad_norm_var": 0.1707672119140625, "learning_rate": 0.0001, "loss": 2.9335, "loss/crossentropy": 2.026836967468262, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.29540627002716063, "loss/reg": 0.0, "step": 58020 }, { "epoch": 0.3817763157894737, "grad_norm": 2.34375, "grad_norm_var": 0.1714019775390625, "learning_rate": 0.0001, "loss": 2.9302, "loss/crossentropy": 2.306756579875946, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.22197161763906478, "loss/reg": 0.0, "step": 58030 }, { "epoch": 0.38184210526315787, "grad_norm": 2.375, "grad_norm_var": 0.0872222900390625, "learning_rate": 0.0001, "loss": 2.8821, "loss/crossentropy": 2.2390402436256407, "loss/hidden": 2.6703125, "loss/incoh": 0.0, "loss/logits": 0.20174480974674225, "loss/reg": 0.0, "step": 58040 }, { "epoch": 0.3819078947368421, "grad_norm": 2.390625, "grad_norm_var": 0.0654693603515625, "learning_rate": 0.0001, "loss": 2.9849, "loss/crossentropy": 2.355214846134186, "loss/hidden": 2.8890625, "loss/incoh": 0.0, "loss/logits": 0.24299003928899765, "loss/reg": 0.0, "step": 58050 }, { "epoch": 0.3819736842105263, "grad_norm": 2.703125, "grad_norm_var": 0.32320048014322916, "learning_rate": 0.0001, "loss": 2.9443, "loss/crossentropy": 2.3851979255676268, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.22102509588003158, "loss/reg": 0.0, "step": 58060 }, { "epoch": 0.38203947368421054, "grad_norm": 2.78125, "grad_norm_var": 0.2820696512858073, "learning_rate": 0.0001, "loss": 2.9169, "loss/crossentropy": 2.087635672092438, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.20776358395814895, "loss/reg": 0.0, "step": 58070 }, { "epoch": 0.3821052631578947, "grad_norm": 2.734375, "grad_norm_var": 0.04014383951822917, "learning_rate": 0.0001, "loss": 2.9513, "loss/crossentropy": 2.159754127264023, "loss/hidden": 2.7921875, "loss/incoh": 0.0, "loss/logits": 0.22105398327112197, "loss/reg": 0.0, "step": 58080 }, { "epoch": 0.38217105263157897, "grad_norm": 2.25, "grad_norm_var": 0.12991536458333333, "learning_rate": 0.0001, "loss": 2.8792, "loss/crossentropy": 2.0740616798400877, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.19475589096546173, "loss/reg": 0.0, "step": 58090 }, { "epoch": 0.38223684210526315, "grad_norm": 2.296875, "grad_norm_var": 0.09526265462239583, "learning_rate": 0.0001, "loss": 2.877, "loss/crossentropy": 2.178272378444672, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.22217049449682236, "loss/reg": 0.0, "step": 58100 }, { "epoch": 0.3823026315789474, "grad_norm": 2.28125, "grad_norm_var": 3.82193603515625, "learning_rate": 0.0001, "loss": 2.9171, "loss/crossentropy": 1.9965001851320268, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.20463632680475713, "loss/reg": 0.0, "step": 58110 }, { "epoch": 0.3823684210526316, "grad_norm": 2.484375, "grad_norm_var": 0.11972630818684896, "learning_rate": 0.0001, "loss": 2.8736, "loss/crossentropy": 2.144296848773956, "loss/hidden": 2.4890625, "loss/incoh": 0.0, "loss/logits": 0.16848402991890907, "loss/reg": 0.0, "step": 58120 }, { "epoch": 0.38243421052631577, "grad_norm": 2.671875, "grad_norm_var": 0.0523101806640625, "learning_rate": 0.0001, "loss": 2.8911, "loss/crossentropy": 2.193416786193848, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.19108324497938156, "loss/reg": 0.0, "step": 58130 }, { "epoch": 0.3825, "grad_norm": 2.5, "grad_norm_var": 0.2054351806640625, "learning_rate": 0.0001, "loss": 2.9785, "loss/crossentropy": 2.2193787455558778, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.22116404995322228, "loss/reg": 0.0, "step": 58140 }, { "epoch": 0.3825657894736842, "grad_norm": 2.296875, "grad_norm_var": 0.24685872395833333, "learning_rate": 0.0001, "loss": 2.7987, "loss/crossentropy": 2.3670801162719726, "loss/hidden": 2.6375, "loss/incoh": 0.0, "loss/logits": 0.20961679816246032, "loss/reg": 0.0, "step": 58150 }, { "epoch": 0.38263157894736843, "grad_norm": 2.734375, "grad_norm_var": 0.04485270182291667, "learning_rate": 0.0001, "loss": 2.8933, "loss/crossentropy": 2.4992054104804993, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.23606279790401458, "loss/reg": 0.0, "step": 58160 }, { "epoch": 0.3826973684210526, "grad_norm": 2.328125, "grad_norm_var": 0.055907185872395834, "learning_rate": 0.0001, "loss": 2.9232, "loss/crossentropy": 2.308665955066681, "loss/hidden": 2.8078125, "loss/incoh": 0.0, "loss/logits": 0.2250552922487259, "loss/reg": 0.0, "step": 58170 }, { "epoch": 0.38276315789473686, "grad_norm": 2.359375, "grad_norm_var": 0.0717193603515625, "learning_rate": 0.0001, "loss": 2.8205, "loss/crossentropy": 2.4612384915351866, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.2357254907488823, "loss/reg": 0.0, "step": 58180 }, { "epoch": 0.38282894736842105, "grad_norm": 2.46875, "grad_norm_var": 0.0223541259765625, "learning_rate": 0.0001, "loss": 2.8975, "loss/crossentropy": 2.3063827842473983, "loss/hidden": 2.4421875, "loss/incoh": 0.0, "loss/logits": 0.18393357917666436, "loss/reg": 0.0, "step": 58190 }, { "epoch": 0.3828947368421053, "grad_norm": 2.296875, "grad_norm_var": 0.03714192708333333, "learning_rate": 0.0001, "loss": 2.8679, "loss/crossentropy": 2.03402339220047, "loss/hidden": 2.653125, "loss/incoh": 0.0, "loss/logits": 0.18462874591350556, "loss/reg": 0.0, "step": 58200 }, { "epoch": 0.3829605263157895, "grad_norm": 2.703125, "grad_norm_var": 0.05965143839518229, "learning_rate": 0.0001, "loss": 2.875, "loss/crossentropy": 2.252269148826599, "loss/hidden": 2.5734375, "loss/incoh": 0.0, "loss/logits": 0.2202517218887806, "loss/reg": 0.0, "step": 58210 }, { "epoch": 0.38302631578947366, "grad_norm": 2.5625, "grad_norm_var": 0.06636962890625, "learning_rate": 0.0001, "loss": 2.9073, "loss/crossentropy": 2.3402514696121215, "loss/hidden": 2.7421875, "loss/incoh": 0.0, "loss/logits": 0.2251133218407631, "loss/reg": 0.0, "step": 58220 }, { "epoch": 0.3830921052631579, "grad_norm": 2.875, "grad_norm_var": 0.7291412353515625, "learning_rate": 0.0001, "loss": 2.9893, "loss/crossentropy": 2.404543232917786, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.23309534639120102, "loss/reg": 0.0, "step": 58230 }, { "epoch": 0.3831578947368421, "grad_norm": 2.328125, "grad_norm_var": 0.7497711181640625, "learning_rate": 0.0001, "loss": 2.9559, "loss/crossentropy": 2.4040826320648194, "loss/hidden": 2.559375, "loss/incoh": 0.0, "loss/logits": 0.2738741263747215, "loss/reg": 0.0, "step": 58240 }, { "epoch": 0.3832236842105263, "grad_norm": 2.3125, "grad_norm_var": 0.18762105305989582, "learning_rate": 0.0001, "loss": 2.9484, "loss/crossentropy": 2.4165258049964904, "loss/hidden": 2.5921875, "loss/incoh": 0.0, "loss/logits": 0.20454092621803283, "loss/reg": 0.0, "step": 58250 }, { "epoch": 0.3832894736842105, "grad_norm": 2.546875, "grad_norm_var": 0.1614654541015625, "learning_rate": 0.0001, "loss": 2.9695, "loss/crossentropy": 2.36160386800766, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.23894393891096116, "loss/reg": 0.0, "step": 58260 }, { "epoch": 0.38335526315789475, "grad_norm": 2.328125, "grad_norm_var": 0.04830322265625, "learning_rate": 0.0001, "loss": 2.908, "loss/crossentropy": 2.401131296157837, "loss/hidden": 2.7640625, "loss/incoh": 0.0, "loss/logits": 0.2368732661008835, "loss/reg": 0.0, "step": 58270 }, { "epoch": 0.38342105263157894, "grad_norm": 3.265625, "grad_norm_var": 0.056761678059895834, "learning_rate": 0.0001, "loss": 2.9578, "loss/crossentropy": 2.401544618606567, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.37941230535507203, "loss/reg": 0.0, "step": 58280 }, { "epoch": 0.3834868421052632, "grad_norm": 2.484375, "grad_norm_var": 0.10248921712239584, "learning_rate": 0.0001, "loss": 2.861, "loss/crossentropy": 2.1442492604255676, "loss/hidden": 2.628125, "loss/incoh": 0.0, "loss/logits": 0.19750737696886062, "loss/reg": 0.0, "step": 58290 }, { "epoch": 0.38355263157894737, "grad_norm": 5.78125, "grad_norm_var": 0.8044108072916667, "learning_rate": 0.0001, "loss": 2.8903, "loss/crossentropy": 2.096047353744507, "loss/hidden": 2.7078125, "loss/incoh": 0.0, "loss/logits": 0.20191704258322715, "loss/reg": 0.0, "step": 58300 }, { "epoch": 0.38361842105263155, "grad_norm": 2.5, "grad_norm_var": 0.7894683837890625, "learning_rate": 0.0001, "loss": 2.9103, "loss/crossentropy": 2.1300374388694765, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.2378644600510597, "loss/reg": 0.0, "step": 58310 }, { "epoch": 0.3836842105263158, "grad_norm": 2.296875, "grad_norm_var": 0.022118123372395833, "learning_rate": 0.0001, "loss": 2.9222, "loss/crossentropy": 2.250592660903931, "loss/hidden": 2.5109375, "loss/incoh": 0.0, "loss/logits": 0.2027652733027935, "loss/reg": 0.0, "step": 58320 }, { "epoch": 0.38375, "grad_norm": 2.109375, "grad_norm_var": 0.08774312337239583, "learning_rate": 0.0001, "loss": 3.0183, "loss/crossentropy": 2.439738130569458, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.19519764482975005, "loss/reg": 0.0, "step": 58330 }, { "epoch": 0.3838157894736842, "grad_norm": 2.25, "grad_norm_var": 0.1132720947265625, "learning_rate": 0.0001, "loss": 2.9215, "loss/crossentropy": 2.3557589769363405, "loss/hidden": 2.68125, "loss/incoh": 0.0, "loss/logits": 0.2477085769176483, "loss/reg": 0.0, "step": 58340 }, { "epoch": 0.3838815789473684, "grad_norm": 2.453125, "grad_norm_var": 0.08296610514322916, "learning_rate": 0.0001, "loss": 2.9565, "loss/crossentropy": 2.45206618309021, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.2215176910161972, "loss/reg": 0.0, "step": 58350 }, { "epoch": 0.38394736842105265, "grad_norm": 2.875, "grad_norm_var": 0.38427327473958334, "learning_rate": 0.0001, "loss": 2.9186, "loss/crossentropy": 2.326324129104614, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.25265040099620817, "loss/reg": 0.0, "step": 58360 }, { "epoch": 0.38401315789473683, "grad_norm": 2.234375, "grad_norm_var": 0.07880452473958334, "learning_rate": 0.0001, "loss": 2.8595, "loss/crossentropy": 2.4040786385536195, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.21687066107988356, "loss/reg": 0.0, "step": 58370 }, { "epoch": 0.3840789473684211, "grad_norm": 2.515625, "grad_norm_var": 232.89875386555988, "learning_rate": 0.0001, "loss": 2.8862, "loss/crossentropy": 2.1579699397087095, "loss/hidden": 2.596875, "loss/incoh": 0.0, "loss/logits": 0.18747521117329596, "loss/reg": 0.0, "step": 58380 }, { "epoch": 0.38414473684210526, "grad_norm": 2.359375, "grad_norm_var": 0.05048828125, "learning_rate": 0.0001, "loss": 2.8952, "loss/crossentropy": 2.6557264804840086, "loss/hidden": 2.65625, "loss/incoh": 0.0, "loss/logits": 0.21660328060388565, "loss/reg": 0.0, "step": 58390 }, { "epoch": 0.38421052631578945, "grad_norm": 2.75, "grad_norm_var": 0.040999348958333334, "learning_rate": 0.0001, "loss": 2.9289, "loss/crossentropy": 1.9632918775081634, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.21173428148031234, "loss/reg": 0.0, "step": 58400 }, { "epoch": 0.3842763157894737, "grad_norm": 2.734375, "grad_norm_var": 0.021971638997395834, "learning_rate": 0.0001, "loss": 2.8866, "loss/crossentropy": 2.3267595529556275, "loss/hidden": 2.584375, "loss/incoh": 0.0, "loss/logits": 0.23012721687555313, "loss/reg": 0.0, "step": 58410 }, { "epoch": 0.3843421052631579, "grad_norm": 2.390625, "grad_norm_var": 0.06096598307291667, "learning_rate": 0.0001, "loss": 2.9345, "loss/crossentropy": 2.256697487831116, "loss/hidden": 2.8484375, "loss/incoh": 0.0, "loss/logits": 0.2545373126864433, "loss/reg": 0.0, "step": 58420 }, { "epoch": 0.3844078947368421, "grad_norm": 2.390625, "grad_norm_var": 0.053238932291666666, "learning_rate": 0.0001, "loss": 2.8708, "loss/crossentropy": 2.507232594490051, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.21135186403989792, "loss/reg": 0.0, "step": 58430 }, { "epoch": 0.3844736842105263, "grad_norm": 2.671875, "grad_norm_var": 0.04143473307291667, "learning_rate": 0.0001, "loss": 2.9727, "loss/crossentropy": 2.245376408100128, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.22731242179870606, "loss/reg": 0.0, "step": 58440 }, { "epoch": 0.38453947368421054, "grad_norm": 2.640625, "grad_norm_var": 0.04628804524739583, "learning_rate": 0.0001, "loss": 2.9369, "loss/crossentropy": 2.3389252066612243, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.23823900818824767, "loss/reg": 0.0, "step": 58450 }, { "epoch": 0.38460526315789473, "grad_norm": 2.421875, "grad_norm_var": 0.042236328125, "learning_rate": 0.0001, "loss": 2.9509, "loss/crossentropy": 2.292842137813568, "loss/hidden": 2.8875, "loss/incoh": 0.0, "loss/logits": 0.25937593206763265, "loss/reg": 0.0, "step": 58460 }, { "epoch": 0.38467105263157897, "grad_norm": 2.265625, "grad_norm_var": 1.1588175455729166, "learning_rate": 0.0001, "loss": 3.0, "loss/crossentropy": 2.3534352779388428, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.33031875640153885, "loss/reg": 0.0, "step": 58470 }, { "epoch": 0.38473684210526315, "grad_norm": 2.34375, "grad_norm_var": 0.13373921712239584, "learning_rate": 0.0001, "loss": 2.9573, "loss/crossentropy": 2.1403919458389282, "loss/hidden": 2.5953125, "loss/incoh": 0.0, "loss/logits": 0.18915559351444244, "loss/reg": 0.0, "step": 58480 }, { "epoch": 0.3848026315789474, "grad_norm": 2.65625, "grad_norm_var": 0.1492828369140625, "learning_rate": 0.0001, "loss": 2.8801, "loss/crossentropy": 2.284643459320068, "loss/hidden": 2.5703125, "loss/incoh": 0.0, "loss/logits": 0.1817684531211853, "loss/reg": 0.0, "step": 58490 }, { "epoch": 0.3848684210526316, "grad_norm": 2.859375, "grad_norm_var": 0.09656575520833334, "learning_rate": 0.0001, "loss": 2.9747, "loss/crossentropy": 2.430289316177368, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.25551571547985075, "loss/reg": 0.0, "step": 58500 }, { "epoch": 0.38493421052631577, "grad_norm": 2.546875, "grad_norm_var": 0.14075520833333333, "learning_rate": 0.0001, "loss": 2.9487, "loss/crossentropy": 2.143838608264923, "loss/hidden": 3.00625, "loss/incoh": 0.0, "loss/logits": 0.22567144334316253, "loss/reg": 0.0, "step": 58510 }, { "epoch": 0.385, "grad_norm": 2.46875, "grad_norm_var": 0.05930887858072917, "learning_rate": 0.0001, "loss": 2.9254, "loss/crossentropy": 2.3157738566398622, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.24194117337465287, "loss/reg": 0.0, "step": 58520 }, { "epoch": 0.3850657894736842, "grad_norm": 2.984375, "grad_norm_var": 0.07224019368489583, "learning_rate": 0.0001, "loss": 2.9161, "loss/crossentropy": 1.8319615840911865, "loss/hidden": 2.703125, "loss/incoh": 0.0, "loss/logits": 0.19698834419250488, "loss/reg": 0.0, "step": 58530 }, { "epoch": 0.38513157894736844, "grad_norm": 2.71875, "grad_norm_var": 0.115283203125, "learning_rate": 0.0001, "loss": 2.8952, "loss/crossentropy": 2.298383128643036, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.2198283404111862, "loss/reg": 0.0, "step": 58540 }, { "epoch": 0.3851973684210526, "grad_norm": 2.265625, "grad_norm_var": 0.14230855305989584, "learning_rate": 0.0001, "loss": 2.8917, "loss/crossentropy": 2.039904797077179, "loss/hidden": 2.7234375, "loss/incoh": 0.0, "loss/logits": 0.2067241370677948, "loss/reg": 0.0, "step": 58550 }, { "epoch": 0.38526315789473686, "grad_norm": 3.015625, "grad_norm_var": 0.15146484375, "learning_rate": 0.0001, "loss": 2.941, "loss/crossentropy": 2.241132044792175, "loss/hidden": 2.7453125, "loss/incoh": 0.0, "loss/logits": 0.22759715616703033, "loss/reg": 0.0, "step": 58560 }, { "epoch": 0.38532894736842105, "grad_norm": 2.265625, "grad_norm_var": 0.10639546712239584, "learning_rate": 0.0001, "loss": 2.9257, "loss/crossentropy": 2.673251724243164, "loss/hidden": 2.8953125, "loss/incoh": 0.0, "loss/logits": 0.2928698107600212, "loss/reg": 0.0, "step": 58570 }, { "epoch": 0.3853947368421053, "grad_norm": 2.484375, "grad_norm_var": 0.034501139322916666, "learning_rate": 0.0001, "loss": 2.8528, "loss/crossentropy": 2.272460198402405, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.22952667772769927, "loss/reg": 0.0, "step": 58580 }, { "epoch": 0.3854605263157895, "grad_norm": 2.21875, "grad_norm_var": 0.028385416666666666, "learning_rate": 0.0001, "loss": 2.9165, "loss/crossentropy": 2.099717354774475, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.18448562249541284, "loss/reg": 0.0, "step": 58590 }, { "epoch": 0.38552631578947366, "grad_norm": 2.640625, "grad_norm_var": 0.019904581705729167, "learning_rate": 0.0001, "loss": 2.9313, "loss/crossentropy": 2.356782591342926, "loss/hidden": 2.7859375, "loss/incoh": 0.0, "loss/logits": 0.22100431472063065, "loss/reg": 0.0, "step": 58600 }, { "epoch": 0.3855921052631579, "grad_norm": 2.53125, "grad_norm_var": 0.28885091145833336, "learning_rate": 0.0001, "loss": 2.8766, "loss/crossentropy": 2.3523670077323913, "loss/hidden": 2.66875, "loss/incoh": 0.0, "loss/logits": 0.22611600458621978, "loss/reg": 0.0, "step": 58610 }, { "epoch": 0.3856578947368421, "grad_norm": 2.53125, "grad_norm_var": 0.18449605305989583, "learning_rate": 0.0001, "loss": 2.9938, "loss/crossentropy": 2.435256004333496, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.21665037870407106, "loss/reg": 0.0, "step": 58620 }, { "epoch": 0.38572368421052633, "grad_norm": 3.03125, "grad_norm_var": 0.14631245930989584, "learning_rate": 0.0001, "loss": 2.9244, "loss/crossentropy": 2.1092891573905943, "loss/hidden": 2.7515625, "loss/incoh": 0.0, "loss/logits": 0.20635376796126365, "loss/reg": 0.0, "step": 58630 }, { "epoch": 0.3857894736842105, "grad_norm": 2.0625, "grad_norm_var": 0.07097981770833334, "learning_rate": 0.0001, "loss": 2.8897, "loss/crossentropy": 2.0304438918828964, "loss/hidden": 2.7953125, "loss/incoh": 0.0, "loss/logits": 0.22421427853405476, "loss/reg": 0.0, "step": 58640 }, { "epoch": 0.38585526315789476, "grad_norm": 2.234375, "grad_norm_var": 0.07132059733072917, "learning_rate": 0.0001, "loss": 2.9166, "loss/crossentropy": 2.552103614807129, "loss/hidden": 2.728125, "loss/incoh": 0.0, "loss/logits": 0.24292279034852982, "loss/reg": 0.0, "step": 58650 }, { "epoch": 0.38592105263157894, "grad_norm": 2.328125, "grad_norm_var": 0.04419657389322917, "learning_rate": 0.0001, "loss": 2.917, "loss/crossentropy": 2.221625339984894, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.21951914951205254, "loss/reg": 0.0, "step": 58660 }, { "epoch": 0.3859868421052632, "grad_norm": 2.296875, "grad_norm_var": 0.030402628580729167, "learning_rate": 0.0001, "loss": 2.8445, "loss/crossentropy": 2.1745887875556944, "loss/hidden": 2.6078125, "loss/incoh": 0.0, "loss/logits": 0.2107097901403904, "loss/reg": 0.0, "step": 58670 }, { "epoch": 0.38605263157894737, "grad_norm": 2.296875, "grad_norm_var": 0.036844889322916664, "learning_rate": 0.0001, "loss": 2.9025, "loss/crossentropy": 2.388902962207794, "loss/hidden": 2.7484375, "loss/incoh": 0.0, "loss/logits": 0.21923394203186036, "loss/reg": 0.0, "step": 58680 }, { "epoch": 0.38611842105263156, "grad_norm": 3.03125, "grad_norm_var": 0.05239156087239583, "learning_rate": 0.0001, "loss": 2.9008, "loss/crossentropy": 2.187870192527771, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.18367815688252448, "loss/reg": 0.0, "step": 58690 }, { "epoch": 0.3861842105263158, "grad_norm": 3.03125, "grad_norm_var": 0.11149800618489583, "learning_rate": 0.0001, "loss": 2.9087, "loss/crossentropy": 2.400851917266846, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.2535915821790695, "loss/reg": 0.0, "step": 58700 }, { "epoch": 0.38625, "grad_norm": 2.6875, "grad_norm_var": 0.11391499837239584, "learning_rate": 0.0001, "loss": 2.8704, "loss/crossentropy": 2.2505051612854006, "loss/hidden": 2.715625, "loss/incoh": 0.0, "loss/logits": 0.23379629775881766, "loss/reg": 0.0, "step": 58710 }, { "epoch": 0.3863157894736842, "grad_norm": 2.375, "grad_norm_var": 0.04019775390625, "learning_rate": 0.0001, "loss": 2.8568, "loss/crossentropy": 2.3933039546012878, "loss/hidden": 2.659375, "loss/incoh": 0.0, "loss/logits": 0.2265729159116745, "loss/reg": 0.0, "step": 58720 }, { "epoch": 0.3863815789473684, "grad_norm": 2.328125, "grad_norm_var": 0.0259674072265625, "learning_rate": 0.0001, "loss": 2.9111, "loss/crossentropy": 2.128358006477356, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.22323397397994996, "loss/reg": 0.0, "step": 58730 }, { "epoch": 0.38644736842105265, "grad_norm": 2.125, "grad_norm_var": 0.03594538370768229, "learning_rate": 0.0001, "loss": 2.8953, "loss/crossentropy": 2.1027749121189117, "loss/hidden": 2.6359375, "loss/incoh": 0.0, "loss/logits": 0.23758888244628906, "loss/reg": 0.0, "step": 58740 }, { "epoch": 0.38651315789473684, "grad_norm": 2.6875, "grad_norm_var": 0.17450536092122396, "learning_rate": 0.0001, "loss": 2.9452, "loss/crossentropy": 2.3743036508560182, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.24718946516513823, "loss/reg": 0.0, "step": 58750 }, { "epoch": 0.3865789473684211, "grad_norm": 2.40625, "grad_norm_var": 0.1627838134765625, "learning_rate": 0.0001, "loss": 2.9457, "loss/crossentropy": 2.1995988726615905, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.1942834511399269, "loss/reg": 0.0, "step": 58760 }, { "epoch": 0.38664473684210526, "grad_norm": 2.3125, "grad_norm_var": 0.019370269775390626, "learning_rate": 0.0001, "loss": 2.8399, "loss/crossentropy": 2.003353601694107, "loss/hidden": 2.5984375, "loss/incoh": 0.0, "loss/logits": 0.1943466916680336, "loss/reg": 0.0, "step": 58770 }, { "epoch": 0.38671052631578945, "grad_norm": 2.546875, "grad_norm_var": 0.0569000244140625, "learning_rate": 0.0001, "loss": 2.876, "loss/crossentropy": 2.3872076272964478, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.2202869713306427, "loss/reg": 0.0, "step": 58780 }, { "epoch": 0.3867763157894737, "grad_norm": 2.140625, "grad_norm_var": 0.05496419270833333, "learning_rate": 0.0001, "loss": 2.8611, "loss/crossentropy": 2.151517152786255, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.20806972533464432, "loss/reg": 0.0, "step": 58790 }, { "epoch": 0.3868421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.056689453125, "learning_rate": 0.0001, "loss": 2.8943, "loss/crossentropy": 2.465940701961517, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.21008821725845336, "loss/reg": 0.0, "step": 58800 }, { "epoch": 0.3869078947368421, "grad_norm": 2.6875, "grad_norm_var": 0.03302408854166667, "learning_rate": 0.0001, "loss": 2.9015, "loss/crossentropy": 1.9902961254119873, "loss/hidden": 2.7375, "loss/incoh": 0.0, "loss/logits": 0.20687344074249267, "loss/reg": 0.0, "step": 58810 }, { "epoch": 0.3869736842105263, "grad_norm": 2.3125, "grad_norm_var": 0.040934244791666664, "learning_rate": 0.0001, "loss": 2.895, "loss/crossentropy": 2.314704346656799, "loss/hidden": 2.5640625, "loss/incoh": 0.0, "loss/logits": 0.18990649282932281, "loss/reg": 0.0, "step": 58820 }, { "epoch": 0.38703947368421054, "grad_norm": 2.34375, "grad_norm_var": 0.03466389973958333, "learning_rate": 0.0001, "loss": 2.8708, "loss/crossentropy": 2.4108652234077455, "loss/hidden": 2.609375, "loss/incoh": 0.0, "loss/logits": 0.20876727104187012, "loss/reg": 0.0, "step": 58830 }, { "epoch": 0.38710526315789473, "grad_norm": 2.71875, "grad_norm_var": 0.09746805826822917, "learning_rate": 0.0001, "loss": 2.9101, "loss/crossentropy": 2.37710320353508, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.21608619689941405, "loss/reg": 0.0, "step": 58840 }, { "epoch": 0.38717105263157897, "grad_norm": 2.203125, "grad_norm_var": 0.02861328125, "learning_rate": 0.0001, "loss": 2.8802, "loss/crossentropy": 2.353898346424103, "loss/hidden": 2.60625, "loss/incoh": 0.0, "loss/logits": 0.22128148674964904, "loss/reg": 0.0, "step": 58850 }, { "epoch": 0.38723684210526316, "grad_norm": 2.5, "grad_norm_var": 0.04453125, "learning_rate": 0.0001, "loss": 2.8902, "loss/crossentropy": 2.322698414325714, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.23446093797683715, "loss/reg": 0.0, "step": 58860 }, { "epoch": 0.38730263157894734, "grad_norm": 2.59375, "grad_norm_var": 0.03387044270833333, "learning_rate": 0.0001, "loss": 2.987, "loss/crossentropy": 2.5401952505111693, "loss/hidden": 2.6265625, "loss/incoh": 0.0, "loss/logits": 0.22704993784427643, "loss/reg": 0.0, "step": 58870 }, { "epoch": 0.3873684210526316, "grad_norm": 2.265625, "grad_norm_var": 0.07504781087239583, "learning_rate": 0.0001, "loss": 2.8955, "loss/crossentropy": 2.3085227727890016, "loss/hidden": 2.5328125, "loss/incoh": 0.0, "loss/logits": 0.1909288801252842, "loss/reg": 0.0, "step": 58880 }, { "epoch": 0.38743421052631577, "grad_norm": 2.1875, "grad_norm_var": 0.032373046875, "learning_rate": 0.0001, "loss": 2.8742, "loss/crossentropy": 2.398520267009735, "loss/hidden": 2.69375, "loss/incoh": 0.0, "loss/logits": 0.21085063517093658, "loss/reg": 0.0, "step": 58890 }, { "epoch": 0.3875, "grad_norm": 2.25, "grad_norm_var": 0.053620402018229166, "learning_rate": 0.0001, "loss": 2.9193, "loss/crossentropy": 2.1338739097118378, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.30644616931676866, "loss/reg": 0.0, "step": 58900 }, { "epoch": 0.3875657894736842, "grad_norm": 1.96875, "grad_norm_var": 0.05437825520833333, "learning_rate": 0.0001, "loss": 2.8474, "loss/crossentropy": 2.24463951587677, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.20888919979333878, "loss/reg": 0.0, "step": 58910 }, { "epoch": 0.38763157894736844, "grad_norm": 2.65625, "grad_norm_var": 0.705322265625, "learning_rate": 0.0001, "loss": 2.9697, "loss/crossentropy": 2.4294981598854064, "loss/hidden": 2.5390625, "loss/incoh": 0.0, "loss/logits": 0.2036197602748871, "loss/reg": 0.0, "step": 58920 }, { "epoch": 0.3876973684210526, "grad_norm": 2.171875, "grad_norm_var": 0.3561482747395833, "learning_rate": 0.0001, "loss": 2.9351, "loss/crossentropy": 2.531821644306183, "loss/hidden": 2.628125, "loss/incoh": 0.0, "loss/logits": 0.2110217273235321, "loss/reg": 0.0, "step": 58930 }, { "epoch": 0.38776315789473687, "grad_norm": 2.03125, "grad_norm_var": 0.07768452962239583, "learning_rate": 0.0001, "loss": 2.8429, "loss/crossentropy": 2.340372371673584, "loss/hidden": 2.825, "loss/incoh": 0.0, "loss/logits": 0.21766451746225357, "loss/reg": 0.0, "step": 58940 }, { "epoch": 0.38782894736842105, "grad_norm": 2.125, "grad_norm_var": 0.05739644368489583, "learning_rate": 0.0001, "loss": 2.8411, "loss/crossentropy": 2.3367336988449097, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.19017939865589142, "loss/reg": 0.0, "step": 58950 }, { "epoch": 0.38789473684210524, "grad_norm": 2.25, "grad_norm_var": 0.093212890625, "learning_rate": 0.0001, "loss": 2.8127, "loss/crossentropy": 2.0174095928668976, "loss/hidden": 2.6109375, "loss/incoh": 0.0, "loss/logits": 0.2088319644331932, "loss/reg": 0.0, "step": 58960 }, { "epoch": 0.3879605263157895, "grad_norm": 2.625, "grad_norm_var": 0.284716796875, "learning_rate": 0.0001, "loss": 2.9601, "loss/crossentropy": 1.8936258614063264, "loss/hidden": 2.7109375, "loss/incoh": 0.0, "loss/logits": 0.18863703906536103, "loss/reg": 0.0, "step": 58970 }, { "epoch": 0.38802631578947366, "grad_norm": 2.34375, "grad_norm_var": 0.23472264607747395, "learning_rate": 0.0001, "loss": 2.8588, "loss/crossentropy": 2.256634998321533, "loss/hidden": 2.7359375, "loss/incoh": 0.0, "loss/logits": 0.24646916538476943, "loss/reg": 0.0, "step": 58980 }, { "epoch": 0.3880921052631579, "grad_norm": 2.359375, "grad_norm_var": 0.1550493876139323, "learning_rate": 0.0001, "loss": 2.9644, "loss/crossentropy": 2.288488340377808, "loss/hidden": 2.884375, "loss/incoh": 0.0, "loss/logits": 0.26915098875761034, "loss/reg": 0.0, "step": 58990 }, { "epoch": 0.3881578947368421, "grad_norm": 2.328125, "grad_norm_var": 0.10491434733072917, "learning_rate": 0.0001, "loss": 2.8031, "loss/crossentropy": 2.157363474369049, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.1842524826526642, "loss/reg": 0.0, "step": 59000 }, { "epoch": 0.38822368421052633, "grad_norm": 2.40625, "grad_norm_var": 0.39920247395833336, "learning_rate": 0.0001, "loss": 2.8309, "loss/crossentropy": 2.2694199800491335, "loss/hidden": 2.565625, "loss/incoh": 0.0, "loss/logits": 0.19238079488277435, "loss/reg": 0.0, "step": 59010 }, { "epoch": 0.3882894736842105, "grad_norm": 2.609375, "grad_norm_var": 0.10399144490559896, "learning_rate": 0.0001, "loss": 2.8951, "loss/crossentropy": 2.1333535432815554, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.19943175315856934, "loss/reg": 0.0, "step": 59020 }, { "epoch": 0.38835526315789476, "grad_norm": 2.625, "grad_norm_var": 0.03624674479166667, "learning_rate": 0.0001, "loss": 2.9185, "loss/crossentropy": 2.3492682933807374, "loss/hidden": 2.828125, "loss/incoh": 0.0, "loss/logits": 0.22507993280887603, "loss/reg": 0.0, "step": 59030 }, { "epoch": 0.38842105263157894, "grad_norm": 2.078125, "grad_norm_var": 0.05024312337239583, "learning_rate": 0.0001, "loss": 2.9221, "loss/crossentropy": 2.1066722750663756, "loss/hidden": 2.8359375, "loss/incoh": 0.0, "loss/logits": 0.24366652816534043, "loss/reg": 0.0, "step": 59040 }, { "epoch": 0.38848684210526313, "grad_norm": 2.65625, "grad_norm_var": 0.0495513916015625, "learning_rate": 0.0001, "loss": 2.9188, "loss/crossentropy": 2.378230130672455, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.21492177993059158, "loss/reg": 0.0, "step": 59050 }, { "epoch": 0.38855263157894737, "grad_norm": 2.375, "grad_norm_var": 0.03181050618489583, "learning_rate": 0.0001, "loss": 2.9002, "loss/crossentropy": 2.117634689807892, "loss/hidden": 2.7171875, "loss/incoh": 0.0, "loss/logits": 0.21551218181848525, "loss/reg": 0.0, "step": 59060 }, { "epoch": 0.38861842105263156, "grad_norm": 2.25, "grad_norm_var": 0.038960774739583336, "learning_rate": 0.0001, "loss": 2.9067, "loss/crossentropy": 2.2507598400115967, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.23783861845731735, "loss/reg": 0.0, "step": 59070 }, { "epoch": 0.3886842105263158, "grad_norm": 2.515625, "grad_norm_var": 0.03290913899739583, "learning_rate": 0.0001, "loss": 2.8411, "loss/crossentropy": 2.158875548839569, "loss/hidden": 2.5078125, "loss/incoh": 0.0, "loss/logits": 0.18021078407764435, "loss/reg": 0.0, "step": 59080 }, { "epoch": 0.38875, "grad_norm": 2.0625, "grad_norm_var": 0.060236612955729164, "learning_rate": 0.0001, "loss": 3.0357, "loss/crossentropy": 1.9093020737171174, "loss/hidden": 2.959375, "loss/incoh": 0.0, "loss/logits": 0.2339767374098301, "loss/reg": 0.0, "step": 59090 }, { "epoch": 0.3888157894736842, "grad_norm": 2.1875, "grad_norm_var": 0.07278238932291667, "learning_rate": 0.0001, "loss": 2.9038, "loss/crossentropy": 2.223426842689514, "loss/hidden": 2.5234375, "loss/incoh": 0.0, "loss/logits": 0.1933332748711109, "loss/reg": 0.0, "step": 59100 }, { "epoch": 0.3888815789473684, "grad_norm": 2.46875, "grad_norm_var": 0.05379130045572917, "learning_rate": 0.0001, "loss": 2.94, "loss/crossentropy": 2.5007378816604615, "loss/hidden": 2.6421875, "loss/incoh": 0.0, "loss/logits": 0.2263081908226013, "loss/reg": 0.0, "step": 59110 }, { "epoch": 0.38894736842105265, "grad_norm": 2.109375, "grad_norm_var": 0.06277567545572917, "learning_rate": 0.0001, "loss": 2.8934, "loss/crossentropy": 2.136933374404907, "loss/hidden": 2.8046875, "loss/incoh": 0.0, "loss/logits": 0.23307556211948394, "loss/reg": 0.0, "step": 59120 }, { "epoch": 0.38901315789473684, "grad_norm": 2.921875, "grad_norm_var": 0.08725484212239583, "learning_rate": 0.0001, "loss": 2.9692, "loss/crossentropy": 2.2667500495910646, "loss/hidden": 2.7890625, "loss/incoh": 0.0, "loss/logits": 0.22230024337768556, "loss/reg": 0.0, "step": 59130 }, { "epoch": 0.3890789473684211, "grad_norm": 2.9375, "grad_norm_var": 0.10669657389322916, "learning_rate": 0.0001, "loss": 2.9363, "loss/crossentropy": 2.032862901687622, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.2543000549077988, "loss/reg": 0.0, "step": 59140 }, { "epoch": 0.38914473684210527, "grad_norm": 2.1875, "grad_norm_var": 0.143798828125, "learning_rate": 0.0001, "loss": 2.9081, "loss/crossentropy": 2.4820448637008665, "loss/hidden": 2.6015625, "loss/incoh": 0.0, "loss/logits": 0.2191209986805916, "loss/reg": 0.0, "step": 59150 }, { "epoch": 0.38921052631578945, "grad_norm": 2.890625, "grad_norm_var": 0.112060546875, "learning_rate": 0.0001, "loss": 2.9759, "loss/crossentropy": 2.1144785642623902, "loss/hidden": 2.6734375, "loss/incoh": 0.0, "loss/logits": 0.20851679146289825, "loss/reg": 0.0, "step": 59160 }, { "epoch": 0.3892763157894737, "grad_norm": 2.34375, "grad_norm_var": 0.3047108968098958, "learning_rate": 0.0001, "loss": 2.9725, "loss/crossentropy": 2.344272494316101, "loss/hidden": 2.71875, "loss/incoh": 0.0, "loss/logits": 0.21442863196134568, "loss/reg": 0.0, "step": 59170 }, { "epoch": 0.3893421052631579, "grad_norm": 2.28125, "grad_norm_var": 0.10257161458333333, "learning_rate": 0.0001, "loss": 2.8214, "loss/crossentropy": 2.3370450377464294, "loss/hidden": 2.615625, "loss/incoh": 0.0, "loss/logits": 0.21343424320220947, "loss/reg": 0.0, "step": 59180 }, { "epoch": 0.3894078947368421, "grad_norm": 2.421875, "grad_norm_var": 0.0514068603515625, "learning_rate": 0.0001, "loss": 2.9235, "loss/crossentropy": 2.334169101715088, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.21767060309648514, "loss/reg": 0.0, "step": 59190 }, { "epoch": 0.3894736842105263, "grad_norm": 2.140625, "grad_norm_var": 0.120263671875, "learning_rate": 0.0001, "loss": 2.9212, "loss/crossentropy": 2.3568036198616027, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.19541145265102386, "loss/reg": 0.0, "step": 59200 }, { "epoch": 0.38953947368421055, "grad_norm": 2.28125, "grad_norm_var": 0.04934895833333333, "learning_rate": 0.0001, "loss": 2.8991, "loss/crossentropy": 2.3057434767484666, "loss/hidden": 2.5984375, "loss/incoh": 0.0, "loss/logits": 0.19204209931194782, "loss/reg": 0.0, "step": 59210 }, { "epoch": 0.38960526315789473, "grad_norm": 2.375, "grad_norm_var": 0.16243082682291668, "learning_rate": 0.0001, "loss": 2.9145, "loss/crossentropy": 2.3421372413635253, "loss/hidden": 2.5796875, "loss/incoh": 0.0, "loss/logits": 0.2295028880238533, "loss/reg": 0.0, "step": 59220 }, { "epoch": 0.389671052631579, "grad_norm": 2.21875, "grad_norm_var": 0.16776936848958332, "learning_rate": 0.0001, "loss": 2.8938, "loss/crossentropy": 2.242384433746338, "loss/hidden": 2.740625, "loss/incoh": 0.0, "loss/logits": 0.20074864625930786, "loss/reg": 0.0, "step": 59230 }, { "epoch": 0.38973684210526316, "grad_norm": 2.40625, "grad_norm_var": 0.11272379557291666, "learning_rate": 0.0001, "loss": 2.9376, "loss/crossentropy": 2.525466203689575, "loss/hidden": 2.70625, "loss/incoh": 0.0, "loss/logits": 0.24429584443569183, "loss/reg": 0.0, "step": 59240 }, { "epoch": 0.38980263157894735, "grad_norm": 2.34375, "grad_norm_var": 0.12301432291666667, "learning_rate": 0.0001, "loss": 2.9027, "loss/crossentropy": 2.299624133110046, "loss/hidden": 2.6921875, "loss/incoh": 0.0, "loss/logits": 0.22153230756521225, "loss/reg": 0.0, "step": 59250 }, { "epoch": 0.3898684210526316, "grad_norm": 2.84375, "grad_norm_var": 3.801495475109298e+17, "learning_rate": 0.0001, "loss": 3.0067, "loss/crossentropy": 2.3883923649787904, "loss/hidden": 2.6390625, "loss/incoh": 0.0, "loss/logits": 0.23486385643482208, "loss/reg": 0.0, "step": 59260 }, { "epoch": 0.38993421052631577, "grad_norm": 2.203125, "grad_norm_var": 3.801495474948735e+17, "learning_rate": 0.0001, "loss": 2.867, "loss/crossentropy": 2.137361264228821, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.18882133513689042, "loss/reg": 0.0, "step": 59270 }, { "epoch": 0.39, "grad_norm": 2.25, "grad_norm_var": 0.3366933186848958, "learning_rate": 0.0001, "loss": 2.8978, "loss/crossentropy": 2.0713411927223206, "loss/hidden": 2.6484375, "loss/incoh": 0.0, "loss/logits": 0.1943676121532917, "loss/reg": 0.0, "step": 59280 }, { "epoch": 0.3900657894736842, "grad_norm": 2.375, "grad_norm_var": 0.18653055826822917, "learning_rate": 0.0001, "loss": 2.8645, "loss/crossentropy": 2.3110851883888244, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.20071602910757064, "loss/reg": 0.0, "step": 59290 }, { "epoch": 0.39013157894736844, "grad_norm": 2.34375, "grad_norm_var": 0.048661295572916666, "learning_rate": 0.0001, "loss": 2.8907, "loss/crossentropy": 2.0743483901023865, "loss/hidden": 2.91875, "loss/incoh": 0.0, "loss/logits": 0.23969629555940627, "loss/reg": 0.0, "step": 59300 }, { "epoch": 0.3901973684210526, "grad_norm": 2.25, "grad_norm_var": 0.04498291015625, "learning_rate": 0.0001, "loss": 2.9117, "loss/crossentropy": 2.2797260522842406, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.2132597804069519, "loss/reg": 0.0, "step": 59310 }, { "epoch": 0.39026315789473687, "grad_norm": 2.265625, "grad_norm_var": 0.06565348307291667, "learning_rate": 0.0001, "loss": 2.9181, "loss/crossentropy": 2.5366496086120605, "loss/hidden": 2.7546875, "loss/incoh": 0.0, "loss/logits": 0.2214277595281601, "loss/reg": 0.0, "step": 59320 }, { "epoch": 0.39032894736842105, "grad_norm": 2.3125, "grad_norm_var": 0.021955362955729165, "learning_rate": 0.0001, "loss": 2.9451, "loss/crossentropy": 2.3287033438682556, "loss/hidden": 2.5296875, "loss/incoh": 0.0, "loss/logits": 0.20401861295104026, "loss/reg": 0.0, "step": 59330 }, { "epoch": 0.39039473684210524, "grad_norm": 2.421875, "grad_norm_var": 0.122509765625, "learning_rate": 0.0001, "loss": 2.9388, "loss/crossentropy": 2.397945988178253, "loss/hidden": 2.7671875, "loss/incoh": 0.0, "loss/logits": 0.2300954669713974, "loss/reg": 0.0, "step": 59340 }, { "epoch": 0.3904605263157895, "grad_norm": 2.59375, "grad_norm_var": 0.15641988118489583, "learning_rate": 0.0001, "loss": 2.9588, "loss/crossentropy": 2.1311981439590455, "loss/hidden": 2.9078125, "loss/incoh": 0.0, "loss/logits": 0.21890167593955995, "loss/reg": 0.0, "step": 59350 }, { "epoch": 0.39052631578947367, "grad_norm": 2.53125, "grad_norm_var": 0.05625712076822917, "learning_rate": 0.0001, "loss": 2.8526, "loss/crossentropy": 2.4698339462280274, "loss/hidden": 2.6609375, "loss/incoh": 0.0, "loss/logits": 0.19523532390594484, "loss/reg": 0.0, "step": 59360 }, { "epoch": 0.3905921052631579, "grad_norm": 2.84375, "grad_norm_var": 0.037653605143229164, "learning_rate": 0.0001, "loss": 2.8809, "loss/crossentropy": 2.338464593887329, "loss/hidden": 2.6671875, "loss/incoh": 0.0, "loss/logits": 0.22783634215593337, "loss/reg": 0.0, "step": 59370 }, { "epoch": 0.3906578947368421, "grad_norm": 2.390625, "grad_norm_var": 0.09213765462239583, "learning_rate": 0.0001, "loss": 2.903, "loss/crossentropy": 2.4289980471134185, "loss/hidden": 2.6453125, "loss/incoh": 0.0, "loss/logits": 0.21524901390075685, "loss/reg": 0.0, "step": 59380 }, { "epoch": 0.39072368421052633, "grad_norm": 2.328125, "grad_norm_var": 0.04986572265625, "learning_rate": 0.0001, "loss": 2.8734, "loss/crossentropy": 2.1783435702323914, "loss/hidden": 2.8109375, "loss/incoh": 0.0, "loss/logits": 0.23284209966659547, "loss/reg": 0.0, "step": 59390 }, { "epoch": 0.3907894736842105, "grad_norm": 2.0625, "grad_norm_var": 0.23740946451822917, "learning_rate": 0.0001, "loss": 2.9208, "loss/crossentropy": 2.1473644495010378, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.22180481851100922, "loss/reg": 0.0, "step": 59400 }, { "epoch": 0.39085526315789476, "grad_norm": 2.171875, "grad_norm_var": 0.2514394124348958, "learning_rate": 0.0001, "loss": 2.9122, "loss/crossentropy": 2.4968623161315917, "loss/hidden": 2.953125, "loss/incoh": 0.0, "loss/logits": 0.27531384080648424, "loss/reg": 0.0, "step": 59410 }, { "epoch": 0.39092105263157895, "grad_norm": 2.0, "grad_norm_var": 0.69918212890625, "learning_rate": 0.0001, "loss": 2.9476, "loss/crossentropy": 2.4628657698631287, "loss/hidden": 2.63125, "loss/incoh": 0.0, "loss/logits": 0.20733279213309289, "loss/reg": 0.0, "step": 59420 }, { "epoch": 0.39098684210526313, "grad_norm": 2.421875, "grad_norm_var": 0.8736490885416667, "learning_rate": 0.0001, "loss": 2.9784, "loss/crossentropy": 2.5338093519210814, "loss/hidden": 2.81875, "loss/incoh": 0.0, "loss/logits": 0.25779097676277163, "loss/reg": 0.0, "step": 59430 }, { "epoch": 0.3910526315789474, "grad_norm": 2.546875, "grad_norm_var": 2.4703387377523424e+17, "learning_rate": 0.0001, "loss": 3.0293, "loss/crossentropy": 2.341404175758362, "loss/hidden": 2.7609375, "loss/incoh": 0.0, "loss/logits": 0.2682019084692001, "loss/reg": 0.0, "step": 59440 }, { "epoch": 0.39111842105263156, "grad_norm": 2.375, "grad_norm_var": 0.04052327473958333, "learning_rate": 0.0001, "loss": 2.9282, "loss/crossentropy": 2.3629404425621034, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.20505992472171783, "loss/reg": 0.0, "step": 59450 }, { "epoch": 0.3911842105263158, "grad_norm": 2.421875, "grad_norm_var": 0.21685791015625, "learning_rate": 0.0001, "loss": 2.8574, "loss/crossentropy": 2.1211705088615416, "loss/hidden": 2.640625, "loss/incoh": 0.0, "loss/logits": 0.21120150238275529, "loss/reg": 0.0, "step": 59460 }, { "epoch": 0.39125, "grad_norm": 2.390625, "grad_norm_var": 0.20388081868489583, "learning_rate": 0.0001, "loss": 2.8422, "loss/crossentropy": 2.2279642820358276, "loss/hidden": 2.61875, "loss/incoh": 0.0, "loss/logits": 0.18344138339161872, "loss/reg": 0.0, "step": 59470 }, { "epoch": 0.39131578947368423, "grad_norm": 2.390625, "grad_norm_var": 0.05240478515625, "learning_rate": 0.0001, "loss": 2.8765, "loss/crossentropy": 2.2770405650138854, "loss/hidden": 2.6578125, "loss/incoh": 0.0, "loss/logits": 0.1978240042924881, "loss/reg": 0.0, "step": 59480 }, { "epoch": 0.3913815789473684, "grad_norm": 2.34375, "grad_norm_var": 0.0437896728515625, "learning_rate": 0.0001, "loss": 2.851, "loss/crossentropy": 2.420912528038025, "loss/hidden": 2.75625, "loss/incoh": 0.0, "loss/logits": 0.22646478563547134, "loss/reg": 0.0, "step": 59490 }, { "epoch": 0.39144736842105265, "grad_norm": 2.421875, "grad_norm_var": 0.17076416015625, "learning_rate": 0.0001, "loss": 2.9928, "loss/crossentropy": 2.406515657901764, "loss/hidden": 2.946875, "loss/incoh": 0.0, "loss/logits": 0.21923745721578597, "loss/reg": 0.0, "step": 59500 }, { "epoch": 0.39151315789473684, "grad_norm": 3.140625, "grad_norm_var": 0.0912994384765625, "learning_rate": 0.0001, "loss": 2.8558, "loss/crossentropy": 2.1866167187690735, "loss/hidden": 2.721875, "loss/incoh": 0.0, "loss/logits": 0.21702995747327805, "loss/reg": 0.0, "step": 59510 }, { "epoch": 0.391578947368421, "grad_norm": 2.453125, "grad_norm_var": 0.07242431640625, "learning_rate": 0.0001, "loss": 2.9022, "loss/crossentropy": 2.4201531052589416, "loss/hidden": 2.7328125, "loss/incoh": 0.0, "loss/logits": 0.22395686954259872, "loss/reg": 0.0, "step": 59520 }, { "epoch": 0.39164473684210527, "grad_norm": 2.484375, "grad_norm_var": 0.19830729166666666, "learning_rate": 0.0001, "loss": 2.906, "loss/crossentropy": 2.2090616106986998, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.21522201895713805, "loss/reg": 0.0, "step": 59530 }, { "epoch": 0.39171052631578945, "grad_norm": 2.375, "grad_norm_var": 0.0775054931640625, "learning_rate": 0.0001, "loss": 2.9403, "loss/crossentropy": 2.2624866247177122, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.20676956102252006, "loss/reg": 0.0, "step": 59540 }, { "epoch": 0.3917763157894737, "grad_norm": 2.625, "grad_norm_var": 0.017985026041666668, "learning_rate": 0.0001, "loss": 2.9742, "loss/crossentropy": 2.1916024923324584, "loss/hidden": 2.6765625, "loss/incoh": 0.0, "loss/logits": 0.20450473725795745, "loss/reg": 0.0, "step": 59550 }, { "epoch": 0.3918421052631579, "grad_norm": 2.3125, "grad_norm_var": 0.05416259765625, "learning_rate": 0.0001, "loss": 2.925, "loss/crossentropy": 2.4917872667312624, "loss/hidden": 2.6875, "loss/incoh": 0.0, "loss/logits": 0.21785712838172913, "loss/reg": 0.0, "step": 59560 }, { "epoch": 0.3919078947368421, "grad_norm": 2.453125, "grad_norm_var": 0.24851786295572917, "learning_rate": 0.0001, "loss": 2.8797, "loss/crossentropy": 2.2272498965263368, "loss/hidden": 2.6515625, "loss/incoh": 0.0, "loss/logits": 0.1961108259856701, "loss/reg": 0.0, "step": 59570 }, { "epoch": 0.3919736842105263, "grad_norm": 3.4375, "grad_norm_var": 0.1443267822265625, "learning_rate": 0.0001, "loss": 2.9077, "loss/crossentropy": 2.2413615703582765, "loss/hidden": 2.865625, "loss/incoh": 0.0, "loss/logits": 0.25376398414373397, "loss/reg": 0.0, "step": 59580 }, { "epoch": 0.39203947368421055, "grad_norm": 2.53125, "grad_norm_var": 0.1130523681640625, "learning_rate": 0.0001, "loss": 2.9178, "loss/crossentropy": 2.216384744644165, "loss/hidden": 2.796875, "loss/incoh": 0.0, "loss/logits": 0.26899572014808654, "loss/reg": 0.0, "step": 59590 }, { "epoch": 0.39210526315789473, "grad_norm": 2.265625, "grad_norm_var": 0.07809956868489583, "learning_rate": 0.0001, "loss": 2.8974, "loss/crossentropy": 2.3132822632789614, "loss/hidden": 2.5484375, "loss/incoh": 0.0, "loss/logits": 0.19529758393764496, "loss/reg": 0.0, "step": 59600 }, { "epoch": 0.3921710526315789, "grad_norm": 2.21875, "grad_norm_var": 0.07805989583333334, "learning_rate": 0.0001, "loss": 2.9383, "loss/crossentropy": 2.1697922348976135, "loss/hidden": 2.678125, "loss/incoh": 0.0, "loss/logits": 0.20721041411161423, "loss/reg": 0.0, "step": 59610 }, { "epoch": 0.39223684210526316, "grad_norm": 2.15625, "grad_norm_var": 0.05035807291666667, "learning_rate": 0.0001, "loss": 2.8932, "loss/crossentropy": 2.559211003780365, "loss/hidden": 2.5890625, "loss/incoh": 0.0, "loss/logits": 0.2241365447640419, "loss/reg": 0.0, "step": 59620 }, { "epoch": 0.39230263157894735, "grad_norm": 2.359375, "grad_norm_var": 0.3782297770182292, "learning_rate": 0.0001, "loss": 2.924, "loss/crossentropy": 2.1259081840515135, "loss/hidden": 2.8828125, "loss/incoh": 0.0, "loss/logits": 0.24934625178575515, "loss/reg": 0.0, "step": 59630 }, { "epoch": 0.3923684210526316, "grad_norm": 2.1875, "grad_norm_var": 0.026416015625, "learning_rate": 0.0001, "loss": 2.8975, "loss/crossentropy": 2.5208987474441527, "loss/hidden": 2.5578125, "loss/incoh": 0.0, "loss/logits": 0.18677499890327454, "loss/reg": 0.0, "step": 59640 }, { "epoch": 0.3924342105263158, "grad_norm": 2.15625, "grad_norm_var": 0.027408854166666666, "learning_rate": 0.0001, "loss": 2.9065, "loss/crossentropy": 2.0934022426605225, "loss/hidden": 2.7203125, "loss/incoh": 0.0, "loss/logits": 0.21657732576131822, "loss/reg": 0.0, "step": 59650 }, { "epoch": 0.3925, "grad_norm": 2.375, "grad_norm_var": 0.040257771809895836, "learning_rate": 0.0001, "loss": 2.856, "loss/crossentropy": 2.208497977256775, "loss/hidden": 2.7, "loss/incoh": 0.0, "loss/logits": 0.21315317377448081, "loss/reg": 0.0, "step": 59660 }, { "epoch": 0.3925657894736842, "grad_norm": 2.4375, "grad_norm_var": 0.22158203125, "learning_rate": 0.0001, "loss": 2.8814, "loss/crossentropy": 2.242791175842285, "loss/hidden": 2.775, "loss/incoh": 0.0, "loss/logits": 0.20608874261379242, "loss/reg": 0.0, "step": 59670 }, { "epoch": 0.39263157894736844, "grad_norm": 2.390625, "grad_norm_var": 0.028986612955729168, "learning_rate": 0.0001, "loss": 2.95, "loss/crossentropy": 2.162680196762085, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.21429501920938493, "loss/reg": 0.0, "step": 59680 }, { "epoch": 0.39269736842105263, "grad_norm": 2.4375, "grad_norm_var": 0.053685506184895836, "learning_rate": 0.0001, "loss": 2.9106, "loss/crossentropy": 2.4090898275375365, "loss/hidden": 2.696875, "loss/incoh": 0.0, "loss/logits": 0.2186921328306198, "loss/reg": 0.0, "step": 59690 }, { "epoch": 0.39276315789473687, "grad_norm": 2.3125, "grad_norm_var": 0.07746988932291667, "learning_rate": 0.0001, "loss": 2.9127, "loss/crossentropy": 1.872154599428177, "loss/hidden": 2.609375, "loss/incoh": 0.0, "loss/logits": 0.18660937994718552, "loss/reg": 0.0, "step": 59700 }, { "epoch": 0.39282894736842106, "grad_norm": 2.34375, "grad_norm_var": 0.052079264322916666, "learning_rate": 0.0001, "loss": 2.9433, "loss/crossentropy": 1.9708613991737365, "loss/hidden": 3.1328125, "loss/incoh": 0.0, "loss/logits": 0.22336742803454399, "loss/reg": 0.0, "step": 59710 }, { "epoch": 0.39289473684210524, "grad_norm": 2.125, "grad_norm_var": 0.046784464518229166, "learning_rate": 0.0001, "loss": 2.8483, "loss/crossentropy": 2.328443133831024, "loss/hidden": 2.6125, "loss/incoh": 0.0, "loss/logits": 0.21370970383286475, "loss/reg": 0.0, "step": 59720 }, { "epoch": 0.3929605263157895, "grad_norm": 2.59375, "grad_norm_var": 0.10468648274739584, "learning_rate": 0.0001, "loss": 2.8818, "loss/crossentropy": 2.3042648136615753, "loss/hidden": 2.7578125, "loss/incoh": 0.0, "loss/logits": 0.27898505628108977, "loss/reg": 0.0, "step": 59730 }, { "epoch": 0.39302631578947367, "grad_norm": 2.484375, "grad_norm_var": 0.13742574055989584, "learning_rate": 0.0001, "loss": 2.9223, "loss/crossentropy": 2.2195008516311647, "loss/hidden": 2.7015625, "loss/incoh": 0.0, "loss/logits": 0.21260949820280076, "loss/reg": 0.0, "step": 59740 }, { "epoch": 0.3930921052631579, "grad_norm": 2.234375, "grad_norm_var": 0.12626546223958332, "learning_rate": 0.0001, "loss": 2.8594, "loss/crossentropy": 2.297947871685028, "loss/hidden": 2.5828125, "loss/incoh": 0.0, "loss/logits": 0.19618549048900605, "loss/reg": 0.0, "step": 59750 }, { "epoch": 0.3931578947368421, "grad_norm": 2.34375, "grad_norm_var": 1.106639607747396, "learning_rate": 0.0001, "loss": 2.905, "loss/crossentropy": 2.2195895671844483, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.21477752551436424, "loss/reg": 0.0, "step": 59760 }, { "epoch": 0.39322368421052634, "grad_norm": 2.46875, "grad_norm_var": 1.0701497395833333, "learning_rate": 0.0001, "loss": 2.9041, "loss/crossentropy": 2.3331664562225343, "loss/hidden": 2.621875, "loss/incoh": 0.0, "loss/logits": 0.22431352138519287, "loss/reg": 0.0, "step": 59770 }, { "epoch": 0.3932894736842105, "grad_norm": 2.453125, "grad_norm_var": 0.11057942708333333, "learning_rate": 0.0001, "loss": 2.8553, "loss/crossentropy": 2.2346091866493225, "loss/hidden": 2.5796875, "loss/incoh": 0.0, "loss/logits": 0.18155529350042343, "loss/reg": 0.0, "step": 59780 }, { "epoch": 0.39335526315789476, "grad_norm": 2.21875, "grad_norm_var": 0.0525787353515625, "learning_rate": 0.0001, "loss": 2.9153, "loss/crossentropy": 2.272337865829468, "loss/hidden": 2.765625, "loss/incoh": 0.0, "loss/logits": 0.28159644156694413, "loss/reg": 0.0, "step": 59790 }, { "epoch": 0.39342105263157895, "grad_norm": 2.03125, "grad_norm_var": 0.032079060872395836, "learning_rate": 0.0001, "loss": 2.7823, "loss/crossentropy": 2.2685976147651674, "loss/hidden": 2.6140625, "loss/incoh": 0.0, "loss/logits": 0.1990963637828827, "loss/reg": 0.0, "step": 59800 }, { "epoch": 0.39348684210526313, "grad_norm": 2.421875, "grad_norm_var": 0.029345703125, "learning_rate": 0.0001, "loss": 2.8633, "loss/crossentropy": 2.161335849761963, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.20217425376176834, "loss/reg": 0.0, "step": 59810 }, { "epoch": 0.3935526315789474, "grad_norm": 2.734375, "grad_norm_var": 0.07041727701822917, "learning_rate": 0.0001, "loss": 2.9129, "loss/crossentropy": 2.2743474364280702, "loss/hidden": 2.521875, "loss/incoh": 0.0, "loss/logits": 0.18422066047787666, "loss/reg": 0.0, "step": 59820 }, { "epoch": 0.39361842105263156, "grad_norm": 2.515625, "grad_norm_var": 0.047163899739583334, "learning_rate": 0.0001, "loss": 2.8942, "loss/crossentropy": 2.1809528291225435, "loss/hidden": 2.73125, "loss/incoh": 0.0, "loss/logits": 0.1976180911064148, "loss/reg": 0.0, "step": 59830 }, { "epoch": 0.3936842105263158, "grad_norm": 2.484375, "grad_norm_var": 0.05513916015625, "learning_rate": 0.0001, "loss": 2.9911, "loss/crossentropy": 2.2889644503593445, "loss/hidden": 2.840625, "loss/incoh": 0.0, "loss/logits": 0.27457381039857864, "loss/reg": 0.0, "step": 59840 }, { "epoch": 0.39375, "grad_norm": 2.25, "grad_norm_var": 0.09114481608072916, "learning_rate": 0.0001, "loss": 2.9145, "loss/crossentropy": 2.3241925120353697, "loss/hidden": 2.7828125, "loss/incoh": 0.0, "loss/logits": 0.25822959989309313, "loss/reg": 0.0, "step": 59850 }, { "epoch": 0.39381578947368423, "grad_norm": 2.34375, "grad_norm_var": 0.10615132649739584, "learning_rate": 0.0001, "loss": 2.8812, "loss/crossentropy": 2.512364888191223, "loss/hidden": 2.75, "loss/incoh": 0.0, "loss/logits": 0.22594975233078002, "loss/reg": 0.0, "step": 59860 }, { "epoch": 0.3938815789473684, "grad_norm": 2.328125, "grad_norm_var": 0.09424540201822916, "learning_rate": 0.0001, "loss": 2.9062, "loss/crossentropy": 1.9462629437446595, "loss/hidden": 2.671875, "loss/incoh": 0.0, "loss/logits": 0.2077486865222454, "loss/reg": 0.0, "step": 59870 }, { "epoch": 0.39394736842105266, "grad_norm": 2.453125, "grad_norm_var": 0.0617095947265625, "learning_rate": 0.0001, "loss": 2.8656, "loss/crossentropy": 2.4766764640808105, "loss/hidden": 2.759375, "loss/incoh": 0.0, "loss/logits": 0.21610620021820068, "loss/reg": 0.0, "step": 59880 }, { "epoch": 0.39401315789473684, "grad_norm": 2.203125, "grad_norm_var": 0.053929646809895836, "learning_rate": 0.0001, "loss": 2.8509, "loss/crossentropy": 2.2846495389938353, "loss/hidden": 2.5609375, "loss/incoh": 0.0, "loss/logits": 0.1908312901854515, "loss/reg": 0.0, "step": 59890 }, { "epoch": 0.39407894736842103, "grad_norm": 2.203125, "grad_norm_var": 0.3528391520182292, "learning_rate": 0.0001, "loss": 2.8513, "loss/crossentropy": 2.2797790288925173, "loss/hidden": 2.6625, "loss/incoh": 0.0, "loss/logits": 0.20919702500104903, "loss/reg": 0.0, "step": 59900 }, { "epoch": 0.39414473684210527, "grad_norm": 2.390625, "grad_norm_var": 0.4265696207682292, "learning_rate": 0.0001, "loss": 2.8959, "loss/crossentropy": 2.5101353883743287, "loss/hidden": 2.665625, "loss/incoh": 0.0, "loss/logits": 0.20757998675107955, "loss/reg": 0.0, "step": 59910 }, { "epoch": 0.39421052631578946, "grad_norm": 2.125, "grad_norm_var": 0.06754150390625, "learning_rate": 0.0001, "loss": 2.8635, "loss/crossentropy": 1.9059808373451232, "loss/hidden": 2.628125, "loss/incoh": 0.0, "loss/logits": 0.17825957536697387, "loss/reg": 0.0, "step": 59920 }, { "epoch": 0.3942763157894737, "grad_norm": 2.328125, "grad_norm_var": 0.04439468383789062, "learning_rate": 0.0001, "loss": 2.8472, "loss/crossentropy": 2.140412425994873, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.2095640070736408, "loss/reg": 0.0, "step": 59930 }, { "epoch": 0.3943421052631579, "grad_norm": 2.296875, "grad_norm_var": 0.038822174072265625, "learning_rate": 0.0001, "loss": 2.8266, "loss/crossentropy": 2.7443739652633665, "loss/hidden": 2.6796875, "loss/incoh": 0.0, "loss/logits": 0.21178169846534728, "loss/reg": 0.0, "step": 59940 }, { "epoch": 0.3944078947368421, "grad_norm": 2.421875, "grad_norm_var": 0.03676732381184896, "learning_rate": 0.0001, "loss": 2.8447, "loss/crossentropy": 2.5024601101875303, "loss/hidden": 2.80625, "loss/incoh": 0.0, "loss/logits": 0.2459690824151039, "loss/reg": 0.0, "step": 59950 }, { "epoch": 0.3944736842105263, "grad_norm": 2.46875, "grad_norm_var": 0.06887105305989584, "learning_rate": 0.0001, "loss": 2.8767, "loss/crossentropy": 2.1928273320198057, "loss/hidden": 2.78125, "loss/incoh": 0.0, "loss/logits": 0.23279945105314254, "loss/reg": 0.0, "step": 59960 }, { "epoch": 0.39453947368421055, "grad_norm": 2.34375, "grad_norm_var": 0.15691731770833334, "learning_rate": 0.0001, "loss": 2.8664, "loss/crossentropy": 2.293261182308197, "loss/hidden": 2.778125, "loss/incoh": 0.0, "loss/logits": 0.2522956483066082, "loss/reg": 0.0, "step": 59970 }, { "epoch": 0.39460526315789474, "grad_norm": 2.28125, "grad_norm_var": 0.12585347493489582, "learning_rate": 0.0001, "loss": 2.8782, "loss/crossentropy": 2.3621891856193544, "loss/hidden": 2.8125, "loss/incoh": 0.0, "loss/logits": 0.2567524313926697, "loss/reg": 0.0, "step": 59980 }, { "epoch": 0.3946710526315789, "grad_norm": 2.28125, "grad_norm_var": 0.035456339518229164, "learning_rate": 0.0001, "loss": 2.8968, "loss/crossentropy": 2.5054262280464172, "loss/hidden": 2.6953125, "loss/incoh": 0.0, "loss/logits": 0.24000230729579924, "loss/reg": 0.0, "step": 59990 }, { "epoch": 0.39473684210526316, "grad_norm": 2.25, "grad_norm_var": 0.0948150634765625, "learning_rate": 0.0001, "loss": 2.9276, "loss/crossentropy": 2.2215917229652407, "loss/hidden": 2.878125, "loss/incoh": 0.0, "loss/logits": 0.2282010167837143, "loss/reg": 0.0, "step": 60000 } ], "logging_steps": 10, "max_steps": 152000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.28626504801321e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }