diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,52033 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2631578947368421, + "eval_steps": 2000, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 6.578947368421052e-05, + "grad_norm": 992.0, + "learning_rate": 1e-05, + "loss": 37.1063, + "loss/crossentropy": 15.088774585723877, + "loss/hidden": 19.0875, + "loss/incoh": 0.0, + "loss/logits": 17.867034912109375, + "loss/reg": 0.0, + "step": 10 + }, + { + "epoch": 0.00013157894736842105, + "grad_norm": 408.0, + "grad_norm_var": 138977.05, + "learning_rate": 2e-05, + "loss": 34.8921, + "loss/crossentropy": 14.647695541381836, + "loss/hidden": 18.8125, + "loss/incoh": 0.0, + "loss/logits": 15.686719226837159, + "loss/reg": 0.0, + "step": 20 + }, + { + "epoch": 0.00019736842105263157, + "grad_norm": 296.0, + "grad_norm_var": 17795.066666666666, + "learning_rate": 3e-05, + "loss": 32.2418, + "loss/crossentropy": 14.60380687713623, + "loss/hidden": 18.6375, + "loss/incoh": 0.0, + "loss/logits": 13.480152130126953, + "loss/reg": 0.0, + "step": 30 + }, + { + "epoch": 0.0002631578947368421, + "grad_norm": 26.125, + "grad_norm_var": 30349.038525390624, + "learning_rate": 4e-05, + "loss": 28.5558, + "loss/crossentropy": 15.434869766235352, + "loss/hidden": 18.525, + "loss/incoh": 0.0, + "loss/logits": 10.245227527618407, + "loss/reg": 0.0, + "step": 40 + }, + { + "epoch": 0.0003289473684210526, + "grad_norm": 51.25, + "grad_norm_var": 160.51717122395834, + "learning_rate": 5e-05, + "loss": 27.8535, + "loss/crossentropy": 13.159739780426026, + "loss/hidden": 18.4, + "loss/incoh": 0.0, + "loss/logits": 10.088242149353027, + "loss/reg": 0.0, + "step": 50 + }, + { + "epoch": 0.00039473684210526315, + "grad_norm": 22.125, + "grad_norm_var": 1.3315524858929522e+17, + "learning_rate": 6e-05, + "loss": 26.8127, + "loss/crossentropy": 10.868320941925049, + "loss/hidden": 18.1375, + "loss/incoh": 0.0, + "loss/logits": 8.643436527252197, + "loss/reg": 0.0, + "step": 60 + }, + { + "epoch": 0.0004605263157894737, + "grad_norm": 25.625, + "grad_norm_var": 1.3315524982997056e+17, + "learning_rate": 7e-05, + "loss": 25.6909, + "loss/crossentropy": 10.59145736694336, + "loss/hidden": 17.65, + "loss/incoh": 0.0, + "loss/logits": 8.497289371490478, + "loss/reg": 0.0, + "step": 70 + }, + { + "epoch": 0.0005263157894736842, + "grad_norm": 32.75, + "grad_norm_var": 206.156103515625, + "learning_rate": 8e-05, + "loss": 24.5919, + "loss/crossentropy": 10.117278575897217, + "loss/hidden": 17.025, + "loss/incoh": 0.0, + "loss/logits": 8.660355854034425, + "loss/reg": 0.0, + "step": 80 + }, + { + "epoch": 0.0005921052631578948, + "grad_norm": 128.0, + "grad_norm_var": 1129.743212890625, + "learning_rate": 9e-05, + "loss": 23.5218, + "loss/crossentropy": 9.528321361541748, + "loss/hidden": 16.3125, + "loss/incoh": 0.0, + "loss/logits": 6.695920991897583, + "loss/reg": 0.0, + "step": 90 + }, + { + "epoch": 0.0006578947368421052, + "grad_norm": 68.5, + "grad_norm_var": 439.56223958333334, + "learning_rate": 0.0001, + "loss": 23.2768, + "loss/crossentropy": 10.33118553161621, + "loss/hidden": 16.5125, + "loss/incoh": 0.0, + "loss/logits": 7.274227619171143, + "loss/reg": 0.0, + "step": 100 + }, + { + "epoch": 0.0007236842105263158, + "grad_norm": 73.5, + "grad_norm_var": 2741.439518229167, + "learning_rate": 0.0001, + "loss": 22.6559, + "loss/crossentropy": 9.676900672912598, + "loss/hidden": 16.21875, + "loss/incoh": 0.0, + "loss/logits": 7.22898006439209, + "loss/reg": 0.0, + "step": 110 + }, + { + "epoch": 0.0007894736842105263, + "grad_norm": 115.5, + "grad_norm_var": 1451.31875, + "learning_rate": 0.0001, + "loss": 22.7316, + "loss/crossentropy": 9.472201251983643, + "loss/hidden": 16.03125, + "loss/incoh": 0.0, + "loss/logits": 6.231936502456665, + "loss/reg": 0.0, + "step": 120 + }, + { + "epoch": 0.0008552631578947369, + "grad_norm": 32.25, + "grad_norm_var": 939.03515625, + "learning_rate": 0.0001, + "loss": 22.1075, + "loss/crossentropy": 9.909808540344239, + "loss/hidden": 16.075, + "loss/incoh": 0.0, + "loss/logits": 6.337067890167236, + "loss/reg": 0.0, + "step": 130 + }, + { + "epoch": 0.0009210526315789473, + "grad_norm": 37.75, + "grad_norm_var": 222.684375, + "learning_rate": 0.0001, + "loss": 22.103, + "loss/crossentropy": 9.516400051116943, + "loss/hidden": 15.8875, + "loss/incoh": 0.0, + "loss/logits": 5.677393054962158, + "loss/reg": 0.0, + "step": 140 + }, + { + "epoch": 0.000986842105263158, + "grad_norm": 46.25, + "grad_norm_var": 1221.1551432291667, + "learning_rate": 0.0001, + "loss": 21.5532, + "loss/crossentropy": 9.398476314544677, + "loss/hidden": 15.70625, + "loss/incoh": 0.0, + "loss/logits": 6.764282178878784, + "loss/reg": 0.0, + "step": 150 + }, + { + "epoch": 0.0010526315789473684, + "grad_norm": 42.0, + "grad_norm_var": 416.9809895833333, + "learning_rate": 0.0001, + "loss": 20.8381, + "loss/crossentropy": 9.098556232452392, + "loss/hidden": 15.225, + "loss/incoh": 0.0, + "loss/logits": 5.8288336277008055, + "loss/reg": 0.0, + "step": 160 + }, + { + "epoch": 0.0011184210526315789, + "grad_norm": 33.25, + "grad_norm_var": 334.8979166666667, + "learning_rate": 0.0001, + "loss": 18.9261, + "loss/crossentropy": 7.879999303817749, + "loss/hidden": 13.65, + "loss/incoh": 0.0, + "loss/logits": 5.353159737586975, + "loss/reg": 0.0, + "step": 170 + }, + { + "epoch": 0.0011842105263157896, + "grad_norm": 14.375, + "grad_norm_var": 250.13333333333333, + "learning_rate": 0.0001, + "loss": 16.5004, + "loss/crossentropy": 6.681199312210083, + "loss/hidden": 12.0125, + "loss/incoh": 0.0, + "loss/logits": 4.6115447044372555, + "loss/reg": 0.0, + "step": 180 + }, + { + "epoch": 0.00125, + "grad_norm": 12.5625, + "grad_norm_var": 119.9546875, + "learning_rate": 0.0001, + "loss": 14.1282, + "loss/crossentropy": 5.785468435287475, + "loss/hidden": 10.725, + "loss/incoh": 0.0, + "loss/logits": 3.489908790588379, + "loss/reg": 0.0, + "step": 190 + }, + { + "epoch": 0.0013157894736842105, + "grad_norm": 12.625, + "grad_norm_var": 32.467431640625, + "learning_rate": 0.0001, + "loss": 12.8216, + "loss/crossentropy": 4.923622274398804, + "loss/hidden": 9.675, + "loss/incoh": 0.0, + "loss/logits": 3.1496715903282166, + "loss/reg": 0.0, + "step": 200 + }, + { + "epoch": 0.001381578947368421, + "grad_norm": 26.375, + "grad_norm_var": 22.6759765625, + "learning_rate": 0.0001, + "loss": 11.5516, + "loss/crossentropy": 4.429650473594665, + "loss/hidden": 8.875, + "loss/incoh": 0.0, + "loss/logits": 2.247162342071533, + "loss/reg": 0.0, + "step": 210 + }, + { + "epoch": 0.0014473684210526317, + "grad_norm": 35.5, + "grad_norm_var": 45.67233072916667, + "learning_rate": 0.0001, + "loss": 10.495, + "loss/crossentropy": 4.112493515014648, + "loss/hidden": 8.346875, + "loss/incoh": 0.0, + "loss/logits": 2.2232163310050965, + "loss/reg": 0.0, + "step": 220 + }, + { + "epoch": 0.0015131578947368421, + "grad_norm": 37.5, + "grad_norm_var": 58.01770833333333, + "learning_rate": 0.0001, + "loss": 9.9703, + "loss/crossentropy": 4.019938945770264, + "loss/hidden": 8.015625, + "loss/incoh": 0.0, + "loss/logits": 2.0913679361343385, + "loss/reg": 0.0, + "step": 230 + }, + { + "epoch": 0.0015789473684210526, + "grad_norm": 36.25, + "grad_norm_var": 53.95390625, + "learning_rate": 0.0001, + "loss": 9.5394, + "loss/crossentropy": 3.9986461877822874, + "loss/hidden": 7.490625, + "loss/incoh": 0.0, + "loss/logits": 1.801698899269104, + "loss/reg": 0.0, + "step": 240 + }, + { + "epoch": 0.001644736842105263, + "grad_norm": 34.0, + "grad_norm_var": 913.9712890625, + "learning_rate": 0.0001, + "loss": 9.1248, + "loss/crossentropy": 3.7174268484115602, + "loss/hidden": 7.721875, + "loss/incoh": 0.0, + "loss/logits": 2.0452203273773195, + "loss/reg": 0.0, + "step": 250 + }, + { + "epoch": 0.0017105263157894738, + "grad_norm": 30.625, + "grad_norm_var": 901.7176432291667, + "learning_rate": 0.0001, + "loss": 8.7525, + "loss/crossentropy": 3.6645556688308716, + "loss/hidden": 7.36875, + "loss/incoh": 0.0, + "loss/logits": 1.4742069363594055, + "loss/reg": 0.0, + "step": 260 + }, + { + "epoch": 0.0017763157894736842, + "grad_norm": 40.0, + "grad_norm_var": 110.65826822916667, + "learning_rate": 0.0001, + "loss": 8.8113, + "loss/crossentropy": 3.116227722167969, + "loss/hidden": 7.31875, + "loss/incoh": 0.0, + "loss/logits": 1.6629727721214294, + "loss/reg": 0.0, + "step": 270 + }, + { + "epoch": 0.0018421052631578947, + "grad_norm": 34.75, + "grad_norm_var": 122.43170572916667, + "learning_rate": 0.0001, + "loss": 8.5312, + "loss/crossentropy": 3.413420820236206, + "loss/hidden": 7.38125, + "loss/incoh": 0.0, + "loss/logits": 1.3304585099220276, + "loss/reg": 0.0, + "step": 280 + }, + { + "epoch": 0.0019078947368421052, + "grad_norm": 30.25, + "grad_norm_var": 50.46640625, + "learning_rate": 0.0001, + "loss": 8.1172, + "loss/crossentropy": 3.313588786125183, + "loss/hidden": 6.721875, + "loss/incoh": 0.0, + "loss/logits": 1.1558095216751099, + "loss/reg": 0.0, + "step": 290 + }, + { + "epoch": 0.001973684210526316, + "grad_norm": 33.5, + "grad_norm_var": 66.95618489583333, + "learning_rate": 0.0001, + "loss": 8.4831, + "loss/crossentropy": 3.286371445655823, + "loss/hidden": 6.971875, + "loss/incoh": 0.0, + "loss/logits": 1.4257299542427062, + "loss/reg": 0.0, + "step": 300 + }, + { + "epoch": 0.0020394736842105263, + "grad_norm": 37.0, + "grad_norm_var": 39.40930989583333, + "learning_rate": 0.0001, + "loss": 8.1428, + "loss/crossentropy": 3.1924397230148314, + "loss/hidden": 6.7375, + "loss/incoh": 0.0, + "loss/logits": 1.129437392950058, + "loss/reg": 0.0, + "step": 310 + }, + { + "epoch": 0.002105263157894737, + "grad_norm": 35.75, + "grad_norm_var": 60.73125, + "learning_rate": 0.0001, + "loss": 8.1236, + "loss/crossentropy": 3.217240035533905, + "loss/hidden": 7.0375, + "loss/incoh": 0.0, + "loss/logits": 1.216874635219574, + "loss/reg": 0.0, + "step": 320 + }, + { + "epoch": 0.0021710526315789473, + "grad_norm": 32.75, + "grad_norm_var": 21.7181640625, + "learning_rate": 0.0001, + "loss": 8.0115, + "loss/crossentropy": 3.2230591058731077, + "loss/hidden": 6.665625, + "loss/incoh": 0.0, + "loss/logits": 1.165043205022812, + "loss/reg": 0.0, + "step": 330 + }, + { + "epoch": 0.0022368421052631577, + "grad_norm": 38.75, + "grad_norm_var": 247.53932291666666, + "learning_rate": 0.0001, + "loss": 7.8717, + "loss/crossentropy": 3.491655874252319, + "loss/hidden": 6.628125, + "loss/incoh": 0.0, + "loss/logits": 1.1553439140319823, + "loss/reg": 0.0, + "step": 340 + }, + { + "epoch": 0.002302631578947368, + "grad_norm": 29.0, + "grad_norm_var": 243.79576822916667, + "learning_rate": 0.0001, + "loss": 7.8354, + "loss/crossentropy": 3.3709447622299193, + "loss/hidden": 6.646875, + "loss/incoh": 0.0, + "loss/logits": 1.1052397668361664, + "loss/reg": 0.0, + "step": 350 + }, + { + "epoch": 0.002368421052631579, + "grad_norm": 28.75, + "grad_norm_var": 40.61640625, + "learning_rate": 0.0001, + "loss": 7.5894, + "loss/crossentropy": 3.0430339336395265, + "loss/hidden": 6.778125, + "loss/incoh": 0.0, + "loss/logits": 1.152853137254715, + "loss/reg": 0.0, + "step": 360 + }, + { + "epoch": 0.0024342105263157896, + "grad_norm": 50.0, + "grad_norm_var": 58.61640625, + "learning_rate": 0.0001, + "loss": 7.7607, + "loss/crossentropy": 3.461497259140015, + "loss/hidden": 6.640625, + "loss/incoh": 0.0, + "loss/logits": 1.2907899796962738, + "loss/reg": 0.0, + "step": 370 + }, + { + "epoch": 0.0025, + "grad_norm": 23.875, + "grad_norm_var": 50.36145833333333, + "learning_rate": 0.0001, + "loss": 7.5895, + "loss/crossentropy": 2.8183989763259887, + "loss/hidden": 6.36875, + "loss/incoh": 0.0, + "loss/logits": 0.9597792446613311, + "loss/reg": 0.0, + "step": 380 + }, + { + "epoch": 0.0025657894736842105, + "grad_norm": 22.75, + "grad_norm_var": 18.348958333333332, + "learning_rate": 0.0001, + "loss": 7.4024, + "loss/crossentropy": 3.0434406876564024, + "loss/hidden": 6.425, + "loss/incoh": 0.0, + "loss/logits": 1.1219703614711762, + "loss/reg": 0.0, + "step": 390 + }, + { + "epoch": 0.002631578947368421, + "grad_norm": 18.75, + "grad_norm_var": 75.46451822916667, + "learning_rate": 0.0001, + "loss": 7.4663, + "loss/crossentropy": 2.9813458204269407, + "loss/hidden": 6.334375, + "loss/incoh": 0.0, + "loss/logits": 1.0157755613327026, + "loss/reg": 0.0, + "step": 400 + }, + { + "epoch": 0.0026973684210526315, + "grad_norm": 20.25, + "grad_norm_var": 14.660416666666666, + "learning_rate": 0.0001, + "loss": 7.4425, + "loss/crossentropy": 3.030743360519409, + "loss/hidden": 6.23125, + "loss/incoh": 0.0, + "loss/logits": 1.0403401851654053, + "loss/reg": 0.0, + "step": 410 + }, + { + "epoch": 0.002763157894736842, + "grad_norm": 19.75, + "grad_norm_var": 6.712239583333333, + "learning_rate": 0.0001, + "loss": 7.1935, + "loss/crossentropy": 3.044888973236084, + "loss/hidden": 6.1125, + "loss/incoh": 0.0, + "loss/logits": 0.920581477880478, + "loss/reg": 0.0, + "step": 420 + }, + { + "epoch": 0.002828947368421053, + "grad_norm": 18.625, + "grad_norm_var": 5.847330729166667, + "learning_rate": 0.0001, + "loss": 7.0053, + "loss/crossentropy": 3.2355963468551634, + "loss/hidden": 6.015625, + "loss/incoh": 0.0, + "loss/logits": 0.9773828387260437, + "loss/reg": 0.0, + "step": 430 + }, + { + "epoch": 0.0028947368421052633, + "grad_norm": 21.875, + "grad_norm_var": 9.627067057291667, + "learning_rate": 0.0001, + "loss": 6.9973, + "loss/crossentropy": 3.3775979042053224, + "loss/hidden": 6.071875, + "loss/incoh": 0.0, + "loss/logits": 1.0445533573627472, + "loss/reg": 0.0, + "step": 440 + }, + { + "epoch": 0.0029605263157894738, + "grad_norm": 19.25, + "grad_norm_var": 10.170947265625, + "learning_rate": 0.0001, + "loss": 6.9686, + "loss/crossentropy": 3.0577521562576293, + "loss/hidden": 5.803125, + "loss/incoh": 0.0, + "loss/logits": 0.8946591019630432, + "loss/reg": 0.0, + "step": 450 + }, + { + "epoch": 0.0030263157894736843, + "grad_norm": 19.75, + "grad_norm_var": 5.329166666666667, + "learning_rate": 0.0001, + "loss": 6.8021, + "loss/crossentropy": 3.2570735692977903, + "loss/hidden": 5.678125, + "loss/incoh": 0.0, + "loss/logits": 0.860103166103363, + "loss/reg": 0.0, + "step": 460 + }, + { + "epoch": 0.0030921052631578947, + "grad_norm": 11.5, + "grad_norm_var": 5.882535807291666, + "learning_rate": 0.0001, + "loss": 6.6494, + "loss/crossentropy": 3.045071005821228, + "loss/hidden": 5.759375, + "loss/incoh": 0.0, + "loss/logits": 0.890488612651825, + "loss/reg": 0.0, + "step": 470 + }, + { + "epoch": 0.003157894736842105, + "grad_norm": 12.0, + "grad_norm_var": 3.778369140625, + "learning_rate": 0.0001, + "loss": 6.6399, + "loss/crossentropy": 2.955122375488281, + "loss/hidden": 5.559375, + "loss/incoh": 0.0, + "loss/logits": 0.7988932132720947, + "loss/reg": 0.0, + "step": 480 + }, + { + "epoch": 0.0032236842105263157, + "grad_norm": 14.625, + "grad_norm_var": 3.397509765625, + "learning_rate": 0.0001, + "loss": 6.6006, + "loss/crossentropy": 2.8290895342826845, + "loss/hidden": 5.73125, + "loss/incoh": 0.0, + "loss/logits": 0.8468542337417603, + "loss/reg": 0.0, + "step": 490 + }, + { + "epoch": 0.003289473684210526, + "grad_norm": 11.5, + "grad_norm_var": 4.100113932291666, + "learning_rate": 0.0001, + "loss": 6.4823, + "loss/crossentropy": 2.7418078184127808, + "loss/hidden": 5.665625, + "loss/incoh": 0.0, + "loss/logits": 0.7723756909370423, + "loss/reg": 0.0, + "step": 500 + }, + { + "epoch": 0.003355263157894737, + "grad_norm": 11.5625, + "grad_norm_var": 2.79375, + "learning_rate": 0.0001, + "loss": 6.4511, + "loss/crossentropy": 3.1031686782836916, + "loss/hidden": 5.721875, + "loss/incoh": 0.0, + "loss/logits": 0.9047020822763443, + "loss/reg": 0.0, + "step": 510 + }, + { + "epoch": 0.0034210526315789475, + "grad_norm": 10.625, + "grad_norm_var": 0.8618326822916667, + "learning_rate": 0.0001, + "loss": 6.3114, + "loss/crossentropy": 2.7031071186065674, + "loss/hidden": 5.3875, + "loss/incoh": 0.0, + "loss/logits": 0.7112044870853425, + "loss/reg": 0.0, + "step": 520 + }, + { + "epoch": 0.003486842105263158, + "grad_norm": 9.625, + "grad_norm_var": 94.71066080729166, + "learning_rate": 0.0001, + "loss": 6.2538, + "loss/crossentropy": 2.8632609844207764, + "loss/hidden": 5.784375, + "loss/incoh": 0.0, + "loss/logits": 0.8032145172357559, + "loss/reg": 0.0, + "step": 530 + }, + { + "epoch": 0.0035526315789473684, + "grad_norm": 12.0, + "grad_norm_var": 2.7383748372395833, + "learning_rate": 0.0001, + "loss": 6.1042, + "loss/crossentropy": 3.037100100517273, + "loss/hidden": 5.25625, + "loss/incoh": 0.0, + "loss/logits": 0.7572773277759552, + "loss/reg": 0.0, + "step": 540 + }, + { + "epoch": 0.003618421052631579, + "grad_norm": 10.9375, + "grad_norm_var": 2.5140584309895835, + "learning_rate": 0.0001, + "loss": 6.1353, + "loss/crossentropy": 2.979613184928894, + "loss/hidden": 5.196875, + "loss/incoh": 0.0, + "loss/logits": 0.7719000339508056, + "loss/reg": 0.0, + "step": 550 + }, + { + "epoch": 0.0036842105263157894, + "grad_norm": 8.0625, + "grad_norm_var": 1.6379557291666667, + "learning_rate": 0.0001, + "loss": 6.0637, + "loss/crossentropy": 2.687799036502838, + "loss/hidden": 5.51875, + "loss/incoh": 0.0, + "loss/logits": 0.9166353821754456, + "loss/reg": 0.0, + "step": 560 + }, + { + "epoch": 0.00375, + "grad_norm": 9.9375, + "grad_norm_var": 18.912353515625, + "learning_rate": 0.0001, + "loss": 6.0506, + "loss/crossentropy": 2.935390818119049, + "loss/hidden": 5.165625, + "loss/incoh": 0.0, + "loss/logits": 0.716228786110878, + "loss/reg": 0.0, + "step": 570 + }, + { + "epoch": 0.0038157894736842103, + "grad_norm": 7.25, + "grad_norm_var": 18.753641764322918, + "learning_rate": 0.0001, + "loss": 5.9637, + "loss/crossentropy": 2.7780107259750366, + "loss/hidden": 5.190625, + "loss/incoh": 0.0, + "loss/logits": 0.7067849993705749, + "loss/reg": 0.0, + "step": 580 + }, + { + "epoch": 0.0038815789473684212, + "grad_norm": 6.84375, + "grad_norm_var": 2.39361572265625, + "learning_rate": 0.0001, + "loss": 5.9361, + "loss/crossentropy": 3.0060938119888307, + "loss/hidden": 5.225, + "loss/incoh": 0.0, + "loss/logits": 0.7271955192089081, + "loss/reg": 0.0, + "step": 590 + }, + { + "epoch": 0.003947368421052632, + "grad_norm": 7.6875, + "grad_norm_var": 0.75357666015625, + "learning_rate": 0.0001, + "loss": 5.7669, + "loss/crossentropy": 2.691058301925659, + "loss/hidden": 4.825, + "loss/incoh": 0.0, + "loss/logits": 0.6389567136764527, + "loss/reg": 0.0, + "step": 600 + }, + { + "epoch": 0.004013157894736842, + "grad_norm": 6.78125, + "grad_norm_var": 2.53160400390625, + "learning_rate": 0.0001, + "loss": 5.7022, + "loss/crossentropy": 2.7504406690597536, + "loss/hidden": 5.121875, + "loss/incoh": 0.0, + "loss/logits": 0.8024603247642517, + "loss/reg": 0.0, + "step": 610 + }, + { + "epoch": 0.004078947368421053, + "grad_norm": 6.875, + "grad_norm_var": 2.7155558268229165, + "learning_rate": 0.0001, + "loss": 5.6145, + "loss/crossentropy": 2.9313748240470887, + "loss/hidden": 5.00625, + "loss/incoh": 0.0, + "loss/logits": 0.7257195949554444, + "loss/reg": 0.0, + "step": 620 + }, + { + "epoch": 0.0041447368421052636, + "grad_norm": 5.8125, + "grad_norm_var": 0.3809733072916667, + "learning_rate": 0.0001, + "loss": 5.6209, + "loss/crossentropy": 2.8686537384986877, + "loss/hidden": 5.05625, + "loss/incoh": 0.0, + "loss/logits": 0.7894174456596375, + "loss/reg": 0.0, + "step": 630 + }, + { + "epoch": 0.004210526315789474, + "grad_norm": 10.5, + "grad_norm_var": 1.7327962239583334, + "learning_rate": 0.0001, + "loss": 5.6566, + "loss/crossentropy": 2.910102880001068, + "loss/hidden": 4.95, + "loss/incoh": 0.0, + "loss/logits": 0.7542287766933441, + "loss/reg": 0.0, + "step": 640 + }, + { + "epoch": 0.0042763157894736845, + "grad_norm": 6.0, + "grad_norm_var": 1.90406494140625, + "learning_rate": 0.0001, + "loss": 5.6548, + "loss/crossentropy": 3.0497835516929626, + "loss/hidden": 5.075, + "loss/incoh": 0.0, + "loss/logits": 0.6385725855827331, + "loss/reg": 0.0, + "step": 650 + }, + { + "epoch": 0.0043421052631578945, + "grad_norm": 5.90625, + "grad_norm_var": 6.811181640625, + "learning_rate": 0.0001, + "loss": 5.4435, + "loss/crossentropy": 2.839945673942566, + "loss/hidden": 4.621875, + "loss/incoh": 0.0, + "loss/logits": 0.608047366142273, + "loss/reg": 0.0, + "step": 660 + }, + { + "epoch": 0.0044078947368421054, + "grad_norm": 8.5, + "grad_norm_var": 1.419775390625, + "learning_rate": 0.0001, + "loss": 5.4264, + "loss/crossentropy": 2.6664235949516297, + "loss/hidden": 4.71875, + "loss/incoh": 0.0, + "loss/logits": 0.5786069691181183, + "loss/reg": 0.0, + "step": 670 + }, + { + "epoch": 0.0044736842105263155, + "grad_norm": 5.25, + "grad_norm_var": 1.222509765625, + "learning_rate": 0.0001, + "loss": 5.4175, + "loss/crossentropy": 2.7476831912994384, + "loss/hidden": 4.646875, + "loss/incoh": 0.0, + "loss/logits": 0.6524303257465363, + "loss/reg": 0.0, + "step": 680 + }, + { + "epoch": 0.004539473684210526, + "grad_norm": 5.15625, + "grad_norm_var": 0.8695271809895834, + "learning_rate": 0.0001, + "loss": 5.2974, + "loss/crossentropy": 2.718129062652588, + "loss/hidden": 4.69375, + "loss/incoh": 0.0, + "loss/logits": 0.679440614581108, + "loss/reg": 0.0, + "step": 690 + }, + { + "epoch": 0.004605263157894736, + "grad_norm": 5.875, + "grad_norm_var": 1.3825358072916667, + "learning_rate": 0.0001, + "loss": 5.3809, + "loss/crossentropy": 2.896076512336731, + "loss/hidden": 4.503125, + "loss/incoh": 0.0, + "loss/logits": 0.6036079049110412, + "loss/reg": 0.0, + "step": 700 + }, + { + "epoch": 0.004671052631578947, + "grad_norm": 5.875, + "grad_norm_var": 0.94888916015625, + "learning_rate": 0.0001, + "loss": 5.2593, + "loss/crossentropy": 2.765268421173096, + "loss/hidden": 4.803125, + "loss/incoh": 0.0, + "loss/logits": 0.7337387800216675, + "loss/reg": 0.0, + "step": 710 + }, + { + "epoch": 0.004736842105263158, + "grad_norm": 4.75, + "grad_norm_var": 1.0287760416666667, + "learning_rate": 0.0001, + "loss": 5.1503, + "loss/crossentropy": 2.6812595248222353, + "loss/hidden": 4.75625, + "loss/incoh": 0.0, + "loss/logits": 0.6710720509290695, + "loss/reg": 0.0, + "step": 720 + }, + { + "epoch": 0.004802631578947368, + "grad_norm": 6.0625, + "grad_norm_var": 86.4677734375, + "learning_rate": 0.0001, + "loss": 5.3144, + "loss/crossentropy": 2.7298573732376097, + "loss/hidden": 4.484375, + "loss/incoh": 0.0, + "loss/logits": 0.6176893144845963, + "loss/reg": 0.0, + "step": 730 + }, + { + "epoch": 0.004868421052631579, + "grad_norm": 4.875, + "grad_norm_var": 85.56676025390625, + "learning_rate": 0.0001, + "loss": 5.131, + "loss/crossentropy": 2.823095703125, + "loss/hidden": 4.571875, + "loss/incoh": 0.0, + "loss/logits": 0.6797973781824111, + "loss/reg": 0.0, + "step": 740 + }, + { + "epoch": 0.004934210526315789, + "grad_norm": 4.96875, + "grad_norm_var": 4.8291015625, + "learning_rate": 0.0001, + "loss": 5.1654, + "loss/crossentropy": 2.901310992240906, + "loss/hidden": 4.61875, + "loss/incoh": 0.0, + "loss/logits": 0.737342044711113, + "loss/reg": 0.0, + "step": 750 + }, + { + "epoch": 0.005, + "grad_norm": 9.375, + "grad_norm_var": 3.796728515625, + "learning_rate": 0.0001, + "loss": 5.0448, + "loss/crossentropy": 2.5148804664611815, + "loss/hidden": 4.48125, + "loss/incoh": 0.0, + "loss/logits": 0.5650010257959366, + "loss/reg": 0.0, + "step": 760 + }, + { + "epoch": 0.00506578947368421, + "grad_norm": 4.90625, + "grad_norm_var": 18.020947265625, + "learning_rate": 0.0001, + "loss": 5.0271, + "loss/crossentropy": 2.6732282817363737, + "loss/hidden": 4.48125, + "loss/incoh": 0.0, + "loss/logits": 0.5763083070516586, + "loss/reg": 0.0, + "step": 770 + }, + { + "epoch": 0.005131578947368421, + "grad_norm": 4.71875, + "grad_norm_var": 17.617118326822915, + "learning_rate": 0.0001, + "loss": 5.0057, + "loss/crossentropy": 2.927682900428772, + "loss/hidden": 4.578125, + "loss/incoh": 0.0, + "loss/logits": 0.680091741681099, + "loss/reg": 0.0, + "step": 780 + }, + { + "epoch": 0.005197368421052632, + "grad_norm": 4.53125, + "grad_norm_var": 1.5018880208333334, + "learning_rate": 0.0001, + "loss": 5.0918, + "loss/crossentropy": 2.7974375009536745, + "loss/hidden": 4.75, + "loss/incoh": 0.0, + "loss/logits": 0.709694892168045, + "loss/reg": 0.0, + "step": 790 + }, + { + "epoch": 0.005263157894736842, + "grad_norm": 4.46875, + "grad_norm_var": 20.911812337239585, + "learning_rate": 0.0001, + "loss": 4.9367, + "loss/crossentropy": 2.875410461425781, + "loss/hidden": 4.578125, + "loss/incoh": 0.0, + "loss/logits": 0.8118050575256348, + "loss/reg": 0.0, + "step": 800 + }, + { + "epoch": 0.005328947368421053, + "grad_norm": 5.375, + "grad_norm_var": 2.1762980143229167, + "learning_rate": 0.0001, + "loss": 4.8483, + "loss/crossentropy": 2.665932035446167, + "loss/hidden": 4.3140625, + "loss/incoh": 0.0, + "loss/logits": 0.5815477341413497, + "loss/reg": 0.0, + "step": 810 + }, + { + "epoch": 0.005394736842105263, + "grad_norm": 4.96875, + "grad_norm_var": 0.6493235270182292, + "learning_rate": 0.0001, + "loss": 4.9314, + "loss/crossentropy": 2.5820897936820986, + "loss/hidden": 4.15, + "loss/incoh": 0.0, + "loss/logits": 0.4888360023498535, + "loss/reg": 0.0, + "step": 820 + }, + { + "epoch": 0.005460526315789474, + "grad_norm": 3.890625, + "grad_norm_var": 0.2476470947265625, + "learning_rate": 0.0001, + "loss": 4.7859, + "loss/crossentropy": 2.647566497325897, + "loss/hidden": 4.3484375, + "loss/incoh": 0.0, + "loss/logits": 0.5885868102312088, + "loss/reg": 0.0, + "step": 830 + }, + { + "epoch": 0.005526315789473684, + "grad_norm": 4.9375, + "grad_norm_var": 0.29011128743489584, + "learning_rate": 0.0001, + "loss": 4.7829, + "loss/crossentropy": 2.583667039871216, + "loss/hidden": 4.2828125, + "loss/incoh": 0.0, + "loss/logits": 0.4921345829963684, + "loss/reg": 0.0, + "step": 840 + }, + { + "epoch": 0.005592105263157895, + "grad_norm": 4.6875, + "grad_norm_var": 0.397900390625, + "learning_rate": 0.0001, + "loss": 4.8821, + "loss/crossentropy": 2.798052990436554, + "loss/hidden": 4.2625, + "loss/incoh": 0.0, + "loss/logits": 0.7617209196090698, + "loss/reg": 0.0, + "step": 850 + }, + { + "epoch": 0.005657894736842106, + "grad_norm": 4.21875, + "grad_norm_var": 0.29501546223958336, + "learning_rate": 0.0001, + "loss": 4.9069, + "loss/crossentropy": 2.8075502276420594, + "loss/hidden": 4.1328125, + "loss/incoh": 0.0, + "loss/logits": 0.5111032873392105, + "loss/reg": 0.0, + "step": 860 + }, + { + "epoch": 0.005723684210526316, + "grad_norm": 4.21875, + "grad_norm_var": 0.24931233723958332, + "learning_rate": 0.0001, + "loss": 4.8049, + "loss/crossentropy": 2.400660240650177, + "loss/hidden": 4.3859375, + "loss/incoh": 0.0, + "loss/logits": 0.6145752131938934, + "loss/reg": 0.0, + "step": 870 + }, + { + "epoch": 0.005789473684210527, + "grad_norm": 4.75, + "grad_norm_var": 1.2445271809895833, + "learning_rate": 0.0001, + "loss": 4.7333, + "loss/crossentropy": 2.631825530529022, + "loss/hidden": 4.3640625, + "loss/incoh": 0.0, + "loss/logits": 0.5737248510122299, + "loss/reg": 0.0, + "step": 880 + }, + { + "epoch": 0.005855263157894737, + "grad_norm": 4.90625, + "grad_norm_var": 1.9954742431640624, + "learning_rate": 0.0001, + "loss": 4.6882, + "loss/crossentropy": 2.9550926446914674, + "loss/hidden": 4.1046875, + "loss/incoh": 0.0, + "loss/logits": 0.5351347416639328, + "loss/reg": 0.0, + "step": 890 + }, + { + "epoch": 0.0059210526315789476, + "grad_norm": 4.15625, + "grad_norm_var": 0.8051991780598958, + "learning_rate": 0.0001, + "loss": 4.7267, + "loss/crossentropy": 2.322158467769623, + "loss/hidden": 4.2359375, + "loss/incoh": 0.0, + "loss/logits": 0.4901482403278351, + "loss/reg": 0.0, + "step": 900 + }, + { + "epoch": 0.005986842105263158, + "grad_norm": 4.59375, + "grad_norm_var": 0.5987589518229167, + "learning_rate": 0.0001, + "loss": 4.7552, + "loss/crossentropy": 2.163175904750824, + "loss/hidden": 4.475, + "loss/incoh": 0.0, + "loss/logits": 0.5075026541948319, + "loss/reg": 0.0, + "step": 910 + }, + { + "epoch": 0.0060526315789473685, + "grad_norm": 4.4375, + "grad_norm_var": 13.234305826822917, + "learning_rate": 0.0001, + "loss": 4.8996, + "loss/crossentropy": 2.5448178887367248, + "loss/hidden": 4.078125, + "loss/incoh": 0.0, + "loss/logits": 0.5253018319606781, + "loss/reg": 0.0, + "step": 920 + }, + { + "epoch": 0.0061184210526315785, + "grad_norm": 4.09375, + "grad_norm_var": 4.241829427083333, + "learning_rate": 0.0001, + "loss": 4.5704, + "loss/crossentropy": 2.551811099052429, + "loss/hidden": 4.0546875, + "loss/incoh": 0.0, + "loss/logits": 0.48250589668750765, + "loss/reg": 0.0, + "step": 930 + }, + { + "epoch": 0.0061842105263157894, + "grad_norm": 4.71875, + "grad_norm_var": 0.13075764973958334, + "learning_rate": 0.0001, + "loss": 4.6287, + "loss/crossentropy": 2.8518677711486817, + "loss/hidden": 4.134375, + "loss/incoh": 0.0, + "loss/logits": 0.5851545244455337, + "loss/reg": 0.0, + "step": 940 + }, + { + "epoch": 0.00625, + "grad_norm": 5.65625, + "grad_norm_var": 1.0635701497395833, + "learning_rate": 0.0001, + "loss": 4.6855, + "loss/crossentropy": 2.8685452222824095, + "loss/hidden": 3.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.5299171417951584, + "loss/reg": 0.0, + "step": 950 + }, + { + "epoch": 0.00631578947368421, + "grad_norm": 4.4375, + "grad_norm_var": 0.2560506184895833, + "learning_rate": 0.0001, + "loss": 4.6193, + "loss/crossentropy": 2.73275808095932, + "loss/hidden": 4.121875, + "loss/incoh": 0.0, + "loss/logits": 0.4992497324943542, + "loss/reg": 0.0, + "step": 960 + }, + { + "epoch": 0.006381578947368421, + "grad_norm": 4.0625, + "grad_norm_var": 0.3540598551432292, + "learning_rate": 0.0001, + "loss": 4.6436, + "loss/crossentropy": 2.6122008085250856, + "loss/hidden": 4.084375, + "loss/incoh": 0.0, + "loss/logits": 0.5424144893884659, + "loss/reg": 0.0, + "step": 970 + }, + { + "epoch": 0.006447368421052631, + "grad_norm": 4.0, + "grad_norm_var": 1.2561260533134024e+17, + "learning_rate": 0.0001, + "loss": 4.7384, + "loss/crossentropy": 2.747082471847534, + "loss/hidden": 3.975, + "loss/incoh": 0.0, + "loss/logits": 0.4949415147304535, + "loss/reg": 0.0, + "step": 980 + }, + { + "epoch": 0.006513157894736842, + "grad_norm": 4.6875, + "grad_norm_var": 15.184098307291666, + "learning_rate": 0.0001, + "loss": 4.6154, + "loss/crossentropy": 2.6343679666519164, + "loss/hidden": 4.009375, + "loss/incoh": 0.0, + "loss/logits": 0.46223918795585633, + "loss/reg": 0.0, + "step": 990 + }, + { + "epoch": 0.006578947368421052, + "grad_norm": 4.84375, + "grad_norm_var": 3.58121337890625, + "learning_rate": 0.0001, + "loss": 4.6456, + "loss/crossentropy": 2.618730914592743, + "loss/hidden": 4.1390625, + "loss/incoh": 0.0, + "loss/logits": 0.568024319410324, + "loss/reg": 0.0, + "step": 1000 + }, + { + "epoch": 0.006644736842105263, + "grad_norm": 10.1875, + "grad_norm_var": 2.6229563395182294, + "learning_rate": 0.0001, + "loss": 4.5241, + "loss/crossentropy": 2.7510436296463014, + "loss/hidden": 3.9421875, + "loss/incoh": 0.0, + "loss/logits": 0.4975563734769821, + "loss/reg": 0.0, + "step": 1010 + }, + { + "epoch": 0.006710526315789474, + "grad_norm": 4.59375, + "grad_norm_var": 9.821955362955729, + "learning_rate": 0.0001, + "loss": 4.5805, + "loss/crossentropy": 2.773310422897339, + "loss/hidden": 3.94375, + "loss/incoh": 0.0, + "loss/logits": 0.5206439226865769, + "loss/reg": 0.0, + "step": 1020 + }, + { + "epoch": 0.006776315789473684, + "grad_norm": 4.34375, + "grad_norm_var": 8.482259114583334, + "learning_rate": 0.0001, + "loss": 4.4588, + "loss/crossentropy": 2.6046599745750427, + "loss/hidden": 3.88125, + "loss/incoh": 0.0, + "loss/logits": 0.44563083052635194, + "loss/reg": 0.0, + "step": 1030 + }, + { + "epoch": 0.006842105263157895, + "grad_norm": 4.0, + "grad_norm_var": 5.523110961914062, + "learning_rate": 0.0001, + "loss": 4.6377, + "loss/crossentropy": 2.6867428183555604, + "loss/hidden": 4.0921875, + "loss/incoh": 0.0, + "loss/logits": 0.4769616901874542, + "loss/reg": 0.0, + "step": 1040 + }, + { + "epoch": 0.006907894736842105, + "grad_norm": 3.890625, + "grad_norm_var": 9.615762329101562, + "learning_rate": 0.0001, + "loss": 4.5576, + "loss/crossentropy": 2.6374024391174316, + "loss/hidden": 3.915625, + "loss/incoh": 0.0, + "loss/logits": 0.5235758543014526, + "loss/reg": 0.0, + "step": 1050 + }, + { + "epoch": 0.006973684210526316, + "grad_norm": 3.796875, + "grad_norm_var": 4.420035807291667, + "learning_rate": 0.0001, + "loss": 4.5061, + "loss/crossentropy": 2.409494662284851, + "loss/hidden": 4.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.5403494209051132, + "loss/reg": 0.0, + "step": 1060 + }, + { + "epoch": 0.007039473684210526, + "grad_norm": 4.25, + "grad_norm_var": 12.098844401041667, + "learning_rate": 0.0001, + "loss": 4.7608, + "loss/crossentropy": 2.6496052145957947, + "loss/hidden": 3.878125, + "loss/incoh": 0.0, + "loss/logits": 0.48798912912607195, + "loss/reg": 0.0, + "step": 1070 + }, + { + "epoch": 0.007105263157894737, + "grad_norm": 3.875, + "grad_norm_var": 9.539867146809895, + "learning_rate": 0.0001, + "loss": 4.3815, + "loss/crossentropy": 2.757003378868103, + "loss/hidden": 4.11875, + "loss/incoh": 0.0, + "loss/logits": 0.4590116262435913, + "loss/reg": 0.0, + "step": 1080 + }, + { + "epoch": 0.007171052631578947, + "grad_norm": 8.5625, + "grad_norm_var": 1.5923166910807292, + "learning_rate": 0.0001, + "loss": 4.44, + "loss/crossentropy": 2.550069880485535, + "loss/hidden": 4.071875, + "loss/incoh": 0.0, + "loss/logits": 0.5490242928266525, + "loss/reg": 0.0, + "step": 1090 + }, + { + "epoch": 0.007236842105263158, + "grad_norm": 3.96875, + "grad_norm_var": 1.6922597249348958, + "learning_rate": 0.0001, + "loss": 4.4725, + "loss/crossentropy": 2.606226110458374, + "loss/hidden": 4.0875, + "loss/incoh": 0.0, + "loss/logits": 0.5686961978673934, + "loss/reg": 0.0, + "step": 1100 + }, + { + "epoch": 0.007302631578947369, + "grad_norm": 4.90625, + "grad_norm_var": 0.9616282145182292, + "learning_rate": 0.0001, + "loss": 4.4416, + "loss/crossentropy": 2.742388653755188, + "loss/hidden": 3.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.491252401471138, + "loss/reg": 0.0, + "step": 1110 + }, + { + "epoch": 0.007368421052631579, + "grad_norm": 4.375, + "grad_norm_var": 0.8876302083333333, + "learning_rate": 0.0001, + "loss": 4.4119, + "loss/crossentropy": 2.858240842819214, + "loss/hidden": 3.796875, + "loss/incoh": 0.0, + "loss/logits": 0.4695854902267456, + "loss/reg": 0.0, + "step": 1120 + }, + { + "epoch": 0.00743421052631579, + "grad_norm": 3.453125, + "grad_norm_var": 0.8083485921223958, + "learning_rate": 0.0001, + "loss": 4.3524, + "loss/crossentropy": 2.7758461236953735, + "loss/hidden": 3.90625, + "loss/incoh": 0.0, + "loss/logits": 0.5061279594898224, + "loss/reg": 0.0, + "step": 1130 + }, + { + "epoch": 0.0075, + "grad_norm": 3.78125, + "grad_norm_var": 0.8257802327473959, + "learning_rate": 0.0001, + "loss": 4.296, + "loss/crossentropy": 2.849539041519165, + "loss/hidden": 3.7125, + "loss/incoh": 0.0, + "loss/logits": 0.45034482181072233, + "loss/reg": 0.0, + "step": 1140 + }, + { + "epoch": 0.007565789473684211, + "grad_norm": 4.0625, + "grad_norm_var": 0.22603759765625, + "learning_rate": 0.0001, + "loss": 4.259, + "loss/crossentropy": 2.6673625230789186, + "loss/hidden": 3.903125, + "loss/incoh": 0.0, + "loss/logits": 0.45979970395565034, + "loss/reg": 0.0, + "step": 1150 + }, + { + "epoch": 0.007631578947368421, + "grad_norm": 3.984375, + "grad_norm_var": 0.2490631103515625, + "learning_rate": 0.0001, + "loss": 4.2786, + "loss/crossentropy": 2.6413369178771973, + "loss/hidden": 3.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.45444311797618864, + "loss/reg": 0.0, + "step": 1160 + }, + { + "epoch": 0.007697368421052632, + "grad_norm": 6.1875, + "grad_norm_var": 0.7844960530598958, + "learning_rate": 0.0001, + "loss": 4.2937, + "loss/crossentropy": 2.54203085899353, + "loss/hidden": 3.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.40531369894742963, + "loss/reg": 0.0, + "step": 1170 + }, + { + "epoch": 0.0077631578947368425, + "grad_norm": 3.65625, + "grad_norm_var": 0.444384765625, + "learning_rate": 0.0001, + "loss": 4.3125, + "loss/crossentropy": 2.772641682624817, + "loss/hidden": 3.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.4890771210193634, + "loss/reg": 0.0, + "step": 1180 + }, + { + "epoch": 0.007828947368421053, + "grad_norm": 4.15625, + "grad_norm_var": 0.26546122233072916, + "learning_rate": 0.0001, + "loss": 4.3654, + "loss/crossentropy": 2.661901044845581, + "loss/hidden": 3.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.439369834959507, + "loss/reg": 0.0, + "step": 1190 + }, + { + "epoch": 0.007894736842105263, + "grad_norm": 3.75, + "grad_norm_var": 0.25321858723958335, + "learning_rate": 0.0001, + "loss": 4.2648, + "loss/crossentropy": 2.3847479939460756, + "loss/hidden": 4.028125, + "loss/incoh": 0.0, + "loss/logits": 0.4770447015762329, + "loss/reg": 0.0, + "step": 1200 + }, + { + "epoch": 0.007960526315789473, + "grad_norm": 3.921875, + "grad_norm_var": 0.2809529622395833, + "learning_rate": 0.0001, + "loss": 4.3473, + "loss/crossentropy": 2.244320285320282, + "loss/hidden": 3.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.40470985919237135, + "loss/reg": 0.0, + "step": 1210 + }, + { + "epoch": 0.008026315789473683, + "grad_norm": 4.28125, + "grad_norm_var": 0.5173004150390625, + "learning_rate": 0.0001, + "loss": 4.3494, + "loss/crossentropy": 2.5515334010124207, + "loss/hidden": 4.109375, + "loss/incoh": 0.0, + "loss/logits": 0.5172385692596435, + "loss/reg": 0.0, + "step": 1220 + }, + { + "epoch": 0.008092105263157895, + "grad_norm": 3.5, + "grad_norm_var": 0.2546539306640625, + "learning_rate": 0.0001, + "loss": 4.2293, + "loss/crossentropy": 2.5860470652580263, + "loss/hidden": 3.7171875, + "loss/incoh": 0.0, + "loss/logits": 0.4412991553544998, + "loss/reg": 0.0, + "step": 1230 + }, + { + "epoch": 0.008157894736842105, + "grad_norm": 3.1875, + "grad_norm_var": 1.7592185950961664e+17, + "learning_rate": 0.0001, + "loss": 4.4752, + "loss/crossentropy": 2.823768949508667, + "loss/hidden": 3.703125, + "loss/incoh": 0.0, + "loss/logits": 0.5228973954916001, + "loss/reg": 0.0, + "step": 1240 + }, + { + "epoch": 0.008223684210526315, + "grad_norm": 3.21875, + "grad_norm_var": 1.966552734375, + "learning_rate": 0.0001, + "loss": 4.2687, + "loss/crossentropy": 2.523781180381775, + "loss/hidden": 3.825, + "loss/incoh": 0.0, + "loss/logits": 0.4939111739397049, + "loss/reg": 0.0, + "step": 1250 + }, + { + "epoch": 0.008289473684210527, + "grad_norm": 2.796875, + "grad_norm_var": 1.2621897379557292, + "learning_rate": 0.0001, + "loss": 4.1499, + "loss/crossentropy": 2.5173804640769957, + "loss/hidden": 3.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.44251940250396726, + "loss/reg": 0.0, + "step": 1260 + }, + { + "epoch": 0.008355263157894737, + "grad_norm": 4.15625, + "grad_norm_var": 3.665185546875, + "learning_rate": 0.0001, + "loss": 4.3378, + "loss/crossentropy": 2.551619827747345, + "loss/hidden": 3.9625, + "loss/incoh": 0.0, + "loss/logits": 0.5328098922967911, + "loss/reg": 0.0, + "step": 1270 + }, + { + "epoch": 0.008421052631578947, + "grad_norm": 3.53125, + "grad_norm_var": 3.1853352864583333, + "learning_rate": 0.0001, + "loss": 4.2329, + "loss/crossentropy": 2.3984143674373626, + "loss/hidden": 3.9703125, + "loss/incoh": 0.0, + "loss/logits": 0.43639505505561826, + "loss/reg": 0.0, + "step": 1280 + }, + { + "epoch": 0.008486842105263157, + "grad_norm": 4.21875, + "grad_norm_var": 0.21245015462239583, + "learning_rate": 0.0001, + "loss": 4.3604, + "loss/crossentropy": 2.8736027479171753, + "loss/hidden": 3.7875, + "loss/incoh": 0.0, + "loss/logits": 0.48700871765613557, + "loss/reg": 0.0, + "step": 1290 + }, + { + "epoch": 0.008552631578947369, + "grad_norm": 4.125, + "grad_norm_var": 0.24678446451822916, + "learning_rate": 0.0001, + "loss": 4.1666, + "loss/crossentropy": 2.714110541343689, + "loss/hidden": 3.75625, + "loss/incoh": 0.0, + "loss/logits": 0.47638387978076935, + "loss/reg": 0.0, + "step": 1300 + }, + { + "epoch": 0.008618421052631579, + "grad_norm": 5.03125, + "grad_norm_var": 0.36387430826822914, + "learning_rate": 0.0001, + "loss": 4.2879, + "loss/crossentropy": 2.6876192927360534, + "loss/hidden": 3.7, + "loss/incoh": 0.0, + "loss/logits": 0.49314437210559847, + "loss/reg": 0.0, + "step": 1310 + }, + { + "epoch": 0.008684210526315789, + "grad_norm": 3.84375, + "grad_norm_var": 0.28189188639322915, + "learning_rate": 0.0001, + "loss": 4.1505, + "loss/crossentropy": 2.4753618359565737, + "loss/hidden": 3.684375, + "loss/incoh": 0.0, + "loss/logits": 0.40235219299793246, + "loss/reg": 0.0, + "step": 1320 + }, + { + "epoch": 0.00875, + "grad_norm": 3.1875, + "grad_norm_var": 2.41715087890625, + "learning_rate": 0.0001, + "loss": 4.2095, + "loss/crossentropy": 2.515524423122406, + "loss/hidden": 3.996875, + "loss/incoh": 0.0, + "loss/logits": 0.5540182292461395, + "loss/reg": 0.0, + "step": 1330 + }, + { + "epoch": 0.008815789473684211, + "grad_norm": 3.8125, + "grad_norm_var": 2.259993489583333, + "learning_rate": 0.0001, + "loss": 4.1889, + "loss/crossentropy": 2.346592426300049, + "loss/hidden": 3.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.4610589429736137, + "loss/reg": 0.0, + "step": 1340 + }, + { + "epoch": 0.008881578947368421, + "grad_norm": 4.0625, + "grad_norm_var": 0.13492431640625, + "learning_rate": 0.0001, + "loss": 4.1587, + "loss/crossentropy": 2.600476896762848, + "loss/hidden": 3.746875, + "loss/incoh": 0.0, + "loss/logits": 0.4497509777545929, + "loss/reg": 0.0, + "step": 1350 + }, + { + "epoch": 0.008947368421052631, + "grad_norm": 3.390625, + "grad_norm_var": 0.7162017822265625, + "learning_rate": 0.0001, + "loss": 4.0563, + "loss/crossentropy": 2.6253500103950502, + "loss/hidden": 3.5359375, + "loss/incoh": 0.0, + "loss/logits": 0.4007237285375595, + "loss/reg": 0.0, + "step": 1360 + }, + { + "epoch": 0.009013157894736843, + "grad_norm": 3.828125, + "grad_norm_var": 0.7642812093098958, + "learning_rate": 0.0001, + "loss": 4.1053, + "loss/crossentropy": 2.4306472063064577, + "loss/hidden": 3.596875, + "loss/incoh": 0.0, + "loss/logits": 0.42645111978054046, + "loss/reg": 0.0, + "step": 1370 + }, + { + "epoch": 0.009078947368421053, + "grad_norm": 5.65625, + "grad_norm_var": 0.3096995035807292, + "learning_rate": 0.0001, + "loss": 4.2093, + "loss/crossentropy": 2.4861367106437684, + "loss/hidden": 3.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.46326183080673217, + "loss/reg": 0.0, + "step": 1380 + }, + { + "epoch": 0.009144736842105263, + "grad_norm": 3.265625, + "grad_norm_var": 0.36946207682291665, + "learning_rate": 0.0001, + "loss": 4.0653, + "loss/crossentropy": 2.608224070072174, + "loss/hidden": 3.5140625, + "loss/incoh": 0.0, + "loss/logits": 0.3894644558429718, + "loss/reg": 0.0, + "step": 1390 + }, + { + "epoch": 0.009210526315789473, + "grad_norm": 3.21875, + "grad_norm_var": 0.30060221354166666, + "learning_rate": 0.0001, + "loss": 4.1547, + "loss/crossentropy": 2.337048816680908, + "loss/hidden": 3.6046875, + "loss/incoh": 0.0, + "loss/logits": 0.39037723541259767, + "loss/reg": 0.0, + "step": 1400 + }, + { + "epoch": 0.009276315789473685, + "grad_norm": 3.625, + "grad_norm_var": 0.14612630208333333, + "learning_rate": 0.0001, + "loss": 4.076, + "loss/crossentropy": 2.726799726486206, + "loss/hidden": 3.5515625, + "loss/incoh": 0.0, + "loss/logits": 0.4537044405937195, + "loss/reg": 0.0, + "step": 1410 + }, + { + "epoch": 0.009342105263157895, + "grad_norm": 3.875, + "grad_norm_var": 1.9072662353515626, + "learning_rate": 0.0001, + "loss": 4.2387, + "loss/crossentropy": 2.5365243434906004, + "loss/hidden": 3.503125, + "loss/incoh": 0.0, + "loss/logits": 0.37743023335933684, + "loss/reg": 0.0, + "step": 1420 + }, + { + "epoch": 0.009407894736842105, + "grad_norm": 4.1875, + "grad_norm_var": 0.15405171712239582, + "learning_rate": 0.0001, + "loss": 4.0913, + "loss/crossentropy": 2.6032602190971375, + "loss/hidden": 3.6015625, + "loss/incoh": 0.0, + "loss/logits": 0.4031028777360916, + "loss/reg": 0.0, + "step": 1430 + }, + { + "epoch": 0.009473684210526316, + "grad_norm": 3.453125, + "grad_norm_var": 0.35299072265625, + "learning_rate": 0.0001, + "loss": 4.0676, + "loss/crossentropy": 2.279827582836151, + "loss/hidden": 3.671875, + "loss/incoh": 0.0, + "loss/logits": 0.38540517538785934, + "loss/reg": 0.0, + "step": 1440 + }, + { + "epoch": 0.009539473684210526, + "grad_norm": 3.703125, + "grad_norm_var": 0.44612223307291665, + "learning_rate": 0.0001, + "loss": 4.0791, + "loss/crossentropy": 2.4795989274978636, + "loss/hidden": 3.7, + "loss/incoh": 0.0, + "loss/logits": 0.40130155086517333, + "loss/reg": 0.0, + "step": 1450 + }, + { + "epoch": 0.009605263157894737, + "grad_norm": 3.515625, + "grad_norm_var": 0.43585611979166666, + "learning_rate": 0.0001, + "loss": 4.0499, + "loss/crossentropy": 2.337530755996704, + "loss/hidden": 3.4703125, + "loss/incoh": 0.0, + "loss/logits": 0.3699365258216858, + "loss/reg": 0.0, + "step": 1460 + }, + { + "epoch": 0.009671052631578947, + "grad_norm": 4.375, + "grad_norm_var": 2.1455393473307294, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 2.246569663286209, + "loss/hidden": 3.54375, + "loss/incoh": 0.0, + "loss/logits": 0.37747917622327803, + "loss/reg": 0.0, + "step": 1470 + }, + { + "epoch": 0.009736842105263158, + "grad_norm": 4.34375, + "grad_norm_var": 1.3469017374654464e+17, + "learning_rate": 0.0001, + "loss": 4.1919, + "loss/crossentropy": 2.465463387966156, + "loss/hidden": 3.5796875, + "loss/incoh": 0.0, + "loss/logits": 0.409694692492485, + "loss/reg": 0.0, + "step": 1480 + }, + { + "epoch": 0.009802631578947368, + "grad_norm": 3.8125, + "grad_norm_var": 2.647081560180774e+17, + "learning_rate": 0.0001, + "loss": 4.2391, + "loss/crossentropy": 2.614275646209717, + "loss/hidden": 3.86875, + "loss/incoh": 0.0, + "loss/logits": 0.4961456567049026, + "loss/reg": 0.0, + "step": 1490 + }, + { + "epoch": 0.009868421052631578, + "grad_norm": 3.59375, + "grad_norm_var": 3.232743326822917, + "learning_rate": 0.0001, + "loss": 4.2432, + "loss/crossentropy": 2.9305615186691285, + "loss/hidden": 3.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.6574043720960617, + "loss/reg": 0.0, + "step": 1500 + }, + { + "epoch": 0.00993421052631579, + "grad_norm": 3.34375, + "grad_norm_var": 3.299430338541667, + "learning_rate": 0.0001, + "loss": 4.066, + "loss/crossentropy": 2.3778577923774717, + "loss/hidden": 3.6515625, + "loss/incoh": 0.0, + "loss/logits": 0.3975479930639267, + "loss/reg": 0.0, + "step": 1510 + }, + { + "epoch": 0.01, + "grad_norm": 3.515625, + "grad_norm_var": 18.948729451497396, + "learning_rate": 0.0001, + "loss": 4.1724, + "loss/crossentropy": 2.527243709564209, + "loss/hidden": 3.596875, + "loss/incoh": 0.0, + "loss/logits": 0.41472980976104734, + "loss/reg": 0.0, + "step": 1520 + }, + { + "epoch": 0.01006578947368421, + "grad_norm": 3.03125, + "grad_norm_var": 18.10924072265625, + "learning_rate": 0.0001, + "loss": 4.0302, + "loss/crossentropy": 2.7474317073822023, + "loss/hidden": 3.690625, + "loss/incoh": 0.0, + "loss/logits": 0.47478381991386415, + "loss/reg": 0.0, + "step": 1530 + }, + { + "epoch": 0.01013157894736842, + "grad_norm": 4.4375, + "grad_norm_var": 0.45779520670572915, + "learning_rate": 0.0001, + "loss": 4.0726, + "loss/crossentropy": 2.4822750091552734, + "loss/hidden": 3.48125, + "loss/incoh": 0.0, + "loss/logits": 0.39128718376159666, + "loss/reg": 0.0, + "step": 1540 + }, + { + "epoch": 0.010197368421052632, + "grad_norm": 3.5625, + "grad_norm_var": 0.21116129557291666, + "learning_rate": 0.0001, + "loss": 4.1353, + "loss/crossentropy": 2.480714201927185, + "loss/hidden": 3.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.4798941880464554, + "loss/reg": 0.0, + "step": 1550 + }, + { + "epoch": 0.010263157894736842, + "grad_norm": 3.78125, + "grad_norm_var": 0.21678059895833332, + "learning_rate": 0.0001, + "loss": 3.993, + "loss/crossentropy": 2.536018407344818, + "loss/hidden": 3.53125, + "loss/incoh": 0.0, + "loss/logits": 0.3731235474348068, + "loss/reg": 0.0, + "step": 1560 + }, + { + "epoch": 0.010328947368421052, + "grad_norm": 3.953125, + "grad_norm_var": 0.17219645182291668, + "learning_rate": 0.0001, + "loss": 4.0545, + "loss/crossentropy": 2.4370301008224486, + "loss/hidden": 3.5640625, + "loss/incoh": 0.0, + "loss/logits": 0.4435511589050293, + "loss/reg": 0.0, + "step": 1570 + }, + { + "epoch": 0.010394736842105264, + "grad_norm": 3.140625, + "grad_norm_var": 0.5761027018229167, + "learning_rate": 0.0001, + "loss": 4.0907, + "loss/crossentropy": 2.576425087451935, + "loss/hidden": 3.6484375, + "loss/incoh": 0.0, + "loss/logits": 0.43103125393390657, + "loss/reg": 0.0, + "step": 1580 + }, + { + "epoch": 0.010460526315789474, + "grad_norm": 3.34375, + "grad_norm_var": 0.15403544108072917, + "learning_rate": 0.0001, + "loss": 3.8713, + "loss/crossentropy": 2.2120620369911195, + "loss/hidden": 3.796875, + "loss/incoh": 0.0, + "loss/logits": 0.43894679844379425, + "loss/reg": 0.0, + "step": 1590 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 2.890625, + "grad_norm_var": 0.09127197265625, + "learning_rate": 0.0001, + "loss": 3.9719, + "loss/crossentropy": 2.5636333346366884, + "loss/hidden": 3.575, + "loss/incoh": 0.0, + "loss/logits": 0.4157550185918808, + "loss/reg": 0.0, + "step": 1600 + }, + { + "epoch": 0.010592105263157894, + "grad_norm": 3.5, + "grad_norm_var": 2.413444010416667, + "learning_rate": 0.0001, + "loss": 4.153, + "loss/crossentropy": 2.6213369131088258, + "loss/hidden": 3.6171875, + "loss/incoh": 0.0, + "loss/logits": 0.45062357783317564, + "loss/reg": 0.0, + "step": 1610 + }, + { + "epoch": 0.010657894736842106, + "grad_norm": 9.5, + "grad_norm_var": 4.2955881754557295, + "learning_rate": 0.0001, + "loss": 4.0295, + "loss/crossentropy": 2.626167094707489, + "loss/hidden": 3.5765625, + "loss/incoh": 0.0, + "loss/logits": 0.44043630063533784, + "loss/reg": 0.0, + "step": 1620 + }, + { + "epoch": 0.010723684210526316, + "grad_norm": 3.6875, + "grad_norm_var": 2.3217437744140623, + "learning_rate": 0.0001, + "loss": 3.937, + "loss/crossentropy": 2.6269264578819276, + "loss/hidden": 3.45625, + "loss/incoh": 0.0, + "loss/logits": 0.48427494168281554, + "loss/reg": 0.0, + "step": 1630 + }, + { + "epoch": 0.010789473684210526, + "grad_norm": 3.078125, + "grad_norm_var": 0.12195638020833334, + "learning_rate": 0.0001, + "loss": 3.9365, + "loss/crossentropy": 2.6117714166641237, + "loss/hidden": 3.4359375, + "loss/incoh": 0.0, + "loss/logits": 0.4069202274084091, + "loss/reg": 0.0, + "step": 1640 + }, + { + "epoch": 0.010855263157894738, + "grad_norm": 3.3125, + "grad_norm_var": 0.25017903645833334, + "learning_rate": 0.0001, + "loss": 3.9318, + "loss/crossentropy": 2.6508745312690736, + "loss/hidden": 3.53125, + "loss/incoh": 0.0, + "loss/logits": 0.4166009187698364, + "loss/reg": 0.0, + "step": 1650 + }, + { + "epoch": 0.010921052631578948, + "grad_norm": 4.21875, + "grad_norm_var": 0.1591217041015625, + "learning_rate": 0.0001, + "loss": 3.8964, + "loss/crossentropy": 2.4683817744255068, + "loss/hidden": 3.4421875, + "loss/incoh": 0.0, + "loss/logits": 0.37454236298799515, + "loss/reg": 0.0, + "step": 1660 + }, + { + "epoch": 0.010986842105263158, + "grad_norm": 3.28125, + "grad_norm_var": 0.12337239583333333, + "learning_rate": 0.0001, + "loss": 4.0102, + "loss/crossentropy": 2.4436564683914184, + "loss/hidden": 3.3390625, + "loss/incoh": 0.0, + "loss/logits": 0.36218023002147676, + "loss/reg": 0.0, + "step": 1670 + }, + { + "epoch": 0.011052631578947368, + "grad_norm": 3.15625, + "grad_norm_var": 0.08958231608072917, + "learning_rate": 0.0001, + "loss": 3.9784, + "loss/crossentropy": 2.559529435634613, + "loss/hidden": 3.703125, + "loss/incoh": 0.0, + "loss/logits": 0.46595812439918516, + "loss/reg": 0.0, + "step": 1680 + }, + { + "epoch": 0.01111842105263158, + "grad_norm": 2.953125, + "grad_norm_var": 0.11448160807291667, + "learning_rate": 0.0001, + "loss": 3.9419, + "loss/crossentropy": 2.4433623433113096, + "loss/hidden": 3.453125, + "loss/incoh": 0.0, + "loss/logits": 0.41454428136348725, + "loss/reg": 0.0, + "step": 1690 + }, + { + "epoch": 0.01118421052631579, + "grad_norm": 4.0625, + "grad_norm_var": 0.11336161295572916, + "learning_rate": 0.0001, + "loss": 3.8878, + "loss/crossentropy": 2.561389684677124, + "loss/hidden": 3.4640625, + "loss/incoh": 0.0, + "loss/logits": 0.3884895950555801, + "loss/reg": 0.0, + "step": 1700 + }, + { + "epoch": 0.01125, + "grad_norm": 3.296875, + "grad_norm_var": 2.4417928059895835, + "learning_rate": 0.0001, + "loss": 3.9996, + "loss/crossentropy": 2.709191393852234, + "loss/hidden": 3.5515625, + "loss/incoh": 0.0, + "loss/logits": 0.3691831022500992, + "loss/reg": 0.0, + "step": 1710 + }, + { + "epoch": 0.011315789473684211, + "grad_norm": 3.3125, + "grad_norm_var": 0.5698232014973958, + "learning_rate": 0.0001, + "loss": 4.0535, + "loss/crossentropy": 2.4276717066764832, + "loss/hidden": 3.7375, + "loss/incoh": 0.0, + "loss/logits": 0.4981458902359009, + "loss/reg": 0.0, + "step": 1720 + }, + { + "epoch": 0.011381578947368421, + "grad_norm": 3.71875, + "grad_norm_var": 0.4803059895833333, + "learning_rate": 0.0001, + "loss": 4.0546, + "loss/crossentropy": 2.672085237503052, + "loss/hidden": 3.465625, + "loss/incoh": 0.0, + "loss/logits": 0.41347330510616304, + "loss/reg": 0.0, + "step": 1730 + }, + { + "epoch": 0.011447368421052631, + "grad_norm": 6.59375, + "grad_norm_var": 0.7734700520833333, + "learning_rate": 0.0001, + "loss": 4.0697, + "loss/crossentropy": 2.625990152359009, + "loss/hidden": 3.60625, + "loss/incoh": 0.0, + "loss/logits": 0.45684492886066436, + "loss/reg": 0.0, + "step": 1740 + }, + { + "epoch": 0.011513157894736841, + "grad_norm": 2.8125, + "grad_norm_var": 2.146930948893229, + "learning_rate": 0.0001, + "loss": 4.1129, + "loss/crossentropy": 2.682978630065918, + "loss/hidden": 3.5703125, + "loss/incoh": 0.0, + "loss/logits": 0.4306509166955948, + "loss/reg": 0.0, + "step": 1750 + }, + { + "epoch": 0.011578947368421053, + "grad_norm": 3.203125, + "grad_norm_var": 0.1972076416015625, + "learning_rate": 0.0001, + "loss": 3.8797, + "loss/crossentropy": 2.7138221740722654, + "loss/hidden": 3.60625, + "loss/incoh": 0.0, + "loss/logits": 0.46029032766819, + "loss/reg": 0.0, + "step": 1760 + }, + { + "epoch": 0.011644736842105263, + "grad_norm": 3.03125, + "grad_norm_var": 15.614777628580729, + "learning_rate": 0.0001, + "loss": 4.0409, + "loss/crossentropy": 2.1841426372528074, + "loss/hidden": 3.3828125, + "loss/incoh": 0.0, + "loss/logits": 0.35761781185865404, + "loss/reg": 0.0, + "step": 1770 + }, + { + "epoch": 0.011710526315789473, + "grad_norm": 3.953125, + "grad_norm_var": 0.32083231608072915, + "learning_rate": 0.0001, + "loss": 4.0523, + "loss/crossentropy": 2.404168051481247, + "loss/hidden": 3.325, + "loss/incoh": 0.0, + "loss/logits": 0.34855909645557404, + "loss/reg": 0.0, + "step": 1780 + }, + { + "epoch": 0.011776315789473683, + "grad_norm": 3.5, + "grad_norm_var": 3.124772135416667, + "learning_rate": 0.0001, + "loss": 3.9895, + "loss/crossentropy": 2.370392310619354, + "loss/hidden": 3.6515625, + "loss/incoh": 0.0, + "loss/logits": 0.4228974744677544, + "loss/reg": 0.0, + "step": 1790 + }, + { + "epoch": 0.011842105263157895, + "grad_norm": 6.125, + "grad_norm_var": 3.445572916666667, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 2.6190654158592226, + "loss/hidden": 3.4578125, + "loss/incoh": 0.0, + "loss/logits": 0.406630203127861, + "loss/reg": 0.0, + "step": 1800 + }, + { + "epoch": 0.011907894736842105, + "grad_norm": 3.28125, + "grad_norm_var": 0.575048828125, + "learning_rate": 0.0001, + "loss": 3.8828, + "loss/crossentropy": 2.505708968639374, + "loss/hidden": 3.4203125, + "loss/incoh": 0.0, + "loss/logits": 0.3773295432329178, + "loss/reg": 0.0, + "step": 1810 + }, + { + "epoch": 0.011973684210526315, + "grad_norm": 3.734375, + "grad_norm_var": 14.053641764322917, + "learning_rate": 0.0001, + "loss": 3.9952, + "loss/crossentropy": 2.8797521352767945, + "loss/hidden": 3.6734375, + "loss/incoh": 0.0, + "loss/logits": 0.7419060736894607, + "loss/reg": 0.0, + "step": 1820 + }, + { + "epoch": 0.012039473684210527, + "grad_norm": 3.96875, + "grad_norm_var": 2.006696573893229, + "learning_rate": 0.0001, + "loss": 3.87, + "loss/crossentropy": 2.792651188373566, + "loss/hidden": 3.6546875, + "loss/incoh": 0.0, + "loss/logits": 0.7776896879076958, + "loss/reg": 0.0, + "step": 1830 + }, + { + "epoch": 0.012105263157894737, + "grad_norm": 4.09375, + "grad_norm_var": 3.2296132405598956, + "learning_rate": 0.0001, + "loss": 3.8688, + "loss/crossentropy": 2.4713597655296327, + "loss/hidden": 3.4484375, + "loss/incoh": 0.0, + "loss/logits": 0.3962660849094391, + "loss/reg": 0.0, + "step": 1840 + }, + { + "epoch": 0.012171052631578947, + "grad_norm": 3.296875, + "grad_norm_var": 0.20377197265625, + "learning_rate": 0.0001, + "loss": 3.9721, + "loss/crossentropy": 2.225203812122345, + "loss/hidden": 3.4734375, + "loss/incoh": 0.0, + "loss/logits": 0.3860040009021759, + "loss/reg": 0.0, + "step": 1850 + }, + { + "epoch": 0.012236842105263157, + "grad_norm": 4.375, + "grad_norm_var": 14.807112630208334, + "learning_rate": 0.0001, + "loss": 4.0342, + "loss/crossentropy": 2.405521821975708, + "loss/hidden": 3.428125, + "loss/incoh": 0.0, + "loss/logits": 0.40949456989765165, + "loss/reg": 0.0, + "step": 1860 + }, + { + "epoch": 0.012302631578947369, + "grad_norm": 4.53125, + "grad_norm_var": 6.056012980143229, + "learning_rate": 0.0001, + "loss": 3.9802, + "loss/crossentropy": 2.3927958846092223, + "loss/hidden": 3.5265625, + "loss/incoh": 0.0, + "loss/logits": 0.3984386846423149, + "loss/reg": 0.0, + "step": 1870 + }, + { + "epoch": 0.012368421052631579, + "grad_norm": 2.734375, + "grad_norm_var": 0.6073893229166667, + "learning_rate": 0.0001, + "loss": 3.9074, + "loss/crossentropy": 2.6031975388526916, + "loss/hidden": 3.38125, + "loss/incoh": 0.0, + "loss/logits": 0.4128950208425522, + "loss/reg": 0.0, + "step": 1880 + }, + { + "epoch": 0.012434210526315789, + "grad_norm": 3.65625, + "grad_norm_var": 0.3526845296223958, + "learning_rate": 0.0001, + "loss": 3.9143, + "loss/crossentropy": 2.7405603647232057, + "loss/hidden": 3.25625, + "loss/incoh": 0.0, + "loss/logits": 0.37195596396923064, + "loss/reg": 0.0, + "step": 1890 + }, + { + "epoch": 0.0125, + "grad_norm": 3.3125, + "grad_norm_var": 130.27611389160157, + "learning_rate": 0.0001, + "loss": 3.986, + "loss/crossentropy": 2.6554584980010985, + "loss/hidden": 3.5, + "loss/incoh": 0.0, + "loss/logits": 0.3823524177074432, + "loss/reg": 0.0, + "step": 1900 + }, + { + "epoch": 0.01256578947368421, + "grad_norm": 3.6875, + "grad_norm_var": 130.43673400878907, + "learning_rate": 0.0001, + "loss": 3.8682, + "loss/crossentropy": 2.6816349744796755, + "loss/hidden": 3.421875, + "loss/incoh": 0.0, + "loss/logits": 0.43074882328510283, + "loss/reg": 0.0, + "step": 1910 + }, + { + "epoch": 0.01263157894736842, + "grad_norm": 3.0, + "grad_norm_var": 10.65947265625, + "learning_rate": 0.0001, + "loss": 4.0152, + "loss/crossentropy": 2.1248778343200683, + "loss/hidden": 3.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.4099419146776199, + "loss/reg": 0.0, + "step": 1920 + }, + { + "epoch": 0.01269736842105263, + "grad_norm": 2.828125, + "grad_norm_var": 0.41845296223958334, + "learning_rate": 0.0001, + "loss": 3.9099, + "loss/crossentropy": 2.485488569736481, + "loss/hidden": 3.3921875, + "loss/incoh": 0.0, + "loss/logits": 0.3766200736165047, + "loss/reg": 0.0, + "step": 1930 + }, + { + "epoch": 0.012763157894736843, + "grad_norm": 4.375, + "grad_norm_var": 2.3640777587890627, + "learning_rate": 0.0001, + "loss": 4.0886, + "loss/crossentropy": 2.896865522861481, + "loss/hidden": 4.4, + "loss/incoh": 0.0, + "loss/logits": 0.6787679702043533, + "loss/reg": 0.0, + "step": 1940 + }, + { + "epoch": 0.012828947368421053, + "grad_norm": 3.34375, + "grad_norm_var": 34.95194905598958, + "learning_rate": 0.0001, + "loss": 4.0285, + "loss/crossentropy": 2.348608684539795, + "loss/hidden": 3.6171875, + "loss/incoh": 0.0, + "loss/logits": 0.4279856622219086, + "loss/reg": 0.0, + "step": 1950 + }, + { + "epoch": 0.012894736842105263, + "grad_norm": 3.65625, + "grad_norm_var": 1.5377278645833334, + "learning_rate": 0.0001, + "loss": 3.9097, + "loss/crossentropy": 2.3533430814743044, + "loss/hidden": 3.48125, + "loss/incoh": 0.0, + "loss/logits": 0.37981766164302827, + "loss/reg": 0.0, + "step": 1960 + }, + { + "epoch": 0.012960526315789474, + "grad_norm": 4.15625, + "grad_norm_var": 0.23593648274739584, + "learning_rate": 0.0001, + "loss": 3.9802, + "loss/crossentropy": 2.54624525308609, + "loss/hidden": 3.790625, + "loss/incoh": 0.0, + "loss/logits": 0.4486588716506958, + "loss/reg": 0.0, + "step": 1970 + }, + { + "epoch": 0.013026315789473684, + "grad_norm": 2.859375, + "grad_norm_var": 0.24025777180989583, + "learning_rate": 0.0001, + "loss": 3.8942, + "loss/crossentropy": 2.5797088146209717, + "loss/hidden": 3.7875, + "loss/incoh": 0.0, + "loss/logits": 0.4970128297805786, + "loss/reg": 0.0, + "step": 1980 + }, + { + "epoch": 0.013092105263157894, + "grad_norm": 3.9375, + "grad_norm_var": 0.47609049479166665, + "learning_rate": 0.0001, + "loss": 3.934, + "loss/crossentropy": 2.555588161945343, + "loss/hidden": 3.38125, + "loss/incoh": 0.0, + "loss/logits": 0.38218972086906433, + "loss/reg": 0.0, + "step": 1990 + }, + { + "epoch": 0.013157894736842105, + "grad_norm": 4.03125, + "grad_norm_var": 1.0361073811848958, + "learning_rate": 0.0001, + "loss": 3.8759, + "loss/crossentropy": 2.164474868774414, + "loss/hidden": 3.7125, + "loss/incoh": 0.0, + "loss/logits": 0.3994155451655388, + "loss/reg": 0.0, + "step": 2000 + }, + { + "epoch": 0.013223684210526316, + "grad_norm": 3.453125, + "grad_norm_var": 0.8633778889973959, + "learning_rate": 0.0001, + "loss": 3.8317, + "loss/crossentropy": 2.558875060081482, + "loss/hidden": 3.471875, + "loss/incoh": 0.0, + "loss/logits": 0.3639406472444534, + "loss/reg": 0.0, + "step": 2010 + }, + { + "epoch": 0.013289473684210526, + "grad_norm": 3.75, + "grad_norm_var": 1.5050740559895834, + "learning_rate": 0.0001, + "loss": 3.9716, + "loss/crossentropy": 2.3385006546974183, + "loss/hidden": 3.4359375, + "loss/incoh": 0.0, + "loss/logits": 0.3938352942466736, + "loss/reg": 0.0, + "step": 2020 + }, + { + "epoch": 0.013355263157894736, + "grad_norm": 3.15625, + "grad_norm_var": 0.9781158447265625, + "learning_rate": 0.0001, + "loss": 3.9531, + "loss/crossentropy": 2.4627522945404055, + "loss/hidden": 3.5578125, + "loss/incoh": 0.0, + "loss/logits": 0.495425808429718, + "loss/reg": 0.0, + "step": 2030 + }, + { + "epoch": 0.013421052631578948, + "grad_norm": 2.75, + "grad_norm_var": 1.949779256184896, + "learning_rate": 0.0001, + "loss": 3.9979, + "loss/crossentropy": 2.11747065782547, + "loss/hidden": 3.59375, + "loss/incoh": 0.0, + "loss/logits": 0.36948435604572294, + "loss/reg": 0.0, + "step": 2040 + }, + { + "epoch": 0.013486842105263158, + "grad_norm": 3.0625, + "grad_norm_var": 1.5490549723307292, + "learning_rate": 0.0001, + "loss": 3.8381, + "loss/crossentropy": 2.473706376552582, + "loss/hidden": 3.4265625, + "loss/incoh": 0.0, + "loss/logits": 0.3867632657289505, + "loss/reg": 0.0, + "step": 2050 + }, + { + "epoch": 0.013552631578947368, + "grad_norm": 3.125, + "grad_norm_var": 2.1093470786676653e+17, + "learning_rate": 0.0001, + "loss": 4.0319, + "loss/crossentropy": 2.4225202679634092, + "loss/hidden": 3.3359375, + "loss/incoh": 0.0, + "loss/logits": 0.3675911784172058, + "loss/reg": 0.0, + "step": 2060 + }, + { + "epoch": 0.013618421052631578, + "grad_norm": 5.625, + "grad_norm_var": 76.3380849202474, + "learning_rate": 0.0001, + "loss": 4.0746, + "loss/crossentropy": 2.48265061378479, + "loss/hidden": 3.7265625, + "loss/incoh": 0.0, + "loss/logits": 0.4639868468046188, + "loss/reg": 0.0, + "step": 2070 + }, + { + "epoch": 0.01368421052631579, + "grad_norm": 4.5, + "grad_norm_var": 0.7881011962890625, + "learning_rate": 0.0001, + "loss": 4.038, + "loss/crossentropy": 2.7775272965431212, + "loss/hidden": 4.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.4948854446411133, + "loss/reg": 0.0, + "step": 2080 + }, + { + "epoch": 0.01375, + "grad_norm": 3.296875, + "grad_norm_var": 0.8111612955729167, + "learning_rate": 0.0001, + "loss": 3.8968, + "loss/crossentropy": 2.4613396763801574, + "loss/hidden": 3.3484375, + "loss/incoh": 0.0, + "loss/logits": 0.3684074327349663, + "loss/reg": 0.0, + "step": 2090 + }, + { + "epoch": 0.01381578947368421, + "grad_norm": 3.484375, + "grad_norm_var": 1.490550740559896, + "learning_rate": 0.0001, + "loss": 3.867, + "loss/crossentropy": 2.5947747588157655, + "loss/hidden": 3.4765625, + "loss/incoh": 0.0, + "loss/logits": 0.3717468947172165, + "loss/reg": 0.0, + "step": 2100 + }, + { + "epoch": 0.013881578947368422, + "grad_norm": 4.75, + "grad_norm_var": 1.7372385660807292, + "learning_rate": 0.0001, + "loss": 3.8731, + "loss/crossentropy": 2.466842460632324, + "loss/hidden": 3.2890625, + "loss/incoh": 0.0, + "loss/logits": 0.3417574405670166, + "loss/reg": 0.0, + "step": 2110 + }, + { + "epoch": 0.013947368421052632, + "grad_norm": 4.21875, + "grad_norm_var": 1.8861002604166666, + "learning_rate": 0.0001, + "loss": 3.823, + "loss/crossentropy": 2.3006282687187194, + "loss/hidden": 3.390625, + "loss/incoh": 0.0, + "loss/logits": 0.36191926896572113, + "loss/reg": 0.0, + "step": 2120 + }, + { + "epoch": 0.014013157894736842, + "grad_norm": 4.21875, + "grad_norm_var": 0.9100901285807291, + "learning_rate": 0.0001, + "loss": 3.8933, + "loss/crossentropy": 2.6159239768981934, + "loss/hidden": 3.5109375, + "loss/incoh": 0.0, + "loss/logits": 0.46396631598472593, + "loss/reg": 0.0, + "step": 2130 + }, + { + "epoch": 0.014078947368421052, + "grad_norm": 3.734375, + "grad_norm_var": 0.9789388020833333, + "learning_rate": 0.0001, + "loss": 3.8761, + "loss/crossentropy": 2.6355370759963987, + "loss/hidden": 3.3421875, + "loss/incoh": 0.0, + "loss/logits": 0.361694809794426, + "loss/reg": 0.0, + "step": 2140 + }, + { + "epoch": 0.014144736842105264, + "grad_norm": 2.875, + "grad_norm_var": 0.30113525390625, + "learning_rate": 0.0001, + "loss": 3.8617, + "loss/crossentropy": 2.6357606053352356, + "loss/hidden": 3.3515625, + "loss/incoh": 0.0, + "loss/logits": 0.3612044155597687, + "loss/reg": 0.0, + "step": 2150 + }, + { + "epoch": 0.014210526315789474, + "grad_norm": 2.921875, + "grad_norm_var": 0.23430582682291667, + "learning_rate": 0.0001, + "loss": 3.8903, + "loss/crossentropy": 2.551873171329498, + "loss/hidden": 3.44375, + "loss/incoh": 0.0, + "loss/logits": 0.41060586273670197, + "loss/reg": 0.0, + "step": 2160 + }, + { + "epoch": 0.014276315789473684, + "grad_norm": 3.5625, + "grad_norm_var": 0.48121744791666665, + "learning_rate": 0.0001, + "loss": 3.9663, + "loss/crossentropy": 2.495719885826111, + "loss/hidden": 3.690625, + "loss/incoh": 0.0, + "loss/logits": 0.4500987708568573, + "loss/reg": 0.0, + "step": 2170 + }, + { + "epoch": 0.014342105263157894, + "grad_norm": 2.8125, + "grad_norm_var": 0.17538655598958333, + "learning_rate": 0.0001, + "loss": 3.8053, + "loss/crossentropy": 2.520111393928528, + "loss/hidden": 3.4765625, + "loss/incoh": 0.0, + "loss/logits": 0.44193484634160995, + "loss/reg": 0.0, + "step": 2180 + }, + { + "epoch": 0.014407894736842106, + "grad_norm": 3.328125, + "grad_norm_var": 31.391422526041666, + "learning_rate": 0.0001, + "loss": 4.0394, + "loss/crossentropy": 2.5524103164672853, + "loss/hidden": 3.4203125, + "loss/incoh": 0.0, + "loss/logits": 0.38946655094623567, + "loss/reg": 0.0, + "step": 2190 + }, + { + "epoch": 0.014473684210526316, + "grad_norm": 8.6875, + "grad_norm_var": 2.0572499593098956, + "learning_rate": 0.0001, + "loss": 3.9758, + "loss/crossentropy": 2.2560265123844148, + "loss/hidden": 3.59375, + "loss/incoh": 0.0, + "loss/logits": 0.4082709074020386, + "loss/reg": 0.0, + "step": 2200 + }, + { + "epoch": 0.014539473684210526, + "grad_norm": 3.25, + "grad_norm_var": 3.566722615559896, + "learning_rate": 0.0001, + "loss": 3.8174, + "loss/crossentropy": 2.2830474019050597, + "loss/hidden": 3.3375, + "loss/incoh": 0.0, + "loss/logits": 0.3462225392460823, + "loss/reg": 0.0, + "step": 2210 + }, + { + "epoch": 0.014605263157894737, + "grad_norm": 3.109375, + "grad_norm_var": 0.155615234375, + "learning_rate": 0.0001, + "loss": 3.8468, + "loss/crossentropy": 2.3341428637504578, + "loss/hidden": 3.4375, + "loss/incoh": 0.0, + "loss/logits": 0.4024402230978012, + "loss/reg": 0.0, + "step": 2220 + }, + { + "epoch": 0.014671052631578948, + "grad_norm": 2.984375, + "grad_norm_var": 5.723542277018229, + "learning_rate": 0.0001, + "loss": 3.9699, + "loss/crossentropy": 2.201635646820068, + "loss/hidden": 3.2703125, + "loss/incoh": 0.0, + "loss/logits": 0.3397625252604485, + "loss/reg": 0.0, + "step": 2230 + }, + { + "epoch": 0.014736842105263158, + "grad_norm": 2.796875, + "grad_norm_var": 47.54188537597656, + "learning_rate": 0.0001, + "loss": 3.9281, + "loss/crossentropy": 2.5724541902542115, + "loss/hidden": 3.2890625, + "loss/incoh": 0.0, + "loss/logits": 0.3728118479251862, + "loss/reg": 0.0, + "step": 2240 + }, + { + "epoch": 0.014802631578947368, + "grad_norm": 2.984375, + "grad_norm_var": 51.28417867024739, + "learning_rate": 0.0001, + "loss": 3.8237, + "loss/crossentropy": 2.5087037920951842, + "loss/hidden": 3.2125, + "loss/incoh": 0.0, + "loss/logits": 0.3448286011815071, + "loss/reg": 0.0, + "step": 2250 + }, + { + "epoch": 0.01486842105263158, + "grad_norm": 3.171875, + "grad_norm_var": 0.05579427083333333, + "learning_rate": 0.0001, + "loss": 3.8448, + "loss/crossentropy": 2.4813582420349123, + "loss/hidden": 3.409375, + "loss/incoh": 0.0, + "loss/logits": 0.40958506166934966, + "loss/reg": 0.0, + "step": 2260 + }, + { + "epoch": 0.01493421052631579, + "grad_norm": 2.96875, + "grad_norm_var": 0.16342671712239584, + "learning_rate": 0.0001, + "loss": 3.753, + "loss/crossentropy": 2.523963761329651, + "loss/hidden": 3.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.38928901553153994, + "loss/reg": 0.0, + "step": 2270 + }, + { + "epoch": 0.015, + "grad_norm": 3.0625, + "grad_norm_var": 0.7111317952473958, + "learning_rate": 0.0001, + "loss": 3.8078, + "loss/crossentropy": 2.6787729024887086, + "loss/hidden": 3.71875, + "loss/incoh": 0.0, + "loss/logits": 0.4244162023067474, + "loss/reg": 0.0, + "step": 2280 + }, + { + "epoch": 0.015065789473684211, + "grad_norm": 3.09375, + "grad_norm_var": 0.10339253743489583, + "learning_rate": 0.0001, + "loss": 3.7562, + "loss/crossentropy": 2.191652774810791, + "loss/hidden": 3.4140625, + "loss/incoh": 0.0, + "loss/logits": 0.3300579100847244, + "loss/reg": 0.0, + "step": 2290 + }, + { + "epoch": 0.015131578947368421, + "grad_norm": 2.890625, + "grad_norm_var": 0.05054423014322917, + "learning_rate": 0.0001, + "loss": 3.73, + "loss/crossentropy": 2.4444664478302003, + "loss/hidden": 3.4265625, + "loss/incoh": 0.0, + "loss/logits": 0.4327535033226013, + "loss/reg": 0.0, + "step": 2300 + }, + { + "epoch": 0.015197368421052631, + "grad_norm": 3.046875, + "grad_norm_var": 0.15128580729166666, + "learning_rate": 0.0001, + "loss": 3.7723, + "loss/crossentropy": 2.2329365968704225, + "loss/hidden": 3.51875, + "loss/incoh": 0.0, + "loss/logits": 0.3683215394616127, + "loss/reg": 0.0, + "step": 2310 + }, + { + "epoch": 0.015263157894736841, + "grad_norm": 3.03125, + "grad_norm_var": 0.3289713541666667, + "learning_rate": 0.0001, + "loss": 3.7699, + "loss/crossentropy": 2.5342983961105348, + "loss/hidden": 3.2921875, + "loss/incoh": 0.0, + "loss/logits": 0.35688025653362276, + "loss/reg": 0.0, + "step": 2320 + }, + { + "epoch": 0.015328947368421053, + "grad_norm": 3.15625, + "grad_norm_var": 0.5997060139973959, + "learning_rate": 0.0001, + "loss": 3.8494, + "loss/crossentropy": 2.4934002995491027, + "loss/hidden": 3.1546875, + "loss/incoh": 0.0, + "loss/logits": 0.3495032548904419, + "loss/reg": 0.0, + "step": 2330 + }, + { + "epoch": 0.015394736842105263, + "grad_norm": 4.28125, + "grad_norm_var": 1.570759073893229, + "learning_rate": 0.0001, + "loss": 3.9628, + "loss/crossentropy": 2.2064894437789917, + "loss/hidden": 3.478125, + "loss/incoh": 0.0, + "loss/logits": 0.34214983880519867, + "loss/reg": 0.0, + "step": 2340 + }, + { + "epoch": 0.015460526315789473, + "grad_norm": 3.5, + "grad_norm_var": 1.9128865559895833, + "learning_rate": 0.0001, + "loss": 3.9538, + "loss/crossentropy": 2.5408032178878783, + "loss/hidden": 3.515625, + "loss/incoh": 0.0, + "loss/logits": 0.4111128658056259, + "loss/reg": 0.0, + "step": 2350 + }, + { + "epoch": 0.015526315789473685, + "grad_norm": 3.765625, + "grad_norm_var": 0.39661051432291666, + "learning_rate": 0.0001, + "loss": 3.8897, + "loss/crossentropy": 2.4922020554542543, + "loss/hidden": 3.3953125, + "loss/incoh": 0.0, + "loss/logits": 0.36137166023254397, + "loss/reg": 0.0, + "step": 2360 + }, + { + "epoch": 0.015592105263157895, + "grad_norm": 4.09375, + "grad_norm_var": 0.21868082682291667, + "learning_rate": 0.0001, + "loss": 3.8461, + "loss/crossentropy": 2.427833843231201, + "loss/hidden": 3.5921875, + "loss/incoh": 0.0, + "loss/logits": 0.36205882132053374, + "loss/reg": 0.0, + "step": 2370 + }, + { + "epoch": 0.015657894736842107, + "grad_norm": 3.171875, + "grad_norm_var": 0.29641520182291664, + "learning_rate": 0.0001, + "loss": 3.767, + "loss/crossentropy": 2.3795222878456115, + "loss/hidden": 3.5140625, + "loss/incoh": 0.0, + "loss/logits": 0.40104621052742007, + "loss/reg": 0.0, + "step": 2380 + }, + { + "epoch": 0.015723684210526317, + "grad_norm": 3.171875, + "grad_norm_var": 0.4603017171223958, + "learning_rate": 0.0001, + "loss": 3.8162, + "loss/crossentropy": 2.3675019264221193, + "loss/hidden": 3.4, + "loss/incoh": 0.0, + "loss/logits": 0.3826363369822502, + "loss/reg": 0.0, + "step": 2390 + }, + { + "epoch": 0.015789473684210527, + "grad_norm": 3.59375, + "grad_norm_var": 0.12642313639322916, + "learning_rate": 0.0001, + "loss": 3.8017, + "loss/crossentropy": 2.56625235080719, + "loss/hidden": 3.4046875, + "loss/incoh": 0.0, + "loss/logits": 0.3704580098390579, + "loss/reg": 0.0, + "step": 2400 + }, + { + "epoch": 0.015855263157894737, + "grad_norm": 2.8125, + "grad_norm_var": 0.22603759765625, + "learning_rate": 0.0001, + "loss": 3.8356, + "loss/crossentropy": 2.3642316341400145, + "loss/hidden": 3.640625, + "loss/incoh": 0.0, + "loss/logits": 0.4971610188484192, + "loss/reg": 0.0, + "step": 2410 + }, + { + "epoch": 0.015921052631578947, + "grad_norm": 3.359375, + "grad_norm_var": 2.7929354478016266e+17, + "learning_rate": 0.0001, + "loss": 3.9524, + "loss/crossentropy": 2.5260600686073302, + "loss/hidden": 3.4734375, + "loss/incoh": 0.0, + "loss/logits": 0.34504298865795135, + "loss/reg": 0.0, + "step": 2420 + }, + { + "epoch": 0.015986842105263157, + "grad_norm": 3.140625, + "grad_norm_var": 2.792935447600693e+17, + "learning_rate": 0.0001, + "loss": 3.7506, + "loss/crossentropy": 2.6639176845550536, + "loss/hidden": 3.184375, + "loss/incoh": 0.0, + "loss/logits": 0.3401388913393021, + "loss/reg": 0.0, + "step": 2430 + }, + { + "epoch": 0.016052631578947367, + "grad_norm": 2.890625, + "grad_norm_var": 0.0854644775390625, + "learning_rate": 0.0001, + "loss": 3.7687, + "loss/crossentropy": 2.3805726766586304, + "loss/hidden": 3.1671875, + "loss/incoh": 0.0, + "loss/logits": 0.3257554292678833, + "loss/reg": 0.0, + "step": 2440 + }, + { + "epoch": 0.01611842105263158, + "grad_norm": 8.125, + "grad_norm_var": 1.995349713361273e+17, + "learning_rate": 0.0001, + "loss": 4.0533, + "loss/crossentropy": 2.370633268356323, + "loss/hidden": 3.4765625, + "loss/incoh": 0.0, + "loss/logits": 0.3756751254200935, + "loss/reg": 0.0, + "step": 2450 + }, + { + "epoch": 0.01618421052631579, + "grad_norm": 2.75, + "grad_norm_var": 1.995349712714498e+17, + "learning_rate": 0.0001, + "loss": 3.7822, + "loss/crossentropy": 2.5394015312194824, + "loss/hidden": 3.3125, + "loss/incoh": 0.0, + "loss/logits": 0.34547194838523865, + "loss/reg": 0.0, + "step": 2460 + }, + { + "epoch": 0.01625, + "grad_norm": 3.15625, + "grad_norm_var": 0.0999664306640625, + "learning_rate": 0.0001, + "loss": 3.6838, + "loss/crossentropy": 2.478586256504059, + "loss/hidden": 3.4109375, + "loss/incoh": 0.0, + "loss/logits": 0.4159191906452179, + "loss/reg": 0.0, + "step": 2470 + }, + { + "epoch": 0.01631578947368421, + "grad_norm": 2.703125, + "grad_norm_var": 0.48103739420572916, + "learning_rate": 0.0001, + "loss": 3.7274, + "loss/crossentropy": 2.3859502553939818, + "loss/hidden": 3.4234375, + "loss/incoh": 0.0, + "loss/logits": 0.36551299393177034, + "loss/reg": 0.0, + "step": 2480 + }, + { + "epoch": 0.01638157894736842, + "grad_norm": 3.171875, + "grad_norm_var": 0.33056640625, + "learning_rate": 0.0001, + "loss": 3.8413, + "loss/crossentropy": 2.47422776222229, + "loss/hidden": 3.2703125, + "loss/incoh": 0.0, + "loss/logits": 0.34280748963356017, + "loss/reg": 0.0, + "step": 2490 + }, + { + "epoch": 0.01644736842105263, + "grad_norm": 3.03125, + "grad_norm_var": 0.30504150390625, + "learning_rate": 0.0001, + "loss": 3.7262, + "loss/crossentropy": 2.288319444656372, + "loss/hidden": 3.2109375, + "loss/incoh": 0.0, + "loss/logits": 0.31078503280878067, + "loss/reg": 0.0, + "step": 2500 + }, + { + "epoch": 0.01651315789473684, + "grad_norm": 3.125, + "grad_norm_var": 0.2894683837890625, + "learning_rate": 0.0001, + "loss": 3.8194, + "loss/crossentropy": 2.2243799686431887, + "loss/hidden": 3.434375, + "loss/incoh": 0.0, + "loss/logits": 0.33974049538373946, + "loss/reg": 0.0, + "step": 2510 + }, + { + "epoch": 0.016578947368421054, + "grad_norm": 3.046875, + "grad_norm_var": 0.3177235921223958, + "learning_rate": 0.0001, + "loss": 3.6925, + "loss/crossentropy": 2.63401620388031, + "loss/hidden": 3.39375, + "loss/incoh": 0.0, + "loss/logits": 0.39840718507766726, + "loss/reg": 0.0, + "step": 2520 + }, + { + "epoch": 0.016644736842105264, + "grad_norm": 3.046875, + "grad_norm_var": 0.2181549072265625, + "learning_rate": 0.0001, + "loss": 3.7573, + "loss/crossentropy": 2.4925423860549927, + "loss/hidden": 3.3359375, + "loss/incoh": 0.0, + "loss/logits": 0.3407833933830261, + "loss/reg": 0.0, + "step": 2530 + }, + { + "epoch": 0.016710526315789474, + "grad_norm": 3.328125, + "grad_norm_var": 0.19260152180989584, + "learning_rate": 0.0001, + "loss": 3.7514, + "loss/crossentropy": 2.3376643657684326, + "loss/hidden": 3.4609375, + "loss/incoh": 0.0, + "loss/logits": 0.4351751744747162, + "loss/reg": 0.0, + "step": 2540 + }, + { + "epoch": 0.016776315789473684, + "grad_norm": 2.78125, + "grad_norm_var": 0.12961324055989584, + "learning_rate": 0.0001, + "loss": 3.7217, + "loss/crossentropy": 2.279516875743866, + "loss/hidden": 3.58125, + "loss/incoh": 0.0, + "loss/logits": 0.37157190442085264, + "loss/reg": 0.0, + "step": 2550 + }, + { + "epoch": 0.016842105263157894, + "grad_norm": 2.921875, + "grad_norm_var": 0.0912017822265625, + "learning_rate": 0.0001, + "loss": 3.6839, + "loss/crossentropy": 2.2843039661645888, + "loss/hidden": 3.2125, + "loss/incoh": 0.0, + "loss/logits": 0.33208019435405733, + "loss/reg": 0.0, + "step": 2560 + }, + { + "epoch": 0.016907894736842104, + "grad_norm": 3.28125, + "grad_norm_var": 0.32079671223958334, + "learning_rate": 0.0001, + "loss": 3.7381, + "loss/crossentropy": 2.2308380246162414, + "loss/hidden": 3.29375, + "loss/incoh": 0.0, + "loss/logits": 0.30514844954013826, + "loss/reg": 0.0, + "step": 2570 + }, + { + "epoch": 0.016973684210526314, + "grad_norm": 2.921875, + "grad_norm_var": 0.6193522135416667, + "learning_rate": 0.0001, + "loss": 3.7209, + "loss/crossentropy": 2.500720489025116, + "loss/hidden": 3.2859375, + "loss/incoh": 0.0, + "loss/logits": 0.38955146372318267, + "loss/reg": 0.0, + "step": 2580 + }, + { + "epoch": 0.017039473684210528, + "grad_norm": 3.0, + "grad_norm_var": 0.09539388020833334, + "learning_rate": 0.0001, + "loss": 3.6933, + "loss/crossentropy": 2.397514319419861, + "loss/hidden": 3.4796875, + "loss/incoh": 0.0, + "loss/logits": 0.426153627038002, + "loss/reg": 0.0, + "step": 2590 + }, + { + "epoch": 0.017105263157894738, + "grad_norm": 2.984375, + "grad_norm_var": 0.44207356770833334, + "learning_rate": 0.0001, + "loss": 3.7927, + "loss/crossentropy": 2.4746341586112974, + "loss/hidden": 3.4515625, + "loss/incoh": 0.0, + "loss/logits": 0.3806774616241455, + "loss/reg": 0.0, + "step": 2600 + }, + { + "epoch": 0.017171052631578948, + "grad_norm": 3.109375, + "grad_norm_var": 0.10927632649739584, + "learning_rate": 0.0001, + "loss": 3.7507, + "loss/crossentropy": 2.6908259630203246, + "loss/hidden": 3.459375, + "loss/incoh": 0.0, + "loss/logits": 0.47532927691936494, + "loss/reg": 0.0, + "step": 2610 + }, + { + "epoch": 0.017236842105263158, + "grad_norm": 2.75, + "grad_norm_var": 1.8751780192057292, + "learning_rate": 0.0001, + "loss": 3.8498, + "loss/crossentropy": 2.3184617161750793, + "loss/hidden": 3.3125, + "loss/incoh": 0.0, + "loss/logits": 0.3832893192768097, + "loss/reg": 0.0, + "step": 2620 + }, + { + "epoch": 0.017302631578947368, + "grad_norm": 3.734375, + "grad_norm_var": 1.95693359375, + "learning_rate": 0.0001, + "loss": 3.8697, + "loss/crossentropy": 2.386726236343384, + "loss/hidden": 3.38125, + "loss/incoh": 0.0, + "loss/logits": 0.4112528935074806, + "loss/reg": 0.0, + "step": 2630 + }, + { + "epoch": 0.017368421052631578, + "grad_norm": 3.09375, + "grad_norm_var": 236.78067118326823, + "learning_rate": 0.0001, + "loss": 3.843, + "loss/crossentropy": 2.6268559217453005, + "loss/hidden": 3.525, + "loss/incoh": 0.0, + "loss/logits": 0.5286044746637344, + "loss/reg": 0.0, + "step": 2640 + }, + { + "epoch": 0.017434210526315788, + "grad_norm": 3.15625, + "grad_norm_var": 238.80460510253906, + "learning_rate": 0.0001, + "loss": 3.7934, + "loss/crossentropy": 2.621377694606781, + "loss/hidden": 3.346875, + "loss/incoh": 0.0, + "loss/logits": 0.39670759439468384, + "loss/reg": 0.0, + "step": 2650 + }, + { + "epoch": 0.0175, + "grad_norm": 2.921875, + "grad_norm_var": 0.4898274739583333, + "learning_rate": 0.0001, + "loss": 3.8686, + "loss/crossentropy": 2.3771218061447144, + "loss/hidden": 3.5328125, + "loss/incoh": 0.0, + "loss/logits": 0.39255764335393906, + "loss/reg": 0.0, + "step": 2660 + }, + { + "epoch": 0.01756578947368421, + "grad_norm": 2.96875, + "grad_norm_var": 0.9812001546223958, + "learning_rate": 0.0001, + "loss": 3.6636, + "loss/crossentropy": 2.2852025091648103, + "loss/hidden": 3.3359375, + "loss/incoh": 0.0, + "loss/logits": 0.3551890656352043, + "loss/reg": 0.0, + "step": 2670 + }, + { + "epoch": 0.017631578947368422, + "grad_norm": 3.203125, + "grad_norm_var": 0.38752848307291665, + "learning_rate": 0.0001, + "loss": 3.7338, + "loss/crossentropy": 2.280995038151741, + "loss/hidden": 3.271875, + "loss/incoh": 0.0, + "loss/logits": 0.34781029969453814, + "loss/reg": 0.0, + "step": 2680 + }, + { + "epoch": 0.017697368421052632, + "grad_norm": 2.953125, + "grad_norm_var": 0.5138661702473958, + "learning_rate": 0.0001, + "loss": 3.7751, + "loss/crossentropy": 2.5928542375564576, + "loss/hidden": 3.2203125, + "loss/incoh": 0.0, + "loss/logits": 0.3643882930278778, + "loss/reg": 0.0, + "step": 2690 + }, + { + "epoch": 0.017763157894736842, + "grad_norm": 3.015625, + "grad_norm_var": 2.5011057535807293, + "learning_rate": 0.0001, + "loss": 3.7281, + "loss/crossentropy": 2.71818265914917, + "loss/hidden": 3.3203125, + "loss/incoh": 0.0, + "loss/logits": 0.41874536871910095, + "loss/reg": 0.0, + "step": 2700 + }, + { + "epoch": 0.017828947368421052, + "grad_norm": 3.0625, + "grad_norm_var": 0.2831939697265625, + "learning_rate": 0.0001, + "loss": 3.7723, + "loss/crossentropy": 2.3557824969291685, + "loss/hidden": 3.303125, + "loss/incoh": 0.0, + "loss/logits": 0.36439308822155, + "loss/reg": 0.0, + "step": 2710 + }, + { + "epoch": 0.017894736842105262, + "grad_norm": 3.390625, + "grad_norm_var": 1.9459625244140626, + "learning_rate": 0.0001, + "loss": 3.854, + "loss/crossentropy": 2.3793618083000183, + "loss/hidden": 3.371875, + "loss/incoh": 0.0, + "loss/logits": 0.36435145139694214, + "loss/reg": 0.0, + "step": 2720 + }, + { + "epoch": 0.017960526315789475, + "grad_norm": 2.5, + "grad_norm_var": 1.9445271809895834, + "learning_rate": 0.0001, + "loss": 3.7925, + "loss/crossentropy": 2.364825797080994, + "loss/hidden": 3.2453125, + "loss/incoh": 0.0, + "loss/logits": 0.3577578902244568, + "loss/reg": 0.0, + "step": 2730 + }, + { + "epoch": 0.018026315789473685, + "grad_norm": 2.9375, + "grad_norm_var": 0.18502197265625, + "learning_rate": 0.0001, + "loss": 3.747, + "loss/crossentropy": 2.3110872566699983, + "loss/hidden": 3.4078125, + "loss/incoh": 0.0, + "loss/logits": 0.3662073493003845, + "loss/reg": 0.0, + "step": 2740 + }, + { + "epoch": 0.018092105263157895, + "grad_norm": 2.90625, + "grad_norm_var": 0.06238606770833333, + "learning_rate": 0.0001, + "loss": 3.736, + "loss/crossentropy": 2.4977880120277405, + "loss/hidden": 3.3578125, + "loss/incoh": 0.0, + "loss/logits": 0.40198240578174593, + "loss/reg": 0.0, + "step": 2750 + }, + { + "epoch": 0.018157894736842106, + "grad_norm": 2.65625, + "grad_norm_var": 1.3276357014973958, + "learning_rate": 0.0001, + "loss": 3.7594, + "loss/crossentropy": 2.3260527729988096, + "loss/hidden": 3.39375, + "loss/incoh": 0.0, + "loss/logits": 0.3752635881304741, + "loss/reg": 0.0, + "step": 2760 + }, + { + "epoch": 0.018223684210526316, + "grad_norm": 3.0, + "grad_norm_var": 0.06177978515625, + "learning_rate": 0.0001, + "loss": 3.7632, + "loss/crossentropy": 2.6722695350646974, + "loss/hidden": 3.34375, + "loss/incoh": 0.0, + "loss/logits": 0.36579819619655607, + "loss/reg": 0.0, + "step": 2770 + }, + { + "epoch": 0.018289473684210526, + "grad_norm": 3.78125, + "grad_norm_var": 0.07119038899739584, + "learning_rate": 0.0001, + "loss": 3.7821, + "loss/crossentropy": 2.712409424781799, + "loss/hidden": 3.59375, + "loss/incoh": 0.0, + "loss/logits": 0.43594706654548643, + "loss/reg": 0.0, + "step": 2780 + }, + { + "epoch": 0.018355263157894736, + "grad_norm": 3.21875, + "grad_norm_var": 2.3453409830729166, + "learning_rate": 0.0001, + "loss": 3.7973, + "loss/crossentropy": 2.547107517719269, + "loss/hidden": 3.39375, + "loss/incoh": 0.0, + "loss/logits": 0.4092650800943375, + "loss/reg": 0.0, + "step": 2790 + }, + { + "epoch": 0.018421052631578946, + "grad_norm": 2.796875, + "grad_norm_var": 4.506314086914062, + "learning_rate": 0.0001, + "loss": 3.6334, + "loss/crossentropy": 2.5399341940879823, + "loss/hidden": 3.4046875, + "loss/incoh": 0.0, + "loss/logits": 0.4391636699438095, + "loss/reg": 0.0, + "step": 2800 + }, + { + "epoch": 0.01848684210526316, + "grad_norm": 4.84375, + "grad_norm_var": 2.5375152587890626, + "learning_rate": 0.0001, + "loss": 3.7292, + "loss/crossentropy": 2.5819294929504393, + "loss/hidden": 3.365625, + "loss/incoh": 0.0, + "loss/logits": 0.33986122310161593, + "loss/reg": 0.0, + "step": 2810 + }, + { + "epoch": 0.01855263157894737, + "grad_norm": 4.03125, + "grad_norm_var": 0.3421295166015625, + "learning_rate": 0.0001, + "loss": 3.7003, + "loss/crossentropy": 2.4064539194107057, + "loss/hidden": 3.334375, + "loss/incoh": 0.0, + "loss/logits": 0.3262439340353012, + "loss/reg": 0.0, + "step": 2820 + }, + { + "epoch": 0.01861842105263158, + "grad_norm": 2.828125, + "grad_norm_var": 0.1756988525390625, + "learning_rate": 0.0001, + "loss": 3.804, + "loss/crossentropy": 2.6151478767395018, + "loss/hidden": 3.6390625, + "loss/incoh": 0.0, + "loss/logits": 0.4677980303764343, + "loss/reg": 0.0, + "step": 2830 + }, + { + "epoch": 0.01868421052631579, + "grad_norm": 3.03125, + "grad_norm_var": 0.10458577473958333, + "learning_rate": 0.0001, + "loss": 3.7672, + "loss/crossentropy": 2.5210029244422913, + "loss/hidden": 3.3953125, + "loss/incoh": 0.0, + "loss/logits": 0.40769249498844146, + "loss/reg": 0.0, + "step": 2840 + }, + { + "epoch": 0.01875, + "grad_norm": 3.28125, + "grad_norm_var": 0.1360504150390625, + "learning_rate": 0.0001, + "loss": 3.7829, + "loss/crossentropy": 2.3390053629875185, + "loss/hidden": 3.4421875, + "loss/incoh": 0.0, + "loss/logits": 0.3656760662794113, + "loss/reg": 0.0, + "step": 2850 + }, + { + "epoch": 0.01881578947368421, + "grad_norm": 2.609375, + "grad_norm_var": 0.21334635416666667, + "learning_rate": 0.0001, + "loss": 3.7445, + "loss/crossentropy": 2.044054812192917, + "loss/hidden": 3.55625, + "loss/incoh": 0.0, + "loss/logits": 0.45996856689453125, + "loss/reg": 0.0, + "step": 2860 + }, + { + "epoch": 0.01888157894736842, + "grad_norm": 2.921875, + "grad_norm_var": 0.35147196451822915, + "learning_rate": 0.0001, + "loss": 3.793, + "loss/crossentropy": 2.2911840200424196, + "loss/hidden": 3.5703125, + "loss/incoh": 0.0, + "loss/logits": 0.3373717874288559, + "loss/reg": 0.0, + "step": 2870 + }, + { + "epoch": 0.018947368421052633, + "grad_norm": 2.515625, + "grad_norm_var": 0.3585845947265625, + "learning_rate": 0.0001, + "loss": 3.7057, + "loss/crossentropy": 2.3755346536636353, + "loss/hidden": 3.3609375, + "loss/incoh": 0.0, + "loss/logits": 0.36185318529605864, + "loss/reg": 0.0, + "step": 2880 + }, + { + "epoch": 0.019013157894736843, + "grad_norm": 2.859375, + "grad_norm_var": 0.0810211181640625, + "learning_rate": 0.0001, + "loss": 3.7152, + "loss/crossentropy": 2.1650418758392336, + "loss/hidden": 3.5109375, + "loss/incoh": 0.0, + "loss/logits": 0.36209405958652496, + "loss/reg": 0.0, + "step": 2890 + }, + { + "epoch": 0.019078947368421053, + "grad_norm": 3.171875, + "grad_norm_var": 1.6020182291666667, + "learning_rate": 0.0001, + "loss": 3.8985, + "loss/crossentropy": 2.345675766468048, + "loss/hidden": 3.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.41470250189304353, + "loss/reg": 0.0, + "step": 2900 + }, + { + "epoch": 0.019144736842105263, + "grad_norm": 3.359375, + "grad_norm_var": 1.289264933268229, + "learning_rate": 0.0001, + "loss": 3.8444, + "loss/crossentropy": 2.3634544610977173, + "loss/hidden": 3.2984375, + "loss/incoh": 0.0, + "loss/logits": 0.3347387105226517, + "loss/reg": 0.0, + "step": 2910 + }, + { + "epoch": 0.019210526315789473, + "grad_norm": 2.890625, + "grad_norm_var": 0.19099019368489584, + "learning_rate": 0.0001, + "loss": 3.7385, + "loss/crossentropy": 2.3202159285545347, + "loss/hidden": 3.3265625, + "loss/incoh": 0.0, + "loss/logits": 0.30690879598259924, + "loss/reg": 0.0, + "step": 2920 + }, + { + "epoch": 0.019276315789473683, + "grad_norm": 3.484375, + "grad_norm_var": 0.4906209309895833, + "learning_rate": 0.0001, + "loss": 3.6643, + "loss/crossentropy": 2.2004665434360504, + "loss/hidden": 3.484375, + "loss/incoh": 0.0, + "loss/logits": 0.3603193074464798, + "loss/reg": 0.0, + "step": 2930 + }, + { + "epoch": 0.019342105263157893, + "grad_norm": 3.421875, + "grad_norm_var": 0.13365478515625, + "learning_rate": 0.0001, + "loss": 3.6656, + "loss/crossentropy": 2.5021592140197755, + "loss/hidden": 3.4109375, + "loss/incoh": 0.0, + "loss/logits": 0.35134916603565214, + "loss/reg": 0.0, + "step": 2940 + }, + { + "epoch": 0.019407894736842107, + "grad_norm": 2.765625, + "grad_norm_var": 1.0520497639973958, + "learning_rate": 0.0001, + "loss": 3.7813, + "loss/crossentropy": 2.4475439548492433, + "loss/hidden": 3.44375, + "loss/incoh": 0.0, + "loss/logits": 0.4413463234901428, + "loss/reg": 0.0, + "step": 2950 + }, + { + "epoch": 0.019473684210526317, + "grad_norm": 2.90625, + "grad_norm_var": 1.761279296875, + "learning_rate": 0.0001, + "loss": 3.7476, + "loss/crossentropy": 2.57927063703537, + "loss/hidden": 3.4234375, + "loss/incoh": 0.0, + "loss/logits": 0.4805259481072426, + "loss/reg": 0.0, + "step": 2960 + }, + { + "epoch": 0.019539473684210527, + "grad_norm": 3.734375, + "grad_norm_var": 0.2982737223307292, + "learning_rate": 0.0001, + "loss": 3.7277, + "loss/crossentropy": 2.0291129291057586, + "loss/hidden": 3.0859375, + "loss/incoh": 0.0, + "loss/logits": 0.285884577780962, + "loss/reg": 0.0, + "step": 2970 + }, + { + "epoch": 0.019605263157894737, + "grad_norm": 3.03125, + "grad_norm_var": 0.6626942952473959, + "learning_rate": 0.0001, + "loss": 3.8596, + "loss/crossentropy": 2.112092435359955, + "loss/hidden": 3.4453125, + "loss/incoh": 0.0, + "loss/logits": 0.36193700730800626, + "loss/reg": 0.0, + "step": 2980 + }, + { + "epoch": 0.019671052631578947, + "grad_norm": 3.359375, + "grad_norm_var": 0.16502176920572917, + "learning_rate": 0.0001, + "loss": 3.7666, + "loss/crossentropy": 2.2168065547943114, + "loss/hidden": 3.3734375, + "loss/incoh": 0.0, + "loss/logits": 0.3593168243765831, + "loss/reg": 0.0, + "step": 2990 + }, + { + "epoch": 0.019736842105263157, + "grad_norm": 2.921875, + "grad_norm_var": 0.45120035807291664, + "learning_rate": 0.0001, + "loss": 3.7107, + "loss/crossentropy": 2.350427895784378, + "loss/hidden": 3.3109375, + "loss/incoh": 0.0, + "loss/logits": 0.37478172183036806, + "loss/reg": 0.0, + "step": 3000 + }, + { + "epoch": 0.019802631578947367, + "grad_norm": 2.71875, + "grad_norm_var": 0.15097249348958333, + "learning_rate": 0.0001, + "loss": 3.7559, + "loss/crossentropy": 2.6041540622711183, + "loss/hidden": 3.3421875, + "loss/incoh": 0.0, + "loss/logits": 0.366491025686264, + "loss/reg": 0.0, + "step": 3010 + }, + { + "epoch": 0.01986842105263158, + "grad_norm": 2.953125, + "grad_norm_var": 0.1023834228515625, + "learning_rate": 0.0001, + "loss": 3.7322, + "loss/crossentropy": 2.2936912298202516, + "loss/hidden": 3.4234375, + "loss/incoh": 0.0, + "loss/logits": 0.3990582287311554, + "loss/reg": 0.0, + "step": 3020 + }, + { + "epoch": 0.01993421052631579, + "grad_norm": 2.9375, + "grad_norm_var": 0.07664388020833333, + "learning_rate": 0.0001, + "loss": 3.7405, + "loss/crossentropy": 2.1873862028121946, + "loss/hidden": 3.4703125, + "loss/incoh": 0.0, + "loss/logits": 0.4286995857954025, + "loss/reg": 0.0, + "step": 3030 + }, + { + "epoch": 0.02, + "grad_norm": 2.53125, + "grad_norm_var": 0.3075480143229167, + "learning_rate": 0.0001, + "loss": 3.7681, + "loss/crossentropy": 2.0712085247039793, + "loss/hidden": 3.096875, + "loss/incoh": 0.0, + "loss/logits": 0.2994038611650467, + "loss/reg": 0.0, + "step": 3040 + }, + { + "epoch": 0.02006578947368421, + "grad_norm": 2.734375, + "grad_norm_var": 0.6994049072265625, + "learning_rate": 0.0001, + "loss": 3.7519, + "loss/crossentropy": 2.155745780467987, + "loss/hidden": 3.2890625, + "loss/incoh": 0.0, + "loss/logits": 0.3106741845607758, + "loss/reg": 0.0, + "step": 3050 + }, + { + "epoch": 0.02013157894736842, + "grad_norm": 3.53125, + "grad_norm_var": 0.177978515625, + "learning_rate": 0.0001, + "loss": 3.7618, + "loss/crossentropy": 2.1236796349287035, + "loss/hidden": 3.2046875, + "loss/incoh": 0.0, + "loss/logits": 0.3017591178417206, + "loss/reg": 0.0, + "step": 3060 + }, + { + "epoch": 0.02019736842105263, + "grad_norm": 3.265625, + "grad_norm_var": 0.16617431640625, + "learning_rate": 0.0001, + "loss": 3.7225, + "loss/crossentropy": 2.4667672872543336, + "loss/hidden": 3.3328125, + "loss/incoh": 0.0, + "loss/logits": 0.3858541399240494, + "loss/reg": 0.0, + "step": 3070 + }, + { + "epoch": 0.02026315789473684, + "grad_norm": 2.703125, + "grad_norm_var": 2.2265110394707968e+17, + "learning_rate": 0.0001, + "loss": 3.8077, + "loss/crossentropy": 2.53361736536026, + "loss/hidden": 3.121875, + "loss/incoh": 0.0, + "loss/logits": 0.3336446687579155, + "loss/reg": 0.0, + "step": 3080 + }, + { + "epoch": 0.020328947368421054, + "grad_norm": 2.34375, + "grad_norm_var": 2.2265110387924992e+17, + "learning_rate": 0.0001, + "loss": 3.7067, + "loss/crossentropy": 2.3802372574806214, + "loss/hidden": 3.2609375, + "loss/incoh": 0.0, + "loss/logits": 0.3561277031898499, + "loss/reg": 0.0, + "step": 3090 + }, + { + "epoch": 0.020394736842105264, + "grad_norm": 4.53125, + "grad_norm_var": 1.25572509765625, + "learning_rate": 0.0001, + "loss": 3.8816, + "loss/crossentropy": 2.8750504910945893, + "loss/hidden": 3.728125, + "loss/incoh": 0.0, + "loss/logits": 0.37489808425307275, + "loss/reg": 0.0, + "step": 3100 + }, + { + "epoch": 0.020460526315789474, + "grad_norm": 2.640625, + "grad_norm_var": 0.77646484375, + "learning_rate": 0.0001, + "loss": 3.6678, + "loss/crossentropy": 2.258682942390442, + "loss/hidden": 3.4890625, + "loss/incoh": 0.0, + "loss/logits": 0.3497451141476631, + "loss/reg": 0.0, + "step": 3110 + }, + { + "epoch": 0.020526315789473684, + "grad_norm": 2.8125, + "grad_norm_var": 0.047412109375, + "learning_rate": 0.0001, + "loss": 3.6585, + "loss/crossentropy": 2.3022143959999086, + "loss/hidden": 3.375, + "loss/incoh": 0.0, + "loss/logits": 0.37089207768440247, + "loss/reg": 0.0, + "step": 3120 + }, + { + "epoch": 0.020592105263157894, + "grad_norm": 2.921875, + "grad_norm_var": 1.5189036051432292, + "learning_rate": 0.0001, + "loss": 3.7862, + "loss/crossentropy": 2.6240602493286134, + "loss/hidden": 3.35625, + "loss/incoh": 0.0, + "loss/logits": 0.407352888584137, + "loss/reg": 0.0, + "step": 3130 + }, + { + "epoch": 0.020657894736842104, + "grad_norm": 3.71875, + "grad_norm_var": 1.5723052978515626, + "learning_rate": 0.0001, + "loss": 3.7355, + "loss/crossentropy": 2.290934902429581, + "loss/hidden": 3.3125, + "loss/incoh": 0.0, + "loss/logits": 0.3808047503232956, + "loss/reg": 0.0, + "step": 3140 + }, + { + "epoch": 0.020723684210526314, + "grad_norm": 2.84375, + "grad_norm_var": 0.5337961832682292, + "learning_rate": 0.0001, + "loss": 3.7778, + "loss/crossentropy": 2.485193204879761, + "loss/hidden": 3.515625, + "loss/incoh": 0.0, + "loss/logits": 0.4543539136648178, + "loss/reg": 0.0, + "step": 3150 + }, + { + "epoch": 0.020789473684210528, + "grad_norm": 2.8125, + "grad_norm_var": 0.3465728759765625, + "learning_rate": 0.0001, + "loss": 3.7506, + "loss/crossentropy": 2.557511067390442, + "loss/hidden": 3.3953125, + "loss/incoh": 0.0, + "loss/logits": 0.4272035837173462, + "loss/reg": 0.0, + "step": 3160 + }, + { + "epoch": 0.020855263157894738, + "grad_norm": 3.0625, + "grad_norm_var": 0.20754801432291667, + "learning_rate": 0.0001, + "loss": 3.6316, + "loss/crossentropy": 2.310171937942505, + "loss/hidden": 3.184375, + "loss/incoh": 0.0, + "loss/logits": 0.32435240745544436, + "loss/reg": 0.0, + "step": 3170 + }, + { + "epoch": 0.020921052631578948, + "grad_norm": 2.578125, + "grad_norm_var": 0.833984375, + "learning_rate": 0.0001, + "loss": 3.7324, + "loss/crossentropy": 2.5518528699874876, + "loss/hidden": 3.3765625, + "loss/incoh": 0.0, + "loss/logits": 0.3816035658121109, + "loss/reg": 0.0, + "step": 3180 + }, + { + "epoch": 0.020986842105263158, + "grad_norm": 2.984375, + "grad_norm_var": 2.3063795635792774e+17, + "learning_rate": 0.0001, + "loss": 3.8813, + "loss/crossentropy": 2.3198139667510986, + "loss/hidden": 3.38125, + "loss/incoh": 0.0, + "loss/logits": 0.3700142025947571, + "loss/reg": 0.0, + "step": 3190 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 2.78125, + "grad_norm_var": 0.10243733723958333, + "learning_rate": 0.0001, + "loss": 3.6063, + "loss/crossentropy": 2.545992207527161, + "loss/hidden": 3.2875, + "loss/incoh": 0.0, + "loss/logits": 0.3494896024465561, + "loss/reg": 0.0, + "step": 3200 + }, + { + "epoch": 0.021118421052631578, + "grad_norm": 2.8125, + "grad_norm_var": 0.06311442057291666, + "learning_rate": 0.0001, + "loss": 3.6901, + "loss/crossentropy": 2.330699014663696, + "loss/hidden": 3.3625, + "loss/incoh": 0.0, + "loss/logits": 0.34815158843994143, + "loss/reg": 0.0, + "step": 3210 + }, + { + "epoch": 0.021184210526315788, + "grad_norm": 3.25, + "grad_norm_var": 0.3830718994140625, + "learning_rate": 0.0001, + "loss": 3.7327, + "loss/crossentropy": 2.489699113368988, + "loss/hidden": 3.325, + "loss/incoh": 0.0, + "loss/logits": 0.337864650785923, + "loss/reg": 0.0, + "step": 3220 + }, + { + "epoch": 0.02125, + "grad_norm": 2.828125, + "grad_norm_var": 0.38974609375, + "learning_rate": 0.0001, + "loss": 3.6871, + "loss/crossentropy": 2.3864477396011354, + "loss/hidden": 3.3859375, + "loss/incoh": 0.0, + "loss/logits": 0.4103096604347229, + "loss/reg": 0.0, + "step": 3230 + }, + { + "epoch": 0.02131578947368421, + "grad_norm": 2.703125, + "grad_norm_var": 0.05084228515625, + "learning_rate": 0.0001, + "loss": 3.6458, + "loss/crossentropy": 2.4074989527463915, + "loss/hidden": 3.375, + "loss/incoh": 0.0, + "loss/logits": 0.35124915838241577, + "loss/reg": 0.0, + "step": 3240 + }, + { + "epoch": 0.02138157894736842, + "grad_norm": 3.453125, + "grad_norm_var": 0.08498942057291667, + "learning_rate": 0.0001, + "loss": 3.7083, + "loss/crossentropy": 2.524831974506378, + "loss/hidden": 3.2515625, + "loss/incoh": 0.0, + "loss/logits": 0.368264502286911, + "loss/reg": 0.0, + "step": 3250 + }, + { + "epoch": 0.02144736842105263, + "grad_norm": 2.78125, + "grad_norm_var": 0.42135009765625, + "learning_rate": 0.0001, + "loss": 3.7602, + "loss/crossentropy": 2.166905391216278, + "loss/hidden": 3.371875, + "loss/incoh": 0.0, + "loss/logits": 0.3093524396419525, + "loss/reg": 0.0, + "step": 3260 + }, + { + "epoch": 0.02151315789473684, + "grad_norm": 2.546875, + "grad_norm_var": 0.4861724853515625, + "learning_rate": 0.0001, + "loss": 3.6738, + "loss/crossentropy": 2.2662750601768495, + "loss/hidden": 3.1578125, + "loss/incoh": 0.0, + "loss/logits": 0.318782140314579, + "loss/reg": 0.0, + "step": 3270 + }, + { + "epoch": 0.02157894736842105, + "grad_norm": 2.875, + "grad_norm_var": 0.13782145182291666, + "learning_rate": 0.0001, + "loss": 3.7461, + "loss/crossentropy": 2.0462193369865416, + "loss/hidden": 3.2859375, + "loss/incoh": 0.0, + "loss/logits": 0.30782590508461, + "loss/reg": 0.0, + "step": 3280 + }, + { + "epoch": 0.021644736842105262, + "grad_norm": 2.875, + "grad_norm_var": 0.11330973307291667, + "learning_rate": 0.0001, + "loss": 3.7008, + "loss/crossentropy": 2.5261476397514344, + "loss/hidden": 3.2734375, + "loss/incoh": 0.0, + "loss/logits": 0.3591727793216705, + "loss/reg": 0.0, + "step": 3290 + }, + { + "epoch": 0.021710526315789475, + "grad_norm": 3.0, + "grad_norm_var": 0.1515533447265625, + "learning_rate": 0.0001, + "loss": 3.751, + "loss/crossentropy": 2.569035267829895, + "loss/hidden": 3.3140625, + "loss/incoh": 0.0, + "loss/logits": 0.3920204371213913, + "loss/reg": 0.0, + "step": 3300 + }, + { + "epoch": 0.021776315789473685, + "grad_norm": 2.96875, + "grad_norm_var": 0.09117431640625, + "learning_rate": 0.0001, + "loss": 3.6385, + "loss/crossentropy": 2.5647154331207274, + "loss/hidden": 3.2078125, + "loss/incoh": 0.0, + "loss/logits": 0.34364444613456724, + "loss/reg": 0.0, + "step": 3310 + }, + { + "epoch": 0.021842105263157895, + "grad_norm": 2.5625, + "grad_norm_var": 1.6411692301432292, + "learning_rate": 0.0001, + "loss": 3.6335, + "loss/crossentropy": 2.427185571193695, + "loss/hidden": 3.30625, + "loss/incoh": 0.0, + "loss/logits": 0.3427447766065598, + "loss/reg": 0.0, + "step": 3320 + }, + { + "epoch": 0.021907894736842105, + "grad_norm": 2.390625, + "grad_norm_var": 0.11515299479166667, + "learning_rate": 0.0001, + "loss": 3.6807, + "loss/crossentropy": 2.1253209471702577, + "loss/hidden": 3.428125, + "loss/incoh": 0.0, + "loss/logits": 0.36423676908016206, + "loss/reg": 0.0, + "step": 3330 + }, + { + "epoch": 0.021973684210526315, + "grad_norm": 2.78125, + "grad_norm_var": 0.046956380208333336, + "learning_rate": 0.0001, + "loss": 3.5711, + "loss/crossentropy": 2.3783676266670226, + "loss/hidden": 3.1703125, + "loss/incoh": 0.0, + "loss/logits": 0.3118838146328926, + "loss/reg": 0.0, + "step": 3340 + }, + { + "epoch": 0.022039473684210525, + "grad_norm": 2.953125, + "grad_norm_var": 0.060791015625, + "learning_rate": 0.0001, + "loss": 3.6252, + "loss/crossentropy": 2.350738251209259, + "loss/hidden": 3.1625, + "loss/incoh": 0.0, + "loss/logits": 0.2957428440451622, + "loss/reg": 0.0, + "step": 3350 + }, + { + "epoch": 0.022105263157894735, + "grad_norm": 2.4375, + "grad_norm_var": 0.09226888020833333, + "learning_rate": 0.0001, + "loss": 3.7233, + "loss/crossentropy": 2.5446563720703126, + "loss/hidden": 3.0671875, + "loss/incoh": 0.0, + "loss/logits": 0.3123622477054596, + "loss/reg": 0.0, + "step": 3360 + }, + { + "epoch": 0.02217105263157895, + "grad_norm": 2.828125, + "grad_norm_var": 0.3047108968098958, + "learning_rate": 0.0001, + "loss": 3.6972, + "loss/crossentropy": 2.33814697265625, + "loss/hidden": 3.20625, + "loss/incoh": 0.0, + "loss/logits": 0.30704180896282196, + "loss/reg": 0.0, + "step": 3370 + }, + { + "epoch": 0.02223684210526316, + "grad_norm": 3.03125, + "grad_norm_var": 0.17353413899739584, + "learning_rate": 0.0001, + "loss": 3.6799, + "loss/crossentropy": 2.2355513691902162, + "loss/hidden": 3.390625, + "loss/incoh": 0.0, + "loss/logits": 0.3377602517604828, + "loss/reg": 0.0, + "step": 3380 + }, + { + "epoch": 0.02230263157894737, + "grad_norm": 2.609375, + "grad_norm_var": 6.300797526041666, + "learning_rate": 0.0001, + "loss": 3.7636, + "loss/crossentropy": 2.3003466069698333, + "loss/hidden": 3.396875, + "loss/incoh": 0.0, + "loss/logits": 0.3391520828008652, + "loss/reg": 0.0, + "step": 3390 + }, + { + "epoch": 0.02236842105263158, + "grad_norm": 3.34375, + "grad_norm_var": 0.0972564697265625, + "learning_rate": 0.0001, + "loss": 3.658, + "loss/crossentropy": 2.3254055261611937, + "loss/hidden": 3.246875, + "loss/incoh": 0.0, + "loss/logits": 0.3125807404518127, + "loss/reg": 0.0, + "step": 3400 + }, + { + "epoch": 0.02243421052631579, + "grad_norm": 2.59375, + "grad_norm_var": 15.84599609375, + "learning_rate": 0.0001, + "loss": 3.7692, + "loss/crossentropy": 2.753233790397644, + "loss/hidden": 3.1546875, + "loss/incoh": 0.0, + "loss/logits": 0.3326481133699417, + "loss/reg": 0.0, + "step": 3410 + }, + { + "epoch": 0.0225, + "grad_norm": 4.875, + "grad_norm_var": 1.10299072265625, + "learning_rate": 0.0001, + "loss": 3.7531, + "loss/crossentropy": 2.282338631153107, + "loss/hidden": 3.3328125, + "loss/incoh": 0.0, + "loss/logits": 0.3607694834470749, + "loss/reg": 0.0, + "step": 3420 + }, + { + "epoch": 0.02256578947368421, + "grad_norm": 2.71875, + "grad_norm_var": 0.626123046875, + "learning_rate": 0.0001, + "loss": 3.8275, + "loss/crossentropy": 2.5421807527542115, + "loss/hidden": 3.425, + "loss/incoh": 0.0, + "loss/logits": 0.5308041572570801, + "loss/reg": 0.0, + "step": 3430 + }, + { + "epoch": 0.022631578947368423, + "grad_norm": 2.78125, + "grad_norm_var": 0.40051676432291666, + "learning_rate": 0.0001, + "loss": 3.7523, + "loss/crossentropy": 2.541818845272064, + "loss/hidden": 3.3625, + "loss/incoh": 0.0, + "loss/logits": 0.359403657913208, + "loss/reg": 0.0, + "step": 3440 + }, + { + "epoch": 0.022697368421052633, + "grad_norm": 2.90625, + "grad_norm_var": 0.35347391764322916, + "learning_rate": 0.0001, + "loss": 3.6627, + "loss/crossentropy": 2.5443089246749877, + "loss/hidden": 3.2125, + "loss/incoh": 0.0, + "loss/logits": 0.32425140738487246, + "loss/reg": 0.0, + "step": 3450 + }, + { + "epoch": 0.022763157894736843, + "grad_norm": 3.296875, + "grad_norm_var": 0.09103190104166667, + "learning_rate": 0.0001, + "loss": 3.6541, + "loss/crossentropy": 2.4523619592189787, + "loss/hidden": 3.4453125, + "loss/incoh": 0.0, + "loss/logits": 0.3741248741745949, + "loss/reg": 0.0, + "step": 3460 + }, + { + "epoch": 0.022828947368421053, + "grad_norm": 5.375, + "grad_norm_var": 1.3390533447265625, + "learning_rate": 0.0001, + "loss": 3.7552, + "loss/crossentropy": 2.282164466381073, + "loss/hidden": 3.3984375, + "loss/incoh": 0.0, + "loss/logits": 0.32355323880910875, + "loss/reg": 0.0, + "step": 3470 + }, + { + "epoch": 0.022894736842105263, + "grad_norm": 2.859375, + "grad_norm_var": 0.5425608317057292, + "learning_rate": 0.0001, + "loss": 3.6438, + "loss/crossentropy": 2.584494400024414, + "loss/hidden": 3.325, + "loss/incoh": 0.0, + "loss/logits": 0.3407262712717056, + "loss/reg": 0.0, + "step": 3480 + }, + { + "epoch": 0.022960526315789473, + "grad_norm": 2.796875, + "grad_norm_var": 11.746613566080729, + "learning_rate": 0.0001, + "loss": 3.8186, + "loss/crossentropy": 2.781242084503174, + "loss/hidden": 3.2421875, + "loss/incoh": 0.0, + "loss/logits": 0.516629433631897, + "loss/reg": 0.0, + "step": 3490 + }, + { + "epoch": 0.023026315789473683, + "grad_norm": 2.96875, + "grad_norm_var": 0.38925374348958336, + "learning_rate": 0.0001, + "loss": 3.6824, + "loss/crossentropy": 2.6085395932197573, + "loss/hidden": 3.55625, + "loss/incoh": 0.0, + "loss/logits": 0.40641255080699923, + "loss/reg": 0.0, + "step": 3500 + }, + { + "epoch": 0.023092105263157896, + "grad_norm": 2.375, + "grad_norm_var": 0.0653228759765625, + "learning_rate": 0.0001, + "loss": 3.6125, + "loss/crossentropy": 2.143614149093628, + "loss/hidden": 3.2953125, + "loss/incoh": 0.0, + "loss/logits": 0.32446493208408356, + "loss/reg": 0.0, + "step": 3510 + }, + { + "epoch": 0.023157894736842106, + "grad_norm": 3.78125, + "grad_norm_var": 0.24351806640625, + "learning_rate": 0.0001, + "loss": 3.7227, + "loss/crossentropy": 2.4581753849983214, + "loss/hidden": 3.5296875, + "loss/incoh": 0.0, + "loss/logits": 0.4273629605770111, + "loss/reg": 0.0, + "step": 3520 + }, + { + "epoch": 0.023223684210526317, + "grad_norm": 2.546875, + "grad_norm_var": 0.3322987874348958, + "learning_rate": 0.0001, + "loss": 3.6619, + "loss/crossentropy": 2.452810299396515, + "loss/hidden": 3.41875, + "loss/incoh": 0.0, + "loss/logits": 0.364976304769516, + "loss/reg": 0.0, + "step": 3530 + }, + { + "epoch": 0.023289473684210527, + "grad_norm": 2.546875, + "grad_norm_var": 0.050780232747395834, + "learning_rate": 0.0001, + "loss": 3.5957, + "loss/crossentropy": 2.6891199111938477, + "loss/hidden": 3.0734375, + "loss/incoh": 0.0, + "loss/logits": 0.3124883592128754, + "loss/reg": 0.0, + "step": 3540 + }, + { + "epoch": 0.023355263157894737, + "grad_norm": 2.671875, + "grad_norm_var": 0.08567301432291667, + "learning_rate": 0.0001, + "loss": 3.7773, + "loss/crossentropy": 2.565224659442902, + "loss/hidden": 3.1421875, + "loss/incoh": 0.0, + "loss/logits": 0.30210830420255663, + "loss/reg": 0.0, + "step": 3550 + }, + { + "epoch": 0.023421052631578947, + "grad_norm": 3.921875, + "grad_norm_var": 0.20045166015625, + "learning_rate": 0.0001, + "loss": 3.6777, + "loss/crossentropy": 2.3339913129806518, + "loss/hidden": 3.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.3782587692141533, + "loss/reg": 0.0, + "step": 3560 + }, + { + "epoch": 0.023486842105263157, + "grad_norm": 6.375, + "grad_norm_var": 7.792964680989583, + "learning_rate": 0.0001, + "loss": 3.9088, + "loss/crossentropy": 2.305986249446869, + "loss/hidden": 3.2078125, + "loss/incoh": 0.0, + "loss/logits": 0.3260203331708908, + "loss/reg": 0.0, + "step": 3570 + }, + { + "epoch": 0.023552631578947367, + "grad_norm": 2.5, + "grad_norm_var": 1.226398722330729, + "learning_rate": 0.0001, + "loss": 3.5858, + "loss/crossentropy": 2.5068121433258055, + "loss/hidden": 3.29375, + "loss/incoh": 0.0, + "loss/logits": 0.38687763810157777, + "loss/reg": 0.0, + "step": 3580 + }, + { + "epoch": 0.02361842105263158, + "grad_norm": 2.6875, + "grad_norm_var": 0.08307291666666666, + "learning_rate": 0.0001, + "loss": 3.5749, + "loss/crossentropy": 2.440618324279785, + "loss/hidden": 3.246875, + "loss/incoh": 0.0, + "loss/logits": 0.3219692587852478, + "loss/reg": 0.0, + "step": 3590 + }, + { + "epoch": 0.02368421052631579, + "grad_norm": 2.921875, + "grad_norm_var": 0.2329742431640625, + "learning_rate": 0.0001, + "loss": 3.7091, + "loss/crossentropy": 2.3230647802352906, + "loss/hidden": 3.4234375, + "loss/incoh": 0.0, + "loss/logits": 0.3276051238179207, + "loss/reg": 0.0, + "step": 3600 + }, + { + "epoch": 0.02375, + "grad_norm": 3.125, + "grad_norm_var": 0.06494038899739583, + "learning_rate": 0.0001, + "loss": 3.6957, + "loss/crossentropy": 2.549039614200592, + "loss/hidden": 3.3265625, + "loss/incoh": 0.0, + "loss/logits": 0.4018037021160126, + "loss/reg": 0.0, + "step": 3610 + }, + { + "epoch": 0.02381578947368421, + "grad_norm": 2.546875, + "grad_norm_var": 0.05314839680989583, + "learning_rate": 0.0001, + "loss": 3.6018, + "loss/crossentropy": 2.5681329488754274, + "loss/hidden": 3.3046875, + "loss/incoh": 0.0, + "loss/logits": 0.3874122858047485, + "loss/reg": 0.0, + "step": 3620 + }, + { + "epoch": 0.02388157894736842, + "grad_norm": 2.640625, + "grad_norm_var": 0.30606180826822915, + "learning_rate": 0.0001, + "loss": 3.6534, + "loss/crossentropy": 2.224066364765167, + "loss/hidden": 3.41875, + "loss/incoh": 0.0, + "loss/logits": 0.3868778973817825, + "loss/reg": 0.0, + "step": 3630 + }, + { + "epoch": 0.02394736842105263, + "grad_norm": 2.5, + "grad_norm_var": 0.44580078125, + "learning_rate": 0.0001, + "loss": 3.7632, + "loss/crossentropy": 2.313184142112732, + "loss/hidden": 3.29375, + "loss/incoh": 0.0, + "loss/logits": 0.35929109454154967, + "loss/reg": 0.0, + "step": 3640 + }, + { + "epoch": 0.02401315789473684, + "grad_norm": 4.65625, + "grad_norm_var": 0.43798421223958334, + "learning_rate": 0.0001, + "loss": 3.6503, + "loss/crossentropy": 2.483446490764618, + "loss/hidden": 3.2515625, + "loss/incoh": 0.0, + "loss/logits": 0.324999064207077, + "loss/reg": 0.0, + "step": 3650 + }, + { + "epoch": 0.024078947368421054, + "grad_norm": 2.71875, + "grad_norm_var": 0.271875, + "learning_rate": 0.0001, + "loss": 3.7069, + "loss/crossentropy": 2.2979444444179533, + "loss/hidden": 3.2421875, + "loss/incoh": 0.0, + "loss/logits": 0.3168840616941452, + "loss/reg": 0.0, + "step": 3660 + }, + { + "epoch": 0.024144736842105264, + "grad_norm": 2.65625, + "grad_norm_var": 0.0767242431640625, + "learning_rate": 0.0001, + "loss": 3.6808, + "loss/crossentropy": 2.4076380014419554, + "loss/hidden": 3.55625, + "loss/incoh": 0.0, + "loss/logits": 0.3966252237558365, + "loss/reg": 0.0, + "step": 3670 + }, + { + "epoch": 0.024210526315789474, + "grad_norm": 3.109375, + "grad_norm_var": 0.06747639973958333, + "learning_rate": 0.0001, + "loss": 3.7852, + "loss/crossentropy": 2.6107199430465697, + "loss/hidden": 3.3359375, + "loss/incoh": 0.0, + "loss/logits": 0.3948290854692459, + "loss/reg": 0.0, + "step": 3680 + }, + { + "epoch": 0.024276315789473684, + "grad_norm": 3.765625, + "grad_norm_var": 0.13434244791666666, + "learning_rate": 0.0001, + "loss": 3.6048, + "loss/crossentropy": 2.5476237654685976, + "loss/hidden": 3.209375, + "loss/incoh": 0.0, + "loss/logits": 0.352567557990551, + "loss/reg": 0.0, + "step": 3690 + }, + { + "epoch": 0.024342105263157894, + "grad_norm": 2.34375, + "grad_norm_var": 0.6126139322916667, + "learning_rate": 0.0001, + "loss": 3.7034, + "loss/crossentropy": 2.435033369064331, + "loss/hidden": 3.275, + "loss/incoh": 0.0, + "loss/logits": 0.37708690464496614, + "loss/reg": 0.0, + "step": 3700 + }, + { + "epoch": 0.024407894736842104, + "grad_norm": 3.03125, + "grad_norm_var": 0.7496002197265625, + "learning_rate": 0.0001, + "loss": 3.6696, + "loss/crossentropy": 2.550716495513916, + "loss/hidden": 3.3328125, + "loss/incoh": 0.0, + "loss/logits": 0.38517349362373354, + "loss/reg": 0.0, + "step": 3710 + }, + { + "epoch": 0.024473684210526314, + "grad_norm": 2.296875, + "grad_norm_var": 0.5516886393229167, + "learning_rate": 0.0001, + "loss": 3.5623, + "loss/crossentropy": 2.3806477397680283, + "loss/hidden": 3.153125, + "loss/incoh": 0.0, + "loss/logits": 0.3013214536011219, + "loss/reg": 0.0, + "step": 3720 + }, + { + "epoch": 0.024539473684210528, + "grad_norm": 3.1875, + "grad_norm_var": 0.4890207926432292, + "learning_rate": 0.0001, + "loss": 3.6412, + "loss/crossentropy": 2.3859506011009217, + "loss/hidden": 3.428125, + "loss/incoh": 0.0, + "loss/logits": 0.49241943359375, + "loss/reg": 0.0, + "step": 3730 + }, + { + "epoch": 0.024605263157894738, + "grad_norm": 2.6875, + "grad_norm_var": 0.2377838134765625, + "learning_rate": 0.0001, + "loss": 3.5895, + "loss/crossentropy": 2.141975212097168, + "loss/hidden": 3.2140625, + "loss/incoh": 0.0, + "loss/logits": 0.3277399495244026, + "loss/reg": 0.0, + "step": 3740 + }, + { + "epoch": 0.024671052631578948, + "grad_norm": 3.140625, + "grad_norm_var": 0.12506510416666666, + "learning_rate": 0.0001, + "loss": 3.6233, + "loss/crossentropy": 2.4696611404418944, + "loss/hidden": 3.3265625, + "loss/incoh": 0.0, + "loss/logits": 0.3837138593196869, + "loss/reg": 0.0, + "step": 3750 + }, + { + "epoch": 0.024736842105263158, + "grad_norm": 2.34375, + "grad_norm_var": 2.6720540364583334, + "learning_rate": 0.0001, + "loss": 3.7719, + "loss/crossentropy": 2.494647514820099, + "loss/hidden": 3.275, + "loss/incoh": 0.0, + "loss/logits": 0.37584047913551333, + "loss/reg": 0.0, + "step": 3760 + }, + { + "epoch": 0.024802631578947368, + "grad_norm": 2.53125, + "grad_norm_var": 0.63804931640625, + "learning_rate": 0.0001, + "loss": 3.676, + "loss/crossentropy": 2.5242549180984497, + "loss/hidden": 3.2328125, + "loss/incoh": 0.0, + "loss/logits": 0.39244888722896576, + "loss/reg": 0.0, + "step": 3770 + }, + { + "epoch": 0.024868421052631578, + "grad_norm": 3.03125, + "grad_norm_var": 0.3232167561848958, + "learning_rate": 0.0001, + "loss": 3.5206, + "loss/crossentropy": 2.2167584180831907, + "loss/hidden": 3.109375, + "loss/incoh": 0.0, + "loss/logits": 0.31152922809123995, + "loss/reg": 0.0, + "step": 3780 + }, + { + "epoch": 0.024934210526315788, + "grad_norm": 2.640625, + "grad_norm_var": 0.8850331624348958, + "learning_rate": 0.0001, + "loss": 3.6831, + "loss/crossentropy": 2.406609225273132, + "loss/hidden": 3.2734375, + "loss/incoh": 0.0, + "loss/logits": 0.3529895097017288, + "loss/reg": 0.0, + "step": 3790 + }, + { + "epoch": 0.025, + "grad_norm": 2.390625, + "grad_norm_var": 0.8739491780598958, + "learning_rate": 0.0001, + "loss": 3.6024, + "loss/crossentropy": 2.270749258995056, + "loss/hidden": 3.3390625, + "loss/incoh": 0.0, + "loss/logits": 0.32701381742954255, + "loss/reg": 0.0, + "step": 3800 + }, + { + "epoch": 0.02506578947368421, + "grad_norm": 2.625, + "grad_norm_var": 0.06500651041666666, + "learning_rate": 0.0001, + "loss": 3.6037, + "loss/crossentropy": 2.3936703205108643, + "loss/hidden": 3.3078125, + "loss/incoh": 0.0, + "loss/logits": 0.4090299874544144, + "loss/reg": 0.0, + "step": 3810 + }, + { + "epoch": 0.02513157894736842, + "grad_norm": 2.5625, + "grad_norm_var": 0.3386301676432292, + "learning_rate": 0.0001, + "loss": 3.8203, + "loss/crossentropy": 2.057803177833557, + "loss/hidden": 3.390625, + "loss/incoh": 0.0, + "loss/logits": 0.3018879994750023, + "loss/reg": 0.0, + "step": 3820 + }, + { + "epoch": 0.02519736842105263, + "grad_norm": 10.375, + "grad_norm_var": 3.7443756103515624, + "learning_rate": 0.0001, + "loss": 3.5987, + "loss/crossentropy": 2.558328187465668, + "loss/hidden": 3.3671875, + "loss/incoh": 0.0, + "loss/logits": 0.40375421941280365, + "loss/reg": 0.0, + "step": 3830 + }, + { + "epoch": 0.02526315789473684, + "grad_norm": 2.59375, + "grad_norm_var": 4.824331665039063, + "learning_rate": 0.0001, + "loss": 3.6109, + "loss/crossentropy": 2.4123119592666624, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.3224829614162445, + "loss/reg": 0.0, + "step": 3840 + }, + { + "epoch": 0.02532894736842105, + "grad_norm": 2.921875, + "grad_norm_var": 1.4220611572265625, + "learning_rate": 0.0001, + "loss": 3.6503, + "loss/crossentropy": 2.4460156679153444, + "loss/hidden": 3.3140625, + "loss/incoh": 0.0, + "loss/logits": 0.37439659237861633, + "loss/reg": 0.0, + "step": 3850 + }, + { + "epoch": 0.02539473684210526, + "grad_norm": 2.828125, + "grad_norm_var": 0.24488525390625, + "learning_rate": 0.0001, + "loss": 3.6799, + "loss/crossentropy": 2.471416544914246, + "loss/hidden": 3.265625, + "loss/incoh": 0.0, + "loss/logits": 0.3233081191778183, + "loss/reg": 0.0, + "step": 3860 + }, + { + "epoch": 0.025460526315789475, + "grad_norm": 4.6875, + "grad_norm_var": 2.005939737955729, + "learning_rate": 0.0001, + "loss": 3.7099, + "loss/crossentropy": 2.46109699010849, + "loss/hidden": 3.178125, + "loss/incoh": 0.0, + "loss/logits": 0.31833461821079256, + "loss/reg": 0.0, + "step": 3870 + }, + { + "epoch": 0.025526315789473685, + "grad_norm": 2.328125, + "grad_norm_var": 0.4576568603515625, + "learning_rate": 0.0001, + "loss": 3.62, + "loss/crossentropy": 2.4385437607765197, + "loss/hidden": 3.1953125, + "loss/incoh": 0.0, + "loss/logits": 0.3444881528615952, + "loss/reg": 0.0, + "step": 3880 + }, + { + "epoch": 0.025592105263157895, + "grad_norm": 2.625, + "grad_norm_var": 0.07125244140625, + "learning_rate": 0.0001, + "loss": 3.5814, + "loss/crossentropy": 2.464188981056213, + "loss/hidden": 3.2640625, + "loss/incoh": 0.0, + "loss/logits": 0.4162255361676216, + "loss/reg": 0.0, + "step": 3890 + }, + { + "epoch": 0.025657894736842105, + "grad_norm": 3.203125, + "grad_norm_var": 0.13810221354166666, + "learning_rate": 0.0001, + "loss": 3.5963, + "loss/crossentropy": 2.5776121497154234, + "loss/hidden": 3.1296875, + "loss/incoh": 0.0, + "loss/logits": 0.351967790722847, + "loss/reg": 0.0, + "step": 3900 + }, + { + "epoch": 0.025723684210526315, + "grad_norm": 2.203125, + "grad_norm_var": 0.13177083333333334, + "learning_rate": 0.0001, + "loss": 3.6137, + "loss/crossentropy": 2.4709773540496824, + "loss/hidden": 3.28125, + "loss/incoh": 0.0, + "loss/logits": 0.3811213612556458, + "loss/reg": 0.0, + "step": 3910 + }, + { + "epoch": 0.025789473684210525, + "grad_norm": 2.671875, + "grad_norm_var": 0.047591145833333334, + "learning_rate": 0.0001, + "loss": 3.5243, + "loss/crossentropy": 2.3289324045181274, + "loss/hidden": 3.240625, + "loss/incoh": 0.0, + "loss/logits": 0.32472735941410064, + "loss/reg": 0.0, + "step": 3920 + }, + { + "epoch": 0.025855263157894735, + "grad_norm": 3.359375, + "grad_norm_var": 0.07144266764322917, + "learning_rate": 0.0001, + "loss": 3.5618, + "loss/crossentropy": 2.151239442825317, + "loss/hidden": 3.15, + "loss/incoh": 0.0, + "loss/logits": 0.3790448889136314, + "loss/reg": 0.0, + "step": 3930 + }, + { + "epoch": 0.02592105263157895, + "grad_norm": 2.78125, + "grad_norm_var": 0.19239908854166668, + "learning_rate": 0.0001, + "loss": 3.5689, + "loss/crossentropy": 2.281427323818207, + "loss/hidden": 3.321875, + "loss/incoh": 0.0, + "loss/logits": 0.32773717790842055, + "loss/reg": 0.0, + "step": 3940 + }, + { + "epoch": 0.02598684210526316, + "grad_norm": 2.765625, + "grad_norm_var": 0.07558186848958333, + "learning_rate": 0.0001, + "loss": 3.5047, + "loss/crossentropy": 2.2366716623306275, + "loss/hidden": 3.36875, + "loss/incoh": 0.0, + "loss/logits": 0.3462549954652786, + "loss/reg": 0.0, + "step": 3950 + }, + { + "epoch": 0.02605263157894737, + "grad_norm": 2.46875, + "grad_norm_var": 0.0950347900390625, + "learning_rate": 0.0001, + "loss": 3.6604, + "loss/crossentropy": 2.6565569043159485, + "loss/hidden": 3.3375, + "loss/incoh": 0.0, + "loss/logits": 0.38759642243385317, + "loss/reg": 0.0, + "step": 3960 + }, + { + "epoch": 0.02611842105263158, + "grad_norm": 2.328125, + "grad_norm_var": 0.04934895833333333, + "learning_rate": 0.0001, + "loss": 3.4984, + "loss/crossentropy": 2.3093223094940187, + "loss/hidden": 3.2484375, + "loss/incoh": 0.0, + "loss/logits": 0.3547346442937851, + "loss/reg": 0.0, + "step": 3970 + }, + { + "epoch": 0.02618421052631579, + "grad_norm": 2.640625, + "grad_norm_var": 0.6375284830729167, + "learning_rate": 0.0001, + "loss": 3.6552, + "loss/crossentropy": 2.5669564962387086, + "loss/hidden": 3.640625, + "loss/incoh": 0.0, + "loss/logits": 0.3741306886076927, + "loss/reg": 0.0, + "step": 3980 + }, + { + "epoch": 0.02625, + "grad_norm": 3.328125, + "grad_norm_var": 0.1037994384765625, + "learning_rate": 0.0001, + "loss": 3.707, + "loss/crossentropy": 2.255212366580963, + "loss/hidden": 3.4078125, + "loss/incoh": 0.0, + "loss/logits": 0.36638626754283904, + "loss/reg": 0.0, + "step": 3990 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 3.484375, + "grad_norm_var": 0.1186431884765625, + "learning_rate": 0.0001, + "loss": 3.5907, + "loss/crossentropy": 2.5717132806777956, + "loss/hidden": 3.159375, + "loss/incoh": 0.0, + "loss/logits": 0.3674279361963272, + "loss/reg": 0.0, + "step": 4000 + }, + { + "epoch": 0.026381578947368423, + "grad_norm": 3.09375, + "grad_norm_var": 0.09750874837239583, + "learning_rate": 0.0001, + "loss": 3.5937, + "loss/crossentropy": 2.494677722454071, + "loss/hidden": 3.209375, + "loss/incoh": 0.0, + "loss/logits": 0.34122500121593474, + "loss/reg": 0.0, + "step": 4010 + }, + { + "epoch": 0.026447368421052633, + "grad_norm": 2.9375, + "grad_norm_var": 0.07803446451822917, + "learning_rate": 0.0001, + "loss": 3.4996, + "loss/crossentropy": 2.4037094593048094, + "loss/hidden": 3.1328125, + "loss/incoh": 0.0, + "loss/logits": 0.34369616210460663, + "loss/reg": 0.0, + "step": 4020 + }, + { + "epoch": 0.026513157894736843, + "grad_norm": 2.875, + "grad_norm_var": 0.031087239583333332, + "learning_rate": 0.0001, + "loss": 3.5883, + "loss/crossentropy": 2.5161670804023744, + "loss/hidden": 3.234375, + "loss/incoh": 0.0, + "loss/logits": 0.3320692449808121, + "loss/reg": 0.0, + "step": 4030 + }, + { + "epoch": 0.026578947368421053, + "grad_norm": 2.46875, + "grad_norm_var": 0.27988993326822914, + "learning_rate": 0.0001, + "loss": 3.5987, + "loss/crossentropy": 2.489400029182434, + "loss/hidden": 3.2828125, + "loss/incoh": 0.0, + "loss/logits": 0.37186973094940184, + "loss/reg": 0.0, + "step": 4040 + }, + { + "epoch": 0.026644736842105263, + "grad_norm": 2.453125, + "grad_norm_var": 0.2902740478515625, + "learning_rate": 0.0001, + "loss": 3.6423, + "loss/crossentropy": 2.1810465335845945, + "loss/hidden": 3.140625, + "loss/incoh": 0.0, + "loss/logits": 0.30852093994617463, + "loss/reg": 0.0, + "step": 4050 + }, + { + "epoch": 0.026710526315789473, + "grad_norm": 3.203125, + "grad_norm_var": 0.2831776936848958, + "learning_rate": 0.0001, + "loss": 3.5696, + "loss/crossentropy": 2.5600404262542726, + "loss/hidden": 3.2265625, + "loss/incoh": 0.0, + "loss/logits": 0.38900414407253264, + "loss/reg": 0.0, + "step": 4060 + }, + { + "epoch": 0.026776315789473683, + "grad_norm": 4.125, + "grad_norm_var": 0.25816141764322914, + "learning_rate": 0.0001, + "loss": 3.6429, + "loss/crossentropy": 2.4915748476982116, + "loss/hidden": 3.290625, + "loss/incoh": 0.0, + "loss/logits": 0.35222682952880857, + "loss/reg": 0.0, + "step": 4070 + }, + { + "epoch": 0.026842105263157896, + "grad_norm": 3.875, + "grad_norm_var": 2.97427978515625, + "learning_rate": 0.0001, + "loss": 3.6704, + "loss/crossentropy": 2.131973624229431, + "loss/hidden": 3.36875, + "loss/incoh": 0.0, + "loss/logits": 0.32549644112586973, + "loss/reg": 0.0, + "step": 4080 + }, + { + "epoch": 0.026907894736842106, + "grad_norm": 2.484375, + "grad_norm_var": 0.25222981770833336, + "learning_rate": 0.0001, + "loss": 3.591, + "loss/crossentropy": 2.196081441640854, + "loss/hidden": 3.240625, + "loss/incoh": 0.0, + "loss/logits": 0.35593045353889463, + "loss/reg": 0.0, + "step": 4090 + }, + { + "epoch": 0.026973684210526316, + "grad_norm": 3.03125, + "grad_norm_var": 0.20009765625, + "learning_rate": 0.0001, + "loss": 3.5044, + "loss/crossentropy": 2.3047094464302065, + "loss/hidden": 3.159375, + "loss/incoh": 0.0, + "loss/logits": 0.2999614104628563, + "loss/reg": 0.0, + "step": 4100 + }, + { + "epoch": 0.027039473684210526, + "grad_norm": 2.53125, + "grad_norm_var": 0.5296051025390625, + "learning_rate": 0.0001, + "loss": 3.6015, + "loss/crossentropy": 2.4926783800125123, + "loss/hidden": 3.2578125, + "loss/incoh": 0.0, + "loss/logits": 0.342082779109478, + "loss/reg": 0.0, + "step": 4110 + }, + { + "epoch": 0.027105263157894736, + "grad_norm": 2.609375, + "grad_norm_var": 0.05623270670572917, + "learning_rate": 0.0001, + "loss": 3.5623, + "loss/crossentropy": 2.6063008666038514, + "loss/hidden": 3.128125, + "loss/incoh": 0.0, + "loss/logits": 0.3086448922753334, + "loss/reg": 0.0, + "step": 4120 + }, + { + "epoch": 0.027171052631578946, + "grad_norm": 2.671875, + "grad_norm_var": 0.103515625, + "learning_rate": 0.0001, + "loss": 3.5628, + "loss/crossentropy": 2.516204798221588, + "loss/hidden": 3.2609375, + "loss/incoh": 0.0, + "loss/logits": 0.36054509580135347, + "loss/reg": 0.0, + "step": 4130 + }, + { + "epoch": 0.027236842105263157, + "grad_norm": 2.421875, + "grad_norm_var": 0.09970601399739583, + "learning_rate": 0.0001, + "loss": 3.5328, + "loss/crossentropy": 2.5081961393356322, + "loss/hidden": 3.2828125, + "loss/incoh": 0.0, + "loss/logits": 0.3743078649044037, + "loss/reg": 0.0, + "step": 4140 + }, + { + "epoch": 0.02730263157894737, + "grad_norm": 2.4375, + "grad_norm_var": 0.0626373291015625, + "learning_rate": 0.0001, + "loss": 3.5896, + "loss/crossentropy": 2.386087703704834, + "loss/hidden": 3.225, + "loss/incoh": 0.0, + "loss/logits": 0.33356338143348696, + "loss/reg": 0.0, + "step": 4150 + }, + { + "epoch": 0.02736842105263158, + "grad_norm": 3.03125, + "grad_norm_var": 0.12830301920572917, + "learning_rate": 0.0001, + "loss": 3.6293, + "loss/crossentropy": 2.2993146777153015, + "loss/hidden": 3.134375, + "loss/incoh": 0.0, + "loss/logits": 0.28893803358078, + "loss/reg": 0.0, + "step": 4160 + }, + { + "epoch": 0.02743421052631579, + "grad_norm": 2.4375, + "grad_norm_var": 0.29797770182291666, + "learning_rate": 0.0001, + "loss": 3.5201, + "loss/crossentropy": 2.4818823099136353, + "loss/hidden": 3.309375, + "loss/incoh": 0.0, + "loss/logits": 0.3620707929134369, + "loss/reg": 0.0, + "step": 4170 + }, + { + "epoch": 0.0275, + "grad_norm": 2.28125, + "grad_norm_var": 0.2361328125, + "learning_rate": 0.0001, + "loss": 3.533, + "loss/crossentropy": 2.4130859971046448, + "loss/hidden": 3.10625, + "loss/incoh": 0.0, + "loss/logits": 0.3289807617664337, + "loss/reg": 0.0, + "step": 4180 + }, + { + "epoch": 0.02756578947368421, + "grad_norm": 2.765625, + "grad_norm_var": 0.06122945149739583, + "learning_rate": 0.0001, + "loss": 3.4961, + "loss/crossentropy": 2.3559444665908815, + "loss/hidden": 3.1140625, + "loss/incoh": 0.0, + "loss/logits": 0.3270682215690613, + "loss/reg": 0.0, + "step": 4190 + }, + { + "epoch": 0.02763157894736842, + "grad_norm": 2.609375, + "grad_norm_var": 0.3246378580729167, + "learning_rate": 0.0001, + "loss": 3.7095, + "loss/crossentropy": 2.370071732997894, + "loss/hidden": 3.1328125, + "loss/incoh": 0.0, + "loss/logits": 0.36643182039260863, + "loss/reg": 0.0, + "step": 4200 + }, + { + "epoch": 0.02769736842105263, + "grad_norm": 8.375, + "grad_norm_var": 2.272069295247396, + "learning_rate": 0.0001, + "loss": 3.5404, + "loss/crossentropy": 2.4906920313835146, + "loss/hidden": 3.2359375, + "loss/incoh": 0.0, + "loss/logits": 0.3274578660726547, + "loss/reg": 0.0, + "step": 4210 + }, + { + "epoch": 0.027763157894736844, + "grad_norm": 2.40625, + "grad_norm_var": 2.19049072265625, + "learning_rate": 0.0001, + "loss": 3.6801, + "loss/crossentropy": 2.66324725151062, + "loss/hidden": 3.5046875, + "loss/incoh": 0.0, + "loss/logits": 0.38620950281620026, + "loss/reg": 0.0, + "step": 4220 + }, + { + "epoch": 0.027828947368421054, + "grad_norm": 2.796875, + "grad_norm_var": 0.1225494384765625, + "learning_rate": 0.0001, + "loss": 3.6293, + "loss/crossentropy": 2.3201419711112976, + "loss/hidden": 3.284375, + "loss/incoh": 0.0, + "loss/logits": 0.33118238747119905, + "loss/reg": 0.0, + "step": 4230 + }, + { + "epoch": 0.027894736842105264, + "grad_norm": 2.765625, + "grad_norm_var": 0.062108357747395836, + "learning_rate": 0.0001, + "loss": 3.5523, + "loss/crossentropy": 2.2000674962997437, + "loss/hidden": 3.1953125, + "loss/incoh": 0.0, + "loss/logits": 0.33759562224149703, + "loss/reg": 0.0, + "step": 4240 + }, + { + "epoch": 0.027960526315789474, + "grad_norm": 2.890625, + "grad_norm_var": 0.10436197916666666, + "learning_rate": 0.0001, + "loss": 3.6554, + "loss/crossentropy": 2.4001947045326233, + "loss/hidden": 3.3578125, + "loss/incoh": 0.0, + "loss/logits": 0.34554801881313324, + "loss/reg": 0.0, + "step": 4250 + }, + { + "epoch": 0.028026315789473684, + "grad_norm": 2.90625, + "grad_norm_var": 0.09031473795572917, + "learning_rate": 0.0001, + "loss": 3.5922, + "loss/crossentropy": 2.5996686697006224, + "loss/hidden": 3.209375, + "loss/incoh": 0.0, + "loss/logits": 0.34134136140346527, + "loss/reg": 0.0, + "step": 4260 + }, + { + "epoch": 0.028092105263157894, + "grad_norm": 2.59375, + "grad_norm_var": 0.07454020182291667, + "learning_rate": 0.0001, + "loss": 3.5895, + "loss/crossentropy": 2.1346129894256594, + "loss/hidden": 3.1671875, + "loss/incoh": 0.0, + "loss/logits": 0.2764407262206078, + "loss/reg": 0.0, + "step": 4270 + }, + { + "epoch": 0.028157894736842104, + "grad_norm": 2.671875, + "grad_norm_var": 0.07390950520833334, + "learning_rate": 0.0001, + "loss": 3.6094, + "loss/crossentropy": 2.539172089099884, + "loss/hidden": 3.3578125, + "loss/incoh": 0.0, + "loss/logits": 0.3945852980017662, + "loss/reg": 0.0, + "step": 4280 + }, + { + "epoch": 0.028223684210526317, + "grad_norm": 2.796875, + "grad_norm_var": 0.14036051432291666, + "learning_rate": 0.0001, + "loss": 3.6505, + "loss/crossentropy": 2.4181793212890623, + "loss/hidden": 3.484375, + "loss/incoh": 0.0, + "loss/logits": 0.45307959616184235, + "loss/reg": 0.0, + "step": 4290 + }, + { + "epoch": 0.028289473684210528, + "grad_norm": 2.40625, + "grad_norm_var": 0.14678446451822916, + "learning_rate": 0.0001, + "loss": 3.5539, + "loss/crossentropy": 2.3630972266197205, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.28445124477148054, + "loss/reg": 0.0, + "step": 4300 + }, + { + "epoch": 0.028355263157894738, + "grad_norm": 2.6875, + "grad_norm_var": 0.3001627604166667, + "learning_rate": 0.0001, + "loss": 3.5098, + "loss/crossentropy": 2.4898954033851624, + "loss/hidden": 3.109375, + "loss/incoh": 0.0, + "loss/logits": 0.320017996430397, + "loss/reg": 0.0, + "step": 4310 + }, + { + "epoch": 0.028421052631578948, + "grad_norm": 2.4375, + "grad_norm_var": 0.13015034993489583, + "learning_rate": 0.0001, + "loss": 3.5755, + "loss/crossentropy": 2.2955414295196532, + "loss/hidden": 3.434375, + "loss/incoh": 0.0, + "loss/logits": 0.41727418303489683, + "loss/reg": 0.0, + "step": 4320 + }, + { + "epoch": 0.028486842105263158, + "grad_norm": 3.015625, + "grad_norm_var": 0.29983723958333336, + "learning_rate": 0.0001, + "loss": 3.7538, + "loss/crossentropy": 2.4246325135231017, + "loss/hidden": 3.3546875, + "loss/incoh": 0.0, + "loss/logits": 0.34737818390131, + "loss/reg": 0.0, + "step": 4330 + }, + { + "epoch": 0.028552631578947368, + "grad_norm": 5.3125, + "grad_norm_var": 0.5324544270833333, + "learning_rate": 0.0001, + "loss": 3.6927, + "loss/crossentropy": 2.393894040584564, + "loss/hidden": 3.321875, + "loss/incoh": 0.0, + "loss/logits": 0.37734392285346985, + "loss/reg": 0.0, + "step": 4340 + }, + { + "epoch": 0.028618421052631578, + "grad_norm": 2.78125, + "grad_norm_var": 0.5618316650390625, + "learning_rate": 0.0001, + "loss": 3.6563, + "loss/crossentropy": 2.5759302139282227, + "loss/hidden": 3.2125, + "loss/incoh": 0.0, + "loss/logits": 0.32841147780418395, + "loss/reg": 0.0, + "step": 4350 + }, + { + "epoch": 0.028684210526315788, + "grad_norm": 2.875, + "grad_norm_var": 0.06923726399739584, + "learning_rate": 0.0001, + "loss": 3.5818, + "loss/crossentropy": 2.664570915699005, + "loss/hidden": 3.040625, + "loss/incoh": 0.0, + "loss/logits": 0.31884663701057436, + "loss/reg": 0.0, + "step": 4360 + }, + { + "epoch": 0.02875, + "grad_norm": 3.203125, + "grad_norm_var": 0.219873046875, + "learning_rate": 0.0001, + "loss": 3.6444, + "loss/crossentropy": 2.3797228574752807, + "loss/hidden": 3.5796875, + "loss/incoh": 0.0, + "loss/logits": 0.4052841871976852, + "loss/reg": 0.0, + "step": 4370 + }, + { + "epoch": 0.02881578947368421, + "grad_norm": 3.0, + "grad_norm_var": 0.5880849202473958, + "learning_rate": 0.0001, + "loss": 3.5881, + "loss/crossentropy": 2.5849244236946105, + "loss/hidden": 3.5046875, + "loss/incoh": 0.0, + "loss/logits": 0.529149529337883, + "loss/reg": 0.0, + "step": 4380 + }, + { + "epoch": 0.02888157894736842, + "grad_norm": 2.421875, + "grad_norm_var": 0.31160380045572916, + "learning_rate": 0.0001, + "loss": 3.4738, + "loss/crossentropy": 2.5852147936820984, + "loss/hidden": 3.1875, + "loss/incoh": 0.0, + "loss/logits": 0.3336706295609474, + "loss/reg": 0.0, + "step": 4390 + }, + { + "epoch": 0.02894736842105263, + "grad_norm": 2.703125, + "grad_norm_var": 0.05146077473958333, + "learning_rate": 0.0001, + "loss": 3.6199, + "loss/crossentropy": 2.412027895450592, + "loss/hidden": 3.375, + "loss/incoh": 0.0, + "loss/logits": 0.4120332598686218, + "loss/reg": 0.0, + "step": 4400 + }, + { + "epoch": 0.02901315789473684, + "grad_norm": 2.59375, + "grad_norm_var": 0.138232421875, + "learning_rate": 0.0001, + "loss": 3.4929, + "loss/crossentropy": 2.270553803443909, + "loss/hidden": 3.2546875, + "loss/incoh": 0.0, + "loss/logits": 0.34523763358592985, + "loss/reg": 0.0, + "step": 4410 + }, + { + "epoch": 0.02907894736842105, + "grad_norm": 2.359375, + "grad_norm_var": 0.1102691650390625, + "learning_rate": 0.0001, + "loss": 3.5778, + "loss/crossentropy": 2.361116898059845, + "loss/hidden": 3.246875, + "loss/incoh": 0.0, + "loss/logits": 0.34047031700611113, + "loss/reg": 0.0, + "step": 4420 + }, + { + "epoch": 0.02914473684210526, + "grad_norm": 6.375, + "grad_norm_var": 1.0507120768229166, + "learning_rate": 0.0001, + "loss": 3.6517, + "loss/crossentropy": 2.547470712661743, + "loss/hidden": 3.059375, + "loss/incoh": 0.0, + "loss/logits": 0.298332154750824, + "loss/reg": 0.0, + "step": 4430 + }, + { + "epoch": 0.029210526315789475, + "grad_norm": 3.796875, + "grad_norm_var": 1.0804972330729166, + "learning_rate": 0.0001, + "loss": 3.7389, + "loss/crossentropy": 2.7002538442611694, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.3686490625143051, + "loss/reg": 0.0, + "step": 4440 + }, + { + "epoch": 0.029276315789473685, + "grad_norm": 2.3125, + "grad_norm_var": 0.432177734375, + "learning_rate": 0.0001, + "loss": 3.5541, + "loss/crossentropy": 2.421727478504181, + "loss/hidden": 3.18125, + "loss/incoh": 0.0, + "loss/logits": 0.30692713260650634, + "loss/reg": 0.0, + "step": 4450 + }, + { + "epoch": 0.029342105263157895, + "grad_norm": 2.59375, + "grad_norm_var": 0.09047749837239584, + "learning_rate": 0.0001, + "loss": 3.6132, + "loss/crossentropy": 2.659491038322449, + "loss/hidden": 3.2140625, + "loss/incoh": 0.0, + "loss/logits": 0.3632184773683548, + "loss/reg": 0.0, + "step": 4460 + }, + { + "epoch": 0.029407894736842105, + "grad_norm": 2.453125, + "grad_norm_var": 0.06815999348958333, + "learning_rate": 0.0001, + "loss": 3.4561, + "loss/crossentropy": 2.5192266911268235, + "loss/hidden": 3.034375, + "loss/incoh": 0.0, + "loss/logits": 0.2842270269989967, + "loss/reg": 0.0, + "step": 4470 + }, + { + "epoch": 0.029473684210526315, + "grad_norm": 3.109375, + "grad_norm_var": 0.1687408447265625, + "learning_rate": 0.0001, + "loss": 3.5582, + "loss/crossentropy": 2.2021409273147583, + "loss/hidden": 3.1484375, + "loss/incoh": 0.0, + "loss/logits": 0.31400761008262634, + "loss/reg": 0.0, + "step": 4480 + }, + { + "epoch": 0.029539473684210525, + "grad_norm": 2.59375, + "grad_norm_var": 0.5006174723307292, + "learning_rate": 0.0001, + "loss": 3.5183, + "loss/crossentropy": 2.4442033648490904, + "loss/hidden": 3.2640625, + "loss/incoh": 0.0, + "loss/logits": 0.3307736128568649, + "loss/reg": 0.0, + "step": 4490 + }, + { + "epoch": 0.029605263157894735, + "grad_norm": 2.734375, + "grad_norm_var": 0.40276692708333334, + "learning_rate": 0.0001, + "loss": 3.4888, + "loss/crossentropy": 2.373897171020508, + "loss/hidden": 3.2234375, + "loss/incoh": 0.0, + "loss/logits": 0.31207202970981596, + "loss/reg": 0.0, + "step": 4500 + }, + { + "epoch": 0.02967105263157895, + "grad_norm": 2.78125, + "grad_norm_var": 0.20685933430989584, + "learning_rate": 0.0001, + "loss": 3.5971, + "loss/crossentropy": 2.325096046924591, + "loss/hidden": 3.40625, + "loss/incoh": 0.0, + "loss/logits": 0.4359890788793564, + "loss/reg": 0.0, + "step": 4510 + }, + { + "epoch": 0.02973684210526316, + "grad_norm": 3.078125, + "grad_norm_var": 0.2956451416015625, + "learning_rate": 0.0001, + "loss": 3.5023, + "loss/crossentropy": 2.1537609457969666, + "loss/hidden": 3.253125, + "loss/incoh": 0.0, + "loss/logits": 0.3213866874575615, + "loss/reg": 0.0, + "step": 4520 + }, + { + "epoch": 0.02980263157894737, + "grad_norm": 2.3125, + "grad_norm_var": 0.09353841145833333, + "learning_rate": 0.0001, + "loss": 3.4494, + "loss/crossentropy": 2.4694852471351623, + "loss/hidden": 2.996875, + "loss/incoh": 0.0, + "loss/logits": 0.3156100481748581, + "loss/reg": 0.0, + "step": 4530 + }, + { + "epoch": 0.02986842105263158, + "grad_norm": 2.78125, + "grad_norm_var": 0.17408447265625, + "learning_rate": 0.0001, + "loss": 3.6124, + "loss/crossentropy": 2.438071775436401, + "loss/hidden": 3.3203125, + "loss/incoh": 0.0, + "loss/logits": 0.40948416888713834, + "loss/reg": 0.0, + "step": 4540 + }, + { + "epoch": 0.02993421052631579, + "grad_norm": 2.421875, + "grad_norm_var": 0.6079010009765625, + "learning_rate": 0.0001, + "loss": 3.6751, + "loss/crossentropy": 2.525905132293701, + "loss/hidden": 3.3546875, + "loss/incoh": 0.0, + "loss/logits": 0.4153590425848961, + "loss/reg": 0.0, + "step": 4550 + }, + { + "epoch": 0.03, + "grad_norm": 3.203125, + "grad_norm_var": 0.6879191080729167, + "learning_rate": 0.0001, + "loss": 3.5335, + "loss/crossentropy": 2.421697771549225, + "loss/hidden": 3.025, + "loss/incoh": 0.0, + "loss/logits": 0.3177122876048088, + "loss/reg": 0.0, + "step": 4560 + }, + { + "epoch": 0.03006578947368421, + "grad_norm": 2.328125, + "grad_norm_var": 0.5993123372395833, + "learning_rate": 0.0001, + "loss": 3.5742, + "loss/crossentropy": 2.3068729996681214, + "loss/hidden": 3.05, + "loss/incoh": 0.0, + "loss/logits": 0.29430699050426484, + "loss/reg": 0.0, + "step": 4570 + }, + { + "epoch": 0.030131578947368422, + "grad_norm": 4.21875, + "grad_norm_var": 0.5433339436848958, + "learning_rate": 0.0001, + "loss": 3.6381, + "loss/crossentropy": 2.3981791496276856, + "loss/hidden": 3.2359375, + "loss/incoh": 0.0, + "loss/logits": 0.3355312556028366, + "loss/reg": 0.0, + "step": 4580 + }, + { + "epoch": 0.030197368421052632, + "grad_norm": 3.25, + "grad_norm_var": 0.9806630452473958, + "learning_rate": 0.0001, + "loss": 3.6522, + "loss/crossentropy": 2.309436595439911, + "loss/hidden": 3.4890625, + "loss/incoh": 0.0, + "loss/logits": 0.350050950050354, + "loss/reg": 0.0, + "step": 4590 + }, + { + "epoch": 0.030263157894736843, + "grad_norm": 2.75, + "grad_norm_var": 0.9801910400390625, + "learning_rate": 0.0001, + "loss": 3.5648, + "loss/crossentropy": 2.561086916923523, + "loss/hidden": 3.1203125, + "loss/incoh": 0.0, + "loss/logits": 0.3277510732412338, + "loss/reg": 0.0, + "step": 4600 + }, + { + "epoch": 0.030328947368421053, + "grad_norm": 3.015625, + "grad_norm_var": 0.20364481608072918, + "learning_rate": 0.0001, + "loss": 3.5518, + "loss/crossentropy": 2.6774720311164857, + "loss/hidden": 3.0453125, + "loss/incoh": 0.0, + "loss/logits": 0.31699982583522796, + "loss/reg": 0.0, + "step": 4610 + }, + { + "epoch": 0.030394736842105263, + "grad_norm": 2.765625, + "grad_norm_var": 0.9744425455729167, + "learning_rate": 0.0001, + "loss": 3.5924, + "loss/crossentropy": 2.623241698741913, + "loss/hidden": 3.228125, + "loss/incoh": 0.0, + "loss/logits": 0.38138356506824495, + "loss/reg": 0.0, + "step": 4620 + }, + { + "epoch": 0.030460526315789473, + "grad_norm": 2.5625, + "grad_norm_var": 0.9736073811848959, + "learning_rate": 0.0001, + "loss": 3.4682, + "loss/crossentropy": 2.4855759739875793, + "loss/hidden": 3.2671875, + "loss/incoh": 0.0, + "loss/logits": 0.3377710849046707, + "loss/reg": 0.0, + "step": 4630 + }, + { + "epoch": 0.030526315789473683, + "grad_norm": 2.78125, + "grad_norm_var": 0.6709706624348958, + "learning_rate": 0.0001, + "loss": 3.4714, + "loss/crossentropy": 2.476264202594757, + "loss/hidden": 3.0859375, + "loss/incoh": 0.0, + "loss/logits": 0.3642516300082207, + "loss/reg": 0.0, + "step": 4640 + }, + { + "epoch": 0.030592105263157896, + "grad_norm": 3.0, + "grad_norm_var": 0.6482899983723959, + "learning_rate": 0.0001, + "loss": 3.4865, + "loss/crossentropy": 2.3190789937973024, + "loss/hidden": 3.059375, + "loss/incoh": 0.0, + "loss/logits": 0.32634713053703307, + "loss/reg": 0.0, + "step": 4650 + }, + { + "epoch": 0.030657894736842106, + "grad_norm": 2.921875, + "grad_norm_var": 0.12795817057291667, + "learning_rate": 0.0001, + "loss": 3.5003, + "loss/crossentropy": 2.4210057139396666, + "loss/hidden": 3.078125, + "loss/incoh": 0.0, + "loss/logits": 0.2987945884466171, + "loss/reg": 0.0, + "step": 4660 + }, + { + "epoch": 0.030723684210526316, + "grad_norm": 3.0625, + "grad_norm_var": 0.08050130208333334, + "learning_rate": 0.0001, + "loss": 3.4858, + "loss/crossentropy": 2.041215348243713, + "loss/hidden": 3.1953125, + "loss/incoh": 0.0, + "loss/logits": 0.2908504828810692, + "loss/reg": 0.0, + "step": 4670 + }, + { + "epoch": 0.030789473684210526, + "grad_norm": 3.0625, + "grad_norm_var": 0.4143300374348958, + "learning_rate": 0.0001, + "loss": 3.6937, + "loss/crossentropy": 2.4882567286491395, + "loss/hidden": 3.3125, + "loss/incoh": 0.0, + "loss/logits": 0.3625297635793686, + "loss/reg": 0.0, + "step": 4680 + }, + { + "epoch": 0.030855263157894736, + "grad_norm": 3.328125, + "grad_norm_var": 0.15123291015625, + "learning_rate": 0.0001, + "loss": 3.5943, + "loss/crossentropy": 2.3254539489746096, + "loss/hidden": 3.2, + "loss/incoh": 0.0, + "loss/logits": 0.308133128285408, + "loss/reg": 0.0, + "step": 4690 + }, + { + "epoch": 0.030921052631578946, + "grad_norm": 2.46875, + "grad_norm_var": 0.19954325358072916, + "learning_rate": 0.0001, + "loss": 3.5843, + "loss/crossentropy": 1.918275660276413, + "loss/hidden": 3.4046875, + "loss/incoh": 0.0, + "loss/logits": 0.36902148872613905, + "loss/reg": 0.0, + "step": 4700 + }, + { + "epoch": 0.030986842105263156, + "grad_norm": 2.84375, + "grad_norm_var": 0.04302978515625, + "learning_rate": 0.0001, + "loss": 3.5845, + "loss/crossentropy": 2.573339414596558, + "loss/hidden": 3.1625, + "loss/incoh": 0.0, + "loss/logits": 0.36042743623256684, + "loss/reg": 0.0, + "step": 4710 + }, + { + "epoch": 0.03105263157894737, + "grad_norm": 2.359375, + "grad_norm_var": 0.0475982666015625, + "learning_rate": 0.0001, + "loss": 3.5453, + "loss/crossentropy": 2.313191366195679, + "loss/hidden": 3.2, + "loss/incoh": 0.0, + "loss/logits": 0.3170273721218109, + "loss/reg": 0.0, + "step": 4720 + }, + { + "epoch": 0.03111842105263158, + "grad_norm": 2.4375, + "grad_norm_var": 0.0694244384765625, + "learning_rate": 0.0001, + "loss": 3.4641, + "loss/crossentropy": 2.5151350021362306, + "loss/hidden": 3.071875, + "loss/incoh": 0.0, + "loss/logits": 0.3049825429916382, + "loss/reg": 0.0, + "step": 4730 + }, + { + "epoch": 0.03118421052631579, + "grad_norm": 4.25, + "grad_norm_var": 1.9115549723307292, + "learning_rate": 0.0001, + "loss": 3.6088, + "loss/crossentropy": 2.278876805305481, + "loss/hidden": 3.3796875, + "loss/incoh": 0.0, + "loss/logits": 0.38837724179029465, + "loss/reg": 0.0, + "step": 4740 + }, + { + "epoch": 0.03125, + "grad_norm": 2.234375, + "grad_norm_var": 2.200390625, + "learning_rate": 0.0001, + "loss": 3.5348, + "loss/crossentropy": 2.2090991735458374, + "loss/hidden": 3.44375, + "loss/incoh": 0.0, + "loss/logits": 0.3878710061311722, + "loss/reg": 0.0, + "step": 4750 + }, + { + "epoch": 0.031315789473684214, + "grad_norm": 2.25, + "grad_norm_var": 0.6573404947916667, + "learning_rate": 0.0001, + "loss": 3.5378, + "loss/crossentropy": 2.2805041670799255, + "loss/hidden": 3.3015625, + "loss/incoh": 0.0, + "loss/logits": 0.31536445766687393, + "loss/reg": 0.0, + "step": 4760 + }, + { + "epoch": 0.03138157894736842, + "grad_norm": 2.65625, + "grad_norm_var": 0.38818257649739585, + "learning_rate": 0.0001, + "loss": 3.6082, + "loss/crossentropy": 2.6178433656692506, + "loss/hidden": 3.2828125, + "loss/incoh": 0.0, + "loss/logits": 0.3645938545465469, + "loss/reg": 0.0, + "step": 4770 + }, + { + "epoch": 0.031447368421052634, + "grad_norm": 2.671875, + "grad_norm_var": 0.07810770670572917, + "learning_rate": 0.0001, + "loss": 3.4424, + "loss/crossentropy": 2.5498223304748535, + "loss/hidden": 3.2625, + "loss/incoh": 0.0, + "loss/logits": 0.3333883464336395, + "loss/reg": 0.0, + "step": 4780 + }, + { + "epoch": 0.03151315789473684, + "grad_norm": 2.328125, + "grad_norm_var": 0.11669820149739583, + "learning_rate": 0.0001, + "loss": 3.6145, + "loss/crossentropy": 2.639970850944519, + "loss/hidden": 3.278125, + "loss/incoh": 0.0, + "loss/logits": 0.37047617733478544, + "loss/reg": 0.0, + "step": 4790 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 2.4375, + "grad_norm_var": 0.0907867431640625, + "learning_rate": 0.0001, + "loss": 3.5869, + "loss/crossentropy": 2.458595323562622, + "loss/hidden": 3.60625, + "loss/incoh": 0.0, + "loss/logits": 0.4149588346481323, + "loss/reg": 0.0, + "step": 4800 + }, + { + "epoch": 0.03164473684210526, + "grad_norm": 2.453125, + "grad_norm_var": 0.0759429931640625, + "learning_rate": 0.0001, + "loss": 3.4775, + "loss/crossentropy": 2.01421400308609, + "loss/hidden": 3.2671875, + "loss/incoh": 0.0, + "loss/logits": 0.29176320880651474, + "loss/reg": 0.0, + "step": 4810 + }, + { + "epoch": 0.031710526315789474, + "grad_norm": 2.96875, + "grad_norm_var": 0.1147857666015625, + "learning_rate": 0.0001, + "loss": 3.5268, + "loss/crossentropy": 2.5456383228302, + "loss/hidden": 3.3484375, + "loss/incoh": 0.0, + "loss/logits": 0.3915561467409134, + "loss/reg": 0.0, + "step": 4820 + }, + { + "epoch": 0.03177631578947369, + "grad_norm": 2.421875, + "grad_norm_var": 0.0890625, + "learning_rate": 0.0001, + "loss": 3.5371, + "loss/crossentropy": 2.1736844003200533, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.28229199200868604, + "loss/reg": 0.0, + "step": 4830 + }, + { + "epoch": 0.031842105263157894, + "grad_norm": 2.21875, + "grad_norm_var": 0.08162434895833333, + "learning_rate": 0.0001, + "loss": 3.4898, + "loss/crossentropy": 2.397980511188507, + "loss/hidden": 3.06875, + "loss/incoh": 0.0, + "loss/logits": 0.28644354790449145, + "loss/reg": 0.0, + "step": 4840 + }, + { + "epoch": 0.03190789473684211, + "grad_norm": 2.578125, + "grad_norm_var": 1.33443603515625, + "learning_rate": 0.0001, + "loss": 3.5788, + "loss/crossentropy": 2.434200632572174, + "loss/hidden": 3.103125, + "loss/incoh": 0.0, + "loss/logits": 0.37485773116350174, + "loss/reg": 0.0, + "step": 4850 + }, + { + "epoch": 0.031973684210526314, + "grad_norm": 2.640625, + "grad_norm_var": 1.3780181884765625, + "learning_rate": 0.0001, + "loss": 3.5008, + "loss/crossentropy": 2.642025816440582, + "loss/hidden": 3.1375, + "loss/incoh": 0.0, + "loss/logits": 0.34916335344314575, + "loss/reg": 0.0, + "step": 4860 + }, + { + "epoch": 0.03203947368421053, + "grad_norm": 2.578125, + "grad_norm_var": 0.04523111979166667, + "learning_rate": 0.0001, + "loss": 3.4784, + "loss/crossentropy": 2.2365106463432314, + "loss/hidden": 3.4328125, + "loss/incoh": 0.0, + "loss/logits": 0.45036998838186265, + "loss/reg": 0.0, + "step": 4870 + }, + { + "epoch": 0.032105263157894734, + "grad_norm": 2.25, + "grad_norm_var": 0.04436442057291667, + "learning_rate": 0.0001, + "loss": 3.5047, + "loss/crossentropy": 2.250430929660797, + "loss/hidden": 3.2859375, + "loss/incoh": 0.0, + "loss/logits": 0.3373000741004944, + "loss/reg": 0.0, + "step": 4880 + }, + { + "epoch": 0.03217105263157895, + "grad_norm": 2.71875, + "grad_norm_var": 0.07979227701822916, + "learning_rate": 0.0001, + "loss": 3.4476, + "loss/crossentropy": 2.688676381111145, + "loss/hidden": 3.184375, + "loss/incoh": 0.0, + "loss/logits": 0.32961316406726837, + "loss/reg": 0.0, + "step": 4890 + }, + { + "epoch": 0.03223684210526316, + "grad_norm": 2.625, + "grad_norm_var": 0.0539215087890625, + "learning_rate": 0.0001, + "loss": 3.5264, + "loss/crossentropy": 2.5281107783317567, + "loss/hidden": 3.2125, + "loss/incoh": 0.0, + "loss/logits": 0.36207843720912936, + "loss/reg": 0.0, + "step": 4900 + }, + { + "epoch": 0.03230263157894737, + "grad_norm": 2.515625, + "grad_norm_var": 0.11297200520833334, + "learning_rate": 0.0001, + "loss": 3.4276, + "loss/crossentropy": 2.3438509345054626, + "loss/hidden": 3.2671875, + "loss/incoh": 0.0, + "loss/logits": 0.3646134212613106, + "loss/reg": 0.0, + "step": 4910 + }, + { + "epoch": 0.03236842105263158, + "grad_norm": 2.515625, + "grad_norm_var": 0.06402587890625, + "learning_rate": 0.0001, + "loss": 3.479, + "loss/crossentropy": 2.331527066230774, + "loss/hidden": 3.48125, + "loss/incoh": 0.0, + "loss/logits": 0.407479552924633, + "loss/reg": 0.0, + "step": 4920 + }, + { + "epoch": 0.03243421052631579, + "grad_norm": 2.375, + "grad_norm_var": 0.21721598307291667, + "learning_rate": 0.0001, + "loss": 3.6251, + "loss/crossentropy": 2.3732258677482605, + "loss/hidden": 3.1875, + "loss/incoh": 0.0, + "loss/logits": 0.32182002663612364, + "loss/reg": 0.0, + "step": 4930 + }, + { + "epoch": 0.0325, + "grad_norm": 4.59375, + "grad_norm_var": 1.3630167643229167, + "learning_rate": 0.0001, + "loss": 3.6469, + "loss/crossentropy": 1.9915230482816697, + "loss/hidden": 3.3953125, + "loss/incoh": 0.0, + "loss/logits": 0.35692891776561736, + "loss/reg": 0.0, + "step": 4940 + }, + { + "epoch": 0.03256578947368421, + "grad_norm": 2.40625, + "grad_norm_var": 0.2885894775390625, + "learning_rate": 0.0001, + "loss": 3.5077, + "loss/crossentropy": 1.9868581891059875, + "loss/hidden": 3.215625, + "loss/incoh": 0.0, + "loss/logits": 0.30731415897607806, + "loss/reg": 0.0, + "step": 4950 + }, + { + "epoch": 0.03263157894736842, + "grad_norm": 2.765625, + "grad_norm_var": 0.21422119140625, + "learning_rate": 0.0001, + "loss": 3.5426, + "loss/crossentropy": 2.3579143285751343, + "loss/hidden": 3.3421875, + "loss/incoh": 0.0, + "loss/logits": 0.3354805111885071, + "loss/reg": 0.0, + "step": 4960 + }, + { + "epoch": 0.032697368421052635, + "grad_norm": 2.625, + "grad_norm_var": 4.145735677083334, + "learning_rate": 0.0001, + "loss": 3.5276, + "loss/crossentropy": 2.2787723779678344, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.2516452088952065, + "loss/reg": 0.0, + "step": 4970 + }, + { + "epoch": 0.03276315789473684, + "grad_norm": 2.375, + "grad_norm_var": 0.09233296712239583, + "learning_rate": 0.0001, + "loss": 3.4912, + "loss/crossentropy": 2.2899803042411806, + "loss/hidden": 3.175, + "loss/incoh": 0.0, + "loss/logits": 0.3140288829803467, + "loss/reg": 0.0, + "step": 4980 + }, + { + "epoch": 0.032828947368421055, + "grad_norm": 2.53125, + "grad_norm_var": 0.07078450520833333, + "learning_rate": 0.0001, + "loss": 3.4308, + "loss/crossentropy": 2.4203084468841554, + "loss/hidden": 3.009375, + "loss/incoh": 0.0, + "loss/logits": 0.2926447048783302, + "loss/reg": 0.0, + "step": 4990 + }, + { + "epoch": 0.03289473684210526, + "grad_norm": 2.375, + "grad_norm_var": 0.1664215087890625, + "learning_rate": 0.0001, + "loss": 3.5233, + "loss/crossentropy": 2.4435134291648866, + "loss/hidden": 3.3640625, + "loss/incoh": 0.0, + "loss/logits": 0.35478622317314146, + "loss/reg": 0.0, + "step": 5000 + }, + { + "epoch": 0.032960526315789475, + "grad_norm": 3.296875, + "grad_norm_var": 0.17375386555989583, + "learning_rate": 0.0001, + "loss": 3.529, + "loss/crossentropy": 2.3886643409729005, + "loss/hidden": 3.196875, + "loss/incoh": 0.0, + "loss/logits": 0.36350963413715365, + "loss/reg": 0.0, + "step": 5010 + }, + { + "epoch": 0.03302631578947368, + "grad_norm": 2.390625, + "grad_norm_var": 0.13043619791666666, + "learning_rate": 0.0001, + "loss": 3.5608, + "loss/crossentropy": 2.5570758461952208, + "loss/hidden": 3.4828125, + "loss/incoh": 0.0, + "loss/logits": 0.343785697221756, + "loss/reg": 0.0, + "step": 5020 + }, + { + "epoch": 0.033092105263157895, + "grad_norm": 2.921875, + "grad_norm_var": 0.19719645182291667, + "learning_rate": 0.0001, + "loss": 3.5903, + "loss/crossentropy": 2.3763694763183594, + "loss/hidden": 3.25625, + "loss/incoh": 0.0, + "loss/logits": 0.32882467210292815, + "loss/reg": 0.0, + "step": 5030 + }, + { + "epoch": 0.03315789473684211, + "grad_norm": 2.484375, + "grad_norm_var": 0.21155497233072917, + "learning_rate": 0.0001, + "loss": 3.5454, + "loss/crossentropy": 2.5775513648986816, + "loss/hidden": 3.0375, + "loss/incoh": 0.0, + "loss/logits": 0.3086023017764091, + "loss/reg": 0.0, + "step": 5040 + }, + { + "epoch": 0.033223684210526315, + "grad_norm": 2.703125, + "grad_norm_var": 0.08268229166666667, + "learning_rate": 0.0001, + "loss": 3.5005, + "loss/crossentropy": 2.257374918460846, + "loss/hidden": 3.1609375, + "loss/incoh": 0.0, + "loss/logits": 0.3117083102464676, + "loss/reg": 0.0, + "step": 5050 + }, + { + "epoch": 0.03328947368421053, + "grad_norm": 2.46875, + "grad_norm_var": 0.21896158854166667, + "learning_rate": 0.0001, + "loss": 3.4097, + "loss/crossentropy": 2.437604343891144, + "loss/hidden": 3.1953125, + "loss/incoh": 0.0, + "loss/logits": 0.33744728565216064, + "loss/reg": 0.0, + "step": 5060 + }, + { + "epoch": 0.033355263157894735, + "grad_norm": 2.421875, + "grad_norm_var": 0.19402669270833334, + "learning_rate": 0.0001, + "loss": 3.5413, + "loss/crossentropy": 2.1056251645088198, + "loss/hidden": 3.3140625, + "loss/incoh": 0.0, + "loss/logits": 0.35452440977096555, + "loss/reg": 0.0, + "step": 5070 + }, + { + "epoch": 0.03342105263157895, + "grad_norm": 2.984375, + "grad_norm_var": 0.041829427083333336, + "learning_rate": 0.0001, + "loss": 3.5305, + "loss/crossentropy": 2.5163299083709716, + "loss/hidden": 3.296875, + "loss/incoh": 0.0, + "loss/logits": 0.35606471002101897, + "loss/reg": 0.0, + "step": 5080 + }, + { + "epoch": 0.033486842105263155, + "grad_norm": 2.328125, + "grad_norm_var": 0.08046773274739584, + "learning_rate": 0.0001, + "loss": 3.4405, + "loss/crossentropy": 2.37408185005188, + "loss/hidden": 2.9828125, + "loss/incoh": 0.0, + "loss/logits": 0.3234561800956726, + "loss/reg": 0.0, + "step": 5090 + }, + { + "epoch": 0.03355263157894737, + "grad_norm": 2.921875, + "grad_norm_var": 0.17653706868489583, + "learning_rate": 0.0001, + "loss": 3.4303, + "loss/crossentropy": 2.3730108022689818, + "loss/hidden": 3.1359375, + "loss/incoh": 0.0, + "loss/logits": 0.2919617787003517, + "loss/reg": 0.0, + "step": 5100 + }, + { + "epoch": 0.03361842105263158, + "grad_norm": 2.5625, + "grad_norm_var": 0.47526753743489586, + "learning_rate": 0.0001, + "loss": 3.5312, + "loss/crossentropy": 2.4720141887664795, + "loss/hidden": 3.246875, + "loss/incoh": 0.0, + "loss/logits": 0.36764703392982484, + "loss/reg": 0.0, + "step": 5110 + }, + { + "epoch": 0.03368421052631579, + "grad_norm": 2.6875, + "grad_norm_var": 0.4923787434895833, + "learning_rate": 0.0001, + "loss": 3.5491, + "loss/crossentropy": 2.449038088321686, + "loss/hidden": 3.3171875, + "loss/incoh": 0.0, + "loss/logits": 0.4042062431573868, + "loss/reg": 0.0, + "step": 5120 + }, + { + "epoch": 0.03375, + "grad_norm": 2.703125, + "grad_norm_var": 0.1626861572265625, + "learning_rate": 0.0001, + "loss": 3.5298, + "loss/crossentropy": 2.5271955728530884, + "loss/hidden": 3.2515625, + "loss/incoh": 0.0, + "loss/logits": 0.3406914800405502, + "loss/reg": 0.0, + "step": 5130 + }, + { + "epoch": 0.03381578947368421, + "grad_norm": 2.65625, + "grad_norm_var": 0.037018839518229166, + "learning_rate": 0.0001, + "loss": 3.549, + "loss/crossentropy": 2.6082807898521425, + "loss/hidden": 3.321875, + "loss/incoh": 0.0, + "loss/logits": 0.33228414356708524, + "loss/reg": 0.0, + "step": 5140 + }, + { + "epoch": 0.03388157894736842, + "grad_norm": 2.375, + "grad_norm_var": 0.08620503743489584, + "learning_rate": 0.0001, + "loss": 3.505, + "loss/crossentropy": 2.0589061468839644, + "loss/hidden": 3.4890625, + "loss/incoh": 0.0, + "loss/logits": 0.30756633579730985, + "loss/reg": 0.0, + "step": 5150 + }, + { + "epoch": 0.03394736842105263, + "grad_norm": 2.46875, + "grad_norm_var": 0.3478342692057292, + "learning_rate": 0.0001, + "loss": 3.5598, + "loss/crossentropy": 2.0876080930233, + "loss/hidden": 3.39375, + "loss/incoh": 0.0, + "loss/logits": 0.3331515982747078, + "loss/reg": 0.0, + "step": 5160 + }, + { + "epoch": 0.03401315789473684, + "grad_norm": 2.265625, + "grad_norm_var": 0.3102773030598958, + "learning_rate": 0.0001, + "loss": 3.5298, + "loss/crossentropy": 2.123845911026001, + "loss/hidden": 3.2953125, + "loss/incoh": 0.0, + "loss/logits": 0.27716329991817473, + "loss/reg": 0.0, + "step": 5170 + }, + { + "epoch": 0.034078947368421056, + "grad_norm": 3.484375, + "grad_norm_var": 0.3398590087890625, + "learning_rate": 0.0001, + "loss": 3.546, + "loss/crossentropy": 2.5750380873680117, + "loss/hidden": 3.0671875, + "loss/incoh": 0.0, + "loss/logits": 0.3079290196299553, + "loss/reg": 0.0, + "step": 5180 + }, + { + "epoch": 0.03414473684210526, + "grad_norm": 3.015625, + "grad_norm_var": 0.3519195556640625, + "learning_rate": 0.0001, + "loss": 3.4686, + "loss/crossentropy": 2.1761133074760437, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.28422126173973083, + "loss/reg": 0.0, + "step": 5190 + }, + { + "epoch": 0.034210526315789476, + "grad_norm": 3.21875, + "grad_norm_var": 0.09970296223958333, + "learning_rate": 0.0001, + "loss": 3.5823, + "loss/crossentropy": 2.224585694074631, + "loss/hidden": 3.2109375, + "loss/incoh": 0.0, + "loss/logits": 0.32807315289974215, + "loss/reg": 0.0, + "step": 5200 + }, + { + "epoch": 0.03427631578947368, + "grad_norm": 2.96875, + "grad_norm_var": 0.34479878743489584, + "learning_rate": 0.0001, + "loss": 3.5417, + "loss/crossentropy": 2.217251694202423, + "loss/hidden": 3.159375, + "loss/incoh": 0.0, + "loss/logits": 0.2984598934650421, + "loss/reg": 0.0, + "step": 5210 + }, + { + "epoch": 0.034342105263157896, + "grad_norm": 2.6875, + "grad_norm_var": 0.28804931640625, + "learning_rate": 0.0001, + "loss": 3.4788, + "loss/crossentropy": 2.344852977991104, + "loss/hidden": 3.1265625, + "loss/incoh": 0.0, + "loss/logits": 0.2878506749868393, + "loss/reg": 0.0, + "step": 5220 + }, + { + "epoch": 0.0344078947368421, + "grad_norm": 2.46875, + "grad_norm_var": 0.17006734212239583, + "learning_rate": 0.0001, + "loss": 3.4649, + "loss/crossentropy": 2.5100401520729063, + "loss/hidden": 3.128125, + "loss/incoh": 0.0, + "loss/logits": 0.3491516515612602, + "loss/reg": 0.0, + "step": 5230 + }, + { + "epoch": 0.034473684210526316, + "grad_norm": 2.5625, + "grad_norm_var": 1.1076649983723958, + "learning_rate": 0.0001, + "loss": 3.503, + "loss/crossentropy": 2.565778684616089, + "loss/hidden": 3.146875, + "loss/incoh": 0.0, + "loss/logits": 0.35944747030735014, + "loss/reg": 0.0, + "step": 5240 + }, + { + "epoch": 0.03453947368421053, + "grad_norm": 2.3125, + "grad_norm_var": 1.1434315999348958, + "learning_rate": 0.0001, + "loss": 3.4668, + "loss/crossentropy": 2.5031490683555604, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.2864773109555244, + "loss/reg": 0.0, + "step": 5250 + }, + { + "epoch": 0.034605263157894736, + "grad_norm": 2.90625, + "grad_norm_var": 0.5837849934895833, + "learning_rate": 0.0001, + "loss": 3.4822, + "loss/crossentropy": 2.3741963386535643, + "loss/hidden": 3.2078125, + "loss/incoh": 0.0, + "loss/logits": 0.3231631726026535, + "loss/reg": 0.0, + "step": 5260 + }, + { + "epoch": 0.03467105263157895, + "grad_norm": 2.546875, + "grad_norm_var": 0.62431640625, + "learning_rate": 0.0001, + "loss": 3.4414, + "loss/crossentropy": 2.3789267897605897, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.3201348423957825, + "loss/reg": 0.0, + "step": 5270 + }, + { + "epoch": 0.034736842105263156, + "grad_norm": 5.25, + "grad_norm_var": 0.5048573811848959, + "learning_rate": 0.0001, + "loss": 3.412, + "loss/crossentropy": 2.47695529460907, + "loss/hidden": 3.171875, + "loss/incoh": 0.0, + "loss/logits": 0.31060084253549575, + "loss/reg": 0.0, + "step": 5280 + }, + { + "epoch": 0.03480263157894737, + "grad_norm": 2.625, + "grad_norm_var": 0.54195556640625, + "learning_rate": 0.0001, + "loss": 3.5371, + "loss/crossentropy": 2.4316977143287657, + "loss/hidden": 3.0703125, + "loss/incoh": 0.0, + "loss/logits": 0.30330550074577334, + "loss/reg": 0.0, + "step": 5290 + }, + { + "epoch": 0.034868421052631576, + "grad_norm": 2.625, + "grad_norm_var": 0.1771484375, + "learning_rate": 0.0001, + "loss": 3.534, + "loss/crossentropy": 2.058604693412781, + "loss/hidden": 3.16875, + "loss/incoh": 0.0, + "loss/logits": 0.2998970851302147, + "loss/reg": 0.0, + "step": 5300 + }, + { + "epoch": 0.03493421052631579, + "grad_norm": 2.375, + "grad_norm_var": 0.21590169270833334, + "learning_rate": 0.0001, + "loss": 3.5029, + "loss/crossentropy": 2.1623964309692383, + "loss/hidden": 3.1453125, + "loss/incoh": 0.0, + "loss/logits": 0.2633717767894268, + "loss/reg": 0.0, + "step": 5310 + }, + { + "epoch": 0.035, + "grad_norm": 2.90625, + "grad_norm_var": 0.12704671223958333, + "learning_rate": 0.0001, + "loss": 3.4862, + "loss/crossentropy": 2.5717769265174866, + "loss/hidden": 3.1671875, + "loss/incoh": 0.0, + "loss/logits": 0.30162925869226453, + "loss/reg": 0.0, + "step": 5320 + }, + { + "epoch": 0.03506578947368421, + "grad_norm": 2.734375, + "grad_norm_var": 1.4618398030598958, + "learning_rate": 0.0001, + "loss": 3.5429, + "loss/crossentropy": 2.462851893901825, + "loss/hidden": 3.328125, + "loss/incoh": 0.0, + "loss/logits": 0.3766929477453232, + "loss/reg": 0.0, + "step": 5330 + }, + { + "epoch": 0.03513157894736842, + "grad_norm": 2.390625, + "grad_norm_var": 1.7596181233723958, + "learning_rate": 0.0001, + "loss": 3.5089, + "loss/crossentropy": 2.44319885969162, + "loss/hidden": 3.034375, + "loss/incoh": 0.0, + "loss/logits": 0.30458342134952543, + "loss/reg": 0.0, + "step": 5340 + }, + { + "epoch": 0.03519736842105263, + "grad_norm": 2.875, + "grad_norm_var": 0.4787668863932292, + "learning_rate": 0.0001, + "loss": 3.6272, + "loss/crossentropy": 2.594151020050049, + "loss/hidden": 3.2296875, + "loss/incoh": 0.0, + "loss/logits": 0.3736713409423828, + "loss/reg": 0.0, + "step": 5350 + }, + { + "epoch": 0.035263157894736843, + "grad_norm": 2.484375, + "grad_norm_var": 0.4522939046223958, + "learning_rate": 0.0001, + "loss": 3.5325, + "loss/crossentropy": 2.0771877110004424, + "loss/hidden": 3.4046875, + "loss/incoh": 0.0, + "loss/logits": 0.3338633939623833, + "loss/reg": 0.0, + "step": 5360 + }, + { + "epoch": 0.03532894736842105, + "grad_norm": 2.390625, + "grad_norm_var": 0.0387847900390625, + "learning_rate": 0.0001, + "loss": 3.4757, + "loss/crossentropy": 2.5547770977020265, + "loss/hidden": 3.25, + "loss/incoh": 0.0, + "loss/logits": 0.36584808975458144, + "loss/reg": 0.0, + "step": 5370 + }, + { + "epoch": 0.035394736842105264, + "grad_norm": 2.34375, + "grad_norm_var": 11.889094034830729, + "learning_rate": 0.0001, + "loss": 3.6332, + "loss/crossentropy": 2.2124004304409026, + "loss/hidden": 3.1265625, + "loss/incoh": 0.0, + "loss/logits": 0.29226877391338346, + "loss/reg": 0.0, + "step": 5380 + }, + { + "epoch": 0.03546052631578948, + "grad_norm": 3.15625, + "grad_norm_var": 6.872362263997396, + "learning_rate": 0.0001, + "loss": 3.5496, + "loss/crossentropy": 2.2622018218040467, + "loss/hidden": 3.259375, + "loss/incoh": 0.0, + "loss/logits": 0.40298803299665453, + "loss/reg": 0.0, + "step": 5390 + }, + { + "epoch": 0.035526315789473684, + "grad_norm": 2.25, + "grad_norm_var": 0.0641998291015625, + "learning_rate": 0.0001, + "loss": 3.4621, + "loss/crossentropy": 2.374979627132416, + "loss/hidden": 3.2109375, + "loss/incoh": 0.0, + "loss/logits": 0.37631402611732484, + "loss/reg": 0.0, + "step": 5400 + }, + { + "epoch": 0.0355921052631579, + "grad_norm": 2.625, + "grad_norm_var": 0.028059895833333334, + "learning_rate": 0.0001, + "loss": 3.4667, + "loss/crossentropy": 2.4809056520462036, + "loss/hidden": 3.1078125, + "loss/incoh": 0.0, + "loss/logits": 0.32154888212680816, + "loss/reg": 0.0, + "step": 5410 + }, + { + "epoch": 0.035657894736842104, + "grad_norm": 2.5625, + "grad_norm_var": 0.19182840983072916, + "learning_rate": 0.0001, + "loss": 3.528, + "loss/crossentropy": 2.3937729835510253, + "loss/hidden": 3.1234375, + "loss/incoh": 0.0, + "loss/logits": 0.29925636053085325, + "loss/reg": 0.0, + "step": 5420 + }, + { + "epoch": 0.03572368421052632, + "grad_norm": 2.734375, + "grad_norm_var": 0.04440104166666667, + "learning_rate": 0.0001, + "loss": 3.4433, + "loss/crossentropy": 2.604015350341797, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.28630980402231215, + "loss/reg": 0.0, + "step": 5430 + }, + { + "epoch": 0.035789473684210524, + "grad_norm": 2.34375, + "grad_norm_var": 0.057454427083333336, + "learning_rate": 0.0001, + "loss": 3.3938, + "loss/crossentropy": 2.3647801518440246, + "loss/hidden": 3.1421875, + "loss/incoh": 0.0, + "loss/logits": 0.2772139713168144, + "loss/reg": 0.0, + "step": 5440 + }, + { + "epoch": 0.03585526315789474, + "grad_norm": 2.46875, + "grad_norm_var": 0.06382548014322917, + "learning_rate": 0.0001, + "loss": 3.4874, + "loss/crossentropy": 2.4227387428283693, + "loss/hidden": 3.3890625, + "loss/incoh": 0.0, + "loss/logits": 0.3498344630002975, + "loss/reg": 0.0, + "step": 5450 + }, + { + "epoch": 0.03592105263157895, + "grad_norm": 2.5625, + "grad_norm_var": 0.5879221598307292, + "learning_rate": 0.0001, + "loss": 3.5541, + "loss/crossentropy": 2.024171155691147, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.25232082083821294, + "loss/reg": 0.0, + "step": 5460 + }, + { + "epoch": 0.03598684210526316, + "grad_norm": 2.78125, + "grad_norm_var": 1.2988596598307292, + "learning_rate": 0.0001, + "loss": 3.4549, + "loss/crossentropy": 2.5548394203186033, + "loss/hidden": 3.0234375, + "loss/incoh": 0.0, + "loss/logits": 0.29853117763996123, + "loss/reg": 0.0, + "step": 5470 + }, + { + "epoch": 0.03605263157894737, + "grad_norm": 3.234375, + "grad_norm_var": 1.458153279622396, + "learning_rate": 0.0001, + "loss": 3.5047, + "loss/crossentropy": 2.520615005493164, + "loss/hidden": 3.075, + "loss/incoh": 0.0, + "loss/logits": 0.31394066512584684, + "loss/reg": 0.0, + "step": 5480 + }, + { + "epoch": 0.03611842105263158, + "grad_norm": 2.96875, + "grad_norm_var": 0.6611328125, + "learning_rate": 0.0001, + "loss": 3.4932, + "loss/crossentropy": 2.447161090373993, + "loss/hidden": 3.090625, + "loss/incoh": 0.0, + "loss/logits": 0.3091880366206169, + "loss/reg": 0.0, + "step": 5490 + }, + { + "epoch": 0.03618421052631579, + "grad_norm": 2.671875, + "grad_norm_var": 0.05611572265625, + "learning_rate": 0.0001, + "loss": 3.4718, + "loss/crossentropy": 2.354436981678009, + "loss/hidden": 3.0828125, + "loss/incoh": 0.0, + "loss/logits": 0.30545540153980255, + "loss/reg": 0.0, + "step": 5500 + }, + { + "epoch": 0.03625, + "grad_norm": 2.828125, + "grad_norm_var": 0.24016011555989583, + "learning_rate": 0.0001, + "loss": 3.5554, + "loss/crossentropy": 2.350696861743927, + "loss/hidden": 3.1578125, + "loss/incoh": 0.0, + "loss/logits": 0.27360412031412124, + "loss/reg": 0.0, + "step": 5510 + }, + { + "epoch": 0.03631578947368421, + "grad_norm": 2.4375, + "grad_norm_var": 0.6210245768229167, + "learning_rate": 0.0001, + "loss": 3.513, + "loss/crossentropy": 2.368817460536957, + "loss/hidden": 3.1796875, + "loss/incoh": 0.0, + "loss/logits": 0.31491883993148806, + "loss/reg": 0.0, + "step": 5520 + }, + { + "epoch": 0.036381578947368425, + "grad_norm": 2.640625, + "grad_norm_var": 0.16715087890625, + "learning_rate": 0.0001, + "loss": 3.4444, + "loss/crossentropy": 2.3894132494926454, + "loss/hidden": 3.075, + "loss/incoh": 0.0, + "loss/logits": 0.2710603341460228, + "loss/reg": 0.0, + "step": 5530 + }, + { + "epoch": 0.03644736842105263, + "grad_norm": 2.484375, + "grad_norm_var": 2.993724568684896, + "learning_rate": 0.0001, + "loss": 3.5105, + "loss/crossentropy": 2.4798691868782043, + "loss/hidden": 3.25, + "loss/incoh": 0.0, + "loss/logits": 0.422188438475132, + "loss/reg": 0.0, + "step": 5540 + }, + { + "epoch": 0.036513157894736845, + "grad_norm": 3.6875, + "grad_norm_var": 2.864090983072917, + "learning_rate": 0.0001, + "loss": 3.6266, + "loss/crossentropy": 2.499036192893982, + "loss/hidden": 3.21875, + "loss/incoh": 0.0, + "loss/logits": 0.3576551049947739, + "loss/reg": 0.0, + "step": 5550 + }, + { + "epoch": 0.03657894736842105, + "grad_norm": 2.59375, + "grad_norm_var": 0.2598052978515625, + "learning_rate": 0.0001, + "loss": 3.5659, + "loss/crossentropy": 2.4270546317100523, + "loss/hidden": 3.23125, + "loss/incoh": 0.0, + "loss/logits": 0.4616221562027931, + "loss/reg": 0.0, + "step": 5560 + }, + { + "epoch": 0.036644736842105265, + "grad_norm": 3.03125, + "grad_norm_var": 0.39485270182291665, + "learning_rate": 0.0001, + "loss": 3.5978, + "loss/crossentropy": 2.427480709552765, + "loss/hidden": 3.7875, + "loss/incoh": 0.0, + "loss/logits": 0.38075721710920335, + "loss/reg": 0.0, + "step": 5570 + }, + { + "epoch": 0.03671052631578947, + "grad_norm": 2.390625, + "grad_norm_var": 1.5193318684895833, + "learning_rate": 0.0001, + "loss": 3.5541, + "loss/crossentropy": 2.2717662811279298, + "loss/hidden": 3.2359375, + "loss/incoh": 0.0, + "loss/logits": 0.2794697627425194, + "loss/reg": 0.0, + "step": 5580 + }, + { + "epoch": 0.036776315789473685, + "grad_norm": 2.59375, + "grad_norm_var": 1.5052734375, + "learning_rate": 0.0001, + "loss": 3.5513, + "loss/crossentropy": 2.549258255958557, + "loss/hidden": 3.1265625, + "loss/incoh": 0.0, + "loss/logits": 0.3551526039838791, + "loss/reg": 0.0, + "step": 5590 + }, + { + "epoch": 0.03684210526315789, + "grad_norm": 2.78125, + "grad_norm_var": 0.12431538899739583, + "learning_rate": 0.0001, + "loss": 3.5166, + "loss/crossentropy": 2.42179411649704, + "loss/hidden": 3.265625, + "loss/incoh": 0.0, + "loss/logits": 0.349351167678833, + "loss/reg": 0.0, + "step": 5600 + }, + { + "epoch": 0.036907894736842105, + "grad_norm": 2.40625, + "grad_norm_var": 0.09789937337239583, + "learning_rate": 0.0001, + "loss": 3.4519, + "loss/crossentropy": 2.275598430633545, + "loss/hidden": 3.2015625, + "loss/incoh": 0.0, + "loss/logits": 0.32426146864891053, + "loss/reg": 0.0, + "step": 5610 + }, + { + "epoch": 0.03697368421052632, + "grad_norm": 2.84375, + "grad_norm_var": 0.10132548014322916, + "learning_rate": 0.0001, + "loss": 3.4632, + "loss/crossentropy": 2.4317312955856325, + "loss/hidden": 3.3171875, + "loss/incoh": 0.0, + "loss/logits": 0.3550658613443375, + "loss/reg": 0.0, + "step": 5620 + }, + { + "epoch": 0.037039473684210525, + "grad_norm": 25.0, + "grad_norm_var": 167.39566650390626, + "learning_rate": 0.0001, + "loss": 3.6136, + "loss/crossentropy": 2.5963298320770263, + "loss/hidden": 3.1546875, + "loss/incoh": 0.0, + "loss/logits": 0.34572866559028625, + "loss/reg": 0.0, + "step": 5630 + }, + { + "epoch": 0.03710526315789474, + "grad_norm": 3.0, + "grad_norm_var": 167.50836486816405, + "learning_rate": 0.0001, + "loss": 3.5122, + "loss/crossentropy": 2.370168614387512, + "loss/hidden": 3.090625, + "loss/incoh": 0.0, + "loss/logits": 0.316168874502182, + "loss/reg": 0.0, + "step": 5640 + }, + { + "epoch": 0.037171052631578945, + "grad_norm": 2.921875, + "grad_norm_var": 0.05074462890625, + "learning_rate": 0.0001, + "loss": 3.4512, + "loss/crossentropy": 2.1566815614700316, + "loss/hidden": 3.109375, + "loss/incoh": 0.0, + "loss/logits": 0.30936725735664367, + "loss/reg": 0.0, + "step": 5650 + }, + { + "epoch": 0.03723684210526316, + "grad_norm": 3.546875, + "grad_norm_var": 0.1086822509765625, + "learning_rate": 0.0001, + "loss": 3.4308, + "loss/crossentropy": 2.315059244632721, + "loss/hidden": 3.1609375, + "loss/incoh": 0.0, + "loss/logits": 0.3344813346862793, + "loss/reg": 0.0, + "step": 5660 + }, + { + "epoch": 0.037302631578947365, + "grad_norm": 2.984375, + "grad_norm_var": 0.22506510416666667, + "learning_rate": 0.0001, + "loss": 3.5705, + "loss/crossentropy": 2.063437449932098, + "loss/hidden": 3.1421875, + "loss/incoh": 0.0, + "loss/logits": 0.2783856257796288, + "loss/reg": 0.0, + "step": 5670 + }, + { + "epoch": 0.03736842105263158, + "grad_norm": 2.75, + "grad_norm_var": 0.48176981608072916, + "learning_rate": 0.0001, + "loss": 3.5285, + "loss/crossentropy": 2.564136099815369, + "loss/hidden": 3.2453125, + "loss/incoh": 0.0, + "loss/logits": 0.3260859474539757, + "loss/reg": 0.0, + "step": 5680 + }, + { + "epoch": 0.03743421052631579, + "grad_norm": 2.78125, + "grad_norm_var": 1.2526519775390625, + "learning_rate": 0.0001, + "loss": 3.581, + "loss/crossentropy": 2.4384737968444825, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.2700443536043167, + "loss/reg": 0.0, + "step": 5690 + }, + { + "epoch": 0.0375, + "grad_norm": 2.53125, + "grad_norm_var": 1.029296875, + "learning_rate": 0.0001, + "loss": 3.5309, + "loss/crossentropy": 2.656829285621643, + "loss/hidden": 3.1328125, + "loss/incoh": 0.0, + "loss/logits": 0.34218672215938567, + "loss/reg": 0.0, + "step": 5700 + }, + { + "epoch": 0.03756578947368421, + "grad_norm": 2.375, + "grad_norm_var": 5.287238566080729, + "learning_rate": 0.0001, + "loss": 3.6012, + "loss/crossentropy": 2.3537548005580904, + "loss/hidden": 3.1875, + "loss/incoh": 0.0, + "loss/logits": 0.33707170784473417, + "loss/reg": 0.0, + "step": 5710 + }, + { + "epoch": 0.03763157894736842, + "grad_norm": 2.359375, + "grad_norm_var": 0.6506795247395833, + "learning_rate": 0.0001, + "loss": 3.4987, + "loss/crossentropy": 2.217307722568512, + "loss/hidden": 3.31875, + "loss/incoh": 0.0, + "loss/logits": 0.36700052917003634, + "loss/reg": 0.0, + "step": 5720 + }, + { + "epoch": 0.03769736842105263, + "grad_norm": 2.796875, + "grad_norm_var": 235.47136942545572, + "learning_rate": 0.0001, + "loss": 3.4822, + "loss/crossentropy": 2.39666086435318, + "loss/hidden": 3.0828125, + "loss/incoh": 0.0, + "loss/logits": 0.322082930803299, + "loss/reg": 0.0, + "step": 5730 + }, + { + "epoch": 0.03776315789473684, + "grad_norm": 2.65625, + "grad_norm_var": 48.15474853515625, + "learning_rate": 0.0001, + "loss": 3.5338, + "loss/crossentropy": 2.5715784192085267, + "loss/hidden": 3.103125, + "loss/incoh": 0.0, + "loss/logits": 0.31636003255844114, + "loss/reg": 0.0, + "step": 5740 + }, + { + "epoch": 0.03782894736842105, + "grad_norm": 2.375, + "grad_norm_var": 48.25117899576823, + "learning_rate": 0.0001, + "loss": 3.345, + "loss/crossentropy": 2.1092816948890687, + "loss/hidden": 2.9640625, + "loss/incoh": 0.0, + "loss/logits": 0.27086046040058137, + "loss/reg": 0.0, + "step": 5750 + }, + { + "epoch": 0.037894736842105266, + "grad_norm": 3.34375, + "grad_norm_var": 0.1339508056640625, + "learning_rate": 0.0001, + "loss": 3.4478, + "loss/crossentropy": 2.20155810713768, + "loss/hidden": 3.209375, + "loss/incoh": 0.0, + "loss/logits": 0.3250092178583145, + "loss/reg": 0.0, + "step": 5760 + }, + { + "epoch": 0.03796052631578947, + "grad_norm": 2.4375, + "grad_norm_var": 0.5598592122395833, + "learning_rate": 0.0001, + "loss": 3.4268, + "loss/crossentropy": 2.2270141005516053, + "loss/hidden": 2.959375, + "loss/incoh": 0.0, + "loss/logits": 0.2702811732888222, + "loss/reg": 0.0, + "step": 5770 + }, + { + "epoch": 0.038026315789473686, + "grad_norm": 2.53125, + "grad_norm_var": 0.1083892822265625, + "learning_rate": 0.0001, + "loss": 3.4578, + "loss/crossentropy": 2.4509124517440797, + "loss/hidden": 3.1109375, + "loss/incoh": 0.0, + "loss/logits": 0.3049029678106308, + "loss/reg": 0.0, + "step": 5780 + }, + { + "epoch": 0.03809210526315789, + "grad_norm": 3.21875, + "grad_norm_var": 0.20041402180989584, + "learning_rate": 0.0001, + "loss": 3.4516, + "loss/crossentropy": 2.1973756074905397, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.2764800027012825, + "loss/reg": 0.0, + "step": 5790 + }, + { + "epoch": 0.038157894736842106, + "grad_norm": 2.3125, + "grad_norm_var": 0.17437235514322916, + "learning_rate": 0.0001, + "loss": 3.3789, + "loss/crossentropy": 2.4859437584877013, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.28125611394643785, + "loss/reg": 0.0, + "step": 5800 + }, + { + "epoch": 0.03822368421052631, + "grad_norm": 2.484375, + "grad_norm_var": 1.9865234375, + "learning_rate": 0.0001, + "loss": 3.4479, + "loss/crossentropy": 2.6306315779685976, + "loss/hidden": 3.440625, + "loss/incoh": 0.0, + "loss/logits": 0.2930992156267166, + "loss/reg": 0.0, + "step": 5810 + }, + { + "epoch": 0.038289473684210526, + "grad_norm": 2.40625, + "grad_norm_var": 1.902880859375, + "learning_rate": 0.0001, + "loss": 3.482, + "loss/crossentropy": 2.4275772333145142, + "loss/hidden": 3.0953125, + "loss/incoh": 0.0, + "loss/logits": 0.32571674734354017, + "loss/reg": 0.0, + "step": 5820 + }, + { + "epoch": 0.03835526315789474, + "grad_norm": 2.625, + "grad_norm_var": 0.040087890625, + "learning_rate": 0.0001, + "loss": 3.3832, + "loss/crossentropy": 2.348308402299881, + "loss/hidden": 3.140625, + "loss/incoh": 0.0, + "loss/logits": 0.325964193046093, + "loss/reg": 0.0, + "step": 5830 + }, + { + "epoch": 0.038421052631578946, + "grad_norm": 3.796875, + "grad_norm_var": 0.8002115885416666, + "learning_rate": 0.0001, + "loss": 3.5519, + "loss/crossentropy": 2.1252057909965516, + "loss/hidden": 3.2984375, + "loss/incoh": 0.0, + "loss/logits": 0.3046886622905731, + "loss/reg": 0.0, + "step": 5840 + }, + { + "epoch": 0.03848684210526316, + "grad_norm": 2.625, + "grad_norm_var": 0.2637685139973958, + "learning_rate": 0.0001, + "loss": 3.4841, + "loss/crossentropy": 2.5330613613128663, + "loss/hidden": 3.025, + "loss/incoh": 0.0, + "loss/logits": 0.31516623198986055, + "loss/reg": 0.0, + "step": 5850 + }, + { + "epoch": 0.038552631578947366, + "grad_norm": 2.3125, + "grad_norm_var": 0.24798075358072916, + "learning_rate": 0.0001, + "loss": 3.4625, + "loss/crossentropy": 2.324964237213135, + "loss/hidden": 3.0921875, + "loss/incoh": 0.0, + "loss/logits": 0.29473926275968554, + "loss/reg": 0.0, + "step": 5860 + }, + { + "epoch": 0.03861842105263158, + "grad_norm": 2.609375, + "grad_norm_var": 0.12890218098958334, + "learning_rate": 0.0001, + "loss": 3.3932, + "loss/crossentropy": 2.436046540737152, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.28259717375040055, + "loss/reg": 0.0, + "step": 5870 + }, + { + "epoch": 0.038684210526315786, + "grad_norm": 2.59375, + "grad_norm_var": 0.09658101399739584, + "learning_rate": 0.0001, + "loss": 3.4335, + "loss/crossentropy": 2.3942569494247437, + "loss/hidden": 3.040625, + "loss/incoh": 0.0, + "loss/logits": 0.29623564779758454, + "loss/reg": 0.0, + "step": 5880 + }, + { + "epoch": 0.03875, + "grad_norm": 2.421875, + "grad_norm_var": 0.0283355712890625, + "learning_rate": 0.0001, + "loss": 3.4179, + "loss/crossentropy": 2.675841474533081, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.2852372720837593, + "loss/reg": 0.0, + "step": 5890 + }, + { + "epoch": 0.03881578947368421, + "grad_norm": 2.609375, + "grad_norm_var": 0.25607096354166664, + "learning_rate": 0.0001, + "loss": 3.4508, + "loss/crossentropy": 2.516406524181366, + "loss/hidden": 3.35, + "loss/incoh": 0.0, + "loss/logits": 0.35599096268415453, + "loss/reg": 0.0, + "step": 5900 + }, + { + "epoch": 0.03888157894736842, + "grad_norm": 2.640625, + "grad_norm_var": 0.19685872395833334, + "learning_rate": 0.0001, + "loss": 3.3701, + "loss/crossentropy": 2.118768775463104, + "loss/hidden": 3.234375, + "loss/incoh": 0.0, + "loss/logits": 0.3433256149291992, + "loss/reg": 0.0, + "step": 5910 + }, + { + "epoch": 0.03894736842105263, + "grad_norm": 2.75, + "grad_norm_var": 0.10144856770833334, + "learning_rate": 0.0001, + "loss": 3.4698, + "loss/crossentropy": 2.4102617263793946, + "loss/hidden": 3.059375, + "loss/incoh": 0.0, + "loss/logits": 0.2903832048177719, + "loss/reg": 0.0, + "step": 5920 + }, + { + "epoch": 0.03901315789473684, + "grad_norm": 3.25, + "grad_norm_var": 0.05016276041666667, + "learning_rate": 0.0001, + "loss": 3.3435, + "loss/crossentropy": 2.4927979469299317, + "loss/hidden": 3.046875, + "loss/incoh": 0.0, + "loss/logits": 0.2977334216237068, + "loss/reg": 0.0, + "step": 5930 + }, + { + "epoch": 0.03907894736842105, + "grad_norm": 2.25, + "grad_norm_var": 0.2647043863932292, + "learning_rate": 0.0001, + "loss": 3.4123, + "loss/crossentropy": 2.163569325208664, + "loss/hidden": 3.1515625, + "loss/incoh": 0.0, + "loss/logits": 0.27049526423215864, + "loss/reg": 0.0, + "step": 5940 + }, + { + "epoch": 0.03914473684210526, + "grad_norm": 2.765625, + "grad_norm_var": 0.1790679931640625, + "learning_rate": 0.0001, + "loss": 3.4968, + "loss/crossentropy": 2.314089775085449, + "loss/hidden": 3.290625, + "loss/incoh": 0.0, + "loss/logits": 0.2804916575551033, + "loss/reg": 0.0, + "step": 5950 + }, + { + "epoch": 0.03921052631578947, + "grad_norm": 2.578125, + "grad_norm_var": 0.10930989583333334, + "learning_rate": 0.0001, + "loss": 3.4204, + "loss/crossentropy": 2.5095210552215574, + "loss/hidden": 3.1890625, + "loss/incoh": 0.0, + "loss/logits": 0.34905528128147123, + "loss/reg": 0.0, + "step": 5960 + }, + { + "epoch": 0.03927631578947369, + "grad_norm": 2.5, + "grad_norm_var": 0.024348958333333334, + "learning_rate": 0.0001, + "loss": 3.3368, + "loss/crossentropy": 2.3903687596321106, + "loss/hidden": 3.228125, + "loss/incoh": 0.0, + "loss/logits": 0.36534676551818845, + "loss/reg": 0.0, + "step": 5970 + }, + { + "epoch": 0.039342105263157894, + "grad_norm": 2.703125, + "grad_norm_var": 0.04106343587239583, + "learning_rate": 0.0001, + "loss": 3.3975, + "loss/crossentropy": 2.485898661613464, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.28061621338129045, + "loss/reg": 0.0, + "step": 5980 + }, + { + "epoch": 0.03940789473684211, + "grad_norm": 2.578125, + "grad_norm_var": 0.0488677978515625, + "learning_rate": 0.0001, + "loss": 3.3803, + "loss/crossentropy": 2.4124781847000123, + "loss/hidden": 3.009375, + "loss/incoh": 0.0, + "loss/logits": 0.30112328827381135, + "loss/reg": 0.0, + "step": 5990 + }, + { + "epoch": 0.039473684210526314, + "grad_norm": 2.734375, + "grad_norm_var": 0.44810791015625, + "learning_rate": 0.0001, + "loss": 3.472, + "loss/crossentropy": 2.4459670901298525, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.2799991726875305, + "loss/reg": 0.0, + "step": 6000 + }, + { + "epoch": 0.03953947368421053, + "grad_norm": 2.5625, + "grad_norm_var": 0.1685546875, + "learning_rate": 0.0001, + "loss": 3.4322, + "loss/crossentropy": 2.641672468185425, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.31368287801742556, + "loss/reg": 0.0, + "step": 6010 + }, + { + "epoch": 0.039605263157894734, + "grad_norm": 2.546875, + "grad_norm_var": 0.03518473307291667, + "learning_rate": 0.0001, + "loss": 3.4325, + "loss/crossentropy": 2.2493654131889342, + "loss/hidden": 3.134375, + "loss/incoh": 0.0, + "loss/logits": 0.29540172666311265, + "loss/reg": 0.0, + "step": 6020 + }, + { + "epoch": 0.03967105263157895, + "grad_norm": 2.25, + "grad_norm_var": 0.04810282389322917, + "learning_rate": 0.0001, + "loss": 3.408, + "loss/crossentropy": 2.5626556158065794, + "loss/hidden": 3.178125, + "loss/incoh": 0.0, + "loss/logits": 0.3445401757955551, + "loss/reg": 0.0, + "step": 6030 + }, + { + "epoch": 0.03973684210526316, + "grad_norm": 3.375, + "grad_norm_var": 0.11850484212239583, + "learning_rate": 0.0001, + "loss": 3.3697, + "loss/crossentropy": 2.2249147415161135, + "loss/hidden": 3.165625, + "loss/incoh": 0.0, + "loss/logits": 0.3320572040975094, + "loss/reg": 0.0, + "step": 6040 + }, + { + "epoch": 0.03980263157894737, + "grad_norm": 2.34375, + "grad_norm_var": 0.1265045166015625, + "learning_rate": 0.0001, + "loss": 3.3726, + "loss/crossentropy": 2.479216980934143, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.3085744693875313, + "loss/reg": 0.0, + "step": 6050 + }, + { + "epoch": 0.03986842105263158, + "grad_norm": 2.265625, + "grad_norm_var": 0.2034088134765625, + "learning_rate": 0.0001, + "loss": 3.4346, + "loss/crossentropy": 2.3974440932273864, + "loss/hidden": 3.259375, + "loss/incoh": 0.0, + "loss/logits": 0.4024490460753441, + "loss/reg": 0.0, + "step": 6060 + }, + { + "epoch": 0.03993421052631579, + "grad_norm": 2.640625, + "grad_norm_var": 0.19241434733072918, + "learning_rate": 0.0001, + "loss": 3.3911, + "loss/crossentropy": 2.3311298370361326, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.2968046382069588, + "loss/reg": 0.0, + "step": 6070 + }, + { + "epoch": 0.04, + "grad_norm": 2.734375, + "grad_norm_var": 0.060445149739583336, + "learning_rate": 0.0001, + "loss": 3.5175, + "loss/crossentropy": 2.6493954181671144, + "loss/hidden": 3.2828125, + "loss/incoh": 0.0, + "loss/logits": 0.3133848324418068, + "loss/reg": 0.0, + "step": 6080 + }, + { + "epoch": 0.04006578947368421, + "grad_norm": 2.34375, + "grad_norm_var": 0.2754140218098958, + "learning_rate": 0.0001, + "loss": 3.3932, + "loss/crossentropy": 2.4315222024917604, + "loss/hidden": 3.159375, + "loss/incoh": 0.0, + "loss/logits": 0.3378627926111221, + "loss/reg": 0.0, + "step": 6090 + }, + { + "epoch": 0.04013157894736842, + "grad_norm": 2.5625, + "grad_norm_var": 0.04810282389322917, + "learning_rate": 0.0001, + "loss": 3.3698, + "loss/crossentropy": 2.421731984615326, + "loss/hidden": 3.315625, + "loss/incoh": 0.0, + "loss/logits": 0.3809585988521576, + "loss/reg": 0.0, + "step": 6100 + }, + { + "epoch": 0.040197368421052634, + "grad_norm": 2.328125, + "grad_norm_var": 0.0509429931640625, + "learning_rate": 0.0001, + "loss": 3.3839, + "loss/crossentropy": 2.2816696763038635, + "loss/hidden": 3.0390625, + "loss/incoh": 0.0, + "loss/logits": 0.3188880756497383, + "loss/reg": 0.0, + "step": 6110 + }, + { + "epoch": 0.04026315789473684, + "grad_norm": 2.421875, + "grad_norm_var": 0.26266276041666664, + "learning_rate": 0.0001, + "loss": 3.4852, + "loss/crossentropy": 2.4251498103141786, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.3131637305021286, + "loss/reg": 0.0, + "step": 6120 + }, + { + "epoch": 0.040328947368421054, + "grad_norm": 2.046875, + "grad_norm_var": 0.3377919514973958, + "learning_rate": 0.0001, + "loss": 3.4191, + "loss/crossentropy": 2.321718716621399, + "loss/hidden": 3.028125, + "loss/incoh": 0.0, + "loss/logits": 0.28195892125368116, + "loss/reg": 0.0, + "step": 6130 + }, + { + "epoch": 0.04039473684210526, + "grad_norm": 2.515625, + "grad_norm_var": 0.0685943603515625, + "learning_rate": 0.0001, + "loss": 3.4412, + "loss/crossentropy": 2.658420753479004, + "loss/hidden": 3.125, + "loss/incoh": 0.0, + "loss/logits": 0.3355125278234482, + "loss/reg": 0.0, + "step": 6140 + }, + { + "epoch": 0.040460526315789475, + "grad_norm": 2.1875, + "grad_norm_var": 0.059305826822916664, + "learning_rate": 0.0001, + "loss": 3.3292, + "loss/crossentropy": 2.5203867316246034, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.3060264021158218, + "loss/reg": 0.0, + "step": 6150 + }, + { + "epoch": 0.04052631578947368, + "grad_norm": 3.078125, + "grad_norm_var": 0.052302042643229164, + "learning_rate": 0.0001, + "loss": 3.3844, + "loss/crossentropy": 2.53420695066452, + "loss/hidden": 3.178125, + "loss/incoh": 0.0, + "loss/logits": 0.3124456375837326, + "loss/reg": 0.0, + "step": 6160 + }, + { + "epoch": 0.040592105263157895, + "grad_norm": 2.375, + "grad_norm_var": 0.049779256184895836, + "learning_rate": 0.0001, + "loss": 3.3634, + "loss/crossentropy": 2.5132151365280153, + "loss/hidden": 3.128125, + "loss/incoh": 0.0, + "loss/logits": 0.35312571823596955, + "loss/reg": 0.0, + "step": 6170 + }, + { + "epoch": 0.04065789473684211, + "grad_norm": 2.46875, + "grad_norm_var": 0.16035054524739584, + "learning_rate": 0.0001, + "loss": 3.4426, + "loss/crossentropy": 2.3004459500312806, + "loss/hidden": 3.1703125, + "loss/incoh": 0.0, + "loss/logits": 0.28007449954748154, + "loss/reg": 0.0, + "step": 6180 + }, + { + "epoch": 0.040723684210526315, + "grad_norm": 2.515625, + "grad_norm_var": 0.1358306884765625, + "learning_rate": 0.0001, + "loss": 3.3068, + "loss/crossentropy": 2.373980039358139, + "loss/hidden": 3.015625, + "loss/incoh": 0.0, + "loss/logits": 0.28091391175985336, + "loss/reg": 0.0, + "step": 6190 + }, + { + "epoch": 0.04078947368421053, + "grad_norm": 2.84375, + "grad_norm_var": 0.08801981608072916, + "learning_rate": 0.0001, + "loss": 3.4006, + "loss/crossentropy": 2.2023098945617674, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.2726595625281334, + "loss/reg": 0.0, + "step": 6200 + }, + { + "epoch": 0.040855263157894735, + "grad_norm": 2.984375, + "grad_norm_var": 3.6811187744140623, + "learning_rate": 0.0001, + "loss": 3.6073, + "loss/crossentropy": 2.382032370567322, + "loss/hidden": 3.6921875, + "loss/incoh": 0.0, + "loss/logits": 0.4323269993066788, + "loss/reg": 0.0, + "step": 6210 + }, + { + "epoch": 0.04092105263157895, + "grad_norm": 2.296875, + "grad_norm_var": 3.804295857747396, + "learning_rate": 0.0001, + "loss": 3.3627, + "loss/crossentropy": 2.569228994846344, + "loss/hidden": 3.0453125, + "loss/incoh": 0.0, + "loss/logits": 0.31191317439079286, + "loss/reg": 0.0, + "step": 6220 + }, + { + "epoch": 0.040986842105263155, + "grad_norm": 2.828125, + "grad_norm_var": 0.08497721354166667, + "learning_rate": 0.0001, + "loss": 3.3414, + "loss/crossentropy": 2.5153043985366823, + "loss/hidden": 2.9609375, + "loss/incoh": 0.0, + "loss/logits": 0.30689269602298735, + "loss/reg": 0.0, + "step": 6230 + }, + { + "epoch": 0.04105263157894737, + "grad_norm": 2.375, + "grad_norm_var": 1.2814849853515624, + "learning_rate": 0.0001, + "loss": 3.4386, + "loss/crossentropy": 2.283157765865326, + "loss/hidden": 3.1203125, + "loss/incoh": 0.0, + "loss/logits": 0.3153227433562279, + "loss/reg": 0.0, + "step": 6240 + }, + { + "epoch": 0.04111842105263158, + "grad_norm": 2.234375, + "grad_norm_var": 5.054263305664063, + "learning_rate": 0.0001, + "loss": 3.4379, + "loss/crossentropy": 2.585819673538208, + "loss/hidden": 2.9640625, + "loss/incoh": 0.0, + "loss/logits": 0.2913993000984192, + "loss/reg": 0.0, + "step": 6250 + }, + { + "epoch": 0.04118421052631579, + "grad_norm": 2.5, + "grad_norm_var": 0.0501373291015625, + "learning_rate": 0.0001, + "loss": 3.3357, + "loss/crossentropy": 2.453801620006561, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.3249350532889366, + "loss/reg": 0.0, + "step": 6260 + }, + { + "epoch": 0.04125, + "grad_norm": 2.5, + "grad_norm_var": 0.0405181884765625, + "learning_rate": 0.0001, + "loss": 3.3525, + "loss/crossentropy": 2.4949014663696287, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.2902356445789337, + "loss/reg": 0.0, + "step": 6270 + }, + { + "epoch": 0.04131578947368421, + "grad_norm": 2.78125, + "grad_norm_var": 1.6241770426432292, + "learning_rate": 0.0001, + "loss": 3.4444, + "loss/crossentropy": 2.029393529891968, + "loss/hidden": 3.1125, + "loss/incoh": 0.0, + "loss/logits": 0.30831936225295065, + "loss/reg": 0.0, + "step": 6280 + }, + { + "epoch": 0.04138157894736842, + "grad_norm": 2.53125, + "grad_norm_var": 0.0864654541015625, + "learning_rate": 0.0001, + "loss": 3.3944, + "loss/crossentropy": 3.0075352430343627, + "loss/hidden": 3.2390625, + "loss/incoh": 0.0, + "loss/logits": 0.4476942718029022, + "loss/reg": 0.0, + "step": 6290 + }, + { + "epoch": 0.04144736842105263, + "grad_norm": 2.515625, + "grad_norm_var": 0.15415751139322917, + "learning_rate": 0.0001, + "loss": 3.4091, + "loss/crossentropy": 2.36070739030838, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.26602610796689985, + "loss/reg": 0.0, + "step": 6300 + }, + { + "epoch": 0.04151315789473684, + "grad_norm": 3.0, + "grad_norm_var": 0.18596903483072916, + "learning_rate": 0.0001, + "loss": 3.4135, + "loss/crossentropy": 2.186020624637604, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.30242343842983244, + "loss/reg": 0.0, + "step": 6310 + }, + { + "epoch": 0.041578947368421056, + "grad_norm": 3.34375, + "grad_norm_var": 0.06327718098958333, + "learning_rate": 0.0001, + "loss": 3.44, + "loss/crossentropy": 2.3751362919807435, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.29995152205228803, + "loss/reg": 0.0, + "step": 6320 + }, + { + "epoch": 0.04164473684210526, + "grad_norm": 7.9375, + "grad_norm_var": 3.442041015625, + "learning_rate": 0.0001, + "loss": 3.4985, + "loss/crossentropy": 2.2889585196971893, + "loss/hidden": 3.14375, + "loss/incoh": 0.0, + "loss/logits": 0.5501813948154449, + "loss/reg": 0.0, + "step": 6330 + }, + { + "epoch": 0.041710526315789476, + "grad_norm": 2.875, + "grad_norm_var": 3.8211822509765625, + "learning_rate": 0.0001, + "loss": 3.4688, + "loss/crossentropy": 2.378446078300476, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.30127355754375457, + "loss/reg": 0.0, + "step": 6340 + }, + { + "epoch": 0.04177631578947368, + "grad_norm": 3.109375, + "grad_norm_var": 1.1990793863932292, + "learning_rate": 0.0001, + "loss": 3.3813, + "loss/crossentropy": 2.444005084037781, + "loss/hidden": 3.209375, + "loss/incoh": 0.0, + "loss/logits": 0.3470224469900131, + "loss/reg": 0.0, + "step": 6350 + }, + { + "epoch": 0.041842105263157896, + "grad_norm": 2.234375, + "grad_norm_var": 0.7403065999348958, + "learning_rate": 0.0001, + "loss": 3.4, + "loss/crossentropy": 2.026668357849121, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.2548553004860878, + "loss/reg": 0.0, + "step": 6360 + }, + { + "epoch": 0.0419078947368421, + "grad_norm": 2.921875, + "grad_norm_var": 0.3556925455729167, + "learning_rate": 0.0001, + "loss": 3.4591, + "loss/crossentropy": 2.331345629692078, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.26911164075136185, + "loss/reg": 0.0, + "step": 6370 + }, + { + "epoch": 0.041973684210526316, + "grad_norm": 2.484375, + "grad_norm_var": 0.27215169270833334, + "learning_rate": 0.0001, + "loss": 3.4011, + "loss/crossentropy": 2.2243176221847536, + "loss/hidden": 3.0875, + "loss/incoh": 0.0, + "loss/logits": 0.30012439042329786, + "loss/reg": 0.0, + "step": 6380 + }, + { + "epoch": 0.04203947368421053, + "grad_norm": 2.625, + "grad_norm_var": 0.0267974853515625, + "learning_rate": 0.0001, + "loss": 3.3476, + "loss/crossentropy": 2.284479832649231, + "loss/hidden": 3.034375, + "loss/incoh": 0.0, + "loss/logits": 0.3452886208891869, + "loss/reg": 0.0, + "step": 6390 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 2.859375, + "grad_norm_var": 0.18044331868489583, + "learning_rate": 0.0001, + "loss": 3.4206, + "loss/crossentropy": 2.2036523103713987, + "loss/hidden": 3.0515625, + "loss/incoh": 0.0, + "loss/logits": 0.2783824667334557, + "loss/reg": 0.0, + "step": 6400 + }, + { + "epoch": 0.04217105263157895, + "grad_norm": 2.34375, + "grad_norm_var": 4.91226806640625, + "learning_rate": 0.0001, + "loss": 3.4892, + "loss/crossentropy": 2.2305456399917603, + "loss/hidden": 3.125, + "loss/incoh": 0.0, + "loss/logits": 0.29731594026088715, + "loss/reg": 0.0, + "step": 6410 + }, + { + "epoch": 0.042236842105263156, + "grad_norm": 2.15625, + "grad_norm_var": 0.8588216145833333, + "learning_rate": 0.0001, + "loss": 3.3759, + "loss/crossentropy": 2.186525213718414, + "loss/hidden": 3.090625, + "loss/incoh": 0.0, + "loss/logits": 0.31351439356803895, + "loss/reg": 0.0, + "step": 6420 + }, + { + "epoch": 0.04230263157894737, + "grad_norm": 2.421875, + "grad_norm_var": 1.168097941080729, + "learning_rate": 0.0001, + "loss": 3.4452, + "loss/crossentropy": 2.3028628826141357, + "loss/hidden": 3.29375, + "loss/incoh": 0.0, + "loss/logits": 0.40216329991817473, + "loss/reg": 0.0, + "step": 6430 + }, + { + "epoch": 0.042368421052631576, + "grad_norm": 2.296875, + "grad_norm_var": 0.0708160400390625, + "learning_rate": 0.0001, + "loss": 3.362, + "loss/crossentropy": 2.5469772100448607, + "loss/hidden": 3.10625, + "loss/incoh": 0.0, + "loss/logits": 0.3354289785027504, + "loss/reg": 0.0, + "step": 6440 + }, + { + "epoch": 0.04243421052631579, + "grad_norm": 3.4375, + "grad_norm_var": 0.09075419108072917, + "learning_rate": 0.0001, + "loss": 3.366, + "loss/crossentropy": 2.3079045534133913, + "loss/hidden": 2.9953125, + "loss/incoh": 0.0, + "loss/logits": 0.2784201934933662, + "loss/reg": 0.0, + "step": 6450 + }, + { + "epoch": 0.0425, + "grad_norm": 2.5625, + "grad_norm_var": 0.10932515462239584, + "learning_rate": 0.0001, + "loss": 3.4156, + "loss/crossentropy": 2.325330352783203, + "loss/hidden": 3.1828125, + "loss/incoh": 0.0, + "loss/logits": 0.3098024681210518, + "loss/reg": 0.0, + "step": 6460 + }, + { + "epoch": 0.04256578947368421, + "grad_norm": 2.890625, + "grad_norm_var": 0.07423502604166667, + "learning_rate": 0.0001, + "loss": 3.3605, + "loss/crossentropy": 2.4809486865997314, + "loss/hidden": 3.146875, + "loss/incoh": 0.0, + "loss/logits": 0.336503566801548, + "loss/reg": 0.0, + "step": 6470 + }, + { + "epoch": 0.04263157894736842, + "grad_norm": 2.5, + "grad_norm_var": 0.06294657389322916, + "learning_rate": 0.0001, + "loss": 3.4219, + "loss/crossentropy": 2.357036566734314, + "loss/hidden": 3.16875, + "loss/incoh": 0.0, + "loss/logits": 0.31517077386379244, + "loss/reg": 0.0, + "step": 6480 + }, + { + "epoch": 0.04269736842105263, + "grad_norm": 2.96875, + "grad_norm_var": 0.044188435872395834, + "learning_rate": 0.0001, + "loss": 3.2698, + "loss/crossentropy": 2.3011206150054933, + "loss/hidden": 3.0296875, + "loss/incoh": 0.0, + "loss/logits": 0.3023343622684479, + "loss/reg": 0.0, + "step": 6490 + }, + { + "epoch": 0.04276315789473684, + "grad_norm": 3.75, + "grad_norm_var": 0.191259765625, + "learning_rate": 0.0001, + "loss": 3.3991, + "loss/crossentropy": 2.135625755786896, + "loss/hidden": 3.1328125, + "loss/incoh": 0.0, + "loss/logits": 0.30503824055194856, + "loss/reg": 0.0, + "step": 6500 + }, + { + "epoch": 0.04282894736842105, + "grad_norm": 2.796875, + "grad_norm_var": 0.16857096354166667, + "learning_rate": 0.0001, + "loss": 3.317, + "loss/crossentropy": 2.2963179469108583, + "loss/hidden": 3.090625, + "loss/incoh": 0.0, + "loss/logits": 0.30365400537848475, + "loss/reg": 0.0, + "step": 6510 + }, + { + "epoch": 0.04289473684210526, + "grad_norm": 2.578125, + "grad_norm_var": 0.0795562744140625, + "learning_rate": 0.0001, + "loss": 3.3769, + "loss/crossentropy": 2.463003098964691, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.2864454731345177, + "loss/reg": 0.0, + "step": 6520 + }, + { + "epoch": 0.04296052631578948, + "grad_norm": 3.28125, + "grad_norm_var": 0.1319244384765625, + "learning_rate": 0.0001, + "loss": 3.4418, + "loss/crossentropy": 2.4336194515228273, + "loss/hidden": 3.1328125, + "loss/incoh": 0.0, + "loss/logits": 0.2907303601503372, + "loss/reg": 0.0, + "step": 6530 + }, + { + "epoch": 0.04302631578947368, + "grad_norm": 2.78125, + "grad_norm_var": 0.18455403645833332, + "learning_rate": 0.0001, + "loss": 3.5029, + "loss/crossentropy": 2.4530020356178284, + "loss/hidden": 3.0484375, + "loss/incoh": 0.0, + "loss/logits": 0.3084172964096069, + "loss/reg": 0.0, + "step": 6540 + }, + { + "epoch": 0.0430921052631579, + "grad_norm": 2.625, + "grad_norm_var": 0.0909332275390625, + "learning_rate": 0.0001, + "loss": 3.4113, + "loss/crossentropy": 2.28972727060318, + "loss/hidden": 3.09375, + "loss/incoh": 0.0, + "loss/logits": 0.29103828966617584, + "loss/reg": 0.0, + "step": 6550 + }, + { + "epoch": 0.0431578947368421, + "grad_norm": 3.15625, + "grad_norm_var": 0.09442952473958334, + "learning_rate": 0.0001, + "loss": 3.3842, + "loss/crossentropy": 2.5410515666007996, + "loss/hidden": 2.9390625, + "loss/incoh": 0.0, + "loss/logits": 0.262499050796032, + "loss/reg": 0.0, + "step": 6560 + }, + { + "epoch": 0.04322368421052632, + "grad_norm": 2.625, + "grad_norm_var": 0.5502105712890625, + "learning_rate": 0.0001, + "loss": 3.3708, + "loss/crossentropy": 2.460482358932495, + "loss/hidden": 3.040625, + "loss/incoh": 0.0, + "loss/logits": 0.2910577103495598, + "loss/reg": 0.0, + "step": 6570 + }, + { + "epoch": 0.043289473684210523, + "grad_norm": 2.484375, + "grad_norm_var": 0.5198527018229167, + "learning_rate": 0.0001, + "loss": 3.2716, + "loss/crossentropy": 2.264538216590881, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.26389954835176466, + "loss/reg": 0.0, + "step": 6580 + }, + { + "epoch": 0.04335526315789474, + "grad_norm": 2.609375, + "grad_norm_var": 0.58785400390625, + "learning_rate": 0.0001, + "loss": 3.4431, + "loss/crossentropy": 2.705476760864258, + "loss/hidden": 3.0921875, + "loss/incoh": 0.0, + "loss/logits": 0.3710309460759163, + "loss/reg": 0.0, + "step": 6590 + }, + { + "epoch": 0.04342105263157895, + "grad_norm": 2.71875, + "grad_norm_var": 0.7066721598307292, + "learning_rate": 0.0001, + "loss": 3.3844, + "loss/crossentropy": 2.3109049081802366, + "loss/hidden": 3.0921875, + "loss/incoh": 0.0, + "loss/logits": 0.3492768794298172, + "loss/reg": 0.0, + "step": 6600 + }, + { + "epoch": 0.04348684210526316, + "grad_norm": 2.40625, + "grad_norm_var": 2.2473958333333335, + "learning_rate": 0.0001, + "loss": 3.5366, + "loss/crossentropy": 2.110491228103638, + "loss/hidden": 3.2296875, + "loss/incoh": 0.0, + "loss/logits": 0.3138969212770462, + "loss/reg": 0.0, + "step": 6610 + }, + { + "epoch": 0.04355263157894737, + "grad_norm": 2.265625, + "grad_norm_var": 1.2563222249348958, + "learning_rate": 0.0001, + "loss": 3.3583, + "loss/crossentropy": 2.5283448338508605, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.2928563803434372, + "loss/reg": 0.0, + "step": 6620 + }, + { + "epoch": 0.04361842105263158, + "grad_norm": 2.359375, + "grad_norm_var": 1.2412261962890625, + "learning_rate": 0.0001, + "loss": 3.3885, + "loss/crossentropy": 2.579999303817749, + "loss/hidden": 3.4296875, + "loss/incoh": 0.0, + "loss/logits": 0.4414908319711685, + "loss/reg": 0.0, + "step": 6630 + }, + { + "epoch": 0.04368421052631579, + "grad_norm": 2.546875, + "grad_norm_var": 0.11516825358072917, + "learning_rate": 0.0001, + "loss": 3.3022, + "loss/crossentropy": 2.413753032684326, + "loss/hidden": 2.9546875, + "loss/incoh": 0.0, + "loss/logits": 0.28862773478031156, + "loss/reg": 0.0, + "step": 6640 + }, + { + "epoch": 0.04375, + "grad_norm": 2.828125, + "grad_norm_var": 0.28503316243489585, + "learning_rate": 0.0001, + "loss": 3.4009, + "loss/crossentropy": 2.4203495264053343, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.2926980495452881, + "loss/reg": 0.0, + "step": 6650 + }, + { + "epoch": 0.04381578947368421, + "grad_norm": 2.71875, + "grad_norm_var": 0.07642822265625, + "learning_rate": 0.0001, + "loss": 3.4092, + "loss/crossentropy": 2.3588085770606995, + "loss/hidden": 3.2, + "loss/incoh": 0.0, + "loss/logits": 0.32405087500810625, + "loss/reg": 0.0, + "step": 6660 + }, + { + "epoch": 0.043881578947368424, + "grad_norm": 3.53125, + "grad_norm_var": 0.45767822265625, + "learning_rate": 0.0001, + "loss": 3.4143, + "loss/crossentropy": 2.512442636489868, + "loss/hidden": 3.0671875, + "loss/incoh": 0.0, + "loss/logits": 0.2854305922985077, + "loss/reg": 0.0, + "step": 6670 + }, + { + "epoch": 0.04394736842105263, + "grad_norm": 2.328125, + "grad_norm_var": 0.3211873372395833, + "learning_rate": 0.0001, + "loss": 3.3297, + "loss/crossentropy": 2.5028780698776245, + "loss/hidden": 3.2421875, + "loss/incoh": 0.0, + "loss/logits": 0.3471944749355316, + "loss/reg": 0.0, + "step": 6680 + }, + { + "epoch": 0.044013157894736844, + "grad_norm": 4.0625, + "grad_norm_var": 0.8257720947265625, + "learning_rate": 0.0001, + "loss": 3.4063, + "loss/crossentropy": 2.2031276702880858, + "loss/hidden": 3.0546875, + "loss/incoh": 0.0, + "loss/logits": 0.25473351776599884, + "loss/reg": 0.0, + "step": 6690 + }, + { + "epoch": 0.04407894736842105, + "grad_norm": 2.265625, + "grad_norm_var": 0.22024637858072918, + "learning_rate": 0.0001, + "loss": 3.4035, + "loss/crossentropy": 2.2040748953819276, + "loss/hidden": 3.434375, + "loss/incoh": 0.0, + "loss/logits": 0.3361481264233589, + "loss/reg": 0.0, + "step": 6700 + }, + { + "epoch": 0.044144736842105264, + "grad_norm": 2.15625, + "grad_norm_var": 0.1261383056640625, + "learning_rate": 0.0001, + "loss": 3.4225, + "loss/crossentropy": 2.390383541584015, + "loss/hidden": 3.01875, + "loss/incoh": 0.0, + "loss/logits": 0.2891513243317604, + "loss/reg": 0.0, + "step": 6710 + }, + { + "epoch": 0.04421052631578947, + "grad_norm": 2.65625, + "grad_norm_var": 0.0734375, + "learning_rate": 0.0001, + "loss": 3.4363, + "loss/crossentropy": 2.392831575870514, + "loss/hidden": 3.125, + "loss/incoh": 0.0, + "loss/logits": 0.3119692116975784, + "loss/reg": 0.0, + "step": 6720 + }, + { + "epoch": 0.044276315789473684, + "grad_norm": 2.453125, + "grad_norm_var": 0.12464192708333334, + "learning_rate": 0.0001, + "loss": 3.3959, + "loss/crossentropy": 2.426222395896912, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.2563284829258919, + "loss/reg": 0.0, + "step": 6730 + }, + { + "epoch": 0.0443421052631579, + "grad_norm": 2.890625, + "grad_norm_var": 3.1175201416015623, + "learning_rate": 0.0001, + "loss": 3.4427, + "loss/crossentropy": 2.408197546005249, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.3056713670492172, + "loss/reg": 0.0, + "step": 6740 + }, + { + "epoch": 0.044407894736842105, + "grad_norm": 2.359375, + "grad_norm_var": 2.6096099853515624, + "learning_rate": 0.0001, + "loss": 3.3365, + "loss/crossentropy": 2.5353691220283507, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.30413212031126025, + "loss/reg": 0.0, + "step": 6750 + }, + { + "epoch": 0.04447368421052632, + "grad_norm": 2.3125, + "grad_norm_var": 0.233740234375, + "learning_rate": 0.0001, + "loss": 3.371, + "loss/crossentropy": 2.495334494113922, + "loss/hidden": 3.0640625, + "loss/incoh": 0.0, + "loss/logits": 0.3251196876168251, + "loss/reg": 0.0, + "step": 6760 + }, + { + "epoch": 0.044539473684210525, + "grad_norm": 2.375, + "grad_norm_var": 0.11097005208333334, + "learning_rate": 0.0001, + "loss": 3.3306, + "loss/crossentropy": 2.3767608165740968, + "loss/hidden": 3.0796875, + "loss/incoh": 0.0, + "loss/logits": 0.27886694818735125, + "loss/reg": 0.0, + "step": 6770 + }, + { + "epoch": 0.04460526315789474, + "grad_norm": 2.46875, + "grad_norm_var": 0.17870992024739582, + "learning_rate": 0.0001, + "loss": 3.4315, + "loss/crossentropy": 2.5281002640724184, + "loss/hidden": 2.9984375, + "loss/incoh": 0.0, + "loss/logits": 0.31765572130680086, + "loss/reg": 0.0, + "step": 6780 + }, + { + "epoch": 0.044671052631578945, + "grad_norm": 3.046875, + "grad_norm_var": 3.0139973958333335, + "learning_rate": 0.0001, + "loss": 3.4717, + "loss/crossentropy": 2.362587594985962, + "loss/hidden": 3.1125, + "loss/incoh": 0.0, + "loss/logits": 0.3029619336128235, + "loss/reg": 0.0, + "step": 6790 + }, + { + "epoch": 0.04473684210526316, + "grad_norm": 2.6875, + "grad_norm_var": 7.6724192301432295, + "learning_rate": 0.0001, + "loss": 3.4302, + "loss/crossentropy": 2.4665863275527955, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.2934702351689339, + "loss/reg": 0.0, + "step": 6800 + }, + { + "epoch": 0.04480263157894737, + "grad_norm": 2.453125, + "grad_norm_var": 7.714127604166666, + "learning_rate": 0.0001, + "loss": 3.4034, + "loss/crossentropy": 2.1714313626289368, + "loss/hidden": 3.1328125, + "loss/incoh": 0.0, + "loss/logits": 0.3513699471950531, + "loss/reg": 0.0, + "step": 6810 + }, + { + "epoch": 0.04486842105263158, + "grad_norm": 3.953125, + "grad_norm_var": 0.17561848958333334, + "learning_rate": 0.0001, + "loss": 3.3193, + "loss/crossentropy": 2.145359480381012, + "loss/hidden": 3.1015625, + "loss/incoh": 0.0, + "loss/logits": 0.299596332013607, + "loss/reg": 0.0, + "step": 6820 + }, + { + "epoch": 0.04493421052631579, + "grad_norm": 2.328125, + "grad_norm_var": 0.20790913899739583, + "learning_rate": 0.0001, + "loss": 3.343, + "loss/crossentropy": 2.7453124046325685, + "loss/hidden": 3.0921875, + "loss/incoh": 0.0, + "loss/logits": 0.36230285465717316, + "loss/reg": 0.0, + "step": 6830 + }, + { + "epoch": 0.045, + "grad_norm": 2.6875, + "grad_norm_var": 2.236214192708333, + "learning_rate": 0.0001, + "loss": 3.4372, + "loss/crossentropy": 2.256898009777069, + "loss/hidden": 3.0546875, + "loss/incoh": 0.0, + "loss/logits": 0.2946140691637993, + "loss/reg": 0.0, + "step": 6840 + }, + { + "epoch": 0.04506578947368421, + "grad_norm": 2.375, + "grad_norm_var": 2.391950480143229, + "learning_rate": 0.0001, + "loss": 3.3701, + "loss/crossentropy": 2.6210601568222045, + "loss/hidden": 3.1015625, + "loss/incoh": 0.0, + "loss/logits": 0.3517232984304428, + "loss/reg": 0.0, + "step": 6850 + }, + { + "epoch": 0.04513157894736842, + "grad_norm": 2.4375, + "grad_norm_var": 1.015998331705729, + "learning_rate": 0.0001, + "loss": 3.3998, + "loss/crossentropy": 2.387173318862915, + "loss/hidden": 3.3296875, + "loss/incoh": 0.0, + "loss/logits": 0.3826398134231567, + "loss/reg": 0.0, + "step": 6860 + }, + { + "epoch": 0.04519736842105263, + "grad_norm": 2.84375, + "grad_norm_var": 0.44384663899739585, + "learning_rate": 0.0001, + "loss": 3.3858, + "loss/crossentropy": 2.4001118540763855, + "loss/hidden": 2.96875, + "loss/incoh": 0.0, + "loss/logits": 0.27424332648515704, + "loss/reg": 0.0, + "step": 6870 + }, + { + "epoch": 0.045263157894736845, + "grad_norm": 2.671875, + "grad_norm_var": 0.06037495930989583, + "learning_rate": 0.0001, + "loss": 3.3752, + "loss/crossentropy": 2.1074828147888183, + "loss/hidden": 3.0234375, + "loss/incoh": 0.0, + "loss/logits": 0.2777694225311279, + "loss/reg": 0.0, + "step": 6880 + }, + { + "epoch": 0.04532894736842105, + "grad_norm": 2.515625, + "grad_norm_var": 0.1735504150390625, + "learning_rate": 0.0001, + "loss": 3.3792, + "loss/crossentropy": 2.2711395502090452, + "loss/hidden": 3.1296875, + "loss/incoh": 0.0, + "loss/logits": 0.3001497104763985, + "loss/reg": 0.0, + "step": 6890 + }, + { + "epoch": 0.045394736842105265, + "grad_norm": 3.0, + "grad_norm_var": 1.8758951822916667, + "learning_rate": 0.0001, + "loss": 3.4311, + "loss/crossentropy": 2.2329100012779235, + "loss/hidden": 3.3421875, + "loss/incoh": 0.0, + "loss/logits": 0.3293987289071083, + "loss/reg": 0.0, + "step": 6900 + }, + { + "epoch": 0.04546052631578947, + "grad_norm": 2.703125, + "grad_norm_var": 1.793781534830729, + "learning_rate": 0.0001, + "loss": 3.2852, + "loss/crossentropy": 2.3576371729373933, + "loss/hidden": 2.940625, + "loss/incoh": 0.0, + "loss/logits": 0.27053930461406706, + "loss/reg": 0.0, + "step": 6910 + }, + { + "epoch": 0.045526315789473686, + "grad_norm": 2.484375, + "grad_norm_var": 1.989207967122396, + "learning_rate": 0.0001, + "loss": 3.4411, + "loss/crossentropy": 2.532450318336487, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.2638275146484375, + "loss/reg": 0.0, + "step": 6920 + }, + { + "epoch": 0.04559210526315789, + "grad_norm": 2.546875, + "grad_norm_var": 2.184137980143229, + "learning_rate": 0.0001, + "loss": 3.4069, + "loss/crossentropy": 2.4933431267738344, + "loss/hidden": 3.2640625, + "loss/incoh": 0.0, + "loss/logits": 0.32281421422958373, + "loss/reg": 0.0, + "step": 6930 + }, + { + "epoch": 0.045657894736842106, + "grad_norm": 2.421875, + "grad_norm_var": 0.05158589680989583, + "learning_rate": 0.0001, + "loss": 3.284, + "loss/crossentropy": 2.2752655148506165, + "loss/hidden": 2.9734375, + "loss/incoh": 0.0, + "loss/logits": 0.2645736649632454, + "loss/reg": 0.0, + "step": 6940 + }, + { + "epoch": 0.04572368421052632, + "grad_norm": 2.40625, + "grad_norm_var": 0.0625152587890625, + "learning_rate": 0.0001, + "loss": 3.3456, + "loss/crossentropy": 2.290147030353546, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.27036611288785933, + "loss/reg": 0.0, + "step": 6950 + }, + { + "epoch": 0.045789473684210526, + "grad_norm": 2.53125, + "grad_norm_var": 6.027144368489584, + "learning_rate": 0.0001, + "loss": 3.4116, + "loss/crossentropy": 2.359883761405945, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.29680062681436536, + "loss/reg": 0.0, + "step": 6960 + }, + { + "epoch": 0.04585526315789474, + "grad_norm": 2.984375, + "grad_norm_var": 0.5505208333333333, + "learning_rate": 0.0001, + "loss": 3.334, + "loss/crossentropy": 2.4557218074798586, + "loss/hidden": 3.0875, + "loss/incoh": 0.0, + "loss/logits": 0.2946802690625191, + "loss/reg": 0.0, + "step": 6970 + }, + { + "epoch": 0.045921052631578946, + "grad_norm": 2.5, + "grad_norm_var": 0.50865478515625, + "learning_rate": 0.0001, + "loss": 3.3166, + "loss/crossentropy": 2.2482771933078767, + "loss/hidden": 3.034375, + "loss/incoh": 0.0, + "loss/logits": 0.25328404903411866, + "loss/reg": 0.0, + "step": 6980 + }, + { + "epoch": 0.04598684210526316, + "grad_norm": 2.203125, + "grad_norm_var": 0.045685831705729166, + "learning_rate": 0.0001, + "loss": 3.4432, + "loss/crossentropy": 2.3823115646839144, + "loss/hidden": 2.9953125, + "loss/incoh": 0.0, + "loss/logits": 0.29391862004995345, + "loss/reg": 0.0, + "step": 6990 + }, + { + "epoch": 0.046052631578947366, + "grad_norm": 2.453125, + "grad_norm_var": 0.04589436848958333, + "learning_rate": 0.0001, + "loss": 3.4052, + "loss/crossentropy": 2.67444326877594, + "loss/hidden": 3.3140625, + "loss/incoh": 0.0, + "loss/logits": 0.336136220395565, + "loss/reg": 0.0, + "step": 7000 + }, + { + "epoch": 0.04611842105263158, + "grad_norm": 2.484375, + "grad_norm_var": 0.03218994140625, + "learning_rate": 0.0001, + "loss": 3.3293, + "loss/crossentropy": 2.4088356614112856, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.26314016729593276, + "loss/reg": 0.0, + "step": 7010 + }, + { + "epoch": 0.04618421052631579, + "grad_norm": 2.484375, + "grad_norm_var": 0.044266764322916666, + "learning_rate": 0.0001, + "loss": 3.3201, + "loss/crossentropy": 2.210337924957275, + "loss/hidden": 2.996875, + "loss/incoh": 0.0, + "loss/logits": 0.28177270889282224, + "loss/reg": 0.0, + "step": 7020 + }, + { + "epoch": 0.04625, + "grad_norm": 2.328125, + "grad_norm_var": 1.1310780843098958, + "learning_rate": 0.0001, + "loss": 3.4065, + "loss/crossentropy": 2.3901759028434753, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.3315755516290665, + "loss/reg": 0.0, + "step": 7030 + }, + { + "epoch": 0.04631578947368421, + "grad_norm": 2.71875, + "grad_norm_var": 0.27060139973958336, + "learning_rate": 0.0001, + "loss": 3.4377, + "loss/crossentropy": 2.3308543801307677, + "loss/hidden": 3.0828125, + "loss/incoh": 0.0, + "loss/logits": 0.27955446392297745, + "loss/reg": 0.0, + "step": 7040 + }, + { + "epoch": 0.04638157894736842, + "grad_norm": 3.953125, + "grad_norm_var": 0.25728759765625, + "learning_rate": 0.0001, + "loss": 3.3076, + "loss/crossentropy": 2.3288169384002684, + "loss/hidden": 2.909375, + "loss/incoh": 0.0, + "loss/logits": 0.25437864363193513, + "loss/reg": 0.0, + "step": 7050 + }, + { + "epoch": 0.04644736842105263, + "grad_norm": 3.046875, + "grad_norm_var": 0.20709228515625, + "learning_rate": 0.0001, + "loss": 3.439, + "loss/crossentropy": 2.033195120096207, + "loss/hidden": 3.0140625, + "loss/incoh": 0.0, + "loss/logits": 0.2639622241258621, + "loss/reg": 0.0, + "step": 7060 + }, + { + "epoch": 0.04651315789473684, + "grad_norm": 2.390625, + "grad_norm_var": 0.84127197265625, + "learning_rate": 0.0001, + "loss": 3.3119, + "loss/crossentropy": 2.366466200351715, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.2680037707090378, + "loss/reg": 0.0, + "step": 7070 + }, + { + "epoch": 0.04657894736842105, + "grad_norm": 11.9375, + "grad_norm_var": 5.57564697265625, + "learning_rate": 0.0001, + "loss": 3.4068, + "loss/crossentropy": 2.3484351873397826, + "loss/hidden": 3.3828125, + "loss/incoh": 0.0, + "loss/logits": 0.3572035223245621, + "loss/reg": 0.0, + "step": 7080 + }, + { + "epoch": 0.04664473684210527, + "grad_norm": 2.515625, + "grad_norm_var": 5.670992024739584, + "learning_rate": 0.0001, + "loss": 3.5356, + "loss/crossentropy": 2.491948664188385, + "loss/hidden": 3.1484375, + "loss/incoh": 0.0, + "loss/logits": 0.33623201847076417, + "loss/reg": 0.0, + "step": 7090 + }, + { + "epoch": 0.04671052631578947, + "grad_norm": 7.90625, + "grad_norm_var": 1.8282297770182292, + "learning_rate": 0.0001, + "loss": 3.4057, + "loss/crossentropy": 2.3702210783958435, + "loss/hidden": 3.203125, + "loss/incoh": 0.0, + "loss/logits": 0.3451476514339447, + "loss/reg": 0.0, + "step": 7100 + }, + { + "epoch": 0.04677631578947369, + "grad_norm": 2.5, + "grad_norm_var": 1.74947509765625, + "learning_rate": 0.0001, + "loss": 3.4173, + "loss/crossentropy": 2.372307813167572, + "loss/hidden": 3.103125, + "loss/incoh": 0.0, + "loss/logits": 0.30449149161577227, + "loss/reg": 0.0, + "step": 7110 + }, + { + "epoch": 0.04684210526315789, + "grad_norm": 2.484375, + "grad_norm_var": 0.1269683837890625, + "learning_rate": 0.0001, + "loss": 3.3266, + "loss/crossentropy": 2.4220902919769287, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.29380183219909667, + "loss/reg": 0.0, + "step": 7120 + }, + { + "epoch": 0.04690789473684211, + "grad_norm": 2.375, + "grad_norm_var": 5.4926503499348955, + "learning_rate": 0.0001, + "loss": 3.4299, + "loss/crossentropy": 2.333963227272034, + "loss/hidden": 3.0890625, + "loss/incoh": 0.0, + "loss/logits": 0.32264130711555483, + "loss/reg": 0.0, + "step": 7130 + }, + { + "epoch": 0.04697368421052631, + "grad_norm": 2.453125, + "grad_norm_var": 8.575126139322917, + "learning_rate": 0.0001, + "loss": 3.5129, + "loss/crossentropy": 2.447948896884918, + "loss/hidden": 3.0390625, + "loss/incoh": 0.0, + "loss/logits": 0.34722310602664946, + "loss/reg": 0.0, + "step": 7140 + }, + { + "epoch": 0.04703947368421053, + "grad_norm": 2.171875, + "grad_norm_var": 4.295670572916666, + "learning_rate": 0.0001, + "loss": 3.4954, + "loss/crossentropy": 2.4875372767448427, + "loss/hidden": 3.1765625, + "loss/incoh": 0.0, + "loss/logits": 0.38640123009681704, + "loss/reg": 0.0, + "step": 7150 + }, + { + "epoch": 0.04710526315789473, + "grad_norm": 2.53125, + "grad_norm_var": 0.09901936848958333, + "learning_rate": 0.0001, + "loss": 3.3592, + "loss/crossentropy": 2.4286764740943907, + "loss/hidden": 3.1515625, + "loss/incoh": 0.0, + "loss/logits": 0.3441846176981926, + "loss/reg": 0.0, + "step": 7160 + }, + { + "epoch": 0.04717105263157895, + "grad_norm": 2.46875, + "grad_norm_var": 0.09706624348958333, + "learning_rate": 0.0001, + "loss": 3.4219, + "loss/crossentropy": 2.197064208984375, + "loss/hidden": 3.215625, + "loss/incoh": 0.0, + "loss/logits": 0.30653059035539626, + "loss/reg": 0.0, + "step": 7170 + }, + { + "epoch": 0.04723684210526316, + "grad_norm": 2.421875, + "grad_norm_var": 0.07274983723958334, + "learning_rate": 0.0001, + "loss": 3.4754, + "loss/crossentropy": 2.592367339134216, + "loss/hidden": 3.2078125, + "loss/incoh": 0.0, + "loss/logits": 0.30859925150871276, + "loss/reg": 0.0, + "step": 7180 + }, + { + "epoch": 0.04730263157894737, + "grad_norm": 2.671875, + "grad_norm_var": 0.04854227701822917, + "learning_rate": 0.0001, + "loss": 3.335, + "loss/crossentropy": 2.4139750719070436, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.26624749004840853, + "loss/reg": 0.0, + "step": 7190 + }, + { + "epoch": 0.04736842105263158, + "grad_norm": 2.34375, + "grad_norm_var": 0.061579386393229164, + "learning_rate": 0.0001, + "loss": 3.4049, + "loss/crossentropy": 1.9312179803848266, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.23240152448415757, + "loss/reg": 0.0, + "step": 7200 + }, + { + "epoch": 0.04743421052631579, + "grad_norm": 3.140625, + "grad_norm_var": 2.7487790626051414e+17, + "learning_rate": 0.0001, + "loss": 3.5157, + "loss/crossentropy": 2.192197346687317, + "loss/hidden": 3.1421875, + "loss/incoh": 0.0, + "loss/logits": 0.28352061808109286, + "loss/reg": 0.0, + "step": 7210 + }, + { + "epoch": 0.0475, + "grad_norm": 2.828125, + "grad_norm_var": 2.7487790627662506e+17, + "learning_rate": 0.0001, + "loss": 3.3283, + "loss/crossentropy": 2.2706116318702696, + "loss/hidden": 3.0734375, + "loss/incoh": 0.0, + "loss/logits": 0.29702268838882445, + "loss/reg": 0.0, + "step": 7220 + }, + { + "epoch": 0.04756578947368421, + "grad_norm": 2.765625, + "grad_norm_var": 0.05640869140625, + "learning_rate": 0.0001, + "loss": 3.2627, + "loss/crossentropy": 2.354196774959564, + "loss/hidden": 3.0625, + "loss/incoh": 0.0, + "loss/logits": 0.2917496845126152, + "loss/reg": 0.0, + "step": 7230 + }, + { + "epoch": 0.04763157894736842, + "grad_norm": 2.4375, + "grad_norm_var": 0.048779296875, + "learning_rate": 0.0001, + "loss": 3.3302, + "loss/crossentropy": 2.4912230253219603, + "loss/hidden": 3.0265625, + "loss/incoh": 0.0, + "loss/logits": 0.3378173440694809, + "loss/reg": 0.0, + "step": 7240 + }, + { + "epoch": 0.047697368421052634, + "grad_norm": 2.359375, + "grad_norm_var": 0.0933990478515625, + "learning_rate": 0.0001, + "loss": 3.348, + "loss/crossentropy": 2.541353499889374, + "loss/hidden": 3.0359375, + "loss/incoh": 0.0, + "loss/logits": 0.3061401903629303, + "loss/reg": 0.0, + "step": 7250 + }, + { + "epoch": 0.04776315789473684, + "grad_norm": 4.15625, + "grad_norm_var": 0.45806884765625, + "learning_rate": 0.0001, + "loss": 3.5007, + "loss/crossentropy": 2.4307178616523744, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.3470552325248718, + "loss/reg": 0.0, + "step": 7260 + }, + { + "epoch": 0.047828947368421054, + "grad_norm": 2.421875, + "grad_norm_var": 0.4556955973307292, + "learning_rate": 0.0001, + "loss": 3.3409, + "loss/crossentropy": 2.257239353656769, + "loss/hidden": 3.0546875, + "loss/incoh": 0.0, + "loss/logits": 0.29585833847522736, + "loss/reg": 0.0, + "step": 7270 + }, + { + "epoch": 0.04789473684210526, + "grad_norm": 2.546875, + "grad_norm_var": 0.3855445861816406, + "learning_rate": 0.0001, + "loss": 3.2964, + "loss/crossentropy": 2.1451990723609926, + "loss/hidden": 2.9828125, + "loss/incoh": 0.0, + "loss/logits": 0.25767376720905305, + "loss/reg": 0.0, + "step": 7280 + }, + { + "epoch": 0.047960526315789474, + "grad_norm": 2.484375, + "grad_norm_var": 0.26484349568684895, + "learning_rate": 0.0001, + "loss": 3.3283, + "loss/crossentropy": 2.2842231035232543, + "loss/hidden": 3.1171875, + "loss/incoh": 0.0, + "loss/logits": 0.33771214783191683, + "loss/reg": 0.0, + "step": 7290 + }, + { + "epoch": 0.04802631578947368, + "grad_norm": 2.578125, + "grad_norm_var": 0.13931884765625, + "learning_rate": 0.0001, + "loss": 3.2818, + "loss/crossentropy": 2.1290991365909577, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.2695446103811264, + "loss/reg": 0.0, + "step": 7300 + }, + { + "epoch": 0.048092105263157894, + "grad_norm": 2.53125, + "grad_norm_var": 0.0911773681640625, + "learning_rate": 0.0001, + "loss": 3.3833, + "loss/crossentropy": 2.149968445301056, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.24086009413003923, + "loss/reg": 0.0, + "step": 7310 + }, + { + "epoch": 0.04815789473684211, + "grad_norm": 2.65625, + "grad_norm_var": 0.07766011555989584, + "learning_rate": 0.0001, + "loss": 3.3503, + "loss/crossentropy": 2.519470489025116, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.25514234602451324, + "loss/reg": 0.0, + "step": 7320 + }, + { + "epoch": 0.048223684210526314, + "grad_norm": 2.796875, + "grad_norm_var": 0.025614420572916668, + "learning_rate": 0.0001, + "loss": 3.3595, + "loss/crossentropy": 2.5252918124198915, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.281466007232666, + "loss/reg": 0.0, + "step": 7330 + }, + { + "epoch": 0.04828947368421053, + "grad_norm": 2.078125, + "grad_norm_var": 0.37810872395833334, + "learning_rate": 0.0001, + "loss": 3.3791, + "loss/crossentropy": 2.0419042229652407, + "loss/hidden": 3.0515625, + "loss/incoh": 0.0, + "loss/logits": 0.2731199100613594, + "loss/reg": 0.0, + "step": 7340 + }, + { + "epoch": 0.048355263157894735, + "grad_norm": 3.421875, + "grad_norm_var": 0.19280192057291667, + "learning_rate": 0.0001, + "loss": 3.3562, + "loss/crossentropy": 2.1462841510772703, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.23967409282922744, + "loss/reg": 0.0, + "step": 7350 + }, + { + "epoch": 0.04842105263157895, + "grad_norm": 2.75, + "grad_norm_var": 0.13990478515625, + "learning_rate": 0.0001, + "loss": 3.2953, + "loss/crossentropy": 2.4041597843170166, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.26162110567092894, + "loss/reg": 0.0, + "step": 7360 + }, + { + "epoch": 0.048486842105263155, + "grad_norm": 2.375, + "grad_norm_var": 0.0317047119140625, + "learning_rate": 0.0001, + "loss": 3.2731, + "loss/crossentropy": 2.61968252658844, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.28658540844917296, + "loss/reg": 0.0, + "step": 7370 + }, + { + "epoch": 0.04855263157894737, + "grad_norm": 2.34375, + "grad_norm_var": 0.0795562744140625, + "learning_rate": 0.0001, + "loss": 3.3537, + "loss/crossentropy": 2.330329430103302, + "loss/hidden": 3.0375, + "loss/incoh": 0.0, + "loss/logits": 0.311858968436718, + "loss/reg": 0.0, + "step": 7380 + }, + { + "epoch": 0.04861842105263158, + "grad_norm": 2.515625, + "grad_norm_var": 0.06910400390625, + "learning_rate": 0.0001, + "loss": 3.3762, + "loss/crossentropy": 2.331431567668915, + "loss/hidden": 3.14375, + "loss/incoh": 0.0, + "loss/logits": 0.3637159377336502, + "loss/reg": 0.0, + "step": 7390 + }, + { + "epoch": 0.04868421052631579, + "grad_norm": 2.703125, + "grad_norm_var": 0.07679036458333334, + "learning_rate": 0.0001, + "loss": 3.3474, + "loss/crossentropy": 2.334563136100769, + "loss/hidden": 3.1203125, + "loss/incoh": 0.0, + "loss/logits": 0.3102908283472061, + "loss/reg": 0.0, + "step": 7400 + }, + { + "epoch": 0.04875, + "grad_norm": 2.21875, + "grad_norm_var": 0.041727701822916664, + "learning_rate": 0.0001, + "loss": 3.2994, + "loss/crossentropy": 2.4575919032096865, + "loss/hidden": 3.1453125, + "loss/incoh": 0.0, + "loss/logits": 0.30418373495340345, + "loss/reg": 0.0, + "step": 7410 + }, + { + "epoch": 0.04881578947368421, + "grad_norm": 2.5625, + "grad_norm_var": 0.95484619140625, + "learning_rate": 0.0001, + "loss": 3.3136, + "loss/crossentropy": 2.2615838646888733, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.27002790868282317, + "loss/reg": 0.0, + "step": 7420 + }, + { + "epoch": 0.04888157894736842, + "grad_norm": 2.546875, + "grad_norm_var": 0.05439046223958333, + "learning_rate": 0.0001, + "loss": 3.3084, + "loss/crossentropy": 2.2472564220428466, + "loss/hidden": 2.9703125, + "loss/incoh": 0.0, + "loss/logits": 0.28446437120437623, + "loss/reg": 0.0, + "step": 7430 + }, + { + "epoch": 0.04894736842105263, + "grad_norm": 2.40625, + "grad_norm_var": 0.08689676920572917, + "learning_rate": 0.0001, + "loss": 3.3891, + "loss/crossentropy": 2.5821902751922607, + "loss/hidden": 3.21875, + "loss/incoh": 0.0, + "loss/logits": 0.29424687922000886, + "loss/reg": 0.0, + "step": 7440 + }, + { + "epoch": 0.04901315789473684, + "grad_norm": 2.703125, + "grad_norm_var": 0.09962565104166667, + "learning_rate": 0.0001, + "loss": 3.4691, + "loss/crossentropy": 2.547790551185608, + "loss/hidden": 3.1125, + "loss/incoh": 0.0, + "loss/logits": 0.3217499524354935, + "loss/reg": 0.0, + "step": 7450 + }, + { + "epoch": 0.049078947368421055, + "grad_norm": 3.03125, + "grad_norm_var": 0.18788655598958334, + "learning_rate": 0.0001, + "loss": 3.3014, + "loss/crossentropy": 2.025312936306, + "loss/hidden": 3.0359375, + "loss/incoh": 0.0, + "loss/logits": 0.29601537734270095, + "loss/reg": 0.0, + "step": 7460 + }, + { + "epoch": 0.04914473684210526, + "grad_norm": 2.359375, + "grad_norm_var": 0.15539957682291666, + "learning_rate": 0.0001, + "loss": 3.3516, + "loss/crossentropy": 2.1228564500808718, + "loss/hidden": 3.01875, + "loss/incoh": 0.0, + "loss/logits": 0.2723393976688385, + "loss/reg": 0.0, + "step": 7470 + }, + { + "epoch": 0.049210526315789475, + "grad_norm": 2.546875, + "grad_norm_var": 0.04488016764322917, + "learning_rate": 0.0001, + "loss": 3.4105, + "loss/crossentropy": 2.221798670291901, + "loss/hidden": 3.303125, + "loss/incoh": 0.0, + "loss/logits": 0.3143211781978607, + "loss/reg": 0.0, + "step": 7480 + }, + { + "epoch": 0.04927631578947368, + "grad_norm": 2.53125, + "grad_norm_var": 0.028938802083333333, + "learning_rate": 0.0001, + "loss": 3.4092, + "loss/crossentropy": 2.4991084337234497, + "loss/hidden": 3.24375, + "loss/incoh": 0.0, + "loss/logits": 0.3354015931487083, + "loss/reg": 0.0, + "step": 7490 + }, + { + "epoch": 0.049342105263157895, + "grad_norm": 2.359375, + "grad_norm_var": 0.04719136555989583, + "learning_rate": 0.0001, + "loss": 3.2643, + "loss/crossentropy": 2.3538936495780947, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.25752398669719695, + "loss/reg": 0.0, + "step": 7500 + }, + { + "epoch": 0.0494078947368421, + "grad_norm": 2.859375, + "grad_norm_var": 0.5662272135416667, + "learning_rate": 0.0001, + "loss": 3.3662, + "loss/crossentropy": 2.2441036820411684, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.26160079389810564, + "loss/reg": 0.0, + "step": 7510 + }, + { + "epoch": 0.049473684210526316, + "grad_norm": 2.28125, + "grad_norm_var": 0.5289703369140625, + "learning_rate": 0.0001, + "loss": 3.3149, + "loss/crossentropy": 2.544923734664917, + "loss/hidden": 3.015625, + "loss/incoh": 0.0, + "loss/logits": 0.2898894131183624, + "loss/reg": 0.0, + "step": 7520 + }, + { + "epoch": 0.04953947368421053, + "grad_norm": 2.34375, + "grad_norm_var": 0.0388092041015625, + "learning_rate": 0.0001, + "loss": 3.3611, + "loss/crossentropy": 2.5786613702774046, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.2799048855900764, + "loss/reg": 0.0, + "step": 7530 + }, + { + "epoch": 0.049605263157894736, + "grad_norm": 4.625, + "grad_norm_var": 0.3494293212890625, + "learning_rate": 0.0001, + "loss": 3.3299, + "loss/crossentropy": 2.6309832334518433, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.3181111514568329, + "loss/reg": 0.0, + "step": 7540 + }, + { + "epoch": 0.04967105263157895, + "grad_norm": 2.984375, + "grad_norm_var": 0.32229715983072915, + "learning_rate": 0.0001, + "loss": 3.381, + "loss/crossentropy": 2.395383334159851, + "loss/hidden": 3.0484375, + "loss/incoh": 0.0, + "loss/logits": 0.29645184725522994, + "loss/reg": 0.0, + "step": 7550 + }, + { + "epoch": 0.049736842105263156, + "grad_norm": 2.609375, + "grad_norm_var": 0.06101786295572917, + "learning_rate": 0.0001, + "loss": 3.3076, + "loss/crossentropy": 2.2212601780891417, + "loss/hidden": 3.0890625, + "loss/incoh": 0.0, + "loss/logits": 0.2814889296889305, + "loss/reg": 0.0, + "step": 7560 + }, + { + "epoch": 0.04980263157894737, + "grad_norm": 2.171875, + "grad_norm_var": 0.5580800374348959, + "learning_rate": 0.0001, + "loss": 3.3547, + "loss/crossentropy": 2.340949076414108, + "loss/hidden": 3.2046875, + "loss/incoh": 0.0, + "loss/logits": 0.3048363208770752, + "loss/reg": 0.0, + "step": 7570 + }, + { + "epoch": 0.049868421052631576, + "grad_norm": 2.140625, + "grad_norm_var": 0.5829498291015625, + "learning_rate": 0.0001, + "loss": 3.2873, + "loss/crossentropy": 2.434855592250824, + "loss/hidden": 3.1296875, + "loss/incoh": 0.0, + "loss/logits": 0.33333509862422944, + "loss/reg": 0.0, + "step": 7580 + }, + { + "epoch": 0.04993421052631579, + "grad_norm": 2.625, + "grad_norm_var": 0.07898661295572916, + "learning_rate": 0.0001, + "loss": 3.3901, + "loss/crossentropy": 2.2928370952606203, + "loss/hidden": 3.0484375, + "loss/incoh": 0.0, + "loss/logits": 0.30124022662639616, + "loss/reg": 0.0, + "step": 7590 + }, + { + "epoch": 0.05, + "grad_norm": 2.453125, + "grad_norm_var": 0.04219462076822917, + "learning_rate": 0.0001, + "loss": 3.3048, + "loss/crossentropy": 2.0018354773521425, + "loss/hidden": 3.0796875, + "loss/incoh": 0.0, + "loss/logits": 0.25127379447221754, + "loss/reg": 0.0, + "step": 7600 + }, + { + "epoch": 0.05006578947368421, + "grad_norm": 2.46875, + "grad_norm_var": 0.02652587890625, + "learning_rate": 0.0001, + "loss": 3.3285, + "loss/crossentropy": 2.2610169410705567, + "loss/hidden": 3.0640625, + "loss/incoh": 0.0, + "loss/logits": 0.27540155351161955, + "loss/reg": 0.0, + "step": 7610 + }, + { + "epoch": 0.05013157894736842, + "grad_norm": 2.421875, + "grad_norm_var": 0.0210357666015625, + "learning_rate": 0.0001, + "loss": 3.2714, + "loss/crossentropy": 2.2387811303138734, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.27260326892137526, + "loss/reg": 0.0, + "step": 7620 + }, + { + "epoch": 0.05019736842105263, + "grad_norm": 2.265625, + "grad_norm_var": 0.25825093587239584, + "learning_rate": 0.0001, + "loss": 3.4396, + "loss/crossentropy": 2.5094308257102966, + "loss/hidden": 3.146875, + "loss/incoh": 0.0, + "loss/logits": 0.3727137431502342, + "loss/reg": 0.0, + "step": 7630 + }, + { + "epoch": 0.05026315789473684, + "grad_norm": 3.203125, + "grad_norm_var": 0.42695210774739584, + "learning_rate": 0.0001, + "loss": 3.4951, + "loss/crossentropy": 2.181921923160553, + "loss/hidden": 3.10625, + "loss/incoh": 0.0, + "loss/logits": 0.29040979146957396, + "loss/reg": 0.0, + "step": 7640 + }, + { + "epoch": 0.05032894736842105, + "grad_norm": 3.5625, + "grad_norm_var": 0.47198893229166666, + "learning_rate": 0.0001, + "loss": 3.3254, + "loss/crossentropy": 2.4378631830215456, + "loss/hidden": 2.9984375, + "loss/incoh": 0.0, + "loss/logits": 0.32593746185302735, + "loss/reg": 0.0, + "step": 7650 + }, + { + "epoch": 0.05039473684210526, + "grad_norm": 2.234375, + "grad_norm_var": 0.12525126139322917, + "learning_rate": 0.0001, + "loss": 3.2625, + "loss/crossentropy": 2.318000388145447, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.267861607670784, + "loss/reg": 0.0, + "step": 7660 + }, + { + "epoch": 0.050460526315789477, + "grad_norm": 2.28125, + "grad_norm_var": 0.09370015462239584, + "learning_rate": 0.0001, + "loss": 3.4075, + "loss/crossentropy": 2.300287425518036, + "loss/hidden": 2.9609375, + "loss/incoh": 0.0, + "loss/logits": 0.272597573697567, + "loss/reg": 0.0, + "step": 7670 + }, + { + "epoch": 0.05052631578947368, + "grad_norm": 2.0625, + "grad_norm_var": 0.09342041015625, + "learning_rate": 0.0001, + "loss": 3.2909, + "loss/crossentropy": 2.5380281090736387, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.2639786213636398, + "loss/reg": 0.0, + "step": 7680 + }, + { + "epoch": 0.0505921052631579, + "grad_norm": 2.234375, + "grad_norm_var": 0.165625, + "learning_rate": 0.0001, + "loss": 3.2847, + "loss/crossentropy": 2.0959218978881835, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.2350485235452652, + "loss/reg": 0.0, + "step": 7690 + }, + { + "epoch": 0.0506578947368421, + "grad_norm": 2.6875, + "grad_norm_var": 0.17685546875, + "learning_rate": 0.0001, + "loss": 3.3983, + "loss/crossentropy": 2.323216736316681, + "loss/hidden": 3.3, + "loss/incoh": 0.0, + "loss/logits": 0.3249870762228966, + "loss/reg": 0.0, + "step": 7700 + }, + { + "epoch": 0.05072368421052632, + "grad_norm": 2.421875, + "grad_norm_var": 0.0990234375, + "learning_rate": 0.0001, + "loss": 3.3125, + "loss/crossentropy": 2.3656753659248353, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.3182197526097298, + "loss/reg": 0.0, + "step": 7710 + }, + { + "epoch": 0.05078947368421052, + "grad_norm": 2.8125, + "grad_norm_var": 0.19866129557291667, + "learning_rate": 0.0001, + "loss": 3.4009, + "loss/crossentropy": 2.5273406386375425, + "loss/hidden": 3.040625, + "loss/incoh": 0.0, + "loss/logits": 0.3028098613023758, + "loss/reg": 0.0, + "step": 7720 + }, + { + "epoch": 0.05085526315789474, + "grad_norm": 3.234375, + "grad_norm_var": 0.20156148274739583, + "learning_rate": 0.0001, + "loss": 3.4345, + "loss/crossentropy": 2.391481709480286, + "loss/hidden": 3.175, + "loss/incoh": 0.0, + "loss/logits": 0.3816041976213455, + "loss/reg": 0.0, + "step": 7730 + }, + { + "epoch": 0.05092105263157895, + "grad_norm": 2.984375, + "grad_norm_var": 0.08870035807291667, + "learning_rate": 0.0001, + "loss": 3.3338, + "loss/crossentropy": 2.4912365436553956, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.2999194458127022, + "loss/reg": 0.0, + "step": 7740 + }, + { + "epoch": 0.05098684210526316, + "grad_norm": 2.46875, + "grad_norm_var": 0.32692057291666665, + "learning_rate": 0.0001, + "loss": 3.3967, + "loss/crossentropy": 2.3413360595703123, + "loss/hidden": 3.225, + "loss/incoh": 0.0, + "loss/logits": 0.2852958709001541, + "loss/reg": 0.0, + "step": 7750 + }, + { + "epoch": 0.05105263157894737, + "grad_norm": 2.625, + "grad_norm_var": 0.32123921712239584, + "learning_rate": 0.0001, + "loss": 3.3178, + "loss/crossentropy": 2.6519938707351685, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.30396163165569307, + "loss/reg": 0.0, + "step": 7760 + }, + { + "epoch": 0.05111842105263158, + "grad_norm": 2.546875, + "grad_norm_var": 0.05095113118489583, + "learning_rate": 0.0001, + "loss": 3.3751, + "loss/crossentropy": 2.4069360971450804, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.2661174476146698, + "loss/reg": 0.0, + "step": 7770 + }, + { + "epoch": 0.05118421052631579, + "grad_norm": 2.6875, + "grad_norm_var": 0.12613016764322918, + "learning_rate": 0.0001, + "loss": 3.3737, + "loss/crossentropy": 2.3529985427856444, + "loss/hidden": 3.08125, + "loss/incoh": 0.0, + "loss/logits": 0.2910769283771515, + "loss/reg": 0.0, + "step": 7780 + }, + { + "epoch": 0.05125, + "grad_norm": 2.25, + "grad_norm_var": 0.03486226399739583, + "learning_rate": 0.0001, + "loss": 3.3208, + "loss/crossentropy": 2.1253769397735596, + "loss/hidden": 2.9390625, + "loss/incoh": 0.0, + "loss/logits": 0.2470591977238655, + "loss/reg": 0.0, + "step": 7790 + }, + { + "epoch": 0.05131578947368421, + "grad_norm": 2.96875, + "grad_norm_var": 0.5931925455729167, + "learning_rate": 0.0001, + "loss": 3.4704, + "loss/crossentropy": 2.2813072860240937, + "loss/hidden": 3.3578125, + "loss/incoh": 0.0, + "loss/logits": 0.3708792179822922, + "loss/reg": 0.0, + "step": 7800 + }, + { + "epoch": 0.051381578947368424, + "grad_norm": 2.15625, + "grad_norm_var": 0.6090159098307292, + "learning_rate": 0.0001, + "loss": 3.3966, + "loss/crossentropy": 2.071128582954407, + "loss/hidden": 3.2015625, + "loss/incoh": 0.0, + "loss/logits": 0.332095867395401, + "loss/reg": 0.0, + "step": 7810 + }, + { + "epoch": 0.05144736842105263, + "grad_norm": 2.671875, + "grad_norm_var": 0.19628499348958334, + "learning_rate": 0.0001, + "loss": 3.3989, + "loss/crossentropy": 2.368660008907318, + "loss/hidden": 3.15625, + "loss/incoh": 0.0, + "loss/logits": 0.28022406101226804, + "loss/reg": 0.0, + "step": 7820 + }, + { + "epoch": 0.051513157894736844, + "grad_norm": 2.40625, + "grad_norm_var": 2.9096995035807294, + "learning_rate": 0.0001, + "loss": 3.3211, + "loss/crossentropy": 2.1510692477226256, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.2576268881559372, + "loss/reg": 0.0, + "step": 7830 + }, + { + "epoch": 0.05157894736842105, + "grad_norm": 2.390625, + "grad_norm_var": 2.7149729410807293, + "learning_rate": 0.0001, + "loss": 3.4215, + "loss/crossentropy": 2.4896273493766783, + "loss/hidden": 3.4703125, + "loss/incoh": 0.0, + "loss/logits": 0.3669875577092171, + "loss/reg": 0.0, + "step": 7840 + }, + { + "epoch": 0.051644736842105264, + "grad_norm": 6.125, + "grad_norm_var": 0.9022450764973958, + "learning_rate": 0.0001, + "loss": 3.4161, + "loss/crossentropy": 2.0616636157035826, + "loss/hidden": 3.053125, + "loss/incoh": 0.0, + "loss/logits": 0.2621200427412987, + "loss/reg": 0.0, + "step": 7850 + }, + { + "epoch": 0.05171052631578947, + "grad_norm": 2.421875, + "grad_norm_var": 1.457957967122396, + "learning_rate": 0.0001, + "loss": 3.3632, + "loss/crossentropy": 2.236839824914932, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.28002799302339554, + "loss/reg": 0.0, + "step": 7860 + }, + { + "epoch": 0.051776315789473684, + "grad_norm": 2.828125, + "grad_norm_var": 7.502855428059896, + "learning_rate": 0.0001, + "loss": 3.3749, + "loss/crossentropy": 2.4405242681503294, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.2591508060693741, + "loss/reg": 0.0, + "step": 7870 + }, + { + "epoch": 0.0518421052631579, + "grad_norm": 2.34375, + "grad_norm_var": 3.8793904622395834, + "learning_rate": 0.0001, + "loss": 3.3791, + "loss/crossentropy": 2.5814385414123535, + "loss/hidden": 2.953125, + "loss/incoh": 0.0, + "loss/logits": 0.31668367683887483, + "loss/reg": 0.0, + "step": 7880 + }, + { + "epoch": 0.051907894736842104, + "grad_norm": 2.46875, + "grad_norm_var": 3.8120432535807294, + "learning_rate": 0.0001, + "loss": 3.4478, + "loss/crossentropy": 2.185117280483246, + "loss/hidden": 3.234375, + "loss/incoh": 0.0, + "loss/logits": 0.34759806394577025, + "loss/reg": 0.0, + "step": 7890 + }, + { + "epoch": 0.05197368421052632, + "grad_norm": 2.609375, + "grad_norm_var": 0.09302469889322916, + "learning_rate": 0.0001, + "loss": 3.3764, + "loss/crossentropy": 2.249006187915802, + "loss/hidden": 3.253125, + "loss/incoh": 0.0, + "loss/logits": 0.28100676983594897, + "loss/reg": 0.0, + "step": 7900 + }, + { + "epoch": 0.052039473684210524, + "grad_norm": 2.25, + "grad_norm_var": 0.08615620930989583, + "learning_rate": 0.0001, + "loss": 3.261, + "loss/crossentropy": 2.297496807575226, + "loss/hidden": 3.0953125, + "loss/incoh": 0.0, + "loss/logits": 0.30758740454912187, + "loss/reg": 0.0, + "step": 7910 + }, + { + "epoch": 0.05210526315789474, + "grad_norm": 3.46875, + "grad_norm_var": 0.08404947916666666, + "learning_rate": 0.0001, + "loss": 3.3132, + "loss/crossentropy": 2.411357748508453, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.29194266349077225, + "loss/reg": 0.0, + "step": 7920 + }, + { + "epoch": 0.052171052631578944, + "grad_norm": 2.90625, + "grad_norm_var": 0.27255859375, + "learning_rate": 0.0001, + "loss": 3.3767, + "loss/crossentropy": 2.6010719895362855, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.28802800476551055, + "loss/reg": 0.0, + "step": 7930 + }, + { + "epoch": 0.05223684210526316, + "grad_norm": 2.296875, + "grad_norm_var": 0.0465484619140625, + "learning_rate": 0.0001, + "loss": 3.3193, + "loss/crossentropy": 2.6153851985931396, + "loss/hidden": 3.128125, + "loss/incoh": 0.0, + "loss/logits": 0.2830176457762718, + "loss/reg": 0.0, + "step": 7940 + }, + { + "epoch": 0.05230263157894737, + "grad_norm": 2.609375, + "grad_norm_var": 0.0275054931640625, + "learning_rate": 0.0001, + "loss": 3.2535, + "loss/crossentropy": 2.2821019262075426, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.28540263772010804, + "loss/reg": 0.0, + "step": 7950 + }, + { + "epoch": 0.05236842105263158, + "grad_norm": 2.125, + "grad_norm_var": 0.16284077962239582, + "learning_rate": 0.0001, + "loss": 3.2614, + "loss/crossentropy": 2.1502284169197083, + "loss/hidden": 3.134375, + "loss/incoh": 0.0, + "loss/logits": 0.305269892513752, + "loss/reg": 0.0, + "step": 7960 + }, + { + "epoch": 0.05243421052631579, + "grad_norm": 2.453125, + "grad_norm_var": 0.1658843994140625, + "learning_rate": 0.0001, + "loss": 3.3904, + "loss/crossentropy": 2.4389883518218993, + "loss/hidden": 3.1078125, + "loss/incoh": 0.0, + "loss/logits": 0.3048334762454033, + "loss/reg": 0.0, + "step": 7970 + }, + { + "epoch": 0.0525, + "grad_norm": 2.765625, + "grad_norm_var": 0.07778218587239584, + "learning_rate": 0.0001, + "loss": 3.4152, + "loss/crossentropy": 2.3947508692741395, + "loss/hidden": 3.15625, + "loss/incoh": 0.0, + "loss/logits": 0.3298331335186958, + "loss/reg": 0.0, + "step": 7980 + }, + { + "epoch": 0.05256578947368421, + "grad_norm": 2.78125, + "grad_norm_var": 0.08662109375, + "learning_rate": 0.0001, + "loss": 3.3709, + "loss/crossentropy": 2.2743655920028685, + "loss/hidden": 3.0734375, + "loss/incoh": 0.0, + "loss/logits": 0.3255280390381813, + "loss/reg": 0.0, + "step": 7990 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 2.640625, + "grad_norm_var": 0.036881510416666666, + "learning_rate": 0.0001, + "loss": 3.251, + "loss/crossentropy": 2.4580028295516967, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.29261877238750456, + "loss/reg": 0.0, + "step": 8000 + }, + { + "epoch": 0.05269736842105263, + "grad_norm": 3.28125, + "grad_norm_var": 0.23829752604166668, + "learning_rate": 0.0001, + "loss": 3.3925, + "loss/crossentropy": 2.569744038581848, + "loss/hidden": 3.3046875, + "loss/incoh": 0.0, + "loss/logits": 0.4232485115528107, + "loss/reg": 0.0, + "step": 8010 + }, + { + "epoch": 0.052763157894736845, + "grad_norm": 2.5625, + "grad_norm_var": 0.09871317545572916, + "learning_rate": 0.0001, + "loss": 3.33, + "loss/crossentropy": 2.381676936149597, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.2768414840102196, + "loss/reg": 0.0, + "step": 8020 + }, + { + "epoch": 0.05282894736842105, + "grad_norm": 2.625, + "grad_norm_var": 0.07251688639322916, + "learning_rate": 0.0001, + "loss": 3.3659, + "loss/crossentropy": 2.529127871990204, + "loss/hidden": 3.046875, + "loss/incoh": 0.0, + "loss/logits": 0.3230110973119736, + "loss/reg": 0.0, + "step": 8030 + }, + { + "epoch": 0.052894736842105265, + "grad_norm": 2.375, + "grad_norm_var": 0.03585611979166667, + "learning_rate": 0.0001, + "loss": 3.241, + "loss/crossentropy": 2.4573261976242065, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.26054717898368834, + "loss/reg": 0.0, + "step": 8040 + }, + { + "epoch": 0.05296052631578947, + "grad_norm": 2.4375, + "grad_norm_var": 0.028319295247395834, + "learning_rate": 0.0001, + "loss": 3.3654, + "loss/crossentropy": 2.3388527154922487, + "loss/hidden": 3.03125, + "loss/incoh": 0.0, + "loss/logits": 0.3071700781583786, + "loss/reg": 0.0, + "step": 8050 + }, + { + "epoch": 0.053026315789473685, + "grad_norm": 3.015625, + "grad_norm_var": 0.0393218994140625, + "learning_rate": 0.0001, + "loss": 3.3602, + "loss/crossentropy": 2.4984395027160646, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.2504978612065315, + "loss/reg": 0.0, + "step": 8060 + }, + { + "epoch": 0.05309210526315789, + "grad_norm": 2.390625, + "grad_norm_var": 2.4539998372395835, + "learning_rate": 0.0001, + "loss": 3.4446, + "loss/crossentropy": 2.3318467855453493, + "loss/hidden": 3.25625, + "loss/incoh": 0.0, + "loss/logits": 0.38549562990665437, + "loss/reg": 0.0, + "step": 8070 + }, + { + "epoch": 0.053157894736842105, + "grad_norm": 2.921875, + "grad_norm_var": 2.5296946207682294, + "learning_rate": 0.0001, + "loss": 3.3183, + "loss/crossentropy": 2.3632636427879334, + "loss/hidden": 2.98125, + "loss/incoh": 0.0, + "loss/logits": 0.28499974459409716, + "loss/reg": 0.0, + "step": 8080 + }, + { + "epoch": 0.05322368421052632, + "grad_norm": 2.515625, + "grad_norm_var": 0.2263824462890625, + "learning_rate": 0.0001, + "loss": 3.3982, + "loss/crossentropy": 2.5644856214523317, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.32271777242422106, + "loss/reg": 0.0, + "step": 8090 + }, + { + "epoch": 0.053289473684210525, + "grad_norm": 2.109375, + "grad_norm_var": 0.20636393229166666, + "learning_rate": 0.0001, + "loss": 3.3351, + "loss/crossentropy": 2.195914793014526, + "loss/hidden": 3.2546875, + "loss/incoh": 0.0, + "loss/logits": 0.32049285918474196, + "loss/reg": 0.0, + "step": 8100 + }, + { + "epoch": 0.05335526315789474, + "grad_norm": 2.328125, + "grad_norm_var": 0.03369852701822917, + "learning_rate": 0.0001, + "loss": 3.3031, + "loss/crossentropy": 2.6255326747894285, + "loss/hidden": 2.940625, + "loss/incoh": 0.0, + "loss/logits": 0.27993575036525725, + "loss/reg": 0.0, + "step": 8110 + }, + { + "epoch": 0.053421052631578946, + "grad_norm": 2.484375, + "grad_norm_var": 0.07769775390625, + "learning_rate": 0.0001, + "loss": 3.3496, + "loss/crossentropy": 2.430241084098816, + "loss/hidden": 3.0859375, + "loss/incoh": 0.0, + "loss/logits": 0.31150197684764863, + "loss/reg": 0.0, + "step": 8120 + }, + { + "epoch": 0.05348684210526316, + "grad_norm": 2.3125, + "grad_norm_var": 0.0697174072265625, + "learning_rate": 0.0001, + "loss": 3.3655, + "loss/crossentropy": 2.4684382557868956, + "loss/hidden": 2.9921875, + "loss/incoh": 0.0, + "loss/logits": 0.2934414252638817, + "loss/reg": 0.0, + "step": 8130 + }, + { + "epoch": 0.053552631578947366, + "grad_norm": 2.921875, + "grad_norm_var": 0.09716389973958334, + "learning_rate": 0.0001, + "loss": 3.3756, + "loss/crossentropy": 2.679113733768463, + "loss/hidden": 3.3, + "loss/incoh": 0.0, + "loss/logits": 0.35277644991874696, + "loss/reg": 0.0, + "step": 8140 + }, + { + "epoch": 0.05361842105263158, + "grad_norm": 4.625, + "grad_norm_var": 0.3069986979166667, + "learning_rate": 0.0001, + "loss": 3.3979, + "loss/crossentropy": 2.3737685680389404, + "loss/hidden": 3.3359375, + "loss/incoh": 0.0, + "loss/logits": 0.4402651429176331, + "loss/reg": 0.0, + "step": 8150 + }, + { + "epoch": 0.05368421052631579, + "grad_norm": 2.53125, + "grad_norm_var": 0.4447255452473958, + "learning_rate": 0.0001, + "loss": 3.3395, + "loss/crossentropy": 2.556090760231018, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.2898376002907753, + "loss/reg": 0.0, + "step": 8160 + }, + { + "epoch": 0.05375, + "grad_norm": 2.484375, + "grad_norm_var": 0.4153717041015625, + "learning_rate": 0.0001, + "loss": 3.4755, + "loss/crossentropy": 2.3199184775352477, + "loss/hidden": 3.2375, + "loss/incoh": 0.0, + "loss/logits": 0.3026577115058899, + "loss/reg": 0.0, + "step": 8170 + }, + { + "epoch": 0.05381578947368421, + "grad_norm": 2.765625, + "grad_norm_var": 0.11787821451822916, + "learning_rate": 0.0001, + "loss": 3.3616, + "loss/crossentropy": 2.566536474227905, + "loss/hidden": 2.953125, + "loss/incoh": 0.0, + "loss/logits": 0.3062845066189766, + "loss/reg": 0.0, + "step": 8180 + }, + { + "epoch": 0.05388157894736842, + "grad_norm": 2.25, + "grad_norm_var": 0.0469146728515625, + "learning_rate": 0.0001, + "loss": 3.3019, + "loss/crossentropy": 2.434731423854828, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.27008683085441587, + "loss/reg": 0.0, + "step": 8190 + }, + { + "epoch": 0.05394736842105263, + "grad_norm": 2.375, + "grad_norm_var": 0.2584299723307292, + "learning_rate": 0.0001, + "loss": 3.359, + "loss/crossentropy": 2.2641067147254943, + "loss/hidden": 3.0171875, + "loss/incoh": 0.0, + "loss/logits": 0.27225579768419267, + "loss/reg": 0.0, + "step": 8200 + }, + { + "epoch": 0.05401315789473684, + "grad_norm": 2.25, + "grad_norm_var": 0.27990620930989585, + "learning_rate": 0.0001, + "loss": 3.3185, + "loss/crossentropy": 2.410356640815735, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.24587155133485794, + "loss/reg": 0.0, + "step": 8210 + }, + { + "epoch": 0.05407894736842105, + "grad_norm": 2.484375, + "grad_norm_var": 0.03493550618489583, + "learning_rate": 0.0001, + "loss": 3.407, + "loss/crossentropy": 2.3557684421539307, + "loss/hidden": 2.9828125, + "loss/incoh": 0.0, + "loss/logits": 0.27189340591430666, + "loss/reg": 0.0, + "step": 8220 + }, + { + "epoch": 0.054144736842105266, + "grad_norm": 2.578125, + "grad_norm_var": 0.031615193684895834, + "learning_rate": 0.0001, + "loss": 3.3054, + "loss/crossentropy": 2.453370213508606, + "loss/hidden": 3.0375, + "loss/incoh": 0.0, + "loss/logits": 0.3121939614415169, + "loss/reg": 0.0, + "step": 8230 + }, + { + "epoch": 0.05421052631578947, + "grad_norm": 2.4375, + "grad_norm_var": 0.06060791015625, + "learning_rate": 0.0001, + "loss": 3.3556, + "loss/crossentropy": 2.3469805240631105, + "loss/hidden": 3.115625, + "loss/incoh": 0.0, + "loss/logits": 0.35667684078216555, + "loss/reg": 0.0, + "step": 8240 + }, + { + "epoch": 0.054276315789473686, + "grad_norm": 2.484375, + "grad_norm_var": 0.10061442057291667, + "learning_rate": 0.0001, + "loss": 3.3466, + "loss/crossentropy": 2.5584194660186768, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.31178881525993346, + "loss/reg": 0.0, + "step": 8250 + }, + { + "epoch": 0.05434210526315789, + "grad_norm": 2.296875, + "grad_norm_var": 0.06204020182291667, + "learning_rate": 0.0001, + "loss": 3.291, + "loss/crossentropy": 2.372837942838669, + "loss/hidden": 3.071875, + "loss/incoh": 0.0, + "loss/logits": 0.27887275665998457, + "loss/reg": 0.0, + "step": 8260 + }, + { + "epoch": 0.054407894736842106, + "grad_norm": 2.3125, + "grad_norm_var": 0.05986328125, + "learning_rate": 0.0001, + "loss": 3.3687, + "loss/crossentropy": 2.2389731287956236, + "loss/hidden": 3.0265625, + "loss/incoh": 0.0, + "loss/logits": 0.29073659181594846, + "loss/reg": 0.0, + "step": 8270 + }, + { + "epoch": 0.05447368421052631, + "grad_norm": 2.34375, + "grad_norm_var": 0.0772369384765625, + "learning_rate": 0.0001, + "loss": 3.3364, + "loss/crossentropy": 2.130947244167328, + "loss/hidden": 3.140625, + "loss/incoh": 0.0, + "loss/logits": 0.3286518737673759, + "loss/reg": 0.0, + "step": 8280 + }, + { + "epoch": 0.05453947368421053, + "grad_norm": 2.796875, + "grad_norm_var": 0.3402740478515625, + "learning_rate": 0.0001, + "loss": 3.4311, + "loss/crossentropy": 2.5036970019340514, + "loss/hidden": 3.0765625, + "loss/incoh": 0.0, + "loss/logits": 0.3155761957168579, + "loss/reg": 0.0, + "step": 8290 + }, + { + "epoch": 0.05460526315789474, + "grad_norm": 2.46875, + "grad_norm_var": 0.07206624348958333, + "learning_rate": 0.0001, + "loss": 3.3068, + "loss/crossentropy": 2.15233553647995, + "loss/hidden": 3.0859375, + "loss/incoh": 0.0, + "loss/logits": 0.2912480518221855, + "loss/reg": 0.0, + "step": 8300 + }, + { + "epoch": 0.05467105263157895, + "grad_norm": 2.765625, + "grad_norm_var": 0.043290201822916666, + "learning_rate": 0.0001, + "loss": 3.3888, + "loss/crossentropy": 2.3679205000400545, + "loss/hidden": 3.2984375, + "loss/incoh": 0.0, + "loss/logits": 0.33265506476163864, + "loss/reg": 0.0, + "step": 8310 + }, + { + "epoch": 0.05473684210526316, + "grad_norm": 4.78125, + "grad_norm_var": 0.4022745768229167, + "learning_rate": 0.0001, + "loss": 3.4588, + "loss/crossentropy": 2.3284069895744324, + "loss/hidden": 3.15625, + "loss/incoh": 0.0, + "loss/logits": 0.28553168624639513, + "loss/reg": 0.0, + "step": 8320 + }, + { + "epoch": 0.05480263157894737, + "grad_norm": 2.78125, + "grad_norm_var": 0.5325480143229167, + "learning_rate": 0.0001, + "loss": 3.3491, + "loss/crossentropy": 2.602140688896179, + "loss/hidden": 2.9609375, + "loss/incoh": 0.0, + "loss/logits": 0.30461025387048724, + "loss/reg": 0.0, + "step": 8330 + }, + { + "epoch": 0.05486842105263158, + "grad_norm": 2.375, + "grad_norm_var": 0.21806233723958332, + "learning_rate": 0.0001, + "loss": 3.2902, + "loss/crossentropy": 2.3135936856269836, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.24052460640668868, + "loss/reg": 0.0, + "step": 8340 + }, + { + "epoch": 0.05493421052631579, + "grad_norm": 2.859375, + "grad_norm_var": 0.10383707682291667, + "learning_rate": 0.0001, + "loss": 3.4435, + "loss/crossentropy": 2.133499014377594, + "loss/hidden": 3.14375, + "loss/incoh": 0.0, + "loss/logits": 0.3066937685012817, + "loss/reg": 0.0, + "step": 8350 + }, + { + "epoch": 0.055, + "grad_norm": 3.15625, + "grad_norm_var": 1.1286936442057292, + "learning_rate": 0.0001, + "loss": 3.3443, + "loss/crossentropy": 2.5188000440597533, + "loss/hidden": 3.340625, + "loss/incoh": 0.0, + "loss/logits": 0.3301475077867508, + "loss/reg": 0.0, + "step": 8360 + }, + { + "epoch": 0.055065789473684214, + "grad_norm": 2.4375, + "grad_norm_var": 1.1146230061848958, + "learning_rate": 0.0001, + "loss": 3.2979, + "loss/crossentropy": 2.5192033290863036, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.29569360315799714, + "loss/reg": 0.0, + "step": 8370 + }, + { + "epoch": 0.05513157894736842, + "grad_norm": 4.28125, + "grad_norm_var": 0.2727691650390625, + "learning_rate": 0.0001, + "loss": 3.312, + "loss/crossentropy": 2.5402653098106383, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.2737806707620621, + "loss/reg": 0.0, + "step": 8380 + }, + { + "epoch": 0.055197368421052634, + "grad_norm": 2.234375, + "grad_norm_var": 0.9796702067057291, + "learning_rate": 0.0001, + "loss": 3.3479, + "loss/crossentropy": 2.264913785457611, + "loss/hidden": 3.0453125, + "loss/incoh": 0.0, + "loss/logits": 0.28024169653654096, + "loss/reg": 0.0, + "step": 8390 + }, + { + "epoch": 0.05526315789473684, + "grad_norm": 2.3125, + "grad_norm_var": 0.14442952473958334, + "learning_rate": 0.0001, + "loss": 3.2227, + "loss/crossentropy": 2.3250611424446106, + "loss/hidden": 2.96875, + "loss/incoh": 0.0, + "loss/logits": 0.29773685038089753, + "loss/reg": 0.0, + "step": 8400 + }, + { + "epoch": 0.055328947368421054, + "grad_norm": 3.453125, + "grad_norm_var": 0.1592681884765625, + "learning_rate": 0.0001, + "loss": 3.3882, + "loss/crossentropy": 2.73007869720459, + "loss/hidden": 3.3, + "loss/incoh": 0.0, + "loss/logits": 0.361674590408802, + "loss/reg": 0.0, + "step": 8410 + }, + { + "epoch": 0.05539473684210526, + "grad_norm": 2.5, + "grad_norm_var": 0.12535807291666667, + "learning_rate": 0.0001, + "loss": 3.3358, + "loss/crossentropy": 2.533698391914368, + "loss/hidden": 3.0359375, + "loss/incoh": 0.0, + "loss/logits": 0.32232231795787813, + "loss/reg": 0.0, + "step": 8420 + }, + { + "epoch": 0.055460526315789474, + "grad_norm": 2.640625, + "grad_norm_var": 413.9550415039063, + "learning_rate": 0.0001, + "loss": 3.4177, + "loss/crossentropy": 2.4008097648620605, + "loss/hidden": 3.0328125, + "loss/incoh": 0.0, + "loss/logits": 0.31593555510044097, + "loss/reg": 0.0, + "step": 8430 + }, + { + "epoch": 0.05552631578947369, + "grad_norm": 3.234375, + "grad_norm_var": 413.1215077718099, + "learning_rate": 0.0001, + "loss": 3.3581, + "loss/crossentropy": 2.4226332664489747, + "loss/hidden": 3.1125, + "loss/incoh": 0.0, + "loss/logits": 0.25367428809404374, + "loss/reg": 0.0, + "step": 8440 + }, + { + "epoch": 0.055592105263157894, + "grad_norm": 2.484375, + "grad_norm_var": 0.14670817057291666, + "learning_rate": 0.0001, + "loss": 3.3078, + "loss/crossentropy": 2.630770039558411, + "loss/hidden": 2.996875, + "loss/incoh": 0.0, + "loss/logits": 0.3664619579911232, + "loss/reg": 0.0, + "step": 8450 + }, + { + "epoch": 0.05565789473684211, + "grad_norm": 6.75, + "grad_norm_var": 2.27388916015625, + "learning_rate": 0.0001, + "loss": 3.503, + "loss/crossentropy": 2.376649534702301, + "loss/hidden": 3.1640625, + "loss/incoh": 0.0, + "loss/logits": 0.3476260006427765, + "loss/reg": 0.0, + "step": 8460 + }, + { + "epoch": 0.055723684210526314, + "grad_norm": 2.640625, + "grad_norm_var": 2.541617838541667, + "learning_rate": 0.0001, + "loss": 3.4543, + "loss/crossentropy": 2.638149607181549, + "loss/hidden": 3.34375, + "loss/incoh": 0.0, + "loss/logits": 0.34109789580106736, + "loss/reg": 0.0, + "step": 8470 + }, + { + "epoch": 0.05578947368421053, + "grad_norm": 2.6875, + "grad_norm_var": 0.4665679931640625, + "learning_rate": 0.0001, + "loss": 3.5013, + "loss/crossentropy": 2.40644109249115, + "loss/hidden": 3.1515625, + "loss/incoh": 0.0, + "loss/logits": 0.3092473894357681, + "loss/reg": 0.0, + "step": 8480 + }, + { + "epoch": 0.055855263157894734, + "grad_norm": 2.8125, + "grad_norm_var": 1.1175282796223958, + "learning_rate": 0.0001, + "loss": 3.293, + "loss/crossentropy": 2.159450513124466, + "loss/hidden": 3.0125, + "loss/incoh": 0.0, + "loss/logits": 0.27864499390125275, + "loss/reg": 0.0, + "step": 8490 + }, + { + "epoch": 0.05592105263157895, + "grad_norm": 4.90625, + "grad_norm_var": 0.49722900390625, + "learning_rate": 0.0001, + "loss": 3.3045, + "loss/crossentropy": 2.355340528488159, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.2977081388235092, + "loss/reg": 0.0, + "step": 8500 + }, + { + "epoch": 0.05598684210526316, + "grad_norm": 2.46875, + "grad_norm_var": 0.4117502848307292, + "learning_rate": 0.0001, + "loss": 3.3273, + "loss/crossentropy": 2.284228873252869, + "loss/hidden": 3.046875, + "loss/incoh": 0.0, + "loss/logits": 0.27203311026096344, + "loss/reg": 0.0, + "step": 8510 + }, + { + "epoch": 0.05605263157894737, + "grad_norm": 2.28125, + "grad_norm_var": 0.15953776041666667, + "learning_rate": 0.0001, + "loss": 3.3038, + "loss/crossentropy": 2.6644370317459107, + "loss/hidden": 3.03125, + "loss/incoh": 0.0, + "loss/logits": 0.2977398321032524, + "loss/reg": 0.0, + "step": 8520 + }, + { + "epoch": 0.05611842105263158, + "grad_norm": 2.65625, + "grad_norm_var": 0.07891337076822917, + "learning_rate": 0.0001, + "loss": 3.3066, + "loss/crossentropy": 2.480556678771973, + "loss/hidden": 3.0, + "loss/incoh": 0.0, + "loss/logits": 0.3119770348072052, + "loss/reg": 0.0, + "step": 8530 + }, + { + "epoch": 0.05618421052631579, + "grad_norm": 2.53125, + "grad_norm_var": 0.2862630208333333, + "learning_rate": 0.0001, + "loss": 3.3849, + "loss/crossentropy": 2.3611693739891053, + "loss/hidden": 2.909375, + "loss/incoh": 0.0, + "loss/logits": 0.27296979874372485, + "loss/reg": 0.0, + "step": 8540 + }, + { + "epoch": 0.05625, + "grad_norm": 2.453125, + "grad_norm_var": 0.41441141764322914, + "learning_rate": 0.0001, + "loss": 3.3496, + "loss/crossentropy": 2.3446286380290986, + "loss/hidden": 3.0125, + "loss/incoh": 0.0, + "loss/logits": 0.28327286094427107, + "loss/reg": 0.0, + "step": 8550 + }, + { + "epoch": 0.05631578947368421, + "grad_norm": 2.625, + "grad_norm_var": 6.07724609375, + "learning_rate": 0.0001, + "loss": 3.4711, + "loss/crossentropy": 1.7737745344638824, + "loss/hidden": 3.171875, + "loss/incoh": 0.0, + "loss/logits": 0.27109832018613816, + "loss/reg": 0.0, + "step": 8560 + }, + { + "epoch": 0.05638157894736842, + "grad_norm": 2.875, + "grad_norm_var": 5.751741536458334, + "learning_rate": 0.0001, + "loss": 3.3501, + "loss/crossentropy": 2.3749794125556947, + "loss/hidden": 3.0578125, + "loss/incoh": 0.0, + "loss/logits": 0.2803412050008774, + "loss/reg": 0.0, + "step": 8570 + }, + { + "epoch": 0.056447368421052635, + "grad_norm": 2.1875, + "grad_norm_var": 0.09153645833333333, + "learning_rate": 0.0001, + "loss": 3.3793, + "loss/crossentropy": 2.348732423782349, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.2703657791018486, + "loss/reg": 0.0, + "step": 8580 + }, + { + "epoch": 0.05651315789473684, + "grad_norm": 2.984375, + "grad_norm_var": 0.14538472493489582, + "learning_rate": 0.0001, + "loss": 3.3984, + "loss/crossentropy": 2.2422220349311828, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.2956344410777092, + "loss/reg": 0.0, + "step": 8590 + }, + { + "epoch": 0.056578947368421055, + "grad_norm": 2.46875, + "grad_norm_var": 0.07770894368489584, + "learning_rate": 0.0001, + "loss": 3.3787, + "loss/crossentropy": 2.3457955360412597, + "loss/hidden": 3.2359375, + "loss/incoh": 0.0, + "loss/logits": 0.3119618773460388, + "loss/reg": 0.0, + "step": 8600 + }, + { + "epoch": 0.05664473684210526, + "grad_norm": 3.40625, + "grad_norm_var": 1.7676717122395833, + "learning_rate": 0.0001, + "loss": 3.4129, + "loss/crossentropy": 2.3159496188163757, + "loss/hidden": 3.2421875, + "loss/incoh": 0.0, + "loss/logits": 0.3751305788755417, + "loss/reg": 0.0, + "step": 8610 + }, + { + "epoch": 0.056710526315789475, + "grad_norm": 2.96875, + "grad_norm_var": 1.6932902018229166, + "learning_rate": 0.0001, + "loss": 3.4391, + "loss/crossentropy": 2.6694117546081544, + "loss/hidden": 3.053125, + "loss/incoh": 0.0, + "loss/logits": 0.29238851368427277, + "loss/reg": 0.0, + "step": 8620 + }, + { + "epoch": 0.05677631578947368, + "grad_norm": 2.3125, + "grad_norm_var": 0.11296284993489583, + "learning_rate": 0.0001, + "loss": 3.3208, + "loss/crossentropy": 2.1584444522857664, + "loss/hidden": 3.0796875, + "loss/incoh": 0.0, + "loss/logits": 0.2903019651770592, + "loss/reg": 0.0, + "step": 8630 + }, + { + "epoch": 0.056842105263157895, + "grad_norm": 2.078125, + "grad_norm_var": 0.12463785807291666, + "learning_rate": 0.0001, + "loss": 3.3435, + "loss/crossentropy": 2.2420501947402953, + "loss/hidden": 3.0, + "loss/incoh": 0.0, + "loss/logits": 0.27747504264116285, + "loss/reg": 0.0, + "step": 8640 + }, + { + "epoch": 0.05690789473684211, + "grad_norm": 2.484375, + "grad_norm_var": 0.16119384765625, + "learning_rate": 0.0001, + "loss": 3.226, + "loss/crossentropy": 2.378132700920105, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.27239564061164856, + "loss/reg": 0.0, + "step": 8650 + }, + { + "epoch": 0.056973684210526315, + "grad_norm": 2.796875, + "grad_norm_var": 0.06622721354166666, + "learning_rate": 0.0001, + "loss": 3.3322, + "loss/crossentropy": 2.1032593488693236, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.2557524010539055, + "loss/reg": 0.0, + "step": 8660 + }, + { + "epoch": 0.05703947368421053, + "grad_norm": 2.640625, + "grad_norm_var": 1.2355143229166667, + "learning_rate": 0.0001, + "loss": 3.3289, + "loss/crossentropy": 2.41143513917923, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.27892655730247495, + "loss/reg": 0.0, + "step": 8670 + }, + { + "epoch": 0.057105263157894735, + "grad_norm": 2.40625, + "grad_norm_var": 0.021117146809895834, + "learning_rate": 0.0001, + "loss": 3.348, + "loss/crossentropy": 2.443818140029907, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.3347576320171356, + "loss/reg": 0.0, + "step": 8680 + }, + { + "epoch": 0.05717105263157895, + "grad_norm": 2.453125, + "grad_norm_var": 9.24689275122101e+16, + "learning_rate": 0.0001, + "loss": 3.3877, + "loss/crossentropy": 2.181317722797394, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.30654960721731184, + "loss/reg": 0.0, + "step": 8690 + }, + { + "epoch": 0.057236842105263155, + "grad_norm": 2.203125, + "grad_norm_var": 0.108984375, + "learning_rate": 0.0001, + "loss": 3.308, + "loss/crossentropy": 2.2454151153564452, + "loss/hidden": 3.240625, + "loss/incoh": 0.0, + "loss/logits": 0.3219583719968796, + "loss/reg": 0.0, + "step": 8700 + }, + { + "epoch": 0.05730263157894737, + "grad_norm": 2.40625, + "grad_norm_var": 0.44682515462239586, + "learning_rate": 0.0001, + "loss": 3.2909, + "loss/crossentropy": 2.226694929599762, + "loss/hidden": 3.0453125, + "loss/incoh": 0.0, + "loss/logits": 0.27956253886222837, + "loss/reg": 0.0, + "step": 8710 + }, + { + "epoch": 0.057368421052631575, + "grad_norm": 2.171875, + "grad_norm_var": 0.46708577473958335, + "learning_rate": 0.0001, + "loss": 3.3322, + "loss/crossentropy": 2.329686003923416, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.27385311424732206, + "loss/reg": 0.0, + "step": 8720 + }, + { + "epoch": 0.05743421052631579, + "grad_norm": 2.703125, + "grad_norm_var": 0.09485575358072916, + "learning_rate": 0.0001, + "loss": 3.3243, + "loss/crossentropy": 2.262730371952057, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.26381057798862456, + "loss/reg": 0.0, + "step": 8730 + }, + { + "epoch": 0.0575, + "grad_norm": 2.453125, + "grad_norm_var": 0.07937825520833333, + "learning_rate": 0.0001, + "loss": 3.3226, + "loss/crossentropy": 2.43484423160553, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.2970830351114273, + "loss/reg": 0.0, + "step": 8740 + }, + { + "epoch": 0.05756578947368421, + "grad_norm": 4.0625, + "grad_norm_var": 0.23977864583333333, + "learning_rate": 0.0001, + "loss": 3.3517, + "loss/crossentropy": 2.2560600876808166, + "loss/hidden": 3.0796875, + "loss/incoh": 0.0, + "loss/logits": 0.2825756400823593, + "loss/reg": 0.0, + "step": 8750 + }, + { + "epoch": 0.05763157894736842, + "grad_norm": 2.359375, + "grad_norm_var": 4.011067708333333, + "learning_rate": 0.0001, + "loss": 3.4317, + "loss/crossentropy": 2.1193652033805845, + "loss/hidden": 3.009375, + "loss/incoh": 0.0, + "loss/logits": 0.2872270569205284, + "loss/reg": 0.0, + "step": 8760 + }, + { + "epoch": 0.05769736842105263, + "grad_norm": 2.34375, + "grad_norm_var": 0.98638916015625, + "learning_rate": 0.0001, + "loss": 3.2657, + "loss/crossentropy": 2.2515121579170225, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.2510280326008797, + "loss/reg": 0.0, + "step": 8770 + }, + { + "epoch": 0.05776315789473684, + "grad_norm": 2.90625, + "grad_norm_var": 1.39010009765625, + "learning_rate": 0.0001, + "loss": 3.3069, + "loss/crossentropy": 2.45401873588562, + "loss/hidden": 2.975, + "loss/incoh": 0.0, + "loss/logits": 0.3077335625886917, + "loss/reg": 0.0, + "step": 8780 + }, + { + "epoch": 0.05782894736842105, + "grad_norm": 2.671875, + "grad_norm_var": 1.4149373372395833, + "learning_rate": 0.0001, + "loss": 3.3864, + "loss/crossentropy": 2.3450352430343626, + "loss/hidden": 3.421875, + "loss/incoh": 0.0, + "loss/logits": 0.3485978364944458, + "loss/reg": 0.0, + "step": 8790 + }, + { + "epoch": 0.05789473684210526, + "grad_norm": 2.453125, + "grad_norm_var": 0.23061421712239583, + "learning_rate": 0.0001, + "loss": 3.261, + "loss/crossentropy": 2.4229711413383486, + "loss/hidden": 3.075, + "loss/incoh": 0.0, + "loss/logits": 0.3234298795461655, + "loss/reg": 0.0, + "step": 8800 + }, + { + "epoch": 0.057960526315789476, + "grad_norm": 2.703125, + "grad_norm_var": 4.344071451822916, + "learning_rate": 0.0001, + "loss": 3.2778, + "loss/crossentropy": 2.202963078022003, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2559796661138535, + "loss/reg": 0.0, + "step": 8810 + }, + { + "epoch": 0.05802631578947368, + "grad_norm": 2.640625, + "grad_norm_var": 4.178641764322917, + "learning_rate": 0.0001, + "loss": 3.3628, + "loss/crossentropy": 2.2936235070228577, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.32534674406051634, + "loss/reg": 0.0, + "step": 8820 + }, + { + "epoch": 0.058092105263157896, + "grad_norm": 2.046875, + "grad_norm_var": 0.17635091145833334, + "learning_rate": 0.0001, + "loss": 3.3078, + "loss/crossentropy": 2.535597097873688, + "loss/hidden": 3.1109375, + "loss/incoh": 0.0, + "loss/logits": 0.30167998671531676, + "loss/reg": 0.0, + "step": 8830 + }, + { + "epoch": 0.0581578947368421, + "grad_norm": 2.828125, + "grad_norm_var": 0.10144856770833334, + "learning_rate": 0.0001, + "loss": 3.3119, + "loss/crossentropy": 2.652754557132721, + "loss/hidden": 3.309375, + "loss/incoh": 0.0, + "loss/logits": 0.37436943501234055, + "loss/reg": 0.0, + "step": 8840 + }, + { + "epoch": 0.058223684210526316, + "grad_norm": 2.15625, + "grad_norm_var": 0.79921875, + "learning_rate": 0.0001, + "loss": 3.3639, + "loss/crossentropy": 2.4041113376617433, + "loss/hidden": 3.1421875, + "loss/incoh": 0.0, + "loss/logits": 0.331952853500843, + "loss/reg": 0.0, + "step": 8850 + }, + { + "epoch": 0.05828947368421052, + "grad_norm": 2.671875, + "grad_norm_var": 0.8512603759765625, + "learning_rate": 0.0001, + "loss": 3.2301, + "loss/crossentropy": 2.7010722875595095, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.2689524292945862, + "loss/reg": 0.0, + "step": 8860 + }, + { + "epoch": 0.058355263157894736, + "grad_norm": 2.484375, + "grad_norm_var": 0.22805887858072918, + "learning_rate": 0.0001, + "loss": 3.301, + "loss/crossentropy": 2.3007858633995055, + "loss/hidden": 3.2953125, + "loss/incoh": 0.0, + "loss/logits": 0.38490410447120665, + "loss/reg": 0.0, + "step": 8870 + }, + { + "epoch": 0.05842105263157895, + "grad_norm": 4.375, + "grad_norm_var": 0.2703928629557292, + "learning_rate": 0.0001, + "loss": 3.3077, + "loss/crossentropy": 2.562314450740814, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.29355555921792986, + "loss/reg": 0.0, + "step": 8880 + }, + { + "epoch": 0.058486842105263157, + "grad_norm": 4.46875, + "grad_norm_var": 0.5262522379557292, + "learning_rate": 0.0001, + "loss": 3.2847, + "loss/crossentropy": 2.329223835468292, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.2784240961074829, + "loss/reg": 0.0, + "step": 8890 + }, + { + "epoch": 0.05855263157894737, + "grad_norm": 2.078125, + "grad_norm_var": 0.32392476399739584, + "learning_rate": 0.0001, + "loss": 3.2975, + "loss/crossentropy": 2.371010947227478, + "loss/hidden": 3.0265625, + "loss/incoh": 0.0, + "loss/logits": 0.3012363612651825, + "loss/reg": 0.0, + "step": 8900 + }, + { + "epoch": 0.05861842105263158, + "grad_norm": 2.90625, + "grad_norm_var": 0.05488993326822917, + "learning_rate": 0.0001, + "loss": 3.3558, + "loss/crossentropy": 2.4505101799964906, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.28698593378067017, + "loss/reg": 0.0, + "step": 8910 + }, + { + "epoch": 0.05868421052631579, + "grad_norm": 2.890625, + "grad_norm_var": 1.541039021809896, + "learning_rate": 0.0001, + "loss": 3.4362, + "loss/crossentropy": 2.569356393814087, + "loss/hidden": 3.009375, + "loss/incoh": 0.0, + "loss/logits": 0.3198714107275009, + "loss/reg": 0.0, + "step": 8920 + }, + { + "epoch": 0.05875, + "grad_norm": 2.34375, + "grad_norm_var": 0.24500325520833333, + "learning_rate": 0.0001, + "loss": 3.3947, + "loss/crossentropy": 2.106434017419815, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.2644048437476158, + "loss/reg": 0.0, + "step": 8930 + }, + { + "epoch": 0.05881578947368421, + "grad_norm": 2.03125, + "grad_norm_var": 0.21669514973958334, + "learning_rate": 0.0001, + "loss": 3.2788, + "loss/crossentropy": 2.517351245880127, + "loss/hidden": 2.9921875, + "loss/incoh": 0.0, + "loss/logits": 0.27984755039215087, + "loss/reg": 0.0, + "step": 8940 + }, + { + "epoch": 0.058881578947368424, + "grad_norm": 2.328125, + "grad_norm_var": 0.06357421875, + "learning_rate": 0.0001, + "loss": 3.3106, + "loss/crossentropy": 2.3258296728134153, + "loss/hidden": 3.165625, + "loss/incoh": 0.0, + "loss/logits": 0.32734392732381823, + "loss/reg": 0.0, + "step": 8950 + }, + { + "epoch": 0.05894736842105263, + "grad_norm": 2.53125, + "grad_norm_var": 0.7411610921223958, + "learning_rate": 0.0001, + "loss": 3.4426, + "loss/crossentropy": 2.4165605783462523, + "loss/hidden": 3.125, + "loss/incoh": 0.0, + "loss/logits": 0.3073220491409302, + "loss/reg": 0.0, + "step": 8960 + }, + { + "epoch": 0.059013157894736844, + "grad_norm": 2.21875, + "grad_norm_var": 0.44649149576822916, + "learning_rate": 0.0001, + "loss": 3.2202, + "loss/crossentropy": 2.4866004467010496, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.30207364708185197, + "loss/reg": 0.0, + "step": 8970 + }, + { + "epoch": 0.05907894736842105, + "grad_norm": 2.921875, + "grad_norm_var": 0.0566070556640625, + "learning_rate": 0.0001, + "loss": 3.2957, + "loss/crossentropy": 2.4917370676994324, + "loss/hidden": 3.078125, + "loss/incoh": 0.0, + "loss/logits": 0.3163344025611877, + "loss/reg": 0.0, + "step": 8980 + }, + { + "epoch": 0.059144736842105264, + "grad_norm": 2.4375, + "grad_norm_var": 0.0467193603515625, + "learning_rate": 0.0001, + "loss": 3.337, + "loss/crossentropy": 2.529834246635437, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.2896230161190033, + "loss/reg": 0.0, + "step": 8990 + }, + { + "epoch": 0.05921052631578947, + "grad_norm": 2.4375, + "grad_norm_var": 0.024637858072916668, + "learning_rate": 0.0001, + "loss": 3.2664, + "loss/crossentropy": 2.548252558708191, + "loss/hidden": 3.1421875, + "loss/incoh": 0.0, + "loss/logits": 0.3721586674451828, + "loss/reg": 0.0, + "step": 9000 + }, + { + "epoch": 0.059276315789473684, + "grad_norm": 2.375, + "grad_norm_var": 0.010172526041666666, + "learning_rate": 0.0001, + "loss": 3.3136, + "loss/crossentropy": 2.628942942619324, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.29181756228208544, + "loss/reg": 0.0, + "step": 9010 + }, + { + "epoch": 0.0593421052631579, + "grad_norm": 2.484375, + "grad_norm_var": 0.039937337239583336, + "learning_rate": 0.0001, + "loss": 3.3276, + "loss/crossentropy": 2.35152667760849, + "loss/hidden": 2.965625, + "loss/incoh": 0.0, + "loss/logits": 0.2875461965799332, + "loss/reg": 0.0, + "step": 9020 + }, + { + "epoch": 0.059407894736842104, + "grad_norm": 2.625, + "grad_norm_var": 0.0728668212890625, + "learning_rate": 0.0001, + "loss": 3.3454, + "loss/crossentropy": 2.3906983017921446, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.2821692392230034, + "loss/reg": 0.0, + "step": 9030 + }, + { + "epoch": 0.05947368421052632, + "grad_norm": 2.5625, + "grad_norm_var": 0.0730133056640625, + "learning_rate": 0.0001, + "loss": 3.3058, + "loss/crossentropy": 2.4100876331329344, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.2886385262012482, + "loss/reg": 0.0, + "step": 9040 + }, + { + "epoch": 0.059539473684210524, + "grad_norm": 2.5, + "grad_norm_var": 0.4940592447916667, + "learning_rate": 0.0001, + "loss": 3.4398, + "loss/crossentropy": 2.464267885684967, + "loss/hidden": 3.084375, + "loss/incoh": 0.0, + "loss/logits": 0.31053231209516524, + "loss/reg": 0.0, + "step": 9050 + }, + { + "epoch": 0.05960526315789474, + "grad_norm": 2.015625, + "grad_norm_var": 0.08632405598958333, + "learning_rate": 0.0001, + "loss": 3.3778, + "loss/crossentropy": 2.402372860908508, + "loss/hidden": 3.0296875, + "loss/incoh": 0.0, + "loss/logits": 0.35591588020324705, + "loss/reg": 0.0, + "step": 9060 + }, + { + "epoch": 0.059671052631578944, + "grad_norm": 3.296875, + "grad_norm_var": 0.12502848307291667, + "learning_rate": 0.0001, + "loss": 3.3312, + "loss/crossentropy": 2.3369374930858613, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.2521222412586212, + "loss/reg": 0.0, + "step": 9070 + }, + { + "epoch": 0.05973684210526316, + "grad_norm": 3.15625, + "grad_norm_var": 0.11041259765625, + "learning_rate": 0.0001, + "loss": 3.2977, + "loss/crossentropy": 2.2894081354141234, + "loss/hidden": 3.3359375, + "loss/incoh": 0.0, + "loss/logits": 0.37748522460460665, + "loss/reg": 0.0, + "step": 9080 + }, + { + "epoch": 0.05980263157894737, + "grad_norm": 2.671875, + "grad_norm_var": 2.4495359778774493e+17, + "learning_rate": 0.0001, + "loss": 3.4555, + "loss/crossentropy": 2.248319935798645, + "loss/hidden": 3.0234375, + "loss/incoh": 0.0, + "loss/logits": 0.27714093402028084, + "loss/reg": 0.0, + "step": 9090 + }, + { + "epoch": 0.05986842105263158, + "grad_norm": 2.609375, + "grad_norm_var": 2.449535978075936e+17, + "learning_rate": 0.0001, + "loss": 3.2988, + "loss/crossentropy": 2.304659366607666, + "loss/hidden": 3.234375, + "loss/incoh": 0.0, + "loss/logits": 0.2838084354996681, + "loss/reg": 0.0, + "step": 9100 + }, + { + "epoch": 0.05993421052631579, + "grad_norm": 2.796875, + "grad_norm_var": 0.5627838134765625, + "learning_rate": 0.0001, + "loss": 3.2864, + "loss/crossentropy": 2.5072904348373415, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.30276854485273363, + "loss/reg": 0.0, + "step": 9110 + }, + { + "epoch": 0.06, + "grad_norm": 2.328125, + "grad_norm_var": 0.5690582275390625, + "learning_rate": 0.0001, + "loss": 3.2102, + "loss/crossentropy": 2.2461979389190674, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.2540374085307121, + "loss/reg": 0.0, + "step": 9120 + }, + { + "epoch": 0.06006578947368421, + "grad_norm": 2.21875, + "grad_norm_var": 0.11433817545572916, + "learning_rate": 0.0001, + "loss": 3.2304, + "loss/crossentropy": 2.3598265290260314, + "loss/hidden": 3.0265625, + "loss/incoh": 0.0, + "loss/logits": 0.28421255201101303, + "loss/reg": 0.0, + "step": 9130 + }, + { + "epoch": 0.06013157894736842, + "grad_norm": 2.578125, + "grad_norm_var": 0.05432535807291667, + "learning_rate": 0.0001, + "loss": 3.2493, + "loss/crossentropy": 2.3720561623573304, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.31470999121665955, + "loss/reg": 0.0, + "step": 9140 + }, + { + "epoch": 0.06019736842105263, + "grad_norm": 2.453125, + "grad_norm_var": 0.08810933430989583, + "learning_rate": 0.0001, + "loss": 3.3173, + "loss/crossentropy": 2.2540945589542387, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.24728769958019256, + "loss/reg": 0.0, + "step": 9150 + }, + { + "epoch": 0.060263157894736845, + "grad_norm": 3.109375, + "grad_norm_var": 0.07870992024739583, + "learning_rate": 0.0001, + "loss": 3.295, + "loss/crossentropy": 2.2703887224197388, + "loss/hidden": 3.20625, + "loss/incoh": 0.0, + "loss/logits": 0.27733070850372316, + "loss/reg": 0.0, + "step": 9160 + }, + { + "epoch": 0.06032894736842105, + "grad_norm": 2.203125, + "grad_norm_var": 0.20073954264322916, + "learning_rate": 0.0001, + "loss": 3.2337, + "loss/crossentropy": 2.5513323664665224, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.27180006355047226, + "loss/reg": 0.0, + "step": 9170 + }, + { + "epoch": 0.060394736842105265, + "grad_norm": 2.234375, + "grad_norm_var": 0.6225545247395833, + "learning_rate": 0.0001, + "loss": 3.4001, + "loss/crossentropy": 2.1638057589530946, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.2586470812559128, + "loss/reg": 0.0, + "step": 9180 + }, + { + "epoch": 0.06046052631578947, + "grad_norm": 2.140625, + "grad_norm_var": 0.37139867146809896, + "learning_rate": 0.0001, + "loss": 3.2305, + "loss/crossentropy": 2.4451101064682006, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.3055331766605377, + "loss/reg": 0.0, + "step": 9190 + }, + { + "epoch": 0.060526315789473685, + "grad_norm": 2.171875, + "grad_norm_var": 0.32624282836914065, + "learning_rate": 0.0001, + "loss": 3.2254, + "loss/crossentropy": 2.561469316482544, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.2592929035425186, + "loss/reg": 0.0, + "step": 9200 + }, + { + "epoch": 0.06059210526315789, + "grad_norm": 2.90625, + "grad_norm_var": 1.0789459228515625, + "learning_rate": 0.0001, + "loss": 3.3551, + "loss/crossentropy": 2.1200980842113495, + "loss/hidden": 2.965625, + "loss/incoh": 0.0, + "loss/logits": 0.2634043380618095, + "loss/reg": 0.0, + "step": 9210 + }, + { + "epoch": 0.060657894736842105, + "grad_norm": 2.0, + "grad_norm_var": 0.54371337890625, + "learning_rate": 0.0001, + "loss": 3.3742, + "loss/crossentropy": 2.4880159854888917, + "loss/hidden": 3.140625, + "loss/incoh": 0.0, + "loss/logits": 0.29063448309898376, + "loss/reg": 0.0, + "step": 9220 + }, + { + "epoch": 0.06072368421052632, + "grad_norm": 2.953125, + "grad_norm_var": 0.24761962890625, + "learning_rate": 0.0001, + "loss": 3.3781, + "loss/crossentropy": 2.558793306350708, + "loss/hidden": 3.040625, + "loss/incoh": 0.0, + "loss/logits": 0.3282888367772102, + "loss/reg": 0.0, + "step": 9230 + }, + { + "epoch": 0.060789473684210525, + "grad_norm": 4.75, + "grad_norm_var": 0.35944010416666666, + "learning_rate": 0.0001, + "loss": 3.2908, + "loss/crossentropy": 2.3837397813797, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.29975869655609133, + "loss/reg": 0.0, + "step": 9240 + }, + { + "epoch": 0.06085526315789474, + "grad_norm": 2.828125, + "grad_norm_var": 0.41629130045572915, + "learning_rate": 0.0001, + "loss": 3.3291, + "loss/crossentropy": 2.395397412776947, + "loss/hidden": 3.06875, + "loss/incoh": 0.0, + "loss/logits": 0.3036083221435547, + "loss/reg": 0.0, + "step": 9250 + }, + { + "epoch": 0.060921052631578945, + "grad_norm": 2.5, + "grad_norm_var": 0.09286702473958333, + "learning_rate": 0.0001, + "loss": 3.2991, + "loss/crossentropy": 2.350081342458725, + "loss/hidden": 3.234375, + "loss/incoh": 0.0, + "loss/logits": 0.31170106381177903, + "loss/reg": 0.0, + "step": 9260 + }, + { + "epoch": 0.06098684210526316, + "grad_norm": 3.125, + "grad_norm_var": 0.09381510416666666, + "learning_rate": 0.0001, + "loss": 3.2234, + "loss/crossentropy": 2.023473250865936, + "loss/hidden": 3.01875, + "loss/incoh": 0.0, + "loss/logits": 0.2770851716399193, + "loss/reg": 0.0, + "step": 9270 + }, + { + "epoch": 0.061052631578947365, + "grad_norm": 2.265625, + "grad_norm_var": 0.09631245930989583, + "learning_rate": 0.0001, + "loss": 3.2279, + "loss/crossentropy": 2.362475335597992, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.24814699292182923, + "loss/reg": 0.0, + "step": 9280 + }, + { + "epoch": 0.06111842105263158, + "grad_norm": 3.1875, + "grad_norm_var": 0.49576416015625, + "learning_rate": 0.0001, + "loss": 3.4035, + "loss/crossentropy": 2.5755128145217894, + "loss/hidden": 3.1328125, + "loss/incoh": 0.0, + "loss/logits": 0.35746499747037885, + "loss/reg": 0.0, + "step": 9290 + }, + { + "epoch": 0.06118421052631579, + "grad_norm": 2.953125, + "grad_norm_var": 0.09010009765625, + "learning_rate": 0.0001, + "loss": 3.2592, + "loss/crossentropy": 2.2949343085289002, + "loss/hidden": 3.08125, + "loss/incoh": 0.0, + "loss/logits": 0.2996257975697517, + "loss/reg": 0.0, + "step": 9300 + }, + { + "epoch": 0.06125, + "grad_norm": 2.421875, + "grad_norm_var": 0.3297190348307292, + "learning_rate": 0.0001, + "loss": 3.3329, + "loss/crossentropy": 2.4506813049316407, + "loss/hidden": 3.09375, + "loss/incoh": 0.0, + "loss/logits": 0.3265557274222374, + "loss/reg": 0.0, + "step": 9310 + }, + { + "epoch": 0.06131578947368421, + "grad_norm": 2.453125, + "grad_norm_var": 0.3309529622395833, + "learning_rate": 0.0001, + "loss": 3.2163, + "loss/crossentropy": 2.3078281760215758, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.2651080995798111, + "loss/reg": 0.0, + "step": 9320 + }, + { + "epoch": 0.06138157894736842, + "grad_norm": 2.28125, + "grad_norm_var": 0.26513671875, + "learning_rate": 0.0001, + "loss": 3.2706, + "loss/crossentropy": 2.1937548160552978, + "loss/hidden": 2.98125, + "loss/incoh": 0.0, + "loss/logits": 0.2700933560729027, + "loss/reg": 0.0, + "step": 9330 + }, + { + "epoch": 0.06144736842105263, + "grad_norm": 2.078125, + "grad_norm_var": 0.31164957682291666, + "learning_rate": 0.0001, + "loss": 3.4252, + "loss/crossentropy": 2.112582105398178, + "loss/hidden": 3.0984375, + "loss/incoh": 0.0, + "loss/logits": 0.26334969997406005, + "loss/reg": 0.0, + "step": 9340 + }, + { + "epoch": 0.06151315789473684, + "grad_norm": 2.328125, + "grad_norm_var": 0.33046875, + "learning_rate": 0.0001, + "loss": 3.3277, + "loss/crossentropy": 2.4319912672042845, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.2982776537537575, + "loss/reg": 0.0, + "step": 9350 + }, + { + "epoch": 0.06157894736842105, + "grad_norm": 2.171875, + "grad_norm_var": 0.48318684895833336, + "learning_rate": 0.0001, + "loss": 3.28, + "loss/crossentropy": 2.3863817691802978, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.2783980667591095, + "loss/reg": 0.0, + "step": 9360 + }, + { + "epoch": 0.061644736842105266, + "grad_norm": 2.59375, + "grad_norm_var": 0.4833730061848958, + "learning_rate": 0.0001, + "loss": 3.329, + "loss/crossentropy": 2.5484490633010863, + "loss/hidden": 3.03125, + "loss/incoh": 0.0, + "loss/logits": 0.3031032904982567, + "loss/reg": 0.0, + "step": 9370 + }, + { + "epoch": 0.06171052631578947, + "grad_norm": 2.203125, + "grad_norm_var": 0.08167317708333334, + "learning_rate": 0.0001, + "loss": 3.2587, + "loss/crossentropy": 2.2674924612045286, + "loss/hidden": 3.1234375, + "loss/incoh": 0.0, + "loss/logits": 0.3650638833642006, + "loss/reg": 0.0, + "step": 9380 + }, + { + "epoch": 0.061776315789473686, + "grad_norm": 2.5625, + "grad_norm_var": 0.22155659993489582, + "learning_rate": 0.0001, + "loss": 3.3241, + "loss/crossentropy": 2.3348896741867065, + "loss/hidden": 3.1203125, + "loss/incoh": 0.0, + "loss/logits": 0.283182792365551, + "loss/reg": 0.0, + "step": 9390 + }, + { + "epoch": 0.06184210526315789, + "grad_norm": 3.15625, + "grad_norm_var": 1.1829427083333333, + "learning_rate": 0.0001, + "loss": 3.3629, + "loss/crossentropy": 2.464947986602783, + "loss/hidden": 2.971875, + "loss/incoh": 0.0, + "loss/logits": 0.28233895897865297, + "loss/reg": 0.0, + "step": 9400 + }, + { + "epoch": 0.061907894736842106, + "grad_norm": 2.296875, + "grad_norm_var": 1.1680623372395833, + "learning_rate": 0.0001, + "loss": 3.3087, + "loss/crossentropy": 2.179221343994141, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.3310952290892601, + "loss/reg": 0.0, + "step": 9410 + }, + { + "epoch": 0.06197368421052631, + "grad_norm": 2.21875, + "grad_norm_var": 2.0582183837890624, + "learning_rate": 0.0001, + "loss": 3.4293, + "loss/crossentropy": 2.413185155391693, + "loss/hidden": 3.18125, + "loss/incoh": 0.0, + "loss/logits": 0.3003757044672966, + "loss/reg": 0.0, + "step": 9420 + }, + { + "epoch": 0.062039473684210526, + "grad_norm": 2.421875, + "grad_norm_var": 0.3502349853515625, + "learning_rate": 0.0001, + "loss": 3.1251, + "loss/crossentropy": 2.3518580555915833, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.2455562546849251, + "loss/reg": 0.0, + "step": 9430 + }, + { + "epoch": 0.06210526315789474, + "grad_norm": 2.578125, + "grad_norm_var": 0.028425089518229165, + "learning_rate": 0.0001, + "loss": 3.3233, + "loss/crossentropy": 2.375118088722229, + "loss/hidden": 3.153125, + "loss/incoh": 0.0, + "loss/logits": 0.31473297625780106, + "loss/reg": 0.0, + "step": 9440 + }, + { + "epoch": 0.062171052631578946, + "grad_norm": 2.46875, + "grad_norm_var": 0.0333984375, + "learning_rate": 0.0001, + "loss": 3.2654, + "loss/crossentropy": 2.3164775133132935, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.2699767738580704, + "loss/reg": 0.0, + "step": 9450 + }, + { + "epoch": 0.06223684210526316, + "grad_norm": 2.46875, + "grad_norm_var": 0.0964263916015625, + "learning_rate": 0.0001, + "loss": 3.3552, + "loss/crossentropy": 2.233529049158096, + "loss/hidden": 3.21875, + "loss/incoh": 0.0, + "loss/logits": 0.3262931898236275, + "loss/reg": 0.0, + "step": 9460 + }, + { + "epoch": 0.062302631578947366, + "grad_norm": 2.1875, + "grad_norm_var": 24.440249633789062, + "learning_rate": 0.0001, + "loss": 3.2359, + "loss/crossentropy": 2.3248747825622558, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.24342142790555954, + "loss/reg": 0.0, + "step": 9470 + }, + { + "epoch": 0.06236842105263158, + "grad_norm": 2.5625, + "grad_norm_var": 0.06562398274739584, + "learning_rate": 0.0001, + "loss": 3.3237, + "loss/crossentropy": 2.588094711303711, + "loss/hidden": 3.128125, + "loss/incoh": 0.0, + "loss/logits": 0.3172804355621338, + "loss/reg": 0.0, + "step": 9480 + }, + { + "epoch": 0.062434210526315786, + "grad_norm": 2.375, + "grad_norm_var": 0.21744384765625, + "learning_rate": 0.0001, + "loss": 3.3475, + "loss/crossentropy": 2.390676462650299, + "loss/hidden": 2.9640625, + "loss/incoh": 0.0, + "loss/logits": 0.27351657301187515, + "loss/reg": 0.0, + "step": 9490 + }, + { + "epoch": 0.0625, + "grad_norm": 2.703125, + "grad_norm_var": 0.0399078369140625, + "learning_rate": 0.0001, + "loss": 3.2819, + "loss/crossentropy": 2.3253311276435853, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.265305657684803, + "loss/reg": 0.0, + "step": 9500 + }, + { + "epoch": 0.06256578947368421, + "grad_norm": 2.25, + "grad_norm_var": 0.08692118326822916, + "learning_rate": 0.0001, + "loss": 3.2535, + "loss/crossentropy": 2.3912763714790346, + "loss/hidden": 2.9734375, + "loss/incoh": 0.0, + "loss/logits": 0.2748603358864784, + "loss/reg": 0.0, + "step": 9510 + }, + { + "epoch": 0.06263157894736843, + "grad_norm": 2.265625, + "grad_norm_var": 0.11210098266601562, + "learning_rate": 0.0001, + "loss": 3.2609, + "loss/crossentropy": 2.3854790568351745, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.2736155718564987, + "loss/reg": 0.0, + "step": 9520 + }, + { + "epoch": 0.06269736842105263, + "grad_norm": 2.9375, + "grad_norm_var": 0.13178888956705728, + "learning_rate": 0.0001, + "loss": 3.2434, + "loss/crossentropy": 2.4524547696113586, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.256032706797123, + "loss/reg": 0.0, + "step": 9530 + }, + { + "epoch": 0.06276315789473684, + "grad_norm": 2.234375, + "grad_norm_var": 0.08687744140625, + "learning_rate": 0.0001, + "loss": 3.3256, + "loss/crossentropy": 2.563627076148987, + "loss/hidden": 3.3046875, + "loss/incoh": 0.0, + "loss/logits": 0.3692674309015274, + "loss/reg": 0.0, + "step": 9540 + }, + { + "epoch": 0.06282894736842105, + "grad_norm": 2.4375, + "grad_norm_var": 0.13664957682291667, + "learning_rate": 0.0001, + "loss": 3.3197, + "loss/crossentropy": 2.3016056180000306, + "loss/hidden": 3.025, + "loss/incoh": 0.0, + "loss/logits": 0.2520491242408752, + "loss/reg": 0.0, + "step": 9550 + }, + { + "epoch": 0.06289473684210527, + "grad_norm": 2.828125, + "grad_norm_var": 0.4147857666015625, + "learning_rate": 0.0001, + "loss": 3.511, + "loss/crossentropy": 2.1577144265174866, + "loss/hidden": 3.1046875, + "loss/incoh": 0.0, + "loss/logits": 0.2929541230201721, + "loss/reg": 0.0, + "step": 9560 + }, + { + "epoch": 0.06296052631578947, + "grad_norm": 2.34375, + "grad_norm_var": 0.18341471354166666, + "learning_rate": 0.0001, + "loss": 3.2549, + "loss/crossentropy": 2.31901068687439, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.27463280111551286, + "loss/reg": 0.0, + "step": 9570 + }, + { + "epoch": 0.06302631578947368, + "grad_norm": 2.53125, + "grad_norm_var": 0.4622884114583333, + "learning_rate": 0.0001, + "loss": 3.3477, + "loss/crossentropy": 2.6131432056427, + "loss/hidden": 3.1140625, + "loss/incoh": 0.0, + "loss/logits": 0.38214774429798126, + "loss/reg": 0.0, + "step": 9580 + }, + { + "epoch": 0.0630921052631579, + "grad_norm": 2.484375, + "grad_norm_var": 0.13547337849934896, + "learning_rate": 0.0001, + "loss": 3.2885, + "loss/crossentropy": 2.451016199588776, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.2813758164644241, + "loss/reg": 0.0, + "step": 9590 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 2.1875, + "grad_norm_var": 0.2512003580729167, + "learning_rate": 0.0001, + "loss": 3.3345, + "loss/crossentropy": 2.1964026927947997, + "loss/hidden": 3.2765625, + "loss/incoh": 0.0, + "loss/logits": 0.31094489246606827, + "loss/reg": 0.0, + "step": 9600 + }, + { + "epoch": 0.06322368421052632, + "grad_norm": 2.25, + "grad_norm_var": 0.28735249837239585, + "learning_rate": 0.0001, + "loss": 3.2956, + "loss/crossentropy": 2.3820174098014832, + "loss/hidden": 3.1375, + "loss/incoh": 0.0, + "loss/logits": 0.30036603659391403, + "loss/reg": 0.0, + "step": 9610 + }, + { + "epoch": 0.06328947368421052, + "grad_norm": 2.609375, + "grad_norm_var": 0.9716102600097656, + "learning_rate": 0.0001, + "loss": 3.2363, + "loss/crossentropy": 2.4219281673431396, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2639324277639389, + "loss/reg": 0.0, + "step": 9620 + }, + { + "epoch": 0.06335526315789473, + "grad_norm": 2.8125, + "grad_norm_var": 0.8159016927083333, + "learning_rate": 0.0001, + "loss": 3.331, + "loss/crossentropy": 1.9574783891439438, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.2543580986559391, + "loss/reg": 0.0, + "step": 9630 + }, + { + "epoch": 0.06342105263157895, + "grad_norm": 2.6875, + "grad_norm_var": 0.06944961547851562, + "learning_rate": 0.0001, + "loss": 3.2008, + "loss/crossentropy": 2.210974097251892, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.2627203479409218, + "loss/reg": 0.0, + "step": 9640 + }, + { + "epoch": 0.06348684210526316, + "grad_norm": 2.9375, + "grad_norm_var": 0.2388336181640625, + "learning_rate": 0.0001, + "loss": 3.3255, + "loss/crossentropy": 2.225372338294983, + "loss/hidden": 3.0984375, + "loss/incoh": 0.0, + "loss/logits": 0.28057117611169813, + "loss/reg": 0.0, + "step": 9650 + }, + { + "epoch": 0.06355263157894737, + "grad_norm": 2.0, + "grad_norm_var": 0.3254954020182292, + "learning_rate": 0.0001, + "loss": 3.3021, + "loss/crossentropy": 2.190217161178589, + "loss/hidden": 3.1734375, + "loss/incoh": 0.0, + "loss/logits": 0.31579277813434603, + "loss/reg": 0.0, + "step": 9660 + }, + { + "epoch": 0.06361842105263157, + "grad_norm": 2.078125, + "grad_norm_var": 0.5466105143229166, + "learning_rate": 0.0001, + "loss": 3.3119, + "loss/crossentropy": 2.427161252498627, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.307109659910202, + "loss/reg": 0.0, + "step": 9670 + }, + { + "epoch": 0.06368421052631579, + "grad_norm": 2.171875, + "grad_norm_var": 0.48213602701822916, + "learning_rate": 0.0001, + "loss": 3.2915, + "loss/crossentropy": 2.2590057969093325, + "loss/hidden": 3.0765625, + "loss/incoh": 0.0, + "loss/logits": 0.2674726366996765, + "loss/reg": 0.0, + "step": 9680 + }, + { + "epoch": 0.06375, + "grad_norm": 2.53125, + "grad_norm_var": 0.2637858072916667, + "learning_rate": 0.0001, + "loss": 3.2742, + "loss/crossentropy": 2.3426333904266357, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.2729641154408455, + "loss/reg": 0.0, + "step": 9690 + }, + { + "epoch": 0.06381578947368421, + "grad_norm": 2.453125, + "grad_norm_var": 0.3402252197265625, + "learning_rate": 0.0001, + "loss": 3.2983, + "loss/crossentropy": 2.3633025169372557, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.2770617350935936, + "loss/reg": 0.0, + "step": 9700 + }, + { + "epoch": 0.06388157894736841, + "grad_norm": 3.25, + "grad_norm_var": 0.23658447265625, + "learning_rate": 0.0001, + "loss": 3.2621, + "loss/crossentropy": 2.363292157649994, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.2777522191405296, + "loss/reg": 0.0, + "step": 9710 + }, + { + "epoch": 0.06394736842105263, + "grad_norm": 3.5625, + "grad_norm_var": 0.45250244140625, + "learning_rate": 0.0001, + "loss": 3.3017, + "loss/crossentropy": 2.4210567593574526, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.2831977978348732, + "loss/reg": 0.0, + "step": 9720 + }, + { + "epoch": 0.06401315789473684, + "grad_norm": 2.453125, + "grad_norm_var": 0.2709147135416667, + "learning_rate": 0.0001, + "loss": 3.234, + "loss/crossentropy": 2.3109630227088926, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.31227574199438096, + "loss/reg": 0.0, + "step": 9730 + }, + { + "epoch": 0.06407894736842105, + "grad_norm": 2.15625, + "grad_norm_var": 0.38984273274739584, + "learning_rate": 0.0001, + "loss": 3.2414, + "loss/crossentropy": 2.225367599725723, + "loss/hidden": 3.334375, + "loss/incoh": 0.0, + "loss/logits": 0.24756639897823335, + "loss/reg": 0.0, + "step": 9740 + }, + { + "epoch": 0.06414473684210527, + "grad_norm": 3.65625, + "grad_norm_var": 0.40685933430989585, + "learning_rate": 0.0001, + "loss": 3.2959, + "loss/crossentropy": 2.633290505409241, + "loss/hidden": 2.9546875, + "loss/incoh": 0.0, + "loss/logits": 0.27978702783584597, + "loss/reg": 0.0, + "step": 9750 + }, + { + "epoch": 0.06421052631578947, + "grad_norm": 2.3125, + "grad_norm_var": 1.353466796875, + "learning_rate": 0.0001, + "loss": 3.2732, + "loss/crossentropy": 2.377350616455078, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.27294492572546003, + "loss/reg": 0.0, + "step": 9760 + }, + { + "epoch": 0.06427631578947368, + "grad_norm": 2.375, + "grad_norm_var": 1.3017242431640625, + "learning_rate": 0.0001, + "loss": 3.3766, + "loss/crossentropy": 2.230624866485596, + "loss/hidden": 2.909375, + "loss/incoh": 0.0, + "loss/logits": 0.2353967770934105, + "loss/reg": 0.0, + "step": 9770 + }, + { + "epoch": 0.0643421052631579, + "grad_norm": 3.046875, + "grad_norm_var": 30.074632771809895, + "learning_rate": 0.0001, + "loss": 3.2769, + "loss/crossentropy": 2.250266909599304, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.2896205812692642, + "loss/reg": 0.0, + "step": 9780 + }, + { + "epoch": 0.06440789473684211, + "grad_norm": 2.578125, + "grad_norm_var": 30.214286295572915, + "learning_rate": 0.0001, + "loss": 3.2666, + "loss/crossentropy": 2.495633435249329, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.2458545297384262, + "loss/reg": 0.0, + "step": 9790 + }, + { + "epoch": 0.06447368421052632, + "grad_norm": 1.96875, + "grad_norm_var": 0.09082743326822916, + "learning_rate": 0.0001, + "loss": 3.2638, + "loss/crossentropy": 2.140651452541351, + "loss/hidden": 3.0890625, + "loss/incoh": 0.0, + "loss/logits": 0.27739599645137786, + "loss/reg": 0.0, + "step": 9800 + }, + { + "epoch": 0.06453947368421052, + "grad_norm": 2.765625, + "grad_norm_var": 2.439875284830729, + "learning_rate": 0.0001, + "loss": 3.2928, + "loss/crossentropy": 2.273459422588348, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.26713447719812394, + "loss/reg": 0.0, + "step": 9810 + }, + { + "epoch": 0.06460526315789474, + "grad_norm": 2.640625, + "grad_norm_var": 2.7100901285807293, + "learning_rate": 0.0001, + "loss": 3.2839, + "loss/crossentropy": 2.4476850271224975, + "loss/hidden": 3.3984375, + "loss/incoh": 0.0, + "loss/logits": 0.35955790579319, + "loss/reg": 0.0, + "step": 9820 + }, + { + "epoch": 0.06467105263157895, + "grad_norm": 2.5, + "grad_norm_var": 0.46149800618489584, + "learning_rate": 0.0001, + "loss": 3.306, + "loss/crossentropy": 2.3544110536575316, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.31140194088220596, + "loss/reg": 0.0, + "step": 9830 + }, + { + "epoch": 0.06473684210526316, + "grad_norm": 2.4375, + "grad_norm_var": 0.15746968587239582, + "learning_rate": 0.0001, + "loss": 3.2209, + "loss/crossentropy": 2.2321391999721527, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.2574504017829895, + "loss/reg": 0.0, + "step": 9840 + }, + { + "epoch": 0.06480263157894736, + "grad_norm": 2.171875, + "grad_norm_var": 0.10312093098958333, + "learning_rate": 0.0001, + "loss": 3.2513, + "loss/crossentropy": 2.311957097053528, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.2625372394919395, + "loss/reg": 0.0, + "step": 9850 + }, + { + "epoch": 0.06486842105263158, + "grad_norm": 2.421875, + "grad_norm_var": 0.0565826416015625, + "learning_rate": 0.0001, + "loss": 3.3056, + "loss/crossentropy": 2.331065666675568, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.277868440747261, + "loss/reg": 0.0, + "step": 9860 + }, + { + "epoch": 0.06493421052631579, + "grad_norm": 2.46875, + "grad_norm_var": 0.18406575520833332, + "learning_rate": 0.0001, + "loss": 3.3417, + "loss/crossentropy": 2.314790654182434, + "loss/hidden": 3.0328125, + "loss/incoh": 0.0, + "loss/logits": 0.33730033934116366, + "loss/reg": 0.0, + "step": 9870 + }, + { + "epoch": 0.065, + "grad_norm": 2.359375, + "grad_norm_var": 0.08243815104166667, + "learning_rate": 0.0001, + "loss": 3.3522, + "loss/crossentropy": 2.489400041103363, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.28822820335626603, + "loss/reg": 0.0, + "step": 9880 + }, + { + "epoch": 0.06506578947368422, + "grad_norm": 2.171875, + "grad_norm_var": 0.0967437744140625, + "learning_rate": 0.0001, + "loss": 3.2368, + "loss/crossentropy": 2.3749926924705504, + "loss/hidden": 3.175, + "loss/incoh": 0.0, + "loss/logits": 0.3226087599992752, + "loss/reg": 0.0, + "step": 9890 + }, + { + "epoch": 0.06513157894736842, + "grad_norm": 2.265625, + "grad_norm_var": 0.11569722493489583, + "learning_rate": 0.0001, + "loss": 3.2881, + "loss/crossentropy": 2.190938687324524, + "loss/hidden": 3.0296875, + "loss/incoh": 0.0, + "loss/logits": 0.30352693498134614, + "loss/reg": 0.0, + "step": 9900 + }, + { + "epoch": 0.06519736842105263, + "grad_norm": 2.28125, + "grad_norm_var": 0.09851888020833334, + "learning_rate": 0.0001, + "loss": 3.2849, + "loss/crossentropy": 2.412072277069092, + "loss/hidden": 3.0234375, + "loss/incoh": 0.0, + "loss/logits": 0.32761459052562714, + "loss/reg": 0.0, + "step": 9910 + }, + { + "epoch": 0.06526315789473684, + "grad_norm": 2.34375, + "grad_norm_var": 0.49250895182291665, + "learning_rate": 0.0001, + "loss": 3.2984, + "loss/crossentropy": 2.2049274504184724, + "loss/hidden": 2.971875, + "loss/incoh": 0.0, + "loss/logits": 0.2710462361574173, + "loss/reg": 0.0, + "step": 9920 + }, + { + "epoch": 0.06532894736842106, + "grad_norm": 2.296875, + "grad_norm_var": 0.2756581624348958, + "learning_rate": 0.0001, + "loss": 3.3729, + "loss/crossentropy": 2.0697293996810915, + "loss/hidden": 3.3359375, + "loss/incoh": 0.0, + "loss/logits": 0.32963491380214693, + "loss/reg": 0.0, + "step": 9930 + }, + { + "epoch": 0.06539473684210527, + "grad_norm": 2.546875, + "grad_norm_var": 0.18677978515625, + "learning_rate": 0.0001, + "loss": 3.2972, + "loss/crossentropy": 2.3622434020042418, + "loss/hidden": 3.103125, + "loss/incoh": 0.0, + "loss/logits": 0.31804386228322984, + "loss/reg": 0.0, + "step": 9940 + }, + { + "epoch": 0.06546052631578947, + "grad_norm": 2.71875, + "grad_norm_var": 0.07353413899739583, + "learning_rate": 0.0001, + "loss": 3.3347, + "loss/crossentropy": 2.4146655321121218, + "loss/hidden": 3.309375, + "loss/incoh": 0.0, + "loss/logits": 0.36195366978645327, + "loss/reg": 0.0, + "step": 9950 + }, + { + "epoch": 0.06552631578947368, + "grad_norm": 2.65625, + "grad_norm_var": 0.031086222330729166, + "learning_rate": 0.0001, + "loss": 3.3045, + "loss/crossentropy": 2.499011588096619, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.27686055898666384, + "loss/reg": 0.0, + "step": 9960 + }, + { + "epoch": 0.0655921052631579, + "grad_norm": 2.359375, + "grad_norm_var": 0.36774800618489584, + "learning_rate": 0.0001, + "loss": 3.3505, + "loss/crossentropy": 2.2883435606956484, + "loss/hidden": 2.9421875, + "loss/incoh": 0.0, + "loss/logits": 0.43403479307889936, + "loss/reg": 0.0, + "step": 9970 + }, + { + "epoch": 0.06565789473684211, + "grad_norm": 2.96875, + "grad_norm_var": 0.39895426432291664, + "learning_rate": 0.0001, + "loss": 3.3004, + "loss/crossentropy": 2.328636658191681, + "loss/hidden": 2.9828125, + "loss/incoh": 0.0, + "loss/logits": 0.30042982697486875, + "loss/reg": 0.0, + "step": 9980 + }, + { + "epoch": 0.06572368421052631, + "grad_norm": 2.734375, + "grad_norm_var": 0.070458984375, + "learning_rate": 0.0001, + "loss": 3.3201, + "loss/crossentropy": 2.1099472880363463, + "loss/hidden": 3.2375, + "loss/incoh": 0.0, + "loss/logits": 0.2899234861135483, + "loss/reg": 0.0, + "step": 9990 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 2.40625, + "grad_norm_var": 0.06652018229166666, + "learning_rate": 0.0001, + "loss": 3.3435, + "loss/crossentropy": 2.2847598433494567, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.3000261425971985, + "loss/reg": 0.0, + "step": 10000 + }, + { + "epoch": 0.06585526315789474, + "grad_norm": 2.421875, + "grad_norm_var": 0.08883463541666667, + "learning_rate": 0.0001, + "loss": 3.2377, + "loss/crossentropy": 2.4516909599304197, + "loss/hidden": 3.0265625, + "loss/incoh": 0.0, + "loss/logits": 0.3408001005649567, + "loss/reg": 0.0, + "step": 10010 + }, + { + "epoch": 0.06592105263157895, + "grad_norm": 2.40625, + "grad_norm_var": 4.412495930989583, + "learning_rate": 0.0001, + "loss": 3.417, + "loss/crossentropy": 2.3393358111381533, + "loss/hidden": 3.153125, + "loss/incoh": 0.0, + "loss/logits": 0.3251391679048538, + "loss/reg": 0.0, + "step": 10020 + }, + { + "epoch": 0.06598684210526316, + "grad_norm": 2.921875, + "grad_norm_var": 4.318382771809896, + "learning_rate": 0.0001, + "loss": 3.318, + "loss/crossentropy": 2.4476661682128906, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.25891698747873304, + "loss/reg": 0.0, + "step": 10030 + }, + { + "epoch": 0.06605263157894736, + "grad_norm": 2.265625, + "grad_norm_var": 2.029248046875, + "learning_rate": 0.0001, + "loss": 3.3175, + "loss/crossentropy": 2.5090669870376585, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.3005325973033905, + "loss/reg": 0.0, + "step": 10040 + }, + { + "epoch": 0.06611842105263158, + "grad_norm": 2.515625, + "grad_norm_var": 18.96970926920573, + "learning_rate": 0.0001, + "loss": 3.3768, + "loss/crossentropy": 2.4451366662979126, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.2800415620207787, + "loss/reg": 0.0, + "step": 10050 + }, + { + "epoch": 0.06618421052631579, + "grad_norm": 2.9375, + "grad_norm_var": 0.11100260416666667, + "learning_rate": 0.0001, + "loss": 3.3131, + "loss/crossentropy": 2.3123559236526487, + "loss/hidden": 3.040625, + "loss/incoh": 0.0, + "loss/logits": 0.32703131139278413, + "loss/reg": 0.0, + "step": 10060 + }, + { + "epoch": 0.06625, + "grad_norm": 2.3125, + "grad_norm_var": 0.055985514322916666, + "learning_rate": 0.0001, + "loss": 3.2837, + "loss/crossentropy": 2.20616455078125, + "loss/hidden": 3.059375, + "loss/incoh": 0.0, + "loss/logits": 0.3035066843032837, + "loss/reg": 0.0, + "step": 10070 + }, + { + "epoch": 0.06631578947368422, + "grad_norm": 2.3125, + "grad_norm_var": 0.025560506184895835, + "learning_rate": 0.0001, + "loss": 3.202, + "loss/crossentropy": 2.3889974474906923, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.25791111290454866, + "loss/reg": 0.0, + "step": 10080 + }, + { + "epoch": 0.06638157894736842, + "grad_norm": 3.078125, + "grad_norm_var": 0.12398681640625, + "learning_rate": 0.0001, + "loss": 3.2662, + "loss/crossentropy": 2.0885241270065307, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.26190474927425383, + "loss/reg": 0.0, + "step": 10090 + }, + { + "epoch": 0.06644736842105263, + "grad_norm": 2.296875, + "grad_norm_var": 0.10334879557291667, + "learning_rate": 0.0001, + "loss": 3.2587, + "loss/crossentropy": 2.3084590315818785, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.292548905313015, + "loss/reg": 0.0, + "step": 10100 + }, + { + "epoch": 0.06651315789473684, + "grad_norm": 2.21875, + "grad_norm_var": 0.11416727701822917, + "learning_rate": 0.0001, + "loss": 3.214, + "loss/crossentropy": 2.519231605529785, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.2500781774520874, + "loss/reg": 0.0, + "step": 10110 + }, + { + "epoch": 0.06657894736842106, + "grad_norm": 2.609375, + "grad_norm_var": 0.10711263020833334, + "learning_rate": 0.0001, + "loss": 3.3349, + "loss/crossentropy": 2.3334843158721923, + "loss/hidden": 3.034375, + "loss/incoh": 0.0, + "loss/logits": 0.2946275919675827, + "loss/reg": 0.0, + "step": 10120 + }, + { + "epoch": 0.06664473684210526, + "grad_norm": 2.75, + "grad_norm_var": 0.3732086181640625, + "learning_rate": 0.0001, + "loss": 3.3638, + "loss/crossentropy": 2.4801114797592163, + "loss/hidden": 2.9609375, + "loss/incoh": 0.0, + "loss/logits": 0.29529736936092377, + "loss/reg": 0.0, + "step": 10130 + }, + { + "epoch": 0.06671052631578947, + "grad_norm": 2.5625, + "grad_norm_var": 0.4193511962890625, + "learning_rate": 0.0001, + "loss": 3.3195, + "loss/crossentropy": 2.3818048357963564, + "loss/hidden": 3.0453125, + "loss/incoh": 0.0, + "loss/logits": 0.299163943529129, + "loss/reg": 0.0, + "step": 10140 + }, + { + "epoch": 0.06677631578947368, + "grad_norm": 2.3125, + "grad_norm_var": 0.07637430826822916, + "learning_rate": 0.0001, + "loss": 3.2907, + "loss/crossentropy": 2.3112216353416444, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.24870002567768096, + "loss/reg": 0.0, + "step": 10150 + }, + { + "epoch": 0.0668421052631579, + "grad_norm": 3.859375, + "grad_norm_var": 0.16756083170572916, + "learning_rate": 0.0001, + "loss": 3.369, + "loss/crossentropy": 2.1794182300567626, + "loss/hidden": 3.0375, + "loss/incoh": 0.0, + "loss/logits": 0.3170015588402748, + "loss/reg": 0.0, + "step": 10160 + }, + { + "epoch": 0.06690789473684211, + "grad_norm": 2.34375, + "grad_norm_var": 0.17148030598958333, + "learning_rate": 0.0001, + "loss": 3.3343, + "loss/crossentropy": 2.125279116630554, + "loss/hidden": 3.084375, + "loss/incoh": 0.0, + "loss/logits": 0.27327116429805753, + "loss/reg": 0.0, + "step": 10170 + }, + { + "epoch": 0.06697368421052631, + "grad_norm": 2.03125, + "grad_norm_var": 1.8383626302083333, + "learning_rate": 0.0001, + "loss": 3.3548, + "loss/crossentropy": 2.4289526462554933, + "loss/hidden": 3.065625, + "loss/incoh": 0.0, + "loss/logits": 0.28756752908229827, + "loss/reg": 0.0, + "step": 10180 + }, + { + "epoch": 0.06703947368421052, + "grad_norm": 2.25, + "grad_norm_var": 0.4429677327473958, + "learning_rate": 0.0001, + "loss": 3.3431, + "loss/crossentropy": 2.297343075275421, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.2415475845336914, + "loss/reg": 0.0, + "step": 10190 + }, + { + "epoch": 0.06710526315789474, + "grad_norm": 2.1875, + "grad_norm_var": 0.53648681640625, + "learning_rate": 0.0001, + "loss": 3.2412, + "loss/crossentropy": 2.291072869300842, + "loss/hidden": 3.0828125, + "loss/incoh": 0.0, + "loss/logits": 0.3272585093975067, + "loss/reg": 0.0, + "step": 10200 + }, + { + "epoch": 0.06717105263157895, + "grad_norm": 2.53125, + "grad_norm_var": 0.03841044108072917, + "learning_rate": 0.0001, + "loss": 3.2881, + "loss/crossentropy": 2.3315325021743774, + "loss/hidden": 3.1046875, + "loss/incoh": 0.0, + "loss/logits": 0.2916677713394165, + "loss/reg": 0.0, + "step": 10210 + }, + { + "epoch": 0.06723684210526316, + "grad_norm": 2.28125, + "grad_norm_var": 0.05279541015625, + "learning_rate": 0.0001, + "loss": 3.2758, + "loss/crossentropy": 2.496378016471863, + "loss/hidden": 3.06875, + "loss/incoh": 0.0, + "loss/logits": 0.2993095234036446, + "loss/reg": 0.0, + "step": 10220 + }, + { + "epoch": 0.06730263157894736, + "grad_norm": 2.390625, + "grad_norm_var": 0.030101521809895834, + "learning_rate": 0.0001, + "loss": 3.184, + "loss/crossentropy": 1.954445093870163, + "loss/hidden": 2.953125, + "loss/incoh": 0.0, + "loss/logits": 0.23346130400896073, + "loss/reg": 0.0, + "step": 10230 + }, + { + "epoch": 0.06736842105263158, + "grad_norm": 3.6875, + "grad_norm_var": 0.2775675455729167, + "learning_rate": 0.0001, + "loss": 3.2837, + "loss/crossentropy": 2.4431410312652586, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.3258019149303436, + "loss/reg": 0.0, + "step": 10240 + }, + { + "epoch": 0.06743421052631579, + "grad_norm": 2.125, + "grad_norm_var": 0.33483784993489585, + "learning_rate": 0.0001, + "loss": 3.2903, + "loss/crossentropy": 2.0363747119903564, + "loss/hidden": 3.4796875, + "loss/incoh": 0.0, + "loss/logits": 0.3713301241397858, + "loss/reg": 0.0, + "step": 10250 + }, + { + "epoch": 0.0675, + "grad_norm": 2.4375, + "grad_norm_var": 0.3346028645833333, + "learning_rate": 0.0001, + "loss": 3.2875, + "loss/crossentropy": 2.1991047143936155, + "loss/hidden": 3.103125, + "loss/incoh": 0.0, + "loss/logits": 0.3233490988612175, + "loss/reg": 0.0, + "step": 10260 + }, + { + "epoch": 0.0675657894736842, + "grad_norm": 3.890625, + "grad_norm_var": 0.1937652587890625, + "learning_rate": 0.0001, + "loss": 3.2766, + "loss/crossentropy": 2.334232974052429, + "loss/hidden": 3.15625, + "loss/incoh": 0.0, + "loss/logits": 0.3195400908589363, + "loss/reg": 0.0, + "step": 10270 + }, + { + "epoch": 0.06763157894736842, + "grad_norm": 2.25, + "grad_norm_var": 0.7010162353515625, + "learning_rate": 0.0001, + "loss": 3.224, + "loss/crossentropy": 2.242165985703468, + "loss/hidden": 3.075, + "loss/incoh": 0.0, + "loss/logits": 0.27829615995287893, + "loss/reg": 0.0, + "step": 10280 + }, + { + "epoch": 0.06769736842105263, + "grad_norm": 2.078125, + "grad_norm_var": 0.1137115478515625, + "learning_rate": 0.0001, + "loss": 3.2283, + "loss/crossentropy": 2.4408915996551515, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.2770886033773422, + "loss/reg": 0.0, + "step": 10290 + }, + { + "epoch": 0.06776315789473684, + "grad_norm": 2.75, + "grad_norm_var": 0.28396708170572915, + "learning_rate": 0.0001, + "loss": 3.2788, + "loss/crossentropy": 2.2959680914878846, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.2606198683381081, + "loss/reg": 0.0, + "step": 10300 + }, + { + "epoch": 0.06782894736842106, + "grad_norm": 2.859375, + "grad_norm_var": 0.0829498291015625, + "learning_rate": 0.0001, + "loss": 3.3487, + "loss/crossentropy": 2.430151104927063, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.25218638926744463, + "loss/reg": 0.0, + "step": 10310 + }, + { + "epoch": 0.06789473684210526, + "grad_norm": 3.203125, + "grad_norm_var": 0.3111979166666667, + "learning_rate": 0.0001, + "loss": 3.3676, + "loss/crossentropy": 2.1297863006591795, + "loss/hidden": 3.415625, + "loss/incoh": 0.0, + "loss/logits": 0.4299448400735855, + "loss/reg": 0.0, + "step": 10320 + }, + { + "epoch": 0.06796052631578947, + "grad_norm": 3.0625, + "grad_norm_var": 0.34820556640625, + "learning_rate": 0.0001, + "loss": 3.3013, + "loss/crossentropy": 2.2383357286453247, + "loss/hidden": 2.990625, + "loss/incoh": 0.0, + "loss/logits": 0.31647931337356566, + "loss/reg": 0.0, + "step": 10330 + }, + { + "epoch": 0.06802631578947368, + "grad_norm": 2.40625, + "grad_norm_var": 0.0734283447265625, + "learning_rate": 0.0001, + "loss": 3.2653, + "loss/crossentropy": 2.3983714103698732, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.29214349389076233, + "loss/reg": 0.0, + "step": 10340 + }, + { + "epoch": 0.0680921052631579, + "grad_norm": 2.3125, + "grad_norm_var": 0.0652984619140625, + "learning_rate": 0.0001, + "loss": 3.2579, + "loss/crossentropy": 2.5644800424575807, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.3019869774580002, + "loss/reg": 0.0, + "step": 10350 + }, + { + "epoch": 0.06815789473684211, + "grad_norm": 2.53125, + "grad_norm_var": 3.350255903165907e+17, + "learning_rate": 0.0001, + "loss": 3.3114, + "loss/crossentropy": 2.6736939191818236, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.2768147349357605, + "loss/reg": 0.0, + "step": 10360 + }, + { + "epoch": 0.06822368421052631, + "grad_norm": 4.65625, + "grad_norm_var": 3.350255902575034e+17, + "learning_rate": 0.0001, + "loss": 3.3131, + "loss/crossentropy": 2.2038461327552796, + "loss/hidden": 3.0859375, + "loss/incoh": 0.0, + "loss/logits": 0.31388128101825713, + "loss/reg": 0.0, + "step": 10370 + }, + { + "epoch": 0.06828947368421052, + "grad_norm": 3.78125, + "grad_norm_var": 0.5317220052083333, + "learning_rate": 0.0001, + "loss": 3.3243, + "loss/crossentropy": 2.3583734750747682, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.26419220119714737, + "loss/reg": 0.0, + "step": 10380 + }, + { + "epoch": 0.06835526315789474, + "grad_norm": 2.640625, + "grad_norm_var": 0.2758127848307292, + "learning_rate": 0.0001, + "loss": 3.287, + "loss/crossentropy": 2.2863503098487854, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.23900726288557053, + "loss/reg": 0.0, + "step": 10390 + }, + { + "epoch": 0.06842105263157895, + "grad_norm": 2.3125, + "grad_norm_var": 0.04855143229166667, + "learning_rate": 0.0001, + "loss": 3.2604, + "loss/crossentropy": 2.3840556263923647, + "loss/hidden": 3.14375, + "loss/incoh": 0.0, + "loss/logits": 0.30461184978485106, + "loss/reg": 0.0, + "step": 10400 + }, + { + "epoch": 0.06848684210526315, + "grad_norm": 2.953125, + "grad_norm_var": 0.15156962076822916, + "learning_rate": 0.0001, + "loss": 3.3062, + "loss/crossentropy": 2.088348960876465, + "loss/hidden": 3.01875, + "loss/incoh": 0.0, + "loss/logits": 0.2698833361268044, + "loss/reg": 0.0, + "step": 10410 + }, + { + "epoch": 0.06855263157894737, + "grad_norm": 2.875, + "grad_norm_var": 0.5885894775390625, + "learning_rate": 0.0001, + "loss": 3.3402, + "loss/crossentropy": 2.551842737197876, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.313777893781662, + "loss/reg": 0.0, + "step": 10420 + }, + { + "epoch": 0.06861842105263158, + "grad_norm": 2.46875, + "grad_norm_var": 0.9581939697265625, + "learning_rate": 0.0001, + "loss": 3.3486, + "loss/crossentropy": 2.148633885383606, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.29070123881101606, + "loss/reg": 0.0, + "step": 10430 + }, + { + "epoch": 0.06868421052631579, + "grad_norm": 2.59375, + "grad_norm_var": 0.2256988525390625, + "learning_rate": 0.0001, + "loss": 3.3073, + "loss/crossentropy": 2.0771638333797453, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.24762868583202363, + "loss/reg": 0.0, + "step": 10440 + }, + { + "epoch": 0.06875, + "grad_norm": 1.9453125, + "grad_norm_var": 0.30576553344726565, + "learning_rate": 0.0001, + "loss": 3.3608, + "loss/crossentropy": 1.9622422456741333, + "loss/hidden": 3.046875, + "loss/incoh": 0.0, + "loss/logits": 0.2823460906744003, + "loss/reg": 0.0, + "step": 10450 + }, + { + "epoch": 0.0688157894736842, + "grad_norm": 2.5625, + "grad_norm_var": 0.24808731079101562, + "learning_rate": 0.0001, + "loss": 3.2114, + "loss/crossentropy": 2.4866459488868715, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.25186999291181567, + "loss/reg": 0.0, + "step": 10460 + }, + { + "epoch": 0.06888157894736842, + "grad_norm": 2.59375, + "grad_norm_var": 3.232233683268229, + "learning_rate": 0.0001, + "loss": 3.2672, + "loss/crossentropy": 2.4218720883131026, + "loss/hidden": 3.2421875, + "loss/incoh": 0.0, + "loss/logits": 0.34285663813352585, + "loss/reg": 0.0, + "step": 10470 + }, + { + "epoch": 0.06894736842105263, + "grad_norm": 3.0, + "grad_norm_var": 0.12942301432291667, + "learning_rate": 0.0001, + "loss": 3.2774, + "loss/crossentropy": 2.2948715806007387, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.22767567485570908, + "loss/reg": 0.0, + "step": 10480 + }, + { + "epoch": 0.06901315789473685, + "grad_norm": 2.40625, + "grad_norm_var": 0.09120686848958333, + "learning_rate": 0.0001, + "loss": 3.2795, + "loss/crossentropy": 2.4549028277397156, + "loss/hidden": 3.0515625, + "loss/incoh": 0.0, + "loss/logits": 0.2874615803360939, + "loss/reg": 0.0, + "step": 10490 + }, + { + "epoch": 0.06907894736842106, + "grad_norm": 2.265625, + "grad_norm_var": 0.14446512858072916, + "learning_rate": 0.0001, + "loss": 3.2562, + "loss/crossentropy": 2.2758328318595886, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.24516315162181854, + "loss/reg": 0.0, + "step": 10500 + }, + { + "epoch": 0.06914473684210526, + "grad_norm": 2.328125, + "grad_norm_var": 0.167626953125, + "learning_rate": 0.0001, + "loss": 3.3066, + "loss/crossentropy": 2.422711133956909, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.24841044396162032, + "loss/reg": 0.0, + "step": 10510 + }, + { + "epoch": 0.06921052631578947, + "grad_norm": 2.953125, + "grad_norm_var": 0.0686187744140625, + "learning_rate": 0.0001, + "loss": 3.2834, + "loss/crossentropy": 2.210281264781952, + "loss/hidden": 3.3, + "loss/incoh": 0.0, + "loss/logits": 0.31483527421951296, + "loss/reg": 0.0, + "step": 10520 + }, + { + "epoch": 0.06927631578947369, + "grad_norm": 2.390625, + "grad_norm_var": 0.1522857666015625, + "learning_rate": 0.0001, + "loss": 3.2678, + "loss/crossentropy": 2.3035526394844057, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.26298564970493316, + "loss/reg": 0.0, + "step": 10530 + }, + { + "epoch": 0.0693421052631579, + "grad_norm": 2.140625, + "grad_norm_var": 1.5601959228515625, + "learning_rate": 0.0001, + "loss": 3.2926, + "loss/crossentropy": 2.5075936675071717, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.27058843374252317, + "loss/reg": 0.0, + "step": 10540 + }, + { + "epoch": 0.0694078947368421, + "grad_norm": 2.171875, + "grad_norm_var": 1.5597330729166667, + "learning_rate": 0.0001, + "loss": 3.2643, + "loss/crossentropy": 2.448484253883362, + "loss/hidden": 3.1421875, + "loss/incoh": 0.0, + "loss/logits": 0.3164616659283638, + "loss/reg": 0.0, + "step": 10550 + }, + { + "epoch": 0.06947368421052631, + "grad_norm": 2.15625, + "grad_norm_var": 0.3221638997395833, + "learning_rate": 0.0001, + "loss": 3.2548, + "loss/crossentropy": 2.3997972130775453, + "loss/hidden": 3.4125, + "loss/incoh": 0.0, + "loss/logits": 0.3671171858906746, + "loss/reg": 0.0, + "step": 10560 + }, + { + "epoch": 0.06953947368421053, + "grad_norm": 2.46875, + "grad_norm_var": 0.33424072265625, + "learning_rate": 0.0001, + "loss": 3.3035, + "loss/crossentropy": 2.3439933180809023, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.24507830888032914, + "loss/reg": 0.0, + "step": 10570 + }, + { + "epoch": 0.06960526315789474, + "grad_norm": 2.5625, + "grad_norm_var": 0.15203450520833334, + "learning_rate": 0.0001, + "loss": 3.3203, + "loss/crossentropy": 2.20079083442688, + "loss/hidden": 3.203125, + "loss/incoh": 0.0, + "loss/logits": 0.3732150986790657, + "loss/reg": 0.0, + "step": 10580 + }, + { + "epoch": 0.06967105263157895, + "grad_norm": 2.359375, + "grad_norm_var": 0.23407796223958333, + "learning_rate": 0.0001, + "loss": 3.3251, + "loss/crossentropy": 2.353682446479797, + "loss/hidden": 3.1125, + "loss/incoh": 0.0, + "loss/logits": 0.2952089346945286, + "loss/reg": 0.0, + "step": 10590 + }, + { + "epoch": 0.06973684210526315, + "grad_norm": 2.453125, + "grad_norm_var": 0.22082926432291666, + "learning_rate": 0.0001, + "loss": 3.2719, + "loss/crossentropy": 2.3233517169952393, + "loss/hidden": 3.1015625, + "loss/incoh": 0.0, + "loss/logits": 0.290130452811718, + "loss/reg": 0.0, + "step": 10600 + }, + { + "epoch": 0.06980263157894737, + "grad_norm": 2.28125, + "grad_norm_var": 0.2807362874348958, + "learning_rate": 0.0001, + "loss": 3.2558, + "loss/crossentropy": 2.466183233261108, + "loss/hidden": 3.096875, + "loss/incoh": 0.0, + "loss/logits": 0.33012075573205946, + "loss/reg": 0.0, + "step": 10610 + }, + { + "epoch": 0.06986842105263158, + "grad_norm": 2.609375, + "grad_norm_var": 0.08990478515625, + "learning_rate": 0.0001, + "loss": 3.3285, + "loss/crossentropy": 2.3989938259124757, + "loss/hidden": 3.184375, + "loss/incoh": 0.0, + "loss/logits": 0.37008936554193494, + "loss/reg": 0.0, + "step": 10620 + }, + { + "epoch": 0.0699342105263158, + "grad_norm": 3.09375, + "grad_norm_var": 0.05269266764322917, + "learning_rate": 0.0001, + "loss": 3.2761, + "loss/crossentropy": 2.4531158804893494, + "loss/hidden": 2.96875, + "loss/incoh": 0.0, + "loss/logits": 0.27111856192350386, + "loss/reg": 0.0, + "step": 10630 + }, + { + "epoch": 0.07, + "grad_norm": 2.203125, + "grad_norm_var": 0.0650787353515625, + "learning_rate": 0.0001, + "loss": 3.22, + "loss/crossentropy": 2.44055380821228, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.25391832590103147, + "loss/reg": 0.0, + "step": 10640 + }, + { + "epoch": 0.0700657894736842, + "grad_norm": 2.484375, + "grad_norm_var": 0.0595367431640625, + "learning_rate": 0.0001, + "loss": 3.226, + "loss/crossentropy": 2.4790910482406616, + "loss/hidden": 2.975, + "loss/incoh": 0.0, + "loss/logits": 0.2732370227575302, + "loss/reg": 0.0, + "step": 10650 + }, + { + "epoch": 0.07013157894736842, + "grad_norm": 2248146944.0, + "grad_norm_var": 3.158852919285514e+17, + "learning_rate": 0.0001, + "loss": 3.3733, + "loss/crossentropy": 2.1218923926353455, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.23474968373775482, + "loss/reg": 0.0, + "step": 10660 + }, + { + "epoch": 0.07019736842105263, + "grad_norm": 2.40625, + "grad_norm_var": 3.158852918981078e+17, + "learning_rate": 0.0001, + "loss": 3.2364, + "loss/crossentropy": 2.3490695118904115, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.3323436751961708, + "loss/reg": 0.0, + "step": 10670 + }, + { + "epoch": 0.07026315789473685, + "grad_norm": 3.1875, + "grad_norm_var": 0.07629801432291666, + "learning_rate": 0.0001, + "loss": 3.1831, + "loss/crossentropy": 2.3577569365501403, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.26663027703762054, + "loss/reg": 0.0, + "step": 10680 + }, + { + "epoch": 0.07032894736842105, + "grad_norm": 2.703125, + "grad_norm_var": 0.0871978759765625, + "learning_rate": 0.0001, + "loss": 3.2111, + "loss/crossentropy": 2.2620568752288817, + "loss/hidden": 3.0796875, + "loss/incoh": 0.0, + "loss/logits": 0.3401878133416176, + "loss/reg": 0.0, + "step": 10690 + }, + { + "epoch": 0.07039473684210526, + "grad_norm": 2.484375, + "grad_norm_var": 0.03775634765625, + "learning_rate": 0.0001, + "loss": 3.252, + "loss/crossentropy": 2.223601281642914, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.26251988410949706, + "loss/reg": 0.0, + "step": 10700 + }, + { + "epoch": 0.07046052631578947, + "grad_norm": 2.53125, + "grad_norm_var": 0.020702107747395834, + "learning_rate": 0.0001, + "loss": 3.1904, + "loss/crossentropy": 2.2720033645629885, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.2705120757222176, + "loss/reg": 0.0, + "step": 10710 + }, + { + "epoch": 0.07052631578947369, + "grad_norm": 2.203125, + "grad_norm_var": 0.36824442545572916, + "learning_rate": 0.0001, + "loss": 3.2981, + "loss/crossentropy": 2.3247862100601195, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.25748861730098727, + "loss/reg": 0.0, + "step": 10720 + }, + { + "epoch": 0.0705921052631579, + "grad_norm": 2.421875, + "grad_norm_var": 0.27925796508789064, + "learning_rate": 0.0001, + "loss": 3.1731, + "loss/crossentropy": 1.9388428241014481, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.21091363951563835, + "loss/reg": 0.0, + "step": 10730 + }, + { + "epoch": 0.0706578947368421, + "grad_norm": 2.390625, + "grad_norm_var": 0.10903294881184895, + "learning_rate": 0.0001, + "loss": 3.2623, + "loss/crossentropy": 2.338471806049347, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.2822930008172989, + "loss/reg": 0.0, + "step": 10740 + }, + { + "epoch": 0.07072368421052631, + "grad_norm": 1.96875, + "grad_norm_var": 0.11550191243489584, + "learning_rate": 0.0001, + "loss": 3.2306, + "loss/crossentropy": 2.275705647468567, + "loss/hidden": 3.1640625, + "loss/incoh": 0.0, + "loss/logits": 0.30476620346307753, + "loss/reg": 0.0, + "step": 10750 + }, + { + "epoch": 0.07078947368421053, + "grad_norm": 2.171875, + "grad_norm_var": 1.7166033426920573, + "learning_rate": 0.0001, + "loss": 3.1802, + "loss/crossentropy": 2.375147843360901, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.3041233107447624, + "loss/reg": 0.0, + "step": 10760 + }, + { + "epoch": 0.07085526315789474, + "grad_norm": 2.75, + "grad_norm_var": 1.698127237955729, + "learning_rate": 0.0001, + "loss": 3.308, + "loss/crossentropy": 2.3641104817390444, + "loss/hidden": 2.9734375, + "loss/incoh": 0.0, + "loss/logits": 0.2578106954693794, + "loss/reg": 0.0, + "step": 10770 + }, + { + "epoch": 0.07092105263157895, + "grad_norm": 2.390625, + "grad_norm_var": 0.33463134765625, + "learning_rate": 0.0001, + "loss": 3.2267, + "loss/crossentropy": 2.3689509868621825, + "loss/hidden": 3.053125, + "loss/incoh": 0.0, + "loss/logits": 0.3170511037111282, + "loss/reg": 0.0, + "step": 10780 + }, + { + "epoch": 0.07098684210526315, + "grad_norm": 2.140625, + "grad_norm_var": 0.08876851399739584, + "learning_rate": 0.0001, + "loss": 3.2416, + "loss/crossentropy": 2.0522693753242494, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.25120987445116044, + "loss/reg": 0.0, + "step": 10790 + }, + { + "epoch": 0.07105263157894737, + "grad_norm": 2.734375, + "grad_norm_var": 0.1547027587890625, + "learning_rate": 0.0001, + "loss": 3.2808, + "loss/crossentropy": 2.193123185634613, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.30211101323366163, + "loss/reg": 0.0, + "step": 10800 + }, + { + "epoch": 0.07111842105263158, + "grad_norm": 2.09375, + "grad_norm_var": 0.11612955729166667, + "learning_rate": 0.0001, + "loss": 3.1868, + "loss/crossentropy": 2.6722516775131226, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.26585151851177213, + "loss/reg": 0.0, + "step": 10810 + }, + { + "epoch": 0.0711842105263158, + "grad_norm": 2.265625, + "grad_norm_var": 0.3016998291015625, + "learning_rate": 0.0001, + "loss": 3.1683, + "loss/crossentropy": 2.480617439746857, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.2681833073496819, + "loss/reg": 0.0, + "step": 10820 + }, + { + "epoch": 0.07125, + "grad_norm": 3.8125, + "grad_norm_var": 0.1811431884765625, + "learning_rate": 0.0001, + "loss": 3.2239, + "loss/crossentropy": 2.2105371236801146, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.27378717362880706, + "loss/reg": 0.0, + "step": 10830 + }, + { + "epoch": 0.07131578947368421, + "grad_norm": 2.203125, + "grad_norm_var": 0.17531636555989583, + "learning_rate": 0.0001, + "loss": 3.1981, + "loss/crossentropy": 2.201746928691864, + "loss/hidden": 3.2078125, + "loss/incoh": 0.0, + "loss/logits": 0.3672773316502571, + "loss/reg": 0.0, + "step": 10840 + }, + { + "epoch": 0.07138157894736842, + "grad_norm": 4.28125, + "grad_norm_var": 0.34869791666666666, + "learning_rate": 0.0001, + "loss": 3.2799, + "loss/crossentropy": 2.322747588157654, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.262615929543972, + "loss/reg": 0.0, + "step": 10850 + }, + { + "epoch": 0.07144736842105263, + "grad_norm": 2.796875, + "grad_norm_var": 0.7465810139973958, + "learning_rate": 0.0001, + "loss": 3.2065, + "loss/crossentropy": 1.9148864209651948, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.2080842524766922, + "loss/reg": 0.0, + "step": 10860 + }, + { + "epoch": 0.07151315789473685, + "grad_norm": 2.53125, + "grad_norm_var": 0.053873697916666664, + "learning_rate": 0.0001, + "loss": 3.2002, + "loss/crossentropy": 2.314997375011444, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.2829948261380196, + "loss/reg": 0.0, + "step": 10870 + }, + { + "epoch": 0.07157894736842105, + "grad_norm": 2.25, + "grad_norm_var": 0.08782145182291666, + "learning_rate": 0.0001, + "loss": 3.2222, + "loss/crossentropy": 2.4381492733955383, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.2620039567351341, + "loss/reg": 0.0, + "step": 10880 + }, + { + "epoch": 0.07164473684210526, + "grad_norm": 2.984375, + "grad_norm_var": 0.10087788899739583, + "learning_rate": 0.0001, + "loss": 3.2769, + "loss/crossentropy": 2.478635573387146, + "loss/hidden": 3.028125, + "loss/incoh": 0.0, + "loss/logits": 0.30705118626356126, + "loss/reg": 0.0, + "step": 10890 + }, + { + "epoch": 0.07171052631578947, + "grad_norm": 2.515625, + "grad_norm_var": 0.12694066365559895, + "learning_rate": 0.0001, + "loss": 3.1767, + "loss/crossentropy": 2.145402270555496, + "loss/hidden": 3.028125, + "loss/incoh": 0.0, + "loss/logits": 0.26391510516405103, + "loss/reg": 0.0, + "step": 10900 + }, + { + "epoch": 0.07177631578947369, + "grad_norm": 2.34375, + "grad_norm_var": 0.13940404256184896, + "learning_rate": 0.0001, + "loss": 3.2976, + "loss/crossentropy": 2.1119534373283386, + "loss/hidden": 3.028125, + "loss/incoh": 0.0, + "loss/logits": 0.24341508150100707, + "loss/reg": 0.0, + "step": 10910 + }, + { + "epoch": 0.0718421052631579, + "grad_norm": 2.375, + "grad_norm_var": 0.193408203125, + "learning_rate": 0.0001, + "loss": 3.2699, + "loss/crossentropy": 2.2503302097320557, + "loss/hidden": 3.196875, + "loss/incoh": 0.0, + "loss/logits": 0.2697600871324539, + "loss/reg": 0.0, + "step": 10920 + }, + { + "epoch": 0.0719078947368421, + "grad_norm": 4.9375, + "grad_norm_var": 1.1786092122395833, + "learning_rate": 0.0001, + "loss": 3.3635, + "loss/crossentropy": 2.2035016298294066, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.23886747062206268, + "loss/reg": 0.0, + "step": 10930 + }, + { + "epoch": 0.07197368421052631, + "grad_norm": 2.5, + "grad_norm_var": 1.1909088134765624, + "learning_rate": 0.0001, + "loss": 3.2664, + "loss/crossentropy": 2.438611125946045, + "loss/hidden": 2.9390625, + "loss/incoh": 0.0, + "loss/logits": 0.2649322673678398, + "loss/reg": 0.0, + "step": 10940 + }, + { + "epoch": 0.07203947368421053, + "grad_norm": 2.71875, + "grad_norm_var": 0.1803375244140625, + "learning_rate": 0.0001, + "loss": 3.2795, + "loss/crossentropy": 2.300194537639618, + "loss/hidden": 2.9546875, + "loss/incoh": 0.0, + "loss/logits": 0.2784364491701126, + "loss/reg": 0.0, + "step": 10950 + }, + { + "epoch": 0.07210526315789474, + "grad_norm": 2.328125, + "grad_norm_var": 0.13961181640625, + "learning_rate": 0.0001, + "loss": 3.196, + "loss/crossentropy": 2.297783041000366, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.23439482748508453, + "loss/reg": 0.0, + "step": 10960 + }, + { + "epoch": 0.07217105263157894, + "grad_norm": 2.5, + "grad_norm_var": 0.03483784993489583, + "learning_rate": 0.0001, + "loss": 3.2816, + "loss/crossentropy": 2.2468614101409914, + "loss/hidden": 3.0, + "loss/incoh": 0.0, + "loss/logits": 0.26336451917886733, + "loss/reg": 0.0, + "step": 10970 + }, + { + "epoch": 0.07223684210526315, + "grad_norm": 2.515625, + "grad_norm_var": 0.027684529622395832, + "learning_rate": 0.0001, + "loss": 3.2387, + "loss/crossentropy": 2.5107889652252195, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.2557321682572365, + "loss/reg": 0.0, + "step": 10980 + }, + { + "epoch": 0.07230263157894737, + "grad_norm": 2.5, + "grad_norm_var": 0.05127665201822917, + "learning_rate": 0.0001, + "loss": 3.2995, + "loss/crossentropy": 2.221148931980133, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.2837982401251793, + "loss/reg": 0.0, + "step": 10990 + }, + { + "epoch": 0.07236842105263158, + "grad_norm": 2.515625, + "grad_norm_var": 0.08108317057291667, + "learning_rate": 0.0001, + "loss": 3.289, + "loss/crossentropy": 2.515943694114685, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.269567608833313, + "loss/reg": 0.0, + "step": 11000 + }, + { + "epoch": 0.0724342105263158, + "grad_norm": 2.328125, + "grad_norm_var": 0.10301106770833333, + "learning_rate": 0.0001, + "loss": 3.2421, + "loss/crossentropy": 2.22921404838562, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.24364713877439498, + "loss/reg": 0.0, + "step": 11010 + }, + { + "epoch": 0.0725, + "grad_norm": 2.40625, + "grad_norm_var": 0.15533447265625, + "learning_rate": 0.0001, + "loss": 3.2872, + "loss/crossentropy": 2.5498072266578675, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.2900205120444298, + "loss/reg": 0.0, + "step": 11020 + }, + { + "epoch": 0.07256578947368421, + "grad_norm": 2.828125, + "grad_norm_var": 0.19057515462239583, + "learning_rate": 0.0001, + "loss": 3.2853, + "loss/crossentropy": 2.607512426376343, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.3007731422781944, + "loss/reg": 0.0, + "step": 11030 + }, + { + "epoch": 0.07263157894736842, + "grad_norm": 2.640625, + "grad_norm_var": 0.13447240193684895, + "learning_rate": 0.0001, + "loss": 3.1687, + "loss/crossentropy": 1.9578089714050293, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.23623087108135224, + "loss/reg": 0.0, + "step": 11040 + }, + { + "epoch": 0.07269736842105264, + "grad_norm": 2.453125, + "grad_norm_var": 0.2533192952473958, + "learning_rate": 0.0001, + "loss": 3.2864, + "loss/crossentropy": 2.473111295700073, + "loss/hidden": 3.334375, + "loss/incoh": 0.0, + "loss/logits": 0.31404276490211486, + "loss/reg": 0.0, + "step": 11050 + }, + { + "epoch": 0.07276315789473685, + "grad_norm": 2.625, + "grad_norm_var": 0.1464019775390625, + "learning_rate": 0.0001, + "loss": 3.2587, + "loss/crossentropy": 2.1964801430702208, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.26576130986213686, + "loss/reg": 0.0, + "step": 11060 + }, + { + "epoch": 0.07282894736842105, + "grad_norm": 2.921875, + "grad_norm_var": 0.12649637858072918, + "learning_rate": 0.0001, + "loss": 3.2663, + "loss/crossentropy": 2.2337945103645325, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.2691087871789932, + "loss/reg": 0.0, + "step": 11070 + }, + { + "epoch": 0.07289473684210526, + "grad_norm": 2.125, + "grad_norm_var": 0.08884175618489583, + "learning_rate": 0.0001, + "loss": 3.3071, + "loss/crossentropy": 2.5003953099250795, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.3058940455317497, + "loss/reg": 0.0, + "step": 11080 + }, + { + "epoch": 0.07296052631578948, + "grad_norm": 2.46875, + "grad_norm_var": 0.14674479166666668, + "learning_rate": 0.0001, + "loss": 3.2275, + "loss/crossentropy": 2.3431849002838137, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.2692634254693985, + "loss/reg": 0.0, + "step": 11090 + }, + { + "epoch": 0.07302631578947369, + "grad_norm": 2.609375, + "grad_norm_var": 0.08789774576822916, + "learning_rate": 0.0001, + "loss": 3.2168, + "loss/crossentropy": 2.2301442503929136, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.24614981114864348, + "loss/reg": 0.0, + "step": 11100 + }, + { + "epoch": 0.07309210526315789, + "grad_norm": 2.78125, + "grad_norm_var": 0.09053446451822916, + "learning_rate": 0.0001, + "loss": 3.3336, + "loss/crossentropy": 2.624728870391846, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.30191341042518616, + "loss/reg": 0.0, + "step": 11110 + }, + { + "epoch": 0.0731578947368421, + "grad_norm": 2.65625, + "grad_norm_var": 0.23191731770833332, + "learning_rate": 0.0001, + "loss": 3.3217, + "loss/crossentropy": 2.204717183113098, + "loss/hidden": 3.4859375, + "loss/incoh": 0.0, + "loss/logits": 0.5070606812834739, + "loss/reg": 0.0, + "step": 11120 + }, + { + "epoch": 0.07322368421052632, + "grad_norm": 2.640625, + "grad_norm_var": 0.2526519775390625, + "learning_rate": 0.0001, + "loss": 3.345, + "loss/crossentropy": 2.558793139457703, + "loss/hidden": 3.0828125, + "loss/incoh": 0.0, + "loss/logits": 0.42952366173267365, + "loss/reg": 0.0, + "step": 11130 + }, + { + "epoch": 0.07328947368421053, + "grad_norm": 2.53125, + "grad_norm_var": 0.0856109619140625, + "learning_rate": 0.0001, + "loss": 3.3754, + "loss/crossentropy": 2.227391791343689, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.27742871195077895, + "loss/reg": 0.0, + "step": 11140 + }, + { + "epoch": 0.07335526315789474, + "grad_norm": 2.46875, + "grad_norm_var": 0.07666015625, + "learning_rate": 0.0001, + "loss": 3.2194, + "loss/crossentropy": 2.32956976890564, + "loss/hidden": 2.9609375, + "loss/incoh": 0.0, + "loss/logits": 0.2826590985059738, + "loss/reg": 0.0, + "step": 11150 + }, + { + "epoch": 0.07342105263157894, + "grad_norm": 8.625, + "grad_norm_var": 2.3614542643229166, + "learning_rate": 0.0001, + "loss": 3.2431, + "loss/crossentropy": 2.0696181058883667, + "loss/hidden": 3.3640625, + "loss/incoh": 0.0, + "loss/logits": 0.2718909472227097, + "loss/reg": 0.0, + "step": 11160 + }, + { + "epoch": 0.07348684210526316, + "grad_norm": 2.234375, + "grad_norm_var": 2.3623697916666666, + "learning_rate": 0.0001, + "loss": 3.2983, + "loss/crossentropy": 2.5612054228782655, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.26728835999965667, + "loss/reg": 0.0, + "step": 11170 + }, + { + "epoch": 0.07355263157894737, + "grad_norm": 2.171875, + "grad_norm_var": 0.11398111979166667, + "learning_rate": 0.0001, + "loss": 3.1779, + "loss/crossentropy": 2.5427613735198973, + "loss/hidden": 2.775, + "loss/incoh": 0.0, + "loss/logits": 0.24258261919021606, + "loss/reg": 0.0, + "step": 11180 + }, + { + "epoch": 0.07361842105263158, + "grad_norm": 2.296875, + "grad_norm_var": 0.07968648274739583, + "learning_rate": 0.0001, + "loss": 3.1565, + "loss/crossentropy": 2.3772116184234617, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.24017660170793534, + "loss/reg": 0.0, + "step": 11190 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 2.9375, + "grad_norm_var": 0.058652496337890624, + "learning_rate": 0.0001, + "loss": 3.2383, + "loss/crossentropy": 2.284471166133881, + "loss/hidden": 3.0625, + "loss/incoh": 0.0, + "loss/logits": 0.31355464905500413, + "loss/reg": 0.0, + "step": 11200 + }, + { + "epoch": 0.07375, + "grad_norm": 2.5625, + "grad_norm_var": 0.04164937337239583, + "learning_rate": 0.0001, + "loss": 3.2288, + "loss/crossentropy": 2.1727640271186828, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.24059856683015823, + "loss/reg": 0.0, + "step": 11210 + }, + { + "epoch": 0.07381578947368421, + "grad_norm": 2.53125, + "grad_norm_var": 0.015363566080729167, + "learning_rate": 0.0001, + "loss": 3.2857, + "loss/crossentropy": 2.377478325366974, + "loss/hidden": 3.0984375, + "loss/incoh": 0.0, + "loss/logits": 0.2916824325919151, + "loss/reg": 0.0, + "step": 11220 + }, + { + "epoch": 0.07388157894736842, + "grad_norm": 3.140625, + "grad_norm_var": 0.04091695149739583, + "learning_rate": 0.0001, + "loss": 3.2848, + "loss/crossentropy": 2.372981941699982, + "loss/hidden": 3.140625, + "loss/incoh": 0.0, + "loss/logits": 0.3395596519112587, + "loss/reg": 0.0, + "step": 11230 + }, + { + "epoch": 0.07394736842105264, + "grad_norm": 2.59375, + "grad_norm_var": 1.1325103759765625, + "learning_rate": 0.0001, + "loss": 3.2954, + "loss/crossentropy": 2.571339511871338, + "loss/hidden": 2.9953125, + "loss/incoh": 0.0, + "loss/logits": 0.3173587560653687, + "loss/reg": 0.0, + "step": 11240 + }, + { + "epoch": 0.07401315789473684, + "grad_norm": 2.828125, + "grad_norm_var": 0.11298421223958334, + "learning_rate": 0.0001, + "loss": 3.213, + "loss/crossentropy": 2.093873751163483, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.23975073918700218, + "loss/reg": 0.0, + "step": 11250 + }, + { + "epoch": 0.07407894736842105, + "grad_norm": 2.421875, + "grad_norm_var": 0.13909403483072916, + "learning_rate": 0.0001, + "loss": 3.25, + "loss/crossentropy": 2.55220787525177, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.29909972846508026, + "loss/reg": 0.0, + "step": 11260 + }, + { + "epoch": 0.07414473684210526, + "grad_norm": 2.734375, + "grad_norm_var": 0.11393229166666667, + "learning_rate": 0.0001, + "loss": 3.2424, + "loss/crossentropy": 2.18590772151947, + "loss/hidden": 2.98125, + "loss/incoh": 0.0, + "loss/logits": 0.2539385199546814, + "loss/reg": 0.0, + "step": 11270 + }, + { + "epoch": 0.07421052631578948, + "grad_norm": 3.078125, + "grad_norm_var": 0.07273661295572917, + "learning_rate": 0.0001, + "loss": 3.2354, + "loss/crossentropy": 2.4804351210594175, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.2747698500752449, + "loss/reg": 0.0, + "step": 11280 + }, + { + "epoch": 0.07427631578947369, + "grad_norm": 2.140625, + "grad_norm_var": 0.17068684895833333, + "learning_rate": 0.0001, + "loss": 3.2114, + "loss/crossentropy": 2.4408376574516297, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.3068179443478584, + "loss/reg": 0.0, + "step": 11290 + }, + { + "epoch": 0.07434210526315789, + "grad_norm": 2.5625, + "grad_norm_var": 0.27847900390625, + "learning_rate": 0.0001, + "loss": 3.3225, + "loss/crossentropy": 2.284189748764038, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.2627576723694801, + "loss/reg": 0.0, + "step": 11300 + }, + { + "epoch": 0.0744078947368421, + "grad_norm": 2.546875, + "grad_norm_var": 0.22431233723958333, + "learning_rate": 0.0001, + "loss": 3.2794, + "loss/crossentropy": 2.348969095945358, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.22996244430541993, + "loss/reg": 0.0, + "step": 11310 + }, + { + "epoch": 0.07447368421052632, + "grad_norm": 2.890625, + "grad_norm_var": 0.8210245768229166, + "learning_rate": 0.0001, + "loss": 3.2548, + "loss/crossentropy": 2.455811655521393, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.2752078205347061, + "loss/reg": 0.0, + "step": 11320 + }, + { + "epoch": 0.07453947368421053, + "grad_norm": 4.75, + "grad_norm_var": 0.3614095052083333, + "learning_rate": 0.0001, + "loss": 3.2713, + "loss/crossentropy": 2.4881197214126587, + "loss/hidden": 3.1234375, + "loss/incoh": 0.0, + "loss/logits": 0.33918842375278474, + "loss/reg": 0.0, + "step": 11330 + }, + { + "epoch": 0.07460526315789473, + "grad_norm": 2.59375, + "grad_norm_var": 0.4044596354166667, + "learning_rate": 0.0001, + "loss": 3.334, + "loss/crossentropy": 2.4584757328033446, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.3295674562454224, + "loss/reg": 0.0, + "step": 11340 + }, + { + "epoch": 0.07467105263157894, + "grad_norm": 2.703125, + "grad_norm_var": 0.9745920817057292, + "learning_rate": 0.0001, + "loss": 3.2347, + "loss/crossentropy": 2.413297247886658, + "loss/hidden": 3.0890625, + "loss/incoh": 0.0, + "loss/logits": 0.2875028237700462, + "loss/reg": 0.0, + "step": 11350 + }, + { + "epoch": 0.07473684210526316, + "grad_norm": 2.609375, + "grad_norm_var": 0.0769439697265625, + "learning_rate": 0.0001, + "loss": 3.2495, + "loss/crossentropy": 2.297819769382477, + "loss/hidden": 3.0546875, + "loss/incoh": 0.0, + "loss/logits": 0.28927008211612704, + "loss/reg": 0.0, + "step": 11360 + }, + { + "epoch": 0.07480263157894737, + "grad_norm": 2.171875, + "grad_norm_var": 0.054182942708333334, + "learning_rate": 0.0001, + "loss": 3.1982, + "loss/crossentropy": 2.2709707379341126, + "loss/hidden": 3.03125, + "loss/incoh": 0.0, + "loss/logits": 0.29085248410701753, + "loss/reg": 0.0, + "step": 11370 + }, + { + "epoch": 0.07486842105263158, + "grad_norm": 2.609375, + "grad_norm_var": 0.315283203125, + "learning_rate": 0.0001, + "loss": 3.2877, + "loss/crossentropy": 2.5602762937545775, + "loss/hidden": 3.1828125, + "loss/incoh": 0.0, + "loss/logits": 0.3155085578560829, + "loss/reg": 0.0, + "step": 11380 + }, + { + "epoch": 0.07493421052631578, + "grad_norm": 8.375, + "grad_norm_var": 2.533186848958333, + "learning_rate": 0.0001, + "loss": 3.2192, + "loss/crossentropy": 2.231611895561218, + "loss/hidden": 3.009375, + "loss/incoh": 0.0, + "loss/logits": 0.27611204236745834, + "loss/reg": 0.0, + "step": 11390 + }, + { + "epoch": 0.075, + "grad_norm": 2.109375, + "grad_norm_var": 2.313280232747396, + "learning_rate": 0.0001, + "loss": 3.3273, + "loss/crossentropy": 2.2164941787719727, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2650335058569908, + "loss/reg": 0.0, + "step": 11400 + }, + { + "epoch": 0.07506578947368421, + "grad_norm": 2.265625, + "grad_norm_var": 0.3658528645833333, + "learning_rate": 0.0001, + "loss": 3.2587, + "loss/crossentropy": 2.2440002799034118, + "loss/hidden": 3.075, + "loss/incoh": 0.0, + "loss/logits": 0.3373932957649231, + "loss/reg": 0.0, + "step": 11410 + }, + { + "epoch": 0.07513157894736842, + "grad_norm": 2.390625, + "grad_norm_var": 0.06290690104166667, + "learning_rate": 0.0001, + "loss": 3.1439, + "loss/crossentropy": 2.405060076713562, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.26099992990493776, + "loss/reg": 0.0, + "step": 11420 + }, + { + "epoch": 0.07519736842105264, + "grad_norm": 2.40625, + "grad_norm_var": 0.04853108723958333, + "learning_rate": 0.0001, + "loss": 3.1454, + "loss/crossentropy": 2.5482593178749084, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.2424457401037216, + "loss/reg": 0.0, + "step": 11430 + }, + { + "epoch": 0.07526315789473684, + "grad_norm": 2.296875, + "grad_norm_var": 0.5184244791666667, + "learning_rate": 0.0001, + "loss": 3.2189, + "loss/crossentropy": 2.1846509099006655, + "loss/hidden": 3.0390625, + "loss/incoh": 0.0, + "loss/logits": 0.29890656769275664, + "loss/reg": 0.0, + "step": 11440 + }, + { + "epoch": 0.07532894736842105, + "grad_norm": 2.46875, + "grad_norm_var": 0.03676656087239583, + "learning_rate": 0.0001, + "loss": 3.2528, + "loss/crossentropy": 2.3350785970687866, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.26758097261190417, + "loss/reg": 0.0, + "step": 11450 + }, + { + "epoch": 0.07539473684210526, + "grad_norm": 2.40625, + "grad_norm_var": 56.07302958170573, + "learning_rate": 0.0001, + "loss": 3.3264, + "loss/crossentropy": 2.357296335697174, + "loss/hidden": 3.0171875, + "loss/incoh": 0.0, + "loss/logits": 0.32786626666784285, + "loss/reg": 0.0, + "step": 11460 + }, + { + "epoch": 0.07546052631578948, + "grad_norm": 2.71875, + "grad_norm_var": 0.05142822265625, + "learning_rate": 0.0001, + "loss": 3.1956, + "loss/crossentropy": 2.29429577589035, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.22037848085165024, + "loss/reg": 0.0, + "step": 11470 + }, + { + "epoch": 0.07552631578947368, + "grad_norm": 2.84375, + "grad_norm_var": 0.09621988932291667, + "learning_rate": 0.0001, + "loss": 3.2053, + "loss/crossentropy": 2.450187027454376, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.27029853165149687, + "loss/reg": 0.0, + "step": 11480 + }, + { + "epoch": 0.07559210526315789, + "grad_norm": 2.421875, + "grad_norm_var": 0.06201171875, + "learning_rate": 0.0001, + "loss": 3.2252, + "loss/crossentropy": 1.9360981225967406, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.23251112401485444, + "loss/reg": 0.0, + "step": 11490 + }, + { + "epoch": 0.0756578947368421, + "grad_norm": 2.375, + "grad_norm_var": 0.07566731770833333, + "learning_rate": 0.0001, + "loss": 3.2003, + "loss/crossentropy": 2.21165417432785, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.2515560120344162, + "loss/reg": 0.0, + "step": 11500 + }, + { + "epoch": 0.07572368421052632, + "grad_norm": 2.546875, + "grad_norm_var": 0.07636311848958334, + "learning_rate": 0.0001, + "loss": 3.1758, + "loss/crossentropy": 2.5108731746673585, + "loss/hidden": 3.0515625, + "loss/incoh": 0.0, + "loss/logits": 0.31811543107032775, + "loss/reg": 0.0, + "step": 11510 + }, + { + "epoch": 0.07578947368421053, + "grad_norm": 2.21875, + "grad_norm_var": 0.17082926432291667, + "learning_rate": 0.0001, + "loss": 3.2811, + "loss/crossentropy": 2.1942604899406435, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.25081480443477633, + "loss/reg": 0.0, + "step": 11520 + }, + { + "epoch": 0.07585526315789473, + "grad_norm": 2.859375, + "grad_norm_var": 0.07385660807291666, + "learning_rate": 0.0001, + "loss": 3.239, + "loss/crossentropy": 2.0190611362457274, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.28278502970933916, + "loss/reg": 0.0, + "step": 11530 + }, + { + "epoch": 0.07592105263157894, + "grad_norm": 2.296875, + "grad_norm_var": 0.045166015625, + "learning_rate": 0.0001, + "loss": 3.2825, + "loss/crossentropy": 2.3928887605667115, + "loss/hidden": 3.0375, + "loss/incoh": 0.0, + "loss/logits": 0.31386475563049315, + "loss/reg": 0.0, + "step": 11540 + }, + { + "epoch": 0.07598684210526316, + "grad_norm": 2.34375, + "grad_norm_var": 0.05419514973958333, + "learning_rate": 0.0001, + "loss": 3.236, + "loss/crossentropy": 2.3381729245185854, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.2617738708853722, + "loss/reg": 0.0, + "step": 11550 + }, + { + "epoch": 0.07605263157894737, + "grad_norm": 3.421875, + "grad_norm_var": 0.10877278645833334, + "learning_rate": 0.0001, + "loss": 3.1435, + "loss/crossentropy": 2.1223382353782654, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.2554148808121681, + "loss/reg": 0.0, + "step": 11560 + }, + { + "epoch": 0.07611842105263159, + "grad_norm": 2.75, + "grad_norm_var": 0.13093973795572916, + "learning_rate": 0.0001, + "loss": 3.2511, + "loss/crossentropy": 2.4102694511413576, + "loss/hidden": 3.0125, + "loss/incoh": 0.0, + "loss/logits": 0.2813078135251999, + "loss/reg": 0.0, + "step": 11570 + }, + { + "epoch": 0.07618421052631578, + "grad_norm": 2.296875, + "grad_norm_var": 0.12099507649739584, + "learning_rate": 0.0001, + "loss": 3.2712, + "loss/crossentropy": 2.2883424520492555, + "loss/hidden": 2.9828125, + "loss/incoh": 0.0, + "loss/logits": 0.27575887441635133, + "loss/reg": 0.0, + "step": 11580 + }, + { + "epoch": 0.07625, + "grad_norm": 2.09375, + "grad_norm_var": 0.15095926920572916, + "learning_rate": 0.0001, + "loss": 3.2856, + "loss/crossentropy": 2.4679728865623476, + "loss/hidden": 2.990625, + "loss/incoh": 0.0, + "loss/logits": 0.3342022061347961, + "loss/reg": 0.0, + "step": 11590 + }, + { + "epoch": 0.07631578947368421, + "grad_norm": 2.703125, + "grad_norm_var": 0.1062896728515625, + "learning_rate": 0.0001, + "loss": 3.2302, + "loss/crossentropy": 2.35604043006897, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.30481296926736834, + "loss/reg": 0.0, + "step": 11600 + }, + { + "epoch": 0.07638157894736843, + "grad_norm": 2.4375, + "grad_norm_var": 0.17685445149739584, + "learning_rate": 0.0001, + "loss": 3.3621, + "loss/crossentropy": 2.302362835407257, + "loss/hidden": 3.153125, + "loss/incoh": 0.0, + "loss/logits": 0.29829359203577044, + "loss/reg": 0.0, + "step": 11610 + }, + { + "epoch": 0.07644736842105262, + "grad_norm": 2.515625, + "grad_norm_var": 2.837443679954338e+17, + "learning_rate": 0.0001, + "loss": 3.365, + "loss/crossentropy": 2.3786125659942625, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.2504860758781433, + "loss/reg": 0.0, + "step": 11620 + }, + { + "epoch": 0.07651315789473684, + "grad_norm": 2.59375, + "grad_norm_var": 2.8374436804315274e+17, + "learning_rate": 0.0001, + "loss": 3.285, + "loss/crossentropy": 1.7914829134941102, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.26675148904323576, + "loss/reg": 0.0, + "step": 11630 + }, + { + "epoch": 0.07657894736842105, + "grad_norm": 2.40625, + "grad_norm_var": 0.047028605143229166, + "learning_rate": 0.0001, + "loss": 3.2398, + "loss/crossentropy": 2.3633928418159487, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.2506480649113655, + "loss/reg": 0.0, + "step": 11640 + }, + { + "epoch": 0.07664473684210527, + "grad_norm": 2.1875, + "grad_norm_var": 0.12795308430989583, + "learning_rate": 0.0001, + "loss": 3.1729, + "loss/crossentropy": 2.3739442467689513, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.24874649345874786, + "loss/reg": 0.0, + "step": 11650 + }, + { + "epoch": 0.07671052631578948, + "grad_norm": 3.140625, + "grad_norm_var": 0.057938639322916666, + "learning_rate": 0.0001, + "loss": 3.19, + "loss/crossentropy": 1.946711039543152, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.26388829201459885, + "loss/reg": 0.0, + "step": 11660 + }, + { + "epoch": 0.07677631578947368, + "grad_norm": 2.28125, + "grad_norm_var": 0.16907145182291666, + "learning_rate": 0.0001, + "loss": 3.2141, + "loss/crossentropy": 2.5971063375473022, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.29624524116516116, + "loss/reg": 0.0, + "step": 11670 + }, + { + "epoch": 0.07684210526315789, + "grad_norm": 2.796875, + "grad_norm_var": 0.20071512858072918, + "learning_rate": 0.0001, + "loss": 3.2566, + "loss/crossentropy": 2.5601096868515016, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.272810535132885, + "loss/reg": 0.0, + "step": 11680 + }, + { + "epoch": 0.0769078947368421, + "grad_norm": 2.375, + "grad_norm_var": 0.14016927083333333, + "learning_rate": 0.0001, + "loss": 3.1653, + "loss/crossentropy": 2.4755476355552672, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.27636886537075045, + "loss/reg": 0.0, + "step": 11690 + }, + { + "epoch": 0.07697368421052632, + "grad_norm": 2.484375, + "grad_norm_var": 0.641162109375, + "learning_rate": 0.0001, + "loss": 3.1798, + "loss/crossentropy": 2.558279812335968, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.24465988874435424, + "loss/reg": 0.0, + "step": 11700 + }, + { + "epoch": 0.07703947368421053, + "grad_norm": 2.25, + "grad_norm_var": 0.0510162353515625, + "learning_rate": 0.0001, + "loss": 3.1825, + "loss/crossentropy": 2.437064230442047, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.25304732471704483, + "loss/reg": 0.0, + "step": 11710 + }, + { + "epoch": 0.07710526315789473, + "grad_norm": 2.578125, + "grad_norm_var": 0.012482706705729167, + "learning_rate": 0.0001, + "loss": 3.1497, + "loss/crossentropy": 2.3207133412361145, + "loss/hidden": 3.0640625, + "loss/incoh": 0.0, + "loss/logits": 0.26052851378917696, + "loss/reg": 0.0, + "step": 11720 + }, + { + "epoch": 0.07717105263157895, + "grad_norm": 2.375, + "grad_norm_var": 0.03972066243489583, + "learning_rate": 0.0001, + "loss": 3.2354, + "loss/crossentropy": 2.210064744949341, + "loss/hidden": 3.1765625, + "loss/incoh": 0.0, + "loss/logits": 0.321417099237442, + "loss/reg": 0.0, + "step": 11730 + }, + { + "epoch": 0.07723684210526316, + "grad_norm": 2.234375, + "grad_norm_var": 0.11972249348958333, + "learning_rate": 0.0001, + "loss": 3.2828, + "loss/crossentropy": 2.2914742827415466, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.30591298937797545, + "loss/reg": 0.0, + "step": 11740 + }, + { + "epoch": 0.07730263157894737, + "grad_norm": 2.875, + "grad_norm_var": 0.48640034993489584, + "learning_rate": 0.0001, + "loss": 3.2259, + "loss/crossentropy": 2.2794241905212402, + "loss/hidden": 3.0546875, + "loss/incoh": 0.0, + "loss/logits": 0.3180560126900673, + "loss/reg": 0.0, + "step": 11750 + }, + { + "epoch": 0.07736842105263157, + "grad_norm": 3.15625, + "grad_norm_var": 0.4784576416015625, + "learning_rate": 0.0001, + "loss": 3.1508, + "loss/crossentropy": 2.237711024284363, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.22340844720602035, + "loss/reg": 0.0, + "step": 11760 + }, + { + "epoch": 0.07743421052631579, + "grad_norm": 2.34375, + "grad_norm_var": 0.2001129150390625, + "learning_rate": 0.0001, + "loss": 3.2992, + "loss/crossentropy": 2.4300220131874086, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.279655484855175, + "loss/reg": 0.0, + "step": 11770 + }, + { + "epoch": 0.0775, + "grad_norm": 2.4375, + "grad_norm_var": 0.0681549072265625, + "learning_rate": 0.0001, + "loss": 3.1433, + "loss/crossentropy": 2.1450002193450928, + "loss/hidden": 2.9484375, + "loss/incoh": 0.0, + "loss/logits": 0.26157657504081727, + "loss/reg": 0.0, + "step": 11780 + }, + { + "epoch": 0.07756578947368421, + "grad_norm": 2.671875, + "grad_norm_var": 0.038492838541666664, + "learning_rate": 0.0001, + "loss": 3.2329, + "loss/crossentropy": 2.273455095291138, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.2625778928399086, + "loss/reg": 0.0, + "step": 11790 + }, + { + "epoch": 0.07763157894736843, + "grad_norm": 2.34375, + "grad_norm_var": 0.06272379557291667, + "learning_rate": 0.0001, + "loss": 3.2736, + "loss/crossentropy": 2.2713128685951234, + "loss/hidden": 2.9421875, + "loss/incoh": 0.0, + "loss/logits": 0.2852136388421059, + "loss/reg": 0.0, + "step": 11800 + }, + { + "epoch": 0.07769736842105263, + "grad_norm": 2.546875, + "grad_norm_var": 0.14368082682291666, + "learning_rate": 0.0001, + "loss": 3.2063, + "loss/crossentropy": 2.276504385471344, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.2478427141904831, + "loss/reg": 0.0, + "step": 11810 + }, + { + "epoch": 0.07776315789473684, + "grad_norm": 2.3125, + "grad_norm_var": 0.14405008951822917, + "learning_rate": 0.0001, + "loss": 3.2627, + "loss/crossentropy": 2.3879762291908264, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.26758207380771637, + "loss/reg": 0.0, + "step": 11820 + }, + { + "epoch": 0.07782894736842105, + "grad_norm": 2.21875, + "grad_norm_var": 16.03980712890625, + "learning_rate": 0.0001, + "loss": 3.1689, + "loss/crossentropy": 2.238702917098999, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.25569620728492737, + "loss/reg": 0.0, + "step": 11830 + }, + { + "epoch": 0.07789473684210527, + "grad_norm": 2.09375, + "grad_norm_var": 16.03136774698893, + "learning_rate": 0.0001, + "loss": 3.1737, + "loss/crossentropy": 2.11713285446167, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.2322181984782219, + "loss/reg": 0.0, + "step": 11840 + }, + { + "epoch": 0.07796052631578948, + "grad_norm": 3.53125, + "grad_norm_var": 0.14957249959309896, + "learning_rate": 0.0001, + "loss": 3.2023, + "loss/crossentropy": 2.279906690120697, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.26055515706539156, + "loss/reg": 0.0, + "step": 11850 + }, + { + "epoch": 0.07802631578947368, + "grad_norm": 2.453125, + "grad_norm_var": 0.1443756103515625, + "learning_rate": 0.0001, + "loss": 3.1657, + "loss/crossentropy": 2.338582932949066, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.23672561049461366, + "loss/reg": 0.0, + "step": 11860 + }, + { + "epoch": 0.0780921052631579, + "grad_norm": 2.171875, + "grad_norm_var": 0.08467508951822916, + "learning_rate": 0.0001, + "loss": 3.1991, + "loss/crossentropy": 2.3666534066200255, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.27455085664987566, + "loss/reg": 0.0, + "step": 11870 + }, + { + "epoch": 0.0781578947368421, + "grad_norm": 2.359375, + "grad_norm_var": 0.08329671223958333, + "learning_rate": 0.0001, + "loss": 3.1862, + "loss/crossentropy": 2.232216811180115, + "loss/hidden": 2.9984375, + "loss/incoh": 0.0, + "loss/logits": 0.29752269983291624, + "loss/reg": 0.0, + "step": 11880 + }, + { + "epoch": 0.07822368421052632, + "grad_norm": 2.421875, + "grad_norm_var": 0.013792928059895833, + "learning_rate": 0.0001, + "loss": 3.2052, + "loss/crossentropy": 2.5155674695968626, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.2883127599954605, + "loss/reg": 0.0, + "step": 11890 + }, + { + "epoch": 0.07828947368421052, + "grad_norm": 2.109375, + "grad_norm_var": 0.05334879557291667, + "learning_rate": 0.0001, + "loss": 3.1696, + "loss/crossentropy": 2.3109512329101562, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.2651562377810478, + "loss/reg": 0.0, + "step": 11900 + }, + { + "epoch": 0.07835526315789473, + "grad_norm": 2.578125, + "grad_norm_var": 0.1116363525390625, + "learning_rate": 0.0001, + "loss": 3.2305, + "loss/crossentropy": 2.4196372270584106, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.24704778790473939, + "loss/reg": 0.0, + "step": 11910 + }, + { + "epoch": 0.07842105263157895, + "grad_norm": 2.25, + "grad_norm_var": 0.10711161295572917, + "learning_rate": 0.0001, + "loss": 3.2486, + "loss/crossentropy": 2.359645998477936, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.28654517233371735, + "loss/reg": 0.0, + "step": 11920 + }, + { + "epoch": 0.07848684210526316, + "grad_norm": 2.75, + "grad_norm_var": 0.06504618326822917, + "learning_rate": 0.0001, + "loss": 3.2312, + "loss/crossentropy": 2.364006555080414, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.24593105614185334, + "loss/reg": 0.0, + "step": 11930 + }, + { + "epoch": 0.07855263157894737, + "grad_norm": 2.390625, + "grad_norm_var": 0.11108296712239583, + "learning_rate": 0.0001, + "loss": 3.1318, + "loss/crossentropy": 2.2041036009788515, + "loss/hidden": 2.940625, + "loss/incoh": 0.0, + "loss/logits": 0.2498387575149536, + "loss/reg": 0.0, + "step": 11940 + }, + { + "epoch": 0.07861842105263157, + "grad_norm": 3.015625, + "grad_norm_var": 0.06920166015625, + "learning_rate": 0.0001, + "loss": 3.1955, + "loss/crossentropy": 2.2502759456634522, + "loss/hidden": 3.053125, + "loss/incoh": 0.0, + "loss/logits": 0.27016896903514864, + "loss/reg": 0.0, + "step": 11950 + }, + { + "epoch": 0.07868421052631579, + "grad_norm": 3.125, + "grad_norm_var": 0.09563700358072917, + "learning_rate": 0.0001, + "loss": 3.2128, + "loss/crossentropy": 2.1190481543540955, + "loss/hidden": 3.1734375, + "loss/incoh": 0.0, + "loss/logits": 0.288416750729084, + "loss/reg": 0.0, + "step": 11960 + }, + { + "epoch": 0.07875, + "grad_norm": 2.203125, + "grad_norm_var": 0.15563863118489582, + "learning_rate": 0.0001, + "loss": 3.2263, + "loss/crossentropy": 1.9541548937559128, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.271015003323555, + "loss/reg": 0.0, + "step": 11970 + }, + { + "epoch": 0.07881578947368421, + "grad_norm": 3.015625, + "grad_norm_var": 0.09900614420572916, + "learning_rate": 0.0001, + "loss": 3.0889, + "loss/crossentropy": 2.3341493129730226, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.2499636933207512, + "loss/reg": 0.0, + "step": 11980 + }, + { + "epoch": 0.07888157894736843, + "grad_norm": 2.46875, + "grad_norm_var": 0.07329813639322917, + "learning_rate": 0.0001, + "loss": 3.1932, + "loss/crossentropy": 2.388286221027374, + "loss/hidden": 2.85625, + "loss/incoh": 0.0, + "loss/logits": 0.24724568724632262, + "loss/reg": 0.0, + "step": 11990 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 2.390625, + "grad_norm_var": 0.05429280598958333, + "learning_rate": 0.0001, + "loss": 3.1876, + "loss/crossentropy": 2.351203644275665, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.26266307979822157, + "loss/reg": 0.0, + "step": 12000 + }, + { + "epoch": 0.07901315789473684, + "grad_norm": 2.234375, + "grad_norm_var": 0.04205322265625, + "learning_rate": 0.0001, + "loss": 3.1379, + "loss/crossentropy": 2.3741995811462404, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.24828371405601501, + "loss/reg": 0.0, + "step": 12010 + }, + { + "epoch": 0.07907894736842105, + "grad_norm": 2.640625, + "grad_norm_var": 0.13201395670572916, + "learning_rate": 0.0001, + "loss": 3.2741, + "loss/crossentropy": 2.037536895275116, + "loss/hidden": 2.7109375, + "loss/incoh": 0.0, + "loss/logits": 0.24230389446020126, + "loss/reg": 0.0, + "step": 12020 + }, + { + "epoch": 0.07914473684210527, + "grad_norm": 2.40625, + "grad_norm_var": 0.04157613118489583, + "learning_rate": 0.0001, + "loss": 3.1818, + "loss/crossentropy": 2.2516727566719057, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.2339026600122452, + "loss/reg": 0.0, + "step": 12030 + }, + { + "epoch": 0.07921052631578947, + "grad_norm": 2.3125, + "grad_norm_var": 26.919873046875, + "learning_rate": 0.0001, + "loss": 3.2787, + "loss/crossentropy": 2.3474916219711304, + "loss/hidden": 3.0140625, + "loss/incoh": 0.0, + "loss/logits": 0.2710026606917381, + "loss/reg": 0.0, + "step": 12040 + }, + { + "epoch": 0.07927631578947368, + "grad_norm": 2.453125, + "grad_norm_var": 26.796516927083335, + "learning_rate": 0.0001, + "loss": 3.254, + "loss/crossentropy": 2.153431460261345, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.23484777919948102, + "loss/reg": 0.0, + "step": 12050 + }, + { + "epoch": 0.0793421052631579, + "grad_norm": 2.71875, + "grad_norm_var": 0.0520416259765625, + "learning_rate": 0.0001, + "loss": 3.2866, + "loss/crossentropy": 2.137704038619995, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.2761687204241753, + "loss/reg": 0.0, + "step": 12060 + }, + { + "epoch": 0.07940789473684211, + "grad_norm": 2.4375, + "grad_norm_var": 0.05562515258789062, + "learning_rate": 0.0001, + "loss": 3.1658, + "loss/crossentropy": 2.2096517443656922, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.22131142765283585, + "loss/reg": 0.0, + "step": 12070 + }, + { + "epoch": 0.07947368421052632, + "grad_norm": 2.578125, + "grad_norm_var": 0.033300526936848956, + "learning_rate": 0.0001, + "loss": 3.1878, + "loss/crossentropy": 2.486378014087677, + "loss/hidden": 2.965625, + "loss/incoh": 0.0, + "loss/logits": 0.288157556951046, + "loss/reg": 0.0, + "step": 12080 + }, + { + "epoch": 0.07953947368421052, + "grad_norm": 2.359375, + "grad_norm_var": 0.18038736979166667, + "learning_rate": 0.0001, + "loss": 3.3441, + "loss/crossentropy": 2.1570433020591735, + "loss/hidden": 3.3609375, + "loss/incoh": 0.0, + "loss/logits": 0.33738467693328855, + "loss/reg": 0.0, + "step": 12090 + }, + { + "epoch": 0.07960526315789473, + "grad_norm": 2.234375, + "grad_norm_var": 0.14531962076822916, + "learning_rate": 0.0001, + "loss": 3.2092, + "loss/crossentropy": 2.11829297542572, + "loss/hidden": 3.134375, + "loss/incoh": 0.0, + "loss/logits": 0.29526630192995074, + "loss/reg": 0.0, + "step": 12100 + }, + { + "epoch": 0.07967105263157895, + "grad_norm": 2.328125, + "grad_norm_var": 0.11277669270833333, + "learning_rate": 0.0001, + "loss": 3.2437, + "loss/crossentropy": 2.151422083377838, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.31264509409666064, + "loss/reg": 0.0, + "step": 12110 + }, + { + "epoch": 0.07973684210526316, + "grad_norm": 3.046875, + "grad_norm_var": 0.09020182291666666, + "learning_rate": 0.0001, + "loss": 3.1802, + "loss/crossentropy": 2.190220355987549, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.26489182114601134, + "loss/reg": 0.0, + "step": 12120 + }, + { + "epoch": 0.07980263157894738, + "grad_norm": 2.359375, + "grad_norm_var": 0.10075581868489583, + "learning_rate": 0.0001, + "loss": 3.2733, + "loss/crossentropy": 2.4329964399337767, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.29891559183597566, + "loss/reg": 0.0, + "step": 12130 + }, + { + "epoch": 0.07986842105263157, + "grad_norm": 2.640625, + "grad_norm_var": 0.08171284993489583, + "learning_rate": 0.0001, + "loss": 3.2143, + "loss/crossentropy": 2.3487884759902955, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.2699664428830147, + "loss/reg": 0.0, + "step": 12140 + }, + { + "epoch": 0.07993421052631579, + "grad_norm": 2.1875, + "grad_norm_var": 0.11846415201822917, + "learning_rate": 0.0001, + "loss": 3.1594, + "loss/crossentropy": 2.3031589150428773, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.2605236619710922, + "loss/reg": 0.0, + "step": 12150 + }, + { + "epoch": 0.08, + "grad_norm": 3.015625, + "grad_norm_var": 0.17751363118489583, + "learning_rate": 0.0001, + "loss": 3.3283, + "loss/crossentropy": 2.4235578894615175, + "loss/hidden": 3.265625, + "loss/incoh": 0.0, + "loss/logits": 0.40606142282485963, + "loss/reg": 0.0, + "step": 12160 + }, + { + "epoch": 0.08006578947368422, + "grad_norm": 2.46875, + "grad_norm_var": 0.18465067545572916, + "learning_rate": 0.0001, + "loss": 3.2069, + "loss/crossentropy": 2.3678341031074526, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.26937836706638335, + "loss/reg": 0.0, + "step": 12170 + }, + { + "epoch": 0.08013157894736841, + "grad_norm": 2.4375, + "grad_norm_var": 0.11225484212239584, + "learning_rate": 0.0001, + "loss": 3.1606, + "loss/crossentropy": 2.606902313232422, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.2720926284790039, + "loss/reg": 0.0, + "step": 12180 + }, + { + "epoch": 0.08019736842105263, + "grad_norm": 2.140625, + "grad_norm_var": 0.03882548014322917, + "learning_rate": 0.0001, + "loss": 3.2091, + "loss/crossentropy": 2.389057195186615, + "loss/hidden": 2.934375, + "loss/incoh": 0.0, + "loss/logits": 0.2834290415048599, + "loss/reg": 0.0, + "step": 12190 + }, + { + "epoch": 0.08026315789473684, + "grad_norm": 3.328125, + "grad_norm_var": 0.082470703125, + "learning_rate": 0.0001, + "loss": 3.1846, + "loss/crossentropy": 2.1885082483291627, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.250583179295063, + "loss/reg": 0.0, + "step": 12200 + }, + { + "epoch": 0.08032894736842106, + "grad_norm": 2.328125, + "grad_norm_var": 0.14348551432291667, + "learning_rate": 0.0001, + "loss": 3.2234, + "loss/crossentropy": 2.2288808941841127, + "loss/hidden": 3.159375, + "loss/incoh": 0.0, + "loss/logits": 0.2857954427599907, + "loss/reg": 0.0, + "step": 12210 + }, + { + "epoch": 0.08039473684210527, + "grad_norm": 2.578125, + "grad_norm_var": 0.05944010416666667, + "learning_rate": 0.0001, + "loss": 3.1535, + "loss/crossentropy": 2.3943295001983644, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.27064385265111923, + "loss/reg": 0.0, + "step": 12220 + }, + { + "epoch": 0.08046052631578947, + "grad_norm": 2.921875, + "grad_norm_var": 0.051488240559895836, + "learning_rate": 0.0001, + "loss": 3.2165, + "loss/crossentropy": 2.36237952709198, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.29628041982650755, + "loss/reg": 0.0, + "step": 12230 + }, + { + "epoch": 0.08052631578947368, + "grad_norm": 2.390625, + "grad_norm_var": 0.06207682291666667, + "learning_rate": 0.0001, + "loss": 3.2112, + "loss/crossentropy": 2.26582453250885, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.3621983379125595, + "loss/reg": 0.0, + "step": 12240 + }, + { + "epoch": 0.0805921052631579, + "grad_norm": 2.234375, + "grad_norm_var": 0.08144124348958333, + "learning_rate": 0.0001, + "loss": 3.2711, + "loss/crossentropy": 2.4002971291542052, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.29988196343183515, + "loss/reg": 0.0, + "step": 12250 + }, + { + "epoch": 0.08065789473684211, + "grad_norm": 2.78125, + "grad_norm_var": 3.1181549072265624, + "learning_rate": 0.0001, + "loss": 3.2486, + "loss/crossentropy": 2.26135613322258, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.24128984957933425, + "loss/reg": 0.0, + "step": 12260 + }, + { + "epoch": 0.08072368421052632, + "grad_norm": 2.34375, + "grad_norm_var": 4.51636962890625, + "learning_rate": 0.0001, + "loss": 3.5227, + "loss/crossentropy": 2.2277899503707888, + "loss/hidden": 3.453125, + "loss/incoh": 0.0, + "loss/logits": 0.35484138429164885, + "loss/reg": 0.0, + "step": 12270 + }, + { + "epoch": 0.08078947368421052, + "grad_norm": 2.703125, + "grad_norm_var": 2.06083984375, + "learning_rate": 0.0001, + "loss": 3.2936, + "loss/crossentropy": 2.313612127304077, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.2666293315589428, + "loss/reg": 0.0, + "step": 12280 + }, + { + "epoch": 0.08085526315789474, + "grad_norm": 4.625, + "grad_norm_var": 6.363402303059896, + "learning_rate": 0.0001, + "loss": 3.2228, + "loss/crossentropy": 2.3592599511146544, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.25869676619768145, + "loss/reg": 0.0, + "step": 12290 + }, + { + "epoch": 0.08092105263157895, + "grad_norm": 2.421875, + "grad_norm_var": 0.3772532145182292, + "learning_rate": 0.0001, + "loss": 3.2186, + "loss/crossentropy": 2.50057338476181, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.2711464300751686, + "loss/reg": 0.0, + "step": 12300 + }, + { + "epoch": 0.08098684210526316, + "grad_norm": 2.40625, + "grad_norm_var": 0.07467447916666667, + "learning_rate": 0.0001, + "loss": 3.2317, + "loss/crossentropy": 2.381076216697693, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.24998296648263932, + "loss/reg": 0.0, + "step": 12310 + }, + { + "epoch": 0.08105263157894736, + "grad_norm": 2.53125, + "grad_norm_var": 1.58385009765625, + "learning_rate": 0.0001, + "loss": 3.3666, + "loss/crossentropy": 2.2987404227256776, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.24597887545824051, + "loss/reg": 0.0, + "step": 12320 + }, + { + "epoch": 0.08111842105263158, + "grad_norm": 2.421875, + "grad_norm_var": 3.158852918199495e+17, + "learning_rate": 0.0001, + "loss": 3.3716, + "loss/crossentropy": 2.415051448345184, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.29122400283813477, + "loss/reg": 0.0, + "step": 12330 + }, + { + "epoch": 0.08118421052631579, + "grad_norm": 2.734375, + "grad_norm_var": 3.158852918454168e+17, + "learning_rate": 0.0001, + "loss": 3.2774, + "loss/crossentropy": 2.082320672273636, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.2748811081051826, + "loss/reg": 0.0, + "step": 12340 + }, + { + "epoch": 0.08125, + "grad_norm": 3.46875, + "grad_norm_var": 0.26638997395833336, + "learning_rate": 0.0001, + "loss": 3.2315, + "loss/crossentropy": 2.7044607162475587, + "loss/hidden": 2.953125, + "loss/incoh": 0.0, + "loss/logits": 0.32181375473737717, + "loss/reg": 0.0, + "step": 12350 + }, + { + "epoch": 0.08131578947368422, + "grad_norm": 2.765625, + "grad_norm_var": 0.22693684895833333, + "learning_rate": 0.0001, + "loss": 3.2351, + "loss/crossentropy": 2.7221840620040894, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.26635742783546446, + "loss/reg": 0.0, + "step": 12360 + }, + { + "epoch": 0.08138157894736842, + "grad_norm": 2.5625, + "grad_norm_var": 0.26437174479166664, + "learning_rate": 0.0001, + "loss": 3.2068, + "loss/crossentropy": 2.3512622594833372, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.2477712720632553, + "loss/reg": 0.0, + "step": 12370 + }, + { + "epoch": 0.08144736842105263, + "grad_norm": 2.125, + "grad_norm_var": 0.3268707275390625, + "learning_rate": 0.0001, + "loss": 3.1974, + "loss/crossentropy": 2.210025131702423, + "loss/hidden": 3.05625, + "loss/incoh": 0.0, + "loss/logits": 0.36077398508787156, + "loss/reg": 0.0, + "step": 12380 + }, + { + "epoch": 0.08151315789473684, + "grad_norm": 2.546875, + "grad_norm_var": 0.5682525634765625, + "learning_rate": 0.0001, + "loss": 3.2079, + "loss/crossentropy": 2.0402897000312805, + "loss/hidden": 2.965625, + "loss/incoh": 0.0, + "loss/logits": 0.26192123591899874, + "loss/reg": 0.0, + "step": 12390 + }, + { + "epoch": 0.08157894736842106, + "grad_norm": 2.65625, + "grad_norm_var": 0.3712565104166667, + "learning_rate": 0.0001, + "loss": 3.2107, + "loss/crossentropy": 1.9560218453407288, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.21844983994960784, + "loss/reg": 0.0, + "step": 12400 + }, + { + "epoch": 0.08164473684210527, + "grad_norm": 3.171875, + "grad_norm_var": 0.4505859375, + "learning_rate": 0.0001, + "loss": 3.2759, + "loss/crossentropy": 2.3604748249053955, + "loss/hidden": 2.85625, + "loss/incoh": 0.0, + "loss/logits": 0.2524840489029884, + "loss/reg": 0.0, + "step": 12410 + }, + { + "epoch": 0.08171052631578947, + "grad_norm": 2.484375, + "grad_norm_var": 0.24068603515625, + "learning_rate": 0.0001, + "loss": 3.2728, + "loss/crossentropy": 2.517817199230194, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.29347013384103776, + "loss/reg": 0.0, + "step": 12420 + }, + { + "epoch": 0.08177631578947368, + "grad_norm": 2.40625, + "grad_norm_var": 0.12271728515625, + "learning_rate": 0.0001, + "loss": 3.2551, + "loss/crossentropy": 2.4665472149848937, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.2709146931767464, + "loss/reg": 0.0, + "step": 12430 + }, + { + "epoch": 0.0818421052631579, + "grad_norm": 2.265625, + "grad_norm_var": 0.2295074462890625, + "learning_rate": 0.0001, + "loss": 3.2219, + "loss/crossentropy": 2.40498046875, + "loss/hidden": 3.1, + "loss/incoh": 0.0, + "loss/logits": 0.32999152690172195, + "loss/reg": 0.0, + "step": 12440 + }, + { + "epoch": 0.08190789473684211, + "grad_norm": 2.421875, + "grad_norm_var": 0.1619049072265625, + "learning_rate": 0.0001, + "loss": 3.0867, + "loss/crossentropy": 2.3188422203063963, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.29690912514925005, + "loss/reg": 0.0, + "step": 12450 + }, + { + "epoch": 0.08197368421052631, + "grad_norm": 2.09375, + "grad_norm_var": 0.07049051920572917, + "learning_rate": 0.0001, + "loss": 3.2682, + "loss/crossentropy": 2.588274967670441, + "loss/hidden": 3.4046875, + "loss/incoh": 0.0, + "loss/logits": 0.39090928733348845, + "loss/reg": 0.0, + "step": 12460 + }, + { + "epoch": 0.08203947368421052, + "grad_norm": 3.921875, + "grad_norm_var": 0.2250261942545573, + "learning_rate": 0.0001, + "loss": 3.2503, + "loss/crossentropy": 2.6322230458259583, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.2629552409052849, + "loss/reg": 0.0, + "step": 12470 + }, + { + "epoch": 0.08210526315789474, + "grad_norm": 2.203125, + "grad_norm_var": 0.18522109985351562, + "learning_rate": 0.0001, + "loss": 3.1313, + "loss/crossentropy": 2.1257114171981812, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.23226768374443055, + "loss/reg": 0.0, + "step": 12480 + }, + { + "epoch": 0.08217105263157895, + "grad_norm": 2.25, + "grad_norm_var": 0.16383056640625, + "learning_rate": 0.0001, + "loss": 3.2008, + "loss/crossentropy": 1.9993813276290893, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.21686773076653482, + "loss/reg": 0.0, + "step": 12490 + }, + { + "epoch": 0.08223684210526316, + "grad_norm": 2.234375, + "grad_norm_var": 0.12244364420572916, + "learning_rate": 0.0001, + "loss": 3.1643, + "loss/crossentropy": 2.4217318654060365, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.2706719309091568, + "loss/reg": 0.0, + "step": 12500 + }, + { + "epoch": 0.08230263157894736, + "grad_norm": 3.03125, + "grad_norm_var": 0.9795888264973959, + "learning_rate": 0.0001, + "loss": 3.2763, + "loss/crossentropy": 2.131495940685272, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.24229931831359863, + "loss/reg": 0.0, + "step": 12510 + }, + { + "epoch": 0.08236842105263158, + "grad_norm": 2.171875, + "grad_norm_var": 1.021240234375, + "learning_rate": 0.0001, + "loss": 3.1894, + "loss/crossentropy": 2.2545747995376586, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.26802987307310105, + "loss/reg": 0.0, + "step": 12520 + }, + { + "epoch": 0.08243421052631579, + "grad_norm": 2.234375, + "grad_norm_var": 2.0329969940865024e+17, + "learning_rate": 0.0001, + "loss": 3.364, + "loss/crossentropy": 2.37786750793457, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.27177259773015977, + "loss/reg": 0.0, + "step": 12530 + }, + { + "epoch": 0.0825, + "grad_norm": 2.46875, + "grad_norm_var": 0.0586822509765625, + "learning_rate": 0.0001, + "loss": 3.1425, + "loss/crossentropy": 2.3742210388183596, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.25972897857427596, + "loss/reg": 0.0, + "step": 12540 + }, + { + "epoch": 0.08256578947368422, + "grad_norm": 2.21875, + "grad_norm_var": 0.16419270833333333, + "learning_rate": 0.0001, + "loss": 3.1808, + "loss/crossentropy": 2.2249507308006287, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.260022896528244, + "loss/reg": 0.0, + "step": 12550 + }, + { + "epoch": 0.08263157894736842, + "grad_norm": 3.765625, + "grad_norm_var": 0.1635406494140625, + "learning_rate": 0.0001, + "loss": 3.2244, + "loss/crossentropy": 2.417391860485077, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.24499612003564836, + "loss/reg": 0.0, + "step": 12560 + }, + { + "epoch": 0.08269736842105263, + "grad_norm": 2.328125, + "grad_norm_var": 0.24940999348958334, + "learning_rate": 0.0001, + "loss": 3.259, + "loss/crossentropy": 2.008757221698761, + "loss/hidden": 3.371875, + "loss/incoh": 0.0, + "loss/logits": 0.28915109634399416, + "loss/reg": 0.0, + "step": 12570 + }, + { + "epoch": 0.08276315789473684, + "grad_norm": 2.4375, + "grad_norm_var": 0.030516560872395834, + "learning_rate": 0.0001, + "loss": 3.0918, + "loss/crossentropy": 2.3331239223480225, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.27426364421844485, + "loss/reg": 0.0, + "step": 12580 + }, + { + "epoch": 0.08282894736842106, + "grad_norm": 2.3125, + "grad_norm_var": 0.4984527587890625, + "learning_rate": 0.0001, + "loss": 3.2675, + "loss/crossentropy": 2.3463852405548096, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.2746781826019287, + "loss/reg": 0.0, + "step": 12590 + }, + { + "epoch": 0.08289473684210526, + "grad_norm": 2.09375, + "grad_norm_var": 0.026822916666666665, + "learning_rate": 0.0001, + "loss": 3.0942, + "loss/crossentropy": 2.1946144729852675, + "loss/hidden": 3.0390625, + "loss/incoh": 0.0, + "loss/logits": 0.273574560880661, + "loss/reg": 0.0, + "step": 12600 + }, + { + "epoch": 0.08296052631578947, + "grad_norm": 2.75, + "grad_norm_var": 0.399658203125, + "learning_rate": 0.0001, + "loss": 3.3113, + "loss/crossentropy": 2.292886030673981, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.3399674043059349, + "loss/reg": 0.0, + "step": 12610 + }, + { + "epoch": 0.08302631578947368, + "grad_norm": 2.703125, + "grad_norm_var": 0.306982421875, + "learning_rate": 0.0001, + "loss": 3.3316, + "loss/crossentropy": 2.2384460091590883, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.2688921958208084, + "loss/reg": 0.0, + "step": 12620 + }, + { + "epoch": 0.0830921052631579, + "grad_norm": 2.296875, + "grad_norm_var": 0.04449462890625, + "learning_rate": 0.0001, + "loss": 3.1944, + "loss/crossentropy": 2.2317716479301453, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.23216352015733718, + "loss/reg": 0.0, + "step": 12630 + }, + { + "epoch": 0.08315789473684211, + "grad_norm": 2.203125, + "grad_norm_var": 0.05810139973958333, + "learning_rate": 0.0001, + "loss": 3.1734, + "loss/crossentropy": 2.513770651817322, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.26424300968647, + "loss/reg": 0.0, + "step": 12640 + }, + { + "epoch": 0.08322368421052631, + "grad_norm": 2.765625, + "grad_norm_var": 0.060302734375, + "learning_rate": 0.0001, + "loss": 3.212, + "loss/crossentropy": 2.3643002271652223, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.2974396377801895, + "loss/reg": 0.0, + "step": 12650 + }, + { + "epoch": 0.08328947368421052, + "grad_norm": 2.765625, + "grad_norm_var": 0.11692301432291667, + "learning_rate": 0.0001, + "loss": 3.2013, + "loss/crossentropy": 2.200558376312256, + "loss/hidden": 2.934375, + "loss/incoh": 0.0, + "loss/logits": 0.26023727655410767, + "loss/reg": 0.0, + "step": 12660 + }, + { + "epoch": 0.08335526315789474, + "grad_norm": 2.34375, + "grad_norm_var": 0.0976470947265625, + "learning_rate": 0.0001, + "loss": 3.2425, + "loss/crossentropy": 2.3456878662109375, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.30842293947935107, + "loss/reg": 0.0, + "step": 12670 + }, + { + "epoch": 0.08342105263157895, + "grad_norm": 2.4375, + "grad_norm_var": 0.05621744791666667, + "learning_rate": 0.0001, + "loss": 3.1823, + "loss/crossentropy": 2.2649134039878844, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.24366314560174943, + "loss/reg": 0.0, + "step": 12680 + }, + { + "epoch": 0.08348684210526315, + "grad_norm": 2.328125, + "grad_norm_var": 0.059891764322916666, + "learning_rate": 0.0001, + "loss": 3.1859, + "loss/crossentropy": 2.0741775274276733, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.24329135864973067, + "loss/reg": 0.0, + "step": 12690 + }, + { + "epoch": 0.08355263157894736, + "grad_norm": 2.40625, + "grad_norm_var": 0.13880208333333333, + "learning_rate": 0.0001, + "loss": 3.1551, + "loss/crossentropy": 2.4719223856925963, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.24188547879457473, + "loss/reg": 0.0, + "step": 12700 + }, + { + "epoch": 0.08361842105263158, + "grad_norm": 2.8125, + "grad_norm_var": 0.32080078125, + "learning_rate": 0.0001, + "loss": 3.3068, + "loss/crossentropy": 2.279301416873932, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.2998178914189339, + "loss/reg": 0.0, + "step": 12710 + }, + { + "epoch": 0.08368421052631579, + "grad_norm": 2.65625, + "grad_norm_var": 0.341064453125, + "learning_rate": 0.0001, + "loss": 3.2089, + "loss/crossentropy": 2.4134485125541687, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.28444976508617403, + "loss/reg": 0.0, + "step": 12720 + }, + { + "epoch": 0.08375, + "grad_norm": 2.40625, + "grad_norm_var": 0.13637593587239583, + "learning_rate": 0.0001, + "loss": 3.2345, + "loss/crossentropy": 2.4403869032859804, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.26378336995840074, + "loss/reg": 0.0, + "step": 12730 + }, + { + "epoch": 0.0838157894736842, + "grad_norm": 2.34375, + "grad_norm_var": 0.0974273681640625, + "learning_rate": 0.0001, + "loss": 3.1501, + "loss/crossentropy": 2.1977667093276976, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.23563524186611176, + "loss/reg": 0.0, + "step": 12740 + }, + { + "epoch": 0.08388157894736842, + "grad_norm": 3.203125, + "grad_norm_var": 0.12566731770833334, + "learning_rate": 0.0001, + "loss": 3.2516, + "loss/crossentropy": 2.206720507144928, + "loss/hidden": 3.0453125, + "loss/incoh": 0.0, + "loss/logits": 0.2731472015380859, + "loss/reg": 0.0, + "step": 12750 + }, + { + "epoch": 0.08394736842105263, + "grad_norm": 3.859375, + "grad_norm_var": 0.21768290201822918, + "learning_rate": 0.0001, + "loss": 3.2035, + "loss/crossentropy": 2.3951833486557006, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.26070789247751236, + "loss/reg": 0.0, + "step": 12760 + }, + { + "epoch": 0.08401315789473685, + "grad_norm": 2.140625, + "grad_norm_var": 0.36861572265625, + "learning_rate": 0.0001, + "loss": 3.3054, + "loss/crossentropy": 2.393438732624054, + "loss/hidden": 3.2328125, + "loss/incoh": 0.0, + "loss/logits": 0.3618380635976791, + "loss/reg": 0.0, + "step": 12770 + }, + { + "epoch": 0.08407894736842106, + "grad_norm": 2.28125, + "grad_norm_var": 0.24163004557291667, + "learning_rate": 0.0001, + "loss": 3.2972, + "loss/crossentropy": 2.510524129867554, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.26494802087545394, + "loss/reg": 0.0, + "step": 12780 + }, + { + "epoch": 0.08414473684210526, + "grad_norm": 2.640625, + "grad_norm_var": 0.07789713541666667, + "learning_rate": 0.0001, + "loss": 3.2375, + "loss/crossentropy": 2.326508915424347, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.25067940801382066, + "loss/reg": 0.0, + "step": 12790 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 2.625, + "grad_norm_var": 0.0654693603515625, + "learning_rate": 0.0001, + "loss": 3.1664, + "loss/crossentropy": 2.2524615049362184, + "loss/hidden": 2.9640625, + "loss/incoh": 0.0, + "loss/logits": 0.23585905730724335, + "loss/reg": 0.0, + "step": 12800 + }, + { + "epoch": 0.08427631578947369, + "grad_norm": 2.296875, + "grad_norm_var": 0.05461324055989583, + "learning_rate": 0.0001, + "loss": 3.2086, + "loss/crossentropy": 2.586345672607422, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.2775671869516373, + "loss/reg": 0.0, + "step": 12810 + }, + { + "epoch": 0.0843421052631579, + "grad_norm": 2.296875, + "grad_norm_var": 0.157568359375, + "learning_rate": 0.0001, + "loss": 3.2467, + "loss/crossentropy": 2.4124717354774474, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.27734949439764023, + "loss/reg": 0.0, + "step": 12820 + }, + { + "epoch": 0.0844078947368421, + "grad_norm": 2.34375, + "grad_norm_var": 0.151953125, + "learning_rate": 0.0001, + "loss": 3.1838, + "loss/crossentropy": 2.217060422897339, + "loss/hidden": 2.940625, + "loss/incoh": 0.0, + "loss/logits": 0.31069841980934143, + "loss/reg": 0.0, + "step": 12830 + }, + { + "epoch": 0.08447368421052631, + "grad_norm": 2.46875, + "grad_norm_var": 0.15916341145833332, + "learning_rate": 0.0001, + "loss": 3.251, + "loss/crossentropy": 2.2915706515312193, + "loss/hidden": 3.128125, + "loss/incoh": 0.0, + "loss/logits": 0.28919376283884046, + "loss/reg": 0.0, + "step": 12840 + }, + { + "epoch": 0.08453947368421053, + "grad_norm": 2.8125, + "grad_norm_var": 0.05465494791666667, + "learning_rate": 0.0001, + "loss": 3.1509, + "loss/crossentropy": 2.3436198830604553, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.2566779345273972, + "loss/reg": 0.0, + "step": 12850 + }, + { + "epoch": 0.08460526315789474, + "grad_norm": 2.796875, + "grad_norm_var": 0.12815653483072917, + "learning_rate": 0.0001, + "loss": 3.1553, + "loss/crossentropy": 2.6007506489753722, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.27069382518529894, + "loss/reg": 0.0, + "step": 12860 + }, + { + "epoch": 0.08467105263157895, + "grad_norm": 3.125, + "grad_norm_var": 0.09325764973958334, + "learning_rate": 0.0001, + "loss": 3.1848, + "loss/crossentropy": 2.3489827513694763, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.2542911395430565, + "loss/reg": 0.0, + "step": 12870 + }, + { + "epoch": 0.08473684210526315, + "grad_norm": 2.359375, + "grad_norm_var": 0.2053375244140625, + "learning_rate": 0.0001, + "loss": 3.2528, + "loss/crossentropy": 2.3320749402046204, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.25249498784542085, + "loss/reg": 0.0, + "step": 12880 + }, + { + "epoch": 0.08480263157894737, + "grad_norm": 2.375, + "grad_norm_var": 0.22139383951822916, + "learning_rate": 0.0001, + "loss": 3.2744, + "loss/crossentropy": 2.5565970659255983, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.30419613122940065, + "loss/reg": 0.0, + "step": 12890 + }, + { + "epoch": 0.08486842105263158, + "grad_norm": 2.390625, + "grad_norm_var": 0.12555338541666666, + "learning_rate": 0.0001, + "loss": 3.2548, + "loss/crossentropy": 2.382706320285797, + "loss/hidden": 3.0875, + "loss/incoh": 0.0, + "loss/logits": 0.28827311396598815, + "loss/reg": 0.0, + "step": 12900 + }, + { + "epoch": 0.08493421052631579, + "grad_norm": 2.421875, + "grad_norm_var": 0.031119791666666667, + "learning_rate": 0.0001, + "loss": 3.1646, + "loss/crossentropy": 2.209449625015259, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.27490794360637666, + "loss/reg": 0.0, + "step": 12910 + }, + { + "epoch": 0.085, + "grad_norm": 2.46875, + "grad_norm_var": 0.075537109375, + "learning_rate": 0.0001, + "loss": 3.1151, + "loss/crossentropy": 2.42991498708725, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.24879284277558328, + "loss/reg": 0.0, + "step": 12920 + }, + { + "epoch": 0.0850657894736842, + "grad_norm": 2.21875, + "grad_norm_var": 0.06396865844726562, + "learning_rate": 0.0001, + "loss": 3.1176, + "loss/crossentropy": 2.124844658374786, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.26795649230480195, + "loss/reg": 0.0, + "step": 12930 + }, + { + "epoch": 0.08513157894736842, + "grad_norm": 3.046875, + "grad_norm_var": 0.0884844462076823, + "learning_rate": 0.0001, + "loss": 3.1416, + "loss/crossentropy": 2.390605056285858, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.2603973612189293, + "loss/reg": 0.0, + "step": 12940 + }, + { + "epoch": 0.08519736842105263, + "grad_norm": 2.15625, + "grad_norm_var": 0.11295572916666667, + "learning_rate": 0.0001, + "loss": 3.1506, + "loss/crossentropy": 2.3307228684425354, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.23968299478292465, + "loss/reg": 0.0, + "step": 12950 + }, + { + "epoch": 0.08526315789473685, + "grad_norm": 2.375, + "grad_norm_var": 0.07505594889322917, + "learning_rate": 0.0001, + "loss": 3.1409, + "loss/crossentropy": 2.32118815779686, + "loss/hidden": 2.965625, + "loss/incoh": 0.0, + "loss/logits": 0.27437605410814286, + "loss/reg": 0.0, + "step": 12960 + }, + { + "epoch": 0.08532894736842105, + "grad_norm": 2.8125, + "grad_norm_var": 0.05011393229166667, + "learning_rate": 0.0001, + "loss": 3.1943, + "loss/crossentropy": 2.1953859329223633, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.2964278385043144, + "loss/reg": 0.0, + "step": 12970 + }, + { + "epoch": 0.08539473684210526, + "grad_norm": 2.265625, + "grad_norm_var": 0.044733683268229164, + "learning_rate": 0.0001, + "loss": 3.1596, + "loss/crossentropy": 2.48675742149353, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.2866469621658325, + "loss/reg": 0.0, + "step": 12980 + }, + { + "epoch": 0.08546052631578947, + "grad_norm": 2.890625, + "grad_norm_var": 0.04439188639322917, + "learning_rate": 0.0001, + "loss": 3.1955, + "loss/crossentropy": 2.3276284098625184, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.254511134326458, + "loss/reg": 0.0, + "step": 12990 + }, + { + "epoch": 0.08552631578947369, + "grad_norm": 2.125, + "grad_norm_var": 0.18092041015625, + "learning_rate": 0.0001, + "loss": 3.3308, + "loss/crossentropy": 2.436275231838226, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.27356108725070954, + "loss/reg": 0.0, + "step": 13000 + }, + { + "epoch": 0.0855921052631579, + "grad_norm": 2.25, + "grad_norm_var": 0.17511393229166666, + "learning_rate": 0.0001, + "loss": 3.1302, + "loss/crossentropy": 2.2856626510620117, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.24442512094974517, + "loss/reg": 0.0, + "step": 13010 + }, + { + "epoch": 0.0856578947368421, + "grad_norm": 2.53125, + "grad_norm_var": 0.10161844889322917, + "learning_rate": 0.0001, + "loss": 3.1923, + "loss/crossentropy": 2.4494728326797484, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.2575364217162132, + "loss/reg": 0.0, + "step": 13020 + }, + { + "epoch": 0.08572368421052631, + "grad_norm": 2.34375, + "grad_norm_var": 0.09909566243489583, + "learning_rate": 0.0001, + "loss": 3.1732, + "loss/crossentropy": 2.4833264112472535, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.2727135464549065, + "loss/reg": 0.0, + "step": 13030 + }, + { + "epoch": 0.08578947368421053, + "grad_norm": 3.203125, + "grad_norm_var": 0.12711588541666666, + "learning_rate": 0.0001, + "loss": 3.165, + "loss/crossentropy": 2.284275805950165, + "loss/hidden": 3.0859375, + "loss/incoh": 0.0, + "loss/logits": 0.29309146106243134, + "loss/reg": 0.0, + "step": 13040 + }, + { + "epoch": 0.08585526315789474, + "grad_norm": 2.28125, + "grad_norm_var": 0.18906148274739584, + "learning_rate": 0.0001, + "loss": 3.1988, + "loss/crossentropy": 2.1231042385101317, + "loss/hidden": 3.0375, + "loss/incoh": 0.0, + "loss/logits": 0.246621835231781, + "loss/reg": 0.0, + "step": 13050 + }, + { + "epoch": 0.08592105263157895, + "grad_norm": 2.234375, + "grad_norm_var": 0.14317118326822917, + "learning_rate": 0.0001, + "loss": 3.2204, + "loss/crossentropy": 2.115370142459869, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.22535212635993956, + "loss/reg": 0.0, + "step": 13060 + }, + { + "epoch": 0.08598684210526315, + "grad_norm": 2.234375, + "grad_norm_var": 0.17832743326822917, + "learning_rate": 0.0001, + "loss": 3.1988, + "loss/crossentropy": 2.271882343292236, + "loss/hidden": 3.0140625, + "loss/incoh": 0.0, + "loss/logits": 0.2518279105424881, + "loss/reg": 0.0, + "step": 13070 + }, + { + "epoch": 0.08605263157894737, + "grad_norm": 1.90625, + "grad_norm_var": 0.1100250244140625, + "learning_rate": 0.0001, + "loss": 3.2672, + "loss/crossentropy": 2.211618059873581, + "loss/hidden": 3.015625, + "loss/incoh": 0.0, + "loss/logits": 0.2709794193506241, + "loss/reg": 0.0, + "step": 13080 + }, + { + "epoch": 0.08611842105263158, + "grad_norm": 2.15625, + "grad_norm_var": 0.06676432291666666, + "learning_rate": 0.0001, + "loss": 3.2393, + "loss/crossentropy": 2.4678762197494506, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.2634593158960342, + "loss/reg": 0.0, + "step": 13090 + }, + { + "epoch": 0.0861842105263158, + "grad_norm": 2.484375, + "grad_norm_var": 0.030402628580729167, + "learning_rate": 0.0001, + "loss": 3.19, + "loss/crossentropy": 2.5426036715507507, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.26041875034570694, + "loss/reg": 0.0, + "step": 13100 + }, + { + "epoch": 0.08625, + "grad_norm": 2.34375, + "grad_norm_var": 0.028034464518229166, + "learning_rate": 0.0001, + "loss": 3.1556, + "loss/crossentropy": 2.1748136937618257, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.3230336934328079, + "loss/reg": 0.0, + "step": 13110 + }, + { + "epoch": 0.0863157894736842, + "grad_norm": 2.453125, + "grad_norm_var": 0.3787750244140625, + "learning_rate": 0.0001, + "loss": 3.2653, + "loss/crossentropy": 2.352058470249176, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.3100366100668907, + "loss/reg": 0.0, + "step": 13120 + }, + { + "epoch": 0.08638157894736842, + "grad_norm": 2.65625, + "grad_norm_var": 0.12065327962239583, + "learning_rate": 0.0001, + "loss": 3.2897, + "loss/crossentropy": 2.2920926332473757, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.24493622779846191, + "loss/reg": 0.0, + "step": 13130 + }, + { + "epoch": 0.08644736842105263, + "grad_norm": 3.140625, + "grad_norm_var": 0.17266337076822916, + "learning_rate": 0.0001, + "loss": 3.2541, + "loss/crossentropy": 2.253483748435974, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.24110897034406661, + "loss/reg": 0.0, + "step": 13140 + }, + { + "epoch": 0.08651315789473685, + "grad_norm": 3.140625, + "grad_norm_var": 0.15883687337239583, + "learning_rate": 0.0001, + "loss": 3.2161, + "loss/crossentropy": 2.490678381919861, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.2905942976474762, + "loss/reg": 0.0, + "step": 13150 + }, + { + "epoch": 0.08657894736842105, + "grad_norm": 2.671875, + "grad_norm_var": 0.124365234375, + "learning_rate": 0.0001, + "loss": 3.2037, + "loss/crossentropy": 2.607399010658264, + "loss/hidden": 3.0484375, + "loss/incoh": 0.0, + "loss/logits": 0.3426252081990242, + "loss/reg": 0.0, + "step": 13160 + }, + { + "epoch": 0.08664473684210526, + "grad_norm": 2.578125, + "grad_norm_var": 0.20832697550455728, + "learning_rate": 0.0001, + "loss": 3.3389, + "loss/crossentropy": 2.5389102935791015, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.27587536424398423, + "loss/reg": 0.0, + "step": 13170 + }, + { + "epoch": 0.08671052631578947, + "grad_norm": 2.171875, + "grad_norm_var": 3.749950368844328e+17, + "learning_rate": 0.0001, + "loss": 3.3435, + "loss/crossentropy": 2.2043145060539246, + "loss/hidden": 3.0921875, + "loss/incoh": 0.0, + "loss/logits": 0.27380194514989853, + "loss/reg": 0.0, + "step": 13180 + }, + { + "epoch": 0.08677631578947369, + "grad_norm": 3.203125, + "grad_norm_var": 0.10250651041666667, + "learning_rate": 0.0001, + "loss": 3.2179, + "loss/crossentropy": 2.5562551975250245, + "loss/hidden": 3.125, + "loss/incoh": 0.0, + "loss/logits": 0.35649020969867706, + "loss/reg": 0.0, + "step": 13190 + }, + { + "epoch": 0.0868421052631579, + "grad_norm": 2.046875, + "grad_norm_var": 0.11367899576822917, + "learning_rate": 0.0001, + "loss": 3.1639, + "loss/crossentropy": 2.2194557189941406, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.2623781323432922, + "loss/reg": 0.0, + "step": 13200 + }, + { + "epoch": 0.0869078947368421, + "grad_norm": 2.59375, + "grad_norm_var": 0.09927978515625, + "learning_rate": 0.0001, + "loss": 3.203, + "loss/crossentropy": 2.3938650250434876, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.22924545854330064, + "loss/reg": 0.0, + "step": 13210 + }, + { + "epoch": 0.08697368421052631, + "grad_norm": 2.328125, + "grad_norm_var": 0.07858072916666667, + "learning_rate": 0.0001, + "loss": 3.1885, + "loss/crossentropy": 2.659112477302551, + "loss/hidden": 3.2625, + "loss/incoh": 0.0, + "loss/logits": 0.3096113160252571, + "loss/reg": 0.0, + "step": 13220 + }, + { + "epoch": 0.08703947368421053, + "grad_norm": 2.140625, + "grad_norm_var": 0.08105061848958334, + "learning_rate": 0.0001, + "loss": 3.1608, + "loss/crossentropy": 2.2919702410697935, + "loss/hidden": 3.0671875, + "loss/incoh": 0.0, + "loss/logits": 0.28502654284238815, + "loss/reg": 0.0, + "step": 13230 + }, + { + "epoch": 0.08710526315789474, + "grad_norm": 2.421875, + "grad_norm_var": 0.2513631184895833, + "learning_rate": 0.0001, + "loss": 3.1995, + "loss/crossentropy": 2.206997013092041, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.20357893258333207, + "loss/reg": 0.0, + "step": 13240 + }, + { + "epoch": 0.08717105263157894, + "grad_norm": 2.1875, + "grad_norm_var": 0.06531575520833334, + "learning_rate": 0.0001, + "loss": 3.1267, + "loss/crossentropy": 2.095241755247116, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.23652465790510177, + "loss/reg": 0.0, + "step": 13250 + }, + { + "epoch": 0.08723684210526315, + "grad_norm": 2.328125, + "grad_norm_var": 0.110009765625, + "learning_rate": 0.0001, + "loss": 3.1799, + "loss/crossentropy": 2.1254674077033995, + "loss/hidden": 2.6765625, + "loss/incoh": 0.0, + "loss/logits": 0.22684407681226731, + "loss/reg": 0.0, + "step": 13260 + }, + { + "epoch": 0.08730263157894737, + "grad_norm": 2.5625, + "grad_norm_var": 3.398986257643471e+17, + "learning_rate": 0.0001, + "loss": 3.358, + "loss/crossentropy": 2.3295519828796385, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.30277568846940994, + "loss/reg": 0.0, + "step": 13270 + }, + { + "epoch": 0.08736842105263158, + "grad_norm": 2.359375, + "grad_norm_var": 3.3989862579380115e+17, + "learning_rate": 0.0001, + "loss": 3.3036, + "loss/crossentropy": 2.496494376659393, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.2583273336291313, + "loss/reg": 0.0, + "step": 13280 + }, + { + "epoch": 0.0874342105263158, + "grad_norm": 2.640625, + "grad_norm_var": 0.0663238525390625, + "learning_rate": 0.0001, + "loss": 3.2025, + "loss/crossentropy": 2.332583689689636, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.24232984483242034, + "loss/reg": 0.0, + "step": 13290 + }, + { + "epoch": 0.0875, + "grad_norm": 2.25, + "grad_norm_var": 0.43166910807291664, + "learning_rate": 0.0001, + "loss": 3.1339, + "loss/crossentropy": 2.394269013404846, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.24607907682657243, + "loss/reg": 0.0, + "step": 13300 + }, + { + "epoch": 0.08756578947368421, + "grad_norm": 2.40625, + "grad_norm_var": 0.112890625, + "learning_rate": 0.0001, + "loss": 3.1815, + "loss/crossentropy": 2.2034417927265166, + "loss/hidden": 3.0546875, + "loss/incoh": 0.0, + "loss/logits": 0.25049934834241866, + "loss/reg": 0.0, + "step": 13310 + }, + { + "epoch": 0.08763157894736842, + "grad_norm": 2.6875, + "grad_norm_var": 0.0748443603515625, + "learning_rate": 0.0001, + "loss": 3.2491, + "loss/crossentropy": 2.058911919593811, + "loss/hidden": 3.1359375, + "loss/incoh": 0.0, + "loss/logits": 0.25788910537958143, + "loss/reg": 0.0, + "step": 13320 + }, + { + "epoch": 0.08769736842105263, + "grad_norm": 3.046875, + "grad_norm_var": 0.22857666015625, + "learning_rate": 0.0001, + "loss": 3.2051, + "loss/crossentropy": 2.3308457016944883, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.3006305813789368, + "loss/reg": 0.0, + "step": 13330 + }, + { + "epoch": 0.08776315789473685, + "grad_norm": 2.46875, + "grad_norm_var": 0.266796875, + "learning_rate": 0.0001, + "loss": 3.1792, + "loss/crossentropy": 2.392533528804779, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.25690300911664965, + "loss/reg": 0.0, + "step": 13340 + }, + { + "epoch": 0.08782894736842105, + "grad_norm": 2.015625, + "grad_norm_var": 0.24251302083333334, + "learning_rate": 0.0001, + "loss": 3.1342, + "loss/crossentropy": 2.303742027282715, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.24467559903860092, + "loss/reg": 0.0, + "step": 13350 + }, + { + "epoch": 0.08789473684210526, + "grad_norm": 3.21875, + "grad_norm_var": 0.26910400390625, + "learning_rate": 0.0001, + "loss": 3.2614, + "loss/crossentropy": 2.5700215101242065, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.346772675216198, + "loss/reg": 0.0, + "step": 13360 + }, + { + "epoch": 0.08796052631578948, + "grad_norm": 2.234375, + "grad_norm_var": 8.30354715983073, + "learning_rate": 0.0001, + "loss": 3.1987, + "loss/crossentropy": 2.3953630328178406, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.2601369693875313, + "loss/reg": 0.0, + "step": 13370 + }, + { + "epoch": 0.08802631578947369, + "grad_norm": 3.671875, + "grad_norm_var": 3.5722076416015627, + "learning_rate": 0.0001, + "loss": 3.2285, + "loss/crossentropy": 2.540582847595215, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.274001345038414, + "loss/reg": 0.0, + "step": 13380 + }, + { + "epoch": 0.08809210526315789, + "grad_norm": 2.359375, + "grad_norm_var": 0.2377593994140625, + "learning_rate": 0.0001, + "loss": 3.1787, + "loss/crossentropy": 2.3221506476402283, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.2874672919511795, + "loss/reg": 0.0, + "step": 13390 + }, + { + "epoch": 0.0881578947368421, + "grad_norm": 2.15625, + "grad_norm_var": 1.4219309488932292, + "learning_rate": 0.0001, + "loss": 3.1862, + "loss/crossentropy": 2.255697971582413, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.25251765847206115, + "loss/reg": 0.0, + "step": 13400 + }, + { + "epoch": 0.08822368421052632, + "grad_norm": 2.28125, + "grad_norm_var": 0.023307291666666667, + "learning_rate": 0.0001, + "loss": 3.1642, + "loss/crossentropy": 2.376703941822052, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.2795195817947388, + "loss/reg": 0.0, + "step": 13410 + }, + { + "epoch": 0.08828947368421053, + "grad_norm": 2.359375, + "grad_norm_var": 0.32665608723958334, + "learning_rate": 0.0001, + "loss": 3.1802, + "loss/crossentropy": 2.390151119232178, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.255537736415863, + "loss/reg": 0.0, + "step": 13420 + }, + { + "epoch": 0.08835526315789474, + "grad_norm": 2.203125, + "grad_norm_var": 0.33414306640625, + "learning_rate": 0.0001, + "loss": 3.1779, + "loss/crossentropy": 2.2273300528526305, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.2409852236509323, + "loss/reg": 0.0, + "step": 13430 + }, + { + "epoch": 0.08842105263157894, + "grad_norm": 25.75, + "grad_norm_var": 34.02027587890625, + "learning_rate": 0.0001, + "loss": 3.1974, + "loss/crossentropy": 2.281948208808899, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.2378145158290863, + "loss/reg": 0.0, + "step": 13440 + }, + { + "epoch": 0.08848684210526316, + "grad_norm": 2.75, + "grad_norm_var": 34.235252888997394, + "learning_rate": 0.0001, + "loss": 3.3736, + "loss/crossentropy": 2.267159104347229, + "loss/hidden": 3.1296875, + "loss/incoh": 0.0, + "loss/logits": 0.2702139914035797, + "loss/reg": 0.0, + "step": 13450 + }, + { + "epoch": 0.08855263157894737, + "grad_norm": 2.703125, + "grad_norm_var": 1.1851145426432292, + "learning_rate": 0.0001, + "loss": 3.16, + "loss/crossentropy": 2.3185499548912047, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.2232184700667858, + "loss/reg": 0.0, + "step": 13460 + }, + { + "epoch": 0.08861842105263158, + "grad_norm": 2.28125, + "grad_norm_var": 0.07108968098958333, + "learning_rate": 0.0001, + "loss": 3.2029, + "loss/crossentropy": 2.1752323627471926, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.3101599723100662, + "loss/reg": 0.0, + "step": 13470 + }, + { + "epoch": 0.0886842105263158, + "grad_norm": 2.21875, + "grad_norm_var": 0.11365559895833334, + "learning_rate": 0.0001, + "loss": 3.2671, + "loss/crossentropy": 2.255975532531738, + "loss/hidden": 3.125, + "loss/incoh": 0.0, + "loss/logits": 0.30118285566568376, + "loss/reg": 0.0, + "step": 13480 + }, + { + "epoch": 0.08875, + "grad_norm": 2.4375, + "grad_norm_var": 0.06363525390625, + "learning_rate": 0.0001, + "loss": 3.1907, + "loss/crossentropy": 2.2357093393802643, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.23682421892881395, + "loss/reg": 0.0, + "step": 13490 + }, + { + "epoch": 0.08881578947368421, + "grad_norm": 2.75, + "grad_norm_var": 0.05452372233072917, + "learning_rate": 0.0001, + "loss": 3.1246, + "loss/crossentropy": 2.3712179183959963, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.2336268275976181, + "loss/reg": 0.0, + "step": 13500 + }, + { + "epoch": 0.08888157894736842, + "grad_norm": 2.40625, + "grad_norm_var": 0.13904520670572917, + "learning_rate": 0.0001, + "loss": 3.1879, + "loss/crossentropy": 2.407870662212372, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.28650608360767366, + "loss/reg": 0.0, + "step": 13510 + }, + { + "epoch": 0.08894736842105264, + "grad_norm": 2.359375, + "grad_norm_var": 0.009284464518229167, + "learning_rate": 0.0001, + "loss": 3.1881, + "loss/crossentropy": 2.4734713077545165, + "loss/hidden": 3.071875, + "loss/incoh": 0.0, + "loss/logits": 0.30961792171001434, + "loss/reg": 0.0, + "step": 13520 + }, + { + "epoch": 0.08901315789473684, + "grad_norm": 2.796875, + "grad_norm_var": 0.03455301920572917, + "learning_rate": 0.0001, + "loss": 3.1521, + "loss/crossentropy": 2.1405319690704347, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.25160129070281984, + "loss/reg": 0.0, + "step": 13530 + }, + { + "epoch": 0.08907894736842105, + "grad_norm": 2.40625, + "grad_norm_var": 0.13059488932291666, + "learning_rate": 0.0001, + "loss": 3.2623, + "loss/crossentropy": 2.5234264612197874, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.2405148908495903, + "loss/reg": 0.0, + "step": 13540 + }, + { + "epoch": 0.08914473684210526, + "grad_norm": 3.109375, + "grad_norm_var": 0.14059244791666667, + "learning_rate": 0.0001, + "loss": 3.1637, + "loss/crossentropy": 2.4861895561218263, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.2647073075175285, + "loss/reg": 0.0, + "step": 13550 + }, + { + "epoch": 0.08921052631578948, + "grad_norm": 2.09375, + "grad_norm_var": 0.08899332682291666, + "learning_rate": 0.0001, + "loss": 3.2354, + "loss/crossentropy": 2.241242003440857, + "loss/hidden": 2.971875, + "loss/incoh": 0.0, + "loss/logits": 0.25270578265190125, + "loss/reg": 0.0, + "step": 13560 + }, + { + "epoch": 0.08927631578947369, + "grad_norm": 2.703125, + "grad_norm_var": 0.15600484212239582, + "learning_rate": 0.0001, + "loss": 3.2692, + "loss/crossentropy": 2.5087321639060973, + "loss/hidden": 3.1078125, + "loss/incoh": 0.0, + "loss/logits": 0.2870207831263542, + "loss/reg": 0.0, + "step": 13570 + }, + { + "epoch": 0.08934210526315789, + "grad_norm": 1.9140625, + "grad_norm_var": 0.09006322224934896, + "learning_rate": 0.0001, + "loss": 3.2167, + "loss/crossentropy": 2.519565200805664, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.2802609995007515, + "loss/reg": 0.0, + "step": 13580 + }, + { + "epoch": 0.0894078947368421, + "grad_norm": 2.859375, + "grad_norm_var": 0.08097508748372396, + "learning_rate": 0.0001, + "loss": 3.2167, + "loss/crossentropy": 2.6469456434249876, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.2738121926784515, + "loss/reg": 0.0, + "step": 13590 + }, + { + "epoch": 0.08947368421052632, + "grad_norm": 4.34375, + "grad_norm_var": 0.28693033854166666, + "learning_rate": 0.0001, + "loss": 3.0727, + "loss/crossentropy": 2.4280938267707826, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.24971676170825957, + "loss/reg": 0.0, + "step": 13600 + }, + { + "epoch": 0.08953947368421053, + "grad_norm": 2.171875, + "grad_norm_var": 0.28564046223958334, + "learning_rate": 0.0001, + "loss": 3.2647, + "loss/crossentropy": 2.331229364871979, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.25523588955402376, + "loss/reg": 0.0, + "step": 13610 + }, + { + "epoch": 0.08960526315789474, + "grad_norm": 2.03125, + "grad_norm_var": 0.1018463134765625, + "learning_rate": 0.0001, + "loss": 3.141, + "loss/crossentropy": 2.248876082897186, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.27915321439504626, + "loss/reg": 0.0, + "step": 13620 + }, + { + "epoch": 0.08967105263157894, + "grad_norm": 2.484375, + "grad_norm_var": 0.05377604166666667, + "learning_rate": 0.0001, + "loss": 3.1282, + "loss/crossentropy": 2.475773072242737, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.26457071453332903, + "loss/reg": 0.0, + "step": 13630 + }, + { + "epoch": 0.08973684210526316, + "grad_norm": 2.40625, + "grad_norm_var": 0.09758707682291666, + "learning_rate": 0.0001, + "loss": 3.1607, + "loss/crossentropy": 2.3480275869369507, + "loss/hidden": 2.9828125, + "loss/incoh": 0.0, + "loss/logits": 0.2902192771434784, + "loss/reg": 0.0, + "step": 13640 + }, + { + "epoch": 0.08980263157894737, + "grad_norm": 2.3125, + "grad_norm_var": 0.1151519775390625, + "learning_rate": 0.0001, + "loss": 3.1651, + "loss/crossentropy": 2.362102711200714, + "loss/hidden": 2.96875, + "loss/incoh": 0.0, + "loss/logits": 0.24841166138648987, + "loss/reg": 0.0, + "step": 13650 + }, + { + "epoch": 0.08986842105263158, + "grad_norm": 2.296875, + "grad_norm_var": 0.0971832275390625, + "learning_rate": 0.0001, + "loss": 3.1725, + "loss/crossentropy": 2.134939956665039, + "loss/hidden": 2.934375, + "loss/incoh": 0.0, + "loss/logits": 0.2594160199165344, + "loss/reg": 0.0, + "step": 13660 + }, + { + "epoch": 0.08993421052631578, + "grad_norm": 2.296875, + "grad_norm_var": 0.09054361979166667, + "learning_rate": 0.0001, + "loss": 3.0604, + "loss/crossentropy": 2.22348096370697, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.2797392845153809, + "loss/reg": 0.0, + "step": 13670 + }, + { + "epoch": 0.09, + "grad_norm": 2.65625, + "grad_norm_var": 0.0264556884765625, + "learning_rate": 0.0001, + "loss": 3.165, + "loss/crossentropy": 2.321420121192932, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.25273988842964173, + "loss/reg": 0.0, + "step": 13680 + }, + { + "epoch": 0.09006578947368421, + "grad_norm": 2.1875, + "grad_norm_var": 0.0448394775390625, + "learning_rate": 0.0001, + "loss": 3.2251, + "loss/crossentropy": 2.5713982224464416, + "loss/hidden": 3.1890625, + "loss/incoh": 0.0, + "loss/logits": 0.3743001103401184, + "loss/reg": 0.0, + "step": 13690 + }, + { + "epoch": 0.09013157894736842, + "grad_norm": 2.34375, + "grad_norm_var": 0.05821024576822917, + "learning_rate": 0.0001, + "loss": 3.1174, + "loss/crossentropy": 2.433469843864441, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.29491184949874877, + "loss/reg": 0.0, + "step": 13700 + }, + { + "epoch": 0.09019736842105264, + "grad_norm": 2.328125, + "grad_norm_var": 0.06334228515625, + "learning_rate": 0.0001, + "loss": 3.1422, + "loss/crossentropy": 2.498783230781555, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.2630519106984138, + "loss/reg": 0.0, + "step": 13710 + }, + { + "epoch": 0.09026315789473684, + "grad_norm": 2.484375, + "grad_norm_var": 0.18053385416666667, + "learning_rate": 0.0001, + "loss": 3.2017, + "loss/crossentropy": 2.295622777938843, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.25550642013549807, + "loss/reg": 0.0, + "step": 13720 + }, + { + "epoch": 0.09032894736842105, + "grad_norm": 2.921875, + "grad_norm_var": 0.14754231770833334, + "learning_rate": 0.0001, + "loss": 3.2136, + "loss/crossentropy": 2.4960160851478577, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.29100794196128843, + "loss/reg": 0.0, + "step": 13730 + }, + { + "epoch": 0.09039473684210526, + "grad_norm": 2.5, + "grad_norm_var": 0.060888671875, + "learning_rate": 0.0001, + "loss": 3.182, + "loss/crossentropy": 2.543604516983032, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.26428850889205935, + "loss/reg": 0.0, + "step": 13740 + }, + { + "epoch": 0.09046052631578948, + "grad_norm": 2.59375, + "grad_norm_var": 0.1883697509765625, + "learning_rate": 0.0001, + "loss": 3.1144, + "loss/crossentropy": 2.495186424255371, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.24936733841896058, + "loss/reg": 0.0, + "step": 13750 + }, + { + "epoch": 0.09052631578947369, + "grad_norm": 2.515625, + "grad_norm_var": 0.17746480305989584, + "learning_rate": 0.0001, + "loss": 3.2332, + "loss/crossentropy": 2.5168472051620485, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.3214556619524956, + "loss/reg": 0.0, + "step": 13760 + }, + { + "epoch": 0.09059210526315789, + "grad_norm": 2.53125, + "grad_norm_var": 0.049853515625, + "learning_rate": 0.0001, + "loss": 3.1549, + "loss/crossentropy": 2.2676196336746215, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.28792900443077085, + "loss/reg": 0.0, + "step": 13770 + }, + { + "epoch": 0.0906578947368421, + "grad_norm": 2.890625, + "grad_norm_var": 0.50240478515625, + "learning_rate": 0.0001, + "loss": 3.1942, + "loss/crossentropy": 2.3297463774681093, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.20883744955062866, + "loss/reg": 0.0, + "step": 13780 + }, + { + "epoch": 0.09072368421052632, + "grad_norm": 2.546875, + "grad_norm_var": 0.4667154947916667, + "learning_rate": 0.0001, + "loss": 3.2349, + "loss/crossentropy": 2.1975256204605103, + "loss/hidden": 2.9546875, + "loss/incoh": 0.0, + "loss/logits": 0.2709361925721169, + "loss/reg": 0.0, + "step": 13790 + }, + { + "epoch": 0.09078947368421053, + "grad_norm": 2.84375, + "grad_norm_var": 0.03693745930989583, + "learning_rate": 0.0001, + "loss": 3.1167, + "loss/crossentropy": 2.3970743119716644, + "loss/hidden": 2.71875, + "loss/incoh": 0.0, + "loss/logits": 0.2433250866830349, + "loss/reg": 0.0, + "step": 13800 + }, + { + "epoch": 0.09085526315789473, + "grad_norm": 2.609375, + "grad_norm_var": 0.7566965738932292, + "learning_rate": 0.0001, + "loss": 3.2493, + "loss/crossentropy": 2.269898569583893, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.23498842120170593, + "loss/reg": 0.0, + "step": 13810 + }, + { + "epoch": 0.09092105263157894, + "grad_norm": 2.265625, + "grad_norm_var": 0.050389607747395836, + "learning_rate": 0.0001, + "loss": 3.1946, + "loss/crossentropy": 2.5624868392944338, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.36430184692144396, + "loss/reg": 0.0, + "step": 13820 + }, + { + "epoch": 0.09098684210526316, + "grad_norm": 2.640625, + "grad_norm_var": 0.06546122233072917, + "learning_rate": 0.0001, + "loss": 3.225, + "loss/crossentropy": 2.2082170367240908, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.2326130375266075, + "loss/reg": 0.0, + "step": 13830 + }, + { + "epoch": 0.09105263157894737, + "grad_norm": 3.046875, + "grad_norm_var": 0.1621002197265625, + "learning_rate": 0.0001, + "loss": 3.225, + "loss/crossentropy": 2.2408367514610292, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.25092404931783674, + "loss/reg": 0.0, + "step": 13840 + }, + { + "epoch": 0.09111842105263158, + "grad_norm": 2.515625, + "grad_norm_var": 0.18394775390625, + "learning_rate": 0.0001, + "loss": 3.1397, + "loss/crossentropy": 2.092372101545334, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.25034971833229064, + "loss/reg": 0.0, + "step": 13850 + }, + { + "epoch": 0.09118421052631578, + "grad_norm": 2.421875, + "grad_norm_var": 0.0753313700358073, + "learning_rate": 0.0001, + "loss": 3.1936, + "loss/crossentropy": 2.2277904510498048, + "loss/hidden": 3.075, + "loss/incoh": 0.0, + "loss/logits": 0.28555874079465865, + "loss/reg": 0.0, + "step": 13860 + }, + { + "epoch": 0.09125, + "grad_norm": 2.203125, + "grad_norm_var": 0.06499608357747395, + "learning_rate": 0.0001, + "loss": 3.1857, + "loss/crossentropy": 2.0974882781505584, + "loss/hidden": 2.7265625, + "loss/incoh": 0.0, + "loss/logits": 0.22869385927915573, + "loss/reg": 0.0, + "step": 13870 + }, + { + "epoch": 0.09131578947368421, + "grad_norm": 3.1875, + "grad_norm_var": 0.30895894368489585, + "learning_rate": 0.0001, + "loss": 3.1808, + "loss/crossentropy": 2.2242319107055666, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.24477246254682541, + "loss/reg": 0.0, + "step": 13880 + }, + { + "epoch": 0.09138157894736842, + "grad_norm": 2.546875, + "grad_norm_var": 0.30500895182291665, + "learning_rate": 0.0001, + "loss": 3.1587, + "loss/crossentropy": 2.2787875294685365, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.23667097985744476, + "loss/reg": 0.0, + "step": 13890 + }, + { + "epoch": 0.09144736842105264, + "grad_norm": 2.109375, + "grad_norm_var": 0.04114176432291667, + "learning_rate": 0.0001, + "loss": 3.1791, + "loss/crossentropy": 2.5015464782714845, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.26709298938512804, + "loss/reg": 0.0, + "step": 13900 + }, + { + "epoch": 0.09151315789473684, + "grad_norm": 1.9765625, + "grad_norm_var": 0.38769505818684896, + "learning_rate": 0.0001, + "loss": 3.1858, + "loss/crossentropy": 2.367018985748291, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.2575811371207237, + "loss/reg": 0.0, + "step": 13910 + }, + { + "epoch": 0.09157894736842105, + "grad_norm": 2.8125, + "grad_norm_var": 0.12981338500976564, + "learning_rate": 0.0001, + "loss": 3.2014, + "loss/crossentropy": 2.3600114941596986, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.2798996135592461, + "loss/reg": 0.0, + "step": 13920 + }, + { + "epoch": 0.09164473684210526, + "grad_norm": 2.40625, + "grad_norm_var": 0.5571940104166667, + "learning_rate": 0.0001, + "loss": 3.1415, + "loss/crossentropy": 2.07312273979187, + "loss/hidden": 3.0125, + "loss/incoh": 0.0, + "loss/logits": 0.27816066443920134, + "loss/reg": 0.0, + "step": 13930 + }, + { + "epoch": 0.09171052631578948, + "grad_norm": 2.234375, + "grad_norm_var": 0.16337483723958332, + "learning_rate": 0.0001, + "loss": 3.1846, + "loss/crossentropy": 2.3842572927474976, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.3044495239853859, + "loss/reg": 0.0, + "step": 13940 + }, + { + "epoch": 0.09177631578947368, + "grad_norm": 2.28125, + "grad_norm_var": 0.011617024739583334, + "learning_rate": 0.0001, + "loss": 3.2638, + "loss/crossentropy": 2.258384811878204, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.24848029464483262, + "loss/reg": 0.0, + "step": 13950 + }, + { + "epoch": 0.09184210526315789, + "grad_norm": 2.625, + "grad_norm_var": 0.08046875, + "learning_rate": 0.0001, + "loss": 3.2277, + "loss/crossentropy": 2.1830771923065186, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.282887265086174, + "loss/reg": 0.0, + "step": 13960 + }, + { + "epoch": 0.0919078947368421, + "grad_norm": 2.34375, + "grad_norm_var": 0.69127197265625, + "learning_rate": 0.0001, + "loss": 3.2795, + "loss/crossentropy": 2.5693406105041503, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.2618736609816551, + "loss/reg": 0.0, + "step": 13970 + }, + { + "epoch": 0.09197368421052632, + "grad_norm": 2.265625, + "grad_norm_var": 1.2373443603515626, + "learning_rate": 0.0001, + "loss": 3.163, + "loss/crossentropy": 2.496062994003296, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.27756498008966446, + "loss/reg": 0.0, + "step": 13980 + }, + { + "epoch": 0.09203947368421053, + "grad_norm": 2.375, + "grad_norm_var": 1.256787109375, + "learning_rate": 0.0001, + "loss": 3.2974, + "loss/crossentropy": 2.233779698610306, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.22414856255054474, + "loss/reg": 0.0, + "step": 13990 + }, + { + "epoch": 0.09210526315789473, + "grad_norm": 2.3125, + "grad_norm_var": 0.04104817708333333, + "learning_rate": 0.0001, + "loss": 3.1465, + "loss/crossentropy": 2.3837480187416076, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.29815254360437393, + "loss/reg": 0.0, + "step": 14000 + }, + { + "epoch": 0.09217105263157895, + "grad_norm": 2.296875, + "grad_norm_var": 0.03504130045572917, + "learning_rate": 0.0001, + "loss": 3.1933, + "loss/crossentropy": 2.4089162349700928, + "loss/hidden": 2.940625, + "loss/incoh": 0.0, + "loss/logits": 0.28769134283065795, + "loss/reg": 0.0, + "step": 14010 + }, + { + "epoch": 0.09223684210526316, + "grad_norm": 2.234375, + "grad_norm_var": 0.099560546875, + "learning_rate": 0.0001, + "loss": 3.2431, + "loss/crossentropy": 2.387049177289009, + "loss/hidden": 3.078125, + "loss/incoh": 0.0, + "loss/logits": 0.31867421939969065, + "loss/reg": 0.0, + "step": 14020 + }, + { + "epoch": 0.09230263157894737, + "grad_norm": 2.203125, + "grad_norm_var": 0.12417704264322917, + "learning_rate": 0.0001, + "loss": 3.0959, + "loss/crossentropy": 2.2009261429309843, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.2346379891037941, + "loss/reg": 0.0, + "step": 14030 + }, + { + "epoch": 0.09236842105263159, + "grad_norm": 2.484375, + "grad_norm_var": 0.0412750244140625, + "learning_rate": 0.0001, + "loss": 3.1679, + "loss/crossentropy": 2.379330587387085, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.23624493330717086, + "loss/reg": 0.0, + "step": 14040 + }, + { + "epoch": 0.09243421052631579, + "grad_norm": 2.515625, + "grad_norm_var": 0.25758056640625, + "learning_rate": 0.0001, + "loss": 3.1901, + "loss/crossentropy": 2.2017428398132326, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.2819879144430161, + "loss/reg": 0.0, + "step": 14050 + }, + { + "epoch": 0.0925, + "grad_norm": 2.59375, + "grad_norm_var": 1.2106597900390625, + "learning_rate": 0.0001, + "loss": 3.1627, + "loss/crossentropy": 2.053563690185547, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.24388336688280104, + "loss/reg": 0.0, + "step": 14060 + }, + { + "epoch": 0.09256578947368421, + "grad_norm": 1.953125, + "grad_norm_var": 0.0501617431640625, + "learning_rate": 0.0001, + "loss": 3.1122, + "loss/crossentropy": 2.5748242855072023, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.2592714115977287, + "loss/reg": 0.0, + "step": 14070 + }, + { + "epoch": 0.09263157894736843, + "grad_norm": 2.546875, + "grad_norm_var": 0.115869140625, + "learning_rate": 0.0001, + "loss": 3.2403, + "loss/crossentropy": 2.1434300899505616, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.23879239857196807, + "loss/reg": 0.0, + "step": 14080 + }, + { + "epoch": 0.09269736842105263, + "grad_norm": 2.53125, + "grad_norm_var": 0.08590087890625, + "learning_rate": 0.0001, + "loss": 3.187, + "loss/crossentropy": 2.078695094585419, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.29750160723924635, + "loss/reg": 0.0, + "step": 14090 + }, + { + "epoch": 0.09276315789473684, + "grad_norm": 3.078125, + "grad_norm_var": 0.06355692545572916, + "learning_rate": 0.0001, + "loss": 3.1426, + "loss/crossentropy": 2.4077930808067323, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2784098371863365, + "loss/reg": 0.0, + "step": 14100 + }, + { + "epoch": 0.09282894736842105, + "grad_norm": 2.421875, + "grad_norm_var": 0.28290913899739584, + "learning_rate": 0.0001, + "loss": 3.1999, + "loss/crossentropy": 2.551260459423065, + "loss/hidden": 2.98125, + "loss/incoh": 0.0, + "loss/logits": 0.30844325572252274, + "loss/reg": 0.0, + "step": 14110 + }, + { + "epoch": 0.09289473684210527, + "grad_norm": 2.890625, + "grad_norm_var": 0.28843994140625, + "learning_rate": 0.0001, + "loss": 3.1903, + "loss/crossentropy": 2.057539927959442, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.21393984705209732, + "loss/reg": 0.0, + "step": 14120 + }, + { + "epoch": 0.09296052631578948, + "grad_norm": 2.328125, + "grad_norm_var": 0.16035054524739584, + "learning_rate": 0.0001, + "loss": 3.2036, + "loss/crossentropy": 2.2166428923606873, + "loss/hidden": 3.009375, + "loss/incoh": 0.0, + "loss/logits": 0.28170192539691924, + "loss/reg": 0.0, + "step": 14130 + }, + { + "epoch": 0.09302631578947368, + "grad_norm": 2.328125, + "grad_norm_var": 0.04431864420572917, + "learning_rate": 0.0001, + "loss": 3.1706, + "loss/crossentropy": 2.5224907636642455, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.2656204476952553, + "loss/reg": 0.0, + "step": 14140 + }, + { + "epoch": 0.09309210526315789, + "grad_norm": 2.28125, + "grad_norm_var": 0.13594462076822916, + "learning_rate": 0.0001, + "loss": 3.2365, + "loss/crossentropy": 2.365175998210907, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.28030987083911896, + "loss/reg": 0.0, + "step": 14150 + }, + { + "epoch": 0.0931578947368421, + "grad_norm": 2.375, + "grad_norm_var": 3.4480684566712595e+17, + "learning_rate": 0.0001, + "loss": 3.3698, + "loss/crossentropy": 2.4446977496147158, + "loss/hidden": 4.159375, + "loss/incoh": 0.0, + "loss/logits": 0.35163595527410507, + "loss/reg": 0.0, + "step": 14160 + }, + { + "epoch": 0.09322368421052632, + "grad_norm": 2.46875, + "grad_norm_var": 3.4480684570076774e+17, + "learning_rate": 0.0001, + "loss": 3.1773, + "loss/crossentropy": 2.144097054004669, + "loss/hidden": 3.0171875, + "loss/incoh": 0.0, + "loss/logits": 0.2777150124311447, + "loss/reg": 0.0, + "step": 14170 + }, + { + "epoch": 0.09328947368421053, + "grad_norm": 2.484375, + "grad_norm_var": 0.1343902587890625, + "learning_rate": 0.0001, + "loss": 3.1593, + "loss/crossentropy": 2.5162782430648805, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.2642303377389908, + "loss/reg": 0.0, + "step": 14180 + }, + { + "epoch": 0.09335526315789473, + "grad_norm": 2.203125, + "grad_norm_var": 0.169921875, + "learning_rate": 0.0001, + "loss": 3.1927, + "loss/crossentropy": 2.1481271982192993, + "loss/hidden": 3.034375, + "loss/incoh": 0.0, + "loss/logits": 0.27440374791622163, + "loss/reg": 0.0, + "step": 14190 + }, + { + "epoch": 0.09342105263157895, + "grad_norm": 2.21875, + "grad_norm_var": 0.188330078125, + "learning_rate": 0.0001, + "loss": 3.2049, + "loss/crossentropy": 2.4672377467155457, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.25575320422649384, + "loss/reg": 0.0, + "step": 14200 + }, + { + "epoch": 0.09348684210526316, + "grad_norm": 2.421875, + "grad_norm_var": 0.07030843098958334, + "learning_rate": 0.0001, + "loss": 3.1256, + "loss/crossentropy": 2.4822991728782653, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.23255562111735345, + "loss/reg": 0.0, + "step": 14210 + }, + { + "epoch": 0.09355263157894737, + "grad_norm": 2.421875, + "grad_norm_var": 0.06575698852539062, + "learning_rate": 0.0001, + "loss": 3.1061, + "loss/crossentropy": 2.4455429315567017, + "loss/hidden": 2.7171875, + "loss/incoh": 0.0, + "loss/logits": 0.2238618478178978, + "loss/reg": 0.0, + "step": 14220 + }, + { + "epoch": 0.09361842105263157, + "grad_norm": 2.453125, + "grad_norm_var": 0.13621317545572917, + "learning_rate": 0.0001, + "loss": 3.1927, + "loss/crossentropy": 2.3609827399253844, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.3211557373404503, + "loss/reg": 0.0, + "step": 14230 + }, + { + "epoch": 0.09368421052631579, + "grad_norm": 2.3125, + "grad_norm_var": 0.1390777587890625, + "learning_rate": 0.0001, + "loss": 3.2222, + "loss/crossentropy": 2.4929265141487122, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.256229493021965, + "loss/reg": 0.0, + "step": 14240 + }, + { + "epoch": 0.09375, + "grad_norm": 2.28125, + "grad_norm_var": 0.014774576822916666, + "learning_rate": 0.0001, + "loss": 3.0973, + "loss/crossentropy": 2.396563506126404, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.3292677074670792, + "loss/reg": 0.0, + "step": 14250 + }, + { + "epoch": 0.09381578947368421, + "grad_norm": 3.046875, + "grad_norm_var": 3.801495474913411e+17, + "learning_rate": 0.0001, + "loss": 3.3636, + "loss/crossentropy": 2.1659668326377868, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.24123398140072821, + "loss/reg": 0.0, + "step": 14260 + }, + { + "epoch": 0.09388157894736843, + "grad_norm": 2.640625, + "grad_norm_var": 3.801495474059215e+17, + "learning_rate": 0.0001, + "loss": 3.2259, + "loss/crossentropy": 2.35439647436142, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.22794780433177947, + "loss/reg": 0.0, + "step": 14270 + }, + { + "epoch": 0.09394736842105263, + "grad_norm": 2.734375, + "grad_norm_var": 0.42942708333333335, + "learning_rate": 0.0001, + "loss": 3.1665, + "loss/crossentropy": 2.3001658797264097, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.26924325078725814, + "loss/reg": 0.0, + "step": 14280 + }, + { + "epoch": 0.09401315789473684, + "grad_norm": 2.34375, + "grad_norm_var": 0.0408843994140625, + "learning_rate": 0.0001, + "loss": 3.0518, + "loss/crossentropy": 2.3841129422187803, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.26079314202070236, + "loss/reg": 0.0, + "step": 14290 + }, + { + "epoch": 0.09407894736842105, + "grad_norm": 2.296875, + "grad_norm_var": 0.03784891764322917, + "learning_rate": 0.0001, + "loss": 3.1904, + "loss/crossentropy": 2.2710848689079284, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.23626017868518828, + "loss/reg": 0.0, + "step": 14300 + }, + { + "epoch": 0.09414473684210527, + "grad_norm": 2.921875, + "grad_norm_var": 0.11845296223958333, + "learning_rate": 0.0001, + "loss": 3.2715, + "loss/crossentropy": 2.4462394237518312, + "loss/hidden": 3.0515625, + "loss/incoh": 0.0, + "loss/logits": 0.3083674684166908, + "loss/reg": 0.0, + "step": 14310 + }, + { + "epoch": 0.09421052631578947, + "grad_norm": 2.609375, + "grad_norm_var": 0.115771484375, + "learning_rate": 0.0001, + "loss": 3.1137, + "loss/crossentropy": 2.4102493643760683, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.3285558119416237, + "loss/reg": 0.0, + "step": 14320 + }, + { + "epoch": 0.09427631578947368, + "grad_norm": 2.3125, + "grad_norm_var": 0.0410552978515625, + "learning_rate": 0.0001, + "loss": 3.1163, + "loss/crossentropy": 2.496846008300781, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.2681970477104187, + "loss/reg": 0.0, + "step": 14330 + }, + { + "epoch": 0.0943421052631579, + "grad_norm": 3.28125, + "grad_norm_var": 0.3074615478515625, + "learning_rate": 0.0001, + "loss": 3.2179, + "loss/crossentropy": 2.368880546092987, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.27040198296308515, + "loss/reg": 0.0, + "step": 14340 + }, + { + "epoch": 0.09440789473684211, + "grad_norm": 2.359375, + "grad_norm_var": 0.087841796875, + "learning_rate": 0.0001, + "loss": 3.1677, + "loss/crossentropy": 2.379721689224243, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.3262082427740097, + "loss/reg": 0.0, + "step": 14350 + }, + { + "epoch": 0.09447368421052632, + "grad_norm": 3.03125, + "grad_norm_var": 0.06515299479166667, + "learning_rate": 0.0001, + "loss": 3.1541, + "loss/crossentropy": 2.3577764987945558, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.28836628049612045, + "loss/reg": 0.0, + "step": 14360 + }, + { + "epoch": 0.09453947368421052, + "grad_norm": 2.40625, + "grad_norm_var": 0.09937744140625, + "learning_rate": 0.0001, + "loss": 3.1429, + "loss/crossentropy": 2.2929248332977297, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.2473236471414566, + "loss/reg": 0.0, + "step": 14370 + }, + { + "epoch": 0.09460526315789473, + "grad_norm": 2.90625, + "grad_norm_var": 0.056428019205729166, + "learning_rate": 0.0001, + "loss": 3.1331, + "loss/crossentropy": 2.11824688911438, + "loss/hidden": 2.59375, + "loss/incoh": 0.0, + "loss/logits": 0.19864632040262223, + "loss/reg": 0.0, + "step": 14380 + }, + { + "epoch": 0.09467105263157895, + "grad_norm": 2.5625, + "grad_norm_var": 0.2792154947916667, + "learning_rate": 0.0001, + "loss": 3.2558, + "loss/crossentropy": 2.4552414536476137, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.26964131295681, + "loss/reg": 0.0, + "step": 14390 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 2.828125, + "grad_norm_var": 0.1197418212890625, + "learning_rate": 0.0001, + "loss": 3.1627, + "loss/crossentropy": 2.4108232736587523, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.2856267586350441, + "loss/reg": 0.0, + "step": 14400 + }, + { + "epoch": 0.09480263157894737, + "grad_norm": 2.3125, + "grad_norm_var": 0.09917704264322917, + "learning_rate": 0.0001, + "loss": 3.2683, + "loss/crossentropy": 2.245331883430481, + "loss/hidden": 2.9640625, + "loss/incoh": 0.0, + "loss/logits": 0.26227104514837263, + "loss/reg": 0.0, + "step": 14410 + }, + { + "epoch": 0.09486842105263157, + "grad_norm": 2.71875, + "grad_norm_var": 0.47395426432291665, + "learning_rate": 0.0001, + "loss": 3.3242, + "loss/crossentropy": 2.5019222021102907, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.2487493023276329, + "loss/reg": 0.0, + "step": 14420 + }, + { + "epoch": 0.09493421052631579, + "grad_norm": 2.84375, + "grad_norm_var": 0.8088053385416667, + "learning_rate": 0.0001, + "loss": 3.1846, + "loss/crossentropy": 2.239874541759491, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.30340075492858887, + "loss/reg": 0.0, + "step": 14430 + }, + { + "epoch": 0.095, + "grad_norm": 3.203125, + "grad_norm_var": 0.5774698893229167, + "learning_rate": 0.0001, + "loss": 3.2356, + "loss/crossentropy": 2.212349569797516, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.25182478949427606, + "loss/reg": 0.0, + "step": 14440 + }, + { + "epoch": 0.09506578947368421, + "grad_norm": 2.890625, + "grad_norm_var": 0.19168294270833333, + "learning_rate": 0.0001, + "loss": 3.2659, + "loss/crossentropy": 2.337146294116974, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.2963370710611343, + "loss/reg": 0.0, + "step": 14450 + }, + { + "epoch": 0.09513157894736841, + "grad_norm": 2.5625, + "grad_norm_var": 0.10920308430989584, + "learning_rate": 0.0001, + "loss": 3.2442, + "loss/crossentropy": 1.8299875736236573, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.21587850898504257, + "loss/reg": 0.0, + "step": 14460 + }, + { + "epoch": 0.09519736842105263, + "grad_norm": 2.4375, + "grad_norm_var": 0.04309794108072917, + "learning_rate": 0.0001, + "loss": 3.1964, + "loss/crossentropy": 2.558639335632324, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.25251512974500656, + "loss/reg": 0.0, + "step": 14470 + }, + { + "epoch": 0.09526315789473684, + "grad_norm": 2.4375, + "grad_norm_var": 0.020686848958333334, + "learning_rate": 0.0001, + "loss": 3.1942, + "loss/crossentropy": 2.514283466339111, + "loss/hidden": 3.1234375, + "loss/incoh": 0.0, + "loss/logits": 0.2890095472335815, + "loss/reg": 0.0, + "step": 14480 + }, + { + "epoch": 0.09532894736842105, + "grad_norm": 2.140625, + "grad_norm_var": 0.028123982747395835, + "learning_rate": 0.0001, + "loss": 3.1856, + "loss/crossentropy": 2.2959898948669433, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.24410185664892198, + "loss/reg": 0.0, + "step": 14490 + }, + { + "epoch": 0.09539473684210527, + "grad_norm": 2.328125, + "grad_norm_var": 0.03728841145833333, + "learning_rate": 0.0001, + "loss": 3.2528, + "loss/crossentropy": 2.421563959121704, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.30335861891508104, + "loss/reg": 0.0, + "step": 14500 + }, + { + "epoch": 0.09546052631578947, + "grad_norm": 2.625, + "grad_norm_var": 0.041825358072916666, + "learning_rate": 0.0001, + "loss": 3.1292, + "loss/crossentropy": 2.3713988065719604, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.2508593872189522, + "loss/reg": 0.0, + "step": 14510 + }, + { + "epoch": 0.09552631578947368, + "grad_norm": 2.015625, + "grad_norm_var": 0.12280985514322916, + "learning_rate": 0.0001, + "loss": 3.1975, + "loss/crossentropy": 2.3875895380973815, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.26996816471219065, + "loss/reg": 0.0, + "step": 14520 + }, + { + "epoch": 0.0955921052631579, + "grad_norm": 2.375, + "grad_norm_var": 0.13038101196289062, + "learning_rate": 0.0001, + "loss": 3.1709, + "loss/crossentropy": 2.3352912187576296, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.23363660275936127, + "loss/reg": 0.0, + "step": 14530 + }, + { + "epoch": 0.09565789473684211, + "grad_norm": 2.5625, + "grad_norm_var": 0.11688206990559896, + "learning_rate": 0.0001, + "loss": 3.1364, + "loss/crossentropy": 2.473167669773102, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.24549597799777984, + "loss/reg": 0.0, + "step": 14540 + }, + { + "epoch": 0.09572368421052632, + "grad_norm": 2.734375, + "grad_norm_var": 0.060868326822916666, + "learning_rate": 0.0001, + "loss": 3.2086, + "loss/crossentropy": 2.3421871423721314, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.2576961562037468, + "loss/reg": 0.0, + "step": 14550 + }, + { + "epoch": 0.09578947368421052, + "grad_norm": 2.59375, + "grad_norm_var": 0.10832926432291666, + "learning_rate": 0.0001, + "loss": 3.1744, + "loss/crossentropy": 2.1765161633491514, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2244855582714081, + "loss/reg": 0.0, + "step": 14560 + }, + { + "epoch": 0.09585526315789474, + "grad_norm": 2.265625, + "grad_norm_var": 0.1071685791015625, + "learning_rate": 0.0001, + "loss": 3.1489, + "loss/crossentropy": 2.507940888404846, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.2875757083296776, + "loss/reg": 0.0, + "step": 14570 + }, + { + "epoch": 0.09592105263157895, + "grad_norm": 2.140625, + "grad_norm_var": 0.029523722330729165, + "learning_rate": 0.0001, + "loss": 3.1669, + "loss/crossentropy": 2.2886616230010985, + "loss/hidden": 3.0765625, + "loss/incoh": 0.0, + "loss/logits": 0.3050649344921112, + "loss/reg": 0.0, + "step": 14580 + }, + { + "epoch": 0.09598684210526316, + "grad_norm": 2.8125, + "grad_norm_var": 0.03560282389322917, + "learning_rate": 0.0001, + "loss": 3.1454, + "loss/crossentropy": 2.395359969139099, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.2635947346687317, + "loss/reg": 0.0, + "step": 14590 + }, + { + "epoch": 0.09605263157894736, + "grad_norm": 2.03125, + "grad_norm_var": 0.23357747395833334, + "learning_rate": 0.0001, + "loss": 3.2176, + "loss/crossentropy": 2.002879011631012, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.24021245390176774, + "loss/reg": 0.0, + "step": 14600 + }, + { + "epoch": 0.09611842105263158, + "grad_norm": 2.109375, + "grad_norm_var": 0.216552734375, + "learning_rate": 0.0001, + "loss": 3.1289, + "loss/crossentropy": 2.4440099954605103, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.3085349440574646, + "loss/reg": 0.0, + "step": 14610 + }, + { + "epoch": 0.09618421052631579, + "grad_norm": 2.21875, + "grad_norm_var": 0.18892822265625, + "learning_rate": 0.0001, + "loss": 3.2541, + "loss/crossentropy": 2.5584524154663084, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.33079054951667786, + "loss/reg": 0.0, + "step": 14620 + }, + { + "epoch": 0.09625, + "grad_norm": 2.546875, + "grad_norm_var": 0.18036702473958333, + "learning_rate": 0.0001, + "loss": 3.1899, + "loss/crossentropy": 2.386956262588501, + "loss/hidden": 2.9546875, + "loss/incoh": 0.0, + "loss/logits": 0.2895423695445061, + "loss/reg": 0.0, + "step": 14630 + }, + { + "epoch": 0.09631578947368422, + "grad_norm": 2.390625, + "grad_norm_var": 0.15852762858072916, + "learning_rate": 0.0001, + "loss": 3.1443, + "loss/crossentropy": 2.0192247331142426, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.2274337187409401, + "loss/reg": 0.0, + "step": 14640 + }, + { + "epoch": 0.09638157894736842, + "grad_norm": 2.375, + "grad_norm_var": 0.11549072265625, + "learning_rate": 0.0001, + "loss": 3.1243, + "loss/crossentropy": 2.499123454093933, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.25674946457147596, + "loss/reg": 0.0, + "step": 14650 + }, + { + "epoch": 0.09644736842105263, + "grad_norm": 2130706432.0, + "grad_norm_var": 2.83744368059244e+17, + "learning_rate": 0.0001, + "loss": 3.251, + "loss/crossentropy": 2.1218835711479187, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.23685040771961213, + "loss/reg": 0.0, + "step": 14660 + }, + { + "epoch": 0.09651315789473684, + "grad_norm": 2.15625, + "grad_norm_var": 2.8374436794771485e+17, + "learning_rate": 0.0001, + "loss": 3.2008, + "loss/crossentropy": 2.125733083486557, + "loss/hidden": 3.0, + "loss/incoh": 0.0, + "loss/logits": 0.3448520749807358, + "loss/reg": 0.0, + "step": 14670 + }, + { + "epoch": 0.09657894736842106, + "grad_norm": 2.40625, + "grad_norm_var": 0.24964090983072917, + "learning_rate": 0.0001, + "loss": 3.1553, + "loss/crossentropy": 2.604245328903198, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.23915667235851287, + "loss/reg": 0.0, + "step": 14680 + }, + { + "epoch": 0.09664473684210527, + "grad_norm": 2.40625, + "grad_norm_var": 0.11900634765625, + "learning_rate": 0.0001, + "loss": 3.1023, + "loss/crossentropy": 2.3761191368103027, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.2546194761991501, + "loss/reg": 0.0, + "step": 14690 + }, + { + "epoch": 0.09671052631578947, + "grad_norm": 2.5625, + "grad_norm_var": 0.05559488932291667, + "learning_rate": 0.0001, + "loss": 3.1544, + "loss/crossentropy": 2.3285223722457884, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.24710593670606612, + "loss/reg": 0.0, + "step": 14700 + }, + { + "epoch": 0.09677631578947368, + "grad_norm": 2.4375, + "grad_norm_var": 0.13647842407226562, + "learning_rate": 0.0001, + "loss": 3.1385, + "loss/crossentropy": 2.364056706428528, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.22887694388628005, + "loss/reg": 0.0, + "step": 14710 + }, + { + "epoch": 0.0968421052631579, + "grad_norm": 2.140625, + "grad_norm_var": 5.376778157552083, + "learning_rate": 0.0001, + "loss": 3.1773, + "loss/crossentropy": 2.2852516174316406, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.24893905222415924, + "loss/reg": 0.0, + "step": 14720 + }, + { + "epoch": 0.09690789473684211, + "grad_norm": 2.828125, + "grad_norm_var": 5.43084487915039, + "learning_rate": 0.0001, + "loss": 3.1089, + "loss/crossentropy": 2.1680606245994567, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.23210474848747253, + "loss/reg": 0.0, + "step": 14730 + }, + { + "epoch": 0.09697368421052631, + "grad_norm": 1.984375, + "grad_norm_var": 0.1650054931640625, + "learning_rate": 0.0001, + "loss": 3.1202, + "loss/crossentropy": 2.1676085114479067, + "loss/hidden": 3.103125, + "loss/incoh": 0.0, + "loss/logits": 0.2939779102802277, + "loss/reg": 0.0, + "step": 14740 + }, + { + "epoch": 0.09703947368421052, + "grad_norm": 2.078125, + "grad_norm_var": 0.17122294108072916, + "learning_rate": 0.0001, + "loss": 3.0635, + "loss/crossentropy": 2.498277449607849, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.22846879661083222, + "loss/reg": 0.0, + "step": 14750 + }, + { + "epoch": 0.09710526315789474, + "grad_norm": 2.296875, + "grad_norm_var": 0.11088765462239583, + "learning_rate": 0.0001, + "loss": 3.155, + "loss/crossentropy": 2.2534152626991273, + "loss/hidden": 2.9390625, + "loss/incoh": 0.0, + "loss/logits": 0.2535191968083382, + "loss/reg": 0.0, + "step": 14760 + }, + { + "epoch": 0.09717105263157895, + "grad_norm": 2.625, + "grad_norm_var": 0.10987955729166667, + "learning_rate": 0.0001, + "loss": 3.2477, + "loss/crossentropy": 2.085835373401642, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.25164034962654114, + "loss/reg": 0.0, + "step": 14770 + }, + { + "epoch": 0.09723684210526316, + "grad_norm": 2.578125, + "grad_norm_var": 17.859382120768228, + "learning_rate": 0.0001, + "loss": 3.2842, + "loss/crossentropy": 1.8799749910831451, + "loss/hidden": 3.0765625, + "loss/incoh": 0.0, + "loss/logits": 0.24884901493787764, + "loss/reg": 0.0, + "step": 14780 + }, + { + "epoch": 0.09730263157894736, + "grad_norm": 2.078125, + "grad_norm_var": 17.996207682291665, + "learning_rate": 0.0001, + "loss": 3.2386, + "loss/crossentropy": 2.307372045516968, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.26386404484510423, + "loss/reg": 0.0, + "step": 14790 + }, + { + "epoch": 0.09736842105263158, + "grad_norm": 2.515625, + "grad_norm_var": 0.06170247395833333, + "learning_rate": 0.0001, + "loss": 3.1452, + "loss/crossentropy": 2.454807901382446, + "loss/hidden": 2.9390625, + "loss/incoh": 0.0, + "loss/logits": 0.3172868087887764, + "loss/reg": 0.0, + "step": 14800 + }, + { + "epoch": 0.09743421052631579, + "grad_norm": 2.5, + "grad_norm_var": 0.05920817057291667, + "learning_rate": 0.0001, + "loss": 3.1114, + "loss/crossentropy": 2.014724650979042, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.1908031925559044, + "loss/reg": 0.0, + "step": 14810 + }, + { + "epoch": 0.0975, + "grad_norm": 2.90625, + "grad_norm_var": 0.060445149739583336, + "learning_rate": 0.0001, + "loss": 3.2493, + "loss/crossentropy": 2.463605833053589, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.24406316578388215, + "loss/reg": 0.0, + "step": 14820 + }, + { + "epoch": 0.09756578947368422, + "grad_norm": 2.28125, + "grad_norm_var": 0.29640299479166665, + "learning_rate": 0.0001, + "loss": 3.1614, + "loss/crossentropy": 2.501069176197052, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.2565233051776886, + "loss/reg": 0.0, + "step": 14830 + }, + { + "epoch": 0.09763157894736842, + "grad_norm": 2.09375, + "grad_norm_var": 0.5406534830729167, + "learning_rate": 0.0001, + "loss": 3.1904, + "loss/crossentropy": 2.462419664859772, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.22374519556760789, + "loss/reg": 0.0, + "step": 14840 + }, + { + "epoch": 0.09769736842105263, + "grad_norm": 2.296875, + "grad_norm_var": 0.7624501546223958, + "learning_rate": 0.0001, + "loss": 3.2431, + "loss/crossentropy": 2.226536822319031, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.26644248366355894, + "loss/reg": 0.0, + "step": 14850 + }, + { + "epoch": 0.09776315789473684, + "grad_norm": 2.328125, + "grad_norm_var": 2.5783355712890623, + "learning_rate": 0.0001, + "loss": 3.3229, + "loss/crossentropy": 2.3811925053596497, + "loss/hidden": 2.715625, + "loss/incoh": 0.0, + "loss/logits": 0.22725498378276826, + "loss/reg": 0.0, + "step": 14860 + }, + { + "epoch": 0.09782894736842106, + "grad_norm": 2.90625, + "grad_norm_var": 2.3144205729166667, + "learning_rate": 0.0001, + "loss": 3.2061, + "loss/crossentropy": 2.5685184717178347, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.2757527410984039, + "loss/reg": 0.0, + "step": 14870 + }, + { + "epoch": 0.09789473684210526, + "grad_norm": 2.515625, + "grad_norm_var": 1.900218709309896, + "learning_rate": 0.0001, + "loss": 3.1588, + "loss/crossentropy": 2.224149799346924, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.24766245782375335, + "loss/reg": 0.0, + "step": 14880 + }, + { + "epoch": 0.09796052631578947, + "grad_norm": 2.421875, + "grad_norm_var": 0.13209228515625, + "learning_rate": 0.0001, + "loss": 3.1723, + "loss/crossentropy": 2.2869945645332335, + "loss/hidden": 3.0, + "loss/incoh": 0.0, + "loss/logits": 0.3204286351799965, + "loss/reg": 0.0, + "step": 14890 + }, + { + "epoch": 0.09802631578947368, + "grad_norm": 2.71875, + "grad_norm_var": 0.036408487955729166, + "learning_rate": 0.0001, + "loss": 3.1244, + "loss/crossentropy": 2.276153302192688, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.2348289594054222, + "loss/reg": 0.0, + "step": 14900 + }, + { + "epoch": 0.0980921052631579, + "grad_norm": 3.328125, + "grad_norm_var": 0.0842926025390625, + "learning_rate": 0.0001, + "loss": 3.2024, + "loss/crossentropy": 2.2277270436286924, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.2504657179117203, + "loss/reg": 0.0, + "step": 14910 + }, + { + "epoch": 0.09815789473684211, + "grad_norm": 2.25, + "grad_norm_var": 0.15784098307291666, + "learning_rate": 0.0001, + "loss": 3.1643, + "loss/crossentropy": 2.601578450202942, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.27564091980457306, + "loss/reg": 0.0, + "step": 14920 + }, + { + "epoch": 0.09822368421052631, + "grad_norm": 2.21875, + "grad_norm_var": 0.12454325358072917, + "learning_rate": 0.0001, + "loss": 3.0857, + "loss/crossentropy": 2.4973155736923216, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.2681492820382118, + "loss/reg": 0.0, + "step": 14930 + }, + { + "epoch": 0.09828947368421052, + "grad_norm": 2.703125, + "grad_norm_var": 0.04514872233072917, + "learning_rate": 0.0001, + "loss": 3.1214, + "loss/crossentropy": 2.526623797416687, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.26705596745014193, + "loss/reg": 0.0, + "step": 14940 + }, + { + "epoch": 0.09835526315789474, + "grad_norm": 2.21875, + "grad_norm_var": 0.23338216145833332, + "learning_rate": 0.0001, + "loss": 3.1468, + "loss/crossentropy": 2.273276376724243, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.28430237621068954, + "loss/reg": 0.0, + "step": 14950 + }, + { + "epoch": 0.09842105263157895, + "grad_norm": 2.1875, + "grad_norm_var": 0.2802398681640625, + "learning_rate": 0.0001, + "loss": 3.2042, + "loss/crossentropy": 2.4497693538665772, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.2505017280578613, + "loss/reg": 0.0, + "step": 14960 + }, + { + "epoch": 0.09848684210526316, + "grad_norm": 2.453125, + "grad_norm_var": 0.15282796223958334, + "learning_rate": 0.0001, + "loss": 3.1746, + "loss/crossentropy": 2.4736087679862977, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.24588750153779984, + "loss/reg": 0.0, + "step": 14970 + }, + { + "epoch": 0.09855263157894736, + "grad_norm": 2.203125, + "grad_norm_var": 0.11015218098958333, + "learning_rate": 0.0001, + "loss": 3.1539, + "loss/crossentropy": 2.373614990711212, + "loss/hidden": 3.0578125, + "loss/incoh": 0.0, + "loss/logits": 0.30631934851408005, + "loss/reg": 0.0, + "step": 14980 + }, + { + "epoch": 0.09861842105263158, + "grad_norm": 3.921875, + "grad_norm_var": 0.1558990478515625, + "learning_rate": 0.0001, + "loss": 3.1982, + "loss/crossentropy": 2.295030379295349, + "loss/hidden": 2.8203125, + "loss/incoh": 0.0, + "loss/logits": 0.24311772882938384, + "loss/reg": 0.0, + "step": 14990 + }, + { + "epoch": 0.09868421052631579, + "grad_norm": 2.234375, + "grad_norm_var": 0.3582967122395833, + "learning_rate": 0.0001, + "loss": 3.2225, + "loss/crossentropy": 2.29546434879303, + "loss/hidden": 3.05625, + "loss/incoh": 0.0, + "loss/logits": 0.45574710667133334, + "loss/reg": 0.0, + "step": 15000 + }, + { + "epoch": 0.09875, + "grad_norm": 2.796875, + "grad_norm_var": 5.883512115478515, + "learning_rate": 0.0001, + "loss": 3.1782, + "loss/crossentropy": 2.4186841249465942, + "loss/hidden": 3.0265625, + "loss/incoh": 0.0, + "loss/logits": 0.3855607584118843, + "loss/reg": 0.0, + "step": 15010 + }, + { + "epoch": 0.0988157894736842, + "grad_norm": 2.578125, + "grad_norm_var": 5.986443837483724, + "learning_rate": 0.0001, + "loss": 3.0661, + "loss/crossentropy": 2.409875476360321, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.23775186911225318, + "loss/reg": 0.0, + "step": 15020 + }, + { + "epoch": 0.09888157894736842, + "grad_norm": 2.21875, + "grad_norm_var": 0.055916086832682295, + "learning_rate": 0.0001, + "loss": 3.1756, + "loss/crossentropy": 2.187470281124115, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.2504939392209053, + "loss/reg": 0.0, + "step": 15030 + }, + { + "epoch": 0.09894736842105263, + "grad_norm": 2.296875, + "grad_norm_var": 0.058203125, + "learning_rate": 0.0001, + "loss": 3.2179, + "loss/crossentropy": 2.634449529647827, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.2524956986308098, + "loss/reg": 0.0, + "step": 15040 + }, + { + "epoch": 0.09901315789473684, + "grad_norm": 2.4375, + "grad_norm_var": 0.09206441243489584, + "learning_rate": 0.0001, + "loss": 3.1804, + "loss/crossentropy": 2.3493194222450255, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.24196315556764603, + "loss/reg": 0.0, + "step": 15050 + }, + { + "epoch": 0.09907894736842106, + "grad_norm": 2.328125, + "grad_norm_var": 0.0587799072265625, + "learning_rate": 0.0001, + "loss": 3.1473, + "loss/crossentropy": 2.5005852222442626, + "loss/hidden": 3.015625, + "loss/incoh": 0.0, + "loss/logits": 0.2778876781463623, + "loss/reg": 0.0, + "step": 15060 + }, + { + "epoch": 0.09914473684210526, + "grad_norm": 2.890625, + "grad_norm_var": 0.07665913899739583, + "learning_rate": 0.0001, + "loss": 3.2621, + "loss/crossentropy": 2.30003308057785, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.28490780740976335, + "loss/reg": 0.0, + "step": 15070 + }, + { + "epoch": 0.09921052631578947, + "grad_norm": 2.78125, + "grad_norm_var": 0.07154541015625, + "learning_rate": 0.0001, + "loss": 3.1472, + "loss/crossentropy": 2.2739776968955994, + "loss/hidden": 2.975, + "loss/incoh": 0.0, + "loss/logits": 0.2682348355650902, + "loss/reg": 0.0, + "step": 15080 + }, + { + "epoch": 0.09927631578947368, + "grad_norm": 2.484375, + "grad_norm_var": 0.044169108072916664, + "learning_rate": 0.0001, + "loss": 3.1016, + "loss/crossentropy": 2.2083258867263793, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.23891242742538452, + "loss/reg": 0.0, + "step": 15090 + }, + { + "epoch": 0.0993421052631579, + "grad_norm": 2.34375, + "grad_norm_var": 0.0837890625, + "learning_rate": 0.0001, + "loss": 3.1153, + "loss/crossentropy": 2.4516371846199037, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.21450784876942636, + "loss/reg": 0.0, + "step": 15100 + }, + { + "epoch": 0.09940789473684211, + "grad_norm": 2.515625, + "grad_norm_var": 0.14197489420572917, + "learning_rate": 0.0001, + "loss": 3.1489, + "loss/crossentropy": 2.2019423693418503, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.278714843839407, + "loss/reg": 0.0, + "step": 15110 + }, + { + "epoch": 0.09947368421052631, + "grad_norm": 1.984375, + "grad_norm_var": 0.06102676391601562, + "learning_rate": 0.0001, + "loss": 3.0573, + "loss/crossentropy": 2.0795727133750916, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.24602137356996537, + "loss/reg": 0.0, + "step": 15120 + }, + { + "epoch": 0.09953947368421052, + "grad_norm": 2.484375, + "grad_norm_var": 0.04859619140625, + "learning_rate": 0.0001, + "loss": 3.1684, + "loss/crossentropy": 2.0709302008152006, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.27385044246912005, + "loss/reg": 0.0, + "step": 15130 + }, + { + "epoch": 0.09960526315789474, + "grad_norm": 3.359375, + "grad_norm_var": 0.10572509765625, + "learning_rate": 0.0001, + "loss": 3.2319, + "loss/crossentropy": 2.1219942808151244, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.31611712723970414, + "loss/reg": 0.0, + "step": 15140 + }, + { + "epoch": 0.09967105263157895, + "grad_norm": 2.265625, + "grad_norm_var": 0.30730794270833334, + "learning_rate": 0.0001, + "loss": 3.3146, + "loss/crossentropy": 2.36390962600708, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.2202586129307747, + "loss/reg": 0.0, + "step": 15150 + }, + { + "epoch": 0.09973684210526315, + "grad_norm": 3.015625, + "grad_norm_var": 0.3680948893229167, + "learning_rate": 0.0001, + "loss": 3.1903, + "loss/crossentropy": 2.592810094356537, + "loss/hidden": 2.959375, + "loss/incoh": 0.0, + "loss/logits": 0.31535505652427676, + "loss/reg": 0.0, + "step": 15160 + }, + { + "epoch": 0.09980263157894737, + "grad_norm": 2.171875, + "grad_norm_var": 0.06611226399739584, + "learning_rate": 0.0001, + "loss": 3.1583, + "loss/crossentropy": 2.4805197715759277, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.25672143548727033, + "loss/reg": 0.0, + "step": 15170 + }, + { + "epoch": 0.09986842105263158, + "grad_norm": 2.1875, + "grad_norm_var": 0.026691691080729166, + "learning_rate": 0.0001, + "loss": 3.0654, + "loss/crossentropy": 2.428369462490082, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.2483712613582611, + "loss/reg": 0.0, + "step": 15180 + }, + { + "epoch": 0.09993421052631579, + "grad_norm": 2.203125, + "grad_norm_var": 0.16633707682291668, + "learning_rate": 0.0001, + "loss": 3.2011, + "loss/crossentropy": 1.8888699412345886, + "loss/hidden": 3.059375, + "loss/incoh": 0.0, + "loss/logits": 0.25502131283283236, + "loss/reg": 0.0, + "step": 15190 + }, + { + "epoch": 0.1, + "grad_norm": 2.515625, + "grad_norm_var": 0.09675191243489584, + "learning_rate": 0.0001, + "loss": 3.1893, + "loss/crossentropy": 2.1418832421302794, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.2491638869047165, + "loss/reg": 0.0, + "step": 15200 + }, + { + "epoch": 0.1000657894736842, + "grad_norm": 2.46875, + "grad_norm_var": 0.08440755208333334, + "learning_rate": 0.0001, + "loss": 3.2207, + "loss/crossentropy": 2.5305609703063965, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.28034003674983976, + "loss/reg": 0.0, + "step": 15210 + }, + { + "epoch": 0.10013157894736842, + "grad_norm": 2.703125, + "grad_norm_var": 0.10357666015625, + "learning_rate": 0.0001, + "loss": 3.2279, + "loss/crossentropy": 2.1161023378372192, + "loss/hidden": 2.975, + "loss/incoh": 0.0, + "loss/logits": 0.2513925403356552, + "loss/reg": 0.0, + "step": 15220 + }, + { + "epoch": 0.10019736842105263, + "grad_norm": 2.46875, + "grad_norm_var": 0.2027008056640625, + "learning_rate": 0.0001, + "loss": 3.2361, + "loss/crossentropy": 2.064685332775116, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.28131103515625, + "loss/reg": 0.0, + "step": 15230 + }, + { + "epoch": 0.10026315789473685, + "grad_norm": 2.28125, + "grad_norm_var": 0.14849853515625, + "learning_rate": 0.0001, + "loss": 3.151, + "loss/crossentropy": 2.4768277525901796, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.24746784269809724, + "loss/reg": 0.0, + "step": 15240 + }, + { + "epoch": 0.10032894736842106, + "grad_norm": 2.15625, + "grad_norm_var": 0.07953999837239584, + "learning_rate": 0.0001, + "loss": 3.1794, + "loss/crossentropy": 2.2046443462371825, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.26957911550998687, + "loss/reg": 0.0, + "step": 15250 + }, + { + "epoch": 0.10039473684210526, + "grad_norm": 2.484375, + "grad_norm_var": 0.12280985514322916, + "learning_rate": 0.0001, + "loss": 3.0606, + "loss/crossentropy": 2.418226730823517, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.21895384043455124, + "loss/reg": 0.0, + "step": 15260 + }, + { + "epoch": 0.10046052631578947, + "grad_norm": 2.46875, + "grad_norm_var": 0.09387613932291666, + "learning_rate": 0.0001, + "loss": 3.1144, + "loss/crossentropy": 2.1524213790893554, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.23847964853048326, + "loss/reg": 0.0, + "step": 15270 + }, + { + "epoch": 0.10052631578947369, + "grad_norm": 2.859375, + "grad_norm_var": 0.12705459594726562, + "learning_rate": 0.0001, + "loss": 3.0626, + "loss/crossentropy": 1.9326023817062379, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.24662483483552933, + "loss/reg": 0.0, + "step": 15280 + }, + { + "epoch": 0.1005921052631579, + "grad_norm": 2.640625, + "grad_norm_var": 8.221414947509766, + "learning_rate": 0.0001, + "loss": 3.2248, + "loss/crossentropy": 2.2926568508148195, + "loss/hidden": 2.9828125, + "loss/incoh": 0.0, + "loss/logits": 0.26595802754163744, + "loss/reg": 0.0, + "step": 15290 + }, + { + "epoch": 0.1006578947368421, + "grad_norm": 2.71875, + "grad_norm_var": 8.17893778483073, + "learning_rate": 0.0001, + "loss": 3.1429, + "loss/crossentropy": 2.4002704977989198, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.2911446109414101, + "loss/reg": 0.0, + "step": 15300 + }, + { + "epoch": 0.10072368421052631, + "grad_norm": 2.1875, + "grad_norm_var": 0.03648681640625, + "learning_rate": 0.0001, + "loss": 3.1424, + "loss/crossentropy": 2.1603388369083403, + "loss/hidden": 2.9984375, + "loss/incoh": 0.0, + "loss/logits": 0.27493633329868317, + "loss/reg": 0.0, + "step": 15310 + }, + { + "epoch": 0.10078947368421053, + "grad_norm": 3.96875, + "grad_norm_var": 0.1912506103515625, + "learning_rate": 0.0001, + "loss": 3.133, + "loss/crossentropy": 2.279471695423126, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.23778771311044694, + "loss/reg": 0.0, + "step": 15320 + }, + { + "epoch": 0.10085526315789474, + "grad_norm": 2.140625, + "grad_norm_var": 0.2115875244140625, + "learning_rate": 0.0001, + "loss": 3.0811, + "loss/crossentropy": 2.1736138820648194, + "loss/hidden": 2.665625, + "loss/incoh": 0.0, + "loss/logits": 0.19810649901628494, + "loss/reg": 0.0, + "step": 15330 + }, + { + "epoch": 0.10092105263157895, + "grad_norm": 2.453125, + "grad_norm_var": 0.05100809733072917, + "learning_rate": 0.0001, + "loss": 3.1508, + "loss/crossentropy": 2.2725695729255677, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.2860790088772774, + "loss/reg": 0.0, + "step": 15340 + }, + { + "epoch": 0.10098684210526315, + "grad_norm": 2.6875, + "grad_norm_var": 2.056571451822917, + "learning_rate": 0.0001, + "loss": 3.2468, + "loss/crossentropy": 2.215101981163025, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.27275211960077284, + "loss/reg": 0.0, + "step": 15350 + }, + { + "epoch": 0.10105263157894737, + "grad_norm": 2.15625, + "grad_norm_var": 2.1172159830729167, + "learning_rate": 0.0001, + "loss": 3.1519, + "loss/crossentropy": 2.416505420207977, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.26659835278987887, + "loss/reg": 0.0, + "step": 15360 + }, + { + "epoch": 0.10111842105263158, + "grad_norm": 2.234375, + "grad_norm_var": 0.051708984375, + "learning_rate": 0.0001, + "loss": 3.0702, + "loss/crossentropy": 2.3915260195732118, + "loss/hidden": 2.7171875, + "loss/incoh": 0.0, + "loss/logits": 0.23197022825479507, + "loss/reg": 0.0, + "step": 15370 + }, + { + "epoch": 0.1011842105263158, + "grad_norm": 2.359375, + "grad_norm_var": 0.03733622233072917, + "learning_rate": 0.0001, + "loss": 3.1794, + "loss/crossentropy": 2.4574419140815733, + "loss/hidden": 3.134375, + "loss/incoh": 0.0, + "loss/logits": 0.34623306542634963, + "loss/reg": 0.0, + "step": 15380 + }, + { + "epoch": 0.10125, + "grad_norm": 2.3125, + "grad_norm_var": 0.229052734375, + "learning_rate": 0.0001, + "loss": 3.1256, + "loss/crossentropy": 2.1932177782058715, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.22062241584062575, + "loss/reg": 0.0, + "step": 15390 + }, + { + "epoch": 0.1013157894736842, + "grad_norm": 2.234375, + "grad_norm_var": 0.08741861979166667, + "learning_rate": 0.0001, + "loss": 3.1753, + "loss/crossentropy": 2.2610700845718386, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.22664773613214492, + "loss/reg": 0.0, + "step": 15400 + }, + { + "epoch": 0.10138157894736842, + "grad_norm": 2.546875, + "grad_norm_var": 0.02672119140625, + "learning_rate": 0.0001, + "loss": 3.1537, + "loss/crossentropy": 2.394396644830704, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.25834341049194337, + "loss/reg": 0.0, + "step": 15410 + }, + { + "epoch": 0.10144736842105263, + "grad_norm": 2.484375, + "grad_norm_var": 0.034895833333333334, + "learning_rate": 0.0001, + "loss": 3.1834, + "loss/crossentropy": 2.290709137916565, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.2504901379346848, + "loss/reg": 0.0, + "step": 15420 + }, + { + "epoch": 0.10151315789473685, + "grad_norm": 2.734375, + "grad_norm_var": 5.647508748372396, + "learning_rate": 0.0001, + "loss": 3.1705, + "loss/crossentropy": 2.597213554382324, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.2691803678870201, + "loss/reg": 0.0, + "step": 15430 + }, + { + "epoch": 0.10157894736842105, + "grad_norm": 2.125, + "grad_norm_var": 0.07888997395833333, + "learning_rate": 0.0001, + "loss": 3.1069, + "loss/crossentropy": 2.1887213230133056, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.2691598206758499, + "loss/reg": 0.0, + "step": 15440 + }, + { + "epoch": 0.10164473684210526, + "grad_norm": 3.109375, + "grad_norm_var": 0.0681793212890625, + "learning_rate": 0.0001, + "loss": 3.1639, + "loss/crossentropy": 2.097235471010208, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.2574038878083229, + "loss/reg": 0.0, + "step": 15450 + }, + { + "epoch": 0.10171052631578947, + "grad_norm": 2.125, + "grad_norm_var": 0.0678863525390625, + "learning_rate": 0.0001, + "loss": 3.1301, + "loss/crossentropy": 2.2443934082984924, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.24395571500062943, + "loss/reg": 0.0, + "step": 15460 + }, + { + "epoch": 0.10177631578947369, + "grad_norm": 2.265625, + "grad_norm_var": 0.1054595947265625, + "learning_rate": 0.0001, + "loss": 3.2604, + "loss/crossentropy": 2.429443156719208, + "loss/hidden": 2.990625, + "loss/incoh": 0.0, + "loss/logits": 0.2746900498867035, + "loss/reg": 0.0, + "step": 15470 + }, + { + "epoch": 0.1018421052631579, + "grad_norm": 2.296875, + "grad_norm_var": 0.1326080322265625, + "learning_rate": 0.0001, + "loss": 3.221, + "loss/crossentropy": 2.423279583454132, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.27492027878761294, + "loss/reg": 0.0, + "step": 15480 + }, + { + "epoch": 0.1019078947368421, + "grad_norm": 2.484375, + "grad_norm_var": 0.1138092041015625, + "learning_rate": 0.0001, + "loss": 3.257, + "loss/crossentropy": 2.648502016067505, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.2935266062617302, + "loss/reg": 0.0, + "step": 15490 + }, + { + "epoch": 0.10197368421052631, + "grad_norm": 2.40625, + "grad_norm_var": 0.1727203369140625, + "learning_rate": 0.0001, + "loss": 3.1197, + "loss/crossentropy": 2.485008704662323, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.2535603493452072, + "loss/reg": 0.0, + "step": 15500 + }, + { + "epoch": 0.10203947368421053, + "grad_norm": 2.84375, + "grad_norm_var": 0.18816731770833334, + "learning_rate": 0.0001, + "loss": 3.1804, + "loss/crossentropy": 2.180854117870331, + "loss/hidden": 2.9703125, + "loss/incoh": 0.0, + "loss/logits": 0.24670835435390473, + "loss/reg": 0.0, + "step": 15510 + }, + { + "epoch": 0.10210526315789474, + "grad_norm": 2.453125, + "grad_norm_var": 0.07157796223958333, + "learning_rate": 0.0001, + "loss": 3.1547, + "loss/crossentropy": 2.5997716546058656, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.2878054201602936, + "loss/reg": 0.0, + "step": 15520 + }, + { + "epoch": 0.10217105263157895, + "grad_norm": 2.5625, + "grad_norm_var": 0.154443359375, + "learning_rate": 0.0001, + "loss": 3.1694, + "loss/crossentropy": 2.3764194369316103, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.24656722396612168, + "loss/reg": 0.0, + "step": 15530 + }, + { + "epoch": 0.10223684210526315, + "grad_norm": 2.328125, + "grad_norm_var": 0.25536702473958334, + "learning_rate": 0.0001, + "loss": 3.1474, + "loss/crossentropy": 1.9735815048217773, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.24598835706710814, + "loss/reg": 0.0, + "step": 15540 + }, + { + "epoch": 0.10230263157894737, + "grad_norm": 2.8125, + "grad_norm_var": 0.1596832275390625, + "learning_rate": 0.0001, + "loss": 3.1522, + "loss/crossentropy": 2.525629758834839, + "loss/hidden": 2.9921875, + "loss/incoh": 0.0, + "loss/logits": 0.33703051060438155, + "loss/reg": 0.0, + "step": 15550 + }, + { + "epoch": 0.10236842105263158, + "grad_norm": 2.28125, + "grad_norm_var": 0.1472808837890625, + "learning_rate": 0.0001, + "loss": 3.212, + "loss/crossentropy": 2.03394900560379, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.24415955394506456, + "loss/reg": 0.0, + "step": 15560 + }, + { + "epoch": 0.1024342105263158, + "grad_norm": 2.3125, + "grad_norm_var": 0.07866923014322917, + "learning_rate": 0.0001, + "loss": 3.122, + "loss/crossentropy": 2.0943363308906555, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.23014722466468812, + "loss/reg": 0.0, + "step": 15570 + }, + { + "epoch": 0.1025, + "grad_norm": 2.359375, + "grad_norm_var": 0.057389322916666666, + "learning_rate": 0.0001, + "loss": 3.1268, + "loss/crossentropy": 2.033569025993347, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2175581559538841, + "loss/reg": 0.0, + "step": 15580 + }, + { + "epoch": 0.10256578947368421, + "grad_norm": 2.46875, + "grad_norm_var": 0.08776041666666666, + "learning_rate": 0.0001, + "loss": 3.0706, + "loss/crossentropy": 2.6018026471138, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.2438505232334137, + "loss/reg": 0.0, + "step": 15590 + }, + { + "epoch": 0.10263157894736842, + "grad_norm": 2.078125, + "grad_norm_var": 0.1537994384765625, + "learning_rate": 0.0001, + "loss": 3.1439, + "loss/crossentropy": 2.196335256099701, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.2767083361744881, + "loss/reg": 0.0, + "step": 15600 + }, + { + "epoch": 0.10269736842105263, + "grad_norm": 2.46875, + "grad_norm_var": 0.07605692545572916, + "learning_rate": 0.0001, + "loss": 3.1523, + "loss/crossentropy": 2.326652777194977, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.2691790975630283, + "loss/reg": 0.0, + "step": 15610 + }, + { + "epoch": 0.10276315789473685, + "grad_norm": 2.21875, + "grad_norm_var": 0.09851455688476562, + "learning_rate": 0.0001, + "loss": 3.1315, + "loss/crossentropy": 2.219775491952896, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.2539939820766449, + "loss/reg": 0.0, + "step": 15620 + }, + { + "epoch": 0.10282894736842105, + "grad_norm": 2.078125, + "grad_norm_var": 0.14008560180664062, + "learning_rate": 0.0001, + "loss": 3.1501, + "loss/crossentropy": 2.284542143344879, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.27733145356178285, + "loss/reg": 0.0, + "step": 15630 + }, + { + "epoch": 0.10289473684210526, + "grad_norm": 2.578125, + "grad_norm_var": 0.10869115193684896, + "learning_rate": 0.0001, + "loss": 3.1621, + "loss/crossentropy": 2.4228519797325134, + "loss/hidden": 3.08125, + "loss/incoh": 0.0, + "loss/logits": 0.32454578429460523, + "loss/reg": 0.0, + "step": 15640 + }, + { + "epoch": 0.10296052631578947, + "grad_norm": 2.21875, + "grad_norm_var": 0.11193211873372395, + "learning_rate": 0.0001, + "loss": 3.1631, + "loss/crossentropy": 2.3004646062850953, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.2865068309009075, + "loss/reg": 0.0, + "step": 15650 + }, + { + "epoch": 0.10302631578947369, + "grad_norm": 2.203125, + "grad_norm_var": 0.08788655598958334, + "learning_rate": 0.0001, + "loss": 3.2413, + "loss/crossentropy": 2.3326223611831667, + "loss/hidden": 3.0703125, + "loss/incoh": 0.0, + "loss/logits": 0.2786666050553322, + "loss/reg": 0.0, + "step": 15660 + }, + { + "epoch": 0.1030921052631579, + "grad_norm": 2.0625, + "grad_norm_var": 0.05933329264322917, + "learning_rate": 0.0001, + "loss": 3.0799, + "loss/crossentropy": 2.308107304573059, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.23495433181524278, + "loss/reg": 0.0, + "step": 15670 + }, + { + "epoch": 0.1031578947368421, + "grad_norm": 2.4375, + "grad_norm_var": 0.04383926391601563, + "learning_rate": 0.0001, + "loss": 3.0889, + "loss/crossentropy": 2.3051168084144593, + "loss/hidden": 2.965625, + "loss/incoh": 0.0, + "loss/logits": 0.278233802318573, + "loss/reg": 0.0, + "step": 15680 + }, + { + "epoch": 0.10322368421052631, + "grad_norm": 2.234375, + "grad_norm_var": 1.4827247619628907, + "learning_rate": 0.0001, + "loss": 3.1864, + "loss/crossentropy": 2.2644376397132873, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.22414357215166092, + "loss/reg": 0.0, + "step": 15690 + }, + { + "epoch": 0.10328947368421053, + "grad_norm": 2.296875, + "grad_norm_var": 0.962451171875, + "learning_rate": 0.0001, + "loss": 3.1546, + "loss/crossentropy": 2.424470007419586, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.24368802309036255, + "loss/reg": 0.0, + "step": 15700 + }, + { + "epoch": 0.10335526315789474, + "grad_norm": 2.328125, + "grad_norm_var": 0.07737223307291667, + "learning_rate": 0.0001, + "loss": 3.1624, + "loss/crossentropy": 2.459938275814056, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.23296410143375396, + "loss/reg": 0.0, + "step": 15710 + }, + { + "epoch": 0.10342105263157894, + "grad_norm": 2.59375, + "grad_norm_var": 0.06443583170572917, + "learning_rate": 0.0001, + "loss": 3.1932, + "loss/crossentropy": 2.2153470873832704, + "loss/hidden": 3.146875, + "loss/incoh": 0.0, + "loss/logits": 0.33586266040802004, + "loss/reg": 0.0, + "step": 15720 + }, + { + "epoch": 0.10348684210526315, + "grad_norm": 3.53125, + "grad_norm_var": 0.17069905598958332, + "learning_rate": 0.0001, + "loss": 3.257, + "loss/crossentropy": 2.2473284363746644, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.24566184133291244, + "loss/reg": 0.0, + "step": 15730 + }, + { + "epoch": 0.10355263157894737, + "grad_norm": 2.78125, + "grad_norm_var": 0.163916015625, + "learning_rate": 0.0001, + "loss": 3.0707, + "loss/crossentropy": 2.620988368988037, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.29941551238298414, + "loss/reg": 0.0, + "step": 15740 + }, + { + "epoch": 0.10361842105263158, + "grad_norm": 2.703125, + "grad_norm_var": 0.35409749348958336, + "learning_rate": 0.0001, + "loss": 3.2423, + "loss/crossentropy": 2.3977234601974486, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.26520802527666093, + "loss/reg": 0.0, + "step": 15750 + }, + { + "epoch": 0.1036842105263158, + "grad_norm": 2.375, + "grad_norm_var": 3.301877391050758e+17, + "learning_rate": 0.0001, + "loss": 3.2653, + "loss/crossentropy": 2.3854068875312806, + "loss/hidden": 3.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.3163578942418098, + "loss/reg": 0.0, + "step": 15760 + }, + { + "epoch": 0.10375, + "grad_norm": 3.09375, + "grad_norm_var": 3.301877391679248e+17, + "learning_rate": 0.0001, + "loss": 3.1843, + "loss/crossentropy": 2.2795175433158876, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.26798346936702727, + "loss/reg": 0.0, + "step": 15770 + }, + { + "epoch": 0.10381578947368421, + "grad_norm": 2.578125, + "grad_norm_var": 0.1086334228515625, + "learning_rate": 0.0001, + "loss": 3.1522, + "loss/crossentropy": 2.125895881652832, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.23809853047132493, + "loss/reg": 0.0, + "step": 15780 + }, + { + "epoch": 0.10388157894736842, + "grad_norm": 2.21875, + "grad_norm_var": 0.09806289672851562, + "learning_rate": 0.0001, + "loss": 3.121, + "loss/crossentropy": 2.3405375838279725, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.23903141915798187, + "loss/reg": 0.0, + "step": 15790 + }, + { + "epoch": 0.10394736842105264, + "grad_norm": 2.1875, + "grad_norm_var": 0.05409927368164062, + "learning_rate": 0.0001, + "loss": 3.1165, + "loss/crossentropy": 2.371259605884552, + "loss/hidden": 2.9640625, + "loss/incoh": 0.0, + "loss/logits": 0.24798257499933243, + "loss/reg": 0.0, + "step": 15800 + }, + { + "epoch": 0.10401315789473685, + "grad_norm": 2.625, + "grad_norm_var": 0.046727498372395836, + "learning_rate": 0.0001, + "loss": 3.0808, + "loss/crossentropy": 2.5483869433403017, + "loss/hidden": 3.0, + "loss/incoh": 0.0, + "loss/logits": 0.2987742185592651, + "loss/reg": 0.0, + "step": 15810 + }, + { + "epoch": 0.10407894736842105, + "grad_norm": 2.8125, + "grad_norm_var": 0.43404541015625, + "learning_rate": 0.0001, + "loss": 3.1913, + "loss/crossentropy": 2.3843488097190857, + "loss/hidden": 3.0671875, + "loss/incoh": 0.0, + "loss/logits": 0.41426307857036593, + "loss/reg": 0.0, + "step": 15820 + }, + { + "epoch": 0.10414473684210526, + "grad_norm": 2.1875, + "grad_norm_var": 0.42040913899739585, + "learning_rate": 0.0001, + "loss": 3.1412, + "loss/crossentropy": 2.451664757728577, + "loss/hidden": 3.0796875, + "loss/incoh": 0.0, + "loss/logits": 0.2949830338358879, + "loss/reg": 0.0, + "step": 15830 + }, + { + "epoch": 0.10421052631578948, + "grad_norm": 2.046875, + "grad_norm_var": 12064558770995.855, + "learning_rate": 0.0001, + "loss": 3.2876, + "loss/crossentropy": 2.540658712387085, + "loss/hidden": 2.7984375, + "loss/incoh": 0.0, + "loss/logits": 0.3069792494177818, + "loss/reg": 0.0, + "step": 15840 + }, + { + "epoch": 0.10427631578947369, + "grad_norm": 2.59375, + "grad_norm_var": 0.26617431640625, + "learning_rate": 0.0001, + "loss": 3.1409, + "loss/crossentropy": 2.3011590003967286, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.23567215949296952, + "loss/reg": 0.0, + "step": 15850 + }, + { + "epoch": 0.10434210526315789, + "grad_norm": 2.21875, + "grad_norm_var": 0.0716217041015625, + "learning_rate": 0.0001, + "loss": 3.2032, + "loss/crossentropy": 2.624694299697876, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.2735205709934235, + "loss/reg": 0.0, + "step": 15860 + }, + { + "epoch": 0.1044078947368421, + "grad_norm": 2.671875, + "grad_norm_var": 0.037018839518229166, + "learning_rate": 0.0001, + "loss": 3.1547, + "loss/crossentropy": 2.3426929831504824, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.26750322580337527, + "loss/reg": 0.0, + "step": 15870 + }, + { + "epoch": 0.10447368421052632, + "grad_norm": 2.625, + "grad_norm_var": 0.0340728759765625, + "learning_rate": 0.0001, + "loss": 3.1257, + "loss/crossentropy": 2.0538764238357543, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.23220817893743514, + "loss/reg": 0.0, + "step": 15880 + }, + { + "epoch": 0.10453947368421053, + "grad_norm": 2.484375, + "grad_norm_var": 0.10313084920247396, + "learning_rate": 0.0001, + "loss": 3.0791, + "loss/crossentropy": 2.4493046522140505, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.264581099152565, + "loss/reg": 0.0, + "step": 15890 + }, + { + "epoch": 0.10460526315789474, + "grad_norm": 2.140625, + "grad_norm_var": 0.08592910766601562, + "learning_rate": 0.0001, + "loss": 3.1153, + "loss/crossentropy": 2.384776270389557, + "loss/hidden": 2.6609375, + "loss/incoh": 0.0, + "loss/logits": 0.20801746100187302, + "loss/reg": 0.0, + "step": 15900 + }, + { + "epoch": 0.10467105263157894, + "grad_norm": 2.203125, + "grad_norm_var": 0.03535054524739583, + "learning_rate": 0.0001, + "loss": 3.1818, + "loss/crossentropy": 2.2780667304992677, + "loss/hidden": 3.059375, + "loss/incoh": 0.0, + "loss/logits": 0.33164310455322266, + "loss/reg": 0.0, + "step": 15910 + }, + { + "epoch": 0.10473684210526316, + "grad_norm": 2.3125, + "grad_norm_var": 0.04937744140625, + "learning_rate": 0.0001, + "loss": 3.1317, + "loss/crossentropy": 2.3224631786346435, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.26515267193317416, + "loss/reg": 0.0, + "step": 15920 + }, + { + "epoch": 0.10480263157894737, + "grad_norm": 3.375, + "grad_norm_var": 0.09521077473958334, + "learning_rate": 0.0001, + "loss": 3.1393, + "loss/crossentropy": 2.26437486410141, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.25974428951740264, + "loss/reg": 0.0, + "step": 15930 + }, + { + "epoch": 0.10486842105263158, + "grad_norm": 2.265625, + "grad_norm_var": 0.10048828125, + "learning_rate": 0.0001, + "loss": 3.203, + "loss/crossentropy": 2.216641104221344, + "loss/hidden": 2.9703125, + "loss/incoh": 0.0, + "loss/logits": 0.3172867178916931, + "loss/reg": 0.0, + "step": 15940 + }, + { + "epoch": 0.10493421052631578, + "grad_norm": 2.15625, + "grad_norm_var": 0.0614898681640625, + "learning_rate": 0.0001, + "loss": 3.0768, + "loss/crossentropy": 2.6155009508132934, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.2578451469540596, + "loss/reg": 0.0, + "step": 15950 + }, + { + "epoch": 0.105, + "grad_norm": 2.234375, + "grad_norm_var": 0.060269927978515624, + "learning_rate": 0.0001, + "loss": 3.161, + "loss/crossentropy": 2.5140405654907227, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.2738805189728737, + "loss/reg": 0.0, + "step": 15960 + }, + { + "epoch": 0.10506578947368421, + "grad_norm": 2.59375, + "grad_norm_var": 0.050176747639973956, + "learning_rate": 0.0001, + "loss": 3.0829, + "loss/crossentropy": 2.133234918117523, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.2213057592511177, + "loss/reg": 0.0, + "step": 15970 + }, + { + "epoch": 0.10513157894736842, + "grad_norm": 2.515625, + "grad_norm_var": 0.18106180826822918, + "learning_rate": 0.0001, + "loss": 3.2666, + "loss/crossentropy": 2.489140582084656, + "loss/hidden": 2.9421875, + "loss/incoh": 0.0, + "loss/logits": 0.2818035438656807, + "loss/reg": 0.0, + "step": 15980 + }, + { + "epoch": 0.10519736842105264, + "grad_norm": 2.390625, + "grad_norm_var": 0.6164947509765625, + "learning_rate": 0.0001, + "loss": 3.1722, + "loss/crossentropy": 2.51891051530838, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.2563257798552513, + "loss/reg": 0.0, + "step": 15990 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 2.75, + "grad_norm_var": 0.8845987955729167, + "learning_rate": 0.0001, + "loss": 3.1763, + "loss/crossentropy": 2.2177582025527953, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.2367611363530159, + "loss/reg": 0.0, + "step": 16000 + }, + { + "epoch": 0.10532894736842105, + "grad_norm": 2.5625, + "grad_norm_var": 0.451318359375, + "learning_rate": 0.0001, + "loss": 3.1408, + "loss/crossentropy": 2.2017379879951475, + "loss/hidden": 2.7234375, + "loss/incoh": 0.0, + "loss/logits": 0.20062217488884926, + "loss/reg": 0.0, + "step": 16010 + }, + { + "epoch": 0.10539473684210526, + "grad_norm": 2.390625, + "grad_norm_var": 0.050439453125, + "learning_rate": 0.0001, + "loss": 3.1466, + "loss/crossentropy": 2.2292946934700013, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.2783455640077591, + "loss/reg": 0.0, + "step": 16020 + }, + { + "epoch": 0.10546052631578948, + "grad_norm": 1.984375, + "grad_norm_var": 0.1126129150390625, + "learning_rate": 0.0001, + "loss": 3.1018, + "loss/crossentropy": 2.530960404872894, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.23581028431653978, + "loss/reg": 0.0, + "step": 16030 + }, + { + "epoch": 0.10552631578947369, + "grad_norm": 2.109375, + "grad_norm_var": 0.08625386555989584, + "learning_rate": 0.0001, + "loss": 3.1222, + "loss/crossentropy": 2.432818961143494, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.23912513256072998, + "loss/reg": 0.0, + "step": 16040 + }, + { + "epoch": 0.10559210526315789, + "grad_norm": 2.234375, + "grad_norm_var": 0.1727203369140625, + "learning_rate": 0.0001, + "loss": 3.1108, + "loss/crossentropy": 2.481075167655945, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2601512670516968, + "loss/reg": 0.0, + "step": 16050 + }, + { + "epoch": 0.1056578947368421, + "grad_norm": 2.171875, + "grad_norm_var": 0.22974853515625, + "learning_rate": 0.0001, + "loss": 3.1102, + "loss/crossentropy": 2.347836995124817, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.2967532381415367, + "loss/reg": 0.0, + "step": 16060 + }, + { + "epoch": 0.10572368421052632, + "grad_norm": 2.421875, + "grad_norm_var": 3.920637003580729, + "learning_rate": 0.0001, + "loss": 3.1789, + "loss/crossentropy": 2.2461806178092956, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.2796615108847618, + "loss/reg": 0.0, + "step": 16070 + }, + { + "epoch": 0.10578947368421053, + "grad_norm": 3.234375, + "grad_norm_var": 3.8184529622395833, + "learning_rate": 0.0001, + "loss": 3.2028, + "loss/crossentropy": 2.2602067947387696, + "loss/hidden": 3.059375, + "loss/incoh": 0.0, + "loss/logits": 0.27418701648712157, + "loss/reg": 0.0, + "step": 16080 + }, + { + "epoch": 0.10585526315789473, + "grad_norm": 2.546875, + "grad_norm_var": 0.0978912353515625, + "learning_rate": 0.0001, + "loss": 3.087, + "loss/crossentropy": 2.4076030969619753, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.2339254654943943, + "loss/reg": 0.0, + "step": 16090 + }, + { + "epoch": 0.10592105263157894, + "grad_norm": 8.8125, + "grad_norm_var": 2.634016927083333, + "learning_rate": 0.0001, + "loss": 3.2404, + "loss/crossentropy": 1.9294680893421172, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.24796108528971672, + "loss/reg": 0.0, + "step": 16100 + }, + { + "epoch": 0.10598684210526316, + "grad_norm": 2.5, + "grad_norm_var": 2.7831013997395835, + "learning_rate": 0.0001, + "loss": 3.1866, + "loss/crossentropy": 1.8769205152988433, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.2611444815993309, + "loss/reg": 0.0, + "step": 16110 + }, + { + "epoch": 0.10605263157894737, + "grad_norm": 2.703125, + "grad_norm_var": 0.76617431640625, + "learning_rate": 0.0001, + "loss": 3.2357, + "loss/crossentropy": 2.2888787627220153, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.2839828670024872, + "loss/reg": 0.0, + "step": 16120 + }, + { + "epoch": 0.10611842105263158, + "grad_norm": 2.359375, + "grad_norm_var": 0.04062093098958333, + "learning_rate": 0.0001, + "loss": 3.1608, + "loss/crossentropy": 2.313342797756195, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.2872613161802292, + "loss/reg": 0.0, + "step": 16130 + }, + { + "epoch": 0.10618421052631578, + "grad_norm": 2.28125, + "grad_norm_var": 0.4191691080729167, + "learning_rate": 0.0001, + "loss": 3.1753, + "loss/crossentropy": 2.6026643037796022, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.28212335854768755, + "loss/reg": 0.0, + "step": 16140 + }, + { + "epoch": 0.10625, + "grad_norm": 2.4375, + "grad_norm_var": 0.41529541015625, + "learning_rate": 0.0001, + "loss": 3.1346, + "loss/crossentropy": 2.556915271282196, + "loss/hidden": 3.1078125, + "loss/incoh": 0.0, + "loss/logits": 0.2837150752544403, + "loss/reg": 0.0, + "step": 16150 + }, + { + "epoch": 0.10631578947368421, + "grad_norm": 2.75, + "grad_norm_var": 0.053441365559895836, + "learning_rate": 0.0001, + "loss": 3.1471, + "loss/crossentropy": 2.399764931201935, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.2460502192378044, + "loss/reg": 0.0, + "step": 16160 + }, + { + "epoch": 0.10638157894736842, + "grad_norm": 2.453125, + "grad_norm_var": 0.8765462239583334, + "learning_rate": 0.0001, + "loss": 3.2831, + "loss/crossentropy": 2.2979734420776365, + "loss/hidden": 3.06875, + "loss/incoh": 0.0, + "loss/logits": 0.2876197725534439, + "loss/reg": 0.0, + "step": 16170 + }, + { + "epoch": 0.10644736842105264, + "grad_norm": 2.4375, + "grad_norm_var": 0.9296834309895833, + "learning_rate": 0.0001, + "loss": 3.2187, + "loss/crossentropy": 2.2085362553596495, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.24199980795383452, + "loss/reg": 0.0, + "step": 16180 + }, + { + "epoch": 0.10651315789473684, + "grad_norm": 2.234375, + "grad_norm_var": 0.06541239420572917, + "learning_rate": 0.0001, + "loss": 3.1524, + "loss/crossentropy": 2.4575427293777468, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.2581774353981018, + "loss/reg": 0.0, + "step": 16190 + }, + { + "epoch": 0.10657894736842105, + "grad_norm": 3.34375, + "grad_norm_var": 0.2840983072916667, + "learning_rate": 0.0001, + "loss": 3.2356, + "loss/crossentropy": 2.444900369644165, + "loss/hidden": 3.1625, + "loss/incoh": 0.0, + "loss/logits": 0.295256008207798, + "loss/reg": 0.0, + "step": 16200 + }, + { + "epoch": 0.10664473684210526, + "grad_norm": 2.15625, + "grad_norm_var": 0.27533137003580727, + "learning_rate": 0.0001, + "loss": 3.0994, + "loss/crossentropy": 2.5305007696151733, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.2888329938054085, + "loss/reg": 0.0, + "step": 16210 + }, + { + "epoch": 0.10671052631578948, + "grad_norm": 2.609375, + "grad_norm_var": 0.6374224344889323, + "learning_rate": 0.0001, + "loss": 3.1837, + "loss/crossentropy": 2.394989788532257, + "loss/hidden": 2.790625, + "loss/incoh": 0.0, + "loss/logits": 0.22691280096769334, + "loss/reg": 0.0, + "step": 16220 + }, + { + "epoch": 0.10677631578947368, + "grad_norm": 2.546875, + "grad_norm_var": 0.6025950113932291, + "learning_rate": 0.0001, + "loss": 3.1035, + "loss/crossentropy": 2.3761568784713747, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.25212556272745135, + "loss/reg": 0.0, + "step": 16230 + }, + { + "epoch": 0.10684210526315789, + "grad_norm": 2.390625, + "grad_norm_var": 0.07968343098958333, + "learning_rate": 0.0001, + "loss": 3.1507, + "loss/crossentropy": 2.1518751621246337, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.25603573620319364, + "loss/reg": 0.0, + "step": 16240 + }, + { + "epoch": 0.1069078947368421, + "grad_norm": 2.03125, + "grad_norm_var": 0.0736968994140625, + "learning_rate": 0.0001, + "loss": 3.0942, + "loss/crossentropy": 2.2170523405075073, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.2497616469860077, + "loss/reg": 0.0, + "step": 16250 + }, + { + "epoch": 0.10697368421052632, + "grad_norm": 2.671875, + "grad_norm_var": 0.15055338541666666, + "learning_rate": 0.0001, + "loss": 3.18, + "loss/crossentropy": 2.2126652002334595, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.2371530830860138, + "loss/reg": 0.0, + "step": 16260 + }, + { + "epoch": 0.10703947368421053, + "grad_norm": 2.359375, + "grad_norm_var": 0.1051666259765625, + "learning_rate": 0.0001, + "loss": 3.1175, + "loss/crossentropy": 2.5849027037620544, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.31244791448116305, + "loss/reg": 0.0, + "step": 16270 + }, + { + "epoch": 0.10710526315789473, + "grad_norm": 2.34375, + "grad_norm_var": 0.05666910807291667, + "learning_rate": 0.0001, + "loss": 3.2064, + "loss/crossentropy": 1.9444570660591125, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2180204689502716, + "loss/reg": 0.0, + "step": 16280 + }, + { + "epoch": 0.10717105263157894, + "grad_norm": 2.203125, + "grad_norm_var": 0.08806050618489583, + "learning_rate": 0.0001, + "loss": 3.0722, + "loss/crossentropy": 2.3104967713356017, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.2631675943732262, + "loss/reg": 0.0, + "step": 16290 + }, + { + "epoch": 0.10723684210526316, + "grad_norm": 2.28125, + "grad_norm_var": 0.018146769205729166, + "learning_rate": 0.0001, + "loss": 3.0587, + "loss/crossentropy": 2.1140322208404543, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.2300342008471489, + "loss/reg": 0.0, + "step": 16300 + }, + { + "epoch": 0.10730263157894737, + "grad_norm": 2.40625, + "grad_norm_var": 0.036774698893229166, + "learning_rate": 0.0001, + "loss": 3.2022, + "loss/crossentropy": 2.5614047765731813, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.2788904532790184, + "loss/reg": 0.0, + "step": 16310 + }, + { + "epoch": 0.10736842105263159, + "grad_norm": 2.734375, + "grad_norm_var": 0.21725972493489584, + "learning_rate": 0.0001, + "loss": 3.1511, + "loss/crossentropy": 1.8924081802368165, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.2040714904665947, + "loss/reg": 0.0, + "step": 16320 + }, + { + "epoch": 0.10743421052631578, + "grad_norm": 2.234375, + "grad_norm_var": 0.23220926920572918, + "learning_rate": 0.0001, + "loss": 3.1014, + "loss/crossentropy": 2.342508816719055, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.27500451505184176, + "loss/reg": 0.0, + "step": 16330 + }, + { + "epoch": 0.1075, + "grad_norm": 2.21875, + "grad_norm_var": 0.5662424723307292, + "learning_rate": 0.0001, + "loss": 3.2912, + "loss/crossentropy": 2.3959147095680238, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.2863708436489105, + "loss/reg": 0.0, + "step": 16340 + }, + { + "epoch": 0.10756578947368421, + "grad_norm": 2.078125, + "grad_norm_var": 0.11278889973958334, + "learning_rate": 0.0001, + "loss": 3.1532, + "loss/crossentropy": 2.166390228271484, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.23770884573459625, + "loss/reg": 0.0, + "step": 16350 + }, + { + "epoch": 0.10763157894736843, + "grad_norm": 2.265625, + "grad_norm_var": 0.06780776977539063, + "learning_rate": 0.0001, + "loss": 3.0399, + "loss/crossentropy": 2.3057939767837525, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.2797847852110863, + "loss/reg": 0.0, + "step": 16360 + }, + { + "epoch": 0.10769736842105262, + "grad_norm": 2.390625, + "grad_norm_var": 0.038826243082682295, + "learning_rate": 0.0001, + "loss": 3.12, + "loss/crossentropy": 2.265635335445404, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.23047662824392318, + "loss/reg": 0.0, + "step": 16370 + }, + { + "epoch": 0.10776315789473684, + "grad_norm": 2.296875, + "grad_norm_var": 0.08754781087239584, + "learning_rate": 0.0001, + "loss": 3.0758, + "loss/crossentropy": 2.6614980459213258, + "loss/hidden": 2.6609375, + "loss/incoh": 0.0, + "loss/logits": 0.23254811465740205, + "loss/reg": 0.0, + "step": 16380 + }, + { + "epoch": 0.10782894736842105, + "grad_norm": 3.390625, + "grad_norm_var": 0.14463882446289061, + "learning_rate": 0.0001, + "loss": 3.1805, + "loss/crossentropy": 2.419999623298645, + "loss/hidden": 3.0984375, + "loss/incoh": 0.0, + "loss/logits": 0.3057109400629997, + "loss/reg": 0.0, + "step": 16390 + }, + { + "epoch": 0.10789473684210527, + "grad_norm": 2.234375, + "grad_norm_var": 0.12823893229166666, + "learning_rate": 0.0001, + "loss": 3.1467, + "loss/crossentropy": 2.145538020133972, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.23782579749822616, + "loss/reg": 0.0, + "step": 16400 + }, + { + "epoch": 0.10796052631578948, + "grad_norm": 2.40625, + "grad_norm_var": 0.07416890462239584, + "learning_rate": 0.0001, + "loss": 3.1471, + "loss/crossentropy": 2.3262511491775513, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.23651919960975648, + "loss/reg": 0.0, + "step": 16410 + }, + { + "epoch": 0.10802631578947368, + "grad_norm": 2.453125, + "grad_norm_var": 0.052245076497395834, + "learning_rate": 0.0001, + "loss": 3.1573, + "loss/crossentropy": 2.446836495399475, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.22134366482496262, + "loss/reg": 0.0, + "step": 16420 + }, + { + "epoch": 0.10809210526315789, + "grad_norm": 2.90625, + "grad_norm_var": 0.12043355305989584, + "learning_rate": 0.0001, + "loss": 3.2359, + "loss/crossentropy": 2.3113945603370665, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.2689176544547081, + "loss/reg": 0.0, + "step": 16430 + }, + { + "epoch": 0.1081578947368421, + "grad_norm": 2.078125, + "grad_norm_var": 0.10204671223958334, + "learning_rate": 0.0001, + "loss": 3.1679, + "loss/crossentropy": 2.422752869129181, + "loss/hidden": 2.9703125, + "loss/incoh": 0.0, + "loss/logits": 0.306715852022171, + "loss/reg": 0.0, + "step": 16440 + }, + { + "epoch": 0.10822368421052632, + "grad_norm": 2.296875, + "grad_norm_var": 0.06880594889322916, + "learning_rate": 0.0001, + "loss": 3.147, + "loss/crossentropy": 2.048931634426117, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.2303798720240593, + "loss/reg": 0.0, + "step": 16450 + }, + { + "epoch": 0.10828947368421053, + "grad_norm": 2.234375, + "grad_norm_var": 0.15419514973958334, + "learning_rate": 0.0001, + "loss": 3.1877, + "loss/crossentropy": 2.68409343957901, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.25028681606054304, + "loss/reg": 0.0, + "step": 16460 + }, + { + "epoch": 0.10835526315789473, + "grad_norm": 2.3125, + "grad_norm_var": 0.09561258951822917, + "learning_rate": 0.0001, + "loss": 3.0881, + "loss/crossentropy": 2.3087923645973207, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.2732239991426468, + "loss/reg": 0.0, + "step": 16470 + }, + { + "epoch": 0.10842105263157895, + "grad_norm": 2.546875, + "grad_norm_var": 0.14474283854166667, + "learning_rate": 0.0001, + "loss": 3.1675, + "loss/crossentropy": 2.3147490501403807, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.25520786345005037, + "loss/reg": 0.0, + "step": 16480 + }, + { + "epoch": 0.10848684210526316, + "grad_norm": 2.21875, + "grad_norm_var": 0.12763671875, + "learning_rate": 0.0001, + "loss": 3.1948, + "loss/crossentropy": 2.4046076416969298, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.24635857343673706, + "loss/reg": 0.0, + "step": 16490 + }, + { + "epoch": 0.10855263157894737, + "grad_norm": 3.234375, + "grad_norm_var": 0.19675267537434896, + "learning_rate": 0.0001, + "loss": 3.134, + "loss/crossentropy": 2.1358281135559083, + "loss/hidden": 2.684375, + "loss/incoh": 0.0, + "loss/logits": 0.20830129384994506, + "loss/reg": 0.0, + "step": 16500 + }, + { + "epoch": 0.10861842105263157, + "grad_norm": 2.25, + "grad_norm_var": 0.20981216430664062, + "learning_rate": 0.0001, + "loss": 3.1161, + "loss/crossentropy": 2.2100176930427553, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.22965313643217086, + "loss/reg": 0.0, + "step": 16510 + }, + { + "epoch": 0.10868421052631579, + "grad_norm": 2.171875, + "grad_norm_var": 0.025813802083333334, + "learning_rate": 0.0001, + "loss": 3.1402, + "loss/crossentropy": 2.1080865144729612, + "loss/hidden": 2.9640625, + "loss/incoh": 0.0, + "loss/logits": 0.24372481554746628, + "loss/reg": 0.0, + "step": 16520 + }, + { + "epoch": 0.10875, + "grad_norm": 2.390625, + "grad_norm_var": 0.03843994140625, + "learning_rate": 0.0001, + "loss": 3.112, + "loss/crossentropy": 2.184442663192749, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.25090422928333284, + "loss/reg": 0.0, + "step": 16530 + }, + { + "epoch": 0.10881578947368421, + "grad_norm": 2.46875, + "grad_norm_var": 0.03850682576497396, + "learning_rate": 0.0001, + "loss": 3.0917, + "loss/crossentropy": 2.2426281213760375, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.26249563246965407, + "loss/reg": 0.0, + "step": 16540 + }, + { + "epoch": 0.10888157894736843, + "grad_norm": 2.34375, + "grad_norm_var": 0.037082672119140625, + "learning_rate": 0.0001, + "loss": 3.1533, + "loss/crossentropy": 2.4359527707099913, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.28122682869434357, + "loss/reg": 0.0, + "step": 16550 + }, + { + "epoch": 0.10894736842105263, + "grad_norm": 2.515625, + "grad_norm_var": 0.06422526041666667, + "learning_rate": 0.0001, + "loss": 3.1086, + "loss/crossentropy": 2.679885816574097, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.25820834636688234, + "loss/reg": 0.0, + "step": 16560 + }, + { + "epoch": 0.10901315789473684, + "grad_norm": 2.375, + "grad_norm_var": 0.033543904622395836, + "learning_rate": 0.0001, + "loss": 3.1226, + "loss/crossentropy": 2.230025511980057, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.24839217215776443, + "loss/reg": 0.0, + "step": 16570 + }, + { + "epoch": 0.10907894736842105, + "grad_norm": 2.171875, + "grad_norm_var": 0.08507486979166666, + "learning_rate": 0.0001, + "loss": 3.158, + "loss/crossentropy": 2.3745269417762755, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.22690069079399108, + "loss/reg": 0.0, + "step": 16580 + }, + { + "epoch": 0.10914473684210527, + "grad_norm": 2.03125, + "grad_norm_var": 0.07500712076822917, + "learning_rate": 0.0001, + "loss": 3.0464, + "loss/crossentropy": 2.3726358652114867, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.22285908311605454, + "loss/reg": 0.0, + "step": 16590 + }, + { + "epoch": 0.10921052631578948, + "grad_norm": 2.984375, + "grad_norm_var": 0.0905426025390625, + "learning_rate": 0.0001, + "loss": 3.0802, + "loss/crossentropy": 2.297460901737213, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.24248487651348113, + "loss/reg": 0.0, + "step": 16600 + }, + { + "epoch": 0.10927631578947368, + "grad_norm": 2.3125, + "grad_norm_var": 0.42377827962239584, + "learning_rate": 0.0001, + "loss": 3.1369, + "loss/crossentropy": 2.1492306351661683, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.25582308024168016, + "loss/reg": 0.0, + "step": 16610 + }, + { + "epoch": 0.1093421052631579, + "grad_norm": 2.703125, + "grad_norm_var": 0.38531494140625, + "learning_rate": 0.0001, + "loss": 3.1291, + "loss/crossentropy": 2.1944116175174715, + "loss/hidden": 3.025, + "loss/incoh": 0.0, + "loss/logits": 0.26006165742874143, + "loss/reg": 0.0, + "step": 16620 + }, + { + "epoch": 0.1094078947368421, + "grad_norm": 2.296875, + "grad_norm_var": 0.031305948893229164, + "learning_rate": 0.0001, + "loss": 3.0966, + "loss/crossentropy": 2.2580446600914, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.2571749180555344, + "loss/reg": 0.0, + "step": 16630 + }, + { + "epoch": 0.10947368421052632, + "grad_norm": 2.421875, + "grad_norm_var": 0.08385009765625, + "learning_rate": 0.0001, + "loss": 3.0758, + "loss/crossentropy": 2.711108660697937, + "loss/hidden": 2.790625, + "loss/incoh": 0.0, + "loss/logits": 0.23514284193515778, + "loss/reg": 0.0, + "step": 16640 + }, + { + "epoch": 0.10953947368421052, + "grad_norm": 2.53125, + "grad_norm_var": 0.39968973795572915, + "learning_rate": 0.0001, + "loss": 3.212, + "loss/crossentropy": 2.318574833869934, + "loss/hidden": 2.953125, + "loss/incoh": 0.0, + "loss/logits": 0.3364941954612732, + "loss/reg": 0.0, + "step": 16650 + }, + { + "epoch": 0.10960526315789473, + "grad_norm": 2.90625, + "grad_norm_var": 0.5502431233723958, + "learning_rate": 0.0001, + "loss": 3.1398, + "loss/crossentropy": 2.5402591228485107, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.2698328003287315, + "loss/reg": 0.0, + "step": 16660 + }, + { + "epoch": 0.10967105263157895, + "grad_norm": 2.578125, + "grad_norm_var": 0.7628651936848958, + "learning_rate": 0.0001, + "loss": 3.2125, + "loss/crossentropy": 2.4290089428424837, + "loss/hidden": 2.959375, + "loss/incoh": 0.0, + "loss/logits": 0.23459196090698242, + "loss/reg": 0.0, + "step": 16670 + }, + { + "epoch": 0.10973684210526316, + "grad_norm": 2.546875, + "grad_norm_var": 0.5333984375, + "learning_rate": 0.0001, + "loss": 3.1568, + "loss/crossentropy": 2.4715004682540895, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.2527153715491295, + "loss/reg": 0.0, + "step": 16680 + }, + { + "epoch": 0.10980263157894737, + "grad_norm": 2.421875, + "grad_norm_var": 0.01929931640625, + "learning_rate": 0.0001, + "loss": 3.1188, + "loss/crossentropy": 2.565124809741974, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.2764893934130669, + "loss/reg": 0.0, + "step": 16690 + }, + { + "epoch": 0.10986842105263157, + "grad_norm": 2.609375, + "grad_norm_var": 0.7143287658691406, + "learning_rate": 0.0001, + "loss": 3.1851, + "loss/crossentropy": 2.1823901176452636, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.2804734021425247, + "loss/reg": 0.0, + "step": 16700 + }, + { + "epoch": 0.10993421052631579, + "grad_norm": 3.75, + "grad_norm_var": 0.800158437093099, + "learning_rate": 0.0001, + "loss": 3.1599, + "loss/crossentropy": 2.339951229095459, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.23388542532920836, + "loss/reg": 0.0, + "step": 16710 + }, + { + "epoch": 0.11, + "grad_norm": 3.0625, + "grad_norm_var": 0.46857808430989584, + "learning_rate": 0.0001, + "loss": 3.168, + "loss/crossentropy": 2.5235843658447266, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.25846762359142306, + "loss/reg": 0.0, + "step": 16720 + }, + { + "epoch": 0.11006578947368421, + "grad_norm": 2.34375, + "grad_norm_var": 0.2997792561848958, + "learning_rate": 0.0001, + "loss": 3.199, + "loss/crossentropy": 2.4483086824417115, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.24846359938383103, + "loss/reg": 0.0, + "step": 16730 + }, + { + "epoch": 0.11013157894736843, + "grad_norm": 2.703125, + "grad_norm_var": 0.0768218994140625, + "learning_rate": 0.0001, + "loss": 3.1155, + "loss/crossentropy": 2.318689703941345, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.25379282981157303, + "loss/reg": 0.0, + "step": 16740 + }, + { + "epoch": 0.11019736842105263, + "grad_norm": 2.328125, + "grad_norm_var": 0.04757258097330729, + "learning_rate": 0.0001, + "loss": 3.0428, + "loss/crossentropy": 2.3445772767066955, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.26050496101379395, + "loss/reg": 0.0, + "step": 16750 + }, + { + "epoch": 0.11026315789473684, + "grad_norm": 3.265625, + "grad_norm_var": 0.09231669108072917, + "learning_rate": 0.0001, + "loss": 3.1426, + "loss/crossentropy": 2.2978576898574827, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.23892502784729003, + "loss/reg": 0.0, + "step": 16760 + }, + { + "epoch": 0.11032894736842105, + "grad_norm": 2.265625, + "grad_norm_var": 0.09435221354166666, + "learning_rate": 0.0001, + "loss": 3.0687, + "loss/crossentropy": 2.15471470952034, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.22209101915359497, + "loss/reg": 0.0, + "step": 16770 + }, + { + "epoch": 0.11039473684210527, + "grad_norm": 2.625, + "grad_norm_var": 0.084326171875, + "learning_rate": 0.0001, + "loss": 3.0934, + "loss/crossentropy": 2.0944029092788696, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.2732590340077877, + "loss/reg": 0.0, + "step": 16780 + }, + { + "epoch": 0.11046052631578947, + "grad_norm": 2.296875, + "grad_norm_var": 0.07172749837239584, + "learning_rate": 0.0001, + "loss": 3.196, + "loss/crossentropy": 2.4396159648895264, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.25805698037147523, + "loss/reg": 0.0, + "step": 16790 + }, + { + "epoch": 0.11052631578947368, + "grad_norm": 2.53125, + "grad_norm_var": 0.04521382649739583, + "learning_rate": 0.0001, + "loss": 3.1492, + "loss/crossentropy": 2.3519849300384523, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.24649366587400437, + "loss/reg": 0.0, + "step": 16800 + }, + { + "epoch": 0.1105921052631579, + "grad_norm": 2.90625, + "grad_norm_var": 0.06435445149739584, + "learning_rate": 0.0001, + "loss": 3.1366, + "loss/crossentropy": 2.2999491453170777, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.2394917294383049, + "loss/reg": 0.0, + "step": 16810 + }, + { + "epoch": 0.11065789473684211, + "grad_norm": 2.3125, + "grad_norm_var": 0.09391988118489583, + "learning_rate": 0.0001, + "loss": 3.1441, + "loss/crossentropy": 2.4930548071861267, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.2843811124563217, + "loss/reg": 0.0, + "step": 16820 + }, + { + "epoch": 0.11072368421052632, + "grad_norm": 2.3125, + "grad_norm_var": 0.23037007649739583, + "learning_rate": 0.0001, + "loss": 3.1744, + "loss/crossentropy": 2.327615487575531, + "loss/hidden": 3.0484375, + "loss/incoh": 0.0, + "loss/logits": 0.25857883393764497, + "loss/reg": 0.0, + "step": 16830 + }, + { + "epoch": 0.11078947368421052, + "grad_norm": 2.046875, + "grad_norm_var": 0.08505859375, + "learning_rate": 0.0001, + "loss": 3.1966, + "loss/crossentropy": 2.0340147018432617, + "loss/hidden": 2.7625, + "loss/incoh": 0.0, + "loss/logits": 0.22139777690172197, + "loss/reg": 0.0, + "step": 16840 + }, + { + "epoch": 0.11085526315789473, + "grad_norm": 2.359375, + "grad_norm_var": 0.04459228515625, + "learning_rate": 0.0001, + "loss": 3.0559, + "loss/crossentropy": 2.327996277809143, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.2415407806634903, + "loss/reg": 0.0, + "step": 16850 + }, + { + "epoch": 0.11092105263157895, + "grad_norm": 2.296875, + "grad_norm_var": 0.15735677083333333, + "learning_rate": 0.0001, + "loss": 3.1824, + "loss/crossentropy": 2.3824142098426817, + "loss/hidden": 3.0859375, + "loss/incoh": 0.0, + "loss/logits": 0.28444231003522874, + "loss/reg": 0.0, + "step": 16860 + }, + { + "epoch": 0.11098684210526316, + "grad_norm": 2.453125, + "grad_norm_var": 0.2694498697916667, + "learning_rate": 0.0001, + "loss": 3.1079, + "loss/crossentropy": 2.4686101198196413, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.23947114795446395, + "loss/reg": 0.0, + "step": 16870 + }, + { + "epoch": 0.11105263157894738, + "grad_norm": 2.53125, + "grad_norm_var": 0.2833811442057292, + "learning_rate": 0.0001, + "loss": 3.0544, + "loss/crossentropy": 2.2384608387947083, + "loss/hidden": 3.23125, + "loss/incoh": 0.0, + "loss/logits": 0.29755171537399294, + "loss/reg": 0.0, + "step": 16880 + }, + { + "epoch": 0.11111842105263157, + "grad_norm": 2.484375, + "grad_norm_var": 0.11591389973958334, + "learning_rate": 0.0001, + "loss": 3.1316, + "loss/crossentropy": 2.065184140205383, + "loss/hidden": 2.7421875, + "loss/incoh": 0.0, + "loss/logits": 0.22740320414304732, + "loss/reg": 0.0, + "step": 16890 + }, + { + "epoch": 0.11118421052631579, + "grad_norm": 2.46875, + "grad_norm_var": 0.14423421223958333, + "learning_rate": 0.0001, + "loss": 3.1867, + "loss/crossentropy": 2.241389238834381, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.2434727743268013, + "loss/reg": 0.0, + "step": 16900 + }, + { + "epoch": 0.11125, + "grad_norm": 2.390625, + "grad_norm_var": 0.14661458333333333, + "learning_rate": 0.0001, + "loss": 3.1063, + "loss/crossentropy": 2.1571604132652284, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.21824948787689208, + "loss/reg": 0.0, + "step": 16910 + }, + { + "epoch": 0.11131578947368422, + "grad_norm": 2.1875, + "grad_norm_var": 0.9939605712890625, + "learning_rate": 0.0001, + "loss": 3.2191, + "loss/crossentropy": 2.4117300748825072, + "loss/hidden": 3.2203125, + "loss/incoh": 0.0, + "loss/logits": 0.33291524201631545, + "loss/reg": 0.0, + "step": 16920 + }, + { + "epoch": 0.11138157894736841, + "grad_norm": 2.46875, + "grad_norm_var": 0.9513580322265625, + "learning_rate": 0.0001, + "loss": 3.2001, + "loss/crossentropy": 2.490310883522034, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.2631781131029129, + "loss/reg": 0.0, + "step": 16930 + }, + { + "epoch": 0.11144736842105263, + "grad_norm": 2.421875, + "grad_norm_var": 0.12056884765625, + "learning_rate": 0.0001, + "loss": 3.203, + "loss/crossentropy": 2.3007746815681456, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.2449628993868828, + "loss/reg": 0.0, + "step": 16940 + }, + { + "epoch": 0.11151315789473684, + "grad_norm": 2.515625, + "grad_norm_var": 0.11777242024739583, + "learning_rate": 0.0001, + "loss": 3.1245, + "loss/crossentropy": 2.3470576763153077, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.24132359698414801, + "loss/reg": 0.0, + "step": 16950 + }, + { + "epoch": 0.11157894736842106, + "grad_norm": 2.265625, + "grad_norm_var": 0.05214742024739583, + "learning_rate": 0.0001, + "loss": 3.115, + "loss/crossentropy": 2.4354201793670653, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.22888378351926802, + "loss/reg": 0.0, + "step": 16960 + }, + { + "epoch": 0.11164473684210527, + "grad_norm": 2.3125, + "grad_norm_var": 0.07139383951822917, + "learning_rate": 0.0001, + "loss": 3.1537, + "loss/crossentropy": 2.1988754749298094, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.22949053347110748, + "loss/reg": 0.0, + "step": 16970 + }, + { + "epoch": 0.11171052631578947, + "grad_norm": 2.375, + "grad_norm_var": 0.06669921875, + "learning_rate": 0.0001, + "loss": 3.113, + "loss/crossentropy": 2.2436501502990724, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.26571269184350965, + "loss/reg": 0.0, + "step": 16980 + }, + { + "epoch": 0.11177631578947368, + "grad_norm": 2.3125, + "grad_norm_var": 0.1141998291015625, + "learning_rate": 0.0001, + "loss": 3.2373, + "loss/crossentropy": 2.0406009197235107, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.2788648784160614, + "loss/reg": 0.0, + "step": 16990 + }, + { + "epoch": 0.1118421052631579, + "grad_norm": 3.5, + "grad_norm_var": 0.13699544270833333, + "learning_rate": 0.0001, + "loss": 3.1371, + "loss/crossentropy": 2.282533049583435, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.27586452960968016, + "loss/reg": 0.0, + "step": 17000 + }, + { + "epoch": 0.11190789473684211, + "grad_norm": 2.390625, + "grad_norm_var": 0.13681233723958333, + "learning_rate": 0.0001, + "loss": 3.1898, + "loss/crossentropy": 2.192341995239258, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.2728851273655891, + "loss/reg": 0.0, + "step": 17010 + }, + { + "epoch": 0.11197368421052632, + "grad_norm": 2.9375, + "grad_norm_var": 0.08626302083333333, + "learning_rate": 0.0001, + "loss": 3.1208, + "loss/crossentropy": 2.036851680278778, + "loss/hidden": 3.090625, + "loss/incoh": 0.0, + "loss/logits": 0.29492041319608686, + "loss/reg": 0.0, + "step": 17020 + }, + { + "epoch": 0.11203947368421052, + "grad_norm": 3.171875, + "grad_norm_var": 0.38601888020833336, + "learning_rate": 0.0001, + "loss": 3.1351, + "loss/crossentropy": 2.013105309009552, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.21977066546678542, + "loss/reg": 0.0, + "step": 17030 + }, + { + "epoch": 0.11210526315789474, + "grad_norm": 2.515625, + "grad_norm_var": 0.515862782796224, + "learning_rate": 0.0001, + "loss": 3.0822, + "loss/crossentropy": 2.2523082733154296, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.23816863894462587, + "loss/reg": 0.0, + "step": 17040 + }, + { + "epoch": 0.11217105263157895, + "grad_norm": 2.4375, + "grad_norm_var": 0.13596598307291666, + "learning_rate": 0.0001, + "loss": 3.0894, + "loss/crossentropy": 2.474255657196045, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.28221793919801713, + "loss/reg": 0.0, + "step": 17050 + }, + { + "epoch": 0.11223684210526316, + "grad_norm": 2.375, + "grad_norm_var": 2.8374436804564963e+17, + "learning_rate": 0.0001, + "loss": 3.2437, + "loss/crossentropy": 2.5942065715789795, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.26605169773101806, + "loss/reg": 0.0, + "step": 17060 + }, + { + "epoch": 0.11230263157894736, + "grad_norm": 1.984375, + "grad_norm_var": 0.05279541015625, + "learning_rate": 0.0001, + "loss": 3.0525, + "loss/crossentropy": 2.034271013736725, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.2154896892607212, + "loss/reg": 0.0, + "step": 17070 + }, + { + "epoch": 0.11236842105263158, + "grad_norm": 2.25, + "grad_norm_var": 0.050902303059895834, + "learning_rate": 0.0001, + "loss": 3.1033, + "loss/crossentropy": 2.229046678543091, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.22233688235282897, + "loss/reg": 0.0, + "step": 17080 + }, + { + "epoch": 0.11243421052631579, + "grad_norm": 2.234375, + "grad_norm_var": 0.04163309733072917, + "learning_rate": 0.0001, + "loss": 3.1293, + "loss/crossentropy": 2.4799925684928894, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.2588403090834618, + "loss/reg": 0.0, + "step": 17090 + }, + { + "epoch": 0.1125, + "grad_norm": 2.1875, + "grad_norm_var": 0.1295074462890625, + "learning_rate": 0.0001, + "loss": 3.0953, + "loss/crossentropy": 2.308819645643234, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.23669061064720154, + "loss/reg": 0.0, + "step": 17100 + }, + { + "epoch": 0.11256578947368422, + "grad_norm": 2.140625, + "grad_norm_var": 0.7806142171223959, + "learning_rate": 0.0001, + "loss": 3.1353, + "loss/crossentropy": 2.3123676419258117, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.29565141499042513, + "loss/reg": 0.0, + "step": 17110 + }, + { + "epoch": 0.11263157894736842, + "grad_norm": 2.453125, + "grad_norm_var": 0.08242085774739584, + "learning_rate": 0.0001, + "loss": 3.0662, + "loss/crossentropy": 2.262893891334534, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.23198095709085464, + "loss/reg": 0.0, + "step": 17120 + }, + { + "epoch": 0.11269736842105263, + "grad_norm": 2.265625, + "grad_norm_var": 0.1591949462890625, + "learning_rate": 0.0001, + "loss": 3.0855, + "loss/crossentropy": 2.1816434502601623, + "loss/hidden": 3.01875, + "loss/incoh": 0.0, + "loss/logits": 0.2444119155406952, + "loss/reg": 0.0, + "step": 17130 + }, + { + "epoch": 0.11276315789473684, + "grad_norm": 2.125, + "grad_norm_var": 0.14752197265625, + "learning_rate": 0.0001, + "loss": 3.1414, + "loss/crossentropy": 2.4847304582595826, + "loss/hidden": 2.8203125, + "loss/incoh": 0.0, + "loss/logits": 0.26482110619544985, + "loss/reg": 0.0, + "step": 17140 + }, + { + "epoch": 0.11282894736842106, + "grad_norm": 2.109375, + "grad_norm_var": 0.09546610514322916, + "learning_rate": 0.0001, + "loss": 3.0923, + "loss/crossentropy": 2.5822364687919617, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.26294154226779937, + "loss/reg": 0.0, + "step": 17150 + }, + { + "epoch": 0.11289473684210527, + "grad_norm": 3.796875, + "grad_norm_var": 0.19820963541666667, + "learning_rate": 0.0001, + "loss": 3.0656, + "loss/crossentropy": 2.3146368622779847, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.23589982092380524, + "loss/reg": 0.0, + "step": 17160 + }, + { + "epoch": 0.11296052631578947, + "grad_norm": 2.0625, + "grad_norm_var": 1.0957618713378907, + "learning_rate": 0.0001, + "loss": 3.1693, + "loss/crossentropy": 2.355622184276581, + "loss/hidden": 2.9609375, + "loss/incoh": 0.0, + "loss/logits": 0.2783960849046707, + "loss/reg": 0.0, + "step": 17170 + }, + { + "epoch": 0.11302631578947368, + "grad_norm": 2.515625, + "grad_norm_var": 1.0233965555826823, + "learning_rate": 0.0001, + "loss": 3.1617, + "loss/crossentropy": 2.3866767287254333, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.25886829346418383, + "loss/reg": 0.0, + "step": 17180 + }, + { + "epoch": 0.1130921052631579, + "grad_norm": 2.25, + "grad_norm_var": 0.08580322265625, + "learning_rate": 0.0001, + "loss": 3.1277, + "loss/crossentropy": 2.095993900299072, + "loss/hidden": 2.6625, + "loss/incoh": 0.0, + "loss/logits": 0.1956949472427368, + "loss/reg": 0.0, + "step": 17190 + }, + { + "epoch": 0.11315789473684211, + "grad_norm": 2.375, + "grad_norm_var": 0.09143778483072916, + "learning_rate": 0.0001, + "loss": 3.0835, + "loss/crossentropy": 2.3784751892089844, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.26899502128362657, + "loss/reg": 0.0, + "step": 17200 + }, + { + "epoch": 0.11322368421052631, + "grad_norm": 2.3125, + "grad_norm_var": 0.027274576822916667, + "learning_rate": 0.0001, + "loss": 3.1617, + "loss/crossentropy": 2.505306875705719, + "loss/hidden": 2.953125, + "loss/incoh": 0.0, + "loss/logits": 0.2929636001586914, + "loss/reg": 0.0, + "step": 17210 + }, + { + "epoch": 0.11328947368421052, + "grad_norm": 2.296875, + "grad_norm_var": 0.06923421223958333, + "learning_rate": 0.0001, + "loss": 3.12, + "loss/crossentropy": 2.1889270186424254, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.2669166073203087, + "loss/reg": 0.0, + "step": 17220 + }, + { + "epoch": 0.11335526315789474, + "grad_norm": 2.3125, + "grad_norm_var": 0.17213312784830728, + "learning_rate": 0.0001, + "loss": 3.0681, + "loss/crossentropy": 2.2528875708580016, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.23936019986867904, + "loss/reg": 0.0, + "step": 17230 + }, + { + "epoch": 0.11342105263157895, + "grad_norm": 2.546875, + "grad_norm_var": 0.12957763671875, + "learning_rate": 0.0001, + "loss": 3.119, + "loss/crossentropy": 2.2457367897033693, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.23305099308490754, + "loss/reg": 0.0, + "step": 17240 + }, + { + "epoch": 0.11348684210526316, + "grad_norm": 2.578125, + "grad_norm_var": 0.07983779907226562, + "learning_rate": 0.0001, + "loss": 3.0851, + "loss/crossentropy": 2.4546321392059327, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.24581009149551392, + "loss/reg": 0.0, + "step": 17250 + }, + { + "epoch": 0.11355263157894736, + "grad_norm": 4.1875, + "grad_norm_var": 0.2813791910807292, + "learning_rate": 0.0001, + "loss": 3.171, + "loss/crossentropy": 1.9349523544311524, + "loss/hidden": 3.0453125, + "loss/incoh": 0.0, + "loss/logits": 0.31686680018901825, + "loss/reg": 0.0, + "step": 17260 + }, + { + "epoch": 0.11361842105263158, + "grad_norm": 2.59375, + "grad_norm_var": 1.4602701822916666, + "learning_rate": 0.0001, + "loss": 3.1656, + "loss/crossentropy": 2.2751689314842225, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.2878335312008858, + "loss/reg": 0.0, + "step": 17270 + }, + { + "epoch": 0.11368421052631579, + "grad_norm": 2.765625, + "grad_norm_var": 0.5687662760416666, + "learning_rate": 0.0001, + "loss": 3.0772, + "loss/crossentropy": 2.4036699175834655, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.24233294725418092, + "loss/reg": 0.0, + "step": 17280 + }, + { + "epoch": 0.11375, + "grad_norm": 2.96875, + "grad_norm_var": 0.06691792805989584, + "learning_rate": 0.0001, + "loss": 3.1161, + "loss/crossentropy": 2.4056329488754273, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.25776179879903793, + "loss/reg": 0.0, + "step": 17290 + }, + { + "epoch": 0.11381578947368422, + "grad_norm": 2.203125, + "grad_norm_var": 0.12681884765625, + "learning_rate": 0.0001, + "loss": 3.2097, + "loss/crossentropy": 2.4936662912368774, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.27261604368686676, + "loss/reg": 0.0, + "step": 17300 + }, + { + "epoch": 0.11388157894736842, + "grad_norm": 2.21875, + "grad_norm_var": 0.08603108723958333, + "learning_rate": 0.0001, + "loss": 3.1596, + "loss/crossentropy": 2.2334436774253845, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.27515108734369276, + "loss/reg": 0.0, + "step": 17310 + }, + { + "epoch": 0.11394736842105263, + "grad_norm": 2.390625, + "grad_norm_var": 0.06290690104166667, + "learning_rate": 0.0001, + "loss": 3.1322, + "loss/crossentropy": 2.3009248971939087, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.2514318063855171, + "loss/reg": 0.0, + "step": 17320 + }, + { + "epoch": 0.11401315789473684, + "grad_norm": 2.953125, + "grad_norm_var": 0.19062093098958333, + "learning_rate": 0.0001, + "loss": 3.2222, + "loss/crossentropy": 2.2115599513053894, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.24124337881803512, + "loss/reg": 0.0, + "step": 17330 + }, + { + "epoch": 0.11407894736842106, + "grad_norm": 2.34375, + "grad_norm_var": 0.4003245035807292, + "learning_rate": 0.0001, + "loss": 3.1404, + "loss/crossentropy": 2.2533546447753907, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.25740948766469957, + "loss/reg": 0.0, + "step": 17340 + }, + { + "epoch": 0.11414473684210526, + "grad_norm": 2.3125, + "grad_norm_var": 0.06924540201822917, + "learning_rate": 0.0001, + "loss": 3.1509, + "loss/crossentropy": 2.4188032507896424, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.27805479913949965, + "loss/reg": 0.0, + "step": 17350 + }, + { + "epoch": 0.11421052631578947, + "grad_norm": 2.96875, + "grad_norm_var": 0.6562489827473958, + "learning_rate": 0.0001, + "loss": 3.2795, + "loss/crossentropy": 2.1555517435073854, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.28488118648529054, + "loss/reg": 0.0, + "step": 17360 + }, + { + "epoch": 0.11427631578947368, + "grad_norm": 2.140625, + "grad_norm_var": 0.62613525390625, + "learning_rate": 0.0001, + "loss": 3.1637, + "loss/crossentropy": 2.351659083366394, + "loss/hidden": 2.959375, + "loss/incoh": 0.0, + "loss/logits": 0.2672264903783798, + "loss/reg": 0.0, + "step": 17370 + }, + { + "epoch": 0.1143421052631579, + "grad_norm": 2.09375, + "grad_norm_var": 0.08678385416666666, + "learning_rate": 0.0001, + "loss": 3.1242, + "loss/crossentropy": 2.2498091578483583, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.24537185132503508, + "loss/reg": 0.0, + "step": 17380 + }, + { + "epoch": 0.11440789473684211, + "grad_norm": 2.53125, + "grad_norm_var": 0.17229817708333334, + "learning_rate": 0.0001, + "loss": 3.1776, + "loss/crossentropy": 2.401424062252045, + "loss/hidden": 3.0328125, + "loss/incoh": 0.0, + "loss/logits": 0.3059739723801613, + "loss/reg": 0.0, + "step": 17390 + }, + { + "epoch": 0.11447368421052631, + "grad_norm": 2.125, + "grad_norm_var": 0.1625885009765625, + "learning_rate": 0.0001, + "loss": 3.1477, + "loss/crossentropy": 2.292530918121338, + "loss/hidden": 3.09375, + "loss/incoh": 0.0, + "loss/logits": 0.2784269869327545, + "loss/reg": 0.0, + "step": 17400 + }, + { + "epoch": 0.11453947368421052, + "grad_norm": 2.375, + "grad_norm_var": 0.09892171223958333, + "learning_rate": 0.0001, + "loss": 3.1288, + "loss/crossentropy": 2.464983069896698, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.2796273499727249, + "loss/reg": 0.0, + "step": 17410 + }, + { + "epoch": 0.11460526315789474, + "grad_norm": 2.921875, + "grad_norm_var": 0.09850972493489583, + "learning_rate": 0.0001, + "loss": 3.1289, + "loss/crossentropy": 2.1951366662979126, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.21696581244468688, + "loss/reg": 0.0, + "step": 17420 + }, + { + "epoch": 0.11467105263157895, + "grad_norm": 2.53125, + "grad_norm_var": 0.06843159993489584, + "learning_rate": 0.0001, + "loss": 3.1597, + "loss/crossentropy": 2.3925849914550783, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.2907762542366982, + "loss/reg": 0.0, + "step": 17430 + }, + { + "epoch": 0.11473684210526315, + "grad_norm": 2.21875, + "grad_norm_var": 0.21046549479166668, + "learning_rate": 0.0001, + "loss": 3.1634, + "loss/crossentropy": 2.464526927471161, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.2937367483973503, + "loss/reg": 0.0, + "step": 17440 + }, + { + "epoch": 0.11480263157894736, + "grad_norm": 2.203125, + "grad_norm_var": 0.047526041666666664, + "learning_rate": 0.0001, + "loss": 3.1144, + "loss/crossentropy": 2.072342965006828, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.2236027292907238, + "loss/reg": 0.0, + "step": 17450 + }, + { + "epoch": 0.11486842105263158, + "grad_norm": 2.296875, + "grad_norm_var": 0.05799051920572917, + "learning_rate": 0.0001, + "loss": 3.0873, + "loss/crossentropy": 2.240265655517578, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.23422597646713256, + "loss/reg": 0.0, + "step": 17460 + }, + { + "epoch": 0.11493421052631579, + "grad_norm": 2.078125, + "grad_norm_var": 0.10058186848958334, + "learning_rate": 0.0001, + "loss": 3.127, + "loss/crossentropy": 2.3168819308280946, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.2430237874388695, + "loss/reg": 0.0, + "step": 17470 + }, + { + "epoch": 0.115, + "grad_norm": 2.625, + "grad_norm_var": 0.11015523274739583, + "learning_rate": 0.0001, + "loss": 3.0618, + "loss/crossentropy": 2.2366016268730164, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.23987135738134385, + "loss/reg": 0.0, + "step": 17480 + }, + { + "epoch": 0.1150657894736842, + "grad_norm": 2.609375, + "grad_norm_var": 0.09355061848958333, + "learning_rate": 0.0001, + "loss": 3.1875, + "loss/crossentropy": 2.1879064559936525, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.24197361022233962, + "loss/reg": 0.0, + "step": 17490 + }, + { + "epoch": 0.11513157894736842, + "grad_norm": 2.421875, + "grad_norm_var": 7.735667928059896, + "learning_rate": 0.0001, + "loss": 3.2427, + "loss/crossentropy": 2.149078643321991, + "loss/hidden": 3.009375, + "loss/incoh": 0.0, + "loss/logits": 0.24705443456768988, + "loss/reg": 0.0, + "step": 17500 + }, + { + "epoch": 0.11519736842105263, + "grad_norm": 2.265625, + "grad_norm_var": 7.899019368489584, + "learning_rate": 0.0001, + "loss": 3.1837, + "loss/crossentropy": 2.3580456256866453, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.23221501410007478, + "loss/reg": 0.0, + "step": 17510 + }, + { + "epoch": 0.11526315789473685, + "grad_norm": 2.6875, + "grad_norm_var": 0.2678456624348958, + "learning_rate": 0.0001, + "loss": 3.079, + "loss/crossentropy": 2.414518404006958, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.2647902578115463, + "loss/reg": 0.0, + "step": 17520 + }, + { + "epoch": 0.11532894736842106, + "grad_norm": 3.125, + "grad_norm_var": 0.14198811848958334, + "learning_rate": 0.0001, + "loss": 3.1613, + "loss/crossentropy": 2.416664445400238, + "loss/hidden": 2.7625, + "loss/incoh": 0.0, + "loss/logits": 0.22963229715824127, + "loss/reg": 0.0, + "step": 17530 + }, + { + "epoch": 0.11539473684210526, + "grad_norm": 2.53125, + "grad_norm_var": 0.7926177978515625, + "learning_rate": 0.0001, + "loss": 3.1532, + "loss/crossentropy": 2.1853476405143737, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.2286560907959938, + "loss/reg": 0.0, + "step": 17540 + }, + { + "epoch": 0.11546052631578947, + "grad_norm": 2.859375, + "grad_norm_var": 0.8031534830729167, + "learning_rate": 0.0001, + "loss": 3.1557, + "loss/crossentropy": 2.6209848165512084, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.25361735969781873, + "loss/reg": 0.0, + "step": 17550 + }, + { + "epoch": 0.11552631578947369, + "grad_norm": 2.4375, + "grad_norm_var": 0.09949544270833334, + "learning_rate": 0.0001, + "loss": 3.2229, + "loss/crossentropy": 2.3588363409042357, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.30205955654382705, + "loss/reg": 0.0, + "step": 17560 + }, + { + "epoch": 0.1155921052631579, + "grad_norm": 1.9765625, + "grad_norm_var": 0.07576471964518229, + "learning_rate": 0.0001, + "loss": 3.174, + "loss/crossentropy": 2.1386860758066177, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.24600727967917918, + "loss/reg": 0.0, + "step": 17570 + }, + { + "epoch": 0.1156578947368421, + "grad_norm": 2.171875, + "grad_norm_var": 0.07134577433268229, + "learning_rate": 0.0001, + "loss": 3.1143, + "loss/crossentropy": 2.5142947912216185, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.2483747810125351, + "loss/reg": 0.0, + "step": 17580 + }, + { + "epoch": 0.11572368421052631, + "grad_norm": 2.46875, + "grad_norm_var": 0.7249959309895834, + "learning_rate": 0.0001, + "loss": 3.1598, + "loss/crossentropy": 2.339717888832092, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.2511411294341087, + "loss/reg": 0.0, + "step": 17590 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 2.671875, + "grad_norm_var": 0.9438435872395833, + "learning_rate": 0.0001, + "loss": 3.0902, + "loss/crossentropy": 2.3273319840431212, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.321346378326416, + "loss/reg": 0.0, + "step": 17600 + }, + { + "epoch": 0.11585526315789474, + "grad_norm": 2.484375, + "grad_norm_var": 0.93150634765625, + "learning_rate": 0.0001, + "loss": 3.1392, + "loss/crossentropy": 2.0881393194198608, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.2542056769132614, + "loss/reg": 0.0, + "step": 17610 + }, + { + "epoch": 0.11592105263157895, + "grad_norm": 2.25, + "grad_norm_var": 0.05450846354166667, + "learning_rate": 0.0001, + "loss": 3.153, + "loss/crossentropy": 2.200914776325226, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.2596256859600544, + "loss/reg": 0.0, + "step": 17620 + }, + { + "epoch": 0.11598684210526315, + "grad_norm": 2.34375, + "grad_norm_var": 0.06134440104166667, + "learning_rate": 0.0001, + "loss": 3.1138, + "loss/crossentropy": 2.391866648197174, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.24801254719495774, + "loss/reg": 0.0, + "step": 17630 + }, + { + "epoch": 0.11605263157894737, + "grad_norm": 2.5625, + "grad_norm_var": 0.19595947265625, + "learning_rate": 0.0001, + "loss": 3.1197, + "loss/crossentropy": 2.3731101989746093, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.2734734550118446, + "loss/reg": 0.0, + "step": 17640 + }, + { + "epoch": 0.11611842105263158, + "grad_norm": 2.3125, + "grad_norm_var": 0.06402587890625, + "learning_rate": 0.0001, + "loss": 3.186, + "loss/crossentropy": 2.510706162452698, + "loss/hidden": 3.1234375, + "loss/incoh": 0.0, + "loss/logits": 0.3243511900305748, + "loss/reg": 0.0, + "step": 17650 + }, + { + "epoch": 0.11618421052631579, + "grad_norm": 2.53125, + "grad_norm_var": 0.08347066243489583, + "learning_rate": 0.0001, + "loss": 3.1433, + "loss/crossentropy": 2.3694114327430724, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.23816563338041305, + "loss/reg": 0.0, + "step": 17660 + }, + { + "epoch": 0.11625, + "grad_norm": 2.515625, + "grad_norm_var": 0.07176106770833333, + "learning_rate": 0.0001, + "loss": 3.117, + "loss/crossentropy": 2.3822181940078737, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.2559090554714203, + "loss/reg": 0.0, + "step": 17670 + }, + { + "epoch": 0.1163157894736842, + "grad_norm": 2.453125, + "grad_norm_var": 0.06597391764322917, + "learning_rate": 0.0001, + "loss": 3.167, + "loss/crossentropy": 2.236483609676361, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.24177847057580948, + "loss/reg": 0.0, + "step": 17680 + }, + { + "epoch": 0.11638157894736842, + "grad_norm": 2.140625, + "grad_norm_var": 0.05454813639322917, + "learning_rate": 0.0001, + "loss": 3.1207, + "loss/crossentropy": 2.386284852027893, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.23972099274396896, + "loss/reg": 0.0, + "step": 17690 + }, + { + "epoch": 0.11644736842105263, + "grad_norm": 2.546875, + "grad_norm_var": 0.029325358072916665, + "learning_rate": 0.0001, + "loss": 3.1551, + "loss/crossentropy": 2.3102688074111937, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.25639690458774567, + "loss/reg": 0.0, + "step": 17700 + }, + { + "epoch": 0.11651315789473685, + "grad_norm": 2.390625, + "grad_norm_var": 0.1159576416015625, + "learning_rate": 0.0001, + "loss": 3.1227, + "loss/crossentropy": 2.164554476737976, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.2502566508948803, + "loss/reg": 0.0, + "step": 17710 + }, + { + "epoch": 0.11657894736842105, + "grad_norm": 2.453125, + "grad_norm_var": 0.16982421875, + "learning_rate": 0.0001, + "loss": 3.2736, + "loss/crossentropy": 2.166772598028183, + "loss/hidden": 3.0484375, + "loss/incoh": 0.0, + "loss/logits": 0.24817814379930497, + "loss/reg": 0.0, + "step": 17720 + }, + { + "epoch": 0.11664473684210526, + "grad_norm": 2.75, + "grad_norm_var": 0.13912353515625, + "learning_rate": 0.0001, + "loss": 3.1985, + "loss/crossentropy": 2.1537662744522095, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.21837295591831207, + "loss/reg": 0.0, + "step": 17730 + }, + { + "epoch": 0.11671052631578947, + "grad_norm": 2.046875, + "grad_norm_var": 0.09753392537434896, + "learning_rate": 0.0001, + "loss": 3.1066, + "loss/crossentropy": 2.106878674030304, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.2489032343029976, + "loss/reg": 0.0, + "step": 17740 + }, + { + "epoch": 0.11677631578947369, + "grad_norm": 2.5625, + "grad_norm_var": 0.08820699055989584, + "learning_rate": 0.0001, + "loss": 3.1647, + "loss/crossentropy": 2.186918389797211, + "loss/hidden": 3.1171875, + "loss/incoh": 0.0, + "loss/logits": 0.26816043853759763, + "loss/reg": 0.0, + "step": 17750 + }, + { + "epoch": 0.1168421052631579, + "grad_norm": 2.453125, + "grad_norm_var": 0.14002176920572917, + "learning_rate": 0.0001, + "loss": 3.2041, + "loss/crossentropy": 2.130704140663147, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.26593275666236876, + "loss/reg": 0.0, + "step": 17760 + }, + { + "epoch": 0.1169078947368421, + "grad_norm": 3.6875, + "grad_norm_var": 0.16592508951822918, + "learning_rate": 0.0001, + "loss": 3.0748, + "loss/crossentropy": 2.4294887661933897, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.26008370518684387, + "loss/reg": 0.0, + "step": 17770 + }, + { + "epoch": 0.11697368421052631, + "grad_norm": 2.109375, + "grad_norm_var": 0.4354156494140625, + "learning_rate": 0.0001, + "loss": 3.2129, + "loss/crossentropy": 2.0633517861366273, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.2134759709239006, + "loss/reg": 0.0, + "step": 17780 + }, + { + "epoch": 0.11703947368421053, + "grad_norm": 2.5625, + "grad_norm_var": 0.114306640625, + "learning_rate": 0.0001, + "loss": 3.1969, + "loss/crossentropy": 2.470662558078766, + "loss/hidden": 2.9390625, + "loss/incoh": 0.0, + "loss/logits": 0.32178338468074796, + "loss/reg": 0.0, + "step": 17790 + }, + { + "epoch": 0.11710526315789474, + "grad_norm": 2.46875, + "grad_norm_var": 0.1042388916015625, + "learning_rate": 0.0001, + "loss": 3.132, + "loss/crossentropy": 2.180730104446411, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.2408156231045723, + "loss/reg": 0.0, + "step": 17800 + }, + { + "epoch": 0.11717105263157895, + "grad_norm": 2.5, + "grad_norm_var": 0.08092041015625, + "learning_rate": 0.0001, + "loss": 3.1107, + "loss/crossentropy": 2.3672548174858092, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.2295123293995857, + "loss/reg": 0.0, + "step": 17810 + }, + { + "epoch": 0.11723684210526315, + "grad_norm": 2.40625, + "grad_norm_var": 0.029108683268229168, + "learning_rate": 0.0001, + "loss": 3.1063, + "loss/crossentropy": 2.486844336986542, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.2817386701703072, + "loss/reg": 0.0, + "step": 17820 + }, + { + "epoch": 0.11730263157894737, + "grad_norm": 2.609375, + "grad_norm_var": 3.6479156901264755e+17, + "learning_rate": 0.0001, + "loss": 3.2511, + "loss/crossentropy": 2.3458006739616395, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.25344357788562777, + "loss/reg": 0.0, + "step": 17830 + }, + { + "epoch": 0.11736842105263158, + "grad_norm": 2.75, + "grad_norm_var": 1.1187174479166666, + "learning_rate": 0.0001, + "loss": 3.1952, + "loss/crossentropy": 2.3346620917320253, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.24867950975894929, + "loss/reg": 0.0, + "step": 17840 + }, + { + "epoch": 0.1174342105263158, + "grad_norm": 2.3125, + "grad_norm_var": 0.05538736979166667, + "learning_rate": 0.0001, + "loss": 3.1781, + "loss/crossentropy": 2.1594719171524046, + "loss/hidden": 2.9390625, + "loss/incoh": 0.0, + "loss/logits": 0.32041922956705093, + "loss/reg": 0.0, + "step": 17850 + }, + { + "epoch": 0.1175, + "grad_norm": 2.3125, + "grad_norm_var": 0.13107096354166667, + "learning_rate": 0.0001, + "loss": 3.1675, + "loss/crossentropy": 2.335154187679291, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.23140522688627244, + "loss/reg": 0.0, + "step": 17860 + }, + { + "epoch": 0.1175657894736842, + "grad_norm": 2.421875, + "grad_norm_var": 0.061777496337890626, + "learning_rate": 0.0001, + "loss": 3.1431, + "loss/crossentropy": 2.448777401447296, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.24006084948778153, + "loss/reg": 0.0, + "step": 17870 + }, + { + "epoch": 0.11763157894736842, + "grad_norm": 2.25, + "grad_norm_var": 0.14251302083333334, + "learning_rate": 0.0001, + "loss": 3.2176, + "loss/crossentropy": 2.2756917595863344, + "loss/hidden": 3.0359375, + "loss/incoh": 0.0, + "loss/logits": 0.30769334733486176, + "loss/reg": 0.0, + "step": 17880 + }, + { + "epoch": 0.11769736842105263, + "grad_norm": 1.921875, + "grad_norm_var": 0.14058837890625, + "learning_rate": 0.0001, + "loss": 3.1661, + "loss/crossentropy": 2.3477345585823057, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.2401590347290039, + "loss/reg": 0.0, + "step": 17890 + }, + { + "epoch": 0.11776315789473685, + "grad_norm": 3.34375, + "grad_norm_var": 0.14404271443684896, + "learning_rate": 0.0001, + "loss": 3.1339, + "loss/crossentropy": 2.2531643748283385, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.27347354739904406, + "loss/reg": 0.0, + "step": 17900 + }, + { + "epoch": 0.11782894736842105, + "grad_norm": 2.59375, + "grad_norm_var": 0.10000991821289062, + "learning_rate": 0.0001, + "loss": 3.1182, + "loss/crossentropy": 2.2883235216140747, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.2531774565577507, + "loss/reg": 0.0, + "step": 17910 + }, + { + "epoch": 0.11789473684210526, + "grad_norm": 2.328125, + "grad_norm_var": 0.0377349853515625, + "learning_rate": 0.0001, + "loss": 3.1566, + "loss/crossentropy": 2.2789531648159027, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.235878374427557, + "loss/reg": 0.0, + "step": 17920 + }, + { + "epoch": 0.11796052631578947, + "grad_norm": 2.25, + "grad_norm_var": 0.06096598307291667, + "learning_rate": 0.0001, + "loss": 3.1068, + "loss/crossentropy": 2.3758100509643554, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.22890822887420653, + "loss/reg": 0.0, + "step": 17930 + }, + { + "epoch": 0.11802631578947369, + "grad_norm": 2.578125, + "grad_norm_var": 0.07280171712239583, + "learning_rate": 0.0001, + "loss": 3.1773, + "loss/crossentropy": 2.2372307777404785, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.24057336896657944, + "loss/reg": 0.0, + "step": 17940 + }, + { + "epoch": 0.1180921052631579, + "grad_norm": 2.3125, + "grad_norm_var": 0.12463785807291666, + "learning_rate": 0.0001, + "loss": 3.1969, + "loss/crossentropy": 2.1488450884819033, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.25413042977452277, + "loss/reg": 0.0, + "step": 17950 + }, + { + "epoch": 0.1181578947368421, + "grad_norm": 2.265625, + "grad_norm_var": 0.11864827473958334, + "learning_rate": 0.0001, + "loss": 3.2164, + "loss/crossentropy": 2.5382091283798216, + "loss/hidden": 3.13125, + "loss/incoh": 0.0, + "loss/logits": 0.26207938939332964, + "loss/reg": 0.0, + "step": 17960 + }, + { + "epoch": 0.11822368421052631, + "grad_norm": 2.0625, + "grad_norm_var": 0.06974283854166667, + "learning_rate": 0.0001, + "loss": 3.1713, + "loss/crossentropy": 2.3174261093139648, + "loss/hidden": 2.9828125, + "loss/incoh": 0.0, + "loss/logits": 0.279338338971138, + "loss/reg": 0.0, + "step": 17970 + }, + { + "epoch": 0.11828947368421053, + "grad_norm": 2.6875, + "grad_norm_var": 0.04395243326822917, + "learning_rate": 0.0001, + "loss": 3.1261, + "loss/crossentropy": 2.182445216178894, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.2768437474966049, + "loss/reg": 0.0, + "step": 17980 + }, + { + "epoch": 0.11835526315789474, + "grad_norm": 2.703125, + "grad_norm_var": 0.14431050618489583, + "learning_rate": 0.0001, + "loss": 3.1287, + "loss/crossentropy": 2.202815556526184, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.21700907945632936, + "loss/reg": 0.0, + "step": 17990 + }, + { + "epoch": 0.11842105263157894, + "grad_norm": 2.546875, + "grad_norm_var": 2.2837799072265623, + "learning_rate": 0.0001, + "loss": 3.1737, + "loss/crossentropy": 2.0051895678043365, + "loss/hidden": 3.2046875, + "loss/incoh": 0.0, + "loss/logits": 0.35186032503843306, + "loss/reg": 0.0, + "step": 18000 + }, + { + "epoch": 0.11848684210526315, + "grad_norm": 2.4375, + "grad_norm_var": 2.469374338785807, + "learning_rate": 0.0001, + "loss": 3.0848, + "loss/crossentropy": 1.872368621826172, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.2913415163755417, + "loss/reg": 0.0, + "step": 18010 + }, + { + "epoch": 0.11855263157894737, + "grad_norm": 2.046875, + "grad_norm_var": 0.1679278055826823, + "learning_rate": 0.0001, + "loss": 3.2063, + "loss/crossentropy": 2.367658519744873, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.27947854697704316, + "loss/reg": 0.0, + "step": 18020 + }, + { + "epoch": 0.11861842105263158, + "grad_norm": 3.40625, + "grad_norm_var": 0.15479227701822917, + "learning_rate": 0.0001, + "loss": 3.1836, + "loss/crossentropy": 2.087648892402649, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.2204158440232277, + "loss/reg": 0.0, + "step": 18030 + }, + { + "epoch": 0.1186842105263158, + "grad_norm": 3.171875, + "grad_norm_var": 0.18935546875, + "learning_rate": 0.0001, + "loss": 3.205, + "loss/crossentropy": 2.1874751687049865, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2435563921928406, + "loss/reg": 0.0, + "step": 18040 + }, + { + "epoch": 0.11875, + "grad_norm": 2.234375, + "grad_norm_var": 0.38063151041666665, + "learning_rate": 0.0001, + "loss": 3.1996, + "loss/crossentropy": 2.1633397549390794, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.21968650221824645, + "loss/reg": 0.0, + "step": 18050 + }, + { + "epoch": 0.11881578947368421, + "grad_norm": 2.140625, + "grad_norm_var": 0.5637278238932292, + "learning_rate": 0.0001, + "loss": 3.1736, + "loss/crossentropy": 2.281619644165039, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.295219412446022, + "loss/reg": 0.0, + "step": 18060 + }, + { + "epoch": 0.11888157894736842, + "grad_norm": 2.828125, + "grad_norm_var": 1.9192342122395833, + "learning_rate": 0.0001, + "loss": 3.2705, + "loss/crossentropy": 2.21823273897171, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.23566214740276337, + "loss/reg": 0.0, + "step": 18070 + }, + { + "epoch": 0.11894736842105263, + "grad_norm": 2.515625, + "grad_norm_var": 1.6334869384765625, + "learning_rate": 0.0001, + "loss": 3.1873, + "loss/crossentropy": 2.2668980836868284, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2256957158446312, + "loss/reg": 0.0, + "step": 18080 + }, + { + "epoch": 0.11901315789473685, + "grad_norm": 2.046875, + "grad_norm_var": 0.0713287353515625, + "learning_rate": 0.0001, + "loss": 3.0942, + "loss/crossentropy": 2.218550479412079, + "loss/hidden": 3.0296875, + "loss/incoh": 0.0, + "loss/logits": 0.3167494982481003, + "loss/reg": 0.0, + "step": 18090 + }, + { + "epoch": 0.11907894736842105, + "grad_norm": 2.375, + "grad_norm_var": 0.05681050618489583, + "learning_rate": 0.0001, + "loss": 3.1816, + "loss/crossentropy": 2.464331579208374, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.25844376236200334, + "loss/reg": 0.0, + "step": 18100 + }, + { + "epoch": 0.11914473684210526, + "grad_norm": 2.390625, + "grad_norm_var": 0.07647196451822917, + "learning_rate": 0.0001, + "loss": 3.1727, + "loss/crossentropy": 2.2838521599769592, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.2587725341320038, + "loss/reg": 0.0, + "step": 18110 + }, + { + "epoch": 0.11921052631578948, + "grad_norm": 2.1875, + "grad_norm_var": 0.13923238118489584, + "learning_rate": 0.0001, + "loss": 3.2584, + "loss/crossentropy": 2.3038102626800536, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.3060797408223152, + "loss/reg": 0.0, + "step": 18120 + }, + { + "epoch": 0.11927631578947369, + "grad_norm": 2.09375, + "grad_norm_var": 0.11160481770833333, + "learning_rate": 0.0001, + "loss": 3.0877, + "loss/crossentropy": 2.1922419667243958, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.2569601759314537, + "loss/reg": 0.0, + "step": 18130 + }, + { + "epoch": 0.11934210526315789, + "grad_norm": 2.34375, + "grad_norm_var": 0.18005269368489582, + "learning_rate": 0.0001, + "loss": 3.2654, + "loss/crossentropy": 2.336120533943176, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.3214158996939659, + "loss/reg": 0.0, + "step": 18140 + }, + { + "epoch": 0.1194078947368421, + "grad_norm": 2.46875, + "grad_norm_var": 0.211083984375, + "learning_rate": 0.0001, + "loss": 3.1801, + "loss/crossentropy": 2.6258700489997864, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.23279473930597305, + "loss/reg": 0.0, + "step": 18150 + }, + { + "epoch": 0.11947368421052632, + "grad_norm": 2.46875, + "grad_norm_var": 0.097412109375, + "learning_rate": 0.0001, + "loss": 3.0988, + "loss/crossentropy": 2.378826451301575, + "loss/hidden": 2.709375, + "loss/incoh": 0.0, + "loss/logits": 0.24316011592745781, + "loss/reg": 0.0, + "step": 18160 + }, + { + "epoch": 0.11953947368421053, + "grad_norm": 2.953125, + "grad_norm_var": 0.25974934895833335, + "learning_rate": 0.0001, + "loss": 3.1615, + "loss/crossentropy": 2.3753814458847047, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.2662284314632416, + "loss/reg": 0.0, + "step": 18170 + }, + { + "epoch": 0.11960526315789474, + "grad_norm": 2.28125, + "grad_norm_var": 0.2468658447265625, + "learning_rate": 0.0001, + "loss": 3.2061, + "loss/crossentropy": 2.2813811898231506, + "loss/hidden": 3.0953125, + "loss/incoh": 0.0, + "loss/logits": 0.3734820380806923, + "loss/reg": 0.0, + "step": 18180 + }, + { + "epoch": 0.11967105263157894, + "grad_norm": 2.96875, + "grad_norm_var": 0.12701416015625, + "learning_rate": 0.0001, + "loss": 3.16, + "loss/crossentropy": 2.2644060015678407, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.22710230052471161, + "loss/reg": 0.0, + "step": 18190 + }, + { + "epoch": 0.11973684210526316, + "grad_norm": 2.015625, + "grad_norm_var": 0.059342447916666666, + "learning_rate": 0.0001, + "loss": 3.0907, + "loss/crossentropy": 2.2262014031410216, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.20997287034988404, + "loss/reg": 0.0, + "step": 18200 + }, + { + "epoch": 0.11980263157894737, + "grad_norm": 2.484375, + "grad_norm_var": 0.04023335774739583, + "learning_rate": 0.0001, + "loss": 3.2103, + "loss/crossentropy": 2.261046063899994, + "loss/hidden": 2.996875, + "loss/incoh": 0.0, + "loss/logits": 0.262168163061142, + "loss/reg": 0.0, + "step": 18210 + }, + { + "epoch": 0.11986842105263158, + "grad_norm": 2.4375, + "grad_norm_var": 0.11513570149739584, + "learning_rate": 0.0001, + "loss": 3.194, + "loss/crossentropy": 2.39845809340477, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.2080080732703209, + "loss/reg": 0.0, + "step": 18220 + }, + { + "epoch": 0.1199342105263158, + "grad_norm": 2.140625, + "grad_norm_var": 0.12336324055989584, + "learning_rate": 0.0001, + "loss": 3.1321, + "loss/crossentropy": 2.221972668170929, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.2630168259143829, + "loss/reg": 0.0, + "step": 18230 + }, + { + "epoch": 0.12, + "grad_norm": 2.46875, + "grad_norm_var": 0.0994049072265625, + "learning_rate": 0.0001, + "loss": 3.192, + "loss/crossentropy": 1.910440945625305, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.22247039675712585, + "loss/reg": 0.0, + "step": 18240 + }, + { + "epoch": 0.12006578947368421, + "grad_norm": 2.34375, + "grad_norm_var": 0.38481343587239586, + "learning_rate": 0.0001, + "loss": 3.211, + "loss/crossentropy": 2.3357484221458433, + "loss/hidden": 3.1015625, + "loss/incoh": 0.0, + "loss/logits": 0.28341811895370483, + "loss/reg": 0.0, + "step": 18250 + }, + { + "epoch": 0.12013157894736842, + "grad_norm": 2.390625, + "grad_norm_var": 0.1193023681640625, + "learning_rate": 0.0001, + "loss": 3.1537, + "loss/crossentropy": 2.277235043048859, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.2401238664984703, + "loss/reg": 0.0, + "step": 18260 + }, + { + "epoch": 0.12019736842105264, + "grad_norm": 2.5, + "grad_norm_var": 0.0804595947265625, + "learning_rate": 0.0001, + "loss": 3.1693, + "loss/crossentropy": 2.4965412855148315, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.23239507675170898, + "loss/reg": 0.0, + "step": 18270 + }, + { + "epoch": 0.12026315789473684, + "grad_norm": 2.265625, + "grad_norm_var": 0.1340240478515625, + "learning_rate": 0.0001, + "loss": 3.0471, + "loss/crossentropy": 2.187135934829712, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.22307218313217164, + "loss/reg": 0.0, + "step": 18280 + }, + { + "epoch": 0.12032894736842105, + "grad_norm": 2.03125, + "grad_norm_var": 0.08740946451822916, + "learning_rate": 0.0001, + "loss": 3.0703, + "loss/crossentropy": 2.144671416282654, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.23641343265771866, + "loss/reg": 0.0, + "step": 18290 + }, + { + "epoch": 0.12039473684210526, + "grad_norm": 2.484375, + "grad_norm_var": 0.0469390869140625, + "learning_rate": 0.0001, + "loss": 3.0766, + "loss/crossentropy": 2.3019802451133726, + "loss/hidden": 2.909375, + "loss/incoh": 0.0, + "loss/logits": 0.27486053854227066, + "loss/reg": 0.0, + "step": 18300 + }, + { + "epoch": 0.12046052631578948, + "grad_norm": 2.765625, + "grad_norm_var": 0.056086222330729164, + "learning_rate": 0.0001, + "loss": 3.1479, + "loss/crossentropy": 2.350484275817871, + "loss/hidden": 3.0453125, + "loss/incoh": 0.0, + "loss/logits": 0.2915784493088722, + "loss/reg": 0.0, + "step": 18310 + }, + { + "epoch": 0.12052631578947369, + "grad_norm": 2.3125, + "grad_norm_var": 0.0907867431640625, + "learning_rate": 0.0001, + "loss": 3.085, + "loss/crossentropy": 2.519459903240204, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.2799163952469826, + "loss/reg": 0.0, + "step": 18320 + }, + { + "epoch": 0.12059210526315789, + "grad_norm": 3.03125, + "grad_norm_var": 0.07711588541666667, + "learning_rate": 0.0001, + "loss": 3.1352, + "loss/crossentropy": 2.196255683898926, + "loss/hidden": 3.0125, + "loss/incoh": 0.0, + "loss/logits": 0.256213016808033, + "loss/reg": 0.0, + "step": 18330 + }, + { + "epoch": 0.1206578947368421, + "grad_norm": 2.71875, + "grad_norm_var": 0.111083984375, + "learning_rate": 0.0001, + "loss": 3.1252, + "loss/crossentropy": 2.421483266353607, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.3222003743052483, + "loss/reg": 0.0, + "step": 18340 + }, + { + "epoch": 0.12072368421052632, + "grad_norm": 2.3125, + "grad_norm_var": 0.07574462890625, + "learning_rate": 0.0001, + "loss": 3.1338, + "loss/crossentropy": 2.248370945453644, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.24929146319627762, + "loss/reg": 0.0, + "step": 18350 + }, + { + "epoch": 0.12078947368421053, + "grad_norm": 2.375, + "grad_norm_var": 0.05138346354166667, + "learning_rate": 0.0001, + "loss": 3.1477, + "loss/crossentropy": 2.3680251955986025, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.2555671989917755, + "loss/reg": 0.0, + "step": 18360 + }, + { + "epoch": 0.12085526315789474, + "grad_norm": 3.296875, + "grad_norm_var": 0.06642964680989584, + "learning_rate": 0.0001, + "loss": 3.1165, + "loss/crossentropy": 2.199974000453949, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.24803201854228973, + "loss/reg": 0.0, + "step": 18370 + }, + { + "epoch": 0.12092105263157894, + "grad_norm": 2.171875, + "grad_norm_var": 0.11695556640625, + "learning_rate": 0.0001, + "loss": 3.1367, + "loss/crossentropy": 2.5192595601081846, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.24734148681163787, + "loss/reg": 0.0, + "step": 18380 + }, + { + "epoch": 0.12098684210526316, + "grad_norm": 2.453125, + "grad_norm_var": 0.05526936848958333, + "learning_rate": 0.0001, + "loss": 3.1724, + "loss/crossentropy": 2.450891613960266, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.2809787794947624, + "loss/reg": 0.0, + "step": 18390 + }, + { + "epoch": 0.12105263157894737, + "grad_norm": 2.15625, + "grad_norm_var": 0.0400787353515625, + "learning_rate": 0.0001, + "loss": 3.1368, + "loss/crossentropy": 2.142946255207062, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.25829449892044065, + "loss/reg": 0.0, + "step": 18400 + }, + { + "epoch": 0.12111842105263158, + "grad_norm": 2.171875, + "grad_norm_var": 0.12823893229166666, + "learning_rate": 0.0001, + "loss": 3.1313, + "loss/crossentropy": 2.4548865795135497, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.2860646352171898, + "loss/reg": 0.0, + "step": 18410 + }, + { + "epoch": 0.12118421052631578, + "grad_norm": 2.46875, + "grad_norm_var": 0.22473042805989582, + "learning_rate": 0.0001, + "loss": 3.0336, + "loss/crossentropy": 2.4296185970306396, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.24130426943302155, + "loss/reg": 0.0, + "step": 18420 + }, + { + "epoch": 0.12125, + "grad_norm": 2.546875, + "grad_norm_var": 0.24602864583333334, + "learning_rate": 0.0001, + "loss": 3.0905, + "loss/crossentropy": 2.2097928047180178, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.22747812122106553, + "loss/reg": 0.0, + "step": 18430 + }, + { + "epoch": 0.12131578947368421, + "grad_norm": 2.4375, + "grad_norm_var": 0.03878580729166667, + "learning_rate": 0.0001, + "loss": 3.1482, + "loss/crossentropy": 2.266276228427887, + "loss/hidden": 3.1015625, + "loss/incoh": 0.0, + "loss/logits": 0.3076672673225403, + "loss/reg": 0.0, + "step": 18440 + }, + { + "epoch": 0.12138157894736842, + "grad_norm": 2.046875, + "grad_norm_var": 0.19593098958333333, + "learning_rate": 0.0001, + "loss": 3.1753, + "loss/crossentropy": 2.3656784892082214, + "loss/hidden": 2.634375, + "loss/incoh": 0.0, + "loss/logits": 0.21082431972026824, + "loss/reg": 0.0, + "step": 18450 + }, + { + "epoch": 0.12144736842105264, + "grad_norm": 2.28125, + "grad_norm_var": 0.13351236979166667, + "learning_rate": 0.0001, + "loss": 3.1809, + "loss/crossentropy": 2.363306760787964, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.259796117246151, + "loss/reg": 0.0, + "step": 18460 + }, + { + "epoch": 0.12151315789473684, + "grad_norm": 2.921875, + "grad_norm_var": 0.10623372395833333, + "learning_rate": 0.0001, + "loss": 3.0904, + "loss/crossentropy": 2.2991411328315734, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.22767046988010406, + "loss/reg": 0.0, + "step": 18470 + }, + { + "epoch": 0.12157894736842105, + "grad_norm": 2.25, + "grad_norm_var": 0.051595052083333336, + "learning_rate": 0.0001, + "loss": 3.0997, + "loss/crossentropy": 2.479309868812561, + "loss/hidden": 2.703125, + "loss/incoh": 0.0, + "loss/logits": 0.2216094933450222, + "loss/reg": 0.0, + "step": 18480 + }, + { + "epoch": 0.12164473684210526, + "grad_norm": 2.171875, + "grad_norm_var": 0.029279581705729165, + "learning_rate": 0.0001, + "loss": 3.0982, + "loss/crossentropy": 2.224264907836914, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.2954641401767731, + "loss/reg": 0.0, + "step": 18490 + }, + { + "epoch": 0.12171052631578948, + "grad_norm": 2.484375, + "grad_norm_var": 0.0418365478515625, + "learning_rate": 0.0001, + "loss": 3.1278, + "loss/crossentropy": 2.1996599078178405, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.24900650084018708, + "loss/reg": 0.0, + "step": 18500 + }, + { + "epoch": 0.12177631578947369, + "grad_norm": 2.65625, + "grad_norm_var": 6.255301920572917, + "learning_rate": 0.0001, + "loss": 3.1924, + "loss/crossentropy": 2.388622558116913, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.2473811611533165, + "loss/reg": 0.0, + "step": 18510 + }, + { + "epoch": 0.12184210526315789, + "grad_norm": 2.359375, + "grad_norm_var": 0.07906901041666667, + "learning_rate": 0.0001, + "loss": 3.1227, + "loss/crossentropy": 2.325872230529785, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.2705776423215866, + "loss/reg": 0.0, + "step": 18520 + }, + { + "epoch": 0.1219078947368421, + "grad_norm": 3.96875, + "grad_norm_var": 0.2271197001139323, + "learning_rate": 0.0001, + "loss": 3.104, + "loss/crossentropy": 2.4012768149375914, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.24956294745206833, + "loss/reg": 0.0, + "step": 18530 + }, + { + "epoch": 0.12197368421052632, + "grad_norm": 2.015625, + "grad_norm_var": 0.2910316467285156, + "learning_rate": 0.0001, + "loss": 3.1206, + "loss/crossentropy": 2.605260455608368, + "loss/hidden": 3.33125, + "loss/incoh": 0.0, + "loss/logits": 0.2743007704615593, + "loss/reg": 0.0, + "step": 18540 + }, + { + "epoch": 0.12203947368421053, + "grad_norm": 2.1875, + "grad_norm_var": 0.1703277587890625, + "learning_rate": 0.0001, + "loss": 3.1836, + "loss/crossentropy": 1.9470459461212157, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.27280396595597267, + "loss/reg": 0.0, + "step": 18550 + }, + { + "epoch": 0.12210526315789473, + "grad_norm": 4.5625, + "grad_norm_var": 0.52919921875, + "learning_rate": 0.0001, + "loss": 3.1155, + "loss/crossentropy": 2.216350567340851, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.22622163146734237, + "loss/reg": 0.0, + "step": 18560 + }, + { + "epoch": 0.12217105263157894, + "grad_norm": 2.65625, + "grad_norm_var": 0.4434733072916667, + "learning_rate": 0.0001, + "loss": 3.1029, + "loss/crossentropy": 2.1628998279571534, + "loss/hidden": 3.0609375, + "loss/incoh": 0.0, + "loss/logits": 0.28237638175487517, + "loss/reg": 0.0, + "step": 18570 + }, + { + "epoch": 0.12223684210526316, + "grad_norm": 2.890625, + "grad_norm_var": 0.3641916910807292, + "learning_rate": 0.0001, + "loss": 3.1955, + "loss/crossentropy": 2.486808693408966, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.2668070778250694, + "loss/reg": 0.0, + "step": 18580 + }, + { + "epoch": 0.12230263157894737, + "grad_norm": 3.03125, + "grad_norm_var": 0.3584950764973958, + "learning_rate": 0.0001, + "loss": 3.1489, + "loss/crossentropy": 2.3242167592048646, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2651012405753136, + "loss/reg": 0.0, + "step": 18590 + }, + { + "epoch": 0.12236842105263158, + "grad_norm": 2.28125, + "grad_norm_var": 0.05087890625, + "learning_rate": 0.0001, + "loss": 3.0364, + "loss/crossentropy": 2.4761088371276854, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.24524887502193451, + "loss/reg": 0.0, + "step": 18600 + }, + { + "epoch": 0.12243421052631578, + "grad_norm": 2.59375, + "grad_norm_var": 0.08817952473958333, + "learning_rate": 0.0001, + "loss": 3.1836, + "loss/crossentropy": 2.012187111377716, + "loss/hidden": 3.0765625, + "loss/incoh": 0.0, + "loss/logits": 0.2920076042413712, + "loss/reg": 0.0, + "step": 18610 + }, + { + "epoch": 0.1225, + "grad_norm": 2.421875, + "grad_norm_var": 0.07818094889322917, + "learning_rate": 0.0001, + "loss": 3.1844, + "loss/crossentropy": 2.3601612210273744, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.2700245052576065, + "loss/reg": 0.0, + "step": 18620 + }, + { + "epoch": 0.12256578947368421, + "grad_norm": 2.28125, + "grad_norm_var": 0.6979482014973958, + "learning_rate": 0.0001, + "loss": 3.1953, + "loss/crossentropy": 2.2805428981781004, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.21399059891700745, + "loss/reg": 0.0, + "step": 18630 + }, + { + "epoch": 0.12263157894736842, + "grad_norm": 2.578125, + "grad_norm_var": 0.09913736979166667, + "learning_rate": 0.0001, + "loss": 3.2462, + "loss/crossentropy": 2.3119712233543397, + "loss/hidden": 3.084375, + "loss/incoh": 0.0, + "loss/logits": 0.27173476070165636, + "loss/reg": 0.0, + "step": 18640 + }, + { + "epoch": 0.12269736842105264, + "grad_norm": 2.484375, + "grad_norm_var": 0.12066650390625, + "learning_rate": 0.0001, + "loss": 3.1387, + "loss/crossentropy": 2.1647680759429933, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.2955679178237915, + "loss/reg": 0.0, + "step": 18650 + }, + { + "epoch": 0.12276315789473684, + "grad_norm": 2.21875, + "grad_norm_var": 0.0642242431640625, + "learning_rate": 0.0001, + "loss": 3.0672, + "loss/crossentropy": 2.398548412322998, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.24050813913345337, + "loss/reg": 0.0, + "step": 18660 + }, + { + "epoch": 0.12282894736842105, + "grad_norm": 3.046875, + "grad_norm_var": 0.43463134765625, + "learning_rate": 0.0001, + "loss": 3.1768, + "loss/crossentropy": 2.007632791996002, + "loss/hidden": 3.2265625, + "loss/incoh": 0.0, + "loss/logits": 0.28355503678321836, + "loss/reg": 0.0, + "step": 18670 + }, + { + "epoch": 0.12289473684210526, + "grad_norm": 2.21875, + "grad_norm_var": 0.41833394368489585, + "learning_rate": 0.0001, + "loss": 3.1491, + "loss/crossentropy": 2.4610511898994445, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.23162013590335845, + "loss/reg": 0.0, + "step": 18680 + }, + { + "epoch": 0.12296052631578948, + "grad_norm": 4.15625, + "grad_norm_var": 0.29620335896809896, + "learning_rate": 0.0001, + "loss": 3.1762, + "loss/crossentropy": 2.11824317574501, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.2083466961979866, + "loss/reg": 0.0, + "step": 18690 + }, + { + "epoch": 0.12302631578947368, + "grad_norm": 2.203125, + "grad_norm_var": 1.1225685119628905, + "learning_rate": 0.0001, + "loss": 3.1766, + "loss/crossentropy": 2.236714720726013, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.23704309910535812, + "loss/reg": 0.0, + "step": 18700 + }, + { + "epoch": 0.12309210526315789, + "grad_norm": 2.109375, + "grad_norm_var": 0.12454020182291667, + "learning_rate": 0.0001, + "loss": 3.0992, + "loss/crossentropy": 2.2541938424110413, + "loss/hidden": 2.909375, + "loss/incoh": 0.0, + "loss/logits": 0.23853187412023544, + "loss/reg": 0.0, + "step": 18710 + }, + { + "epoch": 0.1231578947368421, + "grad_norm": 2.171875, + "grad_norm_var": 0.070654296875, + "learning_rate": 0.0001, + "loss": 3.0403, + "loss/crossentropy": 2.4512630701065063, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.24162033200263977, + "loss/reg": 0.0, + "step": 18720 + }, + { + "epoch": 0.12322368421052632, + "grad_norm": 2.421875, + "grad_norm_var": 0.03876546223958333, + "learning_rate": 0.0001, + "loss": 3.1522, + "loss/crossentropy": 2.3199268102645876, + "loss/hidden": 2.85625, + "loss/incoh": 0.0, + "loss/logits": 0.23435179740190507, + "loss/reg": 0.0, + "step": 18730 + }, + { + "epoch": 0.12328947368421053, + "grad_norm": 2.421875, + "grad_norm_var": 0.0454254150390625, + "learning_rate": 0.0001, + "loss": 3.019, + "loss/crossentropy": 2.3065245509147645, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.22434473037719727, + "loss/reg": 0.0, + "step": 18740 + }, + { + "epoch": 0.12335526315789473, + "grad_norm": 2.515625, + "grad_norm_var": 0.1038726806640625, + "learning_rate": 0.0001, + "loss": 3.145, + "loss/crossentropy": 2.2620524525642396, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.2564009681344032, + "loss/reg": 0.0, + "step": 18750 + }, + { + "epoch": 0.12342105263157895, + "grad_norm": 3.40625, + "grad_norm_var": 0.1225982666015625, + "learning_rate": 0.0001, + "loss": 3.1243, + "loss/crossentropy": 2.06068754196167, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.23558274507522584, + "loss/reg": 0.0, + "step": 18760 + }, + { + "epoch": 0.12348684210526316, + "grad_norm": 2.4375, + "grad_norm_var": 0.28750712076822915, + "learning_rate": 0.0001, + "loss": 3.1153, + "loss/crossentropy": 2.5775238513946532, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.2552958935499191, + "loss/reg": 0.0, + "step": 18770 + }, + { + "epoch": 0.12355263157894737, + "grad_norm": 2.484375, + "grad_norm_var": 0.09669596354166667, + "learning_rate": 0.0001, + "loss": 3.1108, + "loss/crossentropy": 2.100378167629242, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.23871416002511978, + "loss/reg": 0.0, + "step": 18780 + }, + { + "epoch": 0.12361842105263159, + "grad_norm": 5.5, + "grad_norm_var": 0.6776112874348958, + "learning_rate": 0.0001, + "loss": 3.1095, + "loss/crossentropy": 2.4402441143989564, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.23556852638721465, + "loss/reg": 0.0, + "step": 18790 + }, + { + "epoch": 0.12368421052631579, + "grad_norm": 2.28125, + "grad_norm_var": 0.6309967041015625, + "learning_rate": 0.0001, + "loss": 3.1612, + "loss/crossentropy": 2.3034533500671386, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.2525601238012314, + "loss/reg": 0.0, + "step": 18800 + }, + { + "epoch": 0.12375, + "grad_norm": 2.90625, + "grad_norm_var": 0.06685282389322916, + "learning_rate": 0.0001, + "loss": 3.1566, + "loss/crossentropy": 2.447878336906433, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.2531526446342468, + "loss/reg": 0.0, + "step": 18810 + }, + { + "epoch": 0.12381578947368421, + "grad_norm": 2.4375, + "grad_norm_var": 0.17978515625, + "learning_rate": 0.0001, + "loss": 3.1701, + "loss/crossentropy": 2.297843897342682, + "loss/hidden": 2.7875, + "loss/incoh": 0.0, + "loss/logits": 0.2250390335917473, + "loss/reg": 0.0, + "step": 18820 + }, + { + "epoch": 0.12388157894736843, + "grad_norm": 2.4375, + "grad_norm_var": 0.06181640625, + "learning_rate": 0.0001, + "loss": 3.1129, + "loss/crossentropy": 2.1577285885810853, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2582122042775154, + "loss/reg": 0.0, + "step": 18830 + }, + { + "epoch": 0.12394736842105263, + "grad_norm": 2.859375, + "grad_norm_var": 0.08957926432291667, + "learning_rate": 0.0001, + "loss": 3.1521, + "loss/crossentropy": 2.5041862964630126, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.24232363551855088, + "loss/reg": 0.0, + "step": 18840 + }, + { + "epoch": 0.12401315789473684, + "grad_norm": 2.953125, + "grad_norm_var": 0.10745035807291667, + "learning_rate": 0.0001, + "loss": 3.1007, + "loss/crossentropy": 1.8846357107162475, + "loss/hidden": 3.096875, + "loss/incoh": 0.0, + "loss/logits": 0.2533442348241806, + "loss/reg": 0.0, + "step": 18850 + }, + { + "epoch": 0.12407894736842105, + "grad_norm": 2.65625, + "grad_norm_var": 0.11793212890625, + "learning_rate": 0.0001, + "loss": 3.0916, + "loss/crossentropy": 2.5479671955108643, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.26022554039955137, + "loss/reg": 0.0, + "step": 18860 + }, + { + "epoch": 0.12414473684210527, + "grad_norm": 3.203125, + "grad_norm_var": 0.4687652587890625, + "learning_rate": 0.0001, + "loss": 3.1202, + "loss/crossentropy": 2.3512953519821167, + "loss/hidden": 3.0296875, + "loss/incoh": 0.0, + "loss/logits": 0.2577581197023392, + "loss/reg": 0.0, + "step": 18870 + }, + { + "epoch": 0.12421052631578948, + "grad_norm": 2.265625, + "grad_norm_var": 0.49575907389322915, + "learning_rate": 0.0001, + "loss": 3.192, + "loss/crossentropy": 2.4191364645957947, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.2868465960025787, + "loss/reg": 0.0, + "step": 18880 + }, + { + "epoch": 0.12427631578947368, + "grad_norm": 2.296875, + "grad_norm_var": 0.20038655598958333, + "learning_rate": 0.0001, + "loss": 3.1072, + "loss/crossentropy": 2.548805284500122, + "loss/hidden": 2.6828125, + "loss/incoh": 0.0, + "loss/logits": 0.22707584351301194, + "loss/reg": 0.0, + "step": 18890 + }, + { + "epoch": 0.12434210526315789, + "grad_norm": 2.4375, + "grad_norm_var": 0.16153055826822918, + "learning_rate": 0.0001, + "loss": 3.0653, + "loss/crossentropy": 2.4287894129753114, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.24110280126333236, + "loss/reg": 0.0, + "step": 18900 + }, + { + "epoch": 0.1244078947368421, + "grad_norm": 2.59375, + "grad_norm_var": 0.06378580729166666, + "learning_rate": 0.0001, + "loss": 3.1229, + "loss/crossentropy": 2.381858563423157, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.28408930599689486, + "loss/reg": 0.0, + "step": 18910 + }, + { + "epoch": 0.12447368421052632, + "grad_norm": 3.9375, + "grad_norm_var": 0.33860575358072914, + "learning_rate": 0.0001, + "loss": 3.1459, + "loss/crossentropy": 2.0421772241592406, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.24683285355567933, + "loss/reg": 0.0, + "step": 18920 + }, + { + "epoch": 0.12453947368421053, + "grad_norm": 2.390625, + "grad_norm_var": 0.3823638916015625, + "learning_rate": 0.0001, + "loss": 3.1332, + "loss/crossentropy": 2.528811717033386, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.26049076169729235, + "loss/reg": 0.0, + "step": 18930 + }, + { + "epoch": 0.12460526315789473, + "grad_norm": 2.3125, + "grad_norm_var": 0.1140045166015625, + "learning_rate": 0.0001, + "loss": 3.2262, + "loss/crossentropy": 2.391852283477783, + "loss/hidden": 3.084375, + "loss/incoh": 0.0, + "loss/logits": 0.3043837010860443, + "loss/reg": 0.0, + "step": 18940 + }, + { + "epoch": 0.12467105263157895, + "grad_norm": 2.03125, + "grad_norm_var": 0.06843973795572916, + "learning_rate": 0.0001, + "loss": 3.149, + "loss/crossentropy": 2.218567681312561, + "loss/hidden": 2.9609375, + "loss/incoh": 0.0, + "loss/logits": 0.254660502076149, + "loss/reg": 0.0, + "step": 18950 + }, + { + "epoch": 0.12473684210526316, + "grad_norm": 2.375, + "grad_norm_var": 0.0237701416015625, + "learning_rate": 0.0001, + "loss": 3.1052, + "loss/crossentropy": 2.3154717803001406, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.2391164407134056, + "loss/reg": 0.0, + "step": 18960 + }, + { + "epoch": 0.12480263157894737, + "grad_norm": 2.34375, + "grad_norm_var": 0.05070699055989583, + "learning_rate": 0.0001, + "loss": 3.0759, + "loss/crossentropy": 2.2152688026428224, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.28472713232040403, + "loss/reg": 0.0, + "step": 18970 + }, + { + "epoch": 0.12486842105263157, + "grad_norm": 2.3125, + "grad_norm_var": 0.08042704264322917, + "learning_rate": 0.0001, + "loss": 3.1049, + "loss/crossentropy": 2.392467772960663, + "loss/hidden": 3.1921875, + "loss/incoh": 0.0, + "loss/logits": 0.34464606642723083, + "loss/reg": 0.0, + "step": 18980 + }, + { + "epoch": 0.12493421052631579, + "grad_norm": 2.625, + "grad_norm_var": 0.07502848307291667, + "learning_rate": 0.0001, + "loss": 3.1321, + "loss/crossentropy": 2.4656677722930906, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.25502916276454923, + "loss/reg": 0.0, + "step": 18990 + }, + { + "epoch": 0.125, + "grad_norm": 2.421875, + "grad_norm_var": 1.4638824462890625, + "learning_rate": 0.0001, + "loss": 3.2055, + "loss/crossentropy": 2.27457115650177, + "loss/hidden": 2.70625, + "loss/incoh": 0.0, + "loss/logits": 0.21155266612768173, + "loss/reg": 0.0, + "step": 19000 + }, + { + "epoch": 0.1250657894736842, + "grad_norm": 2.15625, + "grad_norm_var": 0.5252115885416667, + "learning_rate": 0.0001, + "loss": 3.0721, + "loss/crossentropy": 2.232183575630188, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.23653523325920106, + "loss/reg": 0.0, + "step": 19010 + }, + { + "epoch": 0.12513157894736843, + "grad_norm": 2.546875, + "grad_norm_var": 0.3392415364583333, + "learning_rate": 0.0001, + "loss": 3.1599, + "loss/crossentropy": 2.134384286403656, + "loss/hidden": 2.8203125, + "loss/incoh": 0.0, + "loss/logits": 0.23607225120067596, + "loss/reg": 0.0, + "step": 19020 + }, + { + "epoch": 0.12519736842105264, + "grad_norm": 2.4375, + "grad_norm_var": 0.14305013020833332, + "learning_rate": 0.0001, + "loss": 3.1322, + "loss/crossentropy": 2.406242322921753, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.2707079291343689, + "loss/reg": 0.0, + "step": 19030 + }, + { + "epoch": 0.12526315789473685, + "grad_norm": 2.234375, + "grad_norm_var": 0.34036051432291664, + "learning_rate": 0.0001, + "loss": 3.2069, + "loss/crossentropy": 2.4997928857803347, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.2432278200984001, + "loss/reg": 0.0, + "step": 19040 + }, + { + "epoch": 0.12532894736842104, + "grad_norm": 2.4375, + "grad_norm_var": 0.12541910807291667, + "learning_rate": 0.0001, + "loss": 3.0995, + "loss/crossentropy": 2.2165623545646667, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.24738555699586867, + "loss/reg": 0.0, + "step": 19050 + }, + { + "epoch": 0.12539473684210525, + "grad_norm": 2.8125, + "grad_norm_var": 0.15855712890625, + "learning_rate": 0.0001, + "loss": 3.1697, + "loss/crossentropy": 2.510625755786896, + "loss/hidden": 2.7984375, + "loss/incoh": 0.0, + "loss/logits": 0.22799582332372664, + "loss/reg": 0.0, + "step": 19060 + }, + { + "epoch": 0.12546052631578947, + "grad_norm": 2.609375, + "grad_norm_var": 0.09364827473958333, + "learning_rate": 0.0001, + "loss": 3.2057, + "loss/crossentropy": 2.380051004886627, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.2702123373746872, + "loss/reg": 0.0, + "step": 19070 + }, + { + "epoch": 0.12552631578947368, + "grad_norm": 2.015625, + "grad_norm_var": 0.07896728515625, + "learning_rate": 0.0001, + "loss": 3.0865, + "loss/crossentropy": 2.2913742661476135, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.22653487473726272, + "loss/reg": 0.0, + "step": 19080 + }, + { + "epoch": 0.1255921052631579, + "grad_norm": 2.453125, + "grad_norm_var": 0.1556549072265625, + "learning_rate": 0.0001, + "loss": 3.1844, + "loss/crossentropy": 2.39451003074646, + "loss/hidden": 2.909375, + "loss/incoh": 0.0, + "loss/logits": 0.23494229055941104, + "loss/reg": 0.0, + "step": 19090 + }, + { + "epoch": 0.1256578947368421, + "grad_norm": 2.5, + "grad_norm_var": 0.42444254557291666, + "learning_rate": 0.0001, + "loss": 3.1173, + "loss/crossentropy": 2.2207372069358824, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.2351771742105484, + "loss/reg": 0.0, + "step": 19100 + }, + { + "epoch": 0.12572368421052632, + "grad_norm": 2.3125, + "grad_norm_var": 0.405517578125, + "learning_rate": 0.0001, + "loss": 3.1686, + "loss/crossentropy": 2.544073963165283, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.23578422516584396, + "loss/reg": 0.0, + "step": 19110 + }, + { + "epoch": 0.12578947368421053, + "grad_norm": 2.28125, + "grad_norm_var": 0.022623697916666668, + "learning_rate": 0.0001, + "loss": 3.1575, + "loss/crossentropy": 2.0650166511535644, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.2400606468319893, + "loss/reg": 0.0, + "step": 19120 + }, + { + "epoch": 0.12585526315789475, + "grad_norm": 2.078125, + "grad_norm_var": 0.12542215983072916, + "learning_rate": 0.0001, + "loss": 3.1717, + "loss/crossentropy": 2.1879891753196716, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.23869821876287461, + "loss/reg": 0.0, + "step": 19130 + }, + { + "epoch": 0.12592105263157893, + "grad_norm": 2.84375, + "grad_norm_var": 2.683204189608586e+17, + "learning_rate": 0.0001, + "loss": 3.2235, + "loss/crossentropy": 2.4875245571136473, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.3171342611312866, + "loss/reg": 0.0, + "step": 19140 + }, + { + "epoch": 0.12598684210526315, + "grad_norm": 2.109375, + "grad_norm_var": 0.12364908854166666, + "learning_rate": 0.0001, + "loss": 3.1832, + "loss/crossentropy": 2.199462330341339, + "loss/hidden": 3.1125, + "loss/incoh": 0.0, + "loss/logits": 0.2820486217737198, + "loss/reg": 0.0, + "step": 19150 + }, + { + "epoch": 0.12605263157894736, + "grad_norm": 3.53125, + "grad_norm_var": 0.133642578125, + "learning_rate": 0.0001, + "loss": 3.2179, + "loss/crossentropy": 2.433344876766205, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.3452912583947182, + "loss/reg": 0.0, + "step": 19160 + }, + { + "epoch": 0.12611842105263157, + "grad_norm": 2.328125, + "grad_norm_var": 0.23087946573893228, + "learning_rate": 0.0001, + "loss": 3.073, + "loss/crossentropy": 2.107306253910065, + "loss/hidden": 3.0140625, + "loss/incoh": 0.0, + "loss/logits": 0.29124595075845716, + "loss/reg": 0.0, + "step": 19170 + }, + { + "epoch": 0.1261842105263158, + "grad_norm": 2.640625, + "grad_norm_var": 2.206763811704668e+17, + "learning_rate": 0.0001, + "loss": 3.2818, + "loss/crossentropy": 2.17460697889328, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.26940477788448336, + "loss/reg": 0.0, + "step": 19180 + }, + { + "epoch": 0.12625, + "grad_norm": 2.25, + "grad_norm_var": 0.06741434733072917, + "learning_rate": 0.0001, + "loss": 3.1977, + "loss/crossentropy": 2.249357485771179, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.29158340096473695, + "loss/reg": 0.0, + "step": 19190 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 2.28125, + "grad_norm_var": 0.1665679931640625, + "learning_rate": 0.0001, + "loss": 3.0798, + "loss/crossentropy": 2.39562349319458, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.24128414690494537, + "loss/reg": 0.0, + "step": 19200 + }, + { + "epoch": 0.12638157894736843, + "grad_norm": 2.515625, + "grad_norm_var": 0.04352925618489583, + "learning_rate": 0.0001, + "loss": 3.0886, + "loss/crossentropy": 2.4379887223243712, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.2693786233663559, + "loss/reg": 0.0, + "step": 19210 + }, + { + "epoch": 0.12644736842105264, + "grad_norm": 2.609375, + "grad_norm_var": 0.05164286295572917, + "learning_rate": 0.0001, + "loss": 3.052, + "loss/crossentropy": 2.327665722370148, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.37138707339763644, + "loss/reg": 0.0, + "step": 19220 + }, + { + "epoch": 0.12651315789473686, + "grad_norm": 2.09375, + "grad_norm_var": 1.5772623697916666, + "learning_rate": 0.0001, + "loss": 3.1007, + "loss/crossentropy": 2.1090051174163817, + "loss/hidden": 3.0875, + "loss/incoh": 0.0, + "loss/logits": 0.23532682955265044, + "loss/reg": 0.0, + "step": 19230 + }, + { + "epoch": 0.12657894736842104, + "grad_norm": 2.5625, + "grad_norm_var": 0.76259765625, + "learning_rate": 0.0001, + "loss": 3.087, + "loss/crossentropy": 2.3096412897109984, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.2415456846356392, + "loss/reg": 0.0, + "step": 19240 + }, + { + "epoch": 0.12664473684210525, + "grad_norm": 2.5, + "grad_norm_var": 0.0485504150390625, + "learning_rate": 0.0001, + "loss": 3.0857, + "loss/crossentropy": 2.252851128578186, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.21757592558860778, + "loss/reg": 0.0, + "step": 19250 + }, + { + "epoch": 0.12671052631578947, + "grad_norm": 2.359375, + "grad_norm_var": 0.16155192057291667, + "learning_rate": 0.0001, + "loss": 3.1508, + "loss/crossentropy": 2.19182807803154, + "loss/hidden": 2.940625, + "loss/incoh": 0.0, + "loss/logits": 0.23997026532888413, + "loss/reg": 0.0, + "step": 19260 + }, + { + "epoch": 0.12677631578947368, + "grad_norm": 2.4375, + "grad_norm_var": 0.4118609110514323, + "learning_rate": 0.0001, + "loss": 3.1121, + "loss/crossentropy": 2.4667071104049683, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.23552417606115342, + "loss/reg": 0.0, + "step": 19270 + }, + { + "epoch": 0.1268421052631579, + "grad_norm": 2.421875, + "grad_norm_var": 0.04609553019205729, + "learning_rate": 0.0001, + "loss": 3.1763, + "loss/crossentropy": 2.1369692206382753, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.24942600578069687, + "loss/reg": 0.0, + "step": 19280 + }, + { + "epoch": 0.1269078947368421, + "grad_norm": 2.5, + "grad_norm_var": 0.0526031494140625, + "learning_rate": 0.0001, + "loss": 3.1068, + "loss/crossentropy": 2.11109459400177, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.25110483914613724, + "loss/reg": 0.0, + "step": 19290 + }, + { + "epoch": 0.12697368421052632, + "grad_norm": 2.578125, + "grad_norm_var": 0.053343709309895834, + "learning_rate": 0.0001, + "loss": 3.1517, + "loss/crossentropy": 2.380664014816284, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.2508363798260689, + "loss/reg": 0.0, + "step": 19300 + }, + { + "epoch": 0.12703947368421054, + "grad_norm": 2.171875, + "grad_norm_var": 0.034032185872395836, + "learning_rate": 0.0001, + "loss": 3.1303, + "loss/crossentropy": 2.380405902862549, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.2564548909664154, + "loss/reg": 0.0, + "step": 19310 + }, + { + "epoch": 0.12710526315789475, + "grad_norm": 2.375, + "grad_norm_var": 0.03234049479166667, + "learning_rate": 0.0001, + "loss": 3.1223, + "loss/crossentropy": 2.1872188091278075, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.2912978962063789, + "loss/reg": 0.0, + "step": 19320 + }, + { + "epoch": 0.12717105263157893, + "grad_norm": 2.328125, + "grad_norm_var": 0.07059504191080729, + "learning_rate": 0.0001, + "loss": 3.1221, + "loss/crossentropy": 2.470194971561432, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.2459412097930908, + "loss/reg": 0.0, + "step": 19330 + }, + { + "epoch": 0.12723684210526315, + "grad_norm": 2.484375, + "grad_norm_var": 0.1386431376139323, + "learning_rate": 0.0001, + "loss": 3.1555, + "loss/crossentropy": 2.3521093368530273, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.2635854005813599, + "loss/reg": 0.0, + "step": 19340 + }, + { + "epoch": 0.12730263157894736, + "grad_norm": 2.4375, + "grad_norm_var": 0.053857421875, + "learning_rate": 0.0001, + "loss": 3.0594, + "loss/crossentropy": 2.099438285827637, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.22058220505714415, + "loss/reg": 0.0, + "step": 19350 + }, + { + "epoch": 0.12736842105263158, + "grad_norm": 2.296875, + "grad_norm_var": 0.10086161295572917, + "learning_rate": 0.0001, + "loss": 3.1877, + "loss/crossentropy": 2.2927380204200745, + "loss/hidden": 2.8203125, + "loss/incoh": 0.0, + "loss/logits": 0.25694535821676256, + "loss/reg": 0.0, + "step": 19360 + }, + { + "epoch": 0.1274342105263158, + "grad_norm": 1.9375, + "grad_norm_var": 0.05416259765625, + "learning_rate": 0.0001, + "loss": 3.0826, + "loss/crossentropy": 2.1295772194862366, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.254076661169529, + "loss/reg": 0.0, + "step": 19370 + }, + { + "epoch": 0.1275, + "grad_norm": 2.5625, + "grad_norm_var": 0.24744466145833333, + "learning_rate": 0.0001, + "loss": 3.1415, + "loss/crossentropy": 1.9256490916013718, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.24017905220389366, + "loss/reg": 0.0, + "step": 19380 + }, + { + "epoch": 0.12756578947368422, + "grad_norm": 2.421875, + "grad_norm_var": 0.5484659830729167, + "learning_rate": 0.0001, + "loss": 3.1094, + "loss/crossentropy": 2.4685181736946107, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.26331629455089567, + "loss/reg": 0.0, + "step": 19390 + }, + { + "epoch": 0.12763157894736843, + "grad_norm": 2.3125, + "grad_norm_var": 0.39381103515625, + "learning_rate": 0.0001, + "loss": 3.1037, + "loss/crossentropy": 2.2764881372451784, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.25390326231718063, + "loss/reg": 0.0, + "step": 19400 + }, + { + "epoch": 0.12769736842105264, + "grad_norm": 2.765625, + "grad_norm_var": 0.0698150634765625, + "learning_rate": 0.0001, + "loss": 3.1262, + "loss/crossentropy": 2.299801528453827, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.2553006038069725, + "loss/reg": 0.0, + "step": 19410 + }, + { + "epoch": 0.12776315789473683, + "grad_norm": 2.453125, + "grad_norm_var": 0.266357421875, + "learning_rate": 0.0001, + "loss": 3.1239, + "loss/crossentropy": 2.0346228003501894, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.22659170776605606, + "loss/reg": 0.0, + "step": 19420 + }, + { + "epoch": 0.12782894736842104, + "grad_norm": 2.34375, + "grad_norm_var": 0.37324930826822916, + "learning_rate": 0.0001, + "loss": 3.1527, + "loss/crossentropy": 1.8951176881790162, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.23443188220262529, + "loss/reg": 0.0, + "step": 19430 + }, + { + "epoch": 0.12789473684210526, + "grad_norm": 2.71875, + "grad_norm_var": 0.30020243326822915, + "learning_rate": 0.0001, + "loss": 3.1742, + "loss/crossentropy": 2.0540109515190124, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.20190905332565307, + "loss/reg": 0.0, + "step": 19440 + }, + { + "epoch": 0.12796052631578947, + "grad_norm": 2.75, + "grad_norm_var": 0.19728190104166668, + "learning_rate": 0.0001, + "loss": 3.1528, + "loss/crossentropy": 2.3164134502410887, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.2684441477060318, + "loss/reg": 0.0, + "step": 19450 + }, + { + "epoch": 0.12802631578947368, + "grad_norm": 2.25, + "grad_norm_var": 0.2419586181640625, + "learning_rate": 0.0001, + "loss": 3.1599, + "loss/crossentropy": 2.380271017551422, + "loss/hidden": 2.96875, + "loss/incoh": 0.0, + "loss/logits": 0.2657022625207901, + "loss/reg": 0.0, + "step": 19460 + }, + { + "epoch": 0.1280921052631579, + "grad_norm": 2.4375, + "grad_norm_var": 0.11366780598958333, + "learning_rate": 0.0001, + "loss": 3.1441, + "loss/crossentropy": 2.4884063005447388, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.2683678835630417, + "loss/reg": 0.0, + "step": 19470 + }, + { + "epoch": 0.1281578947368421, + "grad_norm": 2038431744.0, + "grad_norm_var": 2.597002478369833e+17, + "learning_rate": 0.0001, + "loss": 3.2142, + "loss/crossentropy": 2.0524453282356263, + "loss/hidden": 3.80625, + "loss/incoh": 0.0, + "loss/logits": 0.2409697949886322, + "loss/reg": 0.0, + "step": 19480 + }, + { + "epoch": 0.12822368421052632, + "grad_norm": 2.203125, + "grad_norm_var": 2.597002478497235e+17, + "learning_rate": 0.0001, + "loss": 3.0528, + "loss/crossentropy": 2.2681432604789733, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.24559762477874755, + "loss/reg": 0.0, + "step": 19490 + }, + { + "epoch": 0.12828947368421054, + "grad_norm": 2.296875, + "grad_norm_var": 0.030345662434895834, + "learning_rate": 0.0001, + "loss": 3.0796, + "loss/crossentropy": 2.362069141864777, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.2367064341902733, + "loss/reg": 0.0, + "step": 19500 + }, + { + "epoch": 0.12835526315789475, + "grad_norm": 2.75, + "grad_norm_var": 0.059382120768229164, + "learning_rate": 0.0001, + "loss": 3.1465, + "loss/crossentropy": 2.260143554210663, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.25476695597171783, + "loss/reg": 0.0, + "step": 19510 + }, + { + "epoch": 0.12842105263157894, + "grad_norm": 2.40625, + "grad_norm_var": 0.18327534993489583, + "learning_rate": 0.0001, + "loss": 3.1443, + "loss/crossentropy": 2.322359097003937, + "loss/hidden": 2.7875, + "loss/incoh": 0.0, + "loss/logits": 0.22897413671016692, + "loss/reg": 0.0, + "step": 19520 + }, + { + "epoch": 0.12848684210526315, + "grad_norm": 2.078125, + "grad_norm_var": 0.4216379801432292, + "learning_rate": 0.0001, + "loss": 3.1736, + "loss/crossentropy": 2.658802056312561, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.26290144920349123, + "loss/reg": 0.0, + "step": 19530 + }, + { + "epoch": 0.12855263157894736, + "grad_norm": 2.0625, + "grad_norm_var": 0.1238677978515625, + "learning_rate": 0.0001, + "loss": 3.1818, + "loss/crossentropy": 2.389211559295654, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.29232275635004046, + "loss/reg": 0.0, + "step": 19540 + }, + { + "epoch": 0.12861842105263158, + "grad_norm": 2.421875, + "grad_norm_var": 0.049845123291015626, + "learning_rate": 0.0001, + "loss": 3.0708, + "loss/crossentropy": 2.186306023597717, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.2743732765316963, + "loss/reg": 0.0, + "step": 19550 + }, + { + "epoch": 0.1286842105263158, + "grad_norm": 2.40625, + "grad_norm_var": 0.024887847900390624, + "learning_rate": 0.0001, + "loss": 3.1515, + "loss/crossentropy": 2.325215721130371, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.24441724121570588, + "loss/reg": 0.0, + "step": 19560 + }, + { + "epoch": 0.12875, + "grad_norm": 2.40625, + "grad_norm_var": 0.18791910807291667, + "learning_rate": 0.0001, + "loss": 3.1163, + "loss/crossentropy": 2.4399210453033446, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.23570073395967484, + "loss/reg": 0.0, + "step": 19570 + }, + { + "epoch": 0.12881578947368422, + "grad_norm": 2.34375, + "grad_norm_var": 0.0600250244140625, + "learning_rate": 0.0001, + "loss": 3.0731, + "loss/crossentropy": 2.250354325771332, + "loss/hidden": 2.9390625, + "loss/incoh": 0.0, + "loss/logits": 0.25124100893735885, + "loss/reg": 0.0, + "step": 19580 + }, + { + "epoch": 0.12888157894736843, + "grad_norm": 2.34375, + "grad_norm_var": 0.2105865478515625, + "learning_rate": 0.0001, + "loss": 3.1494, + "loss/crossentropy": 2.3004735589027403, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.250587160885334, + "loss/reg": 0.0, + "step": 19590 + }, + { + "epoch": 0.12894736842105264, + "grad_norm": 2.125, + "grad_norm_var": 0.23645833333333333, + "learning_rate": 0.0001, + "loss": 3.099, + "loss/crossentropy": 2.3559111833572386, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.26171091198921204, + "loss/reg": 0.0, + "step": 19600 + }, + { + "epoch": 0.12901315789473683, + "grad_norm": 2.328125, + "grad_norm_var": 0.028955078125, + "learning_rate": 0.0001, + "loss": 3.171, + "loss/crossentropy": 2.2754984378814695, + "loss/hidden": 3.046875, + "loss/incoh": 0.0, + "loss/logits": 0.2695039168000221, + "loss/reg": 0.0, + "step": 19610 + }, + { + "epoch": 0.12907894736842104, + "grad_norm": 2.703125, + "grad_norm_var": 0.12908426920572916, + "learning_rate": 0.0001, + "loss": 3.1609, + "loss/crossentropy": 2.457037115097046, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.2523172840476036, + "loss/reg": 0.0, + "step": 19620 + }, + { + "epoch": 0.12914473684210526, + "grad_norm": 2.3125, + "grad_norm_var": 0.14287007649739583, + "learning_rate": 0.0001, + "loss": 3.0694, + "loss/crossentropy": 2.33393777012825, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.23025162070989608, + "loss/reg": 0.0, + "step": 19630 + }, + { + "epoch": 0.12921052631578947, + "grad_norm": 2.125, + "grad_norm_var": 0.029618326822916666, + "learning_rate": 0.0001, + "loss": 3.0684, + "loss/crossentropy": 2.155827796459198, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.22635919600725174, + "loss/reg": 0.0, + "step": 19640 + }, + { + "epoch": 0.12927631578947368, + "grad_norm": 2.03125, + "grad_norm_var": 0.31851806640625, + "learning_rate": 0.0001, + "loss": 3.112, + "loss/crossentropy": 2.2350030899047852, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.2474326401948929, + "loss/reg": 0.0, + "step": 19650 + }, + { + "epoch": 0.1293421052631579, + "grad_norm": 2.453125, + "grad_norm_var": 0.30481363932291666, + "learning_rate": 0.0001, + "loss": 3.0884, + "loss/crossentropy": 2.1235520601272584, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.23340977281332015, + "loss/reg": 0.0, + "step": 19660 + }, + { + "epoch": 0.1294078947368421, + "grad_norm": 2.4375, + "grad_norm_var": 0.058958943684895834, + "learning_rate": 0.0001, + "loss": 3.1092, + "loss/crossentropy": 2.436754751205444, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.2396928071975708, + "loss/reg": 0.0, + "step": 19670 + }, + { + "epoch": 0.12947368421052632, + "grad_norm": 2.84375, + "grad_norm_var": 0.2934804280598958, + "learning_rate": 0.0001, + "loss": 3.0531, + "loss/crossentropy": 2.24703209400177, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.23143238127231597, + "loss/reg": 0.0, + "step": 19680 + }, + { + "epoch": 0.12953947368421054, + "grad_norm": 2.25, + "grad_norm_var": 0.25420303344726564, + "learning_rate": 0.0001, + "loss": 3.0318, + "loss/crossentropy": 2.536008381843567, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.22230196446180345, + "loss/reg": 0.0, + "step": 19690 + }, + { + "epoch": 0.12960526315789472, + "grad_norm": 2.21875, + "grad_norm_var": 0.29778416951497394, + "learning_rate": 0.0001, + "loss": 3.0902, + "loss/crossentropy": 2.1229737393558024, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.2103697349317372, + "loss/reg": 0.0, + "step": 19700 + }, + { + "epoch": 0.12967105263157894, + "grad_norm": 2.0625, + "grad_norm_var": 0.09654032389322917, + "learning_rate": 0.0001, + "loss": 3.0303, + "loss/crossentropy": 2.360739004611969, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.238627889752388, + "loss/reg": 0.0, + "step": 19710 + }, + { + "epoch": 0.12973684210526315, + "grad_norm": 2.234375, + "grad_norm_var": 0.16044514973958332, + "learning_rate": 0.0001, + "loss": 3.082, + "loss/crossentropy": 2.3643787026405336, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.2354188710451126, + "loss/reg": 0.0, + "step": 19720 + }, + { + "epoch": 0.12980263157894736, + "grad_norm": 2.375, + "grad_norm_var": 0.23007405598958333, + "learning_rate": 0.0001, + "loss": 3.0884, + "loss/crossentropy": 2.3766650319099427, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.26477697044610976, + "loss/reg": 0.0, + "step": 19730 + }, + { + "epoch": 0.12986842105263158, + "grad_norm": 2.21875, + "grad_norm_var": 0.055946604410807295, + "learning_rate": 0.0001, + "loss": 3.0414, + "loss/crossentropy": 2.6002285480499268, + "loss/hidden": 2.98125, + "loss/incoh": 0.0, + "loss/logits": 0.33312758058309555, + "loss/reg": 0.0, + "step": 19740 + }, + { + "epoch": 0.1299342105263158, + "grad_norm": 2.515625, + "grad_norm_var": 0.13170547485351564, + "learning_rate": 0.0001, + "loss": 3.0758, + "loss/crossentropy": 2.457702159881592, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.25578114166855814, + "loss/reg": 0.0, + "step": 19750 + }, + { + "epoch": 0.13, + "grad_norm": 2.65625, + "grad_norm_var": 0.03232014973958333, + "learning_rate": 0.0001, + "loss": 3.0882, + "loss/crossentropy": 2.261428934335709, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.2278106167912483, + "loss/reg": 0.0, + "step": 19760 + }, + { + "epoch": 0.13006578947368422, + "grad_norm": 2.421875, + "grad_norm_var": 0.022248331705729166, + "learning_rate": 0.0001, + "loss": 3.0582, + "loss/crossentropy": 2.060434710979462, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.24812956005334855, + "loss/reg": 0.0, + "step": 19770 + }, + { + "epoch": 0.13013157894736843, + "grad_norm": 2.40625, + "grad_norm_var": 0.0256256103515625, + "learning_rate": 0.0001, + "loss": 3.08, + "loss/crossentropy": 2.2472903966903686, + "loss/hidden": 3.1, + "loss/incoh": 0.0, + "loss/logits": 0.34347400814294815, + "loss/reg": 0.0, + "step": 19780 + }, + { + "epoch": 0.13019736842105264, + "grad_norm": 2.265625, + "grad_norm_var": 0.0520660400390625, + "learning_rate": 0.0001, + "loss": 3.1154, + "loss/crossentropy": 2.3003188014030456, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.2695119082927704, + "loss/reg": 0.0, + "step": 19790 + }, + { + "epoch": 0.13026315789473683, + "grad_norm": 2.578125, + "grad_norm_var": 0.08396809895833333, + "learning_rate": 0.0001, + "loss": 3.0827, + "loss/crossentropy": 2.2551465153694155, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.24707757085561752, + "loss/reg": 0.0, + "step": 19800 + }, + { + "epoch": 0.13032894736842104, + "grad_norm": 2.65625, + "grad_norm_var": 0.15972900390625, + "learning_rate": 0.0001, + "loss": 3.151, + "loss/crossentropy": 2.323357033729553, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.2660538278520107, + "loss/reg": 0.0, + "step": 19810 + }, + { + "epoch": 0.13039473684210526, + "grad_norm": 2.546875, + "grad_norm_var": 0.15328776041666667, + "learning_rate": 0.0001, + "loss": 3.0792, + "loss/crossentropy": 2.369428300857544, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.2439693480730057, + "loss/reg": 0.0, + "step": 19820 + }, + { + "epoch": 0.13046052631578947, + "grad_norm": 2.328125, + "grad_norm_var": 0.09102274576822916, + "learning_rate": 0.0001, + "loss": 3.1636, + "loss/crossentropy": 2.2469497442245485, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.24479606077075006, + "loss/reg": 0.0, + "step": 19830 + }, + { + "epoch": 0.13052631578947368, + "grad_norm": 2.625, + "grad_norm_var": 0.052302042643229164, + "learning_rate": 0.0001, + "loss": 3.1694, + "loss/crossentropy": 2.4753658294677736, + "loss/hidden": 3.0390625, + "loss/incoh": 0.0, + "loss/logits": 0.3045656159520149, + "loss/reg": 0.0, + "step": 19840 + }, + { + "epoch": 0.1305921052631579, + "grad_norm": 2.265625, + "grad_norm_var": 0.0764068603515625, + "learning_rate": 0.0001, + "loss": 3.0959, + "loss/crossentropy": 2.4656317472457885, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.258389513194561, + "loss/reg": 0.0, + "step": 19850 + }, + { + "epoch": 0.1306578947368421, + "grad_norm": 2.515625, + "grad_norm_var": 0.06215718587239583, + "learning_rate": 0.0001, + "loss": 3.0406, + "loss/crossentropy": 2.440659189224243, + "loss/hidden": 2.7171875, + "loss/incoh": 0.0, + "loss/logits": 0.26817646920681, + "loss/reg": 0.0, + "step": 19860 + }, + { + "epoch": 0.13072368421052633, + "grad_norm": 2.296875, + "grad_norm_var": 0.04420166015625, + "learning_rate": 0.0001, + "loss": 3.0968, + "loss/crossentropy": 2.206334137916565, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.24976521283388137, + "loss/reg": 0.0, + "step": 19870 + }, + { + "epoch": 0.13078947368421054, + "grad_norm": 2.109375, + "grad_norm_var": 0.08323567708333333, + "learning_rate": 0.0001, + "loss": 3.0685, + "loss/crossentropy": 2.21451940536499, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.21932003498077393, + "loss/reg": 0.0, + "step": 19880 + }, + { + "epoch": 0.13085526315789472, + "grad_norm": 2.84375, + "grad_norm_var": 0.17522786458333334, + "learning_rate": 0.0001, + "loss": 3.0628, + "loss/crossentropy": 2.1315925240516664, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.23312209993600846, + "loss/reg": 0.0, + "step": 19890 + }, + { + "epoch": 0.13092105263157894, + "grad_norm": 6.28125, + "grad_norm_var": 0.9668528238932291, + "learning_rate": 0.0001, + "loss": 3.1207, + "loss/crossentropy": 2.5357746481895447, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.2542704403400421, + "loss/reg": 0.0, + "step": 19900 + }, + { + "epoch": 0.13098684210526315, + "grad_norm": 2.25, + "grad_norm_var": 0.9971913655598958, + "learning_rate": 0.0001, + "loss": 3.0247, + "loss/crossentropy": 2.53050377368927, + "loss/hidden": 2.6515625, + "loss/incoh": 0.0, + "loss/logits": 0.2318735808134079, + "loss/reg": 0.0, + "step": 19910 + }, + { + "epoch": 0.13105263157894737, + "grad_norm": 2.78125, + "grad_norm_var": 0.14268290201822917, + "learning_rate": 0.0001, + "loss": 3.1545, + "loss/crossentropy": 2.475492477416992, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.29062798619270325, + "loss/reg": 0.0, + "step": 19920 + }, + { + "epoch": 0.13111842105263158, + "grad_norm": 2.390625, + "grad_norm_var": 0.08727925618489583, + "learning_rate": 0.0001, + "loss": 3.1316, + "loss/crossentropy": 2.4628885269165037, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.24764630049467087, + "loss/reg": 0.0, + "step": 19930 + }, + { + "epoch": 0.1311842105263158, + "grad_norm": 2.171875, + "grad_norm_var": 0.07839736938476563, + "learning_rate": 0.0001, + "loss": 3.0597, + "loss/crossentropy": 2.46280722618103, + "loss/hidden": 2.6515625, + "loss/incoh": 0.0, + "loss/logits": 0.2163504734635353, + "loss/reg": 0.0, + "step": 19940 + }, + { + "epoch": 0.13125, + "grad_norm": 2.34375, + "grad_norm_var": 0.0990875244140625, + "learning_rate": 0.0001, + "loss": 3.1648, + "loss/crossentropy": 2.4153407394886015, + "loss/hidden": 3.0875, + "loss/incoh": 0.0, + "loss/logits": 0.26917385756969453, + "loss/reg": 0.0, + "step": 19950 + }, + { + "epoch": 0.13131578947368422, + "grad_norm": 2.8125, + "grad_norm_var": 0.18907877604166667, + "learning_rate": 0.0001, + "loss": 3.1173, + "loss/crossentropy": 2.232962656021118, + "loss/hidden": 3.0734375, + "loss/incoh": 0.0, + "loss/logits": 0.3076439991593361, + "loss/reg": 0.0, + "step": 19960 + }, + { + "epoch": 0.13138157894736843, + "grad_norm": 2.3125, + "grad_norm_var": 0.13981119791666666, + "learning_rate": 0.0001, + "loss": 3.0478, + "loss/crossentropy": 2.353410243988037, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.24693673849105835, + "loss/reg": 0.0, + "step": 19970 + }, + { + "epoch": 0.13144736842105262, + "grad_norm": 3.15625, + "grad_norm_var": 0.1372711181640625, + "learning_rate": 0.0001, + "loss": 3.1452, + "loss/crossentropy": 2.121303880214691, + "loss/hidden": 3.125, + "loss/incoh": 0.0, + "loss/logits": 0.31650226265192033, + "loss/reg": 0.0, + "step": 19980 + }, + { + "epoch": 0.13151315789473683, + "grad_norm": 2.25, + "grad_norm_var": 0.1741607666015625, + "learning_rate": 0.0001, + "loss": 3.1028, + "loss/crossentropy": 2.43160719871521, + "loss/hidden": 2.6828125, + "loss/incoh": 0.0, + "loss/logits": 0.2277207463979721, + "loss/reg": 0.0, + "step": 19990 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 2.140625, + "grad_norm_var": 0.15798238118489583, + "learning_rate": 0.0001, + "loss": 3.0532, + "loss/crossentropy": 2.2883418917655947, + "loss/hidden": 2.990625, + "loss/incoh": 0.0, + "loss/logits": 0.2525527849793434, + "loss/reg": 0.0, + "step": 20000 + }, + { + "epoch": 0.13164473684210526, + "grad_norm": 2.28125, + "grad_norm_var": 0.10852457682291666, + "learning_rate": 0.0001, + "loss": 3.0348, + "loss/crossentropy": 2.369931137561798, + "loss/hidden": 2.703125, + "loss/incoh": 0.0, + "loss/logits": 0.22242200672626494, + "loss/reg": 0.0, + "step": 20010 + }, + { + "epoch": 0.13171052631578947, + "grad_norm": 2.125, + "grad_norm_var": 0.0374664306640625, + "learning_rate": 0.0001, + "loss": 3.0316, + "loss/crossentropy": 2.1939921617507934, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.234733884036541, + "loss/reg": 0.0, + "step": 20020 + }, + { + "epoch": 0.13177631578947369, + "grad_norm": 2.515625, + "grad_norm_var": 0.07238667805989583, + "learning_rate": 0.0001, + "loss": 3.1278, + "loss/crossentropy": 2.4681461334228514, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.30574188083410264, + "loss/reg": 0.0, + "step": 20030 + }, + { + "epoch": 0.1318421052631579, + "grad_norm": 2.265625, + "grad_norm_var": 0.07011311848958333, + "learning_rate": 0.0001, + "loss": 3.0111, + "loss/crossentropy": 2.699459183216095, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.27421810775995253, + "loss/reg": 0.0, + "step": 20040 + }, + { + "epoch": 0.1319078947368421, + "grad_norm": 2.546875, + "grad_norm_var": 41.75862528483073, + "learning_rate": 0.0001, + "loss": 3.202, + "loss/crossentropy": 2.263700020313263, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.23062770962715148, + "loss/reg": 0.0, + "step": 20050 + }, + { + "epoch": 0.13197368421052633, + "grad_norm": 2.546875, + "grad_norm_var": 0.08103841145833333, + "learning_rate": 0.0001, + "loss": 3.1056, + "loss/crossentropy": 2.4495514154434206, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.2597358673810959, + "loss/reg": 0.0, + "step": 20060 + }, + { + "epoch": 0.13203947368421054, + "grad_norm": 2.21875, + "grad_norm_var": 0.10988667805989584, + "learning_rate": 0.0001, + "loss": 3.1429, + "loss/crossentropy": 2.5027857065200805, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.23284846246242524, + "loss/reg": 0.0, + "step": 20070 + }, + { + "epoch": 0.13210526315789473, + "grad_norm": 2.8125, + "grad_norm_var": 0.4439442952473958, + "learning_rate": 0.0001, + "loss": 3.1051, + "loss/crossentropy": 2.3987682700157165, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.2284790888428688, + "loss/reg": 0.0, + "step": 20080 + }, + { + "epoch": 0.13217105263157894, + "grad_norm": 4.9375, + "grad_norm_var": 0.8553456624348958, + "learning_rate": 0.0001, + "loss": 3.1699, + "loss/crossentropy": 2.415048587322235, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.2737443670630455, + "loss/reg": 0.0, + "step": 20090 + }, + { + "epoch": 0.13223684210526315, + "grad_norm": 2.40625, + "grad_norm_var": 0.5597005208333333, + "learning_rate": 0.0001, + "loss": 3.1123, + "loss/crossentropy": 2.42084618806839, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.26620708405971527, + "loss/reg": 0.0, + "step": 20100 + }, + { + "epoch": 0.13230263157894737, + "grad_norm": 2.125, + "grad_norm_var": 0.022086588541666667, + "learning_rate": 0.0001, + "loss": 3.0069, + "loss/crossentropy": 2.4669047951698304, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.22731948494911194, + "loss/reg": 0.0, + "step": 20110 + }, + { + "epoch": 0.13236842105263158, + "grad_norm": 2.703125, + "grad_norm_var": 0.6456451416015625, + "learning_rate": 0.0001, + "loss": 3.1966, + "loss/crossentropy": 2.344004726409912, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.2402897983789444, + "loss/reg": 0.0, + "step": 20120 + }, + { + "epoch": 0.1324342105263158, + "grad_norm": 2.046875, + "grad_norm_var": 1.062083943684896, + "learning_rate": 0.0001, + "loss": 3.1146, + "loss/crossentropy": 2.375274932384491, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.22684457302093505, + "loss/reg": 0.0, + "step": 20130 + }, + { + "epoch": 0.1325, + "grad_norm": 2.265625, + "grad_norm_var": 0.046076456705729164, + "learning_rate": 0.0001, + "loss": 3.0369, + "loss/crossentropy": 2.366211712360382, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.24463532716035843, + "loss/reg": 0.0, + "step": 20140 + }, + { + "epoch": 0.13256578947368422, + "grad_norm": 1.9375, + "grad_norm_var": 0.12783915201822918, + "learning_rate": 0.0001, + "loss": 3.1105, + "loss/crossentropy": 2.384170186519623, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.28617204576730726, + "loss/reg": 0.0, + "step": 20150 + }, + { + "epoch": 0.13263157894736843, + "grad_norm": 2.171875, + "grad_norm_var": 0.5388671875, + "learning_rate": 0.0001, + "loss": 3.13, + "loss/crossentropy": 2.37620815038681, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.22599491178989412, + "loss/reg": 0.0, + "step": 20160 + }, + { + "epoch": 0.13269736842105262, + "grad_norm": 2.265625, + "grad_norm_var": 0.5671875, + "learning_rate": 0.0001, + "loss": 3.0816, + "loss/crossentropy": 2.3014034748077394, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.21943795531988144, + "loss/reg": 0.0, + "step": 20170 + }, + { + "epoch": 0.13276315789473683, + "grad_norm": 2.4375, + "grad_norm_var": 0.11660868326822917, + "learning_rate": 0.0001, + "loss": 3.1416, + "loss/crossentropy": 2.619466185569763, + "loss/hidden": 3.190625, + "loss/incoh": 0.0, + "loss/logits": 0.3190195769071579, + "loss/reg": 0.0, + "step": 20180 + }, + { + "epoch": 0.13282894736842105, + "grad_norm": 3.296875, + "grad_norm_var": 0.15591532389322918, + "learning_rate": 0.0001, + "loss": 3.128, + "loss/crossentropy": 2.3887166023254394, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.27821466475725176, + "loss/reg": 0.0, + "step": 20190 + }, + { + "epoch": 0.13289473684210526, + "grad_norm": 2.1875, + "grad_norm_var": 0.6188639322916667, + "learning_rate": 0.0001, + "loss": 3.0968, + "loss/crossentropy": 2.49861718416214, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.2538172617554665, + "loss/reg": 0.0, + "step": 20200 + }, + { + "epoch": 0.13296052631578947, + "grad_norm": 2.34375, + "grad_norm_var": 0.20885416666666667, + "learning_rate": 0.0001, + "loss": 3.1671, + "loss/crossentropy": 2.220921754837036, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.23103244453668595, + "loss/reg": 0.0, + "step": 20210 + }, + { + "epoch": 0.1330263157894737, + "grad_norm": 2.625, + "grad_norm_var": 0.18879801432291668, + "learning_rate": 0.0001, + "loss": 3.1111, + "loss/crossentropy": 2.333580756187439, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.2729429990053177, + "loss/reg": 0.0, + "step": 20220 + }, + { + "epoch": 0.1330921052631579, + "grad_norm": 2.171875, + "grad_norm_var": 0.0948883056640625, + "learning_rate": 0.0001, + "loss": 3.0829, + "loss/crossentropy": 2.3158997893333435, + "loss/hidden": 2.6203125, + "loss/incoh": 0.0, + "loss/logits": 0.19884210526943208, + "loss/reg": 0.0, + "step": 20230 + }, + { + "epoch": 0.13315789473684211, + "grad_norm": 2.375, + "grad_norm_var": 0.20250244140625, + "learning_rate": 0.0001, + "loss": 3.0896, + "loss/crossentropy": 2.369309663772583, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.26042123287916186, + "loss/reg": 0.0, + "step": 20240 + }, + { + "epoch": 0.13322368421052633, + "grad_norm": 3.015625, + "grad_norm_var": 0.17239176432291667, + "learning_rate": 0.0001, + "loss": 3.1863, + "loss/crossentropy": 2.2114102065563204, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.21532374620437622, + "loss/reg": 0.0, + "step": 20250 + }, + { + "epoch": 0.1332894736842105, + "grad_norm": 2.328125, + "grad_norm_var": 0.33000869750976564, + "learning_rate": 0.0001, + "loss": 3.0896, + "loss/crossentropy": 2.4173696994781495, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.2683261051774025, + "loss/reg": 0.0, + "step": 20260 + }, + { + "epoch": 0.13335526315789473, + "grad_norm": 2.875, + "grad_norm_var": 0.10729548136393229, + "learning_rate": 0.0001, + "loss": 3.1426, + "loss/crossentropy": 2.3808977246284484, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.3049054339528084, + "loss/reg": 0.0, + "step": 20270 + }, + { + "epoch": 0.13342105263157894, + "grad_norm": 2.484375, + "grad_norm_var": 0.1077789306640625, + "learning_rate": 0.0001, + "loss": 3.1685, + "loss/crossentropy": 2.244146704673767, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.2616411089897156, + "loss/reg": 0.0, + "step": 20280 + }, + { + "epoch": 0.13348684210526315, + "grad_norm": 2.15625, + "grad_norm_var": 0.1076171875, + "learning_rate": 0.0001, + "loss": 3.1354, + "loss/crossentropy": 2.145916444063187, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.21995762139558792, + "loss/reg": 0.0, + "step": 20290 + }, + { + "epoch": 0.13355263157894737, + "grad_norm": 2.34375, + "grad_norm_var": 0.08604227701822917, + "learning_rate": 0.0001, + "loss": 3.0957, + "loss/crossentropy": 2.5188711285591125, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.264768448472023, + "loss/reg": 0.0, + "step": 20300 + }, + { + "epoch": 0.13361842105263158, + "grad_norm": 2.5625, + "grad_norm_var": 0.3717274983723958, + "learning_rate": 0.0001, + "loss": 3.0886, + "loss/crossentropy": 2.392267715930939, + "loss/hidden": 3.1953125, + "loss/incoh": 0.0, + "loss/logits": 0.3307821795344353, + "loss/reg": 0.0, + "step": 20310 + }, + { + "epoch": 0.1336842105263158, + "grad_norm": 2.46875, + "grad_norm_var": 0.39180399576822916, + "learning_rate": 0.0001, + "loss": 3.1034, + "loss/crossentropy": 2.2370328307151794, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.20846432447433472, + "loss/reg": 0.0, + "step": 20320 + }, + { + "epoch": 0.13375, + "grad_norm": 2.515625, + "grad_norm_var": 0.136962890625, + "learning_rate": 0.0001, + "loss": 3.1123, + "loss/crossentropy": 2.358556866645813, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.264502127468586, + "loss/reg": 0.0, + "step": 20330 + }, + { + "epoch": 0.13381578947368422, + "grad_norm": 2.25, + "grad_norm_var": 0.04733072916666667, + "learning_rate": 0.0001, + "loss": 3.1698, + "loss/crossentropy": 2.302602219581604, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.2898910105228424, + "loss/reg": 0.0, + "step": 20340 + }, + { + "epoch": 0.13388157894736843, + "grad_norm": 2.75, + "grad_norm_var": 0.07590230305989583, + "learning_rate": 0.0001, + "loss": 3.155, + "loss/crossentropy": 2.2594828605651855, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.2617498949170113, + "loss/reg": 0.0, + "step": 20350 + }, + { + "epoch": 0.13394736842105262, + "grad_norm": 2.421875, + "grad_norm_var": 0.1132476806640625, + "learning_rate": 0.0001, + "loss": 3.1443, + "loss/crossentropy": 2.6042701482772825, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.2813595399260521, + "loss/reg": 0.0, + "step": 20360 + }, + { + "epoch": 0.13401315789473683, + "grad_norm": 2.265625, + "grad_norm_var": 3.111881782679394e+17, + "learning_rate": 0.0001, + "loss": 3.1969, + "loss/crossentropy": 2.596519351005554, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.24256999790668488, + "loss/reg": 0.0, + "step": 20370 + }, + { + "epoch": 0.13407894736842105, + "grad_norm": 2.515625, + "grad_norm_var": 3.111881782603852e+17, + "learning_rate": 0.0001, + "loss": 3.1352, + "loss/crossentropy": 2.1411483764648436, + "loss/hidden": 3.09375, + "loss/incoh": 0.0, + "loss/logits": 0.2837516859173775, + "loss/reg": 0.0, + "step": 20380 + }, + { + "epoch": 0.13414473684210526, + "grad_norm": 7.5, + "grad_norm_var": 1.6672810872395833, + "learning_rate": 0.0001, + "loss": 3.139, + "loss/crossentropy": 2.224585199356079, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.24173519760370255, + "loss/reg": 0.0, + "step": 20390 + }, + { + "epoch": 0.13421052631578947, + "grad_norm": 2.03125, + "grad_norm_var": 1.8035634358723958, + "learning_rate": 0.0001, + "loss": 3.0216, + "loss/crossentropy": 2.153792452812195, + "loss/hidden": 2.6015625, + "loss/incoh": 0.0, + "loss/logits": 0.19605738371610643, + "loss/reg": 0.0, + "step": 20400 + }, + { + "epoch": 0.1342763157894737, + "grad_norm": 2.703125, + "grad_norm_var": 0.18290913899739583, + "learning_rate": 0.0001, + "loss": 3.1134, + "loss/crossentropy": 2.3031864166259766, + "loss/hidden": 3.05, + "loss/incoh": 0.0, + "loss/logits": 0.26905038952827454, + "loss/reg": 0.0, + "step": 20410 + }, + { + "epoch": 0.1343421052631579, + "grad_norm": 2.5625, + "grad_norm_var": 0.04761454264322917, + "learning_rate": 0.0001, + "loss": 3.0931, + "loss/crossentropy": 2.250312161445618, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.2365841895341873, + "loss/reg": 0.0, + "step": 20420 + }, + { + "epoch": 0.13440789473684212, + "grad_norm": 2.15625, + "grad_norm_var": 0.06604410807291666, + "learning_rate": 0.0001, + "loss": 3.0394, + "loss/crossentropy": 2.1318182408809663, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.2652772217988968, + "loss/reg": 0.0, + "step": 20430 + }, + { + "epoch": 0.13447368421052633, + "grad_norm": 2.21875, + "grad_norm_var": 0.054585774739583336, + "learning_rate": 0.0001, + "loss": 2.9721, + "loss/crossentropy": 2.237068510055542, + "loss/hidden": 2.65625, + "loss/incoh": 0.0, + "loss/logits": 0.21526216119527816, + "loss/reg": 0.0, + "step": 20440 + }, + { + "epoch": 0.13453947368421051, + "grad_norm": 2.203125, + "grad_norm_var": 0.05564676920572917, + "learning_rate": 0.0001, + "loss": 3.0973, + "loss/crossentropy": 2.0180070281028746, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.22148093655705453, + "loss/reg": 0.0, + "step": 20450 + }, + { + "epoch": 0.13460526315789473, + "grad_norm": 2.296875, + "grad_norm_var": 0.058821614583333334, + "learning_rate": 0.0001, + "loss": 3.176, + "loss/crossentropy": 2.1725693702697755, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.26163683980703356, + "loss/reg": 0.0, + "step": 20460 + }, + { + "epoch": 0.13467105263157894, + "grad_norm": 2.5625, + "grad_norm_var": 0.06783625284830729, + "learning_rate": 0.0001, + "loss": 3.0549, + "loss/crossentropy": 1.9785831272602081, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.21865816414356232, + "loss/reg": 0.0, + "step": 20470 + }, + { + "epoch": 0.13473684210526315, + "grad_norm": 2.859375, + "grad_norm_var": 0.12560806274414063, + "learning_rate": 0.0001, + "loss": 3.1254, + "loss/crossentropy": 2.316719186306, + "loss/hidden": 2.690625, + "loss/incoh": 0.0, + "loss/logits": 0.23244183510541916, + "loss/reg": 0.0, + "step": 20480 + }, + { + "epoch": 0.13480263157894737, + "grad_norm": 2.328125, + "grad_norm_var": 0.1337554931640625, + "learning_rate": 0.0001, + "loss": 3.0912, + "loss/crossentropy": 2.2532522082328796, + "loss/hidden": 2.9703125, + "loss/incoh": 0.0, + "loss/logits": 0.2548024535179138, + "loss/reg": 0.0, + "step": 20490 + }, + { + "epoch": 0.13486842105263158, + "grad_norm": 2.203125, + "grad_norm_var": 0.05217692057291667, + "learning_rate": 0.0001, + "loss": 3.0455, + "loss/crossentropy": 2.3155321717262267, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.26803631633520125, + "loss/reg": 0.0, + "step": 20500 + }, + { + "epoch": 0.1349342105263158, + "grad_norm": 2.296875, + "grad_norm_var": 0.4710896809895833, + "learning_rate": 0.0001, + "loss": 3.1847, + "loss/crossentropy": 2.4292261719703676, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.26074815839529036, + "loss/reg": 0.0, + "step": 20510 + }, + { + "epoch": 0.135, + "grad_norm": 2.28125, + "grad_norm_var": 0.06902567545572917, + "learning_rate": 0.0001, + "loss": 3.1187, + "loss/crossentropy": 2.311373507976532, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.2725979134440422, + "loss/reg": 0.0, + "step": 20520 + }, + { + "epoch": 0.13506578947368422, + "grad_norm": 2.453125, + "grad_norm_var": 0.28644917805989584, + "learning_rate": 0.0001, + "loss": 3.0904, + "loss/crossentropy": 1.8008847087621689, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.2606606900691986, + "loss/reg": 0.0, + "step": 20530 + }, + { + "epoch": 0.1351315789473684, + "grad_norm": 2.125, + "grad_norm_var": 0.06108296712239583, + "learning_rate": 0.0001, + "loss": 3.1547, + "loss/crossentropy": 2.0716704607009886, + "loss/hidden": 2.9484375, + "loss/incoh": 0.0, + "loss/logits": 0.21668420732021332, + "loss/reg": 0.0, + "step": 20540 + }, + { + "epoch": 0.13519736842105262, + "grad_norm": 2.84375, + "grad_norm_var": 0.35935872395833335, + "learning_rate": 0.0001, + "loss": 3.1812, + "loss/crossentropy": 2.343623089790344, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.25094970166683195, + "loss/reg": 0.0, + "step": 20550 + }, + { + "epoch": 0.13526315789473684, + "grad_norm": 2.609375, + "grad_norm_var": 0.20611979166666666, + "learning_rate": 0.0001, + "loss": 3.1019, + "loss/crossentropy": 2.1298214733600616, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.3111059933900833, + "loss/reg": 0.0, + "step": 20560 + }, + { + "epoch": 0.13532894736842105, + "grad_norm": 2.5, + "grad_norm_var": 0.07794596354166666, + "learning_rate": 0.0001, + "loss": 3.0833, + "loss/crossentropy": 2.1468781232833862, + "loss/hidden": 2.9421875, + "loss/incoh": 0.0, + "loss/logits": 0.242007839679718, + "loss/reg": 0.0, + "step": 20570 + }, + { + "epoch": 0.13539473684210526, + "grad_norm": 2.4375, + "grad_norm_var": 0.3861223856608073, + "learning_rate": 0.0001, + "loss": 3.056, + "loss/crossentropy": 2.0755307257175444, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.21787920594215393, + "loss/reg": 0.0, + "step": 20580 + }, + { + "epoch": 0.13546052631578948, + "grad_norm": 2.09375, + "grad_norm_var": 0.048713938395182295, + "learning_rate": 0.0001, + "loss": 3.0617, + "loss/crossentropy": 2.167508864402771, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.2543604165315628, + "loss/reg": 0.0, + "step": 20590 + }, + { + "epoch": 0.1355263157894737, + "grad_norm": 4.40625, + "grad_norm_var": 0.31427408854166666, + "learning_rate": 0.0001, + "loss": 3.0881, + "loss/crossentropy": 2.1288257122039793, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.2275502547621727, + "loss/reg": 0.0, + "step": 20600 + }, + { + "epoch": 0.1355921052631579, + "grad_norm": 2.59375, + "grad_norm_var": 0.3212257385253906, + "learning_rate": 0.0001, + "loss": 3.0587, + "loss/crossentropy": 2.373918867111206, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.25912273228168486, + "loss/reg": 0.0, + "step": 20610 + }, + { + "epoch": 0.13565789473684212, + "grad_norm": 2.671875, + "grad_norm_var": 0.05368245442708333, + "learning_rate": 0.0001, + "loss": 3.0912, + "loss/crossentropy": 2.0399203658103944, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.23557177633047105, + "loss/reg": 0.0, + "step": 20620 + }, + { + "epoch": 0.1357236842105263, + "grad_norm": 2.703125, + "grad_norm_var": 0.11100031534830729, + "learning_rate": 0.0001, + "loss": 3.1051, + "loss/crossentropy": 2.5496551632881164, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.2645132452249527, + "loss/reg": 0.0, + "step": 20630 + }, + { + "epoch": 0.13578947368421052, + "grad_norm": 2.359375, + "grad_norm_var": 0.3220855712890625, + "learning_rate": 0.0001, + "loss": 3.0665, + "loss/crossentropy": 2.226586413383484, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.22172508686780928, + "loss/reg": 0.0, + "step": 20640 + }, + { + "epoch": 0.13585526315789473, + "grad_norm": 2.578125, + "grad_norm_var": 0.2806925455729167, + "learning_rate": 0.0001, + "loss": 3.1914, + "loss/crossentropy": 2.2754149079322814, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.27330992817878724, + "loss/reg": 0.0, + "step": 20650 + }, + { + "epoch": 0.13592105263157894, + "grad_norm": 2.34375, + "grad_norm_var": 0.05778172810872396, + "learning_rate": 0.0001, + "loss": 3.1347, + "loss/crossentropy": 2.1621429324150085, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.2567806988954544, + "loss/reg": 0.0, + "step": 20660 + }, + { + "epoch": 0.13598684210526316, + "grad_norm": 2.375, + "grad_norm_var": 0.2951637268066406, + "learning_rate": 0.0001, + "loss": 3.0649, + "loss/crossentropy": 2.342224645614624, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.2490961804986, + "loss/reg": 0.0, + "step": 20670 + }, + { + "epoch": 0.13605263157894737, + "grad_norm": 2.109375, + "grad_norm_var": 0.27790425618489584, + "learning_rate": 0.0001, + "loss": 3.1081, + "loss/crossentropy": 2.522631120681763, + "loss/hidden": 2.934375, + "loss/incoh": 0.0, + "loss/logits": 0.28559967428445815, + "loss/reg": 0.0, + "step": 20680 + }, + { + "epoch": 0.13611842105263158, + "grad_norm": 2.03125, + "grad_norm_var": 0.07203369140625, + "learning_rate": 0.0001, + "loss": 3.0501, + "loss/crossentropy": 2.1872968673706055, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.21910873502492906, + "loss/reg": 0.0, + "step": 20690 + }, + { + "epoch": 0.1361842105263158, + "grad_norm": 2.40625, + "grad_norm_var": 0.06353251139322917, + "learning_rate": 0.0001, + "loss": 3.0469, + "loss/crossentropy": 2.0313059210777284, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.21065909266471863, + "loss/reg": 0.0, + "step": 20700 + }, + { + "epoch": 0.13625, + "grad_norm": 2.1875, + "grad_norm_var": 0.12810872395833334, + "learning_rate": 0.0001, + "loss": 3.1641, + "loss/crossentropy": 1.9590769171714784, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.2205381214618683, + "loss/reg": 0.0, + "step": 20710 + }, + { + "epoch": 0.13631578947368422, + "grad_norm": 2.625, + "grad_norm_var": 0.039526112874348956, + "learning_rate": 0.0001, + "loss": 3.0038, + "loss/crossentropy": 2.181920811533928, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.20574837550520897, + "loss/reg": 0.0, + "step": 20720 + }, + { + "epoch": 0.1363815789473684, + "grad_norm": 2.1875, + "grad_norm_var": 0.04170710245768229, + "learning_rate": 0.0001, + "loss": 3.0758, + "loss/crossentropy": 2.5629841327667235, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.25942468345165254, + "loss/reg": 0.0, + "step": 20730 + }, + { + "epoch": 0.13644736842105262, + "grad_norm": 2.28125, + "grad_norm_var": 0.04888916015625, + "learning_rate": 0.0001, + "loss": 3.1239, + "loss/crossentropy": 2.5672937512397764, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.2589055135846138, + "loss/reg": 0.0, + "step": 20740 + }, + { + "epoch": 0.13651315789473684, + "grad_norm": 2.453125, + "grad_norm_var": 0.051558430989583334, + "learning_rate": 0.0001, + "loss": 3.1031, + "loss/crossentropy": 2.598326253890991, + "loss/hidden": 2.678125, + "loss/incoh": 0.0, + "loss/logits": 0.2481001317501068, + "loss/reg": 0.0, + "step": 20750 + }, + { + "epoch": 0.13657894736842105, + "grad_norm": 2.265625, + "grad_norm_var": 0.08925374348958333, + "learning_rate": 0.0001, + "loss": 3.0563, + "loss/crossentropy": 2.403074288368225, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.25444435328245163, + "loss/reg": 0.0, + "step": 20760 + }, + { + "epoch": 0.13664473684210526, + "grad_norm": 2.453125, + "grad_norm_var": 0.33211161295572916, + "learning_rate": 0.0001, + "loss": 3.1043, + "loss/crossentropy": 2.161120080947876, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.2631052315235138, + "loss/reg": 0.0, + "step": 20770 + }, + { + "epoch": 0.13671052631578948, + "grad_norm": 2.203125, + "grad_norm_var": 0.2020416259765625, + "learning_rate": 0.0001, + "loss": 3.0514, + "loss/crossentropy": 2.601893973350525, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.23912052661180497, + "loss/reg": 0.0, + "step": 20780 + }, + { + "epoch": 0.1367763157894737, + "grad_norm": 2.28125, + "grad_norm_var": 0.11573893229166667, + "learning_rate": 0.0001, + "loss": 3.0923, + "loss/crossentropy": 2.3733190536499023, + "loss/hidden": 2.66875, + "loss/incoh": 0.0, + "loss/logits": 0.21903542578220367, + "loss/reg": 0.0, + "step": 20790 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 2.59375, + "grad_norm_var": 0.08520406087239583, + "learning_rate": 0.0001, + "loss": 3.1174, + "loss/crossentropy": 2.344668173789978, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.2994943633675575, + "loss/reg": 0.0, + "step": 20800 + }, + { + "epoch": 0.13690789473684212, + "grad_norm": 2.421875, + "grad_norm_var": 3.350255903434211e+17, + "learning_rate": 0.0001, + "loss": 3.2853, + "loss/crossentropy": 2.1936369478702544, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.2379646047949791, + "loss/reg": 0.0, + "step": 20810 + }, + { + "epoch": 0.1369736842105263, + "grad_norm": 2.125, + "grad_norm_var": 0.025055948893229166, + "learning_rate": 0.0001, + "loss": 3.103, + "loss/crossentropy": 2.2369776248931883, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.2727003887295723, + "loss/reg": 0.0, + "step": 20820 + }, + { + "epoch": 0.13703947368421052, + "grad_norm": 2.5625, + "grad_norm_var": 0.033788045247395836, + "learning_rate": 0.0001, + "loss": 3.0878, + "loss/crossentropy": 2.2766017496585844, + "loss/hidden": 2.6875, + "loss/incoh": 0.0, + "loss/logits": 0.21165435910224914, + "loss/reg": 0.0, + "step": 20830 + }, + { + "epoch": 0.13710526315789473, + "grad_norm": 5.84375, + "grad_norm_var": 0.75269775390625, + "learning_rate": 0.0001, + "loss": 3.1106, + "loss/crossentropy": 2.123941707611084, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.2290189027786255, + "loss/reg": 0.0, + "step": 20840 + }, + { + "epoch": 0.13717105263157894, + "grad_norm": 2.234375, + "grad_norm_var": 0.7671132405598958, + "learning_rate": 0.0001, + "loss": 3.0839, + "loss/crossentropy": 2.239221286773682, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.2762044548988342, + "loss/reg": 0.0, + "step": 20850 + }, + { + "epoch": 0.13723684210526316, + "grad_norm": 2.375, + "grad_norm_var": 0.1312896728515625, + "learning_rate": 0.0001, + "loss": 3.1207, + "loss/crossentropy": 2.4664002418518067, + "loss/hidden": 2.659375, + "loss/incoh": 0.0, + "loss/logits": 0.2304532825946808, + "loss/reg": 0.0, + "step": 20860 + }, + { + "epoch": 0.13730263157894737, + "grad_norm": 2.28125, + "grad_norm_var": 0.1725847880045573, + "learning_rate": 0.0001, + "loss": 3.1318, + "loss/crossentropy": 2.3205449104309084, + "loss/hidden": 2.715625, + "loss/incoh": 0.0, + "loss/logits": 0.22169369757175444, + "loss/reg": 0.0, + "step": 20870 + }, + { + "epoch": 0.13736842105263158, + "grad_norm": 2.375, + "grad_norm_var": 0.046533203125, + "learning_rate": 0.0001, + "loss": 3.1764, + "loss/crossentropy": 2.2292275547981264, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.26578798294067385, + "loss/reg": 0.0, + "step": 20880 + }, + { + "epoch": 0.1374342105263158, + "grad_norm": 2.21875, + "grad_norm_var": 0.013053385416666667, + "learning_rate": 0.0001, + "loss": 3.0906, + "loss/crossentropy": 2.4730883955955507, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.2600066691637039, + "loss/reg": 0.0, + "step": 20890 + }, + { + "epoch": 0.1375, + "grad_norm": 2.78125, + "grad_norm_var": 0.03638916015625, + "learning_rate": 0.0001, + "loss": 3.1146, + "loss/crossentropy": 2.389441525936127, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.2204804763197899, + "loss/reg": 0.0, + "step": 20900 + }, + { + "epoch": 0.1375657894736842, + "grad_norm": 2.015625, + "grad_norm_var": 0.05347391764322917, + "learning_rate": 0.0001, + "loss": 3.1005, + "loss/crossentropy": 2.182996892929077, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.23823060542345048, + "loss/reg": 0.0, + "step": 20910 + }, + { + "epoch": 0.1376315789473684, + "grad_norm": 2.328125, + "grad_norm_var": 0.0274810791015625, + "learning_rate": 0.0001, + "loss": 3.1323, + "loss/crossentropy": 2.3811920285224915, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.22427577376365662, + "loss/reg": 0.0, + "step": 20920 + }, + { + "epoch": 0.13769736842105262, + "grad_norm": 2.234375, + "grad_norm_var": 0.0668609619140625, + "learning_rate": 0.0001, + "loss": 3.1591, + "loss/crossentropy": 2.106644082069397, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.27698966562747956, + "loss/reg": 0.0, + "step": 20930 + }, + { + "epoch": 0.13776315789473684, + "grad_norm": 2.53125, + "grad_norm_var": 0.07274983723958334, + "learning_rate": 0.0001, + "loss": 3.165, + "loss/crossentropy": 2.3118181228637695, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.30020255893468856, + "loss/reg": 0.0, + "step": 20940 + }, + { + "epoch": 0.13782894736842105, + "grad_norm": 2.21875, + "grad_norm_var": 0.025804646809895835, + "learning_rate": 0.0001, + "loss": 3.1041, + "loss/crossentropy": 2.429260182380676, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.2653680741786957, + "loss/reg": 0.0, + "step": 20950 + }, + { + "epoch": 0.13789473684210526, + "grad_norm": 3.25, + "grad_norm_var": 0.0960528055826823, + "learning_rate": 0.0001, + "loss": 3.0853, + "loss/crossentropy": 1.888709381222725, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.21381159387528897, + "loss/reg": 0.0, + "step": 20960 + }, + { + "epoch": 0.13796052631578948, + "grad_norm": 2.140625, + "grad_norm_var": 0.09798355102539062, + "learning_rate": 0.0001, + "loss": 3.1087, + "loss/crossentropy": 2.210513544082642, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.24400804787874222, + "loss/reg": 0.0, + "step": 20970 + }, + { + "epoch": 0.1380263157894737, + "grad_norm": 2.0, + "grad_norm_var": 0.1358062744140625, + "learning_rate": 0.0001, + "loss": 3.1309, + "loss/crossentropy": 2.3269863963127135, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.25037053376436236, + "loss/reg": 0.0, + "step": 20980 + }, + { + "epoch": 0.1380921052631579, + "grad_norm": 2.03125, + "grad_norm_var": 0.13828125, + "learning_rate": 0.0001, + "loss": 3.103, + "loss/crossentropy": 2.4014554262161254, + "loss/hidden": 2.703125, + "loss/incoh": 0.0, + "loss/logits": 0.22934938669204713, + "loss/reg": 0.0, + "step": 20990 + }, + { + "epoch": 0.13815789473684212, + "grad_norm": 2.34375, + "grad_norm_var": 0.0584381103515625, + "learning_rate": 0.0001, + "loss": 3.1088, + "loss/crossentropy": 2.208330225944519, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.23984504938125611, + "loss/reg": 0.0, + "step": 21000 + }, + { + "epoch": 0.1382236842105263, + "grad_norm": 1.9296875, + "grad_norm_var": 0.1774614969889323, + "learning_rate": 0.0001, + "loss": 3.0588, + "loss/crossentropy": 2.496232438087463, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.29541969746351243, + "loss/reg": 0.0, + "step": 21010 + }, + { + "epoch": 0.13828947368421052, + "grad_norm": 2.46875, + "grad_norm_var": 0.4096514383951823, + "learning_rate": 0.0001, + "loss": 3.1337, + "loss/crossentropy": 2.167738914489746, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.23880672752857207, + "loss/reg": 0.0, + "step": 21020 + }, + { + "epoch": 0.13835526315789473, + "grad_norm": 2.34375, + "grad_norm_var": 0.06155192057291667, + "learning_rate": 0.0001, + "loss": 3.1325, + "loss/crossentropy": 2.1132714807987214, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.227035228908062, + "loss/reg": 0.0, + "step": 21030 + }, + { + "epoch": 0.13842105263157894, + "grad_norm": 2.3125, + "grad_norm_var": 0.09390360514322917, + "learning_rate": 0.0001, + "loss": 3.1269, + "loss/crossentropy": 2.467393732070923, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.24615048468112946, + "loss/reg": 0.0, + "step": 21040 + }, + { + "epoch": 0.13848684210526316, + "grad_norm": 2.171875, + "grad_norm_var": 0.035090128580729164, + "learning_rate": 0.0001, + "loss": 3.1166, + "loss/crossentropy": 2.2899758577346803, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.25930332988500593, + "loss/reg": 0.0, + "step": 21050 + }, + { + "epoch": 0.13855263157894737, + "grad_norm": 2.609375, + "grad_norm_var": 0.08716812133789062, + "learning_rate": 0.0001, + "loss": 3.074, + "loss/crossentropy": 2.3258708715438843, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.23002547025680542, + "loss/reg": 0.0, + "step": 21060 + }, + { + "epoch": 0.13861842105263159, + "grad_norm": 2.203125, + "grad_norm_var": 0.10960667928059896, + "learning_rate": 0.0001, + "loss": 3.1011, + "loss/crossentropy": 2.368458116054535, + "loss/hidden": 2.85625, + "loss/incoh": 0.0, + "loss/logits": 0.26667492985725405, + "loss/reg": 0.0, + "step": 21070 + }, + { + "epoch": 0.1386842105263158, + "grad_norm": 2.1875, + "grad_norm_var": 0.053807576497395836, + "learning_rate": 0.0001, + "loss": 3.0992, + "loss/crossentropy": 2.268073225021362, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.23710739463567734, + "loss/reg": 0.0, + "step": 21080 + }, + { + "epoch": 0.13875, + "grad_norm": 2.15625, + "grad_norm_var": 0.7524088541666667, + "learning_rate": 0.0001, + "loss": 3.123, + "loss/crossentropy": 2.116519570350647, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.2593864217400551, + "loss/reg": 0.0, + "step": 21090 + }, + { + "epoch": 0.1388157894736842, + "grad_norm": 2.125, + "grad_norm_var": 0.10280329386393229, + "learning_rate": 0.0001, + "loss": 3.0788, + "loss/crossentropy": 2.49674973487854, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.2637839734554291, + "loss/reg": 0.0, + "step": 21100 + }, + { + "epoch": 0.1388815789473684, + "grad_norm": 2.78125, + "grad_norm_var": 0.06420059204101562, + "learning_rate": 0.0001, + "loss": 3.0901, + "loss/crossentropy": 2.142958414554596, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.28233732134103773, + "loss/reg": 0.0, + "step": 21110 + }, + { + "epoch": 0.13894736842105262, + "grad_norm": 2.34375, + "grad_norm_var": 0.09973551432291666, + "learning_rate": 0.0001, + "loss": 3.1008, + "loss/crossentropy": 2.300196385383606, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.24371459782123567, + "loss/reg": 0.0, + "step": 21120 + }, + { + "epoch": 0.13901315789473684, + "grad_norm": 2.375, + "grad_norm_var": 0.07563247680664062, + "learning_rate": 0.0001, + "loss": 3.0722, + "loss/crossentropy": 2.424953353404999, + "loss/hidden": 3.08125, + "loss/incoh": 0.0, + "loss/logits": 0.28959580361843107, + "loss/reg": 0.0, + "step": 21130 + }, + { + "epoch": 0.13907894736842105, + "grad_norm": 2.484375, + "grad_norm_var": 0.05977554321289062, + "learning_rate": 0.0001, + "loss": 3.1188, + "loss/crossentropy": 2.3048367381095884, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.26107140332460405, + "loss/reg": 0.0, + "step": 21140 + }, + { + "epoch": 0.13914473684210527, + "grad_norm": 2.40625, + "grad_norm_var": 0.17164713541666668, + "learning_rate": 0.0001, + "loss": 3.1765, + "loss/crossentropy": 2.376292657852173, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.24798475652933122, + "loss/reg": 0.0, + "step": 21150 + }, + { + "epoch": 0.13921052631578948, + "grad_norm": 2.34375, + "grad_norm_var": 0.0533355712890625, + "learning_rate": 0.0001, + "loss": 3.0556, + "loss/crossentropy": 2.620091509819031, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.2451164111495018, + "loss/reg": 0.0, + "step": 21160 + }, + { + "epoch": 0.1392763157894737, + "grad_norm": 2.328125, + "grad_norm_var": 0.07333882649739583, + "learning_rate": 0.0001, + "loss": 3.1244, + "loss/crossentropy": 2.2057873249053954, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.26459684371948244, + "loss/reg": 0.0, + "step": 21170 + }, + { + "epoch": 0.1393421052631579, + "grad_norm": 2.640625, + "grad_norm_var": 0.024214680989583334, + "learning_rate": 0.0001, + "loss": 3.0668, + "loss/crossentropy": 2.087160420417786, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.2084551602602005, + "loss/reg": 0.0, + "step": 21180 + }, + { + "epoch": 0.1394078947368421, + "grad_norm": 2.0625, + "grad_norm_var": 0.10177408854166667, + "learning_rate": 0.0001, + "loss": 3.0369, + "loss/crossentropy": 2.4722538352012635, + "loss/hidden": 2.959375, + "loss/incoh": 0.0, + "loss/logits": 0.28627206683158873, + "loss/reg": 0.0, + "step": 21190 + }, + { + "epoch": 0.1394736842105263, + "grad_norm": 2.4375, + "grad_norm_var": 3.921613566080729, + "learning_rate": 0.0001, + "loss": 3.2147, + "loss/crossentropy": 2.103289079666138, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.2449532501399517, + "loss/reg": 0.0, + "step": 21200 + }, + { + "epoch": 0.13953947368421052, + "grad_norm": 2.71875, + "grad_norm_var": 3.8165679931640626, + "learning_rate": 0.0001, + "loss": 3.123, + "loss/crossentropy": 2.377441930770874, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.27957661896944047, + "loss/reg": 0.0, + "step": 21210 + }, + { + "epoch": 0.13960526315789473, + "grad_norm": 3.34375, + "grad_norm_var": 0.18837788899739583, + "learning_rate": 0.0001, + "loss": 3.1171, + "loss/crossentropy": 2.218759763240814, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.2158270835876465, + "loss/reg": 0.0, + "step": 21220 + }, + { + "epoch": 0.13967105263157895, + "grad_norm": 2.46875, + "grad_norm_var": 14.1974609375, + "learning_rate": 0.0001, + "loss": 3.1327, + "loss/crossentropy": 2.4489439368247985, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.23917225003242493, + "loss/reg": 0.0, + "step": 21230 + }, + { + "epoch": 0.13973684210526316, + "grad_norm": 3.140625, + "grad_norm_var": 14.437064361572265, + "learning_rate": 0.0001, + "loss": 3.0414, + "loss/crossentropy": 2.2649194478988646, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.21438237726688386, + "loss/reg": 0.0, + "step": 21240 + }, + { + "epoch": 0.13980263157894737, + "grad_norm": 2.046875, + "grad_norm_var": 2.151969401041667, + "learning_rate": 0.0001, + "loss": 3.1748, + "loss/crossentropy": 2.350678837299347, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.2586789205670357, + "loss/reg": 0.0, + "step": 21250 + }, + { + "epoch": 0.1398684210526316, + "grad_norm": 2.203125, + "grad_norm_var": 0.07167867024739584, + "learning_rate": 0.0001, + "loss": 3.1436, + "loss/crossentropy": 2.383778703212738, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.30858664512634276, + "loss/reg": 0.0, + "step": 21260 + }, + { + "epoch": 0.1399342105263158, + "grad_norm": 2.234375, + "grad_norm_var": 0.04163004557291667, + "learning_rate": 0.0001, + "loss": 3.0829, + "loss/crossentropy": 2.4344274759292603, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.24489458352327348, + "loss/reg": 0.0, + "step": 21270 + }, + { + "epoch": 0.14, + "grad_norm": 2.15625, + "grad_norm_var": 0.04241129557291667, + "learning_rate": 0.0001, + "loss": 3.0751, + "loss/crossentropy": 2.59662504196167, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.23801603466272353, + "loss/reg": 0.0, + "step": 21280 + }, + { + "epoch": 0.1400657894736842, + "grad_norm": 2.328125, + "grad_norm_var": 1.7194010416666667, + "learning_rate": 0.0001, + "loss": 3.0867, + "loss/crossentropy": 2.466573119163513, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.29062790870666505, + "loss/reg": 0.0, + "step": 21290 + }, + { + "epoch": 0.1401315789473684, + "grad_norm": 2.609375, + "grad_norm_var": 0.19259440104166667, + "learning_rate": 0.0001, + "loss": 3.2042, + "loss/crossentropy": 2.5092820644378664, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.3163674846291542, + "loss/reg": 0.0, + "step": 21300 + }, + { + "epoch": 0.14019736842105263, + "grad_norm": 2.046875, + "grad_norm_var": 0.09862874348958334, + "learning_rate": 0.0001, + "loss": 3.0485, + "loss/crossentropy": 2.1548155784606933, + "loss/hidden": 2.64375, + "loss/incoh": 0.0, + "loss/logits": 0.20280121639370918, + "loss/reg": 0.0, + "step": 21310 + }, + { + "epoch": 0.14026315789473684, + "grad_norm": 2.203125, + "grad_norm_var": 0.25152587890625, + "learning_rate": 0.0001, + "loss": 3.0701, + "loss/crossentropy": 2.1562862396240234, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.1931696727871895, + "loss/reg": 0.0, + "step": 21320 + }, + { + "epoch": 0.14032894736842105, + "grad_norm": 7.6875, + "grad_norm_var": 1.9003896077473958, + "learning_rate": 0.0001, + "loss": 3.1242, + "loss/crossentropy": 2.4868319749832155, + "loss/hidden": 3.2, + "loss/incoh": 0.0, + "loss/logits": 0.263777095079422, + "loss/reg": 0.0, + "step": 21330 + }, + { + "epoch": 0.14039473684210527, + "grad_norm": 2.140625, + "grad_norm_var": 2.187555948893229, + "learning_rate": 0.0001, + "loss": 3.0713, + "loss/crossentropy": 2.4403932213783266, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.239518903195858, + "loss/reg": 0.0, + "step": 21340 + }, + { + "epoch": 0.14046052631578948, + "grad_norm": 2.9375, + "grad_norm_var": 0.92603759765625, + "learning_rate": 0.0001, + "loss": 3.161, + "loss/crossentropy": 2.637459659576416, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.2770415723323822, + "loss/reg": 0.0, + "step": 21350 + }, + { + "epoch": 0.1405263157894737, + "grad_norm": 2.1875, + "grad_norm_var": 0.6560373942057292, + "learning_rate": 0.0001, + "loss": 3.1023, + "loss/crossentropy": 2.4345417737960817, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.23416574895381928, + "loss/reg": 0.0, + "step": 21360 + }, + { + "epoch": 0.1405921052631579, + "grad_norm": 4.625, + "grad_norm_var": 0.39641927083333334, + "learning_rate": 0.0001, + "loss": 3.0819, + "loss/crossentropy": 2.3080653309822083, + "loss/hidden": 2.6703125, + "loss/incoh": 0.0, + "loss/logits": 0.203302001953125, + "loss/reg": 0.0, + "step": 21370 + }, + { + "epoch": 0.1406578947368421, + "grad_norm": 2.4375, + "grad_norm_var": 0.3663736979166667, + "learning_rate": 0.0001, + "loss": 3.005, + "loss/crossentropy": 2.312281048297882, + "loss/hidden": 2.75625, + "loss/incoh": 0.0, + "loss/logits": 0.22557048946619035, + "loss/reg": 0.0, + "step": 21380 + }, + { + "epoch": 0.1407236842105263, + "grad_norm": 2.203125, + "grad_norm_var": 0.0878326416015625, + "learning_rate": 0.0001, + "loss": 3.0862, + "loss/crossentropy": 2.725596809387207, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.26989305168390276, + "loss/reg": 0.0, + "step": 21390 + }, + { + "epoch": 0.14078947368421052, + "grad_norm": 2.59375, + "grad_norm_var": 0.7812662760416667, + "learning_rate": 0.0001, + "loss": 3.1293, + "loss/crossentropy": 2.4127392411231994, + "loss/hidden": 2.9703125, + "loss/incoh": 0.0, + "loss/logits": 0.28430653512477877, + "loss/reg": 0.0, + "step": 21400 + }, + { + "epoch": 0.14085526315789473, + "grad_norm": 2.546875, + "grad_norm_var": 0.76865234375, + "learning_rate": 0.0001, + "loss": 3.1054, + "loss/crossentropy": 2.3319249004125595, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.22504624500870704, + "loss/reg": 0.0, + "step": 21410 + }, + { + "epoch": 0.14092105263157895, + "grad_norm": 2.09375, + "grad_norm_var": 0.09716389973958334, + "learning_rate": 0.0001, + "loss": 3.0617, + "loss/crossentropy": 2.6645477771759034, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.24796359091997147, + "loss/reg": 0.0, + "step": 21420 + }, + { + "epoch": 0.14098684210526316, + "grad_norm": 2.609375, + "grad_norm_var": 4.503599619195098e+17, + "learning_rate": 0.0001, + "loss": 3.2735, + "loss/crossentropy": 2.1648690342903136, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.23503894209861756, + "loss/reg": 0.0, + "step": 21430 + }, + { + "epoch": 0.14105263157894737, + "grad_norm": 2.078125, + "grad_norm_var": 0.07038472493489584, + "learning_rate": 0.0001, + "loss": 3.0659, + "loss/crossentropy": 2.477909338474274, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.2327214926481247, + "loss/reg": 0.0, + "step": 21440 + }, + { + "epoch": 0.1411184210526316, + "grad_norm": 2.234375, + "grad_norm_var": 0.03515625, + "learning_rate": 0.0001, + "loss": 3.057, + "loss/crossentropy": 2.354469347000122, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.23355985432863235, + "loss/reg": 0.0, + "step": 21450 + }, + { + "epoch": 0.1411842105263158, + "grad_norm": 3.34375, + "grad_norm_var": 0.18144124348958332, + "learning_rate": 0.0001, + "loss": 3.1157, + "loss/crossentropy": 2.2400832891464235, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.24985998570919038, + "loss/reg": 0.0, + "step": 21460 + }, + { + "epoch": 0.14125, + "grad_norm": 5.65625, + "grad_norm_var": 0.7304921468098958, + "learning_rate": 0.0001, + "loss": 3.0675, + "loss/crossentropy": 2.324231135845184, + "loss/hidden": 2.6875, + "loss/incoh": 0.0, + "loss/logits": 0.22793666571378707, + "loss/reg": 0.0, + "step": 21470 + }, + { + "epoch": 0.1413157894736842, + "grad_norm": 2.109375, + "grad_norm_var": 0.7799641927083333, + "learning_rate": 0.0001, + "loss": 3.1565, + "loss/crossentropy": 2.4094609022140503, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.21761243641376496, + "loss/reg": 0.0, + "step": 21480 + }, + { + "epoch": 0.1413815789473684, + "grad_norm": 2.765625, + "grad_norm_var": 0.1587310791015625, + "learning_rate": 0.0001, + "loss": 3.0847, + "loss/crossentropy": 1.9431841492652893, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.24099752232432364, + "loss/reg": 0.0, + "step": 21490 + }, + { + "epoch": 0.14144736842105263, + "grad_norm": 2.1875, + "grad_norm_var": 0.0576324462890625, + "learning_rate": 0.0001, + "loss": 3.0518, + "loss/crossentropy": 2.5469526767730715, + "loss/hidden": 2.7234375, + "loss/incoh": 0.0, + "loss/logits": 0.24212357103824617, + "loss/reg": 0.0, + "step": 21500 + }, + { + "epoch": 0.14151315789473684, + "grad_norm": 2.6875, + "grad_norm_var": 0.07248942057291667, + "learning_rate": 0.0001, + "loss": 3.1081, + "loss/crossentropy": 2.303622233867645, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.23643973991274833, + "loss/reg": 0.0, + "step": 21510 + }, + { + "epoch": 0.14157894736842105, + "grad_norm": 2.140625, + "grad_norm_var": 0.16367085774739584, + "learning_rate": 0.0001, + "loss": 3.0839, + "loss/crossentropy": 2.178890359401703, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.23347833156585693, + "loss/reg": 0.0, + "step": 21520 + }, + { + "epoch": 0.14164473684210527, + "grad_norm": 2.4375, + "grad_norm_var": 0.0575836181640625, + "learning_rate": 0.0001, + "loss": 3.1122, + "loss/crossentropy": 2.4304057359695435, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.2439558282494545, + "loss/reg": 0.0, + "step": 21530 + }, + { + "epoch": 0.14171052631578948, + "grad_norm": 2.765625, + "grad_norm_var": 0.1728668212890625, + "learning_rate": 0.0001, + "loss": 3.1416, + "loss/crossentropy": 2.111624151468277, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.2380165532231331, + "loss/reg": 0.0, + "step": 21540 + }, + { + "epoch": 0.1417763157894737, + "grad_norm": 2.6875, + "grad_norm_var": 0.17907613118489582, + "learning_rate": 0.0001, + "loss": 3.0817, + "loss/crossentropy": 2.2606616258621215, + "loss/hidden": 3.0546875, + "loss/incoh": 0.0, + "loss/logits": 0.24319592714309693, + "loss/reg": 0.0, + "step": 21550 + }, + { + "epoch": 0.1418421052631579, + "grad_norm": 2.90625, + "grad_norm_var": 0.065234375, + "learning_rate": 0.0001, + "loss": 3.0592, + "loss/crossentropy": 2.2833418786525725, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.24779771864414216, + "loss/reg": 0.0, + "step": 21560 + }, + { + "epoch": 0.1419078947368421, + "grad_norm": 2.1875, + "grad_norm_var": 0.19248758951822917, + "learning_rate": 0.0001, + "loss": 3.1323, + "loss/crossentropy": 2.2865166664123535, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.2382546842098236, + "loss/reg": 0.0, + "step": 21570 + }, + { + "epoch": 0.1419736842105263, + "grad_norm": 2.34375, + "grad_norm_var": 0.19429931640625, + "learning_rate": 0.0001, + "loss": 3.0883, + "loss/crossentropy": 2.37155544757843, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.28304021507501603, + "loss/reg": 0.0, + "step": 21580 + }, + { + "epoch": 0.14203947368421052, + "grad_norm": 2.171875, + "grad_norm_var": 0.06051432291666667, + "learning_rate": 0.0001, + "loss": 3.1666, + "loss/crossentropy": 2.0952234268188477, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.28353812396526334, + "loss/reg": 0.0, + "step": 21590 + }, + { + "epoch": 0.14210526315789473, + "grad_norm": 2.203125, + "grad_norm_var": 0.0432769775390625, + "learning_rate": 0.0001, + "loss": 3.1277, + "loss/crossentropy": 2.6342113494873045, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.27442471832036974, + "loss/reg": 0.0, + "step": 21600 + }, + { + "epoch": 0.14217105263157895, + "grad_norm": 2.46875, + "grad_norm_var": 0.052018229166666666, + "learning_rate": 0.0001, + "loss": 3.0889, + "loss/crossentropy": 2.3795015037059786, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.24964673370122908, + "loss/reg": 0.0, + "step": 21610 + }, + { + "epoch": 0.14223684210526316, + "grad_norm": 2.40625, + "grad_norm_var": 0.15912984212239584, + "learning_rate": 0.0001, + "loss": 3.1592, + "loss/crossentropy": 2.435245943069458, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.36159887462854384, + "loss/reg": 0.0, + "step": 21620 + }, + { + "epoch": 0.14230263157894738, + "grad_norm": 2.203125, + "grad_norm_var": 0.19715067545572917, + "learning_rate": 0.0001, + "loss": 2.9695, + "loss/crossentropy": 2.3731093287467955, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.24517869502305983, + "loss/reg": 0.0, + "step": 21630 + }, + { + "epoch": 0.1423684210526316, + "grad_norm": 2.375, + "grad_norm_var": 0.029588826497395835, + "learning_rate": 0.0001, + "loss": 3.09, + "loss/crossentropy": 2.1785697817802427, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.22660856544971467, + "loss/reg": 0.0, + "step": 21640 + }, + { + "epoch": 0.1424342105263158, + "grad_norm": 2.4375, + "grad_norm_var": 0.0379302978515625, + "learning_rate": 0.0001, + "loss": 3.1062, + "loss/crossentropy": 2.3706693768501284, + "loss/hidden": 2.9953125, + "loss/incoh": 0.0, + "loss/logits": 0.27286539524793624, + "loss/reg": 0.0, + "step": 21650 + }, + { + "epoch": 0.1425, + "grad_norm": 2.40625, + "grad_norm_var": 0.05085347493489583, + "learning_rate": 0.0001, + "loss": 3.1297, + "loss/crossentropy": 2.173750901222229, + "loss/hidden": 2.9984375, + "loss/incoh": 0.0, + "loss/logits": 0.2637738898396492, + "loss/reg": 0.0, + "step": 21660 + }, + { + "epoch": 0.1425657894736842, + "grad_norm": 3.046875, + "grad_norm_var": 0.10258153279622396, + "learning_rate": 0.0001, + "loss": 3.0637, + "loss/crossentropy": 2.133163595199585, + "loss/hidden": 2.6578125, + "loss/incoh": 0.0, + "loss/logits": 0.19840268045663834, + "loss/reg": 0.0, + "step": 21670 + }, + { + "epoch": 0.14263157894736841, + "grad_norm": 2.390625, + "grad_norm_var": 0.062646484375, + "learning_rate": 0.0001, + "loss": 3.0662, + "loss/crossentropy": 2.4523648262023925, + "loss/hidden": 2.684375, + "loss/incoh": 0.0, + "loss/logits": 0.23104819357395173, + "loss/reg": 0.0, + "step": 21680 + }, + { + "epoch": 0.14269736842105263, + "grad_norm": 2.625, + "grad_norm_var": 0.2750935872395833, + "learning_rate": 0.0001, + "loss": 3.1536, + "loss/crossentropy": 2.250650954246521, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.24096653312444688, + "loss/reg": 0.0, + "step": 21690 + }, + { + "epoch": 0.14276315789473684, + "grad_norm": 2.5, + "grad_norm_var": 0.26304931640625, + "learning_rate": 0.0001, + "loss": 3.0649, + "loss/crossentropy": 2.30162969827652, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.24212537854909896, + "loss/reg": 0.0, + "step": 21700 + }, + { + "epoch": 0.14282894736842106, + "grad_norm": 2.296875, + "grad_norm_var": 0.027521769205729168, + "learning_rate": 0.0001, + "loss": 3.1097, + "loss/crossentropy": 2.1112605214118956, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.26635565906763076, + "loss/reg": 0.0, + "step": 21710 + }, + { + "epoch": 0.14289473684210527, + "grad_norm": 2.859375, + "grad_norm_var": 0.0798492431640625, + "learning_rate": 0.0001, + "loss": 3.0973, + "loss/crossentropy": 2.3435486197471618, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.2827204465866089, + "loss/reg": 0.0, + "step": 21720 + }, + { + "epoch": 0.14296052631578948, + "grad_norm": 2.078125, + "grad_norm_var": 0.114111328125, + "learning_rate": 0.0001, + "loss": 3.1039, + "loss/crossentropy": 2.4754523396492005, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.29244270324707033, + "loss/reg": 0.0, + "step": 21730 + }, + { + "epoch": 0.1430263157894737, + "grad_norm": 2.0625, + "grad_norm_var": 0.18517252604166667, + "learning_rate": 0.0001, + "loss": 3.1588, + "loss/crossentropy": 2.2757002115249634, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.27034522593021393, + "loss/reg": 0.0, + "step": 21740 + }, + { + "epoch": 0.14309210526315788, + "grad_norm": 2.203125, + "grad_norm_var": 0.19296773274739584, + "learning_rate": 0.0001, + "loss": 3.112, + "loss/crossentropy": 2.52050644159317, + "loss/hidden": 2.9640625, + "loss/incoh": 0.0, + "loss/logits": 0.2818415179848671, + "loss/reg": 0.0, + "step": 21750 + }, + { + "epoch": 0.1431578947368421, + "grad_norm": 2.265625, + "grad_norm_var": 0.1665679931640625, + "learning_rate": 0.0001, + "loss": 3.2325, + "loss/crossentropy": 2.3630916833877564, + "loss/hidden": 2.8203125, + "loss/incoh": 0.0, + "loss/logits": 0.2456425666809082, + "loss/reg": 0.0, + "step": 21760 + }, + { + "epoch": 0.1432236842105263, + "grad_norm": 2.984375, + "grad_norm_var": 0.21158854166666666, + "learning_rate": 0.0001, + "loss": 3.0907, + "loss/crossentropy": 2.1248918056488035, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.22751243263483048, + "loss/reg": 0.0, + "step": 21770 + }, + { + "epoch": 0.14328947368421052, + "grad_norm": 2.25, + "grad_norm_var": 0.06939697265625, + "learning_rate": 0.0001, + "loss": 3.0952, + "loss/crossentropy": 2.273802196979523, + "loss/hidden": 2.98125, + "loss/incoh": 0.0, + "loss/logits": 0.2570581123232841, + "loss/reg": 0.0, + "step": 21780 + }, + { + "epoch": 0.14335526315789474, + "grad_norm": 2.4375, + "grad_norm_var": 0.025763956705729167, + "learning_rate": 0.0001, + "loss": 3.0584, + "loss/crossentropy": 2.096750485897064, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.21872427612543105, + "loss/reg": 0.0, + "step": 21790 + }, + { + "epoch": 0.14342105263157895, + "grad_norm": 2.25, + "grad_norm_var": 0.023412068684895832, + "learning_rate": 0.0001, + "loss": 3.0604, + "loss/crossentropy": 2.228634536266327, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.2626715019345284, + "loss/reg": 0.0, + "step": 21800 + }, + { + "epoch": 0.14348684210526316, + "grad_norm": 2.546875, + "grad_norm_var": 0.28734512329101564, + "learning_rate": 0.0001, + "loss": 3.1324, + "loss/crossentropy": 2.475996434688568, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.23401851058006287, + "loss/reg": 0.0, + "step": 21810 + }, + { + "epoch": 0.14355263157894738, + "grad_norm": 2.03125, + "grad_norm_var": 0.39009984334309894, + "learning_rate": 0.0001, + "loss": 3.1624, + "loss/crossentropy": 2.285959267616272, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.24858510196208955, + "loss/reg": 0.0, + "step": 21820 + }, + { + "epoch": 0.1436184210526316, + "grad_norm": 2.5625, + "grad_norm_var": 0.18899739583333333, + "learning_rate": 0.0001, + "loss": 3.0702, + "loss/crossentropy": 2.463927984237671, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2442393772304058, + "loss/reg": 0.0, + "step": 21830 + }, + { + "epoch": 0.1436842105263158, + "grad_norm": 2.4375, + "grad_norm_var": 0.14226786295572916, + "learning_rate": 0.0001, + "loss": 3.0965, + "loss/crossentropy": 2.277498161792755, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.2153932645916939, + "loss/reg": 0.0, + "step": 21840 + }, + { + "epoch": 0.14375, + "grad_norm": 2.4375, + "grad_norm_var": 1.7546953837076822, + "learning_rate": 0.0001, + "loss": 3.108, + "loss/crossentropy": 2.3049885034561157, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.26804344207048414, + "loss/reg": 0.0, + "step": 21850 + }, + { + "epoch": 0.1438157894736842, + "grad_norm": 2.90625, + "grad_norm_var": 4.087398020426432, + "learning_rate": 0.0001, + "loss": 3.1586, + "loss/crossentropy": 2.239436650276184, + "loss/hidden": 3.0796875, + "loss/incoh": 0.0, + "loss/logits": 0.27900682389736176, + "loss/reg": 0.0, + "step": 21860 + }, + { + "epoch": 0.14388157894736842, + "grad_norm": 2.28125, + "grad_norm_var": 0.09602762858072916, + "learning_rate": 0.0001, + "loss": 3.1657, + "loss/crossentropy": 2.385130798816681, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.28002837896347044, + "loss/reg": 0.0, + "step": 21870 + }, + { + "epoch": 0.14394736842105263, + "grad_norm": 2.71875, + "grad_norm_var": 0.12711588541666666, + "learning_rate": 0.0001, + "loss": 3.1437, + "loss/crossentropy": 2.426273798942566, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.3090307667851448, + "loss/reg": 0.0, + "step": 21880 + }, + { + "epoch": 0.14401315789473684, + "grad_norm": 2.484375, + "grad_norm_var": 0.09078776041666667, + "learning_rate": 0.0001, + "loss": 3.1105, + "loss/crossentropy": 2.148479038476944, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.22178827822208405, + "loss/reg": 0.0, + "step": 21890 + }, + { + "epoch": 0.14407894736842106, + "grad_norm": 2.5, + "grad_norm_var": 0.0625, + "learning_rate": 0.0001, + "loss": 3.1009, + "loss/crossentropy": 2.2989478588104246, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.21844121366739272, + "loss/reg": 0.0, + "step": 21900 + }, + { + "epoch": 0.14414473684210527, + "grad_norm": 2.265625, + "grad_norm_var": 0.10066630045572916, + "learning_rate": 0.0001, + "loss": 3.1368, + "loss/crossentropy": 2.459014880657196, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.2873328909277916, + "loss/reg": 0.0, + "step": 21910 + }, + { + "epoch": 0.14421052631578948, + "grad_norm": 2.296875, + "grad_norm_var": 0.15559794108072916, + "learning_rate": 0.0001, + "loss": 3.048, + "loss/crossentropy": 2.0745410680770875, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.24865385442972182, + "loss/reg": 0.0, + "step": 21920 + }, + { + "epoch": 0.1442763157894737, + "grad_norm": 2.75, + "grad_norm_var": 0.1090240478515625, + "learning_rate": 0.0001, + "loss": 3.2472, + "loss/crossentropy": 2.216059982776642, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.25595484375953675, + "loss/reg": 0.0, + "step": 21930 + }, + { + "epoch": 0.14434210526315788, + "grad_norm": 2.59375, + "grad_norm_var": 0.06485773722330729, + "learning_rate": 0.0001, + "loss": 3.0388, + "loss/crossentropy": 2.3124427676200865, + "loss/hidden": 2.678125, + "loss/incoh": 0.0, + "loss/logits": 0.2212652564048767, + "loss/reg": 0.0, + "step": 21940 + }, + { + "epoch": 0.1444078947368421, + "grad_norm": 2.953125, + "grad_norm_var": 0.4722246805826823, + "learning_rate": 0.0001, + "loss": 3.2024, + "loss/crossentropy": 2.1076952695846556, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.2454703077673912, + "loss/reg": 0.0, + "step": 21950 + }, + { + "epoch": 0.1444736842105263, + "grad_norm": 2.59375, + "grad_norm_var": 0.6233306884765625, + "learning_rate": 0.0001, + "loss": 3.1576, + "loss/crossentropy": 2.3738736391067503, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.22644958645105362, + "loss/reg": 0.0, + "step": 21960 + }, + { + "epoch": 0.14453947368421052, + "grad_norm": 2.4375, + "grad_norm_var": 0.48958231608072916, + "learning_rate": 0.0001, + "loss": 3.1195, + "loss/crossentropy": 2.3681930541992187, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.23997818529605866, + "loss/reg": 0.0, + "step": 21970 + }, + { + "epoch": 0.14460526315789474, + "grad_norm": 2.671875, + "grad_norm_var": 0.06328125, + "learning_rate": 0.0001, + "loss": 3.086, + "loss/crossentropy": 2.202842915058136, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.22619501650333404, + "loss/reg": 0.0, + "step": 21980 + }, + { + "epoch": 0.14467105263157895, + "grad_norm": 2.25, + "grad_norm_var": 0.08901265462239584, + "learning_rate": 0.0001, + "loss": 3.0968, + "loss/crossentropy": 2.498619997501373, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.2592793509364128, + "loss/reg": 0.0, + "step": 21990 + }, + { + "epoch": 0.14473684210526316, + "grad_norm": 2.75, + "grad_norm_var": 0.07700907389322917, + "learning_rate": 0.0001, + "loss": 3.0646, + "loss/crossentropy": 2.5274386525154116, + "loss/hidden": 2.7109375, + "loss/incoh": 0.0, + "loss/logits": 0.2393401548266411, + "loss/reg": 0.0, + "step": 22000 + }, + { + "epoch": 0.14480263157894738, + "grad_norm": 2.609375, + "grad_norm_var": 0.03200581868489583, + "learning_rate": 0.0001, + "loss": 3.1375, + "loss/crossentropy": 2.331031596660614, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.22175868153572081, + "loss/reg": 0.0, + "step": 22010 + }, + { + "epoch": 0.1448684210526316, + "grad_norm": 2.53125, + "grad_norm_var": 0.031473541259765626, + "learning_rate": 0.0001, + "loss": 3.1187, + "loss/crossentropy": 2.1961219191551207, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.23576426208019258, + "loss/reg": 0.0, + "step": 22020 + }, + { + "epoch": 0.14493421052631578, + "grad_norm": 2.15625, + "grad_norm_var": 0.04307835896809896, + "learning_rate": 0.0001, + "loss": 3.0722, + "loss/crossentropy": 2.4104359984397887, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.25036042332649233, + "loss/reg": 0.0, + "step": 22030 + }, + { + "epoch": 0.145, + "grad_norm": 2.78125, + "grad_norm_var": 0.0566558837890625, + "learning_rate": 0.0001, + "loss": 3.0535, + "loss/crossentropy": 2.293129473924637, + "loss/hidden": 2.909375, + "loss/incoh": 0.0, + "loss/logits": 0.3266716688871384, + "loss/reg": 0.0, + "step": 22040 + }, + { + "epoch": 0.1450657894736842, + "grad_norm": 2.21875, + "grad_norm_var": 0.1048828125, + "learning_rate": 0.0001, + "loss": 3.151, + "loss/crossentropy": 2.1948832154273985, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.2283545032143593, + "loss/reg": 0.0, + "step": 22050 + }, + { + "epoch": 0.14513157894736842, + "grad_norm": 2.15625, + "grad_norm_var": 0.85728759765625, + "learning_rate": 0.0001, + "loss": 3.1379, + "loss/crossentropy": 2.4909852266311647, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.26901226192712785, + "loss/reg": 0.0, + "step": 22060 + }, + { + "epoch": 0.14519736842105263, + "grad_norm": 2.390625, + "grad_norm_var": 0.8632476806640625, + "learning_rate": 0.0001, + "loss": 3.0345, + "loss/crossentropy": 2.4894071340560915, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.2518943950533867, + "loss/reg": 0.0, + "step": 22070 + }, + { + "epoch": 0.14526315789473684, + "grad_norm": 2.5625, + "grad_norm_var": 0.035553995768229166, + "learning_rate": 0.0001, + "loss": 3.0777, + "loss/crossentropy": 2.250881004333496, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.254881876707077, + "loss/reg": 0.0, + "step": 22080 + }, + { + "epoch": 0.14532894736842106, + "grad_norm": 2.140625, + "grad_norm_var": 0.6524251302083334, + "learning_rate": 0.0001, + "loss": 3.0046, + "loss/crossentropy": 2.4635193943977356, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.28406478762626647, + "loss/reg": 0.0, + "step": 22090 + }, + { + "epoch": 0.14539473684210527, + "grad_norm": 2.203125, + "grad_norm_var": 0.2197906494140625, + "learning_rate": 0.0001, + "loss": 3.0607, + "loss/crossentropy": 2.176581871509552, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.2112376168370247, + "loss/reg": 0.0, + "step": 22100 + }, + { + "epoch": 0.14546052631578948, + "grad_norm": 2.421875, + "grad_norm_var": 0.0716461181640625, + "learning_rate": 0.0001, + "loss": 3.09, + "loss/crossentropy": 2.418153405189514, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.24471160471439363, + "loss/reg": 0.0, + "step": 22110 + }, + { + "epoch": 0.1455263157894737, + "grad_norm": 2.203125, + "grad_norm_var": 0.024909464518229167, + "learning_rate": 0.0001, + "loss": 3.0661, + "loss/crossentropy": 2.34368360042572, + "loss/hidden": 2.9484375, + "loss/incoh": 0.0, + "loss/logits": 0.26933753192424775, + "loss/reg": 0.0, + "step": 22120 + }, + { + "epoch": 0.14559210526315788, + "grad_norm": 2.28125, + "grad_norm_var": 0.0403228759765625, + "learning_rate": 0.0001, + "loss": 3.0961, + "loss/crossentropy": 2.2858925461769104, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.23935395181179048, + "loss/reg": 0.0, + "step": 22130 + }, + { + "epoch": 0.1456578947368421, + "grad_norm": 2.140625, + "grad_norm_var": 0.06553446451822917, + "learning_rate": 0.0001, + "loss": 3.1287, + "loss/crossentropy": 2.476478910446167, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.2711992233991623, + "loss/reg": 0.0, + "step": 22140 + }, + { + "epoch": 0.1457236842105263, + "grad_norm": 2.90625, + "grad_norm_var": 0.14517822265625, + "learning_rate": 0.0001, + "loss": 3.0753, + "loss/crossentropy": 2.095052421092987, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.2310307502746582, + "loss/reg": 0.0, + "step": 22150 + }, + { + "epoch": 0.14578947368421052, + "grad_norm": 2.234375, + "grad_norm_var": 0.13459243774414062, + "learning_rate": 0.0001, + "loss": 3.0609, + "loss/crossentropy": 2.252713418006897, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.2497629001736641, + "loss/reg": 0.0, + "step": 22160 + }, + { + "epoch": 0.14585526315789474, + "grad_norm": 2.5625, + "grad_norm_var": 0.06102701822916667, + "learning_rate": 0.0001, + "loss": 3.092, + "loss/crossentropy": 2.2792391180992126, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.2850720778107643, + "loss/reg": 0.0, + "step": 22170 + }, + { + "epoch": 0.14592105263157895, + "grad_norm": 2.828125, + "grad_norm_var": 0.2100738525390625, + "learning_rate": 0.0001, + "loss": 3.1172, + "loss/crossentropy": 2.116334557533264, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.22928555756807328, + "loss/reg": 0.0, + "step": 22180 + }, + { + "epoch": 0.14598684210526316, + "grad_norm": 2.46875, + "grad_norm_var": 0.12631734212239584, + "learning_rate": 0.0001, + "loss": 3.103, + "loss/crossentropy": 2.2788902163505553, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.24738141447305678, + "loss/reg": 0.0, + "step": 22190 + }, + { + "epoch": 0.14605263157894738, + "grad_norm": 2.0625, + "grad_norm_var": 0.05478108723958333, + "learning_rate": 0.0001, + "loss": 2.9932, + "loss/crossentropy": 2.3781439542770384, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.2298112317919731, + "loss/reg": 0.0, + "step": 22200 + }, + { + "epoch": 0.1461184210526316, + "grad_norm": 2.328125, + "grad_norm_var": 1.2361399332682292, + "learning_rate": 0.0001, + "loss": 3.1279, + "loss/crossentropy": 2.367407274246216, + "loss/hidden": 3.0234375, + "loss/incoh": 0.0, + "loss/logits": 0.25443960130214693, + "loss/reg": 0.0, + "step": 22210 + }, + { + "epoch": 0.14618421052631578, + "grad_norm": 2.484375, + "grad_norm_var": 0.03775634765625, + "learning_rate": 0.0001, + "loss": 3.1253, + "loss/crossentropy": 2.4262067079544067, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.23887630701065063, + "loss/reg": 0.0, + "step": 22220 + }, + { + "epoch": 0.14625, + "grad_norm": 2.359375, + "grad_norm_var": 0.19719136555989583, + "learning_rate": 0.0001, + "loss": 3.1058, + "loss/crossentropy": 2.547460687160492, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.25797063410282134, + "loss/reg": 0.0, + "step": 22230 + }, + { + "epoch": 0.1463157894736842, + "grad_norm": 2.421875, + "grad_norm_var": 0.2289446512858073, + "learning_rate": 0.0001, + "loss": 3.1238, + "loss/crossentropy": 2.1126400232315063, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.23666388988494874, + "loss/reg": 0.0, + "step": 22240 + }, + { + "epoch": 0.14638157894736842, + "grad_norm": 2.453125, + "grad_norm_var": 0.20608495076497396, + "learning_rate": 0.0001, + "loss": 3.0914, + "loss/crossentropy": 2.5851184129714966, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.2918281376361847, + "loss/reg": 0.0, + "step": 22250 + }, + { + "epoch": 0.14644736842105263, + "grad_norm": 2.515625, + "grad_norm_var": 0.5136329650878906, + "learning_rate": 0.0001, + "loss": 3.0971, + "loss/crossentropy": 2.319287371635437, + "loss/hidden": 3.0140625, + "loss/incoh": 0.0, + "loss/logits": 0.283975313603878, + "loss/reg": 0.0, + "step": 22260 + }, + { + "epoch": 0.14651315789473685, + "grad_norm": 2.625, + "grad_norm_var": 9.41323216756185, + "learning_rate": 0.0001, + "loss": 3.2274, + "loss/crossentropy": 2.350848126411438, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.3113620936870575, + "loss/reg": 0.0, + "step": 22270 + }, + { + "epoch": 0.14657894736842106, + "grad_norm": 3.71875, + "grad_norm_var": 0.20803120930989583, + "learning_rate": 0.0001, + "loss": 3.1285, + "loss/crossentropy": 2.175449311733246, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.235707426071167, + "loss/reg": 0.0, + "step": 22280 + }, + { + "epoch": 0.14664473684210527, + "grad_norm": 2.3125, + "grad_norm_var": 0.37470601399739584, + "learning_rate": 0.0001, + "loss": 3.1206, + "loss/crossentropy": 2.6359380006790163, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.2809421643614769, + "loss/reg": 0.0, + "step": 22290 + }, + { + "epoch": 0.14671052631578949, + "grad_norm": 2.015625, + "grad_norm_var": 0.33459879557291666, + "learning_rate": 0.0001, + "loss": 3.1002, + "loss/crossentropy": 2.2076025128364565, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.22375066131353377, + "loss/reg": 0.0, + "step": 22300 + }, + { + "epoch": 0.14677631578947367, + "grad_norm": 4.34375, + "grad_norm_var": 0.37032877604166664, + "learning_rate": 0.0001, + "loss": 3.117, + "loss/crossentropy": 2.0601858735084533, + "loss/hidden": 2.971875, + "loss/incoh": 0.0, + "loss/logits": 0.3449657797813416, + "loss/reg": 0.0, + "step": 22310 + }, + { + "epoch": 0.14684210526315788, + "grad_norm": 2.40625, + "grad_norm_var": 0.7130360921223958, + "learning_rate": 0.0001, + "loss": 3.0669, + "loss/crossentropy": 2.2765709161758423, + "loss/hidden": 2.6734375, + "loss/incoh": 0.0, + "loss/logits": 0.2070981428027153, + "loss/reg": 0.0, + "step": 22320 + }, + { + "epoch": 0.1469078947368421, + "grad_norm": 2.53125, + "grad_norm_var": 0.5365193684895834, + "learning_rate": 0.0001, + "loss": 3.0347, + "loss/crossentropy": 2.1606739163398743, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.2066648006439209, + "loss/reg": 0.0, + "step": 22330 + }, + { + "epoch": 0.1469736842105263, + "grad_norm": 11.25, + "grad_norm_var": 5.021190388997396, + "learning_rate": 0.0001, + "loss": 3.1108, + "loss/crossentropy": 2.4670102834701537, + "loss/hidden": 2.621875, + "loss/incoh": 0.0, + "loss/logits": 0.20643151551485062, + "loss/reg": 0.0, + "step": 22340 + }, + { + "epoch": 0.14703947368421053, + "grad_norm": 2.609375, + "grad_norm_var": 4.964111328125, + "learning_rate": 0.0001, + "loss": 3.0792, + "loss/crossentropy": 2.1708640813827516, + "loss/hidden": 2.96875, + "loss/incoh": 0.0, + "loss/logits": 0.2493487134575844, + "loss/reg": 0.0, + "step": 22350 + }, + { + "epoch": 0.14710526315789474, + "grad_norm": 2.484375, + "grad_norm_var": 0.2525299072265625, + "learning_rate": 0.0001, + "loss": 3.1371, + "loss/crossentropy": 2.4388205766677857, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.25962347984313966, + "loss/reg": 0.0, + "step": 22360 + }, + { + "epoch": 0.14717105263157895, + "grad_norm": 2.0625, + "grad_norm_var": 0.1518707275390625, + "learning_rate": 0.0001, + "loss": 3.0391, + "loss/crossentropy": 2.295293319225311, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.2302995279431343, + "loss/reg": 0.0, + "step": 22370 + }, + { + "epoch": 0.14723684210526317, + "grad_norm": 3.609375, + "grad_norm_var": 0.20093994140625, + "learning_rate": 0.0001, + "loss": 3.1142, + "loss/crossentropy": 2.081736671924591, + "loss/hidden": 2.7296875, + "loss/incoh": 0.0, + "loss/logits": 0.21390146017074585, + "loss/reg": 0.0, + "step": 22380 + }, + { + "epoch": 0.14730263157894738, + "grad_norm": 2.390625, + "grad_norm_var": 0.1891021728515625, + "learning_rate": 0.0001, + "loss": 3.0243, + "loss/crossentropy": 2.1469112396240235, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.24780639857053757, + "loss/reg": 0.0, + "step": 22390 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 3.5, + "grad_norm_var": 0.41617431640625, + "learning_rate": 0.0001, + "loss": 3.1187, + "loss/crossentropy": 2.219987118244171, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.2700192674994469, + "loss/reg": 0.0, + "step": 22400 + }, + { + "epoch": 0.14743421052631578, + "grad_norm": 2.46875, + "grad_norm_var": 0.10216471354166666, + "learning_rate": 0.0001, + "loss": 3.1659, + "loss/crossentropy": 2.387179672718048, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.25483787804841995, + "loss/reg": 0.0, + "step": 22410 + }, + { + "epoch": 0.1475, + "grad_norm": 3.21875, + "grad_norm_var": 0.10955403645833334, + "learning_rate": 0.0001, + "loss": 3.1812, + "loss/crossentropy": 2.4982651591300966, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.23052496314048768, + "loss/reg": 0.0, + "step": 22420 + }, + { + "epoch": 0.1475657894736842, + "grad_norm": 2.6875, + "grad_norm_var": 0.0999664306640625, + "learning_rate": 0.0001, + "loss": 3.0656, + "loss/crossentropy": 2.2789443135261536, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.23072773963212967, + "loss/reg": 0.0, + "step": 22430 + }, + { + "epoch": 0.14763157894736842, + "grad_norm": 2.53125, + "grad_norm_var": 0.074658203125, + "learning_rate": 0.0001, + "loss": 3.0366, + "loss/crossentropy": 2.384058713912964, + "loss/hidden": 2.6921875, + "loss/incoh": 0.0, + "loss/logits": 0.2288333684206009, + "loss/reg": 0.0, + "step": 22440 + }, + { + "epoch": 0.14769736842105263, + "grad_norm": 2.234375, + "grad_norm_var": 0.05872294108072917, + "learning_rate": 0.0001, + "loss": 3.1249, + "loss/crossentropy": 2.1794327974319456, + "loss/hidden": 2.959375, + "loss/incoh": 0.0, + "loss/logits": 0.25599117279052735, + "loss/reg": 0.0, + "step": 22450 + }, + { + "epoch": 0.14776315789473685, + "grad_norm": 2.578125, + "grad_norm_var": 0.20020243326822917, + "learning_rate": 0.0001, + "loss": 3.2428, + "loss/crossentropy": 2.3936930537223815, + "loss/hidden": 3.01875, + "loss/incoh": 0.0, + "loss/logits": 0.33337023556232454, + "loss/reg": 0.0, + "step": 22460 + }, + { + "epoch": 0.14782894736842106, + "grad_norm": 2.515625, + "grad_norm_var": 0.2228424072265625, + "learning_rate": 0.0001, + "loss": 3.0933, + "loss/crossentropy": 2.290210509300232, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.22466516196727754, + "loss/reg": 0.0, + "step": 22470 + }, + { + "epoch": 0.14789473684210527, + "grad_norm": 2.375, + "grad_norm_var": 0.07656962076822917, + "learning_rate": 0.0001, + "loss": 3.1112, + "loss/crossentropy": 2.1793927550315857, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.2438851624727249, + "loss/reg": 0.0, + "step": 22480 + }, + { + "epoch": 0.1479605263157895, + "grad_norm": 2.171875, + "grad_norm_var": 0.15196024576822917, + "learning_rate": 0.0001, + "loss": 3.105, + "loss/crossentropy": 2.3326605677604677, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.22931813299655915, + "loss/reg": 0.0, + "step": 22490 + }, + { + "epoch": 0.14802631578947367, + "grad_norm": 2.1875, + "grad_norm_var": 0.15309244791666668, + "learning_rate": 0.0001, + "loss": 3.0875, + "loss/crossentropy": 2.222768235206604, + "loss/hidden": 2.9828125, + "loss/incoh": 0.0, + "loss/logits": 0.2537077710032463, + "loss/reg": 0.0, + "step": 22500 + }, + { + "epoch": 0.14809210526315789, + "grad_norm": 2.28125, + "grad_norm_var": 0.0938385009765625, + "learning_rate": 0.0001, + "loss": 3.1419, + "loss/crossentropy": 2.308623898029327, + "loss/hidden": 3.0859375, + "loss/incoh": 0.0, + "loss/logits": 0.27548344135284425, + "loss/reg": 0.0, + "step": 22510 + }, + { + "epoch": 0.1481578947368421, + "grad_norm": 2.15625, + "grad_norm_var": 0.07166239420572916, + "learning_rate": 0.0001, + "loss": 3.068, + "loss/crossentropy": 2.261019694805145, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.23012840151786804, + "loss/reg": 0.0, + "step": 22520 + }, + { + "epoch": 0.1482236842105263, + "grad_norm": 2.390625, + "grad_norm_var": 0.13349202473958333, + "learning_rate": 0.0001, + "loss": 3.1074, + "loss/crossentropy": 2.265119397640228, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.2193696081638336, + "loss/reg": 0.0, + "step": 22530 + }, + { + "epoch": 0.14828947368421053, + "grad_norm": 2.578125, + "grad_norm_var": 2.090127618074695e+17, + "learning_rate": 0.0001, + "loss": 3.3232, + "loss/crossentropy": 2.414242672920227, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.24838838130235671, + "loss/reg": 0.0, + "step": 22540 + }, + { + "epoch": 0.14835526315789474, + "grad_norm": 2.25, + "grad_norm_var": 0.0516265869140625, + "learning_rate": 0.0001, + "loss": 3.147, + "loss/crossentropy": 2.4570279359817504, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.27158954441547395, + "loss/reg": 0.0, + "step": 22550 + }, + { + "epoch": 0.14842105263157895, + "grad_norm": 2.296875, + "grad_norm_var": 0.0767229715983073, + "learning_rate": 0.0001, + "loss": 3.0495, + "loss/crossentropy": 2.1562333941459655, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.252650773525238, + "loss/reg": 0.0, + "step": 22560 + }, + { + "epoch": 0.14848684210526317, + "grad_norm": 2.234375, + "grad_norm_var": 0.2514625549316406, + "learning_rate": 0.0001, + "loss": 3.1717, + "loss/crossentropy": 2.0726990699768066, + "loss/hidden": 3.05, + "loss/incoh": 0.0, + "loss/logits": 0.2678316295146942, + "loss/reg": 0.0, + "step": 22570 + }, + { + "epoch": 0.14855263157894738, + "grad_norm": 2.125, + "grad_norm_var": 0.21327718098958334, + "learning_rate": 0.0001, + "loss": 3.1387, + "loss/crossentropy": 2.32381352186203, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.2564924195408821, + "loss/reg": 0.0, + "step": 22580 + }, + { + "epoch": 0.14861842105263157, + "grad_norm": 2.140625, + "grad_norm_var": 0.08567301432291667, + "learning_rate": 0.0001, + "loss": 3.1625, + "loss/crossentropy": 2.15835280418396, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.3166464313864708, + "loss/reg": 0.0, + "step": 22590 + }, + { + "epoch": 0.14868421052631578, + "grad_norm": 2.5, + "grad_norm_var": 0.13315327962239584, + "learning_rate": 0.0001, + "loss": 3.0573, + "loss/crossentropy": 2.5024718761444094, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.2705509811639786, + "loss/reg": 0.0, + "step": 22600 + }, + { + "epoch": 0.14875, + "grad_norm": 2.203125, + "grad_norm_var": 0.1475176493326823, + "learning_rate": 0.0001, + "loss": 3.0119, + "loss/crossentropy": 2.2811665177345275, + "loss/hidden": 2.85625, + "loss/incoh": 0.0, + "loss/logits": 0.23891925811767578, + "loss/reg": 0.0, + "step": 22610 + }, + { + "epoch": 0.1488157894736842, + "grad_norm": 2.40625, + "grad_norm_var": 31.435579172770183, + "learning_rate": 0.0001, + "loss": 3.0136, + "loss/crossentropy": 2.1407122373580934, + "loss/hidden": 2.9484375, + "loss/incoh": 0.0, + "loss/logits": 0.26466352492570877, + "loss/reg": 0.0, + "step": 22620 + }, + { + "epoch": 0.14888157894736842, + "grad_norm": 2.171875, + "grad_norm_var": 0.10041402180989584, + "learning_rate": 0.0001, + "loss": 3.0561, + "loss/crossentropy": 2.3511832118034364, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.26619747579097747, + "loss/reg": 0.0, + "step": 22630 + }, + { + "epoch": 0.14894736842105263, + "grad_norm": 2.609375, + "grad_norm_var": 0.16308186848958334, + "learning_rate": 0.0001, + "loss": 3.1069, + "loss/crossentropy": 2.1790146827697754, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.2223970666527748, + "loss/reg": 0.0, + "step": 22640 + }, + { + "epoch": 0.14901315789473685, + "grad_norm": 2.265625, + "grad_norm_var": 0.48355204264322915, + "learning_rate": 0.0001, + "loss": 3.1746, + "loss/crossentropy": 2.2969871520996095, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.28595788925886156, + "loss/reg": 0.0, + "step": 22650 + }, + { + "epoch": 0.14907894736842106, + "grad_norm": 2.828125, + "grad_norm_var": 0.38804931640625, + "learning_rate": 0.0001, + "loss": 3.1378, + "loss/crossentropy": 2.484911561012268, + "loss/hidden": 2.975, + "loss/incoh": 0.0, + "loss/logits": 0.28142704218626025, + "loss/reg": 0.0, + "step": 22660 + }, + { + "epoch": 0.14914473684210527, + "grad_norm": 3.359375, + "grad_norm_var": 0.11812744140625, + "learning_rate": 0.0001, + "loss": 3.2449, + "loss/crossentropy": 2.3331188917160035, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.2567854106426239, + "loss/reg": 0.0, + "step": 22670 + }, + { + "epoch": 0.14921052631578946, + "grad_norm": 2.0, + "grad_norm_var": 0.11311848958333333, + "learning_rate": 0.0001, + "loss": 3.0464, + "loss/crossentropy": 2.5099231958389283, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.22425275444984435, + "loss/reg": 0.0, + "step": 22680 + }, + { + "epoch": 0.14927631578947367, + "grad_norm": 2.265625, + "grad_norm_var": 0.033707682291666666, + "learning_rate": 0.0001, + "loss": 3.0841, + "loss/crossentropy": 2.5707973539829254, + "loss/hidden": 2.971875, + "loss/incoh": 0.0, + "loss/logits": 0.27995080798864364, + "loss/reg": 0.0, + "step": 22690 + }, + { + "epoch": 0.1493421052631579, + "grad_norm": 2.375, + "grad_norm_var": 0.03998921712239583, + "learning_rate": 0.0001, + "loss": 3.0877, + "loss/crossentropy": 2.2857648015022276, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.2597381517291069, + "loss/reg": 0.0, + "step": 22700 + }, + { + "epoch": 0.1494078947368421, + "grad_norm": 2.421875, + "grad_norm_var": 0.05201416015625, + "learning_rate": 0.0001, + "loss": 3.0551, + "loss/crossentropy": 2.4230233311653135, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.2558469220995903, + "loss/reg": 0.0, + "step": 22710 + }, + { + "epoch": 0.14947368421052631, + "grad_norm": 2.28125, + "grad_norm_var": 0.08981831868489583, + "learning_rate": 0.0001, + "loss": 3.0629, + "loss/crossentropy": 2.3983253479003905, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.25465885996818544, + "loss/reg": 0.0, + "step": 22720 + }, + { + "epoch": 0.14953947368421053, + "grad_norm": 2.34375, + "grad_norm_var": 0.14431966145833333, + "learning_rate": 0.0001, + "loss": 3.1884, + "loss/crossentropy": 2.227739405632019, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.27533823624253273, + "loss/reg": 0.0, + "step": 22730 + }, + { + "epoch": 0.14960526315789474, + "grad_norm": 2.5, + "grad_norm_var": 0.09478759765625, + "learning_rate": 0.0001, + "loss": 3.044, + "loss/crossentropy": 2.2618382215499877, + "loss/hidden": 2.7265625, + "loss/incoh": 0.0, + "loss/logits": 0.22343547642230988, + "loss/reg": 0.0, + "step": 22740 + }, + { + "epoch": 0.14967105263157895, + "grad_norm": 2.390625, + "grad_norm_var": 0.18606363932291667, + "learning_rate": 0.0001, + "loss": 3.145, + "loss/crossentropy": 2.4264959335327148, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.26770799309015275, + "loss/reg": 0.0, + "step": 22750 + }, + { + "epoch": 0.14973684210526317, + "grad_norm": 2.5, + "grad_norm_var": 0.0256500244140625, + "learning_rate": 0.0001, + "loss": 3.2178, + "loss/crossentropy": 2.603011679649353, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.28421192467212675, + "loss/reg": 0.0, + "step": 22760 + }, + { + "epoch": 0.14980263157894738, + "grad_norm": 2.296875, + "grad_norm_var": 0.04109598795572917, + "learning_rate": 0.0001, + "loss": 3.0566, + "loss/crossentropy": 2.239471447467804, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.2365947112441063, + "loss/reg": 0.0, + "step": 22770 + }, + { + "epoch": 0.14986842105263157, + "grad_norm": 2.203125, + "grad_norm_var": 0.11721903483072917, + "learning_rate": 0.0001, + "loss": 3.0489, + "loss/crossentropy": 2.264164900779724, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.2334253668785095, + "loss/reg": 0.0, + "step": 22780 + }, + { + "epoch": 0.14993421052631578, + "grad_norm": 2.65625, + "grad_norm_var": 0.1690093994140625, + "learning_rate": 0.0001, + "loss": 3.1266, + "loss/crossentropy": 2.3352009534835814, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.23781208097934722, + "loss/reg": 0.0, + "step": 22790 + }, + { + "epoch": 0.15, + "grad_norm": 3.25, + "grad_norm_var": 0.25569559733072916, + "learning_rate": 0.0001, + "loss": 3.1863, + "loss/crossentropy": 2.260995364189148, + "loss/hidden": 2.9921875, + "loss/incoh": 0.0, + "loss/logits": 0.28239033967256544, + "loss/reg": 0.0, + "step": 22800 + }, + { + "epoch": 0.1500657894736842, + "grad_norm": 2.203125, + "grad_norm_var": 0.5896321614583333, + "learning_rate": 0.0001, + "loss": 3.1214, + "loss/crossentropy": 2.487893545627594, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.2422095239162445, + "loss/reg": 0.0, + "step": 22810 + }, + { + "epoch": 0.15013157894736842, + "grad_norm": 2.5625, + "grad_norm_var": 0.44045817057291664, + "learning_rate": 0.0001, + "loss": 3.0838, + "loss/crossentropy": 2.484136939048767, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.22923200577497482, + "loss/reg": 0.0, + "step": 22820 + }, + { + "epoch": 0.15019736842105263, + "grad_norm": 2.890625, + "grad_norm_var": 3.111881782327837e+17, + "learning_rate": 0.0001, + "loss": 3.2029, + "loss/crossentropy": 2.462354898452759, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.23231214433908462, + "loss/reg": 0.0, + "step": 22830 + }, + { + "epoch": 0.15026315789473685, + "grad_norm": 2.25, + "grad_norm_var": 3.1118817821970925e+17, + "learning_rate": 0.0001, + "loss": 3.1305, + "loss/crossentropy": 2.1894614577293394, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.24518598318099977, + "loss/reg": 0.0, + "step": 22840 + }, + { + "epoch": 0.15032894736842106, + "grad_norm": 2.328125, + "grad_norm_var": 0.23975321451822917, + "learning_rate": 0.0001, + "loss": 3.0371, + "loss/crossentropy": 2.427228879928589, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.250908388197422, + "loss/reg": 0.0, + "step": 22850 + }, + { + "epoch": 0.15039473684210528, + "grad_norm": 7.125, + "grad_norm_var": 1.4729563395182292, + "learning_rate": 0.0001, + "loss": 3.1606, + "loss/crossentropy": 2.163099730014801, + "loss/hidden": 3.0328125, + "loss/incoh": 0.0, + "loss/logits": 0.2531519740819931, + "loss/reg": 0.0, + "step": 22860 + }, + { + "epoch": 0.15046052631578946, + "grad_norm": 2.484375, + "grad_norm_var": 1.7556060791015624, + "learning_rate": 0.0001, + "loss": 3.127, + "loss/crossentropy": 2.4134068608284, + "loss/hidden": 2.7875, + "loss/incoh": 0.0, + "loss/logits": 0.2419949784874916, + "loss/reg": 0.0, + "step": 22870 + }, + { + "epoch": 0.15052631578947367, + "grad_norm": 2.234375, + "grad_norm_var": 0.6191080729166667, + "learning_rate": 0.0001, + "loss": 3.0458, + "loss/crossentropy": 2.2460831999778748, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.25571899116039276, + "loss/reg": 0.0, + "step": 22880 + }, + { + "epoch": 0.1505921052631579, + "grad_norm": 2.421875, + "grad_norm_var": 0.1325592041015625, + "learning_rate": 0.0001, + "loss": 3.1666, + "loss/crossentropy": 2.32741322517395, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.23611859530210494, + "loss/reg": 0.0, + "step": 22890 + }, + { + "epoch": 0.1506578947368421, + "grad_norm": 4.03125, + "grad_norm_var": 0.24289957682291666, + "learning_rate": 0.0001, + "loss": 3.1655, + "loss/crossentropy": 2.436409044265747, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.28332450836896894, + "loss/reg": 0.0, + "step": 22900 + }, + { + "epoch": 0.15072368421052632, + "grad_norm": 2.265625, + "grad_norm_var": 0.23401590983072917, + "learning_rate": 0.0001, + "loss": 2.9932, + "loss/crossentropy": 2.2307902753353117, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.23847298920154572, + "loss/reg": 0.0, + "step": 22910 + }, + { + "epoch": 0.15078947368421053, + "grad_norm": 2.5625, + "grad_norm_var": 0.12184244791666667, + "learning_rate": 0.0001, + "loss": 3.1045, + "loss/crossentropy": 2.2170926928520203, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.23038018941879274, + "loss/reg": 0.0, + "step": 22920 + }, + { + "epoch": 0.15085526315789474, + "grad_norm": 2.8125, + "grad_norm_var": 0.09688695271809895, + "learning_rate": 0.0001, + "loss": 3.0845, + "loss/crossentropy": 2.1675124526023866, + "loss/hidden": 2.8203125, + "loss/incoh": 0.0, + "loss/logits": 0.20779158547520638, + "loss/reg": 0.0, + "step": 22930 + }, + { + "epoch": 0.15092105263157896, + "grad_norm": 2.4375, + "grad_norm_var": 0.7031776428222656, + "learning_rate": 0.0001, + "loss": 3.131, + "loss/crossentropy": 2.483162760734558, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.2888242840766907, + "loss/reg": 0.0, + "step": 22940 + }, + { + "epoch": 0.15098684210526317, + "grad_norm": 2.21875, + "grad_norm_var": 0.6901682535807292, + "learning_rate": 0.0001, + "loss": 3.1206, + "loss/crossentropy": 1.9849469184875488, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.21038227528333664, + "loss/reg": 0.0, + "step": 22950 + }, + { + "epoch": 0.15105263157894736, + "grad_norm": 2.484375, + "grad_norm_var": 0.08320210774739584, + "learning_rate": 0.0001, + "loss": 3.1315, + "loss/crossentropy": 2.424567532539368, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.2791455164551735, + "loss/reg": 0.0, + "step": 22960 + }, + { + "epoch": 0.15111842105263157, + "grad_norm": 2.0, + "grad_norm_var": 0.0634674072265625, + "learning_rate": 0.0001, + "loss": 3.054, + "loss/crossentropy": 2.531536269187927, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.24857166707515715, + "loss/reg": 0.0, + "step": 22970 + }, + { + "epoch": 0.15118421052631578, + "grad_norm": 2.28125, + "grad_norm_var": 0.06180013020833333, + "learning_rate": 0.0001, + "loss": 3.0542, + "loss/crossentropy": 2.4494295597076414, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.24070152640342712, + "loss/reg": 0.0, + "step": 22980 + }, + { + "epoch": 0.15125, + "grad_norm": 2.078125, + "grad_norm_var": 0.07551167805989584, + "learning_rate": 0.0001, + "loss": 3.0773, + "loss/crossentropy": 2.418122172355652, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.24059886187314988, + "loss/reg": 0.0, + "step": 22990 + }, + { + "epoch": 0.1513157894736842, + "grad_norm": 2.328125, + "grad_norm_var": 0.1075592041015625, + "learning_rate": 0.0001, + "loss": 3.1222, + "loss/crossentropy": 2.184150278568268, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.26524696350097654, + "loss/reg": 0.0, + "step": 23000 + }, + { + "epoch": 0.15138157894736842, + "grad_norm": 2.546875, + "grad_norm_var": 0.12967020670572918, + "learning_rate": 0.0001, + "loss": 3.043, + "loss/crossentropy": 2.48363002538681, + "loss/hidden": 2.709375, + "loss/incoh": 0.0, + "loss/logits": 0.22335316240787506, + "loss/reg": 0.0, + "step": 23010 + }, + { + "epoch": 0.15144736842105264, + "grad_norm": 2.109375, + "grad_norm_var": 0.1149810791015625, + "learning_rate": 0.0001, + "loss": 3.1275, + "loss/crossentropy": 2.305170452594757, + "loss/hidden": 2.7234375, + "loss/incoh": 0.0, + "loss/logits": 0.23285460025072097, + "loss/reg": 0.0, + "step": 23020 + }, + { + "epoch": 0.15151315789473685, + "grad_norm": 2.21875, + "grad_norm_var": 0.06519775390625, + "learning_rate": 0.0001, + "loss": 3.0832, + "loss/crossentropy": 2.5363330364227297, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.22520080506801604, + "loss/reg": 0.0, + "step": 23030 + }, + { + "epoch": 0.15157894736842106, + "grad_norm": 2.125, + "grad_norm_var": 0.39241129557291665, + "learning_rate": 0.0001, + "loss": 3.113, + "loss/crossentropy": 2.1367889642715454, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.29226300269365313, + "loss/reg": 0.0, + "step": 23040 + }, + { + "epoch": 0.15164473684210528, + "grad_norm": 2.171875, + "grad_norm_var": 0.39695638020833335, + "learning_rate": 0.0001, + "loss": 3.1147, + "loss/crossentropy": 2.4076698064804076, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.34615216627717016, + "loss/reg": 0.0, + "step": 23050 + }, + { + "epoch": 0.15171052631578946, + "grad_norm": 2.125, + "grad_norm_var": 0.23273824055989584, + "learning_rate": 0.0001, + "loss": 3.0922, + "loss/crossentropy": 2.2620026230812074, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.24059030711650847, + "loss/reg": 0.0, + "step": 23060 + }, + { + "epoch": 0.15177631578947368, + "grad_norm": 2.125, + "grad_norm_var": 0.258984375, + "learning_rate": 0.0001, + "loss": 3.0849, + "loss/crossentropy": 2.4243380606174467, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.28078764528036115, + "loss/reg": 0.0, + "step": 23070 + }, + { + "epoch": 0.1518421052631579, + "grad_norm": 2.796875, + "grad_norm_var": 4.336297930295782e+17, + "learning_rate": 0.0001, + "loss": 3.1967, + "loss/crossentropy": 2.3340250134468077, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.2355951637029648, + "loss/reg": 0.0, + "step": 23080 + }, + { + "epoch": 0.1519078947368421, + "grad_norm": 2.265625, + "grad_norm_var": 4.336297930357517e+17, + "learning_rate": 0.0001, + "loss": 3.031, + "loss/crossentropy": 2.1574897170066833, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.21495652794837952, + "loss/reg": 0.0, + "step": 23090 + }, + { + "epoch": 0.15197368421052632, + "grad_norm": 2.109375, + "grad_norm_var": 0.1123199462890625, + "learning_rate": 0.0001, + "loss": 3.1392, + "loss/crossentropy": 2.405716967582703, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.27174981236457824, + "loss/reg": 0.0, + "step": 23100 + }, + { + "epoch": 0.15203947368421053, + "grad_norm": 2.359375, + "grad_norm_var": 2.003859923676365e+16, + "learning_rate": 0.0001, + "loss": 3.2336, + "loss/crossentropy": 2.339055967330933, + "loss/hidden": 3.0265625, + "loss/incoh": 0.0, + "loss/logits": 0.28572248220443724, + "loss/reg": 0.0, + "step": 23110 + }, + { + "epoch": 0.15210526315789474, + "grad_norm": 2.125, + "grad_norm_var": 0.21015218098958333, + "learning_rate": 0.0001, + "loss": 3.0005, + "loss/crossentropy": 2.2984538078308105, + "loss/hidden": 2.684375, + "loss/incoh": 0.0, + "loss/logits": 0.21156432926654817, + "loss/reg": 0.0, + "step": 23120 + }, + { + "epoch": 0.15217105263157896, + "grad_norm": 2.140625, + "grad_norm_var": 0.08379618326822917, + "learning_rate": 0.0001, + "loss": 3.116, + "loss/crossentropy": 2.3342798829078673, + "loss/hidden": 2.7984375, + "loss/incoh": 0.0, + "loss/logits": 0.22476595863699914, + "loss/reg": 0.0, + "step": 23130 + }, + { + "epoch": 0.15223684210526317, + "grad_norm": 2.5, + "grad_norm_var": 0.0606842041015625, + "learning_rate": 0.0001, + "loss": 3.0265, + "loss/crossentropy": 2.3210121750831605, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.23322480022907258, + "loss/reg": 0.0, + "step": 23140 + }, + { + "epoch": 0.15230263157894736, + "grad_norm": 2.109375, + "grad_norm_var": 0.23906962076822916, + "learning_rate": 0.0001, + "loss": 3.0813, + "loss/crossentropy": 2.391731929779053, + "loss/hidden": 3.05625, + "loss/incoh": 0.0, + "loss/logits": 0.3979014977812767, + "loss/reg": 0.0, + "step": 23150 + }, + { + "epoch": 0.15236842105263157, + "grad_norm": 2.609375, + "grad_norm_var": 0.049214680989583336, + "learning_rate": 0.0001, + "loss": 3.0574, + "loss/crossentropy": 2.492924761772156, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.23957584500312806, + "loss/reg": 0.0, + "step": 23160 + }, + { + "epoch": 0.15243421052631578, + "grad_norm": 1.9765625, + "grad_norm_var": 0.07945938110351562, + "learning_rate": 0.0001, + "loss": 3.0317, + "loss/crossentropy": 2.3342637419700623, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.30051169246435167, + "loss/reg": 0.0, + "step": 23170 + }, + { + "epoch": 0.1525, + "grad_norm": 2.78125, + "grad_norm_var": 0.1088396708170573, + "learning_rate": 0.0001, + "loss": 3.1944, + "loss/crossentropy": 2.306346225738525, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.246099853515625, + "loss/reg": 0.0, + "step": 23180 + }, + { + "epoch": 0.1525657894736842, + "grad_norm": 2.5625, + "grad_norm_var": 0.17786839803059895, + "learning_rate": 0.0001, + "loss": 3.0341, + "loss/crossentropy": 2.030124247074127, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.21610897928476333, + "loss/reg": 0.0, + "step": 23190 + }, + { + "epoch": 0.15263157894736842, + "grad_norm": 2.203125, + "grad_norm_var": 0.12164688110351562, + "learning_rate": 0.0001, + "loss": 3.0381, + "loss/crossentropy": 2.3866997003555297, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.25531851798295974, + "loss/reg": 0.0, + "step": 23200 + }, + { + "epoch": 0.15269736842105264, + "grad_norm": 2.140625, + "grad_norm_var": 0.13189697265625, + "learning_rate": 0.0001, + "loss": 3.1059, + "loss/crossentropy": 2.6614113092422484, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.26844717264175416, + "loss/reg": 0.0, + "step": 23210 + }, + { + "epoch": 0.15276315789473685, + "grad_norm": 2.328125, + "grad_norm_var": 0.24109598795572917, + "learning_rate": 0.0001, + "loss": 3.0657, + "loss/crossentropy": 2.384077048301697, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.2543630987405777, + "loss/reg": 0.0, + "step": 23220 + }, + { + "epoch": 0.15282894736842106, + "grad_norm": 2.21875, + "grad_norm_var": 0.27134501139322914, + "learning_rate": 0.0001, + "loss": 3.0512, + "loss/crossentropy": 2.4361581921577455, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.24168266355991364, + "loss/reg": 0.0, + "step": 23230 + }, + { + "epoch": 0.15289473684210525, + "grad_norm": 2.6875, + "grad_norm_var": 0.06411031087239584, + "learning_rate": 0.0001, + "loss": 3.1163, + "loss/crossentropy": 2.439437985420227, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.23927776366472245, + "loss/reg": 0.0, + "step": 23240 + }, + { + "epoch": 0.15296052631578946, + "grad_norm": 2.4375, + "grad_norm_var": 0.20168355305989583, + "learning_rate": 0.0001, + "loss": 3.1405, + "loss/crossentropy": 1.9147474735975265, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.27681511342525483, + "loss/reg": 0.0, + "step": 23250 + }, + { + "epoch": 0.15302631578947368, + "grad_norm": 2.4375, + "grad_norm_var": 0.3036122639973958, + "learning_rate": 0.0001, + "loss": 3.1139, + "loss/crossentropy": 1.9262932538986206, + "loss/hidden": 2.9421875, + "loss/incoh": 0.0, + "loss/logits": 0.2078495942056179, + "loss/reg": 0.0, + "step": 23260 + }, + { + "epoch": 0.1530921052631579, + "grad_norm": 3.1875, + "grad_norm_var": 0.26516520182291664, + "learning_rate": 0.0001, + "loss": 3.0706, + "loss/crossentropy": 2.252467918395996, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.24831898286938667, + "loss/reg": 0.0, + "step": 23270 + }, + { + "epoch": 0.1531578947368421, + "grad_norm": 2.921875, + "grad_norm_var": 0.12827860514322917, + "learning_rate": 0.0001, + "loss": 3.1631, + "loss/crossentropy": 2.0835241615772246, + "loss/hidden": 3.0796875, + "loss/incoh": 0.0, + "loss/logits": 0.2629551820456982, + "loss/reg": 0.0, + "step": 23280 + }, + { + "epoch": 0.15322368421052632, + "grad_norm": 4.0625, + "grad_norm_var": 0.2850494384765625, + "learning_rate": 0.0001, + "loss": 3.0116, + "loss/crossentropy": 2.533276152610779, + "loss/hidden": 2.6890625, + "loss/incoh": 0.0, + "loss/logits": 0.25189711451530455, + "loss/reg": 0.0, + "step": 23290 + }, + { + "epoch": 0.15328947368421053, + "grad_norm": 1.984375, + "grad_norm_var": 0.28486328125, + "learning_rate": 0.0001, + "loss": 3.0549, + "loss/crossentropy": 2.2883153557777405, + "loss/hidden": 2.6359375, + "loss/incoh": 0.0, + "loss/logits": 0.20354210436344147, + "loss/reg": 0.0, + "step": 23300 + }, + { + "epoch": 0.15335526315789474, + "grad_norm": 3.03125, + "grad_norm_var": 0.1171295166015625, + "learning_rate": 0.0001, + "loss": 3.1273, + "loss/crossentropy": 2.3346426010131838, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.32933240532875063, + "loss/reg": 0.0, + "step": 23310 + }, + { + "epoch": 0.15342105263157896, + "grad_norm": 2.65625, + "grad_norm_var": 0.15787760416666666, + "learning_rate": 0.0001, + "loss": 3.141, + "loss/crossentropy": 2.1821151852607725, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.2542765408754349, + "loss/reg": 0.0, + "step": 23320 + }, + { + "epoch": 0.15348684210526317, + "grad_norm": 2.640625, + "grad_norm_var": 0.1633453369140625, + "learning_rate": 0.0001, + "loss": 3.1175, + "loss/crossentropy": 2.303962028026581, + "loss/hidden": 3.0, + "loss/incoh": 0.0, + "loss/logits": 0.25389211922883986, + "loss/reg": 0.0, + "step": 23330 + }, + { + "epoch": 0.15355263157894736, + "grad_norm": 2.46875, + "grad_norm_var": 6.715729777018229, + "learning_rate": 0.0001, + "loss": 3.1862, + "loss/crossentropy": 2.387230467796326, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.26721241027116777, + "loss/reg": 0.0, + "step": 23340 + }, + { + "epoch": 0.15361842105263157, + "grad_norm": 2.3125, + "grad_norm_var": 0.14257405598958334, + "learning_rate": 0.0001, + "loss": 3.1622, + "loss/crossentropy": 2.4242820143699646, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.29456254839897156, + "loss/reg": 0.0, + "step": 23350 + }, + { + "epoch": 0.15368421052631578, + "grad_norm": 2.4375, + "grad_norm_var": 0.18007405598958334, + "learning_rate": 0.0001, + "loss": 3.1082, + "loss/crossentropy": 2.1694801568984987, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.3005404189229012, + "loss/reg": 0.0, + "step": 23360 + }, + { + "epoch": 0.15375, + "grad_norm": 2.4375, + "grad_norm_var": 0.13083089192708333, + "learning_rate": 0.0001, + "loss": 3.0146, + "loss/crossentropy": 2.5201270818710326, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.23150605708360672, + "loss/reg": 0.0, + "step": 23370 + }, + { + "epoch": 0.1538157894736842, + "grad_norm": 2.21875, + "grad_norm_var": 0.1706621805826823, + "learning_rate": 0.0001, + "loss": 3.04, + "loss/crossentropy": 2.4117377281188963, + "loss/hidden": 2.6984375, + "loss/incoh": 0.0, + "loss/logits": 0.22549189925193786, + "loss/reg": 0.0, + "step": 23380 + }, + { + "epoch": 0.15388157894736842, + "grad_norm": 2.4375, + "grad_norm_var": 0.049843088785807295, + "learning_rate": 0.0001, + "loss": 3.0355, + "loss/crossentropy": 2.166217362880707, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.2460268869996071, + "loss/reg": 0.0, + "step": 23390 + }, + { + "epoch": 0.15394736842105264, + "grad_norm": 2.3125, + "grad_norm_var": 0.027437337239583335, + "learning_rate": 0.0001, + "loss": 3.0332, + "loss/crossentropy": 2.2912797331809998, + "loss/hidden": 2.671875, + "loss/incoh": 0.0, + "loss/logits": 0.2184738412499428, + "loss/reg": 0.0, + "step": 23400 + }, + { + "epoch": 0.15401315789473685, + "grad_norm": 2.46875, + "grad_norm_var": 0.02008056640625, + "learning_rate": 0.0001, + "loss": 3.049, + "loss/crossentropy": 2.2143723726272584, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.23227078467607498, + "loss/reg": 0.0, + "step": 23410 + }, + { + "epoch": 0.15407894736842107, + "grad_norm": 2.46875, + "grad_norm_var": 0.04762140909830729, + "learning_rate": 0.0001, + "loss": 2.9746, + "loss/crossentropy": 2.1996548891067507, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.20859691947698594, + "loss/reg": 0.0, + "step": 23420 + }, + { + "epoch": 0.15414473684210525, + "grad_norm": 2.390625, + "grad_norm_var": 0.08440348307291666, + "learning_rate": 0.0001, + "loss": 3.1332, + "loss/crossentropy": 2.353720319271088, + "loss/hidden": 3.0734375, + "loss/incoh": 0.0, + "loss/logits": 0.32487219721078875, + "loss/reg": 0.0, + "step": 23430 + }, + { + "epoch": 0.15421052631578946, + "grad_norm": 2.421875, + "grad_norm_var": 0.04627278645833333, + "learning_rate": 0.0001, + "loss": 3.0352, + "loss/crossentropy": 1.994243037700653, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.2228219524025917, + "loss/reg": 0.0, + "step": 23440 + }, + { + "epoch": 0.15427631578947368, + "grad_norm": 2.515625, + "grad_norm_var": 0.025699869791666666, + "learning_rate": 0.0001, + "loss": 3.0571, + "loss/crossentropy": 2.368081831932068, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.24828383028507234, + "loss/reg": 0.0, + "step": 23450 + }, + { + "epoch": 0.1543421052631579, + "grad_norm": 2.734375, + "grad_norm_var": 0.06443684895833333, + "learning_rate": 0.0001, + "loss": 3.0708, + "loss/crossentropy": 2.1184537053108214, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.2568272680044174, + "loss/reg": 0.0, + "step": 23460 + }, + { + "epoch": 0.1544078947368421, + "grad_norm": 2.421875, + "grad_norm_var": 0.13352762858072917, + "learning_rate": 0.0001, + "loss": 3.1323, + "loss/crossentropy": 2.3327906489372254, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.2940856009721756, + "loss/reg": 0.0, + "step": 23470 + }, + { + "epoch": 0.15447368421052632, + "grad_norm": 2.4375, + "grad_norm_var": 0.14630533854166666, + "learning_rate": 0.0001, + "loss": 3.0207, + "loss/crossentropy": 2.5257789850234986, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.24059889316558838, + "loss/reg": 0.0, + "step": 23480 + }, + { + "epoch": 0.15453947368421053, + "grad_norm": 2.34375, + "grad_norm_var": 0.059357706705729166, + "learning_rate": 0.0001, + "loss": 3.0888, + "loss/crossentropy": 2.0001726031303404, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.2182185634970665, + "loss/reg": 0.0, + "step": 23490 + }, + { + "epoch": 0.15460526315789475, + "grad_norm": 2.203125, + "grad_norm_var": 0.07535171508789062, + "learning_rate": 0.0001, + "loss": 3.0415, + "loss/crossentropy": 2.4137069463729857, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.26817760467529295, + "loss/reg": 0.0, + "step": 23500 + }, + { + "epoch": 0.15467105263157896, + "grad_norm": 2.296875, + "grad_norm_var": 0.09633967081705729, + "learning_rate": 0.0001, + "loss": 3.0427, + "loss/crossentropy": 2.534043550491333, + "loss/hidden": 2.63125, + "loss/incoh": 0.0, + "loss/logits": 0.22827706933021547, + "loss/reg": 0.0, + "step": 23510 + }, + { + "epoch": 0.15473684210526314, + "grad_norm": 2.1875, + "grad_norm_var": 0.0346832275390625, + "learning_rate": 0.0001, + "loss": 3.0819, + "loss/crossentropy": 2.1481072187423704, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.22196744605898858, + "loss/reg": 0.0, + "step": 23520 + }, + { + "epoch": 0.15480263157894736, + "grad_norm": 2.734375, + "grad_norm_var": 0.07886962890625, + "learning_rate": 0.0001, + "loss": 3.1167, + "loss/crossentropy": 2.0396682798862455, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.22266072854399682, + "loss/reg": 0.0, + "step": 23530 + }, + { + "epoch": 0.15486842105263157, + "grad_norm": 2.0625, + "grad_norm_var": 0.038182576497395836, + "learning_rate": 0.0001, + "loss": 3.075, + "loss/crossentropy": 2.164088273048401, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2313580572605133, + "loss/reg": 0.0, + "step": 23540 + }, + { + "epoch": 0.15493421052631579, + "grad_norm": 2.296875, + "grad_norm_var": 0.2714515686035156, + "learning_rate": 0.0001, + "loss": 3.0593, + "loss/crossentropy": 2.304581105709076, + "loss/hidden": 2.75625, + "loss/incoh": 0.0, + "loss/logits": 0.24230952113866805, + "loss/reg": 0.0, + "step": 23550 + }, + { + "epoch": 0.155, + "grad_norm": 2.421875, + "grad_norm_var": 0.1161376953125, + "learning_rate": 0.0001, + "loss": 2.9886, + "loss/crossentropy": 1.9074640274047852, + "loss/hidden": 2.934375, + "loss/incoh": 0.0, + "loss/logits": 0.1951706364750862, + "loss/reg": 0.0, + "step": 23560 + }, + { + "epoch": 0.1550657894736842, + "grad_norm": 2.515625, + "grad_norm_var": 0.12790908813476562, + "learning_rate": 0.0001, + "loss": 3.0776, + "loss/crossentropy": 2.3702269911766054, + "loss/hidden": 3.0203125, + "loss/incoh": 0.0, + "loss/logits": 0.2716818228363991, + "loss/reg": 0.0, + "step": 23570 + }, + { + "epoch": 0.15513157894736843, + "grad_norm": 2.578125, + "grad_norm_var": 0.1339019775390625, + "learning_rate": 0.0001, + "loss": 3.0873, + "loss/crossentropy": 2.205389070510864, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.2579692542552948, + "loss/reg": 0.0, + "step": 23580 + }, + { + "epoch": 0.15519736842105264, + "grad_norm": 2.296875, + "grad_norm_var": 0.06905924479166667, + "learning_rate": 0.0001, + "loss": 3.088, + "loss/crossentropy": 2.091149592399597, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.2086069330573082, + "loss/reg": 0.0, + "step": 23590 + }, + { + "epoch": 0.15526315789473685, + "grad_norm": 2.390625, + "grad_norm_var": 0.056722005208333336, + "learning_rate": 0.0001, + "loss": 3.1228, + "loss/crossentropy": 2.2192450404167174, + "loss/hidden": 3.028125, + "loss/incoh": 0.0, + "loss/logits": 0.28743477165699005, + "loss/reg": 0.0, + "step": 23600 + }, + { + "epoch": 0.15532894736842107, + "grad_norm": 2.328125, + "grad_norm_var": 0.05885009765625, + "learning_rate": 0.0001, + "loss": 3.0878, + "loss/crossentropy": 2.531976878643036, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.2344336360692978, + "loss/reg": 0.0, + "step": 23610 + }, + { + "epoch": 0.15539473684210525, + "grad_norm": 2.984375, + "grad_norm_var": 0.07700907389322917, + "learning_rate": 0.0001, + "loss": 3.0331, + "loss/crossentropy": 2.2710886120796205, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.2305494710803032, + "loss/reg": 0.0, + "step": 23620 + }, + { + "epoch": 0.15546052631578947, + "grad_norm": 2.625, + "grad_norm_var": 0.055964152018229164, + "learning_rate": 0.0001, + "loss": 3.1336, + "loss/crossentropy": 2.3865880966186523, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.25577795803546904, + "loss/reg": 0.0, + "step": 23630 + }, + { + "epoch": 0.15552631578947368, + "grad_norm": 2.109375, + "grad_norm_var": 0.05320536295572917, + "learning_rate": 0.0001, + "loss": 3.1079, + "loss/crossentropy": 2.388904869556427, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.26212394386529925, + "loss/reg": 0.0, + "step": 23640 + }, + { + "epoch": 0.1555921052631579, + "grad_norm": 2.25, + "grad_norm_var": 0.28601252237955727, + "learning_rate": 0.0001, + "loss": 3.0189, + "loss/crossentropy": 2.3917470812797545, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.28409580439329146, + "loss/reg": 0.0, + "step": 23650 + }, + { + "epoch": 0.1556578947368421, + "grad_norm": 2.390625, + "grad_norm_var": 0.05244038899739583, + "learning_rate": 0.0001, + "loss": 3.0892, + "loss/crossentropy": 2.266455078125, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.23892848715186119, + "loss/reg": 0.0, + "step": 23660 + }, + { + "epoch": 0.15572368421052632, + "grad_norm": 2.53125, + "grad_norm_var": 0.021256510416666666, + "learning_rate": 0.0001, + "loss": 3.1403, + "loss/crossentropy": 2.3996535420417784, + "loss/hidden": 3.0, + "loss/incoh": 0.0, + "loss/logits": 0.32787969559431074, + "loss/reg": 0.0, + "step": 23670 + }, + { + "epoch": 0.15578947368421053, + "grad_norm": 2.359375, + "grad_norm_var": 0.03367411295572917, + "learning_rate": 0.0001, + "loss": 3.0311, + "loss/crossentropy": 2.2268054604530336, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.25249754935503005, + "loss/reg": 0.0, + "step": 23680 + }, + { + "epoch": 0.15585526315789475, + "grad_norm": 2.328125, + "grad_norm_var": 0.112744140625, + "learning_rate": 0.0001, + "loss": 3.1572, + "loss/crossentropy": 2.4530532598495483, + "loss/hidden": 2.971875, + "loss/incoh": 0.0, + "loss/logits": 0.2573227033019066, + "loss/reg": 0.0, + "step": 23690 + }, + { + "epoch": 0.15592105263157896, + "grad_norm": 2.40625, + "grad_norm_var": 0.2130767822265625, + "learning_rate": 0.0001, + "loss": 3.0932, + "loss/crossentropy": 2.4343389749526976, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.26323390007019043, + "loss/reg": 0.0, + "step": 23700 + }, + { + "epoch": 0.15598684210526315, + "grad_norm": 2.421875, + "grad_norm_var": 0.15120340983072916, + "learning_rate": 0.0001, + "loss": 3.1834, + "loss/crossentropy": 2.1292245745658875, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.23169804438948632, + "loss/reg": 0.0, + "step": 23710 + }, + { + "epoch": 0.15605263157894736, + "grad_norm": 2.375, + "grad_norm_var": 0.0550201416015625, + "learning_rate": 0.0001, + "loss": 3.0296, + "loss/crossentropy": 2.4026230216026305, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.24430365711450577, + "loss/reg": 0.0, + "step": 23720 + }, + { + "epoch": 0.15611842105263157, + "grad_norm": 1.96875, + "grad_norm_var": 0.07566731770833333, + "learning_rate": 0.0001, + "loss": 3.0245, + "loss/crossentropy": 2.186066722869873, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.2246588721871376, + "loss/reg": 0.0, + "step": 23730 + }, + { + "epoch": 0.1561842105263158, + "grad_norm": 2.46875, + "grad_norm_var": 0.05691731770833333, + "learning_rate": 0.0001, + "loss": 3.0638, + "loss/crossentropy": 2.3755934476852416, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.23479508757591247, + "loss/reg": 0.0, + "step": 23740 + }, + { + "epoch": 0.15625, + "grad_norm": 2.640625, + "grad_norm_var": 0.06516520182291667, + "learning_rate": 0.0001, + "loss": 3.1125, + "loss/crossentropy": 2.1289533019065856, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.22816545218229295, + "loss/reg": 0.0, + "step": 23750 + }, + { + "epoch": 0.1563157894736842, + "grad_norm": 2.0625, + "grad_norm_var": 0.15618489583333334, + "learning_rate": 0.0001, + "loss": 3.1231, + "loss/crossentropy": 2.5586398363113405, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.2851672574877739, + "loss/reg": 0.0, + "step": 23760 + }, + { + "epoch": 0.15638157894736843, + "grad_norm": 2.5, + "grad_norm_var": 0.1464752197265625, + "learning_rate": 0.0001, + "loss": 3.063, + "loss/crossentropy": 2.3234678387641905, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.2802169814705849, + "loss/reg": 0.0, + "step": 23770 + }, + { + "epoch": 0.15644736842105264, + "grad_norm": 2.5625, + "grad_norm_var": 3.3502559026353274e+17, + "learning_rate": 0.0001, + "loss": 3.217, + "loss/crossentropy": 2.1664799213409425, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.24028320759534835, + "loss/reg": 0.0, + "step": 23780 + }, + { + "epoch": 0.15651315789473685, + "grad_norm": 2.4375, + "grad_norm_var": 3.35025590324881e+17, + "learning_rate": 0.0001, + "loss": 3.0729, + "loss/crossentropy": 2.4900641202926637, + "loss/hidden": 2.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.22242969423532485, + "loss/reg": 0.0, + "step": 23790 + }, + { + "epoch": 0.15657894736842104, + "grad_norm": 3.125, + "grad_norm_var": 0.42876561482747394, + "learning_rate": 0.0001, + "loss": 3.0548, + "loss/crossentropy": 2.285599112510681, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.27018204629421233, + "loss/reg": 0.0, + "step": 23800 + }, + { + "epoch": 0.15664473684210525, + "grad_norm": 2.328125, + "grad_norm_var": 0.41369527180989585, + "learning_rate": 0.0001, + "loss": 2.9886, + "loss/crossentropy": 2.352921783924103, + "loss/hidden": 2.659375, + "loss/incoh": 0.0, + "loss/logits": 0.20102634727954866, + "loss/reg": 0.0, + "step": 23810 + }, + { + "epoch": 0.15671052631578947, + "grad_norm": 2.0625, + "grad_norm_var": 0.11093343098958333, + "learning_rate": 0.0001, + "loss": 3.0036, + "loss/crossentropy": 2.4093087553977965, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.24685168713331224, + "loss/reg": 0.0, + "step": 23820 + }, + { + "epoch": 0.15677631578947368, + "grad_norm": 2.140625, + "grad_norm_var": 0.0461578369140625, + "learning_rate": 0.0001, + "loss": 3.0504, + "loss/crossentropy": 2.2815535068511963, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.21896685659885406, + "loss/reg": 0.0, + "step": 23830 + }, + { + "epoch": 0.1568421052631579, + "grad_norm": 2.15625, + "grad_norm_var": 0.07251561482747396, + "learning_rate": 0.0001, + "loss": 2.9885, + "loss/crossentropy": 2.6134737491607667, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.22691430598497392, + "loss/reg": 0.0, + "step": 23840 + }, + { + "epoch": 0.1569078947368421, + "grad_norm": 4.53125, + "grad_norm_var": 0.35949605305989585, + "learning_rate": 0.0001, + "loss": 3.0464, + "loss/crossentropy": 2.220982587337494, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.22496577724814415, + "loss/reg": 0.0, + "step": 23850 + }, + { + "epoch": 0.15697368421052632, + "grad_norm": 2.1875, + "grad_norm_var": 0.4110636393229167, + "learning_rate": 0.0001, + "loss": 3.0667, + "loss/crossentropy": 2.291455662250519, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.2523698851466179, + "loss/reg": 0.0, + "step": 23860 + }, + { + "epoch": 0.15703947368421053, + "grad_norm": 2.46875, + "grad_norm_var": 0.2355133056640625, + "learning_rate": 0.0001, + "loss": 3.0867, + "loss/crossentropy": 2.4676188588142396, + "loss/hidden": 2.621875, + "loss/incoh": 0.0, + "loss/logits": 0.20559777021408082, + "loss/reg": 0.0, + "step": 23870 + }, + { + "epoch": 0.15710526315789475, + "grad_norm": 2.609375, + "grad_norm_var": 0.09799702962239583, + "learning_rate": 0.0001, + "loss": 3.0123, + "loss/crossentropy": 2.4020154595375063, + "loss/hidden": 2.98125, + "loss/incoh": 0.0, + "loss/logits": 0.24061929136514665, + "loss/reg": 0.0, + "step": 23880 + }, + { + "epoch": 0.15717105263157893, + "grad_norm": 2.578125, + "grad_norm_var": 2.1426422119140627, + "learning_rate": 0.0001, + "loss": 3.02, + "loss/crossentropy": 2.0822024583816527, + "loss/hidden": 2.7421875, + "loss/incoh": 0.0, + "loss/logits": 0.23049053847789763, + "loss/reg": 0.0, + "step": 23890 + }, + { + "epoch": 0.15723684210526315, + "grad_norm": 2.1875, + "grad_norm_var": 2.0829254150390626, + "learning_rate": 0.0001, + "loss": 3.0151, + "loss/crossentropy": 2.210465407371521, + "loss/hidden": 3.0328125, + "loss/incoh": 0.0, + "loss/logits": 0.24976756423711777, + "loss/reg": 0.0, + "step": 23900 + }, + { + "epoch": 0.15730263157894736, + "grad_norm": 2.546875, + "grad_norm_var": 0.5316731770833333, + "learning_rate": 0.0001, + "loss": 3.0642, + "loss/crossentropy": 2.1934141278266908, + "loss/hidden": 2.5921875, + "loss/incoh": 0.0, + "loss/logits": 0.18757687732577324, + "loss/reg": 0.0, + "step": 23910 + }, + { + "epoch": 0.15736842105263157, + "grad_norm": 2.53125, + "grad_norm_var": 0.5506998697916666, + "learning_rate": 0.0001, + "loss": 3.1533, + "loss/crossentropy": 2.2942312955856323, + "loss/hidden": 3.0890625, + "loss/incoh": 0.0, + "loss/logits": 0.25652203559875486, + "loss/reg": 0.0, + "step": 23920 + }, + { + "epoch": 0.1574342105263158, + "grad_norm": 2.875, + "grad_norm_var": 0.38448893229166664, + "learning_rate": 0.0001, + "loss": 3.0496, + "loss/crossentropy": 2.4290681004524233, + "loss/hidden": 2.7046875, + "loss/incoh": 0.0, + "loss/logits": 0.22672214657068251, + "loss/reg": 0.0, + "step": 23930 + }, + { + "epoch": 0.1575, + "grad_norm": 2.796875, + "grad_norm_var": 0.35227864583333335, + "learning_rate": 0.0001, + "loss": 3.0868, + "loss/crossentropy": 2.1520013570785523, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.2738748073577881, + "loss/reg": 0.0, + "step": 23940 + }, + { + "epoch": 0.15756578947368421, + "grad_norm": 2.359375, + "grad_norm_var": 0.07700907389322917, + "learning_rate": 0.0001, + "loss": 3.1436, + "loss/crossentropy": 2.5199429869651793, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.2519889883697033, + "loss/reg": 0.0, + "step": 23950 + }, + { + "epoch": 0.15763157894736843, + "grad_norm": 2.171875, + "grad_norm_var": 1.195580037434896, + "learning_rate": 0.0001, + "loss": 3.0924, + "loss/crossentropy": 2.362662875652313, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.2809165805578232, + "loss/reg": 0.0, + "step": 23960 + }, + { + "epoch": 0.15769736842105264, + "grad_norm": 1.9921875, + "grad_norm_var": 1.2223609924316405, + "learning_rate": 0.0001, + "loss": 3.1196, + "loss/crossentropy": 2.360364031791687, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2587847799062729, + "loss/reg": 0.0, + "step": 23970 + }, + { + "epoch": 0.15776315789473686, + "grad_norm": 2.171875, + "grad_norm_var": 0.028562164306640624, + "learning_rate": 0.0001, + "loss": 3.0014, + "loss/crossentropy": 2.268549180030823, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.2278840996325016, + "loss/reg": 0.0, + "step": 23980 + }, + { + "epoch": 0.15782894736842104, + "grad_norm": 2.390625, + "grad_norm_var": 0.14148661295572917, + "learning_rate": 0.0001, + "loss": 3.0699, + "loss/crossentropy": 2.0497791051864622, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.2620595782995224, + "loss/reg": 0.0, + "step": 23990 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 2.78125, + "grad_norm_var": 2.927515672594874e+17, + "learning_rate": 0.0001, + "loss": 3.2727, + "loss/crossentropy": 2.035860624909401, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.20649760514497756, + "loss/reg": 0.0, + "step": 24000 + }, + { + "epoch": 0.15796052631578947, + "grad_norm": 3.265625, + "grad_norm_var": 2.927515672138351e+17, + "learning_rate": 0.0001, + "loss": 3.1828, + "loss/crossentropy": 2.358655941486359, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.23838587403297423, + "loss/reg": 0.0, + "step": 24010 + }, + { + "epoch": 0.15802631578947368, + "grad_norm": 4.34375, + "grad_norm_var": 0.4474029541015625, + "learning_rate": 0.0001, + "loss": 3.0939, + "loss/crossentropy": 2.436113882064819, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.25293364077806474, + "loss/reg": 0.0, + "step": 24020 + }, + { + "epoch": 0.1580921052631579, + "grad_norm": 2.15625, + "grad_norm_var": 0.346875, + "learning_rate": 0.0001, + "loss": 3.1031, + "loss/crossentropy": 2.1950599789619445, + "loss/hidden": 3.025, + "loss/incoh": 0.0, + "loss/logits": 0.29097774922847747, + "loss/reg": 0.0, + "step": 24030 + }, + { + "epoch": 0.1581578947368421, + "grad_norm": 3.5, + "grad_norm_var": 0.84117431640625, + "learning_rate": 0.0001, + "loss": 3.138, + "loss/crossentropy": 2.488498842716217, + "loss/hidden": 2.9921875, + "loss/incoh": 0.0, + "loss/logits": 0.2546138882637024, + "loss/reg": 0.0, + "step": 24040 + }, + { + "epoch": 0.15822368421052632, + "grad_norm": 2.296875, + "grad_norm_var": 0.20335286458333332, + "learning_rate": 0.0001, + "loss": 3.1208, + "loss/crossentropy": 2.1634037137031554, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.23214713484048843, + "loss/reg": 0.0, + "step": 24050 + }, + { + "epoch": 0.15828947368421054, + "grad_norm": 2.375, + "grad_norm_var": 0.036454264322916666, + "learning_rate": 0.0001, + "loss": 3.1708, + "loss/crossentropy": 2.3869518637657166, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.2666103199124336, + "loss/reg": 0.0, + "step": 24060 + }, + { + "epoch": 0.15835526315789475, + "grad_norm": 2.234375, + "grad_norm_var": 0.1547515869140625, + "learning_rate": 0.0001, + "loss": 3.0924, + "loss/crossentropy": 2.2607127904891966, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.25745444297790526, + "loss/reg": 0.0, + "step": 24070 + }, + { + "epoch": 0.15842105263157893, + "grad_norm": 4.0, + "grad_norm_var": 0.22906494140625, + "learning_rate": 0.0001, + "loss": 3.0239, + "loss/crossentropy": 2.2656429171562196, + "loss/hidden": 2.7203125, + "loss/incoh": 0.0, + "loss/logits": 0.2332296848297119, + "loss/reg": 0.0, + "step": 24080 + }, + { + "epoch": 0.15848684210526315, + "grad_norm": 2.078125, + "grad_norm_var": 0.23699442545572916, + "learning_rate": 0.0001, + "loss": 3.0181, + "loss/crossentropy": 2.2533790946006773, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.2367706872522831, + "loss/reg": 0.0, + "step": 24090 + }, + { + "epoch": 0.15855263157894736, + "grad_norm": 2.34375, + "grad_norm_var": 0.06172587076822917, + "learning_rate": 0.0001, + "loss": 3.045, + "loss/crossentropy": 2.3126723527908326, + "loss/hidden": 2.703125, + "loss/incoh": 0.0, + "loss/logits": 0.23582341223955156, + "loss/reg": 0.0, + "step": 24100 + }, + { + "epoch": 0.15861842105263158, + "grad_norm": 2.28125, + "grad_norm_var": 0.08645426432291667, + "learning_rate": 0.0001, + "loss": 3.0989, + "loss/crossentropy": 2.284153974056244, + "loss/hidden": 2.965625, + "loss/incoh": 0.0, + "loss/logits": 0.3264383256435394, + "loss/reg": 0.0, + "step": 24110 + }, + { + "epoch": 0.1586842105263158, + "grad_norm": 3.671875, + "grad_norm_var": 172.1703125, + "learning_rate": 0.0001, + "loss": 3.2515, + "loss/crossentropy": 2.463814842700958, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.2696057379245758, + "loss/reg": 0.0, + "step": 24120 + }, + { + "epoch": 0.15875, + "grad_norm": 2.34375, + "grad_norm_var": 0.19868062337239584, + "learning_rate": 0.0001, + "loss": 3.1115, + "loss/crossentropy": 2.41875559091568, + "loss/hidden": 2.75625, + "loss/incoh": 0.0, + "loss/logits": 0.2305321291089058, + "loss/reg": 0.0, + "step": 24130 + }, + { + "epoch": 0.15881578947368422, + "grad_norm": 2.140625, + "grad_norm_var": 0.5208513895670573, + "learning_rate": 0.0001, + "loss": 3.0293, + "loss/crossentropy": 2.051196885108948, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.22749217078089715, + "loss/reg": 0.0, + "step": 24140 + }, + { + "epoch": 0.15888157894736843, + "grad_norm": 2298478592.0, + "grad_norm_var": 3.301877386622895e+17, + "learning_rate": 0.0001, + "loss": 3.3081, + "loss/crossentropy": 2.3360164284706117, + "loss/hidden": 3.290625, + "loss/incoh": 0.0, + "loss/logits": 0.3266043797135353, + "loss/reg": 0.0, + "step": 24150 + }, + { + "epoch": 0.15894736842105264, + "grad_norm": 2.046875, + "grad_norm_var": 3.3018773920608314e+17, + "learning_rate": 0.0001, + "loss": 3.0123, + "loss/crossentropy": 2.5075667977333067, + "loss/hidden": 2.603125, + "loss/incoh": 0.0, + "loss/logits": 0.19614752382040024, + "loss/reg": 0.0, + "step": 24160 + }, + { + "epoch": 0.15901315789473683, + "grad_norm": 2.453125, + "grad_norm_var": 0.06570002237955729, + "learning_rate": 0.0001, + "loss": 3.0729, + "loss/crossentropy": 2.1672366857528687, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.2613453507423401, + "loss/reg": 0.0, + "step": 24170 + }, + { + "epoch": 0.15907894736842104, + "grad_norm": 2.109375, + "grad_norm_var": 0.437475331624349, + "learning_rate": 0.0001, + "loss": 3.073, + "loss/crossentropy": 2.1420519262552262, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.20748247653245927, + "loss/reg": 0.0, + "step": 24180 + }, + { + "epoch": 0.15914473684210526, + "grad_norm": 2.359375, + "grad_norm_var": 0.5490468343098959, + "learning_rate": 0.0001, + "loss": 3.1018, + "loss/crossentropy": 2.5135774850845336, + "loss/hidden": 2.98125, + "loss/incoh": 0.0, + "loss/logits": 0.2358848750591278, + "loss/reg": 0.0, + "step": 24190 + }, + { + "epoch": 0.15921052631578947, + "grad_norm": 2.15625, + "grad_norm_var": 0.05199559529622396, + "learning_rate": 0.0001, + "loss": 3.0415, + "loss/crossentropy": 2.392611360549927, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.29126308858394623, + "loss/reg": 0.0, + "step": 24200 + }, + { + "epoch": 0.15927631578947368, + "grad_norm": 2.03125, + "grad_norm_var": 0.3502886454264323, + "learning_rate": 0.0001, + "loss": 3.0974, + "loss/crossentropy": 2.2008650302886963, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2575715593993664, + "loss/reg": 0.0, + "step": 24210 + }, + { + "epoch": 0.1593421052631579, + "grad_norm": 2.625, + "grad_norm_var": 0.2436920166015625, + "learning_rate": 0.0001, + "loss": 3.1615, + "loss/crossentropy": 2.398923659324646, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.2272189199924469, + "loss/reg": 0.0, + "step": 24220 + }, + { + "epoch": 0.1594078947368421, + "grad_norm": 2.3125, + "grad_norm_var": 0.2711252848307292, + "learning_rate": 0.0001, + "loss": 3.1147, + "loss/crossentropy": 2.2159452080726623, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.24379997029900552, + "loss/reg": 0.0, + "step": 24230 + }, + { + "epoch": 0.15947368421052632, + "grad_norm": 2.40625, + "grad_norm_var": 0.16237691243489583, + "learning_rate": 0.0001, + "loss": 3.032, + "loss/crossentropy": 2.219667136669159, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.2103893890976906, + "loss/reg": 0.0, + "step": 24240 + }, + { + "epoch": 0.15953947368421054, + "grad_norm": 2.625, + "grad_norm_var": 0.035920206705729166, + "learning_rate": 0.0001, + "loss": 3.1149, + "loss/crossentropy": 2.0510872304439545, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.2106800675392151, + "loss/reg": 0.0, + "step": 24250 + }, + { + "epoch": 0.15960526315789475, + "grad_norm": 2.078125, + "grad_norm_var": 0.05742085774739583, + "learning_rate": 0.0001, + "loss": 2.9921, + "loss/crossentropy": 2.202706849575043, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.200262913107872, + "loss/reg": 0.0, + "step": 24260 + }, + { + "epoch": 0.15967105263157894, + "grad_norm": 2.03125, + "grad_norm_var": 0.47508138020833335, + "learning_rate": 0.0001, + "loss": 3.0355, + "loss/crossentropy": 2.312657380104065, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.25428467951714995, + "loss/reg": 0.0, + "step": 24270 + }, + { + "epoch": 0.15973684210526315, + "grad_norm": 2.09375, + "grad_norm_var": 0.11523030598958334, + "learning_rate": 0.0001, + "loss": 3.1753, + "loss/crossentropy": 2.6283345460891723, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.2865023031830788, + "loss/reg": 0.0, + "step": 24280 + }, + { + "epoch": 0.15980263157894736, + "grad_norm": 2.640625, + "grad_norm_var": 0.04806315104166667, + "learning_rate": 0.0001, + "loss": 3.0594, + "loss/crossentropy": 2.4725473642349245, + "loss/hidden": 2.6453125, + "loss/incoh": 0.0, + "loss/logits": 0.20711587965488434, + "loss/reg": 0.0, + "step": 24290 + }, + { + "epoch": 0.15986842105263158, + "grad_norm": 2.3125, + "grad_norm_var": 2.7487790628399786e+17, + "learning_rate": 0.0001, + "loss": 3.265, + "loss/crossentropy": 2.2610708713531493, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.27779700309038163, + "loss/reg": 0.0, + "step": 24300 + }, + { + "epoch": 0.1599342105263158, + "grad_norm": 1.828125, + "grad_norm_var": 2.748779062992896e+17, + "learning_rate": 0.0001, + "loss": 3.0342, + "loss/crossentropy": 2.4125430822372436, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.24020388573408127, + "loss/reg": 0.0, + "step": 24310 + }, + { + "epoch": 0.16, + "grad_norm": 2.4375, + "grad_norm_var": 0.10875244140625, + "learning_rate": 0.0001, + "loss": 3.0387, + "loss/crossentropy": 2.2118794441223146, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.23726068288087845, + "loss/reg": 0.0, + "step": 24320 + }, + { + "epoch": 0.16006578947368422, + "grad_norm": 2.109375, + "grad_norm_var": 0.052897135416666664, + "learning_rate": 0.0001, + "loss": 3.0208, + "loss/crossentropy": 2.4390548706054687, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.24900538772344588, + "loss/reg": 0.0, + "step": 24330 + }, + { + "epoch": 0.16013157894736843, + "grad_norm": 2.21875, + "grad_norm_var": 0.5782511393229167, + "learning_rate": 0.0001, + "loss": 3.2442, + "loss/crossentropy": 2.3729852437973022, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.23976370990276336, + "loss/reg": 0.0, + "step": 24340 + }, + { + "epoch": 0.16019736842105264, + "grad_norm": 2.328125, + "grad_norm_var": 0.11790364583333333, + "learning_rate": 0.0001, + "loss": 3.0854, + "loss/crossentropy": 2.4694011569023133, + "loss/hidden": 2.7875, + "loss/incoh": 0.0, + "loss/logits": 0.25265379548072814, + "loss/reg": 0.0, + "step": 24350 + }, + { + "epoch": 0.16026315789473683, + "grad_norm": 2.3125, + "grad_norm_var": 0.0570465087890625, + "learning_rate": 0.0001, + "loss": 3.1662, + "loss/crossentropy": 2.044074738025665, + "loss/hidden": 2.959375, + "loss/incoh": 0.0, + "loss/logits": 0.24245730862021447, + "loss/reg": 0.0, + "step": 24360 + }, + { + "epoch": 0.16032894736842104, + "grad_norm": 2.296875, + "grad_norm_var": 0.0236724853515625, + "learning_rate": 0.0001, + "loss": 3.0431, + "loss/crossentropy": 2.3142729878425596, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.22269122749567033, + "loss/reg": 0.0, + "step": 24370 + }, + { + "epoch": 0.16039473684210526, + "grad_norm": 2.0625, + "grad_norm_var": 0.08518473307291667, + "learning_rate": 0.0001, + "loss": 3.1019, + "loss/crossentropy": 2.0431338131427763, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.2301457904279232, + "loss/reg": 0.0, + "step": 24380 + }, + { + "epoch": 0.16046052631578947, + "grad_norm": 2.5, + "grad_norm_var": 0.08948160807291666, + "learning_rate": 0.0001, + "loss": 2.9974, + "loss/crossentropy": 2.215762954950333, + "loss/hidden": 2.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.21034832447767257, + "loss/reg": 0.0, + "step": 24390 + }, + { + "epoch": 0.16052631578947368, + "grad_norm": 2.390625, + "grad_norm_var": 0.013671875, + "learning_rate": 0.0001, + "loss": 3.0257, + "loss/crossentropy": 2.285198521614075, + "loss/hidden": 2.7625, + "loss/incoh": 0.0, + "loss/logits": 0.22049440741539, + "loss/reg": 0.0, + "step": 24400 + }, + { + "epoch": 0.1605921052631579, + "grad_norm": 2.296875, + "grad_norm_var": 0.04234110514322917, + "learning_rate": 0.0001, + "loss": 3.0891, + "loss/crossentropy": 2.509168195724487, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.28251273930072784, + "loss/reg": 0.0, + "step": 24410 + }, + { + "epoch": 0.1606578947368421, + "grad_norm": 2.71875, + "grad_norm_var": 0.33464253743489586, + "learning_rate": 0.0001, + "loss": 3.0359, + "loss/crossentropy": 2.0764550805091857, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.2346973106265068, + "loss/reg": 0.0, + "step": 24420 + }, + { + "epoch": 0.16072368421052632, + "grad_norm": 2.265625, + "grad_norm_var": 0.04140625, + "learning_rate": 0.0001, + "loss": 3.0962, + "loss/crossentropy": 2.4705166459083556, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.2445474848151207, + "loss/reg": 0.0, + "step": 24430 + }, + { + "epoch": 0.16078947368421054, + "grad_norm": 2.34375, + "grad_norm_var": 0.3681060791015625, + "learning_rate": 0.0001, + "loss": 3.0701, + "loss/crossentropy": 2.347282111644745, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.27417414337396623, + "loss/reg": 0.0, + "step": 24440 + }, + { + "epoch": 0.16085526315789472, + "grad_norm": 2.609375, + "grad_norm_var": 0.0697662353515625, + "learning_rate": 0.0001, + "loss": 3.1662, + "loss/crossentropy": 2.1282618045806885, + "loss/hidden": 3.0671875, + "loss/incoh": 0.0, + "loss/logits": 0.32502798140048983, + "loss/reg": 0.0, + "step": 24450 + }, + { + "epoch": 0.16092105263157894, + "grad_norm": 2.546875, + "grad_norm_var": 0.06052958170572917, + "learning_rate": 0.0001, + "loss": 3.0394, + "loss/crossentropy": 2.5319641828536987, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.23511321395635604, + "loss/reg": 0.0, + "step": 24460 + }, + { + "epoch": 0.16098684210526315, + "grad_norm": 3.84375, + "grad_norm_var": 0.18154067993164064, + "learning_rate": 0.0001, + "loss": 3.0401, + "loss/crossentropy": 2.3912153840065002, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.2340209573507309, + "loss/reg": 0.0, + "step": 24470 + }, + { + "epoch": 0.16105263157894736, + "grad_norm": 2.234375, + "grad_norm_var": 0.45718485514322915, + "learning_rate": 0.0001, + "loss": 3.0803, + "loss/crossentropy": 2.4838199734687807, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.23594623059034348, + "loss/reg": 0.0, + "step": 24480 + }, + { + "epoch": 0.16111842105263158, + "grad_norm": 2.40625, + "grad_norm_var": 0.12635269165039062, + "learning_rate": 0.0001, + "loss": 3.0486, + "loss/crossentropy": 2.2548271775245667, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.22111207991838455, + "loss/reg": 0.0, + "step": 24490 + }, + { + "epoch": 0.1611842105263158, + "grad_norm": 1.953125, + "grad_norm_var": 0.08367487589518229, + "learning_rate": 0.0001, + "loss": 3.0843, + "loss/crossentropy": 2.364280033111572, + "loss/hidden": 2.6921875, + "loss/incoh": 0.0, + "loss/logits": 0.2182385429739952, + "loss/reg": 0.0, + "step": 24500 + }, + { + "epoch": 0.16125, + "grad_norm": 2.28125, + "grad_norm_var": 0.13824462890625, + "learning_rate": 0.0001, + "loss": 3.1481, + "loss/crossentropy": 2.3593606472015383, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.24336900562047958, + "loss/reg": 0.0, + "step": 24510 + }, + { + "epoch": 0.16131578947368422, + "grad_norm": 2.15625, + "grad_norm_var": 0.24010009765625, + "learning_rate": 0.0001, + "loss": 3.0484, + "loss/crossentropy": 2.212929093837738, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.23406590819358825, + "loss/reg": 0.0, + "step": 24520 + }, + { + "epoch": 0.16138157894736843, + "grad_norm": 2.078125, + "grad_norm_var": 0.240869140625, + "learning_rate": 0.0001, + "loss": 3.1152, + "loss/crossentropy": 2.0746700882911684, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.20809975266456604, + "loss/reg": 0.0, + "step": 24530 + }, + { + "epoch": 0.16144736842105264, + "grad_norm": 2.40625, + "grad_norm_var": 0.1971343994140625, + "learning_rate": 0.0001, + "loss": 3.0887, + "loss/crossentropy": 2.5128459453582765, + "loss/hidden": 2.7875, + "loss/incoh": 0.0, + "loss/logits": 0.23968077450990677, + "loss/reg": 0.0, + "step": 24540 + }, + { + "epoch": 0.16151315789473683, + "grad_norm": 2.296875, + "grad_norm_var": 0.10077718098958334, + "learning_rate": 0.0001, + "loss": 3.0776, + "loss/crossentropy": 2.2077295780181885, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.24557382240891457, + "loss/reg": 0.0, + "step": 24550 + }, + { + "epoch": 0.16157894736842104, + "grad_norm": 2.15625, + "grad_norm_var": 0.09933268229166667, + "learning_rate": 0.0001, + "loss": 3.0416, + "loss/crossentropy": 2.3417739272117615, + "loss/hidden": 2.715625, + "loss/incoh": 0.0, + "loss/logits": 0.23852093145251274, + "loss/reg": 0.0, + "step": 24560 + }, + { + "epoch": 0.16164473684210526, + "grad_norm": 2.4375, + "grad_norm_var": 0.16413472493489584, + "learning_rate": 0.0001, + "loss": 3.1237, + "loss/crossentropy": 2.230830729007721, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.23145168870687485, + "loss/reg": 0.0, + "step": 24570 + }, + { + "epoch": 0.16171052631578947, + "grad_norm": 2.84375, + "grad_norm_var": 0.11370035807291666, + "learning_rate": 0.0001, + "loss": 3.1152, + "loss/crossentropy": 2.1511558890342712, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.24633188247680665, + "loss/reg": 0.0, + "step": 24580 + }, + { + "epoch": 0.16177631578947368, + "grad_norm": 2.15625, + "grad_norm_var": 0.3695818583170573, + "learning_rate": 0.0001, + "loss": 3.0297, + "loss/crossentropy": 2.3748416185379027, + "loss/hidden": 2.7984375, + "loss/incoh": 0.0, + "loss/logits": 0.2599245056509972, + "loss/reg": 0.0, + "step": 24590 + }, + { + "epoch": 0.1618421052631579, + "grad_norm": 1.96875, + "grad_norm_var": 0.11649576822916667, + "learning_rate": 0.0001, + "loss": 3.0334, + "loss/crossentropy": 1.958118262887001, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.2173558861017227, + "loss/reg": 0.0, + "step": 24600 + }, + { + "epoch": 0.1619078947368421, + "grad_norm": 2.171875, + "grad_norm_var": 0.09065729777018229, + "learning_rate": 0.0001, + "loss": 3.0806, + "loss/crossentropy": 2.3804409861564637, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.28529137223958967, + "loss/reg": 0.0, + "step": 24610 + }, + { + "epoch": 0.16197368421052633, + "grad_norm": 2.40625, + "grad_norm_var": 0.18765869140625, + "learning_rate": 0.0001, + "loss": 3.1415, + "loss/crossentropy": 2.4188234567642213, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.2486872687935829, + "loss/reg": 0.0, + "step": 24620 + }, + { + "epoch": 0.16203947368421054, + "grad_norm": 2.46875, + "grad_norm_var": 0.1975982666015625, + "learning_rate": 0.0001, + "loss": 3.0741, + "loss/crossentropy": 2.2737186551094055, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.21121124178171158, + "loss/reg": 0.0, + "step": 24630 + }, + { + "epoch": 0.16210526315789472, + "grad_norm": 2.5, + "grad_norm_var": 2.4922159830729167, + "learning_rate": 0.0001, + "loss": 3.0672, + "loss/crossentropy": 1.935386747121811, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.21296948455274106, + "loss/reg": 0.0, + "step": 24640 + }, + { + "epoch": 0.16217105263157894, + "grad_norm": 2.25, + "grad_norm_var": 0.053587849934895834, + "learning_rate": 0.0001, + "loss": 3.1024, + "loss/crossentropy": 2.456861126422882, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.29705640524625776, + "loss/reg": 0.0, + "step": 24650 + }, + { + "epoch": 0.16223684210526315, + "grad_norm": 2.140625, + "grad_norm_var": 0.2731679280598958, + "learning_rate": 0.0001, + "loss": 3.0683, + "loss/crossentropy": 2.3239927887916565, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.24205588102340697, + "loss/reg": 0.0, + "step": 24660 + }, + { + "epoch": 0.16230263157894737, + "grad_norm": 2.15625, + "grad_norm_var": 0.22311197916666667, + "learning_rate": 0.0001, + "loss": 3.0314, + "loss/crossentropy": 2.3312941789627075, + "loss/hidden": 2.971875, + "loss/incoh": 0.0, + "loss/logits": 0.28574763536453246, + "loss/reg": 0.0, + "step": 24670 + }, + { + "epoch": 0.16236842105263158, + "grad_norm": 2.859375, + "grad_norm_var": 0.24403889973958334, + "learning_rate": 0.0001, + "loss": 3.1553, + "loss/crossentropy": 2.493465173244476, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.23360646292567253, + "loss/reg": 0.0, + "step": 24680 + }, + { + "epoch": 0.1624342105263158, + "grad_norm": 2.84375, + "grad_norm_var": 0.16258138020833332, + "learning_rate": 0.0001, + "loss": 3.088, + "loss/crossentropy": 2.3201822876930236, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.24165551662445067, + "loss/reg": 0.0, + "step": 24690 + }, + { + "epoch": 0.1625, + "grad_norm": 2.84375, + "grad_norm_var": 0.18403218587239584, + "learning_rate": 0.0001, + "loss": 3.1727, + "loss/crossentropy": 2.2391496330499647, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.25802505761384964, + "loss/reg": 0.0, + "step": 24700 + }, + { + "epoch": 0.16256578947368422, + "grad_norm": 2.40625, + "grad_norm_var": 0.047663370768229164, + "learning_rate": 0.0001, + "loss": 3.0701, + "loss/crossentropy": 2.1174232959747314, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.25788239687681197, + "loss/reg": 0.0, + "step": 24710 + }, + { + "epoch": 0.16263157894736843, + "grad_norm": 2.390625, + "grad_norm_var": 0.03251520792643229, + "learning_rate": 0.0001, + "loss": 3.0028, + "loss/crossentropy": 2.303908097743988, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.22646306753158568, + "loss/reg": 0.0, + "step": 24720 + }, + { + "epoch": 0.16269736842105262, + "grad_norm": 2.328125, + "grad_norm_var": 0.09428609212239583, + "learning_rate": 0.0001, + "loss": 3.0573, + "loss/crossentropy": 2.4429296493530273, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.24618532359600068, + "loss/reg": 0.0, + "step": 24730 + }, + { + "epoch": 0.16276315789473683, + "grad_norm": 2.28125, + "grad_norm_var": 0.10568745930989583, + "learning_rate": 0.0001, + "loss": 3.0411, + "loss/crossentropy": 2.245293366909027, + "loss/hidden": 3.0265625, + "loss/incoh": 0.0, + "loss/logits": 0.3406794846057892, + "loss/reg": 0.0, + "step": 24740 + }, + { + "epoch": 0.16282894736842105, + "grad_norm": 2.453125, + "grad_norm_var": 0.24233169555664064, + "learning_rate": 0.0001, + "loss": 3.0254, + "loss/crossentropy": 2.1815924525260924, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.20982651114463807, + "loss/reg": 0.0, + "step": 24750 + }, + { + "epoch": 0.16289473684210526, + "grad_norm": 2.140625, + "grad_norm_var": 0.051889801025390626, + "learning_rate": 0.0001, + "loss": 3.0337, + "loss/crossentropy": 2.530372714996338, + "loss/hidden": 2.6765625, + "loss/incoh": 0.0, + "loss/logits": 0.22411757558584214, + "loss/reg": 0.0, + "step": 24760 + }, + { + "epoch": 0.16296052631578947, + "grad_norm": 2.34375, + "grad_norm_var": 0.025414021809895833, + "learning_rate": 0.0001, + "loss": 2.9668, + "loss/crossentropy": 2.271071231365204, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.23753461390733718, + "loss/reg": 0.0, + "step": 24770 + }, + { + "epoch": 0.16302631578947369, + "grad_norm": 2.765625, + "grad_norm_var": 0.0559722900390625, + "learning_rate": 0.0001, + "loss": 3.0646, + "loss/crossentropy": 2.2592864990234376, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.21606265306472777, + "loss/reg": 0.0, + "step": 24780 + }, + { + "epoch": 0.1630921052631579, + "grad_norm": 1.9921875, + "grad_norm_var": 0.05602595011393229, + "learning_rate": 0.0001, + "loss": 3.059, + "loss/crossentropy": 2.021050810813904, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.2471683457493782, + "loss/reg": 0.0, + "step": 24790 + }, + { + "epoch": 0.1631578947368421, + "grad_norm": 2.34375, + "grad_norm_var": 0.25566991170247394, + "learning_rate": 0.0001, + "loss": 3.0148, + "loss/crossentropy": 2.4111449480056764, + "loss/hidden": 2.678125, + "loss/incoh": 0.0, + "loss/logits": 0.23949681222438812, + "loss/reg": 0.0, + "step": 24800 + }, + { + "epoch": 0.16322368421052633, + "grad_norm": 2.421875, + "grad_norm_var": 0.21315816243489583, + "learning_rate": 0.0001, + "loss": 3.0918, + "loss/crossentropy": 2.5244083642959594, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.2454654648900032, + "loss/reg": 0.0, + "step": 24810 + }, + { + "epoch": 0.16328947368421054, + "grad_norm": 2.21875, + "grad_norm_var": 0.09656575520833334, + "learning_rate": 0.0001, + "loss": 2.9972, + "loss/crossentropy": 2.4637012243270875, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.2438522458076477, + "loss/reg": 0.0, + "step": 24820 + }, + { + "epoch": 0.16335526315789473, + "grad_norm": 2.1875, + "grad_norm_var": 0.03803609212239583, + "learning_rate": 0.0001, + "loss": 3.0278, + "loss/crossentropy": 2.2508904099464417, + "loss/hidden": 2.775, + "loss/incoh": 0.0, + "loss/logits": 0.24632504507899283, + "loss/reg": 0.0, + "step": 24830 + }, + { + "epoch": 0.16342105263157894, + "grad_norm": 2.96875, + "grad_norm_var": 0.0544921875, + "learning_rate": 0.0001, + "loss": 3.0273, + "loss/crossentropy": 2.2554102897644044, + "loss/hidden": 2.696875, + "loss/incoh": 0.0, + "loss/logits": 0.2244628369808197, + "loss/reg": 0.0, + "step": 24840 + }, + { + "epoch": 0.16348684210526315, + "grad_norm": 2.15625, + "grad_norm_var": 0.0915679931640625, + "learning_rate": 0.0001, + "loss": 2.9974, + "loss/crossentropy": 2.4115317344665526, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.22097424566745758, + "loss/reg": 0.0, + "step": 24850 + }, + { + "epoch": 0.16355263157894737, + "grad_norm": 2.328125, + "grad_norm_var": 0.0794342041015625, + "learning_rate": 0.0001, + "loss": 3.0875, + "loss/crossentropy": 2.2210743844509127, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.2252424478530884, + "loss/reg": 0.0, + "step": 24860 + }, + { + "epoch": 0.16361842105263158, + "grad_norm": 2.984375, + "grad_norm_var": 3.853392423706864e+17, + "learning_rate": 0.0001, + "loss": 3.1398, + "loss/crossentropy": 2.5947876930236817, + "loss/hidden": 3.771875, + "loss/incoh": 0.0, + "loss/logits": 0.33976440876722336, + "loss/reg": 0.0, + "step": 24870 + }, + { + "epoch": 0.1636842105263158, + "grad_norm": 2.171875, + "grad_norm_var": 3.853392423689082e+17, + "learning_rate": 0.0001, + "loss": 3.0576, + "loss/crossentropy": 2.177506458759308, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.254705648124218, + "loss/reg": 0.0, + "step": 24880 + }, + { + "epoch": 0.16375, + "grad_norm": 2.4375, + "grad_norm_var": 0.08782730102539063, + "learning_rate": 0.0001, + "loss": 3.0306, + "loss/crossentropy": 2.4746748208999634, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.2408411219716072, + "loss/reg": 0.0, + "step": 24890 + }, + { + "epoch": 0.16381578947368422, + "grad_norm": 2.1875, + "grad_norm_var": 0.07857666015625, + "learning_rate": 0.0001, + "loss": 3.0083, + "loss/crossentropy": 2.273878073692322, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.2608170732855797, + "loss/reg": 0.0, + "step": 24900 + }, + { + "epoch": 0.16388157894736843, + "grad_norm": 2.5, + "grad_norm_var": 0.17737528483072917, + "learning_rate": 0.0001, + "loss": 3.1428, + "loss/crossentropy": 2.240007960796356, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.22373474836349488, + "loss/reg": 0.0, + "step": 24910 + }, + { + "epoch": 0.16394736842105262, + "grad_norm": 2.25, + "grad_norm_var": 0.10565999348958334, + "learning_rate": 0.0001, + "loss": 3.0246, + "loss/crossentropy": 2.229018306732178, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.29760813266038894, + "loss/reg": 0.0, + "step": 24920 + }, + { + "epoch": 0.16401315789473683, + "grad_norm": 2.65625, + "grad_norm_var": 0.3898834228515625, + "learning_rate": 0.0001, + "loss": 3.1527, + "loss/crossentropy": 2.2012178540229796, + "loss/hidden": 2.959375, + "loss/incoh": 0.0, + "loss/logits": 0.28048097975552083, + "loss/reg": 0.0, + "step": 24930 + }, + { + "epoch": 0.16407894736842105, + "grad_norm": 2.765625, + "grad_norm_var": 0.1896636962890625, + "learning_rate": 0.0001, + "loss": 3.1143, + "loss/crossentropy": 2.396012032032013, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.31100255995988846, + "loss/reg": 0.0, + "step": 24940 + }, + { + "epoch": 0.16414473684210526, + "grad_norm": 2.109375, + "grad_norm_var": 0.11585464477539062, + "learning_rate": 0.0001, + "loss": 2.9923, + "loss/crossentropy": 2.35096116065979, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.1970454752445221, + "loss/reg": 0.0, + "step": 24950 + }, + { + "epoch": 0.16421052631578947, + "grad_norm": 2.4375, + "grad_norm_var": 0.048278554280598955, + "learning_rate": 0.0001, + "loss": 3.0105, + "loss/crossentropy": 2.1574088990688325, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.20654048770666122, + "loss/reg": 0.0, + "step": 24960 + }, + { + "epoch": 0.1642763157894737, + "grad_norm": 2.171875, + "grad_norm_var": 0.41565348307291666, + "learning_rate": 0.0001, + "loss": 3.1106, + "loss/crossentropy": 2.087399756908417, + "loss/hidden": 3.11875, + "loss/incoh": 0.0, + "loss/logits": 0.2551331013441086, + "loss/reg": 0.0, + "step": 24970 + }, + { + "epoch": 0.1643421052631579, + "grad_norm": 2.421875, + "grad_norm_var": 0.4007232666015625, + "learning_rate": 0.0001, + "loss": 3.1262, + "loss/crossentropy": 2.104507529735565, + "loss/hidden": 3.0828125, + "loss/incoh": 0.0, + "loss/logits": 0.3040700241923332, + "loss/reg": 0.0, + "step": 24980 + }, + { + "epoch": 0.16440789473684211, + "grad_norm": 2.46875, + "grad_norm_var": 0.29108072916666666, + "learning_rate": 0.0001, + "loss": 3.1264, + "loss/crossentropy": 2.393839418888092, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.2309044197201729, + "loss/reg": 0.0, + "step": 24990 + }, + { + "epoch": 0.16447368421052633, + "grad_norm": 2.65625, + "grad_norm_var": 0.34716695149739585, + "learning_rate": 0.0001, + "loss": 3.0873, + "loss/crossentropy": 2.295112156867981, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.2622707739472389, + "loss/reg": 0.0, + "step": 25000 + }, + { + "epoch": 0.1645394736842105, + "grad_norm": 2.9375, + "grad_norm_var": 2.9363606770833335, + "learning_rate": 0.0001, + "loss": 3.1494, + "loss/crossentropy": 2.0088412761688232, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.2086470142006874, + "loss/reg": 0.0, + "step": 25010 + }, + { + "epoch": 0.16460526315789473, + "grad_norm": 2.46875, + "grad_norm_var": 3.0145467122395835, + "learning_rate": 0.0001, + "loss": 3.0267, + "loss/crossentropy": 2.230376589298248, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.25718885809183123, + "loss/reg": 0.0, + "step": 25020 + }, + { + "epoch": 0.16467105263157894, + "grad_norm": 2.25, + "grad_norm_var": 0.16364644368489584, + "learning_rate": 0.0001, + "loss": 3.1098, + "loss/crossentropy": 2.2357122182846068, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.20328374579548836, + "loss/reg": 0.0, + "step": 25030 + }, + { + "epoch": 0.16473684210526315, + "grad_norm": 2.15625, + "grad_norm_var": 0.13963597615559895, + "learning_rate": 0.0001, + "loss": 3.0687, + "loss/crossentropy": 2.3226524710655214, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.23719747960567475, + "loss/reg": 0.0, + "step": 25040 + }, + { + "epoch": 0.16480263157894737, + "grad_norm": 2298478592.0, + "grad_norm_var": 3.301877391914183e+17, + "learning_rate": 0.0001, + "loss": 3.151, + "loss/crossentropy": 2.2604412317276, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.2530272454023361, + "loss/reg": 0.0, + "step": 25050 + }, + { + "epoch": 0.16486842105263158, + "grad_norm": 2.203125, + "grad_norm_var": 3.301877391448801e+17, + "learning_rate": 0.0001, + "loss": 3.0688, + "loss/crossentropy": 2.374041938781738, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.23906515687704086, + "loss/reg": 0.0, + "step": 25060 + }, + { + "epoch": 0.1649342105263158, + "grad_norm": 2.078125, + "grad_norm_var": 0.08870035807291667, + "learning_rate": 0.0001, + "loss": 3.0466, + "loss/crossentropy": 2.2033262491226195, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.2643999807536602, + "loss/reg": 0.0, + "step": 25070 + }, + { + "epoch": 0.165, + "grad_norm": 2.328125, + "grad_norm_var": 0.22410481770833332, + "learning_rate": 0.0001, + "loss": 3.0549, + "loss/crossentropy": 2.268805372714996, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.22531848698854445, + "loss/reg": 0.0, + "step": 25080 + }, + { + "epoch": 0.16506578947368422, + "grad_norm": 2.171875, + "grad_norm_var": 0.17864481608072916, + "learning_rate": 0.0001, + "loss": 3.05, + "loss/crossentropy": 2.32329341173172, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.23195795714855194, + "loss/reg": 0.0, + "step": 25090 + }, + { + "epoch": 0.16513157894736843, + "grad_norm": 2.078125, + "grad_norm_var": 0.07084859212239583, + "learning_rate": 0.0001, + "loss": 3.0737, + "loss/crossentropy": 2.3633965373039247, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.24620314687490463, + "loss/reg": 0.0, + "step": 25100 + }, + { + "epoch": 0.16519736842105262, + "grad_norm": 2.296875, + "grad_norm_var": 0.37231343587239585, + "learning_rate": 0.0001, + "loss": 3.0891, + "loss/crossentropy": 2.4329662203788756, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.23993222489953042, + "loss/reg": 0.0, + "step": 25110 + }, + { + "epoch": 0.16526315789473683, + "grad_norm": 2.359375, + "grad_norm_var": 0.48580322265625, + "learning_rate": 0.0001, + "loss": 3.0705, + "loss/crossentropy": 2.418682646751404, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.22389767169952393, + "loss/reg": 0.0, + "step": 25120 + }, + { + "epoch": 0.16532894736842105, + "grad_norm": 2.140625, + "grad_norm_var": 0.08493550618489583, + "learning_rate": 0.0001, + "loss": 3.0513, + "loss/crossentropy": 2.394873285293579, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.2614659383893013, + "loss/reg": 0.0, + "step": 25130 + }, + { + "epoch": 0.16539473684210526, + "grad_norm": 2.875, + "grad_norm_var": 0.19609273274739583, + "learning_rate": 0.0001, + "loss": 3.0834, + "loss/crossentropy": 2.1957722663879395, + "loss/hidden": 2.9609375, + "loss/incoh": 0.0, + "loss/logits": 0.2501178741455078, + "loss/reg": 0.0, + "step": 25140 + }, + { + "epoch": 0.16546052631578947, + "grad_norm": 2.140625, + "grad_norm_var": 0.17849019368489583, + "learning_rate": 0.0001, + "loss": 3.024, + "loss/crossentropy": 2.389811897277832, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.2392006903886795, + "loss/reg": 0.0, + "step": 25150 + }, + { + "epoch": 0.1655263157894737, + "grad_norm": 2.53125, + "grad_norm_var": 0.08551610310872396, + "learning_rate": 0.0001, + "loss": 3.0152, + "loss/crossentropy": 2.328965938091278, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.2474894031882286, + "loss/reg": 0.0, + "step": 25160 + }, + { + "epoch": 0.1655921052631579, + "grad_norm": 2.296875, + "grad_norm_var": 0.26106363932291665, + "learning_rate": 0.0001, + "loss": 3.092, + "loss/crossentropy": 2.5120302557945253, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.25525424629449844, + "loss/reg": 0.0, + "step": 25170 + }, + { + "epoch": 0.16565789473684212, + "grad_norm": 2.296875, + "grad_norm_var": 0.4998372395833333, + "learning_rate": 0.0001, + "loss": 3.1112, + "loss/crossentropy": 2.37037136554718, + "loss/hidden": 2.6546875, + "loss/incoh": 0.0, + "loss/logits": 0.22218506336212157, + "loss/reg": 0.0, + "step": 25180 + }, + { + "epoch": 0.16572368421052633, + "grad_norm": 4.6875, + "grad_norm_var": 0.4266916910807292, + "learning_rate": 0.0001, + "loss": 3.1019, + "loss/crossentropy": 2.3847400188446044, + "loss/hidden": 3.021875, + "loss/incoh": 0.0, + "loss/logits": 0.2786879613995552, + "loss/reg": 0.0, + "step": 25190 + }, + { + "epoch": 0.16578947368421051, + "grad_norm": 2.53125, + "grad_norm_var": 0.4002766927083333, + "learning_rate": 0.0001, + "loss": 3.0387, + "loss/crossentropy": 2.2348836183547975, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.24782560914754867, + "loss/reg": 0.0, + "step": 25200 + }, + { + "epoch": 0.16585526315789473, + "grad_norm": 2.4375, + "grad_norm_var": 0.21065165201822916, + "learning_rate": 0.0001, + "loss": 3.1195, + "loss/crossentropy": 2.257748210430145, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.21636683791875838, + "loss/reg": 0.0, + "step": 25210 + }, + { + "epoch": 0.16592105263157894, + "grad_norm": 2.515625, + "grad_norm_var": 0.05474853515625, + "learning_rate": 0.0001, + "loss": 3.099, + "loss/crossentropy": 2.4620559096336363, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.21316515058279037, + "loss/reg": 0.0, + "step": 25220 + }, + { + "epoch": 0.16598684210526315, + "grad_norm": 2.34375, + "grad_norm_var": 0.03912353515625, + "learning_rate": 0.0001, + "loss": 3.0173, + "loss/crossentropy": 2.3012343645095825, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.2387869328260422, + "loss/reg": 0.0, + "step": 25230 + }, + { + "epoch": 0.16605263157894737, + "grad_norm": 2.046875, + "grad_norm_var": 0.06116434733072917, + "learning_rate": 0.0001, + "loss": 3.0454, + "loss/crossentropy": 2.20231409072876, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.22614135295152665, + "loss/reg": 0.0, + "step": 25240 + }, + { + "epoch": 0.16611842105263158, + "grad_norm": 2.046875, + "grad_norm_var": 0.05122782389322917, + "learning_rate": 0.0001, + "loss": 3.0264, + "loss/crossentropy": 2.2305002450942992, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.22803680896759032, + "loss/reg": 0.0, + "step": 25250 + }, + { + "epoch": 0.1661842105263158, + "grad_norm": 2.0, + "grad_norm_var": 0.05510660807291667, + "learning_rate": 0.0001, + "loss": 3.0045, + "loss/crossentropy": 2.209262716770172, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.2319029837846756, + "loss/reg": 0.0, + "step": 25260 + }, + { + "epoch": 0.16625, + "grad_norm": 2.671875, + "grad_norm_var": 0.0671051025390625, + "learning_rate": 0.0001, + "loss": 3.0665, + "loss/crossentropy": 2.3621141076087953, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.24575576186180115, + "loss/reg": 0.0, + "step": 25270 + }, + { + "epoch": 0.16631578947368422, + "grad_norm": 2.359375, + "grad_norm_var": 0.1259674072265625, + "learning_rate": 0.0001, + "loss": 3.0746, + "loss/crossentropy": 2.297244334220886, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.21904474049806594, + "loss/reg": 0.0, + "step": 25280 + }, + { + "epoch": 0.1663815789473684, + "grad_norm": 2.4375, + "grad_norm_var": 0.12108968098958334, + "learning_rate": 0.0001, + "loss": 3.0591, + "loss/crossentropy": 2.167328989505768, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.254831300675869, + "loss/reg": 0.0, + "step": 25290 + }, + { + "epoch": 0.16644736842105262, + "grad_norm": 2.15625, + "grad_norm_var": 0.030272420247395834, + "learning_rate": 0.0001, + "loss": 3.0706, + "loss/crossentropy": 2.5636652946472167, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.26798896938562394, + "loss/reg": 0.0, + "step": 25300 + }, + { + "epoch": 0.16651315789473684, + "grad_norm": 2.46875, + "grad_norm_var": 0.03521728515625, + "learning_rate": 0.0001, + "loss": 3.0397, + "loss/crossentropy": 2.241175901889801, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.23810605853796005, + "loss/reg": 0.0, + "step": 25310 + }, + { + "epoch": 0.16657894736842105, + "grad_norm": 2.453125, + "grad_norm_var": 0.1312652587890625, + "learning_rate": 0.0001, + "loss": 3.0874, + "loss/crossentropy": 2.461783027648926, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.22298123091459274, + "loss/reg": 0.0, + "step": 25320 + }, + { + "epoch": 0.16664473684210526, + "grad_norm": 2.3125, + "grad_norm_var": 0.01871337890625, + "learning_rate": 0.0001, + "loss": 3.0339, + "loss/crossentropy": 2.6030556201934814, + "loss/hidden": 2.665625, + "loss/incoh": 0.0, + "loss/logits": 0.2566943824291229, + "loss/reg": 0.0, + "step": 25330 + }, + { + "epoch": 0.16671052631578948, + "grad_norm": 5.375, + "grad_norm_var": 0.6346506754557292, + "learning_rate": 0.0001, + "loss": 3.0403, + "loss/crossentropy": 2.2260714411735534, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.21501264423131944, + "loss/reg": 0.0, + "step": 25340 + }, + { + "epoch": 0.1667763157894737, + "grad_norm": 2.28125, + "grad_norm_var": 0.6130198160807292, + "learning_rate": 0.0001, + "loss": 3.1302, + "loss/crossentropy": 2.2205970883369446, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2506598949432373, + "loss/reg": 0.0, + "step": 25350 + }, + { + "epoch": 0.1668421052631579, + "grad_norm": 2.34375, + "grad_norm_var": 0.07870686848958333, + "learning_rate": 0.0001, + "loss": 3.0579, + "loss/crossentropy": 2.3078066170215608, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.2546592831611633, + "loss/reg": 0.0, + "step": 25360 + }, + { + "epoch": 0.16690789473684212, + "grad_norm": 2.484375, + "grad_norm_var": 0.7152821858723958, + "learning_rate": 0.0001, + "loss": 3.084, + "loss/crossentropy": 2.103282463550568, + "loss/hidden": 3.0796875, + "loss/incoh": 0.0, + "loss/logits": 0.2719234719872475, + "loss/reg": 0.0, + "step": 25370 + }, + { + "epoch": 0.1669736842105263, + "grad_norm": 1.8984375, + "grad_norm_var": 1.6605242411295573, + "learning_rate": 0.0001, + "loss": 3.0337, + "loss/crossentropy": 2.533907437324524, + "loss/hidden": 2.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.2172473669052124, + "loss/reg": 0.0, + "step": 25380 + }, + { + "epoch": 0.16703947368421052, + "grad_norm": 2.421875, + "grad_norm_var": 0.841961415608724, + "learning_rate": 0.0001, + "loss": 3.1811, + "loss/crossentropy": 2.349000704288483, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.2332908734679222, + "loss/reg": 0.0, + "step": 25390 + }, + { + "epoch": 0.16710526315789473, + "grad_norm": 2.265625, + "grad_norm_var": 0.75146484375, + "learning_rate": 0.0001, + "loss": 3.0519, + "loss/crossentropy": 2.3112148702144624, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.2460096523165703, + "loss/reg": 0.0, + "step": 25400 + }, + { + "epoch": 0.16717105263157894, + "grad_norm": 2.125, + "grad_norm_var": 0.0905426025390625, + "learning_rate": 0.0001, + "loss": 3.0931, + "loss/crossentropy": 2.208834648132324, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.21595955416560172, + "loss/reg": 0.0, + "step": 25410 + }, + { + "epoch": 0.16723684210526316, + "grad_norm": 2.21875, + "grad_norm_var": 0.07610575358072917, + "learning_rate": 0.0001, + "loss": 3.0541, + "loss/crossentropy": 2.2315654158592224, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.23381351083517074, + "loss/reg": 0.0, + "step": 25420 + }, + { + "epoch": 0.16730263157894737, + "grad_norm": 2.484375, + "grad_norm_var": 0.043302154541015624, + "learning_rate": 0.0001, + "loss": 3.0197, + "loss/crossentropy": 2.3990158438682556, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.24045146703720094, + "loss/reg": 0.0, + "step": 25430 + }, + { + "epoch": 0.16736842105263158, + "grad_norm": 2.265625, + "grad_norm_var": 0.0761138916015625, + "learning_rate": 0.0001, + "loss": 3.0792, + "loss/crossentropy": 2.1985233664512633, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.2625061020255089, + "loss/reg": 0.0, + "step": 25440 + }, + { + "epoch": 0.1674342105263158, + "grad_norm": 2.328125, + "grad_norm_var": 0.08713150024414062, + "learning_rate": 0.0001, + "loss": 3.0474, + "loss/crossentropy": 2.2765584170818327, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.2559843085706234, + "loss/reg": 0.0, + "step": 25450 + }, + { + "epoch": 0.1675, + "grad_norm": 2.25, + "grad_norm_var": 0.06063003540039062, + "learning_rate": 0.0001, + "loss": 3.0819, + "loss/crossentropy": 2.241065299510956, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.22642052918672562, + "loss/reg": 0.0, + "step": 25460 + }, + { + "epoch": 0.16756578947368422, + "grad_norm": 2.15625, + "grad_norm_var": 0.5740468343098958, + "learning_rate": 0.0001, + "loss": 3.1251, + "loss/crossentropy": 2.388518822193146, + "loss/hidden": 3.06875, + "loss/incoh": 0.0, + "loss/logits": 0.283678263425827, + "loss/reg": 0.0, + "step": 25470 + }, + { + "epoch": 0.1676315789473684, + "grad_norm": 2.390625, + "grad_norm_var": 0.13244400024414063, + "learning_rate": 0.0001, + "loss": 3.09, + "loss/crossentropy": 2.2575212955474853, + "loss/hidden": 2.790625, + "loss/incoh": 0.0, + "loss/logits": 0.21582936197519303, + "loss/reg": 0.0, + "step": 25480 + }, + { + "epoch": 0.16769736842105262, + "grad_norm": 2.46875, + "grad_norm_var": 0.14921773274739583, + "learning_rate": 0.0001, + "loss": 3.0472, + "loss/crossentropy": 2.378064513206482, + "loss/hidden": 2.70625, + "loss/incoh": 0.0, + "loss/logits": 0.22392996102571489, + "loss/reg": 0.0, + "step": 25490 + }, + { + "epoch": 0.16776315789473684, + "grad_norm": 2.109375, + "grad_norm_var": 0.046083323160807294, + "learning_rate": 0.0001, + "loss": 3.1247, + "loss/crossentropy": 2.354731285572052, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.24077038019895552, + "loss/reg": 0.0, + "step": 25500 + }, + { + "epoch": 0.16782894736842105, + "grad_norm": 2.0625, + "grad_norm_var": 0.024348958333333334, + "learning_rate": 0.0001, + "loss": 3.0441, + "loss/crossentropy": 2.645008158683777, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.2468577191233635, + "loss/reg": 0.0, + "step": 25510 + }, + { + "epoch": 0.16789473684210526, + "grad_norm": 2.1875, + "grad_norm_var": 0.10139567057291667, + "learning_rate": 0.0001, + "loss": 3.158, + "loss/crossentropy": 2.4286780834197996, + "loss/hidden": 3.015625, + "loss/incoh": 0.0, + "loss/logits": 0.33578050583601, + "loss/reg": 0.0, + "step": 25520 + }, + { + "epoch": 0.16796052631578948, + "grad_norm": 2.546875, + "grad_norm_var": 0.03992513020833333, + "learning_rate": 0.0001, + "loss": 3.0624, + "loss/crossentropy": 2.4203076124191285, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.27344991117715833, + "loss/reg": 0.0, + "step": 25530 + }, + { + "epoch": 0.1680263157894737, + "grad_norm": 2.546875, + "grad_norm_var": 0.03359273274739583, + "learning_rate": 0.0001, + "loss": 3.111, + "loss/crossentropy": 2.1932618856430053, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.27270109951496124, + "loss/reg": 0.0, + "step": 25540 + }, + { + "epoch": 0.1680921052631579, + "grad_norm": 2.359375, + "grad_norm_var": 0.03035252888997396, + "learning_rate": 0.0001, + "loss": 3.0572, + "loss/crossentropy": 2.1443075835704803, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.21711303144693375, + "loss/reg": 0.0, + "step": 25550 + }, + { + "epoch": 0.16815789473684212, + "grad_norm": 2.171875, + "grad_norm_var": 0.06535822550455729, + "learning_rate": 0.0001, + "loss": 3.0248, + "loss/crossentropy": 2.2873115301132203, + "loss/hidden": 2.940625, + "loss/incoh": 0.0, + "loss/logits": 0.26505507379770277, + "loss/reg": 0.0, + "step": 25560 + }, + { + "epoch": 0.1682236842105263, + "grad_norm": 2.171875, + "grad_norm_var": 0.21750895182291666, + "learning_rate": 0.0001, + "loss": 3.245, + "loss/crossentropy": 2.464116406440735, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.2560413718223572, + "loss/reg": 0.0, + "step": 25570 + }, + { + "epoch": 0.16828947368421052, + "grad_norm": 4.0625, + "grad_norm_var": 0.2865193684895833, + "learning_rate": 0.0001, + "loss": 3.1803, + "loss/crossentropy": 2.299769949913025, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.2611501321196556, + "loss/reg": 0.0, + "step": 25580 + }, + { + "epoch": 0.16835526315789473, + "grad_norm": 2.21875, + "grad_norm_var": 0.24952799479166668, + "learning_rate": 0.0001, + "loss": 3.1427, + "loss/crossentropy": 2.0992822468280794, + "loss/hidden": 3.109375, + "loss/incoh": 0.0, + "loss/logits": 0.2576856903731823, + "loss/reg": 0.0, + "step": 25590 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 2.328125, + "grad_norm_var": 0.4341705322265625, + "learning_rate": 0.0001, + "loss": 3.101, + "loss/crossentropy": 2.0800092458724975, + "loss/hidden": 2.85625, + "loss/incoh": 0.0, + "loss/logits": 0.20497069805860518, + "loss/reg": 0.0, + "step": 25600 + }, + { + "epoch": 0.16848684210526316, + "grad_norm": 2.625, + "grad_norm_var": 0.4129231770833333, + "learning_rate": 0.0001, + "loss": 3.0582, + "loss/crossentropy": 2.3167444467544556, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.2798536166548729, + "loss/reg": 0.0, + "step": 25610 + }, + { + "epoch": 0.16855263157894737, + "grad_norm": 3.59375, + "grad_norm_var": 0.13404541015625, + "learning_rate": 0.0001, + "loss": 3.0518, + "loss/crossentropy": 2.456227493286133, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.27998181581497195, + "loss/reg": 0.0, + "step": 25620 + }, + { + "epoch": 0.16861842105263158, + "grad_norm": 2.03125, + "grad_norm_var": 0.15030899047851562, + "learning_rate": 0.0001, + "loss": 3.043, + "loss/crossentropy": 2.208226388692856, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.24489136189222335, + "loss/reg": 0.0, + "step": 25630 + }, + { + "epoch": 0.1686842105263158, + "grad_norm": 2.171875, + "grad_norm_var": 1.020721181233724, + "learning_rate": 0.0001, + "loss": 3.1397, + "loss/crossentropy": 2.2634316802024843, + "loss/hidden": 3.0328125, + "loss/incoh": 0.0, + "loss/logits": 0.27392966747283937, + "loss/reg": 0.0, + "step": 25640 + }, + { + "epoch": 0.16875, + "grad_norm": 2.40625, + "grad_norm_var": 0.980279286702474, + "learning_rate": 0.0001, + "loss": 3.0574, + "loss/crossentropy": 2.2914888978004457, + "loss/hidden": 2.6984375, + "loss/incoh": 0.0, + "loss/logits": 0.23568858355283737, + "loss/reg": 0.0, + "step": 25650 + }, + { + "epoch": 0.1688157894736842, + "grad_norm": 2.65625, + "grad_norm_var": 0.07039769490559895, + "learning_rate": 0.0001, + "loss": 3.139, + "loss/crossentropy": 2.1703743815422056, + "loss/hidden": 3.01875, + "loss/incoh": 0.0, + "loss/logits": 0.25462436228990554, + "loss/reg": 0.0, + "step": 25660 + }, + { + "epoch": 0.1688815789473684, + "grad_norm": 2.546875, + "grad_norm_var": 0.22234598795572916, + "learning_rate": 0.0001, + "loss": 3.0749, + "loss/crossentropy": 2.115765154361725, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.2619611322879791, + "loss/reg": 0.0, + "step": 25670 + }, + { + "epoch": 0.16894736842105262, + "grad_norm": 2.3125, + "grad_norm_var": 0.21928609212239583, + "learning_rate": 0.0001, + "loss": 3.1309, + "loss/crossentropy": 2.355665123462677, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.25024047791957854, + "loss/reg": 0.0, + "step": 25680 + }, + { + "epoch": 0.16901315789473684, + "grad_norm": 2.40625, + "grad_norm_var": 0.6276204427083333, + "learning_rate": 0.0001, + "loss": 3.2325, + "loss/crossentropy": 2.3543809175491335, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.28042998909950256, + "loss/reg": 0.0, + "step": 25690 + }, + { + "epoch": 0.16907894736842105, + "grad_norm": 2.15625, + "grad_norm_var": 0.55308837890625, + "learning_rate": 0.0001, + "loss": 3.1367, + "loss/crossentropy": 2.0251551032066346, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.2671583190560341, + "loss/reg": 0.0, + "step": 25700 + }, + { + "epoch": 0.16914473684210526, + "grad_norm": 2.5625, + "grad_norm_var": 0.8514638264973958, + "learning_rate": 0.0001, + "loss": 3.0816, + "loss/crossentropy": 2.3947930097579957, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.23190638422966003, + "loss/reg": 0.0, + "step": 25710 + }, + { + "epoch": 0.16921052631578948, + "grad_norm": 2.21875, + "grad_norm_var": 0.8416341145833334, + "learning_rate": 0.0001, + "loss": 3.0317, + "loss/crossentropy": 2.3030895352363587, + "loss/hidden": 2.996875, + "loss/incoh": 0.0, + "loss/logits": 0.25654992610216143, + "loss/reg": 0.0, + "step": 25720 + }, + { + "epoch": 0.1692763157894737, + "grad_norm": 2.203125, + "grad_norm_var": 0.1338043212890625, + "learning_rate": 0.0001, + "loss": 3.0553, + "loss/crossentropy": 2.4580634593963624, + "loss/hidden": 2.996875, + "loss/incoh": 0.0, + "loss/logits": 0.29121011197566987, + "loss/reg": 0.0, + "step": 25730 + }, + { + "epoch": 0.1693421052631579, + "grad_norm": 2.578125, + "grad_norm_var": 0.12617162068684895, + "learning_rate": 0.0001, + "loss": 3.0517, + "loss/crossentropy": 2.2136544942855836, + "loss/hidden": 2.7203125, + "loss/incoh": 0.0, + "loss/logits": 0.2460548087954521, + "loss/reg": 0.0, + "step": 25740 + }, + { + "epoch": 0.16940789473684212, + "grad_norm": 2.34375, + "grad_norm_var": 0.018700154622395833, + "learning_rate": 0.0001, + "loss": 3.0731, + "loss/crossentropy": 2.2090337753295897, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.26342562288045884, + "loss/reg": 0.0, + "step": 25750 + }, + { + "epoch": 0.1694736842105263, + "grad_norm": 2.703125, + "grad_norm_var": 0.31311442057291666, + "learning_rate": 0.0001, + "loss": 3.1412, + "loss/crossentropy": 2.2381184220314028, + "loss/hidden": 3.1265625, + "loss/incoh": 0.0, + "loss/logits": 0.3542740896344185, + "loss/reg": 0.0, + "step": 25760 + }, + { + "epoch": 0.16953947368421052, + "grad_norm": 3.9375, + "grad_norm_var": 0.39954020182291666, + "learning_rate": 0.0001, + "loss": 3.1884, + "loss/crossentropy": 2.115560531616211, + "loss/hidden": 2.9421875, + "loss/incoh": 0.0, + "loss/logits": 0.2626333341002464, + "loss/reg": 0.0, + "step": 25770 + }, + { + "epoch": 0.16960526315789473, + "grad_norm": 2.171875, + "grad_norm_var": 0.21519775390625, + "learning_rate": 0.0001, + "loss": 3.0564, + "loss/crossentropy": 1.9519969999790192, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.22164231091737746, + "loss/reg": 0.0, + "step": 25780 + }, + { + "epoch": 0.16967105263157894, + "grad_norm": 2.5625, + "grad_norm_var": 0.08953450520833334, + "learning_rate": 0.0001, + "loss": 3.1503, + "loss/crossentropy": 2.2421632409095764, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.23813997954130173, + "loss/reg": 0.0, + "step": 25790 + }, + { + "epoch": 0.16973684210526316, + "grad_norm": 2.015625, + "grad_norm_var": 0.1667144775390625, + "learning_rate": 0.0001, + "loss": 3.0487, + "loss/crossentropy": 1.9510473489761353, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.1950627200305462, + "loss/reg": 0.0, + "step": 25800 + }, + { + "epoch": 0.16980263157894737, + "grad_norm": 2.0625, + "grad_norm_var": 0.15810139973958334, + "learning_rate": 0.0001, + "loss": 3.0855, + "loss/crossentropy": 2.2411985039710998, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.25535731613636015, + "loss/reg": 0.0, + "step": 25810 + }, + { + "epoch": 0.16986842105263159, + "grad_norm": 2.03125, + "grad_norm_var": 1.0436358133951822, + "learning_rate": 0.0001, + "loss": 3.115, + "loss/crossentropy": 2.045396554470062, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.22041565626859666, + "loss/reg": 0.0, + "step": 25820 + }, + { + "epoch": 0.1699342105263158, + "grad_norm": 2.3125, + "grad_norm_var": 0.18515599568684896, + "learning_rate": 0.0001, + "loss": 3.0967, + "loss/crossentropy": 2.4469074010849, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.2558215782046318, + "loss/reg": 0.0, + "step": 25830 + }, + { + "epoch": 0.17, + "grad_norm": 2.109375, + "grad_norm_var": 0.38408203125, + "learning_rate": 0.0001, + "loss": 3.1042, + "loss/crossentropy": 2.038024789094925, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.28011882603168486, + "loss/reg": 0.0, + "step": 25840 + }, + { + "epoch": 0.1700657894736842, + "grad_norm": 2.21875, + "grad_norm_var": 0.12269694010416667, + "learning_rate": 0.0001, + "loss": 3.0198, + "loss/crossentropy": 2.246475076675415, + "loss/hidden": 2.71875, + "loss/incoh": 0.0, + "loss/logits": 0.21217122972011565, + "loss/reg": 0.0, + "step": 25850 + }, + { + "epoch": 0.1701315789473684, + "grad_norm": 2.140625, + "grad_norm_var": 0.5065388997395833, + "learning_rate": 0.0001, + "loss": 3.0953, + "loss/crossentropy": 2.2885293424129487, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.23523461371660231, + "loss/reg": 0.0, + "step": 25860 + }, + { + "epoch": 0.17019736842105262, + "grad_norm": 4.71875, + "grad_norm_var": 0.8165323893229167, + "learning_rate": 0.0001, + "loss": 3.0392, + "loss/crossentropy": 2.1574928402900695, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.25582125633955, + "loss/reg": 0.0, + "step": 25870 + }, + { + "epoch": 0.17026315789473684, + "grad_norm": 3.046875, + "grad_norm_var": 0.57008056640625, + "learning_rate": 0.0001, + "loss": 3.1252, + "loss/crossentropy": 2.563046908378601, + "loss/hidden": 3.2078125, + "loss/incoh": 0.0, + "loss/logits": 0.2562621384859085, + "loss/reg": 0.0, + "step": 25880 + }, + { + "epoch": 0.17032894736842105, + "grad_norm": 2.578125, + "grad_norm_var": 0.2592437744140625, + "learning_rate": 0.0001, + "loss": 3.054, + "loss/crossentropy": 2.4634771227836607, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.25339619666337965, + "loss/reg": 0.0, + "step": 25890 + }, + { + "epoch": 0.17039473684210527, + "grad_norm": 2.234375, + "grad_norm_var": 0.04925130208333333, + "learning_rate": 0.0001, + "loss": 3.0529, + "loss/crossentropy": 2.5104726552963257, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.24876073151826858, + "loss/reg": 0.0, + "step": 25900 + }, + { + "epoch": 0.17046052631578948, + "grad_norm": 2.75, + "grad_norm_var": 0.07171122233072917, + "learning_rate": 0.0001, + "loss": 3.0629, + "loss/crossentropy": 2.2496933460235597, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.2183684565126896, + "loss/reg": 0.0, + "step": 25910 + }, + { + "epoch": 0.1705263157894737, + "grad_norm": 2.5, + "grad_norm_var": 0.05900777180989583, + "learning_rate": 0.0001, + "loss": 3.0543, + "loss/crossentropy": 2.3338397264480593, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.26187729388475417, + "loss/reg": 0.0, + "step": 25920 + }, + { + "epoch": 0.1705921052631579, + "grad_norm": 2.234375, + "grad_norm_var": 0.018359375, + "learning_rate": 0.0001, + "loss": 3.0801, + "loss/crossentropy": 2.499437928199768, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.25035695284605025, + "loss/reg": 0.0, + "step": 25930 + }, + { + "epoch": 0.1706578947368421, + "grad_norm": 2.234375, + "grad_norm_var": 0.014264933268229167, + "learning_rate": 0.0001, + "loss": 3.0337, + "loss/crossentropy": 2.217645859718323, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.21889251619577407, + "loss/reg": 0.0, + "step": 25940 + }, + { + "epoch": 0.1707236842105263, + "grad_norm": 2.140625, + "grad_norm_var": 0.04052734375, + "learning_rate": 0.0001, + "loss": 2.9938, + "loss/crossentropy": 2.371463644504547, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.2288275495171547, + "loss/reg": 0.0, + "step": 25950 + }, + { + "epoch": 0.17078947368421052, + "grad_norm": 2.609375, + "grad_norm_var": 0.1755859375, + "learning_rate": 0.0001, + "loss": 3.0916, + "loss/crossentropy": 2.0315194338560105, + "loss/hidden": 3.028125, + "loss/incoh": 0.0, + "loss/logits": 0.23134928867220877, + "loss/reg": 0.0, + "step": 25960 + }, + { + "epoch": 0.17085526315789473, + "grad_norm": 2.140625, + "grad_norm_var": 0.18121744791666666, + "learning_rate": 0.0001, + "loss": 3.0837, + "loss/crossentropy": 2.1368157267570496, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.2233546957373619, + "loss/reg": 0.0, + "step": 25970 + }, + { + "epoch": 0.17092105263157895, + "grad_norm": 2.640625, + "grad_norm_var": 0.027311197916666665, + "learning_rate": 0.0001, + "loss": 3.0984, + "loss/crossentropy": 2.3421939969062806, + "loss/hidden": 3.040625, + "loss/incoh": 0.0, + "loss/logits": 0.28762088268995284, + "loss/reg": 0.0, + "step": 25980 + }, + { + "epoch": 0.17098684210526316, + "grad_norm": 2.21875, + "grad_norm_var": 0.051656087239583336, + "learning_rate": 0.0001, + "loss": 3.0788, + "loss/crossentropy": 2.002690005302429, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.24228747338056564, + "loss/reg": 0.0, + "step": 25990 + }, + { + "epoch": 0.17105263157894737, + "grad_norm": 2.4375, + "grad_norm_var": 0.049397786458333336, + "learning_rate": 0.0001, + "loss": 3.0917, + "loss/crossentropy": 2.6486097812652587, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.24952505975961686, + "loss/reg": 0.0, + "step": 26000 + }, + { + "epoch": 0.1711184210526316, + "grad_norm": 2.0, + "grad_norm_var": 0.0459869384765625, + "learning_rate": 0.0001, + "loss": 3.0288, + "loss/crossentropy": 2.516502094268799, + "loss/hidden": 2.703125, + "loss/incoh": 0.0, + "loss/logits": 0.24382732957601547, + "loss/reg": 0.0, + "step": 26010 + }, + { + "epoch": 0.1711842105263158, + "grad_norm": 2.34375, + "grad_norm_var": 0.04586181640625, + "learning_rate": 0.0001, + "loss": 3.0415, + "loss/crossentropy": 2.171498954296112, + "loss/hidden": 2.65625, + "loss/incoh": 0.0, + "loss/logits": 0.20375996455550194, + "loss/reg": 0.0, + "step": 26020 + }, + { + "epoch": 0.17125, + "grad_norm": 2.296875, + "grad_norm_var": 0.0759600321451823, + "learning_rate": 0.0001, + "loss": 3.0905, + "loss/crossentropy": 2.2978348255157472, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.2875170633196831, + "loss/reg": 0.0, + "step": 26030 + }, + { + "epoch": 0.1713157894736842, + "grad_norm": 3.546875, + "grad_norm_var": 0.1603167215983073, + "learning_rate": 0.0001, + "loss": 3.0384, + "loss/crossentropy": 2.462969756126404, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.2628189787268639, + "loss/reg": 0.0, + "step": 26040 + }, + { + "epoch": 0.1713815789473684, + "grad_norm": 2.46875, + "grad_norm_var": 0.13243815104166667, + "learning_rate": 0.0001, + "loss": 3.0513, + "loss/crossentropy": 2.537035143375397, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.24593261033296585, + "loss/reg": 0.0, + "step": 26050 + }, + { + "epoch": 0.17144736842105263, + "grad_norm": 2.453125, + "grad_norm_var": 0.061102040608723956, + "learning_rate": 0.0001, + "loss": 3.0445, + "loss/crossentropy": 2.2031071186065674, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.24524077475070954, + "loss/reg": 0.0, + "step": 26060 + }, + { + "epoch": 0.17151315789473684, + "grad_norm": 2.296875, + "grad_norm_var": 0.009764607747395833, + "learning_rate": 0.0001, + "loss": 3.0725, + "loss/crossentropy": 2.4979568481445313, + "loss/hidden": 2.709375, + "loss/incoh": 0.0, + "loss/logits": 0.2647134616971016, + "loss/reg": 0.0, + "step": 26070 + }, + { + "epoch": 0.17157894736842105, + "grad_norm": 2.53125, + "grad_norm_var": 0.026024373372395833, + "learning_rate": 0.0001, + "loss": 2.99, + "loss/crossentropy": 2.350401425361633, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.24495915323495865, + "loss/reg": 0.0, + "step": 26080 + }, + { + "epoch": 0.17164473684210527, + "grad_norm": 2.3125, + "grad_norm_var": 0.028857421875, + "learning_rate": 0.0001, + "loss": 3.0236, + "loss/crossentropy": 2.275194299221039, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.22880630493164061, + "loss/reg": 0.0, + "step": 26090 + }, + { + "epoch": 0.17171052631578948, + "grad_norm": 2.25, + "grad_norm_var": 0.03967692057291667, + "learning_rate": 0.0001, + "loss": 3.0823, + "loss/crossentropy": 2.000411808490753, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.2721859060227871, + "loss/reg": 0.0, + "step": 26100 + }, + { + "epoch": 0.1717763157894737, + "grad_norm": 2.46875, + "grad_norm_var": 0.06353759765625, + "learning_rate": 0.0001, + "loss": 3.0778, + "loss/crossentropy": 2.305191624164581, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.2284097969532013, + "loss/reg": 0.0, + "step": 26110 + }, + { + "epoch": 0.1718421052631579, + "grad_norm": 2.28125, + "grad_norm_var": 0.34947509765625, + "learning_rate": 0.0001, + "loss": 3.0215, + "loss/crossentropy": 2.3149521112442017, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.27900824695825577, + "loss/reg": 0.0, + "step": 26120 + }, + { + "epoch": 0.1719078947368421, + "grad_norm": 2.65625, + "grad_norm_var": 0.39952977498372394, + "learning_rate": 0.0001, + "loss": 3.0197, + "loss/crossentropy": 2.421990168094635, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.2581001713871956, + "loss/reg": 0.0, + "step": 26130 + }, + { + "epoch": 0.1719736842105263, + "grad_norm": 2.25, + "grad_norm_var": 0.19842529296875, + "learning_rate": 0.0001, + "loss": 3.115, + "loss/crossentropy": 2.473453497886658, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.25612604022026064, + "loss/reg": 0.0, + "step": 26140 + }, + { + "epoch": 0.17203947368421052, + "grad_norm": 2.015625, + "grad_norm_var": 0.06412328084309896, + "learning_rate": 0.0001, + "loss": 3.1205, + "loss/crossentropy": 2.395763027667999, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.25399915128946304, + "loss/reg": 0.0, + "step": 26150 + }, + { + "epoch": 0.17210526315789473, + "grad_norm": 3.21875, + "grad_norm_var": 3.7263832092285156, + "learning_rate": 0.0001, + "loss": 3.1443, + "loss/crossentropy": 2.3229804635047913, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.25968860387802123, + "loss/reg": 0.0, + "step": 26160 + }, + { + "epoch": 0.17217105263157895, + "grad_norm": 2.3125, + "grad_norm_var": 3.5683990478515626, + "learning_rate": 0.0001, + "loss": 3.0833, + "loss/crossentropy": 2.27206437587738, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.26493641585111616, + "loss/reg": 0.0, + "step": 26170 + }, + { + "epoch": 0.17223684210526316, + "grad_norm": 2.78125, + "grad_norm_var": 0.140771484375, + "learning_rate": 0.0001, + "loss": 3.1236, + "loss/crossentropy": 2.1949939370155334, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.24421997666358947, + "loss/reg": 0.0, + "step": 26180 + }, + { + "epoch": 0.17230263157894737, + "grad_norm": 2.359375, + "grad_norm_var": 0.08112691243489584, + "learning_rate": 0.0001, + "loss": 3.0485, + "loss/crossentropy": 2.292436492443085, + "loss/hidden": 2.66875, + "loss/incoh": 0.0, + "loss/logits": 0.2527725785970688, + "loss/reg": 0.0, + "step": 26190 + }, + { + "epoch": 0.1723684210526316, + "grad_norm": 2.40625, + "grad_norm_var": 0.08532613118489583, + "learning_rate": 0.0001, + "loss": 3.1316, + "loss/crossentropy": 2.497014009952545, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.2447853922843933, + "loss/reg": 0.0, + "step": 26200 + }, + { + "epoch": 0.1724342105263158, + "grad_norm": 2.421875, + "grad_norm_var": 0.142578125, + "learning_rate": 0.0001, + "loss": 3.1014, + "loss/crossentropy": 2.22248170375824, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.2591711387038231, + "loss/reg": 0.0, + "step": 26210 + }, + { + "epoch": 0.1725, + "grad_norm": 2.109375, + "grad_norm_var": 0.12844009399414064, + "learning_rate": 0.0001, + "loss": 3.0061, + "loss/crossentropy": 2.3277372121810913, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.2391911566257477, + "loss/reg": 0.0, + "step": 26220 + }, + { + "epoch": 0.1725657894736842, + "grad_norm": 2.46875, + "grad_norm_var": 0.03429361979166667, + "learning_rate": 0.0001, + "loss": 3.0468, + "loss/crossentropy": 2.593550610542297, + "loss/hidden": 2.69375, + "loss/incoh": 0.0, + "loss/logits": 0.26268754005432127, + "loss/reg": 0.0, + "step": 26230 + }, + { + "epoch": 0.1726315789473684, + "grad_norm": 3.59375, + "grad_norm_var": 0.13815689086914062, + "learning_rate": 0.0001, + "loss": 3.0785, + "loss/crossentropy": 2.037731957435608, + "loss/hidden": 2.7046875, + "loss/incoh": 0.0, + "loss/logits": 0.19823871441185476, + "loss/reg": 0.0, + "step": 26240 + }, + { + "epoch": 0.17269736842105263, + "grad_norm": 2.203125, + "grad_norm_var": 0.1658404032389323, + "learning_rate": 0.0001, + "loss": 3.0753, + "loss/crossentropy": 2.365402173995972, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.22439467608928682, + "loss/reg": 0.0, + "step": 26250 + }, + { + "epoch": 0.17276315789473684, + "grad_norm": 2.296875, + "grad_norm_var": 0.42582906087239586, + "learning_rate": 0.0001, + "loss": 3.0247, + "loss/crossentropy": 2.4660609483718874, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.2206026643514633, + "loss/reg": 0.0, + "step": 26260 + }, + { + "epoch": 0.17282894736842105, + "grad_norm": 2.265625, + "grad_norm_var": 0.4584307352701823, + "learning_rate": 0.0001, + "loss": 2.9979, + "loss/crossentropy": 2.3537610173225403, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.24690964370965957, + "loss/reg": 0.0, + "step": 26270 + }, + { + "epoch": 0.17289473684210527, + "grad_norm": 2.34375, + "grad_norm_var": 0.1235015869140625, + "learning_rate": 0.0001, + "loss": 3.0554, + "loss/crossentropy": 2.4086636185646055, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.22505185902118682, + "loss/reg": 0.0, + "step": 26280 + }, + { + "epoch": 0.17296052631578948, + "grad_norm": 2.296875, + "grad_norm_var": 0.26606343587239584, + "learning_rate": 0.0001, + "loss": 3.0265, + "loss/crossentropy": 2.120291304588318, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.20824237614870073, + "loss/reg": 0.0, + "step": 26290 + }, + { + "epoch": 0.1730263157894737, + "grad_norm": 2.296875, + "grad_norm_var": 0.18574117024739584, + "learning_rate": 0.0001, + "loss": 2.9919, + "loss/crossentropy": 2.1023782581090926, + "loss/hidden": 2.6734375, + "loss/incoh": 0.0, + "loss/logits": 0.19826814979314805, + "loss/reg": 0.0, + "step": 26300 + }, + { + "epoch": 0.1730921052631579, + "grad_norm": 2.8125, + "grad_norm_var": 0.24006754557291668, + "learning_rate": 0.0001, + "loss": 3.1028, + "loss/crossentropy": 2.226356017589569, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.2514319851994514, + "loss/reg": 0.0, + "step": 26310 + }, + { + "epoch": 0.1731578947368421, + "grad_norm": 2.328125, + "grad_norm_var": 0.234326171875, + "learning_rate": 0.0001, + "loss": 3.0524, + "loss/crossentropy": 2.2389047503471375, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.23142587393522263, + "loss/reg": 0.0, + "step": 26320 + }, + { + "epoch": 0.1732236842105263, + "grad_norm": 2.203125, + "grad_norm_var": 0.07327041625976563, + "learning_rate": 0.0001, + "loss": 3.054, + "loss/crossentropy": 2.3862741947174073, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.2310537427663803, + "loss/reg": 0.0, + "step": 26330 + }, + { + "epoch": 0.17328947368421052, + "grad_norm": 2.171875, + "grad_norm_var": 0.07641499837239583, + "learning_rate": 0.0001, + "loss": 3.0653, + "loss/crossentropy": 2.348588991165161, + "loss/hidden": 2.678125, + "loss/incoh": 0.0, + "loss/logits": 0.22305506616830825, + "loss/reg": 0.0, + "step": 26340 + }, + { + "epoch": 0.17335526315789473, + "grad_norm": 2.390625, + "grad_norm_var": 0.05279541015625, + "learning_rate": 0.0001, + "loss": 3.0529, + "loss/crossentropy": 2.036811423301697, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.23410281240940095, + "loss/reg": 0.0, + "step": 26350 + }, + { + "epoch": 0.17342105263157895, + "grad_norm": 2.359375, + "grad_norm_var": 0.2839101155598958, + "learning_rate": 0.0001, + "loss": 3.1092, + "loss/crossentropy": 2.1091545104980467, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.23594892621040345, + "loss/reg": 0.0, + "step": 26360 + }, + { + "epoch": 0.17348684210526316, + "grad_norm": 2.59375, + "grad_norm_var": 0.061848958333333336, + "learning_rate": 0.0001, + "loss": 3.091, + "loss/crossentropy": 2.290253794193268, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.29689010232686996, + "loss/reg": 0.0, + "step": 26370 + }, + { + "epoch": 0.17355263157894738, + "grad_norm": 2.890625, + "grad_norm_var": 0.04799702962239583, + "learning_rate": 0.0001, + "loss": 3.0457, + "loss/crossentropy": 2.193889284133911, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.22776648998260499, + "loss/reg": 0.0, + "step": 26380 + }, + { + "epoch": 0.1736184210526316, + "grad_norm": 2.125, + "grad_norm_var": 0.08166910807291666, + "learning_rate": 0.0001, + "loss": 3.0939, + "loss/crossentropy": 2.2826230049133303, + "loss/hidden": 2.9390625, + "loss/incoh": 0.0, + "loss/logits": 0.2337636888027191, + "loss/reg": 0.0, + "step": 26390 + }, + { + "epoch": 0.1736842105263158, + "grad_norm": 2.390625, + "grad_norm_var": 0.16920572916666668, + "learning_rate": 0.0001, + "loss": 3.0459, + "loss/crossentropy": 2.233789896965027, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.20103665292263032, + "loss/reg": 0.0, + "step": 26400 + }, + { + "epoch": 0.17375, + "grad_norm": 2.125, + "grad_norm_var": 0.17185440063476562, + "learning_rate": 0.0001, + "loss": 3.0538, + "loss/crossentropy": 2.295468533039093, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.2207227498292923, + "loss/reg": 0.0, + "step": 26410 + }, + { + "epoch": 0.1738157894736842, + "grad_norm": 2.640625, + "grad_norm_var": 0.048618316650390625, + "learning_rate": 0.0001, + "loss": 3.0468, + "loss/crossentropy": 2.405815064907074, + "loss/hidden": 2.7046875, + "loss/incoh": 0.0, + "loss/logits": 0.22815033197402954, + "loss/reg": 0.0, + "step": 26420 + }, + { + "epoch": 0.17388157894736841, + "grad_norm": 2.53125, + "grad_norm_var": 0.05583394368489583, + "learning_rate": 0.0001, + "loss": 3.1146, + "loss/crossentropy": 2.2692611694335936, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.24037961512804032, + "loss/reg": 0.0, + "step": 26430 + }, + { + "epoch": 0.17394736842105263, + "grad_norm": 2.6875, + "grad_norm_var": 0.1251617431640625, + "learning_rate": 0.0001, + "loss": 3.1101, + "loss/crossentropy": 2.2592686772346497, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.24446354508399964, + "loss/reg": 0.0, + "step": 26440 + }, + { + "epoch": 0.17401315789473684, + "grad_norm": 2.484375, + "grad_norm_var": 0.10937093098958334, + "learning_rate": 0.0001, + "loss": 3.0527, + "loss/crossentropy": 2.291548252105713, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.25084199756383896, + "loss/reg": 0.0, + "step": 26450 + }, + { + "epoch": 0.17407894736842106, + "grad_norm": 2.984375, + "grad_norm_var": 0.06529947916666666, + "learning_rate": 0.0001, + "loss": 3.1052, + "loss/crossentropy": 2.37350834608078, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.2556142807006836, + "loss/reg": 0.0, + "step": 26460 + }, + { + "epoch": 0.17414473684210527, + "grad_norm": 2.375, + "grad_norm_var": 0.07598037719726562, + "learning_rate": 0.0001, + "loss": 3.0113, + "loss/crossentropy": 2.250531017780304, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.2357124164700508, + "loss/reg": 0.0, + "step": 26470 + }, + { + "epoch": 0.17421052631578948, + "grad_norm": 2.625, + "grad_norm_var": 0.17610677083333334, + "learning_rate": 0.0001, + "loss": 3.1412, + "loss/crossentropy": 2.439283573627472, + "loss/hidden": 2.6515625, + "loss/incoh": 0.0, + "loss/logits": 0.2121993750333786, + "loss/reg": 0.0, + "step": 26480 + }, + { + "epoch": 0.1742763157894737, + "grad_norm": 2.71875, + "grad_norm_var": 0.16172587076822917, + "learning_rate": 0.0001, + "loss": 3.0562, + "loss/crossentropy": 2.3756911277771, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.2490501657128334, + "loss/reg": 0.0, + "step": 26490 + }, + { + "epoch": 0.17434210526315788, + "grad_norm": 2.140625, + "grad_norm_var": 0.050211588541666664, + "learning_rate": 0.0001, + "loss": 3.0538, + "loss/crossentropy": 2.5375622749328612, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.24366173297166824, + "loss/reg": 0.0, + "step": 26500 + }, + { + "epoch": 0.1744078947368421, + "grad_norm": 2.328125, + "grad_norm_var": 0.5032185872395833, + "learning_rate": 0.0001, + "loss": 3.0919, + "loss/crossentropy": 2.3419446110725404, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.2749262437224388, + "loss/reg": 0.0, + "step": 26510 + }, + { + "epoch": 0.1744736842105263, + "grad_norm": 2.5625, + "grad_norm_var": 0.03943583170572917, + "learning_rate": 0.0001, + "loss": 3.0832, + "loss/crossentropy": 2.070459759235382, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.2193553477525711, + "loss/reg": 0.0, + "step": 26520 + }, + { + "epoch": 0.17453947368421052, + "grad_norm": 2.421875, + "grad_norm_var": 0.11748758951822917, + "learning_rate": 0.0001, + "loss": 3.0301, + "loss/crossentropy": 2.2101051688194273, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.2358426868915558, + "loss/reg": 0.0, + "step": 26530 + }, + { + "epoch": 0.17460526315789474, + "grad_norm": 2.5, + "grad_norm_var": 0.11904195149739584, + "learning_rate": 0.0001, + "loss": 2.9898, + "loss/crossentropy": 2.301384377479553, + "loss/hidden": 2.9671875, + "loss/incoh": 0.0, + "loss/logits": 0.270483261346817, + "loss/reg": 0.0, + "step": 26540 + }, + { + "epoch": 0.17467105263157895, + "grad_norm": 2.296875, + "grad_norm_var": 0.03364969889322917, + "learning_rate": 0.0001, + "loss": 2.9894, + "loss/crossentropy": 2.233541655540466, + "loss/hidden": 2.7265625, + "loss/incoh": 0.0, + "loss/logits": 0.23210279494524003, + "loss/reg": 0.0, + "step": 26550 + }, + { + "epoch": 0.17473684210526316, + "grad_norm": 4.3125, + "grad_norm_var": 0.30920817057291666, + "learning_rate": 0.0001, + "loss": 3.0375, + "loss/crossentropy": 2.4949169993400573, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.27407604902982713, + "loss/reg": 0.0, + "step": 26560 + }, + { + "epoch": 0.17480263157894738, + "grad_norm": 2.4375, + "grad_norm_var": 0.4443511962890625, + "learning_rate": 0.0001, + "loss": 3.1064, + "loss/crossentropy": 2.180380403995514, + "loss/hidden": 3.06875, + "loss/incoh": 0.0, + "loss/logits": 0.2965380325913429, + "loss/reg": 0.0, + "step": 26570 + }, + { + "epoch": 0.1748684210526316, + "grad_norm": 2.421875, + "grad_norm_var": 0.12886962890625, + "learning_rate": 0.0001, + "loss": 3.0636, + "loss/crossentropy": 2.2048864006996154, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.24606209397315978, + "loss/reg": 0.0, + "step": 26580 + }, + { + "epoch": 0.1749342105263158, + "grad_norm": 2.3125, + "grad_norm_var": 0.090576171875, + "learning_rate": 0.0001, + "loss": 3.0745, + "loss/crossentropy": 2.0117301523685454, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.211642824113369, + "loss/reg": 0.0, + "step": 26590 + }, + { + "epoch": 0.175, + "grad_norm": 2.15625, + "grad_norm_var": 0.9207834879557292, + "learning_rate": 0.0001, + "loss": 2.9913, + "loss/crossentropy": 2.2407095432281494, + "loss/hidden": 2.8203125, + "loss/incoh": 0.0, + "loss/logits": 0.25644198805093765, + "loss/reg": 0.0, + "step": 26600 + }, + { + "epoch": 0.1750657894736842, + "grad_norm": 2.03125, + "grad_norm_var": 0.9292063395182292, + "learning_rate": 0.0001, + "loss": 3.1122, + "loss/crossentropy": 2.510072946548462, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.24346872568130493, + "loss/reg": 0.0, + "step": 26610 + }, + { + "epoch": 0.17513157894736842, + "grad_norm": 2.140625, + "grad_norm_var": 1.2361806233723958, + "learning_rate": 0.0001, + "loss": 3.1052, + "loss/crossentropy": 2.0434396266937256, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.19786725342273712, + "loss/reg": 0.0, + "step": 26620 + }, + { + "epoch": 0.17519736842105263, + "grad_norm": 2.25, + "grad_norm_var": 1.2163075764973958, + "learning_rate": 0.0001, + "loss": 3.1224, + "loss/crossentropy": 2.2104805946350097, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.21630636900663375, + "loss/reg": 0.0, + "step": 26630 + }, + { + "epoch": 0.17526315789473684, + "grad_norm": 2.421875, + "grad_norm_var": 0.03358739217122396, + "learning_rate": 0.0001, + "loss": 3.028, + "loss/crossentropy": 2.178933525085449, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.20190188586711882, + "loss/reg": 0.0, + "step": 26640 + }, + { + "epoch": 0.17532894736842106, + "grad_norm": 2.265625, + "grad_norm_var": 0.09688212076822916, + "learning_rate": 0.0001, + "loss": 3.0897, + "loss/crossentropy": 2.2301127552986144, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.23655431792140008, + "loss/reg": 0.0, + "step": 26650 + }, + { + "epoch": 0.17539473684210527, + "grad_norm": 2.71875, + "grad_norm_var": 0.10475972493489584, + "learning_rate": 0.0001, + "loss": 3.0935, + "loss/crossentropy": 2.2094313383102415, + "loss/hidden": 2.9734375, + "loss/incoh": 0.0, + "loss/logits": 0.27910117208957674, + "loss/reg": 0.0, + "step": 26660 + }, + { + "epoch": 0.17546052631578948, + "grad_norm": 2.28125, + "grad_norm_var": 0.050446573893229166, + "learning_rate": 0.0001, + "loss": 3.0452, + "loss/crossentropy": 2.491715204715729, + "loss/hidden": 2.7171875, + "loss/incoh": 0.0, + "loss/logits": 0.24501692056655883, + "loss/reg": 0.0, + "step": 26670 + }, + { + "epoch": 0.1755263157894737, + "grad_norm": 2.296875, + "grad_norm_var": 0.42122294108072916, + "learning_rate": 0.0001, + "loss": 3.1247, + "loss/crossentropy": 2.431166636943817, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.24856783598661422, + "loss/reg": 0.0, + "step": 26680 + }, + { + "epoch": 0.17559210526315788, + "grad_norm": 2.28125, + "grad_norm_var": 0.0764801025390625, + "learning_rate": 0.0001, + "loss": 3.1265, + "loss/crossentropy": 2.2466578841209413, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.22143050357699395, + "loss/reg": 0.0, + "step": 26690 + }, + { + "epoch": 0.1756578947368421, + "grad_norm": 2.421875, + "grad_norm_var": 0.06314697265625, + "learning_rate": 0.0001, + "loss": 3.0522, + "loss/crossentropy": 1.8643316149711608, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.22551749348640443, + "loss/reg": 0.0, + "step": 26700 + }, + { + "epoch": 0.1757236842105263, + "grad_norm": 2.421875, + "grad_norm_var": 0.12968648274739583, + "learning_rate": 0.0001, + "loss": 3.0386, + "loss/crossentropy": 1.784457266330719, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.17761861719191074, + "loss/reg": 0.0, + "step": 26710 + }, + { + "epoch": 0.17578947368421052, + "grad_norm": 2.484375, + "grad_norm_var": 0.13782450358072917, + "learning_rate": 0.0001, + "loss": 3.0727, + "loss/crossentropy": 2.388060462474823, + "loss/hidden": 2.9546875, + "loss/incoh": 0.0, + "loss/logits": 0.25690646171569825, + "loss/reg": 0.0, + "step": 26720 + }, + { + "epoch": 0.17585526315789474, + "grad_norm": 2.3125, + "grad_norm_var": 0.5772450764973959, + "learning_rate": 0.0001, + "loss": 3.017, + "loss/crossentropy": 2.293041491508484, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.2976214215159416, + "loss/reg": 0.0, + "step": 26730 + }, + { + "epoch": 0.17592105263157895, + "grad_norm": 2.203125, + "grad_norm_var": 0.7439737955729167, + "learning_rate": 0.0001, + "loss": 3.1688, + "loss/crossentropy": 2.346044087409973, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.24346352070569993, + "loss/reg": 0.0, + "step": 26740 + }, + { + "epoch": 0.17598684210526316, + "grad_norm": 2.171875, + "grad_norm_var": 0.19332275390625, + "learning_rate": 0.0001, + "loss": 3.1074, + "loss/crossentropy": 2.3460346817970277, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.22802013754844666, + "loss/reg": 0.0, + "step": 26750 + }, + { + "epoch": 0.17605263157894738, + "grad_norm": 2.28125, + "grad_norm_var": 0.09132486979166667, + "learning_rate": 0.0001, + "loss": 3.0194, + "loss/crossentropy": 2.361405539512634, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.2546524882316589, + "loss/reg": 0.0, + "step": 26760 + }, + { + "epoch": 0.1761184210526316, + "grad_norm": 2.21875, + "grad_norm_var": 0.06636962890625, + "learning_rate": 0.0001, + "loss": 3.1365, + "loss/crossentropy": 2.184977853298187, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.24620451256632805, + "loss/reg": 0.0, + "step": 26770 + }, + { + "epoch": 0.17618421052631578, + "grad_norm": 2.453125, + "grad_norm_var": 0.04890034993489583, + "learning_rate": 0.0001, + "loss": 3.0491, + "loss/crossentropy": 2.3393765091896057, + "loss/hidden": 2.7234375, + "loss/incoh": 0.0, + "loss/logits": 0.21822300404310227, + "loss/reg": 0.0, + "step": 26780 + }, + { + "epoch": 0.17625, + "grad_norm": 2.234375, + "grad_norm_var": 0.08136571248372396, + "learning_rate": 0.0001, + "loss": 3.0288, + "loss/crossentropy": 2.1801982522010803, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.2308456853032112, + "loss/reg": 0.0, + "step": 26790 + }, + { + "epoch": 0.1763157894736842, + "grad_norm": 2.046875, + "grad_norm_var": 0.08675918579101563, + "learning_rate": 0.0001, + "loss": 3.0063, + "loss/crossentropy": 2.3970799446105957, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.23122948557138442, + "loss/reg": 0.0, + "step": 26800 + }, + { + "epoch": 0.17638157894736842, + "grad_norm": 2.453125, + "grad_norm_var": 0.04929911295572917, + "learning_rate": 0.0001, + "loss": 3.0628, + "loss/crossentropy": 2.3741040945053102, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.26243693232536314, + "loss/reg": 0.0, + "step": 26810 + }, + { + "epoch": 0.17644736842105263, + "grad_norm": 2.265625, + "grad_norm_var": 0.07210184733072916, + "learning_rate": 0.0001, + "loss": 3.0187, + "loss/crossentropy": 2.2319773197174073, + "loss/hidden": 2.66875, + "loss/incoh": 0.0, + "loss/logits": 0.201442664116621, + "loss/reg": 0.0, + "step": 26820 + }, + { + "epoch": 0.17651315789473684, + "grad_norm": 2.171875, + "grad_norm_var": 0.20078023274739584, + "learning_rate": 0.0001, + "loss": 3.0845, + "loss/crossentropy": 2.0902840554714204, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.23434632122516633, + "loss/reg": 0.0, + "step": 26830 + }, + { + "epoch": 0.17657894736842106, + "grad_norm": 2.078125, + "grad_norm_var": 0.48001200358072915, + "learning_rate": 0.0001, + "loss": 3.0548, + "loss/crossentropy": 1.8517399907112122, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.1863703802227974, + "loss/reg": 0.0, + "step": 26840 + }, + { + "epoch": 0.17664473684210527, + "grad_norm": 2.21875, + "grad_norm_var": 0.4188313802083333, + "learning_rate": 0.0001, + "loss": 3.0383, + "loss/crossentropy": 2.398709797859192, + "loss/hidden": 2.65625, + "loss/incoh": 0.0, + "loss/logits": 0.2097424551844597, + "loss/reg": 0.0, + "step": 26850 + }, + { + "epoch": 0.17671052631578948, + "grad_norm": 2.265625, + "grad_norm_var": 0.14855855305989582, + "learning_rate": 0.0001, + "loss": 3.0439, + "loss/crossentropy": 2.2976414561271667, + "loss/hidden": 2.659375, + "loss/incoh": 0.0, + "loss/logits": 0.2062876433134079, + "loss/reg": 0.0, + "step": 26860 + }, + { + "epoch": 0.1767763157894737, + "grad_norm": 2.828125, + "grad_norm_var": 0.3957801818847656, + "learning_rate": 0.0001, + "loss": 3.1221, + "loss/crossentropy": 2.2302059173583983, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.2583988204598427, + "loss/reg": 0.0, + "step": 26870 + }, + { + "epoch": 0.17684210526315788, + "grad_norm": 2.09375, + "grad_norm_var": 0.34606831868489585, + "learning_rate": 0.0001, + "loss": 3.0334, + "loss/crossentropy": 2.2456519961357118, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.26051086038351057, + "loss/reg": 0.0, + "step": 26880 + }, + { + "epoch": 0.1769078947368421, + "grad_norm": 2.765625, + "grad_norm_var": 0.09192301432291666, + "learning_rate": 0.0001, + "loss": 3.0806, + "loss/crossentropy": 2.25178804397583, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.2424270063638687, + "loss/reg": 0.0, + "step": 26890 + }, + { + "epoch": 0.1769736842105263, + "grad_norm": 2.1875, + "grad_norm_var": 0.179736328125, + "learning_rate": 0.0001, + "loss": 3.0299, + "loss/crossentropy": 2.2696429252624513, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.25682739466428756, + "loss/reg": 0.0, + "step": 26900 + }, + { + "epoch": 0.17703947368421052, + "grad_norm": 2.0625, + "grad_norm_var": 0.44387613932291664, + "learning_rate": 0.0001, + "loss": 3.0266, + "loss/crossentropy": 2.349669575691223, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.24149384647607802, + "loss/reg": 0.0, + "step": 26910 + }, + { + "epoch": 0.17710526315789474, + "grad_norm": 2.21875, + "grad_norm_var": 0.34541727701822916, + "learning_rate": 0.0001, + "loss": 3.0779, + "loss/crossentropy": 2.20119423866272, + "loss/hidden": 2.6890625, + "loss/incoh": 0.0, + "loss/logits": 0.21436720862984657, + "loss/reg": 0.0, + "step": 26920 + }, + { + "epoch": 0.17717105263157895, + "grad_norm": 2.359375, + "grad_norm_var": 0.05929361979166667, + "learning_rate": 0.0001, + "loss": 3.0616, + "loss/crossentropy": 2.2472872853279116, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.22420328930020333, + "loss/reg": 0.0, + "step": 26930 + }, + { + "epoch": 0.17723684210526316, + "grad_norm": 2.734375, + "grad_norm_var": 0.08504231770833333, + "learning_rate": 0.0001, + "loss": 3.1167, + "loss/crossentropy": 2.308558535575867, + "loss/hidden": 3.084375, + "loss/incoh": 0.0, + "loss/logits": 0.2697313494980335, + "loss/reg": 0.0, + "step": 26940 + }, + { + "epoch": 0.17730263157894738, + "grad_norm": 2.375, + "grad_norm_var": 0.0501617431640625, + "learning_rate": 0.0001, + "loss": 3.0375, + "loss/crossentropy": 2.4767362475395203, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.22490220367908478, + "loss/reg": 0.0, + "step": 26950 + }, + { + "epoch": 0.1773684210526316, + "grad_norm": 4.25, + "grad_norm_var": 0.29531962076822915, + "learning_rate": 0.0001, + "loss": 3.0694, + "loss/crossentropy": 2.041945827007294, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.21851452216506004, + "loss/reg": 0.0, + "step": 26960 + }, + { + "epoch": 0.17743421052631578, + "grad_norm": 2.234375, + "grad_norm_var": 0.2590728759765625, + "learning_rate": 0.0001, + "loss": 3.0748, + "loss/crossentropy": 2.3148224115371705, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.24593613892793656, + "loss/reg": 0.0, + "step": 26970 + }, + { + "epoch": 0.1775, + "grad_norm": 2.375, + "grad_norm_var": 0.09059956868489584, + "learning_rate": 0.0001, + "loss": 3.0916, + "loss/crossentropy": 2.3537511348724367, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.26722831577062606, + "loss/reg": 0.0, + "step": 26980 + }, + { + "epoch": 0.1775657894736842, + "grad_norm": 2.640625, + "grad_norm_var": 0.09703776041666666, + "learning_rate": 0.0001, + "loss": 3.0801, + "loss/crossentropy": 2.3036428809165956, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.28045037388801575, + "loss/reg": 0.0, + "step": 26990 + }, + { + "epoch": 0.17763157894736842, + "grad_norm": 2.5, + "grad_norm_var": 0.0448883056640625, + "learning_rate": 0.0001, + "loss": 3.1064, + "loss/crossentropy": 2.210177004337311, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.20902514159679414, + "loss/reg": 0.0, + "step": 27000 + }, + { + "epoch": 0.17769736842105263, + "grad_norm": 2.640625, + "grad_norm_var": 0.408251953125, + "learning_rate": 0.0001, + "loss": 3.1257, + "loss/crossentropy": 2.354000687599182, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.2463086098432541, + "loss/reg": 0.0, + "step": 27010 + }, + { + "epoch": 0.17776315789473685, + "grad_norm": 2.125, + "grad_norm_var": 0.07931086222330729, + "learning_rate": 0.0001, + "loss": 2.9938, + "loss/crossentropy": 2.2460681438446044, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.2801851168274879, + "loss/reg": 0.0, + "step": 27020 + }, + { + "epoch": 0.17782894736842106, + "grad_norm": 1.984375, + "grad_norm_var": 0.07854715983072917, + "learning_rate": 0.0001, + "loss": 3.0282, + "loss/crossentropy": 2.1731944918632506, + "loss/hidden": 2.940625, + "loss/incoh": 0.0, + "loss/logits": 0.22212435267865657, + "loss/reg": 0.0, + "step": 27030 + }, + { + "epoch": 0.17789473684210527, + "grad_norm": 2.25, + "grad_norm_var": 0.116162109375, + "learning_rate": 0.0001, + "loss": 3.0344, + "loss/crossentropy": 2.441677284240723, + "loss/hidden": 2.85625, + "loss/incoh": 0.0, + "loss/logits": 0.25195191353559493, + "loss/reg": 0.0, + "step": 27040 + }, + { + "epoch": 0.17796052631578949, + "grad_norm": 2.375, + "grad_norm_var": 0.15789388020833334, + "learning_rate": 0.0001, + "loss": 3.0824, + "loss/crossentropy": 2.30471476316452, + "loss/hidden": 3.0046875, + "loss/incoh": 0.0, + "loss/logits": 0.2780482307076454, + "loss/reg": 0.0, + "step": 27050 + }, + { + "epoch": 0.17802631578947367, + "grad_norm": 2.15625, + "grad_norm_var": 0.13613179524739583, + "learning_rate": 0.0001, + "loss": 3.0864, + "loss/crossentropy": 2.1705368638038633, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.2402413785457611, + "loss/reg": 0.0, + "step": 27060 + }, + { + "epoch": 0.17809210526315788, + "grad_norm": 2.609375, + "grad_norm_var": 0.09636128743489583, + "learning_rate": 0.0001, + "loss": 3.1186, + "loss/crossentropy": 2.1933916807174683, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.22591635286808015, + "loss/reg": 0.0, + "step": 27070 + }, + { + "epoch": 0.1781578947368421, + "grad_norm": 2.296875, + "grad_norm_var": 0.08137613932291667, + "learning_rate": 0.0001, + "loss": 3.0876, + "loss/crossentropy": 2.2060605615377424, + "loss/hidden": 2.9953125, + "loss/incoh": 0.0, + "loss/logits": 0.2600886657834053, + "loss/reg": 0.0, + "step": 27080 + }, + { + "epoch": 0.1782236842105263, + "grad_norm": 3.0, + "grad_norm_var": 0.12360738118489584, + "learning_rate": 0.0001, + "loss": 3.0018, + "loss/crossentropy": 2.3198684811592103, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.23098773807287215, + "loss/reg": 0.0, + "step": 27090 + }, + { + "epoch": 0.17828947368421053, + "grad_norm": 2.703125, + "grad_norm_var": 0.10364583333333334, + "learning_rate": 0.0001, + "loss": 2.977, + "loss/crossentropy": 2.3410441994667055, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.25723920315504073, + "loss/reg": 0.0, + "step": 27100 + }, + { + "epoch": 0.17835526315789474, + "grad_norm": 2.609375, + "grad_norm_var": 0.08136393229166666, + "learning_rate": 0.0001, + "loss": 2.9812, + "loss/crossentropy": 2.290076696872711, + "loss/hidden": 2.6875, + "loss/incoh": 0.0, + "loss/logits": 0.20929899364709853, + "loss/reg": 0.0, + "step": 27110 + }, + { + "epoch": 0.17842105263157895, + "grad_norm": 2.53125, + "grad_norm_var": 0.061310831705729166, + "learning_rate": 0.0001, + "loss": 3.0998, + "loss/crossentropy": 2.6890960454940798, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.25387548208236693, + "loss/reg": 0.0, + "step": 27120 + }, + { + "epoch": 0.17848684210526317, + "grad_norm": 2.3125, + "grad_norm_var": 0.03313395182291667, + "learning_rate": 0.0001, + "loss": 3.0312, + "loss/crossentropy": 2.4378479957580566, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.22957875877618789, + "loss/reg": 0.0, + "step": 27130 + }, + { + "epoch": 0.17855263157894738, + "grad_norm": 2.328125, + "grad_norm_var": 0.11809794108072917, + "learning_rate": 0.0001, + "loss": 3.1739, + "loss/crossentropy": 2.2034701079130175, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.2694556161761284, + "loss/reg": 0.0, + "step": 27140 + }, + { + "epoch": 0.17861842105263157, + "grad_norm": 2.015625, + "grad_norm_var": 0.243505859375, + "learning_rate": 0.0001, + "loss": 3.0594, + "loss/crossentropy": 2.371540868282318, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.23407552391290665, + "loss/reg": 0.0, + "step": 27150 + }, + { + "epoch": 0.17868421052631578, + "grad_norm": 2.21875, + "grad_norm_var": 0.15914306640625, + "learning_rate": 0.0001, + "loss": 2.9942, + "loss/crossentropy": 1.9778084814548493, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.19847352504730226, + "loss/reg": 0.0, + "step": 27160 + }, + { + "epoch": 0.17875, + "grad_norm": 2.71875, + "grad_norm_var": 0.05803120930989583, + "learning_rate": 0.0001, + "loss": 3.0767, + "loss/crossentropy": 2.3956029176712037, + "loss/hidden": 2.69375, + "loss/incoh": 0.0, + "loss/logits": 0.21606186181306838, + "loss/reg": 0.0, + "step": 27170 + }, + { + "epoch": 0.1788157894736842, + "grad_norm": 2.703125, + "grad_norm_var": 0.07607014973958333, + "learning_rate": 0.0001, + "loss": 3.0248, + "loss/crossentropy": 2.2445591568946837, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.23037664219737053, + "loss/reg": 0.0, + "step": 27180 + }, + { + "epoch": 0.17888157894736842, + "grad_norm": 2.453125, + "grad_norm_var": 0.08280843098958333, + "learning_rate": 0.0001, + "loss": 3.1128, + "loss/crossentropy": 1.8874721288681031, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.2291864424943924, + "loss/reg": 0.0, + "step": 27190 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 2.34375, + "grad_norm_var": 0.08245035807291666, + "learning_rate": 0.0001, + "loss": 3.0599, + "loss/crossentropy": 2.321315813064575, + "loss/hidden": 3.0171875, + "loss/incoh": 0.0, + "loss/logits": 0.28183609843254087, + "loss/reg": 0.0, + "step": 27200 + }, + { + "epoch": 0.17901315789473685, + "grad_norm": 2.40625, + "grad_norm_var": 0.04579976399739583, + "learning_rate": 0.0001, + "loss": 3.1486, + "loss/crossentropy": 2.1769141793251037, + "loss/hidden": 3.1140625, + "loss/incoh": 0.0, + "loss/logits": 0.3312571823596954, + "loss/reg": 0.0, + "step": 27210 + }, + { + "epoch": 0.17907894736842106, + "grad_norm": 2.28125, + "grad_norm_var": 0.0505035400390625, + "learning_rate": 0.0001, + "loss": 3.0333, + "loss/crossentropy": 2.421796774864197, + "loss/hidden": 2.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.2053638830780983, + "loss/reg": 0.0, + "step": 27220 + }, + { + "epoch": 0.17914473684210527, + "grad_norm": 2.34375, + "grad_norm_var": 0.13408915201822916, + "learning_rate": 0.0001, + "loss": 3.0911, + "loss/crossentropy": 2.539614748954773, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2433784618973732, + "loss/reg": 0.0, + "step": 27230 + }, + { + "epoch": 0.1792105263157895, + "grad_norm": 2.125, + "grad_norm_var": 0.08008524576822916, + "learning_rate": 0.0001, + "loss": 3.0, + "loss/crossentropy": 2.1362823486328124, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.22769216746091842, + "loss/reg": 0.0, + "step": 27240 + }, + { + "epoch": 0.17927631578947367, + "grad_norm": 2.40625, + "grad_norm_var": 0.038374837239583334, + "learning_rate": 0.0001, + "loss": 2.9645, + "loss/crossentropy": 2.341468358039856, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.21539552509784698, + "loss/reg": 0.0, + "step": 27250 + }, + { + "epoch": 0.17934210526315789, + "grad_norm": 2.234375, + "grad_norm_var": 0.031174468994140624, + "learning_rate": 0.0001, + "loss": 3.0392, + "loss/crossentropy": 2.2766498208045958, + "loss/hidden": 2.659375, + "loss/incoh": 0.0, + "loss/logits": 0.2082691341638565, + "loss/reg": 0.0, + "step": 27260 + }, + { + "epoch": 0.1794078947368421, + "grad_norm": 2.328125, + "grad_norm_var": 0.16941909790039061, + "learning_rate": 0.0001, + "loss": 3.0554, + "loss/crossentropy": 2.537719178199768, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.21607491597533227, + "loss/reg": 0.0, + "step": 27270 + }, + { + "epoch": 0.1794736842105263, + "grad_norm": 2.078125, + "grad_norm_var": 0.23950093587239582, + "learning_rate": 0.0001, + "loss": 2.9943, + "loss/crossentropy": 2.4620283365249636, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.23522212058305741, + "loss/reg": 0.0, + "step": 27280 + }, + { + "epoch": 0.17953947368421053, + "grad_norm": 2.328125, + "grad_norm_var": 0.1655181884765625, + "learning_rate": 0.0001, + "loss": 2.9975, + "loss/crossentropy": 2.2144778609275817, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.22236768230795861, + "loss/reg": 0.0, + "step": 27290 + }, + { + "epoch": 0.17960526315789474, + "grad_norm": 2.46875, + "grad_norm_var": 0.4901529947916667, + "learning_rate": 0.0001, + "loss": 3.0091, + "loss/crossentropy": 2.1528895676136015, + "loss/hidden": 2.69375, + "loss/incoh": 0.0, + "loss/logits": 0.2013701803982258, + "loss/reg": 0.0, + "step": 27300 + }, + { + "epoch": 0.17967105263157895, + "grad_norm": 2.21875, + "grad_norm_var": 0.5030751546223958, + "learning_rate": 0.0001, + "loss": 2.932, + "loss/crossentropy": 2.2105651617050173, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.2156506821513176, + "loss/reg": 0.0, + "step": 27310 + }, + { + "epoch": 0.17973684210526317, + "grad_norm": 2.75, + "grad_norm_var": 0.1188385009765625, + "learning_rate": 0.0001, + "loss": 3.0315, + "loss/crossentropy": 2.3311607003211976, + "loss/hidden": 2.684375, + "loss/incoh": 0.0, + "loss/logits": 0.22039272785186767, + "loss/reg": 0.0, + "step": 27320 + }, + { + "epoch": 0.17980263157894738, + "grad_norm": 2.5, + "grad_norm_var": 0.10274149576822916, + "learning_rate": 0.0001, + "loss": 2.9934, + "loss/crossentropy": 2.363705575466156, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.2591767907142639, + "loss/reg": 0.0, + "step": 27330 + }, + { + "epoch": 0.17986842105263157, + "grad_norm": 2.453125, + "grad_norm_var": 0.22634175618489583, + "learning_rate": 0.0001, + "loss": 3.0232, + "loss/crossentropy": 2.491378426551819, + "loss/hidden": 2.7171875, + "loss/incoh": 0.0, + "loss/logits": 0.22361548170447348, + "loss/reg": 0.0, + "step": 27340 + }, + { + "epoch": 0.17993421052631578, + "grad_norm": 2.3125, + "grad_norm_var": 0.28927408854166664, + "learning_rate": 0.0001, + "loss": 3.0354, + "loss/crossentropy": 2.2730626463890076, + "loss/hidden": 2.95, + "loss/incoh": 0.0, + "loss/logits": 0.2728175431489944, + "loss/reg": 0.0, + "step": 27350 + }, + { + "epoch": 0.18, + "grad_norm": 2.03125, + "grad_norm_var": 0.2871734619140625, + "learning_rate": 0.0001, + "loss": 3.0815, + "loss/crossentropy": 2.222132349014282, + "loss/hidden": 3.0171875, + "loss/incoh": 0.0, + "loss/logits": 0.27575425505638124, + "loss/reg": 0.0, + "step": 27360 + }, + { + "epoch": 0.1800657894736842, + "grad_norm": 2.859375, + "grad_norm_var": 0.18010660807291667, + "learning_rate": 0.0001, + "loss": 3.0514, + "loss/crossentropy": 2.3982750535011292, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.24716890305280687, + "loss/reg": 0.0, + "step": 27370 + }, + { + "epoch": 0.18013157894736842, + "grad_norm": 3.625, + "grad_norm_var": 0.20855712890625, + "learning_rate": 0.0001, + "loss": 3.0229, + "loss/crossentropy": 2.3283283829689028, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.27690982520580293, + "loss/reg": 0.0, + "step": 27380 + }, + { + "epoch": 0.18019736842105263, + "grad_norm": 2.546875, + "grad_norm_var": 0.14794514973958334, + "learning_rate": 0.0001, + "loss": 3.1136, + "loss/crossentropy": 2.3320053696632383, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.22953213453292848, + "loss/reg": 0.0, + "step": 27390 + }, + { + "epoch": 0.18026315789473685, + "grad_norm": 2.78125, + "grad_norm_var": 0.15938212076822916, + "learning_rate": 0.0001, + "loss": 3.0693, + "loss/crossentropy": 2.5745165824890135, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.24661683887243271, + "loss/reg": 0.0, + "step": 27400 + }, + { + "epoch": 0.18032894736842106, + "grad_norm": 2.328125, + "grad_norm_var": 0.08530171712239583, + "learning_rate": 0.0001, + "loss": 3.027, + "loss/crossentropy": 2.291408562660217, + "loss/hidden": 2.934375, + "loss/incoh": 0.0, + "loss/logits": 0.2568038985133171, + "loss/reg": 0.0, + "step": 27410 + }, + { + "epoch": 0.18039473684210527, + "grad_norm": 3.0, + "grad_norm_var": 0.8119293212890625, + "learning_rate": 0.0001, + "loss": 3.0758, + "loss/crossentropy": 2.275824952125549, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.25695485174655913, + "loss/reg": 0.0, + "step": 27420 + }, + { + "epoch": 0.18046052631578946, + "grad_norm": 2.1875, + "grad_norm_var": 0.054488118489583334, + "learning_rate": 0.0001, + "loss": 2.9932, + "loss/crossentropy": 2.3114712476730346, + "loss/hidden": 2.775, + "loss/incoh": 0.0, + "loss/logits": 0.23609411865472793, + "loss/reg": 0.0, + "step": 27430 + }, + { + "epoch": 0.18052631578947367, + "grad_norm": 2.5, + "grad_norm_var": 0.05273030598958333, + "learning_rate": 0.0001, + "loss": 3.0682, + "loss/crossentropy": 2.179815483093262, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.25313089042901993, + "loss/reg": 0.0, + "step": 27440 + }, + { + "epoch": 0.1805921052631579, + "grad_norm": 2.296875, + "grad_norm_var": 0.06285807291666666, + "learning_rate": 0.0001, + "loss": 3.0519, + "loss/crossentropy": 2.3046084105968476, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.27110700607299804, + "loss/reg": 0.0, + "step": 27450 + }, + { + "epoch": 0.1806578947368421, + "grad_norm": 2.15625, + "grad_norm_var": 0.06948954264322917, + "learning_rate": 0.0001, + "loss": 3.0719, + "loss/crossentropy": 2.2216897130012514, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.2158276081085205, + "loss/reg": 0.0, + "step": 27460 + }, + { + "epoch": 0.18072368421052631, + "grad_norm": 2.625, + "grad_norm_var": 0.06531575520833334, + "learning_rate": 0.0001, + "loss": 3.0662, + "loss/crossentropy": 2.31493022441864, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.2338400349020958, + "loss/reg": 0.0, + "step": 27470 + }, + { + "epoch": 0.18078947368421053, + "grad_norm": 2.78125, + "grad_norm_var": 0.08967997233072916, + "learning_rate": 0.0001, + "loss": 3.1017, + "loss/crossentropy": 2.292111110687256, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.25805368572473525, + "loss/reg": 0.0, + "step": 27480 + }, + { + "epoch": 0.18085526315789474, + "grad_norm": 3.78125, + "grad_norm_var": 0.16054585774739583, + "learning_rate": 0.0001, + "loss": 3.0896, + "loss/crossentropy": 2.531006705760956, + "loss/hidden": 2.984375, + "loss/incoh": 0.0, + "loss/logits": 0.3327594205737114, + "loss/reg": 0.0, + "step": 27490 + }, + { + "epoch": 0.18092105263157895, + "grad_norm": 2.3125, + "grad_norm_var": 0.1436431884765625, + "learning_rate": 0.0001, + "loss": 3.0498, + "loss/crossentropy": 2.3085229277610777, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.23342303335666656, + "loss/reg": 0.0, + "step": 27500 + }, + { + "epoch": 0.18098684210526317, + "grad_norm": 2.828125, + "grad_norm_var": 0.24217122395833332, + "learning_rate": 0.0001, + "loss": 3.0115, + "loss/crossentropy": 2.476685440540314, + "loss/hidden": 2.66875, + "loss/incoh": 0.0, + "loss/logits": 0.2170659363269806, + "loss/reg": 0.0, + "step": 27510 + }, + { + "epoch": 0.18105263157894738, + "grad_norm": 2.546875, + "grad_norm_var": 0.13345438639322918, + "learning_rate": 0.0001, + "loss": 3.0171, + "loss/crossentropy": 2.280565071105957, + "loss/hidden": 2.6875, + "loss/incoh": 0.0, + "loss/logits": 0.20756303817033767, + "loss/reg": 0.0, + "step": 27520 + }, + { + "epoch": 0.18111842105263157, + "grad_norm": 2.5, + "grad_norm_var": 0.047200520833333336, + "learning_rate": 0.0001, + "loss": 2.9912, + "loss/crossentropy": 2.553924024105072, + "loss/hidden": 2.65625, + "loss/incoh": 0.0, + "loss/logits": 0.24719125479459764, + "loss/reg": 0.0, + "step": 27530 + }, + { + "epoch": 0.18118421052631578, + "grad_norm": 2.171875, + "grad_norm_var": 0.06653238932291666, + "learning_rate": 0.0001, + "loss": 3.121, + "loss/crossentropy": 2.174853026866913, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.31346396207809446, + "loss/reg": 0.0, + "step": 27540 + }, + { + "epoch": 0.18125, + "grad_norm": 2.828125, + "grad_norm_var": 0.04431966145833333, + "learning_rate": 0.0001, + "loss": 3.0695, + "loss/crossentropy": 2.329943561553955, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.22730601876974105, + "loss/reg": 0.0, + "step": 27550 + }, + { + "epoch": 0.1813157894736842, + "grad_norm": 2.65625, + "grad_norm_var": 0.0733551025390625, + "learning_rate": 0.0001, + "loss": 3.0813, + "loss/crossentropy": 2.2811534643173217, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.24358074963092805, + "loss/reg": 0.0, + "step": 27560 + }, + { + "epoch": 0.18138157894736842, + "grad_norm": 2.1875, + "grad_norm_var": 0.05774739583333333, + "learning_rate": 0.0001, + "loss": 3.0169, + "loss/crossentropy": 2.3220547437667847, + "loss/hidden": 2.70625, + "loss/incoh": 0.0, + "loss/logits": 0.24033707976341248, + "loss/reg": 0.0, + "step": 27570 + }, + { + "epoch": 0.18144736842105263, + "grad_norm": 2.5625, + "grad_norm_var": 0.38362528483072916, + "learning_rate": 0.0001, + "loss": 3.0383, + "loss/crossentropy": 2.4912595987319945, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.2603204816579819, + "loss/reg": 0.0, + "step": 27580 + }, + { + "epoch": 0.18151315789473685, + "grad_norm": 2.75, + "grad_norm_var": 0.1091949462890625, + "learning_rate": 0.0001, + "loss": 3.0424, + "loss/crossentropy": 2.4132495522499084, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.23840930834412574, + "loss/reg": 0.0, + "step": 27590 + }, + { + "epoch": 0.18157894736842106, + "grad_norm": 2.09375, + "grad_norm_var": 0.224072265625, + "learning_rate": 0.0001, + "loss": 3.0529, + "loss/crossentropy": 2.27939612865448, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.2168305702507496, + "loss/reg": 0.0, + "step": 27600 + }, + { + "epoch": 0.18164473684210528, + "grad_norm": 2.453125, + "grad_norm_var": 0.354541015625, + "learning_rate": 0.0001, + "loss": 3.04, + "loss/crossentropy": 2.5797234058380125, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.3075165659189224, + "loss/reg": 0.0, + "step": 27610 + }, + { + "epoch": 0.18171052631578946, + "grad_norm": 2.59375, + "grad_norm_var": 0.28559544881184895, + "learning_rate": 0.0001, + "loss": 3.0175, + "loss/crossentropy": 2.457747685909271, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.2326775386929512, + "loss/reg": 0.0, + "step": 27620 + }, + { + "epoch": 0.18177631578947367, + "grad_norm": 2.078125, + "grad_norm_var": 0.2150957743326823, + "learning_rate": 0.0001, + "loss": 3.025, + "loss/crossentropy": 2.2331402122974398, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.24202753305435182, + "loss/reg": 0.0, + "step": 27630 + }, + { + "epoch": 0.1818421052631579, + "grad_norm": 2.453125, + "grad_norm_var": 0.074267578125, + "learning_rate": 0.0001, + "loss": 3.0184, + "loss/crossentropy": 2.402931201457977, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.24029098004102706, + "loss/reg": 0.0, + "step": 27640 + }, + { + "epoch": 0.1819078947368421, + "grad_norm": 2.234375, + "grad_norm_var": 0.12086181640625, + "learning_rate": 0.0001, + "loss": 3.0823, + "loss/crossentropy": 2.1527361035346986, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.22135660648345948, + "loss/reg": 0.0, + "step": 27650 + }, + { + "epoch": 0.18197368421052632, + "grad_norm": 2.453125, + "grad_norm_var": 0.13780008951822917, + "learning_rate": 0.0001, + "loss": 3.0202, + "loss/crossentropy": 2.2100749254226684, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.2464701771736145, + "loss/reg": 0.0, + "step": 27660 + }, + { + "epoch": 0.18203947368421053, + "grad_norm": 2.28125, + "grad_norm_var": 0.08621317545572917, + "learning_rate": 0.0001, + "loss": 3.0173, + "loss/crossentropy": 2.400713062286377, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.2229515090584755, + "loss/reg": 0.0, + "step": 27670 + }, + { + "epoch": 0.18210526315789474, + "grad_norm": 2.03125, + "grad_norm_var": 0.03254292805989583, + "learning_rate": 0.0001, + "loss": 2.9616, + "loss/crossentropy": 2.053451955318451, + "loss/hidden": 2.9984375, + "loss/incoh": 0.0, + "loss/logits": 0.2730902835726738, + "loss/reg": 0.0, + "step": 27680 + }, + { + "epoch": 0.18217105263157896, + "grad_norm": 2.53125, + "grad_norm_var": 0.05116551717122396, + "learning_rate": 0.0001, + "loss": 2.9723, + "loss/crossentropy": 2.321164917945862, + "loss/hidden": 2.6109375, + "loss/incoh": 0.0, + "loss/logits": 0.2052845761179924, + "loss/reg": 0.0, + "step": 27690 + }, + { + "epoch": 0.18223684210526317, + "grad_norm": 2.640625, + "grad_norm_var": 0.06518961588541666, + "learning_rate": 0.0001, + "loss": 3.0474, + "loss/crossentropy": 2.1387670636177063, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.24108488559722902, + "loss/reg": 0.0, + "step": 27700 + }, + { + "epoch": 0.18230263157894736, + "grad_norm": 2.703125, + "grad_norm_var": 4.39171331639651e+17, + "learning_rate": 0.0001, + "loss": 3.143, + "loss/crossentropy": 2.4102607131004334, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.33623483330011367, + "loss/reg": 0.0, + "step": 27710 + }, + { + "epoch": 0.18236842105263157, + "grad_norm": 2.21875, + "grad_norm_var": 4.391713315923646e+17, + "learning_rate": 0.0001, + "loss": 3.053, + "loss/crossentropy": 2.215381217002869, + "loss/hidden": 2.6921875, + "loss/incoh": 0.0, + "loss/logits": 0.21635367721319199, + "loss/reg": 0.0, + "step": 27720 + }, + { + "epoch": 0.18243421052631578, + "grad_norm": 2.09375, + "grad_norm_var": 0.118603515625, + "learning_rate": 0.0001, + "loss": 2.9848, + "loss/crossentropy": 2.456426572799683, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.22907530665397643, + "loss/reg": 0.0, + "step": 27730 + }, + { + "epoch": 0.1825, + "grad_norm": 2.140625, + "grad_norm_var": 0.2470123291015625, + "learning_rate": 0.0001, + "loss": 2.9824, + "loss/crossentropy": 2.0001117050647736, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.18826698660850524, + "loss/reg": 0.0, + "step": 27740 + }, + { + "epoch": 0.1825657894736842, + "grad_norm": 2.265625, + "grad_norm_var": 0.21778971354166668, + "learning_rate": 0.0001, + "loss": 3.0528, + "loss/crossentropy": 2.3025170087814333, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.22922251224517823, + "loss/reg": 0.0, + "step": 27750 + }, + { + "epoch": 0.18263157894736842, + "grad_norm": 2.46875, + "grad_norm_var": 0.07261962890625, + "learning_rate": 0.0001, + "loss": 3.0966, + "loss/crossentropy": 2.299086034297943, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.23557846546173095, + "loss/reg": 0.0, + "step": 27760 + }, + { + "epoch": 0.18269736842105264, + "grad_norm": 2.140625, + "grad_norm_var": 0.27241109212239584, + "learning_rate": 0.0001, + "loss": 3.0505, + "loss/crossentropy": 2.4849728107452393, + "loss/hidden": 2.68125, + "loss/incoh": 0.0, + "loss/logits": 0.22220651507377626, + "loss/reg": 0.0, + "step": 27770 + }, + { + "epoch": 0.18276315789473685, + "grad_norm": 2.3125, + "grad_norm_var": 0.29171549479166664, + "learning_rate": 0.0001, + "loss": 2.9851, + "loss/crossentropy": 2.3156984567642214, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.220241579413414, + "loss/reg": 0.0, + "step": 27780 + }, + { + "epoch": 0.18282894736842106, + "grad_norm": 2.1875, + "grad_norm_var": 0.0520416259765625, + "learning_rate": 0.0001, + "loss": 3.0197, + "loss/crossentropy": 2.4274224162101747, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.22342314720153808, + "loss/reg": 0.0, + "step": 27790 + }, + { + "epoch": 0.18289473684210528, + "grad_norm": 2.15625, + "grad_norm_var": 0.0919342041015625, + "learning_rate": 0.0001, + "loss": 3.0726, + "loss/crossentropy": 2.399091196060181, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.23315689116716384, + "loss/reg": 0.0, + "step": 27800 + }, + { + "epoch": 0.18296052631578946, + "grad_norm": 2.59375, + "grad_norm_var": 0.1072265625, + "learning_rate": 0.0001, + "loss": 3.0667, + "loss/crossentropy": 2.2596776604652407, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.25109176337718964, + "loss/reg": 0.0, + "step": 27810 + }, + { + "epoch": 0.18302631578947368, + "grad_norm": 2.6875, + "grad_norm_var": 1.1102854410807292, + "learning_rate": 0.0001, + "loss": 3.1493, + "loss/crossentropy": 2.2746080636978148, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2303971141576767, + "loss/reg": 0.0, + "step": 27820 + }, + { + "epoch": 0.1830921052631579, + "grad_norm": 2.25, + "grad_norm_var": 1.1916015625, + "learning_rate": 0.0001, + "loss": 3.0322, + "loss/crossentropy": 2.2199900269508364, + "loss/hidden": 2.7625, + "loss/incoh": 0.0, + "loss/logits": 0.24132085889577864, + "loss/reg": 0.0, + "step": 27830 + }, + { + "epoch": 0.1831578947368421, + "grad_norm": 2.21875, + "grad_norm_var": 0.11803385416666666, + "learning_rate": 0.0001, + "loss": 2.9787, + "loss/crossentropy": 2.261062300205231, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.23909828811883926, + "loss/reg": 0.0, + "step": 27840 + }, + { + "epoch": 0.18322368421052632, + "grad_norm": 2.3125, + "grad_norm_var": 0.09485270182291666, + "learning_rate": 0.0001, + "loss": 3.0545, + "loss/crossentropy": 2.3090713739395143, + "loss/hidden": 2.68125, + "loss/incoh": 0.0, + "loss/logits": 0.20118657350540162, + "loss/reg": 0.0, + "step": 27850 + }, + { + "epoch": 0.18328947368421053, + "grad_norm": 2.390625, + "grad_norm_var": 0.24943008422851562, + "learning_rate": 0.0001, + "loss": 3.0424, + "loss/crossentropy": 2.3824569821357726, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.23975446224212646, + "loss/reg": 0.0, + "step": 27860 + }, + { + "epoch": 0.18335526315789474, + "grad_norm": 2.125, + "grad_norm_var": 0.14694010416666667, + "learning_rate": 0.0001, + "loss": 2.9925, + "loss/crossentropy": 2.227169597148895, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.22001251727342605, + "loss/reg": 0.0, + "step": 27870 + }, + { + "epoch": 0.18342105263157896, + "grad_norm": 2.28125, + "grad_norm_var": 0.06550191243489584, + "learning_rate": 0.0001, + "loss": 3.0493, + "loss/crossentropy": 2.5089489579200746, + "loss/hidden": 2.6625, + "loss/incoh": 0.0, + "loss/logits": 0.20695988237857818, + "loss/reg": 0.0, + "step": 27880 + }, + { + "epoch": 0.18348684210526317, + "grad_norm": 2.328125, + "grad_norm_var": 0.0484527587890625, + "learning_rate": 0.0001, + "loss": 2.9952, + "loss/crossentropy": 2.4806633472442625, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.27523170709609984, + "loss/reg": 0.0, + "step": 27890 + }, + { + "epoch": 0.18355263157894736, + "grad_norm": 4.3125, + "grad_norm_var": 0.2769521077473958, + "learning_rate": 0.0001, + "loss": 3.0394, + "loss/crossentropy": 2.239254057407379, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.2308262661099434, + "loss/reg": 0.0, + "step": 27900 + }, + { + "epoch": 0.18361842105263157, + "grad_norm": 2.625, + "grad_norm_var": 0.29397761027018227, + "learning_rate": 0.0001, + "loss": 3.0224, + "loss/crossentropy": 2.3714274525642396, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.26112145036458967, + "loss/reg": 0.0, + "step": 27910 + }, + { + "epoch": 0.18368421052631578, + "grad_norm": 2.3125, + "grad_norm_var": 0.05971450805664062, + "learning_rate": 0.0001, + "loss": 2.9517, + "loss/crossentropy": 2.0551604270935058, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.24491064846515656, + "loss/reg": 0.0, + "step": 27920 + }, + { + "epoch": 0.18375, + "grad_norm": 2.53125, + "grad_norm_var": 0.03795547485351562, + "learning_rate": 0.0001, + "loss": 3.0193, + "loss/crossentropy": 2.3411830067634583, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.23370584547519685, + "loss/reg": 0.0, + "step": 27930 + }, + { + "epoch": 0.1838157894736842, + "grad_norm": 2.40625, + "grad_norm_var": 0.1084625244140625, + "learning_rate": 0.0001, + "loss": 3.0802, + "loss/crossentropy": 2.78390554189682, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.26939452439546585, + "loss/reg": 0.0, + "step": 27940 + }, + { + "epoch": 0.18388157894736842, + "grad_norm": 2.359375, + "grad_norm_var": 0.41838277180989586, + "learning_rate": 0.0001, + "loss": 3.1153, + "loss/crossentropy": 2.2294405877590178, + "loss/hidden": 2.7, + "loss/incoh": 0.0, + "loss/logits": 0.23037993758916855, + "loss/reg": 0.0, + "step": 27950 + }, + { + "epoch": 0.18394736842105264, + "grad_norm": 3.390625, + "grad_norm_var": 0.37854410807291666, + "learning_rate": 0.0001, + "loss": 3.1324, + "loss/crossentropy": 2.28962881565094, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.24506068229675293, + "loss/reg": 0.0, + "step": 27960 + }, + { + "epoch": 0.18401315789473685, + "grad_norm": 2.40625, + "grad_norm_var": 0.1893999735514323, + "learning_rate": 0.0001, + "loss": 3.0278, + "loss/crossentropy": 2.312084639072418, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.2302939549088478, + "loss/reg": 0.0, + "step": 27970 + }, + { + "epoch": 0.18407894736842106, + "grad_norm": 2.359375, + "grad_norm_var": 0.15084228515625, + "learning_rate": 0.0001, + "loss": 3.0267, + "loss/crossentropy": 2.3484140396118165, + "loss/hidden": 2.6046875, + "loss/incoh": 0.0, + "loss/logits": 0.21540230959653855, + "loss/reg": 0.0, + "step": 27980 + }, + { + "epoch": 0.18414473684210525, + "grad_norm": 2.421875, + "grad_norm_var": 0.15601806640625, + "learning_rate": 0.0001, + "loss": 3.0073, + "loss/crossentropy": 2.4529793858528137, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.23430580049753189, + "loss/reg": 0.0, + "step": 27990 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 2.421875, + "grad_norm_var": 0.0614166259765625, + "learning_rate": 0.0001, + "loss": 3.0303, + "loss/crossentropy": 2.351777696609497, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.2307715743780136, + "loss/reg": 0.0, + "step": 28000 + }, + { + "epoch": 0.18427631578947368, + "grad_norm": 4.0625, + "grad_norm_var": 0.21398824055989582, + "learning_rate": 0.0001, + "loss": 3.0205, + "loss/crossentropy": 2.435424494743347, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.2605126053094864, + "loss/reg": 0.0, + "step": 28010 + }, + { + "epoch": 0.1843421052631579, + "grad_norm": 2.5625, + "grad_norm_var": 0.22447509765625, + "learning_rate": 0.0001, + "loss": 3.0798, + "loss/crossentropy": 2.3683685779571535, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.22029414623975754, + "loss/reg": 0.0, + "step": 28020 + }, + { + "epoch": 0.1844078947368421, + "grad_norm": 2.34375, + "grad_norm_var": 0.08081766764322916, + "learning_rate": 0.0001, + "loss": 3.0223, + "loss/crossentropy": 2.5003761768341066, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.2650137528777122, + "loss/reg": 0.0, + "step": 28030 + }, + { + "epoch": 0.18447368421052632, + "grad_norm": 2.28125, + "grad_norm_var": 0.07470601399739583, + "learning_rate": 0.0001, + "loss": 2.998, + "loss/crossentropy": 2.2510382771492004, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.23913081735372543, + "loss/reg": 0.0, + "step": 28040 + }, + { + "epoch": 0.18453947368421053, + "grad_norm": 2.140625, + "grad_norm_var": 0.12652587890625, + "learning_rate": 0.0001, + "loss": 3.0344, + "loss/crossentropy": 2.494665837287903, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.24169176146388055, + "loss/reg": 0.0, + "step": 28050 + }, + { + "epoch": 0.18460526315789474, + "grad_norm": 2.0625, + "grad_norm_var": 0.08466389973958334, + "learning_rate": 0.0001, + "loss": 3.104, + "loss/crossentropy": 2.3110733151435854, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.22967937737703323, + "loss/reg": 0.0, + "step": 28060 + }, + { + "epoch": 0.18467105263157896, + "grad_norm": 2.578125, + "grad_norm_var": 0.067431640625, + "learning_rate": 0.0001, + "loss": 3.0115, + "loss/crossentropy": 2.4786781549453734, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.23725456148386, + "loss/reg": 0.0, + "step": 28070 + }, + { + "epoch": 0.18473684210526317, + "grad_norm": 2.828125, + "grad_norm_var": 0.07773335774739583, + "learning_rate": 0.0001, + "loss": 3.0795, + "loss/crossentropy": 2.5213263630867004, + "loss/hidden": 3.0703125, + "loss/incoh": 0.0, + "loss/logits": 0.3203597366809845, + "loss/reg": 0.0, + "step": 28080 + }, + { + "epoch": 0.18480263157894736, + "grad_norm": 2.296875, + "grad_norm_var": 0.06311747233072916, + "learning_rate": 0.0001, + "loss": 2.9924, + "loss/crossentropy": 2.3318725705146788, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.21815032362937928, + "loss/reg": 0.0, + "step": 28090 + }, + { + "epoch": 0.18486842105263157, + "grad_norm": 2.40625, + "grad_norm_var": 0.42880859375, + "learning_rate": 0.0001, + "loss": 3.0976, + "loss/crossentropy": 2.257493245601654, + "loss/hidden": 2.6390625, + "loss/incoh": 0.0, + "loss/logits": 0.22243027836084367, + "loss/reg": 0.0, + "step": 28100 + }, + { + "epoch": 0.18493421052631578, + "grad_norm": 2.5625, + "grad_norm_var": 0.16021728515625, + "learning_rate": 0.0001, + "loss": 3.0607, + "loss/crossentropy": 2.320458722114563, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.22777790427207947, + "loss/reg": 0.0, + "step": 28110 + }, + { + "epoch": 0.185, + "grad_norm": 2.1875, + "grad_norm_var": 0.35485738118489585, + "learning_rate": 0.0001, + "loss": 3.0824, + "loss/crossentropy": 2.5934891939163207, + "loss/hidden": 2.6625, + "loss/incoh": 0.0, + "loss/logits": 0.217761267721653, + "loss/reg": 0.0, + "step": 28120 + }, + { + "epoch": 0.1850657894736842, + "grad_norm": 2.15625, + "grad_norm_var": 0.16167577107747397, + "learning_rate": 0.0001, + "loss": 3.0055, + "loss/crossentropy": 2.2394705057144164, + "loss/hidden": 2.9890625, + "loss/incoh": 0.0, + "loss/logits": 0.2515557982027531, + "loss/reg": 0.0, + "step": 28130 + }, + { + "epoch": 0.18513157894736842, + "grad_norm": 2.5625, + "grad_norm_var": 0.14989598592122397, + "learning_rate": 0.0001, + "loss": 3.0126, + "loss/crossentropy": 2.3399840354919434, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.2646253302693367, + "loss/reg": 0.0, + "step": 28140 + }, + { + "epoch": 0.18519736842105264, + "grad_norm": 2.234375, + "grad_norm_var": 0.14189453125, + "learning_rate": 0.0001, + "loss": 3.0785, + "loss/crossentropy": 2.3720606327056886, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.2922904878854752, + "loss/reg": 0.0, + "step": 28150 + }, + { + "epoch": 0.18526315789473685, + "grad_norm": 2.1875, + "grad_norm_var": 0.052567545572916666, + "learning_rate": 0.0001, + "loss": 3.0941, + "loss/crossentropy": 2.26825897693634, + "loss/hidden": 2.7984375, + "loss/incoh": 0.0, + "loss/logits": 0.24575791507959366, + "loss/reg": 0.0, + "step": 28160 + }, + { + "epoch": 0.18532894736842107, + "grad_norm": 2.671875, + "grad_norm_var": 0.0409820556640625, + "learning_rate": 0.0001, + "loss": 3.0183, + "loss/crossentropy": 2.2412746131420134, + "loss/hidden": 2.7109375, + "loss/incoh": 0.0, + "loss/logits": 0.22467602118849755, + "loss/reg": 0.0, + "step": 28170 + }, + { + "epoch": 0.18539473684210525, + "grad_norm": 2.40625, + "grad_norm_var": 0.022264607747395835, + "learning_rate": 0.0001, + "loss": 3.1005, + "loss/crossentropy": 2.465518927574158, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.25958890467882156, + "loss/reg": 0.0, + "step": 28180 + }, + { + "epoch": 0.18546052631578946, + "grad_norm": 2.5625, + "grad_norm_var": 0.026130167643229167, + "learning_rate": 0.0001, + "loss": 3.0111, + "loss/crossentropy": 2.4506253004074097, + "loss/hidden": 2.703125, + "loss/incoh": 0.0, + "loss/logits": 0.23345130831003189, + "loss/reg": 0.0, + "step": 28190 + }, + { + "epoch": 0.18552631578947368, + "grad_norm": 2.859375, + "grad_norm_var": 0.2981516520182292, + "learning_rate": 0.0001, + "loss": 3.1404, + "loss/crossentropy": 2.531968724727631, + "loss/hidden": 3.134375, + "loss/incoh": 0.0, + "loss/logits": 0.3312064751982689, + "loss/reg": 0.0, + "step": 28200 + }, + { + "epoch": 0.1855921052631579, + "grad_norm": 2.4375, + "grad_norm_var": 0.3452545166015625, + "learning_rate": 0.0001, + "loss": 3.0669, + "loss/crossentropy": 2.1583576440811156, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.22225692719221116, + "loss/reg": 0.0, + "step": 28210 + }, + { + "epoch": 0.1856578947368421, + "grad_norm": 2.5, + "grad_norm_var": 0.1086090087890625, + "learning_rate": 0.0001, + "loss": 3.0293, + "loss/crossentropy": 2.0994983315467834, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.21283998042345048, + "loss/reg": 0.0, + "step": 28220 + }, + { + "epoch": 0.18572368421052632, + "grad_norm": 2.625, + "grad_norm_var": 0.09930013020833334, + "learning_rate": 0.0001, + "loss": 3.0418, + "loss/crossentropy": 2.208499777317047, + "loss/hidden": 2.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.21439559012651443, + "loss/reg": 0.0, + "step": 28230 + }, + { + "epoch": 0.18578947368421053, + "grad_norm": 3.609375, + "grad_norm_var": 0.1129302978515625, + "learning_rate": 0.0001, + "loss": 3.1019, + "loss/crossentropy": 2.507619249820709, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.2734301760792732, + "loss/reg": 0.0, + "step": 28240 + }, + { + "epoch": 0.18585526315789475, + "grad_norm": 2.375, + "grad_norm_var": 0.09721577962239583, + "learning_rate": 0.0001, + "loss": 3.0004, + "loss/crossentropy": 2.440661299228668, + "loss/hidden": 2.6890625, + "loss/incoh": 0.0, + "loss/logits": 0.20598573684692384, + "loss/reg": 0.0, + "step": 28250 + }, + { + "epoch": 0.18592105263157896, + "grad_norm": 3.046875, + "grad_norm_var": 0.13509012858072916, + "learning_rate": 0.0001, + "loss": 3.0432, + "loss/crossentropy": 2.13590213060379, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.21278525814414023, + "loss/reg": 0.0, + "step": 28260 + }, + { + "epoch": 0.18598684210526314, + "grad_norm": 2.3125, + "grad_norm_var": 0.10868733723958333, + "learning_rate": 0.0001, + "loss": 3.0082, + "loss/crossentropy": 2.4866502404212953, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.2882431522011757, + "loss/reg": 0.0, + "step": 28270 + }, + { + "epoch": 0.18605263157894736, + "grad_norm": 2.40625, + "grad_norm_var": 0.03225504557291667, + "learning_rate": 0.0001, + "loss": 3.0845, + "loss/crossentropy": 1.8784153163433075, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.21082937121391296, + "loss/reg": 0.0, + "step": 28280 + }, + { + "epoch": 0.18611842105263157, + "grad_norm": 2.390625, + "grad_norm_var": 0.0173980712890625, + "learning_rate": 0.0001, + "loss": 3.0764, + "loss/crossentropy": 2.357287549972534, + "loss/hidden": 2.646875, + "loss/incoh": 0.0, + "loss/logits": 0.20526091530919074, + "loss/reg": 0.0, + "step": 28290 + }, + { + "epoch": 0.18618421052631579, + "grad_norm": 2.109375, + "grad_norm_var": 0.05768941243489583, + "learning_rate": 0.0001, + "loss": 3.0812, + "loss/crossentropy": 1.8806862443685533, + "loss/hidden": 2.621875, + "loss/incoh": 0.0, + "loss/logits": 0.18540635257959365, + "loss/reg": 0.0, + "step": 28300 + }, + { + "epoch": 0.18625, + "grad_norm": 2.71875, + "grad_norm_var": 0.0767730712890625, + "learning_rate": 0.0001, + "loss": 3.0752, + "loss/crossentropy": 2.2633087158203127, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.2433292269706726, + "loss/reg": 0.0, + "step": 28310 + }, + { + "epoch": 0.1863157894736842, + "grad_norm": 2.65625, + "grad_norm_var": 0.05572509765625, + "learning_rate": 0.0001, + "loss": 2.9722, + "loss/crossentropy": 2.328176748752594, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.226905982196331, + "loss/reg": 0.0, + "step": 28320 + }, + { + "epoch": 0.18638157894736843, + "grad_norm": 1.9765625, + "grad_norm_var": 0.08190485636393229, + "learning_rate": 0.0001, + "loss": 3.0092, + "loss/crossentropy": 2.532880795001984, + "loss/hidden": 2.971875, + "loss/incoh": 0.0, + "loss/logits": 0.3012142822146416, + "loss/reg": 0.0, + "step": 28330 + }, + { + "epoch": 0.18644736842105264, + "grad_norm": 2.625, + "grad_norm_var": 0.1636138916015625, + "learning_rate": 0.0001, + "loss": 3.0108, + "loss/crossentropy": 2.823889398574829, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.24069934040308, + "loss/reg": 0.0, + "step": 28340 + }, + { + "epoch": 0.18651315789473685, + "grad_norm": 2.234375, + "grad_norm_var": 0.10241597493489583, + "learning_rate": 0.0001, + "loss": 3.0001, + "loss/crossentropy": 2.35920227766037, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.26395892798900605, + "loss/reg": 0.0, + "step": 28350 + }, + { + "epoch": 0.18657894736842107, + "grad_norm": 2.4375, + "grad_norm_var": 0.08288472493489583, + "learning_rate": 0.0001, + "loss": 3.0675, + "loss/crossentropy": 2.070932924747467, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.20600728541612626, + "loss/reg": 0.0, + "step": 28360 + }, + { + "epoch": 0.18664473684210525, + "grad_norm": 2.125, + "grad_norm_var": 0.07918192545572916, + "learning_rate": 0.0001, + "loss": 3.017, + "loss/crossentropy": 2.307896840572357, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.25419241189956665, + "loss/reg": 0.0, + "step": 28370 + }, + { + "epoch": 0.18671052631578947, + "grad_norm": 2.40625, + "grad_norm_var": 0.05845947265625, + "learning_rate": 0.0001, + "loss": 3.0325, + "loss/crossentropy": 2.482656693458557, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.24442674964666367, + "loss/reg": 0.0, + "step": 28380 + }, + { + "epoch": 0.18677631578947368, + "grad_norm": 2.140625, + "grad_norm_var": 0.12398173014322916, + "learning_rate": 0.0001, + "loss": 3.1446, + "loss/crossentropy": 2.082607638835907, + "loss/hidden": 2.975, + "loss/incoh": 0.0, + "loss/logits": 0.2496044546365738, + "loss/reg": 0.0, + "step": 28390 + }, + { + "epoch": 0.1868421052631579, + "grad_norm": 2.09375, + "grad_norm_var": 0.0723785400390625, + "learning_rate": 0.0001, + "loss": 3.071, + "loss/crossentropy": 2.2511567950248716, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.26172232031822207, + "loss/reg": 0.0, + "step": 28400 + }, + { + "epoch": 0.1869078947368421, + "grad_norm": 3.09375, + "grad_norm_var": 0.08268229166666667, + "learning_rate": 0.0001, + "loss": 3.1149, + "loss/crossentropy": 2.275085437297821, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.23399723172187806, + "loss/reg": 0.0, + "step": 28410 + }, + { + "epoch": 0.18697368421052632, + "grad_norm": 2.4375, + "grad_norm_var": 0.1056793212890625, + "learning_rate": 0.0001, + "loss": 3.0149, + "loss/crossentropy": 2.443694305419922, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.26309441328048705, + "loss/reg": 0.0, + "step": 28420 + }, + { + "epoch": 0.18703947368421053, + "grad_norm": 2.578125, + "grad_norm_var": 0.05745340983072917, + "learning_rate": 0.0001, + "loss": 3.0672, + "loss/crossentropy": 2.1224380493164063, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.23011831045150757, + "loss/reg": 0.0, + "step": 28430 + }, + { + "epoch": 0.18710526315789475, + "grad_norm": 2.015625, + "grad_norm_var": 0.1089752197265625, + "learning_rate": 0.0001, + "loss": 3.0275, + "loss/crossentropy": 2.3803366303443907, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.24528718441724778, + "loss/reg": 0.0, + "step": 28440 + }, + { + "epoch": 0.18717105263157896, + "grad_norm": 2.015625, + "grad_norm_var": 0.1678375244140625, + "learning_rate": 0.0001, + "loss": 3.0534, + "loss/crossentropy": 2.254507315158844, + "loss/hidden": 2.9875, + "loss/incoh": 0.0, + "loss/logits": 0.23182358890771865, + "loss/reg": 0.0, + "step": 28450 + }, + { + "epoch": 0.18723684210526315, + "grad_norm": 2.296875, + "grad_norm_var": 0.0425201416015625, + "learning_rate": 0.0001, + "loss": 3.0644, + "loss/crossentropy": 2.387832987308502, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.22640825510025026, + "loss/reg": 0.0, + "step": 28460 + }, + { + "epoch": 0.18730263157894736, + "grad_norm": 2.359375, + "grad_norm_var": 0.03277587890625, + "learning_rate": 0.0001, + "loss": 3.0762, + "loss/crossentropy": 2.4295106649398805, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.2517640799283981, + "loss/reg": 0.0, + "step": 28470 + }, + { + "epoch": 0.18736842105263157, + "grad_norm": 1.96875, + "grad_norm_var": 0.15917561848958334, + "learning_rate": 0.0001, + "loss": 3.0049, + "loss/crossentropy": 2.472483456134796, + "loss/hidden": 2.6203125, + "loss/incoh": 0.0, + "loss/logits": 0.21084701418876647, + "loss/reg": 0.0, + "step": 28480 + }, + { + "epoch": 0.1874342105263158, + "grad_norm": 2.5, + "grad_norm_var": 0.15578511555989583, + "learning_rate": 0.0001, + "loss": 3.1143, + "loss/crossentropy": 1.9477856278419494, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.26639992743730545, + "loss/reg": 0.0, + "step": 28490 + }, + { + "epoch": 0.1875, + "grad_norm": 2.375, + "grad_norm_var": 0.022737630208333335, + "learning_rate": 0.0001, + "loss": 3.0663, + "loss/crossentropy": 2.3596234798431395, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.27315576672554015, + "loss/reg": 0.0, + "step": 28500 + }, + { + "epoch": 0.1875657894736842, + "grad_norm": 2.34375, + "grad_norm_var": 0.013753255208333334, + "learning_rate": 0.0001, + "loss": 3.0261, + "loss/crossentropy": 2.196711075305939, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.23315538316965104, + "loss/reg": 0.0, + "step": 28510 + }, + { + "epoch": 0.18763157894736843, + "grad_norm": 2.453125, + "grad_norm_var": 0.05598119099934896, + "learning_rate": 0.0001, + "loss": 3.1013, + "loss/crossentropy": 2.1932213962078095, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.2575216740369797, + "loss/reg": 0.0, + "step": 28520 + }, + { + "epoch": 0.18769736842105264, + "grad_norm": 2.484375, + "grad_norm_var": 0.0827166239420573, + "learning_rate": 0.0001, + "loss": 3.0802, + "loss/crossentropy": 2.1390640258789064, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.21492594629526138, + "loss/reg": 0.0, + "step": 28530 + }, + { + "epoch": 0.18776315789473685, + "grad_norm": 2.5, + "grad_norm_var": 0.38186848958333336, + "learning_rate": 0.0001, + "loss": 3.1353, + "loss/crossentropy": 2.3883109092712402, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.24425326883792878, + "loss/reg": 0.0, + "step": 28540 + }, + { + "epoch": 0.18782894736842104, + "grad_norm": 2.0625, + "grad_norm_var": 0.30573628743489584, + "learning_rate": 0.0001, + "loss": 3.0361, + "loss/crossentropy": 2.115387570858002, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.287911082804203, + "loss/reg": 0.0, + "step": 28550 + }, + { + "epoch": 0.18789473684210525, + "grad_norm": 2.671875, + "grad_norm_var": 0.07654622395833334, + "learning_rate": 0.0001, + "loss": 3.1181, + "loss/crossentropy": 2.195990490913391, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.22947419285774232, + "loss/reg": 0.0, + "step": 28560 + }, + { + "epoch": 0.18796052631578947, + "grad_norm": 2.359375, + "grad_norm_var": 0.07486572265625, + "learning_rate": 0.0001, + "loss": 3.0473, + "loss/crossentropy": 2.4482214570045473, + "loss/hidden": 2.7234375, + "loss/incoh": 0.0, + "loss/logits": 0.24045743197202682, + "loss/reg": 0.0, + "step": 28570 + }, + { + "epoch": 0.18802631578947368, + "grad_norm": 2.265625, + "grad_norm_var": 0.03916600545247396, + "learning_rate": 0.0001, + "loss": 3.0881, + "loss/crossentropy": 2.3246118783950807, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.3818572014570236, + "loss/reg": 0.0, + "step": 28580 + }, + { + "epoch": 0.1880921052631579, + "grad_norm": 2.5625, + "grad_norm_var": 0.16263427734375, + "learning_rate": 0.0001, + "loss": 3.0568, + "loss/crossentropy": 2.3318132519721986, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.25452076345682145, + "loss/reg": 0.0, + "step": 28590 + }, + { + "epoch": 0.1881578947368421, + "grad_norm": 2.484375, + "grad_norm_var": 0.20039443969726561, + "learning_rate": 0.0001, + "loss": 3.0826, + "loss/crossentropy": 2.0651711583137513, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.20308290272951127, + "loss/reg": 0.0, + "step": 28600 + }, + { + "epoch": 0.18822368421052632, + "grad_norm": 2.53125, + "grad_norm_var": 0.16236750284830728, + "learning_rate": 0.0001, + "loss": 3.0752, + "loss/crossentropy": 2.307542312145233, + "loss/hidden": 2.665625, + "loss/incoh": 0.0, + "loss/logits": 0.2218876764178276, + "loss/reg": 0.0, + "step": 28610 + }, + { + "epoch": 0.18828947368421053, + "grad_norm": 4.53125, + "grad_norm_var": 0.3535011291503906, + "learning_rate": 0.0001, + "loss": 3.0657, + "loss/crossentropy": 2.2695202469825744, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.24460041224956514, + "loss/reg": 0.0, + "step": 28620 + }, + { + "epoch": 0.18835526315789475, + "grad_norm": 2.15625, + "grad_norm_var": 0.34776102701822914, + "learning_rate": 0.0001, + "loss": 3.0931, + "loss/crossentropy": 2.409075605869293, + "loss/hidden": 3.2140625, + "loss/incoh": 0.0, + "loss/logits": 0.3033790022134781, + "loss/reg": 0.0, + "step": 28630 + }, + { + "epoch": 0.18842105263157893, + "grad_norm": 2.21875, + "grad_norm_var": 0.04940999348958333, + "learning_rate": 0.0001, + "loss": 3.0248, + "loss/crossentropy": 2.3460227608680726, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.24943882077932358, + "loss/reg": 0.0, + "step": 28640 + }, + { + "epoch": 0.18848684210526315, + "grad_norm": 2.078125, + "grad_norm_var": 0.18352762858072916, + "learning_rate": 0.0001, + "loss": 3.0946, + "loss/crossentropy": 2.8061397314071654, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.2542914628982544, + "loss/reg": 0.0, + "step": 28650 + }, + { + "epoch": 0.18855263157894736, + "grad_norm": 2.203125, + "grad_norm_var": 0.05283203125, + "learning_rate": 0.0001, + "loss": 3.023, + "loss/crossentropy": 2.0528534650802612, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.2569132924079895, + "loss/reg": 0.0, + "step": 28660 + }, + { + "epoch": 0.18861842105263157, + "grad_norm": 2.171875, + "grad_norm_var": 0.026520792643229166, + "learning_rate": 0.0001, + "loss": 2.9614, + "loss/crossentropy": 2.2640815138816834, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.25332382023334504, + "loss/reg": 0.0, + "step": 28670 + }, + { + "epoch": 0.1886842105263158, + "grad_norm": 1.953125, + "grad_norm_var": 0.037451171875, + "learning_rate": 0.0001, + "loss": 3.0214, + "loss/crossentropy": 2.0984261095523835, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.27365772873163224, + "loss/reg": 0.0, + "step": 28680 + }, + { + "epoch": 0.18875, + "grad_norm": 2.703125, + "grad_norm_var": 0.07590738932291667, + "learning_rate": 0.0001, + "loss": 3.089, + "loss/crossentropy": 2.2889229536056517, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.2945134401321411, + "loss/reg": 0.0, + "step": 28690 + }, + { + "epoch": 0.18881578947368421, + "grad_norm": 2.234375, + "grad_norm_var": 0.04885152180989583, + "learning_rate": 0.0001, + "loss": 3.0409, + "loss/crossentropy": 2.164911460876465, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.22368502318859101, + "loss/reg": 0.0, + "step": 28700 + }, + { + "epoch": 0.18888157894736843, + "grad_norm": 2.109375, + "grad_norm_var": 0.1095855712890625, + "learning_rate": 0.0001, + "loss": 3.003, + "loss/crossentropy": 2.2275232434272767, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.21968200653791428, + "loss/reg": 0.0, + "step": 28710 + }, + { + "epoch": 0.18894736842105264, + "grad_norm": 2.5625, + "grad_norm_var": 0.05891494750976563, + "learning_rate": 0.0001, + "loss": 2.985, + "loss/crossentropy": 2.429938530921936, + "loss/hidden": 2.6046875, + "loss/incoh": 0.0, + "loss/logits": 0.22446376383304595, + "loss/reg": 0.0, + "step": 28720 + }, + { + "epoch": 0.18901315789473686, + "grad_norm": 1.984375, + "grad_norm_var": 0.13444722493489583, + "learning_rate": 0.0001, + "loss": 2.9926, + "loss/crossentropy": 2.139791202545166, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.2599222779273987, + "loss/reg": 0.0, + "step": 28730 + }, + { + "epoch": 0.18907894736842104, + "grad_norm": 2.8125, + "grad_norm_var": 0.15026041666666667, + "learning_rate": 0.0001, + "loss": 2.9502, + "loss/crossentropy": 2.5147767782211305, + "loss/hidden": 2.609375, + "loss/incoh": 0.0, + "loss/logits": 0.20266549810767173, + "loss/reg": 0.0, + "step": 28740 + }, + { + "epoch": 0.18914473684210525, + "grad_norm": 3.578125, + "grad_norm_var": 0.20552978515625, + "learning_rate": 0.0001, + "loss": 3.051, + "loss/crossentropy": 2.4063800454139708, + "loss/hidden": 3.08125, + "loss/incoh": 0.0, + "loss/logits": 0.3112260654568672, + "loss/reg": 0.0, + "step": 28750 + }, + { + "epoch": 0.18921052631578947, + "grad_norm": 2.453125, + "grad_norm_var": 0.14830322265625, + "learning_rate": 0.0001, + "loss": 3.057, + "loss/crossentropy": 2.2508232951164246, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.22451825439929962, + "loss/reg": 0.0, + "step": 28760 + }, + { + "epoch": 0.18927631578947368, + "grad_norm": 2.875, + "grad_norm_var": 0.11416727701822917, + "learning_rate": 0.0001, + "loss": 3.0512, + "loss/crossentropy": 2.149078315496445, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.21534078270196916, + "loss/reg": 0.0, + "step": 28770 + }, + { + "epoch": 0.1893421052631579, + "grad_norm": 2.265625, + "grad_norm_var": 0.61968994140625, + "learning_rate": 0.0001, + "loss": 3.0034, + "loss/crossentropy": 2.3696099877357484, + "loss/hidden": 2.715625, + "loss/incoh": 0.0, + "loss/logits": 0.2401156485080719, + "loss/reg": 0.0, + "step": 28780 + }, + { + "epoch": 0.1894078947368421, + "grad_norm": 2.203125, + "grad_norm_var": 0.4911092122395833, + "learning_rate": 0.0001, + "loss": 3.0951, + "loss/crossentropy": 2.0147340178489683, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.2364984579384327, + "loss/reg": 0.0, + "step": 28790 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 2.203125, + "grad_norm_var": 0.06545817057291667, + "learning_rate": 0.0001, + "loss": 3.0985, + "loss/crossentropy": 2.4977503657341003, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.25097098499536513, + "loss/reg": 0.0, + "step": 28800 + }, + { + "epoch": 0.18953947368421054, + "grad_norm": 1.96875, + "grad_norm_var": 0.08662007649739584, + "learning_rate": 0.0001, + "loss": 3.0047, + "loss/crossentropy": 2.1395092368125916, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.20759201645851136, + "loss/reg": 0.0, + "step": 28810 + }, + { + "epoch": 0.18960526315789475, + "grad_norm": 2.78125, + "grad_norm_var": 0.09783426920572917, + "learning_rate": 0.0001, + "loss": 3.0329, + "loss/crossentropy": 2.3047179579734802, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.22515993937849998, + "loss/reg": 0.0, + "step": 28820 + }, + { + "epoch": 0.18967105263157893, + "grad_norm": 2.609375, + "grad_norm_var": 0.07600504557291667, + "learning_rate": 0.0001, + "loss": 3.0786, + "loss/crossentropy": 2.4169238924980165, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.23725131005048752, + "loss/reg": 0.0, + "step": 28830 + }, + { + "epoch": 0.18973684210526315, + "grad_norm": 2.78125, + "grad_norm_var": 0.09014460245768229, + "learning_rate": 0.0001, + "loss": 3.0935, + "loss/crossentropy": 2.473812985420227, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.23891154676675797, + "loss/reg": 0.0, + "step": 28840 + }, + { + "epoch": 0.18980263157894736, + "grad_norm": 2.265625, + "grad_norm_var": 0.3475870768229167, + "learning_rate": 0.0001, + "loss": 3.0779, + "loss/crossentropy": 2.1757280230522156, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.2538435861468315, + "loss/reg": 0.0, + "step": 28850 + }, + { + "epoch": 0.18986842105263158, + "grad_norm": 2.4375, + "grad_norm_var": 0.2500233968098958, + "learning_rate": 0.0001, + "loss": 3.0282, + "loss/crossentropy": 2.321690630912781, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.2390053778886795, + "loss/reg": 0.0, + "step": 28860 + }, + { + "epoch": 0.1899342105263158, + "grad_norm": 2.765625, + "grad_norm_var": 0.12604878743489584, + "learning_rate": 0.0001, + "loss": 3.057, + "loss/crossentropy": 2.0470433115959166, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.24679146856069564, + "loss/reg": 0.0, + "step": 28870 + }, + { + "epoch": 0.19, + "grad_norm": 2.921875, + "grad_norm_var": 0.17348531087239583, + "learning_rate": 0.0001, + "loss": 3.1491, + "loss/crossentropy": 2.022314542531967, + "loss/hidden": 2.86875, + "loss/incoh": 0.0, + "loss/logits": 0.23445970118045806, + "loss/reg": 0.0, + "step": 28880 + }, + { + "epoch": 0.19006578947368422, + "grad_norm": 2.515625, + "grad_norm_var": 0.06368815104166667, + "learning_rate": 0.0001, + "loss": 3.0043, + "loss/crossentropy": 2.0953749775886537, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.23144432604312898, + "loss/reg": 0.0, + "step": 28890 + }, + { + "epoch": 0.19013157894736843, + "grad_norm": 2.0, + "grad_norm_var": 0.04041239420572917, + "learning_rate": 0.0001, + "loss": 3.0328, + "loss/crossentropy": 2.2653061032295225, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.23457860052585602, + "loss/reg": 0.0, + "step": 28900 + }, + { + "epoch": 0.19019736842105264, + "grad_norm": 2.359375, + "grad_norm_var": 0.04980061848958333, + "learning_rate": 0.0001, + "loss": 3.0439, + "loss/crossentropy": 2.222883141040802, + "loss/hidden": 3.0453125, + "loss/incoh": 0.0, + "loss/logits": 0.2797882482409477, + "loss/reg": 0.0, + "step": 28910 + }, + { + "epoch": 0.19026315789473683, + "grad_norm": 2.296875, + "grad_norm_var": 0.0666168212890625, + "learning_rate": 0.0001, + "loss": 3.0709, + "loss/crossentropy": 2.3731595158576964, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.21689550429582596, + "loss/reg": 0.0, + "step": 28920 + }, + { + "epoch": 0.19032894736842104, + "grad_norm": 3.5, + "grad_norm_var": 0.15308837890625, + "learning_rate": 0.0001, + "loss": 3.0906, + "loss/crossentropy": 2.3481295704841614, + "loss/hidden": 2.5953125, + "loss/incoh": 0.0, + "loss/logits": 0.20085408240556718, + "loss/reg": 0.0, + "step": 28930 + }, + { + "epoch": 0.19039473684210526, + "grad_norm": 2.59375, + "grad_norm_var": 0.427294667561849, + "learning_rate": 0.0001, + "loss": 3.0422, + "loss/crossentropy": 2.1098085761070253, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.20854987427592278, + "loss/reg": 0.0, + "step": 28940 + }, + { + "epoch": 0.19046052631578947, + "grad_norm": 2.609375, + "grad_norm_var": 2.635267893473307, + "learning_rate": 0.0001, + "loss": 3.0915, + "loss/crossentropy": 2.1917474389076235, + "loss/hidden": 2.7203125, + "loss/incoh": 0.0, + "loss/logits": 0.20776809751987457, + "loss/reg": 0.0, + "step": 28950 + }, + { + "epoch": 0.19052631578947368, + "grad_norm": 2.125, + "grad_norm_var": 0.062235514322916664, + "learning_rate": 0.0001, + "loss": 3.0466, + "loss/crossentropy": 2.0871485114097594, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.21217636987566948, + "loss/reg": 0.0, + "step": 28960 + }, + { + "epoch": 0.1905921052631579, + "grad_norm": 2.1875, + "grad_norm_var": 0.028055826822916668, + "learning_rate": 0.0001, + "loss": 3.0856, + "loss/crossentropy": 2.1593764424324036, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.20974511429667472, + "loss/reg": 0.0, + "step": 28970 + }, + { + "epoch": 0.1906578947368421, + "grad_norm": 2.234375, + "grad_norm_var": 0.09622395833333333, + "learning_rate": 0.0001, + "loss": 2.983, + "loss/crossentropy": 2.2977413177490233, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.25048716068267823, + "loss/reg": 0.0, + "step": 28980 + }, + { + "epoch": 0.19072368421052632, + "grad_norm": 2.4375, + "grad_norm_var": 0.11470947265625, + "learning_rate": 0.0001, + "loss": 3.0439, + "loss/crossentropy": 2.235085117816925, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.2309124991297722, + "loss/reg": 0.0, + "step": 28990 + }, + { + "epoch": 0.19078947368421054, + "grad_norm": 2.28125, + "grad_norm_var": 0.23596598307291666, + "learning_rate": 0.0001, + "loss": 3.0173, + "loss/crossentropy": 2.5475085377693176, + "loss/hidden": 2.7265625, + "loss/incoh": 0.0, + "loss/logits": 0.25446673631668093, + "loss/reg": 0.0, + "step": 29000 + }, + { + "epoch": 0.19085526315789475, + "grad_norm": 2.59375, + "grad_norm_var": 0.18063151041666667, + "learning_rate": 0.0001, + "loss": 3.0599, + "loss/crossentropy": 2.327496898174286, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.2621834442019463, + "loss/reg": 0.0, + "step": 29010 + }, + { + "epoch": 0.19092105263157894, + "grad_norm": 2.546875, + "grad_norm_var": 0.0766754150390625, + "learning_rate": 0.0001, + "loss": 3.0494, + "loss/crossentropy": 2.0966503024101257, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.21290977895259858, + "loss/reg": 0.0, + "step": 29020 + }, + { + "epoch": 0.19098684210526315, + "grad_norm": 2.3125, + "grad_norm_var": 0.0940338134765625, + "learning_rate": 0.0001, + "loss": 3.0697, + "loss/crossentropy": 2.193461000919342, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.2083764299750328, + "loss/reg": 0.0, + "step": 29030 + }, + { + "epoch": 0.19105263157894736, + "grad_norm": 2.484375, + "grad_norm_var": 0.20330785115559896, + "learning_rate": 0.0001, + "loss": 3.0073, + "loss/crossentropy": 2.1486512422561646, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.2170367144048214, + "loss/reg": 0.0, + "step": 29040 + }, + { + "epoch": 0.19111842105263158, + "grad_norm": 2.4375, + "grad_norm_var": 0.05134989420572917, + "learning_rate": 0.0001, + "loss": 2.9996, + "loss/crossentropy": 2.359115946292877, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.2235071614384651, + "loss/reg": 0.0, + "step": 29050 + }, + { + "epoch": 0.1911842105263158, + "grad_norm": 2.140625, + "grad_norm_var": 0.05135498046875, + "learning_rate": 0.0001, + "loss": 3.0619, + "loss/crossentropy": 2.4227041006088257, + "loss/hidden": 2.978125, + "loss/incoh": 0.0, + "loss/logits": 0.316775880753994, + "loss/reg": 0.0, + "step": 29060 + }, + { + "epoch": 0.19125, + "grad_norm": 3.375, + "grad_norm_var": 0.10562235514322917, + "learning_rate": 0.0001, + "loss": 3.0286, + "loss/crossentropy": 2.532416009902954, + "loss/hidden": 2.69375, + "loss/incoh": 0.0, + "loss/logits": 0.2569827824831009, + "loss/reg": 0.0, + "step": 29070 + }, + { + "epoch": 0.19131578947368422, + "grad_norm": 2.21875, + "grad_norm_var": 0.11773173014322917, + "learning_rate": 0.0001, + "loss": 3.0518, + "loss/crossentropy": 2.35415917634964, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.23699928000569342, + "loss/reg": 0.0, + "step": 29080 + }, + { + "epoch": 0.19138157894736843, + "grad_norm": 2.234375, + "grad_norm_var": 0.03673502604166667, + "learning_rate": 0.0001, + "loss": 3.0043, + "loss/crossentropy": 2.044126057624817, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.23696889132261276, + "loss/reg": 0.0, + "step": 29090 + }, + { + "epoch": 0.19144736842105264, + "grad_norm": 2.3125, + "grad_norm_var": 0.08551432291666666, + "learning_rate": 0.0001, + "loss": 3.1223, + "loss/crossentropy": 2.4426622867584227, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.251794858276844, + "loss/reg": 0.0, + "step": 29100 + }, + { + "epoch": 0.19151315789473683, + "grad_norm": 1.9921875, + "grad_norm_var": 0.2218523661295573, + "learning_rate": 0.0001, + "loss": 3.069, + "loss/crossentropy": 2.5686187982559203, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.250452946126461, + "loss/reg": 0.0, + "step": 29110 + }, + { + "epoch": 0.19157894736842104, + "grad_norm": 2.328125, + "grad_norm_var": 0.4294288635253906, + "learning_rate": 0.0001, + "loss": 3.0377, + "loss/crossentropy": 2.2918658018112184, + "loss/hidden": 2.596875, + "loss/incoh": 0.0, + "loss/logits": 0.21649599373340606, + "loss/reg": 0.0, + "step": 29120 + }, + { + "epoch": 0.19164473684210526, + "grad_norm": 2.234375, + "grad_norm_var": 0.028401692708333332, + "learning_rate": 0.0001, + "loss": 3.1086, + "loss/crossentropy": 2.3098790526390074, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.2144581601023674, + "loss/reg": 0.0, + "step": 29130 + }, + { + "epoch": 0.19171052631578947, + "grad_norm": 2.171875, + "grad_norm_var": 0.020048014322916665, + "learning_rate": 0.0001, + "loss": 3.0103, + "loss/crossentropy": 2.470157337188721, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.2766988605260849, + "loss/reg": 0.0, + "step": 29140 + }, + { + "epoch": 0.19177631578947368, + "grad_norm": 2.296875, + "grad_norm_var": 0.02623291015625, + "learning_rate": 0.0001, + "loss": 2.9856, + "loss/crossentropy": 2.317296016216278, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.2372336909174919, + "loss/reg": 0.0, + "step": 29150 + }, + { + "epoch": 0.1918421052631579, + "grad_norm": 2.5, + "grad_norm_var": 0.0629547119140625, + "learning_rate": 0.0001, + "loss": 3.0697, + "loss/crossentropy": 2.1767319798469544, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.2674897871911526, + "loss/reg": 0.0, + "step": 29160 + }, + { + "epoch": 0.1919078947368421, + "grad_norm": 2.46875, + "grad_norm_var": 0.05247294108072917, + "learning_rate": 0.0001, + "loss": 2.9929, + "loss/crossentropy": 2.2281210064888, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.24070803225040435, + "loss/reg": 0.0, + "step": 29170 + }, + { + "epoch": 0.19197368421052632, + "grad_norm": 2.484375, + "grad_norm_var": 0.30436909993489586, + "learning_rate": 0.0001, + "loss": 3.0128, + "loss/crossentropy": 2.2079304993152618, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.20276689082384108, + "loss/reg": 0.0, + "step": 29180 + }, + { + "epoch": 0.19203947368421054, + "grad_norm": 2.28125, + "grad_norm_var": 0.331884765625, + "learning_rate": 0.0001, + "loss": 3.0248, + "loss/crossentropy": 2.3135082483291627, + "loss/hidden": 2.7, + "loss/incoh": 0.0, + "loss/logits": 0.2161063551902771, + "loss/reg": 0.0, + "step": 29190 + }, + { + "epoch": 0.19210526315789472, + "grad_norm": 2.28125, + "grad_norm_var": 0.15064697265625, + "learning_rate": 0.0001, + "loss": 3.0372, + "loss/crossentropy": 2.377930212020874, + "loss/hidden": 2.7234375, + "loss/incoh": 0.0, + "loss/logits": 0.22175273448228836, + "loss/reg": 0.0, + "step": 29200 + }, + { + "epoch": 0.19217105263157894, + "grad_norm": 2.125, + "grad_norm_var": 0.12327372233072917, + "learning_rate": 0.0001, + "loss": 3.1012, + "loss/crossentropy": 2.4531391739845274, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.2535667777061462, + "loss/reg": 0.0, + "step": 29210 + }, + { + "epoch": 0.19223684210526315, + "grad_norm": 2.328125, + "grad_norm_var": 0.07448628743489584, + "learning_rate": 0.0001, + "loss": 3.1458, + "loss/crossentropy": 2.2929787397384644, + "loss/hidden": 2.996875, + "loss/incoh": 0.0, + "loss/logits": 0.25961560308933257, + "loss/reg": 0.0, + "step": 29220 + }, + { + "epoch": 0.19230263157894736, + "grad_norm": 2.4375, + "grad_norm_var": 0.04540608723958333, + "learning_rate": 0.0001, + "loss": 3.0156, + "loss/crossentropy": 2.265605902671814, + "loss/hidden": 2.953125, + "loss/incoh": 0.0, + "loss/logits": 0.27446902841329573, + "loss/reg": 0.0, + "step": 29230 + }, + { + "epoch": 0.19236842105263158, + "grad_norm": 2.328125, + "grad_norm_var": 0.06565348307291667, + "learning_rate": 0.0001, + "loss": 3.1152, + "loss/crossentropy": 2.376231300830841, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.2864221647381783, + "loss/reg": 0.0, + "step": 29240 + }, + { + "epoch": 0.1924342105263158, + "grad_norm": 2.28125, + "grad_norm_var": 0.06280085245768229, + "learning_rate": 0.0001, + "loss": 3.07, + "loss/crossentropy": 2.394097375869751, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.2377302184700966, + "loss/reg": 0.0, + "step": 29250 + }, + { + "epoch": 0.1925, + "grad_norm": 3.40625, + "grad_norm_var": 0.122607421875, + "learning_rate": 0.0001, + "loss": 3.1114, + "loss/crossentropy": 2.5634241580963133, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.22698014378547668, + "loss/reg": 0.0, + "step": 29260 + }, + { + "epoch": 0.19256578947368422, + "grad_norm": 3.171875, + "grad_norm_var": 0.11873372395833333, + "learning_rate": 0.0001, + "loss": 3.0884, + "loss/crossentropy": 2.3539575576782226, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.2572378695011139, + "loss/reg": 0.0, + "step": 29270 + }, + { + "epoch": 0.19263157894736843, + "grad_norm": 2.640625, + "grad_norm_var": 0.14241536458333334, + "learning_rate": 0.0001, + "loss": 3.0603, + "loss/crossentropy": 2.3354499697685243, + "loss/hidden": 3.003125, + "loss/incoh": 0.0, + "loss/logits": 0.2519584596157074, + "loss/reg": 0.0, + "step": 29280 + }, + { + "epoch": 0.19269736842105264, + "grad_norm": 3.5625, + "grad_norm_var": 0.3069081624348958, + "learning_rate": 0.0001, + "loss": 3.0894, + "loss/crossentropy": 2.0824127376079558, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.28106222823262217, + "loss/reg": 0.0, + "step": 29290 + }, + { + "epoch": 0.19276315789473683, + "grad_norm": 2.328125, + "grad_norm_var": 0.1501617431640625, + "learning_rate": 0.0001, + "loss": 3.0121, + "loss/crossentropy": 2.1776723742485045, + "loss/hidden": 2.6359375, + "loss/incoh": 0.0, + "loss/logits": 0.20980774164199828, + "loss/reg": 0.0, + "step": 29300 + }, + { + "epoch": 0.19282894736842104, + "grad_norm": 2.796875, + "grad_norm_var": 0.2937164306640625, + "learning_rate": 0.0001, + "loss": 3.1225, + "loss/crossentropy": 2.157558262348175, + "loss/hidden": 3.0953125, + "loss/incoh": 0.0, + "loss/logits": 0.2775505542755127, + "loss/reg": 0.0, + "step": 29310 + }, + { + "epoch": 0.19289473684210526, + "grad_norm": 3.734375, + "grad_norm_var": 0.33560791015625, + "learning_rate": 0.0001, + "loss": 3.0748, + "loss/crossentropy": 2.381651961803436, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.2246159717440605, + "loss/reg": 0.0, + "step": 29320 + }, + { + "epoch": 0.19296052631578947, + "grad_norm": 2.84375, + "grad_norm_var": 2.1095540364583334, + "learning_rate": 0.0001, + "loss": 3.2928, + "loss/crossentropy": 2.0944801807403564, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.23710142374038695, + "loss/reg": 0.0, + "step": 29330 + }, + { + "epoch": 0.19302631578947368, + "grad_norm": 1.9453125, + "grad_norm_var": 0.2770851135253906, + "learning_rate": 0.0001, + "loss": 3.1, + "loss/crossentropy": 2.4732042074203493, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.23788287490606308, + "loss/reg": 0.0, + "step": 29340 + }, + { + "epoch": 0.1930921052631579, + "grad_norm": 2.046875, + "grad_norm_var": 0.35339330037434896, + "learning_rate": 0.0001, + "loss": 3.0744, + "loss/crossentropy": 2.4158215165138244, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.23991909474134446, + "loss/reg": 0.0, + "step": 29350 + }, + { + "epoch": 0.1931578947368421, + "grad_norm": 4.03125, + "grad_norm_var": 0.3200754801432292, + "learning_rate": 0.0001, + "loss": 3.1003, + "loss/crossentropy": 2.448209798336029, + "loss/hidden": 3.3078125, + "loss/incoh": 0.0, + "loss/logits": 0.3389866009354591, + "loss/reg": 0.0, + "step": 29360 + }, + { + "epoch": 0.19322368421052633, + "grad_norm": 2.796875, + "grad_norm_var": 0.22730712890625, + "learning_rate": 0.0001, + "loss": 3.1074, + "loss/crossentropy": 1.9778976082801818, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.2316925495862961, + "loss/reg": 0.0, + "step": 29370 + }, + { + "epoch": 0.19328947368421054, + "grad_norm": 2.390625, + "grad_norm_var": 0.09205729166666667, + "learning_rate": 0.0001, + "loss": 3.1044, + "loss/crossentropy": 2.344563841819763, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.2164514496922493, + "loss/reg": 0.0, + "step": 29380 + }, + { + "epoch": 0.19335526315789472, + "grad_norm": 2.3125, + "grad_norm_var": 0.07641499837239583, + "learning_rate": 0.0001, + "loss": 3.1445, + "loss/crossentropy": 2.186362612247467, + "loss/hidden": 2.7203125, + "loss/incoh": 0.0, + "loss/logits": 0.21316134929656982, + "loss/reg": 0.0, + "step": 29390 + }, + { + "epoch": 0.19342105263157894, + "grad_norm": 2.265625, + "grad_norm_var": 0.5996897379557292, + "learning_rate": 0.0001, + "loss": 3.11, + "loss/crossentropy": 2.546473169326782, + "loss/hidden": 2.70625, + "loss/incoh": 0.0, + "loss/logits": 0.22721548825502397, + "loss/reg": 0.0, + "step": 29400 + }, + { + "epoch": 0.19348684210526315, + "grad_norm": 2.34375, + "grad_norm_var": 0.6576881408691406, + "learning_rate": 0.0001, + "loss": 2.9865, + "loss/crossentropy": 2.210515594482422, + "loss/hidden": 2.6765625, + "loss/incoh": 0.0, + "loss/logits": 0.20338982120156288, + "loss/reg": 0.0, + "step": 29410 + }, + { + "epoch": 0.19355263157894737, + "grad_norm": 2.46875, + "grad_norm_var": 0.06096572875976562, + "learning_rate": 0.0001, + "loss": 3.0997, + "loss/crossentropy": 2.5028133630752563, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.3376652091741562, + "loss/reg": 0.0, + "step": 29420 + }, + { + "epoch": 0.19361842105263158, + "grad_norm": 2.265625, + "grad_norm_var": 0.25368626912434894, + "learning_rate": 0.0001, + "loss": 3.0177, + "loss/crossentropy": 2.2334850907325743, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.2385883465409279, + "loss/reg": 0.0, + "step": 29430 + }, + { + "epoch": 0.1936842105263158, + "grad_norm": 1.9765625, + "grad_norm_var": 0.24181493123372397, + "learning_rate": 0.0001, + "loss": 3.0405, + "loss/crossentropy": 1.898663866519928, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.20240384489297866, + "loss/reg": 0.0, + "step": 29440 + }, + { + "epoch": 0.19375, + "grad_norm": 2.515625, + "grad_norm_var": 0.03530044555664062, + "learning_rate": 0.0001, + "loss": 3.0841, + "loss/crossentropy": 2.3499651670455934, + "loss/hidden": 2.93125, + "loss/incoh": 0.0, + "loss/logits": 0.27134826183319094, + "loss/reg": 0.0, + "step": 29450 + }, + { + "epoch": 0.19381578947368422, + "grad_norm": 2.28125, + "grad_norm_var": 0.02574462890625, + "learning_rate": 0.0001, + "loss": 3.0395, + "loss/crossentropy": 2.3818777322769167, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.25071706622838974, + "loss/reg": 0.0, + "step": 29460 + }, + { + "epoch": 0.19388157894736843, + "grad_norm": 2.125, + "grad_norm_var": 0.018602498372395835, + "learning_rate": 0.0001, + "loss": 3.0232, + "loss/crossentropy": 2.1610496282577514, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2543530076742172, + "loss/reg": 0.0, + "step": 29470 + }, + { + "epoch": 0.19394736842105262, + "grad_norm": 2.828125, + "grad_norm_var": 0.12858784993489583, + "learning_rate": 0.0001, + "loss": 3.0346, + "loss/crossentropy": 2.3739108026027678, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.22633943632245063, + "loss/reg": 0.0, + "step": 29480 + }, + { + "epoch": 0.19401315789473683, + "grad_norm": 2.1875, + "grad_norm_var": 0.116162109375, + "learning_rate": 0.0001, + "loss": 3.0797, + "loss/crossentropy": 2.3364730775356293, + "loss/hidden": 2.7421875, + "loss/incoh": 0.0, + "loss/logits": 0.2235547423362732, + "loss/reg": 0.0, + "step": 29490 + }, + { + "epoch": 0.19407894736842105, + "grad_norm": 2.5625, + "grad_norm_var": 0.04695612589518229, + "learning_rate": 0.0001, + "loss": 3.0498, + "loss/crossentropy": 2.2981515765190124, + "loss/hidden": 2.9703125, + "loss/incoh": 0.0, + "loss/logits": 0.3249427303671837, + "loss/reg": 0.0, + "step": 29500 + }, + { + "epoch": 0.19414473684210526, + "grad_norm": 2.421875, + "grad_norm_var": 0.06395848592122395, + "learning_rate": 0.0001, + "loss": 3.0062, + "loss/crossentropy": 2.392876994609833, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.2336449593305588, + "loss/reg": 0.0, + "step": 29510 + }, + { + "epoch": 0.19421052631578947, + "grad_norm": 2.3125, + "grad_norm_var": 0.07955322265625, + "learning_rate": 0.0001, + "loss": 3.0426, + "loss/crossentropy": 2.3927242517471314, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.23498377203941345, + "loss/reg": 0.0, + "step": 29520 + }, + { + "epoch": 0.19427631578947369, + "grad_norm": 2.203125, + "grad_norm_var": 0.030354817708333332, + "learning_rate": 0.0001, + "loss": 3.0498, + "loss/crossentropy": 1.9698969006538392, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.2563909277319908, + "loss/reg": 0.0, + "step": 29530 + }, + { + "epoch": 0.1943421052631579, + "grad_norm": 2.703125, + "grad_norm_var": 0.08201395670572917, + "learning_rate": 0.0001, + "loss": 3.0928, + "loss/crossentropy": 2.228722929954529, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.2367407873272896, + "loss/reg": 0.0, + "step": 29540 + }, + { + "epoch": 0.1944078947368421, + "grad_norm": 2.453125, + "grad_norm_var": 0.067626953125, + "learning_rate": 0.0001, + "loss": 3.0422, + "loss/crossentropy": 2.4088454604148866, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.24344853162765503, + "loss/reg": 0.0, + "step": 29550 + }, + { + "epoch": 0.19447368421052633, + "grad_norm": 2.3125, + "grad_norm_var": 0.07860921223958334, + "learning_rate": 0.0001, + "loss": 3.06, + "loss/crossentropy": 2.1968480169773104, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.22112407311797141, + "loss/reg": 0.0, + "step": 29560 + }, + { + "epoch": 0.19453947368421054, + "grad_norm": 2.296875, + "grad_norm_var": 0.2692942301432292, + "learning_rate": 0.0001, + "loss": 3.0565, + "loss/crossentropy": 2.3560967564582826, + "loss/hidden": 3.040625, + "loss/incoh": 0.0, + "loss/logits": 0.2932572916150093, + "loss/reg": 0.0, + "step": 29570 + }, + { + "epoch": 0.19460526315789473, + "grad_norm": 2.109375, + "grad_norm_var": 0.30523859659830727, + "learning_rate": 0.0001, + "loss": 3.0003, + "loss/crossentropy": 2.0988304018974304, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.24731282144784927, + "loss/reg": 0.0, + "step": 29580 + }, + { + "epoch": 0.19467105263157894, + "grad_norm": 2.703125, + "grad_norm_var": 0.055944569905598956, + "learning_rate": 0.0001, + "loss": 3.0501, + "loss/crossentropy": 2.2525598287582396, + "loss/hidden": 2.671875, + "loss/incoh": 0.0, + "loss/logits": 0.20050241351127623, + "loss/reg": 0.0, + "step": 29590 + }, + { + "epoch": 0.19473684210526315, + "grad_norm": 2.421875, + "grad_norm_var": 0.038557942708333334, + "learning_rate": 0.0001, + "loss": 3.0499, + "loss/crossentropy": 2.117692744731903, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.20816649496555328, + "loss/reg": 0.0, + "step": 29600 + }, + { + "epoch": 0.19480263157894737, + "grad_norm": 2.140625, + "grad_norm_var": 0.048628743489583334, + "learning_rate": 0.0001, + "loss": 3.0671, + "loss/crossentropy": 2.2347262859344483, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.24547438323497772, + "loss/reg": 0.0, + "step": 29610 + }, + { + "epoch": 0.19486842105263158, + "grad_norm": 2.171875, + "grad_norm_var": 0.1297027587890625, + "learning_rate": 0.0001, + "loss": 3.1136, + "loss/crossentropy": 2.1894510865211485, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.26874606013298036, + "loss/reg": 0.0, + "step": 29620 + }, + { + "epoch": 0.1949342105263158, + "grad_norm": 2.796875, + "grad_norm_var": 0.11825764973958333, + "learning_rate": 0.0001, + "loss": 3.1032, + "loss/crossentropy": 2.314576745033264, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.2493584305047989, + "loss/reg": 0.0, + "step": 29630 + }, + { + "epoch": 0.195, + "grad_norm": 2.015625, + "grad_norm_var": 0.18583577473958332, + "learning_rate": 0.0001, + "loss": 3.0667, + "loss/crossentropy": 2.548297035694122, + "loss/hidden": 2.69375, + "loss/incoh": 0.0, + "loss/logits": 0.24562474042177201, + "loss/reg": 0.0, + "step": 29640 + }, + { + "epoch": 0.19506578947368422, + "grad_norm": 2.90625, + "grad_norm_var": 0.21096903483072918, + "learning_rate": 0.0001, + "loss": 3.0832, + "loss/crossentropy": 2.685001492500305, + "loss/hidden": 2.8703125, + "loss/incoh": 0.0, + "loss/logits": 0.27619747072458267, + "loss/reg": 0.0, + "step": 29650 + }, + { + "epoch": 0.19513157894736843, + "grad_norm": 3.046875, + "grad_norm_var": 0.10445556640625, + "learning_rate": 0.0001, + "loss": 3.0406, + "loss/crossentropy": 2.471345865726471, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.233747598528862, + "loss/reg": 0.0, + "step": 29660 + }, + { + "epoch": 0.19519736842105262, + "grad_norm": 2.46875, + "grad_norm_var": 0.1791412353515625, + "learning_rate": 0.0001, + "loss": 3.0572, + "loss/crossentropy": 2.378851294517517, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.222721017152071, + "loss/reg": 0.0, + "step": 29670 + }, + { + "epoch": 0.19526315789473683, + "grad_norm": 3.328125, + "grad_norm_var": 0.25057144165039064, + "learning_rate": 0.0001, + "loss": 3.027, + "loss/crossentropy": 2.091909795999527, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.24436787366867066, + "loss/reg": 0.0, + "step": 29680 + }, + { + "epoch": 0.19532894736842105, + "grad_norm": 2.421875, + "grad_norm_var": 0.22474339803059895, + "learning_rate": 0.0001, + "loss": 3.0429, + "loss/crossentropy": 2.328868269920349, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.262276391685009, + "loss/reg": 0.0, + "step": 29690 + }, + { + "epoch": 0.19539473684210526, + "grad_norm": 2.28125, + "grad_norm_var": 0.15204671223958333, + "learning_rate": 0.0001, + "loss": 3.0549, + "loss/crossentropy": 2.296106255054474, + "loss/hidden": 2.9734375, + "loss/incoh": 0.0, + "loss/logits": 0.2577256761491299, + "loss/reg": 0.0, + "step": 29700 + }, + { + "epoch": 0.19546052631578947, + "grad_norm": 2.5625, + "grad_norm_var": 0.20373433430989582, + "learning_rate": 0.0001, + "loss": 3.0571, + "loss/crossentropy": 2.495807719230652, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.2599473804235458, + "loss/reg": 0.0, + "step": 29710 + }, + { + "epoch": 0.1955263157894737, + "grad_norm": 2.1875, + "grad_norm_var": 0.73043212890625, + "learning_rate": 0.0001, + "loss": 3.0451, + "loss/crossentropy": 2.489153337478638, + "loss/hidden": 2.7421875, + "loss/incoh": 0.0, + "loss/logits": 0.2395859479904175, + "loss/reg": 0.0, + "step": 29720 + }, + { + "epoch": 0.1955921052631579, + "grad_norm": 2.3125, + "grad_norm_var": 0.09890034993489584, + "learning_rate": 0.0001, + "loss": 2.9846, + "loss/crossentropy": 2.1876869797706604, + "loss/hidden": 2.684375, + "loss/incoh": 0.0, + "loss/logits": 0.20331611335277558, + "loss/reg": 0.0, + "step": 29730 + }, + { + "epoch": 0.19565789473684211, + "grad_norm": 2.78125, + "grad_norm_var": 0.12862040201822916, + "learning_rate": 0.0001, + "loss": 3.0944, + "loss/crossentropy": 2.548052740097046, + "loss/hidden": 2.6890625, + "loss/incoh": 0.0, + "loss/logits": 0.23612861186265946, + "loss/reg": 0.0, + "step": 29740 + }, + { + "epoch": 0.19572368421052633, + "grad_norm": 2.1875, + "grad_norm_var": 0.038802083333333334, + "learning_rate": 0.0001, + "loss": 3.0041, + "loss/crossentropy": 2.450440764427185, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.20733811408281327, + "loss/reg": 0.0, + "step": 29750 + }, + { + "epoch": 0.1957894736842105, + "grad_norm": 2.671875, + "grad_norm_var": 0.035497029622395836, + "learning_rate": 0.0001, + "loss": 3.0924, + "loss/crossentropy": 2.022493052482605, + "loss/hidden": 3.0265625, + "loss/incoh": 0.0, + "loss/logits": 0.26215304881334306, + "loss/reg": 0.0, + "step": 29760 + }, + { + "epoch": 0.19585526315789473, + "grad_norm": 6.1875, + "grad_norm_var": 0.9495025634765625, + "learning_rate": 0.0001, + "loss": 3.0461, + "loss/crossentropy": 2.2956669092178346, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.24869376718997954, + "loss/reg": 0.0, + "step": 29770 + }, + { + "epoch": 0.19592105263157894, + "grad_norm": 2.40625, + "grad_norm_var": 1.1040974934895833, + "learning_rate": 0.0001, + "loss": 3.0038, + "loss/crossentropy": 2.301337730884552, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.2174853652715683, + "loss/reg": 0.0, + "step": 29780 + }, + { + "epoch": 0.19598684210526315, + "grad_norm": 2.5625, + "grad_norm_var": 0.26876627604166664, + "learning_rate": 0.0001, + "loss": 3.1862, + "loss/crossentropy": 1.7189460813999176, + "loss/hidden": 2.953125, + "loss/incoh": 0.0, + "loss/logits": 0.21047319620847701, + "loss/reg": 0.0, + "step": 29790 + }, + { + "epoch": 0.19605263157894737, + "grad_norm": 2.3125, + "grad_norm_var": 0.042464192708333334, + "learning_rate": 0.0001, + "loss": 3.0449, + "loss/crossentropy": 2.3193385720252992, + "loss/hidden": 2.9546875, + "loss/incoh": 0.0, + "loss/logits": 0.2548700511455536, + "loss/reg": 0.0, + "step": 29800 + }, + { + "epoch": 0.19611842105263158, + "grad_norm": 3.71875, + "grad_norm_var": 0.27180887858072916, + "learning_rate": 0.0001, + "loss": 3.008, + "loss/crossentropy": 2.3117380261421205, + "loss/hidden": 2.646875, + "loss/incoh": 0.0, + "loss/logits": 0.2046105980873108, + "loss/reg": 0.0, + "step": 29810 + }, + { + "epoch": 0.1961842105263158, + "grad_norm": 4.28125, + "grad_norm_var": 0.516106923421224, + "learning_rate": 0.0001, + "loss": 3.0925, + "loss/crossentropy": 2.3800249338150024, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.26439079344272615, + "loss/reg": 0.0, + "step": 29820 + }, + { + "epoch": 0.19625, + "grad_norm": 2.546875, + "grad_norm_var": 0.3475870768229167, + "learning_rate": 0.0001, + "loss": 3.0886, + "loss/crossentropy": 2.284978838264942, + "loss/hidden": 2.9421875, + "loss/incoh": 0.0, + "loss/logits": 0.24723461624234916, + "loss/reg": 0.0, + "step": 29830 + }, + { + "epoch": 0.19631578947368422, + "grad_norm": 2.25, + "grad_norm_var": 0.29708836873372396, + "learning_rate": 0.0001, + "loss": 3.0244, + "loss/crossentropy": 2.231731951236725, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.3149738535284996, + "loss/reg": 0.0, + "step": 29840 + }, + { + "epoch": 0.19638157894736843, + "grad_norm": 2.296875, + "grad_norm_var": 0.287847646077474, + "learning_rate": 0.0001, + "loss": 3.0728, + "loss/crossentropy": 2.453169012069702, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.25355182588100433, + "loss/reg": 0.0, + "step": 29850 + }, + { + "epoch": 0.19644736842105262, + "grad_norm": 2.296875, + "grad_norm_var": 0.2615386962890625, + "learning_rate": 0.0001, + "loss": 3.0352, + "loss/crossentropy": 2.174148201942444, + "loss/hidden": 2.66875, + "loss/incoh": 0.0, + "loss/logits": 0.2032485894858837, + "loss/reg": 0.0, + "step": 29860 + }, + { + "epoch": 0.19651315789473683, + "grad_norm": 2.6875, + "grad_norm_var": 0.25632705688476565, + "learning_rate": 0.0001, + "loss": 3.0585, + "loss/crossentropy": 2.483177053928375, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.2651928335428238, + "loss/reg": 0.0, + "step": 29870 + }, + { + "epoch": 0.19657894736842105, + "grad_norm": 2.140625, + "grad_norm_var": 0.12946370442708333, + "learning_rate": 0.0001, + "loss": 3.0076, + "loss/crossentropy": 2.4216135859489443, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.2323567435145378, + "loss/reg": 0.0, + "step": 29880 + }, + { + "epoch": 0.19664473684210526, + "grad_norm": 2.1875, + "grad_norm_var": 0.07423477172851563, + "learning_rate": 0.0001, + "loss": 2.9992, + "loss/crossentropy": 2.43337767124176, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.2374792516231537, + "loss/reg": 0.0, + "step": 29890 + }, + { + "epoch": 0.19671052631578947, + "grad_norm": 2.609375, + "grad_norm_var": 0.09332275390625, + "learning_rate": 0.0001, + "loss": 3.1552, + "loss/crossentropy": 2.066901612281799, + "loss/hidden": 2.9546875, + "loss/incoh": 0.0, + "loss/logits": 0.2512317180633545, + "loss/reg": 0.0, + "step": 29900 + }, + { + "epoch": 0.1967763157894737, + "grad_norm": 2.265625, + "grad_norm_var": 0.3540598551432292, + "learning_rate": 0.0001, + "loss": 3.0271, + "loss/crossentropy": 2.4260810136795046, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.26536442786455156, + "loss/reg": 0.0, + "step": 29910 + }, + { + "epoch": 0.1968421052631579, + "grad_norm": 2.4375, + "grad_norm_var": 0.10589090983072917, + "learning_rate": 0.0001, + "loss": 3.0213, + "loss/crossentropy": 2.4159180164337157, + "loss/hidden": 2.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.2190088465809822, + "loss/reg": 0.0, + "step": 29920 + }, + { + "epoch": 0.19690789473684212, + "grad_norm": 2.703125, + "grad_norm_var": 0.03914286295572917, + "learning_rate": 0.0001, + "loss": 3.0847, + "loss/crossentropy": 2.061979150772095, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.20891676545143129, + "loss/reg": 0.0, + "step": 29930 + }, + { + "epoch": 0.19697368421052633, + "grad_norm": 3.625, + "grad_norm_var": 0.14216206868489584, + "learning_rate": 0.0001, + "loss": 3.0243, + "loss/crossentropy": 2.06211262345314, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.22453068271279336, + "loss/reg": 0.0, + "step": 29940 + }, + { + "epoch": 0.19703947368421051, + "grad_norm": 2.453125, + "grad_norm_var": 0.13738505045572916, + "learning_rate": 0.0001, + "loss": 2.9814, + "loss/crossentropy": 2.1103883236646652, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.1844348356127739, + "loss/reg": 0.0, + "step": 29950 + }, + { + "epoch": 0.19710526315789473, + "grad_norm": 2.1875, + "grad_norm_var": 0.026537068684895835, + "learning_rate": 0.0001, + "loss": 3.0008, + "loss/crossentropy": 2.553321826457977, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.21997541338205337, + "loss/reg": 0.0, + "step": 29960 + }, + { + "epoch": 0.19717105263157894, + "grad_norm": 2.109375, + "grad_norm_var": 0.02939453125, + "learning_rate": 0.0001, + "loss": 3.0382, + "loss/crossentropy": 2.4657467365264893, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.2679098337888718, + "loss/reg": 0.0, + "step": 29970 + }, + { + "epoch": 0.19723684210526315, + "grad_norm": 2.328125, + "grad_norm_var": 0.32203369140625, + "learning_rate": 0.0001, + "loss": 3.083, + "loss/crossentropy": 2.2925270318984987, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.2816790759563446, + "loss/reg": 0.0, + "step": 29980 + }, + { + "epoch": 0.19730263157894737, + "grad_norm": 2.828125, + "grad_norm_var": 0.3014149983723958, + "learning_rate": 0.0001, + "loss": 3.0358, + "loss/crossentropy": 2.652257299423218, + "loss/hidden": 2.665625, + "loss/incoh": 0.0, + "loss/logits": 0.23255081921815873, + "loss/reg": 0.0, + "step": 29990 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 2.171875, + "grad_norm_var": 0.061799112955729166, + "learning_rate": 0.0001, + "loss": 2.9585, + "loss/crossentropy": 2.3400336265563966, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.2366102933883667, + "loss/reg": 0.0, + "step": 30000 + }, + { + "epoch": 0.1974342105263158, + "grad_norm": 2.578125, + "grad_norm_var": 0.06998672485351562, + "learning_rate": 0.0001, + "loss": 3.0787, + "loss/crossentropy": 2.2949488759040833, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.24081108272075652, + "loss/reg": 0.0, + "step": 30010 + }, + { + "epoch": 0.1975, + "grad_norm": 2.359375, + "grad_norm_var": 0.41442769368489585, + "learning_rate": 0.0001, + "loss": 3.0288, + "loss/crossentropy": 2.414297103881836, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.23484020233154296, + "loss/reg": 0.0, + "step": 30020 + }, + { + "epoch": 0.19756578947368422, + "grad_norm": 2.21875, + "grad_norm_var": 0.18356704711914062, + "learning_rate": 0.0001, + "loss": 3.0326, + "loss/crossentropy": 2.300756549835205, + "loss/hidden": 2.5453125, + "loss/incoh": 0.0, + "loss/logits": 0.19037204384803771, + "loss/reg": 0.0, + "step": 30030 + }, + { + "epoch": 0.1976315789473684, + "grad_norm": 2.46875, + "grad_norm_var": 0.07932942708333333, + "learning_rate": 0.0001, + "loss": 3.0541, + "loss/crossentropy": 2.5434207677841187, + "loss/hidden": 2.6859375, + "loss/incoh": 0.0, + "loss/logits": 0.2503844425082207, + "loss/reg": 0.0, + "step": 30040 + }, + { + "epoch": 0.19769736842105262, + "grad_norm": 2.453125, + "grad_norm_var": 0.03378499348958333, + "learning_rate": 0.0001, + "loss": 3.0416, + "loss/crossentropy": 2.435779368877411, + "loss/hidden": 2.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.22367140799760818, + "loss/reg": 0.0, + "step": 30050 + }, + { + "epoch": 0.19776315789473684, + "grad_norm": 2.40625, + "grad_norm_var": 0.11531575520833333, + "learning_rate": 0.0001, + "loss": 3.0175, + "loss/crossentropy": 2.614890933036804, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.2448316603899002, + "loss/reg": 0.0, + "step": 30060 + }, + { + "epoch": 0.19782894736842105, + "grad_norm": 2.1875, + "grad_norm_var": 0.13720703125, + "learning_rate": 0.0001, + "loss": 3.0335, + "loss/crossentropy": 2.0392467260360716, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.22009356170892716, + "loss/reg": 0.0, + "step": 30070 + }, + { + "epoch": 0.19789473684210526, + "grad_norm": 2.203125, + "grad_norm_var": 0.028351847330729166, + "learning_rate": 0.0001, + "loss": 3.0004, + "loss/crossentropy": 2.463162088394165, + "loss/hidden": 2.6109375, + "loss/incoh": 0.0, + "loss/logits": 0.2216602995991707, + "loss/reg": 0.0, + "step": 30080 + }, + { + "epoch": 0.19796052631578948, + "grad_norm": 1.859375, + "grad_norm_var": 0.05168863932291667, + "learning_rate": 0.0001, + "loss": 2.9309, + "loss/crossentropy": 2.386566638946533, + "loss/hidden": 2.665625, + "loss/incoh": 0.0, + "loss/logits": 0.23080737441778182, + "loss/reg": 0.0, + "step": 30090 + }, + { + "epoch": 0.1980263157894737, + "grad_norm": 2.28125, + "grad_norm_var": 0.40987955729166664, + "learning_rate": 0.0001, + "loss": 2.9946, + "loss/crossentropy": 2.5444513320922852, + "loss/hidden": 2.6296875, + "loss/incoh": 0.0, + "loss/logits": 0.22146540880203247, + "loss/reg": 0.0, + "step": 30100 + }, + { + "epoch": 0.1980921052631579, + "grad_norm": 2.328125, + "grad_norm_var": 0.45288263956705727, + "learning_rate": 0.0001, + "loss": 3.0193, + "loss/crossentropy": 2.485434365272522, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.23654820621013642, + "loss/reg": 0.0, + "step": 30110 + }, + { + "epoch": 0.19815789473684212, + "grad_norm": 2.3125, + "grad_norm_var": 0.20391820271809896, + "learning_rate": 0.0001, + "loss": 3.0528, + "loss/crossentropy": 2.378788614273071, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.2376804992556572, + "loss/reg": 0.0, + "step": 30120 + }, + { + "epoch": 0.1982236842105263, + "grad_norm": 2.25, + "grad_norm_var": 0.10331624348958333, + "learning_rate": 0.0001, + "loss": 3.0188, + "loss/crossentropy": 2.154638743400574, + "loss/hidden": 2.6875, + "loss/incoh": 0.0, + "loss/logits": 0.20261103957891463, + "loss/reg": 0.0, + "step": 30130 + }, + { + "epoch": 0.19828947368421052, + "grad_norm": 2.40625, + "grad_norm_var": 0.08589655558268229, + "learning_rate": 0.0001, + "loss": 2.9379, + "loss/crossentropy": 2.2816161513328552, + "loss/hidden": 2.6921875, + "loss/incoh": 0.0, + "loss/logits": 0.19446674287319182, + "loss/reg": 0.0, + "step": 30140 + }, + { + "epoch": 0.19835526315789473, + "grad_norm": 3.03125, + "grad_norm_var": 0.10558980305989583, + "learning_rate": 0.0001, + "loss": 3.0039, + "loss/crossentropy": 2.333029532432556, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.21476658284664155, + "loss/reg": 0.0, + "step": 30150 + }, + { + "epoch": 0.19842105263157894, + "grad_norm": 2.390625, + "grad_norm_var": 0.08321940104166667, + "learning_rate": 0.0001, + "loss": 2.9808, + "loss/crossentropy": 2.155623471736908, + "loss/hidden": 2.5859375, + "loss/incoh": 0.0, + "loss/logits": 0.18124678283929824, + "loss/reg": 0.0, + "step": 30160 + }, + { + "epoch": 0.19848684210526316, + "grad_norm": 1.9453125, + "grad_norm_var": 0.13827311197916667, + "learning_rate": 0.0001, + "loss": 3.0116, + "loss/crossentropy": 2.109735357761383, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.24903104603290557, + "loss/reg": 0.0, + "step": 30170 + }, + { + "epoch": 0.19855263157894737, + "grad_norm": 2.375, + "grad_norm_var": 0.040135701497395836, + "learning_rate": 0.0001, + "loss": 3.0087, + "loss/crossentropy": 2.290113925933838, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.21611811220645905, + "loss/reg": 0.0, + "step": 30180 + }, + { + "epoch": 0.19861842105263158, + "grad_norm": 2.390625, + "grad_norm_var": 0.04804280598958333, + "learning_rate": 0.0001, + "loss": 2.995, + "loss/crossentropy": 1.9855021834373474, + "loss/hidden": 2.6984375, + "loss/incoh": 0.0, + "loss/logits": 0.1928846351802349, + "loss/reg": 0.0, + "step": 30190 + }, + { + "epoch": 0.1986842105263158, + "grad_norm": 2.265625, + "grad_norm_var": 0.059366607666015626, + "learning_rate": 0.0001, + "loss": 3.0265, + "loss/crossentropy": 2.213775265216827, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.2045104242861271, + "loss/reg": 0.0, + "step": 30200 + }, + { + "epoch": 0.19875, + "grad_norm": 3.203125, + "grad_norm_var": 0.10831883748372396, + "learning_rate": 0.0001, + "loss": 3.0971, + "loss/crossentropy": 2.2141775846481324, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.2511264935135841, + "loss/reg": 0.0, + "step": 30210 + }, + { + "epoch": 0.19881578947368422, + "grad_norm": 2.265625, + "grad_norm_var": 0.18625895182291666, + "learning_rate": 0.0001, + "loss": 3.1376, + "loss/crossentropy": 2.201507192850113, + "loss/hidden": 3.01875, + "loss/incoh": 0.0, + "loss/logits": 0.27276814319193365, + "loss/reg": 0.0, + "step": 30220 + }, + { + "epoch": 0.1988815789473684, + "grad_norm": 2.71875, + "grad_norm_var": 0.1700347900390625, + "learning_rate": 0.0001, + "loss": 3.0325, + "loss/crossentropy": 2.4042665481567385, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.22706928849220276, + "loss/reg": 0.0, + "step": 30230 + }, + { + "epoch": 0.19894736842105262, + "grad_norm": 2.671875, + "grad_norm_var": 0.18758036295572916, + "learning_rate": 0.0001, + "loss": 3.049, + "loss/crossentropy": 2.0507681727409364, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.22476800307631492, + "loss/reg": 0.0, + "step": 30240 + }, + { + "epoch": 0.19901315789473684, + "grad_norm": 2.5, + "grad_norm_var": 0.07414957682291666, + "learning_rate": 0.0001, + "loss": 2.9881, + "loss/crossentropy": 2.2702227234840393, + "loss/hidden": 2.69375, + "loss/incoh": 0.0, + "loss/logits": 0.2327321708202362, + "loss/reg": 0.0, + "step": 30250 + }, + { + "epoch": 0.19907894736842105, + "grad_norm": 3.40625, + "grad_norm_var": 0.29836832682291664, + "learning_rate": 0.0001, + "loss": 3.1337, + "loss/crossentropy": 2.3777806520462037, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.2500802963972092, + "loss/reg": 0.0, + "step": 30260 + }, + { + "epoch": 0.19914473684210526, + "grad_norm": 2.4375, + "grad_norm_var": 0.27034403483072916, + "learning_rate": 0.0001, + "loss": 3.0123, + "loss/crossentropy": 2.431059980392456, + "loss/hidden": 2.965625, + "loss/incoh": 0.0, + "loss/logits": 0.2794636771082878, + "loss/reg": 0.0, + "step": 30270 + }, + { + "epoch": 0.19921052631578948, + "grad_norm": 2.515625, + "grad_norm_var": 0.022591145833333333, + "learning_rate": 0.0001, + "loss": 3.0564, + "loss/crossentropy": 2.491440224647522, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.27763516157865525, + "loss/reg": 0.0, + "step": 30280 + }, + { + "epoch": 0.1992763157894737, + "grad_norm": 2.640625, + "grad_norm_var": 0.24989827473958334, + "learning_rate": 0.0001, + "loss": 3.0591, + "loss/crossentropy": 2.443405735492706, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.2293305829167366, + "loss/reg": 0.0, + "step": 30290 + }, + { + "epoch": 0.1993421052631579, + "grad_norm": 2.296875, + "grad_norm_var": 0.044205729166666666, + "learning_rate": 0.0001, + "loss": 2.9622, + "loss/crossentropy": 2.2449229061603546, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.24314892143011094, + "loss/reg": 0.0, + "step": 30300 + }, + { + "epoch": 0.19940789473684212, + "grad_norm": 2.453125, + "grad_norm_var": 0.03411458333333333, + "learning_rate": 0.0001, + "loss": 3.038, + "loss/crossentropy": 2.5837315797805784, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.25667293965816496, + "loss/reg": 0.0, + "step": 30310 + }, + { + "epoch": 0.1994736842105263, + "grad_norm": 2.46875, + "grad_norm_var": 0.03482666015625, + "learning_rate": 0.0001, + "loss": 3.03, + "loss/crossentropy": 2.4695034623146057, + "loss/hidden": 2.6765625, + "loss/incoh": 0.0, + "loss/logits": 0.24289859235286712, + "loss/reg": 0.0, + "step": 30320 + }, + { + "epoch": 0.19953947368421052, + "grad_norm": 2.75, + "grad_norm_var": 0.08709309895833334, + "learning_rate": 0.0001, + "loss": 2.9926, + "loss/crossentropy": 2.263628613948822, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.25994745194911956, + "loss/reg": 0.0, + "step": 30330 + }, + { + "epoch": 0.19960526315789473, + "grad_norm": 2.40625, + "grad_norm_var": 0.12483317057291667, + "learning_rate": 0.0001, + "loss": 3.0437, + "loss/crossentropy": 2.263992178440094, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.22506246864795684, + "loss/reg": 0.0, + "step": 30340 + }, + { + "epoch": 0.19967105263157894, + "grad_norm": 2.0, + "grad_norm_var": 0.09973042805989583, + "learning_rate": 0.0001, + "loss": 3.0212, + "loss/crossentropy": 2.4082891941070557, + "loss/hidden": 2.6890625, + "loss/incoh": 0.0, + "loss/logits": 0.24035572707653047, + "loss/reg": 0.0, + "step": 30350 + }, + { + "epoch": 0.19973684210526316, + "grad_norm": 2.125, + "grad_norm_var": 0.04556884765625, + "learning_rate": 0.0001, + "loss": 3.0589, + "loss/crossentropy": 2.2902461647987367, + "loss/hidden": 2.671875, + "loss/incoh": 0.0, + "loss/logits": 0.20010970458388327, + "loss/reg": 0.0, + "step": 30360 + }, + { + "epoch": 0.19980263157894737, + "grad_norm": 2.234375, + "grad_norm_var": 0.047098795572916664, + "learning_rate": 0.0001, + "loss": 3.0519, + "loss/crossentropy": 2.260884428024292, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.23876612037420272, + "loss/reg": 0.0, + "step": 30370 + }, + { + "epoch": 0.19986842105263158, + "grad_norm": 2.203125, + "grad_norm_var": 0.17197265625, + "learning_rate": 0.0001, + "loss": 3.0596, + "loss/crossentropy": 2.293976974487305, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.24686284512281417, + "loss/reg": 0.0, + "step": 30380 + }, + { + "epoch": 0.1999342105263158, + "grad_norm": 1.96875, + "grad_norm_var": 0.4962565104166667, + "learning_rate": 0.0001, + "loss": 3.0143, + "loss/crossentropy": 2.0594027996063233, + "loss/hidden": 2.8625, + "loss/incoh": 0.0, + "loss/logits": 0.2534930631518364, + "loss/reg": 0.0, + "step": 30390 + }, + { + "epoch": 0.2, + "grad_norm": 2.140625, + "grad_norm_var": 0.5117421468098958, + "learning_rate": 0.0001, + "loss": 3.0243, + "loss/crossentropy": 2.2039811968803407, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.2378777429461479, + "loss/reg": 0.0, + "step": 30400 + }, + { + "epoch": 0.2000657894736842, + "grad_norm": 2.34375, + "grad_norm_var": 0.06318257649739584, + "learning_rate": 0.0001, + "loss": 3.0318, + "loss/crossentropy": 2.306100535392761, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.25150387436151506, + "loss/reg": 0.0, + "step": 30410 + }, + { + "epoch": 0.2001315789473684, + "grad_norm": 2.171875, + "grad_norm_var": 0.048249308268229166, + "learning_rate": 0.0001, + "loss": 3.0054, + "loss/crossentropy": 2.154050374031067, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.23610089272260665, + "loss/reg": 0.0, + "step": 30420 + }, + { + "epoch": 0.20019736842105262, + "grad_norm": 2.09375, + "grad_norm_var": 0.1066314697265625, + "learning_rate": 0.0001, + "loss": 3.0415, + "loss/crossentropy": 2.3580272018909456, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.21100984290242195, + "loss/reg": 0.0, + "step": 30430 + }, + { + "epoch": 0.20026315789473684, + "grad_norm": 2.40625, + "grad_norm_var": 0.10287984212239583, + "learning_rate": 0.0001, + "loss": 3.021, + "loss/crossentropy": 2.3812718391418457, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.2449337661266327, + "loss/reg": 0.0, + "step": 30440 + }, + { + "epoch": 0.20032894736842105, + "grad_norm": 2.421875, + "grad_norm_var": 0.05493876139322917, + "learning_rate": 0.0001, + "loss": 2.9937, + "loss/crossentropy": 2.406654155254364, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.28666631430387496, + "loss/reg": 0.0, + "step": 30450 + }, + { + "epoch": 0.20039473684210526, + "grad_norm": 2.140625, + "grad_norm_var": 0.0645904541015625, + "learning_rate": 0.0001, + "loss": 3.1228, + "loss/crossentropy": 2.3998040676116945, + "loss/hidden": 2.9015625, + "loss/incoh": 0.0, + "loss/logits": 0.2500763192772865, + "loss/reg": 0.0, + "step": 30460 + }, + { + "epoch": 0.20046052631578948, + "grad_norm": 2.234375, + "grad_norm_var": 0.1094146728515625, + "learning_rate": 0.0001, + "loss": 2.9741, + "loss/crossentropy": 2.1604729771614073, + "loss/hidden": 2.590625, + "loss/incoh": 0.0, + "loss/logits": 0.19615808725357056, + "loss/reg": 0.0, + "step": 30470 + }, + { + "epoch": 0.2005263157894737, + "grad_norm": 2.359375, + "grad_norm_var": 0.1017974853515625, + "learning_rate": 0.0001, + "loss": 3.0351, + "loss/crossentropy": 2.3099865555763244, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.2315102458000183, + "loss/reg": 0.0, + "step": 30480 + }, + { + "epoch": 0.2005921052631579, + "grad_norm": 2.4375, + "grad_norm_var": 0.41324462890625, + "learning_rate": 0.0001, + "loss": 3.0974, + "loss/crossentropy": 2.3289413452148438, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2756644278764725, + "loss/reg": 0.0, + "step": 30490 + }, + { + "epoch": 0.20065789473684212, + "grad_norm": 2.390625, + "grad_norm_var": 1.2159505208333334, + "learning_rate": 0.0001, + "loss": 2.9922, + "loss/crossentropy": 2.2126728296279907, + "loss/hidden": 2.8203125, + "loss/incoh": 0.0, + "loss/logits": 0.22418673560023308, + "loss/reg": 0.0, + "step": 30500 + }, + { + "epoch": 0.2007236842105263, + "grad_norm": 2.515625, + "grad_norm_var": 0.9500343322753906, + "learning_rate": 0.0001, + "loss": 2.9726, + "loss/crossentropy": 2.3763804376125335, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.22033569663763047, + "loss/reg": 0.0, + "step": 30510 + }, + { + "epoch": 0.20078947368421052, + "grad_norm": 2.09375, + "grad_norm_var": 0.14224853515625, + "learning_rate": 0.0001, + "loss": 3.0223, + "loss/crossentropy": 2.400152790546417, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.21929119899868965, + "loss/reg": 0.0, + "step": 30520 + }, + { + "epoch": 0.20085526315789473, + "grad_norm": 2.09375, + "grad_norm_var": 103.514990234375, + "learning_rate": 0.0001, + "loss": 3.0504, + "loss/crossentropy": 2.1772961974143983, + "loss/hidden": 3.1234375, + "loss/incoh": 0.0, + "loss/logits": 0.23286587446928025, + "loss/reg": 0.0, + "step": 30530 + }, + { + "epoch": 0.20092105263157894, + "grad_norm": 2.53125, + "grad_norm_var": 0.11510391235351562, + "learning_rate": 0.0001, + "loss": 3.0667, + "loss/crossentropy": 2.2763838887214662, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.22003234028816224, + "loss/reg": 0.0, + "step": 30540 + }, + { + "epoch": 0.20098684210526316, + "grad_norm": 2.203125, + "grad_norm_var": 0.08960367838541666, + "learning_rate": 0.0001, + "loss": 2.9552, + "loss/crossentropy": 2.418367159366608, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.2519043207168579, + "loss/reg": 0.0, + "step": 30550 + }, + { + "epoch": 0.20105263157894737, + "grad_norm": 2.484375, + "grad_norm_var": 0.07086588541666666, + "learning_rate": 0.0001, + "loss": 3.0293, + "loss/crossentropy": 2.4948137521743776, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.24468859434127807, + "loss/reg": 0.0, + "step": 30560 + }, + { + "epoch": 0.20111842105263159, + "grad_norm": 4.21875, + "grad_norm_var": 0.2363677978515625, + "learning_rate": 0.0001, + "loss": 3.0781, + "loss/crossentropy": 2.697233200073242, + "loss/hidden": 2.9953125, + "loss/incoh": 0.0, + "loss/logits": 0.2648474559187889, + "loss/reg": 0.0, + "step": 30570 + }, + { + "epoch": 0.2011842105263158, + "grad_norm": 2.296875, + "grad_norm_var": 0.26945699055989586, + "learning_rate": 0.0001, + "loss": 2.9756, + "loss/crossentropy": 2.3618945240974427, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.2500054851174355, + "loss/reg": 0.0, + "step": 30580 + }, + { + "epoch": 0.20125, + "grad_norm": 2.78125, + "grad_norm_var": 0.09013264973958333, + "learning_rate": 0.0001, + "loss": 3.0325, + "loss/crossentropy": 2.1298818975687026, + "loss/hidden": 2.5125, + "loss/incoh": 0.0, + "loss/logits": 0.1715396959334612, + "loss/reg": 0.0, + "step": 30590 + }, + { + "epoch": 0.2013157894736842, + "grad_norm": 2.296875, + "grad_norm_var": 0.09729715983072916, + "learning_rate": 0.0001, + "loss": 3.0119, + "loss/crossentropy": 2.204944038391113, + "loss/hidden": 2.790625, + "loss/incoh": 0.0, + "loss/logits": 0.21538405269384384, + "loss/reg": 0.0, + "step": 30600 + }, + { + "epoch": 0.2013815789473684, + "grad_norm": 2.296875, + "grad_norm_var": 0.03004150390625, + "learning_rate": 0.0001, + "loss": 2.9775, + "loss/crossentropy": 2.4482433080673216, + "loss/hidden": 2.6703125, + "loss/incoh": 0.0, + "loss/logits": 0.22656193226575852, + "loss/reg": 0.0, + "step": 30610 + }, + { + "epoch": 0.20144736842105262, + "grad_norm": 2.234375, + "grad_norm_var": 0.03572769165039062, + "learning_rate": 0.0001, + "loss": 3.0182, + "loss/crossentropy": 2.0842928767204283, + "loss/hidden": 2.6734375, + "loss/incoh": 0.0, + "loss/logits": 0.2059495523571968, + "loss/reg": 0.0, + "step": 30620 + }, + { + "epoch": 0.20151315789473684, + "grad_norm": 2.3125, + "grad_norm_var": 0.1525286356608073, + "learning_rate": 0.0001, + "loss": 3.0799, + "loss/crossentropy": 2.3205880761146545, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.22585390508174896, + "loss/reg": 0.0, + "step": 30630 + }, + { + "epoch": 0.20157894736842105, + "grad_norm": 2.453125, + "grad_norm_var": 0.08941650390625, + "learning_rate": 0.0001, + "loss": 3.0334, + "loss/crossentropy": 2.287918508052826, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.23107146471738815, + "loss/reg": 0.0, + "step": 30640 + }, + { + "epoch": 0.20164473684210527, + "grad_norm": 2.109375, + "grad_norm_var": 0.0238433837890625, + "learning_rate": 0.0001, + "loss": 2.9747, + "loss/crossentropy": 2.4189619421958923, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.2165643572807312, + "loss/reg": 0.0, + "step": 30650 + }, + { + "epoch": 0.20171052631578948, + "grad_norm": 2.375, + "grad_norm_var": 0.07734349568684896, + "learning_rate": 0.0001, + "loss": 2.9809, + "loss/crossentropy": 2.484665501117706, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.25413639694452284, + "loss/reg": 0.0, + "step": 30660 + }, + { + "epoch": 0.2017763157894737, + "grad_norm": 2.328125, + "grad_norm_var": 0.13760477701822918, + "learning_rate": 0.0001, + "loss": 3.0598, + "loss/crossentropy": 1.993758463859558, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.23019451200962066, + "loss/reg": 0.0, + "step": 30670 + }, + { + "epoch": 0.2018421052631579, + "grad_norm": 2.25, + "grad_norm_var": 0.16373697916666666, + "learning_rate": 0.0001, + "loss": 3.0066, + "loss/crossentropy": 2.508138656616211, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.29257247895002364, + "loss/reg": 0.0, + "step": 30680 + }, + { + "epoch": 0.2019078947368421, + "grad_norm": 2.765625, + "grad_norm_var": 0.16415786743164062, + "learning_rate": 0.0001, + "loss": 3.0694, + "loss/crossentropy": 2.178850865364075, + "loss/hidden": 2.6546875, + "loss/incoh": 0.0, + "loss/logits": 0.19227326661348343, + "loss/reg": 0.0, + "step": 30690 + }, + { + "epoch": 0.2019736842105263, + "grad_norm": 2.015625, + "grad_norm_var": 0.12350234985351563, + "learning_rate": 0.0001, + "loss": 3.0484, + "loss/crossentropy": 2.395913541316986, + "loss/hidden": 2.7203125, + "loss/incoh": 0.0, + "loss/logits": 0.23005425035953522, + "loss/reg": 0.0, + "step": 30700 + }, + { + "epoch": 0.20203947368421052, + "grad_norm": 2.15625, + "grad_norm_var": 0.07646077473958333, + "learning_rate": 0.0001, + "loss": 2.9845, + "loss/crossentropy": 2.0765319585800173, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.21056767106056212, + "loss/reg": 0.0, + "step": 30710 + }, + { + "epoch": 0.20210526315789473, + "grad_norm": 2.34375, + "grad_norm_var": 0.14062093098958334, + "learning_rate": 0.0001, + "loss": 3.0582, + "loss/crossentropy": 2.1704251885414125, + "loss/hidden": 2.7296875, + "loss/incoh": 0.0, + "loss/logits": 0.217359322309494, + "loss/reg": 0.0, + "step": 30720 + }, + { + "epoch": 0.20217105263157895, + "grad_norm": 2.390625, + "grad_norm_var": 0.1090484619140625, + "learning_rate": 0.0001, + "loss": 3.0285, + "loss/crossentropy": 2.1888681292533874, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.23789106756448747, + "loss/reg": 0.0, + "step": 30730 + }, + { + "epoch": 0.20223684210526316, + "grad_norm": 1.9921875, + "grad_norm_var": 0.2569536844889323, + "learning_rate": 0.0001, + "loss": 3.0386, + "loss/crossentropy": 2.161084806919098, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.23299970030784606, + "loss/reg": 0.0, + "step": 30740 + }, + { + "epoch": 0.20230263157894737, + "grad_norm": 2.046875, + "grad_norm_var": 0.14261245727539062, + "learning_rate": 0.0001, + "loss": 2.9869, + "loss/crossentropy": 2.4508134722709656, + "loss/hidden": 2.6078125, + "loss/incoh": 0.0, + "loss/logits": 0.20573023706674576, + "loss/reg": 0.0, + "step": 30750 + }, + { + "epoch": 0.2023684210526316, + "grad_norm": 2.734375, + "grad_norm_var": 0.08206761678059896, + "learning_rate": 0.0001, + "loss": 3.1039, + "loss/crossentropy": 2.5278658986091616, + "loss/hidden": 2.909375, + "loss/incoh": 0.0, + "loss/logits": 0.2716485261917114, + "loss/reg": 0.0, + "step": 30760 + }, + { + "epoch": 0.2024342105263158, + "grad_norm": 2.3125, + "grad_norm_var": 0.14897435506184895, + "learning_rate": 0.0001, + "loss": 3.0202, + "loss/crossentropy": 2.2195838689804077, + "loss/hidden": 2.9484375, + "loss/incoh": 0.0, + "loss/logits": 0.29928734600543977, + "loss/reg": 0.0, + "step": 30770 + }, + { + "epoch": 0.2025, + "grad_norm": 2.265625, + "grad_norm_var": 0.3571248372395833, + "learning_rate": 0.0001, + "loss": 2.9956, + "loss/crossentropy": 1.9182006061077117, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.2259790524840355, + "loss/reg": 0.0, + "step": 30780 + }, + { + "epoch": 0.2025657894736842, + "grad_norm": 2.359375, + "grad_norm_var": 0.3158365885416667, + "learning_rate": 0.0001, + "loss": 3.043, + "loss/crossentropy": 2.2211042761802675, + "loss/hidden": 2.690625, + "loss/incoh": 0.0, + "loss/logits": 0.23300761282444, + "loss/reg": 0.0, + "step": 30790 + }, + { + "epoch": 0.2026315789473684, + "grad_norm": 2.546875, + "grad_norm_var": 0.1349609375, + "learning_rate": 0.0001, + "loss": 3.046, + "loss/crossentropy": 2.2689759850502016, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.22895248532295226, + "loss/reg": 0.0, + "step": 30800 + }, + { + "epoch": 0.20269736842105263, + "grad_norm": 2.484375, + "grad_norm_var": 0.03440653483072917, + "learning_rate": 0.0001, + "loss": 2.9736, + "loss/crossentropy": 2.2549991250038146, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.24080493301153183, + "loss/reg": 0.0, + "step": 30810 + }, + { + "epoch": 0.20276315789473684, + "grad_norm": 2.90625, + "grad_norm_var": 1.0274088541666666, + "learning_rate": 0.0001, + "loss": 3.0971, + "loss/crossentropy": 2.3225981116294863, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.24680722504854202, + "loss/reg": 0.0, + "step": 30820 + }, + { + "epoch": 0.20282894736842105, + "grad_norm": 2.21875, + "grad_norm_var": 1.2762102762858072, + "learning_rate": 0.0001, + "loss": 3.0638, + "loss/crossentropy": 2.355189287662506, + "loss/hidden": 2.775, + "loss/incoh": 0.0, + "loss/logits": 0.23843924850225448, + "loss/reg": 0.0, + "step": 30830 + }, + { + "epoch": 0.20289473684210527, + "grad_norm": 2.671875, + "grad_norm_var": 2.2499407450358073, + "learning_rate": 0.0001, + "loss": 3.2035, + "loss/crossentropy": 2.2234743475914, + "loss/hidden": 2.934375, + "loss/incoh": 0.0, + "loss/logits": 0.2502759709954262, + "loss/reg": 0.0, + "step": 30840 + }, + { + "epoch": 0.20296052631578948, + "grad_norm": 2.0625, + "grad_norm_var": 2.0923665364583335, + "learning_rate": 0.0001, + "loss": 3.0222, + "loss/crossentropy": 2.535164365172386, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.21734880432486534, + "loss/reg": 0.0, + "step": 30850 + }, + { + "epoch": 0.2030263157894737, + "grad_norm": 2.265625, + "grad_norm_var": 0.03389383951822917, + "learning_rate": 0.0001, + "loss": 2.9959, + "loss/crossentropy": 2.4880054354667664, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.2341625601053238, + "loss/reg": 0.0, + "step": 30860 + }, + { + "epoch": 0.2030921052631579, + "grad_norm": 2.140625, + "grad_norm_var": 0.0282135009765625, + "learning_rate": 0.0001, + "loss": 3.0451, + "loss/crossentropy": 2.277066648006439, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.2593472898006439, + "loss/reg": 0.0, + "step": 30870 + }, + { + "epoch": 0.2031578947368421, + "grad_norm": 2.140625, + "grad_norm_var": 0.03958333333333333, + "learning_rate": 0.0001, + "loss": 3.0158, + "loss/crossentropy": 2.1649017184972763, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.2148670382797718, + "loss/reg": 0.0, + "step": 30880 + }, + { + "epoch": 0.2032236842105263, + "grad_norm": 2.1875, + "grad_norm_var": 0.37971903483072916, + "learning_rate": 0.0001, + "loss": 2.9899, + "loss/crossentropy": 2.3688385248184205, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.2734518602490425, + "loss/reg": 0.0, + "step": 30890 + }, + { + "epoch": 0.20328947368421052, + "grad_norm": 2.203125, + "grad_norm_var": 0.3890452067057292, + "learning_rate": 0.0001, + "loss": 3.0088, + "loss/crossentropy": 2.501011300086975, + "loss/hidden": 2.7625, + "loss/incoh": 0.0, + "loss/logits": 0.2248419776558876, + "loss/reg": 0.0, + "step": 30900 + }, + { + "epoch": 0.20335526315789473, + "grad_norm": 2.546875, + "grad_norm_var": 0.0226470947265625, + "learning_rate": 0.0001, + "loss": 3.0529, + "loss/crossentropy": 2.4394679188728334, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.23242839723825454, + "loss/reg": 0.0, + "step": 30910 + }, + { + "epoch": 0.20342105263157895, + "grad_norm": 1.9296875, + "grad_norm_var": 0.06285374959309896, + "learning_rate": 0.0001, + "loss": 2.9565, + "loss/crossentropy": 2.1788210391998293, + "loss/hidden": 2.690625, + "loss/incoh": 0.0, + "loss/logits": 0.21422138065099716, + "loss/reg": 0.0, + "step": 30920 + }, + { + "epoch": 0.20348684210526316, + "grad_norm": 3.09375, + "grad_norm_var": 0.09987157185872396, + "learning_rate": 0.0001, + "loss": 2.948, + "loss/crossentropy": 2.4877074480056764, + "loss/hidden": 2.6375, + "loss/incoh": 0.0, + "loss/logits": 0.22587471902370454, + "loss/reg": 0.0, + "step": 30930 + }, + { + "epoch": 0.20355263157894737, + "grad_norm": 2.890625, + "grad_norm_var": 0.2575154622395833, + "learning_rate": 0.0001, + "loss": 3.0854, + "loss/crossentropy": 2.3099985003471373, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.23227183222770692, + "loss/reg": 0.0, + "step": 30940 + }, + { + "epoch": 0.2036184210526316, + "grad_norm": 2.125, + "grad_norm_var": 0.18185221354166667, + "learning_rate": 0.0001, + "loss": 3.1055, + "loss/crossentropy": 2.228105306625366, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.24457025676965713, + "loss/reg": 0.0, + "step": 30950 + }, + { + "epoch": 0.2036842105263158, + "grad_norm": 2.40625, + "grad_norm_var": 0.08284098307291667, + "learning_rate": 0.0001, + "loss": 3.131, + "loss/crossentropy": 2.395846438407898, + "loss/hidden": 3.04375, + "loss/incoh": 0.0, + "loss/logits": 0.2948496550321579, + "loss/reg": 0.0, + "step": 30960 + }, + { + "epoch": 0.20375, + "grad_norm": 2.328125, + "grad_norm_var": 0.07164306640625, + "learning_rate": 0.0001, + "loss": 2.9739, + "loss/crossentropy": 2.221297824382782, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.21414069309830666, + "loss/reg": 0.0, + "step": 30970 + }, + { + "epoch": 0.2038157894736842, + "grad_norm": 2.234375, + "grad_norm_var": 0.13599853515625, + "learning_rate": 0.0001, + "loss": 3.1104, + "loss/crossentropy": 2.293365275859833, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.2863286077976227, + "loss/reg": 0.0, + "step": 30980 + }, + { + "epoch": 0.2038815789473684, + "grad_norm": 2.265625, + "grad_norm_var": 0.12737630208333334, + "learning_rate": 0.0001, + "loss": 2.9774, + "loss/crossentropy": 2.293133610486984, + "loss/hidden": 2.7, + "loss/incoh": 0.0, + "loss/logits": 0.22788952738046647, + "loss/reg": 0.0, + "step": 30990 + }, + { + "epoch": 0.20394736842105263, + "grad_norm": 2.34375, + "grad_norm_var": 0.01656494140625, + "learning_rate": 0.0001, + "loss": 2.9724, + "loss/crossentropy": 2.4067237973213196, + "loss/hidden": 2.584375, + "loss/incoh": 0.0, + "loss/logits": 0.220338836312294, + "loss/reg": 0.0, + "step": 31000 + }, + { + "epoch": 0.20401315789473684, + "grad_norm": 2.1875, + "grad_norm_var": 0.984301503499349, + "learning_rate": 0.0001, + "loss": 3.0445, + "loss/crossentropy": 2.3435217142105103, + "loss/hidden": 2.9984375, + "loss/incoh": 0.0, + "loss/logits": 0.28915616124868393, + "loss/reg": 0.0, + "step": 31010 + }, + { + "epoch": 0.20407894736842105, + "grad_norm": 3.0, + "grad_norm_var": 0.10872573852539062, + "learning_rate": 0.0001, + "loss": 3.133, + "loss/crossentropy": 2.3546693086624146, + "loss/hidden": 2.8234375, + "loss/incoh": 0.0, + "loss/logits": 0.2506974846124649, + "loss/reg": 0.0, + "step": 31020 + }, + { + "epoch": 0.20414473684210527, + "grad_norm": 2.59375, + "grad_norm_var": 0.33428929646809896, + "learning_rate": 0.0001, + "loss": 3.0211, + "loss/crossentropy": 2.2451566100120544, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.23826195299625397, + "loss/reg": 0.0, + "step": 31030 + }, + { + "epoch": 0.20421052631578948, + "grad_norm": 2.140625, + "grad_norm_var": 0.31133626302083334, + "learning_rate": 0.0001, + "loss": 3.0125, + "loss/crossentropy": 2.142116904258728, + "loss/hidden": 2.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.19715375155210496, + "loss/reg": 0.0, + "step": 31040 + }, + { + "epoch": 0.2042763157894737, + "grad_norm": 2.375, + "grad_norm_var": 0.1289629618326823, + "learning_rate": 0.0001, + "loss": 3.0955, + "loss/crossentropy": 2.210948419570923, + "loss/hidden": 3.0921875, + "loss/incoh": 0.0, + "loss/logits": 0.2645356684923172, + "loss/reg": 0.0, + "step": 31050 + }, + { + "epoch": 0.2043421052631579, + "grad_norm": 2.140625, + "grad_norm_var": 0.1048004150390625, + "learning_rate": 0.0001, + "loss": 3.1043, + "loss/crossentropy": 2.0977415561676027, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.2552319660782814, + "loss/reg": 0.0, + "step": 31060 + }, + { + "epoch": 0.2044078947368421, + "grad_norm": 2.296875, + "grad_norm_var": 0.08701070149739583, + "learning_rate": 0.0001, + "loss": 3.1067, + "loss/crossentropy": 2.264439880847931, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.2655411049723625, + "loss/reg": 0.0, + "step": 31070 + }, + { + "epoch": 0.2044736842105263, + "grad_norm": 2.25, + "grad_norm_var": 0.06838785807291667, + "learning_rate": 0.0001, + "loss": 3.0627, + "loss/crossentropy": 2.0332413136959078, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.21882982477545737, + "loss/reg": 0.0, + "step": 31080 + }, + { + "epoch": 0.20453947368421052, + "grad_norm": 2.484375, + "grad_norm_var": 0.10865478515625, + "learning_rate": 0.0001, + "loss": 3.0466, + "loss/crossentropy": 2.2589586079120636, + "loss/hidden": 2.6984375, + "loss/incoh": 0.0, + "loss/logits": 0.2315130352973938, + "loss/reg": 0.0, + "step": 31090 + }, + { + "epoch": 0.20460526315789473, + "grad_norm": 2.375, + "grad_norm_var": 0.37800191243489584, + "learning_rate": 0.0001, + "loss": 3.0036, + "loss/crossentropy": 2.3440281629562376, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.2972772717475891, + "loss/reg": 0.0, + "step": 31100 + }, + { + "epoch": 0.20467105263157895, + "grad_norm": 2.359375, + "grad_norm_var": 0.29983723958333336, + "learning_rate": 0.0001, + "loss": 2.954, + "loss/crossentropy": 2.47297340631485, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.24424962252378463, + "loss/reg": 0.0, + "step": 31110 + }, + { + "epoch": 0.20473684210526316, + "grad_norm": 2.140625, + "grad_norm_var": 0.2591634114583333, + "learning_rate": 0.0001, + "loss": 3.1546, + "loss/crossentropy": 2.436366784572601, + "loss/hidden": 2.9546875, + "loss/incoh": 0.0, + "loss/logits": 0.3093844935297966, + "loss/reg": 0.0, + "step": 31120 + }, + { + "epoch": 0.20480263157894738, + "grad_norm": 2.453125, + "grad_norm_var": 0.2005859375, + "learning_rate": 0.0001, + "loss": 3.0745, + "loss/crossentropy": 2.140506219863892, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.27134309709072113, + "loss/reg": 0.0, + "step": 31130 + }, + { + "epoch": 0.2048684210526316, + "grad_norm": 2.125, + "grad_norm_var": 0.032713826497395834, + "learning_rate": 0.0001, + "loss": 3.0449, + "loss/crossentropy": 2.5245323538780213, + "loss/hidden": 3.009375, + "loss/incoh": 0.0, + "loss/logits": 0.2610665872693062, + "loss/reg": 0.0, + "step": 31140 + }, + { + "epoch": 0.2049342105263158, + "grad_norm": 2.0625, + "grad_norm_var": 0.11193745930989583, + "learning_rate": 0.0001, + "loss": 3.0709, + "loss/crossentropy": 2.5337566137313843, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.2527821347117424, + "loss/reg": 0.0, + "step": 31150 + }, + { + "epoch": 0.205, + "grad_norm": 3.953125, + "grad_norm_var": 0.3135943094889323, + "learning_rate": 0.0001, + "loss": 3.1177, + "loss/crossentropy": 2.2265261888504027, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.2549376994371414, + "loss/reg": 0.0, + "step": 31160 + }, + { + "epoch": 0.2050657894736842, + "grad_norm": 2.484375, + "grad_norm_var": 0.3199534098307292, + "learning_rate": 0.0001, + "loss": 3.0723, + "loss/crossentropy": 2.47224338054657, + "loss/hidden": 2.6328125, + "loss/incoh": 0.0, + "loss/logits": 0.2135899156332016, + "loss/reg": 0.0, + "step": 31170 + }, + { + "epoch": 0.20513157894736841, + "grad_norm": 2.078125, + "grad_norm_var": 0.08787333170572917, + "learning_rate": 0.0001, + "loss": 3.006, + "loss/crossentropy": 1.9317480087280274, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.1986733317375183, + "loss/reg": 0.0, + "step": 31180 + }, + { + "epoch": 0.20519736842105263, + "grad_norm": 2.546875, + "grad_norm_var": 0.06467692057291667, + "learning_rate": 0.0001, + "loss": 3.0381, + "loss/crossentropy": 2.470042097568512, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.2246125504374504, + "loss/reg": 0.0, + "step": 31190 + }, + { + "epoch": 0.20526315789473684, + "grad_norm": 2.578125, + "grad_norm_var": 0.05276692708333333, + "learning_rate": 0.0001, + "loss": 2.9766, + "loss/crossentropy": 2.350657284259796, + "loss/hidden": 2.675, + "loss/incoh": 0.0, + "loss/logits": 0.22050067782402039, + "loss/reg": 0.0, + "step": 31200 + }, + { + "epoch": 0.20532894736842106, + "grad_norm": 2.125, + "grad_norm_var": 0.0873931884765625, + "learning_rate": 0.0001, + "loss": 3.0405, + "loss/crossentropy": 2.3005827188491823, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.2127441346645355, + "loss/reg": 0.0, + "step": 31210 + }, + { + "epoch": 0.20539473684210527, + "grad_norm": 2.78125, + "grad_norm_var": 5.270894368489583, + "learning_rate": 0.0001, + "loss": 3.1111, + "loss/crossentropy": 2.4935895919799806, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.2504632011055946, + "loss/reg": 0.0, + "step": 31220 + }, + { + "epoch": 0.20546052631578948, + "grad_norm": 2.75, + "grad_norm_var": 0.06770426432291667, + "learning_rate": 0.0001, + "loss": 2.9834, + "loss/crossentropy": 2.355222475528717, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.2353107064962387, + "loss/reg": 0.0, + "step": 31230 + }, + { + "epoch": 0.2055263157894737, + "grad_norm": 2.078125, + "grad_norm_var": 0.209130859375, + "learning_rate": 0.0001, + "loss": 3.0459, + "loss/crossentropy": 2.504118573665619, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.22558754682540894, + "loss/reg": 0.0, + "step": 31240 + }, + { + "epoch": 0.20559210526315788, + "grad_norm": 2.296875, + "grad_norm_var": 0.34091389973958336, + "learning_rate": 0.0001, + "loss": 3.0119, + "loss/crossentropy": 2.1399470806121825, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.2328704759478569, + "loss/reg": 0.0, + "step": 31250 + }, + { + "epoch": 0.2056578947368421, + "grad_norm": 2.921875, + "grad_norm_var": 0.24337565104166667, + "learning_rate": 0.0001, + "loss": 3.0416, + "loss/crossentropy": 2.251539409160614, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.3552345484495163, + "loss/reg": 0.0, + "step": 31260 + }, + { + "epoch": 0.2057236842105263, + "grad_norm": 2.15625, + "grad_norm_var": 0.28566080729166665, + "learning_rate": 0.0001, + "loss": 3.0568, + "loss/crossentropy": 2.310221529006958, + "loss/hidden": 2.940625, + "loss/incoh": 0.0, + "loss/logits": 0.26540524512529373, + "loss/reg": 0.0, + "step": 31270 + }, + { + "epoch": 0.20578947368421052, + "grad_norm": 2.65625, + "grad_norm_var": 0.18033447265625, + "learning_rate": 0.0001, + "loss": 3.0158, + "loss/crossentropy": 2.489114725589752, + "loss/hidden": 2.96875, + "loss/incoh": 0.0, + "loss/logits": 0.24016214907169342, + "loss/reg": 0.0, + "step": 31280 + }, + { + "epoch": 0.20585526315789474, + "grad_norm": 2.09375, + "grad_norm_var": 0.09659830729166667, + "learning_rate": 0.0001, + "loss": 3.0496, + "loss/crossentropy": 1.9956150293350219, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.19284655302762985, + "loss/reg": 0.0, + "step": 31290 + }, + { + "epoch": 0.20592105263157895, + "grad_norm": 2.109375, + "grad_norm_var": 0.0728179931640625, + "learning_rate": 0.0001, + "loss": 3.0451, + "loss/crossentropy": 2.383978569507599, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.21748270690441132, + "loss/reg": 0.0, + "step": 31300 + }, + { + "epoch": 0.20598684210526316, + "grad_norm": 2.3125, + "grad_norm_var": 0.05699462890625, + "learning_rate": 0.0001, + "loss": 3.031, + "loss/crossentropy": 2.064331567287445, + "loss/hidden": 2.709375, + "loss/incoh": 0.0, + "loss/logits": 0.21027912348508834, + "loss/reg": 0.0, + "step": 31310 + }, + { + "epoch": 0.20605263157894738, + "grad_norm": 2.28125, + "grad_norm_var": 0.27751363118489586, + "learning_rate": 0.0001, + "loss": 3.0386, + "loss/crossentropy": 2.5503376483917237, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.2552351266145706, + "loss/reg": 0.0, + "step": 31320 + }, + { + "epoch": 0.2061184210526316, + "grad_norm": 2.109375, + "grad_norm_var": 0.15871480305989583, + "learning_rate": 0.0001, + "loss": 3.0885, + "loss/crossentropy": 2.408406615257263, + "loss/hidden": 2.71875, + "loss/incoh": 0.0, + "loss/logits": 0.22667487859725952, + "loss/reg": 0.0, + "step": 31330 + }, + { + "epoch": 0.2061842105263158, + "grad_norm": 2.546875, + "grad_norm_var": 0.11311009724934896, + "learning_rate": 0.0001, + "loss": 3.0571, + "loss/crossentropy": 2.199143981933594, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.22705173045396804, + "loss/reg": 0.0, + "step": 31340 + }, + { + "epoch": 0.20625, + "grad_norm": 2.4375, + "grad_norm_var": 0.11831868489583333, + "learning_rate": 0.0001, + "loss": 3.0539, + "loss/crossentropy": 1.9996395468711854, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.22363494634628295, + "loss/reg": 0.0, + "step": 31350 + }, + { + "epoch": 0.2063157894736842, + "grad_norm": 2.53125, + "grad_norm_var": 0.0868364969889323, + "learning_rate": 0.0001, + "loss": 3.053, + "loss/crossentropy": 2.067913568019867, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.2136005848646164, + "loss/reg": 0.0, + "step": 31360 + }, + { + "epoch": 0.20638157894736842, + "grad_norm": 2.0, + "grad_norm_var": 0.086376953125, + "learning_rate": 0.0001, + "loss": 3.0064, + "loss/crossentropy": 2.300830841064453, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.2269425220787525, + "loss/reg": 0.0, + "step": 31370 + }, + { + "epoch": 0.20644736842105263, + "grad_norm": 2.46875, + "grad_norm_var": 0.05623753865559896, + "learning_rate": 0.0001, + "loss": 3.0327, + "loss/crossentropy": 2.283673417568207, + "loss/hidden": 2.775, + "loss/incoh": 0.0, + "loss/logits": 0.22951046377420425, + "loss/reg": 0.0, + "step": 31380 + }, + { + "epoch": 0.20651315789473684, + "grad_norm": 2.5, + "grad_norm_var": 0.15467020670572917, + "learning_rate": 0.0001, + "loss": 3.0314, + "loss/crossentropy": 2.2721763372421266, + "loss/hidden": 2.70625, + "loss/incoh": 0.0, + "loss/logits": 0.2015401691198349, + "loss/reg": 0.0, + "step": 31390 + }, + { + "epoch": 0.20657894736842106, + "grad_norm": 2.25, + "grad_norm_var": 0.17532958984375, + "learning_rate": 0.0001, + "loss": 3.0562, + "loss/crossentropy": 2.3926843643188476, + "loss/hidden": 2.6703125, + "loss/incoh": 0.0, + "loss/logits": 0.22229372709989548, + "loss/reg": 0.0, + "step": 31400 + }, + { + "epoch": 0.20664473684210527, + "grad_norm": 2.421875, + "grad_norm_var": 0.16061986287434896, + "learning_rate": 0.0001, + "loss": 3.0372, + "loss/crossentropy": 2.2162522673606873, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.21594754755496978, + "loss/reg": 0.0, + "step": 31410 + }, + { + "epoch": 0.20671052631578948, + "grad_norm": 2.171875, + "grad_norm_var": 0.19730606079101562, + "learning_rate": 0.0001, + "loss": 2.9993, + "loss/crossentropy": 2.2487839818000794, + "loss/hidden": 2.6734375, + "loss/incoh": 0.0, + "loss/logits": 0.23367298394441605, + "loss/reg": 0.0, + "step": 31420 + }, + { + "epoch": 0.2067763157894737, + "grad_norm": 2.140625, + "grad_norm_var": 0.22303059895833333, + "learning_rate": 0.0001, + "loss": 3.03, + "loss/crossentropy": 2.304809939861298, + "loss/hidden": 2.6609375, + "loss/incoh": 0.0, + "loss/logits": 0.21387835443019867, + "loss/reg": 0.0, + "step": 31430 + }, + { + "epoch": 0.20684210526315788, + "grad_norm": 2.421875, + "grad_norm_var": 0.18980204264322917, + "learning_rate": 0.0001, + "loss": 3.0591, + "loss/crossentropy": 2.428087902069092, + "loss/hidden": 2.65625, + "loss/incoh": 0.0, + "loss/logits": 0.2132945567369461, + "loss/reg": 0.0, + "step": 31440 + }, + { + "epoch": 0.2069078947368421, + "grad_norm": 2.4375, + "grad_norm_var": 0.11033426920572917, + "learning_rate": 0.0001, + "loss": 3.0247, + "loss/crossentropy": 2.3415855526924134, + "loss/hidden": 2.7296875, + "loss/incoh": 0.0, + "loss/logits": 0.2590311750769615, + "loss/reg": 0.0, + "step": 31450 + }, + { + "epoch": 0.2069736842105263, + "grad_norm": 2.171875, + "grad_norm_var": 2.33717041015625, + "learning_rate": 0.0001, + "loss": 3.0733, + "loss/crossentropy": 2.0509816646575927, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.22177468091249466, + "loss/reg": 0.0, + "step": 31460 + }, + { + "epoch": 0.20703947368421052, + "grad_norm": 2.125, + "grad_norm_var": 0.06643473307291667, + "learning_rate": 0.0001, + "loss": 3.01, + "loss/crossentropy": 2.0959688067436217, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.2369927003979683, + "loss/reg": 0.0, + "step": 31470 + }, + { + "epoch": 0.20710526315789474, + "grad_norm": 2.234375, + "grad_norm_var": 0.07058690388997396, + "learning_rate": 0.0001, + "loss": 3.0335, + "loss/crossentropy": 2.2887695908546446, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.21632779389619827, + "loss/reg": 0.0, + "step": 31480 + }, + { + "epoch": 0.20717105263157895, + "grad_norm": 2.171875, + "grad_norm_var": 0.0905914306640625, + "learning_rate": 0.0001, + "loss": 3.022, + "loss/crossentropy": 2.251658821105957, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.27912095934152603, + "loss/reg": 0.0, + "step": 31490 + }, + { + "epoch": 0.20723684210526316, + "grad_norm": 2.234375, + "grad_norm_var": 0.13886617024739584, + "learning_rate": 0.0001, + "loss": 3.0013, + "loss/crossentropy": 2.200524830818176, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.20394676253199578, + "loss/reg": 0.0, + "step": 31500 + }, + { + "epoch": 0.20730263157894738, + "grad_norm": 2.328125, + "grad_norm_var": 0.11672770182291667, + "learning_rate": 0.0001, + "loss": 3.1006, + "loss/crossentropy": 2.410679817199707, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.2168733671307564, + "loss/reg": 0.0, + "step": 31510 + }, + { + "epoch": 0.2073684210526316, + "grad_norm": 2.40625, + "grad_norm_var": 0.16353759765625, + "learning_rate": 0.0001, + "loss": 2.9942, + "loss/crossentropy": 2.1653492391109466, + "loss/hidden": 2.5796875, + "loss/incoh": 0.0, + "loss/logits": 0.18678945749998094, + "loss/reg": 0.0, + "step": 31520 + }, + { + "epoch": 0.20743421052631578, + "grad_norm": 2.546875, + "grad_norm_var": 0.22099202473958332, + "learning_rate": 0.0001, + "loss": 3.0174, + "loss/crossentropy": 2.4728673577308653, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.24382270276546478, + "loss/reg": 0.0, + "step": 31530 + }, + { + "epoch": 0.2075, + "grad_norm": 2.921875, + "grad_norm_var": 0.20640869140625, + "learning_rate": 0.0001, + "loss": 3.065, + "loss/crossentropy": 2.3791961908340453, + "loss/hidden": 3.0109375, + "loss/incoh": 0.0, + "loss/logits": 0.29753602892160413, + "loss/reg": 0.0, + "step": 31540 + }, + { + "epoch": 0.2075657894736842, + "grad_norm": 2.125, + "grad_norm_var": 0.07801005045572916, + "learning_rate": 0.0001, + "loss": 2.9835, + "loss/crossentropy": 2.1349298536777495, + "loss/hidden": 2.6390625, + "loss/incoh": 0.0, + "loss/logits": 0.2179326094686985, + "loss/reg": 0.0, + "step": 31550 + }, + { + "epoch": 0.20763157894736842, + "grad_norm": 1.9921875, + "grad_norm_var": 0.3508989969889323, + "learning_rate": 0.0001, + "loss": 3.0481, + "loss/crossentropy": 2.128028416633606, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.24575520306825638, + "loss/reg": 0.0, + "step": 31560 + }, + { + "epoch": 0.20769736842105263, + "grad_norm": 2.140625, + "grad_norm_var": 0.36219253540039065, + "learning_rate": 0.0001, + "loss": 2.992, + "loss/crossentropy": 2.366794526576996, + "loss/hidden": 2.7875, + "loss/incoh": 0.0, + "loss/logits": 0.2530692145228386, + "loss/reg": 0.0, + "step": 31570 + }, + { + "epoch": 0.20776315789473684, + "grad_norm": 2.125, + "grad_norm_var": 0.18565165201822917, + "learning_rate": 0.0001, + "loss": 2.9888, + "loss/crossentropy": 2.5053478240966798, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.24881082475185395, + "loss/reg": 0.0, + "step": 31580 + }, + { + "epoch": 0.20782894736842106, + "grad_norm": 2.484375, + "grad_norm_var": 0.22190348307291666, + "learning_rate": 0.0001, + "loss": 3.0479, + "loss/crossentropy": 2.115998589992523, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.2497893139719963, + "loss/reg": 0.0, + "step": 31590 + }, + { + "epoch": 0.20789473684210527, + "grad_norm": 2.34375, + "grad_norm_var": 0.0385406494140625, + "learning_rate": 0.0001, + "loss": 2.9967, + "loss/crossentropy": 2.2767433404922484, + "loss/hidden": 2.6578125, + "loss/incoh": 0.0, + "loss/logits": 0.22527251839637757, + "loss/reg": 0.0, + "step": 31600 + }, + { + "epoch": 0.20796052631578948, + "grad_norm": 2.140625, + "grad_norm_var": 0.11295166015625, + "learning_rate": 0.0001, + "loss": 3.0751, + "loss/crossentropy": 2.271492937207222, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.2081324838101864, + "loss/reg": 0.0, + "step": 31610 + }, + { + "epoch": 0.2080263157894737, + "grad_norm": 2.1875, + "grad_norm_var": 0.1297271728515625, + "learning_rate": 0.0001, + "loss": 3.0283, + "loss/crossentropy": 2.107327163219452, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.23592574074864386, + "loss/reg": 0.0, + "step": 31620 + }, + { + "epoch": 0.20809210526315788, + "grad_norm": 2.078125, + "grad_norm_var": 0.06132583618164063, + "learning_rate": 0.0001, + "loss": 3.0553, + "loss/crossentropy": 2.4657732486724853, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.2265238046646118, + "loss/reg": 0.0, + "step": 31630 + }, + { + "epoch": 0.2081578947368421, + "grad_norm": 2.59375, + "grad_norm_var": 0.5120076497395833, + "learning_rate": 0.0001, + "loss": 3.0358, + "loss/crossentropy": 2.4768943071365355, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.2473340943455696, + "loss/reg": 0.0, + "step": 31640 + }, + { + "epoch": 0.2082236842105263, + "grad_norm": 2.734375, + "grad_norm_var": 0.35252176920572914, + "learning_rate": 0.0001, + "loss": 3.0949, + "loss/crossentropy": 2.5071337461471557, + "loss/hidden": 2.678125, + "loss/incoh": 0.0, + "loss/logits": 0.23168605715036392, + "loss/reg": 0.0, + "step": 31650 + }, + { + "epoch": 0.20828947368421052, + "grad_norm": 2.484375, + "grad_norm_var": 0.19873758951822917, + "learning_rate": 0.0001, + "loss": 3.0033, + "loss/crossentropy": 2.3884094834327696, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.2127884179353714, + "loss/reg": 0.0, + "step": 31660 + }, + { + "epoch": 0.20835526315789474, + "grad_norm": 3.421875, + "grad_norm_var": 0.33858133951822916, + "learning_rate": 0.0001, + "loss": 2.9746, + "loss/crossentropy": 2.172113299369812, + "loss/hidden": 2.6140625, + "loss/incoh": 0.0, + "loss/logits": 0.20361884087324142, + "loss/reg": 0.0, + "step": 31670 + }, + { + "epoch": 0.20842105263157895, + "grad_norm": 2.28125, + "grad_norm_var": 0.36571858723958334, + "learning_rate": 0.0001, + "loss": 2.9778, + "loss/crossentropy": 2.3501265048980713, + "loss/hidden": 2.696875, + "loss/incoh": 0.0, + "loss/logits": 0.21666382998228073, + "loss/reg": 0.0, + "step": 31680 + }, + { + "epoch": 0.20848684210526316, + "grad_norm": 2.546875, + "grad_norm_var": 0.12886962890625, + "learning_rate": 0.0001, + "loss": 3.1071, + "loss/crossentropy": 2.8127560138702394, + "loss/hidden": 3.221875, + "loss/incoh": 0.0, + "loss/logits": 0.25830034017562864, + "loss/reg": 0.0, + "step": 31690 + }, + { + "epoch": 0.20855263157894738, + "grad_norm": 2.34375, + "grad_norm_var": 0.04257405598958333, + "learning_rate": 0.0001, + "loss": 3.0355, + "loss/crossentropy": 2.0618197679519654, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.22268864512443542, + "loss/reg": 0.0, + "step": 31700 + }, + { + "epoch": 0.2086184210526316, + "grad_norm": 2.03125, + "grad_norm_var": 0.10972391764322917, + "learning_rate": 0.0001, + "loss": 2.9789, + "loss/crossentropy": 2.4682215332984923, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.27270212322473525, + "loss/reg": 0.0, + "step": 31710 + }, + { + "epoch": 0.20868421052631578, + "grad_norm": 2.5, + "grad_norm_var": 0.108251953125, + "learning_rate": 0.0001, + "loss": 3.0187, + "loss/crossentropy": 2.3017919063568115, + "loss/hidden": 2.6421875, + "loss/incoh": 0.0, + "loss/logits": 0.2289327010512352, + "loss/reg": 0.0, + "step": 31720 + }, + { + "epoch": 0.20875, + "grad_norm": 2.3125, + "grad_norm_var": 0.08994140625, + "learning_rate": 0.0001, + "loss": 3.0376, + "loss/crossentropy": 2.2758594751358032, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.23465103954076766, + "loss/reg": 0.0, + "step": 31730 + }, + { + "epoch": 0.2088157894736842, + "grad_norm": 2.15625, + "grad_norm_var": 0.0693511962890625, + "learning_rate": 0.0001, + "loss": 2.9794, + "loss/crossentropy": 2.3359742760658264, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.25490144789218905, + "loss/reg": 0.0, + "step": 31740 + }, + { + "epoch": 0.20888157894736842, + "grad_norm": 2.375, + "grad_norm_var": 0.035399373372395834, + "learning_rate": 0.0001, + "loss": 3.0153, + "loss/crossentropy": 2.2797624588012697, + "loss/hidden": 2.6109375, + "loss/incoh": 0.0, + "loss/logits": 0.1947791814804077, + "loss/reg": 0.0, + "step": 31750 + }, + { + "epoch": 0.20894736842105263, + "grad_norm": 2.25, + "grad_norm_var": 0.048460896809895834, + "learning_rate": 0.0001, + "loss": 3.0293, + "loss/crossentropy": 2.221743679046631, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.2385573446750641, + "loss/reg": 0.0, + "step": 31760 + }, + { + "epoch": 0.20901315789473685, + "grad_norm": 2.4375, + "grad_norm_var": 0.0992828369140625, + "learning_rate": 0.0001, + "loss": 3.0352, + "loss/crossentropy": 2.3273940563201903, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.22714275866746902, + "loss/reg": 0.0, + "step": 31770 + }, + { + "epoch": 0.20907894736842106, + "grad_norm": 2.484375, + "grad_norm_var": 0.05660400390625, + "learning_rate": 0.0001, + "loss": 2.9783, + "loss/crossentropy": 2.5335972189903258, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.22051512748003005, + "loss/reg": 0.0, + "step": 31780 + }, + { + "epoch": 0.20914473684210527, + "grad_norm": 2.28125, + "grad_norm_var": 0.0770172119140625, + "learning_rate": 0.0001, + "loss": 3.0626, + "loss/crossentropy": 2.1290286660194395, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.24563827514648437, + "loss/reg": 0.0, + "step": 31790 + }, + { + "epoch": 0.20921052631578949, + "grad_norm": 2.875, + "grad_norm_var": 0.12021484375, + "learning_rate": 0.0001, + "loss": 3.1026, + "loss/crossentropy": 2.277496612071991, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.23920933827757834, + "loss/reg": 0.0, + "step": 31800 + }, + { + "epoch": 0.20927631578947367, + "grad_norm": 2.25, + "grad_norm_var": 0.07742513020833333, + "learning_rate": 0.0001, + "loss": 3.0043, + "loss/crossentropy": 2.3269221425056457, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.2029653638601303, + "loss/reg": 0.0, + "step": 31810 + }, + { + "epoch": 0.20934210526315788, + "grad_norm": 2.296875, + "grad_norm_var": 0.02303034464518229, + "learning_rate": 0.0001, + "loss": 3.0615, + "loss/crossentropy": 2.352055883407593, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.2182576224207878, + "loss/reg": 0.0, + "step": 31820 + }, + { + "epoch": 0.2094078947368421, + "grad_norm": 2.21875, + "grad_norm_var": 0.048500315348307295, + "learning_rate": 0.0001, + "loss": 3.0806, + "loss/crossentropy": 2.143119287490845, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.24638065993785857, + "loss/reg": 0.0, + "step": 31830 + }, + { + "epoch": 0.2094736842105263, + "grad_norm": 2.171875, + "grad_norm_var": 0.13288472493489584, + "learning_rate": 0.0001, + "loss": 3.1094, + "loss/crossentropy": 2.344961977005005, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.24402716457843782, + "loss/reg": 0.0, + "step": 31840 + }, + { + "epoch": 0.20953947368421053, + "grad_norm": 2.5625, + "grad_norm_var": 0.39104817708333334, + "learning_rate": 0.0001, + "loss": 3.0091, + "loss/crossentropy": 2.3289579272270204, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.21923962533473967, + "loss/reg": 0.0, + "step": 31850 + }, + { + "epoch": 0.20960526315789474, + "grad_norm": 2.25, + "grad_norm_var": 0.35844624837239586, + "learning_rate": 0.0001, + "loss": 3.1472, + "loss/crossentropy": 2.2578481793403626, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.2393498733639717, + "loss/reg": 0.0, + "step": 31860 + }, + { + "epoch": 0.20967105263157895, + "grad_norm": 2.6875, + "grad_norm_var": 0.103369140625, + "learning_rate": 0.0001, + "loss": 3.0043, + "loss/crossentropy": 2.1479632019996644, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.21754586473107337, + "loss/reg": 0.0, + "step": 31870 + }, + { + "epoch": 0.20973684210526317, + "grad_norm": 2.265625, + "grad_norm_var": 0.05641988118489583, + "learning_rate": 0.0001, + "loss": 3.0913, + "loss/crossentropy": 2.31242595911026, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.3139140591025352, + "loss/reg": 0.0, + "step": 31880 + }, + { + "epoch": 0.20980263157894738, + "grad_norm": 2.4375, + "grad_norm_var": 0.14700113932291667, + "learning_rate": 0.0001, + "loss": 3.0276, + "loss/crossentropy": 2.421666181087494, + "loss/hidden": 2.703125, + "loss/incoh": 0.0, + "loss/logits": 0.21645313799381255, + "loss/reg": 0.0, + "step": 31890 + }, + { + "epoch": 0.20986842105263157, + "grad_norm": 2.15625, + "grad_norm_var": 0.6790028889973958, + "learning_rate": 0.0001, + "loss": 3.0781, + "loss/crossentropy": 2.2751805901527407, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.22757814675569535, + "loss/reg": 0.0, + "step": 31900 + }, + { + "epoch": 0.20993421052631578, + "grad_norm": 2.109375, + "grad_norm_var": 0.14388020833333334, + "learning_rate": 0.0001, + "loss": 2.978, + "loss/crossentropy": 2.3446357250213623, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.21789460927248, + "loss/reg": 0.0, + "step": 31910 + }, + { + "epoch": 0.21, + "grad_norm": 2.40625, + "grad_norm_var": 0.1076324462890625, + "learning_rate": 0.0001, + "loss": 3.0473, + "loss/crossentropy": 2.3983862519264223, + "loss/hidden": 2.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.21307818740606307, + "loss/reg": 0.0, + "step": 31920 + }, + { + "epoch": 0.2100657894736842, + "grad_norm": 2.59375, + "grad_norm_var": 0.03589579264322917, + "learning_rate": 0.0001, + "loss": 3.0171, + "loss/crossentropy": 2.37126282453537, + "loss/hidden": 2.7, + "loss/incoh": 0.0, + "loss/logits": 0.205704665184021, + "loss/reg": 0.0, + "step": 31930 + }, + { + "epoch": 0.21013157894736842, + "grad_norm": 2.25, + "grad_norm_var": 0.08915608723958333, + "learning_rate": 0.0001, + "loss": 3.0071, + "loss/crossentropy": 2.2473104357719422, + "loss/hidden": 2.6234375, + "loss/incoh": 0.0, + "loss/logits": 0.2207282856106758, + "loss/reg": 0.0, + "step": 31940 + }, + { + "epoch": 0.21019736842105263, + "grad_norm": 2.296875, + "grad_norm_var": 0.03990478515625, + "learning_rate": 0.0001, + "loss": 3.0541, + "loss/crossentropy": 2.4488544821739198, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.26014447808265684, + "loss/reg": 0.0, + "step": 31950 + }, + { + "epoch": 0.21026315789473685, + "grad_norm": 2.25, + "grad_norm_var": 0.0229156494140625, + "learning_rate": 0.0001, + "loss": 3.0762, + "loss/crossentropy": 1.9240918695926665, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.19550290815532206, + "loss/reg": 0.0, + "step": 31960 + }, + { + "epoch": 0.21032894736842106, + "grad_norm": 2.296875, + "grad_norm_var": 0.03746337890625, + "learning_rate": 0.0001, + "loss": 3.0402, + "loss/crossentropy": 2.1934541881084444, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.19934962689876556, + "loss/reg": 0.0, + "step": 31970 + }, + { + "epoch": 0.21039473684210527, + "grad_norm": 2.203125, + "grad_norm_var": 0.03241780598958333, + "learning_rate": 0.0001, + "loss": 2.9459, + "loss/crossentropy": 2.071032130718231, + "loss/hidden": 2.61875, + "loss/incoh": 0.0, + "loss/logits": 0.20514383763074875, + "loss/reg": 0.0, + "step": 31980 + }, + { + "epoch": 0.2104605263157895, + "grad_norm": 2.484375, + "grad_norm_var": 0.02534764607747396, + "learning_rate": 0.0001, + "loss": 3.0513, + "loss/crossentropy": 2.274570310115814, + "loss/hidden": 2.9515625, + "loss/incoh": 0.0, + "loss/logits": 0.3155985027551651, + "loss/reg": 0.0, + "step": 31990 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 2.8125, + "grad_norm_var": 3.301877391722643e+17, + "learning_rate": 0.0001, + "loss": 3.1093, + "loss/crossentropy": 2.0131736040115356, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.2141942039132118, + "loss/reg": 0.0, + "step": 32000 + }, + { + "epoch": 0.21059210526315789, + "grad_norm": 3.078125, + "grad_norm_var": 0.2028717041015625, + "learning_rate": 0.0001, + "loss": 2.9957, + "loss/crossentropy": 2.213227319717407, + "loss/hidden": 2.6984375, + "loss/incoh": 0.0, + "loss/logits": 0.22059416845440866, + "loss/reg": 0.0, + "step": 32010 + }, + { + "epoch": 0.2106578947368421, + "grad_norm": 2.53125, + "grad_norm_var": 0.09852676391601563, + "learning_rate": 0.0001, + "loss": 2.9318, + "loss/crossentropy": 2.2327592849731444, + "loss/hidden": 2.6125, + "loss/incoh": 0.0, + "loss/logits": 0.19538245126605033, + "loss/reg": 0.0, + "step": 32020 + }, + { + "epoch": 0.2107236842105263, + "grad_norm": 2.125, + "grad_norm_var": 0.10897191365559895, + "learning_rate": 0.0001, + "loss": 2.9561, + "loss/crossentropy": 2.415834832191467, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.2341999500989914, + "loss/reg": 0.0, + "step": 32030 + }, + { + "epoch": 0.21078947368421053, + "grad_norm": 2.328125, + "grad_norm_var": 0.0959185282389323, + "learning_rate": 0.0001, + "loss": 2.9979, + "loss/crossentropy": 2.4787596464157104, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.23873308449983596, + "loss/reg": 0.0, + "step": 32040 + }, + { + "epoch": 0.21085526315789474, + "grad_norm": 2.359375, + "grad_norm_var": 0.12280985514322916, + "learning_rate": 0.0001, + "loss": 2.9578, + "loss/crossentropy": 2.3683923482894897, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.22164048850536347, + "loss/reg": 0.0, + "step": 32050 + }, + { + "epoch": 0.21092105263157895, + "grad_norm": 4.40625, + "grad_norm_var": 0.3617828369140625, + "learning_rate": 0.0001, + "loss": 3.0122, + "loss/crossentropy": 2.336903250217438, + "loss/hidden": 2.671875, + "loss/incoh": 0.0, + "loss/logits": 0.2218891516327858, + "loss/reg": 0.0, + "step": 32060 + }, + { + "epoch": 0.21098684210526317, + "grad_norm": 2.046875, + "grad_norm_var": 0.31454671223958336, + "learning_rate": 0.0001, + "loss": 2.9132, + "loss/crossentropy": 2.3459483861923216, + "loss/hidden": 2.715625, + "loss/incoh": 0.0, + "loss/logits": 0.20081116408109664, + "loss/reg": 0.0, + "step": 32070 + }, + { + "epoch": 0.21105263157894738, + "grad_norm": 2.171875, + "grad_norm_var": 0.04011942545572917, + "learning_rate": 0.0001, + "loss": 2.9616, + "loss/crossentropy": 2.1744574666023255, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.21101347357034683, + "loss/reg": 0.0, + "step": 32080 + }, + { + "epoch": 0.21111842105263157, + "grad_norm": 3.921875, + "grad_norm_var": 0.22661107381184895, + "learning_rate": 0.0001, + "loss": 2.9912, + "loss/crossentropy": 2.1881649017333986, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2135306864976883, + "loss/reg": 0.0, + "step": 32090 + }, + { + "epoch": 0.21118421052631578, + "grad_norm": 2.3125, + "grad_norm_var": 0.20420506795247395, + "learning_rate": 0.0001, + "loss": 3.0331, + "loss/crossentropy": 2.0982267916202546, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.20456733107566832, + "loss/reg": 0.0, + "step": 32100 + }, + { + "epoch": 0.21125, + "grad_norm": 2.15625, + "grad_norm_var": 0.12456766764322917, + "learning_rate": 0.0001, + "loss": 2.9625, + "loss/crossentropy": 2.291160595417023, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.2443986713886261, + "loss/reg": 0.0, + "step": 32110 + }, + { + "epoch": 0.2113157894736842, + "grad_norm": 2.125, + "grad_norm_var": 0.3302968343098958, + "learning_rate": 0.0001, + "loss": 3.0737, + "loss/crossentropy": 2.3988569140434266, + "loss/hidden": 2.7109375, + "loss/incoh": 0.0, + "loss/logits": 0.22473453879356384, + "loss/reg": 0.0, + "step": 32120 + }, + { + "epoch": 0.21138157894736842, + "grad_norm": 2.671875, + "grad_norm_var": 1.0375, + "learning_rate": 0.0001, + "loss": 2.9681, + "loss/crossentropy": 2.157117784023285, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.25956482589244845, + "loss/reg": 0.0, + "step": 32130 + }, + { + "epoch": 0.21144736842105263, + "grad_norm": 2.234375, + "grad_norm_var": 0.967632802327474, + "learning_rate": 0.0001, + "loss": 3.0129, + "loss/crossentropy": 2.007625603675842, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.21298689395189285, + "loss/reg": 0.0, + "step": 32140 + }, + { + "epoch": 0.21151315789473685, + "grad_norm": 2.6875, + "grad_norm_var": 0.09070536295572916, + "learning_rate": 0.0001, + "loss": 2.9862, + "loss/crossentropy": 2.002223217487335, + "loss/hidden": 2.584375, + "loss/incoh": 0.0, + "loss/logits": 0.18768413811922074, + "loss/reg": 0.0, + "step": 32150 + }, + { + "epoch": 0.21157894736842106, + "grad_norm": 2.40625, + "grad_norm_var": 0.04879557291666667, + "learning_rate": 0.0001, + "loss": 3.0679, + "loss/crossentropy": 2.470254373550415, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.2670274436473846, + "loss/reg": 0.0, + "step": 32160 + }, + { + "epoch": 0.21164473684210527, + "grad_norm": 2.484375, + "grad_norm_var": 0.04566650390625, + "learning_rate": 0.0001, + "loss": 2.9691, + "loss/crossentropy": 2.489493703842163, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.2385897770524025, + "loss/reg": 0.0, + "step": 32170 + }, + { + "epoch": 0.21171052631578946, + "grad_norm": 2.625, + "grad_norm_var": 0.051512654622395834, + "learning_rate": 0.0001, + "loss": 3.079, + "loss/crossentropy": 2.4059490084648134, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.2276516616344452, + "loss/reg": 0.0, + "step": 32180 + }, + { + "epoch": 0.21177631578947367, + "grad_norm": 2.15625, + "grad_norm_var": 0.18515625, + "learning_rate": 0.0001, + "loss": 2.9959, + "loss/crossentropy": 2.1902307987213137, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.22064511626958846, + "loss/reg": 0.0, + "step": 32190 + }, + { + "epoch": 0.2118421052631579, + "grad_norm": 3.09375, + "grad_norm_var": 0.21318359375, + "learning_rate": 0.0001, + "loss": 3.0886, + "loss/crossentropy": 2.2975401639938355, + "loss/hidden": 2.790625, + "loss/incoh": 0.0, + "loss/logits": 0.21864379346370696, + "loss/reg": 0.0, + "step": 32200 + }, + { + "epoch": 0.2119078947368421, + "grad_norm": 3.65625, + "grad_norm_var": 0.3596425374348958, + "learning_rate": 0.0001, + "loss": 3.0936, + "loss/crossentropy": 2.3152296900749207, + "loss/hidden": 2.64375, + "loss/incoh": 0.0, + "loss/logits": 0.21215650141239167, + "loss/reg": 0.0, + "step": 32210 + }, + { + "epoch": 0.21197368421052631, + "grad_norm": 2.171875, + "grad_norm_var": 0.4088053385416667, + "learning_rate": 0.0001, + "loss": 3.0212, + "loss/crossentropy": 2.345796763896942, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.22906456291675567, + "loss/reg": 0.0, + "step": 32220 + }, + { + "epoch": 0.21203947368421053, + "grad_norm": 2.5, + "grad_norm_var": 0.36056315104166664, + "learning_rate": 0.0001, + "loss": 2.9913, + "loss/crossentropy": 2.2359737753868103, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.21989362984895705, + "loss/reg": 0.0, + "step": 32230 + }, + { + "epoch": 0.21210526315789474, + "grad_norm": 2.390625, + "grad_norm_var": 0.1499908447265625, + "learning_rate": 0.0001, + "loss": 2.9931, + "loss/crossentropy": 2.3540435433387756, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.20313581377267836, + "loss/reg": 0.0, + "step": 32240 + }, + { + "epoch": 0.21217105263157895, + "grad_norm": 3.171875, + "grad_norm_var": 0.08063151041666666, + "learning_rate": 0.0001, + "loss": 2.966, + "loss/crossentropy": 2.3999782681465147, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.20714087784290314, + "loss/reg": 0.0, + "step": 32250 + }, + { + "epoch": 0.21223684210526317, + "grad_norm": 2.25, + "grad_norm_var": 0.08381245930989584, + "learning_rate": 0.0001, + "loss": 3.0748, + "loss/crossentropy": 2.40082049369812, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.24139860570430755, + "loss/reg": 0.0, + "step": 32260 + }, + { + "epoch": 0.21230263157894738, + "grad_norm": 2.40625, + "grad_norm_var": 0.0259185791015625, + "learning_rate": 0.0001, + "loss": 2.9642, + "loss/crossentropy": 2.5221059322357178, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.24260111004114152, + "loss/reg": 0.0, + "step": 32270 + }, + { + "epoch": 0.21236842105263157, + "grad_norm": 3.421875, + "grad_norm_var": 5.511455956857455e+17, + "learning_rate": 0.0001, + "loss": 3.1235, + "loss/crossentropy": 2.582364892959595, + "loss/hidden": 2.6109375, + "loss/incoh": 0.0, + "loss/logits": 0.2309617057442665, + "loss/reg": 0.0, + "step": 32280 + }, + { + "epoch": 0.21243421052631578, + "grad_norm": 2.578125, + "grad_norm_var": 8.515180979484448e+17, + "learning_rate": 0.0001, + "loss": 3.1265, + "loss/crossentropy": 1.9406600832939147, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.25122445076704025, + "loss/reg": 0.0, + "step": 32290 + }, + { + "epoch": 0.2125, + "grad_norm": 2.5, + "grad_norm_var": 0.11577860514322917, + "learning_rate": 0.0001, + "loss": 3.0392, + "loss/crossentropy": 1.9736381709575652, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.24801484644412994, + "loss/reg": 0.0, + "step": 32300 + }, + { + "epoch": 0.2125657894736842, + "grad_norm": 2.28125, + "grad_norm_var": 0.11686909993489583, + "learning_rate": 0.0001, + "loss": 3.0, + "loss/crossentropy": 2.57318320274353, + "loss/hidden": 2.7625, + "loss/incoh": 0.0, + "loss/logits": 0.25067331790924074, + "loss/reg": 0.0, + "step": 32310 + }, + { + "epoch": 0.21263157894736842, + "grad_norm": 2.234375, + "grad_norm_var": 0.11437352498372395, + "learning_rate": 0.0001, + "loss": 2.9663, + "loss/crossentropy": 2.3444815397262575, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.23782878071069719, + "loss/reg": 0.0, + "step": 32320 + }, + { + "epoch": 0.21269736842105263, + "grad_norm": 2.359375, + "grad_norm_var": 0.05796483357747396, + "learning_rate": 0.0001, + "loss": 2.9547, + "loss/crossentropy": 2.2045453786849976, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.2417543426156044, + "loss/reg": 0.0, + "step": 32330 + }, + { + "epoch": 0.21276315789473685, + "grad_norm": 2.171875, + "grad_norm_var": 0.045344034830729164, + "learning_rate": 0.0001, + "loss": 3.0757, + "loss/crossentropy": 2.349012243747711, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.2658751055598259, + "loss/reg": 0.0, + "step": 32340 + }, + { + "epoch": 0.21282894736842106, + "grad_norm": 2.25, + "grad_norm_var": 0.04952799479166667, + "learning_rate": 0.0001, + "loss": 2.9861, + "loss/crossentropy": 2.4626625537872315, + "loss/hidden": 2.8953125, + "loss/incoh": 0.0, + "loss/logits": 0.25371641367673875, + "loss/reg": 0.0, + "step": 32350 + }, + { + "epoch": 0.21289473684210528, + "grad_norm": 2.40625, + "grad_norm_var": 0.06630223592122396, + "learning_rate": 0.0001, + "loss": 2.9574, + "loss/crossentropy": 2.186726263165474, + "loss/hidden": 2.7625, + "loss/incoh": 0.0, + "loss/logits": 0.20951557606458665, + "loss/reg": 0.0, + "step": 32360 + }, + { + "epoch": 0.21296052631578946, + "grad_norm": 2.71875, + "grad_norm_var": 0.10197728474934896, + "learning_rate": 0.0001, + "loss": 2.965, + "loss/crossentropy": 2.340285396575928, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.23474745452404022, + "loss/reg": 0.0, + "step": 32370 + }, + { + "epoch": 0.21302631578947367, + "grad_norm": 2.6875, + "grad_norm_var": 0.157373046875, + "learning_rate": 0.0001, + "loss": 3.0962, + "loss/crossentropy": 2.47646986246109, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.26605418920516966, + "loss/reg": 0.0, + "step": 32380 + }, + { + "epoch": 0.2130921052631579, + "grad_norm": 2.15625, + "grad_norm_var": 0.10019429524739583, + "learning_rate": 0.0001, + "loss": 2.9578, + "loss/crossentropy": 1.7385891020298003, + "loss/hidden": 2.66875, + "loss/incoh": 0.0, + "loss/logits": 0.17920974977314472, + "loss/reg": 0.0, + "step": 32390 + }, + { + "epoch": 0.2131578947368421, + "grad_norm": 2.171875, + "grad_norm_var": 0.14401041666666667, + "learning_rate": 0.0001, + "loss": 2.9707, + "loss/crossentropy": 2.182602137327194, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.2166632428765297, + "loss/reg": 0.0, + "step": 32400 + }, + { + "epoch": 0.21322368421052632, + "grad_norm": 2.375, + "grad_norm_var": 0.15817057291666667, + "learning_rate": 0.0001, + "loss": 3.0277, + "loss/crossentropy": 2.3320940494537354, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.263668055832386, + "loss/reg": 0.0, + "step": 32410 + }, + { + "epoch": 0.21328947368421053, + "grad_norm": 2.859375, + "grad_norm_var": 0.156201171875, + "learning_rate": 0.0001, + "loss": 3.0197, + "loss/crossentropy": 2.338312292098999, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.22362788915634155, + "loss/reg": 0.0, + "step": 32420 + }, + { + "epoch": 0.21335526315789474, + "grad_norm": 2.15625, + "grad_norm_var": 0.376025390625, + "learning_rate": 0.0001, + "loss": 3.0193, + "loss/crossentropy": 2.3314969301223756, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.2156234934926033, + "loss/reg": 0.0, + "step": 32430 + }, + { + "epoch": 0.21342105263157896, + "grad_norm": 2.296875, + "grad_norm_var": 0.6317545572916666, + "learning_rate": 0.0001, + "loss": 2.97, + "loss/crossentropy": 2.3391250133514405, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.21457959488034248, + "loss/reg": 0.0, + "step": 32440 + }, + { + "epoch": 0.21348684210526317, + "grad_norm": 2.5625, + "grad_norm_var": 0.3609375, + "learning_rate": 0.0001, + "loss": 3.0389, + "loss/crossentropy": 2.267109489440918, + "loss/hidden": 3.1234375, + "loss/incoh": 0.0, + "loss/logits": 0.3404072761535645, + "loss/reg": 0.0, + "step": 32450 + }, + { + "epoch": 0.21355263157894736, + "grad_norm": 2.390625, + "grad_norm_var": 0.06378580729166666, + "learning_rate": 0.0001, + "loss": 2.9758, + "loss/crossentropy": 2.3441552996635435, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.24242025315761567, + "loss/reg": 0.0, + "step": 32460 + }, + { + "epoch": 0.21361842105263157, + "grad_norm": 2.46875, + "grad_norm_var": 0.0823883056640625, + "learning_rate": 0.0001, + "loss": 3.0308, + "loss/crossentropy": 2.2937331914901735, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.22603273689746856, + "loss/reg": 0.0, + "step": 32470 + }, + { + "epoch": 0.21368421052631578, + "grad_norm": 2.921875, + "grad_norm_var": 0.14341532389322917, + "learning_rate": 0.0001, + "loss": 3.0095, + "loss/crossentropy": 2.071279937028885, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.1986673153936863, + "loss/reg": 0.0, + "step": 32480 + }, + { + "epoch": 0.21375, + "grad_norm": 2.828125, + "grad_norm_var": 0.11785481770833334, + "learning_rate": 0.0001, + "loss": 3.0166, + "loss/crossentropy": 2.0449776887893676, + "loss/hidden": 2.9375, + "loss/incoh": 0.0, + "loss/logits": 0.2660417690873146, + "loss/reg": 0.0, + "step": 32490 + }, + { + "epoch": 0.2138157894736842, + "grad_norm": 2.5625, + "grad_norm_var": 0.07841389973958333, + "learning_rate": 0.0001, + "loss": 3.0209, + "loss/crossentropy": 2.22779695391655, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.2454911906272173, + "loss/reg": 0.0, + "step": 32500 + }, + { + "epoch": 0.21388157894736842, + "grad_norm": 2.28125, + "grad_norm_var": 0.22735087076822916, + "learning_rate": 0.0001, + "loss": 3.0449, + "loss/crossentropy": 2.1109942197799683, + "loss/hidden": 3.0140625, + "loss/incoh": 0.0, + "loss/logits": 0.25361132323741914, + "loss/reg": 0.0, + "step": 32510 + }, + { + "epoch": 0.21394736842105264, + "grad_norm": 2.109375, + "grad_norm_var": 0.21687825520833334, + "learning_rate": 0.0001, + "loss": 2.9946, + "loss/crossentropy": 2.594680833816528, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.24347000420093537, + "loss/reg": 0.0, + "step": 32520 + }, + { + "epoch": 0.21401315789473685, + "grad_norm": 2.25, + "grad_norm_var": 0.027469889322916666, + "learning_rate": 0.0001, + "loss": 2.935, + "loss/crossentropy": 2.4400468468666077, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.2244595929980278, + "loss/reg": 0.0, + "step": 32530 + }, + { + "epoch": 0.21407894736842106, + "grad_norm": 2.34375, + "grad_norm_var": 0.02568359375, + "learning_rate": 0.0001, + "loss": 2.9661, + "loss/crossentropy": 2.336955714225769, + "loss/hidden": 2.6265625, + "loss/incoh": 0.0, + "loss/logits": 0.1992204010486603, + "loss/reg": 0.0, + "step": 32540 + }, + { + "epoch": 0.21414473684210528, + "grad_norm": 2.109375, + "grad_norm_var": 0.051488240559895836, + "learning_rate": 0.0001, + "loss": 3.0259, + "loss/crossentropy": 2.2532342314720153, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.26036129891872406, + "loss/reg": 0.0, + "step": 32550 + }, + { + "epoch": 0.21421052631578946, + "grad_norm": 1.9765625, + "grad_norm_var": 0.03230158487955729, + "learning_rate": 0.0001, + "loss": 2.9443, + "loss/crossentropy": 2.112717604637146, + "loss/hidden": 2.5703125, + "loss/incoh": 0.0, + "loss/logits": 0.17500633224844933, + "loss/reg": 0.0, + "step": 32560 + }, + { + "epoch": 0.21427631578947368, + "grad_norm": 2.40625, + "grad_norm_var": 0.03321711222330729, + "learning_rate": 0.0001, + "loss": 3.0203, + "loss/crossentropy": 2.288592982292175, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.22093368023633958, + "loss/reg": 0.0, + "step": 32570 + }, + { + "epoch": 0.2143421052631579, + "grad_norm": 2.390625, + "grad_norm_var": 0.32156575520833336, + "learning_rate": 0.0001, + "loss": 2.9199, + "loss/crossentropy": 2.1705434560775756, + "loss/hidden": 2.709375, + "loss/incoh": 0.0, + "loss/logits": 0.20051559060811996, + "loss/reg": 0.0, + "step": 32580 + }, + { + "epoch": 0.2144078947368421, + "grad_norm": 2.359375, + "grad_norm_var": 0.06936442057291667, + "learning_rate": 0.0001, + "loss": 3.0023, + "loss/crossentropy": 2.0764291286468506, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.20810510888695716, + "loss/reg": 0.0, + "step": 32590 + }, + { + "epoch": 0.21447368421052632, + "grad_norm": 2.03125, + "grad_norm_var": 0.09537760416666667, + "learning_rate": 0.0001, + "loss": 2.9267, + "loss/crossentropy": 2.377918064594269, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.23365774154663085, + "loss/reg": 0.0, + "step": 32600 + }, + { + "epoch": 0.21453947368421053, + "grad_norm": 2.671875, + "grad_norm_var": 0.13495686848958333, + "learning_rate": 0.0001, + "loss": 3.0409, + "loss/crossentropy": 2.418037164211273, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.2420778289437294, + "loss/reg": 0.0, + "step": 32610 + }, + { + "epoch": 0.21460526315789474, + "grad_norm": 2.875, + "grad_norm_var": 0.37043863932291665, + "learning_rate": 0.0001, + "loss": 2.9946, + "loss/crossentropy": 2.272017073631287, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.213658407330513, + "loss/reg": 0.0, + "step": 32620 + }, + { + "epoch": 0.21467105263157896, + "grad_norm": 2.53125, + "grad_norm_var": 0.40995686848958335, + "learning_rate": 0.0001, + "loss": 2.9827, + "loss/crossentropy": 2.6424944162368775, + "loss/hidden": 2.66875, + "loss/incoh": 0.0, + "loss/logits": 0.2500556200742722, + "loss/reg": 0.0, + "step": 32630 + }, + { + "epoch": 0.21473684210526317, + "grad_norm": 2.234375, + "grad_norm_var": 6.259407552083333, + "learning_rate": 0.0001, + "loss": 3.0884, + "loss/crossentropy": 2.5622973680496215, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.23999259918928145, + "loss/reg": 0.0, + "step": 32640 + }, + { + "epoch": 0.21480263157894736, + "grad_norm": 6.65625, + "grad_norm_var": 1.3544270833333334, + "learning_rate": 0.0001, + "loss": 3.0648, + "loss/crossentropy": 2.322819399833679, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.288960862159729, + "loss/reg": 0.0, + "step": 32650 + }, + { + "epoch": 0.21486842105263157, + "grad_norm": 2.296875, + "grad_norm_var": 1.3127237955729167, + "learning_rate": 0.0001, + "loss": 3.0818, + "loss/crossentropy": 2.179678440093994, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.31069366484880445, + "loss/reg": 0.0, + "step": 32660 + }, + { + "epoch": 0.21493421052631578, + "grad_norm": 2.640625, + "grad_norm_var": 0.10913798014322916, + "learning_rate": 0.0001, + "loss": 3.0115, + "loss/crossentropy": 2.2929873704910277, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.24380201399326323, + "loss/reg": 0.0, + "step": 32670 + }, + { + "epoch": 0.215, + "grad_norm": 2.5, + "grad_norm_var": 0.05563151041666667, + "learning_rate": 0.0001, + "loss": 3.017, + "loss/crossentropy": 1.9826122522354126, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.18868125975131989, + "loss/reg": 0.0, + "step": 32680 + }, + { + "epoch": 0.2150657894736842, + "grad_norm": 2.21875, + "grad_norm_var": 0.08579813639322917, + "learning_rate": 0.0001, + "loss": 2.9559, + "loss/crossentropy": 2.4682058572769163, + "loss/hidden": 2.6921875, + "loss/incoh": 0.0, + "loss/logits": 0.22441071420907974, + "loss/reg": 0.0, + "step": 32690 + }, + { + "epoch": 0.21513157894736842, + "grad_norm": 2.265625, + "grad_norm_var": 0.07283503214518229, + "learning_rate": 0.0001, + "loss": 2.9565, + "loss/crossentropy": 2.255161941051483, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.2831642434000969, + "loss/reg": 0.0, + "step": 32700 + }, + { + "epoch": 0.21519736842105264, + "grad_norm": 2.25, + "grad_norm_var": 0.1724273681640625, + "learning_rate": 0.0001, + "loss": 3.0657, + "loss/crossentropy": 2.233939862251282, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.21798643767833709, + "loss/reg": 0.0, + "step": 32710 + }, + { + "epoch": 0.21526315789473685, + "grad_norm": 3.1875, + "grad_norm_var": 1.3363352457682292, + "learning_rate": 0.0001, + "loss": 3.0325, + "loss/crossentropy": 2.5652535796165465, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.2587194755673409, + "loss/reg": 0.0, + "step": 32720 + }, + { + "epoch": 0.21532894736842106, + "grad_norm": 2.484375, + "grad_norm_var": 0.130419921875, + "learning_rate": 0.0001, + "loss": 3.0003, + "loss/crossentropy": 2.1636557817459106, + "loss/hidden": 2.703125, + "loss/incoh": 0.0, + "loss/logits": 0.20836876183748246, + "loss/reg": 0.0, + "step": 32730 + }, + { + "epoch": 0.21539473684210525, + "grad_norm": 2.046875, + "grad_norm_var": 0.035302734375, + "learning_rate": 0.0001, + "loss": 2.9321, + "loss/crossentropy": 2.4362986207008364, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.24660978615283966, + "loss/reg": 0.0, + "step": 32740 + }, + { + "epoch": 0.21546052631578946, + "grad_norm": 2.5, + "grad_norm_var": 0.035105133056640626, + "learning_rate": 0.0001, + "loss": 2.9716, + "loss/crossentropy": 2.255325746536255, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.24538252800703048, + "loss/reg": 0.0, + "step": 32750 + }, + { + "epoch": 0.21552631578947368, + "grad_norm": 2.515625, + "grad_norm_var": 0.09314676920572916, + "learning_rate": 0.0001, + "loss": 3.0161, + "loss/crossentropy": 2.329508912563324, + "loss/hidden": 2.6359375, + "loss/incoh": 0.0, + "loss/logits": 0.21971380114555358, + "loss/reg": 0.0, + "step": 32760 + }, + { + "epoch": 0.2155921052631579, + "grad_norm": 2.625, + "grad_norm_var": 0.20466206868489584, + "learning_rate": 0.0001, + "loss": 2.963, + "loss/crossentropy": 2.262472677230835, + "loss/hidden": 2.7, + "loss/incoh": 0.0, + "loss/logits": 0.20472683310508727, + "loss/reg": 0.0, + "step": 32770 + }, + { + "epoch": 0.2156578947368421, + "grad_norm": 2.34375, + "grad_norm_var": 0.25344009399414064, + "learning_rate": 0.0001, + "loss": 3.0138, + "loss/crossentropy": 2.3205514192581176, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.23903629779815674, + "loss/reg": 0.0, + "step": 32780 + }, + { + "epoch": 0.21572368421052632, + "grad_norm": 3.09375, + "grad_norm_var": 2.3024370829264322, + "learning_rate": 0.0001, + "loss": 3.0763, + "loss/crossentropy": 2.315890896320343, + "loss/hidden": 2.85625, + "loss/incoh": 0.0, + "loss/logits": 0.24931414425373077, + "loss/reg": 0.0, + "step": 32790 + }, + { + "epoch": 0.21578947368421053, + "grad_norm": 2.53125, + "grad_norm_var": 0.34871317545572916, + "learning_rate": 0.0001, + "loss": 3.0259, + "loss/crossentropy": 2.2797498226165773, + "loss/hidden": 2.6109375, + "loss/incoh": 0.0, + "loss/logits": 0.20434827357530594, + "loss/reg": 0.0, + "step": 32800 + }, + { + "epoch": 0.21585526315789474, + "grad_norm": 2.203125, + "grad_norm_var": 0.021117146809895834, + "learning_rate": 0.0001, + "loss": 2.9999, + "loss/crossentropy": 2.095989489555359, + "loss/hidden": 3.10625, + "loss/incoh": 0.0, + "loss/logits": 0.2900803714990616, + "loss/reg": 0.0, + "step": 32810 + }, + { + "epoch": 0.21592105263157896, + "grad_norm": 2.453125, + "grad_norm_var": 0.06117121378580729, + "learning_rate": 0.0001, + "loss": 3.03, + "loss/crossentropy": 2.325262129306793, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.28796064853668213, + "loss/reg": 0.0, + "step": 32820 + }, + { + "epoch": 0.21598684210526317, + "grad_norm": 4.96875, + "grad_norm_var": 0.48582356770833335, + "learning_rate": 0.0001, + "loss": 2.9917, + "loss/crossentropy": 2.521669340133667, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.2139764204621315, + "loss/reg": 0.0, + "step": 32830 + }, + { + "epoch": 0.21605263157894736, + "grad_norm": 2.296875, + "grad_norm_var": 0.48948567708333335, + "learning_rate": 0.0001, + "loss": 2.9862, + "loss/crossentropy": 2.479197156429291, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.22702316492795943, + "loss/reg": 0.0, + "step": 32840 + }, + { + "epoch": 0.21611842105263157, + "grad_norm": 2.71875, + "grad_norm_var": 0.04804280598958333, + "learning_rate": 0.0001, + "loss": 2.9695, + "loss/crossentropy": 1.8314170956611633, + "loss/hidden": 2.66875, + "loss/incoh": 0.0, + "loss/logits": 0.19034175872802733, + "loss/reg": 0.0, + "step": 32850 + }, + { + "epoch": 0.21618421052631578, + "grad_norm": 3.15625, + "grad_norm_var": 0.12819798787434897, + "learning_rate": 0.0001, + "loss": 2.9729, + "loss/crossentropy": 2.3301822662353517, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.23585740774869918, + "loss/reg": 0.0, + "step": 32860 + }, + { + "epoch": 0.21625, + "grad_norm": 2.53125, + "grad_norm_var": 3.20617589974401e+17, + "learning_rate": 0.0001, + "loss": 3.0646, + "loss/crossentropy": 2.3723347544670106, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.22372013479471206, + "loss/reg": 0.0, + "step": 32870 + }, + { + "epoch": 0.2163157894736842, + "grad_norm": 2.484375, + "grad_norm_var": 3.2061758995523174e+17, + "learning_rate": 0.0001, + "loss": 3.0732, + "loss/crossentropy": 2.337008368968964, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.24527821093797683, + "loss/reg": 0.0, + "step": 32880 + }, + { + "epoch": 0.21638157894736842, + "grad_norm": 2.5625, + "grad_norm_var": 0.18332926432291666, + "learning_rate": 0.0001, + "loss": 3.029, + "loss/crossentropy": 2.056360971927643, + "loss/hidden": 2.678125, + "loss/incoh": 0.0, + "loss/logits": 0.20862108021974562, + "loss/reg": 0.0, + "step": 32890 + }, + { + "epoch": 0.21644736842105264, + "grad_norm": 2.359375, + "grad_norm_var": 0.2578277587890625, + "learning_rate": 0.0001, + "loss": 3.0287, + "loss/crossentropy": 2.425034213066101, + "loss/hidden": 2.6140625, + "loss/incoh": 0.0, + "loss/logits": 0.22822874337434768, + "loss/reg": 0.0, + "step": 32900 + }, + { + "epoch": 0.21651315789473685, + "grad_norm": 2.734375, + "grad_norm_var": 0.1828125, + "learning_rate": 0.0001, + "loss": 3.046, + "loss/crossentropy": 2.0453652262687685, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.2606233850121498, + "loss/reg": 0.0, + "step": 32910 + }, + { + "epoch": 0.21657894736842107, + "grad_norm": 2.453125, + "grad_norm_var": 0.1352203369140625, + "learning_rate": 0.0001, + "loss": 2.992, + "loss/crossentropy": 2.245816957950592, + "loss/hidden": 2.7984375, + "loss/incoh": 0.0, + "loss/logits": 0.2316955327987671, + "loss/reg": 0.0, + "step": 32920 + }, + { + "epoch": 0.21664473684210525, + "grad_norm": 2.359375, + "grad_norm_var": 0.11937255859375, + "learning_rate": 0.0001, + "loss": 2.8992, + "loss/crossentropy": 2.3973164916038514, + "loss/hidden": 2.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.21730958074331283, + "loss/reg": 0.0, + "step": 32930 + }, + { + "epoch": 0.21671052631578946, + "grad_norm": 2.265625, + "grad_norm_var": 0.16926167805989584, + "learning_rate": 0.0001, + "loss": 3.076, + "loss/crossentropy": 2.3131038188934325, + "loss/hidden": 3.0328125, + "loss/incoh": 0.0, + "loss/logits": 0.2888943269848824, + "loss/reg": 0.0, + "step": 32940 + }, + { + "epoch": 0.21677631578947368, + "grad_norm": 2.640625, + "grad_norm_var": 0.1410552978515625, + "learning_rate": 0.0001, + "loss": 3.0965, + "loss/crossentropy": 2.441923999786377, + "loss/hidden": 2.7625, + "loss/incoh": 0.0, + "loss/logits": 0.2226712241768837, + "loss/reg": 0.0, + "step": 32950 + }, + { + "epoch": 0.2168421052631579, + "grad_norm": 5.6875, + "grad_norm_var": 0.6874338785807291, + "learning_rate": 0.0001, + "loss": 3.0441, + "loss/crossentropy": 2.0053435921669007, + "loss/hidden": 2.71875, + "loss/incoh": 0.0, + "loss/logits": 0.20919274985790254, + "loss/reg": 0.0, + "step": 32960 + }, + { + "epoch": 0.2169078947368421, + "grad_norm": 2.03125, + "grad_norm_var": 0.7307291666666667, + "learning_rate": 0.0001, + "loss": 3.0422, + "loss/crossentropy": 2.2273124933242796, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2289419651031494, + "loss/reg": 0.0, + "step": 32970 + }, + { + "epoch": 0.21697368421052632, + "grad_norm": 2.328125, + "grad_norm_var": 0.0592681884765625, + "learning_rate": 0.0001, + "loss": 2.972, + "loss/crossentropy": 2.315923011302948, + "loss/hidden": 2.603125, + "loss/incoh": 0.0, + "loss/logits": 0.19858411103487014, + "loss/reg": 0.0, + "step": 32980 + }, + { + "epoch": 0.21703947368421053, + "grad_norm": 2.625, + "grad_norm_var": 0.06795145670572916, + "learning_rate": 0.0001, + "loss": 3.1097, + "loss/crossentropy": 2.0356208801269533, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.18648817762732506, + "loss/reg": 0.0, + "step": 32990 + }, + { + "epoch": 0.21710526315789475, + "grad_norm": 2.25, + "grad_norm_var": 0.087548828125, + "learning_rate": 0.0001, + "loss": 3.1037, + "loss/crossentropy": 2.4323093771934508, + "loss/hidden": 3.028125, + "loss/incoh": 0.0, + "loss/logits": 0.2574485644698143, + "loss/reg": 0.0, + "step": 33000 + }, + { + "epoch": 0.21717105263157896, + "grad_norm": 2.375, + "grad_norm_var": 0.08557535807291666, + "learning_rate": 0.0001, + "loss": 2.9955, + "loss/crossentropy": 2.2170889139175416, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.23443114012479782, + "loss/reg": 0.0, + "step": 33010 + }, + { + "epoch": 0.21723684210526314, + "grad_norm": 2.828125, + "grad_norm_var": 0.0577056884765625, + "learning_rate": 0.0001, + "loss": 3.0261, + "loss/crossentropy": 2.2761878967285156, + "loss/hidden": 2.85, + "loss/incoh": 0.0, + "loss/logits": 0.23024188429117204, + "loss/reg": 0.0, + "step": 33020 + }, + { + "epoch": 0.21730263157894736, + "grad_norm": 2.5, + "grad_norm_var": 0.041844685872395836, + "learning_rate": 0.0001, + "loss": 3.0025, + "loss/crossentropy": 2.281536602973938, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.2611322790384293, + "loss/reg": 0.0, + "step": 33030 + }, + { + "epoch": 0.21736842105263157, + "grad_norm": 2.34375, + "grad_norm_var": 2.2070302327473956, + "learning_rate": 0.0001, + "loss": 3.0541, + "loss/crossentropy": 2.285531198978424, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.27087974548339844, + "loss/reg": 0.0, + "step": 33040 + }, + { + "epoch": 0.21743421052631579, + "grad_norm": 2.1875, + "grad_norm_var": 2.3124013264973957, + "learning_rate": 0.0001, + "loss": 3.0046, + "loss/crossentropy": 2.1499021500349045, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.2288394898176193, + "loss/reg": 0.0, + "step": 33050 + }, + { + "epoch": 0.2175, + "grad_norm": 2.375, + "grad_norm_var": 0.185888671875, + "learning_rate": 0.0001, + "loss": 3.01, + "loss/crossentropy": 2.2262683510780334, + "loss/hidden": 2.71875, + "loss/incoh": 0.0, + "loss/logits": 0.21322729140520097, + "loss/reg": 0.0, + "step": 33060 + }, + { + "epoch": 0.2175657894736842, + "grad_norm": 2.078125, + "grad_norm_var": 0.19086278279622396, + "learning_rate": 0.0001, + "loss": 2.9547, + "loss/crossentropy": 2.512454855442047, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.22262426763772963, + "loss/reg": 0.0, + "step": 33070 + }, + { + "epoch": 0.21763157894736843, + "grad_norm": 2.296875, + "grad_norm_var": 0.06533584594726563, + "learning_rate": 0.0001, + "loss": 3.0191, + "loss/crossentropy": 2.4302139401435854, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.21147249341011048, + "loss/reg": 0.0, + "step": 33080 + }, + { + "epoch": 0.21769736842105264, + "grad_norm": 2.265625, + "grad_norm_var": 0.05804036458333333, + "learning_rate": 0.0001, + "loss": 2.9211, + "loss/crossentropy": 2.288132381439209, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.20329886302351952, + "loss/reg": 0.0, + "step": 33090 + }, + { + "epoch": 0.21776315789473685, + "grad_norm": 2.09375, + "grad_norm_var": 0.07825419108072916, + "learning_rate": 0.0001, + "loss": 2.9798, + "loss/crossentropy": 2.4942794919013975, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.22770892977714538, + "loss/reg": 0.0, + "step": 33100 + }, + { + "epoch": 0.21782894736842107, + "grad_norm": 2.078125, + "grad_norm_var": 0.46223551432291665, + "learning_rate": 0.0001, + "loss": 2.998, + "loss/crossentropy": 2.496278202533722, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.23407908529043198, + "loss/reg": 0.0, + "step": 33110 + }, + { + "epoch": 0.21789473684210525, + "grad_norm": 2.546875, + "grad_norm_var": 0.1009674072265625, + "learning_rate": 0.0001, + "loss": 3.0352, + "loss/crossentropy": 2.0836499094963075, + "loss/hidden": 2.740625, + "loss/incoh": 0.0, + "loss/logits": 0.22313288152217864, + "loss/reg": 0.0, + "step": 33120 + }, + { + "epoch": 0.21796052631578947, + "grad_norm": 2.1875, + "grad_norm_var": 0.0770660400390625, + "learning_rate": 0.0001, + "loss": 2.982, + "loss/crossentropy": 2.413359189033508, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.23794451206922532, + "loss/reg": 0.0, + "step": 33130 + }, + { + "epoch": 0.21802631578947368, + "grad_norm": 2.4375, + "grad_norm_var": 0.17366536458333334, + "learning_rate": 0.0001, + "loss": 2.9588, + "loss/crossentropy": 2.390314495563507, + "loss/hidden": 2.621875, + "loss/incoh": 0.0, + "loss/logits": 0.20836979001760483, + "loss/reg": 0.0, + "step": 33140 + }, + { + "epoch": 0.2180921052631579, + "grad_norm": 2.40625, + "grad_norm_var": 0.1214263916015625, + "learning_rate": 0.0001, + "loss": 3.0312, + "loss/crossentropy": 2.1217519342899323, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.22108341604471207, + "loss/reg": 0.0, + "step": 33150 + }, + { + "epoch": 0.2181578947368421, + "grad_norm": 2.359375, + "grad_norm_var": 0.023981730143229168, + "learning_rate": 0.0001, + "loss": 2.9862, + "loss/crossentropy": 2.3113851308822633, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.27378889471292495, + "loss/reg": 0.0, + "step": 33160 + }, + { + "epoch": 0.21822368421052632, + "grad_norm": 2.921875, + "grad_norm_var": 0.07545166015625, + "learning_rate": 0.0001, + "loss": 3.0085, + "loss/crossentropy": 2.366367816925049, + "loss/hidden": 2.68125, + "loss/incoh": 0.0, + "loss/logits": 0.20626646727323533, + "loss/reg": 0.0, + "step": 33170 + }, + { + "epoch": 0.21828947368421053, + "grad_norm": 2.734375, + "grad_norm_var": 0.08708394368489583, + "learning_rate": 0.0001, + "loss": 2.9855, + "loss/crossentropy": 2.2448646426200867, + "loss/hidden": 2.8296875, + "loss/incoh": 0.0, + "loss/logits": 0.2878586873412132, + "loss/reg": 0.0, + "step": 33180 + }, + { + "epoch": 0.21835526315789475, + "grad_norm": 2.15625, + "grad_norm_var": 0.0867828369140625, + "learning_rate": 0.0001, + "loss": 2.9798, + "loss/crossentropy": 2.2996390104293822, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.22233397513628006, + "loss/reg": 0.0, + "step": 33190 + }, + { + "epoch": 0.21842105263157896, + "grad_norm": 2.6875, + "grad_norm_var": 0.12769266764322917, + "learning_rate": 0.0001, + "loss": 3.0705, + "loss/crossentropy": 2.22303249835968, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.32309592962265016, + "loss/reg": 0.0, + "step": 33200 + }, + { + "epoch": 0.21848684210526315, + "grad_norm": 2.28125, + "grad_norm_var": 0.07171122233072917, + "learning_rate": 0.0001, + "loss": 3.0575, + "loss/crossentropy": 2.455464780330658, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.2447066068649292, + "loss/reg": 0.0, + "step": 33210 + }, + { + "epoch": 0.21855263157894736, + "grad_norm": 2.421875, + "grad_norm_var": 0.13964436848958334, + "learning_rate": 0.0001, + "loss": 3.0108, + "loss/crossentropy": 2.4379093527793883, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.20427114516496658, + "loss/reg": 0.0, + "step": 33220 + }, + { + "epoch": 0.21861842105263157, + "grad_norm": 2.53125, + "grad_norm_var": 0.14078776041666666, + "learning_rate": 0.0001, + "loss": 3.0186, + "loss/crossentropy": 2.3393636345863342, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.24139494746923446, + "loss/reg": 0.0, + "step": 33230 + }, + { + "epoch": 0.2186842105263158, + "grad_norm": 2.203125, + "grad_norm_var": 0.017975870768229166, + "learning_rate": 0.0001, + "loss": 3.0254, + "loss/crossentropy": 2.313693457841873, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.25786808133125305, + "loss/reg": 0.0, + "step": 33240 + }, + { + "epoch": 0.21875, + "grad_norm": 2.28125, + "grad_norm_var": 0.048111979166666666, + "learning_rate": 0.0001, + "loss": 2.9975, + "loss/crossentropy": 2.3757463097572327, + "loss/hidden": 2.6921875, + "loss/incoh": 0.0, + "loss/logits": 0.21639765352010726, + "loss/reg": 0.0, + "step": 33250 + }, + { + "epoch": 0.2188157894736842, + "grad_norm": 2.359375, + "grad_norm_var": 0.07258707682291667, + "learning_rate": 0.0001, + "loss": 2.9429, + "loss/crossentropy": 2.540412497520447, + "loss/hidden": 2.565625, + "loss/incoh": 0.0, + "loss/logits": 0.1985073819756508, + "loss/reg": 0.0, + "step": 33260 + }, + { + "epoch": 0.21888157894736843, + "grad_norm": 2.53125, + "grad_norm_var": 0.03837788899739583, + "learning_rate": 0.0001, + "loss": 3.0685, + "loss/crossentropy": 2.373577296733856, + "loss/hidden": 2.790625, + "loss/incoh": 0.0, + "loss/logits": 0.2263040378689766, + "loss/reg": 0.0, + "step": 33270 + }, + { + "epoch": 0.21894736842105264, + "grad_norm": 2.390625, + "grad_norm_var": 0.029686482747395833, + "learning_rate": 0.0001, + "loss": 2.9608, + "loss/crossentropy": 2.2283737421035767, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.23245082497596742, + "loss/reg": 0.0, + "step": 33280 + }, + { + "epoch": 0.21901315789473685, + "grad_norm": 2.328125, + "grad_norm_var": 0.09920145670572916, + "learning_rate": 0.0001, + "loss": 3.0048, + "loss/crossentropy": 2.2985653638839723, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.27912328094244004, + "loss/reg": 0.0, + "step": 33290 + }, + { + "epoch": 0.21907894736842104, + "grad_norm": 2.109375, + "grad_norm_var": 0.10875651041666666, + "learning_rate": 0.0001, + "loss": 3.005, + "loss/crossentropy": 2.342892110347748, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.2504061296582222, + "loss/reg": 0.0, + "step": 33300 + }, + { + "epoch": 0.21914473684210525, + "grad_norm": 2.546875, + "grad_norm_var": 0.04578348795572917, + "learning_rate": 0.0001, + "loss": 3.0108, + "loss/crossentropy": 2.337607777118683, + "loss/hidden": 2.6703125, + "loss/incoh": 0.0, + "loss/logits": 0.20644718632102013, + "loss/reg": 0.0, + "step": 33310 + }, + { + "epoch": 0.21921052631578947, + "grad_norm": 2.140625, + "grad_norm_var": 0.08603108723958333, + "learning_rate": 0.0001, + "loss": 2.9696, + "loss/crossentropy": 2.080331861972809, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.27878897339105607, + "loss/reg": 0.0, + "step": 33320 + }, + { + "epoch": 0.21927631578947368, + "grad_norm": 1.96875, + "grad_norm_var": 0.17981363932291666, + "learning_rate": 0.0001, + "loss": 3.0076, + "loss/crossentropy": 2.3555988073349, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.24901487827301025, + "loss/reg": 0.0, + "step": 33330 + }, + { + "epoch": 0.2193421052631579, + "grad_norm": 2.359375, + "grad_norm_var": 0.17130533854166666, + "learning_rate": 0.0001, + "loss": 3.0368, + "loss/crossentropy": 2.223631227016449, + "loss/hidden": 2.696875, + "loss/incoh": 0.0, + "loss/logits": 0.19968705028295516, + "loss/reg": 0.0, + "step": 33340 + }, + { + "epoch": 0.2194078947368421, + "grad_norm": 2.15625, + "grad_norm_var": 0.22228190104166667, + "learning_rate": 0.0001, + "loss": 3.0191, + "loss/crossentropy": 2.108024871349335, + "loss/hidden": 2.5875, + "loss/incoh": 0.0, + "loss/logits": 0.20175025016069412, + "loss/reg": 0.0, + "step": 33350 + }, + { + "epoch": 0.21947368421052632, + "grad_norm": 2.421875, + "grad_norm_var": 0.2382720947265625, + "learning_rate": 0.0001, + "loss": 3.0196, + "loss/crossentropy": 2.2824084401130675, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.20648818761110305, + "loss/reg": 0.0, + "step": 33360 + }, + { + "epoch": 0.21953947368421053, + "grad_norm": 2.234375, + "grad_norm_var": 0.06386311848958333, + "learning_rate": 0.0001, + "loss": 2.983, + "loss/crossentropy": 2.193799388408661, + "loss/hidden": 2.6, + "loss/incoh": 0.0, + "loss/logits": 0.20398449748754502, + "loss/reg": 0.0, + "step": 33370 + }, + { + "epoch": 0.21960526315789475, + "grad_norm": 2.84375, + "grad_norm_var": 0.21061375935872395, + "learning_rate": 0.0001, + "loss": 3.0511, + "loss/crossentropy": 2.4952160239219667, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.21543749868869783, + "loss/reg": 0.0, + "step": 33380 + }, + { + "epoch": 0.21967105263157893, + "grad_norm": 2.515625, + "grad_norm_var": 0.6498410542805989, + "learning_rate": 0.0001, + "loss": 3.0987, + "loss/crossentropy": 2.4116694569587707, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.2724195793271065, + "loss/reg": 0.0, + "step": 33390 + }, + { + "epoch": 0.21973684210526315, + "grad_norm": 2.0625, + "grad_norm_var": 0.23567072550455728, + "learning_rate": 0.0001, + "loss": 2.9416, + "loss/crossentropy": 2.597557008266449, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.2337816461920738, + "loss/reg": 0.0, + "step": 33400 + }, + { + "epoch": 0.21980263157894736, + "grad_norm": 2.109375, + "grad_norm_var": 0.03090387980143229, + "learning_rate": 0.0001, + "loss": 2.9881, + "loss/crossentropy": 2.446978306770325, + "loss/hidden": 2.609375, + "loss/incoh": 0.0, + "loss/logits": 0.2176991730928421, + "loss/reg": 0.0, + "step": 33410 + }, + { + "epoch": 0.21986842105263157, + "grad_norm": 2.140625, + "grad_norm_var": 0.0204742431640625, + "learning_rate": 0.0001, + "loss": 3.0409, + "loss/crossentropy": 2.0214170694351195, + "loss/hidden": 3.0140625, + "loss/incoh": 0.0, + "loss/logits": 0.23912209570407866, + "loss/reg": 0.0, + "step": 33420 + }, + { + "epoch": 0.2199342105263158, + "grad_norm": 2.234375, + "grad_norm_var": 0.01568603515625, + "learning_rate": 0.0001, + "loss": 3.042, + "loss/crossentropy": 2.4408559799194336, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.23146803677082062, + "loss/reg": 0.0, + "step": 33430 + }, + { + "epoch": 0.22, + "grad_norm": 2.71875, + "grad_norm_var": 0.05896809895833333, + "learning_rate": 0.0001, + "loss": 3.0667, + "loss/crossentropy": 2.699678826332092, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.250095309317112, + "loss/reg": 0.0, + "step": 33440 + }, + { + "epoch": 0.22006578947368421, + "grad_norm": 2.453125, + "grad_norm_var": 0.14205322265625, + "learning_rate": 0.0001, + "loss": 3.0448, + "loss/crossentropy": 2.2925549387931823, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.24470344185829163, + "loss/reg": 0.0, + "step": 33450 + }, + { + "epoch": 0.22013157894736843, + "grad_norm": 2.25, + "grad_norm_var": 3.588060506184896, + "learning_rate": 0.0001, + "loss": 3.1012, + "loss/crossentropy": 2.3695211172103883, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.24156490415334703, + "loss/reg": 0.0, + "step": 33460 + }, + { + "epoch": 0.22019736842105264, + "grad_norm": 2382364672.0, + "grad_norm_var": 3.5472883849629094e+17, + "learning_rate": 0.0001, + "loss": 3.1654, + "loss/crossentropy": 2.488659930229187, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.21741390377283096, + "loss/reg": 0.0, + "step": 33470 + }, + { + "epoch": 0.22026315789473686, + "grad_norm": 2.15625, + "grad_norm_var": 3.547288386504622e+17, + "learning_rate": 0.0001, + "loss": 3.0377, + "loss/crossentropy": 2.2884308457374574, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.231295208632946, + "loss/reg": 0.0, + "step": 33480 + }, + { + "epoch": 0.22032894736842104, + "grad_norm": 2.421875, + "grad_norm_var": 0.049658203125, + "learning_rate": 0.0001, + "loss": 3.06, + "loss/crossentropy": 2.163191545009613, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.22781991362571716, + "loss/reg": 0.0, + "step": 33490 + }, + { + "epoch": 0.22039473684210525, + "grad_norm": 2.328125, + "grad_norm_var": 0.08287353515625, + "learning_rate": 0.0001, + "loss": 2.9897, + "loss/crossentropy": 2.2403807282447814, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.23052917122840882, + "loss/reg": 0.0, + "step": 33500 + }, + { + "epoch": 0.22046052631578947, + "grad_norm": 2.328125, + "grad_norm_var": 0.4391029357910156, + "learning_rate": 0.0001, + "loss": 2.9359, + "loss/crossentropy": 2.215052628517151, + "loss/hidden": 2.6265625, + "loss/incoh": 0.0, + "loss/logits": 0.17493754550814627, + "loss/reg": 0.0, + "step": 33510 + }, + { + "epoch": 0.22052631578947368, + "grad_norm": 2.03125, + "grad_norm_var": 1.1144650777180989, + "learning_rate": 0.0001, + "loss": 3.0608, + "loss/crossentropy": 2.0380016922950746, + "loss/hidden": 2.90625, + "loss/incoh": 0.0, + "loss/logits": 0.2704147264361382, + "loss/reg": 0.0, + "step": 33520 + }, + { + "epoch": 0.2205921052631579, + "grad_norm": 2.109375, + "grad_norm_var": 1.154686482747396, + "learning_rate": 0.0001, + "loss": 3.0527, + "loss/crossentropy": 2.5095550060272216, + "loss/hidden": 2.9453125, + "loss/incoh": 0.0, + "loss/logits": 0.2915314584970474, + "loss/reg": 0.0, + "step": 33530 + }, + { + "epoch": 0.2206578947368421, + "grad_norm": 2.21875, + "grad_norm_var": 0.4706939697265625, + "learning_rate": 0.0001, + "loss": 2.9389, + "loss/crossentropy": 2.175918400287628, + "loss/hidden": 2.59375, + "loss/incoh": 0.0, + "loss/logits": 0.19029978811740875, + "loss/reg": 0.0, + "step": 33540 + }, + { + "epoch": 0.22072368421052632, + "grad_norm": 2.296875, + "grad_norm_var": 0.13570556640625, + "learning_rate": 0.0001, + "loss": 3.1049, + "loss/crossentropy": 2.1491673469543455, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.28144592940807345, + "loss/reg": 0.0, + "step": 33550 + }, + { + "epoch": 0.22078947368421054, + "grad_norm": 2.265625, + "grad_norm_var": 0.20730692545572918, + "learning_rate": 0.0001, + "loss": 3.0324, + "loss/crossentropy": 2.10293396115303, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.20049710869789122, + "loss/reg": 0.0, + "step": 33560 + }, + { + "epoch": 0.22085526315789475, + "grad_norm": 2.40625, + "grad_norm_var": 0.20046284993489583, + "learning_rate": 0.0001, + "loss": 3.082, + "loss/crossentropy": 2.4584061741828918, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.235392826795578, + "loss/reg": 0.0, + "step": 33570 + }, + { + "epoch": 0.22092105263157893, + "grad_norm": 2.265625, + "grad_norm_var": 0.0829742431640625, + "learning_rate": 0.0001, + "loss": 3.0276, + "loss/crossentropy": 2.107621490955353, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.2272868797183037, + "loss/reg": 0.0, + "step": 33580 + }, + { + "epoch": 0.22098684210526315, + "grad_norm": 2.484375, + "grad_norm_var": 0.25580952962239584, + "learning_rate": 0.0001, + "loss": 3.0614, + "loss/crossentropy": 2.120447838306427, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.19922292605042458, + "loss/reg": 0.0, + "step": 33590 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 2.078125, + "grad_norm_var": 0.027180989583333332, + "learning_rate": 0.0001, + "loss": 2.9857, + "loss/crossentropy": 2.192478084564209, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2926119461655617, + "loss/reg": 0.0, + "step": 33600 + }, + { + "epoch": 0.22111842105263158, + "grad_norm": 2.09375, + "grad_norm_var": 0.07001851399739584, + "learning_rate": 0.0001, + "loss": 3.0663, + "loss/crossentropy": 2.4031628251075743, + "loss/hidden": 2.6421875, + "loss/incoh": 0.0, + "loss/logits": 0.21948793083429335, + "loss/reg": 0.0, + "step": 33610 + }, + { + "epoch": 0.2211842105263158, + "grad_norm": 3.671875, + "grad_norm_var": 0.183837890625, + "learning_rate": 0.0001, + "loss": 3.0731, + "loss/crossentropy": 2.2898489594459535, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.2655272573232651, + "loss/reg": 0.0, + "step": 33620 + }, + { + "epoch": 0.22125, + "grad_norm": 2.25, + "grad_norm_var": 1.4440388997395834, + "learning_rate": 0.0001, + "loss": 3.0888, + "loss/crossentropy": 2.334256184101105, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.20996386408805848, + "loss/reg": 0.0, + "step": 33630 + }, + { + "epoch": 0.22131578947368422, + "grad_norm": 2.375, + "grad_norm_var": 0.04494527180989583, + "learning_rate": 0.0001, + "loss": 3.0546, + "loss/crossentropy": 2.3563809156417848, + "loss/hidden": 3.0484375, + "loss/incoh": 0.0, + "loss/logits": 0.29970877766609194, + "loss/reg": 0.0, + "step": 33640 + }, + { + "epoch": 0.22138157894736843, + "grad_norm": 2.28125, + "grad_norm_var": 0.040283203125, + "learning_rate": 0.0001, + "loss": 3.0316, + "loss/crossentropy": 2.469383955001831, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.22515997290611267, + "loss/reg": 0.0, + "step": 33650 + }, + { + "epoch": 0.22144736842105264, + "grad_norm": 2.21875, + "grad_norm_var": 0.0654449462890625, + "learning_rate": 0.0001, + "loss": 3.1037, + "loss/crossentropy": 2.4161542892456054, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.22671497017145156, + "loss/reg": 0.0, + "step": 33660 + }, + { + "epoch": 0.22151315789473683, + "grad_norm": 3.953125, + "grad_norm_var": 0.27838134765625, + "learning_rate": 0.0001, + "loss": 3.0341, + "loss/crossentropy": 2.415839672088623, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.2382939413189888, + "loss/reg": 0.0, + "step": 33670 + }, + { + "epoch": 0.22157894736842104, + "grad_norm": 2.203125, + "grad_norm_var": 0.9553260803222656, + "learning_rate": 0.0001, + "loss": 3.0584, + "loss/crossentropy": 2.353093123435974, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.276655937731266, + "loss/reg": 0.0, + "step": 33680 + }, + { + "epoch": 0.22164473684210526, + "grad_norm": 2.421875, + "grad_norm_var": 0.8398048400878906, + "learning_rate": 0.0001, + "loss": 3.0327, + "loss/crossentropy": 2.2812642812728883, + "loss/hidden": 2.684375, + "loss/incoh": 0.0, + "loss/logits": 0.20394409000873565, + "loss/reg": 0.0, + "step": 33690 + }, + { + "epoch": 0.22171052631578947, + "grad_norm": 1.859375, + "grad_norm_var": 0.10286051432291667, + "learning_rate": 0.0001, + "loss": 3.0121, + "loss/crossentropy": 2.2089013338088987, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.2560999572277069, + "loss/reg": 0.0, + "step": 33700 + }, + { + "epoch": 0.22177631578947368, + "grad_norm": 2.453125, + "grad_norm_var": 0.224267578125, + "learning_rate": 0.0001, + "loss": 2.9909, + "loss/crossentropy": 2.579922378063202, + "loss/hidden": 2.75625, + "loss/incoh": 0.0, + "loss/logits": 0.2395647093653679, + "loss/reg": 0.0, + "step": 33710 + }, + { + "epoch": 0.2218421052631579, + "grad_norm": 2.28125, + "grad_norm_var": 0.2459307352701823, + "learning_rate": 0.0001, + "loss": 2.9947, + "loss/crossentropy": 2.269551432132721, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.22979858070611953, + "loss/reg": 0.0, + "step": 33720 + }, + { + "epoch": 0.2219078947368421, + "grad_norm": 2.0, + "grad_norm_var": 0.0810455322265625, + "learning_rate": 0.0001, + "loss": 3.0223, + "loss/crossentropy": 2.5746334075927733, + "loss/hidden": 2.675, + "loss/incoh": 0.0, + "loss/logits": 0.237695574760437, + "loss/reg": 0.0, + "step": 33730 + }, + { + "epoch": 0.22197368421052632, + "grad_norm": 2.5, + "grad_norm_var": 0.0932281494140625, + "learning_rate": 0.0001, + "loss": 3.0095, + "loss/crossentropy": 2.111787271499634, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.21114924997091294, + "loss/reg": 0.0, + "step": 33740 + }, + { + "epoch": 0.22203947368421054, + "grad_norm": 2.390625, + "grad_norm_var": 0.1626129150390625, + "learning_rate": 0.0001, + "loss": 3.0218, + "loss/crossentropy": 2.392720115184784, + "loss/hidden": 2.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.19515267461538316, + "loss/reg": 0.0, + "step": 33750 + }, + { + "epoch": 0.22210526315789475, + "grad_norm": 2.09375, + "grad_norm_var": 0.049347941080729166, + "learning_rate": 0.0001, + "loss": 3.0357, + "loss/crossentropy": 2.4739153146743775, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.3125626042485237, + "loss/reg": 0.0, + "step": 33760 + }, + { + "epoch": 0.22217105263157894, + "grad_norm": 1.9453125, + "grad_norm_var": 0.1904070536295573, + "learning_rate": 0.0001, + "loss": 2.974, + "loss/crossentropy": 2.290574276447296, + "loss/hidden": 2.8140625, + "loss/incoh": 0.0, + "loss/logits": 0.23448731899261474, + "loss/reg": 0.0, + "step": 33770 + }, + { + "epoch": 0.22223684210526315, + "grad_norm": 2.125, + "grad_norm_var": 0.13733495076497396, + "learning_rate": 0.0001, + "loss": 2.9673, + "loss/crossentropy": 2.317961239814758, + "loss/hidden": 2.6984375, + "loss/incoh": 0.0, + "loss/logits": 0.21905806362628938, + "loss/reg": 0.0, + "step": 33780 + }, + { + "epoch": 0.22230263157894736, + "grad_norm": 2.390625, + "grad_norm_var": 0.16253255208333334, + "learning_rate": 0.0001, + "loss": 3.0608, + "loss/crossentropy": 2.170513927936554, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.20969858914613723, + "loss/reg": 0.0, + "step": 33790 + }, + { + "epoch": 0.22236842105263158, + "grad_norm": 2.296875, + "grad_norm_var": 0.20142822265625, + "learning_rate": 0.0001, + "loss": 3.0502, + "loss/crossentropy": 2.3547741293907167, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.21735511124134063, + "loss/reg": 0.0, + "step": 33800 + }, + { + "epoch": 0.2224342105263158, + "grad_norm": 2.53125, + "grad_norm_var": 0.62681884765625, + "learning_rate": 0.0001, + "loss": 2.9739, + "loss/crossentropy": 2.3395539283752442, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.20967817306518555, + "loss/reg": 0.0, + "step": 33810 + }, + { + "epoch": 0.2225, + "grad_norm": 2.09375, + "grad_norm_var": 0.0564117431640625, + "learning_rate": 0.0001, + "loss": 2.9871, + "loss/crossentropy": 2.294986367225647, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.2147073432803154, + "loss/reg": 0.0, + "step": 33820 + }, + { + "epoch": 0.22256578947368422, + "grad_norm": 2.09375, + "grad_norm_var": 0.07506103515625, + "learning_rate": 0.0001, + "loss": 2.9679, + "loss/crossentropy": 2.257847762107849, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.30166392475366594, + "loss/reg": 0.0, + "step": 33830 + }, + { + "epoch": 0.22263157894736843, + "grad_norm": 3.40625, + "grad_norm_var": 0.17619527180989583, + "learning_rate": 0.0001, + "loss": 2.9993, + "loss/crossentropy": 2.2673079133033753, + "loss/hidden": 2.646875, + "loss/incoh": 0.0, + "loss/logits": 0.21112384274601936, + "loss/reg": 0.0, + "step": 33840 + }, + { + "epoch": 0.22269736842105264, + "grad_norm": 5.0, + "grad_norm_var": 0.5702311197916666, + "learning_rate": 0.0001, + "loss": 3.0953, + "loss/crossentropy": 2.632491409778595, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.23320973068475723, + "loss/reg": 0.0, + "step": 33850 + }, + { + "epoch": 0.22276315789473683, + "grad_norm": 2.375, + "grad_norm_var": 0.7403310139973959, + "learning_rate": 0.0001, + "loss": 3.0625, + "loss/crossentropy": 2.331642270088196, + "loss/hidden": 2.88125, + "loss/incoh": 0.0, + "loss/logits": 0.23803979456424712, + "loss/reg": 0.0, + "step": 33860 + }, + { + "epoch": 0.22282894736842104, + "grad_norm": 2.21875, + "grad_norm_var": 0.041890462239583336, + "learning_rate": 0.0001, + "loss": 3.034, + "loss/crossentropy": 2.2647625207901, + "loss/hidden": 2.7203125, + "loss/incoh": 0.0, + "loss/logits": 0.20834072977304458, + "loss/reg": 0.0, + "step": 33870 + }, + { + "epoch": 0.22289473684210526, + "grad_norm": 2.3125, + "grad_norm_var": 0.0492095947265625, + "learning_rate": 0.0001, + "loss": 3.0249, + "loss/crossentropy": 2.259378707408905, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.21118512228131295, + "loss/reg": 0.0, + "step": 33880 + }, + { + "epoch": 0.22296052631578947, + "grad_norm": 2.875, + "grad_norm_var": 0.10131810506184896, + "learning_rate": 0.0001, + "loss": 3.0156, + "loss/crossentropy": 2.31290363073349, + "loss/hidden": 2.63125, + "loss/incoh": 0.0, + "loss/logits": 0.2051762267947197, + "loss/reg": 0.0, + "step": 33890 + }, + { + "epoch": 0.22302631578947368, + "grad_norm": 2.15625, + "grad_norm_var": 0.11342544555664062, + "learning_rate": 0.0001, + "loss": 3.0365, + "loss/crossentropy": 1.8784321069717407, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.21909460723400115, + "loss/reg": 0.0, + "step": 33900 + }, + { + "epoch": 0.2230921052631579, + "grad_norm": 2.34375, + "grad_norm_var": 0.05671284993489583, + "learning_rate": 0.0001, + "loss": 3.016, + "loss/crossentropy": 2.3022819638252257, + "loss/hidden": 2.7265625, + "loss/incoh": 0.0, + "loss/logits": 0.22228346467018129, + "loss/reg": 0.0, + "step": 33910 + }, + { + "epoch": 0.2231578947368421, + "grad_norm": 2.296875, + "grad_norm_var": 0.23657938639322917, + "learning_rate": 0.0001, + "loss": 3.0624, + "loss/crossentropy": 2.335461437702179, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.23024386763572693, + "loss/reg": 0.0, + "step": 33920 + }, + { + "epoch": 0.22322368421052632, + "grad_norm": 2.40625, + "grad_norm_var": 0.18298238118489582, + "learning_rate": 0.0001, + "loss": 3.1342, + "loss/crossentropy": 2.149783802032471, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.24487016648054122, + "loss/reg": 0.0, + "step": 33930 + }, + { + "epoch": 0.22328947368421054, + "grad_norm": 2.625, + "grad_norm_var": 0.07650731404622396, + "learning_rate": 0.0001, + "loss": 3.0341, + "loss/crossentropy": 2.248632514476776, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.22311466187238693, + "loss/reg": 0.0, + "step": 33940 + }, + { + "epoch": 0.22335526315789472, + "grad_norm": 2.140625, + "grad_norm_var": 0.0645416259765625, + "learning_rate": 0.0001, + "loss": 3.1104, + "loss/crossentropy": 2.367274534702301, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.2507220461964607, + "loss/reg": 0.0, + "step": 33950 + }, + { + "epoch": 0.22342105263157894, + "grad_norm": 2.90625, + "grad_norm_var": 0.05906473795572917, + "learning_rate": 0.0001, + "loss": 3.0713, + "loss/crossentropy": 2.3802406549453736, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.23559605777263642, + "loss/reg": 0.0, + "step": 33960 + }, + { + "epoch": 0.22348684210526315, + "grad_norm": 2.5625, + "grad_norm_var": 0.045979817708333336, + "learning_rate": 0.0001, + "loss": 3.0398, + "loss/crossentropy": 2.283581781387329, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.22127386182546616, + "loss/reg": 0.0, + "step": 33970 + }, + { + "epoch": 0.22355263157894736, + "grad_norm": 2.34375, + "grad_norm_var": 0.0328277587890625, + "learning_rate": 0.0001, + "loss": 3.0036, + "loss/crossentropy": 2.2841804146766664, + "loss/hidden": 2.9328125, + "loss/incoh": 0.0, + "loss/logits": 0.29590369313955306, + "loss/reg": 0.0, + "step": 33980 + }, + { + "epoch": 0.22361842105263158, + "grad_norm": 2.28125, + "grad_norm_var": 0.0925201416015625, + "learning_rate": 0.0001, + "loss": 3.0846, + "loss/crossentropy": 2.1158400774002075, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.21308634355664252, + "loss/reg": 0.0, + "step": 33990 + }, + { + "epoch": 0.2236842105263158, + "grad_norm": 2.203125, + "grad_norm_var": 0.1506744384765625, + "learning_rate": 0.0001, + "loss": 3.081, + "loss/crossentropy": 2.3101623356342316, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.22568342536687852, + "loss/reg": 0.0, + "step": 34000 + }, + { + "epoch": 0.22375, + "grad_norm": 3.28125, + "grad_norm_var": 0.21297200520833334, + "learning_rate": 0.0001, + "loss": 3.0695, + "loss/crossentropy": 2.493696868419647, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.32419712096452713, + "loss/reg": 0.0, + "step": 34010 + }, + { + "epoch": 0.22381578947368422, + "grad_norm": 2.453125, + "grad_norm_var": 0.470361328125, + "learning_rate": 0.0001, + "loss": 3.0831, + "loss/crossentropy": 2.181097960472107, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.21221623122692107, + "loss/reg": 0.0, + "step": 34020 + }, + { + "epoch": 0.22388157894736843, + "grad_norm": 2.28125, + "grad_norm_var": 0.4900716145833333, + "learning_rate": 0.0001, + "loss": 2.9904, + "loss/crossentropy": 2.2486413717269897, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.2214418590068817, + "loss/reg": 0.0, + "step": 34030 + }, + { + "epoch": 0.22394736842105264, + "grad_norm": 2.4375, + "grad_norm_var": 0.026363118489583334, + "learning_rate": 0.0001, + "loss": 3.0049, + "loss/crossentropy": 2.058054494857788, + "loss/hidden": 2.678125, + "loss/incoh": 0.0, + "loss/logits": 0.20110160410404204, + "loss/reg": 0.0, + "step": 34040 + }, + { + "epoch": 0.22401315789473683, + "grad_norm": 2.109375, + "grad_norm_var": 0.8867828369140625, + "learning_rate": 0.0001, + "loss": 3.0557, + "loss/crossentropy": 2.183249998092651, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.3117926768958569, + "loss/reg": 0.0, + "step": 34050 + }, + { + "epoch": 0.22407894736842104, + "grad_norm": 2.625, + "grad_norm_var": 0.73414306640625, + "learning_rate": 0.0001, + "loss": 3.0466, + "loss/crossentropy": 2.2317178010940553, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.23523171246051788, + "loss/reg": 0.0, + "step": 34060 + }, + { + "epoch": 0.22414473684210526, + "grad_norm": 2.078125, + "grad_norm_var": 0.11988525390625, + "learning_rate": 0.0001, + "loss": 3.0361, + "loss/crossentropy": 2.1958236932754516, + "loss/hidden": 2.7984375, + "loss/incoh": 0.0, + "loss/logits": 0.2175719380378723, + "loss/reg": 0.0, + "step": 34070 + }, + { + "epoch": 0.22421052631578947, + "grad_norm": 2.578125, + "grad_norm_var": 0.08212483723958333, + "learning_rate": 0.0001, + "loss": 3.0053, + "loss/crossentropy": 2.3172508239746095, + "loss/hidden": 2.640625, + "loss/incoh": 0.0, + "loss/logits": 0.21442833691835403, + "loss/reg": 0.0, + "step": 34080 + }, + { + "epoch": 0.22427631578947368, + "grad_norm": 2.03125, + "grad_norm_var": 0.13967666625976563, + "learning_rate": 0.0001, + "loss": 2.9913, + "loss/crossentropy": 2.244894874095917, + "loss/hidden": 2.6546875, + "loss/incoh": 0.0, + "loss/logits": 0.19675317481160165, + "loss/reg": 0.0, + "step": 34090 + }, + { + "epoch": 0.2243421052631579, + "grad_norm": 2.375, + "grad_norm_var": 0.16440200805664062, + "learning_rate": 0.0001, + "loss": 3.046, + "loss/crossentropy": 2.256273639202118, + "loss/hidden": 2.6890625, + "loss/incoh": 0.0, + "loss/logits": 0.22018496468663215, + "loss/reg": 0.0, + "step": 34100 + }, + { + "epoch": 0.2244078947368421, + "grad_norm": 2.046875, + "grad_norm_var": 0.15823160807291667, + "learning_rate": 0.0001, + "loss": 2.9706, + "loss/crossentropy": 2.450822639465332, + "loss/hidden": 2.896875, + "loss/incoh": 0.0, + "loss/logits": 0.31932070925831796, + "loss/reg": 0.0, + "step": 34110 + }, + { + "epoch": 0.22447368421052633, + "grad_norm": 2.484375, + "grad_norm_var": 0.212939453125, + "learning_rate": 0.0001, + "loss": 3.0805, + "loss/crossentropy": 2.363582265377045, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.24354881346225737, + "loss/reg": 0.0, + "step": 34120 + }, + { + "epoch": 0.22453947368421054, + "grad_norm": 2.125, + "grad_norm_var": 0.07653706868489583, + "learning_rate": 0.0001, + "loss": 3.0054, + "loss/crossentropy": 2.4329180479049684, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.2980032041668892, + "loss/reg": 0.0, + "step": 34130 + }, + { + "epoch": 0.22460526315789472, + "grad_norm": 2.640625, + "grad_norm_var": 0.14047012329101563, + "learning_rate": 0.0001, + "loss": 3.0115, + "loss/crossentropy": 2.339194095134735, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.22101039290428162, + "loss/reg": 0.0, + "step": 34140 + }, + { + "epoch": 0.22467105263157894, + "grad_norm": 2.296875, + "grad_norm_var": 0.14289957682291668, + "learning_rate": 0.0001, + "loss": 2.9833, + "loss/crossentropy": 2.1932947993278504, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.2560951545834541, + "loss/reg": 0.0, + "step": 34150 + }, + { + "epoch": 0.22473684210526315, + "grad_norm": 2.0, + "grad_norm_var": 0.11972249348958333, + "learning_rate": 0.0001, + "loss": 3.0913, + "loss/crossentropy": 2.341690111160278, + "loss/hidden": 2.6296875, + "loss/incoh": 0.0, + "loss/logits": 0.21807242110371589, + "loss/reg": 0.0, + "step": 34160 + }, + { + "epoch": 0.22480263157894737, + "grad_norm": 2.625, + "grad_norm_var": 0.039937337239583336, + "learning_rate": 0.0001, + "loss": 3.0886, + "loss/crossentropy": 1.9180511385202408, + "loss/hidden": 2.9625, + "loss/incoh": 0.0, + "loss/logits": 0.2749033223837614, + "loss/reg": 0.0, + "step": 34170 + }, + { + "epoch": 0.22486842105263158, + "grad_norm": 2.140625, + "grad_norm_var": 0.0544586181640625, + "learning_rate": 0.0001, + "loss": 3.0813, + "loss/crossentropy": 2.0566360354423523, + "loss/hidden": 2.9953125, + "loss/incoh": 0.0, + "loss/logits": 0.22760727405548095, + "loss/reg": 0.0, + "step": 34180 + }, + { + "epoch": 0.2249342105263158, + "grad_norm": 2.25, + "grad_norm_var": 0.0463043212890625, + "learning_rate": 0.0001, + "loss": 3.0512, + "loss/crossentropy": 2.291218078136444, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.23884439915418626, + "loss/reg": 0.0, + "step": 34190 + }, + { + "epoch": 0.225, + "grad_norm": 2.171875, + "grad_norm_var": 0.04059956868489583, + "learning_rate": 0.0001, + "loss": 3.0016, + "loss/crossentropy": 2.409184718132019, + "loss/hidden": 2.646875, + "loss/incoh": 0.0, + "loss/logits": 0.22745371460914612, + "loss/reg": 0.0, + "step": 34200 + }, + { + "epoch": 0.22506578947368422, + "grad_norm": 2.40625, + "grad_norm_var": 0.028944651285807293, + "learning_rate": 0.0001, + "loss": 3.0011, + "loss/crossentropy": 2.541308951377869, + "loss/hidden": 2.5921875, + "loss/incoh": 0.0, + "loss/logits": 0.19628863781690598, + "loss/reg": 0.0, + "step": 34210 + }, + { + "epoch": 0.22513157894736843, + "grad_norm": 2.796875, + "grad_norm_var": 0.080078125, + "learning_rate": 0.0001, + "loss": 3.0556, + "loss/crossentropy": 1.890171855688095, + "loss/hidden": 2.975, + "loss/incoh": 0.0, + "loss/logits": 0.23669045567512512, + "loss/reg": 0.0, + "step": 34220 + }, + { + "epoch": 0.22519736842105262, + "grad_norm": 2.28125, + "grad_norm_var": 0.031754557291666666, + "learning_rate": 0.0001, + "loss": 3.0229, + "loss/crossentropy": 1.9730781435966491, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.2429889589548111, + "loss/reg": 0.0, + "step": 34230 + }, + { + "epoch": 0.22526315789473683, + "grad_norm": 2.234375, + "grad_norm_var": 0.12756754557291666, + "learning_rate": 0.0001, + "loss": 3.0908, + "loss/crossentropy": 2.430511474609375, + "loss/hidden": 2.928125, + "loss/incoh": 0.0, + "loss/logits": 0.26906117498874665, + "loss/reg": 0.0, + "step": 34240 + }, + { + "epoch": 0.22532894736842105, + "grad_norm": 3.5625, + "grad_norm_var": 0.36015625, + "learning_rate": 0.0001, + "loss": 3.1105, + "loss/crossentropy": 2.387774920463562, + "loss/hidden": 2.9265625, + "loss/incoh": 0.0, + "loss/logits": 0.23492622524499893, + "loss/reg": 0.0, + "step": 34250 + }, + { + "epoch": 0.22539473684210526, + "grad_norm": 2.546875, + "grad_norm_var": 0.5035804748535156, + "learning_rate": 0.0001, + "loss": 2.9848, + "loss/crossentropy": 2.3150025963783265, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.22484272122383117, + "loss/reg": 0.0, + "step": 34260 + }, + { + "epoch": 0.22546052631578947, + "grad_norm": 2.53125, + "grad_norm_var": 0.47118504842122394, + "learning_rate": 0.0001, + "loss": 2.9996, + "loss/crossentropy": 2.1695404648780823, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.20138196349143983, + "loss/reg": 0.0, + "step": 34270 + }, + { + "epoch": 0.22552631578947369, + "grad_norm": 2.1875, + "grad_norm_var": 0.055063629150390626, + "learning_rate": 0.0001, + "loss": 2.9955, + "loss/crossentropy": 2.348333418369293, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.2416067436337471, + "loss/reg": 0.0, + "step": 34280 + }, + { + "epoch": 0.2255921052631579, + "grad_norm": 2.484375, + "grad_norm_var": 0.1240631103515625, + "learning_rate": 0.0001, + "loss": 3.051, + "loss/crossentropy": 2.1614134192466734, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.2166384145617485, + "loss/reg": 0.0, + "step": 34290 + }, + { + "epoch": 0.2256578947368421, + "grad_norm": 2.125, + "grad_norm_var": 0.18293355305989584, + "learning_rate": 0.0001, + "loss": 2.9882, + "loss/crossentropy": 2.1018754601478578, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.19822993129491806, + "loss/reg": 0.0, + "step": 34300 + }, + { + "epoch": 0.22572368421052633, + "grad_norm": 2.515625, + "grad_norm_var": 0.10800374348958333, + "learning_rate": 0.0001, + "loss": 3.1428, + "loss/crossentropy": 2.4500314474105833, + "loss/hidden": 2.990625, + "loss/incoh": 0.0, + "loss/logits": 0.3025324195623398, + "loss/reg": 0.0, + "step": 34310 + }, + { + "epoch": 0.22578947368421054, + "grad_norm": 2.078125, + "grad_norm_var": 0.0825592041015625, + "learning_rate": 0.0001, + "loss": 3.0246, + "loss/crossentropy": 2.1041884064674377, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.22658382654190062, + "loss/reg": 0.0, + "step": 34320 + }, + { + "epoch": 0.22585526315789473, + "grad_norm": 2.40625, + "grad_norm_var": 0.07895406087239583, + "learning_rate": 0.0001, + "loss": 3.0125, + "loss/crossentropy": 2.397817623615265, + "loss/hidden": 3.00625, + "loss/incoh": 0.0, + "loss/logits": 0.28272619247436526, + "loss/reg": 0.0, + "step": 34330 + }, + { + "epoch": 0.22592105263157894, + "grad_norm": 1.953125, + "grad_norm_var": 0.08463134765625, + "learning_rate": 0.0001, + "loss": 3.0419, + "loss/crossentropy": 2.0744609951972963, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.27017629742622373, + "loss/reg": 0.0, + "step": 34340 + }, + { + "epoch": 0.22598684210526315, + "grad_norm": 3.171875, + "grad_norm_var": 0.10567118326822916, + "learning_rate": 0.0001, + "loss": 3.054, + "loss/crossentropy": 2.5243443489074706, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.22290815711021422, + "loss/reg": 0.0, + "step": 34350 + }, + { + "epoch": 0.22605263157894737, + "grad_norm": 2.671875, + "grad_norm_var": 0.20874735514322917, + "learning_rate": 0.0001, + "loss": 3.0311, + "loss/crossentropy": 2.291029953956604, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.23554186373949051, + "loss/reg": 0.0, + "step": 34360 + }, + { + "epoch": 0.22611842105263158, + "grad_norm": 2.65625, + "grad_norm_var": 0.17711588541666667, + "learning_rate": 0.0001, + "loss": 3.0285, + "loss/crossentropy": 2.4715088486671446, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.2226739391684532, + "loss/reg": 0.0, + "step": 34370 + }, + { + "epoch": 0.2261842105263158, + "grad_norm": 2.765625, + "grad_norm_var": 0.07121988932291666, + "learning_rate": 0.0001, + "loss": 2.9697, + "loss/crossentropy": 2.226884996891022, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.19653952047228812, + "loss/reg": 0.0, + "step": 34380 + }, + { + "epoch": 0.22625, + "grad_norm": 2.140625, + "grad_norm_var": 0.11801656087239583, + "learning_rate": 0.0001, + "loss": 3.0042, + "loss/crossentropy": 2.6288553953170775, + "loss/hidden": 2.7875, + "loss/incoh": 0.0, + "loss/logits": 0.22481338530778885, + "loss/reg": 0.0, + "step": 34390 + }, + { + "epoch": 0.22631578947368422, + "grad_norm": 2.234375, + "grad_norm_var": 0.056029256184895834, + "learning_rate": 0.0001, + "loss": 2.9881, + "loss/crossentropy": 2.086442506313324, + "loss/hidden": 2.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.19545092582702636, + "loss/reg": 0.0, + "step": 34400 + }, + { + "epoch": 0.22638157894736843, + "grad_norm": 2.34375, + "grad_norm_var": 0.07093098958333334, + "learning_rate": 0.0001, + "loss": 2.9785, + "loss/crossentropy": 2.1755173921585085, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.2160940498113632, + "loss/reg": 0.0, + "step": 34410 + }, + { + "epoch": 0.22644736842105262, + "grad_norm": 2.78125, + "grad_norm_var": 0.5419667561848959, + "learning_rate": 0.0001, + "loss": 3.148, + "loss/crossentropy": 2.306460678577423, + "loss/hidden": 2.9765625, + "loss/incoh": 0.0, + "loss/logits": 0.23442021161317825, + "loss/reg": 0.0, + "step": 34420 + }, + { + "epoch": 0.22651315789473683, + "grad_norm": 2.359375, + "grad_norm_var": 0.509814453125, + "learning_rate": 0.0001, + "loss": 3.144, + "loss/crossentropy": 2.4273258805274964, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.2834622025489807, + "loss/reg": 0.0, + "step": 34430 + }, + { + "epoch": 0.22657894736842105, + "grad_norm": 2.046875, + "grad_norm_var": 0.20230712890625, + "learning_rate": 0.0001, + "loss": 3.0404, + "loss/crossentropy": 2.382106697559357, + "loss/hidden": 2.840625, + "loss/incoh": 0.0, + "loss/logits": 0.2566205784678459, + "loss/reg": 0.0, + "step": 34440 + }, + { + "epoch": 0.22664473684210526, + "grad_norm": 2.734375, + "grad_norm_var": 1.1934967041015625, + "learning_rate": 0.0001, + "loss": 3.1251, + "loss/crossentropy": 2.4071131587028503, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.25932526737451556, + "loss/reg": 0.0, + "step": 34450 + }, + { + "epoch": 0.22671052631578947, + "grad_norm": 3.65625, + "grad_norm_var": 0.17838109334309896, + "learning_rate": 0.0001, + "loss": 3.0335, + "loss/crossentropy": 2.3701361656188964, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.2191499412059784, + "loss/reg": 0.0, + "step": 34460 + }, + { + "epoch": 0.2267763157894737, + "grad_norm": 2.3125, + "grad_norm_var": 0.1755633036295573, + "learning_rate": 0.0001, + "loss": 3.1017, + "loss/crossentropy": 1.7690342009067535, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.19500833228230477, + "loss/reg": 0.0, + "step": 34470 + }, + { + "epoch": 0.2268421052631579, + "grad_norm": 2.265625, + "grad_norm_var": 0.36656494140625, + "learning_rate": 0.0001, + "loss": 3.014, + "loss/crossentropy": 2.1445329546928407, + "loss/hidden": 2.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.192388217151165, + "loss/reg": 0.0, + "step": 34480 + }, + { + "epoch": 0.22690789473684211, + "grad_norm": 3.65625, + "grad_norm_var": 0.5748331705729167, + "learning_rate": 0.0001, + "loss": 3.156, + "loss/crossentropy": 2.152642047405243, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.23683252632617952, + "loss/reg": 0.0, + "step": 34490 + }, + { + "epoch": 0.22697368421052633, + "grad_norm": 2.46875, + "grad_norm_var": 0.1991119384765625, + "learning_rate": 0.0001, + "loss": 3.1698, + "loss/crossentropy": 2.343445897102356, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.25687034577131274, + "loss/reg": 0.0, + "step": 34500 + }, + { + "epoch": 0.2270394736842105, + "grad_norm": 2.109375, + "grad_norm_var": 0.1173828125, + "learning_rate": 0.0001, + "loss": 3.0052, + "loss/crossentropy": 2.352896881103516, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.27792870104312895, + "loss/reg": 0.0, + "step": 34510 + }, + { + "epoch": 0.22710526315789473, + "grad_norm": 2.328125, + "grad_norm_var": 0.05995992024739583, + "learning_rate": 0.0001, + "loss": 3.0362, + "loss/crossentropy": 2.2525784373283386, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.25623630434274675, + "loss/reg": 0.0, + "step": 34520 + }, + { + "epoch": 0.22717105263157894, + "grad_norm": 2.1875, + "grad_norm_var": 2.48424072265625, + "learning_rate": 0.0001, + "loss": 3.097, + "loss/crossentropy": 2.3944430470466616, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.23495300412178038, + "loss/reg": 0.0, + "step": 34530 + }, + { + "epoch": 0.22723684210526315, + "grad_norm": 2.75, + "grad_norm_var": 2.325169881184896, + "learning_rate": 0.0001, + "loss": 3.0441, + "loss/crossentropy": 2.1533145189285277, + "loss/hidden": 2.7984375, + "loss/incoh": 0.0, + "loss/logits": 0.20518074482679366, + "loss/reg": 0.0, + "step": 34540 + }, + { + "epoch": 0.22730263157894737, + "grad_norm": 3.453125, + "grad_norm_var": 0.6099609375, + "learning_rate": 0.0001, + "loss": 3.1133, + "loss/crossentropy": 2.298123669624329, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.23540952503681184, + "loss/reg": 0.0, + "step": 34550 + }, + { + "epoch": 0.22736842105263158, + "grad_norm": 2.625, + "grad_norm_var": 0.169775390625, + "learning_rate": 0.0001, + "loss": 3.0075, + "loss/crossentropy": 2.333153510093689, + "loss/hidden": 2.8203125, + "loss/incoh": 0.0, + "loss/logits": 0.23416389673948287, + "loss/reg": 0.0, + "step": 34560 + }, + { + "epoch": 0.2274342105263158, + "grad_norm": 2.125, + "grad_norm_var": 0.14983317057291667, + "learning_rate": 0.0001, + "loss": 3.0272, + "loss/crossentropy": 2.455455815792084, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.26056560724973676, + "loss/reg": 0.0, + "step": 34570 + }, + { + "epoch": 0.2275, + "grad_norm": 2.359375, + "grad_norm_var": 0.17576065063476562, + "learning_rate": 0.0001, + "loss": 2.9814, + "loss/crossentropy": 1.9633507668972014, + "loss/hidden": 2.5953125, + "loss/incoh": 0.0, + "loss/logits": 0.19498348981142044, + "loss/reg": 0.0, + "step": 34580 + }, + { + "epoch": 0.22756578947368422, + "grad_norm": 2.1875, + "grad_norm_var": 0.1496246337890625, + "learning_rate": 0.0001, + "loss": 3.0304, + "loss/crossentropy": 1.8702698707580567, + "loss/hidden": 2.6984375, + "loss/incoh": 0.0, + "loss/logits": 0.2022824764251709, + "loss/reg": 0.0, + "step": 34590 + }, + { + "epoch": 0.22763157894736843, + "grad_norm": 2.46875, + "grad_norm_var": 0.1550445556640625, + "learning_rate": 0.0001, + "loss": 3.027, + "loss/crossentropy": 2.588277578353882, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.28010246604681016, + "loss/reg": 0.0, + "step": 34600 + }, + { + "epoch": 0.22769736842105262, + "grad_norm": 2.46875, + "grad_norm_var": 0.09648335774739583, + "learning_rate": 0.0001, + "loss": 3.0036, + "loss/crossentropy": 2.3437662720680237, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.21182923167943954, + "loss/reg": 0.0, + "step": 34610 + }, + { + "epoch": 0.22776315789473683, + "grad_norm": 2.65625, + "grad_norm_var": 0.08466389973958334, + "learning_rate": 0.0001, + "loss": 3.0695, + "loss/crossentropy": 2.3003593921661376, + "loss/hidden": 2.7984375, + "loss/incoh": 0.0, + "loss/logits": 0.2246791511774063, + "loss/reg": 0.0, + "step": 34620 + }, + { + "epoch": 0.22782894736842105, + "grad_norm": 2.625, + "grad_norm_var": 2.332779947916667, + "learning_rate": 0.0001, + "loss": 2.9537, + "loss/crossentropy": 2.2143739581108095, + "loss/hidden": 2.6265625, + "loss/incoh": 0.0, + "loss/logits": 0.2086007311940193, + "loss/reg": 0.0, + "step": 34630 + }, + { + "epoch": 0.22789473684210526, + "grad_norm": 2.328125, + "grad_norm_var": 2.5328409830729166, + "learning_rate": 0.0001, + "loss": 3.0153, + "loss/crossentropy": 2.1308518052101135, + "loss/hidden": 2.775, + "loss/incoh": 0.0, + "loss/logits": 0.2038565307855606, + "loss/reg": 0.0, + "step": 34640 + }, + { + "epoch": 0.22796052631578947, + "grad_norm": 2.40625, + "grad_norm_var": 0.36980361938476564, + "learning_rate": 0.0001, + "loss": 2.9625, + "loss/crossentropy": 2.088499677181244, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.268018501996994, + "loss/reg": 0.0, + "step": 34650 + }, + { + "epoch": 0.2280263157894737, + "grad_norm": 1962934272.0, + "grad_norm_var": 2.408194341792645e+17, + "learning_rate": 0.0001, + "loss": 3.0604, + "loss/crossentropy": 2.413490664958954, + "loss/hidden": 2.634375, + "loss/incoh": 0.0, + "loss/logits": 0.21956574469804763, + "loss/reg": 0.0, + "step": 34660 + }, + { + "epoch": 0.2280921052631579, + "grad_norm": 2.484375, + "grad_norm_var": 2.4081943414143712e+17, + "learning_rate": 0.0001, + "loss": 3.0371, + "loss/crossentropy": 2.1357826590538025, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.19804717004299163, + "loss/reg": 0.0, + "step": 34670 + }, + { + "epoch": 0.22815789473684212, + "grad_norm": 2.203125, + "grad_norm_var": 0.16970113118489583, + "learning_rate": 0.0001, + "loss": 2.9031, + "loss/crossentropy": 2.4601009845733643, + "loss/hidden": 2.6828125, + "loss/incoh": 0.0, + "loss/logits": 0.23078858107328415, + "loss/reg": 0.0, + "step": 34680 + }, + { + "epoch": 0.22822368421052633, + "grad_norm": 2.28125, + "grad_norm_var": 0.0817535400390625, + "learning_rate": 0.0001, + "loss": 3.0007, + "loss/crossentropy": 2.170434999465942, + "loss/hidden": 2.7, + "loss/incoh": 0.0, + "loss/logits": 0.23267455995082856, + "loss/reg": 0.0, + "step": 34690 + }, + { + "epoch": 0.22828947368421051, + "grad_norm": 2.796875, + "grad_norm_var": 0.08743489583333333, + "learning_rate": 0.0001, + "loss": 2.9957, + "loss/crossentropy": 2.378873956203461, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.25174411833286287, + "loss/reg": 0.0, + "step": 34700 + }, + { + "epoch": 0.22835526315789473, + "grad_norm": 2.625, + "grad_norm_var": 0.0942400614420573, + "learning_rate": 0.0001, + "loss": 2.9195, + "loss/crossentropy": 2.3702409982681276, + "loss/hidden": 2.684375, + "loss/incoh": 0.0, + "loss/logits": 0.2114430248737335, + "loss/reg": 0.0, + "step": 34710 + }, + { + "epoch": 0.22842105263157894, + "grad_norm": 2.5625, + "grad_norm_var": 0.1205230712890625, + "learning_rate": 0.0001, + "loss": 2.9774, + "loss/crossentropy": 2.1158406376838683, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.2316122904419899, + "loss/reg": 0.0, + "step": 34720 + }, + { + "epoch": 0.22848684210526315, + "grad_norm": 2.75, + "grad_norm_var": 0.04747899373372396, + "learning_rate": 0.0001, + "loss": 2.9868, + "loss/crossentropy": 2.2178874969482423, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.22333518415689468, + "loss/reg": 0.0, + "step": 34730 + }, + { + "epoch": 0.22855263157894737, + "grad_norm": 2.34375, + "grad_norm_var": 0.08137105305989584, + "learning_rate": 0.0001, + "loss": 3.0257, + "loss/crossentropy": 2.364703667163849, + "loss/hidden": 2.75625, + "loss/incoh": 0.0, + "loss/logits": 0.22582925111055374, + "loss/reg": 0.0, + "step": 34740 + }, + { + "epoch": 0.22861842105263158, + "grad_norm": 2.03125, + "grad_norm_var": 0.0731842041015625, + "learning_rate": 0.0001, + "loss": 2.9844, + "loss/crossentropy": 2.06840842962265, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.24444130361080169, + "loss/reg": 0.0, + "step": 34750 + }, + { + "epoch": 0.2286842105263158, + "grad_norm": 2.34375, + "grad_norm_var": 0.06175308227539063, + "learning_rate": 0.0001, + "loss": 3.0122, + "loss/crossentropy": 2.5480314254760743, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.3109170734882355, + "loss/reg": 0.0, + "step": 34760 + }, + { + "epoch": 0.22875, + "grad_norm": 2.296875, + "grad_norm_var": 0.06448567708333333, + "learning_rate": 0.0001, + "loss": 2.9767, + "loss/crossentropy": 2.4194095849990847, + "loss/hidden": 2.709375, + "loss/incoh": 0.0, + "loss/logits": 0.23701667934656143, + "loss/reg": 0.0, + "step": 34770 + }, + { + "epoch": 0.22881578947368422, + "grad_norm": 2.296875, + "grad_norm_var": 0.13319676717122395, + "learning_rate": 0.0001, + "loss": 3.0117, + "loss/crossentropy": 2.330536985397339, + "loss/hidden": 2.665625, + "loss/incoh": 0.0, + "loss/logits": 0.1930417075753212, + "loss/reg": 0.0, + "step": 34780 + }, + { + "epoch": 0.2288815789473684, + "grad_norm": 2.140625, + "grad_norm_var": 0.13925755818684896, + "learning_rate": 0.0001, + "loss": 3.0049, + "loss/crossentropy": 2.055137485265732, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.1990390993654728, + "loss/reg": 0.0, + "step": 34790 + }, + { + "epoch": 0.22894736842105262, + "grad_norm": 2.375, + "grad_norm_var": 0.0531402587890625, + "learning_rate": 0.0001, + "loss": 2.9451, + "loss/crossentropy": 2.3400762557983397, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.2514717549085617, + "loss/reg": 0.0, + "step": 34800 + }, + { + "epoch": 0.22901315789473684, + "grad_norm": 2.25, + "grad_norm_var": 0.14586588541666667, + "learning_rate": 0.0001, + "loss": 3.0989, + "loss/crossentropy": 2.3216898441314697, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.20793744474649428, + "loss/reg": 0.0, + "step": 34810 + }, + { + "epoch": 0.22907894736842105, + "grad_norm": 2.28125, + "grad_norm_var": 0.23554280598958333, + "learning_rate": 0.0001, + "loss": 3.1053, + "loss/crossentropy": 2.368983769416809, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.23178687542676926, + "loss/reg": 0.0, + "step": 34820 + }, + { + "epoch": 0.22914473684210526, + "grad_norm": 2.140625, + "grad_norm_var": 0.09830322265625, + "learning_rate": 0.0001, + "loss": 2.9238, + "loss/crossentropy": 2.1155460715293883, + "loss/hidden": 2.659375, + "loss/incoh": 0.0, + "loss/logits": 0.21411084979772568, + "loss/reg": 0.0, + "step": 34830 + }, + { + "epoch": 0.22921052631578948, + "grad_norm": 2.796875, + "grad_norm_var": 0.6712799072265625, + "learning_rate": 0.0001, + "loss": 3.0934, + "loss/crossentropy": 1.9747644543647767, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.22649588584899902, + "loss/reg": 0.0, + "step": 34840 + }, + { + "epoch": 0.2292763157894737, + "grad_norm": 2.125, + "grad_norm_var": 0.6342081705729167, + "learning_rate": 0.0001, + "loss": 2.968, + "loss/crossentropy": 2.282280433177948, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.24287160784006118, + "loss/reg": 0.0, + "step": 34850 + }, + { + "epoch": 0.2293421052631579, + "grad_norm": 2.453125, + "grad_norm_var": 0.676806640625, + "learning_rate": 0.0001, + "loss": 3.0144, + "loss/crossentropy": 2.423893141746521, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.27746856659650804, + "loss/reg": 0.0, + "step": 34860 + }, + { + "epoch": 0.22940789473684212, + "grad_norm": 2.109375, + "grad_norm_var": 0.15266927083333334, + "learning_rate": 0.0001, + "loss": 2.9868, + "loss/crossentropy": 2.2340946078300474, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.23156266957521437, + "loss/reg": 0.0, + "step": 34870 + }, + { + "epoch": 0.2294736842105263, + "grad_norm": 2.015625, + "grad_norm_var": 0.12720947265625, + "learning_rate": 0.0001, + "loss": 2.9166, + "loss/crossentropy": 2.3554929852485658, + "loss/hidden": 2.70625, + "loss/incoh": 0.0, + "loss/logits": 0.2240227773785591, + "loss/reg": 0.0, + "step": 34880 + }, + { + "epoch": 0.22953947368421052, + "grad_norm": 2.015625, + "grad_norm_var": 0.1996978759765625, + "learning_rate": 0.0001, + "loss": 2.9409, + "loss/crossentropy": 2.1706709206104278, + "loss/hidden": 2.6015625, + "loss/incoh": 0.0, + "loss/logits": 0.1888253793120384, + "loss/reg": 0.0, + "step": 34890 + }, + { + "epoch": 0.22960526315789473, + "grad_norm": 2.1875, + "grad_norm_var": 0.26612955729166665, + "learning_rate": 0.0001, + "loss": 3.0791, + "loss/crossentropy": 2.2094788432121275, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.21013466268777847, + "loss/reg": 0.0, + "step": 34900 + }, + { + "epoch": 0.22967105263157894, + "grad_norm": 2.359375, + "grad_norm_var": 0.3094716389973958, + "learning_rate": 0.0001, + "loss": 3.0647, + "loss/crossentropy": 2.008031153678894, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.2073305867612362, + "loss/reg": 0.0, + "step": 34910 + }, + { + "epoch": 0.22973684210526316, + "grad_norm": 2.234375, + "grad_norm_var": 0.16913960774739584, + "learning_rate": 0.0001, + "loss": 2.9933, + "loss/crossentropy": 2.3257227003574372, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.2534154921770096, + "loss/reg": 0.0, + "step": 34920 + }, + { + "epoch": 0.22980263157894737, + "grad_norm": 2.609375, + "grad_norm_var": 0.09401041666666667, + "learning_rate": 0.0001, + "loss": 3.078, + "loss/crossentropy": 2.299355685710907, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.4137351721525192, + "loss/reg": 0.0, + "step": 34930 + }, + { + "epoch": 0.22986842105263158, + "grad_norm": 2.484375, + "grad_norm_var": 0.059163411458333336, + "learning_rate": 0.0001, + "loss": 3.066, + "loss/crossentropy": 2.063184082508087, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.2067345403134823, + "loss/reg": 0.0, + "step": 34940 + }, + { + "epoch": 0.2299342105263158, + "grad_norm": 2.34375, + "grad_norm_var": 0.0319000244140625, + "learning_rate": 0.0001, + "loss": 3.0384, + "loss/crossentropy": 2.3379054069519043, + "loss/hidden": 2.625, + "loss/incoh": 0.0, + "loss/logits": 0.21520548760890962, + "loss/reg": 0.0, + "step": 34950 + }, + { + "epoch": 0.23, + "grad_norm": 2.1875, + "grad_norm_var": 0.04029541015625, + "learning_rate": 0.0001, + "loss": 3.0224, + "loss/crossentropy": 2.2715999722480773, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.23961079716682435, + "loss/reg": 0.0, + "step": 34960 + }, + { + "epoch": 0.23006578947368422, + "grad_norm": 2.375, + "grad_norm_var": 0.017545572916666665, + "learning_rate": 0.0001, + "loss": 2.9942, + "loss/crossentropy": 2.547387886047363, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.2642354011535645, + "loss/reg": 0.0, + "step": 34970 + }, + { + "epoch": 0.2301315789473684, + "grad_norm": 2.703125, + "grad_norm_var": 0.0195465087890625, + "learning_rate": 0.0001, + "loss": 2.9968, + "loss/crossentropy": 2.3052905440330504, + "loss/hidden": 2.6859375, + "loss/incoh": 0.0, + "loss/logits": 0.2478991135954857, + "loss/reg": 0.0, + "step": 34980 + }, + { + "epoch": 0.23019736842105262, + "grad_norm": 2.296875, + "grad_norm_var": 0.03509114583333333, + "learning_rate": 0.0001, + "loss": 2.9466, + "loss/crossentropy": 2.136173462867737, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.23240896165370942, + "loss/reg": 0.0, + "step": 34990 + }, + { + "epoch": 0.23026315789473684, + "grad_norm": 2.578125, + "grad_norm_var": 0.04448140462239583, + "learning_rate": 0.0001, + "loss": 3.0289, + "loss/crossentropy": 2.2616278886795045, + "loss/hidden": 2.6890625, + "loss/incoh": 0.0, + "loss/logits": 0.2103984072804451, + "loss/reg": 0.0, + "step": 35000 + }, + { + "epoch": 0.23032894736842105, + "grad_norm": 2.21875, + "grad_norm_var": 0.055464680989583334, + "learning_rate": 0.0001, + "loss": 3.0574, + "loss/crossentropy": 2.234957480430603, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.22579004764556884, + "loss/reg": 0.0, + "step": 35010 + }, + { + "epoch": 0.23039473684210526, + "grad_norm": 2.3125, + "grad_norm_var": 0.0838531494140625, + "learning_rate": 0.0001, + "loss": 3.0444, + "loss/crossentropy": 2.2260616183280946, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.2393326446413994, + "loss/reg": 0.0, + "step": 35020 + }, + { + "epoch": 0.23046052631578948, + "grad_norm": 2.4375, + "grad_norm_var": 0.110595703125, + "learning_rate": 0.0001, + "loss": 3.0705, + "loss/crossentropy": 2.2648324608802795, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.2452955961227417, + "loss/reg": 0.0, + "step": 35030 + }, + { + "epoch": 0.2305263157894737, + "grad_norm": 2.234375, + "grad_norm_var": 0.05959879557291667, + "learning_rate": 0.0001, + "loss": 3.0123, + "loss/crossentropy": 2.3035391211509704, + "loss/hidden": 2.675, + "loss/incoh": 0.0, + "loss/logits": 0.20398545116186143, + "loss/reg": 0.0, + "step": 35040 + }, + { + "epoch": 0.2305921052631579, + "grad_norm": 2.34375, + "grad_norm_var": 0.10580952962239583, + "learning_rate": 0.0001, + "loss": 3.0008, + "loss/crossentropy": 2.467655122280121, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2141210839152336, + "loss/reg": 0.0, + "step": 35050 + }, + { + "epoch": 0.23065789473684212, + "grad_norm": 2.640625, + "grad_norm_var": 0.12373046875, + "learning_rate": 0.0001, + "loss": 2.9259, + "loss/crossentropy": 2.4413907408714293, + "loss/hidden": 2.646875, + "loss/incoh": 0.0, + "loss/logits": 0.21587167531251908, + "loss/reg": 0.0, + "step": 35060 + }, + { + "epoch": 0.2307236842105263, + "grad_norm": 2.4375, + "grad_norm_var": 0.18179931640625, + "learning_rate": 0.0001, + "loss": 3.0733, + "loss/crossentropy": 2.5797677636146545, + "loss/hidden": 3.096875, + "loss/incoh": 0.0, + "loss/logits": 0.3117774799466133, + "loss/reg": 0.0, + "step": 35070 + }, + { + "epoch": 0.23078947368421052, + "grad_norm": 2.46875, + "grad_norm_var": 8.836572265625, + "learning_rate": 0.0001, + "loss": 2.98, + "loss/crossentropy": 2.0451542496681214, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.25981259644031524, + "loss/reg": 0.0, + "step": 35080 + }, + { + "epoch": 0.23085526315789473, + "grad_norm": 2.140625, + "grad_norm_var": 8.937858072916667, + "learning_rate": 0.0001, + "loss": 2.9729, + "loss/crossentropy": 2.4952213764190674, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.27273062616586685, + "loss/reg": 0.0, + "step": 35090 + }, + { + "epoch": 0.23092105263157894, + "grad_norm": 2.421875, + "grad_norm_var": 0.039632161458333336, + "learning_rate": 0.0001, + "loss": 3.0086, + "loss/crossentropy": 2.1235549569129946, + "loss/hidden": 2.846875, + "loss/incoh": 0.0, + "loss/logits": 0.2294046923518181, + "loss/reg": 0.0, + "step": 35100 + }, + { + "epoch": 0.23098684210526316, + "grad_norm": 2.15625, + "grad_norm_var": 0.2903065999348958, + "learning_rate": 0.0001, + "loss": 3.004, + "loss/crossentropy": 2.2860934376716613, + "loss/hidden": 3.009375, + "loss/incoh": 0.0, + "loss/logits": 0.23025134950876236, + "loss/reg": 0.0, + "step": 35110 + }, + { + "epoch": 0.23105263157894737, + "grad_norm": 3.125, + "grad_norm_var": 0.1121978759765625, + "learning_rate": 0.0001, + "loss": 2.9751, + "loss/crossentropy": 2.2237872958183287, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.2481079339981079, + "loss/reg": 0.0, + "step": 35120 + }, + { + "epoch": 0.23111842105263158, + "grad_norm": 2.296875, + "grad_norm_var": 0.12190348307291667, + "learning_rate": 0.0001, + "loss": 3.0254, + "loss/crossentropy": 2.367337203025818, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.24131873100996018, + "loss/reg": 0.0, + "step": 35130 + }, + { + "epoch": 0.2311842105263158, + "grad_norm": 2.5, + "grad_norm_var": 0.061310831705729166, + "learning_rate": 0.0001, + "loss": 3.0044, + "loss/crossentropy": 2.287555253505707, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.24582310616970063, + "loss/reg": 0.0, + "step": 35140 + }, + { + "epoch": 0.23125, + "grad_norm": 2.046875, + "grad_norm_var": 0.0275543212890625, + "learning_rate": 0.0001, + "loss": 3.0166, + "loss/crossentropy": 2.3102270364761353, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.22130452394485473, + "loss/reg": 0.0, + "step": 35150 + }, + { + "epoch": 0.2313157894736842, + "grad_norm": 2.359375, + "grad_norm_var": 0.022557576497395832, + "learning_rate": 0.0001, + "loss": 3.0183, + "loss/crossentropy": 2.3422921657562257, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.26338334679603576, + "loss/reg": 0.0, + "step": 35160 + }, + { + "epoch": 0.2313815789473684, + "grad_norm": 3.203125, + "grad_norm_var": 0.0848297119140625, + "learning_rate": 0.0001, + "loss": 3.0288, + "loss/crossentropy": 2.1819352507591248, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.21551729440689088, + "loss/reg": 0.0, + "step": 35170 + }, + { + "epoch": 0.23144736842105262, + "grad_norm": 2.296875, + "grad_norm_var": 0.17290751139322916, + "learning_rate": 0.0001, + "loss": 3.0282, + "loss/crossentropy": 2.512664186954498, + "loss/hidden": 2.65625, + "loss/incoh": 0.0, + "loss/logits": 0.20309768319129945, + "loss/reg": 0.0, + "step": 35180 + }, + { + "epoch": 0.23151315789473684, + "grad_norm": 2.453125, + "grad_norm_var": 3.398986258287209e+17, + "learning_rate": 0.0001, + "loss": 3.1551, + "loss/crossentropy": 2.3065868735313417, + "loss/hidden": 2.790625, + "loss/incoh": 0.0, + "loss/logits": 0.2547546997666359, + "loss/reg": 0.0, + "step": 35190 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 2.203125, + "grad_norm_var": 0.2131744384765625, + "learning_rate": 0.0001, + "loss": 3.0611, + "loss/crossentropy": 2.5265056133270263, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.21266352534294128, + "loss/reg": 0.0, + "step": 35200 + }, + { + "epoch": 0.23164473684210526, + "grad_norm": 2.921875, + "grad_norm_var": 0.4802480061848958, + "learning_rate": 0.0001, + "loss": 3.1018, + "loss/crossentropy": 2.378112018108368, + "loss/hidden": 2.68125, + "loss/incoh": 0.0, + "loss/logits": 0.2218308851122856, + "loss/reg": 0.0, + "step": 35210 + }, + { + "epoch": 0.23171052631578948, + "grad_norm": 2.171875, + "grad_norm_var": 0.28329671223958336, + "learning_rate": 0.0001, + "loss": 2.9809, + "loss/crossentropy": 2.3841970801353454, + "loss/hidden": 2.646875, + "loss/incoh": 0.0, + "loss/logits": 0.20064097195863723, + "loss/reg": 0.0, + "step": 35220 + }, + { + "epoch": 0.2317763157894737, + "grad_norm": 2.28125, + "grad_norm_var": 0.13202718098958333, + "learning_rate": 0.0001, + "loss": 3.0415, + "loss/crossentropy": 2.2034762263298036, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.26952334195375444, + "loss/reg": 0.0, + "step": 35230 + }, + { + "epoch": 0.2318421052631579, + "grad_norm": 3.484375, + "grad_norm_var": 0.17197850545247395, + "learning_rate": 0.0001, + "loss": 2.9833, + "loss/crossentropy": 2.333740735054016, + "loss/hidden": 2.6328125, + "loss/incoh": 0.0, + "loss/logits": 0.2028803788125515, + "loss/reg": 0.0, + "step": 35240 + }, + { + "epoch": 0.23190789473684212, + "grad_norm": 2.25, + "grad_norm_var": 0.12504247029622395, + "learning_rate": 0.0001, + "loss": 3.0428, + "loss/crossentropy": 1.9891301035881042, + "loss/hidden": 2.871875, + "loss/incoh": 0.0, + "loss/logits": 0.33240934908390046, + "loss/reg": 0.0, + "step": 35250 + }, + { + "epoch": 0.2319736842105263, + "grad_norm": 2.125, + "grad_norm_var": 0.0198394775390625, + "learning_rate": 0.0001, + "loss": 2.9679, + "loss/crossentropy": 2.394805371761322, + "loss/hidden": 2.65, + "loss/incoh": 0.0, + "loss/logits": 0.2079624727368355, + "loss/reg": 0.0, + "step": 35260 + }, + { + "epoch": 0.23203947368421052, + "grad_norm": 2.125, + "grad_norm_var": 0.07119140625, + "learning_rate": 0.0001, + "loss": 2.9843, + "loss/crossentropy": 2.466187059879303, + "loss/hidden": 2.665625, + "loss/incoh": 0.0, + "loss/logits": 0.21209322810173034, + "loss/reg": 0.0, + "step": 35270 + }, + { + "epoch": 0.23210526315789473, + "grad_norm": 2.796875, + "grad_norm_var": 0.7116737365722656, + "learning_rate": 0.0001, + "loss": 2.9743, + "loss/crossentropy": 2.2582387804985045, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2430204778909683, + "loss/reg": 0.0, + "step": 35280 + }, + { + "epoch": 0.23217105263157894, + "grad_norm": 3.28125, + "grad_norm_var": 3.206175898909409e+17, + "learning_rate": 0.0001, + "loss": 3.1474, + "loss/crossentropy": 2.2272575974464415, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.20790190547704696, + "loss/reg": 0.0, + "step": 35290 + }, + { + "epoch": 0.23223684210526316, + "grad_norm": 2.75, + "grad_norm_var": 3.206175899148288e+17, + "learning_rate": 0.0001, + "loss": 3.0553, + "loss/crossentropy": 2.3405486226081846, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.24497875720262527, + "loss/reg": 0.0, + "step": 35300 + }, + { + "epoch": 0.23230263157894737, + "grad_norm": 2.3125, + "grad_norm_var": 0.12532145182291668, + "learning_rate": 0.0001, + "loss": 3.0759, + "loss/crossentropy": 2.316471701860428, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.23606702983379363, + "loss/reg": 0.0, + "step": 35310 + }, + { + "epoch": 0.23236842105263159, + "grad_norm": 2.25, + "grad_norm_var": 0.07993876139322917, + "learning_rate": 0.0001, + "loss": 3.0771, + "loss/crossentropy": 2.068638467788696, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.2723642893135548, + "loss/reg": 0.0, + "step": 35320 + }, + { + "epoch": 0.2324342105263158, + "grad_norm": 2.375, + "grad_norm_var": 0.1643463134765625, + "learning_rate": 0.0001, + "loss": 3.0607, + "loss/crossentropy": 2.209253963828087, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.2534912425093353, + "loss/reg": 0.0, + "step": 35330 + }, + { + "epoch": 0.2325, + "grad_norm": 2.3125, + "grad_norm_var": 0.0254302978515625, + "learning_rate": 0.0001, + "loss": 3.0043, + "loss/crossentropy": 2.3624029099941253, + "loss/hidden": 2.7421875, + "loss/incoh": 0.0, + "loss/logits": 0.25295102000236513, + "loss/reg": 0.0, + "step": 35340 + }, + { + "epoch": 0.2325657894736842, + "grad_norm": 2.171875, + "grad_norm_var": 0.04498291015625, + "learning_rate": 0.0001, + "loss": 3.0386, + "loss/crossentropy": 2.3273918867111205, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.22172463536262513, + "loss/reg": 0.0, + "step": 35350 + }, + { + "epoch": 0.2326315789473684, + "grad_norm": 2.390625, + "grad_norm_var": 0.055403645833333334, + "learning_rate": 0.0001, + "loss": 3.0584, + "loss/crossentropy": 1.895402479171753, + "loss/hidden": 3.0015625, + "loss/incoh": 0.0, + "loss/logits": 0.23743433207273484, + "loss/reg": 0.0, + "step": 35360 + }, + { + "epoch": 0.23269736842105262, + "grad_norm": 2.421875, + "grad_norm_var": 0.056962076822916666, + "learning_rate": 0.0001, + "loss": 3.1011, + "loss/crossentropy": 2.0022137641906737, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.21825481653213502, + "loss/reg": 0.0, + "step": 35370 + }, + { + "epoch": 0.23276315789473684, + "grad_norm": 2.40625, + "grad_norm_var": 0.08046468098958333, + "learning_rate": 0.0001, + "loss": 3.0227, + "loss/crossentropy": 1.9984350323677063, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.2356773540377617, + "loss/reg": 0.0, + "step": 35380 + }, + { + "epoch": 0.23282894736842105, + "grad_norm": 3.140625, + "grad_norm_var": 0.13664957682291667, + "learning_rate": 0.0001, + "loss": 3.0069, + "loss/crossentropy": 2.1517210602760315, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.22341941446065902, + "loss/reg": 0.0, + "step": 35390 + }, + { + "epoch": 0.23289473684210527, + "grad_norm": 2.40625, + "grad_norm_var": 0.26809794108072915, + "learning_rate": 0.0001, + "loss": 3.0346, + "loss/crossentropy": 2.380366015434265, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.2661885678768158, + "loss/reg": 0.0, + "step": 35400 + }, + { + "epoch": 0.23296052631578948, + "grad_norm": 2.671875, + "grad_norm_var": 0.036149088541666666, + "learning_rate": 0.0001, + "loss": 3.0274, + "loss/crossentropy": 2.084783911705017, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.22119598612189292, + "loss/reg": 0.0, + "step": 35410 + }, + { + "epoch": 0.2330263157894737, + "grad_norm": 2.0625, + "grad_norm_var": 0.21288960774739582, + "learning_rate": 0.0001, + "loss": 2.9574, + "loss/crossentropy": 2.288650333881378, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.2464343011379242, + "loss/reg": 0.0, + "step": 35420 + }, + { + "epoch": 0.2330921052631579, + "grad_norm": 2.765625, + "grad_norm_var": 0.22062174479166666, + "learning_rate": 0.0001, + "loss": 3.0144, + "loss/crossentropy": 2.1648478746414184, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.2272023007273674, + "loss/reg": 0.0, + "step": 35430 + }, + { + "epoch": 0.2331578947368421, + "grad_norm": 2.546875, + "grad_norm_var": 0.07825113932291666, + "learning_rate": 0.0001, + "loss": 2.9664, + "loss/crossentropy": 2.4016910076141356, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.19540313482284546, + "loss/reg": 0.0, + "step": 35440 + }, + { + "epoch": 0.2332236842105263, + "grad_norm": 2.0625, + "grad_norm_var": 0.12473526000976562, + "learning_rate": 0.0001, + "loss": 3.0054, + "loss/crossentropy": 2.2149259626865385, + "loss/hidden": 2.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.211626535654068, + "loss/reg": 0.0, + "step": 35450 + }, + { + "epoch": 0.23328947368421052, + "grad_norm": 2.21875, + "grad_norm_var": 0.2657244364420573, + "learning_rate": 0.0001, + "loss": 3.022, + "loss/crossentropy": 2.0099823474884033, + "loss/hidden": 2.8078125, + "loss/incoh": 0.0, + "loss/logits": 0.21212436258792877, + "loss/reg": 0.0, + "step": 35460 + }, + { + "epoch": 0.23335526315789473, + "grad_norm": 2.375, + "grad_norm_var": 0.035560862223307295, + "learning_rate": 0.0001, + "loss": 2.959, + "loss/crossentropy": 2.149253064393997, + "loss/hidden": 2.6546875, + "loss/incoh": 0.0, + "loss/logits": 0.19234723448753357, + "loss/reg": 0.0, + "step": 35470 + }, + { + "epoch": 0.23342105263157895, + "grad_norm": 2.28125, + "grad_norm_var": 0.196630859375, + "learning_rate": 0.0001, + "loss": 3.0157, + "loss/crossentropy": 2.3657627940177917, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.223566497862339, + "loss/reg": 0.0, + "step": 35480 + }, + { + "epoch": 0.23348684210526316, + "grad_norm": 2.171875, + "grad_norm_var": 0.024527994791666667, + "learning_rate": 0.0001, + "loss": 2.9839, + "loss/crossentropy": 2.4007309794425966, + "loss/hidden": 2.596875, + "loss/incoh": 0.0, + "loss/logits": 0.19649036675691606, + "loss/reg": 0.0, + "step": 35490 + }, + { + "epoch": 0.23355263157894737, + "grad_norm": 3.046875, + "grad_norm_var": 0.05445556640625, + "learning_rate": 0.0001, + "loss": 3.0323, + "loss/crossentropy": 2.414763641357422, + "loss/hidden": 2.9296875, + "loss/incoh": 0.0, + "loss/logits": 0.2565377399325371, + "loss/reg": 0.0, + "step": 35500 + }, + { + "epoch": 0.2336184210526316, + "grad_norm": 2.625, + "grad_norm_var": 0.04409077962239583, + "learning_rate": 0.0001, + "loss": 2.9834, + "loss/crossentropy": 2.272168481349945, + "loss/hidden": 2.6125, + "loss/incoh": 0.0, + "loss/logits": 0.2014573760330677, + "loss/reg": 0.0, + "step": 35510 + }, + { + "epoch": 0.2336842105263158, + "grad_norm": 2.234375, + "grad_norm_var": 0.091162109375, + "learning_rate": 0.0001, + "loss": 3.0118, + "loss/crossentropy": 2.4486998319625854, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.2566034272313118, + "loss/reg": 0.0, + "step": 35520 + }, + { + "epoch": 0.23375, + "grad_norm": 2.34375, + "grad_norm_var": 0.11804097493489583, + "learning_rate": 0.0001, + "loss": 2.9697, + "loss/crossentropy": 2.2795116662979127, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.20817176550626754, + "loss/reg": 0.0, + "step": 35530 + }, + { + "epoch": 0.2338157894736842, + "grad_norm": 2.40625, + "grad_norm_var": 0.05882059733072917, + "learning_rate": 0.0001, + "loss": 2.9864, + "loss/crossentropy": 2.1612624883651734, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.234655499458313, + "loss/reg": 0.0, + "step": 35540 + }, + { + "epoch": 0.2338815789473684, + "grad_norm": 2.15625, + "grad_norm_var": 0.051545206705729166, + "learning_rate": 0.0001, + "loss": 2.9369, + "loss/crossentropy": 2.342636692523956, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.2195874720811844, + "loss/reg": 0.0, + "step": 35550 + }, + { + "epoch": 0.23394736842105263, + "grad_norm": 2.46875, + "grad_norm_var": 0.2511678059895833, + "learning_rate": 0.0001, + "loss": 2.9652, + "loss/crossentropy": 2.4172530651092528, + "loss/hidden": 2.675, + "loss/incoh": 0.0, + "loss/logits": 0.22586058229207992, + "loss/reg": 0.0, + "step": 35560 + }, + { + "epoch": 0.23401315789473684, + "grad_norm": 2.265625, + "grad_norm_var": 0.21130269368489582, + "learning_rate": 0.0001, + "loss": 2.9721, + "loss/crossentropy": 2.19435909986496, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.22134042978286744, + "loss/reg": 0.0, + "step": 35570 + }, + { + "epoch": 0.23407894736842105, + "grad_norm": 2.1875, + "grad_norm_var": 0.12469075520833334, + "learning_rate": 0.0001, + "loss": 3.0487, + "loss/crossentropy": 2.491572880744934, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.2359505832195282, + "loss/reg": 0.0, + "step": 35580 + }, + { + "epoch": 0.23414473684210527, + "grad_norm": 2.546875, + "grad_norm_var": 0.24637832641601562, + "learning_rate": 0.0001, + "loss": 3.0188, + "loss/crossentropy": 2.0867708206176756, + "loss/hidden": 2.946875, + "loss/incoh": 0.0, + "loss/logits": 0.29949132055044175, + "loss/reg": 0.0, + "step": 35590 + }, + { + "epoch": 0.23421052631578948, + "grad_norm": 2.78125, + "grad_norm_var": 0.09173075358072917, + "learning_rate": 0.0001, + "loss": 3.0162, + "loss/crossentropy": 2.500631070137024, + "loss/hidden": 2.6359375, + "loss/incoh": 0.0, + "loss/logits": 0.23038013875484467, + "loss/reg": 0.0, + "step": 35600 + }, + { + "epoch": 0.2342763157894737, + "grad_norm": 2.78125, + "grad_norm_var": 0.09622294108072917, + "learning_rate": 0.0001, + "loss": 2.9738, + "loss/crossentropy": 2.330187511444092, + "loss/hidden": 2.6375, + "loss/incoh": 0.0, + "loss/logits": 0.21412627547979354, + "loss/reg": 0.0, + "step": 35610 + }, + { + "epoch": 0.2343421052631579, + "grad_norm": 2.09375, + "grad_norm_var": 0.06207275390625, + "learning_rate": 0.0001, + "loss": 2.9646, + "loss/crossentropy": 2.0748894095420836, + "loss/hidden": 2.9234375, + "loss/incoh": 0.0, + "loss/logits": 0.2262689083814621, + "loss/reg": 0.0, + "step": 35620 + }, + { + "epoch": 0.2344078947368421, + "grad_norm": 2.546875, + "grad_norm_var": 0.0324859619140625, + "learning_rate": 0.0001, + "loss": 2.9761, + "loss/crossentropy": 2.415894079208374, + "loss/hidden": 2.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.20092394277453424, + "loss/reg": 0.0, + "step": 35630 + }, + { + "epoch": 0.2344736842105263, + "grad_norm": 2.34375, + "grad_norm_var": 0.1505859375, + "learning_rate": 0.0001, + "loss": 3.0498, + "loss/crossentropy": 2.2875362753868105, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.24587944149971008, + "loss/reg": 0.0, + "step": 35640 + }, + { + "epoch": 0.23453947368421052, + "grad_norm": 2.515625, + "grad_norm_var": 0.16611226399739584, + "learning_rate": 0.0001, + "loss": 2.9052, + "loss/crossentropy": 2.4340951919555662, + "loss/hidden": 2.6359375, + "loss/incoh": 0.0, + "loss/logits": 0.22377320230007172, + "loss/reg": 0.0, + "step": 35650 + }, + { + "epoch": 0.23460526315789473, + "grad_norm": 2.140625, + "grad_norm_var": 0.06776936848958333, + "learning_rate": 0.0001, + "loss": 2.9088, + "loss/crossentropy": 2.3177594542503357, + "loss/hidden": 2.6875, + "loss/incoh": 0.0, + "loss/logits": 0.2174980953335762, + "loss/reg": 0.0, + "step": 35660 + }, + { + "epoch": 0.23467105263157895, + "grad_norm": 2.1875, + "grad_norm_var": 0.0744049072265625, + "learning_rate": 0.0001, + "loss": 3.0171, + "loss/crossentropy": 2.2726083517074587, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.30478386878967284, + "loss/reg": 0.0, + "step": 35670 + }, + { + "epoch": 0.23473684210526316, + "grad_norm": 2.734375, + "grad_norm_var": 0.0516754150390625, + "learning_rate": 0.0001, + "loss": 3.072, + "loss/crossentropy": 2.2021562099456786, + "loss/hidden": 3.0625, + "loss/incoh": 0.0, + "loss/logits": 0.27211851328611375, + "loss/reg": 0.0, + "step": 35680 + }, + { + "epoch": 0.23480263157894737, + "grad_norm": 2.515625, + "grad_norm_var": 0.05318603515625, + "learning_rate": 0.0001, + "loss": 3.0061, + "loss/crossentropy": 2.2337870597839355, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.23637954592704774, + "loss/reg": 0.0, + "step": 35690 + }, + { + "epoch": 0.2348684210526316, + "grad_norm": 2.546875, + "grad_norm_var": 0.21002197265625, + "learning_rate": 0.0001, + "loss": 3.0582, + "loss/crossentropy": 1.9850136280059814, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.2096769317984581, + "loss/reg": 0.0, + "step": 35700 + }, + { + "epoch": 0.2349342105263158, + "grad_norm": 6.125, + "grad_norm_var": 1.515673828125, + "learning_rate": 0.0001, + "loss": 2.9576, + "loss/crossentropy": 2.496402633190155, + "loss/hidden": 2.690625, + "loss/incoh": 0.0, + "loss/logits": 0.22533617317676544, + "loss/reg": 0.0, + "step": 35710 + }, + { + "epoch": 0.235, + "grad_norm": 2.359375, + "grad_norm_var": 0.95963134765625, + "learning_rate": 0.0001, + "loss": 3.0797, + "loss/crossentropy": 2.140752410888672, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.20643131881952287, + "loss/reg": 0.0, + "step": 35720 + }, + { + "epoch": 0.2350657894736842, + "grad_norm": 2.796875, + "grad_norm_var": 0.19202067057291666, + "learning_rate": 0.0001, + "loss": 2.9408, + "loss/crossentropy": 1.9449581146240233, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.1967112362384796, + "loss/reg": 0.0, + "step": 35730 + }, + { + "epoch": 0.2351315789473684, + "grad_norm": 2.34375, + "grad_norm_var": 0.718365224202474, + "learning_rate": 0.0001, + "loss": 2.873, + "loss/crossentropy": 2.059830403327942, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.1984706148505211, + "loss/reg": 0.0, + "step": 35740 + }, + { + "epoch": 0.23519736842105263, + "grad_norm": 1.90625, + "grad_norm_var": 0.7783404032389323, + "learning_rate": 0.0001, + "loss": 2.9617, + "loss/crossentropy": 2.5269897818565368, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.24567708522081375, + "loss/reg": 0.0, + "step": 35750 + }, + { + "epoch": 0.23526315789473684, + "grad_norm": 2.75, + "grad_norm_var": 0.3910441080729167, + "learning_rate": 0.0001, + "loss": 3.049, + "loss/crossentropy": 2.2286418437957765, + "loss/hidden": 3.084375, + "loss/incoh": 0.0, + "loss/logits": 0.22506199181079864, + "loss/reg": 0.0, + "step": 35760 + }, + { + "epoch": 0.23532894736842105, + "grad_norm": 2.75, + "grad_norm_var": 0.20345052083333334, + "learning_rate": 0.0001, + "loss": 3.0262, + "loss/crossentropy": 2.321590745449066, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.22042199671268464, + "loss/reg": 0.0, + "step": 35770 + }, + { + "epoch": 0.23539473684210527, + "grad_norm": 2.328125, + "grad_norm_var": 0.6254981994628906, + "learning_rate": 0.0001, + "loss": 2.9981, + "loss/crossentropy": 2.3971486926078795, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.21692513525485993, + "loss/reg": 0.0, + "step": 35780 + }, + { + "epoch": 0.23546052631578948, + "grad_norm": 2.078125, + "grad_norm_var": 0.2902809143066406, + "learning_rate": 0.0001, + "loss": 2.931, + "loss/crossentropy": 2.3023766756057737, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.2162985995411873, + "loss/reg": 0.0, + "step": 35790 + }, + { + "epoch": 0.2355263157894737, + "grad_norm": 2.421875, + "grad_norm_var": 0.16299540201822918, + "learning_rate": 0.0001, + "loss": 3.0115, + "loss/crossentropy": 2.355253207683563, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.24053574204444886, + "loss/reg": 0.0, + "step": 35800 + }, + { + "epoch": 0.2355921052631579, + "grad_norm": 2.15625, + "grad_norm_var": 0.10937398274739583, + "learning_rate": 0.0001, + "loss": 2.9729, + "loss/crossentropy": 2.493846869468689, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.2182188354432583, + "loss/reg": 0.0, + "step": 35810 + }, + { + "epoch": 0.2356578947368421, + "grad_norm": 2.296875, + "grad_norm_var": 0.46917317708333334, + "learning_rate": 0.0001, + "loss": 3.0177, + "loss/crossentropy": 2.3108774185180665, + "loss/hidden": 2.796875, + "loss/incoh": 0.0, + "loss/logits": 0.17969555854797364, + "loss/reg": 0.0, + "step": 35820 + }, + { + "epoch": 0.2357236842105263, + "grad_norm": 2.21875, + "grad_norm_var": 0.29129231770833336, + "learning_rate": 0.0001, + "loss": 3.0199, + "loss/crossentropy": 2.1752161145210267, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.25736078470945356, + "loss/reg": 0.0, + "step": 35830 + }, + { + "epoch": 0.23578947368421052, + "grad_norm": 2.390625, + "grad_norm_var": 0.15127665201822918, + "learning_rate": 0.0001, + "loss": 3.0057, + "loss/crossentropy": 2.21168737411499, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.24892309606075286, + "loss/reg": 0.0, + "step": 35840 + }, + { + "epoch": 0.23585526315789473, + "grad_norm": 2.328125, + "grad_norm_var": 0.38827718098958336, + "learning_rate": 0.0001, + "loss": 2.9855, + "loss/crossentropy": 2.4601115822792052, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.23503982722759248, + "loss/reg": 0.0, + "step": 35850 + }, + { + "epoch": 0.23592105263157895, + "grad_norm": 2.375, + "grad_norm_var": 0.0817779541015625, + "learning_rate": 0.0001, + "loss": 2.9943, + "loss/crossentropy": 2.4704690098762514, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2563426047563553, + "loss/reg": 0.0, + "step": 35860 + }, + { + "epoch": 0.23598684210526316, + "grad_norm": 2.078125, + "grad_norm_var": 0.16731541951497395, + "learning_rate": 0.0001, + "loss": 3.0406, + "loss/crossentropy": 2.6593142986297607, + "loss/hidden": 3.028125, + "loss/incoh": 0.0, + "loss/logits": 0.2891248628497124, + "loss/reg": 0.0, + "step": 35870 + }, + { + "epoch": 0.23605263157894738, + "grad_norm": 2.234375, + "grad_norm_var": 0.13696263631184896, + "learning_rate": 0.0001, + "loss": 3.0187, + "loss/crossentropy": 2.4739042282104493, + "loss/hidden": 2.7046875, + "loss/incoh": 0.0, + "loss/logits": 0.23506251722574234, + "loss/reg": 0.0, + "step": 35880 + }, + { + "epoch": 0.2361184210526316, + "grad_norm": 2.296875, + "grad_norm_var": 0.159375, + "learning_rate": 0.0001, + "loss": 3.0207, + "loss/crossentropy": 2.2195907831192017, + "loss/hidden": 2.6703125, + "loss/incoh": 0.0, + "loss/logits": 0.20974744409322738, + "loss/reg": 0.0, + "step": 35890 + }, + { + "epoch": 0.2361842105263158, + "grad_norm": 2.21875, + "grad_norm_var": 0.2710039774576823, + "learning_rate": 0.0001, + "loss": 2.9166, + "loss/crossentropy": 2.1691187381744386, + "loss/hidden": 2.7046875, + "loss/incoh": 0.0, + "loss/logits": 0.20902955271303653, + "loss/reg": 0.0, + "step": 35900 + }, + { + "epoch": 0.23625, + "grad_norm": 2.484375, + "grad_norm_var": 0.021848297119140624, + "learning_rate": 0.0001, + "loss": 2.9421, + "loss/crossentropy": 2.380651593208313, + "loss/hidden": 2.64375, + "loss/incoh": 0.0, + "loss/logits": 0.21168311238288878, + "loss/reg": 0.0, + "step": 35910 + }, + { + "epoch": 0.2363157894736842, + "grad_norm": 2.390625, + "grad_norm_var": 0.0631256103515625, + "learning_rate": 0.0001, + "loss": 2.9784, + "loss/crossentropy": 2.194112575054169, + "loss/hidden": 2.925, + "loss/incoh": 0.0, + "loss/logits": 0.2107112467288971, + "loss/reg": 0.0, + "step": 35920 + }, + { + "epoch": 0.23638157894736841, + "grad_norm": 2.296875, + "grad_norm_var": 0.19700419108072917, + "learning_rate": 0.0001, + "loss": 3.0817, + "loss/crossentropy": 2.289298951625824, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.1993821457028389, + "loss/reg": 0.0, + "step": 35930 + }, + { + "epoch": 0.23644736842105263, + "grad_norm": 2.828125, + "grad_norm_var": 0.07625325520833333, + "learning_rate": 0.0001, + "loss": 2.9474, + "loss/crossentropy": 2.4226235032081602, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.2387748032808304, + "loss/reg": 0.0, + "step": 35940 + }, + { + "epoch": 0.23651315789473684, + "grad_norm": 2.390625, + "grad_norm_var": 0.04461263020833333, + "learning_rate": 0.0001, + "loss": 2.9628, + "loss/crossentropy": 2.380166971683502, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.2143217995762825, + "loss/reg": 0.0, + "step": 35950 + }, + { + "epoch": 0.23657894736842106, + "grad_norm": 2.3125, + "grad_norm_var": 0.1749908447265625, + "learning_rate": 0.0001, + "loss": 2.9418, + "loss/crossentropy": 2.286650228500366, + "loss/hidden": 2.5953125, + "loss/incoh": 0.0, + "loss/logits": 0.21095365434885024, + "loss/reg": 0.0, + "step": 35960 + }, + { + "epoch": 0.23664473684210527, + "grad_norm": 2.3125, + "grad_norm_var": 0.11900634765625, + "learning_rate": 0.0001, + "loss": 3.0537, + "loss/crossentropy": 2.2950440287590026, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.2576613754034042, + "loss/reg": 0.0, + "step": 35970 + }, + { + "epoch": 0.23671052631578948, + "grad_norm": 2.28125, + "grad_norm_var": 0.12654520670572916, + "learning_rate": 0.0001, + "loss": 2.9324, + "loss/crossentropy": 2.236435151100159, + "loss/hidden": 2.6484375, + "loss/incoh": 0.0, + "loss/logits": 0.20467466115951538, + "loss/reg": 0.0, + "step": 35980 + }, + { + "epoch": 0.2367763157894737, + "grad_norm": 2.40625, + "grad_norm_var": 0.12952473958333333, + "learning_rate": 0.0001, + "loss": 2.9877, + "loss/crossentropy": 2.503733921051025, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.21396732181310654, + "loss/reg": 0.0, + "step": 35990 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 2.109375, + "grad_norm_var": 0.10692952473958334, + "learning_rate": 0.0001, + "loss": 2.9584, + "loss/crossentropy": 2.493756449222565, + "loss/hidden": 2.9109375, + "loss/incoh": 0.0, + "loss/logits": 0.23922124207019807, + "loss/reg": 0.0, + "step": 36000 + }, + { + "epoch": 0.2369078947368421, + "grad_norm": 2.28125, + "grad_norm_var": 0.92545166015625, + "learning_rate": 0.0001, + "loss": 3.0174, + "loss/crossentropy": 1.8596992015838623, + "loss/hidden": 2.6828125, + "loss/incoh": 0.0, + "loss/logits": 0.19636918008327484, + "loss/reg": 0.0, + "step": 36010 + }, + { + "epoch": 0.2369736842105263, + "grad_norm": 2.28125, + "grad_norm_var": 0.9360260009765625, + "learning_rate": 0.0001, + "loss": 2.9994, + "loss/crossentropy": 2.239300674200058, + "loss/hidden": 2.75625, + "loss/incoh": 0.0, + "loss/logits": 0.21985187232494355, + "loss/reg": 0.0, + "step": 36020 + }, + { + "epoch": 0.23703947368421052, + "grad_norm": 4.625, + "grad_norm_var": 2.2611793518066405, + "learning_rate": 0.0001, + "loss": 2.987, + "loss/crossentropy": 2.22670122385025, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.22871969491243363, + "loss/reg": 0.0, + "step": 36030 + }, + { + "epoch": 0.23710526315789474, + "grad_norm": 3.0, + "grad_norm_var": 0.4241167704264323, + "learning_rate": 0.0001, + "loss": 3.0196, + "loss/crossentropy": 2.0241116285324097, + "loss/hidden": 2.94375, + "loss/incoh": 0.0, + "loss/logits": 0.24767006561160088, + "loss/reg": 0.0, + "step": 36040 + }, + { + "epoch": 0.23717105263157895, + "grad_norm": 2.09375, + "grad_norm_var": 0.30030924479166665, + "learning_rate": 0.0001, + "loss": 2.9484, + "loss/crossentropy": 2.086419093608856, + "loss/hidden": 2.696875, + "loss/incoh": 0.0, + "loss/logits": 0.1900019347667694, + "loss/reg": 0.0, + "step": 36050 + }, + { + "epoch": 0.23723684210526316, + "grad_norm": 2.25, + "grad_norm_var": 0.1086822509765625, + "learning_rate": 0.0001, + "loss": 2.9919, + "loss/crossentropy": 2.1598266899585723, + "loss/hidden": 2.634375, + "loss/incoh": 0.0, + "loss/logits": 0.26148284450173376, + "loss/reg": 0.0, + "step": 36060 + }, + { + "epoch": 0.23730263157894738, + "grad_norm": 3.203125, + "grad_norm_var": 0.10439453125, + "learning_rate": 0.0001, + "loss": 3.0653, + "loss/crossentropy": 2.1726396083831787, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.2597889766097069, + "loss/reg": 0.0, + "step": 36070 + }, + { + "epoch": 0.2373684210526316, + "grad_norm": 2.140625, + "grad_norm_var": 0.10669657389322916, + "learning_rate": 0.0001, + "loss": 2.9768, + "loss/crossentropy": 2.1650060296058653, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.22962034344673157, + "loss/reg": 0.0, + "step": 36080 + }, + { + "epoch": 0.2374342105263158, + "grad_norm": 2.40625, + "grad_norm_var": 0.04478759765625, + "learning_rate": 0.0001, + "loss": 2.9233, + "loss/crossentropy": 2.289682912826538, + "loss/hidden": 2.646875, + "loss/incoh": 0.0, + "loss/logits": 0.2165387198328972, + "loss/reg": 0.0, + "step": 36090 + }, + { + "epoch": 0.2375, + "grad_norm": 1.9921875, + "grad_norm_var": 0.09746068318684896, + "learning_rate": 0.0001, + "loss": 3.0195, + "loss/crossentropy": 2.332055389881134, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.21832392662763594, + "loss/reg": 0.0, + "step": 36100 + }, + { + "epoch": 0.2375657894736842, + "grad_norm": 2.75, + "grad_norm_var": 0.07844619750976563, + "learning_rate": 0.0001, + "loss": 3.0759, + "loss/crossentropy": 2.5219852209091185, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.27026659697294236, + "loss/reg": 0.0, + "step": 36110 + }, + { + "epoch": 0.23763157894736842, + "grad_norm": 2.390625, + "grad_norm_var": 0.07219950358072917, + "learning_rate": 0.0001, + "loss": 2.9762, + "loss/crossentropy": 2.2172631919384003, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.22206666469573974, + "loss/reg": 0.0, + "step": 36120 + }, + { + "epoch": 0.23769736842105263, + "grad_norm": 2.0625, + "grad_norm_var": 0.11806640625, + "learning_rate": 0.0001, + "loss": 2.9813, + "loss/crossentropy": 2.235606110095978, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.23971866220235824, + "loss/reg": 0.0, + "step": 36130 + }, + { + "epoch": 0.23776315789473684, + "grad_norm": 2.53125, + "grad_norm_var": 1.190087890625, + "learning_rate": 0.0001, + "loss": 3.0329, + "loss/crossentropy": 2.2865034997463227, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.32822101563215256, + "loss/reg": 0.0, + "step": 36140 + }, + { + "epoch": 0.23782894736842106, + "grad_norm": 2.625, + "grad_norm_var": 1.3279296875, + "learning_rate": 0.0001, + "loss": 3.0021, + "loss/crossentropy": 2.2038097441196443, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.21367209404706955, + "loss/reg": 0.0, + "step": 36150 + }, + { + "epoch": 0.23789473684210527, + "grad_norm": 2.265625, + "grad_norm_var": 0.05592041015625, + "learning_rate": 0.0001, + "loss": 3.0254, + "loss/crossentropy": 2.2345643639564514, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.27571815252304077, + "loss/reg": 0.0, + "step": 36160 + }, + { + "epoch": 0.23796052631578948, + "grad_norm": 2.65625, + "grad_norm_var": 0.11005757649739584, + "learning_rate": 0.0001, + "loss": 3.039, + "loss/crossentropy": 2.1099373579025267, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.21341636329889296, + "loss/reg": 0.0, + "step": 36170 + }, + { + "epoch": 0.2380263157894737, + "grad_norm": 2.125, + "grad_norm_var": 0.114306640625, + "learning_rate": 0.0001, + "loss": 2.9229, + "loss/crossentropy": 2.1778077363967894, + "loss/hidden": 2.5546875, + "loss/incoh": 0.0, + "loss/logits": 0.18448112457990645, + "loss/reg": 0.0, + "step": 36180 + }, + { + "epoch": 0.23809210526315788, + "grad_norm": 2.375, + "grad_norm_var": 0.042867024739583336, + "learning_rate": 0.0001, + "loss": 2.9876, + "loss/crossentropy": 2.5516823649406435, + "loss/hidden": 2.7046875, + "loss/incoh": 0.0, + "loss/logits": 0.25054328292608263, + "loss/reg": 0.0, + "step": 36190 + }, + { + "epoch": 0.2381578947368421, + "grad_norm": 2.390625, + "grad_norm_var": 0.07666727701822916, + "learning_rate": 0.0001, + "loss": 2.9734, + "loss/crossentropy": 2.257242572307587, + "loss/hidden": 2.6921875, + "loss/incoh": 0.0, + "loss/logits": 0.2207578793168068, + "loss/reg": 0.0, + "step": 36200 + }, + { + "epoch": 0.2382236842105263, + "grad_norm": 2.078125, + "grad_norm_var": 0.07727864583333334, + "learning_rate": 0.0001, + "loss": 2.999, + "loss/crossentropy": 2.093051493167877, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.2044597864151001, + "loss/reg": 0.0, + "step": 36210 + }, + { + "epoch": 0.23828947368421052, + "grad_norm": 2.0625, + "grad_norm_var": 0.11516825358072917, + "learning_rate": 0.0001, + "loss": 2.9287, + "loss/crossentropy": 2.1220389723777773, + "loss/hidden": 2.6828125, + "loss/incoh": 0.0, + "loss/logits": 0.19910518899559976, + "loss/reg": 0.0, + "step": 36220 + }, + { + "epoch": 0.23835526315789474, + "grad_norm": 2.65625, + "grad_norm_var": 0.2674763997395833, + "learning_rate": 0.0001, + "loss": 3.0577, + "loss/crossentropy": 2.1114853382110597, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.2416158512234688, + "loss/reg": 0.0, + "step": 36230 + }, + { + "epoch": 0.23842105263157895, + "grad_norm": 2.671875, + "grad_norm_var": 0.05162353515625, + "learning_rate": 0.0001, + "loss": 2.9816, + "loss/crossentropy": 2.332682567834854, + "loss/hidden": 2.640625, + "loss/incoh": 0.0, + "loss/logits": 0.2057093556970358, + "loss/reg": 0.0, + "step": 36240 + }, + { + "epoch": 0.23848684210526316, + "grad_norm": 2.15625, + "grad_norm_var": 0.04888916015625, + "learning_rate": 0.0001, + "loss": 2.9878, + "loss/crossentropy": 2.26646374464035, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.22881892770528794, + "loss/reg": 0.0, + "step": 36250 + }, + { + "epoch": 0.23855263157894738, + "grad_norm": 2.109375, + "grad_norm_var": 0.0685455322265625, + "learning_rate": 0.0001, + "loss": 3.041, + "loss/crossentropy": 2.0819905757904054, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.2033102184534073, + "loss/reg": 0.0, + "step": 36260 + }, + { + "epoch": 0.2386184210526316, + "grad_norm": 3.453125, + "grad_norm_var": 2.275194295247396, + "learning_rate": 0.0001, + "loss": 3.0941, + "loss/crossentropy": 2.1767791748046874, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.24875595569610595, + "loss/reg": 0.0, + "step": 36270 + }, + { + "epoch": 0.23868421052631578, + "grad_norm": 2.28125, + "grad_norm_var": 3.0041412353515624, + "learning_rate": 0.0001, + "loss": 3.0775, + "loss/crossentropy": 2.389254295825958, + "loss/hidden": 2.659375, + "loss/incoh": 0.0, + "loss/logits": 0.20708534121513367, + "loss/reg": 0.0, + "step": 36280 + }, + { + "epoch": 0.23875, + "grad_norm": 2.3125, + "grad_norm_var": 1.19429931640625, + "learning_rate": 0.0001, + "loss": 2.9765, + "loss/crossentropy": 2.2551061868667603, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.23167352080345155, + "loss/reg": 0.0, + "step": 36290 + }, + { + "epoch": 0.2388157894736842, + "grad_norm": 2.578125, + "grad_norm_var": 0.4574127197265625, + "learning_rate": 0.0001, + "loss": 3.0732, + "loss/crossentropy": 2.3170044660568236, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.22510029524564742, + "loss/reg": 0.0, + "step": 36300 + }, + { + "epoch": 0.23888157894736842, + "grad_norm": 2.421875, + "grad_norm_var": 0.6108904520670573, + "learning_rate": 0.0001, + "loss": 3.0454, + "loss/crossentropy": 1.9875805854797364, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.21809106022119523, + "loss/reg": 0.0, + "step": 36310 + }, + { + "epoch": 0.23894736842105263, + "grad_norm": 2.40625, + "grad_norm_var": 0.7914265950520833, + "learning_rate": 0.0001, + "loss": 2.9925, + "loss/crossentropy": 2.3829528450965882, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.2381367027759552, + "loss/reg": 0.0, + "step": 36320 + }, + { + "epoch": 0.23901315789473684, + "grad_norm": 2.234375, + "grad_norm_var": 0.3254778544108073, + "learning_rate": 0.0001, + "loss": 3.0144, + "loss/crossentropy": 2.196928286552429, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.24151963144540786, + "loss/reg": 0.0, + "step": 36330 + }, + { + "epoch": 0.23907894736842106, + "grad_norm": 2.375, + "grad_norm_var": 0.0215240478515625, + "learning_rate": 0.0001, + "loss": 2.9815, + "loss/crossentropy": 2.2826131939888, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.23180599957704545, + "loss/reg": 0.0, + "step": 36340 + }, + { + "epoch": 0.23914473684210527, + "grad_norm": 2.671875, + "grad_norm_var": 0.7601847330729167, + "learning_rate": 0.0001, + "loss": 3.025, + "loss/crossentropy": 2.4469813466072083, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.21003625690937042, + "loss/reg": 0.0, + "step": 36350 + }, + { + "epoch": 0.23921052631578948, + "grad_norm": 2.3125, + "grad_norm_var": 0.04129231770833333, + "learning_rate": 0.0001, + "loss": 2.9786, + "loss/crossentropy": 2.5087629079818727, + "loss/hidden": 2.7390625, + "loss/incoh": 0.0, + "loss/logits": 0.22830777615308762, + "loss/reg": 0.0, + "step": 36360 + }, + { + "epoch": 0.2392763157894737, + "grad_norm": 2.40625, + "grad_norm_var": 0.05718994140625, + "learning_rate": 0.0001, + "loss": 3.0514, + "loss/crossentropy": 2.0158798813819887, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.22552752420306205, + "loss/reg": 0.0, + "step": 36370 + }, + { + "epoch": 0.23934210526315788, + "grad_norm": 2.25, + "grad_norm_var": 0.056298828125, + "learning_rate": 0.0001, + "loss": 3.0251, + "loss/crossentropy": 1.9761714577674865, + "loss/hidden": 2.859375, + "loss/incoh": 0.0, + "loss/logits": 0.23326522260904312, + "loss/reg": 0.0, + "step": 36380 + }, + { + "epoch": 0.2394078947368421, + "grad_norm": 2.640625, + "grad_norm_var": 0.19085286458333334, + "learning_rate": 0.0001, + "loss": 3.0543, + "loss/crossentropy": 2.1724894046783447, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.2123780742287636, + "loss/reg": 0.0, + "step": 36390 + }, + { + "epoch": 0.2394736842105263, + "grad_norm": 2.46875, + "grad_norm_var": 0.1413726806640625, + "learning_rate": 0.0001, + "loss": 2.9757, + "loss/crossentropy": 2.1725098967552183, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.2148168534040451, + "loss/reg": 0.0, + "step": 36400 + }, + { + "epoch": 0.23953947368421052, + "grad_norm": 2.640625, + "grad_norm_var": 4.847835032145182, + "learning_rate": 0.0001, + "loss": 3.046, + "loss/crossentropy": 2.4329644799232484, + "loss/hidden": 2.8671875, + "loss/incoh": 0.0, + "loss/logits": 0.26010534912347794, + "loss/reg": 0.0, + "step": 36410 + }, + { + "epoch": 0.23960526315789474, + "grad_norm": 2.453125, + "grad_norm_var": 4.855410766601563, + "learning_rate": 0.0001, + "loss": 2.9563, + "loss/crossentropy": 2.215455192327499, + "loss/hidden": 2.909375, + "loss/incoh": 0.0, + "loss/logits": 0.23449745997786522, + "loss/reg": 0.0, + "step": 36420 + }, + { + "epoch": 0.23967105263157895, + "grad_norm": 2.4375, + "grad_norm_var": 0.0796539306640625, + "learning_rate": 0.0001, + "loss": 3.015, + "loss/crossentropy": 2.411003601551056, + "loss/hidden": 2.8921875, + "loss/incoh": 0.0, + "loss/logits": 0.27160040885210035, + "loss/reg": 0.0, + "step": 36430 + }, + { + "epoch": 0.23973684210526316, + "grad_norm": 2.328125, + "grad_norm_var": 0.0956207275390625, + "learning_rate": 0.0001, + "loss": 2.9926, + "loss/crossentropy": 2.542527449131012, + "loss/hidden": 2.8, + "loss/incoh": 0.0, + "loss/logits": 0.23901409804821014, + "loss/reg": 0.0, + "step": 36440 + }, + { + "epoch": 0.23980263157894738, + "grad_norm": 2.421875, + "grad_norm_var": 0.1344146728515625, + "learning_rate": 0.0001, + "loss": 3.0325, + "loss/crossentropy": 2.28466220498085, + "loss/hidden": 2.665625, + "loss/incoh": 0.0, + "loss/logits": 0.21214617267251015, + "loss/reg": 0.0, + "step": 36450 + }, + { + "epoch": 0.2398684210526316, + "grad_norm": 2.15625, + "grad_norm_var": 0.07780659993489583, + "learning_rate": 0.0001, + "loss": 3.0155, + "loss/crossentropy": 2.521458077430725, + "loss/hidden": 2.7625, + "loss/incoh": 0.0, + "loss/logits": 0.24949344843626023, + "loss/reg": 0.0, + "step": 36460 + }, + { + "epoch": 0.23993421052631578, + "grad_norm": 3.171875, + "grad_norm_var": 0.10063374837239583, + "learning_rate": 0.0001, + "loss": 3.0598, + "loss/crossentropy": 2.11941602230072, + "loss/hidden": 2.8125, + "loss/incoh": 0.0, + "loss/logits": 0.265853063762188, + "loss/reg": 0.0, + "step": 36470 + }, + { + "epoch": 0.24, + "grad_norm": 2.078125, + "grad_norm_var": 0.15134175618489584, + "learning_rate": 0.0001, + "loss": 3.1036, + "loss/crossentropy": 2.159947466850281, + "loss/hidden": 2.7875, + "loss/incoh": 0.0, + "loss/logits": 0.23642594143748283, + "loss/reg": 0.0, + "step": 36480 + }, + { + "epoch": 0.2400657894736842, + "grad_norm": 2.34375, + "grad_norm_var": 0.1106109619140625, + "learning_rate": 0.0001, + "loss": 2.9888, + "loss/crossentropy": 2.2564534187316894, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.24869659841060637, + "loss/reg": 0.0, + "step": 36490 + }, + { + "epoch": 0.24013157894736842, + "grad_norm": 2.546875, + "grad_norm_var": 0.04983723958333333, + "learning_rate": 0.0001, + "loss": 3.107, + "loss/crossentropy": 2.3479647517204283, + "loss/hidden": 2.715625, + "loss/incoh": 0.0, + "loss/logits": 0.2286713719367981, + "loss/reg": 0.0, + "step": 36500 + }, + { + "epoch": 0.24019736842105263, + "grad_norm": 2.5, + "grad_norm_var": 0.07700093587239583, + "learning_rate": 0.0001, + "loss": 3.0664, + "loss/crossentropy": 2.4374531388282774, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.23715483397245407, + "loss/reg": 0.0, + "step": 36510 + }, + { + "epoch": 0.24026315789473685, + "grad_norm": 3.171875, + "grad_norm_var": 0.28349202473958335, + "learning_rate": 0.0001, + "loss": 3.0541, + "loss/crossentropy": 1.9271621584892273, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.2382538564503193, + "loss/reg": 0.0, + "step": 36520 + }, + { + "epoch": 0.24032894736842106, + "grad_norm": 2.1875, + "grad_norm_var": 0.26936747233072916, + "learning_rate": 0.0001, + "loss": 2.9974, + "loss/crossentropy": 2.3039157390594482, + "loss/hidden": 2.5375, + "loss/incoh": 0.0, + "loss/logits": 0.18885463178157808, + "loss/reg": 0.0, + "step": 36530 + }, + { + "epoch": 0.24039473684210527, + "grad_norm": 2.359375, + "grad_norm_var": 0.04563700358072917, + "learning_rate": 0.0001, + "loss": 2.9944, + "loss/crossentropy": 2.1725877285003663, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.22165239751338958, + "loss/reg": 0.0, + "step": 36540 + }, + { + "epoch": 0.24046052631578949, + "grad_norm": 2.46875, + "grad_norm_var": 0.026154581705729166, + "learning_rate": 0.0001, + "loss": 3.011, + "loss/crossentropy": 2.393988037109375, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.25498510152101517, + "loss/reg": 0.0, + "step": 36550 + }, + { + "epoch": 0.24052631578947367, + "grad_norm": 5.125, + "grad_norm_var": 0.5747548421223958, + "learning_rate": 0.0001, + "loss": 3.0477, + "loss/crossentropy": 2.3300102829933165, + "loss/hidden": 2.921875, + "loss/incoh": 0.0, + "loss/logits": 0.2910366252064705, + "loss/reg": 0.0, + "step": 36560 + }, + { + "epoch": 0.24059210526315788, + "grad_norm": 2.390625, + "grad_norm_var": 0.541943359375, + "learning_rate": 0.0001, + "loss": 3.0418, + "loss/crossentropy": 2.2268447399139406, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.236025869846344, + "loss/reg": 0.0, + "step": 36570 + }, + { + "epoch": 0.2406578947368421, + "grad_norm": 2.46875, + "grad_norm_var": 0.11101786295572917, + "learning_rate": 0.0001, + "loss": 3.0594, + "loss/crossentropy": 2.4716415405273438, + "loss/hidden": 2.8453125, + "loss/incoh": 0.0, + "loss/logits": 0.2721579551696777, + "loss/reg": 0.0, + "step": 36580 + }, + { + "epoch": 0.2407236842105263, + "grad_norm": 2.609375, + "grad_norm_var": 0.09099934895833334, + "learning_rate": 0.0001, + "loss": 2.9772, + "loss/crossentropy": 2.2797286033630373, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.24081944078207015, + "loss/reg": 0.0, + "step": 36590 + }, + { + "epoch": 0.24078947368421053, + "grad_norm": 2.1875, + "grad_norm_var": 0.04234110514322917, + "learning_rate": 0.0001, + "loss": 2.9358, + "loss/crossentropy": 2.2048792719841, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2422194480895996, + "loss/reg": 0.0, + "step": 36600 + }, + { + "epoch": 0.24085526315789474, + "grad_norm": 2.34375, + "grad_norm_var": 0.061766560872395834, + "learning_rate": 0.0001, + "loss": 3.0394, + "loss/crossentropy": 2.4691983580589296, + "loss/hidden": 2.903125, + "loss/incoh": 0.0, + "loss/logits": 0.2871833577752113, + "loss/reg": 0.0, + "step": 36610 + }, + { + "epoch": 0.24092105263157895, + "grad_norm": 2.1875, + "grad_norm_var": 0.07330322265625, + "learning_rate": 0.0001, + "loss": 3.0204, + "loss/crossentropy": 2.3721364736557007, + "loss/hidden": 2.7421875, + "loss/incoh": 0.0, + "loss/logits": 0.24769402742385865, + "loss/reg": 0.0, + "step": 36620 + }, + { + "epoch": 0.24098684210526317, + "grad_norm": 2.40625, + "grad_norm_var": 0.08896077473958333, + "learning_rate": 0.0001, + "loss": 3.0345, + "loss/crossentropy": 2.6213492870330812, + "loss/hidden": 2.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.22394069284200668, + "loss/reg": 0.0, + "step": 36630 + }, + { + "epoch": 0.24105263157894738, + "grad_norm": 2.609375, + "grad_norm_var": 0.0711822509765625, + "learning_rate": 0.0001, + "loss": 2.9613, + "loss/crossentropy": 2.4529595017433166, + "loss/hidden": 2.696875, + "loss/incoh": 0.0, + "loss/logits": 0.2483013778924942, + "loss/reg": 0.0, + "step": 36640 + }, + { + "epoch": 0.24111842105263157, + "grad_norm": 2.34375, + "grad_norm_var": 0.024247233072916666, + "learning_rate": 0.0001, + "loss": 3.0826, + "loss/crossentropy": 2.5188237547874452, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.25313877910375593, + "loss/reg": 0.0, + "step": 36650 + }, + { + "epoch": 0.24118421052631578, + "grad_norm": 1.96875, + "grad_norm_var": 0.41513264973958336, + "learning_rate": 0.0001, + "loss": 3.0485, + "loss/crossentropy": 2.358896279335022, + "loss/hidden": 2.8015625, + "loss/incoh": 0.0, + "loss/logits": 0.2380443513393402, + "loss/reg": 0.0, + "step": 36660 + }, + { + "epoch": 0.24125, + "grad_norm": 2088763392.0, + "grad_norm_var": 4.570212428561056e+17, + "learning_rate": 0.0001, + "loss": 3.3122, + "loss/crossentropy": 2.4535842657089235, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.23562741428613662, + "loss/reg": 0.0, + "step": 36670 + }, + { + "epoch": 0.2413157894736842, + "grad_norm": 2.453125, + "grad_norm_var": 4.5702124295693926e+17, + "learning_rate": 0.0001, + "loss": 3.0286, + "loss/crossentropy": 2.1914322853088377, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.25221034586429597, + "loss/reg": 0.0, + "step": 36680 + }, + { + "epoch": 0.24138157894736842, + "grad_norm": 2.578125, + "grad_norm_var": 0.031927235921223956, + "learning_rate": 0.0001, + "loss": 3.0104, + "loss/crossentropy": 2.4763585209846495, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.2514804720878601, + "loss/reg": 0.0, + "step": 36690 + }, + { + "epoch": 0.24144736842105263, + "grad_norm": 3.1875, + "grad_norm_var": 0.08718236287434895, + "learning_rate": 0.0001, + "loss": 3.159, + "loss/crossentropy": 2.0223356008529665, + "loss/hidden": 3.146875, + "loss/incoh": 0.0, + "loss/logits": 0.26726412028074265, + "loss/reg": 0.0, + "step": 36700 + }, + { + "epoch": 0.24151315789473685, + "grad_norm": 2.328125, + "grad_norm_var": 0.056004842122395836, + "learning_rate": 0.0001, + "loss": 3.0129, + "loss/crossentropy": 2.2822505116462706, + "loss/hidden": 2.696875, + "loss/incoh": 0.0, + "loss/logits": 0.21236415654420854, + "loss/reg": 0.0, + "step": 36710 + }, + { + "epoch": 0.24157894736842106, + "grad_norm": 2.1875, + "grad_norm_var": 0.049605305989583334, + "learning_rate": 0.0001, + "loss": 2.9728, + "loss/crossentropy": 2.152122360467911, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.22159827202558519, + "loss/reg": 0.0, + "step": 36720 + }, + { + "epoch": 0.24164473684210527, + "grad_norm": 2.390625, + "grad_norm_var": 0.2835845947265625, + "learning_rate": 0.0001, + "loss": 3.1062, + "loss/crossentropy": 2.415015733242035, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.2637294501066208, + "loss/reg": 0.0, + "step": 36730 + }, + { + "epoch": 0.2417105263157895, + "grad_norm": 2.359375, + "grad_norm_var": 0.24265950520833332, + "learning_rate": 0.0001, + "loss": 3.0508, + "loss/crossentropy": 2.143397808074951, + "loss/hidden": 2.81875, + "loss/incoh": 0.0, + "loss/logits": 0.26159815937280656, + "loss/reg": 0.0, + "step": 36740 + }, + { + "epoch": 0.24177631578947367, + "grad_norm": 2.875, + "grad_norm_var": 0.0584381103515625, + "learning_rate": 0.0001, + "loss": 3.1012, + "loss/crossentropy": 2.3698354959487915, + "loss/hidden": 2.8609375, + "loss/incoh": 0.0, + "loss/logits": 0.26768290251493454, + "loss/reg": 0.0, + "step": 36750 + }, + { + "epoch": 0.24184210526315789, + "grad_norm": 1.9375, + "grad_norm_var": 0.9022532145182292, + "learning_rate": 0.0001, + "loss": 2.9442, + "loss/crossentropy": 2.1970738768577576, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.2042816124856472, + "loss/reg": 0.0, + "step": 36760 + }, + { + "epoch": 0.2419078947368421, + "grad_norm": 2.15625, + "grad_norm_var": 0.9153279622395833, + "learning_rate": 0.0001, + "loss": 3.0157, + "loss/crossentropy": 2.345754420757294, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.23231206387281417, + "loss/reg": 0.0, + "step": 36770 + }, + { + "epoch": 0.2419736842105263, + "grad_norm": 2.25, + "grad_norm_var": 0.08898111979166666, + "learning_rate": 0.0001, + "loss": 3.0409, + "loss/crossentropy": 2.3259128451347353, + "loss/hidden": 2.9046875, + "loss/incoh": 0.0, + "loss/logits": 0.23587240129709244, + "loss/reg": 0.0, + "step": 36780 + }, + { + "epoch": 0.24203947368421053, + "grad_norm": 2.5625, + "grad_norm_var": 0.07978108723958334, + "learning_rate": 0.0001, + "loss": 3.0165, + "loss/crossentropy": 2.5162180185317995, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.2471260607242584, + "loss/reg": 0.0, + "step": 36790 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 2.234375, + "grad_norm_var": 0.03634440104166667, + "learning_rate": 0.0001, + "loss": 3.0087, + "loss/crossentropy": 2.3737236499786376, + "loss/hidden": 2.6671875, + "loss/incoh": 0.0, + "loss/logits": 0.2186498686671257, + "loss/reg": 0.0, + "step": 36800 + }, + { + "epoch": 0.24217105263157895, + "grad_norm": 2.09375, + "grad_norm_var": 0.049605305989583334, + "learning_rate": 0.0001, + "loss": 3.0755, + "loss/crossentropy": 2.353866970539093, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.22997004687786102, + "loss/reg": 0.0, + "step": 36810 + }, + { + "epoch": 0.24223684210526317, + "grad_norm": 2.234375, + "grad_norm_var": 0.11220296223958333, + "learning_rate": 0.0001, + "loss": 3.0278, + "loss/crossentropy": 2.421842908859253, + "loss/hidden": 2.790625, + "loss/incoh": 0.0, + "loss/logits": 0.21242005676031112, + "loss/reg": 0.0, + "step": 36820 + }, + { + "epoch": 0.24230263157894738, + "grad_norm": 2.203125, + "grad_norm_var": 0.16858622233072917, + "learning_rate": 0.0001, + "loss": 3.1032, + "loss/crossentropy": 2.3163668155670165, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.24295762777328492, + "loss/reg": 0.0, + "step": 36830 + }, + { + "epoch": 0.24236842105263157, + "grad_norm": 2.984375, + "grad_norm_var": 0.0773150126139323, + "learning_rate": 0.0001, + "loss": 2.9972, + "loss/crossentropy": 2.162323606014252, + "loss/hidden": 2.9359375, + "loss/incoh": 0.0, + "loss/logits": 0.2663910940289497, + "loss/reg": 0.0, + "step": 36840 + }, + { + "epoch": 0.24243421052631578, + "grad_norm": 2.171875, + "grad_norm_var": 0.21162923177083334, + "learning_rate": 0.0001, + "loss": 2.962, + "loss/crossentropy": 2.159565594792366, + "loss/hidden": 2.6578125, + "loss/incoh": 0.0, + "loss/logits": 0.19483270347118378, + "loss/reg": 0.0, + "step": 36850 + }, + { + "epoch": 0.2425, + "grad_norm": 2.28125, + "grad_norm_var": 0.3277259826660156, + "learning_rate": 0.0001, + "loss": 3.0165, + "loss/crossentropy": 2.5494640946388243, + "loss/hidden": 2.709375, + "loss/incoh": 0.0, + "loss/logits": 0.21899326890707016, + "loss/reg": 0.0, + "step": 36860 + }, + { + "epoch": 0.2425657894736842, + "grad_norm": 2.296875, + "grad_norm_var": 0.02713190714518229, + "learning_rate": 0.0001, + "loss": 2.9519, + "loss/crossentropy": 2.3883594393730165, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.21375252604484557, + "loss/reg": 0.0, + "step": 36870 + }, + { + "epoch": 0.24263157894736842, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03628107706705729, + "learning_rate": 0.0001, + "loss": 2.9473, + "loss/crossentropy": 2.1211004137992857, + "loss/hidden": 2.8984375, + "loss/incoh": 0.0, + "loss/logits": 0.2303142726421356, + "loss/reg": 0.0, + "step": 36880 + }, + { + "epoch": 0.24269736842105263, + "grad_norm": 2.6875, + "grad_norm_var": 0.053929646809895836, + "learning_rate": 0.0001, + "loss": 2.9946, + "loss/crossentropy": 2.2483023762702943, + "loss/hidden": 2.965625, + "loss/incoh": 0.0, + "loss/logits": 0.2498287908732891, + "loss/reg": 0.0, + "step": 36890 + }, + { + "epoch": 0.24276315789473685, + "grad_norm": 2.390625, + "grad_norm_var": 0.026569620768229166, + "learning_rate": 0.0001, + "loss": 3.0207, + "loss/crossentropy": 2.4569414138793944, + "loss/hidden": 2.703125, + "loss/incoh": 0.0, + "loss/logits": 0.22985844165086747, + "loss/reg": 0.0, + "step": 36900 + }, + { + "epoch": 0.24282894736842106, + "grad_norm": 2.953125, + "grad_norm_var": 0.06067708333333333, + "learning_rate": 0.0001, + "loss": 3.0104, + "loss/crossentropy": 2.438031816482544, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.207527032494545, + "loss/reg": 0.0, + "step": 36910 + }, + { + "epoch": 0.24289473684210527, + "grad_norm": 2.28125, + "grad_norm_var": 0.08677469889322917, + "learning_rate": 0.0001, + "loss": 2.9992, + "loss/crossentropy": 2.4970327615737915, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.24948519468307495, + "loss/reg": 0.0, + "step": 36920 + }, + { + "epoch": 0.24296052631578946, + "grad_norm": 2.5, + "grad_norm_var": 0.07526041666666666, + "learning_rate": 0.0001, + "loss": 3.0188, + "loss/crossentropy": 2.4824806571006777, + "loss/hidden": 2.9140625, + "loss/incoh": 0.0, + "loss/logits": 0.26679628640413283, + "loss/reg": 0.0, + "step": 36930 + }, + { + "epoch": 0.24302631578947367, + "grad_norm": 3.125, + "grad_norm_var": 0.15319595336914063, + "learning_rate": 0.0001, + "loss": 3.0338, + "loss/crossentropy": 2.588907778263092, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.2500623852014542, + "loss/reg": 0.0, + "step": 36940 + }, + { + "epoch": 0.2430921052631579, + "grad_norm": 2.5, + "grad_norm_var": 0.105810546875, + "learning_rate": 0.0001, + "loss": 3.0026, + "loss/crossentropy": 2.233619010448456, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.23702021837234497, + "loss/reg": 0.0, + "step": 36950 + }, + { + "epoch": 0.2431578947368421, + "grad_norm": 2.046875, + "grad_norm_var": 0.032210286458333334, + "learning_rate": 0.0001, + "loss": 2.9011, + "loss/crossentropy": 2.494797945022583, + "loss/hidden": 2.65, + "loss/incoh": 0.0, + "loss/logits": 0.2184271454811096, + "loss/reg": 0.0, + "step": 36960 + }, + { + "epoch": 0.24322368421052631, + "grad_norm": 2.3125, + "grad_norm_var": 0.026590983072916668, + "learning_rate": 0.0001, + "loss": 3.048, + "loss/crossentropy": 2.323407733440399, + "loss/hidden": 2.8640625, + "loss/incoh": 0.0, + "loss/logits": 0.2522672712802887, + "loss/reg": 0.0, + "step": 36970 + }, + { + "epoch": 0.24328947368421053, + "grad_norm": 2.84375, + "grad_norm_var": 0.04624735514322917, + "learning_rate": 0.0001, + "loss": 3.0306, + "loss/crossentropy": 2.248076152801514, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.22199858874082565, + "loss/reg": 0.0, + "step": 36980 + }, + { + "epoch": 0.24335526315789474, + "grad_norm": 2.21875, + "grad_norm_var": 0.38259989420572915, + "learning_rate": 0.0001, + "loss": 2.9916, + "loss/crossentropy": 2.251713329553604, + "loss/hidden": 2.6640625, + "loss/incoh": 0.0, + "loss/logits": 0.21513912975788116, + "loss/reg": 0.0, + "step": 36990 + }, + { + "epoch": 0.24342105263157895, + "grad_norm": 2.390625, + "grad_norm_var": 0.384716796875, + "learning_rate": 0.0001, + "loss": 3.0287, + "loss/crossentropy": 2.0817331850528715, + "loss/hidden": 2.8484375, + "loss/incoh": 0.0, + "loss/logits": 0.21513576656579972, + "loss/reg": 0.0, + "step": 37000 + }, + { + "epoch": 0.24348684210526317, + "grad_norm": 2.546875, + "grad_norm_var": 0.07632420857747396, + "learning_rate": 0.0001, + "loss": 3.0023, + "loss/crossentropy": 2.351958858966827, + "loss/hidden": 3.0078125, + "loss/incoh": 0.0, + "loss/logits": 0.27439084053039553, + "loss/reg": 0.0, + "step": 37010 + }, + { + "epoch": 0.24355263157894738, + "grad_norm": 2.0625, + "grad_norm_var": 0.12300186157226563, + "learning_rate": 0.0001, + "loss": 3.0848, + "loss/crossentropy": 2.2964416265487673, + "loss/hidden": 2.76875, + "loss/incoh": 0.0, + "loss/logits": 0.28409299105405805, + "loss/reg": 0.0, + "step": 37020 + }, + { + "epoch": 0.24361842105263157, + "grad_norm": 2.6875, + "grad_norm_var": 0.18337173461914064, + "learning_rate": 0.0001, + "loss": 2.9231, + "loss/crossentropy": 2.2850769579410555, + "loss/hidden": 2.6234375, + "loss/incoh": 0.0, + "loss/logits": 0.20450889468193054, + "loss/reg": 0.0, + "step": 37030 + }, + { + "epoch": 0.24368421052631578, + "grad_norm": 2.3125, + "grad_norm_var": 0.18819071451822916, + "learning_rate": 0.0001, + "loss": 3.0757, + "loss/crossentropy": 2.157426190376282, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.23339182287454605, + "loss/reg": 0.0, + "step": 37040 + }, + { + "epoch": 0.24375, + "grad_norm": 2.796875, + "grad_norm_var": 0.0508209228515625, + "learning_rate": 0.0001, + "loss": 3.0667, + "loss/crossentropy": 2.3142905950546266, + "loss/hidden": 2.91875, + "loss/incoh": 0.0, + "loss/logits": 0.27201273292303085, + "loss/reg": 0.0, + "step": 37050 + }, + { + "epoch": 0.2438157894736842, + "grad_norm": 2.078125, + "grad_norm_var": 0.054230753580729166, + "learning_rate": 0.0001, + "loss": 2.9379, + "loss/crossentropy": 2.182029736042023, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.2124377816915512, + "loss/reg": 0.0, + "step": 37060 + }, + { + "epoch": 0.24388157894736842, + "grad_norm": 1.96875, + "grad_norm_var": 0.05328776041666667, + "learning_rate": 0.0001, + "loss": 2.9728, + "loss/crossentropy": 2.47544287443161, + "loss/hidden": 2.734375, + "loss/incoh": 0.0, + "loss/logits": 0.20328531116247178, + "loss/reg": 0.0, + "step": 37070 + }, + { + "epoch": 0.24394736842105263, + "grad_norm": 2.28125, + "grad_norm_var": 0.11983617146809895, + "learning_rate": 0.0001, + "loss": 3.038, + "loss/crossentropy": 2.315932643413544, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.2507309168577194, + "loss/reg": 0.0, + "step": 37080 + }, + { + "epoch": 0.24401315789473685, + "grad_norm": 3.125, + "grad_norm_var": 0.5823707580566406, + "learning_rate": 0.0001, + "loss": 3.0428, + "loss/crossentropy": 2.0966883838176726, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.24450836032629014, + "loss/reg": 0.0, + "step": 37090 + }, + { + "epoch": 0.24407894736842106, + "grad_norm": 2.3125, + "grad_norm_var": 0.562109375, + "learning_rate": 0.0001, + "loss": 3.0281, + "loss/crossentropy": 2.111643207073212, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.22489524409174919, + "loss/reg": 0.0, + "step": 37100 + }, + { + "epoch": 0.24414473684210528, + "grad_norm": 2.65625, + "grad_norm_var": 0.048238118489583336, + "learning_rate": 0.0001, + "loss": 2.9667, + "loss/crossentropy": 2.2703915894031526, + "loss/hidden": 2.7421875, + "loss/incoh": 0.0, + "loss/logits": 0.20935535281896592, + "loss/reg": 0.0, + "step": 37110 + }, + { + "epoch": 0.24421052631578946, + "grad_norm": 2.234375, + "grad_norm_var": 0.12493082682291666, + "learning_rate": 0.0001, + "loss": 3.0306, + "loss/crossentropy": 2.4712467312812807, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.21050616949796677, + "loss/reg": 0.0, + "step": 37120 + }, + { + "epoch": 0.24427631578947367, + "grad_norm": 2.078125, + "grad_norm_var": 0.22398173014322917, + "learning_rate": 0.0001, + "loss": 2.9499, + "loss/crossentropy": 2.1810320615768433, + "loss/hidden": 2.6078125, + "loss/incoh": 0.0, + "loss/logits": 0.1909588247537613, + "loss/reg": 0.0, + "step": 37130 + }, + { + "epoch": 0.2443421052631579, + "grad_norm": 2.1875, + "grad_norm_var": 0.18337300618489583, + "learning_rate": 0.0001, + "loss": 3.0502, + "loss/crossentropy": 2.259117543697357, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.2917395427823067, + "loss/reg": 0.0, + "step": 37140 + }, + { + "epoch": 0.2444078947368421, + "grad_norm": 2.328125, + "grad_norm_var": 0.04478251139322917, + "learning_rate": 0.0001, + "loss": 3.003, + "loss/crossentropy": 2.3330495953559875, + "loss/hidden": 2.825, + "loss/incoh": 0.0, + "loss/logits": 0.25376534312963483, + "loss/reg": 0.0, + "step": 37150 + }, + { + "epoch": 0.24447368421052632, + "grad_norm": 2.125, + "grad_norm_var": 0.025809733072916667, + "learning_rate": 0.0001, + "loss": 2.9673, + "loss/crossentropy": 2.2595450043678285, + "loss/hidden": 2.815625, + "loss/incoh": 0.0, + "loss/logits": 0.24186031073331832, + "loss/reg": 0.0, + "step": 37160 + }, + { + "epoch": 0.24453947368421053, + "grad_norm": 2.3125, + "grad_norm_var": 0.040257771809895836, + "learning_rate": 0.0001, + "loss": 3.0137, + "loss/crossentropy": 2.3244964241981507, + "loss/hidden": 2.6875, + "loss/incoh": 0.0, + "loss/logits": 0.22835270911455155, + "loss/reg": 0.0, + "step": 37170 + }, + { + "epoch": 0.24460526315789474, + "grad_norm": 2.328125, + "grad_norm_var": 0.18911844889322918, + "learning_rate": 0.0001, + "loss": 3.095, + "loss/crossentropy": 2.1679759979248048, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.22593683898448944, + "loss/reg": 0.0, + "step": 37180 + }, + { + "epoch": 0.24467105263157896, + "grad_norm": 2.390625, + "grad_norm_var": 0.0819488525390625, + "learning_rate": 0.0001, + "loss": 3.0209, + "loss/crossentropy": 2.3522397756576536, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.22514844089746475, + "loss/reg": 0.0, + "step": 37190 + }, + { + "epoch": 0.24473684210526317, + "grad_norm": 3.171875, + "grad_norm_var": 0.14850972493489584, + "learning_rate": 0.0001, + "loss": 3.0028, + "loss/crossentropy": 2.1798877120018005, + "loss/hidden": 2.84375, + "loss/incoh": 0.0, + "loss/logits": 0.26098706275224687, + "loss/reg": 0.0, + "step": 37200 + }, + { + "epoch": 0.24480263157894736, + "grad_norm": 3.4375, + "grad_norm_var": 0.16476949055989584, + "learning_rate": 0.0001, + "loss": 3.0878, + "loss/crossentropy": 2.3389397978782656, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.23207588642835617, + "loss/reg": 0.0, + "step": 37210 + }, + { + "epoch": 0.24486842105263157, + "grad_norm": 1.96875, + "grad_norm_var": 0.13612874348958334, + "learning_rate": 0.0001, + "loss": 3.0629, + "loss/crossentropy": 2.29145188331604, + "loss/hidden": 2.9859375, + "loss/incoh": 0.0, + "loss/logits": 0.23295368999242783, + "loss/reg": 0.0, + "step": 37220 + }, + { + "epoch": 0.24493421052631578, + "grad_norm": 2.578125, + "grad_norm_var": 0.0704742431640625, + "learning_rate": 0.0001, + "loss": 3.0207, + "loss/crossentropy": 2.263294315338135, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.21528264433145522, + "loss/reg": 0.0, + "step": 37230 + }, + { + "epoch": 0.245, + "grad_norm": 2.765625, + "grad_norm_var": 0.04729410807291667, + "learning_rate": 0.0001, + "loss": 2.9616, + "loss/crossentropy": 2.4668697357177733, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.2833425417542458, + "loss/reg": 0.0, + "step": 37240 + }, + { + "epoch": 0.2450657894736842, + "grad_norm": 2.390625, + "grad_norm_var": 0.1173736572265625, + "learning_rate": 0.0001, + "loss": 3.0141, + "loss/crossentropy": 2.310600745677948, + "loss/hidden": 2.6984375, + "loss/incoh": 0.0, + "loss/logits": 0.2221137210726738, + "loss/reg": 0.0, + "step": 37250 + }, + { + "epoch": 0.24513157894736842, + "grad_norm": 3.359375, + "grad_norm_var": 0.15969136555989583, + "learning_rate": 0.0001, + "loss": 3.0175, + "loss/crossentropy": 2.113830578327179, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.2413550451397896, + "loss/reg": 0.0, + "step": 37260 + }, + { + "epoch": 0.24519736842105264, + "grad_norm": 2.625, + "grad_norm_var": 0.12274983723958334, + "learning_rate": 0.0001, + "loss": 3.0058, + "loss/crossentropy": 1.9976770758628846, + "loss/hidden": 2.7171875, + "loss/incoh": 0.0, + "loss/logits": 0.19502876847982406, + "loss/reg": 0.0, + "step": 37270 + }, + { + "epoch": 0.24526315789473685, + "grad_norm": 2.40625, + "grad_norm_var": 0.11809488932291666, + "learning_rate": 0.0001, + "loss": 2.9752, + "loss/crossentropy": 2.1309264838695525, + "loss/hidden": 2.65, + "loss/incoh": 0.0, + "loss/logits": 0.20758389085531234, + "loss/reg": 0.0, + "step": 37280 + }, + { + "epoch": 0.24532894736842106, + "grad_norm": 2.21875, + "grad_norm_var": 0.12764383951822916, + "learning_rate": 0.0001, + "loss": 3.044, + "loss/crossentropy": 2.142688637971878, + "loss/hidden": 2.75625, + "loss/incoh": 0.0, + "loss/logits": 0.2955352425575256, + "loss/reg": 0.0, + "step": 37290 + }, + { + "epoch": 0.24539473684210528, + "grad_norm": 2.640625, + "grad_norm_var": 0.0466949462890625, + "learning_rate": 0.0001, + "loss": 2.9294, + "loss/crossentropy": 2.255223333835602, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.22907140627503395, + "loss/reg": 0.0, + "step": 37300 + }, + { + "epoch": 0.24546052631578946, + "grad_norm": 2.125, + "grad_norm_var": 0.085693359375, + "learning_rate": 0.0001, + "loss": 2.9874, + "loss/crossentropy": 2.121692883968353, + "loss/hidden": 2.7140625, + "loss/incoh": 0.0, + "loss/logits": 0.195634426176548, + "loss/reg": 0.0, + "step": 37310 + }, + { + "epoch": 0.24552631578947368, + "grad_norm": 2.703125, + "grad_norm_var": 0.12754694620768228, + "learning_rate": 0.0001, + "loss": 3.0103, + "loss/crossentropy": 2.2369035363197325, + "loss/hidden": 2.8828125, + "loss/incoh": 0.0, + "loss/logits": 0.2212720662355423, + "loss/reg": 0.0, + "step": 37320 + }, + { + "epoch": 0.2455921052631579, + "grad_norm": 2.953125, + "grad_norm_var": 0.14957046508789062, + "learning_rate": 0.0001, + "loss": 2.9211, + "loss/crossentropy": 1.6413456916809082, + "loss/hidden": 2.603125, + "loss/incoh": 0.0, + "loss/logits": 0.18164349719882011, + "loss/reg": 0.0, + "step": 37330 + }, + { + "epoch": 0.2456578947368421, + "grad_norm": 2.484375, + "grad_norm_var": 0.17405497233072917, + "learning_rate": 0.0001, + "loss": 3.0623, + "loss/crossentropy": 2.128761112689972, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.20798836946487426, + "loss/reg": 0.0, + "step": 37340 + }, + { + "epoch": 0.24572368421052632, + "grad_norm": 2.46875, + "grad_norm_var": 0.05827534993489583, + "learning_rate": 0.0001, + "loss": 2.9564, + "loss/crossentropy": 2.271031451225281, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.21863894239068032, + "loss/reg": 0.0, + "step": 37350 + }, + { + "epoch": 0.24578947368421053, + "grad_norm": 2.265625, + "grad_norm_var": 0.10803629557291666, + "learning_rate": 0.0001, + "loss": 3.0487, + "loss/crossentropy": 2.5324214935302733, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.2654300585389137, + "loss/reg": 0.0, + "step": 37360 + }, + { + "epoch": 0.24585526315789474, + "grad_norm": 2.046875, + "grad_norm_var": 0.0805816650390625, + "learning_rate": 0.0001, + "loss": 2.9872, + "loss/crossentropy": 2.476090502738953, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.22690275460481643, + "loss/reg": 0.0, + "step": 37370 + }, + { + "epoch": 0.24592105263157896, + "grad_norm": 2.109375, + "grad_norm_var": 0.0459136962890625, + "learning_rate": 0.0001, + "loss": 2.9355, + "loss/crossentropy": 2.158538544178009, + "loss/hidden": 3.05625, + "loss/incoh": 0.0, + "loss/logits": 0.2366640105843544, + "loss/reg": 0.0, + "step": 37380 + }, + { + "epoch": 0.24598684210526317, + "grad_norm": 1.8828125, + "grad_norm_var": 0.048860422770182294, + "learning_rate": 0.0001, + "loss": 3.0021, + "loss/crossentropy": 2.2990810751914976, + "loss/hidden": 2.690625, + "loss/incoh": 0.0, + "loss/logits": 0.20744879692792892, + "loss/reg": 0.0, + "step": 37390 + }, + { + "epoch": 0.24605263157894736, + "grad_norm": 2.40625, + "grad_norm_var": 0.052247873942057294, + "learning_rate": 0.0001, + "loss": 2.9696, + "loss/crossentropy": 2.2528524160385133, + "loss/hidden": 2.7546875, + "loss/incoh": 0.0, + "loss/logits": 0.22408121079206467, + "loss/reg": 0.0, + "step": 37400 + }, + { + "epoch": 0.24611842105263157, + "grad_norm": 2.140625, + "grad_norm_var": 0.046122233072916664, + "learning_rate": 0.0001, + "loss": 2.991, + "loss/crossentropy": 2.4535847425460817, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.27996278256177903, + "loss/reg": 0.0, + "step": 37410 + }, + { + "epoch": 0.24618421052631578, + "grad_norm": 2.03125, + "grad_norm_var": 0.50777587890625, + "learning_rate": 0.0001, + "loss": 2.9405, + "loss/crossentropy": 2.2263678312301636, + "loss/hidden": 2.6703125, + "loss/incoh": 0.0, + "loss/logits": 0.21364209055900574, + "loss/reg": 0.0, + "step": 37420 + }, + { + "epoch": 0.24625, + "grad_norm": 2.03125, + "grad_norm_var": 0.0452301025390625, + "learning_rate": 0.0001, + "loss": 2.966, + "loss/crossentropy": 2.3186843156814576, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.251382140815258, + "loss/reg": 0.0, + "step": 37430 + }, + { + "epoch": 0.2463157894736842, + "grad_norm": 2.203125, + "grad_norm_var": 0.035674794514973955, + "learning_rate": 0.0001, + "loss": 2.9334, + "loss/crossentropy": 2.3233260989189146, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.26353937536478045, + "loss/reg": 0.0, + "step": 37440 + }, + { + "epoch": 0.24638157894736842, + "grad_norm": 2.03125, + "grad_norm_var": 0.09699071248372396, + "learning_rate": 0.0001, + "loss": 3.0099, + "loss/crossentropy": 2.1750458002090456, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.22462447360157967, + "loss/reg": 0.0, + "step": 37450 + }, + { + "epoch": 0.24644736842105264, + "grad_norm": 2.390625, + "grad_norm_var": 0.09807942708333334, + "learning_rate": 0.0001, + "loss": 2.9668, + "loss/crossentropy": 2.59984233379364, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.25129624158144, + "loss/reg": 0.0, + "step": 37460 + }, + { + "epoch": 0.24651315789473685, + "grad_norm": 2.109375, + "grad_norm_var": 0.04807840983072917, + "learning_rate": 0.0001, + "loss": 2.9464, + "loss/crossentropy": 2.4232494115829466, + "loss/hidden": 2.6578125, + "loss/incoh": 0.0, + "loss/logits": 0.22763155549764633, + "loss/reg": 0.0, + "step": 37470 + }, + { + "epoch": 0.24657894736842106, + "grad_norm": 2.421875, + "grad_norm_var": 0.2599283854166667, + "learning_rate": 0.0001, + "loss": 3.0313, + "loss/crossentropy": 2.314221677184105, + "loss/hidden": 2.775, + "loss/incoh": 0.0, + "loss/logits": 0.19144997373223305, + "loss/reg": 0.0, + "step": 37480 + }, + { + "epoch": 0.24664473684210525, + "grad_norm": 2.484375, + "grad_norm_var": 0.3412068684895833, + "learning_rate": 0.0001, + "loss": 2.98, + "loss/crossentropy": 2.4459351778030394, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.24818085730075837, + "loss/reg": 0.0, + "step": 37490 + }, + { + "epoch": 0.24671052631578946, + "grad_norm": 2.234375, + "grad_norm_var": 0.31691080729166665, + "learning_rate": 0.0001, + "loss": 3.0414, + "loss/crossentropy": 2.1927252769470216, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.2274562269449234, + "loss/reg": 0.0, + "step": 37500 + }, + { + "epoch": 0.24677631578947368, + "grad_norm": 2.3125, + "grad_norm_var": 0.2977935791015625, + "learning_rate": 0.0001, + "loss": 3.0205, + "loss/crossentropy": 2.0236656427383424, + "loss/hidden": 2.8734375, + "loss/incoh": 0.0, + "loss/logits": 0.23172616437077523, + "loss/reg": 0.0, + "step": 37510 + }, + { + "epoch": 0.2468421052631579, + "grad_norm": 2.171875, + "grad_norm_var": 0.050593058268229164, + "learning_rate": 0.0001, + "loss": 2.9751, + "loss/crossentropy": 2.0957834839820864, + "loss/hidden": 2.8515625, + "loss/incoh": 0.0, + "loss/logits": 0.23045892268419266, + "loss/reg": 0.0, + "step": 37520 + }, + { + "epoch": 0.2469078947368421, + "grad_norm": 2.140625, + "grad_norm_var": 0.09323628743489583, + "learning_rate": 0.0001, + "loss": 2.9756, + "loss/crossentropy": 2.189413595199585, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.22707376182079314, + "loss/reg": 0.0, + "step": 37530 + }, + { + "epoch": 0.24697368421052632, + "grad_norm": 2.0, + "grad_norm_var": 0.056183878580729166, + "learning_rate": 0.0001, + "loss": 2.9589, + "loss/crossentropy": 2.733688974380493, + "loss/hidden": 2.709375, + "loss/incoh": 0.0, + "loss/logits": 0.2146020546555519, + "loss/reg": 0.0, + "step": 37540 + }, + { + "epoch": 0.24703947368421053, + "grad_norm": 2.390625, + "grad_norm_var": 0.04794921875, + "learning_rate": 0.0001, + "loss": 3.0378, + "loss/crossentropy": 2.206034767627716, + "loss/hidden": 2.803125, + "loss/incoh": 0.0, + "loss/logits": 0.24584971666336058, + "loss/reg": 0.0, + "step": 37550 + }, + { + "epoch": 0.24710526315789474, + "grad_norm": 2.25, + "grad_norm_var": 0.058288319905598955, + "learning_rate": 0.0001, + "loss": 2.9179, + "loss/crossentropy": 1.987370991706848, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.194708900898695, + "loss/reg": 0.0, + "step": 37560 + }, + { + "epoch": 0.24717105263157896, + "grad_norm": 2.21875, + "grad_norm_var": 0.03406956990559896, + "learning_rate": 0.0001, + "loss": 2.9143, + "loss/crossentropy": 2.281442165374756, + "loss/hidden": 2.69375, + "loss/incoh": 0.0, + "loss/logits": 0.19732007533311843, + "loss/reg": 0.0, + "step": 37570 + }, + { + "epoch": 0.24723684210526317, + "grad_norm": 2.0625, + "grad_norm_var": 0.0267730712890625, + "learning_rate": 0.0001, + "loss": 2.9615, + "loss/crossentropy": 2.3091693341732027, + "loss/hidden": 2.6921875, + "loss/incoh": 0.0, + "loss/logits": 0.22810056209564208, + "loss/reg": 0.0, + "step": 37580 + }, + { + "epoch": 0.24730263157894736, + "grad_norm": 2.578125, + "grad_norm_var": 0.0711334228515625, + "learning_rate": 0.0001, + "loss": 3.0095, + "loss/crossentropy": 2.276491713523865, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.2274973288178444, + "loss/reg": 0.0, + "step": 37590 + }, + { + "epoch": 0.24736842105263157, + "grad_norm": 2.4375, + "grad_norm_var": 0.052994791666666666, + "learning_rate": 0.0001, + "loss": 3.0237, + "loss/crossentropy": 2.096492087841034, + "loss/hidden": 2.9484375, + "loss/incoh": 0.0, + "loss/logits": 0.22436774969100953, + "loss/reg": 0.0, + "step": 37600 + }, + { + "epoch": 0.24743421052631578, + "grad_norm": 2.21875, + "grad_norm_var": 0.023664347330729165, + "learning_rate": 0.0001, + "loss": 2.9825, + "loss/crossentropy": 2.1003798633813857, + "loss/hidden": 2.6734375, + "loss/incoh": 0.0, + "loss/logits": 0.21021526902914048, + "loss/reg": 0.0, + "step": 37610 + }, + { + "epoch": 0.2475, + "grad_norm": 2.21875, + "grad_norm_var": 0.12235921223958333, + "learning_rate": 0.0001, + "loss": 2.9975, + "loss/crossentropy": 2.338260293006897, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.2373722493648529, + "loss/reg": 0.0, + "step": 37620 + }, + { + "epoch": 0.2475657894736842, + "grad_norm": 2.75, + "grad_norm_var": 0.1563738505045573, + "learning_rate": 0.0001, + "loss": 2.9835, + "loss/crossentropy": 2.4219950675964355, + "loss/hidden": 2.6015625, + "loss/incoh": 0.0, + "loss/logits": 0.210829646140337, + "loss/reg": 0.0, + "step": 37630 + }, + { + "epoch": 0.24763157894736842, + "grad_norm": 2.859375, + "grad_norm_var": 3.647915690527556e+17, + "learning_rate": 0.0001, + "loss": 3.1081, + "loss/crossentropy": 2.1713775038719176, + "loss/hidden": 2.603125, + "loss/incoh": 0.0, + "loss/logits": 0.20183515548706055, + "loss/reg": 0.0, + "step": 37640 + }, + { + "epoch": 0.24769736842105264, + "grad_norm": 2.328125, + "grad_norm_var": 3.6479156911331085e+17, + "learning_rate": 0.0001, + "loss": 2.9457, + "loss/crossentropy": 2.509985637664795, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.23514220416545867, + "loss/reg": 0.0, + "step": 37650 + }, + { + "epoch": 0.24776315789473685, + "grad_norm": 4.125, + "grad_norm_var": 0.30654703776041664, + "learning_rate": 0.0001, + "loss": 2.934, + "loss/crossentropy": 2.2876363396644592, + "loss/hidden": 2.640625, + "loss/incoh": 0.0, + "loss/logits": 0.20297650545835494, + "loss/reg": 0.0, + "step": 37660 + }, + { + "epoch": 0.24782894736842107, + "grad_norm": 2.328125, + "grad_norm_var": 0.27844416300455727, + "learning_rate": 0.0001, + "loss": 2.9907, + "loss/crossentropy": 2.4115204930305483, + "loss/hidden": 2.8875, + "loss/incoh": 0.0, + "loss/logits": 0.2483261451125145, + "loss/reg": 0.0, + "step": 37670 + }, + { + "epoch": 0.24789473684210525, + "grad_norm": 2.375, + "grad_norm_var": 0.05608317057291667, + "learning_rate": 0.0001, + "loss": 2.9908, + "loss/crossentropy": 2.3460915803909304, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.24570741802453994, + "loss/reg": 0.0, + "step": 37680 + }, + { + "epoch": 0.24796052631578946, + "grad_norm": 2.21875, + "grad_norm_var": 0.1174468994140625, + "learning_rate": 0.0001, + "loss": 2.9498, + "loss/crossentropy": 2.2238924860954286, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.21785593777894974, + "loss/reg": 0.0, + "step": 37690 + }, + { + "epoch": 0.24802631578947368, + "grad_norm": 2.265625, + "grad_norm_var": 0.18594462076822918, + "learning_rate": 0.0001, + "loss": 2.9304, + "loss/crossentropy": 2.2358608484268188, + "loss/hidden": 2.7125, + "loss/incoh": 0.0, + "loss/logits": 0.20752606838941573, + "loss/reg": 0.0, + "step": 37700 + }, + { + "epoch": 0.2480921052631579, + "grad_norm": 2.015625, + "grad_norm_var": 6.753564453125, + "learning_rate": 0.0001, + "loss": 3.002, + "loss/crossentropy": 2.481326103210449, + "loss/hidden": 2.96875, + "loss/incoh": 0.0, + "loss/logits": 0.34728084355592725, + "loss/reg": 0.0, + "step": 37710 + }, + { + "epoch": 0.2481578947368421, + "grad_norm": 2.34375, + "grad_norm_var": 6.80933837890625, + "learning_rate": 0.0001, + "loss": 2.9933, + "loss/crossentropy": 2.11043701171875, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.231504824757576, + "loss/reg": 0.0, + "step": 37720 + }, + { + "epoch": 0.24822368421052632, + "grad_norm": 2.171875, + "grad_norm_var": 0.0999176025390625, + "learning_rate": 0.0001, + "loss": 2.998, + "loss/crossentropy": 2.3754311203956604, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.2291927695274353, + "loss/reg": 0.0, + "step": 37730 + }, + { + "epoch": 0.24828947368421053, + "grad_norm": 2.390625, + "grad_norm_var": 0.044677734375, + "learning_rate": 0.0001, + "loss": 2.9851, + "loss/crossentropy": 2.2369036078453064, + "loss/hidden": 2.5625, + "loss/incoh": 0.0, + "loss/logits": 0.19738300889730453, + "loss/reg": 0.0, + "step": 37740 + }, + { + "epoch": 0.24835526315789475, + "grad_norm": 2.6875, + "grad_norm_var": 0.265234375, + "learning_rate": 0.0001, + "loss": 3.0716, + "loss/crossentropy": 2.236533558368683, + "loss/hidden": 3.1171875, + "loss/incoh": 0.0, + "loss/logits": 0.2648838981986046, + "loss/reg": 0.0, + "step": 37750 + }, + { + "epoch": 0.24842105263157896, + "grad_norm": 2.125, + "grad_norm_var": 0.28010660807291665, + "learning_rate": 0.0001, + "loss": 3.0245, + "loss/crossentropy": 2.3721311211586, + "loss/hidden": 2.790625, + "loss/incoh": 0.0, + "loss/logits": 0.27348231226205827, + "loss/reg": 0.0, + "step": 37760 + }, + { + "epoch": 0.24848684210526314, + "grad_norm": 2.265625, + "grad_norm_var": 0.13202718098958333, + "learning_rate": 0.0001, + "loss": 3.0549, + "loss/crossentropy": 2.2753346085548403, + "loss/hidden": 3.0125, + "loss/incoh": 0.0, + "loss/logits": 0.2692110911011696, + "loss/reg": 0.0, + "step": 37770 + }, + { + "epoch": 0.24855263157894736, + "grad_norm": 2.359375, + "grad_norm_var": 0.10732014973958333, + "learning_rate": 0.0001, + "loss": 3.022, + "loss/crossentropy": 2.2350252270698547, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.19948195964097976, + "loss/reg": 0.0, + "step": 37780 + }, + { + "epoch": 0.24861842105263157, + "grad_norm": 3.375, + "grad_norm_var": 0.13839518229166667, + "learning_rate": 0.0001, + "loss": 2.9917, + "loss/crossentropy": 2.175236439704895, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.21911766976118088, + "loss/reg": 0.0, + "step": 37790 + }, + { + "epoch": 0.24868421052631579, + "grad_norm": 2.046875, + "grad_norm_var": 0.10273030598958334, + "learning_rate": 0.0001, + "loss": 2.9491, + "loss/crossentropy": 2.3891021251678466, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.2441677287220955, + "loss/reg": 0.0, + "step": 37800 + }, + { + "epoch": 0.24875, + "grad_norm": 2.296875, + "grad_norm_var": 0.11543782552083333, + "learning_rate": 0.0001, + "loss": 2.9554, + "loss/crossentropy": 2.1731114864349363, + "loss/hidden": 2.8859375, + "loss/incoh": 0.0, + "loss/logits": 0.24577540904283524, + "loss/reg": 0.0, + "step": 37810 + }, + { + "epoch": 0.2488157894736842, + "grad_norm": 2.46875, + "grad_norm_var": 0.13479588826497396, + "learning_rate": 0.0001, + "loss": 3.0308, + "loss/crossentropy": 2.459308052062988, + "loss/hidden": 2.8546875, + "loss/incoh": 0.0, + "loss/logits": 0.25517907589673994, + "loss/reg": 0.0, + "step": 37820 + }, + { + "epoch": 0.24888157894736843, + "grad_norm": 2.828125, + "grad_norm_var": 0.06494140625, + "learning_rate": 0.0001, + "loss": 3.0135, + "loss/crossentropy": 2.296464502811432, + "loss/hidden": 2.8265625, + "loss/incoh": 0.0, + "loss/logits": 0.21491459012031555, + "loss/reg": 0.0, + "step": 37830 + }, + { + "epoch": 0.24894736842105264, + "grad_norm": 2.21875, + "grad_norm_var": 0.64693603515625, + "learning_rate": 0.0001, + "loss": 3.0707, + "loss/crossentropy": 2.414829707145691, + "loss/hidden": 2.65625, + "loss/incoh": 0.0, + "loss/logits": 0.2108585089445114, + "loss/reg": 0.0, + "step": 37840 + }, + { + "epoch": 0.24901315789473685, + "grad_norm": 2.25, + "grad_norm_var": 0.5141021728515625, + "learning_rate": 0.0001, + "loss": 2.9619, + "loss/crossentropy": 1.9343510389328002, + "loss/hidden": 2.775, + "loss/incoh": 0.0, + "loss/logits": 0.22914507612586021, + "loss/reg": 0.0, + "step": 37850 + }, + { + "epoch": 0.24907894736842107, + "grad_norm": 2.53125, + "grad_norm_var": 0.1226226806640625, + "learning_rate": 0.0001, + "loss": 2.967, + "loss/crossentropy": 2.5030432462692263, + "loss/hidden": 2.6140625, + "loss/incoh": 0.0, + "loss/logits": 0.23077891543507575, + "loss/reg": 0.0, + "step": 37860 + }, + { + "epoch": 0.24914473684210525, + "grad_norm": 2.3125, + "grad_norm_var": 0.07437235514322917, + "learning_rate": 0.0001, + "loss": 3.0293, + "loss/crossentropy": 2.3556608080863954, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.2477458581328392, + "loss/reg": 0.0, + "step": 37870 + }, + { + "epoch": 0.24921052631578947, + "grad_norm": 2.3125, + "grad_norm_var": 0.05681050618489583, + "learning_rate": 0.0001, + "loss": 2.9916, + "loss/crossentropy": 2.2093961358070375, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.19773360043764115, + "loss/reg": 0.0, + "step": 37880 + }, + { + "epoch": 0.24927631578947368, + "grad_norm": 2.28125, + "grad_norm_var": 0.032624308268229166, + "learning_rate": 0.0001, + "loss": 2.9833, + "loss/crossentropy": 2.070064514875412, + "loss/hidden": 2.621875, + "loss/incoh": 0.0, + "loss/logits": 0.20810991451144217, + "loss/reg": 0.0, + "step": 37890 + }, + { + "epoch": 0.2493421052631579, + "grad_norm": 2.4375, + "grad_norm_var": 0.3275227864583333, + "learning_rate": 0.0001, + "loss": 3.087, + "loss/crossentropy": 2.4348979711532595, + "loss/hidden": 2.7703125, + "loss/incoh": 0.0, + "loss/logits": 0.1990129753947258, + "loss/reg": 0.0, + "step": 37900 + }, + { + "epoch": 0.2494078947368421, + "grad_norm": 2.109375, + "grad_norm_var": 0.1884661356608073, + "learning_rate": 0.0001, + "loss": 2.9224, + "loss/crossentropy": 2.313380515575409, + "loss/hidden": 2.6796875, + "loss/incoh": 0.0, + "loss/logits": 0.20934066772460938, + "loss/reg": 0.0, + "step": 37910 + }, + { + "epoch": 0.24947368421052632, + "grad_norm": 2.109375, + "grad_norm_var": 0.2013567606608073, + "learning_rate": 0.0001, + "loss": 2.9925, + "loss/crossentropy": 2.2767374873161317, + "loss/hidden": 2.7171875, + "loss/incoh": 0.0, + "loss/logits": 0.21055591180920602, + "loss/reg": 0.0, + "step": 37920 + }, + { + "epoch": 0.24953947368421053, + "grad_norm": 2.921875, + "grad_norm_var": 0.29850972493489586, + "learning_rate": 0.0001, + "loss": 2.9993, + "loss/crossentropy": 2.361280381679535, + "loss/hidden": 2.75625, + "loss/incoh": 0.0, + "loss/logits": 0.2328987866640091, + "loss/reg": 0.0, + "step": 37930 + }, + { + "epoch": 0.24960526315789475, + "grad_norm": 2.59375, + "grad_norm_var": 0.2740234375, + "learning_rate": 0.0001, + "loss": 2.9261, + "loss/crossentropy": 2.1607108235359194, + "loss/hidden": 2.828125, + "loss/incoh": 0.0, + "loss/logits": 0.21131338626146318, + "loss/reg": 0.0, + "step": 37940 + }, + { + "epoch": 0.24967105263157896, + "grad_norm": 2.578125, + "grad_norm_var": 0.15468343098958334, + "learning_rate": 0.0001, + "loss": 2.9839, + "loss/crossentropy": 2.2607654333114624, + "loss/hidden": 2.6546875, + "loss/incoh": 0.0, + "loss/logits": 0.18967671394348146, + "loss/reg": 0.0, + "step": 37950 + }, + { + "epoch": 0.24973684210526315, + "grad_norm": 2.578125, + "grad_norm_var": 0.1874956766764323, + "learning_rate": 0.0001, + "loss": 3.0293, + "loss/crossentropy": 2.483866810798645, + "loss/hidden": 2.6421875, + "loss/incoh": 0.0, + "loss/logits": 0.22582052797079086, + "loss/reg": 0.0, + "step": 37960 + }, + { + "epoch": 0.24980263157894736, + "grad_norm": 2.40625, + "grad_norm_var": 0.21528294881184895, + "learning_rate": 0.0001, + "loss": 3.017, + "loss/crossentropy": 2.568509268760681, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.25563120394945144, + "loss/reg": 0.0, + "step": 37970 + }, + { + "epoch": 0.24986842105263157, + "grad_norm": 2.046875, + "grad_norm_var": 0.21802469889322917, + "learning_rate": 0.0001, + "loss": 3.002, + "loss/crossentropy": 2.394374597072601, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.2696389377117157, + "loss/reg": 0.0, + "step": 37980 + }, + { + "epoch": 0.2499342105263158, + "grad_norm": 2.296875, + "grad_norm_var": 0.41775716145833336, + "learning_rate": 0.0001, + "loss": 3.0951, + "loss/crossentropy": 2.279202771186829, + "loss/hidden": 2.853125, + "loss/incoh": 0.0, + "loss/logits": 0.22951147556304932, + "loss/reg": 0.0, + "step": 37990 + }, + { + "epoch": 0.25, + "grad_norm": 2.5, + "grad_norm_var": 0.25235087076822915, + "learning_rate": 0.0001, + "loss": 2.9227, + "loss/crossentropy": 2.0242787480354307, + "loss/hidden": 2.65625, + "loss/incoh": 0.0, + "loss/logits": 0.20535071119666098, + "loss/reg": 0.0, + "step": 38000 + }, + { + "epoch": 0.2500657894736842, + "grad_norm": 2.640625, + "grad_norm_var": 0.08347880045572917, + "learning_rate": 0.0001, + "loss": 2.9955, + "loss/crossentropy": 2.109722208976746, + "loss/hidden": 2.83125, + "loss/incoh": 0.0, + "loss/logits": 0.20487915128469467, + "loss/reg": 0.0, + "step": 38010 + }, + { + "epoch": 0.2501315789473684, + "grad_norm": 2.328125, + "grad_norm_var": 6.417805380649222e+17, + "learning_rate": 0.0001, + "loss": 3.1302, + "loss/crossentropy": 2.31266930103302, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.22627270370721816, + "loss/reg": 0.0, + "step": 38020 + }, + { + "epoch": 0.2501973684210526, + "grad_norm": 2.84375, + "grad_norm_var": 0.0590240478515625, + "learning_rate": 0.0001, + "loss": 2.9995, + "loss/crossentropy": 2.147220182418823, + "loss/hidden": 2.6078125, + "loss/incoh": 0.0, + "loss/logits": 0.18802944347262382, + "loss/reg": 0.0, + "step": 38030 + }, + { + "epoch": 0.25026315789473685, + "grad_norm": 2.421875, + "grad_norm_var": 0.18092041015625, + "learning_rate": 0.0001, + "loss": 3.014, + "loss/crossentropy": 2.1841834664344786, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.23548600152134896, + "loss/reg": 0.0, + "step": 38040 + }, + { + "epoch": 0.25032894736842104, + "grad_norm": 2.5, + "grad_norm_var": 0.17408854166666668, + "learning_rate": 0.0001, + "loss": 3.0632, + "loss/crossentropy": 2.1282356858253477, + "loss/hidden": 2.70625, + "loss/incoh": 0.0, + "loss/logits": 0.2339039534330368, + "loss/reg": 0.0, + "step": 38050 + }, + { + "epoch": 0.2503947368421053, + "grad_norm": 2.53125, + "grad_norm_var": 0.15420633951822918, + "learning_rate": 0.0001, + "loss": 2.9998, + "loss/crossentropy": 2.2876673698425294, + "loss/hidden": 2.621875, + "loss/incoh": 0.0, + "loss/logits": 0.20793070793151855, + "loss/reg": 0.0, + "step": 38060 + }, + { + "epoch": 0.25046052631578947, + "grad_norm": 2.765625, + "grad_norm_var": 0.17851460774739583, + "learning_rate": 0.0001, + "loss": 3.0085, + "loss/crossentropy": 2.327640974521637, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.26281672716140747, + "loss/reg": 0.0, + "step": 38070 + }, + { + "epoch": 0.2505263157894737, + "grad_norm": 2.703125, + "grad_norm_var": 0.6514231363932291, + "learning_rate": 0.0001, + "loss": 3.0383, + "loss/crossentropy": 1.913532590866089, + "loss/hidden": 2.725, + "loss/incoh": 0.0, + "loss/logits": 0.20542303174734117, + "loss/reg": 0.0, + "step": 38080 + }, + { + "epoch": 0.2505921052631579, + "grad_norm": 1.984375, + "grad_norm_var": 0.7180580139160156, + "learning_rate": 0.0001, + "loss": 2.9836, + "loss/crossentropy": 2.27760968208313, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.23442062735557556, + "loss/reg": 0.0, + "step": 38090 + }, + { + "epoch": 0.2506578947368421, + "grad_norm": 4.125, + "grad_norm_var": 0.5421974182128906, + "learning_rate": 0.0001, + "loss": 2.9747, + "loss/crossentropy": 2.13877215385437, + "loss/hidden": 2.7046875, + "loss/incoh": 0.0, + "loss/logits": 0.19861070141196252, + "loss/reg": 0.0, + "step": 38100 + }, + { + "epoch": 0.2507236842105263, + "grad_norm": 3.59375, + "grad_norm_var": 0.27887369791666666, + "learning_rate": 0.0001, + "loss": 3.1014, + "loss/crossentropy": 2.2125035911798476, + "loss/hidden": 2.7234375, + "loss/incoh": 0.0, + "loss/logits": 0.20661384277045727, + "loss/reg": 0.0, + "step": 38110 + }, + { + "epoch": 0.2507894736842105, + "grad_norm": 2.25, + "grad_norm_var": 0.3249908447265625, + "learning_rate": 0.0001, + "loss": 3.0608, + "loss/crossentropy": 2.3948875069618225, + "loss/hidden": 2.671875, + "loss/incoh": 0.0, + "loss/logits": 0.21930506974458694, + "loss/reg": 0.0, + "step": 38120 + }, + { + "epoch": 0.25085526315789475, + "grad_norm": 2.53125, + "grad_norm_var": 0.28649063110351564, + "learning_rate": 0.0001, + "loss": 2.9885, + "loss/crossentropy": 2.292639744281769, + "loss/hidden": 2.7921875, + "loss/incoh": 0.0, + "loss/logits": 0.2267945870757103, + "loss/reg": 0.0, + "step": 38130 + }, + { + "epoch": 0.25092105263157893, + "grad_norm": 2.59375, + "grad_norm_var": 0.05728759765625, + "learning_rate": 0.0001, + "loss": 2.9936, + "loss/crossentropy": 2.097894775867462, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.20571077913045882, + "loss/reg": 0.0, + "step": 38140 + }, + { + "epoch": 0.2509868421052632, + "grad_norm": 3.375, + "grad_norm_var": 3.1128214518229167, + "learning_rate": 0.0001, + "loss": 2.9914, + "loss/crossentropy": 2.272007715702057, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.21056424751877784, + "loss/reg": 0.0, + "step": 38150 + }, + { + "epoch": 0.25105263157894736, + "grad_norm": 2.421875, + "grad_norm_var": 0.16922098795572918, + "learning_rate": 0.0001, + "loss": 2.9551, + "loss/crossentropy": 2.2579661190509794, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.26000856757164004, + "loss/reg": 0.0, + "step": 38160 + }, + { + "epoch": 0.2511184210526316, + "grad_norm": 2.15625, + "grad_norm_var": 0.1134674072265625, + "learning_rate": 0.0001, + "loss": 2.9975, + "loss/crossentropy": 2.0489343285560606, + "loss/hidden": 2.6265625, + "loss/incoh": 0.0, + "loss/logits": 0.17201969623565674, + "loss/reg": 0.0, + "step": 38170 + }, + { + "epoch": 0.2511842105263158, + "grad_norm": 2.28125, + "grad_norm_var": 0.030980428059895832, + "learning_rate": 0.0001, + "loss": 2.9523, + "loss/crossentropy": 2.252878558635712, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.23066311478614807, + "loss/reg": 0.0, + "step": 38180 + }, + { + "epoch": 0.25125, + "grad_norm": 2.140625, + "grad_norm_var": 0.0248931884765625, + "learning_rate": 0.0001, + "loss": 2.9488, + "loss/crossentropy": 2.3150659799575806, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.21407101899385453, + "loss/reg": 0.0, + "step": 38190 + }, + { + "epoch": 0.2513157894736842, + "grad_norm": 2.640625, + "grad_norm_var": 2.1658424377441405, + "learning_rate": 0.0001, + "loss": 2.9789, + "loss/crossentropy": 2.4279683232307434, + "loss/hidden": 2.696875, + "loss/incoh": 0.0, + "loss/logits": 0.2308952897787094, + "loss/reg": 0.0, + "step": 38200 + }, + { + "epoch": 0.2513815789473684, + "grad_norm": 3.75, + "grad_norm_var": 1.915710194905599, + "learning_rate": 0.0001, + "loss": 2.9791, + "loss/crossentropy": 2.454149055480957, + "loss/hidden": 2.778125, + "loss/incoh": 0.0, + "loss/logits": 0.27904040217399595, + "loss/reg": 0.0, + "step": 38210 + }, + { + "epoch": 0.25144736842105264, + "grad_norm": 2.28125, + "grad_norm_var": 0.16384175618489583, + "learning_rate": 0.0001, + "loss": 3.025, + "loss/crossentropy": 2.4395872354507446, + "loss/hidden": 2.7875, + "loss/incoh": 0.0, + "loss/logits": 0.24970148950815202, + "loss/reg": 0.0, + "step": 38220 + }, + { + "epoch": 0.2515131578947368, + "grad_norm": 3.1875, + "grad_norm_var": 0.13736572265625, + "learning_rate": 0.0001, + "loss": 2.9912, + "loss/crossentropy": 2.4788294553756716, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.21640673130750657, + "loss/reg": 0.0, + "step": 38230 + }, + { + "epoch": 0.25157894736842107, + "grad_norm": 2.359375, + "grad_norm_var": 0.1446929931640625, + "learning_rate": 0.0001, + "loss": 2.978, + "loss/crossentropy": 2.15856648683548, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.22779612690210344, + "loss/reg": 0.0, + "step": 38240 + }, + { + "epoch": 0.25164473684210525, + "grad_norm": 2.484375, + "grad_norm_var": 0.028629557291666666, + "learning_rate": 0.0001, + "loss": 3.0221, + "loss/crossentropy": 2.5230648517608643, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.22997782975435258, + "loss/reg": 0.0, + "step": 38250 + }, + { + "epoch": 0.2517105263157895, + "grad_norm": 2.21875, + "grad_norm_var": 0.03369852701822917, + "learning_rate": 0.0001, + "loss": 2.9928, + "loss/crossentropy": 2.3802834033966063, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.25779378563165667, + "loss/reg": 0.0, + "step": 38260 + }, + { + "epoch": 0.2517763157894737, + "grad_norm": 2.359375, + "grad_norm_var": 0.18492838541666667, + "learning_rate": 0.0001, + "loss": 2.9498, + "loss/crossentropy": 2.31593804359436, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.245054192841053, + "loss/reg": 0.0, + "step": 38270 + }, + { + "epoch": 0.25184210526315787, + "grad_norm": 3.0625, + "grad_norm_var": 0.1485015869140625, + "learning_rate": 0.0001, + "loss": 3.0313, + "loss/crossentropy": 2.241119909286499, + "loss/hidden": 2.60625, + "loss/incoh": 0.0, + "loss/logits": 0.19746284037828446, + "loss/reg": 0.0, + "step": 38280 + }, + { + "epoch": 0.2519078947368421, + "grad_norm": 2.53125, + "grad_norm_var": 0.10631510416666666, + "learning_rate": 0.0001, + "loss": 3.0635, + "loss/crossentropy": 2.0983599185943604, + "loss/hidden": 2.99375, + "loss/incoh": 0.0, + "loss/logits": 0.25142730176448824, + "loss/reg": 0.0, + "step": 38290 + }, + { + "epoch": 0.2519736842105263, + "grad_norm": 3.28125, + "grad_norm_var": 0.11451416015625, + "learning_rate": 0.0001, + "loss": 3.046, + "loss/crossentropy": 2.36417875289917, + "loss/hidden": 2.7890625, + "loss/incoh": 0.0, + "loss/logits": 0.24252529442310333, + "loss/reg": 0.0, + "step": 38300 + }, + { + "epoch": 0.25203947368421054, + "grad_norm": 3.046875, + "grad_norm_var": 0.11712239583333334, + "learning_rate": 0.0001, + "loss": 3.1178, + "loss/crossentropy": 2.3146223425865173, + "loss/hidden": 2.71875, + "loss/incoh": 0.0, + "loss/logits": 0.26736019253730775, + "loss/reg": 0.0, + "step": 38310 + }, + { + "epoch": 0.2521052631578947, + "grad_norm": 2.609375, + "grad_norm_var": 0.7847076416015625, + "learning_rate": 0.0001, + "loss": 3.0818, + "loss/crossentropy": 2.2367316365242003, + "loss/hidden": 2.5859375, + "loss/incoh": 0.0, + "loss/logits": 0.19362774714827538, + "loss/reg": 0.0, + "step": 38320 + }, + { + "epoch": 0.25217105263157896, + "grad_norm": 2.359375, + "grad_norm_var": 0.3490386962890625, + "learning_rate": 0.0001, + "loss": 2.9877, + "loss/crossentropy": 2.212986183166504, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.21934635937213898, + "loss/reg": 0.0, + "step": 38330 + }, + { + "epoch": 0.25223684210526315, + "grad_norm": 2.71875, + "grad_norm_var": 0.0618560791015625, + "learning_rate": 0.0001, + "loss": 3.0083, + "loss/crossentropy": 2.424324858188629, + "loss/hidden": 2.6875, + "loss/incoh": 0.0, + "loss/logits": 0.2144932597875595, + "loss/reg": 0.0, + "step": 38340 + }, + { + "epoch": 0.2523026315789474, + "grad_norm": 2.21875, + "grad_norm_var": 0.05483296712239583, + "learning_rate": 0.0001, + "loss": 2.9994, + "loss/crossentropy": 2.3819736361503603, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.282964862883091, + "loss/reg": 0.0, + "step": 38350 + }, + { + "epoch": 0.2523684210526316, + "grad_norm": 2.421875, + "grad_norm_var": 6.966681585372321e+17, + "learning_rate": 0.0001, + "loss": 3.0863, + "loss/crossentropy": 2.094909679889679, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.21131259053945542, + "loss/reg": 0.0, + "step": 38360 + }, + { + "epoch": 0.2524342105263158, + "grad_norm": 1.9765625, + "grad_norm_var": 0.49862848917643227, + "learning_rate": 0.0001, + "loss": 2.9827, + "loss/crossentropy": 2.215614175796509, + "loss/hidden": 2.7109375, + "loss/incoh": 0.0, + "loss/logits": 0.2258283868432045, + "loss/reg": 0.0, + "step": 38370 + }, + { + "epoch": 0.2525, + "grad_norm": 2.328125, + "grad_norm_var": 0.4853370666503906, + "learning_rate": 0.0001, + "loss": 2.9659, + "loss/crossentropy": 2.0444746017456055, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.17757057920098304, + "loss/reg": 0.0, + "step": 38380 + }, + { + "epoch": 0.2525657894736842, + "grad_norm": 2.4375, + "grad_norm_var": 0.09589436848958334, + "learning_rate": 0.0001, + "loss": 3.0212, + "loss/crossentropy": 2.3424991607666015, + "loss/hidden": 2.6484375, + "loss/incoh": 0.0, + "loss/logits": 0.20197789669036864, + "loss/reg": 0.0, + "step": 38390 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 2.34375, + "grad_norm_var": 0.10813395182291667, + "learning_rate": 0.0001, + "loss": 2.9537, + "loss/crossentropy": 2.090820240974426, + "loss/hidden": 2.6109375, + "loss/incoh": 0.0, + "loss/logits": 0.18726751990616322, + "loss/reg": 0.0, + "step": 38400 + }, + { + "epoch": 0.2526973684210526, + "grad_norm": 2.5625, + "grad_norm_var": 0.06373291015625, + "learning_rate": 0.0001, + "loss": 2.9561, + "loss/crossentropy": 1.846102112531662, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.1744068369269371, + "loss/reg": 0.0, + "step": 38410 + }, + { + "epoch": 0.25276315789473686, + "grad_norm": 2.34375, + "grad_norm_var": 0.10798924763997396, + "learning_rate": 0.0001, + "loss": 2.9147, + "loss/crossentropy": 2.336298942565918, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.2202483594417572, + "loss/reg": 0.0, + "step": 38420 + }, + { + "epoch": 0.25282894736842104, + "grad_norm": 2.671875, + "grad_norm_var": 0.21279195149739583, + "learning_rate": 0.0001, + "loss": 3.0227, + "loss/crossentropy": 2.413504159450531, + "loss/hidden": 2.8359375, + "loss/incoh": 0.0, + "loss/logits": 0.2817773073911667, + "loss/reg": 0.0, + "step": 38430 + }, + { + "epoch": 0.2528947368421053, + "grad_norm": 2.0625, + "grad_norm_var": 0.4711985270182292, + "learning_rate": 0.0001, + "loss": 2.9499, + "loss/crossentropy": 2.3754114389419554, + "loss/hidden": 2.6328125, + "loss/incoh": 0.0, + "loss/logits": 0.20899627953767777, + "loss/reg": 0.0, + "step": 38440 + }, + { + "epoch": 0.25296052631578947, + "grad_norm": 2.5625, + "grad_norm_var": 0.11868489583333333, + "learning_rate": 0.0001, + "loss": 3.0609, + "loss/crossentropy": 2.1895971417427065, + "loss/hidden": 2.9484375, + "loss/incoh": 0.0, + "loss/logits": 0.27013812959194183, + "loss/reg": 0.0, + "step": 38450 + }, + { + "epoch": 0.2530263157894737, + "grad_norm": 2.140625, + "grad_norm_var": 0.07203776041666667, + "learning_rate": 0.0001, + "loss": 2.9742, + "loss/crossentropy": 2.398327910900116, + "loss/hidden": 2.534375, + "loss/incoh": 0.0, + "loss/logits": 0.20559832453727722, + "loss/reg": 0.0, + "step": 38460 + }, + { + "epoch": 0.2530921052631579, + "grad_norm": 2.3125, + "grad_norm_var": 0.042479451497395834, + "learning_rate": 0.0001, + "loss": 3.0595, + "loss/crossentropy": 2.1632952094078064, + "loss/hidden": 3.0921875, + "loss/incoh": 0.0, + "loss/logits": 0.26632697582244874, + "loss/reg": 0.0, + "step": 38470 + }, + { + "epoch": 0.2531578947368421, + "grad_norm": 2.390625, + "grad_norm_var": 0.0286285400390625, + "learning_rate": 0.0001, + "loss": 2.948, + "loss/crossentropy": 2.3557206630706786, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.23956954926252366, + "loss/reg": 0.0, + "step": 38480 + }, + { + "epoch": 0.2532236842105263, + "grad_norm": 2.484375, + "grad_norm_var": 0.06353759765625, + "learning_rate": 0.0001, + "loss": 3.0005, + "loss/crossentropy": 2.1524956703186033, + "loss/hidden": 2.6375, + "loss/incoh": 0.0, + "loss/logits": 0.2026323951780796, + "loss/reg": 0.0, + "step": 38490 + }, + { + "epoch": 0.2532894736842105, + "grad_norm": 2.265625, + "grad_norm_var": 0.04701309204101563, + "learning_rate": 0.0001, + "loss": 2.9439, + "loss/crossentropy": 2.081376886367798, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.20958617925643921, + "loss/reg": 0.0, + "step": 38500 + }, + { + "epoch": 0.25335526315789475, + "grad_norm": 2.375, + "grad_norm_var": 0.057708485921223955, + "learning_rate": 0.0001, + "loss": 3.008, + "loss/crossentropy": 2.419314205646515, + "loss/hidden": 2.9125, + "loss/incoh": 0.0, + "loss/logits": 0.28001276552677157, + "loss/reg": 0.0, + "step": 38510 + }, + { + "epoch": 0.25342105263157894, + "grad_norm": 2.140625, + "grad_norm_var": 280.5541015625, + "learning_rate": 0.0001, + "loss": 3.124, + "loss/crossentropy": 2.209311616420746, + "loss/hidden": 3.334375, + "loss/incoh": 0.0, + "loss/logits": 0.4400923550128937, + "loss/reg": 0.0, + "step": 38520 + }, + { + "epoch": 0.2534868421052632, + "grad_norm": 2.34375, + "grad_norm_var": 280.8059855143229, + "learning_rate": 0.0001, + "loss": 2.963, + "loss/crossentropy": 2.386173665523529, + "loss/hidden": 2.6765625, + "loss/incoh": 0.0, + "loss/logits": 0.20247507095336914, + "loss/reg": 0.0, + "step": 38530 + }, + { + "epoch": 0.25355263157894736, + "grad_norm": 2.375, + "grad_norm_var": 1.2670237223307292, + "learning_rate": 0.0001, + "loss": 2.9903, + "loss/crossentropy": 2.3542658567428587, + "loss/hidden": 2.759375, + "loss/incoh": 0.0, + "loss/logits": 0.229895980656147, + "loss/reg": 0.0, + "step": 38540 + }, + { + "epoch": 0.2536184210526316, + "grad_norm": 2.328125, + "grad_norm_var": 0.12191340128580729, + "learning_rate": 0.0001, + "loss": 3.0164, + "loss/crossentropy": 2.4656338930130004, + "loss/hidden": 2.653125, + "loss/incoh": 0.0, + "loss/logits": 0.2140819951891899, + "loss/reg": 0.0, + "step": 38550 + }, + { + "epoch": 0.2536842105263158, + "grad_norm": 2.546875, + "grad_norm_var": 9.503270467122396, + "learning_rate": 0.0001, + "loss": 3.0177, + "loss/crossentropy": 2.4247732520103455, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.26161234080791473, + "loss/reg": 0.0, + "step": 38560 + }, + { + "epoch": 0.25375, + "grad_norm": 2.1875, + "grad_norm_var": 9.574365234375, + "learning_rate": 0.0001, + "loss": 3.0298, + "loss/crossentropy": 2.2777958273887635, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.3407336473464966, + "loss/reg": 0.0, + "step": 38570 + }, + { + "epoch": 0.2538157894736842, + "grad_norm": 2.53125, + "grad_norm_var": 0.3904693603515625, + "learning_rate": 0.0001, + "loss": 2.933, + "loss/crossentropy": 2.220312762260437, + "loss/hidden": 2.7015625, + "loss/incoh": 0.0, + "loss/logits": 0.2032586969435215, + "loss/reg": 0.0, + "step": 38580 + }, + { + "epoch": 0.2538815789473684, + "grad_norm": 2.78125, + "grad_norm_var": 0.08896077473958333, + "learning_rate": 0.0001, + "loss": 2.9606, + "loss/crossentropy": 2.1711514949798585, + "loss/hidden": 2.646875, + "loss/incoh": 0.0, + "loss/logits": 0.17734147608280182, + "loss/reg": 0.0, + "step": 38590 + }, + { + "epoch": 0.25394736842105264, + "grad_norm": 2.21875, + "grad_norm_var": 0.20473531087239583, + "learning_rate": 0.0001, + "loss": 2.9321, + "loss/crossentropy": 1.9470943689346314, + "loss/hidden": 2.6, + "loss/incoh": 0.0, + "loss/logits": 0.19334470629692077, + "loss/reg": 0.0, + "step": 38600 + }, + { + "epoch": 0.25401315789473683, + "grad_norm": 3.03125, + "grad_norm_var": 0.06678059895833334, + "learning_rate": 0.0001, + "loss": 2.9233, + "loss/crossentropy": 2.273166114091873, + "loss/hidden": 2.7109375, + "loss/incoh": 0.0, + "loss/logits": 0.20435749739408493, + "loss/reg": 0.0, + "step": 38610 + }, + { + "epoch": 0.25407894736842107, + "grad_norm": 2.15625, + "grad_norm_var": 0.11378580729166667, + "learning_rate": 0.0001, + "loss": 2.9767, + "loss/crossentropy": 2.571880316734314, + "loss/hidden": 2.6453125, + "loss/incoh": 0.0, + "loss/logits": 0.21721504628658295, + "loss/reg": 0.0, + "step": 38620 + }, + { + "epoch": 0.25414473684210526, + "grad_norm": 2.3125, + "grad_norm_var": 0.08742574055989584, + "learning_rate": 0.0001, + "loss": 2.9226, + "loss/crossentropy": 2.2264232873916625, + "loss/hidden": 2.890625, + "loss/incoh": 0.0, + "loss/logits": 0.21014079004526137, + "loss/reg": 0.0, + "step": 38630 + }, + { + "epoch": 0.2542105263157895, + "grad_norm": 2.578125, + "grad_norm_var": 0.05276285807291667, + "learning_rate": 0.0001, + "loss": 2.9634, + "loss/crossentropy": 2.2364309787750245, + "loss/hidden": 2.6953125, + "loss/incoh": 0.0, + "loss/logits": 0.20938622057437897, + "loss/reg": 0.0, + "step": 38640 + }, + { + "epoch": 0.2542763157894737, + "grad_norm": 2.359375, + "grad_norm_var": 0.03668619791666667, + "learning_rate": 0.0001, + "loss": 3.0195, + "loss/crossentropy": 2.135267126560211, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.2628340318799019, + "loss/reg": 0.0, + "step": 38650 + }, + { + "epoch": 0.25434210526315787, + "grad_norm": 2.3125, + "grad_norm_var": 1.3797353108723958, + "learning_rate": 0.0001, + "loss": 2.927, + "loss/crossentropy": 2.403776562213898, + "loss/hidden": 2.6421875, + "loss/incoh": 0.0, + "loss/logits": 0.2078718587756157, + "loss/reg": 0.0, + "step": 38660 + }, + { + "epoch": 0.2544078947368421, + "grad_norm": 2.265625, + "grad_norm_var": 1.3981770833333333, + "learning_rate": 0.0001, + "loss": 2.9993, + "loss/crossentropy": 2.358154129981995, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.2573642887175083, + "loss/reg": 0.0, + "step": 38670 + }, + { + "epoch": 0.2544736842105263, + "grad_norm": 2.046875, + "grad_norm_var": 0.06347249348958334, + "learning_rate": 0.0001, + "loss": 2.9781, + "loss/crossentropy": 2.3121828198432923, + "loss/hidden": 2.6828125, + "loss/incoh": 0.0, + "loss/logits": 0.2003457449376583, + "loss/reg": 0.0, + "step": 38680 + }, + { + "epoch": 0.25453947368421054, + "grad_norm": 6.4375, + "grad_norm_var": 1.1110694885253907, + "learning_rate": 0.0001, + "loss": 2.9505, + "loss/crossentropy": 2.153343600034714, + "loss/hidden": 2.9796875, + "loss/incoh": 0.0, + "loss/logits": 0.25137421637773516, + "loss/reg": 0.0, + "step": 38690 + }, + { + "epoch": 0.2546052631578947, + "grad_norm": 2.203125, + "grad_norm_var": 1.0830393473307292, + "learning_rate": 0.0001, + "loss": 2.9718, + "loss/crossentropy": 2.052643448114395, + "loss/hidden": 2.9171875, + "loss/incoh": 0.0, + "loss/logits": 0.25727019011974334, + "loss/reg": 0.0, + "step": 38700 + }, + { + "epoch": 0.25467105263157896, + "grad_norm": 2.15625, + "grad_norm_var": 0.0537506103515625, + "learning_rate": 0.0001, + "loss": 3.006, + "loss/crossentropy": 2.152147728204727, + "loss/hidden": 2.8328125, + "loss/incoh": 0.0, + "loss/logits": 0.23068247735500336, + "loss/reg": 0.0, + "step": 38710 + }, + { + "epoch": 0.25473684210526315, + "grad_norm": 2.25, + "grad_norm_var": 0.0999420166015625, + "learning_rate": 0.0001, + "loss": 2.9765, + "loss/crossentropy": 2.2647915482521057, + "loss/hidden": 2.678125, + "loss/incoh": 0.0, + "loss/logits": 0.24034264236688613, + "loss/reg": 0.0, + "step": 38720 + }, + { + "epoch": 0.2548026315789474, + "grad_norm": 2.890625, + "grad_norm_var": 0.19918212890625, + "learning_rate": 0.0001, + "loss": 2.9698, + "loss/crossentropy": 2.4958234548568727, + "loss/hidden": 2.7671875, + "loss/incoh": 0.0, + "loss/logits": 0.23010408282279968, + "loss/reg": 0.0, + "step": 38730 + }, + { + "epoch": 0.2548684210526316, + "grad_norm": 2.421875, + "grad_norm_var": 0.15207926432291666, + "learning_rate": 0.0001, + "loss": 2.9802, + "loss/crossentropy": 2.2727454662323, + "loss/hidden": 2.696875, + "loss/incoh": 0.0, + "loss/logits": 0.20178039968013764, + "loss/reg": 0.0, + "step": 38740 + }, + { + "epoch": 0.25493421052631576, + "grad_norm": 2.234375, + "grad_norm_var": 0.06728108723958333, + "learning_rate": 0.0001, + "loss": 3.036, + "loss/crossentropy": 2.2367177844047545, + "loss/hidden": 2.865625, + "loss/incoh": 0.0, + "loss/logits": 0.24128163158893584, + "loss/reg": 0.0, + "step": 38750 + }, + { + "epoch": 0.255, + "grad_norm": 2.109375, + "grad_norm_var": 0.06168212890625, + "learning_rate": 0.0001, + "loss": 2.9621, + "loss/crossentropy": 2.34770849943161, + "loss/hidden": 2.7171875, + "loss/incoh": 0.0, + "loss/logits": 0.21328310072422027, + "loss/reg": 0.0, + "step": 38760 + }, + { + "epoch": 0.2550657894736842, + "grad_norm": 2.203125, + "grad_norm_var": 0.1226470947265625, + "learning_rate": 0.0001, + "loss": 2.9942, + "loss/crossentropy": 2.306888747215271, + "loss/hidden": 2.8390625, + "loss/incoh": 0.0, + "loss/logits": 0.2507788211107254, + "loss/reg": 0.0, + "step": 38770 + }, + { + "epoch": 0.25513157894736843, + "grad_norm": 2.359375, + "grad_norm_var": 0.17222900390625, + "learning_rate": 0.0001, + "loss": 2.8878, + "loss/crossentropy": 2.0050193548202513, + "loss/hidden": 2.7515625, + "loss/incoh": 0.0, + "loss/logits": 0.18458792492747306, + "loss/reg": 0.0, + "step": 38780 + }, + { + "epoch": 0.2551973684210526, + "grad_norm": 2.125, + "grad_norm_var": 0.0305816650390625, + "learning_rate": 0.0001, + "loss": 2.9807, + "loss/crossentropy": 2.279655563831329, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.22222616970539094, + "loss/reg": 0.0, + "step": 38790 + }, + { + "epoch": 0.25526315789473686, + "grad_norm": 2.3125, + "grad_norm_var": 0.045426432291666666, + "learning_rate": 0.0001, + "loss": 2.9266, + "loss/crossentropy": 1.8874349355697633, + "loss/hidden": 2.6609375, + "loss/incoh": 0.0, + "loss/logits": 0.1566801816225052, + "loss/reg": 0.0, + "step": 38800 + }, + { + "epoch": 0.25532894736842104, + "grad_norm": 2.140625, + "grad_norm_var": 0.13137919108072918, + "learning_rate": 0.0001, + "loss": 2.9532, + "loss/crossentropy": 2.0287832260131835, + "loss/hidden": 2.8375, + "loss/incoh": 0.0, + "loss/logits": 0.19014589190483094, + "loss/reg": 0.0, + "step": 38810 + }, + { + "epoch": 0.2553947368421053, + "grad_norm": 2.328125, + "grad_norm_var": 0.16416727701822917, + "learning_rate": 0.0001, + "loss": 2.9798, + "loss/crossentropy": 2.183877873420715, + "loss/hidden": 2.690625, + "loss/incoh": 0.0, + "loss/logits": 0.21431390047073365, + "loss/reg": 0.0, + "step": 38820 + }, + { + "epoch": 0.25546052631578947, + "grad_norm": 2.453125, + "grad_norm_var": 0.24260965983072916, + "learning_rate": 0.0001, + "loss": 2.9393, + "loss/crossentropy": 2.440583086013794, + "loss/hidden": 2.6515625, + "loss/incoh": 0.0, + "loss/logits": 0.20157338082790374, + "loss/reg": 0.0, + "step": 38830 + }, + { + "epoch": 0.25552631578947366, + "grad_norm": 2.265625, + "grad_norm_var": 0.1785797119140625, + "learning_rate": 0.0001, + "loss": 2.9731, + "loss/crossentropy": 2.5752979278564454, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.24179309010505676, + "loss/reg": 0.0, + "step": 38840 + }, + { + "epoch": 0.2555921052631579, + "grad_norm": 2.640625, + "grad_norm_var": 0.1367340087890625, + "learning_rate": 0.0001, + "loss": 2.9657, + "loss/crossentropy": 2.35681791305542, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.24579865038394927, + "loss/reg": 0.0, + "step": 38850 + }, + { + "epoch": 0.2556578947368421, + "grad_norm": 2.234375, + "grad_norm_var": 0.13662007649739583, + "learning_rate": 0.0001, + "loss": 2.939, + "loss/crossentropy": 2.2475174188613893, + "loss/hidden": 2.7375, + "loss/incoh": 0.0, + "loss/logits": 0.21202882528305053, + "loss/reg": 0.0, + "step": 38860 + }, + { + "epoch": 0.2557236842105263, + "grad_norm": 2.234375, + "grad_norm_var": 0.2050933837890625, + "learning_rate": 0.0001, + "loss": 2.9837, + "loss/crossentropy": 2.3431418418884276, + "loss/hidden": 2.7859375, + "loss/incoh": 0.0, + "loss/logits": 0.23483068943023683, + "loss/reg": 0.0, + "step": 38870 + }, + { + "epoch": 0.2557894736842105, + "grad_norm": 3.03125, + "grad_norm_var": 2.675121053059896, + "learning_rate": 0.0001, + "loss": 2.9856, + "loss/crossentropy": 2.483526587486267, + "loss/hidden": 2.7828125, + "loss/incoh": 0.0, + "loss/logits": 0.2897988960146904, + "loss/reg": 0.0, + "step": 38880 + }, + { + "epoch": 0.25585526315789475, + "grad_norm": 2.21875, + "grad_norm_var": 0.0619781494140625, + "learning_rate": 0.0001, + "loss": 2.9633, + "loss/crossentropy": 2.1767768919467927, + "loss/hidden": 2.75, + "loss/incoh": 0.0, + "loss/logits": 0.234356290102005, + "loss/reg": 0.0, + "step": 38890 + }, + { + "epoch": 0.25592105263157894, + "grad_norm": 2.96875, + "grad_norm_var": 0.2576568603515625, + "learning_rate": 0.0001, + "loss": 2.9781, + "loss/crossentropy": 2.4225967705249785, + "loss/hidden": 2.5765625, + "loss/incoh": 0.0, + "loss/logits": 0.23021362945437432, + "loss/reg": 0.0, + "step": 38900 + }, + { + "epoch": 0.2559868421052632, + "grad_norm": 2.625, + "grad_norm_var": 0.21616923014322917, + "learning_rate": 0.0001, + "loss": 2.9813, + "loss/crossentropy": 2.215132641792297, + "loss/hidden": 2.74375, + "loss/incoh": 0.0, + "loss/logits": 0.23095793277025223, + "loss/reg": 0.0, + "step": 38910 + }, + { + "epoch": 0.25605263157894737, + "grad_norm": 2.015625, + "grad_norm_var": 0.08826497395833334, + "learning_rate": 0.0001, + "loss": 2.9896, + "loss/crossentropy": 2.1709822535514833, + "loss/hidden": 2.7578125, + "loss/incoh": 0.0, + "loss/logits": 0.20210467725992204, + "loss/reg": 0.0, + "step": 38920 + }, + { + "epoch": 0.2561184210526316, + "grad_norm": 2.375, + "grad_norm_var": 0.08967997233072916, + "learning_rate": 0.0001, + "loss": 2.9936, + "loss/crossentropy": 2.343753528594971, + "loss/hidden": 2.8421875, + "loss/incoh": 0.0, + "loss/logits": 0.22285044342279434, + "loss/reg": 0.0, + "step": 38930 + }, + { + "epoch": 0.2561842105263158, + "grad_norm": 2.421875, + "grad_norm_var": 0.029637654622395832, + "learning_rate": 0.0001, + "loss": 2.918, + "loss/crossentropy": 2.4347527027130127, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.2453876256942749, + "loss/reg": 0.0, + "step": 38940 + }, + { + "epoch": 0.25625, + "grad_norm": 2.265625, + "grad_norm_var": 0.058405558268229164, + "learning_rate": 0.0001, + "loss": 2.9806, + "loss/crossentropy": 2.24286550283432, + "loss/hidden": 2.878125, + "loss/incoh": 0.0, + "loss/logits": 0.24712017327547073, + "loss/reg": 0.0, + "step": 38950 + }, + { + "epoch": 0.2563157894736842, + "grad_norm": 2.390625, + "grad_norm_var": 0.051756795247395834, + "learning_rate": 0.0001, + "loss": 2.9834, + "loss/crossentropy": 2.170624256134033, + "loss/hidden": 2.7609375, + "loss/incoh": 0.0, + "loss/logits": 0.2218889057636261, + "loss/reg": 0.0, + "step": 38960 + }, + { + "epoch": 0.2563815789473684, + "grad_norm": 2.140625, + "grad_norm_var": 0.0666015625, + "learning_rate": 0.0001, + "loss": 3.0572, + "loss/crossentropy": 2.154159963130951, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.20279831290245057, + "loss/reg": 0.0, + "step": 38970 + }, + { + "epoch": 0.25644736842105265, + "grad_norm": 2.484375, + "grad_norm_var": 0.08325093587239583, + "learning_rate": 0.0001, + "loss": 2.9366, + "loss/crossentropy": 2.1254536151885985, + "loss/hidden": 2.9421875, + "loss/incoh": 0.0, + "loss/logits": 0.24286270141601562, + "loss/reg": 0.0, + "step": 38980 + }, + { + "epoch": 0.25651315789473683, + "grad_norm": 2.828125, + "grad_norm_var": 0.1099609375, + "learning_rate": 0.0001, + "loss": 2.9717, + "loss/crossentropy": 2.1491544008255006, + "loss/hidden": 2.915625, + "loss/incoh": 0.0, + "loss/logits": 0.27067394405603407, + "loss/reg": 0.0, + "step": 38990 + }, + { + "epoch": 0.2565789473684211, + "grad_norm": 2.6875, + "grad_norm_var": 0.6976226806640625, + "learning_rate": 0.0001, + "loss": 3.0136, + "loss/crossentropy": 2.2009241580963135, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.19883604645729064, + "loss/reg": 0.0, + "step": 39000 + }, + { + "epoch": 0.25664473684210526, + "grad_norm": 2.828125, + "grad_norm_var": 0.6546183268229167, + "learning_rate": 0.0001, + "loss": 2.9775, + "loss/crossentropy": 2.086053115129471, + "loss/hidden": 2.89375, + "loss/incoh": 0.0, + "loss/logits": 0.26883399933576585, + "loss/reg": 0.0, + "step": 39010 + }, + { + "epoch": 0.2567105263157895, + "grad_norm": 3.71875, + "grad_norm_var": 0.23528238932291667, + "learning_rate": 0.0001, + "loss": 2.9801, + "loss/crossentropy": 2.121158719062805, + "loss/hidden": 2.8046875, + "loss/incoh": 0.0, + "loss/logits": 0.22595785260200502, + "loss/reg": 0.0, + "step": 39020 + }, + { + "epoch": 0.2567763157894737, + "grad_norm": 2.203125, + "grad_norm_var": 0.22114156087239584, + "learning_rate": 0.0001, + "loss": 2.8994, + "loss/crossentropy": 2.2681718945503233, + "loss/hidden": 2.6765625, + "loss/incoh": 0.0, + "loss/logits": 0.25748113095760344, + "loss/reg": 0.0, + "step": 39030 + }, + { + "epoch": 0.25684210526315787, + "grad_norm": 2.28125, + "grad_norm_var": 0.08943583170572916, + "learning_rate": 0.0001, + "loss": 2.8768, + "loss/crossentropy": 2.4575144767761232, + "loss/hidden": 2.578125, + "loss/incoh": 0.0, + "loss/logits": 0.20152714401483535, + "loss/reg": 0.0, + "step": 39040 + }, + { + "epoch": 0.2569078947368421, + "grad_norm": 2.46875, + "grad_norm_var": 0.0328765869140625, + "learning_rate": 0.0001, + "loss": 2.9306, + "loss/crossentropy": 2.369240176677704, + "loss/hidden": 2.809375, + "loss/incoh": 0.0, + "loss/logits": 0.22774229645729066, + "loss/reg": 0.0, + "step": 39050 + }, + { + "epoch": 0.2569736842105263, + "grad_norm": 2.640625, + "grad_norm_var": 3.1118817818339136e+17, + "learning_rate": 0.0001, + "loss": 3.0462, + "loss/crossentropy": 2.1120986580848693, + "loss/hidden": 2.884375, + "loss/incoh": 0.0, + "loss/logits": 0.2498351290822029, + "loss/reg": 0.0, + "step": 39060 + }, + { + "epoch": 0.25703947368421054, + "grad_norm": 2.3125, + "grad_norm_var": 0.034886678059895836, + "learning_rate": 0.0001, + "loss": 2.8904, + "loss/crossentropy": 2.1073123455047607, + "loss/hidden": 2.5921875, + "loss/incoh": 0.0, + "loss/logits": 0.19584316536784172, + "loss/reg": 0.0, + "step": 39070 + }, + { + "epoch": 0.2571052631578947, + "grad_norm": 2.890625, + "grad_norm_var": 0.07847900390625, + "learning_rate": 0.0001, + "loss": 2.9755, + "loss/crossentropy": 2.3284847140312195, + "loss/hidden": 2.8578125, + "loss/incoh": 0.0, + "loss/logits": 0.2493668183684349, + "loss/reg": 0.0, + "step": 39080 + }, + { + "epoch": 0.25717105263157897, + "grad_norm": 2.0625, + "grad_norm_var": 0.12560221354166667, + "learning_rate": 0.0001, + "loss": 2.9003, + "loss/crossentropy": 2.334082317352295, + "loss/hidden": 2.546875, + "loss/incoh": 0.0, + "loss/logits": 0.1966678135097027, + "loss/reg": 0.0, + "step": 39090 + }, + { + "epoch": 0.25723684210526315, + "grad_norm": 2.453125, + "grad_norm_var": 0.08367513020833334, + "learning_rate": 0.0001, + "loss": 2.9616, + "loss/crossentropy": 2.6055662870407104, + "loss/hidden": 2.6140625, + "loss/incoh": 0.0, + "loss/logits": 0.2218620851635933, + "loss/reg": 0.0, + "step": 39100 + }, + { + "epoch": 0.2573026315789474, + "grad_norm": 2.703125, + "grad_norm_var": 0.22674051920572916, + "learning_rate": 0.0001, + "loss": 2.9551, + "loss/crossentropy": 2.1810374021530152, + "loss/hidden": 2.5875, + "loss/incoh": 0.0, + "loss/logits": 0.20281792134046556, + "loss/reg": 0.0, + "step": 39110 + }, + { + "epoch": 0.2573684210526316, + "grad_norm": 2.328125, + "grad_norm_var": 0.13672587076822917, + "learning_rate": 0.0001, + "loss": 2.9471, + "loss/crossentropy": 2.150469708442688, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.21119417250156403, + "loss/reg": 0.0, + "step": 39120 + }, + { + "epoch": 0.25743421052631577, + "grad_norm": 2.15625, + "grad_norm_var": 0.7312978108723959, + "learning_rate": 0.0001, + "loss": 2.9604, + "loss/crossentropy": 1.9471443891525269, + "loss/hidden": 2.8109375, + "loss/incoh": 0.0, + "loss/logits": 0.2092149168252945, + "loss/reg": 0.0, + "step": 39130 + }, + { + "epoch": 0.2575, + "grad_norm": 2.109375, + "grad_norm_var": 0.7445330301920573, + "learning_rate": 0.0001, + "loss": 2.9095, + "loss/crossentropy": 2.2832953572273254, + "loss/hidden": 2.64375, + "loss/incoh": 0.0, + "loss/logits": 0.21912914961576463, + "loss/reg": 0.0, + "step": 39140 + }, + { + "epoch": 0.2575657894736842, + "grad_norm": 3.15625, + "grad_norm_var": 0.14458719889322916, + "learning_rate": 0.0001, + "loss": 2.9719, + "loss/crossentropy": 2.3353841066360475, + "loss/hidden": 2.6859375, + "loss/incoh": 0.0, + "loss/logits": 0.22544662952423095, + "loss/reg": 0.0, + "step": 39150 + }, + { + "epoch": 0.25763157894736843, + "grad_norm": 2.359375, + "grad_norm_var": 0.3525136311848958, + "learning_rate": 0.0001, + "loss": 3.0065, + "loss/crossentropy": 2.3361639380455017, + "loss/hidden": 2.7296875, + "loss/incoh": 0.0, + "loss/logits": 0.2251232832670212, + "loss/reg": 0.0, + "step": 39160 + }, + { + "epoch": 0.2576973684210526, + "grad_norm": 2.359375, + "grad_norm_var": 0.295556640625, + "learning_rate": 0.0001, + "loss": 3.0384, + "loss/crossentropy": 2.324616348743439, + "loss/hidden": 2.746875, + "loss/incoh": 0.0, + "loss/logits": 0.23191166073083877, + "loss/reg": 0.0, + "step": 39170 + }, + { + "epoch": 0.25776315789473686, + "grad_norm": 2.359375, + "grad_norm_var": 0.06609700520833334, + "learning_rate": 0.0001, + "loss": 2.9448, + "loss/crossentropy": 2.2094304859638214, + "loss/hidden": 2.7359375, + "loss/incoh": 0.0, + "loss/logits": 0.24130020663142204, + "loss/reg": 0.0, + "step": 39180 + }, + { + "epoch": 0.25782894736842105, + "grad_norm": 2.03125, + "grad_norm_var": 0.09587376912434896, + "learning_rate": 0.0001, + "loss": 2.9087, + "loss/crossentropy": 2.379458463191986, + "loss/hidden": 2.70625, + "loss/incoh": 0.0, + "loss/logits": 0.2207609809935093, + "loss/reg": 0.0, + "step": 39190 + }, + { + "epoch": 0.2578947368421053, + "grad_norm": 2.8125, + "grad_norm_var": 0.10582275390625, + "learning_rate": 0.0001, + "loss": 3.0122, + "loss/crossentropy": 2.1732101857662203, + "loss/hidden": 2.6109375, + "loss/incoh": 0.0, + "loss/logits": 0.20026833638548852, + "loss/reg": 0.0, + "step": 39200 + }, + { + "epoch": 0.2579605263157895, + "grad_norm": 2.21875, + "grad_norm_var": 0.07842992146809896, + "learning_rate": 0.0001, + "loss": 2.9131, + "loss/crossentropy": 1.8863205194473267, + "loss/hidden": 2.6109375, + "loss/incoh": 0.0, + "loss/logits": 0.1805630184710026, + "loss/reg": 0.0, + "step": 39210 + }, + { + "epoch": 0.25802631578947366, + "grad_norm": 2.140625, + "grad_norm_var": 0.16445719401041667, + "learning_rate": 0.0001, + "loss": 2.9865, + "loss/crossentropy": 2.3708848357200623, + "loss/hidden": 2.7453125, + "loss/incoh": 0.0, + "loss/logits": 0.2134988009929657, + "loss/reg": 0.0, + "step": 39220 + }, + { + "epoch": 0.2580921052631579, + "grad_norm": 2.109375, + "grad_norm_var": 0.043190256754557295, + "learning_rate": 0.0001, + "loss": 2.8704, + "loss/crossentropy": 2.5470376133918764, + "loss/hidden": 2.6828125, + "loss/incoh": 0.0, + "loss/logits": 0.21798246651887893, + "loss/reg": 0.0, + "step": 39230 + }, + { + "epoch": 0.2581578947368421, + "grad_norm": 3.75, + "grad_norm_var": 0.15417378743489582, + "learning_rate": 0.0001, + "loss": 2.9229, + "loss/crossentropy": 2.3233034729957582, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.2058998629450798, + "loss/reg": 0.0, + "step": 39240 + }, + { + "epoch": 0.2582236842105263, + "grad_norm": 2.34375, + "grad_norm_var": 0.23391087849934897, + "learning_rate": 0.0001, + "loss": 2.9943, + "loss/crossentropy": 2.3962061643600463, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.2253645345568657, + "loss/reg": 0.0, + "step": 39250 + }, + { + "epoch": 0.2582894736842105, + "grad_norm": 2.3125, + "grad_norm_var": 0.0898577372233073, + "learning_rate": 0.0001, + "loss": 2.9846, + "loss/crossentropy": 2.2847093820571898, + "loss/hidden": 2.79375, + "loss/incoh": 0.0, + "loss/logits": 0.20621878653764725, + "loss/reg": 0.0, + "step": 39260 + }, + { + "epoch": 0.25835526315789475, + "grad_norm": 2.859375, + "grad_norm_var": 0.062235514322916664, + "learning_rate": 0.0001, + "loss": 3.0113, + "loss/crossentropy": 2.1093714237213135, + "loss/hidden": 2.7484375, + "loss/incoh": 0.0, + "loss/logits": 0.22439000010490417, + "loss/reg": 0.0, + "step": 39270 + }, + { + "epoch": 0.25842105263157894, + "grad_norm": 2.625, + "grad_norm_var": 0.061091105143229164, + "learning_rate": 0.0001, + "loss": 2.9406, + "loss/crossentropy": 2.3419513583183287, + "loss/hidden": 2.625, + "loss/incoh": 0.0, + "loss/logits": 0.2029005065560341, + "loss/reg": 0.0, + "step": 39280 + }, + { + "epoch": 0.2584868421052632, + "grad_norm": 1.96875, + "grad_norm_var": 0.0909332275390625, + "learning_rate": 0.0001, + "loss": 2.8985, + "loss/crossentropy": 2.230232834815979, + "loss/hidden": 2.690625, + "loss/incoh": 0.0, + "loss/logits": 0.20411441028118132, + "loss/reg": 0.0, + "step": 39290 + }, + { + "epoch": 0.25855263157894737, + "grad_norm": 3.0625, + "grad_norm_var": 0.1186676025390625, + "learning_rate": 0.0001, + "loss": 2.9583, + "loss/crossentropy": 2.2822274684906008, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.21612002551555634, + "loss/reg": 0.0, + "step": 39300 + }, + { + "epoch": 0.25861842105263155, + "grad_norm": 3.640625, + "grad_norm_var": 0.1718414306640625, + "learning_rate": 0.0001, + "loss": 3.0234, + "loss/crossentropy": 2.483612024784088, + "loss/hidden": 2.6703125, + "loss/incoh": 0.0, + "loss/logits": 0.22477086931467055, + "loss/reg": 0.0, + "step": 39310 + }, + { + "epoch": 0.2586842105263158, + "grad_norm": 2.578125, + "grad_norm_var": 0.18178609212239583, + "learning_rate": 0.0001, + "loss": 2.9349, + "loss/crossentropy": 2.3175944685935974, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.2550591230392456, + "loss/reg": 0.0, + "step": 39320 + }, + { + "epoch": 0.25875, + "grad_norm": 2.265625, + "grad_norm_var": 0.08434956868489583, + "learning_rate": 0.0001, + "loss": 2.8974, + "loss/crossentropy": 2.333976149559021, + "loss/hidden": 2.6578125, + "loss/incoh": 0.0, + "loss/logits": 0.20737813785672188, + "loss/reg": 0.0, + "step": 39330 + }, + { + "epoch": 0.2588157894736842, + "grad_norm": 2.65625, + "grad_norm_var": 0.0402984619140625, + "learning_rate": 0.0001, + "loss": 2.8992, + "loss/crossentropy": 1.9081893861293793, + "loss/hidden": 2.7, + "loss/incoh": 0.0, + "loss/logits": 0.19327801540493966, + "loss/reg": 0.0, + "step": 39340 + }, + { + "epoch": 0.2588815789473684, + "grad_norm": 2.34375, + "grad_norm_var": 0.03901341756184896, + "learning_rate": 0.0001, + "loss": 2.8945, + "loss/crossentropy": 2.112404853105545, + "loss/hidden": 2.6296875, + "loss/incoh": 0.0, + "loss/logits": 0.1989109069108963, + "loss/reg": 0.0, + "step": 39350 + }, + { + "epoch": 0.25894736842105265, + "grad_norm": 2.25, + "grad_norm_var": 0.02379124959309896, + "learning_rate": 0.0001, + "loss": 2.8712, + "loss/crossentropy": 2.0947551012039183, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.20299985110759736, + "loss/reg": 0.0, + "step": 39360 + }, + { + "epoch": 0.25901315789473683, + "grad_norm": 2.703125, + "grad_norm_var": 0.08593343098958334, + "learning_rate": 0.0001, + "loss": 2.9641, + "loss/crossentropy": 2.160096913576126, + "loss/hidden": 2.7328125, + "loss/incoh": 0.0, + "loss/logits": 0.23282611221075059, + "loss/reg": 0.0, + "step": 39370 + }, + { + "epoch": 0.2590789473684211, + "grad_norm": 2.46875, + "grad_norm_var": 0.09907124837239584, + "learning_rate": 0.0001, + "loss": 3.0059, + "loss/crossentropy": 2.1344396591186525, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.20642745792865752, + "loss/reg": 0.0, + "step": 39380 + }, + { + "epoch": 0.25914473684210526, + "grad_norm": 2.546875, + "grad_norm_var": 0.0337066650390625, + "learning_rate": 0.0001, + "loss": 2.9501, + "loss/crossentropy": 2.211651337146759, + "loss/hidden": 2.575, + "loss/incoh": 0.0, + "loss/logits": 0.20566888749599457, + "loss/reg": 0.0, + "step": 39390 + }, + { + "epoch": 0.25921052631578945, + "grad_norm": 2.828125, + "grad_norm_var": 0.3254191080729167, + "learning_rate": 0.0001, + "loss": 2.9949, + "loss/crossentropy": 2.0947441935539244, + "loss/hidden": 2.771875, + "loss/incoh": 0.0, + "loss/logits": 0.22096366733312606, + "loss/reg": 0.0, + "step": 39400 + }, + { + "epoch": 0.2592763157894737, + "grad_norm": 1.9921875, + "grad_norm_var": 0.19651667277018228, + "learning_rate": 0.0001, + "loss": 2.9392, + "loss/crossentropy": 2.3617849826812742, + "loss/hidden": 2.80625, + "loss/incoh": 0.0, + "loss/logits": 0.25957188159227373, + "loss/reg": 0.0, + "step": 39410 + }, + { + "epoch": 0.2593421052631579, + "grad_norm": 2.328125, + "grad_norm_var": 0.0928179423014323, + "learning_rate": 0.0001, + "loss": 2.9103, + "loss/crossentropy": 2.373174512386322, + "loss/hidden": 2.7265625, + "loss/incoh": 0.0, + "loss/logits": 0.2120439499616623, + "loss/reg": 0.0, + "step": 39420 + }, + { + "epoch": 0.2594078947368421, + "grad_norm": 2.484375, + "grad_norm_var": 0.053587849934895834, + "learning_rate": 0.0001, + "loss": 2.9039, + "loss/crossentropy": 2.2536337614059447, + "loss/hidden": 2.6078125, + "loss/incoh": 0.0, + "loss/logits": 0.18805657625198363, + "loss/reg": 0.0, + "step": 39430 + }, + { + "epoch": 0.2594736842105263, + "grad_norm": 2.0, + "grad_norm_var": 0.3204264322916667, + "learning_rate": 0.0001, + "loss": 3.0201, + "loss/crossentropy": 2.052464544773102, + "loss/hidden": 2.7265625, + "loss/incoh": 0.0, + "loss/logits": 0.2089213587343693, + "loss/reg": 0.0, + "step": 39440 + }, + { + "epoch": 0.25953947368421054, + "grad_norm": 3.078125, + "grad_norm_var": 5.6367123205395795e+17, + "learning_rate": 0.0001, + "loss": 3.1226, + "loss/crossentropy": 2.0904810786247254, + "loss/hidden": 2.53125, + "loss/incoh": 0.0, + "loss/logits": 0.18436047285795212, + "loss/reg": 0.0, + "step": 39450 + }, + { + "epoch": 0.25960526315789473, + "grad_norm": 2.53125, + "grad_norm_var": 5.636712321591454e+17, + "learning_rate": 0.0001, + "loss": 2.8972, + "loss/crossentropy": 2.383215081691742, + "loss/hidden": 2.7203125, + "loss/incoh": 0.0, + "loss/logits": 0.2062357634305954, + "loss/reg": 0.0, + "step": 39460 + }, + { + "epoch": 0.25967105263157897, + "grad_norm": 2.703125, + "grad_norm_var": 0.2125139872233073, + "learning_rate": 0.0001, + "loss": 3.0526, + "loss/crossentropy": 2.035197800397873, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.22759751938283443, + "loss/reg": 0.0, + "step": 39470 + }, + { + "epoch": 0.25973684210526315, + "grad_norm": 2.265625, + "grad_norm_var": 0.1047503153483073, + "learning_rate": 0.0001, + "loss": 2.8596, + "loss/crossentropy": 2.0758556723594666, + "loss/hidden": 2.5875, + "loss/incoh": 0.0, + "loss/logits": 0.18243583887815476, + "loss/reg": 0.0, + "step": 39480 + }, + { + "epoch": 0.2598026315789474, + "grad_norm": 2.046875, + "grad_norm_var": 0.6220232645670573, + "learning_rate": 0.0001, + "loss": 2.9124, + "loss/crossentropy": 2.337350380420685, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.2153509557247162, + "loss/reg": 0.0, + "step": 39490 + }, + { + "epoch": 0.2598684210526316, + "grad_norm": 2.28125, + "grad_norm_var": 0.24768651326497396, + "learning_rate": 0.0001, + "loss": 2.899, + "loss/crossentropy": 2.2104454159736635, + "loss/hidden": 2.671875, + "loss/incoh": 0.0, + "loss/logits": 0.23622069656848907, + "loss/reg": 0.0, + "step": 39500 + }, + { + "epoch": 0.25993421052631577, + "grad_norm": 2.25, + "grad_norm_var": 0.06471354166666667, + "learning_rate": 0.0001, + "loss": 2.8843, + "loss/crossentropy": 2.006416219472885, + "loss/hidden": 2.7109375, + "loss/incoh": 0.0, + "loss/logits": 0.21008393466472625, + "loss/reg": 0.0, + "step": 39510 + }, + { + "epoch": 0.26, + "grad_norm": 2.15625, + "grad_norm_var": 0.5674112955729167, + "learning_rate": 0.0001, + "loss": 3.0434, + "loss/crossentropy": 2.2104514479637145, + "loss/hidden": 2.7734375, + "loss/incoh": 0.0, + "loss/logits": 0.24286225736141204, + "loss/reg": 0.0, + "step": 39520 + }, + { + "epoch": 0.2600657894736842, + "grad_norm": 3.125, + "grad_norm_var": 2.339940388997396, + "learning_rate": 0.0001, + "loss": 2.9646, + "loss/crossentropy": 2.1471508383750915, + "loss/hidden": 2.875, + "loss/incoh": 0.0, + "loss/logits": 0.2512055471539497, + "loss/reg": 0.0, + "step": 39530 + }, + { + "epoch": 0.26013157894736844, + "grad_norm": 2.53125, + "grad_norm_var": 1.9162261962890625, + "learning_rate": 0.0001, + "loss": 3.015, + "loss/crossentropy": 2.252353233098984, + "loss/hidden": 2.821875, + "loss/incoh": 0.0, + "loss/logits": 0.22746440172195434, + "loss/reg": 0.0, + "step": 39540 + }, + { + "epoch": 0.2601973684210526, + "grad_norm": 2801795072.0, + "grad_norm_var": 4.9062847572554765e+17, + "learning_rate": 0.0001, + "loss": 3.0698, + "loss/crossentropy": 2.506903576850891, + "loss/hidden": 2.6703125, + "loss/incoh": 0.0, + "loss/logits": 0.22553565353155136, + "loss/reg": 0.0, + "step": 39550 + }, + { + "epoch": 0.26026315789473686, + "grad_norm": 2.15625, + "grad_norm_var": 4.9062847573576256e+17, + "learning_rate": 0.0001, + "loss": 2.9221, + "loss/crossentropy": 2.4941019773483277, + "loss/hidden": 2.7796875, + "loss/incoh": 0.0, + "loss/logits": 0.23029526472091674, + "loss/reg": 0.0, + "step": 39560 + }, + { + "epoch": 0.26032894736842105, + "grad_norm": 2.609375, + "grad_norm_var": 0.12497456868489583, + "learning_rate": 0.0001, + "loss": 2.8649, + "loss/crossentropy": 2.1986557483673095, + "loss/hidden": 2.6109375, + "loss/incoh": 0.0, + "loss/logits": 0.18305001109838487, + "loss/reg": 0.0, + "step": 39570 + }, + { + "epoch": 0.2603947368421053, + "grad_norm": 2.453125, + "grad_norm_var": 0.10641276041666667, + "learning_rate": 0.0001, + "loss": 2.917, + "loss/crossentropy": 2.1990869998931886, + "loss/hidden": 2.6140625, + "loss/incoh": 0.0, + "loss/logits": 0.1851473018527031, + "loss/reg": 0.0, + "step": 39580 + }, + { + "epoch": 0.2604605263157895, + "grad_norm": 2.375, + "grad_norm_var": 1.3978749593098958, + "learning_rate": 0.0001, + "loss": 3.0133, + "loss/crossentropy": 2.436345875263214, + "loss/hidden": 2.9578125, + "loss/incoh": 0.0, + "loss/logits": 0.30376828759908675, + "loss/reg": 0.0, + "step": 39590 + }, + { + "epoch": 0.26052631578947366, + "grad_norm": 2.734375, + "grad_norm_var": 0.1521148681640625, + "learning_rate": 0.0001, + "loss": 2.9345, + "loss/crossentropy": 2.342786800861359, + "loss/hidden": 2.5828125, + "loss/incoh": 0.0, + "loss/logits": 0.19189061373472213, + "loss/reg": 0.0, + "step": 39600 + }, + { + "epoch": 0.2605921052631579, + "grad_norm": 2.140625, + "grad_norm_var": 0.15281575520833332, + "learning_rate": 0.0001, + "loss": 2.9767, + "loss/crossentropy": 2.4071336150169373, + "loss/hidden": 2.721875, + "loss/incoh": 0.0, + "loss/logits": 0.22345878183841705, + "loss/reg": 0.0, + "step": 39610 + }, + { + "epoch": 0.2606578947368421, + "grad_norm": 1.9140625, + "grad_norm_var": 0.280962880452474, + "learning_rate": 0.0001, + "loss": 2.9462, + "loss/crossentropy": 2.1481564074754713, + "loss/hidden": 2.6421875, + "loss/incoh": 0.0, + "loss/logits": 0.20991018489003183, + "loss/reg": 0.0, + "step": 39620 + }, + { + "epoch": 0.26072368421052633, + "grad_norm": 3.609375, + "grad_norm_var": 0.6643674214680989, + "learning_rate": 0.0001, + "loss": 3.0336, + "loss/crossentropy": 2.0887478232383727, + "loss/hidden": 2.753125, + "loss/incoh": 0.0, + "loss/logits": 0.22009581476449966, + "loss/reg": 0.0, + "step": 39630 + }, + { + "epoch": 0.2607894736842105, + "grad_norm": 2.21875, + "grad_norm_var": 0.36109619140625, + "learning_rate": 0.0001, + "loss": 3.0258, + "loss/crossentropy": 2.138615000247955, + "loss/hidden": 2.765625, + "loss/incoh": 0.0, + "loss/logits": 0.2106044813990593, + "loss/reg": 0.0, + "step": 39640 + }, + { + "epoch": 0.26085526315789476, + "grad_norm": 2.140625, + "grad_norm_var": 0.3787750244140625, + "learning_rate": 0.0001, + "loss": 3.0633, + "loss/crossentropy": 1.9567537367343903, + "loss/hidden": 2.9484375, + "loss/incoh": 0.0, + "loss/logits": 0.2987550154328346, + "loss/reg": 0.0, + "step": 39650 + }, + { + "epoch": 0.26092105263157894, + "grad_norm": 2.1875, + "grad_norm_var": 0.15900065104166666, + "learning_rate": 0.0001, + "loss": 2.9489, + "loss/crossentropy": 2.524258053302765, + "loss/hidden": 2.7078125, + "loss/incoh": 0.0, + "loss/logits": 0.20932213813066483, + "loss/reg": 0.0, + "step": 39660 + }, + { + "epoch": 0.2609868421052632, + "grad_norm": 3.140625, + "grad_norm_var": 0.073583984375, + "learning_rate": 0.0001, + "loss": 2.9985, + "loss/crossentropy": 2.552958643436432, + "loss/hidden": 2.6203125, + "loss/incoh": 0.0, + "loss/logits": 0.21318122893571853, + "loss/reg": 0.0, + "step": 39670 + }, + { + "epoch": 0.26105263157894737, + "grad_norm": 2.359375, + "grad_norm_var": 0.0707672119140625, + "learning_rate": 0.0001, + "loss": 2.9692, + "loss/crossentropy": 2.2176095962524416, + "loss/hidden": 2.9203125, + "loss/incoh": 0.0, + "loss/logits": 0.2208583801984787, + "loss/reg": 0.0, + "step": 39680 + }, + { + "epoch": 0.26111842105263156, + "grad_norm": 1.875, + "grad_norm_var": 0.058869425455729166, + "learning_rate": 0.0001, + "loss": 2.9243, + "loss/crossentropy": 2.2860087156295776, + "loss/hidden": 2.690625, + "loss/incoh": 0.0, + "loss/logits": 0.2486639067530632, + "loss/reg": 0.0, + "step": 39690 + }, + { + "epoch": 0.2611842105263158, + "grad_norm": 2.515625, + "grad_norm_var": 0.06942952473958333, + "learning_rate": 0.0001, + "loss": 2.9554, + "loss/crossentropy": 2.3581568241119384, + "loss/hidden": 2.834375, + "loss/incoh": 0.0, + "loss/logits": 0.25539501309394835, + "loss/reg": 0.0, + "step": 39700 + }, + { + "epoch": 0.26125, + "grad_norm": 2.453125, + "grad_norm_var": 1.3456217447916667, + "learning_rate": 0.0001, + "loss": 2.9286, + "loss/crossentropy": 2.494968020915985, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.23517013043165208, + "loss/reg": 0.0, + "step": 39710 + }, + { + "epoch": 0.2613157894736842, + "grad_norm": 2.171875, + "grad_norm_var": 1.3488566080729167, + "learning_rate": 0.0001, + "loss": 3.0312, + "loss/crossentropy": 2.37692905664444, + "loss/hidden": 2.9, + "loss/incoh": 0.0, + "loss/logits": 0.29073845595121384, + "loss/reg": 0.0, + "step": 39720 + }, + { + "epoch": 0.2613815789473684, + "grad_norm": 2.5, + "grad_norm_var": 0.202294921875, + "learning_rate": 0.0001, + "loss": 3.0294, + "loss/crossentropy": 2.281411385536194, + "loss/hidden": 2.7953125, + "loss/incoh": 0.0, + "loss/logits": 0.22122962027788162, + "loss/reg": 0.0, + "step": 39730 + }, + { + "epoch": 0.26144736842105265, + "grad_norm": 2.015625, + "grad_norm_var": 0.2295318603515625, + "learning_rate": 0.0001, + "loss": 2.9489, + "loss/crossentropy": 2.2690483570098876, + "loss/hidden": 2.9078125, + "loss/incoh": 0.0, + "loss/logits": 0.2626466929912567, + "loss/reg": 0.0, + "step": 39740 + }, + { + "epoch": 0.26151315789473684, + "grad_norm": 2.515625, + "grad_norm_var": 0.16734619140625, + "learning_rate": 0.0001, + "loss": 3.0073, + "loss/crossentropy": 2.394347441196442, + "loss/hidden": 2.709375, + "loss/incoh": 0.0, + "loss/logits": 0.2102888874709606, + "loss/reg": 0.0, + "step": 39750 + }, + { + "epoch": 0.2615789473684211, + "grad_norm": 2.125, + "grad_norm_var": 0.2183990478515625, + "learning_rate": 0.0001, + "loss": 2.9586, + "loss/crossentropy": 2.1268094956874846, + "loss/hidden": 2.728125, + "loss/incoh": 0.0, + "loss/logits": 0.20743194743990898, + "loss/reg": 0.0, + "step": 39760 + }, + { + "epoch": 0.26164473684210526, + "grad_norm": 2.640625, + "grad_norm_var": 0.1597564697265625, + "learning_rate": 0.0001, + "loss": 2.9826, + "loss/crossentropy": 2.3416704416275023, + "loss/hidden": 2.6140625, + "loss/incoh": 0.0, + "loss/logits": 0.20019679591059686, + "loss/reg": 0.0, + "step": 39770 + }, + { + "epoch": 0.26171052631578945, + "grad_norm": 3.03125, + "grad_norm_var": 0.08323160807291667, + "learning_rate": 0.0001, + "loss": 2.9672, + "loss/crossentropy": 2.0245181202888487, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.21602422520518302, + "loss/reg": 0.0, + "step": 39780 + }, + { + "epoch": 0.2617763157894737, + "grad_norm": 2.515625, + "grad_norm_var": 0.12693583170572917, + "learning_rate": 0.0001, + "loss": 3.0371, + "loss/crossentropy": 2.2631885528564455, + "loss/hidden": 2.8890625, + "loss/incoh": 0.0, + "loss/logits": 0.2282739758491516, + "loss/reg": 0.0, + "step": 39790 + }, + { + "epoch": 0.2618421052631579, + "grad_norm": 2.234375, + "grad_norm_var": 0.0588287353515625, + "learning_rate": 0.0001, + "loss": 2.9485, + "loss/crossentropy": 2.1909173846244814, + "loss/hidden": 2.8171875, + "loss/incoh": 0.0, + "loss/logits": 0.22533988654613496, + "loss/reg": 0.0, + "step": 39800 + }, + { + "epoch": 0.2619078947368421, + "grad_norm": 1.9765625, + "grad_norm_var": 0.07842178344726562, + "learning_rate": 0.0001, + "loss": 2.9508, + "loss/crossentropy": 2.3490309596061705, + "loss/hidden": 2.8765625, + "loss/incoh": 0.0, + "loss/logits": 0.2864062897861004, + "loss/reg": 0.0, + "step": 39810 + }, + { + "epoch": 0.2619736842105263, + "grad_norm": 2.8125, + "grad_norm_var": 0.08014500935872396, + "learning_rate": 0.0001, + "loss": 2.9507, + "loss/crossentropy": 2.5319572567939757, + "loss/hidden": 2.6375, + "loss/incoh": 0.0, + "loss/logits": 0.2143532671034336, + "loss/reg": 0.0, + "step": 39820 + }, + { + "epoch": 0.26203947368421054, + "grad_norm": 1.96875, + "grad_norm_var": 0.0728515625, + "learning_rate": 0.0001, + "loss": 2.9447, + "loss/crossentropy": 2.350716245174408, + "loss/hidden": 2.5578125, + "loss/incoh": 0.0, + "loss/logits": 0.21392515301704407, + "loss/reg": 0.0, + "step": 39830 + }, + { + "epoch": 0.26210526315789473, + "grad_norm": 2.3125, + "grad_norm_var": 0.07763264973958334, + "learning_rate": 0.0001, + "loss": 2.9706, + "loss/crossentropy": 2.2897894740104676, + "loss/hidden": 2.73125, + "loss/incoh": 0.0, + "loss/logits": 0.242967389523983, + "loss/reg": 0.0, + "step": 39840 + }, + { + "epoch": 0.26217105263157897, + "grad_norm": 2.34375, + "grad_norm_var": 0.138525390625, + "learning_rate": 0.0001, + "loss": 3.0426, + "loss/crossentropy": 2.309167265892029, + "loss/hidden": 2.9640625, + "loss/incoh": 0.0, + "loss/logits": 0.2831707686185837, + "loss/reg": 0.0, + "step": 39850 + }, + { + "epoch": 0.26223684210526316, + "grad_norm": 2.453125, + "grad_norm_var": 0.13242085774739584, + "learning_rate": 0.0001, + "loss": 3.0234, + "loss/crossentropy": 2.3926889896392822, + "loss/hidden": 2.5875, + "loss/incoh": 0.0, + "loss/logits": 0.19550751000642777, + "loss/reg": 0.0, + "step": 39860 + }, + { + "epoch": 0.26230263157894734, + "grad_norm": 2.265625, + "grad_norm_var": 0.028873697916666666, + "learning_rate": 0.0001, + "loss": 2.8699, + "loss/crossentropy": 2.389265012741089, + "loss/hidden": 2.5765625, + "loss/incoh": 0.0, + "loss/logits": 0.20654226988554, + "loss/reg": 0.0, + "step": 39870 + }, + { + "epoch": 0.2623684210526316, + "grad_norm": 2.3125, + "grad_norm_var": 0.04534098307291667, + "learning_rate": 0.0001, + "loss": 2.9553, + "loss/crossentropy": 2.052034729719162, + "loss/hidden": 2.7765625, + "loss/incoh": 0.0, + "loss/logits": 0.20443758517503738, + "loss/reg": 0.0, + "step": 39880 + }, + { + "epoch": 0.26243421052631577, + "grad_norm": 2.25, + "grad_norm_var": 0.06367899576822916, + "learning_rate": 0.0001, + "loss": 2.8898, + "loss/crossentropy": 2.276283013820648, + "loss/hidden": 2.78125, + "loss/incoh": 0.0, + "loss/logits": 0.25378709435462954, + "loss/reg": 0.0, + "step": 39890 + }, + { + "epoch": 0.2625, + "grad_norm": 1.9609375, + "grad_norm_var": 0.06096369425455729, + "learning_rate": 0.0001, + "loss": 2.9238, + "loss/crossentropy": 2.3180172562599184, + "loss/hidden": 2.7109375, + "loss/incoh": 0.0, + "loss/logits": 0.23101864904165267, + "loss/reg": 0.0, + "step": 39900 + }, + { + "epoch": 0.2625657894736842, + "grad_norm": 2.34375, + "grad_norm_var": 0.1415911356608073, + "learning_rate": 0.0001, + "loss": 2.9487, + "loss/crossentropy": 2.6349021911621096, + "loss/hidden": 3.0421875, + "loss/incoh": 0.0, + "loss/logits": 0.27188998460769653, + "loss/reg": 0.0, + "step": 39910 + }, + { + "epoch": 0.26263157894736844, + "grad_norm": 2.3125, + "grad_norm_var": 0.11770731608072917, + "learning_rate": 0.0001, + "loss": 2.8688, + "loss/crossentropy": 2.276544678211212, + "loss/hidden": 2.7640625, + "loss/incoh": 0.0, + "loss/logits": 0.20636183619499207, + "loss/reg": 0.0, + "step": 39920 + }, + { + "epoch": 0.2626973684210526, + "grad_norm": 2.1875, + "grad_norm_var": 0.1878814697265625, + "learning_rate": 0.0001, + "loss": 3.023, + "loss/crossentropy": 2.388458263874054, + "loss/hidden": 2.7296875, + "loss/incoh": 0.0, + "loss/logits": 0.23089798912405968, + "loss/reg": 0.0, + "step": 39930 + }, + { + "epoch": 0.26276315789473687, + "grad_norm": 2.484375, + "grad_norm_var": 0.14111226399739582, + "learning_rate": 0.0001, + "loss": 2.9919, + "loss/crossentropy": 2.0341047286987304, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.21918695122003556, + "loss/reg": 0.0, + "step": 39940 + }, + { + "epoch": 0.26282894736842105, + "grad_norm": 3.1875, + "grad_norm_var": 0.183544921875, + "learning_rate": 0.0001, + "loss": 3.0396, + "loss/crossentropy": 2.3217403292655945, + "loss/hidden": 2.95625, + "loss/incoh": 0.0, + "loss/logits": 0.26592089235782623, + "loss/reg": 0.0, + "step": 39950 + }, + { + "epoch": 0.26289473684210524, + "grad_norm": 2.140625, + "grad_norm_var": 0.0977691650390625, + "learning_rate": 0.0001, + "loss": 2.9174, + "loss/crossentropy": 2.55605833530426, + "loss/hidden": 2.690625, + "loss/incoh": 0.0, + "loss/logits": 0.2847170978784561, + "loss/reg": 0.0, + "step": 39960 + }, + { + "epoch": 0.2629605263157895, + "grad_norm": 2.5625, + "grad_norm_var": 0.06572977701822917, + "learning_rate": 0.0001, + "loss": 2.9685, + "loss/crossentropy": 1.9914783239364624, + "loss/hidden": 2.784375, + "loss/incoh": 0.0, + "loss/logits": 0.20305539071559905, + "loss/reg": 0.0, + "step": 39970 + }, + { + "epoch": 0.26302631578947366, + "grad_norm": 2.25, + "grad_norm_var": 0.04635416666666667, + "learning_rate": 0.0001, + "loss": 2.9532, + "loss/crossentropy": 2.478356397151947, + "loss/hidden": 2.8796875, + "loss/incoh": 0.0, + "loss/logits": 0.2187245175242424, + "loss/reg": 0.0, + "step": 39980 + }, + { + "epoch": 0.2630921052631579, + "grad_norm": 2.3125, + "grad_norm_var": 0.03802083333333333, + "learning_rate": 0.0001, + "loss": 2.9101, + "loss/crossentropy": 2.4963066220283507, + "loss/hidden": 2.678125, + "loss/incoh": 0.0, + "loss/logits": 0.23317703753709793, + "loss/reg": 0.0, + "step": 39990 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 2.6875, + "grad_norm_var": 0.10871480305989584, + "learning_rate": 0.0001, + "loss": 2.9937, + "loss/crossentropy": 2.340533971786499, + "loss/hidden": 2.7, + "loss/incoh": 0.0, + "loss/logits": 0.22518680542707442, + "loss/reg": 0.0, + "step": 40000 + } + ], + "logging_steps": 10, + "max_steps": 152000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 20000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8575100320088064e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}