|
{ |
|
"best_metric": 0.7979517910648003, |
|
"best_model_checkpoint": "checkpoint/cross_encoder_20250522_full_data/checkpoint-16219", |
|
"epoch": 7.0, |
|
"eval_steps": 500, |
|
"global_step": 16219, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.9638161659240723, |
|
"learning_rate": 4.315925766076824e-09, |
|
"loss": 0.6827, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.8890857696533203, |
|
"learning_rate": 8.631851532153649e-07, |
|
"loss": 0.6745, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.8610997200012207, |
|
"learning_rate": 1.7263703064307298e-06, |
|
"loss": 0.5921, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 4.058478832244873, |
|
"learning_rate": 2.5895554596460943e-06, |
|
"loss": 0.5238, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 2.51100492477417, |
|
"learning_rate": 3.4527406128614595e-06, |
|
"loss": 0.4943, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.381241798400879, |
|
"learning_rate": 4.3159257660768235e-06, |
|
"loss": 0.4882, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 3.5782713890075684, |
|
"learning_rate": 5.179110919292189e-06, |
|
"loss": 0.4782, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 5.436559200286865, |
|
"learning_rate": 6.042296072507553e-06, |
|
"loss": 0.4719, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.6600098609924316, |
|
"learning_rate": 6.905481225722919e-06, |
|
"loss": 0.4593, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.711264610290527, |
|
"learning_rate": 7.768666378938283e-06, |
|
"loss": 0.4629, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 7.427570819854736, |
|
"learning_rate": 8.631851532153647e-06, |
|
"loss": 0.4549, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 2.386420965194702, |
|
"learning_rate": 9.495036685369013e-06, |
|
"loss": 0.4572, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_f2": 0.7404627946070191, |
|
"eval_loss": 0.4705376923084259, |
|
"eval_precision": 0.47348360206208184, |
|
"eval_recall": 0.8619708466453674, |
|
"eval_runtime": 147.2641, |
|
"eval_samples_per_second": 816.166, |
|
"eval_steps_per_second": 6.376, |
|
"step": 2317 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 3.0445728302001953, |
|
"learning_rate": 9.999609111230013e-06, |
|
"loss": 0.4511, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 2.2899084091186523, |
|
"learning_rate": 9.995456298935638e-06, |
|
"loss": 0.447, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 4.465543270111084, |
|
"learning_rate": 9.986768615435655e-06, |
|
"loss": 0.4405, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 4.2559404373168945, |
|
"learning_rate": 9.973553947402149e-06, |
|
"loss": 0.4436, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 4.245833873748779, |
|
"learning_rate": 9.955824291100119e-06, |
|
"loss": 0.4393, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 4.579006671905518, |
|
"learning_rate": 9.933595741497281e-06, |
|
"loss": 0.4376, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 7.389001369476318, |
|
"learning_rate": 9.906888477653065e-06, |
|
"loss": 0.4373, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 4.0490312576293945, |
|
"learning_rate": 9.875726744400081e-06, |
|
"loss": 0.4345, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 2.065431594848633, |
|
"learning_rate": 9.840138830334685e-06, |
|
"loss": 0.4327, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 3.0262961387634277, |
|
"learning_rate": 9.800157042136608e-06, |
|
"loss": 0.4327, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 2.6107230186462402, |
|
"learning_rate": 9.755817675240981e-06, |
|
"loss": 0.4294, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 4.621052265167236, |
|
"learning_rate": 9.707160980889367e-06, |
|
"loss": 0.4283, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_f2": 0.7717545519035234, |
|
"eval_loss": 0.4515298902988434, |
|
"eval_precision": 0.4774057047330478, |
|
"eval_recall": 0.9123901757188498, |
|
"eval_runtime": 148.5197, |
|
"eval_samples_per_second": 809.266, |
|
"eval_steps_per_second": 6.322, |
|
"step": 4634 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 3.324246644973755, |
|
"learning_rate": 9.65423112958972e-06, |
|
"loss": 0.4244, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 4.125003337860107, |
|
"learning_rate": 9.597076171018426e-06, |
|
"loss": 0.4183, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 2.1959779262542725, |
|
"learning_rate": 9.535747990400856e-06, |
|
"loss": 0.4164, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 1.7736337184906006, |
|
"learning_rate": 9.470302261409985e-06, |
|
"loss": 0.4205, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 4.9517903327941895, |
|
"learning_rate": 9.400798395625894e-06, |
|
"loss": 0.4176, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 6.079417705535889, |
|
"learning_rate": 9.327299488601976e-06, |
|
"loss": 0.4202, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 1.3246735334396362, |
|
"learning_rate": 9.249872262586839e-06, |
|
"loss": 0.4218, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 2.6098692417144775, |
|
"learning_rate": 9.168587005953913e-06, |
|
"loss": 0.4182, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 3.9474799633026123, |
|
"learning_rate": 9.083517509393716e-06, |
|
"loss": 0.415, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 3.5382373332977295, |
|
"learning_rate": 8.994740998926724e-06, |
|
"loss": 0.4186, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 2.33327054977417, |
|
"learning_rate": 8.902338065797648e-06, |
|
"loss": 0.4115, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_f2": 0.7773410482632972, |
|
"eval_loss": 0.4485355019569397, |
|
"eval_precision": 0.47962529274004684, |
|
"eval_recall": 0.9201277955271565, |
|
"eval_runtime": 147.9088, |
|
"eval_samples_per_second": 812.609, |
|
"eval_steps_per_second": 6.349, |
|
"step": 6951 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 3.269026517868042, |
|
"learning_rate": 8.806392593314781e-06, |
|
"loss": 0.4167, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 2.844139575958252, |
|
"learning_rate": 8.70699168070078e-06, |
|
"loss": 0.4101, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 2.2456936836242676, |
|
"learning_rate": 8.604225564024074e-06, |
|
"loss": 0.4095, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 2.3910915851593018, |
|
"learning_rate": 8.498187534282632e-06, |
|
"loss": 0.407, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 2.2186901569366455, |
|
"learning_rate": 8.388973852714463e-06, |
|
"loss": 0.4044, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 3.1750833988189697, |
|
"learning_rate": 8.276683663411758e-06, |
|
"loss": 0.4048, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 5.709492206573486, |
|
"learning_rate": 8.161418903317936e-06, |
|
"loss": 0.4061, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 6.942538738250732, |
|
"learning_rate": 8.043284209689402e-06, |
|
"loss": 0.4067, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 1.3552676439285278, |
|
"learning_rate": 7.922386825105899e-06, |
|
"loss": 0.4046, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 3.5627899169921875, |
|
"learning_rate": 7.798836500115803e-06, |
|
"loss": 0.407, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 2.0421602725982666, |
|
"learning_rate": 7.672745393604649e-06, |
|
"loss": 0.4037, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 4.619537830352783, |
|
"learning_rate": 7.544227970977395e-06, |
|
"loss": 0.4021, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_f2": 0.7901536373601518, |
|
"eval_loss": 0.4387025237083435, |
|
"eval_precision": 0.5217129071170085, |
|
"eval_recall": 0.9067991214057508, |
|
"eval_runtime": 148.7755, |
|
"eval_samples_per_second": 807.875, |
|
"eval_steps_per_second": 6.312, |
|
"step": 9268 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 1.7471312284469604, |
|
"learning_rate": 7.413400900246815e-06, |
|
"loss": 0.3959, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 7.655578136444092, |
|
"learning_rate": 7.280382946122369e-06, |
|
"loss": 0.3996, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 4.281186580657959, |
|
"learning_rate": 7.1452948621957e-06, |
|
"loss": 0.3943, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 3.821415424346924, |
|
"learning_rate": 7.0082592813206175e-06, |
|
"loss": 0.3972, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 2.6942179203033447, |
|
"learning_rate": 6.869400604287093e-06, |
|
"loss": 0.3939, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 4.14546012878418, |
|
"learning_rate": 6.7288448868903225e-06, |
|
"loss": 0.3997, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 2.8214612007141113, |
|
"learning_rate": 6.586719725497375e-06, |
|
"loss": 0.3993, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 2.5755832195281982, |
|
"learning_rate": 6.443154141215318e-06, |
|
"loss": 0.3969, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 1.6153743267059326, |
|
"learning_rate": 6.298278462765959e-06, |
|
"loss": 0.3987, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 7.3942766189575195, |
|
"learning_rate": 6.152224208173533e-06, |
|
"loss": 0.3946, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 1.083901047706604, |
|
"learning_rate": 6.005123965372751e-06, |
|
"loss": 0.3918, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_f2": 0.7704512576415139, |
|
"eval_loss": 0.44662219285964966, |
|
"eval_precision": 0.6110967168819632, |
|
"eval_recall": 0.8241813099041534, |
|
"eval_runtime": 149.5247, |
|
"eval_samples_per_second": 803.827, |
|
"eval_steps_per_second": 6.28, |
|
"step": 11585 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"grad_norm": 3.2887141704559326, |
|
"learning_rate": 5.857111271845573e-06, |
|
"loss": 0.3935, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"grad_norm": 2.212423801422119, |
|
"learning_rate": 5.708320493395999e-06, |
|
"loss": 0.3908, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"grad_norm": 0.9465045928955078, |
|
"learning_rate": 5.558886702172891e-06, |
|
"loss": 0.3854, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"grad_norm": 5.766241073608398, |
|
"learning_rate": 5.408945554051591e-06, |
|
"loss": 0.3912, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"grad_norm": 4.334184646606445, |
|
"learning_rate": 5.258633165485625e-06, |
|
"loss": 0.3866, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"grad_norm": 4.286846160888672, |
|
"learning_rate": 5.108085989940292e-06, |
|
"loss": 0.3919, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 1.167948603630066, |
|
"learning_rate": 4.95744069402033e-06, |
|
"loss": 0.3898, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 5.61, |
|
"grad_norm": 4.312135696411133, |
|
"learning_rate": 4.806834033404065e-06, |
|
"loss": 0.3894, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 2.726715564727783, |
|
"learning_rate": 4.6564027286967275e-06, |
|
"loss": 0.3918, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 5.78, |
|
"grad_norm": 2.4171202182769775, |
|
"learning_rate": 4.5062833413156e-06, |
|
"loss": 0.3917, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 5.87, |
|
"grad_norm": 5.957132339477539, |
|
"learning_rate": 4.3566121495196656e-06, |
|
"loss": 0.3849, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 4.632606029510498, |
|
"learning_rate": 4.20752502469631e-06, |
|
"loss": 0.3879, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_f2": 0.7946750167423844, |
|
"eval_loss": 0.43369919061660767, |
|
"eval_precision": 0.5782680276588739, |
|
"eval_recall": 0.8766972843450479, |
|
"eval_runtime": 148.7395, |
|
"eval_samples_per_second": 808.07, |
|
"eval_steps_per_second": 6.313, |
|
"step": 13902 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"grad_norm": 1.9424314498901367, |
|
"learning_rate": 4.0591573080173995e-06, |
|
"loss": 0.3873, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 6.13, |
|
"grad_norm": 2.831543445587158, |
|
"learning_rate": 3.911643687576664e-06, |
|
"loss": 0.3807, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 6.21, |
|
"grad_norm": 2.7500486373901367, |
|
"learning_rate": 3.7651180761199505e-06, |
|
"loss": 0.3801, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 6.3, |
|
"grad_norm": 4.015429496765137, |
|
"learning_rate": 3.619713489479354e-06, |
|
"loss": 0.3824, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 6.39, |
|
"grad_norm": 2.4831278324127197, |
|
"learning_rate": 3.4755619258215407e-06, |
|
"loss": 0.3808, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 6.47, |
|
"grad_norm": 7.442523002624512, |
|
"learning_rate": 3.3327942458199193e-06, |
|
"loss": 0.385, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 6.56, |
|
"grad_norm": 4.242701530456543, |
|
"learning_rate": 3.1915400538594333e-06, |
|
"loss": 0.3832, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 6.65, |
|
"grad_norm": 5.000258922576904, |
|
"learning_rate": 3.0519275803818014e-06, |
|
"loss": 0.3805, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 2.0560362339019775, |
|
"learning_rate": 2.914083565478024e-06, |
|
"loss": 0.381, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 1.8777313232421875, |
|
"learning_rate": 2.7781331438338317e-06, |
|
"loss": 0.3831, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"grad_norm": 1.0822844505310059, |
|
"learning_rate": 2.6441997311325196e-06, |
|
"loss": 0.3797, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 6.99, |
|
"grad_norm": 3.2328083515167236, |
|
"learning_rate": 2.5124049120182916e-06, |
|
"loss": 0.383, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_f2": 0.7979517910648003, |
|
"eval_loss": 0.4335618019104004, |
|
"eval_precision": 0.5632793509486378, |
|
"eval_recall": 0.8907248402555911, |
|
"eval_runtime": 148.5629, |
|
"eval_samples_per_second": 809.031, |
|
"eval_steps_per_second": 6.321, |
|
"step": 16219 |
|
} |
|
], |
|
"logging_steps": 200, |
|
"max_steps": 23170, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 5.462269789659464e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|