|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 30, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.044444444444444446, |
|
"grad_norm": 0.318470299243927, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 0.4455, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08888888888888889, |
|
"grad_norm": 0.02355371043086052, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 0.0053, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.036702342331409454, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 0.004, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.13333333333333333, |
|
"eval_loss": 0.0037715784274041653, |
|
"eval_runtime": 108.061, |
|
"eval_samples_per_second": 9.254, |
|
"eval_steps_per_second": 0.231, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 0.07428745925426483, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 0.0038, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.07172686606645584, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 0.0038, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.03130387142300606, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 0.0036, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"eval_loss": 0.0035628757905215025, |
|
"eval_runtime": 108.1607, |
|
"eval_samples_per_second": 9.246, |
|
"eval_steps_per_second": 0.231, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3111111111111111, |
|
"grad_norm": 0.02135792188346386, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 0.0036, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 0.1094546914100647, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 0.0033, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.15919151902198792, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.0035, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.003310541156679392, |
|
"eval_runtime": 108.1107, |
|
"eval_samples_per_second": 9.25, |
|
"eval_steps_per_second": 0.231, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.024381157010793686, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 0.0029, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4888888888888889, |
|
"grad_norm": 0.05485767126083374, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 0.002, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.08088912814855576, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 0.0024, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"eval_loss": 0.0021427080500870943, |
|
"eval_runtime": 108.1623, |
|
"eval_samples_per_second": 9.245, |
|
"eval_steps_per_second": 0.231, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5777777777777777, |
|
"grad_norm": 0.09581312537193298, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 0.0019, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.6222222222222222, |
|
"grad_norm": 0.06600484997034073, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 0.0012, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.08135157078504562, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.0011, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"eval_loss": 0.0011440212838351727, |
|
"eval_runtime": 108.1819, |
|
"eval_samples_per_second": 9.244, |
|
"eval_steps_per_second": 0.231, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 0.0257169920951128, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 0.0018, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7555555555555555, |
|
"grad_norm": 0.02252952568233013, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 0.0013, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.015104785561561584, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.0011, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_loss": 0.0008804052486084402, |
|
"eval_runtime": 108.241, |
|
"eval_samples_per_second": 9.239, |
|
"eval_steps_per_second": 0.231, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8444444444444444, |
|
"grad_norm": 0.037444427609443665, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 0.0009, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.041397638618946075, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.0009, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.04828115180134773, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 0.001, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9333333333333333, |
|
"eval_loss": 0.0008426142740063369, |
|
"eval_runtime": 108.1605, |
|
"eval_samples_per_second": 9.246, |
|
"eval_steps_per_second": 0.231, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9777777777777777, |
|
"grad_norm": 0.09075287729501724, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 0.0011, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.0222222222222221, |
|
"grad_norm": 0.025804603472352028, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 0.0009, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 0.04228019714355469, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 0.0008, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"eval_loss": 0.000761281349696219, |
|
"eval_runtime": 108.2466, |
|
"eval_samples_per_second": 9.238, |
|
"eval_steps_per_second": 0.231, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.07708187401294708, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.0008, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.1555555555555554, |
|
"grad_norm": 0.090398870408535, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 0.0007, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.10483791679143906, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.0009, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.0007036189781501889, |
|
"eval_runtime": 108.3447, |
|
"eval_samples_per_second": 9.23, |
|
"eval_steps_per_second": 0.231, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.2444444444444445, |
|
"grad_norm": 0.046087298542261124, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 0.0008, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.2888888888888888, |
|
"grad_norm": 0.016715016216039658, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 0.0007, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.0191776305437088, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.0006, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"eval_loss": 0.0006860981229692698, |
|
"eval_runtime": 108.3015, |
|
"eval_samples_per_second": 9.233, |
|
"eval_steps_per_second": 0.231, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.3777777777777778, |
|
"grad_norm": 0.03184030205011368, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 0.0007, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.4222222222222223, |
|
"grad_norm": 0.025107963010668755, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 0.0007, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.4666666666666668, |
|
"grad_norm": 0.023573119193315506, |
|
"learning_rate": 1.6543469682057106e-05, |
|
"loss": 0.0006, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.4666666666666668, |
|
"eval_loss": 0.0006219326751306653, |
|
"eval_runtime": 108.2828, |
|
"eval_samples_per_second": 9.235, |
|
"eval_steps_per_second": 0.231, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.511111111111111, |
|
"grad_norm": 0.04575636237859726, |
|
"learning_rate": 1.4033009983067452e-05, |
|
"loss": 0.0007, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.1070500984787941, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 0.0007, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.03425155580043793, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 0.0004, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"eval_loss": 0.0006177299073897302, |
|
"eval_runtime": 108.3475, |
|
"eval_samples_per_second": 9.23, |
|
"eval_steps_per_second": 0.231, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.6444444444444444, |
|
"grad_norm": 0.030520088970661163, |
|
"learning_rate": 7.597595192178702e-06, |
|
"loss": 0.0006, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.6888888888888889, |
|
"grad_norm": 0.010435817763209343, |
|
"learning_rate": 5.852620357053651e-06, |
|
"loss": 0.0005, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.7333333333333334, |
|
"grad_norm": 0.020339515060186386, |
|
"learning_rate": 4.322727117869951e-06, |
|
"loss": 0.0007, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.7333333333333334, |
|
"eval_loss": 0.0006012282683514059, |
|
"eval_runtime": 108.2882, |
|
"eval_samples_per_second": 9.235, |
|
"eval_steps_per_second": 0.231, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.10952532291412354, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 0.0008, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.8222222222222222, |
|
"grad_norm": 0.017915133386850357, |
|
"learning_rate": 1.9369152030840556e-06, |
|
"loss": 0.0007, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.8666666666666667, |
|
"grad_norm": 0.04453803971409798, |
|
"learning_rate": 1.0926199633097157e-06, |
|
"loss": 0.0005, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.8666666666666667, |
|
"eval_loss": 0.0006457158015109599, |
|
"eval_runtime": 108.2276, |
|
"eval_samples_per_second": 9.24, |
|
"eval_steps_per_second": 0.231, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.911111111111111, |
|
"grad_norm": 0.02466416358947754, |
|
"learning_rate": 4.865965629214819e-07, |
|
"loss": 0.0006, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.9555555555555557, |
|
"grad_norm": 0.030644970014691353, |
|
"learning_rate": 1.2179748700879012e-07, |
|
"loss": 0.0005, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.01875125616788864, |
|
"learning_rate": 0.0, |
|
"loss": 0.0005, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.0005785770481452346, |
|
"eval_runtime": 108.2462, |
|
"eval_samples_per_second": 9.238, |
|
"eval_steps_per_second": 0.231, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 450, |
|
"total_flos": 1.6644589041987092e+18, |
|
"train_loss": 0.01136111196440955, |
|
"train_runtime": 7966.0994, |
|
"train_samples_per_second": 2.26, |
|
"train_steps_per_second": 0.056 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6644589041987092e+18, |
|
"train_batch_size": 10, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|