| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.28134063809217, | |
| "eval_steps": 100, | |
| "global_step": 3600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.12890750886239125, | |
| "grad_norm": 0.9527155866900715, | |
| "learning_rate": 0.001, | |
| "loss": 4.2951, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.2578150177247825, | |
| "grad_norm": 0.8613266657338488, | |
| "learning_rate": 0.0009995777619868967, | |
| "loss": 4.1174, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2578150177247825, | |
| "eval_loss": 4.061758041381836, | |
| "eval_runtime": 13.6791, | |
| "eval_samples_per_second": 73.104, | |
| "eval_steps_per_second": 2.339, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3867225265871737, | |
| "grad_norm": 0.588963638591784, | |
| "learning_rate": 0.000998311761801199, | |
| "loss": 4.035, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.515630035449565, | |
| "grad_norm": 0.3000507476828121, | |
| "learning_rate": 0.000996204139796873, | |
| "loss": 3.9794, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.515630035449565, | |
| "eval_loss": 3.951843500137329, | |
| "eval_runtime": 13.2341, | |
| "eval_samples_per_second": 75.563, | |
| "eval_steps_per_second": 2.418, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6445375443119562, | |
| "grad_norm": 1.0882364902367292, | |
| "learning_rate": 0.0009932584592096643, | |
| "loss": 3.9453, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7734450531743474, | |
| "grad_norm": 0.6363211565344357, | |
| "learning_rate": 0.0009894797001329398, | |
| "loss": 3.917, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7734450531743474, | |
| "eval_loss": 3.8788869380950928, | |
| "eval_runtime": 13.3005, | |
| "eval_samples_per_second": 75.185, | |
| "eval_steps_per_second": 2.406, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9023525620367386, | |
| "grad_norm": 0.9910006063725697, | |
| "learning_rate": 0.0009848742510981292, | |
| "loss": 3.805, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.03126007089913, | |
| "grad_norm": 5.070998393490313, | |
| "learning_rate": 0.0009794498982740008, | |
| "loss": 3.7973, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.03126007089913, | |
| "eval_loss": 3.929041862487793, | |
| "eval_runtime": 13.2112, | |
| "eval_samples_per_second": 75.693, | |
| "eval_steps_per_second": 2.422, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.1601675797615212, | |
| "grad_norm": 1.5111119419703132, | |
| "learning_rate": 0.0009732158123030324, | |
| "loss": 3.98, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.2890750886239124, | |
| "grad_norm": 2.9266272457246685, | |
| "learning_rate": 0.0009661825327971316, | |
| "loss": 3.9676, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2890750886239124, | |
| "eval_loss": 3.992887258529663, | |
| "eval_runtime": 13.2769, | |
| "eval_samples_per_second": 75.319, | |
| "eval_steps_per_second": 2.41, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.4179825974863036, | |
| "grad_norm": 1.9372695993602536, | |
| "learning_rate": 0.0009583619505189177, | |
| "loss": 4.0102, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.5468901063486948, | |
| "grad_norm": 1.1325425311995052, | |
| "learning_rate": 0.0009497672872786908, | |
| "loss": 4.0347, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5468901063486948, | |
| "eval_loss": 4.0086283683776855, | |
| "eval_runtime": 13.2747, | |
| "eval_samples_per_second": 75.331, | |
| "eval_steps_per_second": 2.411, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.675797615211086, | |
| "grad_norm": 1.0653229015184138, | |
| "learning_rate": 0.0009404130735810749, | |
| "loss": 4.015, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.8047051240734773, | |
| "grad_norm": 0.9867803701996282, | |
| "learning_rate": 0.0009303151240591263, | |
| "loss": 4.0545, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.8047051240734773, | |
| "eval_loss": 4.0230817794799805, | |
| "eval_runtime": 13.2485, | |
| "eval_samples_per_second": 75.48, | |
| "eval_steps_per_second": 2.415, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.9336126329358685, | |
| "grad_norm": 1.3505264820000504, | |
| "learning_rate": 0.0009194905107374401, | |
| "loss": 4.0391, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.06252014179826, | |
| "grad_norm": 1.4563064030505048, | |
| "learning_rate": 0.0009079575341694557, | |
| "loss": 4.0552, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.06252014179826, | |
| "eval_loss": 4.040332317352295, | |
| "eval_runtime": 13.2387, | |
| "eval_samples_per_second": 75.536, | |
| "eval_steps_per_second": 2.417, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.191427650660651, | |
| "grad_norm": 1.2688404260592696, | |
| "learning_rate": 0.0008957356924977609, | |
| "loss": 4.0285, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.3203351595230424, | |
| "grad_norm": 1.922527341942018, | |
| "learning_rate": 0.0008828456484896984, | |
| "loss": 4.0291, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.3203351595230424, | |
| "eval_loss": 4.0184197425842285, | |
| "eval_runtime": 13.2837, | |
| "eval_samples_per_second": 75.28, | |
| "eval_steps_per_second": 2.409, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.4492426683854336, | |
| "grad_norm": 1.1751317607621916, | |
| "learning_rate": 0.0008693091946040104, | |
| "loss": 4.0198, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.578150177247825, | |
| "grad_norm": 1.4781189301871374, | |
| "learning_rate": 0.000855149216147576, | |
| "loss": 4.0389, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.578150177247825, | |
| "eval_loss": 4.022523403167725, | |
| "eval_runtime": 13.2647, | |
| "eval_samples_per_second": 75.388, | |
| "eval_steps_per_second": 2.412, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.707057686110216, | |
| "grad_norm": 2.8481102078113363, | |
| "learning_rate": 0.000840389652584536, | |
| "loss": 4.0337, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.8359651949726072, | |
| "grad_norm": 1.2842129496698058, | |
| "learning_rate": 0.0008250554570632107, | |
| "loss": 4.0293, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.8359651949726072, | |
| "eval_loss": 4.018503189086914, | |
| "eval_runtime": 13.2539, | |
| "eval_samples_per_second": 75.45, | |
| "eval_steps_per_second": 2.414, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.9648727038349985, | |
| "grad_norm": 3.7613179214902197, | |
| "learning_rate": 0.0008091725542292438, | |
| "loss": 4.0283, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 3.0937802126973897, | |
| "grad_norm": 2.6521101503823346, | |
| "learning_rate": 0.000792767796396289, | |
| "loss": 4.0347, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 3.0937802126973897, | |
| "eval_loss": 4.021185398101807, | |
| "eval_runtime": 13.2939, | |
| "eval_samples_per_second": 75.222, | |
| "eval_steps_per_second": 2.407, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 3.222687721559781, | |
| "grad_norm": 1.7003701014793156, | |
| "learning_rate": 0.0007758689181483412, | |
| "loss": 4.0275, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 3.351595230422172, | |
| "grad_norm": 1.3295380943734334, | |
| "learning_rate": 0.000758504489450466, | |
| "loss": 4.0208, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 3.351595230422172, | |
| "eval_loss": 4.011188507080078, | |
| "eval_runtime": 13.2452, | |
| "eval_samples_per_second": 75.499, | |
| "eval_steps_per_second": 2.416, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 3.4805027392845633, | |
| "grad_norm": 1.3487659259714693, | |
| "learning_rate": 0.0007407038673471959, | |
| "loss": 4.0191, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 3.6094102481469545, | |
| "grad_norm": 0.9861657777387535, | |
| "learning_rate": 0.0007224971463302587, | |
| "loss": 4.0204, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.6094102481469545, | |
| "eval_loss": 4.0111846923828125, | |
| "eval_runtime": 13.2369, | |
| "eval_samples_per_second": 75.546, | |
| "eval_steps_per_second": 2.417, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.7383177570093458, | |
| "grad_norm": 1.0800491613545316, | |
| "learning_rate": 0.0007039151074595432, | |
| "loss": 4.024, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.867225265871737, | |
| "grad_norm": 0.9644339387320809, | |
| "learning_rate": 0.0006849891663233264, | |
| "loss": 4.023, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.867225265871737, | |
| "eval_loss": 4.018366813659668, | |
| "eval_runtime": 13.2404, | |
| "eval_samples_per_second": 75.526, | |
| "eval_steps_per_second": 2.417, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.996132774734128, | |
| "grad_norm": 0.8344627432775175, | |
| "learning_rate": 0.0006657513199257385, | |
| "loss": 4.0233, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 4.12504028359652, | |
| "grad_norm": 1.4143353248285266, | |
| "learning_rate": 0.0006462340925912611, | |
| "loss": 4.0253, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 4.12504028359652, | |
| "eval_loss": 4.0222063064575195, | |
| "eval_runtime": 13.2332, | |
| "eval_samples_per_second": 75.567, | |
| "eval_steps_per_second": 2.418, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 4.253947792458911, | |
| "grad_norm": 0.970349395404879, | |
| "learning_rate": 0.0006264704809777159, | |
| "loss": 4.02, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 4.382855301321302, | |
| "grad_norm": 0.7142593470613441, | |
| "learning_rate": 0.0006064938982907064, | |
| "loss": 4.0186, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 4.382855301321302, | |
| "eval_loss": 4.019730567932129, | |
| "eval_runtime": 13.2783, | |
| "eval_samples_per_second": 75.311, | |
| "eval_steps_per_second": 2.41, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 4.5117628101836935, | |
| "grad_norm": 1.088690826939789, | |
| "learning_rate": 0.0005863381177938257, | |
| "loss": 4.0203, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 4.640670319046085, | |
| "grad_norm": 0.81043587990677, | |
| "learning_rate": 0.0005660372157101351, | |
| "loss": 4.0124, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 4.640670319046085, | |
| "eval_loss": 4.00410795211792, | |
| "eval_runtime": 13.2665, | |
| "eval_samples_per_second": 75.378, | |
| "eval_steps_per_second": 2.412, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 4.769577827908476, | |
| "grad_norm": 1.218343880260603, | |
| "learning_rate": 0.0005456255136114464, | |
| "loss": 4.0095, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 4.898485336770867, | |
| "grad_norm": 1.2906531132627939, | |
| "learning_rate": 0.0005251375203928073, | |
| "loss": 4.013, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 4.898485336770867, | |
| "eval_loss": 4.00699520111084, | |
| "eval_runtime": 13.2744, | |
| "eval_samples_per_second": 75.333, | |
| "eval_steps_per_second": 2.411, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 5.027392845633258, | |
| "grad_norm": 1.0003413201127862, | |
| "learning_rate": 0.0005046078739302906, | |
| "loss": 4.0152, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 5.15630035449565, | |
| "grad_norm": 0.7713832151052984, | |
| "learning_rate": 0.00048407128252072126, | |
| "loss": 4.0107, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 5.15630035449565, | |
| "eval_loss": 4.002959728240967, | |
| "eval_runtime": 13.3166, | |
| "eval_samples_per_second": 75.094, | |
| "eval_steps_per_second": 2.403, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 5.285207863358041, | |
| "grad_norm": 0.6926092337577483, | |
| "learning_rate": 0.0004635624662023483, | |
| "loss": 4.0076, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 5.414115372220432, | |
| "grad_norm": 0.9646552016098948, | |
| "learning_rate": 0.00044311609805566555, | |
| "loss": 4.0071, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 5.414115372220432, | |
| "eval_loss": 3.9943130016326904, | |
| "eval_runtime": 13.2641, | |
| "eval_samples_per_second": 75.392, | |
| "eval_steps_per_second": 2.413, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 5.543022881082823, | |
| "grad_norm": 0.8166002448538517, | |
| "learning_rate": 0.00042276674558362195, | |
| "loss": 4.0057, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 5.6719303899452145, | |
| "grad_norm": 0.49959029988570036, | |
| "learning_rate": 0.0004025488122703244, | |
| "loss": 4.0044, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 5.6719303899452145, | |
| "eval_loss": 3.9941368103027344, | |
| "eval_runtime": 13.2299, | |
| "eval_samples_per_second": 75.586, | |
| "eval_steps_per_second": 2.419, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 5.800837898807606, | |
| "grad_norm": 0.604816142158251, | |
| "learning_rate": 0.00038249647941703896, | |
| "loss": 4.0004, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 5.929745407669997, | |
| "grad_norm": 1.051098262809377, | |
| "learning_rate": 0.00036264364835382446, | |
| "loss": 4.0034, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 5.929745407669997, | |
| "eval_loss": 3.991546392440796, | |
| "eval_runtime": 13.2828, | |
| "eval_samples_per_second": 75.286, | |
| "eval_steps_per_second": 2.409, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 6.058652916532388, | |
| "grad_norm": 1.2325517427842247, | |
| "learning_rate": 0.00034302388312449484, | |
| "loss": 4.0017, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 6.187560425394779, | |
| "grad_norm": 0.5210785551693312, | |
| "learning_rate": 0.0003236703537418149, | |
| "loss": 4.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 6.187560425394779, | |
| "eval_loss": 3.9918057918548584, | |
| "eval_runtime": 13.2383, | |
| "eval_samples_per_second": 75.538, | |
| "eval_steps_per_second": 2.417, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 6.316467934257171, | |
| "grad_norm": 0.8240857136804, | |
| "learning_rate": 0.0003046157801088601, | |
| "loss": 3.9959, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 6.445375443119562, | |
| "grad_norm": 0.8419495345371018, | |
| "learning_rate": 0.00028589237670135104, | |
| "loss": 3.9975, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 6.445375443119562, | |
| "eval_loss": 3.9909591674804688, | |
| "eval_runtime": 13.1974, | |
| "eval_samples_per_second": 75.772, | |
| "eval_steps_per_second": 2.425, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 6.574282951981953, | |
| "grad_norm": 2.509427017361252, | |
| "learning_rate": 0.00026753179810448663, | |
| "loss": 3.9967, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 6.703190460844344, | |
| "grad_norm": 8.950423322917407, | |
| "learning_rate": 0.00024956508549634946, | |
| "loss": 3.9988, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 6.703190460844344, | |
| "eval_loss": 3.988790512084961, | |
| "eval_runtime": 13.3034, | |
| "eval_samples_per_second": 75.169, | |
| "eval_steps_per_second": 2.405, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 6.832097969706735, | |
| "grad_norm": 1.705160199155534, | |
| "learning_rate": 0.00023202261416836547, | |
| "loss": 3.9991, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 6.961005478569127, | |
| "grad_norm": 0.932479506457096, | |
| "learning_rate": 0.00021493404217153968, | |
| "loss": 3.995, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 6.961005478569127, | |
| "eval_loss": 3.9868602752685547, | |
| "eval_runtime": 13.269, | |
| "eval_samples_per_second": 75.364, | |
| "eval_steps_per_second": 2.412, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 7.089912987431518, | |
| "grad_norm": 0.9530127193674827, | |
| "learning_rate": 0.0001983282601752883, | |
| "loss": 3.995, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 7.218820496293909, | |
| "grad_norm": 0.7419385503887451, | |
| "learning_rate": 0.00018223334262363987, | |
| "loss": 3.9892, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 7.218820496293909, | |
| "eval_loss": 3.9841201305389404, | |
| "eval_runtime": 13.3292, | |
| "eval_samples_per_second": 75.023, | |
| "eval_steps_per_second": 2.401, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 7.3477280051563, | |
| "grad_norm": 1.2151076203513222, | |
| "learning_rate": 0.00016667650027138136, | |
| "loss": 3.9926, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 7.4766355140186915, | |
| "grad_norm": 0.8366273124185153, | |
| "learning_rate": 0.00015168403418039446, | |
| "loss": 3.9918, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 7.4766355140186915, | |
| "eval_loss": 3.9802000522613525, | |
| "eval_runtime": 13.3133, | |
| "eval_samples_per_second": 75.113, | |
| "eval_steps_per_second": 2.404, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 7.605543022881083, | |
| "grad_norm": 0.7843548485538294, | |
| "learning_rate": 0.0001372812912539579, | |
| "loss": 3.9914, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 7.734450531743474, | |
| "grad_norm": 1.8243695731678793, | |
| "learning_rate": 0.0001234926213841896, | |
| "loss": 3.9869, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 7.734450531743474, | |
| "eval_loss": 3.9797914028167725, | |
| "eval_runtime": 13.28, | |
| "eval_samples_per_second": 75.301, | |
| "eval_steps_per_second": 2.41, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 7.863358040605865, | |
| "grad_norm": 1.0099425089232086, | |
| "learning_rate": 0.00011034133628508007, | |
| "loss": 3.9861, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 7.992265549468256, | |
| "grad_norm": 0.7468508814468853, | |
| "learning_rate": 9.784967008071359e-05, | |
| "loss": 3.9879, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 7.992265549468256, | |
| "eval_loss": 3.9778568744659424, | |
| "eval_runtime": 13.2408, | |
| "eval_samples_per_second": 75.524, | |
| "eval_steps_per_second": 2.417, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 8.121173058330648, | |
| "grad_norm": 0.946264274777096, | |
| "learning_rate": 8.60387417153083e-05, | |
| "loss": 3.9853, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 8.25008056719304, | |
| "grad_norm": 0.9395931478621461, | |
| "learning_rate": 7.492851924862745e-05, | |
| "loss": 3.9849, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 8.25008056719304, | |
| "eval_loss": 3.976435899734497, | |
| "eval_runtime": 13.2788, | |
| "eval_samples_per_second": 75.308, | |
| "eval_steps_per_second": 2.41, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 8.37898807605543, | |
| "grad_norm": 0.7564304900836561, | |
| "learning_rate": 6.453778609712553e-05, | |
| "loss": 3.9837, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 8.507895584917822, | |
| "grad_norm": 2.7551475980059297, | |
| "learning_rate": 5.4884109277901715e-05, | |
| "loss": 3.9855, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 8.507895584917822, | |
| "eval_loss": 3.9746084213256836, | |
| "eval_runtime": 13.2449, | |
| "eval_samples_per_second": 75.501, | |
| "eval_steps_per_second": 2.416, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 8.636803093780212, | |
| "grad_norm": 0.8044306898019998, | |
| "learning_rate": 4.5983809709151646e-05, | |
| "loss": 3.9826, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 8.765710602642605, | |
| "grad_norm": 0.9568034742858677, | |
| "learning_rate": 3.7851934617325454e-05, | |
| "loss": 3.9828, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 8.765710602642605, | |
| "eval_loss": 3.973742723464966, | |
| "eval_runtime": 13.2588, | |
| "eval_samples_per_second": 75.422, | |
| "eval_steps_per_second": 2.413, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 8.894618111504995, | |
| "grad_norm": 1.4505687452280627, | |
| "learning_rate": 3.0502232097644462e-05, | |
| "loss": 3.9813, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 9.023525620367387, | |
| "grad_norm": 2.29574427985573, | |
| "learning_rate": 2.3947127870985044e-05, | |
| "loss": 3.9807, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 9.023525620367387, | |
| "eval_loss": 3.973203659057617, | |
| "eval_runtime": 13.2536, | |
| "eval_samples_per_second": 75.451, | |
| "eval_steps_per_second": 2.414, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 9.152433129229777, | |
| "grad_norm": 1.8077601710285893, | |
| "learning_rate": 1.8197704276423508e-05, | |
| "loss": 3.9793, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 9.28134063809217, | |
| "grad_norm": 0.7300497045069999, | |
| "learning_rate": 1.3263681534961238e-05, | |
| "loss": 3.9804, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 9.28134063809217, | |
| "eval_loss": 3.9732446670532227, | |
| "eval_runtime": 13.2151, | |
| "eval_samples_per_second": 75.671, | |
| "eval_steps_per_second": 2.421, | |
| "step": 3600 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 3870, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3201337160368128.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |