{
  "best_metric": 2.5859532356262207,
  "best_model_checkpoint": "./FT_models/[LDH]0224_docs_chatml/checkpoint-500",
  "epoch": 52.23880597014925,
  "eval_steps": 500,
  "global_step": 3500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.14925373134328357,
      "grad_norm": 0.7784017324447632,
      "learning_rate": 0.0001999995055317446,
      "loss": 2.1851,
      "step": 10
    },
    {
      "epoch": 0.29850746268656714,
      "grad_norm": 0.5991038084030151,
      "learning_rate": 0.0001999955498150411,
      "loss": 1.5526,
      "step": 20
    },
    {
      "epoch": 0.44776119402985076,
      "grad_norm": 0.6780698895454407,
      "learning_rate": 0.00019998763853811184,
      "loss": 1.405,
      "step": 30
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 0.7235689759254456,
      "learning_rate": 0.00019997577201390606,
      "loss": 1.3309,
      "step": 40
    },
    {
      "epoch": 0.746268656716418,
      "grad_norm": 0.7904847860336304,
      "learning_rate": 0.0001999599507118322,
      "loss": 1.1353,
      "step": 50
    },
    {
      "epoch": 0.8955223880597015,
      "grad_norm": 0.7685543894767761,
      "learning_rate": 0.00019994017525773913,
      "loss": 1.0295,
      "step": 60
    },
    {
      "epoch": 1.044776119402985,
      "grad_norm": 0.963201642036438,
      "learning_rate": 0.0001999164464338918,
      "loss": 0.9815,
      "step": 70
    },
    {
      "epoch": 1.1940298507462686,
      "grad_norm": 1.0725153684616089,
      "learning_rate": 0.0001998887651789398,
      "loss": 0.7019,
      "step": 80
    },
    {
      "epoch": 1.3432835820895521,
      "grad_norm": 1.2167037725448608,
      "learning_rate": 0.0001998571325878806,
      "loss": 0.639,
      "step": 90
    },
    {
      "epoch": 1.4925373134328357,
      "grad_norm": 1.256081461906433,
      "learning_rate": 0.00019982154991201608,
      "loss": 0.5614,
      "step": 100
    },
    {
      "epoch": 1.6417910447761193,
      "grad_norm": 0.7639100551605225,
      "learning_rate": 0.00019978201855890308,
      "loss": 0.5479,
      "step": 110
    },
    {
      "epoch": 1.7910447761194028,
      "grad_norm": 1.0223500728607178,
      "learning_rate": 0.00019973854009229763,
      "loss": 0.3933,
      "step": 120
    },
    {
      "epoch": 1.9402985074626866,
      "grad_norm": 1.0072954893112183,
      "learning_rate": 0.00019969111623209323,
      "loss": 0.4029,
      "step": 130
    },
    {
      "epoch": 2.08955223880597,
      "grad_norm": 1.0809686183929443,
      "learning_rate": 0.00019963974885425266,
      "loss": 0.3913,
      "step": 140
    },
    {
      "epoch": 2.2388059701492535,
      "grad_norm": 0.4724240005016327,
      "learning_rate": 0.00019958443999073397,
      "loss": 0.2301,
      "step": 150
    },
    {
      "epoch": 2.388059701492537,
      "grad_norm": 0.6505578756332397,
      "learning_rate": 0.00019952519182940993,
      "loss": 0.2533,
      "step": 160
    },
    {
      "epoch": 2.5373134328358207,
      "grad_norm": 0.39939674735069275,
      "learning_rate": 0.0001994620067139815,
      "loss": 0.1551,
      "step": 170
    },
    {
      "epoch": 2.6865671641791042,
      "grad_norm": 0.26736512780189514,
      "learning_rate": 0.00019939488714388524,
      "loss": 0.2489,
      "step": 180
    },
    {
      "epoch": 2.835820895522388,
      "grad_norm": 0.7464708089828491,
      "learning_rate": 0.00019932383577419432,
      "loss": 0.2118,
      "step": 190
    },
    {
      "epoch": 2.9850746268656714,
      "grad_norm": 0.348090261220932,
      "learning_rate": 0.0001992488554155135,
      "loss": 0.2277,
      "step": 200
    },
    {
      "epoch": 3.1343283582089554,
      "grad_norm": 0.4606758654117584,
      "learning_rate": 0.0001991699490338681,
      "loss": 0.1216,
      "step": 210
    },
    {
      "epoch": 3.283582089552239,
      "grad_norm": 0.5326898694038391,
      "learning_rate": 0.00019908711975058637,
      "loss": 0.0946,
      "step": 220
    },
    {
      "epoch": 3.4328358208955225,
      "grad_norm": 0.5780682563781738,
      "learning_rate": 0.00019900037084217637,
      "loss": 0.0982,
      "step": 230
    },
    {
      "epoch": 3.582089552238806,
      "grad_norm": 0.5874966382980347,
      "learning_rate": 0.00019890970574019617,
      "loss": 0.1244,
      "step": 240
    },
    {
      "epoch": 3.7313432835820897,
      "grad_norm": 0.5044897198677063,
      "learning_rate": 0.00019881512803111796,
      "loss": 0.1127,
      "step": 250
    },
    {
      "epoch": 3.8805970149253732,
      "grad_norm": 0.45550045371055603,
      "learning_rate": 0.00019871664145618657,
      "loss": 0.1211,
      "step": 260
    },
    {
      "epoch": 4.029850746268656,
      "grad_norm": 0.3092655539512634,
      "learning_rate": 0.00019861424991127115,
      "loss": 0.1156,
      "step": 270
    },
    {
      "epoch": 4.17910447761194,
      "grad_norm": 0.4061109721660614,
      "learning_rate": 0.00019850795744671116,
      "loss": 0.0695,
      "step": 280
    },
    {
      "epoch": 4.3283582089552235,
      "grad_norm": 0.36566805839538574,
      "learning_rate": 0.00019839776826715614,
      "loss": 0.0677,
      "step": 290
    },
    {
      "epoch": 4.477611940298507,
      "grad_norm": 0.29778343439102173,
      "learning_rate": 0.00019828368673139947,
      "loss": 0.0445,
      "step": 300
    },
    {
      "epoch": 4.6268656716417915,
      "grad_norm": 0.18671129643917084,
      "learning_rate": 0.00019816571735220583,
      "loss": 0.0644,
      "step": 310
    },
    {
      "epoch": 4.776119402985074,
      "grad_norm": 0.3877570629119873,
      "learning_rate": 0.0001980438647961327,
      "loss": 0.0912,
      "step": 320
    },
    {
      "epoch": 4.925373134328359,
      "grad_norm": 0.5179444551467896,
      "learning_rate": 0.00019791813388334581,
      "loss": 0.0607,
      "step": 330
    },
    {
      "epoch": 5.074626865671641,
      "grad_norm": 0.15815891325473785,
      "learning_rate": 0.00019778852958742853,
      "loss": 0.0542,
      "step": 340
    },
    {
      "epoch": 5.223880597014926,
      "grad_norm": 0.42946234345436096,
      "learning_rate": 0.00019765505703518496,
      "loss": 0.0563,
      "step": 350
    },
    {
      "epoch": 5.373134328358209,
      "grad_norm": 0.5582062005996704,
      "learning_rate": 0.00019751772150643722,
      "loss": 0.0408,
      "step": 360
    },
    {
      "epoch": 5.522388059701493,
      "grad_norm": 0.2707679569721222,
      "learning_rate": 0.0001973765284338167,
      "loss": 0.0395,
      "step": 370
    },
    {
      "epoch": 5.6716417910447765,
      "grad_norm": 0.5061952471733093,
      "learning_rate": 0.00019723148340254892,
      "loss": 0.0501,
      "step": 380
    },
    {
      "epoch": 5.82089552238806,
      "grad_norm": 0.31659457087516785,
      "learning_rate": 0.0001970825921502328,
      "loss": 0.049,
      "step": 390
    },
    {
      "epoch": 5.970149253731344,
      "grad_norm": 0.2836756110191345,
      "learning_rate": 0.00019692986056661356,
      "loss": 0.0483,
      "step": 400
    },
    {
      "epoch": 6.119402985074627,
      "grad_norm": 0.1509529948234558,
      "learning_rate": 0.0001967732946933499,
      "loss": 0.0394,
      "step": 410
    },
    {
      "epoch": 6.268656716417911,
      "grad_norm": 0.2609387934207916,
      "learning_rate": 0.00019661290072377482,
      "loss": 0.0388,
      "step": 420
    },
    {
      "epoch": 6.417910447761194,
      "grad_norm": 0.5987827777862549,
      "learning_rate": 0.0001964486850026507,
      "loss": 0.0383,
      "step": 430
    },
    {
      "epoch": 6.567164179104478,
      "grad_norm": 0.4549473822116852,
      "learning_rate": 0.00019628065402591845,
      "loss": 0.038,
      "step": 440
    },
    {
      "epoch": 6.7164179104477615,
      "grad_norm": 0.2026955932378769,
      "learning_rate": 0.0001961088144404403,
      "loss": 0.0384,
      "step": 450
    },
    {
      "epoch": 6.865671641791045,
      "grad_norm": 0.21201670169830322,
      "learning_rate": 0.00019593317304373705,
      "loss": 0.0347,
      "step": 460
    },
    {
      "epoch": 7.014925373134329,
      "grad_norm": 0.0839223638176918,
      "learning_rate": 0.00019575373678371909,
      "loss": 0.0458,
      "step": 470
    },
    {
      "epoch": 7.164179104477612,
      "grad_norm": 0.38603273034095764,
      "learning_rate": 0.0001955705127584117,
      "loss": 0.0366,
      "step": 480
    },
    {
      "epoch": 7.313432835820896,
      "grad_norm": 0.22731441259384155,
      "learning_rate": 0.00019538350821567404,
      "loss": 0.0324,
      "step": 490
    },
    {
      "epoch": 7.462686567164179,
      "grad_norm": 0.3230607509613037,
      "learning_rate": 0.00019519273055291266,
      "loss": 0.0321,
      "step": 500
    },
    {
      "epoch": 7.462686567164179,
      "eval_loss": 2.5859532356262207,
      "eval_runtime": 34.8624,
      "eval_samples_per_second": 9.093,
      "eval_steps_per_second": 4.561,
      "step": 500
    },
    {
      "epoch": 7.611940298507463,
      "grad_norm": 0.1323220282793045,
      "learning_rate": 0.00019499818731678873,
      "loss": 0.0271,
      "step": 510
    },
    {
      "epoch": 7.7611940298507465,
      "grad_norm": 0.17675843834877014,
      "learning_rate": 0.00019479988620291956,
      "loss": 0.0356,
      "step": 520
    },
    {
      "epoch": 7.91044776119403,
      "grad_norm": 0.5695576071739197,
      "learning_rate": 0.00019459783505557424,
      "loss": 0.0377,
      "step": 530
    },
    {
      "epoch": 8.059701492537313,
      "grad_norm": 0.14659564197063446,
      "learning_rate": 0.0001943920418673633,
      "loss": 0.0295,
      "step": 540
    },
    {
      "epoch": 8.208955223880597,
      "grad_norm": 0.3027053475379944,
      "learning_rate": 0.0001941825147789225,
      "loss": 0.0283,
      "step": 550
    },
    {
      "epoch": 8.35820895522388,
      "grad_norm": 0.3131660521030426,
      "learning_rate": 0.00019396926207859084,
      "loss": 0.0354,
      "step": 560
    },
    {
      "epoch": 8.507462686567164,
      "grad_norm": 0.2541915476322174,
      "learning_rate": 0.00019375229220208276,
      "loss": 0.0305,
      "step": 570
    },
    {
      "epoch": 8.656716417910447,
      "grad_norm": 0.15617048740386963,
      "learning_rate": 0.0001935316137321543,
      "loss": 0.0374,
      "step": 580
    },
    {
      "epoch": 8.805970149253731,
      "grad_norm": 0.8228221535682678,
      "learning_rate": 0.00019330723539826375,
      "loss": 0.0332,
      "step": 590
    },
    {
      "epoch": 8.955223880597014,
      "grad_norm": 0.29615482687950134,
      "learning_rate": 0.0001930791660762262,
      "loss": 0.0319,
      "step": 600
    },
    {
      "epoch": 9.104477611940299,
      "grad_norm": 0.222768172621727,
      "learning_rate": 0.0001928474147878626,
      "loss": 0.0279,
      "step": 610
    },
    {
      "epoch": 9.253731343283581,
      "grad_norm": 0.1895366758108139,
      "learning_rate": 0.0001926119907006426,
      "loss": 0.0337,
      "step": 620
    },
    {
      "epoch": 9.402985074626866,
      "grad_norm": 0.19570475816726685,
      "learning_rate": 0.00019237290312732226,
      "loss": 0.0276,
      "step": 630
    },
    {
      "epoch": 9.552238805970148,
      "grad_norm": 0.5487334728240967,
      "learning_rate": 0.0001921301615255754,
      "loss": 0.0346,
      "step": 640
    },
    {
      "epoch": 9.701492537313433,
      "grad_norm": 0.17226798832416534,
      "learning_rate": 0.00019188377549761963,
      "loss": 0.0256,
      "step": 650
    },
    {
      "epoch": 9.850746268656717,
      "grad_norm": 0.2766994833946228,
      "learning_rate": 0.00019163375478983632,
      "loss": 0.0296,
      "step": 660
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.25644323229789734,
      "learning_rate": 0.00019138010929238534,
      "loss": 0.0326,
      "step": 670
    },
    {
      "epoch": 10.149253731343283,
      "grad_norm": 0.3244101405143738,
      "learning_rate": 0.0001911228490388136,
      "loss": 0.0262,
      "step": 680
    },
    {
      "epoch": 10.298507462686567,
      "grad_norm": 0.24429234862327576,
      "learning_rate": 0.00019086198420565823,
      "loss": 0.0321,
      "step": 690
    },
    {
      "epoch": 10.447761194029852,
      "grad_norm": 0.15597452223300934,
      "learning_rate": 0.000190597525112044,
      "loss": 0.0314,
      "step": 700
    },
    {
      "epoch": 10.597014925373134,
      "grad_norm": 0.22565604746341705,
      "learning_rate": 0.00019032948221927524,
      "loss": 0.028,
      "step": 710
    },
    {
      "epoch": 10.746268656716419,
      "grad_norm": 0.1876748502254486,
      "learning_rate": 0.00019005786613042185,
      "loss": 0.0292,
      "step": 720
    },
    {
      "epoch": 10.895522388059701,
      "grad_norm": 0.33821797370910645,
      "learning_rate": 0.00018978268758989991,
      "loss": 0.0289,
      "step": 730
    },
    {
      "epoch": 11.044776119402986,
      "grad_norm": 0.1624440997838974,
      "learning_rate": 0.00018950395748304678,
      "loss": 0.0306,
      "step": 740
    },
    {
      "epoch": 11.194029850746269,
      "grad_norm": 0.17152149975299835,
      "learning_rate": 0.0001892216868356904,
      "loss": 0.0293,
      "step": 750
    },
    {
      "epoch": 11.343283582089553,
      "grad_norm": 0.22814153134822845,
      "learning_rate": 0.00018893588681371303,
      "loss": 0.0294,
      "step": 760
    },
    {
      "epoch": 11.492537313432836,
      "grad_norm": 0.3319993317127228,
      "learning_rate": 0.00018864656872260985,
      "loss": 0.0282,
      "step": 770
    },
    {
      "epoch": 11.64179104477612,
      "grad_norm": 0.1329302042722702,
      "learning_rate": 0.00018835374400704154,
      "loss": 0.029,
      "step": 780
    },
    {
      "epoch": 11.791044776119403,
      "grad_norm": 0.2312331646680832,
      "learning_rate": 0.00018805742425038145,
      "loss": 0.0313,
      "step": 790
    },
    {
      "epoch": 11.940298507462687,
      "grad_norm": 0.11133890599012375,
      "learning_rate": 0.00018775762117425777,
      "loss": 0.0291,
      "step": 800
    },
    {
      "epoch": 12.08955223880597,
      "grad_norm": 0.42977747321128845,
      "learning_rate": 0.00018745434663808942,
      "loss": 0.0293,
      "step": 810
    },
    {
      "epoch": 12.238805970149254,
      "grad_norm": 0.4400336742401123,
      "learning_rate": 0.00018714761263861728,
      "loss": 0.0258,
      "step": 820
    },
    {
      "epoch": 12.388059701492537,
      "grad_norm": 0.3886387348175049,
      "learning_rate": 0.00018683743130942928,
      "loss": 0.03,
      "step": 830
    },
    {
      "epoch": 12.537313432835822,
      "grad_norm": 0.1791338473558426,
      "learning_rate": 0.00018652381492048083,
      "loss": 0.0259,
      "step": 840
    },
    {
      "epoch": 12.686567164179104,
      "grad_norm": 0.17430442571640015,
      "learning_rate": 0.00018620677587760916,
      "loss": 0.0282,
      "step": 850
    },
    {
      "epoch": 12.835820895522389,
      "grad_norm": 0.3342001140117645,
      "learning_rate": 0.00018588632672204264,
      "loss": 0.0301,
      "step": 860
    },
    {
      "epoch": 12.985074626865671,
      "grad_norm": 0.2671709358692169,
      "learning_rate": 0.00018556248012990468,
      "loss": 0.0269,
      "step": 870
    },
    {
      "epoch": 13.134328358208956,
      "grad_norm": 0.27415236830711365,
      "learning_rate": 0.0001852352489117124,
      "loss": 0.026,
      "step": 880
    },
    {
      "epoch": 13.283582089552239,
      "grad_norm": 0.13104292750358582,
      "learning_rate": 0.0001849046460118698,
      "loss": 0.0228,
      "step": 890
    },
    {
      "epoch": 13.432835820895523,
      "grad_norm": 0.0972188338637352,
      "learning_rate": 0.00018457068450815562,
      "loss": 0.0224,
      "step": 900
    },
    {
      "epoch": 13.582089552238806,
      "grad_norm": 0.36645978689193726,
      "learning_rate": 0.00018423337761120618,
      "loss": 0.023,
      "step": 910
    },
    {
      "epoch": 13.73134328358209,
      "grad_norm": 0.09311423450708389,
      "learning_rate": 0.00018389273866399275,
      "loss": 0.0249,
      "step": 920
    },
    {
      "epoch": 13.880597014925373,
      "grad_norm": 0.26524588465690613,
      "learning_rate": 0.00018354878114129367,
      "loss": 0.0283,
      "step": 930
    },
    {
      "epoch": 14.029850746268657,
      "grad_norm": 0.09386450052261353,
      "learning_rate": 0.00018320151864916135,
      "loss": 0.0248,
      "step": 940
    },
    {
      "epoch": 14.17910447761194,
      "grad_norm": 0.15628236532211304,
      "learning_rate": 0.00018285096492438424,
      "loss": 0.0242,
      "step": 950
    },
    {
      "epoch": 14.328358208955224,
      "grad_norm": 0.09542356431484222,
      "learning_rate": 0.00018249713383394303,
      "loss": 0.0227,
      "step": 960
    },
    {
      "epoch": 14.477611940298507,
      "grad_norm": 0.07725339382886887,
      "learning_rate": 0.00018214003937446253,
      "loss": 0.0238,
      "step": 970
    },
    {
      "epoch": 14.626865671641792,
      "grad_norm": 0.10331985354423523,
      "learning_rate": 0.0001817796956716578,
      "loss": 0.0269,
      "step": 980
    },
    {
      "epoch": 14.776119402985074,
      "grad_norm": 0.15463414788246155,
      "learning_rate": 0.00018141611697977529,
      "loss": 0.0217,
      "step": 990
    },
    {
      "epoch": 14.925373134328359,
      "grad_norm": 0.14817017316818237,
      "learning_rate": 0.0001810493176810292,
      "loss": 0.025,
      "step": 1000
    },
    {
      "epoch": 14.925373134328359,
      "eval_loss": 2.6268842220306396,
      "eval_runtime": 34.7235,
      "eval_samples_per_second": 9.129,
      "eval_steps_per_second": 4.579,
      "step": 1000
    },
    {
      "epoch": 15.074626865671641,
      "grad_norm": 0.12652496993541718,
      "learning_rate": 0.00018067931228503246,
      "loss": 0.0209,
      "step": 1010
    },
    {
      "epoch": 15.223880597014926,
      "grad_norm": 0.10728372633457184,
      "learning_rate": 0.00018030611542822257,
      "loss": 0.0222,
      "step": 1020
    },
    {
      "epoch": 15.373134328358208,
      "grad_norm": 0.06331785768270493,
      "learning_rate": 0.00017992974187328305,
      "loss": 0.0217,
      "step": 1030
    },
    {
      "epoch": 15.522388059701493,
      "grad_norm": 0.21516793966293335,
      "learning_rate": 0.000179550206508559,
      "loss": 0.022,
      "step": 1040
    },
    {
      "epoch": 15.671641791044776,
      "grad_norm": 0.06852179765701294,
      "learning_rate": 0.00017916752434746856,
      "loss": 0.0213,
      "step": 1050
    },
    {
      "epoch": 15.82089552238806,
      "grad_norm": 0.07877414673566818,
      "learning_rate": 0.00017878171052790868,
      "loss": 0.0244,
      "step": 1060
    },
    {
      "epoch": 15.970149253731343,
      "grad_norm": 0.28540125489234924,
      "learning_rate": 0.00017839278031165658,
      "loss": 0.0244,
      "step": 1070
    },
    {
      "epoch": 16.119402985074625,
      "grad_norm": 0.060050830245018005,
      "learning_rate": 0.00017800074908376584,
      "loss": 0.0197,
      "step": 1080
    },
    {
      "epoch": 16.26865671641791,
      "grad_norm": 0.14403647184371948,
      "learning_rate": 0.0001776056323519579,
      "loss": 0.0197,
      "step": 1090
    },
    {
      "epoch": 16.417910447761194,
      "grad_norm": 0.14129780232906342,
      "learning_rate": 0.00017720744574600863,
      "loss": 0.0235,
      "step": 1100
    },
    {
      "epoch": 16.567164179104477,
      "grad_norm": 0.14378951489925385,
      "learning_rate": 0.00017680620501712996,
      "loss": 0.0213,
      "step": 1110
    },
    {
      "epoch": 16.71641791044776,
      "grad_norm": 0.09062501788139343,
      "learning_rate": 0.00017640192603734692,
      "loss": 0.0247,
      "step": 1120
    },
    {
      "epoch": 16.865671641791046,
      "grad_norm": 0.33645904064178467,
      "learning_rate": 0.00017599462479886974,
      "loss": 0.0236,
      "step": 1130
    },
    {
      "epoch": 17.01492537313433,
      "grad_norm": 0.10567416250705719,
      "learning_rate": 0.00017558431741346122,
      "loss": 0.0227,
      "step": 1140
    },
    {
      "epoch": 17.16417910447761,
      "grad_norm": 0.3064155578613281,
      "learning_rate": 0.00017517102011179933,
      "loss": 0.0238,
      "step": 1150
    },
    {
      "epoch": 17.313432835820894,
      "grad_norm": 0.256085067987442,
      "learning_rate": 0.00017475474924283536,
      "loss": 0.021,
      "step": 1160
    },
    {
      "epoch": 17.46268656716418,
      "grad_norm": 0.8420864343643188,
      "learning_rate": 0.000174335521273147,
      "loss": 0.0241,
      "step": 1170
    },
    {
      "epoch": 17.611940298507463,
      "grad_norm": 0.21292446553707123,
      "learning_rate": 0.00017391335278628712,
      "loss": 0.0251,
      "step": 1180
    },
    {
      "epoch": 17.761194029850746,
      "grad_norm": 0.14111167192459106,
      "learning_rate": 0.0001734882604821276,
      "loss": 0.0251,
      "step": 1190
    },
    {
      "epoch": 17.91044776119403,
      "grad_norm": 0.2401103377342224,
      "learning_rate": 0.00017306026117619889,
      "loss": 0.0294,
      "step": 1200
    },
    {
      "epoch": 18.059701492537314,
      "grad_norm": 0.19608861207962036,
      "learning_rate": 0.00017262937179902472,
      "loss": 0.025,
      "step": 1210
    },
    {
      "epoch": 18.208955223880597,
      "grad_norm": 0.10760471224784851,
      "learning_rate": 0.00017219560939545246,
      "loss": 0.0252,
      "step": 1220
    },
    {
      "epoch": 18.35820895522388,
      "grad_norm": 0.1591477394104004,
      "learning_rate": 0.0001717589911239788,
      "loss": 0.0225,
      "step": 1230
    },
    {
      "epoch": 18.507462686567163,
      "grad_norm": 0.21945133805274963,
      "learning_rate": 0.00017131953425607104,
      "loss": 0.0234,
      "step": 1240
    },
    {
      "epoch": 18.65671641791045,
      "grad_norm": 0.147572860121727,
      "learning_rate": 0.00017087725617548385,
      "loss": 0.0216,
      "step": 1250
    },
    {
      "epoch": 18.80597014925373,
      "grad_norm": 0.19251006841659546,
      "learning_rate": 0.00017043217437757164,
      "loss": 0.0258,
      "step": 1260
    },
    {
      "epoch": 18.955223880597014,
      "grad_norm": 0.3675619959831238,
      "learning_rate": 0.00016998430646859654,
      "loss": 0.0263,
      "step": 1270
    },
    {
      "epoch": 19.104477611940297,
      "grad_norm": 0.2582189440727234,
      "learning_rate": 0.00016953367016503182,
      "loss": 0.0258,
      "step": 1280
    },
    {
      "epoch": 19.253731343283583,
      "grad_norm": 0.22689688205718994,
      "learning_rate": 0.00016908028329286112,
      "loss": 0.0226,
      "step": 1290
    },
    {
      "epoch": 19.402985074626866,
      "grad_norm": 0.24671746790409088,
      "learning_rate": 0.0001686241637868734,
      "loss": 0.0224,
      "step": 1300
    },
    {
      "epoch": 19.55223880597015,
      "grad_norm": 0.20800915360450745,
      "learning_rate": 0.00016816532968995328,
      "loss": 0.0276,
      "step": 1310
    },
    {
      "epoch": 19.701492537313435,
      "grad_norm": 0.27109411358833313,
      "learning_rate": 0.00016770379915236766,
      "loss": 0.0345,
      "step": 1320
    },
    {
      "epoch": 19.850746268656717,
      "grad_norm": 0.14396627247333527,
      "learning_rate": 0.00016723959043104728,
      "loss": 0.0244,
      "step": 1330
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.3481923043727875,
      "learning_rate": 0.00016677272188886483,
      "loss": 0.0256,
      "step": 1340
    },
    {
      "epoch": 20.149253731343283,
      "grad_norm": 0.4062567353248596,
      "learning_rate": 0.00016630321199390867,
      "loss": 0.0195,
      "step": 1350
    },
    {
      "epoch": 20.298507462686565,
      "grad_norm": 0.41222649812698364,
      "learning_rate": 0.00016583107931875192,
      "loss": 0.023,
      "step": 1360
    },
    {
      "epoch": 20.44776119402985,
      "grad_norm": 0.15970966219902039,
      "learning_rate": 0.00016535634253971794,
      "loss": 0.024,
      "step": 1370
    },
    {
      "epoch": 20.597014925373134,
      "grad_norm": 0.2472175806760788,
      "learning_rate": 0.00016487902043614173,
      "loss": 0.0259,
      "step": 1380
    },
    {
      "epoch": 20.746268656716417,
      "grad_norm": 0.30030208826065063,
      "learning_rate": 0.00016439913188962685,
      "loss": 0.0287,
      "step": 1390
    },
    {
      "epoch": 20.895522388059703,
      "grad_norm": 0.17114506661891937,
      "learning_rate": 0.0001639166958832985,
      "loss": 0.0279,
      "step": 1400
    },
    {
      "epoch": 21.044776119402986,
      "grad_norm": 0.22200174629688263,
      "learning_rate": 0.00016343173150105278,
      "loss": 0.0254,
      "step": 1410
    },
    {
      "epoch": 21.19402985074627,
      "grad_norm": 0.3260650336742401,
      "learning_rate": 0.0001629442579268016,
      "loss": 0.0252,
      "step": 1420
    },
    {
      "epoch": 21.34328358208955,
      "grad_norm": 0.2409895658493042,
      "learning_rate": 0.0001624542944437139,
      "loss": 0.0255,
      "step": 1430
    },
    {
      "epoch": 21.492537313432837,
      "grad_norm": 0.37012478709220886,
      "learning_rate": 0.00016196186043345288,
      "loss": 0.0293,
      "step": 1440
    },
    {
      "epoch": 21.64179104477612,
      "grad_norm": 0.1385307013988495,
      "learning_rate": 0.00016146697537540924,
      "loss": 0.0273,
      "step": 1450
    },
    {
      "epoch": 21.791044776119403,
      "grad_norm": 0.33346429467201233,
      "learning_rate": 0.0001609696588459307,
      "loss": 0.0294,
      "step": 1460
    },
    {
      "epoch": 21.940298507462686,
      "grad_norm": 0.14864549040794373,
      "learning_rate": 0.00016046993051754756,
      "loss": 0.0269,
      "step": 1470
    },
    {
      "epoch": 22.08955223880597,
      "grad_norm": 0.11181981861591339,
      "learning_rate": 0.0001599678101581945,
      "loss": 0.022,
      "step": 1480
    },
    {
      "epoch": 22.238805970149254,
      "grad_norm": 0.18521477282047272,
      "learning_rate": 0.00015946331763042867,
      "loss": 0.0189,
      "step": 1490
    },
    {
      "epoch": 22.388059701492537,
      "grad_norm": 0.07250799983739853,
      "learning_rate": 0.00015895647289064396,
      "loss": 0.0191,
      "step": 1500
    },
    {
      "epoch": 22.388059701492537,
      "eval_loss": 2.901611566543579,
      "eval_runtime": 34.7441,
      "eval_samples_per_second": 9.124,
      "eval_steps_per_second": 4.576,
      "step": 1500
    },
    {
      "epoch": 22.53731343283582,
      "grad_norm": 0.11993291229009628,
      "learning_rate": 0.0001584472959882815,
      "loss": 0.0232,
      "step": 1510
    },
    {
      "epoch": 22.686567164179106,
      "grad_norm": 0.08708442002534866,
      "learning_rate": 0.0001579358070650367,
      "loss": 0.0203,
      "step": 1520
    },
    {
      "epoch": 22.83582089552239,
      "grad_norm": 0.17798146605491638,
      "learning_rate": 0.00015742202635406235,
      "loss": 0.0278,
      "step": 1530
    },
    {
      "epoch": 22.98507462686567,
      "grad_norm": 0.33858776092529297,
      "learning_rate": 0.0001569059741791684,
      "loss": 0.0266,
      "step": 1540
    },
    {
      "epoch": 23.134328358208954,
      "grad_norm": 0.14069178700447083,
      "learning_rate": 0.0001563876709540178,
      "loss": 0.0235,
      "step": 1550
    },
    {
      "epoch": 23.28358208955224,
      "grad_norm": 0.23493413627147675,
      "learning_rate": 0.00015586713718131922,
      "loss": 0.0225,
      "step": 1560
    },
    {
      "epoch": 23.432835820895523,
      "grad_norm": 0.23869164288043976,
      "learning_rate": 0.0001553443934520159,
      "loss": 0.0207,
      "step": 1570
    },
    {
      "epoch": 23.582089552238806,
      "grad_norm": 0.04802766814827919,
      "learning_rate": 0.00015481946044447099,
      "loss": 0.0208,
      "step": 1580
    },
    {
      "epoch": 23.73134328358209,
      "grad_norm": 0.10397443175315857,
      "learning_rate": 0.00015429235892364994,
      "loss": 0.0218,
      "step": 1590
    },
    {
      "epoch": 23.880597014925375,
      "grad_norm": 0.06602863222360611,
      "learning_rate": 0.00015376310974029873,
      "loss": 0.0191,
      "step": 1600
    },
    {
      "epoch": 24.029850746268657,
      "grad_norm": 0.12586766481399536,
      "learning_rate": 0.0001532317338301192,
      "loss": 0.0222,
      "step": 1610
    },
    {
      "epoch": 24.17910447761194,
      "grad_norm": 0.058740176260471344,
      "learning_rate": 0.00015269825221294098,
      "loss": 0.0194,
      "step": 1620
    },
    {
      "epoch": 24.328358208955223,
      "grad_norm": 0.046583324670791626,
      "learning_rate": 0.0001521626859918898,
      "loss": 0.0192,
      "step": 1630
    },
    {
      "epoch": 24.47761194029851,
      "grad_norm": 0.07122834771871567,
      "learning_rate": 0.00015162505635255287,
      "loss": 0.0191,
      "step": 1640
    },
    {
      "epoch": 24.62686567164179,
      "grad_norm": 0.17352773249149323,
      "learning_rate": 0.0001510853845621409,
      "loss": 0.0215,
      "step": 1650
    },
    {
      "epoch": 24.776119402985074,
      "grad_norm": 0.039071228355169296,
      "learning_rate": 0.00015054369196864644,
      "loss": 0.0194,
      "step": 1660
    },
    {
      "epoch": 24.925373134328357,
      "grad_norm": 0.0495145358145237,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.0196,
      "step": 1670
    },
    {
      "epoch": 25.074626865671643,
      "grad_norm": 0.06830393522977829,
      "learning_rate": 0.0001494543301632219,
      "loss": 0.0196,
      "step": 1680
    },
    {
      "epoch": 25.223880597014926,
      "grad_norm": 0.046369269490242004,
      "learning_rate": 0.0001489067040435717,
      "loss": 0.0203,
      "step": 1690
    },
    {
      "epoch": 25.37313432835821,
      "grad_norm": 0.08401994407176971,
      "learning_rate": 0.00014835714330369446,
      "loss": 0.0177,
      "step": 1700
    },
    {
      "epoch": 25.52238805970149,
      "grad_norm": 0.15754491090774536,
      "learning_rate": 0.0001478056696827636,
      "loss": 0.0183,
      "step": 1710
    },
    {
      "epoch": 25.671641791044777,
      "grad_norm": 0.03660280629992485,
      "learning_rate": 0.00014725230499562119,
      "loss": 0.0187,
      "step": 1720
    },
    {
      "epoch": 25.82089552238806,
      "grad_norm": 0.03743986785411835,
      "learning_rate": 0.00014669707113191483,
      "loss": 0.0177,
      "step": 1730
    },
    {
      "epoch": 25.970149253731343,
      "grad_norm": 0.061036206781864166,
      "learning_rate": 0.00014613999005523174,
      "loss": 0.0216,
      "step": 1740
    },
    {
      "epoch": 26.119402985074625,
      "grad_norm": 0.052410393953323364,
      "learning_rate": 0.00014558108380223012,
      "loss": 0.0179,
      "step": 1750
    },
    {
      "epoch": 26.26865671641791,
      "grad_norm": 0.04502878338098526,
      "learning_rate": 0.00014502037448176734,
      "loss": 0.0173,
      "step": 1760
    },
    {
      "epoch": 26.417910447761194,
      "grad_norm": 0.09054642915725708,
      "learning_rate": 0.00014445788427402528,
      "loss": 0.0175,
      "step": 1770
    },
    {
      "epoch": 26.567164179104477,
      "grad_norm": 0.04439815133810043,
      "learning_rate": 0.00014389363542963306,
      "loss": 0.0166,
      "step": 1780
    },
    {
      "epoch": 26.71641791044776,
      "grad_norm": 0.057578206062316895,
      "learning_rate": 0.00014332765026878687,
      "loss": 0.0204,
      "step": 1790
    },
    {
      "epoch": 26.865671641791046,
      "grad_norm": 0.04316519573330879,
      "learning_rate": 0.00014275995118036693,
      "loss": 0.0204,
      "step": 1800
    },
    {
      "epoch": 27.01492537313433,
      "grad_norm": 0.044371455907821655,
      "learning_rate": 0.00014219056062105193,
      "loss": 0.0188,
      "step": 1810
    },
    {
      "epoch": 27.16417910447761,
      "grad_norm": 0.0417649932205677,
      "learning_rate": 0.00014161950111443077,
      "loss": 0.0167,
      "step": 1820
    },
    {
      "epoch": 27.313432835820894,
      "grad_norm": 0.08106902241706848,
      "learning_rate": 0.0001410467952501114,
      "loss": 0.0196,
      "step": 1830
    },
    {
      "epoch": 27.46268656716418,
      "grad_norm": 0.1356726437807083,
      "learning_rate": 0.00014047246568282736,
      "loss": 0.0214,
      "step": 1840
    },
    {
      "epoch": 27.611940298507463,
      "grad_norm": 0.11032121628522873,
      "learning_rate": 0.00013989653513154165,
      "loss": 0.0188,
      "step": 1850
    },
    {
      "epoch": 27.761194029850746,
      "grad_norm": 0.1375930905342102,
      "learning_rate": 0.0001393190263785479,
      "loss": 0.0217,
      "step": 1860
    },
    {
      "epoch": 27.91044776119403,
      "grad_norm": 0.12069140374660492,
      "learning_rate": 0.00013873996226856933,
      "loss": 0.0198,
      "step": 1870
    },
    {
      "epoch": 28.059701492537314,
      "grad_norm": 0.043185122311115265,
      "learning_rate": 0.00013815936570785487,
      "loss": 0.0182,
      "step": 1880
    },
    {
      "epoch": 28.208955223880597,
      "grad_norm": 0.14993228018283844,
      "learning_rate": 0.00013757725966327322,
      "loss": 0.0167,
      "step": 1890
    },
    {
      "epoch": 28.35820895522388,
      "grad_norm": 0.0337139368057251,
      "learning_rate": 0.00013699366716140435,
      "loss": 0.0186,
      "step": 1900
    },
    {
      "epoch": 28.507462686567163,
      "grad_norm": 0.26397907733917236,
      "learning_rate": 0.0001364086112876284,
      "loss": 0.0186,
      "step": 1910
    },
    {
      "epoch": 28.65671641791045,
      "grad_norm": 0.06083005666732788,
      "learning_rate": 0.00013582211518521273,
      "loss": 0.0195,
      "step": 1920
    },
    {
      "epoch": 28.80597014925373,
      "grad_norm": 0.10358510911464691,
      "learning_rate": 0.00013523420205439646,
      "loss": 0.0186,
      "step": 1930
    },
    {
      "epoch": 28.955223880597014,
      "grad_norm": 0.06939724832773209,
      "learning_rate": 0.00013464489515147238,
      "loss": 0.0201,
      "step": 1940
    },
    {
      "epoch": 29.104477611940297,
      "grad_norm": 0.038348495960235596,
      "learning_rate": 0.00013405421778786737,
      "loss": 0.0198,
      "step": 1950
    },
    {
      "epoch": 29.253731343283583,
      "grad_norm": 0.045545510947704315,
      "learning_rate": 0.00013346219332922016,
      "loss": 0.0199,
      "step": 1960
    },
    {
      "epoch": 29.402985074626866,
      "grad_norm": 0.03619709983468056,
      "learning_rate": 0.0001328688451944569,
      "loss": 0.018,
      "step": 1970
    },
    {
      "epoch": 29.55223880597015,
      "grad_norm": 0.03463749587535858,
      "learning_rate": 0.00013227419685486492,
      "loss": 0.0192,
      "step": 1980
    },
    {
      "epoch": 29.701492537313435,
      "grad_norm": 0.16788923740386963,
      "learning_rate": 0.0001316782718331643,
      "loss": 0.0209,
      "step": 1990
    },
    {
      "epoch": 29.850746268656717,
      "grad_norm": 0.04790572449564934,
      "learning_rate": 0.00013108109370257712,
      "loss": 0.0211,
      "step": 2000
    },
    {
      "epoch": 29.850746268656717,
      "eval_loss": 2.8224031925201416,
      "eval_runtime": 34.7119,
      "eval_samples_per_second": 9.132,
      "eval_steps_per_second": 4.581,
      "step": 2000
    },
    {
      "epoch": 30.0,
      "grad_norm": 0.33791643381118774,
      "learning_rate": 0.00013048268608589533,
      "loss": 0.0198,
      "step": 2010
    },
    {
      "epoch": 30.149253731343283,
      "grad_norm": 0.04068003594875336,
      "learning_rate": 0.00012988307265454597,
      "loss": 0.0192,
      "step": 2020
    },
    {
      "epoch": 30.298507462686565,
      "grad_norm": 0.05943215265870094,
      "learning_rate": 0.00012928227712765504,
      "loss": 0.0181,
      "step": 2030
    },
    {
      "epoch": 30.44776119402985,
      "grad_norm": 0.18460267782211304,
      "learning_rate": 0.00012868032327110904,
      "loss": 0.0208,
      "step": 2040
    },
    {
      "epoch": 30.597014925373134,
      "grad_norm": 0.061664972454309464,
      "learning_rate": 0.00012807723489661495,
      "loss": 0.0205,
      "step": 2050
    },
    {
      "epoch": 30.746268656716417,
      "grad_norm": 0.36015012860298157,
      "learning_rate": 0.0001274730358607583,
      "loss": 0.0184,
      "step": 2060
    },
    {
      "epoch": 30.895522388059703,
      "grad_norm": 0.1974068284034729,
      "learning_rate": 0.00012686775006405946,
      "loss": 0.0196,
      "step": 2070
    },
    {
      "epoch": 31.044776119402986,
      "grad_norm": 0.12781082093715668,
      "learning_rate": 0.0001262614014500282,
      "loss": 0.021,
      "step": 2080
    },
    {
      "epoch": 31.19402985074627,
      "grad_norm": 0.22529159486293793,
      "learning_rate": 0.00012565401400421651,
      "loss": 0.018,
      "step": 2090
    },
    {
      "epoch": 31.34328358208955,
      "grad_norm": 0.0831318125128746,
      "learning_rate": 0.00012504561175326985,
      "loss": 0.0194,
      "step": 2100
    },
    {
      "epoch": 31.492537313432837,
      "grad_norm": 0.07306008040904999,
      "learning_rate": 0.0001244362187639767,
      "loss": 0.0183,
      "step": 2110
    },
    {
      "epoch": 31.64179104477612,
      "grad_norm": 0.08519799262285233,
      "learning_rate": 0.0001238258591423165,
      "loss": 0.0184,
      "step": 2120
    },
    {
      "epoch": 31.791044776119403,
      "grad_norm": 0.061566926538944244,
      "learning_rate": 0.00012321455703250616,
      "loss": 0.0198,
      "step": 2130
    },
    {
      "epoch": 31.940298507462686,
      "grad_norm": 0.04921621084213257,
      "learning_rate": 0.0001226023366160449,
      "loss": 0.0192,
      "step": 2140
    },
    {
      "epoch": 32.08955223880597,
      "grad_norm": 0.0568234883248806,
      "learning_rate": 0.00012198922211075778,
      "loss": 0.0186,
      "step": 2150
    },
    {
      "epoch": 32.23880597014925,
      "grad_norm": 0.09815705567598343,
      "learning_rate": 0.00012137523776983757,
      "loss": 0.0175,
      "step": 2160
    },
    {
      "epoch": 32.38805970149254,
      "grad_norm": 0.18607860803604126,
      "learning_rate": 0.00012076040788088554,
      "loss": 0.0178,
      "step": 2170
    },
    {
      "epoch": 32.53731343283582,
      "grad_norm": 0.1101093739271164,
      "learning_rate": 0.00012014475676495052,
      "loss": 0.0179,
      "step": 2180
    },
    {
      "epoch": 32.6865671641791,
      "grad_norm": 0.03449343517422676,
      "learning_rate": 0.000119528308775567,
      "loss": 0.0206,
      "step": 2190
    },
    {
      "epoch": 32.83582089552239,
      "grad_norm": 0.042853228747844696,
      "learning_rate": 0.00011891108829779165,
      "loss": 0.0191,
      "step": 2200
    },
    {
      "epoch": 32.985074626865675,
      "grad_norm": 0.048137255012989044,
      "learning_rate": 0.00011829311974723867,
      "loss": 0.0182,
      "step": 2210
    },
    {
      "epoch": 33.134328358208954,
      "grad_norm": 0.19318771362304688,
      "learning_rate": 0.00011767442756911417,
      "loss": 0.0178,
      "step": 2220
    },
    {
      "epoch": 33.28358208955224,
      "grad_norm": 0.08087719976902008,
      "learning_rate": 0.00011705503623724898,
      "loss": 0.0177,
      "step": 2230
    },
    {
      "epoch": 33.43283582089552,
      "grad_norm": 0.04939524829387665,
      "learning_rate": 0.00011643497025313061,
      "loss": 0.0194,
      "step": 2240
    },
    {
      "epoch": 33.582089552238806,
      "grad_norm": 0.04750213399529457,
      "learning_rate": 0.0001158142541449341,
      "loss": 0.0183,
      "step": 2250
    },
    {
      "epoch": 33.73134328358209,
      "grad_norm": 0.04823266714811325,
      "learning_rate": 0.0001151929124665516,
      "loss": 0.0172,
      "step": 2260
    },
    {
      "epoch": 33.88059701492537,
      "grad_norm": 0.03645576909184456,
      "learning_rate": 0.00011457096979662114,
      "loss": 0.018,
      "step": 2270
    },
    {
      "epoch": 34.02985074626866,
      "grad_norm": 0.025920415297150612,
      "learning_rate": 0.00011394845073755455,
      "loss": 0.0178,
      "step": 2280
    },
    {
      "epoch": 34.17910447761194,
      "grad_norm": 0.038914065808057785,
      "learning_rate": 0.00011332537991456398,
      "loss": 0.0168,
      "step": 2290
    },
    {
      "epoch": 34.32835820895522,
      "grad_norm": 0.07641326636075974,
      "learning_rate": 0.00011270178197468789,
      "loss": 0.0194,
      "step": 2300
    },
    {
      "epoch": 34.47761194029851,
      "grad_norm": 0.08722823858261108,
      "learning_rate": 0.00011207768158581613,
      "loss": 0.0199,
      "step": 2310
    },
    {
      "epoch": 34.62686567164179,
      "grad_norm": 0.4223034679889679,
      "learning_rate": 0.00011145310343571411,
      "loss": 0.02,
      "step": 2320
    },
    {
      "epoch": 34.776119402985074,
      "grad_norm": 0.12001374363899231,
      "learning_rate": 0.0001108280722310462,
      "loss": 0.0173,
      "step": 2330
    },
    {
      "epoch": 34.92537313432836,
      "grad_norm": 0.14997516572475433,
      "learning_rate": 0.00011020261269639842,
      "loss": 0.0195,
      "step": 2340
    },
    {
      "epoch": 35.07462686567164,
      "grad_norm": 0.052674125880002975,
      "learning_rate": 0.00010957674957330042,
      "loss": 0.018,
      "step": 2350
    },
    {
      "epoch": 35.223880597014926,
      "grad_norm": 0.06538953632116318,
      "learning_rate": 0.00010895050761924668,
      "loss": 0.0168,
      "step": 2360
    },
    {
      "epoch": 35.37313432835821,
      "grad_norm": 0.08561014384031296,
      "learning_rate": 0.00010832391160671729,
      "loss": 0.0177,
      "step": 2370
    },
    {
      "epoch": 35.52238805970149,
      "grad_norm": 0.057601574808359146,
      "learning_rate": 0.00010769698632219794,
      "loss": 0.0176,
      "step": 2380
    },
    {
      "epoch": 35.67164179104478,
      "grad_norm": 0.051563702523708344,
      "learning_rate": 0.00010706975656519946,
      "loss": 0.0196,
      "step": 2390
    },
    {
      "epoch": 35.82089552238806,
      "grad_norm": 0.034905366599559784,
      "learning_rate": 0.00010644224714727681,
      "loss": 0.0183,
      "step": 2400
    },
    {
      "epoch": 35.97014925373134,
      "grad_norm": 0.053671374917030334,
      "learning_rate": 0.00010581448289104758,
      "loss": 0.0179,
      "step": 2410
    },
    {
      "epoch": 36.11940298507463,
      "grad_norm": 0.06900997459888458,
      "learning_rate": 0.00010518648862921012,
      "loss": 0.0187,
      "step": 2420
    },
    {
      "epoch": 36.26865671641791,
      "grad_norm": 0.12105145305395126,
      "learning_rate": 0.00010455828920356115,
      "loss": 0.0164,
      "step": 2430
    },
    {
      "epoch": 36.417910447761194,
      "grad_norm": 0.04216486215591431,
      "learning_rate": 0.00010392990946401313,
      "loss": 0.0187,
      "step": 2440
    },
    {
      "epoch": 36.56716417910448,
      "grad_norm": 0.03389362245798111,
      "learning_rate": 0.00010330137426761135,
      "loss": 0.02,
      "step": 2450
    },
    {
      "epoch": 36.71641791044776,
      "grad_norm": 0.06512041389942169,
      "learning_rate": 0.00010267270847755048,
      "loss": 0.0181,
      "step": 2460
    },
    {
      "epoch": 36.865671641791046,
      "grad_norm": 0.052388746291399,
      "learning_rate": 0.00010204393696219117,
      "loss": 0.0171,
      "step": 2470
    },
    {
      "epoch": 37.014925373134325,
      "grad_norm": 0.16947126388549805,
      "learning_rate": 0.00010141508459407623,
      "loss": 0.0205,
      "step": 2480
    },
    {
      "epoch": 37.16417910447761,
      "grad_norm": 0.03826919198036194,
      "learning_rate": 0.00010078617624894684,
      "loss": 0.0168,
      "step": 2490
    },
    {
      "epoch": 37.3134328358209,
      "grad_norm": 0.033569592982530594,
      "learning_rate": 0.00010015723680475846,
      "loss": 0.0162,
      "step": 2500
    },
    {
      "epoch": 37.3134328358209,
      "eval_loss": 2.9721479415893555,
      "eval_runtime": 34.6726,
      "eval_samples_per_second": 9.143,
      "eval_steps_per_second": 4.586,
      "step": 2500
    },
    {
      "epoch": 37.46268656716418,
      "grad_norm": 0.03183290734887123,
      "learning_rate": 9.95282911406968e-05,
      "loss": 0.0175,
      "step": 2510
    },
    {
      "epoch": 37.61194029850746,
      "grad_norm": 0.03882099688053131,
      "learning_rate": 9.889936413619356e-05,
      "loss": 0.0162,
      "step": 2520
    },
    {
      "epoch": 37.76119402985075,
      "grad_norm": 0.08923082053661346,
      "learning_rate": 9.827048066994225e-05,
      "loss": 0.0206,
      "step": 2530
    },
    {
      "epoch": 37.91044776119403,
      "grad_norm": 0.07512518018484116,
      "learning_rate": 9.764166561891432e-05,
      "loss": 0.018,
      "step": 2540
    },
    {
      "epoch": 38.059701492537314,
      "grad_norm": 0.07171567529439926,
      "learning_rate": 9.70129438573747e-05,
      "loss": 0.0189,
      "step": 2550
    },
    {
      "epoch": 38.208955223880594,
      "grad_norm": 0.041823286563158035,
      "learning_rate": 9.63843402558981e-05,
      "loss": 0.0182,
      "step": 2560
    },
    {
      "epoch": 38.35820895522388,
      "grad_norm": 0.042579639703035355,
      "learning_rate": 9.57558796803852e-05,
      "loss": 0.015,
      "step": 2570
    },
    {
      "epoch": 38.507462686567166,
      "grad_norm": 0.032053157687187195,
      "learning_rate": 9.512758699107879e-05,
      "loss": 0.0192,
      "step": 2580
    },
    {
      "epoch": 38.656716417910445,
      "grad_norm": 0.03747331723570824,
      "learning_rate": 9.449948704158071e-05,
      "loss": 0.0185,
      "step": 2590
    },
    {
      "epoch": 38.80597014925373,
      "grad_norm": 0.03378698602318764,
      "learning_rate": 9.38716046778684e-05,
      "loss": 0.0177,
      "step": 2600
    },
    {
      "epoch": 38.95522388059702,
      "grad_norm": 0.0920565128326416,
      "learning_rate": 9.324396473731217e-05,
      "loss": 0.0186,
      "step": 2610
    },
    {
      "epoch": 39.1044776119403,
      "grad_norm": 0.10099617391824722,
      "learning_rate": 9.261659204769284e-05,
      "loss": 0.017,
      "step": 2620
    },
    {
      "epoch": 39.25373134328358,
      "grad_norm": 0.040173519402742386,
      "learning_rate": 9.198951142621929e-05,
      "loss": 0.0175,
      "step": 2630
    },
    {
      "epoch": 39.40298507462686,
      "grad_norm": 0.04502606391906738,
      "learning_rate": 9.136274767854716e-05,
      "loss": 0.0181,
      "step": 2640
    },
    {
      "epoch": 39.55223880597015,
      "grad_norm": 0.039172179996967316,
      "learning_rate": 9.07363255977973e-05,
      "loss": 0.018,
      "step": 2650
    },
    {
      "epoch": 39.701492537313435,
      "grad_norm": 0.05952875688672066,
      "learning_rate": 9.011026996357503e-05,
      "loss": 0.0171,
      "step": 2660
    },
    {
      "epoch": 39.850746268656714,
      "grad_norm": 0.03593125194311142,
      "learning_rate": 8.948460554099018e-05,
      "loss": 0.017,
      "step": 2670
    },
    {
      "epoch": 40.0,
      "grad_norm": 0.041005708277225494,
      "learning_rate": 8.885935707967716e-05,
      "loss": 0.0164,
      "step": 2680
    },
    {
      "epoch": 40.149253731343286,
      "grad_norm": 0.03663647174835205,
      "learning_rate": 8.823454931281616e-05,
      "loss": 0.0177,
      "step": 2690
    },
    {
      "epoch": 40.298507462686565,
      "grad_norm": 0.034003522247076035,
      "learning_rate": 8.76102069561545e-05,
      "loss": 0.0166,
      "step": 2700
    },
    {
      "epoch": 40.44776119402985,
      "grad_norm": 0.06096246466040611,
      "learning_rate": 8.698635470702923e-05,
      "loss": 0.0166,
      "step": 2710
    },
    {
      "epoch": 40.59701492537313,
      "grad_norm": 0.03656260296702385,
      "learning_rate": 8.636301724339004e-05,
      "loss": 0.0162,
      "step": 2720
    },
    {
      "epoch": 40.74626865671642,
      "grad_norm": 0.03990943357348442,
      "learning_rate": 8.574021922282292e-05,
      "loss": 0.018,
      "step": 2730
    },
    {
      "epoch": 40.8955223880597,
      "grad_norm": 0.03584331274032593,
      "learning_rate": 8.511798528157512e-05,
      "loss": 0.0184,
      "step": 2740
    },
    {
      "epoch": 41.04477611940298,
      "grad_norm": 0.027507085353136063,
      "learning_rate": 8.449634003358022e-05,
      "loss": 0.0163,
      "step": 2750
    },
    {
      "epoch": 41.19402985074627,
      "grad_norm": 0.03266240283846855,
      "learning_rate": 8.387530806948476e-05,
      "loss": 0.0175,
      "step": 2760
    },
    {
      "epoch": 41.343283582089555,
      "grad_norm": 0.041966021060943604,
      "learning_rate": 8.325491395567541e-05,
      "loss": 0.0175,
      "step": 2770
    },
    {
      "epoch": 41.492537313432834,
      "grad_norm": 0.03868953138589859,
      "learning_rate": 8.263518223330697e-05,
      "loss": 0.0179,
      "step": 2780
    },
    {
      "epoch": 41.64179104477612,
      "grad_norm": 0.03475317731499672,
      "learning_rate": 8.201613741733203e-05,
      "loss": 0.0159,
      "step": 2790
    },
    {
      "epoch": 41.791044776119406,
      "grad_norm": 0.04897564649581909,
      "learning_rate": 8.13978039955308e-05,
      "loss": 0.0183,
      "step": 2800
    },
    {
      "epoch": 41.940298507462686,
      "grad_norm": 0.03372865915298462,
      "learning_rate": 8.078020642754274e-05,
      "loss": 0.0169,
      "step": 2810
    },
    {
      "epoch": 42.08955223880597,
      "grad_norm": 0.046989791095256805,
      "learning_rate": 8.016336914389874e-05,
      "loss": 0.0174,
      "step": 2820
    },
    {
      "epoch": 42.23880597014925,
      "grad_norm": 0.03214934468269348,
      "learning_rate": 7.954731654505491e-05,
      "loss": 0.0146,
      "step": 2830
    },
    {
      "epoch": 42.38805970149254,
      "grad_norm": 0.05004828795790672,
      "learning_rate": 7.89320730004274e-05,
      "loss": 0.0182,
      "step": 2840
    },
    {
      "epoch": 42.53731343283582,
      "grad_norm": 0.031027644872665405,
      "learning_rate": 7.831766284742807e-05,
      "loss": 0.0156,
      "step": 2850
    },
    {
      "epoch": 42.6865671641791,
      "grad_norm": 0.04323369264602661,
      "learning_rate": 7.77041103905023e-05,
      "loss": 0.0177,
      "step": 2860
    },
    {
      "epoch": 42.83582089552239,
      "grad_norm": 0.03035310097038746,
      "learning_rate": 7.709143990016702e-05,
      "loss": 0.0164,
      "step": 2870
    },
    {
      "epoch": 42.985074626865675,
      "grad_norm": 0.03815029561519623,
      "learning_rate": 7.6479675612051e-05,
      "loss": 0.0205,
      "step": 2880
    },
    {
      "epoch": 43.134328358208954,
      "grad_norm": 0.039352428168058395,
      "learning_rate": 7.586884172593609e-05,
      "loss": 0.0169,
      "step": 2890
    },
    {
      "epoch": 43.28358208955224,
      "grad_norm": 0.031735971570014954,
      "learning_rate": 7.525896240479976e-05,
      "loss": 0.0159,
      "step": 2900
    },
    {
      "epoch": 43.43283582089552,
      "grad_norm": 0.03945886343717575,
      "learning_rate": 7.465006177385953e-05,
      "loss": 0.0166,
      "step": 2910
    },
    {
      "epoch": 43.582089552238806,
      "grad_norm": 0.030156582593917847,
      "learning_rate": 7.404216391961847e-05,
      "loss": 0.0176,
      "step": 2920
    },
    {
      "epoch": 43.73134328358209,
      "grad_norm": 0.03693369776010513,
      "learning_rate": 7.343529288891239e-05,
      "loss": 0.0159,
      "step": 2930
    },
    {
      "epoch": 43.88059701492537,
      "grad_norm": 0.03338786959648132,
      "learning_rate": 7.282947268795877e-05,
      "loss": 0.0171,
      "step": 2940
    },
    {
      "epoch": 44.02985074626866,
      "grad_norm": 0.021122202277183533,
      "learning_rate": 7.222472728140695e-05,
      "loss": 0.0178,
      "step": 2950
    },
    {
      "epoch": 44.17910447761194,
      "grad_norm": 0.02877069264650345,
      "learning_rate": 7.162108059139032e-05,
      "loss": 0.0186,
      "step": 2960
    },
    {
      "epoch": 44.32835820895522,
      "grad_norm": 0.07340731471776962,
      "learning_rate": 7.101855649657991e-05,
      "loss": 0.0187,
      "step": 2970
    },
    {
      "epoch": 44.47761194029851,
      "grad_norm": 0.04397398233413696,
      "learning_rate": 7.041717883123977e-05,
      "loss": 0.0169,
      "step": 2980
    },
    {
      "epoch": 44.62686567164179,
      "grad_norm": 0.03213610127568245,
      "learning_rate": 6.981697138428434e-05,
      "loss": 0.017,
      "step": 2990
    },
    {
      "epoch": 44.776119402985074,
      "grad_norm": 0.024713682010769844,
      "learning_rate": 6.921795789833723e-05,
      "loss": 0.016,
      "step": 3000
    },
    {
      "epoch": 44.776119402985074,
      "eval_loss": 3.2444493770599365,
      "eval_runtime": 34.6924,
      "eval_samples_per_second": 9.137,
      "eval_steps_per_second": 4.583,
      "step": 3000
    },
    {
      "epoch": 44.92537313432836,
      "grad_norm": 0.028246300294995308,
      "learning_rate": 6.862016206879216e-05,
      "loss": 0.017,
      "step": 3010
    },
    {
      "epoch": 45.07462686567164,
      "grad_norm": 0.03457006812095642,
      "learning_rate": 6.802360754287547e-05,
      "loss": 0.0162,
      "step": 3020
    },
    {
      "epoch": 45.223880597014926,
      "grad_norm": 0.04756162688136101,
      "learning_rate": 6.742831791871096e-05,
      "loss": 0.0176,
      "step": 3030
    },
    {
      "epoch": 45.37313432835821,
      "grad_norm": 0.03616981953382492,
      "learning_rate": 6.683431674438612e-05,
      "loss": 0.0153,
      "step": 3040
    },
    {
      "epoch": 45.52238805970149,
      "grad_norm": 0.03674984350800514,
      "learning_rate": 6.624162751702076e-05,
      "loss": 0.0155,
      "step": 3050
    },
    {
      "epoch": 45.67164179104478,
      "grad_norm": 0.026007067412137985,
      "learning_rate": 6.565027368183769e-05,
      "loss": 0.0167,
      "step": 3060
    },
    {
      "epoch": 45.82089552238806,
      "grad_norm": 0.027320370078086853,
      "learning_rate": 6.506027863123492e-05,
      "loss": 0.0183,
      "step": 3070
    },
    {
      "epoch": 45.97014925373134,
      "grad_norm": 0.037796761840581894,
      "learning_rate": 6.447166570386063e-05,
      "loss": 0.0184,
      "step": 3080
    },
    {
      "epoch": 46.11940298507463,
      "grad_norm": 0.03350226208567619,
      "learning_rate": 6.388445818368991e-05,
      "loss": 0.0156,
      "step": 3090
    },
    {
      "epoch": 46.26865671641791,
      "grad_norm": 0.026382336392998695,
      "learning_rate": 6.329867929910347e-05,
      "loss": 0.0156,
      "step": 3100
    },
    {
      "epoch": 46.417910447761194,
      "grad_norm": 0.03873557224869728,
      "learning_rate": 6.271435222196916e-05,
      "loss": 0.021,
      "step": 3110
    },
    {
      "epoch": 46.56716417910448,
      "grad_norm": 0.039839617908000946,
      "learning_rate": 6.213150006672499e-05,
      "loss": 0.0172,
      "step": 3120
    },
    {
      "epoch": 46.71641791044776,
      "grad_norm": 0.03648209199309349,
      "learning_rate": 6.15501458894651e-05,
      "loss": 0.0161,
      "step": 3130
    },
    {
      "epoch": 46.865671641791046,
      "grad_norm": 0.04052448272705078,
      "learning_rate": 6.097031268702746e-05,
      "loss": 0.0178,
      "step": 3140
    },
    {
      "epoch": 47.014925373134325,
      "grad_norm": 0.03359508886933327,
      "learning_rate": 6.039202339608432e-05,
      "loss": 0.0172,
      "step": 3150
    },
    {
      "epoch": 47.16417910447761,
      "grad_norm": 0.03347504511475563,
      "learning_rate": 5.981530089223489e-05,
      "loss": 0.0162,
      "step": 3160
    },
    {
      "epoch": 47.3134328358209,
      "grad_norm": 0.03658764436841011,
      "learning_rate": 5.924016798910037e-05,
      "loss": 0.0164,
      "step": 3170
    },
    {
      "epoch": 47.46268656716418,
      "grad_norm": 0.02819441817700863,
      "learning_rate": 5.866664743742162e-05,
      "loss": 0.0169,
      "step": 3180
    },
    {
      "epoch": 47.61194029850746,
      "grad_norm": 0.03421681374311447,
      "learning_rate": 5.809476192415905e-05,
      "loss": 0.0185,
      "step": 3190
    },
    {
      "epoch": 47.76119402985075,
      "grad_norm": 0.050075121223926544,
      "learning_rate": 5.752453407159522e-05,
      "loss": 0.0178,
      "step": 3200
    },
    {
      "epoch": 47.91044776119403,
      "grad_norm": 0.030316824093461037,
      "learning_rate": 5.69559864364402e-05,
      "loss": 0.0151,
      "step": 3210
    },
    {
      "epoch": 48.059701492537314,
      "grad_norm": 0.053834062069654465,
      "learning_rate": 5.6389141508938903e-05,
      "loss": 0.0173,
      "step": 3220
    },
    {
      "epoch": 48.208955223880594,
      "grad_norm": 0.036416202783584595,
      "learning_rate": 5.5824021711981686e-05,
      "loss": 0.0158,
      "step": 3230
    },
    {
      "epoch": 48.35820895522388,
      "grad_norm": 0.030989298596978188,
      "learning_rate": 5.5260649400217326e-05,
      "loss": 0.017,
      "step": 3240
    },
    {
      "epoch": 48.507462686567166,
      "grad_norm": 0.023621903732419014,
      "learning_rate": 5.469904685916861e-05,
      "loss": 0.0202,
      "step": 3250
    },
    {
      "epoch": 48.656716417910445,
      "grad_norm": 0.03500565141439438,
      "learning_rate": 5.4139236304350935e-05,
      "loss": 0.0178,
      "step": 3260
    },
    {
      "epoch": 48.80597014925373,
      "grad_norm": 0.040891725569963455,
      "learning_rate": 5.3581239880393375e-05,
      "loss": 0.0179,
      "step": 3270
    },
    {
      "epoch": 48.95522388059702,
      "grad_norm": 0.027273844927549362,
      "learning_rate": 5.302507966016295e-05,
      "loss": 0.014,
      "step": 3280
    },
    {
      "epoch": 49.1044776119403,
      "grad_norm": 0.02311500906944275,
      "learning_rate": 5.247077764389099e-05,
      "loss": 0.0167,
      "step": 3290
    },
    {
      "epoch": 49.25373134328358,
      "grad_norm": 0.052333466708660126,
      "learning_rate": 5.191835575830352e-05,
      "loss": 0.0177,
      "step": 3300
    },
    {
      "epoch": 49.40298507462686,
      "grad_norm": 0.042159441858530045,
      "learning_rate": 5.136783585575336e-05,
      "loss": 0.0162,
      "step": 3310
    },
    {
      "epoch": 49.55223880597015,
      "grad_norm": 0.025089124217629433,
      "learning_rate": 5.081923971335582e-05,
      "loss": 0.0166,
      "step": 3320
    },
    {
      "epoch": 49.701492537313435,
      "grad_norm": 0.03574312478303909,
      "learning_rate": 5.0272589032127594e-05,
      "loss": 0.0168,
      "step": 3330
    },
    {
      "epoch": 49.850746268656714,
      "grad_norm": 0.035489972680807114,
      "learning_rate": 4.972790543612783e-05,
      "loss": 0.0149,
      "step": 3340
    },
    {
      "epoch": 50.0,
      "grad_norm": 0.04449532926082611,
      "learning_rate": 4.918521047160308e-05,
      "loss": 0.0185,
      "step": 3350
    },
    {
      "epoch": 50.149253731343286,
      "grad_norm": 0.037168972194194794,
      "learning_rate": 4.864452560613485e-05,
      "loss": 0.0159,
      "step": 3360
    },
    {
      "epoch": 50.298507462686565,
      "grad_norm": 0.032931167632341385,
      "learning_rate": 4.810587222779043e-05,
      "loss": 0.0165,
      "step": 3370
    },
    {
      "epoch": 50.44776119402985,
      "grad_norm": 0.08401083201169968,
      "learning_rate": 4.756927164427685e-05,
      "loss": 0.0186,
      "step": 3380
    },
    {
      "epoch": 50.59701492537313,
      "grad_norm": 0.03464508801698685,
      "learning_rate": 4.703474508209793e-05,
      "loss": 0.0156,
      "step": 3390
    },
    {
      "epoch": 50.74626865671642,
      "grad_norm": 0.03466491773724556,
      "learning_rate": 4.650231368571486e-05,
      "loss": 0.0173,
      "step": 3400
    },
    {
      "epoch": 50.8955223880597,
      "grad_norm": 0.030173856765031815,
      "learning_rate": 4.597199851670932e-05,
      "loss": 0.0161,
      "step": 3410
    },
    {
      "epoch": 51.04477611940298,
      "grad_norm": 0.029572507366538048,
      "learning_rate": 4.54438205529508e-05,
      "loss": 0.0162,
      "step": 3420
    },
    {
      "epoch": 51.19402985074627,
      "grad_norm": 0.032448384910821915,
      "learning_rate": 4.491780068776663e-05,
      "loss": 0.0173,
      "step": 3430
    },
    {
      "epoch": 51.343283582089555,
      "grad_norm": 0.029228707775473595,
      "learning_rate": 4.4393959729115244e-05,
      "loss": 0.0182,
      "step": 3440
    },
    {
      "epoch": 51.492537313432834,
      "grad_norm": 0.03576023131608963,
      "learning_rate": 4.387231839876349e-05,
      "loss": 0.0156,
      "step": 3450
    },
    {
      "epoch": 51.64179104477612,
      "grad_norm": 0.02814898081123829,
      "learning_rate": 4.335289733146665e-05,
      "loss": 0.0163,
      "step": 3460
    },
    {
      "epoch": 51.791044776119406,
      "grad_norm": 0.05810336023569107,
      "learning_rate": 4.283571707415214e-05,
      "loss": 0.0172,
      "step": 3470
    },
    {
      "epoch": 51.940298507462686,
      "grad_norm": 0.03793029487133026,
      "learning_rate": 4.2320798085107036e-05,
      "loss": 0.0177,
      "step": 3480
    },
    {
      "epoch": 52.08955223880597,
      "grad_norm": 0.02555203065276146,
      "learning_rate": 4.18081607331685e-05,
      "loss": 0.0168,
      "step": 3490
    },
    {
      "epoch": 52.23880597014925,
      "grad_norm": 0.028642071411013603,
      "learning_rate": 4.129782529691815e-05,
      "loss": 0.0167,
      "step": 3500
    },
    {
      "epoch": 52.23880597014925,
      "eval_loss": 3.4037022590637207,
      "eval_runtime": 34.709,
      "eval_samples_per_second": 9.133,
      "eval_steps_per_second": 4.581,
      "step": 3500
    }
  ],
  "logging_steps": 10,
  "max_steps": 5000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 75,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.873632111370404e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}